This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "IPFire 3.x development tree".
The branch, master has been updated
       via  0d9ae7080d575dedf3a578771e56e41afc932eef (commit)
       via  d459e8c1ce31ecd77c68e5aa303b4aefa3deabe4 (commit)
       via  7a54de68aca3a9c6d8c966613135068f839f426f (commit)
       via  ae4e228f0e4259e2f075f977a1966b05b35a2a1d (commit)
       via  f81ab17324579b2362c698fc4f5a7f492b2b0c0f (commit)
       via  ac944179b1f33bc19ef1d0c60c4d82d213255714 (commit)
      from  21fffd0984a5459f80e638d14ebfc20e08ff2030 (commit)
The revisions listed above that are new to this repository have not appeared in any other notification email, so we list them in full below.
- Log -----------------------------------------------------------------
commit 0d9ae7080d575dedf3a578771e56e41afc932eef
Merge: 21fffd0984a5459f80e638d14ebfc20e08ff2030 d459e8c1ce31ecd77c68e5aa303b4aefa3deabe4
Author: Michael Tremer <michael.tremer@ipfire.org>
Date:   Tue Mar 23 21:35:47 2010 +0100

    Merge branch 'next'
commit d459e8c1ce31ecd77c68e5aa303b4aefa3deabe4
Merge: 7a54de68aca3a9c6d8c966613135068f839f426f ae4e228f0e4259e2f075f977a1966b05b35a2a1d
Author: Michael Tremer <michael.tremer@ipfire.org>
Date:   Tue Mar 23 21:34:57 2010 +0100

    Merge commit 'ms/kernel-update' into next
commit 7a54de68aca3a9c6d8c966613135068f839f426f
Author: Michael Tremer <michael.tremer@ipfire.org>
Date:   Tue Mar 23 21:26:12 2010 +0100

    python: Add a bunch of patches and change to a shared lib.
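    For context, switching CPython to a shared libpython is conventionally done
    through its stock configure switch; the sketch below shows only that stock
    mechanism and is not the literal contents of python.nm, which this email
    does not include:

        # Sketch only: stock CPython build steps, assumed rather than quoted from python.nm.
        ./configure --enable-shared    # build libpython2.6.so instead of a static-only libpython
        make && make install
        # Programs linked against the shared library must locate it at run time;
        # that is the class of problem an rpath patch such as
        # python-2.6.4-distutils-rpath.patch (listed in the summary below) deals with.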
commit ae4e228f0e4259e2f075f977a1966b05b35a2a1d
Author: Michael Tremer <michael.tremer@ipfire.org>
Date:   Mon Mar 22 10:25:16 2010 +0100

    kernel: Update to 2.6.33.1.

    Currently reiser4 and aufs support is not available.
commit f81ab17324579b2362c698fc4f5a7f492b2b0c0f
Author: Michael Tremer <michael.tremer@ipfire.org>
Date:   Sun Mar 21 18:58:17 2010 +0100

    kernel: glibc is not a dependency.
commit ac944179b1f33bc19ef1d0c60c4d82d213255714
Author: Michael Tremer <michael.tremer@ipfire.org>
Date:   Sun Mar 21 18:57:54 2010 +0100

    kernel: Create localversion out of metadata.
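    The kernel.nm hunks further below show what this means in practice: the
    hard-coded "-ipfire1" suffix is replaced by a value assembled from package
    metadata. A minimal sketch of the expansion, assuming DISTRO_SNAME=ipfire
    (a value this email does not show; PKG_VER and PKG_REL are taken from the
    diff):

        # Sketch only: DISTRO_SNAME is an assumed value; PKG_VER/PKG_REL come from kernel.nm below.
        DISTRO_SNAME=ipfire
        PKG_VER=2.6.33.1
        PKG_REL=0
        LOCALVERSION="-${DISTRO_SNAME}${PKG_REL}"    # yields "-ipfire0"
        FULLVER="${PKG_VER}${LOCALVERSION}"          # yields "2.6.33.1-ipfire0"
        # STAGE_PREPARE_CMDS then writes the suffix into the kernel source tree:
        echo "${LOCALVERSION}" > "localversion-${DISTRO_SNAME}"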
-----------------------------------------------------------------------
Summary of changes:
 pkgs/core/kernel/config                            |  544 +-
 pkgs/core/kernel/kernel.nm                         |    8 +-
 ...2.6.31.1-1.patch => aufs2-2.6.31.1-1.patch.off} |    0
 ... grsecurity-2.1.14-2.6.33.1-201003201735.patch} |26592 ++++++++++++--------
 ...patch => linux-2.6.31.1-scsi.h-fix-1.patch.off} |    0
 ....off => linux-2.6.33-disable-compat-vdso.patch} |   56 +-
 ...2.6.31.1.patch => reiser4-for-2.6.33.patch.off} | 1945 +-
 ...tes-2.6.31.1-16.diff => routes-2.6.33-16.patch} |  196 +-
 .../python/patches/python-2.3.4-lib64-regex.patch  |   18 +
 pkgs/core/python/patches/python-2.5-cflags.patch   |   11 +
 .../python-2.5.1-socketmodule-constants.patch      |   63 +
 .../python-2.5.1-socketmodule-constants2.patch     |   20 +
 .../patches/python-2.6.2-binutils-no-dep.patch     |   15 +
 .../patches/python-2.6.4-distutils-rpath.patch     |   20 +
 .../patches/python-2.6.4-no-static-lib.patch       |   50 +
 pkgs/core/python/python.nm                         |    9 +-
 16 files changed, 18417 insertions(+), 11130 deletions(-)
 rename pkgs/core/kernel/patches/{aufs2-2.6.31.1-1.patch => aufs2-2.6.31.1-1.patch.off} (100%)
 rename pkgs/core/kernel/patches/{grsecurity-2.1.14-2.6.31.1-200910012153.patch => grsecurity-2.1.14-2.6.33.1-201003201735.patch} (62%)
 rename pkgs/core/kernel/patches/{linux-2.6.31.1-scsi.h-fix-1.patch => linux-2.6.31.1-scsi.h-fix-1.patch.off} (100%)
 rename pkgs/core/kernel/patches/{linux-2.6.31.1-disable-compat_vdso-1.patch.off => linux-2.6.33-disable-compat-vdso.patch} (57%)
 rename pkgs/core/kernel/patches/{reiser4-for-2.6.31.1.patch => reiser4-for-2.6.33.patch.off} (98%)
 rename pkgs/core/kernel/patches/{routes-2.6.31.1-16.diff => routes-2.6.33-16.patch} (85%)
 create mode 100644 pkgs/core/python/patches/python-2.3.4-lib64-regex.patch
 create mode 100644 pkgs/core/python/patches/python-2.5-cflags.patch
 create mode 100644 pkgs/core/python/patches/python-2.5.1-socketmodule-constants.patch
 create mode 100644 pkgs/core/python/patches/python-2.5.1-socketmodule-constants2.patch
 create mode 100644 pkgs/core/python/patches/python-2.6.2-binutils-no-dep.patch
 create mode 100644 pkgs/core/python/patches/python-2.6.4-distutils-rpath.patch
 create mode 100644 pkgs/core/python/patches/python-2.6.4-no-static-lib.patch
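(The ".patch.off" renames above appear to be this tree's convention for keeping
a patch in the repository while taking it out of the build, which matches the
note in the log that reiser4 and aufs support is currently unavailable.)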
Difference in files: diff --git a/pkgs/core/kernel/config b/pkgs/core/kernel/config index f6ed5b5..27a349e 100644 --- a/pkgs/core/kernel/config +++ b/pkgs/core/kernel/config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.31 -# Wed Sep 16 12:05:54 2009 +# Linux kernel version: 2.6.33.1 +# Sun Mar 21 20:09:46 2010 # # CONFIG_64BIT is not set CONFIG_X86_32=y @@ -17,7 +17,6 @@ CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_HAVE_LATENCYTOP_SUPPORT=y -CONFIG_FAST_CMPXCHG_LOCAL=y CONFIG_MMU=y CONFIG_ZONE_DMA=y CONFIG_GENERIC_ISA_DMA=y @@ -35,7 +34,8 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y -CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -44,6 +44,7 @@ CONFIG_ARCH_POPULATES_NODE_MAP=y # CONFIG_AUDIT_ARCH is not set CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y @@ -67,9 +68,11 @@ CONFIG_LOCALVERSION_AUTO=y CONFIG_HAVE_KERNEL_GZIP=y CONFIG_HAVE_KERNEL_BZIP2=y CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_LZO=y # CONFIG_KERNEL_GZIP is not set # CONFIG_KERNEL_BZIP2 is not set CONFIG_KERNEL_LZMA=y +# CONFIG_KERNEL_LZO is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y @@ -82,11 +85,13 @@ CONFIG_POSIX_MQUEUE_SYSCTL=y # # RCU Subsystem # -CONFIG_CLASSIC_RCU=y -# CONFIG_TREE_RCU is not set -# CONFIG_PREEMPT_RCU is not set +CONFIG_TREE_RCU=y +# CONFIG_TREE_PREEMPT_RCU is not set +# CONFIG_TINY_RCU is not set +# CONFIG_RCU_TRACE is not set +CONFIG_RCU_FANOUT=32 +# CONFIG_RCU_FANOUT_EXACT is not set # CONFIG_TREE_RCU_TRACE is not set -# CONFIG_PREEMPT_RCU_TRACE is not set CONFIG_IKCONFIG=y # CONFIG_IKCONFIG_PROC is not set CONFIG_LOG_BUF_SHIFT=18 @@ -110,12 +115,15 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y CONFIG_RD_BZIP2=y CONFIG_RD_LZMA=y +CONFIG_RD_LZO=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_EXTRA_PASS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -129,40 +137,43 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_AIO=y -CONFIG_HAVE_PERF_COUNTERS=y +CONFIG_HAVE_PERF_EVENTS=y
# -# Performance Counters +# Kernel Performance Events And Counters # +CONFIG_PERF_EVENTS=y CONFIG_PERF_COUNTERS=y -CONFIG_EVENT_PROFILE=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y -CONFIG_STRIP_ASM_SYMS=y # CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y -CONFIG_MARKERS=y CONFIG_OPROFILE=y CONFIG_OPROFILE_IBS=y +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set CONFIG_HAVE_OPROFILE=y +# CONFIG_KPROBES is not set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_USER_RETURN_NOTIFIER=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_DMA_ATTRS=y CONFIG_HAVE_DMA_API_DEBUG=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y
# # GCOV-based kernel profiling # # CONFIG_GCOV_KERNEL is not set CONFIG_SLOW_WORK=y +# CONFIG_SLOW_WORK_DEBUG is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y # CONFIG_SLABINFO is not set CONFIG_RT_MUTEXES=y @@ -183,15 +194,42 @@ CONFIG_LBDAF=y # IO Schedulers # CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y -CONFIG_DEFAULT_AS=y # CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set +CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="anticipatory" +CONFIG_DEFAULT_IOSCHED="cfq" CONFIG_PREEMPT_NOTIFIERS=y +# CONFIG_INLINE_SPIN_TRYLOCK is not set +# CONFIG_INLINE_SPIN_TRYLOCK_BH is not set +# CONFIG_INLINE_SPIN_LOCK is not set +# CONFIG_INLINE_SPIN_LOCK_BH is not set +# CONFIG_INLINE_SPIN_LOCK_IRQ is not set +# CONFIG_INLINE_SPIN_LOCK_IRQSAVE is not set +CONFIG_INLINE_SPIN_UNLOCK=y +# CONFIG_INLINE_SPIN_UNLOCK_BH is not set +CONFIG_INLINE_SPIN_UNLOCK_IRQ=y +# CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE is not set +# CONFIG_INLINE_READ_TRYLOCK is not set +# CONFIG_INLINE_READ_LOCK is not set +# CONFIG_INLINE_READ_LOCK_BH is not set +# CONFIG_INLINE_READ_LOCK_IRQ is not set +# CONFIG_INLINE_READ_LOCK_IRQSAVE is not set +CONFIG_INLINE_READ_UNLOCK=y +# CONFIG_INLINE_READ_UNLOCK_BH is not set +CONFIG_INLINE_READ_UNLOCK_IRQ=y +# CONFIG_INLINE_READ_UNLOCK_IRQRESTORE is not set +# CONFIG_INLINE_WRITE_TRYLOCK is not set +# CONFIG_INLINE_WRITE_LOCK is not set +# CONFIG_INLINE_WRITE_LOCK_BH is not set +# CONFIG_INLINE_WRITE_LOCK_IRQ is not set +# CONFIG_INLINE_WRITE_LOCK_IRQSAVE is not set +CONFIG_INLINE_WRITE_UNLOCK=y +# CONFIG_INLINE_WRITE_UNLOCK_BH is not set +CONFIG_INLINE_WRITE_UNLOCK_IRQ=y +# CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE is not set +CONFIG_MUTEX_SPIN_ON_OWNER=y # CONFIG_FREEZER is not set
# @@ -207,8 +245,10 @@ CONFIG_X86_MPPARSE=y # CONFIG_X86_BIGSMP is not set CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_ELAN is not set +# CONFIG_X86_MRST is not set # CONFIG_X86_RDC321X is not set # CONFIG_X86_32_NON_STANDARD is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -236,13 +276,13 @@ CONFIG_M686=y # CONFIG_MVIAC7 is not set # CONFIG_MPSC is not set # CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set # CONFIG_GENERIC_CPU is not set CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=64 -CONFIG_X86_INTERNODE_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=5 +CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_XADD=y CONFIG_X86_PPRO_FENCE=y CONFIG_X86_WP_WORKS_OK=y @@ -253,8 +293,9 @@ CONFIG_X86_ALIGNMENT_16=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=4 +CONFIG_X86_MINIMUM_CPU_FAMILY=5 CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_CYRIX_32=y @@ -278,8 +319,6 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y # CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS is not set CONFIG_X86_MCE=y -# CONFIG_X86_OLD_MCE is not set -CONFIG_X86_NEW_MCE=y CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y # CONFIG_X86_ANCIENT_MCE is not set @@ -296,7 +335,6 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y -# CONFIG_X86_CPU_DEBUG is not set CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set @@ -306,6 +344,7 @@ CONFIG_PAGE_OFFSET=0xC0000000 CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ILLEGAL_POINTER_VALUE=0 CONFIG_SELECT_MEMORY_MODEL=y CONFIG_FLATMEM_MANUAL=y # CONFIG_DISCONTIGMEM_MANUAL is not set @@ -319,10 +358,11 @@ CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y -CONFIG_HAVE_MLOCK=y -CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y CONFIG_X86_RESERVE_LOW_64K=y @@ -330,8 +370,8 @@ CONFIG_X86_RESERVE_LOW_64K=y CONFIG_MTRR=y # CONFIG_MTRR_SANITIZER is not set CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_SECCOMP=y -CONFIG_CC_STACKPROTECTOR_ALL=y CONFIG_CC_STACKPROTECTOR=y # CONFIG_HZ_100 is not set CONFIG_HZ_250=y @@ -342,7 +382,7 @@ CONFIG_SCHED_HRTICK=y # CONFIG_KEXEC is not set CONFIG_PHYSICAL_START=0x1000000 # CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x100000 +CONFIG_PHYSICAL_ALIGN=0x1000000 # CONFIG_HOTPLUG_CPU is not set # CONFIG_CMDLINE_BOOL is not set
@@ -353,9 +393,11 @@ CONFIG_PM=y # CONFIG_PM_DEBUG is not set # CONFIG_SUSPEND is not set # CONFIG_HIBERNATION is not set +# CONFIG_PM_RUNTIME is not set CONFIG_ACPI=y CONFIG_ACPI_PROCFS=y CONFIG_ACPI_PROCFS_POWER=y +CONFIG_ACPI_POWER_METER=m CONFIG_ACPI_SYSFS_POWER=y CONFIG_ACPI_PROC_EVENT=y CONFIG_ACPI_AC=y @@ -364,6 +406,7 @@ CONFIG_ACPI_BUTTON=y CONFIG_ACPI_FAN=y CONFIG_ACPI_DOCK=y CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=m CONFIG_ACPI_THERMAL=y # CONFIG_ACPI_CUSTOM_DSDT is not set CONFIG_ACPI_BLACKLIST_YEAR=2001 @@ -372,6 +415,7 @@ CONFIG_ACPI_PCI_SLOT=y CONFIG_X86_PM_TIMER=y # CONFIG_ACPI_CONTAINER is not set # CONFIG_ACPI_SBS is not set +# CONFIG_SFI is not set
# # CPU Frequency scaling @@ -436,7 +480,6 @@ CONFIG_PCI_OLPC=y CONFIG_PCI_DOMAINS=y CONFIG_DMAR=y CONFIG_DMAR_DEFAULT_ON=y -# CONFIG_DMAR_BROKEN_GFX_WA is not set CONFIG_DMAR_FLOPPY_WA=y CONFIG_PCIEPORTBUS=y CONFIG_PCIEAER=y @@ -450,6 +493,7 @@ CONFIG_PCI_LEGACY=y CONFIG_PCI_STUB=m CONFIG_HT_IRQ=y CONFIG_PCI_IOV=y +CONFIG_PCI_IOAPIC=y CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -457,7 +501,6 @@ CONFIG_ISA_DMA_API=y CONFIG_OLPC=y CONFIG_K8_NB=y CONFIG_PCCARD=m -# CONFIG_PCMCIA_DEBUG is not set CONFIG_PCMCIA=m CONFIG_PCMCIA_LOAD_CIS=y CONFIG_PCMCIA_IOCTL=y @@ -554,6 +597,7 @@ CONFIG_INET6_XFRM_MODE_TUNNEL=m CONFIG_INET6_XFRM_MODE_BEET=m CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y CONFIG_IPV6_NDISC_NODETYPE=y CONFIG_IPV6_TUNNEL=m CONFIG_IPV6_MULTIPLE_TABLES=y @@ -733,6 +777,7 @@ CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m # CONFIG_IP_DCCP is not set # CONFIG_IP_SCTP is not set +# CONFIG_RDS is not set # CONFIG_TIPC is not set CONFIG_ATM=m CONFIG_ATM_CLIP=m @@ -826,7 +871,6 @@ CONFIG_DCB=y # Network testing # # CONFIG_NET_PKTGEN is not set -CONFIG_NET_DROP_MONITOR=y # CONFIG_HAMRADIO is not set # CONFIG_CAN is not set # CONFIG_IRDA is not set @@ -834,11 +878,19 @@ CONFIG_NET_DROP_MONITOR=y # CONFIG_AF_RXRPC is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y CONFIG_CFG80211=m +CONFIG_NL80211_TESTMODE=y +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set # CONFIG_CFG80211_REG_DEBUG is not set +CONFIG_CFG80211_DEFAULT_PS=y # CONFIG_CFG80211_DEBUGFS is not set # CONFIG_WIRELESS_OLD_REGULATORY is not set -CONFIG_WIRELESS_EXT=y +CONFIG_CFG80211_WEXT=y CONFIG_WIRELESS_EXT_SYSFS=y CONFIG_LIB80211=m CONFIG_LIB80211_CRYPT_WEP=m @@ -846,16 +898,11 @@ CONFIG_LIB80211_CRYPT_CCMP=m CONFIG_LIB80211_CRYPT_TKIP=m # CONFIG_LIB80211_DEBUG is not set CONFIG_MAC80211=m -CONFIG_MAC80211_DEFAULT_PS=y -CONFIG_MAC80211_DEFAULT_PS_VALUE=1 - -# -# Rate control algorithm selection -# CONFIG_MAC80211_RC_MINSTREL=y # CONFIG_MAC80211_RC_DEFAULT_PID is not set CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y CONFIG_MAC80211_RC_DEFAULT="minstrel" +# CONFIG_MAC80211_MESH is not set CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set # CONFIG_MAC80211_DEBUG_MENU is not set @@ -874,6 +921,8 @@ CONFIG_RFKILL_INPUT=y # Generic Driver Options # CONFIG_UEVENT_HELPER_PATH="" +CONFIG_DEVTMPFS=y +# CONFIG_DEVTMPFS_MOUNT is not set CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y @@ -909,6 +958,7 @@ CONFIG_BLK_DEV_DAC960=m # CONFIG_BLK_DEV_COW_COMMON is not set CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m +# CONFIG_BLK_DEV_DRBD is not set # CONFIG_BLK_DEV_NBD is not set CONFIG_BLK_DEV_OSD=m CONFIG_BLK_DEV_SX8=m @@ -922,14 +972,18 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_VIRTIO_BLK=m CONFIG_BLK_DEV_HD=y CONFIG_MISC_DEVICES=y +# CONFIG_AD525X_DPOT is not set # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set CONFIG_ICS932S401=m # CONFIG_ENCLOSURE_SERVICES is not set +CONFIG_CS5535_MFGPT=m +CONFIG_CS5535_MFGPT_DEFAULT_IRQ=7 CONFIG_HP_ILO=m CONFIG_ISL29003=m +CONFIG_DS1682=m CONFIG_C2PORT=m # CONFIG_C2PORT_DURAMAR_2150 is not set
@@ -989,8 +1043,11 @@ CONFIG_SCSI_LOWLEVEL=y CONFIG_ISCSI_TCP=m CONFIG_SCSI_CXGB3_ISCSI=m CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_BE2ISCSI=m CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m CONFIG_SCSI_ACARD=m CONFIG_SCSI_AACRAID=m CONFIG_SCSI_AIC7XXX=m @@ -1025,6 +1082,7 @@ CONFIG_SCSI_MPT2SAS_MAX_SGE=128 CONFIG_SCSI_HPTIOP=m CONFIG_SCSI_BUSLOGIC=m CONFIG_SCSI_FLASHPOINT=y +CONFIG_VMWARE_PVSCSI=m CONFIG_LIBFC=m CONFIG_LIBFCOE=m CONFIG_FCOE=m @@ -1061,7 +1119,10 @@ CONFIG_SCSI_DC395x=m CONFIG_SCSI_DC390T=m CONFIG_SCSI_NSP32=m # CONFIG_SCSI_DEBUG is not set +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m # CONFIG_SCSI_SRP is not set +CONFIG_SCSI_BFA_FC=m CONFIG_SCSI_LOWLEVEL_PCMCIA=y CONFIG_PCMCIA_AHA152X=m CONFIG_PCMCIA_FDOMAIN=m @@ -1079,6 +1140,7 @@ CONFIG_SCSI_OSD_DPRINT_SENSE=1 # CONFIG_SCSI_OSD_DEBUG is not set CONFIG_ATA=m # CONFIG_ATA_NONSTANDARD is not set +CONFIG_ATA_VERBOSE_ERROR=y CONFIG_ATA_ACPI=y CONFIG_SATA_PMP=y CONFIG_SATA_AHCI=m @@ -1102,6 +1164,7 @@ CONFIG_PATA_ACPI=m CONFIG_PATA_ALI=m CONFIG_PATA_AMD=m CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATP867X=m CONFIG_PATA_ATIIXP=m CONFIG_PATA_CMD640_PCI=m CONFIG_PATA_CMD64X=m @@ -1131,14 +1194,16 @@ CONFIG_PATA_NS87415=m CONFIG_PATA_OPTI=m CONFIG_PATA_OPTIDMA=m CONFIG_PATA_PCMCIA=m +CONFIG_PATA_PDC2027X=m CONFIG_PATA_PDC_OLD=m CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m CONFIG_PATA_RZ1000=m CONFIG_PATA_SC1200=m CONFIG_PATA_SERVERWORKS=m -CONFIG_PATA_PDC2027X=m CONFIG_PATA_SIL680=m CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m CONFIG_PATA_VIA=m CONFIG_PATA_WINBOND=m CONFIG_PATA_SCH=m @@ -1149,7 +1214,9 @@ CONFIG_MD_RAID0=m CONFIG_MD_RAID1=m CONFIG_MD_RAID10=m CONFIG_MD_RAID456=m +CONFIG_MULTICORE_RAID456=y CONFIG_MD_RAID6_PQ=m +CONFIG_ASYNC_RAID6_TEST=m CONFIG_MD_MULTIPATH=m # CONFIG_MD_FAULTY is not set CONFIG_BLK_DEV_DM=m @@ -1181,7 +1248,7 @@ CONFIG_FUSION_CTL=m #
# -# See the help texts for more information. +# The newer stack is recommended. # CONFIG_FIREWIRE=m CONFIG_FIREWIRE_OHCI=m @@ -1289,6 +1356,7 @@ CONFIG_SUNDANCE=m CONFIG_SUNDANCE_MMIO=y CONFIG_TLAN=m CONFIG_KS8842=m +CONFIG_KS8851_MLL=m CONFIG_VIA_RHINE=m CONFIG_VIA_RHINE_MMIO=y CONFIG_SC92031=m @@ -1350,17 +1418,8 @@ CONFIG_QLGE=m CONFIG_SFC=m CONFIG_BE2NET=m # CONFIG_TR is not set - -# -# Wireless LAN -# -# CONFIG_WLAN_PRE80211 is not set -CONFIG_WLAN_80211=y +CONFIG_WLAN=y CONFIG_PCMCIA_RAYCS=m -CONFIG_LIBERTAS=m -CONFIG_LIBERTAS_USB=m -CONFIG_LIBERTAS_CS=m -# CONFIG_LIBERTAS_DEBUG is not set CONFIG_LIBERTAS_THINFIRM=m CONFIG_LIBERTAS_THINFIRM_USB=m CONFIG_AIRO=m @@ -1379,48 +1438,22 @@ CONFIG_RTL8187_LEDS=y CONFIG_ADM8211=m CONFIG_MAC80211_HWSIM=m CONFIG_MWL8K=m -CONFIG_P54_COMMON=m -CONFIG_P54_USB=m -CONFIG_P54_PCI=m -CONFIG_P54_LEDS=y CONFIG_ATH_COMMON=m +# CONFIG_ATH_DEBUG is not set CONFIG_ATH5K=m # CONFIG_ATH5K_DEBUG is not set +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m CONFIG_ATH9K=m -# CONFIG_ATH9K_DEBUG is not set +# CONFIG_ATH9K_DEBUGFS is not set CONFIG_AR9170_USB=m CONFIG_AR9170_LEDS=y -CONFIG_IPW2100=m -CONFIG_IPW2100_MONITOR=y -# CONFIG_IPW2100_DEBUG is not set -CONFIG_IPW2200=m -CONFIG_IPW2200_MONITOR=y -CONFIG_IPW2200_RADIOTAP=y -CONFIG_IPW2200_PROMISCUOUS=y -CONFIG_IPW2200_QOS=y -# CONFIG_IPW2200_DEBUG is not set -CONFIG_LIBIPW=m -# CONFIG_LIBIPW_DEBUG is not set -CONFIG_IWLWIFI=m -CONFIG_IWLWIFI_LEDS=y -CONFIG_IWLWIFI_SPECTRUM_MEASUREMENT=y -# CONFIG_IWLWIFI_DEBUG is not set -CONFIG_IWLAGN=m -CONFIG_IWL4965=y -CONFIG_IWL5000=y -CONFIG_IWL3945=m -CONFIG_IWL3945_SPECTRUM_MEASUREMENT=y -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -# CONFIG_HOSTAP_FIRMWARE_NVRAM is not set -CONFIG_HOSTAP_PLX=m -CONFIG_HOSTAP_PCI=m -CONFIG_HOSTAP_CS=m CONFIG_B43=m CONFIG_B43_PCI_AUTOSELECT=y CONFIG_B43_PCICORE_AUTOSELECT=y CONFIG_B43_PCMCIA=y CONFIG_B43_PIO=y +CONFIG_B43_PHY_LP=y CONFIG_B43_LEDS=y CONFIG_B43_HWRNG=y # CONFIG_B43_DEBUG is not set @@ -1435,32 +1468,69 @@ CONFIG_B43LEGACY_PIO=y CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y # CONFIG_B43LEGACY_DMA_MODE is not set # CONFIG_B43LEGACY_PIO_MODE is not set -CONFIG_ZD1211RW=m -# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +# CONFIG_HOSTAP_FIRMWARE_NVRAM is not set +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_SPECTRUM_MEASUREMENT=y +# CONFIG_IWLWIFI_DEBUG is not set +CONFIG_IWLAGN=m +CONFIG_IWL4965=y +CONFIG_IWL5000=y +CONFIG_IWL3945=m +CONFIG_IWL3945_SPECTRUM_MEASUREMENT=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_HERMES=m +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_LEDS=y CONFIG_RT2X00=m CONFIG_RT2400PCI=m CONFIG_RT2500PCI=m CONFIG_RT61PCI=m +CONFIG_RT2800PCI_PCI=m +CONFIG_RT2800PCI=m CONFIG_RT2500USB=m CONFIG_RT73USB=m CONFIG_RT2800USB=m +CONFIG_RT2800_LIB=m CONFIG_RT2X00_LIB_PCI=m CONFIG_RT2X00_LIB_USB=m CONFIG_RT2X00_LIB=m CONFIG_RT2X00_LIB_HT=y CONFIG_RT2X00_LIB_FIRMWARE=y CONFIG_RT2X00_LIB_CRYPTO=y 
-CONFIG_RT2X00_LIB_RFKILL=y CONFIG_RT2X00_LIB_LEDS=y # CONFIG_RT2X00_DEBUG is not set -CONFIG_HERMES=m -CONFIG_HERMES_CACHE_FW_ON_INIT=y -CONFIG_PLX_HERMES=m -CONFIG_TMD_HERMES=m -CONFIG_NORTEL_HERMES=m -CONFIG_PCI_HERMES=m -CONFIG_PCMCIA_HERMES=m -CONFIG_PCMCIA_SPECTRUM=m +CONFIG_WL12XX=m +CONFIG_WL1251=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set
# # WiMAX Wireless Broadband devices @@ -1565,7 +1635,9 @@ CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y CONFIG_VIRTIO_NET=m +CONFIG_VMXNET3=m CONFIG_ISDN=y +CONFIG_ISDN_I4L=m CONFIG_MISDN=m CONFIG_MISDN_DSP=m CONFIG_MISDN_L1OIP=m @@ -1576,8 +1648,98 @@ CONFIG_MISDN_L1OIP=m CONFIG_MISDN_HFCPCI=m CONFIG_MISDN_HFCMULTI=m CONFIG_MISDN_HFCUSB=m -# CONFIG_ISDN_I4L is not set +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_ISDN_PPP=y +CONFIG_ISDN_PPP_VJ=y +CONFIG_ISDN_MPP=y +CONFIG_IPPP_FILTER=y +CONFIG_ISDN_PPP_BSDCOMP=m +# CONFIG_ISDN_AUDIO is not set + +# +# ISDN feature submodules +# +CONFIG_ISDN_DIVERSION=m + +# +# ISDN4Linux hardware drivers +# + +# +# Passive cards +# +CONFIG_ISDN_DRV_HISAX=m + +# +# D-channel protocol features +# +CONFIG_HISAX_EURO=y +CONFIG_DE_AOC=y +# CONFIG_HISAX_NO_SENDCOMPLETE is not set +# CONFIG_HISAX_NO_LLC is not set +# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_1TR6=y +CONFIG_HISAX_NI1=y +CONFIG_HISAX_MAX_CARDS=8 + +# +# HiSax supported cards +# +CONFIG_HISAX_16_3=y +CONFIG_HISAX_TELESPCI=y +CONFIG_HISAX_S0BOX=y +CONFIG_HISAX_FRITZPCI=y +CONFIG_HISAX_AVM_A1_PCMCIA=y +CONFIG_HISAX_ELSA=y +CONFIG_HISAX_DIEHLDIVA=y +CONFIG_HISAX_SEDLBAUER=y +CONFIG_HISAX_NETJET=y +CONFIG_HISAX_NETJET_U=y +CONFIG_HISAX_NICCY=y +CONFIG_HISAX_BKM_A4T=y +CONFIG_HISAX_SCT_QUADRO=y +CONFIG_HISAX_GAZEL=y +CONFIG_HISAX_HFC_PCI=y +CONFIG_HISAX_W6692=y +CONFIG_HISAX_HFC_SX=y +CONFIG_HISAX_ENTERNOW_PCI=y +# CONFIG_HISAX_DEBUG is not set + +# +# HiSax PCMCIA card service modules +# +CONFIG_HISAX_SEDLBAUER_CS=m +CONFIG_HISAX_ELSA_CS=m +CONFIG_HISAX_AVM_A1_CS=m +CONFIG_HISAX_TELES_CS=m + +# +# HiSax sub driver modules +# +CONFIG_HISAX_ST5481=m +CONFIG_HISAX_HFCUSB=m +CONFIG_HISAX_HFC4S8S=m +CONFIG_HISAX_FRITZ_PCIPNP=m + +# +# Active cards +# +CONFIG_HYSDN=m +CONFIG_ISDN_HDLC=m # CONFIG_ISDN_CAPI is not set +CONFIG_ISDN_DRV_GIGASET=m +CONFIG_GIGASET_I4L=y +# CONFIG_GIGASET_DUMMYLL is not set +CONFIG_GIGASET_BASE=m +CONFIG_GIGASET_M105=m +CONFIG_GIGASET_M101=m +# CONFIG_GIGASET_DEBUG is not set # CONFIG_PHONE is not set
# @@ -1586,6 +1748,7 @@ CONFIG_MISDN_HFCUSB=m CONFIG_INPUT=y # CONFIG_INPUT_FF_MEMLESS is not set CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m
# # Userland interfaces @@ -1602,12 +1765,16 @@ CONFIG_INPUT_EVDEV=y # Input Device Drivers # CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADP5588=m CONFIG_KEYBOARD_ATKBD=y +CONFIG_QT2160=m # CONFIG_KEYBOARD_LKKBD is not set CONFIG_KEYBOARD_GPIO=m CONFIG_KEYBOARD_MATRIX=m CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_MAX7359=m # CONFIG_KEYBOARD_NEWTON is not set +CONFIG_KEYBOARD_OPENCORES=m # CONFIG_KEYBOARD_STOWAWAY is not set # CONFIG_KEYBOARD_SUNKBD is not set # CONFIG_KEYBOARD_XTKBD is not set @@ -1628,6 +1795,7 @@ CONFIG_SERIO_SERPORT=m # CONFIG_SERIO_PCIPS2 is not set CONFIG_SERIO_LIBPS2=y # CONFIG_SERIO_RAW is not set +CONFIG_SERIO_ALTERA_PS2=m # CONFIG_GAMEPORT is not set
# @@ -1709,6 +1877,7 @@ CONFIG_HPET_MMAP=y CONFIG_DEVPORT=y CONFIG_I2C=m CONFIG_I2C_BOARDINFO=y +# CONFIG_I2C_COMPAT is not set CONFIG_I2C_CHARDEV=m CONFIG_I2C_HELPER_AUTO=y CONFIG_I2C_ALGOBIT=m @@ -1739,6 +1908,11 @@ CONFIG_I2C_VIA=m CONFIG_I2C_VIAPRO=m
# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# # I2C system bus drivers (mostly embedded / system-on-chip) # CONFIG_I2C_GPIO=m @@ -1754,11 +1928,6 @@ CONFIG_I2C_TAOS_EVM=m CONFIG_I2C_TINY_USB=m
# -# Graphics adapter I2C/DDC channel drivers -# -CONFIG_I2C_VOODOO3=m - -# # Other I2C/SMBus bus drivers # CONFIG_I2C_PCA_PLATFORM=m @@ -1768,7 +1937,6 @@ CONFIG_SCx200_ACB=m # # Miscellaneous I2C Chip support # -CONFIG_DS1682=m CONFIG_SENSORS_TSL2550=m # CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_ALGO is not set @@ -1794,14 +1962,21 @@ CONFIG_GPIO_SYSFS=y CONFIG_GPIO_MAX732X=m CONFIG_GPIO_PCA953X=m CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_ADP5588=m
# # PCI GPIO expanders: # +CONFIG_GPIO_CS5535=m +CONFIG_GPIO_LANGWELL=y
# # SPI GPIO expanders: # + +# +# AC97 GPIO expanders: +# CONFIG_W1=m CONFIG_W1_CON=y
@@ -1835,6 +2010,11 @@ CONFIG_BATTERY_MAX17040=m CONFIG_CHARGER_PCF50633=m CONFIG_HWMON=m CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# CONFIG_SENSORS_ABITUGURU=m CONFIG_SENSORS_ABITUGURU3=m CONFIG_SENSORS_AD7414=m @@ -1850,16 +2030,14 @@ CONFIG_SENSORS_ADT7470=m CONFIG_SENSORS_ADT7473=m CONFIG_SENSORS_ADT7475=m CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_ATK0110=m CONFIG_SENSORS_ATXP1=m CONFIG_SENSORS_DS1621=m CONFIG_SENSORS_I5K_AMB=m CONFIG_SENSORS_F71805F=m CONFIG_SENSORS_F71882FG=m CONFIG_SENSORS_F75375S=m -CONFIG_SENSORS_FSCHER=m -CONFIG_SENSORS_FSCPOS=m CONFIG_SENSORS_FSCHMD=m CONFIG_SENSORS_G760A=m CONFIG_SENSORS_GL518SM=m @@ -1869,6 +2047,7 @@ CONFIG_SENSORS_IBMAEM=m CONFIG_SENSORS_IBMPEX=m CONFIG_SENSORS_IT87=m CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM73=m CONFIG_SENSORS_LM75=m CONFIG_SENSORS_LM77=m CONFIG_SENSORS_LM78=m @@ -1894,8 +2073,11 @@ CONFIG_SENSORS_SMSC47M1=m CONFIG_SENSORS_SMSC47M192=m CONFIG_SENSORS_SMSC47B397=m CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_AMC6821=m CONFIG_SENSORS_THMC50=m CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_VIA_CPUTEMP=m CONFIG_SENSORS_VIA686A=m CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_VT8231=m @@ -1907,10 +2089,16 @@ CONFIG_SENSORS_W83L785TS=m CONFIG_SENSORS_W83L786NG=m CONFIG_SENSORS_W83627HF=m CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM8350=m CONFIG_SENSORS_HDAPS=m -CONFIG_SENSORS_LIS3LV02D=m +CONFIG_SENSORS_LIS3_I2C=m CONFIG_SENSORS_APPLESMC=m -# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# ACPI drivers +# +CONFIG_SENSORS_ATK0110=m +CONFIG_SENSORS_LIS3LV02D=m CONFIG_THERMAL=y CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y @@ -1924,7 +2112,9 @@ CONFIG_ACQUIRE_WDT=m CONFIG_ADVANTECH_WDT=m CONFIG_ALIM1535_WDT=m CONFIG_ALIM7101_WDT=m +CONFIG_GEODE_WDT=m CONFIG_SC520_WDT=m +CONFIG_SBC_FITPC2_WATCHDOG=m CONFIG_EUROTECH_WDT=m CONFIG_IB700_WDT=m CONFIG_IBMASR=m @@ -1994,6 +2184,7 @@ CONFIG_MFD_PCF50633=m CONFIG_PCF50633_ADC=m CONFIG_PCF50633_GPIO=m CONFIG_AB3100_CORE=m +CONFIG_AB3100_OTP=m CONFIG_REGULATOR=y # CONFIG_REGULATOR_DEBUG is not set # CONFIG_REGULATOR_FIXED_VOLTAGE is not set @@ -2001,10 +2192,14 @@ CONFIG_REGULATOR_VIRTUAL_CONSUMER=m CONFIG_REGULATOR_USERSPACE_CONSUMER=m CONFIG_REGULATOR_BQ24022=m CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX8660=m CONFIG_REGULATOR_WM8350=m CONFIG_REGULATOR_WM8400=m CONFIG_REGULATOR_PCF50633=m CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m CONFIG_MEDIA_SUPPORT=m
# @@ -2022,6 +2217,8 @@ CONFIG_VIDEO_MEDIA=m # CONFIG_VIDEO_SAA7146=m CONFIG_VIDEO_SAA7146_VV=m +CONFIG_IR_CORE=m +CONFIG_VIDEO_IR=m CONFIG_MEDIA_ATTACH=y CONFIG_MEDIA_TUNER=m # CONFIG_MEDIA_TUNER_CUSTOMISE is not set @@ -2042,13 +2239,13 @@ CONFIG_MEDIA_TUNER_XC5000=m CONFIG_MEDIA_TUNER_MXL5005S=m CONFIG_MEDIA_TUNER_MXL5007T=m CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m CONFIG_VIDEO_V4L2=m CONFIG_VIDEOBUF_GEN=m CONFIG_VIDEOBUF_DMA_SG=m CONFIG_VIDEOBUF_VMALLOC=m CONFIG_VIDEOBUF_DVB=m CONFIG_VIDEO_BTCX=m -CONFIG_VIDEO_IR=m CONFIG_VIDEO_TVEEPROM=m CONFIG_VIDEO_TUNER=m CONFIG_VIDEO_CAPTURE_DRIVERS=y @@ -2112,29 +2309,36 @@ CONFIG_VIDEO_AU0828=m CONFIG_VIDEO_IVTV=m CONFIG_VIDEO_FB_IVTV=m CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_SAA7164=m CONFIG_VIDEO_CAFE_CCIC=m CONFIG_SOC_CAMERA=m CONFIG_SOC_CAMERA_MT9M001=m CONFIG_SOC_CAMERA_MT9M111=m CONFIG_SOC_CAMERA_MT9T031=m +CONFIG_SOC_CAMERA_MT9T112=m CONFIG_SOC_CAMERA_MT9V022=m +CONFIG_SOC_CAMERA_RJ54N1=m CONFIG_SOC_CAMERA_TW9910=m CONFIG_SOC_CAMERA_PLATFORM=m CONFIG_SOC_CAMERA_OV772X=m +CONFIG_SOC_CAMERA_OV9640=m CONFIG_V4L_USB_DRIVERS=y CONFIG_USB_VIDEO_CLASS=m CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y CONFIG_USB_GSPCA=m CONFIG_USB_M5602=m CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m CONFIG_USB_GSPCA_CONEX=m CONFIG_USB_GSPCA_ETOMS=m CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m CONFIG_USB_GSPCA_MARS=m CONFIG_USB_GSPCA_MR97310A=m CONFIG_USB_GSPCA_OV519=m CONFIG_USB_GSPCA_OV534=m CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m CONFIG_USB_GSPCA_PAC7311=m CONFIG_USB_GSPCA_SN9C20X=m CONFIG_USB_GSPCA_SN9C20X_EVDEV=y @@ -2149,6 +2353,7 @@ CONFIG_USB_GSPCA_SPCA561=m CONFIG_USB_GSPCA_SQ905=m CONFIG_USB_GSPCA_SQ905C=m CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STV0680=m CONFIG_USB_GSPCA_SUNPLUS=m CONFIG_USB_GSPCA_T613=m CONFIG_USB_GSPCA_TV8532=m @@ -2172,10 +2377,14 @@ CONFIG_RADIO_ADAPTERS=y # CONFIG_RADIO_GEMTEK_PCI is not set # CONFIG_RADIO_MAXIRADIO is not set # CONFIG_RADIO_MAESTRO is not set +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI4713=m # CONFIG_USB_DSBR is not set -# CONFIG_USB_SI470X is not set +# CONFIG_RADIO_SI470X is not set # CONFIG_USB_MR800 is not set # CONFIG_RADIO_TEA5764 is not set +# CONFIG_RADIO_TEF6862 is not set +CONFIG_DVB_MAX_ADAPTERS=8 CONFIG_DVB_DYNAMIC_MINORS=y CONFIG_DVB_CAPTURE_DRIVERS=y
@@ -2222,6 +2431,8 @@ CONFIG_DVB_USB_ANYSEE=m CONFIG_DVB_USB_DTV5100=m CONFIG_DVB_USB_AF9015=m CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_FRIIO=m +CONFIG_DVB_USB_EC168=m CONFIG_DVB_TTUSB_BUDGET=m CONFIG_DVB_TTUSB_DEC=m CONFIG_SMS_SIANO_MDTV=m @@ -2253,6 +2464,22 @@ CONFIG_DVB_PLUTO2=m # Supported SDMC DM1105 Adapters # CONFIG_DVB_DM1105=m +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_FIREWIRE=y +# CONFIG_DVB_FIREDTV_IEEE1394 is not set +CONFIG_DVB_FIREDTV_INPUT=y + +# +# Supported Earthsoft PT1 Adapters +# +CONFIG_DVB_PT1=m + +# +# Supported Mantis Adapters +# +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m
# # Supported DVB Frontends @@ -2264,6 +2491,7 @@ CONFIG_DVB_CX24110=m CONFIG_DVB_CX24123=m CONFIG_DVB_MT312=m CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m CONFIG_DVB_S5H1420=m CONFIG_DVB_STV0288=m CONFIG_DVB_STB6000=m @@ -2280,6 +2508,8 @@ CONFIG_DVB_TDA826X=m CONFIG_DVB_TUA6100=m CONFIG_DVB_CX24116=m CONFIG_DVB_SI21XX=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m CONFIG_DVB_SP8870=m CONFIG_DVB_SP887X=m CONFIG_DVB_CX22700=m @@ -2295,6 +2525,7 @@ CONFIG_DVB_DIB7000M=m CONFIG_DVB_DIB7000P=m CONFIG_DVB_TDA10048=m CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m CONFIG_DVB_VES1820=m CONFIG_DVB_TDA10021=m CONFIG_DVB_TDA10023=m @@ -2308,12 +2539,14 @@ CONFIG_DVB_LGDT3305=m CONFIG_DVB_S5H1409=m CONFIG_DVB_AU8522=m CONFIG_DVB_S5H1411=m +CONFIG_DVB_DIB8000=m CONFIG_DVB_PLL=m CONFIG_DVB_TUNER_DIB0070=m CONFIG_DVB_LNBP21=m CONFIG_DVB_ISL6405=m CONFIG_DVB_ISL6421=m -CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m CONFIG_DAB=y CONFIG_USB_DABUSB=m
@@ -2331,6 +2564,7 @@ CONFIG_AGP_SIS=m CONFIG_AGP_SWORKS=m CONFIG_AGP_VIA=m CONFIG_AGP_EFFICEON=m +CONFIG_VGA_ARB=y # CONFIG_DRM is not set CONFIG_VGASTATE=m # CONFIG_VIDEO_OUTPUT_CONTROL is not set @@ -2394,7 +2628,6 @@ CONFIG_FB_MATROX_MYSTIQUE=y CONFIG_FB_MATROX_G=y CONFIG_FB_MATROX_I2C=m CONFIG_FB_MATROX_MAVEN=m -CONFIG_FB_MATROX_MULTIHEAD=y CONFIG_FB_RADEON=m CONFIG_FB_RADEON_I2C=y CONFIG_FB_RADEON_BACKLIGHT=y @@ -2473,6 +2706,7 @@ CONFIG_LOGO=y CONFIG_LOGO_LINUX_CLUT224=y CONFIG_SOUND=y CONFIG_SOUND_OSS_CORE=y +CONFIG_SOUND_OSS_CORE_PRECLAIM=y CONFIG_SND=m CONFIG_SND_TIMER=m CONFIG_SND_PCM=m @@ -2494,6 +2728,7 @@ CONFIG_SND_VERBOSE_PROCFS=y # CONFIG_SND_VERBOSE_PRINTK is not set # CONFIG_SND_DEBUG is not set CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y CONFIG_SND_RAWMIDI_SEQ=m CONFIG_SND_OPL3_LIB_SEQ=m # CONFIG_SND_OPL4_LIB_SEQ is not set @@ -2567,7 +2802,9 @@ CONFIG_SND_HDA_INTEL=m CONFIG_SND_HDA_HWDEP=y CONFIG_SND_HDA_RECONFIG=y CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 CONFIG_SND_HDA_INPUT_JACK=y +# CONFIG_SND_HDA_PATCH_LOADER is not set CONFIG_SND_HDA_CODEC_REALTEK=y CONFIG_SND_HDA_CODEC_ANALOG=y CONFIG_SND_HDA_CODEC_SIGMATEL=y @@ -2576,6 +2813,7 @@ CONFIG_SND_HDA_CODEC_ATIHDMI=y CONFIG_SND_HDA_CODEC_NVHDMI=y CONFIG_SND_HDA_CODEC_INTELHDMI=y CONFIG_SND_HDA_ELD=y +CONFIG_SND_HDA_CODEC_CIRRUS=y CONFIG_SND_HDA_CODEC_CONEXANT=y CONFIG_SND_HDA_CODEC_CA0110=y CONFIG_SND_HDA_CODEC_CMEDIA=y @@ -2619,8 +2857,12 @@ CONFIG_SND_PDAUDIOCF=m CONFIG_SND_SOC=m CONFIG_SND_SOC_I2C_AND_SPI=m CONFIG_SND_SOC_ALL_CODECS=m +CONFIG_SND_SOC_WM_HUBS=m CONFIG_SND_SOC_AD73311=m +CONFIG_SND_SOC_ADS117X=m CONFIG_SND_SOC_AK4535=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK4671=m CONFIG_SND_SOC_CS4270=m CONFIG_SND_SOC_L3=m CONFIG_SND_SOC_PCM3008=m @@ -2628,29 +2870,38 @@ CONFIG_SND_SOC_SPDIF=m CONFIG_SND_SOC_SSM2602=m CONFIG_SND_SOC_TLV320AIC23=m CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TLV320DAC33=m CONFIG_SND_SOC_UDA134X=m CONFIG_SND_SOC_UDA1380=m CONFIG_SND_SOC_WM8350=m CONFIG_SND_SOC_WM8400=m CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8727=m CONFIG_SND_SOC_WM8728=m CONFIG_SND_SOC_WM8731=m CONFIG_SND_SOC_WM8750=m CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8776=m CONFIG_SND_SOC_WM8900=m CONFIG_SND_SOC_WM8903=m CONFIG_SND_SOC_WM8940=m CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8961=m CONFIG_SND_SOC_WM8971=m +CONFIG_SND_SOC_WM8974=m CONFIG_SND_SOC_WM8988=m CONFIG_SND_SOC_WM8990=m +CONFIG_SND_SOC_WM8993=m CONFIG_SND_SOC_WM9081=m +CONFIG_SND_SOC_MAX9877=m +CONFIG_SND_SOC_TPA6130A2=m # CONFIG_SOUND_PRIME is not set CONFIG_AC97_BUS=m CONFIG_HID_SUPPORT=y CONFIG_HID=m -# CONFIG_HID_DEBUG is not set CONFIG_HIDRAW=y
# @@ -2674,6 +2925,7 @@ CONFIG_HID_DRAGONRISE=m CONFIG_HID_EZKEY=m CONFIG_HID_KYE=m CONFIG_HID_GYRATION=m +CONFIG_HID_TWINHAN=m CONFIG_HID_KENSINGTON=m CONFIG_HID_LOGITECH=m # CONFIG_LOGITECH_FF is not set @@ -2728,6 +2980,7 @@ CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OXU210HP_HCD=m CONFIG_USB_ISP116X_HCD=m CONFIG_USB_ISP1760_HCD=m +CONFIG_USB_ISP1362_HCD=m CONFIG_USB_OHCI_HCD=m CONFIG_USB_OHCI_HCD_SSB=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set @@ -2841,7 +3094,10 @@ CONFIG_LEDS_LP3944=m # CONFIG_LEDS_CLEVO_MAIL is not set CONFIG_LEDS_PCA955X=m CONFIG_LEDS_WM8350=m +CONFIG_LEDS_REGULATOR=m CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m
# # LED Triggers @@ -2885,6 +3141,7 @@ CONFIG_RTC_DRV_PCF8563=m CONFIG_RTC_DRV_PCF8583=m CONFIG_RTC_DRV_M41T80=m CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BQ32K=m CONFIG_RTC_DRV_S35390A=m CONFIG_RTC_DRV_FM3130=m CONFIG_RTC_DRV_RX8581=m @@ -2906,19 +3163,24 @@ CONFIG_RTC_DRV_STK17TA8=m CONFIG_RTC_DRV_M48T86=m CONFIG_RTC_DRV_M48T35=m CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m CONFIG_RTC_DRV_V3020=m CONFIG_RTC_DRV_WM8350=m CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m
# # on-CPU RTC drivers # +CONFIG_CS5535_CLOCK_EVENT_SRC=m CONFIG_DMADEVICES=y
# # DMA Devices # +CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y CONFIG_INTEL_IOATDMA=m CONFIG_DMA_ENGINE=y
@@ -2937,6 +3199,7 @@ CONFIG_UIO=m # CONFIG_UIO_SMX is not set # CONFIG_UIO_AEC is not set # CONFIG_UIO_SERCOS3 is not set +CONFIG_UIO_PCI_GENERIC=m
# # TI VLYNQ @@ -2956,6 +3219,7 @@ CONFIG_COMPAL_LAPTOP=m CONFIG_SONY_LAPTOP=m CONFIG_SONYPI_COMPAT=y CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y # CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set # CONFIG_THINKPAD_ACPI_DEBUG is not set # CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set @@ -2963,8 +3227,12 @@ CONFIG_THINKPAD_ACPI_VIDEO=y CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y CONFIG_INTEL_MENLOW=m CONFIG_ACPI_WMI=m +CONFIG_MSI_WMI=m CONFIG_ACPI_ASUS=m +CONFIG_TOPSTAR_LAPTOP=m CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_ACPI_CMPC=m
# # Firmware Drivers @@ -2990,10 +3258,10 @@ CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y CONFIG_EXT4_FS=m -# CONFIG_EXT4DEV_COMPAT is not set CONFIG_EXT4_FS_XATTR=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set CONFIG_FS_XIP=y CONFIG_JBD=m # CONFIG_JBD_DEBUG is not set @@ -3019,12 +3287,14 @@ CONFIG_XFS_RT=y # CONFIG_OCFS2_FS is not set CONFIG_BTRFS_FS=m CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_NILFS2_FS is not set CONFIG_FILE_LOCKING=y CONFIG_FSNOTIFY=y CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y # CONFIG_QUOTA is not set +CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QUOTACTL=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=m @@ -3039,6 +3309,7 @@ CONFIG_FSCACHE=m CONFIG_FSCACHE_STATS=y CONFIG_FSCACHE_HISTOGRAM=y # CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set CONFIG_CACHEFILES=m # CONFIG_CACHEFILES_DEBUG is not set CONFIG_CACHEFILES_HISTOGRAM=y @@ -3095,20 +3366,6 @@ CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set # CONFIG_EXOFS_FS is not set -# CONFIG_NILFS2_FS is not set -CONFIG_AUFS_FS=m -CONFIG_AUFS_BRANCH_MAX_127=y -# CONFIG_AUFS_BRANCH_MAX_511 is not set -# CONFIG_AUFS_BRANCH_MAX_1023 is not set -# CONFIG_AUFS_BRANCH_MAX_32767 is not set -# CONFIG_AUFS_HINOTIFY is not set -CONFIG_AUFS_EXPORT=y -# CONFIG_AUFS_RDU is not set -# CONFIG_AUFS_SHWH is not set -# CONFIG_AUFS_BR_RAMFS is not set -# CONFIG_AUFS_BR_FUSE is not set -# CONFIG_AUFS_DEBUG is not set -CONFIG_AUFS_BDEV_LOOP=y CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=m CONFIG_NFS_V3=y @@ -3197,13 +3454,13 @@ CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y +CONFIG_STRIP_ASM_SYMS=y # CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_FS=y # CONFIG_HEADERS_CHECK is not set # CONFIG_DEBUG_KERNEL is not set # CONFIG_SLUB_DEBUG_ON is not set # CONFIG_SLUB_STATS is not set -CONFIG_STACKTRACE=y CONFIG_DEBUG_BUGVERBOSE=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_ARCH_WANT_FRAME_POINTERS=y @@ -3211,18 +3468,15 @@ CONFIG_ARCH_WANT_FRAME_POINTERS=y # CONFIG_RCU_CPU_STALL_DETECTOR is not set CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_TRACING=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y CONFIG_TRACING_SUPPORT=y # CONFIG_FTRACE is not set # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set @@ -3268,6 +3522,7 @@ CONFIG_GRKERNSEC_CUSTOM=y # Address Space Protection # # CONFIG_GRKERNSEC_KMEM is not set +CONFIG_GRKERNSEC_VM86=y # CONFIG_GRKERNSEC_IO is not set CONFIG_GRKERNSEC_PROC_MEMMAP=y CONFIG_GRKERNSEC_BRUTE=y @@ -3288,6 +3543,7 @@ CONFIG_GRKERNSEC_ACL_TIMEOUT=30 # CONFIG_GRKERNSEC_PROC is not set CONFIG_GRKERNSEC_LINK=y CONFIG_GRKERNSEC_FIFO=y +# CONFIG_GRKERNSEC_ROFS is not set CONFIG_GRKERNSEC_CHROOT=y CONFIG_GRKERNSEC_CHROOT_MOUNT=y CONFIG_GRKERNSEC_CHROOT_DOUBLE=y @@ -3310,9 +3566,9 @@ CONFIG_GRKERNSEC_CHROOT_CAPS=y # CONFIG_GRKERNSEC_EXECLOG is not set CONFIG_GRKERNSEC_RESLOG=y CONFIG_GRKERNSEC_CHROOT_EXECLOG=y +CONFIG_GRKERNSEC_AUDIT_PTRACE=y # CONFIG_GRKERNSEC_AUDIT_CHDIR is not set CONFIG_GRKERNSEC_AUDIT_MOUNT=y -CONFIG_GRKERNSEC_AUDIT_IPC=y 
CONFIG_GRKERNSEC_SIGNAL=y CONFIG_GRKERNSEC_FORKFAIL=y CONFIG_GRKERNSEC_TIME=y @@ -3371,6 +3627,7 @@ CONFIG_PAX_EMUTRAMP=y CONFIG_PAX_MPROTECT=y CONFIG_PAX_NOELFRELOCS=y CONFIG_PAX_KERNEXEC=y +CONFIG_PAX_KERNEXEC_MODULE_TEXT=4
# # Address Space Layout Randomization @@ -3393,13 +3650,22 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_NETWORK_XFRM=y # CONFIG_SECURITY_PATH is not set -CONFIG_SECURITY_FILE_CAPABILITIES=y +CONFIG_INTEL_TXT=y # CONFIG_SECURITY_TOMOYO is not set # CONFIG_IMA is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_DEFAULT_SECURITY="" CONFIG_XOR_BLOCKS=m CONFIG_ASYNC_CORE=m CONFIG_ASYNC_MEMCPY=m CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_ASYNC_TX_DISABLE_PQ_VAL_DMA=y +CONFIG_ASYNC_TX_DISABLE_XOR_VAL_DMA=y CONFIG_CRYPTO=y
# @@ -3449,12 +3715,14 @@ CONFIG_CRYPTO_PCBC=m # CONFIG_CRYPTO_HMAC=m # CONFIG_CRYPTO_XCBC is not set +CONFIG_CRYPTO_VMAC=m
# # Digest # CONFIG_CRYPTO_CRC32C=m CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_GHASH=m # CONFIG_CRYPTO_MD4 is not set CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m @@ -3511,17 +3779,18 @@ CONFIG_CRYPTO_DEV_HIFN_795X=m CONFIG_CRYPTO_DEV_HIFN_795X_RNG=y CONFIG_HAVE_KVM=y CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_APIC_ARCHITECTURE=y CONFIG_VIRTUALIZATION=y CONFIG_KVM=m CONFIG_KVM_INTEL=m CONFIG_KVM_AMD=m -# CONFIG_KVM_TRACE is not set CONFIG_LGUEST=m CONFIG_VIRTIO=m CONFIG_VIRTIO_RING=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m -CONFIG_BINARY_PRINTF=y +# CONFIG_BINARY_PRINTF is not set
 #
 # Library routines
 #
@@ -3535,15 +3804,16 @@ CONFIG_CRC16=m
 CONFIG_CRC_T10DIF=m
 CONFIG_CRC_ITU_T=m
 CONFIG_CRC32=y
-# CONFIG_CRC7 is not set
+CONFIG_CRC7=m
 CONFIG_LIBCRC32C=m
 CONFIG_ZLIB_INFLATE=y
 CONFIG_ZLIB_DEFLATE=m
 CONFIG_LZO_COMPRESS=m
-CONFIG_LZO_DECOMPRESS=m
+CONFIG_LZO_DECOMPRESS=y
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_BZIP2=y
 CONFIG_DECOMPRESS_LZMA=y
+CONFIG_DECOMPRESS_LZO=y
 CONFIG_TEXTSEARCH=y
 CONFIG_TEXTSEARCH_KMP=m
 CONFIG_TEXTSEARCH_BM=m
diff --git a/pkgs/core/kernel/kernel.nm b/pkgs/core/kernel/kernel.nm
index 27fc3f9..f8f5dcf 100644
--- a/pkgs/core/kernel/kernel.nm
+++ b/pkgs/core/kernel/kernel.nm
@@ -25,7 +25,7 @@ include $(PKGROOT)/Include

 PKG_NAME = linux
-PKG_VER = 2.6.31.1
+PKG_VER = 2.6.33.1
 PKG_REL = 0

 PKG_MAINTAINER = Michael Tremer <michael.tremer@ipfire.org>
@@ -34,6 +34,8 @@ PKG_URL = http://www.kernel.org/
 PKG_LICENSE = GPLv2
 PKG_SUMMARY = The Linux kernel.

+PKG_BUILD_DEPS = # No build dependencies
+
 define PKG_DESCRIPTION
 	The kernel package contains the Linux kernel (vmlinuz), the core of any \
 	Linux operating system. The kernel handles the basic functions \
@@ -44,7 +46,7 @@ endef
 CFLAGS =
 CXXFLAGS =

-LOCALVERSION = -ipfire1
+LOCALVERSION = -$(DISTRO_SNAME)$(PKG_REL)
 FULLVER = $(PKG_VER)$(LOCALVERSION)

 PKG_TARBALL = $(THISAPP).tar.bz2
@@ -54,7 +56,7 @@ PKG_TARBALL = $(THISAPP).tar.bz2
 ###############################################################################

 define STAGE_PREPARE_CMDS
-	cd $(DIR_APP) && echo "$(LOCALVERSION)" > localversion-ipfire
+	cd $(DIR_APP) && echo "$(LOCALVERSION)" > localversion-$(DISTRO_SNAME)
 	rm -f $(DIR_APP)/localversion-grsec
cd $(DIR_APP) && sed -e "s/^HOSTCFLAGS.*=.*/& -fPIC/g" -i Makefile diff --git a/pkgs/core/kernel/patches/aufs2-2.6.31.1-1.patch b/pkgs/core/kernel/patches/aufs2-2.6.31.1-1.patch deleted file mode 100644 index 1f6f612..0000000 --- a/pkgs/core/kernel/patches/aufs2-2.6.31.1-1.patch +++ /dev/null @@ -1,25456 +0,0 @@ -diff -Nur linux-2.6.31-vanilla/Documentation/ABI/testing/debugfs-aufs linux-2.6.31/Documentation/ABI/testing/debugfs-aufs ---- linux-2.6.31-vanilla/Documentation/ABI/testing/debugfs-aufs 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/Documentation/ABI/testing/debugfs-aufs 2009-09-16 13:55:29.000000000 +0200 -@@ -0,0 +1,40 @@ -+What: /debug/aufs/si_<id>/ -+Date: March 2009 -+Contact: J. R. Okajima hooanon05@yahoo.co.jp -+Description: -+ Under /debug/aufs, a directory named si_<id> is created -+ per aufs mount, where <id> is a unique id generated -+ internally. -+ -+What: /debug/aufs/si_<id>/xib -+Date: March 2009 -+Contact: J. R. Okajima hooanon05@yahoo.co.jp -+Description: -+ It shows the consumed blocks by xib (External Inode Number -+ Bitmap), its block size and file size. -+ When the aufs mount option 'noxino' is specified, it -+ will be empty. About XINO files, see -+ Documentation/filesystems/aufs/aufs.5 in detail. -+ -+What: /debug/aufs/si_<id>/xino0, xino1 ... xinoN -+Date: March 2009 -+Contact: J. R. Okajima hooanon05@yahoo.co.jp -+Description: -+ It shows the consumed blocks by xino (External Inode Number -+ Translation Table), its link count, block size and file -+ size. -+ When the aufs mount option 'noxino' is specified, it -+ will be empty. About XINO files, see -+ Documentation/filesystems/aufs/aufs.5 in detail. -+ -+What: /debug/aufs/si_<id>/xigen -+Date: March 2009 -+Contact: J. R. Okajima hooanon05@yahoo.co.jp -+Description: -+ It shows the consumed blocks by xigen (External Inode -+ Generation Table), its block size and file size. -+ If CONFIG_AUFS_EXPORT is disabled, this entry will not -+ be created. -+ When the aufs mount option 'noxino' is specified, it -+ will be empty. About XINO files, see -+ Documentation/filesystems/aufs/aufs.5 in detail. -diff -Nur linux-2.6.31-vanilla/Documentation/ABI/testing/sysfs-aufs linux-2.6.31/Documentation/ABI/testing/sysfs-aufs ---- linux-2.6.31-vanilla/Documentation/ABI/testing/sysfs-aufs 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/Documentation/ABI/testing/sysfs-aufs 2009-09-16 13:55:29.000000000 +0200 -@@ -0,0 +1,25 @@ -+What: /sys/fs/aufs/si_<id>/ -+Date: March 2009 -+Contact: J. R. Okajima hooanon05@yahoo.co.jp -+Description: -+ Under /sys/fs/aufs, a directory named si_<id> is created -+ per aufs mount, where <id> is a unique id generated -+ internally. -+ -+What: /sys/fs/aufs/si_<id>/br0, br1 ... brN -+Date: March 2009 -+Contact: J. R. Okajima hooanon05@yahoo.co.jp -+Description: -+ It shows the abolute path of a member directory (which -+ is called branch) in aufs, and its permission. -+ -+What: /sys/fs/aufs/si_<id>/xi_path -+Date: March 2009 -+Contact: J. R. Okajima hooanon05@yahoo.co.jp -+Description: -+ It shows the abolute path of XINO (External Inode Number -+ Bitmap, Translation Table and Generation Table) file -+ even if it is the default path. -+ When the aufs mount option 'noxino' is specified, it -+ will be empty. About XINO files, see -+ Documentation/filesystems/aufs/aufs.5 in detail. 
-diff -Nur linux-2.6.31-vanilla/fs/aufs/aufs.h linux-2.6.31/fs/aufs/aufs.h ---- linux-2.6.31-vanilla/fs/aufs/aufs.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/aufs.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,51 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * all header files -+ */ -+ -+#ifndef __AUFS_H__ -+#define __AUFS_H__ -+ -+#ifdef __KERNEL__ -+ -+#include "debug.h" -+ -+#include "branch.h" -+#include "cpup.h" -+#include "dcsub.h" -+#include "dbgaufs.h" -+#include "dentry.h" -+#include "dir.h" -+#include "file.h" -+#include "fstype.h" -+#include "inode.h" -+#include "loop.h" -+#include "module.h" -+#include "opts.h" -+#include "rwsem.h" -+#include "spl.h" -+#include "super.h" -+#include "sysaufs.h" -+#include "vfsub.h" -+#include "whout.h" -+#include "wkq.h" -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/branch.c linux-2.6.31/fs/aufs/branch.c ---- linux-2.6.31-vanilla/fs/aufs/branch.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/branch.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,969 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * branch management -+ */ -+ -+#include <linux/file.h> -+#include "aufs.h" -+ -+/* -+ * free a single branch -+ */ -+static void au_br_do_free(struct au_branch *br) -+{ -+ int i; -+ struct au_wbr *wbr; -+ -+ if (br->br_xino.xi_file) -+ fput(br->br_xino.xi_file); -+ mutex_destroy(&br->br_xino.xi_nondir_mtx); -+ -+ AuDebugOn(atomic_read(&br->br_count)); -+ -+ wbr = br->br_wbr; -+ if (wbr) { -+ for (i = 0; i < AuBrWh_Last; i++) -+ dput(wbr->wbr_wh[i]); -+ AuDebugOn(atomic_read(&wbr->wbr_wh_running)); -+ AuRwDestroy(&wbr->wbr_wh_rwsem); -+ } -+ -+ /* some filesystems acquire extra lock */ -+ lockdep_off(); -+ mntput(br->br_mnt); -+ lockdep_on(); -+ -+ kfree(wbr); -+ kfree(br); -+} -+ -+/* -+ * frees all branches -+ */ -+void au_br_free(struct au_sbinfo *sbinfo) -+{ -+ aufs_bindex_t bmax; -+ struct au_branch **br; -+ -+ AuRwMustWriteLock(&sbinfo->si_rwsem); -+ -+ bmax = sbinfo->si_bend + 1; -+ br = sbinfo->si_branch; -+ while (bmax--) -+ au_br_do_free(*br++); -+} -+ -+/* -+ * find the index of a branch which is specified by @br_id. -+ */ -+int au_br_index(struct super_block *sb, aufs_bindex_t br_id) -+{ -+ aufs_bindex_t bindex, bend; -+ -+ bend = au_sbend(sb); -+ for (bindex = 0; bindex <= bend; bindex++) -+ if (au_sbr_id(sb, bindex) == br_id) -+ return bindex; -+ return -1; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * add a branch -+ */ -+ -+static int test_overlap(struct super_block *sb, struct dentry *h_d1, -+ struct dentry *h_d2) -+{ -+ if (unlikely(h_d1 == h_d2)) -+ return 1; -+ return !!au_test_subdir(h_d1, h_d2) -+ || !!au_test_subdir(h_d2, h_d1) -+ || au_test_loopback_overlap(sb, h_d1, h_d2) -+ || au_test_loopback_overlap(sb, h_d2, h_d1); -+} -+ -+/* -+ * returns a newly allocated branch. @new_nbranch is a number of branches -+ * after adding a branch. -+ */ -+static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, -+ int perm) -+{ -+ struct au_branch *add_branch; -+ struct dentry *root; -+ -+ root = sb->s_root; -+ add_branch = kmalloc(sizeof(*add_branch), GFP_NOFS); -+ if (unlikely(!add_branch)) -+ goto out; -+ -+ add_branch->br_wbr = NULL; -+ if (au_br_writable(perm)) { -+ /* may be freed separately at changing the branch permission */ -+ add_branch->br_wbr = kmalloc(sizeof(*add_branch->br_wbr), -+ GFP_NOFS); -+ if (unlikely(!add_branch->br_wbr)) -+ goto out_br; -+ } -+ -+ if (unlikely(au_sbr_realloc(au_sbi(sb), new_nbranch) -+ || au_di_realloc(au_di(root), new_nbranch) -+ || au_ii_realloc(au_ii(root->d_inode), new_nbranch))) -+ goto out_wbr; -+ return add_branch; /* success */ -+ -+ out_wbr: -+ kfree(add_branch->br_wbr); -+ out_br: -+ kfree(add_branch); -+ out: -+ return ERR_PTR(-ENOMEM); -+} -+ -+/* -+ * test if the branch permission is legal or not. 
-+ */ -+static int test_br(struct inode *inode, int brperm, char *path) -+{ -+ int err; -+ -+ err = 0; -+ if (unlikely(au_br_writable(brperm) && IS_RDONLY(inode))) { -+ AuErr("write permission for readonly mount or inode, %s\n", -+ path); -+ err = -EINVAL; -+ } -+ -+ return err; -+} -+ -+/* -+ * returns: -+ * 0: success, the caller will add it -+ * plus: success, it is already unified, the caller should ignore it -+ * minus: error -+ */ -+static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) -+{ -+ int err; -+ aufs_bindex_t bend, bindex; -+ struct dentry *root; -+ struct inode *inode, *h_inode; -+ -+ root = sb->s_root; -+ bend = au_sbend(sb); -+ if (unlikely(bend >= 0 -+ && au_find_dbindex(root, add->path.dentry) >= 0)) { -+ err = 1; -+ if (!remount) { -+ err = -EINVAL; -+ AuErr("%s duplicated\n", add->pathname); -+ } -+ goto out; -+ } -+ -+ err = -ENOSPC; /* -E2BIG; */ -+ if (unlikely(AUFS_BRANCH_MAX <= add->bindex -+ || AUFS_BRANCH_MAX - 1 <= bend)) { -+ AuErr("number of branches exceeded %s\n", add->pathname); -+ goto out; -+ } -+ -+ err = -EDOM; -+ if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) { -+ AuErr("bad index %d\n", add->bindex); -+ goto out; -+ } -+ -+ inode = add->path.dentry->d_inode; -+ err = -ENOENT; -+ if (unlikely(!inode->i_nlink)) { -+ AuErr("no existence %s\n", add->pathname); -+ goto out; -+ } -+ -+ err = -EINVAL; -+ if (unlikely(inode->i_sb == sb)) { -+ AuErr("%s must be outside\n", add->pathname); -+ goto out; -+ } -+ -+ if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { -+ AuErr("unsupported filesystem, %s (%s)\n", -+ add->pathname, au_sbtype(inode->i_sb)); -+ goto out; -+ } -+ -+ err = test_br(add->path.dentry->d_inode, add->perm, add->pathname); -+ if (unlikely(err)) -+ goto out; -+ -+ if (bend < 0) -+ return 0; /* success */ -+ -+ err = -EINVAL; -+ for (bindex = 0; bindex <= bend; bindex++) -+ if (unlikely(test_overlap(sb, add->path.dentry, -+ au_h_dptr(root, bindex)))) { -+ AuErr("%s is overlapped\n", add->pathname); -+ goto out; -+ } -+ -+ err = 0; -+ if (au_opt_test(au_mntflags(sb), WARN_PERM)) { -+ h_inode = au_h_dptr(root, 0)->d_inode; -+ if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) -+ || h_inode->i_uid != inode->i_uid -+ || h_inode->i_gid != inode->i_gid) -+ AuWarn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", -+ add->pathname, -+ inode->i_uid, inode->i_gid, -+ (inode->i_mode & S_IALLUGO), -+ h_inode->i_uid, h_inode->i_gid, -+ (h_inode->i_mode & S_IALLUGO)); -+ } -+ -+ out: -+ return err; -+} -+ -+/* -+ * initialize or clean the whiteouts for an adding branch -+ */ -+static int au_br_init_wh(struct super_block *sb, struct au_branch *br, -+ int new_perm, struct dentry *h_root) -+{ -+ int err, old_perm; -+ aufs_bindex_t bindex; -+ struct mutex *h_mtx; -+ struct au_wbr *wbr; -+ struct au_hinode *hdir; -+ -+ wbr = br->br_wbr; -+ old_perm = br->br_perm; -+ br->br_perm = new_perm; -+ hdir = NULL; -+ h_mtx = NULL; -+ bindex = au_br_index(sb, br->br_id); -+ if (0 <= bindex) { -+ hdir = au_hi(sb->s_root->d_inode, bindex); -+ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); -+ } else { -+ h_mtx = &h_root->d_inode->i_mutex; -+ mutex_lock_nested(h_mtx, AuLsc_I_PARENT); -+ } -+ if (!wbr) -+ err = au_wh_init(h_root, br, sb); -+ else { -+ wbr_wh_write_lock(wbr); -+ err = au_wh_init(h_root, br, sb); -+ wbr_wh_write_unlock(wbr); -+ } -+ if (hdir) -+ au_hin_imtx_unlock(hdir); -+ else -+ mutex_unlock(h_mtx); -+ br->br_perm = old_perm; -+ -+ if (!err && wbr && !au_br_writable(new_perm)) { -+ kfree(wbr); -+ br->br_wbr = NULL; 
-+ } -+ -+ return err; -+} -+ -+static int au_wbr_init(struct au_branch *br, struct super_block *sb, -+ int perm, struct path *path) -+{ -+ int err; -+ struct au_wbr *wbr; -+ -+ wbr = br->br_wbr; -+ au_rw_init(&wbr->wbr_wh_rwsem); -+ memset(wbr->wbr_wh, 0, sizeof(wbr->wbr_wh)); -+ atomic_set(&wbr->wbr_wh_running, 0); -+ wbr->wbr_bytes = 0; -+ -+ err = au_br_init_wh(sb, br, perm, path->dentry); -+ -+ return err; -+} -+ -+/* intialize a new branch */ -+static int au_br_init(struct au_branch *br, struct super_block *sb, -+ struct au_opt_add *add) -+{ -+ int err; -+ -+ err = 0; -+ memset(&br->br_xino, 0, sizeof(br->br_xino)); -+ mutex_init(&br->br_xino.xi_nondir_mtx); -+ br->br_perm = add->perm; -+ br->br_mnt = add->path.mnt; /* set first, mntget() later */ -+ atomic_set(&br->br_count, 0); -+ br->br_xino_upper = AUFS_XINO_TRUNC_INIT; -+ atomic_set(&br->br_xino_running, 0); -+ br->br_id = au_new_br_id(sb); -+ -+ if (au_br_writable(add->perm)) { -+ err = au_wbr_init(br, sb, add->perm, &add->path); -+ if (unlikely(err)) -+ goto out; -+ } -+ -+ if (au_opt_test(au_mntflags(sb), XINO)) { -+ err = au_xino_br(sb, br, add->path.dentry->d_inode->i_ino, -+ au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); -+ if (unlikely(err)) { -+ AuDebugOn(br->br_xino.xi_file); -+ goto out; -+ } -+ } -+ -+ sysaufs_br_init(br); -+ mntget(add->path.mnt); -+ -+ out: -+ return err; -+} -+ -+static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, -+ struct au_branch *br, aufs_bindex_t bend, -+ aufs_bindex_t amount) -+{ -+ struct au_branch **brp; -+ -+ AuRwMustWriteLock(&sbinfo->si_rwsem); -+ -+ brp = sbinfo->si_branch + bindex; -+ memmove(brp + 1, brp, sizeof(*brp) * amount); -+ *brp = br; -+ sbinfo->si_bend++; -+ if (unlikely(bend < 0)) -+ sbinfo->si_bend = 0; -+} -+ -+static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, -+ aufs_bindex_t bend, aufs_bindex_t amount) -+{ -+ struct au_hdentry *hdp; -+ -+ AuRwMustWriteLock(&dinfo->di_rwsem); -+ -+ hdp = dinfo->di_hdentry + bindex; -+ memmove(hdp + 1, hdp, sizeof(*hdp) * amount); -+ au_h_dentry_init(hdp); -+ dinfo->di_bend++; -+ if (unlikely(bend < 0)) -+ dinfo->di_bstart = 0; -+} -+ -+static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, -+ aufs_bindex_t bend, aufs_bindex_t amount) -+{ -+ struct au_hinode *hip; -+ -+ AuRwMustWriteLock(&iinfo->ii_rwsem); -+ -+ hip = iinfo->ii_hinode + bindex; -+ memmove(hip + 1, hip, sizeof(*hip) * amount); -+ hip->hi_inode = NULL; -+ au_hin_init(hip, NULL); -+ iinfo->ii_bend++; -+ if (unlikely(bend < 0)) -+ iinfo->ii_bstart = 0; -+} -+ -+static void au_br_do_add(struct super_block *sb, struct dentry *h_dentry, -+ struct au_branch *br, aufs_bindex_t bindex) -+{ -+ struct dentry *root; -+ struct inode *root_inode; -+ aufs_bindex_t bend, amount; -+ -+ root = sb->s_root; -+ root_inode = root->d_inode; -+ au_plink_block_maintain(sb); -+ bend = au_sbend(sb); -+ amount = bend + 1 - bindex; -+ au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount); -+ au_br_do_add_hdp(au_di(root), bindex, bend, amount); -+ au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount); -+ au_set_h_dptr(root, bindex, dget(h_dentry)); -+ au_set_h_iptr(root_inode, bindex, au_igrab(h_dentry->d_inode), -+ /*flags*/0); -+} -+ -+int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) -+{ -+ int err; -+ aufs_bindex_t bend, add_bindex; -+ struct dentry *root, *h_dentry; -+ struct inode *root_inode; -+ struct au_branch *add_branch; -+ -+ root = sb->s_root; -+ root_inode = root->d_inode; -+ 
IMustLock(root_inode); -+ err = test_add(sb, add, remount); -+ if (unlikely(err < 0)) -+ goto out; -+ if (err) { -+ err = 0; -+ goto out; /* success */ -+ } -+ -+ bend = au_sbend(sb); -+ add_branch = au_br_alloc(sb, bend + 2, add->perm); -+ err = PTR_ERR(add_branch); -+ if (IS_ERR(add_branch)) -+ goto out; -+ -+ err = au_br_init(add_branch, sb, add); -+ if (unlikely(err)) { -+ au_br_do_free(add_branch); -+ goto out; -+ } -+ -+ add_bindex = add->bindex; -+ h_dentry = add->path.dentry; -+ if (!remount) -+ au_br_do_add(sb, h_dentry, add_branch, add_bindex); -+ else { -+ sysaufs_brs_del(sb, add_bindex); -+ au_br_do_add(sb, h_dentry, add_branch, add_bindex); -+ sysaufs_brs_add(sb, add_bindex); -+ } -+ -+ if (!add_bindex) { -+ au_cpup_attr_all(root_inode, /*force*/1); -+ sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; -+ } else -+ au_add_nlink(root_inode, h_dentry->d_inode); -+ -+ /* -+ * this test/set prevents aufs from handling unnecessary inotify events -+ * of xino files, in a case of re-adding a writable branch which was -+ * once detached from aufs. -+ */ -+ if (au_xino_brid(sb) < 0 -+ && au_br_writable(add_branch->br_perm) -+ && !au_test_fs_bad_xino(h_dentry->d_sb) -+ && add_branch->br_xino.xi_file -+ && add_branch->br_xino.xi_file->f_dentry->d_parent == h_dentry) -+ au_xino_brid_set(sb, add_branch->br_id); -+ -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * delete a branch -+ */ -+ -+/* to show the line number, do not make it inlined function */ -+#define AuVerbose(do_info, fmt, args...) do { \ -+ if (do_info) \ -+ AuInfo(fmt, ##args); \ -+} while (0) -+ -+/* -+ * test if the branch is deletable or not. -+ */ -+static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, -+ unsigned int sigen) -+{ -+ int err, i, j, ndentry; -+ aufs_bindex_t bstart, bend; -+ unsigned char verbose; -+ struct au_dcsub_pages dpages; -+ struct au_dpage *dpage; -+ struct dentry *d; -+ struct inode *inode; -+ -+ err = au_dpages_init(&dpages, GFP_NOFS); -+ if (unlikely(err)) -+ goto out; -+ err = au_dcsub_pages(&dpages, root, NULL, NULL); -+ if (unlikely(err)) -+ goto out_dpages; -+ -+ verbose = !!au_opt_test(au_mntflags(root->d_sb), VERBOSE); -+ for (i = 0; !err && i < dpages.ndpage; i++) { -+ dpage = dpages.dpages + i; -+ ndentry = dpage->ndentry; -+ for (j = 0; !err && j < ndentry; j++) { -+ d = dpage->dentries[j]; -+ AuDebugOn(!atomic_read(&d->d_count)); -+ inode = d->d_inode; -+ if (au_digen(d) == sigen && au_iigen(inode) == sigen) -+ di_read_lock_child(d, AuLock_IR); -+ else { -+ di_write_lock_child(d); -+ err = au_reval_dpath(d, sigen); -+ if (!err) -+ di_downgrade_lock(d, AuLock_IR); -+ else { -+ di_write_unlock(d); -+ break; -+ } -+ } -+ -+ bstart = au_dbstart(d); -+ bend = au_dbend(d); -+ if (bstart <= bindex -+ && bindex <= bend -+ && au_h_dptr(d, bindex) -+ && (!S_ISDIR(inode->i_mode) || bstart == bend)) { -+ err = -EBUSY; -+ AuVerbose(verbose, "busy %.*s\n", AuDLNPair(d)); -+ } -+ di_read_unlock(d, AuLock_IR); -+ } -+ } -+ -+ out_dpages: -+ au_dpages_free(&dpages); -+ out: -+ return err; -+} -+ -+static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, -+ unsigned int sigen) -+{ -+ int err; -+ struct inode *i; -+ aufs_bindex_t bstart, bend; -+ unsigned char verbose; -+ -+ err = 0; -+ verbose = !!au_opt_test(au_mntflags(sb), VERBOSE); -+ list_for_each_entry(i, &sb->s_inodes, i_sb_list) { -+ AuDebugOn(!atomic_read(&i->i_count)); -+ if (!list_empty(&i->i_dentry)) -+ continue; -+ -+ if (au_iigen(i) == sigen) 
-+ ii_read_lock_child(i); -+ else { -+ ii_write_lock_child(i); -+ err = au_refresh_hinode_self(i, /*do_attr*/1); -+ if (!err) -+ ii_downgrade_lock(i); -+ else { -+ ii_write_unlock(i); -+ break; -+ } -+ } -+ -+ bstart = au_ibstart(i); -+ bend = au_ibend(i); -+ if (bstart <= bindex -+ && bindex <= bend -+ && au_h_iptr(i, bindex) -+ && (!S_ISDIR(i->i_mode) || bstart == bend)) { -+ err = -EBUSY; -+ AuVerbose(verbose, "busy i%lu\n", i->i_ino); -+ ii_read_unlock(i); -+ break; -+ } -+ ii_read_unlock(i); -+ } -+ -+ return err; -+} -+ -+static int test_children_busy(struct dentry *root, aufs_bindex_t bindex) -+{ -+ int err; -+ unsigned int sigen; -+ -+ sigen = au_sigen(root->d_sb); -+ DiMustNoWaiters(root); -+ IiMustNoWaiters(root->d_inode); -+ di_write_unlock(root); -+ err = test_dentry_busy(root, bindex, sigen); -+ if (!err) -+ err = test_inode_busy(root->d_sb, bindex, sigen); -+ di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ -+ -+ return err; -+} -+ -+static void au_br_do_del_brp(struct au_sbinfo *sbinfo, -+ const aufs_bindex_t bindex, -+ const aufs_bindex_t bend) -+{ -+ struct au_branch **brp, **p; -+ -+ AuRwMustWriteLock(&sbinfo->si_rwsem); -+ -+ brp = sbinfo->si_branch + bindex; -+ if (bindex < bend) -+ memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex)); -+ sbinfo->si_branch[0 + bend] = NULL; -+ sbinfo->si_bend--; -+ -+ p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, GFP_NOFS); -+ if (p) -+ sbinfo->si_branch = p; -+} -+ -+static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, -+ const aufs_bindex_t bend) -+{ -+ struct au_hdentry *hdp, *p; -+ -+ AuRwMustWriteLock(&dinfo->di_rwsem); -+ -+ hdp = dinfo->di_hdentry + bindex; -+ if (bindex < bend) -+ memmove(hdp, hdp + 1, sizeof(*hdp) * (bend - bindex)); -+ dinfo->di_hdentry[0 + bend].hd_dentry = NULL; -+ dinfo->di_bend--; -+ -+ p = krealloc(dinfo->di_hdentry, sizeof(*p) * bend, GFP_NOFS); -+ if (p) -+ dinfo->di_hdentry = p; -+} -+ -+static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, -+ const aufs_bindex_t bend) -+{ -+ struct au_hinode *hip, *p; -+ -+ AuRwMustWriteLock(&iinfo->ii_rwsem); -+ -+ hip = iinfo->ii_hinode + bindex; -+ if (bindex < bend) -+ memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex)); -+ iinfo->ii_hinode[0 + bend].hi_inode = NULL; -+ au_hin_init(iinfo->ii_hinode + bend, NULL); -+ iinfo->ii_bend--; -+ -+ p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, GFP_NOFS); -+ if (p) -+ iinfo->ii_hinode = p; -+} -+ -+static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, -+ struct au_branch *br) -+{ -+ aufs_bindex_t bend; -+ struct au_sbinfo *sbinfo; -+ struct dentry *root; -+ struct inode *inode; -+ -+ SiMustWriteLock(sb); -+ -+ root = sb->s_root; -+ inode = root->d_inode; -+ au_plink_block_maintain(sb); -+ sbinfo = au_sbi(sb); -+ bend = sbinfo->si_bend; -+ -+ dput(au_h_dptr(root, bindex)); -+ au_hiput(au_hi(inode, bindex)); -+ au_br_do_free(br); -+ -+ au_br_do_del_brp(sbinfo, bindex, bend); -+ au_br_do_del_hdp(au_di(root), bindex, bend); -+ au_br_do_del_hip(au_ii(inode), bindex, bend); -+} -+ -+int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) -+{ -+ int err, rerr, i; -+ unsigned int mnt_flags; -+ aufs_bindex_t bindex, bend, br_id; -+ unsigned char do_wh, verbose; -+ struct au_branch *br; -+ struct au_wbr *wbr; -+ -+ err = 0; -+ bindex = au_find_dbindex(sb->s_root, del->h_path.dentry); -+ if (bindex < 0) { -+ if (remount) -+ goto out; /* success */ -+ err = -ENOENT; -+ AuErr("%s no such branch\n", del->pathname); 
-+ goto out; -+ } -+ AuDbg("bindex b%d\n", bindex); -+ -+ err = -EBUSY; -+ mnt_flags = au_mntflags(sb); -+ verbose = !!au_opt_test(mnt_flags, VERBOSE); -+ bend = au_sbend(sb); -+ if (unlikely(!bend)) { -+ AuVerbose(verbose, "no more branches left\n"); -+ goto out; -+ } -+ br = au_sbr(sb, bindex); -+ i = atomic_read(&br->br_count); -+ if (unlikely(i)) { -+ AuVerbose(verbose, "%d file(s) opened\n", i); -+ goto out; -+ } -+ -+ wbr = br->br_wbr; -+ do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); -+ if (do_wh) { -+ /* instead of WbrWhMustWriteLock(wbr) */ -+ SiMustWriteLock(sb); -+ for (i = 0; i < AuBrWh_Last; i++) { -+ dput(wbr->wbr_wh[i]); -+ wbr->wbr_wh[i] = NULL; -+ } -+ } -+ -+ err = test_children_busy(sb->s_root, bindex); -+ if (unlikely(err)) { -+ if (do_wh) -+ goto out_wh; -+ goto out; -+ } -+ -+ err = 0; -+ br_id = br->br_id; -+ if (!remount) -+ au_br_do_del(sb, bindex, br); -+ else { -+ sysaufs_brs_del(sb, bindex); -+ au_br_do_del(sb, bindex, br); -+ sysaufs_brs_add(sb, bindex); -+ } -+ -+ if (!bindex) { -+ au_cpup_attr_all(sb->s_root->d_inode, /*force*/1); -+ sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; -+ } else -+ au_sub_nlink(sb->s_root->d_inode, del->h_path.dentry->d_inode); -+ if (au_opt_test(mnt_flags, PLINK)) -+ au_plink_half_refresh(sb, br_id); -+ -+ if (au_xino_brid(sb) == br->br_id) -+ au_xino_brid_set(sb, -1); -+ goto out; /* success */ -+ -+ out_wh: -+ /* revert */ -+ rerr = au_br_init_wh(sb, br, br->br_perm, del->h_path.dentry); -+ if (rerr) -+ AuWarn("failed re-creating base whiteout, %s. (%d)\n", -+ del->pathname, rerr); -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * change a branch permission -+ */ -+ -+static void au_warn_ima(void) -+{ -+#ifdef CONFIG_IMA -+ AuWarn("RW -> RO makes IMA to produce wrong message"); -+#endif -+} -+ -+static int do_need_sigen_inc(int a, int b) -+{ -+ return au_br_whable(a) && !au_br_whable(b); -+} -+ -+static int need_sigen_inc(int old, int new) -+{ -+ return do_need_sigen_inc(old, new) -+ || do_need_sigen_inc(new, old); -+} -+ -+static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ int err; -+ unsigned long n, ul, bytes, files; -+ aufs_bindex_t bstart; -+ struct file *file, *hf, **a; -+ const int step_bytes = 1024, /* memory allocation unit */ -+ step_files = step_bytes / sizeof(*a); -+ -+ err = -ENOMEM; -+ n = 0; -+ bytes = step_bytes; -+ files = step_files; -+ a = kmalloc(bytes, GFP_NOFS); -+ if (unlikely(!a)) -+ goto out; -+ -+ /* no need file_list_lock() since sbinfo is locked? deferred? 
*/ -+ list_for_each_entry(file, &sb->s_files, f_u.fu_list) { -+ if (special_file(file->f_dentry->d_inode->i_mode)) -+ continue; -+ -+ AuDbg("%.*s\n", AuDLNPair(file->f_dentry)); -+ fi_read_lock(file); -+ if (unlikely(au_test_mmapped(file))) { -+ err = -EBUSY; -+ FiMustNoWaiters(file); -+ fi_read_unlock(file); -+ goto out_free; -+ } -+ -+ bstart = au_fbstart(file); -+ if (!S_ISREG(file->f_dentry->d_inode->i_mode) -+ || !(file->f_mode & FMODE_WRITE) -+ || bstart != bindex) { -+ FiMustNoWaiters(file); -+ fi_read_unlock(file); -+ continue; -+ } -+ -+ hf = au_h_fptr(file, bstart); -+ FiMustNoWaiters(file); -+ fi_read_unlock(file); -+ -+ if (n < files) -+ a[n++] = hf; -+ else { -+ void *p; -+ -+ err = -ENOMEM; -+ bytes += step_bytes; -+ files += step_files; -+ p = krealloc(a, bytes, GFP_NOFS); -+ if (p) { -+ a = p; -+ a[n++] = hf; -+ } else -+ goto out_free; -+ } -+ } -+ -+ err = 0; -+ if (n) -+ au_warn_ima(); -+ for (ul = 0; ul < n; ul++) { -+ /* todo: already flushed? */ -+ /* cf. fs/super.c:mark_files_ro() */ -+ hf = a[ul]; -+ hf->f_mode &= ~FMODE_WRITE; -+ if (!file_check_writeable(hf)) { -+ file_release_write(hf); -+ mnt_drop_write(hf->f_vfsmnt); -+ } -+ } -+ -+ out_free: -+ kfree(a); -+ out: -+ return err; -+} -+ -+int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, -+ int *do_update) -+{ -+ int err, rerr; -+ aufs_bindex_t bindex; -+ struct path path; -+ struct dentry *root; -+ struct au_branch *br; -+ -+ root = sb->s_root; -+ au_plink_block_maintain(sb); -+ bindex = au_find_dbindex(root, mod->h_root); -+ if (bindex < 0) { -+ if (remount) -+ return 0; /* success */ -+ err = -ENOENT; -+ AuErr("%s no such branch\n", mod->path); -+ goto out; -+ } -+ AuDbg("bindex b%d\n", bindex); -+ -+ err = test_br(mod->h_root->d_inode, mod->perm, mod->path); -+ if (unlikely(err)) -+ goto out; -+ -+ br = au_sbr(sb, bindex); -+ if (br->br_perm == mod->perm) -+ return 0; /* success */ -+ -+ if (au_br_writable(br->br_perm)) { -+ /* remove whiteout base */ -+ err = au_br_init_wh(sb, br, mod->perm, mod->h_root); -+ if (unlikely(err)) -+ goto out; -+ -+ if (!au_br_writable(mod->perm)) { -+ /* rw --> ro, file might be mmapped */ -+ DiMustNoWaiters(root); -+ IiMustNoWaiters(root->d_inode); -+ di_write_unlock(root); -+ err = au_br_mod_files_ro(sb, bindex); -+ /* aufs_write_lock() calls ..._child() */ -+ di_write_lock_child(root); -+ -+ if (unlikely(err)) { -+ rerr = -ENOMEM; -+ br->br_wbr = kmalloc(sizeof(*br->br_wbr), -+ GFP_NOFS); -+ if (br->br_wbr) { -+ path.mnt = br->br_mnt; -+ path.dentry = mod->h_root; -+ rerr = au_wbr_init(br, sb, br->br_perm, -+ &path); -+ } -+ if (unlikely(rerr)) { -+ AuIOErr("nested error %d (%d)\n", -+ rerr, err); -+ br->br_perm = mod->perm; -+ } -+ } -+ } -+ } else if (au_br_writable(mod->perm)) { -+ /* ro --> rw */ -+ err = -ENOMEM; -+ br->br_wbr = kmalloc(sizeof(*br->br_wbr), GFP_NOFS); -+ if (br->br_wbr) { -+ path.mnt = br->br_mnt; -+ path.dentry = mod->h_root; -+ err = au_wbr_init(br, sb, mod->perm, &path); -+ if (unlikely(err)) { -+ kfree(br->br_wbr); -+ br->br_wbr = NULL; -+ } -+ } -+ } -+ -+ if (!err) { -+ *do_update |= need_sigen_inc(br->br_perm, mod->perm); -+ br->br_perm = mod->perm; -+ } -+ -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/branch.h linux-2.6.31/fs/aufs/branch.h ---- linux-2.6.31-vanilla/fs/aufs/branch.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/branch.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,219 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * branch filesystems and xino for them -+ */ -+ -+#ifndef __AUFS_BRANCH_H__ -+#define __AUFS_BRANCH_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/fs.h> -+#include <linux/mount.h> -+#include <linux/aufs_type.h> -+#include "rwsem.h" -+#include "super.h" -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* a xino file */ -+struct au_xino_file { -+ struct file *xi_file; -+ struct mutex xi_nondir_mtx; -+ -+ /* todo: make xino files an array to support huge inode number */ -+ -+#ifdef CONFIG_DEBUG_FS -+ struct dentry *xi_dbgaufs; -+#endif -+}; -+ -+/* members for writable branch only */ -+enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; -+struct au_wbr { -+ struct au_rwsem wbr_wh_rwsem; -+ struct dentry *wbr_wh[AuBrWh_Last]; -+ atomic_t wbr_wh_running; -+#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ -+#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ -+#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ -+ -+ /* mfs mode */ -+ unsigned long long wbr_bytes; -+}; -+ -+/* protected by superblock rwsem */ -+struct au_branch { -+ struct au_xino_file br_xino; -+ -+ aufs_bindex_t br_id; -+ -+ int br_perm; -+ struct vfsmount *br_mnt; -+ atomic_t br_count; -+ -+ struct au_wbr *br_wbr; -+ -+ /* xino truncation */ -+ blkcnt_t br_xino_upper; /* watermark in blocks */ -+ atomic_t br_xino_running; -+ -+#ifdef CONFIG_SYSFS -+ /* an entry under sysfs per mount-point */ -+ char br_name[8]; -+ struct attribute br_attr; -+#endif -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* branch permission and attribute */ -+enum { -+ AuBrPerm_RW, /* writable, linkable wh */ -+ AuBrPerm_RO, /* readonly, no wh */ -+ AuBrPerm_RR, /* natively readonly, no wh */ -+ -+ AuBrPerm_RWNoLinkWH, /* un-linkable whiteouts */ -+ -+ AuBrPerm_ROWH, /* whiteout-able */ -+ AuBrPerm_RRWH, /* whiteout-able */ -+ -+ AuBrPerm_Last -+}; -+ -+static inline int au_br_writable(int brperm) -+{ -+ return brperm == AuBrPerm_RW || brperm == AuBrPerm_RWNoLinkWH; -+} -+ -+static inline int au_br_whable(int brperm) -+{ -+ return brperm == AuBrPerm_RW -+ || brperm == AuBrPerm_ROWH -+ || brperm == AuBrPerm_RRWH; -+} -+ -+static inline int au_br_rdonly(struct au_branch *br) -+{ -+ return ((br->br_mnt->mnt_sb->s_flags & MS_RDONLY) -+ || !au_br_writable(br->br_perm)) -+ ? 
-EROFS : 0; -+} -+ -+static inline int au_br_hinotifyable(int brperm __maybe_unused) -+{ -+#ifdef CONFIG_AUFS_HINOTIFY -+ return brperm != AuBrPerm_RR && brperm != AuBrPerm_RRWH; -+#else -+ return 0; -+#endif -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* branch.c */ -+struct au_sbinfo; -+void au_br_free(struct au_sbinfo *sinfo); -+int au_br_index(struct super_block *sb, aufs_bindex_t br_id); -+struct au_opt_add; -+int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); -+struct au_opt_del; -+int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); -+struct au_opt_mod; -+int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, -+ int *do_update); -+ -+/* xino.c */ -+static const loff_t au_loff_max = LLONG_MAX; -+ -+int au_xib_trunc(struct super_block *sb); -+ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, -+ loff_t *pos); -+ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, -+ loff_t *pos); -+struct file *au_xino_create2(struct file *base_file, struct file *copy_src); -+struct file *au_xino_create(struct super_block *sb, char *fname, int silent); -+ino_t au_xino_new_ino(struct super_block *sb); -+int au_xino_write0(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ ino_t ino); -+int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ ino_t ino); -+int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ ino_t *ino); -+int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, -+ struct file *base_file, int do_test); -+int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); -+ -+struct au_opt_xino; -+int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); -+void au_xino_clr(struct super_block *sb); -+struct file *au_xino_def(struct super_block *sb); -+int au_xino_path(struct seq_file *seq, struct file *file); -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* Superblock to branch */ -+static inline -+aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ return au_sbr(sb, bindex)->br_id; -+} -+ -+static inline -+struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ return au_sbr(sb, bindex)->br_mnt; -+} -+ -+static inline -+struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ return au_sbr_mnt(sb, bindex)->mnt_sb; -+} -+ -+static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ atomic_dec_return(&au_sbr(sb, bindex)->br_count); -+} -+ -+static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ return au_sbr(sb, bindex)->br_perm; -+} -+ -+static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ return au_br_whable(au_sbr_perm(sb, bindex)); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * wbr_wh_read_lock, wbr_wh_write_lock -+ * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock -+ */ -+AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); -+ -+#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) -+#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) -+#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_BRANCH_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/cpup.c 
linux-2.6.31/fs/aufs/cpup.c ---- linux-2.6.31-vanilla/fs/aufs/cpup.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/cpup.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,1048 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * copy-up functions, see wbr_policy.c for copy-down -+ */ -+ -+#include <linux/file.h> -+#include <linux/fs_stack.h> -+#include <linux/mm.h> -+#include <linux/uaccess.h> -+#include "aufs.h" -+ -+void au_cpup_attr_flags(struct inode *dst, struct inode *src) -+{ -+ const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE -+ | S_NOATIME | S_NOCMTIME; -+ -+ dst->i_flags |= src->i_flags & ~mask; -+ if (au_test_fs_notime(dst->i_sb)) -+ dst->i_flags |= S_NOATIME | S_NOCMTIME; -+} -+ -+void au_cpup_attr_timesizes(struct inode *inode) -+{ -+ struct inode *h_inode; -+ -+ h_inode = au_h_iptr(inode, au_ibstart(inode)); -+ fsstack_copy_attr_times(inode, h_inode); -+ vfsub_copy_inode_size(inode, h_inode); -+} -+ -+void au_cpup_attr_nlink(struct inode *inode, int force) -+{ -+ struct inode *h_inode; -+ struct super_block *sb; -+ aufs_bindex_t bindex, bend; -+ -+ sb = inode->i_sb; -+ bindex = au_ibstart(inode); -+ h_inode = au_h_iptr(inode, bindex); -+ if (!force -+ && !S_ISDIR(h_inode->i_mode) -+ && au_opt_test(au_mntflags(sb), PLINK) -+ && au_plink_test(inode)) -+ return; -+ -+ inode->i_nlink = h_inode->i_nlink; -+ -+ /* -+ * fewer nlink makes find(1) noisy, but larger nlink doesn't. -+ * it may include the whplink directory. 
-+ */ -+ if (S_ISDIR(h_inode->i_mode)) { -+ bend = au_ibend(inode); -+ for (bindex++; bindex <= bend; bindex++) { -+ h_inode = au_h_iptr(inode, bindex); -+ if (h_inode) -+ au_add_nlink(inode, h_inode); -+ } -+ } -+} -+ -+void au_cpup_attr_changeable(struct inode *inode) -+{ -+ struct inode *h_inode; -+ -+ h_inode = au_h_iptr(inode, au_ibstart(inode)); -+ inode->i_mode = h_inode->i_mode; -+ inode->i_uid = h_inode->i_uid; -+ inode->i_gid = h_inode->i_gid; -+ au_cpup_attr_timesizes(inode); -+ au_cpup_attr_flags(inode, h_inode); -+} -+ -+void au_cpup_igen(struct inode *inode, struct inode *h_inode) -+{ -+ struct au_iinfo *iinfo = au_ii(inode); -+ -+ IiMustWriteLock(inode); -+ -+ iinfo->ii_higen = h_inode->i_generation; -+ iinfo->ii_hsb1 = h_inode->i_sb; -+} -+ -+void au_cpup_attr_all(struct inode *inode, int force) -+{ -+ struct inode *h_inode; -+ -+ h_inode = au_h_iptr(inode, au_ibstart(inode)); -+ au_cpup_attr_changeable(inode); -+ if (inode->i_nlink > 0) -+ au_cpup_attr_nlink(inode, force); -+ inode->i_rdev = h_inode->i_rdev; -+ inode->i_blkbits = h_inode->i_blkbits; -+ au_cpup_igen(inode, h_inode); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ -+ -+/* keep the timestamps of the parent dir when cpup */ -+void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, -+ struct path *h_path) -+{ -+ struct inode *h_inode; -+ -+ dt->dt_dentry = dentry; -+ dt->dt_h_path = *h_path; -+ h_inode = h_path->dentry->d_inode; -+ dt->dt_atime = h_inode->i_atime; -+ dt->dt_mtime = h_inode->i_mtime; -+ /* smp_mb(); */ -+} -+ -+void au_dtime_revert(struct au_dtime *dt) -+{ -+ struct iattr attr; -+ int err; -+ -+ attr.ia_atime = dt->dt_atime; -+ attr.ia_mtime = dt->dt_mtime; -+ attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET -+ | ATTR_ATIME | ATTR_ATIME_SET; -+ -+ err = vfsub_notify_change(&dt->dt_h_path, &attr); -+ if (unlikely(err)) -+ AuWarn("restoring timestamps failed(%d). ignored\n", err); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static noinline_for_stack -+int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src) -+{ -+ int err, sbits; -+ struct iattr ia; -+ struct path h_path; -+ struct inode *h_isrc, *h_idst; -+ -+ h_path.dentry = au_h_dptr(dst, bindex); -+ h_idst = h_path.dentry->d_inode; -+ h_path.mnt = au_sbr_mnt(dst->d_sb, bindex); -+ h_isrc = h_src->d_inode; -+ ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID -+ | ATTR_ATIME | ATTR_MTIME -+ | ATTR_ATIME_SET | ATTR_MTIME_SET; -+ ia.ia_uid = h_isrc->i_uid; -+ ia.ia_gid = h_isrc->i_gid; -+ ia.ia_atime = h_isrc->i_atime; -+ ia.ia_mtime = h_isrc->i_mtime; -+ if (h_idst->i_mode != h_isrc->i_mode -+ && !S_ISLNK(h_idst->i_mode)) { -+ ia.ia_valid |= ATTR_MODE; -+ ia.ia_mode = h_isrc->i_mode; -+ } -+ sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); -+ au_cpup_attr_flags(h_idst, h_isrc); -+ err = vfsub_notify_change(&h_path, &ia); -+ -+ /* is this nfs only? 
*/ -+ if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { -+ ia.ia_valid = ATTR_FORCE | ATTR_MODE; -+ ia.ia_mode = h_isrc->i_mode; -+ err = vfsub_notify_change(&h_path, &ia); -+ } -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, -+ char *buf, unsigned long blksize) -+{ -+ int err; -+ size_t sz, rbytes, wbytes; -+ unsigned char all_zero; -+ char *p, *zp; -+ struct mutex *h_mtx; -+ /* reduce stack usage */ -+ struct iattr *ia; -+ -+ zp = page_address(ZERO_PAGE(0)); -+ if (unlikely(!zp)) -+ return -ENOMEM; /* possible? */ -+ -+ err = 0; -+ all_zero = 0; -+ while (len) { -+ AuDbg("len %lld\n", len); -+ sz = blksize; -+ if (len < blksize) -+ sz = len; -+ -+ rbytes = 0; -+ /* todo: signal_pending? */ -+ while (!rbytes || err == -EAGAIN || err == -EINTR) { -+ rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); -+ err = rbytes; -+ } -+ if (unlikely(err < 0)) -+ break; -+ -+ all_zero = 0; -+ if (len >= rbytes && rbytes == blksize) -+ all_zero = !memcmp(buf, zp, rbytes); -+ if (!all_zero) { -+ wbytes = rbytes; -+ p = buf; -+ while (wbytes) { -+ size_t b; -+ -+ b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); -+ err = b; -+ /* todo: signal_pending? */ -+ if (unlikely(err == -EAGAIN || err == -EINTR)) -+ continue; -+ if (unlikely(err < 0)) -+ break; -+ wbytes -= b; -+ p += b; -+ } -+ } else { -+ loff_t res; -+ -+ AuLabel(hole); -+ res = vfsub_llseek(dst, rbytes, SEEK_CUR); -+ err = res; -+ if (unlikely(res < 0)) -+ break; -+ } -+ len -= rbytes; -+ err = 0; -+ } -+ -+ /* the last block may be a hole */ -+ if (!err && all_zero) { -+ AuLabel(last hole); -+ -+ err = 1; -+ if (au_test_nfs(dst->f_dentry->d_sb)) { -+ /* nfs requires this step to make last hole */ -+ /* is this only nfs? */ -+ do { -+ /* todo: signal_pending? */ -+ err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); -+ } while (err == -EAGAIN || err == -EINTR); -+ if (err == 1) -+ dst->f_pos--; -+ } -+ -+ if (err == 1) { -+ ia = (void *)buf; -+ ia->ia_size = dst->f_pos; -+ ia->ia_valid = ATTR_SIZE | ATTR_FILE; -+ ia->ia_file = dst; -+ h_mtx = &dst->f_dentry->d_inode->i_mutex; -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); -+ err = vfsub_notify_change(&dst->f_path, ia); -+ mutex_unlock(h_mtx); -+ } -+ } -+ -+ return err; -+} -+ -+int au_copy_file(struct file *dst, struct file *src, loff_t len) -+{ -+ int err; -+ unsigned long blksize; -+ unsigned char do_kfree; -+ char *buf; -+ -+ err = -ENOMEM; -+ blksize = dst->f_dentry->d_sb->s_blocksize; -+ if (!blksize || PAGE_SIZE < blksize) -+ blksize = PAGE_SIZE; -+ AuDbg("blksize %lu\n", blksize); -+ do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); -+ if (do_kfree) -+ buf = kmalloc(blksize, GFP_NOFS); -+ else -+ buf = (void *)__get_free_page(GFP_NOFS); -+ if (unlikely(!buf)) -+ goto out; -+ -+ if (len > (1 << 22)) -+ AuDbg("copying a large file %lld\n", (long long)len); -+ -+ src->f_pos = 0; -+ dst->f_pos = 0; -+ err = au_do_copy_file(dst, src, len, buf, blksize); -+ if (do_kfree) -+ kfree(buf); -+ else -+ free_page((unsigned long)buf); -+ -+ out: -+ return err; -+} -+ -+/* -+ * to support a sparse file which is opened with O_APPEND, -+ * we need to close the file. 
-+ */ -+static int au_cp_regular(struct dentry *dentry, aufs_bindex_t bdst, -+ aufs_bindex_t bsrc, loff_t len) -+{ -+ int err, i; -+ enum { SRC, DST }; -+ struct { -+ aufs_bindex_t bindex; -+ unsigned int flags; -+ struct dentry *dentry; -+ struct file *file; -+ void *label, *label_file; -+ } *f, file[] = { -+ { -+ .bindex = bsrc, -+ .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, -+ .file = NULL, -+ .label = &&out, -+ .label_file = &&out_src -+ }, -+ { -+ .bindex = bdst, -+ .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, -+ .file = NULL, -+ .label = &&out_src, -+ .label_file = &&out_dst -+ } -+ }; -+ struct super_block *sb; -+ -+ /* bsrc branch can be ro/rw. */ -+ sb = dentry->d_sb; -+ f = file; -+ for (i = 0; i < 2; i++, f++) { -+ f->dentry = au_h_dptr(dentry, f->bindex); -+ f->file = au_h_open(dentry, f->bindex, f->flags, /*file*/NULL); -+ err = PTR_ERR(f->file); -+ if (IS_ERR(f->file)) -+ goto *f->label; -+ err = -EINVAL; -+ if (unlikely(!f->file->f_op)) -+ goto *f->label_file; -+ } -+ -+ /* try stopping to update while we copyup */ -+ IMustLock(file[SRC].dentry->d_inode); -+ err = au_copy_file(file[DST].file, file[SRC].file, len); -+ -+ out_dst: -+ fput(file[DST].file); -+ au_sbr_put(sb, file[DST].bindex); -+ out_src: -+ fput(file[SRC].file); -+ au_sbr_put(sb, file[SRC].bindex); -+ out: -+ return err; -+} -+ -+static int au_do_cpup_regular(struct dentry *dentry, aufs_bindex_t bdst, -+ aufs_bindex_t bsrc, loff_t len, -+ struct inode *h_dir, struct path *h_path) -+{ -+ int err, rerr; -+ loff_t l; -+ -+ err = 0; -+ l = i_size_read(au_h_iptr(dentry->d_inode, bsrc)); -+ if (len == -1 || l < len) -+ len = l; -+ if (len) -+ err = au_cp_regular(dentry, bdst, bsrc, len); -+ if (!err) -+ goto out; /* success */ -+ -+ rerr = vfsub_unlink(h_dir, h_path, /*force*/0); -+ if (rerr) { -+ AuIOErr("failed unlinking cpup-ed %.*s(%d, %d)\n", -+ AuDLNPair(h_path->dentry), err, rerr); -+ err = -EIO; -+ } -+ -+ out: -+ return err; -+} -+ -+static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, -+ struct inode *h_dir) -+{ -+ int err, symlen; -+ mm_segment_t old_fs; -+ char *sym; -+ -+ err = -ENOSYS; -+ if (unlikely(!h_src->d_inode->i_op->readlink)) -+ goto out; -+ -+ err = -ENOMEM; -+ sym = __getname(); -+ if (unlikely(!sym)) -+ goto out; -+ -+ old_fs = get_fs(); -+ set_fs(KERNEL_DS); -+ symlen = h_src->d_inode->i_op->readlink(h_src, (char __user *)sym, -+ PATH_MAX); -+ err = symlen; -+ set_fs(old_fs); -+ -+ if (symlen > 0) { -+ sym[symlen] = 0; -+ err = vfsub_symlink(h_dir, h_path, sym); -+ } -+ __putname(sym); -+ -+ out: -+ return err; -+} -+ -+/* return with the lower dst inode is locked */ -+static noinline_for_stack -+int cpup_entry(struct dentry *dentry, aufs_bindex_t bdst, -+ aufs_bindex_t bsrc, loff_t len, unsigned int flags, -+ struct dentry *dst_parent) -+{ -+ int err; -+ umode_t mode; -+ unsigned int mnt_flags; -+ unsigned char isdir; -+ const unsigned char do_dt = !!au_ftest_cpup(flags, DTIME); -+ struct au_dtime dt; -+ struct path h_path; -+ struct dentry *h_src, *h_dst, *h_parent; -+ struct inode *h_inode, *h_dir; -+ struct super_block *sb; -+ -+ /* bsrc branch can be ro/rw. 
*/ -+ h_src = au_h_dptr(dentry, bsrc); -+ h_inode = h_src->d_inode; -+ AuDebugOn(h_inode != au_h_iptr(dentry->d_inode, bsrc)); -+ -+ /* try stopping to be referenced while we are creating */ -+ h_dst = au_h_dptr(dentry, bdst); -+ h_parent = h_dst->d_parent; /* dir inode is locked */ -+ h_dir = h_parent->d_inode; -+ IMustLock(h_dir); -+ AuDebugOn(h_parent != h_dst->d_parent); -+ -+ sb = dentry->d_sb; -+ h_path.mnt = au_sbr_mnt(sb, bdst); -+ if (do_dt) { -+ h_path.dentry = h_parent; -+ au_dtime_store(&dt, dst_parent, &h_path); -+ } -+ h_path.dentry = h_dst; -+ -+ isdir = 0; -+ mode = h_inode->i_mode; -+ switch (mode & S_IFMT) { -+ case S_IFREG: -+ /* try stopping to update while we are referencing */ -+ IMustLock(h_inode); -+ err = vfsub_create(h_dir, &h_path, mode | S_IWUSR); -+ if (!err) -+ err = au_do_cpup_regular -+ (dentry, bdst, bsrc, len, -+ au_h_iptr(dst_parent->d_inode, bdst), &h_path); -+ break; -+ case S_IFDIR: -+ isdir = 1; -+ err = vfsub_mkdir(h_dir, &h_path, mode); -+ if (!err) { -+ /* -+ * strange behaviour from the user's view, -+ * particularly the setattr case -+ */ -+ if (au_ibstart(dst_parent->d_inode) == bdst) -+ au_cpup_attr_nlink(dst_parent->d_inode, -+ /*force*/1); -+ au_cpup_attr_nlink(dentry->d_inode, /*force*/1); -+ } -+ break; -+ case S_IFLNK: -+ err = au_do_cpup_symlink(&h_path, h_src, h_dir); -+ break; -+ case S_IFCHR: -+ case S_IFBLK: -+ AuDebugOn(!capable(CAP_MKNOD)); -+ /*FALLTHROUGH*/ -+ case S_IFIFO: -+ case S_IFSOCK: -+ err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); -+ break; -+ default: -+ AuIOErr("Unknown inode type 0%o\n", mode); -+ err = -EIO; -+ } -+ -+ mnt_flags = au_mntflags(sb); -+ if (!au_opt_test(mnt_flags, UDBA_NONE) -+ && !isdir -+ && au_opt_test(mnt_flags, XINO) -+ && h_inode->i_nlink == 1 -+ /* todo: unnecessary? */ -+ /* && dentry->d_inode->i_nlink == 1 */ -+ && bdst < bsrc -+ && !au_ftest_cpup(flags, KEEPLINO)) -+ au_xino_write(sb, bsrc, h_inode->i_ino, /*ino*/0); -+ /* ignore this error */ -+ -+ if (do_dt) -+ au_dtime_revert(&dt); -+ return err; -+} -+ -+/* -+ * copyup the @dentry from @bsrc to @bdst. -+ * the caller must set both of the lower dentries. -+ * @len is for truncating; when it is -1, copyup the entire file. -+ * in link/rename cases, @dst_parent may be different from the real one. 
-+ */ -+static int au_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, -+ aufs_bindex_t bsrc, loff_t len, unsigned int flags, -+ struct dentry *dst_parent) -+{ -+ int err, rerr; -+ aufs_bindex_t old_ibstart; -+ unsigned char isdir, plink; -+ struct au_dtime dt; -+ struct path h_path; -+ struct dentry *h_src, *h_dst, *h_parent; -+ struct inode *dst_inode, *h_dir, *inode; -+ struct super_block *sb; -+ -+ AuDebugOn(bsrc <= bdst); -+ -+ sb = dentry->d_sb; -+ h_path.mnt = au_sbr_mnt(sb, bdst); -+ h_dst = au_h_dptr(dentry, bdst); -+ h_parent = h_dst->d_parent; /* dir inode is locked */ -+ h_dir = h_parent->d_inode; -+ IMustLock(h_dir); -+ -+ h_src = au_h_dptr(dentry, bsrc); -+ inode = dentry->d_inode; -+ -+ if (!dst_parent) -+ dst_parent = dget_parent(dentry); -+ else -+ dget(dst_parent); -+ -+ plink = !!au_opt_test(au_mntflags(sb), PLINK); -+ dst_inode = au_h_iptr(inode, bdst); -+ if (dst_inode) { -+ if (unlikely(!plink)) { -+ err = -EIO; -+ AuIOErr("i%lu exists on a upper branch " -+ "but plink is disabled\n", inode->i_ino); -+ goto out; -+ } -+ -+ if (dst_inode->i_nlink) { -+ const int do_dt = au_ftest_cpup(flags, DTIME); -+ -+ h_src = au_plink_lkup(inode, bdst); -+ err = PTR_ERR(h_src); -+ if (IS_ERR(h_src)) -+ goto out; -+ if (unlikely(!h_src->d_inode)) { -+ err = -EIO; -+ AuIOErr("i%lu exists on a upper branch " -+ "but plink is broken\n", inode->i_ino); -+ dput(h_src); -+ goto out; -+ } -+ -+ if (do_dt) { -+ h_path.dentry = h_parent; -+ au_dtime_store(&dt, dst_parent, &h_path); -+ } -+ h_path.dentry = h_dst; -+ err = vfsub_link(h_src, h_dir, &h_path); -+ if (do_dt) -+ au_dtime_revert(&dt); -+ dput(h_src); -+ goto out; -+ } else -+ /* todo: cpup_wh_file? */ -+ /* udba work */ -+ au_update_brange(inode, 1); -+ } -+ -+ old_ibstart = au_ibstart(inode); -+ err = cpup_entry(dentry, bdst, bsrc, len, flags, dst_parent); -+ if (unlikely(err)) -+ goto out; -+ dst_inode = h_dst->d_inode; -+ mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2); -+ -+ err = cpup_iattr(dentry, bdst, h_src); -+ isdir = S_ISDIR(dst_inode->i_mode); -+ if (!err) { -+ if (bdst < old_ibstart) -+ au_set_ibstart(inode, bdst); -+ au_set_h_iptr(inode, bdst, au_igrab(dst_inode), -+ au_hi_flags(inode, isdir)); -+ mutex_unlock(&dst_inode->i_mutex); -+ if (!isdir -+ && h_src->d_inode->i_nlink > 1 -+ && plink) -+ au_plink_append(inode, bdst, h_dst); -+ goto out; /* success */ -+ } -+ -+ /* revert */ -+ h_path.dentry = h_parent; -+ mutex_unlock(&dst_inode->i_mutex); -+ au_dtime_store(&dt, dst_parent, &h_path); -+ h_path.dentry = h_dst; -+ if (!isdir) -+ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); -+ else -+ rerr = vfsub_rmdir(h_dir, &h_path); -+ au_dtime_revert(&dt); -+ if (rerr) { -+ AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); -+ err = -EIO; -+ } -+ -+ out: -+ dput(dst_parent); -+ return err; -+} -+ -+struct au_cpup_single_args { -+ int *errp; -+ struct dentry *dentry; -+ aufs_bindex_t bdst, bsrc; -+ loff_t len; -+ unsigned int flags; -+ struct dentry *dst_parent; -+}; -+ -+static void au_call_cpup_single(void *args) -+{ -+ struct au_cpup_single_args *a = args; -+ *a->errp = au_cpup_single(a->dentry, a->bdst, a->bsrc, a->len, -+ a->flags, a->dst_parent); -+} -+ -+int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, -+ aufs_bindex_t bsrc, loff_t len, unsigned int flags, -+ struct dentry *dst_parent) -+{ -+ int err, wkq_err; -+ umode_t mode; -+ struct dentry *h_dentry; -+ -+ h_dentry = au_h_dptr(dentry, bsrc); -+ mode = h_dentry->d_inode->i_mode & S_IFMT; -+ if ((mode != S_IFCHR && mode != 
S_IFBLK) -+ || capable(CAP_MKNOD)) -+ err = au_cpup_single(dentry, bdst, bsrc, len, flags, -+ dst_parent); -+ else { -+ struct au_cpup_single_args args = { -+ .errp = &err, -+ .dentry = dentry, -+ .bdst = bdst, -+ .bsrc = bsrc, -+ .len = len, -+ .flags = flags, -+ .dst_parent = dst_parent -+ }; -+ wkq_err = au_wkq_wait(au_call_cpup_single, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } -+ -+ return err; -+} -+ -+/* -+ * copyup the @dentry from the first active lower branch to @bdst, -+ * using au_cpup_single(). -+ */ -+static int au_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, -+ unsigned int flags) -+{ -+ int err; -+ aufs_bindex_t bsrc, bend; -+ -+ bend = au_dbend(dentry); -+ for (bsrc = bdst + 1; bsrc <= bend; bsrc++) -+ if (au_h_dptr(dentry, bsrc)) -+ break; -+ -+ err = au_lkup_neg(dentry, bdst); -+ if (!err) { -+ err = au_cpup_single(dentry, bdst, bsrc, len, flags, NULL); -+ if (!err) -+ return 0; /* success */ -+ -+ /* revert */ -+ au_set_h_dptr(dentry, bdst, NULL); -+ au_set_dbstart(dentry, bsrc); -+ } -+ -+ return err; -+} -+ -+struct au_cpup_simple_args { -+ int *errp; -+ struct dentry *dentry; -+ aufs_bindex_t bdst; -+ loff_t len; -+ unsigned int flags; -+}; -+ -+static void au_call_cpup_simple(void *args) -+{ -+ struct au_cpup_simple_args *a = args; -+ *a->errp = au_cpup_simple(a->dentry, a->bdst, a->len, a->flags); -+} -+ -+int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, -+ unsigned int flags) -+{ -+ int err, wkq_err; -+ unsigned char do_sio; -+ struct dentry *parent; -+ struct inode *h_dir; -+ -+ parent = dget_parent(dentry); -+ h_dir = au_h_iptr(parent->d_inode, bdst); -+ do_sio = !!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE); -+ if (!do_sio) { -+ /* -+ * testing CAP_MKNOD is for generic fs, -+ * but CAP_FSETID is for xfs only, currently. -+ */ -+ umode_t mode = dentry->d_inode->i_mode; -+ do_sio = (((mode & (S_IFCHR | S_IFBLK)) -+ && !capable(CAP_MKNOD)) -+ || ((mode & (S_ISUID | S_ISGID)) -+ && !capable(CAP_FSETID))); -+ } -+ if (!do_sio) -+ err = au_cpup_simple(dentry, bdst, len, flags); -+ else { -+ struct au_cpup_simple_args args = { -+ .errp = &err, -+ .dentry = dentry, -+ .bdst = bdst, -+ .len = len, -+ .flags = flags -+ }; -+ wkq_err = au_wkq_wait(au_call_cpup_simple, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } -+ -+ dput(parent); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * copyup the deleted file for writing. 
-+ */ -+static int au_do_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, -+ struct dentry *wh_dentry, struct file *file, -+ loff_t len) -+{ -+ int err; -+ aufs_bindex_t bstart; -+ struct au_dinfo *dinfo; -+ struct dentry *h_d_dst, *h_d_start; -+ -+ dinfo = au_di(dentry); -+ AuRwMustWriteLock(&dinfo->di_rwsem); -+ -+ bstart = dinfo->di_bstart; -+ h_d_dst = dinfo->di_hdentry[0 + bdst].hd_dentry; -+ dinfo->di_bstart = bdst; -+ dinfo->di_hdentry[0 + bdst].hd_dentry = wh_dentry; -+ h_d_start = dinfo->di_hdentry[0 + bstart].hd_dentry; -+ if (file) -+ dinfo->di_hdentry[0 + bstart].hd_dentry -+ = au_h_fptr(file, au_fbstart(file))->f_dentry; -+ err = au_cpup_single(dentry, bdst, bstart, len, !AuCpup_DTIME, -+ /*h_parent*/NULL); -+ if (!err && file) { -+ err = au_reopen_nondir(file); -+ dinfo->di_hdentry[0 + bstart].hd_dentry = h_d_start; -+ } -+ dinfo->di_hdentry[0 + bdst].hd_dentry = h_d_dst; -+ dinfo->di_bstart = bstart; -+ -+ return err; -+} -+ -+static int au_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, -+ struct file *file) -+{ -+ int err; -+ struct au_dtime dt; -+ struct dentry *parent, *h_parent, *wh_dentry; -+ struct au_branch *br; -+ struct path h_path; -+ -+ br = au_sbr(dentry->d_sb, bdst); -+ parent = dget_parent(dentry); -+ h_parent = au_h_dptr(parent, bdst); -+ wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); -+ err = PTR_ERR(wh_dentry); -+ if (IS_ERR(wh_dentry)) -+ goto out; -+ -+ h_path.dentry = h_parent; -+ h_path.mnt = br->br_mnt; -+ au_dtime_store(&dt, parent, &h_path); -+ err = au_do_cpup_wh(dentry, bdst, wh_dentry, file, len); -+ if (unlikely(err)) -+ goto out_wh; -+ -+ dget(wh_dentry); -+ h_path.dentry = wh_dentry; -+ err = vfsub_unlink(h_parent->d_inode, &h_path, /*force*/0); -+ if (unlikely(err)) { -+ AuIOErr("failed remove copied-up tmp file %.*s(%d)\n", -+ AuDLNPair(wh_dentry), err); -+ err = -EIO; -+ } -+ au_dtime_revert(&dt); -+ au_set_hi_wh(dentry->d_inode, bdst, wh_dentry); -+ -+ out_wh: -+ dput(wh_dentry); -+ out: -+ dput(parent); -+ return err; -+} -+ -+struct au_cpup_wh_args { -+ int *errp; -+ struct dentry *dentry; -+ aufs_bindex_t bdst; -+ loff_t len; -+ struct file *file; -+}; -+ -+static void au_call_cpup_wh(void *args) -+{ -+ struct au_cpup_wh_args *a = args; -+ *a->errp = au_cpup_wh(a->dentry, a->bdst, a->len, a->file); -+} -+ -+int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, -+ struct file *file) -+{ -+ int err, wkq_err; -+ struct dentry *parent, *h_orph, *h_parent, *h_dentry; -+ struct inode *dir, *h_dir, *h_tmpdir, *h_inode; -+ struct au_wbr *wbr; -+ -+ parent = dget_parent(dentry); -+ dir = parent->d_inode; -+ h_orph = NULL; -+ h_parent = NULL; -+ h_dir = au_igrab(au_h_iptr(dir, bdst)); -+ h_tmpdir = h_dir; -+ if (!h_dir->i_nlink) { -+ wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; -+ h_orph = wbr->wbr_orph; -+ -+ h_parent = dget(au_h_dptr(parent, bdst)); -+ au_set_h_dptr(parent, bdst, NULL); -+ au_set_h_dptr(parent, bdst, dget(h_orph)); -+ h_tmpdir = h_orph->d_inode; -+ au_set_h_iptr(dir, bdst, NULL, 0); -+ au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); -+ -+ /* this temporary unlock is safe */ -+ if (file) -+ h_dentry = au_h_fptr(file, au_fbstart(file))->f_dentry; -+ else -+ h_dentry = au_h_dptr(dentry, au_dbstart(dentry)); -+ h_inode = h_dentry->d_inode; -+ IMustLock(h_inode); -+ mutex_unlock(&h_inode->i_mutex); -+ mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3); -+ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); -+ } -+ -+ if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE)) 
-+ err = au_cpup_wh(dentry, bdst, len, file); -+ else { -+ struct au_cpup_wh_args args = { -+ .errp = &err, -+ .dentry = dentry, -+ .bdst = bdst, -+ .len = len, -+ .file = file -+ }; -+ wkq_err = au_wkq_wait(au_call_cpup_wh, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } -+ -+ if (h_orph) { -+ mutex_unlock(&h_tmpdir->i_mutex); -+ au_set_h_iptr(dir, bdst, NULL, 0); -+ au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); -+ au_set_h_dptr(parent, bdst, NULL); -+ au_set_h_dptr(parent, bdst, h_parent); -+ } -+ iput(h_dir); -+ dput(parent); -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * generic routine for both of copy-up and copy-down. -+ */ -+/* cf. revalidate function in file.c */ -+int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, -+ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, -+ struct dentry *h_parent, void *arg), -+ void *arg) -+{ -+ int err; -+ struct au_pin pin; -+ struct dentry *d, *parent, *h_parent, *real_parent; -+ -+ err = 0; -+ parent = dget_parent(dentry); -+ if (IS_ROOT(parent)) -+ goto out; -+ -+ au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, -+ au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); -+ -+ /* do not use au_dpage */ -+ real_parent = parent; -+ while (1) { -+ dput(parent); -+ parent = dget_parent(dentry); -+ h_parent = au_h_dptr(parent, bdst); -+ if (h_parent) -+ goto out; /* success */ -+ -+ /* find top dir which is necessary to cpup */ -+ do { -+ d = parent; -+ dput(parent); -+ parent = dget_parent(d); -+ di_read_lock_parent3(parent, !AuLock_IR); -+ h_parent = au_h_dptr(parent, bdst); -+ di_read_unlock(parent, !AuLock_IR); -+ } while (!h_parent); -+ -+ if (d != real_parent) -+ di_write_lock_child3(d); -+ -+ /* somebody else might create while we were sleeping */ -+ if (!au_h_dptr(d, bdst) || !au_h_dptr(d, bdst)->d_inode) { -+ if (au_h_dptr(d, bdst)) -+ au_update_dbstart(d); -+ -+ au_pin_set_dentry(&pin, d); -+ err = au_do_pin(&pin); -+ if (!err) { -+ err = cp(d, bdst, h_parent, arg); -+ au_unpin(&pin); -+ } -+ } -+ -+ if (d != real_parent) -+ di_write_unlock(d); -+ if (unlikely(err)) -+ break; -+ } -+ -+ out: -+ dput(parent); -+ return err; -+} -+ -+static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, -+ struct dentry *h_parent __maybe_unused , -+ void *arg __maybe_unused) -+{ -+ return au_sio_cpup_simple(dentry, bdst, -1, AuCpup_DTIME); -+} -+ -+int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) -+{ -+ return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); -+} -+ -+int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) -+{ -+ int err; -+ struct dentry *parent; -+ struct inode *dir; -+ -+ parent = dget_parent(dentry); -+ dir = parent->d_inode; -+ err = 0; -+ if (au_h_iptr(dir, bdst)) -+ goto out; -+ -+ di_read_unlock(parent, AuLock_IR); -+ di_write_lock_parent(parent); -+ /* someone else might change our inode while we were sleeping */ -+ if (!au_h_iptr(dir, bdst)) -+ err = au_cpup_dirs(dentry, bdst); -+ di_downgrade_lock(parent, AuLock_IR); -+ -+ out: -+ dput(parent); -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/cpup.h linux-2.6.31/fs/aufs/cpup.h ---- linux-2.6.31-vanilla/fs/aufs/cpup.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/cpup.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,81 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * copy-up/down functions -+ */ -+ -+#ifndef __AUFS_CPUP_H__ -+#define __AUFS_CPUP_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/path.h> -+#include <linux/time.h> -+#include <linux/aufs_type.h> -+ -+struct inode; -+struct file; -+ -+void au_cpup_attr_flags(struct inode *dst, struct inode *src); -+void au_cpup_attr_timesizes(struct inode *inode); -+void au_cpup_attr_nlink(struct inode *inode, int force); -+void au_cpup_attr_changeable(struct inode *inode); -+void au_cpup_igen(struct inode *inode, struct inode *h_inode); -+void au_cpup_attr_all(struct inode *inode, int force); -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* cpup flags */ -+#define AuCpup_DTIME 1 /* do dtime_store/revert */ -+#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, -+ for link(2) */ -+#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) -+#define au_fset_cpup(flags, name) { (flags) |= AuCpup_##name; } -+#define au_fclr_cpup(flags, name) { (flags) &= ~AuCpup_##name; } -+ -+int au_copy_file(struct file *dst, struct file *src, loff_t len); -+int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, -+ aufs_bindex_t bsrc, loff_t len, unsigned int flags, -+ struct dentry *dst_parent); -+int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, -+ unsigned int flags); -+int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, -+ struct file *file); -+ -+int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, -+ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, -+ struct dentry *h_parent, void *arg), -+ void *arg); -+int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); -+int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* keep timestamps when copyup */ -+struct au_dtime { -+ struct dentry *dt_dentry; -+ struct path dt_h_path; -+ struct timespec dt_atime, dt_mtime; -+}; -+void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, -+ struct path *h_path); -+void au_dtime_revert(struct au_dtime *dt); -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_CPUP_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/dbgaufs.c linux-2.6.31/fs/aufs/dbgaufs.c ---- linux-2.6.31-vanilla/fs/aufs/dbgaufs.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/dbgaufs.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,331 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * debugfs interface -+ */ -+ -+#include <linux/debugfs.h> -+#include "aufs.h" -+ -+#ifndef CONFIG_SYSFS -+#error DEBUG_FS depends upon SYSFS -+#endif -+ -+static struct dentry *dbgaufs; -+static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; -+ -+/* 20 is max digits length of ulong 64 */ -+struct dbgaufs_arg { -+ int n; -+ char a[20 * 4]; -+}; -+ -+/* -+ * common function for all XINO files -+ */ -+static int dbgaufs_xi_release(struct inode *inode __maybe_unused, -+ struct file *file) -+{ -+ kfree(file->private_data); -+ return 0; -+} -+ -+static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) -+{ -+ int err; -+ struct kstat st; -+ struct dbgaufs_arg *p; -+ -+ err = -ENOMEM; -+ p = kmalloc(sizeof(*p), GFP_NOFS); -+ if (unlikely(!p)) -+ goto out; -+ -+ err = 0; -+ p->n = 0; -+ file->private_data = p; -+ if (!xf) -+ goto out; -+ -+ err = vfs_getattr(xf->f_vfsmnt, xf->f_dentry, &st); -+ if (!err) { -+ if (do_fcnt) -+ p->n = snprintf -+ (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", -+ (long)file_count(xf), st.blocks, st.blksize, -+ (long long)st.size); -+ else -+ p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", -+ st.blocks, st.blksize, -+ (long long)st.size); -+ AuDebugOn(p->n >= sizeof(p->a)); -+ } else { -+ p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); -+ err = 0; -+ } -+ -+ out: -+ return err; -+ -+} -+ -+static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, -+ size_t count, loff_t *ppos) -+{ -+ struct dbgaufs_arg *p; -+ -+ p = file->private_data; -+ return simple_read_from_buffer(buf, count, ppos, p->a, p->n); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int dbgaufs_xib_open(struct inode *inode, struct file *file) -+{ -+ int err; -+ struct au_sbinfo *sbinfo; -+ struct super_block *sb; -+ -+ sbinfo = inode->i_private; -+ sb = sbinfo->si_sb; -+ si_noflush_read_lock(sb); -+ err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); -+ si_read_unlock(sb); -+ return err; -+} -+ -+static const struct file_operations dbgaufs_xib_fop = { -+ .open = dbgaufs_xib_open, -+ .release = dbgaufs_xi_release, -+ .read = dbgaufs_xi_read -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+#define DbgaufsXi_PREFIX "xi" -+ -+static int dbgaufs_xino_open(struct inode *inode, struct file *file) -+{ -+ int err; -+ long l; -+ struct au_sbinfo *sbinfo; -+ struct super_block *sb; -+ struct file *xf; -+ struct qstr *name; -+ -+ err = -ENOENT; -+ xf = NULL; -+ name = &file->f_dentry->d_name; -+ if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) -+ || memcmp(name->name, DbgaufsXi_PREFIX, -+ sizeof(DbgaufsXi_PREFIX) - 1))) -+ goto out; -+ err = strict_strtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); -+ if (unlikely(err)) -+ goto out; -+ -+ sbinfo = inode->i_private; -+ sb = sbinfo->si_sb; -+ si_noflush_read_lock(sb); -+ if (l <= au_sbend(sb)) { -+ xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; -+ err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); -+ } else -+ err = -ENOENT; -+ 
si_read_unlock(sb); -+ -+ out: -+ return err; -+} -+ -+static const struct file_operations dbgaufs_xino_fop = { -+ .open = dbgaufs_xino_open, -+ .release = dbgaufs_xi_release, -+ .read = dbgaufs_xi_read -+}; -+ -+void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ aufs_bindex_t bend; -+ struct au_branch *br; -+ struct au_xino_file *xi; -+ -+ if (!au_sbi(sb)->si_dbgaufs) -+ return; -+ -+ bend = au_sbend(sb); -+ for (; bindex <= bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ xi = &br->br_xino; -+ if (xi->xi_dbgaufs) { -+ debugfs_remove(xi->xi_dbgaufs); -+ xi->xi_dbgaufs = NULL; -+ } -+ } -+} -+ -+void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ struct au_sbinfo *sbinfo; -+ struct dentry *parent; -+ struct au_branch *br; -+ struct au_xino_file *xi; -+ aufs_bindex_t bend; -+ char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */ -+ -+ sbinfo = au_sbi(sb); -+ parent = sbinfo->si_dbgaufs; -+ if (!parent) -+ return; -+ -+ bend = au_sbend(sb); -+ for (; bindex <= bend; bindex++) { -+ snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); -+ br = au_sbr(sb, bindex); -+ xi = &br->br_xino; -+ AuDebugOn(xi->xi_dbgaufs); -+ xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, -+ sbinfo, &dbgaufs_xino_fop); -+ /* ignore an error */ -+ if (unlikely(!xi->xi_dbgaufs)) -+ AuWarn1("failed %s under debugfs\n", name); -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_AUFS_EXPORT -+static int dbgaufs_xigen_open(struct inode *inode, struct file *file) -+{ -+ int err; -+ struct au_sbinfo *sbinfo; -+ struct super_block *sb; -+ -+ sbinfo = inode->i_private; -+ sb = sbinfo->si_sb; -+ si_noflush_read_lock(sb); -+ err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); -+ si_read_unlock(sb); -+ return err; -+} -+ -+static const struct file_operations dbgaufs_xigen_fop = { -+ .open = dbgaufs_xigen_open, -+ .release = dbgaufs_xi_release, -+ .read = dbgaufs_xi_read -+}; -+ -+static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) -+{ -+ int err; -+ -+ /* -+ * This function is a dynamic '__init' function actually, -+ * so the tiny check for si_rwsem is unnecessary. -+ */ -+ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ -+ -+ err = -EIO; -+ sbinfo->si_dbgaufs_xigen = debugfs_create_file -+ ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, -+ &dbgaufs_xigen_fop); -+ if (sbinfo->si_dbgaufs_xigen) -+ err = 0; -+ -+ return err; -+} -+#else -+static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) -+{ -+ return 0; -+} -+#endif /* CONFIG_AUFS_EXPORT */ -+ -+/* ---------------------------------------------------------------------- */ -+ -+void dbgaufs_si_fin(struct au_sbinfo *sbinfo) -+{ -+ /* -+ * This function is a dynamic '__init' function actually, -+ * so the tiny check for si_rwsem is unnecessary. -+ */ -+ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ -+ -+ debugfs_remove_recursive(sbinfo->si_dbgaufs); -+ sbinfo->si_dbgaufs = NULL; -+ kobject_put(&sbinfo->si_kobj); -+} -+ -+int dbgaufs_si_init(struct au_sbinfo *sbinfo) -+{ -+ int err; -+ char name[SysaufsSiNameLen]; -+ -+ /* -+ * This function is a dynamic '__init' function actually, -+ * so the tiny check for si_rwsem is unnecessary. 
-+ */ -+ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ -+ -+ err = -ENOENT; -+ if (!dbgaufs) { -+ AuErr1("/debug/aufs is uninitialized\n"); -+ goto out; -+ } -+ -+ err = -EIO; -+ sysaufs_name(sbinfo, name); -+ sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); -+ if (unlikely(!sbinfo->si_dbgaufs)) -+ goto out; -+ kobject_get(&sbinfo->si_kobj); -+ -+ sbinfo->si_dbgaufs_xib = debugfs_create_file -+ ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, -+ &dbgaufs_xib_fop); -+ if (unlikely(!sbinfo->si_dbgaufs_xib)) -+ goto out_dir; -+ -+ err = dbgaufs_xigen_init(sbinfo); -+ if (!err) -+ goto out; /* success */ -+ -+ out_dir: -+ dbgaufs_si_fin(sbinfo); -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void dbgaufs_fin(void) -+{ -+ debugfs_remove(dbgaufs); -+} -+ -+int __init dbgaufs_init(void) -+{ -+ int err; -+ -+ err = -EIO; -+ dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); -+ if (dbgaufs) -+ err = 0; -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/dbgaufs.h linux-2.6.31/fs/aufs/dbgaufs.h ---- linux-2.6.31-vanilla/fs/aufs/dbgaufs.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/dbgaufs.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,79 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * debugfs interface -+ */ -+ -+#ifndef __DBGAUFS_H__ -+#define __DBGAUFS_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/init.h> -+#include <linux/aufs_type.h> -+ -+struct super_block; -+struct au_sbinfo; -+ -+#ifdef CONFIG_DEBUG_FS -+/* dbgaufs.c */ -+void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); -+void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); -+void dbgaufs_si_fin(struct au_sbinfo *sbinfo); -+int dbgaufs_si_init(struct au_sbinfo *sbinfo); -+void dbgaufs_fin(void); -+int __init dbgaufs_init(void); -+ -+#else -+ -+static inline -+void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ /* empty */ -+} -+ -+static inline -+void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ /* empty */ -+} -+ -+static inline -+void dbgaufs_si_fin(struct au_sbinfo *sbinfo) -+{ -+ /* empty */ -+} -+ -+static inline -+int dbgaufs_si_init(struct au_sbinfo *sbinfo) -+{ -+ return 0; -+} -+ -+#define dbgaufs_fin() do {} while (0) -+ -+static inline -+int __init dbgaufs_init(void) -+{ -+ return 0; -+} -+#endif /* CONFIG_DEBUG_FS */ -+ -+#endif /* __KERNEL__ */ -+#endif /* __DBGAUFS_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/dcsub.c linux-2.6.31/fs/aufs/dcsub.c ---- linux-2.6.31-vanilla/fs/aufs/dcsub.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/dcsub.c 2009-09-16 13:55:29.000000000 +0200 -@@ -0,0 +1,223 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * sub-routines for dentry cache -+ */ -+ -+#include "aufs.h" -+ -+static void au_dpage_free(struct au_dpage *dpage) -+{ -+ int i; -+ struct dentry **p; -+ -+ p = dpage->dentries; -+ for (i = 0; i < dpage->ndentry; i++) -+ dput(*p++); -+ free_page((unsigned long)dpage->dentries); -+} -+ -+int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) -+{ -+ int err; -+ void *p; -+ -+ err = -ENOMEM; -+ dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); -+ if (unlikely(!dpages->dpages)) -+ goto out; -+ -+ p = (void *)__get_free_page(gfp); -+ if (unlikely(!p)) -+ goto out_dpages; -+ -+ dpages->dpages[0].ndentry = 0; -+ dpages->dpages[0].dentries = p; -+ dpages->ndpage = 1; -+ return 0; /* success */ -+ -+ out_dpages: -+ kfree(dpages->dpages); -+ out: -+ return err; -+} -+ -+void au_dpages_free(struct au_dcsub_pages *dpages) -+{ -+ int i; -+ struct au_dpage *p; -+ -+ p = dpages->dpages; -+ for (i = 0; i < dpages->ndpage; i++) -+ au_dpage_free(p++); -+ kfree(dpages->dpages); -+} -+ -+static int au_dpages_append(struct au_dcsub_pages *dpages, -+ struct dentry *dentry, gfp_t gfp) -+{ -+ int err, sz; -+ struct au_dpage *dpage; -+ void *p; -+ -+ dpage = dpages->dpages + dpages->ndpage - 1; -+ sz = PAGE_SIZE / sizeof(dentry); -+ if (unlikely(dpage->ndentry >= sz)) { -+ AuLabel(new dpage); -+ err = -ENOMEM; -+ sz = dpages->ndpage * sizeof(*dpages->dpages); -+ p = au_kzrealloc(dpages->dpages, sz, -+ sz + sizeof(*dpages->dpages), gfp); -+ if (unlikely(!p)) -+ goto out; -+ -+ dpages->dpages = p; -+ dpage = dpages->dpages + dpages->ndpage; -+ p = (void *)__get_free_page(gfp); -+ if (unlikely(!p)) -+ goto out; -+ -+ dpage->ndentry = 0; -+ dpage->dentries = p; -+ dpages->ndpage++; -+ } -+ -+ dpage->dentries[dpage->ndentry++] = dget(dentry); -+ return 0; /* success */ -+ -+ out: -+ return err; -+} -+ -+int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, -+ au_dpages_test test, void *arg) -+{ -+ int err; -+ struct dentry *this_parent = root; -+ struct list_head *next; -+ struct super_block *sb = root->d_sb; -+ -+ err = 0; -+ spin_lock(&dcache_lock); -+ repeat: -+ next = this_parent->d_subdirs.next; -+ resume: -+ if (this_parent->d_sb == sb -+ && !IS_ROOT(this_parent) -+ && atomic_read(&this_parent->d_count) -+ && this_parent->d_inode -+ && (!test || test(this_parent, arg))) { -+ err = au_dpages_append(dpages, this_parent, GFP_ATOMIC); -+ if (unlikely(err)) -+ goto out; -+ } -+ -+ while (next != &this_parent->d_subdirs) { -+ struct list_head *tmp = next; -+ struct dentry *dentry = list_entry(tmp, struct dentry, -+ d_u.d_child); -+ next = tmp->next; -+ if (/*d_unhashed(dentry) || */!dentry->d_inode) -+ continue; -+ if (!list_empty(&dentry->d_subdirs)) { -+ this_parent = dentry; -+ goto repeat; -+ } -+ if (dentry->d_sb == sb -+ && 
atomic_read(&dentry->d_count) -+ && (!test || test(dentry, arg))) { -+ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); -+ if (unlikely(err)) -+ goto out; -+ } -+ } -+ -+ if (this_parent != root) { -+ next = this_parent->d_u.d_child.next; -+ this_parent = this_parent->d_parent; /* dcache_lock is locked */ -+ goto resume; -+ } -+ out: -+ spin_unlock(&dcache_lock); -+ return err; -+} -+ -+int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, -+ int do_include, au_dpages_test test, void *arg) -+{ -+ int err; -+ -+ err = 0; -+ spin_lock(&dcache_lock); -+ if (do_include && (!test || test(dentry, arg))) { -+ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); -+ if (unlikely(err)) -+ goto out; -+ } -+ while (!IS_ROOT(dentry)) { -+ dentry = dentry->d_parent; /* dcache_lock is locked */ -+ if (!test || test(dentry, arg)) { -+ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); -+ if (unlikely(err)) -+ break; -+ } -+ } -+ -+ out: -+ spin_unlock(&dcache_lock); -+ -+ return err; -+} -+ -+struct dentry *au_test_subdir(struct dentry *d1, struct dentry *d2) -+{ -+ struct dentry *trap, **dentries; -+ int err, i, j; -+ struct au_dcsub_pages dpages; -+ struct au_dpage *dpage; -+ -+ trap = ERR_PTR(-ENOMEM); -+ err = au_dpages_init(&dpages, GFP_NOFS); -+ if (unlikely(err)) -+ goto out; -+ err = au_dcsub_pages_rev(&dpages, d1, /*do_include*/1, NULL, NULL); -+ if (unlikely(err)) -+ goto out_dpages; -+ -+ trap = d1; -+ for (i = 0; !err && i < dpages.ndpage; i++) { -+ dpage = dpages.dpages + i; -+ dentries = dpage->dentries; -+ for (j = 0; !err && j < dpage->ndentry; j++) { -+ struct dentry *d; -+ -+ d = dentries[j]; -+ err = (d == d2); -+ if (!err) -+ trap = d; -+ } -+ } -+ if (!err) -+ trap = NULL; -+ -+ out_dpages: -+ au_dpages_free(&dpages); -+ out: -+ return trap; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/dcsub.h linux-2.6.31/fs/aufs/dcsub.h ---- linux-2.6.31-vanilla/fs/aufs/dcsub.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/dcsub.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,54 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * sub-routines for dentry cache -+ */ -+ -+#ifndef __AUFS_DCSUB_H__ -+#define __AUFS_DCSUB_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/types.h> -+ -+struct dentry; -+ -+struct au_dpage { -+ int ndentry; -+ struct dentry **dentries; -+}; -+ -+struct au_dcsub_pages { -+ int ndpage; -+ struct au_dpage *dpages; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); -+void au_dpages_free(struct au_dcsub_pages *dpages); -+typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); -+int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, -+ au_dpages_test test, void *arg); -+int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, -+ int do_include, au_dpages_test test, void *arg); -+struct dentry *au_test_subdir(struct dentry *d1, struct dentry *d2); -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_DCSUB_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/debug.c linux-2.6.31/fs/aufs/debug.c ---- linux-2.6.31-vanilla/fs/aufs/debug.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/debug.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,431 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * debug print functions -+ */ -+ -+#include <linux/module.h> -+#include <linux/vt_kern.h> -+#include "aufs.h" -+ -+int aufs_debug; -+MODULE_PARM_DESC(debug, "debug print"); -+module_param_named(debug, aufs_debug, int, S_IRUGO | S_IWUSR | S_IWGRP); -+ -+char *au_plevel = KERN_DEBUG; -+#define dpri(fmt, arg...) 
do { \ -+ if (au_debug_test()) \ -+ printk("%s" fmt, au_plevel, ##arg); \ -+} while (0) -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_dpri_whlist(struct au_nhash *whlist) -+{ -+ unsigned long ul, n; -+ struct hlist_head *head; -+ struct au_vdir_wh *tpos; -+ struct hlist_node *pos; -+ -+ n = whlist->nh_num; -+ head = whlist->nh_head; -+ for (ul = 0; ul < n; ul++) { -+ hlist_for_each_entry(tpos, pos, head, wh_hash) -+ dpri("b%d, %.*s, %d\n", -+ tpos->wh_bindex, -+ tpos->wh_str.len, tpos->wh_str.name, -+ tpos->wh_str.len); -+ head++; -+ } -+} -+ -+void au_dpri_vdir(struct au_vdir *vdir) -+{ -+ unsigned long ul; -+ union au_vdir_deblk_p p; -+ unsigned char *o; -+ -+ if (!vdir || IS_ERR(vdir)) { -+ dpri("err %ld\n", PTR_ERR(vdir)); -+ return; -+ } -+ -+ dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", -+ vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, -+ vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); -+ for (ul = 0; ul < vdir->vd_nblk; ul++) { -+ p.deblk = vdir->vd_deblk[ul]; -+ o = p.deblk; -+ dpri("[%lu]: %p\n", ul, o); -+ } -+} -+ -+static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, -+ struct dentry *wh) -+{ -+ char *n = NULL; -+ int l = 0; -+ -+ if (!inode || IS_ERR(inode)) { -+ dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); -+ return -1; -+ } -+ -+ /* the type of i_blocks depends upon CONFIG_LSF */ -+ BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) -+ && sizeof(inode->i_blocks) != sizeof(u64)); -+ if (wh) { -+ n = (void *)wh->d_name.name; -+ l = wh->d_name.len; -+ } -+ -+ dpri("i%d: i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," -+ " ct %lld, np %lu, st 0x%lx, f 0x%x, g %x%s%.*s\n", -+ bindex, -+ inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", -+ atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, -+ i_size_read(inode), (unsigned long long)inode->i_blocks, -+ (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, -+ inode->i_mapping ? inode->i_mapping->nrpages : 0, -+ inode->i_state, inode->i_flags, inode->i_generation, -+ l ? ", wh " : "", l, n); -+ return 0; -+} -+ -+void au_dpri_inode(struct inode *inode) -+{ -+ struct au_iinfo *iinfo; -+ aufs_bindex_t bindex; -+ int err; -+ -+ err = do_pri_inode(-1, inode, NULL); -+ if (err || !au_test_aufs(inode->i_sb)) -+ return; -+ -+ iinfo = au_ii(inode); -+ if (!iinfo) -+ return; -+ dpri("i-1: bstart %d, bend %d, gen %d\n", -+ iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode)); -+ if (iinfo->ii_bstart < 0) -+ return; -+ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) -+ do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, -+ iinfo->ii_hinode[0 + bindex].hi_whdentry); -+} -+ -+static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) -+{ -+ struct dentry *wh = NULL; -+ -+ if (!dentry || IS_ERR(dentry)) { -+ dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); -+ return -1; -+ } -+ /* do not call dget_parent() here */ -+ dpri("d%d: %.*s?/%.*s, %s, cnt %d, flags 0x%x\n", -+ bindex, -+ AuDLNPair(dentry->d_parent), AuDLNPair(dentry), -+ dentry->d_sb ? 
au_sbtype(dentry->d_sb) : "??", -+ atomic_read(&dentry->d_count), dentry->d_flags); -+ if (bindex >= 0 && dentry->d_inode && au_test_aufs(dentry->d_sb)) { -+ struct au_iinfo *iinfo = au_ii(dentry->d_inode); -+ if (iinfo) -+ wh = iinfo->ii_hinode[0 + bindex].hi_whdentry; -+ } -+ do_pri_inode(bindex, dentry->d_inode, wh); -+ return 0; -+} -+ -+void au_dpri_dentry(struct dentry *dentry) -+{ -+ struct au_dinfo *dinfo; -+ aufs_bindex_t bindex; -+ int err; -+ -+ err = do_pri_dentry(-1, dentry); -+ if (err || !au_test_aufs(dentry->d_sb)) -+ return; -+ -+ dinfo = au_di(dentry); -+ if (!dinfo) -+ return; -+ dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d\n", -+ dinfo->di_bstart, dinfo->di_bend, -+ dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry)); -+ if (dinfo->di_bstart < 0) -+ return; -+ for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++) -+ do_pri_dentry(bindex, dinfo->di_hdentry[0 + bindex].hd_dentry); -+} -+ -+static int do_pri_file(aufs_bindex_t bindex, struct file *file) -+{ -+ char a[32]; -+ -+ if (!file || IS_ERR(file)) { -+ dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); -+ return -1; -+ } -+ a[0] = 0; -+ if (bindex < 0 -+ && file->f_dentry -+ && au_test_aufs(file->f_dentry->d_sb) -+ && au_fi(file)) -+ snprintf(a, sizeof(a), ", mmapped %d", au_test_mmapped(file)); -+ dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, pos %llu%s\n", -+ bindex, file->f_mode, file->f_flags, (long)file_count(file), -+ file->f_pos, a); -+ if (file->f_dentry) -+ do_pri_dentry(bindex, file->f_dentry); -+ return 0; -+} -+ -+void au_dpri_file(struct file *file) -+{ -+ struct au_finfo *finfo; -+ aufs_bindex_t bindex; -+ int err; -+ -+ err = do_pri_file(-1, file); -+ if (err || !file->f_dentry || !au_test_aufs(file->f_dentry->d_sb)) -+ return; -+ -+ finfo = au_fi(file); -+ if (!finfo) -+ return; -+ if (finfo->fi_bstart < 0) -+ return; -+ for (bindex = finfo->fi_bstart; bindex <= finfo->fi_bend; bindex++) { -+ struct au_hfile *hf; -+ -+ hf = finfo->fi_hfile + bindex; -+ do_pri_file(bindex, hf ? 
hf->hf_file : NULL); -+ } -+} -+ -+static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) -+{ -+ struct vfsmount *mnt; -+ struct super_block *sb; -+ -+ if (!br || IS_ERR(br)) -+ goto out; -+ mnt = br->br_mnt; -+ if (!mnt || IS_ERR(mnt)) -+ goto out; -+ sb = mnt->mnt_sb; -+ if (!sb || IS_ERR(sb)) -+ goto out; -+ -+ dpri("s%d: {perm 0x%x, cnt %d, wbr %p}, " -+ "%s, dev 0x%02x%02x, flags 0x%lx, cnt(BIAS) %d, active %d, " -+ "xino %d\n", -+ bindex, br->br_perm, atomic_read(&br->br_count), br->br_wbr, -+ au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), -+ sb->s_flags, sb->s_count - S_BIAS, -+ atomic_read(&sb->s_active), !!br->br_xino.xi_file); -+ return 0; -+ -+ out: -+ dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); -+ return -1; -+} -+ -+void au_dpri_sb(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ aufs_bindex_t bindex; -+ int err; -+ /* to reduce stack size */ -+ struct { -+ struct vfsmount mnt; -+ struct au_branch fake; -+ } *a; -+ -+ /* this function can be called from magic sysrq */ -+ a = kzalloc(sizeof(*a), GFP_ATOMIC); -+ if (unlikely(!a)) { -+ dpri("no memory\n"); -+ return; -+ } -+ -+ a->mnt.mnt_sb = sb; -+ a->fake.br_perm = 0; -+ a->fake.br_mnt = &a->mnt; -+ a->fake.br_xino.xi_file = NULL; -+ atomic_set(&a->fake.br_count, 0); -+ smp_mb(); /* atomic_set */ -+ err = do_pri_br(-1, &a->fake); -+ kfree(a); -+ dpri("dev 0x%x\n", sb->s_dev); -+ if (err || !au_test_aufs(sb)) -+ return; -+ -+ sbinfo = au_sbi(sb); -+ if (!sbinfo) -+ return; -+ dpri("nw %d, gen %u, kobj %d\n", -+ atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, -+ atomic_read(&sbinfo->si_kobj.kref.refcount)); -+ for (bindex = 0; bindex <= sbinfo->si_bend; bindex++) -+ do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_dbg_sleep_jiffy(int jiffy) -+{ -+ while (jiffy) -+ jiffy = schedule_timeout_uninterruptible(jiffy); -+} -+ -+void au_dbg_iattr(struct iattr *ia) -+{ -+#define AuBit(name) if (ia->ia_valid & ATTR_ ## name) \ -+ dpri(#name "\n") -+ AuBit(MODE); -+ AuBit(UID); -+ AuBit(GID); -+ AuBit(SIZE); -+ AuBit(ATIME); -+ AuBit(MTIME); -+ AuBit(CTIME); -+ AuBit(ATIME_SET); -+ AuBit(MTIME_SET); -+ AuBit(FORCE); -+ AuBit(ATTR_FLAG); -+ AuBit(KILL_SUID); -+ AuBit(KILL_SGID); -+ AuBit(FILE); -+ AuBit(KILL_PRIV); -+ AuBit(OPEN); -+ AuBit(TIMES_SET); -+#undef AuBit -+ dpri("ia_file %p\n", ia->ia_file); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen) -+{ -+ struct dentry *parent; -+ -+ parent = dget_parent(dentry); -+ AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode) -+ || IS_ROOT(dentry) -+ || au_digen(parent) != sigen); -+ dput(parent); -+} -+ -+void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen) -+{ -+ struct dentry *parent; -+ -+ parent = dget_parent(dentry); -+ AuDebugOn(S_ISDIR(dentry->d_inode->i_mode) -+ || au_digen(parent) != sigen); -+ dput(parent); -+} -+ -+void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) -+{ -+ int err, i, j; -+ struct au_dcsub_pages dpages; -+ struct au_dpage *dpage; -+ struct dentry **dentries; -+ -+ err = au_dpages_init(&dpages, GFP_NOFS); -+ AuDebugOn(err); -+ err = au_dcsub_pages_rev(&dpages, parent, /*do_include*/1, NULL, NULL); -+ AuDebugOn(err); -+ for (i = dpages.ndpage - 1; !err && i >= 0; i--) { -+ dpage = dpages.dpages + i; -+ dentries = dpage->dentries; -+ for (j = dpage->ndentry - 1; !err && j >= 0; j--) -+
AuDebugOn(au_digen(dentries[j]) != sigen); -+ } -+ au_dpages_free(&dpages); -+} -+ -+void au_dbg_verify_hf(struct au_finfo *finfo) -+{ -+ struct au_hfile *hf; -+ aufs_bindex_t bend, bindex; -+ -+ if (finfo->fi_bstart >= 0) { -+ bend = finfo->fi_bend; -+ for (bindex = finfo->fi_bstart; bindex <= bend; bindex++) { -+ hf = finfo->fi_hfile + bindex; -+ AuDebugOn(hf->hf_file || hf->hf_br); -+ } -+ } -+} -+ -+void au_dbg_verify_kthread(void) -+{ -+ if (au_test_wkq(current)) { -+ au_dbg_blocked(); -+ BUG(); -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_debug_sbinfo_init(struct au_sbinfo *sbinfo __maybe_unused) -+{ -+#ifdef AuForceNoPlink -+ au_opt_clr(sbinfo->si_mntflags, PLINK); -+#endif -+#ifdef AuForceNoXino -+ au_opt_clr(sbinfo->si_mntflags, XINO); -+#endif -+#ifdef AuForceNoRefrof -+ au_opt_clr(sbinfo->si_mntflags, REFROF); -+#endif -+#ifdef AuForceHinotify -+ au_opt_set_udba(sbinfo->si_mntflags, UDBA_HINOTIFY); -+#endif -+#ifdef AuForceRd0 -+ sbinfo->si_rdblk = 0; -+ sbinfo->si_rdhash = 0; -+#endif -+} -+ -+int __init au_debug_init(void) -+{ -+ aufs_bindex_t bindex; -+ struct au_vdir_destr destr; -+ -+ bindex = -1; -+ AuDebugOn(bindex >= 0); -+ -+ destr.len = -1; -+ AuDebugOn(destr.len < NAME_MAX); -+ -+#ifdef CONFIG_4KSTACKS -+ AuWarn("CONFIG_4KSTACKS is defined.\n"); -+#endif -+ -+#ifdef AuForceNoBrs -+ sysaufs_brs = 0; -+#endif -+ -+ return 0; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/debug.h linux-2.6.31/fs/aufs/debug.h ---- linux-2.6.31-vanilla/fs/aufs/debug.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/debug.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,263 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * debug print functions -+ */ -+ -+#ifndef __AUFS_DEBUG_H__ -+#define __AUFS_DEBUG_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <asm/system.h> -+#include <linux/bug.h> -+/* #include <linux/err.h> */ -+#include <linux/init.h> -+/* #include <linux/kernel.h> */ -+#include <linux/delay.h> -+/* #include <linux/kd.h> */ -+/* #include <linux/vt_kern.h> */ -+#include <linux/sysrq.h> -+#include <linux/aufs_type.h> -+ -+#include <asm/system.h> -+ -+#ifdef CONFIG_AUFS_DEBUG -+#define AuDebugOn(a) BUG_ON(a) -+ -+/* module parameter */ -+extern int aufs_debug; -+static inline void au_debug(int n) -+{ -+ aufs_debug = n; -+ smp_mb(); -+} -+ -+static inline int au_debug_test(void) -+{ -+ return aufs_debug; -+} -+#else -+#define AuDebugOn(a) do {} while (0) -+#define au_debug() do {} while (0) -+static inline int au_debug_test(void) -+{ -+ return 0; -+} -+#endif /* CONFIG_AUFS_DEBUG */ -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* debug print */ -+ -+#define AuDpri(lvl, fmt, arg...) 
\ -+ printk(lvl AUFS_NAME " %s:%d:%s[%d]: " fmt, \ -+ __func__, __LINE__, current->comm, current->pid, ##arg) -+#define AuDbg(fmt, arg...) do { \ -+ if (au_debug_test()) \ -+ AuDpri(KERN_DEBUG, "DEBUG: " fmt, ##arg); \ -+} while (0) -+#define AuLabel(l) AuDbg(#l "\n") -+#define AuInfo(fmt, arg...) AuDpri(KERN_INFO, fmt, ##arg) -+#define AuWarn(fmt, arg...) AuDpri(KERN_WARNING, fmt, ##arg) -+#define AuErr(fmt, arg...) AuDpri(KERN_ERR, fmt, ##arg) -+#define AuIOErr(fmt, arg...) AuErr("I/O Error, " fmt, ##arg) -+#define AuWarn1(fmt, arg...) do { \ -+ static unsigned char _c; \ -+ if (!_c++) \ -+ AuWarn(fmt, ##arg); \ -+} while (0) -+ -+#define AuErr1(fmt, arg...) do { \ -+ static unsigned char _c; \ -+ if (!_c++) \ -+ AuErr(fmt, ##arg); \ -+} while (0) -+ -+#define AuIOErr1(fmt, arg...) do { \ -+ static unsigned char _c; \ -+ if (!_c++) \ -+ AuIOErr(fmt, ##arg); \ -+} while (0) -+ -+#define AuUnsupportMsg "This operation is not supported." \ -+ " Please report this application to aufs-users ML." -+#define AuUnsupport(fmt, args...) do { \ -+ AuErr(AuUnsupportMsg "\n" fmt, ##args); \ -+ dump_stack(); \ -+} while (0) -+ -+#define AuTraceErr(e) do { \ -+ if (unlikely((e) < 0)) \ -+ AuDbg("err %d\n", (int)(e)); \ -+} while (0) -+ -+#define AuTraceErrPtr(p) do { \ -+ if (IS_ERR(p)) \ -+ AuDbg("err %ld\n", PTR_ERR(p)); \ -+} while (0) -+ -+/* dirty macros for debug print, use with "%.*s" and caution */ -+#define AuLNPair(qstr) (qstr)->len, (qstr)->name -+#define AuDLNPair(d) AuLNPair(&(d)->d_name) -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct au_sbinfo; -+struct au_finfo; -+struct dentry; -+#ifdef CONFIG_AUFS_DEBUG -+extern char *au_plevel; -+struct au_nhash; -+void au_dpri_whlist(struct au_nhash *whlist); -+struct au_vdir; -+void au_dpri_vdir(struct au_vdir *vdir); -+struct inode; -+void au_dpri_inode(struct inode *inode); -+void au_dpri_dentry(struct dentry *dentry); -+struct file; -+void au_dpri_file(struct file *filp); -+struct super_block; -+void au_dpri_sb(struct super_block *sb); -+ -+void au_dbg_sleep_jiffy(int jiffy); -+struct iattr; -+void au_dbg_iattr(struct iattr *ia); -+ -+void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen); -+void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen); -+void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); -+void au_dbg_verify_hf(struct au_finfo *finfo); -+void au_dbg_verify_kthread(void); -+ -+int __init au_debug_init(void); -+void au_debug_sbinfo_init(struct au_sbinfo *sbinfo); -+#define AuDbgWhlist(w) do { \ -+ AuDbg(#w "\n"); \ -+ au_dpri_whlist(w); \ -+} while (0) -+ -+#define AuDbgVdir(v) do { \ -+ AuDbg(#v "\n"); \ -+ au_dpri_vdir(v); \ -+} while (0) -+ -+#define AuDbgInode(i) do { \ -+ AuDbg(#i "\n"); \ -+ au_dpri_inode(i); \ -+} while (0) -+ -+#define AuDbgDentry(d) do { \ -+ AuDbg(#d "\n"); \ -+ au_dpri_dentry(d); \ -+} while (0) -+ -+#define AuDbgFile(f) do { \ -+ AuDbg(#f "\n"); \ -+ au_dpri_file(f); \ -+} while (0) -+ -+#define AuDbgSb(sb) do { \ -+ AuDbg(#sb "\n"); \ -+ au_dpri_sb(sb); \ -+} while (0) -+ -+#define AuDbgSleep(sec) do { \ -+ AuDbg("sleep %d sec\n", sec); \ -+ ssleep(sec); \ -+} while (0) -+ -+#define AuDbgSleepJiffy(jiffy) do { \ -+ AuDbg("sleep %d jiffies\n", jiffy); \ -+ au_dbg_sleep_jiffy(jiffy); \ -+} while (0) -+ -+#define AuDbgIAttr(ia) do { \ -+ AuDbg("ia_valid 0x%x\n", (ia)->ia_valid); \ -+ au_dbg_iattr(ia); \ -+} while (0) -+#else -+static inline void au_dbg_verify_dir_parent(struct dentry *dentry, -+ unsigned int 
sigen) -+{ -+ /* empty */ -+} -+static inline void au_dbg_verify_nondir_parent(struct dentry *dentry, -+ unsigned int sigen) -+{ -+ /* empty */ -+} -+static inline void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) -+{ -+ /* empty */ -+} -+static inline void au_dbg_verify_hf(struct au_finfo *finfo) -+{ -+ /* empty */ -+} -+static inline void au_dbg_verify_kthread(void) -+{ -+ /* empty */ -+} -+ -+static inline int au_debug_init(void) -+{ -+ return 0; -+} -+static inline void au_debug_sbinfo_init(struct au_sbinfo *sbinfo) -+{ -+ /* empty */ -+} -+#define AuDbgWhlist(w) do {} while (0) -+#define AuDbgVdir(v) do {} while (0) -+#define AuDbgInode(i) do {} while (0) -+#define AuDbgDentry(d) do {} while (0) -+#define AuDbgFile(f) do {} while (0) -+#define AuDbgSb(sb) do {} while (0) -+#define AuDbgSleep(sec) do {} while (0) -+#define AuDbgSleepJiffy(jiffy) do {} while (0) -+#define AuDbgIAttr(ia) do {} while (0) -+#endif /* CONFIG_AUFS_DEBUG */ -+ -+/* ---------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_AUFS_MAGIC_SYSRQ -+int __init au_sysrq_init(void); -+void au_sysrq_fin(void); -+ -+#ifdef CONFIG_HW_CONSOLE -+#define au_dbg_blocked() do { \ -+ WARN_ON(1); \ -+ handle_sysrq('w', vc_cons[fg_console].d->vc_tty); \ -+} while (0) -+#else -+#define au_dbg_blocked() do {} while (0) -+#endif -+ -+#else -+static inline int au_sysrq_init(void) -+{ -+ return 0; -+} -+#define au_sysrq_fin() do {} while (0) -+#define au_dbg_blocked() do {} while (0) -+#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_DEBUG_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/dentry.c linux-2.6.31/fs/aufs/dentry.c ---- linux-2.6.31-vanilla/fs/aufs/dentry.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/dentry.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,879 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * lookup and dentry operations -+ */ -+ -+#include <linux/namei.h> -+#include "aufs.h" -+ -+static void au_h_nd(struct nameidata *h_nd, struct nameidata *nd) -+{ -+ if (nd) { -+ *h_nd = *nd; -+ -+ /* -+ * gave up supporting LOOKUP_CREATE/OPEN for lower fs, -+ * due to whiteout and branch permission. -+ */ -+ h_nd->flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE -+ | LOOKUP_FOLLOW); -+ /* unnecessary? 
*/ -+ h_nd->intent.open.file = NULL; -+ } else -+ memset(h_nd, 0, sizeof(*h_nd)); -+} -+ -+struct au_lkup_one_args { -+ struct dentry **errp; -+ struct qstr *name; -+ struct dentry *h_parent; -+ struct au_branch *br; -+ struct nameidata *nd; -+}; -+ -+struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, -+ struct au_branch *br, struct nameidata *nd) -+{ -+ struct dentry *h_dentry; -+ int err; -+ struct nameidata h_nd; -+ -+ if (au_test_fs_null_nd(h_parent->d_sb)) -+ return vfsub_lookup_one_len(name->name, h_parent, name->len); -+ -+ au_h_nd(&h_nd, nd); -+ h_nd.path.dentry = h_parent; -+ h_nd.path.mnt = br->br_mnt; -+ -+ err = __lookup_one_len(name->name, &h_nd.last, NULL, name->len); -+ h_dentry = ERR_PTR(err); -+ if (!err) { -+ path_get(&h_nd.path); -+ h_dentry = vfsub_lookup_hash(&h_nd); -+ path_put(&h_nd.path); -+ } -+ -+ return h_dentry; -+} -+ -+static void au_call_lkup_one(void *args) -+{ -+ struct au_lkup_one_args *a = args; -+ *a->errp = au_lkup_one(a->name, a->h_parent, a->br, a->nd); -+} -+ -+#define AuLkup_ALLOW_NEG 1 -+#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) -+#define au_fset_lkup(flags, name) { (flags) |= AuLkup_##name; } -+#define au_fclr_lkup(flags, name) { (flags) &= ~AuLkup_##name; } -+ -+struct au_do_lookup_args { -+ unsigned int flags; -+ mode_t type; -+ struct nameidata *nd; -+}; -+ -+/* -+ * returns positive/negative dentry, NULL or an error. -+ * NULL means whiteout-ed or not-found. -+ */ -+static struct dentry* -+au_do_lookup(struct dentry *h_parent, struct dentry *dentry, -+ aufs_bindex_t bindex, struct qstr *wh_name, -+ struct au_do_lookup_args *args) -+{ -+ struct dentry *h_dentry; -+ struct inode *h_inode, *inode; -+ struct qstr *name; -+ struct au_branch *br; -+ int wh_found, opq; -+ unsigned char wh_able; -+ const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); -+ -+ name = &dentry->d_name; -+ wh_found = 0; -+ br = au_sbr(dentry->d_sb, bindex); -+ wh_able = !!au_br_whable(br->br_perm); -+ if (wh_able) -+ wh_found = au_wh_test(h_parent, wh_name, br, /*try_sio*/0); -+ h_dentry = ERR_PTR(wh_found); -+ if (!wh_found) -+ goto real_lookup; -+ if (unlikely(wh_found < 0)) -+ goto out; -+ -+ /* We found a whiteout */ -+ /* au_set_dbend(dentry, bindex); */ -+ au_set_dbwh(dentry, bindex); -+ if (!allow_neg) -+ return NULL; /* success */ -+ -+ real_lookup: -+ h_dentry = au_lkup_one(name, h_parent, br, args->nd); -+ if (IS_ERR(h_dentry)) -+ goto out; -+ -+ h_inode = h_dentry->d_inode; -+ if (!h_inode) { -+ if (!allow_neg) -+ goto out_neg; -+ } else if (wh_found -+ || (args->type && args->type != (h_inode->i_mode & S_IFMT))) -+ goto out_neg; -+ -+ if (au_dbend(dentry) <= bindex) -+ au_set_dbend(dentry, bindex); -+ if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) -+ au_set_dbstart(dentry, bindex); -+ au_set_h_dptr(dentry, bindex, h_dentry); -+ -+ inode = dentry->d_inode; -+ if (!h_inode || !S_ISDIR(h_inode->i_mode) || !wh_able -+ || (inode && !S_ISDIR(inode->i_mode))) -+ goto out; /* success */ -+ -+ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); -+ opq = au_diropq_test(h_dentry, br); -+ mutex_unlock(&h_inode->i_mutex); -+ if (opq > 0) -+ au_set_dbdiropq(dentry, bindex); -+ else if (unlikely(opq < 0)) { -+ au_set_h_dptr(dentry, bindex, NULL); -+ h_dentry = ERR_PTR(opq); -+ } -+ goto out; -+ -+ out_neg: -+ dput(h_dentry); -+ h_dentry = NULL; -+ out: -+ return h_dentry; -+} -+ -+static int au_test_shwh(struct super_block *sb, const struct qstr *name) -+{ -+ if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) 
-+ && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) -+ return -EPERM; -+ return 0; -+} -+ -+/* -+ * returns the number of lower positive dentries, -+ * otherwise an error. -+ * can be called at unlinking with @type is zero. -+ */ -+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, -+ struct nameidata *nd) -+{ -+ int npositive, err; -+ aufs_bindex_t bindex, btail, bdiropq; -+ unsigned char isdir; -+ struct qstr whname; -+ struct au_do_lookup_args args = { -+ .flags = 0, -+ .type = type, -+ .nd = nd -+ }; -+ const struct qstr *name = &dentry->d_name; -+ struct dentry *parent; -+ struct inode *inode; -+ -+ parent = dget_parent(dentry); -+ err = au_test_shwh(dentry->d_sb, name); -+ if (unlikely(err)) -+ goto out; -+ -+ err = au_wh_name_alloc(&whname, name); -+ if (unlikely(err)) -+ goto out; -+ -+ inode = dentry->d_inode; -+ isdir = !!(inode && S_ISDIR(inode->i_mode)); -+ if (!type) -+ au_fset_lkup(args.flags, ALLOW_NEG); -+ -+ npositive = 0; -+ btail = au_dbtaildir(parent); -+ for (bindex = bstart; bindex <= btail; bindex++) { -+ struct dentry *h_parent, *h_dentry; -+ struct inode *h_inode, *h_dir; -+ -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (h_dentry) { -+ if (h_dentry->d_inode) -+ npositive++; -+ if (type != S_IFDIR) -+ break; -+ continue; -+ } -+ h_parent = au_h_dptr(parent, bindex); -+ if (!h_parent) -+ continue; -+ h_dir = h_parent->d_inode; -+ if (!h_dir || !S_ISDIR(h_dir->i_mode)) -+ continue; -+ -+ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); -+ h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, -+ &args); -+ mutex_unlock(&h_dir->i_mutex); -+ err = PTR_ERR(h_dentry); -+ if (IS_ERR(h_dentry)) -+ goto out_wh; -+ au_fclr_lkup(args.flags, ALLOW_NEG); -+ -+ if (au_dbwh(dentry) >= 0) -+ break; -+ if (!h_dentry) -+ continue; -+ h_inode = h_dentry->d_inode; -+ if (!h_inode) -+ continue; -+ npositive++; -+ if (!args.type) -+ args.type = h_inode->i_mode & S_IFMT; -+ if (args.type != S_IFDIR) -+ break; -+ else if (isdir) { -+ /* the type of lower may be different */ -+ bdiropq = au_dbdiropq(dentry); -+ if (bdiropq >= 0 && bdiropq <= bindex) -+ break; -+ } -+ } -+ -+ if (npositive) { -+ AuLabel(positive); -+ au_update_dbstart(dentry); -+ } -+ err = npositive; -+ if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) -+ && au_dbstart(dentry) < 0)) -+ /* both of real entry and whiteout found */ -+ err = -EIO; -+ -+ out_wh: -+ kfree(whname.name); -+ out: -+ dput(parent); -+ return err; -+} -+ -+struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, -+ struct au_branch *br) -+{ -+ struct dentry *dentry; -+ int wkq_err; -+ -+ if (!au_test_h_perm_sio(parent->d_inode, MAY_EXEC)) -+ dentry = au_lkup_one(name, parent, br, /*nd*/NULL); -+ else { -+ struct au_lkup_one_args args = { -+ .errp = &dentry, -+ .name = name, -+ .h_parent = parent, -+ .br = br, -+ .nd = NULL -+ }; -+ -+ wkq_err = au_wkq_wait(au_call_lkup_one, &args); -+ if (unlikely(wkq_err)) -+ dentry = ERR_PTR(wkq_err); -+ } -+ -+ return dentry; -+} -+ -+/* -+ * lookup @dentry on @bindex which should be negative. 
-+ */ -+int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex) -+{ -+ int err; -+ struct dentry *parent, *h_parent, *h_dentry; -+ struct qstr *name; -+ -+ name = &dentry->d_name; -+ parent = dget_parent(dentry); -+ h_parent = au_h_dptr(parent, bindex); -+ h_dentry = au_sio_lkup_one(name, h_parent, -+ au_sbr(dentry->d_sb, bindex)); -+ err = PTR_ERR(h_dentry); -+ if (IS_ERR(h_dentry)) -+ goto out; -+ if (unlikely(h_dentry->d_inode)) { -+ err = -EIO; -+ AuIOErr("b%d %.*s should be negative.\n", -+ bindex, AuDLNPair(h_dentry)); -+ dput(h_dentry); -+ goto out; -+ } -+ -+ if (bindex < au_dbstart(dentry)) -+ au_set_dbstart(dentry, bindex); -+ if (au_dbend(dentry) < bindex) -+ au_set_dbend(dentry, bindex); -+ au_set_h_dptr(dentry, bindex, h_dentry); -+ err = 0; -+ -+ out: -+ dput(parent); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* subset of struct inode */ -+struct au_iattr { -+ unsigned long i_ino; -+ /* unsigned int i_nlink; */ -+ uid_t i_uid; -+ gid_t i_gid; -+ u64 i_version; -+/* -+ loff_t i_size; -+ blkcnt_t i_blocks; -+*/ -+ umode_t i_mode; -+}; -+ -+static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) -+{ -+ ia->i_ino = h_inode->i_ino; -+ /* ia->i_nlink = h_inode->i_nlink; */ -+ ia->i_uid = h_inode->i_uid; -+ ia->i_gid = h_inode->i_gid; -+ ia->i_version = h_inode->i_version; -+/* -+ ia->i_size = h_inode->i_size; -+ ia->i_blocks = h_inode->i_blocks; -+*/ -+ ia->i_mode = (h_inode->i_mode & S_IFMT); -+} -+ -+static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) -+{ -+ return ia->i_ino != h_inode->i_ino -+ /* || ia->i_nlink != h_inode->i_nlink */ -+ || ia->i_uid != h_inode->i_uid -+ || ia->i_gid != h_inode->i_gid -+ || ia->i_version != h_inode->i_version -+/* -+ || ia->i_size != h_inode->i_size -+ || ia->i_blocks != h_inode->i_blocks -+*/ -+ || ia->i_mode != (h_inode->i_mode & S_IFMT); -+} -+ -+static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, -+ struct au_branch *br) -+{ -+ int err; -+ struct au_iattr ia; -+ struct inode *h_inode; -+ struct dentry *h_d; -+ struct super_block *h_sb; -+ -+ err = 0; -+ memset(&ia, -1, sizeof(ia)); -+ h_sb = h_dentry->d_sb; -+ h_inode = h_dentry->d_inode; -+ if (h_inode) -+ au_iattr_save(&ia, h_inode); -+ else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) -+ /* nfs d_revalidate may return 0 for negative dentry */ -+ /* fuse d_revalidate always return 0 for negative dentry */ -+ goto out; -+ -+ /* main purpose is namei.c:cached_lookup() and d_revalidate */ -+ h_d = au_lkup_one(&h_dentry->d_name, h_parent, br, /*nd*/NULL); -+ err = PTR_ERR(h_d); -+ if (IS_ERR(h_d)) -+ goto out; -+ -+ err = 0; -+ if (unlikely(h_d != h_dentry -+ || h_d->d_inode != h_inode -+ || (h_inode && au_iattr_test(&ia, h_inode)))) -+ err = au_busy_or_stale(); -+ dput(h_d); -+ -+ out: -+ AuTraceErr(err); -+ return err; -+} -+ -+int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, -+ struct dentry *h_parent, struct au_branch *br) -+{ -+ int err; -+ -+ err = 0; -+ if (udba == AuOpt_UDBA_REVAL) { -+ IMustLock(h_dir); -+ err = (h_dentry->d_parent->d_inode != h_dir); -+ } else if (udba == AuOpt_UDBA_HINOTIFY) -+ err = au_h_verify_dentry(h_dentry, h_parent, br); -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static void au_do_refresh_hdentry(struct au_hdentry *p, struct au_dinfo *dinfo, -+ struct dentry *parent) -+{ -+ struct dentry *h_d, *h_dp; -+ struct au_hdentry tmp, *q; -+ 
struct super_block *sb; -+ aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq; -+ -+ AuRwMustWriteLock(&dinfo->di_rwsem); -+ -+ bend = dinfo->di_bend; -+ bwh = dinfo->di_bwh; -+ bdiropq = dinfo->di_bdiropq; -+ for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) { -+ h_d = p->hd_dentry; -+ if (!h_d) -+ continue; -+ -+ h_dp = dget_parent(h_d); -+ if (h_dp == au_h_dptr(parent, bindex)) { -+ dput(h_dp); -+ continue; -+ } -+ -+ new_bindex = au_find_dbindex(parent, h_dp); -+ dput(h_dp); -+ if (dinfo->di_bwh == bindex) -+ bwh = new_bindex; -+ if (dinfo->di_bdiropq == bindex) -+ bdiropq = new_bindex; -+ if (new_bindex < 0) { -+ au_hdput(p); -+ p->hd_dentry = NULL; -+ continue; -+ } -+ -+ /* swap two lower dentries, and loop again */ -+ q = dinfo->di_hdentry + new_bindex; -+ tmp = *q; -+ *q = *p; -+ *p = tmp; -+ if (tmp.hd_dentry) { -+ bindex--; -+ p--; -+ } -+ } -+ -+ sb = parent->d_sb; -+ dinfo->di_bwh = -1; -+ if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh)) -+ dinfo->di_bwh = bwh; -+ -+ dinfo->di_bdiropq = -1; -+ if (bdiropq >= 0 -+ && bdiropq <= au_sbend(sb) -+ && au_sbr_whable(sb, bdiropq)) -+ dinfo->di_bdiropq = bdiropq; -+ -+ bend = au_dbend(parent); -+ p = dinfo->di_hdentry; -+ for (bindex = 0; bindex <= bend; bindex++, p++) -+ if (p->hd_dentry) { -+ dinfo->di_bstart = bindex; -+ break; -+ } -+ -+ p = dinfo->di_hdentry + bend; -+ for (bindex = bend; bindex >= 0; bindex--, p--) -+ if (p->hd_dentry) { -+ dinfo->di_bend = bindex; -+ break; -+ } -+} -+ -+/* -+ * returns the number of found lower positive dentries, -+ * otherwise an error. -+ */ -+int au_refresh_hdentry(struct dentry *dentry, mode_t type) -+{ -+ int npositive, err; -+ unsigned int sigen; -+ aufs_bindex_t bstart; -+ struct au_dinfo *dinfo; -+ struct super_block *sb; -+ struct dentry *parent; -+ -+ DiMustWriteLock(dentry); -+ -+ sb = dentry->d_sb; -+ AuDebugOn(IS_ROOT(dentry)); -+ sigen = au_sigen(sb); -+ parent = dget_parent(dentry); -+ AuDebugOn(au_digen(parent) != sigen -+ || au_iigen(parent->d_inode) != sigen); -+ -+ dinfo = au_di(dentry); -+ err = au_di_realloc(dinfo, au_sbend(sb) + 1); -+ npositive = err; -+ if (unlikely(err)) -+ goto out; -+ au_do_refresh_hdentry(dinfo->di_hdentry + dinfo->di_bstart, dinfo, -+ parent); -+ -+ npositive = 0; -+ bstart = au_dbstart(parent); -+ if (type != S_IFDIR && dinfo->di_bstart == bstart) -+ goto out_dgen; /* success */ -+ -+ npositive = au_lkup_dentry(dentry, bstart, type, /*nd*/NULL); -+ if (npositive < 0) -+ goto out; -+ if (dinfo->di_bwh >= 0 && dinfo->di_bwh <= dinfo->di_bstart) -+ d_drop(dentry); -+ -+ out_dgen: -+ au_update_digen(dentry); -+ out: -+ dput(parent); -+ AuTraceErr(npositive); -+ return npositive; -+} -+ -+static noinline_for_stack -+int au_do_h_d_reval(struct dentry *h_dentry, struct nameidata *nd, -+ struct dentry *dentry, aufs_bindex_t bindex) -+{ -+ int err, valid; -+ int (*reval)(struct dentry *, struct nameidata *); -+ -+ err = 0; -+ reval = NULL; -+ if (h_dentry->d_op) -+ reval = h_dentry->d_op->d_revalidate; -+ if (!reval) -+ goto out; -+ -+ AuDbg("b%d\n", bindex); -+ if (au_test_fs_null_nd(h_dentry->d_sb)) -+ /* it may return tri-state */ -+ valid = reval(h_dentry, NULL); -+ else { -+ struct nameidata h_nd; -+ int locked; -+ struct dentry *parent; -+ -+ au_h_nd(&h_nd, nd); -+ parent = nd->path.dentry; -+ locked = (nd && nd->path.dentry != dentry); -+ if (locked) -+ di_read_lock_parent(parent, AuLock_IR); -+ BUG_ON(bindex > au_dbend(parent)); -+ h_nd.path.dentry = au_h_dptr(parent, bindex); -+ BUG_ON(!h_nd.path.dentry); -+ 
h_nd.path.mnt = au_sbr(parent->d_sb, bindex)->br_mnt; -+ path_get(&h_nd.path); -+ valid = reval(h_dentry, &h_nd); -+ path_put(&h_nd.path); -+ if (locked) -+ di_read_unlock(parent, AuLock_IR); -+ } -+ -+ if (unlikely(valid < 0)) -+ err = valid; -+ else if (!valid) -+ err = -EINVAL; -+ -+ out: -+ AuTraceErr(err); -+ return err; -+} -+ -+/* todo: remove this */ -+static int h_d_revalidate(struct dentry *dentry, struct inode *inode, -+ struct nameidata *nd, int do_udba) -+{ -+ int err; -+ umode_t mode, h_mode; -+ aufs_bindex_t bindex, btail, bstart, ibs, ibe; -+ unsigned char plus, unhashed, is_root, h_plus; -+ struct inode *first, *h_inode, *h_cached_inode; -+ struct dentry *h_dentry; -+ struct qstr *name, *h_name; -+ -+ err = 0; -+ plus = 0; -+ mode = 0; -+ first = NULL; -+ ibs = -1; -+ ibe = -1; -+ unhashed = !!d_unhashed(dentry); -+ is_root = !!IS_ROOT(dentry); -+ name = &dentry->d_name; -+ -+ /* -+ * Theoretically, REVAL test should be unnecessary in case of INOTIFY. -+ * But inotify doesn't fire some necessary events, -+ * IN_ATTRIB for atime/nlink/pageio -+ * IN_DELETE for NFS dentry -+ * Let's do REVAL test too. -+ */ -+ if (do_udba && inode) { -+ mode = (inode->i_mode & S_IFMT); -+ plus = (inode->i_nlink > 0); -+ first = au_h_iptr(inode, au_ibstart(inode)); -+ ibs = au_ibstart(inode); -+ ibe = au_ibend(inode); -+ } -+ -+ bstart = au_dbstart(dentry); -+ btail = bstart; -+ if (inode && S_ISDIR(inode->i_mode)) -+ btail = au_dbtaildir(dentry); -+ for (bindex = bstart; bindex <= btail; bindex++) { -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (!h_dentry) -+ continue; -+ -+ AuDbg("b%d, %.*s\n", bindex, AuDLNPair(h_dentry)); -+ h_name = &h_dentry->d_name; -+ if (unlikely(do_udba -+ && !is_root -+ && (unhashed != !!d_unhashed(h_dentry) -+ || name->len != h_name->len -+ || memcmp(name->name, h_name->name, name->len)) -+ )) { -+ AuDbg("unhash 0x%x 0x%x, %.*s %.*s\n", -+ unhashed, d_unhashed(h_dentry), -+ AuDLNPair(dentry), AuDLNPair(h_dentry)); -+ goto err; -+ } -+ -+ err = au_do_h_d_reval(h_dentry, nd, dentry, bindex); -+ if (unlikely(err)) -+ /* do not goto err, to keep the errno */ -+ break; -+ -+ /* todo: plink too? 
*/ -+ if (!do_udba) -+ continue; -+ -+ /* UDBA tests */ -+ h_inode = h_dentry->d_inode; -+ if (unlikely(!!inode != !!h_inode)) -+ goto err; -+ -+ h_plus = plus; -+ h_mode = mode; -+ h_cached_inode = h_inode; -+ if (h_inode) { -+ h_mode = (h_inode->i_mode & S_IFMT); -+ h_plus = (h_inode->i_nlink > 0); -+ } -+ if (inode && ibs <= bindex && bindex <= ibe) -+ h_cached_inode = au_h_iptr(inode, bindex); -+ -+ if (unlikely(plus != h_plus -+ || mode != h_mode -+ || h_cached_inode != h_inode)) -+ goto err; -+ continue; -+ -+ err: -+ err = -EINVAL; -+ break; -+ } -+ -+ return err; -+} -+ -+static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) -+{ -+ int err; -+ struct dentry *parent; -+ struct inode *inode; -+ -+ inode = dentry->d_inode; -+ if (au_digen(dentry) == sigen && au_iigen(inode) == sigen) -+ return 0; -+ -+ parent = dget_parent(dentry); -+ di_read_lock_parent(parent, AuLock_IR); -+ AuDebugOn(au_digen(parent) != sigen -+ || au_iigen(parent->d_inode) != sigen); -+ au_dbg_verify_gen(parent, sigen); -+ -+ /* returns a number of positive dentries */ -+ err = au_refresh_hdentry(dentry, inode->i_mode & S_IFMT); -+ if (err >= 0) -+ err = au_refresh_hinode(inode, dentry); -+ -+ di_read_unlock(parent, AuLock_IR); -+ dput(parent); -+ return err; -+} -+ -+int au_reval_dpath(struct dentry *dentry, unsigned int sigen) -+{ -+ int err; -+ struct dentry *d, *parent; -+ struct inode *inode; -+ -+ if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIRS)) -+ return simple_reval_dpath(dentry, sigen); -+ -+ /* slow loop, keep it simple and stupid */ -+ /* cf: au_cpup_dirs() */ -+ err = 0; -+ parent = NULL; -+ while (au_digen(dentry) != sigen -+ || au_iigen(dentry->d_inode) != sigen) { -+ d = dentry; -+ while (1) { -+ dput(parent); -+ parent = dget_parent(d); -+ if (au_digen(parent) == sigen -+ && au_iigen(parent->d_inode) == sigen) -+ break; -+ d = parent; -+ } -+ -+ inode = d->d_inode; -+ if (d != dentry) -+ di_write_lock_child(d); -+ -+ /* someone might update our dentry while we were sleeping */ -+ if (au_digen(d) != sigen || au_iigen(d->d_inode) != sigen) { -+ di_read_lock_parent(parent, AuLock_IR); -+ /* returns a number of positive dentries */ -+ err = au_refresh_hdentry(d, inode->i_mode & S_IFMT); -+ if (err >= 0) -+ err = au_refresh_hinode(inode, d); -+ di_read_unlock(parent, AuLock_IR); -+ } -+ -+ if (d != dentry) -+ di_write_unlock(d); -+ dput(parent); -+ if (unlikely(err)) -+ break; -+ } -+ -+ return err; -+} -+ -+/* -+ * if valid returns 1, otherwise 0. 
-+ */ -+static int aufs_d_revalidate(struct dentry *dentry, struct nameidata *nd) -+{ -+ int valid, err; -+ unsigned int sigen; -+ unsigned char do_udba; -+ struct super_block *sb; -+ struct inode *inode; -+ -+ err = -EINVAL; -+ sb = dentry->d_sb; -+ inode = dentry->d_inode; -+ aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW); -+ sigen = au_sigen(sb); -+ if (au_digen(dentry) != sigen) { -+ AuDebugOn(IS_ROOT(dentry)); -+ if (inode) -+ err = au_reval_dpath(dentry, sigen); -+ if (unlikely(err)) -+ goto out_dgrade; -+ AuDebugOn(au_digen(dentry) != sigen); -+ } -+ if (inode && au_iigen(inode) != sigen) { -+ AuDebugOn(IS_ROOT(dentry)); -+ err = au_refresh_hinode(inode, dentry); -+ if (unlikely(err)) -+ goto out_dgrade; -+ AuDebugOn(au_iigen(inode) != sigen); -+ } -+ di_downgrade_lock(dentry, AuLock_IR); -+ -+ AuDebugOn(au_digen(dentry) != sigen); -+ AuDebugOn(inode && au_iigen(inode) != sigen); -+ err = -EINVAL; -+ do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); -+ if (do_udba && inode) { -+ aufs_bindex_t bstart = au_ibstart(inode); -+ -+ if (bstart >= 0 -+ && au_test_higen(inode, au_h_iptr(inode, bstart))) -+ goto out; -+ } -+ -+ err = h_d_revalidate(dentry, inode, nd, do_udba); -+ if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) -+ /* both of real entry and whiteout found */ -+ err = -EIO; -+ goto out; -+ -+ out_dgrade: -+ di_downgrade_lock(dentry, AuLock_IR); -+ out: -+ aufs_read_unlock(dentry, AuLock_IR); -+ AuTraceErr(err); -+ valid = !err; -+ if (!valid) -+ AuDbg("%.*s invalid\n", AuDLNPair(dentry)); -+ return valid; -+} -+ -+static void aufs_d_release(struct dentry *dentry) -+{ -+ struct au_dinfo *dinfo; -+ aufs_bindex_t bend, bindex; -+ -+ dinfo = dentry->d_fsdata; -+ if (!dinfo) -+ return; -+ -+ /* dentry may not be revalidated */ -+ bindex = dinfo->di_bstart; -+ if (bindex >= 0) { -+ struct au_hdentry *p; -+ -+ bend = dinfo->di_bend; -+ p = dinfo->di_hdentry + bindex; -+ while (bindex++ <= bend) { -+ if (p->hd_dentry) -+ au_hdput(p); -+ p++; -+ } -+ } -+ kfree(dinfo->di_hdentry); -+ AuRwDestroy(&dinfo->di_rwsem); -+ au_cache_free_dinfo(dinfo); -+ au_hin_di_reinit(dentry); -+} -+ -+struct dentry_operations aufs_dop = { -+ .d_revalidate = aufs_d_revalidate, -+ .d_release = aufs_d_release -+}; -diff -Nur linux-2.6.31-vanilla/fs/aufs/dentry.h linux-2.6.31/fs/aufs/dentry.h ---- linux-2.6.31-vanilla/fs/aufs/dentry.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/dentry.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,231 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/*
-+ * lookup and dentry operations
-+ */
-+
-+#ifndef __AUFS_DENTRY_H__
-+#define __AUFS_DENTRY_H__
-+
-+#ifdef __KERNEL__
-+
-+#include <linux/dcache.h>
-+#include <linux/aufs_type.h>
-+#include "rwsem.h"
-+
-+/* make a single member structure for future use */
-+/* todo: remove this structure */
-+struct au_hdentry {
-+        struct dentry *hd_dentry;
-+};
-+
-+struct au_dinfo {
-+        atomic_t di_generation;
-+
-+        struct au_rwsem di_rwsem;
-+        aufs_bindex_t di_bstart, di_bend, di_bwh, di_bdiropq;
-+        struct au_hdentry *di_hdentry;
-+};
-+
-+/* ---------------------------------------------------------------------- */
-+
-+/* dentry.c */
-+extern struct dentry_operations aufs_dop;
-+struct au_branch;
-+struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent,
-+                           struct au_branch *br, struct nameidata *nd);
-+struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent,
-+                               struct au_branch *br);
-+int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
-+                struct dentry *h_parent, struct au_branch *br);
-+
-+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type,
-+                   struct nameidata *nd);
-+int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex);
-+int au_refresh_hdentry(struct dentry *dentry, mode_t type);
-+int au_reval_dpath(struct dentry *dentry, unsigned int sigen);
-+
-+/* dinfo.c */
-+int au_alloc_dinfo(struct dentry *dentry);
-+int au_di_realloc(struct au_dinfo *dinfo, int nbr);
-+
-+void di_read_lock(struct dentry *d, int flags, unsigned int lsc);
-+void di_read_unlock(struct dentry *d, int flags);
-+void di_downgrade_lock(struct dentry *d, int flags);
-+void di_write_lock(struct dentry *d, unsigned int lsc);
-+void di_write_unlock(struct dentry *d);
-+void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir);
-+void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir);
-+void di_write_unlock2(struct dentry *d1, struct dentry *d2);
-+
-+struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex);
-+aufs_bindex_t au_dbtail(struct dentry *dentry);
-+aufs_bindex_t au_dbtaildir(struct dentry *dentry);
-+
-+void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
-+                   struct dentry *h_dentry);
-+void au_update_digen(struct dentry *dentry);
-+void au_update_dbrange(struct dentry *dentry, int do_put_zero);
-+void au_update_dbstart(struct dentry *dentry);
-+void au_update_dbend(struct dentry *dentry);
-+int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry);
-+
-+/* ---------------------------------------------------------------------- */
-+
-+static inline struct au_dinfo *au_di(struct dentry *dentry)
-+{
-+        return dentry->d_fsdata;
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+/* lock subclass for dinfo */
-+enum {
-+        AuLsc_DI_CHILD,         /* child first */
-+        AuLsc_DI_CHILD2,        /* rename(2), link(2), and cpup at hinotify */
-+        AuLsc_DI_CHILD3,        /* copyup dirs */
-+        AuLsc_DI_PARENT,
-+        AuLsc_DI_PARENT2,
-+        AuLsc_DI_PARENT3
-+};
-+
-+/*
-+ * di_read_lock_child, di_write_lock_child,
-+ * di_read_lock_child2, di_write_lock_child2,
-+ * di_read_lock_child3, di_write_lock_child3,
-+ * di_read_lock_parent, di_write_lock_parent,
-+ * di_read_lock_parent2, di_write_lock_parent2,
-+ * di_read_lock_parent3, di_write_lock_parent3,
-+ */
-+#define AuReadLockFunc(name, lsc) \
-+static inline void di_read_lock_##name(struct dentry *d, int flags) \
-+{ di_read_lock(d, flags, AuLsc_DI_##lsc); }
-+
-+#define AuWriteLockFunc(name, lsc) \
-+static inline void di_write_lock_##name(struct dentry *d) \
-+{ di_write_lock(d, AuLsc_DI_##lsc); }
-+
-+#define AuRWLockFuncs(name, lsc) \
-+        AuReadLockFunc(name, lsc) \
-+        AuWriteLockFunc(name, lsc)
-+
-+AuRWLockFuncs(child, CHILD);
-+AuRWLockFuncs(child2, CHILD2);
-+AuRWLockFuncs(child3, CHILD3);
-+AuRWLockFuncs(parent, PARENT);
-+AuRWLockFuncs(parent2, PARENT2);
-+AuRWLockFuncs(parent3, PARENT3);
-+
-+#undef AuReadLockFunc
-+#undef AuWriteLockFunc
-+#undef AuRWLockFuncs
-+
-+#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem)
-+#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem)
-+#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem)
-+
-+/* ---------------------------------------------------------------------- */
-+
-+/* todo: memory barrier? */
-+static inline unsigned int au_digen(struct dentry *d)
-+{
-+        return atomic_read(&au_di(d)->di_generation);
-+}
-+
-+static inline void au_h_dentry_init(struct au_hdentry *hdentry)
-+{
-+        hdentry->hd_dentry = NULL;
-+}
-+
-+static inline void au_hdput(struct au_hdentry *hd)
-+{
-+        dput(hd->hd_dentry);
-+}
-+
-+static inline aufs_bindex_t au_dbstart(struct dentry *dentry)
-+{
-+        DiMustAnyLock(dentry);
-+        return au_di(dentry)->di_bstart;
-+}
-+
-+static inline aufs_bindex_t au_dbend(struct dentry *dentry)
-+{
-+        DiMustAnyLock(dentry);
-+        return au_di(dentry)->di_bend;
-+}
-+
-+static inline aufs_bindex_t au_dbwh(struct dentry *dentry)
-+{
-+        DiMustAnyLock(dentry);
-+        return au_di(dentry)->di_bwh;
-+}
-+
-+static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry)
-+{
-+        DiMustAnyLock(dentry);
-+        return au_di(dentry)->di_bdiropq;
-+}
-+
-+/* todo: hard/soft set? */
-+static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex)
-+{
-+        DiMustWriteLock(dentry);
-+        au_di(dentry)->di_bstart = bindex;
-+}
-+
-+static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex)
-+{
-+        DiMustWriteLock(dentry);
-+        au_di(dentry)->di_bend = bindex;
-+}
-+
-+static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex)
-+{
-+        DiMustWriteLock(dentry);
-+        /* dbwh can be outside of bstart - bend range */
-+        au_di(dentry)->di_bwh = bindex;
-+}
-+
-+static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex)
-+{
-+        DiMustWriteLock(dentry);
-+        au_di(dentry)->di_bdiropq = bindex;
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+#ifdef CONFIG_AUFS_HINOTIFY
-+static inline void au_digen_dec(struct dentry *d)
-+{
-+        atomic_dec_return(&au_di(d)->di_generation);
-+}
-+
-+static inline void au_hin_di_reinit(struct dentry *dentry)
-+{
-+        dentry->d_fsdata = NULL;
-+}
-+#else
-+static inline void au_hin_di_reinit(struct dentry *dentry __maybe_unused)
-+{
-+        /* empty */
-+}
-+#endif /* CONFIG_AUFS_HINOTIFY */
-+
-+#endif /* __KERNEL__ */
-+#endif /* __AUFS_DENTRY_H__ */
-diff -Nur linux-2.6.31-vanilla/fs/aufs/dinfo.c linux-2.6.31/fs/aufs/dinfo.c
---- linux-2.6.31-vanilla/fs/aufs/dinfo.c        1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.31/fs/aufs/dinfo.c        2009-09-16 13:55:30.000000000 +0200
-@@ -0,0 +1,367 @@
-+/*
-+ * Copyright (C) 2005-2009 Junjiro R. Okajima
-+ *
-+ * This program, aufs is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/*
-+ * dentry private data
-+ */
-+
-+#include "aufs.h"
-+
-+int au_alloc_dinfo(struct dentry *dentry)
-+{
-+        struct au_dinfo *dinfo;
-+        struct super_block *sb;
-+        int nbr;
-+
-+        dinfo = au_cache_alloc_dinfo();
-+        if (unlikely(!dinfo))
-+                goto out;
-+
-+        sb = dentry->d_sb;
-+        nbr = au_sbend(sb) + 1;
-+        if (nbr <= 0)
-+                nbr = 1;
-+        dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS);
-+        if (unlikely(!dinfo->di_hdentry))
-+                goto out_dinfo;
-+
-+        atomic_set(&dinfo->di_generation, au_sigen(sb));
-+        /* smp_mb(); */ /* atomic_set */
-+        au_rw_init_wlock_nested(&dinfo->di_rwsem, AuLsc_DI_CHILD);
-+        dinfo->di_bstart = -1;
-+        dinfo->di_bend = -1;
-+        dinfo->di_bwh = -1;
-+        dinfo->di_bdiropq = -1;
-+
-+        dentry->d_fsdata = dinfo;
-+        dentry->d_op = &aufs_dop;
-+        return 0; /* success */
-+
-+ out_dinfo:
-+        au_cache_free_dinfo(dinfo);
-+ out:
-+        return -ENOMEM;
-+}
-+
-+int au_di_realloc(struct au_dinfo *dinfo, int nbr)
-+{
-+        int err, sz;
-+        struct au_hdentry *hdp;
-+
-+        AuRwMustWriteLock(&dinfo->di_rwsem);
-+
-+        err = -ENOMEM;
-+        sz = sizeof(*hdp) * (dinfo->di_bend + 1);
-+        if (!sz)
-+                sz = sizeof(*hdp);
-+        hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS);
-+        if (hdp) {
-+                dinfo->di_hdentry = hdp;
-+                err = 0;
-+        }
-+
-+        return err;
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+static void do_ii_write_lock(struct inode *inode, unsigned int lsc)
-+{
-+        switch (lsc) {
-+        case AuLsc_DI_CHILD:
-+                ii_write_lock_child(inode);
-+                break;
-+        case AuLsc_DI_CHILD2:
-+                ii_write_lock_child2(inode);
-+                break;
-+        case AuLsc_DI_CHILD3:
-+                ii_write_lock_child3(inode);
-+                break;
-+        case AuLsc_DI_PARENT:
-+                ii_write_lock_parent(inode);
-+                break;
-+        case AuLsc_DI_PARENT2:
-+                ii_write_lock_parent2(inode);
-+                break;
-+        case AuLsc_DI_PARENT3:
-+                ii_write_lock_parent3(inode);
-+                break;
-+        default:
-+                BUG();
-+        }
-+}
-+
-+static void do_ii_read_lock(struct inode *inode, unsigned int lsc)
-+{
-+        switch (lsc) {
-+        case AuLsc_DI_CHILD:
-+                ii_read_lock_child(inode);
-+                break;
-+        case AuLsc_DI_CHILD2:
-+                ii_read_lock_child2(inode);
-+                break;
-+        case AuLsc_DI_CHILD3:
-+                ii_read_lock_child3(inode);
-+                break;
-+        case AuLsc_DI_PARENT:
-+                ii_read_lock_parent(inode);
-+                break;
-+        case AuLsc_DI_PARENT2:
-+                ii_read_lock_parent2(inode);
-+                break;
-+        case AuLsc_DI_PARENT3:
-+                ii_read_lock_parent3(inode);
-+                break;
-+        default:
-+                BUG();
-+        }
-+}
-+
-+void di_read_lock(struct dentry *d, int flags, unsigned int lsc)
-+{
-+        au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc);
-+        if (d->d_inode) {
-+                if (au_ftest_lock(flags, IW))
-+                        do_ii_write_lock(d->d_inode, lsc);
-+                else if (au_ftest_lock(flags, IR))
-+                        do_ii_read_lock(d->d_inode, lsc);
-+        }
-+}
-+
-+void di_read_unlock(struct dentry *d, int flags)
-+{
-+        if (d->d_inode) {
-+                if (au_ftest_lock(flags, IW))
-+                        ii_write_unlock(d->d_inode);
-+                else if (au_ftest_lock(flags, IR))
-+                        ii_read_unlock(d->d_inode);
-+        }
-+        au_rw_read_unlock(&au_di(d)->di_rwsem);
-+}
-+
-+void di_downgrade_lock(struct dentry *d, int flags)
-+{
-+        if (d->d_inode && au_ftest_lock(flags, IR))
-+                ii_downgrade_lock(d->d_inode);
-+        au_rw_dgrade_lock(&au_di(d)->di_rwsem);
-+}
-+
-+void di_write_lock(struct dentry *d, unsigned int lsc)
-+{
-+        au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc);
-+        if (d->d_inode)
-+                do_ii_write_lock(d->d_inode, lsc);
-+}
-+
-+void di_write_unlock(struct dentry *d)
-+{
-+        if (d->d_inode)
-+                ii_write_unlock(d->d_inode);
-+        au_rw_write_unlock(&au_di(d)->di_rwsem);
-+}
-+
-+void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir)
-+{
-+        AuDebugOn(d1 == d2
-+                  || d1->d_inode == d2->d_inode
-+                  || d1->d_sb != d2->d_sb);
-+
-+        if (isdir && au_test_subdir(d1, d2)) {
-+                di_write_lock_child(d1);
-+                di_write_lock_child2(d2);
-+        } else {
-+                /* there should be no races */
-+                di_write_lock_child(d2);
-+                di_write_lock_child2(d1);
-+        }
-+}
-+
-+void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir)
-+{
-+        AuDebugOn(d1 == d2
-+                  || d1->d_inode == d2->d_inode
-+                  || d1->d_sb != d2->d_sb);
-+
-+        if (isdir && au_test_subdir(d1, d2)) {
-+                di_write_lock_parent(d1);
-+                di_write_lock_parent2(d2);
-+        } else {
-+                /* there should be no races */
-+                di_write_lock_parent(d2);
-+                di_write_lock_parent2(d1);
-+        }
-+}
-+
-+void di_write_unlock2(struct dentry *d1, struct dentry *d2)
-+{
-+        di_write_unlock(d1);
-+        if (d1->d_inode == d2->d_inode)
-+                au_rw_write_unlock(&au_di(d2)->di_rwsem);
-+        else
-+                di_write_unlock(d2);
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex)
-+{
-+        struct dentry *d;
-+
-+        DiMustAnyLock(dentry);
-+
-+        if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry))
-+                return NULL;
-+        AuDebugOn(bindex < 0);
-+        d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry;
-+        AuDebugOn(d && (atomic_read(&d->d_count) <= 0));
-+        return d;
-+}
-+
-+aufs_bindex_t au_dbtail(struct dentry *dentry)
-+{
-+        aufs_bindex_t bend, bwh;
-+
-+        bend = au_dbend(dentry);
-+        if (0 <= bend) {
-+                bwh = au_dbwh(dentry);
-+                if (!bwh)
-+                        return bwh;
-+                if (0 < bwh && bwh < bend)
-+                        return bwh - 1;
-+        }
-+        return bend;
-+}
-+
-+aufs_bindex_t au_dbtaildir(struct dentry *dentry)
-+{
-+        aufs_bindex_t bend, bopq;
-+
-+        bend = au_dbtail(dentry);
-+        if (0 <= bend) {
-+                bopq = au_dbdiropq(dentry);
-+                if (0 <= bopq && bopq < bend)
-+                        bend = bopq;
-+        }
-+        return bend;
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
-+                   struct dentry *h_dentry)
-+{
-+        struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex;
-+
-+        DiMustWriteLock(dentry);
-+
-+        if (hd->hd_dentry)
-+                au_hdput(hd);
-+        hd->hd_dentry = h_dentry;
-+}
-+
-+void au_update_digen(struct dentry *dentry)
-+{
-+        atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb));
-+        /* smp_mb(); */ /* atomic_set */
-+}
-+
-+void au_update_dbrange(struct dentry *dentry, int do_put_zero)
-+{
-+        struct au_dinfo *dinfo;
-+        struct dentry *h_d;
-+
-+        DiMustWriteLock(dentry);
-+
-+        dinfo = au_di(dentry);
-+        if (!dinfo || dinfo->di_bstart < 0)
-+                return;
-+
-+        if (do_put_zero) {
-+                aufs_bindex_t bindex, bend;
-+
-+                bend = dinfo->di_bend;
-+                for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) {
-+                        h_d = dinfo->di_hdentry[0 + bindex].hd_dentry;
-+                        if (h_d && !h_d->d_inode)
-+                                au_set_h_dptr(dentry, bindex, NULL);
-+                }
-+        }
-+
-+        dinfo->di_bstart = -1;
-+        while (++dinfo->di_bstart <= dinfo->di_bend)
-+                if (dinfo->di_hdentry[0 + dinfo->di_bstart].hd_dentry)
-+                        break;
-+        if (dinfo->di_bstart > dinfo->di_bend) {
-+                dinfo->di_bstart = -1;
-+                dinfo->di_bend = -1;
-+                return;
-+        }
-+
-+        dinfo->di_bend++;
-+        while (0 <= --dinfo->di_bend)
-+                if (dinfo->di_hdentry[0 + dinfo->di_bend].hd_dentry)
-+                        break;
-+        AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0);
-+}
-+
-+void au_update_dbstart(struct dentry *dentry)
-+{
-+        aufs_bindex_t bindex, bend;
-+        struct dentry *h_dentry;
-+
-+        bend = au_dbend(dentry);
-+        for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
-+                h_dentry = au_h_dptr(dentry, bindex);
-+                if (!h_dentry)
-+                        continue;
-+                if (h_dentry->d_inode) {
-+                        au_set_dbstart(dentry, bindex);
-+                        return;
-+                }
-+                au_set_h_dptr(dentry, bindex, NULL);
-+        }
-+}
-+
-+void au_update_dbend(struct dentry *dentry)
-+{
-+        aufs_bindex_t bindex, bstart;
-+        struct dentry *h_dentry;
-+
-+        bstart = au_dbstart(dentry);
-+        for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) {
-+                h_dentry = au_h_dptr(dentry, bindex);
-+                if (!h_dentry)
-+                        continue;
-+                if (h_dentry->d_inode) {
-+                        au_set_dbend(dentry, bindex);
-+                        return;
-+                }
-+                au_set_h_dptr(dentry, bindex, NULL);
-+        }
-+}
-+
-+int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry)
-+{
-+        aufs_bindex_t bindex, bend;
-+
-+        bend = au_dbend(dentry);
-+        for (bindex = au_dbstart(dentry); bindex <= bend; bindex++)
-+                if (au_h_dptr(dentry, bindex) == h_dentry)
-+                        return bindex;
-+        return -1;
-+}
-diff -Nur linux-2.6.31-vanilla/fs/aufs/dir.c linux-2.6.31/fs/aufs/dir.c
---- linux-2.6.31-vanilla/fs/aufs/dir.c        1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.31/fs/aufs/dir.c        2009-09-16 13:55:30.000000000 +0200
-@@ -0,0 +1,593 @@
-+/*
-+ * Copyright (C) 2005-2009 Junjiro R. Okajima
-+ *
-+ * This program, aufs is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ * GNU General Public License for more details.
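
Aside: the AuRWLockFuncs() macros in dentry.h above stamp out one read-lock
and one write-lock wrapper per lock-ordering subclass, so lockdep can tell a
child's dinfo lock apart from its parent's. Expanded by hand (illustration
only, not part of the patch), AuRWLockFuncs(child, CHILD) produces exactly:

static inline void di_read_lock_child(struct dentry *d, int flags)
{ di_read_lock(d, flags, AuLsc_DI_CHILD); }

static inline void di_write_lock_child(struct dentry *d)
{ di_write_lock(d, AuLsc_DI_CHILD); }
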
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/*
-+ * directory operations
-+ */
-+
-+#include <linux/file.h>
-+#include <linux/fs_stack.h>
-+#include "aufs.h"
-+
-+void au_add_nlink(struct inode *dir, struct inode *h_dir)
-+{
-+        AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
-+
-+        dir->i_nlink += h_dir->i_nlink - 2;
-+        if (h_dir->i_nlink < 2)
-+                dir->i_nlink += 2;
-+}
-+
-+void au_sub_nlink(struct inode *dir, struct inode *h_dir)
-+{
-+        AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
-+
-+        dir->i_nlink -= h_dir->i_nlink - 2;
-+        if (h_dir->i_nlink < 2)
-+                dir->i_nlink -= 2;
-+}
-+
-+loff_t au_dir_size(struct file *file, struct dentry *dentry)
-+{
-+        loff_t sz;
-+        aufs_bindex_t bindex, bend;
-+        struct file *h_file;
-+        struct dentry *h_dentry;
-+
-+        sz = 0;
-+        if (file) {
-+                AuDebugOn(!file->f_dentry);
-+                AuDebugOn(!file->f_dentry->d_inode);
-+                AuDebugOn(!S_ISDIR(file->f_dentry->d_inode->i_mode));
-+
-+                bend = au_fbend(file);
-+                for (bindex = au_fbstart(file);
-+                     bindex <= bend && sz < KMALLOC_MAX_SIZE;
-+                     bindex++) {
-+                        h_file = au_h_fptr(file, bindex);
-+                        if (h_file
-+                            && h_file->f_dentry
-+                            && h_file->f_dentry->d_inode)
-+                                sz += i_size_read(h_file->f_dentry->d_inode);
-+                }
-+        } else {
-+                AuDebugOn(!dentry);
-+                AuDebugOn(!dentry->d_inode);
-+                AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode));
-+
-+                bend = au_dbtaildir(dentry);
-+                for (bindex = au_dbstart(dentry);
-+                     bindex <= bend && sz < KMALLOC_MAX_SIZE;
-+                     bindex++) {
-+                        h_dentry = au_h_dptr(dentry, bindex);
-+                        if (h_dentry && h_dentry->d_inode)
-+                                sz += i_size_read(h_dentry->d_inode);
-+                }
-+        }
-+        if (sz < KMALLOC_MAX_SIZE)
-+                sz = roundup_pow_of_two(sz);
-+        if (sz > KMALLOC_MAX_SIZE)
-+                sz = KMALLOC_MAX_SIZE;
-+        else if (sz < NAME_MAX) {
-+                BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX);
-+                sz = AUFS_RDBLK_DEF;
-+        }
-+        return sz;
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+static int reopen_dir(struct file *file)
-+{
-+        int err;
-+        unsigned int flags;
-+        aufs_bindex_t bindex, btail, bstart;
-+        struct dentry *dentry, *h_dentry;
-+        struct file *h_file;
-+
-+        /* open all lower dirs */
-+        dentry = file->f_dentry;
-+        bstart = au_dbstart(dentry);
-+        for (bindex = au_fbstart(file); bindex < bstart; bindex++)
-+                au_set_h_fptr(file, bindex, NULL);
-+        au_set_fbstart(file, bstart);
-+
-+        btail = au_dbtaildir(dentry);
-+        for (bindex = au_fbend(file); btail < bindex; bindex--)
-+                au_set_h_fptr(file, bindex, NULL);
-+        au_set_fbend(file, btail);
-+
-+        spin_lock(&file->f_lock);
-+        flags = file->f_flags;
-+        spin_unlock(&file->f_lock);
-+        for (bindex = bstart; bindex <= btail; bindex++) {
-+                h_dentry = au_h_dptr(dentry, bindex);
-+                if (!h_dentry)
-+                        continue;
-+                h_file = au_h_fptr(file, bindex);
-+                if (h_file)
-+                        continue;
-+
-+                h_file = au_h_open(dentry, bindex, flags, file);
-+                err = PTR_ERR(h_file);
-+                if (IS_ERR(h_file))
-+                        goto out; /* close all? */
-+                au_set_h_fptr(file, bindex, h_file);
-+        }
-+        au_update_figen(file);
-+        /* todo: necessary? */
-+        /* file->f_ra = h_file->f_ra; */
-+        err = 0;
-+
-+ out:
-+        return err;
-+}
-+
-+static int do_open_dir(struct file *file, int flags)
-+{
-+        int err;
-+        aufs_bindex_t bindex, btail;
-+        struct dentry *dentry, *h_dentry;
-+        struct file *h_file;
-+
-+        FiMustWriteLock(file);
-+
-+        err = 0;
-+        dentry = file->f_dentry;
-+        au_set_fvdir_cache(file, NULL);
-+        au_fi(file)->fi_maintain_plink = 0;
-+        file->f_version = dentry->d_inode->i_version;
-+        bindex = au_dbstart(dentry);
-+        au_set_fbstart(file, bindex);
-+        btail = au_dbtaildir(dentry);
-+        au_set_fbend(file, btail);
-+        for (; !err && bindex <= btail; bindex++) {
-+                h_dentry = au_h_dptr(dentry, bindex);
-+                if (!h_dentry)
-+                        continue;
-+
-+                h_file = au_h_open(dentry, bindex, flags, file);
-+                if (IS_ERR(h_file)) {
-+                        err = PTR_ERR(h_file);
-+                        break;
-+                }
-+                au_set_h_fptr(file, bindex, h_file);
-+        }
-+        au_update_figen(file);
-+        /* todo: necessary? */
-+        /* file->f_ra = h_file->f_ra; */
-+        if (!err)
-+                return 0; /* success */
-+
-+        /* close all */
-+        for (bindex = au_fbstart(file); bindex <= btail; bindex++)
-+                au_set_h_fptr(file, bindex, NULL);
-+        au_set_fbstart(file, -1);
-+        au_set_fbend(file, -1);
-+        return err;
-+}
-+
-+static int aufs_open_dir(struct inode *inode __maybe_unused,
-+                         struct file *file)
-+{
-+        return au_do_open(file, do_open_dir);
-+}
-+
-+static int aufs_release_dir(struct inode *inode __maybe_unused,
-+                            struct file *file)
-+{
-+        struct au_vdir *vdir_cache;
-+        struct super_block *sb;
-+        struct au_sbinfo *sbinfo;
-+
-+        sb = file->f_dentry->d_sb;
-+        si_noflush_read_lock(sb);
-+        fi_write_lock(file);
-+        vdir_cache = au_fvdir_cache(file);
-+        if (vdir_cache)
-+                au_vdir_free(vdir_cache);
-+        if (au_fi(file)->fi_maintain_plink) {
-+                sbinfo = au_sbi(sb);
-+                /* clear the flag without write-lock */
-+                sbinfo->au_si_status &= ~AuSi_MAINTAIN_PLINK;
-+                smp_mb();
-+                wake_up_all(&sbinfo->si_plink_wq);
-+        }
-+        fi_write_unlock(file);
-+        au_finfo_fin(file);
-+        si_read_unlock(sb);
-+        return 0;
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync)
-+{
-+        int err;
-+        aufs_bindex_t bend, bindex;
-+        struct inode *inode;
-+        struct super_block *sb;
-+
-+        err = 0;
-+        sb = dentry->d_sb;
-+        inode = dentry->d_inode;
-+        IMustLock(inode);
-+        bend = au_dbend(dentry);
-+        for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) {
-+                struct path h_path;
-+                struct inode *h_inode;
-+
-+                if (au_test_ro(sb, bindex, inode))
-+                        continue;
-+                h_path.dentry = au_h_dptr(dentry, bindex);
-+                if (!h_path.dentry)
-+                        continue;
-+                h_inode = h_path.dentry->d_inode;
-+                if (!h_inode)
-+                        continue;
-+
-+                /* no mnt_want_write() */
-+                /* cf. fs/nfsd/vfs.c and fs/nfsd/nfs4recover.c */
-+                /* todo: inotify fired?
*/ -+ h_path.mnt = au_sbr_mnt(sb, bindex); -+ mutex_lock(&h_inode->i_mutex); -+ err = filemap_fdatawrite(h_inode->i_mapping); -+ AuDebugOn(!h_inode->i_fop); -+ if (!err && h_inode->i_fop->fsync) -+ err = h_inode->i_fop->fsync(NULL, h_path.dentry, -+ datasync); -+ if (!err) -+ err = filemap_fdatawrite(h_inode->i_mapping); -+ if (!err) -+ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ -+ mutex_unlock(&h_inode->i_mutex); -+ } -+ -+ return err; -+} -+ -+static int au_do_fsync_dir(struct file *file, int datasync) -+{ -+ int err; -+ aufs_bindex_t bend, bindex; -+ struct file *h_file; -+ struct super_block *sb; -+ struct inode *inode; -+ struct mutex *h_mtx; -+ -+ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ -+ sb = file->f_dentry->d_sb; -+ inode = file->f_dentry->d_inode; -+ bend = au_fbend(file); -+ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { -+ h_file = au_h_fptr(file, bindex); -+ if (!h_file || au_test_ro(sb, bindex, inode)) -+ continue; -+ -+ err = vfs_fsync(h_file, h_file->f_dentry, datasync); -+ if (!err) { -+ h_mtx = &h_file->f_dentry->d_inode->i_mutex; -+ mutex_lock(h_mtx); -+ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); -+ /*ignore*/ -+ mutex_unlock(h_mtx); -+ } -+ } -+ -+ out: -+ return err; -+} -+ -+/* -+ * @file may be NULL -+ */ -+static int aufs_fsync_dir(struct file *file, struct dentry *dentry, -+ int datasync) -+{ -+ int err; -+ struct super_block *sb; -+ -+ IMustLock(dentry->d_inode); -+ -+ err = 0; -+ sb = dentry->d_sb; -+ si_noflush_read_lock(sb); -+ if (file) -+ err = au_do_fsync_dir(file, datasync); -+ else { -+ di_write_lock_child(dentry); -+ err = au_do_fsync_dir_no_file(dentry, datasync); -+ } -+ au_cpup_attr_timesizes(dentry->d_inode); -+ di_write_unlock(dentry); -+ if (file) -+ fi_write_unlock(file); -+ -+ si_read_unlock(sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int aufs_readdir(struct file *file, void *dirent, filldir_t filldir) -+{ -+ int err; -+ struct dentry *dentry; -+ struct inode *inode; -+ struct super_block *sb; -+ -+ dentry = file->f_dentry; -+ inode = dentry->d_inode; -+ IMustLock(inode); -+ -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ err = au_vdir_init(file); -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ if (!au_test_nfsd(current)) { -+ err = au_vdir_fill_de(file, dirent, filldir); -+ fsstack_copy_attr_atime(inode, -+ au_h_iptr(inode, au_ibstart(inode))); -+ } else { -+ /* -+ * nfsd filldir may call lookup_one_len(), vfs_getattr(), -+ * encode_fh() and others. 
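
Aside: the test_empty_cb() callback in the next hunk treats any entry whose
name begins with the whiteout prefix specially. The check it performs,
reduced to a standalone sketch (AUFS_WH_PFX is ".wh." per aufs_type.h;
is_whiteout_name() is a hypothetical name, not part of the patch):

#include <string.h>

#define AUFS_WH_PFX     ".wh."
#define AUFS_WH_PFX_LEN ((int)sizeof(AUFS_WH_PFX) - 1)

/* does this lower-branch name hide an upper-layer name of the same suffix? */
static int is_whiteout_name(const char *name, int namelen)
{
        return namelen > AUFS_WH_PFX_LEN
                && !memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
}
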
-+ */ -+ struct inode *h_inode = au_h_iptr(inode, au_ibstart(inode)); -+ -+ di_read_unlock(dentry, AuLock_IR); -+ si_read_unlock(sb); -+ lockdep_off(); -+ err = au_vdir_fill_de(file, dirent, filldir); -+ lockdep_on(); -+ fsstack_copy_attr_atime(inode, h_inode); -+ fi_write_unlock(file); -+ -+ AuTraceErr(err); -+ return err; -+ } -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_write_unlock(file); -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+#define AuTestEmpty_WHONLY 1 -+#define AuTestEmpty_CALLED (1 << 1) -+#define AuTestEmpty_SHWH (1 << 2) -+#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) -+#define au_fset_testempty(flags, name) { (flags) |= AuTestEmpty_##name; } -+#define au_fclr_testempty(flags, name) { (flags) &= ~AuTestEmpty_##name; } -+ -+#ifndef CONFIG_AUFS_SHWH -+#undef AuTestEmpty_SHWH -+#define AuTestEmpty_SHWH 0 -+#endif -+ -+struct test_empty_arg { -+ struct au_nhash *whlist; -+ unsigned int flags; -+ int err; -+ aufs_bindex_t bindex; -+}; -+ -+static int test_empty_cb(void *__arg, const char *__name, int namelen, -+ loff_t offset __maybe_unused, u64 ino, -+ unsigned int d_type) -+{ -+ struct test_empty_arg *arg = __arg; -+ char *name = (void *)__name; -+ -+ arg->err = 0; -+ au_fset_testempty(arg->flags, CALLED); -+ /* smp_mb(); */ -+ if (name[0] == '.' -+ && (namelen == 1 || (name[1] == '.' && namelen == 2))) -+ goto out; /* success */ -+ -+ if (namelen <= AUFS_WH_PFX_LEN -+ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { -+ if (au_ftest_testempty(arg->flags, WHONLY) -+ && !au_nhash_test_known_wh(arg->whlist, name, namelen)) -+ arg->err = -ENOTEMPTY; -+ goto out; -+ } -+ -+ name += AUFS_WH_PFX_LEN; -+ namelen -= AUFS_WH_PFX_LEN; -+ if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) -+ arg->err = au_nhash_append_wh -+ (arg->whlist, name, namelen, ino, d_type, arg->bindex, -+ au_ftest_testempty(arg->flags, SHWH)); -+ -+ out: -+ /* smp_mb(); */ -+ AuTraceErr(arg->err); -+ return arg->err; -+} -+ -+static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) -+{ -+ int err; -+ struct file *h_file; -+ -+ h_file = au_h_open(dentry, arg->bindex, -+ O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, -+ /*file*/NULL); -+ err = PTR_ERR(h_file); -+ if (IS_ERR(h_file)) -+ goto out; -+ -+ err = 0; -+ if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) -+ && !h_file->f_dentry->d_inode->i_nlink) -+ goto out_put; -+ -+ do { -+ arg->err = 0; -+ au_fclr_testempty(arg->flags, CALLED); -+ /* smp_mb(); */ -+ err = vfsub_readdir(h_file, test_empty_cb, arg); -+ if (err >= 0) -+ err = arg->err; -+ } while (!err && au_ftest_testempty(arg->flags, CALLED)); -+ -+ out_put: -+ fput(h_file); -+ au_sbr_put(dentry->d_sb, arg->bindex); -+ out: -+ return err; -+} -+ -+struct do_test_empty_args { -+ int *errp; -+ struct dentry *dentry; -+ struct test_empty_arg *arg; -+}; -+ -+static void call_do_test_empty(void *args) -+{ -+ struct do_test_empty_args *a = args; -+ *a->errp = do_test_empty(a->dentry, a->arg); -+} -+ -+static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) -+{ -+ int err, wkq_err; -+ struct dentry *h_dentry; -+ struct inode *h_inode; -+ -+ h_dentry = au_h_dptr(dentry, arg->bindex); -+ h_inode = h_dentry->d_inode; -+ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); -+ err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); -+ mutex_unlock(&h_inode->i_mutex); -+ if (!err) -+ err = do_test_empty(dentry, arg); -+ else { -+ 
struct do_test_empty_args args = { -+ .errp = &err, -+ .dentry = dentry, -+ .arg = arg -+ }; -+ unsigned int flags = arg->flags; -+ -+ wkq_err = au_wkq_wait(call_do_test_empty, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ arg->flags = flags; -+ } -+ -+ return err; -+} -+ -+int au_test_empty_lower(struct dentry *dentry) -+{ -+ int err; -+ unsigned int rdhash; -+ aufs_bindex_t bindex, bstart, btail; -+ struct au_nhash whlist; -+ struct test_empty_arg arg; -+ -+ SiMustAnyLock(dentry->d_sb); -+ -+ rdhash = au_sbi(dentry->d_sb)->si_rdhash; -+ if (!rdhash) -+ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); -+ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); -+ if (unlikely(err)) -+ goto out; -+ -+ arg.flags = 0; -+ arg.whlist = &whlist; -+ bstart = au_dbstart(dentry); -+ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) -+ au_fset_testempty(arg.flags, SHWH); -+ arg.bindex = bstart; -+ err = do_test_empty(dentry, &arg); -+ if (unlikely(err)) -+ goto out_whlist; -+ -+ au_fset_testempty(arg.flags, WHONLY); -+ btail = au_dbtaildir(dentry); -+ for (bindex = bstart + 1; !err && bindex <= btail; bindex++) { -+ struct dentry *h_dentry; -+ -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (h_dentry && h_dentry->d_inode) { -+ arg.bindex = bindex; -+ err = do_test_empty(dentry, &arg); -+ } -+ } -+ -+ out_whlist: -+ au_nhash_wh_free(&whlist); -+ out: -+ return err; -+} -+ -+int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) -+{ -+ int err; -+ struct test_empty_arg arg; -+ aufs_bindex_t bindex, btail; -+ -+ err = 0; -+ arg.whlist = whlist; -+ arg.flags = AuTestEmpty_WHONLY; -+ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) -+ au_fset_testempty(arg.flags, SHWH); -+ btail = au_dbtaildir(dentry); -+ for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) { -+ struct dentry *h_dentry; -+ -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (h_dentry && h_dentry->d_inode) { -+ arg.bindex = bindex; -+ err = sio_test_empty(dentry, &arg); -+ } -+ } -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+const struct file_operations aufs_dir_fop = { -+ .read = generic_read_dir, -+ .readdir = aufs_readdir, -+ .unlocked_ioctl = aufs_ioctl_dir, -+ .open = aufs_open_dir, -+ .release = aufs_release_dir, -+ .flush = aufs_flush, -+ .fsync = aufs_fsync_dir -+}; -diff -Nur linux-2.6.31-vanilla/fs/aufs/dir.h linux-2.6.31/fs/aufs/dir.h ---- linux-2.6.31-vanilla/fs/aufs/dir.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/dir.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,127 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
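
Aside: dir.h in the next hunk stores directory entries as packed,
variable-length records; struct au_vdir_destr carries its name inline via a
zero-length trailing array. A minimal sketch of allocating one such record
(illustration only; alloc_destr() is a hypothetical helper, not in the patch):

#include <linux/slab.h>
#include <linux/string.h>

struct au_vdir_destr {
        unsigned char len;
        unsigned char name[0];  /* name bytes follow the struct in memory */
} __packed;

static struct au_vdir_destr *alloc_destr(const char *name, int nlen)
{
        struct au_vdir_destr *str;

        /* one allocation holds header and name back to back */
        str = kmalloc(sizeof(*str) + nlen, GFP_NOFS);
        if (str) {
                str->len = nlen;
                memcpy(str->name, name, nlen);
        }
        return str;
}
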
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * directory operations -+ */ -+ -+#ifndef __AUFS_DIR_H__ -+#define __AUFS_DIR_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/fs.h> -+#include <linux/aufs_type.h> -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* need to be faster and smaller */ -+ -+struct au_nhash { -+ unsigned int nh_num; -+ struct hlist_head *nh_head; -+}; -+ -+struct au_vdir_destr { -+ unsigned char len; -+ unsigned char name[0]; -+} __packed; -+ -+struct au_vdir_dehstr { -+ struct hlist_node hash; -+ struct au_vdir_destr *str; -+}; -+ -+struct au_vdir_de { -+ ino_t de_ino; -+ unsigned char de_type; -+ /* caution: packed */ -+ struct au_vdir_destr de_str; -+} __packed; -+ -+struct au_vdir_wh { -+ struct hlist_node wh_hash; -+#ifdef CONFIG_AUFS_SHWH -+ ino_t wh_ino; -+ aufs_bindex_t wh_bindex; -+ unsigned char wh_type; -+#else -+ aufs_bindex_t wh_bindex; -+#endif -+ /* caution: packed */ -+ struct au_vdir_destr wh_str; -+} __packed; -+ -+union au_vdir_deblk_p { -+ unsigned char *deblk; -+ struct au_vdir_de *de; -+}; -+ -+struct au_vdir { -+ unsigned char **vd_deblk; -+ unsigned long vd_nblk; -+ struct { -+ unsigned long ul; -+ union au_vdir_deblk_p p; -+ } vd_last; -+ -+ unsigned long vd_version; -+ unsigned int vd_deblk_sz; -+ unsigned long vd_jiffy; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* dir.c */ -+extern const struct file_operations aufs_dir_fop; -+void au_add_nlink(struct inode *dir, struct inode *h_dir); -+void au_sub_nlink(struct inode *dir, struct inode *h_dir); -+loff_t au_dir_size(struct file *file, struct dentry *dentry); -+int au_test_empty_lower(struct dentry *dentry); -+int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); -+ -+/* vdir.c */ -+unsigned int au_rdhash_est(loff_t sz); -+int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); -+void au_nhash_wh_free(struct au_nhash *whlist); -+int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, -+ int limit); -+int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); -+int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, -+ unsigned int d_type, aufs_bindex_t bindex, -+ unsigned char shwh); -+void au_vdir_free(struct au_vdir *vdir); -+int au_vdir_init(struct file *file); -+int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir); -+ -+/* ioctl.c */ -+long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); -+ -+#ifdef CONFIG_AUFS_RDU -+/* rdu.c */ -+long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -+#else -+static inline long au_rdu_ioctl(struct file *file, unsigned int cmd, -+ unsigned long arg) -+{ -+ return -EINVAL; -+} -+#endif -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_DIR_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/export.c linux-2.6.31/fs/aufs/export.c ---- linux-2.6.31-vanilla/fs/aufs/export.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/export.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,746 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * export via nfs -+ */ -+ -+#include <linux/exportfs.h> -+#include <linux/file.h> -+#include <linux/mnt_namespace.h> -+#include <linux/namei.h> -+#include <linux/nsproxy.h> -+#include <linux/random.h> -+#include "aufs.h" -+ -+union conv { -+#ifdef CONFIG_AUFS_INO_T_64 -+ __u32 a[2]; -+#else -+ __u32 a[1]; -+#endif -+ ino_t ino; -+}; -+ -+static ino_t decode_ino(__u32 *a) -+{ -+ union conv u; -+ -+ BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); -+ u.a[0] = a[0]; -+#ifdef CONFIG_AUFS_INO_T_64 -+ u.a[1] = a[1]; -+#endif -+ return u.ino; -+} -+ -+static void encode_ino(__u32 *a, ino_t ino) -+{ -+ union conv u; -+ -+ u.ino = ino; -+ a[0] = u.a[0]; -+#ifdef CONFIG_AUFS_INO_T_64 -+ a[1] = u.a[1]; -+#endif -+} -+ -+/* NFS file handle */ -+enum { -+ Fh_br_id, -+ Fh_sigen, -+#ifdef CONFIG_AUFS_INO_T_64 -+ /* support 64bit inode number */ -+ Fh_ino1, -+ Fh_ino2, -+ Fh_dir_ino1, -+ Fh_dir_ino2, -+#else -+ Fh_ino1, -+ Fh_dir_ino1, -+#endif -+ Fh_igen, -+ Fh_h_type, -+ Fh_tail, -+ -+ Fh_ino = Fh_ino1, -+ Fh_dir_ino = Fh_dir_ino1 -+}; -+ -+static int au_test_anon(struct dentry *dentry) -+{ -+ return !!(dentry->d_flags & DCACHE_DISCONNECTED); -+} -+ -+/* ---------------------------------------------------------------------- */ -+/* inode generation external table */ -+ -+int au_xigen_inc(struct inode *inode) -+{ -+ int err; -+ loff_t pos; -+ ssize_t sz; -+ __u32 igen; -+ struct super_block *sb; -+ struct au_sbinfo *sbinfo; -+ -+ err = 0; -+ sb = inode->i_sb; -+ sbinfo = au_sbi(sb); -+ /* -+ * temporary workaround for escaping from SiMustAnyLock() in -+ * au_mntflags(), since this function is called from au_iinfo_fin(). 
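
Aside: encode_ino()/decode_ino() above move an ino_t into the __u32 words of
the NFS file handle by type-punning through union conv; with
CONFIG_AUFS_INO_T_64 a 64-bit inode number simply occupies two handle words.
The round trip, spelled out (illustration only, 64-bit-ino layout assumed):

static ino_t roundtrip(ino_t ino, __u32 *fh)
{
        union conv {
                __u32 a[2];
                ino_t ino;
        } u;

        u.ino = ino;            /* what encode_ino() stores ... */
        fh[0] = u.a[0];
        fh[1] = u.a[1];

        u.a[0] = fh[0];         /* ... and what decode_ino() reads back */
        u.a[1] = fh[1];
        return u.ino;           /* equals the original inode number */
}
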
-+ */ -+ if (unlikely(!au_opt_test(sbinfo->si_mntflags, XINO))) -+ goto out; -+ -+ pos = inode->i_ino; -+ pos *= sizeof(igen); -+ igen = inode->i_generation + 1; -+ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, -+ sizeof(igen), &pos); -+ if (sz == sizeof(igen)) -+ goto out; /* success */ -+ -+ err = sz; -+ if (unlikely(sz >= 0)) { -+ err = -EIO; -+ AuIOErr("xigen error (%zd)\n", sz); -+ } -+ -+ out: -+ return err; -+} -+ -+int au_xigen_new(struct inode *inode) -+{ -+ int err; -+ loff_t pos; -+ ssize_t sz; -+ struct super_block *sb; -+ struct au_sbinfo *sbinfo; -+ struct file *file; -+ -+ err = 0; -+ /* todo: dirty, at mount time */ -+ if (inode->i_ino == AUFS_ROOT_INO) -+ goto out; -+ sb = inode->i_sb; -+ SiMustAnyLock(sb); -+ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) -+ goto out; -+ -+ err = -EFBIG; -+ pos = inode->i_ino; -+ if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { -+ AuIOErr1("too large i%lld\n", pos); -+ goto out; -+ } -+ pos *= sizeof(inode->i_generation); -+ -+ err = 0; -+ sbinfo = au_sbi(sb); -+ file = sbinfo->si_xigen; -+ BUG_ON(!file); -+ -+ if (i_size_read(file->f_dentry->d_inode) -+ < pos + sizeof(inode->i_generation)) { -+ inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); -+ sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, -+ sizeof(inode->i_generation), &pos); -+ } else -+ sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, -+ sizeof(inode->i_generation), &pos); -+ if (sz == sizeof(inode->i_generation)) -+ goto out; /* success */ -+ -+ err = sz; -+ if (unlikely(sz >= 0)) { -+ err = -EIO; -+ AuIOErr("xigen error (%zd)\n", sz); -+ } -+ -+ out: -+ return err; -+} -+ -+int au_xigen_set(struct super_block *sb, struct file *base) -+{ -+ int err; -+ struct au_sbinfo *sbinfo; -+ struct file *file; -+ -+ SiMustWriteLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ file = au_xino_create2(base, sbinfo->si_xigen); -+ err = PTR_ERR(file); -+ if (IS_ERR(file)) -+ goto out; -+ err = 0; -+ if (sbinfo->si_xigen) -+ fput(sbinfo->si_xigen); -+ sbinfo->si_xigen = file; -+ -+ out: -+ return err; -+} -+ -+void au_xigen_clr(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ -+ SiMustWriteLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ if (sbinfo->si_xigen) { -+ fput(sbinfo->si_xigen); -+ sbinfo->si_xigen = NULL; -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, -+ ino_t dir_ino) -+{ -+ struct dentry *dentry, *d; -+ struct inode *inode; -+ unsigned int sigen; -+ -+ dentry = NULL; -+ inode = ilookup(sb, ino); -+ if (!inode) -+ goto out; -+ -+ dentry = ERR_PTR(-ESTALE); -+ sigen = au_sigen(sb); -+ if (unlikely(is_bad_inode(inode) -+ || IS_DEADDIR(inode) -+ || sigen != au_iigen(inode))) -+ goto out_iput; -+ -+ dentry = NULL; -+ if (!dir_ino || S_ISDIR(inode->i_mode)) -+ dentry = d_find_alias(inode); -+ else { -+ spin_lock(&dcache_lock); -+ list_for_each_entry(d, &inode->i_dentry, d_alias) -+ if (!au_test_anon(d) -+ && d->d_parent->d_inode->i_ino == dir_ino) { -+ dentry = dget_locked(d); -+ break; -+ } -+ spin_unlock(&dcache_lock); -+ } -+ if (unlikely(dentry && sigen != au_digen(dentry))) { -+ dput(dentry); -+ dentry = ERR_PTR(-ESTALE); -+ } -+ -+ out_iput: -+ iput(inode); -+ out: -+ return dentry; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* todo: dirty? 
*/ -+/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ -+static struct vfsmount *au_mnt_get(struct super_block *sb) -+{ -+ struct mnt_namespace *ns; -+ struct vfsmount *pos, *mnt; -+ -+ spin_lock(&vfsmount_lock); -+ /* no get/put ?? */ -+ AuDebugOn(!current->nsproxy); -+ ns = current->nsproxy->mnt_ns; -+ AuDebugOn(!ns); -+ mnt = NULL; -+ /* the order (reverse) will not be a problem */ -+ list_for_each_entry(pos, &ns->list, mnt_list) -+ if (pos->mnt_sb == sb) { -+ mnt = mntget(pos); -+ break; -+ } -+ spin_unlock(&vfsmount_lock); -+ AuDebugOn(!mnt); -+ -+ return mnt; -+} -+ -+struct au_nfsd_si_lock { -+ const unsigned int sigen; -+ const aufs_bindex_t br_id; -+ unsigned char force_lock; -+}; -+ -+static aufs_bindex_t si_nfsd_read_lock(struct super_block *sb, -+ struct au_nfsd_si_lock *nsi_lock) -+{ -+ aufs_bindex_t bindex; -+ -+ si_read_lock(sb, AuLock_FLUSH); -+ -+ /* branch id may be wrapped around */ -+ bindex = au_br_index(sb, nsi_lock->br_id); -+ if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) -+ goto out; /* success */ -+ -+ if (!nsi_lock->force_lock) -+ si_read_unlock(sb); -+ bindex = -1; -+ -+ out: -+ return bindex; -+} -+ -+struct find_name_by_ino { -+ int called, found; -+ ino_t ino; -+ char *name; -+ int namelen; -+}; -+ -+static int -+find_name_by_ino(void *arg, const char *name, int namelen, loff_t offset, -+ u64 ino, unsigned int d_type) -+{ -+ struct find_name_by_ino *a = arg; -+ -+ a->called++; -+ if (a->ino != ino) -+ return 0; -+ -+ memcpy(a->name, name, namelen); -+ a->namelen = namelen; -+ a->found = 1; -+ return 1; -+} -+ -+static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, -+ struct au_nfsd_si_lock *nsi_lock) -+{ -+ struct dentry *dentry, *parent; -+ struct file *file; -+ struct inode *dir; -+ struct find_name_by_ino arg; -+ int err; -+ -+ parent = path->dentry; -+ if (nsi_lock) -+ si_read_unlock(parent->d_sb); -+ path_get(path); -+ file = vfsub_dentry_open(path, au_dir_roflags, current_cred()); -+ dentry = (void *)file; -+ if (IS_ERR(file)) -+ goto out; -+ -+ dentry = ERR_PTR(-ENOMEM); -+ arg.name = __getname(); -+ if (unlikely(!arg.name)) -+ goto out_file; -+ arg.ino = ino; -+ arg.found = 0; -+ do { -+ arg.called = 0; -+ /* smp_mb(); */ -+ err = vfsub_readdir(file, find_name_by_ino, &arg); -+ } while (!err && !arg.found && arg.called); -+ dentry = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out_name; -+ dentry = ERR_PTR(-ENOENT); -+ if (!arg.found) -+ goto out_name; -+ -+ /* do not call au_lkup_one() */ -+ dir = parent->d_inode; -+ mutex_lock(&dir->i_mutex); -+ dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen); -+ mutex_unlock(&dir->i_mutex); -+ AuTraceErrPtr(dentry); -+ if (IS_ERR(dentry)) -+ goto out_name; -+ AuDebugOn(au_test_anon(dentry)); -+ if (unlikely(!dentry->d_inode)) { -+ dput(dentry); -+ dentry = ERR_PTR(-ENOENT); -+ } -+ -+ out_name: -+ __putname(arg.name); -+ out_file: -+ fput(file); -+ out: -+ if (unlikely(nsi_lock -+ && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) -+ if (!IS_ERR(dentry)) { -+ dput(dentry); -+ dentry = ERR_PTR(-ESTALE); -+ } -+ AuTraceErrPtr(dentry); -+ return dentry; -+} -+ -+static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, -+ ino_t dir_ino, -+ struct au_nfsd_si_lock *nsi_lock) -+{ -+ struct dentry *dentry; -+ struct path path; -+ -+ if (dir_ino != AUFS_ROOT_INO) { -+ path.dentry = decode_by_ino(sb, dir_ino, 0); -+ dentry = path.dentry; -+ if (!path.dentry || IS_ERR(path.dentry)) -+ goto out; -+ AuDebugOn(au_test_anon(path.dentry)); -+ } else 
-+ path.dentry = dget(sb->s_root); -+ -+ path.mnt = au_mnt_get(sb); -+ dentry = au_lkup_by_ino(&path, ino, nsi_lock); -+ path_put(&path); -+ -+ out: -+ AuTraceErrPtr(dentry); -+ return dentry; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int h_acceptable(void *expv, struct dentry *dentry) -+{ -+ return 1; -+} -+ -+static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, -+ char *buf, int len, struct super_block *sb) -+{ -+ char *p; -+ int n; -+ struct path path; -+ -+ p = d_path(h_rootpath, buf, len); -+ if (IS_ERR(p)) -+ goto out; -+ n = strlen(p); -+ -+ path.mnt = h_rootpath->mnt; -+ path.dentry = h_parent; -+ p = d_path(&path, buf, len); -+ if (IS_ERR(p)) -+ goto out; -+ if (n != 1) -+ p += n; -+ -+ path.mnt = au_mnt_get(sb); -+ path.dentry = sb->s_root; -+ p = d_path(&path, buf, len - strlen(p)); -+ mntput(path.mnt); -+ if (IS_ERR(p)) -+ goto out; -+ if (n != 1) -+ p[strlen(p)] = '/'; -+ -+ out: -+ AuTraceErrPtr(p); -+ return p; -+} -+ -+static -+struct dentry *decode_by_path(struct super_block *sb, aufs_bindex_t bindex, -+ ino_t ino, __u32 *fh, int fh_len, -+ struct au_nfsd_si_lock *nsi_lock) -+{ -+ struct dentry *dentry, *h_parent, *root; -+ struct super_block *h_sb; -+ char *pathname, *p; -+ struct vfsmount *h_mnt; -+ struct au_branch *br; -+ int err; -+ struct path path; -+ -+ br = au_sbr(sb, bindex); -+ /* au_br_get(br); */ -+ h_mnt = br->br_mnt; -+ h_sb = h_mnt->mnt_sb; -+ /* todo: call lower fh_to_dentry()? fh_to_parent()? */ -+ h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), -+ fh_len - Fh_tail, fh[Fh_h_type], -+ h_acceptable, /*context*/NULL); -+ dentry = h_parent; -+ if (unlikely(!h_parent || IS_ERR(h_parent))) { -+ AuWarn1("%s decode_fh failed, %ld\n", -+ au_sbtype(h_sb), PTR_ERR(h_parent)); -+ goto out; -+ } -+ dentry = NULL; -+ if (unlikely(au_test_anon(h_parent))) { -+ AuWarn1("%s decode_fh returned a disconnected dentry\n", -+ au_sbtype(h_sb)); -+ goto out_h_parent; -+ } -+ -+ dentry = ERR_PTR(-ENOMEM); -+ pathname = (void *)__get_free_page(GFP_NOFS); -+ if (unlikely(!pathname)) -+ goto out_h_parent; -+ -+ root = sb->s_root; -+ path.mnt = h_mnt; -+ di_read_lock_parent(root, !AuLock_IR); -+ path.dentry = au_h_dptr(root, bindex); -+ di_read_unlock(root, !AuLock_IR); -+ p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); -+ dentry = (void *)p; -+ if (IS_ERR(p)) -+ goto out_pathname; -+ -+ si_read_unlock(sb); -+ err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); -+ dentry = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out_relock; -+ -+ dentry = ERR_PTR(-ENOENT); -+ AuDebugOn(au_test_anon(path.dentry)); -+ if (unlikely(!path.dentry->d_inode)) -+ goto out_path; -+ -+ if (ino != path.dentry->d_inode->i_ino) -+ dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); -+ else -+ dentry = dget(path.dentry); -+ -+ out_path: -+ path_put(&path); -+ out_relock: -+ if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) -+ if (!IS_ERR(dentry)) { -+ dput(dentry); -+ dentry = ERR_PTR(-ESTALE); -+ } -+ out_pathname: -+ free_page((unsigned long)pathname); -+ out_h_parent: -+ dput(h_parent); -+ out: -+ /* au_br_put(br); */ -+ AuTraceErrPtr(dentry); -+ return dentry; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static struct dentry * -+aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, -+ int fh_type) -+{ -+ struct dentry *dentry; -+ __u32 *fh = fid->raw; -+ ino_t ino, dir_ino; -+ aufs_bindex_t bindex; -+ struct 
au_nfsd_si_lock nsi_lock = { -+ .sigen = fh[Fh_sigen], -+ .br_id = fh[Fh_br_id], -+ .force_lock = 0 -+ }; -+ -+ AuDebugOn(fh_len < Fh_tail); -+ -+ dentry = ERR_PTR(-ESTALE); -+ /* branch id may be wrapped around */ -+ bindex = si_nfsd_read_lock(sb, &nsi_lock); -+ if (unlikely(bindex < 0)) -+ goto out; -+ nsi_lock.force_lock = 1; -+ -+ /* is this inode still cached? */ -+ ino = decode_ino(fh + Fh_ino); -+ AuDebugOn(ino == AUFS_ROOT_INO); -+ dir_ino = decode_ino(fh + Fh_dir_ino); -+ dentry = decode_by_ino(sb, ino, dir_ino); -+ if (IS_ERR(dentry)) -+ goto out_unlock; -+ if (dentry) -+ goto accept; -+ -+ /* is the parent dir cached? */ -+ dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); -+ if (IS_ERR(dentry)) -+ goto out_unlock; -+ if (dentry) -+ goto accept; -+ -+ /* lookup path */ -+ dentry = decode_by_path(sb, bindex, ino, fh, fh_len, &nsi_lock); -+ if (IS_ERR(dentry)) -+ goto out_unlock; -+ if (unlikely(!dentry)) -+ /* todo?: make it ESTALE */ -+ goto out_unlock; -+ -+ accept: -+ if (dentry->d_inode->i_generation == fh[Fh_igen]) -+ goto out_unlock; /* success */ -+ -+ dput(dentry); -+ dentry = ERR_PTR(-ESTALE); -+ out_unlock: -+ si_read_unlock(sb); -+ out: -+ AuTraceErrPtr(dentry); -+ return dentry; -+} -+ -+#if 0 /* reserved for future use */ -+/* support subtreecheck option */ -+static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ struct dentry *parent; -+ __u32 *fh = fid->raw; -+ ino_t dir_ino; -+ -+ dir_ino = decode_ino(fh + Fh_dir_ino); -+ parent = decode_by_ino(sb, dir_ino, 0); -+ if (IS_ERR(parent)) -+ goto out; -+ if (!parent) -+ parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), -+ dir_ino, fh, fh_len); -+ -+ out: -+ AuTraceErrPtr(parent); -+ return parent; -+} -+#endif -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int aufs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, -+ int connectable) -+{ -+ int err; -+ aufs_bindex_t bindex, bend; -+ struct super_block *sb, *h_sb; -+ struct inode *inode; -+ struct dentry *parent, *h_parent; -+ struct au_branch *br; -+ -+ AuDebugOn(au_test_anon(dentry)); -+ -+ parent = NULL; -+ err = -ENOSPC; -+ if (unlikely(*max_len <= Fh_tail)) { -+ AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); -+ goto out; -+ } -+ -+ err = FILEID_ROOT; -+ if (IS_ROOT(dentry)) { -+ AuDebugOn(dentry->d_inode->i_ino != AUFS_ROOT_INO); -+ goto out; -+ } -+ -+ err = -EIO; -+ h_parent = NULL; -+ sb = dentry->d_sb; -+ aufs_read_lock(dentry, AuLock_FLUSH | AuLock_IR); -+ parent = dget_parent(dentry); -+ di_read_lock_parent(parent, !AuLock_IR); -+ inode = dentry->d_inode; -+ AuDebugOn(!inode); -+#ifdef CONFIG_AUFS_DEBUG -+ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) -+ AuWarn1("NFS-exporting requires xino\n"); -+#endif -+ -+ bend = au_dbtaildir(parent); -+ for (bindex = au_dbstart(parent); bindex <= bend; bindex++) { -+ h_parent = au_h_dptr(parent, bindex); -+ if (h_parent) { -+ dget(h_parent); -+ break; -+ } -+ } -+ if (unlikely(!h_parent)) -+ goto out_unlock; -+ -+ err = -EPERM; -+ br = au_sbr(sb, bindex); -+ h_sb = br->br_mnt->mnt_sb; -+ if (unlikely(!h_sb->s_export_op)) { -+ AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); -+ goto out_dput; -+ } -+ -+ fh[Fh_br_id] = br->br_id; -+ fh[Fh_sigen] = au_sigen(sb); -+ encode_ino(fh + Fh_ino, inode->i_ino); -+ encode_ino(fh + Fh_dir_ino, parent->d_inode->i_ino); -+ fh[Fh_igen] = inode->i_generation; -+ -+ *max_len -= Fh_tail; -+ fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void 
*)(fh + Fh_tail), -+ max_len, -+ /*connectable or subtreecheck*/0); -+ err = fh[Fh_h_type]; -+ *max_len += Fh_tail; -+ /* todo: macros? */ -+ if (err != 255) -+ err = 99; -+ else -+ AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); -+ -+ out_dput: -+ dput(h_parent); -+ out_unlock: -+ di_read_unlock(parent, !AuLock_IR); -+ dput(parent); -+ aufs_read_unlock(dentry, AuLock_IR); -+ out: -+ if (unlikely(err < 0)) -+ err = 255; -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static struct export_operations aufs_export_op = { -+ .fh_to_dentry = aufs_fh_to_dentry, -+ /* .fh_to_parent = aufs_fh_to_parent, */ -+ .encode_fh = aufs_encode_fh -+}; -+ -+void au_export_init(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ __u32 u; -+ -+ sb->s_export_op = &aufs_export_op; -+ sbinfo = au_sbi(sb); -+ sbinfo->si_xigen = NULL; -+ get_random_bytes(&u, sizeof(u)); -+ BUILD_BUG_ON(sizeof(u) != sizeof(int)); -+ atomic_set(&sbinfo->si_xigen_next, u); -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/file.c linux-2.6.31/fs/aufs/file.c ---- linux-2.6.31-vanilla/fs/aufs/file.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/file.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,568 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
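
Aside: taken together, the Fh_* enum and aufs_encode_fh()/aufs_fh_to_dentry()
above define the handle format plus a three-stage decode: first the inode
cache (decode_by_ino), then the parent directory's cache and a readdir scan
(decode_by_dir_ino), and only then a full path walk on the lower branch
(decode_by_path). For the CONFIG_AUFS_INO_T_64 layout the words are (a
summary of the enum above, not new material):

/* aufs NFS file handle, one __u32 per slot:
 *   fh[0]   Fh_br_id    branch id the file was found on
 *   fh[1]   Fh_sigen    superblock generation at encode time
 *   fh[2,3] Fh_ino      inode number (two words with 64bit ino)
 *   fh[4,5] Fh_dir_ino  parent directory's inode number
 *   fh[6]   Fh_igen     inode generation
 *   fh[7]   Fh_h_type   lower fs handle type; the lower fs handle
 *                       itself follows from Fh_tail onward
 */
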
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * handling file/dir, and address_space operation -+ */ -+ -+#include <linux/file.h> -+#include <linux/fsnotify.h> -+#include <linux/namei.h> -+#include <linux/pagemap.h> -+#include "aufs.h" -+ -+/* drop flags for writing */ -+unsigned int au_file_roflags(unsigned int flags) -+{ -+ flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); -+ flags |= O_RDONLY | O_NOATIME; -+ return flags; -+} -+ -+/* common functions to regular file and dir */ -+struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, -+ struct file *file) -+{ -+ struct file *h_file; -+ struct dentry *h_dentry; -+ struct inode *h_inode; -+ struct super_block *sb; -+ struct au_branch *br; -+ int err, exec_flag; -+ struct path h_path; -+ -+ /* a race condition can happen between open and unlink/rmdir */ -+ h_file = ERR_PTR(-ENOENT); -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (au_test_nfsd(current) && !h_dentry) -+ goto out; -+ h_inode = h_dentry->d_inode; -+ if (au_test_nfsd(current) && !h_inode) -+ goto out; -+ if (unlikely((!d_unhashed(dentry) && d_unhashed(h_dentry)) -+ || !h_inode)) -+ goto out; -+ -+ sb = dentry->d_sb; -+ br = au_sbr(sb, bindex); -+ h_file = ERR_PTR(-EACCES); -+ exec_flag = flags & vfsub_fmode_to_uint(FMODE_EXEC); -+ if (exec_flag && (br->br_mnt->mnt_flags & MNT_NOEXEC)) -+ goto out; -+ -+ /* drop flags for writing */ -+ if (au_test_ro(sb, bindex, dentry->d_inode)) -+ flags = au_file_roflags(flags); -+ flags &= ~O_CREAT; -+ atomic_inc(&br->br_count); -+ h_path.dentry = h_dentry; -+ h_path.mnt = br->br_mnt; -+ path_get(&h_path); -+ h_file = vfsub_dentry_open(&h_path, flags, current_cred()); -+ if (IS_ERR(h_file)) -+ goto out_br; -+ -+ if (exec_flag) { -+ err = deny_write_access(h_file); -+ if (unlikely(err)) { -+ fput(h_file); -+ h_file = ERR_PTR(err); -+ goto out_br; -+ } -+ } -+ fsnotify_open(h_dentry); -+ goto out; /* success */ -+ -+ out_br: -+ atomic_dec(&br->br_count); -+ out: -+ return h_file; -+} -+ -+int au_do_open(struct file *file, int (*open)(struct file *file, int flags)) -+{ -+ int err; -+ unsigned int flags; -+ struct dentry *dentry; -+ struct super_block *sb; -+ -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_finfo_init(file); -+ if (unlikely(err)) -+ goto out; -+ -+ di_read_lock_child(dentry, AuLock_IR); -+ spin_lock(&file->f_lock); -+ flags = file->f_flags; -+ spin_unlock(&file->f_lock); -+ err = open(file, flags); -+ di_read_unlock(dentry, AuLock_IR); -+ -+ fi_write_unlock(file); -+ if (unlikely(err)) -+ au_finfo_fin(file); -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+int au_reopen_nondir(struct file *file) -+{ -+ int err; -+ unsigned int flags; -+ aufs_bindex_t bstart, bindex, bend; -+ struct dentry *dentry; -+ struct file *h_file, *h_file_tmp; -+ -+ dentry = file->f_dentry; -+ bstart = au_dbstart(dentry); -+ h_file_tmp = NULL; -+ if (au_fbstart(file) == bstart) { -+ h_file = au_h_fptr(file, bstart); -+ if (file->f_mode == h_file->f_mode) -+ return 0; /* success */ -+ h_file_tmp = h_file; -+ get_file(h_file_tmp); -+ au_set_h_fptr(file, bstart, NULL); -+ } -+ AuDebugOn(au_fbstart(file) < bstart -+ || au_fi(file)->fi_hfile[0 + bstart].hf_file); -+ -+ spin_lock(&file->f_lock); -+ flags = file->f_flags & ~O_TRUNC; -+ spin_unlock(&file->f_lock); -+ h_file = 
au_h_open(dentry, bstart, flags, file); -+ err = PTR_ERR(h_file); -+ if (IS_ERR(h_file)) -+ goto out; /* todo: close all? */ -+ -+ err = 0; -+ au_set_fbstart(file, bstart); -+ au_set_h_fptr(file, bstart, h_file); -+ au_update_figen(file); -+ /* todo: necessary? */ -+ /* file->f_ra = h_file->f_ra; */ -+ -+ /* close lower files */ -+ bend = au_fbend(file); -+ for (bindex = bstart + 1; bindex <= bend; bindex++) -+ au_set_h_fptr(file, bindex, NULL); -+ au_set_fbend(file, bstart); -+ -+ out: -+ if (h_file_tmp) -+ fput(h_file_tmp); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, -+ struct dentry *hi_wh) -+{ -+ int err; -+ aufs_bindex_t bstart; -+ struct au_dinfo *dinfo; -+ struct dentry *h_dentry; -+ -+ dinfo = au_di(file->f_dentry); -+ AuRwMustWriteLock(&dinfo->di_rwsem); -+ -+ bstart = dinfo->di_bstart; -+ dinfo->di_bstart = btgt; -+ h_dentry = dinfo->di_hdentry[0 + btgt].hd_dentry; -+ dinfo->di_hdentry[0 + btgt].hd_dentry = hi_wh; -+ err = au_reopen_nondir(file); -+ dinfo->di_hdentry[0 + btgt].hd_dentry = h_dentry; -+ dinfo->di_bstart = bstart; -+ -+ return err; -+} -+ -+static int au_ready_to_write_wh(struct file *file, loff_t len, -+ aufs_bindex_t bcpup) -+{ -+ int err; -+ struct inode *inode; -+ struct dentry *dentry, *hi_wh; -+ struct super_block *sb; -+ -+ dentry = file->f_dentry; -+ inode = dentry->d_inode; -+ hi_wh = au_hi_wh(inode, bcpup); -+ if (!hi_wh) -+ err = au_sio_cpup_wh(dentry, bcpup, len, file); -+ else -+ /* already copied-up after unlink */ -+ err = au_reopen_wh(file, bcpup, hi_wh); -+ -+ sb = dentry->d_sb; -+ if (!err && inode->i_nlink > 1 && au_opt_test(au_mntflags(sb), PLINK)) -+ au_plink_append(inode, bcpup, au_h_dptr(dentry, bcpup)); -+ -+ return err; -+} -+ -+/* -+ * prepare the @file for writing. 
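
Aside: au_ready_to_write() in the next hunk is the copy-up-on-write gate: if
the branch holding the file is read-only, or the lower file was not opened
with write mode, the file must first be copied up to a writable branch. The
test it starts from, factored into a standalone helper (needs_copyup() is a
hypothetical name, not part of the patch):

static int needs_copyup(struct super_block *sb, aufs_bindex_t bstart,
                        struct inode *inode, struct file *h_file)
{
        /* writing in place is fine only on a writable branch with a
         * write-capable lower file; anything else forces a copy-up */
        return au_test_ro(sb, bstart, inode)
                || !(h_file->f_mode & FMODE_WRITE);
}
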
-+ */ -+int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) -+{ -+ int err; -+ aufs_bindex_t bstart, bcpup; -+ struct dentry *dentry, *parent, *h_dentry; -+ struct inode *h_inode, *inode; -+ struct super_block *sb; -+ -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ bstart = au_fbstart(file); -+ inode = dentry->d_inode; -+ err = au_test_ro(sb, bstart, inode); -+ if (!err && (au_h_fptr(file, bstart)->f_mode & FMODE_WRITE)) { -+ err = au_pin(pin, dentry, bstart, AuOpt_UDBA_NONE, /*flags*/0); -+ goto out; -+ } -+ -+ /* need to cpup */ -+ parent = dget_parent(dentry); -+ di_write_lock_parent(parent); -+ err = AuWbrCopyup(au_sbi(sb), dentry); -+ bcpup = err; -+ if (unlikely(err < 0)) -+ goto out_dgrade; -+ err = 0; -+ -+ if (!au_h_dptr(parent, bcpup)) { -+ err = au_cpup_dirs(dentry, bcpup); -+ if (unlikely(err)) -+ goto out_dgrade; -+ } -+ -+ err = au_pin(pin, dentry, bcpup, AuOpt_UDBA_NONE, -+ AuPin_DI_LOCKED | AuPin_MNT_WRITE); -+ if (unlikely(err)) -+ goto out_dgrade; -+ -+ h_dentry = au_h_fptr(file, bstart)->f_dentry; -+ h_inode = h_dentry->d_inode; -+ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); -+ if (d_unhashed(dentry) /* || d_unhashed(h_dentry) */ -+ /* || !h_inode->i_nlink */) { -+ err = au_ready_to_write_wh(file, len, bcpup); -+ di_downgrade_lock(parent, AuLock_IR); -+ } else { -+ di_downgrade_lock(parent, AuLock_IR); -+ if (!au_h_dptr(dentry, bcpup)) -+ err = au_sio_cpup_simple(dentry, bcpup, len, -+ AuCpup_DTIME); -+ if (!err) -+ err = au_reopen_nondir(file); -+ } -+ mutex_unlock(&h_inode->i_mutex); -+ -+ if (!err) { -+ au_pin_set_parent_lflag(pin, /*lflag*/0); -+ goto out_dput; /* success */ -+ } -+ au_unpin(pin); -+ goto out_unlock; -+ -+ out_dgrade: -+ di_downgrade_lock(parent, AuLock_IR); -+ out_unlock: -+ di_read_unlock(parent, AuLock_IR); -+ out_dput: -+ dput(parent); -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int au_file_refresh_by_inode(struct file *file, int *need_reopen) -+{ -+ int err; -+ aufs_bindex_t bstart; -+ struct au_pin pin; -+ struct au_finfo *finfo; -+ struct dentry *dentry, *parent, *hi_wh; -+ struct inode *inode; -+ struct super_block *sb; -+ -+ FiMustWriteLock(file); -+ -+ err = 0; -+ finfo = au_fi(file); -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ inode = dentry->d_inode; -+ bstart = au_ibstart(inode); -+ if (bstart == finfo->fi_bstart) -+ goto out; -+ -+ parent = dget_parent(dentry); -+ if (au_test_ro(sb, bstart, inode)) { -+ di_read_lock_parent(parent, !AuLock_IR); -+ err = AuWbrCopyup(au_sbi(sb), dentry); -+ bstart = err; -+ di_read_unlock(parent, !AuLock_IR); -+ if (unlikely(err < 0)) -+ goto out_parent; -+ err = 0; -+ } -+ -+ di_read_lock_parent(parent, AuLock_IR); -+ hi_wh = au_hi_wh(inode, bstart); -+ if (au_opt_test(au_mntflags(sb), PLINK) -+ && au_plink_test(inode) -+ && !d_unhashed(dentry)) { -+ err = au_test_and_cpup_dirs(dentry, bstart); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ /* always superio. 
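
Aside: "superio" (sio) in the comment above means re-running an operation
through aufs' privileged workqueue when the caller's credentials cannot read
or traverse the lower branch; sio_test_empty() in the dir.c hunk earlier
shows the complete pattern. In outline (simplified from that hunk, not new
code):

        err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ);
        if (!err)
                err = do_test_empty(dentry, arg);       /* caller's creds suffice */
        else
                err = au_wkq_wait(call_do_test_empty, &args);   /* privileged wkq */
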
*/ -+ err = au_pin(&pin, dentry, bstart, AuOpt_UDBA_NONE, -+ AuPin_DI_LOCKED | AuPin_MNT_WRITE); -+ if (!err) -+ err = au_sio_cpup_simple(dentry, bstart, -1, -+ AuCpup_DTIME); -+ au_unpin(&pin); -+ } else if (hi_wh) { -+ /* already copied-up after unlink */ -+ err = au_reopen_wh(file, bstart, hi_wh); -+ *need_reopen = 0; -+ } -+ -+ out_unlock: -+ di_read_unlock(parent, AuLock_IR); -+ out_parent: -+ dput(parent); -+ out: -+ return err; -+} -+ -+static void au_do_refresh_file(struct file *file) -+{ -+ aufs_bindex_t bindex, bend, new_bindex, brid; -+ struct au_hfile *p, tmp, *q; -+ struct au_finfo *finfo; -+ struct super_block *sb; -+ -+ FiMustWriteLock(file); -+ -+ sb = file->f_dentry->d_sb; -+ finfo = au_fi(file); -+ p = finfo->fi_hfile + finfo->fi_bstart; -+ brid = p->hf_br->br_id; -+ bend = finfo->fi_bend; -+ for (bindex = finfo->fi_bstart; bindex <= bend; bindex++, p++) { -+ if (!p->hf_file) -+ continue; -+ -+ new_bindex = au_br_index(sb, p->hf_br->br_id); -+ if (new_bindex == bindex) -+ continue; -+ if (new_bindex < 0) { -+ au_set_h_fptr(file, bindex, NULL); -+ continue; -+ } -+ -+ /* swap two lower inode, and loop again */ -+ q = finfo->fi_hfile + new_bindex; -+ tmp = *q; -+ *q = *p; -+ *p = tmp; -+ if (tmp.hf_file) { -+ bindex--; -+ p--; -+ } -+ } -+ -+ p = finfo->fi_hfile; -+ if (!au_test_mmapped(file) && !d_unhashed(file->f_dentry)) { -+ bend = au_sbend(sb); -+ for (finfo->fi_bstart = 0; finfo->fi_bstart <= bend; -+ finfo->fi_bstart++, p++) -+ if (p->hf_file) { -+ if (p->hf_file->f_dentry -+ && p->hf_file->f_dentry->d_inode) -+ break; -+ else -+ au_hfput(p, file); -+ } -+ } else { -+ bend = au_br_index(sb, brid); -+ for (finfo->fi_bstart = 0; finfo->fi_bstart < bend; -+ finfo->fi_bstart++, p++) -+ if (p->hf_file) -+ au_hfput(p, file); -+ bend = au_sbend(sb); -+ } -+ -+ p = finfo->fi_hfile + bend; -+ for (finfo->fi_bend = bend; finfo->fi_bend >= finfo->fi_bstart; -+ finfo->fi_bend--, p--) -+ if (p->hf_file) { -+ if (p->hf_file->f_dentry -+ && p->hf_file->f_dentry->d_inode) -+ break; -+ else -+ au_hfput(p, file); -+ } -+ AuDebugOn(finfo->fi_bend < finfo->fi_bstart); -+} -+ -+/* -+ * after branch manipulating, refresh the file. 
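
Aside: au_do_refresh_file() above re-sorts the per-branch open-file array
after branches were added or removed: each entry is looked up by its stable
branch id and swapped to that branch's new index, then the swapped-in entry
is examined again. The core remapping step, isolated (names as in the hunk;
illustration only):

        new_bindex = au_br_index(sb, p->hf_br->br_id);
        if (new_bindex >= 0 && new_bindex != bindex) {
                q = finfo->fi_hfile + new_bindex;
                tmp = *q;       /* swap the two slots ... */
                *q = *p;
                *p = tmp;       /* ... and revisit this slot next pass */
        }
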
-+ */ -+static int refresh_file(struct file *file, int (*reopen)(struct file *file)) -+{ -+ int err, need_reopen; -+ struct dentry *dentry; -+ aufs_bindex_t bend, bindex; -+ -+ dentry = file->f_dentry; -+ err = au_fi_realloc(au_fi(file), au_sbend(dentry->d_sb) + 1); -+ if (unlikely(err)) -+ goto out; -+ au_do_refresh_file(file); -+ -+ err = 0; -+ need_reopen = 1; -+ if (!au_test_mmapped(file)) -+ err = au_file_refresh_by_inode(file, &need_reopen); -+ if (!err && need_reopen && !d_unhashed(dentry)) -+ err = reopen(file); -+ if (!err) { -+ au_update_figen(file); -+ return 0; /* success */ -+ } -+ -+ /* error, close all lower files */ -+ bend = au_fbend(file); -+ for (bindex = au_fbstart(file); bindex <= bend; bindex++) -+ au_set_h_fptr(file, bindex, NULL); -+ -+ out: -+ return err; -+} -+ -+/* common function to regular file and dir */ -+int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), -+ int wlock) -+{ -+ int err; -+ unsigned int sigen, figen; -+ aufs_bindex_t bstart; -+ unsigned char pseudo_link; -+ struct dentry *dentry; -+ -+ err = 0; -+ dentry = file->f_dentry; -+ sigen = au_sigen(dentry->d_sb); -+ fi_write_lock(file); -+ figen = au_figen(file); -+ di_write_lock_child(dentry); -+ bstart = au_dbstart(dentry); -+ pseudo_link = (bstart != au_ibstart(dentry->d_inode)); -+ if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) { -+ if (!wlock) { -+ di_downgrade_lock(dentry, AuLock_IR); -+ fi_downgrade_lock(file); -+ } -+ goto out; /* success */ -+ } -+ -+ AuDbg("sigen %d, figen %d\n", sigen, figen); -+ if (sigen != au_digen(dentry) -+ || sigen != au_iigen(dentry->d_inode)) { -+ err = au_reval_dpath(dentry, sigen); -+ if (unlikely(err < 0)) -+ goto out; -+ AuDebugOn(au_digen(dentry) != sigen -+ || au_iigen(dentry->d_inode) != sigen); -+ } -+ -+ err = refresh_file(file, reopen); -+ if (!err) { -+ if (!wlock) { -+ di_downgrade_lock(dentry, AuLock_IR); -+ fi_downgrade_lock(file); -+ } -+ } else { -+ di_write_unlock(dentry); -+ fi_write_unlock(file); -+ } -+ -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* cf. aufs_nopage() */ -+/* for madvise(2) */ -+static int aufs_readpage(struct file *file __maybe_unused, struct page *page) -+{ -+ unlock_page(page); -+ return 0; -+} -+ -+/* they will never be called. 
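/*
 * au_reval_and_lock_fdi() above boils down to a generation check: the
 * superblock generation (sigen) is bumped on every branch change, each
 * open file remembers the generation it was set up for (figen), and a
 * mismatch forces refresh_file(). The same pattern in miniature;
 * sb_gen, struct cached_file and rebuild() are assumptions for
 * illustration only.
 */
#include <stdatomic.h>

static atomic_int sb_gen;   /* bumped on every "branch" change */

struct cached_file {
    int gen;                /* generation this state was built for */
    /* ... per-file cached state ... */
};

static void rebuild(struct cached_file *f)
{
    /* refresh the cached state, cf. refresh_file() */
}

static int validate(struct cached_file *f)
{
    int g = atomic_load(&sb_gen);

    if (f->gen == g)
        return 0;           /* still current: the fast path */
    rebuild(f);
    f->gen = g;
    return 1;               /* state was refreshed */
}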
*/ -+#ifdef CONFIG_AUFS_DEBUG -+static int aufs_write_begin(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned flags, -+ struct page **pagep, void **fsdata) -+{ AuUnsupport(); return 0; } -+static int aufs_write_end(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned copied, -+ struct page *page, void *fsdata) -+{ AuUnsupport(); return 0; } -+static int aufs_writepage(struct page *page, struct writeback_control *wbc) -+{ AuUnsupport(); return 0; } -+static void aufs_sync_page(struct page *page) -+{ AuUnsupport(); } -+ -+static int aufs_set_page_dirty(struct page *page) -+{ AuUnsupport(); return 0; } -+static void aufs_invalidatepage(struct page *page, unsigned long offset) -+{ AuUnsupport(); } -+static int aufs_releasepage(struct page *page, gfp_t gfp) -+{ AuUnsupport(); return 0; } -+static ssize_t aufs_direct_IO(int rw, struct kiocb *iocb, -+ const struct iovec *iov, loff_t offset, -+ unsigned long nr_segs) -+{ AuUnsupport(); return 0; } -+#endif /* CONFIG_AUFS_DEBUG */ -+ -+struct address_space_operations aufs_aop = { -+ .readpage = aufs_readpage, -+#ifdef CONFIG_AUFS_DEBUG -+ .writepage = aufs_writepage, -+ .sync_page = aufs_sync_page, -+ .set_page_dirty = aufs_set_page_dirty, -+ .write_begin = aufs_write_begin, -+ .write_end = aufs_write_end, -+ .invalidatepage = aufs_invalidatepage, -+ .releasepage = aufs_releasepage, -+ .direct_IO = aufs_direct_IO, -+#endif /* CONFIG_AUFS_DEBUG */ -+}; -diff -Nur linux-2.6.31-vanilla/fs/aufs/file.h linux-2.6.31/fs/aufs/file.h ---- linux-2.6.31-vanilla/fs/aufs/file.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/file.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,174 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * file operations -+ */ -+ -+#ifndef __AUFS_FILE_H__ -+#define __AUFS_FILE_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/fs.h> -+#include <linux/poll.h> -+#include <linux/aufs_type.h> -+#include "rwsem.h" -+ -+struct au_branch; -+struct au_hfile { -+ struct file *hf_file; -+ struct au_branch *hf_br; -+}; -+ -+struct au_vdir; -+struct au_finfo { -+ atomic_t fi_generation; -+ -+ struct au_rwsem fi_rwsem; -+ struct au_hfile *fi_hfile; -+ aufs_bindex_t fi_bstart, fi_bend; -+ -+ union { -+ /* non-dir only */ -+ struct { -+ struct vm_operations_struct *fi_h_vm_ops; -+ struct vm_operations_struct *fi_vm_ops; -+ }; -+ -+ /* dir only */ -+ struct { -+ struct au_vdir *fi_vdir_cache; -+ int fi_maintain_plink; -+ }; -+ }; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* file.c */ -+extern struct address_space_operations aufs_aop; -+unsigned int au_file_roflags(unsigned int flags); -+struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, -+ struct file *file); -+int au_do_open(struct file *file, int (*open)(struct file *file, int flags)); -+int au_reopen_nondir(struct file *file); -+struct au_pin; -+int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); -+int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), -+ int wlock); -+ -+/* poll.c */ -+#ifdef CONFIG_AUFS_POLL -+unsigned int aufs_poll(struct file *file, poll_table *wait); -+#endif -+ -+/* f_op.c */ -+extern const struct file_operations aufs_file_fop; -+int aufs_flush(struct file *file, fl_owner_t id); -+ -+/* finfo.c */ -+void au_hfput(struct au_hfile *hf, struct file *file); -+void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, -+ struct file *h_file); -+ -+void au_update_figen(struct file *file); -+ -+void au_finfo_fin(struct file *file); -+int au_finfo_init(struct file *file); -+int au_fi_realloc(struct au_finfo *finfo, int nbr); -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline struct au_finfo *au_fi(struct file *file) -+{ -+ return file->private_data; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * fi_read_lock, fi_write_lock, -+ * fi_read_unlock, fi_write_unlock, fi_downgrade_lock -+ */ -+AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); -+ -+#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) -+#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) -+#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* todo: hard/soft set? 
*/ -+static inline aufs_bindex_t au_fbstart(struct file *file) -+{ -+ FiMustAnyLock(file); -+ return au_fi(file)->fi_bstart; -+} -+ -+static inline aufs_bindex_t au_fbend(struct file *file) -+{ -+ FiMustAnyLock(file); -+ return au_fi(file)->fi_bend; -+} -+ -+static inline struct au_vdir *au_fvdir_cache(struct file *file) -+{ -+ FiMustAnyLock(file); -+ return au_fi(file)->fi_vdir_cache; -+} -+ -+static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex) -+{ -+ FiMustWriteLock(file); -+ au_fi(file)->fi_bstart = bindex; -+} -+ -+static inline void au_set_fbend(struct file *file, aufs_bindex_t bindex) -+{ -+ FiMustWriteLock(file); -+ au_fi(file)->fi_bend = bindex; -+} -+ -+static inline void au_set_fvdir_cache(struct file *file, -+ struct au_vdir *vdir_cache) -+{ -+ FiMustWriteLock(file); -+ au_fi(file)->fi_vdir_cache = vdir_cache; -+} -+ -+static inline struct file *au_h_fptr(struct file *file, aufs_bindex_t bindex) -+{ -+ FiMustAnyLock(file); -+ return au_fi(file)->fi_hfile[0 + bindex].hf_file; -+} -+ -+/* todo: memory barrier? */ -+static inline unsigned int au_figen(struct file *f) -+{ -+ return atomic_read(&au_fi(f)->fi_generation); -+} -+ -+static inline int au_test_mmapped(struct file *f) -+{ -+ /* FiMustAnyLock(f); */ -+ return !!(au_fi(f)->fi_h_vm_ops); -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_FILE_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/finfo.c linux-2.6.31/fs/aufs/finfo.c ---- linux-2.6.31-vanilla/fs/aufs/finfo.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/finfo.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,128 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * file private data -+ */ -+ -+#include <linux/file.h> -+#include "aufs.h" -+ -+void au_hfput(struct au_hfile *hf, struct file *file) -+{ -+ if (file->f_flags & vfsub_fmode_to_uint(FMODE_EXEC)) -+ allow_write_access(hf->hf_file); -+ fput(hf->hf_file); -+ hf->hf_file = NULL; -+ atomic_dec_return(&hf->hf_br->br_count); -+ hf->hf_br = NULL; -+} -+ -+void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) -+{ -+ struct au_finfo *finfo = au_fi(file); -+ struct au_hfile *hf; -+ -+ hf = finfo->fi_hfile + bindex; -+ if (hf->hf_file) -+ au_hfput(hf, file); -+ if (val) { -+ hf->hf_file = val; -+ hf->hf_br = au_sbr(file->f_dentry->d_sb, bindex); -+ } -+} -+ -+void au_update_figen(struct file *file) -+{ -+ atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_dentry)); -+ /* smp_mb(); */ /* atomic_set */ -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_finfo_fin(struct file *file) -+{ -+ struct au_finfo *finfo; -+ aufs_bindex_t bindex, bend; -+ -+ fi_write_lock(file); -+ bend = au_fbend(file); -+ bindex = au_fbstart(file); -+ if (bindex >= 0) -+ /* -+ * calls fput() instead of filp_close(), -+ * since no dnotify or lock for the lower file. -+ */ -+ for (; bindex <= bend; bindex++) -+ au_set_h_fptr(file, bindex, NULL); -+ -+ finfo = au_fi(file); -+ au_dbg_verify_hf(finfo); -+ kfree(finfo->fi_hfile); -+ fi_write_unlock(file); -+ AuRwDestroy(&finfo->fi_rwsem); -+ au_cache_free_finfo(finfo); -+} -+ -+int au_finfo_init(struct file *file) -+{ -+ struct au_finfo *finfo; -+ struct dentry *dentry; -+ -+ dentry = file->f_dentry; -+ finfo = au_cache_alloc_finfo(); -+ if (unlikely(!finfo)) -+ goto out; -+ -+ finfo->fi_hfile = kcalloc(au_sbend(dentry->d_sb) + 1, -+ sizeof(*finfo->fi_hfile), GFP_NOFS); -+ if (unlikely(!finfo->fi_hfile)) -+ goto out_finfo; -+ -+ au_rw_init_wlock(&finfo->fi_rwsem); -+ finfo->fi_bstart = -1; -+ finfo->fi_bend = -1; -+ atomic_set(&finfo->fi_generation, au_digen(dentry)); -+ /* smp_mb(); */ /* atomic_set */ -+ -+ file->private_data = finfo; -+ return 0; /* success */ -+ -+ out_finfo: -+ au_cache_free_finfo(finfo); -+ out: -+ return -ENOMEM; -+} -+ -+int au_fi_realloc(struct au_finfo *finfo, int nbr) -+{ -+ int err, sz; -+ struct au_hfile *hfp; -+ -+ err = -ENOMEM; -+ sz = sizeof(*hfp) * (finfo->fi_bend + 1); -+ if (!sz) -+ sz = sizeof(*hfp); -+ hfp = au_kzrealloc(finfo->fi_hfile, sz, sizeof(*hfp) * nbr, GFP_NOFS); -+ if (hfp) { -+ finfo->fi_hfile = hfp; -+ err = 0; -+ } -+ -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/f_op.c linux-2.6.31/fs/aufs/f_op.c ---- linux-2.6.31-vanilla/fs/aufs/f_op.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/f_op.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,823 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * file and vm operations -+ */ -+ -+#include <linux/file.h> -+#include <linux/fs_stack.h> -+#include <linux/ima.h> -+#include <linux/mman.h> -+#include <linux/mm.h> -+#include <linux/security.h> -+#include "aufs.h" -+ -+/* common function to regular file and dir */ -+int aufs_flush(struct file *file, fl_owner_t id) -+{ -+ int err; -+ aufs_bindex_t bindex, bend; -+ struct dentry *dentry; -+ struct file *h_file; -+ -+ dentry = file->f_dentry; -+ si_noflush_read_lock(dentry->d_sb); -+ fi_read_lock(file); -+ di_read_lock_child(dentry, AuLock_IW); -+ -+ err = 0; -+ bend = au_fbend(file); -+ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { -+ h_file = au_h_fptr(file, bindex); -+ if (!h_file || !h_file->f_op || !h_file->f_op->flush) -+ continue; -+ -+ err = h_file->f_op->flush(h_file, id); -+ if (!err) -+ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); -+ /*ignore*/ -+ } -+ au_cpup_attr_timesizes(dentry->d_inode); -+ -+ di_read_unlock(dentry, AuLock_IW); -+ fi_read_unlock(file); -+ si_read_unlock(dentry->d_sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int do_open_nondir(struct file *file, int flags) -+{ -+ int err; -+ aufs_bindex_t bindex; -+ struct file *h_file; -+ struct dentry *dentry; -+ struct au_finfo *finfo; -+ -+ FiMustWriteLock(file); -+ -+ err = 0; -+ dentry = file->f_dentry; -+ finfo = au_fi(file); -+ finfo->fi_h_vm_ops = NULL; -+ finfo->fi_vm_ops = NULL; -+ bindex = au_dbstart(dentry); -+ /* O_TRUNC is processed already */ -+ BUG_ON(au_test_ro(dentry->d_sb, bindex, dentry->d_inode) -+ && (flags & O_TRUNC)); -+ -+ h_file = au_h_open(dentry, bindex, flags, file); -+ if (IS_ERR(h_file)) -+ err = PTR_ERR(h_file); -+ else { -+ au_set_fbstart(file, bindex); -+ au_set_fbend(file, bindex); -+ au_set_h_fptr(file, bindex, h_file); -+ au_update_figen(file); -+ /* todo: necessary? */ -+ /* file->f_ra = h_file->f_ra; */ -+ } -+ return err; -+} -+ -+static int aufs_open_nondir(struct inode *inode __maybe_unused, -+ struct file *file) -+{ -+ return au_do_open(file, do_open_nondir); -+} -+ -+static int aufs_release_nondir(struct inode *inode __maybe_unused, -+ struct file *file) -+{ -+ struct super_block *sb = file->f_dentry->d_sb; -+ -+ si_noflush_read_lock(sb); -+ kfree(au_fi(file)->fi_vm_ops); -+ au_finfo_fin(file); -+ si_read_unlock(sb); -+ return 0; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, -+ loff_t *ppos) -+{ -+ ssize_t err; -+ struct dentry *dentry; -+ struct file *h_file; -+ struct super_block *sb; -+ -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); -+ if (unlikely(err)) -+ goto out; -+ -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ err = vfsub_read_u(h_file, buf, count, ppos); -+ /* todo: necessary? 
*/ -+ /* file->f_ra = h_file->f_ra; */ -+ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); -+ -+ di_read_unlock(dentry, AuLock_IR); -+ fi_read_unlock(file); -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+static ssize_t aufs_write(struct file *file, const char __user *ubuf, -+ size_t count, loff_t *ppos) -+{ -+ ssize_t err; -+ aufs_bindex_t bstart; -+ struct au_pin pin; -+ struct dentry *dentry; -+ struct inode *inode; -+ struct super_block *sb; -+ struct file *h_file; -+ char __user *buf = (char __user *)ubuf; -+ -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ inode = dentry->d_inode; -+ mutex_lock(&inode->i_mutex); -+ si_read_lock(sb, AuLock_FLUSH); -+ -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ -+ err = au_ready_to_write(file, -1, &pin); -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ bstart = au_fbstart(file); -+ h_file = au_h_fptr(file, bstart); -+ au_unpin(&pin); -+ err = vfsub_write_u(h_file, buf, count, ppos); -+ au_cpup_attr_timesizes(inode); -+ inode->i_mode = h_file->f_dentry->d_inode->i_mode; -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_write_unlock(file); -+ out: -+ si_read_unlock(sb); -+ mutex_unlock(&inode->i_mutex); -+ return err; -+} -+ -+static ssize_t aufs_aio_read(struct kiocb *kio, const struct iovec *iov, -+ unsigned long nv, loff_t pos) -+{ -+ ssize_t err; -+ struct file *file, *h_file; -+ struct dentry *dentry; -+ struct super_block *sb; -+ -+ file = kio->ki_filp; -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); -+ if (unlikely(err)) -+ goto out; -+ -+ err = -ENOSYS; -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ if (h_file->f_op && h_file->f_op->aio_read) { -+ err = security_file_permission(h_file, MAY_READ); -+ if (unlikely(err)) -+ goto out_unlock; -+ if (!is_sync_kiocb(kio)) { -+ get_file(h_file); -+ fput(file); -+ } -+ kio->ki_filp = h_file; -+ err = h_file->f_op->aio_read(kio, iov, nv, pos); -+ /* todo: necessary? 
*/ -+ /* file->f_ra = h_file->f_ra; */ -+ fsstack_copy_attr_atime(dentry->d_inode, -+ h_file->f_dentry->d_inode); -+ } else -+ /* currently there is no such fs */ -+ WARN_ON_ONCE(h_file->f_op && h_file->f_op->read); -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_read_unlock(file); -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+static ssize_t aufs_aio_write(struct kiocb *kio, const struct iovec *iov, -+ unsigned long nv, loff_t pos) -+{ -+ ssize_t err; -+ aufs_bindex_t bstart; -+ struct au_pin pin; -+ struct dentry *dentry; -+ struct inode *inode; -+ struct super_block *sb; -+ struct file *file, *h_file; -+ -+ file = kio->ki_filp; -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ inode = dentry->d_inode; -+ mutex_lock(&inode->i_mutex); -+ si_read_lock(sb, AuLock_FLUSH); -+ -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ -+ err = au_ready_to_write(file, -1, &pin); -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ err = -ENOSYS; -+ bstart = au_fbstart(file); -+ h_file = au_h_fptr(file, bstart); -+ au_unpin(&pin); -+ if (h_file->f_op && h_file->f_op->aio_write) { -+ err = security_file_permission(h_file, MAY_WRITE); -+ if (unlikely(err)) -+ goto out_unlock; -+ if (!is_sync_kiocb(kio)) { -+ get_file(h_file); -+ fput(file); -+ } -+ kio->ki_filp = h_file; -+ err = h_file->f_op->aio_write(kio, iov, nv, pos); -+ au_cpup_attr_timesizes(inode); -+ inode->i_mode = h_file->f_dentry->d_inode->i_mode; -+ } else -+ /* currently there is no such fs */ -+ WARN_ON_ONCE(h_file->f_op && h_file->f_op->write); -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_write_unlock(file); -+ out: -+ si_read_unlock(sb); -+ mutex_unlock(&inode->i_mutex); -+ return err; -+} -+ -+static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, -+ struct pipe_inode_info *pipe, size_t len, -+ unsigned int flags) -+{ -+ ssize_t err; -+ struct file *h_file; -+ struct dentry *dentry; -+ struct super_block *sb; -+ -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); -+ if (unlikely(err)) -+ goto out; -+ -+ err = -EINVAL; -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ if (au_test_loopback_kthread()) { -+ file->f_mapping = h_file->f_mapping; -+ smp_mb(); /* unnecessary? */ -+ } -+ err = vfsub_splice_to(h_file, ppos, pipe, len, flags); -+ /* todo: necessasry? 
*/ -+ /* file->f_ra = h_file->f_ra; */ -+ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); -+ -+ di_read_unlock(dentry, AuLock_IR); -+ fi_read_unlock(file); -+ -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+static ssize_t -+aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, -+ size_t len, unsigned int flags) -+{ -+ ssize_t err; -+ struct au_pin pin; -+ struct dentry *dentry; -+ struct inode *inode; -+ struct super_block *sb; -+ struct file *h_file; -+ -+ dentry = file->f_dentry; -+ inode = dentry->d_inode; -+ mutex_lock(&inode->i_mutex); -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ -+ err = au_ready_to_write(file, -1, &pin); -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ au_unpin(&pin); -+ err = vfsub_splice_from(pipe, h_file, ppos, len, flags); -+ au_cpup_attr_timesizes(inode); -+ inode->i_mode = h_file->f_dentry->d_inode->i_mode; -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_write_unlock(file); -+ out: -+ si_read_unlock(sb); -+ mutex_unlock(&inode->i_mutex); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static struct file *au_safe_file(struct vm_area_struct *vma) -+{ -+ struct file *file; -+ -+ file = vma->vm_file; -+ if (file->private_data && au_test_aufs(file->f_dentry->d_sb)) -+ return file; -+ return NULL; -+} -+ -+static void au_reset_file(struct vm_area_struct *vma, struct file *file) -+{ -+ vma->vm_file = file; -+ /* smp_mb(); */ /* flush vm_file */ -+} -+ -+static int aufs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -+{ -+ int err; -+ static DECLARE_WAIT_QUEUE_HEAD(wq); -+ struct file *file, *h_file; -+ struct au_finfo *finfo; -+ -+ /* todo: non-robr mode, user vm_file as it is? */ -+ wait_event(wq, (file = au_safe_file(vma))); -+ -+ /* do not revalidate, no si lock */ -+ finfo = au_fi(file); -+ h_file = finfo->fi_hfile[0 + finfo->fi_bstart].hf_file; -+ AuDebugOn(!h_file || !finfo->fi_h_vm_ops); -+ -+ fi_write_lock(file); -+ vma->vm_file = h_file; -+ err = finfo->fi_h_vm_ops->fault(vma, vmf); -+ /* todo: necessary? 
*/ -+ /* file->f_ra = h_file->f_ra; */ -+ au_reset_file(vma, file); -+ fi_write_unlock(file); -+#if 0 /* def CONFIG_SMP */ -+ /* wake_up_nr(&wq, online_cpu - 1); */ -+ wake_up_all(&wq); -+#else -+ wake_up(&wq); -+#endif -+ -+ return err; -+} -+ -+static int aufs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -+{ -+ int err; -+ static DECLARE_WAIT_QUEUE_HEAD(wq); -+ struct file *file, *h_file; -+ struct au_finfo *finfo; -+ -+ wait_event(wq, (file = au_safe_file(vma))); -+ -+ finfo = au_fi(file); -+ h_file = finfo->fi_hfile[0 + finfo->fi_bstart].hf_file; -+ AuDebugOn(!h_file || !finfo->fi_h_vm_ops); -+ -+ fi_write_lock(file); -+ vma->vm_file = h_file; -+ err = finfo->fi_h_vm_ops->page_mkwrite(vma, vmf); -+ au_reset_file(vma, file); -+ fi_write_unlock(file); -+ wake_up(&wq); -+ -+ return err; -+} -+ -+static void aufs_vm_close(struct vm_area_struct *vma) -+{ -+ static DECLARE_WAIT_QUEUE_HEAD(wq); -+ struct file *file, *h_file; -+ struct au_finfo *finfo; -+ -+ wait_event(wq, (file = au_safe_file(vma))); -+ -+ finfo = au_fi(file); -+ h_file = finfo->fi_hfile[0 + finfo->fi_bstart].hf_file; -+ AuDebugOn(!h_file || !finfo->fi_h_vm_ops); -+ -+ fi_write_lock(file); -+ vma->vm_file = h_file; -+ finfo->fi_h_vm_ops->close(vma); -+ au_reset_file(vma, file); -+ fi_write_unlock(file); -+ wake_up(&wq); -+} -+ -+static struct vm_operations_struct aufs_vm_ops = { -+ /* .close and .page_mkwrite are not set by default */ -+ .fault = aufs_fault, -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+static unsigned long au_prot_conv(unsigned long flags) -+{ -+ unsigned long prot; -+ -+ prot = 0; -+ if (flags & VM_READ) -+ prot |= PROT_READ; -+ if (flags & VM_WRITE) -+ prot |= PROT_WRITE; -+ if (flags & VM_EXEC) -+ prot |= PROT_EXEC; -+ return prot; -+} -+ -+static struct vm_operations_struct *au_vm_ops(struct file *h_file, -+ struct vm_area_struct *vma) -+{ -+ struct vm_operations_struct *vm_ops; -+ int err; -+ -+ vm_ops = ERR_PTR(-ENODEV); -+ if (!h_file->f_op || !h_file->f_op->mmap) -+ goto out; -+ -+ err = ima_file_mmap(h_file, au_prot_conv(vma->vm_flags)); -+ vm_ops = ERR_PTR(err); -+ if (err) -+ goto out; -+ -+ err = h_file->f_op->mmap(h_file, vma); -+ vm_ops = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out; -+ -+ vm_ops = vma->vm_ops; -+ err = do_munmap(current->mm, vma->vm_start, -+ vma->vm_end - vma->vm_start); -+ if (unlikely(err)) { -+ AuIOErr("failed internal unmapping %.*s, %d\n", -+ AuDLNPair(h_file->f_dentry), err); -+ vm_ops = ERR_PTR(-EIO); -+ } -+ -+ out: -+ return vm_ops; -+} -+ -+static int au_custom_vm_ops(struct au_finfo *finfo, struct vm_area_struct *vma) -+{ -+ int err; -+ struct vm_operations_struct *h_ops; -+ -+ AuRwMustAnyLock(&finfo->fi_rwsem); -+ -+ err = 0; -+ h_ops = finfo->fi_h_vm_ops; -+ AuDebugOn(!h_ops); -+ if ((!h_ops->page_mkwrite && !h_ops->close) -+ || finfo->fi_vm_ops) -+ goto out; -+ -+ err = -ENOMEM; -+ finfo->fi_vm_ops = kmemdup(&aufs_vm_ops, sizeof(aufs_vm_ops), GFP_NOFS); -+ if (unlikely(!finfo->fi_vm_ops)) -+ goto out; -+ -+ err = 0; -+ if (h_ops->page_mkwrite) -+ finfo->fi_vm_ops->page_mkwrite = aufs_page_mkwrite; -+ if (h_ops->close) -+ finfo->fi_vm_ops->close = aufs_vm_close; -+ -+ vma->vm_ops = finfo->fi_vm_ops; -+ -+ out: -+ return err; -+} -+ -+static int aufs_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ int err; -+ unsigned char wlock, mmapped; -+ struct dentry *dentry; -+ struct super_block *sb; -+ struct file *h_file; -+ struct vm_operations_struct *vm_ops; -+ -+ dentry = file->f_dentry; -+ 
wlock = !!(file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ -+ mmapped = !!au_test_mmapped(file); -+ if (wlock) { -+ struct au_pin pin; -+ -+ err = au_ready_to_write(file, -1, &pin); -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ goto out_unlock; -+ au_unpin(&pin); -+ } else -+ di_downgrade_lock(dentry, AuLock_IR); -+ -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ if (!mmapped && au_test_fs_bad_mapping(h_file->f_dentry->d_sb)) { -+ /* -+ * by this assignment, f_mapping will differ from the aufs inode's -+ * i_mapping. -+ * if someone else mixes the use of f_dentry->d_inode and -+ * f_mapping->host, then a problem may arise. -+ */ -+ file->f_mapping = h_file->f_mapping; -+ } -+ -+ vm_ops = NULL; -+ if (!mmapped) { -+ vm_ops = au_vm_ops(h_file, vma); -+ err = PTR_ERR(vm_ops); -+ if (IS_ERR(vm_ops)) -+ goto out_unlock; -+ } -+ -+ /* -+ * unnecessary to handle MAP_DENYWRITE and deny_write_access()? -+ * currently MAP_DENYWRITE from userspace is ignored, but the elf loader -+ * sets it. when FMODE_EXEC is set (by open_exec() or sys_uselib()), -+ * both the aufs file and the lower file are deny_write_access()-ed. -+ * finally I hope we can skip handling MAP_DENYWRITE here. -+ */ -+ err = generic_file_mmap(file, vma); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ vma->vm_ops = &aufs_vm_ops; -+ /* test again */ -+ if (!au_test_mmapped(file)) -+ au_fi(file)->fi_h_vm_ops = vm_ops; -+ -+ err = au_custom_vm_ops(au_fi(file), vma); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ vfsub_file_accessed(h_file); -+ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_write_unlock(file); -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int aufs_fsync_nondir(struct file *file, struct dentry *dentry, -+ int datasync) -+{ -+ int err; -+ struct au_pin pin; -+ struct inode *inode; -+ struct file *h_file; -+ struct super_block *sb; -+ -+ inode = dentry->d_inode; -+ IMustLock(file->f_mapping->host); -+ if (inode != file->f_mapping->host) { -+ mutex_unlock(&file->f_mapping->host->i_mutex); -+ mutex_lock(&inode->i_mutex); -+ } -+ IMustLock(inode); -+ -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ -+ err = 0; /* -EBADF; */ /* posix? */ -+ if (unlikely(!(file->f_mode & FMODE_WRITE))) -+ goto out; -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ -+ err = au_ready_to_write(file, -1, &pin); -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ goto out_unlock; -+ au_unpin(&pin); -+ -+ err = -EINVAL; -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ if (h_file->f_op && h_file->f_op->fsync) { -+ struct dentry *h_d; -+ struct mutex *h_mtx; -+ -+ /* -+ * no filemap_fdatawrite() since the aufs file has no -+ * mapping of its own, but the dir does. 
-+ */ -+ h_d = h_file->f_dentry; -+ h_mtx = &h_d->d_inode->i_mutex; -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ err = h_file->f_op->fsync(h_file, h_d, datasync); -+ if (!err) -+ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); -+ /*ignore*/ -+ au_cpup_attr_timesizes(inode); -+ mutex_unlock(h_mtx); -+ } -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_write_unlock(file); -+ out: -+ si_read_unlock(sb); -+ if (inode != file->f_mapping->host) { -+ mutex_unlock(&inode->i_mutex); -+ mutex_lock(&file->f_mapping->host->i_mutex); -+ } -+ return err; -+} -+ -+/* no one supports this operation, currently */ -+#if 0 -+static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) -+{ -+ int err; -+ struct au_pin pin; -+ struct dentry *dentry; -+ struct inode *inode; -+ struct file *file, *h_file; -+ struct super_block *sb; -+ -+ file = kio->ki_filp; -+ dentry = file->f_dentry; -+ inode = dentry->d_inode; -+ mutex_lock(&inode->i_mutex); -+ -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ -+ err = 0; /* -EBADF; */ /* posix? */ -+ if (unlikely(!(file->f_mode & FMODE_WRITE))) -+ goto out; -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); -+ if (unlikely(err)) -+ goto out; -+ -+ err = au_ready_to_write(file, -1, &pin); -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ goto out_unlock; -+ au_unpin(&pin); -+ -+ err = -ENOSYS; -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ if (h_file->f_op && h_file->f_op->aio_fsync) { -+ struct dentry *h_d; -+ struct mutex *h_mtx; -+ -+ h_d = h_file->f_dentry; -+ h_mtx = &h_d->d_inode->i_mutex; -+ if (!is_sync_kiocb(kio)) { -+ get_file(h_file); -+ fput(file); -+ } -+ kio->ki_filp = h_file; -+ err = h_file->f_op->aio_fsync(kio, datasync); -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ if (!err) -+ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); -+ /*ignore*/ -+ au_cpup_attr_timesizes(inode); -+ mutex_unlock(h_mtx); -+ } -+ -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ fi_write_unlock(file); -+ out: -+ si_read_unlock(sb); -+ mutex_unlock(&inode->i_mutex); -+ return err; -+} -+#endif -+ -+static int aufs_fasync(int fd, struct file *file, int flag) -+{ -+ int err; -+ struct file *h_file; -+ struct dentry *dentry; -+ struct super_block *sb; -+ -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); -+ if (unlikely(err)) -+ goto out; -+ -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ if (h_file->f_op && h_file->f_op->fasync) -+ err = h_file->f_op->fasync(fd, h_file, flag); -+ -+ di_read_unlock(dentry, AuLock_IR); -+ fi_read_unlock(file); -+ -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* no one supports this operation, currently */ -+#if 0 -+static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, -+ size_t len, loff_t *pos , int more) -+{ -+} -+#endif -+ -+/* ---------------------------------------------------------------------- */ -+ -+const struct file_operations aufs_file_fop = { -+ /* -+ * while generic_file_llseek/_unlocked() don't use BKL, -+ * don't use it since it operates file->f_mapping->host. -+ * in aufs, it may be a real file and may confuse users by UDBA. 
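/*
 * Nearly every entry in the aufs_file_fop table that follows implements
 * one pattern (see aufs_fasync() above): revalidate, pick the lower
 * file on the topmost branch, then delegate to its f_op if the lower fs
 * provides the operation. A stripped-down model of that dispatch; the
 * structs here are illustrative, not kernel types.
 */
#include <errno.h>
#include <stddef.h>

struct lower_ops {
    int (*fasync)(int fd, void *priv, int flag);    /* may be NULL */
};

struct lower_file {
    const struct lower_ops *ops;
    void *priv;
};

struct stacked_file {
    struct lower_file *h_file;  /* file on the topmost branch */
};

static int stacked_fasync(int fd, struct stacked_file *file, int flag)
{
    struct lower_file *h = file->h_file;

    /* delegate only when the lower layer implements the op */
    if (h->ops && h->ops->fasync)
        return h->ops->fasync(fd, h->priv, flag);
    return -ENOSYS;
}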
-+ */ -+ /* .llseek = generic_file_llseek, */ -+ -+ .read = aufs_read, -+ .write = aufs_write, -+ .aio_read = aufs_aio_read, -+ .aio_write = aufs_aio_write, -+#ifdef CONFIG_AUFS_POLL -+ .poll = aufs_poll, -+#endif -+ .mmap = aufs_mmap, -+ .open = aufs_open_nondir, -+ .flush = aufs_flush, -+ .release = aufs_release_nondir, -+ .fsync = aufs_fsync_nondir, -+ /* .aio_fsync = aufs_aio_fsync_nondir, */ -+ .fasync = aufs_fasync, -+ /* .sendpage = aufs_sendpage, */ -+ .splice_write = aufs_splice_write, -+ .splice_read = aufs_splice_read, -+#if 0 -+ .aio_splice_write = aufs_aio_splice_write, -+ .aio_splice_read = aufs_aio_splice_read -+#endif -+}; -diff -Nur linux-2.6.31-vanilla/fs/aufs/fstype.h linux-2.6.31/fs/aufs/fstype.h ---- linux-2.6.31-vanilla/fs/aufs/fstype.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/fstype.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,485 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * judging filesystem type -+ */ -+ -+#ifndef __AUFS_FSTYPE_H__ -+#define __AUFS_FSTYPE_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/cramfs_fs.h> -+#include <linux/fs.h> -+#include <linux/magic.h> -+#include <linux/romfs_fs.h> -+#include <linux/aufs_type.h> -+ -+static inline int au_test_aufs(struct super_block *sb) -+{ -+ return sb->s_magic == AUFS_SUPER_MAGIC; -+} -+ -+static inline const char *au_sbtype(struct super_block *sb) -+{ -+ return sb->s_type->name; -+} -+ -+static inline int au_test_iso9660(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE) -+ return sb->s_magic == ISOFS_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_romfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE) -+ return sb->s_magic == ROMFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_cramfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE) -+ return sb->s_magic == CRAMFS_MAGIC; -+#endif -+ return 0; -+} -+ -+static inline int au_test_nfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) -+ return sb->s_magic == NFS_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_fuse(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) -+ return sb->s_magic == FUSE_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_xfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE) -+ return sb->s_magic == XFS_SB_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_tmpfs(struct super_block *sb 
__maybe_unused) -+{ -+#ifdef CONFIG_TMPFS -+ return sb->s_magic == TMPFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE) -+ return !strcmp(au_sbtype(sb), "ecryptfs"); -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_smbfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_SMB_FS) || defined(CONFIG_SMB_FS_MODULE) -+ return sb->s_magic == SMB_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_ocfs2(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_OCFS2_FS) || defined(CONFIG_OCFS2_FS_MODULE) -+ return sb->s_magic == OCFS2_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_ocfs2_dlmfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_OCFS2_FS_O2CB) || defined(CONFIG_OCFS2_FS_O2CB_MODULE) -+ return sb->s_magic == DLMFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_coda(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_CODA_FS) || defined(CONFIG_CODA_FS_MODULE) -+ return sb->s_magic == CODA_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_v9fs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_9P_FS) || defined(CONFIG_9P_FS_MODULE) -+ return sb->s_magic == V9FS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_ext4(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_EXT4DEV_FS) || defined(CONFIG_EXT4DEV_FS_MODULE) -+ return sb->s_magic == EXT4_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_sysv(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_SYSV_FS) || defined(CONFIG_SYSV_FS_MODULE) -+ return !strcmp(au_sbtype(sb), "sysv"); -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_ramfs(struct super_block *sb) -+{ -+ return sb->s_magic == RAMFS_MAGIC; -+} -+ -+static inline int au_test_ubifs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE) -+ return sb->s_magic == UBIFS_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_procfs(struct super_block *sb __maybe_unused) -+{ -+#ifdef CONFIG_PROC_FS -+ return sb->s_magic == PROC_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_sysfs(struct super_block *sb __maybe_unused) -+{ -+#ifdef CONFIG_SYSFS -+ return sb->s_magic == SYSFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_configfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE) -+ return sb->s_magic == CONFIGFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_minix(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE) -+ return sb->s_magic == MINIX3_SUPER_MAGIC -+ || sb->s_magic == MINIX2_SUPER_MAGIC -+ || sb->s_magic == MINIX2_SUPER_MAGIC2 -+ || sb->s_magic == MINIX_SUPER_MAGIC -+ || sb->s_magic == MINIX_SUPER_MAGIC2; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_cifs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_CIFS_FS) || defined(CONFIG_CIFS_FS_MODULE) -+ return sb->s_magic == CIFS_MAGIC_NUMBER; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_fat(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_FAT_FS) || 
defined(CONFIG_FAT_FS_MODULE) -+ return sb->s_magic == MSDOS_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_msdos(struct super_block *sb) -+{ -+ return au_test_fat(sb); -+} -+ -+static inline int au_test_vfat(struct super_block *sb) -+{ -+ return au_test_fat(sb); -+} -+ -+static inline int au_test_securityfs(struct super_block *sb __maybe_unused) -+{ -+#ifdef CONFIG_SECURITYFS -+ return sb->s_magic == SECURITYFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_squashfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) -+ return sb->s_magic == SQUASHFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_btrfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) -+ return sb->s_magic == BTRFS_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_xenfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) -+ return sb->s_magic == XENFS_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_debugfs(struct super_block *sb __maybe_unused) -+{ -+#ifdef CONFIG_DEBUG_FS -+ return sb->s_magic == DEBUGFS_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+static inline int au_test_nilfs(struct super_block *sb __maybe_unused) -+{ -+#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) -+ return sb->s_magic == NILFS_SUPER_MAGIC; -+#else -+ return 0; -+#endif -+} -+ -+/* ---------------------------------------------------------------------- */ -+/* -+ * they can't be an aufs branch. -+ */ -+static inline int au_test_fs_unsuppoted(struct super_block *sb) -+{ -+ return -+#ifndef CONFIG_AUFS_BR_RAMFS -+ au_test_ramfs(sb) || -+#endif -+ au_test_procfs(sb) -+ || au_test_sysfs(sb) -+ || au_test_configfs(sb) -+ || au_test_debugfs(sb) -+ || au_test_securityfs(sb) -+ || au_test_xenfs(sb) -+ || au_test_ecryptfs(sb) -+ /* || !strcmp(au_sbtype(sb), "unionfs") */ -+ || au_test_aufs(sb); /* will be supported in next version */ -+} -+ -+/* -+ * If the filesystem supports NFS-export, then it has to support NULL as -+ * a nameidata parameter for ->create(), ->lookup() and ->d_revalidate(). -+ * We can apply this principle when we handle a lower filesystem. -+ */ -+static inline int au_test_fs_null_nd(struct super_block *sb) -+{ -+ return !!sb->s_export_op; -+} -+ -+static inline int au_test_fs_remote(struct super_block *sb) -+{ -+ return !au_test_tmpfs(sb) -+#ifdef CONFIG_AUFS_BR_RAMFS -+ && !au_test_ramfs(sb) -+#endif -+ && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * Note: these functions (below) are created after reading ->getattr() in all -+ * filesystems under linux/fs. it means we have to do so in every update... -+ */ -+ -+/* -+ * some filesystems require getattr to refresh the inode attributes before -+ * referencing. 
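/*
 * Each au_test_*() helper above identifies a lower fs by the magic
 * number in its superblock, compiled in only when that fs can be
 * present. The same check is available from userspace via statfs(2),
 * which this standalone sketch uses; "/dev/shm" is only an example
 * path.
 */
#include <stdio.h>
#include <sys/vfs.h>
#include <linux/magic.h>

static int is_tmpfs(const char *path)
{
    struct statfs st;

    if (statfs(path, &st) < 0)
        return 0;
    return st.f_type == TMPFS_MAGIC;    /* cf. au_test_tmpfs() */
}

int main(void)
{
    printf("/dev/shm on tmpfs: %d\n", is_tmpfs("/dev/shm"));
    return 0;
}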
-+ * in most cases, we can rely on the inode attribute in NFS (or every remote fs) -+ * and leave the work for d_revalidate() -+ */ -+static inline int au_test_fs_refresh_iattr(struct super_block *sb) -+{ -+ return au_test_nfs(sb) -+ || au_test_fuse(sb) -+ /* || au_test_smbfs(sb) */ /* untested */ -+ /* || au_test_ocfs2(sb) */ /* untested */ -+ /* || au_test_btrfs(sb) */ /* untested */ -+ /* || au_test_coda(sb) */ /* untested */ -+ /* || au_test_v9fs(sb) */ /* untested */ -+ ; -+} -+ -+/* -+ * filesystems which don't maintain i_size or i_blocks. -+ */ -+static inline int au_test_fs_bad_iattr_size(struct super_block *sb) -+{ -+ return au_test_xfs(sb) -+ /* || au_test_ext4(sb) */ /* untested */ -+ /* || au_test_ocfs2(sb) */ /* untested */ -+ /* || au_test_ocfs2_dlmfs(sb) */ /* untested */ -+ /* || au_test_sysv(sb) */ /* untested */ -+ /* || au_test_ubifs(sb) */ /* untested */ -+ /* || au_test_minix(sb) */ /* untested */ -+ ; -+} -+ -+/* -+ * filesystems which don't store the correct value in some of their inode -+ * attributes. -+ */ -+static inline int au_test_fs_bad_iattr(struct super_block *sb) -+{ -+ return au_test_fs_bad_iattr_size(sb) -+ /* || au_test_cifs(sb) */ /* untested */ -+ || au_test_fat(sb) -+ || au_test_msdos(sb) -+ || au_test_vfat(sb); -+} -+ -+/* they don't check i_nlink in link(2) */ -+static inline int au_test_fs_no_limit_nlink(struct super_block *sb) -+{ -+ return au_test_tmpfs(sb) -+#ifdef CONFIG_AUFS_BR_RAMFS -+ || au_test_ramfs(sb) -+#endif -+ || au_test_ubifs(sb); -+} -+ -+/* -+ * filesystems which sets S_NOATIME and S_NOCMTIME. -+ */ -+static inline int au_test_fs_notime(struct super_block *sb) -+{ -+ return au_test_nfs(sb) -+ || au_test_fuse(sb) -+ || au_test_ubifs(sb) -+ /* || au_test_cifs(sb) */ /* untested */ -+ ; -+} -+ -+/* -+ * filesystems which requires replacing i_mapping. -+ */ -+static inline int au_test_fs_bad_mapping(struct super_block *sb) -+{ -+ return au_test_fuse(sb) -+ || au_test_ubifs(sb); -+} -+ -+/* temporary support for i#1 in cramfs */ -+static inline int au_test_fs_unique_ino(struct inode *inode) -+{ -+ if (au_test_cramfs(inode->i_sb)) -+ return inode->i_ino != 1; -+ return 1; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * the filesystem where the xino files placed must support i/o after unlink and -+ * maintain i_size and i_blocks. -+ */ -+static inline int au_test_fs_bad_xino(struct super_block *sb) -+{ -+ return au_test_fs_remote(sb) -+ || au_test_fs_bad_iattr_size(sb) -+#ifdef CONFIG_AUFS_BR_RAMFS -+ || !(au_test_ramfs(sb) || au_test_fs_null_nd(sb)) -+#else -+ || !au_test_fs_null_nd(sb) /* to keep xino code simple */ -+#endif -+ /* don't want unnecessary work for xino */ -+ || au_test_aufs(sb) -+ || au_test_ecryptfs(sb) -+ || au_test_nilfs(sb); -+} -+ -+static inline int au_test_fs_trunc_xino(struct super_block *sb) -+{ -+ return au_test_tmpfs(sb) -+ || au_test_ramfs(sb); -+} -+ -+/* -+ * test if the @sb is real-readonly. -+ */ -+static inline int au_test_fs_rr(struct super_block *sb) -+{ -+ return au_test_squashfs(sb) -+ || au_test_iso9660(sb) -+ || au_test_cramfs(sb) -+ || au_test_romfs(sb); -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_FSTYPE_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/hinotify.c linux-2.6.31/fs/aufs/hinotify.c ---- linux-2.6.31-vanilla/fs/aufs/hinotify.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/hinotify.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,755 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inotify for the lower directories -+ */ -+ -+#include "aufs.h" -+ -+static const __u32 AuHinMask = (IN_MOVE | IN_DELETE | IN_CREATE); -+static struct inotify_handle *au_hin_handle; -+ -+AuCacheFuncs(hinotify, HINOTIFY); -+ -+int au_hin_alloc(struct au_hinode *hinode, struct inode *inode, -+ struct inode *h_inode) -+{ -+ int err; -+ struct au_hinotify *hin; -+ s32 wd; -+ -+ err = -ENOMEM; -+ hin = au_cache_alloc_hinotify(); -+ if (hin) { -+ AuDebugOn(hinode->hi_notify); -+ hinode->hi_notify = hin; -+ hin->hin_aufs_inode = inode; -+ -+ inotify_init_watch(&hin->hin_watch); -+ wd = inotify_add_watch(au_hin_handle, &hin->hin_watch, h_inode, -+ AuHinMask); -+ if (wd >= 0) -+ return 0; /* success */ -+ -+ err = wd; -+ put_inotify_watch(&hin->hin_watch); -+ au_cache_free_hinotify(hin); -+ hinode->hi_notify = NULL; -+ } -+ -+ return err; -+} -+ -+void au_hin_free(struct au_hinode *hinode) -+{ -+ int err; -+ struct au_hinotify *hin; -+ -+ hin = hinode->hi_notify; -+ if (hin) { -+ err = 0; -+ if (atomic_read(&hin->hin_watch.count)) -+ err = inotify_rm_watch(au_hin_handle, &hin->hin_watch); -+ if (unlikely(err)) -+ /* it means the watch is already removed */ -+ AuWarn("failed inotify_rm_watch() %d\n", err); -+ au_cache_free_hinotify(hin); -+ hinode->hi_notify = NULL; -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_hin_ctl(struct au_hinode *hinode, int do_set) -+{ -+ struct inode *h_inode; -+ struct inotify_watch *watch; -+ -+ if (!hinode->hi_notify) -+ return; -+ -+ h_inode = hinode->hi_inode; -+ IMustLock(h_inode); -+ -+ /* todo: try inotify_find_update_watch()? 
*/ -+ watch = &hinode->hi_notify->hin_watch; -+ mutex_lock(&h_inode->inotify_mutex); -+ /* mutex_lock(&watch->ih->mutex); */ -+ if (do_set) { -+ AuDebugOn(watch->mask & AuHinMask); -+ watch->mask |= AuHinMask; -+ } else { -+ AuDebugOn(!(watch->mask & AuHinMask)); -+ watch->mask &= ~AuHinMask; -+ } -+ /* mutex_unlock(&watch->ih->mutex); */ -+ mutex_unlock(&h_inode->inotify_mutex); -+} -+ -+void au_reset_hinotify(struct inode *inode, unsigned int flags) -+{ -+ aufs_bindex_t bindex, bend; -+ struct inode *hi; -+ struct dentry *iwhdentry; -+ -+ bend = au_ibend(inode); -+ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { -+ hi = au_h_iptr(inode, bindex); -+ if (!hi) -+ continue; -+ -+ /* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */ -+ iwhdentry = au_hi_wh(inode, bindex); -+ if (iwhdentry) -+ dget(iwhdentry); -+ au_igrab(hi); -+ au_set_h_iptr(inode, bindex, NULL, 0); -+ au_set_h_iptr(inode, bindex, au_igrab(hi), -+ flags & ~AuHi_XINO); -+ iput(hi); -+ dput(iwhdentry); -+ /* mutex_unlock(&hi->i_mutex); */ -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int hin_xino(struct inode *inode, struct inode *h_inode) -+{ -+ int err; -+ aufs_bindex_t bindex, bend, bfound, bstart; -+ struct inode *h_i; -+ -+ err = 0; -+ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { -+ AuWarn("branch root dir was changed\n"); -+ goto out; -+ } -+ -+ bfound = -1; -+ bend = au_ibend(inode); -+ bstart = au_ibstart(inode); -+#if 0 /* reserved for future use */ -+ if (bindex == bend) { -+ /* keep this ino in rename case */ -+ goto out; -+ } -+#endif -+ for (bindex = bstart; bindex <= bend; bindex++) { -+ if (au_h_iptr(inode, bindex) == h_inode) { -+ bfound = bindex; -+ break; -+ } -+ } -+ if (bfound < 0) -+ goto out; -+ -+ for (bindex = bstart; bindex <= bend; bindex++) { -+ h_i = au_h_iptr(inode, bindex); -+ if (!h_i) -+ continue; -+ -+ err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); -+ /* ignore this error */ -+ /* bad action? */ -+ } -+ -+ /* children inode number will be broken */ -+ -+ out: -+ AuTraceErr(err); -+ return err; -+} -+ -+static int hin_gen_tree(struct dentry *dentry) -+{ -+ int err, i, j, ndentry; -+ struct au_dcsub_pages dpages; -+ struct au_dpage *dpage; -+ struct dentry **dentries; -+ -+ err = au_dpages_init(&dpages, GFP_NOFS); -+ if (unlikely(err)) -+ goto out; -+ err = au_dcsub_pages(&dpages, dentry, NULL, NULL); -+ if (unlikely(err)) -+ goto out_dpages; -+ -+ for (i = 0; i < dpages.ndpage; i++) { -+ dpage = dpages.dpages + i; -+ dentries = dpage->dentries; -+ ndentry = dpage->ndentry; -+ for (j = 0; j < ndentry; j++) { -+ struct dentry *d; -+ -+ d = dentries[j]; -+ if (IS_ROOT(d)) -+ continue; -+ -+ d_drop(d); -+ au_digen_dec(d); -+ if (d->d_inode) -+ /* todo: reset children xino? -+ cached children only? */ -+ au_iigen_dec(d->d_inode); -+ } -+ } -+ -+ out_dpages: -+ au_dpages_free(&dpages); -+ -+ /* discard children */ -+ dentry_unhash(dentry); -+ dput(dentry); -+ out: -+ return err; -+} -+ -+/* -+ * return 0 if processed. 
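/*
 * au_hin_alloc() above registers an in-kernel inotify watch with mask
 * IN_MOVE | IN_DELETE | IN_CREATE on each lower directory. The
 * userspace equivalent, watching one directory for the same events
 * ("." is only an example path):
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
    char buf[4096];
    ssize_t len;
    int fd = inotify_init1(0);

    if (fd < 0 || inotify_add_watch(fd, ".",
                                    IN_MOVE | IN_DELETE | IN_CREATE) < 0) {
        perror("inotify");
        return 1;
    }
    len = read(fd, buf, sizeof(buf));   /* blocks for one event batch */
    if (len <= 0)
        return 1;
    for (char *p = buf; p < buf + len; ) {
        struct inotify_event *ev = (void *)p;

        printf("mask 0x%x name %s\n", ev->mask, ev->len ? ev->name : "");
        p += sizeof(*ev) + ev->len;
    }
    close(fd);
    return 0;
}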
-+ */ -+static int hin_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, -+ const unsigned int isdir) -+{ -+ int err; -+ struct dentry *d; -+ struct qstr *dname; -+ -+ err = 1; -+ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { -+ AuWarn("branch root dir was changed\n"); -+ err = 0; -+ goto out; -+ } -+ -+ if (!isdir) { -+ AuDebugOn(!name); -+ au_iigen_dec(inode); -+ spin_lock(&dcache_lock); -+ list_for_each_entry(d, &inode->i_dentry, d_alias) { -+ dname = &d->d_name; -+ if (dname->len != nlen -+ && memcmp(dname->name, name, nlen)) -+ continue; -+ err = 0; -+ spin_lock(&d->d_lock); -+ __d_drop(d); -+ au_digen_dec(d); -+ spin_unlock(&d->d_lock); -+ break; -+ } -+ spin_unlock(&dcache_lock); -+ } else { -+ au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIRS); -+ d = d_find_alias(inode); -+ if (!d) { -+ au_iigen_dec(inode); -+ goto out; -+ } -+ -+ dname = &d->d_name; -+ if (dname->len == nlen && !memcmp(dname->name, name, nlen)) -+ err = hin_gen_tree(d); -+ dput(d); -+ } -+ -+ out: -+ AuTraceErr(err); -+ return err; -+} -+ -+static int hin_gen_by_name(struct dentry *dentry, const unsigned int isdir) -+{ -+ int err; -+ struct inode *inode; -+ -+ inode = dentry->d_inode; -+ if (IS_ROOT(dentry) -+ /* || (inode && inode->i_ino == AUFS_ROOT_INO) */ -+ ) { -+ AuWarn("branch root dir was changed\n"); -+ return 0; -+ } -+ -+ err = 0; -+ if (!isdir) { -+ d_drop(dentry); -+ au_digen_dec(dentry); -+ if (inode) -+ au_iigen_dec(inode); -+ } else { -+ au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIRS); -+ if (inode) -+ err = hin_gen_tree(dentry); -+ } -+ -+ AuTraceErr(err); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* hinotify job flags */ -+#define AuHinJob_XINO0 1 -+#define AuHinJob_GEN (1 << 1) -+#define AuHinJob_DIRENT (1 << 2) -+#define AuHinJob_ISDIR (1 << 3) -+#define AuHinJob_TRYXINO0 (1 << 4) -+#define AuHinJob_MNTPNT (1 << 5) -+#define au_ftest_hinjob(flags, name) ((flags) & AuHinJob_##name) -+#define au_fset_hinjob(flags, name) { (flags) |= AuHinJob_##name; } -+#define au_fclr_hinjob(flags, name) { (flags) &= ~AuHinJob_##name; } -+ -+struct hin_job_args { -+ unsigned int flags; -+ struct inode *inode, *h_inode, *dir, *h_dir; -+ struct dentry *dentry; -+ char *h_name; -+ int h_nlen; -+}; -+ -+static int hin_job(struct hin_job_args *a) -+{ -+ const unsigned int isdir = au_ftest_hinjob(a->flags, ISDIR); -+ -+ /* reset xino */ -+ if (au_ftest_hinjob(a->flags, XINO0) && a->inode) -+ hin_xino(a->inode, a->h_inode); /* ignore this error */ -+ -+ if (au_ftest_hinjob(a->flags, TRYXINO0) -+ && a->inode -+ && a->h_inode) { -+ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); -+ if (!a->h_inode->i_nlink) -+ hin_xino(a->inode, a->h_inode); /* ignore this error */ -+ mutex_unlock(&a->h_inode->i_mutex); -+ } -+ -+ /* make the generation obsolete */ -+ if (au_ftest_hinjob(a->flags, GEN)) { -+ int err = -1; -+ if (a->inode) -+ err = hin_gen_by_inode(a->h_name, a->h_nlen, a->inode, -+ isdir); -+ if (err && a->dentry) -+ hin_gen_by_name(a->dentry, isdir); -+ /* ignore this error */ -+ } -+ -+ /* make dir entries obsolete */ -+ if (au_ftest_hinjob(a->flags, DIRENT) && a->inode) { -+ struct au_vdir *vdir; -+ -+ vdir = au_ivdir(a->inode); -+ if (vdir) -+ vdir->vd_jiffy = 0; -+ /* IMustLock(a->inode); */ -+ /* a->inode->i_version++; */ -+ } -+ -+ /* can do nothing but warn */ -+ if (au_ftest_hinjob(a->flags, MNTPNT) -+ && a->dentry -+ && d_mountpoint(a->dentry)) -+ AuWarn("mount-point %.*s is removed or renamed\n", -+ 
AuDLNPair(a->dentry)); -+ -+ return 0; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static char *in_name(u32 mask) -+{ -+#ifdef CONFIG_AUFS_DEBUG -+#define test_ret(flag) if (mask & flag) \ -+ return #flag; -+ test_ret(IN_ACCESS); -+ test_ret(IN_MODIFY); -+ test_ret(IN_ATTRIB); -+ test_ret(IN_CLOSE_WRITE); -+ test_ret(IN_CLOSE_NOWRITE); -+ test_ret(IN_OPEN); -+ test_ret(IN_MOVED_FROM); -+ test_ret(IN_MOVED_TO); -+ test_ret(IN_CREATE); -+ test_ret(IN_DELETE); -+ test_ret(IN_DELETE_SELF); -+ test_ret(IN_MOVE_SELF); -+ test_ret(IN_UNMOUNT); -+ test_ret(IN_Q_OVERFLOW); -+ test_ret(IN_IGNORED); -+ return ""; -+#undef test_ret -+#else -+ return "??"; -+#endif -+} -+ -+static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, -+ struct inode *dir) -+{ -+ struct dentry *dentry, *d, *parent; -+ struct qstr *dname; -+ -+ parent = d_find_alias(dir); -+ if (!parent) -+ return NULL; -+ -+ dentry = NULL; -+ spin_lock(&dcache_lock); -+ list_for_each_entry(d, &parent->d_subdirs, d_u.d_child) { -+ /* AuDbg("%.*s\n", AuDLNPair(d)); */ -+ dname = &d->d_name; -+ if (dname->len != nlen || memcmp(dname->name, name, nlen)) -+ continue; -+ if (!atomic_read(&d->d_count) || !d->d_fsdata) { -+ spin_lock(&d->d_lock); -+ __d_drop(d); -+ spin_unlock(&d->d_lock); -+ continue; -+ } -+ -+ dentry = dget(d); -+ break; -+ } -+ spin_unlock(&dcache_lock); -+ dput(parent); -+ -+ if (dentry) -+ di_write_lock_child(dentry); -+ -+ return dentry; -+} -+ -+static struct inode *lookup_wlock_by_ino(struct super_block *sb, -+ aufs_bindex_t bindex, ino_t h_ino) -+{ -+ struct inode *inode; -+ ino_t ino; -+ int err; -+ -+ inode = NULL; -+ err = au_xino_read(sb, bindex, h_ino, &ino); -+ if (!err && ino) -+ inode = ilookup(sb, ino); -+ if (!inode) -+ goto out; -+ -+ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { -+ AuWarn("wrong root branch\n"); -+ iput(inode); -+ inode = NULL; -+ goto out; -+ } -+ -+ ii_write_lock_child(inode); -+ -+ out: -+ return inode; -+} -+ -+enum { CHILD, PARENT }; -+struct postproc_args { -+ struct inode *h_dir, *dir, *h_child_inode; -+ u32 mask; -+ unsigned int flags[2]; -+ unsigned int h_child_nlen; -+ char h_child_name[]; -+}; -+ -+static void postproc(void *_args) -+{ -+ struct postproc_args *a = _args; -+ struct super_block *sb; -+ aufs_bindex_t bindex, bend, bfound; -+ unsigned char xino, try_iput; -+ int err; -+ struct inode *inode; -+ ino_t h_ino; -+ struct hin_job_args args; -+ struct dentry *dentry; -+ struct au_sbinfo *sbinfo; -+ -+ AuDebugOn(!_args); -+ AuDebugOn(!a->h_dir); -+ AuDebugOn(!a->dir); -+ AuDebugOn(!a->mask); -+ AuDbg("mask 0x%x %s, i%lu, hi%lu, hci%lu\n", -+ a->mask, in_name(a->mask), a->dir->i_ino, a->h_dir->i_ino, -+ a->h_child_inode ? a->h_child_inode->i_ino : 0); -+ -+ inode = NULL; -+ dentry = NULL; -+ /* -+ * do not lock a->dir->i_mutex here -+ * because of d_revalidate() may cause a deadlock. 
-+ */ -+ sb = a->dir->i_sb; -+ AuDebugOn(!sb); -+ sbinfo = au_sbi(sb); -+ AuDebugOn(!sbinfo); -+ /* big aufs lock */ -+ si_noflush_write_lock(sb); -+ -+ ii_read_lock_parent(a->dir); -+ bfound = -1; -+ bend = au_ibend(a->dir); -+ for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++) -+ if (au_h_iptr(a->dir, bindex) == a->h_dir) { -+ bfound = bindex; -+ break; -+ } -+ ii_read_unlock(a->dir); -+ if (unlikely(bfound < 0)) -+ goto out; -+ -+ xino = !!au_opt_test(au_mntflags(sb), XINO); -+ h_ino = 0; -+ if (a->h_child_inode) -+ h_ino = a->h_child_inode->i_ino; -+ -+ if (a->h_child_nlen -+ && (au_ftest_hinjob(a->flags[CHILD], GEN) -+ || au_ftest_hinjob(a->flags[CHILD], MNTPNT))) -+ dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, -+ a->dir); -+ try_iput = 0; -+ if (dentry) -+ inode = dentry->d_inode; -+ if (xino && !inode && h_ino -+ && (au_ftest_hinjob(a->flags[CHILD], XINO0) -+ || au_ftest_hinjob(a->flags[CHILD], TRYXINO0) -+ || au_ftest_hinjob(a->flags[CHILD], GEN))) { -+ inode = lookup_wlock_by_ino(sb, bfound, h_ino); -+ try_iput = 1; -+ } -+ -+ args.flags = a->flags[CHILD]; -+ args.dentry = dentry; -+ args.inode = inode; -+ args.h_inode = a->h_child_inode; -+ args.dir = a->dir; -+ args.h_dir = a->h_dir; -+ args.h_name = a->h_child_name; -+ args.h_nlen = a->h_child_nlen; -+ err = hin_job(&args); -+ if (dentry) { -+ if (dentry->d_fsdata) -+ di_write_unlock(dentry); -+ dput(dentry); -+ } -+ if (inode && try_iput) { -+ ii_write_unlock(inode); -+ iput(inode); -+ } -+ -+ ii_write_lock_parent(a->dir); -+ args.flags = a->flags[PARENT]; -+ args.dentry = NULL; -+ args.inode = a->dir; -+ args.h_inode = a->h_dir; -+ args.dir = NULL; -+ args.h_dir = NULL; -+ args.h_name = NULL; -+ args.h_nlen = 0; -+ err = hin_job(&args); -+ ii_write_unlock(a->dir); -+ -+ out: -+ au_nwt_done(&sbinfo->si_nowait); -+ si_write_unlock(sb); -+ -+ iput(a->h_child_inode); -+ iput(a->h_dir); -+ iput(a->dir); -+ kfree(a); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static void aufs_inotify(struct inotify_watch *watch, u32 wd __maybe_unused, -+ u32 mask, u32 cookie __maybe_unused, -+ const char *h_child_name, struct inode *h_child_inode) -+{ -+ struct au_hinotify *hinotify; -+ struct postproc_args *args; -+ int len, wkq_err; -+ unsigned char isdir, isroot, wh; -+ char *p; -+ struct inode *dir; -+ unsigned int flags[2]; -+ -+ /* if IN_UNMOUNT happens, there must be another bug */ -+ AuDebugOn(mask & IN_UNMOUNT); -+ if (mask & (IN_IGNORED | IN_UNMOUNT)) { -+ put_inotify_watch(watch); -+ return; -+ } -+#ifdef AuDbgHinotify -+ au_debug(1); -+ if (1 || !h_child_name || strcmp(h_child_name, AUFS_XINO_FNAME)) { -+ AuDbg("i%lu, wd %d, mask 0x%x %s, cookie 0x%x, hcname %s," -+ " hi%lu\n", -+ watch->inode->i_ino, wd, mask, in_name(mask), cookie, -+ h_child_name ? h_child_name : "", -+ h_child_inode ? 
h_child_inode->i_ino : 0); -+ WARN_ON(1); -+ } -+ au_debug(0); -+#endif -+ -+ hinotify = container_of(watch, struct au_hinotify, hin_watch); -+ AuDebugOn(!hinotify || !hinotify->hin_aufs_inode); -+ dir = igrab(hinotify->hin_aufs_inode); -+ if (!dir) -+ return; -+ -+ isroot = (dir->i_ino == AUFS_ROOT_INO); -+ len = 0; -+ wh = 0; -+ if (h_child_name) { -+ len = strlen(h_child_name); -+ if (!memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { -+ h_child_name += AUFS_WH_PFX_LEN; -+ len -= AUFS_WH_PFX_LEN; -+ wh = 1; -+ } -+ } -+ -+ isdir = 0; -+ if (h_child_inode) -+ isdir = !!S_ISDIR(h_child_inode->i_mode); -+ flags[PARENT] = AuHinJob_ISDIR; -+ flags[CHILD] = 0; -+ if (isdir) -+ flags[CHILD] = AuHinJob_ISDIR; -+ switch (mask & IN_ALL_EVENTS) { -+ case IN_MOVED_FROM: -+ case IN_MOVED_TO: -+ AuDebugOn(!h_child_name || !h_child_inode); -+ au_fset_hinjob(flags[CHILD], GEN); -+ au_fset_hinjob(flags[CHILD], XINO0); -+ au_fset_hinjob(flags[CHILD], MNTPNT); -+ au_fset_hinjob(flags[PARENT], DIRENT); -+ break; -+ -+ case IN_CREATE: -+ AuDebugOn(!h_child_name || !h_child_inode); -+ au_fset_hinjob(flags[PARENT], DIRENT); -+ au_fset_hinjob(flags[CHILD], GEN); -+ break; -+ -+ case IN_DELETE: -+ /* -+ * aufs never be able to get this child inode. -+ * revalidation should be in d_revalidate() -+ * by checking i_nlink, i_generation or d_unhashed(). -+ */ -+ AuDebugOn(!h_child_name); -+ au_fset_hinjob(flags[PARENT], DIRENT); -+ au_fset_hinjob(flags[CHILD], GEN); -+ au_fset_hinjob(flags[CHILD], TRYXINO0); -+ au_fset_hinjob(flags[CHILD], MNTPNT); -+ break; -+ -+ default: -+ AuDebugOn(1); -+ } -+ -+ if (wh) -+ h_child_inode = NULL; -+ -+ /* iput() and kfree() will be called in postproc() */ -+ /* -+ * inotify_mutex is already acquired and kmalloc/prune_icache may lock -+ * iprune_mutex. strange. 
-+ */ -+ lockdep_off(); -+ args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); -+ lockdep_on(); -+ if (unlikely(!args)) { -+ AuErr1("no memory\n"); -+ iput(dir); -+ return; -+ } -+ args->flags[PARENT] = flags[PARENT]; -+ args->flags[CHILD] = flags[CHILD]; -+ args->mask = mask; -+ args->dir = dir; -+ args->h_dir = igrab(watch->inode); -+ if (h_child_inode) -+ h_child_inode = igrab(h_child_inode); /* can be NULL */ -+ args->h_child_inode = h_child_inode; -+ args->h_child_nlen = len; -+ if (len) { -+ p = (void *)args; -+ p += sizeof(*args); -+ memcpy(p, h_child_name, len + 1); -+ } -+ -+ lockdep_off(); -+ wkq_err = au_wkq_nowait(postproc, args, dir->i_sb); -+ lockdep_on(); -+ if (unlikely(wkq_err)) -+ AuErr("wkq %d\n", wkq_err); -+} -+ -+static void aufs_inotify_destroy(struct inotify_watch *watch __maybe_unused) -+{ -+ return; -+} -+ -+static struct inotify_operations aufs_inotify_ops = { -+ .handle_event = aufs_inotify, -+ .destroy_watch = aufs_inotify_destroy -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+static void au_hin_destroy_cache(void) -+{ -+ kmem_cache_destroy(au_cachep[AuCache_HINOTIFY]); -+ au_cachep[AuCache_HINOTIFY] = NULL; -+} -+ -+int __init au_hinotify_init(void) -+{ -+ int err; -+ -+ err = -ENOMEM; -+ au_cachep[AuCache_HINOTIFY] = AuCache(au_hinotify); -+ if (au_cachep[AuCache_HINOTIFY]) { -+ err = 0; -+ au_hin_handle = inotify_init(&aufs_inotify_ops); -+ if (IS_ERR(au_hin_handle)) { -+ err = PTR_ERR(au_hin_handle); -+ au_hin_destroy_cache(); -+ } -+ } -+ AuTraceErr(err); -+ return err; -+} -+ -+void au_hinotify_fin(void) -+{ -+ inotify_destroy(au_hin_handle); -+ if (au_cachep[AuCache_HINOTIFY]) -+ au_hin_destroy_cache(); -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/iinfo.c linux-2.6.31/fs/aufs/iinfo.c ---- linux-2.6.31-vanilla/fs/aufs/iinfo.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/iinfo.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,283 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inode private data -+ */ -+ -+#include "aufs.h" -+ -+struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) -+{ -+ struct inode *h_inode; -+ -+ IiMustAnyLock(inode); -+ -+ h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode; -+ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); -+ return h_inode; -+} -+ -+/* todo: hard/soft set? 
*/ -+void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex) -+{ -+ struct au_iinfo *iinfo = au_ii(inode); -+ struct inode *h_inode; -+ -+ IiMustWriteLock(inode); -+ -+ iinfo->ii_bstart = bindex; -+ h_inode = iinfo->ii_hinode[bindex + 0].hi_inode; -+ if (h_inode) -+ au_cpup_igen(inode, h_inode); -+} -+ -+void au_hiput(struct au_hinode *hinode) -+{ -+ au_hin_free(hinode); -+ dput(hinode->hi_whdentry); -+ iput(hinode->hi_inode); -+} -+ -+unsigned int au_hi_flags(struct inode *inode, int isdir) -+{ -+ unsigned int flags; -+ const unsigned int mnt_flags = au_mntflags(inode->i_sb); -+ -+ flags = 0; -+ if (au_opt_test(mnt_flags, XINO)) -+ au_fset_hi(flags, XINO); -+ if (isdir && au_opt_test(mnt_flags, UDBA_HINOTIFY)) -+ au_fset_hi(flags, HINOTIFY); -+ return flags; -+} -+ -+void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, -+ struct inode *h_inode, unsigned int flags) -+{ -+ struct au_hinode *hinode; -+ struct inode *hi; -+ struct au_iinfo *iinfo = au_ii(inode); -+ -+ IiMustWriteLock(inode); -+ -+ hinode = iinfo->ii_hinode + bindex; -+ hi = hinode->hi_inode; -+ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); -+ AuDebugOn(h_inode && hi); -+ -+ if (hi) -+ au_hiput(hinode); -+ hinode->hi_inode = h_inode; -+ if (h_inode) { -+ int err; -+ struct super_block *sb = inode->i_sb; -+ struct au_branch *br; -+ -+ if (bindex == iinfo->ii_bstart) -+ au_cpup_igen(inode, h_inode); -+ br = au_sbr(sb, bindex); -+ hinode->hi_id = br->br_id; -+ if (au_ftest_hi(flags, XINO)) { -+ err = au_xino_write(sb, bindex, h_inode->i_ino, -+ inode->i_ino); -+ if (unlikely(err)) -+ AuIOErr1("failed au_xino_write() %d\n", err); -+ } -+ -+ if (au_ftest_hi(flags, HINOTIFY) -+ && au_br_hinotifyable(br->br_perm)) { -+ err = au_hin_alloc(hinode, inode, h_inode); -+ if (unlikely(err)) -+ AuIOErr1("au_hin_alloc() %d\n", err); -+ } -+ } -+} -+ -+void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, -+ struct dentry *h_wh) -+{ -+ struct au_hinode *hinode; -+ -+ IiMustWriteLock(inode); -+ -+ hinode = au_ii(inode)->ii_hinode + bindex; -+ AuDebugOn(hinode->hi_whdentry); -+ hinode->hi_whdentry = h_wh; -+} -+ -+void au_update_iigen(struct inode *inode) -+{ -+ atomic_set(&au_ii(inode)->ii_generation, au_sigen(inode->i_sb)); -+ /* smp_mb(); */ /* atomic_set */ -+} -+ -+/* it may be called at remount time, too */ -+void au_update_brange(struct inode *inode, int do_put_zero) -+{ -+ struct au_iinfo *iinfo; -+ -+ iinfo = au_ii(inode); -+ if (!iinfo || iinfo->ii_bstart < 0) -+ return; -+ -+ IiMustWriteLock(inode); -+ -+ if (do_put_zero) { -+ aufs_bindex_t bindex; -+ -+ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; -+ bindex++) { -+ struct inode *h_i; -+ -+ h_i = iinfo->ii_hinode[0 + bindex].hi_inode; -+ if (h_i && !h_i->i_nlink) -+ au_set_h_iptr(inode, bindex, NULL, 0); -+ } -+ } -+ -+ iinfo->ii_bstart = -1; -+ while (++iinfo->ii_bstart <= iinfo->ii_bend) -+ if (iinfo->ii_hinode[0 + iinfo->ii_bstart].hi_inode) -+ break; -+ if (iinfo->ii_bstart > iinfo->ii_bend) { -+ iinfo->ii_bstart = -1; -+ iinfo->ii_bend = -1; -+ return; -+ } -+ -+ iinfo->ii_bend++; -+ while (0 <= --iinfo->ii_bend) -+ if (iinfo->ii_hinode[0 + iinfo->ii_bend].hi_inode) -+ break; -+ AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend || iinfo->ii_bend < 0); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_iinfo_init(struct inode *inode) -+{ -+ struct au_iinfo *iinfo; -+ struct super_block *sb; -+ int nbr, i; -+ -+ sb = inode->i_sb; -+ iinfo = &(container_of(inode, struct au_icntnr, 
vfs_inode)->iinfo); -+ nbr = au_sbend(sb) + 1; -+ if (unlikely(nbr <= 0)) -+ nbr = 1; -+ iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); -+ if (iinfo->ii_hinode) { -+ for (i = 0; i < nbr; i++) -+ iinfo->ii_hinode[i].hi_id = -1; -+ -+ atomic_set(&iinfo->ii_generation, au_sigen(sb)); -+ /* smp_mb(); */ /* atomic_set */ -+ au_rw_init(&iinfo->ii_rwsem); -+ iinfo->ii_bstart = -1; -+ iinfo->ii_bend = -1; -+ iinfo->ii_vdir = NULL; -+ return 0; -+ } -+ return -ENOMEM; -+} -+ -+int au_ii_realloc(struct au_iinfo *iinfo, int nbr) -+{ -+ int err, sz; -+ struct au_hinode *hip; -+ -+ AuRwMustWriteLock(&iinfo->ii_rwsem); -+ -+ err = -ENOMEM; -+ sz = sizeof(*hip) * (iinfo->ii_bend + 1); -+ if (!sz) -+ sz = sizeof(*hip); -+ hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS); -+ if (hip) { -+ iinfo->ii_hinode = hip; -+ err = 0; -+ } -+ -+ return err; -+} -+ -+static int au_iinfo_write0(struct super_block *sb, struct au_hinode *hinode, -+ ino_t ino) -+{ -+ int err; -+ aufs_bindex_t bindex; -+ unsigned char locked; -+ -+ err = 0; -+ locked = !!si_noflush_read_trylock(sb); -+ bindex = au_br_index(sb, hinode->hi_id); -+ if (bindex >= 0) -+ err = au_xino_write0(sb, bindex, hinode->hi_inode->i_ino, ino); -+ /* error action? */ -+ if (locked) -+ si_read_unlock(sb); -+ return err; -+} -+ -+void au_iinfo_fin(struct inode *inode) -+{ -+ ino_t ino; -+ aufs_bindex_t bend; -+ unsigned char unlinked = !inode->i_nlink; -+ struct au_iinfo *iinfo; -+ struct au_hinode *hi; -+ struct super_block *sb; -+ -+ if (unlinked) { -+ int err = au_xigen_inc(inode); -+ if (unlikely(err)) -+ AuWarn1("failed resetting i_generation, %d\n", err); -+ } -+ -+ iinfo = au_ii(inode); -+ /* bad_inode case */ -+ if (!iinfo) -+ return; -+ -+ if (iinfo->ii_vdir) -+ au_vdir_free(iinfo->ii_vdir); -+ -+ if (iinfo->ii_bstart >= 0) { -+ sb = inode->i_sb; -+ ino = 0; -+ if (unlinked) -+ ino = inode->i_ino; -+ hi = iinfo->ii_hinode + iinfo->ii_bstart; -+ bend = iinfo->ii_bend; -+ while (iinfo->ii_bstart++ <= bend) { -+ if (hi->hi_inode) { -+ if (unlinked || !hi->hi_inode->i_nlink) { -+ au_iinfo_write0(sb, hi, ino); -+ /* ignore this error */ -+ ino = 0; -+ } -+ au_hiput(hi); -+ } -+ hi++; -+ } -+ } -+ -+ kfree(iinfo->ii_hinode); -+ AuRwDestroy(&iinfo->ii_rwsem); -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/inode.c linux-2.6.31/fs/aufs/inode.c ---- linux-2.6.31-vanilla/fs/aufs/inode.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/inode.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,413 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inode functions -+ */ -+ -+#include "aufs.h" -+ -+struct inode *au_igrab(struct inode *inode) -+{ -+ if (inode) { -+ AuDebugOn(!atomic_read(&inode->i_count)); -+ atomic_inc_return(&inode->i_count); -+ } -+ return inode; -+} -+ -+static void au_refresh_hinode_attr(struct inode *inode, int do_version) -+{ -+ au_cpup_attr_all(inode, /*force*/0); -+ au_update_iigen(inode); -+ if (do_version) -+ inode->i_version++; -+} -+ -+int au_refresh_hinode_self(struct inode *inode, int do_attr) -+{ -+ int err; -+ aufs_bindex_t bindex, new_bindex; -+ unsigned char update; -+ struct inode *first; -+ struct au_hinode *p, *q, tmp; -+ struct super_block *sb; -+ struct au_iinfo *iinfo; -+ -+ IiMustWriteLock(inode); -+ -+ update = 0; -+ sb = inode->i_sb; -+ iinfo = au_ii(inode); -+ err = au_ii_realloc(iinfo, au_sbend(sb) + 1); -+ if (unlikely(err)) -+ goto out; -+ -+ p = iinfo->ii_hinode + iinfo->ii_bstart; -+ first = p->hi_inode; -+ err = 0; -+ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; -+ bindex++, p++) { -+ if (!p->hi_inode) -+ continue; -+ -+ new_bindex = au_br_index(sb, p->hi_id); -+ if (new_bindex == bindex) -+ continue; -+ -+ if (new_bindex < 0) { -+ update++; -+ au_hiput(p); -+ p->hi_inode = NULL; -+ continue; -+ } -+ -+ if (new_bindex < iinfo->ii_bstart) -+ iinfo->ii_bstart = new_bindex; -+ if (iinfo->ii_bend < new_bindex) -+ iinfo->ii_bend = new_bindex; -+ /* swap two lower inode, and loop again */ -+ q = iinfo->ii_hinode + new_bindex; -+ tmp = *q; -+ *q = *p; -+ *p = tmp; -+ if (tmp.hi_inode) { -+ bindex--; -+ p--; -+ } -+ } -+ au_update_brange(inode, /*do_put_zero*/0); -+ if (do_attr) -+ au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); -+ -+ out: -+ return err; -+} -+ -+int au_refresh_hinode(struct inode *inode, struct dentry *dentry) -+{ -+ int err, update; -+ unsigned int flags; -+ aufs_bindex_t bindex, bend; -+ unsigned char isdir; -+ struct inode *first; -+ struct au_hinode *p; -+ struct au_iinfo *iinfo; -+ -+ err = au_refresh_hinode_self(inode, /*do_attr*/0); -+ if (unlikely(err)) -+ goto out; -+ -+ update = 0; -+ iinfo = au_ii(inode); -+ p = iinfo->ii_hinode + iinfo->ii_bstart; -+ first = p->hi_inode; -+ isdir = S_ISDIR(inode->i_mode); -+ flags = au_hi_flags(inode, isdir); -+ bend = au_dbend(dentry); -+ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { -+ struct inode *h_i; -+ struct dentry *h_d; -+ -+ h_d = au_h_dptr(dentry, bindex); -+ if (!h_d || !h_d->d_inode) -+ continue; -+ -+ if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) { -+ h_i = au_h_iptr(inode, bindex); -+ if (h_i) { -+ if (h_i == h_d->d_inode) -+ continue; -+ err = -EIO; -+ break; -+ } -+ } -+ if (bindex < iinfo->ii_bstart) -+ iinfo->ii_bstart = bindex; -+ if (iinfo->ii_bend < bindex) -+ iinfo->ii_bend = bindex; -+ au_set_h_iptr(inode, bindex, au_igrab(h_d->d_inode), flags); -+ update = 1; -+ } -+ au_update_brange(inode, /*do_put_zero*/0); -+ -+ if (unlikely(err)) -+ goto out; -+ -+ au_refresh_hinode_attr(inode, update && isdir); -+ -+ out: -+ return err; -+} -+ -+static int set_inode(struct inode *inode, struct dentry *dentry) -+{ -+ int err; -+ unsigned int flags; -+ umode_t mode; -+ aufs_bindex_t bindex, bstart, btail; -+ unsigned char isdir; -+ struct dentry *h_dentry; -+ struct inode *h_inode; -+ struct au_iinfo *iinfo; -+ -+ 
IiMustWriteLock(inode); -+ -+ err = 0; -+ isdir = 0; -+ bstart = au_dbstart(dentry); -+ h_inode = au_h_dptr(dentry, bstart)->d_inode; -+ mode = h_inode->i_mode; -+ switch (mode & S_IFMT) { -+ case S_IFREG: -+ btail = au_dbtail(dentry); -+ inode->i_op = &aufs_iop; -+ inode->i_fop = &aufs_file_fop; -+ inode->i_mapping->a_ops = &aufs_aop; -+ break; -+ case S_IFDIR: -+ isdir = 1; -+ btail = au_dbtaildir(dentry); -+ inode->i_op = &aufs_dir_iop; -+ inode->i_fop = &aufs_dir_fop; -+ break; -+ case S_IFLNK: -+ btail = au_dbtail(dentry); -+ inode->i_op = &aufs_symlink_iop; -+ break; -+ case S_IFBLK: -+ case S_IFCHR: -+ case S_IFIFO: -+ case S_IFSOCK: -+ btail = au_dbtail(dentry); -+ inode->i_op = &aufs_iop; -+ init_special_inode(inode, mode, h_inode->i_rdev); -+ break; -+ default: -+ AuIOErr("Unknown file type 0%o\n", mode); -+ err = -EIO; -+ goto out; -+ } -+ -+ /* do not set inotify for whiteouted dirs (SHWH mode) */ -+ flags = au_hi_flags(inode, isdir); -+ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) -+ && au_ftest_hi(flags, HINOTIFY) -+ && dentry->d_name.len > AUFS_WH_PFX_LEN -+ && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) -+ au_fclr_hi(flags, HINOTIFY); -+ iinfo = au_ii(inode); -+ iinfo->ii_bstart = bstart; -+ iinfo->ii_bend = btail; -+ for (bindex = bstart; bindex <= btail; bindex++) { -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (h_dentry) -+ au_set_h_iptr(inode, bindex, -+ au_igrab(h_dentry->d_inode), flags); -+ } -+ au_cpup_attr_all(inode, /*force*/1); -+ -+ out: -+ return err; -+} -+ -+/* successful returns with iinfo write_locked */ -+static int reval_inode(struct inode *inode, struct dentry *dentry, int *matched) -+{ -+ int err; -+ aufs_bindex_t bindex, bend; -+ struct inode *h_inode, *h_dinode; -+ -+ *matched = 0; -+ -+ /* -+ * before this function, if aufs got any iinfo lock, it must be only -+ * one, the parent dir. -+ * it can happen by UDBA and the obsoleted inode number. -+ */ -+ err = -EIO; -+ if (unlikely(inode->i_ino == parent_ino(dentry))) -+ goto out; -+ -+ err = 0; -+ ii_write_lock_new_child(inode); -+ h_dinode = au_h_dptr(dentry, au_dbstart(dentry))->d_inode; -+ bend = au_ibend(inode); -+ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { -+ h_inode = au_h_iptr(inode, bindex); -+ if (h_inode && h_inode == h_dinode) { -+ *matched = 1; -+ err = 0; -+ if (au_iigen(inode) != au_digen(dentry)) -+ err = au_refresh_hinode(inode, dentry); -+ break; -+ } -+ } -+ -+ if (unlikely(err)) -+ ii_write_unlock(inode); -+ out: -+ return err; -+} -+ -+int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ unsigned int d_type, ino_t *ino) -+{ -+ int err; -+ struct mutex *mtx; -+ const int isdir = (d_type == DT_DIR); -+ -+ /* prevent hardlinks from race condition */ -+ mtx = NULL; -+ if (!isdir) { -+ mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; -+ mutex_lock(mtx); -+ } -+ err = au_xino_read(sb, bindex, h_ino, ino); -+ if (unlikely(err)) -+ goto out; -+ -+ if (!*ino) { -+ err = -EIO; -+ *ino = au_xino_new_ino(sb); -+ if (unlikely(!*ino)) -+ goto out; -+ err = au_xino_write(sb, bindex, h_ino, *ino); -+ if (unlikely(err)) -+ goto out; -+ } -+ -+ out: -+ if (!isdir) -+ mutex_unlock(mtx); -+ return err; -+} -+ -+/* successful returns with iinfo write_locked */ -+/* todo: return with unlocked? 
*/ -+struct inode *au_new_inode(struct dentry *dentry, int must_new) -+{ -+ struct inode *inode; -+ struct dentry *h_dentry; -+ struct super_block *sb; -+ ino_t h_ino, ino; -+ int err, match; -+ aufs_bindex_t bstart; -+ -+ sb = dentry->d_sb; -+ bstart = au_dbstart(dentry); -+ h_dentry = au_h_dptr(dentry, bstart); -+ h_ino = h_dentry->d_inode->i_ino; -+ err = au_xino_read(sb, bstart, h_ino, &ino); -+ inode = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out; -+ new_ino: -+ if (!ino) { -+ ino = au_xino_new_ino(sb); -+ if (unlikely(!ino)) { -+ inode = ERR_PTR(-EIO); -+ goto out; -+ } -+ } -+ -+ AuDbg("i%lu\n", (unsigned long)ino); -+ inode = au_iget_locked(sb, ino); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out; -+ -+ AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); -+ if (inode->i_state & I_NEW) { -+ ii_write_lock_new_child(inode); -+ err = set_inode(inode, dentry); -+ unlock_new_inode(inode); -+ if (!err) -+ goto out; /* success */ -+ -+ iget_failed(inode); -+ ii_write_unlock(inode); -+ goto out_iput; -+ } else if (!must_new) { -+ err = reval_inode(inode, dentry, &match); -+ if (!err) -+ goto out; /* success */ -+ else if (match) -+ goto out_iput; -+ } -+ -+ if (unlikely(au_test_fs_unique_ino(h_dentry->d_inode))) -+ AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," -+ " b%d, %s, %.*s, hi%lu, i%lu.\n", -+ bstart, au_sbtype(h_dentry->d_sb), AuDLNPair(dentry), -+ (unsigned long)h_ino, (unsigned long)ino); -+ ino = 0; -+ err = au_xino_write(sb, bstart, h_ino, /*ino*/0); -+ if (!err) { -+ iput(inode); -+ goto new_ino; -+ } -+ -+ out_iput: -+ iput(inode); -+ inode = ERR_PTR(err); -+ out: -+ return inode; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, -+ struct inode *inode) -+{ -+ int err; -+ -+ err = au_br_rdonly(au_sbr(sb, bindex)); -+ -+ /* pseudo-link after flushed may happen out of bounds */ -+ if (!err -+ && inode -+ && au_ibstart(inode) <= bindex -+ && bindex <= au_ibend(inode)) { -+ /* -+ * permission check is unnecessary since vfsub routine -+ * will be called later -+ */ -+ struct inode *hi = au_h_iptr(inode, bindex); -+ if (hi) -+ err = IS_IMMUTABLE(hi) ? -EROFS : 0; -+ } -+ -+ return err; -+} -+ -+int au_test_h_perm(struct inode *h_inode, int mask) -+{ -+ if (!current_fsuid()) -+ return 0; -+ return inode_permission(h_inode, mask); -+} -+ -+int au_test_h_perm_sio(struct inode *h_inode, int mask) -+{ -+ if (au_test_nfs(h_inode->i_sb) -+ && (mask & MAY_WRITE) -+ && S_ISDIR(h_inode->i_mode)) -+ mask |= MAY_READ; /* force permission check */ -+ return au_test_h_perm(h_inode, mask); -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/inode.h linux-2.6.31/fs/aufs/inode.h ---- linux-2.6.31-vanilla/fs/aufs/inode.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/inode.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,497 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inode operations -+ */ -+ -+#ifndef __AUFS_INODE_H__ -+#define __AUFS_INODE_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/fs.h> -+#include <linux/inotify.h> -+#include <linux/aufs_type.h> -+#include "rwsem.h" -+ -+struct vfsmount; -+ -+struct au_hinotify { -+#ifdef CONFIG_AUFS_HINOTIFY -+ struct inotify_watch hin_watch; -+ struct inode *hin_aufs_inode; /* no get/put */ -+#endif -+}; -+ -+struct au_hinode { -+ struct inode *hi_inode; -+ aufs_bindex_t hi_id; -+#ifdef CONFIG_AUFS_HINOTIFY -+ struct au_hinotify *hi_notify; -+#endif -+ -+ /* reference to the copied-up whiteout with get/put */ -+ struct dentry *hi_whdentry; -+}; -+ -+struct au_vdir; -+struct au_iinfo { -+ atomic_t ii_generation; -+ struct super_block *ii_hsb1; /* no get/put */ -+ -+ struct au_rwsem ii_rwsem; -+ aufs_bindex_t ii_bstart, ii_bend; -+ __u32 ii_higen; -+ struct au_hinode *ii_hinode; -+ struct au_vdir *ii_vdir; -+}; -+ -+struct au_icntnr { -+ struct au_iinfo iinfo; -+ struct inode vfs_inode; -+}; -+ -+/* au_pin flags */ -+#define AuPin_DI_LOCKED 1 -+#define AuPin_MNT_WRITE (1 << 1) -+#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) -+#define au_fset_pin(flags, name) { (flags) |= AuPin_##name; } -+#define au_fclr_pin(flags, name) { (flags) &= ~AuPin_##name; } -+ -+struct au_pin { -+ /* input */ -+ struct dentry *dentry; -+ unsigned int udba; -+ unsigned char lsc_di, lsc_hi, flags; -+ aufs_bindex_t bindex; -+ -+ /* output */ -+ struct dentry *parent; -+ struct au_hinode *hdir; -+ struct vfsmount *h_mnt; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline struct au_iinfo *au_ii(struct inode *inode) -+{ -+ struct au_iinfo *iinfo; -+ -+ iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); -+ if (iinfo->ii_hinode) -+ return iinfo; -+ return NULL; /* debugging bad_inode case */ -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* inode.c */ -+struct inode *au_igrab(struct inode *inode); -+int au_refresh_hinode_self(struct inode *inode, int do_attr); -+int au_refresh_hinode(struct inode *inode, struct dentry *dentry); -+int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ unsigned int d_type, ino_t *ino); -+struct inode *au_new_inode(struct dentry *dentry, int must_new); -+int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, -+ struct inode *inode); -+int au_test_h_perm(struct inode *h_inode, int mask); -+int au_test_h_perm_sio(struct inode *h_inode, int mask); -+ -+static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, -+ ino_t h_ino, unsigned int d_type, ino_t *ino) -+{ -+#ifdef CONFIG_AUFS_SHWH -+ return au_ino(sb, bindex, h_ino, d_type, ino); -+#else -+ return 0; -+#endif -+} -+ -+/* i_op.c */ -+extern struct inode_operations aufs_iop, aufs_symlink_iop, aufs_dir_iop; -+ -+/* au_wr_dir flags */ -+#define AuWrDir_ADD_ENTRY 1 -+#define AuWrDir_ISDIR (1 << 1) -+#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) -+#define au_fset_wrdir(flags, name) { (flags) |= AuWrDir_##name; } -+#define au_fclr_wrdir(flags, name) { (flags) &= ~AuWrDir_##name; } -+ -+struct au_wr_dir_args { -+ aufs_bindex_t force_btgt; -+ unsigned char flags; -+}; -+int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, -+ struct 
au_wr_dir_args *args); -+ -+struct dentry *au_pinned_h_parent(struct au_pin *pin); -+void au_pin_init(struct au_pin *pin, struct dentry *dentry, -+ aufs_bindex_t bindex, int lsc_di, int lsc_hi, -+ unsigned int udba, unsigned char flags); -+int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, -+ unsigned int udba, unsigned char flags) __must_check; -+int au_do_pin(struct au_pin *pin) __must_check; -+void au_unpin(struct au_pin *pin); -+ -+/* i_op_add.c */ -+int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, -+ struct dentry *h_parent, int isdir); -+int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev); -+int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); -+int aufs_create(struct inode *dir, struct dentry *dentry, int mode, -+ struct nameidata *nd); -+int aufs_link(struct dentry *src_dentry, struct inode *dir, -+ struct dentry *dentry); -+int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode); -+ -+/* i_op_del.c */ -+int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); -+int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, -+ struct dentry *h_parent, int isdir); -+int aufs_unlink(struct inode *dir, struct dentry *dentry); -+int aufs_rmdir(struct inode *dir, struct dentry *dentry); -+ -+/* i_op_ren.c */ -+int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); -+int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, -+ struct inode *dir, struct dentry *dentry); -+ -+/* iinfo.c */ -+struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); -+void au_hiput(struct au_hinode *hinode); -+void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex); -+void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, -+ struct dentry *h_wh); -+unsigned int au_hi_flags(struct inode *inode, int isdir); -+ -+/* hinode flags */ -+#define AuHi_XINO 1 -+#define AuHi_HINOTIFY (1 << 1) -+#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) -+#define au_fset_hi(flags, name) { (flags) |= AuHi_##name; } -+#define au_fclr_hi(flags, name) { (flags) &= ~AuHi_##name; } -+ -+#ifndef CONFIG_AUFS_HINOTIFY -+#undef AuHi_HINOTIFY -+#define AuHi_HINOTIFY 0 -+#endif -+ -+void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, -+ struct inode *h_inode, unsigned int flags); -+ -+void au_update_iigen(struct inode *inode); -+void au_update_brange(struct inode *inode, int do_put_zero); -+ -+int au_iinfo_init(struct inode *inode); -+void au_iinfo_fin(struct inode *inode); -+int au_ii_realloc(struct au_iinfo *iinfo, int nbr); -+ -+/* plink.c */ -+void au_plink_block_maintain(struct super_block *sb); -+#ifdef CONFIG_AUFS_DEBUG -+void au_plink_list(struct super_block *sb); -+#else -+static inline void au_plink_list(struct super_block *sb) -+{ -+ /* nothing */ -+} -+#endif -+int au_plink_test(struct inode *inode); -+struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); -+void au_plink_append(struct inode *inode, aufs_bindex_t bindex, -+ struct dentry *h_dentry); -+void au_plink_put(struct super_block *sb); -+void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); -+long au_plink_ioctl(struct file *file, unsigned int cmd); -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* lock subclass for iinfo */ -+enum { -+ AuLsc_II_CHILD, /* child first */ -+ AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hinotify */ -+ AuLsc_II_CHILD3, /* copyup dirs */ -+ AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ 
-+ AuLsc_II_PARENT2, -+ AuLsc_II_PARENT3, /* copyup dirs */ -+ AuLsc_II_NEW_CHILD -+}; -+ -+/* -+ * ii_read_lock_child, ii_write_lock_child, -+ * ii_read_lock_child2, ii_write_lock_child2, -+ * ii_read_lock_child3, ii_write_lock_child3, -+ * ii_read_lock_parent, ii_write_lock_parent, -+ * ii_read_lock_parent2, ii_write_lock_parent2, -+ * ii_read_lock_parent3, ii_write_lock_parent3, -+ * ii_read_lock_new_child, ii_write_lock_new_child, -+ */ -+#define AuReadLockFunc(name, lsc) \ -+static inline void ii_read_lock_##name(struct inode *i) \ -+{ \ -+ au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ -+} -+ -+#define AuWriteLockFunc(name, lsc) \ -+static inline void ii_write_lock_##name(struct inode *i) \ -+{ \ -+ au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ -+} -+ -+#define AuRWLockFuncs(name, lsc) \ -+ AuReadLockFunc(name, lsc) \ -+ AuWriteLockFunc(name, lsc) -+ -+AuRWLockFuncs(child, CHILD); -+AuRWLockFuncs(child2, CHILD2); -+AuRWLockFuncs(child3, CHILD3); -+AuRWLockFuncs(parent, PARENT); -+AuRWLockFuncs(parent2, PARENT2); -+AuRWLockFuncs(parent3, PARENT3); -+AuRWLockFuncs(new_child, NEW_CHILD); -+ -+#undef AuReadLockFunc -+#undef AuWriteLockFunc -+#undef AuRWLockFuncs -+ -+/* -+ * ii_read_unlock, ii_write_unlock, ii_downgrade_lock -+ */ -+AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); -+ -+#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) -+#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) -+#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline unsigned int au_iigen(struct inode *inode) -+{ -+ return atomic_read(&au_ii(inode)->ii_generation); -+} -+ -+/* tiny test for inode number */ -+/* tmpfs generation is too rough */ -+static inline int au_test_higen(struct inode *inode, struct inode *h_inode) -+{ -+ struct au_iinfo *iinfo; -+ -+ iinfo = au_ii(inode); -+ AuRwMustAnyLock(&iinfo->ii_rwsem); -+ return !(iinfo->ii_hsb1 == h_inode->i_sb -+ && iinfo->ii_higen == h_inode->i_generation); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline aufs_bindex_t au_ii_br_id(struct inode *inode, -+ aufs_bindex_t bindex) -+{ -+ IiMustAnyLock(inode); -+ return au_ii(inode)->ii_hinode[0 + bindex].hi_id; -+} -+ -+static inline aufs_bindex_t au_ibstart(struct inode *inode) -+{ -+ IiMustAnyLock(inode); -+ return au_ii(inode)->ii_bstart; -+} -+ -+static inline aufs_bindex_t au_ibend(struct inode *inode) -+{ -+ IiMustAnyLock(inode); -+ return au_ii(inode)->ii_bend; -+} -+ -+static inline struct au_vdir *au_ivdir(struct inode *inode) -+{ -+ IiMustAnyLock(inode); -+ return au_ii(inode)->ii_vdir; -+} -+ -+static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) -+{ -+ IiMustAnyLock(inode); -+ return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry; -+} -+ -+static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex) -+{ -+ IiMustWriteLock(inode); -+ au_ii(inode)->ii_bend = bindex; -+} -+ -+static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) -+{ -+ IiMustWriteLock(inode); -+ au_ii(inode)->ii_vdir = vdir; -+} -+ -+static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) -+{ -+ IiMustAnyLock(inode); -+ return au_ii(inode)->ii_hinode + bindex; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline struct dentry *au_pinned_parent(struct au_pin 
*pin) -+{ -+ if (pin) -+ return pin->parent; -+ return NULL; -+} -+ -+static inline struct inode *au_pinned_h_dir(struct au_pin *pin) -+{ -+ if (pin && pin->hdir) -+ return pin->hdir->hi_inode; -+ return NULL; -+} -+ -+static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) -+{ -+ if (pin) -+ return pin->hdir; -+ return NULL; -+} -+ -+static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) -+{ -+ if (pin) -+ pin->dentry = dentry; -+} -+ -+static inline void au_pin_set_parent_lflag(struct au_pin *pin, -+ unsigned char lflag) -+{ -+ if (pin) { -+ /* dirty macros require brackets */ -+ if (lflag) { -+ au_fset_pin(pin->flags, DI_LOCKED); -+ } else { -+ au_fclr_pin(pin->flags, DI_LOCKED); -+ } -+ } -+} -+ -+static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) -+{ -+ if (pin) { -+ dput(pin->parent); -+ pin->parent = dget(parent); -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_AUFS_HINOTIFY -+/* hinotify.c */ -+int au_hin_alloc(struct au_hinode *hinode, struct inode *inode, -+ struct inode *h_inode); -+void au_hin_free(struct au_hinode *hinode); -+void au_hin_ctl(struct au_hinode *hinode, int do_set); -+void au_reset_hinotify(struct inode *inode, unsigned int flags); -+ -+int __init au_hinotify_init(void); -+void au_hinotify_fin(void); -+ -+static inline -+void au_hin_init(struct au_hinode *hinode, struct au_hinotify *val) -+{ -+ hinode->hi_notify = val; -+} -+ -+static inline void au_iigen_dec(struct inode *inode) -+{ -+ atomic_dec_return(&au_ii(inode)->ii_generation); -+} -+ -+#else -+static inline -+int au_hin_alloc(struct au_hinode *hinode __maybe_unused, -+ struct inode *inode __maybe_unused, -+ struct inode *h_inode __maybe_unused) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline void au_hin_free(struct au_hinode *hinode __maybe_unused) -+{ -+ /* nothing */ -+} -+ -+static inline void au_hin_ctl(struct au_hinode *hinode __maybe_unused, -+ int do_set __maybe_unused) -+{ -+ /* nothing */ -+} -+ -+static inline void au_reset_hinotify(struct inode *inode __maybe_unused, -+ unsigned int flags __maybe_unused) -+{ -+ /* nothing */ -+} -+ -+static inline int au_hinotify_init(void) -+{ -+ return 0; -+} -+ -+#define au_hinotify_fin() do {} while (0) -+ -+static inline -+void au_hin_init(struct au_hinode *hinode __maybe_unused, -+ struct au_hinotify *val __maybe_unused) -+{ -+ /* empty */ -+} -+#endif /* CONFIG_AUFS_HINOTIFY */ -+ -+static inline void au_hin_suspend(struct au_hinode *hdir) -+{ -+ au_hin_ctl(hdir, /*do_set*/0); -+} -+ -+static inline void au_hin_resume(struct au_hinode *hdir) -+{ -+ au_hin_ctl(hdir, /*do_set*/1); -+} -+ -+static inline void au_hin_imtx_lock(struct au_hinode *hdir) -+{ -+ mutex_lock(&hdir->hi_inode->i_mutex); -+ au_hin_suspend(hdir); -+} -+ -+static inline void au_hin_imtx_lock_nested(struct au_hinode *hdir, -+ unsigned int sc __maybe_unused) -+{ -+ mutex_lock_nested(&hdir->hi_inode->i_mutex, sc); -+ au_hin_suspend(hdir); -+} -+ -+static inline void au_hin_imtx_unlock(struct au_hinode *hdir) -+{ -+ au_hin_resume(hdir); -+ mutex_unlock(&hdir->hi_inode->i_mutex); -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_INODE_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/ioctl.c linux-2.6.31/fs/aufs/ioctl.c ---- linux-2.6.31-vanilla/fs/aufs/ioctl.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/ioctl.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,47 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * ioctl -+ * plink-management and readdir in userspace. -+ */ -+ -+#include "aufs.h" -+ -+long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) -+{ -+ long err; -+ -+ switch (cmd) { -+ case AUFS_CTL_PLINK_MAINT: -+ case AUFS_CTL_PLINK_CLEAN: -+ err = au_plink_ioctl(file, cmd); -+ break; -+ -+ case AUFS_CTL_RDU: -+ case AUFS_CTL_RDU_INO: -+ err = au_rdu_ioctl(file, cmd, arg); -+ break; -+ -+ default: -+ err = -EINVAL; -+ } -+ -+ AuTraceErr(err); -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op_add.c linux-2.6.31/fs/aufs/i_op_add.c ---- linux-2.6.31-vanilla/fs/aufs/i_op_add.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/i_op_add.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,649 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inode operations (add entry) -+ */ -+ -+#include "aufs.h" -+ -+/* -+ * final procedure of adding a new entry, except link(2). -+ * remove whiteout, instantiate, copyup the parent dir's times and size -+ * and update version. -+ * if it failed, re-create the removed whiteout. 
-+ */ -+static int epilog(struct inode *dir, aufs_bindex_t bindex, -+ struct dentry *wh_dentry, struct dentry *dentry) -+{ -+ int err, rerr; -+ aufs_bindex_t bwh; -+ struct path h_path; -+ struct inode *inode, *h_dir; -+ struct dentry *wh; -+ -+ bwh = -1; -+ if (wh_dentry) { -+ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ -+ IMustLock(h_dir); -+ AuDebugOn(au_h_iptr(dir, bindex) != h_dir); -+ bwh = au_dbwh(dentry); -+ h_path.dentry = wh_dentry; -+ h_path.mnt = au_sbr_mnt(dir->i_sb, bindex); -+ err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, -+ dentry); -+ if (unlikely(err)) -+ goto out; -+ } -+ -+ inode = au_new_inode(dentry, /*must_new*/1); -+ if (!IS_ERR(inode)) { -+ d_instantiate(dentry, inode); -+ dir = dentry->d_parent->d_inode; /* dir inode is locked */ -+ IMustLock(dir); -+ if (au_ibstart(dir) == au_dbstart(dentry)) -+ au_cpup_attr_timesizes(dir); -+ dir->i_version++; -+ return 0; /* success */ -+ } -+ -+ err = PTR_ERR(inode); -+ if (!wh_dentry) -+ goto out; -+ -+ /* revert */ -+ /* dir inode is locked */ -+ wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); -+ rerr = PTR_ERR(wh); -+ if (IS_ERR(wh)) { -+ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", -+ AuDLNPair(dentry), err, rerr); -+ err = -EIO; -+ } else -+ dput(wh); -+ -+ out: -+ return err; -+} -+ -+/* -+ * simple tests for the adding inode operations. -+ * following the checks in vfs, plus the parent-child relationship. -+ */ -+int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, -+ struct dentry *h_parent, int isdir) -+{ -+ int err; -+ umode_t h_mode; -+ struct dentry *h_dentry; -+ struct inode *h_inode; -+ -+ h_dentry = au_h_dptr(dentry, bindex); -+ h_inode = h_dentry->d_inode; -+ if (!dentry->d_inode) { -+ err = -EEXIST; -+ if (unlikely(h_inode)) -+ goto out; -+ } else { -+ /* rename(2) case */ -+ err = -EIO; -+ if (unlikely(!h_inode || !h_inode->i_nlink)) -+ goto out; -+ -+ h_mode = h_inode->i_mode; -+ if (!isdir) { -+ err = -EISDIR; -+ if (unlikely(S_ISDIR(h_mode))) -+ goto out; -+ } else if (unlikely(!S_ISDIR(h_mode))) { -+ err = -ENOTDIR; -+ goto out; -+ } -+ } -+ -+ err = -EIO; -+ /* expected parent dir is locked */ -+ if (unlikely(h_parent != h_dentry->d_parent)) -+ goto out; -+ err = 0; -+ -+ out: -+ return err; -+} -+ -+/* -+ * initial procedure of adding a new entry. -+ * prepare writable branch and the parent dir, lock it, -+ * and lookup whiteout for the new entry. 
-+ */ -+static struct dentry* -+lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, -+ struct dentry *src_dentry, struct au_pin *pin, -+ struct au_wr_dir_args *wr_dir_args) -+{ -+ struct dentry *wh_dentry, *h_parent; -+ struct super_block *sb; -+ struct au_branch *br; -+ int err; -+ unsigned int udba; -+ aufs_bindex_t bcpup; -+ -+ err = au_wr_dir(dentry, src_dentry, wr_dir_args); -+ bcpup = err; -+ wh_dentry = ERR_PTR(err); -+ if (unlikely(err < 0)) -+ goto out; -+ -+ sb = dentry->d_sb; -+ udba = au_opt_udba(sb); -+ err = au_pin(pin, dentry, bcpup, udba, -+ AuPin_DI_LOCKED | AuPin_MNT_WRITE); -+ wh_dentry = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out; -+ -+ h_parent = au_pinned_h_parent(pin); -+ if (udba != AuOpt_UDBA_NONE -+ && au_dbstart(dentry) == bcpup) { -+ err = au_may_add(dentry, bcpup, h_parent, -+ au_ftest_wrdir(wr_dir_args->flags, ISDIR)); -+ wh_dentry = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out_unpin; -+ } -+ -+ br = au_sbr(sb, bcpup); -+ if (dt) { -+ struct path tmp = { -+ .dentry = h_parent, -+ .mnt = br->br_mnt -+ }; -+ au_dtime_store(dt, au_pinned_parent(pin), &tmp); -+ } -+ -+ wh_dentry = NULL; -+ if (bcpup != au_dbwh(dentry)) -+ goto out; /* success */ -+ -+ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); -+ -+ out_unpin: -+ if (IS_ERR(wh_dentry)) -+ au_unpin(pin); -+ out: -+ return wh_dentry; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+enum { Mknod, Symlink, Creat }; -+struct simple_arg { -+ int type; -+ union { -+ struct { -+ int mode; -+ struct nameidata *nd; -+ } c; -+ struct { -+ const char *symname; -+ } s; -+ struct { -+ int mode; -+ dev_t dev; -+ } m; -+ } u; -+}; -+ -+static int add_simple(struct inode *dir, struct dentry *dentry, -+ struct simple_arg *arg) -+{ -+ int err; -+ aufs_bindex_t bstart; -+ unsigned char created; -+ struct au_dtime dt; -+ struct au_pin pin; -+ struct path h_path; -+ struct dentry *wh_dentry, *parent; -+ struct inode *h_dir; -+ struct au_wr_dir_args wr_dir_args = { -+ .force_btgt = -1, -+ .flags = AuWrDir_ADD_ENTRY -+ }; -+ -+ IMustLock(dir); -+ -+ parent = dentry->d_parent; /* dir inode is locked */ -+ aufs_read_lock(dentry, AuLock_DW); -+ di_write_lock_parent(parent); -+ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, /*src_dentry*/NULL, &pin, -+ &wr_dir_args); -+ err = PTR_ERR(wh_dentry); -+ if (IS_ERR(wh_dentry)) -+ goto out; -+ -+ bstart = au_dbstart(dentry); -+ h_path.dentry = au_h_dptr(dentry, bstart); -+ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); -+ h_dir = au_pinned_h_dir(&pin); -+ switch (arg->type) { -+ case Creat: -+ err = vfsub_create(h_dir, &h_path, arg->u.c.mode); -+ break; -+ case Symlink: -+ err = vfsub_symlink(h_dir, &h_path, arg->u.s.symname); -+ break; -+ case Mknod: -+ err = vfsub_mknod(h_dir, &h_path, arg->u.m.mode, arg->u.m.dev); -+ break; -+ default: -+ BUG(); -+ } -+ created = !err; -+ if (!err) -+ err = epilog(dir, bstart, wh_dentry, dentry); -+ -+ /* revert */ -+ if (unlikely(created && err && h_path.dentry->d_inode)) { -+ int rerr; -+ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); -+ if (rerr) { -+ AuIOErr("%.*s revert failure(%d, %d)\n", -+ AuDLNPair(dentry), err, rerr); -+ err = -EIO; -+ } -+ au_dtime_revert(&dt); -+ d_drop(dentry); -+ } -+ -+ au_unpin(&pin); -+ dput(wh_dentry); -+ -+ out: -+ if (unlikely(err)) { -+ au_update_dbstart(dentry); -+ d_drop(dentry); -+ } -+ di_write_unlock(parent); -+ aufs_read_unlock(dentry, AuLock_DW); -+ return err; -+} -+ -+int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 
-+{ -+ struct simple_arg arg = { -+ .type = Mknod, -+ .u.m = { -+ .mode = mode, -+ .dev = dev -+ } -+ }; -+ return add_simple(dir, dentry, &arg); -+} -+ -+int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -+{ -+ struct simple_arg arg = { -+ .type = Symlink, -+ .u.s.symname = symname -+ }; -+ return add_simple(dir, dentry, &arg); -+} -+ -+int aufs_create(struct inode *dir, struct dentry *dentry, int mode, -+ struct nameidata *nd) -+{ -+ struct simple_arg arg = { -+ .type = Creat, -+ .u.c = { -+ .mode = mode, -+ .nd = nd -+ } -+ }; -+ return add_simple(dir, dentry, &arg); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct au_link_args { -+ aufs_bindex_t bdst, bsrc; -+ struct au_pin pin; -+ struct path h_path; -+ struct dentry *src_parent, *parent; -+}; -+ -+static int au_cpup_before_link(struct dentry *src_dentry, -+ struct au_link_args *a) -+{ -+ int err; -+ struct dentry *h_src_dentry; -+ struct mutex *h_mtx; -+ -+ di_read_lock_parent(a->src_parent, AuLock_IR); -+ err = au_test_and_cpup_dirs(src_dentry, a->bdst); -+ if (unlikely(err)) -+ goto out; -+ -+ h_src_dentry = au_h_dptr(src_dentry, a->bsrc); -+ h_mtx = &h_src_dentry->d_inode->i_mutex; -+ err = au_pin(&a->pin, src_dentry, a->bdst, -+ au_opt_udba(src_dentry->d_sb), -+ AuPin_DI_LOCKED | AuPin_MNT_WRITE); -+ if (unlikely(err)) -+ goto out; -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ err = au_sio_cpup_simple(src_dentry, a->bdst, -1, -+ AuCpup_DTIME /* | AuCpup_KEEPLINO */); -+ mutex_unlock(h_mtx); -+ au_unpin(&a->pin); -+ -+ out: -+ di_read_unlock(a->src_parent, AuLock_IR); -+ return err; -+} -+ -+static int au_cpup_or_link(struct dentry *src_dentry, struct au_link_args *a) -+{ -+ int err; -+ unsigned char plink; -+ struct inode *h_inode, *inode; -+ struct dentry *h_src_dentry; -+ struct super_block *sb; -+ -+ plink = 0; -+ h_inode = NULL; -+ sb = src_dentry->d_sb; -+ inode = src_dentry->d_inode; -+ if (au_ibstart(inode) <= a->bdst) -+ h_inode = au_h_iptr(inode, a->bdst); -+ if (!h_inode || !h_inode->i_nlink) { -+ /* copyup src_dentry as the name of dentry. 
*/ -+ au_set_dbstart(src_dentry, a->bdst); -+ au_set_h_dptr(src_dentry, a->bdst, dget(a->h_path.dentry)); -+ h_inode = au_h_dptr(src_dentry, a->bsrc)->d_inode; -+ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); -+ err = au_sio_cpup_single(src_dentry, a->bdst, a->bsrc, -1, -+ AuCpup_KEEPLINO, a->parent); -+ mutex_unlock(&h_inode->i_mutex); -+ au_set_h_dptr(src_dentry, a->bdst, NULL); -+ au_set_dbstart(src_dentry, a->bsrc); -+ } else { -+ /* the inode of src_dentry already exists on a.bdst branch */ -+ h_src_dentry = d_find_alias(h_inode); -+ if (!h_src_dentry && au_plink_test(inode)) { -+ plink = 1; -+ h_src_dentry = au_plink_lkup(inode, a->bdst); -+ err = PTR_ERR(h_src_dentry); -+ if (IS_ERR(h_src_dentry)) -+ goto out; -+ -+ if (unlikely(!h_src_dentry->d_inode)) { -+ dput(h_src_dentry); -+ h_src_dentry = NULL; -+ } -+ -+ } -+ if (h_src_dentry) { -+ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), -+ &a->h_path); -+ dput(h_src_dentry); -+ } else { -+ AuIOErr("no dentry found for hi%lu on b%d\n", -+ h_inode->i_ino, a->bdst); -+ err = -EIO; -+ } -+ } -+ -+ if (!err && !plink) -+ au_plink_append(inode, a->bdst, a->h_path.dentry); -+ -+out: -+ return err; -+} -+ -+int aufs_link(struct dentry *src_dentry, struct inode *dir, -+ struct dentry *dentry) -+{ -+ int err, rerr; -+ struct au_dtime dt; -+ struct au_link_args *a; -+ struct dentry *wh_dentry, *h_src_dentry; -+ struct inode *inode; -+ struct super_block *sb; -+ struct au_wr_dir_args wr_dir_args = { -+ /* .force_btgt = -1, */ -+ .flags = AuWrDir_ADD_ENTRY -+ }; -+ -+ IMustLock(dir); -+ inode = src_dentry->d_inode; -+ IMustLock(inode); -+ -+ err = -ENOENT; -+ if (unlikely(!inode->i_nlink)) -+ goto out; -+ -+ err = -ENOMEM; -+ a = kzalloc(sizeof(*a), GFP_NOFS); -+ if (unlikely(!a)) -+ goto out; -+ -+ a->parent = dentry->d_parent; /* dir inode is locked */ -+ aufs_read_and_write_lock2(dentry, src_dentry, /*AuLock_FLUSH*/0); -+ a->src_parent = dget_parent(src_dentry); -+ wr_dir_args.force_btgt = au_dbstart(src_dentry); -+ -+ di_write_lock_parent(a->parent); -+ wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); -+ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, -+ &wr_dir_args); -+ err = PTR_ERR(wh_dentry); -+ if (IS_ERR(wh_dentry)) -+ goto out_unlock; -+ -+ err = 0; -+ sb = dentry->d_sb; -+ a->bdst = au_dbstart(dentry); -+ a->h_path.dentry = au_h_dptr(dentry, a->bdst); -+ a->h_path.mnt = au_sbr_mnt(sb, a->bdst); -+ a->bsrc = au_dbstart(src_dentry); -+ if (au_opt_test(au_mntflags(sb), PLINK)) { -+ if (a->bdst < a->bsrc -+ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) -+ err = au_cpup_or_link(src_dentry, a); -+ else { -+ h_src_dentry = au_h_dptr(src_dentry, a->bdst); -+ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), -+ &a->h_path); -+ } -+ } else { -+ /* -+ * copyup src_dentry to the branch we process, -+ * and then link(2) to it. 
-+ */ -+ if (a->bdst < a->bsrc -+ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { -+ au_unpin(&a->pin); -+ di_write_unlock(a->parent); -+ err = au_cpup_before_link(src_dentry, a); -+ di_write_lock_parent(a->parent); -+ if (!err) -+ err = au_pin(&a->pin, dentry, a->bdst, -+ au_opt_udba(sb), -+ AuPin_DI_LOCKED | AuPin_MNT_WRITE); -+ if (unlikely(err)) -+ goto out_wh; -+ } -+ if (!err) { -+ h_src_dentry = au_h_dptr(src_dentry, a->bdst); -+ err = -ENOENT; -+ if (h_src_dentry && h_src_dentry->d_inode) -+ err = vfsub_link(h_src_dentry, -+ au_pinned_h_dir(&a->pin), -+ &a->h_path); -+ } -+ } -+ if (unlikely(err)) -+ goto out_unpin; -+ -+ if (wh_dentry) { -+ a->h_path.dentry = wh_dentry; -+ err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, -+ dentry); -+ if (unlikely(err)) -+ goto out_revert; -+ } -+ -+ dir->i_version++; -+ if (au_ibstart(dir) == au_dbstart(dentry)) -+ au_cpup_attr_timesizes(dir); -+ inc_nlink(inode); -+ inode->i_ctime = dir->i_ctime; -+ if (!d_unhashed(a->h_path.dentry)) -+ d_instantiate(dentry, au_igrab(inode)); -+ else -+ /* some filesystem calls d_drop() */ -+ d_drop(dentry); -+ goto out_unpin; /* success */ -+ -+ out_revert: -+ rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, /*force*/0); -+ if (!rerr) -+ goto out_dt; -+ AuIOErr("%.*s reverting failed(%d, %d)\n", -+ AuDLNPair(dentry), err, rerr); -+ err = -EIO; -+ out_dt: -+ d_drop(dentry); -+ au_dtime_revert(&dt); -+ out_unpin: -+ au_unpin(&a->pin); -+ out_wh: -+ dput(wh_dentry); -+ out_unlock: -+ if (unlikely(err)) { -+ au_update_dbstart(dentry); -+ d_drop(dentry); -+ } -+ di_write_unlock(a->parent); -+ dput(a->src_parent); -+ aufs_read_and_write_unlock2(dentry, src_dentry); -+ kfree(a); -+ out: -+ return err; -+} -+ -+int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode) -+{ -+ int err, rerr; -+ aufs_bindex_t bindex; -+ unsigned char diropq; -+ struct path h_path; -+ struct dentry *wh_dentry, *parent, *opq_dentry; -+ struct mutex *h_mtx; -+ struct super_block *sb; -+ struct { -+ struct au_pin pin; -+ struct au_dtime dt; -+ } *a; /* reduce the stack usage */ -+ struct au_wr_dir_args wr_dir_args = { -+ .force_btgt = -1, -+ .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR -+ }; -+ -+ IMustLock(dir); -+ -+ err = -ENOMEM; -+ a = kmalloc(sizeof(*a), GFP_NOFS); -+ if (unlikely(!a)) -+ goto out; -+ -+ aufs_read_lock(dentry, AuLock_DW); -+ parent = dentry->d_parent; /* dir inode is locked */ -+ di_write_lock_parent(parent); -+ wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, -+ &a->pin, &wr_dir_args); -+ err = PTR_ERR(wh_dentry); -+ if (IS_ERR(wh_dentry)) -+ goto out_free; -+ -+ sb = dentry->d_sb; -+ bindex = au_dbstart(dentry); -+ h_path.dentry = au_h_dptr(dentry, bindex); -+ h_path.mnt = au_sbr_mnt(sb, bindex); -+ err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); -+ if (unlikely(err)) -+ goto out_unlock; -+ -+ /* make the dir opaque */ -+ diropq = 0; -+ h_mtx = &h_path.dentry->d_inode->i_mutex; -+ if (wh_dentry -+ || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ opq_dentry = au_diropq_create(dentry, bindex); -+ mutex_unlock(h_mtx); -+ err = PTR_ERR(opq_dentry); -+ if (IS_ERR(opq_dentry)) -+ goto out_dir; -+ dput(opq_dentry); -+ diropq = 1; -+ } -+ -+ err = epilog(dir, bindex, wh_dentry, dentry); -+ if (!err) { -+ inc_nlink(dir); -+ goto out_unlock; /* success */ -+ } -+ -+ /* revert */ -+ if (diropq) { -+ AuLabel(revert opq); -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ rerr = au_diropq_remove(dentry, bindex); -+ 
mutex_unlock(h_mtx); -+ if (rerr) { -+ AuIOErr("%.*s reverting diropq failed(%d, %d)\n", -+ AuDLNPair(dentry), err, rerr); -+ err = -EIO; -+ } -+ } -+ -+ out_dir: -+ AuLabel(revert dir); -+ rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); -+ if (rerr) { -+ AuIOErr("%.*s reverting dir failed(%d, %d)\n", -+ AuDLNPair(dentry), err, rerr); -+ err = -EIO; -+ } -+ d_drop(dentry); -+ au_dtime_revert(&a->dt); -+ out_unlock: -+ au_unpin(&a->pin); -+ dput(wh_dentry); -+ out_free: -+ if (unlikely(err)) { -+ au_update_dbstart(dentry); -+ d_drop(dentry); -+ } -+ di_write_unlock(parent); -+ aufs_read_unlock(dentry, AuLock_DW); -+ kfree(a); -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op.c linux-2.6.31/fs/aufs/i_op.c ---- linux-2.6.31-vanilla/fs/aufs/i_op.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/i_op.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,891 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inode operations (except add/del/rename) -+ */ -+ -+#include <linux/device_cgroup.h> -+#include <linux/fs_stack.h> -+#include <linux/mm.h> -+#include <linux/namei.h> -+#include <linux/security.h> -+#include <linux/uaccess.h> -+#include "aufs.h" -+ -+static int h_permission(struct inode *h_inode, int mask, -+ struct vfsmount *h_mnt, int brperm) -+{ -+ int err; -+ const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); -+ -+ err = -EACCES; -+ if ((write_mask && IS_IMMUTABLE(h_inode)) -+ || ((mask & MAY_EXEC) -+ && S_ISREG(h_inode->i_mode) -+ && ((h_mnt->mnt_flags & MNT_NOEXEC) -+ || !(h_inode->i_mode & S_IXUGO)))) -+ goto out; -+ -+ /* -+ * - skip the lower fs test in the case of write to ro branch. -+ * - nfs dir permission write check is optimized, but a policy for -+ * link/rename requires a real check. -+ */ -+ if ((write_mask && !au_br_writable(brperm)) -+ || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) -+ && write_mask && !(mask & MAY_READ)) -+ || !h_inode->i_op->permission) { -+ /* AuLabel(generic_permission); */ -+ err = generic_permission(h_inode, mask, NULL); -+ } else { -+ /* AuLabel(h_inode->permission); */ -+ err = h_inode->i_op->permission(h_inode, mask); -+ AuTraceErr(err); -+ } -+ -+ if (!err) -+ err = devcgroup_inode_permission(h_inode, mask); -+ if (!err) -+ err = security_inode_permission -+ (h_inode, mask & (MAY_READ | MAY_WRITE | MAY_EXEC -+ | MAY_APPEND)); -+ -+#if 0 -+ if (!err) { -+ /* todo: do we need to call ima_path_check()? 
*/ -+ struct path h_path = { -+ .dentry = -+ .mnt = h_mnt -+ }; -+ err = ima_path_check(&h_path, -+ mask & (MAY_READ | MAY_WRITE | MAY_EXEC), -+ IMA_COUNT_LEAVE); -+ } -+#endif -+ -+ out: -+ return err; -+} -+ -+static int aufs_permission(struct inode *inode, int mask) -+{ -+ int err; -+ aufs_bindex_t bindex, bend; -+ const unsigned char isdir = !!S_ISDIR(inode->i_mode); -+ const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); -+ struct inode *h_inode; -+ struct super_block *sb; -+ struct au_branch *br; -+ -+ sb = inode->i_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ ii_read_lock_child(inode); -+ -+ if (!isdir || write_mask) { -+ err = au_busy_or_stale(); -+ h_inode = au_h_iptr(inode, au_ibstart(inode)); -+ if (unlikely(!h_inode -+ || (h_inode->i_mode & S_IFMT) -+ != (inode->i_mode & S_IFMT))) -+ goto out; -+ -+ err = 0; -+ bindex = au_ibstart(inode); -+ br = au_sbr(sb, bindex); -+ err = h_permission(h_inode, mask, br->br_mnt, br->br_perm); -+ if (write_mask && !err) { -+ /* test whether the upper writable branch exists */ -+ err = -EROFS; -+ for (; bindex >= 0; bindex--) -+ if (!au_br_rdonly(au_sbr(sb, bindex))) { -+ err = 0; -+ break; -+ } -+ } -+ goto out; -+ } -+ -+ /* non-write to dir */ -+ err = 0; -+ bend = au_ibend(inode); -+ for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) { -+ h_inode = au_h_iptr(inode, bindex); -+ if (h_inode) { -+ err = au_busy_or_stale(); -+ if (unlikely(!S_ISDIR(h_inode->i_mode))) -+ break; -+ -+ br = au_sbr(sb, bindex); -+ err = h_permission(h_inode, mask, br->br_mnt, -+ br->br_perm); -+ } -+ } -+ -+ out: -+ ii_read_unlock(inode); -+ si_read_unlock(sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct dentry *ret, *parent; -+ struct inode *inode, *h_inode; -+ struct mutex *mtx; -+ struct super_block *sb; -+ int err, npositive; -+ aufs_bindex_t bstart; -+ -+ IMustLock(dir); -+ -+ sb = dir->i_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_alloc_dinfo(dentry); -+ ret = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out; -+ -+ parent = dentry->d_parent; /* dir inode is locked */ -+ di_read_lock_parent(parent, AuLock_IR); -+ npositive = au_lkup_dentry(dentry, au_dbstart(parent), /*type*/0, nd); -+ di_read_unlock(parent, AuLock_IR); -+ err = npositive; -+ ret = ERR_PTR(err); -+ if (unlikely(err < 0)) -+ goto out_unlock; -+ -+ inode = NULL; -+ if (npositive) { -+ bstart = au_dbstart(dentry); -+ h_inode = au_h_dptr(dentry, bstart)->d_inode; -+ if (!S_ISDIR(h_inode->i_mode)) { -+ /* -+ * stop 'race'-ing between hardlinks under different -+ * parents. 
-+ */ -+ mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; -+ mutex_lock(mtx); -+ inode = au_new_inode(dentry, /*must_new*/0); -+ mutex_unlock(mtx); -+ } else -+ inode = au_new_inode(dentry, /*must_new*/0); -+ ret = (void *)inode; -+ } -+ if (IS_ERR(inode)) -+ goto out_unlock; -+ -+ ret = d_splice_alias(inode, dentry); -+ if (unlikely(IS_ERR(ret) && inode)) -+ ii_write_unlock(inode); -+ -+ out_unlock: -+ di_write_unlock(dentry); -+ out: -+ si_read_unlock(sb); -+ return ret; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, -+ const unsigned char add_entry, aufs_bindex_t bcpup, -+ aufs_bindex_t bstart) -+{ -+ int err; -+ struct dentry *h_parent; -+ struct inode *h_dir; -+ -+ if (add_entry) { -+ au_update_dbstart(dentry); -+ IMustLock(parent->d_inode); -+ } else -+ di_write_lock_parent(parent); -+ -+ err = 0; -+ if (!au_h_dptr(parent, bcpup)) { -+ if (bstart < bcpup) -+ err = au_cpdown_dirs(dentry, bcpup); -+ else -+ err = au_cpup_dirs(dentry, bcpup); -+ } -+ if (!err && add_entry) { -+ h_parent = au_h_dptr(parent, bcpup); -+ h_dir = h_parent->d_inode; -+ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); -+ err = au_lkup_neg(dentry, bcpup); -+ /* todo: no unlock here */ -+ mutex_unlock(&h_dir->i_mutex); -+ if (bstart < bcpup && au_dbstart(dentry) < 0) { -+ au_set_dbstart(dentry, 0); -+ au_update_dbrange(dentry, /*do_put_zero*/0); -+ } -+ } -+ -+ if (!add_entry) -+ di_write_unlock(parent); -+ if (!err) -+ err = bcpup; /* success */ -+ -+ return err; -+} -+ -+/* -+ * decide the branch and the parent dir where we will create a new entry. -+ * returns new bindex or an error. -+ * copyup the parent dir if needed. -+ */ -+int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, -+ struct au_wr_dir_args *args) -+{ -+ int err; -+ aufs_bindex_t bcpup, bstart, src_bstart; -+ const unsigned char add_entry = !!au_ftest_wrdir(args->flags, -+ ADD_ENTRY); -+ struct super_block *sb; -+ struct dentry *parent; -+ struct au_sbinfo *sbinfo; -+ -+ sb = dentry->d_sb; -+ sbinfo = au_sbi(sb); -+ parent = dget_parent(dentry); -+ bstart = au_dbstart(dentry); -+ bcpup = bstart; -+ if (args->force_btgt < 0) { -+ if (src_dentry) { -+ src_bstart = au_dbstart(src_dentry); -+ if (src_bstart < bstart) -+ bcpup = src_bstart; -+ } else if (add_entry) { -+ err = AuWbrCreate(sbinfo, dentry, -+ au_ftest_wrdir(args->flags, ISDIR)); -+ bcpup = err; -+ } -+ -+ if (bcpup < 0 || au_test_ro(sb, bcpup, dentry->d_inode)) { -+ if (add_entry) -+ err = AuWbrCopyup(sbinfo, dentry); -+ else { -+ if (!IS_ROOT(dentry)) { -+ di_read_lock_parent(parent, !AuLock_IR); -+ err = AuWbrCopyup(sbinfo, dentry); -+ di_read_unlock(parent, !AuLock_IR); -+ } else -+ err = AuWbrCopyup(sbinfo, dentry); -+ } -+ bcpup = err; -+ if (unlikely(err < 0)) -+ goto out; -+ } -+ } else { -+ bcpup = args->force_btgt; -+ AuDebugOn(au_test_ro(sb, bcpup, dentry->d_inode)); -+ } -+ AuDbg("bstart %d, bcpup %d\n", bstart, bcpup); -+ if (bstart < bcpup) -+ au_update_dbrange(dentry, /*do_put_zero*/1); -+ -+ err = bcpup; -+ if (bcpup == bstart) -+ goto out; /* success */ -+ -+ /* copyup the new parent into the branch we process */ -+ err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart); -+ -+ out: -+ dput(parent); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct dentry *au_pinned_h_parent(struct au_pin *pin) -+{ -+ if (pin && pin->parent) -+ return au_h_dptr(pin->parent, pin->bindex); 
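
au_wr_dir() above decides which branch a new entry is written to and, when that branch does not yet hold the parent directory chain, copies the parents up first (au_cpup_dirs()/au_cpdown_dirs()). Reduced to userspace terms the idea is: create every missing ancestor on the writable layer before creating the entry itself. A hypothetical sketch of that pattern, not aufs code:

	/* hypothetical sketch of parent-dir copy-up in the spirit of au_cpup_dirs() */
	#include <errno.h>
	#include <string.h>
	#include <sys/stat.h>

	static int cpup_dirs(char *path)	/* modified in place */
	{
		char *p = path;

		/* walk "/a/b/c/file" and mkdir /a, /a/b, /a/b/c in turn */
		while ((p = strchr(p + 1, '/')) != NULL) {
			*p = '\0';
			if (mkdir(path, 0755) && errno != EEXIST) {
				*p = '/';
				return -1;	/* stop on the first hard failure */
			}
			*p = '/';
		}
		return 0;
	}

	int main(void)
	{
		char path[] = "/tmp/upper/a/b/c/file";
		return cpup_dirs(path) ? 1 : 0;
	}
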
-+ return NULL; -+} -+ -+void au_unpin(struct au_pin *p) -+{ -+ if (au_ftest_pin(p->flags, MNT_WRITE)) -+ mnt_drop_write(p->h_mnt); -+ if (!p->hdir) -+ return; -+ -+ au_hin_imtx_unlock(p->hdir); -+ if (!au_ftest_pin(p->flags, DI_LOCKED)) -+ di_read_unlock(p->parent, AuLock_IR); -+ iput(p->hdir->hi_inode); -+ dput(p->parent); -+ p->parent = NULL; -+ p->hdir = NULL; -+ p->h_mnt = NULL; -+} -+ -+int au_do_pin(struct au_pin *p) -+{ -+ int err; -+ struct super_block *sb; -+ struct dentry *h_dentry, *h_parent; -+ struct au_branch *br; -+ struct inode *h_dir; -+ -+ err = 0; -+ sb = p->dentry->d_sb; -+ br = au_sbr(sb, p->bindex); -+ if (IS_ROOT(p->dentry)) { -+ if (au_ftest_pin(p->flags, MNT_WRITE)) { -+ p->h_mnt = br->br_mnt; -+ err = mnt_want_write(p->h_mnt); -+ if (unlikely(err)) { -+ au_fclr_pin(p->flags, MNT_WRITE); -+ goto out_err; -+ } -+ } -+ goto out; -+ } -+ -+ h_dentry = NULL; -+ if (p->bindex <= au_dbend(p->dentry)) -+ h_dentry = au_h_dptr(p->dentry, p->bindex); -+ -+ p->parent = dget_parent(p->dentry); -+ if (!au_ftest_pin(p->flags, DI_LOCKED)) -+ di_read_lock(p->parent, AuLock_IR, p->lsc_di); -+ -+ h_dir = NULL; -+ h_parent = au_h_dptr(p->parent, p->bindex); -+ p->hdir = au_hi(p->parent->d_inode, p->bindex); -+ if (p->hdir) -+ h_dir = p->hdir->hi_inode; -+ -+ /* udba case */ -+ if (unlikely(!p->hdir || !h_dir)) { -+ if (!au_ftest_pin(p->flags, DI_LOCKED)) -+ di_read_unlock(p->parent, AuLock_IR); -+ dput(p->parent); -+ p->parent = NULL; -+ goto out_err; -+ } -+ -+ au_igrab(h_dir); -+ au_hin_imtx_lock_nested(p->hdir, p->lsc_hi); -+ -+ if (unlikely(p->hdir->hi_inode != h_parent->d_inode)) { -+ err = -EBUSY; -+ goto out_unpin; -+ } -+ if (h_dentry) { -+ err = au_h_verify(h_dentry, p->udba, h_dir, h_parent, br); -+ if (unlikely(err)) { -+ au_fclr_pin(p->flags, MNT_WRITE); -+ goto out_unpin; -+ } -+ } -+ -+ if (au_ftest_pin(p->flags, MNT_WRITE)) { -+ p->h_mnt = br->br_mnt; -+ err = mnt_want_write(p->h_mnt); -+ if (unlikely(err)) { -+ au_fclr_pin(p->flags, MNT_WRITE); -+ goto out_unpin; -+ } -+ } -+ goto out; /* success */ -+ -+ out_unpin: -+ au_unpin(p); -+ out_err: -+ AuErr("err %d\n", err); -+ err = au_busy_or_stale(); -+ out: -+ return err; -+} -+ -+void au_pin_init(struct au_pin *p, struct dentry *dentry, -+ aufs_bindex_t bindex, int lsc_di, int lsc_hi, -+ unsigned int udba, unsigned char flags) -+{ -+ p->dentry = dentry; -+ p->udba = udba; -+ p->lsc_di = lsc_di; -+ p->lsc_hi = lsc_hi; -+ p->flags = flags; -+ p->bindex = bindex; -+ -+ p->parent = NULL; -+ p->hdir = NULL; -+ p->h_mnt = NULL; -+} -+ -+int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, -+ unsigned int udba, unsigned char flags) -+{ -+ au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, -+ udba, flags); -+ return au_do_pin(pin); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+#define AuIcpup_DID_CPUP 1 -+#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) -+#define au_fset_icpup(flags, name) { (flags) |= AuIcpup_##name; } -+#define au_fclr_icpup(flags, name) { (flags) &= ~AuIcpup_##name; } -+ -+struct au_icpup_args { -+ unsigned char flags; -+ unsigned char pin_flags; -+ aufs_bindex_t btgt; -+ struct au_pin pin; -+ struct path h_path; -+ struct inode *h_inode; -+}; -+ -+static int au_lock_and_icpup(struct dentry *dentry, struct iattr *ia, -+ struct au_icpup_args *a) -+{ -+ int err; -+ unsigned int udba; -+ loff_t sz; -+ aufs_bindex_t bstart; -+ struct dentry *hi_wh, *parent; -+ struct inode *inode; -+ struct au_wr_dir_args 
wr_dir_args = { -+ .force_btgt = -1, -+ .flags = 0 -+ }; -+ -+ di_write_lock_child(dentry); -+ bstart = au_dbstart(dentry); -+ inode = dentry->d_inode; -+ if (S_ISDIR(inode->i_mode)) -+ au_fset_wrdir(wr_dir_args.flags, ISDIR); -+ /* plink or hi_wh() case */ -+ if (bstart != au_ibstart(inode)) -+ wr_dir_args.force_btgt = au_ibstart(inode); -+ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); -+ if (unlikely(err < 0)) -+ goto out_dentry; -+ a->btgt = err; -+ if (err != bstart) -+ au_fset_icpup(a->flags, DID_CPUP); -+ -+ err = 0; -+ a->pin_flags = AuPin_MNT_WRITE; -+ parent = NULL; -+ if (!IS_ROOT(dentry)) { -+ au_fset_pin(a->pin_flags, DI_LOCKED); -+ parent = dget_parent(dentry); -+ di_write_lock_parent(parent); -+ } -+ -+ udba = au_opt_udba(dentry->d_sb); -+ if (d_unhashed(dentry) || (ia->ia_valid & ATTR_FILE)) -+ udba = AuOpt_UDBA_NONE; -+ err = au_pin(&a->pin, dentry, a->btgt, udba, a->pin_flags); -+ if (unlikely(err)) { -+ if (parent) { -+ di_write_unlock(parent); -+ dput(parent); -+ } -+ goto out_dentry; -+ } -+ a->h_path.dentry = au_h_dptr(dentry, bstart); -+ a->h_inode = a->h_path.dentry->d_inode; -+ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); -+ sz = -1; -+ if ((ia->ia_valid & ATTR_SIZE) && ia->ia_size < i_size_read(a->h_inode)) -+ sz = ia->ia_size; -+ -+ hi_wh = NULL; -+ if (au_ftest_icpup(a->flags, DID_CPUP) && d_unhashed(dentry)) { -+ hi_wh = au_hi_wh(inode, a->btgt); -+ if (!hi_wh) { -+ err = au_sio_cpup_wh(dentry, a->btgt, sz, /*file*/NULL); -+ if (unlikely(err)) -+ goto out_unlock; -+ hi_wh = au_hi_wh(inode, a->btgt); -+ /* todo: revalidate hi_wh? */ -+ } -+ } -+ -+ if (parent) { -+ au_pin_set_parent_lflag(&a->pin, /*lflag*/0); -+ di_downgrade_lock(parent, AuLock_IR); -+ dput(parent); -+ } -+ if (!au_ftest_icpup(a->flags, DID_CPUP)) -+ goto out; /* success */ -+ -+ if (!d_unhashed(dentry)) { -+ err = au_sio_cpup_simple(dentry, a->btgt, sz, AuCpup_DTIME); -+ if (!err) -+ a->h_path.dentry = au_h_dptr(dentry, a->btgt); -+ } else if (!hi_wh) -+ a->h_path.dentry = au_h_dptr(dentry, a->btgt); -+ else -+ a->h_path.dentry = hi_wh; /* do not dget here */ -+ -+ out_unlock: -+ mutex_unlock(&a->h_inode->i_mutex); -+ a->h_inode = a->h_path.dentry->d_inode; -+ if (!err) { -+ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); -+ goto out; /* success */ -+ } -+ -+ au_unpin(&a->pin); -+ -+ out_dentry: -+ di_write_unlock(dentry); -+ out: -+ return err; -+} -+ -+static int aufs_setattr(struct dentry *dentry, struct iattr *ia) -+{ -+ int err; -+ struct inode *inode; -+ struct super_block *sb; -+ struct file *file; -+ struct au_icpup_args *a; -+ -+ err = -ENOMEM; -+ a = kzalloc(sizeof(*a), GFP_NOFS); -+ if (unlikely(!a)) -+ goto out; -+ -+ inode = dentry->d_inode; -+ IMustLock(inode); -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ -+ file = NULL; -+ if (ia->ia_valid & ATTR_FILE) { -+ /* currently ftruncate(2) only */ -+ file = ia->ia_file; -+ fi_write_lock(file); -+ ia->ia_file = au_h_fptr(file, au_fbstart(file)); -+ } -+ -+ if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) -+ ia->ia_valid &= ~ATTR_MODE; -+ -+ err = au_lock_and_icpup(dentry, ia, a); -+ if (unlikely(err < 0)) -+ goto out_si; -+ if (au_ftest_icpup(a->flags, DID_CPUP)) { -+ ia->ia_file = NULL; -+ ia->ia_valid &= ~ATTR_FILE; -+ } -+ -+ a->h_path.mnt = au_sbr_mnt(sb, a->btgt); -+ if (ia->ia_valid & ATTR_SIZE) { -+ struct file *f; -+ -+ if (ia->ia_size < i_size_read(inode)) { -+ /* unmap only */ -+ err = vmtruncate(inode, ia->ia_size); -+ if (unlikely(err)) -+ goto out_unlock; -+ } -+ -+ f = 
NULL; -+ if (ia->ia_valid & ATTR_FILE) -+ f = ia->ia_file; -+ mutex_unlock(&a->h_inode->i_mutex); -+ err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); -+ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); -+ } else -+ err = vfsub_notify_change(&a->h_path, ia); -+ if (!err) -+ au_cpup_attr_changeable(inode); -+ -+ out_unlock: -+ mutex_unlock(&a->h_inode->i_mutex); -+ au_unpin(&a->pin); -+ di_write_unlock(dentry); -+ out_si: -+ if (file) { -+ fi_write_unlock(file); -+ ia->ia_file = file; -+ ia->ia_valid |= ATTR_FILE; -+ } -+ si_read_unlock(sb); -+ kfree(a); -+ out: -+ return err; -+} -+ -+static int au_getattr_lock_reval(struct dentry *dentry, unsigned int sigen) -+{ -+ int err; -+ struct inode *inode; -+ struct dentry *parent; -+ -+ err = 0; -+ inode = dentry->d_inode; -+ di_write_lock_child(dentry); -+ if (au_digen(dentry) != sigen || au_iigen(inode) != sigen) { -+ parent = dget_parent(dentry); -+ di_read_lock_parent(parent, AuLock_IR); -+ /* returns a number of positive dentries */ -+ err = au_refresh_hdentry(dentry, inode->i_mode & S_IFMT); -+ if (err > 0) -+ err = au_refresh_hinode(inode, dentry); -+ di_read_unlock(parent, AuLock_IR); -+ dput(parent); -+ if (unlikely(!err)) -+ err = -EIO; -+ } -+ di_downgrade_lock(dentry, AuLock_IR); -+ if (unlikely(err)) -+ di_read_unlock(dentry, AuLock_IR); -+ -+ return err; -+} -+ -+static void au_refresh_iattr(struct inode *inode, struct kstat *st, -+ unsigned int nlink) -+{ -+ inode->i_mode = st->mode; -+ inode->i_uid = st->uid; -+ inode->i_gid = st->gid; -+ inode->i_atime = st->atime; -+ inode->i_mtime = st->mtime; -+ inode->i_ctime = st->ctime; -+ -+ au_cpup_attr_nlink(inode, /*force*/0); -+ if (S_ISDIR(inode->i_mode)) { -+ inode->i_nlink -= nlink; -+ inode->i_nlink += st->nlink; -+ } -+ -+ spin_lock(&inode->i_lock); -+ inode->i_blocks = st->blocks; -+ i_size_write(inode, st->size); -+ spin_unlock(&inode->i_lock); -+} -+ -+static int aufs_getattr(struct vfsmount *mnt __maybe_unused, -+ struct dentry *dentry, struct kstat *st) -+{ -+ int err; -+ unsigned int mnt_flags; -+ aufs_bindex_t bindex; -+ unsigned char udba_none, positive; -+ struct super_block *sb, *h_sb; -+ struct inode *inode; -+ struct vfsmount *h_mnt; -+ struct dentry *h_dentry; -+ -+ err = 0; -+ sb = dentry->d_sb; -+ inode = dentry->d_inode; -+ si_read_lock(sb, AuLock_FLUSH); -+ mnt_flags = au_mntflags(sb); -+ udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); -+ -+ /* support fstat(2) */ -+ if (!d_unhashed(dentry) && !udba_none) { -+ unsigned int sigen = au_sigen(sb); -+ if (au_digen(dentry) == sigen && au_iigen(inode) == sigen) -+ di_read_lock_child(dentry, AuLock_IR); -+ else { -+ /* NFSD may skip the revalidation */ -+ if (!au_test_nfsd(current)) -+ AuDebugOn(!IS_ROOT(dentry)); -+ else { -+ err = au_busy_or_stale(); -+ if (unlikely(!IS_ROOT(dentry))) -+ goto out; -+ } -+ err = au_getattr_lock_reval(dentry, sigen); -+ if (unlikely(err)) -+ goto out; -+ } -+ } else -+ di_read_lock_child(dentry, AuLock_IR); -+ -+ bindex = au_ibstart(inode); -+ h_mnt = au_sbr_mnt(sb, bindex); -+ h_sb = h_mnt->mnt_sb; -+ if (!au_test_fs_bad_iattr(h_sb) && udba_none) -+ goto out_fill; /* success */ -+ -+ h_dentry = NULL; -+ if (au_dbstart(dentry) == bindex) -+ h_dentry = dget(au_h_dptr(dentry, bindex)); -+ else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { -+ h_dentry = au_plink_lkup(inode, bindex); -+ if (IS_ERR(h_dentry)) -+ goto out_fill; /* pretending success */ -+ } -+ /* illegally overlapped or something */ -+ if (unlikely(!h_dentry)) -+ goto out_fill; /* pretending 
success */ -+ -+ positive = !!h_dentry->d_inode; -+ if (positive) -+ err = vfs_getattr(h_mnt, h_dentry, st); -+ dput(h_dentry); -+ if (!err) { -+ if (positive) -+ au_refresh_iattr(inode, st, h_dentry->d_inode->i_nlink); -+ goto out_fill; /* success */ -+ } -+ goto out_unlock; -+ -+ out_fill: -+ generic_fillattr(inode, st); -+ out_unlock: -+ di_read_unlock(dentry, AuLock_IR); -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int h_readlink(struct dentry *dentry, int bindex, char __user *buf, -+ int bufsiz) -+{ -+ int err; -+ struct super_block *sb; -+ struct dentry *h_dentry; -+ -+ err = -EINVAL; -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (unlikely(/* !h_dentry -+ || !h_dentry->d_inode -+ || !h_dentry->d_inode->i_op -+ || */ !h_dentry->d_inode->i_op->readlink)) -+ goto out; -+ -+ err = security_inode_readlink(h_dentry); -+ if (unlikely(err)) -+ goto out; -+ -+ sb = dentry->d_sb; -+ if (!au_test_ro(sb, bindex, dentry->d_inode)) { -+ vfsub_touch_atime(au_sbr_mnt(sb, bindex), h_dentry); -+ fsstack_copy_attr_atime(dentry->d_inode, h_dentry->d_inode); -+ } -+ err = h_dentry->d_inode->i_op->readlink(h_dentry, buf, bufsiz); -+ -+ out: -+ return err; -+} -+ -+static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) -+{ -+ int err; -+ -+ aufs_read_lock(dentry, AuLock_IR); -+ err = h_readlink(dentry, au_dbstart(dentry), buf, bufsiz); -+ aufs_read_unlock(dentry, AuLock_IR); -+ -+ return err; -+} -+ -+static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd) -+{ -+ int err; -+ char *buf; -+ mm_segment_t old_fs; -+ -+ err = -ENOMEM; -+ buf = __getname(); -+ if (unlikely(!buf)) -+ goto out; -+ -+ aufs_read_lock(dentry, AuLock_IR); -+ old_fs = get_fs(); -+ set_fs(KERNEL_DS); -+ err = h_readlink(dentry, au_dbstart(dentry), (char __user *)buf, -+ PATH_MAX); -+ set_fs(old_fs); -+ aufs_read_unlock(dentry, AuLock_IR); -+ -+ if (err >= 0) { -+ buf[err] = 0; -+ /* will be freed by put_link */ -+ nd_set_link(nd, buf); -+ return NULL; /* success */ -+ } -+ __putname(buf); -+ -+ out: -+ path_put(&nd->path); -+ AuTraceErr(err); -+ return ERR_PTR(err); -+} -+ -+static void aufs_put_link(struct dentry *dentry __maybe_unused, -+ struct nameidata *nd, void *cookie __maybe_unused) -+{ -+ __putname(nd_get_link(nd)); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static void aufs_truncate_range(struct inode *inode __maybe_unused, -+ loff_t start __maybe_unused, -+ loff_t end __maybe_unused) -+{ -+ AuUnsupport(); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct inode_operations aufs_symlink_iop = { -+ .permission = aufs_permission, -+ .setattr = aufs_setattr, -+ .getattr = aufs_getattr, -+ .readlink = aufs_readlink, -+ .follow_link = aufs_follow_link, -+ .put_link = aufs_put_link -+}; -+ -+struct inode_operations aufs_dir_iop = { -+ .create = aufs_create, -+ .lookup = aufs_lookup, -+ .link = aufs_link, -+ .unlink = aufs_unlink, -+ .symlink = aufs_symlink, -+ .mkdir = aufs_mkdir, -+ .rmdir = aufs_rmdir, -+ .mknod = aufs_mknod, -+ .rename = aufs_rename, -+ -+ .permission = aufs_permission, -+ .setattr = aufs_setattr, -+ .getattr = aufs_getattr -+}; -+ -+struct inode_operations aufs_iop = { -+ .permission = aufs_permission, -+ .setattr = aufs_setattr, -+ .getattr = aufs_getattr, -+ .truncate_range = aufs_truncate_range -+}; -diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op_del.c 
linux-2.6.31/fs/aufs/i_op_del.c ---- linux-2.6.31-vanilla/fs/aufs/i_op_del.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/i_op_del.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,468 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inode operations (del entry) -+ */ -+ -+#include "aufs.h" -+ -+/* -+ * decide if a new whiteout for @dentry is necessary or not. -+ * when it is necessary, prepare the parent dir for the upper branch whose -+ * branch index is @bcpup for creation. the actual creation of the whiteout will -+ * be done by caller. -+ * return value: -+ * 0: wh is unnecessary -+ * plus: wh is necessary -+ * minus: error -+ */ -+int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) -+{ -+ int need_wh, err; -+ aufs_bindex_t bstart; -+ struct super_block *sb; -+ -+ sb = dentry->d_sb; -+ bstart = au_dbstart(dentry); -+ if (*bcpup < 0) { -+ *bcpup = bstart; -+ if (au_test_ro(sb, bstart, dentry->d_inode)) { -+ err = AuWbrCopyup(au_sbi(sb), dentry); -+ *bcpup = err; -+ if (unlikely(err < 0)) -+ goto out; -+ } -+ } else -+ AuDebugOn(bstart < *bcpup -+ || au_test_ro(sb, *bcpup, dentry->d_inode)); -+ AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart); -+ -+ if (*bcpup != bstart) { -+ err = au_cpup_dirs(dentry, *bcpup); -+ if (unlikely(err)) -+ goto out; -+ need_wh = 1; -+ } else { -+ aufs_bindex_t old_bend, new_bend, bdiropq = -1; -+ -+ old_bend = au_dbend(dentry); -+ if (isdir) { -+ bdiropq = au_dbdiropq(dentry); -+ au_set_dbdiropq(dentry, -1); -+ } -+ need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0, -+ /*nd*/NULL); -+ err = need_wh; -+ if (isdir) -+ au_set_dbdiropq(dentry, bdiropq); -+ if (unlikely(err < 0)) -+ goto out; -+ new_bend = au_dbend(dentry); -+ if (!need_wh && old_bend != new_bend) { -+ au_set_h_dptr(dentry, new_bend, NULL); -+ au_set_dbend(dentry, old_bend); -+ } -+ } -+ AuDbg("need_wh %d\n", need_wh); -+ err = need_wh; -+ -+ out: -+ return err; -+} -+ -+/* -+ * simple tests for the del-entry operations. -+ * following the checks in vfs, plus the parent-child relationship. 
-+ */ -+int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, -+ struct dentry *h_parent, int isdir) -+{ -+ int err; -+ umode_t h_mode; -+ struct dentry *h_dentry, *h_latest; -+ struct inode *h_inode; -+ -+ h_dentry = au_h_dptr(dentry, bindex); -+ h_inode = h_dentry->d_inode; -+ if (dentry->d_inode) { -+ err = -ENOENT; -+ if (unlikely(!h_inode || !h_inode->i_nlink)) -+ goto out; -+ -+ h_mode = h_inode->i_mode; -+ if (!isdir) { -+ err = -EISDIR; -+ if (unlikely(S_ISDIR(h_mode))) -+ goto out; -+ } else if (unlikely(!S_ISDIR(h_mode))) { -+ err = -ENOTDIR; -+ goto out; -+ } -+ } else { -+ /* rename(2) case */ -+ err = -EIO; -+ if (unlikely(h_inode)) -+ goto out; -+ } -+ -+ err = -ENOENT; -+ /* expected parent dir is locked */ -+ if (unlikely(h_parent != h_dentry->d_parent)) -+ goto out; -+ err = 0; -+ -+ /* -+ * rmdir a dir may break the consistency on some filesystem. -+ * let's try heavy test. -+ */ -+ err = -EACCES; -+ if (unlikely(au_test_h_perm(h_parent->d_inode, MAY_EXEC | MAY_WRITE))) -+ goto out; -+ -+ h_latest = au_sio_lkup_one(&dentry->d_name, h_parent, -+ au_sbr(dentry->d_sb, bindex)); -+ err = -EIO; -+ if (IS_ERR(h_latest)) -+ goto out; -+ if (h_latest == h_dentry) -+ err = 0; -+ dput(h_latest); -+ -+ out: -+ return err; -+} -+ -+/* -+ * decide the branch where we operate for @dentry. the branch index will be set -+ * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent -+ * dir for reverting. -+ * when a new whiteout is necessary, create it. -+ */ -+static struct dentry* -+lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, -+ struct au_dtime *dt, struct au_pin *pin) -+{ -+ struct dentry *wh_dentry; -+ struct super_block *sb; -+ struct path h_path; -+ int err, need_wh; -+ unsigned int udba; -+ aufs_bindex_t bcpup; -+ -+ need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); -+ wh_dentry = ERR_PTR(need_wh); -+ if (unlikely(need_wh < 0)) -+ goto out; -+ -+ sb = dentry->d_sb; -+ udba = au_opt_udba(sb); -+ bcpup = *rbcpup; -+ err = au_pin(pin, dentry, bcpup, udba, -+ AuPin_DI_LOCKED | AuPin_MNT_WRITE); -+ wh_dentry = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out; -+ -+ h_path.dentry = au_pinned_h_parent(pin); -+ if (udba != AuOpt_UDBA_NONE -+ && au_dbstart(dentry) == bcpup) { -+ err = au_may_del(dentry, bcpup, h_path.dentry, isdir); -+ wh_dentry = ERR_PTR(err); -+ if (unlikely(err)) -+ goto out_unpin; -+ } -+ -+ h_path.mnt = au_sbr_mnt(sb, bcpup); -+ au_dtime_store(dt, au_pinned_parent(pin), &h_path); -+ wh_dentry = NULL; -+ if (!need_wh) -+ goto out; /* success, no need to create whiteout */ -+ -+ wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); -+ if (!IS_ERR(wh_dentry)) -+ goto out; /* success */ -+ /* returns with the parent is locked and wh_dentry is dget-ed */ -+ -+ out_unpin: -+ au_unpin(pin); -+ out: -+ return wh_dentry; -+} -+ -+/* -+ * when removing a dir, rename it to a unique temporary whiteout-ed name first -+ * in order to be revertible and save time for removing many child whiteouts -+ * under the dir. -+ * returns 1 when there are too many child whiteout and caller should remove -+ * them asynchronously. returns 0 when the number of children is enough small to -+ * remove now or the branch fs is a remote fs. -+ * otherwise return an error. 
-+ */ -+static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, -+ struct au_nhash *whlist, struct inode *dir) -+{ -+ int rmdir_later, err, dirwh; -+ struct dentry *h_dentry; -+ struct super_block *sb; -+ -+ sb = dentry->d_sb; -+ SiMustAnyLock(sb); -+ h_dentry = au_h_dptr(dentry, bindex); -+ err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); -+ if (unlikely(err)) -+ goto out; -+ -+ /* stop monitoring */ -+ au_hin_free(au_hi(dentry->d_inode, bindex)); -+ -+ if (!au_test_fs_remote(h_dentry->d_sb)) { -+ dirwh = au_sbi(sb)->si_dirwh; -+ rmdir_later = (dirwh <= 1); -+ if (!rmdir_later) -+ rmdir_later = au_nhash_test_longer_wh(whlist, bindex, -+ dirwh); -+ if (rmdir_later) -+ return rmdir_later; -+ } -+ -+ err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); -+ if (unlikely(err)) { -+ AuIOErr("rmdir %.*s, b%d failed, %d. ignored\n", -+ AuDLNPair(h_dentry), bindex, err); -+ err = 0; -+ } -+ -+ out: -+ return err; -+} -+ -+/* -+ * final procedure for deleting a entry. -+ * maintain dentry and iattr. -+ */ -+static void epilog(struct inode *dir, struct dentry *dentry, -+ aufs_bindex_t bindex) -+{ -+ struct inode *inode; -+ -+ inode = dentry->d_inode; -+ d_drop(dentry); -+ inode->i_ctime = dir->i_ctime; -+ -+ if (atomic_read(&dentry->d_count) == 1) { -+ au_set_h_dptr(dentry, au_dbstart(dentry), NULL); -+ au_update_dbstart(dentry); -+ } -+ if (au_ibstart(dir) == bindex) -+ au_cpup_attr_timesizes(dir); -+ dir->i_version++; -+} -+ -+/* -+ * when an error happened, remove the created whiteout and revert everything. -+ */ -+static int do_revert(int err, struct inode *dir, aufs_bindex_t bwh, -+ struct dentry *wh_dentry, struct dentry *dentry, -+ struct au_dtime *dt) -+{ -+ int rerr; -+ struct path h_path = { -+ .dentry = wh_dentry, -+ .mnt = au_sbr_mnt(dir->i_sb, bwh) -+ }; -+ -+ rerr = au_wh_unlink_dentry(au_h_iptr(dir, bwh), &h_path, dentry); -+ if (!rerr) { -+ au_set_dbwh(dentry, bwh); -+ au_dtime_revert(dt); -+ return 0; -+ } -+ -+ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", -+ AuDLNPair(dentry), err, rerr); -+ return -EIO; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int aufs_unlink(struct inode *dir, struct dentry *dentry) -+{ -+ int err; -+ aufs_bindex_t bwh, bindex, bstart; -+ struct au_dtime dt; -+ struct au_pin pin; -+ struct path h_path; -+ struct inode *inode, *h_dir; -+ struct dentry *parent, *wh_dentry; -+ -+ IMustLock(dir); -+ inode = dentry->d_inode; -+ if (unlikely(!inode)) -+ return -ENOENT; /* possible? 
*/ -+ IMustLock(inode); -+ -+ aufs_read_lock(dentry, AuLock_DW); -+ parent = dentry->d_parent; /* dir inode is locked */ -+ di_write_lock_parent(parent); -+ -+ bstart = au_dbstart(dentry); -+ bwh = au_dbwh(dentry); -+ bindex = -1; -+ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &dt, &pin); -+ err = PTR_ERR(wh_dentry); -+ if (IS_ERR(wh_dentry)) -+ goto out; -+ -+ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); -+ h_path.dentry = au_h_dptr(dentry, bstart); -+ dget(h_path.dentry); -+ if (bindex == bstart) { -+ h_dir = au_pinned_h_dir(&pin); -+ err = vfsub_unlink(h_dir, &h_path, /*force*/0); -+ } else { -+ /* dir inode is locked */ -+ h_dir = wh_dentry->d_parent->d_inode; -+ IMustLock(h_dir); -+ err = 0; -+ } -+ -+ if (!err) { -+ drop_nlink(inode); -+ epilog(dir, dentry, bindex); -+ -+ /* update target timestamps */ -+ if (bindex == bstart) { -+ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ -+ inode->i_ctime = h_path.dentry->d_inode->i_ctime; -+ } else -+ /* todo: this timestamp may be reverted later */ -+ inode->i_ctime = h_dir->i_ctime; -+ goto out_unlock; /* success */ -+ } -+ -+ /* revert */ -+ if (wh_dentry) { -+ int rerr; -+ -+ rerr = do_revert(err, dir, bwh, wh_dentry, dentry, &dt); -+ if (rerr) -+ err = rerr; -+ } -+ -+ out_unlock: -+ au_unpin(&pin); -+ dput(wh_dentry); -+ dput(h_path.dentry); -+ out: -+ di_write_unlock(parent); -+ aufs_read_unlock(dentry, AuLock_DW); -+ return err; -+} -+ -+int aufs_rmdir(struct inode *dir, struct dentry *dentry) -+{ -+ int err, rmdir_later; -+ aufs_bindex_t bwh, bindex, bstart; -+ struct au_dtime dt; -+ struct au_pin pin; -+ struct inode *inode; -+ struct dentry *parent, *wh_dentry, *h_dentry; -+ struct au_whtmp_rmdir *args; -+ -+ IMustLock(dir); -+ inode = dentry->d_inode; -+ err = -ENOENT; /* possible? 
*/ -+ if (unlikely(!inode)) -+ goto out; -+ IMustLock(inode); -+ -+ aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH); -+ err = -ENOMEM; -+ args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); -+ if (unlikely(!args)) -+ goto out_unlock; -+ -+ parent = dentry->d_parent; /* dir inode is locked */ -+ di_write_lock_parent(parent); -+ err = au_test_empty(dentry, &args->whlist); -+ if (unlikely(err)) -+ goto out_args; -+ -+ bstart = au_dbstart(dentry); -+ bwh = au_dbwh(dentry); -+ bindex = -1; -+ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &dt, &pin); -+ err = PTR_ERR(wh_dentry); -+ if (IS_ERR(wh_dentry)) -+ goto out_args; -+ -+ h_dentry = au_h_dptr(dentry, bstart); -+ dget(h_dentry); -+ rmdir_later = 0; -+ if (bindex == bstart) { -+ err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir); -+ if (err > 0) { -+ rmdir_later = err; -+ err = 0; -+ } -+ } else { -+ /* stop monitoring */ -+ au_hin_free(au_hi(inode, bstart)); -+ -+ /* dir inode is locked */ -+ IMustLock(wh_dentry->d_parent->d_inode); -+ err = 0; -+ } -+ -+ if (!err) { -+ clear_nlink(inode); -+ au_set_dbdiropq(dentry, -1); -+ epilog(dir, dentry, bindex); -+ -+ if (rmdir_later) { -+ au_whtmp_kick_rmdir(dir, bstart, h_dentry, args); -+ args = NULL; -+ } -+ -+ goto out_unpin; /* success */ -+ } -+ -+ /* revert */ -+ AuLabel(revert); -+ if (wh_dentry) { -+ int rerr; -+ -+ rerr = do_revert(err, dir, bwh, wh_dentry, dentry, &dt); -+ if (rerr) -+ err = rerr; -+ } -+ -+ out_unpin: -+ au_unpin(&pin); -+ dput(wh_dentry); -+ dput(h_dentry); -+ out_args: -+ di_write_unlock(parent); -+ if (args) -+ au_whtmp_rmdir_free(args); -+ out_unlock: -+ aufs_read_unlock(dentry, AuLock_DW); -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op_ren.c linux-2.6.31/fs/aufs/i_op_ren.c ---- linux-2.6.31-vanilla/fs/aufs/i_op_ren.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/i_op_ren.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,957 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * inode operation (rename entry) -+ * todo: this is crazy monster -+ */ -+ -+#include "aufs.h" -+ -+enum { AuSRC, AuDST, AuSrcDst }; -+enum { AuPARENT, AuCHILD, AuParentChild }; -+ -+#define AuRen_ISDIR 1 -+#define AuRen_ISSAMEDIR (1 << 1) -+#define AuRen_WHSRC (1 << 2) -+#define AuRen_WHDST (1 << 3) -+#define AuRen_MNT_WRITE (1 << 4) -+#define AuRen_DT_DSTDIR (1 << 5) -+#define AuRen_DIROPQ (1 << 6) -+#define AuRen_CPUP (1 << 7) -+#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) -+#define au_fset_ren(flags, name) { (flags) |= AuRen_##name; } -+#define au_fclr_ren(flags, name) { (flags) &= ~AuRen_##name; } -+ -+struct au_ren_args { -+ struct { -+ struct dentry *dentry, *h_dentry, *parent, *h_parent, -+ *wh_dentry; -+ struct inode *dir, *inode; -+ struct au_hinode *hdir; -+ struct au_dtime dt[AuParentChild]; -+ aufs_bindex_t bstart; -+ } sd[AuSrcDst]; -+ -+#define src_dentry sd[AuSRC].dentry -+#define src_dir sd[AuSRC].dir -+#define src_inode sd[AuSRC].inode -+#define src_h_dentry sd[AuSRC].h_dentry -+#define src_parent sd[AuSRC].parent -+#define src_h_parent sd[AuSRC].h_parent -+#define src_wh_dentry sd[AuSRC].wh_dentry -+#define src_hdir sd[AuSRC].hdir -+#define src_h_dir sd[AuSRC].hdir->hi_inode -+#define src_dt sd[AuSRC].dt -+#define src_bstart sd[AuSRC].bstart -+ -+#define dst_dentry sd[AuDST].dentry -+#define dst_dir sd[AuDST].dir -+#define dst_inode sd[AuDST].inode -+#define dst_h_dentry sd[AuDST].h_dentry -+#define dst_parent sd[AuDST].parent -+#define dst_h_parent sd[AuDST].h_parent -+#define dst_wh_dentry sd[AuDST].wh_dentry -+#define dst_hdir sd[AuDST].hdir -+#define dst_h_dir sd[AuDST].hdir->hi_inode -+#define dst_dt sd[AuDST].dt -+#define dst_bstart sd[AuDST].bstart -+ -+ struct dentry *h_trap; -+ struct au_branch *br; -+ struct au_hinode *src_hinode; -+ struct path h_path; -+ struct au_nhash whlist; -+ aufs_bindex_t btgt; -+ -+ unsigned int flags; -+ -+ struct au_whtmp_rmdir *thargs; -+ struct dentry *h_dst; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * functions for reverting. -+ * when an error happened in a single rename systemcall, we should revert -+ * everything as if nothing happend. -+ * we don't need to revert the copied-up/down the parent dir since they are -+ * harmless. -+ */ -+ -+#define RevertFailure(fmt, args...) 
do { \ -+ AuIOErr("revert failure: " fmt " (%d, %d)\n", \ -+ ##args, err, rerr); \ -+ err = -EIO; \ -+} while (0) -+ -+static void au_ren_rev_diropq(int err, struct au_ren_args *a) -+{ -+ int rerr; -+ -+ au_hin_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); -+ rerr = au_diropq_remove(a->src_dentry, a->btgt); -+ au_hin_imtx_unlock(a->src_hinode); -+ if (rerr) -+ RevertFailure("remove diropq %.*s", AuDLNPair(a->src_dentry)); -+} -+ -+ -+static void au_ren_rev_rename(int err, struct au_ren_args *a) -+{ -+ int rerr; -+ -+ a->h_path.dentry = au_lkup_one(&a->src_dentry->d_name, a->src_h_parent, -+ a->br, /*nd*/NULL); -+ rerr = PTR_ERR(a->h_path.dentry); -+ if (IS_ERR(a->h_path.dentry)) { -+ RevertFailure("au_lkup_one %.*s", AuDLNPair(a->src_dentry)); -+ return; -+ } -+ -+ rerr = vfsub_rename(a->dst_h_dir, -+ au_h_dptr(a->src_dentry, a->btgt), -+ a->src_h_dir, &a->h_path); -+ d_drop(a->h_path.dentry); -+ dput(a->h_path.dentry); -+ /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ -+ if (rerr) -+ RevertFailure("rename %.*s", AuDLNPair(a->src_dentry)); -+} -+ -+static void au_ren_rev_cpup(int err, struct au_ren_args *a) -+{ -+ int rerr; -+ -+ a->h_path.dentry = a->dst_h_dentry; -+ rerr = vfsub_unlink(a->dst_h_dir, &a->h_path, /*force*/0); -+ au_set_h_dptr(a->src_dentry, a->btgt, NULL); -+ au_set_dbstart(a->src_dentry, a->src_bstart); -+ if (rerr) -+ RevertFailure("unlink %.*s", AuDLNPair(a->dst_h_dentry)); -+} -+ -+ -+static void au_ren_rev_whtmp(int err, struct au_ren_args *a) -+{ -+ int rerr; -+ -+ a->h_path.dentry = au_lkup_one(&a->dst_dentry->d_name, a->dst_h_parent, -+ a->br, /*nd*/NULL); -+ rerr = PTR_ERR(a->h_path.dentry); -+ if (IS_ERR(a->h_path.dentry)) { -+ RevertFailure("lookup %.*s", AuDLNPair(a->dst_dentry)); -+ return; -+ } -+ if (a->h_path.dentry->d_inode) { -+ d_drop(a->h_path.dentry); -+ dput(a->h_path.dentry); -+ return; -+ } -+ -+ rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path); -+ d_drop(a->h_path.dentry); -+ dput(a->h_path.dentry); -+ if (!rerr) { -+ au_set_h_dptr(a->dst_dentry, a->btgt, NULL); -+ au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); -+ } else -+ RevertFailure("rename %.*s", AuDLNPair(a->h_dst)); -+} -+ -+static void au_ren_rev_whsrc(int err, struct au_ren_args *a) -+{ -+ int rerr; -+ -+ a->h_path.dentry = a->src_wh_dentry; -+ rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); -+ if (rerr) -+ RevertFailure("unlink %.*s", AuDLNPair(a->src_wh_dentry)); -+} -+ -+static void au_ren_rev_drop(struct au_ren_args *a) -+{ -+ struct dentry *d, *h_d; -+ int i; -+ aufs_bindex_t bend, bindex; -+ -+ for (i = 0; i < AuSrcDst; i++) { -+ d = a->sd[i].dentry; -+ d_drop(d); -+ bend = au_dbend(d); -+ for (bindex = au_dbstart(d); bindex <= bend; bindex++) { -+ h_d = au_h_dptr(d, bindex); -+ if (h_d) -+ d_drop(h_d); -+ } -+ } -+ -+ au_update_dbstart(a->dst_dentry); -+ if (a->thargs) -+ d_drop(a->h_dst); -+} -+#undef RevertFailure -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * when we have to copyup the renaming entry, do it with the rename-target name -+ * in order to minimize the cost (the later actual rename is unnecessary). -+ * otherwise rename it on the target branch. 
-+ */ -+static int au_ren_or_cpup(struct au_ren_args *a) -+{ -+ int err; -+ struct dentry *d; -+ -+ d = a->src_dentry; -+ if (au_dbstart(d) == a->btgt) { -+ a->h_path.dentry = a->dst_h_dentry; -+ if (au_ftest_ren(a->flags, DIROPQ) -+ && au_dbdiropq(d) == a->btgt) -+ au_fclr_ren(a->flags, DIROPQ); -+ AuDebugOn(au_dbstart(d) != a->btgt); -+ err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), -+ a->dst_h_dir, &a->h_path); -+ } else { -+ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; -+ -+ au_fset_ren(a->flags, CPUP); -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ au_set_dbstart(d, a->btgt); -+ au_set_h_dptr(d, a->btgt, dget(a->dst_h_dentry)); -+ err = au_sio_cpup_single(d, a->btgt, a->src_bstart, -1, -+ !AuCpup_DTIME, a->dst_parent); -+ if (unlikely(err)) { -+ au_set_h_dptr(d, a->btgt, NULL); -+ au_set_dbstart(d, a->src_bstart); -+ } -+ mutex_unlock(h_mtx); -+ } -+ -+ return err; -+} -+ -+/* cf. aufs_rmdir() */ -+static int au_ren_del_whtmp(struct au_ren_args *a) -+{ -+ int err; -+ struct inode *dir; -+ -+ dir = a->dst_dir; -+ SiMustAnyLock(dir->i_sb); -+ if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, -+ au_sbi(dir->i_sb)->si_dirwh) -+ || au_test_fs_remote(a->h_dst->d_sb)) { -+ err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); -+ if (unlikely(err)) -+ AuWarn("failed removing whtmp dir %.*s (%d), " -+ "ignored.\n", AuDLNPair(a->h_dst), err); -+ } else { -+ au_nhash_wh_free(&a->thargs->whlist); -+ a->thargs->whlist = a->whlist; -+ a->whlist.nh_num = 0; -+ au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); -+ dput(a->h_dst); -+ a->thargs = NULL; -+ } -+ -+ return 0; -+} -+ -+/* make it 'opaque' dir. */ -+static int au_ren_diropq(struct au_ren_args *a) -+{ -+ int err; -+ struct dentry *diropq; -+ -+ err = 0; -+ a->src_hinode = au_hi(a->src_inode, a->btgt); -+ au_hin_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); -+ diropq = au_diropq_create(a->src_dentry, a->btgt); -+ au_hin_imtx_unlock(a->src_hinode); -+ if (IS_ERR(diropq)) -+ err = PTR_ERR(diropq); -+ dput(diropq); -+ -+ return err; -+} -+ -+static int do_rename(struct au_ren_args *a) -+{ -+ int err; -+ struct dentry *d, *h_d; -+ -+ /* prepare workqueue args for asynchronous rmdir */ -+ h_d = a->dst_h_dentry; -+ if (au_ftest_ren(a->flags, ISDIR) && h_d->d_inode) { -+ err = -ENOMEM; -+ a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); -+ if (unlikely(!a->thargs)) -+ goto out; -+ a->h_dst = dget(h_d); -+ } -+ -+ /* create whiteout for src_dentry */ -+ if (au_ftest_ren(a->flags, WHSRC)) { -+ a->src_wh_dentry -+ = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); -+ err = PTR_ERR(a->src_wh_dentry); -+ if (IS_ERR(a->src_wh_dentry)) -+ goto out_thargs; -+ } -+ -+ /* lookup whiteout for dentry */ -+ if (au_ftest_ren(a->flags, WHDST)) { -+ h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, -+ a->br); -+ err = PTR_ERR(h_d); -+ if (IS_ERR(h_d)) -+ goto out_whsrc; -+ if (!h_d->d_inode) -+ dput(h_d); -+ else -+ a->dst_wh_dentry = h_d; -+ } -+ -+ /* rename dentry to tmpwh */ -+ if (a->thargs) { -+ err = au_whtmp_ren(a->dst_h_dentry, a->br); -+ if (unlikely(err)) -+ goto out_whdst; -+ -+ d = a->dst_dentry; -+ au_set_h_dptr(d, a->btgt, NULL); -+ err = au_lkup_neg(d, a->btgt); -+ if (unlikely(err)) -+ goto out_whtmp; -+ a->dst_h_dentry = au_h_dptr(d, a->btgt); -+ } -+ -+ /* cpup src */ -+ if (a->dst_h_dentry->d_inode && a->src_bstart != a->btgt) { -+ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; -+ -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ err = au_sio_cpup_simple(a->src_dentry, 
a->btgt, -1, -+ !AuCpup_DTIME); -+ mutex_unlock(h_mtx); -+ if (unlikely(err)) -+ goto out_whtmp; -+ } -+ -+ /* rename by vfs_rename or cpup */ -+ d = a->dst_dentry; -+ if (au_ftest_ren(a->flags, ISDIR) -+ && (a->dst_wh_dentry -+ || au_dbdiropq(d) == a->btgt -+ /* hide the lower to keep xino */ -+ || a->btgt < au_dbend(d) -+ || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) -+ au_fset_ren(a->flags, DIROPQ); -+ err = au_ren_or_cpup(a); -+ if (unlikely(err)) -+ /* leave the copied-up one */ -+ goto out_whtmp; -+ -+ /* make dir opaque */ -+ if (au_ftest_ren(a->flags, DIROPQ)) { -+ err = au_ren_diropq(a); -+ if (unlikely(err)) -+ goto out_rename; -+ } -+ -+ /* update target timestamps */ -+ AuDebugOn(au_dbstart(a->src_dentry) != a->btgt); -+ a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); -+ vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ -+ a->src_inode->i_ctime = a->h_path.dentry->d_inode->i_ctime; -+ -+ /* remove whiteout for dentry */ -+ if (a->dst_wh_dentry) { -+ a->h_path.dentry = a->dst_wh_dentry; -+ err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, -+ a->dst_dentry); -+ if (unlikely(err)) -+ goto out_diropq; -+ } -+ -+ /* remove whtmp */ -+ if (a->thargs) -+ au_ren_del_whtmp(a); /* ignore this error */ -+ -+ err = 0; -+ goto out_success; -+ -+ out_diropq: -+ if (au_ftest_ren(a->flags, DIROPQ)) -+ au_ren_rev_diropq(err, a); -+ out_rename: -+ if (!au_ftest_ren(a->flags, CPUP)) -+ au_ren_rev_rename(err, a); -+ else -+ au_ren_rev_cpup(err, a); -+ out_whtmp: -+ if (a->thargs) -+ au_ren_rev_whtmp(err, a); -+ out_whdst: -+ dput(a->dst_wh_dentry); -+ a->dst_wh_dentry = NULL; -+ out_whsrc: -+ if (a->src_wh_dentry) -+ au_ren_rev_whsrc(err, a); -+ au_ren_rev_drop(a); -+ out_success: -+ dput(a->src_wh_dentry); -+ dput(a->dst_wh_dentry); -+ out_thargs: -+ if (a->thargs) { -+ dput(a->h_dst); -+ au_whtmp_rmdir_free(a->thargs); -+ a->thargs = NULL; -+ } -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * test if @dentry dir can be rename destination or not. -+ * success means, it is a logically empty dir. -+ */ -+static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) -+{ -+ return au_test_empty(dentry, whlist); -+} -+ -+/* -+ * test if @dentry dir can be rename source or not. -+ * if it can, return 0 and @children is filled. -+ * success means, -+ * - it is a logically empty dir. -+ * - or, it exists on writable branch and has no children including whiteouts -+ * on the lower branch. 
-+ */ -+static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) -+{ -+ int err; -+ unsigned int rdhash; -+ aufs_bindex_t bstart; -+ -+ bstart = au_dbstart(dentry); -+ if (bstart != btgt) { -+ struct au_nhash whlist; -+ -+ SiMustAnyLock(dentry->d_sb); -+ rdhash = au_sbi(dentry->d_sb)->si_rdhash; -+ if (!rdhash) -+ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, -+ dentry)); -+ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); -+ if (unlikely(err)) -+ goto out; -+ err = au_test_empty(dentry, &whlist); -+ au_nhash_wh_free(&whlist); -+ goto out; -+ } -+ -+ if (bstart == au_dbtaildir(dentry)) -+ return 0; /* success */ -+ -+ err = au_test_empty_lower(dentry); -+ -+ out: -+ if (err == -ENOTEMPTY) { -+ AuWarn1("renaming dir who has child(ren) on multiple branches," -+ " is not supported\n"); -+ err = -EXDEV; -+ } -+ return err; -+} -+ -+/* side effect: sets whlist and h_dentry */ -+static int au_ren_may_dir(struct au_ren_args *a) -+{ -+ int err; -+ unsigned int rdhash; -+ struct dentry *d; -+ -+ d = a->dst_dentry; -+ SiMustAnyLock(d->d_sb); -+ -+ err = 0; -+ if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { -+ rdhash = au_sbi(d->d_sb)->si_rdhash; -+ if (!rdhash) -+ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); -+ err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); -+ if (unlikely(err)) -+ goto out; -+ -+ au_set_dbstart(d, a->dst_bstart); -+ err = may_rename_dstdir(d, &a->whlist); -+ au_set_dbstart(d, a->btgt); -+ } -+ a->dst_h_dentry = au_h_dptr(d, au_dbstart(d)); -+ if (unlikely(err)) -+ goto out; -+ -+ d = a->src_dentry; -+ a->src_h_dentry = au_h_dptr(d, au_dbstart(d)); -+ if (au_ftest_ren(a->flags, ISDIR)) { -+ err = may_rename_srcdir(d, a->btgt); -+ if (unlikely(err)) { -+ au_nhash_wh_free(&a->whlist); -+ a->whlist.nh_num = 0; -+ } -+ } -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * simple tests for rename. -+ * following the checks in vfs, plus the parent-child relationship. 
-+ */
-+static int au_may_ren(struct au_ren_args *a)
-+{
-+	int err, isdir;
-+	struct inode *h_inode;
-+
-+	if (a->src_bstart == a->btgt) {
-+		err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent,
-+				 au_ftest_ren(a->flags, ISDIR));
-+		if (unlikely(err))
-+			goto out;
-+		err = -EINVAL;
-+		if (unlikely(a->src_h_dentry == a->h_trap))
-+			goto out;
-+	}
-+
-+	err = 0;
-+	if (a->dst_bstart != a->btgt)
-+		goto out;
-+
-+	err = -EIO;
-+	h_inode = a->dst_h_dentry->d_inode;
-+	isdir = !!au_ftest_ren(a->flags, ISDIR);
-+	if (!a->dst_dentry->d_inode) {
-+		if (unlikely(h_inode))
-+			goto out;
-+		err = au_may_add(a->dst_dentry, a->btgt, a->dst_h_parent,
-+				 isdir);
-+	} else {
-+		if (unlikely(!h_inode || !h_inode->i_nlink))
-+			goto out;
-+		err = au_may_del(a->dst_dentry, a->btgt, a->dst_h_parent,
-+				 isdir);
-+		if (unlikely(err))
-+			goto out;
-+		err = -ENOTEMPTY;
-+		if (unlikely(a->dst_h_dentry == a->h_trap))
-+			goto out;
-+		err = 0;
-+	}
-+
-+ out:
-+	if (unlikely(err == -ENOENT || err == -EEXIST))
-+		err = -EIO;
-+	return err;
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+/*
-+ * locking order
-+ * (VFS)
-+ * - src_dir and dir by lock_rename()
-+ * - inode if exists
-+ * (aufs)
-+ * - lock all
-+ *   + src_dentry and dentry by aufs_read_and_write_lock2() which calls,
-+ *     + si_read_lock
-+ *     + di_write_lock2_child()
-+ *       + di_write_lock_child()
-+ *       + ii_write_lock_child()
-+ *       + di_write_lock_child2()
-+ *       + ii_write_lock_child2()
-+ *   + src_parent and parent
-+ *     + di_write_lock_parent()
-+ *       + ii_write_lock_parent()
-+ *     + di_write_lock_parent2()
-+ *       + ii_write_lock_parent2()
-+ *   + lower src_dir and dir by vfsub_lock_rename()
-+ *   + verify every relationship between child and parent. if any
-+ *     of them fails, unlock all and return -EBUSY.
-+ */ -+static void au_ren_unlock(struct au_ren_args *a) -+{ -+ struct super_block *sb; -+ -+ sb = a->dst_dentry->d_sb; -+ if (au_ftest_ren(a->flags, MNT_WRITE)) -+ mnt_drop_write(a->br->br_mnt); -+ vfsub_unlock_rename(a->src_h_parent, a->src_hdir, -+ a->dst_h_parent, a->dst_hdir); -+} -+ -+static int au_ren_lock(struct au_ren_args *a) -+{ -+ int err; -+ unsigned int udba; -+ -+ err = 0; -+ a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); -+ a->src_hdir = au_hi(a->src_dir, a->btgt); -+ a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); -+ a->dst_hdir = au_hi(a->dst_dir, a->btgt); -+ a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, -+ a->dst_h_parent, a->dst_hdir); -+ udba = au_opt_udba(a->src_dentry->d_sb); -+ if (unlikely(a->src_hdir->hi_inode != a->src_h_parent->d_inode -+ || a->dst_hdir->hi_inode != a->dst_h_parent->d_inode)) -+ err = au_busy_or_stale(); -+ if (!err && au_dbstart(a->src_dentry) == a->btgt) -+ err = au_h_verify(a->src_h_dentry, udba, -+ a->src_h_parent->d_inode, a->src_h_parent, -+ a->br); -+ if (!err && au_dbstart(a->dst_dentry) == a->btgt) -+ err = au_h_verify(a->dst_h_dentry, udba, -+ a->dst_h_parent->d_inode, a->dst_h_parent, -+ a->br); -+ if (!err) { -+ err = mnt_want_write(a->br->br_mnt); -+ if (unlikely(err)) -+ goto out_unlock; -+ au_fset_ren(a->flags, MNT_WRITE); -+ goto out; /* success */ -+ } -+ -+ err = au_busy_or_stale(); -+ -+ out_unlock: -+ au_ren_unlock(a); -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static void au_ren_refresh_dir(struct au_ren_args *a) -+{ -+ struct inode *dir; -+ -+ dir = a->dst_dir; -+ dir->i_version++; -+ if (au_ftest_ren(a->flags, ISDIR)) { -+ /* is this updating defined in POSIX? */ -+ au_cpup_attr_timesizes(a->src_inode); -+ au_cpup_attr_nlink(dir, /*force*/1); -+ if (a->dst_inode) { -+ clear_nlink(a->dst_inode); -+ au_cpup_attr_timesizes(a->dst_inode); -+ } -+ } -+ if (au_ibstart(dir) == a->btgt) -+ au_cpup_attr_timesizes(dir); -+ -+ if (au_ftest_ren(a->flags, ISSAMEDIR)) -+ return; -+ -+ dir = a->src_dir; -+ dir->i_version++; -+ if (au_ftest_ren(a->flags, ISDIR)) -+ au_cpup_attr_nlink(dir, /*force*/1); -+ if (au_ibstart(dir) == a->btgt) -+ au_cpup_attr_timesizes(dir); -+} -+ -+static void au_ren_refresh(struct au_ren_args *a) -+{ -+ aufs_bindex_t bend, bindex; -+ struct dentry *d, *h_d; -+ struct inode *i, *h_i; -+ struct super_block *sb; -+ -+ d = a->src_dentry; -+ au_set_dbwh(d, -1); -+ bend = au_dbend(d); -+ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { -+ h_d = au_h_dptr(d, bindex); -+ if (h_d) -+ au_set_h_dptr(d, bindex, NULL); -+ } -+ au_set_dbend(d, a->btgt); -+ -+ sb = d->d_sb; -+ i = a->src_inode; -+ if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) -+ return; /* success */ -+ -+ bend = au_ibend(i); -+ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { -+ h_i = au_h_iptr(i, bindex); -+ if (h_i) { -+ au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); -+ /* ignore this error */ -+ au_set_h_iptr(i, bindex, NULL, 0); -+ } -+ } -+ au_set_ibend(i, a->btgt); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* mainly for link(2) and rename(2) */ -+int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) -+{ -+ aufs_bindex_t bdiropq, bwh; -+ struct dentry *parent; -+ struct au_branch *br; -+ -+ parent = dentry->d_parent; -+ IMustLock(parent->d_inode); /* dir is locked */ -+ -+ bdiropq = au_dbdiropq(parent); -+ bwh = au_dbwh(dentry); -+ br = au_sbr(dentry->d_sb, btgt); -+ if 
(au_br_rdonly(br) -+ || (0 <= bdiropq && bdiropq < btgt) -+ || (0 <= bwh && bwh < btgt)) -+ btgt = -1; -+ -+ AuDbg("btgt %d\n", btgt); -+ return btgt; -+} -+ -+/* sets src_bstart, dst_bstart and btgt */ -+static int au_ren_wbr(struct au_ren_args *a) -+{ -+ int err; -+ struct au_wr_dir_args wr_dir_args = { -+ /* .force_btgt = -1, */ -+ .flags = AuWrDir_ADD_ENTRY -+ }; -+ -+ a->src_bstart = au_dbstart(a->src_dentry); -+ a->dst_bstart = au_dbstart(a->dst_dentry); -+ if (au_ftest_ren(a->flags, ISDIR)) -+ au_fset_wrdir(wr_dir_args.flags, ISDIR); -+ wr_dir_args.force_btgt = a->src_bstart; -+ if (a->dst_inode && a->dst_bstart < a->src_bstart) -+ wr_dir_args.force_btgt = a->dst_bstart; -+ wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); -+ err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); -+ a->btgt = err; -+ -+ return err; -+} -+ -+static void au_ren_dt(struct au_ren_args *a) -+{ -+ a->h_path.dentry = a->src_h_parent; -+ au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); -+ if (!au_ftest_ren(a->flags, ISSAMEDIR)) { -+ a->h_path.dentry = a->dst_h_parent; -+ au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); -+ } -+ -+ au_fclr_ren(a->flags, DT_DSTDIR); -+ if (!au_ftest_ren(a->flags, ISDIR)) -+ return; -+ -+ a->h_path.dentry = a->src_h_dentry; -+ au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); -+ if (a->dst_h_dentry->d_inode) { -+ au_fset_ren(a->flags, DT_DSTDIR); -+ a->h_path.dentry = a->dst_h_dentry; -+ au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); -+ } -+} -+ -+static void au_ren_rev_dt(int err, struct au_ren_args *a) -+{ -+ struct dentry *h_d; -+ struct mutex *h_mtx; -+ -+ au_dtime_revert(a->src_dt + AuPARENT); -+ if (!au_ftest_ren(a->flags, ISSAMEDIR)) -+ au_dtime_revert(a->dst_dt + AuPARENT); -+ -+ if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { -+ h_d = a->src_dt[AuCHILD].dt_h_path.dentry; -+ h_mtx = &h_d->d_inode->i_mutex; -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ au_dtime_revert(a->src_dt + AuCHILD); -+ mutex_unlock(h_mtx); -+ -+ if (au_ftest_ren(a->flags, DT_DSTDIR)) { -+ h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; -+ h_mtx = &h_d->d_inode->i_mutex; -+ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); -+ au_dtime_revert(a->dst_dt + AuCHILD); -+ mutex_unlock(h_mtx); -+ } -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, -+ struct inode *_dst_dir, struct dentry *_dst_dentry) -+{ -+ int err; -+ /* reduce stack space */ -+ struct au_ren_args *a; -+ -+ IMustLock(_src_dir); -+ IMustLock(_dst_dir); -+ -+ err = -ENOMEM; -+ BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); -+ a = kzalloc(sizeof(*a), GFP_NOFS); -+ if (unlikely(!a)) -+ goto out; -+ -+ a->src_dir = _src_dir; -+ a->src_dentry = _src_dentry; -+ a->src_inode = a->src_dentry->d_inode; -+ a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */ -+ a->dst_dir = _dst_dir; -+ a->dst_dentry = _dst_dentry; -+ a->dst_inode = a->dst_dentry->d_inode; -+ a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ -+ if (a->dst_inode) { -+ IMustLock(a->dst_inode); -+ au_igrab(a->dst_inode); -+ } -+ -+ err = -ENOTDIR; -+ if (S_ISDIR(a->src_inode->i_mode)) { -+ au_fset_ren(a->flags, ISDIR); -+ if (unlikely(a->dst_inode && !S_ISDIR(a->dst_inode->i_mode))) -+ goto out_free; -+ aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, -+ AuLock_DIR | AuLock_FLUSH); -+ } else -+ aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, -+ 
AuLock_FLUSH); -+ -+ au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ -+ di_write_lock_parent(a->dst_parent); -+ -+ /* which branch we process */ -+ err = au_ren_wbr(a); -+ if (unlikely(err < 0)) -+ goto out_unlock; -+ a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); -+ a->h_path.mnt = a->br->br_mnt; -+ -+ /* are they available to be renamed */ -+ err = au_ren_may_dir(a); -+ if (unlikely(err)) -+ goto out_children; -+ -+ /* prepare the writable parent dir on the same branch */ -+ if (a->dst_bstart == a->btgt) { -+ au_fset_ren(a->flags, WHDST); -+ } else { -+ err = au_cpup_dirs(a->dst_dentry, a->btgt); -+ if (unlikely(err)) -+ goto out_children; -+ } -+ -+ if (a->src_dir != a->dst_dir) { -+ /* -+ * this temporary unlock is safe, -+ * because both dir->i_mutex are locked. -+ */ -+ di_write_unlock(a->dst_parent); -+ di_write_lock_parent(a->src_parent); -+ err = au_wr_dir_need_wh(a->src_dentry, -+ au_ftest_ren(a->flags, ISDIR), -+ &a->btgt); -+ di_write_unlock(a->src_parent); -+ di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); -+ au_fclr_ren(a->flags, ISSAMEDIR); -+ } else -+ err = au_wr_dir_need_wh(a->src_dentry, -+ au_ftest_ren(a->flags, ISDIR), -+ &a->btgt); -+ if (unlikely(err < 0)) -+ goto out_children; -+ if (err) -+ au_fset_ren(a->flags, WHSRC); -+ -+ /* lock them all */ -+ err = au_ren_lock(a); -+ if (unlikely(err)) -+ goto out_children; -+ -+ if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) { -+ err = au_may_ren(a); -+ if (unlikely(err)) -+ goto out_hdir; -+ } -+ -+ /* store timestamps to be revertible */ -+ au_ren_dt(a); -+ -+ /* here we go */ -+ err = do_rename(a); -+ if (unlikely(err)) -+ goto out_dt; -+ -+ /* update dir attributes */ -+ au_ren_refresh_dir(a); -+ -+ /* dput/iput all lower dentries */ -+ au_ren_refresh(a); -+ -+ goto out_hdir; /* success */ -+ -+ out_dt: -+ au_ren_rev_dt(err, a); -+ out_hdir: -+ au_ren_unlock(a); -+ out_children: -+ au_nhash_wh_free(&a->whlist); -+ out_unlock: -+ if (unlikely(err && au_ftest_ren(a->flags, ISDIR))) { -+ au_update_dbstart(a->dst_dentry); -+ d_drop(a->dst_dentry); -+ } -+ if (!err) -+ d_move(a->src_dentry, a->dst_dentry); -+ if (au_ftest_ren(a->flags, ISSAMEDIR)) -+ di_write_unlock(a->dst_parent); -+ else -+ di_write_unlock2(a->src_parent, a->dst_parent); -+ aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); -+ out_free: -+ iput(a->dst_inode); -+ if (a->thargs) -+ au_whtmp_rmdir_free(a->thargs); -+ kfree(a); -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/Kconfig linux-2.6.31/fs/aufs/Kconfig ---- linux-2.6.31-vanilla/fs/aufs/Kconfig 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/Kconfig 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,140 @@ -+config AUFS_FS -+ tristate "Aufs (Advanced multi layered unification filesystem) support" -+ depends on EXPERIMENTAL -+ help -+ Aufs is a stackable unification filesystem such as Unionfs, -+ which unifies several directories and provides a merged single -+ directory. -+ In the early days, aufs was entirely re-designed and -+ re-implemented Unionfs Version 1.x series. Introducing many -+ original ideas, approaches and improvements, it becomes totally -+ different from Unionfs while keeping the basic features. -+ -+if AUFS_FS -+choice -+ prompt "Maximum number of branches" -+ default AUFS_BRANCH_MAX_127 -+ help -+ Specifies the maximum number of branches (or member directories) -+ in a single aufs. The larger value consumes more system -+ resources and has a minor impact to performance. 
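
The Kconfig choice above fixes the branch count at build time; aufs then addresses branches with a small signed index (the aufs_bindex_t loops seen throughout this patch). A standalone sketch of that bounded-index pattern, with hypothetical names; the scan mirrors, in spirit, the writable-branch search in aufs_permission() earlier in this patch:

	/* hypothetical sketch of a bounded branch index; not aufs code */
	#include <stdio.h>

	#define BRANCH_MAX 127		/* mirrors AUFS_BRANCH_MAX_127 */
	typedef short bindex_t;		/* -1 means "no branch" */

	struct branch { int readonly; };

	/* scan the branches for a writable one */
	static bindex_t first_writable(const struct branch *br, bindex_t bend)
	{
		bindex_t bi;

		for (bi = 0; bi <= bend; bi++)
			if (!br[bi].readonly)
				return bi;
		return -1;
	}

	int main(void)
	{
		struct branch br[BRANCH_MAX] = { { 1 }, { 0 } };

		printf("first writable branch: %d\n", first_writable(br, 1));
		return 0;
	}
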
-+config AUFS_BRANCH_MAX_127
-+	bool "127"
-+	help
-+	  Specifies the maximum number of branches (or member directories)
-+	  in a single aufs. A larger value consumes more system
-+	  resources and has a minor impact on performance.
-+config AUFS_BRANCH_MAX_511
-+	bool "511"
-+	help
-+	  Specifies the maximum number of branches (or member directories)
-+	  in a single aufs. A larger value consumes more system
-+	  resources and has a minor impact on performance.
-+config AUFS_BRANCH_MAX_1023
-+	bool "1023"
-+	help
-+	  Specifies the maximum number of branches (or member directories)
-+	  in a single aufs. A larger value consumes more system
-+	  resources and has a minor impact on performance.
-+config AUFS_BRANCH_MAX_32767
-+	bool "32767"
-+	help
-+	  Specifies the maximum number of branches (or member directories)
-+	  in a single aufs. A larger value consumes more system
-+	  resources and has a minor impact on performance.
-+endchoice
-+
-+config AUFS_HINOTIFY
-+	bool "Use inotify to detect actions on a branch"
-+	depends on INOTIFY
-+	help
-+	  If you want to modify files on branches directly, e.g. bypassing
-+	  aufs, and want aufs to detect such changes reliably, then enable
-+	  this option and use the 'udba=inotify' mount option.
-+	  It will have a negative impact on performance.
-+	  See aufs.5 for details.
-+
-+config AUFS_EXPORT
-+	bool "NFS-exportable aufs"
-+	depends on (AUFS_FS = y && EXPORTFS = y) || (AUFS_FS = m && EXPORTFS)
-+	help
-+	  If you want to export your mounted aufs via NFS, then enable this
-+	  option. There are several requirements for this configuration.
-+	  See aufs.5 for details.
-+
-+config AUFS_RDU
-+	bool "Readdir in userspace"
-+	help
-+	  If you have millions of files under a single aufs directory and
-+	  run out of memory, then enable this option and set the
-+	  environment variables for your readdir(3).
-+	  See aufs.5 for details.
-+
-+config AUFS_SHWH
-+	bool "Show whiteouts"
-+	help
-+	  If you want to make the whiteouts in aufs visible, then enable
-+	  this option and specify the 'shwh' mount option. Although it may
-+	  sound like philosophy, technically it simply shows the names of
-+	  the whiteouts while keeping their behaviour.
-+
-+config AUFS_BR_RAMFS
-+	bool "Ramfs (initramfs/rootfs) as an aufs branch"
-+	help
-+	  If you want to use ramfs as an aufs branch fs, then enable this
-+	  option. Generally tmpfs is recommended.
-+	  Aufs prohibits them from being a branch fs by default, because
-+	  initramfs generally becomes unusable after switch_root. If you
-+	  set initramfs as an aufs branch and boot your system via
-+	  switch_root, you will easily run into problems since the files
-+	  in initramfs may become inaccessible.
-+	  Unless you are going to use ramfs as an aufs branch fs without
-+	  switch_root, leave it N.
-+
-+config AUFS_BR_FUSE
-+	bool "Fuse fs as an aufs branch"
-+	depends on FUSE_FS
-+	select AUFS_POLL
-+	help
-+	  If you want to use a fuse-based userspace filesystem as an aufs
-+	  branch fs, then enable this option.
-+	  It implements the internal poll(2) operation which is
-+	  implemented by fuse only (currently).
-+
-+config AUFS_DEBUG
-+	bool "Debug aufs"
-+	help
-+	  Enable this to compile aufs internal debug code.
-+	  It will have a negative impact on performance.
-+
-+config AUFS_MAGIC_SYSRQ
-+	bool
-+	depends on AUFS_DEBUG && MAGIC_SYSRQ
-+	default y
-+	help
-+	  Automatic configuration for internal use.
-+	  Enabled automatically when aufs supports Magic SysRq.
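
The build-time options above only enable support; the behaviour itself is requested per mount through an option string, parsed in fs/aufs/opts.c further below. As a rough sketch of how such a mount could be issued from C, assuming the mount point and branch layout are hypothetical and not part of this patch:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* hypothetical branches: /tmp/rw is writable, /usr is read-only */
	const char *opts = "br=/tmp/rw=rw:/usr=ro,udba=inotify";

	/* needs CONFIG_AUFS_FS; udba=inotify also needs CONFIG_AUFS_HINOTIFY */
	if (mount("none", "/mnt/union", "aufs", 0, opts) != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}

The option string uses the same grammar the parser accepts: colon-separated branches with an optional "=perm" suffix, plus comma-separated flags.
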
-+ -+config AUFS_BDEV_LOOP -+ bool -+ depends on BLK_DEV_LOOP -+ default y -+ help -+ Automatic configuration for internal use. -+ Convert =[ym] into =y. -+ -+config AUFS_INO_T_64 -+ bool -+ depends on AUFS_EXPORT -+ depends on 64BIT && !(ALPHA || S390) -+ default y -+ help -+ Automatic configuration for internal use. -+ /* typedef unsigned long/int __kernel_ino_t */ -+ /* alpha and s390x are int */ -+ -+config AUFS_POLL -+ bool -+ help -+ Automatic configuration for internal use. -+endif -diff -Nur linux-2.6.31-vanilla/fs/aufs/loop.c linux-2.6.31/fs/aufs/loop.c ---- linux-2.6.31-vanilla/fs/aufs/loop.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/loop.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,55 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * support for loopback block device as a branch -+ */ -+ -+#include <linux/loop.h> -+#include "aufs.h" -+ -+/* -+ * test if two lower dentries have overlapping branches. -+ */ -+int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_d1, -+ struct dentry *h_d2) -+{ -+ struct inode *h_inode; -+ struct loop_device *l; -+ -+ h_inode = h_d1->d_inode; -+ if (MAJOR(h_inode->i_sb->s_dev) != LOOP_MAJOR) -+ return 0; -+ -+ l = h_inode->i_sb->s_bdev->bd_disk->private_data; -+ h_d1 = l->lo_backing_file->f_dentry; -+ /* h_d1 can be local NFS. in this case aufs cannot detect the loop */ -+ if (unlikely(h_d1->d_sb == sb)) -+ return 1; -+ return !!au_test_subdir(h_d1, h_d2); -+} -+ -+/* true if a kernel thread named 'loop[0-9].*' accesses a file */ -+int au_test_loopback_kthread(void) -+{ -+ const char c = current->comm[4]; -+ -+ return current->mm == NULL -+ && '0' <= c && c <= '9' -+ && strncmp(current->comm, "loop", 4) == 0; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/loop.h linux-2.6.31/fs/aufs/loop.h ---- linux-2.6.31-vanilla/fs/aufs/loop.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/loop.h 2009-09-16 13:55:29.000000000 +0200 -@@ -0,0 +1,51 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * support for loopback mount as a branch -+ */ -+ -+#ifndef __AUFS_LOOP_H__ -+#define __AUFS_LOOP_H__ -+ -+#ifdef __KERNEL__ -+ -+struct dentry; -+struct super_block; -+ -+#ifdef CONFIG_AUFS_BDEV_LOOP -+/* loop.c */ -+int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_d1, -+ struct dentry *h_d2); -+int au_test_loopback_kthread(void); -+#else -+static inline -+int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_d1, -+ struct dentry *h_d2) -+{ -+ return 0; -+} -+ -+static inline int au_test_loopback_kthread(void) -+{ -+ return 0; -+} -+#endif /* BLK_DEV_LOOP */ -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_LOOP_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/magic.mk linux-2.6.31/fs/aufs/magic.mk ---- linux-2.6.31-vanilla/fs/aufs/magic.mk 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/magic.mk 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,52 @@ -+ -+# defined in ${srctree}/fs/fuse/inode.c -+# tristate -+ifdef CONFIG_FUSE_FS -+ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 -+endif -+ -+# defined in ${srctree}/fs/ocfs2/ocfs2_fs.h -+# tristate -+ifdef CONFIG_OCFS2_FS -+ccflags-y += -DOCFS2_SUPER_MAGIC=0x7461636f -+endif -+ -+# defined in ${srctree}/fs/ocfs2/dlm/userdlm.h -+# tristate -+ifdef CONFIG_OCFS2_FS_O2CB -+ccflags-y += -DDLMFS_MAGIC=0x76a9f425 -+endif -+ -+# defined in ${srctree}/fs/ramfs/inode.c -+# always true -+ccflags-y += -DRAMFS_MAGIC=0x858458f6 -+ -+# defined in ${srctree}/fs/cifs/cifsfs.c -+# tristate -+ifdef CONFIG_CIFS_FS -+ccflags-y += -DCIFS_MAGIC_NUMBER=0xFF534D42 -+endif -+ -+# defined in ${srctree}/fs/xfs/xfs_sb.h -+# tristate -+ifdef CONFIG_XFS_FS -+ccflags-y += -DXFS_SB_MAGIC=0x58465342 -+endif -+ -+# defined in ${srctree}/fs/configfs/mount.c -+# tristate -+ifdef CONFIG_CONFIGFS_FS -+ccflags-y += -DCONFIGFS_MAGIC=0x62656570 -+endif -+ -+# defined in ${srctree}/fs/9p/v9fs.h -+# tristate -+ifdef CONFIG_9P_FS -+ccflags-y += -DV9FS_MAGIC=0x01021997 -+endif -+ -+# defined in ${srctree}/fs/ubifs/ubifs.h -+# tristate -+ifdef CONFIG_UBIFS_FS -+ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 -+endif -diff -Nur linux-2.6.31-vanilla/fs/aufs/Makefile linux-2.6.31/fs/aufs/Makefile ---- linux-2.6.31-vanilla/fs/aufs/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/Makefile 2009-09-16 13:55:29.000000000 +0200 -@@ -0,0 +1,24 @@ -+ -+include ${src}/magic.mk -+-include ${src}/priv_def.mk -+ -+obj-$(CONFIG_AUFS_FS) += aufs.o -+aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ -+ wkq.o vfsub.o dcsub.o \ -+ cpup.o whout.o plink.o wbr_policy.o \ -+ dinfo.o dentry.o \ -+ finfo.o file.o f_op.o \ -+ dir.o vdir.o \ -+ iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ -+ ioctl.o -+ -+# all are boolean -+aufs-$(CONFIG_SYSFS) += sysfs.o -+aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o -+aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o -+aufs-$(CONFIG_AUFS_HINOTIFY) += hinotify.o -+aufs-$(CONFIG_AUFS_EXPORT) += export.o -+aufs-$(CONFIG_AUFS_POLL) += poll.o -+aufs-$(CONFIG_AUFS_RDU) += rdu.o -+aufs-$(CONFIG_AUFS_DEBUG) += debug.o -+aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o -diff -Nur linux-2.6.31-vanilla/fs/aufs/module.c linux-2.6.31/fs/aufs/module.c ---- linux-2.6.31-vanilla/fs/aufs/module.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/module.c 2009-09-16 13:55:30.000000000 
+0200 -@@ -0,0 +1,173 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * module global variables and operations -+ */ -+ -+#include <linux/module.h> -+#include <linux/seq_file.h> -+#include "aufs.h" -+ -+void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) -+{ -+ if (new_sz <= nused) -+ return p; -+ -+ p = krealloc(p, new_sz, gfp); -+ if (p) -+ memset(p + nused, 0, new_sz - nused); -+ return p; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * aufs caches -+ */ -+struct kmem_cache *au_cachep[AuCache_Last]; -+static int __init au_cache_init(void) -+{ -+ au_cachep[AuCache_DINFO] = AuCache(au_dinfo); -+ if (au_cachep[AuCache_DINFO]) -+ au_cachep[AuCache_ICNTNR] = AuCache(au_icntnr); -+ if (au_cachep[AuCache_ICNTNR]) -+ au_cachep[AuCache_FINFO] = AuCache(au_finfo); -+ if (au_cachep[AuCache_FINFO]) -+ au_cachep[AuCache_VDIR] = AuCache(au_vdir); -+ if (au_cachep[AuCache_VDIR]) -+ au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); -+ if (au_cachep[AuCache_DEHSTR]) -+ return 0; -+ -+ return -ENOMEM; -+} -+ -+static void au_cache_fin(void) -+{ -+ int i; -+ for (i = 0; i < AuCache_Last; i++) -+ if (au_cachep[i]) { -+ kmem_cache_destroy(au_cachep[i]); -+ au_cachep[i] = NULL; -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_dir_roflags; -+ -+/* -+ * functions for module interface. -+ */ -+MODULE_LICENSE("GPL"); -+/* MODULE_LICENSE("GPL v2"); */ -+MODULE_AUTHOR("Junjiro R. 
Okajima <aufs-users@lists.sourceforge.net>");
-+MODULE_DESCRIPTION(AUFS_NAME
-+	" -- Advanced multi layered unification filesystem");
-+MODULE_VERSION(AUFS_VERSION);
-+
-+/* it should be 'byte', but param_set_byte() prints it as "%c" */
-+short aufs_nwkq = AUFS_NWKQ_DEF;
-+MODULE_PARM_DESC(nwkq, "the number of workqueue threads, " AUFS_WKQ_NAME);
-+module_param_named(nwkq, aufs_nwkq, short, S_IRUGO);
-+
-+/* this module parameter has no meaning when SYSFS is disabled */
-+int sysaufs_brs = 1;
-+MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN");
-+module_param_named(brs, sysaufs_brs, int, S_IRUGO);
-+
-+/* ---------------------------------------------------------------------- */
-+
-+static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */
-+
-+int au_seq_path(struct seq_file *seq, struct path *path)
-+{
-+	return seq_path(seq, path, au_esc_chars);
-+}
-+
-+/* ---------------------------------------------------------------------- */
-+
-+static int __init aufs_init(void)
-+{
-+	int err, i;
-+	char *p;
-+
-+	p = au_esc_chars;
-+	for (i = 1; i <= ' '; i++)
-+		*p++ = i;
-+	*p++ = '\\';
-+	*p++ = '\x7f';
-+	*p = 0;
-+
-+	au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE);
-+
-+	sysaufs_brs_init();
-+	au_debug_init();
-+
-+	err = -EINVAL;
-+	if (unlikely(aufs_nwkq <= 0))
-+		goto out;
-+
-+	err = sysaufs_init();
-+	if (unlikely(err))
-+		goto out;
-+	err = au_wkq_init();
-+	if (unlikely(err))
-+		goto out_sysaufs;
-+	err = au_hinotify_init();
-+	if (unlikely(err))
-+		goto out_wkq;
-+	err = au_sysrq_init();
-+	if (unlikely(err))
-+		goto out_hin;
-+	err = au_cache_init();
-+	if (unlikely(err))
-+		goto out_sysrq;
-+	err = register_filesystem(&aufs_fs_type);
-+	if (unlikely(err))
-+		goto out_cache;
-+	pr_info(AUFS_NAME " " AUFS_VERSION "\n");
-+	goto out; /* success */
-+
-+ out_cache:
-+	au_cache_fin();
-+ out_sysrq:
-+	au_sysrq_fin();
-+ out_hin:
-+	au_hinotify_fin();
-+ out_wkq:
-+	au_wkq_fin();
-+ out_sysaufs:
-+	sysaufs_fin();
-+ out:
-+	return err;
-+}
-+
-+static void __exit aufs_exit(void)
-+{
-+	unregister_filesystem(&aufs_fs_type);
-+	au_cache_fin();
-+	au_sysrq_fin();
-+	au_hinotify_fin();
-+	au_wkq_fin();
-+	sysaufs_fin();
-+}
-+
-+module_init(aufs_init);
-+module_exit(aufs_exit);
-diff -Nur linux-2.6.31-vanilla/fs/aufs/module.h linux-2.6.31/fs/aufs/module.h
---- linux-2.6.31-vanilla/fs/aufs/module.h	1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.31/fs/aufs/module.h	2009-09-16 13:55:30.000000000 +0200
-@@ -0,0 +1,78 @@
-+/*
-+ * Copyright (C) 2005-2009 Junjiro R. Okajima
-+ *
-+ * This program, aufs is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * module initialization and module-global -+ */ -+ -+#ifndef __AUFS_MODULE_H__ -+#define __AUFS_MODULE_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/slab.h> -+ -+struct path; -+struct seq_file; -+ -+/* module parameters */ -+extern short aufs_nwkq; -+extern int sysaufs_brs; -+ -+/* ---------------------------------------------------------------------- */ -+ -+extern int au_dir_roflags; -+ -+void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); -+int au_seq_path(struct seq_file *seq, struct path *path); -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* kmem cache */ -+enum { -+ AuCache_DINFO, -+ AuCache_ICNTNR, -+ AuCache_FINFO, -+ AuCache_VDIR, -+ AuCache_DEHSTR, -+#ifdef CONFIG_AUFS_HINOTIFY -+ AuCache_HINOTIFY, -+#endif -+ AuCache_Last -+}; -+ -+#define AuCache(type) KMEM_CACHE(type, SLAB_RECLAIM_ACCOUNT) -+ -+extern struct kmem_cache *au_cachep[]; -+ -+#define AuCacheFuncs(name, index) \ -+static inline void *au_cache_alloc_##name(void) \ -+{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ -+static inline void au_cache_free_##name(void *p) \ -+{ kmem_cache_free(au_cachep[AuCache_##index], p); } -+ -+AuCacheFuncs(dinfo, DINFO); -+AuCacheFuncs(icntnr, ICNTNR); -+AuCacheFuncs(finfo, FINFO); -+AuCacheFuncs(vdir, VDIR); -+AuCacheFuncs(dehstr, DEHSTR); -+ -+/* ---------------------------------------------------------------------- */ -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_MODULE_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/opts.c linux-2.6.31/fs/aufs/opts.c ---- linux-2.6.31-vanilla/fs/aufs/opts.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/opts.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,1546 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * mount options/flags -+ */ -+ -+#include <linux/file.h> -+#include <linux/namei.h> -+#include <linux/types.h> /* a distribution requires */ -+#include <linux/parser.h> -+#include "aufs.h" -+ -+/* ---------------------------------------------------------------------- */ -+ -+enum { -+ Opt_br, -+ Opt_add, Opt_del, Opt_mod, Opt_reorder, Opt_append, Opt_prepend, -+ Opt_idel, Opt_imod, Opt_ireorder, -+ Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, Opt_rendir, -+ Opt_rdblk_def, Opt_rdhash_def, -+ Opt_xino, Opt_zxino, Opt_noxino, -+ Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, -+ Opt_trunc_xino_path, Opt_itrunc_xino, -+ Opt_trunc_xib, Opt_notrunc_xib, -+ Opt_shwh, Opt_noshwh, -+ Opt_plink, Opt_noplink, Opt_list_plink, -+ Opt_udba, -+ /* Opt_lock, Opt_unlock, */ -+ Opt_cmd, Opt_cmd_args, -+ Opt_diropq_a, Opt_diropq_w, -+ Opt_warn_perm, Opt_nowarn_perm, -+ Opt_wbr_copyup, Opt_wbr_create, -+ Opt_refrof, Opt_norefrof, -+ Opt_verbose, Opt_noverbose, -+ Opt_sum, Opt_nosum, Opt_wsum, -+ Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err -+}; -+ -+static match_table_t options = { -+ {Opt_br, "br=%s"}, -+ {Opt_br, "br:%s"}, -+ -+ {Opt_add, "add=%d:%s"}, -+ {Opt_add, "add:%d:%s"}, -+ {Opt_add, "ins=%d:%s"}, -+ {Opt_add, "ins:%d:%s"}, -+ {Opt_append, "append=%s"}, -+ {Opt_append, "append:%s"}, -+ {Opt_prepend, "prepend=%s"}, -+ {Opt_prepend, "prepend:%s"}, -+ -+ {Opt_del, "del=%s"}, -+ {Opt_del, "del:%s"}, -+ /* {Opt_idel, "idel:%d"}, */ -+ {Opt_mod, "mod=%s"}, -+ {Opt_mod, "mod:%s"}, -+ /* {Opt_imod, "imod:%d:%s"}, */ -+ -+ {Opt_dirwh, "dirwh=%d"}, -+ -+ {Opt_xino, "xino=%s"}, -+ {Opt_noxino, "noxino"}, -+ {Opt_trunc_xino, "trunc_xino"}, -+ {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, -+ {Opt_notrunc_xino, "notrunc_xino"}, -+ {Opt_trunc_xino_path, "trunc_xino=%s"}, -+ {Opt_itrunc_xino, "itrunc_xino=%d"}, -+ /* {Opt_zxino, "zxino=%s"}, */ -+ {Opt_trunc_xib, "trunc_xib"}, -+ {Opt_notrunc_xib, "notrunc_xib"}, -+ -+ {Opt_plink, "plink"}, -+ {Opt_noplink, "noplink"}, -+#ifdef CONFIG_AUFS_DEBUG -+ {Opt_list_plink, "list_plink"}, -+#endif -+ -+ {Opt_udba, "udba=%s"}, -+ -+ {Opt_diropq_a, "diropq=always"}, -+ {Opt_diropq_a, "diropq=a"}, -+ {Opt_diropq_w, "diropq=whiteouted"}, -+ {Opt_diropq_w, "diropq=w"}, -+ -+ {Opt_warn_perm, "warn_perm"}, -+ {Opt_nowarn_perm, "nowarn_perm"}, -+ -+ /* keep them temporary */ -+ {Opt_ignore_silent, "coo=%s"}, -+ {Opt_ignore_silent, "nodlgt"}, -+ {Opt_ignore_silent, "nodirperm1"}, -+ {Opt_ignore_silent, "clean_plink"}, -+ -+#ifdef CONFIG_AUFS_SHWH -+ {Opt_shwh, "shwh"}, -+#endif -+ {Opt_noshwh, "noshwh"}, -+ -+ {Opt_rendir, "rendir=%d"}, -+ -+ {Opt_refrof, "refrof"}, -+ {Opt_norefrof, "norefrof"}, -+ -+ {Opt_verbose, "verbose"}, -+ {Opt_verbose, "v"}, -+ {Opt_noverbose, "noverbose"}, -+ {Opt_noverbose, "quiet"}, -+ {Opt_noverbose, "q"}, -+ {Opt_noverbose, "silent"}, -+ -+ {Opt_sum, "sum"}, -+ {Opt_nosum, "nosum"}, -+ {Opt_wsum, "wsum"}, -+ -+ {Opt_rdcache, "rdcache=%d"}, -+ {Opt_rdblk, "rdblk=%d"}, -+ {Opt_rdblk_def, "rdblk=def"}, -+ {Opt_rdhash, "rdhash=%d"}, -+ {Opt_rdhash_def, "rdhash=def"}, -+ -+ {Opt_wbr_create, "create=%s"}, -+ {Opt_wbr_create, "create_policy=%s"}, -+ {Opt_wbr_copyup, "cpup=%s"}, -+ {Opt_wbr_copyup, "copyup=%s"}, -+ {Opt_wbr_copyup, "copyup_policy=%s"}, -+ -+ /* internal use for the scripts */ -+ {Opt_ignore_silent, 
"si=%s"}, -+ -+ {Opt_br, "dirs=%s"}, -+ {Opt_ignore, "debug=%d"}, -+ {Opt_ignore, "delete=whiteout"}, -+ {Opt_ignore, "delete=all"}, -+ {Opt_ignore, "imap=%s"}, -+ -+ /* temporary workaround, due to old mount(8)? */ -+ {Opt_ignore_silent, "relatime"}, -+ -+ {Opt_err, NULL} -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+static const char *au_parser_pattern(int val, struct match_token *token) -+{ -+ while (token->pattern) { -+ if (token->token == val) -+ return token->pattern; -+ token++; -+ } -+ BUG(); -+ return "??"; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static match_table_t brperms = { -+ {AuBrPerm_RO, AUFS_BRPERM_RO}, -+ {AuBrPerm_RR, AUFS_BRPERM_RR}, -+ {AuBrPerm_RW, AUFS_BRPERM_RW}, -+ -+ {AuBrPerm_ROWH, AUFS_BRPERM_ROWH}, -+ {AuBrPerm_RRWH, AUFS_BRPERM_RRWH}, -+ {AuBrPerm_RWNoLinkWH, AUFS_BRPERM_RWNLWH}, -+ -+ {AuBrPerm_ROWH, "nfsro"}, -+ {AuBrPerm_RO, NULL} -+}; -+ -+static int br_perm_val(char *perm) -+{ -+ int val; -+ substring_t args[MAX_OPT_ARGS]; -+ -+ val = match_token(perm, brperms, args); -+ return val; -+} -+ -+const char *au_optstr_br_perm(int brperm) -+{ -+ return au_parser_pattern(brperm, (void *)brperms); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static match_table_t udbalevel = { -+ {AuOpt_UDBA_REVAL, "reval"}, -+ {AuOpt_UDBA_NONE, "none"}, -+#ifdef CONFIG_AUFS_HINOTIFY -+ {AuOpt_UDBA_HINOTIFY, "inotify"}, -+#endif -+ {-1, NULL} -+}; -+ -+static int udba_val(char *str) -+{ -+ substring_t args[MAX_OPT_ARGS]; -+ -+ return match_token(str, udbalevel, args); -+} -+ -+const char *au_optstr_udba(int udba) -+{ -+ return au_parser_pattern(udba, (void *)udbalevel); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static match_table_t au_wbr_create_policy = { -+ {AuWbrCreate_TDP, "tdp"}, -+ {AuWbrCreate_TDP, "top-down-parent"}, -+ {AuWbrCreate_RR, "rr"}, -+ {AuWbrCreate_RR, "round-robin"}, -+ {AuWbrCreate_MFS, "mfs"}, -+ {AuWbrCreate_MFS, "most-free-space"}, -+ {AuWbrCreate_MFSV, "mfs:%d"}, -+ {AuWbrCreate_MFSV, "most-free-space:%d"}, -+ -+ {AuWbrCreate_MFSRR, "mfsrr:%d"}, -+ {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, -+ {AuWbrCreate_PMFS, "pmfs"}, -+ {AuWbrCreate_PMFSV, "pmfs:%d"}, -+ -+ {-1, NULL} -+}; -+ -+/* -+ * cf. linux/lib/parser.c and cmdline.c -+ * gave up calling memparse() since it uses simple_strtoull() instead of -+ * strict_...(). 
-+ */ -+static int au_match_ull(substring_t *s, unsigned long long *result) -+{ -+ int err; -+ unsigned int len; -+ char a[32]; -+ -+ err = -ERANGE; -+ len = s->to - s->from; -+ if (len + 1 <= sizeof(a)) { -+ memcpy(a, s->from, len); -+ a[len] = '\0'; -+ err = strict_strtoull(a, 0, result); -+ } -+ return err; -+} -+ -+static int au_wbr_mfs_wmark(substring_t *arg, char *str, -+ struct au_opt_wbr_create *create) -+{ -+ int err; -+ unsigned long long ull; -+ -+ err = 0; -+ if (!au_match_ull(arg, &ull)) -+ create->mfsrr_watermark = ull; -+ else { -+ AuErr("bad integer in %s\n", str); -+ err = -EINVAL; -+ } -+ -+ return err; -+} -+ -+static int au_wbr_mfs_sec(substring_t *arg, char *str, -+ struct au_opt_wbr_create *create) -+{ -+ int n, err; -+ -+ err = 0; -+ if (!match_int(arg, &n) && 0 <= n) -+ create->mfs_second = n; -+ else { -+ AuErr("bad integer in %s\n", str); -+ err = -EINVAL; -+ } -+ -+ return err; -+} -+ -+static int au_wbr_create_val(char *str, struct au_opt_wbr_create *create) -+{ -+ int err, e; -+ substring_t args[MAX_OPT_ARGS]; -+ -+ err = match_token(str, au_wbr_create_policy, args); -+ create->wbr_create = err; -+ switch (err) { -+ case AuWbrCreate_MFSRRV: -+ e = au_wbr_mfs_wmark(&args[0], str, create); -+ if (!e) -+ e = au_wbr_mfs_sec(&args[1], str, create); -+ if (unlikely(e)) -+ err = e; -+ break; -+ case AuWbrCreate_MFSRR: -+ e = au_wbr_mfs_wmark(&args[0], str, create); -+ if (unlikely(e)) { -+ err = e; -+ break; -+ } -+ /*FALLTHROUGH*/ -+ case AuWbrCreate_MFS: -+ case AuWbrCreate_PMFS: -+ create->mfs_second = AUFS_MFS_SECOND_DEF; -+ break; -+ case AuWbrCreate_MFSV: -+ case AuWbrCreate_PMFSV: -+ e = au_wbr_mfs_sec(&args[0], str, create); -+ if (unlikely(e)) -+ err = e; -+ break; -+ } -+ -+ return err; -+} -+ -+const char *au_optstr_wbr_create(int wbr_create) -+{ -+ return au_parser_pattern(wbr_create, (void *)au_wbr_create_policy); -+} -+ -+static match_table_t au_wbr_copyup_policy = { -+ {AuWbrCopyup_TDP, "tdp"}, -+ {AuWbrCopyup_TDP, "top-down-parent"}, -+ {AuWbrCopyup_BUP, "bup"}, -+ {AuWbrCopyup_BUP, "bottom-up-parent"}, -+ {AuWbrCopyup_BU, "bu"}, -+ {AuWbrCopyup_BU, "bottom-up"}, -+ {-1, NULL} -+}; -+ -+static int au_wbr_copyup_val(char *str) -+{ -+ substring_t args[MAX_OPT_ARGS]; -+ -+ return match_token(str, au_wbr_copyup_policy, args); -+} -+ -+const char *au_optstr_wbr_copyup(int wbr_copyup) -+{ -+ return au_parser_pattern(wbr_copyup, (void *)au_wbr_copyup_policy); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; -+ -+static void dump_opts(struct au_opts *opts) -+{ -+#ifdef CONFIG_AUFS_DEBUG -+ /* reduce stack space */ -+ union { -+ struct au_opt_add *add; -+ struct au_opt_del *del; -+ struct au_opt_mod *mod; -+ struct au_opt_xino *xino; -+ struct au_opt_xino_itrunc *xino_itrunc; -+ struct au_opt_wbr_create *create; -+ } u; -+ struct au_opt *opt; -+ -+ opt = opts->opt; -+ while (opt->type != Opt_tail) { -+ switch (opt->type) { -+ case Opt_add: -+ u.add = &opt->add; -+ AuDbg("add {b%d, %s, 0x%x, %p}\n", -+ u.add->bindex, u.add->pathname, u.add->perm, -+ u.add->path.dentry); -+ break; -+ case Opt_del: -+ case Opt_idel: -+ u.del = &opt->del; -+ AuDbg("del {%s, %p}\n", -+ u.del->pathname, u.del->h_path.dentry); -+ break; -+ case Opt_mod: -+ case Opt_imod: -+ u.mod = &opt->mod; -+ AuDbg("mod {%s, 0x%x, %p}\n", -+ u.mod->path, u.mod->perm, u.mod->h_root); -+ break; -+ case Opt_append: -+ u.add = &opt->add; -+ AuDbg("append {b%d, %s, 0x%x, %p}\n", -+ u.add->bindex, 
u.add->pathname, u.add->perm, -+ u.add->path.dentry); -+ break; -+ case Opt_prepend: -+ u.add = &opt->add; -+ AuDbg("prepend {b%d, %s, 0x%x, %p}\n", -+ u.add->bindex, u.add->pathname, u.add->perm, -+ u.add->path.dentry); -+ break; -+ case Opt_dirwh: -+ AuDbg("dirwh %d\n", opt->dirwh); -+ break; -+ case Opt_rdcache: -+ AuDbg("rdcache %d\n", opt->rdcache); -+ break; -+ case Opt_rdblk: -+ AuDbg("rdblk %u\n", opt->rdblk); -+ break; -+ case Opt_rdblk_def: -+ AuDbg("rdblk_def\n"); -+ break; -+ case Opt_rdhash: -+ AuDbg("rdhash %u\n", opt->rdhash); -+ break; -+ case Opt_rdhash_def: -+ AuDbg("rdhash_def\n"); -+ break; -+ case Opt_xino: -+ u.xino = &opt->xino; -+ AuDbg("xino {%s %.*s}\n", -+ u.xino->path, -+ AuDLNPair(u.xino->file->f_dentry)); -+ break; -+ case Opt_trunc_xino: -+ AuLabel(trunc_xino); -+ break; -+ case Opt_notrunc_xino: -+ AuLabel(notrunc_xino); -+ break; -+ case Opt_trunc_xino_path: -+ case Opt_itrunc_xino: -+ u.xino_itrunc = &opt->xino_itrunc; -+ AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); -+ break; -+ -+ case Opt_noxino: -+ AuLabel(noxino); -+ break; -+ case Opt_trunc_xib: -+ AuLabel(trunc_xib); -+ break; -+ case Opt_notrunc_xib: -+ AuLabel(notrunc_xib); -+ break; -+ case Opt_shwh: -+ AuLabel(shwh); -+ break; -+ case Opt_noshwh: -+ AuLabel(noshwh); -+ break; -+ case Opt_plink: -+ AuLabel(plink); -+ break; -+ case Opt_noplink: -+ AuLabel(noplink); -+ break; -+ case Opt_list_plink: -+ AuLabel(list_plink); -+ break; -+ case Opt_udba: -+ AuDbg("udba %d, %s\n", -+ opt->udba, au_optstr_udba(opt->udba)); -+ break; -+ case Opt_diropq_a: -+ AuLabel(diropq_a); -+ break; -+ case Opt_diropq_w: -+ AuLabel(diropq_w); -+ break; -+ case Opt_warn_perm: -+ AuLabel(warn_perm); -+ break; -+ case Opt_nowarn_perm: -+ AuLabel(nowarn_perm); -+ break; -+ case Opt_refrof: -+ AuLabel(refrof); -+ break; -+ case Opt_norefrof: -+ AuLabel(norefrof); -+ break; -+ case Opt_verbose: -+ AuLabel(verbose); -+ break; -+ case Opt_noverbose: -+ AuLabel(noverbose); -+ break; -+ case Opt_sum: -+ AuLabel(sum); -+ break; -+ case Opt_nosum: -+ AuLabel(nosum); -+ break; -+ case Opt_wsum: -+ AuLabel(wsum); -+ break; -+ case Opt_wbr_create: -+ u.create = &opt->wbr_create; -+ AuDbg("create %d, %s\n", u.create->wbr_create, -+ au_optstr_wbr_create(u.create->wbr_create)); -+ switch (u.create->wbr_create) { -+ case AuWbrCreate_MFSV: -+ case AuWbrCreate_PMFSV: -+ AuDbg("%d sec\n", u.create->mfs_second); -+ break; -+ case AuWbrCreate_MFSRR: -+ AuDbg("%llu watermark\n", -+ u.create->mfsrr_watermark); -+ break; -+ case AuWbrCreate_MFSRRV: -+ AuDbg("%llu watermark, %d sec\n", -+ u.create->mfsrr_watermark, -+ u.create->mfs_second); -+ break; -+ } -+ break; -+ case Opt_wbr_copyup: -+ AuDbg("copyup %d, %s\n", opt->wbr_copyup, -+ au_optstr_wbr_copyup(opt->wbr_copyup)); -+ break; -+ default: -+ BUG(); -+ } -+ opt++; -+ } -+#endif -+} -+ -+void au_opts_free(struct au_opts *opts) -+{ -+ struct au_opt *opt; -+ -+ opt = opts->opt; -+ while (opt->type != Opt_tail) { -+ switch (opt->type) { -+ case Opt_add: -+ case Opt_append: -+ case Opt_prepend: -+ path_put(&opt->add.path); -+ break; -+ case Opt_del: -+ case Opt_idel: -+ path_put(&opt->del.h_path); -+ break; -+ case Opt_mod: -+ case Opt_imod: -+ dput(opt->mod.h_root); -+ break; -+ case Opt_xino: -+ fput(opt->xino.file); -+ break; -+ } -+ opt++; -+ } -+} -+ -+static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, -+ aufs_bindex_t bindex) -+{ -+ int err; -+ struct au_opt_add *add = &opt->add; -+ char *p; -+ -+ add->bindex = bindex; -+ add->perm = AuBrPerm_Last; -+ 
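-+	/* AuBrPerm_Last means "no permission given yet"; when the branch
-+	 * string carries no "=perm" suffix, a default is chosen after the
-+	 * path lookup below */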
add->pathname = opt_str;
-+	p = strchr(opt_str, '=');
-+	if (p) {
-+		*p++ = 0;
-+		if (*p)
-+			add->perm = br_perm_val(p);
-+	}
-+
-+	err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path);
-+	if (!err) {
-+		if (!p) {
-+			add->perm = AuBrPerm_RO;
-+			if (au_test_fs_rr(add->path.dentry->d_sb))
-+				add->perm = AuBrPerm_RR;
-+			else if (!bindex && !(sb_flags & MS_RDONLY))
-+				add->perm = AuBrPerm_RW;
-+		}
-+		opt->type = Opt_add;
-+		goto out;
-+	}
-+	AuErr("lookup failed %s (%d)\n", add->pathname, err);
-+	err = -EINVAL;
-+
-+ out:
-+	return err;
-+}
-+
-+static int au_opts_parse_del(struct au_opt_del *del, substring_t args[])
-+{
-+	int err;
-+
-+	del->pathname = args[0].from;
-+	AuDbg("del path %s\n", del->pathname);
-+
-+	err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path);
-+	if (unlikely(err))
-+		AuErr("lookup failed %s (%d)\n", del->pathname, err);
-+
-+	return err;
-+}
-+
-+#if 0 /* reserved for future use */
-+static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex,
-+			      struct au_opt_del *del, substring_t args[])
-+{
-+	int err;
-+	struct dentry *root;
-+
-+	err = -EINVAL;
-+	root = sb->s_root;
-+	aufs_read_lock(root, AuLock_FLUSH);
-+	if (bindex < 0 || au_sbend(sb) < bindex) {
-+		AuErr("out of bounds, %d\n", bindex);
-+		goto out;
-+	}
-+
-+	err = 0;
-+	del->h_path.dentry = dget(au_h_dptr(root, bindex));
-+	del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex));
-+
-+ out:
-+	aufs_read_unlock(root, !AuLock_IR);
-+	return err;
-+}
-+#endif
-+
-+static int au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[])
-+{
-+	int err;
-+	struct path path;
-+	char *p;
-+
-+	err = -EINVAL;
-+	mod->path = args[0].from;
-+	p = strchr(mod->path, '=');
-+	if (unlikely(!p)) {
-+		AuErr("no permission %s\n", args[0].from);
-+		goto out;
-+	}
-+
-+	*p++ = 0;
-+	err = vfsub_kern_path(mod->path, lkup_dirflags, &path);
-+	if (unlikely(err)) {
-+		AuErr("lookup failed %s (%d)\n", mod->path, err);
-+		goto out;
-+	}
-+
-+	mod->perm = br_perm_val(p);
-+	AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p);
-+	mod->h_root = dget(path.dentry);
-+	path_put(&path);
-+
-+ out:
-+	return err;
-+}
-+
-+#if 0 /* reserved for future use */
-+static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex,
-+			      struct au_opt_mod *mod, substring_t args[])
-+{
-+	int err;
-+	struct dentry *root;
-+
-+	err = -EINVAL;
-+	root = sb->s_root;
-+	aufs_read_lock(root, AuLock_FLUSH);
-+	if (bindex < 0 || au_sbend(sb) < bindex) {
-+		AuErr("out of bounds, %d\n", bindex);
-+		goto out;
-+	}
-+
-+	err = 0;
-+	mod->perm = br_perm_val(args[1].from);
-+	AuDbg("mod path %s, perm 0x%x, %s\n",
-+	      mod->path, mod->perm, args[1].from);
-+	mod->h_root = dget(au_h_dptr(root, bindex));
-+
-+ out:
-+	aufs_read_unlock(root, !AuLock_IR);
-+	return err;
-+}
-+#endif
-+
-+static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino,
-+			      substring_t args[])
-+{
-+	int err;
-+	struct file *file;
-+
-+	file = au_xino_create(sb, args[0].from, /*silent*/0);
-+	err = PTR_ERR(file);
-+	if (IS_ERR(file))
-+		goto out;
-+
-+	err = -EINVAL;
-+	if (unlikely(file->f_dentry->d_sb == sb)) {
-+		fput(file);
-+		AuErr("%s must be outside\n", args[0].from);
-+		goto out;
-+	}
-+
-+	err = 0;
-+	xino->file = file;
-+	xino->path = args[0].from;
-+
-+ out:
-+	return err;
-+}
-+
-+static
-+int au_opts_parse_xino_itrunc_path(struct super_block *sb,
-+				   struct au_opt_xino_itrunc *xino_itrunc,
-+				   substring_t args[])
-+{
-+	int err;
-+	aufs_bindex_t bend, bindex;
-+	struct path path;
-+	struct dentry *root;
-+
-+	err
= vfsub_kern_path(args[0].from, lkup_dirflags, &path); -+ if (unlikely(err)) { -+ AuErr("lookup failed %s (%d)\n", args[0].from, err); -+ goto out; -+ } -+ -+ xino_itrunc->bindex = -1; -+ root = sb->s_root; -+ aufs_read_lock(root, AuLock_FLUSH); -+ bend = au_sbend(sb); -+ for (bindex = 0; bindex <= bend; bindex++) { -+ if (au_h_dptr(root, bindex) == path.dentry) { -+ xino_itrunc->bindex = bindex; -+ break; -+ } -+ } -+ aufs_read_unlock(root, !AuLock_IR); -+ path_put(&path); -+ -+ if (unlikely(xino_itrunc->bindex < 0)) { -+ AuErr("no such branch %s\n", args[0].from); -+ err = -EINVAL; -+ } -+ -+ out: -+ return err; -+} -+ -+/* called without aufs lock */ -+int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) -+{ -+ int err, n, token; -+ aufs_bindex_t bindex; -+ unsigned char skipped; -+ struct dentry *root; -+ struct au_opt *opt, *opt_tail; -+ char *opt_str; -+ /* reduce the stack space */ -+ union { -+ struct au_opt_xino_itrunc *xino_itrunc; -+ struct au_opt_wbr_create *create; -+ } u; -+ struct { -+ substring_t args[MAX_OPT_ARGS]; -+ } *a; -+ -+ err = -ENOMEM; -+ a = kmalloc(sizeof(*a), GFP_NOFS); -+ if (unlikely(!a)) -+ goto out; -+ -+ root = sb->s_root; -+ err = 0; -+ bindex = 0; -+ opt = opts->opt; -+ opt_tail = opt + opts->max_opt - 1; -+ opt->type = Opt_tail; -+ while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { -+ err = -EINVAL; -+ skipped = 0; -+ token = match_token(opt_str, options, a->args); -+ switch (token) { -+ case Opt_br: -+ err = 0; -+ while (!err && (opt_str = strsep(&a->args[0].from, ":")) -+ && *opt_str) { -+ err = opt_add(opt, opt_str, opts->sb_flags, -+ bindex++); -+ if (unlikely(!err && ++opt > opt_tail)) { -+ err = -E2BIG; -+ break; -+ } -+ opt->type = Opt_tail; -+ skipped = 1; -+ } -+ break; -+ case Opt_add: -+ if (unlikely(match_int(&a->args[0], &n))) { -+ AuErr("bad integer in %s\n", opt_str); -+ break; -+ } -+ bindex = n; -+ err = opt_add(opt, a->args[1].from, opts->sb_flags, -+ bindex); -+ if (!err) -+ opt->type = token; -+ break; -+ case Opt_append: -+ err = opt_add(opt, a->args[0].from, opts->sb_flags, -+ /*dummy bindex*/1); -+ if (!err) -+ opt->type = token; -+ break; -+ case Opt_prepend: -+ err = opt_add(opt, a->args[0].from, opts->sb_flags, -+ /*bindex*/0); -+ if (!err) -+ opt->type = token; -+ break; -+ case Opt_del: -+ err = au_opts_parse_del(&opt->del, a->args); -+ if (!err) -+ opt->type = token; -+ break; -+#if 0 /* reserved for future use */ -+ case Opt_idel: -+ del->pathname = "(indexed)"; -+ if (unlikely(match_int(&args[0], &n))) { -+ AuErr("bad integer in %s\n", opt_str); -+ break; -+ } -+ err = au_opts_parse_idel(sb, n, &opt->del, a->args); -+ if (!err) -+ opt->type = token; -+ break; -+#endif -+ case Opt_mod: -+ err = au_opts_parse_mod(&opt->mod, a->args); -+ if (!err) -+ opt->type = token; -+ break; -+#ifdef IMOD /* reserved for future use */ -+ case Opt_imod: -+ u.mod->path = "(indexed)"; -+ if (unlikely(match_int(&a->args[0], &n))) { -+ AuErr("bad integer in %s\n", opt_str); -+ break; -+ } -+ err = au_opts_parse_imod(sb, n, &opt->mod, a->args); -+ if (!err) -+ opt->type = token; -+ break; -+#endif -+ case Opt_xino: -+ err = au_opts_parse_xino(sb, &opt->xino, a->args); -+ if (!err) -+ opt->type = token; -+ break; -+ -+ case Opt_trunc_xino_path: -+ err = au_opts_parse_xino_itrunc_path -+ (sb, &opt->xino_itrunc, a->args); -+ if (!err) -+ opt->type = token; -+ break; -+ -+ case Opt_itrunc_xino: -+ u.xino_itrunc = &opt->xino_itrunc; -+ if (unlikely(match_int(&a->args[0], &n))) { -+ AuErr("bad integer in %s\n", 
opt_str); -+ break; -+ } -+ u.xino_itrunc->bindex = n; -+ aufs_read_lock(root, AuLock_FLUSH); -+ if (n < 0 || au_sbend(sb) < n) { -+ AuErr("out of bounds, %d\n", n); -+ aufs_read_unlock(root, !AuLock_IR); -+ break; -+ } -+ aufs_read_unlock(root, !AuLock_IR); -+ err = 0; -+ opt->type = token; -+ break; -+ -+ case Opt_dirwh: -+ if (unlikely(match_int(&a->args[0], &opt->dirwh))) -+ break; -+ err = 0; -+ opt->type = token; -+ break; -+ -+ case Opt_rdcache: -+ if (unlikely(match_int(&a->args[0], &opt->rdcache))) -+ break; -+ err = 0; -+ opt->type = token; -+ break; -+ case Opt_rdblk: -+ if (unlikely(match_int(&a->args[0], &n) -+ || n < 0 -+ || n > KMALLOC_MAX_SIZE)) { -+ AuErr("bad integer in %s\n", opt_str); -+ break; -+ } -+ if (unlikely(n && n < NAME_MAX)) { -+ AuErr("rdblk must be larger than %d\n", -+ NAME_MAX); -+ break; -+ } -+ opt->rdblk = n; -+ err = 0; -+ opt->type = token; -+ break; -+ case Opt_rdhash: -+ if (unlikely(match_int(&a->args[0], &n) -+ || n < 0 -+ || n * sizeof(struct hlist_head) -+ > KMALLOC_MAX_SIZE)) { -+ AuErr("bad integer in %s\n", opt_str); -+ break; -+ } -+ opt->rdhash = n; -+ err = 0; -+ opt->type = token; -+ break; -+ -+ case Opt_trunc_xino: -+ case Opt_notrunc_xino: -+ case Opt_noxino: -+ case Opt_trunc_xib: -+ case Opt_notrunc_xib: -+ case Opt_shwh: -+ case Opt_noshwh: -+ case Opt_plink: -+ case Opt_noplink: -+ case Opt_list_plink: -+ case Opt_diropq_a: -+ case Opt_diropq_w: -+ case Opt_warn_perm: -+ case Opt_nowarn_perm: -+ case Opt_refrof: -+ case Opt_norefrof: -+ case Opt_verbose: -+ case Opt_noverbose: -+ case Opt_sum: -+ case Opt_nosum: -+ case Opt_wsum: -+ case Opt_rdblk_def: -+ case Opt_rdhash_def: -+ err = 0; -+ opt->type = token; -+ break; -+ -+ case Opt_udba: -+ opt->udba = udba_val(a->args[0].from); -+ if (opt->udba >= 0) { -+ err = 0; -+ opt->type = token; -+ } else -+ AuErr("wrong value, %s\n", opt_str); -+ break; -+ -+ case Opt_wbr_create: -+ u.create = &opt->wbr_create; -+ u.create->wbr_create -+ = au_wbr_create_val(a->args[0].from, u.create); -+ if (u.create->wbr_create >= 0) { -+ err = 0; -+ opt->type = token; -+ } else -+ AuErr("wrong value, %s\n", opt_str); -+ break; -+ case Opt_wbr_copyup: -+ opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); -+ if (opt->wbr_copyup >= 0) { -+ err = 0; -+ opt->type = token; -+ } else -+ AuErr("wrong value, %s\n", opt_str); -+ break; -+ -+ case Opt_ignore: -+ AuWarn("ignored %s\n", opt_str); -+ /*FALLTHROUGH*/ -+ case Opt_ignore_silent: -+ skipped = 1; -+ err = 0; -+ break; -+ case Opt_err: -+ AuErr("unknown option %s\n", opt_str); -+ break; -+ } -+ -+ if (!err && !skipped) { -+ if (unlikely(++opt > opt_tail)) { -+ err = -E2BIG; -+ opt--; -+ opt->type = Opt_tail; -+ break; -+ } -+ opt->type = Opt_tail; -+ } -+ } -+ -+ kfree(a); -+ dump_opts(opts); -+ if (unlikely(err)) -+ au_opts_free(opts); -+ -+ out: -+ return err; -+} -+ -+static int au_opt_wbr_create(struct super_block *sb, -+ struct au_opt_wbr_create *create) -+{ -+ int err; -+ struct au_sbinfo *sbinfo; -+ -+ SiMustWriteLock(sb); -+ -+ err = 1; /* handled */ -+ sbinfo = au_sbi(sb); -+ if (sbinfo->si_wbr_create_ops->fin) { -+ err = sbinfo->si_wbr_create_ops->fin(sb); -+ if (!err) -+ err = 1; -+ } -+ -+ sbinfo->si_wbr_create = create->wbr_create; -+ sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; -+ switch (create->wbr_create) { -+ case AuWbrCreate_MFSRRV: -+ case AuWbrCreate_MFSRR: -+ sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; -+ /*FALLTHROUGH*/ -+ case AuWbrCreate_MFS: -+ case AuWbrCreate_MFSV: -+ case 
AuWbrCreate_PMFS:
-+	case AuWbrCreate_PMFSV:
-+		sbinfo->si_wbr_mfs.mfs_expire = create->mfs_second * HZ;
-+		break;
-+	}
-+
-+	if (sbinfo->si_wbr_create_ops->init)
-+		sbinfo->si_wbr_create_ops->init(sb); /* ignore */
-+
-+	return err;
-+}
-+
-+/*
-+ * returns,
-+ * plus: processed without an error
-+ * zero: unprocessed
-+ * minus: error
-+ */
-+static int au_opt_simple(struct super_block *sb, struct au_opt *opt,
-+			 struct au_opts *opts)
-+{
-+	int err;
-+	struct au_sbinfo *sbinfo;
-+
-+	SiMustWriteLock(sb);
-+
-+	err = 1; /* handled */
-+	sbinfo = au_sbi(sb);
-+	switch (opt->type) {
-+	case Opt_udba:
-+		sbinfo->si_mntflags &= ~AuOptMask_UDBA;
-+		sbinfo->si_mntflags |= opt->udba;
-+		opts->given_udba |= opt->udba;
-+		break;
-+
-+	case Opt_plink:
-+		au_opt_set(sbinfo->si_mntflags, PLINK);
-+		break;
-+	case Opt_noplink:
-+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-+			au_plink_put(sb);
-+		au_opt_clr(sbinfo->si_mntflags, PLINK);
-+		break;
-+	case Opt_list_plink:
-+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-+			au_plink_list(sb);
-+		break;
-+
-+	case Opt_diropq_a:
-+		au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ);
-+		break;
-+	case Opt_diropq_w:
-+		au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ);
-+		break;
-+
-+	case Opt_warn_perm:
-+		au_opt_set(sbinfo->si_mntflags, WARN_PERM);
-+		break;
-+	case Opt_nowarn_perm:
-+		au_opt_clr(sbinfo->si_mntflags, WARN_PERM);
-+		break;
-+
-+	case Opt_refrof:
-+		au_opt_set(sbinfo->si_mntflags, REFROF);
-+		break;
-+	case Opt_norefrof:
-+		au_opt_clr(sbinfo->si_mntflags, REFROF);
-+		break;
-+
-+	case Opt_verbose:
-+		au_opt_set(sbinfo->si_mntflags, VERBOSE);
-+		break;
-+	case Opt_noverbose:
-+		au_opt_clr(sbinfo->si_mntflags, VERBOSE);
-+		break;
-+
-+	case Opt_sum:
-+		au_opt_set(sbinfo->si_mntflags, SUM);
-+		break;
-+	case Opt_wsum:
-+		au_opt_clr(sbinfo->si_mntflags, SUM);
-+		au_opt_set(sbinfo->si_mntflags, SUM_W);
-+		break;
-+	case Opt_nosum:
-+		au_opt_clr(sbinfo->si_mntflags, SUM);
-+		au_opt_clr(sbinfo->si_mntflags, SUM_W);
-+		break;
-+
-+	case Opt_wbr_create:
-+		err = au_opt_wbr_create(sb, &opt->wbr_create);
-+		break;
-+	case Opt_wbr_copyup:
-+		sbinfo->si_wbr_copyup = opt->wbr_copyup;
-+		sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup;
-+		break;
-+
-+	case Opt_dirwh:
-+		sbinfo->si_dirwh = opt->dirwh;
-+		break;
-+
-+	case Opt_rdcache:
-+		sbinfo->si_rdcache = opt->rdcache * HZ;
-+		break;
-+	case Opt_rdblk:
-+		sbinfo->si_rdblk = opt->rdblk;
-+		break;
-+	case Opt_rdblk_def:
-+		sbinfo->si_rdblk = AUFS_RDBLK_DEF;
-+		break;
-+	case Opt_rdhash:
-+		sbinfo->si_rdhash = opt->rdhash;
-+		break;
-+	case Opt_rdhash_def:
-+		sbinfo->si_rdhash = AUFS_RDHASH_DEF;
-+		break;
-+
-+	case Opt_shwh:
-+		au_opt_set(sbinfo->si_mntflags, SHWH);
-+		break;
-+	case Opt_noshwh:
-+		au_opt_clr(sbinfo->si_mntflags, SHWH);
-+		break;
-+
-+	case Opt_trunc_xino:
-+		au_opt_set(sbinfo->si_mntflags, TRUNC_XINO);
-+		break;
-+	case Opt_notrunc_xino:
-+		au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO);
-+		break;
-+
-+	case Opt_trunc_xino_path:
-+	case Opt_itrunc_xino:
-+		err = au_xino_trunc(sb, opt->xino_itrunc.bindex);
-+		if (!err)
-+			err = 1;
-+		break;
-+
-+	case Opt_trunc_xib:
-+		au_fset_opts(opts->flags, TRUNC_XIB);
-+		break;
-+	case Opt_notrunc_xib:
-+		au_fclr_opts(opts->flags, TRUNC_XIB);
-+		break;
-+
-+	default:
-+		err = 0;
-+		break;
-+	}
-+
-+	return err;
-+}
-+
-+/*
-+ * returns tri-state.
-+ * plus: processed without an error -+ * zero: unprocessed -+ * minus: error -+ */ -+static int au_opt_br(struct super_block *sb, struct au_opt *opt, -+ struct au_opts *opts) -+{ -+ int err, do_refresh; -+ -+ err = 0; -+ switch (opt->type) { -+ case Opt_append: -+ opt->add.bindex = au_sbend(sb) + 1; -+ if (opt->add.bindex < 0) -+ opt->add.bindex = 0; -+ goto add; -+ case Opt_prepend: -+ opt->add.bindex = 0; -+ add: -+ case Opt_add: -+ err = au_br_add(sb, &opt->add, -+ au_ftest_opts(opts->flags, REMOUNT)); -+ if (!err) { -+ err = 1; -+ au_fset_opts(opts->flags, REFRESH_DIR); -+ if (au_br_whable(opt->add.perm)) -+ au_fset_opts(opts->flags, REFRESH_NONDIR); -+ } -+ break; -+ -+ case Opt_del: -+ case Opt_idel: -+ err = au_br_del(sb, &opt->del, -+ au_ftest_opts(opts->flags, REMOUNT)); -+ if (!err) { -+ err = 1; -+ au_fset_opts(opts->flags, TRUNC_XIB); -+ au_fset_opts(opts->flags, REFRESH_DIR); -+ au_fset_opts(opts->flags, REFRESH_NONDIR); -+ } -+ break; -+ -+ case Opt_mod: -+ case Opt_imod: -+ err = au_br_mod(sb, &opt->mod, -+ au_ftest_opts(opts->flags, REMOUNT), -+ &do_refresh); -+ if (!err) { -+ err = 1; -+ if (do_refresh) { -+ au_fset_opts(opts->flags, REFRESH_DIR); -+ au_fset_opts(opts->flags, REFRESH_NONDIR); -+ } -+ } -+ break; -+ } -+ -+ return err; -+} -+ -+static int au_opt_xino(struct super_block *sb, struct au_opt *opt, -+ struct au_opt_xino **opt_xino, -+ struct au_opts *opts) -+{ -+ int err; -+ aufs_bindex_t bend, bindex; -+ struct dentry *root, *parent, *h_root; -+ -+ err = 0; -+ switch (opt->type) { -+ case Opt_xino: -+ err = au_xino_set(sb, &opt->xino, -+ !!au_ftest_opts(opts->flags, REMOUNT)); -+ if (unlikely(err)) -+ break; -+ -+ *opt_xino = &opt->xino; -+ au_xino_brid_set(sb, -1); -+ -+ /* safe d_parent access */ -+ parent = opt->xino.file->f_dentry->d_parent; -+ root = sb->s_root; -+ bend = au_sbend(sb); -+ for (bindex = 0; bindex <= bend; bindex++) { -+ h_root = au_h_dptr(root, bindex); -+ if (h_root == parent) { -+ au_xino_brid_set(sb, au_sbr_id(sb, bindex)); -+ break; -+ } -+ } -+ break; -+ -+ case Opt_noxino: -+ au_xino_clr(sb); -+ au_xino_brid_set(sb, -1); -+ *opt_xino = (void *)-1; -+ break; -+ } -+ -+ return err; -+} -+ -+int au_opts_verify(struct super_block *sb, unsigned long sb_flags, -+ unsigned int pending) -+{ -+ int err; -+ aufs_bindex_t bindex, bend; -+ unsigned char do_plink, skip, do_free; -+ struct au_branch *br; -+ struct au_wbr *wbr; -+ struct dentry *root; -+ struct inode *dir, *h_dir; -+ struct au_sbinfo *sbinfo; -+ struct au_hinode *hdir; -+ -+ SiMustAnyLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); -+ -+ if (!(sb_flags & MS_RDONLY)) { -+ if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) -+ AuWarn("first branch should be rw\n"); -+ if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) -+ AuWarn("shwh should be used with ro\n"); -+ } -+ -+ if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HINOTIFY) -+ && !au_opt_test(sbinfo->si_mntflags, XINO)) -+ AuWarn("udba=inotify requires xino\n"); -+ -+ err = 0; -+ root = sb->s_root; -+ dir = sb->s_root->d_inode; -+ do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); -+ bend = au_sbend(sb); -+ for (bindex = 0; !err && bindex <= bend; bindex++) { -+ skip = 0; -+ h_dir = au_h_iptr(dir, bindex); -+ br = au_sbr(sb, bindex); -+ do_free = 0; -+ -+ wbr = br->br_wbr; -+ if (wbr) -+ wbr_wh_read_lock(wbr); -+ -+ switch (br->br_perm) { -+ case AuBrPerm_RO: -+ case AuBrPerm_ROWH: -+ case AuBrPerm_RR: -+ case AuBrPerm_RRWH: -+ do_free = !!wbr; -+ skip = (!wbr -+ || 
(!wbr->wbr_whbase -+ && !wbr->wbr_plink -+ && !wbr->wbr_orph)); -+ break; -+ -+ case AuBrPerm_RWNoLinkWH: -+ /* skip = (!br->br_whbase && !br->br_orph); */ -+ skip = (!wbr || !wbr->wbr_whbase); -+ if (skip && wbr) { -+ if (do_plink) -+ skip = !!wbr->wbr_plink; -+ else -+ skip = !wbr->wbr_plink; -+ } -+ break; -+ -+ case AuBrPerm_RW: -+ /* skip = (br->br_whbase && br->br_ohph); */ -+ skip = (wbr && wbr->wbr_whbase); -+ if (skip) { -+ if (do_plink) -+ skip = !!wbr->wbr_plink; -+ else -+ skip = !wbr->wbr_plink; -+ } -+ break; -+ -+ default: -+ BUG(); -+ } -+ if (wbr) -+ wbr_wh_read_unlock(wbr); -+ -+ if (skip) -+ continue; -+ -+ hdir = au_hi(dir, bindex); -+ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); -+ if (wbr) -+ wbr_wh_write_lock(wbr); -+ err = au_wh_init(au_h_dptr(root, bindex), br, sb); -+ if (wbr) -+ wbr_wh_write_unlock(wbr); -+ au_hin_imtx_unlock(hdir); -+ -+ if (!err && do_free) { -+ kfree(wbr); -+ br->br_wbr = NULL; -+ } -+ } -+ -+ return err; -+} -+ -+int au_opts_mount(struct super_block *sb, struct au_opts *opts) -+{ -+ int err; -+ unsigned int tmp; -+ aufs_bindex_t bend; -+ struct au_opt *opt; -+ struct au_opt_xino *opt_xino, xino; -+ struct au_sbinfo *sbinfo; -+ -+ SiMustWriteLock(sb); -+ -+ err = 0; -+ opt_xino = NULL; -+ opt = opts->opt; -+ while (err >= 0 && opt->type != Opt_tail) -+ err = au_opt_simple(sb, opt++, opts); -+ if (err > 0) -+ err = 0; -+ else if (unlikely(err < 0)) -+ goto out; -+ -+ /* disable xino and udba temporary */ -+ sbinfo = au_sbi(sb); -+ tmp = sbinfo->si_mntflags; -+ au_opt_clr(sbinfo->si_mntflags, XINO); -+ au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); -+ -+ opt = opts->opt; -+ while (err >= 0 && opt->type != Opt_tail) -+ err = au_opt_br(sb, opt++, opts); -+ if (err > 0) -+ err = 0; -+ else if (unlikely(err < 0)) -+ goto out; -+ -+ bend = au_sbend(sb); -+ if (unlikely(bend < 0)) { -+ err = -EINVAL; -+ AuErr("no branches\n"); -+ goto out; -+ } -+ -+ if (au_opt_test(tmp, XINO)) -+ au_opt_set(sbinfo->si_mntflags, XINO); -+ opt = opts->opt; -+ while (!err && opt->type != Opt_tail) -+ err = au_opt_xino(sb, opt++, &opt_xino, opts); -+ if (unlikely(err)) -+ goto out; -+ -+ err = au_opts_verify(sb, sb->s_flags, tmp); -+ if (unlikely(err)) -+ goto out; -+ -+ /* restore xino */ -+ if (au_opt_test(tmp, XINO) && !opt_xino) { -+ xino.file = au_xino_def(sb); -+ err = PTR_ERR(xino.file); -+ if (IS_ERR(xino.file)) -+ goto out; -+ -+ err = au_xino_set(sb, &xino, /*remount*/0); -+ fput(xino.file); -+ if (unlikely(err)) -+ goto out; -+ } -+ -+ /* restore udba */ -+ sbinfo->si_mntflags &= ~AuOptMask_UDBA; -+ sbinfo->si_mntflags |= (tmp & AuOptMask_UDBA); -+ if (au_opt_test(tmp, UDBA_HINOTIFY)) { -+ struct inode *dir = sb->s_root->d_inode; -+ au_reset_hinotify(dir, -+ au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); -+ } -+ -+ out: -+ return err; -+} -+ -+int au_opts_remount(struct super_block *sb, struct au_opts *opts) -+{ -+ int err, rerr; -+ struct inode *dir; -+ struct au_opt_xino *opt_xino; -+ struct au_opt *opt; -+ struct au_sbinfo *sbinfo; -+ -+ SiMustWriteLock(sb); -+ -+ dir = sb->s_root->d_inode; -+ sbinfo = au_sbi(sb); -+ err = 0; -+ opt_xino = NULL; -+ opt = opts->opt; -+ while (err >= 0 && opt->type != Opt_tail) { -+ err = au_opt_simple(sb, opt, opts); -+ if (!err) -+ err = au_opt_br(sb, opt, opts); -+ if (!err) -+ err = au_opt_xino(sb, opt, &opt_xino, opts); -+ opt++; -+ } -+ if (err > 0) -+ err = 0; -+ AuTraceErr(err); -+ /* go on even err */ -+ -+ rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); -+ if (unlikely(rerr && !err)) -+ err = rerr; -+ -+ 
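-+	/* a branch deletion above may also have set TRUNC_XIB; truncate the
-+	 * external inode-number bitmap in either case */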
if (au_ftest_opts(opts->flags, TRUNC_XIB)) { -+ rerr = au_xib_trunc(sb); -+ if (unlikely(rerr && !err)) -+ err = rerr; -+ } -+ -+ /* will be handled by the caller */ -+ if (!au_ftest_opts(opts->flags, REFRESH_DIR) -+ && (opts->given_udba || au_opt_test(sbinfo->si_mntflags, XINO))) -+ au_fset_opts(opts->flags, REFRESH_DIR); -+ -+ AuDbg("status 0x%x\n", opts->flags); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+unsigned int au_opt_udba(struct super_block *sb) -+{ -+ return au_mntflags(sb) & AuOptMask_UDBA; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/opts.h linux-2.6.31/fs/aufs/opts.h ---- linux-2.6.31-vanilla/fs/aufs/opts.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/opts.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,196 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * mount options/flags -+ */ -+ -+#ifndef __AUFS_OPTS_H__ -+#define __AUFS_OPTS_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/path.h> -+#include <linux/aufs_type.h> -+ -+struct file; -+struct super_block; -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* mount flags */ -+#define AuOpt_XINO 1 /* external inode number bitmap -+ and translation table */ -+#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ -+#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ -+#define AuOpt_UDBA_REVAL (1 << 3) -+#define AuOpt_UDBA_HINOTIFY (1 << 4) -+#define AuOpt_SHWH (1 << 5) /* show whiteout */ -+#define AuOpt_PLINK (1 << 6) /* pseudo-link */ -+#define AuOpt_DIRPERM1 (1 << 7) /* unimplemented */ -+#define AuOpt_REFROF (1 << 8) /* unimplemented */ -+#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ -+#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ -+#define AuOpt_SUM_W (1 << 11) /* unimplemented */ -+#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ -+#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ -+ -+#ifndef CONFIG_AUFS_HINOTIFY -+#undef AuOpt_UDBA_HINOTIFY -+#define AuOpt_UDBA_HINOTIFY 0 -+#endif -+#ifndef CONFIG_AUFS_SHWH -+#undef AuOpt_SHWH -+#define AuOpt_SHWH 0 -+#endif -+ -+#define AuOpt_Def (AuOpt_XINO \ -+ | AuOpt_UDBA_REVAL \ -+ | AuOpt_PLINK \ -+ /* | AuOpt_DIRPERM1 */ \ -+ | AuOpt_WARN_PERM) -+#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ -+ | AuOpt_UDBA_REVAL \ -+ | AuOpt_UDBA_HINOTIFY) -+ -+#define au_opt_test(flags, name) (flags & AuOpt_##name) -+#define au_opt_set(flags, name) do { \ -+ BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ -+ ((flags) |= AuOpt_##name); \ -+} while (0) -+#define au_opt_set_udba(flags, name) do { \ -+ (flags) &= ~AuOptMask_UDBA; \ -+ ((flags) |= AuOpt_##name); \ -+} while (0) -+#define au_opt_clr(flags, name) { ((flags) &= ~AuOpt_##name); } -+ -+/* 
---------------------------------------------------------------------- */ -+ -+/* policies to select one among multiple writable branches */ -+enum { -+ AuWbrCreate_TDP, /* top down parent */ -+ AuWbrCreate_RR, /* round robin */ -+ AuWbrCreate_MFS, /* most free space */ -+ AuWbrCreate_MFSV, /* mfs with seconds */ -+ AuWbrCreate_MFSRR, /* mfs then rr */ -+ AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ -+ AuWbrCreate_PMFS, /* parent and mfs */ -+ AuWbrCreate_PMFSV, /* parent and mfs with seconds */ -+ -+ AuWbrCreate_Def = AuWbrCreate_TDP -+}; -+ -+enum { -+ AuWbrCopyup_TDP, /* top down parent */ -+ AuWbrCopyup_BUP, /* bottom up parent */ -+ AuWbrCopyup_BU, /* bottom up */ -+ -+ AuWbrCopyup_Def = AuWbrCopyup_TDP -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct au_opt_add { -+ aufs_bindex_t bindex; -+ char *pathname; -+ int perm; -+ struct path path; -+}; -+ -+struct au_opt_del { -+ char *pathname; -+ struct path h_path; -+}; -+ -+struct au_opt_mod { -+ char *path; -+ int perm; -+ struct dentry *h_root; -+}; -+ -+struct au_opt_xino { -+ char *path; -+ struct file *file; -+}; -+ -+struct au_opt_xino_itrunc { -+ aufs_bindex_t bindex; -+}; -+ -+struct au_opt_wbr_create { -+ int wbr_create; -+ int mfs_second; -+ unsigned long long mfsrr_watermark; -+}; -+ -+struct au_opt { -+ int type; -+ union { -+ struct au_opt_xino xino; -+ struct au_opt_xino_itrunc xino_itrunc; -+ struct au_opt_add add; -+ struct au_opt_del del; -+ struct au_opt_mod mod; -+ int dirwh; -+ int rdcache; -+ unsigned int rdblk; -+ unsigned int rdhash; -+ int udba; -+ struct au_opt_wbr_create wbr_create; -+ int wbr_copyup; -+ }; -+}; -+ -+/* opts flags */ -+#define AuOpts_REMOUNT 1 -+#define AuOpts_REFRESH_DIR (1 << 1) -+#define AuOpts_REFRESH_NONDIR (1 << 2) -+#define AuOpts_TRUNC_XIB (1 << 3) -+#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) -+#define au_fset_opts(flags, name) { (flags) |= AuOpts_##name; } -+#define au_fclr_opts(flags, name) { (flags) &= ~AuOpts_##name; } -+ -+struct au_opts { -+ struct au_opt *opt; -+ int max_opt; -+ -+ unsigned int given_udba; -+ unsigned int flags; -+ unsigned long sb_flags; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+const char *au_optstr_br_perm(int brperm); -+const char *au_optstr_udba(int udba); -+const char *au_optstr_wbr_copyup(int wbr_copyup); -+const char *au_optstr_wbr_create(int wbr_create); -+ -+void au_opts_free(struct au_opts *opts); -+int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); -+int au_opts_verify(struct super_block *sb, unsigned long sb_flags, -+ unsigned int pending); -+int au_opts_mount(struct super_block *sb, struct au_opts *opts); -+int au_opts_remount(struct super_block *sb, struct au_opts *opts); -+ -+unsigned int au_opt_udba(struct super_block *sb); -+ -+/* ---------------------------------------------------------------------- */ -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_OPTS_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/plink.c linux-2.6.31/fs/aufs/plink.c ---- linux-2.6.31-vanilla/fs/aufs/plink.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/plink.c 2009-09-16 13:55:29.000000000 +0200 -@@ -0,0 +1,396 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * pseudo-link -+ */ -+ -+#include "aufs.h" -+ -+/* -+ * during a user process maintains the pseudo-links, -+ * prohibit adding a new plink and branch manipulation. -+ */ -+void au_plink_block_maintain(struct super_block *sb) -+{ -+ struct au_sbinfo *sbi = au_sbi(sb); -+ -+ SiMustAnyLock(sb); -+ -+ /* gave up wake_up_bit() */ -+ wait_event(sbi->si_plink_wq, !au_ftest_si(sbi, MAINTAIN_PLINK)); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct pseudo_link { -+ struct list_head list; -+ struct inode *inode; -+}; -+ -+#ifdef CONFIG_AUFS_DEBUG -+void au_plink_list(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ struct list_head *plink_list; -+ struct pseudo_link *plink; -+ -+ SiMustAnyLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); -+ -+ plink_list = &sbinfo->si_plink.head; -+ spin_lock(&sbinfo->si_plink.spin); -+ list_for_each_entry(plink, plink_list, list) -+ AuDbg("%lu\n", plink->inode->i_ino); -+ spin_unlock(&sbinfo->si_plink.spin); -+} -+#endif -+ -+/* is the inode pseudo-linked? */ -+int au_plink_test(struct inode *inode) -+{ -+ int found; -+ struct au_sbinfo *sbinfo; -+ struct list_head *plink_list; -+ struct pseudo_link *plink; -+ -+ sbinfo = au_sbi(inode->i_sb); -+ AuRwMustAnyLock(&sbinfo->si_rwsem); -+ AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); -+ -+ found = 0; -+ plink_list = &sbinfo->si_plink.head; -+ spin_lock(&sbinfo->si_plink.spin); -+ list_for_each_entry(plink, plink_list, list) -+ if (plink->inode == inode) { -+ found = 1; -+ break; -+ } -+ spin_unlock(&sbinfo->si_plink.spin); -+ return found; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * generate a name for plink. -+ * the file will be stored under AUFS_WH_PLINKDIR. -+ */ -+/* 20 is max digits length of ulong 64 */ -+#define PLINK_NAME_LEN ((20 + 1) * 2) -+ -+static int plink_name(char *name, int len, struct inode *inode, -+ aufs_bindex_t bindex) -+{ -+ int rlen; -+ struct inode *h_inode; -+ -+ h_inode = au_h_iptr(inode, bindex); -+ rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); -+ return rlen; -+} -+ -+/* lookup the plink-ed @inode under the branch at @bindex */ -+struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) -+{ -+ struct dentry *h_dentry, *h_parent; -+ struct au_branch *br; -+ struct inode *h_dir; -+ char a[PLINK_NAME_LEN]; -+ struct qstr tgtname = { -+ .name = a -+ }; -+ -+ br = au_sbr(inode->i_sb, bindex); -+ h_parent = br->br_wbr->wbr_plink; -+ h_dir = h_parent->d_inode; -+ tgtname.len = plink_name(a, sizeof(a), inode, bindex); -+ -+ /* always superio. 
*/ -+ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); -+ h_dentry = au_sio_lkup_one(&tgtname, h_parent, br); -+ mutex_unlock(&h_dir->i_mutex); -+ return h_dentry; -+} -+ -+/* create a pseudo-link */ -+static int do_whplink(struct qstr *tgt, struct dentry *h_parent, -+ struct dentry *h_dentry, struct au_branch *br) -+{ -+ int err; -+ struct path h_path = { -+ .mnt = br->br_mnt -+ }; -+ struct inode *h_dir; -+ -+ h_dir = h_parent->d_inode; -+ again: -+ h_path.dentry = au_lkup_one(tgt, h_parent, br, /*nd*/NULL); -+ err = PTR_ERR(h_path.dentry); -+ if (IS_ERR(h_path.dentry)) -+ goto out; -+ -+ err = 0; -+ /* wh.plink dir is not monitored */ -+ if (h_path.dentry->d_inode -+ && h_path.dentry->d_inode != h_dentry->d_inode) { -+ err = vfsub_unlink(h_dir, &h_path, /*force*/0); -+ dput(h_path.dentry); -+ h_path.dentry = NULL; -+ if (!err) -+ goto again; -+ } -+ if (!err && !h_path.dentry->d_inode) -+ err = vfsub_link(h_dentry, h_dir, &h_path); -+ dput(h_path.dentry); -+ -+ out: -+ return err; -+} -+ -+struct do_whplink_args { -+ int *errp; -+ struct qstr *tgt; -+ struct dentry *h_parent; -+ struct dentry *h_dentry; -+ struct au_branch *br; -+}; -+ -+static void call_do_whplink(void *args) -+{ -+ struct do_whplink_args *a = args; -+ *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); -+} -+ -+static int whplink(struct dentry *h_dentry, struct inode *inode, -+ aufs_bindex_t bindex, struct au_branch *br) -+{ -+ int err, wkq_err; -+ struct au_wbr *wbr; -+ struct dentry *h_parent; -+ struct inode *h_dir; -+ char a[PLINK_NAME_LEN]; -+ struct qstr tgtname = { -+ .name = a -+ }; -+ -+ wbr = au_sbr(inode->i_sb, bindex)->br_wbr; -+ h_parent = wbr->wbr_plink; -+ h_dir = h_parent->d_inode; -+ tgtname.len = plink_name(a, sizeof(a), inode, bindex); -+ -+ /* always superio. */ -+ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); -+ if (!au_test_wkq(current)) { -+ struct do_whplink_args args = { -+ .errp = &err, -+ .tgt = &tgtname, -+ .h_parent = h_parent, -+ .h_dentry = h_dentry, -+ .br = br -+ }; -+ wkq_err = au_wkq_wait(call_do_whplink, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } else -+ err = do_whplink(&tgtname, h_parent, h_dentry, br); -+ mutex_unlock(&h_dir->i_mutex); -+ -+ return err; -+} -+ -+/* free a single plink */ -+static void do_put_plink(struct pseudo_link *plink, int do_del) -+{ -+ iput(plink->inode); -+ if (do_del) -+ list_del(&plink->list); -+ kfree(plink); -+} -+ -+/* -+ * create a new pseudo-link for @h_dentry on @bindex. -+ * the linked inode is held in aufs @inode. 
-+ */ -+void au_plink_append(struct inode *inode, aufs_bindex_t bindex, -+ struct dentry *h_dentry) -+{ -+ struct super_block *sb; -+ struct au_sbinfo *sbinfo; -+ struct list_head *plink_list; -+ struct pseudo_link *plink; -+ int found, err, cnt; -+ -+ sb = inode->i_sb; -+ sbinfo = au_sbi(sb); -+ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); -+ -+ err = 0; -+ cnt = 0; -+ found = 0; -+ plink_list = &sbinfo->si_plink.head; -+ spin_lock(&sbinfo->si_plink.spin); -+ list_for_each_entry(plink, plink_list, list) { -+ cnt++; -+ if (plink->inode == inode) { -+ found = 1; -+ break; -+ } -+ } -+ if (found) { -+ spin_unlock(&sbinfo->si_plink.spin); -+ return; -+ } -+ -+ plink = NULL; -+ if (!found) { -+ plink = kmalloc(sizeof(*plink), GFP_ATOMIC); -+ if (plink) { -+ plink->inode = au_igrab(inode); -+ list_add(&plink->list, plink_list); -+ cnt++; -+ } else -+ err = -ENOMEM; -+ } -+ spin_unlock(&sbinfo->si_plink.spin); -+ -+ if (!err) { -+ au_plink_block_maintain(sb); -+ err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); -+ } -+ -+ if (unlikely(cnt > AUFS_PLINK_WARN)) -+ AuWarn1("unexpectedly many pseudo links, %d\n", cnt); -+ if (unlikely(err)) { -+ AuWarn("err %d, damaged pseudo link.\n", err); -+ if (!found && plink) -+ do_put_plink(plink, /*do_del*/1); -+ } -+} -+ -+/* free all plinks */ -+void au_plink_put(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ struct list_head *plink_list; -+ struct pseudo_link *plink, *tmp; -+ -+ SiMustWriteLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); -+ -+ plink_list = &sbinfo->si_plink.head; -+ /* no spin_lock since sbinfo is write-locked */ -+ list_for_each_entry_safe(plink, tmp, plink_list, list) -+ do_put_plink(plink, 0); -+ INIT_LIST_HEAD(plink_list); -+} -+ -+/* free the plinks on a branch specified by @br_id */ -+void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) -+{ -+ struct au_sbinfo *sbinfo; -+ struct list_head *plink_list; -+ struct pseudo_link *plink, *tmp; -+ struct inode *inode; -+ aufs_bindex_t bstart, bend, bindex; -+ unsigned char do_put; -+ -+ SiMustWriteLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); -+ -+ plink_list = &sbinfo->si_plink.head; -+ /* no spin_lock since sbinfo is write-locked */ -+ list_for_each_entry_safe(plink, tmp, plink_list, list) { -+ do_put = 0; -+ inode = au_igrab(plink->inode); -+ ii_write_lock_child(inode); -+ bstart = au_ibstart(inode); -+ bend = au_ibend(inode); -+ if (bstart >= 0) { -+ for (bindex = bstart; bindex <= bend; bindex++) { -+ if (!au_h_iptr(inode, bindex) -+ || au_ii_br_id(inode, bindex) != br_id) -+ continue; -+ au_set_h_iptr(inode, bindex, NULL, 0); -+ do_put = 1; -+ break; -+ } -+ } else -+ do_put_plink(plink, 1); -+ -+ if (do_put) { -+ for (bindex = bstart; bindex <= bend; bindex++) -+ if (au_h_iptr(inode, bindex)) { -+ do_put = 0; -+ break; -+ } -+ if (do_put) -+ do_put_plink(plink, 1); -+ } -+ ii_write_unlock(inode); -+ iput(inode); -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+long au_plink_ioctl(struct file *file, unsigned int cmd) -+{ -+ long err; -+ struct super_block *sb; -+ struct au_sbinfo *sbinfo; -+ -+ err = -EACCES; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto out; -+ -+ err = 0; -+ sb = file->f_dentry->d_sb; -+ sbinfo = au_sbi(sb); -+ switch (cmd) { -+ case AUFS_CTL_PLINK_MAINT: -+ /* -+ * pseudo-link maintenance mode, -+ * cleared by aufs_release_dir() -+ */ -+ si_write_lock(sb); -+ if (!au_ftest_si(sbinfo, 
MAINTAIN_PLINK)) { -+ au_fset_si(sbinfo, MAINTAIN_PLINK); -+ au_fi(file)->fi_maintain_plink = 1; -+ } else -+ err = -EBUSY; -+ si_write_unlock(sb); -+ break; -+ case AUFS_CTL_PLINK_CLEAN: -+ aufs_write_lock(sb->s_root); -+ if (au_opt_test(sbinfo->si_mntflags, PLINK)) -+ au_plink_put(sb); -+ aufs_write_unlock(sb->s_root); -+ break; -+ default: -+ err = -EINVAL; -+ } -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/poll.c linux-2.6.31/fs/aufs/poll.c ---- linux-2.6.31-vanilla/fs/aufs/poll.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/poll.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,56 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * poll operation -+ * There is only one filesystem which implements ->poll operation, currently. -+ */ -+ -+#include "aufs.h" -+ -+unsigned int aufs_poll(struct file *file, poll_table *wait) -+{ -+ unsigned int mask; -+ int err; -+ struct file *h_file; -+ struct dentry *dentry; -+ struct super_block *sb; -+ -+ /* We should pretend an error happened. */ -+ mask = POLLERR /* | POLLIN | POLLOUT */; -+ dentry = file->f_dentry; -+ sb = dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); -+ if (unlikely(err)) -+ goto out; -+ -+ /* it is not an error if h_file has no operation */ -+ mask = DEFAULT_POLLMASK; -+ h_file = au_h_fptr(file, au_fbstart(file)); -+ if (h_file->f_op && h_file->f_op->poll) -+ mask = h_file->f_op->poll(h_file, wait); -+ -+ di_read_unlock(dentry, AuLock_IR); -+ fi_read_unlock(file); -+ -+ out: -+ si_read_unlock(sb); -+ AuTraceErr((int)mask); -+ return mask; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/rdu.c linux-2.6.31/fs/aufs/rdu.c ---- linux-2.6.31-vanilla/fs/aufs/rdu.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/rdu.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,331 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * readdir in userspace. 
-+ */ -+ -+#include <linux/security.h> -+#include <linux/uaccess.h> -+#include <linux/aufs_type.h> -+#include "aufs.h" -+ -+/* bits for struct aufs_rdu.flags */ -+#define AuRdu_CALLED 1 -+#define AuRdu_CONT (1 << 1) -+#define AuRdu_FULL (1 << 2) -+#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name) -+#define au_fset_rdu(flags, name) { (flags) |= AuRdu_##name; } -+#define au_fclr_rdu(flags, name) { (flags) &= ~AuRdu_##name; } -+ -+struct au_rdu_arg { -+ struct aufs_rdu *rdu; -+ union au_rdu_ent_ul ent; -+ unsigned long end; -+ -+ struct super_block *sb; -+ int err; -+}; -+ -+static int au_rdu_fill(void *__arg, const char *name, int nlen, -+ loff_t offset, u64 h_ino, unsigned int d_type) -+{ -+ int err, len; -+ struct au_rdu_arg *arg = __arg; -+ struct aufs_rdu *rdu = arg->rdu; -+ struct au_rdu_ent ent; -+ -+ err = 0; -+ arg->err = 0; -+ au_fset_rdu(rdu->cookie.flags, CALLED); -+ len = au_rdu_len(nlen); -+ if (arg->ent.ul + len < arg->end) { -+ ent.ino = h_ino; -+ ent.bindex = rdu->cookie.bindex; -+ ent.type = d_type; -+ ent.nlen = nlen; -+ -+ err = -EFAULT; -+ if (copy_to_user(arg->ent.e, &ent, sizeof(ent))) -+ goto out; -+ if (copy_to_user(arg->ent.e->name, name, nlen)) -+ goto out; -+ /* the terminating NULL */ -+ if (__put_user(0, arg->ent.e->name + nlen)) -+ goto out; -+ err = 0; -+ /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */ -+ arg->ent.ul += len; -+ rdu->rent++; -+ } else { -+ err = -EFAULT; -+ au_fset_rdu(rdu->cookie.flags, FULL); -+ rdu->full = 1; -+ rdu->tail = arg->ent; -+ } -+ -+ out: -+ /* AuTraceErr(err); */ -+ return err; -+} -+ -+static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg) -+{ -+ int err; -+ loff_t offset; -+ struct au_rdu_cookie *cookie = &arg->rdu->cookie; -+ -+ offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET); -+ err = offset; -+ if (unlikely(offset != cookie->h_pos)) -+ goto out; -+ -+ err = 0; -+ do { -+ arg->err = 0; -+ au_fclr_rdu(cookie->flags, CALLED); -+ /* smp_mb(); */ -+ err = vfsub_readdir(h_file, au_rdu_fill, arg); -+ if (err >= 0) -+ err = arg->err; -+ } while (!err -+ && au_ftest_rdu(cookie->flags, CALLED) -+ && !au_ftest_rdu(cookie->flags, FULL)); -+ cookie->h_pos = h_file->f_pos; -+ -+ out: -+ AuTraceErr(err); -+ return err; -+} -+ -+static int au_rdu(struct file *file, struct aufs_rdu *rdu) -+{ -+ int err; -+ aufs_bindex_t bend; -+ struct au_rdu_arg arg; -+ struct dentry *dentry; -+ struct inode *inode; -+ struct file *h_file; -+ struct au_rdu_cookie *cookie = &rdu->cookie; -+ -+ err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz); -+ if (unlikely(err)) { -+ err = -EFAULT; -+ AuTraceErr(err); -+ goto out; -+ } -+ rdu->rent = 0; -+ rdu->tail = rdu->ent; -+ rdu->full = 0; -+ arg.rdu = rdu; -+ arg.ent = rdu->ent; -+ arg.end = arg.ent.ul; -+ arg.end += rdu->sz; -+ -+ err = -ENOTDIR; -+ if (unlikely(!file->f_op || !file->f_op->readdir)) -+ goto out; -+ -+ err = security_file_permission(file, MAY_READ); -+ AuTraceErr(err); -+ if (unlikely(err)) -+ goto out; -+ -+ dentry = file->f_dentry; -+ inode = dentry->d_inode; -+#if 1 -+ mutex_lock(&inode->i_mutex); -+#else -+ err = mutex_lock_killable(&inode->i_mutex); -+ AuTraceErr(err); -+ if (unlikely(err)) -+ goto out; -+#endif -+ err = -ENOENT; -+ if (unlikely(IS_DEADDIR(inode))) -+ goto out_mtx; -+ -+ arg.sb = inode->i_sb; -+ si_read_lock(arg.sb, AuLock_FLUSH); -+ fi_read_lock(file); -+ -+ err = -EAGAIN; -+ if (unlikely(au_ftest_rdu(cookie->flags, CONT) -+ && cookie->generation != au_figen(file))) -+ goto out_unlock; -+ -+ err = 0; -+ if (!rdu->blk) { -+ rdu->blk = 
au_sbi(arg.sb)->si_rdblk; -+ if (!rdu->blk) -+ rdu->blk = au_dir_size(file, /*dentry*/NULL); -+ } -+ bend = au_fbstart(file); -+ if (cookie->bindex < bend) -+ cookie->bindex = bend; -+ bend = au_fbend(file); -+ /* AuDbg("b%d, b%d\n", cookie->bindex, bend); */ -+ for (; !err && cookie->bindex <= bend; -+ cookie->bindex++, cookie->h_pos = 0) { -+ h_file = au_h_fptr(file, cookie->bindex); -+ if (!h_file) -+ continue; -+ -+ au_fclr_rdu(cookie->flags, FULL); -+ err = au_rdu_do(h_file, &arg); -+ AuTraceErr(err); -+ if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) -+ break; -+ } -+ AuDbg("rent %llu\n", rdu->rent); -+ -+ if (!err && !au_ftest_rdu(cookie->flags, CONT)) { -+ rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); -+ au_fset_rdu(cookie->flags, CONT); -+ cookie->generation = au_figen(file); -+ } -+ -+ ii_read_lock_child(inode); -+ fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode))); -+ ii_read_unlock(inode); -+ -+ out_unlock: -+ fi_read_unlock(file); -+ si_read_unlock(arg.sb); -+ out_mtx: -+ mutex_unlock(&inode->i_mutex); -+ out: -+ AuTraceErr(err); -+ return err; -+} -+ -+static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) -+{ -+ int err; -+ ino_t ino; -+ unsigned long long nent; -+ union au_rdu_ent_ul *u; -+ struct au_rdu_ent ent; -+ struct super_block *sb; -+ -+ err = 0; -+ nent = rdu->nent; -+ u = &rdu->ent; -+ sb = file->f_dentry->d_sb; -+ si_read_lock(sb, AuLock_FLUSH); -+ while (nent-- > 0) { -+ err = !access_ok(VERIFY_WRITE, u->e, sizeof(ent)); -+ if (unlikely(err)) { -+ err = -EFAULT; -+ AuTraceErr(err); -+ break; -+ } -+ -+ err = copy_from_user(&ent, u->e, sizeof(ent)); -+ if (unlikely(err)) { -+ err = -EFAULT; -+ AuTraceErr(err); -+ break; -+ } -+ -+ /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ -+ if (!ent.wh) -+ err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); -+ else -+ err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, -+ &ino); -+ if (unlikely(err)) { -+ AuTraceErr(err); -+ break; -+ } -+ -+ err = __put_user(ino, &u->e->ino); -+ if (unlikely(err)) { -+ err = -EFAULT; -+ AuTraceErr(err); -+ break; -+ } -+ u->ul += au_rdu_len(ent.nlen); -+ } -+ si_read_unlock(sb); -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int au_rdu_verify(struct aufs_rdu *rdu) -+{ -+ AuDbg("rdu{%llu, %p, (%u, %u) | %u | %llu, %u, %u | " -+ "%llu, b%d, 0x%x, g%u}\n", -+ rdu->sz, rdu->ent.e, rdu->verify[0], rdu->verify[1], -+ rdu->blk, -+ rdu->rent, rdu->shwh, rdu->full, -+ rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, -+ rdu->cookie.generation); -+ -+ if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu) -+ && rdu->verify[AufsCtlRduV_SZ_PTR] == sizeof(rdu)) -+ return 0; -+ -+ AuDbg("%u:%u, %u:%u\n", -+ rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu), -+ rdu->verify[AufsCtlRduV_SZ_PTR], (unsigned int)sizeof(rdu)); -+ return -EINVAL; -+} -+ -+long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -+{ -+ long err, e; -+ struct aufs_rdu rdu; -+ void __user *p = (void __user *)arg; -+ -+ err = copy_from_user(&rdu, p, sizeof(rdu)); -+ if (unlikely(err)) { -+ err = -EFAULT; -+ AuTraceErr(err); -+ goto out; -+ } -+ err = au_rdu_verify(&rdu); -+ if (unlikely(err)) -+ goto out; -+ -+ switch (cmd) { -+ case AUFS_CTL_RDU: -+ err = au_rdu(file, &rdu); -+ if (unlikely(err)) -+ break; -+ -+ e = copy_to_user(p, &rdu, sizeof(rdu)); -+ if (unlikely(e)) { -+ err = -EFAULT; -+ AuTraceErr(err); -+ } -+ break; -+ case AUFS_CTL_RDU_INO: -+ err = au_rdu_ino(file, &rdu); -+ 
break; -+ -+ default: -+ err = -EINVAL; -+ } -+ -+ out: -+ AuTraceErr(err); -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/rwsem.h linux-2.6.31/fs/aufs/rwsem.h ---- linux-2.6.31-vanilla/fs/aufs/rwsem.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/rwsem.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,186 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * simple read-write semaphore wrappers -+ */ -+ -+#ifndef __AUFS_RWSEM_H__ -+#define __AUFS_RWSEM_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/rwsem.h> -+ -+struct au_rwsem { -+ struct rw_semaphore rwsem; -+#ifdef CONFIG_AUFS_DEBUG -+ /* just for debugging, not almighty counter */ -+ atomic_t rcnt, wcnt; -+#endif -+}; -+ -+#ifdef CONFIG_AUFS_DEBUG -+#define AuDbgCntInit(rw) do { \ -+ atomic_set(&(rw)->rcnt, 0); \ -+ atomic_set(&(rw)->wcnt, 0); \ -+ smp_mb(); /* atomic set */ \ -+} while (0) -+ -+#define AuDbgRcntInc(rw) atomic_inc_return(&(rw)->rcnt) -+#define AuDbgRcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0) -+#define AuDbgWcntInc(rw) WARN_ON(atomic_inc_return(&(rw)->wcnt) > 1) -+#define AuDbgWcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0) -+#else -+#define AuDbgCntInit(rw) do {} while (0) -+#define AuDbgRcntInc(rw) do {} while (0) -+#define AuDbgRcntDec(rw) do {} while (0) -+#define AuDbgWcntInc(rw) do {} while (0) -+#define AuDbgWcntDec(rw) do {} while (0) -+#endif /* CONFIG_AUFS_DEBUG */ -+ -+/* to debug easier, do not make them inlined functions */ -+#define AuRwMustNoWaiters(rw) AuDebugOn(!list_empty(&(rw)->rwsem.wait_list)) -+/* rwsem_is_locked() is unusable */ -+#define AuRwMustReadLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0) -+#define AuRwMustWriteLock(rw) AuDebugOn(atomic_read(&(rw)->wcnt) <= 0) -+#define AuRwMustAnyLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \ -+ && atomic_read(&(rw)->wcnt) <= 0) -+#define AuRwDestroy(rw) AuDebugOn(atomic_read(&(rw)->rcnt) \ -+ || atomic_read(&(rw)->wcnt)) -+ -+static inline void au_rw_init(struct au_rwsem *rw) -+{ -+ AuDbgCntInit(rw); -+ init_rwsem(&rw->rwsem); -+} -+ -+static inline void au_rw_init_wlock(struct au_rwsem *rw) -+{ -+ au_rw_init(rw); -+ down_write(&rw->rwsem); -+ AuDbgWcntInc(rw); -+} -+ -+static inline void au_rw_init_wlock_nested(struct au_rwsem *rw, -+ unsigned int lsc) -+{ -+ au_rw_init(rw); -+ down_write_nested(&rw->rwsem, lsc); -+ AuDbgWcntInc(rw); -+} -+ -+static inline void au_rw_read_lock(struct au_rwsem *rw) -+{ -+ down_read(&rw->rwsem); -+ AuDbgRcntInc(rw); -+} -+ -+static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) -+{ -+ down_read_nested(&rw->rwsem, lsc); -+ AuDbgRcntInc(rw); -+} -+ -+static inline void au_rw_read_unlock(struct au_rwsem *rw) -+{ -+ AuRwMustReadLock(rw); -+ AuDbgRcntDec(rw); -+ up_read(&rw->rwsem); -+} -+ 
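/*
 * Editor's note: an illustrative sketch, not part of the patch above.
 * It shows how the au_rwsem wrappers defined in this header are meant
 * to be paired: every au_rw_read_lock() must be matched by an
 * au_rw_read_unlock() on the same semaphore. Under CONFIG_AUFS_DEBUG
 * the rcnt/wcnt counters make misuse loud -- AuDbgRcntDec() WARN_ONs
 * when the reader count would go negative, and AuRwMustReadLock()
 * fires AuDebugOn() if the unlock runs without a read lock held.
 * The struct demo_obj and function demo_reader below are hypothetical
 * names introduced only for this example.
 */
struct demo_obj {
	struct au_rwsem lock;	/* must be set up with au_rw_init() */
	int value;
};

static int demo_reader(struct demo_obj *obj)
{
	int v;

	au_rw_read_lock(&obj->lock);	/* bumps rcnt in debug builds */
	v = obj->value;			/* read-side critical section */
	au_rw_read_unlock(&obj->lock);	/* checked decrement of rcnt */
	return v;
}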
-+static inline void au_rw_dgrade_lock(struct au_rwsem *rw) -+{ -+ AuRwMustWriteLock(rw); -+ AuDbgRcntInc(rw); -+ AuDbgWcntDec(rw); -+ downgrade_write(&rw->rwsem); -+} -+ -+static inline void au_rw_write_lock(struct au_rwsem *rw) -+{ -+ down_write(&rw->rwsem); -+ AuDbgWcntInc(rw); -+} -+ -+static inline void au_rw_write_lock_nested(struct au_rwsem *rw, -+ unsigned int lsc) -+{ -+ down_write_nested(&rw->rwsem, lsc); -+ AuDbgWcntInc(rw); -+} -+ -+static inline void au_rw_write_unlock(struct au_rwsem *rw) -+{ -+ AuRwMustWriteLock(rw); -+ AuDbgWcntDec(rw); -+ up_write(&rw->rwsem); -+} -+ -+/* why is not _nested version defined */ -+static inline int au_rw_read_trylock(struct au_rwsem *rw) -+{ -+ int ret = down_read_trylock(&rw->rwsem); -+ if (ret) -+ AuDbgRcntInc(rw); -+ return ret; -+} -+ -+static inline int au_rw_write_trylock(struct au_rwsem *rw) -+{ -+ int ret = down_write_trylock(&rw->rwsem); -+ if (ret) -+ AuDbgWcntInc(rw); -+ return ret; -+} -+ -+#undef AuDbgCntInit -+#undef AuDbgRcntInc -+#undef AuDbgRcntDec -+#undef AuDbgWcntInc -+#undef AuDbgWcntDec -+ -+#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ -+static inline void prefix##_read_lock(param) \ -+{ au_rw_read_lock(rwsem); } \ -+static inline void prefix##_write_lock(param) \ -+{ au_rw_write_lock(rwsem); } \ -+static inline int prefix##_read_trylock(param) \ -+{ return au_rw_read_trylock(rwsem); } \ -+static inline int prefix##_write_trylock(param) \ -+{ return au_rw_write_trylock(rwsem); } -+/* why is not _nested version defined */ -+/* static inline void prefix##_read_trylock_nested(param, lsc) -+{ au_rw_read_trylock_nested(rwsem, lsc)); } -+static inline void prefix##_write_trylock_nestd(param, lsc) -+{ au_rw_write_trylock_nested(rwsem, lsc); } */ -+ -+#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ -+static inline void prefix##_read_unlock(param) \ -+{ au_rw_read_unlock(rwsem); } \ -+static inline void prefix##_write_unlock(param) \ -+{ au_rw_write_unlock(rwsem); } \ -+static inline void prefix##_downgrade_lock(param) \ -+{ au_rw_dgrade_lock(rwsem); } -+ -+#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ -+ AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ -+ AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_RWSEM_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/sbinfo.c linux-2.6.31/fs/aufs/sbinfo.c ---- linux-2.6.31-vanilla/fs/aufs/sbinfo.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/sbinfo.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,208 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * superblock private data -+ */ -+ -+#include "aufs.h" -+ -+/* -+ * they are necessary regardless sysfs is disabled. 
-+ */ -+void au_si_free(struct kobject *kobj) -+{ -+ struct au_sbinfo *sbinfo; -+ struct super_block *sb; -+ -+ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); -+ AuDebugOn(!list_empty(&sbinfo->si_plink.head)); -+ -+ sb = sbinfo->si_sb; -+ si_write_lock(sb); -+ au_xino_clr(sb); -+ au_br_free(sbinfo); -+ kfree(sbinfo->si_branch); -+ mutex_destroy(&sbinfo->si_xib_mtx); -+ si_write_unlock(sb); -+ AuRwDestroy(&sbinfo->si_rwsem); -+ -+ kfree(sbinfo); -+} -+ -+int au_si_alloc(struct super_block *sb) -+{ -+ int err; -+ struct au_sbinfo *sbinfo; -+ -+ err = -ENOMEM; -+ sbinfo = kmalloc(sizeof(*sbinfo), GFP_NOFS); -+ if (unlikely(!sbinfo)) -+ goto out; -+ -+ /* will be reallocated separately */ -+ sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); -+ if (unlikely(!sbinfo->si_branch)) -+ goto out_sbinfo; -+ -+ memset(&sbinfo->si_kobj, 0, sizeof(sbinfo->si_kobj)); -+ err = sysaufs_si_init(sbinfo); -+ if (unlikely(err)) -+ goto out_br; -+ -+ au_nwt_init(&sbinfo->si_nowait); -+ au_rw_init_wlock(&sbinfo->si_rwsem); -+ sbinfo->si_generation = 0; -+ sbinfo->au_si_status = 0; -+ sbinfo->si_bend = -1; -+ sbinfo->si_last_br_id = 0; -+ -+ sbinfo->si_wbr_copyup = AuWbrCopyup_Def; -+ sbinfo->si_wbr_create = AuWbrCreate_Def; -+ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + AuWbrCopyup_Def; -+ sbinfo->si_wbr_create_ops = au_wbr_create_ops + AuWbrCreate_Def; -+ -+ sbinfo->si_mntflags = AuOpt_Def; -+ -+ sbinfo->si_xread = NULL; -+ sbinfo->si_xwrite = NULL; -+ sbinfo->si_xib = NULL; -+ mutex_init(&sbinfo->si_xib_mtx); -+ sbinfo->si_xib_buf = NULL; -+ sbinfo->si_xino_brid = -1; -+ /* leave si_xib_last_pindex and si_xib_next_bit */ -+ -+ sbinfo->si_rdcache = AUFS_RDCACHE_DEF * HZ; -+ sbinfo->si_rdblk = AUFS_RDBLK_DEF; -+ sbinfo->si_rdhash = AUFS_RDHASH_DEF; -+ sbinfo->si_dirwh = AUFS_DIRWH_DEF; -+ -+ au_spl_init(&sbinfo->si_plink); -+ init_waitqueue_head(&sbinfo->si_plink_wq); -+ -+ /* leave other members for sysaufs and si_mnt. */ -+ sbinfo->si_sb = sb; -+ sb->s_fs_info = sbinfo; -+ au_debug_sbinfo_init(sbinfo); -+ return 0; /* success */ -+ -+ out_br: -+ kfree(sbinfo->si_branch); -+ out_sbinfo: -+ kfree(sbinfo); -+ out: -+ return err; -+} -+ -+int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) -+{ -+ int err, sz; -+ struct au_branch **brp; -+ -+ AuRwMustWriteLock(&sbinfo->si_rwsem); -+ -+ err = -ENOMEM; -+ sz = sizeof(*brp) * (sbinfo->si_bend + 1); -+ if (unlikely(!sz)) -+ sz = sizeof(*brp); -+ brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); -+ if (brp) { -+ sbinfo->si_branch = brp; -+ err = 0; -+ } -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+unsigned int au_sigen_inc(struct super_block *sb) -+{ -+ unsigned int gen; -+ -+ SiMustWriteLock(sb); -+ -+ gen = ++au_sbi(sb)->si_generation; -+ au_update_digen(sb->s_root); -+ au_update_iigen(sb->s_root->d_inode); -+ sb->s_root->d_inode->i_version++; -+ return gen; -+} -+ -+aufs_bindex_t au_new_br_id(struct super_block *sb) -+{ -+ aufs_bindex_t br_id; -+ int i; -+ struct au_sbinfo *sbinfo; -+ -+ SiMustWriteLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ for (i = 0; i <= AUFS_BRANCH_MAX; i++) { -+ br_id = ++sbinfo->si_last_br_id; -+ if (br_id && au_br_index(sb, br_id) < 0) -+ return br_id; -+ } -+ -+ return -1; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* dentry and super_block lock. 
call at entry point */ -+void aufs_read_lock(struct dentry *dentry, int flags) -+{ -+ si_read_lock(dentry->d_sb, flags); -+ if (au_ftest_lock(flags, DW)) -+ di_write_lock_child(dentry); -+ else -+ di_read_lock_child(dentry, flags); -+} -+ -+void aufs_read_unlock(struct dentry *dentry, int flags) -+{ -+ if (au_ftest_lock(flags, DW)) -+ di_write_unlock(dentry); -+ else -+ di_read_unlock(dentry, flags); -+ si_read_unlock(dentry->d_sb); -+} -+ -+void aufs_write_lock(struct dentry *dentry) -+{ -+ si_write_lock(dentry->d_sb); -+ di_write_lock_child(dentry); -+} -+ -+void aufs_write_unlock(struct dentry *dentry) -+{ -+ di_write_unlock(dentry); -+ si_write_unlock(dentry->d_sb); -+} -+ -+void aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) -+{ -+ si_read_lock(d1->d_sb, flags); -+ di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIR)); -+} -+ -+void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) -+{ -+ di_write_unlock2(d1, d2); -+ si_read_unlock(d1->d_sb); -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/spl.h linux-2.6.31/fs/aufs/spl.h ---- linux-2.6.31-vanilla/fs/aufs/spl.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/spl.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,57 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * simple list protected by a spinlock -+ */ -+ -+#ifndef __AUFS_SPL_H__ -+#define __AUFS_SPL_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/spinlock.h> -+#include <linux/list.h> -+ -+struct au_splhead { -+ spinlock_t spin; -+ struct list_head head; -+}; -+ -+static inline void au_spl_init(struct au_splhead *spl) -+{ -+ spin_lock_init(&spl->spin); -+ INIT_LIST_HEAD(&spl->head); -+} -+ -+static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) -+{ -+ spin_lock(&spl->spin); -+ list_add(list, &spl->head); -+ spin_unlock(&spl->spin); -+} -+ -+static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) -+{ -+ spin_lock(&spl->spin); -+ list_del(list); -+ spin_unlock(&spl->spin); -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_SPL_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/super.c linux-2.6.31/fs/aufs/super.c ---- linux-2.6.31-vanilla/fs/aufs/super.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/super.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,874 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * mount and super_block operations -+ */ -+ -+#include <linux/buffer_head.h> -+#include <linux/module.h> -+#include <linux/seq_file.h> -+#include <linux/statfs.h> -+#include "aufs.h" -+ -+/* -+ * super_operations -+ */ -+static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused) -+{ -+ struct au_icntnr *c; -+ -+ c = au_cache_alloc_icntnr(); -+ if (c) { -+ inode_init_once(&c->vfs_inode); -+ c->vfs_inode.i_version = 1; /* sigen(sb); */ -+ c->iinfo.ii_hinode = NULL; -+ return &c->vfs_inode; -+ } -+ return NULL; -+} -+ -+static void aufs_destroy_inode(struct inode *inode) -+{ -+ au_iinfo_fin(inode); -+ au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); -+} -+ -+struct inode *au_iget_locked(struct super_block *sb, ino_t ino) -+{ -+ struct inode *inode; -+ int err; -+ -+ inode = iget_locked(sb, ino); -+ if (unlikely(!inode)) { -+ inode = ERR_PTR(-ENOMEM); -+ goto out; -+ } -+ if (!(inode->i_state & I_NEW)) -+ goto out; -+ -+ err = au_xigen_new(inode); -+ if (!err) -+ err = au_iinfo_init(inode); -+ if (!err) -+ inode->i_version++; -+ else { -+ iget_failed(inode); -+ inode = ERR_PTR(err); -+ } -+ -+ out: -+ /* never return NULL */ -+ AuDebugOn(!inode); -+ AuTraceErrPtr(inode); -+ return inode; -+} -+ -+/* lock free root dinfo */ -+static int au_show_brs(struct seq_file *seq, struct super_block *sb) -+{ -+ int err; -+ aufs_bindex_t bindex, bend; -+ struct path path; -+ struct au_hdentry *hd; -+ struct au_branch *br; -+ -+ err = 0; -+ bend = au_sbend(sb); -+ hd = au_di(sb->s_root)->di_hdentry; -+ for (bindex = 0; !err && bindex <= bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ path.mnt = br->br_mnt; -+ path.dentry = hd[bindex].hd_dentry; -+ err = au_seq_path(seq, &path); -+ if (err > 0) -+ err = seq_printf(seq, "=%s", -+ au_optstr_br_perm(br->br_perm)); -+ if (!err && bindex != bend) -+ err = seq_putc(seq, ':'); -+ } -+ -+ return err; -+} -+ -+static void au_show_wbr_create(struct seq_file *m, int v, -+ struct au_sbinfo *sbinfo) -+{ -+ const char *pat; -+ -+ AuRwMustAnyLock(&sbinfo->si_rwsem); -+ -+ seq_printf(m, ",create="); -+ pat = au_optstr_wbr_create(v); -+ switch (v) { -+ case AuWbrCreate_TDP: -+ case AuWbrCreate_RR: -+ case AuWbrCreate_MFS: -+ case AuWbrCreate_PMFS: -+ seq_printf(m, pat); -+ break; -+ case AuWbrCreate_MFSV: -+ seq_printf(m, /*pat*/"mfs:%lu", -+ sbinfo->si_wbr_mfs.mfs_expire / HZ); -+ break; -+ case AuWbrCreate_PMFSV: -+ seq_printf(m, /*pat*/"pmfs:%lu", -+ sbinfo->si_wbr_mfs.mfs_expire / HZ); -+ break; -+ case AuWbrCreate_MFSRR: -+ seq_printf(m, /*pat*/"mfsrr:%llu", -+ sbinfo->si_wbr_mfs.mfsrr_watermark); -+ break; -+ case AuWbrCreate_MFSRRV: -+ seq_printf(m, /*pat*/"mfsrr:%llu:%lu", -+ sbinfo->si_wbr_mfs.mfsrr_watermark, -+ sbinfo->si_wbr_mfs.mfs_expire / HZ); -+ break; -+ } -+} -+ -+static int au_show_xino(struct seq_file *seq, struct vfsmount *mnt) -+{ -+#ifdef CONFIG_SYSFS -+ return 0; -+#else -+ int err; -+ const int len = sizeof(AUFS_XINO_FNAME) - 1; -+ aufs_bindex_t bindex, brid; -+ struct super_block *sb; -+ struct qstr *name; -+ struct file 
*f; -+ struct dentry *d, *h_root; -+ -+ AuRwMustAnyLock(&sbinfo->si_rwsem); -+ -+ err = 0; -+ sb = mnt->mnt_sb; -+ f = au_sbi(sb)->si_xib; -+ if (!f) -+ goto out; -+ -+ /* stop printing the default xino path on the first writable branch */ -+ h_root = NULL; -+ brid = au_xino_brid(sb); -+ if (brid >= 0) { -+ bindex = au_br_index(sb, brid); -+ h_root = au_di(sb->s_root)->di_hdentry[0 + bindex].hd_dentry; -+ } -+ d = f->f_dentry; -+ name = &d->d_name; -+ /* safe ->d_parent because the file is unlinked */ -+ if (d->d_parent == h_root -+ && name->len == len -+ && !memcmp(name->name, AUFS_XINO_FNAME, len)) -+ goto out; -+ -+ seq_puts(seq, ",xino="); -+ err = au_xino_path(seq, f); -+ -+ out: -+ return err; -+#endif -+} -+ -+/* seq_file will re-call me in case of too long string */ -+static int aufs_show_options(struct seq_file *m, struct vfsmount *mnt) -+{ -+ int err, n; -+ unsigned int mnt_flags, v; -+ struct super_block *sb; -+ struct au_sbinfo *sbinfo; -+ -+#define AuBool(name, str) do { \ -+ v = au_opt_test(mnt_flags, name); \ -+ if (v != au_opt_test(AuOpt_Def, name)) \ -+ seq_printf(m, ",%s" #str, v ? "" : "no"); \ -+} while (0) -+ -+#define AuStr(name, str) do { \ -+ v = mnt_flags & AuOptMask_##name; \ -+ if (v != (AuOpt_Def & AuOptMask_##name)) \ -+ seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ -+} while (0) -+ -+#define AuUInt(name, str, val) do { \ -+ if (val != AUFS_##name##_DEF) \ -+ seq_printf(m, "," #str "=%u", val); \ -+} while (0) -+ -+ /* lock free root dinfo */ -+ sb = mnt->mnt_sb; -+ si_noflush_read_lock(sb); -+ sbinfo = au_sbi(sb); -+ seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); -+ -+ mnt_flags = au_mntflags(sb); -+ if (au_opt_test(mnt_flags, XINO)) { -+ err = au_show_xino(m, mnt); -+ if (unlikely(err)) -+ goto out; -+ } else -+ seq_puts(m, ",noxino"); -+ -+ AuBool(TRUNC_XINO, trunc_xino); -+ AuStr(UDBA, udba); -+ AuBool(SHWH, shwh); -+ AuBool(PLINK, plink); -+ /* AuBool(DIRPERM1, dirperm1); */ -+ /* AuBool(REFROF, refrof); */ -+ -+ v = sbinfo->si_wbr_create; -+ if (v != AuWbrCreate_Def) -+ au_show_wbr_create(m, v, sbinfo); -+ -+ v = sbinfo->si_wbr_copyup; -+ if (v != AuWbrCopyup_Def) -+ seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); -+ -+ v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); -+ if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) -+ seq_printf(m, ",diropq=%c", v ? 
'a' : 'w'); -+ -+ AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); -+ -+ n = sbinfo->si_rdcache / HZ; -+ AuUInt(RDCACHE, rdcache, n); -+ -+ AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); -+ AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); -+ -+ AuBool(SUM, sum); -+ /* AuBool(SUM_W, wsum); */ -+ AuBool(WARN_PERM, warn_perm); -+ AuBool(VERBOSE, verbose); -+ -+ out: -+ /* be sure to print "br:" last */ -+ if (!sysaufs_brs) { -+ seq_puts(m, ",br:"); -+ au_show_brs(m, sb); -+ } -+ si_read_unlock(sb); -+ return 0; -+ -+#undef Deleted -+#undef AuBool -+#undef AuStr -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* sum mode which returns the summation for statfs(2) */ -+ -+static u64 au_add_till_max(u64 a, u64 b) -+{ -+ u64 old; -+ -+ old = a; -+ a += b; -+ if (old < a) -+ return a; -+ return ULLONG_MAX; -+} -+ -+static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) -+{ -+ int err; -+ u64 blocks, bfree, bavail, files, ffree; -+ aufs_bindex_t bend, bindex, i; -+ unsigned char shared; -+ struct vfsmount *h_mnt; -+ struct super_block *h_sb; -+ -+ blocks = 0; -+ bfree = 0; -+ bavail = 0; -+ files = 0; -+ ffree = 0; -+ -+ err = 0; -+ bend = au_sbend(sb); -+ for (bindex = bend; bindex >= 0; bindex--) { -+ h_mnt = au_sbr_mnt(sb, bindex); -+ h_sb = h_mnt->mnt_sb; -+ shared = 0; -+ for (i = bindex + 1; !shared && i <= bend; i++) -+ shared = (au_sbr_sb(sb, i) == h_sb); -+ if (shared) -+ continue; -+ -+ /* sb->s_root for NFS is unreliable */ -+ err = vfs_statfs(h_mnt->mnt_root, buf); -+ if (unlikely(err)) -+ goto out; -+ -+ blocks = au_add_till_max(blocks, buf->f_blocks); -+ bfree = au_add_till_max(bfree, buf->f_bfree); -+ bavail = au_add_till_max(bavail, buf->f_bavail); -+ files = au_add_till_max(files, buf->f_files); -+ ffree = au_add_till_max(ffree, buf->f_ffree); -+ } -+ -+ buf->f_blocks = blocks; -+ buf->f_bfree = bfree; -+ buf->f_bavail = bavail; -+ buf->f_files = files; -+ buf->f_ffree = ffree; -+ -+ out: -+ return err; -+} -+ -+static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) -+{ -+ int err; -+ struct super_block *sb; -+ -+ /* lock free root dinfo */ -+ sb = dentry->d_sb; -+ si_noflush_read_lock(sb); -+ if (!au_opt_test(au_mntflags(sb), SUM)) -+ /* sb->s_root for NFS is unreliable */ -+ err = vfs_statfs(au_sbr_mnt(sb, 0)->mnt_root, buf); -+ else -+ err = au_statfs_sum(sb, buf); -+ si_read_unlock(sb); -+ -+ if (!err) { -+ buf->f_type = AUFS_SUPER_MAGIC; -+ buf->f_namelen -= AUFS_WH_PFX_LEN; -+ memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); -+ } -+ /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* try flushing the lower fs at aufs remount/unmount time */ -+ -+static void au_fsync_br(struct super_block *sb) -+{ -+ aufs_bindex_t bend, bindex; -+ int brperm; -+ struct au_branch *br; -+ struct super_block *h_sb; -+ -+ bend = au_sbend(sb); -+ for (bindex = 0; bindex < bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ brperm = br->br_perm; -+ if (brperm == AuBrPerm_RR || brperm == AuBrPerm_RRWH) -+ continue; -+ h_sb = br->br_mnt->mnt_sb; -+ if (bdev_read_only(h_sb->s_bdev)) -+ continue; -+ -+ lockdep_off(); -+ down_write(&h_sb->s_umount); -+ shrink_dcache_sb(h_sb); -+ sync_filesystem(h_sb); -+ up_write(&h_sb->s_umount); -+ lockdep_on(); -+ } -+} -+ -+/* -+ * this IS NOT for super_operations. -+ * I guess it will be reverted someday. 
-+ */ -+static void aufs_umount_begin(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ -+ sbinfo = au_sbi(sb); -+ if (!sbinfo) -+ return; -+ -+ si_write_lock(sb); -+ au_fsync_br(sb); -+ if (au_opt_test(au_mntflags(sb), PLINK)) -+ au_plink_put(sb); -+ if (sbinfo->si_wbr_create_ops->fin) -+ sbinfo->si_wbr_create_ops->fin(sb); -+ si_write_unlock(sb); -+} -+ -+/* final actions when unmounting a file system */ -+static void aufs_put_super(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ -+ sbinfo = au_sbi(sb); -+ if (!sbinfo) -+ return; -+ -+ aufs_umount_begin(sb); -+ dbgaufs_si_fin(sbinfo); -+ kobject_put(&sbinfo->si_kobj); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * refresh dentry and inode at remount time. -+ */ -+static int do_refresh(struct dentry *dentry, mode_t type, -+ unsigned int dir_flags) -+{ -+ int err; -+ struct dentry *parent; -+ -+ di_write_lock_child(dentry); -+ parent = dget_parent(dentry); -+ di_read_lock_parent(parent, AuLock_IR); -+ -+ /* returns the number of positive dentries */ -+ err = au_refresh_hdentry(dentry, type); -+ if (err >= 0) { -+ struct inode *inode = dentry->d_inode; -+ err = au_refresh_hinode(inode, dentry); -+ if (!err && type == S_IFDIR) -+ au_reset_hinotify(inode, dir_flags); -+ } -+ if (unlikely(err)) -+ AuErr("unrecoverable error %d, %.*s\n", err, AuDLNPair(dentry)); -+ -+ di_read_unlock(parent, AuLock_IR); -+ dput(parent); -+ di_write_unlock(dentry); -+ -+ return err; -+} -+ -+static int test_dir(struct dentry *dentry, void *arg __maybe_unused) -+{ -+ return S_ISDIR(dentry->d_inode->i_mode); -+} -+ -+/* gave up consolidating with refresh_nondir() */ -+static int refresh_dir(struct dentry *root, unsigned int sigen) -+{ -+ int err, i, j, ndentry, e; -+ struct au_dcsub_pages dpages; -+ struct au_dpage *dpage; -+ struct dentry **dentries; -+ struct inode *inode; -+ const unsigned int flags = au_hi_flags(root->d_inode, /*isdir*/1); -+ -+ err = 0; -+ list_for_each_entry(inode, &root->d_sb->s_inodes, i_sb_list) -+ if (S_ISDIR(inode->i_mode) && au_iigen(inode) != sigen) { -+ ii_write_lock_child(inode); -+ e = au_refresh_hinode_self(inode, /*do_attr*/1); -+ ii_write_unlock(inode); -+ if (unlikely(e)) { -+ AuDbg("e %d, i%lu\n", e, inode->i_ino); -+ if (!err) -+ err = e; -+ /* go on even if err */ -+ } -+ } -+ -+ e = au_dpages_init(&dpages, GFP_NOFS); -+ if (unlikely(e)) { -+ if (!err) -+ err = e; -+ goto out; -+ } -+ e = au_dcsub_pages(&dpages, root, test_dir, NULL); -+ if (unlikely(e)) { -+ if (!err) -+ err = e; -+ goto out_dpages; -+ } -+ -+ for (i = 0; !e && i < dpages.ndpage; i++) { -+ dpage = dpages.dpages + i; -+ dentries = dpage->dentries; -+ ndentry = dpage->ndentry; -+ for (j = 0; !e && j < ndentry; j++) { -+ struct dentry *d; -+ -+ d = dentries[j]; -+ au_dbg_verify_dir_parent(d, sigen); -+ if (au_digen(d) != sigen) { -+ e = do_refresh(d, S_IFDIR, flags); -+ if (unlikely(e && !err)) -+ err = e; -+ /* break on err */ -+ } -+ } -+ } -+ -+ out_dpages: -+ au_dpages_free(&dpages); -+ out: -+ return err; -+} -+ -+static int test_nondir(struct dentry *dentry, void *arg __maybe_unused) -+{ -+ return !S_ISDIR(dentry->d_inode->i_mode); -+} -+ -+static int refresh_nondir(struct dentry *root, unsigned int sigen, -+ int do_dentry) -+{ -+ int err, i, j, ndentry, e; -+ struct au_dcsub_pages dpages; -+ struct au_dpage *dpage; -+ struct dentry **dentries; -+ struct inode *inode; -+ -+ err = 0; -+ list_for_each_entry(inode, &root->d_sb->s_inodes, i_sb_list) -+ if (!S_ISDIR(inode->i_mode) && 
au_iigen(inode) != sigen) { -+ ii_write_lock_child(inode); -+ e = au_refresh_hinode_self(inode, /*do_attr*/1); -+ ii_write_unlock(inode); -+ if (unlikely(e)) { -+ AuDbg("e %d, i%lu\n", e, inode->i_ino); -+ if (!err) -+ err = e; -+ /* go on even if err */ -+ } -+ } -+ -+ if (!do_dentry) -+ goto out; -+ -+ e = au_dpages_init(&dpages, GFP_NOFS); -+ if (unlikely(e)) { -+ if (!err) -+ err = e; -+ goto out; -+ } -+ e = au_dcsub_pages(&dpages, root, test_nondir, NULL); -+ if (unlikely(e)) { -+ if (!err) -+ err = e; -+ goto out_dpages; -+ } -+ -+ for (i = 0; i < dpages.ndpage; i++) { -+ dpage = dpages.dpages + i; -+ dentries = dpage->dentries; -+ ndentry = dpage->ndentry; -+ for (j = 0; j < ndentry; j++) { -+ struct dentry *d; -+ -+ d = dentries[j]; -+ au_dbg_verify_nondir_parent(d, sigen); -+ inode = d->d_inode; -+ if (inode && au_digen(d) != sigen) { -+ e = do_refresh(d, inode->i_mode & S_IFMT, -+ /*dir_flags*/0); -+ if (unlikely(e && !err)) -+ err = e; -+ /* go on even err */ -+ } -+ } -+ } -+ -+ out_dpages: -+ au_dpages_free(&dpages); -+ out: -+ return err; -+} -+ -+static void au_remount_refresh(struct super_block *sb, unsigned int flags) -+{ -+ int err; -+ unsigned int sigen; -+ struct au_sbinfo *sbinfo; -+ struct dentry *root; -+ struct inode *inode; -+ -+ au_sigen_inc(sb); -+ sigen = au_sigen(sb); -+ sbinfo = au_sbi(sb); -+ au_fclr_si(sbinfo, FAILED_REFRESH_DIRS); -+ -+ root = sb->s_root; -+ DiMustNoWaiters(root); -+ inode = root->d_inode; -+ IiMustNoWaiters(inode); -+ au_reset_hinotify(inode, au_hi_flags(inode, /*isdir*/1)); -+ di_write_unlock(root); -+ -+ err = refresh_dir(root, sigen); -+ if (unlikely(err)) { -+ au_fset_si(sbinfo, FAILED_REFRESH_DIRS); -+ AuWarn("Refreshing directories failed, ignored (%d)\n", err); -+ } -+ -+ if (au_ftest_opts(flags, REFRESH_NONDIR)) { -+ err = refresh_nondir(root, sigen, !err); -+ if (unlikely(err)) -+ AuWarn("Refreshing non-directories failed, ignored" -+ "(%d)\n", err); -+ } -+ -+ /* aufs_write_lock() calls ..._child() */ -+ di_write_lock_child(root); -+ au_cpup_attr_all(root->d_inode, /*force*/1); -+} -+ -+/* stop extra interpretation of errno in mount(8), and strange error messages */ -+static int cvt_err(int err) -+{ -+ AuTraceErr(err); -+ -+ switch (err) { -+ case -ENOENT: -+ case -ENOTDIR: -+ case -EEXIST: -+ case -EIO: -+ err = -EINVAL; -+ } -+ return err; -+} -+ -+static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) -+{ -+ int err; -+ struct au_opts opts; -+ struct dentry *root; -+ struct inode *inode; -+ struct au_sbinfo *sbinfo; -+ -+ err = 0; -+ root = sb->s_root; -+ if (!data || !*data) { -+ aufs_write_lock(root); -+ err = au_opts_verify(sb, *flags, /*pending*/0); -+ if (!err) -+ au_fsync_br(sb); -+ aufs_write_unlock(root); -+ goto out; -+ } -+ -+ err = -ENOMEM; -+ memset(&opts, 0, sizeof(opts)); -+ opts.opt = (void *)__get_free_page(GFP_NOFS); -+ if (unlikely(!opts.opt)) -+ goto out; -+ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); -+ opts.flags = AuOpts_REMOUNT; -+ opts.sb_flags = *flags; -+ -+ /* parse it before aufs lock */ -+ err = au_opts_parse(sb, data, &opts); -+ if (unlikely(err)) -+ goto out_opts; -+ -+ sbinfo = au_sbi(sb); -+ inode = root->d_inode; -+ mutex_lock(&inode->i_mutex); -+ aufs_write_lock(root); -+ au_fsync_br(sb); -+ -+ /* au_opts_remount() may return an error */ -+ err = au_opts_remount(sb, &opts); -+ au_opts_free(&opts); -+ -+ if (au_ftest_opts(opts.flags, REFRESH_DIR) -+ || au_ftest_opts(opts.flags, REFRESH_NONDIR)) -+ au_remount_refresh(sb, opts.flags); -+ -+ aufs_write_unlock(root); -+ 
mutex_unlock(&inode->i_mutex); -+ -+ out_opts: -+ free_page((unsigned long)opts.opt); -+ out: -+ err = cvt_err(err); -+ AuTraceErr(err); -+ return err; -+} -+ -+static struct super_operations aufs_sop = { -+ .alloc_inode = aufs_alloc_inode, -+ .destroy_inode = aufs_destroy_inode, -+ .drop_inode = generic_delete_inode, -+ .show_options = aufs_show_options, -+ .statfs = aufs_statfs, -+ .put_super = aufs_put_super, -+ .remount_fs = aufs_remount_fs -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int alloc_root(struct super_block *sb) -+{ -+ int err; -+ struct inode *inode; -+ struct dentry *root; -+ -+ err = -ENOMEM; -+ inode = au_iget_locked(sb, AUFS_ROOT_INO); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out; -+ -+ inode->i_op = &aufs_dir_iop; -+ inode->i_fop = &aufs_dir_fop; -+ inode->i_mode = S_IFDIR; -+ inode->i_nlink = 2; -+ unlock_new_inode(inode); -+ -+ root = d_alloc_root(inode); -+ if (unlikely(!root)) -+ goto out_iput; -+ err = PTR_ERR(root); -+ if (IS_ERR(root)) -+ goto out_iput; -+ -+ err = au_alloc_dinfo(root); -+ if (!err) { -+ sb->s_root = root; -+ return 0; /* success */ -+ } -+ dput(root); -+ goto out; /* do not iput */ -+ -+ out_iput: -+ iget_failed(inode); -+ iput(inode); -+ out: -+ return err; -+ -+} -+ -+static int aufs_fill_super(struct super_block *sb, void *raw_data, -+ int silent __maybe_unused) -+{ -+ int err; -+ struct au_opts opts; -+ struct dentry *root; -+ struct inode *inode; -+ char *arg = raw_data; -+ -+ if (unlikely(!arg || !*arg)) { -+ err = -EINVAL; -+ AuErr("no arg\n"); -+ goto out; -+ } -+ -+ err = -ENOMEM; -+ memset(&opts, 0, sizeof(opts)); -+ opts.opt = (void *)__get_free_page(GFP_NOFS); -+ if (unlikely(!opts.opt)) -+ goto out; -+ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); -+ opts.sb_flags = sb->s_flags; -+ -+ err = au_si_alloc(sb); -+ if (unlikely(err)) -+ goto out_opts; -+ -+ /* all timestamps always follow the ones on the branch */ -+ sb->s_flags |= MS_NOATIME | MS_NODIRATIME; -+ sb->s_op = &aufs_sop; -+ sb->s_magic = AUFS_SUPER_MAGIC; -+ sb->s_maxbytes = 0; -+ au_export_init(sb); -+ -+ err = alloc_root(sb); -+ if (unlikely(err)) { -+ si_write_unlock(sb); -+ goto out_info; -+ } -+ root = sb->s_root; -+ inode = root->d_inode; -+ -+ /* -+ * actually we can parse options regardless aufs lock here. -+ * but at remount time, parsing must be done before aufs lock. -+ * so we follow the same rule. -+ */ -+ ii_write_lock_parent(inode); -+ aufs_write_unlock(root); -+ err = au_opts_parse(sb, arg, &opts); -+ if (unlikely(err)) -+ goto out_root; -+ -+ /* lock vfs_inode first, then aufs. 
*/ -+ mutex_lock(&inode->i_mutex); -+ inode->i_op = &aufs_dir_iop; -+ inode->i_fop = &aufs_dir_fop; -+ aufs_write_lock(root); -+ err = au_opts_mount(sb, &opts); -+ au_opts_free(&opts); -+ if (unlikely(err)) -+ goto out_unlock; -+ aufs_write_unlock(root); -+ mutex_unlock(&inode->i_mutex); -+ goto out_opts; /* success */ -+ -+ out_unlock: -+ aufs_write_unlock(root); -+ mutex_unlock(&inode->i_mutex); -+ out_root: -+ dput(root); -+ sb->s_root = NULL; -+ out_info: -+ kobject_put(&au_sbi(sb)->si_kobj); -+ sb->s_fs_info = NULL; -+ out_opts: -+ free_page((unsigned long)opts.opt); -+ out: -+ AuTraceErr(err); -+ err = cvt_err(err); -+ AuTraceErr(err); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int aufs_get_sb(struct file_system_type *fs_type, int flags, -+ const char *dev_name __maybe_unused, void *raw_data, -+ struct vfsmount *mnt) -+{ -+ int err; -+ struct super_block *sb; -+ -+ /* all timestamps always follow the ones on the branch */ -+ /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ -+ err = get_sb_nodev(fs_type, flags, raw_data, aufs_fill_super, mnt); -+ if (!err) { -+ sb = mnt->mnt_sb; -+ si_write_lock(sb); -+ sysaufs_brs_add(sb, 0); -+ si_write_unlock(sb); -+ } -+ return err; -+} -+ -+struct file_system_type aufs_fs_type = { -+ .name = AUFS_FSTYPE, -+ .fs_flags = -+ FS_RENAME_DOES_D_MOVE /* a race between rename and others */ -+ | FS_REVAL_DOT, /* for NFS branch and udba */ -+ .get_sb = aufs_get_sb, -+ .kill_sb = generic_shutdown_super, -+ /* no need to __module_get() and module_put(). */ -+ .owner = THIS_MODULE, -+}; -diff -Nur linux-2.6.31-vanilla/fs/aufs/super.h linux-2.6.31/fs/aufs/super.h ---- linux-2.6.31-vanilla/fs/aufs/super.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/super.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,384 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * super_block operations -+ */ -+ -+#ifndef __AUFS_SUPER_H__ -+#define __AUFS_SUPER_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/fs.h> -+#include <linux/aufs_type.h> -+#include "rwsem.h" -+#include "spl.h" -+#include "wkq.h" -+ -+typedef ssize_t (*au_readf_t)(struct file *, char __user *, size_t, loff_t *); -+typedef ssize_t (*au_writef_t)(struct file *, const char __user *, size_t, -+ loff_t *); -+ -+/* policies to select one among multiple writable branches */ -+struct au_wbr_copyup_operations { -+ int (*copyup)(struct dentry *dentry); -+}; -+ -+struct au_wbr_create_operations { -+ int (*create)(struct dentry *dentry, int isdir); -+ int (*init)(struct super_block *sb); -+ int (*fin)(struct super_block *sb); -+}; -+ -+struct au_wbr_mfs { -+ struct mutex mfs_lock; /* protect this structure */ -+ unsigned long mfs_jiffy; -+ unsigned long mfs_expire; -+ aufs_bindex_t mfs_bindex; -+ -+ unsigned long long mfsrr_bytes; -+ unsigned long long mfsrr_watermark; -+}; -+ -+struct au_branch; -+struct au_sbinfo { -+ /* nowait tasks in the system-wide workqueue */ -+ struct au_nowait_tasks si_nowait; -+ -+ struct au_rwsem si_rwsem; -+ -+ /* branch management */ -+ unsigned int si_generation; -+ -+ /* see above flags */ -+ unsigned char au_si_status; -+ -+ aufs_bindex_t si_bend; -+ aufs_bindex_t si_last_br_id; -+ struct au_branch **si_branch; -+ -+ /* policy to select a writable branch */ -+ unsigned char si_wbr_copyup; -+ unsigned char si_wbr_create; -+ struct au_wbr_copyup_operations *si_wbr_copyup_ops; -+ struct au_wbr_create_operations *si_wbr_create_ops; -+ -+ /* round robin */ -+ atomic_t si_wbr_rr_next; -+ -+ /* most free space */ -+ struct au_wbr_mfs si_wbr_mfs; -+ -+ /* mount flags */ -+ /* include/asm-ia64/siginfo.h defines a macro named si_flags */ -+ unsigned int si_mntflags; -+ -+ /* external inode number (bitmap and translation table) */ -+ au_readf_t si_xread; -+ au_writef_t si_xwrite; -+ struct file *si_xib; -+ struct mutex si_xib_mtx; /* protect xib members */ -+ unsigned long *si_xib_buf; -+ unsigned long si_xib_last_pindex; -+ int si_xib_next_bit; -+ aufs_bindex_t si_xino_brid; -+ /* reserved for future use */ -+ /* unsigned long long si_xib_limit; */ /* Max xib file size */ -+ -+#ifdef CONFIG_AUFS_EXPORT -+ /* i_generation */ -+ struct file *si_xigen; -+ atomic_t si_xigen_next; -+#endif -+ -+ /* vdir parameters */ -+ unsigned long si_rdcache; /* max cache time in HZ */ -+ unsigned int si_rdblk; /* deblk size */ -+ unsigned int si_rdhash; /* hash size */ -+ -+ /* -+ * If the number of whiteouts is larger than si_dirwh, leave all of -+ * them after au_whtmp_ren to reduce the cost of rmdir(2). -+ * a future fsck.aufs or kernel thread will remove them later. -+ * Otherwise, remove all whiteouts and the dir in rmdir(2). -+ */ -+ unsigned int si_dirwh; -+ -+ /* -+ * rename(2) a directory with all children. -+ */ -+ /* reserved for future use */ -+ /* int si_rendir; */ -+ -+ /* pseudo_link list */ -+ struct au_splhead si_plink; -+ wait_queue_head_t si_plink_wq; -+ -+ /* -+ * sysfs and lifetime management. -+ * this is not a small structure and it may be a waste of memory in case -+ * sysfs is disabled, particularly when many aufs-es are mounted. -+ * but using sysfs is the majority case. 
-+ */ -+ struct kobject si_kobj; -+#ifdef CONFIG_DEBUG_FS -+ struct dentry *si_dbgaufs, *si_dbgaufs_xib; -+#ifdef CONFIG_AUFS_EXPORT -+ struct dentry *si_dbgaufs_xigen; -+#endif -+#endif -+ -+ /* dirty, necessary for unmounting, sysfs and sysrq */ -+ struct super_block *si_sb; -+}; -+ -+/* sbinfo status flags */ -+/* -+ * set true when refresh_dirs() failed at remount time. -+ * then try refreshing dirs at access time again. -+ * if it is false, refreshing dirs at access time is unnecessary -+ */ -+#define AuSi_FAILED_REFRESH_DIRS 1 -+#define AuSi_MAINTAIN_PLINK (1 << 1) /* ioctl */ -+static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi, -+ unsigned int flag) -+{ -+ AuRwMustAnyLock(&sbi->si_rwsem); -+ return sbi->au_si_status & flag; -+} -+#define au_ftest_si(sbinfo, name) au_do_ftest_si(sbinfo, AuSi_##name) -+#define au_fset_si(sbinfo, name) do { \ -+ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ -+ (sbinfo)->au_si_status |= AuSi_##name; \ -+} while (0) -+#define au_fclr_si(sbinfo, name) do { \ -+ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ -+ (sbinfo)->au_si_status &= ~AuSi_##name; \ -+} while (0) -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* policy to select one among writable branches */ -+#define AuWbrCopyup(sbinfo, args...) \ -+ ((sbinfo)->si_wbr_copyup_ops->copyup(args)) -+#define AuWbrCreate(sbinfo, args...) \ -+ ((sbinfo)->si_wbr_create_ops->create(args)) -+ -+/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ -+#define AuLock_DW 1 /* write-lock dentry */ -+#define AuLock_IR (1 << 1) /* read-lock inode */ -+#define AuLock_IW (1 << 2) /* write-lock inode */ -+#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ -+#define AuLock_DIR (1 << 4) /* target is a dir */ -+#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) -+#define au_fset_lock(flags, name) { (flags) |= AuLock_##name; } -+#define au_fclr_lock(flags, name) { (flags) &= ~AuLock_##name; } -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* super.c */ -+extern struct file_system_type aufs_fs_type; -+struct inode *au_iget_locked(struct super_block *sb, ino_t ino); -+ -+/* sbinfo.c */ -+void au_si_free(struct kobject *kobj); -+int au_si_alloc(struct super_block *sb); -+int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); -+ -+unsigned int au_sigen_inc(struct super_block *sb); -+aufs_bindex_t au_new_br_id(struct super_block *sb); -+ -+void aufs_read_lock(struct dentry *dentry, int flags); -+void aufs_read_unlock(struct dentry *dentry, int flags); -+void aufs_write_lock(struct dentry *dentry); -+void aufs_write_unlock(struct dentry *dentry); -+void aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int isdir); -+void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); -+ -+/* wbr_policy.c */ -+extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; -+extern struct au_wbr_create_operations au_wbr_create_ops[]; -+int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline struct au_sbinfo *au_sbi(struct super_block *sb) -+{ -+ return sb->s_fs_info; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_AUFS_EXPORT -+void au_export_init(struct super_block *sb); -+ -+static inline int au_test_nfsd(struct task_struct *tsk) -+{ -+ return !tsk->mm && !strcmp(tsk->comm, "nfsd"); -+} -+ -+int au_xigen_inc(struct inode *inode); -+int 
au_xigen_new(struct inode *inode); -+int au_xigen_set(struct super_block *sb, struct file *base); -+void au_xigen_clr(struct super_block *sb); -+ -+static inline int au_busy_or_stale(void) -+{ -+ if (!au_test_nfsd(current)) -+ return -EBUSY; -+ return -ESTALE; -+} -+#else -+static inline void au_export_init(struct super_block *sb) -+{ -+ /* nothing */ -+} -+ -+static inline int au_test_nfsd(struct task_struct *tsk) -+{ -+ return 0; -+} -+ -+static inline int au_xigen_inc(struct inode *inode) -+{ -+ return 0; -+} -+ -+static inline int au_xigen_new(struct inode *inode) -+{ -+ return 0; -+} -+ -+static inline int au_xigen_set(struct super_block *sb, struct file *base) -+{ -+ return 0; -+} -+ -+static inline void au_xigen_clr(struct super_block *sb) -+{ -+ /* empty */ -+} -+ -+static inline int au_busy_or_stale(void) -+{ -+ return -EBUSY; -+} -+#endif /* CONFIG_AUFS_EXPORT */ -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo) -+{ -+ /* -+ * This function is a dynamic '__init' function actually, -+ * so the tiny check for si_rwsem is unnecessary. -+ */ -+ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ -+#ifdef CONFIG_DEBUG_FS -+ sbinfo->si_dbgaufs = NULL; -+ sbinfo->si_dbgaufs_xib = NULL; -+#ifdef CONFIG_AUFS_EXPORT -+ sbinfo->si_dbgaufs_xigen = NULL; -+#endif -+#endif -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* lock superblock. mainly for entry point functions */ -+/* -+ * si_noflush_read_lock, si_noflush_write_lock, -+ * si_read_unlock, si_write_unlock, si_downgrade_lock -+ */ -+AuSimpleLockRwsemFuncs(si_noflush, struct super_block *sb, -+ &au_sbi(sb)->si_rwsem); -+AuSimpleUnlockRwsemFuncs(si, struct super_block *sb, &au_sbi(sb)->si_rwsem); -+ -+#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) -+#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) -+#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) -+ -+static inline void si_read_lock(struct super_block *sb, int flags) -+{ -+ if (au_ftest_lock(flags, FLUSH)) -+ au_nwt_flush(&au_sbi(sb)->si_nowait); -+ si_noflush_read_lock(sb); -+} -+ -+static inline void si_write_lock(struct super_block *sb) -+{ -+ au_nwt_flush(&au_sbi(sb)->si_nowait); -+ si_noflush_write_lock(sb); -+} -+ -+static inline int si_read_trylock(struct super_block *sb, int flags) -+{ -+ if (au_ftest_lock(flags, FLUSH)) -+ au_nwt_flush(&au_sbi(sb)->si_nowait); -+ return si_noflush_read_trylock(sb); -+} -+ -+static inline int si_write_trylock(struct super_block *sb, int flags) -+{ -+ if (au_ftest_lock(flags, FLUSH)) -+ au_nwt_flush(&au_sbi(sb)->si_nowait); -+ return si_noflush_write_trylock(sb); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline aufs_bindex_t au_sbend(struct super_block *sb) -+{ -+ SiMustAnyLock(sb); -+ return au_sbi(sb)->si_bend; -+} -+ -+static inline unsigned int au_mntflags(struct super_block *sb) -+{ -+ SiMustAnyLock(sb); -+ return au_sbi(sb)->si_mntflags; -+} -+ -+static inline unsigned int au_sigen(struct super_block *sb) -+{ -+ SiMustAnyLock(sb); -+ return au_sbi(sb)->si_generation; -+} -+ -+static inline struct au_branch *au_sbr(struct super_block *sb, -+ aufs_bindex_t bindex) -+{ -+ SiMustAnyLock(sb); -+ return au_sbi(sb)->si_branch[0 + bindex]; -+} -+ -+static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) -+{ -+ SiMustWriteLock(sb); -+ au_sbi(sb)->si_xino_brid = brid; -+} -+ -+static 
inline aufs_bindex_t au_xino_brid(struct super_block *sb) -+{ -+ SiMustAnyLock(sb); -+ return au_sbi(sb)->si_xino_brid; -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_SUPER_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/sysaufs.c linux-2.6.31/fs/aufs/sysaufs.c ---- linux-2.6.31-vanilla/fs/aufs/sysaufs.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/sysaufs.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,104 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * sysfs interface and lifetime management -+ * they are necessary regardless of whether sysfs is disabled. -+ */ -+ -+#include <linux/fs.h> -+#include <linux/random.h> -+#include <linux/sysfs.h> -+#include "aufs.h" -+ -+unsigned long sysaufs_si_mask; -+struct kset *sysaufs_ket; -+ -+#define AuSiAttr(_name) { \ -+ .attr = { .name = __stringify(_name), .mode = 0444 }, \ -+ .show = sysaufs_si_##_name, \ -+} -+ -+static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); -+struct attribute *sysaufs_si_attrs[] = { -+ &sysaufs_si_attr_xi_path.attr, -+ NULL, -+}; -+ -+static struct sysfs_ops au_sbi_ops = { -+ .show = sysaufs_si_show -+}; -+ -+static struct kobj_type au_sbi_ktype = { -+ .release = au_si_free, -+ .sysfs_ops = &au_sbi_ops, -+ .default_attrs = sysaufs_si_attrs -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+int sysaufs_si_init(struct au_sbinfo *sbinfo) -+{ -+ int err; -+ -+ sbinfo->si_kobj.kset = sysaufs_ket; -+ /* cf. sysaufs_name() */ -+ err = kobject_init_and_add -+ (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_ket->kobj*/NULL, -+ SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); -+ -+ dbgaufs_si_null(sbinfo); -+ if (!err) { -+ err = dbgaufs_si_init(sbinfo); -+ if (unlikely(err)) -+ kobject_put(&sbinfo->si_kobj); -+ } -+ return err; -+} -+ -+void sysaufs_fin(void) -+{ -+ dbgaufs_fin(); -+ sysfs_remove_group(&sysaufs_ket->kobj, sysaufs_attr_group); -+ kset_unregister(sysaufs_ket); -+} -+ -+int __init sysaufs_init(void) -+{ -+ int err; -+ -+ do { -+ get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); -+ } while (!sysaufs_si_mask); -+ -+ sysaufs_ket = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); -+ err = PTR_ERR(sysaufs_ket); -+ if (IS_ERR(sysaufs_ket)) -+ goto out; -+ err = sysfs_create_group(&sysaufs_ket->kobj, sysaufs_attr_group); -+ if (unlikely(err)) { -+ kset_unregister(sysaufs_ket); -+ goto out; -+ } -+ -+ err = dbgaufs_init(); -+ if (unlikely(err)) -+ sysaufs_fin(); -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/sysaufs.h linux-2.6.31/fs/aufs/sysaufs.h ---- linux-2.6.31-vanilla/fs/aufs/sysaufs.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/sysaufs.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,120 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * sysfs interface and mount lifetime management -+ */ -+ -+#ifndef __SYSAUFS_H__ -+#define __SYSAUFS_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/sysfs.h> -+#include <linux/aufs_type.h> -+#include "module.h" -+ -+struct super_block; -+struct au_sbinfo; -+ -+struct sysaufs_si_attr { -+ struct attribute attr; -+ int (*show)(struct seq_file *seq, struct super_block *sb); -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* sysaufs.c */ -+extern unsigned long sysaufs_si_mask; -+extern struct kset *sysaufs_ket; -+extern struct attribute *sysaufs_si_attrs[]; -+int sysaufs_si_init(struct au_sbinfo *sbinfo); -+int __init sysaufs_init(void); -+void sysaufs_fin(void); -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* some people don't like to show a pointer in the kernel */ -+static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) -+{ -+ return sysaufs_si_mask ^ (unsigned long)sbinfo; -+} -+ -+#define SysaufsSiNamePrefix "si_" -+#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) -+static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) -+{ -+ snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", -+ sysaufs_si_id(sbinfo)); -+} -+ -+struct au_branch; -+#ifdef CONFIG_SYSFS -+/* sysfs.c */ -+extern struct attribute_group *sysaufs_attr_group; -+ -+int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); -+ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, -+ char *buf); -+ -+void sysaufs_br_init(struct au_branch *br); -+void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); -+void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); -+ -+#define sysaufs_brs_init() do {} while (0) -+ -+#else -+#define sysaufs_attr_group NULL -+ -+static inline -+int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) -+{ -+ return 0; -+} -+ -+static inline -+ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, -+ char *buf) -+{ -+ return 0; -+} -+ -+static inline void sysaufs_br_init(struct au_branch *br) -+{ -+ /* empty */ -+} -+ -+static inline void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ /* nothing */ -+} -+ -+static inline void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ /* nothing */ -+} -+ -+static inline void sysaufs_brs_init(void) -+{ -+ sysaufs_brs = 0; -+} -+ -+#endif /* CONFIG_SYSFS */ -+ -+#endif /* __KERNEL__ */ -+#endif /* __SYSAUFS_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/sysfs.c linux-2.6.31/fs/aufs/sysfs.c ---- linux-2.6.31-vanilla/fs/aufs/sysfs.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/sysfs.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,210 @@ -+/* -+ * 
Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * sysfs interface -+ */ -+ -+#include <linux/fs.h> -+#include <linux/module.h> -+#include <linux/seq_file.h> -+#include <linux/sysfs.h> -+#include "aufs.h" -+ -+static struct attribute *au_attr[] = { -+ NULL, /* need to NULL terminate the list of attributes */ -+}; -+ -+static struct attribute_group sysaufs_attr_group_body = { -+ .attrs = au_attr -+}; -+ -+struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; -+ -+/* ---------------------------------------------------------------------- */ -+ -+int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) -+{ -+ int err; -+ -+ SiMustAnyLock(sb); -+ -+ err = 0; -+ if (au_opt_test(au_mntflags(sb), XINO)) { -+ err = au_xino_path(seq, au_sbi(sb)->si_xib); -+ seq_putc(seq, '\n'); -+ } -+ return err; -+} -+ -+/* -+ * the lifetime of a branch is independent of its entry under sysfs. -+ * sysfs handles the lifetime of the entry, and never calls ->show() after it is -+ * unlinked. -+ */ -+static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, -+ aufs_bindex_t bindex) -+{ -+ struct path path; -+ struct dentry *root; -+ struct au_branch *br; -+ -+ AuDbg("b%d\n", bindex); -+ -+ root = sb->s_root; -+ di_read_lock_parent(root, !AuLock_IR); -+ br = au_sbr(sb, bindex); -+ path.mnt = br->br_mnt; -+ path.dentry = au_h_dptr(root, bindex); -+ au_seq_path(seq, &path); -+ di_read_unlock(root, !AuLock_IR); -+ seq_printf(seq, "=%s\n", au_optstr_br_perm(br->br_perm)); -+ return 0; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static struct seq_file *au_seq(char *p, ssize_t len) -+{ -+ struct seq_file *seq; -+ -+ seq = kzalloc(sizeof(*seq), GFP_NOFS); -+ if (seq) { -+ /* mutex_init(&seq.lock); */ -+ seq->buf = p; -+ seq->size = len; -+ return seq; /* success */ -+ } -+ -+ seq = ERR_PTR(-ENOMEM); -+ return seq; -+} -+ -+#define SysaufsBr_PREFIX "br" -+ -+/* todo: file size may exceed PAGE_SIZE */ -+ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, -+ char *buf) -+{ -+ ssize_t err; -+ long l; -+ aufs_bindex_t bend; -+ struct au_sbinfo *sbinfo; -+ struct super_block *sb; -+ struct seq_file *seq; -+ char *name; -+ struct attribute **cattr; -+ -+ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); -+ sb = sbinfo->si_sb; -+ si_noflush_read_lock(sb); -+ -+ seq = au_seq(buf, PAGE_SIZE); -+ err = PTR_ERR(seq); -+ if (IS_ERR(seq)) -+ goto out; -+ -+ name = (void *)attr->name; -+ cattr = sysaufs_si_attrs; -+ while (*cattr) { -+ if (!strcmp(name, (*cattr)->name)) { -+ err = container_of(*cattr, struct sysaufs_si_attr, attr) -+ ->show(seq, sb); -+ goto out_seq; -+ } -+ cattr++; -+ } -+ -+ bend = au_sbend(sb); -+ if (!strncmp(name, SysaufsBr_PREFIX, sizeof(SysaufsBr_PREFIX) - 1)) 
{ -+ name += sizeof(SysaufsBr_PREFIX) - 1; -+ err = strict_strtol(name, 10, &l); -+ if (!err) { -+ if (l <= bend) -+ err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l); -+ else -+ err = -ENOENT; -+ } -+ goto out_seq; -+ } -+ BUG(); -+ -+ out_seq: -+ if (!err) { -+ err = seq->count; -+ /* sysfs limit */ -+ if (unlikely(err == PAGE_SIZE)) -+ err = -EFBIG; -+ } -+ kfree(seq); -+ out: -+ si_read_unlock(sb); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void sysaufs_br_init(struct au_branch *br) -+{ -+ br->br_attr.name = br->br_name; -+ br->br_attr.mode = S_IRUGO; -+ br->br_attr.owner = THIS_MODULE; -+} -+ -+void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ struct au_branch *br; -+ struct kobject *kobj; -+ aufs_bindex_t bend; -+ -+ dbgaufs_brs_del(sb, bindex); -+ -+ if (!sysaufs_brs) -+ return; -+ -+ kobj = &au_sbi(sb)->si_kobj; -+ bend = au_sbend(sb); -+ for (; bindex <= bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ sysfs_remove_file(kobj, &br->br_attr); -+ } -+} -+ -+void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ int err; -+ aufs_bindex_t bend; -+ struct kobject *kobj; -+ struct au_branch *br; -+ -+ dbgaufs_brs_add(sb, bindex); -+ -+ if (!sysaufs_brs) -+ return; -+ -+ kobj = &au_sbi(sb)->si_kobj; -+ bend = au_sbend(sb); -+ for (; bindex <= bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ snprintf(br->br_name, sizeof(br->br_name), SysaufsBr_PREFIX -+ "%d", bindex); -+ err = sysfs_create_file(kobj, &br->br_attr); -+ if (unlikely(err)) -+ AuWarn("failed %s under sysfs(%d)\n", br->br_name, err); -+ } -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/sysrq.c linux-2.6.31/fs/aufs/sysrq.c ---- linux-2.6.31-vanilla/fs/aufs/sysrq.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/sysrq.c 2009-09-16 13:55:29.000000000 +0200 -@@ -0,0 +1,115 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
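[Illustrative sketch — not part of the patch being quoted: sysaufs_brs_add() and sysaufs_brs_del() above are the standard kobject attribute-file pattern, sysfs_create_file()/sysfs_remove_file() against a kobject that owns the directory. A minimal self-contained module doing the same under a hypothetical /sys/fs/demo entry; the demo_* names are invented and a 2.6.31-era API is assumed.

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/sysfs.h>

static struct kobject *demo_kobj;

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "hello\n");
}

/* 0444: world-readable, like the br%d files above (S_IRUGO) */
static struct kobj_attribute demo_attr = __ATTR(demo, 0444, demo_show, NULL);

static int __init demo_init(void)
{
	int err;

	/* appears as /sys/fs/demo, next to the /sys/fs/aufs kset */
	demo_kobj = kobject_create_and_add("demo", fs_kobj);
	if (!demo_kobj)
		return -ENOMEM;
	err = sysfs_create_file(demo_kobj, &demo_attr.attr);
	if (err)
		kobject_put(demo_kobj);
	return err;
}

static void __exit demo_exit(void)
{
	sysfs_remove_file(demo_kobj, &demo_attr.attr);
	kobject_put(demo_kobj);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Unlike this sketch, the per-branch br%d attributes above hang off the per-mount si_kobj, whose release callback (au_si_free) frees the sbinfo, tying the sysfs lifetime to the mount.]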
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * magic sysrq handler -+ */ -+ -+#include <linux/fs.h> -+#include <linux/module.h> -+#include <linux/moduleparam.h> -+/* #include <linux/sysrq.h> */ -+#include "aufs.h" -+ -+/* ---------------------------------------------------------------------- */ -+ -+static void sysrq_sb(struct super_block *sb) -+{ -+ char *plevel; -+ struct au_sbinfo *sbinfo; -+ struct file *file; -+ -+ plevel = au_plevel; -+ au_plevel = KERN_WARNING; -+ au_debug(1); -+ -+ sbinfo = au_sbi(sb); -+ pr_warning("si=%lx\n", sysaufs_si_id(sbinfo)); -+ pr_warning(AUFS_NAME ": superblock\n"); -+ au_dpri_sb(sb); -+ pr_warning(AUFS_NAME ": root dentry\n"); -+ au_dpri_dentry(sb->s_root); -+ pr_warning(AUFS_NAME ": root inode\n"); -+ au_dpri_inode(sb->s_root->d_inode); -+#if 0 -+ struct inode *i; -+ pr_warning(AUFS_NAME ": isolated inode\n"); -+ list_for_each_entry(i, &sb->s_inodes, i_sb_list) -+ if (list_empty(&i->i_dentry)) -+ au_dpri_inode(i); -+#endif -+ pr_warning(AUFS_NAME ": files\n"); -+ list_for_each_entry(file, &sb->s_files, f_u.fu_list) -+ if (!special_file(file->f_dentry->d_inode->i_mode)) -+ au_dpri_file(file); -+ -+ au_plevel = plevel; -+ au_debug(0); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* module parameter */ -+static char *aufs_sysrq_key = "a"; -+module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); -+MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); -+ -+static void au_sysrq(int key __maybe_unused, -+ struct tty_struct *tty __maybe_unused) -+{ -+ struct kobject *kobj; -+ struct au_sbinfo *sbinfo; -+ -+ /* spin_lock(&sysaufs_ket->list_lock); */ -+ list_for_each_entry(kobj, &sysaufs_ket->list, entry) { -+ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); -+ sysrq_sb(sbinfo->si_sb); -+ } -+ /* spin_unlock(&sysaufs_ket->list_lock); */ -+} -+ -+static struct sysrq_key_op au_sysrq_op = { -+ .handler = au_sysrq, -+ .help_msg = "Aufs", -+ .action_msg = "Aufs", -+ .enable_mask = SYSRQ_ENABLE_DUMP -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+int __init au_sysrq_init(void) -+{ -+ int err; -+ char key; -+ -+ err = -1; -+ key = *aufs_sysrq_key; -+ if ('a' <= key && key <= 'z') -+ err = register_sysrq_key(key, &au_sysrq_op); -+ if (unlikely(err)) -+ AuErr("err %d, sysrq=%c\n", err, key); -+ return err; -+} -+ -+void au_sysrq_fin(void) -+{ -+ int err; -+ err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); -+ if (unlikely(err)) -+ AuErr("err %d (ignored)\n", err); -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/vdir.c linux-2.6.31/fs/aufs/vdir.c ---- linux-2.6.31-vanilla/fs/aufs/vdir.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/vdir.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,879 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * virtual or vertical directory -+ */ -+ -+#include <linux/hash.h> -+#include "aufs.h" -+ -+static unsigned int calc_size(int nlen) -+{ -+ BUILD_BUG_ON(sizeof(ino_t) != sizeof(long)); -+ return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); -+} -+ -+static int set_deblk_end(union au_vdir_deblk_p *p, -+ union au_vdir_deblk_p *deblk_end) -+{ -+ if (calc_size(0) <= deblk_end->deblk - p->deblk) { -+ p->de->de_str.len = 0; -+ /* smp_mb(); */ -+ return 0; -+ } -+ return -1; /* error */ -+} -+ -+/* returns true or false */ -+static int is_deblk_end(union au_vdir_deblk_p *p, -+ union au_vdir_deblk_p *deblk_end) -+{ -+ if (calc_size(0) <= deblk_end->deblk - p->deblk) -+ return !p->de->de_str.len; -+ return 1; -+} -+ -+static unsigned char *last_deblk(struct au_vdir *vdir) -+{ -+ return vdir->vd_deblk[vdir->vd_nblk - 1]; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* estimate the appropriate size for name hash table */ -+unsigned int au_rdhash_est(loff_t sz) -+{ -+ unsigned int n; -+ -+ n = UINT_MAX; -+ sz >>= 10; -+ if (sz < n) -+ n = sz; -+ if (sz < AUFS_RDHASH_DEF) -+ n = AUFS_RDHASH_DEF; -+ /* AuInfo("n %u\n", n); */ -+ return n; -+} -+ -+/* -+ * the allocated memory has to be freed by -+ * au_nhash_wh_free() or au_nhash_de_free(). -+ */ -+int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) -+{ -+ struct hlist_head *head; -+ unsigned int u; -+ -+ head = kmalloc(sizeof(*nhash->nh_head) * num_hash, gfp); -+ if (head) { -+ nhash->nh_num = num_hash; -+ nhash->nh_head = head; -+ for (u = 0; u < num_hash; u++) -+ INIT_HLIST_HEAD(head++); -+ return 0; /* success */ -+ } -+ -+ return -ENOMEM; -+} -+ -+static void nhash_count(struct hlist_head *head) -+{ -+#if 0 -+ unsigned long n; -+ struct hlist_node *pos; -+ -+ n = 0; -+ hlist_for_each(pos, head) -+ n++; -+ AuInfo("%lu\n", n); -+#endif -+} -+ -+static void au_nhash_wh_do_free(struct hlist_head *head) -+{ -+ struct au_vdir_wh *tpos; -+ struct hlist_node *pos, *node; -+ -+ hlist_for_each_entry_safe(tpos, pos, node, head, wh_hash) { -+ /* hlist_del(pos); */ -+ kfree(tpos); -+ } -+} -+ -+static void au_nhash_de_do_free(struct hlist_head *head) -+{ -+ struct au_vdir_dehstr *tpos; -+ struct hlist_node *pos, *node; -+ -+ hlist_for_each_entry_safe(tpos, pos, node, head, hash) { -+ /* hlist_del(pos); */ -+ au_cache_free_dehstr(tpos); -+ } -+} -+ -+static void au_nhash_do_free(struct au_nhash *nhash, -+ void (*free)(struct hlist_head *head)) -+{ -+ unsigned int n; -+ struct hlist_head *head; -+ -+ n = nhash->nh_num; -+ if (!n) -+ return; -+ -+ head = nhash->nh_head; -+ while (n-- > 0) { -+ nhash_count(head); -+ free(head++); -+ } -+ kfree(nhash->nh_head); -+} -+ -+void au_nhash_wh_free(struct au_nhash *whlist) -+{ -+ au_nhash_do_free(whlist, au_nhash_wh_do_free); -+} -+ -+static void au_nhash_de_free(struct au_nhash *delist) -+{ -+ au_nhash_do_free(delist, au_nhash_de_do_free); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, -+ int limit) -+{ -+ int num; -+ unsigned int u, n; -+ struct hlist_head *head; -+ struct au_vdir_wh *tpos; -+ struct hlist_node *pos; -+ -+ num = 0; -+ n = 
whlist->nh_num; -+ head = whlist->nh_head; -+ for (u = 0; u < n; u++, head++) -+ hlist_for_each_entry(tpos, pos, head, wh_hash) -+ if (tpos->wh_bindex == btgt && ++num > limit) -+ return 1; -+ return 0; -+} -+ -+static struct hlist_head *au_name_hash(struct au_nhash *nhash, -+ unsigned char *name, -+ unsigned int len) -+{ -+ unsigned int v; -+ /* const unsigned int magic_bit = 12; */ -+ -+ AuDebugOn(!nhash->nh_num || !nhash->nh_head); -+ -+ v = 0; -+ while (len--) -+ v += *name++; -+ /* v = hash_long(v, magic_bit); */ -+ v %= nhash->nh_num; -+ return nhash->nh_head + v; -+} -+ -+static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, -+ int nlen) -+{ -+ return str->len == nlen && !memcmp(str->name, name, nlen); -+} -+ -+/* returns found or not */ -+int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) -+{ -+ struct hlist_head *head; -+ struct au_vdir_wh *tpos; -+ struct hlist_node *pos; -+ struct au_vdir_destr *str; -+ -+ head = au_name_hash(whlist, name, nlen); -+ hlist_for_each_entry(tpos, pos, head, wh_hash) { -+ str = &tpos->wh_str; -+ AuDbg("%.*s\n", str->len, str->name); -+ if (au_nhash_test_name(str, name, nlen)) -+ return 1; -+ } -+ return 0; -+} -+ -+/* returns found(true) or not */ -+static int test_known(struct au_nhash *delist, char *name, int nlen) -+{ -+ struct hlist_head *head; -+ struct au_vdir_dehstr *tpos; -+ struct hlist_node *pos; -+ struct au_vdir_destr *str; -+ -+ head = au_name_hash(delist, name, nlen); -+ hlist_for_each_entry(tpos, pos, head, hash) { -+ str = tpos->str; -+ AuDbg("%.*s\n", str->len, str->name); -+ if (au_nhash_test_name(str, name, nlen)) -+ return 1; -+ } -+ return 0; -+} -+ -+static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, -+ unsigned char d_type) -+{ -+#ifdef CONFIG_AUFS_SHWH -+ wh->wh_ino = ino; -+ wh->wh_type = d_type; -+#endif -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, -+ unsigned int d_type, aufs_bindex_t bindex, -+ unsigned char shwh) -+{ -+ int err; -+ struct au_vdir_destr *str; -+ struct au_vdir_wh *wh; -+ -+ AuDbg("%.*s\n", nlen, name); -+ AuDebugOn(!whlist->nh_num || !whlist->nh_head); -+ -+ err = -ENOMEM; -+ wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); -+ if (unlikely(!wh)) -+ goto out; -+ -+ err = 0; -+ wh->wh_bindex = bindex; -+ if (shwh) -+ au_shwh_init_wh(wh, ino, d_type); -+ str = &wh->wh_str; -+ str->len = nlen; -+ memcpy(str->name, name, nlen); -+ hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); -+ /* smp_mb(); */ -+ -+ out: -+ return err; -+} -+ -+static int append_deblk(struct au_vdir *vdir) -+{ -+ int err; -+ unsigned long ul; -+ const unsigned int deblk_sz = vdir->vd_deblk_sz; -+ union au_vdir_deblk_p p, deblk_end; -+ unsigned char **o; -+ -+ err = -ENOMEM; -+ o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), -+ GFP_NOFS); -+ if (unlikely(!o)) -+ goto out; -+ -+ vdir->vd_deblk = o; -+ p.deblk = kmalloc(deblk_sz, GFP_NOFS); -+ if (p.deblk) { -+ ul = vdir->vd_nblk++; -+ vdir->vd_deblk[ul] = p.deblk; -+ vdir->vd_last.ul = ul; -+ vdir->vd_last.p.deblk = p.deblk; -+ deblk_end.deblk = p.deblk + deblk_sz; -+ err = set_deblk_end(&p, &deblk_end); -+ } -+ -+ out: -+ return err; -+} -+ -+static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, -+ unsigned int d_type, struct au_nhash *delist) -+{ -+ int err; -+ unsigned int sz; -+ const unsigned int deblk_sz = vdir->vd_deblk_sz; -+ union au_vdir_deblk_p p, *room, 
deblk_end; -+ struct au_vdir_dehstr *dehstr; -+ -+ p.deblk = last_deblk(vdir); -+ deblk_end.deblk = p.deblk + deblk_sz; -+ room = &vdir->vd_last.p; -+ AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk -+ || !is_deblk_end(room, &deblk_end)); -+ -+ sz = calc_size(nlen); -+ if (unlikely(sz > deblk_end.deblk - room->deblk)) { -+ err = append_deblk(vdir); -+ if (unlikely(err)) -+ goto out; -+ -+ p.deblk = last_deblk(vdir); -+ deblk_end.deblk = p.deblk + deblk_sz; -+ /* smp_mb(); */ -+ AuDebugOn(room->deblk != p.deblk); -+ } -+ -+ err = -ENOMEM; -+ dehstr = au_cache_alloc_dehstr(); -+ if (unlikely(!dehstr)) -+ goto out; -+ -+ dehstr->str = &room->de->de_str; -+ hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); -+ room->de->de_ino = ino; -+ room->de->de_type = d_type; -+ room->de->de_str.len = nlen; -+ memcpy(room->de->de_str.name, name, nlen); -+ -+ err = 0; -+ room->deblk += sz; -+ if (unlikely(set_deblk_end(room, &deblk_end))) -+ err = append_deblk(vdir); -+ /* smp_mb(); */ -+ -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_vdir_free(struct au_vdir *vdir) -+{ -+ unsigned char **deblk; -+ -+ deblk = vdir->vd_deblk; -+ while (vdir->vd_nblk--) -+ kfree(*deblk++); -+ kfree(vdir->vd_deblk); -+ au_cache_free_vdir(vdir); -+} -+ -+static struct au_vdir *alloc_vdir(struct file *file) -+{ -+ struct au_vdir *vdir; -+ struct super_block *sb; -+ int err; -+ -+ sb = file->f_dentry->d_sb; -+ SiMustAnyLock(sb); -+ -+ err = -ENOMEM; -+ vdir = au_cache_alloc_vdir(); -+ if (unlikely(!vdir)) -+ goto out; -+ -+ vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); -+ if (unlikely(!vdir->vd_deblk)) -+ goto out_free; -+ -+ vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; -+ if (!vdir->vd_deblk_sz) { -+ /* estimate the appropriate size for deblk */ -+ vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); -+ /* AuInfo("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ -+ } -+ vdir->vd_nblk = 0; -+ vdir->vd_version = 0; -+ vdir->vd_jiffy = 0; -+ err = append_deblk(vdir); -+ if (!err) -+ return vdir; /* success */ -+ -+ kfree(vdir->vd_deblk); -+ -+ out_free: -+ au_cache_free_vdir(vdir); -+ out: -+ vdir = ERR_PTR(err); -+ return vdir; -+} -+ -+static int reinit_vdir(struct au_vdir *vdir) -+{ -+ int err; -+ union au_vdir_deblk_p p, deblk_end; -+ -+ while (vdir->vd_nblk > 1) { -+ kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); -+ /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ -+ vdir->vd_nblk--; -+ } -+ p.deblk = vdir->vd_deblk[0]; -+ deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; -+ err = set_deblk_end(&p, &deblk_end); -+ /* keep vd_dblk_sz */ -+ vdir->vd_last.ul = 0; -+ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; -+ vdir->vd_version = 0; -+ vdir->vd_jiffy = 0; -+ /* smp_mb(); */ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+#define AuFillVdir_CALLED 1 -+#define AuFillVdir_WHABLE (1 << 1) -+#define AuFillVdir_SHWH (1 << 2) -+#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) -+#define au_fset_fillvdir(flags, name) { (flags) |= AuFillVdir_##name; } -+#define au_fclr_fillvdir(flags, name) { (flags) &= ~AuFillVdir_##name; } -+ -+#ifndef CONFIG_AUFS_SHWH -+#undef AuFillVdir_SHWH -+#define AuFillVdir_SHWH 0 -+#endif -+ -+struct fillvdir_arg { -+ struct file *file; -+ struct au_vdir *vdir; -+ struct au_nhash delist; -+ struct au_nhash whlist; -+ aufs_bindex_t bindex; -+ unsigned int flags; -+ int err; -+}; -+ -+static int fillvdir(void *__arg, const char 
*__name, int nlen, -+ loff_t offset __maybe_unused, u64 h_ino, -+ unsigned int d_type) -+{ -+ struct fillvdir_arg *arg = __arg; -+ char *name = (void *)__name; -+ struct super_block *sb; -+ ino_t ino; -+ const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); -+ -+ arg->err = 0; -+ sb = arg->file->f_dentry->d_sb; -+ au_fset_fillvdir(arg->flags, CALLED); -+ /* smp_mb(); */ -+ if (nlen <= AUFS_WH_PFX_LEN -+ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { -+ if (test_known(&arg->delist, name, nlen) -+ || au_nhash_test_known_wh(&arg->whlist, name, nlen)) -+ goto out; /* already exists or whiteouted */ -+ -+ sb = arg->file->f_dentry->d_sb; -+ arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); -+ if (!arg->err) -+ arg->err = append_de(arg->vdir, name, nlen, ino, -+ d_type, &arg->delist); -+ } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { -+ name += AUFS_WH_PFX_LEN; -+ nlen -= AUFS_WH_PFX_LEN; -+ if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) -+ goto out; /* already whiteouted */ -+ -+ if (shwh) -+ arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, -+ &ino); -+ if (!arg->err) -+ arg->err = au_nhash_append_wh -+ (&arg->whlist, name, nlen, ino, d_type, -+ arg->bindex, shwh); -+ } -+ -+ out: -+ if (!arg->err) -+ arg->vdir->vd_jiffy = jiffies; -+ /* smp_mb(); */ -+ AuTraceErr(arg->err); -+ return arg->err; -+} -+ -+static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir, -+ struct au_nhash *whlist, struct au_nhash *delist) -+{ -+#ifdef CONFIG_AUFS_SHWH -+ int err; -+ unsigned int nh, u; -+ struct hlist_head *head; -+ struct au_vdir_wh *tpos; -+ struct hlist_node *pos, *n; -+ char *p, *o; -+ struct au_vdir_destr *destr; -+ -+ AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); -+ -+ err = -ENOMEM; -+ o = p = __getname(); -+ if (unlikely(!p)) -+ goto out; -+ -+ err = 0; -+ nh = whlist->nh_num; -+ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); -+ p += AUFS_WH_PFX_LEN; -+ for (u = 0; u < nh; u++) { -+ head = whlist->nh_head + u; -+ hlist_for_each_entry_safe(tpos, pos, n, head, wh_hash) { -+ destr = &tpos->wh_str; -+ memcpy(p, destr->name, destr->len); -+ err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, -+ tpos->wh_ino, tpos->wh_type, delist); -+ if (unlikely(err)) -+ break; -+ } -+ } -+ -+ __putname(o); -+ -+ out: -+ AuTraceErr(err); -+ return err; -+#else -+ return 0; -+#endif -+} -+ -+static int au_do_read_vdir(struct fillvdir_arg *arg) -+{ -+ int err; -+ unsigned int rdhash; -+ loff_t offset; -+ aufs_bindex_t bend, bindex, bstart; -+ unsigned char shwh; -+ struct file *hf, *file; -+ struct super_block *sb; -+ -+ file = arg->file; -+ sb = file->f_dentry->d_sb; -+ SiMustAnyLock(sb); -+ -+ rdhash = au_sbi(sb)->si_rdhash; -+ if (!rdhash) -+ rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); -+ err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); -+ if (unlikely(err)) -+ goto out; -+ err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); -+ if (unlikely(err)) -+ goto out_delist; -+ -+ err = 0; -+ arg->flags = 0; -+ shwh = 0; -+ if (au_opt_test(au_mntflags(sb), SHWH)) { -+ shwh = 1; -+ au_fset_fillvdir(arg->flags, SHWH); -+ } -+ bstart = au_fbstart(file); -+ bend = au_fbend(file); -+ for (bindex = bstart; !err && bindex <= bend; bindex++) { -+ hf = au_h_fptr(file, bindex); -+ if (!hf) -+ continue; -+ -+ offset = vfsub_llseek(hf, 0, SEEK_SET); -+ err = offset; -+ if (unlikely(offset)) -+ break; -+ -+ arg->bindex = bindex; -+ au_fclr_fillvdir(arg->flags, WHABLE); -+ if (shwh -+ || (bindex != bend -+ && au_br_whable(au_sbr_perm(sb, bindex)))) -+ 
au_fset_fillvdir(arg->flags, WHABLE); -+ do { -+ arg->err = 0; -+ au_fclr_fillvdir(arg->flags, CALLED); -+ /* smp_mb(); */ -+ err = vfsub_readdir(hf, fillvdir, arg); -+ if (err >= 0) -+ err = arg->err; -+ } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); -+ } -+ -+ if (!err && shwh) -+ err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); -+ -+ au_nhash_wh_free(&arg->whlist); -+ -+ out_delist: -+ au_nhash_de_free(&arg->delist); -+ out: -+ return err; -+} -+ -+static int read_vdir(struct file *file, int may_read) -+{ -+ int err; -+ unsigned long expire; -+ unsigned char do_read; -+ struct fillvdir_arg arg; -+ struct inode *inode; -+ struct au_vdir *vdir, *allocated; -+ -+ err = 0; -+ inode = file->f_dentry->d_inode; -+ IMustLock(inode); -+ SiMustAnyLock(inode->i_sb); -+ -+ allocated = NULL; -+ do_read = 0; -+ expire = au_sbi(inode->i_sb)->si_rdcache; -+ vdir = au_ivdir(inode); -+ if (!vdir) { -+ do_read = 1; -+ vdir = alloc_vdir(file); -+ err = PTR_ERR(vdir); -+ if (IS_ERR(vdir)) -+ goto out; -+ err = 0; -+ allocated = vdir; -+ } else if (may_read -+ && (inode->i_version != vdir->vd_version -+ || time_after(jiffies, vdir->vd_jiffy + expire))) { -+ do_read = 1; -+ err = reinit_vdir(vdir); -+ if (unlikely(err)) -+ goto out; -+ } -+ -+ if (!do_read) -+ return 0; /* success */ -+ -+ arg.file = file; -+ arg.vdir = vdir; -+ err = au_do_read_vdir(&arg); -+ if (!err) { -+ /* file->f_pos = 0; */ -+ vdir->vd_version = inode->i_version; -+ vdir->vd_last.ul = 0; -+ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; -+ if (allocated) -+ au_set_ivdir(inode, allocated); -+ } else if (allocated) -+ au_vdir_free(allocated); -+ -+ out: -+ return err; -+} -+ -+static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) -+{ -+ int err, rerr; -+ unsigned long ul, n; -+ const unsigned int deblk_sz = src->vd_deblk_sz; -+ -+ AuDebugOn(tgt->vd_nblk != 1); -+ -+ err = -ENOMEM; -+ if (tgt->vd_nblk < src->vd_nblk) { -+ unsigned char **p; -+ -+ p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, -+ GFP_NOFS); -+ if (unlikely(!p)) -+ goto out; -+ tgt->vd_deblk = p; -+ } -+ -+ if (tgt->vd_deblk_sz != deblk_sz) { -+ unsigned char *p; -+ -+ tgt->vd_deblk_sz = deblk_sz; -+ p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); -+ if (unlikely(!p)) -+ goto out; -+ tgt->vd_deblk[0] = p; -+ } -+ memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); -+ tgt->vd_version = src->vd_version; -+ tgt->vd_jiffy = src->vd_jiffy; -+ -+ n = src->vd_nblk; -+ for (ul = 1; ul < n; ul++) { -+ tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, -+ GFP_NOFS); -+ if (unlikely(!tgt->vd_deblk[ul])) -+ goto out; -+ tgt->vd_nblk++; -+ } -+ tgt->vd_nblk = n; -+ tgt->vd_last.ul = tgt->vd_last.ul; -+ tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; -+ tgt->vd_last.p.deblk += src->vd_last.p.deblk -+ - src->vd_deblk[src->vd_last.ul]; -+ /* smp_mb(); */ -+ return 0; /* success */ -+ -+ out: -+ rerr = reinit_vdir(tgt); -+ BUG_ON(rerr); -+ return err; -+} -+ -+int au_vdir_init(struct file *file) -+{ -+ int err; -+ struct inode *inode; -+ struct au_vdir *vdir_cache, *allocated; -+ -+ err = read_vdir(file, !file->f_pos); -+ if (unlikely(err)) -+ goto out; -+ -+ allocated = NULL; -+ vdir_cache = au_fvdir_cache(file); -+ if (!vdir_cache) { -+ vdir_cache = alloc_vdir(file); -+ err = PTR_ERR(vdir_cache); -+ if (IS_ERR(vdir_cache)) -+ goto out; -+ allocated = vdir_cache; -+ } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { -+ err = reinit_vdir(vdir_cache); -+ if (unlikely(err)) -+ goto out; -+ } else -+ return 0; /* 
success */ -+ -+ inode = file->f_dentry->d_inode; -+ err = copy_vdir(vdir_cache, au_ivdir(inode)); -+ if (!err) { -+ file->f_version = inode->i_version; -+ if (allocated) -+ au_set_fvdir_cache(file, allocated); -+ } else if (allocated) -+ au_vdir_free(allocated); -+ -+ out: -+ return err; -+} -+ -+static loff_t calc_offset(struct au_vdir *vdir) -+{ -+ loff_t offset; -+ union au_vdir_deblk_p p; -+ -+ p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; -+ offset = vdir->vd_last.p.deblk - p.deblk; -+ offset += vdir->vd_deblk_sz * vdir->vd_last.ul; -+ return offset; -+} -+ -+/* returns true or false */ -+static int seek_vdir(struct file *file) -+{ -+ int valid; -+ unsigned int deblk_sz; -+ unsigned long ul, n; -+ loff_t offset; -+ union au_vdir_deblk_p p, deblk_end; -+ struct au_vdir *vdir_cache; -+ -+ valid = 1; -+ vdir_cache = au_fvdir_cache(file); -+ offset = calc_offset(vdir_cache); -+ AuDbg("offset %lld\n", offset); -+ if (file->f_pos == offset) -+ goto out; -+ -+ vdir_cache->vd_last.ul = 0; -+ vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; -+ if (!file->f_pos) -+ goto out; -+ -+ valid = 0; -+ deblk_sz = vdir_cache->vd_deblk_sz; -+ ul = div64_u64(file->f_pos, deblk_sz); -+ AuDbg("ul %lu\n", ul); -+ if (ul >= vdir_cache->vd_nblk) -+ goto out; -+ -+ n = vdir_cache->vd_nblk; -+ for (; ul < n; ul++) { -+ p.deblk = vdir_cache->vd_deblk[ul]; -+ deblk_end.deblk = p.deblk + deblk_sz; -+ offset = ul; -+ offset *= deblk_sz; -+ while (!is_deblk_end(&p, &deblk_end) && offset < file->f_pos) { -+ unsigned int l; -+ -+ l = calc_size(p.de->de_str.len); -+ offset += l; -+ p.deblk += l; -+ } -+ if (!is_deblk_end(&p, &deblk_end)) { -+ valid = 1; -+ vdir_cache->vd_last.ul = ul; -+ vdir_cache->vd_last.p = p; -+ break; -+ } -+ } -+ -+ out: -+ /* smp_mb(); */ -+ AuTraceErr(!valid); -+ return valid; -+} -+ -+int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir) -+{ -+ int err; -+ unsigned int l, deblk_sz; -+ union au_vdir_deblk_p deblk_end; -+ struct au_vdir *vdir_cache; -+ struct au_vdir_de *de; -+ -+ vdir_cache = au_fvdir_cache(file); -+ if (!seek_vdir(file)) -+ return 0; -+ -+ deblk_sz = vdir_cache->vd_deblk_sz; -+ while (1) { -+ deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; -+ deblk_end.deblk += deblk_sz; -+ while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { -+ de = vdir_cache->vd_last.p.de; -+ AuDbg("%.*s, off%lld, i%lu, dt%d\n", -+ de->de_str.len, de->de_str.name, file->f_pos, -+ (unsigned long)de->de_ino, de->de_type); -+ err = filldir(dirent, de->de_str.name, de->de_str.len, -+ file->f_pos, de->de_ino, de->de_type); -+ if (unlikely(err)) { -+ AuTraceErr(err); -+ /* todo: ignore the error caused by udba? */ -+ /* return err; */ -+ return 0; -+ } -+ -+ l = calc_size(de->de_str.len); -+ vdir_cache->vd_last.p.deblk += l; -+ file->f_pos += l; -+ } -+ if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { -+ vdir_cache->vd_last.ul++; -+ vdir_cache->vd_last.p.deblk -+ = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; -+ file->f_pos = deblk_sz * vdir_cache->vd_last.ul; -+ continue; -+ } -+ break; -+ } -+ -+ /* smp_mb(); */ -+ return 0; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/vfsub.c linux-2.6.31/fs/aufs/vfsub.c ---- linux-2.6.31-vanilla/fs/aufs/vfsub.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/vfsub.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,755 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * sub-routines for VFS -+ */ -+ -+#include <linux/ima.h> -+#include <linux/namei.h> -+#include <linux/security.h> -+#include <linux/splice.h> -+#include <linux/uaccess.h> -+#include "aufs.h" -+ -+int vfsub_update_h_iattr(struct path *h_path, int *did) -+{ -+ int err; -+ struct kstat st; -+ struct super_block *h_sb; -+ -+ /* for remote fs, leave work for its getattr or d_revalidate */ -+ /* for bad i_attr fs, handle them in aufs_getattr() */ -+ /* still some fs may acquire i_mutex. we need to skip them */ -+ err = 0; -+ if (!did) -+ did = &err; -+ h_sb = h_path->dentry->d_sb; -+ *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); -+ if (*did) -+ err = vfs_getattr(h_path->mnt, h_path->dentry, &st); -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_IMA -+#error IMA is not supported since it does not work well. Wait for their fixing. -+#endif -+ -+struct file *vfsub_dentry_open(struct path *path, int flags, -+ const struct cred *cred) -+{ -+ struct file *file; -+ -+ file = dentry_open(path->dentry, path->mnt, flags, cred); -+ if (IS_ERR(file)) -+ return file; -+ /* as NFSD does, just call ima_..._get() simply after dentry_open */ -+ ima_counts_get(file); -+ return file; -+} -+ -+struct file *vfsub_filp_open(const char *path, int oflags, int mode) -+{ -+ struct file *file; -+ -+ lockdep_off(); -+ file = filp_open(path, oflags, mode); -+ lockdep_on(); -+ if (IS_ERR(file)) -+ goto out; -+ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ -+ -+ out: -+ return file; -+} -+ -+int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) -+{ -+ int err; -+ -+ /* lockdep_off(); */ -+ err = kern_path(name, flags, path); -+ /* lockdep_on(); */ -+ if (!err && path->dentry->d_inode) -+ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ -+ return err; -+} -+ -+struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, -+ int len) -+{ -+ struct path path = { -+ .mnt = NULL -+ }; -+ -+ /* VFS checks it too, but by WARN_ON_ONCE() */ -+ IMustLock(parent->d_inode); -+ -+ path.dentry = lookup_one_len(name, parent, len); -+ if (IS_ERR(path.dentry)) -+ goto out; -+ if (path.dentry->d_inode) -+ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ -+ -+ out: -+ return path.dentry; -+} -+ -+struct dentry *vfsub_lookup_hash(struct nameidata *nd) -+{ -+ struct path path = { -+ .mnt = nd->path.mnt -+ }; -+ -+ IMustLock(nd->path.dentry->d_inode); -+ -+ path.dentry = lookup_hash(nd); -+ if (!IS_ERR(path.dentry) && path.dentry->d_inode) -+ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ -+ -+ return path.dentry; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct dentry 
*vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, -+ struct dentry *d2, struct au_hinode *hdir2) -+{ -+ struct dentry *d; -+ -+ lockdep_off(); -+ d = lock_rename(d1, d2); -+ lockdep_on(); -+ au_hin_suspend(hdir1); -+ if (hdir1 != hdir2) -+ au_hin_suspend(hdir2); -+ -+ return d; -+} -+ -+void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, -+ struct dentry *d2, struct au_hinode *hdir2) -+{ -+ au_hin_resume(hdir1); -+ if (hdir1 != hdir2) -+ au_hin_resume(hdir2); -+ lockdep_off(); -+ unlock_rename(d1, d2); -+ lockdep_on(); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int vfsub_create(struct inode *dir, struct path *path, int mode) -+{ -+ int err; -+ struct dentry *d; -+ -+ IMustLock(dir); -+ -+ d = path->dentry; -+ path->dentry = d->d_parent; -+ err = security_path_mknod(path, path->dentry, mode, 0); -+ path->dentry = d; -+ if (unlikely(err)) -+ goto out; -+ -+ if (au_test_fs_null_nd(dir->i_sb)) -+ err = vfs_create(dir, path->dentry, mode, NULL); -+ else { -+ struct nameidata h_nd; -+ -+ memset(&h_nd, 0, sizeof(h_nd)); -+ h_nd.flags = LOOKUP_CREATE; -+ h_nd.intent.open.flags = O_CREAT -+ | vfsub_fmode_to_uint(FMODE_READ); -+ h_nd.intent.open.create_mode = mode; -+ h_nd.path.dentry = path->dentry->d_parent; -+ h_nd.path.mnt = path->mnt; -+ path_get(&h_nd.path); -+ err = vfs_create(dir, path->dentry, mode, &h_nd); -+ path_put(&h_nd.path); -+ } -+ -+ if (!err) { -+ struct path tmp = *path; -+ int did; -+ -+ vfsub_update_h_iattr(&tmp, &did); -+ if (did) { -+ tmp.dentry = path->dentry->d_parent; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ } -+ /*ignore*/ -+ } -+ -+ out: -+ return err; -+} -+ -+int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) -+{ -+ int err; -+ struct dentry *d; -+ -+ IMustLock(dir); -+ -+ d = path->dentry; -+ path->dentry = d->d_parent; -+ err = security_path_symlink(path, path->dentry, symname); -+ path->dentry = d; -+ if (unlikely(err)) -+ goto out; -+ -+ err = vfs_symlink(dir, path->dentry, symname); -+ if (!err) { -+ struct path tmp = *path; -+ int did; -+ -+ vfsub_update_h_iattr(&tmp, &did); -+ if (did) { -+ tmp.dentry = path->dentry->d_parent; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ } -+ /*ignore*/ -+ } -+ -+ out: -+ return err; -+} -+ -+int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) -+{ -+ int err; -+ struct dentry *d; -+ -+ IMustLock(dir); -+ -+ d = path->dentry; -+ path->dentry = d->d_parent; -+ err = security_path_mknod(path, path->dentry, mode, dev); -+ path->dentry = d; -+ if (unlikely(err)) -+ goto out; -+ -+ err = vfs_mknod(dir, path->dentry, mode, dev); -+ if (!err) { -+ struct path tmp = *path; -+ int did; -+ -+ vfsub_update_h_iattr(&tmp, &did); -+ if (did) { -+ tmp.dentry = path->dentry->d_parent; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ } -+ /*ignore*/ -+ } -+ -+ out: -+ return err; -+} -+ -+static int au_test_nlink(struct inode *inode) -+{ -+ const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ -+ -+ if (!au_test_fs_no_limit_nlink(inode->i_sb) -+ || inode->i_nlink < link_max) -+ return 0; -+ return -EMLINK; -+} -+ -+int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path) -+{ -+ int err; -+ struct dentry *d; -+ -+ IMustLock(dir); -+ -+ err = au_test_nlink(src_dentry->d_inode); -+ if (unlikely(err)) -+ return err; -+ -+ d = path->dentry; -+ path->dentry = d->d_parent; -+ err = security_path_link(src_dentry, path, path->dentry); -+ path->dentry = d; -+ if (unlikely(err)) -+ goto out; -+ 
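-+ /* -+ * note: the lockdep_off()/lockdep_on() pairs in this file (here around -+ * vfs_link(), and likewise around vfs_rename(), vfs_rmdir() and -+ * vfs_unlink()) suspend lock dependency tracking while aufs re-enters -+ * the VFS on the lower branch; the stacked locking apparently exceeds -+ * what lockdep can model (cf. the MAX_LOCKDEP_SUBCLASSES remark in -+ * vfsub.h). -+ */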
-+ lockdep_off(); -+ err = vfs_link(src_dentry, dir, path->dentry); -+ lockdep_on(); -+ if (!err) { -+ struct path tmp = *path; -+ int did; -+ -+ /* fuse has different memory inode for the same inumber */ -+ vfsub_update_h_iattr(&tmp, &did); -+ if (did) { -+ tmp.dentry = path->dentry->d_parent; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ tmp.dentry = src_dentry; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ } -+ /*ignore*/ -+ } -+ -+ out: -+ return err; -+} -+ -+int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, -+ struct inode *dir, struct path *path) -+{ -+ int err; -+ struct path tmp = { -+ .mnt = path->mnt -+ }; -+ struct dentry *d; -+ -+ IMustLock(dir); -+ IMustLock(src_dir); -+ -+ d = path->dentry; -+ path->dentry = d->d_parent; -+ tmp.dentry = src_dentry->d_parent; -+ err = security_path_rename(&tmp, src_dentry, path, path->dentry); -+ path->dentry = d; -+ if (unlikely(err)) -+ goto out; -+ -+ lockdep_off(); -+ err = vfs_rename(src_dir, src_dentry, dir, path->dentry); -+ lockdep_on(); -+ if (!err) { -+ int did; -+ -+ tmp.dentry = d->d_parent; -+ vfsub_update_h_iattr(&tmp, &did); -+ if (did) { -+ tmp.dentry = src_dentry; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ tmp.dentry = src_dentry->d_parent; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ } -+ /*ignore*/ -+ } -+ -+ out: -+ return err; -+} -+ -+int vfsub_mkdir(struct inode *dir, struct path *path, int mode) -+{ -+ int err; -+ struct dentry *d; -+ -+ IMustLock(dir); -+ -+ d = path->dentry; -+ path->dentry = d->d_parent; -+ err = security_path_mkdir(path, path->dentry, mode); -+ path->dentry = d; -+ if (unlikely(err)) -+ goto out; -+ -+ err = vfs_mkdir(dir, path->dentry, mode); -+ if (!err) { -+ struct path tmp = *path; -+ int did; -+ -+ vfsub_update_h_iattr(&tmp, &did); -+ if (did) { -+ tmp.dentry = path->dentry->d_parent; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); -+ } -+ /*ignore*/ -+ } -+ -+ out: -+ return err; -+} -+ -+int vfsub_rmdir(struct inode *dir, struct path *path) -+{ -+ int err; -+ struct dentry *d; -+ -+ IMustLock(dir); -+ -+ d = path->dentry; -+ path->dentry = d->d_parent; -+ err = security_path_rmdir(path, path->dentry); -+ path->dentry = d; -+ if (unlikely(err)) -+ goto out; -+ -+ lockdep_off(); -+ err = vfs_rmdir(dir, path->dentry); -+ lockdep_on(); -+ if (!err) { -+ struct path tmp = { -+ .dentry = path->dentry->d_parent, -+ .mnt = path->mnt -+ }; -+ -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ -+ } -+ -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, -+ loff_t *ppos) -+{ -+ ssize_t err; -+ -+ err = vfs_read(file, ubuf, count, ppos); -+ if (err >= 0) -+ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ -+ return err; -+} -+ -+/* todo: kernel_read()? 
*/ -+ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, -+ loff_t *ppos) -+{ -+ ssize_t err; -+ mm_segment_t oldfs; -+ -+ oldfs = get_fs(); -+ set_fs(KERNEL_DS); -+ err = vfsub_read_u(file, (char __user *)kbuf, count, ppos); -+ set_fs(oldfs); -+ return err; -+} -+ -+ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, -+ loff_t *ppos) -+{ -+ ssize_t err; -+ -+ lockdep_off(); -+ err = vfs_write(file, ubuf, count, ppos); -+ lockdep_on(); -+ if (err >= 0) -+ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ -+ return err; -+} -+ -+ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) -+{ -+ ssize_t err; -+ mm_segment_t oldfs; -+ -+ oldfs = get_fs(); -+ set_fs(KERNEL_DS); -+ err = vfsub_write_u(file, (const char __user *)kbuf, count, ppos); -+ set_fs(oldfs); -+ return err; -+} -+ -+int vfsub_readdir(struct file *file, filldir_t filldir, void *arg) -+{ -+ int err; -+ -+ lockdep_off(); -+ err = vfs_readdir(file, filldir, arg); -+ lockdep_on(); -+ if (err >= 0) -+ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ -+ return err; -+} -+ -+long vfsub_splice_to(struct file *in, loff_t *ppos, -+ struct pipe_inode_info *pipe, size_t len, -+ unsigned int flags) -+{ -+ long err; -+ -+ lockdep_off(); -+ err = do_splice_to(in, ppos, pipe, len, flags); -+ lockdep_on(); -+ if (err >= 0) -+ vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ -+ return err; -+} -+ -+long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, -+ loff_t *ppos, size_t len, unsigned int flags) -+{ -+ long err; -+ -+ lockdep_off(); -+ err = do_splice_from(pipe, out, ppos, len, flags); -+ lockdep_on(); -+ if (err >= 0) -+ vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ -+ return err; -+} -+ -+/* cf. 
open.c:do_sys_truncate() and do_sys_ftruncate() */ -+int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, -+ struct file *h_file) -+{ -+ int err; -+ struct inode *h_inode; -+ -+ h_inode = h_path->dentry->d_inode; -+ if (!h_file) { -+ err = mnt_want_write(h_path->mnt); -+ if (err) -+ goto out; -+ err = inode_permission(h_inode, MAY_WRITE); -+ if (err) -+ goto out_mnt; -+ err = get_write_access(h_inode); -+ if (err) -+ goto out_mnt; -+ err = break_lease(h_inode, vfsub_fmode_to_uint(FMODE_WRITE)); -+ if (err) -+ goto out_inode; -+ } -+ -+ err = locks_verify_truncate(h_inode, h_file, length); -+ if (!err) -+ err = security_path_truncate(h_path, length, attr); -+ if (!err) { -+ lockdep_off(); -+ err = do_truncate(h_path->dentry, length, attr, h_file); -+ lockdep_on(); -+ } -+ -+ out_inode: -+ if (!h_file) -+ put_write_access(h_inode); -+ out_mnt: -+ if (!h_file) -+ mnt_drop_write(h_path->mnt); -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct au_vfsub_mkdir_args { -+ int *errp; -+ struct inode *dir; -+ struct path *path; -+ int mode; -+}; -+ -+static void au_call_vfsub_mkdir(void *args) -+{ -+ struct au_vfsub_mkdir_args *a = args; -+ *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); -+} -+ -+int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) -+{ -+ int err, do_sio, wkq_err; -+ -+ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); -+ if (!do_sio) -+ err = vfsub_mkdir(dir, path, mode); -+ else { -+ struct au_vfsub_mkdir_args args = { -+ .errp = &err, -+ .dir = dir, -+ .path = path, -+ .mode = mode -+ }; -+ wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } -+ -+ return err; -+} -+ -+struct au_vfsub_rmdir_args { -+ int *errp; -+ struct inode *dir; -+ struct path *path; -+}; -+ -+static void au_call_vfsub_rmdir(void *args) -+{ -+ struct au_vfsub_rmdir_args *a = args; -+ *a->errp = vfsub_rmdir(a->dir, a->path); -+} -+ -+int vfsub_sio_rmdir(struct inode *dir, struct path *path) -+{ -+ int err, do_sio, wkq_err; -+ -+ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); -+ if (!do_sio) -+ err = vfsub_rmdir(dir, path); -+ else { -+ struct au_vfsub_rmdir_args args = { -+ .errp = &err, -+ .dir = dir, -+ .path = path -+ }; -+ wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct notify_change_args { -+ int *errp; -+ struct path *path; -+ struct iattr *ia; -+}; -+ -+static void call_notify_change(void *args) -+{ -+ struct notify_change_args *a = args; -+ struct inode *h_inode; -+ -+ h_inode = a->path->dentry->d_inode; -+ IMustLock(h_inode); -+ -+ *a->errp = -EPERM; -+ if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { -+ lockdep_off(); -+ *a->errp = notify_change(a->path->dentry, a->ia); -+ lockdep_on(); -+ if (!*a->errp) -+ vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ -+ } -+ AuTraceErr(*a->errp); -+} -+ -+int vfsub_notify_change(struct path *path, struct iattr *ia) -+{ -+ int err; -+ struct notify_change_args args = { -+ .errp = &err, -+ .path = path, -+ .ia = ia -+ }; -+ -+ call_notify_change(&args); -+ -+ return err; -+} -+ -+int vfsub_sio_notify_change(struct path *path, struct iattr *ia) -+{ -+ int err, wkq_err; -+ struct notify_change_args args = { -+ .errp = &err, -+ .path = path, -+ .ia = ia -+ }; -+ -+ wkq_err = au_wkq_wait(call_notify_change, &args); -+ if 
(unlikely(wkq_err)) -+ err = wkq_err; -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct unlink_args { -+ int *errp; -+ struct inode *dir; -+ struct path *path; -+}; -+ -+static void call_unlink(void *args) -+{ -+ struct unlink_args *a = args; -+ struct dentry *d = a->path->dentry; -+ struct inode *h_inode; -+ const int stop_sillyrename = (au_test_nfs(d->d_sb) -+ && atomic_read(&d->d_count) == 1); -+ -+ IMustLock(a->dir); -+ -+ a->path->dentry = d->d_parent; -+ *a->errp = security_path_unlink(a->path, d); -+ a->path->dentry = d; -+ if (unlikely(*a->errp)) -+ return; -+ -+ if (!stop_sillyrename) -+ dget(d); -+ h_inode = d->d_inode; -+ if (h_inode) -+ atomic_inc(&h_inode->i_count); -+ -+ lockdep_off(); -+ *a->errp = vfs_unlink(a->dir, d); -+ lockdep_on(); -+ if (!*a->errp) { -+ struct path tmp = { -+ .dentry = d->d_parent, -+ .mnt = a->path->mnt -+ }; -+ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ -+ } -+ -+ if (!stop_sillyrename) -+ dput(d); -+ if (h_inode) -+ iput(h_inode); -+ -+ AuTraceErr(*a->errp); -+} -+ -+/* -+ * @dir: must be locked. -+ * @dentry: target dentry. -+ */ -+int vfsub_unlink(struct inode *dir, struct path *path, int force) -+{ -+ int err; -+ struct unlink_args args = { -+ .errp = &err, -+ .dir = dir, -+ .path = path -+ }; -+ -+ if (!force) -+ call_unlink(&args); -+ else { -+ int wkq_err; -+ -+ wkq_err = au_wkq_wait(call_unlink, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } -+ -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/vfsub.h linux-2.6.31/fs/aufs/vfsub.h ---- linux-2.6.31-vanilla/fs/aufs/vfsub.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/vfsub.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,172 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * sub-routines for VFS -+ */ -+ -+#ifndef __AUFS_VFSUB_H__ -+#define __AUFS_VFSUB_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/fs.h> -+#include <linux/fs_stack.h> -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* lock subclass for lower inode */ -+/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ -+/* reduce? gave up. 
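The AuLsc_I_* values defined just below give each level of lower-inode lock its own lockdep subclass, documenting a fixed parent-before-child acquisition order. As a loose userspace analogue (pthread mutexes standing in for i_mutex; every name here is invented and none of it comes from the patch), the same discipline looks like this; build with -pthread:

#include <pthread.h>
#include <stdio.h>

struct node {
    pthread_mutex_t lock;
    const char *name;
};

/* always lock the parent before the child; the fixed order is what
 * prevents ABBA deadlocks, and the subclasses merely teach that
 * order to the lockdep validator */
static void lock_parent_then_child(struct node *parent, struct node *child)
{
    pthread_mutex_lock(&parent->lock);
    pthread_mutex_lock(&child->lock);
    printf("holding %s, then %s\n", parent->name, child->name);
    pthread_mutex_unlock(&child->lock);
    pthread_mutex_unlock(&parent->lock);
}

int main(void)
{
    struct node p = { PTHREAD_MUTEX_INITIALIZER, "parent" };
    struct node c = { PTHREAD_MUTEX_INITIALIZER, "child" };

    lock_parent_then_child(&p, &c);
    return 0;
}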
*/ -+enum { -+ AuLsc_I_Begin = I_MUTEX_QUOTA, /* 4 */ -+ AuLsc_I_PARENT, /* lower inode, parent first */ -+ AuLsc_I_PARENT2, /* copyup dirs */ -+ AuLsc_I_PARENT3, /* copyup wh */ -+ AuLsc_I_CHILD, -+ AuLsc_I_CHILD2, -+ AuLsc_I_End -+}; -+ -+/* to debug easier, do not make them inlined functions */ -+#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) -+#define IMustLock(i) MtxMustLock(&(i)->i_mutex) -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline void vfsub_copy_inode_size(struct inode *inode, -+ struct inode *h_inode) -+{ -+ spin_lock(&inode->i_lock); -+ fsstack_copy_inode_size(inode, h_inode); -+ spin_unlock(&inode->i_lock); -+} -+ -+int vfsub_update_h_iattr(struct path *h_path, int *did); -+struct file *vfsub_filp_open(const char *path, int oflags, int mode); -+struct file *vfsub_dentry_open(struct path *path, int flags, -+ const struct cred *cred); -+int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); -+struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, -+ int len); -+struct dentry *vfsub_lookup_hash(struct nameidata *nd); -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct au_hinode; -+struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, -+ struct dentry *d2, struct au_hinode *hdir2); -+void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, -+ struct dentry *d2, struct au_hinode *hdir2); -+ -+int vfsub_create(struct inode *dir, struct path *path, int mode); -+int vfsub_symlink(struct inode *dir, struct path *path, -+ const char *symname); -+int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); -+int vfsub_link(struct dentry *src_dentry, struct inode *dir, -+ struct path *path); -+int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, -+ struct inode *hdir, struct path *path); -+int vfsub_mkdir(struct inode *dir, struct path *path, int mode); -+int vfsub_rmdir(struct inode *dir, struct path *path); -+ -+int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); -+int vfsub_sio_rmdir(struct inode *dir, struct path *path); -+int vfsub_sio_notify_change(struct path *path, struct iattr *ia); -+int vfsub_notify_change(struct path *path, struct iattr *ia); -+int vfsub_unlink(struct inode *dir, struct path *path, int force); -+ -+/* ---------------------------------------------------------------------- */ -+ -+ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, -+ loff_t *ppos); -+ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, -+ loff_t *ppos); -+ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, -+ loff_t *ppos); -+ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, -+ loff_t *ppos); -+int vfsub_readdir(struct file *file, filldir_t filldir, void *arg); -+ -+long vfsub_splice_to(struct file *in, loff_t *ppos, -+ struct pipe_inode_info *pipe, size_t len, -+ unsigned int flags); -+long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, -+ loff_t *ppos, size_t len, unsigned int flags); -+int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, -+ struct file *h_file); -+ -+static inline void vfsub_file_accessed(struct file *h_file) -+{ -+ file_accessed(h_file); -+ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ -+} -+ -+static inline void vfsub_touch_atime(struct vfsmount *h_mnt, -+ struct dentry *h_dentry) -+{ -+ struct path h_path = { 
-+ .dentry = h_dentry, -+ .mnt = h_mnt -+ }; -+ touch_atime(h_mnt, h_dentry); -+ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) -+{ -+ loff_t err; -+ -+ lockdep_off(); -+ err = vfs_llseek(file, offset, origin); -+ lockdep_on(); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* dirty workaround for strict type of fmode_t */ -+union vfsub_fmu { -+ fmode_t fm; -+ unsigned int ui; -+}; -+ -+static inline unsigned int vfsub_fmode_to_uint(fmode_t fm) -+{ -+ union vfsub_fmu u = { -+ .fm = fm -+ }; -+ -+ BUILD_BUG_ON(sizeof(u.fm) != sizeof(u.ui)); -+ -+ return u.ui; -+} -+ -+static inline fmode_t vfsub_uint_to_fmode(unsigned int ui) -+{ -+ union vfsub_fmu u = { -+ .ui = ui -+ }; -+ -+ return u.fm; -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_VFSUB_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/wbr_policy.c linux-2.6.31/fs/aufs/wbr_policy.c ---- linux-2.6.31-vanilla/fs/aufs/wbr_policy.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/wbr_policy.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,641 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * policies for selecting one among multiple writable branches -+ */ -+ -+#include <linux/statfs.h> -+#include "aufs.h" -+ -+/* subset of cpup_attr() */ -+static noinline_for_stack -+int au_cpdown_attr(struct path *h_path, struct dentry *h_src) -+{ -+ int err, sbits; -+ struct iattr ia; -+ struct inode *h_isrc; -+ -+ h_isrc = h_src->d_inode; -+ ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; -+ ia.ia_mode = h_isrc->i_mode; -+ ia.ia_uid = h_isrc->i_uid; -+ ia.ia_gid = h_isrc->i_gid; -+ sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); -+ au_cpup_attr_flags(h_path->dentry->d_inode, h_isrc); -+ err = vfsub_sio_notify_change(h_path, &ia); -+ -+ /* is this nfs only? 
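The sbits re-application that follows exists because changing ownership clears the setuid/setgid bits, so a mode carrying them has to be applied a second time (the comment asks whether this is NFS-only). A hypothetical userspace equivalent — copy_attr is an invented helper, not an aufs function — does the same dance with chown(2) and chmod(2):

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int copy_attr(const char *path, const struct stat *src)
{
    /* chown(2) strips S_ISUID/S_ISGID as a side effect ... */
    if (chown(path, src->st_uid, src->st_gid))
        return -1;
    /* ... so restore the mode when those bits were set */
    if (src->st_mode & (S_ISUID | S_ISGID))
        return chmod(path, src->st_mode & 07777);
    return 0;
}

int main(int argc, char **argv)
{
    struct stat st;

    if (argc != 3 || stat(argv[1], &st))
        return 1;
    return copy_attr(argv[2], &st) ? 1 : 0;
}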
*/ -+ if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { -+ ia.ia_valid = ATTR_FORCE | ATTR_MODE; -+ ia.ia_mode = h_isrc->i_mode; -+ err = vfsub_sio_notify_change(h_path, &ia); -+ } -+ -+ return err; -+} -+ -+#define AuCpdown_PARENT_OPQ 1 -+#define AuCpdown_WHED (1 << 1) -+#define AuCpdown_MADE_DIR (1 << 2) -+#define AuCpdown_DIROPQ (1 << 3) -+#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) -+#define au_fset_cpdown(flags, name) { (flags) |= AuCpdown_##name; } -+#define au_fclr_cpdown(flags, name) { (flags) &= ~AuCpdown_##name; } -+ -+struct au_cpdown_dir_args { -+ struct dentry *parent; -+ unsigned int flags; -+}; -+ -+static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, -+ struct au_cpdown_dir_args *a) -+{ -+ int err; -+ struct dentry *opq_dentry; -+ -+ opq_dentry = au_diropq_create(dentry, bdst); -+ err = PTR_ERR(opq_dentry); -+ if (IS_ERR(opq_dentry)) -+ goto out; -+ dput(opq_dentry); -+ au_fset_cpdown(a->flags, DIROPQ); -+ -+ out: -+ return err; -+} -+ -+static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, -+ struct inode *dir, aufs_bindex_t bdst) -+{ -+ int err; -+ struct path h_path; -+ struct au_branch *br; -+ -+ br = au_sbr(dentry->d_sb, bdst); -+ h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); -+ err = PTR_ERR(h_path.dentry); -+ if (IS_ERR(h_path.dentry)) -+ goto out; -+ -+ err = 0; -+ if (h_path.dentry->d_inode) { -+ h_path.mnt = br->br_mnt; -+ err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, -+ dentry); -+ } -+ dput(h_path.dentry); -+ -+ out: -+ return err; -+} -+ -+static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, -+ struct dentry *h_parent, void *arg) -+{ -+ int err, rerr; -+ aufs_bindex_t bend, bopq, bstart; -+ unsigned char parent_opq; -+ struct path h_path; -+ struct dentry *parent; -+ struct inode *h_dir, *h_inode, *inode, *dir; -+ struct au_cpdown_dir_args *args = arg; -+ -+ bstart = au_dbstart(dentry); -+ /* dentry is di-locked */ -+ parent = dget_parent(dentry); -+ dir = parent->d_inode; -+ h_dir = h_parent->d_inode; -+ AuDebugOn(h_dir != au_h_iptr(dir, bdst)); -+ IMustLock(h_dir); -+ -+ err = au_lkup_neg(dentry, bdst); -+ if (unlikely(err < 0)) -+ goto out; -+ h_path.dentry = au_h_dptr(dentry, bdst); -+ h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); -+ err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, -+ S_IRWXU | S_IRUGO | S_IXUGO); -+ if (unlikely(err)) -+ goto out_put; -+ au_fset_cpdown(args->flags, MADE_DIR); -+ -+ bend = au_dbend(dentry); -+ bopq = au_dbdiropq(dentry); -+ au_fclr_cpdown(args->flags, WHED); -+ au_fclr_cpdown(args->flags, DIROPQ); -+ if (au_dbwh(dentry) == bdst) -+ au_fset_cpdown(args->flags, WHED); -+ if (!au_ftest_cpdown(args->flags, PARENT_OPQ) && bopq <= bdst) -+ au_fset_cpdown(args->flags, PARENT_OPQ); -+ parent_opq = (au_ftest_cpdown(args->flags, PARENT_OPQ) -+ && args->parent == dentry); -+ h_inode = h_path.dentry->d_inode; -+ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); -+ if (au_ftest_cpdown(args->flags, WHED)) { -+ err = au_cpdown_dir_opq(dentry, bdst, args); -+ if (unlikely(err)) { -+ mutex_unlock(&h_inode->i_mutex); -+ goto out_dir; -+ } -+ } -+ -+ err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart)); -+ mutex_unlock(&h_inode->i_mutex); -+ if (unlikely(err)) -+ goto out_opq; -+ -+ if (au_ftest_cpdown(args->flags, WHED)) { -+ err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); -+ if (unlikely(err)) -+ goto out_opq; -+ } -+ -+ inode = dentry->d_inode; -+ if (au_ibend(inode) < bdst) -+ au_set_ibend(inode, bdst); -+ 
au_set_h_iptr(inode, bdst, au_igrab(h_inode), -+ au_hi_flags(inode, /*isdir*/1)); -+ goto out; /* success */ -+ -+ /* revert */ -+ out_opq: -+ if (au_ftest_cpdown(args->flags, DIROPQ)) { -+ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); -+ rerr = au_diropq_remove(dentry, bdst); -+ mutex_unlock(&h_inode->i_mutex); -+ if (unlikely(rerr)) { -+ AuIOErr("failed removing diropq for %.*s b%d (%d)\n", -+ AuDLNPair(dentry), bdst, rerr); -+ err = -EIO; -+ goto out; -+ } -+ } -+ out_dir: -+ if (au_ftest_cpdown(args->flags, MADE_DIR)) { -+ rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); -+ if (unlikely(rerr)) { -+ AuIOErr("failed removing %.*s b%d (%d)\n", -+ AuDLNPair(dentry), bdst, rerr); -+ err = -EIO; -+ } -+ } -+ out_put: -+ au_set_h_dptr(dentry, bdst, NULL); -+ if (au_dbend(dentry) == bdst) -+ au_update_dbend(dentry); -+ out: -+ dput(parent); -+ return err; -+} -+ -+int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) -+{ -+ int err; -+ struct au_cpdown_dir_args args = { -+ .parent = dget_parent(dentry), -+ .flags = 0 -+ }; -+ -+ err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &args); -+ dput(args.parent); -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* policies for create */ -+ -+static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ for (; bindex >= 0; bindex--) -+ if (!au_br_rdonly(au_sbr(sb, bindex))) -+ return bindex; -+ return -EROFS; -+} -+ -+/* top down parent */ -+static int au_wbr_create_tdp(struct dentry *dentry, int isdir __maybe_unused) -+{ -+ int err; -+ aufs_bindex_t bstart, bindex; -+ struct super_block *sb; -+ struct dentry *parent, *h_parent; -+ -+ sb = dentry->d_sb; -+ bstart = au_dbstart(dentry); -+ err = bstart; -+ if (!au_br_rdonly(au_sbr(sb, bstart))) -+ goto out; -+ -+ err = -EROFS; -+ parent = dget_parent(dentry); -+ for (bindex = au_dbstart(parent); bindex < bstart; bindex++) { -+ h_parent = au_h_dptr(parent, bindex); -+ if (!h_parent || !h_parent->d_inode) -+ continue; -+ -+ if (!au_br_rdonly(au_sbr(sb, bindex))) { -+ err = bindex; -+ break; -+ } -+ } -+ dput(parent); -+ -+ /* bottom up here */ -+ if (unlikely(err < 0)) -+ err = au_wbr_bu(sb, bstart - 1); -+ -+ out: -+ AuDbg("b%d\n", err); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* an exception for the policy other than tdp */ -+static int au_wbr_create_exp(struct dentry *dentry) -+{ -+ int err; -+ aufs_bindex_t bwh, bdiropq; -+ struct dentry *parent; -+ -+ err = -1; -+ bwh = au_dbwh(dentry); -+ parent = dget_parent(dentry); -+ bdiropq = au_dbdiropq(parent); -+ if (bwh >= 0) { -+ if (bdiropq >= 0) -+ err = min(bdiropq, bwh); -+ else -+ err = bwh; -+ AuDbg("%d\n", err); -+ } else if (bdiropq >= 0) { -+ err = bdiropq; -+ AuDbg("%d\n", err); -+ } -+ dput(parent); -+ -+ if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) -+ err = -1; -+ -+ AuDbg("%d\n", err); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* round robin */ -+static int au_wbr_create_init_rr(struct super_block *sb) -+{ -+ int err; -+ -+ err = au_wbr_bu(sb, au_sbend(sb)); -+ atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ -+ /* smp_mb(); */ -+ -+ AuDbg("b%d\n", err); -+ return err; -+} -+ -+static int au_wbr_create_rr(struct dentry *dentry, int isdir) -+{ -+ int err, nbr; -+ unsigned int u; -+ aufs_bindex_t bindex, bend; -+ struct super_block *sb; -+ atomic_t *next; -+ -+ err = au_wbr_create_exp(dentry); -+ if 
(err >= 0) -+ goto out; -+ -+ sb = dentry->d_sb; -+ next = &au_sbi(sb)->si_wbr_rr_next; -+ bend = au_sbend(sb); -+ nbr = bend + 1; -+ for (bindex = 0; bindex <= bend; bindex++) { -+ if (!isdir) { -+ err = atomic_dec_return(next) + 1; -+ /* modulo for 0 is meaningless */ -+ if (unlikely(!err)) -+ err = atomic_dec_return(next) + 1; -+ } else -+ err = atomic_read(next); -+ AuDbg("%d\n", err); -+ u = err; -+ err = u % nbr; -+ AuDbg("%d\n", err); -+ if (!au_br_rdonly(au_sbr(sb, err))) -+ break; -+ err = -EROFS; -+ } -+ -+ out: -+ AuDbg("%d\n", err); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* most free space */ -+static void au_mfs(struct dentry *dentry) -+{ -+ struct super_block *sb; -+ struct au_branch *br; -+ struct au_wbr_mfs *mfs; -+ aufs_bindex_t bindex, bend; -+ int err; -+ unsigned long long b, bavail; -+ /* reduce the stack usage */ -+ struct kstatfs *st; -+ -+ st = kmalloc(sizeof(*st), GFP_NOFS); -+ if (unlikely(!st)) { -+ AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); -+ return; -+ } -+ -+ bavail = 0; -+ sb = dentry->d_sb; -+ mfs = &au_sbi(sb)->si_wbr_mfs; -+ MtxMustLock(&mfs->mfs_lock); -+ mfs->mfs_bindex = -EROFS; -+ mfs->mfsrr_bytes = 0; -+ bend = au_sbend(sb); -+ for (bindex = 0; bindex <= bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ if (au_br_rdonly(br)) -+ continue; -+ -+ /* sb->s_root for NFS is unreliable */ -+ err = vfs_statfs(br->br_mnt->mnt_root, st); -+ if (unlikely(err)) { -+ AuWarn1("failed statfs, b%d, %d\n", bindex, err); -+ continue; -+ } -+ -+ /* when the available size is equal, select the lower one */ -+ BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) -+ || sizeof(b) < sizeof(st->f_bsize)); -+ b = st->f_bavail * st->f_bsize; -+ br->br_wbr->wbr_bytes = b; -+ if (b >= bavail) { -+ bavail = b; -+ mfs->mfs_bindex = bindex; -+ mfs->mfs_jiffy = jiffies; -+ } -+ } -+ -+ mfs->mfsrr_bytes = bavail; -+ AuDbg("b%d\n", mfs->mfs_bindex); -+ kfree(st); -+} -+ -+static int au_wbr_create_mfs(struct dentry *dentry, int isdir __maybe_unused) -+{ -+ int err; -+ struct super_block *sb; -+ struct au_wbr_mfs *mfs; -+ -+ err = au_wbr_create_exp(dentry); -+ if (err >= 0) -+ goto out; -+ -+ sb = dentry->d_sb; -+ mfs = &au_sbi(sb)->si_wbr_mfs; -+ mutex_lock(&mfs->mfs_lock); -+ if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) -+ || mfs->mfs_bindex < 0 -+ || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) -+ au_mfs(dentry); -+ mutex_unlock(&mfs->mfs_lock); -+ err = mfs->mfs_bindex; -+ -+ out: -+ AuDbg("b%d\n", err); -+ return err; -+} -+ -+static int au_wbr_create_init_mfs(struct super_block *sb) -+{ -+ struct au_wbr_mfs *mfs; -+ -+ mfs = &au_sbi(sb)->si_wbr_mfs; -+ mutex_init(&mfs->mfs_lock); -+ mfs->mfs_jiffy = 0; -+ mfs->mfs_bindex = -EROFS; -+ -+ return 0; -+} -+ -+static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) -+{ -+ mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); -+ return 0; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* most free space and then round robin */ -+static int au_wbr_create_mfsrr(struct dentry *dentry, int isdir) -+{ -+ int err; -+ struct au_wbr_mfs *mfs; -+ -+ err = au_wbr_create_mfs(dentry, isdir); -+ if (err >= 0) { -+ mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; -+ mutex_lock(&mfs->mfs_lock); -+ if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) -+ err = au_wbr_create_rr(dentry, isdir); -+ mutex_unlock(&mfs->mfs_lock); -+ } -+ -+ AuDbg("b%d\n", err); -+ return err; -+} -+ -+static int au_wbr_create_init_mfsrr(struct super_block 
*sb) -+{ -+ int err; -+ -+ au_wbr_create_init_mfs(sb); /* ignore */ -+ err = au_wbr_create_init_rr(sb); -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* top down parent and most free space */ -+static int au_wbr_create_pmfs(struct dentry *dentry, int isdir) -+{ -+ int err, e2; -+ unsigned long long b; -+ aufs_bindex_t bindex, bstart, bend; -+ struct super_block *sb; -+ struct dentry *parent, *h_parent; -+ struct au_branch *br; -+ -+ err = au_wbr_create_tdp(dentry, isdir); -+ if (unlikely(err < 0)) -+ goto out; -+ parent = dget_parent(dentry); -+ bstart = au_dbstart(parent); -+ bend = au_dbtaildir(parent); -+ if (bstart == bend) -+ goto out_parent; /* success */ -+ -+ e2 = au_wbr_create_mfs(dentry, isdir); -+ if (e2 < 0) -+ goto out_parent; /* success */ -+ -+ /* when the available size is equal, select upper one */ -+ sb = dentry->d_sb; -+ br = au_sbr(sb, err); -+ b = br->br_wbr->wbr_bytes; -+ AuDbg("b%d, %llu\n", err, b); -+ -+ for (bindex = bstart; bindex <= bend; bindex++) { -+ h_parent = au_h_dptr(parent, bindex); -+ if (!h_parent || !h_parent->d_inode) -+ continue; -+ -+ br = au_sbr(sb, bindex); -+ if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { -+ b = br->br_wbr->wbr_bytes; -+ err = bindex; -+ AuDbg("b%d, %llu\n", err, b); -+ } -+ } -+ -+ out_parent: -+ dput(parent); -+ out: -+ AuDbg("b%d\n", err); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* policies for copyup */ -+ -+/* top down parent */ -+static int au_wbr_copyup_tdp(struct dentry *dentry) -+{ -+ return au_wbr_create_tdp(dentry, /*isdir, anything is ok*/0); -+} -+ -+/* bottom up parent */ -+static int au_wbr_copyup_bup(struct dentry *dentry) -+{ -+ int err; -+ aufs_bindex_t bindex, bstart; -+ struct dentry *parent, *h_parent; -+ struct super_block *sb; -+ -+ err = -EROFS; -+ sb = dentry->d_sb; -+ parent = dget_parent(dentry); -+ bstart = au_dbstart(parent); -+ for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) { -+ h_parent = au_h_dptr(parent, bindex); -+ if (!h_parent || !h_parent->d_inode) -+ continue; -+ -+ if (!au_br_rdonly(au_sbr(sb, bindex))) { -+ err = bindex; -+ break; -+ } -+ } -+ dput(parent); -+ -+ /* bottom up here */ -+ if (unlikely(err < 0)) -+ err = au_wbr_bu(sb, bstart - 1); -+ -+ AuDbg("b%d\n", err); -+ return err; -+} -+ -+/* bottom up */ -+static int au_wbr_copyup_bu(struct dentry *dentry) -+{ -+ int err; -+ -+ err = au_wbr_bu(dentry->d_sb, au_dbstart(dentry)); -+ -+ AuDbg("b%d\n", err); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { -+ [AuWbrCopyup_TDP] = { -+ .copyup = au_wbr_copyup_tdp -+ }, -+ [AuWbrCopyup_BUP] = { -+ .copyup = au_wbr_copyup_bup -+ }, -+ [AuWbrCopyup_BU] = { -+ .copyup = au_wbr_copyup_bu -+ } -+}; -+ -+struct au_wbr_create_operations au_wbr_create_ops[] = { -+ [AuWbrCreate_TDP] = { -+ .create = au_wbr_create_tdp -+ }, -+ [AuWbrCreate_RR] = { -+ .create = au_wbr_create_rr, -+ .init = au_wbr_create_init_rr -+ }, -+ [AuWbrCreate_MFS] = { -+ .create = au_wbr_create_mfs, -+ .init = au_wbr_create_init_mfs, -+ .fin = au_wbr_create_fin_mfs -+ }, -+ [AuWbrCreate_MFSV] = { -+ .create = au_wbr_create_mfs, -+ .init = au_wbr_create_init_mfs, -+ .fin = au_wbr_create_fin_mfs -+ }, -+ [AuWbrCreate_MFSRR] = { -+ .create = au_wbr_create_mfsrr, -+ .init = au_wbr_create_init_mfsrr, -+ .fin = au_wbr_create_fin_mfs -+ }, -+ [AuWbrCreate_MFSRRV] = { -+ 
.create = au_wbr_create_mfsrr, -+ .init = au_wbr_create_init_mfsrr, -+ .fin = au_wbr_create_fin_mfs -+ }, -+ [AuWbrCreate_PMFS] = { -+ .create = au_wbr_create_pmfs, -+ .init = au_wbr_create_init_mfs, -+ .fin = au_wbr_create_fin_mfs -+ }, -+ [AuWbrCreate_PMFSV] = { -+ .create = au_wbr_create_pmfs, -+ .init = au_wbr_create_init_mfs, -+ .fin = au_wbr_create_fin_mfs -+ } -+}; -diff -Nur linux-2.6.31-vanilla/fs/aufs/whout.c linux-2.6.31/fs/aufs/whout.c ---- linux-2.6.31-vanilla/fs/aufs/whout.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/whout.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,1052 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * whiteout for logical deletion and opaque directory -+ */ -+ -+#include <linux/fs.h> -+#include "aufs.h" -+ -+#define WH_MASK S_IRUGO -+ -+/* -+ * If a directory contains this file, then it is opaque. We start with the -+ * .wh. flag so that it is blocked by lookup. -+ */ -+static struct qstr diropq_name = { -+ .name = AUFS_WH_DIROPQ, -+ .len = sizeof(AUFS_WH_DIROPQ) - 1 -+}; -+ -+/* -+ * generate whiteout name, which is NOT terminated by NULL. -+ * @name: original d_name.name -+ * @len: original d_name.len -+ * @wh: whiteout qstr -+ * returns zero when succeeds, otherwise error. -+ * succeeded value as wh->name should be freed by kfree(). -+ */ -+int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) -+{ -+ char *p; -+ -+ if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) -+ return -ENAMETOOLONG; -+ -+ wh->len = name->len + AUFS_WH_PFX_LEN; -+ p = kmalloc(wh->len, GFP_NOFS); -+ wh->name = p; -+ if (p) { -+ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); -+ memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); -+ /* smp_mb(); */ -+ return 0; -+ } -+ return -ENOMEM; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * test if the @wh_name exists under @h_parent. -+ * @try_sio specifies the necessary of super-io. -+ */ -+int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, -+ struct au_branch *br, int try_sio) -+{ -+ int err; -+ struct dentry *wh_dentry; -+ struct inode *h_dir; -+ -+ h_dir = h_parent->d_inode; -+ if (!try_sio) -+ wh_dentry = au_lkup_one(wh_name, h_parent, br, /*nd*/NULL); -+ else -+ wh_dentry = au_sio_lkup_one(wh_name, h_parent, br); -+ err = PTR_ERR(wh_dentry); -+ if (IS_ERR(wh_dentry)) -+ goto out; -+ -+ err = 0; -+ if (!wh_dentry->d_inode) -+ goto out_wh; /* success */ -+ -+ err = 1; -+ if (S_ISREG(wh_dentry->d_inode->i_mode)) -+ goto out_wh; /* success */ -+ -+ err = -EIO; -+ AuIOErr("%.*s Invalid whiteout entry type 0%o.\n", -+ AuDLNPair(wh_dentry), wh_dentry->d_inode->i_mode); -+ -+ out_wh: -+ dput(wh_dentry); -+ out: -+ return err; -+} -+ -+/* -+ * test if the @h_dentry sets opaque or not. 
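Stripped to its essence, the naming rule implemented by au_wh_name_alloc() above prefixes the original name with ".wh." (AUFS_WH_PFX). A minimal userspace sketch — wh_name is a made-up helper, and unlike the kernel version it NUL-terminates the result:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define WH_PFX ".wh."
#define WH_PFX_LEN (sizeof(WH_PFX) - 1)

/* build the whiteout name for @name; caller frees the result */
static char *wh_name(const char *name)
{
    size_t len = strlen(name);
    char *p = malloc(WH_PFX_LEN + len + 1);

    if (!p)
        return NULL;
    memcpy(p, WH_PFX, WH_PFX_LEN);
    memcpy(p + WH_PFX_LEN, name, len + 1); /* keep the NUL here */
    return p;
}

int main(void)
{
    char *wh = wh_name("foo");

    if (!wh)
        return 1;
    printf("%s\n", wh); /* prints ".wh.foo" */
    free(wh);
    return 0;
}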
-+ */ -+int au_diropq_test(struct dentry *h_dentry, struct au_branch *br) -+{ -+ int err; -+ struct inode *h_dir; -+ -+ h_dir = h_dentry->d_inode; -+ err = au_wh_test(h_dentry, &diropq_name, br, -+ au_test_h_perm_sio(h_dir, MAY_EXEC)); -+ return err; -+} -+ -+/* -+ * returns a negative dentry whose name is unique and temporary. -+ */ -+struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, -+ struct qstr *prefix) -+{ -+#define HEX_LEN 4 -+ struct dentry *dentry; -+ int i; -+ char defname[AUFS_WH_PFX_LEN * 2 + DNAME_INLINE_LEN_MIN + 1 -+ + HEX_LEN + 1], *name, *p; -+ static unsigned short cnt; -+ struct qstr qs; -+ -+ name = defname; -+ qs.len = sizeof(defname) - DNAME_INLINE_LEN_MIN + prefix->len - 1; -+ if (unlikely(prefix->len > DNAME_INLINE_LEN_MIN)) { -+ dentry = ERR_PTR(-ENAMETOOLONG); -+ if (unlikely(qs.len >= PATH_MAX)) -+ goto out; -+ dentry = ERR_PTR(-ENOMEM); -+ name = kmalloc(qs.len + 1, GFP_NOFS); -+ if (unlikely(!name)) -+ goto out; -+ } -+ -+ /* doubly whiteout-ed */ -+ memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); -+ p = name + AUFS_WH_PFX_LEN * 2; -+ memcpy(p, prefix->name, prefix->len); -+ p += prefix->len; -+ *p++ = '.'; -+ AuDebugOn(name + qs.len + 1 - p <= HEX_LEN); -+ -+ qs.name = name; -+ for (i = 0; i < 3; i++) { -+ sprintf(p, "%.*d", HEX_LEN, cnt++); -+ dentry = au_sio_lkup_one(&qs, h_parent, br); -+ if (IS_ERR(dentry) || !dentry->d_inode) -+ goto out_name; -+ dput(dentry); -+ } -+ /* AuWarn("could not get random name\n"); */ -+ dentry = ERR_PTR(-EEXIST); -+ AuDbg("%.*s\n", AuLNPair(&qs)); -+ BUG(); -+ -+ out_name: -+ if (name != defname) -+ kfree(name); -+ out: -+ return dentry; -+#undef HEX_LEN -+} -+ -+/* -+ * rename the @h_dentry on @br to the whiteouted temporary name. -+ */ -+int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) -+{ -+ int err; -+ struct path h_path = { -+ .mnt = br->br_mnt -+ }; -+ struct inode *h_dir; -+ struct dentry *h_parent; -+ -+ h_parent = h_dentry->d_parent; /* dir inode is locked */ -+ h_dir = h_parent->d_inode; -+ IMustLock(h_dir); -+ -+ h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); -+ err = PTR_ERR(h_path.dentry); -+ if (IS_ERR(h_path.dentry)) -+ goto out; -+ -+ /* under the same dir, no need to lock_rename() */ -+ err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path); -+ AuTraceErr(err); -+ dput(h_path.dentry); -+ -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+/* -+ * functions for removing a whiteout -+ */ -+ -+static int do_unlink_wh(struct inode *h_dir, struct path *h_path) -+{ -+ int force; -+ -+ /* -+ * forces superio when the dir has a sticky bit. -+ * this may be a violation of unix fs semantics. 
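The sticky-bit test computed just below follows the classic rule for S_ISVTX directories: only the file owner, the directory owner, or root may delete an entry, which is why the code falls back to super-io there. A standalone sketch of that rule (sticky_blocks_unlink is an invented name):

#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static bool sticky_blocks_unlink(const struct stat *dir,
                                 const struct stat *file, uid_t fsuid)
{
    if (!(dir->st_mode & S_ISVTX))
        return false;
    /* in a sticky dir, only the file owner, dir owner or root may unlink */
    return fsuid != file->st_uid && fsuid != dir->st_uid && fsuid != 0;
}

int main(int argc, char **argv)
{
    struct stat d, f;

    if (argc != 3 || stat(argv[1], &d) || stat(argv[2], &f))
        return 1;
    puts(sticky_blocks_unlink(&d, &f, getuid()) ? "blocked" : "allowed");
    return 0;
}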
-+ */ -+ force = (h_dir->i_mode & S_ISVTX) -+ && h_path->dentry->d_inode->i_uid != current_fsuid(); -+ return vfsub_unlink(h_dir, h_path, force); -+} -+ -+int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, -+ struct dentry *dentry) -+{ -+ int err; -+ -+ err = do_unlink_wh(h_dir, h_path); -+ if (!err && dentry) -+ au_set_dbwh(dentry, -1); -+ -+ return err; -+} -+ -+static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, -+ struct au_branch *br) -+{ -+ int err; -+ struct path h_path = { -+ .mnt = br->br_mnt -+ }; -+ -+ err = 0; -+ h_path.dentry = au_lkup_one(wh, h_parent, br, /*nd*/NULL); -+ if (IS_ERR(h_path.dentry)) -+ err = PTR_ERR(h_path.dentry); -+ else { -+ if (h_path.dentry->d_inode -+ && S_ISREG(h_path.dentry->d_inode->i_mode)) -+ err = do_unlink_wh(h_parent->d_inode, &h_path); -+ dput(h_path.dentry); -+ } -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+/* -+ * initialize/clean whiteout for a branch -+ */ -+ -+static void au_wh_clean(struct inode *h_dir, struct path *whpath, -+ const int isdir) -+{ -+ int err; -+ -+ if (!whpath->dentry->d_inode) -+ return; -+ -+ err = mnt_want_write(whpath->mnt); -+ if (!err) { -+ if (isdir) -+ err = vfsub_rmdir(h_dir, whpath); -+ else -+ err = vfsub_unlink(h_dir, whpath, /*force*/0); -+ mnt_drop_write(whpath->mnt); -+ } -+ if (unlikely(err)) -+ AuWarn("failed removing %.*s (%d), ignored.\n", -+ AuDLNPair(whpath->dentry), err); -+} -+ -+static int test_linkable(struct dentry *h_root) -+{ -+ struct inode *h_dir = h_root->d_inode; -+ -+ if (h_dir->i_op->link) -+ return 0; -+ -+ AuErr("%.*s (%s) doesn't support link(2), use noplink and rw+nolwh\n", -+ AuDLNPair(h_root), au_sbtype(h_root->d_sb)); -+ return -ENOSYS; -+} -+ -+/* todo: should this mkdir be done in /sbin/mount.aufs helper? 
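au_whdir(), which follows, is essentially an idempotent mkdir: create the work directory if it is missing, accept an existing directory, and fail on anything else squatting on the name. A hedged userspace rendering (whdir is illustrative only):

#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>

static int whdir(const char *path, mode_t mode)
{
    struct stat st;

    if (mkdir(path, mode) == 0)
        return 0;          /* freshly created */
    if (errno == EEXIST && stat(path, &st) == 0 && S_ISDIR(st.st_mode))
        return 0;          /* already a directory: fine */
    return -1;             /* something else is in the way */
}

int main(int argc, char **argv)
{
    if (argc != 2)
        return 1;
    return whdir(argv[1], 0700) ? 1 : 0;
}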
*/ -+static int au_whdir(struct inode *h_dir, struct path *path) -+{ -+ int err; -+ -+ err = -EEXIST; -+ if (!path->dentry->d_inode) { -+ int mode = S_IRWXU; -+ -+ if (au_test_nfs(path->dentry->d_sb)) -+ mode |= S_IXUGO; -+ err = mnt_want_write(path->mnt); -+ if (!err) { -+ err = vfsub_mkdir(h_dir, path, mode); -+ mnt_drop_write(path->mnt); -+ } -+ } else if (S_ISDIR(path->dentry->d_inode->i_mode)) -+ err = 0; -+ else -+ AuErr("unknown %.*s exists\n", AuDLNPair(path->dentry)); -+ -+ return err; -+} -+ -+struct au_wh_base { -+ const struct qstr *name; -+ struct dentry *dentry; -+}; -+ -+static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], -+ struct path *h_path) -+{ -+ h_path->dentry = base[AuBrWh_BASE].dentry; -+ au_wh_clean(h_dir, h_path, /*isdir*/0); -+ h_path->dentry = base[AuBrWh_PLINK].dentry; -+ au_wh_clean(h_dir, h_path, /*isdir*/1); -+ h_path->dentry = base[AuBrWh_ORPH].dentry; -+ au_wh_clean(h_dir, h_path, /*isdir*/1); -+} -+ -+/* -+ * returns tri-state, -+ * minus: error, caller should print the message -+ * zero: success -+ * plus: error, caller should NOT print the message -+ */ -+static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, -+ int do_plink, struct au_wh_base base[], -+ struct path *h_path) -+{ -+ int err; -+ struct inode *h_dir; -+ -+ h_dir = h_root->d_inode; -+ h_path->dentry = base[AuBrWh_BASE].dentry; -+ au_wh_clean(h_dir, h_path, /*isdir*/0); -+ h_path->dentry = base[AuBrWh_PLINK].dentry; -+ if (do_plink) { -+ err = test_linkable(h_root); -+ if (unlikely(err)) { -+ err = 1; -+ goto out; -+ } -+ -+ err = au_whdir(h_dir, h_path); -+ if (unlikely(err)) -+ goto out; -+ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); -+ } else -+ au_wh_clean(h_dir, h_path, /*isdir*/1); -+ h_path->dentry = base[AuBrWh_ORPH].dentry; -+ err = au_whdir(h_dir, h_path); -+ if (unlikely(err)) -+ goto out; -+ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); -+ -+ out: -+ return err; -+} -+ -+/* -+ * for the moment, aufs supports branch filesystems which do not support -+ * link(2). in testing on FAT, which does not support i_op->setattr() fully -+ * either, copyup failed. eventually, such a filesystem will not be used as -+ * the writable branch. -+ * -+ * returns tri-state, see above. -+ */ -+static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, -+ int do_plink, struct au_wh_base base[], -+ struct path *h_path) -+{ -+ int err; -+ struct inode *h_dir; -+ -+ WbrWhMustWriteLock(wbr); -+ -+ err = test_linkable(h_root); -+ if (unlikely(err)) { -+ err = 1; -+ goto out; -+ } -+ -+ /* -+ * todo: should this create be done in /sbin/mount.aufs helper?
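The tri-state convention documented above for au_wh_init_rw_nolink() and au_wh_init_rw() — negative: failed, the caller reports; zero: success; positive: failed, but a message was already printed — is consumed like this. report() is illustrative and not part of the patch:

#include <stdio.h>

static void report(const char *what, int err)
{
    if (err < 0)
        fprintf(stderr, "%s failed (%d)\n", what, err);
    else if (err == 0)
        printf("%s ok\n", what);
    /* err > 0: the callee printed its own diagnostic; stay silent */
}

int main(void)
{
    report("wh_init", 0);   /* success */
    report("wh_init", -17); /* e.g. -EEXIST bubbled up, we report it */
    report("wh_init", 1);   /* failed, already reported by the callee */
    return 0;
}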
-+ */ -+ err = -EEXIST; -+ h_dir = h_root->d_inode; -+ if (!base[AuBrWh_BASE].dentry->d_inode) { -+ err = mnt_want_write(h_path->mnt); -+ if (!err) { -+ h_path->dentry = base[AuBrWh_BASE].dentry; -+ err = vfsub_create(h_dir, h_path, WH_MASK); -+ mnt_drop_write(h_path->mnt); -+ } -+ } else if (S_ISREG(base[AuBrWh_BASE].dentry->d_inode->i_mode)) -+ err = 0; -+ else -+ AuErr("unknown %.*s/%.*s exists\n", -+ AuDLNPair(h_root), AuDLNPair(base[AuBrWh_BASE].dentry)); -+ if (unlikely(err)) -+ goto out; -+ -+ h_path->dentry = base[AuBrWh_PLINK].dentry; -+ if (do_plink) { -+ err = au_whdir(h_dir, h_path); -+ if (unlikely(err)) -+ goto out; -+ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); -+ } else -+ au_wh_clean(h_dir, h_path, /*isdir*/1); -+ wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); -+ -+ h_path->dentry = base[AuBrWh_ORPH].dentry; -+ err = au_whdir(h_dir, h_path); -+ if (unlikely(err)) -+ goto out; -+ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); -+ -+ out: -+ return err; -+} -+ -+/* -+ * initialize the whiteout base file/dir for @br. -+ */ -+int au_wh_init(struct dentry *h_root, struct au_branch *br, -+ struct super_block *sb) -+{ -+ int err, i; -+ const unsigned char do_plink -+ = !!au_opt_test(au_mntflags(sb), PLINK); -+ struct path path = { -+ .mnt = br->br_mnt -+ }; -+ struct inode *h_dir; -+ struct au_wbr *wbr = br->br_wbr; -+ static const struct qstr base_name[] = { -+ [AuBrWh_BASE] = { -+ .name = AUFS_BASE_NAME, -+ .len = sizeof(AUFS_BASE_NAME) - 1 -+ }, -+ [AuBrWh_PLINK] = { -+ .name = AUFS_PLINKDIR_NAME, -+ .len = sizeof(AUFS_PLINKDIR_NAME) - 1 -+ }, -+ [AuBrWh_ORPH] = { -+ .name = AUFS_ORPHDIR_NAME, -+ .len = sizeof(AUFS_ORPHDIR_NAME) - 1 -+ } -+ }; -+ struct au_wh_base base[] = { -+ [AuBrWh_BASE] = { -+ .name = base_name + AuBrWh_BASE, -+ .dentry = NULL -+ }, -+ [AuBrWh_PLINK] = { -+ .name = base_name + AuBrWh_PLINK, -+ .dentry = NULL -+ }, -+ [AuBrWh_ORPH] = { -+ .name = base_name + AuBrWh_ORPH, -+ .dentry = NULL -+ } -+ }; -+ -+ if (wbr) -+ WbrWhMustWriteLock(wbr); -+ -+ h_dir = h_root->d_inode; -+ for (i = 0; i < AuBrWh_Last; i++) { -+ /* doubly whiteouted */ -+ struct dentry *d; -+ -+ d = au_wh_lkup(h_root, (void *)base[i].name, br); -+ err = PTR_ERR(d); -+ if (IS_ERR(d)) -+ goto out; -+ -+ base[i].dentry = d; -+ AuDebugOn(wbr -+ && wbr->wbr_wh[i] -+ && wbr->wbr_wh[i] != base[i].dentry); -+ } -+ -+ if (wbr) -+ for (i = 0; i < AuBrWh_Last; i++) { -+ dput(wbr->wbr_wh[i]); -+ wbr->wbr_wh[i] = NULL; -+ } -+ -+ err = 0; -+ -+ switch (br->br_perm) { -+ case AuBrPerm_RO: -+ case AuBrPerm_ROWH: -+ case AuBrPerm_RR: -+ case AuBrPerm_RRWH: -+ au_wh_init_ro(h_dir, base, &path); -+ break; -+ -+ case AuBrPerm_RWNoLinkWH: -+ err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); -+ if (err > 0) -+ goto out; -+ else if (err) -+ goto out_err; -+ break; -+ -+ case AuBrPerm_RW: -+ err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); -+ if (err > 0) -+ goto out; -+ else if (err) -+ goto out_err; -+ break; -+ -+ default: -+ BUG(); -+ } -+ goto out; /* success */ -+ -+ out_err: -+ AuErr("an error(%d) on the writable branch %.*s(%s)\n", -+ err, AuDLNPair(h_root), au_sbtype(h_root->d_sb)); -+ out: -+ for (i = 0; i < AuBrWh_Last; i++) -+ dput(base[i].dentry); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+/* -+ * whiteouts are all hard-linked usually. -+ * when its link count reaches a ceiling, we create a new whiteout base -+ * asynchronously. 
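Assuming plain POSIX calls may stand in for the VFS helpers, the link-count ceiling handling described here reduces to: hard-link the shared whiteout base, and only on EMLINK fall back to creating an independent whiteout file. The asynchronous re-initialization of the base (reinit_br_wh below) is left out of this sketch:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int link_or_create_wh(const char *base, const char *wh)
{
    int fd;

    if (link(base, wh) == 0)
        return 0;          /* the cheap case: one more hard link */
    if (errno != EMLINK)
        return -1;         /* any other failure is fatal here */
    /* the base hit its link ceiling; make a standalone whiteout
     * (the kernel code additionally kicks an async base re-init) */
    fd = open(wh, O_CREAT | O_EXCL | O_WRONLY, 0444);
    if (fd < 0)
        return -1;
    return close(fd);
}

int main(int argc, char **argv)
{
    return (argc == 3 && !link_or_create_wh(argv[1], argv[2])) ? 0 : 1;
}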
-+ */ -+ -+struct reinit_br_wh { -+ struct super_block *sb; -+ struct au_branch *br; -+}; -+ -+static void reinit_br_wh(void *arg) -+{ -+ int err; -+ aufs_bindex_t bindex; -+ struct path h_path; -+ struct reinit_br_wh *a = arg; -+ struct au_wbr *wbr; -+ struct inode *dir; -+ struct dentry *h_root; -+ struct au_hinode *hdir; -+ -+ err = 0; -+ wbr = a->br->br_wbr; -+ /* big aufs lock */ -+ si_noflush_write_lock(a->sb); -+ if (!au_br_writable(a->br->br_perm)) -+ goto out; -+ bindex = au_br_index(a->sb, a->br->br_id); -+ if (unlikely(bindex < 0)) -+ goto out; -+ -+ di_read_lock_parent(a->sb->s_root, AuLock_IR); -+ dir = a->sb->s_root->d_inode; -+ hdir = au_hi(dir, bindex); -+ h_root = au_h_dptr(a->sb->s_root, bindex); -+ -+ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); -+ wbr_wh_write_lock(wbr); -+ err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, -+ h_root, a->br); -+ if (!err) { -+ err = mnt_want_write(a->br->br_mnt); -+ if (!err) { -+ h_path.dentry = wbr->wbr_whbase; -+ h_path.mnt = a->br->br_mnt; -+ err = vfsub_unlink(hdir->hi_inode, &h_path, /*force*/0); -+ mnt_drop_write(a->br->br_mnt); -+ } -+ } else { -+ AuWarn("%.*s is moved, ignored\n", AuDLNPair(wbr->wbr_whbase)); -+ err = 0; -+ } -+ dput(wbr->wbr_whbase); -+ wbr->wbr_whbase = NULL; -+ if (!err) -+ err = au_wh_init(h_root, a->br, a->sb); -+ wbr_wh_write_unlock(wbr); -+ au_hin_imtx_unlock(hdir); -+ di_read_unlock(a->sb->s_root, AuLock_IR); -+ -+ out: -+ if (wbr) -+ atomic_dec(&wbr->wbr_wh_running); -+ atomic_dec(&a->br->br_count); -+ au_nwt_done(&au_sbi(a->sb)->si_nowait); -+ si_write_unlock(a->sb); -+ kfree(arg); -+ if (unlikely(err)) -+ AuIOErr("err %d\n", err); -+} -+ -+static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) -+{ -+ int do_dec, wkq_err; -+ struct reinit_br_wh *arg; -+ -+ do_dec = 1; -+ if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) -+ goto out; -+ -+ /* ignore ENOMEM */ -+ arg = kmalloc(sizeof(*arg), GFP_NOFS); -+ if (arg) { -+ /* -+ * dec(wh_running), kfree(arg) and dec(br_count) -+ * in reinit function -+ */ -+ arg->sb = sb; -+ arg->br = br; -+ atomic_inc(&br->br_count); -+ wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb); -+ if (unlikely(wkq_err)) { -+ atomic_dec(&br->br_wbr->wbr_wh_running); -+ atomic_dec(&br->br_count); -+ kfree(arg); -+ } -+ do_dec = 0; -+ } -+ -+ out: -+ if (do_dec) -+ atomic_dec(&br->br_wbr->wbr_wh_running); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * create the whiteout @wh. -+ */ -+static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, -+ struct dentry *wh) -+{ -+ int err; -+ struct path h_path = { -+ .dentry = wh -+ }; -+ struct au_branch *br; -+ struct au_wbr *wbr; -+ struct dentry *h_parent; -+ struct inode *h_dir; -+ -+ h_parent = wh->d_parent; /* dir inode is locked */ -+ h_dir = h_parent->d_inode; -+ IMustLock(h_dir); -+ -+ br = au_sbr(sb, bindex); -+ h_path.mnt = br->br_mnt; -+ wbr = br->br_wbr; -+ wbr_wh_read_lock(wbr); -+ if (wbr->wbr_whbase) { -+ err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path); -+ if (!err || err != -EMLINK) -+ goto out; -+ -+ /* link count full. re-initialize br_whbase. */ -+ kick_reinit_br_wh(sb, br); -+ } -+ -+ /* return this error in this context */ -+ err = vfsub_create(h_dir, &h_path, WH_MASK); -+ -+ out: -+ wbr_wh_read_unlock(wbr); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * create or remove the diropq. 
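As a reduced userspace analogue of do_diropq() below: a directory becomes opaque when it contains the marker ".wh..wh..opq" (AUFS_WH_DIROPQ), and creating or unlinking that marker toggles opacity. diropq_set is an invented name, and the real code hard-links the marker to the whiteout base rather than creating it directly:

#include <fcntl.h>
#include <unistd.h>

#define DIROPQ ".wh..wh..opq"

static int diropq_set(int dirfd, int create)
{
    int fd;

    if (!create)
        return unlinkat(dirfd, DIROPQ, 0);
    fd = openat(dirfd, DIROPQ, O_CREAT | O_EXCL | O_WRONLY, 0444);
    if (fd < 0)
        return -1;
    return close(fd);
}

int main(void)
{
    /* mark the current directory opaque, then clear it again */
    if (diropq_set(AT_FDCWD, 1))
        return 1;
    return diropq_set(AT_FDCWD, 0) ? 1 : 0;
}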
-+ */ -+static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, -+ unsigned int flags) -+{ -+ struct dentry *opq_dentry, *h_dentry; -+ struct super_block *sb; -+ struct au_branch *br; -+ int err; -+ -+ sb = dentry->d_sb; -+ br = au_sbr(sb, bindex); -+ h_dentry = au_h_dptr(dentry, bindex); -+ opq_dentry = au_lkup_one(&diropq_name, h_dentry, br, /*nd*/NULL); -+ if (IS_ERR(opq_dentry)) -+ goto out; -+ -+ if (au_ftest_diropq(flags, CREATE)) { -+ err = link_or_create_wh(sb, bindex, opq_dentry); -+ if (!err) { -+ au_set_dbdiropq(dentry, bindex); -+ goto out; /* success */ -+ } -+ } else { -+ struct path tmp = { -+ .dentry = opq_dentry, -+ .mnt = br->br_mnt -+ }; -+ err = do_unlink_wh(au_h_iptr(dentry->d_inode, bindex), &tmp); -+ if (!err) -+ au_set_dbdiropq(dentry, -1); -+ } -+ dput(opq_dentry); -+ opq_dentry = ERR_PTR(err); -+ -+ out: -+ return opq_dentry; -+} -+ -+struct do_diropq_args { -+ struct dentry **errp; -+ struct dentry *dentry; -+ aufs_bindex_t bindex; -+ unsigned int flags; -+}; -+ -+static void call_do_diropq(void *args) -+{ -+ struct do_diropq_args *a = args; -+ *a->errp = do_diropq(a->dentry, a->bindex, a->flags); -+} -+ -+struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, -+ unsigned int flags) -+{ -+ struct dentry *diropq, *h_dentry; -+ -+ h_dentry = au_h_dptr(dentry, bindex); -+ if (!au_test_h_perm_sio(h_dentry->d_inode, MAY_EXEC | MAY_WRITE)) -+ diropq = do_diropq(dentry, bindex, flags); -+ else { -+ int wkq_err; -+ struct do_diropq_args args = { -+ .errp = &diropq, -+ .dentry = dentry, -+ .bindex = bindex, -+ .flags = flags -+ }; -+ -+ wkq_err = au_wkq_wait(call_do_diropq, &args); -+ if (unlikely(wkq_err)) -+ diropq = ERR_PTR(wkq_err); -+ } -+ -+ return diropq; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * lookup whiteout dentry. -+ * @h_parent: lower parent dentry which must exist and be locked -+ * @base_name: name of dentry which will be whiteouted -+ * returns dentry for whiteout. -+ */ -+struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, -+ struct au_branch *br) -+{ -+ int err; -+ struct qstr wh_name; -+ struct dentry *wh_dentry; -+ -+ err = au_wh_name_alloc(&wh_name, base_name); -+ wh_dentry = ERR_PTR(err); -+ if (!err) { -+ wh_dentry = au_lkup_one(&wh_name, h_parent, br, /*nd*/NULL); -+ kfree(wh_name.name); -+ } -+ return wh_dentry; -+} -+ -+/* -+ * link/create a whiteout for @dentry on @bindex. -+ */ -+struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, -+ struct dentry *h_parent) -+{ -+ struct dentry *wh_dentry; -+ struct super_block *sb; -+ int err; -+ -+ sb = dentry->d_sb; -+ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); -+ if (!IS_ERR(wh_dentry) && !wh_dentry->d_inode) { -+ err = link_or_create_wh(sb, bindex, wh_dentry); -+ if (!err) -+ au_set_dbwh(dentry, bindex); -+ else { -+ dput(wh_dentry); -+ wh_dentry = ERR_PTR(err); -+ } -+ } -+ -+ return wh_dentry; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* Delete all whiteouts in this directory on branch bindex. 
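Stripped of the hash-list plumbing, del_wh_children() below rebuilds each recorded child's ".wh." name in one shared buffer and unlinks it. A minimal sketch under that simplification; the original's lookup-before-unlink of stale entries is collapsed here into tolerating ENOENT:

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <string.h>
#include <unistd.h>

static int del_wh_children(int dirfd, const char *const *names, int n)
{
    char buf[PATH_MAX] = ".wh.";
    const size_t pfx = strlen(buf);
    int i;

    for (i = 0; i < n; i++) {
        size_t len = strlen(names[i]);

        if (pfx + len >= sizeof(buf))
            return -1;     /* whiteout name too long */
        memcpy(buf + pfx, names[i], len + 1);
        if (unlinkat(dirfd, buf, 0) && errno != ENOENT)
            return -1;     /* a vanished entry is not an error */
    }
    return 0;
}

int main(int argc, char **argv)
{
    if (argc < 2)
        return 1;
    return del_wh_children(AT_FDCWD, (const char *const *)(argv + 1),
                           argc - 1) ? 1 : 0;
}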
*/ -+static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, -+ aufs_bindex_t bindex, struct au_branch *br) -+{ -+ int err; -+ unsigned long ul, n; -+ struct qstr wh_name; -+ char *p; -+ struct hlist_head *head; -+ struct au_vdir_wh *tpos; -+ struct hlist_node *pos; -+ struct au_vdir_destr *str; -+ -+ err = -ENOMEM; -+ p = __getname(); -+ wh_name.name = p; -+ if (unlikely(!wh_name.name)) -+ goto out; -+ -+ err = 0; -+ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); -+ p += AUFS_WH_PFX_LEN; -+ n = whlist->nh_num; -+ head = whlist->nh_head; -+ for (ul = 0; !err && ul < n; ul++, head++) { -+ hlist_for_each_entry(tpos, pos, head, wh_hash) { -+ if (tpos->wh_bindex != bindex) -+ continue; -+ -+ str = &tpos->wh_str; -+ if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { -+ memcpy(p, str->name, str->len); -+ wh_name.len = AUFS_WH_PFX_LEN + str->len; -+ err = unlink_wh_name(h_dentry, &wh_name, br); -+ if (!err) -+ continue; -+ break; -+ } -+ AuIOErr("whiteout name too long %.*s\n", -+ str->len, str->name); -+ err = -EIO; -+ break; -+ } -+ } -+ __putname(wh_name.name); -+ -+ out: -+ return err; -+} -+ -+struct del_wh_children_args { -+ int *errp; -+ struct dentry *h_dentry; -+ struct au_nhash *whlist; -+ aufs_bindex_t bindex; -+ struct au_branch *br; -+}; -+ -+static void call_del_wh_children(void *args) -+{ -+ struct del_wh_children_args *a = args; -+ *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) -+{ -+ struct au_whtmp_rmdir *whtmp; -+ int err; -+ unsigned int rdhash; -+ -+ SiMustAnyLock(sb); -+ -+ whtmp = kmalloc(sizeof(*whtmp), gfp); -+ if (unlikely(!whtmp)) { -+ whtmp = ERR_PTR(-ENOMEM); -+ goto out; -+ } -+ -+ whtmp->dir = NULL; -+ whtmp->wh_dentry = NULL; -+ /* no estimation for dir size */ -+ rdhash = au_sbi(sb)->si_rdhash; -+ if (!rdhash) -+ rdhash = AUFS_RDHASH_DEF; -+ err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); -+ if (unlikely(err)) { -+ kfree(whtmp); -+ whtmp = ERR_PTR(err); -+ } -+ -+ out: -+ return whtmp; -+} -+ -+void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) -+{ -+ dput(whtmp->wh_dentry); -+ iput(whtmp->dir); -+ au_nhash_wh_free(&whtmp->whlist); -+ kfree(whtmp); -+} -+ -+/* -+ * rmdir the whiteouted temporary named dir @h_dentry. -+ * @whlist: whiteouted children. -+ */ -+int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, -+ struct dentry *wh_dentry, struct au_nhash *whlist) -+{ -+ int err; -+ struct path h_tmp; -+ struct inode *wh_inode, *h_dir; -+ struct au_branch *br; -+ -+ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ -+ IMustLock(h_dir); -+ -+ br = au_sbr(dir->i_sb, bindex); -+ wh_inode = wh_dentry->d_inode; -+ mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD); -+ -+ /* -+ * someone else might change some whiteouts while we were sleeping. -+ * it means this whlist may have an obsoleted entry. 
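The surrounding au_whtmp_rmdir() is a two-phase removal: purge the whiteout children first, then rmdir the now-empty temporary directory. A skeletal userspace outline under that reading; whtmp_rmdir and the purge callback are invented for illustration:

#include <fcntl.h>
#include <unistd.h>

static int whtmp_rmdir(int parentfd, const char *tmpname,
                       int (*purge)(int dirfd))
{
    int err;
    int dirfd = openat(parentfd, tmpname, O_DIRECTORY | O_RDONLY);

    if (dirfd < 0)
        return -1;
    err = purge(dirfd);    /* entries may have vanished meanwhile */
    close(dirfd);
    if (!err)
        err = unlinkat(parentfd, tmpname, AT_REMOVEDIR);
    return err;
}

static int purge_none(int dirfd)
{
    (void)dirfd;           /* nothing recorded to delete */
    return 0;
}

int main(int argc, char **argv)
{
    return (argc == 2 && !whtmp_rmdir(AT_FDCWD, argv[1], purge_none))
        ? 0 : 1;
}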
-+ */ -+ if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) -+ err = del_wh_children(wh_dentry, whlist, bindex, br); -+ else { -+ int wkq_err; -+ struct del_wh_children_args args = { -+ .errp = &err, -+ .h_dentry = wh_dentry, -+ .whlist = whlist, -+ .bindex = bindex, -+ .br = br -+ }; -+ -+ wkq_err = au_wkq_wait(call_del_wh_children, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } -+ mutex_unlock(&wh_inode->i_mutex); -+ -+ if (!err) { -+ h_tmp.dentry = wh_dentry; -+ h_tmp.mnt = br->br_mnt; -+ err = vfsub_rmdir(h_dir, &h_tmp); -+ /* d_drop(h_dentry); */ -+ } -+ -+ if (!err) { -+ if (au_ibstart(dir) == bindex) { -+ au_cpup_attr_timesizes(dir); -+ drop_nlink(dir); -+ } -+ return 0; /* success */ -+ } -+ -+ AuWarn("failed removing %.*s(%d), ignored\n", -+ AuDLNPair(wh_dentry), err); -+ return err; -+} -+ -+static void call_rmdir_whtmp(void *args) -+{ -+ int err; -+ struct au_whtmp_rmdir *a = args; -+ struct super_block *sb; -+ struct dentry *h_parent; -+ struct inode *h_dir; -+ struct au_branch *br; -+ struct au_hinode *hdir; -+ -+ /* rmdir by nfsd may cause deadlock with this i_mutex */ -+ /* mutex_lock(&a->dir->i_mutex); */ -+ sb = a->dir->i_sb; -+ si_noflush_read_lock(sb); -+ err = au_test_ro(sb, a->bindex, NULL); -+ if (unlikely(err)) -+ goto out; -+ -+ err = -EIO; -+ br = au_sbr(sb, a->bindex); -+ ii_write_lock_parent(a->dir); -+ h_parent = dget_parent(a->wh_dentry); -+ h_dir = h_parent->d_inode; -+ hdir = au_hi(a->dir, a->bindex); -+ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); -+ err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, br); -+ if (!err) { -+ err = mnt_want_write(br->br_mnt); -+ if (!err) { -+ err = au_whtmp_rmdir(a->dir, a->bindex, a->wh_dentry, -+ &a->whlist); -+ mnt_drop_write(br->br_mnt); -+ } -+ } -+ au_hin_imtx_unlock(hdir); -+ dput(h_parent); -+ ii_write_unlock(a->dir); -+ -+ out: -+ /* mutex_unlock(&a->dir->i_mutex); */ -+ au_nwt_done(&au_sbi(sb)->si_nowait); -+ si_read_unlock(sb); -+ au_whtmp_rmdir_free(a); -+ if (unlikely(err)) -+ AuIOErr("err %d\n", err); -+} -+ -+void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, -+ struct dentry *wh_dentry, struct au_whtmp_rmdir *args) -+{ -+ int wkq_err; -+ -+ IMustLock(dir); -+ -+ /* all post-process will be done in do_rmdir_whtmp(). */ -+ args->dir = au_igrab(dir); -+ args->bindex = bindex; -+ args->wh_dentry = dget(wh_dentry); -+ wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, dir->i_sb); -+ if (unlikely(wkq_err)) { -+ AuWarn("rmdir error %.*s (%d), ignored\n", -+ AuDLNPair(wh_dentry), wkq_err); -+ au_whtmp_rmdir_free(args); -+ } -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/whout.h linux-2.6.31/fs/aufs/whout.h ---- linux-2.6.31-vanilla/fs/aufs/whout.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/whout.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,87 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * whiteout for logical deletion and opaque directory -+ */ -+ -+#ifndef __AUFS_WHOUT_H__ -+#define __AUFS_WHOUT_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/aufs_type.h> -+#include "dir.h" -+ -+/* whout.c */ -+int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); -+struct au_branch; -+int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, -+ struct au_branch *br, int try_sio); -+int au_diropq_test(struct dentry *h_dentry, struct au_branch *br); -+struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, -+ struct qstr *prefix); -+int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); -+int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, -+ struct dentry *dentry); -+int au_wh_init(struct dentry *h_parent, struct au_branch *br, -+ struct super_block *sb); -+ -+/* diropq flags */ -+#define AuDiropq_CREATE 1 -+#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) -+#define au_fset_diropq(flags, name) { (flags) |= AuDiropq_##name; } -+#define au_fclr_diropq(flags, name) { (flags) &= ~AuDiropq_##name; } -+ -+struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, -+ unsigned int flags); -+struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, -+ struct au_branch *br); -+struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, -+ struct dentry *h_parent); -+ -+/* real rmdir for the whiteout-ed dir */ -+struct au_whtmp_rmdir { -+ struct inode *dir; -+ aufs_bindex_t bindex; -+ struct dentry *wh_dentry; -+ struct au_nhash whlist; -+}; -+ -+struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); -+void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); -+int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, -+ struct dentry *wh_dentry, struct au_nhash *whlist); -+void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, -+ struct dentry *wh_dentry, struct au_whtmp_rmdir *args); -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline struct dentry *au_diropq_create(struct dentry *dentry, -+ aufs_bindex_t bindex) -+{ -+ return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); -+} -+ -+static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) -+{ -+ return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_WHOUT_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/wkq.c linux-2.6.31/fs/aufs/wkq.c ---- linux-2.6.31-vanilla/fs/aufs/wkq.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/wkq.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,259 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * workqueue for asynchronous/super-io operations -+ * todo: try new credential scheme -+ */ -+ -+#include <linux/module.h> -+#include "aufs.h" -+ -+/* internal workqueue named AUFS_WKQ_NAME */ -+static struct au_wkq { -+ struct workqueue_struct *q; -+ -+ /* balancing */ -+ atomic_t busy; -+} *au_wkq; -+ -+struct au_wkinfo { -+ struct work_struct wk; -+ struct super_block *sb; -+ -+ unsigned int flags; /* see wkq.h */ -+ -+ au_wkq_func_t func; -+ void *args; -+ -+ atomic_t *busyp; -+ struct completion *comp; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int enqueue(struct au_wkq *wkq, struct au_wkinfo *wkinfo) -+{ -+ wkinfo->busyp = &wkq->busy; -+ if (au_ftest_wkq(wkinfo->flags, WAIT)) -+ return !queue_work(wkq->q, &wkinfo->wk); -+ else -+ return !schedule_work(&wkinfo->wk); -+} -+ -+static void do_wkq(struct au_wkinfo *wkinfo) -+{ -+ unsigned int idle, n; -+ int i, idle_idx; -+ -+ while (1) { -+ if (au_ftest_wkq(wkinfo->flags, WAIT)) { -+ idle_idx = 0; -+ idle = UINT_MAX; -+ for (i = 0; i < aufs_nwkq; i++) { -+ n = atomic_inc_return(&au_wkq[i].busy); -+ if (n == 1 && !enqueue(au_wkq + i, wkinfo)) -+ return; /* success */ -+ -+ if (n < idle) { -+ idle_idx = i; -+ idle = n; -+ } -+ atomic_dec(&au_wkq[i].busy); -+ } -+ } else -+ idle_idx = aufs_nwkq; -+ -+ atomic_inc(&au_wkq[idle_idx].busy); -+ if (!enqueue(au_wkq + idle_idx, wkinfo)) -+ return; /* success */ -+ -+ /* impossible? */ -+ AuWarn1("failed to queue_work()\n"); -+ yield(); -+ } -+} -+ -+static void wkq_func(struct work_struct *wk) -+{ -+ struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk); -+ -+ wkinfo->func(wkinfo->args); -+ atomic_dec_return(wkinfo->busyp); -+ if (au_ftest_wkq(wkinfo->flags, WAIT)) -+ complete(wkinfo->comp); -+ else { -+ kobject_put(&au_sbi(wkinfo->sb)->si_kobj); -+ module_put(THIS_MODULE); -+ kfree(wkinfo); -+ } -+} -+ -+/* -+ * Since struct completion is large, try allocating it dynamically.
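The #ifdef pair that follows picks heap storage for the completion when kernel stacks are small, and cheap on-stack storage otherwise. The same pattern in plain C, with SMALL_STACKS standing in for CONFIG_4KSTACKS and a dummy struct completion; every name here is illustrative:

#include <stdlib.h>
#include <string.h>

struct completion { char state[192]; }; /* stand-in for the real thing */

#ifdef SMALL_STACKS
#define COMP_DECLARE(c) struct completion *c = NULL

static int comp_alloc(struct completion **c)
{
    *c = calloc(1, sizeof(**c));   /* too big for a small stack */
    return *c ? 0 : -1;
}

static void comp_free(struct completion *c)
{
    free(c);
}
#else
#define COMP_DECLARE(c) struct completion _##c, *c = &_##c

static int comp_alloc(struct completion **c)
{
    memset(*c, 0, sizeof(**c));    /* stack object, just initialize */
    return 0;
}

static void comp_free(struct completion *c)
{
    (void)c;                       /* nothing to release */
}
#endif

int main(void)
{
    COMP_DECLARE(comp);

    if (comp_alloc(&comp))
        return 1;
    /* ... hand comp to a worker and wait on it here ... */
    comp_free(comp);
    return 0;
}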
-+ */ -+#if defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) -+#define AuWkqCompDeclare(name) struct completion *comp = NULL -+ -+static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) -+{ -+ *comp = kmalloc(sizeof(**comp), GFP_NOFS); -+ if (*comp) { -+ init_completion(*comp); -+ wkinfo->comp = *comp; -+ return 0; -+ } -+ return -ENOMEM; -+} -+ -+static void au_wkq_comp_free(struct completion *comp) -+{ -+ kfree(comp); -+} -+ -+#else -+ -+/* no braces */ -+#define AuWkqCompDeclare(name) \ -+ DECLARE_COMPLETION_ONSTACK(_ ## name); \ -+ struct completion *comp = &_ ## name -+ -+static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) -+{ -+ wkinfo->comp = *comp; -+ return 0; -+} -+ -+static void au_wkq_comp_free(struct completion *comp __maybe_unused) -+{ -+ /* empty */ -+} -+#endif /* 4KSTACKS */ -+ -+static void au_wkq_run(struct au_wkinfo *wkinfo) -+{ -+ au_dbg_verify_kthread(); -+ INIT_WORK(&wkinfo->wk, wkq_func); -+ do_wkq(wkinfo); -+} -+ -+int au_wkq_wait(au_wkq_func_t func, void *args) -+{ -+ int err; -+ AuWkqCompDeclare(comp); -+ struct au_wkinfo wkinfo = { -+ .flags = AuWkq_WAIT, -+ .func = func, -+ .args = args -+ }; -+ -+ err = au_wkq_comp_alloc(&wkinfo, &comp); -+ if (!err) { -+ au_wkq_run(&wkinfo); -+ /* no timeout, no interrupt */ -+ wait_for_completion(wkinfo.comp); -+ au_wkq_comp_free(comp); -+ } -+ -+ return err; -+ -+} -+ -+int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb) -+{ -+ int err; -+ struct au_wkinfo *wkinfo; -+ -+ atomic_inc(&au_sbi(sb)->si_nowait.nw_len); -+ -+ /* -+ * wkq_func() must free this wkinfo. -+ * it highly depends upon the implementation of workqueue. -+ */ -+ err = 0; -+ wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); -+ if (wkinfo) { -+ wkinfo->sb = sb; -+ wkinfo->flags = !AuWkq_WAIT; -+ wkinfo->func = func; -+ wkinfo->args = args; -+ wkinfo->comp = NULL; -+ kobject_get(&au_sbi(sb)->si_kobj); -+ __module_get(THIS_MODULE); -+ -+ au_wkq_run(wkinfo); -+ } else { -+ err = -ENOMEM; -+ atomic_dec(&au_sbi(sb)->si_nowait.nw_len); -+ } -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+void au_nwt_init(struct au_nowait_tasks *nwt) -+{ -+ atomic_set(&nwt->nw_len, 0); -+ /* smp_mb();*/ /* atomic_set */ -+ init_waitqueue_head(&nwt->nw_wq); -+} -+ -+void au_wkq_fin(void) -+{ -+ int i; -+ -+ for (i = 0; i < aufs_nwkq; i++) -+ if (au_wkq[i].q && !IS_ERR(au_wkq[i].q)) -+ destroy_workqueue(au_wkq[i].q); -+ kfree(au_wkq); -+} -+ -+int __init au_wkq_init(void) -+{ -+ int err, i; -+ struct au_wkq *nowaitq; -+ -+ /* '+1' is for accounting of nowait queue */ -+ err = -ENOMEM; -+ au_wkq = kcalloc(aufs_nwkq + 1, sizeof(*au_wkq), GFP_NOFS); -+ if (unlikely(!au_wkq)) -+ goto out; -+ -+ err = 0; -+ for (i = 0; i < aufs_nwkq; i++) { -+ au_wkq[i].q = create_singlethread_workqueue(AUFS_WKQ_NAME); -+ if (au_wkq[i].q && !IS_ERR(au_wkq[i].q)) { -+ atomic_set(&au_wkq[i].busy, 0); -+ continue; -+ } -+ -+ err = PTR_ERR(au_wkq[i].q); -+ au_wkq_fin(); -+ goto out; -+ } -+ -+ /* nowait accounting */ -+ nowaitq = au_wkq + aufs_nwkq; -+ atomic_set(&nowaitq->busy, 0); -+ nowaitq->q = NULL; -+ /* smp_mb(); */ /* atomic_set */ -+ -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/aufs/wkq.h linux-2.6.31/fs/aufs/wkq.h ---- linux-2.6.31-vanilla/fs/aufs/wkq.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/wkq.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,82 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. 
Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * workqueue for asynchronous/super-io operations -+ * todo: try new credentials management scheme -+ */ -+ -+#ifndef __AUFS_WKQ_H__ -+#define __AUFS_WKQ_H__ -+ -+#ifdef __KERNEL__ -+ -+#include <linux/sched.h> -+#include <linux/wait.h> -+#include <linux/aufs_type.h> -+ -+struct super_block; -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * in the next operation, wait for the 'nowait' tasks in system-wide workqueue -+ */ -+struct au_nowait_tasks { -+ atomic_t nw_len; -+ wait_queue_head_t nw_wq; -+}; -+ -+/* ---------------------------------------------------------------------- */ -+ -+typedef void (*au_wkq_func_t)(void *args); -+ -+/* wkq flags */ -+#define AuWkq_WAIT 1 -+#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) -+#define au_fset_wkq(flags, name) { (flags) |= AuWkq_##name; } -+#define au_fclr_wkq(flags, name) { (flags) &= ~AuWkq_##name; } -+ -+/* wkq.c */ -+int au_wkq_wait(au_wkq_func_t func, void *args); -+int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb); -+void au_nwt_init(struct au_nowait_tasks *nwt); -+int __init au_wkq_init(void); -+void au_wkq_fin(void); -+ -+/* ---------------------------------------------------------------------- */ -+ -+static inline int au_test_wkq(struct task_struct *tsk) -+{ -+ return !tsk->mm && !strcmp(tsk->comm, AUFS_WKQ_NAME); -+} -+ -+static inline void au_nwt_done(struct au_nowait_tasks *nwt) -+{ -+ if (!atomic_dec_return(&nwt->nw_len)) -+ wake_up_all(&nwt->nw_wq); -+} -+ -+static inline int au_nwt_flush(struct au_nowait_tasks *nwt) -+{ -+ wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); -+ return 0; -+} -+ -+#endif /* __KERNEL__ */ -+#endif /* __AUFS_WKQ_H__ */ -diff -Nur linux-2.6.31-vanilla/fs/aufs/xino.c linux-2.6.31/fs/aufs/xino.c ---- linux-2.6.31-vanilla/fs/aufs/xino.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/fs/aufs/xino.c 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,1203 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
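/*
 * Illustrative sketch, not part of the patch: the calling convention for
 * au_wkq_wait() declared above takes a void callback plus an opaque args
 * pointer, so callers marshal their real arguments, including an error
 * pointer, into a stack struct. A hypothetical caller in that shape; the
 * demo_* names are invented, and the xino_fwrite() dispatch further below
 * uses exactly this pattern.
 */
struct demo_args {
	int *errp;
	int input;
};

static void demo_func(void *args)
{
	struct demo_args *a = args;

	*a->errp = a->input * 2; /* the real work, run by the aufsd thread */
}

static int demo_call(int input)
{
	int err = 0, wkq_err;
	struct demo_args args = {
		.errp = &err,
		.input = input
	};

	wkq_err = au_wkq_wait(demo_func, &args);
	if (wkq_err)
		err = wkq_err; /* queueing itself failed */
	return err;
}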
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * external inode number translation table and bitmap -+ */ -+ -+#include <linux/file.h> -+#include <linux/seq_file.h> -+#include <linux/uaccess.h> -+#include "aufs.h" -+ -+ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, -+ loff_t *pos) -+{ -+ ssize_t err; -+ mm_segment_t oldfs; -+ -+ oldfs = get_fs(); -+ set_fs(KERNEL_DS); -+ do { -+ /* todo: signal_pending? */ -+ err = func(file, (char __user *)buf, size, pos); -+ } while (err == -EAGAIN || err == -EINTR); -+ set_fs(oldfs); -+ -+#if 0 /* reserved for future use */ -+ if (err > 0) -+ fsnotify_access(file->f_dentry); -+#endif -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static ssize_t do_xino_fwrite(au_writef_t func, struct file *file, void *buf, -+ size_t size, loff_t *pos) -+{ -+ ssize_t err; -+ mm_segment_t oldfs; -+ -+ oldfs = get_fs(); -+ set_fs(KERNEL_DS); -+ lockdep_off(); -+ do { -+ /* todo: signal_pending? */ -+ err = func(file, (const char __user *)buf, size, pos); -+ } while (err == -EAGAIN || err == -EINTR); -+ lockdep_on(); -+ set_fs(oldfs); -+ -+#if 0 /* reserved for future use */ -+ if (err > 0) -+ fsnotify_modify(file->f_dentry); -+#endif -+ -+ return err; -+} -+ -+struct do_xino_fwrite_args { -+ ssize_t *errp; -+ au_writef_t func; -+ struct file *file; -+ void *buf; -+ size_t size; -+ loff_t *pos; -+}; -+ -+static void call_do_xino_fwrite(void *args) -+{ -+ struct do_xino_fwrite_args *a = args; -+ *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); -+} -+ -+ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, -+ loff_t *pos) -+{ -+ ssize_t err; -+ -+ /* todo: signal block and no wkq? */ -+ /* todo: new credential scheme */ -+ /* -+ * it breaks RLIMIT_FSIZE and normal user's limit, -+ * users should care about quota and real 'filesystem full.' -+ */ -+ if (!au_test_wkq(current)) { -+ int wkq_err; -+ struct do_xino_fwrite_args args = { -+ .errp = &err, -+ .func = func, -+ .file = file, -+ .buf = buf, -+ .size = size, -+ .pos = pos -+ }; -+ -+ wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); -+ if (unlikely(wkq_err)) -+ err = wkq_err; -+ } else -+ err = do_xino_fwrite(func, file, buf, size, pos); -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * create a new xinofile at the same place/path as @base_file. 
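/*
 * Illustrative sketch, not part of the patch: the get_fs()/set_fs(KERNEL_DS)
 * bracket in xino_fread() and do_xino_fwrite() above is the standard idiom
 * of this kernel era for feeding a kernel buffer to ->read()/->write(),
 * which expect __user pointers. The generic read form, with the retry and
 * fsnotify details of the real functions omitted; kernel_read_at() is an
 * invented name.
 */
static ssize_t kernel_read_at(struct file *file, void *buf, size_t size,
			      loff_t *pos)
{
	ssize_t err;
	mm_segment_t oldfs;

	oldfs = get_fs();
	set_fs(KERNEL_DS); /* the address-limit check now admits kernel pointers */
	err = file->f_op->read(file, (char __user *)buf, size, pos);
	set_fs(oldfs); /* always restore the previous limit */
	return err;
}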
-+ */ -+struct file *au_xino_create2(struct file *base_file, struct file *copy_src) -+{ -+ struct file *file; -+ struct dentry *base, *dentry, *parent; -+ struct inode *dir; -+ struct qstr *name; -+ int err; -+ struct path path; -+ -+ base = base_file->f_dentry; -+ parent = base->d_parent; /* dir inode is locked */ -+ dir = parent->d_inode; -+ IMustLock(dir); -+ -+ file = ERR_PTR(-EINVAL); -+ name = &base->d_name; -+ dentry = vfsub_lookup_one_len(name->name, parent, name->len); -+ if (IS_ERR(dentry)) { -+ file = (void *)dentry; -+ AuErr("%.*s lookup err %ld\n", AuLNPair(name), PTR_ERR(dentry)); -+ goto out; -+ } -+ -+ /* no need to mnt_want_write() since we call dentry_open() later */ -+ err = vfs_create(dir, dentry, S_IRUGO | S_IWUGO, NULL); -+ if (unlikely(err)) { -+ file = ERR_PTR(err); -+ AuErr("%.*s create err %d\n", AuLNPair(name), err); -+ goto out_dput; -+ } -+ -+ path.dentry = dentry; -+ path.mnt = base_file->f_vfsmnt; -+ path_get(&path); -+ file = vfsub_dentry_open(&path, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE, -+ current_cred()); -+ if (IS_ERR(file)) { -+ AuErr("%.*s open err %ld\n", AuLNPair(name), PTR_ERR(file)); -+ goto out_dput; -+ } -+ -+ err = vfsub_unlink(dir, &file->f_path, /*force*/0); -+ if (unlikely(err)) { -+ AuErr("%.*s unlink err %d\n", AuLNPair(name), err); -+ goto out_fput; -+ } -+ -+ if (copy_src) { -+ /* no one can touch copy_src xino */ -+ err = au_copy_file(file, copy_src, -+ i_size_read(copy_src->f_dentry->d_inode)); -+ if (unlikely(err)) { -+ AuErr("%.*s copy err %d\n", AuLNPair(name), err); -+ goto out_fput; -+ } -+ } -+ goto out_dput; /* success */ -+ -+ out_fput: -+ fput(file); -+ file = ERR_PTR(err); -+ out_dput: -+ dput(dentry); -+ out: -+ return file; -+} -+ -+struct au_xino_lock_dir { -+ struct au_hinode *hdir; -+ struct dentry *parent; -+ struct mutex *mtx; -+}; -+ -+static void au_xino_lock_dir(struct super_block *sb, struct file *xino, -+ struct au_xino_lock_dir *ldir) -+{ -+ aufs_bindex_t brid, bindex; -+ -+ ldir->hdir = NULL; -+ bindex = -1; -+ brid = au_xino_brid(sb); -+ if (brid >= 0) -+ bindex = au_br_index(sb, brid); -+ if (bindex >= 0) { -+ ldir->hdir = au_hi(sb->s_root->d_inode, bindex); -+ au_hin_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); -+ } else { -+ ldir->parent = dget_parent(xino->f_dentry); -+ ldir->mtx = &ldir->parent->d_inode->i_mutex; -+ mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT); -+ } -+} -+ -+static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) -+{ -+ if (ldir->hdir) -+ au_hin_imtx_unlock(ldir->hdir); -+ else { -+ mutex_unlock(ldir->mtx); -+ dput(ldir->parent); -+ } -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* truncate xino files asynchronously */ -+ -+int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) -+{ -+ int err; -+ aufs_bindex_t bi, bend; -+ struct au_branch *br; -+ struct file *new_xino, *file; -+ struct super_block *h_sb; -+ struct au_xino_lock_dir ldir; -+ -+ err = -EINVAL; -+ bend = au_sbend(sb); -+ if (unlikely(bindex < 0 || bend < bindex)) -+ goto out; -+ br = au_sbr(sb, bindex); -+ file = br->br_xino.xi_file; -+ if (!file) -+ goto out; -+ -+ au_xino_lock_dir(sb, file, &ldir); -+ /* mnt_want_write() is unnecessary here */ -+ new_xino = au_xino_create2(file, file); -+ au_xino_unlock_dir(&ldir); -+ err = PTR_ERR(new_xino); -+ if (IS_ERR(new_xino)) -+ goto out; -+ err = 0; -+ fput(file); -+ br->br_xino.xi_file = new_xino; -+ -+ h_sb = br->br_mnt->mnt_sb; -+ for (bi = 0; bi <= bend; bi++) { -+ if (unlikely(bi == bindex)) -+ continue; -+ br =
au_sbr(sb, bi); -+ if (br->br_mnt->mnt_sb != h_sb) -+ continue; -+ -+ fput(br->br_xino.xi_file); -+ br->br_xino.xi_file = new_xino; -+ get_file(new_xino); -+ } -+ -+ out: -+ return err; -+} -+ -+struct xino_do_trunc_args { -+ struct super_block *sb; -+ struct au_branch *br; -+}; -+ -+static void xino_do_trunc(void *_args) -+{ -+ struct xino_do_trunc_args *args = _args; -+ struct super_block *sb; -+ struct au_branch *br; -+ struct inode *dir; -+ int err; -+ aufs_bindex_t bindex; -+ -+ err = 0; -+ sb = args->sb; -+ dir = sb->s_root->d_inode; -+ br = args->br; -+ -+ si_noflush_write_lock(sb); -+ ii_read_lock_parent(dir); -+ bindex = au_br_index(sb, br->br_id); -+ err = au_xino_trunc(sb, bindex); -+ if (!err -+ && br->br_xino.xi_file->f_dentry->d_inode->i_blocks -+ >= br->br_xino_upper) -+ br->br_xino_upper += AUFS_XINO_TRUNC_STEP; -+ -+ ii_read_unlock(dir); -+ if (unlikely(err)) -+ AuWarn("err b%d, (%d)\n", bindex, err); -+ atomic_dec(&br->br_xino_running); -+ atomic_dec(&br->br_count); -+ au_nwt_done(&au_sbi(sb)->si_nowait); -+ si_write_unlock(sb); -+ kfree(args); -+} -+ -+static void xino_try_trunc(struct super_block *sb, struct au_branch *br) -+{ -+ struct xino_do_trunc_args *args; -+ int wkq_err; -+ -+ if (br->br_xino.xi_file->f_dentry->d_inode->i_blocks -+ < br->br_xino_upper) -+ return; -+ -+ if (atomic_inc_return(&br->br_xino_running) > 1) -+ goto out; -+ -+ /* lock and kfree() will be called in trunc_xino() */ -+ args = kmalloc(sizeof(*args), GFP_NOFS); -+ if (unlikely(!args)) { -+ AuErr1("no memory\n"); -+ goto out_args; -+ } -+ -+ atomic_inc_return(&br->br_count); -+ args->sb = sb; -+ args->br = br; -+ wkq_err = au_wkq_nowait(xino_do_trunc, args, sb); -+ if (!wkq_err) -+ return; /* success */ -+ -+ AuErr("wkq %d\n", wkq_err); -+ atomic_dec_return(&br->br_count); -+ -+ out_args: -+ kfree(args); -+ out: -+ atomic_dec_return(&br->br_xino_running); -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+static int au_xino_do_write(au_writef_t write, struct file *file, -+ ino_t h_ino, ino_t ino) -+{ -+ loff_t pos; -+ ssize_t sz; -+ -+ pos = h_ino; -+ if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { -+ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); -+ return -EFBIG; -+ } -+ pos *= sizeof(ino); -+ sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); -+ if (sz == sizeof(ino)) -+ return 0; /* success */ -+ -+ AuIOErr("write failed (%zd)\n", sz); -+ return -EIO; -+} -+ -+/* -+ * write @ino to the xinofile for the specified branch{@sb, @bindex} -+ * at the position of @h_ino. -+ * even if @ino is zero, it is written to the xinofile and means no entry. -+ * if the size of the xino file on a specific filesystem exceeds the watermark, -+ * try truncating it. 
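/*
 * Illustrative sketch, not part of the patch: the arithmetic in
 * au_xino_do_write() above fixes the whole on-disk layout of a xino table,
 * a flat array of ino_t slots indexed by the host inode number, where a
 * zero slot means "no entry" (see au_xino_write()'s comment above). With
 * an 8-byte ino_t, host inode 1000 lives at byte offset 8000. A
 * slot-offset helper in that spirit; demo_xino_slot() is an invented name.
 */
static int demo_xino_slot(ino_t h_ino, loff_t *pos)
{
	/* the same overflow guard as au_xino_do_write() */
	if (au_loff_max / sizeof(ino_t) - 1 < (loff_t)h_ino)
		return -EFBIG;

	*pos = (loff_t)h_ino * sizeof(ino_t); /* one fixed slot per host inode */
	return 0;
}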
-+ */ -+int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ ino_t ino) -+{ -+ int err; -+ unsigned int mnt_flags; -+ struct au_branch *br; -+ -+ BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) -+ || ((loff_t)-1) > 0); -+ SiMustAnyLock(sb); -+ -+ mnt_flags = au_mntflags(sb); -+ if (!au_opt_test(mnt_flags, XINO)) -+ return 0; -+ -+ br = au_sbr(sb, bindex); -+ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, -+ h_ino, ino); -+ if (!err) { -+ if (au_opt_test(mnt_flags, TRUNC_XINO) -+ && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) -+ xino_try_trunc(sb, br); -+ return 0; /* success */ -+ } -+ -+ AuIOErr("write failed (%d)\n", err); -+ return -EIO; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* aufs inode number bitmap */ -+ -+static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; -+static ino_t xib_calc_ino(unsigned long pindex, int bit) -+{ -+ ino_t ino; -+ -+ AuDebugOn(bit < 0 || page_bits <= bit); -+ ino = AUFS_FIRST_INO + pindex * page_bits + bit; -+ return ino; -+} -+ -+static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) -+{ -+ AuDebugOn(ino < AUFS_FIRST_INO); -+ ino -= AUFS_FIRST_INO; -+ *pindex = ino / page_bits; -+ *bit = ino % page_bits; -+} -+ -+static int xib_pindex(struct super_block *sb, unsigned long pindex) -+{ -+ int err; -+ loff_t pos; -+ ssize_t sz; -+ struct au_sbinfo *sbinfo; -+ struct file *xib; -+ unsigned long *p; -+ -+ sbinfo = au_sbi(sb); -+ MtxMustLock(&sbinfo->si_xib_mtx); -+ AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE -+ || !au_opt_test(sbinfo->si_mntflags, XINO)); -+ -+ if (pindex == sbinfo->si_xib_last_pindex) -+ return 0; -+ -+ xib = sbinfo->si_xib; -+ p = sbinfo->si_xib_buf; -+ pos = sbinfo->si_xib_last_pindex; -+ pos *= PAGE_SIZE; -+ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); -+ if (unlikely(sz != PAGE_SIZE)) -+ goto out; -+ -+ pos = pindex; -+ pos *= PAGE_SIZE; -+ if (i_size_read(xib->f_dentry->d_inode) >= pos + PAGE_SIZE) -+ sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); -+ else { -+ memset(p, 0, PAGE_SIZE); -+ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); -+ } -+ if (sz == PAGE_SIZE) { -+ sbinfo->si_xib_last_pindex = pindex; -+ return 0; /* success */ -+ } -+ -+ out: -+ AuIOErr1("write failed (%zd)\n", sz); -+ err = sz; -+ if (sz >= 0) -+ err = -EIO; -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_xino_write0(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ ino_t ino) -+{ -+ int err, bit; -+ unsigned long pindex; -+ struct au_sbinfo *sbinfo; -+ -+ if (!au_opt_test(au_mntflags(sb), XINO)) -+ return 0; -+ -+ err = 0; -+ if (ino) { -+ sbinfo = au_sbi(sb); -+ xib_calc_bit(ino, &pindex, &bit); -+ AuDebugOn(page_bits <= bit); -+ mutex_lock(&sbinfo->si_xib_mtx); -+ err = xib_pindex(sb, pindex); -+ if (!err) { -+ clear_bit(bit, sbinfo->si_xib_buf); -+ sbinfo->si_xib_next_bit = bit; -+ } -+ mutex_unlock(&sbinfo->si_xib_mtx); -+ } -+ -+ if (!err) -+ err = au_xino_write(sb, bindex, h_ino, 0); -+ return err; -+} -+ -+/* get an unused inode number from bitmap */ -+ino_t au_xino_new_ino(struct super_block *sb) -+{ -+ ino_t ino; -+ unsigned long *p, pindex, ul, pend; -+ struct au_sbinfo *sbinfo; -+ struct file *file; -+ int free_bit, err; -+ -+ if (!au_opt_test(au_mntflags(sb), XINO)) -+ return iunique(sb, AUFS_FIRST_INO); -+ -+ sbinfo = au_sbi(sb); -+ mutex_lock(&sbinfo->si_xib_mtx); -+ p = sbinfo->si_xib_buf; -+ free_bit = 
sbinfo->si_xib_next_bit; -+ if (free_bit < page_bits && !test_bit(free_bit, p)) -+ goto out; /* success */ -+ free_bit = find_first_zero_bit(p, page_bits); -+ if (free_bit < page_bits) -+ goto out; /* success */ -+ -+ pindex = sbinfo->si_xib_last_pindex; -+ for (ul = pindex - 1; ul < ULONG_MAX; ul--) { -+ err = xib_pindex(sb, ul); -+ if (unlikely(err)) -+ goto out_err; -+ free_bit = find_first_zero_bit(p, page_bits); -+ if (free_bit < page_bits) -+ goto out; /* success */ -+ } -+ -+ file = sbinfo->si_xib; -+ pend = i_size_read(file->f_dentry->d_inode) / PAGE_SIZE; -+ for (ul = pindex + 1; ul <= pend; ul++) { -+ err = xib_pindex(sb, ul); -+ if (unlikely(err)) -+ goto out_err; -+ free_bit = find_first_zero_bit(p, page_bits); -+ if (free_bit < page_bits) -+ goto out; /* success */ -+ } -+ BUG(); -+ -+ out: -+ set_bit(free_bit, p); -+ sbinfo->si_xib_next_bit++; -+ pindex = sbinfo->si_xib_last_pindex; -+ mutex_unlock(&sbinfo->si_xib_mtx); -+ ino = xib_calc_ino(pindex, free_bit); -+ AuDbg("i%lu\n", (unsigned long)ino); -+ return ino; -+ out_err: -+ mutex_unlock(&sbinfo->si_xib_mtx); -+ AuDbg("i0\n"); -+ return 0; -+} -+ -+/* -+ * read @ino from xinofile for the specified branch{@sb, @bindex} -+ * at the position of @h_ino. -+ * if @ino does not exist and @do_new is true, get new one. -+ */ -+int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, -+ ino_t *ino) -+{ -+ int err; -+ ssize_t sz; -+ loff_t pos; -+ struct file *file; -+ struct au_sbinfo *sbinfo; -+ -+ *ino = 0; -+ if (!au_opt_test(au_mntflags(sb), XINO)) -+ return 0; /* no xino */ -+ -+ err = 0; -+ sbinfo = au_sbi(sb); -+ pos = h_ino; -+ if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { -+ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); -+ return -EFBIG; -+ } -+ pos *= sizeof(*ino); -+ -+ file = au_sbr(sb, bindex)->br_xino.xi_file; -+ if (i_size_read(file->f_dentry->d_inode) < pos + sizeof(*ino)) -+ return 0; /* no ino */ -+ -+ sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); -+ if (sz == sizeof(*ino)) -+ return 0; /* success */ -+ -+ err = sz; -+ if (unlikely(sz >= 0)) { -+ err = -EIO; -+ AuIOErr("xino read error (%zd)\n", sz); -+ } -+ -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* create and set a new xino file */ -+ -+struct file *au_xino_create(struct super_block *sb, char *fname, int silent) -+{ -+ struct file *file; -+ struct dentry *h_parent, *d; -+ struct inode *h_dir; -+ int err; -+ -+ /* -+ * at mount-time, and the xino file is the default path, -+ * hinotify is disabled so we have no inotify events to ignore. -+ * when a user specified the xino, we cannot get au_hdir to be ignored. 
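/*
 * Worked example, not part of the patch: assuming 4 KiB pages, so that
 * page_bits = 4096 * 8 = 32768, the bitmap mapping above places aufs
 * inode 40000 as
 *
 *	40000 - AUFS_FIRST_INO (11) = 39989
 *	pindex = 39989 / 32768 = 1
 *	bit    = 39989 % 32768 = 7221
 *
 * and, in the other direction, xib_calc_ino(1, 7221) = 11 + 32768 + 7221
 * = 40000 again. The two helpers are exact inverses, which is what lets
 * au_xino_new_ino() hand numbers out of the bitmap and au_xino_write0()
 * return them later.
 */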
-+ */ -+ file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE, -+ S_IRUGO | S_IWUGO); -+ if (IS_ERR(file)) { -+ if (!silent) -+ AuErr("open %s(%ld)\n", fname, PTR_ERR(file)); -+ return file; -+ } -+ -+ /* keep file count */ -+ h_parent = dget_parent(file->f_dentry); -+ h_dir = h_parent->d_inode; -+ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); -+ /* mnt_want_write() is unnecessary here */ -+ err = vfsub_unlink(h_dir, &file->f_path, /*force*/0); -+ mutex_unlock(&h_dir->i_mutex); -+ dput(h_parent); -+ if (unlikely(err)) { -+ if (!silent) -+ AuErr("unlink %s(%d)\n", fname, err); -+ goto out; -+ } -+ -+ err = -EINVAL; -+ d = file->f_dentry; -+ if (unlikely(sb == d->d_sb)) { -+ if (!silent) -+ AuErr("%s must be outside\n", fname); -+ goto out; -+ } -+ if (unlikely(au_test_fs_bad_xino(d->d_sb))) { -+ if (!silent) -+ AuErr("xino doesn't support %s(%s)\n", -+ fname, au_sbtype(d->d_sb)); -+ goto out; -+ } -+ return file; /* success */ -+ -+ out: -+ fput(file); -+ file = ERR_PTR(err); -+ return file; -+} -+ -+/* -+ * find another branch who is on the same filesystem of the specified -+ * branch{@btgt}. search until @bend. -+ */ -+static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, -+ aufs_bindex_t bend) -+{ -+ aufs_bindex_t bindex; -+ struct super_block *tgt_sb = au_sbr_sb(sb, btgt); -+ -+ for (bindex = 0; bindex < btgt; bindex++) -+ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) -+ return bindex; -+ for (bindex++; bindex <= bend; bindex++) -+ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) -+ return bindex; -+ return -1; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * initialize the xinofile for the specified branch @br -+ * at the place/path where @base_file indicates. -+ * test whether another branch is on the same filesystem or not, -+ * if @do_test is true. 
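/*
 * Illustrative note, not part of the patch: au_xino_create() above and
 * au_xino_create2() earlier both lean on the classic create-open-unlink
 * idiom; once a struct file holds a reference, the inode and its blocks
 * outlive the unlink, but the file never again appears in the namespace.
 * The userspace equivalent, for comparison, using the default xino path
 * that aufs_type.h defines further below:
 *
 *	int fd = open("/tmp/.aufs.xino", O_RDWR | O_CREAT | O_EXCL, 0666);
 *	if (fd >= 0)
 *		unlink("/tmp/.aufs.xino");  the data stays reachable via fd until close()
 */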
-+ */ -+int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, -+ struct file *base_file, int do_test) -+{ -+ int err; -+ ino_t ino; -+ aufs_bindex_t bend, bindex; -+ struct au_branch *shared_br, *b; -+ struct file *file; -+ struct super_block *tgt_sb; -+ -+ shared_br = NULL; -+ bend = au_sbend(sb); -+ if (do_test) { -+ tgt_sb = br->br_mnt->mnt_sb; -+ for (bindex = 0; bindex <= bend; bindex++) { -+ b = au_sbr(sb, bindex); -+ if (tgt_sb == b->br_mnt->mnt_sb) { -+ shared_br = b; -+ break; -+ } -+ } -+ } -+ -+ if (!shared_br || !shared_br->br_xino.xi_file) { -+ struct au_xino_lock_dir ldir; -+ -+ au_xino_lock_dir(sb, base_file, &ldir); -+ /* mnt_want_write() is unnecessary here */ -+ file = au_xino_create2(base_file, NULL); -+ au_xino_unlock_dir(&ldir); -+ err = PTR_ERR(file); -+ if (IS_ERR(file)) -+ goto out; -+ br->br_xino.xi_file = file; -+ } else { -+ br->br_xino.xi_file = shared_br->br_xino.xi_file; -+ get_file(br->br_xino.xi_file); -+ } -+ -+ ino = AUFS_ROOT_INO; -+ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, -+ h_ino, ino); -+ if (!err) -+ return 0; /* success */ -+ -+ -+ out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* truncate a xino bitmap file */ -+ -+/* todo: slow */ -+static int do_xib_restore(struct super_block *sb, struct file *file, void *page) -+{ -+ int err, bit; -+ ssize_t sz; -+ unsigned long pindex; -+ loff_t pos, pend; -+ struct au_sbinfo *sbinfo; -+ au_readf_t func; -+ ino_t *ino; -+ unsigned long *p; -+ -+ err = 0; -+ sbinfo = au_sbi(sb); -+ MtxMustLock(&sbinfo->si_xib_mtx); -+ p = sbinfo->si_xib_buf; -+ func = sbinfo->si_xread; -+ pend = i_size_read(file->f_dentry->d_inode); -+ pos = 0; -+ while (pos < pend) { -+ sz = xino_fread(func, file, page, PAGE_SIZE, &pos); -+ err = sz; -+ if (unlikely(sz <= 0)) -+ goto out; -+ -+ err = 0; -+ for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { -+ if (unlikely(*ino < AUFS_FIRST_INO)) -+ continue; -+ -+ xib_calc_bit(*ino, &pindex, &bit); -+ AuDebugOn(page_bits <= bit); -+ err = xib_pindex(sb, pindex); -+ if (!err) -+ set_bit(bit, p); -+ else -+ goto out; -+ } -+ } -+ -+ out: -+ return err; -+} -+ -+static int xib_restore(struct super_block *sb) -+{ -+ int err; -+ aufs_bindex_t bindex, bend; -+ void *page; -+ -+ err = -ENOMEM; -+ page = (void *)__get_free_page(GFP_NOFS); -+ if (unlikely(!page)) -+ goto out; -+ -+ err = 0; -+ bend = au_sbend(sb); -+ for (bindex = 0; !err && bindex <= bend; bindex++) -+ if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) -+ err = do_xib_restore -+ (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); -+ else -+ AuDbg("b%d\n", bindex); -+ free_page((unsigned long)page); -+ -+ out: -+ return err; -+} -+ -+int au_xib_trunc(struct super_block *sb) -+{ -+ int err; -+ ssize_t sz; -+ loff_t pos; -+ struct au_xino_lock_dir ldir; -+ struct au_sbinfo *sbinfo; -+ unsigned long *p; -+ struct file *file; -+ -+ SiMustWriteLock(sb); -+ -+ err = 0; -+ sbinfo = au_sbi(sb); -+ if (!au_opt_test(sbinfo->si_mntflags, XINO)) -+ goto out; -+ -+ file = sbinfo->si_xib; -+ if (i_size_read(file->f_dentry->d_inode) <= PAGE_SIZE) -+ goto out; -+ -+ au_xino_lock_dir(sb, file, &ldir); -+ /* mnt_want_write() is unnecessary here */ -+ file = au_xino_create2(sbinfo->si_xib, NULL); -+ au_xino_unlock_dir(&ldir); -+ err = PTR_ERR(file); -+ if (IS_ERR(file)) -+ goto out; -+ fput(sbinfo->si_xib); -+ sbinfo->si_xib = file; -+ -+ p = sbinfo->si_xib_buf; -+ memset(p, 0, PAGE_SIZE); -+ pos = 0; -+ sz = xino_fwrite(sbinfo->si_xwrite,
sbinfo->si_xib, p, PAGE_SIZE, &pos); -+ if (unlikely(sz != PAGE_SIZE)) { -+ err = sz; -+ AuIOErr("err %d\n", err); -+ if (sz >= 0) -+ err = -EIO; -+ goto out; -+ } -+ -+ mutex_lock(&sbinfo->si_xib_mtx); -+ /* mnt_want_write() is unnecessary here */ -+ err = xib_restore(sb); -+ mutex_unlock(&sbinfo->si_xib_mtx); -+ -+out: -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * xino mount option handlers -+ */ -+static au_readf_t find_readf(struct file *h_file) -+{ -+ const struct file_operations *fop = h_file->f_op; -+ -+ if (fop) { -+ if (fop->read) -+ return fop->read; -+ if (fop->aio_read) -+ return do_sync_read; -+ } -+ return ERR_PTR(-ENOSYS); -+} -+ -+static au_writef_t find_writef(struct file *h_file) -+{ -+ const struct file_operations *fop = h_file->f_op; -+ -+ if (fop) { -+ if (fop->write) -+ return fop->write; -+ if (fop->aio_write) -+ return do_sync_write; -+ } -+ return ERR_PTR(-ENOSYS); -+} -+ -+/* xino bitmap */ -+static void xino_clear_xib(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ -+ SiMustWriteLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ sbinfo->si_xread = NULL; -+ sbinfo->si_xwrite = NULL; -+ if (sbinfo->si_xib) -+ fput(sbinfo->si_xib); -+ sbinfo->si_xib = NULL; -+ free_page((unsigned long)sbinfo->si_xib_buf); -+ sbinfo->si_xib_buf = NULL; -+} -+ -+static int au_xino_set_xib(struct super_block *sb, struct file *base) -+{ -+ int err; -+ loff_t pos; -+ struct au_sbinfo *sbinfo; -+ struct file *file; -+ -+ SiMustWriteLock(sb); -+ -+ sbinfo = au_sbi(sb); -+ file = au_xino_create2(base, sbinfo->si_xib); -+ err = PTR_ERR(file); -+ if (IS_ERR(file)) -+ goto out; -+ if (sbinfo->si_xib) -+ fput(sbinfo->si_xib); -+ sbinfo->si_xib = file; -+ sbinfo->si_xread = find_readf(file); -+ sbinfo->si_xwrite = find_writef(file); -+ -+ err = -ENOMEM; -+ if (!sbinfo->si_xib_buf) -+ sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); -+ if (unlikely(!sbinfo->si_xib_buf)) -+ goto out_unset; -+ -+ sbinfo->si_xib_last_pindex = 0; -+ sbinfo->si_xib_next_bit = 0; -+ if (i_size_read(file->f_dentry->d_inode) < PAGE_SIZE) { -+ pos = 0; -+ err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, -+ PAGE_SIZE, &pos); -+ if (unlikely(err != PAGE_SIZE)) -+ goto out_free; -+ } -+ err = 0; -+ goto out; /* success */ -+ -+ out_free: -+ free_page((unsigned long)sbinfo->si_xib_buf); -+ sbinfo->si_xib_buf = NULL; -+ if (err >= 0) -+ err = -EIO; -+ out_unset: -+ fput(sbinfo->si_xib); -+ sbinfo->si_xib = NULL; -+ sbinfo->si_xread = NULL; -+ sbinfo->si_xwrite = NULL; -+ out: -+ return err; -+} -+ -+/* xino for each branch */ -+static void xino_clear_br(struct super_block *sb) -+{ -+ aufs_bindex_t bindex, bend; -+ struct au_branch *br; -+ -+ bend = au_sbend(sb); -+ for (bindex = 0; bindex <= bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ if (!br || !br->br_xino.xi_file) -+ continue; -+ -+ fput(br->br_xino.xi_file); -+ br->br_xino.xi_file = NULL; -+ } -+} -+ -+static int au_xino_set_br(struct super_block *sb, struct file *base) -+{ -+ int err; -+ ino_t ino; -+ aufs_bindex_t bindex, bend, bshared; -+ struct { -+ struct file *old, *new; -+ } *fpair, *p; -+ struct au_branch *br; -+ struct inode *inode; -+ au_writef_t writef; -+ -+ SiMustWriteLock(sb); -+ -+ err = -ENOMEM; -+ bend = au_sbend(sb); -+ fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS); -+ if (unlikely(!fpair)) -+ goto out; -+ -+ inode = sb->s_root->d_inode; -+ ino = AUFS_ROOT_INO; -+ writef = au_sbi(sb)->si_xwrite; -+ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) 
{ -+ br = au_sbr(sb, bindex); -+ bshared = is_sb_shared(sb, bindex, bindex - 1); -+ if (bshared >= 0) { -+ /* shared xino */ -+ *p = fpair[bshared]; -+ get_file(p->new); -+ } -+ -+ if (!p->new) { -+ /* new xino */ -+ p->old = br->br_xino.xi_file; -+ p->new = au_xino_create2(base, br->br_xino.xi_file); -+ err = PTR_ERR(p->new); -+ if (IS_ERR(p->new)) { -+ p->new = NULL; -+ goto out_pair; -+ } -+ } -+ -+ err = au_xino_do_write(writef, p->new, -+ au_h_iptr(inode, bindex)->i_ino, ino); -+ if (unlikely(err)) -+ goto out_pair; -+ } -+ -+ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { -+ br = au_sbr(sb, bindex); -+ if (br->br_xino.xi_file) -+ fput(br->br_xino.xi_file); -+ get_file(p->new); -+ br->br_xino.xi_file = p->new; -+ } -+ -+ out_pair: -+ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) -+ if (p->new) -+ fput(p->new); -+ else -+ break; -+ kfree(fpair); -+ out: -+ return err; -+} -+ -+void au_xino_clr(struct super_block *sb) -+{ -+ struct au_sbinfo *sbinfo; -+ -+ au_xigen_clr(sb); -+ xino_clear_xib(sb); -+ xino_clear_br(sb); -+ sbinfo = au_sbi(sb); -+ /* lvalue, do not call au_mntflags() */ -+ au_opt_clr(sbinfo->si_mntflags, XINO); -+} -+ -+int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) -+{ -+ int err, skip; -+ struct dentry *parent, *cur_parent; -+ struct qstr *dname, *cur_name; -+ struct file *cur_xino; -+ struct inode *dir; -+ struct au_sbinfo *sbinfo; -+ -+ SiMustWriteLock(sb); -+ -+ err = 0; -+ sbinfo = au_sbi(sb); -+ parent = dget_parent(xino->file->f_dentry); -+ if (remount) { -+ skip = 0; -+ dname = &xino->file->f_dentry->d_name; -+ cur_xino = sbinfo->si_xib; -+ if (cur_xino) { -+ cur_parent = dget_parent(cur_xino->f_dentry); -+ cur_name = &cur_xino->f_dentry->d_name; -+ skip = (cur_parent == parent -+ && dname->len == cur_name->len -+ && !memcmp(dname->name, cur_name->name, -+ dname->len)); -+ dput(cur_parent); -+ } -+ if (skip) -+ goto out; -+ } -+ -+ au_opt_set(sbinfo->si_mntflags, XINO); -+ dir = parent->d_inode; -+ mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT); -+ /* mnt_want_write() is unnecessary here */ -+ err = au_xino_set_xib(sb, xino->file); -+ if (!err) -+ err = au_xigen_set(sb, xino->file); -+ if (!err) -+ err = au_xino_set_br(sb, xino->file); -+ mutex_unlock(&dir->i_mutex); -+ if (!err) -+ goto out; /* success */ -+ -+ /* reset all */ -+ AuIOErr("failed creating xino(%d).\n", err); -+ -+ out: -+ dput(parent); -+ return err; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* -+ * create a xinofile at the default place/path. 
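/*
 * Illustrative note, not part of the patch: on remount, au_xino_set()
 * above decides whether the requested xino file is the one already in use
 * without building any pathname. Two files count as the same path exactly
 * when they share the parent dentry and carry byte-identical names,
 *
 *	skip = cur_parent == parent
 *	       && dname->len == cur_name->len
 *	       && !memcmp(dname->name, cur_name->name, dname->len);
 *
 * which is cheaper than a d_path()-style string reconstruction and needs
 * only the dentry references the function already holds.
 */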
-+ */ -+struct file *au_xino_def(struct super_block *sb) -+{ -+ struct file *file; -+ char *page, *p; -+ struct au_branch *br; -+ struct super_block *h_sb; -+ struct path path; -+ aufs_bindex_t bend, bindex, bwr; -+ -+ br = NULL; -+ bend = au_sbend(sb); -+ bwr = -1; -+ for (bindex = 0; bindex <= bend; bindex++) { -+ br = au_sbr(sb, bindex); -+ if (au_br_writable(br->br_perm) -+ && !au_test_fs_bad_xino(br->br_mnt->mnt_sb)) { -+ bwr = bindex; -+ break; -+ } -+ } -+ -+ if (bwr >= 0) { -+ file = ERR_PTR(-ENOMEM); -+ page = __getname(); -+ if (unlikely(!page)) -+ goto out; -+ path.mnt = br->br_mnt; -+ path.dentry = au_h_dptr(sb->s_root, bwr); -+ p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); -+ file = (void *)p; -+ if (!IS_ERR(p)) { -+ strcat(p, "/" AUFS_XINO_FNAME); -+ AuDbg("%s\n", p); -+ file = au_xino_create(sb, p, /*silent*/0); -+ if (!IS_ERR(file)) -+ au_xino_brid_set(sb, br->br_id); -+ } -+ __putname(page); -+ } else { -+ file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); -+ if (IS_ERR(file)) -+ goto out; -+ h_sb = file->f_dentry->d_sb; -+ if (unlikely(au_test_fs_bad_xino(h_sb))) { -+ AuErr("xino doesn't support %s(%s)\n", -+ AUFS_XINO_DEFPATH, au_sbtype(h_sb)); -+ fput(file); -+ file = ERR_PTR(-EINVAL); -+ } -+ if (!IS_ERR(file)) -+ au_xino_brid_set(sb, -1); -+ } -+ -+ out: -+ return file; -+} -+ -+/* ---------------------------------------------------------------------- */ -+ -+int au_xino_path(struct seq_file *seq, struct file *file) -+{ -+ int err; -+ -+ err = au_seq_path(seq, &file->f_path); -+ if (unlikely(err < 0)) -+ goto out; -+ -+ err = 0; -+#define Deleted "\040(deleted)" -+ seq->count -= sizeof(Deleted) - 1; -+ AuDebugOn(memcmp(seq->buf + seq->count, Deleted, -+ sizeof(Deleted) - 1)); -+#undef Deleted -+ -+ out: -+ return err; -+} -diff -Nur linux-2.6.31-vanilla/fs/Kconfig linux-2.6.31/fs/Kconfig ---- linux-2.6.31-vanilla/fs/Kconfig 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/fs/Kconfig 2009-09-16 13:55:56.000000000 +0200 -@@ -187,6 +187,7 @@ - source "fs/ufs/Kconfig" - source "fs/exofs/Kconfig" - source "fs/nilfs2/Kconfig" -+source "fs/aufs/Kconfig" - - endif # MISC_FILESYSTEMS - -diff -Nur linux-2.6.31-vanilla/fs/Makefile linux-2.6.31/fs/Makefile ---- linux-2.6.31-vanilla/fs/Makefile 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/fs/Makefile 2009-09-16 13:55:56.000000000 +0200 -@@ -124,3 +124,4 @@ - obj-$(CONFIG_BTRFS_FS) += btrfs/ - obj-$(CONFIG_GFS2_FS) += gfs2/ - obj-$(CONFIG_EXOFS_FS) += exofs/ -+obj-$(CONFIG_AUFS_FS) += aufs/ -diff -Nur linux-2.6.31-vanilla/fs/namei.c linux-2.6.31/fs/namei.c ---- linux-2.6.31-vanilla/fs/namei.c 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/fs/namei.c 2009-09-16 13:55:49.000000000 +0200 -@@ -337,6 +337,7 @@ - - return 0; - } -+EXPORT_SYMBOL(deny_write_access); - - /** - * path_get - get a reference to a path -@@ -1219,7 +1220,7 @@ - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. 
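/*
 * Illustrative note, not part of the patch: the seq->count rewind in
 * au_xino_path() above works because every xino file is unlinked right
 * after creation, so au_seq_path() renders it with a " (deleted)" suffix.
 * The rewind chops exactly that suffix:
 *
 *	"\040(deleted)" = ' ' plus "(deleted)" = 1 + 9 = 10 characters
 *	sizeof("\040(deleted)") - 1 = 11 - 1 = 10
 *
 * and the AuDebugOn() verifies, in debug builds only, that the rewound
 * bytes really were that suffix.
 */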
- */ --static struct dentry *lookup_hash(struct nameidata *nd) -+struct dentry *lookup_hash(struct nameidata *nd) - { - int err; - -@@ -1228,8 +1229,9 @@ - return ERR_PTR(err); - return __lookup_hash(&nd->last, nd->path.dentry, nd); - } -+EXPORT_SYMBOL(lookup_hash); - --static int __lookup_one_len(const char *name, struct qstr *this, -+int __lookup_one_len(const char *name, struct qstr *this, - struct dentry *base, int len) - { - unsigned long hash; -@@ -1250,6 +1252,7 @@ - this->hash = end_name_hash(hash); - return 0; - } -+EXPORT_SYMBOL(__lookup_one_len); - - /** - * lookup_one_len - filesystem helper to lookup single pathname component -diff -Nur linux-2.6.31-vanilla/fs/namespace.c linux-2.6.31/fs/namespace.c ---- linux-2.6.31-vanilla/fs/namespace.c 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/fs/namespace.c 2009-09-16 13:55:49.000000000 +0200 -@@ -39,6 +39,7 @@ - - /* spinlock for vfsmount related operations, inplace of dcache_lock */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); -+EXPORT_SYMBOL(vfsmount_lock); - - static int event; - static DEFINE_IDA(mnt_id_ida); -diff -Nur linux-2.6.31-vanilla/fs/open.c linux-2.6.31/fs/open.c ---- linux-2.6.31-vanilla/fs/open.c 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/fs/open.c 2009-09-16 13:55:49.000000000 +0200 -@@ -221,6 +221,7 @@ - mutex_unlock(&dentry->d_inode->i_mutex); - return err; - } -+EXPORT_SYMBOL(do_truncate); - - static long do_sys_truncate(const char __user *pathname, loff_t length) - { -diff -Nur linux-2.6.31-vanilla/fs/splice.c linux-2.6.31/fs/splice.c ---- linux-2.6.31-vanilla/fs/splice.c 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/fs/splice.c 2009-09-16 13:55:49.000000000 +0200 -@@ -1057,8 +1057,8 @@ - /* - * Attempt to initiate a splice from pipe to file. - */ --static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, -- loff_t *ppos, size_t len, unsigned int flags) -+long do_splice_from(struct pipe_inode_info *pipe, struct file *out, -+ loff_t *ppos, size_t len, unsigned int flags) - { - ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, - loff_t *, size_t, unsigned int); -@@ -1080,13 +1080,14 @@ - - return splice_write(pipe, out, ppos, len, flags); - } -+EXPORT_SYMBOL(do_splice_from); - - /* - * Attempt to initiate a splice from a file to a pipe. - */ --static long do_splice_to(struct file *in, loff_t *ppos, -- struct pipe_inode_info *pipe, size_t len, -- unsigned int flags) -+long do_splice_to(struct file *in, loff_t *ppos, -+ struct pipe_inode_info *pipe, size_t len, -+ unsigned int flags) - { - ssize_t (*splice_read)(struct file *, loff_t *, - struct pipe_inode_info *, size_t, unsigned int); -@@ -1105,6 +1106,7 @@ - - return splice_read(in, ppos, pipe, len, flags); - } -+EXPORT_SYMBOL(do_splice_to); - - /** - * splice_direct_to_actor - splices data directly between two non-pipes -diff -Nur linux-2.6.31-vanilla/include/linux/aufs_type.h linux-2.6.31/include/linux/aufs_type.h ---- linux-2.6.31-vanilla/include/linux/aufs_type.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/include/linux/aufs_type.h 2009-09-16 13:55:30.000000000 +0200 -@@ -0,0 +1,184 @@ -+/* -+ * Copyright (C) 2005-2009 Junjiro R. Okajima -+ * -+ * This program, aufs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef __AUFS_TYPE_H__ -+#define __AUFS_TYPE_H__ -+ -+#include <linux/ioctl.h> -+#include <linux/types.h> -+ -+#define AUFS_VERSION "2-standalone.tree-20090914" -+ -+/* todo? move this to linux-2.6.19/include/magic.h */ -+#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') -+ -+/* ---------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_AUFS_BRANCH_MAX_127 -+typedef __s8 aufs_bindex_t; -+#define AUFS_BRANCH_MAX 127 -+#else -+typedef __s16 aufs_bindex_t; -+#ifdef CONFIG_AUFS_BRANCH_MAX_511 -+#define AUFS_BRANCH_MAX 511 -+#elif defined(CONFIG_AUFS_BRANCH_MAX_1023) -+#define AUFS_BRANCH_MAX 1023 -+#elif defined(CONFIG_AUFS_BRANCH_MAX_32767) -+#define AUFS_BRANCH_MAX 32767 -+#endif -+#endif -+ -+#ifdef __KERNEL__ -+#ifndef AUFS_BRANCH_MAX -+#error unknown CONFIG_AUFS_BRANCH_MAX value -+#endif -+#endif /* __KERNEL__ */ -+ -+/* ---------------------------------------------------------------------- */ -+ -+#define AUFS_NAME "aufs" -+#define AUFS_FSTYPE AUFS_NAME -+ -+#define AUFS_ROOT_INO 2 -+#define AUFS_FIRST_INO 11 -+ -+#define AUFS_WH_PFX ".wh." -+#define AUFS_WH_PFX_LEN ((int)sizeof(AUFS_WH_PFX) - 1) -+#define AUFS_XINO_FNAME "." AUFS_NAME ".xino" -+#define AUFS_XINO_DEFPATH "/tmp/" AUFS_XINO_FNAME -+#define AUFS_XINO_TRUNC_INIT 64 /* blocks */ -+#define AUFS_XINO_TRUNC_STEP 4 /* blocks */ -+#define AUFS_DIRWH_DEF 3 -+#define AUFS_RDCACHE_DEF 10 /* seconds */ -+#define AUFS_RDBLK_DEF 512 /* bytes */ -+#define AUFS_RDHASH_DEF 32 -+#define AUFS_WKQ_NAME AUFS_NAME "d" -+#define AUFS_NWKQ_DEF 4 -+#define AUFS_MFS_SECOND_DEF 30 /* seconds */ -+#define AUFS_PLINK_WARN 100 /* number of plinks */ -+ -+#define AUFS_DIROPQ_NAME AUFS_WH_PFX ".opq" /* whiteouted doubly */ -+#define AUFS_WH_DIROPQ AUFS_WH_PFX AUFS_DIROPQ_NAME -+ -+#define AUFS_BASE_NAME AUFS_WH_PFX AUFS_NAME -+#define AUFS_PLINKDIR_NAME AUFS_WH_PFX "plnk" -+#define AUFS_ORPHDIR_NAME AUFS_WH_PFX "orph" -+ -+/* doubly whiteouted */ -+#define AUFS_WH_BASE AUFS_WH_PFX AUFS_BASE_NAME -+#define AUFS_WH_PLINKDIR AUFS_WH_PFX AUFS_PLINKDIR_NAME -+#define AUFS_WH_ORPHDIR AUFS_WH_PFX AUFS_ORPHDIR_NAME -+ -+/* branch permission */ -+#define AUFS_BRPERM_RW "rw" -+#define AUFS_BRPERM_RO "ro" -+#define AUFS_BRPERM_RR "rr" -+#define AUFS_BRPERM_WH "wh" -+#define AUFS_BRPERM_NLWH "nolwh" -+#define AUFS_BRPERM_ROWH AUFS_BRPERM_RO "+" AUFS_BRPERM_WH -+#define AUFS_BRPERM_RRWH AUFS_BRPERM_RR "+" AUFS_BRPERM_WH -+#define AUFS_BRPERM_RWNLWH AUFS_BRPERM_RW "+" AUFS_BRPERM_NLWH -+ -+/* ---------------------------------------------------------------------- */ -+ -+/* ioctl */ -+enum { -+ AuCtl_PLINK_MAINT, -+ AuCtl_PLINK_CLEAN, -+ -+ /* readdir in userspace */ -+ AuCtl_RDU, -+ AuCtl_RDU_INO -+}; -+ -+/* borrowed from linux/include/linux/kernel.h */ -+#ifndef ALIGN -+#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) -+#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) -+#endif -+ -+/* borrowed from linux/include/linux/compiler-gcc3.h */ -+#ifndef __aligned -+#define __aligned(x) __attribute__((aligned(x))) -+#define __packed 
__attribute__((packed)) -+#endif -+ -+struct au_rdu_cookie { -+ __u64 h_pos; -+ __s16 bindex; -+ __u8 flags; -+ __u8 pad; -+ __u32 generation; -+} __aligned(8); -+ -+struct au_rdu_ent { -+ __u64 ino; -+ __s16 bindex; -+ __u8 type; -+ __u8 nlen; -+ __u8 wh; -+ char name[0]; -+} __aligned(8); -+ -+static inline int au_rdu_len(int nlen) -+{ -+ /* include the terminating NULL */ -+ return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1, -+ sizeof(__u64)); -+} -+ -+union au_rdu_ent_ul { -+ struct au_rdu_ent __user *e; -+ unsigned long ul; -+}; -+ -+enum { -+ AufsCtlRduV_SZ, -+ AufsCtlRduV_SZ_PTR, -+ AufsCtlRduV_End -+}; -+ -+struct aufs_rdu { -+ /* input */ -+ union { -+ __u64 sz; /* AuCtl_RDU */ -+ __u64 nent; /* AuCtl_RDU_INO */ -+ }; -+ union au_rdu_ent_ul ent; -+ __u16 verify[AufsCtlRduV_End]; -+ -+ /* input/output */ -+ __u32 blk; -+ -+ /* output */ -+ union au_rdu_ent_ul tail; -+ /* number of entries which were added in a single call */ -+ __u64 rent; -+ __u8 full; -+ __u8 shwh; -+ -+ struct au_rdu_cookie cookie; -+} __aligned(8); -+ -+#define AuCtlType 'A' -+#define AUFS_CTL_PLINK_MAINT _IO(AuCtlType, AuCtl_PLINK_MAINT) -+#define AUFS_CTL_PLINK_CLEAN _IO(AuCtlType, AuCtl_PLINK_CLEAN) -+#define AUFS_CTL_RDU _IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu) -+#define AUFS_CTL_RDU_INO _IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu) -+ -+#endif /* __AUFS_TYPE_H__ */ -diff -Nur linux-2.6.31-vanilla/include/linux/Kbuild linux-2.6.31/include/linux/Kbuild ---- linux-2.6.31-vanilla/include/linux/Kbuild 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/include/linux/Kbuild 2009-09-16 13:55:56.000000000 +0200 -@@ -34,6 +34,7 @@ - header-y += atmsap.h - header-y += atmsvc.h - header-y += atm_zatm.h -+header-y += aufs_type.h - header-y += auto_fs4.h - header-y += ax25.h - header-y += b1lli.h -diff -Nur linux-2.6.31-vanilla/include/linux/namei.h linux-2.6.31/include/linux/namei.h ---- linux-2.6.31-vanilla/include/linux/namei.h 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/include/linux/namei.h 2009-09-16 13:55:46.000000000 +0200 -@@ -75,6 +75,9 @@ - extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); - extern void release_open_intent(struct nameidata *); - -+extern struct dentry *lookup_hash(struct nameidata *nd); -+extern int __lookup_one_len(const char *name, struct qstr *this, -+ struct dentry *base, int len); - extern struct dentry *lookup_one_len(const char *, struct dentry *, int); - extern struct dentry *lookup_one_noperm(const char *, struct dentry *); - -diff -Nur linux-2.6.31-vanilla/include/linux/splice.h linux-2.6.31/include/linux/splice.h ---- linux-2.6.31-vanilla/include/linux/splice.h 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/include/linux/splice.h 2009-09-16 13:55:46.000000000 +0200 -@@ -82,4 +82,10 @@ - extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, - splice_direct_actor *); - -+extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out, -+ loff_t *ppos, size_t len, unsigned int flags); -+extern long do_splice_to(struct file *in, loff_t *ppos, -+ struct pipe_inode_info *pipe, size_t len, -+ unsigned int flags); -+ - #endif -diff -Nur linux-2.6.31-vanilla/security/device_cgroup.c linux-2.6.31/security/device_cgroup.c ---- linux-2.6.31-vanilla/security/device_cgroup.c 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/security/device_cgroup.c 2009-09-16 13:55:49.000000000 +0200 -@@ -513,6 +513,7 @@ - - return -EPERM; - } -+EXPORT_SYMBOL(devcgroup_inode_permission); - - int devcgroup_inode_mknod(int mode, 
dev_t dev) - { -diff -Nur linux-2.6.31-vanilla/security/integrity/ima/ima_main.c linux-2.6.31/security/integrity/ima/ima_main.c ---- linux-2.6.31-vanilla/security/integrity/ima/ima_main.c 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/security/integrity/ima/ima_main.c 2009-09-16 13:55:49.000000000 +0200 -@@ -324,6 +324,7 @@ - MAY_EXEC, FILE_MMAP); - return 0; - } -+EXPORT_SYMBOL(ima_file_mmap); - - /** - * ima_bprm_check - based on policy, collect/store measurement. -diff -Nur linux-2.6.31-vanilla/security/integrity/ima/ima_main.c.orig linux-2.6.31/security/integrity/ima/ima_main.c.orig ---- linux-2.6.31-vanilla/security/integrity/ima/ima_main.c.orig 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.31/security/integrity/ima/ima_main.c.orig 2009-09-10 00:13:59.000000000 +0200 -@@ -0,0 +1,368 @@ -+/* -+ * Copyright (C) 2005,2006,2007,2008 IBM Corporation -+ * -+ * Authors: -+ * Reiner Sailer sailer@watson.ibm.com -+ * Serge Hallyn serue@us.ibm.com -+ * Kylene Hall kylene@us.ibm.com -+ * Mimi Zohar zohar@us.ibm.com -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License as -+ * published by the Free Software Foundation, version 2 of the -+ * License. -+ * -+ * File: ima_main.c -+ * implements the IMA hooks: ima_bprm_check, ima_file_mmap, -+ * and ima_path_check. -+ */ -+#include <linux/module.h> -+#include <linux/file.h> -+#include <linux/binfmts.h> -+#include <linux/mount.h> -+#include <linux/mman.h> -+ -+#include "ima.h" -+ -+int ima_initialized; -+ -+char *ima_hash = "sha1"; -+static int __init hash_setup(char *str) -+{ -+ if (strncmp(str, "md5", 3) == 0) -+ ima_hash = "md5"; -+ return 1; -+} -+__setup("ima_hash=", hash_setup); -+ -+/** -+ * ima_file_free - called on __fput() -+ * @file: pointer to file structure being freed -+ * -+ * Flag files that changed, based on i_version; -+ * and decrement the iint readcount/writecount. -+ */ -+void ima_file_free(struct file *file) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ struct ima_iint_cache *iint; -+ -+ if (!ima_initialized || !S_ISREG(inode->i_mode)) -+ return; -+ iint = ima_iint_find_get(inode); -+ if (!iint) -+ return; -+ -+ mutex_lock(&iint->mutex); -+ if (iint->opencount <= 0) { -+ printk(KERN_INFO -+ "%s: %s open/free imbalance (r:%ld w:%ld o:%ld f:%ld)\n", -+ __FUNCTION__, file->f_dentry->d_name.name, -+ iint->readcount, iint->writecount, -+ iint->opencount, atomic_long_read(&file->f_count)); -+ if (!(iint->flags & IMA_IINT_DUMP_STACK)) { -+ dump_stack(); -+ iint->flags |= IMA_IINT_DUMP_STACK; -+ } -+ } -+ iint->opencount--; -+ -+ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) -+ iint->readcount--; -+ -+ if (file->f_mode & FMODE_WRITE) { -+ iint->writecount--; -+ if (iint->writecount == 0) { -+ if (iint->version != inode->i_version) -+ iint->flags &= ~IMA_MEASURED; -+ } -+ } -+ mutex_unlock(&iint->mutex); -+ kref_put(&iint->refcount, iint_free); -+} -+ -+/* ima_read_write_check - reflect possible reading/writing errors in the PCR. -+ * -+ * When opening a file for read, if the file is already open for write, -+ * the file could change, resulting in a file measurement error. -+ * -+ * Opening a file for write, if the file is already open for read, results -+ * in a time of measure, time of use (ToMToU) error. -+ * -+ * In either case invalidate the PCR. 
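/*
 * Illustrative timeline, not part of the patch, for the ToMToU case
 * described above:
 *
 *	A: open(F, O_RDONLY)  ->  readcount 1; F is measured
 *	B: open(F, O_WRONLY)  ->  ima_read_write_check(TOMTOU) sees readcount > 0
 *	                          and calls ima_add_violation(..., "invalid_pcr", "ToMToU")
 *
 * The measurement process A still relies on may no longer match the bytes
 * on disk once B writes, so the PCR is invalidated rather than left
 * attesting stale state. The open_writers case is the mirror image: a
 * reader arrives while a writer already holds F open.
 */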
-+ */ -+enum iint_pcr_error { TOMTOU, OPEN_WRITERS }; -+static void ima_read_write_check(enum iint_pcr_error error, -+ struct ima_iint_cache *iint, -+ struct inode *inode, -+ const unsigned char *filename) -+{ -+ switch (error) { -+ case TOMTOU: -+ if (iint->readcount > 0) -+ ima_add_violation(inode, filename, "invalid_pcr", -+ "ToMToU"); -+ break; -+ case OPEN_WRITERS: -+ if (iint->writecount > 0) -+ ima_add_violation(inode, filename, "invalid_pcr", -+ "open_writers"); -+ break; -+ } -+} -+ -+static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, -+ const unsigned char *filename) -+{ -+ int rc = 0; -+ -+ iint->opencount++; -+ iint->readcount++; -+ -+ rc = ima_collect_measurement(iint, file); -+ if (!rc) -+ ima_store_measurement(iint, file, filename); -+ return rc; -+} -+ -+static void ima_update_counts(struct ima_iint_cache *iint, int mask) -+{ -+ iint->opencount++; -+ if ((mask & MAY_WRITE) || (mask == 0)) -+ iint->writecount++; -+ else if (mask & (MAY_READ | MAY_EXEC)) -+ iint->readcount++; -+} -+ -+/** -+ * ima_path_check - based on policy, collect/store measurement. -+ * @path: contains a pointer to the path to be measured -+ * @mask: contains MAY_READ, MAY_WRITE or MAY_EXECUTE -+ * -+ * Measure the file being open for readonly, based on the -+ * ima_must_measure() policy decision. -+ * -+ * Keep read/write counters for all files, but only -+ * invalidate the PCR for measured files: -+ * - Opening a file for write when already open for read, -+ * results in a time of measure, time of use (ToMToU) error. -+ * - Opening a file for read when already open for write, -+ * could result in a file measurement error. -+ * -+ * Always return 0 and audit dentry_open failures. -+ * (Return code will be based upon measurement appraisal.) 
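/*
 * Summary, not part of the patch, of the mask handling in
 * ima_update_counts() above; a mask of 0 (no access mask supplied) is
 * conservatively treated like a write:
 *
 *	opencount++ always, plus
 *	mask & MAY_WRITE, or mask == 0  ->  writecount++
 *	mask & (MAY_READ | MAY_EXEC)    ->  readcount++
 *
 * ima_counts_put(), quoted further below, applies the identical mapping
 * with decrements, so a failed open leaves every counter balanced.
 */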
-+ */ -+int ima_path_check(struct path *path, int mask, int update_counts) -+{ -+ struct inode *inode = path->dentry->d_inode; -+ struct ima_iint_cache *iint; -+ struct file *file = NULL; -+ int rc; -+ -+ if (!ima_initialized || !S_ISREG(inode->i_mode)) -+ return 0; -+ iint = ima_iint_find_insert_get(inode); -+ if (!iint) -+ return 0; -+ -+ mutex_lock(&iint->mutex); -+ if (update_counts) -+ ima_update_counts(iint, mask); -+ -+ rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK); -+ if (rc < 0) -+ goto out; -+ -+ if ((mask & MAY_WRITE) || (mask == 0)) -+ ima_read_write_check(TOMTOU, iint, inode, -+ path->dentry->d_name.name); -+ -+ if ((mask & (MAY_WRITE | MAY_READ | MAY_EXEC)) != MAY_READ) -+ goto out; -+ -+ ima_read_write_check(OPEN_WRITERS, iint, inode, -+ path->dentry->d_name.name); -+ if (!(iint->flags & IMA_MEASURED)) { -+ struct dentry *dentry = dget(path->dentry); -+ struct vfsmount *mnt = mntget(path->mnt); -+ -+ file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE, -+ current_cred()); -+ if (IS_ERR(file)) { -+ int audit_info = 0; -+ -+ integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, -+ dentry->d_name.name, -+ "add_measurement", -+ "dentry_open failed", -+ 1, audit_info); -+ file = NULL; -+ goto out; -+ } -+ rc = get_path_measurement(iint, file, dentry->d_name.name); -+ } -+out: -+ mutex_unlock(&iint->mutex); -+ if (file) -+ fput(file); -+ kref_put(&iint->refcount, iint_free); -+ return 0; -+} -+EXPORT_SYMBOL_GPL(ima_path_check); -+ -+static int process_measurement(struct file *file, const unsigned char *filename, -+ int mask, int function) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ struct ima_iint_cache *iint; -+ int rc; -+ -+ if (!ima_initialized || !S_ISREG(inode->i_mode)) -+ return 0; -+ iint = ima_iint_find_insert_get(inode); -+ if (!iint) -+ return -ENOMEM; -+ -+ mutex_lock(&iint->mutex); -+ rc = ima_must_measure(iint, inode, mask, function); -+ if (rc != 0) -+ goto out; -+ -+ rc = ima_collect_measurement(iint, file); -+ if (!rc) -+ ima_store_measurement(iint, file, filename); -+out: -+ mutex_unlock(&iint->mutex); -+ kref_put(&iint->refcount, iint_free); -+ return rc; -+} -+ -+/* -+ * ima_counts_put - decrement file counts -+ * -+ * File counts are incremented in ima_path_check. On file open -+ * error, such as ETXTBSY, decrement the counts to prevent -+ * unnecessary imbalance messages. -+ */ -+void ima_counts_put(struct path *path, int mask) -+{ -+ struct inode *inode = path->dentry->d_inode; -+ struct ima_iint_cache *iint; -+ -+ /* The inode may already have been freed, freeing the iint -+ * with it. Verify the inode is not NULL before dereferencing -+ * it. -+ */ -+ if (!ima_initialized || !inode || !S_ISREG(inode->i_mode)) -+ return; -+ iint = ima_iint_find_insert_get(inode); -+ if (!iint) -+ return; -+ -+ mutex_lock(&iint->mutex); -+ iint->opencount--; -+ if ((mask & MAY_WRITE) || (mask == 0)) -+ iint->writecount--; -+ else if (mask & (MAY_READ | MAY_EXEC)) -+ iint->readcount--; -+ mutex_unlock(&iint->mutex); -+ -+ kref_put(&iint->refcount, iint_free); -+} -+ -+/* -+ * ima_counts_get - increment file counts -+ * -+ * - for IPC shm and shmat file. -+ * - for nfsd exported files. -+ * -+ * Increment the counts for these files to prevent unnecessary -+ * imbalance messages. 
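/*
 * Illustrative pairing, not part of the patch: the counter helpers above
 * pair up across a file's lifecycle, and the imbalance printk in
 * ima_file_free() near the top of this file is the detector for a missed
 * half of a pair:
 *
 *	ima_path_check(..., update_counts)  ...  ima_counts_put()  on a failed open,
 *	                                         or ima_file_free() on the final fput()
 *	ima_counts_get(file)                ...  ima_file_free()   on the final fput()
 */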
-+ */ -+void ima_counts_get(struct file *file) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ struct ima_iint_cache *iint; -+ -+ if (!ima_initialized || !S_ISREG(inode->i_mode)) -+ return; -+ iint = ima_iint_find_insert_get(inode); -+ if (!iint) -+ return; -+ mutex_lock(&iint->mutex); -+ iint->opencount++; -+ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) -+ iint->readcount++; -+ -+ if (file->f_mode & FMODE_WRITE) -+ iint->writecount++; -+ mutex_unlock(&iint->mutex); -+ -+ kref_put(&iint->refcount, iint_free); -+} -+EXPORT_SYMBOL_GPL(ima_counts_get); -+ -+/** -+ * ima_file_mmap - based on policy, collect/store measurement. -+ * @file: pointer to the file to be measured (May be NULL) -+ * @prot: contains the protection that will be applied by the kernel. -+ * -+ * Measure files being mmapped executable based on the ima_must_measure() -+ * policy decision. -+ * -+ * Return 0 on success, an error code on failure. -+ * (Based on the results of appraise_measurement().) -+ */ -+int ima_file_mmap(struct file *file, unsigned long prot) -+{ -+ int rc; -+ -+ if (!file) -+ return 0; -+ if (prot & PROT_EXEC) -+ rc = process_measurement(file, file->f_dentry->d_name.name, -+ MAY_EXEC, FILE_MMAP); -+ return 0; -+} -+ -+/** -+ * ima_bprm_check - based on policy, collect/store measurement. -+ * @bprm: contains the linux_binprm structure -+ * -+ * The OS protects against an executable file, already open for write, -+ * from being executed in deny_write_access() and an executable file, -+ * already open for execute, from being modified in get_write_access(). -+ * So we can be certain that what we verify and measure here is actually -+ * what is being executed. -+ * -+ * Return 0 on success, an error code on failure. -+ * (Based on the results of appraise_measurement().) 
-+ */ -+int ima_bprm_check(struct linux_binprm *bprm) -+{ -+ int rc; -+ -+ rc = process_measurement(bprm->file, bprm->filename, -+ MAY_EXEC, BPRM_CHECK); -+ return 0; -+} -+ -+static int __init init_ima(void) -+{ -+ int error; -+ -+ ima_iintcache_init(); -+ error = ima_init(); -+ ima_initialized = 1; -+ return error; -+} -+ -+static void __exit cleanup_ima(void) -+{ -+ ima_cleanup(); -+} -+ -+late_initcall(init_ima); /* Start IMA after the TPM is available */ -+ -+MODULE_DESCRIPTION("Integrity Measurement Architecture"); -+MODULE_LICENSE("GPL"); -diff -Nur linux-2.6.31-vanilla/security/security.c linux-2.6.31/security/security.c ---- linux-2.6.31-vanilla/security/security.c 2009-09-10 00:13:59.000000000 +0200 -+++ linux-2.6.31/security/security.c 2009-09-16 13:55:49.000000000 +0200 -@@ -386,6 +386,7 @@ - return 0; - return security_ops->path_mkdir(path, dentry, mode); - } -+EXPORT_SYMBOL(security_path_mkdir); - - int security_path_rmdir(struct path *path, struct dentry *dentry) - { -@@ -393,6 +394,7 @@ - return 0; - return security_ops->path_rmdir(path, dentry); - } -+EXPORT_SYMBOL(security_path_rmdir); - - int security_path_unlink(struct path *path, struct dentry *dentry) - { -@@ -400,6 +402,7 @@ - return 0; - return security_ops->path_unlink(path, dentry); - } -+EXPORT_SYMBOL(security_path_unlink); - - int security_path_symlink(struct path *path, struct dentry *dentry, - const char *old_name) -@@ -408,6 +411,7 @@ - return 0; - return security_ops->path_symlink(path, dentry, old_name); - } -+EXPORT_SYMBOL(security_path_symlink); - - int security_path_link(struct dentry *old_dentry, struct path *new_dir, - struct dentry *new_dentry) -@@ -416,6 +420,7 @@ - return 0; - return security_ops->path_link(old_dentry, new_dir, new_dentry); - } -+EXPORT_SYMBOL(security_path_link); - - int security_path_rename(struct path *old_dir, struct dentry *old_dentry, - struct path *new_dir, struct dentry *new_dentry) -@@ -426,6 +431,7 @@ - return security_ops->path_rename(old_dir, old_dentry, new_dir, - new_dentry); - } -+EXPORT_SYMBOL(security_path_rename); - - int security_path_truncate(struct path *path, loff_t length, - unsigned int time_attrs) -@@ -434,6 +440,7 @@ - return 0; - return security_ops->path_truncate(path, length, time_attrs); - } -+EXPORT_SYMBOL(security_path_truncate); - #endif - - int security_inode_create(struct inode *dir, struct dentry *dentry, int mode) -@@ -505,6 +512,7 @@ - return 0; - return security_ops->inode_readlink(dentry); - } -+EXPORT_SYMBOL(security_inode_readlink); - - int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) - { -@@ -519,6 +527,7 @@ - return 0; - return security_ops->inode_permission(inode, mask); - } -+EXPORT_SYMBOL(security_inode_permission); - - int security_inode_setattr(struct dentry *dentry, struct iattr *attr) - { -@@ -619,6 +628,7 @@ - { - return security_ops->file_permission(file, mask); - } -+EXPORT_SYMBOL(security_file_permission); - - int security_file_alloc(struct file *file) - { diff --git a/pkgs/core/kernel/patches/aufs2-2.6.31.1-1.patch.off b/pkgs/core/kernel/patches/aufs2-2.6.31.1-1.patch.off new file mode 100644 index 0000000..1f6f612 --- /dev/null +++ b/pkgs/core/kernel/patches/aufs2-2.6.31.1-1.patch.off @@ -0,0 +1,25456 @@ +diff -Nur linux-2.6.31-vanilla/Documentation/ABI/testing/debugfs-aufs linux-2.6.31/Documentation/ABI/testing/debugfs-aufs +--- linux-2.6.31-vanilla/Documentation/ABI/testing/debugfs-aufs 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/Documentation/ABI/testing/debugfs-aufs 2009-09-16 
13:55:29.000000000 +0200 +@@ -0,0 +1,40 @@ ++What: /debug/aufs/si_<id>/ ++Date: March 2009 ++Contact: J. R. Okajima hooanon05@yahoo.co.jp ++Description: ++ Under /debug/aufs, a directory named si_<id> is created ++ per aufs mount, where <id> is a unique id generated ++ internally. ++ ++What: /debug/aufs/si_<id>/xib ++Date: March 2009 ++Contact: J. R. Okajima hooanon05@yahoo.co.jp ++Description: ++ It shows the consumed blocks by xib (External Inode Number ++ Bitmap), its block size and file size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see ++ Documentation/filesystems/aufs/aufs.5 in detail. ++ ++What: /debug/aufs/si_<id>/xino0, xino1 ... xinoN ++Date: March 2009 ++Contact: J. R. Okajima hooanon05@yahoo.co.jp ++Description: ++ It shows the consumed blocks by xino (External Inode Number ++ Translation Table), its link count, block size and file ++ size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see ++ Documentation/filesystems/aufs/aufs.5 in detail. ++ ++What: /debug/aufs/si_<id>/xigen ++Date: March 2009 ++Contact: J. R. Okajima hooanon05@yahoo.co.jp ++Description: ++ It shows the consumed blocks by xigen (External Inode ++ Generation Table), its block size and file size. ++ If CONFIG_AUFS_EXPORT is disabled, this entry will not ++ be created. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see ++ Documentation/filesystems/aufs/aufs.5 in detail. +diff -Nur linux-2.6.31-vanilla/Documentation/ABI/testing/sysfs-aufs linux-2.6.31/Documentation/ABI/testing/sysfs-aufs +--- linux-2.6.31-vanilla/Documentation/ABI/testing/sysfs-aufs 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/Documentation/ABI/testing/sysfs-aufs 2009-09-16 13:55:29.000000000 +0200 +@@ -0,0 +1,25 @@ ++What: /sys/fs/aufs/si_<id>/ ++Date: March 2009 ++Contact: J. R. Okajima hooanon05@yahoo.co.jp ++Description: ++ Under /sys/fs/aufs, a directory named si_<id> is created ++ per aufs mount, where <id> is a unique id generated ++ internally. ++ ++What: /sys/fs/aufs/si_<id>/br0, br1 ... brN ++Date: March 2009 ++Contact: J. R. Okajima hooanon05@yahoo.co.jp ++Description: ++ It shows the abolute path of a member directory (which ++ is called branch) in aufs, and its permission. ++ ++What: /sys/fs/aufs/si_<id>/xi_path ++Date: March 2009 ++Contact: J. R. Okajima hooanon05@yahoo.co.jp ++Description: ++ It shows the abolute path of XINO (External Inode Number ++ Bitmap, Translation Table and Generation Table) file ++ even if it is the default path. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see ++ Documentation/filesystems/aufs/aufs.5 in detail. +diff -Nur linux-2.6.31-vanilla/fs/aufs/aufs.h linux-2.6.31/fs/aufs/aufs.h +--- linux-2.6.31-vanilla/fs/aufs/aufs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/aufs.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,51 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * all header files ++ */ ++ ++#ifndef __AUFS_H__ ++#define __AUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "debug.h" ++ ++#include "branch.h" ++#include "cpup.h" ++#include "dcsub.h" ++#include "dbgaufs.h" ++#include "dentry.h" ++#include "dir.h" ++#include "file.h" ++#include "fstype.h" ++#include "inode.h" ++#include "loop.h" ++#include "module.h" ++#include "opts.h" ++#include "rwsem.h" ++#include "spl.h" ++#include "super.h" ++#include "sysaufs.h" ++#include "vfsub.h" ++#include "whout.h" ++#include "wkq.h" ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/branch.c linux-2.6.31/fs/aufs/branch.c +--- linux-2.6.31-vanilla/fs/aufs/branch.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/branch.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,969 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * branch management ++ */ ++ ++#include <linux/file.h> ++#include "aufs.h" ++ ++/* ++ * free a single branch ++ */ ++static void au_br_do_free(struct au_branch *br) ++{ ++ int i; ++ struct au_wbr *wbr; ++ ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ mutex_destroy(&br->br_xino.xi_nondir_mtx); ++ ++ AuDebugOn(atomic_read(&br->br_count)); ++ ++ wbr = br->br_wbr; ++ if (wbr) { ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(wbr->wbr_wh[i]); ++ AuDebugOn(atomic_read(&wbr->wbr_wh_running)); ++ AuRwDestroy(&wbr->wbr_wh_rwsem); ++ } ++ ++ /* some filesystems acquire extra lock */ ++ lockdep_off(); ++ mntput(br->br_mnt); ++ lockdep_on(); ++ ++ kfree(wbr); ++ kfree(br); ++} ++ ++/* ++ * frees all branches ++ */ ++void au_br_free(struct au_sbinfo *sbinfo) ++{ ++ aufs_bindex_t bmax; ++ struct au_branch **br; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ bmax = sbinfo->si_bend + 1; ++ br = sbinfo->si_branch; ++ while (bmax--) ++ au_br_do_free(*br++); ++} ++ ++/* ++ * find the index of a branch which is specified by @br_id. 
++ */ ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ aufs_bindex_t bindex, bend; ++ ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (au_sbr_id(sb, bindex) == br_id) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * add a branch ++ */ ++ ++static int test_overlap(struct super_block *sb, struct dentry *h_d1, ++ struct dentry *h_d2) ++{ ++ if (unlikely(h_d1 == h_d2)) ++ return 1; ++ return !!au_test_subdir(h_d1, h_d2) ++ || !!au_test_subdir(h_d2, h_d1) ++ || au_test_loopback_overlap(sb, h_d1, h_d2) ++ || au_test_loopback_overlap(sb, h_d2, h_d1); ++} ++ ++/* ++ * returns a newly allocated branch. @new_nbranch is a number of branches ++ * after adding a branch. ++ */ ++static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, ++ int perm) ++{ ++ struct au_branch *add_branch; ++ struct dentry *root; ++ ++ root = sb->s_root; ++ add_branch = kmalloc(sizeof(*add_branch), GFP_NOFS); ++ if (unlikely(!add_branch)) ++ goto out; ++ ++ add_branch->br_wbr = NULL; ++ if (au_br_writable(perm)) { ++ /* may be freed separately at changing the branch permission */ ++ add_branch->br_wbr = kmalloc(sizeof(*add_branch->br_wbr), ++ GFP_NOFS); ++ if (unlikely(!add_branch->br_wbr)) ++ goto out_br; ++ } ++ ++ if (unlikely(au_sbr_realloc(au_sbi(sb), new_nbranch) ++ || au_di_realloc(au_di(root), new_nbranch) ++ || au_ii_realloc(au_ii(root->d_inode), new_nbranch))) ++ goto out_wbr; ++ return add_branch; /* success */ ++ ++ out_wbr: ++ kfree(add_branch->br_wbr); ++ out_br: ++ kfree(add_branch); ++ out: ++ return ERR_PTR(-ENOMEM); ++} ++ ++/* ++ * test if the branch permission is legal or not. ++ */ ++static int test_br(struct inode *inode, int brperm, char *path) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(au_br_writable(brperm) && IS_RDONLY(inode))) { ++ AuErr("write permission for readonly mount or inode, %s\n", ++ path); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++/* ++ * returns: ++ * 0: success, the caller will add it ++ * plus: success, it is already unified, the caller should ignore it ++ * minus: error ++ */ ++static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct dentry *root; ++ struct inode *inode, *h_inode; ++ ++ root = sb->s_root; ++ bend = au_sbend(sb); ++ if (unlikely(bend >= 0 ++ && au_find_dbindex(root, add->path.dentry) >= 0)) { ++ err = 1; ++ if (!remount) { ++ err = -EINVAL; ++ AuErr("%s duplicated\n", add->pathname); ++ } ++ goto out; ++ } ++ ++ err = -ENOSPC; /* -E2BIG; */ ++ if (unlikely(AUFS_BRANCH_MAX <= add->bindex ++ || AUFS_BRANCH_MAX - 1 <= bend)) { ++ AuErr("number of branches exceeded %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EDOM; ++ if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) { ++ AuErr("bad index %d\n", add->bindex); ++ goto out; ++ } ++ ++ inode = add->path.dentry->d_inode; ++ err = -ENOENT; ++ if (unlikely(!inode->i_nlink)) { ++ AuErr("no existence %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ if (unlikely(inode->i_sb == sb)) { ++ AuErr("%s must be outside\n", add->pathname); ++ goto out; ++ } ++ ++ if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { ++ AuErr("unsupported filesystem, %s (%s)\n", ++ add->pathname, au_sbtype(inode->i_sb)); ++ goto out; ++ } ++ ++ err = test_br(add->path.dentry->d_inode, add->perm, add->pathname); ++ if (unlikely(err)) ++ goto out; ++ ++ if (bend < 0) ++ return 0; /* 
success */ ++ ++ err = -EINVAL; ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (unlikely(test_overlap(sb, add->path.dentry, ++ au_h_dptr(root, bindex)))) { ++ AuErr("%s is overlapped\n", add->pathname); ++ goto out; ++ } ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), WARN_PERM)) { ++ h_inode = au_h_dptr(root, 0)->d_inode; ++ if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) ++ || h_inode->i_uid != inode->i_uid ++ || h_inode->i_gid != inode->i_gid) ++ AuWarn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", ++ add->pathname, ++ inode->i_uid, inode->i_gid, ++ (inode->i_mode & S_IALLUGO), ++ h_inode->i_uid, h_inode->i_gid, ++ (h_inode->i_mode & S_IALLUGO)); ++ } ++ ++ out: ++ return err; ++} ++ ++/* ++ * initialize or clean the whiteouts for an adding branch ++ */ ++static int au_br_init_wh(struct super_block *sb, struct au_branch *br, ++ int new_perm, struct dentry *h_root) ++{ ++ int err, old_perm; ++ aufs_bindex_t bindex; ++ struct mutex *h_mtx; ++ struct au_wbr *wbr; ++ struct au_hinode *hdir; ++ ++ wbr = br->br_wbr; ++ old_perm = br->br_perm; ++ br->br_perm = new_perm; ++ hdir = NULL; ++ h_mtx = NULL; ++ bindex = au_br_index(sb, br->br_id); ++ if (0 <= bindex) { ++ hdir = au_hi(sb->s_root->d_inode, bindex); ++ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ } else { ++ h_mtx = &h_root->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_PARENT); ++ } ++ if (!wbr) ++ err = au_wh_init(h_root, br, sb); ++ else { ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(h_root, br, sb); ++ wbr_wh_write_unlock(wbr); ++ } ++ if (hdir) ++ au_hin_imtx_unlock(hdir); ++ else ++ mutex_unlock(h_mtx); ++ br->br_perm = old_perm; ++ ++ if (!err && wbr && !au_br_writable(new_perm)) { ++ kfree(wbr); ++ br->br_wbr = NULL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_init(struct au_branch *br, struct super_block *sb, ++ int perm, struct path *path) ++{ ++ int err; ++ struct au_wbr *wbr; ++ ++ wbr = br->br_wbr; ++ au_rw_init(&wbr->wbr_wh_rwsem); ++ memset(wbr->wbr_wh, 0, sizeof(wbr->wbr_wh)); ++ atomic_set(&wbr->wbr_wh_running, 0); ++ wbr->wbr_bytes = 0; ++ ++ err = au_br_init_wh(sb, br, perm, path->dentry); ++ ++ return err; ++} ++ ++/* intialize a new branch */ ++static int au_br_init(struct au_branch *br, struct super_block *sb, ++ struct au_opt_add *add) ++{ ++ int err; ++ ++ err = 0; ++ memset(&br->br_xino, 0, sizeof(br->br_xino)); ++ mutex_init(&br->br_xino.xi_nondir_mtx); ++ br->br_perm = add->perm; ++ br->br_mnt = add->path.mnt; /* set first, mntget() later */ ++ atomic_set(&br->br_count, 0); ++ br->br_xino_upper = AUFS_XINO_TRUNC_INIT; ++ atomic_set(&br->br_xino_running, 0); ++ br->br_id = au_new_br_id(sb); ++ ++ if (au_br_writable(add->perm)) { ++ err = au_wbr_init(br, sb, add->perm, &add->path); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ err = au_xino_br(sb, br, add->path.dentry->d_inode->i_ino, ++ au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); ++ if (unlikely(err)) { ++ AuDebugOn(br->br_xino.xi_file); ++ goto out; ++ } ++ } ++ ++ sysaufs_br_init(br); ++ mntget(add->path.mnt); ++ ++ out: ++ return err; ++} ++ ++static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, ++ struct au_branch *br, aufs_bindex_t bend, ++ aufs_bindex_t amount) ++{ ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ memmove(brp + 1, brp, sizeof(*brp) * amount); ++ *brp = br; ++ sbinfo->si_bend++; ++ if (unlikely(bend < 0)) ++ sbinfo->si_bend = 0; ++} ++ ++static void 
au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bend, aufs_bindex_t amount) ++{ ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry + bindex; ++ memmove(hdp + 1, hdp, sizeof(*hdp) * amount); ++ au_h_dentry_init(hdp); ++ dinfo->di_bend++; ++ if (unlikely(bend < 0)) ++ dinfo->di_bstart = 0; ++} ++ ++static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bend, aufs_bindex_t amount) ++{ ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = iinfo->ii_hinode + bindex; ++ memmove(hip + 1, hip, sizeof(*hip) * amount); ++ hip->hi_inode = NULL; ++ au_hin_init(hip, NULL); ++ iinfo->ii_bend++; ++ if (unlikely(bend < 0)) ++ iinfo->ii_bstart = 0; ++} ++ ++static void au_br_do_add(struct super_block *sb, struct dentry *h_dentry, ++ struct au_branch *br, aufs_bindex_t bindex) ++{ ++ struct dentry *root; ++ struct inode *root_inode; ++ aufs_bindex_t bend, amount; ++ ++ root = sb->s_root; ++ root_inode = root->d_inode; ++ au_plink_block_maintain(sb); ++ bend = au_sbend(sb); ++ amount = bend + 1 - bindex; ++ au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount); ++ au_br_do_add_hdp(au_di(root), bindex, bend, amount); ++ au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount); ++ au_set_h_dptr(root, bindex, dget(h_dentry)); ++ au_set_h_iptr(root_inode, bindex, au_igrab(h_dentry->d_inode), ++ /*flags*/0); ++} ++ ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bend, add_bindex; ++ struct dentry *root, *h_dentry; ++ struct inode *root_inode; ++ struct au_branch *add_branch; ++ ++ root = sb->s_root; ++ root_inode = root->d_inode; ++ IMustLock(root_inode); ++ err = test_add(sb, add, remount); ++ if (unlikely(err < 0)) ++ goto out; ++ if (err) { ++ err = 0; ++ goto out; /* success */ ++ } ++ ++ bend = au_sbend(sb); ++ add_branch = au_br_alloc(sb, bend + 2, add->perm); ++ err = PTR_ERR(add_branch); ++ if (IS_ERR(add_branch)) ++ goto out; ++ ++ err = au_br_init(add_branch, sb, add); ++ if (unlikely(err)) { ++ au_br_do_free(add_branch); ++ goto out; ++ } ++ ++ add_bindex = add->bindex; ++ h_dentry = add->path.dentry; ++ if (!remount) ++ au_br_do_add(sb, h_dentry, add_branch, add_bindex); ++ else { ++ sysaufs_brs_del(sb, add_bindex); ++ au_br_do_add(sb, h_dentry, add_branch, add_bindex); ++ sysaufs_brs_add(sb, add_bindex); ++ } ++ ++ if (!add_bindex) { ++ au_cpup_attr_all(root_inode, /*force*/1); ++ sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; ++ } else ++ au_add_nlink(root_inode, h_dentry->d_inode); ++ ++ /* ++ * this test/set prevents aufs from handling unnecesary inotify events ++ * of xino files, in a case of re-adding a writable branch which was ++ * once detached from aufs. ++ */ ++ if (au_xino_brid(sb) < 0 ++ && au_br_writable(add_branch->br_perm) ++ && !au_test_fs_bad_xino(h_dentry->d_sb) ++ && add_branch->br_xino.xi_file ++ && add_branch->br_xino.xi_file->f_dentry->d_parent == h_dentry) ++ au_xino_brid_set(sb, add_branch->br_id); ++ ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * delete a branch ++ */ ++ ++/* to show the line number, do not make it inlined function */ ++#define AuVerbose(do_info, fmt, args...) do { \ ++ if (do_info) \ ++ AuInfo(fmt, ##args); \ ++} while (0) ++ ++/* ++ * test if the branch is deletable or not. 
++ */ ++static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, ++ unsigned int sigen) ++{ ++ int err, i, j, ndentry; ++ aufs_bindex_t bstart, bend; ++ unsigned char verbose; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry *d; ++ struct inode *inode; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, root, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ verbose = !!au_opt_test(au_mntflags(root->d_sb), VERBOSE); ++ for (i = 0; !err && i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = 0; !err && j < ndentry; j++) { ++ d = dpage->dentries[j]; ++ AuDebugOn(!atomic_read(&d->d_count)); ++ inode = d->d_inode; ++ if (au_digen(d) == sigen && au_iigen(inode) == sigen) ++ di_read_lock_child(d, AuLock_IR); ++ else { ++ di_write_lock_child(d); ++ err = au_reval_dpath(d, sigen); ++ if (!err) ++ di_downgrade_lock(d, AuLock_IR); ++ else { ++ di_write_unlock(d); ++ break; ++ } ++ } ++ ++ bstart = au_dbstart(d); ++ bend = au_dbend(d); ++ if (bstart <= bindex ++ && bindex <= bend ++ && au_h_dptr(d, bindex) ++ && (!S_ISDIR(inode->i_mode) || bstart == bend)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy %.*s\n", AuDLNPair(d)); ++ } ++ di_read_unlock(d, AuLock_IR); ++ } ++ } ++ ++ out_dpages: ++ au_dpages_free(&dpages); ++ out: ++ return err; ++} ++ ++static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, ++ unsigned int sigen) ++{ ++ int err; ++ struct inode *i; ++ aufs_bindex_t bstart, bend; ++ unsigned char verbose; ++ ++ err = 0; ++ verbose = !!au_opt_test(au_mntflags(sb), VERBOSE); ++ list_for_each_entry(i, &sb->s_inodes, i_sb_list) { ++ AuDebugOn(!atomic_read(&i->i_count)); ++ if (!list_empty(&i->i_dentry)) ++ continue; ++ ++ if (au_iigen(i) == sigen) ++ ii_read_lock_child(i); ++ else { ++ ii_write_lock_child(i); ++ err = au_refresh_hinode_self(i, /*do_attr*/1); ++ if (!err) ++ ii_downgrade_lock(i); ++ else { ++ ii_write_unlock(i); ++ break; ++ } ++ } ++ ++ bstart = au_ibstart(i); ++ bend = au_ibend(i); ++ if (bstart <= bindex ++ && bindex <= bend ++ && au_h_iptr(i, bindex) ++ && (!S_ISDIR(i->i_mode) || bstart == bend)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy i%lu\n", i->i_ino); ++ ii_read_unlock(i); ++ break; ++ } ++ ii_read_unlock(i); ++ } ++ ++ return err; ++} ++ ++static int test_children_busy(struct dentry *root, aufs_bindex_t bindex) ++{ ++ int err; ++ unsigned int sigen; ++ ++ sigen = au_sigen(root->d_sb); ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(root->d_inode); ++ di_write_unlock(root); ++ err = test_dentry_busy(root, bindex, sigen); ++ if (!err) ++ err = test_inode_busy(root->d_sb, bindex, sigen); ++ di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ ++ ++ return err; ++} ++ ++static void au_br_do_del_brp(struct au_sbinfo *sbinfo, ++ const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_branch **brp, **p; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ if (bindex < bend) ++ memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex)); ++ sbinfo->si_branch[0 + bend] = NULL; ++ sbinfo->si_bend--; ++ ++ p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, GFP_NOFS); ++ if (p) ++ sbinfo->si_branch = p; ++} ++ ++static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_hdentry *hdp, *p; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry + bindex; ++ 
if (bindex < bend) ++ memmove(hdp, hdp + 1, sizeof(*hdp) * (bend - bindex)); ++ dinfo->di_hdentry[0 + bend].hd_dentry = NULL; ++ dinfo->di_bend--; ++ ++ p = krealloc(dinfo->di_hdentry, sizeof(*p) * bend, GFP_NOFS); ++ if (p) ++ dinfo->di_hdentry = p; ++} ++ ++static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_hinode *hip, *p; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = iinfo->ii_hinode + bindex; ++ if (bindex < bend) ++ memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex)); ++ iinfo->ii_hinode[0 + bend].hi_inode = NULL; ++ au_hin_init(iinfo->ii_hinode + bend, NULL); ++ iinfo->ii_bend--; ++ ++ p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, GFP_NOFS); ++ if (p) ++ iinfo->ii_hinode = p; ++} ++ ++static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_branch *br) ++{ ++ aufs_bindex_t bend; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root; ++ struct inode *inode; ++ ++ SiMustWriteLock(sb); ++ ++ root = sb->s_root; ++ inode = root->d_inode; ++ au_plink_block_maintain(sb); ++ sbinfo = au_sbi(sb); ++ bend = sbinfo->si_bend; ++ ++ dput(au_h_dptr(root, bindex)); ++ au_hiput(au_hi(inode, bindex)); ++ au_br_do_free(br); ++ ++ au_br_do_del_brp(sbinfo, bindex, bend); ++ au_br_do_del_hdp(au_di(root), bindex, bend); ++ au_br_do_del_hip(au_ii(inode), bindex, bend); ++} ++ ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) ++{ ++ int err, rerr, i; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex, bend, br_id; ++ unsigned char do_wh, verbose; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ ++ err = 0; ++ bindex = au_find_dbindex(sb->s_root, del->h_path.dentry); ++ if (bindex < 0) { ++ if (remount) ++ goto out; /* success */ ++ err = -ENOENT; ++ AuErr("%s no such branch\n", del->pathname); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = -EBUSY; ++ mnt_flags = au_mntflags(sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ bend = au_sbend(sb); ++ if (unlikely(!bend)) { ++ AuVerbose(verbose, "no more branches left\n"); ++ goto out; ++ } ++ br = au_sbr(sb, bindex); ++ i = atomic_read(&br->br_count); ++ if (unlikely(i)) { ++ AuVerbose(verbose, "%d file(s) opened\n", i); ++ goto out; ++ } ++ ++ wbr = br->br_wbr; ++ do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); ++ if (do_wh) { ++ /* instead of WbrWhMustWriteLock(wbr) */ ++ SiMustWriteLock(sb); ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ } ++ ++ err = test_children_busy(sb->s_root, bindex); ++ if (unlikely(err)) { ++ if (do_wh) ++ goto out_wh; ++ goto out; ++ } ++ ++ err = 0; ++ br_id = br->br_id; ++ if (!remount) ++ au_br_do_del(sb, bindex, br); ++ else { ++ sysaufs_brs_del(sb, bindex); ++ au_br_do_del(sb, bindex, br); ++ sysaufs_brs_add(sb, bindex); ++ } ++ ++ if (!bindex) { ++ au_cpup_attr_all(sb->s_root->d_inode, /*force*/1); ++ sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; ++ } else ++ au_sub_nlink(sb->s_root->d_inode, del->h_path.dentry->d_inode); ++ if (au_opt_test(mnt_flags, PLINK)) ++ au_plink_half_refresh(sb, br_id); ++ ++ if (au_xino_brid(sb) == br->br_id) ++ au_xino_brid_set(sb, -1); ++ goto out; /* success */ ++ ++ out_wh: ++ /* revert */ ++ rerr = au_br_init_wh(sb, br, br->br_perm, del->h_path.dentry); ++ if (rerr) ++ AuWarn("failed re-creating base whiteout, %s. 
(%d)\n", ++ del->pathname, rerr); ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * change a branch permission ++ */ ++ ++static void au_warn_ima(void) ++{ ++#ifdef CONFIG_IMA ++ AuWarn("RW -> RO makes IMA to produce wrong message"); ++#endif ++} ++ ++static int do_need_sigen_inc(int a, int b) ++{ ++ return au_br_whable(a) && !au_br_whable(b); ++} ++ ++static int need_sigen_inc(int old, int new) ++{ ++ return do_need_sigen_inc(old, new) ++ || do_need_sigen_inc(new, old); ++} ++ ++static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ unsigned long n, ul, bytes, files; ++ aufs_bindex_t bstart; ++ struct file *file, *hf, **a; ++ const int step_bytes = 1024, /* memory allocation unit */ ++ step_files = step_bytes / sizeof(*a); ++ ++ err = -ENOMEM; ++ n = 0; ++ bytes = step_bytes; ++ files = step_files; ++ a = kmalloc(bytes, GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ /* no need file_list_lock() since sbinfo is locked? defered? */ ++ list_for_each_entry(file, &sb->s_files, f_u.fu_list) { ++ if (special_file(file->f_dentry->d_inode->i_mode)) ++ continue; ++ ++ AuDbg("%.*s\n", AuDLNPair(file->f_dentry)); ++ fi_read_lock(file); ++ if (unlikely(au_test_mmapped(file))) { ++ err = -EBUSY; ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ goto out_free; ++ } ++ ++ bstart = au_fbstart(file); ++ if (!S_ISREG(file->f_dentry->d_inode->i_mode) ++ || !(file->f_mode & FMODE_WRITE) ++ || bstart != bindex) { ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ continue; ++ } ++ ++ hf = au_h_fptr(file, bstart); ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ ++ if (n < files) ++ a[n++] = hf; ++ else { ++ void *p; ++ ++ err = -ENOMEM; ++ bytes += step_bytes; ++ files += step_files; ++ p = krealloc(a, bytes, GFP_NOFS); ++ if (p) { ++ a = p; ++ a[n++] = hf; ++ } else ++ goto out_free; ++ } ++ } ++ ++ err = 0; ++ if (n) ++ au_warn_ima(); ++ for (ul = 0; ul < n; ul++) { ++ /* todo: already flushed? */ ++ /* cf. 
fs/super.c:mark_files_ro() */ ++ hf = a[ul]; ++ hf->f_mode &= ~FMODE_WRITE; ++ if (!file_check_writeable(hf)) { ++ file_release_write(hf); ++ mnt_drop_write(hf->f_vfsmnt); ++ } ++ } ++ ++ out_free: ++ kfree(a); ++ out: ++ return err; ++} ++ ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_update) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ struct path path; ++ struct dentry *root; ++ struct au_branch *br; ++ ++ root = sb->s_root; ++ au_plink_block_maintain(sb); ++ bindex = au_find_dbindex(root, mod->h_root); ++ if (bindex < 0) { ++ if (remount) ++ return 0; /* success */ ++ err = -ENOENT; ++ AuErr("%s no such branch\n", mod->path); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = test_br(mod->h_root->d_inode, mod->perm, mod->path); ++ if (unlikely(err)) ++ goto out; ++ ++ br = au_sbr(sb, bindex); ++ if (br->br_perm == mod->perm) ++ return 0; /* success */ ++ ++ if (au_br_writable(br->br_perm)) { ++ /* remove whiteout base */ ++ err = au_br_init_wh(sb, br, mod->perm, mod->h_root); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!au_br_writable(mod->perm)) { ++ /* rw --> ro, file might be mmapped */ ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(root->d_inode); ++ di_write_unlock(root); ++ err = au_br_mod_files_ro(sb, bindex); ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ ++ if (unlikely(err)) { ++ rerr = -ENOMEM; ++ br->br_wbr = kmalloc(sizeof(*br->br_wbr), ++ GFP_NOFS); ++ if (br->br_wbr) { ++ path.mnt = br->br_mnt; ++ path.dentry = mod->h_root; ++ rerr = au_wbr_init(br, sb, br->br_perm, ++ &path); ++ } ++ if (unlikely(rerr)) { ++ AuIOErr("nested error %d (%d)\n", ++ rerr, err); ++ br->br_perm = mod->perm; ++ } ++ } ++ } ++ } else if (au_br_writable(mod->perm)) { ++ /* ro --> rw */ ++ err = -ENOMEM; ++ br->br_wbr = kmalloc(sizeof(*br->br_wbr), GFP_NOFS); ++ if (br->br_wbr) { ++ path.mnt = br->br_mnt; ++ path.dentry = mod->h_root; ++ err = au_wbr_init(br, sb, mod->perm, &path); ++ if (unlikely(err)) { ++ kfree(br->br_wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ } ++ ++ if (!err) { ++ *do_update |= need_sigen_inc(br->br_perm, mod->perm); ++ br->br_perm = mod->perm; ++ } ++ ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/branch.h linux-2.6.31/fs/aufs/branch.h +--- linux-2.6.31-vanilla/fs/aufs/branch.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/branch.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,219 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * branch filesystems and xino for them ++ */ ++ ++#ifndef __AUFS_BRANCH_H__ ++#define __AUFS_BRANCH_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fs.h> ++#include <linux/mount.h> ++#include <linux/aufs_type.h> ++#include "rwsem.h" ++#include "super.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* a xino file */ ++struct au_xino_file { ++ struct file *xi_file; ++ struct mutex xi_nondir_mtx; ++ ++ /* todo: make xino files an array to support huge inode number */ ++ ++#ifdef CONFIG_DEBUG_FS ++ struct dentry *xi_dbgaufs; ++#endif ++}; ++ ++/* members for writable branch only */ ++enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; ++struct au_wbr { ++ struct au_rwsem wbr_wh_rwsem; ++ struct dentry *wbr_wh[AuBrWh_Last]; ++ atomic_t wbr_wh_running; ++#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ ++#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ ++#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ ++ ++ /* mfs mode */ ++ unsigned long long wbr_bytes; ++}; ++ ++/* protected by superblock rwsem */ ++struct au_branch { ++ struct au_xino_file br_xino; ++ ++ aufs_bindex_t br_id; ++ ++ int br_perm; ++ struct vfsmount *br_mnt; ++ atomic_t br_count; ++ ++ struct au_wbr *br_wbr; ++ ++ /* xino truncation */ ++ blkcnt_t br_xino_upper; /* watermark in blocks */ ++ atomic_t br_xino_running; ++ ++#ifdef CONFIG_SYSFS ++ /* an entry under sysfs per mount-point */ ++ char br_name[8]; ++ struct attribute br_attr; ++#endif ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* branch permission and attribute */ ++enum { ++ AuBrPerm_RW, /* writable, linkable wh */ ++ AuBrPerm_RO, /* readonly, no wh */ ++ AuBrPerm_RR, /* natively readonly, no wh */ ++ ++ AuBrPerm_RWNoLinkWH, /* un-linkable whiteouts */ ++ ++ AuBrPerm_ROWH, /* whiteout-able */ ++ AuBrPerm_RRWH, /* whiteout-able */ ++ ++ AuBrPerm_Last ++}; ++ ++static inline int au_br_writable(int brperm) ++{ ++ return brperm == AuBrPerm_RW || brperm == AuBrPerm_RWNoLinkWH; ++} ++ ++static inline int au_br_whable(int brperm) ++{ ++ return brperm == AuBrPerm_RW ++ || brperm == AuBrPerm_ROWH ++ || brperm == AuBrPerm_RRWH; ++} ++ ++static inline int au_br_rdonly(struct au_branch *br) ++{ ++ return ((br->br_mnt->mnt_sb->s_flags & MS_RDONLY) ++ || !au_br_writable(br->br_perm)) ++ ? 
-EROFS : 0; ++} ++ ++static inline int au_br_hinotifyable(int brperm __maybe_unused) ++{ ++#ifdef CONFIG_AUFS_HINOTIFY ++ return brperm != AuBrPerm_RR && brperm != AuBrPerm_RRWH; ++#else ++ return 0; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* branch.c */ ++struct au_sbinfo; ++void au_br_free(struct au_sbinfo *sinfo); ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id); ++struct au_opt_add; ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); ++struct au_opt_del; ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); ++struct au_opt_mod; ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_update); ++ ++/* xino.c */ ++static const loff_t au_loff_max = LLONG_MAX; ++ ++int au_xib_trunc(struct super_block *sb); ++ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos); ++ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos); ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src); ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent); ++ino_t au_xino_new_ino(struct super_block *sb); ++int au_xino_write0(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino); ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino); ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino); ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, ++ struct file *base_file, int do_test); ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); ++ ++struct au_opt_xino; ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); ++void au_xino_clr(struct super_block *sb); ++struct file *au_xino_def(struct super_block *sb); ++int au_xino_path(struct seq_file *seq, struct file *file); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Superblock to branch */ ++static inline ++aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_id; ++} ++ ++static inline ++struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_mnt; ++} ++ ++static inline ++struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr_mnt(sb, bindex)->mnt_sb; ++} ++ ++static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ atomic_dec_return(&au_sbr(sb, bindex)->br_count); ++} ++ ++static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_perm; ++} ++ ++static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_br_whable(au_sbr_perm(sb, bindex)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * wbr_wh_read_lock, wbr_wh_write_lock ++ * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); ++ ++#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) ++#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) ++#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_BRANCH_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/cpup.c 
linux-2.6.31/fs/aufs/cpup.c +--- linux-2.6.31-vanilla/fs/aufs/cpup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/cpup.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,1048 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * copy-up functions, see wbr_policy.c for copy-down ++ */ ++ ++#include <linux/file.h> ++#include <linux/fs_stack.h> ++#include <linux/mm.h> ++#include <linux/uaccess.h> ++#include "aufs.h" ++ ++void au_cpup_attr_flags(struct inode *dst, struct inode *src) ++{ ++ const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE ++ | S_NOATIME | S_NOCMTIME; ++ ++ dst->i_flags |= src->i_flags & ~mask; ++ if (au_test_fs_notime(dst->i_sb)) ++ dst->i_flags |= S_NOATIME | S_NOCMTIME; ++} ++ ++void au_cpup_attr_timesizes(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ fsstack_copy_attr_times(inode, h_inode); ++ vfsub_copy_inode_size(inode, h_inode); ++} ++ ++void au_cpup_attr_nlink(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend; ++ ++ sb = inode->i_sb; ++ bindex = au_ibstart(inode); ++ h_inode = au_h_iptr(inode, bindex); ++ if (!force ++ && !S_ISDIR(h_inode->i_mode) ++ && au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode)) ++ return; ++ ++ inode->i_nlink = h_inode->i_nlink; ++ ++ /* ++ * fewer nlink makes find(1) noisy, but larger nlink doesn't. ++ * it may includes whplink directory. 
++ */ ++ if (S_ISDIR(h_inode->i_mode)) { ++ bend = au_ibend(inode); ++ for (bindex++; bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) ++ au_add_nlink(inode, h_inode); ++ } ++ } ++} ++ ++void au_cpup_attr_changeable(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ inode->i_mode = h_inode->i_mode; ++ inode->i_uid = h_inode->i_uid; ++ inode->i_gid = h_inode->i_gid; ++ au_cpup_attr_timesizes(inode); ++ au_cpup_attr_flags(inode, h_inode); ++} ++ ++void au_cpup_igen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ iinfo->ii_higen = h_inode->i_generation; ++ iinfo->ii_hsb1 = h_inode->i_sb; ++} ++ ++void au_cpup_attr_all(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ au_cpup_attr_changeable(inode); ++ if (inode->i_nlink > 0) ++ au_cpup_attr_nlink(inode, force); ++ inode->i_rdev = h_inode->i_rdev; ++ inode->i_blkbits = h_inode->i_blkbits; ++ au_cpup_igen(inode, h_inode); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ ++ ++/* keep the timestamps of the parent dir when cpup */ ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path) ++{ ++ struct inode *h_inode; ++ ++ dt->dt_dentry = dentry; ++ dt->dt_h_path = *h_path; ++ h_inode = h_path->dentry->d_inode; ++ dt->dt_atime = h_inode->i_atime; ++ dt->dt_mtime = h_inode->i_mtime; ++ /* smp_mb(); */ ++} ++ ++void au_dtime_revert(struct au_dtime *dt) ++{ ++ struct iattr attr; ++ int err; ++ ++ attr.ia_atime = dt->dt_atime; ++ attr.ia_mtime = dt->dt_mtime; ++ attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET ++ | ATTR_ATIME | ATTR_ATIME_SET; ++ ++ err = vfsub_notify_change(&dt->dt_h_path, &attr); ++ if (unlikely(err)) ++ AuWarn("restoring timestamps failed(%d). ignored\n", err); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static noinline_for_stack ++int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src) ++{ ++ int err, sbits; ++ struct iattr ia; ++ struct path h_path; ++ struct inode *h_isrc, *h_idst; ++ ++ h_path.dentry = au_h_dptr(dst, bindex); ++ h_idst = h_path.dentry->d_inode; ++ h_path.mnt = au_sbr_mnt(dst->d_sb, bindex); ++ h_isrc = h_src->d_inode; ++ ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID ++ | ATTR_ATIME | ATTR_MTIME ++ | ATTR_ATIME_SET | ATTR_MTIME_SET; ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ ia.ia_atime = h_isrc->i_atime; ++ ia.ia_mtime = h_isrc->i_mtime; ++ if (h_idst->i_mode != h_isrc->i_mode ++ && !S_ISLNK(h_idst->i_mode)) { ++ ia.ia_valid |= ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ } ++ sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_idst, h_isrc); ++ err = vfsub_notify_change(&h_path, &ia); ++ ++ /* is this nfs only? 
*/ ++ if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_notify_change(&h_path, &ia); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, ++ char *buf, unsigned long blksize) ++{ ++ int err; ++ size_t sz, rbytes, wbytes; ++ unsigned char all_zero; ++ char *p, *zp; ++ struct mutex *h_mtx; ++ /* reduce stack usage */ ++ struct iattr *ia; ++ ++ zp = page_address(ZERO_PAGE(0)); ++ if (unlikely(!zp)) ++ return -ENOMEM; /* possible? */ ++ ++ err = 0; ++ all_zero = 0; ++ while (len) { ++ AuDbg("len %lld\n", len); ++ sz = blksize; ++ if (len < blksize) ++ sz = len; ++ ++ rbytes = 0; ++ /* todo: signal_pending? */ ++ while (!rbytes || err == -EAGAIN || err == -EINTR) { ++ rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); ++ err = rbytes; ++ } ++ if (unlikely(err < 0)) ++ break; ++ ++ all_zero = 0; ++ if (len >= rbytes && rbytes == blksize) ++ all_zero = !memcmp(buf, zp, rbytes); ++ if (!all_zero) { ++ wbytes = rbytes; ++ p = buf; ++ while (wbytes) { ++ size_t b; ++ ++ b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); ++ err = b; ++ /* todo: signal_pending? */ ++ if (unlikely(err == -EAGAIN || err == -EINTR)) ++ continue; ++ if (unlikely(err < 0)) ++ break; ++ wbytes -= b; ++ p += b; ++ } ++ } else { ++ loff_t res; ++ ++ AuLabel(hole); ++ res = vfsub_llseek(dst, rbytes, SEEK_CUR); ++ err = res; ++ if (unlikely(res < 0)) ++ break; ++ } ++ len -= rbytes; ++ err = 0; ++ } ++ ++ /* the last block may be a hole */ ++ if (!err && all_zero) { ++ AuLabel(last hole); ++ ++ err = 1; ++ if (au_test_nfs(dst->f_dentry->d_sb)) { ++ /* nfs requires this step to make last hole */ ++ /* is this only nfs? */ ++ do { ++ /* todo: signal_pending? */ ++ err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ if (err == 1) ++ dst->f_pos--; ++ } ++ ++ if (err == 1) { ++ ia = (void *)buf; ++ ia->ia_size = dst->f_pos; ++ ia->ia_valid = ATTR_SIZE | ATTR_FILE; ++ ia->ia_file = dst; ++ h_mtx = &dst->f_dentry->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); ++ err = vfsub_notify_change(&dst->f_path, ia); ++ mutex_unlock(h_mtx); ++ } ++ } ++ ++ return err; ++} ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len) ++{ ++ int err; ++ unsigned long blksize; ++ unsigned char do_kfree; ++ char *buf; ++ ++ err = -ENOMEM; ++ blksize = dst->f_dentry->d_sb->s_blocksize; ++ if (!blksize || PAGE_SIZE < blksize) ++ blksize = PAGE_SIZE; ++ AuDbg("blksize %lu\n", blksize); ++ do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); ++ if (do_kfree) ++ buf = kmalloc(blksize, GFP_NOFS); ++ else ++ buf = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!buf)) ++ goto out; ++ ++ if (len > (1 << 22)) ++ AuDbg("copying a large file %lld\n", (long long)len); ++ ++ src->f_pos = 0; ++ dst->f_pos = 0; ++ err = au_do_copy_file(dst, src, len, buf, blksize); ++ if (do_kfree) ++ kfree(buf); ++ else ++ free_page((unsigned long)buf); ++ ++ out: ++ return err; ++} ++ ++/* ++ * to support a sparse file which is opened with O_APPEND, ++ * we need to close the file. 
++ */ ++static int au_cp_regular(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len) ++{ ++ int err, i; ++ enum { SRC, DST }; ++ struct { ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ struct dentry *dentry; ++ struct file *file; ++ void *label, *label_file; ++ } *f, file[] = { ++ { ++ .bindex = bsrc, ++ .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, ++ .file = NULL, ++ .label = &&out, ++ .label_file = &&out_src ++ }, ++ { ++ .bindex = bdst, ++ .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, ++ .file = NULL, ++ .label = &&out_src, ++ .label_file = &&out_dst ++ } ++ }; ++ struct super_block *sb; ++ ++ /* bsrc branch can be ro/rw. */ ++ sb = dentry->d_sb; ++ f = file; ++ for (i = 0; i < 2; i++, f++) { ++ f->dentry = au_h_dptr(dentry, f->bindex); ++ f->file = au_h_open(dentry, f->bindex, f->flags, /*file*/NULL); ++ err = PTR_ERR(f->file); ++ if (IS_ERR(f->file)) ++ goto *f->label; ++ err = -EINVAL; ++ if (unlikely(!f->file->f_op)) ++ goto *f->label_file; ++ } ++ ++ /* try stopping to update while we copyup */ ++ IMustLock(file[SRC].dentry->d_inode); ++ err = au_copy_file(file[DST].file, file[SRC].file, len); ++ ++ out_dst: ++ fput(file[DST].file); ++ au_sbr_put(sb, file[DST].bindex); ++ out_src: ++ fput(file[SRC].file); ++ au_sbr_put(sb, file[SRC].bindex); ++ out: ++ return err; ++} ++ ++static int au_do_cpup_regular(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, ++ struct inode *h_dir, struct path *h_path) ++{ ++ int err, rerr; ++ loff_t l; ++ ++ err = 0; ++ l = i_size_read(au_h_iptr(dentry->d_inode, bsrc)); ++ if (len == -1 || l < len) ++ len = l; ++ if (len) ++ err = au_cp_regular(dentry, bdst, bsrc, len); ++ if (!err) ++ goto out; /* success */ ++ ++ rerr = vfsub_unlink(h_dir, h_path, /*force*/0); ++ if (rerr) { ++ AuIOErr("failed unlinking cpup-ed %.*s(%d, %d)\n", ++ AuDLNPair(h_path->dentry), err, rerr); ++ err = -EIO; ++ } ++ ++ out: ++ return err; ++} ++ ++static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, ++ struct inode *h_dir) ++{ ++ int err, symlen; ++ mm_segment_t old_fs; ++ char *sym; ++ ++ err = -ENOSYS; ++ if (unlikely(!h_src->d_inode->i_op->readlink)) ++ goto out; ++ ++ err = -ENOMEM; ++ sym = __getname(); ++ if (unlikely(!sym)) ++ goto out; ++ ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ symlen = h_src->d_inode->i_op->readlink(h_src, (char __user *)sym, ++ PATH_MAX); ++ err = symlen; ++ set_fs(old_fs); ++ ++ if (symlen > 0) { ++ sym[symlen] = 0; ++ err = vfsub_symlink(h_dir, h_path, sym); ++ } ++ __putname(sym); ++ ++ out: ++ return err; ++} ++ ++/* return with the lower dst inode is locked */ ++static noinline_for_stack ++int cpup_entry(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err; ++ umode_t mode; ++ unsigned int mnt_flags; ++ unsigned char isdir; ++ const unsigned char do_dt = !!au_ftest_cpup(flags, DTIME); ++ struct au_dtime dt; ++ struct path h_path; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *h_inode, *h_dir; ++ struct super_block *sb; ++ ++ /* bsrc branch can be ro/rw. 
*/ ++ h_src = au_h_dptr(dentry, bsrc); ++ h_inode = h_src->d_inode; ++ AuDebugOn(h_inode != au_h_iptr(dentry->d_inode, bsrc)); ++ ++ /* try stopping to be referenced while we are creating */ ++ h_dst = au_h_dptr(dentry, bdst); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ AuDebugOn(h_parent != h_dst->d_parent); ++ ++ sb = dentry->d_sb; ++ h_path.mnt = au_sbr_mnt(sb, bdst); ++ if (do_dt) { ++ h_path.dentry = h_parent; ++ au_dtime_store(&dt, dst_parent, &h_path); ++ } ++ h_path.dentry = h_dst; ++ ++ isdir = 0; ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ /* try stopping to update while we are referencing */ ++ IMustLock(h_inode); ++ err = vfsub_create(h_dir, &h_path, mode | S_IWUSR); ++ if (!err) ++ err = au_do_cpup_regular ++ (dentry, bdst, bsrc, len, ++ au_h_iptr(dst_parent->d_inode, bdst), &h_path); ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ err = vfsub_mkdir(h_dir, &h_path, mode); ++ if (!err) { ++ /* ++ * strange behaviour from the users view, ++ * particularry setattr case ++ */ ++ if (au_ibstart(dst_parent->d_inode) == bdst) ++ au_cpup_attr_nlink(dst_parent->d_inode, ++ /*force*/1); ++ au_cpup_attr_nlink(dentry->d_inode, /*force*/1); ++ } ++ break; ++ case S_IFLNK: ++ err = au_do_cpup_symlink(&h_path, h_src, h_dir); ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ AuDebugOn(!capable(CAP_MKNOD)); ++ /*FALLTHROUGH*/ ++ case S_IFIFO: ++ case S_IFSOCK: ++ err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown inode type 0%o\n", mode); ++ err = -EIO; ++ } ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, UDBA_NONE) ++ && !isdir ++ && au_opt_test(mnt_flags, XINO) ++ && h_inode->i_nlink == 1 ++ /* todo: unnecessary? */ ++ /* && dentry->d_inode->i_nlink == 1 */ ++ && bdst < bsrc ++ && !au_ftest_cpup(flags, KEEPLINO)) ++ au_xino_write(sb, bsrc, h_inode->i_ino, /*ino*/0); ++ /* ignore this error */ ++ ++ if (do_dt) ++ au_dtime_revert(&dt); ++ return err; ++} ++ ++/* ++ * copyup the @dentry from @bsrc to @bdst. ++ * the caller must set the both of lower dentries. ++ * @len is for truncating when it is -1 copyup the entire file. ++ * in link/rename cases, @dst_parent may be different from the real one. 
++ */ ++static int au_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err, rerr; ++ aufs_bindex_t old_ibstart; ++ unsigned char isdir, plink; ++ struct au_dtime dt; ++ struct path h_path; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *dst_inode, *h_dir, *inode; ++ struct super_block *sb; ++ ++ AuDebugOn(bsrc <= bdst); ++ ++ sb = dentry->d_sb; ++ h_path.mnt = au_sbr_mnt(sb, bdst); ++ h_dst = au_h_dptr(dentry, bdst); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ h_src = au_h_dptr(dentry, bsrc); ++ inode = dentry->d_inode; ++ ++ if (!dst_parent) ++ dst_parent = dget_parent(dentry); ++ else ++ dget(dst_parent); ++ ++ plink = !!au_opt_test(au_mntflags(sb), PLINK); ++ dst_inode = au_h_iptr(inode, bdst); ++ if (dst_inode) { ++ if (unlikely(!plink)) { ++ err = -EIO; ++ AuIOErr("i%lu exists on a upper branch " ++ "but plink is disabled\n", inode->i_ino); ++ goto out; ++ } ++ ++ if (dst_inode->i_nlink) { ++ const int do_dt = au_ftest_cpup(flags, DTIME); ++ ++ h_src = au_plink_lkup(inode, bdst); ++ err = PTR_ERR(h_src); ++ if (IS_ERR(h_src)) ++ goto out; ++ if (unlikely(!h_src->d_inode)) { ++ err = -EIO; ++ AuIOErr("i%lu exists on a upper branch " ++ "but plink is broken\n", inode->i_ino); ++ dput(h_src); ++ goto out; ++ } ++ ++ if (do_dt) { ++ h_path.dentry = h_parent; ++ au_dtime_store(&dt, dst_parent, &h_path); ++ } ++ h_path.dentry = h_dst; ++ err = vfsub_link(h_src, h_dir, &h_path); ++ if (do_dt) ++ au_dtime_revert(&dt); ++ dput(h_src); ++ goto out; ++ } else ++ /* todo: cpup_wh_file? */ ++ /* udba work */ ++ au_update_brange(inode, 1); ++ } ++ ++ old_ibstart = au_ibstart(inode); ++ err = cpup_entry(dentry, bdst, bsrc, len, flags, dst_parent); ++ if (unlikely(err)) ++ goto out; ++ dst_inode = h_dst->d_inode; ++ mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2); ++ ++ err = cpup_iattr(dentry, bdst, h_src); ++ isdir = S_ISDIR(dst_inode->i_mode); ++ if (!err) { ++ if (bdst < old_ibstart) ++ au_set_ibstart(inode, bdst); ++ au_set_h_iptr(inode, bdst, au_igrab(dst_inode), ++ au_hi_flags(inode, isdir)); ++ mutex_unlock(&dst_inode->i_mutex); ++ if (!isdir ++ && h_src->d_inode->i_nlink > 1 ++ && plink) ++ au_plink_append(inode, bdst, h_dst); ++ goto out; /* success */ ++ } ++ ++ /* revert */ ++ h_path.dentry = h_parent; ++ mutex_unlock(&dst_inode->i_mutex); ++ au_dtime_store(&dt, dst_parent, &h_path); ++ h_path.dentry = h_dst; ++ if (!isdir) ++ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ else ++ rerr = vfsub_rmdir(h_dir, &h_path); ++ au_dtime_revert(&dt); ++ if (rerr) { ++ AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); ++ err = -EIO; ++ } ++ ++ out: ++ dput(dst_parent); ++ return err; ++} ++ ++struct au_cpup_single_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst, bsrc; ++ loff_t len; ++ unsigned int flags; ++ struct dentry *dst_parent; ++}; ++ ++static void au_call_cpup_single(void *args) ++{ ++ struct au_cpup_single_args *a = args; ++ *a->errp = au_cpup_single(a->dentry, a->bdst, a->bsrc, a->len, ++ a->flags, a->dst_parent); ++} ++ ++int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err, wkq_err; ++ umode_t mode; ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bsrc); ++ mode = h_dentry->d_inode->i_mode & S_IFMT; ++ if ((mode != S_IFCHR && mode != 
S_IFBLK) ++ || capable(CAP_MKNOD)) ++ err = au_cpup_single(dentry, bdst, bsrc, len, flags, ++ dst_parent); ++ else { ++ struct au_cpup_single_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .bsrc = bsrc, ++ .len = len, ++ .flags = flags, ++ .dst_parent = dst_parent ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_single, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ++ * copyup the @dentry from the first active lower branch to @bdst, ++ * using au_cpup_single(). ++ */ ++static int au_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags) ++{ ++ int err; ++ aufs_bindex_t bsrc, bend; ++ ++ bend = au_dbend(dentry); ++ for (bsrc = bdst + 1; bsrc <= bend; bsrc++) ++ if (au_h_dptr(dentry, bsrc)) ++ break; ++ ++ err = au_lkup_neg(dentry, bdst); ++ if (!err) { ++ err = au_cpup_single(dentry, bdst, bsrc, len, flags, NULL); ++ if (!err) ++ return 0; /* success */ ++ ++ /* revert */ ++ au_set_h_dptr(dentry, bdst, NULL); ++ au_set_dbstart(dentry, bsrc); ++ } ++ ++ return err; ++} ++ ++struct au_cpup_simple_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst; ++ loff_t len; ++ unsigned int flags; ++}; ++ ++static void au_call_cpup_simple(void *args) ++{ ++ struct au_cpup_simple_args *a = args; ++ *a->errp = au_cpup_simple(a->dentry, a->bdst, a->len, a->flags); ++} ++ ++int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags) ++{ ++ int err, wkq_err; ++ unsigned char do_sio; ++ struct dentry *parent; ++ struct inode *h_dir; ++ ++ parent = dget_parent(dentry); ++ h_dir = au_h_iptr(parent->d_inode, bdst); ++ do_sio = !!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) { ++ /* ++ * testing CAP_MKNOD is for generic fs, ++ * but CAP_FSETID is for xfs only, currently. ++ */ ++ umode_t mode = dentry->d_inode->i_mode; ++ do_sio = (((mode & (S_IFCHR | S_IFBLK)) ++ && !capable(CAP_MKNOD)) ++ || ((mode & (S_ISUID | S_ISGID)) ++ && !capable(CAP_FSETID))); ++ } ++ if (!do_sio) ++ err = au_cpup_simple(dentry, bdst, len, flags); ++ else { ++ struct au_cpup_simple_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .len = len, ++ .flags = flags ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_simple, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * copyup the deleted file for writing. 
++ */ ++static int au_do_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *wh_dentry, struct file *file, ++ loff_t len) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_dinfo *dinfo; ++ struct dentry *h_d_dst, *h_d_start; ++ ++ dinfo = au_di(dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bstart = dinfo->di_bstart; ++ h_d_dst = dinfo->di_hdentry[0 + bdst].hd_dentry; ++ dinfo->di_bstart = bdst; ++ dinfo->di_hdentry[0 + bdst].hd_dentry = wh_dentry; ++ h_d_start = dinfo->di_hdentry[0 + bstart].hd_dentry; ++ if (file) ++ dinfo->di_hdentry[0 + bstart].hd_dentry ++ = au_h_fptr(file, au_fbstart(file))->f_dentry; ++ err = au_cpup_single(dentry, bdst, bstart, len, !AuCpup_DTIME, ++ /*h_parent*/NULL); ++ if (!err && file) { ++ err = au_reopen_nondir(file); ++ dinfo->di_hdentry[0 + bstart].hd_dentry = h_d_start; ++ } ++ dinfo->di_hdentry[0 + bdst].hd_dentry = h_d_dst; ++ dinfo->di_bstart = bstart; ++ ++ return err; ++} ++ ++static int au_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file) ++{ ++ int err; ++ struct au_dtime dt; ++ struct dentry *parent, *h_parent, *wh_dentry; ++ struct au_branch *br; ++ struct path h_path; ++ ++ br = au_sbr(dentry->d_sb, bdst); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ h_path.dentry = h_parent; ++ h_path.mnt = br->br_mnt; ++ au_dtime_store(&dt, parent, &h_path); ++ err = au_do_cpup_wh(dentry, bdst, wh_dentry, file, len); ++ if (unlikely(err)) ++ goto out_wh; ++ ++ dget(wh_dentry); ++ h_path.dentry = wh_dentry; ++ err = vfsub_unlink(h_parent->d_inode, &h_path, /*force*/0); ++ if (unlikely(err)) { ++ AuIOErr("failed remove copied-up tmp file %.*s(%d)\n", ++ AuDLNPair(wh_dentry), err); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++ au_set_hi_wh(dentry->d_inode, bdst, wh_dentry); ++ ++ out_wh: ++ dput(wh_dentry); ++ out: ++ dput(parent); ++ return err; ++} ++ ++struct au_cpup_wh_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst; ++ loff_t len; ++ struct file *file; ++}; ++ ++static void au_call_cpup_wh(void *args) ++{ ++ struct au_cpup_wh_args *a = args; ++ *a->errp = au_cpup_wh(a->dentry, a->bdst, a->len, a->file); ++} ++ ++int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file) ++{ ++ int err, wkq_err; ++ struct dentry *parent, *h_orph, *h_parent, *h_dentry; ++ struct inode *dir, *h_dir, *h_tmpdir, *h_inode; ++ struct au_wbr *wbr; ++ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ h_orph = NULL; ++ h_parent = NULL; ++ h_dir = au_igrab(au_h_iptr(dir, bdst)); ++ h_tmpdir = h_dir; ++ if (!h_dir->i_nlink) { ++ wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; ++ h_orph = wbr->wbr_orph; ++ ++ h_parent = dget(au_h_dptr(parent, bdst)); ++ au_set_h_dptr(parent, bdst, NULL); ++ au_set_h_dptr(parent, bdst, dget(h_orph)); ++ h_tmpdir = h_orph->d_inode; ++ au_set_h_iptr(dir, bdst, NULL, 0); ++ au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); ++ ++ /* this temporary unlock is safe */ ++ if (file) ++ h_dentry = au_h_fptr(file, au_fbstart(file))->f_dentry; ++ else ++ h_dentry = au_h_dptr(dentry, au_dbstart(dentry)); ++ h_inode = h_dentry->d_inode; ++ IMustLock(h_inode); ++ mutex_unlock(&h_inode->i_mutex); ++ mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3); ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ } ++ ++ if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE)) 
++ err = au_cpup_wh(dentry, bdst, len, file); ++ else { ++ struct au_cpup_wh_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .len = len, ++ .file = file ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_wh, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ if (h_orph) { ++ mutex_unlock(&h_tmpdir->i_mutex); ++ au_set_h_iptr(dir, bdst, NULL, 0); ++ au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); ++ au_set_h_dptr(parent, bdst, NULL); ++ au_set_h_dptr(parent, bdst, h_parent); ++ } ++ iput(h_dir); ++ dput(parent); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generic routine for both of copy-up and copy-down. ++ */ ++/* cf. revalidate function in file.c */ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg), ++ void *arg) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *d, *parent, *h_parent, *real_parent; ++ ++ err = 0; ++ parent = dget_parent(dentry); ++ if (IS_ROOT(parent)) ++ goto out; ++ ++ au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, ++ au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); ++ ++ /* do not use au_dpage */ ++ real_parent = parent; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ if (h_parent) ++ goto out; /* success */ ++ ++ /* find top dir which is necessary to cpup */ ++ do { ++ d = parent; ++ dput(parent); ++ parent = dget_parent(d); ++ di_read_lock_parent3(parent, !AuLock_IR); ++ h_parent = au_h_dptr(parent, bdst); ++ di_read_unlock(parent, !AuLock_IR); ++ } while (!h_parent); ++ ++ if (d != real_parent) ++ di_write_lock_child3(d); ++ ++ /* somebody else might create while we were sleeping */ ++ if (!au_h_dptr(d, bdst) || !au_h_dptr(d, bdst)->d_inode) { ++ if (au_h_dptr(d, bdst)) ++ au_update_dbstart(d); ++ ++ au_pin_set_dentry(&pin, d); ++ err = au_do_pin(&pin); ++ if (!err) { ++ err = cp(d, bdst, h_parent, arg); ++ au_unpin(&pin); ++ } ++ } ++ ++ if (d != real_parent) ++ di_write_unlock(d); ++ if (unlikely(err)) ++ break; ++ } ++ ++ out: ++ dput(parent); ++ return err; ++} ++ ++static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent __maybe_unused , ++ void *arg __maybe_unused) ++{ ++ return au_sio_cpup_simple(dentry, bdst, -1, AuCpup_DTIME); ++} ++ ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); ++} ++ ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ struct dentry *parent; ++ struct inode *dir; ++ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ err = 0; ++ if (au_h_iptr(dir, bdst)) ++ goto out; ++ ++ di_read_unlock(parent, AuLock_IR); ++ di_write_lock_parent(parent); ++ /* someone else might change our inode while we were sleeping */ ++ if (!au_h_iptr(dir, bdst)) ++ err = au_cpup_dirs(dentry, bdst); ++ di_downgrade_lock(parent, AuLock_IR); ++ ++ out: ++ dput(parent); ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/cpup.h linux-2.6.31/fs/aufs/cpup.h +--- linux-2.6.31-vanilla/fs/aufs/cpup.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/cpup.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,81 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * copy-up/down functions ++ */ ++ ++#ifndef __AUFS_CPUP_H__ ++#define __AUFS_CPUP_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/path.h> ++#include <linux/time.h> ++#include <linux/aufs_type.h> ++ ++struct inode; ++struct file; ++ ++void au_cpup_attr_flags(struct inode *dst, struct inode *src); ++void au_cpup_attr_timesizes(struct inode *inode); ++void au_cpup_attr_nlink(struct inode *inode, int force); ++void au_cpup_attr_changeable(struct inode *inode); ++void au_cpup_igen(struct inode *inode, struct inode *h_inode); ++void au_cpup_attr_all(struct inode *inode, int force); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* cpup flags */ ++#define AuCpup_DTIME 1 /* do dtime_store/revert */ ++#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, ++ for link(2) */ ++#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) ++#define au_fset_cpup(flags, name) { (flags) |= AuCpup_##name; } ++#define au_fclr_cpup(flags, name) { (flags) &= ~AuCpup_##name; } ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len); ++int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent); ++int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags); ++int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file); ++ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg), ++ void *arg); ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* keep timestamps when copyup */ ++struct au_dtime { ++ struct dentry *dt_dentry; ++ struct path dt_h_path; ++ struct timespec dt_atime, dt_mtime; ++}; ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path); ++void au_dtime_revert(struct au_dtime *dt); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_CPUP_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/dbgaufs.c linux-2.6.31/fs/aufs/dbgaufs.c +--- linux-2.6.31-vanilla/fs/aufs/dbgaufs.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dbgaufs.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,331 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#include <linux/debugfs.h> ++#include "aufs.h" ++ ++#ifndef CONFIG_SYSFS ++#error DEBUG_FS depends upon SYSFS ++#endif ++ ++static struct dentry *dbgaufs; ++static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; ++ ++/* 20 is max digits length of ulong 64 */ ++struct dbgaufs_arg { ++ int n; ++ char a[20 * 4]; ++}; ++ ++/* ++ * common function for all XINO files ++ */ ++static int dbgaufs_xi_release(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) ++{ ++ int err; ++ struct kstat st; ++ struct dbgaufs_arg *p; ++ ++ err = -ENOMEM; ++ p = kmalloc(sizeof(*p), GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ p->n = 0; ++ file->private_data = p; ++ if (!xf) ++ goto out; ++ ++ err = vfs_getattr(xf->f_vfsmnt, xf->f_dentry, &st); ++ if (!err) { ++ if (do_fcnt) ++ p->n = snprintf ++ (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", ++ (long)file_count(xf), st.blocks, st.blksize, ++ (long long)st.size); ++ else ++ p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", ++ st.blocks, st.blksize, ++ (long long)st.size); ++ AuDebugOn(p->n >= sizeof(p->a)); ++ } else { ++ p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); ++ err = 0; ++ } ++ ++ out: ++ return err; ++ ++} ++ ++static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct dbgaufs_arg *p; ++ ++ p = file->private_data; ++ return simple_read_from_buffer(buf, count, ppos, p->a, p->n); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int dbgaufs_xib_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xib_fop = { ++ .open = dbgaufs_xib_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define DbgaufsXi_PREFIX "xi" ++ ++static int dbgaufs_xino_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ long l; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct file *xf; ++ struct qstr *name; ++ ++ err = -ENOENT; ++ xf = NULL; ++ name = &file->f_dentry->d_name; ++ if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) ++ || memcmp(name->name, DbgaufsXi_PREFIX, ++ sizeof(DbgaufsXi_PREFIX) - 1))) ++ goto out; ++ err = strict_strtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); ++ if (unlikely(err)) ++ goto out; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ if (l <= au_sbend(sb)) { ++ xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; ++ err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); ++ } else ++ err = -ENOENT; ++ 
si_read_unlock(sb); ++ ++ out: ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xino_fop = { ++ .open = dbgaufs_xino_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ aufs_bindex_t bend; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ ++ if (!au_sbi(sb)->si_dbgaufs) ++ return; ++ ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ if (xi->xi_dbgaufs) { ++ debugfs_remove(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = NULL; ++ } ++ } ++} ++ ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_sbinfo *sbinfo; ++ struct dentry *parent; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ aufs_bindex_t bend; ++ char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NUL */ ++ ++ sbinfo = au_sbi(sb); ++ parent = sbinfo->si_dbgaufs; ++ if (!parent) ++ return; ++ ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ AuDebugOn(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, ++ sbinfo, &dbgaufs_xino_fop); ++ /* ignore an error */ ++ if (unlikely(!xi->xi_dbgaufs)) ++ AuWarn1("failed %s under debugfs\n", name); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++static int dbgaufs_xigen_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xigen_fop = { ++ .open = dbgaufs_xigen_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ /* ++ * This function is actually a dynamic '__init' function, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -EIO; ++ sbinfo->si_dbgaufs_xigen = debugfs_create_file ++ ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xigen_fop); ++ if (sbinfo->si_dbgaufs_xigen) ++ err = 0; ++ ++ return err; ++} ++#else ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ return 0; ++} ++#endif /* CONFIG_AUFS_EXPORT */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo) ++{ ++ /* ++ * This function is actually a dynamic '__init' function, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ debugfs_remove_recursive(sbinfo->si_dbgaufs); ++ sbinfo->si_dbgaufs = NULL; ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++int dbgaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ char name[SysaufsSiNameLen]; ++ ++ /* ++ * This function is actually a dynamic '__init' function, ++ * so the tiny check for si_rwsem is unnecessary.
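++ * (it runs while the superblock is still being set up, before other ++ * threads can reach this sbinfo)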
++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -ENOENT; ++ if (!dbgaufs) { ++ AuErr1("/debug/aufs is uninitialized\n"); ++ goto out; ++ } ++ ++ err = -EIO; ++ sysaufs_name(sbinfo, name); ++ sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); ++ if (unlikely(!sbinfo->si_dbgaufs)) ++ goto out; ++ kobject_get(&sbinfo->si_kobj); ++ ++ sbinfo->si_dbgaufs_xib = debugfs_create_file ++ ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xib_fop); ++ if (unlikely(!sbinfo->si_dbgaufs_xib)) ++ goto out_dir; ++ ++ err = dbgaufs_xigen_init(sbinfo); ++ if (!err) ++ goto out; /* success */ ++ ++ out_dir: ++ dbgaufs_si_fin(sbinfo); ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_fin(void) ++{ ++ debugfs_remove(dbgaufs); ++} ++ ++int __init dbgaufs_init(void) ++{ ++ int err; ++ ++ err = -EIO; ++ dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); ++ if (dbgaufs) ++ err = 0; ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/dbgaufs.h linux-2.6.31/fs/aufs/dbgaufs.h +--- linux-2.6.31-vanilla/fs/aufs/dbgaufs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dbgaufs.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,79 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#ifndef __DBGAUFS_H__ ++#define __DBGAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/init.h> ++#include <linux/aufs_type.h> ++ ++struct super_block; ++struct au_sbinfo; ++ ++#ifdef CONFIG_DEBUG_FS ++/* dbgaufs.c */ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo); ++int dbgaufs_si_init(struct au_sbinfo *sbinfo); ++void dbgaufs_fin(void); ++int __init dbgaufs_init(void); ++ ++#else ++ ++static inline ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ /* empty */ ++} ++ ++static inline ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ /* empty */ ++} ++ ++static inline ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo) ++{ ++ /* empty */ ++} ++ ++static inline ++int dbgaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ return 0; ++} ++ ++#define dbgaufs_fin() do {} while (0) ++ ++static inline ++int __init dbgaufs_init(void) ++{ ++ return 0; ++} ++#endif /* CONFIG_DEBUG_FS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __DBGAUFS_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/dcsub.c linux-2.6.31/fs/aufs/dcsub.c +--- linux-2.6.31-vanilla/fs/aufs/dcsub.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dcsub.c 2009-09-16 13:55:29.000000000 +0200 +@@ -0,0 +1,223 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#include "aufs.h" ++ ++static void au_dpage_free(struct au_dpage *dpage) ++{ ++ int i; ++ struct dentry **p; ++ ++ p = dpage->dentries; ++ for (i = 0; i < dpage->ndentry; i++) ++ dput(*p++); ++ free_page((unsigned long)dpage->dentries); ++} ++ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) ++{ ++ int err; ++ void *p; ++ ++ err = -ENOMEM; ++ dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); ++ if (unlikely(!dpages->dpages)) ++ goto out; ++ ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out_dpages; ++ ++ dpages->dpages[0].ndentry = 0; ++ dpages->dpages[0].dentries = p; ++ dpages->ndpage = 1; ++ return 0; /* success */ ++ ++ out_dpages: ++ kfree(dpages->dpages); ++ out: ++ return err; ++} ++ ++void au_dpages_free(struct au_dcsub_pages *dpages) ++{ ++ int i; ++ struct au_dpage *p; ++ ++ p = dpages->dpages; ++ for (i = 0; i < dpages->ndpage; i++) ++ au_dpage_free(p++); ++ kfree(dpages->dpages); ++} ++ ++static int au_dpages_append(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, gfp_t gfp) ++{ ++ int err, sz; ++ struct au_dpage *dpage; ++ void *p; ++ ++ dpage = dpages->dpages + dpages->ndpage - 1; ++ sz = PAGE_SIZE / sizeof(dentry); ++ if (unlikely(dpage->ndentry >= sz)) { ++ AuLabel(new dpage); ++ err = -ENOMEM; ++ sz = dpages->ndpage * sizeof(*dpages->dpages); ++ p = au_kzrealloc(dpages->dpages, sz, ++ sz + sizeof(*dpages->dpages), gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpages->dpages = p; ++ dpage = dpages->dpages + dpages->ndpage; ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpage->ndentry = 0; ++ dpage->dentries = p; ++ dpages->ndpage++; ++ } ++ ++ dpage->dentries[dpage->ndentry++] = dget(dentry); ++ return 0; /* success */ ++ ++ out: ++ return err; ++} ++ ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg) ++{ ++ int err; ++ struct dentry *this_parent = root; ++ struct list_head *next; ++ struct super_block *sb = root->d_sb; ++ ++ err = 0; ++ spin_lock(&dcache_lock); ++ repeat: ++ next = this_parent->d_subdirs.next; ++ resume: ++ if (this_parent->d_sb == sb ++ && !IS_ROOT(this_parent) ++ && atomic_read(&this_parent->d_count) ++ && this_parent->d_inode ++ && (!test || test(this_parent, arg))) { ++ err = au_dpages_append(dpages, this_parent, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ while (next != &this_parent->d_subdirs) { ++ struct list_head *tmp = next; ++ struct dentry *dentry = list_entry(tmp, struct dentry, ++ d_u.d_child); ++ next = tmp->next; ++ if (/*d_unhashed(dentry) || */!dentry->d_inode) ++ continue; ++ if (!list_empty(&dentry->d_subdirs)) { ++ this_parent = dentry; ++ goto repeat; ++ } ++ if (dentry->d_sb == sb ++ && 
atomic_read(&dentry->d_count) ++ && (!test || test(dentry, arg))) { ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto out; ++ } ++ } ++ ++ if (this_parent != root) { ++ next = this_parent->d_u.d_child.next; ++ this_parent = this_parent->d_parent; /* dcache_lock is locked */ ++ goto resume; ++ } ++ out: ++ spin_unlock(&dcache_lock); ++ return err; ++} ++ ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg) ++{ ++ int err; ++ ++ err = 0; ++ spin_lock(&dcache_lock); ++ if (do_include && (!test || test(dentry, arg))) { ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto out; ++ } ++ while (!IS_ROOT(dentry)) { ++ dentry = dentry->d_parent; /* dcache_lock is locked */ ++ if (!test || test(dentry, arg)) { ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ if (unlikely(err)) ++ break; ++ } ++ } ++ ++ out: ++ spin_unlock(&dcache_lock); ++ ++ return err; ++} ++ ++struct dentry *au_test_subdir(struct dentry *d1, struct dentry *d2) ++{ ++ struct dentry *trap, **dentries; ++ int err, i, j; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ ++ trap = ERR_PTR(-ENOMEM); ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages_rev(&dpages, d1, /*do_include*/1, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ trap = d1; ++ for (i = 0; !err && i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ for (j = 0; !err && j < dpage->ndentry; j++) { ++ struct dentry *d; ++ ++ d = dentries[j]; ++ err = (d == d2); ++ if (!err) ++ trap = d; ++ } ++ } ++ if (!err) ++ trap = NULL; ++ ++ out_dpages: ++ au_dpages_free(&dpages); ++ out: ++ return trap; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/dcsub.h linux-2.6.31/fs/aufs/dcsub.h +--- linux-2.6.31-vanilla/fs/aufs/dcsub.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dcsub.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,54 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#ifndef __AUFS_DCSUB_H__ ++#define __AUFS_DCSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/types.h> ++ ++struct dentry; ++ ++struct au_dpage { ++ int ndentry; ++ struct dentry **dentries; ++}; ++ ++struct au_dcsub_pages { ++ int ndpage; ++ struct au_dpage *dpages; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); ++void au_dpages_free(struct au_dcsub_pages *dpages); ++typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg); ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg); ++struct dentry *au_test_subdir(struct dentry *d1, struct dentry *d2); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DCSUB_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/debug.c linux-2.6.31/fs/aufs/debug.c +--- linux-2.6.31-vanilla/fs/aufs/debug.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/debug.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,431 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#include <linux/module.h> ++#include <linux/vt_kern.h> ++#include "aufs.h" ++ ++int aufs_debug; ++MODULE_PARM_DESC(debug, "debug print"); ++module_param_named(debug, aufs_debug, int, S_IRUGO | S_IWUSR | S_IWGRP); ++ ++char *au_plevel = KERN_DEBUG; ++#define dpri(fmt, arg...) 
do { \ ++ if (au_debug_test()) \ ++ printk("%s" fmt, au_plevel, ##arg); \ ++} while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dpri_whlist(struct au_nhash *whlist) ++{ ++ unsigned long ul, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; ul < n; ul++) { ++ hlist_for_each_entry(tpos, pos, head, wh_hash) ++ dpri("b%d, %.*s, %d\n", ++ tpos->wh_bindex, ++ tpos->wh_str.len, tpos->wh_str.name, ++ tpos->wh_str.len); ++ head++; ++ } ++} ++ ++void au_dpri_vdir(struct au_vdir *vdir) ++{ ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ unsigned char *o; ++ ++ if (!vdir || IS_ERR(vdir)) { ++ dpri("err %ld\n", PTR_ERR(vdir)); ++ return; ++ } ++ ++ dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", ++ vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, ++ vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); ++ for (ul = 0; ul < vdir->vd_nblk; ul++) { ++ p.deblk = vdir->vd_deblk[ul]; ++ o = p.deblk; ++ dpri("[%lu]: %p\n", ul, o); ++ } ++} ++ ++static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, ++ struct dentry *wh) ++{ ++ char *n = NULL; ++ int l = 0; ++ ++ if (!inode || IS_ERR(inode)) { ++ dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); ++ return -1; ++ } ++ ++ /* the type of i_blocks depends upon CONFIG_LSF */ ++ BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) ++ && sizeof(inode->i_blocks) != sizeof(u64)); ++ if (wh) { ++ n = (void *)wh->d_name.name; ++ l = wh->d_name.len; ++ } ++ ++ dpri("i%d: i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," ++ " ct %lld, np %lu, st 0x%lx, f 0x%x, g %x%s%.*s\n", ++ bindex, ++ inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", ++ atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, ++ i_size_read(inode), (unsigned long long)inode->i_blocks, ++ (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, ++ inode->i_mapping ? inode->i_mapping->nrpages : 0, ++ inode->i_state, inode->i_flags, inode->i_generation, ++ l ? ", wh " : "", l, n); ++ return 0; ++} ++ ++void au_dpri_inode(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ aufs_bindex_t bindex; ++ int err; ++ ++ err = do_pri_inode(-1, inode, NULL); ++ if (err || !au_test_aufs(inode->i_sb)) ++ return; ++ ++ iinfo = au_ii(inode); ++ if (!iinfo) ++ return; ++ dpri("i-1: bstart %d, bend %d, gen %d\n", ++ iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode)); ++ if (iinfo->ii_bstart < 0) ++ return; ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) ++ do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, ++ iinfo->ii_hinode[0 + bindex].hi_whdentry); ++} ++ ++static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) ++{ ++ struct dentry *wh = NULL; ++ ++ if (!dentry || IS_ERR(dentry)) { ++ dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); ++ return -1; ++ } ++ /* do not call dget_parent() here */ ++ dpri("d%d: %.*s?/%.*s, %s, cnt %d, flags 0x%x\n", ++ bindex, ++ AuDLNPair(dentry->d_parent), AuDLNPair(dentry), ++ dentry->d_sb ? 
au_sbtype(dentry->d_sb) : "??", ++ atomic_read(&dentry->d_count), dentry->d_flags); ++ if (bindex >= 0 && dentry->d_inode && au_test_aufs(dentry->d_sb)) { ++ struct au_iinfo *iinfo = au_ii(dentry->d_inode); ++ if (iinfo) ++ wh = iinfo->ii_hinode[0 + bindex].hi_whdentry; ++ } ++ do_pri_inode(bindex, dentry->d_inode, wh); ++ return 0; ++} ++ ++void au_dpri_dentry(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ aufs_bindex_t bindex; ++ int err; ++ ++ err = do_pri_dentry(-1, dentry); ++ if (err || !au_test_aufs(dentry->d_sb)) ++ return; ++ ++ dinfo = au_di(dentry); ++ if (!dinfo) ++ return; ++ dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d\n", ++ dinfo->di_bstart, dinfo->di_bend, ++ dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry)); ++ if (dinfo->di_bstart < 0) ++ return; ++ for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++) ++ do_pri_dentry(bindex, dinfo->di_hdentry[0 + bindex].hd_dentry); ++} ++ ++static int do_pri_file(aufs_bindex_t bindex, struct file *file) ++{ ++ char a[32]; ++ ++ if (!file || IS_ERR(file)) { ++ dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); ++ return -1; ++ } ++ a[0] = 0; ++ if (bindex < 0 ++ && file->f_dentry ++ && au_test_aufs(file->f_dentry->d_sb) ++ && au_fi(file)) ++ snprintf(a, sizeof(a), ", mmapped %d", au_test_mmapped(file)); ++ dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, pos %llu%s\n", ++ bindex, file->f_mode, file->f_flags, (long)file_count(file), ++ file->f_pos, a); ++ if (file->f_dentry) ++ do_pri_dentry(bindex, file->f_dentry); ++ return 0; ++} ++ ++void au_dpri_file(struct file *file) ++{ ++ struct au_finfo *finfo; ++ aufs_bindex_t bindex; ++ int err; ++ ++ err = do_pri_file(-1, file); ++ if (err || !file->f_dentry || !au_test_aufs(file->f_dentry->d_sb)) ++ return; ++ ++ finfo = au_fi(file); ++ if (!finfo) ++ return; ++ if (finfo->fi_bstart < 0) ++ return; ++ for (bindex = finfo->fi_bstart; bindex <= finfo->fi_bend; bindex++) { ++ struct au_hfile *hf; ++ ++ hf = finfo->fi_hfile + bindex; ++ do_pri_file(bindex, hf ? 
hf->hf_file : NULL); ++ } ++} ++ ++static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) ++{ ++ struct vfsmount *mnt; ++ struct super_block *sb; ++ ++ if (!br || IS_ERR(br)) ++ goto out; ++ mnt = br->br_mnt; ++ if (!mnt || IS_ERR(mnt)) ++ goto out; ++ sb = mnt->mnt_sb; ++ if (!sb || IS_ERR(sb)) ++ goto out; ++ ++ dpri("s%d: {perm 0x%x, cnt %d, wbr %p}, " ++ "%s, dev 0x%02x%02x, flags 0x%lx, cnt(BIAS) %d, active %d, " ++ "xino %d\n", ++ bindex, br->br_perm, atomic_read(&br->br_count), br->br_wbr, ++ au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), ++ sb->s_flags, sb->s_count - S_BIAS, ++ atomic_read(&sb->s_active), !!br->br_xino.xi_file); ++ return 0; ++ ++ out: ++ dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); ++ return -1; ++} ++ ++void au_dpri_sb(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ aufs_bindex_t bindex; ++ int err; ++ /* to reduce stack size */ ++ struct { ++ struct vfsmount mnt; ++ struct au_branch fake; ++ } *a; ++ ++ /* this function can be called from magic sysrq */ ++ a = kzalloc(sizeof(*a), GFP_ATOMIC); ++ if (unlikely(!a)) { ++ dpri("no memory\n"); ++ return; ++ } ++ ++ a->mnt.mnt_sb = sb; ++ a->fake.br_perm = 0; ++ a->fake.br_mnt = &a->mnt; ++ a->fake.br_xino.xi_file = NULL; ++ atomic_set(&a->fake.br_count, 0); ++ smp_mb(); /* atomic_set */ ++ err = do_pri_br(-1, &a->fake); ++ kfree(a); ++ dpri("dev 0x%x\n", sb->s_dev); ++ if (err || !au_test_aufs(sb)) ++ return; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ dpri("nw %d, gen %u, kobj %d\n", ++ atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, ++ atomic_read(&sbinfo->si_kobj.kref.refcount)); ++ for (bindex = 0; bindex <= sbinfo->si_bend; bindex++) ++ do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dbg_sleep_jiffy(int jiffy) ++{ ++ while (jiffy) ++ jiffy = schedule_timeout_uninterruptible(jiffy); ++} ++ ++void au_dbg_iattr(struct iattr *ia) ++{ ++#define AuBit(name) if (ia->ia_valid & ATTR_ ## name) \ ++ dpri(#name "\n") ++ AuBit(MODE); ++ AuBit(UID); ++ AuBit(GID); ++ AuBit(SIZE); ++ AuBit(ATIME); ++ AuBit(MTIME); ++ AuBit(CTIME); ++ AuBit(ATIME_SET); ++ AuBit(MTIME_SET); ++ AuBit(FORCE); ++ AuBit(ATTR_FLAG); ++ AuBit(KILL_SUID); ++ AuBit(KILL_SGID); ++ AuBit(FILE); ++ AuBit(KILL_PRIV); ++ AuBit(OPEN); ++ AuBit(TIMES_SET); ++#undef AuBit ++ dpri("ia_file %p\n", ia->ia_file); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen) ++{ ++ struct dentry *parent; ++ ++ parent = dget_parent(dentry); ++ AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode) ++ || IS_ROOT(dentry) ++ || au_digen(parent) != sigen); ++ dput(parent); ++} ++ ++void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen) ++{ ++ struct dentry *parent; ++ ++ parent = dget_parent(dentry); ++ AuDebugOn(S_ISDIR(dentry->d_inode->i_mode) ++ || au_digen(parent) != sigen); ++ dput(parent); ++} ++ ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) ++{ ++ int err, i, j; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ AuDebugOn(err); ++ err = au_dcsub_pages_rev(&dpages, parent, /*do_include*/1, NULL, NULL); ++ AuDebugOn(err); ++ for (i = dpages.ndpage - 1; !err && i >= 0; i--) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ for (j = dpage->ndentry - 1; !err && j >= 0; j--) ++
AuDebugOn(au_digen(dentries[j]) != sigen); ++ } ++ au_dpages_free(&dpages); ++} ++ ++void au_dbg_verify_hf(struct au_finfo *finfo) ++{ ++ struct au_hfile *hf; ++ aufs_bindex_t bend, bindex; ++ ++ if (finfo->fi_bstart >= 0) { ++ bend = finfo->fi_bend; ++ for (bindex = finfo->fi_bstart; bindex <= bend; bindex++) { ++ hf = finfo->fi_hfile + bindex; ++ AuDebugOn(hf->hf_file || hf->hf_br); ++ } ++ } ++} ++ ++void au_dbg_verify_kthread(void) ++{ ++ if (au_test_wkq(current)) { ++ au_dbg_blocked(); ++ BUG(); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_debug_sbinfo_init(struct au_sbinfo *sbinfo __maybe_unused) ++{ ++#ifdef AuForceNoPlink ++ au_opt_clr(sbinfo->si_mntflags, PLINK); ++#endif ++#ifdef AuForceNoXino ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++#endif ++#ifdef AuForceNoRefrof ++ au_opt_clr(sbinfo->si_mntflags, REFROF); ++#endif ++#ifdef AuForceHinotify ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_HINOTIFY); ++#endif ++#ifdef AuForceRd0 ++ sbinfo->si_rdblk = 0; ++ sbinfo->si_rdhash = 0; ++#endif ++} ++ ++int __init au_debug_init(void) ++{ ++ aufs_bindex_t bindex; ++ struct au_vdir_destr destr; ++ ++ bindex = -1; ++ AuDebugOn(bindex >= 0); ++ ++ destr.len = -1; ++ AuDebugOn(destr.len < NAME_MAX); ++ ++#ifdef CONFIG_4KSTACKS ++ AuWarn("CONFIG_4KSTACKS is defined.\n"); ++#endif ++ ++#ifdef AuForceNoBrs ++ sysaufs_brs = 0; ++#endif ++ ++ return 0; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/debug.h linux-2.6.31/fs/aufs/debug.h +--- linux-2.6.31-vanilla/fs/aufs/debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/debug.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,263 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#ifndef __AUFS_DEBUG_H__ ++#define __AUFS_DEBUG_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <asm/system.h> ++#include <linux/bug.h> ++/* #include <linux/err.h> */ ++#include <linux/init.h> ++/* #include <linux/kernel.h> */ ++#include <linux/delay.h> ++/* #include <linux/kd.h> */ ++/* #include <linux/vt_kern.h> */ ++#include <linux/sysrq.h> ++#include <linux/aufs_type.h> ++ ++#include <asm/system.h> ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDebugOn(a) BUG_ON(a) ++ ++/* module parameter */ ++extern int aufs_debug; ++static inline void au_debug(int n) ++{ ++ aufs_debug = n; ++ smp_mb(); ++} ++ ++static inline int au_debug_test(void) ++{ ++ return aufs_debug; ++} ++#else ++#define AuDebugOn(a) do {} while (0) ++#define au_debug(n) do {} while (0) ++static inline int au_debug_test(void) ++{ ++ return 0; ++} ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* debug print */ ++ ++#define AuDpri(lvl, fmt, arg...)
\ ++ printk(lvl AUFS_NAME " %s:%d:%s[%d]: " fmt, \ ++ __func__, __LINE__, current->comm, current->pid, ##arg) ++#define AuDbg(fmt, arg...) do { \ ++ if (au_debug_test()) \ ++ AuDpri(KERN_DEBUG, "DEBUG: " fmt, ##arg); \ ++} while (0) ++#define AuLabel(l) AuDbg(#l "\n") ++#define AuInfo(fmt, arg...) AuDpri(KERN_INFO, fmt, ##arg) ++#define AuWarn(fmt, arg...) AuDpri(KERN_WARNING, fmt, ##arg) ++#define AuErr(fmt, arg...) AuDpri(KERN_ERR, fmt, ##arg) ++#define AuIOErr(fmt, arg...) AuErr("I/O Error, " fmt, ##arg) ++#define AuWarn1(fmt, arg...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ AuWarn(fmt, ##arg); \ ++} while (0) ++ ++#define AuErr1(fmt, arg...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ AuErr(fmt, ##arg); \ ++} while (0) ++ ++#define AuIOErr1(fmt, arg...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ AuIOErr(fmt, ##arg); \ ++} while (0) ++ ++#define AuUnsupportMsg "This operation is not supported." \ ++ " Please report this application to aufs-users ML." ++#define AuUnsupport(fmt, args...) do { \ ++ AuErr(AuUnsupportMsg "\n" fmt, ##args); \ ++ dump_stack(); \ ++} while (0) ++ ++#define AuTraceErr(e) do { \ ++ if (unlikely((e) < 0)) \ ++ AuDbg("err %d\n", (int)(e)); \ ++} while (0) ++ ++#define AuTraceErrPtr(p) do { \ ++ if (IS_ERR(p)) \ ++ AuDbg("err %ld\n", PTR_ERR(p)); \ ++} while (0) ++ ++/* dirty macros for debug print, use with "%.*s" and caution */ ++#define AuLNPair(qstr) (qstr)->len, (qstr)->name ++#define AuDLNPair(d) AuLNPair(&(d)->d_name) ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_sbinfo; ++struct au_finfo; ++struct dentry; ++#ifdef CONFIG_AUFS_DEBUG ++extern char *au_plevel; ++struct au_nhash; ++void au_dpri_whlist(struct au_nhash *whlist); ++struct au_vdir; ++void au_dpri_vdir(struct au_vdir *vdir); ++struct inode; ++void au_dpri_inode(struct inode *inode); ++void au_dpri_dentry(struct dentry *dentry); ++struct file; ++void au_dpri_file(struct file *filp); ++struct super_block; ++void au_dpri_sb(struct super_block *sb); ++ ++void au_dbg_sleep_jiffy(int jiffy); ++struct iattr; ++void au_dbg_iattr(struct iattr *ia); ++ ++void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen); ++void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen); ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); ++void au_dbg_verify_hf(struct au_finfo *finfo); ++void au_dbg_verify_kthread(void); ++ ++int __init au_debug_init(void); ++void au_debug_sbinfo_init(struct au_sbinfo *sbinfo); ++#define AuDbgWhlist(w) do { \ ++ AuDbg(#w "\n"); \ ++ au_dpri_whlist(w); \ ++} while (0) ++ ++#define AuDbgVdir(v) do { \ ++ AuDbg(#v "\n"); \ ++ au_dpri_vdir(v); \ ++} while (0) ++ ++#define AuDbgInode(i) do { \ ++ AuDbg(#i "\n"); \ ++ au_dpri_inode(i); \ ++} while (0) ++ ++#define AuDbgDentry(d) do { \ ++ AuDbg(#d "\n"); \ ++ au_dpri_dentry(d); \ ++} while (0) ++ ++#define AuDbgFile(f) do { \ ++ AuDbg(#f "\n"); \ ++ au_dpri_file(f); \ ++} while (0) ++ ++#define AuDbgSb(sb) do { \ ++ AuDbg(#sb "\n"); \ ++ au_dpri_sb(sb); \ ++} while (0) ++ ++#define AuDbgSleep(sec) do { \ ++ AuDbg("sleep %d sec\n", sec); \ ++ ssleep(sec); \ ++} while (0) ++ ++#define AuDbgSleepJiffy(jiffy) do { \ ++ AuDbg("sleep %d jiffies\n", jiffy); \ ++ au_dbg_sleep_jiffy(jiffy); \ ++} while (0) ++ ++#define AuDbgIAttr(ia) do { \ ++ AuDbg("ia_valid 0x%x\n", (ia)->ia_valid); \ ++ au_dbg_iattr(ia); \ ++} while (0) ++#else ++static inline void au_dbg_verify_dir_parent(struct dentry *dentry, ++ unsigned int 
sigen) ++{ ++ /* empty */ ++} ++static inline void au_dbg_verify_nondir_parent(struct dentry *dentry, ++ unsigned int sigen) ++{ ++ /* empty */ ++} ++static inline void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) ++{ ++ /* empty */ ++} ++static inline void au_dbg_verify_hf(struct au_finfo *finfo) ++{ ++ /* empty */ ++} ++static inline void au_dbg_verify_kthread(void) ++{ ++ /* empty */ ++} ++ ++static inline int au_debug_init(void) ++{ ++ return 0; ++} ++static inline void au_debug_sbinfo_init(struct au_sbinfo *sbinfo) ++{ ++ /* empty */ ++} ++#define AuDbgWhlist(w) do {} while (0) ++#define AuDbgVdir(v) do {} while (0) ++#define AuDbgInode(i) do {} while (0) ++#define AuDbgDentry(d) do {} while (0) ++#define AuDbgFile(f) do {} while (0) ++#define AuDbgSb(sb) do {} while (0) ++#define AuDbgSleep(sec) do {} while (0) ++#define AuDbgSleepJiffy(jiffy) do {} while (0) ++#define AuDbgIAttr(ia) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_MAGIC_SYSRQ ++int __init au_sysrq_init(void); ++void au_sysrq_fin(void); ++ ++#ifdef CONFIG_HW_CONSOLE ++#define au_dbg_blocked() do { \ ++ WARN_ON(1); \ ++ handle_sysrq('w', vc_cons[fg_console].d->vc_tty); \ ++} while (0) ++#else ++#define au_dbg_blocked() do {} while (0) ++#endif ++ ++#else ++static inline int au_sysrq_init(void) ++{ ++ return 0; ++} ++#define au_sysrq_fin() do {} while (0) ++#define au_dbg_blocked() do {} while (0) ++#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DEBUG_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/dentry.c linux-2.6.31/fs/aufs/dentry.c +--- linux-2.6.31-vanilla/fs/aufs/dentry.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dentry.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,879 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#include <linux/namei.h> ++#include "aufs.h" ++ ++static void au_h_nd(struct nameidata *h_nd, struct nameidata *nd) ++{ ++ if (nd) { ++ *h_nd = *nd; ++ ++ /* ++ * gave up supporting LOOKUP_CREATE/OPEN for lower fs, ++ * due to whiteout and branch permission. ++ */ ++ h_nd->flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE ++ | LOOKUP_FOLLOW); ++ /* unnecessary? 
*/ ++ h_nd->intent.open.file = NULL; ++ } else ++ memset(h_nd, 0, sizeof(*h_nd)); ++} ++ ++struct au_lkup_one_args { ++ struct dentry **errp; ++ struct qstr *name; ++ struct dentry *h_parent; ++ struct au_branch *br; ++ struct nameidata *nd; ++}; ++ ++struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, ++ struct au_branch *br, struct nameidata *nd) ++{ ++ struct dentry *h_dentry; ++ int err; ++ struct nameidata h_nd; ++ ++ if (au_test_fs_null_nd(h_parent->d_sb)) ++ return vfsub_lookup_one_len(name->name, h_parent, name->len); ++ ++ au_h_nd(&h_nd, nd); ++ h_nd.path.dentry = h_parent; ++ h_nd.path.mnt = br->br_mnt; ++ ++ err = __lookup_one_len(name->name, &h_nd.last, NULL, name->len); ++ h_dentry = ERR_PTR(err); ++ if (!err) { ++ path_get(&h_nd.path); ++ h_dentry = vfsub_lookup_hash(&h_nd); ++ path_put(&h_nd.path); ++ } ++ ++ return h_dentry; ++} ++ ++static void au_call_lkup_one(void *args) ++{ ++ struct au_lkup_one_args *a = args; ++ *a->errp = au_lkup_one(a->name, a->h_parent, a->br, a->nd); ++} ++ ++#define AuLkup_ALLOW_NEG 1 ++#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) ++#define au_fset_lkup(flags, name) { (flags) |= AuLkup_##name; } ++#define au_fclr_lkup(flags, name) { (flags) &= ~AuLkup_##name; } ++ ++struct au_do_lookup_args { ++ unsigned int flags; ++ mode_t type; ++ struct nameidata *nd; ++}; ++ ++/* ++ * returns positive/negative dentry, NULL or an error. ++ * NULL means whiteout-ed or not-found. ++ */ ++static struct dentry* ++au_do_lookup(struct dentry *h_parent, struct dentry *dentry, ++ aufs_bindex_t bindex, struct qstr *wh_name, ++ struct au_do_lookup_args *args) ++{ ++ struct dentry *h_dentry; ++ struct inode *h_inode, *inode; ++ struct qstr *name; ++ struct au_branch *br; ++ int wh_found, opq; ++ unsigned char wh_able; ++ const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); ++ ++ name = &dentry->d_name; ++ wh_found = 0; ++ br = au_sbr(dentry->d_sb, bindex); ++ wh_able = !!au_br_whable(br->br_perm); ++ if (wh_able) ++ wh_found = au_wh_test(h_parent, wh_name, br, /*try_sio*/0); ++ h_dentry = ERR_PTR(wh_found); ++ if (!wh_found) ++ goto real_lookup; ++ if (unlikely(wh_found < 0)) ++ goto out; ++ ++ /* We found a whiteout */ ++ /* au_set_dbend(dentry, bindex); */ ++ au_set_dbwh(dentry, bindex); ++ if (!allow_neg) ++ return NULL; /* success */ ++ ++ real_lookup: ++ h_dentry = au_lkup_one(name, h_parent, br, args->nd); ++ if (IS_ERR(h_dentry)) ++ goto out; ++ ++ h_inode = h_dentry->d_inode; ++ if (!h_inode) { ++ if (!allow_neg) ++ goto out_neg; ++ } else if (wh_found ++ || (args->type && args->type != (h_inode->i_mode & S_IFMT))) ++ goto out_neg; ++ ++ if (au_dbend(dentry) <= bindex) ++ au_set_dbend(dentry, bindex); ++ if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) ++ au_set_dbstart(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ ++ inode = dentry->d_inode; ++ if (!h_inode || !S_ISDIR(h_inode->i_mode) || !wh_able ++ || (inode && !S_ISDIR(inode->i_mode))) ++ goto out; /* success */ ++ ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ opq = au_diropq_test(h_dentry, br); ++ mutex_unlock(&h_inode->i_mutex); ++ if (opq > 0) ++ au_set_dbdiropq(dentry, bindex); ++ else if (unlikely(opq < 0)) { ++ au_set_h_dptr(dentry, bindex, NULL); ++ h_dentry = ERR_PTR(opq); ++ } ++ goto out; ++ ++ out_neg: ++ dput(h_dentry); ++ h_dentry = NULL; ++ out: ++ return h_dentry; ++} ++ ++static int au_test_shwh(struct super_block *sb, const struct qstr *name) ++{ ++ if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) 
++ && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) ++ return -EPERM; ++ return 0; ++} ++ ++/* ++ * returns the number of lower positive dentries, ++ * otherwise an error. ++ * can be called at unlinking with @type is zero. ++ */ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, ++ struct nameidata *nd) ++{ ++ int npositive, err; ++ aufs_bindex_t bindex, btail, bdiropq; ++ unsigned char isdir; ++ struct qstr whname; ++ struct au_do_lookup_args args = { ++ .flags = 0, ++ .type = type, ++ .nd = nd ++ }; ++ const struct qstr *name = &dentry->d_name; ++ struct dentry *parent; ++ struct inode *inode; ++ ++ parent = dget_parent(dentry); ++ err = au_test_shwh(dentry->d_sb, name); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_wh_name_alloc(&whname, name); ++ if (unlikely(err)) ++ goto out; ++ ++ inode = dentry->d_inode; ++ isdir = !!(inode && S_ISDIR(inode->i_mode)); ++ if (!type) ++ au_fset_lkup(args.flags, ALLOW_NEG); ++ ++ npositive = 0; ++ btail = au_dbtaildir(parent); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ struct dentry *h_parent, *h_dentry; ++ struct inode *h_inode, *h_dir; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) { ++ if (h_dentry->d_inode) ++ npositive++; ++ if (type != S_IFDIR) ++ break; ++ continue; ++ } ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent) ++ continue; ++ h_dir = h_parent->d_inode; ++ if (!h_dir || !S_ISDIR(h_dir->i_mode)) ++ continue; ++ ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, ++ &args); ++ mutex_unlock(&h_dir->i_mutex); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out_wh; ++ au_fclr_lkup(args.flags, ALLOW_NEG); ++ ++ if (au_dbwh(dentry) >= 0) ++ break; ++ if (!h_dentry) ++ continue; ++ h_inode = h_dentry->d_inode; ++ if (!h_inode) ++ continue; ++ npositive++; ++ if (!args.type) ++ args.type = h_inode->i_mode & S_IFMT; ++ if (args.type != S_IFDIR) ++ break; ++ else if (isdir) { ++ /* the type of lower may be different */ ++ bdiropq = au_dbdiropq(dentry); ++ if (bdiropq >= 0 && bdiropq <= bindex) ++ break; ++ } ++ } ++ ++ if (npositive) { ++ AuLabel(positive); ++ au_update_dbstart(dentry); ++ } ++ err = npositive; ++ if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) ++ && au_dbstart(dentry) < 0)) ++ /* both of real entry and whiteout found */ ++ err = -EIO; ++ ++ out_wh: ++ kfree(whname.name); ++ out: ++ dput(parent); ++ return err; ++} ++ ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, ++ struct au_branch *br) ++{ ++ struct dentry *dentry; ++ int wkq_err; ++ ++ if (!au_test_h_perm_sio(parent->d_inode, MAY_EXEC)) ++ dentry = au_lkup_one(name, parent, br, /*nd*/NULL); ++ else { ++ struct au_lkup_one_args args = { ++ .errp = &dentry, ++ .name = name, ++ .h_parent = parent, ++ .br = br, ++ .nd = NULL ++ }; ++ ++ wkq_err = au_wkq_wait(au_call_lkup_one, &args); ++ if (unlikely(wkq_err)) ++ dentry = ERR_PTR(wkq_err); ++ } ++ ++ return dentry; ++} ++ ++/* ++ * lookup @dentry on @bindex which should be negative. 
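++ * Used when a new entry is about to be created on branch @bindex; ++ * finding an already positive lower dentry here is therefore an I/O error.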
++ */ ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err; ++ struct dentry *parent, *h_parent, *h_dentry; ++ struct qstr *name; ++ ++ name = &dentry->d_name; ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bindex); ++ h_dentry = au_sio_lkup_one(name, h_parent, ++ au_sbr(dentry->d_sb, bindex)); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out; ++ if (unlikely(h_dentry->d_inode)) { ++ err = -EIO; ++ AuIOErr("b%d %.*s should be negative.\n", ++ bindex, AuDLNPair(h_dentry)); ++ dput(h_dentry); ++ goto out; ++ } ++ ++ if (bindex < au_dbstart(dentry)) ++ au_set_dbstart(dentry, bindex); ++ if (au_dbend(dentry) < bindex) ++ au_set_dbend(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ err = 0; ++ ++ out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* subset of struct inode */ ++struct au_iattr { ++ unsigned long i_ino; ++ /* unsigned int i_nlink; */ ++ uid_t i_uid; ++ gid_t i_gid; ++ u64 i_version; ++/* ++ loff_t i_size; ++ blkcnt_t i_blocks; ++*/ ++ umode_t i_mode; ++}; ++ ++static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) ++{ ++ ia->i_ino = h_inode->i_ino; ++ /* ia->i_nlink = h_inode->i_nlink; */ ++ ia->i_uid = h_inode->i_uid; ++ ia->i_gid = h_inode->i_gid; ++ ia->i_version = h_inode->i_version; ++/* ++ ia->i_size = h_inode->i_size; ++ ia->i_blocks = h_inode->i_blocks; ++*/ ++ ia->i_mode = (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) ++{ ++ return ia->i_ino != h_inode->i_ino ++ /* || ia->i_nlink != h_inode->i_nlink */ ++ || ia->i_uid != h_inode->i_uid ++ || ia->i_gid != h_inode->i_gid ++ || ia->i_version != h_inode->i_version ++/* ++ || ia->i_size != h_inode->i_size ++ || ia->i_blocks != h_inode->i_blocks ++*/ ++ || ia->i_mode != (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, ++ struct au_branch *br) ++{ ++ int err; ++ struct au_iattr ia; ++ struct inode *h_inode; ++ struct dentry *h_d; ++ struct super_block *h_sb; ++ ++ err = 0; ++ memset(&ia, -1, sizeof(ia)); ++ h_sb = h_dentry->d_sb; ++ h_inode = h_dentry->d_inode; ++ if (h_inode) ++ au_iattr_save(&ia, h_inode); ++ else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) ++ /* nfs d_revalidate may return 0 for negative dentry */ ++ /* fuse d_revalidate always return 0 for negative dentry */ ++ goto out; ++ ++ /* main purpose is namei.c:cached_lookup() and d_revalidate */ ++ h_d = au_lkup_one(&h_dentry->d_name, h_parent, br, /*nd*/NULL); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out; ++ ++ err = 0; ++ if (unlikely(h_d != h_dentry ++ || h_d->d_inode != h_inode ++ || (h_inode && au_iattr_test(&ia, h_inode)))) ++ err = au_busy_or_stale(); ++ dput(h_d); ++ ++ out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br) ++{ ++ int err; ++ ++ err = 0; ++ if (udba == AuOpt_UDBA_REVAL) { ++ IMustLock(h_dir); ++ err = (h_dentry->d_parent->d_inode != h_dir); ++ } else if (udba == AuOpt_UDBA_HINOTIFY) ++ err = au_h_verify_dentry(h_dentry, h_parent, br); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_do_refresh_hdentry(struct au_hdentry *p, struct au_dinfo *dinfo, ++ struct dentry *parent) ++{ ++ struct dentry *h_d, *h_dp; ++ struct au_hdentry tmp, *q; ++ 
struct super_block *sb; ++ aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bend = dinfo->di_bend; ++ bwh = dinfo->di_bwh; ++ bdiropq = dinfo->di_bdiropq; ++ for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) { ++ h_d = p->hd_dentry; ++ if (!h_d) ++ continue; ++ ++ h_dp = dget_parent(h_d); ++ if (h_dp == au_h_dptr(parent, bindex)) { ++ dput(h_dp); ++ continue; ++ } ++ ++ new_bindex = au_find_dbindex(parent, h_dp); ++ dput(h_dp); ++ if (dinfo->di_bwh == bindex) ++ bwh = new_bindex; ++ if (dinfo->di_bdiropq == bindex) ++ bdiropq = new_bindex; ++ if (new_bindex < 0) { ++ au_hdput(p); ++ p->hd_dentry = NULL; ++ continue; ++ } ++ ++ /* swap two lower dentries, and loop again */ ++ q = dinfo->di_hdentry + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hd_dentry) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ sb = parent->d_sb; ++ dinfo->di_bwh = -1; ++ if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh)) ++ dinfo->di_bwh = bwh; ++ ++ dinfo->di_bdiropq = -1; ++ if (bdiropq >= 0 ++ && bdiropq <= au_sbend(sb) ++ && au_sbr_whable(sb, bdiropq)) ++ dinfo->di_bdiropq = bdiropq; ++ ++ bend = au_dbend(parent); ++ p = dinfo->di_hdentry; ++ for (bindex = 0; bindex <= bend; bindex++, p++) ++ if (p->hd_dentry) { ++ dinfo->di_bstart = bindex; ++ break; ++ } ++ ++ p = dinfo->di_hdentry + bend; ++ for (bindex = bend; bindex >= 0; bindex--, p--) ++ if (p->hd_dentry) { ++ dinfo->di_bend = bindex; ++ break; ++ } ++} ++ ++/* ++ * returns the number of found lower positive dentries, ++ * otherwise an error. ++ */ ++int au_refresh_hdentry(struct dentry *dentry, mode_t type) ++{ ++ int npositive, err; ++ unsigned int sigen; ++ aufs_bindex_t bstart; ++ struct au_dinfo *dinfo; ++ struct super_block *sb; ++ struct dentry *parent; ++ ++ DiMustWriteLock(dentry); ++ ++ sb = dentry->d_sb; ++ AuDebugOn(IS_ROOT(dentry)); ++ sigen = au_sigen(sb); ++ parent = dget_parent(dentry); ++ AuDebugOn(au_digen(parent) != sigen ++ || au_iigen(parent->d_inode) != sigen); ++ ++ dinfo = au_di(dentry); ++ err = au_di_realloc(dinfo, au_sbend(sb) + 1); ++ npositive = err; ++ if (unlikely(err)) ++ goto out; ++ au_do_refresh_hdentry(dinfo->di_hdentry + dinfo->di_bstart, dinfo, ++ parent); ++ ++ npositive = 0; ++ bstart = au_dbstart(parent); ++ if (type != S_IFDIR && dinfo->di_bstart == bstart) ++ goto out_dgen; /* success */ ++ ++ npositive = au_lkup_dentry(dentry, bstart, type, /*nd*/NULL); ++ if (npositive < 0) ++ goto out; ++ if (dinfo->di_bwh >= 0 && dinfo->di_bwh <= dinfo->di_bstart) ++ d_drop(dentry); ++ ++ out_dgen: ++ au_update_digen(dentry); ++ out: ++ dput(parent); ++ AuTraceErr(npositive); ++ return npositive; ++} ++ ++static noinline_for_stack ++int au_do_h_d_reval(struct dentry *h_dentry, struct nameidata *nd, ++ struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err, valid; ++ int (*reval)(struct dentry *, struct nameidata *); ++ ++ err = 0; ++ reval = NULL; ++ if (h_dentry->d_op) ++ reval = h_dentry->d_op->d_revalidate; ++ if (!reval) ++ goto out; ++ ++ AuDbg("b%d\n", bindex); ++ if (au_test_fs_null_nd(h_dentry->d_sb)) ++ /* it may return tri-state */ ++ valid = reval(h_dentry, NULL); ++ else { ++ struct nameidata h_nd; ++ int locked; ++ struct dentry *parent; ++ ++ au_h_nd(&h_nd, nd); ++ parent = nd->path.dentry; ++ locked = (nd && nd->path.dentry != dentry); ++ if (locked) ++ di_read_lock_parent(parent, AuLock_IR); ++ BUG_ON(bindex > au_dbend(parent)); ++ h_nd.path.dentry = au_h_dptr(parent, bindex); ++ BUG_ON(!h_nd.path.dentry); ++ 
h_nd.path.mnt = au_sbr(parent->d_sb, bindex)->br_mnt; ++ path_get(&h_nd.path); ++ valid = reval(h_dentry, &h_nd); ++ path_put(&h_nd.path); ++ if (locked) ++ di_read_unlock(parent, AuLock_IR); ++ } ++ ++ if (unlikely(valid < 0)) ++ err = valid; ++ else if (!valid) ++ err = -EINVAL; ++ ++ out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* todo: remove this */ ++static int h_d_revalidate(struct dentry *dentry, struct inode *inode, ++ struct nameidata *nd, int do_udba) ++{ ++ int err; ++ umode_t mode, h_mode; ++ aufs_bindex_t bindex, btail, bstart, ibs, ibe; ++ unsigned char plus, unhashed, is_root, h_plus; ++ struct inode *first, *h_inode, *h_cached_inode; ++ struct dentry *h_dentry; ++ struct qstr *name, *h_name; ++ ++ err = 0; ++ plus = 0; ++ mode = 0; ++ first = NULL; ++ ibs = -1; ++ ibe = -1; ++ unhashed = !!d_unhashed(dentry); ++ is_root = !!IS_ROOT(dentry); ++ name = &dentry->d_name; ++ ++ /* ++ * Theoretically, REVAL test should be unnecessary in case of INOTIFY. ++ * But inotify doesn't fire some necessary events, ++ * IN_ATTRIB for atime/nlink/pageio ++ * IN_DELETE for NFS dentry ++ * Let's do REVAL test too. ++ */ ++ if (do_udba && inode) { ++ mode = (inode->i_mode & S_IFMT); ++ plus = (inode->i_nlink > 0); ++ first = au_h_iptr(inode, au_ibstart(inode)); ++ ibs = au_ibstart(inode); ++ ibe = au_ibend(inode); ++ } ++ ++ bstart = au_dbstart(dentry); ++ btail = bstart; ++ if (inode && S_ISDIR(inode->i_mode)) ++ btail = au_dbtaildir(dentry); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ ++ AuDbg("b%d, %.*s\n", bindex, AuDLNPair(h_dentry)); ++ h_name = &h_dentry->d_name; ++ if (unlikely(do_udba ++ && !is_root ++ && (unhashed != !!d_unhashed(h_dentry) ++ || name->len != h_name->len ++ || memcmp(name->name, h_name->name, name->len)) ++ )) { ++ AuDbg("unhash 0x%x 0x%x, %.*s %.*s\n", ++ unhashed, d_unhashed(h_dentry), ++ AuDLNPair(dentry), AuDLNPair(h_dentry)); ++ goto err; ++ } ++ ++ err = au_do_h_d_reval(h_dentry, nd, dentry, bindex); ++ if (unlikely(err)) ++ /* do not goto err, to keep the errno */ ++ break; ++ ++ /* todo: plink too? 
*/ ++ if (!do_udba) ++ continue; ++ ++ /* UDBA tests */ ++ h_inode = h_dentry->d_inode; ++ if (unlikely(!!inode != !!h_inode)) ++ goto err; ++ ++ h_plus = plus; ++ h_mode = mode; ++ h_cached_inode = h_inode; ++ if (h_inode) { ++ h_mode = (h_inode->i_mode & S_IFMT); ++ h_plus = (h_inode->i_nlink > 0); ++ } ++ if (inode && ibs <= bindex && bindex <= ibe) ++ h_cached_inode = au_h_iptr(inode, bindex); ++ ++ if (unlikely(plus != h_plus ++ || mode != h_mode ++ || h_cached_inode != h_inode)) ++ goto err; ++ continue; ++ ++ err: ++ err = -EINVAL; ++ break; ++ } ++ ++ return err; ++} ++ ++static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *parent; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (au_digen(dentry) == sigen && au_iigen(inode) == sigen) ++ return 0; ++ ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ AuDebugOn(au_digen(parent) != sigen ++ || au_iigen(parent->d_inode) != sigen); ++ au_dbg_verify_gen(parent, sigen); ++ ++ /* returns a number of positive dentries */ ++ err = au_refresh_hdentry(dentry, inode->i_mode & S_IFMT); ++ if (err >= 0) ++ err = au_refresh_hinode(inode, dentry); ++ ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ return err; ++} ++ ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *d, *parent; ++ struct inode *inode; ++ ++ if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIRS)) ++ return simple_reval_dpath(dentry, sigen); ++ ++ /* slow loop, keep it simple and stupid */ ++ /* cf: au_cpup_dirs() */ ++ err = 0; ++ parent = NULL; ++ while (au_digen(dentry) != sigen ++ || au_iigen(dentry->d_inode) != sigen) { ++ d = dentry; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(d); ++ if (au_digen(parent) == sigen ++ && au_iigen(parent->d_inode) == sigen) ++ break; ++ d = parent; ++ } ++ ++ inode = d->d_inode; ++ if (d != dentry) ++ di_write_lock_child(d); ++ ++ /* someone might update our dentry while we were sleeping */ ++ if (au_digen(d) != sigen || au_iigen(d->d_inode) != sigen) { ++ di_read_lock_parent(parent, AuLock_IR); ++ /* returns a number of positive dentries */ ++ err = au_refresh_hdentry(d, inode->i_mode & S_IFMT); ++ if (err >= 0) ++ err = au_refresh_hinode(inode, d); ++ di_read_unlock(parent, AuLock_IR); ++ } ++ ++ if (d != dentry) ++ di_write_unlock(d); ++ dput(parent); ++ if (unlikely(err)) ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * if valid returns 1, otherwise 0. 
++ */ ++static int aufs_d_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ int valid, err; ++ unsigned int sigen; ++ unsigned char do_udba; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ err = -EINVAL; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW); ++ sigen = au_sigen(sb); ++ if (au_digen(dentry) != sigen) { ++ AuDebugOn(IS_ROOT(dentry)); ++ if (inode) ++ err = au_reval_dpath(dentry, sigen); ++ if (unlikely(err)) ++ goto out_dgrade; ++ AuDebugOn(au_digen(dentry) != sigen); ++ } ++ if (inode && au_iigen(inode) != sigen) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_refresh_hinode(inode, dentry); ++ if (unlikely(err)) ++ goto out_dgrade; ++ AuDebugOn(au_iigen(inode) != sigen); ++ } ++ di_downgrade_lock(dentry, AuLock_IR); ++ ++ AuDebugOn(au_digen(dentry) != sigen); ++ AuDebugOn(inode && au_iigen(inode) != sigen); ++ err = -EINVAL; ++ do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); ++ if (do_udba && inode) { ++ aufs_bindex_t bstart = au_ibstart(inode); ++ ++ if (bstart >= 0 ++ && au_test_higen(inode, au_h_iptr(inode, bstart))) ++ goto out; ++ } ++ ++ err = h_d_revalidate(dentry, inode, nd, do_udba); ++ if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) ++ /* both of real entry and whiteout found */ ++ err = -EIO; ++ goto out; ++ ++ out_dgrade: ++ di_downgrade_lock(dentry, AuLock_IR); ++ out: ++ aufs_read_unlock(dentry, AuLock_IR); ++ AuTraceErr(err); ++ valid = !err; ++ if (!valid) ++ AuDbg("%.*s invalid\n", AuDLNPair(dentry)); ++ return valid; ++} ++ ++static void aufs_d_release(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ aufs_bindex_t bend, bindex; ++ ++ dinfo = dentry->d_fsdata; ++ if (!dinfo) ++ return; ++ ++ /* dentry may not be revalidated */ ++ bindex = dinfo->di_bstart; ++ if (bindex >= 0) { ++ struct au_hdentry *p; ++ ++ bend = dinfo->di_bend; ++ p = dinfo->di_hdentry + bindex; ++ while (bindex++ <= bend) { ++ if (p->hd_dentry) ++ au_hdput(p); ++ p++; ++ } ++ } ++ kfree(dinfo->di_hdentry); ++ AuRwDestroy(&dinfo->di_rwsem); ++ au_cache_free_dinfo(dinfo); ++ au_hin_di_reinit(dentry); ++} ++ ++struct dentry_operations aufs_dop = { ++ .d_revalidate = aufs_d_revalidate, ++ .d_release = aufs_d_release ++}; +diff -Nur linux-2.6.31-vanilla/fs/aufs/dentry.h linux-2.6.31/fs/aufs/dentry.h +--- linux-2.6.31-vanilla/fs/aufs/dentry.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dentry.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,231 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#ifndef __AUFS_DENTRY_H__ ++#define __AUFS_DENTRY_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/dcache.h> ++#include <linux/aufs_type.h> ++#include "rwsem.h" ++ ++/* make a single member structure for future use */ ++/* todo: remove this structure */ ++struct au_hdentry { ++ struct dentry *hd_dentry; ++}; ++ ++struct au_dinfo { ++ atomic_t di_generation; ++ ++ struct au_rwsem di_rwsem; ++ aufs_bindex_t di_bstart, di_bend, di_bwh, di_bdiropq; ++ struct au_hdentry *di_hdentry; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dentry.c */ ++extern struct dentry_operations aufs_dop; ++struct au_branch; ++struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, ++ struct au_branch *br, struct nameidata *nd); ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, ++ struct au_branch *br); ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br); ++ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, ++ struct nameidata *nd); ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex); ++int au_refresh_hdentry(struct dentry *dentry, mode_t type); ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen); ++ ++/* dinfo.c */ ++int au_alloc_dinfo(struct dentry *dentry); ++int au_di_realloc(struct au_dinfo *dinfo, int nbr); ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc); ++void di_read_unlock(struct dentry *d, int flags); ++void di_downgrade_lock(struct dentry *d, int flags); ++void di_write_lock(struct dentry *d, unsigned int lsc); ++void di_write_unlock(struct dentry *d); ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex); ++aufs_bindex_t au_dbtail(struct dentry *dentry); ++aufs_bindex_t au_dbtaildir(struct dentry *dentry); ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++void au_update_digen(struct dentry *dentry); ++void au_update_dbrange(struct dentry *dentry, int do_put_zero); ++void au_update_dbstart(struct dentry *dentry); ++void au_update_dbend(struct dentry *dentry); ++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_dinfo *au_di(struct dentry *dentry) ++{ ++ return dentry->d_fsdata; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for dinfo */ ++enum { ++ AuLsc_DI_CHILD, /* child first */ ++ AuLsc_DI_CHILD2, /* rename(2), link(2), and cpup at hinotify */ ++ AuLsc_DI_CHILD3, /* copyup dirs */ ++ AuLsc_DI_PARENT, ++ AuLsc_DI_PARENT2, ++ AuLsc_DI_PARENT3 ++}; ++ ++/* ++ * di_read_lock_child, di_write_lock_child, ++ * di_read_lock_child2, di_write_lock_child2, ++ * di_read_lock_child3, di_write_lock_child3, ++ * di_read_lock_parent, di_write_lock_parent, ++ * di_read_lock_parent2, di_write_lock_parent2, ++ * di_read_lock_parent3, 
di_write_lock_parent3, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void di_read_lock_##name(struct dentry *d, int flags) \ ++{ di_read_lock(d, flags, AuLsc_DI_##lsc); } ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void di_write_lock_##name(struct dentry *d) \ ++{ di_write_lock(d, AuLsc_DI_##lsc); } ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem) ++#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem) ++#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: memory barrier? */ ++static inline unsigned int au_digen(struct dentry *d) ++{ ++ return atomic_read(&au_di(d)->di_generation); ++} ++ ++static inline void au_h_dentry_init(struct au_hdentry *hdentry) ++{ ++ hdentry->hd_dentry = NULL; ++} ++ ++static inline void au_hdput(struct au_hdentry *hd) ++{ ++ dput(hd->hd_dentry); ++} ++ ++static inline aufs_bindex_t au_dbstart(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bstart; ++} ++ ++static inline aufs_bindex_t au_dbend(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bend; ++} ++ ++static inline aufs_bindex_t au_dbwh(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bwh; ++} ++ ++static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bdiropq; ++} ++ ++/* todo: hard/soft set? */ ++static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bstart = bindex; ++} ++ ++static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bend = bindex; ++} ++ ++static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ /* dbwh can be outside of bstart - bend range */ ++ au_di(dentry)->di_bwh = bindex; ++} ++ ++static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bdiropq = bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_HINOTIFY ++static inline void au_digen_dec(struct dentry *d) ++{ ++ atomic_dec_return(&au_di(d)->di_generation); ++} ++ ++static inline void au_hin_di_reinit(struct dentry *dentry) ++{ ++ dentry->d_fsdata = NULL; ++} ++#else ++static inline void au_hin_di_reinit(struct dentry *dentry __maybe_unused) ++{ ++ /* empty */ ++} ++#endif /* CONFIG_AUFS_HINOTIFY */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DENTRY_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/dinfo.c linux-2.6.31/fs/aufs/dinfo.c +--- linux-2.6.31-vanilla/fs/aufs/dinfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dinfo.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,367 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * dentry private data ++ */ ++ ++#include "aufs.h" ++ ++int au_alloc_dinfo(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ struct super_block *sb; ++ int nbr; ++ ++ dinfo = au_cache_alloc_dinfo(); ++ if (unlikely(!dinfo)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ nbr = au_sbend(sb) + 1; ++ if (nbr <= 0) ++ nbr = 1; ++ dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS); ++ if (unlikely(!dinfo->di_hdentry)) ++ goto out_dinfo; ++ ++ atomic_set(&dinfo->di_generation, au_sigen(sb)); ++ /* smp_mb(); */ /* atomic_set */ ++ au_rw_init_wlock_nested(&dinfo->di_rwsem, AuLsc_DI_CHILD); ++ dinfo->di_bstart = -1; ++ dinfo->di_bend = -1; ++ dinfo->di_bwh = -1; ++ dinfo->di_bdiropq = -1; ++ ++ dentry->d_fsdata = dinfo; ++ dentry->d_op = &aufs_dop; ++ return 0; /* success */ ++ ++ out_dinfo: ++ au_cache_free_dinfo(dinfo); ++ out: ++ return -ENOMEM; ++} ++ ++int au_di_realloc(struct au_dinfo *dinfo, int nbr) ++{ ++ int err, sz; ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*hdp) * (dinfo->di_bend + 1); ++ if (!sz) ++ sz = sizeof(*hdp); ++ hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS); ++ if (hdp) { ++ dinfo->di_hdentry = hdp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void do_ii_write_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_write_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ ii_write_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_write_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_write_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_write_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_write_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void do_ii_read_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_read_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ ii_read_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_read_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_read_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_read_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_read_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc) ++{ ++ au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d->d_inode) { ++ if (au_ftest_lock(flags, IW)) ++ do_ii_write_lock(d->d_inode, lsc); ++ else if (au_ftest_lock(flags, IR)) ++ do_ii_read_lock(d->d_inode, lsc); ++ } ++} ++ ++void di_read_unlock(struct dentry *d, int flags) ++{ ++ if 
(d->d_inode) { ++ if (au_ftest_lock(flags, IW)) ++ ii_write_unlock(d->d_inode); ++ else if (au_ftest_lock(flags, IR)) ++ ii_read_unlock(d->d_inode); ++ } ++ au_rw_read_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_downgrade_lock(struct dentry *d, int flags) ++{ ++ if (d->d_inode && au_ftest_lock(flags, IR)) ++ ii_downgrade_lock(d->d_inode); ++ au_rw_dgrade_lock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock(struct dentry *d, unsigned int lsc) ++{ ++ au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d->d_inode) ++ do_ii_write_lock(d->d_inode, lsc); ++} ++ ++void di_write_unlock(struct dentry *d) ++{ ++ if (d->d_inode) ++ ii_write_unlock(d->d_inode); ++ au_rw_write_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d1->d_inode == d2->d_inode ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_child(d1); ++ di_write_lock_child2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_child(d2); ++ di_write_lock_child2(d1); ++ } ++} ++ ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d1->d_inode == d2->d_inode ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_parent(d1); ++ di_write_lock_parent2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_parent(d2); ++ di_write_lock_parent2(d1); ++ } ++} ++ ++void di_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock(d1); ++ if (d1->d_inode == d2->d_inode) ++ au_rw_write_unlock(&au_di(d2)->di_rwsem); ++ else ++ di_write_unlock(d2); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct dentry *d; ++ ++ DiMustAnyLock(dentry); ++ ++ if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) ++ return NULL; ++ AuDebugOn(bindex < 0); ++ d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; ++ AuDebugOn(d && (atomic_read(&d->d_count) <= 0)); ++ return d; ++} ++ ++aufs_bindex_t au_dbtail(struct dentry *dentry) ++{ ++ aufs_bindex_t bend, bwh; ++ ++ bend = au_dbend(dentry); ++ if (0 <= bend) { ++ bwh = au_dbwh(dentry); ++ if (!bwh) ++ return bwh; ++ if (0 < bwh && bwh < bend) ++ return bwh - 1; ++ } ++ return bend; ++} ++ ++aufs_bindex_t au_dbtaildir(struct dentry *dentry) ++{ ++ aufs_bindex_t bend, bopq; ++ ++ bend = au_dbtail(dentry); ++ if (0 <= bend) { ++ bopq = au_dbdiropq(dentry); ++ if (0 <= bopq && bopq < bend) ++ bend = bopq; ++ } ++ return bend; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; ++ ++ DiMustWriteLock(dentry); ++ ++ if (hd->hd_dentry) ++ au_hdput(hd); ++ hd->hd_dentry = h_dentry; ++} ++ ++void au_update_digen(struct dentry *dentry) ++{ ++ atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++void au_update_dbrange(struct dentry *dentry, int do_put_zero) ++{ ++ struct au_dinfo *dinfo; ++ struct dentry *h_d; ++ ++ DiMustWriteLock(dentry); ++ ++ dinfo = au_di(dentry); ++ if (!dinfo || dinfo->di_bstart < 0) ++ return; ++ ++ if (do_put_zero) { ++ aufs_bindex_t bindex, bend; ++ ++ bend = dinfo->di_bend; ++ for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) { ++ h_d = 
dinfo->di_hdentry[0 + bindex].hd_dentry;
++			if (h_d && !h_d->d_inode)
++				au_set_h_dptr(dentry, bindex, NULL);
++		}
++	}
++
++	dinfo->di_bstart = -1;
++	while (++dinfo->di_bstart <= dinfo->di_bend)
++		if (dinfo->di_hdentry[0 + dinfo->di_bstart].hd_dentry)
++			break;
++	if (dinfo->di_bstart > dinfo->di_bend) {
++		dinfo->di_bstart = -1;
++		dinfo->di_bend = -1;
++		return;
++	}
++
++	dinfo->di_bend++;
++	while (0 <= --dinfo->di_bend)
++		if (dinfo->di_hdentry[0 + dinfo->di_bend].hd_dentry)
++			break;
++	AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0);
++}
++
++void au_update_dbstart(struct dentry *dentry)
++{
++	aufs_bindex_t bindex, bend;
++	struct dentry *h_dentry;
++
++	bend = au_dbend(dentry);
++	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
++		h_dentry = au_h_dptr(dentry, bindex);
++		if (!h_dentry)
++			continue;
++		if (h_dentry->d_inode) {
++			au_set_dbstart(dentry, bindex);
++			return;
++		}
++		au_set_h_dptr(dentry, bindex, NULL);
++	}
++}
++
++void au_update_dbend(struct dentry *dentry)
++{
++	aufs_bindex_t bindex, bstart;
++	struct dentry *h_dentry;
++
++	bstart = au_dbstart(dentry);
++	/* scan downwards from the current end, dropping negative dentries */
++	for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) {
++		h_dentry = au_h_dptr(dentry, bindex);
++		if (!h_dentry)
++			continue;
++		if (h_dentry->d_inode) {
++			au_set_dbend(dentry, bindex);
++			return;
++		}
++		au_set_h_dptr(dentry, bindex, NULL);
++	}
++}
++
++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry)
++{
++	aufs_bindex_t bindex, bend;
++
++	bend = au_dbend(dentry);
++	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++)
++		if (au_h_dptr(dentry, bindex) == h_dentry)
++			return bindex;
++	return -1;
++}
+diff -Nur linux-2.6.31-vanilla/fs/aufs/dir.c linux-2.6.31/fs/aufs/dir.c
+--- linux-2.6.31-vanilla/fs/aufs/dir.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.31/fs/aufs/dir.c	2009-09-16 13:55:30.000000000 +0200
+@@ -0,0 +1,593 @@
++/*
++ * Copyright (C) 2005-2009 Junjiro R. Okajima
++ *
++ * This program, aufs is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#include <linux/file.h> ++#include <linux/fs_stack.h> ++#include "aufs.h" ++ ++void au_add_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ dir->i_nlink += h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ dir->i_nlink += 2; ++} ++ ++void au_sub_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ dir->i_nlink -= h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ dir->i_nlink -= 2; ++} ++ ++loff_t au_dir_size(struct file *file, struct dentry *dentry) ++{ ++ loff_t sz; ++ aufs_bindex_t bindex, bend; ++ struct file *h_file; ++ struct dentry *h_dentry; ++ ++ sz = 0; ++ if (file) { ++ AuDebugOn(!file->f_dentry); ++ AuDebugOn(!file->f_dentry->d_inode); ++ AuDebugOn(!S_ISDIR(file->f_dentry->d_inode->i_mode)); ++ ++ bend = au_fbend(file); ++ for (bindex = au_fbstart(file); ++ bindex <= bend && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_file = au_h_fptr(file, bindex); ++ if (h_file ++ && h_file->f_dentry ++ && h_file->f_dentry->d_inode) ++ sz += i_size_read(h_file->f_dentry->d_inode); ++ } ++ } else { ++ AuDebugOn(!dentry); ++ AuDebugOn(!dentry->d_inode); ++ AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode)); ++ ++ bend = au_dbtaildir(dentry); ++ for (bindex = au_dbstart(dentry); ++ bindex <= bend && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) ++ sz += i_size_read(h_dentry->d_inode); ++ } ++ } ++ if (sz < KMALLOC_MAX_SIZE) ++ sz = roundup_pow_of_two(sz); ++ if (sz > KMALLOC_MAX_SIZE) ++ sz = KMALLOC_MAX_SIZE; ++ else if (sz < NAME_MAX) { ++ BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX); ++ sz = AUFS_RDBLK_DEF; ++ } ++ return sz; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int reopen_dir(struct file *file) ++{ ++ int err; ++ unsigned int flags; ++ aufs_bindex_t bindex, btail, bstart; ++ struct dentry *dentry, *h_dentry; ++ struct file *h_file; ++ ++ /* open all lower dirs */ ++ dentry = file->f_dentry; ++ bstart = au_dbstart(dentry); ++ for (bindex = au_fbstart(file); bindex < bstart; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbstart(file, bstart); ++ ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_fbend(file); btail < bindex; bindex--) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbend(file, btail); ++ ++ spin_lock(&file->f_lock); ++ flags = file->f_flags; ++ spin_unlock(&file->f_lock); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ h_file = au_h_fptr(file, bindex); ++ if (h_file) ++ continue; ++ ++ h_file = au_h_open(dentry, bindex, flags, file); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; /* close all? */ ++ au_set_h_fptr(file, bindex, h_file); ++ } ++ au_update_figen(file); ++ /* todo: necessary? 
*/
++	/* file->f_ra = h_file->f_ra; */
++	err = 0;
++
++ out:
++	return err;
++}
++
++static int do_open_dir(struct file *file, int flags)
++{
++	int err;
++	aufs_bindex_t bindex, btail;
++	struct dentry *dentry, *h_dentry;
++	struct file *h_file;
++
++	FiMustWriteLock(file);
++
++	err = 0;
++	dentry = file->f_dentry;
++	au_set_fvdir_cache(file, NULL);
++	au_fi(file)->fi_maintain_plink = 0;
++	file->f_version = dentry->d_inode->i_version;
++	bindex = au_dbstart(dentry);
++	au_set_fbstart(file, bindex);
++	btail = au_dbtaildir(dentry);
++	au_set_fbend(file, btail);
++	for (; !err && bindex <= btail; bindex++) {
++		h_dentry = au_h_dptr(dentry, bindex);
++		if (!h_dentry)
++			continue;
++
++		h_file = au_h_open(dentry, bindex, flags, file);
++		if (IS_ERR(h_file)) {
++			err = PTR_ERR(h_file);
++			break;
++		}
++		au_set_h_fptr(file, bindex, h_file);
++	}
++	au_update_figen(file);
++	/* todo: necessary? */
++	/* file->f_ra = h_file->f_ra; */
++	if (!err)
++		return 0; /* success */
++
++	/* close all */
++	for (bindex = au_fbstart(file); bindex <= btail; bindex++)
++		au_set_h_fptr(file, bindex, NULL);
++	au_set_fbstart(file, -1);
++	au_set_fbend(file, -1);
++	return err;
++}
++
++static int aufs_open_dir(struct inode *inode __maybe_unused,
++			 struct file *file)
++{
++	return au_do_open(file, do_open_dir);
++}
++
++static int aufs_release_dir(struct inode *inode __maybe_unused,
++			    struct file *file)
++{
++	struct au_vdir *vdir_cache;
++	struct super_block *sb;
++	struct au_sbinfo *sbinfo;
++
++	sb = file->f_dentry->d_sb;
++	si_noflush_read_lock(sb);
++	fi_write_lock(file);
++	vdir_cache = au_fvdir_cache(file);
++	if (vdir_cache)
++		au_vdir_free(vdir_cache);
++	if (au_fi(file)->fi_maintain_plink) {
++		sbinfo = au_sbi(sb);
++		/* clear the flag without write-lock */
++		sbinfo->au_si_status &= ~AuSi_MAINTAIN_PLINK;
++		smp_mb();
++		wake_up_all(&sbinfo->si_plink_wq);
++	}
++	fi_write_unlock(file);
++	au_finfo_fin(file);
++	si_read_unlock(sb);
++	return 0;
++}
++
++/* ---------------------------------------------------------------------- */
++
++static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync)
++{
++	int err;
++	aufs_bindex_t bend, bindex;
++	struct inode *inode;
++	struct super_block *sb;
++
++	err = 0;
++	sb = dentry->d_sb;
++	inode = dentry->d_inode;
++	IMustLock(inode);
++	bend = au_dbend(dentry);
++	for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) {
++		struct path h_path;
++		struct inode *h_inode;
++
++		if (au_test_ro(sb, bindex, inode))
++			continue;
++		h_path.dentry = au_h_dptr(dentry, bindex);
++		if (!h_path.dentry)
++			continue;
++		h_inode = h_path.dentry->d_inode;
++		if (!h_inode)
++			continue;
++
++		/* no mnt_want_write() */
++		/* cf. fs/nfsd/vfs.c and fs/nfsd/nfs4recover.c */
++		/* todo: inotify fired?
*/ ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ mutex_lock(&h_inode->i_mutex); ++ err = filemap_fdatawrite(h_inode->i_mapping); ++ AuDebugOn(!h_inode->i_fop); ++ if (!err && h_inode->i_fop->fsync) ++ err = h_inode->i_fop->fsync(NULL, h_path.dentry, ++ datasync); ++ if (!err) ++ err = filemap_fdatawrite(h_inode->i_mapping); ++ if (!err) ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++ mutex_unlock(&h_inode->i_mutex); ++ } ++ ++ return err; ++} ++ ++static int au_do_fsync_dir(struct file *file, int datasync) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct file *h_file; ++ struct super_block *sb; ++ struct inode *inode; ++ struct mutex *h_mtx; ++ ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = file->f_dentry->d_sb; ++ inode = file->f_dentry->d_inode; ++ bend = au_fbend(file); ++ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { ++ h_file = au_h_fptr(file, bindex); ++ if (!h_file || au_test_ro(sb, bindex, inode)) ++ continue; ++ ++ err = vfs_fsync(h_file, h_file->f_dentry, datasync); ++ if (!err) { ++ h_mtx = &h_file->f_dentry->d_inode->i_mutex; ++ mutex_lock(h_mtx); ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); ++ /*ignore*/ ++ mutex_unlock(h_mtx); ++ } ++ } ++ ++ out: ++ return err; ++} ++ ++/* ++ * @file may be NULL ++ */ ++static int aufs_fsync_dir(struct file *file, struct dentry *dentry, ++ int datasync) ++{ ++ int err; ++ struct super_block *sb; ++ ++ IMustLock(dentry->d_inode); ++ ++ err = 0; ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (file) ++ err = au_do_fsync_dir(file, datasync); ++ else { ++ di_write_lock_child(dentry); ++ err = au_do_fsync_dir_no_file(dentry, datasync); ++ } ++ au_cpup_attr_timesizes(dentry->d_inode); ++ di_write_unlock(dentry); ++ if (file) ++ fi_write_unlock(file); ++ ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_readdir(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ err = au_vdir_init(file); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ if (!au_test_nfsd(current)) { ++ err = au_vdir_fill_de(file, dirent, filldir); ++ fsstack_copy_attr_atime(inode, ++ au_h_iptr(inode, au_ibstart(inode))); ++ } else { ++ /* ++ * nfsd filldir may call lookup_one_len(), vfs_getattr(), ++ * encode_fh() and others. 
++ */ ++ struct inode *h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ si_read_unlock(sb); ++ lockdep_off(); ++ err = au_vdir_fill_de(file, dirent, filldir); ++ lockdep_on(); ++ fsstack_copy_attr_atime(inode, h_inode); ++ fi_write_unlock(file); ++ ++ AuTraceErr(err); ++ return err; ++ } ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuTestEmpty_WHONLY 1 ++#define AuTestEmpty_CALLED (1 << 1) ++#define AuTestEmpty_SHWH (1 << 2) ++#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) ++#define au_fset_testempty(flags, name) { (flags) |= AuTestEmpty_##name; } ++#define au_fclr_testempty(flags, name) { (flags) &= ~AuTestEmpty_##name; } ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuTestEmpty_SHWH ++#define AuTestEmpty_SHWH 0 ++#endif ++ ++struct test_empty_arg { ++ struct au_nhash *whlist; ++ unsigned int flags; ++ int err; ++ aufs_bindex_t bindex; ++}; ++ ++static int test_empty_cb(void *__arg, const char *__name, int namelen, ++ loff_t offset __maybe_unused, u64 ino, ++ unsigned int d_type) ++{ ++ struct test_empty_arg *arg = __arg; ++ char *name = (void *)__name; ++ ++ arg->err = 0; ++ au_fset_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (name[0] == '.' ++ && (namelen == 1 || (name[1] == '.' && namelen == 2))) ++ goto out; /* success */ ++ ++ if (namelen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (au_ftest_testempty(arg->flags, WHONLY) ++ && !au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = -ENOTEMPTY; ++ goto out; ++ } ++ ++ name += AUFS_WH_PFX_LEN; ++ namelen -= AUFS_WH_PFX_LEN; ++ if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = au_nhash_append_wh ++ (arg->whlist, name, namelen, ino, d_type, arg->bindex, ++ au_ftest_testempty(arg->flags, SHWH)); ++ ++ out: ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err; ++ struct file *h_file; ++ ++ h_file = au_h_open(dentry, arg->bindex, ++ O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, ++ /*file*/NULL); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = 0; ++ if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) ++ && !h_file->f_dentry->d_inode->i_nlink) ++ goto out_put; ++ ++ do { ++ arg->err = 0; ++ au_fclr_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(h_file, test_empty_cb, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_testempty(arg->flags, CALLED)); ++ ++ out_put: ++ fput(h_file); ++ au_sbr_put(dentry->d_sb, arg->bindex); ++ out: ++ return err; ++} ++ ++struct do_test_empty_args { ++ int *errp; ++ struct dentry *dentry; ++ struct test_empty_arg *arg; ++}; ++ ++static void call_do_test_empty(void *args) ++{ ++ struct do_test_empty_args *a = args; ++ *a->errp = do_test_empty(a->dentry, a->arg); ++} ++ ++static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err, wkq_err; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, arg->bindex); ++ h_inode = h_dentry->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); ++ mutex_unlock(&h_inode->i_mutex); ++ if (!err) ++ err = do_test_empty(dentry, arg); ++ else { ++ 
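/* permission denied for the current task; redo the lookup via the superio workqueue */ ++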
struct do_test_empty_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .arg = arg ++ }; ++ unsigned int flags = arg->flags; ++ ++ wkq_err = au_wkq_wait(call_do_test_empty, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ arg->flags = flags; ++ } ++ ++ return err; ++} ++ ++int au_test_empty_lower(struct dentry *dentry) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t bindex, bstart, btail; ++ struct au_nhash whlist; ++ struct test_empty_arg arg; ++ ++ SiMustAnyLock(dentry->d_sb); ++ ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ arg.flags = 0; ++ arg.whlist = &whlist; ++ bstart = au_dbstart(dentry); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ arg.bindex = bstart; ++ err = do_test_empty(dentry, &arg); ++ if (unlikely(err)) ++ goto out_whlist; ++ ++ au_fset_testempty(arg.flags, WHONLY); ++ btail = au_dbtaildir(dentry); ++ for (bindex = bstart + 1; !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) { ++ arg.bindex = bindex; ++ err = do_test_empty(dentry, &arg); ++ } ++ } ++ ++ out_whlist: ++ au_nhash_wh_free(&whlist); ++ out: ++ return err; ++} ++ ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ int err; ++ struct test_empty_arg arg; ++ aufs_bindex_t bindex, btail; ++ ++ err = 0; ++ arg.whlist = whlist; ++ arg.flags = AuTestEmpty_WHONLY; ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) { ++ arg.bindex = bindex; ++ err = sio_test_empty(dentry, &arg); ++ } ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_dir_fop = { ++ .read = generic_read_dir, ++ .readdir = aufs_readdir, ++ .unlocked_ioctl = aufs_ioctl_dir, ++ .open = aufs_open_dir, ++ .release = aufs_release_dir, ++ .flush = aufs_flush, ++ .fsync = aufs_fsync_dir ++}; +diff -Nur linux-2.6.31-vanilla/fs/aufs/dir.h linux-2.6.31/fs/aufs/dir.h +--- linux-2.6.31-vanilla/fs/aufs/dir.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/dir.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,127 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#ifndef __AUFS_DIR_H__ ++#define __AUFS_DIR_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fs.h> ++#include <linux/aufs_type.h> ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* need to be faster and smaller */ ++ ++struct au_nhash { ++ unsigned int nh_num; ++ struct hlist_head *nh_head; ++}; ++ ++struct au_vdir_destr { ++ unsigned char len; ++ unsigned char name[0]; ++} __packed; ++ ++struct au_vdir_dehstr { ++ struct hlist_node hash; ++ struct au_vdir_destr *str; ++}; ++ ++struct au_vdir_de { ++ ino_t de_ino; ++ unsigned char de_type; ++ /* caution: packed */ ++ struct au_vdir_destr de_str; ++} __packed; ++ ++struct au_vdir_wh { ++ struct hlist_node wh_hash; ++#ifdef CONFIG_AUFS_SHWH ++ ino_t wh_ino; ++ aufs_bindex_t wh_bindex; ++ unsigned char wh_type; ++#else ++ aufs_bindex_t wh_bindex; ++#endif ++ /* caution: packed */ ++ struct au_vdir_destr wh_str; ++} __packed; ++ ++union au_vdir_deblk_p { ++ unsigned char *deblk; ++ struct au_vdir_de *de; ++}; ++ ++struct au_vdir { ++ unsigned char **vd_deblk; ++ unsigned long vd_nblk; ++ struct { ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ } vd_last; ++ ++ unsigned long vd_version; ++ unsigned int vd_deblk_sz; ++ unsigned long vd_jiffy; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dir.c */ ++extern const struct file_operations aufs_dir_fop; ++void au_add_nlink(struct inode *dir, struct inode *h_dir); ++void au_sub_nlink(struct inode *dir, struct inode *h_dir); ++loff_t au_dir_size(struct file *file, struct dentry *dentry); ++int au_test_empty_lower(struct dentry *dentry); ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); ++ ++/* vdir.c */ ++unsigned int au_rdhash_est(loff_t sz); ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); ++void au_nhash_wh_free(struct au_nhash *whlist); ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit); ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh); ++void au_vdir_free(struct au_vdir *vdir); ++int au_vdir_init(struct file *file); ++int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir); ++ ++/* ioctl.c */ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); ++ ++#ifdef CONFIG_AUFS_RDU ++/* rdu.c */ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); ++#else ++static inline long au_rdu_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return -EINVAL; ++} ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DIR_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/export.c linux-2.6.31/fs/aufs/export.c +--- linux-2.6.31-vanilla/fs/aufs/export.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/export.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,746 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * export via nfs ++ */ ++ ++#include <linux/exportfs.h> ++#include <linux/file.h> ++#include <linux/mnt_namespace.h> ++#include <linux/namei.h> ++#include <linux/nsproxy.h> ++#include <linux/random.h> ++#include "aufs.h" ++ ++union conv { ++#ifdef CONFIG_AUFS_INO_T_64 ++ __u32 a[2]; ++#else ++ __u32 a[1]; ++#endif ++ ino_t ino; ++}; ++ ++static ino_t decode_ino(__u32 *a) ++{ ++ union conv u; ++ ++ BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); ++ u.a[0] = a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ u.a[1] = a[1]; ++#endif ++ return u.ino; ++} ++ ++static void encode_ino(__u32 *a, ino_t ino) ++{ ++ union conv u; ++ ++ u.ino = ino; ++ a[0] = u.a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ a[1] = u.a[1]; ++#endif ++} ++ ++/* NFS file handle */ ++enum { ++ Fh_br_id, ++ Fh_sigen, ++#ifdef CONFIG_AUFS_INO_T_64 ++ /* support 64bit inode number */ ++ Fh_ino1, ++ Fh_ino2, ++ Fh_dir_ino1, ++ Fh_dir_ino2, ++#else ++ Fh_ino1, ++ Fh_dir_ino1, ++#endif ++ Fh_igen, ++ Fh_h_type, ++ Fh_tail, ++ ++ Fh_ino = Fh_ino1, ++ Fh_dir_ino = Fh_dir_ino1 ++}; ++ ++static int au_test_anon(struct dentry *dentry) ++{ ++ return !!(dentry->d_flags & DCACHE_DISCONNECTED); ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* inode generation external table */ ++ ++int au_xigen_inc(struct inode *inode) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ __u32 igen; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ sb = inode->i_sb; ++ sbinfo = au_sbi(sb); ++ /* ++ * temporary workaround for escaping from SiMustAnyLock() in ++ * au_mntflags(), since this function is called from au_iinfo_fin(). 
++ */ ++ if (unlikely(!au_opt_test(sbinfo->si_mntflags, XINO))) ++ goto out; ++ ++ pos = inode->i_ino; ++ pos *= sizeof(igen); ++ igen = inode->i_generation + 1; ++ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, ++ sizeof(igen), &pos); ++ if (sz == sizeof(igen)) ++ goto out; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xigen error (%zd)\n", sz); ++ } ++ ++ out: ++ return err; ++} ++ ++int au_xigen_new(struct inode *inode) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ err = 0; ++ /* todo: dirty, at mount time */ ++ if (inode->i_ino == AUFS_ROOT_INO) ++ goto out; ++ sb = inode->i_sb; ++ SiMustAnyLock(sb); ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ goto out; ++ ++ err = -EFBIG; ++ pos = inode->i_ino; ++ if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { ++ AuIOErr1("too large i%lld\n", pos); ++ goto out; ++ } ++ pos *= sizeof(inode->i_generation); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ file = sbinfo->si_xigen; ++ BUG_ON(!file); ++ ++ if (i_size_read(file->f_dentry->d_inode) ++ < pos + sizeof(inode->i_generation)) { ++ inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); ++ sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ } else ++ sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ if (sz == sizeof(inode->i_generation)) ++ goto out; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xigen error (%zd)\n", sz); ++ } ++ ++ out: ++ return err; ++} ++ ++int au_xigen_set(struct super_block *sb, struct file *base) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xigen); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ err = 0; ++ if (sbinfo->si_xigen) ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = file; ++ ++ out: ++ return err; ++} ++ ++void au_xigen_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_xigen) { ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino) ++{ ++ struct dentry *dentry, *d; ++ struct inode *inode; ++ unsigned int sigen; ++ ++ dentry = NULL; ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ dentry = ERR_PTR(-ESTALE); ++ sigen = au_sigen(sb); ++ if (unlikely(is_bad_inode(inode) ++ || IS_DEADDIR(inode) ++ || sigen != au_iigen(inode))) ++ goto out_iput; ++ ++ dentry = NULL; ++ if (!dir_ino || S_ISDIR(inode->i_mode)) ++ dentry = d_find_alias(inode); ++ else { ++ spin_lock(&dcache_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) ++ if (!au_test_anon(d) ++ && d->d_parent->d_inode->i_ino == dir_ino) { ++ dentry = dget_locked(d); ++ break; ++ } ++ spin_unlock(&dcache_lock); ++ } ++ if (unlikely(dentry && sigen != au_digen(dentry))) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++ ++ out_iput: ++ iput(inode); ++ out: ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: dirty? 
*/ ++/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ ++static struct vfsmount *au_mnt_get(struct super_block *sb) ++{ ++ struct mnt_namespace *ns; ++ struct vfsmount *pos, *mnt; ++ ++ spin_lock(&vfsmount_lock); ++ /* no get/put ?? */ ++ AuDebugOn(!current->nsproxy); ++ ns = current->nsproxy->mnt_ns; ++ AuDebugOn(!ns); ++ mnt = NULL; ++ /* the order (reverse) will not be a problem */ ++ list_for_each_entry(pos, &ns->list, mnt_list) ++ if (pos->mnt_sb == sb) { ++ mnt = mntget(pos); ++ break; ++ } ++ spin_unlock(&vfsmount_lock); ++ AuDebugOn(!mnt); ++ ++ return mnt; ++} ++ ++struct au_nfsd_si_lock { ++ const unsigned int sigen; ++ const aufs_bindex_t br_id; ++ unsigned char force_lock; ++}; ++ ++static aufs_bindex_t si_nfsd_read_lock(struct super_block *sb, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ aufs_bindex_t bindex; ++ ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ /* branch id may be wrapped around */ ++ bindex = au_br_index(sb, nsi_lock->br_id); ++ if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) ++ goto out; /* success */ ++ ++ if (!nsi_lock->force_lock) ++ si_read_unlock(sb); ++ bindex = -1; ++ ++ out: ++ return bindex; ++} ++ ++struct find_name_by_ino { ++ int called, found; ++ ino_t ino; ++ char *name; ++ int namelen; ++}; ++ ++static int ++find_name_by_ino(void *arg, const char *name, int namelen, loff_t offset, ++ u64 ino, unsigned int d_type) ++{ ++ struct find_name_by_ino *a = arg; ++ ++ a->called++; ++ if (a->ino != ino) ++ return 0; ++ ++ memcpy(a->name, name, namelen); ++ a->namelen = namelen; ++ a->found = 1; ++ return 1; ++} ++ ++static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *parent; ++ struct file *file; ++ struct inode *dir; ++ struct find_name_by_ino arg; ++ int err; ++ ++ parent = path->dentry; ++ if (nsi_lock) ++ si_read_unlock(parent->d_sb); ++ path_get(path); ++ file = vfsub_dentry_open(path, au_dir_roflags, current_cred()); ++ dentry = (void *)file; ++ if (IS_ERR(file)) ++ goto out; ++ ++ dentry = ERR_PTR(-ENOMEM); ++ arg.name = __getname(); ++ if (unlikely(!arg.name)) ++ goto out_file; ++ arg.ino = ino; ++ arg.found = 0; ++ do { ++ arg.called = 0; ++ /* smp_mb(); */ ++ err = vfsub_readdir(file, find_name_by_ino, &arg); ++ } while (!err && !arg.found && arg.called); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_name; ++ dentry = ERR_PTR(-ENOENT); ++ if (!arg.found) ++ goto out_name; ++ ++ /* do not call au_lkup_one() */ ++ dir = parent->d_inode; ++ mutex_lock(&dir->i_mutex); ++ dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen); ++ mutex_unlock(&dir->i_mutex); ++ AuTraceErrPtr(dentry); ++ if (IS_ERR(dentry)) ++ goto out_name; ++ AuDebugOn(au_test_anon(dentry)); ++ if (unlikely(!dentry->d_inode)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ENOENT); ++ } ++ ++ out_name: ++ __putname(arg.name); ++ out_file: ++ fput(file); ++ out: ++ if (unlikely(nsi_lock ++ && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry; ++ struct path path; ++ ++ if (dir_ino != AUFS_ROOT_INO) { ++ path.dentry = decode_by_ino(sb, dir_ino, 0); ++ dentry = path.dentry; ++ if (!path.dentry || IS_ERR(path.dentry)) ++ goto out; ++ AuDebugOn(au_test_anon(path.dentry)); ++ } else 
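++		/* dir_ino is the aufs root; start from s_root directly */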
++ path.dentry = dget(sb->s_root); ++ ++ path.mnt = au_mnt_get(sb); ++ dentry = au_lkup_by_ino(&path, ino, nsi_lock); ++ path_put(&path); ++ ++ out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int h_acceptable(void *expv, struct dentry *dentry) ++{ ++ return 1; ++} ++ ++static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, ++ char *buf, int len, struct super_block *sb) ++{ ++ char *p; ++ int n; ++ struct path path; ++ ++ p = d_path(h_rootpath, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ n = strlen(p); ++ ++ path.mnt = h_rootpath->mnt; ++ path.dentry = h_parent; ++ p = d_path(&path, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p += n; ++ ++ path.mnt = au_mnt_get(sb); ++ path.dentry = sb->s_root; ++ p = d_path(&path, buf, len - strlen(p)); ++ mntput(path.mnt); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p[strlen(p)] = '/'; ++ ++ out: ++ AuTraceErrPtr(p); ++ return p; ++} ++ ++static ++struct dentry *decode_by_path(struct super_block *sb, aufs_bindex_t bindex, ++ ino_t ino, __u32 *fh, int fh_len, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *h_parent, *root; ++ struct super_block *h_sb; ++ char *pathname, *p; ++ struct vfsmount *h_mnt; ++ struct au_branch *br; ++ int err; ++ struct path path; ++ ++ br = au_sbr(sb, bindex); ++ /* au_br_get(br); */ ++ h_mnt = br->br_mnt; ++ h_sb = h_mnt->mnt_sb; ++ /* todo: call lower fh_to_dentry()? fh_to_parent()? */ ++ h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), ++ fh_len - Fh_tail, fh[Fh_h_type], ++ h_acceptable, /*context*/NULL); ++ dentry = h_parent; ++ if (unlikely(!h_parent || IS_ERR(h_parent))) { ++ AuWarn1("%s decode_fh failed, %ld\n", ++ au_sbtype(h_sb), PTR_ERR(h_parent)); ++ goto out; ++ } ++ dentry = NULL; ++ if (unlikely(au_test_anon(h_parent))) { ++ AuWarn1("%s decode_fh returned a disconnected dentry\n", ++ au_sbtype(h_sb)); ++ goto out_h_parent; ++ } ++ ++ dentry = ERR_PTR(-ENOMEM); ++ pathname = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!pathname)) ++ goto out_h_parent; ++ ++ root = sb->s_root; ++ path.mnt = h_mnt; ++ di_read_lock_parent(root, !AuLock_IR); ++ path.dentry = au_h_dptr(root, bindex); ++ di_read_unlock(root, !AuLock_IR); ++ p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); ++ dentry = (void *)p; ++ if (IS_ERR(p)) ++ goto out_pathname; ++ ++ si_read_unlock(sb); ++ err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_relock; ++ ++ dentry = ERR_PTR(-ENOENT); ++ AuDebugOn(au_test_anon(path.dentry)); ++ if (unlikely(!path.dentry->d_inode)) ++ goto out_path; ++ ++ if (ino != path.dentry->d_inode->i_ino) ++ dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); ++ else ++ dentry = dget(path.dentry); ++ ++ out_path: ++ path_put(&path); ++ out_relock: ++ if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++ out_pathname: ++ free_page((unsigned long)pathname); ++ out_h_parent: ++ dput(h_parent); ++ out: ++ /* au_br_put(br); */ ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry * ++aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, ++ int fh_type) ++{ ++ struct dentry *dentry; ++ __u32 *fh = fid->raw; ++ ino_t ino, dir_ino; ++ aufs_bindex_t bindex; ++ struct 
au_nfsd_si_lock nsi_lock = { ++ .sigen = fh[Fh_sigen], ++ .br_id = fh[Fh_br_id], ++ .force_lock = 0 ++ }; ++ ++ AuDebugOn(fh_len < Fh_tail); ++ ++ dentry = ERR_PTR(-ESTALE); ++ /* branch id may be wrapped around */ ++ bindex = si_nfsd_read_lock(sb, &nsi_lock); ++ if (unlikely(bindex < 0)) ++ goto out; ++ nsi_lock.force_lock = 1; ++ ++ /* is this inode still cached? */ ++ ino = decode_ino(fh + Fh_ino); ++ AuDebugOn(ino == AUFS_ROOT_INO); ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ dentry = decode_by_ino(sb, ino, dir_ino); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* is the parent dir cached? */ ++ dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* lookup path */ ++ dentry = decode_by_path(sb, bindex, ino, fh, fh_len, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (unlikely(!dentry)) ++ /* todo?: make it ESTALE */ ++ goto out_unlock; ++ ++ accept: ++ if (dentry->d_inode->i_generation == fh[Fh_igen]) ++ goto out_unlock; /* success */ ++ ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ out_unlock: ++ si_read_unlock(sb); ++ out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++#if 0 /* reserved for future use */ ++/* support subtreecheck option */ ++static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ struct dentry *parent; ++ __u32 *fh = fid->raw; ++ ino_t dir_ino; ++ ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ parent = decode_by_ino(sb, dir_ino, 0); ++ if (IS_ERR(parent)) ++ goto out; ++ if (!parent) ++ parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), ++ dir_ino, fh, fh_len); ++ ++ out: ++ AuTraceErrPtr(parent); ++ return parent; ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, ++ int connectable) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct super_block *sb, *h_sb; ++ struct inode *inode; ++ struct dentry *parent, *h_parent; ++ struct au_branch *br; ++ ++ AuDebugOn(au_test_anon(dentry)); ++ ++ parent = NULL; ++ err = -ENOSPC; ++ if (unlikely(*max_len <= Fh_tail)) { ++ AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); ++ goto out; ++ } ++ ++ err = FILEID_ROOT; ++ if (IS_ROOT(dentry)) { ++ AuDebugOn(dentry->d_inode->i_ino != AUFS_ROOT_INO); ++ goto out; ++ } ++ ++ err = -EIO; ++ h_parent = NULL; ++ sb = dentry->d_sb; ++ aufs_read_lock(dentry, AuLock_FLUSH | AuLock_IR); ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, !AuLock_IR); ++ inode = dentry->d_inode; ++ AuDebugOn(!inode); ++#ifdef CONFIG_AUFS_DEBUG ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ AuWarn1("NFS-exporting requires xino\n"); ++#endif ++ ++ bend = au_dbtaildir(parent); ++ for (bindex = au_dbstart(parent); bindex <= bend; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (h_parent) { ++ dget(h_parent); ++ break; ++ } ++ } ++ if (unlikely(!h_parent)) ++ goto out_unlock; ++ ++ err = -EPERM; ++ br = au_sbr(sb, bindex); ++ h_sb = br->br_mnt->mnt_sb; ++ if (unlikely(!h_sb->s_export_op)) { ++ AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); ++ goto out_dput; ++ } ++ ++ fh[Fh_br_id] = br->br_id; ++ fh[Fh_sigen] = au_sigen(sb); ++ encode_ino(fh + Fh_ino, inode->i_ino); ++ encode_ino(fh + Fh_dir_ino, parent->d_inode->i_ino); ++ fh[Fh_igen] = inode->i_generation; ++ ++ *max_len -= Fh_tail; ++ fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void 
*)(fh + Fh_tail), ++ max_len, ++ /*connectable or subtreecheck*/0); ++ err = fh[Fh_h_type]; ++ *max_len += Fh_tail; ++ /* todo: macros? */ ++ if (err != 255) ++ err = 99; ++ else ++ AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); ++ ++ out_dput: ++ dput(h_parent); ++ out_unlock: ++ di_read_unlock(parent, !AuLock_IR); ++ dput(parent); ++ aufs_read_unlock(dentry, AuLock_IR); ++ out: ++ if (unlikely(err < 0)) ++ err = 255; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct export_operations aufs_export_op = { ++ .fh_to_dentry = aufs_fh_to_dentry, ++ /* .fh_to_parent = aufs_fh_to_parent, */ ++ .encode_fh = aufs_encode_fh ++}; ++ ++void au_export_init(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ __u32 u; ++ ++ sb->s_export_op = &aufs_export_op; ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xigen = NULL; ++ get_random_bytes(&u, sizeof(u)); ++ BUILD_BUG_ON(sizeof(u) != sizeof(int)); ++ atomic_set(&sbinfo->si_xigen_next, u); ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/file.c linux-2.6.31/fs/aufs/file.c +--- linux-2.6.31-vanilla/fs/aufs/file.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/file.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,568 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
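
The export code above packs a stable branch id (Fh_br_id), the super block generation (Fh_sigen), the inode and parent directory inode numbers, and the inode generation into a __u32 array, then appends the lower filesystem's own handle at Fh_tail. Below is a rough userspace model of that kind of slot layout; the slot order, the two-slot packing of a 64-bit ino, and every name in it are assumptions of this sketch, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

/* Invented slot layout, loosely modelled on the Fh_* indices above. */
enum {
	FH_BR_ID,			/* stable branch id, not branch index */
	FH_SIGEN,			/* super block generation at encode time */
	FH_INO_LO, FH_INO_HI,		/* 64-bit ino split over two slots */
	FH_DIR_INO_LO, FH_DIR_INO_HI,
	FH_IGEN,			/* inode generation, checked on decode */
	FH_TAIL				/* lower fs's own handle would follow */
};

static void encode_ino(uint32_t *fh, uint64_t ino)
{
	fh[0] = (uint32_t)ino;
	fh[1] = (uint32_t)(ino >> 32);
}

static uint64_t decode_ino(const uint32_t *fh)
{
	return fh[0] | ((uint64_t)fh[1] << 32);
}

int main(void)
{
	uint32_t fh[FH_TAIL] = {0};

	fh[FH_BR_ID] = 3;
	fh[FH_SIGEN] = 42;
	fh[FH_IGEN] = 7;
	encode_ino(fh + FH_INO_LO, 123456789ULL);
	encode_ino(fh + FH_DIR_INO_LO, 987654321ULL);

	printf("ino=%llu dir_ino=%llu br_id=%u sigen=%u igen=%u\n",
	       (unsigned long long)decode_ino(fh + FH_INO_LO),
	       (unsigned long long)decode_ino(fh + FH_DIR_INO_LO),
	       fh[FH_BR_ID], fh[FH_SIGEN], fh[FH_IGEN]);
	return 0;
}

Decoding only reads the slots back; that is what lets aufs_fh_to_dentry() above try the cheap inode-cache lookups first and fall back to a full path walk only when they miss.
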
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * handling file/dir, and address_space operation ++ */ ++ ++#include <linux/file.h> ++#include <linux/fsnotify.h> ++#include <linux/namei.h> ++#include <linux/pagemap.h> ++#include "aufs.h" ++ ++/* drop flags for writing */ ++unsigned int au_file_roflags(unsigned int flags) ++{ ++ flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); ++ flags |= O_RDONLY | O_NOATIME; ++ return flags; ++} ++ ++/* common functions to regular file and dir */ ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file) ++{ ++ struct file *h_file; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err, exec_flag; ++ struct path h_path; ++ ++ /* a race condition can happen between open and unlink/rmdir */ ++ h_file = ERR_PTR(-ENOENT); ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (au_test_nfsd(current) && !h_dentry) ++ goto out; ++ h_inode = h_dentry->d_inode; ++ if (au_test_nfsd(current) && !h_inode) ++ goto out; ++ if (unlikely((!d_unhashed(dentry) && d_unhashed(h_dentry)) ++ || !h_inode)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ h_file = ERR_PTR(-EACCES); ++ exec_flag = flags & vfsub_fmode_to_uint(FMODE_EXEC); ++ if (exec_flag && (br->br_mnt->mnt_flags & MNT_NOEXEC)) ++ goto out; ++ ++ /* drop flags for writing */ ++ if (au_test_ro(sb, bindex, dentry->d_inode)) ++ flags = au_file_roflags(flags); ++ flags &= ~O_CREAT; ++ atomic_inc(&br->br_count); ++ h_path.dentry = h_dentry; ++ h_path.mnt = br->br_mnt; ++ path_get(&h_path); ++ h_file = vfsub_dentry_open(&h_path, flags, current_cred()); ++ if (IS_ERR(h_file)) ++ goto out_br; ++ ++ if (exec_flag) { ++ err = deny_write_access(h_file); ++ if (unlikely(err)) { ++ fput(h_file); ++ h_file = ERR_PTR(err); ++ goto out_br; ++ } ++ } ++ fsnotify_open(h_dentry); ++ goto out; /* success */ ++ ++ out_br: ++ atomic_dec(&br->br_count); ++ out: ++ return h_file; ++} ++ ++int au_do_open(struct file *file, int (*open)(struct file *file, int flags)) ++{ ++ int err; ++ unsigned int flags; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_finfo_init(file); ++ if (unlikely(err)) ++ goto out; ++ ++ di_read_lock_child(dentry, AuLock_IR); ++ spin_lock(&file->f_lock); ++ flags = file->f_flags; ++ spin_unlock(&file->f_lock); ++ err = open(file, flags); ++ di_read_unlock(dentry, AuLock_IR); ++ ++ fi_write_unlock(file); ++ if (unlikely(err)) ++ au_finfo_fin(file); ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++int au_reopen_nondir(struct file *file) ++{ ++ int err; ++ unsigned int flags; ++ aufs_bindex_t bstart, bindex, bend; ++ struct dentry *dentry; ++ struct file *h_file, *h_file_tmp; ++ ++ dentry = file->f_dentry; ++ bstart = au_dbstart(dentry); ++ h_file_tmp = NULL; ++ if (au_fbstart(file) == bstart) { ++ h_file = au_h_fptr(file, bstart); ++ if (file->f_mode == h_file->f_mode) ++ return 0; /* success */ ++ h_file_tmp = h_file; ++ get_file(h_file_tmp); ++ au_set_h_fptr(file, bstart, NULL); ++ } ++ AuDebugOn(au_fbstart(file) < bstart ++ || au_fi(file)->fi_hfile[0 + bstart].hf_file); ++ ++ spin_lock(&file->f_lock); ++ flags = file->f_flags & ~O_TRUNC; ++ spin_unlock(&file->f_lock); ++ h_file = 
au_h_open(dentry, bstart, flags, file); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; /* todo: close all? */ ++ ++ err = 0; ++ au_set_fbstart(file, bstart); ++ au_set_h_fptr(file, bstart, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ ++ /* close lower files */ ++ bend = au_fbend(file); ++ for (bindex = bstart + 1; bindex <= bend; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbend(file, bstart); ++ ++ out: ++ if (h_file_tmp) ++ fput(h_file_tmp); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, ++ struct dentry *hi_wh) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_dinfo *dinfo; ++ struct dentry *h_dentry; ++ ++ dinfo = au_di(file->f_dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bstart = dinfo->di_bstart; ++ dinfo->di_bstart = btgt; ++ h_dentry = dinfo->di_hdentry[0 + btgt].hd_dentry; ++ dinfo->di_hdentry[0 + btgt].hd_dentry = hi_wh; ++ err = au_reopen_nondir(file); ++ dinfo->di_hdentry[0 + btgt].hd_dentry = h_dentry; ++ dinfo->di_bstart = bstart; ++ ++ return err; ++} ++ ++static int au_ready_to_write_wh(struct file *file, loff_t len, ++ aufs_bindex_t bcpup) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *dentry, *hi_wh; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ hi_wh = au_hi_wh(inode, bcpup); ++ if (!hi_wh) ++ err = au_sio_cpup_wh(dentry, bcpup, len, file); ++ else ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, bcpup, hi_wh); ++ ++ sb = dentry->d_sb; ++ if (!err && inode->i_nlink > 1 && au_opt_test(au_mntflags(sb), PLINK)) ++ au_plink_append(inode, bcpup, au_h_dptr(dentry, bcpup)); ++ ++ return err; ++} ++ ++/* ++ * prepare the @file for writing. 
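
au_ready_to_write() below implements copy-up on first write: if the file currently lives only on a read-only branch, its data is copied up to a writable branch (creating missing parent directories there) before the write proceeds. The toy below shows only the branch choice; real aufs routes this decision through its configurable create policies (AuWbrCopyup), so the "first writable branch from the top" rule here is purely an illustration with invented names.

#include <stdio.h>

struct branch {
	const char *path;
	int ro;
};

/* Branches are ordered top-down; take the first one that can be
 * written. Returns the branch index or -1 if the union is
 * effectively read-only. */
static int pick_copyup_target(const struct branch *br, int nbr)
{
	int i;

	for (i = 0; i < nbr; i++)
		if (!br[i].ro)
			return i;
	return -1;
}

int main(void)
{
	const struct branch br[] = {
		{ "/rw", 0 }, { "/ro1", 1 }, { "/ro2", 1 },
	};
	int t = pick_copyup_target(br, 3);

	printf("copy-up target: %s\n", t < 0 ? "(none)" : br[t].path);
	return 0;
}

With all branches marked ro it reports none, which corresponds to the write simply failing on a fully read-only union.
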
++ */ ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) ++{ ++ int err; ++ aufs_bindex_t bstart, bcpup; ++ struct dentry *dentry, *parent, *h_dentry; ++ struct inode *h_inode, *inode; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ bstart = au_fbstart(file); ++ inode = dentry->d_inode; ++ err = au_test_ro(sb, bstart, inode); ++ if (!err && (au_h_fptr(file, bstart)->f_mode & FMODE_WRITE)) { ++ err = au_pin(pin, dentry, bstart, AuOpt_UDBA_NONE, /*flags*/0); ++ goto out; ++ } ++ ++ /* need to cpup */ ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ bcpup = err; ++ if (unlikely(err < 0)) ++ goto out_dgrade; ++ err = 0; ++ ++ if (!au_h_dptr(parent, bcpup)) { ++ err = au_cpup_dirs(dentry, bcpup); ++ if (unlikely(err)) ++ goto out_dgrade; ++ } ++ ++ err = au_pin(pin, dentry, bcpup, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_dgrade; ++ ++ h_dentry = au_h_fptr(file, bstart)->f_dentry; ++ h_inode = h_dentry->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ if (d_unhashed(dentry) /* || d_unhashed(h_dentry) */ ++ /* || !h_inode->i_nlink */) { ++ err = au_ready_to_write_wh(file, len, bcpup); ++ di_downgrade_lock(parent, AuLock_IR); ++ } else { ++ di_downgrade_lock(parent, AuLock_IR); ++ if (!au_h_dptr(dentry, bcpup)) ++ err = au_sio_cpup_simple(dentry, bcpup, len, ++ AuCpup_DTIME); ++ if (!err) ++ err = au_reopen_nondir(file); ++ } ++ mutex_unlock(&h_inode->i_mutex); ++ ++ if (!err) { ++ au_pin_set_parent_lflag(pin, /*lflag*/0); ++ goto out_dput; /* success */ ++ } ++ au_unpin(pin); ++ goto out_unlock; ++ ++ out_dgrade: ++ di_downgrade_lock(parent, AuLock_IR); ++ out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++ out_dput: ++ dput(parent); ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_file_refresh_by_inode(struct file *file, int *need_reopen) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_pin pin; ++ struct au_finfo *finfo; ++ struct dentry *dentry, *parent, *hi_wh; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ FiMustWriteLock(file); ++ ++ err = 0; ++ finfo = au_fi(file); ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ bstart = au_ibstart(inode); ++ if (bstart == finfo->fi_bstart) ++ goto out; ++ ++ parent = dget_parent(dentry); ++ if (au_test_ro(sb, bstart, inode)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ bstart = err; ++ di_read_unlock(parent, !AuLock_IR); ++ if (unlikely(err < 0)) ++ goto out_parent; ++ err = 0; ++ } ++ ++ di_read_lock_parent(parent, AuLock_IR); ++ hi_wh = au_hi_wh(inode, bstart); ++ if (au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode) ++ && !d_unhashed(dentry)) { ++ err = au_test_and_cpup_dirs(dentry, bstart); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ /* always superio. 
*/ ++ err = au_pin(&pin, dentry, bstart, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (!err) ++ err = au_sio_cpup_simple(dentry, bstart, -1, ++ AuCpup_DTIME); ++ au_unpin(&pin); ++ } else if (hi_wh) { ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, bstart, hi_wh); ++ *need_reopen = 0; ++ } ++ ++ out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++ out_parent: ++ dput(parent); ++ out: ++ return err; ++} ++ ++static void au_do_refresh_file(struct file *file) ++{ ++ aufs_bindex_t bindex, bend, new_bindex, brid; ++ struct au_hfile *p, tmp, *q; ++ struct au_finfo *finfo; ++ struct super_block *sb; ++ ++ FiMustWriteLock(file); ++ ++ sb = file->f_dentry->d_sb; ++ finfo = au_fi(file); ++ p = finfo->fi_hfile + finfo->fi_bstart; ++ brid = p->hf_br->br_id; ++ bend = finfo->fi_bend; ++ for (bindex = finfo->fi_bstart; bindex <= bend; bindex++, p++) { ++ if (!p->hf_file) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hf_br->br_id); ++ if (new_bindex == bindex) ++ continue; ++ if (new_bindex < 0) { ++ au_set_h_fptr(file, bindex, NULL); ++ continue; ++ } ++ ++ /* swap two lower inode, and loop again */ ++ q = finfo->fi_hfile + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hf_file) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ p = finfo->fi_hfile; ++ if (!au_test_mmapped(file) && !d_unhashed(file->f_dentry)) { ++ bend = au_sbend(sb); ++ for (finfo->fi_bstart = 0; finfo->fi_bstart <= bend; ++ finfo->fi_bstart++, p++) ++ if (p->hf_file) { ++ if (p->hf_file->f_dentry ++ && p->hf_file->f_dentry->d_inode) ++ break; ++ else ++ au_hfput(p, file); ++ } ++ } else { ++ bend = au_br_index(sb, brid); ++ for (finfo->fi_bstart = 0; finfo->fi_bstart < bend; ++ finfo->fi_bstart++, p++) ++ if (p->hf_file) ++ au_hfput(p, file); ++ bend = au_sbend(sb); ++ } ++ ++ p = finfo->fi_hfile + bend; ++ for (finfo->fi_bend = bend; finfo->fi_bend >= finfo->fi_bstart; ++ finfo->fi_bend--, p--) ++ if (p->hf_file) { ++ if (p->hf_file->f_dentry ++ && p->hf_file->f_dentry->d_inode) ++ break; ++ else ++ au_hfput(p, file); ++ } ++ AuDebugOn(finfo->fi_bend < finfo->fi_bstart); ++} ++ ++/* ++ * after branch manipulating, refresh the file. 
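
au_do_refresh_file() above re-sorts the per-file array of lower file pointers after the branch list has changed: each slot is keyed by a stable branch id, moved to that id's new index, and released when its branch is gone. The same bookkeeping as a self-contained toy, with invented ids and labels; in the kernel the dropped entry's lower file would be fput().

#include <stdio.h>

static int index_of(const int *ids, int n, int id)
{
	int i;

	for (i = 0; i < n; i++)
		if (ids[i] == id)
			return i;
	return -1;
}

int main(void)
{
	int old_ids[] = { 10, 20, 30 };	/* branch ids by old position */
	int new_ids[] = { 20, 30, 40 };	/* br 10 deleted, br 40 appended */
	const char *old_slot[] = {
		"h_file(br10)", "h_file(br20)", "h_file(br30)"
	};
	const char *new_slot[3] = { 0 };
	int i;

	for (i = 0; i < 3; i++) {
		int ni = index_of(new_ids, 3, old_ids[i]);

		if (ni >= 0)
			new_slot[ni] = old_slot[i];	/* moved to new index */
		/* else: branch vanished, lower file gets closed */
	}
	for (i = 0; i < 3; i++)
		printf("slot %d (br%d): %s\n", i, new_ids[i],
		       new_slot[i] ? new_slot[i] : "(closed)");
	return 0;
}
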
++ */ ++static int refresh_file(struct file *file, int (*reopen)(struct file *file)) ++{ ++ int err, need_reopen; ++ struct dentry *dentry; ++ aufs_bindex_t bend, bindex; ++ ++ dentry = file->f_dentry; ++ err = au_fi_realloc(au_fi(file), au_sbend(dentry->d_sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ au_do_refresh_file(file); ++ ++ err = 0; ++ need_reopen = 1; ++ if (!au_test_mmapped(file)) ++ err = au_file_refresh_by_inode(file, &need_reopen); ++ if (!err && need_reopen && !d_unhashed(dentry)) ++ err = reopen(file); ++ if (!err) { ++ au_update_figen(file); ++ return 0; /* success */ ++ } ++ ++ /* error, close all lower files */ ++ bend = au_fbend(file); ++ for (bindex = au_fbstart(file); bindex <= bend; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ ++ out: ++ return err; ++} ++ ++/* common function to regular file and dir */ ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock) ++{ ++ int err; ++ unsigned int sigen, figen; ++ aufs_bindex_t bstart; ++ unsigned char pseudo_link; ++ struct dentry *dentry; ++ ++ err = 0; ++ dentry = file->f_dentry; ++ sigen = au_sigen(dentry->d_sb); ++ fi_write_lock(file); ++ figen = au_figen(file); ++ di_write_lock_child(dentry); ++ bstart = au_dbstart(dentry); ++ pseudo_link = (bstart != au_ibstart(dentry->d_inode)); ++ if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ goto out; /* success */ ++ } ++ ++ AuDbg("sigen %d, figen %d\n", sigen, figen); ++ if (sigen != au_digen(dentry) ++ || sigen != au_iigen(dentry->d_inode)) { ++ err = au_reval_dpath(dentry, sigen); ++ if (unlikely(err < 0)) ++ goto out; ++ AuDebugOn(au_digen(dentry) != sigen ++ || au_iigen(dentry->d_inode) != sigen); ++ } ++ ++ err = refresh_file(file, reopen); ++ if (!err) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ } else { ++ di_write_unlock(dentry); ++ fi_write_unlock(file); ++ } ++ ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* cf. aufs_nopage() */ ++/* for madvise(2) */ ++static int aufs_readpage(struct file *file __maybe_unused, struct page *page) ++{ ++ unlock_page(page); ++ return 0; ++} ++ ++/* they will never be called. 
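
refresh_file() and au_reval_and_lock_fdi() above hinge on two generation counters: the super block's sigen, bumped whenever the branch set changes, and the per-file figen cached when the file was set up; a mismatch forces the refresh path. Stripped to that bare comparison, with toy struct names invented for this sketch:

#include <stdio.h>

struct toy_sb { unsigned int sigen; };
struct toy_file { unsigned int figen; };

static void reval(const struct toy_sb *sb, struct toy_file *f)
{
	if (f->figen == sb->sigen) {
		printf("gen %u: up to date, fast path\n", f->figen);
		return;
	}
	printf("stale: figen %u != sigen %u, refreshing\n",
	       f->figen, sb->sigen);
	f->figen = sb->sigen;	/* cf. au_update_figen() */
}

int main(void)
{
	struct toy_sb sb = { .sigen = 1 };
	struct toy_file f = { .figen = 1 };

	reval(&sb, &f);		/* matches: nothing to do */
	sb.sigen++;		/* a branch was added or removed */
	reval(&sb, &f);		/* now takes the refresh path */
	return 0;
}
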
*/ ++#ifdef CONFIG_AUFS_DEBUG ++static int aufs_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_writepage(struct page *page, struct writeback_control *wbc) ++{ AuUnsupport(); return 0; } ++static void aufs_sync_page(struct page *page) ++{ AuUnsupport(); } ++ ++static int aufs_set_page_dirty(struct page *page) ++{ AuUnsupport(); return 0; } ++static void aufs_invalidatepage(struct page *page, unsigned long offset) ++{ AuUnsupport(); } ++static int aufs_releasepage(struct page *page, gfp_t gfp) ++{ AuUnsupport(); return 0; } ++static ssize_t aufs_direct_IO(int rw, struct kiocb *iocb, ++ const struct iovec *iov, loff_t offset, ++ unsigned long nr_segs) ++{ AuUnsupport(); return 0; } ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++struct address_space_operations aufs_aop = { ++ .readpage = aufs_readpage, ++#ifdef CONFIG_AUFS_DEBUG ++ .writepage = aufs_writepage, ++ .sync_page = aufs_sync_page, ++ .set_page_dirty = aufs_set_page_dirty, ++ .write_begin = aufs_write_begin, ++ .write_end = aufs_write_end, ++ .invalidatepage = aufs_invalidatepage, ++ .releasepage = aufs_releasepage, ++ .direct_IO = aufs_direct_IO, ++#endif /* CONFIG_AUFS_DEBUG */ ++}; +diff -Nur linux-2.6.31-vanilla/fs/aufs/file.h linux-2.6.31/fs/aufs/file.h +--- linux-2.6.31-vanilla/fs/aufs/file.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/file.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,174 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file operations ++ */ ++ ++#ifndef __AUFS_FILE_H__ ++#define __AUFS_FILE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fs.h> ++#include <linux/poll.h> ++#include <linux/aufs_type.h> ++#include "rwsem.h" ++ ++struct au_branch; ++struct au_hfile { ++ struct file *hf_file; ++ struct au_branch *hf_br; ++}; ++ ++struct au_vdir; ++struct au_finfo { ++ atomic_t fi_generation; ++ ++ struct au_rwsem fi_rwsem; ++ struct au_hfile *fi_hfile; ++ aufs_bindex_t fi_bstart, fi_bend; ++ ++ union { ++ /* non-dir only */ ++ struct { ++ struct vm_operations_struct *fi_h_vm_ops; ++ struct vm_operations_struct *fi_vm_ops; ++ }; ++ ++ /* dir only */ ++ struct { ++ struct au_vdir *fi_vdir_cache; ++ int fi_maintain_plink; ++ }; ++ }; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* file.c */ ++extern struct address_space_operations aufs_aop; ++unsigned int au_file_roflags(unsigned int flags); ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file); ++int au_do_open(struct file *file, int (*open)(struct file *file, int flags)); ++int au_reopen_nondir(struct file *file); ++struct au_pin; ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock); ++ ++/* poll.c */ ++#ifdef CONFIG_AUFS_POLL ++unsigned int aufs_poll(struct file *file, poll_table *wait); ++#endif ++ ++/* f_op.c */ ++extern const struct file_operations aufs_file_fop; ++int aufs_flush(struct file *file, fl_owner_t id); ++ ++/* finfo.c */ ++void au_hfput(struct au_hfile *hf, struct file *file); ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, ++ struct file *h_file); ++ ++void au_update_figen(struct file *file); ++ ++void au_finfo_fin(struct file *file); ++int au_finfo_init(struct file *file); ++int au_fi_realloc(struct au_finfo *finfo, int nbr); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_finfo *au_fi(struct file *file) ++{ ++ return file->private_data; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * fi_read_lock, fi_write_lock, ++ * fi_read_unlock, fi_write_unlock, fi_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); ++ ++#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) ++#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) ++#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: hard/soft set? 
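
The FiMust*() assertions just defined are what keep the accessors that follow so small: every getter states and checks the lock its caller must already hold, so misuse trips in debug builds instead of racing silently. The same pattern in plain pthreads; the explicit 'held' flag is this sketch's own bookkeeping, since the kernel code asserts against its rwsem state directly.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct toy_finfo {
	pthread_mutex_t lock;
	int held;	/* debug bookkeeping only, not real lock state */
	int bstart;
};

static void fi_lock(struct toy_finfo *f)
{
	pthread_mutex_lock(&f->lock);
	f->held = 1;
}

static void fi_unlock(struct toy_finfo *f)
{
	f->held = 0;
	pthread_mutex_unlock(&f->lock);
}

static int fbstart(struct toy_finfo *f)
{
	assert(f->held);	/* cf. FiMustAnyLock(f) */
	return f->bstart;
}

int main(void)
{
	struct toy_finfo f = { PTHREAD_MUTEX_INITIALIZER, 0, 2 };

	fi_lock(&f);
	printf("bstart=%d\n", fbstart(&f));
	fi_unlock(&f);
	return 0;
}

Build with cc -pthread; calling fbstart() without fi_lock() aborts on the assert, which is exactly the failure mode the FiMust*() macros are there to surface.
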
*/ ++static inline aufs_bindex_t au_fbstart(struct file *file) ++{ ++ FiMustAnyLock(file); ++ return au_fi(file)->fi_bstart; ++} ++ ++static inline aufs_bindex_t au_fbend(struct file *file) ++{ ++ FiMustAnyLock(file); ++ return au_fi(file)->fi_bend; ++} ++ ++static inline struct au_vdir *au_fvdir_cache(struct file *file) ++{ ++ FiMustAnyLock(file); ++ return au_fi(file)->fi_vdir_cache; ++} ++ ++static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ au_fi(file)->fi_bstart = bindex; ++} ++ ++static inline void au_set_fbend(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ au_fi(file)->fi_bend = bindex; ++} ++ ++static inline void au_set_fvdir_cache(struct file *file, ++ struct au_vdir *vdir_cache) ++{ ++ FiMustWriteLock(file); ++ au_fi(file)->fi_vdir_cache = vdir_cache; ++} ++ ++static inline struct file *au_h_fptr(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustAnyLock(file); ++ return au_fi(file)->fi_hfile[0 + bindex].hf_file; ++} ++ ++/* todo: memory barrier? */ ++static inline unsigned int au_figen(struct file *f) ++{ ++ return atomic_read(&au_fi(f)->fi_generation); ++} ++ ++static inline int au_test_mmapped(struct file *f) ++{ ++ /* FiMustAnyLock(f); */ ++ return !!(au_fi(f)->fi_h_vm_ops); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FILE_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/finfo.c linux-2.6.31/fs/aufs/finfo.c +--- linux-2.6.31-vanilla/fs/aufs/finfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/finfo.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,128 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file private data ++ */ ++ ++#include <linux/file.h> ++#include "aufs.h" ++ ++void au_hfput(struct au_hfile *hf, struct file *file) ++{ ++ if (file->f_flags & vfsub_fmode_to_uint(FMODE_EXEC)) ++ allow_write_access(hf->hf_file); ++ fput(hf->hf_file); ++ hf->hf_file = NULL; ++ atomic_dec_return(&hf->hf_br->br_count); ++ hf->hf_br = NULL; ++} ++ ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) ++{ ++ struct au_finfo *finfo = au_fi(file); ++ struct au_hfile *hf; ++ ++ hf = finfo->fi_hfile + bindex; ++ if (hf->hf_file) ++ au_hfput(hf, file); ++ if (val) { ++ hf->hf_file = val; ++ hf->hf_br = au_sbr(file->f_dentry->d_sb, bindex); ++ } ++} ++ ++void au_update_figen(struct file *file) ++{ ++ atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_finfo_fin(struct file *file) ++{ ++ struct au_finfo *finfo; ++ aufs_bindex_t bindex, bend; ++ ++ fi_write_lock(file); ++ bend = au_fbend(file); ++ bindex = au_fbstart(file); ++ if (bindex >= 0) ++ /* ++ * calls fput() instead of filp_close(), ++ * since no dnotify or lock for the lower file. ++ */ ++ for (; bindex <= bend; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ ++ finfo = au_fi(file); ++ au_dbg_verify_hf(finfo); ++ kfree(finfo->fi_hfile); ++ fi_write_unlock(file); ++ AuRwDestroy(&finfo->fi_rwsem); ++ au_cache_free_finfo(finfo); ++} ++ ++int au_finfo_init(struct file *file) ++{ ++ struct au_finfo *finfo; ++ struct dentry *dentry; ++ ++ dentry = file->f_dentry; ++ finfo = au_cache_alloc_finfo(); ++ if (unlikely(!finfo)) ++ goto out; ++ ++ finfo->fi_hfile = kcalloc(au_sbend(dentry->d_sb) + 1, ++ sizeof(*finfo->fi_hfile), GFP_NOFS); ++ if (unlikely(!finfo->fi_hfile)) ++ goto out_finfo; ++ ++ au_rw_init_wlock(&finfo->fi_rwsem); ++ finfo->fi_bstart = -1; ++ finfo->fi_bend = -1; ++ atomic_set(&finfo->fi_generation, au_digen(dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++ ++ file->private_data = finfo; ++ return 0; /* success */ ++ ++ out_finfo: ++ au_cache_free_finfo(finfo); ++ out: ++ return -ENOMEM; ++} ++ ++int au_fi_realloc(struct au_finfo *finfo, int nbr) ++{ ++ int err, sz; ++ struct au_hfile *hfp; ++ ++ err = -ENOMEM; ++ sz = sizeof(*hfp) * (finfo->fi_bend + 1); ++ if (!sz) ++ sz = sizeof(*hfp); ++ hfp = au_kzrealloc(finfo->fi_hfile, sz, sizeof(*hfp) * nbr, GFP_NOFS); ++ if (hfp) { ++ finfo->fi_hfile = hfp; ++ err = 0; ++ } ++ ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/f_op.c linux-2.6.31/fs/aufs/f_op.c +--- linux-2.6.31-vanilla/fs/aufs/f_op.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/f_op.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,823 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file and vm operations ++ */ ++ ++#include <linux/file.h> ++#include <linux/fs_stack.h> ++#include <linux/ima.h> ++#include <linux/mman.h> ++#include <linux/mm.h> ++#include <linux/security.h> ++#include "aufs.h" ++ ++/* common function to regular file and dir */ ++int aufs_flush(struct file *file, fl_owner_t id) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct dentry *dentry; ++ struct file *h_file; ++ ++ dentry = file->f_dentry; ++ si_noflush_read_lock(dentry->d_sb); ++ fi_read_lock(file); ++ di_read_lock_child(dentry, AuLock_IW); ++ ++ err = 0; ++ bend = au_fbend(file); ++ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { ++ h_file = au_h_fptr(file, bindex); ++ if (!h_file || !h_file->f_op || !h_file->f_op->flush) ++ continue; ++ ++ err = h_file->f_op->flush(h_file, id); ++ if (!err) ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); ++ /*ignore*/ ++ } ++ au_cpup_attr_timesizes(dentry->d_inode); ++ ++ di_read_unlock(dentry, AuLock_IW); ++ fi_read_unlock(file); ++ si_read_unlock(dentry->d_sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int do_open_nondir(struct file *file, int flags) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct au_finfo *finfo; ++ ++ FiMustWriteLock(file); ++ ++ err = 0; ++ dentry = file->f_dentry; ++ finfo = au_fi(file); ++ finfo->fi_h_vm_ops = NULL; ++ finfo->fi_vm_ops = NULL; ++ bindex = au_dbstart(dentry); ++ /* O_TRUNC is processed already */ ++ BUG_ON(au_test_ro(dentry->d_sb, bindex, dentry->d_inode) ++ && (flags & O_TRUNC)); ++ ++ h_file = au_h_open(dentry, bindex, flags, file); ++ if (IS_ERR(h_file)) ++ err = PTR_ERR(h_file); ++ else { ++ au_set_fbstart(file, bindex); ++ au_set_fbend(file, bindex); ++ au_set_h_fptr(file, bindex, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ } ++ return err; ++} ++ ++static int aufs_open_nondir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ return au_do_open(file, do_open_nondir); ++} ++ ++static int aufs_release_nondir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ struct super_block *sb = file->f_dentry->d_sb; ++ ++ si_noflush_read_lock(sb); ++ kfree(au_fi(file)->fi_vm_ops); ++ au_finfo_fin(file); ++ si_read_unlock(sb); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ struct dentry *dentry; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ err = vfsub_read_u(h_file, buf, count, ppos); ++ /* todo: necessary? 
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t aufs_write(struct file *file, const char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ aufs_bindex_t bstart; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ struct file *h_file; ++ char __user *buf = (char __user *)ubuf; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ mutex_lock(&inode->i_mutex); ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ bstart = au_fbstart(file); ++ h_file = au_h_fptr(file, bstart); ++ au_unpin(&pin); ++ err = vfsub_write_u(h_file, buf, count, ppos); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++static ssize_t aufs_aio_read(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ struct file *file, *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -ENOSYS; ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ if (h_file->f_op && h_file->f_op->aio_read) { ++ err = security_file_permission(h_file, MAY_READ); ++ if (unlikely(err)) ++ goto out_unlock; ++ if (!is_sync_kiocb(kio)) { ++ get_file(h_file); ++ fput(file); ++ } ++ kio->ki_filp = h_file; ++ err = h_file->f_op->aio_read(kio, iov, nv, pos); ++ /* todo: necessary? 
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ fsstack_copy_attr_atime(dentry->d_inode, ++ h_file->f_dentry->d_inode); ++ } else ++ /* currently there is no such fs */ ++ WARN_ON_ONCE(h_file->f_op && h_file->f_op->read); ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t aufs_aio_write(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ aufs_bindex_t bstart; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ struct file *file, *h_file; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ mutex_lock(&inode->i_mutex); ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ err = -ENOSYS; ++ bstart = au_fbstart(file); ++ h_file = au_h_fptr(file, bstart); ++ au_unpin(&pin); ++ if (h_file->f_op && h_file->f_op->aio_write) { ++ err = security_file_permission(h_file, MAY_WRITE); ++ if (unlikely(err)) ++ goto out_unlock; ++ if (!is_sync_kiocb(kio)) { ++ get_file(h_file); ++ fput(file); ++ } ++ kio->ki_filp = h_file; ++ err = h_file->f_op->aio_write(kio, iov, nv, pos); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ } else ++ /* currently there is no such fs */ ++ WARN_ON_ONCE(h_file->f_op && h_file->f_op->write); ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ ssize_t err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -EINVAL; ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ if (au_test_loopback_kthread()) { ++ file->f_mapping = h_file->f_mapping; ++ smp_mb(); /* unnecessary? */ ++ } ++ err = vfsub_splice_to(h_file, ppos, pipe, len, flags); ++ /* todo: necessasry? 
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t ++aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, ++ size_t len, unsigned int flags) ++{ ++ ssize_t err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ struct file *h_file; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ mutex_lock(&inode->i_mutex); ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ au_unpin(&pin); ++ err = vfsub_splice_from(pipe, h_file, ppos, len, flags); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct file *au_safe_file(struct vm_area_struct *vma) ++{ ++ struct file *file; ++ ++ file = vma->vm_file; ++ if (file->private_data && au_test_aufs(file->f_dentry->d_sb)) ++ return file; ++ return NULL; ++} ++ ++static void au_reset_file(struct vm_area_struct *vma, struct file *file) ++{ ++ vma->vm_file = file; ++ /* smp_mb(); */ /* flush vm_file */ ++} ++ ++static int aufs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ int err; ++ static DECLARE_WAIT_QUEUE_HEAD(wq); ++ struct file *file, *h_file; ++ struct au_finfo *finfo; ++ ++ /* todo: non-robr mode, user vm_file as it is? */ ++ wait_event(wq, (file = au_safe_file(vma))); ++ ++ /* do not revalidate, no si lock */ ++ finfo = au_fi(file); ++ h_file = finfo->fi_hfile[0 + finfo->fi_bstart].hf_file; ++ AuDebugOn(!h_file || !finfo->fi_h_vm_ops); ++ ++ fi_write_lock(file); ++ vma->vm_file = h_file; ++ err = finfo->fi_h_vm_ops->fault(vma, vmf); ++ /* todo: necessary? 
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ au_reset_file(vma, file); ++ fi_write_unlock(file); ++#if 0 /* def CONFIG_SMP */ ++ /* wake_up_nr(&wq, online_cpu - 1); */ ++ wake_up_all(&wq); ++#else ++ wake_up(&wq); ++#endif ++ ++ return err; ++} ++ ++static int aufs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ int err; ++ static DECLARE_WAIT_QUEUE_HEAD(wq); ++ struct file *file, *h_file; ++ struct au_finfo *finfo; ++ ++ wait_event(wq, (file = au_safe_file(vma))); ++ ++ finfo = au_fi(file); ++ h_file = finfo->fi_hfile[0 + finfo->fi_bstart].hf_file; ++ AuDebugOn(!h_file || !finfo->fi_h_vm_ops); ++ ++ fi_write_lock(file); ++ vma->vm_file = h_file; ++ err = finfo->fi_h_vm_ops->page_mkwrite(vma, vmf); ++ au_reset_file(vma, file); ++ fi_write_unlock(file); ++ wake_up(&wq); ++ ++ return err; ++} ++ ++static void aufs_vm_close(struct vm_area_struct *vma) ++{ ++ static DECLARE_WAIT_QUEUE_HEAD(wq); ++ struct file *file, *h_file; ++ struct au_finfo *finfo; ++ ++ wait_event(wq, (file = au_safe_file(vma))); ++ ++ finfo = au_fi(file); ++ h_file = finfo->fi_hfile[0 + finfo->fi_bstart].hf_file; ++ AuDebugOn(!h_file || !finfo->fi_h_vm_ops); ++ ++ fi_write_lock(file); ++ vma->vm_file = h_file; ++ finfo->fi_h_vm_ops->close(vma); ++ au_reset_file(vma, file); ++ fi_write_unlock(file); ++ wake_up(&wq); ++} ++ ++static struct vm_operations_struct aufs_vm_ops = { ++ /* .close and .page_mkwrite are not set by default */ ++ .fault = aufs_fault, ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static unsigned long au_prot_conv(unsigned long flags) ++{ ++ unsigned long prot; ++ ++ prot = 0; ++ if (flags & VM_READ) ++ prot |= PROT_READ; ++ if (flags & VM_WRITE) ++ prot |= PROT_WRITE; ++ if (flags & VM_EXEC) ++ prot |= PROT_EXEC; ++ return prot; ++} ++ ++static struct vm_operations_struct *au_vm_ops(struct file *h_file, ++ struct vm_area_struct *vma) ++{ ++ struct vm_operations_struct *vm_ops; ++ int err; ++ ++ vm_ops = ERR_PTR(-ENODEV); ++ if (!h_file->f_op || !h_file->f_op->mmap) ++ goto out; ++ ++ err = ima_file_mmap(h_file, au_prot_conv(vma->vm_flags)); ++ vm_ops = ERR_PTR(err); ++ if (err) ++ goto out; ++ ++ err = h_file->f_op->mmap(h_file, vma); ++ vm_ops = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ vm_ops = vma->vm_ops; ++ err = do_munmap(current->mm, vma->vm_start, ++ vma->vm_end - vma->vm_start); ++ if (unlikely(err)) { ++ AuIOErr("failed internal unmapping %.*s, %d\n", ++ AuDLNPair(h_file->f_dentry), err); ++ vm_ops = ERR_PTR(-EIO); ++ } ++ ++ out: ++ return vm_ops; ++} ++ ++static int au_custom_vm_ops(struct au_finfo *finfo, struct vm_area_struct *vma) ++{ ++ int err; ++ struct vm_operations_struct *h_ops; ++ ++ AuRwMustAnyLock(&finfo->fi_rwsem); ++ ++ err = 0; ++ h_ops = finfo->fi_h_vm_ops; ++ AuDebugOn(!h_ops); ++ if ((!h_ops->page_mkwrite && !h_ops->close) ++ || finfo->fi_vm_ops) ++ goto out; ++ ++ err = -ENOMEM; ++ finfo->fi_vm_ops = kmemdup(&aufs_vm_ops, sizeof(aufs_vm_ops), GFP_NOFS); ++ if (unlikely(!finfo->fi_vm_ops)) ++ goto out; ++ ++ err = 0; ++ if (h_ops->page_mkwrite) ++ finfo->fi_vm_ops->page_mkwrite = aufs_page_mkwrite; ++ if (h_ops->close) ++ finfo->fi_vm_ops->close = aufs_vm_close; ++ ++ vma->vm_ops = finfo->fi_vm_ops; ++ ++ out: ++ return err; ++} ++ ++static int aufs_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ int err; ++ unsigned char wlock, mmapped; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct file *h_file; ++ struct vm_operations_struct *vm_ops; ++ ++ dentry = file->f_dentry; ++ 
wlock = !!(file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ mmapped = !!au_test_mmapped(file); ++ if (wlock) { ++ struct au_pin pin; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_unpin(&pin); ++ } else ++ di_downgrade_lock(dentry, AuLock_IR); ++ ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ if (!mmapped && au_test_fs_bad_mapping(h_file->f_dentry->d_sb)) { ++ /* ++ * by this assignment, f_mapping will differs from aufs inode ++ * i_mapping. ++ * if someone else mixes the use of f_dentry->d_inode and ++ * f_mapping->host, then a problem may arise. ++ */ ++ file->f_mapping = h_file->f_mapping; ++ } ++ ++ vm_ops = NULL; ++ if (!mmapped) { ++ vm_ops = au_vm_ops(h_file, vma); ++ err = PTR_ERR(vm_ops); ++ if (IS_ERR(vm_ops)) ++ goto out_unlock; ++ } ++ ++ /* ++ * unnecessary to handle MAP_DENYWRITE and deny_write_access()? ++ * currently MAP_DENYWRITE from userspace is ignored, but elf loader ++ * sets it. when FMODE_EXEC is set (by open_exec() or sys_uselib()), ++ * both of the aufs file and the lower file is deny_write_access()-ed. ++ * finally I hope we can skip handlling MAP_DENYWRITE here. ++ */ ++ err = generic_file_mmap(file, vma); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ vma->vm_ops = &aufs_vm_ops; ++ /* test again */ ++ if (!au_test_mmapped(file)) ++ au_fi(file)->fi_h_vm_ops = vm_ops; ++ ++ err = au_custom_vm_ops(au_fi(file), vma); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ vfsub_file_accessed(h_file); ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_fsync_nondir(struct file *file, struct dentry *dentry, ++ int datasync) ++{ ++ int err; ++ struct au_pin pin; ++ struct inode *inode; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ inode = dentry->d_inode; ++ IMustLock(file->f_mapping->host); ++ if (inode != file->f_mapping->host) { ++ mutex_unlock(&file->f_mapping->host->i_mutex); ++ mutex_lock(&inode->i_mutex); ++ } ++ IMustLock(inode); ++ ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ err = 0; /* -EBADF; */ /* posix? */ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_unpin(&pin); ++ ++ err = -EINVAL; ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ if (h_file->f_op && h_file->f_op->fsync) { ++ struct dentry *h_d; ++ struct mutex *h_mtx; ++ ++ /* ++ * no filemap_fdatawrite() since aufs file has no its own ++ * mapping, but dir. 
++ */ ++ h_d = h_file->f_dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ err = h_file->f_op->fsync(h_file, h_d, datasync); ++ if (!err) ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); ++ /*ignore*/ ++ au_cpup_attr_timesizes(inode); ++ mutex_unlock(h_mtx); ++ } ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ out: ++ si_read_unlock(sb); ++ if (inode != file->f_mapping->host) { ++ mutex_unlock(&inode->i_mutex); ++ mutex_lock(&file->f_mapping->host->i_mutex); ++ } ++ return err; ++} ++ ++/* no one supports this operation, currently */ ++#if 0 ++static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *file, *h_file; ++ struct super_block *sb; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ mutex_lock(&inode->i_mutex); ++ ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ err = 0; /* -EBADF; */ /* posix? */ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_unpin(&pin); ++ ++ err = -ENOSYS; ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ if (h_file->f_op && h_file->f_op->aio_fsync) { ++ struct dentry *h_d; ++ struct mutex *h_mtx; ++ ++ h_d = h_file->f_dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ if (!is_sync_kiocb(kio)) { ++ get_file(h_file); ++ fput(file); ++ } ++ kio->ki_filp = h_file; ++ err = h_file->f_op->aio_fsync(kio, datasync); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ if (!err) ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); ++ /*ignore*/ ++ au_cpup_attr_timesizes(inode); ++ mutex_unlock(h_mtx); ++ } ++ ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++#endif ++ ++static int aufs_fasync(int fd, struct file *file, int flag) ++{ ++ int err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ if (h_file->f_op && h_file->f_op->fasync) ++ err = h_file->f_op->fasync(fd, h_file, flag); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* no one supports this operation, currently */ ++#if 0 ++static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, ++ size_t len, loff_t *pos , int more) ++{ ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_file_fop = { ++ /* ++ * while generic_file_llseek/_unlocked() don't use BKL, ++ * don't use it since it operates file->f_mapping->host. ++ * in aufs, it may be a real file and may confuse users by UDBA. 
++ */
++ /* .llseek = generic_file_llseek, */
++
++ .read = aufs_read,
++ .write = aufs_write,
++ .aio_read = aufs_aio_read,
++ .aio_write = aufs_aio_write,
++#ifdef CONFIG_AUFS_POLL
++ .poll = aufs_poll,
++#endif
++ .mmap = aufs_mmap,
++ .open = aufs_open_nondir,
++ .flush = aufs_flush,
++ .release = aufs_release_nondir,
++ .fsync = aufs_fsync_nondir,
++ /* .aio_fsync = aufs_aio_fsync_nondir, */
++ .fasync = aufs_fasync,
++ /* .sendpage = aufs_sendpage, */
++ .splice_write = aufs_splice_write,
++ .splice_read = aufs_splice_read,
++#if 0
++ .aio_splice_write = aufs_aio_splice_write,
++ .aio_splice_read = aufs_aio_splice_read
++#endif
++};
+diff -Nur linux-2.6.31-vanilla/fs/aufs/fstype.h linux-2.6.31/fs/aufs/fstype.h
+--- linux-2.6.31-vanilla/fs/aufs/fstype.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.31/fs/aufs/fstype.h 2009-09-16 13:55:30.000000000 +0200
+@@ -0,0 +1,485 @@
++/*
++ * Copyright (C) 2005-2009 Junjiro R. Okajima
++ *
++ * This program, aufs is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * judging filesystem type
++ */
++
++#ifndef __AUFS_FSTYPE_H__
++#define __AUFS_FSTYPE_H__
++
++#ifdef __KERNEL__
++
++#include <linux/cramfs_fs.h>
++#include <linux/fs.h>
++#include <linux/magic.h>
++#include <linux/romfs_fs.h>
++#include <linux/aufs_type.h>
++
++static inline int au_test_aufs(struct super_block *sb)
++{
++ return sb->s_magic == AUFS_SUPER_MAGIC;
++}
++
++static inline const char *au_sbtype(struct super_block *sb)
++{
++ return sb->s_type->name;
++}
++
++static inline int au_test_iso9660(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE)
++ return sb->s_magic == ISOFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_romfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE)
++ return sb->s_magic == ROMFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_cramfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE)
++ return sb->s_magic == CRAMFS_MAGIC;
++#endif
++ return 0;
++}
++
++static inline int au_test_nfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE)
++ return sb->s_magic == NFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_fuse(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
++ return sb->s_magic == FUSE_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_xfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE)
++ return sb->s_magic == XFS_SB_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_tmpfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_TMPFS
++ return sb->s_magic == TMPFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE)
++ return !strcmp(au_sbtype(sb), "ecryptfs");
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_smbfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_SMB_FS) || defined(CONFIG_SMB_FS_MODULE)
++ return sb->s_magic == SMB_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ocfs2(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_OCFS2_FS) || defined(CONFIG_OCFS2_FS_MODULE)
++ return sb->s_magic == OCFS2_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ocfs2_dlmfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_OCFS2_FS_O2CB) || defined(CONFIG_OCFS2_FS_O2CB_MODULE)
++ return sb->s_magic == DLMFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_coda(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CODA_FS) || defined(CONFIG_CODA_FS_MODULE)
++ return sb->s_magic == CODA_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_v9fs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_9P_FS) || defined(CONFIG_9P_FS_MODULE)
++ return sb->s_magic == V9FS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ext4(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_EXT4DEV_FS) || defined(CONFIG_EXT4DEV_FS_MODULE)
++ return sb->s_magic == EXT4_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_sysv(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_SYSV_FS) || defined(CONFIG_SYSV_FS_MODULE)
++ return !strcmp(au_sbtype(sb), "sysv");
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ramfs(struct super_block *sb)
++{
++ return sb->s_magic == RAMFS_MAGIC;
++}
++
++static inline int au_test_ubifs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE)
++ return sb->s_magic == UBIFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_procfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_PROC_FS
++ return sb->s_magic == PROC_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_sysfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_SYSFS
++ return sb->s_magic == SYSFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_configfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE)
++ return sb->s_magic == CONFIGFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_minix(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE)
++ return sb->s_magic == MINIX3_SUPER_MAGIC
++ || sb->s_magic == MINIX2_SUPER_MAGIC
++ || sb->s_magic == MINIX2_SUPER_MAGIC2
++ || sb->s_magic == MINIX_SUPER_MAGIC
++ || sb->s_magic == MINIX_SUPER_MAGIC2;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_cifs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CIFS_FS) || defined(CONFIG_CIFS_FS_MODULE)
++ return sb->s_magic == CIFS_MAGIC_NUMBER;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_fat(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_FAT_FS) ||
defined(CONFIG_FAT_FS_MODULE) ++ return sb->s_magic == MSDOS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_msdos(struct super_block *sb) ++{ ++ return au_test_fat(sb); ++} ++ ++static inline int au_test_vfat(struct super_block *sb) ++{ ++ return au_test_fat(sb); ++} ++ ++static inline int au_test_securityfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_SECURITYFS ++ return sb->s_magic == SECURITYFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_squashfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) ++ return sb->s_magic == SQUASHFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_btrfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) ++ return sb->s_magic == BTRFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_xenfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) ++ return sb->s_magic == XENFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_debugfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_DEBUG_FS ++ return sb->s_magic == DEBUGFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_nilfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) ++ return sb->s_magic == NILFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * they can't be an aufs branch. ++ */ ++static inline int au_test_fs_unsuppoted(struct super_block *sb) ++{ ++ return ++#ifndef CONFIG_AUFS_BR_RAMFS ++ au_test_ramfs(sb) || ++#endif ++ au_test_procfs(sb) ++ || au_test_sysfs(sb) ++ || au_test_configfs(sb) ++ || au_test_debugfs(sb) ++ || au_test_securityfs(sb) ++ || au_test_xenfs(sb) ++ || au_test_ecryptfs(sb) ++ /* || !strcmp(au_sbtype(sb), "unionfs") */ ++ || au_test_aufs(sb); /* will be supported in next version */ ++} ++ ++/* ++ * If the filesystem supports NFS-export, then it has to support NULL as ++ * a nameidata parameter for ->create(), ->lookup() and ->d_revalidate(). ++ * We can apply this principle when we handle a lower filesystem. ++ */ ++static inline int au_test_fs_null_nd(struct super_block *sb) ++{ ++ return !!sb->s_export_op; ++} ++ ++static inline int au_test_fs_remote(struct super_block *sb) ++{ ++ return !au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ && !au_test_ramfs(sb) ++#endif ++ && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * Note: these functions (below) are created after reading ->getattr() in all ++ * filesystems under linux/fs. it means we have to do so in every update... ++ */ ++ ++/* ++ * some filesystems require getattr to refresh the inode attributes before ++ * referencing. 
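
Nearly all of the tests above reduce to comparing sb->s_magic against the magic constants the kernel's filesystems publish (two fall back to the type name string). The same magics are visible from userspace through statfs(2), so a mount can be classified the same way outside the kernel; 0x01021994 below is TMPFS_MAGIC, copied from linux/magic.h.

#include <stdio.h>
#include <sys/vfs.h>

int main(int argc, char *argv[])
{
	struct statfs st;
	const char *path = argc > 1 ? argv[1] : "/dev/shm";

	if (statfs(path, &st)) {
		perror("statfs");
		return 1;
	}
	/* f_type carries the same value as the super block's s_magic */
	printf("%s: f_type=0x%lx%s\n", path, (unsigned long)st.f_type,
	       st.f_type == 0x01021994 ? " (tmpfs)" : "");
	return 0;
}
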
++ * in most cases, we can rely on the inode attribute in NFS (or every remote fs) ++ * and leave the work for d_revalidate() ++ */ ++static inline int au_test_fs_refresh_iattr(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ /* || au_test_smbfs(sb) */ /* untested */ ++ /* || au_test_ocfs2(sb) */ /* untested */ ++ /* || au_test_btrfs(sb) */ /* untested */ ++ /* || au_test_coda(sb) */ /* untested */ ++ /* || au_test_v9fs(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't maintain i_size or i_blocks. ++ */ ++static inline int au_test_fs_bad_iattr_size(struct super_block *sb) ++{ ++ return au_test_xfs(sb) ++ /* || au_test_ext4(sb) */ /* untested */ ++ /* || au_test_ocfs2(sb) */ /* untested */ ++ /* || au_test_ocfs2_dlmfs(sb) */ /* untested */ ++ /* || au_test_sysv(sb) */ /* untested */ ++ /* || au_test_ubifs(sb) */ /* untested */ ++ /* || au_test_minix(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't store the correct value in some of their inode ++ * attributes. ++ */ ++static inline int au_test_fs_bad_iattr(struct super_block *sb) ++{ ++ return au_test_fs_bad_iattr_size(sb) ++ /* || au_test_cifs(sb) */ /* untested */ ++ || au_test_fat(sb) ++ || au_test_msdos(sb) ++ || au_test_vfat(sb); ++} ++ ++/* they don't check i_nlink in link(2) */ ++static inline int au_test_fs_no_limit_nlink(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ || au_test_ramfs(sb) ++#endif ++ || au_test_ubifs(sb); ++} ++ ++/* ++ * filesystems which sets S_NOATIME and S_NOCMTIME. ++ */ ++static inline int au_test_fs_notime(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ || au_test_ubifs(sb) ++ /* || au_test_cifs(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which requires replacing i_mapping. ++ */ ++static inline int au_test_fs_bad_mapping(struct super_block *sb) ++{ ++ return au_test_fuse(sb) ++ || au_test_ubifs(sb); ++} ++ ++/* temporary support for i#1 in cramfs */ ++static inline int au_test_fs_unique_ino(struct inode *inode) ++{ ++ if (au_test_cramfs(inode->i_sb)) ++ return inode->i_ino != 1; ++ return 1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * the filesystem where the xino files placed must support i/o after unlink and ++ * maintain i_size and i_blocks. ++ */ ++static inline int au_test_fs_bad_xino(struct super_block *sb) ++{ ++ return au_test_fs_remote(sb) ++ || au_test_fs_bad_iattr_size(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ || !(au_test_ramfs(sb) || au_test_fs_null_nd(sb)) ++#else ++ || !au_test_fs_null_nd(sb) /* to keep xino code simple */ ++#endif ++ /* don't want unnecessary work for xino */ ++ || au_test_aufs(sb) ++ || au_test_ecryptfs(sb) ++ || au_test_nilfs(sb); ++} ++ ++static inline int au_test_fs_trunc_xino(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++ || au_test_ramfs(sb); ++} ++ ++/* ++ * test if the @sb is real-readonly. ++ */ ++static inline int au_test_fs_rr(struct super_block *sb) ++{ ++ return au_test_squashfs(sb) ++ || au_test_iso9660(sb) ++ || au_test_cramfs(sb) ++ || au_test_romfs(sb); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FSTYPE_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/hinotify.c linux-2.6.31/fs/aufs/hinotify.c +--- linux-2.6.31-vanilla/fs/aufs/hinotify.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/hinotify.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,755 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inotify for the lower directories ++ */ ++ ++#include "aufs.h" ++ ++static const __u32 AuHinMask = (IN_MOVE | IN_DELETE | IN_CREATE); ++static struct inotify_handle *au_hin_handle; ++ ++AuCacheFuncs(hinotify, HINOTIFY); ++ ++int au_hin_alloc(struct au_hinode *hinode, struct inode *inode, ++ struct inode *h_inode) ++{ ++ int err; ++ struct au_hinotify *hin; ++ s32 wd; ++ ++ err = -ENOMEM; ++ hin = au_cache_alloc_hinotify(); ++ if (hin) { ++ AuDebugOn(hinode->hi_notify); ++ hinode->hi_notify = hin; ++ hin->hin_aufs_inode = inode; ++ ++ inotify_init_watch(&hin->hin_watch); ++ wd = inotify_add_watch(au_hin_handle, &hin->hin_watch, h_inode, ++ AuHinMask); ++ if (wd >= 0) ++ return 0; /* success */ ++ ++ err = wd; ++ put_inotify_watch(&hin->hin_watch); ++ au_cache_free_hinotify(hin); ++ hinode->hi_notify = NULL; ++ } ++ ++ return err; ++} ++ ++void au_hin_free(struct au_hinode *hinode) ++{ ++ int err; ++ struct au_hinotify *hin; ++ ++ hin = hinode->hi_notify; ++ if (hin) { ++ err = 0; ++ if (atomic_read(&hin->hin_watch.count)) ++ err = inotify_rm_watch(au_hin_handle, &hin->hin_watch); ++ if (unlikely(err)) ++ /* it means the watch is already removed */ ++ AuWarn("failed inotify_rm_watch() %d\n", err); ++ au_cache_free_hinotify(hin); ++ hinode->hi_notify = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_hin_ctl(struct au_hinode *hinode, int do_set) ++{ ++ struct inode *h_inode; ++ struct inotify_watch *watch; ++ ++ if (!hinode->hi_notify) ++ return; ++ ++ h_inode = hinode->hi_inode; ++ IMustLock(h_inode); ++ ++ /* todo: try inotify_find_update_watch()? 
*/ ++ watch = &hinode->hi_notify->hin_watch; ++ mutex_lock(&h_inode->inotify_mutex); ++ /* mutex_lock(&watch->ih->mutex); */ ++ if (do_set) { ++ AuDebugOn(watch->mask & AuHinMask); ++ watch->mask |= AuHinMask; ++ } else { ++ AuDebugOn(!(watch->mask & AuHinMask)); ++ watch->mask &= ~AuHinMask; ++ } ++ /* mutex_unlock(&watch->ih->mutex); */ ++ mutex_unlock(&h_inode->inotify_mutex); ++} ++ ++void au_reset_hinotify(struct inode *inode, unsigned int flags) ++{ ++ aufs_bindex_t bindex, bend; ++ struct inode *hi; ++ struct dentry *iwhdentry; ++ ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { ++ hi = au_h_iptr(inode, bindex); ++ if (!hi) ++ continue; ++ ++ /* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */ ++ iwhdentry = au_hi_wh(inode, bindex); ++ if (iwhdentry) ++ dget(iwhdentry); ++ au_igrab(hi); ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ au_set_h_iptr(inode, bindex, au_igrab(hi), ++ flags & ~AuHi_XINO); ++ iput(hi); ++ dput(iwhdentry); ++ /* mutex_unlock(&hi->i_mutex); */ ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int hin_xino(struct inode *inode, struct inode *h_inode) ++{ ++ int err; ++ aufs_bindex_t bindex, bend, bfound, bstart; ++ struct inode *h_i; ++ ++ err = 0; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ AuWarn("branch root dir was changed\n"); ++ goto out; ++ } ++ ++ bfound = -1; ++ bend = au_ibend(inode); ++ bstart = au_ibstart(inode); ++#if 0 /* reserved for future use */ ++ if (bindex == bend) { ++ /* keep this ino in rename case */ ++ goto out; ++ } ++#endif ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ if (au_h_iptr(inode, bindex) == h_inode) { ++ bfound = bindex; ++ break; ++ } ++ } ++ if (bfound < 0) ++ goto out; ++ ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ h_i = au_h_iptr(inode, bindex); ++ if (!h_i) ++ continue; ++ ++ err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ /* bad action? */ ++ } ++ ++ /* children inode number will be broken */ ++ ++ out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hin_gen_tree(struct dentry *dentry) ++{ ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, dentry, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ struct dentry *d; ++ ++ d = dentries[j]; ++ if (IS_ROOT(d)) ++ continue; ++ ++ d_drop(d); ++ au_digen_dec(d); ++ if (d->d_inode) ++ /* todo: reset children xino? ++ cached children only? */ ++ au_iigen_dec(d->d_inode); ++ } ++ } ++ ++ out_dpages: ++ au_dpages_free(&dpages); ++ ++ /* discard children */ ++ dentry_unhash(dentry); ++ dput(dentry); ++ out: ++ return err; ++} ++ ++/* ++ * return 0 if processed. 
++ */ ++static int hin_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, ++ const unsigned int isdir) ++{ ++ int err; ++ struct dentry *d; ++ struct qstr *dname; ++ ++ err = 1; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ AuWarn("branch root dir was changed\n"); ++ err = 0; ++ goto out; ++ } ++ ++ if (!isdir) { ++ AuDebugOn(!name); ++ au_iigen_dec(inode); ++ spin_lock(&dcache_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) { ++ dname = &d->d_name; ++ if (dname->len != nlen ++ && memcmp(dname->name, name, nlen)) ++ continue; ++ err = 0; ++ spin_lock(&d->d_lock); ++ __d_drop(d); ++ au_digen_dec(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ spin_unlock(&dcache_lock); ++ } else { ++ au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIRS); ++ d = d_find_alias(inode); ++ if (!d) { ++ au_iigen_dec(inode); ++ goto out; ++ } ++ ++ dname = &d->d_name; ++ if (dname->len == nlen && !memcmp(dname->name, name, nlen)) ++ err = hin_gen_tree(d); ++ dput(d); ++ } ++ ++ out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hin_gen_by_name(struct dentry *dentry, const unsigned int isdir) ++{ ++ int err; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (IS_ROOT(dentry) ++ /* || (inode && inode->i_ino == AUFS_ROOT_INO) */ ++ ) { ++ AuWarn("branch root dir was changed\n"); ++ return 0; ++ } ++ ++ err = 0; ++ if (!isdir) { ++ d_drop(dentry); ++ au_digen_dec(dentry); ++ if (inode) ++ au_iigen_dec(inode); ++ } else { ++ au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIRS); ++ if (inode) ++ err = hin_gen_tree(dentry); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* hinotify job flags */ ++#define AuHinJob_XINO0 1 ++#define AuHinJob_GEN (1 << 1) ++#define AuHinJob_DIRENT (1 << 2) ++#define AuHinJob_ISDIR (1 << 3) ++#define AuHinJob_TRYXINO0 (1 << 4) ++#define AuHinJob_MNTPNT (1 << 5) ++#define au_ftest_hinjob(flags, name) ((flags) & AuHinJob_##name) ++#define au_fset_hinjob(flags, name) { (flags) |= AuHinJob_##name; } ++#define au_fclr_hinjob(flags, name) { (flags) &= ~AuHinJob_##name; } ++ ++struct hin_job_args { ++ unsigned int flags; ++ struct inode *inode, *h_inode, *dir, *h_dir; ++ struct dentry *dentry; ++ char *h_name; ++ int h_nlen; ++}; ++ ++static int hin_job(struct hin_job_args *a) ++{ ++ const unsigned int isdir = au_ftest_hinjob(a->flags, ISDIR); ++ ++ /* reset xino */ ++ if (au_ftest_hinjob(a->flags, XINO0) && a->inode) ++ hin_xino(a->inode, a->h_inode); /* ignore this error */ ++ ++ if (au_ftest_hinjob(a->flags, TRYXINO0) ++ && a->inode ++ && a->h_inode) { ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ if (!a->h_inode->i_nlink) ++ hin_xino(a->inode, a->h_inode); /* ignore this error */ ++ mutex_unlock(&a->h_inode->i_mutex); ++ } ++ ++ /* make the generation obsolete */ ++ if (au_ftest_hinjob(a->flags, GEN)) { ++ int err = -1; ++ if (a->inode) ++ err = hin_gen_by_inode(a->h_name, a->h_nlen, a->inode, ++ isdir); ++ if (err && a->dentry) ++ hin_gen_by_name(a->dentry, isdir); ++ /* ignore this error */ ++ } ++ ++ /* make dir entries obsolete */ ++ if (au_ftest_hinjob(a->flags, DIRENT) && a->inode) { ++ struct au_vdir *vdir; ++ ++ vdir = au_ivdir(a->inode); ++ if (vdir) ++ vdir->vd_jiffy = 0; ++ /* IMustLock(a->inode); */ ++ /* a->inode->i_version++; */ ++ } ++ ++ /* can do nothing but warn */ ++ if (au_ftest_hinjob(a->flags, MNTPNT) ++ && a->dentry ++ && d_mountpoint(a->dentry)) ++ AuWarn("mount-point %.*s is removed or renamed\n", ++ 
AuDLNPair(a->dentry)); ++ ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static char *in_name(u32 mask) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++#define test_ret(flag) if (mask & flag) \ ++ return #flag; ++ test_ret(IN_ACCESS); ++ test_ret(IN_MODIFY); ++ test_ret(IN_ATTRIB); ++ test_ret(IN_CLOSE_WRITE); ++ test_ret(IN_CLOSE_NOWRITE); ++ test_ret(IN_OPEN); ++ test_ret(IN_MOVED_FROM); ++ test_ret(IN_MOVED_TO); ++ test_ret(IN_CREATE); ++ test_ret(IN_DELETE); ++ test_ret(IN_DELETE_SELF); ++ test_ret(IN_MOVE_SELF); ++ test_ret(IN_UNMOUNT); ++ test_ret(IN_Q_OVERFLOW); ++ test_ret(IN_IGNORED); ++ return ""; ++#undef test_ret ++#else ++ return "??"; ++#endif ++} ++ ++static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, ++ struct inode *dir) ++{ ++ struct dentry *dentry, *d, *parent; ++ struct qstr *dname; ++ ++ parent = d_find_alias(dir); ++ if (!parent) ++ return NULL; ++ ++ dentry = NULL; ++ spin_lock(&dcache_lock); ++ list_for_each_entry(d, &parent->d_subdirs, d_u.d_child) { ++ /* AuDbg("%.*s\n", AuDLNPair(d)); */ ++ dname = &d->d_name; ++ if (dname->len != nlen || memcmp(dname->name, name, nlen)) ++ continue; ++ if (!atomic_read(&d->d_count) || !d->d_fsdata) { ++ spin_lock(&d->d_lock); ++ __d_drop(d); ++ spin_unlock(&d->d_lock); ++ continue; ++ } ++ ++ dentry = dget(d); ++ break; ++ } ++ spin_unlock(&dcache_lock); ++ dput(parent); ++ ++ if (dentry) ++ di_write_lock_child(dentry); ++ ++ return dentry; ++} ++ ++static struct inode *lookup_wlock_by_ino(struct super_block *sb, ++ aufs_bindex_t bindex, ino_t h_ino) ++{ ++ struct inode *inode; ++ ino_t ino; ++ int err; ++ ++ inode = NULL; ++ err = au_xino_read(sb, bindex, h_ino, &ino); ++ if (!err && ino) ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ AuWarn("wrong root branch\n"); ++ iput(inode); ++ inode = NULL; ++ goto out; ++ } ++ ++ ii_write_lock_child(inode); ++ ++ out: ++ return inode; ++} ++ ++enum { CHILD, PARENT }; ++struct postproc_args { ++ struct inode *h_dir, *dir, *h_child_inode; ++ u32 mask; ++ unsigned int flags[2]; ++ unsigned int h_child_nlen; ++ char h_child_name[]; ++}; ++ ++static void postproc(void *_args) ++{ ++ struct postproc_args *a = _args; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend, bfound; ++ unsigned char xino, try_iput; ++ int err; ++ struct inode *inode; ++ ino_t h_ino; ++ struct hin_job_args args; ++ struct dentry *dentry; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(!_args); ++ AuDebugOn(!a->h_dir); ++ AuDebugOn(!a->dir); ++ AuDebugOn(!a->mask); ++ AuDbg("mask 0x%x %s, i%lu, hi%lu, hci%lu\n", ++ a->mask, in_name(a->mask), a->dir->i_ino, a->h_dir->i_ino, ++ a->h_child_inode ? a->h_child_inode->i_ino : 0); ++ ++ inode = NULL; ++ dentry = NULL; ++ /* ++ * do not lock a->dir->i_mutex here ++ * because of d_revalidate() may cause a deadlock. 
++ */ ++ sb = a->dir->i_sb; ++ AuDebugOn(!sb); ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!sbinfo); ++ /* big aufs lock */ ++ si_noflush_write_lock(sb); ++ ++ ii_read_lock_parent(a->dir); ++ bfound = -1; ++ bend = au_ibend(a->dir); ++ for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++) ++ if (au_h_iptr(a->dir, bindex) == a->h_dir) { ++ bfound = bindex; ++ break; ++ } ++ ii_read_unlock(a->dir); ++ if (unlikely(bfound < 0)) ++ goto out; ++ ++ xino = !!au_opt_test(au_mntflags(sb), XINO); ++ h_ino = 0; ++ if (a->h_child_inode) ++ h_ino = a->h_child_inode->i_ino; ++ ++ if (a->h_child_nlen ++ && (au_ftest_hinjob(a->flags[CHILD], GEN) ++ || au_ftest_hinjob(a->flags[CHILD], MNTPNT))) ++ dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, ++ a->dir); ++ try_iput = 0; ++ if (dentry) ++ inode = dentry->d_inode; ++ if (xino && !inode && h_ino ++ && (au_ftest_hinjob(a->flags[CHILD], XINO0) ++ || au_ftest_hinjob(a->flags[CHILD], TRYXINO0) ++ || au_ftest_hinjob(a->flags[CHILD], GEN))) { ++ inode = lookup_wlock_by_ino(sb, bfound, h_ino); ++ try_iput = 1; ++ } ++ ++ args.flags = a->flags[CHILD]; ++ args.dentry = dentry; ++ args.inode = inode; ++ args.h_inode = a->h_child_inode; ++ args.dir = a->dir; ++ args.h_dir = a->h_dir; ++ args.h_name = a->h_child_name; ++ args.h_nlen = a->h_child_nlen; ++ err = hin_job(&args); ++ if (dentry) { ++ if (dentry->d_fsdata) ++ di_write_unlock(dentry); ++ dput(dentry); ++ } ++ if (inode && try_iput) { ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++ ++ ii_write_lock_parent(a->dir); ++ args.flags = a->flags[PARENT]; ++ args.dentry = NULL; ++ args.inode = a->dir; ++ args.h_inode = a->h_dir; ++ args.dir = NULL; ++ args.h_dir = NULL; ++ args.h_name = NULL; ++ args.h_nlen = 0; ++ err = hin_job(&args); ++ ii_write_unlock(a->dir); ++ ++ out: ++ au_nwt_done(&sbinfo->si_nowait); ++ si_write_unlock(sb); ++ ++ iput(a->h_child_inode); ++ iput(a->h_dir); ++ iput(a->dir); ++ kfree(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void aufs_inotify(struct inotify_watch *watch, u32 wd __maybe_unused, ++ u32 mask, u32 cookie __maybe_unused, ++ const char *h_child_name, struct inode *h_child_inode) ++{ ++ struct au_hinotify *hinotify; ++ struct postproc_args *args; ++ int len, wkq_err; ++ unsigned char isdir, isroot, wh; ++ char *p; ++ struct inode *dir; ++ unsigned int flags[2]; ++ ++ /* if IN_UNMOUNT happens, there must be another bug */ ++ AuDebugOn(mask & IN_UNMOUNT); ++ if (mask & (IN_IGNORED | IN_UNMOUNT)) { ++ put_inotify_watch(watch); ++ return; ++ } ++#ifdef AuDbgHinotify ++ au_debug(1); ++ if (1 || !h_child_name || strcmp(h_child_name, AUFS_XINO_FNAME)) { ++ AuDbg("i%lu, wd %d, mask 0x%x %s, cookie 0x%x, hcname %s," ++ " hi%lu\n", ++ watch->inode->i_ino, wd, mask, in_name(mask), cookie, ++ h_child_name ? h_child_name : "", ++ h_child_inode ? 
h_child_inode->i_ino : 0); ++ WARN_ON(1); ++ } ++ au_debug(0); ++#endif ++ ++ hinotify = container_of(watch, struct au_hinotify, hin_watch); ++ AuDebugOn(!hinotify || !hinotify->hin_aufs_inode); ++ dir = igrab(hinotify->hin_aufs_inode); ++ if (!dir) ++ return; ++ ++ isroot = (dir->i_ino == AUFS_ROOT_INO); ++ len = 0; ++ wh = 0; ++ if (h_child_name) { ++ len = strlen(h_child_name); ++ if (!memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ h_child_name += AUFS_WH_PFX_LEN; ++ len -= AUFS_WH_PFX_LEN; ++ wh = 1; ++ } ++ } ++ ++ isdir = 0; ++ if (h_child_inode) ++ isdir = !!S_ISDIR(h_child_inode->i_mode); ++ flags[PARENT] = AuHinJob_ISDIR; ++ flags[CHILD] = 0; ++ if (isdir) ++ flags[CHILD] = AuHinJob_ISDIR; ++ switch (mask & IN_ALL_EVENTS) { ++ case IN_MOVED_FROM: ++ case IN_MOVED_TO: ++ AuDebugOn(!h_child_name || !h_child_inode); ++ au_fset_hinjob(flags[CHILD], GEN); ++ au_fset_hinjob(flags[CHILD], XINO0); ++ au_fset_hinjob(flags[CHILD], MNTPNT); ++ au_fset_hinjob(flags[PARENT], DIRENT); ++ break; ++ ++ case IN_CREATE: ++ AuDebugOn(!h_child_name || !h_child_inode); ++ au_fset_hinjob(flags[PARENT], DIRENT); ++ au_fset_hinjob(flags[CHILD], GEN); ++ break; ++ ++ case IN_DELETE: ++ /* ++ * aufs never be able to get this child inode. ++ * revalidation should be in d_revalidate() ++ * by checking i_nlink, i_generation or d_unhashed(). ++ */ ++ AuDebugOn(!h_child_name); ++ au_fset_hinjob(flags[PARENT], DIRENT); ++ au_fset_hinjob(flags[CHILD], GEN); ++ au_fset_hinjob(flags[CHILD], TRYXINO0); ++ au_fset_hinjob(flags[CHILD], MNTPNT); ++ break; ++ ++ default: ++ AuDebugOn(1); ++ } ++ ++ if (wh) ++ h_child_inode = NULL; ++ ++ /* iput() and kfree() will be called in postproc() */ ++ /* ++ * inotify_mutex is already acquired and kmalloc/prune_icache may lock ++ * iprune_mutex. strange. 
++ */ ++ lockdep_off(); ++ args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); ++ lockdep_on(); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ iput(dir); ++ return; ++ } ++ args->flags[PARENT] = flags[PARENT]; ++ args->flags[CHILD] = flags[CHILD]; ++ args->mask = mask; ++ args->dir = dir; ++ args->h_dir = igrab(watch->inode); ++ if (h_child_inode) ++ h_child_inode = igrab(h_child_inode); /* can be NULL */ ++ args->h_child_inode = h_child_inode; ++ args->h_child_nlen = len; ++ if (len) { ++ p = (void *)args; ++ p += sizeof(*args); ++ memcpy(p, h_child_name, len + 1); ++ } ++ ++ lockdep_off(); ++ wkq_err = au_wkq_nowait(postproc, args, dir->i_sb); ++ lockdep_on(); ++ if (unlikely(wkq_err)) ++ AuErr("wkq %d\n", wkq_err); ++} ++ ++static void aufs_inotify_destroy(struct inotify_watch *watch __maybe_unused) ++{ ++ return; ++} ++ ++static struct inotify_operations aufs_inotify_ops = { ++ .handle_event = aufs_inotify, ++ .destroy_watch = aufs_inotify_destroy ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hin_destroy_cache(void) ++{ ++ kmem_cache_destroy(au_cachep[AuCache_HINOTIFY]); ++ au_cachep[AuCache_HINOTIFY] = NULL; ++} ++ ++int __init au_hinotify_init(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ au_cachep[AuCache_HINOTIFY] = AuCache(au_hinotify); ++ if (au_cachep[AuCache_HINOTIFY]) { ++ err = 0; ++ au_hin_handle = inotify_init(&aufs_inotify_ops); ++ if (IS_ERR(au_hin_handle)) { ++ err = PTR_ERR(au_hin_handle); ++ au_hin_destroy_cache(); ++ } ++ } ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_hinotify_fin(void) ++{ ++ inotify_destroy(au_hin_handle); ++ if (au_cachep[AuCache_HINOTIFY]) ++ au_hin_destroy_cache(); ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/iinfo.c linux-2.6.31/fs/aufs/iinfo.c +--- linux-2.6.31-vanilla/fs/aufs/iinfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/iinfo.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,283 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode private data ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct inode *h_inode; ++ ++ IiMustAnyLock(inode); ++ ++ h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ return h_inode; ++} ++ ++/* todo: hard/soft set? 
*/ ++void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct au_iinfo *iinfo = au_ii(inode); ++ struct inode *h_inode; ++ ++ IiMustWriteLock(inode); ++ ++ iinfo->ii_bstart = bindex; ++ h_inode = iinfo->ii_hinode[bindex + 0].hi_inode; ++ if (h_inode) ++ au_cpup_igen(inode, h_inode); ++} ++ ++void au_hiput(struct au_hinode *hinode) ++{ ++ au_hin_free(hinode); ++ dput(hinode->hi_whdentry); ++ iput(hinode->hi_inode); ++} ++ ++unsigned int au_hi_flags(struct inode *inode, int isdir) ++{ ++ unsigned int flags; ++ const unsigned int mnt_flags = au_mntflags(inode->i_sb); ++ ++ flags = 0; ++ if (au_opt_test(mnt_flags, XINO)) ++ au_fset_hi(flags, XINO); ++ if (isdir && au_opt_test(mnt_flags, UDBA_HINOTIFY)) ++ au_fset_hi(flags, HINOTIFY); ++ return flags; ++} ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags) ++{ ++ struct au_hinode *hinode; ++ struct inode *hi; ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ hinode = iinfo->ii_hinode + bindex; ++ hi = hinode->hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ AuDebugOn(h_inode && hi); ++ ++ if (hi) ++ au_hiput(hinode); ++ hinode->hi_inode = h_inode; ++ if (h_inode) { ++ int err; ++ struct super_block *sb = inode->i_sb; ++ struct au_branch *br; ++ ++ if (bindex == iinfo->ii_bstart) ++ au_cpup_igen(inode, h_inode); ++ br = au_sbr(sb, bindex); ++ hinode->hi_id = br->br_id; ++ if (au_ftest_hi(flags, XINO)) { ++ err = au_xino_write(sb, bindex, h_inode->i_ino, ++ inode->i_ino); ++ if (unlikely(err)) ++ AuIOErr1("failed au_xino_write() %d\n", err); ++ } ++ ++ if (au_ftest_hi(flags, HINOTIFY) ++ && au_br_hinotifyable(br->br_perm)) { ++ err = au_hin_alloc(hinode, inode, h_inode); ++ if (unlikely(err)) ++ AuIOErr1("au_hin_alloc() %d\n", err); ++ } ++ } ++} ++ ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh) ++{ ++ struct au_hinode *hinode; ++ ++ IiMustWriteLock(inode); ++ ++ hinode = au_ii(inode)->ii_hinode + bindex; ++ AuDebugOn(hinode->hi_whdentry); ++ hinode->hi_whdentry = h_wh; ++} ++ ++void au_update_iigen(struct inode *inode) ++{ ++ atomic_set(&au_ii(inode)->ii_generation, au_sigen(inode->i_sb)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++/* it may be called at remount time, too */ ++void au_update_brange(struct inode *inode, int do_put_zero) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = au_ii(inode); ++ if (!iinfo || iinfo->ii_bstart < 0) ++ return; ++ ++ IiMustWriteLock(inode); ++ ++ if (do_put_zero) { ++ aufs_bindex_t bindex; ++ ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; ++ bindex++) { ++ struct inode *h_i; ++ ++ h_i = iinfo->ii_hinode[0 + bindex].hi_inode; ++ if (h_i && !h_i->i_nlink) ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ } ++ } ++ ++ iinfo->ii_bstart = -1; ++ while (++iinfo->ii_bstart <= iinfo->ii_bend) ++ if (iinfo->ii_hinode[0 + iinfo->ii_bstart].hi_inode) ++ break; ++ if (iinfo->ii_bstart > iinfo->ii_bend) { ++ iinfo->ii_bstart = -1; ++ iinfo->ii_bend = -1; ++ return; ++ } ++ ++ iinfo->ii_bend++; ++ while (0 <= --iinfo->ii_bend) ++ if (iinfo->ii_hinode[0 + iinfo->ii_bend].hi_inode) ++ break; ++ AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend || iinfo->ii_bend < 0); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_iinfo_init(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct super_block *sb; ++ int nbr, i; ++ ++ sb = inode->i_sb; ++ iinfo = &(container_of(inode, struct au_icntnr, 
vfs_inode)->iinfo); ++ nbr = au_sbend(sb) + 1; ++ if (unlikely(nbr <= 0)) ++ nbr = 1; ++ iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); ++ if (iinfo->ii_hinode) { ++ for (i = 0; i < nbr; i++) ++ iinfo->ii_hinode[i].hi_id = -1; ++ ++ atomic_set(&iinfo->ii_generation, au_sigen(sb)); ++ /* smp_mb(); */ /* atomic_set */ ++ au_rw_init(&iinfo->ii_rwsem); ++ iinfo->ii_bstart = -1; ++ iinfo->ii_bend = -1; ++ iinfo->ii_vdir = NULL; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++int au_ii_realloc(struct au_iinfo *iinfo, int nbr) ++{ ++ int err, sz; ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*hip) * (iinfo->ii_bend + 1); ++ if (!sz) ++ sz = sizeof(*hip); ++ hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS); ++ if (hip) { ++ iinfo->ii_hinode = hip; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++static int au_iinfo_write0(struct super_block *sb, struct au_hinode *hinode, ++ ino_t ino) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ unsigned char locked; ++ ++ err = 0; ++ locked = !!si_noflush_read_trylock(sb); ++ bindex = au_br_index(sb, hinode->hi_id); ++ if (bindex >= 0) ++ err = au_xino_write0(sb, bindex, hinode->hi_inode->i_ino, ino); ++ /* error action? */ ++ if (locked) ++ si_read_unlock(sb); ++ return err; ++} ++ ++void au_iinfo_fin(struct inode *inode) ++{ ++ ino_t ino; ++ aufs_bindex_t bend; ++ unsigned char unlinked = !inode->i_nlink; ++ struct au_iinfo *iinfo; ++ struct au_hinode *hi; ++ struct super_block *sb; ++ ++ if (unlinked) { ++ int err = au_xigen_inc(inode); ++ if (unlikely(err)) ++ AuWarn1("failed resetting i_generation, %d\n", err); ++ } ++ ++ iinfo = au_ii(inode); ++ /* bad_inode case */ ++ if (!iinfo) ++ return; ++ ++ if (iinfo->ii_vdir) ++ au_vdir_free(iinfo->ii_vdir); ++ ++ if (iinfo->ii_bstart >= 0) { ++ sb = inode->i_sb; ++ ino = 0; ++ if (unlinked) ++ ino = inode->i_ino; ++ hi = iinfo->ii_hinode + iinfo->ii_bstart; ++ bend = iinfo->ii_bend; ++ while (iinfo->ii_bstart++ <= bend) { ++ if (hi->hi_inode) { ++ if (unlinked || !hi->hi_inode->i_nlink) { ++ au_iinfo_write0(sb, hi, ino); ++ /* ignore this error */ ++ ino = 0; ++ } ++ au_hiput(hi); ++ } ++ hi++; ++ } ++ } ++ ++ kfree(iinfo->ii_hinode); ++ AuRwDestroy(&iinfo->ii_rwsem); ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/inode.c linux-2.6.31/fs/aufs/inode.c +--- linux-2.6.31-vanilla/fs/aufs/inode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/inode.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,413 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode functions ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_igrab(struct inode *inode) ++{ ++ if (inode) { ++ AuDebugOn(!atomic_read(&inode->i_count)); ++ atomic_inc_return(&inode->i_count); ++ } ++ return inode; ++} ++ ++static void au_refresh_hinode_attr(struct inode *inode, int do_version) ++{ ++ au_cpup_attr_all(inode, /*force*/0); ++ au_update_iigen(inode); ++ if (do_version) ++ inode->i_version++; ++} ++ ++int au_refresh_hinode_self(struct inode *inode, int do_attr) ++{ ++ int err; ++ aufs_bindex_t bindex, new_bindex; ++ unsigned char update; ++ struct inode *first; ++ struct au_hinode *p, *q, tmp; ++ struct super_block *sb; ++ struct au_iinfo *iinfo; ++ ++ IiMustWriteLock(inode); ++ ++ update = 0; ++ sb = inode->i_sb; ++ iinfo = au_ii(inode); ++ err = au_ii_realloc(iinfo, au_sbend(sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ ++ p = iinfo->ii_hinode + iinfo->ii_bstart; ++ first = p->hi_inode; ++ err = 0; ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; ++ bindex++, p++) { ++ if (!p->hi_inode) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hi_id); ++ if (new_bindex == bindex) ++ continue; ++ ++ if (new_bindex < 0) { ++ update++; ++ au_hiput(p); ++ p->hi_inode = NULL; ++ continue; ++ } ++ ++ if (new_bindex < iinfo->ii_bstart) ++ iinfo->ii_bstart = new_bindex; ++ if (iinfo->ii_bend < new_bindex) ++ iinfo->ii_bend = new_bindex; ++ /* swap two lower inode, and loop again */ ++ q = iinfo->ii_hinode + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hi_inode) { ++ bindex--; ++ p--; ++ } ++ } ++ au_update_brange(inode, /*do_put_zero*/0); ++ if (do_attr) ++ au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); ++ ++ out: ++ return err; ++} ++ ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry) ++{ ++ int err, update; ++ unsigned int flags; ++ aufs_bindex_t bindex, bend; ++ unsigned char isdir; ++ struct inode *first; ++ struct au_hinode *p; ++ struct au_iinfo *iinfo; ++ ++ err = au_refresh_hinode_self(inode, /*do_attr*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ update = 0; ++ iinfo = au_ii(inode); ++ p = iinfo->ii_hinode + iinfo->ii_bstart; ++ first = p->hi_inode; ++ isdir = S_ISDIR(inode->i_mode); ++ flags = au_hi_flags(inode, isdir); ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { ++ struct inode *h_i; ++ struct dentry *h_d; ++ ++ h_d = au_h_dptr(dentry, bindex); ++ if (!h_d || !h_d->d_inode) ++ continue; ++ ++ if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) { ++ h_i = au_h_iptr(inode, bindex); ++ if (h_i) { ++ if (h_i == h_d->d_inode) ++ continue; ++ err = -EIO; ++ break; ++ } ++ } ++ if (bindex < iinfo->ii_bstart) ++ iinfo->ii_bstart = bindex; ++ if (iinfo->ii_bend < bindex) ++ iinfo->ii_bend = bindex; ++ au_set_h_iptr(inode, bindex, au_igrab(h_d->d_inode), flags); ++ update = 1; ++ } ++ au_update_brange(inode, /*do_put_zero*/0); ++ ++ if (unlikely(err)) ++ goto out; ++ ++ au_refresh_hinode_attr(inode, update && isdir); ++ ++ out: ++ return err; ++} ++ ++static int set_inode(struct inode *inode, struct dentry *dentry) ++{ ++ int err; ++ unsigned int flags; ++ umode_t mode; ++ aufs_bindex_t bindex, bstart, btail; ++ unsigned char isdir; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct au_iinfo *iinfo; ++ ++ 
IiMustWriteLock(inode); ++ ++ err = 0; ++ isdir = 0; ++ bstart = au_dbstart(dentry); ++ h_inode = au_h_dptr(dentry, bstart)->d_inode; ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_iop; ++ inode->i_fop = &aufs_file_fop; ++ inode->i_mapping->a_ops = &aufs_aop; ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ btail = au_dbtaildir(dentry); ++ inode->i_op = &aufs_dir_iop; ++ inode->i_fop = &aufs_dir_fop; ++ break; ++ case S_IFLNK: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_symlink_iop; ++ break; ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ case S_IFSOCK: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_iop; ++ init_special_inode(inode, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown file type 0%o\n", mode); ++ err = -EIO; ++ goto out; ++ } ++ ++ /* do not set inotify for whiteouted dirs (SHWH mode) */ ++ flags = au_hi_flags(inode, isdir); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) ++ && au_ftest_hi(flags, HINOTIFY) ++ && dentry->d_name.len > AUFS_WH_PFX_LEN ++ && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) ++ au_fclr_hi(flags, HINOTIFY); ++ iinfo = au_ii(inode); ++ iinfo->ii_bstart = bstart; ++ iinfo->ii_bend = btail; ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) ++ au_set_h_iptr(inode, bindex, ++ au_igrab(h_dentry->d_inode), flags); ++ } ++ au_cpup_attr_all(inode, /*force*/1); ++ ++ out: ++ return err; ++} ++ ++/* successful returns with iinfo write_locked */ ++static int reval_inode(struct inode *inode, struct dentry *dentry, int *matched) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct inode *h_inode, *h_dinode; ++ ++ *matched = 0; ++ ++ /* ++ * before this function, if aufs got any iinfo lock, it must be only ++ * one, the parent dir. ++ * it can happen by UDBA and the obsoleted inode number. ++ */ ++ err = -EIO; ++ if (unlikely(inode->i_ino == parent_ino(dentry))) ++ goto out; ++ ++ err = 0; ++ ii_write_lock_new_child(inode); ++ h_dinode = au_h_dptr(dentry, au_dbstart(dentry))->d_inode; ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode && h_inode == h_dinode) { ++ *matched = 1; ++ err = 0; ++ if (au_iigen(inode) != au_digen(dentry)) ++ err = au_refresh_hinode(inode, dentry); ++ break; ++ } ++ } ++ ++ if (unlikely(err)) ++ ii_write_unlock(inode); ++ out: ++ return err; ++} ++ ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino) ++{ ++ int err; ++ struct mutex *mtx; ++ const int isdir = (d_type == DT_DIR); ++ ++ /* prevent hardlinks from race condition */ ++ mtx = NULL; ++ if (!isdir) { ++ mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; ++ mutex_lock(mtx); ++ } ++ err = au_xino_read(sb, bindex, h_ino, ino); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!*ino) { ++ err = -EIO; ++ *ino = au_xino_new_ino(sb); ++ if (unlikely(!*ino)) ++ goto out; ++ err = au_xino_write(sb, bindex, h_ino, *ino); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ out: ++ if (!isdir) ++ mutex_unlock(mtx); ++ return err; ++} ++ ++/* successful returns with iinfo write_locked */ ++/* todo: return with unlocked? 
*/ ++struct inode *au_new_inode(struct dentry *dentry, int must_new) ++{ ++ struct inode *inode; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ ino_t h_ino, ino; ++ int err, match; ++ aufs_bindex_t bstart; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ h_dentry = au_h_dptr(dentry, bstart); ++ h_ino = h_dentry->d_inode->i_ino; ++ err = au_xino_read(sb, bstart, h_ino, &ino); ++ inode = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ new_ino: ++ if (!ino) { ++ ino = au_xino_new_ino(sb); ++ if (unlikely(!ino)) { ++ inode = ERR_PTR(-EIO); ++ goto out; ++ } ++ } ++ ++ AuDbg("i%lu\n", (unsigned long)ino); ++ inode = au_iget_locked(sb, ino); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); ++ if (inode->i_state & I_NEW) { ++ ii_write_lock_new_child(inode); ++ err = set_inode(inode, dentry); ++ unlock_new_inode(inode); ++ if (!err) ++ goto out; /* success */ ++ ++ iget_failed(inode); ++ ii_write_unlock(inode); ++ goto out_iput; ++ } else if (!must_new) { ++ err = reval_inode(inode, dentry, &match); ++ if (!err) ++ goto out; /* success */ ++ else if (match) ++ goto out_iput; ++ } ++ ++ if (unlikely(au_test_fs_unique_ino(h_dentry->d_inode))) ++ AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," ++ " b%d, %s, %.*s, hi%lu, i%lu.\n", ++ bstart, au_sbtype(h_dentry->d_sb), AuDLNPair(dentry), ++ (unsigned long)h_ino, (unsigned long)ino); ++ ino = 0; ++ err = au_xino_write(sb, bstart, h_ino, /*ino*/0); ++ if (!err) { ++ iput(inode); ++ goto new_ino; ++ } ++ ++ out_iput: ++ iput(inode); ++ inode = ERR_PTR(err); ++ out: ++ return inode; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode) ++{ ++ int err; ++ ++ err = au_br_rdonly(au_sbr(sb, bindex)); ++ ++ /* pseudo-link after flushed may happen out of bounds */ ++ if (!err ++ && inode ++ && au_ibstart(inode) <= bindex ++ && bindex <= au_ibend(inode)) { ++ /* ++ * permission check is unnecessary since vfsub routine ++ * will be called later ++ */ ++ struct inode *hi = au_h_iptr(inode, bindex); ++ if (hi) ++ err = IS_IMMUTABLE(hi) ? -EROFS : 0; ++ } ++ ++ return err; ++} ++ ++int au_test_h_perm(struct inode *h_inode, int mask) ++{ ++ if (!current_fsuid()) ++ return 0; ++ return inode_permission(h_inode, mask); ++} ++ ++int au_test_h_perm_sio(struct inode *h_inode, int mask) ++{ ++ if (au_test_nfs(h_inode->i_sb) ++ && (mask & MAY_WRITE) ++ && S_ISDIR(h_inode->i_mode)) ++ mask |= MAY_READ; /* force permission check */ ++ return au_test_h_perm(h_inode, mask); ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/inode.h linux-2.6.31/fs/aufs/inode.h +--- linux-2.6.31-vanilla/fs/aufs/inode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/inode.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,497 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations ++ */ ++ ++#ifndef __AUFS_INODE_H__ ++#define __AUFS_INODE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fs.h> ++#include <linux/inotify.h> ++#include <linux/aufs_type.h> ++#include "rwsem.h" ++ ++struct vfsmount; ++ ++struct au_hinotify { ++#ifdef CONFIG_AUFS_HINOTIFY ++ struct inotify_watch hin_watch; ++ struct inode *hin_aufs_inode; /* no get/put */ ++#endif ++}; ++ ++struct au_hinode { ++ struct inode *hi_inode; ++ aufs_bindex_t hi_id; ++#ifdef CONFIG_AUFS_HINOTIFY ++ struct au_hinotify *hi_notify; ++#endif ++ ++ /* reference to the copied-up whiteout with get/put */ ++ struct dentry *hi_whdentry; ++}; ++ ++struct au_vdir; ++struct au_iinfo { ++ atomic_t ii_generation; ++ struct super_block *ii_hsb1; /* no get/put */ ++ ++ struct au_rwsem ii_rwsem; ++ aufs_bindex_t ii_bstart, ii_bend; ++ __u32 ii_higen; ++ struct au_hinode *ii_hinode; ++ struct au_vdir *ii_vdir; ++}; ++ ++struct au_icntnr { ++ struct au_iinfo iinfo; ++ struct inode vfs_inode; ++}; ++ ++/* au_pin flags */ ++#define AuPin_DI_LOCKED 1 ++#define AuPin_MNT_WRITE (1 << 1) ++#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) ++#define au_fset_pin(flags, name) { (flags) |= AuPin_##name; } ++#define au_fclr_pin(flags, name) { (flags) &= ~AuPin_##name; } ++ ++struct au_pin { ++ /* input */ ++ struct dentry *dentry; ++ unsigned int udba; ++ unsigned char lsc_di, lsc_hi, flags; ++ aufs_bindex_t bindex; ++ ++ /* output */ ++ struct dentry *parent; ++ struct au_hinode *hdir; ++ struct vfsmount *h_mnt; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_iinfo *au_ii(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); ++ if (iinfo->ii_hinode) ++ return iinfo; ++ return NULL; /* debugging bad_inode case */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* inode.c */ ++struct inode *au_igrab(struct inode *inode); ++int au_refresh_hinode_self(struct inode *inode, int do_attr); ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry); ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino); ++struct inode *au_new_inode(struct dentry *dentry, int must_new); ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode); ++int au_test_h_perm(struct inode *h_inode, int mask); ++int au_test_h_perm_sio(struct inode *h_inode, int mask); ++ ++static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, ++ ino_t h_ino, unsigned int d_type, ino_t *ino) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ return au_ino(sb, bindex, h_ino, d_type, ino); ++#else ++ return 0; ++#endif ++} ++ ++/* i_op.c */ ++extern struct inode_operations aufs_iop, aufs_symlink_iop, aufs_dir_iop; ++ ++/* au_wr_dir flags */ ++#define AuWrDir_ADD_ENTRY 1 ++#define AuWrDir_ISDIR (1 << 1) ++#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) ++#define au_fset_wrdir(flags, name) { (flags) |= AuWrDir_##name; } ++#define au_fclr_wrdir(flags, name) { (flags) &= ~AuWrDir_##name; } ++ ++struct au_wr_dir_args { ++ aufs_bindex_t force_btgt; ++ unsigned char flags; ++}; ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct 
au_wr_dir_args *args); ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin); ++void au_pin_init(struct au_pin *pin, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags); ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) __must_check; ++int au_do_pin(struct au_pin *pin) __must_check; ++void au_unpin(struct au_pin *pin); ++ ++/* i_op_add.c */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev); ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); ++int aufs_create(struct inode *dir, struct dentry *dentry, int mode, ++ struct nameidata *nd); ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry); ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode); ++ ++/* i_op_del.c */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_unlink(struct inode *dir, struct dentry *dentry); ++int aufs_rmdir(struct inode *dir, struct dentry *dentry); ++ ++/* i_op_ren.c */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); ++int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct dentry *dentry); ++ ++/* iinfo.c */ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); ++void au_hiput(struct au_hinode *hinode); ++void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex); ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh); ++unsigned int au_hi_flags(struct inode *inode, int isdir); ++ ++/* hinode flags */ ++#define AuHi_XINO 1 ++#define AuHi_HINOTIFY (1 << 1) ++#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) ++#define au_fset_hi(flags, name) { (flags) |= AuHi_##name; } ++#define au_fclr_hi(flags, name) { (flags) &= ~AuHi_##name; } ++ ++#ifndef CONFIG_AUFS_HINOTIFY ++#undef AuHi_HINOTIFY ++#define AuHi_HINOTIFY 0 ++#endif ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags); ++ ++void au_update_iigen(struct inode *inode); ++void au_update_brange(struct inode *inode, int do_put_zero); ++ ++int au_iinfo_init(struct inode *inode); ++void au_iinfo_fin(struct inode *inode); ++int au_ii_realloc(struct au_iinfo *iinfo, int nbr); ++ ++/* plink.c */ ++void au_plink_block_maintain(struct super_block *sb); ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb); ++#else ++static inline void au_plink_list(struct super_block *sb) ++{ ++ /* nothing */ ++} ++#endif ++int au_plink_test(struct inode *inode); ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++void au_plink_put(struct super_block *sb); ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); ++long au_plink_ioctl(struct file *file, unsigned int cmd); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for iinfo */ ++enum { ++ AuLsc_II_CHILD, /* child first */ ++ AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hinotify */ ++ AuLsc_II_CHILD3, /* copyup dirs */ ++ AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ 
++ AuLsc_II_PARENT2, ++ AuLsc_II_PARENT3, /* copyup dirs */ ++ AuLsc_II_NEW_CHILD ++}; ++ ++/* ++ * ii_read_lock_child, ii_write_lock_child, ++ * ii_read_lock_child2, ii_write_lock_child2, ++ * ii_read_lock_child3, ii_write_lock_child3, ++ * ii_read_lock_parent, ii_write_lock_parent, ++ * ii_read_lock_parent2, ii_write_lock_parent2, ++ * ii_read_lock_parent3, ii_write_lock_parent3, ++ * ii_read_lock_new_child, ii_write_lock_new_child, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void ii_read_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void ii_write_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++AuRWLockFuncs(new_child, NEW_CHILD); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++/* ++ * ii_read_unlock, ii_write_unlock, ii_downgrade_lock ++ */ ++AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); ++ ++#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) ++#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) ++#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline unsigned int au_iigen(struct inode *inode) ++{ ++ return atomic_read(&au_ii(inode)->ii_generation); ++} ++ ++/* tiny test for inode number */ ++/* tmpfs generation is too rough */ ++static inline int au_test_higen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = au_ii(inode); ++ AuRwMustAnyLock(&iinfo->ii_rwsem); ++ return !(iinfo->ii_hsb1 == h_inode->i_sb ++ && iinfo->ii_higen == h_inode->i_generation); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline aufs_bindex_t au_ii_br_id(struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode[0 + bindex].hi_id; ++} ++ ++static inline aufs_bindex_t au_ibstart(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_bstart; ++} ++ ++static inline aufs_bindex_t au_ibend(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_bend; ++} ++ ++static inline struct au_vdir *au_ivdir(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_vdir; ++} ++ ++static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry; ++} ++ ++static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_bend = bindex; ++} ++ ++static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_vdir = vdir; ++} ++ ++static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode + bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_pinned_parent(struct au_pin 
*pin) ++{ ++ if (pin) ++ return pin->parent; ++ return NULL; ++} ++ ++static inline struct inode *au_pinned_h_dir(struct au_pin *pin) ++{ ++ if (pin && pin->hdir) ++ return pin->hdir->hi_inode; ++ return NULL; ++} ++ ++static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) ++{ ++ if (pin) ++ return pin->hdir; ++ return NULL; ++} ++ ++static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) ++{ ++ if (pin) ++ pin->dentry = dentry; ++} ++ ++static inline void au_pin_set_parent_lflag(struct au_pin *pin, ++ unsigned char lflag) ++{ ++ if (pin) { ++ /* dirty macros require brackets */ ++ if (lflag) { ++ au_fset_pin(pin->flags, DI_LOCKED); ++ } else { ++ au_fclr_pin(pin->flags, DI_LOCKED); ++ } ++ } ++} ++ ++static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) ++{ ++ if (pin) { ++ dput(pin->parent); ++ pin->parent = dget(parent); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_HINOTIFY ++/* hinotify.c */ ++int au_hin_alloc(struct au_hinode *hinode, struct inode *inode, ++ struct inode *h_inode); ++void au_hin_free(struct au_hinode *hinode); ++void au_hin_ctl(struct au_hinode *hinode, int do_set); ++void au_reset_hinotify(struct inode *inode, unsigned int flags); ++ ++int __init au_hinotify_init(void); ++void au_hinotify_fin(void); ++ ++static inline ++void au_hin_init(struct au_hinode *hinode, struct au_hinotify *val) ++{ ++ hinode->hi_notify = val; ++} ++ ++static inline void au_iigen_dec(struct inode *inode) ++{ ++ atomic_dec_return(&au_ii(inode)->ii_generation); ++} ++ ++#else ++static inline ++int au_hin_alloc(struct au_hinode *hinode __maybe_unused, ++ struct inode *inode __maybe_unused, ++ struct inode *h_inode __maybe_unused) ++{ ++ return -EOPNOTSUPP; ++} ++ ++static inline void au_hin_free(struct au_hinode *hinode __maybe_unused) ++{ ++ /* nothing */ ++} ++ ++static inline void au_hin_ctl(struct au_hinode *hinode __maybe_unused, ++ int do_set __maybe_unused) ++{ ++ /* nothing */ ++} ++ ++static inline void au_reset_hinotify(struct inode *inode __maybe_unused, ++ unsigned int flags __maybe_unused) ++{ ++ /* nothing */ ++} ++ ++static inline int au_hinotify_init(void) ++{ ++ return 0; ++} ++ ++#define au_hinotify_fin() do {} while (0) ++ ++static inline ++void au_hin_init(struct au_hinode *hinode __maybe_unused, ++ struct au_hinotify *val __maybe_unused) ++{ ++ /* empty */ ++} ++#endif /* CONFIG_AUFS_HINOTIFY */ ++ ++static inline void au_hin_suspend(struct au_hinode *hdir) ++{ ++ au_hin_ctl(hdir, /*do_set*/0); ++} ++ ++static inline void au_hin_resume(struct au_hinode *hdir) ++{ ++ au_hin_ctl(hdir, /*do_set*/1); ++} ++ ++static inline void au_hin_imtx_lock(struct au_hinode *hdir) ++{ ++ mutex_lock(&hdir->hi_inode->i_mutex); ++ au_hin_suspend(hdir); ++} ++ ++static inline void au_hin_imtx_lock_nested(struct au_hinode *hdir, ++ unsigned int sc __maybe_unused) ++{ ++ mutex_lock_nested(&hdir->hi_inode->i_mutex, sc); ++ au_hin_suspend(hdir); ++} ++ ++static inline void au_hin_imtx_unlock(struct au_hinode *hdir) ++{ ++ au_hin_resume(hdir); ++ mutex_unlock(&hdir->hi_inode->i_mutex); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_INODE_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/ioctl.c linux-2.6.31/fs/aufs/ioctl.c +--- linux-2.6.31-vanilla/fs/aufs/ioctl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/ioctl.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,47 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * ioctl ++ * plink-management and readdir in userspace. ++ */ ++ ++#include "aufs.h" ++ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_PLINK_MAINT: ++ case AUFS_CTL_PLINK_CLEAN: ++ err = au_plink_ioctl(file, cmd); ++ break; ++ ++ case AUFS_CTL_RDU: ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ioctl(file, cmd, arg); ++ break; ++ ++ default: ++ err = -EINVAL; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op_add.c linux-2.6.31/fs/aufs/i_op_add.c +--- linux-2.6.31-vanilla/fs/aufs/i_op_add.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/i_op_add.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,649 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (add entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * final procedure of adding a new entry, except link(2). ++ * remove whiteout, instantiate, copyup the parent dir's times and size ++ * and update version. ++ * if it failed, re-create the removed whiteout. 
++ */ ++static int epilog(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct dentry *dentry) ++{ ++ int err, rerr; ++ aufs_bindex_t bwh; ++ struct path h_path; ++ struct inode *inode, *h_dir; ++ struct dentry *wh; ++ ++ bwh = -1; ++ if (wh_dentry) { ++ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(h_dir); ++ AuDebugOn(au_h_iptr(dir, bindex) != h_dir); ++ bwh = au_dbwh(dentry); ++ h_path.dentry = wh_dentry; ++ h_path.mnt = au_sbr_mnt(dir->i_sb, bindex); ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ inode = au_new_inode(dentry, /*must_new*/1); ++ if (!IS_ERR(inode)) { ++ d_instantiate(dentry, inode); ++ dir = dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(dir); ++ if (au_ibstart(dir) == au_dbstart(dentry)) ++ au_cpup_attr_timesizes(dir); ++ dir->i_version++; ++ return 0; /* success */ ++ } ++ ++ err = PTR_ERR(inode); ++ if (!wh_dentry) ++ goto out; ++ ++ /* revert */ ++ /* dir inode is locked */ ++ wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); ++ rerr = PTR_ERR(wh); ++ if (IS_ERR(wh)) { ++ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } else ++ dput(wh); ++ ++ out: ++ return err; ++} ++ ++/* ++ * simple tests for the adding inode operations. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ h_inode = h_dentry->d_inode; ++ if (!dentry->d_inode) { ++ err = -EEXIST; ++ if (unlikely(h_inode)) ++ goto out; ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } ++ ++ err = -EIO; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ goto out; ++ err = 0; ++ ++ out: ++ return err; ++} ++ ++/* ++ * initial procedure of adding a new entry. ++ * prepare writable branch and the parent dir, lock it, ++ * and lookup whiteout for the new entry. 
++ */ ++static struct dentry* ++lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, ++ struct dentry *src_dentry, struct au_pin *pin, ++ struct au_wr_dir_args *wr_dir_args) ++{ ++ struct dentry *wh_dentry, *h_parent; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ err = au_wr_dir(dentry, src_dentry, wr_dir_args); ++ bcpup = err; ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_parent = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbstart(dentry) == bcpup) { ++ err = au_may_add(dentry, bcpup, h_parent, ++ au_ftest_wrdir(wr_dir_args->flags, ISDIR)); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ } ++ ++ br = au_sbr(sb, bcpup); ++ if (dt) { ++ struct path tmp = { ++ .dentry = h_parent, ++ .mnt = br->br_mnt ++ }; ++ au_dtime_store(dt, au_pinned_parent(pin), &tmp); ++ } ++ ++ wh_dentry = NULL; ++ if (bcpup != au_dbwh(dentry)) ++ goto out; /* success */ ++ ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ ++ out_unpin: ++ if (IS_ERR(wh_dentry)) ++ au_unpin(pin); ++ out: ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { Mknod, Symlink, Creat }; ++struct simple_arg { ++ int type; ++ union { ++ struct { ++ int mode; ++ struct nameidata *nd; ++ } c; ++ struct { ++ const char *symname; ++ } s; ++ struct { ++ int mode; ++ dev_t dev; ++ } m; ++ } u; ++}; ++ ++static int add_simple(struct inode *dir, struct dentry *dentry, ++ struct simple_arg *arg) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ unsigned char created; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ struct dentry *wh_dentry, *parent; ++ struct inode *h_dir; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ IMustLock(dir); ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ aufs_read_lock(dentry, AuLock_DW); ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, /*src_dentry*/NULL, &pin, ++ &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ bstart = au_dbstart(dentry); ++ h_path.dentry = au_h_dptr(dentry, bstart); ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); ++ h_dir = au_pinned_h_dir(&pin); ++ switch (arg->type) { ++ case Creat: ++ err = vfsub_create(h_dir, &h_path, arg->u.c.mode); ++ break; ++ case Symlink: ++ err = vfsub_symlink(h_dir, &h_path, arg->u.s.symname); ++ break; ++ case Mknod: ++ err = vfsub_mknod(h_dir, &h_path, arg->u.m.mode, arg->u.m.dev); ++ break; ++ default: ++ BUG(); ++ } ++ created = !err; ++ if (!err) ++ err = epilog(dir, bstart, wh_dentry, dentry); ++ ++ /* revert */ ++ if (unlikely(created && err && h_path.dentry->d_inode)) { ++ int rerr; ++ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ if (rerr) { ++ AuIOErr("%.*s revert failure(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++ d_drop(dentry); ++ } ++ ++ au_unpin(&pin); ++ dput(wh_dentry); ++ ++ out: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ di_write_unlock(parent); ++ aufs_read_unlock(dentry, AuLock_DW); ++ return err; ++} ++ ++int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 
++{ ++ struct simple_arg arg = { ++ .type = Mknod, ++ .u.m = { ++ .mode = mode, ++ .dev = dev ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) ++{ ++ struct simple_arg arg = { ++ .type = Symlink, ++ .u.s.symname = symname ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_create(struct inode *dir, struct dentry *dentry, int mode, ++ struct nameidata *nd) ++{ ++ struct simple_arg arg = { ++ .type = Creat, ++ .u.c = { ++ .mode = mode, ++ .nd = nd ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_link_args { ++ aufs_bindex_t bdst, bsrc; ++ struct au_pin pin; ++ struct path h_path; ++ struct dentry *src_parent, *parent; ++}; ++ ++static int au_cpup_before_link(struct dentry *src_dentry, ++ struct au_link_args *a) ++{ ++ int err; ++ struct dentry *h_src_dentry; ++ struct mutex *h_mtx; ++ ++ di_read_lock_parent(a->src_parent, AuLock_IR); ++ err = au_test_and_cpup_dirs(src_dentry, a->bdst); ++ if (unlikely(err)) ++ goto out; ++ ++ h_src_dentry = au_h_dptr(src_dentry, a->bsrc); ++ h_mtx = &h_src_dentry->d_inode->i_mutex; ++ err = au_pin(&a->pin, src_dentry, a->bdst, ++ au_opt_udba(src_dentry->d_sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ err = au_sio_cpup_simple(src_dentry, a->bdst, -1, ++ AuCpup_DTIME /* | AuCpup_KEEPLINO */); ++ mutex_unlock(h_mtx); ++ au_unpin(&a->pin); ++ ++ out: ++ di_read_unlock(a->src_parent, AuLock_IR); ++ return err; ++} ++ ++static int au_cpup_or_link(struct dentry *src_dentry, struct au_link_args *a) ++{ ++ int err; ++ unsigned char plink; ++ struct inode *h_inode, *inode; ++ struct dentry *h_src_dentry; ++ struct super_block *sb; ++ ++ plink = 0; ++ h_inode = NULL; ++ sb = src_dentry->d_sb; ++ inode = src_dentry->d_inode; ++ if (au_ibstart(inode) <= a->bdst) ++ h_inode = au_h_iptr(inode, a->bdst); ++ if (!h_inode || !h_inode->i_nlink) { ++ /* copyup src_dentry as the name of dentry. 
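/*
 * [editor's note] au_cpup_or_link() around this point chooses between a
 * cheap hard link (the inode is already present on the target branch)
 * and a full copy-up. The same try-link-then-copy pattern in
 * self-contained userspace form -- an analogue, not the aufs code:
 */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int link_or_copy(const char *src, const char *dst)
{
        char buf[4096];
        ssize_t n;
        int in, out;

        if (link(src, dst) == 0)
                return 0;               /* same fs: hard link is enough */
        if (errno != EXDEV)
                return -1;              /* genuine failure */

        in = open(src, O_RDONLY);       /* cross-fs: fall back to a copy */
        if (in < 0)
                return -1;
        out = open(dst, O_WRONLY | O_CREAT | O_EXCL, 0600);
        if (out < 0) {
                close(in);
                return -1;
        }
        while ((n = read(in, buf, sizeof(buf))) > 0)
                if (write(out, buf, n) != n) {
                        n = -1;
                        break;
                }
        close(in);
        close(out);
        return n == 0 ? 0 : -1;         /* 0 only if src was fully drained */
}

int main(int argc, char **argv)
{
        return argc == 3 ? link_or_copy(argv[1], argv[2]) : 2;
}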
*/ ++ au_set_dbstart(src_dentry, a->bdst); ++ au_set_h_dptr(src_dentry, a->bdst, dget(a->h_path.dentry)); ++ h_inode = au_h_dptr(src_dentry, a->bsrc)->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ err = au_sio_cpup_single(src_dentry, a->bdst, a->bsrc, -1, ++ AuCpup_KEEPLINO, a->parent); ++ mutex_unlock(&h_inode->i_mutex); ++ au_set_h_dptr(src_dentry, a->bdst, NULL); ++ au_set_dbstart(src_dentry, a->bsrc); ++ } else { ++ /* the inode of src_dentry already exists on a.bdst branch */ ++ h_src_dentry = d_find_alias(h_inode); ++ if (!h_src_dentry && au_plink_test(inode)) { ++ plink = 1; ++ h_src_dentry = au_plink_lkup(inode, a->bdst); ++ err = PTR_ERR(h_src_dentry); ++ if (IS_ERR(h_src_dentry)) ++ goto out; ++ ++ if (unlikely(!h_src_dentry->d_inode)) { ++ dput(h_src_dentry); ++ h_src_dentry = NULL; ++ } ++ ++ } ++ if (h_src_dentry) { ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ dput(h_src_dentry); ++ } else { ++ AuIOErr("no dentry found for hi%lu on b%d\n", ++ h_inode->i_ino, a->bdst); ++ err = -EIO; ++ } ++ } ++ ++ if (!err && !plink) ++ au_plink_append(inode, a->bdst, a->h_path.dentry); ++ ++out: ++ return err; ++} ++ ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry) ++{ ++ int err, rerr; ++ struct au_dtime dt; ++ struct au_link_args *a; ++ struct dentry *wh_dentry, *h_src_dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ IMustLock(dir); ++ inode = src_dentry->d_inode; ++ IMustLock(inode); ++ ++ err = -ENOENT; ++ if (unlikely(!inode->i_nlink)) ++ goto out; ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->parent = dentry->d_parent; /* dir inode is locked */ ++ aufs_read_and_write_lock2(dentry, src_dentry, /*AuLock_FLUSH*/0); ++ a->src_parent = dget_parent(src_dentry); ++ wr_dir_args.force_btgt = au_dbstart(src_dentry); ++ ++ di_write_lock_parent(a->parent); ++ wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, ++ &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_unlock; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ a->bdst = au_dbstart(dentry); ++ a->h_path.dentry = au_h_dptr(dentry, a->bdst); ++ a->h_path.mnt = au_sbr_mnt(sb, a->bdst); ++ a->bsrc = au_dbstart(src_dentry); ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) ++ err = au_cpup_or_link(src_dentry, a); ++ else { ++ h_src_dentry = au_h_dptr(src_dentry, a->bdst); ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ } ++ } else { ++ /* ++ * copyup src_dentry to the branch we process, ++ * and then link(2) to it. 
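/*
 * [editor's note] aufs_link() here unwinds its partial state through a
 * label chain (out_revert, out_dt, out_unpin, out_wh, out_unlock): acquire
 * in order, jump to the matching label on error, release in reverse order.
 * The idiom in miniature, with illustrative resources:
 */
#include <stdio.h>
#include <stdlib.h>

static int do_work(void)
{
        int err = -1;
        char *a = NULL, *b = NULL;
        FILE *f = NULL;

        a = malloc(32);
        if (!a)
                goto out;
        b = malloc(32);
        if (!b)
                goto out_a;
        f = fopen("/dev/null", "w");
        if (!f)
                goto out_b;

        err = 0;                /* success: every resource is live here */
        fclose(f);
out_b:
        free(b);
out_a:
        free(a);
out:
        return err;
}

int main(void)
{
        printf("do_work: %d\n", do_work());
        return 0;
}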
++ */ ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { ++ au_unpin(&a->pin); ++ di_write_unlock(a->parent); ++ err = au_cpup_before_link(src_dentry, a); ++ di_write_lock_parent(a->parent); ++ if (!err) ++ err = au_pin(&a->pin, dentry, a->bdst, ++ au_opt_udba(sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_wh; ++ } ++ if (!err) { ++ h_src_dentry = au_h_dptr(src_dentry, a->bdst); ++ err = -ENOENT; ++ if (h_src_dentry && h_src_dentry->d_inode) ++ err = vfsub_link(h_src_dentry, ++ au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ } ++ } ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ if (wh_dentry) { ++ a->h_path.dentry = wh_dentry; ++ err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out_revert; ++ } ++ ++ dir->i_version++; ++ if (au_ibstart(dir) == au_dbstart(dentry)) ++ au_cpup_attr_timesizes(dir); ++ inc_nlink(inode); ++ inode->i_ctime = dir->i_ctime; ++ if (!d_unhashed(a->h_path.dentry)) ++ d_instantiate(dentry, au_igrab(inode)); ++ else ++ /* some filesystem calls d_drop() */ ++ d_drop(dentry); ++ goto out_unpin; /* success */ ++ ++ out_revert: ++ rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, /*force*/0); ++ if (!rerr) ++ goto out_dt; ++ AuIOErr("%.*s reverting failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ out_dt: ++ d_drop(dentry); ++ au_dtime_revert(&dt); ++ out_unpin: ++ au_unpin(&a->pin); ++ out_wh: ++ dput(wh_dentry); ++ out_unlock: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ di_write_unlock(a->parent); ++ dput(a->src_parent); ++ aufs_read_and_write_unlock2(dentry, src_dentry); ++ kfree(a); ++ out: ++ return err; ++} ++ ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ unsigned char diropq; ++ struct path h_path; ++ struct dentry *wh_dentry, *parent, *opq_dentry; ++ struct mutex *h_mtx; ++ struct super_block *sb; ++ struct { ++ struct au_pin pin; ++ struct au_dtime dt; ++ } *a; /* reduce the stack usage */ ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR ++ }; ++ ++ IMustLock(dir); ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ aufs_read_lock(dentry, AuLock_DW); ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, ++ &a->pin, &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_free; ++ ++ sb = dentry->d_sb; ++ bindex = au_dbstart(dentry); ++ h_path.dentry = au_h_dptr(dentry, bindex); ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ /* make the dir opaque */ ++ diropq = 0; ++ h_mtx = &h_path.dentry->d_inode->i_mutex; ++ if (wh_dentry ++ || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ opq_dentry = au_diropq_create(dentry, bindex); ++ mutex_unlock(h_mtx); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out_dir; ++ dput(opq_dentry); ++ diropq = 1; ++ } ++ ++ err = epilog(dir, bindex, wh_dentry, dentry); ++ if (!err) { ++ inc_nlink(dir); ++ goto out_unlock; /* success */ ++ } ++ ++ /* revert */ ++ if (diropq) { ++ AuLabel(revert opq); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bindex); ++ 
mutex_unlock(h_mtx); ++ if (rerr) { ++ AuIOErr("%.*s reverting diropq failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ } ++ ++ out_dir: ++ AuLabel(revert dir); ++ rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); ++ if (rerr) { ++ AuIOErr("%.*s reverting dir failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ d_drop(dentry); ++ au_dtime_revert(&a->dt); ++ out_unlock: ++ au_unpin(&a->pin); ++ dput(wh_dentry); ++ out_free: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ di_write_unlock(parent); ++ aufs_read_unlock(dentry, AuLock_DW); ++ kfree(a); ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op.c linux-2.6.31/fs/aufs/i_op.c +--- linux-2.6.31-vanilla/fs/aufs/i_op.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/i_op.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,891 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (except add/del/rename) ++ */ ++ ++#include <linux/device_cgroup.h> ++#include <linux/fs_stack.h> ++#include <linux/mm.h> ++#include <linux/namei.h> ++#include <linux/security.h> ++#include <linux/uaccess.h> ++#include "aufs.h" ++ ++static int h_permission(struct inode *h_inode, int mask, ++ struct vfsmount *h_mnt, int brperm) ++{ ++ int err; ++ const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ ++ err = -EACCES; ++ if ((write_mask && IS_IMMUTABLE(h_inode)) ++ || ((mask & MAY_EXEC) ++ && S_ISREG(h_inode->i_mode) ++ && ((h_mnt->mnt_flags & MNT_NOEXEC) ++ || !(h_inode->i_mode & S_IXUGO)))) ++ goto out; ++ ++ /* ++ * - skip the lower fs test in the case of write to ro branch. ++ * - nfs dir permission write check is optimized, but a policy for ++ * link/rename requires a real check. ++ */ ++ if ((write_mask && !au_br_writable(brperm)) ++ || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) ++ && write_mask && !(mask & MAY_READ)) ++ || !h_inode->i_op->permission) { ++ /* AuLabel(generic_permission); */ ++ err = generic_permission(h_inode, mask, NULL); ++ } else { ++ /* AuLabel(h_inode->permission); */ ++ err = h_inode->i_op->permission(h_inode, mask); ++ AuTraceErr(err); ++ } ++ ++ if (!err) ++ err = devcgroup_inode_permission(h_inode, mask); ++ if (!err) ++ err = security_inode_permission ++ (h_inode, mask & (MAY_READ | MAY_WRITE | MAY_EXEC ++ | MAY_APPEND)); ++ ++#if 0 ++ if (!err) { ++ /* todo: do we need to call ima_path_check()? 
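/*
 * [editor's note] aufs_permission() below grants a write request only if
 * at least one branch is mounted writable, and returns -EROFS otherwise.
 * That scan, reduced to a plain array walk (illustrative):
 */
#include <errno.h>
#include <stdio.h>

/* ro[i] != 0 when branch i is read-only; branch 0 is the top */
static int writable_branch_exists(const int *ro, int nbr)
{
        int i;

        for (i = 0; i < nbr; i++)
                if (!ro[i])
                        return 0;       /* found a writer */
        return -EROFS;                  /* every branch is read-only */
}

int main(void)
{
        int ro[] = { 1, 1 };
        printf("%d\n", writable_branch_exists(ro, 2)); /* -EROFS */
        return 0;
}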
*/ ++ struct path h_path = { ++ .dentry = ++ .mnt = h_mnt ++ }; ++ err = ima_path_check(&h_path, ++ mask & (MAY_READ | MAY_WRITE | MAY_EXEC), ++ IMA_COUNT_LEAVE); ++ } ++#endif ++ ++ out: ++ return err; ++} ++ ++static int aufs_permission(struct inode *inode, int mask) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ const unsigned char isdir = !!S_ISDIR(inode->i_mode); ++ const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ii_read_lock_child(inode); ++ ++ if (!isdir || write_mask) { ++ err = au_busy_or_stale(); ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ if (unlikely(!h_inode ++ || (h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT))) ++ goto out; ++ ++ err = 0; ++ bindex = au_ibstart(inode); ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, br->br_mnt, br->br_perm); ++ if (write_mask && !err) { ++ /* test whether the upper writable branch exists */ ++ err = -EROFS; ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = 0; ++ break; ++ } ++ } ++ goto out; ++ } ++ ++ /* non-write to dir */ ++ err = 0; ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) { ++ err = au_busy_or_stale(); ++ if (unlikely(!S_ISDIR(h_inode->i_mode))) ++ break; ++ ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, br->br_mnt, ++ br->br_perm); ++ } ++ } ++ ++ out: ++ ii_read_unlock(inode); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct dentry *ret, *parent; ++ struct inode *inode, *h_inode; ++ struct mutex *mtx; ++ struct super_block *sb; ++ int err, npositive; ++ aufs_bindex_t bstart; ++ ++ IMustLock(dir); ++ ++ sb = dir->i_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_alloc_dinfo(dentry); ++ ret = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_read_lock_parent(parent, AuLock_IR); ++ npositive = au_lkup_dentry(dentry, au_dbstart(parent), /*type*/0, nd); ++ di_read_unlock(parent, AuLock_IR); ++ err = npositive; ++ ret = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out_unlock; ++ ++ inode = NULL; ++ if (npositive) { ++ bstart = au_dbstart(dentry); ++ h_inode = au_h_dptr(dentry, bstart)->d_inode; ++ if (!S_ISDIR(h_inode->i_mode)) { ++ /* ++ * stop 'race'-ing between hardlinks under different ++ * parents. 
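/*
 * [editor's note] The xi_nondir_mtx taken just below serializes two
 * concurrent lookups of hard links to the same file, so only one of them
 * allocates the shared aufs inode. The shape of that
 * get-or-create-under-a-lock pattern, as a pthread sketch (names are
 * illustrative):
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t tbl_mtx = PTHREAD_MUTEX_INITIALIZER;
static int cached_ino;                  /* 0 means "not created yet" */

static int get_or_make_ino(int ino)
{
        int ret;

        pthread_mutex_lock(&tbl_mtx);
        if (!cached_ino)
                cached_ino = ino;       /* first caller creates */
        ret = cached_ino;               /* everyone else reuses */
        pthread_mutex_unlock(&tbl_mtx);
        return ret;
}

int main(void)
{
        printf("%d %d\n", get_or_make_ino(7), get_or_make_ino(9)); /* 7 7 */
        return 0;
}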
++ */ ++ mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; ++ mutex_lock(mtx); ++ inode = au_new_inode(dentry, /*must_new*/0); ++ mutex_unlock(mtx); ++ } else ++ inode = au_new_inode(dentry, /*must_new*/0); ++ ret = (void *)inode; ++ } ++ if (IS_ERR(inode)) ++ goto out_unlock; ++ ++ ret = d_splice_alias(inode, dentry); ++ if (unlikely(IS_ERR(ret) && inode)) ++ ii_write_unlock(inode); ++ ++ out_unlock: ++ di_write_unlock(dentry); ++ out: ++ si_read_unlock(sb); ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, ++ const unsigned char add_entry, aufs_bindex_t bcpup, ++ aufs_bindex_t bstart) ++{ ++ int err; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ ++ if (add_entry) { ++ au_update_dbstart(dentry); ++ IMustLock(parent->d_inode); ++ } else ++ di_write_lock_parent(parent); ++ ++ err = 0; ++ if (!au_h_dptr(parent, bcpup)) { ++ if (bstart < bcpup) ++ err = au_cpdown_dirs(dentry, bcpup); ++ else ++ err = au_cpup_dirs(dentry, bcpup); ++ } ++ if (!err && add_entry) { ++ h_parent = au_h_dptr(parent, bcpup); ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ err = au_lkup_neg(dentry, bcpup); ++ /* todo: no unlock here */ ++ mutex_unlock(&h_dir->i_mutex); ++ if (bstart < bcpup && au_dbstart(dentry) < 0) { ++ au_set_dbstart(dentry, 0); ++ au_update_dbrange(dentry, /*do_put_zero*/0); ++ } ++ } ++ ++ if (!add_entry) ++ di_write_unlock(parent); ++ if (!err) ++ err = bcpup; /* success */ ++ ++ return err; ++} ++ ++/* ++ * decide the branch and the parent dir where we will create a new entry. ++ * returns new bindex or an error. ++ * copyup the parent dir if needed. ++ */ ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct au_wr_dir_args *args) ++{ ++ int err; ++ aufs_bindex_t bcpup, bstart, src_bstart; ++ const unsigned char add_entry = !!au_ftest_wrdir(args->flags, ++ ADD_ENTRY); ++ struct super_block *sb; ++ struct dentry *parent; ++ struct au_sbinfo *sbinfo; ++ ++ sb = dentry->d_sb; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(dentry); ++ bcpup = bstart; ++ if (args->force_btgt < 0) { ++ if (src_dentry) { ++ src_bstart = au_dbstart(src_dentry); ++ if (src_bstart < bstart) ++ bcpup = src_bstart; ++ } else if (add_entry) { ++ err = AuWbrCreate(sbinfo, dentry, ++ au_ftest_wrdir(args->flags, ISDIR)); ++ bcpup = err; ++ } ++ ++ if (bcpup < 0 || au_test_ro(sb, bcpup, dentry->d_inode)) { ++ if (add_entry) ++ err = AuWbrCopyup(sbinfo, dentry); ++ else { ++ if (!IS_ROOT(dentry)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(sbinfo, dentry); ++ di_read_unlock(parent, !AuLock_IR); ++ } else ++ err = AuWbrCopyup(sbinfo, dentry); ++ } ++ bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else { ++ bcpup = args->force_btgt; ++ AuDebugOn(au_test_ro(sb, bcpup, dentry->d_inode)); ++ } ++ AuDbg("bstart %d, bcpup %d\n", bstart, bcpup); ++ if (bstart < bcpup) ++ au_update_dbrange(dentry, /*do_put_zero*/1); ++ ++ err = bcpup; ++ if (bcpup == bstart) ++ goto out; /* success */ ++ ++ /* copyup the new parent into the branch we process */ ++ err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart); ++ ++ out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin) ++{ ++ if (pin && pin->parent) ++ return au_h_dptr(pin->parent, pin->bindex); 
++ return NULL; ++} ++ ++void au_unpin(struct au_pin *p) ++{ ++ if (au_ftest_pin(p->flags, MNT_WRITE)) ++ mnt_drop_write(p->h_mnt); ++ if (!p->hdir) ++ return; ++ ++ au_hin_imtx_unlock(p->hdir); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ iput(p->hdir->hi_inode); ++ dput(p->parent); ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++} ++ ++int au_do_pin(struct au_pin *p) ++{ ++ int err; ++ struct super_block *sb; ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ struct inode *h_dir; ++ ++ err = 0; ++ sb = p->dentry->d_sb; ++ br = au_sbr(sb, p->bindex); ++ if (IS_ROOT(p->dentry)) { ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = br->br_mnt; ++ err = mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_err; ++ } ++ } ++ goto out; ++ } ++ ++ h_dentry = NULL; ++ if (p->bindex <= au_dbend(p->dentry)) ++ h_dentry = au_h_dptr(p->dentry, p->bindex); ++ ++ p->parent = dget_parent(p->dentry); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_lock(p->parent, AuLock_IR, p->lsc_di); ++ ++ h_dir = NULL; ++ h_parent = au_h_dptr(p->parent, p->bindex); ++ p->hdir = au_hi(p->parent->d_inode, p->bindex); ++ if (p->hdir) ++ h_dir = p->hdir->hi_inode; ++ ++ /* udba case */ ++ if (unlikely(!p->hdir || !h_dir)) { ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ dput(p->parent); ++ p->parent = NULL; ++ goto out_err; ++ } ++ ++ au_igrab(h_dir); ++ au_hin_imtx_lock_nested(p->hdir, p->lsc_hi); ++ ++ if (unlikely(p->hdir->hi_inode != h_parent->d_inode)) { ++ err = -EBUSY; ++ goto out_unpin; ++ } ++ if (h_dentry) { ++ err = au_h_verify(h_dentry, p->udba, h_dir, h_parent, br); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_unpin; ++ } ++ } ++ ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = br->br_mnt; ++ err = mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_unpin; ++ } ++ } ++ goto out; /* success */ ++ ++ out_unpin: ++ au_unpin(p); ++ out_err: ++ AuErr("err %d\n", err); ++ err = au_busy_or_stale(); ++ out: ++ return err; ++} ++ ++void au_pin_init(struct au_pin *p, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags) ++{ ++ p->dentry = dentry; ++ p->udba = udba; ++ p->lsc_di = lsc_di; ++ p->lsc_hi = lsc_hi; ++ p->flags = flags; ++ p->bindex = bindex; ++ ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++} ++ ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) ++{ ++ au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, ++ udba, flags); ++ return au_do_pin(pin); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuIcpup_DID_CPUP 1 ++#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) ++#define au_fset_icpup(flags, name) { (flags) |= AuIcpup_##name; } ++#define au_fclr_icpup(flags, name) { (flags) &= ~AuIcpup_##name; } ++ ++struct au_icpup_args { ++ unsigned char flags; ++ unsigned char pin_flags; ++ aufs_bindex_t btgt; ++ struct au_pin pin; ++ struct path h_path; ++ struct inode *h_inode; ++}; ++ ++static int au_lock_and_icpup(struct dentry *dentry, struct iattr *ia, ++ struct au_icpup_args *a) ++{ ++ int err; ++ unsigned int udba; ++ loff_t sz; ++ aufs_bindex_t bstart; ++ struct dentry *hi_wh, *parent; ++ struct inode *inode; ++ struct au_wr_dir_args 
wr_dir_args = { ++ .force_btgt = -1, ++ .flags = 0 ++ }; ++ ++ di_write_lock_child(dentry); ++ bstart = au_dbstart(dentry); ++ inode = dentry->d_inode; ++ if (S_ISDIR(inode->i_mode)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ /* plink or hi_wh() case */ ++ if (bstart != au_ibstart(inode)) ++ wr_dir_args.force_btgt = au_ibstart(inode); ++ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); ++ if (unlikely(err < 0)) ++ goto out_dentry; ++ a->btgt = err; ++ if (err != bstart) ++ au_fset_icpup(a->flags, DID_CPUP); ++ ++ err = 0; ++ a->pin_flags = AuPin_MNT_WRITE; ++ parent = NULL; ++ if (!IS_ROOT(dentry)) { ++ au_fset_pin(a->pin_flags, DI_LOCKED); ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ } ++ ++ udba = au_opt_udba(dentry->d_sb); ++ if (d_unhashed(dentry) || (ia->ia_valid & ATTR_FILE)) ++ udba = AuOpt_UDBA_NONE; ++ err = au_pin(&a->pin, dentry, a->btgt, udba, a->pin_flags); ++ if (unlikely(err)) { ++ if (parent) { ++ di_write_unlock(parent); ++ dput(parent); ++ } ++ goto out_dentry; ++ } ++ a->h_path.dentry = au_h_dptr(dentry, bstart); ++ a->h_inode = a->h_path.dentry->d_inode; ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ sz = -1; ++ if ((ia->ia_valid & ATTR_SIZE) && ia->ia_size < i_size_read(a->h_inode)) ++ sz = ia->ia_size; ++ ++ hi_wh = NULL; ++ if (au_ftest_icpup(a->flags, DID_CPUP) && d_unhashed(dentry)) { ++ hi_wh = au_hi_wh(inode, a->btgt); ++ if (!hi_wh) { ++ err = au_sio_cpup_wh(dentry, a->btgt, sz, /*file*/NULL); ++ if (unlikely(err)) ++ goto out_unlock; ++ hi_wh = au_hi_wh(inode, a->btgt); ++ /* todo: revalidate hi_wh? */ ++ } ++ } ++ ++ if (parent) { ++ au_pin_set_parent_lflag(&a->pin, /*lflag*/0); ++ di_downgrade_lock(parent, AuLock_IR); ++ dput(parent); ++ } ++ if (!au_ftest_icpup(a->flags, DID_CPUP)) ++ goto out; /* success */ ++ ++ if (!d_unhashed(dentry)) { ++ err = au_sio_cpup_simple(dentry, a->btgt, sz, AuCpup_DTIME); ++ if (!err) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ } else if (!hi_wh) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ else ++ a->h_path.dentry = hi_wh; /* do not dget here */ ++ ++ out_unlock: ++ mutex_unlock(&a->h_inode->i_mutex); ++ a->h_inode = a->h_path.dentry->d_inode; ++ if (!err) { ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ goto out; /* success */ ++ } ++ ++ au_unpin(&a->pin); ++ ++ out_dentry: ++ di_write_unlock(dentry); ++ out: ++ return err; ++} ++ ++static int aufs_setattr(struct dentry *dentry, struct iattr *ia) ++{ ++ int err; ++ struct inode *inode; ++ struct super_block *sb; ++ struct file *file; ++ struct au_icpup_args *a; ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ file = NULL; ++ if (ia->ia_valid & ATTR_FILE) { ++ /* currently ftruncate(2) only */ ++ file = ia->ia_file; ++ fi_write_lock(file); ++ ia->ia_file = au_h_fptr(file, au_fbstart(file)); ++ } ++ ++ if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ++ ia->ia_valid &= ~ATTR_MODE; ++ ++ err = au_lock_and_icpup(dentry, ia, a); ++ if (unlikely(err < 0)) ++ goto out_si; ++ if (au_ftest_icpup(a->flags, DID_CPUP)) { ++ ia->ia_file = NULL; ++ ia->ia_valid &= ~ATTR_FILE; ++ } ++ ++ a->h_path.mnt = au_sbr_mnt(sb, a->btgt); ++ if (ia->ia_valid & ATTR_SIZE) { ++ struct file *f; ++ ++ if (ia->ia_size < i_size_read(inode)) { ++ /* unmap only */ ++ err = vmtruncate(inode, ia->ia_size); ++ if (unlikely(err)) ++ goto out_unlock; ++ } ++ ++ f = 
NULL; ++ if (ia->ia_valid & ATTR_FILE) ++ f = ia->ia_file; ++ mutex_unlock(&a->h_inode->i_mutex); ++ err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ } else ++ err = vfsub_notify_change(&a->h_path, ia); ++ if (!err) ++ au_cpup_attr_changeable(inode); ++ ++ out_unlock: ++ mutex_unlock(&a->h_inode->i_mutex); ++ au_unpin(&a->pin); ++ di_write_unlock(dentry); ++ out_si: ++ if (file) { ++ fi_write_unlock(file); ++ ia->ia_file = file; ++ ia->ia_valid |= ATTR_FILE; ++ } ++ si_read_unlock(sb); ++ kfree(a); ++ out: ++ return err; ++} ++ ++static int au_getattr_lock_reval(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *parent; ++ ++ err = 0; ++ inode = dentry->d_inode; ++ di_write_lock_child(dentry); ++ if (au_digen(dentry) != sigen || au_iigen(inode) != sigen) { ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ /* returns a number of positive dentries */ ++ err = au_refresh_hdentry(dentry, inode->i_mode & S_IFMT); ++ if (err > 0) ++ err = au_refresh_hinode(inode, dentry); ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ if (unlikely(!err)) ++ err = -EIO; ++ } ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ di_read_unlock(dentry, AuLock_IR); ++ ++ return err; ++} ++ ++static void au_refresh_iattr(struct inode *inode, struct kstat *st, ++ unsigned int nlink) ++{ ++ inode->i_mode = st->mode; ++ inode->i_uid = st->uid; ++ inode->i_gid = st->gid; ++ inode->i_atime = st->atime; ++ inode->i_mtime = st->mtime; ++ inode->i_ctime = st->ctime; ++ ++ au_cpup_attr_nlink(inode, /*force*/0); ++ if (S_ISDIR(inode->i_mode)) { ++ inode->i_nlink -= nlink; ++ inode->i_nlink += st->nlink; ++ } ++ ++ spin_lock(&inode->i_lock); ++ inode->i_blocks = st->blocks; ++ i_size_write(inode, st->size); ++ spin_unlock(&inode->i_lock); ++} ++ ++static int aufs_getattr(struct vfsmount *mnt __maybe_unused, ++ struct dentry *dentry, struct kstat *st) ++{ ++ int err; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex; ++ unsigned char udba_none, positive; ++ struct super_block *sb, *h_sb; ++ struct inode *inode; ++ struct vfsmount *h_mnt; ++ struct dentry *h_dentry; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ si_read_lock(sb, AuLock_FLUSH); ++ mnt_flags = au_mntflags(sb); ++ udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); ++ ++ /* support fstat(2) */ ++ if (!d_unhashed(dentry) && !udba_none) { ++ unsigned int sigen = au_sigen(sb); ++ if (au_digen(dentry) == sigen && au_iigen(inode) == sigen) ++ di_read_lock_child(dentry, AuLock_IR); ++ else { ++ /* NFSD may skip the revalidation */ ++ if (!au_test_nfsd(current)) ++ AuDebugOn(!IS_ROOT(dentry)); ++ else { ++ err = au_busy_or_stale(); ++ if (unlikely(!IS_ROOT(dentry))) ++ goto out; ++ } ++ err = au_getattr_lock_reval(dentry, sigen); ++ if (unlikely(err)) ++ goto out; ++ } ++ } else ++ di_read_lock_child(dentry, AuLock_IR); ++ ++ bindex = au_ibstart(inode); ++ h_mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_mnt->mnt_sb; ++ if (!au_test_fs_bad_iattr(h_sb) && udba_none) ++ goto out_fill; /* success */ ++ ++ h_dentry = NULL; ++ if (au_dbstart(dentry) == bindex) ++ h_dentry = dget(au_h_dptr(dentry, bindex)); ++ else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { ++ h_dentry = au_plink_lkup(inode, bindex); ++ if (IS_ERR(h_dentry)) ++ goto out_fill; /* pretending success */ ++ } ++ /* illegally overlapped or something */ ++ if (unlikely(!h_dentry)) ++ goto out_fill; /* pretending 
success */ ++ ++ positive = !!h_dentry->d_inode; ++ if (positive) ++ err = vfs_getattr(h_mnt, h_dentry, st); ++ dput(h_dentry); ++ if (!err) { ++ if (positive) ++ au_refresh_iattr(inode, st, h_dentry->d_inode->i_nlink); ++ goto out_fill; /* success */ ++ } ++ goto out_unlock; ++ ++ out_fill: ++ generic_fillattr(inode, st); ++ out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int h_readlink(struct dentry *dentry, int bindex, char __user *buf, ++ int bufsiz) ++{ ++ int err; ++ struct super_block *sb; ++ struct dentry *h_dentry; ++ ++ err = -EINVAL; ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (unlikely(/* !h_dentry ++ || !h_dentry->d_inode ++ || !h_dentry->d_inode->i_op ++ || */ !h_dentry->d_inode->i_op->readlink)) ++ goto out; ++ ++ err = security_inode_readlink(h_dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ if (!au_test_ro(sb, bindex, dentry->d_inode)) { ++ vfsub_touch_atime(au_sbr_mnt(sb, bindex), h_dentry); ++ fsstack_copy_attr_atime(dentry->d_inode, h_dentry->d_inode); ++ } ++ err = h_dentry->d_inode->i_op->readlink(h_dentry, buf, bufsiz); ++ ++ out: ++ return err; ++} ++ ++static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) ++{ ++ int err; ++ ++ aufs_read_lock(dentry, AuLock_IR); ++ err = h_readlink(dentry, au_dbstart(dentry), buf, bufsiz); ++ aufs_read_unlock(dentry, AuLock_IR); ++ ++ return err; ++} ++ ++static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ int err; ++ char *buf; ++ mm_segment_t old_fs; ++ ++ err = -ENOMEM; ++ buf = __getname(); ++ if (unlikely(!buf)) ++ goto out; ++ ++ aufs_read_lock(dentry, AuLock_IR); ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = h_readlink(dentry, au_dbstart(dentry), (char __user *)buf, ++ PATH_MAX); ++ set_fs(old_fs); ++ aufs_read_unlock(dentry, AuLock_IR); ++ ++ if (err >= 0) { ++ buf[err] = 0; ++ /* will be freed by put_link */ ++ nd_set_link(nd, buf); ++ return NULL; /* success */ ++ } ++ __putname(buf); ++ ++ out: ++ path_put(&nd->path); ++ AuTraceErr(err); ++ return ERR_PTR(err); ++} ++ ++static void aufs_put_link(struct dentry *dentry __maybe_unused, ++ struct nameidata *nd, void *cookie __maybe_unused) ++{ ++ __putname(nd_get_link(nd)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void aufs_truncate_range(struct inode *inode __maybe_unused, ++ loff_t start __maybe_unused, ++ loff_t end __maybe_unused) ++{ ++ AuUnsupport(); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct inode_operations aufs_symlink_iop = { ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ .readlink = aufs_readlink, ++ .follow_link = aufs_follow_link, ++ .put_link = aufs_put_link ++}; ++ ++struct inode_operations aufs_dir_iop = { ++ .create = aufs_create, ++ .lookup = aufs_lookup, ++ .link = aufs_link, ++ .unlink = aufs_unlink, ++ .symlink = aufs_symlink, ++ .mkdir = aufs_mkdir, ++ .rmdir = aufs_rmdir, ++ .mknod = aufs_mknod, ++ .rename = aufs_rename, ++ ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr ++}; ++ ++struct inode_operations aufs_iop = { ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ .truncate_range = aufs_truncate_range ++}; +diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op_del.c 
linux-2.6.31/fs/aufs/i_op_del.c +--- linux-2.6.31-vanilla/fs/aufs/i_op_del.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/i_op_del.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,468 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (del entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * decide if a new whiteout for @dentry is necessary or not. ++ * when it is necessary, prepare the parent dir for the upper branch whose ++ * branch index is @bcpup for creation. the actual creation of the whiteout will ++ * be done by caller. ++ * return value: ++ * 0: wh is unnecessary ++ * plus: wh is necessary ++ * minus: error ++ */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) ++{ ++ int need_wh, err; ++ aufs_bindex_t bstart; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ if (*bcpup < 0) { ++ *bcpup = bstart; ++ if (au_test_ro(sb, bstart, dentry->d_inode)) { ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ *bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else ++ AuDebugOn(bstart < *bcpup ++ || au_test_ro(sb, *bcpup, dentry->d_inode)); ++ AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart); ++ ++ if (*bcpup != bstart) { ++ err = au_cpup_dirs(dentry, *bcpup); ++ if (unlikely(err)) ++ goto out; ++ need_wh = 1; ++ } else { ++ aufs_bindex_t old_bend, new_bend, bdiropq = -1; ++ ++ old_bend = au_dbend(dentry); ++ if (isdir) { ++ bdiropq = au_dbdiropq(dentry); ++ au_set_dbdiropq(dentry, -1); ++ } ++ need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0, ++ /*nd*/NULL); ++ err = need_wh; ++ if (isdir) ++ au_set_dbdiropq(dentry, bdiropq); ++ if (unlikely(err < 0)) ++ goto out; ++ new_bend = au_dbend(dentry); ++ if (!need_wh && old_bend != new_bend) { ++ au_set_h_dptr(dentry, new_bend, NULL); ++ au_set_dbend(dentry, old_bend); ++ } ++ } ++ AuDbg("need_wh %d\n", need_wh); ++ err = need_wh; ++ ++ out: ++ return err; ++} ++ ++/* ++ * simple tests for the del-entry operations. ++ * following the checks in vfs, plus the parent-child relationship. 
++ */ ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry, *h_latest; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ h_inode = h_dentry->d_inode; ++ if (dentry->d_inode) { ++ err = -ENOENT; ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(h_inode)) ++ goto out; ++ } ++ ++ err = -ENOENT; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ goto out; ++ err = 0; ++ ++ /* ++ * rmdir a dir may break the consistency on some filesystem. ++ * let's try heavy test. ++ */ ++ err = -EACCES; ++ if (unlikely(au_test_h_perm(h_parent->d_inode, MAY_EXEC | MAY_WRITE))) ++ goto out; ++ ++ h_latest = au_sio_lkup_one(&dentry->d_name, h_parent, ++ au_sbr(dentry->d_sb, bindex)); ++ err = -EIO; ++ if (IS_ERR(h_latest)) ++ goto out; ++ if (h_latest == h_dentry) ++ err = 0; ++ dput(h_latest); ++ ++ out: ++ return err; ++} ++ ++/* ++ * decide the branch where we operate for @dentry. the branch index will be set ++ * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent ++ * dir for reverting. ++ * when a new whiteout is necessary, create it. ++ */ ++static struct dentry* ++lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, ++ struct au_dtime *dt, struct au_pin *pin) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ struct path h_path; ++ int err, need_wh; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); ++ wh_dentry = ERR_PTR(need_wh); ++ if (unlikely(need_wh < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ bcpup = *rbcpup; ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path.dentry = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbstart(dentry) == bcpup) { ++ err = au_may_del(dentry, bcpup, h_path.dentry, isdir); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ } ++ ++ h_path.mnt = au_sbr_mnt(sb, bcpup); ++ au_dtime_store(dt, au_pinned_parent(pin), &h_path); ++ wh_dentry = NULL; ++ if (!need_wh) ++ goto out; /* success, no need to create whiteout */ ++ ++ wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); ++ if (!IS_ERR(wh_dentry)) ++ goto out; /* success */ ++ /* returns with the parent is locked and wh_dentry is dget-ed */ ++ ++ out_unpin: ++ au_unpin(pin); ++ out: ++ return wh_dentry; ++} ++ ++/* ++ * when removing a dir, rename it to a unique temporary whiteout-ed name first ++ * in order to be revertible and save time for removing many child whiteouts ++ * under the dir. ++ * returns 1 when there are too many child whiteout and caller should remove ++ * them asynchronously. returns 0 when the number of children is enough small to ++ * remove now or the branch fs is a remote fs. ++ * otherwise return an error. 
++ */ ++static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, ++ struct au_nhash *whlist, struct inode *dir) ++{ ++ int rmdir_later, err, dirwh; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ SiMustAnyLock(sb); ++ h_dentry = au_h_dptr(dentry, bindex); ++ err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); ++ if (unlikely(err)) ++ goto out; ++ ++ /* stop monitoring */ ++ au_hin_free(au_hi(dentry->d_inode, bindex)); ++ ++ if (!au_test_fs_remote(h_dentry->d_sb)) { ++ dirwh = au_sbi(sb)->si_dirwh; ++ rmdir_later = (dirwh <= 1); ++ if (!rmdir_later) ++ rmdir_later = au_nhash_test_longer_wh(whlist, bindex, ++ dirwh); ++ if (rmdir_later) ++ return rmdir_later; ++ } ++ ++ err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); ++ if (unlikely(err)) { ++ AuIOErr("rmdir %.*s, b%d failed, %d. ignored\n", ++ AuDLNPair(h_dentry), bindex, err); ++ err = 0; ++ } ++ ++ out: ++ return err; ++} ++ ++/* ++ * final procedure for deleting a entry. ++ * maintain dentry and iattr. ++ */ ++static void epilog(struct inode *dir, struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ d_drop(dentry); ++ inode->i_ctime = dir->i_ctime; ++ ++ if (atomic_read(&dentry->d_count) == 1) { ++ au_set_h_dptr(dentry, au_dbstart(dentry), NULL); ++ au_update_dbstart(dentry); ++ } ++ if (au_ibstart(dir) == bindex) ++ au_cpup_attr_timesizes(dir); ++ dir->i_version++; ++} ++ ++/* ++ * when an error happened, remove the created whiteout and revert everything. ++ */ ++static int do_revert(int err, struct inode *dir, aufs_bindex_t bwh, ++ struct dentry *wh_dentry, struct dentry *dentry, ++ struct au_dtime *dt) ++{ ++ int rerr; ++ struct path h_path = { ++ .dentry = wh_dentry, ++ .mnt = au_sbr_mnt(dir->i_sb, bwh) ++ }; ++ ++ rerr = au_wh_unlink_dentry(au_h_iptr(dir, bwh), &h_path, dentry); ++ if (!rerr) { ++ au_set_dbwh(dentry, bwh); ++ au_dtime_revert(dt); ++ return 0; ++ } ++ ++ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bindex, bstart; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ struct inode *inode, *h_dir; ++ struct dentry *parent, *wh_dentry; ++ ++ IMustLock(dir); ++ inode = dentry->d_inode; ++ if (unlikely(!inode)) ++ return -ENOENT; /* possible? 
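/*
 * [editor's note] do_revert() above undoes a half-done unlink: the
 * recreated whiteout is removed and au_dtime_revert() puts back the
 * parent timestamps saved beforehand, so a failed syscall leaves no
 * trace. A userspace analogue of that save/restore pair, using stat(2)
 * and utimensat(2) (illustrative):
 */
#include <fcntl.h>
#include <sys/stat.h>

struct saved_times { struct timespec ts[2]; };

/* remember the dir's atime/mtime before modifying anything under it */
static int dtime_store(const char *dir, struct saved_times *st)
{
        struct stat sb;

        if (stat(dir, &sb))
                return -1;
        st->ts[0] = sb.st_atim;
        st->ts[1] = sb.st_mtim;
        return 0;
}

/* roll the timestamps back after a failed operation */
static int dtime_revert(const char *dir, const struct saved_times *st)
{
        return utimensat(AT_FDCWD, dir, st->ts, 0);
}

int main(void)
{
        struct saved_times st;

        if (dtime_store(".", &st) == 0)
                return dtime_revert(".", &st);
        return 1;
}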
*/ ++ IMustLock(inode); ++ ++ aufs_read_lock(dentry, AuLock_DW); ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ ++ bstart = au_dbstart(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &dt, &pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); ++ h_path.dentry = au_h_dptr(dentry, bstart); ++ dget(h_path.dentry); ++ if (bindex == bstart) { ++ h_dir = au_pinned_h_dir(&pin); ++ err = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ } else { ++ /* dir inode is locked */ ++ h_dir = wh_dentry->d_parent->d_inode; ++ IMustLock(h_dir); ++ err = 0; ++ } ++ ++ if (!err) { ++ drop_nlink(inode); ++ epilog(dir, dentry, bindex); ++ ++ /* update target timestamps */ ++ if (bindex == bstart) { ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++ inode->i_ctime = h_path.dentry->d_inode->i_ctime; ++ } else ++ /* todo: this timestamp may be reverted later */ ++ inode->i_ctime = h_dir->i_ctime; ++ goto out_unlock; /* success */ ++ } ++ ++ /* revert */ ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bwh, wh_dentry, dentry, &dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++ out_unlock: ++ au_unpin(&pin); ++ dput(wh_dentry); ++ dput(h_path.dentry); ++ out: ++ di_write_unlock(parent); ++ aufs_read_unlock(dentry, AuLock_DW); ++ return err; ++} ++ ++int aufs_rmdir(struct inode *dir, struct dentry *dentry) ++{ ++ int err, rmdir_later; ++ aufs_bindex_t bwh, bindex, bstart; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct inode *inode; ++ struct dentry *parent, *wh_dentry, *h_dentry; ++ struct au_whtmp_rmdir *args; ++ ++ IMustLock(dir); ++ inode = dentry->d_inode; ++ err = -ENOENT; /* possible? 
*/ ++ if (unlikely(!inode)) ++ goto out; ++ IMustLock(inode); ++ ++ aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH); ++ err = -ENOMEM; ++ args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); ++ if (unlikely(!args)) ++ goto out_unlock; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ err = au_test_empty(dentry, &args->whlist); ++ if (unlikely(err)) ++ goto out_args; ++ ++ bstart = au_dbstart(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &dt, &pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_args; ++ ++ h_dentry = au_h_dptr(dentry, bstart); ++ dget(h_dentry); ++ rmdir_later = 0; ++ if (bindex == bstart) { ++ err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir); ++ if (err > 0) { ++ rmdir_later = err; ++ err = 0; ++ } ++ } else { ++ /* stop monitoring */ ++ au_hin_free(au_hi(inode, bstart)); ++ ++ /* dir inode is locked */ ++ IMustLock(wh_dentry->d_parent->d_inode); ++ err = 0; ++ } ++ ++ if (!err) { ++ clear_nlink(inode); ++ au_set_dbdiropq(dentry, -1); ++ epilog(dir, dentry, bindex); ++ ++ if (rmdir_later) { ++ au_whtmp_kick_rmdir(dir, bstart, h_dentry, args); ++ args = NULL; ++ } ++ ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ AuLabel(revert); ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bwh, wh_dentry, dentry, &dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++ out_unpin: ++ au_unpin(&pin); ++ dput(wh_dentry); ++ dput(h_dentry); ++ out_args: ++ di_write_unlock(parent); ++ if (args) ++ au_whtmp_rmdir_free(args); ++ out_unlock: ++ aufs_read_unlock(dentry, AuLock_DW); ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/i_op_ren.c linux-2.6.31/fs/aufs/i_op_ren.c +--- linux-2.6.31-vanilla/fs/aufs/i_op_ren.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/i_op_ren.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,957 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operation (rename entry) ++ * todo: this is crazy monster ++ */ ++ ++#include "aufs.h" ++ ++enum { AuSRC, AuDST, AuSrcDst }; ++enum { AuPARENT, AuCHILD, AuParentChild }; ++ ++#define AuRen_ISDIR 1 ++#define AuRen_ISSAMEDIR (1 << 1) ++#define AuRen_WHSRC (1 << 2) ++#define AuRen_WHDST (1 << 3) ++#define AuRen_MNT_WRITE (1 << 4) ++#define AuRen_DT_DSTDIR (1 << 5) ++#define AuRen_DIROPQ (1 << 6) ++#define AuRen_CPUP (1 << 7) ++#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) ++#define au_fset_ren(flags, name) { (flags) |= AuRen_##name; } ++#define au_fclr_ren(flags, name) { (flags) &= ~AuRen_##name; } ++ ++struct au_ren_args { ++ struct { ++ struct dentry *dentry, *h_dentry, *parent, *h_parent, ++ *wh_dentry; ++ struct inode *dir, *inode; ++ struct au_hinode *hdir; ++ struct au_dtime dt[AuParentChild]; ++ aufs_bindex_t bstart; ++ } sd[AuSrcDst]; ++ ++#define src_dentry sd[AuSRC].dentry ++#define src_dir sd[AuSRC].dir ++#define src_inode sd[AuSRC].inode ++#define src_h_dentry sd[AuSRC].h_dentry ++#define src_parent sd[AuSRC].parent ++#define src_h_parent sd[AuSRC].h_parent ++#define src_wh_dentry sd[AuSRC].wh_dentry ++#define src_hdir sd[AuSRC].hdir ++#define src_h_dir sd[AuSRC].hdir->hi_inode ++#define src_dt sd[AuSRC].dt ++#define src_bstart sd[AuSRC].bstart ++ ++#define dst_dentry sd[AuDST].dentry ++#define dst_dir sd[AuDST].dir ++#define dst_inode sd[AuDST].inode ++#define dst_h_dentry sd[AuDST].h_dentry ++#define dst_parent sd[AuDST].parent ++#define dst_h_parent sd[AuDST].h_parent ++#define dst_wh_dentry sd[AuDST].wh_dentry ++#define dst_hdir sd[AuDST].hdir ++#define dst_h_dir sd[AuDST].hdir->hi_inode ++#define dst_dt sd[AuDST].dt ++#define dst_bstart sd[AuDST].bstart ++ ++ struct dentry *h_trap; ++ struct au_branch *br; ++ struct au_hinode *src_hinode; ++ struct path h_path; ++ struct au_nhash whlist; ++ aufs_bindex_t btgt; ++ ++ unsigned int flags; ++ ++ struct au_whtmp_rmdir *thargs; ++ struct dentry *h_dst; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * functions for reverting. ++ * when an error happened in a single rename systemcall, we should revert ++ * everything as if nothing happend. ++ * we don't need to revert the copied-up/down the parent dir since they are ++ * harmless. ++ */ ++ ++#define RevertFailure(fmt, args...) 
do { \ ++ AuIOErr("revert failure: " fmt " (%d, %d)\n", \ ++ ##args, err, rerr); \ ++ err = -EIO; \ ++} while (0) ++ ++static void au_ren_rev_diropq(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ au_hin_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(a->src_dentry, a->btgt); ++ au_hin_imtx_unlock(a->src_hinode); ++ if (rerr) ++ RevertFailure("remove diropq %.*s", AuDLNPair(a->src_dentry)); ++} ++ ++ ++static void au_ren_rev_rename(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = au_lkup_one(&a->src_dentry->d_name, a->src_h_parent, ++ a->br, /*nd*/NULL); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("au_lkup_one %.*s", AuDLNPair(a->src_dentry)); ++ return; ++ } ++ ++ rerr = vfsub_rename(a->dst_h_dir, ++ au_h_dptr(a->src_dentry, a->btgt), ++ a->src_h_dir, &a->h_path); ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ ++ if (rerr) ++ RevertFailure("rename %.*s", AuDLNPair(a->src_dentry)); ++} ++ ++static void au_ren_rev_cpup(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = a->dst_h_dentry; ++ rerr = vfsub_unlink(a->dst_h_dir, &a->h_path, /*force*/0); ++ au_set_h_dptr(a->src_dentry, a->btgt, NULL); ++ au_set_dbstart(a->src_dentry, a->src_bstart); ++ if (rerr) ++ RevertFailure("unlink %.*s", AuDLNPair(a->dst_h_dentry)); ++} ++ ++ ++static void au_ren_rev_whtmp(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = au_lkup_one(&a->dst_dentry->d_name, a->dst_h_parent, ++ a->br, /*nd*/NULL); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("lookup %.*s", AuDLNPair(a->dst_dentry)); ++ return; ++ } ++ if (a->h_path.dentry->d_inode) { ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ return; ++ } ++ ++ rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path); ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ if (!rerr) { ++ au_set_h_dptr(a->dst_dentry, a->btgt, NULL); ++ au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); ++ } else ++ RevertFailure("rename %.*s", AuDLNPair(a->h_dst)); ++} ++ ++static void au_ren_rev_whsrc(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = a->src_wh_dentry; ++ rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); ++ if (rerr) ++ RevertFailure("unlink %.*s", AuDLNPair(a->src_wh_dentry)); ++} ++ ++static void au_ren_rev_drop(struct au_ren_args *a) ++{ ++ struct dentry *d, *h_d; ++ int i; ++ aufs_bindex_t bend, bindex; ++ ++ for (i = 0; i < AuSrcDst; i++) { ++ d = a->sd[i].dentry; ++ d_drop(d); ++ bend = au_dbend(d); ++ for (bindex = au_dbstart(d); bindex <= bend; bindex++) { ++ h_d = au_h_dptr(d, bindex); ++ if (h_d) ++ d_drop(h_d); ++ } ++ } ++ ++ au_update_dbstart(a->dst_dentry); ++ if (a->thargs) ++ d_drop(a->h_dst); ++} ++#undef RevertFailure ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * when we have to copyup the renaming entry, do it with the rename-target name ++ * in order to minimize the cost (the later actual rename is unnecessary). ++ * otherwise rename it on the target branch. 
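/*
 * [editor's note] au_ren_or_cpup() below saves one step: when the source
 * is not yet on the target branch it is copied up directly *under the
 * destination name*, so no second rename is needed. The choice, boiled
 * down (illustrative):
 */
#include <stdio.h>

enum ren_action { DO_RENAME, DO_CPUP_AS_TARGET };

static enum ren_action ren_or_cpup(int src_start, int target)
{
        if (src_start == target)
                return DO_RENAME;       /* already there: rename in place */
        return DO_CPUP_AS_TARGET;       /* one copy-up, no extra rename */
}

int main(void)
{
        printf("%d %d\n", ren_or_cpup(0, 0), ren_or_cpup(2, 0)); /* 0 1 */
        return 0;
}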
++ */ ++static int au_ren_or_cpup(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d; ++ ++ d = a->src_dentry; ++ if (au_dbstart(d) == a->btgt) { ++ a->h_path.dentry = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, DIROPQ) ++ && au_dbdiropq(d) == a->btgt) ++ au_fclr_ren(a->flags, DIROPQ); ++ AuDebugOn(au_dbstart(d) != a->btgt); ++ err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), ++ a->dst_h_dir, &a->h_path); ++ } else { ++ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; ++ ++ au_fset_ren(a->flags, CPUP); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_set_dbstart(d, a->btgt); ++ au_set_h_dptr(d, a->btgt, dget(a->dst_h_dentry)); ++ err = au_sio_cpup_single(d, a->btgt, a->src_bstart, -1, ++ !AuCpup_DTIME, a->dst_parent); ++ if (unlikely(err)) { ++ au_set_h_dptr(d, a->btgt, NULL); ++ au_set_dbstart(d, a->src_bstart); ++ } ++ mutex_unlock(h_mtx); ++ } ++ ++ return err; ++} ++ ++/* cf. aufs_rmdir() */ ++static int au_ren_del_whtmp(struct au_ren_args *a) ++{ ++ int err; ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ SiMustAnyLock(dir->i_sb); ++ if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, ++ au_sbi(dir->i_sb)->si_dirwh) ++ || au_test_fs_remote(a->h_dst->d_sb)) { ++ err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); ++ if (unlikely(err)) ++ AuWarn("failed removing whtmp dir %.*s (%d), " ++ "ignored.\n", AuDLNPair(a->h_dst), err); ++ } else { ++ au_nhash_wh_free(&a->thargs->whlist); ++ a->thargs->whlist = a->whlist; ++ a->whlist.nh_num = 0; ++ au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); ++ dput(a->h_dst); ++ a->thargs = NULL; ++ } ++ ++ return 0; ++} ++ ++/* make it 'opaque' dir. */ ++static int au_ren_diropq(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *diropq; ++ ++ err = 0; ++ a->src_hinode = au_hi(a->src_inode, a->btgt); ++ au_hin_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ diropq = au_diropq_create(a->src_dentry, a->btgt); ++ au_hin_imtx_unlock(a->src_hinode); ++ if (IS_ERR(diropq)) ++ err = PTR_ERR(diropq); ++ dput(diropq); ++ ++ return err; ++} ++ ++static int do_rename(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d, *h_d; ++ ++ /* prepare workqueue args for asynchronous rmdir */ ++ h_d = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) && h_d->d_inode) { ++ err = -ENOMEM; ++ a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); ++ if (unlikely(!a->thargs)) ++ goto out; ++ a->h_dst = dget(h_d); ++ } ++ ++ /* create whiteout for src_dentry */ ++ if (au_ftest_ren(a->flags, WHSRC)) { ++ a->src_wh_dentry ++ = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); ++ err = PTR_ERR(a->src_wh_dentry); ++ if (IS_ERR(a->src_wh_dentry)) ++ goto out_thargs; ++ } ++ ++ /* lookup whiteout for dentry */ ++ if (au_ftest_ren(a->flags, WHDST)) { ++ h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, ++ a->br); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out_whsrc; ++ if (!h_d->d_inode) ++ dput(h_d); ++ else ++ a->dst_wh_dentry = h_d; ++ } ++ ++ /* rename dentry to tmpwh */ ++ if (a->thargs) { ++ err = au_whtmp_ren(a->dst_h_dentry, a->br); ++ if (unlikely(err)) ++ goto out_whdst; ++ ++ d = a->dst_dentry; ++ au_set_h_dptr(d, a->btgt, NULL); ++ err = au_lkup_neg(d, a->btgt); ++ if (unlikely(err)) ++ goto out_whtmp; ++ a->dst_h_dentry = au_h_dptr(d, a->btgt); ++ } ++ ++ /* cpup src */ ++ if (a->dst_h_dentry->d_inode && a->src_bstart != a->btgt) { ++ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; ++ ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ err = au_sio_cpup_simple(a->src_dentry, 
a->btgt, -1, ++ !AuCpup_DTIME); ++ mutex_unlock(h_mtx); ++ if (unlikely(err)) ++ goto out_whtmp; ++ } ++ ++ /* rename by vfs_rename or cpup */ ++ d = a->dst_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) ++ && (a->dst_wh_dentry ++ || au_dbdiropq(d) == a->btgt ++ /* hide the lower to keep xino */ ++ || a->btgt < au_dbend(d) ++ || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) ++ au_fset_ren(a->flags, DIROPQ); ++ err = au_ren_or_cpup(a); ++ if (unlikely(err)) ++ /* leave the copied-up one */ ++ goto out_whtmp; ++ ++ /* make dir opaque */ ++ if (au_ftest_ren(a->flags, DIROPQ)) { ++ err = au_ren_diropq(a); ++ if (unlikely(err)) ++ goto out_rename; ++ } ++ ++ /* update target timestamps */ ++ AuDebugOn(au_dbstart(a->src_dentry) != a->btgt); ++ a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); ++ vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ ++ a->src_inode->i_ctime = a->h_path.dentry->d_inode->i_ctime; ++ ++ /* remove whiteout for dentry */ ++ if (a->dst_wh_dentry) { ++ a->h_path.dentry = a->dst_wh_dentry; ++ err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, ++ a->dst_dentry); ++ if (unlikely(err)) ++ goto out_diropq; ++ } ++ ++ /* remove whtmp */ ++ if (a->thargs) ++ au_ren_del_whtmp(a); /* ignore this error */ ++ ++ err = 0; ++ goto out_success; ++ ++ out_diropq: ++ if (au_ftest_ren(a->flags, DIROPQ)) ++ au_ren_rev_diropq(err, a); ++ out_rename: ++ if (!au_ftest_ren(a->flags, CPUP)) ++ au_ren_rev_rename(err, a); ++ else ++ au_ren_rev_cpup(err, a); ++ out_whtmp: ++ if (a->thargs) ++ au_ren_rev_whtmp(err, a); ++ out_whdst: ++ dput(a->dst_wh_dentry); ++ a->dst_wh_dentry = NULL; ++ out_whsrc: ++ if (a->src_wh_dentry) ++ au_ren_rev_whsrc(err, a); ++ au_ren_rev_drop(a); ++ out_success: ++ dput(a->src_wh_dentry); ++ dput(a->dst_wh_dentry); ++ out_thargs: ++ if (a->thargs) { ++ dput(a->h_dst); ++ au_whtmp_rmdir_free(a->thargs); ++ a->thargs = NULL; ++ } ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if @dentry dir can be rename destination or not. ++ * success means, it is a logically empty dir. ++ */ ++static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ return au_test_empty(dentry, whlist); ++} ++ ++/* ++ * test if @dentry dir can be rename source or not. ++ * if it can, return 0 and @children is filled. ++ * success means, ++ * - it is a logically empty dir. ++ * - or, it exists on writable branch and has no children including whiteouts ++ * on the lower branch. 
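/*
 * [editor's note] "Logically empty" above means: every child a lower
 * branch would contribute is masked by a whiteout, so the union shows
 * none. A toy check over a parallel whiteout array (illustrative):
 */
#include <stdio.h>

/* wh[i] != 0 when lower child i is masked by a whiteout */
static int logically_empty(int nchildren, const int *wh)
{
        int i;

        for (i = 0; i < nchildren; i++)
                if (!wh[i])
                        return 0;       /* an unmasked child is visible */
        return 1;
}

int main(void)
{
        int wh[] = { 1, 1, 0 };
        /* prints "0 1": the third child is not whited-out */
        printf("%d %d\n", logically_empty(3, wh), logically_empty(2, wh));
        return 0;
}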
++ */ ++static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t bstart; ++ ++ bstart = au_dbstart(dentry); ++ if (bstart != btgt) { ++ struct au_nhash whlist; ++ ++ SiMustAnyLock(dentry->d_sb); ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, ++ dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_test_empty(dentry, &whlist); ++ au_nhash_wh_free(&whlist); ++ goto out; ++ } ++ ++ if (bstart == au_dbtaildir(dentry)) ++ return 0; /* success */ ++ ++ err = au_test_empty_lower(dentry); ++ ++ out: ++ if (err == -ENOTEMPTY) { ++ AuWarn1("renaming dir who has child(ren) on multiple branches," ++ " is not supported\n"); ++ err = -EXDEV; ++ } ++ return err; ++} ++ ++/* side effect: sets whlist and h_dentry */ ++static int au_ren_may_dir(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int rdhash; ++ struct dentry *d; ++ ++ d = a->dst_dentry; ++ SiMustAnyLock(d->d_sb); ++ ++ err = 0; ++ if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { ++ rdhash = au_sbi(d->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); ++ err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ au_set_dbstart(d, a->dst_bstart); ++ err = may_rename_dstdir(d, &a->whlist); ++ au_set_dbstart(d, a->btgt); ++ } ++ a->dst_h_dentry = au_h_dptr(d, au_dbstart(d)); ++ if (unlikely(err)) ++ goto out; ++ ++ d = a->src_dentry; ++ a->src_h_dentry = au_h_dptr(d, au_dbstart(d)); ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ err = may_rename_srcdir(d, a->btgt); ++ if (unlikely(err)) { ++ au_nhash_wh_free(&a->whlist); ++ a->whlist.nh_num = 0; ++ } ++ } ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * simple tests for rename. ++ * following the checks in vfs, plus the parent-child relationship. 
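 * (sketch) e.g. when the source already lives on the target branch, deletability is re-checked via au_may_del() and the dentry must not be the deadlock trap returned by vfsub_lock_rename(); an existing destination is verified the same way, and a racy -ENOENT/-EEXIST from the lower fs is mapped to -EIO.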
++ */ ++static int au_may_ren(struct au_ren_args *a) ++{ ++ int err, isdir; ++ struct inode *h_inode; ++ ++ if (a->src_bstart == a->btgt) { ++ err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent, ++ au_ftest_ren(a->flags, ISDIR)); ++ if (unlikely(err)) ++ goto out; ++ err = -EINVAL; ++ if (unlikely(a->src_h_dentry == a->h_trap)) ++ goto out; ++ } ++ ++ err = 0; ++ if (a->dst_bstart != a->btgt) ++ goto out; ++ ++ err = -EIO; ++ h_inode = a->dst_h_dentry->d_inode; ++ isdir = !!au_ftest_ren(a->flags, ISDIR); ++ if (!a->dst_dentry->d_inode) { ++ if (unlikely(h_inode)) ++ goto out; ++ err = au_may_add(a->dst_dentry, a->btgt, a->dst_h_parent, ++ isdir); ++ } else { ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ err = au_may_del(a->dst_dentry, a->btgt, a->dst_h_parent, ++ isdir); ++ if (unlikely(err)) ++ goto out; ++ err = -ENOTEMPTY; ++ if (unlikely(a->dst_h_dentry == a->h_trap)) ++ goto out; ++ err = 0; ++ } ++ ++ out: ++ if (unlikely(err == -ENOENT || err == -EEXIST)) ++ err = -EIO; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * locking order ++ * (VFS) ++ * - src_dir and dir by lock_rename() ++ * - inode if exists ++ * (aufs) ++ * - lock all ++ * + src_dentry and dentry by aufs_read_and_write_lock2() which calls, ++ * + si_read_lock ++ * + di_write_lock2_child() ++ * + di_write_lock_child() ++ * + ii_write_lock_child() ++ * + di_write_lock_child2() ++ * + ii_write_lock_child2() ++ * + src_parent and parent ++ * + di_write_lock_parent() ++ * + ii_write_lock_parent() ++ * + di_write_lock_parent2() ++ * + ii_write_lock_parent2() ++ * + lower src_dir and dir by vfsub_lock_rename() ++ * + verify every relationship between child and parent. if any ++ * of them fails, unlock all and return -EBUSY.
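 * as a worked example (a sketch, same-directory case): aufs_rename() below takes the si/di/ii locks via aufs_read_and_write_lock2() and di_write_lock_parent(), then au_ren_lock() locks the lower parents with vfsub_lock_rename() and validates them with au_h_verify(); only after that does do_rename() touch the branch fs.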
++ */ ++static void au_ren_unlock(struct au_ren_args *a) ++{ ++ struct super_block *sb; ++ ++ sb = a->dst_dentry->d_sb; ++ if (au_ftest_ren(a->flags, MNT_WRITE)) ++ mnt_drop_write(a->br->br_mnt); ++ vfsub_unlock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++} ++ ++static int au_ren_lock(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int udba; ++ ++ err = 0; ++ a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); ++ a->src_hdir = au_hi(a->src_dir, a->btgt); ++ a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); ++ a->dst_hdir = au_hi(a->dst_dir, a->btgt); ++ a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++ udba = au_opt_udba(a->src_dentry->d_sb); ++ if (unlikely(a->src_hdir->hi_inode != a->src_h_parent->d_inode ++ || a->dst_hdir->hi_inode != a->dst_h_parent->d_inode)) ++ err = au_busy_or_stale(); ++ if (!err && au_dbstart(a->src_dentry) == a->btgt) ++ err = au_h_verify(a->src_h_dentry, udba, ++ a->src_h_parent->d_inode, a->src_h_parent, ++ a->br); ++ if (!err && au_dbstart(a->dst_dentry) == a->btgt) ++ err = au_h_verify(a->dst_h_dentry, udba, ++ a->dst_h_parent->d_inode, a->dst_h_parent, ++ a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_fset_ren(a->flags, MNT_WRITE); ++ goto out; /* success */ ++ } ++ ++ err = au_busy_or_stale(); ++ ++ out_unlock: ++ au_ren_unlock(a); ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_ren_refresh_dir(struct au_ren_args *a) ++{ ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ /* is this updating defined in POSIX? */ ++ au_cpup_attr_timesizes(a->src_inode); ++ au_cpup_attr_nlink(dir, /*force*/1); ++ if (a->dst_inode) { ++ clear_nlink(a->dst_inode); ++ au_cpup_attr_timesizes(a->dst_inode); ++ } ++ } ++ if (au_ibstart(dir) == a->btgt) ++ au_cpup_attr_timesizes(dir); ++ ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ return; ++ ++ dir = a->src_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_cpup_attr_nlink(dir, /*force*/1); ++ if (au_ibstart(dir) == a->btgt) ++ au_cpup_attr_timesizes(dir); ++} ++ ++static void au_ren_refresh(struct au_ren_args *a) ++{ ++ aufs_bindex_t bend, bindex; ++ struct dentry *d, *h_d; ++ struct inode *i, *h_i; ++ struct super_block *sb; ++ ++ d = a->src_dentry; ++ au_set_dbwh(d, -1); ++ bend = au_dbend(d); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { ++ h_d = au_h_dptr(d, bindex); ++ if (h_d) ++ au_set_h_dptr(d, bindex, NULL); ++ } ++ au_set_dbend(d, a->btgt); ++ ++ sb = d->d_sb; ++ i = a->src_inode; ++ if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) ++ return; /* success */ ++ ++ bend = au_ibend(i); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { ++ h_i = au_h_iptr(i, bindex); ++ if (h_i) { ++ au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ au_set_h_iptr(i, bindex, NULL, 0); ++ } ++ } ++ au_set_ibend(i, a->btgt); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mainly for link(2) and rename(2) */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ aufs_bindex_t bdiropq, bwh; ++ struct dentry *parent; ++ struct au_branch *br; ++ ++ parent = dentry->d_parent; ++ IMustLock(parent->d_inode); /* dir is locked */ ++ ++ bdiropq = au_dbdiropq(parent); ++ bwh = au_dbwh(dentry); ++ br = au_sbr(dentry->d_sb, btgt); ++ if 
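/* reject btgt when the branch is read-only, or when the parent's diropq or the dentry's whiteout lives on a higher branch (smaller index) and would hide the result: */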
(au_br_rdonly(br) ++ || (0 <= bdiropq && bdiropq < btgt) ++ || (0 <= bwh && bwh < btgt)) ++ btgt = -1; ++ ++ AuDbg("btgt %d\n", btgt); ++ return btgt; ++} ++ ++/* sets src_bstart, dst_bstart and btgt */ ++static int au_ren_wbr(struct au_ren_args *a) ++{ ++ int err; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ a->src_bstart = au_dbstart(a->src_dentry); ++ a->dst_bstart = au_dbstart(a->dst_dentry); ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ wr_dir_args.force_btgt = a->src_bstart; ++ if (a->dst_inode && a->dst_bstart < a->src_bstart) ++ wr_dir_args.force_btgt = a->dst_bstart; ++ wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); ++ err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); ++ a->btgt = err; ++ ++ return err; ++} ++ ++static void au_ren_dt(struct au_ren_args *a) ++{ ++ a->h_path.dentry = a->src_h_parent; ++ au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) { ++ a->h_path.dentry = a->dst_h_parent; ++ au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); ++ } ++ ++ au_fclr_ren(a->flags, DT_DSTDIR); ++ if (!au_ftest_ren(a->flags, ISDIR)) ++ return; ++ ++ a->h_path.dentry = a->src_h_dentry; ++ au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); ++ if (a->dst_h_dentry->d_inode) { ++ au_fset_ren(a->flags, DT_DSTDIR); ++ a->h_path.dentry = a->dst_h_dentry; ++ au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); ++ } ++} ++ ++static void au_ren_rev_dt(int err, struct au_ren_args *a) ++{ ++ struct dentry *h_d; ++ struct mutex *h_mtx; ++ ++ au_dtime_revert(a->src_dt + AuPARENT); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) ++ au_dtime_revert(a->dst_dt + AuPARENT); ++ ++ if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { ++ h_d = a->src_dt[AuCHILD].dt_h_path.dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_dtime_revert(a->src_dt + AuCHILD); ++ mutex_unlock(h_mtx); ++ ++ if (au_ftest_ren(a->flags, DT_DSTDIR)) { ++ h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_dtime_revert(a->dst_dt + AuCHILD); ++ mutex_unlock(h_mtx); ++ } ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, ++ struct inode *_dst_dir, struct dentry *_dst_dentry) ++{ ++ int err; ++ /* reduce stack space */ ++ struct au_ren_args *a; ++ ++ IMustLock(_src_dir); ++ IMustLock(_dst_dir); ++ ++ err = -ENOMEM; ++ BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->src_dir = _src_dir; ++ a->src_dentry = _src_dentry; ++ a->src_inode = a->src_dentry->d_inode; ++ a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */ ++ a->dst_dir = _dst_dir; ++ a->dst_dentry = _dst_dentry; ++ a->dst_inode = a->dst_dentry->d_inode; ++ a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ ++ if (a->dst_inode) { ++ IMustLock(a->dst_inode); ++ au_igrab(a->dst_inode); ++ } ++ ++ err = -ENOTDIR; ++ if (S_ISDIR(a->src_inode->i_mode)) { ++ au_fset_ren(a->flags, ISDIR); ++ if (unlikely(a->dst_inode && !S_ISDIR(a->dst_inode->i_mode))) ++ goto out_free; ++ aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, ++ AuLock_DIR | AuLock_FLUSH); ++ } else ++ aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, ++ 
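/* non-dir rename: AuLock_DIR is not needed here */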
AuLock_FLUSH); ++ ++ au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ ++ di_write_lock_parent(a->dst_parent); ++ ++ /* which branch we process */ ++ err = au_ren_wbr(a); ++ if (unlikely(err < 0)) ++ goto out_unlock; ++ a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); ++ a->h_path.mnt = a->br->br_mnt; ++ ++ /* are they available to be renamed */ ++ err = au_ren_may_dir(a); ++ if (unlikely(err)) ++ goto out_children; ++ ++ /* prepare the writable parent dir on the same branch */ ++ if (a->dst_bstart == a->btgt) { ++ au_fset_ren(a->flags, WHDST); ++ } else { ++ err = au_cpup_dirs(a->dst_dentry, a->btgt); ++ if (unlikely(err)) ++ goto out_children; ++ } ++ ++ if (a->src_dir != a->dst_dir) { ++ /* ++ * this temporary unlock is safe, ++ * because both dir->i_mutex are locked. ++ */ ++ di_write_unlock(a->dst_parent); ++ di_write_lock_parent(a->src_parent); ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ di_write_unlock(a->src_parent); ++ di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); ++ au_fclr_ren(a->flags, ISSAMEDIR); ++ } else ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ if (unlikely(err < 0)) ++ goto out_children; ++ if (err) ++ au_fset_ren(a->flags, WHSRC); ++ ++ /* lock them all */ ++ err = au_ren_lock(a); ++ if (unlikely(err)) ++ goto out_children; ++ ++ if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) { ++ err = au_may_ren(a); ++ if (unlikely(err)) ++ goto out_hdir; ++ } ++ ++ /* store timestamps to be revertible */ ++ au_ren_dt(a); ++ ++ /* here we go */ ++ err = do_rename(a); ++ if (unlikely(err)) ++ goto out_dt; ++ ++ /* update dir attributes */ ++ au_ren_refresh_dir(a); ++ ++ /* dput/iput all lower dentries */ ++ au_ren_refresh(a); ++ ++ goto out_hdir; /* success */ ++ ++ out_dt: ++ au_ren_rev_dt(err, a); ++ out_hdir: ++ au_ren_unlock(a); ++ out_children: ++ au_nhash_wh_free(&a->whlist); ++ out_unlock: ++ if (unlikely(err && au_ftest_ren(a->flags, ISDIR))) { ++ au_update_dbstart(a->dst_dentry); ++ d_drop(a->dst_dentry); ++ } ++ if (!err) ++ d_move(a->src_dentry, a->dst_dentry); ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ di_write_unlock(a->dst_parent); ++ else ++ di_write_unlock2(a->src_parent, a->dst_parent); ++ aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); ++ out_free: ++ iput(a->dst_inode); ++ if (a->thargs) ++ au_whtmp_rmdir_free(a->thargs); ++ kfree(a); ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/Kconfig linux-2.6.31/fs/aufs/Kconfig +--- linux-2.6.31-vanilla/fs/aufs/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/Kconfig 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,140 @@ ++config AUFS_FS ++ tristate "Aufs (Advanced multi layered unification filesystem) support" ++ depends on EXPERIMENTAL ++ help ++ Aufs is a stackable unification filesystem such as Unionfs, ++ which unifies several directories and provides a merged single ++ directory. ++ In the early days, aufs was entirely re-designed and ++ re-implemented Unionfs Version 1.x series. Introducing many ++ original ideas, approaches and improvements, it becomes totally ++ different from Unionfs while keeping the basic features. ++ ++if AUFS_FS ++choice ++ prompt "Maximum number of branches" ++ default AUFS_BRANCH_MAX_127 ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. 
++config AUFS_BRANCH_MAX_127 ++ bool "127" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++config AUFS_BRANCH_MAX_511 ++ bool "511" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++config AUFS_BRANCH_MAX_1023 ++ bool "1023" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++config AUFS_BRANCH_MAX_32767 ++ bool "32767" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++endchoice ++ ++config AUFS_HINOTIFY ++ bool "Use inotify to detect actions on a branch" ++ depends on INOTIFY ++ help ++ If you want to modify files on branches directly, e.g. bypassing aufs, ++ and want aufs to detect such changes fully, then enable this ++ option and use the 'udba=inotify' mount option. ++ It will have a negative impact on performance. ++ See aufs.5 for details. ++ ++config AUFS_EXPORT ++ bool "NFS-exportable aufs" ++ depends on (AUFS_FS = y && EXPORTFS = y) || (AUFS_FS = m && EXPORTFS) ++ help ++ If you want to export your mounted aufs via NFS, then enable this ++ option. There are several requirements for this configuration. ++ See aufs.5 for details. ++ ++config AUFS_RDU ++ bool "Readdir in userspace" ++ help ++ If you have millions of files under a single aufs directory and ++ run out of memory, then enable this option and set the ++ environment variables for your readdir(3). ++ See aufs.5 for details. ++ ++config AUFS_SHWH ++ bool "Show whiteouts" ++ help ++ If you want to make the whiteouts in aufs visible, then enable ++ this option and specify the 'shwh' mount option. Although it may ++ sound like a philosophical feature, technically it simply shows ++ the names of the whiteouts while keeping their behaviour. ++ ++config AUFS_BR_RAMFS ++ bool "Ramfs (initramfs/rootfs) as an aufs branch" ++ help ++ If you want to use ramfs as an aufs branch fs, then enable this ++ option. Generally tmpfs is recommended. ++ Aufs prohibits it from being a branch fs by default, because ++ initramfs generally becomes unusable after switch_root. If you ++ set initramfs as an aufs branch and boot your system via ++ switch_root, you will easily hit a problem since the files in ++ initramfs may be inaccessible. ++ Unless you are going to use ramfs as an aufs branch fs without ++ switch_root, leave it N. ++ ++config AUFS_BR_FUSE ++ bool "Fuse fs as an aufs branch" ++ depends on FUSE_FS ++ select AUFS_POLL ++ help ++ If you want to use a fuse-based userspace filesystem as an aufs ++ branch fs, then enable this option. ++ It implements the internal poll(2) operation which is ++ implemented by fuse only (currently). ++ ++config AUFS_DEBUG ++ bool "Debug aufs" ++ help ++ Enable this to compile aufs internal debug code. ++ It will have a negative impact on performance. ++ ++config AUFS_MAGIC_SYSRQ ++ bool ++ depends on AUFS_DEBUG && MAGIC_SYSRQ ++ default y ++ help ++ Automatic configuration for internal use. ++ Enabled automatically when aufs supports Magic SysRq.
++ ++config AUFS_BDEV_LOOP ++ bool ++ depends on BLK_DEV_LOOP ++ default y ++ help ++ Automatic configuration for internal use. ++ Convert =[ym] into =y. ++ ++config AUFS_INO_T_64 ++ bool ++ depends on AUFS_EXPORT ++ depends on 64BIT && !(ALPHA || S390) ++ default y ++ help ++ Automatic configuration for internal use. ++ /* typedef unsigned long/int __kernel_ino_t */ ++ /* alpha and s390x are int */ ++ ++config AUFS_POLL ++ bool ++ help ++ Automatic configuration for internal use. ++endif +diff -Nur linux-2.6.31-vanilla/fs/aufs/loop.c linux-2.6.31/fs/aufs/loop.c +--- linux-2.6.31-vanilla/fs/aufs/loop.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/loop.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,55 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * support for loopback block device as a branch ++ */ ++ ++#include <linux/loop.h> ++#include "aufs.h" ++ ++/* ++ * test if two lower dentries have overlapping branches. ++ */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_d1, ++ struct dentry *h_d2) ++{ ++ struct inode *h_inode; ++ struct loop_device *l; ++ ++ h_inode = h_d1->d_inode; ++ if (MAJOR(h_inode->i_sb->s_dev) != LOOP_MAJOR) ++ return 0; ++ ++ l = h_inode->i_sb->s_bdev->bd_disk->private_data; ++ h_d1 = l->lo_backing_file->f_dentry; ++ /* h_d1 can be local NFS. in this case aufs cannot detect the loop */ ++ if (unlikely(h_d1->d_sb == sb)) ++ return 1; ++ return !!au_test_subdir(h_d1, h_d2); ++} ++ ++/* true if a kernel thread named 'loop[0-9].*' accesses a file */ ++int au_test_loopback_kthread(void) ++{ ++ const char c = current->comm[4]; ++ ++ return current->mm == NULL ++ && '0' <= c && c <= '9' ++ && strncmp(current->comm, "loop", 4) == 0; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/loop.h linux-2.6.31/fs/aufs/loop.h +--- linux-2.6.31-vanilla/fs/aufs/loop.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/loop.h 2009-09-16 13:55:29.000000000 +0200 +@@ -0,0 +1,51 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * support for loopback mount as a branch ++ */ ++ ++#ifndef __AUFS_LOOP_H__ ++#define __AUFS_LOOP_H__ ++ ++#ifdef __KERNEL__ ++ ++struct dentry; ++struct super_block; ++ ++#ifdef CONFIG_AUFS_BDEV_LOOP ++/* loop.c */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_d1, ++ struct dentry *h_d2); ++int au_test_loopback_kthread(void); ++#else ++static inline ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_d1, ++ struct dentry *h_d2) ++{ ++ return 0; ++} ++ ++static inline int au_test_loopback_kthread(void) ++{ ++ return 0; ++} ++#endif /* BLK_DEV_LOOP */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_LOOP_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/magic.mk linux-2.6.31/fs/aufs/magic.mk +--- linux-2.6.31-vanilla/fs/aufs/magic.mk 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/magic.mk 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,52 @@ ++ ++# defined in ${srctree}/fs/fuse/inode.c ++# tristate ++ifdef CONFIG_FUSE_FS ++ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 ++endif ++ ++# defined in ${srctree}/fs/ocfs2/ocfs2_fs.h ++# tristate ++ifdef CONFIG_OCFS2_FS ++ccflags-y += -DOCFS2_SUPER_MAGIC=0x7461636f ++endif ++ ++# defined in ${srctree}/fs/ocfs2/dlm/userdlm.h ++# tristate ++ifdef CONFIG_OCFS2_FS_O2CB ++ccflags-y += -DDLMFS_MAGIC=0x76a9f425 ++endif ++ ++# defined in ${srctree}/fs/ramfs/inode.c ++# always true ++ccflags-y += -DRAMFS_MAGIC=0x858458f6 ++ ++# defined in ${srctree}/fs/cifs/cifsfs.c ++# tristate ++ifdef CONFIG_CIFS_FS ++ccflags-y += -DCIFS_MAGIC_NUMBER=0xFF534D42 ++endif ++ ++# defined in ${srctree}/fs/xfs/xfs_sb.h ++# tristate ++ifdef CONFIG_XFS_FS ++ccflags-y += -DXFS_SB_MAGIC=0x58465342 ++endif ++ ++# defined in ${srctree}/fs/configfs/mount.c ++# tristate ++ifdef CONFIG_CONFIGFS_FS ++ccflags-y += -DCONFIGFS_MAGIC=0x62656570 ++endif ++ ++# defined in ${srctree}/fs/9p/v9fs.h ++# tristate ++ifdef CONFIG_9P_FS ++ccflags-y += -DV9FS_MAGIC=0x01021997 ++endif ++ ++# defined in ${srctree}/fs/ubifs/ubifs.h ++# tristate ++ifdef CONFIG_UBIFS_FS ++ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 ++endif +diff -Nur linux-2.6.31-vanilla/fs/aufs/Makefile linux-2.6.31/fs/aufs/Makefile +--- linux-2.6.31-vanilla/fs/aufs/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/Makefile 2009-09-16 13:55:29.000000000 +0200 +@@ -0,0 +1,24 @@ ++ ++include ${src}/magic.mk ++-include ${src}/priv_def.mk ++ ++obj-$(CONFIG_AUFS_FS) += aufs.o ++aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ ++ wkq.o vfsub.o dcsub.o \ ++ cpup.o whout.o plink.o wbr_policy.o \ ++ dinfo.o dentry.o \ ++ finfo.o file.o f_op.o \ ++ dir.o vdir.o \ ++ iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ ++ ioctl.o ++ ++# all are boolean ++aufs-$(CONFIG_SYSFS) += sysfs.o ++aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o ++aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o ++aufs-$(CONFIG_AUFS_HINOTIFY) += hinotify.o ++aufs-$(CONFIG_AUFS_EXPORT) += export.o ++aufs-$(CONFIG_AUFS_POLL) += poll.o ++aufs-$(CONFIG_AUFS_RDU) += rdu.o ++aufs-$(CONFIG_AUFS_DEBUG) += debug.o ++aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o +diff -Nur linux-2.6.31-vanilla/fs/aufs/module.c linux-2.6.31/fs/aufs/module.c +--- linux-2.6.31-vanilla/fs/aufs/module.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/module.c 2009-09-16 13:55:30.000000000 
+0200 +@@ -0,0 +1,173 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * module global variables and operations ++ */ ++ ++#include <linux/module.h> ++#include <linux/seq_file.h> ++#include "aufs.h" ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) ++{ ++ if (new_sz <= nused) ++ return p; ++ ++ p = krealloc(p, new_sz, gfp); ++ if (p) ++ memset(p + nused, 0, new_sz - nused); ++ return p; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * aufs caches ++ */ ++struct kmem_cache *au_cachep[AuCache_Last]; ++static int __init au_cache_init(void) ++{ ++ au_cachep[AuCache_DINFO] = AuCache(au_dinfo); ++ if (au_cachep[AuCache_DINFO]) ++ au_cachep[AuCache_ICNTNR] = AuCache(au_icntnr); ++ if (au_cachep[AuCache_ICNTNR]) ++ au_cachep[AuCache_FINFO] = AuCache(au_finfo); ++ if (au_cachep[AuCache_FINFO]) ++ au_cachep[AuCache_VDIR] = AuCache(au_vdir); ++ if (au_cachep[AuCache_VDIR]) ++ au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); ++ if (au_cachep[AuCache_DEHSTR]) ++ return 0; ++ ++ return -ENOMEM; ++} ++ ++static void au_cache_fin(void) ++{ ++ int i; ++ for (i = 0; i < AuCache_Last; i++) ++ if (au_cachep[i]) { ++ kmem_cache_destroy(au_cachep[i]); ++ au_cachep[i] = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_dir_roflags; ++ ++/* ++ * functions for module interface. ++ */ ++MODULE_LICENSE("GPL"); ++/* MODULE_LICENSE("GPL v2"); */ ++MODULE_AUTHOR("Junjiro R. 
Okajima <aufs-users@lists.sourceforge.net>"); ++MODULE_DESCRIPTION(AUFS_NAME ++ " -- Advanced multi layered unification filesystem"); ++MODULE_VERSION(AUFS_VERSION); ++ ++/* it should be 'byte', but param_set_byte() prints it by "%c" */ ++short aufs_nwkq = AUFS_NWKQ_DEF; ++MODULE_PARM_DESC(nwkq, "the number of workqueue thread, " AUFS_WKQ_NAME); ++module_param_named(nwkq, aufs_nwkq, short, S_IRUGO); ++ ++/* this module parameter has no meaning when SYSFS is disabled */ ++int sysaufs_brs = 1; ++MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN"); ++module_param_named(brs, sysaufs_brs, int, S_IRUGO); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */ ++ ++int au_seq_path(struct seq_file *seq, struct path *path) ++{ ++ return seq_path(seq, path, au_esc_chars); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int __init aufs_init(void) ++{ ++ int err, i; ++ char *p; ++ ++ p = au_esc_chars; ++ for (i = 1; i <= ' '; i++) ++ *p++ = i; ++ *p++ = '\\'; ++ *p++ = '\x7f'; ++ *p = 0; ++ ++ au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE); ++ ++ sysaufs_brs_init(); ++ au_debug_init(); ++ ++ err = -EINVAL; ++ if (unlikely(aufs_nwkq <= 0)) ++ goto out; ++ ++ err = sysaufs_init(); ++ if (unlikely(err)) ++ goto out; ++ err = au_wkq_init(); ++ if (unlikely(err)) ++ goto out_sysaufs; ++ err = au_hinotify_init(); ++ if (unlikely(err)) ++ goto out_wkq; ++ err = au_sysrq_init(); ++ if (unlikely(err)) ++ goto out_hin; ++ err = au_cache_init(); ++ if (unlikely(err)) ++ goto out_sysrq; ++ err = register_filesystem(&aufs_fs_type); ++ if (unlikely(err)) ++ goto out_cache; ++ pr_info(AUFS_NAME " " AUFS_VERSION "\n"); ++ goto out; /* success */ ++ ++ out_cache: ++ au_cache_fin(); ++ out_sysrq: ++ au_sysrq_fin(); ++ out_hin: ++ au_hinotify_fin(); ++ out_wkq: ++ au_wkq_fin(); ++ out_sysaufs: ++ sysaufs_fin(); ++ out: ++ return err; ++} ++ ++static void __exit aufs_exit(void) ++{ ++ unregister_filesystem(&aufs_fs_type); ++ au_cache_fin(); ++ au_sysrq_fin(); ++ au_hinotify_fin(); ++ au_wkq_fin(); ++ sysaufs_fin(); ++} ++ ++module_init(aufs_init); ++module_exit(aufs_exit); +diff -Nur linux-2.6.31-vanilla/fs/aufs/module.h linux-2.6.31/fs/aufs/module.h +--- linux-2.6.31-vanilla/fs/aufs/module.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/module.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,78 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details.
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * module initialization and module-global ++ */ ++ ++#ifndef __AUFS_MODULE_H__ ++#define __AUFS_MODULE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/slab.h> ++ ++struct path; ++struct seq_file; ++ ++/* module parameters */ ++extern short aufs_nwkq; ++extern int sysaufs_brs; ++ ++/* ---------------------------------------------------------------------- */ ++ ++extern int au_dir_roflags; ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); ++int au_seq_path(struct seq_file *seq, struct path *path); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* kmem cache */ ++enum { ++ AuCache_DINFO, ++ AuCache_ICNTNR, ++ AuCache_FINFO, ++ AuCache_VDIR, ++ AuCache_DEHSTR, ++#ifdef CONFIG_AUFS_HINOTIFY ++ AuCache_HINOTIFY, ++#endif ++ AuCache_Last ++}; ++ ++#define AuCache(type) KMEM_CACHE(type, SLAB_RECLAIM_ACCOUNT) ++ ++extern struct kmem_cache *au_cachep[]; ++ ++#define AuCacheFuncs(name, index) \ ++static inline void *au_cache_alloc_##name(void) \ ++{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ ++static inline void au_cache_free_##name(void *p) \ ++{ kmem_cache_free(au_cachep[AuCache_##index], p); } ++ ++AuCacheFuncs(dinfo, DINFO); ++AuCacheFuncs(icntnr, ICNTNR); ++AuCacheFuncs(finfo, FINFO); ++AuCacheFuncs(vdir, VDIR); ++AuCacheFuncs(dehstr, DEHSTR); ++ ++/* ---------------------------------------------------------------------- */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_MODULE_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/opts.c linux-2.6.31/fs/aufs/opts.c +--- linux-2.6.31-vanilla/fs/aufs/opts.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/opts.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,1546 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#include <linux/file.h> ++#include <linux/namei.h> ++#include <linux/types.h> /* a distribution requires */ ++#include <linux/parser.h> ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { ++ Opt_br, ++ Opt_add, Opt_del, Opt_mod, Opt_reorder, Opt_append, Opt_prepend, ++ Opt_idel, Opt_imod, Opt_ireorder, ++ Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, Opt_rendir, ++ Opt_rdblk_def, Opt_rdhash_def, ++ Opt_xino, Opt_zxino, Opt_noxino, ++ Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, ++ Opt_trunc_xino_path, Opt_itrunc_xino, ++ Opt_trunc_xib, Opt_notrunc_xib, ++ Opt_shwh, Opt_noshwh, ++ Opt_plink, Opt_noplink, Opt_list_plink, ++ Opt_udba, ++ /* Opt_lock, Opt_unlock, */ ++ Opt_cmd, Opt_cmd_args, ++ Opt_diropq_a, Opt_diropq_w, ++ Opt_warn_perm, Opt_nowarn_perm, ++ Opt_wbr_copyup, Opt_wbr_create, ++ Opt_refrof, Opt_norefrof, ++ Opt_verbose, Opt_noverbose, ++ Opt_sum, Opt_nosum, Opt_wsum, ++ Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err ++}; ++ ++static match_table_t options = { ++ {Opt_br, "br=%s"}, ++ {Opt_br, "br:%s"}, ++ ++ {Opt_add, "add=%d:%s"}, ++ {Opt_add, "add:%d:%s"}, ++ {Opt_add, "ins=%d:%s"}, ++ {Opt_add, "ins:%d:%s"}, ++ {Opt_append, "append=%s"}, ++ {Opt_append, "append:%s"}, ++ {Opt_prepend, "prepend=%s"}, ++ {Opt_prepend, "prepend:%s"}, ++ ++ {Opt_del, "del=%s"}, ++ {Opt_del, "del:%s"}, ++ /* {Opt_idel, "idel:%d"}, */ ++ {Opt_mod, "mod=%s"}, ++ {Opt_mod, "mod:%s"}, ++ /* {Opt_imod, "imod:%d:%s"}, */ ++ ++ {Opt_dirwh, "dirwh=%d"}, ++ ++ {Opt_xino, "xino=%s"}, ++ {Opt_noxino, "noxino"}, ++ {Opt_trunc_xino, "trunc_xino"}, ++ {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, ++ {Opt_notrunc_xino, "notrunc_xino"}, ++ {Opt_trunc_xino_path, "trunc_xino=%s"}, ++ {Opt_itrunc_xino, "itrunc_xino=%d"}, ++ /* {Opt_zxino, "zxino=%s"}, */ ++ {Opt_trunc_xib, "trunc_xib"}, ++ {Opt_notrunc_xib, "notrunc_xib"}, ++ ++ {Opt_plink, "plink"}, ++ {Opt_noplink, "noplink"}, ++#ifdef CONFIG_AUFS_DEBUG ++ {Opt_list_plink, "list_plink"}, ++#endif ++ ++ {Opt_udba, "udba=%s"}, ++ ++ {Opt_diropq_a, "diropq=always"}, ++ {Opt_diropq_a, "diropq=a"}, ++ {Opt_diropq_w, "diropq=whiteouted"}, ++ {Opt_diropq_w, "diropq=w"}, ++ ++ {Opt_warn_perm, "warn_perm"}, ++ {Opt_nowarn_perm, "nowarn_perm"}, ++ ++ /* keep them temporary */ ++ {Opt_ignore_silent, "coo=%s"}, ++ {Opt_ignore_silent, "nodlgt"}, ++ {Opt_ignore_silent, "nodirperm1"}, ++ {Opt_ignore_silent, "clean_plink"}, ++ ++#ifdef CONFIG_AUFS_SHWH ++ {Opt_shwh, "shwh"}, ++#endif ++ {Opt_noshwh, "noshwh"}, ++ ++ {Opt_rendir, "rendir=%d"}, ++ ++ {Opt_refrof, "refrof"}, ++ {Opt_norefrof, "norefrof"}, ++ ++ {Opt_verbose, "verbose"}, ++ {Opt_verbose, "v"}, ++ {Opt_noverbose, "noverbose"}, ++ {Opt_noverbose, "quiet"}, ++ {Opt_noverbose, "q"}, ++ {Opt_noverbose, "silent"}, ++ ++ {Opt_sum, "sum"}, ++ {Opt_nosum, "nosum"}, ++ {Opt_wsum, "wsum"}, ++ ++ {Opt_rdcache, "rdcache=%d"}, ++ {Opt_rdblk, "rdblk=%d"}, ++ {Opt_rdblk_def, "rdblk=def"}, ++ {Opt_rdhash, "rdhash=%d"}, ++ {Opt_rdhash_def, "rdhash=def"}, ++ ++ {Opt_wbr_create, "create=%s"}, ++ {Opt_wbr_create, "create_policy=%s"}, ++ {Opt_wbr_copyup, "cpup=%s"}, ++ {Opt_wbr_copyup, "copyup=%s"}, ++ {Opt_wbr_copyup, "copyup_policy=%s"}, ++ ++ /* internal use for the scripts */ ++ {Opt_ignore_silent, 
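/* e.g. an "si=..." token passed back by the userspace mount utilities; it is recognized here and silently dropped by the parser below */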
"si=%s"}, ++ ++ {Opt_br, "dirs=%s"}, ++ {Opt_ignore, "debug=%d"}, ++ {Opt_ignore, "delete=whiteout"}, ++ {Opt_ignore, "delete=all"}, ++ {Opt_ignore, "imap=%s"}, ++ ++ /* temporary workaround, due to old mount(8)? */ ++ {Opt_ignore_silent, "relatime"}, ++ ++ {Opt_err, NULL} ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const char *au_parser_pattern(int val, struct match_token *token) ++{ ++ while (token->pattern) { ++ if (token->token == val) ++ return token->pattern; ++ token++; ++ } ++ BUG(); ++ return "??"; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t brperms = { ++ {AuBrPerm_RO, AUFS_BRPERM_RO}, ++ {AuBrPerm_RR, AUFS_BRPERM_RR}, ++ {AuBrPerm_RW, AUFS_BRPERM_RW}, ++ ++ {AuBrPerm_ROWH, AUFS_BRPERM_ROWH}, ++ {AuBrPerm_RRWH, AUFS_BRPERM_RRWH}, ++ {AuBrPerm_RWNoLinkWH, AUFS_BRPERM_RWNLWH}, ++ ++ {AuBrPerm_ROWH, "nfsro"}, ++ {AuBrPerm_RO, NULL} ++}; ++ ++static int br_perm_val(char *perm) ++{ ++ int val; ++ substring_t args[MAX_OPT_ARGS]; ++ ++ val = match_token(perm, brperms, args); ++ return val; ++} ++ ++const char *au_optstr_br_perm(int brperm) ++{ ++ return au_parser_pattern(brperm, (void *)brperms); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t udbalevel = { ++ {AuOpt_UDBA_REVAL, "reval"}, ++ {AuOpt_UDBA_NONE, "none"}, ++#ifdef CONFIG_AUFS_HINOTIFY ++ {AuOpt_UDBA_HINOTIFY, "inotify"}, ++#endif ++ {-1, NULL} ++}; ++ ++static int udba_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return match_token(str, udbalevel, args); ++} ++ ++const char *au_optstr_udba(int udba) ++{ ++ return au_parser_pattern(udba, (void *)udbalevel); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t au_wbr_create_policy = { ++ {AuWbrCreate_TDP, "tdp"}, ++ {AuWbrCreate_TDP, "top-down-parent"}, ++ {AuWbrCreate_RR, "rr"}, ++ {AuWbrCreate_RR, "round-robin"}, ++ {AuWbrCreate_MFS, "mfs"}, ++ {AuWbrCreate_MFS, "most-free-space"}, ++ {AuWbrCreate_MFSV, "mfs:%d"}, ++ {AuWbrCreate_MFSV, "most-free-space:%d"}, ++ ++ {AuWbrCreate_MFSRR, "mfsrr:%d"}, ++ {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, ++ {AuWbrCreate_PMFS, "pmfs"}, ++ {AuWbrCreate_PMFSV, "pmfs:%d"}, ++ ++ {-1, NULL} ++}; ++ ++/* ++ * cf. linux/lib/parser.c and cmdline.c ++ * gave up calling memparse() since it uses simple_strtoull() instead of ++ * strict_...(). 
++ */ ++static int au_match_ull(substring_t *s, unsigned long long *result) ++{ ++ int err; ++ unsigned int len; ++ char a[32]; ++ ++ err = -ERANGE; ++ len = s->to - s->from; ++ if (len + 1 <= sizeof(a)) { ++ memcpy(a, s->from, len); ++ a[len] = '\0'; ++ err = strict_strtoull(a, 0, result); ++ } ++ return err; ++} ++ ++static int au_wbr_mfs_wmark(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ unsigned long long ull; ++ ++ err = 0; ++ if (!au_match_ull(arg, &ull)) ++ create->mfsrr_watermark = ull; ++ else { ++ AuErr("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_mfs_sec(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int n, err; ++ ++ err = 0; ++ if (!match_int(arg, &n) && 0 <= n) ++ create->mfs_second = n; ++ else { ++ AuErr("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_create_val(char *str, struct au_opt_wbr_create *create) ++{ ++ int err, e; ++ substring_t args[MAX_OPT_ARGS]; ++ ++ err = match_token(str, au_wbr_create_policy, args); ++ create->wbr_create = err; ++ switch (err) { ++ case AuWbrCreate_MFSRRV: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (!e) ++ e = au_wbr_mfs_sec(&args[1], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ case AuWbrCreate_MFSRR: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (unlikely(e)) { ++ err = e; ++ break; ++ } ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_PMFS: ++ create->mfs_second = AUFS_MFS_SECOND_DEF; ++ break; ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ e = au_wbr_mfs_sec(&args[0], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ } ++ ++ return err; ++} ++ ++const char *au_optstr_wbr_create(int wbr_create) ++{ ++ return au_parser_pattern(wbr_create, (void *)au_wbr_create_policy); ++} ++ ++static match_table_t au_wbr_copyup_policy = { ++ {AuWbrCopyup_TDP, "tdp"}, ++ {AuWbrCopyup_TDP, "top-down-parent"}, ++ {AuWbrCopyup_BUP, "bup"}, ++ {AuWbrCopyup_BUP, "bottom-up-parent"}, ++ {AuWbrCopyup_BU, "bu"}, ++ {AuWbrCopyup_BU, "bottom-up"}, ++ {-1, NULL} ++}; ++ ++static int au_wbr_copyup_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return match_token(str, au_wbr_copyup_policy, args); ++} ++ ++const char *au_optstr_wbr_copyup(int wbr_copyup) ++{ ++ return au_parser_pattern(wbr_copyup, (void *)au_wbr_copyup_policy); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; ++ ++static void dump_opts(struct au_opts *opts) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++ /* reduce stack space */ ++ union { ++ struct au_opt_add *add; ++ struct au_opt_del *del; ++ struct au_opt_mod *mod; ++ struct au_opt_xino *xino; ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ u.add = &opt->add; ++ AuDbg("add {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ u.del = &opt->del; ++ AuDbg("del {%s, %p}\n", ++ u.del->pathname, u.del->h_path.dentry); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ u.mod = &opt->mod; ++ AuDbg("mod {%s, 0x%x, %p}\n", ++ u.mod->path, u.mod->perm, u.mod->h_root); ++ break; ++ case Opt_append: ++ u.add = &opt->add; ++ AuDbg("append {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, 
u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_prepend: ++ u.add = &opt->add; ++ AuDbg("prepend {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_dirwh: ++ AuDbg("dirwh %d\n", opt->dirwh); ++ break; ++ case Opt_rdcache: ++ AuDbg("rdcache %d\n", opt->rdcache); ++ break; ++ case Opt_rdblk: ++ AuDbg("rdblk %u\n", opt->rdblk); ++ break; ++ case Opt_rdblk_def: ++ AuDbg("rdblk_def\n"); ++ break; ++ case Opt_rdhash: ++ AuDbg("rdhash %u\n", opt->rdhash); ++ break; ++ case Opt_rdhash_def: ++ AuDbg("rdhash_def\n"); ++ break; ++ case Opt_xino: ++ u.xino = &opt->xino; ++ AuDbg("xino {%s %.*s}\n", ++ u.xino->path, ++ AuDLNPair(u.xino->file->f_dentry)); ++ break; ++ case Opt_trunc_xino: ++ AuLabel(trunc_xino); ++ break; ++ case Opt_notrunc_xino: ++ AuLabel(notrunc_xino); ++ break; ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); ++ break; ++ ++ case Opt_noxino: ++ AuLabel(noxino); ++ break; ++ case Opt_trunc_xib: ++ AuLabel(trunc_xib); ++ break; ++ case Opt_notrunc_xib: ++ AuLabel(notrunc_xib); ++ break; ++ case Opt_shwh: ++ AuLabel(shwh); ++ break; ++ case Opt_noshwh: ++ AuLabel(noshwh); ++ break; ++ case Opt_plink: ++ AuLabel(plink); ++ break; ++ case Opt_noplink: ++ AuLabel(noplink); ++ break; ++ case Opt_list_plink: ++ AuLabel(list_plink); ++ break; ++ case Opt_udba: ++ AuDbg("udba %d, %s\n", ++ opt->udba, au_optstr_udba(opt->udba)); ++ break; ++ case Opt_diropq_a: ++ AuLabel(diropq_a); ++ break; ++ case Opt_diropq_w: ++ AuLabel(diropq_w); ++ break; ++ case Opt_warn_perm: ++ AuLabel(warn_perm); ++ break; ++ case Opt_nowarn_perm: ++ AuLabel(nowarn_perm); ++ break; ++ case Opt_refrof: ++ AuLabel(refrof); ++ break; ++ case Opt_norefrof: ++ AuLabel(norefrof); ++ break; ++ case Opt_verbose: ++ AuLabel(verbose); ++ break; ++ case Opt_noverbose: ++ AuLabel(noverbose); ++ break; ++ case Opt_sum: ++ AuLabel(sum); ++ break; ++ case Opt_nosum: ++ AuLabel(nosum); ++ break; ++ case Opt_wsum: ++ AuLabel(wsum); ++ break; ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ AuDbg("create %d, %s\n", u.create->wbr_create, ++ au_optstr_wbr_create(u.create->wbr_create)); ++ switch (u.create->wbr_create) { ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ AuDbg("%d sec\n", u.create->mfs_second); ++ break; ++ case AuWbrCreate_MFSRR: ++ AuDbg("%llu watermark\n", ++ u.create->mfsrr_watermark); ++ break; ++ case AuWbrCreate_MFSRRV: ++ AuDbg("%llu watermark, %d sec\n", ++ u.create->mfsrr_watermark, ++ u.create->mfs_second); ++ break; ++ } ++ break; ++ case Opt_wbr_copyup: ++ AuDbg("copyup %d, %s\n", opt->wbr_copyup, ++ au_optstr_wbr_copyup(opt->wbr_copyup)); ++ break; ++ default: ++ BUG(); ++ } ++ opt++; ++ } ++#endif ++} ++ ++void au_opts_free(struct au_opts *opts) ++{ ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ case Opt_append: ++ case Opt_prepend: ++ path_put(&opt->add.path); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ path_put(&opt->del.h_path); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ dput(opt->mod.h_root); ++ break; ++ case Opt_xino: ++ fput(opt->xino.file); ++ break; ++ } ++ opt++; ++ } ++} ++ ++static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, ++ aufs_bindex_t bindex) ++{ ++ int err; ++ struct au_opt_add *add = &opt->add; ++ char *p; ++ ++ add->bindex = bindex; ++ add->perm = AuBrPerm_Last; ++ 
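/* a branch spec is "path[=perm]", e.g. (illustration) "/data=rw" or "/base=ro+wh"; it is split at '=' below and the permission string mapped via br_perm_val() */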
add->pathname = opt_str; ++ p = strchr(opt_str, '='); ++ if (p) { ++ *p++ = 0; ++ if (*p) ++ add->perm = br_perm_val(p); ++ } ++ ++ err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path); ++ if (!err) { ++ if (!p) { ++ add->perm = AuBrPerm_RO; ++ if (au_test_fs_rr(add->path.dentry->d_sb)) ++ add->perm = AuBrPerm_RR; ++ else if (!bindex && !(sb_flags & MS_RDONLY)) ++ add->perm = AuBrPerm_RW; ++ } ++ opt->type = Opt_add; ++ goto out; ++ } ++ AuErr("lookup failed %s (%d)\n", add->pathname, err); ++ err = -EINVAL; ++ ++ out: ++ return err; ++} ++ ++static int au_opts_parse_del(struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ ++ del->pathname = args[0].from; ++ AuDbg("del path %s\n", del->pathname); ++ ++ err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path); ++ if (unlikely(err)) ++ AuErr("lookup failed %s (%d)\n", del->pathname, err); ++ ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbend(sb) < bindex) { ++ AuErr("out of bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ del->h_path.dentry = dget(au_h_dptr(root, bindex)); ++ del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex)); ++ ++ out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct path path; ++ char *p; ++ ++ err = -EINVAL; ++ mod->path = args[0].from; ++ p = strchr(mod->path, '='); ++ if (unlikely(!p)) { ++ AuErr("no permssion %s\n", args[0].from); ++ goto out; ++ } ++ ++ *p++ = 0; ++ err = vfsub_kern_path(mod->path, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ AuErr("lookup failed %s (%d)\n", mod->path, err); ++ goto out; ++ } ++ ++ mod->perm = br_perm_val(p); ++ AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p); ++ mod->h_root = dget(path.dentry); ++ path_put(&path); ++ ++ out: ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbend(sb) < bindex) { ++ AuErr("out of bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ mod->perm = br_perm_val(args[1].from); ++ AuDbg("mod path %s, perm 0x%x, %s\n", ++ mod->path, mod->perm, args[1].from); ++ mod->h_root = dget(au_h_dptr(root, bindex)); ++ ++ out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino, ++ substring_t args[]) ++{ ++ int err; ++ struct file *file; ++ ++ file = au_xino_create(sb, args[0].from, /*silent*/0); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ ++ err = -EINVAL; ++ if (unlikely(file->f_dentry->d_sb == sb)) { ++ fput(file); ++ AuErr("%s must be outside\n", args[0].from); ++ goto out; ++ } ++ ++ err = 0; ++ xino->file = file; ++ xino->path = args[0].from; ++ ++ out: ++ return err; ++} ++ ++static ++int au_opts_parse_xino_itrunc_path(struct super_block *sb, ++ struct au_opt_xino_itrunc *xino_itrunc, ++ substring_t args[]) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct path path; ++ struct dentry *root; ++ ++ err 
= vfsub_kern_path(args[0].from, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ AuErr("lookup failed %s (%d)\n", args[0].from, err); ++ goto out; ++ } ++ ++ xino_itrunc->bindex = -1; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ if (au_h_dptr(root, bindex) == path.dentry) { ++ xino_itrunc->bindex = bindex; ++ break; ++ } ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ path_put(&path); ++ ++ if (unlikely(xino_itrunc->bindex < 0)) { ++ AuErr("no such branch %s\n", args[0].from); ++ err = -EINVAL; ++ } ++ ++ out: ++ return err; ++} ++ ++/* called without aufs lock */ ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) ++{ ++ int err, n, token; ++ aufs_bindex_t bindex; ++ unsigned char skipped; ++ struct dentry *root; ++ struct au_opt *opt, *opt_tail; ++ char *opt_str; ++ /* reduce the stack space */ ++ union { ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct { ++ substring_t args[MAX_OPT_ARGS]; ++ } *a; ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ root = sb->s_root; ++ err = 0; ++ bindex = 0; ++ opt = opts->opt; ++ opt_tail = opt + opts->max_opt - 1; ++ opt->type = Opt_tail; ++ while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { ++ err = -EINVAL; ++ skipped = 0; ++ token = match_token(opt_str, options, a->args); ++ switch (token) { ++ case Opt_br: ++ err = 0; ++ while (!err && (opt_str = strsep(&a->args[0].from, ":")) ++ && *opt_str) { ++ err = opt_add(opt, opt_str, opts->sb_flags, ++ bindex++); ++ if (unlikely(!err && ++opt > opt_tail)) { ++ err = -E2BIG; ++ break; ++ } ++ opt->type = Opt_tail; ++ skipped = 1; ++ } ++ break; ++ case Opt_add: ++ if (unlikely(match_int(&a->args[0], &n))) { ++ AuErr("bad integer in %s\n", opt_str); ++ break; ++ } ++ bindex = n; ++ err = opt_add(opt, a->args[1].from, opts->sb_flags, ++ bindex); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_append: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*dummy bindex*/1); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_prepend: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*bindex*/0); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_del: ++ err = au_opts_parse_del(&opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#if 0 /* reserved for future use */ ++ case Opt_idel: ++ del->pathname = "(indexed)"; ++ if (unlikely(match_int(&args[0], &n))) { ++ AuErr("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_idel(sb, n, &opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_mod: ++ err = au_opts_parse_mod(&opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#ifdef IMOD /* reserved for future use */ ++ case Opt_imod: ++ u.mod->path = "(indexed)"; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ AuErr("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_imod(sb, n, &opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_xino: ++ err = au_opts_parse_xino(sb, &opt->xino, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino_path: ++ err = au_opts_parse_xino_itrunc_path ++ (sb, &opt->xino_itrunc, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ AuErr("bad integer in %s\n", 
opt_str); ++ break; ++ } ++ u.xino_itrunc->bindex = n; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (n < 0 || au_sbend(sb) < n) { ++ AuErr("out of bounds, %d\n", n); ++ aufs_read_unlock(root, !AuLock_IR); ++ break; ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_dirwh: ++ if (unlikely(match_int(&a->args[0], &opt->dirwh))) ++ break; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_rdcache: ++ if (unlikely(match_int(&a->args[0], &opt->rdcache))) ++ break; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdblk: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n > KMALLOC_MAX_SIZE)) { ++ AuErr("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (unlikely(n && n < NAME_MAX)) { ++ AuErr("rdblk must be larger than %d\n", ++ NAME_MAX); ++ break; ++ } ++ opt->rdblk = n; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdhash: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n * sizeof(struct hlist_head) ++ > KMALLOC_MAX_SIZE)) { ++ AuErr("bad integer in %s\n", opt_str); ++ break; ++ } ++ opt->rdhash = n; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino: ++ case Opt_notrunc_xino: ++ case Opt_noxino: ++ case Opt_trunc_xib: ++ case Opt_notrunc_xib: ++ case Opt_shwh: ++ case Opt_noshwh: ++ case Opt_plink: ++ case Opt_noplink: ++ case Opt_list_plink: ++ case Opt_diropq_a: ++ case Opt_diropq_w: ++ case Opt_warn_perm: ++ case Opt_nowarn_perm: ++ case Opt_refrof: ++ case Opt_norefrof: ++ case Opt_verbose: ++ case Opt_noverbose: ++ case Opt_sum: ++ case Opt_nosum: ++ case Opt_wsum: ++ case Opt_rdblk_def: ++ case Opt_rdhash_def: ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_udba: ++ opt->udba = udba_val(a->args[0].from); ++ if (opt->udba >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ AuErr("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ u.create->wbr_create ++ = au_wbr_create_val(a->args[0].from, u.create); ++ if (u.create->wbr_create >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ AuErr("wrong value, %s\n", opt_str); ++ break; ++ case Opt_wbr_copyup: ++ opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); ++ if (opt->wbr_copyup >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ AuErr("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_ignore: ++ AuWarn("ignored %s\n", opt_str); ++ /*FALLTHROUGH*/ ++ case Opt_ignore_silent: ++ skipped = 1; ++ err = 0; ++ break; ++ case Opt_err: ++ AuErr("unknown option %s\n", opt_str); ++ break; ++ } ++ ++ if (!err && !skipped) { ++ if (unlikely(++opt > opt_tail)) { ++ err = -E2BIG; ++ opt--; ++ opt->type = Opt_tail; ++ break; ++ } ++ opt->type = Opt_tail; ++ } ++ } ++ ++ kfree(a); ++ dump_opts(opts); ++ if (unlikely(err)) ++ au_opts_free(opts); ++ ++ out: ++ return err; ++} ++ ++static int au_opt_wbr_create(struct super_block *sb, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_wbr_create_ops->fin) { ++ err = sbinfo->si_wbr_create_ops->fin(sb); ++ if (!err) ++ err = 1; ++ } ++ ++ sbinfo->si_wbr_create = create->wbr_create; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; ++ switch (create->wbr_create) { ++ case AuWbrCreate_MFSRRV: ++ case AuWbrCreate_MFSRR: ++ sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_MFSV: ++ case 
AuWbrCreate_PMFS: ++ case AuWbrCreate_PMFSV: ++ sbinfo->si_wbr_mfs.mfs_expire = create->mfs_second * HZ; ++ break; ++ } ++ ++ if (sbinfo->si_wbr_create_ops->init) ++ sbinfo->si_wbr_create_ops->init(sb); /* ignore */ ++ ++ return err; ++} ++ ++/* ++ * returns, ++ * plus: processed without an error ++ * zero: unprocessed ++ */ ++static int au_opt_simple(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ switch (opt->type) { ++ case Opt_udba: ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= opt->udba; ++ opts->given_udba |= opt->udba; ++ break; ++ ++ case Opt_plink: ++ au_opt_set(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_noplink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb); ++ au_opt_clr(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_list_plink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_list(sb); ++ break; ++ ++ case Opt_diropq_a: ++ au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ case Opt_diropq_w: ++ au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ ++ case Opt_warn_perm: ++ au_opt_set(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ case Opt_nowarn_perm: ++ au_opt_clr(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ ++ case Opt_refrof: ++ au_opt_set(sbinfo->si_mntflags, REFROF); ++ break; ++ case Opt_norefrof: ++ au_opt_clr(sbinfo->si_mntflags, REFROF); ++ break; ++ ++ case Opt_verbose: ++ au_opt_set(sbinfo->si_mntflags, VERBOSE); ++ break; ++ case Opt_noverbose: ++ au_opt_clr(sbinfo->si_mntflags, VERBOSE); ++ break; ++ ++ case Opt_sum: ++ au_opt_set(sbinfo->si_mntflags, SUM); ++ break; ++ case Opt_wsum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_set(sbinfo->si_mntflags, SUM_W); ++ case Opt_nosum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_clr(sbinfo->si_mntflags, SUM_W); ++ break; ++ ++ case Opt_wbr_create: ++ err = au_opt_wbr_create(sb, &opt->wbr_create); ++ break; ++ case Opt_wbr_copyup: ++ sbinfo->si_wbr_copyup = opt->wbr_copyup; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup; ++ break; ++ ++ case Opt_dirwh: ++ sbinfo->si_dirwh = opt->dirwh; ++ break; ++ ++ case Opt_rdcache: ++ sbinfo->si_rdcache = opt->rdcache * HZ; ++ break; ++ case Opt_rdblk: ++ sbinfo->si_rdblk = opt->rdblk; ++ break; ++ case Opt_rdblk_def: ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ break; ++ case Opt_rdhash: ++ sbinfo->si_rdhash = opt->rdhash; ++ break; ++ case Opt_rdhash_def: ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ break; ++ ++ case Opt_shwh: ++ au_opt_set(sbinfo->si_mntflags, SHWH); ++ break; ++ case Opt_noshwh: ++ au_opt_clr(sbinfo->si_mntflags, SHWH); ++ break; ++ ++ case Opt_trunc_xino: ++ au_opt_set(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ case Opt_notrunc_xino: ++ au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ err = au_xino_trunc(sb, opt->xino_itrunc.bindex); ++ if (!err) ++ err = 1; ++ break; ++ ++ case Opt_trunc_xib: ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ break; ++ case Opt_notrunc_xib: ++ au_fclr_opts(opts->flags, TRUNC_XIB); ++ break; ++ ++ default: ++ err = 0; ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * returns tri-state. 
++ * plus: processed without an error ++ * zero: unprocessed ++ * minus: error ++ */ ++static int au_opt_br(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err, do_refresh; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_append: ++ opt->add.bindex = au_sbend(sb) + 1; ++ if (opt->add.bindex < 0) ++ opt->add.bindex = 0; ++ goto add; ++ case Opt_prepend: ++ opt->add.bindex = 0; ++ add: ++ case Opt_add: ++ err = au_br_add(sb, &opt->add, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, REFRESH_DIR); ++ if (au_br_whable(opt->add.perm)) ++ au_fset_opts(opts->flags, REFRESH_NONDIR); ++ } ++ break; ++ ++ case Opt_del: ++ case Opt_idel: ++ err = au_br_del(sb, &opt->del, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ au_fset_opts(opts->flags, REFRESH_DIR); ++ au_fset_opts(opts->flags, REFRESH_NONDIR); ++ } ++ break; ++ ++ case Opt_mod: ++ case Opt_imod: ++ err = au_br_mod(sb, &opt->mod, ++ au_ftest_opts(opts->flags, REMOUNT), ++ &do_refresh); ++ if (!err) { ++ err = 1; ++ if (do_refresh) { ++ au_fset_opts(opts->flags, REFRESH_DIR); ++ au_fset_opts(opts->flags, REFRESH_NONDIR); ++ } ++ } ++ break; ++ } ++ ++ return err; ++} ++ ++static int au_opt_xino(struct super_block *sb, struct au_opt *opt, ++ struct au_opt_xino **opt_xino, ++ struct au_opts *opts) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct dentry *root, *parent, *h_root; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_xino: ++ err = au_xino_set(sb, &opt->xino, ++ !!au_ftest_opts(opts->flags, REMOUNT)); ++ if (unlikely(err)) ++ break; ++ ++ *opt_xino = &opt->xino; ++ au_xino_brid_set(sb, -1); ++ ++ /* safe d_parent access */ ++ parent = opt->xino.file->f_dentry->d_parent; ++ root = sb->s_root; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ h_root = au_h_dptr(root, bindex); ++ if (h_root == parent) { ++ au_xino_brid_set(sb, au_sbr_id(sb, bindex)); ++ break; ++ } ++ } ++ break; ++ ++ case Opt_noxino: ++ au_xino_clr(sb); ++ au_xino_brid_set(sb, -1); ++ *opt_xino = (void *)-1; ++ break; ++ } ++ ++ return err; ++} ++ ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ unsigned char do_plink, skip, do_free; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *root; ++ struct inode *dir, *h_dir; ++ struct au_sbinfo *sbinfo; ++ struct au_hinode *hdir; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); ++ ++ if (!(sb_flags & MS_RDONLY)) { ++ if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) ++ AuWarn("first branch should be rw\n"); ++ if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) ++ AuWarn("shwh should be used with ro\n"); ++ } ++ ++ if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HINOTIFY) ++ && !au_opt_test(sbinfo->si_mntflags, XINO)) ++ AuWarn("udba=inotify requires xino\n"); ++ ++ err = 0; ++ root = sb->s_root; ++ dir = sb->s_root->d_inode; ++ do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); ++ bend = au_sbend(sb); ++ for (bindex = 0; !err && bindex <= bend; bindex++) { ++ skip = 0; ++ h_dir = au_h_iptr(dir, bindex); ++ br = au_sbr(sb, bindex); ++ do_free = 0; ++ ++ wbr = br->br_wbr; ++ if (wbr) ++ wbr_wh_read_lock(wbr); ++ ++ switch (br->br_perm) { ++ case AuBrPerm_RO: ++ case AuBrPerm_ROWH: ++ case AuBrPerm_RR: ++ case AuBrPerm_RRWH: ++ do_free = !!wbr; ++ skip = (!wbr ++ || 
(!wbr->wbr_whbase ++ && !wbr->wbr_plink ++ && !wbr->wbr_orph)); ++ break; ++ ++ case AuBrPerm_RWNoLinkWH: ++ /* skip = (!br->br_whbase && !br->br_orph); */ ++ skip = (!wbr || !wbr->wbr_whbase); ++ if (skip && wbr) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ break; ++ ++ case AuBrPerm_RW: ++ /* skip = (br->br_whbase && br->br_ohph); */ ++ skip = (wbr && wbr->wbr_whbase); ++ if (skip) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ break; ++ ++ default: ++ BUG(); ++ } ++ if (wbr) ++ wbr_wh_read_unlock(wbr); ++ ++ if (skip) ++ continue; ++ ++ hdir = au_hi(dir, bindex); ++ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ if (wbr) ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(au_h_dptr(root, bindex), br, sb); ++ if (wbr) ++ wbr_wh_write_unlock(wbr); ++ au_hin_imtx_unlock(hdir); ++ ++ if (!err && do_free) { ++ kfree(wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ ++ return err; ++} ++ ++int au_opts_mount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err; ++ unsigned int tmp; ++ aufs_bindex_t bend; ++ struct au_opt *opt; ++ struct au_opt_xino *opt_xino, xino; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_simple(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ /* disable xino and udba temporary */ ++ sbinfo = au_sbi(sb); ++ tmp = sbinfo->si_mntflags; ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); ++ ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_br(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ bend = au_sbend(sb); ++ if (unlikely(bend < 0)) { ++ err = -EINVAL; ++ AuErr("no branches\n"); ++ goto out; ++ } ++ ++ if (au_opt_test(tmp, XINO)) ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ opt = opts->opt; ++ while (!err && opt->type != Opt_tail) ++ err = au_opt_xino(sb, opt++, &opt_xino, opts); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_opts_verify(sb, sb->s_flags, tmp); ++ if (unlikely(err)) ++ goto out; ++ ++ /* restore xino */ ++ if (au_opt_test(tmp, XINO) && !opt_xino) { ++ xino.file = au_xino_def(sb); ++ err = PTR_ERR(xino.file); ++ if (IS_ERR(xino.file)) ++ goto out; ++ ++ err = au_xino_set(sb, &xino, /*remount*/0); ++ fput(xino.file); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ /* restore udba */ ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= (tmp & AuOptMask_UDBA); ++ if (au_opt_test(tmp, UDBA_HINOTIFY)) { ++ struct inode *dir = sb->s_root->d_inode; ++ au_reset_hinotify(dir, ++ au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); ++ } ++ ++ out: ++ return err; ++} ++ ++int au_opts_remount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err, rerr; ++ struct inode *dir; ++ struct au_opt_xino *opt_xino; ++ struct au_opt *opt; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ dir = sb->s_root->d_inode; ++ sbinfo = au_sbi(sb); ++ err = 0; ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) { ++ err = au_opt_simple(sb, opt, opts); ++ if (!err) ++ err = au_opt_br(sb, opt, opts); ++ if (!err) ++ err = au_opt_xino(sb, opt, &opt_xino, opts); ++ opt++; ++ } ++ if (err > 0) ++ err = 0; ++ AuTraceErr(err); ++ /* go on even err */ ++ ++ rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ ++ 
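++ /*
++  * Editor's note -- an illustrative aside, not part of the aufs patch:
++  * the remount path above deliberately keeps going after a failure
++  * ("go on even err") and reports only the first error; a later rerr
++  * is folded in only while err is still zero, i.e. the idiom is
++  *
++  *	rerr = next_step(sb);		(next_step is hypothetical)
++  *	if (unlikely(rerr && !err))
++  *		err = rerr;
++  */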
if (au_ftest_opts(opts->flags, TRUNC_XIB)) { ++ rerr = au_xib_trunc(sb); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ } ++ ++ /* will be handled by the caller */ ++ if (!au_ftest_opts(opts->flags, REFRESH_DIR) ++ && (opts->given_udba || au_opt_test(sbinfo->si_mntflags, XINO))) ++ au_fset_opts(opts->flags, REFRESH_DIR); ++ ++ AuDbg("status 0x%x\n", opts->flags); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_opt_udba(struct super_block *sb) ++{ ++ return au_mntflags(sb) & AuOptMask_UDBA; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/opts.h linux-2.6.31/fs/aufs/opts.h +--- linux-2.6.31-vanilla/fs/aufs/opts.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/opts.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,196 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#ifndef __AUFS_OPTS_H__ ++#define __AUFS_OPTS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/path.h> ++#include <linux/aufs_type.h> ++ ++struct file; ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mount flags */ ++#define AuOpt_XINO 1 /* external inode number bitmap ++ and translation table */ ++#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ ++#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ ++#define AuOpt_UDBA_REVAL (1 << 3) ++#define AuOpt_UDBA_HINOTIFY (1 << 4) ++#define AuOpt_SHWH (1 << 5) /* show whiteout */ ++#define AuOpt_PLINK (1 << 6) /* pseudo-link */ ++#define AuOpt_DIRPERM1 (1 << 7) /* unimplemented */ ++#define AuOpt_REFROF (1 << 8) /* unimplemented */ ++#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ ++#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ ++#define AuOpt_SUM_W (1 << 11) /* unimplemented */ ++#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ ++#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ ++ ++#ifndef CONFIG_AUFS_HINOTIFY ++#undef AuOpt_UDBA_HINOTIFY ++#define AuOpt_UDBA_HINOTIFY 0 ++#endif ++#ifndef CONFIG_AUFS_SHWH ++#undef AuOpt_SHWH ++#define AuOpt_SHWH 0 ++#endif ++ ++#define AuOpt_Def (AuOpt_XINO \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_PLINK \ ++ /* | AuOpt_DIRPERM1 */ \ ++ | AuOpt_WARN_PERM) ++#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_UDBA_HINOTIFY) ++ ++#define au_opt_test(flags, name) (flags & AuOpt_##name) ++#define au_opt_set(flags, name) do { \ ++ BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_set_udba(flags, name) do { \ ++ (flags) &= ~AuOptMask_UDBA; \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_clr(flags, name) { ((flags) &= ~AuOpt_##name); } ++ ++/* 
---------------------------------------------------------------------- */ ++ ++/* policies to select one among multiple writable branches */ ++enum { ++ AuWbrCreate_TDP, /* top down parent */ ++ AuWbrCreate_RR, /* round robin */ ++ AuWbrCreate_MFS, /* most free space */ ++ AuWbrCreate_MFSV, /* mfs with seconds */ ++ AuWbrCreate_MFSRR, /* mfs then rr */ ++ AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ ++ AuWbrCreate_PMFS, /* parent and mfs */ ++ AuWbrCreate_PMFSV, /* parent and mfs with seconds */ ++ ++ AuWbrCreate_Def = AuWbrCreate_TDP ++}; ++ ++enum { ++ AuWbrCopyup_TDP, /* top down parent */ ++ AuWbrCopyup_BUP, /* bottom up parent */ ++ AuWbrCopyup_BU, /* bottom up */ ++ ++ AuWbrCopyup_Def = AuWbrCopyup_TDP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_opt_add { ++ aufs_bindex_t bindex; ++ char *pathname; ++ int perm; ++ struct path path; ++}; ++ ++struct au_opt_del { ++ char *pathname; ++ struct path h_path; ++}; ++ ++struct au_opt_mod { ++ char *path; ++ int perm; ++ struct dentry *h_root; ++}; ++ ++struct au_opt_xino { ++ char *path; ++ struct file *file; ++}; ++ ++struct au_opt_xino_itrunc { ++ aufs_bindex_t bindex; ++}; ++ ++struct au_opt_wbr_create { ++ int wbr_create; ++ int mfs_second; ++ unsigned long long mfsrr_watermark; ++}; ++ ++struct au_opt { ++ int type; ++ union { ++ struct au_opt_xino xino; ++ struct au_opt_xino_itrunc xino_itrunc; ++ struct au_opt_add add; ++ struct au_opt_del del; ++ struct au_opt_mod mod; ++ int dirwh; ++ int rdcache; ++ unsigned int rdblk; ++ unsigned int rdhash; ++ int udba; ++ struct au_opt_wbr_create wbr_create; ++ int wbr_copyup; ++ }; ++}; ++ ++/* opts flags */ ++#define AuOpts_REMOUNT 1 ++#define AuOpts_REFRESH_DIR (1 << 1) ++#define AuOpts_REFRESH_NONDIR (1 << 2) ++#define AuOpts_TRUNC_XIB (1 << 3) ++#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) ++#define au_fset_opts(flags, name) { (flags) |= AuOpts_##name; } ++#define au_fclr_opts(flags, name) { (flags) &= ~AuOpts_##name; } ++ ++struct au_opts { ++ struct au_opt *opt; ++ int max_opt; ++ ++ unsigned int given_udba; ++ unsigned int flags; ++ unsigned long sb_flags; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++const char *au_optstr_br_perm(int brperm); ++const char *au_optstr_udba(int udba); ++const char *au_optstr_wbr_copyup(int wbr_copyup); ++const char *au_optstr_wbr_create(int wbr_create); ++ ++void au_opts_free(struct au_opts *opts); ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending); ++int au_opts_mount(struct super_block *sb, struct au_opts *opts); ++int au_opts_remount(struct super_block *sb, struct au_opts *opts); ++ ++unsigned int au_opt_udba(struct super_block *sb); ++ ++/* ---------------------------------------------------------------------- */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_OPTS_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/plink.c linux-2.6.31/fs/aufs/plink.c +--- linux-2.6.31-vanilla/fs/aufs/plink.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/plink.c 2009-09-16 13:55:29.000000000 +0200 +@@ -0,0 +1,396 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * pseudo-link ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * while a user process maintains the pseudo-links, ++ * prohibit adding a new plink and branch manipulation. ++ */ ++void au_plink_block_maintain(struct super_block *sb) ++{ ++ struct au_sbinfo *sbi = au_sbi(sb); ++ ++ SiMustAnyLock(sb); ++ ++ /* gave up wake_up_bit() */ ++ wait_event(sbi->si_plink_wq, !au_ftest_si(sbi, MAINTAIN_PLINK)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct pseudo_link { ++ struct list_head list; ++ struct inode *inode; ++}; ++ ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ spin_lock(&sbinfo->si_plink.spin); ++ list_for_each_entry(plink, plink_list, list) ++ AuDbg("%lu\n", plink->inode->i_ino); ++ spin_unlock(&sbinfo->si_plink.spin); ++} ++#endif ++ ++/* is the inode pseudo-linked? */ ++int au_plink_test(struct inode *inode) ++{ ++ int found; ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink; ++ ++ sbinfo = au_sbi(inode->i_sb); ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); ++ ++ found = 0; ++ plink_list = &sbinfo->si_plink.head; ++ spin_lock(&sbinfo->si_plink.spin); ++ list_for_each_entry(plink, plink_list, list) ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&sbinfo->si_plink.spin); ++ return found; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generate a name for plink. ++ * the file will be stored under AUFS_WH_PLINKDIR. ++ */ ++/* 20 is max digits length of ulong 64 */ ++#define PLINK_NAME_LEN ((20 + 1) * 2) ++ ++static int plink_name(char *name, int len, struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ int rlen; ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, bindex); ++ rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); ++ return rlen; ++} ++ ++/* lookup the plink-ed @inode under the branch at @bindex */ ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ struct inode *h_dir; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = { ++ .name = a ++ }; ++ ++ br = au_sbr(inode->i_sb, bindex); ++ h_parent = br->br_wbr->wbr_plink; ++ h_dir = h_parent->d_inode; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ /* always superio. 
*/ ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); ++ h_dentry = au_sio_lkup_one(&tgtname, h_parent, br); ++ mutex_unlock(&h_dir->i_mutex); ++ return h_dentry; ++} ++ ++/* create a pseudo-link */ ++static int do_whplink(struct qstr *tgt, struct dentry *h_parent, ++ struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ ++ h_dir = h_parent->d_inode; ++ again: ++ h_path.dentry = au_lkup_one(tgt, h_parent, br, /*nd*/NULL); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ /* wh.plink dir is not monitored */ ++ if (h_path.dentry->d_inode ++ && h_path.dentry->d_inode != h_dentry->d_inode) { ++ err = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ dput(h_path.dentry); ++ h_path.dentry = NULL; ++ if (!err) ++ goto again; ++ } ++ if (!err && !h_path.dentry->d_inode) ++ err = vfsub_link(h_dentry, h_dir, &h_path); ++ dput(h_path.dentry); ++ ++ out: ++ return err; ++} ++ ++struct do_whplink_args { ++ int *errp; ++ struct qstr *tgt; ++ struct dentry *h_parent; ++ struct dentry *h_dentry; ++ struct au_branch *br; ++}; ++ ++static void call_do_whplink(void *args) ++{ ++ struct do_whplink_args *a = args; ++ *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); ++} ++ ++static int whplink(struct dentry *h_dentry, struct inode *inode, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err, wkq_err; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = { ++ .name = a ++ }; ++ ++ wbr = au_sbr(inode->i_sb, bindex)->br_wbr; ++ h_parent = wbr->wbr_plink; ++ h_dir = h_parent->d_inode; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ /* always superio. */ ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); ++ if (!au_test_wkq(current)) { ++ struct do_whplink_args args = { ++ .errp = &err, ++ .tgt = &tgtname, ++ .h_parent = h_parent, ++ .h_dentry = h_dentry, ++ .br = br ++ }; ++ wkq_err = au_wkq_wait(call_do_whplink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } else ++ err = do_whplink(&tgtname, h_parent, h_dentry, br); ++ mutex_unlock(&h_dir->i_mutex); ++ ++ return err; ++} ++ ++/* free a single plink */ ++static void do_put_plink(struct pseudo_link *plink, int do_del) ++{ ++ iput(plink->inode); ++ if (do_del) ++ list_del(&plink->list); ++ kfree(plink); ++} ++ ++/* ++ * create a new pseudo-link for @h_dentry on @bindex. ++ * the linked inode is held in aufs @inode. 
++ */ ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink; ++ int found, err, cnt; ++ ++ sb = inode->i_sb; ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ ++ err = 0; ++ cnt = 0; ++ found = 0; ++ plink_list = &sbinfo->si_plink.head; ++ spin_lock(&sbinfo->si_plink.spin); ++ list_for_each_entry(plink, plink_list, list) { ++ cnt++; ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ } ++ if (found) { ++ spin_unlock(&sbinfo->si_plink.spin); ++ return; ++ } ++ ++ plink = NULL; ++ if (!found) { ++ plink = kmalloc(sizeof(*plink), GFP_ATOMIC); ++ if (plink) { ++ plink->inode = au_igrab(inode); ++ list_add(&plink->list, plink_list); ++ cnt++; ++ } else ++ err = -ENOMEM; ++ } ++ spin_unlock(&sbinfo->si_plink.spin); ++ ++ if (!err) { ++ au_plink_block_maintain(sb); ++ err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); ++ } ++ ++ if (unlikely(cnt > AUFS_PLINK_WARN)) ++ AuWarn1("unexpectedly many pseudo links, %d\n", cnt); ++ if (unlikely(err)) { ++ AuWarn("err %d, damaged pseudo link.\n", err); ++ if (!found && plink) ++ do_put_plink(plink, /*do_del*/1); ++ } ++} ++ ++/* free all plinks */ ++void au_plink_put(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ /* no spin_lock since sbinfo is write-locked */ ++ list_for_each_entry_safe(plink, tmp, plink_list, list) ++ do_put_plink(plink, 0); ++ INIT_LIST_HEAD(plink_list); ++} ++ ++/* free the plinks on a branch specified by @br_id */ ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ struct inode *inode; ++ aufs_bindex_t bstart, bend, bindex; ++ unsigned char do_put; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ /* no spin_lock since sbinfo is write-locked */ ++ list_for_each_entry_safe(plink, tmp, plink_list, list) { ++ do_put = 0; ++ inode = au_igrab(plink->inode); ++ ii_write_lock_child(inode); ++ bstart = au_ibstart(inode); ++ bend = au_ibend(inode); ++ if (bstart >= 0) { ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ if (!au_h_iptr(inode, bindex) ++ || au_ii_br_id(inode, bindex) != br_id) ++ continue; ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ do_put = 1; ++ break; ++ } ++ } else ++ do_put_plink(plink, 1); ++ ++ if (do_put) { ++ for (bindex = bstart; bindex <= bend; bindex++) ++ if (au_h_iptr(inode, bindex)) { ++ do_put = 0; ++ break; ++ } ++ if (do_put) ++ do_put_plink(plink, 1); ++ } ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++long au_plink_ioctl(struct file *file, unsigned int cmd) ++{ ++ long err; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ err = -EACCES; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ err = 0; ++ sb = file->f_dentry->d_sb; ++ sbinfo = au_sbi(sb); ++ switch (cmd) { ++ case AUFS_CTL_PLINK_MAINT: ++ /* ++ * pseudo-link maintenance mode, ++ * cleared by aufs_release_dir() ++ */ ++ si_write_lock(sb); ++ if (!au_ftest_si(sbinfo, 
MAINTAIN_PLINK)) { ++ au_fset_si(sbinfo, MAINTAIN_PLINK); ++ au_fi(file)->fi_maintain_plink = 1; ++ } else ++ err = -EBUSY; ++ si_write_unlock(sb); ++ break; ++ case AUFS_CTL_PLINK_CLEAN: ++ aufs_write_lock(sb->s_root); ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb); ++ aufs_write_unlock(sb->s_root); ++ break; ++ default: ++ err = -EINVAL; ++ } ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/poll.c linux-2.6.31/fs/aufs/poll.c +--- linux-2.6.31-vanilla/fs/aufs/poll.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/poll.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * poll operation ++ * There is only one filesystem which implements ->poll operation, currently. ++ */ ++ ++#include "aufs.h" ++ ++unsigned int aufs_poll(struct file *file, poll_table *wait) ++{ ++ unsigned int mask; ++ int err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ /* We should pretend an error happened. */ ++ mask = POLLERR /* | POLLIN | POLLOUT */; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ /* it is not an error if h_file has no operation */ ++ mask = DEFAULT_POLLMASK; ++ h_file = au_h_fptr(file, au_fbstart(file)); ++ if (h_file->f_op && h_file->f_op->poll) ++ mask = h_file->f_op->poll(h_file, wait); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ out: ++ si_read_unlock(sb); ++ AuTraceErr((int)mask); ++ return mask; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/rdu.c linux-2.6.31/fs/aufs/rdu.c +--- linux-2.6.31-vanilla/fs/aufs/rdu.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/rdu.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,331 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * readdir in userspace. 
++ */ ++ ++#include <linux/security.h> ++#include <linux/uaccess.h> ++#include <linux/aufs_type.h> ++#include "aufs.h" ++ ++/* bits for struct aufs_rdu.flags */ ++#define AuRdu_CALLED 1 ++#define AuRdu_CONT (1 << 1) ++#define AuRdu_FULL (1 << 2) ++#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name) ++#define au_fset_rdu(flags, name) { (flags) |= AuRdu_##name; } ++#define au_fclr_rdu(flags, name) { (flags) &= ~AuRdu_##name; } ++ ++struct au_rdu_arg { ++ struct aufs_rdu *rdu; ++ union au_rdu_ent_ul ent; ++ unsigned long end; ++ ++ struct super_block *sb; ++ int err; ++}; ++ ++static int au_rdu_fill(void *__arg, const char *name, int nlen, ++ loff_t offset, u64 h_ino, unsigned int d_type) ++{ ++ int err, len; ++ struct au_rdu_arg *arg = __arg; ++ struct aufs_rdu *rdu = arg->rdu; ++ struct au_rdu_ent ent; ++ ++ err = 0; ++ arg->err = 0; ++ au_fset_rdu(rdu->cookie.flags, CALLED); ++ len = au_rdu_len(nlen); ++ if (arg->ent.ul + len < arg->end) { ++ ent.ino = h_ino; ++ ent.bindex = rdu->cookie.bindex; ++ ent.type = d_type; ++ ent.nlen = nlen; ++ ++ err = -EFAULT; ++ if (copy_to_user(arg->ent.e, &ent, sizeof(ent))) ++ goto out; ++ if (copy_to_user(arg->ent.e->name, name, nlen)) ++ goto out; ++ /* the terminating NULL */ ++ if (__put_user(0, arg->ent.e->name + nlen)) ++ goto out; ++ err = 0; ++ /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */ ++ arg->ent.ul += len; ++ rdu->rent++; ++ } else { ++ err = -EFAULT; ++ au_fset_rdu(rdu->cookie.flags, FULL); ++ rdu->full = 1; ++ rdu->tail = arg->ent; ++ } ++ ++ out: ++ /* AuTraceErr(err); */ ++ return err; ++} ++ ++static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg) ++{ ++ int err; ++ loff_t offset; ++ struct au_rdu_cookie *cookie = &arg->rdu->cookie; ++ ++ offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET); ++ err = offset; ++ if (unlikely(offset != cookie->h_pos)) ++ goto out; ++ ++ err = 0; ++ do { ++ arg->err = 0; ++ au_fclr_rdu(cookie->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(h_file, au_rdu_fill, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err ++ && au_ftest_rdu(cookie->flags, CALLED) ++ && !au_ftest_rdu(cookie->flags, FULL)); ++ cookie->h_pos = h_file->f_pos; ++ ++ out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_rdu(struct file *file, struct aufs_rdu *rdu) ++{ ++ int err; ++ aufs_bindex_t bend; ++ struct au_rdu_arg arg; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *h_file; ++ struct au_rdu_cookie *cookie = &rdu->cookie; ++ ++ err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ rdu->rent = 0; ++ rdu->tail = rdu->ent; ++ rdu->full = 0; ++ arg.rdu = rdu; ++ arg.ent = rdu->ent; ++ arg.end = arg.ent.ul; ++ arg.end += rdu->sz; ++ ++ err = -ENOTDIR; ++ if (unlikely(!file->f_op || !file->f_op->readdir)) ++ goto out; ++ ++ err = security_file_permission(file, MAY_READ); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++#if 1 ++ mutex_lock(&inode->i_mutex); ++#else ++ err = mutex_lock_killable(&inode->i_mutex); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++#endif ++ err = -ENOENT; ++ if (unlikely(IS_DEADDIR(inode))) ++ goto out_mtx; ++ ++ arg.sb = inode->i_sb; ++ si_read_lock(arg.sb, AuLock_FLUSH); ++ fi_read_lock(file); ++ ++ err = -EAGAIN; ++ if (unlikely(au_ftest_rdu(cookie->flags, CONT) ++ && cookie->generation != au_figen(file))) ++ goto out_unlock; ++ ++ err = 0; ++ if (!rdu->blk) { ++ rdu->blk = 
au_sbi(arg.sb)->si_rdblk; ++ if (!rdu->blk) ++ rdu->blk = au_dir_size(file, /*dentry*/NULL); ++ } ++ bend = au_fbstart(file); ++ if (cookie->bindex < bend) ++ cookie->bindex = bend; ++ bend = au_fbend(file); ++ /* AuDbg("b%d, b%d\n", cookie->bindex, bend); */ ++ for (; !err && cookie->bindex <= bend; ++ cookie->bindex++, cookie->h_pos = 0) { ++ h_file = au_h_fptr(file, cookie->bindex); ++ if (!h_file) ++ continue; ++ ++ au_fclr_rdu(cookie->flags, FULL); ++ err = au_rdu_do(h_file, &arg); ++ AuTraceErr(err); ++ if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) ++ break; ++ } ++ AuDbg("rent %llu\n", rdu->rent); ++ ++ if (!err && !au_ftest_rdu(cookie->flags, CONT)) { ++ rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); ++ au_fset_rdu(cookie->flags, CONT); ++ cookie->generation = au_figen(file); ++ } ++ ++ ii_read_lock_child(inode); ++ fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode))); ++ ii_read_unlock(inode); ++ ++ out_unlock: ++ fi_read_unlock(file); ++ si_read_unlock(arg.sb); ++ out_mtx: ++ mutex_unlock(&inode->i_mutex); ++ out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) ++{ ++ int err; ++ ino_t ino; ++ unsigned long long nent; ++ union au_rdu_ent_ul *u; ++ struct au_rdu_ent ent; ++ struct super_block *sb; ++ ++ err = 0; ++ nent = rdu->nent; ++ u = &rdu->ent; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ while (nent-- > 0) { ++ err = !access_ok(VERIFY_WRITE, u->e, sizeof(ent)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ ++ err = copy_from_user(&ent, u->e, sizeof(ent)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ ++ /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ ++ if (!ent.wh) ++ err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); ++ else ++ err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, ++ &ino); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ break; ++ } ++ ++ err = __put_user(ino, &u->e->ino); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ u->ul += au_rdu_len(ent.nlen); ++ } ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_rdu_verify(struct aufs_rdu *rdu) ++{ ++ AuDbg("rdu{%llu, %p, (%u, %u) | %u | %llu, %u, %u | " ++ "%llu, b%d, 0x%x, g%u}\n", ++ rdu->sz, rdu->ent.e, rdu->verify[0], rdu->verify[1], ++ rdu->blk, ++ rdu->rent, rdu->shwh, rdu->full, ++ rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, ++ rdu->cookie.generation); ++ ++ if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu) ++ && rdu->verify[AufsCtlRduV_SZ_PTR] == sizeof(rdu)) ++ return 0; ++ ++ AuDbg("%u:%u, %u:%u\n", ++ rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu), ++ rdu->verify[AufsCtlRduV_SZ_PTR], (unsigned int)sizeof(rdu)); ++ return -EINVAL; ++} ++ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err, e; ++ struct aufs_rdu rdu; ++ void __user *p = (void __user *)arg; ++ ++ err = copy_from_user(&rdu, p, sizeof(rdu)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ err = au_rdu_verify(&rdu); ++ if (unlikely(err)) ++ goto out; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ err = au_rdu(file, &rdu); ++ if (unlikely(err)) ++ break; ++ ++ e = copy_to_user(p, &rdu, sizeof(rdu)); ++ if (unlikely(e)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ break; ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ino(file, &rdu); ++ 
break; ++ ++ default: ++ err = -EINVAL; ++ } ++ ++ out: ++ AuTraceErr(err); ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/rwsem.h linux-2.6.31/fs/aufs/rwsem.h +--- linux-2.6.31-vanilla/fs/aufs/rwsem.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/rwsem.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,186 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * simple read-write semaphore wrappers ++ */ ++ ++#ifndef __AUFS_RWSEM_H__ ++#define __AUFS_RWSEM_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/rwsem.h> ++ ++struct au_rwsem { ++ struct rw_semaphore rwsem; ++#ifdef CONFIG_AUFS_DEBUG ++ /* just for debugging, not almighty counter */ ++ atomic_t rcnt, wcnt; ++#endif ++}; ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDbgCntInit(rw) do { \ ++ atomic_set(&(rw)->rcnt, 0); \ ++ atomic_set(&(rw)->wcnt, 0); \ ++ smp_mb(); /* atomic set */ \ ++} while (0) ++ ++#define AuDbgRcntInc(rw) atomic_inc_return(&(rw)->rcnt) ++#define AuDbgRcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0) ++#define AuDbgWcntInc(rw) WARN_ON(atomic_inc_return(&(rw)->wcnt) > 1) ++#define AuDbgWcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0) ++#else ++#define AuDbgCntInit(rw) do {} while (0) ++#define AuDbgRcntInc(rw) do {} while (0) ++#define AuDbgRcntDec(rw) do {} while (0) ++#define AuDbgWcntInc(rw) do {} while (0) ++#define AuDbgWcntDec(rw) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* to debug easier, do not make them inlined functions */ ++#define AuRwMustNoWaiters(rw) AuDebugOn(!list_empty(&(rw)->rwsem.wait_list)) ++/* rwsem_is_locked() is unusable */ ++#define AuRwMustReadLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0) ++#define AuRwMustWriteLock(rw) AuDebugOn(atomic_read(&(rw)->wcnt) <= 0) ++#define AuRwMustAnyLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \ ++ && atomic_read(&(rw)->wcnt) <= 0) ++#define AuRwDestroy(rw) AuDebugOn(atomic_read(&(rw)->rcnt) \ ++ || atomic_read(&(rw)->wcnt)) ++ ++static inline void au_rw_init(struct au_rwsem *rw) ++{ ++ AuDbgCntInit(rw); ++ init_rwsem(&rw->rwsem); ++} ++ ++static inline void au_rw_init_wlock(struct au_rwsem *rw) ++{ ++ au_rw_init(rw); ++ down_write(&rw->rwsem); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_init_wlock_nested(struct au_rwsem *rw, ++ unsigned int lsc) ++{ ++ au_rw_init(rw); ++ down_write_nested(&rw->rwsem, lsc); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_read_lock(struct au_rwsem *rw) ++{ ++ down_read(&rw->rwsem); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) ++{ ++ down_read_nested(&rw->rwsem, lsc); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustReadLock(rw); ++ AuDbgRcntDec(rw); ++ up_read(&rw->rwsem); ++} ++ 
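++/*
++ * Editor's note -- an illustrative aside, not part of the aufs patch:
++ * with CONFIG_AUFS_DEBUG the rcnt/wcnt counters above make imbalance
++ * visible; an unlock without a matching lock trips the WARN_ON in
++ * AuDbgRcntDec/AuDbgWcntDec. The wrappers are meant to be paired
++ * strictly:
++ *
++ *	struct au_rwsem rw;
++ *	au_rw_init(&rw);
++ *	au_rw_read_lock(&rw);		rcnt 0 -> 1
++ *	au_rw_read_unlock(&rw);		rcnt 1 -> 0
++ *	AuRwDestroy(&rw);		complains if a count remains
++ */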
++static inline void au_rw_dgrade_lock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgRcntInc(rw); ++ AuDbgWcntDec(rw); ++ downgrade_write(&rw->rwsem); ++} ++ ++static inline void au_rw_write_lock(struct au_rwsem *rw) ++{ ++ down_write(&rw->rwsem); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_lock_nested(struct au_rwsem *rw, ++ unsigned int lsc) ++{ ++ down_write_nested(&rw->rwsem, lsc); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgWcntDec(rw); ++ up_write(&rw->rwsem); ++} ++ ++/* why is not _nested version defined */ ++static inline int au_rw_read_trylock(struct au_rwsem *rw) ++{ ++ int ret = down_read_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgRcntInc(rw); ++ return ret; ++} ++ ++static inline int au_rw_write_trylock(struct au_rwsem *rw) ++{ ++ int ret = down_write_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgWcntInc(rw); ++ return ret; ++} ++ ++#undef AuDbgCntInit ++#undef AuDbgRcntInc ++#undef AuDbgRcntDec ++#undef AuDbgWcntInc ++#undef AuDbgWcntDec ++ ++#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_lock(param) \ ++{ au_rw_read_lock(rwsem); } \ ++static inline void prefix##_write_lock(param) \ ++{ au_rw_write_lock(rwsem); } \ ++static inline int prefix##_read_trylock(param) \ ++{ return au_rw_read_trylock(rwsem); } \ ++static inline int prefix##_write_trylock(param) \ ++{ return au_rw_write_trylock(rwsem); } ++/* why is not _nested version defined */ ++/* static inline void prefix##_read_trylock_nested(param, lsc) ++{ au_rw_read_trylock_nested(rwsem, lsc)); } ++static inline void prefix##_write_trylock_nestd(param, lsc) ++{ au_rw_write_trylock_nested(rwsem, lsc); } */ ++ ++#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_unlock(param) \ ++{ au_rw_read_unlock(rwsem); } \ ++static inline void prefix##_write_unlock(param) \ ++{ au_rw_write_unlock(rwsem); } \ ++static inline void prefix##_downgrade_lock(param) \ ++{ au_rw_dgrade_lock(rwsem); } ++ ++#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_RWSEM_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/sbinfo.c linux-2.6.31/fs/aufs/sbinfo.c +--- linux-2.6.31-vanilla/fs/aufs/sbinfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/sbinfo.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,208 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * superblock private data ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * they are necessary regardless sysfs is disabled. 
++ */ ++void au_si_free(struct kobject *kobj) ++{ ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ AuDebugOn(!list_empty(&sbinfo->si_plink.head)); ++ ++ sb = sbinfo->si_sb; ++ si_write_lock(sb); ++ au_xino_clr(sb); ++ au_br_free(sbinfo); ++ kfree(sbinfo->si_branch); ++ mutex_destroy(&sbinfo->si_xib_mtx); ++ si_write_unlock(sb); ++ AuRwDestroy(&sbinfo->si_rwsem); ++ ++ kfree(sbinfo); ++} ++ ++int au_si_alloc(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ err = -ENOMEM; ++ sbinfo = kmalloc(sizeof(*sbinfo), GFP_NOFS); ++ if (unlikely(!sbinfo)) ++ goto out; ++ ++ /* will be reallocated separately */ ++ sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); ++ if (unlikely(!sbinfo->si_branch)) ++ goto out_sbinfo; ++ ++ memset(&sbinfo->si_kobj, 0, sizeof(sbinfo->si_kobj)); ++ err = sysaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ goto out_br; ++ ++ au_nwt_init(&sbinfo->si_nowait); ++ au_rw_init_wlock(&sbinfo->si_rwsem); ++ sbinfo->si_generation = 0; ++ sbinfo->au_si_status = 0; ++ sbinfo->si_bend = -1; ++ sbinfo->si_last_br_id = 0; ++ ++ sbinfo->si_wbr_copyup = AuWbrCopyup_Def; ++ sbinfo->si_wbr_create = AuWbrCreate_Def; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + AuWbrCopyup_Def; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + AuWbrCreate_Def; ++ ++ sbinfo->si_mntflags = AuOpt_Def; ++ ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++ sbinfo->si_xib = NULL; ++ mutex_init(&sbinfo->si_xib_mtx); ++ sbinfo->si_xib_buf = NULL; ++ sbinfo->si_xino_brid = -1; ++ /* leave si_xib_last_pindex and si_xib_next_bit */ ++ ++ sbinfo->si_rdcache = AUFS_RDCACHE_DEF * HZ; ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ sbinfo->si_dirwh = AUFS_DIRWH_DEF; ++ ++ au_spl_init(&sbinfo->si_plink); ++ init_waitqueue_head(&sbinfo->si_plink_wq); ++ ++ /* leave other members for sysaufs and si_mnt. */ ++ sbinfo->si_sb = sb; ++ sb->s_fs_info = sbinfo; ++ au_debug_sbinfo_init(sbinfo); ++ return 0; /* success */ ++ ++ out_br: ++ kfree(sbinfo->si_branch); ++ out_sbinfo: ++ kfree(sbinfo); ++ out: ++ return err; ++} ++ ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) ++{ ++ int err, sz; ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*brp) * (sbinfo->si_bend + 1); ++ if (unlikely(!sz)) ++ sz = sizeof(*brp); ++ brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); ++ if (brp) { ++ sbinfo->si_branch = brp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_sigen_inc(struct super_block *sb) ++{ ++ unsigned int gen; ++ ++ SiMustWriteLock(sb); ++ ++ gen = ++au_sbi(sb)->si_generation; ++ au_update_digen(sb->s_root); ++ au_update_iigen(sb->s_root->d_inode); ++ sb->s_root->d_inode->i_version++; ++ return gen; ++} ++ ++aufs_bindex_t au_new_br_id(struct super_block *sb) ++{ ++ aufs_bindex_t br_id; ++ int i; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ for (i = 0; i <= AUFS_BRANCH_MAX; i++) { ++ br_id = ++sbinfo->si_last_br_id; ++ if (br_id && au_br_index(sb, br_id) < 0) ++ return br_id; ++ } ++ ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dentry and super_block lock. 
call at entry point */ ++void aufs_read_lock(struct dentry *dentry, int flags) ++{ ++ si_read_lock(dentry->d_sb, flags); ++ if (au_ftest_lock(flags, DW)) ++ di_write_lock_child(dentry); ++ else ++ di_read_lock_child(dentry, flags); ++} ++ ++void aufs_read_unlock(struct dentry *dentry, int flags) ++{ ++ if (au_ftest_lock(flags, DW)) ++ di_write_unlock(dentry); ++ else ++ di_read_unlock(dentry, flags); ++ si_read_unlock(dentry->d_sb); ++} ++ ++void aufs_write_lock(struct dentry *dentry) ++{ ++ si_write_lock(dentry->d_sb); ++ di_write_lock_child(dentry); ++} ++ ++void aufs_write_unlock(struct dentry *dentry) ++{ ++ di_write_unlock(dentry); ++ si_write_unlock(dentry->d_sb); ++} ++ ++void aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) ++{ ++ si_read_lock(d1->d_sb, flags); ++ di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIR)); ++} ++ ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock2(d1, d2); ++ si_read_unlock(d1->d_sb); ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/spl.h linux-2.6.31/fs/aufs/spl.h +--- linux-2.6.31-vanilla/fs/aufs/spl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/spl.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,57 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * simple list protected by a spinlock ++ */ ++ ++#ifndef __AUFS_SPL_H__ ++#define __AUFS_SPL_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/spinlock.h> ++#include <linux/list.h> ++ ++struct au_splhead { ++ spinlock_t spin; ++ struct list_head head; ++}; ++ ++static inline void au_spl_init(struct au_splhead *spl) ++{ ++ spin_lock_init(&spl->spin); ++ INIT_LIST_HEAD(&spl->head); ++} ++ ++static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_add(list, &spl->head); ++ spin_unlock(&spl->spin); ++} ++ ++static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_del(list); ++ spin_unlock(&spl->spin); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SPL_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/super.c linux-2.6.31/fs/aufs/super.c +--- linux-2.6.31-vanilla/fs/aufs/super.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/super.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,874 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount and super_block operations ++ */ ++ ++#include <linux/buffer_head.h> ++#include <linux/module.h> ++#include <linux/seq_file.h> ++#include <linux/statfs.h> ++#include "aufs.h" ++ ++/* ++ * super_operations ++ */ ++static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused) ++{ ++ struct au_icntnr *c; ++ ++ c = au_cache_alloc_icntnr(); ++ if (c) { ++ inode_init_once(&c->vfs_inode); ++ c->vfs_inode.i_version = 1; /* sigen(sb); */ ++ c->iinfo.ii_hinode = NULL; ++ return &c->vfs_inode; ++ } ++ return NULL; ++} ++ ++static void aufs_destroy_inode(struct inode *inode) ++{ ++ au_iinfo_fin(inode); ++ au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); ++} ++ ++struct inode *au_iget_locked(struct super_block *sb, ino_t ino) ++{ ++ struct inode *inode; ++ int err; ++ ++ inode = iget_locked(sb, ino); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ if (!(inode->i_state & I_NEW)) ++ goto out; ++ ++ err = au_xigen_new(inode); ++ if (!err) ++ err = au_iinfo_init(inode); ++ if (!err) ++ inode->i_version++; ++ else { ++ iget_failed(inode); ++ inode = ERR_PTR(err); ++ } ++ ++ out: ++ /* never return NULL */ ++ AuDebugOn(!inode); ++ AuTraceErrPtr(inode); ++ return inode; ++} ++ ++/* lock free root dinfo */ ++static int au_show_brs(struct seq_file *seq, struct super_block *sb) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct path path; ++ struct au_hdentry *hd; ++ struct au_branch *br; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ hd = au_di(sb->s_root)->di_hdentry; ++ for (bindex = 0; !err && bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ path.mnt = br->br_mnt; ++ path.dentry = hd[bindex].hd_dentry; ++ err = au_seq_path(seq, &path); ++ if (err > 0) ++ err = seq_printf(seq, "=%s", ++ au_optstr_br_perm(br->br_perm)); ++ if (!err && bindex != bend) ++ err = seq_putc(seq, ':'); ++ } ++ ++ return err; ++} ++ ++static void au_show_wbr_create(struct seq_file *m, int v, ++ struct au_sbinfo *sbinfo) ++{ ++ const char *pat; ++ ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ ++ seq_printf(m, ",create="); ++ pat = au_optstr_wbr_create(v); ++ switch (v) { ++ case AuWbrCreate_TDP: ++ case AuWbrCreate_RR: ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_PMFS: ++ seq_printf(m, pat); ++ break; ++ case AuWbrCreate_MFSV: ++ seq_printf(m, /*pat*/"mfs:%lu", ++ sbinfo->si_wbr_mfs.mfs_expire / HZ); ++ break; ++ case AuWbrCreate_PMFSV: ++ seq_printf(m, /*pat*/"pmfs:%lu", ++ sbinfo->si_wbr_mfs.mfs_expire / HZ); ++ break; ++ case AuWbrCreate_MFSRR: ++ seq_printf(m, /*pat*/"mfsrr:%llu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark); ++ break; ++ case AuWbrCreate_MFSRRV: ++ seq_printf(m, /*pat*/"mfsrr:%llu:%lu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark, ++ sbinfo->si_wbr_mfs.mfs_expire / HZ); ++ break; ++ } ++} ++ ++static int au_show_xino(struct seq_file *seq, struct vfsmount *mnt) ++{ ++#ifdef CONFIG_SYSFS ++ return 0; ++#else ++ int err; ++ const int len = sizeof(AUFS_XINO_FNAME) - 1; ++ aufs_bindex_t bindex, brid; ++ struct super_block *sb; ++ struct qstr *name; ++ struct file 
*f; ++ struct dentry *d, *h_root; ++ ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ ++ err = 0; ++ sb = mnt->mnt_sb; ++ f = au_sbi(sb)->si_xib; ++ if (!f) ++ goto out; ++ ++ /* stop printing the default xino path on the first writable branch */ ++ h_root = NULL; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) { ++ bindex = au_br_index(sb, brid); ++ h_root = au_di(sb->s_root)->di_hdentry[0 + bindex].hd_dentry; ++ } ++ d = f->f_dentry; ++ name = &d->d_name; ++ /* safe ->d_parent because the file is unlinked */ ++ if (d->d_parent == h_root ++ && name->len == len ++ && !memcmp(name->name, AUFS_XINO_FNAME, len)) ++ goto out; ++ ++ seq_puts(seq, ",xino="); ++ err = au_xino_path(seq, f); ++ ++ out: ++ return err; ++#endif ++} ++ ++/* seq_file will re-call me in case of too long string */ ++static int aufs_show_options(struct seq_file *m, struct vfsmount *mnt) ++{ ++ int err, n; ++ unsigned int mnt_flags, v; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++#define AuBool(name, str) do { \ ++ v = au_opt_test(mnt_flags, name); \ ++ if (v != au_opt_test(AuOpt_Def, name)) \ ++ seq_printf(m, ",%s" #str, v ? "" : "no"); \ ++} while (0) ++ ++#define AuStr(name, str) do { \ ++ v = mnt_flags & AuOptMask_##name; \ ++ if (v != (AuOpt_Def & AuOptMask_##name)) \ ++ seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ ++} while (0) ++ ++#define AuUInt(name, str, val) do { \ ++ if (val != AUFS_##name##_DEF) \ ++ seq_printf(m, "," #str "=%u", val); \ ++} while (0) ++ ++ /* lock free root dinfo */ ++ sb = mnt->mnt_sb; ++ si_noflush_read_lock(sb); ++ sbinfo = au_sbi(sb); ++ seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); ++ ++ mnt_flags = au_mntflags(sb); ++ if (au_opt_test(mnt_flags, XINO)) { ++ err = au_show_xino(m, mnt); ++ if (unlikely(err)) ++ goto out; ++ } else ++ seq_puts(m, ",noxino"); ++ ++ AuBool(TRUNC_XINO, trunc_xino); ++ AuStr(UDBA, udba); ++ AuBool(SHWH, shwh); ++ AuBool(PLINK, plink); ++ /* AuBool(DIRPERM1, dirperm1); */ ++ /* AuBool(REFROF, refrof); */ ++ ++ v = sbinfo->si_wbr_create; ++ if (v != AuWbrCreate_Def) ++ au_show_wbr_create(m, v, sbinfo); ++ ++ v = sbinfo->si_wbr_copyup; ++ if (v != AuWbrCopyup_Def) ++ seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); ++ ++ v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); ++ if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) ++ seq_printf(m, ",diropq=%c", v ? 
'a' : 'w'); ++ ++ AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); ++ ++ n = sbinfo->si_rdcache / HZ; ++ AuUInt(RDCACHE, rdcache, n); ++ ++ AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); ++ AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); ++ ++ AuBool(SUM, sum); ++ /* AuBool(SUM_W, wsum); */ ++ AuBool(WARN_PERM, warn_perm); ++ AuBool(VERBOSE, verbose); ++ ++ out: ++ /* be sure to print "br:" last */ ++ if (!sysaufs_brs) { ++ seq_puts(m, ",br:"); ++ au_show_brs(m, sb); ++ } ++ si_read_unlock(sb); ++ return 0; ++ ++#undef Deleted ++#undef AuBool ++#undef AuStr ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sum mode which returns the summation for statfs(2) */ ++ ++static u64 au_add_till_max(u64 a, u64 b) ++{ ++ u64 old; ++ ++ old = a; ++ a += b; ++ if (old < a) ++ return a; ++ return ULLONG_MAX; ++} ++ ++static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ u64 blocks, bfree, bavail, files, ffree; ++ aufs_bindex_t bend, bindex, i; ++ unsigned char shared; ++ struct vfsmount *h_mnt; ++ struct super_block *h_sb; ++ ++ blocks = 0; ++ bfree = 0; ++ bavail = 0; ++ files = 0; ++ ffree = 0; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ for (bindex = bend; bindex >= 0; bindex--) { ++ h_mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_mnt->mnt_sb; ++ shared = 0; ++ for (i = bindex + 1; !shared && i <= bend; i++) ++ shared = (au_sbr_sb(sb, i) == h_sb); ++ if (shared) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ err = vfs_statfs(h_mnt->mnt_root, buf); ++ if (unlikely(err)) ++ goto out; ++ ++ blocks = au_add_till_max(blocks, buf->f_blocks); ++ bfree = au_add_till_max(bfree, buf->f_bfree); ++ bavail = au_add_till_max(bavail, buf->f_bavail); ++ files = au_add_till_max(files, buf->f_files); ++ ffree = au_add_till_max(ffree, buf->f_ffree); ++ } ++ ++ buf->f_blocks = blocks; ++ buf->f_bfree = bfree; ++ buf->f_bavail = bavail; ++ buf->f_files = files; ++ buf->f_ffree = ffree; ++ ++ out: ++ return err; ++} ++ ++static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ int err; ++ struct super_block *sb; ++ ++ /* lock free root dinfo */ ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (!au_opt_test(au_mntflags(sb), SUM)) ++ /* sb->s_root for NFS is unreliable */ ++ err = vfs_statfs(au_sbr_mnt(sb, 0)->mnt_root, buf); ++ else ++ err = au_statfs_sum(sb, buf); ++ si_read_unlock(sb); ++ ++ if (!err) { ++ buf->f_type = AUFS_SUPER_MAGIC; ++ buf->f_namelen -= AUFS_WH_PFX_LEN; ++ memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); ++ } ++ /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* try flushing the lower fs at aufs remount/unmount time */ ++ ++static void au_fsync_br(struct super_block *sb) ++{ ++ aufs_bindex_t bend, bindex; ++ int brperm; ++ struct au_branch *br; ++ struct super_block *h_sb; ++ ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex < bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ brperm = br->br_perm; ++ if (brperm == AuBrPerm_RR || brperm == AuBrPerm_RRWH) ++ continue; ++ h_sb = br->br_mnt->mnt_sb; ++ if (bdev_read_only(h_sb->s_bdev)) ++ continue; ++ ++ lockdep_off(); ++ down_write(&h_sb->s_umount); ++ shrink_dcache_sb(h_sb); ++ sync_filesystem(h_sb); ++ up_write(&h_sb->s_umount); ++ lockdep_on(); ++ } ++} ++ ++/* ++ * this IS NOT for super_operations. ++ * I guess it will be reverted someday. 
++ */ ++static void aufs_umount_begin(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ ++ si_write_lock(sb); ++ au_fsync_br(sb); ++ if (au_opt_test(au_mntflags(sb), PLINK)) ++ au_plink_put(sb); ++ if (sbinfo->si_wbr_create_ops->fin) ++ sbinfo->si_wbr_create_ops->fin(sb); ++ si_write_unlock(sb); ++} ++ ++/* final actions when unmounting a file system */ ++static void aufs_put_super(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ ++ aufs_umount_begin(sb); ++ dbgaufs_si_fin(sbinfo); ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * refresh dentry and inode at remount time. ++ */ ++static int do_refresh(struct dentry *dentry, mode_t type, ++ unsigned int dir_flags) ++{ ++ int err; ++ struct dentry *parent; ++ ++ di_write_lock_child(dentry); ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ ++ /* returns the number of positive dentries */ ++ err = au_refresh_hdentry(dentry, type); ++ if (err >= 0) { ++ struct inode *inode = dentry->d_inode; ++ err = au_refresh_hinode(inode, dentry); ++ if (!err && type == S_IFDIR) ++ au_reset_hinotify(inode, dir_flags); ++ } ++ if (unlikely(err)) ++ AuErr("unrecoverable error %d, %.*s\n", err, AuDLNPair(dentry)); ++ ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ di_write_unlock(dentry); ++ ++ return err; ++} ++ ++static int test_dir(struct dentry *dentry, void *arg __maybe_unused) ++{ ++ return S_ISDIR(dentry->d_inode->i_mode); ++} ++ ++/* gave up consolidating with refresh_nondir() */ ++static int refresh_dir(struct dentry *root, unsigned int sigen) ++{ ++ int err, i, j, ndentry, e; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ struct inode *inode; ++ const unsigned int flags = au_hi_flags(root->d_inode, /*isdir*/1); ++ ++ err = 0; ++ list_for_each_entry(inode, &root->d_sb->s_inodes, i_sb_list) ++ if (S_ISDIR(inode->i_mode) && au_iigen(inode) != sigen) { ++ ii_write_lock_child(inode); ++ e = au_refresh_hinode_self(inode, /*do_attr*/1); ++ ii_write_unlock(inode); ++ if (unlikely(e)) { ++ AuDbg("e %d, i%lu\n", e, inode->i_ino); ++ if (!err) ++ err = e; ++ /* go on even if err */ ++ } ++ } ++ ++ e = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(e)) { ++ if (!err) ++ err = e; ++ goto out; ++ } ++ e = au_dcsub_pages(&dpages, root, test_dir, NULL); ++ if (unlikely(e)) { ++ if (!err) ++ err = e; ++ goto out_dpages; ++ } ++ ++ for (i = 0; !e && i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; !e && j < ndentry; j++) { ++ struct dentry *d; ++ ++ d = dentries[j]; ++ au_dbg_verify_dir_parent(d, sigen); ++ if (au_digen(d) != sigen) { ++ e = do_refresh(d, S_IFDIR, flags); ++ if (unlikely(e && !err)) ++ err = e; ++ /* break on err */ ++ } ++ } ++ } ++ ++ out_dpages: ++ au_dpages_free(&dpages); ++ out: ++ return err; ++} ++ ++static int test_nondir(struct dentry *dentry, void *arg __maybe_unused) ++{ ++ return !S_ISDIR(dentry->d_inode->i_mode); ++} ++ ++static int refresh_nondir(struct dentry *root, unsigned int sigen, ++ int do_dentry) ++{ ++ int err, i, j, ndentry, e; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ struct inode *inode; ++ ++ err = 0; ++ list_for_each_entry(inode, &root->d_sb->s_inodes, i_sb_list) ++ if (!S_ISDIR(inode->i_mode) && 
au_iigen(inode) != sigen) { ++ ii_write_lock_child(inode); ++ e = au_refresh_hinode_self(inode, /*do_attr*/1); ++ ii_write_unlock(inode); ++ if (unlikely(e)) { ++ AuDbg("e %d, i%lu\n", e, inode->i_ino); ++ if (!err) ++ err = e; ++ /* go on even if err */ ++ } ++ } ++ ++ if (!do_dentry) ++ goto out; ++ ++ e = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(e)) { ++ if (!err) ++ err = e; ++ goto out; ++ } ++ e = au_dcsub_pages(&dpages, root, test_nondir, NULL); ++ if (unlikely(e)) { ++ if (!err) ++ err = e; ++ goto out_dpages; ++ } ++ ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ struct dentry *d; ++ ++ d = dentries[j]; ++ au_dbg_verify_nondir_parent(d, sigen); ++ inode = d->d_inode; ++ if (inode && au_digen(d) != sigen) { ++ e = do_refresh(d, inode->i_mode & S_IFMT, ++ /*dir_flags*/0); ++ if (unlikely(e && !err)) ++ err = e; ++ /* go on even err */ ++ } ++ } ++ } ++ ++ out_dpages: ++ au_dpages_free(&dpages); ++ out: ++ return err; ++} ++ ++static void au_remount_refresh(struct super_block *sb, unsigned int flags) ++{ ++ int err; ++ unsigned int sigen; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root; ++ struct inode *inode; ++ ++ au_sigen_inc(sb); ++ sigen = au_sigen(sb); ++ sbinfo = au_sbi(sb); ++ au_fclr_si(sbinfo, FAILED_REFRESH_DIRS); ++ ++ root = sb->s_root; ++ DiMustNoWaiters(root); ++ inode = root->d_inode; ++ IiMustNoWaiters(inode); ++ au_reset_hinotify(inode, au_hi_flags(inode, /*isdir*/1)); ++ di_write_unlock(root); ++ ++ err = refresh_dir(root, sigen); ++ if (unlikely(err)) { ++ au_fset_si(sbinfo, FAILED_REFRESH_DIRS); ++ AuWarn("Refreshing directories failed, ignored (%d)\n", err); ++ } ++ ++ if (au_ftest_opts(flags, REFRESH_NONDIR)) { ++ err = refresh_nondir(root, sigen, !err); ++ if (unlikely(err)) ++ AuWarn("Refreshing non-directories failed, ignored" ++ "(%d)\n", err); ++ } ++ ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ au_cpup_attr_all(root->d_inode, /*force*/1); ++} ++ ++/* stop extra interpretation of errno in mount(8), and strange error messages */ ++static int cvt_err(int err) ++{ ++ AuTraceErr(err); ++ ++ switch (err) { ++ case -ENOENT: ++ case -ENOTDIR: ++ case -EEXIST: ++ case -EIO: ++ err = -EINVAL; ++ } ++ return err; ++} ++ ++static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) ++{ ++ int err; ++ struct au_opts opts; ++ struct dentry *root; ++ struct inode *inode; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ root = sb->s_root; ++ if (!data || !*data) { ++ aufs_write_lock(root); ++ err = au_opts_verify(sb, *flags, /*pending*/0); ++ if (!err) ++ au_fsync_br(sb); ++ aufs_write_unlock(root); ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ memset(&opts, 0, sizeof(opts)); ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.flags = AuOpts_REMOUNT; ++ opts.sb_flags = *flags; ++ ++ /* parse it before aufs lock */ ++ err = au_opts_parse(sb, data, &opts); ++ if (unlikely(err)) ++ goto out_opts; ++ ++ sbinfo = au_sbi(sb); ++ inode = root->d_inode; ++ mutex_lock(&inode->i_mutex); ++ aufs_write_lock(root); ++ au_fsync_br(sb); ++ ++ /* au_opts_remount() may return an error */ ++ err = au_opts_remount(sb, &opts); ++ au_opts_free(&opts); ++ ++ if (au_ftest_opts(opts.flags, REFRESH_DIR) ++ || au_ftest_opts(opts.flags, REFRESH_NONDIR)) ++ au_remount_refresh(sb, opts.flags); ++ ++ aufs_write_unlock(root); ++ 
mutex_unlock(&inode->i_mutex); ++ ++ out_opts: ++ free_page((unsigned long)opts.opt); ++ out: ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++static struct super_operations aufs_sop = { ++ .alloc_inode = aufs_alloc_inode, ++ .destroy_inode = aufs_destroy_inode, ++ .drop_inode = generic_delete_inode, ++ .show_options = aufs_show_options, ++ .statfs = aufs_statfs, ++ .put_super = aufs_put_super, ++ .remount_fs = aufs_remount_fs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int alloc_root(struct super_block *sb) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *root; ++ ++ err = -ENOMEM; ++ inode = au_iget_locked(sb, AUFS_ROOT_INO); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ inode->i_op = &aufs_dir_iop; ++ inode->i_fop = &aufs_dir_fop; ++ inode->i_mode = S_IFDIR; ++ inode->i_nlink = 2; ++ unlock_new_inode(inode); ++ ++ root = d_alloc_root(inode); ++ if (unlikely(!root)) ++ goto out_iput; ++ err = PTR_ERR(root); ++ if (IS_ERR(root)) ++ goto out_iput; ++ ++ err = au_alloc_dinfo(root); ++ if (!err) { ++ sb->s_root = root; ++ return 0; /* success */ ++ } ++ dput(root); ++ goto out; /* do not iput */ ++ ++ out_iput: ++ iget_failed(inode); ++ iput(inode); ++ out: ++ return err; ++ ++} ++ ++static int aufs_fill_super(struct super_block *sb, void *raw_data, ++ int silent __maybe_unused) ++{ ++ int err; ++ struct au_opts opts; ++ struct dentry *root; ++ struct inode *inode; ++ char *arg = raw_data; ++ ++ if (unlikely(!arg || !*arg)) { ++ err = -EINVAL; ++ AuErr("no arg\n"); ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ memset(&opts, 0, sizeof(opts)); ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.sb_flags = sb->s_flags; ++ ++ err = au_si_alloc(sb); ++ if (unlikely(err)) ++ goto out_opts; ++ ++ /* all timestamps always follow the ones on the branch */ ++ sb->s_flags |= MS_NOATIME | MS_NODIRATIME; ++ sb->s_op = &aufs_sop; ++ sb->s_magic = AUFS_SUPER_MAGIC; ++ sb->s_maxbytes = 0; ++ au_export_init(sb); ++ ++ err = alloc_root(sb); ++ if (unlikely(err)) { ++ si_write_unlock(sb); ++ goto out_info; ++ } ++ root = sb->s_root; ++ inode = root->d_inode; ++ ++ /* ++ * actually we can parse options regardless aufs lock here. ++ * but at remount time, parsing must be done before aufs lock. ++ * so we follow the same rule. ++ */ ++ ii_write_lock_parent(inode); ++ aufs_write_unlock(root); ++ err = au_opts_parse(sb, arg, &opts); ++ if (unlikely(err)) ++ goto out_root; ++ ++ /* lock vfs_inode first, then aufs. 
*/ ++ mutex_lock(&inode->i_mutex); ++ inode->i_op = &aufs_dir_iop; ++ inode->i_fop = &aufs_dir_fop; ++ aufs_write_lock(root); ++ err = au_opts_mount(sb, &opts); ++ au_opts_free(&opts); ++ if (unlikely(err)) ++ goto out_unlock; ++ aufs_write_unlock(root); ++ mutex_unlock(&inode->i_mutex); ++ goto out_opts; /* success */ ++ ++ out_unlock: ++ aufs_write_unlock(root); ++ mutex_unlock(&inode->i_mutex); ++ out_root: ++ dput(root); ++ sb->s_root = NULL; ++ out_info: ++ kobject_put(&au_sbi(sb)->si_kobj); ++ sb->s_fs_info = NULL; ++ out_opts: ++ free_page((unsigned long)opts.opt); ++ out: ++ AuTraceErr(err); ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_get_sb(struct file_system_type *fs_type, int flags, ++ const char *dev_name __maybe_unused, void *raw_data, ++ struct vfsmount *mnt) ++{ ++ int err; ++ struct super_block *sb; ++ ++ /* all timestamps always follow the ones on the branch */ ++ /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ ++ err = get_sb_nodev(fs_type, flags, raw_data, aufs_fill_super, mnt); ++ if (!err) { ++ sb = mnt->mnt_sb; ++ si_write_lock(sb); ++ sysaufs_brs_add(sb, 0); ++ si_write_unlock(sb); ++ } ++ return err; ++} ++ ++struct file_system_type aufs_fs_type = { ++ .name = AUFS_FSTYPE, ++ .fs_flags = ++ FS_RENAME_DOES_D_MOVE /* a race between rename and others */ ++ | FS_REVAL_DOT, /* for NFS branch and udba */ ++ .get_sb = aufs_get_sb, ++ .kill_sb = generic_shutdown_super, ++ /* no need to __module_get() and module_put(). */ ++ .owner = THIS_MODULE, ++}; +diff -Nur linux-2.6.31-vanilla/fs/aufs/super.h linux-2.6.31/fs/aufs/super.h +--- linux-2.6.31-vanilla/fs/aufs/super.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/super.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,384 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * super_block operations ++ */ ++ ++#ifndef __AUFS_SUPER_H__ ++#define __AUFS_SUPER_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fs.h> ++#include <linux/aufs_type.h> ++#include "rwsem.h" ++#include "spl.h" ++#include "wkq.h" ++ ++typedef ssize_t (*au_readf_t)(struct file *, char __user *, size_t, loff_t *); ++typedef ssize_t (*au_writef_t)(struct file *, const char __user *, size_t, ++ loff_t *); ++ ++/* policies to select one among multiple writable branches */ ++struct au_wbr_copyup_operations { ++ int (*copyup)(struct dentry *dentry); ++}; ++ ++struct au_wbr_create_operations { ++ int (*create)(struct dentry *dentry, int isdir); ++ int (*init)(struct super_block *sb); ++ int (*fin)(struct super_block *sb); ++}; ++ ++struct au_wbr_mfs { ++ struct mutex mfs_lock; /* protect this structure */ ++ unsigned long mfs_jiffy; ++ unsigned long mfs_expire; ++ aufs_bindex_t mfs_bindex; ++ ++ unsigned long long mfsrr_bytes; ++ unsigned long long mfsrr_watermark; ++}; ++ ++struct au_branch; ++struct au_sbinfo { ++ /* nowait tasks in the system-wide workqueue */ ++ struct au_nowait_tasks si_nowait; ++ ++ struct au_rwsem si_rwsem; ++ ++ /* branch management */ ++ unsigned int si_generation; ++ ++ /* see above flags */ ++ unsigned char au_si_status; ++ ++ aufs_bindex_t si_bend; ++ aufs_bindex_t si_last_br_id; ++ struct au_branch **si_branch; ++ ++ /* policy to select a writable branch */ ++ unsigned char si_wbr_copyup; ++ unsigned char si_wbr_create; ++ struct au_wbr_copyup_operations *si_wbr_copyup_ops; ++ struct au_wbr_create_operations *si_wbr_create_ops; ++ ++ /* round robin */ ++ atomic_t si_wbr_rr_next; ++ ++ /* most free space */ ++ struct au_wbr_mfs si_wbr_mfs; ++ ++ /* mount flags */ ++ /* include/asm-ia64/siginfo.h defines a macro named si_flags */ ++ unsigned int si_mntflags; ++ ++ /* external inode number (bitmap and translation table) */ ++ au_readf_t si_xread; ++ au_writef_t si_xwrite; ++ struct file *si_xib; ++ struct mutex si_xib_mtx; /* protect xib members */ ++ unsigned long *si_xib_buf; ++ unsigned long si_xib_last_pindex; ++ int si_xib_next_bit; ++ aufs_bindex_t si_xino_brid; ++ /* reserved for future use */ ++ /* unsigned long long si_xib_limit; */ /* Max xib file size */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++ /* i_generation */ ++ struct file *si_xigen; ++ atomic_t si_xigen_next; ++#endif ++ ++ /* vdir parameters */ ++ unsigned long si_rdcache; /* max cache time in HZ */ ++ unsigned int si_rdblk; /* deblk size */ ++ unsigned int si_rdhash; /* hash size */ ++ ++ /* ++ * If the number of whiteouts is larger than si_dirwh, leave all of ++ * them after au_whtmp_ren to reduce the cost of rmdir(2). ++ * future fsck.aufs or kernel thread will remove them later. ++ * Otherwise, remove all whiteouts and the dir in rmdir(2). ++ */ ++ unsigned int si_dirwh; ++ ++ /* ++ * rename(2) a directory with all children. ++ */ ++ /* reserved for future use */ ++ /* int si_rendir; */ ++ ++ /* pseudo_link list */ ++ struct au_splhead si_plink; ++ wait_queue_head_t si_plink_wq; ++ ++ /* ++ * sysfs and lifetime management. ++ * this is not a small structure and it may be a waste of memory when ++ * sysfs is disabled, particularly when many aufs-es are mounted. ++ * but using sysfs is the majority case. 
++ */ ++ struct kobject si_kobj; ++#ifdef CONFIG_DEBUG_FS ++ struct dentry *si_dbgaufs, *si_dbgaufs_xib; ++#ifdef CONFIG_AUFS_EXPORT ++ struct dentry *si_dbgaufs_xigen; ++#endif ++#endif ++ ++ /* dirty, necessary for unmounting, sysfs and sysrq */ ++ struct super_block *si_sb; ++}; ++ ++/* sbinfo status flags */ ++/* ++ * set true when refresh_dirs() failed at remount time. ++ * then try refreshing dirs at access time again. ++ * if it is false, refreshing dirs at access time is unnecessary ++ */ ++#define AuSi_FAILED_REFRESH_DIRS 1 ++#define AuSi_MAINTAIN_PLINK (1 << 1) /* ioctl */ ++static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi, ++ unsigned int flag) ++{ ++ AuRwMustAnyLock(&sbi->si_rwsem); ++ return sbi->au_si_status & flag; ++} ++#define au_ftest_si(sbinfo, name) au_do_ftest_si(sbinfo, AuSi_##name) ++#define au_fset_si(sbinfo, name) do { \ ++ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ ++ (sbinfo)->au_si_status |= AuSi_##name; \ ++} while (0) ++#define au_fclr_si(sbinfo, name) do { \ ++ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ ++ (sbinfo)->au_si_status &= ~AuSi_##name; \ ++} while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policy to select one among writable branches */ ++#define AuWbrCopyup(sbinfo, args...) \ ++ ((sbinfo)->si_wbr_copyup_ops->copyup(args)) ++#define AuWbrCreate(sbinfo, args...) \ ++ ((sbinfo)->si_wbr_create_ops->create(args)) ++ ++/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ ++#define AuLock_DW 1 /* write-lock dentry */ ++#define AuLock_IR (1 << 1) /* read-lock inode */ ++#define AuLock_IW (1 << 2) /* write-lock inode */ ++#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ ++#define AuLock_DIR (1 << 4) /* target is a dir */ ++#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) ++#define au_fset_lock(flags, name) { (flags) |= AuLock_##name; } ++#define au_fclr_lock(flags, name) { (flags) &= ~AuLock_##name; } ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* super.c */ ++extern struct file_system_type aufs_fs_type; ++struct inode *au_iget_locked(struct super_block *sb, ino_t ino); ++ ++/* sbinfo.c */ ++void au_si_free(struct kobject *kobj); ++int au_si_alloc(struct super_block *sb); ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); ++ ++unsigned int au_sigen_inc(struct super_block *sb); ++aufs_bindex_t au_new_br_id(struct super_block *sb); ++ ++void aufs_read_lock(struct dentry *dentry, int flags); ++void aufs_read_unlock(struct dentry *dentry, int flags); ++void aufs_write_lock(struct dentry *dentry); ++void aufs_write_unlock(struct dentry *dentry); ++void aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int isdir); ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++/* wbr_policy.c */ ++extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; ++extern struct au_wbr_create_operations au_wbr_create_ops[]; ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_sbinfo *au_sbi(struct super_block *sb) ++{ ++ return sb->s_fs_info; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++void au_export_init(struct super_block *sb); ++ ++static inline int au_test_nfsd(struct task_struct *tsk) ++{ ++ return !tsk->mm && !strcmp(tsk->comm, "nfsd"); ++} ++ ++int au_xigen_inc(struct inode *inode); ++int 
au_xigen_new(struct inode *inode); ++int au_xigen_set(struct super_block *sb, struct file *base); ++void au_xigen_clr(struct super_block *sb); ++ ++static inline int au_busy_or_stale(void) ++{ ++ if (!au_test_nfsd(current)) ++ return -EBUSY; ++ return -ESTALE; ++} ++#else ++static inline void au_export_init(struct super_block *sb) ++{ ++ /* nothing */ ++} ++ ++static inline int au_test_nfsd(struct task_struct *tsk) ++{ ++ return 0; ++} ++ ++static inline int au_xigen_inc(struct inode *inode) ++{ ++ return 0; ++} ++ ++static inline int au_xigen_new(struct inode *inode) ++{ ++ return 0; ++} ++ ++static inline int au_xigen_set(struct super_block *sb, struct file *base) ++{ ++ return 0; ++} ++ ++static inline void au_xigen_clr(struct super_block *sb) ++{ ++ /* empty */ ++} ++ ++static inline int au_busy_or_stale(void) ++{ ++ return -EBUSY; ++} ++#endif /* CONFIG_AUFS_EXPORT */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo) ++{ ++ /* ++ * This function is a dynamic '__init' function actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++#ifdef CONFIG_DEBUG_FS ++ sbinfo->si_dbgaufs = NULL; ++ sbinfo->si_dbgaufs_xib = NULL; ++#ifdef CONFIG_AUFS_EXPORT ++ sbinfo->si_dbgaufs_xigen = NULL; ++#endif ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock superblock. mainly for entry point functions */ ++/* ++ * si_noflush_read_lock, si_noflush_write_lock, ++ * si_read_unlock, si_write_unlock, si_downgrade_lock ++ */ ++AuSimpleLockRwsemFuncs(si_noflush, struct super_block *sb, ++ &au_sbi(sb)->si_rwsem); ++AuSimpleUnlockRwsemFuncs(si, struct super_block *sb, &au_sbi(sb)->si_rwsem); ++ ++#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) ++#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) ++#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) ++ ++static inline void si_read_lock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ si_noflush_read_lock(sb); ++} ++ ++static inline void si_write_lock(struct super_block *sb) ++{ ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ si_noflush_write_lock(sb); ++} ++ ++static inline int si_read_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_read_trylock(sb); ++} ++ ++static inline int si_write_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_write_trylock(sb); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline aufs_bindex_t au_sbend(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_bend; ++} ++ ++static inline unsigned int au_mntflags(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_mntflags; ++} ++ ++static inline unsigned int au_sigen(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_generation; ++} ++ ++static inline struct au_branch *au_sbr(struct super_block *sb, ++ aufs_bindex_t bindex) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_branch[0 + bindex]; ++} ++ ++static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) ++{ ++ SiMustWriteLock(sb); ++ au_sbi(sb)->si_xino_brid = brid; ++} ++ ++static 
inline aufs_bindex_t au_xino_brid(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_xino_brid; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SUPER_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/sysaufs.c linux-2.6.31/fs/aufs/sysaufs.c +--- linux-2.6.31-vanilla/fs/aufs/sysaufs.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/sysaufs.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,104 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface and lifetime management ++ * they are necessary regardless of whether sysfs is disabled. ++ */ ++ ++#include <linux/fs.h> ++#include <linux/random.h> ++#include <linux/sysfs.h> ++#include "aufs.h" ++ ++unsigned long sysaufs_si_mask; ++struct kset *sysaufs_ket; ++ ++#define AuSiAttr(_name) { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = sysaufs_si_##_name, \ ++} ++ ++static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); ++struct attribute *sysaufs_si_attrs[] = { ++ &sysaufs_si_attr_xi_path.attr, ++ NULL, ++}; ++ ++static struct sysfs_ops au_sbi_ops = { ++ .show = sysaufs_si_show ++}; ++ ++static struct kobj_type au_sbi_ktype = { ++ .release = au_si_free, ++ .sysfs_ops = &au_sbi_ops, ++ .default_attrs = sysaufs_si_attrs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ sbinfo->si_kobj.kset = sysaufs_ket; ++ /* cf. sysaufs_name() */ ++ err = kobject_init_and_add ++ (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_ket->kobj*/NULL, ++ SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); ++ ++ dbgaufs_si_null(sbinfo); ++ if (!err) { ++ err = dbgaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ kobject_put(&sbinfo->si_kobj); ++ } ++ return err; ++} ++ ++void sysaufs_fin(void) ++{ ++ dbgaufs_fin(); ++ sysfs_remove_group(&sysaufs_ket->kobj, sysaufs_attr_group); ++ kset_unregister(sysaufs_ket); ++} ++ ++int __init sysaufs_init(void) ++{ ++ int err; ++ ++ do { ++ get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); ++ } while (!sysaufs_si_mask); ++ ++ sysaufs_ket = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); ++ err = PTR_ERR(sysaufs_ket); ++ if (IS_ERR(sysaufs_ket)) ++ goto out; ++ err = sysfs_create_group(&sysaufs_ket->kobj, sysaufs_attr_group); ++ if (unlikely(err)) { ++ kset_unregister(sysaufs_ket); ++ goto out; ++ } ++ ++ err = dbgaufs_init(); ++ if (unlikely(err)) ++ sysaufs_fin(); ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/sysaufs.h linux-2.6.31/fs/aufs/sysaufs.h +--- linux-2.6.31-vanilla/fs/aufs/sysaufs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/sysaufs.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,120 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface and mount lifetime management ++ */ ++ ++#ifndef __SYSAUFS_H__ ++#define __SYSAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/sysfs.h> ++#include <linux/aufs_type.h> ++#include "module.h" ++ ++struct super_block; ++struct au_sbinfo; ++ ++struct sysaufs_si_attr { ++ struct attribute attr; ++ int (*show)(struct seq_file *seq, struct super_block *sb); ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sysaufs.c */ ++extern unsigned long sysaufs_si_mask; ++extern struct kset *sysaufs_ket; ++extern struct attribute *sysaufs_si_attrs[]; ++int sysaufs_si_init(struct au_sbinfo *sbinfo); ++int __init sysaufs_init(void); ++void sysaufs_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* some people don't like to show a pointer in the kernel */ ++static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) ++{ ++ return sysaufs_si_mask ^ (unsigned long)sbinfo; ++} ++ ++#define SysaufsSiNamePrefix "si_" ++#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) ++static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) ++{ ++ snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", ++ sysaufs_si_id(sbinfo)); ++} ++ ++struct au_branch; ++#ifdef CONFIG_SYSFS ++/* sysfs.c */ ++extern struct attribute_group *sysaufs_attr_group; ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf); ++ ++void sysaufs_br_init(struct au_branch *br); ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++ ++#define sysaufs_brs_init() do {} while (0) ++ ++#else ++#define sysaufs_attr_group NULL ++ ++static inline ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) ++{ ++ return 0; ++} ++ ++static inline ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf) ++{ ++ return 0; ++} ++ ++static inline void sysaufs_br_init(struct au_branch *br) ++{ ++ /* empty */ ++} ++ ++static inline void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ /* nothing */ ++} ++ ++static inline void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ /* nothing */ ++} ++ ++static inline void sysaufs_brs_init(void) ++{ ++ sysaufs_brs = 0; ++} ++ ++#endif /* CONFIG_SYSFS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __SYSAUFS_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/sysfs.c linux-2.6.31/fs/aufs/sysfs.c +--- linux-2.6.31-vanilla/fs/aufs/sysfs.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/sysfs.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,210 @@ ++/* ++ * 
Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface ++ */ ++ ++#include <linux/fs.h> ++#include <linux/module.h> ++#include <linux/seq_file.h> ++#include <linux/sysfs.h> ++#include "aufs.h" ++ ++static struct attribute *au_attr[] = { ++ NULL, /* need to NULL terminate the list of attributes */ ++}; ++ ++static struct attribute_group sysaufs_attr_group_body = { ++ .attrs = au_attr ++}; ++ ++struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) ++{ ++ int err; ++ ++ SiMustAnyLock(sb); ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ err = au_xino_path(seq, au_sbi(sb)->si_xib); ++ seq_putc(seq, '\n'); ++ } ++ return err; ++} ++ ++/* ++ * the lifetime of branch is independent from the entry under sysfs. ++ * sysfs handles the lifetime of the entry, and never call ->show() after it is ++ * unlinked. ++ */ ++static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, ++ aufs_bindex_t bindex) ++{ ++ struct path path; ++ struct dentry *root; ++ struct au_branch *br; ++ ++ AuDbg("b%d\n", bindex); ++ ++ root = sb->s_root; ++ di_read_lock_parent(root, !AuLock_IR); ++ br = au_sbr(sb, bindex); ++ path.mnt = br->br_mnt; ++ path.dentry = au_h_dptr(root, bindex); ++ au_seq_path(seq, &path); ++ di_read_unlock(root, !AuLock_IR); ++ seq_printf(seq, "=%s\n", au_optstr_br_perm(br->br_perm)); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct seq_file *au_seq(char *p, ssize_t len) ++{ ++ struct seq_file *seq; ++ ++ seq = kzalloc(sizeof(*seq), GFP_NOFS); ++ if (seq) { ++ /* mutex_init(&seq.lock); */ ++ seq->buf = p; ++ seq->size = len; ++ return seq; /* success */ ++ } ++ ++ seq = ERR_PTR(-ENOMEM); ++ return seq; ++} ++ ++#define SysaufsBr_PREFIX "br" ++ ++/* todo: file size may exceed PAGE_SIZE */ ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf) ++{ ++ ssize_t err; ++ long l; ++ aufs_bindex_t bend; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct seq_file *seq; ++ char *name; ++ struct attribute **cattr; ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ ++ seq = au_seq(buf, PAGE_SIZE); ++ err = PTR_ERR(seq); ++ if (IS_ERR(seq)) ++ goto out; ++ ++ name = (void *)attr->name; ++ cattr = sysaufs_si_attrs; ++ while (*cattr) { ++ if (!strcmp(name, (*cattr)->name)) { ++ err = container_of(*cattr, struct sysaufs_si_attr, attr) ++ ->show(seq, sb); ++ goto out_seq; ++ } ++ cattr++; ++ } ++ ++ bend = au_sbend(sb); ++ if (!strncmp(name, SysaufsBr_PREFIX, sizeof(SysaufsBr_PREFIX) - 1)) 
{ ++ name += sizeof(SysaufsBr_PREFIX) - 1; ++ err = strict_strtol(name, 10, &l); ++ if (!err) { ++ if (l <= bend) ++ err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l); ++ else ++ err = -ENOENT; ++ } ++ goto out_seq; ++ } ++ BUG(); ++ ++ out_seq: ++ if (!err) { ++ err = seq->count; ++ /* sysfs limit */ ++ if (unlikely(err == PAGE_SIZE)) ++ err = -EFBIG; ++ } ++ kfree(seq); ++ out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void sysaufs_br_init(struct au_branch *br) ++{ ++ br->br_attr.name = br->br_name; ++ br->br_attr.mode = S_IRUGO; ++ br->br_attr.owner = THIS_MODULE; ++} ++ ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_branch *br; ++ struct kobject *kobj; ++ aufs_bindex_t bend; ++ ++ dbgaufs_brs_del(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ sysfs_remove_file(kobj, &br->br_attr); ++ } ++} ++ ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ aufs_bindex_t bend; ++ struct kobject *kobj; ++ struct au_branch *br; ++ ++ dbgaufs_brs_add(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ snprintf(br->br_name, sizeof(br->br_name), SysaufsBr_PREFIX ++ "%d", bindex); ++ err = sysfs_create_file(kobj, &br->br_attr); ++ if (unlikely(err)) ++ AuWarn("failed %s under sysfs(%d)\n", br->br_name, err); ++ } ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/sysrq.c linux-2.6.31/fs/aufs/sysrq.c +--- linux-2.6.31-vanilla/fs/aufs/sysrq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/sysrq.c 2009-09-16 13:55:29.000000000 +0200 +@@ -0,0 +1,115 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * magic sysrq handler ++ */ ++ ++#include <linux/fs.h> ++#include <linux/module.h> ++#include <linux/moduleparam.h> ++/* #include <linux/sysrq.h> */ ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void sysrq_sb(struct super_block *sb) ++{ ++ char *plevel; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ plevel = au_plevel; ++ au_plevel = KERN_WARNING; ++ au_debug(1); ++ ++ sbinfo = au_sbi(sb); ++ pr_warning("si=%lx\n", sysaufs_si_id(sbinfo)); ++ pr_warning(AUFS_NAME ": superblock\n"); ++ au_dpri_sb(sb); ++ pr_warning(AUFS_NAME ": root dentry\n"); ++ au_dpri_dentry(sb->s_root); ++ pr_warning(AUFS_NAME ": root inode\n"); ++ au_dpri_inode(sb->s_root->d_inode); ++#if 0 ++ struct inode *i; ++ pr_warning(AUFS_NAME ": isolated inode\n"); ++ list_for_each_entry(i, &sb->s_inodes, i_sb_list) ++ if (list_empty(&i->i_dentry)) ++ au_dpri_inode(i); ++#endif ++ pr_warning(AUFS_NAME ": files\n"); ++ list_for_each_entry(file, &sb->s_files, f_u.fu_list) ++ if (!special_file(file->f_dentry->d_inode->i_mode)) ++ au_dpri_file(file); ++ ++ au_plevel = plevel; ++ au_debug(0); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* module parameter */ ++static char *aufs_sysrq_key = "a"; ++module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); ++MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); ++ ++static void au_sysrq(int key __maybe_unused, ++ struct tty_struct *tty __maybe_unused) ++{ ++ struct kobject *kobj; ++ struct au_sbinfo *sbinfo; ++ ++ /* spin_lock(&sysaufs_ket->list_lock); */ ++ list_for_each_entry(kobj, &sysaufs_ket->list, entry) { ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ sysrq_sb(sbinfo->si_sb); ++ } ++ /* spin_unlock(&sysaufs_ket->list_lock); */ ++} ++ ++static struct sysrq_key_op au_sysrq_op = { ++ .handler = au_sysrq, ++ .help_msg = "Aufs", ++ .action_msg = "Aufs", ++ .enable_mask = SYSRQ_ENABLE_DUMP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int __init au_sysrq_init(void) ++{ ++ int err; ++ char key; ++ ++ err = -1; ++ key = *aufs_sysrq_key; ++ if ('a' <= key && key <= 'z') ++ err = register_sysrq_key(key, &au_sysrq_op); ++ if (unlikely(err)) ++ AuErr("err %d, sysrq=%c\n", err, key); ++ return err; ++} ++ ++void au_sysrq_fin(void) ++{ ++ int err; ++ err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); ++ if (unlikely(err)) ++ AuErr("err %d (ignored)\n", err); ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/vdir.c linux-2.6.31/fs/aufs/vdir.c +--- linux-2.6.31-vanilla/fs/aufs/vdir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/vdir.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,879 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * virtual or vertical directory ++ */ ++ ++#include <linux/hash.h> ++#include "aufs.h" ++ ++static unsigned int calc_size(int nlen) ++{ ++ BUILD_BUG_ON(sizeof(ino_t) != sizeof(long)); ++ return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); ++} ++ ++static int set_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) { ++ p->de->de_str.len = 0; ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -1; /* error */ ++} ++ ++/* returns true or false */ ++static int is_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) ++ return !p->de->de_str.len; ++ return 1; ++} ++ ++static unsigned char *last_deblk(struct au_vdir *vdir) ++{ ++ return vdir->vd_deblk[vdir->vd_nblk - 1]; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* estimate the appropriate size for name hash table */ ++unsigned int au_rdhash_est(loff_t sz) ++{ ++ unsigned int n; ++ ++ n = UINT_MAX; ++ sz >>= 10; ++ if (sz < n) ++ n = sz; ++ if (sz < AUFS_RDHASH_DEF) ++ n = AUFS_RDHASH_DEF; ++ /* AuInfo("n %u\n", n); */ ++ return n; ++} ++ ++/* ++ * the allocated memory has to be freed by ++ * au_nhash_wh_free() or au_nhash_de_free(). ++ */ ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) ++{ ++ struct hlist_head *head; ++ unsigned int u; ++ ++ head = kmalloc(sizeof(*nhash->nh_head) * num_hash, gfp); ++ if (head) { ++ nhash->nh_num = num_hash; ++ nhash->nh_head = head; ++ for (u = 0; u < num_hash; u++) ++ INIT_HLIST_HEAD(head++); ++ return 0; /* success */ ++ } ++ ++ return -ENOMEM; ++} ++ ++static void nhash_count(struct hlist_head *head) ++{ ++#if 0 ++ unsigned long n; ++ struct hlist_node *pos; ++ ++ n = 0; ++ hlist_for_each(pos, head) ++ n++; ++ AuInfo("%lu\n", n); ++#endif ++} ++ ++static void au_nhash_wh_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos, *node; ++ ++ hlist_for_each_entry_safe(tpos, pos, node, head, wh_hash) { ++ /* hlist_del(pos); */ ++ kfree(tpos); ++ } ++} ++ ++static void au_nhash_de_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_dehstr *tpos; ++ struct hlist_node *pos, *node; ++ ++ hlist_for_each_entry_safe(tpos, pos, node, head, hash) { ++ /* hlist_del(pos); */ ++ au_cache_free_dehstr(tpos); ++ } ++} ++ ++static void au_nhash_do_free(struct au_nhash *nhash, ++ void (*free)(struct hlist_head *head)) ++{ ++ unsigned int n; ++ struct hlist_head *head; ++ ++ n = nhash->nh_num; ++ if (!n) ++ return; ++ ++ head = nhash->nh_head; ++ while (n-- > 0) { ++ nhash_count(head); ++ free(head++); ++ } ++ kfree(nhash->nh_head); ++} ++ ++void au_nhash_wh_free(struct au_nhash *whlist) ++{ ++ au_nhash_do_free(whlist, au_nhash_wh_do_free); ++} ++ ++static void au_nhash_de_free(struct au_nhash *delist) ++{ ++ au_nhash_do_free(delist, au_nhash_de_do_free); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit) ++{ ++ int num; ++ unsigned int u, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ ++ num = 0; ++ n = 
whlist->nh_num; ++ head = whlist->nh_head; ++ for (u = 0; u < n; u++, head++) ++ hlist_for_each_entry(tpos, pos, head, wh_hash) ++ if (tpos->wh_bindex == btgt && ++num > limit) ++ return 1; ++ return 0; ++} ++ ++static struct hlist_head *au_name_hash(struct au_nhash *nhash, ++ unsigned char *name, ++ unsigned int len) ++{ ++ unsigned int v; ++ /* const unsigned int magic_bit = 12; */ ++ ++ AuDebugOn(!nhash->nh_num || !nhash->nh_head); ++ ++ v = 0; ++ while (len--) ++ v += *name++; ++ /* v = hash_long(v, magic_bit); */ ++ v %= nhash->nh_num; ++ return nhash->nh_head + v; ++} ++ ++static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, ++ int nlen) ++{ ++ return str->len == nlen && !memcmp(str->name, name, nlen); ++} ++ ++/* returns found or not */ ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(whlist, name, nlen); ++ hlist_for_each_entry(tpos, pos, head, wh_hash) { ++ str = &tpos->wh_str; ++ AuDbg("%.*s\n", str->len, str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++/* returns found(true) or not */ ++static int test_known(struct au_nhash *delist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_dehstr *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(delist, name, nlen); ++ hlist_for_each_entry(tpos, pos, head, hash) { ++ str = tpos->str; ++ AuDbg("%.*s\n", str->len, str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, ++ unsigned char d_type) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ wh->wh_ino = ino; ++ wh->wh_type = d_type; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh) ++{ ++ int err; ++ struct au_vdir_destr *str; ++ struct au_vdir_wh *wh; ++ ++ AuDbg("%.*s\n", nlen, name); ++ AuDebugOn(!whlist->nh_num || !whlist->nh_head); ++ ++ err = -ENOMEM; ++ wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); ++ if (unlikely(!wh)) ++ goto out; ++ ++ err = 0; ++ wh->wh_bindex = bindex; ++ if (shwh) ++ au_shwh_init_wh(wh, ino, d_type); ++ str = &wh->wh_str; ++ str->len = nlen; ++ memcpy(str->name, name, nlen); ++ hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); ++ /* smp_mb(); */ ++ ++ out: ++ return err; ++} ++ ++static int append_deblk(struct au_vdir *vdir) ++{ ++ int err; ++ unsigned long ul; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, deblk_end; ++ unsigned char **o; ++ ++ err = -ENOMEM; ++ o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), ++ GFP_NOFS); ++ if (unlikely(!o)) ++ goto out; ++ ++ vdir->vd_deblk = o; ++ p.deblk = kmalloc(deblk_sz, GFP_NOFS); ++ if (p.deblk) { ++ ul = vdir->vd_nblk++; ++ vdir->vd_deblk[ul] = p.deblk; ++ vdir->vd_last.ul = ul; ++ vdir->vd_last.p.deblk = p.deblk; ++ deblk_end.deblk = p.deblk + deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ } ++ ++ out: ++ return err; ++} ++ ++static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, ++ unsigned int d_type, struct au_nhash *delist) ++{ ++ int err; ++ unsigned int sz; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, *room, 
deblk_end; ++ struct au_vdir_dehstr *dehstr; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ room = &vdir->vd_last.p; ++ AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk ++ || !is_deblk_end(room, &deblk_end)); ++ ++ sz = calc_size(nlen); ++ if (unlikely(sz > deblk_end.deblk - room->deblk)) { ++ err = append_deblk(vdir); ++ if (unlikely(err)) ++ goto out; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ /* smp_mb(); */ ++ AuDebugOn(room->deblk != p.deblk); ++ } ++ ++ err = -ENOMEM; ++ dehstr = au_cache_alloc_dehstr(); ++ if (unlikely(!dehstr)) ++ goto out; ++ ++ dehstr->str = &room->de->de_str; ++ hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); ++ room->de->de_ino = ino; ++ room->de->de_type = d_type; ++ room->de->de_str.len = nlen; ++ memcpy(room->de->de_str.name, name, nlen); ++ ++ err = 0; ++ room->deblk += sz; ++ if (unlikely(set_deblk_end(room, &deblk_end))) ++ err = append_deblk(vdir); ++ /* smp_mb(); */ ++ ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_vdir_free(struct au_vdir *vdir) ++{ ++ unsigned char **deblk; ++ ++ deblk = vdir->vd_deblk; ++ while (vdir->vd_nblk--) ++ kfree(*deblk++); ++ kfree(vdir->vd_deblk); ++ au_cache_free_vdir(vdir); ++} ++ ++static struct au_vdir *alloc_vdir(struct file *file) ++{ ++ struct au_vdir *vdir; ++ struct super_block *sb; ++ int err; ++ ++ sb = file->f_dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ err = -ENOMEM; ++ vdir = au_cache_alloc_vdir(); ++ if (unlikely(!vdir)) ++ goto out; ++ ++ vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); ++ if (unlikely(!vdir->vd_deblk)) ++ goto out_free; ++ ++ vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; ++ if (!vdir->vd_deblk_sz) { ++ /* estimate the appropriate size for deblk */ ++ vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); ++ /* AuInfo("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ ++ } ++ vdir->vd_nblk = 0; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ err = append_deblk(vdir); ++ if (!err) ++ return vdir; /* success */ ++ ++ kfree(vdir->vd_deblk); ++ ++ out_free: ++ au_cache_free_vdir(vdir); ++ out: ++ vdir = ERR_PTR(err); ++ return vdir; ++} ++ ++static int reinit_vdir(struct au_vdir *vdir) ++{ ++ int err; ++ union au_vdir_deblk_p p, deblk_end; ++ ++ while (vdir->vd_nblk > 1) { ++ kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); ++ /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ ++ vdir->vd_nblk--; ++ } ++ p.deblk = vdir->vd_deblk[0]; ++ deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ /* keep vd_dblk_sz */ ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ /* smp_mb(); */ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuFillVdir_CALLED 1 ++#define AuFillVdir_WHABLE (1 << 1) ++#define AuFillVdir_SHWH (1 << 2) ++#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) ++#define au_fset_fillvdir(flags, name) { (flags) |= AuFillVdir_##name; } ++#define au_fclr_fillvdir(flags, name) { (flags) &= ~AuFillVdir_##name; } ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuFillVdir_SHWH ++#define AuFillVdir_SHWH 0 ++#endif ++ ++struct fillvdir_arg { ++ struct file *file; ++ struct au_vdir *vdir; ++ struct au_nhash delist; ++ struct au_nhash whlist; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ int err; ++}; ++ ++static int fillvdir(void *__arg, const char 
*__name, int nlen, ++ loff_t offset __maybe_unused, u64 h_ino, ++ unsigned int d_type) ++{ ++ struct fillvdir_arg *arg = __arg; ++ char *name = (void *)__name; ++ struct super_block *sb; ++ ino_t ino; ++ const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); ++ ++ arg->err = 0; ++ sb = arg->file->f_dentry->d_sb; ++ au_fset_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (nlen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (test_known(&arg->delist, name, nlen) ++ || au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already exists or whiteouted */ ++ ++ sb = arg->file->f_dentry->d_sb; ++ arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); ++ if (!arg->err) ++ arg->err = append_de(arg->vdir, name, nlen, ino, ++ d_type, &arg->delist); ++ } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { ++ name += AUFS_WH_PFX_LEN; ++ nlen -= AUFS_WH_PFX_LEN; ++ if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already whiteouted */ ++ ++ if (shwh) ++ arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, ++ &ino); ++ if (!arg->err) ++ arg->err = au_nhash_append_wh ++ (&arg->whlist, name, nlen, ino, d_type, ++ arg->bindex, shwh); ++ } ++ ++ out: ++ if (!arg->err) ++ arg->vdir->vd_jiffy = jiffies; ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir, ++ struct au_nhash *whlist, struct au_nhash *delist) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ int err; ++ unsigned int nh, u; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos, *n; ++ char *p, *o; ++ struct au_vdir_destr *destr; ++ ++ AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); ++ ++ err = -ENOMEM; ++ o = p = __getname(); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ nh = whlist->nh_num; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ for (u = 0; u < nh; u++) { ++ head = whlist->nh_head + u; ++ hlist_for_each_entry_safe(tpos, pos, n, head, wh_hash) { ++ destr = &tpos->wh_str; ++ memcpy(p, destr->name, destr->len); ++ err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, ++ tpos->wh_ino, tpos->wh_type, delist); ++ if (unlikely(err)) ++ break; ++ } ++ } ++ ++ __putname(o); ++ ++ out: ++ AuTraceErr(err); ++ return err; ++#else ++ return 0; ++#endif ++} ++ ++static int au_do_read_vdir(struct fillvdir_arg *arg) ++{ ++ int err; ++ unsigned int rdhash; ++ loff_t offset; ++ aufs_bindex_t bend, bindex, bstart; ++ unsigned char shwh; ++ struct file *hf, *file; ++ struct super_block *sb; ++ ++ file = arg->file; ++ sb = file->f_dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); ++ err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out_delist; ++ ++ err = 0; ++ arg->flags = 0; ++ shwh = 0; ++ if (au_opt_test(au_mntflags(sb), SHWH)) { ++ shwh = 1; ++ au_fset_fillvdir(arg->flags, SHWH); ++ } ++ bstart = au_fbstart(file); ++ bend = au_fbend(file); ++ for (bindex = bstart; !err && bindex <= bend; bindex++) { ++ hf = au_h_fptr(file, bindex); ++ if (!hf) ++ continue; ++ ++ offset = vfsub_llseek(hf, 0, SEEK_SET); ++ err = offset; ++ if (unlikely(offset)) ++ break; ++ ++ arg->bindex = bindex; ++ au_fclr_fillvdir(arg->flags, WHABLE); ++ if (shwh ++ || (bindex != bend ++ && au_br_whable(au_sbr_perm(sb, bindex)))) ++ 
au_fset_fillvdir(arg->flags, WHABLE); ++ do { ++ arg->err = 0; ++ au_fclr_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(hf, fillvdir, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); ++ } ++ ++ if (!err && shwh) ++ err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); ++ ++ au_nhash_wh_free(&arg->whlist); ++ ++ out_delist: ++ au_nhash_de_free(&arg->delist); ++ out: ++ return err; ++} ++ ++static int read_vdir(struct file *file, int may_read) ++{ ++ int err; ++ unsigned long expire; ++ unsigned char do_read; ++ struct fillvdir_arg arg; ++ struct inode *inode; ++ struct au_vdir *vdir, *allocated; ++ ++ err = 0; ++ inode = file->f_dentry->d_inode; ++ IMustLock(inode); ++ SiMustAnyLock(inode->i_sb); ++ ++ allocated = NULL; ++ do_read = 0; ++ expire = au_sbi(inode->i_sb)->si_rdcache; ++ vdir = au_ivdir(inode); ++ if (!vdir) { ++ do_read = 1; ++ vdir = alloc_vdir(file); ++ err = PTR_ERR(vdir); ++ if (IS_ERR(vdir)) ++ goto out; ++ err = 0; ++ allocated = vdir; ++ } else if (may_read ++ && (inode->i_version != vdir->vd_version ++ || time_after(jiffies, vdir->vd_jiffy + expire))) { ++ do_read = 1; ++ err = reinit_vdir(vdir); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ if (!do_read) ++ return 0; /* success */ ++ ++ arg.file = file; ++ arg.vdir = vdir; ++ err = au_do_read_vdir(&arg); ++ if (!err) { ++ /* file->f_pos = 0; */ ++ vdir->vd_version = inode->i_version; ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ if (allocated) ++ au_set_ivdir(inode, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++ out: ++ return err; ++} ++ ++static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) ++{ ++ int err, rerr; ++ unsigned long ul, n; ++ const unsigned int deblk_sz = src->vd_deblk_sz; ++ ++ AuDebugOn(tgt->vd_nblk != 1); ++ ++ err = -ENOMEM; ++ if (tgt->vd_nblk < src->vd_nblk) { ++ unsigned char **p; ++ ++ p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, ++ GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk = p; ++ } ++ ++ if (tgt->vd_deblk_sz != deblk_sz) { ++ unsigned char *p; ++ ++ tgt->vd_deblk_sz = deblk_sz; ++ p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk[0] = p; ++ } ++ memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); ++ tgt->vd_version = src->vd_version; ++ tgt->vd_jiffy = src->vd_jiffy; ++ ++ n = src->vd_nblk; ++ for (ul = 1; ul < n; ul++) { ++ tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, ++ GFP_NOFS); ++ if (unlikely(!tgt->vd_deblk[ul])) ++ goto out; ++ tgt->vd_nblk++; ++ } ++ tgt->vd_nblk = n; ++ tgt->vd_last.ul = tgt->vd_last.ul; ++ tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; ++ tgt->vd_last.p.deblk += src->vd_last.p.deblk ++ - src->vd_deblk[src->vd_last.ul]; ++ /* smp_mb(); */ ++ return 0; /* success */ ++ ++ out: ++ rerr = reinit_vdir(tgt); ++ BUG_ON(rerr); ++ return err; ++} ++ ++int au_vdir_init(struct file *file) ++{ ++ int err; ++ struct inode *inode; ++ struct au_vdir *vdir_cache, *allocated; ++ ++ err = read_vdir(file, !file->f_pos); ++ if (unlikely(err)) ++ goto out; ++ ++ allocated = NULL; ++ vdir_cache = au_fvdir_cache(file); ++ if (!vdir_cache) { ++ vdir_cache = alloc_vdir(file); ++ err = PTR_ERR(vdir_cache); ++ if (IS_ERR(vdir_cache)) ++ goto out; ++ allocated = vdir_cache; ++ } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { ++ err = reinit_vdir(vdir_cache); ++ if (unlikely(err)) ++ goto out; ++ } else ++ return 0; /* 
success */ ++ ++ inode = file->f_dentry->d_inode; ++ err = copy_vdir(vdir_cache, au_ivdir(inode)); ++ if (!err) { ++ file->f_version = inode->i_version; ++ if (allocated) ++ au_set_fvdir_cache(file, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++ out: ++ return err; ++} ++ ++static loff_t calc_offset(struct au_vdir *vdir) ++{ ++ loff_t offset; ++ union au_vdir_deblk_p p; ++ ++ p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; ++ offset = vdir->vd_last.p.deblk - p.deblk; ++ offset += vdir->vd_deblk_sz * vdir->vd_last.ul; ++ return offset; ++} ++ ++/* returns true or false */ ++static int seek_vdir(struct file *file) ++{ ++ int valid; ++ unsigned int deblk_sz; ++ unsigned long ul, n; ++ loff_t offset; ++ union au_vdir_deblk_p p, deblk_end; ++ struct au_vdir *vdir_cache; ++ ++ valid = 1; ++ vdir_cache = au_fvdir_cache(file); ++ offset = calc_offset(vdir_cache); ++ AuDbg("offset %lld\n", offset); ++ if (file->f_pos == offset) ++ goto out; ++ ++ vdir_cache->vd_last.ul = 0; ++ vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; ++ if (!file->f_pos) ++ goto out; ++ ++ valid = 0; ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ ul = div64_u64(file->f_pos, deblk_sz); ++ AuDbg("ul %lu\n", ul); ++ if (ul >= vdir_cache->vd_nblk) ++ goto out; ++ ++ n = vdir_cache->vd_nblk; ++ for (; ul < n; ul++) { ++ p.deblk = vdir_cache->vd_deblk[ul]; ++ deblk_end.deblk = p.deblk + deblk_sz; ++ offset = ul; ++ offset *= deblk_sz; ++ while (!is_deblk_end(&p, &deblk_end) && offset < file->f_pos) { ++ unsigned int l; ++ ++ l = calc_size(p.de->de_str.len); ++ offset += l; ++ p.deblk += l; ++ } ++ if (!is_deblk_end(&p, &deblk_end)) { ++ valid = 1; ++ vdir_cache->vd_last.ul = ul; ++ vdir_cache->vd_last.p = p; ++ break; ++ } ++ } ++ ++ out: ++ /* smp_mb(); */ ++ AuTraceErr(!valid); ++ return valid; ++} ++ ++int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err; ++ unsigned int l, deblk_sz; ++ union au_vdir_deblk_p deblk_end; ++ struct au_vdir *vdir_cache; ++ struct au_vdir_de *de; ++ ++ vdir_cache = au_fvdir_cache(file); ++ if (!seek_vdir(file)) ++ return 0; ++ ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ while (1) { ++ deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ deblk_end.deblk += deblk_sz; ++ while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { ++ de = vdir_cache->vd_last.p.de; ++ AuDbg("%.*s, off%lld, i%lu, dt%d\n", ++ de->de_str.len, de->de_str.name, file->f_pos, ++ (unsigned long)de->de_ino, de->de_type); ++ err = filldir(dirent, de->de_str.name, de->de_str.len, ++ file->f_pos, de->de_ino, de->de_type); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ /* todo: ignore the error caused by udba? */ ++ /* return err; */ ++ return 0; ++ } ++ ++ l = calc_size(de->de_str.len); ++ vdir_cache->vd_last.p.deblk += l; ++ file->f_pos += l; ++ } ++ if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { ++ vdir_cache->vd_last.ul++; ++ vdir_cache->vd_last.p.deblk ++ = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ file->f_pos = deblk_sz * vdir_cache->vd_last.ul; ++ continue; ++ } ++ break; ++ } ++ ++ /* smp_mb(); */ ++ return 0; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/vfsub.c linux-2.6.31/fs/aufs/vfsub.c +--- linux-2.6.31-vanilla/fs/aufs/vfsub.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/vfsub.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,755 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#include <linux/ima.h> ++#include <linux/namei.h> ++#include <linux/security.h> ++#include <linux/splice.h> ++#include <linux/uaccess.h> ++#include "aufs.h" ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did) ++{ ++ int err; ++ struct kstat st; ++ struct super_block *h_sb; ++ ++ /* for remote fs, leave work for its getattr or d_revalidate */ ++ /* for bad i_attr fs, handle them in aufs_getattr() */ ++ /* still some fs may acquire i_mutex. we need to skip them */ ++ err = 0; ++ if (!did) ++ did = &err; ++ h_sb = h_path->dentry->d_sb; ++ *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); ++ if (*did) ++ err = vfs_getattr(h_path->mnt, h_path->dentry, &st); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_IMA ++#error IMA is not supported since it does not work well. Wait for their fixing. ++#endif ++ ++struct file *vfsub_dentry_open(struct path *path, int flags, ++ const struct cred *cred) ++{ ++ struct file *file; ++ ++ file = dentry_open(path->dentry, path->mnt, flags, cred); ++ if (IS_ERR(file)) ++ return file; ++ /* as NFSD does, just call ima_..._get() simply after dentry_open */ ++ ima_counts_get(file); ++ return file; ++} ++ ++struct file *vfsub_filp_open(const char *path, int oflags, int mode) ++{ ++ struct file *file; ++ ++ lockdep_off(); ++ file = filp_open(path, oflags, mode); ++ lockdep_on(); ++ if (IS_ERR(file)) ++ goto out; ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ ++ out: ++ return file; ++} ++ ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) ++{ ++ int err; ++ ++ /* lockdep_off(); */ ++ err = kern_path(name, flags, path); ++ /* lockdep_on(); */ ++ if (!err && path->dentry->d_inode) ++ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len) ++{ ++ struct path path = { ++ .mnt = NULL ++ }; ++ ++ /* VFS checks it too, but by WARN_ON_ONCE() */ ++ IMustLock(parent->d_inode); ++ ++ path.dentry = lookup_one_len(name, parent, len); ++ if (IS_ERR(path.dentry)) ++ goto out; ++ if (path.dentry->d_inode) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++ out: ++ return path.dentry; ++} ++ ++struct dentry *vfsub_lookup_hash(struct nameidata *nd) ++{ ++ struct path path = { ++ .mnt = nd->path.mnt ++ }; ++ ++ IMustLock(nd->path.dentry->d_inode); ++ ++ path.dentry = lookup_hash(nd); ++ if (!IS_ERR(path.dentry) && path.dentry->d_inode) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++ return path.dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry 
*vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ struct dentry *d; ++ ++ lockdep_off(); ++ d = lock_rename(d1, d2); ++ lockdep_on(); ++ au_hin_suspend(hdir1); ++ if (hdir1 != hdir2) ++ au_hin_suspend(hdir2); ++ ++ return d; ++} ++ ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ au_hin_resume(hdir1); ++ if (hdir1 != hdir2) ++ au_hin_resume(hdir2); ++ lockdep_off(); ++ unlock_rename(d1, d2); ++ lockdep_on(); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, path->dentry, mode, 0); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ if (au_test_fs_null_nd(dir->i_sb)) ++ err = vfs_create(dir, path->dentry, mode, NULL); ++ else { ++ struct nameidata h_nd; ++ ++ memset(&h_nd, 0, sizeof(h_nd)); ++ h_nd.flags = LOOKUP_CREATE; ++ h_nd.intent.open.flags = O_CREAT ++ | vfsub_fmode_to_uint(FMODE_READ); ++ h_nd.intent.open.create_mode = mode; ++ h_nd.path.dentry = path->dentry->d_parent; ++ h_nd.path.mnt = path->mnt; ++ path_get(&h_nd.path); ++ err = vfs_create(dir, path->dentry, mode, &h_nd); ++ path_put(&h_nd.path); ++ } ++ ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++ out: ++ return err; ++} ++ ++int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_symlink(path, path->dentry, symname); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_symlink(dir, path->dentry, symname); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++ out: ++ return err; ++} ++ ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, path->dentry, mode, dev); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_mknod(dir, path->dentry, mode, dev); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++ out: ++ return err; ++} ++ ++static int au_test_nlink(struct inode *inode) ++{ ++ const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ ++ ++ if (!au_test_fs_no_limit_nlink(inode->i_sb) ++ || inode->i_nlink < link_max) ++ return 0; ++ return -EMLINK; ++} ++ ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ err = au_test_nlink(src_dentry->d_inode); ++ if (unlikely(err)) ++ return err; ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_link(src_dentry, path, path->dentry); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ 
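++ /* the stacked locks here confuse lockdep, so it is muted around the lower vfs_link(), as around rename, rmdir and write elsewhere in this file */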
++ lockdep_off(); ++ err = vfs_link(src_dentry, dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ /* fuse has different memory inode for the same inumber */ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++ out: ++ return err; ++} ++ ++int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct path *path) ++{ ++ int err; ++ struct path tmp = { ++ .mnt = path->mnt ++ }; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ IMustLock(src_dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ tmp.dentry = src_dentry->d_parent; ++ err = security_path_rename(&tmp, src_dentry, path, path->dentry); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rename(src_dir, src_dentry, dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ int did; ++ ++ tmp.dentry = d->d_parent; ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++ out: ++ return err; ++} ++ ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mkdir(path, path->dentry, mode); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_mkdir(dir, path->dentry, mode); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++ out: ++ return err; ++} ++ ++int vfsub_rmdir(struct inode *dir, struct path *path) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_rmdir(path, path->dentry); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rmdir(dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = { ++ .dentry = path->dentry->d_parent, ++ .mnt = path->mnt ++ }; ++ ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ err = vfs_read(file, ubuf, count, ppos); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++/* todo: kernel_read()? 
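probably: kernel_read() wraps the same get_fs()/set_fs(KERNEL_DS) dance around vfs_read(); the open-coded variant below keeps vfsub_read_u(), and with it the lower-inode attribute refresh, in the path. 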
*/ ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_read_u(file, (char __user *)kbuf, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ lockdep_off(); ++ err = vfs_write(file, ubuf, count, ppos); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_write_u(file, (const char __user *)kbuf, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++int vfsub_readdir(struct file *file, filldir_t filldir, void *arg) ++{ ++ int err; ++ ++ lockdep_off(); ++ err = vfs_readdir(file, filldir, arg); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_to(in, ppos, pipe, len, flags); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_from(pipe, out, ppos, len, flags); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++/* cf. 
open.c:do_sys_truncate() and do_sys_ftruncate() */ ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file) ++{ ++ int err; ++ struct inode *h_inode; ++ ++ h_inode = h_path->dentry->d_inode; ++ if (!h_file) { ++ err = mnt_want_write(h_path->mnt); ++ if (err) ++ goto out; ++ err = inode_permission(h_inode, MAY_WRITE); ++ if (err) ++ goto out_mnt; ++ err = get_write_access(h_inode); ++ if (err) ++ goto out_mnt; ++ err = break_lease(h_inode, vfsub_fmode_to_uint(FMODE_WRITE)); ++ if (err) ++ goto out_inode; ++ } ++ ++ err = locks_verify_truncate(h_inode, h_file, length); ++ if (!err) ++ err = security_path_truncate(h_path, length, attr); ++ if (!err) { ++ lockdep_off(); ++ err = do_truncate(h_path->dentry, length, attr, h_file); ++ lockdep_on(); ++ } ++ ++ out_inode: ++ if (!h_file) ++ put_write_access(h_inode); ++ out_mnt: ++ if (!h_file) ++ mnt_drop_write(h_path->mnt); ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_vfsub_mkdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++ int mode; ++}; ++ ++static void au_call_vfsub_mkdir(void *args) ++{ ++ struct au_vfsub_mkdir_args *a = args; ++ *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); ++} ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) ++ err = vfsub_mkdir(dir, path, mode); ++ else { ++ struct au_vfsub_mkdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path, ++ .mode = mode ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++struct au_vfsub_rmdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++}; ++ ++static void au_call_vfsub_rmdir(void *args) ++{ ++ struct au_vfsub_rmdir_args *a = args; ++ *a->errp = vfsub_rmdir(a->dir, a->path); ++} ++ ++int vfsub_sio_rmdir(struct inode *dir, struct path *path) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) ++ err = vfsub_rmdir(dir, path); ++ else { ++ struct au_vfsub_rmdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct notify_change_args { ++ int *errp; ++ struct path *path; ++ struct iattr *ia; ++}; ++ ++static void call_notify_change(void *args) ++{ ++ struct notify_change_args *a = args; ++ struct inode *h_inode; ++ ++ h_inode = a->path->dentry->d_inode; ++ IMustLock(h_inode); ++ ++ *a->errp = -EPERM; ++ if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { ++ lockdep_off(); ++ *a->errp = notify_change(a->path->dentry, a->ia); ++ lockdep_on(); ++ if (!*a->errp) ++ vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ ++ } ++ AuTraceErr(*a->errp); ++} ++ ++int vfsub_notify_change(struct path *path, struct iattr *ia) ++{ ++ int err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia ++ }; ++ ++ call_notify_change(&args); ++ ++ return err; ++} ++ ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia) ++{ ++ int err, wkq_err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia ++ }; ++ ++ wkq_err = au_wkq_wait(call_notify_change, &args); ++ if 
(unlikely(wkq_err)) ++ err = wkq_err; ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct unlink_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++}; ++ ++static void call_unlink(void *args) ++{ ++ struct unlink_args *a = args; ++ struct dentry *d = a->path->dentry; ++ struct inode *h_inode; ++ const int stop_sillyrename = (au_test_nfs(d->d_sb) ++ && atomic_read(&d->d_count) == 1); ++ ++ IMustLock(a->dir); ++ ++ a->path->dentry = d->d_parent; ++ *a->errp = security_path_unlink(a->path, d); ++ a->path->dentry = d; ++ if (unlikely(*a->errp)) ++ return; ++ ++ if (!stop_sillyrename) ++ dget(d); ++ h_inode = d->d_inode; ++ if (h_inode) ++ atomic_inc(&h_inode->i_count); ++ ++ lockdep_off(); ++ *a->errp = vfs_unlink(a->dir, d); ++ lockdep_on(); ++ if (!*a->errp) { ++ struct path tmp = { ++ .dentry = d->d_parent, ++ .mnt = a->path->mnt ++ }; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++ if (!stop_sillyrename) ++ dput(d); ++ if (h_inode) ++ iput(h_inode); ++ ++ AuTraceErr(*a->errp); ++} ++ ++/* ++ * @dir: must be locked. ++ * @dentry: target dentry. ++ */ ++int vfsub_unlink(struct inode *dir, struct path *path, int force) ++{ ++ int err; ++ struct unlink_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path ++ }; ++ ++ if (!force) ++ call_unlink(&args); ++ else { ++ int wkq_err; ++ ++ wkq_err = au_wkq_wait(call_unlink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/vfsub.h linux-2.6.31/fs/aufs/vfsub.h +--- linux-2.6.31-vanilla/fs/aufs/vfsub.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/vfsub.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,172 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#ifndef __AUFS_VFSUB_H__ ++#define __AUFS_VFSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fs.h> ++#include <linux/fs_stack.h> ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for lower inode */ ++/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ ++/* reduce? gave up. 
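each subclass below is handed to mutex_lock_nested() on a lower inode (e.g. AuLsc_I_CHILD while copying down a dir), so parent/child and copyup lock chains stay distinguishable to lockdep. 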
*/ ++enum { ++ AuLsc_I_Begin = I_MUTEX_QUOTA, /* 4 */ ++ AuLsc_I_PARENT, /* lower inode, parent first */ ++ AuLsc_I_PARENT2, /* copyup dirs */ ++ AuLsc_I_PARENT3, /* copyup wh */ ++ AuLsc_I_CHILD, ++ AuLsc_I_CHILD2, ++ AuLsc_I_End ++}; ++ ++/* to debug easier, do not make them inlined functions */ ++#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) ++#define IMustLock(i) MtxMustLock(&(i)->i_mutex) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void vfsub_copy_inode_size(struct inode *inode, ++ struct inode *h_inode) ++{ ++ spin_lock(&inode->i_lock); ++ fsstack_copy_inode_size(inode, h_inode); ++ spin_unlock(&inode->i_lock); ++} ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did); ++struct file *vfsub_filp_open(const char *path, int oflags, int mode); ++struct file *vfsub_dentry_open(struct path *path, int flags, ++ const struct cred *cred); ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len); ++struct dentry *vfsub_lookup_hash(struct nameidata *nd); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_hinode; ++struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode); ++int vfsub_symlink(struct inode *dir, struct path *path, ++ const char *symname); ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, ++ struct path *path); ++int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, ++ struct inode *hdir, struct path *path); ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_rmdir(struct inode *dir, struct path *path); ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_sio_rmdir(struct inode *dir, struct path *path); ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia); ++int vfsub_notify_change(struct path *path, struct iattr *ia); ++int vfsub_unlink(struct inode *dir, struct path *path, int force); ++ ++/* ---------------------------------------------------------------------- */ ++ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++int vfsub_readdir(struct file *file, filldir_t filldir, void *arg); ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file); ++ ++static inline void vfsub_file_accessed(struct file *h_file) ++{ ++ file_accessed(h_file); ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ ++} ++ ++static inline void vfsub_touch_atime(struct vfsmount *h_mnt, ++ struct dentry *h_dentry) ++{ ++ struct path h_path = { 
++ .dentry = h_dentry, ++ .mnt = h_mnt ++ }; ++ touch_atime(h_mnt, h_dentry); ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) ++{ ++ loff_t err; ++ ++ lockdep_off(); ++ err = vfs_llseek(file, offset, origin); ++ lockdep_on(); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dirty workaround for strict type of fmode_t */ ++union vfsub_fmu { ++ fmode_t fm; ++ unsigned int ui; ++}; ++ ++static inline unsigned int vfsub_fmode_to_uint(fmode_t fm) ++{ ++ union vfsub_fmu u = { ++ .fm = fm ++ }; ++ ++ BUILD_BUG_ON(sizeof(u.fm) != sizeof(u.ui)); ++ ++ return u.ui; ++} ++ ++static inline fmode_t vfsub_uint_to_fmode(unsigned int ui) ++{ ++ union vfsub_fmu u = { ++ .ui = ui ++ }; ++ ++ return u.fm; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_VFSUB_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/wbr_policy.c linux-2.6.31/fs/aufs/wbr_policy.c +--- linux-2.6.31-vanilla/fs/aufs/wbr_policy.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/wbr_policy.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,641 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * policies for selecting one among multiple writable branches ++ */ ++ ++#include <linux/statfs.h> ++#include "aufs.h" ++ ++/* subset of cpup_attr() */ ++static noinline_for_stack ++int au_cpdown_attr(struct path *h_path, struct dentry *h_src) ++{ ++ int err, sbits; ++ struct iattr ia; ++ struct inode *h_isrc; ++ ++ h_isrc = h_src->d_inode; ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; ++ ia.ia_mode = h_isrc->i_mode; ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_path->dentry->d_inode, h_isrc); ++ err = vfsub_sio_notify_change(h_path, &ia); ++ ++ /* is this nfs only? 
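a server-side setattr may clear the setuid/setgid bits once the owner changes, so the mode is sent a second time; observed on NFS so far, hence the test. 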
*/ ++ if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_sio_notify_change(h_path, &ia); ++ } ++ ++ return err; ++} ++ ++#define AuCpdown_PARENT_OPQ 1 ++#define AuCpdown_WHED (1 << 1) ++#define AuCpdown_MADE_DIR (1 << 2) ++#define AuCpdown_DIROPQ (1 << 3) ++#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) ++#define au_fset_cpdown(flags, name) { (flags) |= AuCpdown_##name; } ++#define au_fclr_cpdown(flags, name) { (flags) &= ~AuCpdown_##name; } ++ ++struct au_cpdown_dir_args { ++ struct dentry *parent; ++ unsigned int flags; ++}; ++ ++static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, ++ struct au_cpdown_dir_args *a) ++{ ++ int err; ++ struct dentry *opq_dentry; ++ ++ opq_dentry = au_diropq_create(dentry, bdst); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ dput(opq_dentry); ++ au_fset_cpdown(a->flags, DIROPQ); ++ ++ out: ++ return err; ++} ++ ++static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, ++ struct inode *dir, aufs_bindex_t bdst) ++{ ++ int err; ++ struct path h_path; ++ struct au_branch *br; ++ ++ br = au_sbr(dentry->d_sb, bdst); ++ h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ if (h_path.dentry->d_inode) { ++ h_path.mnt = br->br_mnt; ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, ++ dentry); ++ } ++ dput(h_path.dentry); ++ ++ out: ++ return err; ++} ++ ++static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg) ++{ ++ int err, rerr; ++ aufs_bindex_t bend, bopq, bstart; ++ unsigned char parent_opq; ++ struct path h_path; ++ struct dentry *parent; ++ struct inode *h_dir, *h_inode, *inode, *dir; ++ struct au_cpdown_dir_args *args = arg; ++ ++ bstart = au_dbstart(dentry); ++ /* dentry is di-locked */ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ h_dir = h_parent->d_inode; ++ AuDebugOn(h_dir != au_h_iptr(dir, bdst)); ++ IMustLock(h_dir); ++ ++ err = au_lkup_neg(dentry, bdst); ++ if (unlikely(err < 0)) ++ goto out; ++ h_path.dentry = au_h_dptr(dentry, bdst); ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); ++ err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, ++ S_IRWXU | S_IRUGO | S_IXUGO); ++ if (unlikely(err)) ++ goto out_put; ++ au_fset_cpdown(args->flags, MADE_DIR); ++ ++ bend = au_dbend(dentry); ++ bopq = au_dbdiropq(dentry); ++ au_fclr_cpdown(args->flags, WHED); ++ au_fclr_cpdown(args->flags, DIROPQ); ++ if (au_dbwh(dentry) == bdst) ++ au_fset_cpdown(args->flags, WHED); ++ if (!au_ftest_cpdown(args->flags, PARENT_OPQ) && bopq <= bdst) ++ au_fset_cpdown(args->flags, PARENT_OPQ); ++ parent_opq = (au_ftest_cpdown(args->flags, PARENT_OPQ) ++ && args->parent == dentry); ++ h_inode = h_path.dentry->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ if (au_ftest_cpdown(args->flags, WHED)) { ++ err = au_cpdown_dir_opq(dentry, bdst, args); ++ if (unlikely(err)) { ++ mutex_unlock(&h_inode->i_mutex); ++ goto out_dir; ++ } ++ } ++ ++ err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart)); ++ mutex_unlock(&h_inode->i_mutex); ++ if (unlikely(err)) ++ goto out_opq; ++ ++ if (au_ftest_cpdown(args->flags, WHED)) { ++ err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); ++ if (unlikely(err)) ++ goto out_opq; ++ } ++ ++ inode = dentry->d_inode; ++ if (au_ibend(inode) < bdst) ++ au_set_ibend(inode, bdst); ++ 
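++ /* publish the copied-down lower dir: take a reference and install it at bdst */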
au_set_h_iptr(inode, bdst, au_igrab(h_inode), ++ au_hi_flags(inode, /*isdir*/1)); ++ goto out; /* success */ ++ ++ /* revert */ ++ out_opq: ++ if (au_ftest_cpdown(args->flags, DIROPQ)) { ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bdst); ++ mutex_unlock(&h_inode->i_mutex); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing diropq for %.*s b%d (%d)\n", ++ AuDLNPair(dentry), bdst, rerr); ++ err = -EIO; ++ goto out; ++ } ++ } ++ out_dir: ++ if (au_ftest_cpdown(args->flags, MADE_DIR)) { ++ rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing %.*s b%d (%d)\n", ++ AuDLNPair(dentry), bdst, rerr); ++ err = -EIO; ++ } ++ } ++ out_put: ++ au_set_h_dptr(dentry, bdst, NULL); ++ if (au_dbend(dentry) == bdst) ++ au_update_dbend(dentry); ++ out: ++ dput(parent); ++ return err; ++} ++ ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ struct au_cpdown_dir_args args = { ++ .parent = dget_parent(dentry), ++ .flags = 0 ++ }; ++ ++ err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &args); ++ dput(args.parent); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for create */ ++ ++static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) ++ return bindex; ++ return -EROFS; ++} ++ ++/* top down parent */ ++static int au_wbr_create_tdp(struct dentry *dentry, int isdir __maybe_unused) ++{ ++ int err; ++ aufs_bindex_t bstart, bindex; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ err = bstart; ++ if (!au_br_rdonly(au_sbr(sb, bstart))) ++ goto out; ++ ++ err = -EROFS; ++ parent = dget_parent(dentry); ++ for (bindex = au_dbstart(parent); bindex < bstart; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) ++ err = au_wbr_bu(sb, bstart - 1); ++ ++ out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* an exception for the policy other than tdp */ ++static int au_wbr_create_exp(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bdiropq; ++ struct dentry *parent; ++ ++ err = -1; ++ bwh = au_dbwh(dentry); ++ parent = dget_parent(dentry); ++ bdiropq = au_dbdiropq(parent); ++ if (bwh >= 0) { ++ if (bdiropq >= 0) ++ err = min(bdiropq, bwh); ++ else ++ err = bwh; ++ AuDbg("%d\n", err); ++ } else if (bdiropq >= 0) { ++ err = bdiropq; ++ AuDbg("%d\n", err); ++ } ++ dput(parent); ++ ++ if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) ++ err = -1; ++ ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* round robin */ ++static int au_wbr_create_init_rr(struct super_block *sb) ++{ ++ int err; ++ ++ err = au_wbr_bu(sb, au_sbend(sb)); ++ atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ ++ /* smp_mb(); */ ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_rr(struct dentry *dentry, int isdir) ++{ ++ int err, nbr; ++ unsigned int u; ++ aufs_bindex_t bindex, bend; ++ struct super_block *sb; ++ atomic_t *next; ++ ++ err = au_wbr_create_exp(dentry); ++ if 
(err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ next = &au_sbi(sb)->si_wbr_rr_next; ++ bend = au_sbend(sb); ++ nbr = bend + 1; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ if (!isdir) { ++ err = atomic_dec_return(next) + 1; ++ /* modulo for 0 is meaningless */ ++ if (unlikely(!err)) ++ err = atomic_dec_return(next) + 1; ++ } else ++ err = atomic_read(next); ++ AuDbg("%d\n", err); ++ u = err; ++ err = u % nbr; ++ AuDbg("%d\n", err); ++ if (!au_br_rdonly(au_sbr(sb, err))) ++ break; ++ err = -EROFS; ++ } ++ ++ out: ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space */ ++static void au_mfs(struct dentry *dentry) ++{ ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_wbr_mfs *mfs; ++ aufs_bindex_t bindex, bend; ++ int err; ++ unsigned long long b, bavail; ++ /* reduce the stack usage */ ++ struct kstatfs *st; ++ ++ st = kmalloc(sizeof(*st), GFP_NOFS); ++ if (unlikely(!st)) { ++ AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); ++ return; ++ } ++ ++ bavail = 0; ++ sb = dentry->d_sb; ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ MtxMustLock(&mfs->mfs_lock); ++ mfs->mfs_bindex = -EROFS; ++ mfs->mfsrr_bytes = 0; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_rdonly(br)) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ err = vfs_statfs(br->br_mnt->mnt_root, st); ++ if (unlikely(err)) { ++ AuWarn1("failed statfs, b%d, %d\n", bindex, err); ++ continue; ++ } ++ ++ /* when the available size is equal, select the lower one */ ++ BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) ++ || sizeof(b) < sizeof(st->f_bsize)); ++ b = st->f_bavail * st->f_bsize; ++ br->br_wbr->wbr_bytes = b; ++ if (b >= bavail) { ++ bavail = b; ++ mfs->mfs_bindex = bindex; ++ mfs->mfs_jiffy = jiffies; ++ } ++ } ++ ++ mfs->mfsrr_bytes = bavail; ++ AuDbg("b%d\n", mfs->mfs_bindex); ++ kfree(st); ++} ++ ++static int au_wbr_create_mfs(struct dentry *dentry, int isdir __maybe_unused) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_exp(dentry); ++ if (err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) ++ || mfs->mfs_bindex < 0 ++ || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) ++ au_mfs(dentry); ++ mutex_unlock(&mfs->mfs_lock); ++ err = mfs->mfs_bindex; ++ ++ out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfs(struct super_block *sb) ++{ ++ struct au_wbr_mfs *mfs; ++ ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_init(&mfs->mfs_lock); ++ mfs->mfs_jiffy = 0; ++ mfs->mfs_bindex = -EROFS; ++ ++ return 0; ++} ++ ++static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) ++{ ++ mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space and then round robin */ ++static int au_wbr_create_mfsrr(struct dentry *dentry, int isdir) ++{ ++ int err; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_mfs(dentry, isdir); ++ if (err >= 0) { ++ mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) ++ err = au_wbr_create_rr(dentry, isdir); ++ mutex_unlock(&mfs->mfs_lock); ++ } ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfsrr(struct super_block 
*sb) ++{ ++ int err; ++ ++ au_wbr_create_init_mfs(sb); /* ignore */ ++ err = au_wbr_create_init_rr(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* top down parent and most free space */ ++static int au_wbr_create_pmfs(struct dentry *dentry, int isdir) ++{ ++ int err, e2; ++ unsigned long long b; ++ aufs_bindex_t bindex, bstart, bend; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ struct au_branch *br; ++ ++ err = au_wbr_create_tdp(dentry, isdir); ++ if (unlikely(err < 0)) ++ goto out; ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(parent); ++ bend = au_dbtaildir(parent); ++ if (bstart == bend) ++ goto out_parent; /* success */ ++ ++ e2 = au_wbr_create_mfs(dentry, isdir); ++ if (e2 < 0) ++ goto out_parent; /* success */ ++ ++ /* when the available size is equal, select upper one */ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, err); ++ b = br->br_wbr->wbr_bytes; ++ AuDbg("b%d, %llu\n", err, b); ++ ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ br = au_sbr(sb, bindex); ++ if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { ++ b = br->br_wbr->wbr_bytes; ++ err = bindex; ++ AuDbg("b%d, %llu\n", err, b); ++ } ++ } ++ ++ out_parent: ++ dput(parent); ++ out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for copyup */ ++ ++/* top down parent */ ++static int au_wbr_copyup_tdp(struct dentry *dentry) ++{ ++ return au_wbr_create_tdp(dentry, /*isdir, anything is ok*/0); ++} ++ ++/* bottom up parent */ ++static int au_wbr_copyup_bup(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bindex, bstart; ++ struct dentry *parent, *h_parent; ++ struct super_block *sb; ++ ++ err = -EROFS; ++ sb = dentry->d_sb; ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(parent); ++ for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) ++ err = au_wbr_bu(sb, bstart - 1); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* bottom up */ ++static int au_wbr_copyup_bu(struct dentry *dentry) ++{ ++ int err; ++ ++ err = au_wbr_bu(dentry->d_sb, au_dbstart(dentry)); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { ++ [AuWbrCopyup_TDP] = { ++ .copyup = au_wbr_copyup_tdp ++ }, ++ [AuWbrCopyup_BUP] = { ++ .copyup = au_wbr_copyup_bup ++ }, ++ [AuWbrCopyup_BU] = { ++ .copyup = au_wbr_copyup_bu ++ } ++}; ++ ++struct au_wbr_create_operations au_wbr_create_ops[] = { ++ [AuWbrCreate_TDP] = { ++ .create = au_wbr_create_tdp ++ }, ++ [AuWbrCreate_RR] = { ++ .create = au_wbr_create_rr, ++ .init = au_wbr_create_init_rr ++ }, ++ [AuWbrCreate_MFS] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSV] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRR] = { ++ .create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRRV] = { ++ 
.create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFS] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFSV] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ } ++}; +diff -Nur linux-2.6.31-vanilla/fs/aufs/whout.c linux-2.6.31/fs/aufs/whout.c +--- linux-2.6.31-vanilla/fs/aufs/whout.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/whout.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,1052 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#include <linux/fs.h> ++#include "aufs.h" ++ ++#define WH_MASK S_IRUGO ++ ++/* ++ * If a directory contains this file, then it is opaque. We start with the ++ * .wh. flag so that it is blocked by lookup. ++ */ ++static struct qstr diropq_name = { ++ .name = AUFS_WH_DIROPQ, ++ .len = sizeof(AUFS_WH_DIROPQ) - 1 ++}; ++ ++/* ++ * generate whiteout name, which is NOT terminated by NULL. ++ * @name: original d_name.name ++ * @len: original d_name.len ++ * @wh: whiteout qstr ++ * returns zero when succeeds, otherwise error. ++ * succeeded value as wh->name should be freed by kfree(). ++ */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) ++{ ++ char *p; ++ ++ if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) ++ return -ENAMETOOLONG; ++ ++ wh->len = name->len + AUFS_WH_PFX_LEN; ++ p = kmalloc(wh->len, GFP_NOFS); ++ wh->name = p; ++ if (p) { ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if the @wh_name exists under @h_parent. ++ * @try_sio specifies the necessary of super-io. ++ */ ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, ++ struct au_branch *br, int try_sio) ++{ ++ int err; ++ struct dentry *wh_dentry; ++ struct inode *h_dir; ++ ++ h_dir = h_parent->d_inode; ++ if (!try_sio) ++ wh_dentry = au_lkup_one(wh_name, h_parent, br, /*nd*/NULL); ++ else ++ wh_dentry = au_sio_lkup_one(wh_name, h_parent, br); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ err = 0; ++ if (!wh_dentry->d_inode) ++ goto out_wh; /* success */ ++ ++ err = 1; ++ if (S_ISREG(wh_dentry->d_inode->i_mode)) ++ goto out_wh; /* success */ ++ ++ err = -EIO; ++ AuIOErr("%.*s Invalid whiteout entry type 0%o.\n", ++ AuDLNPair(wh_dentry), wh_dentry->d_inode->i_mode); ++ ++ out_wh: ++ dput(wh_dentry); ++ out: ++ return err; ++} ++ ++/* ++ * test if the @h_dentry sets opaque or not. 
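++ * i.e. whether the diropq entry above exists under it; the lookup goes ++ * through super-io when the caller may not search the lower directory.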
++ */ ++int au_diropq_test(struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = h_dentry->d_inode; ++ err = au_wh_test(h_dentry, &diropq_name, br, ++ au_test_h_perm_sio(h_dir, MAY_EXEC)); ++ return err; ++} ++ ++/* ++ * returns a negative dentry whose name is unique and temporary. ++ */ ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix) ++{ ++#define HEX_LEN 4 ++ struct dentry *dentry; ++ int i; ++ char defname[AUFS_WH_PFX_LEN * 2 + DNAME_INLINE_LEN_MIN + 1 ++ + HEX_LEN + 1], *name, *p; ++ static unsigned short cnt; ++ struct qstr qs; ++ ++ name = defname; ++ qs.len = sizeof(defname) - DNAME_INLINE_LEN_MIN + prefix->len - 1; ++ if (unlikely(prefix->len > DNAME_INLINE_LEN_MIN)) { ++ dentry = ERR_PTR(-ENAMETOOLONG); ++ if (unlikely(qs.len >= PATH_MAX)) ++ goto out; ++ dentry = ERR_PTR(-ENOMEM); ++ name = kmalloc(qs.len + 1, GFP_NOFS); ++ if (unlikely(!name)) ++ goto out; ++ } ++ ++ /* doubly whiteout-ed */ ++ memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); ++ p = name + AUFS_WH_PFX_LEN * 2; ++ memcpy(p, prefix->name, prefix->len); ++ p += prefix->len; ++ *p++ = '.'; ++ AuDebugOn(name + qs.len + 1 - p <= HEX_LEN); ++ ++ qs.name = name; ++ for (i = 0; i < 3; i++) { ++ sprintf(p, "%.*d", HEX_LEN, cnt++); ++ dentry = au_sio_lkup_one(&qs, h_parent, br); ++ if (IS_ERR(dentry) || !dentry->d_inode) ++ goto out_name; ++ dput(dentry); ++ } ++ /* AuWarn("could not get random name\n"); */ ++ dentry = ERR_PTR(-EEXIST); ++ AuDbg("%.*s\n", AuLNPair(&qs)); ++ BUG(); ++ ++ out_name: ++ if (name != defname) ++ kfree(name); ++ out: ++ return dentry; ++#undef HEX_LEN ++} ++ ++/* ++ * rename the @h_dentry on @br to the whiteouted temporary name. ++ */ ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ struct dentry *h_parent; ++ ++ h_parent = h_dentry->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ /* under the same dir, no need to lock_rename() */ ++ err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path); ++ AuTraceErr(err); ++ dput(h_path.dentry); ++ ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * functions for removing a whiteout ++ */ ++ ++static int do_unlink_wh(struct inode *h_dir, struct path *h_path) ++{ ++ int force; ++ ++ /* ++ * forces superio when the dir has a sticky bit. ++ * this may be a violation of unix fs semantics. 
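++ * in a sticky directory only the owner may unlink, yet the whiteout may ++ * have been created under other credentials; forcing routes the unlink ++ * through the superio workqueue instead of failing with -EPERM.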
++ */ ++ force = (h_dir->i_mode & S_ISVTX) ++ && h_path->dentry->d_inode->i_uid != current_fsuid(); ++ return vfsub_unlink(h_dir, h_path, force); ++} ++ ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry) ++{ ++ int err; ++ ++ err = do_unlink_wh(h_dir, h_path); ++ if (!err && dentry) ++ au_set_dbwh(dentry, -1); ++ ++ return err; ++} ++ ++static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, ++ struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ ++ err = 0; ++ h_path.dentry = au_lkup_one(wh, h_parent, br, /*nd*/NULL); ++ if (IS_ERR(h_path.dentry)) ++ err = PTR_ERR(h_path.dentry); ++ else { ++ if (h_path.dentry->d_inode ++ && S_ISREG(h_path.dentry->d_inode->i_mode)) ++ err = do_unlink_wh(h_parent->d_inode, &h_path); ++ dput(h_path.dentry); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * initialize/clean whiteout for a branch ++ */ ++ ++static void au_wh_clean(struct inode *h_dir, struct path *whpath, ++ const int isdir) ++{ ++ int err; ++ ++ if (!whpath->dentry->d_inode) ++ return; ++ ++ err = mnt_want_write(whpath->mnt); ++ if (!err) { ++ if (isdir) ++ err = vfsub_rmdir(h_dir, whpath); ++ else ++ err = vfsub_unlink(h_dir, whpath, /*force*/0); ++ mnt_drop_write(whpath->mnt); ++ } ++ if (unlikely(err)) ++ AuWarn("failed removing %.*s (%d), ignored.\n", ++ AuDLNPair(whpath->dentry), err); ++} ++ ++static int test_linkable(struct dentry *h_root) ++{ ++ struct inode *h_dir = h_root->d_inode; ++ ++ if (h_dir->i_op->link) ++ return 0; ++ ++ AuErr("%.*s (%s) doesn't support link(2), use noplink and rw+nolwh\n", ++ AuDLNPair(h_root), au_sbtype(h_root->d_sb)); ++ return -ENOSYS; ++} ++ ++/* todo: should this mkdir be done in /sbin/mount.aufs helper? 
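for now au_whdir() below creates the work dirs in-kernel, adding the x bits on NFS so they remain traversable for server-side lookups. 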
*/ ++static int au_whdir(struct inode *h_dir, struct path *path) ++{ ++ int err; ++ ++ err = -EEXIST; ++ if (!path->dentry->d_inode) { ++ int mode = S_IRWXU; ++ ++ if (au_test_nfs(path->dentry->d_sb)) ++ mode |= S_IXUGO; ++ err = mnt_want_write(path->mnt); ++ if (!err) { ++ err = vfsub_mkdir(h_dir, path, mode); ++ mnt_drop_write(path->mnt); ++ } ++ } else if (S_ISDIR(path->dentry->d_inode->i_mode)) ++ err = 0; ++ else ++ AuErr("unknown %.*s exists\n", AuDLNPair(path->dentry)); ++ ++ return err; ++} ++ ++struct au_wh_base { ++ const struct qstr *name; ++ struct dentry *dentry; ++}; ++ ++static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++} ++ ++/* ++ * returns tri-state, ++ * minus: error, caller should print the message ++ * zero: success ++ * plus: error, caller should NOT print the message ++ */ ++static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = h_root->d_inode; ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++ out: ++ return err; ++} ++ ++/* ++ * for the moment, aufs supports branch filesystems which do not support ++ * link(2). in testing on FAT, which does not fully support i_op->setattr() ++ * either, copyup failed. in the end, such a filesystem will not be used ++ * as the writable branch. ++ * ++ * returns tri-state, see above. ++ */ ++static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ WbrWhMustWriteLock(wbr); ++ ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ /* ++ * todo: should this create be done in /sbin/mount.aufs helper? 
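++ * until then it is created below; this base file is the hard-link source ++ * for every whiteout on the branch (see link_or_create_wh()), named ++ * AUFS_BASE_NAME (".wh..wh.aufs" in the stock aufs tree).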
++ */ ++ err = -EEXIST; ++ h_dir = h_root->d_inode; ++ if (!base[AuBrWh_BASE].dentry->d_inode) { ++ err = mnt_want_write(h_path->mnt); ++ if (!err) { ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ err = vfsub_create(h_dir, h_path, WH_MASK); ++ mnt_drop_write(h_path->mnt); ++ } ++ } else if (S_ISREG(base[AuBrWh_BASE].dentry->d_inode->i_mode)) ++ err = 0; ++ else ++ AuErr("unknown %.*s/%.*s exists\n", ++ AuDLNPair(h_root), AuDLNPair(base[AuBrWh_BASE].dentry)); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); ++ ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++ out: ++ return err; ++} ++ ++/* ++ * initialize the whiteout base file/dir for @br. ++ */ ++int au_wh_init(struct dentry *h_root, struct au_branch *br, ++ struct super_block *sb) ++{ ++ int err, i; ++ const unsigned char do_plink ++ = !!au_opt_test(au_mntflags(sb), PLINK); ++ struct path path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ struct au_wbr *wbr = br->br_wbr; ++ static const struct qstr base_name[] = { ++ [AuBrWh_BASE] = { ++ .name = AUFS_BASE_NAME, ++ .len = sizeof(AUFS_BASE_NAME) - 1 ++ }, ++ [AuBrWh_PLINK] = { ++ .name = AUFS_PLINKDIR_NAME, ++ .len = sizeof(AUFS_PLINKDIR_NAME) - 1 ++ }, ++ [AuBrWh_ORPH] = { ++ .name = AUFS_ORPHDIR_NAME, ++ .len = sizeof(AUFS_ORPHDIR_NAME) - 1 ++ } ++ }; ++ struct au_wh_base base[] = { ++ [AuBrWh_BASE] = { ++ .name = base_name + AuBrWh_BASE, ++ .dentry = NULL ++ }, ++ [AuBrWh_PLINK] = { ++ .name = base_name + AuBrWh_PLINK, ++ .dentry = NULL ++ }, ++ [AuBrWh_ORPH] = { ++ .name = base_name + AuBrWh_ORPH, ++ .dentry = NULL ++ } ++ }; ++ ++ if (wbr) ++ WbrWhMustWriteLock(wbr); ++ ++ h_dir = h_root->d_inode; ++ for (i = 0; i < AuBrWh_Last; i++) { ++ /* doubly whiteouted */ ++ struct dentry *d; ++ ++ d = au_wh_lkup(h_root, (void *)base[i].name, br); ++ err = PTR_ERR(d); ++ if (IS_ERR(d)) ++ goto out; ++ ++ base[i].dentry = d; ++ AuDebugOn(wbr ++ && wbr->wbr_wh[i] ++ && wbr->wbr_wh[i] != base[i].dentry); ++ } ++ ++ if (wbr) ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ ++ err = 0; ++ ++ switch (br->br_perm) { ++ case AuBrPerm_RO: ++ case AuBrPerm_ROWH: ++ case AuBrPerm_RR: ++ case AuBrPerm_RRWH: ++ au_wh_init_ro(h_dir, base, &path); ++ break; ++ ++ case AuBrPerm_RWNoLinkWH: ++ err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ break; ++ ++ case AuBrPerm_RW: ++ err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ break; ++ ++ default: ++ BUG(); ++ } ++ goto out; /* success */ ++ ++ out_err: ++ AuErr("an error(%d) on the writable branch %.*s(%s)\n", ++ err, AuDLNPair(h_root), au_sbtype(h_root->d_sb)); ++ out: ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(base[i].dentry); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * whiteouts are all hard-linked usually. ++ * when its link count reaches a ceiling, we create a new whiteout base ++ * asynchronously. 
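++ * concretely: link_or_create_wh() links each new whiteout to wbr_whbase ++ * until vfs_link() returns -EMLINK; kick_reinit_br_wh() then queues ++ * reinit_br_wh() to unlink and re-create the base file off the caller's path.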
++ */ ++ ++struct reinit_br_wh { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void reinit_br_wh(void *arg) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct path h_path; ++ struct reinit_br_wh *a = arg; ++ struct au_wbr *wbr; ++ struct inode *dir; ++ struct dentry *h_root; ++ struct au_hinode *hdir; ++ ++ err = 0; ++ wbr = a->br->br_wbr; ++ /* big aufs lock */ ++ si_noflush_write_lock(a->sb); ++ if (!au_br_writable(a->br->br_perm)) ++ goto out; ++ bindex = au_br_index(a->sb, a->br->br_id); ++ if (unlikely(bindex < 0)) ++ goto out; ++ ++ di_read_lock_parent(a->sb->s_root, AuLock_IR); ++ dir = a->sb->s_root->d_inode; ++ hdir = au_hi(dir, bindex); ++ h_root = au_h_dptr(a->sb->s_root, bindex); ++ ++ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ wbr_wh_write_lock(wbr); ++ err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, ++ h_root, a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (!err) { ++ h_path.dentry = wbr->wbr_whbase; ++ h_path.mnt = a->br->br_mnt; ++ err = vfsub_unlink(hdir->hi_inode, &h_path, /*force*/0); ++ mnt_drop_write(a->br->br_mnt); ++ } ++ } else { ++ AuWarn("%.*s is moved, ignored\n", AuDLNPair(wbr->wbr_whbase)); ++ err = 0; ++ } ++ dput(wbr->wbr_whbase); ++ wbr->wbr_whbase = NULL; ++ if (!err) ++ err = au_wh_init(h_root, a->br, a->sb); ++ wbr_wh_write_unlock(wbr); ++ au_hin_imtx_unlock(hdir); ++ di_read_unlock(a->sb->s_root, AuLock_IR); ++ ++ out: ++ if (wbr) ++ atomic_dec(&wbr->wbr_wh_running); ++ atomic_dec(&a->br->br_count); ++ au_nwt_done(&au_sbi(a->sb)->si_nowait); ++ si_write_unlock(a->sb); ++ kfree(arg); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) ++{ ++ int do_dec, wkq_err; ++ struct reinit_br_wh *arg; ++ ++ do_dec = 1; ++ if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) ++ goto out; ++ ++ /* ignore ENOMEM */ ++ arg = kmalloc(sizeof(*arg), GFP_NOFS); ++ if (arg) { ++ /* ++ * dec(wh_running), kfree(arg) and dec(br_count) ++ * in reinit function ++ */ ++ arg->sb = sb; ++ arg->br = br; ++ atomic_inc(&br->br_count); ++ wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb); ++ if (unlikely(wkq_err)) { ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++ atomic_dec(&br->br_count); ++ kfree(arg); ++ } ++ do_dec = 0; ++ } ++ ++ out: ++ if (do_dec) ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create the whiteout @wh. ++ */ ++static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, ++ struct dentry *wh) ++{ ++ int err; ++ struct path h_path = { ++ .dentry = wh ++ }; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ ++ h_parent = wh->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ br = au_sbr(sb, bindex); ++ h_path.mnt = br->br_mnt; ++ wbr = br->br_wbr; ++ wbr_wh_read_lock(wbr); ++ if (wbr->wbr_whbase) { ++ err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path); ++ if (!err || err != -EMLINK) ++ goto out; ++ ++ /* link count full. re-initialize br_whbase. */ ++ kick_reinit_br_wh(sb, br); ++ } ++ ++ /* return this error in this context */ ++ err = vfsub_create(h_dir, &h_path, WH_MASK); ++ ++ out: ++ wbr_wh_read_unlock(wbr); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create or remove the diropq. 
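++ * the CREATE flag selects creation through link_or_create_wh(); otherwise ++ * the entry is unlinked and the dentry's diropq index is reset to -1.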
++ */ ++static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *opq_dentry, *h_dentry; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ h_dentry = au_h_dptr(dentry, bindex); ++ opq_dentry = au_lkup_one(&diropq_name, h_dentry, br, /*nd*/NULL); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ ++ if (au_ftest_diropq(flags, CREATE)) { ++ err = link_or_create_wh(sb, bindex, opq_dentry); ++ if (!err) { ++ au_set_dbdiropq(dentry, bindex); ++ goto out; /* success */ ++ } ++ } else { ++ struct path tmp = { ++ .dentry = opq_dentry, ++ .mnt = br->br_mnt ++ }; ++ err = do_unlink_wh(au_h_iptr(dentry->d_inode, bindex), &tmp); ++ if (!err) ++ au_set_dbdiropq(dentry, -1); ++ } ++ dput(opq_dentry); ++ opq_dentry = ERR_PTR(err); ++ ++ out: ++ return opq_dentry; ++} ++ ++struct do_diropq_args { ++ struct dentry **errp; ++ struct dentry *dentry; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++}; ++ ++static void call_do_diropq(void *args) ++{ ++ struct do_diropq_args *a = args; ++ *a->errp = do_diropq(a->dentry, a->bindex, a->flags); ++} ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *diropq, *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!au_test_h_perm_sio(h_dentry->d_inode, MAY_EXEC | MAY_WRITE)) ++ diropq = do_diropq(dentry, bindex, flags); ++ else { ++ int wkq_err; ++ struct do_diropq_args args = { ++ .errp = &diropq, ++ .dentry = dentry, ++ .bindex = bindex, ++ .flags = flags ++ }; ++ ++ wkq_err = au_wkq_wait(call_do_diropq, &args); ++ if (unlikely(wkq_err)) ++ diropq = ERR_PTR(wkq_err); ++ } ++ ++ return diropq; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * lookup whiteout dentry. ++ * @h_parent: lower parent dentry which must exist and be locked ++ * @base_name: name of dentry which will be whiteouted ++ * returns dentry for whiteout. ++ */ ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br) ++{ ++ int err; ++ struct qstr wh_name; ++ struct dentry *wh_dentry; ++ ++ err = au_wh_name_alloc(&wh_name, base_name); ++ wh_dentry = ERR_PTR(err); ++ if (!err) { ++ wh_dentry = au_lkup_one(&wh_name, h_parent, br, /*nd*/NULL); ++ kfree(wh_name.name); ++ } ++ return wh_dentry; ++} ++ ++/* ++ * link/create a whiteout for @dentry on @bindex. ++ */ ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ int err; ++ ++ sb = dentry->d_sb; ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); ++ if (!IS_ERR(wh_dentry) && !wh_dentry->d_inode) { ++ err = link_or_create_wh(sb, bindex, wh_dentry); ++ if (!err) ++ au_set_dbwh(dentry, bindex); ++ else { ++ dput(wh_dentry); ++ wh_dentry = ERR_PTR(err); ++ } ++ } ++ ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Delete all whiteouts in this directory on branch bindex. 
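++ * each entry of @whlist is rebuilt with the AUFS_WH_PFX prefix (".wh.")
++ * in a single __getname() buffer and unlinked in turn; a combined name
++ * that would exceed PATH_MAX is reported as -EIO instead.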
*/ ++static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err; ++ unsigned long ul, n; ++ struct qstr wh_name; ++ char *p; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ err = -ENOMEM; ++ p = __getname(); ++ wh_name.name = p; ++ if (unlikely(!wh_name.name)) ++ goto out; ++ ++ err = 0; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; !err && ul < n; ul++, head++) { ++ hlist_for_each_entry(tpos, pos, head, wh_hash) { ++ if (tpos->wh_bindex != bindex) ++ continue; ++ ++ str = &tpos->wh_str; ++ if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { ++ memcpy(p, str->name, str->len); ++ wh_name.len = AUFS_WH_PFX_LEN + str->len; ++ err = unlink_wh_name(h_dentry, &wh_name, br); ++ if (!err) ++ continue; ++ break; ++ } ++ AuIOErr("whiteout name too long %.*s\n", ++ str->len, str->name); ++ err = -EIO; ++ break; ++ } ++ } ++ __putname(wh_name.name); ++ ++ out: ++ return err; ++} ++ ++struct del_wh_children_args { ++ int *errp; ++ struct dentry *h_dentry; ++ struct au_nhash *whlist; ++ aufs_bindex_t bindex; ++ struct au_branch *br; ++}; ++ ++static void call_del_wh_children(void *args) ++{ ++ struct del_wh_children_args *a = args; ++ *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) ++{ ++ struct au_whtmp_rmdir *whtmp; ++ int err; ++ unsigned int rdhash; ++ ++ SiMustAnyLock(sb); ++ ++ whtmp = kmalloc(sizeof(*whtmp), gfp); ++ if (unlikely(!whtmp)) { ++ whtmp = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ whtmp->dir = NULL; ++ whtmp->wh_dentry = NULL; ++ /* no estimation for dir size */ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = AUFS_RDHASH_DEF; ++ err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); ++ if (unlikely(err)) { ++ kfree(whtmp); ++ whtmp = ERR_PTR(err); ++ } ++ ++ out: ++ return whtmp; ++} ++ ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) ++{ ++ dput(whtmp->wh_dentry); ++ iput(whtmp->dir); ++ au_nhash_wh_free(&whtmp->whlist); ++ kfree(whtmp); ++} ++ ++/* ++ * rmdir the whiteouted temporary named dir @h_dentry. ++ * @whlist: whiteouted children. ++ */ ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist) ++{ ++ int err; ++ struct path h_tmp; ++ struct inode *wh_inode, *h_dir; ++ struct au_branch *br; ++ ++ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(h_dir); ++ ++ br = au_sbr(dir->i_sb, bindex); ++ wh_inode = wh_dentry->d_inode; ++ mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD); ++ ++ /* ++ * someone else might change some whiteouts while we were sleeping. ++ * it means this whlist may have an obsoleted entry. 
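++ * the walk below therefore happens under wh_inode->i_mutex (taken
++ * above), and is pushed onto the super-io workqueue via au_wkq_wait()
++ * whenever au_test_h_perm_sio() says the current context may not
++ * unlink in this directory by itself.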
++ */ ++ if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) ++ err = del_wh_children(wh_dentry, whlist, bindex, br); ++ else { ++ int wkq_err; ++ struct del_wh_children_args args = { ++ .errp = &err, ++ .h_dentry = wh_dentry, ++ .whlist = whlist, ++ .bindex = bindex, ++ .br = br ++ }; ++ ++ wkq_err = au_wkq_wait(call_del_wh_children, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ mutex_unlock(&wh_inode->i_mutex); ++ ++ if (!err) { ++ h_tmp.dentry = wh_dentry; ++ h_tmp.mnt = br->br_mnt; ++ err = vfsub_rmdir(h_dir, &h_tmp); ++ /* d_drop(h_dentry); */ ++ } ++ ++ if (!err) { ++ if (au_ibstart(dir) == bindex) { ++ au_cpup_attr_timesizes(dir); ++ drop_nlink(dir); ++ } ++ return 0; /* success */ ++ } ++ ++ AuWarn("failed removing %.*s(%d), ignored\n", ++ AuDLNPair(wh_dentry), err); ++ return err; ++} ++ ++static void call_rmdir_whtmp(void *args) ++{ ++ int err; ++ struct au_whtmp_rmdir *a = args; ++ struct super_block *sb; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ struct au_branch *br; ++ struct au_hinode *hdir; ++ ++ /* rmdir by nfsd may cause deadlock with this i_mutex */ ++ /* mutex_lock(&a->dir->i_mutex); */ ++ sb = a->dir->i_sb; ++ si_noflush_read_lock(sb); ++ err = au_test_ro(sb, a->bindex, NULL); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -EIO; ++ br = au_sbr(sb, a->bindex); ++ ii_write_lock_parent(a->dir); ++ h_parent = dget_parent(a->wh_dentry); ++ h_dir = h_parent->d_inode; ++ hdir = au_hi(a->dir, a->bindex); ++ au_hin_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, br); ++ if (!err) { ++ err = mnt_want_write(br->br_mnt); ++ if (!err) { ++ err = au_whtmp_rmdir(a->dir, a->bindex, a->wh_dentry, ++ &a->whlist); ++ mnt_drop_write(br->br_mnt); ++ } ++ } ++ au_hin_imtx_unlock(hdir); ++ dput(h_parent); ++ ii_write_unlock(a->dir); ++ ++ out: ++ /* mutex_unlock(&a->dir->i_mutex); */ ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ si_read_unlock(sb); ++ au_whtmp_rmdir_free(a); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args) ++{ ++ int wkq_err; ++ ++ IMustLock(dir); ++ ++ /* all post-process will be done in do_rmdir_whtmp(). */ ++ args->dir = au_igrab(dir); ++ args->bindex = bindex; ++ args->wh_dentry = dget(wh_dentry); ++ wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, dir->i_sb); ++ if (unlikely(wkq_err)) { ++ AuWarn("rmdir error %.*s (%d), ignored\n", ++ AuDLNPair(wh_dentry), wkq_err); ++ au_whtmp_rmdir_free(args); ++ } ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/whout.h linux-2.6.31/fs/aufs/whout.h +--- linux-2.6.31-vanilla/fs/aufs/whout.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/whout.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,87 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#ifndef __AUFS_WHOUT_H__ ++#define __AUFS_WHOUT_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/aufs_type.h> ++#include "dir.h" ++ ++/* whout.c */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); ++struct au_branch; ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, ++ struct au_branch *br, int try_sio); ++int au_diropq_test(struct dentry *h_dentry, struct au_branch *br); ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix); ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry); ++int au_wh_init(struct dentry *h_parent, struct au_branch *br, ++ struct super_block *sb); ++ ++/* diropq flags */ ++#define AuDiropq_CREATE 1 ++#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) ++#define au_fset_diropq(flags, name) { (flags) |= AuDiropq_##name; } ++#define au_fclr_diropq(flags, name) { (flags) &= ~AuDiropq_##name; } ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags); ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br); ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent); ++ ++/* real rmdir for the whiteout-ed dir */ ++struct au_whtmp_rmdir { ++ struct inode *dir; ++ aufs_bindex_t bindex; ++ struct dentry *wh_dentry; ++ struct au_nhash whlist; ++}; ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist); ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_diropq_create(struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); ++} ++ ++static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WHOUT_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/wkq.c linux-2.6.31/fs/aufs/wkq.c +--- linux-2.6.31-vanilla/fs/aufs/wkq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/wkq.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,259 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * workqueue for asynchronous/super-io operations ++ * todo: try new dredential scheme ++ */ ++ ++#include <linux/module.h> ++#include "aufs.h" ++ ++/* internal workqueue named AUFS_WKQ_NAME */ ++static struct au_wkq { ++ struct workqueue_struct *q; ++ ++ /* balancing */ ++ atomic_t busy; ++} *au_wkq; ++ ++struct au_wkinfo { ++ struct work_struct wk; ++ struct super_block *sb; ++ ++ unsigned int flags; /* see wkq.h */ ++ ++ au_wkq_func_t func; ++ void *args; ++ ++ atomic_t *busyp; ++ struct completion *comp; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int enqueue(struct au_wkq *wkq, struct au_wkinfo *wkinfo) ++{ ++ wkinfo->busyp = &wkq->busy; ++ if (au_ftest_wkq(wkinfo->flags, WAIT)) ++ return !queue_work(wkq->q, &wkinfo->wk); ++ else ++ return !schedule_work(&wkinfo->wk); ++} ++ ++static void do_wkq(struct au_wkinfo *wkinfo) ++{ ++ unsigned int idle, n; ++ int i, idle_idx; ++ ++ while (1) { ++ if (au_ftest_wkq(wkinfo->flags, WAIT)) { ++ idle_idx = 0; ++ idle = UINT_MAX; ++ for (i = 0; i < aufs_nwkq; i++) { ++ n = atomic_inc_return(&au_wkq[i].busy); ++ if (n == 1 && !enqueue(au_wkq + i, wkinfo)) ++ return; /* success */ ++ ++ if (n < idle) { ++ idle_idx = i; ++ idle = n; ++ } ++ atomic_dec(&au_wkq[i].busy); ++ } ++ } else ++ idle_idx = aufs_nwkq; ++ ++ atomic_inc(&au_wkq[idle_idx].busy); ++ if (!enqueue(au_wkq + idle_idx, wkinfo)) ++ return; /* success */ ++ ++ /* impossible? */ ++ AuWarn1("failed to queue_work()\n"); ++ yield(); ++ } ++} ++ ++static void wkq_func(struct work_struct *wk) ++{ ++ struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk); ++ ++ wkinfo->func(wkinfo->args); ++ atomic_dec_return(wkinfo->busyp); ++ if (au_ftest_wkq(wkinfo->flags, WAIT)) ++ complete(wkinfo->comp); ++ else { ++ kobject_put(&au_sbi(wkinfo->sb)->si_kobj); ++ module_put(THIS_MODULE); ++ kfree(wkinfo); ++ } ++} ++ ++/* ++ * Since struct completion is large, try allocating it dynamically. 
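++ * this only matters on CONFIG_4KSTACKS (or AuTest4KSTACKS) builds,
++ * where the pair au_wkq_comp_alloc()/au_wkq_comp_free() kmalloc and
++ * kfree the completion with GFP_NOFS; on roomier stacks
++ * AuWkqCompDeclare() expands to DECLARE_COMPLETION_ONSTACK and both
++ * helpers collapse to no-ops.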
++ */ ++#if defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) ++#define AuWkqCompDeclare(name) struct completion *comp = NULL ++ ++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) ++{ ++ *comp = kmalloc(sizeof(**comp), GFP_NOFS); ++ if (*comp) { ++ init_completion(*comp); ++ wkinfo->comp = *comp; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++static void au_wkq_comp_free(struct completion *comp) ++{ ++ kfree(comp); ++} ++ ++#else ++ ++/* no braces */ ++#define AuWkqCompDeclare(name) \ ++ DECLARE_COMPLETION_ONSTACK(_ ## name); \ ++ struct completion *comp = &_ ## name ++ ++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) ++{ ++ wkinfo->comp = *comp; ++ return 0; ++} ++ ++static void au_wkq_comp_free(struct completion *comp __maybe_unused) ++{ ++ /* empty */ ++} ++#endif /* 4KSTACKS */ ++ ++static void au_wkq_run(struct au_wkinfo *wkinfo) ++{ ++ au_dbg_verify_kthread(); ++ INIT_WORK(&wkinfo->wk, wkq_func); ++ do_wkq(wkinfo); ++} ++ ++int au_wkq_wait(au_wkq_func_t func, void *args) ++{ ++ int err; ++ AuWkqCompDeclare(comp); ++ struct au_wkinfo wkinfo = { ++ .flags = AuWkq_WAIT, ++ .func = func, ++ .args = args ++ }; ++ ++ err = au_wkq_comp_alloc(&wkinfo, &comp); ++ if (!err) { ++ au_wkq_run(&wkinfo); ++ /* no timeout, no interrupt */ ++ wait_for_completion(wkinfo.comp); ++ au_wkq_comp_free(comp); ++ } ++ ++ return err; ++ ++} ++ ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb) ++{ ++ int err; ++ struct au_wkinfo *wkinfo; ++ ++ atomic_inc(&au_sbi(sb)->si_nowait.nw_len); ++ ++ /* ++ * wkq_func() must free this wkinfo. ++ * it highly depends upon the implementation of workqueue. ++ */ ++ err = 0; ++ wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); ++ if (wkinfo) { ++ wkinfo->sb = sb; ++ wkinfo->flags = !AuWkq_WAIT; ++ wkinfo->func = func; ++ wkinfo->args = args; ++ wkinfo->comp = NULL; ++ kobject_get(&au_sbi(sb)->si_kobj); ++ __module_get(THIS_MODULE); ++ ++ au_wkq_run(wkinfo); ++ } else { ++ err = -ENOMEM; ++ atomic_dec(&au_sbi(sb)->si_nowait.nw_len); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_nwt_init(struct au_nowait_tasks *nwt) ++{ ++ atomic_set(&nwt->nw_len, 0); ++ /* smp_mb();*/ /* atomic_set */ ++ init_waitqueue_head(&nwt->nw_wq); ++} ++ ++void au_wkq_fin(void) ++{ ++ int i; ++ ++ for (i = 0; i < aufs_nwkq; i++) ++ if (au_wkq[i].q && !IS_ERR(au_wkq[i].q)) ++ destroy_workqueue(au_wkq[i].q); ++ kfree(au_wkq); ++} ++ ++int __init au_wkq_init(void) ++{ ++ int err, i; ++ struct au_wkq *nowaitq; ++ ++ /* '+1' is for accounting of nowait queue */ ++ err = -ENOMEM; ++ au_wkq = kcalloc(aufs_nwkq + 1, sizeof(*au_wkq), GFP_NOFS); ++ if (unlikely(!au_wkq)) ++ goto out; ++ ++ err = 0; ++ for (i = 0; i < aufs_nwkq; i++) { ++ au_wkq[i].q = create_singlethread_workqueue(AUFS_WKQ_NAME); ++ if (au_wkq[i].q && !IS_ERR(au_wkq[i].q)) { ++ atomic_set(&au_wkq[i].busy, 0); ++ continue; ++ } ++ ++ err = PTR_ERR(au_wkq[i].q); ++ au_wkq_fin(); ++ goto out; ++ } ++ ++ /* nowait accounting */ ++ nowaitq = au_wkq + aufs_nwkq; ++ atomic_set(&nowaitq->busy, 0); ++ nowaitq->q = NULL; ++ /* smp_mb(); */ /* atomic_set */ ++ ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/aufs/wkq.h linux-2.6.31/fs/aufs/wkq.h +--- linux-2.6.31-vanilla/fs/aufs/wkq.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/wkq.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,82 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * workqueue for asynchronous/super-io operations ++ * todo: try new credentials management scheme ++ */ ++ ++#ifndef __AUFS_WKQ_H__ ++#define __AUFS_WKQ_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/sched.h> ++#include <linux/wait.h> ++#include <linux/aufs_type.h> ++ ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * in the next operation, wait for the 'nowait' tasks in system-wide workqueue ++ */ ++struct au_nowait_tasks { ++ atomic_t nw_len; ++ wait_queue_head_t nw_wq; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++typedef void (*au_wkq_func_t)(void *args); ++ ++/* wkq flags */ ++#define AuWkq_WAIT 1 ++#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) ++#define au_fset_wkq(flags, name) { (flags) |= AuWkq_##name; } ++#define au_fclr_wkq(flags, name) { (flags) &= ~AuWkq_##name; } ++ ++/* wkq.c */ ++int au_wkq_wait(au_wkq_func_t func, void *args); ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb); ++void au_nwt_init(struct au_nowait_tasks *nwt); ++int __init au_wkq_init(void); ++void au_wkq_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int au_test_wkq(struct task_struct *tsk) ++{ ++ return !tsk->mm && !strcmp(tsk->comm, AUFS_WKQ_NAME); ++} ++ ++static inline void au_nwt_done(struct au_nowait_tasks *nwt) ++{ ++ if (!atomic_dec_return(&nwt->nw_len)) ++ wake_up_all(&nwt->nw_wq); ++} ++ ++static inline int au_nwt_flush(struct au_nowait_tasks *nwt) ++{ ++ wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); ++ return 0; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WKQ_H__ */ +diff -Nur linux-2.6.31-vanilla/fs/aufs/xino.c linux-2.6.31/fs/aufs/xino.c +--- linux-2.6.31-vanilla/fs/aufs/xino.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/fs/aufs/xino.c 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,1203 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * external inode number translation table and bitmap ++ */ ++ ++#include <linux/file.h> ++#include <linux/seq_file.h> ++#include <linux/uaccess.h> ++#include "aufs.h" ++ ++ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ do { ++ /* todo: signal_pending? */ ++ err = func(file, (char __user *)buf, size, pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_access(file->f_dentry); ++#endif ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static ssize_t do_xino_fwrite(au_writef_t func, struct file *file, void *buf, ++ size_t size, loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ lockdep_off(); ++ do { ++ /* todo: signal_pending? */ ++ err = func(file, (const char __user *)buf, size, pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ lockdep_on(); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_modify(file->f_dentry); ++#endif ++ ++ return err; ++} ++ ++struct do_xino_fwrite_args { ++ ssize_t *errp; ++ au_writef_t func; ++ struct file *file; ++ void *buf; ++ size_t size; ++ loff_t *pos; ++}; ++ ++static void call_do_xino_fwrite(void *args) ++{ ++ struct do_xino_fwrite_args *a = args; ++ *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); ++} ++ ++ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos) ++{ ++ ssize_t err; ++ ++ /* todo: signal block and no wkq? */ ++ /* todo: new credential scheme */ ++ /* ++ * it breaks RLIMIT_FSIZE and normal user's limit, ++ * users should care about quota and real 'filesystem full.' ++ */ ++ if (!au_test_wkq(current)) { ++ int wkq_err; ++ struct do_xino_fwrite_args args = { ++ .errp = &err, ++ .func = func, ++ .file = file, ++ .buf = buf, ++ .size = size, ++ .pos = pos ++ }; ++ ++ wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } else ++ err = do_xino_fwrite(func, file, buf, size, pos); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a new xinofile at the same place/path as @base_file. 
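++ * the new file is created, opened and then unlinked right away, so it
++ * lives on only as an anonymous open file and vanishes with the last
++ * fput(). when @copy_src is given, its whole content is carried over
++ * with au_copy_file(); this is how truncation preserves the inode
++ * numbers that were already handed out.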
++ */ ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src) ++{ ++ struct file *file; ++ struct dentry *base, *dentry, *parent; ++ struct inode *dir; ++ struct qstr *name; ++ int err; ++ struct path path; ++ ++ base = base_file->f_dentry; ++ parent = base->d_parent; /* dir inode is locked */ ++ dir = parent->d_inode; ++ IMustLock(dir); ++ ++ file = ERR_PTR(-EINVAL); ++ name = &base->d_name; ++ dentry = vfsub_lookup_one_len(name->name, parent, name->len); ++ if (IS_ERR(dentry)) { ++ file = (void *)dentry; ++ AuErr("%.*s lookup err %ld\n", AuLNPair(name), PTR_ERR(dentry)); ++ goto out; ++ } ++ ++ /* no need to mnt_want_write() since we call dentry_open() later */ ++ err = vfs_create(dir, dentry, S_IRUGO | S_IWUGO, NULL); ++ if (unlikely(err)) { ++ file = ERR_PTR(err); ++ AuErr("%.*s create err %d\n", AuLNPair(name), err); ++ goto out_dput; ++ } ++ ++ path.dentry = dentry; ++ path.mnt = base_file->f_vfsmnt; ++ path_get(&path); ++ file = vfsub_dentry_open(&path, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE, ++ current_cred()); ++ if (IS_ERR(file)) { ++ AuErr("%.*s open err %ld\n", AuLNPair(name), PTR_ERR(file)); ++ goto out_dput; ++ } ++ ++ err = vfsub_unlink(dir, &file->f_path, /*force*/0); ++ if (unlikely(err)) { ++ AuErr("%.*s unlink err %d\n", AuLNPair(name), err); ++ goto out_fput; ++ } ++ ++ if (copy_src) { ++ /* no one can touch copy_src xino */ ++ err = au_copy_file(file, copy_src, ++ i_size_read(copy_src->f_dentry->d_inode)); ++ if (unlikely(err)) { ++ AuErr("%.*s copy err %d\n", AuLNPair(name), err); ++ goto out_fput; ++ } ++ } ++ goto out_dput; /* success */ ++ ++ out_fput: ++ fput(file); ++ file = ERR_PTR(err); ++ out_dput: ++ dput(dentry); ++ out: ++ return file; ++} ++ ++struct au_xino_lock_dir { ++ struct au_hinode *hdir; ++ struct dentry *parent; ++ struct mutex *mtx; ++}; ++ ++static void au_xino_lock_dir(struct super_block *sb, struct file *xino, ++ struct au_xino_lock_dir *ldir) ++{ ++ aufs_bindex_t brid, bindex; ++ ++ ldir->hdir = NULL; ++ bindex = -1; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) ++ bindex = au_br_index(sb, brid); ++ if (bindex >= 0) { ++ ldir->hdir = au_hi(sb->s_root->d_inode, bindex); ++ au_hin_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); ++ } else { ++ ldir->parent = dget_parent(xino->f_dentry); ++ ldir->mtx = &ldir->parent->d_inode->i_mutex; ++ mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT); ++ } ++} ++ ++static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) ++{ ++ if (ldir->hdir) ++ au_hin_imtx_unlock(ldir->hdir); ++ else { ++ mutex_unlock(ldir->mtx); ++ dput(ldir->parent); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate xino files asynchronously */ ++ ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ aufs_bindex_t bi, bend; ++ struct au_branch *br; ++ struct file *new_xino, *file; ++ struct super_block *h_sb; ++ struct au_xino_lock_dir ldir; ++ ++ err = -EINVAL; ++ bend = au_sbend(sb); ++ if (unlikely(bindex < 0 || bend < bindex)) ++ goto out; ++ br = au_sbr(sb, bindex); ++ file = br->br_xino.xi_file; ++ if (!file) ++ goto out; ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ new_xino = au_xino_create2(file, file); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(new_xino); ++ if (IS_ERR(new_xino)) ++ goto out; ++ err = 0; ++ fput(file); ++ br->br_xino.xi_file = new_xino; ++ ++ h_sb = br->br_mnt->mnt_sb; ++ for (bi = 0; bi <= bend; bi++) { ++ if (unlikely(bi == bindex)) ++ continue; ++ br = 
au_sbr(sb, bi); ++ if (br->br_mnt->mnt_sb != h_sb) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = new_xino; ++ get_file(new_xino); ++ } ++ ++ out: ++ return err; ++} ++ ++struct xino_do_trunc_args { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void xino_do_trunc(void *_args) ++{ ++ struct xino_do_trunc_args *args = _args; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct inode *dir; ++ int err; ++ aufs_bindex_t bindex; ++ ++ err = 0; ++ sb = args->sb; ++ dir = sb->s_root->d_inode; ++ br = args->br; ++ ++ si_noflush_write_lock(sb); ++ ii_read_lock_parent(dir); ++ bindex = au_br_index(sb, br->br_id); ++ err = au_xino_trunc(sb, bindex); ++ if (!err ++ && br->br_xino.xi_file->f_dentry->d_inode->i_blocks ++ >= br->br_xino_upper) ++ br->br_xino_upper += AUFS_XINO_TRUNC_STEP; ++ ++ ii_read_unlock(dir); ++ if (unlikely(err)) ++ AuWarn("err b%d, (%d)\n", bindex, err); ++ atomic_dec(&br->br_xino_running); ++ atomic_dec(&br->br_count); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ si_write_unlock(sb); ++ kfree(args); ++} ++ ++static void xino_try_trunc(struct super_block *sb, struct au_branch *br) ++{ ++ struct xino_do_trunc_args *args; ++ int wkq_err; ++ ++ if (br->br_xino.xi_file->f_dentry->d_inode->i_blocks ++ < br->br_xino_upper) ++ return; ++ ++ if (atomic_inc_return(&br->br_xino_running) > 1) ++ goto out; ++ ++ /* lock and kfree() will be called in trunc_xino() */ ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ goto out_args; ++ } ++ ++ atomic_inc_return(&br->br_count); ++ args->sb = sb; ++ args->br = br; ++ wkq_err = au_wkq_nowait(xino_do_trunc, args, sb); ++ if (!wkq_err) ++ return; /* success */ ++ ++ AuErr("wkq %d\n", wkq_err); ++ atomic_dec_return(&br->br_count); ++ ++ out_args: ++ kfree(args); ++ out: ++ atomic_dec_return(&br->br_xino_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_xino_do_write(au_writef_t write, struct file *file, ++ ino_t h_ino, ino_t ino) ++{ ++ loff_t pos; ++ ssize_t sz; ++ ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(ino); ++ sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); ++ if (sz == sizeof(ino)) ++ return 0; /* success */ ++ ++ AuIOErr("write failed (%zd)\n", sz); ++ return -EIO; ++} ++ ++/* ++ * write @ino to the xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * even if @ino is zero, it is written to the xinofile and means no entry. ++ * if the size of the xino file on a specific filesystem exceeds the watermark, ++ * try truncating it. 
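++ * the watermark is br->br_xino_upper, measured against the xino file's
++ * i_blocks: xino_try_trunc() starts xino_do_trunc() on the nowait
++ * workqueue (br_xino_running keeps a second instance from starting),
++ * and a truncation which still ends up above the mark raises it by
++ * AUFS_XINO_TRUNC_STEP blocks.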
++ */ ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino) ++{ ++ int err; ++ unsigned int mnt_flags; ++ struct au_branch *br; ++ ++ BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) ++ || ((loff_t)-1) > 0); ++ SiMustAnyLock(sb); ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, XINO)) ++ return 0; ++ ++ br = au_sbr(sb, bindex); ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (!err) { ++ if (au_opt_test(mnt_flags, TRUNC_XINO) ++ && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) ++ xino_try_trunc(sb, br); ++ return 0; /* success */ ++ } ++ ++ AuIOErr("write failed (%d)\n", err); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* aufs inode number bitmap */ ++ ++static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; ++static ino_t xib_calc_ino(unsigned long pindex, int bit) ++{ ++ ino_t ino; ++ ++ AuDebugOn(bit < 0 || page_bits <= bit); ++ ino = AUFS_FIRST_INO + pindex * page_bits + bit; ++ return ino; ++} ++ ++static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) ++{ ++ AuDebugOn(ino < AUFS_FIRST_INO); ++ ino -= AUFS_FIRST_INO; ++ *pindex = ino / page_bits; ++ *bit = ino % page_bits; ++} ++ ++static int xib_pindex(struct super_block *sb, unsigned long pindex) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct au_sbinfo *sbinfo; ++ struct file *xib; ++ unsigned long *p; ++ ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE ++ || !au_opt_test(sbinfo->si_mntflags, XINO)); ++ ++ if (pindex == sbinfo->si_xib_last_pindex) ++ return 0; ++ ++ xib = sbinfo->si_xib; ++ p = sbinfo->si_xib_buf; ++ pos = sbinfo->si_xib_last_pindex; ++ pos *= PAGE_SIZE; ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) ++ goto out; ++ ++ pos = pindex; ++ pos *= PAGE_SIZE; ++ if (i_size_read(xib->f_dentry->d_inode) >= pos + PAGE_SIZE) ++ sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); ++ else { ++ memset(p, 0, PAGE_SIZE); ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ } ++ if (sz == PAGE_SIZE) { ++ sbinfo->si_xib_last_pindex = pindex; ++ return 0; /* success */ ++ } ++ ++ out: ++ AuIOErr1("write failed (%zd)\n", sz); ++ err = sz; ++ if (sz >= 0) ++ err = -EIO; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_xino_write0(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino) ++{ ++ int err, bit; ++ unsigned long pindex; ++ struct au_sbinfo *sbinfo; ++ ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return 0; ++ ++ err = 0; ++ if (ino) { ++ sbinfo = au_sbi(sb); ++ xib_calc_bit(ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ err = xib_pindex(sb, pindex); ++ if (!err) { ++ clear_bit(bit, sbinfo->si_xib_buf); ++ sbinfo->si_xib_next_bit = bit; ++ } ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ } ++ ++ if (!err) ++ err = au_xino_write(sb, bindex, h_ino, 0); ++ return err; ++} ++ ++/* get an unused inode number from bitmap */ ++ino_t au_xino_new_ino(struct super_block *sb) ++{ ++ ino_t ino; ++ unsigned long *p, pindex, ul, pend; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ int free_bit, err; ++ ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return iunique(sb, AUFS_FIRST_INO); ++ ++ sbinfo = au_sbi(sb); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ free_bit = 
sbinfo->si_xib_next_bit; ++ if (free_bit < page_bits && !test_bit(free_bit, p)) ++ goto out; /* success */ ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ ++ pindex = sbinfo->si_xib_last_pindex; ++ for (ul = pindex - 1; ul < ULONG_MAX; ul--) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ ++ file = sbinfo->si_xib; ++ pend = i_size_read(file->f_dentry->d_inode) / PAGE_SIZE; ++ for (ul = pindex + 1; ul <= pend; ul++) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ BUG(); ++ ++ out: ++ set_bit(free_bit, p); ++ sbinfo->si_xib_next_bit++; ++ pindex = sbinfo->si_xib_last_pindex; ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ino = xib_calc_ino(pindex, free_bit); ++ AuDbg("i%lu\n", (unsigned long)ino); ++ return ino; ++ out_err: ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ AuDbg("i0\n"); ++ return 0; ++} ++ ++/* ++ * read @ino from xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * if @ino does not exist and @do_new is true, get new one. ++ */ ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct file *file; ++ struct au_sbinfo *sbinfo; ++ ++ *ino = 0; ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return 0; /* no xino */ ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(*ino); ++ ++ file = au_sbr(sb, bindex)->br_xino.xi_file; ++ if (i_size_read(file->f_dentry->d_inode) < pos + sizeof(*ino)) ++ return 0; /* no ino */ ++ ++ sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); ++ if (sz == sizeof(*ino)) ++ return 0; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xino read error (%zd)\n", sz); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* create and set a new xino file */ ++ ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent) ++{ ++ struct file *file; ++ struct dentry *h_parent, *d; ++ struct inode *h_dir; ++ int err; ++ ++ /* ++ * at mount-time, and the xino file is the default path, ++ * hinotify is disabled so we have no inotify events to ignore. ++ * when a user specified the xino, we cannot get au_hdir to be ignored. 
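++ * the file is opened O_RDWR | O_CREAT | O_EXCL and unlinked again
++ * right below while the parent's i_mutex is held, and it is refused
++ * when it would live on this aufs itself or on a filesystem rejected
++ * by au_test_fs_bad_xino().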
++ */ ++ file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE, ++ S_IRUGO | S_IWUGO); ++ if (IS_ERR(file)) { ++ if (!silent) ++ AuErr("open %s(%ld)\n", fname, PTR_ERR(file)); ++ return file; ++ } ++ ++ /* keep file count */ ++ h_parent = dget_parent(file->f_dentry); ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ err = vfsub_unlink(h_dir, &file->f_path, /*force*/0); ++ mutex_unlock(&h_dir->i_mutex); ++ dput(h_parent); ++ if (unlikely(err)) { ++ if (!silent) ++ AuErr("unlink %s(%d)\n", fname, err); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ d = file->f_dentry; ++ if (unlikely(sb == d->d_sb)) { ++ if (!silent) ++ AuErr("%s must be outside\n", fname); ++ goto out; ++ } ++ if (unlikely(au_test_fs_bad_xino(d->d_sb))) { ++ if (!silent) ++ AuErr("xino doesn't support %s(%s)\n", ++ fname, au_sbtype(d->d_sb)); ++ goto out; ++ } ++ return file; /* success */ ++ ++ out: ++ fput(file); ++ file = ERR_PTR(err); ++ return file; ++} ++ ++/* ++ * find another branch who is on the same filesystem of the specified ++ * branch{@btgt}. search until @bend. ++ */ ++static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, ++ aufs_bindex_t bend) ++{ ++ aufs_bindex_t bindex; ++ struct super_block *tgt_sb = au_sbr_sb(sb, btgt); ++ ++ for (bindex = 0; bindex < btgt; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ for (bindex++; bindex <= bend; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * initialize the xinofile for the specified branch @br ++ * at the place/path where @base_file indicates. ++ * test whether another branch is on the same filesystem or not, ++ * if @do_test is true. 
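++ * branches sitting on the same underlying filesystem share one xino
++ * file: when such a branch already owns an xi_file it is reused via
++ * get_file(), otherwise a fresh table is made with au_xino_create2().
++ * in both cases the branch root's @h_ino is finally mapped to
++ * AUFS_ROOT_INO.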
++ */ ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, ++ struct file *base_file, int do_test) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bend, bindex; ++ struct au_branch *shared_br, *b; ++ struct file *file; ++ struct super_block *tgt_sb; ++ ++ shared_br = NULL; ++ bend = au_sbend(sb); ++ if (do_test) { ++ tgt_sb = br->br_mnt->mnt_sb; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ b = au_sbr(sb, bindex); ++ if (tgt_sb == b->br_mnt->mnt_sb) { ++ shared_br = b; ++ break; ++ } ++ } ++ } ++ ++ if (!shared_br || !shared_br->br_xino.xi_file) { ++ struct au_xino_lock_dir ldir; ++ ++ au_xino_lock_dir(sb, base_file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(base_file, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ br->br_xino.xi_file = file; ++ } else { ++ br->br_xino.xi_file = shared_br->br_xino.xi_file; ++ get_file(br->br_xino.xi_file); ++ } ++ ++ ino = AUFS_ROOT_INO; ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (!err) ++ return 0; /* success */ ++ ++ ++ out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate a xino bitmap file */ ++ ++/* todo: slow */ ++static int do_xib_restore(struct super_block *sb, struct file *file, void *page) ++{ ++ int err, bit; ++ ssize_t sz; ++ unsigned long pindex; ++ loff_t pos, pend; ++ struct au_sbinfo *sbinfo; ++ au_readf_t func; ++ ino_t *ino; ++ unsigned long *p; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ func = sbinfo->si_xread; ++ pend = i_size_read(file->f_dentry->d_inode); ++ pos = 0; ++ while (pos < pend) { ++ sz = xino_fread(func, file, page, PAGE_SIZE, &pos); ++ err = sz; ++ if (unlikely(sz <= 0)) ++ goto out; ++ ++ err = 0; ++ for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { ++ if (unlikely(*ino < AUFS_FIRST_INO)) ++ continue; ++ ++ xib_calc_bit(*ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ err = xib_pindex(sb, pindex); ++ if (!err) ++ set_bit(bit, p); ++ else ++ goto out; ++ } ++ } ++ ++ out: ++ return err; ++} ++ ++static int xib_restore(struct super_block *sb) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ void *page; ++ ++ err = -ENOMEM; ++ page = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!page)) ++ goto out; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ for (bindex = 0; !err && bindex <= bend; bindex++) ++ if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) ++ err = do_xib_restore ++ (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); ++ else ++ AuDbg("b%d\n", bindex); ++ free_page((unsigned long)page); ++ ++ out: ++ return err; ++} ++ ++int au_xib_trunc(struct super_block *sb) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct au_xino_lock_dir ldir; ++ struct au_sbinfo *sbinfo; ++ unsigned long *p; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ if (!au_opt_test(sbinfo->si_mntflags, XINO)) ++ goto out; ++ ++ file = sbinfo->si_xib; ++ if (i_size_read(file->f_dentry->d_inode) <= PAGE_SIZE) ++ goto out; ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(sbinfo->si_xib, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ ++ p = sbinfo->si_xib_buf; ++ memset(p, 0, PAGE_SIZE); ++ pos = 0; ++ sz = xino_fwrite(sbinfo->si_xwrite, 
sbinfo->si_xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) { ++ err = sz; ++ AuIOErr("err %d\n", err); ++ if (sz >= 0) ++ err = -EIO; ++ goto out; ++ } ++ ++ mutex_lock(&sbinfo->si_xib_mtx); ++ /* mnt_want_write() is unnecessary here */ ++ err = xib_restore(sb); ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * xino mount option handlers ++ */ ++static au_readf_t find_readf(struct file *h_file) ++{ ++ const struct file_operations *fop = h_file->f_op; ++ ++ if (fop) { ++ if (fop->read) ++ return fop->read; ++ if (fop->aio_read) ++ return do_sync_read; ++ } ++ return ERR_PTR(-ENOSYS); ++} ++ ++static au_writef_t find_writef(struct file *h_file) ++{ ++ const struct file_operations *fop = h_file->f_op; ++ ++ if (fop) { ++ if (fop->write) ++ return fop->write; ++ if (fop->aio_write) ++ return do_sync_write; ++ } ++ return ERR_PTR(-ENOSYS); ++} ++ ++/* xino bitmap */ ++static void xino_clear_xib(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++} ++ ++static int au_xino_set_xib(struct super_block *sb, struct file *base) ++{ ++ int err; ++ loff_t pos; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xib); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ sbinfo->si_xread = find_readf(file); ++ sbinfo->si_xwrite = find_writef(file); ++ ++ err = -ENOMEM; ++ if (!sbinfo->si_xib_buf) ++ sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); ++ if (unlikely(!sbinfo->si_xib_buf)) ++ goto out_unset; ++ ++ sbinfo->si_xib_last_pindex = 0; ++ sbinfo->si_xib_next_bit = 0; ++ if (i_size_read(file->f_dentry->d_inode) < PAGE_SIZE) { ++ pos = 0; ++ err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, ++ PAGE_SIZE, &pos); ++ if (unlikely(err != PAGE_SIZE)) ++ goto out_free; ++ } ++ err = 0; ++ goto out; /* success */ ++ ++ out_free: ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++ if (err >= 0) ++ err = -EIO; ++ out_unset: ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++ out: ++ return err; ++} ++ ++/* xino for each branch */ ++static void xino_clear_br(struct super_block *sb) ++{ ++ aufs_bindex_t bindex, bend; ++ struct au_branch *br; ++ ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!br || !br->br_xino.xi_file) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = NULL; ++ } ++} ++ ++static int au_xino_set_br(struct super_block *sb, struct file *base) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bindex, bend, bshared; ++ struct { ++ struct file *old, *new; ++ } *fpair, *p; ++ struct au_branch *br; ++ struct inode *inode; ++ au_writef_t writef; ++ ++ SiMustWriteLock(sb); ++ ++ err = -ENOMEM; ++ bend = au_sbend(sb); ++ fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS); ++ if (unlikely(!fpair)) ++ goto out; ++ ++ inode = sb->s_root->d_inode; ++ ino = AUFS_ROOT_INO; ++ writef = au_sbi(sb)->si_xwrite; ++ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) 
{ ++ br = au_sbr(sb, bindex); ++ bshared = is_sb_shared(sb, bindex, bindex - 1); ++ if (bshared >= 0) { ++ /* shared xino */ ++ *p = fpair[bshared]; ++ get_file(p->new); ++ } ++ ++ if (!p->new) { ++ /* new xino */ ++ p->old = br->br_xino.xi_file; ++ p->new = au_xino_create2(base, br->br_xino.xi_file); ++ err = PTR_ERR(p->new); ++ if (IS_ERR(p->new)) { ++ p->new = NULL; ++ goto out_pair; ++ } ++ } ++ ++ err = au_xino_do_write(writef, p->new, ++ au_h_iptr(inode, bindex)->i_ino, ino); ++ if (unlikely(err)) ++ goto out_pair; ++ } ++ ++ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { ++ br = au_sbr(sb, bindex); ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ get_file(p->new); ++ br->br_xino.xi_file = p->new; ++ } ++ ++ out_pair: ++ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) ++ if (p->new) ++ fput(p->new); ++ else ++ break; ++ kfree(fpair); ++ out: ++ return err; ++} ++ ++void au_xino_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ au_xigen_clr(sb); ++ xino_clear_xib(sb); ++ xino_clear_br(sb); ++ sbinfo = au_sbi(sb); ++ /* lvalue, do not call au_mntflags() */ ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++} ++ ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) ++{ ++ int err, skip; ++ struct dentry *parent, *cur_parent; ++ struct qstr *dname, *cur_name; ++ struct file *cur_xino; ++ struct inode *dir; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(xino->file->f_dentry); ++ if (remount) { ++ skip = 0; ++ dname = &xino->file->f_dentry->d_name; ++ cur_xino = sbinfo->si_xib; ++ if (cur_xino) { ++ cur_parent = dget_parent(cur_xino->f_dentry); ++ cur_name = &cur_xino->f_dentry->d_name; ++ skip = (cur_parent == parent ++ && dname->len == cur_name->len ++ && !memcmp(dname->name, cur_name->name, ++ dname->len)); ++ dput(cur_parent); ++ } ++ if (skip) ++ goto out; ++ } ++ ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ dir = parent->d_inode; ++ mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ err = au_xino_set_xib(sb, xino->file); ++ if (!err) ++ err = au_xigen_set(sb, xino->file); ++ if (!err) ++ err = au_xino_set_br(sb, xino->file); ++ mutex_unlock(&dir->i_mutex); ++ if (!err) ++ goto out; /* success */ ++ ++ /* reset all */ ++ AuIOErr("failed creating xino(%d).\n", err); ++ ++ out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a xinofile at the default place/path. 
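++ * the default is AUFS_XINO_FNAME placed directly on the first writable
++ * branch whose filesystem can host it (the path is rebuilt via
++ * d_path()), with AUFS_XINO_DEFPATH under /tmp as the fallback;
++ * au_xino_brid_set() records which of the two cases was taken.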
++ */ ++struct file *au_xino_def(struct super_block *sb) ++{ ++ struct file *file; ++ char *page, *p; ++ struct au_branch *br; ++ struct super_block *h_sb; ++ struct path path; ++ aufs_bindex_t bend, bindex, bwr; ++ ++ br = NULL; ++ bend = au_sbend(sb); ++ bwr = -1; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_writable(br->br_perm) ++ && !au_test_fs_bad_xino(br->br_mnt->mnt_sb)) { ++ bwr = bindex; ++ break; ++ } ++ } ++ ++ if (bwr >= 0) { ++ file = ERR_PTR(-ENOMEM); ++ page = __getname(); ++ if (unlikely(!page)) ++ goto out; ++ path.mnt = br->br_mnt; ++ path.dentry = au_h_dptr(sb->s_root, bwr); ++ p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); ++ file = (void *)p; ++ if (!IS_ERR(p)) { ++ strcat(p, "/" AUFS_XINO_FNAME); ++ AuDbg("%s\n", p); ++ file = au_xino_create(sb, p, /*silent*/0); ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, br->br_id); ++ } ++ __putname(page); ++ } else { ++ file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); ++ if (IS_ERR(file)) ++ goto out; ++ h_sb = file->f_dentry->d_sb; ++ if (unlikely(au_test_fs_bad_xino(h_sb))) { ++ AuErr("xino doesn't support %s(%s)\n", ++ AUFS_XINO_DEFPATH, au_sbtype(h_sb)); ++ fput(file); ++ file = ERR_PTR(-EINVAL); ++ } ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, -1); ++ } ++ ++ out: ++ return file; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_xino_path(struct seq_file *seq, struct file *file) ++{ ++ int err; ++ ++ err = au_seq_path(seq, &file->f_path); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ err = 0; ++#define Deleted "\040(deleted)" ++ seq->count -= sizeof(Deleted) - 1; ++ AuDebugOn(memcmp(seq->buf + seq->count, Deleted, ++ sizeof(Deleted) - 1)); ++#undef Deleted ++ ++ out: ++ return err; ++} +diff -Nur linux-2.6.31-vanilla/fs/Kconfig linux-2.6.31/fs/Kconfig +--- linux-2.6.31-vanilla/fs/Kconfig 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/fs/Kconfig 2009-09-16 13:55:56.000000000 +0200 +@@ -187,6 +187,7 @@ + source "fs/ufs/Kconfig" + source "fs/exofs/Kconfig" + source "fs/nilfs2/Kconfig" ++source "fs/aufs/Kconfig" + + endif # MISC_FILESYSTEMS + +diff -Nur linux-2.6.31-vanilla/fs/Makefile linux-2.6.31/fs/Makefile +--- linux-2.6.31-vanilla/fs/Makefile 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/fs/Makefile 2009-09-16 13:55:56.000000000 +0200 +@@ -124,3 +124,4 @@ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-$(CONFIG_EXOFS_FS) += exofs/ ++obj-$(CONFIG_AUFS_FS) += aufs/ +diff -Nur linux-2.6.31-vanilla/fs/namei.c linux-2.6.31/fs/namei.c +--- linux-2.6.31-vanilla/fs/namei.c 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/fs/namei.c 2009-09-16 13:55:49.000000000 +0200 +@@ -337,6 +337,7 @@ + + return 0; + } ++EXPORT_SYMBOL(deny_write_access); + + /** + * path_get - get a reference to a path +@@ -1219,7 +1220,7 @@ + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. 
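+ * (exported below so that the modular aufs can call this helper and
+ * __lookup_one_len() directly.)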
+ */ +-static struct dentry *lookup_hash(struct nameidata *nd) ++struct dentry *lookup_hash(struct nameidata *nd) + { + int err; + +@@ -1228,8 +1229,9 @@ + return ERR_PTR(err); + return __lookup_hash(&nd->last, nd->path.dentry, nd); + } ++EXPORT_SYMBOL(lookup_hash); + +-static int __lookup_one_len(const char *name, struct qstr *this, ++int __lookup_one_len(const char *name, struct qstr *this, + struct dentry *base, int len) + { + unsigned long hash; +@@ -1250,6 +1252,7 @@ + this->hash = end_name_hash(hash); + return 0; + } ++EXPORT_SYMBOL(__lookup_one_len); + + /** + * lookup_one_len - filesystem helper to lookup single pathname component +diff -Nur linux-2.6.31-vanilla/fs/namespace.c linux-2.6.31/fs/namespace.c +--- linux-2.6.31-vanilla/fs/namespace.c 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/fs/namespace.c 2009-09-16 13:55:49.000000000 +0200 +@@ -39,6 +39,7 @@ + + /* spinlock for vfsmount related operations, inplace of dcache_lock */ + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); ++EXPORT_SYMBOL(vfsmount_lock); + + static int event; + static DEFINE_IDA(mnt_id_ida); +diff -Nur linux-2.6.31-vanilla/fs/open.c linux-2.6.31/fs/open.c +--- linux-2.6.31-vanilla/fs/open.c 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/fs/open.c 2009-09-16 13:55:49.000000000 +0200 +@@ -221,6 +221,7 @@ + mutex_unlock(&dentry->d_inode->i_mutex); + return err; + } ++EXPORT_SYMBOL(do_truncate); + + static long do_sys_truncate(const char __user *pathname, loff_t length) + { +diff -Nur linux-2.6.31-vanilla/fs/splice.c linux-2.6.31/fs/splice.c +--- linux-2.6.31-vanilla/fs/splice.c 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/fs/splice.c 2009-09-16 13:55:49.000000000 +0200 +@@ -1057,8 +1057,8 @@ + /* + * Attempt to initiate a splice from pipe to file. + */ +-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, +- loff_t *ppos, size_t len, unsigned int flags) ++long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) + { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); +@@ -1080,13 +1080,14 @@ + + return splice_write(pipe, out, ppos, len, flags); + } ++EXPORT_SYMBOL(do_splice_from); + + /* + * Attempt to initiate a splice from a file to a pipe. + */ +-static long do_splice_to(struct file *in, loff_t *ppos, +- struct pipe_inode_info *pipe, size_t len, +- unsigned int flags) ++long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) + { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); +@@ -1105,6 +1106,7 @@ + + return splice_read(in, ppos, pipe, len, flags); + } ++EXPORT_SYMBOL(do_splice_to); + + /** + * splice_direct_to_actor - splices data directly between two non-pipes +diff -Nur linux-2.6.31-vanilla/include/linux/aufs_type.h linux-2.6.31/include/linux/aufs_type.h +--- linux-2.6.31-vanilla/include/linux/aufs_type.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/include/linux/aufs_type.h 2009-09-16 13:55:30.000000000 +0200 +@@ -0,0 +1,184 @@ ++/* ++ * Copyright (C) 2005-2009 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef __AUFS_TYPE_H__ ++#define __AUFS_TYPE_H__ ++ ++#include <linux/ioctl.h> ++#include <linux/types.h> ++ ++#define AUFS_VERSION "2-standalone.tree-20090914" ++ ++/* todo? move this to linux-2.6.19/include/magic.h */ ++#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_BRANCH_MAX_127 ++typedef __s8 aufs_bindex_t; ++#define AUFS_BRANCH_MAX 127 ++#else ++typedef __s16 aufs_bindex_t; ++#ifdef CONFIG_AUFS_BRANCH_MAX_511 ++#define AUFS_BRANCH_MAX 511 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_1023) ++#define AUFS_BRANCH_MAX 1023 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_32767) ++#define AUFS_BRANCH_MAX 32767 ++#endif ++#endif ++ ++#ifdef __KERNEL__ ++#ifndef AUFS_BRANCH_MAX ++#error unknown CONFIG_AUFS_BRANCH_MAX value ++#endif ++#endif /* __KERNEL__ */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AUFS_NAME "aufs" ++#define AUFS_FSTYPE AUFS_NAME ++ ++#define AUFS_ROOT_INO 2 ++#define AUFS_FIRST_INO 11 ++ ++#define AUFS_WH_PFX ".wh." ++#define AUFS_WH_PFX_LEN ((int)sizeof(AUFS_WH_PFX) - 1) ++#define AUFS_XINO_FNAME "." AUFS_NAME ".xino" ++#define AUFS_XINO_DEFPATH "/tmp/" AUFS_XINO_FNAME ++#define AUFS_XINO_TRUNC_INIT 64 /* blocks */ ++#define AUFS_XINO_TRUNC_STEP 4 /* blocks */ ++#define AUFS_DIRWH_DEF 3 ++#define AUFS_RDCACHE_DEF 10 /* seconds */ ++#define AUFS_RDBLK_DEF 512 /* bytes */ ++#define AUFS_RDHASH_DEF 32 ++#define AUFS_WKQ_NAME AUFS_NAME "d" ++#define AUFS_NWKQ_DEF 4 ++#define AUFS_MFS_SECOND_DEF 30 /* seconds */ ++#define AUFS_PLINK_WARN 100 /* number of plinks */ ++ ++#define AUFS_DIROPQ_NAME AUFS_WH_PFX ".opq" /* whiteouted doubly */ ++#define AUFS_WH_DIROPQ AUFS_WH_PFX AUFS_DIROPQ_NAME ++ ++#define AUFS_BASE_NAME AUFS_WH_PFX AUFS_NAME ++#define AUFS_PLINKDIR_NAME AUFS_WH_PFX "plnk" ++#define AUFS_ORPHDIR_NAME AUFS_WH_PFX "orph" ++ ++/* doubly whiteouted */ ++#define AUFS_WH_BASE AUFS_WH_PFX AUFS_BASE_NAME ++#define AUFS_WH_PLINKDIR AUFS_WH_PFX AUFS_PLINKDIR_NAME ++#define AUFS_WH_ORPHDIR AUFS_WH_PFX AUFS_ORPHDIR_NAME ++ ++/* branch permission */ ++#define AUFS_BRPERM_RW "rw" ++#define AUFS_BRPERM_RO "ro" ++#define AUFS_BRPERM_RR "rr" ++#define AUFS_BRPERM_WH "wh" ++#define AUFS_BRPERM_NLWH "nolwh" ++#define AUFS_BRPERM_ROWH AUFS_BRPERM_RO "+" AUFS_BRPERM_WH ++#define AUFS_BRPERM_RRWH AUFS_BRPERM_RR "+" AUFS_BRPERM_WH ++#define AUFS_BRPERM_RWNLWH AUFS_BRPERM_RW "+" AUFS_BRPERM_NLWH ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ioctl */ ++enum { ++ AuCtl_PLINK_MAINT, ++ AuCtl_PLINK_CLEAN, ++ ++ /* readdir in userspace */ ++ AuCtl_RDU, ++ AuCtl_RDU_INO ++}; ++ ++/* borrowed from linux/include/linux/kernel.h */ ++#ifndef ALIGN ++#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) ++#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) ++#endif ++ ++/* borrowed from linux/include/linux/compiler-gcc3.h */ ++#ifndef __aligned ++#define __aligned(x) __attribute__((aligned(x))) ++#define __packed 
__attribute__((packed)) ++#endif ++ ++struct au_rdu_cookie { ++ __u64 h_pos; ++ __s16 bindex; ++ __u8 flags; ++ __u8 pad; ++ __u32 generation; ++} __aligned(8); ++ ++struct au_rdu_ent { ++ __u64 ino; ++ __s16 bindex; ++ __u8 type; ++ __u8 nlen; ++ __u8 wh; ++ char name[0]; ++} __aligned(8); ++ ++static inline int au_rdu_len(int nlen) ++{ ++ /* include the terminating NULL */ ++ return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1, ++ sizeof(__u64)); ++} ++ ++union au_rdu_ent_ul { ++ struct au_rdu_ent __user *e; ++ unsigned long ul; ++}; ++ ++enum { ++ AufsCtlRduV_SZ, ++ AufsCtlRduV_SZ_PTR, ++ AufsCtlRduV_End ++}; ++ ++struct aufs_rdu { ++ /* input */ ++ union { ++ __u64 sz; /* AuCtl_RDU */ ++ __u64 nent; /* AuCtl_RDU_INO */ ++ }; ++ union au_rdu_ent_ul ent; ++ __u16 verify[AufsCtlRduV_End]; ++ ++ /* input/output */ ++ __u32 blk; ++ ++ /* output */ ++ union au_rdu_ent_ul tail; ++ /* number of entries which were added in a single call */ ++ __u64 rent; ++ __u8 full; ++ __u8 shwh; ++ ++ struct au_rdu_cookie cookie; ++} __aligned(8); ++ ++#define AuCtlType 'A' ++#define AUFS_CTL_PLINK_MAINT _IO(AuCtlType, AuCtl_PLINK_MAINT) ++#define AUFS_CTL_PLINK_CLEAN _IO(AuCtlType, AuCtl_PLINK_CLEAN) ++#define AUFS_CTL_RDU _IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu) ++#define AUFS_CTL_RDU_INO _IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu) ++ ++#endif /* __AUFS_TYPE_H__ */ +diff -Nur linux-2.6.31-vanilla/include/linux/Kbuild linux-2.6.31/include/linux/Kbuild +--- linux-2.6.31-vanilla/include/linux/Kbuild 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/include/linux/Kbuild 2009-09-16 13:55:56.000000000 +0200 +@@ -34,6 +34,7 @@ + header-y += atmsap.h + header-y += atmsvc.h + header-y += atm_zatm.h ++header-y += aufs_type.h + header-y += auto_fs4.h + header-y += ax25.h + header-y += b1lli.h +diff -Nur linux-2.6.31-vanilla/include/linux/namei.h linux-2.6.31/include/linux/namei.h +--- linux-2.6.31-vanilla/include/linux/namei.h 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/include/linux/namei.h 2009-09-16 13:55:46.000000000 +0200 +@@ -75,6 +75,9 @@ + extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); + extern void release_open_intent(struct nameidata *); + ++extern struct dentry *lookup_hash(struct nameidata *nd); ++extern int __lookup_one_len(const char *name, struct qstr *this, ++ struct dentry *base, int len); + extern struct dentry *lookup_one_len(const char *, struct dentry *, int); + extern struct dentry *lookup_one_noperm(const char *, struct dentry *); + +diff -Nur linux-2.6.31-vanilla/include/linux/splice.h linux-2.6.31/include/linux/splice.h +--- linux-2.6.31-vanilla/include/linux/splice.h 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/include/linux/splice.h 2009-09-16 13:55:46.000000000 +0200 +@@ -82,4 +82,10 @@ + extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, + splice_direct_actor *); + ++extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++extern long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); ++ + #endif +diff -Nur linux-2.6.31-vanilla/security/device_cgroup.c linux-2.6.31/security/device_cgroup.c +--- linux-2.6.31-vanilla/security/device_cgroup.c 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/security/device_cgroup.c 2009-09-16 13:55:49.000000000 +0200 +@@ -513,6 +513,7 @@ + + return -EPERM; + } ++EXPORT_SYMBOL(devcgroup_inode_permission); + + int devcgroup_inode_mknod(int mode, 
dev_t dev) + { +diff -Nur linux-2.6.31-vanilla/security/integrity/ima/ima_main.c linux-2.6.31/security/integrity/ima/ima_main.c +--- linux-2.6.31-vanilla/security/integrity/ima/ima_main.c 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/security/integrity/ima/ima_main.c 2009-09-16 13:55:49.000000000 +0200 +@@ -324,6 +324,7 @@ + MAY_EXEC, FILE_MMAP); + return 0; + } ++EXPORT_SYMBOL(ima_file_mmap); + + /** + * ima_bprm_check - based on policy, collect/store measurement. +diff -Nur linux-2.6.31-vanilla/security/integrity/ima/ima_main.c.orig linux-2.6.31/security/integrity/ima/ima_main.c.orig +--- linux-2.6.31-vanilla/security/integrity/ima/ima_main.c.orig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.31/security/integrity/ima/ima_main.c.orig 2009-09-10 00:13:59.000000000 +0200 +@@ -0,0 +1,368 @@ ++/* ++ * Copyright (C) 2005,2006,2007,2008 IBM Corporation ++ * ++ * Authors: ++ * Reiner Sailer sailer@watson.ibm.com ++ * Serge Hallyn serue@us.ibm.com ++ * Kylene Hall kylene@us.ibm.com ++ * Mimi Zohar zohar@us.ibm.com ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation, version 2 of the ++ * License. ++ * ++ * File: ima_main.c ++ * implements the IMA hooks: ima_bprm_check, ima_file_mmap, ++ * and ima_path_check. ++ */ ++#include <linux/module.h> ++#include <linux/file.h> ++#include <linux/binfmts.h> ++#include <linux/mount.h> ++#include <linux/mman.h> ++ ++#include "ima.h" ++ ++int ima_initialized; ++ ++char *ima_hash = "sha1"; ++static int __init hash_setup(char *str) ++{ ++ if (strncmp(str, "md5", 3) == 0) ++ ima_hash = "md5"; ++ return 1; ++} ++__setup("ima_hash=", hash_setup); ++ ++/** ++ * ima_file_free - called on __fput() ++ * @file: pointer to file structure being freed ++ * ++ * Flag files that changed, based on i_version; ++ * and decrement the iint readcount/writecount. ++ */ ++void ima_file_free(struct file *file) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct ima_iint_cache *iint; ++ ++ if (!ima_initialized || !S_ISREG(inode->i_mode)) ++ return; ++ iint = ima_iint_find_get(inode); ++ if (!iint) ++ return; ++ ++ mutex_lock(&iint->mutex); ++ if (iint->opencount <= 0) { ++ printk(KERN_INFO ++ "%s: %s open/free imbalance (r:%ld w:%ld o:%ld f:%ld)\n", ++ __FUNCTION__, file->f_dentry->d_name.name, ++ iint->readcount, iint->writecount, ++ iint->opencount, atomic_long_read(&file->f_count)); ++ if (!(iint->flags & IMA_IINT_DUMP_STACK)) { ++ dump_stack(); ++ iint->flags |= IMA_IINT_DUMP_STACK; ++ } ++ } ++ iint->opencount--; ++ ++ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) ++ iint->readcount--; ++ ++ if (file->f_mode & FMODE_WRITE) { ++ iint->writecount--; ++ if (iint->writecount == 0) { ++ if (iint->version != inode->i_version) ++ iint->flags &= ~IMA_MEASURED; ++ } ++ } ++ mutex_unlock(&iint->mutex); ++ kref_put(&iint->refcount, iint_free); ++} ++ ++/* ima_read_write_check - reflect possible reading/writing errors in the PCR. ++ * ++ * When opening a file for read, if the file is already open for write, ++ * the file could change, resulting in a file measurement error. ++ * ++ * Opening a file for write, if the file is already open for read, results ++ * in a time of measure, time of use (ToMToU) error. ++ * ++ * In either case invalidate the PCR. 
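++ *
++ * A minimal sketch of the two races (illustrative addition, not part
++ * of the original file; A and B are hypothetical processes):
++ *   A: open("f", O_RDONLY)   -> f is measured
++ *   B: open("f", O_WRONLY)   -> writer appears after measure: ToMToU
++ * With the opens in the reverse order, A's read hits an existing
++ * writer and the "open_writers" violation is logged instead.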
++ */ ++enum iint_pcr_error { TOMTOU, OPEN_WRITERS }; ++static void ima_read_write_check(enum iint_pcr_error error, ++ struct ima_iint_cache *iint, ++ struct inode *inode, ++ const unsigned char *filename) ++{ ++ switch (error) { ++ case TOMTOU: ++ if (iint->readcount > 0) ++ ima_add_violation(inode, filename, "invalid_pcr", ++ "ToMToU"); ++ break; ++ case OPEN_WRITERS: ++ if (iint->writecount > 0) ++ ima_add_violation(inode, filename, "invalid_pcr", ++ "open_writers"); ++ break; ++ } ++} ++ ++static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, ++ const unsigned char *filename) ++{ ++ int rc = 0; ++ ++ iint->opencount++; ++ iint->readcount++; ++ ++ rc = ima_collect_measurement(iint, file); ++ if (!rc) ++ ima_store_measurement(iint, file, filename); ++ return rc; ++} ++ ++static void ima_update_counts(struct ima_iint_cache *iint, int mask) ++{ ++ iint->opencount++; ++ if ((mask & MAY_WRITE) || (mask == 0)) ++ iint->writecount++; ++ else if (mask & (MAY_READ | MAY_EXEC)) ++ iint->readcount++; ++} ++ ++/** ++ * ima_path_check - based on policy, collect/store measurement. ++ * @path: contains a pointer to the path to be measured ++ * @mask: contains MAY_READ, MAY_WRITE or MAY_EXECUTE ++ * ++ * Measure the file being open for readonly, based on the ++ * ima_must_measure() policy decision. ++ * ++ * Keep read/write counters for all files, but only ++ * invalidate the PCR for measured files: ++ * - Opening a file for write when already open for read, ++ * results in a time of measure, time of use (ToMToU) error. ++ * - Opening a file for read when already open for write, ++ * could result in a file measurement error. ++ * ++ * Always return 0 and audit dentry_open failures. ++ * (Return code will be based upon measurement appraisal.) 
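++ *
++ * Hedged sketch of a typical call site (the real hook sits in the
++ * VFS open path; the flag name follows 2.6.31-era code and should be
++ * treated as an assumption here):
++ *   error = ima_path_check(&nd->path,
++ *                          acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
++ *                          IMA_COUNT_UPDATE);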
++ */ ++int ima_path_check(struct path *path, int mask, int update_counts) ++{ ++ struct inode *inode = path->dentry->d_inode; ++ struct ima_iint_cache *iint; ++ struct file *file = NULL; ++ int rc; ++ ++ if (!ima_initialized || !S_ISREG(inode->i_mode)) ++ return 0; ++ iint = ima_iint_find_insert_get(inode); ++ if (!iint) ++ return 0; ++ ++ mutex_lock(&iint->mutex); ++ if (update_counts) ++ ima_update_counts(iint, mask); ++ ++ rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK); ++ if (rc < 0) ++ goto out; ++ ++ if ((mask & MAY_WRITE) || (mask == 0)) ++ ima_read_write_check(TOMTOU, iint, inode, ++ path->dentry->d_name.name); ++ ++ if ((mask & (MAY_WRITE | MAY_READ | MAY_EXEC)) != MAY_READ) ++ goto out; ++ ++ ima_read_write_check(OPEN_WRITERS, iint, inode, ++ path->dentry->d_name.name); ++ if (!(iint->flags & IMA_MEASURED)) { ++ struct dentry *dentry = dget(path->dentry); ++ struct vfsmount *mnt = mntget(path->mnt); ++ ++ file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE, ++ current_cred()); ++ if (IS_ERR(file)) { ++ int audit_info = 0; ++ ++ integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, ++ dentry->d_name.name, ++ "add_measurement", ++ "dentry_open failed", ++ 1, audit_info); ++ file = NULL; ++ goto out; ++ } ++ rc = get_path_measurement(iint, file, dentry->d_name.name); ++ } ++out: ++ mutex_unlock(&iint->mutex); ++ if (file) ++ fput(file); ++ kref_put(&iint->refcount, iint_free); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(ima_path_check); ++ ++static int process_measurement(struct file *file, const unsigned char *filename, ++ int mask, int function) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct ima_iint_cache *iint; ++ int rc; ++ ++ if (!ima_initialized || !S_ISREG(inode->i_mode)) ++ return 0; ++ iint = ima_iint_find_insert_get(inode); ++ if (!iint) ++ return -ENOMEM; ++ ++ mutex_lock(&iint->mutex); ++ rc = ima_must_measure(iint, inode, mask, function); ++ if (rc != 0) ++ goto out; ++ ++ rc = ima_collect_measurement(iint, file); ++ if (!rc) ++ ima_store_measurement(iint, file, filename); ++out: ++ mutex_unlock(&iint->mutex); ++ kref_put(&iint->refcount, iint_free); ++ return rc; ++} ++ ++/* ++ * ima_counts_put - decrement file counts ++ * ++ * File counts are incremented in ima_path_check. On file open ++ * error, such as ETXTBSY, decrement the counts to prevent ++ * unnecessary imbalance messages. ++ */ ++void ima_counts_put(struct path *path, int mask) ++{ ++ struct inode *inode = path->dentry->d_inode; ++ struct ima_iint_cache *iint; ++ ++ /* The inode may already have been freed, freeing the iint ++ * with it. Verify the inode is not NULL before dereferencing ++ * it. ++ */ ++ if (!ima_initialized || !inode || !S_ISREG(inode->i_mode)) ++ return; ++ iint = ima_iint_find_insert_get(inode); ++ if (!iint) ++ return; ++ ++ mutex_lock(&iint->mutex); ++ iint->opencount--; ++ if ((mask & MAY_WRITE) || (mask == 0)) ++ iint->writecount--; ++ else if (mask & (MAY_READ | MAY_EXEC)) ++ iint->readcount--; ++ mutex_unlock(&iint->mutex); ++ ++ kref_put(&iint->refcount, iint_free); ++} ++ ++/* ++ * ima_counts_get - increment file counts ++ * ++ * - for IPC shm and shmat file. ++ * - for nfsd exported files. ++ * ++ * Increment the counts for these files to prevent unnecessary ++ * imbalance messages. 
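++ *
++ * Assumed usage sketch (not from this file): the SysV shm attach
++ * path pins the counters for the segment's backing file, roughly
++ *   ima_counts_get(shp->shm_file);
++ * so that the ima_file_free() run by the eventual fput() balances.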
++ */ ++void ima_counts_get(struct file *file) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct ima_iint_cache *iint; ++ ++ if (!ima_initialized || !S_ISREG(inode->i_mode)) ++ return; ++ iint = ima_iint_find_insert_get(inode); ++ if (!iint) ++ return; ++ mutex_lock(&iint->mutex); ++ iint->opencount++; ++ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) ++ iint->readcount++; ++ ++ if (file->f_mode & FMODE_WRITE) ++ iint->writecount++; ++ mutex_unlock(&iint->mutex); ++ ++ kref_put(&iint->refcount, iint_free); ++} ++EXPORT_SYMBOL_GPL(ima_counts_get); ++ ++/** ++ * ima_file_mmap - based on policy, collect/store measurement. ++ * @file: pointer to the file to be measured (May be NULL) ++ * @prot: contains the protection that will be applied by the kernel. ++ * ++ * Measure files being mmapped executable based on the ima_must_measure() ++ * policy decision. ++ * ++ * Return 0 on success, an error code on failure. ++ * (Based on the results of appraise_measurement().) ++ */ ++int ima_file_mmap(struct file *file, unsigned long prot) ++{ ++ int rc; ++ ++ if (!file) ++ return 0; ++ if (prot & PROT_EXEC) ++ rc = process_measurement(file, file->f_dentry->d_name.name, ++ MAY_EXEC, FILE_MMAP); ++ return 0; ++} ++ ++/** ++ * ima_bprm_check - based on policy, collect/store measurement. ++ * @bprm: contains the linux_binprm structure ++ * ++ * The OS protects against an executable file, already open for write, ++ * from being executed in deny_write_access() and an executable file, ++ * already open for execute, from being modified in get_write_access(). ++ * So we can be certain that what we verify and measure here is actually ++ * what is being executed. ++ * ++ * Return 0 on success, an error code on failure. ++ * (Based on the results of appraise_measurement().) 
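++ *
++ * Hedged call-site sketch (the hook lives in the execve path,
++ * fs/exec.c in this era; the surrounding code is paraphrased):
++ *   retval = ima_bprm_check(bprm);
++ *   if (retval)
++ *           goto out;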
++ */ ++int ima_bprm_check(struct linux_binprm *bprm) ++{ ++ int rc; ++ ++ rc = process_measurement(bprm->file, bprm->filename, ++ MAY_EXEC, BPRM_CHECK); ++ return 0; ++} ++ ++static int __init init_ima(void) ++{ ++ int error; ++ ++ ima_iintcache_init(); ++ error = ima_init(); ++ ima_initialized = 1; ++ return error; ++} ++ ++static void __exit cleanup_ima(void) ++{ ++ ima_cleanup(); ++} ++ ++late_initcall(init_ima); /* Start IMA after the TPM is available */ ++ ++MODULE_DESCRIPTION("Integrity Measurement Architecture"); ++MODULE_LICENSE("GPL"); +diff -Nur linux-2.6.31-vanilla/security/security.c linux-2.6.31/security/security.c +--- linux-2.6.31-vanilla/security/security.c 2009-09-10 00:13:59.000000000 +0200 ++++ linux-2.6.31/security/security.c 2009-09-16 13:55:49.000000000 +0200 +@@ -386,6 +386,7 @@ + return 0; + return security_ops->path_mkdir(path, dentry, mode); + } ++EXPORT_SYMBOL(security_path_mkdir); + + int security_path_rmdir(struct path *path, struct dentry *dentry) + { +@@ -393,6 +394,7 @@ + return 0; + return security_ops->path_rmdir(path, dentry); + } ++EXPORT_SYMBOL(security_path_rmdir); + + int security_path_unlink(struct path *path, struct dentry *dentry) + { +@@ -400,6 +402,7 @@ + return 0; + return security_ops->path_unlink(path, dentry); + } ++EXPORT_SYMBOL(security_path_unlink); + + int security_path_symlink(struct path *path, struct dentry *dentry, + const char *old_name) +@@ -408,6 +411,7 @@ + return 0; + return security_ops->path_symlink(path, dentry, old_name); + } ++EXPORT_SYMBOL(security_path_symlink); + + int security_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry) +@@ -416,6 +420,7 @@ + return 0; + return security_ops->path_link(old_dentry, new_dir, new_dentry); + } ++EXPORT_SYMBOL(security_path_link); + + int security_path_rename(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, struct dentry *new_dentry) +@@ -426,6 +431,7 @@ + return security_ops->path_rename(old_dir, old_dentry, new_dir, + new_dentry); + } ++EXPORT_SYMBOL(security_path_rename); + + int security_path_truncate(struct path *path, loff_t length, + unsigned int time_attrs) +@@ -434,6 +440,7 @@ + return 0; + return security_ops->path_truncate(path, length, time_attrs); + } ++EXPORT_SYMBOL(security_path_truncate); + #endif + + int security_inode_create(struct inode *dir, struct dentry *dentry, int mode) +@@ -505,6 +512,7 @@ + return 0; + return security_ops->inode_readlink(dentry); + } ++EXPORT_SYMBOL(security_inode_readlink); + + int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) + { +@@ -519,6 +527,7 @@ + return 0; + return security_ops->inode_permission(inode, mask); + } ++EXPORT_SYMBOL(security_inode_permission); + + int security_inode_setattr(struct dentry *dentry, struct iattr *attr) + { +@@ -619,6 +628,7 @@ + { + return security_ops->file_permission(file, mask); + } ++EXPORT_SYMBOL(security_file_permission); + + int security_file_alloc(struct file *file) + { diff --git a/pkgs/core/kernel/patches/grsecurity-2.1.14-2.6.31.1-200910012153.patch b/pkgs/core/kernel/patches/grsecurity-2.1.14-2.6.31.1-200910012153.patch deleted file mode 100644 index ae34c84..0000000 --- a/pkgs/core/kernel/patches/grsecurity-2.1.14-2.6.31.1-200910012153.patch +++ /dev/null @@ -1,47055 +0,0 @@ -diff -urNp linux-2.6.31.1/arch/alpha/include/asm/atomic.h linux-2.6.31.1/arch/alpha/include/asm/atomic.h ---- linux-2.6.31.1/arch/alpha/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ 
linux-2.6.31.1/arch/alpha/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -18,9 +18,11 @@ - #define ATOMIC64_INIT(i) ( (atomic64_t) { (i) } ) - - #define atomic_read(v) ((v)->counter + 0) -+#define atomic_read_unchecked(v) ((v)->counter + 0) - #define atomic64_read(v) ((v)->counter + 0) - - #define atomic_set(v,i) ((v)->counter = (i)) -+#define atomic_set_unchecked(v,i) ((v)->counter = (i)) - #define atomic64_set(v,i) ((v)->counter = (i)) - - /* -@@ -44,6 +46,11 @@ static __inline__ void atomic_add(int i, - :"Ir" (i), "m" (v->counter)); - } - -+static __inline__ void atomic_add_unchecked(int i, atomic_unchecked_t * v) -+{ -+ atomic_add(i, (atomic_t *)v); -+} -+ - static __inline__ void atomic64_add(long i, atomic64_t * v) - { - unsigned long temp; -@@ -74,6 +81,11 @@ static __inline__ void atomic_sub(int i, - :"Ir" (i), "m" (v->counter)); - } - -+static __inline__ void atomic_sub_unchecked(int i, atomic_unchecked_t * v) -+{ -+ atomic_sub(i, (atomic_t *)v); -+} -+ - static __inline__ void atomic64_sub(long i, atomic64_t * v) - { - unsigned long temp; -@@ -246,6 +258,7 @@ static __inline__ int atomic64_add_unles - #define atomic64_dec_and_test(v) (atomic64_sub_return(1, (v)) == 0) - - #define atomic_inc(v) atomic_add(1,(v)) -+#define atomic_inc_unchecked(v) atomic_add_unchecked(1,(v)) - #define atomic64_inc(v) atomic64_add(1,(v)) - - #define atomic_dec(v) atomic_sub(1,(v)) -diff -urNp linux-2.6.31.1/arch/alpha/include/asm/elf.h linux-2.6.31.1/arch/alpha/include/asm/elf.h ---- linux-2.6.31.1/arch/alpha/include/asm/elf.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/alpha/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -91,6 +91,13 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_N - - #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (current->personality & ADDR_LIMIT_32BIT ? 0x10000 : 0x120000000UL) -+ -+#define PAX_DELTA_MMAP_LEN (current->personality & ADDR_LIMIT_32BIT ? 14 : 28) -+#define PAX_DELTA_STACK_LEN (current->personality & ADDR_LIMIT_32BIT ? 14 : 19) -+#endif -+ - /* $0 is set by ld.so to a pointer to a function which might be - registered using atexit. 
This provides a mean for the dynamic - linker to call DT_FINI functions for shared libraries that have -diff -urNp linux-2.6.31.1/arch/alpha/include/asm/pgtable.h linux-2.6.31.1/arch/alpha/include/asm/pgtable.h ---- linux-2.6.31.1/arch/alpha/include/asm/pgtable.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/alpha/include/asm/pgtable.h 2009-10-01 20:12:42.000000000 -0400 -@@ -101,6 +101,17 @@ struct vm_area_struct; - #define PAGE_SHARED __pgprot(_PAGE_VALID | __ACCESS_BITS) - #define PAGE_COPY __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW) - #define PAGE_READONLY __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW) -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+# define PAGE_SHARED_NOEXEC __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOE) -+# define PAGE_COPY_NOEXEC __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW | _PAGE_FOE) -+# define PAGE_READONLY_NOEXEC __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW | _PAGE_FOE) -+#else -+# define PAGE_SHARED_NOEXEC PAGE_SHARED -+# define PAGE_COPY_NOEXEC PAGE_COPY -+# define PAGE_READONLY_NOEXEC PAGE_READONLY -+#endif -+ - #define PAGE_KERNEL __pgprot(_PAGE_VALID | _PAGE_ASM | _PAGE_KRE | _PAGE_KWE) - - #define _PAGE_NORMAL(x) __pgprot(_PAGE_VALID | __ACCESS_BITS | (x)) -diff -urNp linux-2.6.31.1/arch/alpha/kernel/module.c linux-2.6.31.1/arch/alpha/kernel/module.c ---- linux-2.6.31.1/arch/alpha/kernel/module.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/alpha/kernel/module.c 2009-10-01 20:12:42.000000000 -0400 -@@ -182,7 +182,7 @@ apply_relocate_add(Elf64_Shdr *sechdrs, - - /* The small sections were sorted to the end of the segment. - The following should definitely cover them. */ -- gp = (u64)me->module_core + me->core_size - 0x8000; -+ gp = (u64)me->module_core_rw + me->core_size_rw - 0x8000; - got = sechdrs[me->arch.gotsecindex].sh_addr; - - for (i = 0; i < n; i++) { -diff -urNp linux-2.6.31.1/arch/alpha/kernel/osf_sys.c linux-2.6.31.1/arch/alpha/kernel/osf_sys.c ---- linux-2.6.31.1/arch/alpha/kernel/osf_sys.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/alpha/kernel/osf_sys.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1212,6 +1212,10 @@ arch_get_unmapped_area(struct file *filp - merely specific addresses, but regions of memory -- perhaps - this feature should be incorporated into all ports? */ - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(current->mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - if (addr) { - addr = arch_get_unmapped_area_1 (PAGE_ALIGN(addr), len, limit); - if (addr != (unsigned long) -ENOMEM) -@@ -1219,8 +1223,8 @@ arch_get_unmapped_area(struct file *filp - } - - /* Next, try allocating at TASK_UNMAPPED_BASE. 
*/ -- addr = arch_get_unmapped_area_1 (PAGE_ALIGN(TASK_UNMAPPED_BASE), -- len, limit); -+ addr = arch_get_unmapped_area_1 (PAGE_ALIGN(current->mm->mmap_base), len, limit); -+ - if (addr != (unsigned long) -ENOMEM) - return addr; - -diff -urNp linux-2.6.31.1/arch/alpha/mm/fault.c linux-2.6.31.1/arch/alpha/mm/fault.c ---- linux-2.6.31.1/arch/alpha/mm/fault.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/alpha/mm/fault.c 2009-10-01 20:12:42.000000000 -0400 -@@ -54,6 +54,124 @@ __load_new_mm_context(struct mm_struct * - __reload_thread(pcb); - } - -+#ifdef CONFIG_PAX_PAGEEXEC -+/* -+ * PaX: decide what to do with offenders (regs->pc = fault address) -+ * -+ * returns 1 when task should be killed -+ * 2 when patched PLT trampoline was detected -+ * 3 when unpatched PLT trampoline was detected -+ */ -+static int pax_handle_fetch_fault(struct pt_regs *regs) -+{ -+ -+#ifdef CONFIG_PAX_EMUPLT -+ int err; -+ -+ do { /* PaX: patched PLT emulation #1 */ -+ unsigned int ldah, ldq, jmp; -+ -+ err = get_user(ldah, (unsigned int *)regs->pc); -+ err |= get_user(ldq, (unsigned int *)(regs->pc+4)); -+ err |= get_user(jmp, (unsigned int *)(regs->pc+8)); -+ -+ if (err) -+ break; -+ -+ if ((ldah & 0xFFFF0000U) == 0x277B0000U && -+ (ldq & 0xFFFF0000U) == 0xA77B0000U && -+ jmp == 0x6BFB0000U) -+ { -+ unsigned long r27, addr; -+ unsigned long addrh = (ldah | 0xFFFFFFFFFFFF0000UL) << 16; -+ unsigned long addrl = ldq | 0xFFFFFFFFFFFF0000UL; -+ -+ addr = regs->r27 + ((addrh ^ 0x80000000UL) + 0x80000000UL) + ((addrl ^ 0x8000UL) + 0x8000UL); -+ err = get_user(r27, (unsigned long *)addr); -+ if (err) -+ break; -+ -+ regs->r27 = r27; -+ regs->pc = r27; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: patched PLT emulation #2 */ -+ unsigned int ldah, lda, br; -+ -+ err = get_user(ldah, (unsigned int *)regs->pc); -+ err |= get_user(lda, (unsigned int *)(regs->pc+4)); -+ err |= get_user(br, (unsigned int *)(regs->pc+8)); -+ -+ if (err) -+ break; -+ -+ if ((ldah & 0xFFFF0000U) == 0x277B0000U && -+ (lda & 0xFFFF0000U) == 0xA77B0000U && -+ (br & 0xFFE00000U) == 0xC3E00000U) -+ { -+ unsigned long addr = br | 0xFFFFFFFFFFE00000UL; -+ unsigned long addrh = (ldah | 0xFFFFFFFFFFFF0000UL) << 16; -+ unsigned long addrl = lda | 0xFFFFFFFFFFFF0000UL; -+ -+ regs->r27 += ((addrh ^ 0x80000000UL) + 0x80000000UL) + ((addrl ^ 0x8000UL) + 0x8000UL); -+ regs->pc += 12 + (((addr ^ 0x00100000UL) + 0x00100000UL) << 2); -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: unpatched PLT emulation */ -+ unsigned int br; -+ -+ err = get_user(br, (unsigned int *)regs->pc); -+ -+ if (!err && (br & 0xFFE00000U) == 0xC3800000U) { -+ unsigned int br2, ldq, nop, jmp; -+ unsigned long addr = br | 0xFFFFFFFFFFE00000UL, resolver; -+ -+ addr = regs->pc + 4 + (((addr ^ 0x00100000UL) + 0x00100000UL) << 2); -+ err = get_user(br2, (unsigned int *)addr); -+ err |= get_user(ldq, (unsigned int *)(addr+4)); -+ err |= get_user(nop, (unsigned int *)(addr+8)); -+ err |= get_user(jmp, (unsigned int *)(addr+12)); -+ err |= get_user(resolver, (unsigned long *)(addr+16)); -+ -+ if (err) -+ break; -+ -+ if (br2 == 0xC3600000U && -+ ldq == 0xA77B000CU && -+ nop == 0x47FF041FU && -+ jmp == 0x6B7B0000U) -+ { -+ regs->r28 = regs->pc+4; -+ regs->r27 = addr+16; -+ regs->pc = resolver; -+ return 3; -+ } -+ } -+ } while (0); -+#endif -+ -+ return 1; -+} -+ -+void pax_report_insns(void *pc, void *sp) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 5; i++) { -+ unsigned int c; -+ if (get_user(c, (unsigned int *)pc+i)) -+ 
printk(KERN_CONT "???????? "); -+ else -+ printk(KERN_CONT "%08x ", c); -+ } -+ printk("\n"); -+} -+#endif - - /* - * This routine handles page faults. It determines the address, -@@ -131,8 +249,29 @@ do_page_fault(unsigned long address, uns - good_area: - si_code = SEGV_ACCERR; - if (cause < 0) { -- if (!(vma->vm_flags & VM_EXEC)) -+ if (!(vma->vm_flags & VM_EXEC)) { -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!(mm->pax_flags & MF_PAX_PAGEEXEC) || address != regs->pc) -+ goto bad_area; -+ -+ up_read(&mm->mmap_sem); -+ switch (pax_handle_fetch_fault(regs)) { -+ -+#ifdef CONFIG_PAX_EMUPLT -+ case 2: -+ case 3: -+ return; -+#endif -+ -+ } -+ pax_report_fault(regs, (void *)regs->pc, (void *)rdusp()); -+ do_group_exit(SIGKILL); -+#else - goto bad_area; -+#endif -+ -+ } - } else if (!cause) { - /* Allow reads even for write-only mappings */ - if (!(vma->vm_flags & (VM_READ | VM_WRITE))) -diff -urNp linux-2.6.31.1/arch/arm/include/asm/atomic.h linux-2.6.31.1/arch/arm/include/asm/atomic.h ---- linux-2.6.31.1/arch/arm/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/arm/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -20,6 +20,7 @@ - #ifdef __KERNEL__ - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - - #if __LINUX_ARM_ARCH__ >= 6 - -@@ -44,6 +45,11 @@ static inline void atomic_set(atomic_t * - : "cc"); - } - -+static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i) -+{ -+ atomic_set((atomic_t *)v, i); -+} -+ - static inline void atomic_add(int i, atomic_t *v) - { - unsigned long tmp; -@@ -60,6 +66,11 @@ static inline void atomic_add(int i, ato - : "cc"); - } - -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_add(i, (atomic_t *)v); -+} -+ - static inline int atomic_add_return(int i, atomic_t *v) - { - unsigned long tmp; -@@ -98,6 +109,11 @@ static inline void atomic_sub(int i, ato - : "cc"); - } - -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub(i, (atomic_t *)v); -+} -+ - static inline int atomic_sub_return(int i, atomic_t *v) - { - unsigned long tmp; -@@ -164,6 +180,7 @@ static inline void atomic_clear_mask(uns - #endif - - #define atomic_set(v,i) (((v)->counter) = (i)) -+#define atomic_set_unchecked(v,i) (((v)->counter) = (i)) - - static inline int atomic_add_return(int i, atomic_t *v) - { -@@ -232,6 +249,7 @@ static inline int atomic_add_unless(atom - #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - - #define atomic_inc(v) atomic_add(1, v) -+#define atomic_inc_unchecked(v) atomic_add_unchecked(1, v) - #define atomic_dec(v) atomic_sub(1, v) - - #define atomic_inc_and_test(v) (atomic_add_return(1, v) == 0) -diff -urNp linux-2.6.31.1/arch/arm/include/asm/elf.h linux-2.6.31.1/arch/arm/include/asm/elf.h ---- linux-2.6.31.1/arch/arm/include/asm/elf.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/arm/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -103,7 +103,14 @@ extern int arm_elf_read_implies_exec(con - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - --#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) -+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) -+ -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE 0x00008000UL -+ -+#define PAX_DELTA_MMAP_LEN ((current->personality == PER_LINUX_32BIT) ? 16 : 10) -+#define PAX_DELTA_STACK_LEN ((current->personality == PER_LINUX_32BIT) ? 
16 : 10) -+#endif - - /* When the program starts, a1 contains a pointer to a function to be - registered with atexit, as per the SVR4 ABI. A value of 0 means we -diff -urNp linux-2.6.31.1/arch/arm/include/asm/kmap_types.h linux-2.6.31.1/arch/arm/include/asm/kmap_types.h ---- linux-2.6.31.1/arch/arm/include/asm/kmap_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/arm/include/asm/kmap_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -19,6 +19,7 @@ enum km_type { - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_L2_CACHE, -+ KM_CLEARPAGE, - KM_TYPE_NR - }; - -diff -urNp linux-2.6.31.1/arch/arm/include/asm/uaccess.h linux-2.6.31.1/arch/arm/include/asm/uaccess.h ---- linux-2.6.31.1/arch/arm/include/asm/uaccess.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/arm/include/asm/uaccess.h 2009-10-01 20:12:42.000000000 -0400 -@@ -400,6 +400,9 @@ extern unsigned long __must_check __strn - - static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else /* security hole - plug it */ -@@ -409,6 +412,9 @@ static inline unsigned long __must_check - - static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -diff -urNp linux-2.6.31.1/arch/arm/mach-ns9xxx/clock.c linux-2.6.31.1/arch/arm/mach-ns9xxx/clock.c ---- linux-2.6.31.1/arch/arm/mach-ns9xxx/clock.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/arm/mach-ns9xxx/clock.c 2009-10-01 20:12:42.000000000 -0400 -@@ -195,7 +195,7 @@ static int clk_debugfs_open(struct inode - return single_open(file, clk_debugfs_show, NULL); - } - --static struct file_operations clk_debugfs_operations = { -+static const struct file_operations clk_debugfs_operations = { - .open = clk_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/arch/arm/mm/mmap.c linux-2.6.31.1/arch/arm/mm/mmap.c ---- linux-2.6.31.1/arch/arm/mm/mmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/arm/mm/mmap.c 2009-10-01 20:12:42.000000000 -0400 -@@ -62,6 +62,10 @@ arch_get_unmapped_area(struct file *filp - if (len > TASK_SIZE) - return -ENOMEM; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - if (addr) { - if (do_align) - addr = COLOUR_ALIGN(addr, pgoff); -@@ -74,10 +78,10 @@ arch_get_unmapped_area(struct file *filp - return addr; - } - if (len > mm->cached_hole_size) { -- start_addr = addr = mm->free_area_cache; -+ start_addr = addr = mm->free_area_cache; - } else { -- start_addr = addr = TASK_UNMAPPED_BASE; -- mm->cached_hole_size = 0; -+ start_addr = addr = mm->mmap_base; -+ mm->cached_hole_size = 0; - } - - full_search: -@@ -93,8 +97,8 @@ full_search: - * Start a new search - just in case we missed - * some holes. 
- */ -- if (start_addr != TASK_UNMAPPED_BASE) { -- start_addr = addr = TASK_UNMAPPED_BASE; -+ if (start_addr != mm->mmap_base) { -+ start_addr = addr = mm->mmap_base; - mm->cached_hole_size = 0; - goto full_search; - } -diff -urNp linux-2.6.31.1/arch/avr32/include/asm/atomic.h linux-2.6.31.1/arch/avr32/include/asm/atomic.h ---- linux-2.6.31.1/arch/avr32/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/avr32/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -20,7 +20,9 @@ - #define ATOMIC_INIT(i) { (i) } - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic_set(v, i) (((v)->counter) = i) -+#define atomic_set_unchecked(v, i) (((v)->counter) = i) - - /* - * atomic_sub_return - subtract the atomic variable -@@ -48,6 +50,18 @@ static inline int atomic_sub_return(int - } - - /* -+ * atomic_sub_return_unchecked - subtract the atomic variable -+ * @i: integer value to subtract -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically subtracts @i from @v. Returns the resulting value. -+ */ -+static inline int atomic_sub_return_unchecked(int i, atomic_unchecked_t *v) -+{ -+ return atomic_sub_return(i, (atomic_t *)v); -+} -+ -+/* - * atomic_add_return - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t -@@ -76,6 +90,18 @@ static inline int atomic_add_return(int - } - - /* -+ * atomic_add_return_unchecked - add integer to atomic variable -+ * @i: integer value to add -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically adds @i to @v. Returns the resulting value. -+ */ -+static inline int atomic_add_return_unchecked(int i, atomic_unchecked_t *v) -+{ -+ return atomic_add_return(i, (atomic_t *)v); -+} -+ -+/* - * atomic_sub_unless - sub unless the number is a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... -@@ -176,9 +202,12 @@ static inline int atomic_sub_if_positive - #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) - - #define atomic_sub(i, v) (void)atomic_sub_return(i, v) -+#define atomic_sub_unchecked(i, v) (void)atomic_sub_return_unchecked(i, v) - #define atomic_add(i, v) (void)atomic_add_return(i, v) -+#define atomic_add_unchecked(i, v) (void)atomic_add_return_unchecked(i, v) - #define atomic_dec(v) atomic_sub(1, (v)) - #define atomic_inc(v) atomic_add(1, (v)) -+#define atomic_inc_unchecked(v) (void)atomic_add_return_unchecked(1, (v)) - - #define atomic_dec_return(v) atomic_sub_return(1, v) - #define atomic_inc_return(v) atomic_add_return(1, v) -diff -urNp linux-2.6.31.1/arch/avr32/include/asm/elf.h linux-2.6.31.1/arch/avr32/include/asm/elf.h ---- linux-2.6.31.1/arch/avr32/include/asm/elf.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/avr32/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -85,8 +85,14 @@ typedef struct user_fpu_struct elf_fpreg - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - --#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) -+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE 0x00001000UL -+ -+#define PAX_DELTA_MMAP_LEN 15 -+#define PAX_DELTA_STACK_LEN 15 -+#endif - - /* This yields a mask that user programs can use to figure out what - instruction set this CPU supports. 
This could be done in user space, -diff -urNp linux-2.6.31.1/arch/avr32/include/asm/kmap_types.h linux-2.6.31.1/arch/avr32/include/asm/kmap_types.h ---- linux-2.6.31.1/arch/avr32/include/asm/kmap_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/avr32/include/asm/kmap_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -22,7 +22,8 @@ D(10) KM_IRQ0, - D(11) KM_IRQ1, - D(12) KM_SOFTIRQ0, - D(13) KM_SOFTIRQ1, --D(14) KM_TYPE_NR -+D(14) KM_CLEARPAGE, -+D(15) KM_TYPE_NR - }; - - #undef D -diff -urNp linux-2.6.31.1/arch/avr32/mm/fault.c linux-2.6.31.1/arch/avr32/mm/fault.c ---- linux-2.6.31.1/arch/avr32/mm/fault.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/avr32/mm/fault.c 2009-10-01 20:12:42.000000000 -0400 -@@ -41,6 +41,23 @@ static inline int notify_page_fault(stru - - int exception_trace = 1; - -+#ifdef CONFIG_PAX_PAGEEXEC -+void pax_report_insns(void *pc, void *sp) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 20; i++) { -+ unsigned char c; -+ if (get_user(c, (unsigned char *)pc+i)) -+ printk(KERN_CONT "???????? "); -+ else -+ printk(KERN_CONT "%02x ", c); -+ } -+ printk("\n"); -+} -+#endif -+ - /* - * This routine handles page faults. It determines the address and the - * problem, and then passes it off to one of the appropriate routines. -@@ -157,6 +174,16 @@ bad_area: - up_read(&mm->mmap_sem); - - if (user_mode(regs)) { -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (mm->pax_flags & MF_PAX_PAGEEXEC) { -+ if (ecr == ECR_PROTECTION_X || ecr == ECR_TLB_MISS_X) { -+ pax_report_fault(regs, (void *)regs->pc, (void *)regs->sp); -+ do_group_exit(SIGKILL); -+ } -+ } -+#endif -+ - if (exception_trace && printk_ratelimit()) - printk("%s%s[%d]: segfault at %08lx pc %08lx " - "sp %08lx ecr %lu\n", -diff -urNp linux-2.6.31.1/arch/blackfin/include/asm/atomic.h linux-2.6.31.1/arch/blackfin/include/asm/atomic.h ---- linux-2.6.31.1/arch/blackfin/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/blackfin/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -15,8 +15,10 @@ - - #define ATOMIC_INIT(i) { (i) } - #define atomic_set(v, i) (((v)->counter) = i) -+#define atomic_set_unchecked(v, i) (((v)->counter) = i) - - #define atomic_read(v) __raw_uncached_fetch_asm(&(v)->counter) -+#define atomic_read_unchecked(v) __raw_uncached_fetch_asm(&(v)->counter) - - asmlinkage int __raw_uncached_fetch_asm(const volatile int *ptr); - -@@ -35,11 +37,21 @@ static inline void atomic_add(int i, ato - __raw_atomic_update_asm(&v->counter, i); - } - -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_add(i, (atomic_t *)v); -+} -+ - static inline void atomic_sub(int i, atomic_t *v) - { - __raw_atomic_update_asm(&v->counter, -i); - } - -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub(i, (atomic_t *)v); -+} -+ - static inline int atomic_add_return(int i, atomic_t *v) - { - return __raw_atomic_update_asm(&v->counter, i); -@@ -55,6 +67,11 @@ static inline void atomic_inc(volatile a - __raw_atomic_update_asm(&v->counter, 1); - } - -+static inline void atomic_inc_unchecked(volatile atomic_unchecked_t *v) -+{ -+ atomic_inc((atomic_t *)v); -+} -+ - static inline void atomic_dec(volatile atomic_t *v) - { - __raw_atomic_update_asm(&v->counter, -1); -diff -urNp linux-2.6.31.1/arch/blackfin/mach-bf561/coreb.c linux-2.6.31.1/arch/blackfin/mach-bf561/coreb.c ---- linux-2.6.31.1/arch/blackfin/mach-bf561/coreb.c 2009-09-24 11:45:25.000000000 -0400 -+++ 
linux-2.6.31.1/arch/blackfin/mach-bf561/coreb.c 2009-10-01 20:12:42.000000000 -0400 -@@ -48,7 +48,7 @@ coreb_ioctl(struct inode *inode, struct - return ret; - } - --static struct file_operations coreb_fops = { -+static const struct file_operations coreb_fops = { - .owner = THIS_MODULE, - .ioctl = coreb_ioctl, - }; -diff -urNp linux-2.6.31.1/arch/cris/arch-v10/drivers/sync_serial.c linux-2.6.31.1/arch/cris/arch-v10/drivers/sync_serial.c ---- linux-2.6.31.1/arch/cris/arch-v10/drivers/sync_serial.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/cris/arch-v10/drivers/sync_serial.c 2009-10-01 20:12:42.000000000 -0400 -@@ -244,7 +244,7 @@ static unsigned sync_serial_prescale_sha - - #define NUMBER_OF_PORTS 2 - --static struct file_operations sync_serial_fops = { -+static const struct file_operations sync_serial_fops = { - .owner = THIS_MODULE, - .write = sync_serial_write, - .read = sync_serial_read, -diff -urNp linux-2.6.31.1/arch/cris/arch-v32/drivers/mach-fs/gpio.c linux-2.6.31.1/arch/cris/arch-v32/drivers/mach-fs/gpio.c ---- linux-2.6.31.1/arch/cris/arch-v32/drivers/mach-fs/gpio.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/cris/arch-v32/drivers/mach-fs/gpio.c 2009-10-01 20:12:42.000000000 -0400 -@@ -855,7 +855,7 @@ gpio_leds_ioctl(unsigned int cmd, unsign - return 0; - } - --struct file_operations gpio_fops = { -+const struct file_operations gpio_fops = { - .owner = THIS_MODULE, - .poll = gpio_poll, - .ioctl = gpio_ioctl, -diff -urNp linux-2.6.31.1/arch/cris/include/asm/atomic.h linux-2.6.31.1/arch/cris/include/asm/atomic.h ---- linux-2.6.31.1/arch/cris/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/cris/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -16,7 +16,9 @@ - #define ATOMIC_INIT(i) { (i) } - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic_set(v,i) (((v)->counter) = (i))
*/ - -@@ -28,6 +30,11 @@ static inline void atomic_add(int i, vol - cris_atomic_restore(v, flags); - } - -+static inline void atomic_add_unchecked(int i, volatile atomic_unchecked_t *v) -+{ -+ atomic_add(i, (volatile atomic_t *)v); -+} -+ - static inline void atomic_sub(int i, volatile atomic_t *v) - { - unsigned long flags; -@@ -36,6 +43,11 @@ static inline void atomic_sub(int i, vol - cris_atomic_restore(v, flags); - } - -+static inline void atomic_sub_unchecked(int i, volatile atomic_unchecked_t *v) -+{ -+ atomic_sub(i, (volatile atomic_t *)v); -+} -+ - static inline int atomic_add_return(int i, volatile atomic_t *v) - { - unsigned long flags; -@@ -76,6 +88,11 @@ static inline void atomic_inc(volatile a - cris_atomic_restore(v, flags); - } - -+static inline void atomic_inc_unchecked(volatile atomic_unchecked_t *v) -+{ -+ atomic_inc((volatile atomic_t *)v); -+} -+ - static inline void atomic_dec(volatile atomic_t *v) - { - unsigned long flags; -diff -urNp linux-2.6.31.1/arch/frv/include/asm/atomic.h linux-2.6.31.1/arch/frv/include/asm/atomic.h ---- linux-2.6.31.1/arch/frv/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/frv/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -37,7 +37,9 @@ - - #define ATOMIC_INIT(i) { (i) } - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic_set(v, i) (((v)->counter) = (i)) -+#define atomic_set_unchecked(v, i) (((v)->counter) = (i)) - - #ifndef CONFIG_FRV_OUTOFLINE_ATOMIC_OPS - static inline int atomic_add_return(int i, atomic_t *v) -@@ -99,16 +101,31 @@ static inline void atomic_add(int i, ato - atomic_add_return(i, v); - } - -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_add_return(i, (atomic_t *)v); -+} -+ - static inline void atomic_sub(int i, atomic_t *v) - { - atomic_sub_return(i, v); - } - -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub_return(i, (atomic_t *)v); -+} -+ - static inline void atomic_inc(atomic_t *v) - { - atomic_add_return(1, v); - } - -+static inline void atomic_inc_unchecked(atomic_unchecked_t *v) -+{ -+ atomic_add_return(1, (atomic_t *)v); -+} -+ - static inline void atomic_dec(atomic_t *v) - { - atomic_sub_return(1, v); -diff -urNp linux-2.6.31.1/arch/frv/include/asm/kmap_types.h linux-2.6.31.1/arch/frv/include/asm/kmap_types.h ---- linux-2.6.31.1/arch/frv/include/asm/kmap_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/frv/include/asm/kmap_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -23,6 +23,7 @@ enum km_type { - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, -+ KM_CLEARPAGE, - KM_TYPE_NR - }; - -diff -urNp linux-2.6.31.1/arch/h8300/include/asm/atomic.h linux-2.6.31.1/arch/h8300/include/asm/atomic.h ---- linux-2.6.31.1/arch/h8300/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/h8300/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -11,7 +11,9 @@ - #define ATOMIC_INIT(i) { (i) } - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic_set(v, i) (((v)->counter) = i) -+#define atomic_set_unchecked(v, i) (((v)->counter) = i) - - #include <asm/system.h> - #include <linux/kernel.h> -@@ -25,7 +27,13 @@ static __inline__ int atomic_add_return( - return ret; - } - -+static __inline__ int atomic_add_return_unchecked(int i, atomic_unchecked_t *v) -+{ -+ return atomic_add_return(i, (atomic_t *)v); -+} -+ - #define atomic_add(i, v) atomic_add_return(i, 
v) -+#define atomic_add_unchecked(i, v) atomic_add_return_unchecked(i, v) - #define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0) - - static __inline__ int atomic_sub_return(int i, atomic_t *v) -@@ -37,7 +45,13 @@ static __inline__ int atomic_sub_return( - return ret; - } - -+static __inline__ int atomic_sub_return_unchecked(int i, atomic_unchecked_t *v) -+{ -+ return atomic_sub_return(i, (atomic_t *)v); -+} -+ - #define atomic_sub(i, v) atomic_sub_return(i, v) -+#define atomic_sub_unchecked(i, v) atomic_sub_return_unchecked(i, v) - #define atomic_sub_and_test(i,v) (atomic_sub_return(i, v) == 0) - - static __inline__ int atomic_inc_return(atomic_t *v) -@@ -50,7 +64,13 @@ static __inline__ int atomic_inc_return( - return ret; - } - -+static __inline__ int atomic_inc_return_unchecked(atomic_unchecked_t *v) -+{ -+ return atomic_inc_return((atomic_t *)v); -+} -+ - #define atomic_inc(v) atomic_inc_return(v) -+#define atomic_inc_unchecked(v) atomic_inc_return_unchecked(v) - - /* - * atomic_inc_and_test - increment and test -diff -urNp linux-2.6.31.1/arch/ia64/ia32/binfmt_elf32.c linux-2.6.31.1/arch/ia64/ia32/binfmt_elf32.c ---- linux-2.6.31.1/arch/ia64/ia32/binfmt_elf32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/ia32/binfmt_elf32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -45,6 +45,13 @@ randomize_stack_top(unsigned long stack_ - - #define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack)) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (current->personality == PER_LINUX32 ? 0x08048000UL : 0x4000000000000000UL) -+ -+#define PAX_DELTA_MMAP_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) -+#define PAX_DELTA_STACK_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) -+#endif -+ - /* Ugly but avoids duplication */ - #include "../../../fs/binfmt_elf.c" - -@@ -69,11 +76,11 @@ ia32_install_gate_page (struct vm_area_s - } - - --static struct vm_operations_struct ia32_shared_page_vm_ops = { -+static const struct vm_operations_struct ia32_shared_page_vm_ops = { - .fault = ia32_install_shared_page - }; - --static struct vm_operations_struct ia32_gate_page_vm_ops = { -+static const struct vm_operations_struct ia32_gate_page_vm_ops = { - .fault = ia32_install_gate_page - }; - -diff -urNp linux-2.6.31.1/arch/ia64/ia32/ia32priv.h linux-2.6.31.1/arch/ia64/ia32/ia32priv.h ---- linux-2.6.31.1/arch/ia64/ia32/ia32priv.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/ia32/ia32priv.h 2009-10-01 20:12:42.000000000 -0400 -@@ -296,7 +296,14 @@ typedef struct compat_siginfo { - #define ELF_DATA ELFDATA2LSB - #define ELF_ARCH EM_386 - --#define IA32_STACK_TOP IA32_PAGE_OFFSET -+#ifdef CONFIG_PAX_RANDUSTACK -+#define __IA32_DELTA_STACK (current->mm->delta_stack) -+#else -+#define __IA32_DELTA_STACK 0UL -+#endif -+ -+#define IA32_STACK_TOP (IA32_PAGE_OFFSET - __IA32_DELTA_STACK) -+ - #define IA32_GATE_OFFSET IA32_PAGE_OFFSET - #define IA32_GATE_END IA32_PAGE_OFFSET + PAGE_SIZE - -diff -urNp linux-2.6.31.1/arch/ia64/include/asm/atomic.h linux-2.6.31.1/arch/ia64/include/asm/atomic.h ---- linux-2.6.31.1/arch/ia64/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -22,9 +22,11 @@ - #define ATOMIC64_INIT(i) ((atomic64_t) { (i) }) - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic64_read(v) ((v)->counter) - - #define atomic_set(v,i) (((v)->counter) = (i)) 
-+#define atomic_set_unchecked(v,i) (((v)->counter) = (i)) - #define atomic64_set(v,i) (((v)->counter) = (i)) - - static __inline__ int -@@ -201,8 +203,11 @@ atomic64_add_negative (__s64 i, atomic64 - #define atomic64_inc_and_test(v) (atomic64_add_return(1, (v)) == 0) - - #define atomic_add(i,v) atomic_add_return((i), (v)) -+#define atomic_add_unchecked(i,v) atomic_add((i), (atomic_t *)(v)) - #define atomic_sub(i,v) atomic_sub_return((i), (v)) -+#define atomic_sub_unchecked(i,v) atomic_sub((i), (atomic_t *)(v)) - #define atomic_inc(v) atomic_add(1, (v)) -+#define atomic_inc_unchecked(v) atomic_inc((atomic_t *)(v)) - #define atomic_dec(v) atomic_sub(1, (v)) - - #define atomic64_add(i,v) atomic64_add_return((i), (v)) -diff -urNp linux-2.6.31.1/arch/ia64/include/asm/elf.h linux-2.6.31.1/arch/ia64/include/asm/elf.h ---- linux-2.6.31.1/arch/ia64/include/asm/elf.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -43,6 +43,13 @@ - */ - #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x800000000UL) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (current->personality == PER_LINUX32 ? 0x08048000UL : 0x4000000000000000UL) -+ -+#define PAX_DELTA_MMAP_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) -+#define PAX_DELTA_STACK_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) -+#endif -+ - #define PT_IA_64_UNWIND 0x70000001 - - /* IA-64 relocations: */ -diff -urNp linux-2.6.31.1/arch/ia64/include/asm/pgtable.h linux-2.6.31.1/arch/ia64/include/asm/pgtable.h ---- linux-2.6.31.1/arch/ia64/include/asm/pgtable.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/include/asm/pgtable.h 2009-10-01 20:12:42.000000000 -0400 -@@ -143,6 +143,17 @@ - #define PAGE_READONLY __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) - #define PAGE_COPY __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) - #define PAGE_COPY_EXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX) -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+# define PAGE_SHARED_NOEXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RW) -+# define PAGE_READONLY_NOEXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) -+# define PAGE_COPY_NOEXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) -+#else -+# define PAGE_SHARED_NOEXEC PAGE_SHARED -+# define PAGE_READONLY_NOEXEC PAGE_READONLY -+# define PAGE_COPY_NOEXEC PAGE_COPY -+#endif -+ - #define PAGE_GATE __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX) - #define PAGE_KERNEL __pgprot(__DIRTY_BITS | _PAGE_PL_0 | _PAGE_AR_RWX) - #define PAGE_KERNELRX __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX) -diff -urNp linux-2.6.31.1/arch/ia64/include/asm/uaccess.h linux-2.6.31.1/arch/ia64/include/asm/uaccess.h ---- linux-2.6.31.1/arch/ia64/include/asm/uaccess.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/include/asm/uaccess.h 2009-10-01 20:12:42.000000000 -0400 -@@ -257,7 +257,7 @@ __copy_from_user (void *to, const void _ - const void *__cu_from = (from); \ - long __cu_len = (n); \ - \ -- if (__access_ok(__cu_to, __cu_len, get_fs())) \ -+ if (__cu_len > 0 && __cu_len <= INT_MAX && __access_ok(__cu_to, __cu_len, get_fs())) \ - __cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len); \ - __cu_len; \ - }) -@@ -269,7 +269,7 @@ __copy_from_user (void *to, const void _ - long __cu_len = (n); \ - \ - __chk_user_ptr(__cu_from); \ -- if (__access_ok(__cu_from, __cu_len, get_fs())) \ -+ if (__cu_len > 0 && __cu_len <= INT_MAX && __access_ok(__cu_from, __cu_len, 
get_fs())) \ - __cu_len = __copy_user((__force void __user *) __cu_to, __cu_from, __cu_len); \ - __cu_len; \ - }) -diff -urNp linux-2.6.31.1/arch/ia64/kernel/module.c linux-2.6.31.1/arch/ia64/kernel/module.c ---- linux-2.6.31.1/arch/ia64/kernel/module.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/kernel/module.c 2009-10-01 20:12:42.000000000 -0400 -@@ -315,8 +315,7 @@ module_alloc (unsigned long size) - void - module_free (struct module *mod, void *module_region) - { -- if (mod && mod->arch.init_unw_table && -- module_region == mod->module_init) { -+ if (mod && mod->arch.init_unw_table && module_region == mod->module_init_rx) { - unw_remove_unwind_table(mod->arch.init_unw_table); - mod->arch.init_unw_table = NULL; - } -@@ -502,15 +501,39 @@ module_frob_arch_sections (Elf_Ehdr *ehd - } - - static inline int -+in_init_rx (const struct module *mod, uint64_t addr) -+{ -+ return addr - (uint64_t) mod->module_init_rx < mod->init_size_rx; -+} -+ -+static inline int -+in_init_rw (const struct module *mod, uint64_t addr) -+{ -+ return addr - (uint64_t) mod->module_init_rw < mod->init_size_rw; -+} -+ -+static inline int - in_init (const struct module *mod, uint64_t addr) - { -- return addr - (uint64_t) mod->module_init < mod->init_size; -+ return in_init_rx(mod, addr) || in_init_rw(mod, addr); -+} -+ -+static inline int -+in_core_rx (const struct module *mod, uint64_t addr) -+{ -+ return addr - (uint64_t) mod->module_core_rx < mod->core_size_rx; -+} -+ -+static inline int -+in_core_rw (const struct module *mod, uint64_t addr) -+{ -+ return addr - (uint64_t) mod->module_core_rw < mod->core_size_rw; - } - - static inline int - in_core (const struct module *mod, uint64_t addr) - { -- return addr - (uint64_t) mod->module_core < mod->core_size; -+ return in_core_rx(mod, addr) || in_core_rw(mod, addr); - } - - static inline int -@@ -693,7 +716,14 @@ do_reloc (struct module *mod, uint8_t r_ - break; - - case RV_BDREL: -- val -= (uint64_t) (in_init(mod, val) ? mod->module_init : mod->module_core); -+ if (in_init_rx(mod, val)) -+ val -= (uint64_t) mod->module_init_rx; -+ else if (in_init_rw(mod, val)) -+ val -= (uint64_t) mod->module_init_rw; -+ else if (in_core_rx(mod, val)) -+ val -= (uint64_t) mod->module_core_rx; -+ else if (in_core_rw(mod, val)) -+ val -= (uint64_t) mod->module_core_rw; - break; - - case RV_LTV: -@@ -828,15 +858,15 @@ apply_relocate_add (Elf64_Shdr *sechdrs, - * addresses have been selected... - */ - uint64_t gp; -- if (mod->core_size > MAX_LTOFF) -+ if (mod->core_size_rx + mod->core_size_rw > MAX_LTOFF) - /* - * This takes advantage of fact that SHF_ARCH_SMALL gets allocated - * at the end of the module. 
- */ -- gp = mod->core_size - MAX_LTOFF / 2; -+ gp = mod->core_size_rx + mod->core_size_rw - MAX_LTOFF / 2; - else -- gp = mod->core_size / 2; -- gp = (uint64_t) mod->module_core + ((gp + 7) & -8); -+ gp = (mod->core_size_rx + mod->core_size_rw) / 2; -+ gp = (uint64_t) mod->module_core_rx + ((gp + 7) & -8); - mod->arch.gp = gp; - DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp); - } -diff -urNp linux-2.6.31.1/arch/ia64/kernel/sys_ia64.c linux-2.6.31.1/arch/ia64/kernel/sys_ia64.c ---- linux-2.6.31.1/arch/ia64/kernel/sys_ia64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/kernel/sys_ia64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -43,6 +43,13 @@ arch_get_unmapped_area (struct file *fil - if (REGION_NUMBER(addr) == RGN_HPAGE) - addr = 0; - #endif -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ addr = mm->free_area_cache; -+ else -+#endif -+ - if (!addr) - addr = mm->free_area_cache; - -@@ -61,9 +68,9 @@ arch_get_unmapped_area (struct file *fil - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) { -- if (start_addr != TASK_UNMAPPED_BASE) { -+ if (start_addr != mm->mmap_base) { - /* Start a new search --- just in case we missed some holes. */ -- addr = TASK_UNMAPPED_BASE; -+ addr = mm->mmap_base; - goto full_search; - } - return -ENOMEM; -diff -urNp linux-2.6.31.1/arch/ia64/mm/fault.c linux-2.6.31.1/arch/ia64/mm/fault.c ---- linux-2.6.31.1/arch/ia64/mm/fault.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/mm/fault.c 2009-10-01 20:12:42.000000000 -0400 -@@ -72,6 +72,23 @@ mapped_kernel_page_is_present (unsigned - return pte_present(pte); - } - -+#ifdef CONFIG_PAX_PAGEEXEC -+void pax_report_insns(void *pc, void *sp) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 8; i++) { -+ unsigned int c; -+ if (get_user(c, (unsigned int *)pc+i)) -+ printk(KERN_CONT "???????? 
"); -+ else -+ printk(KERN_CONT "%08x ", c); -+ } -+ printk("\n"); -+} -+#endif -+ - void __kprobes - ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs) - { -@@ -145,9 +162,23 @@ ia64_do_page_fault (unsigned long addres - mask = ( (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) - | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); - -- if ((vma->vm_flags & mask) != mask) -+ if ((vma->vm_flags & mask) != mask) { -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!(vma->vm_flags & VM_EXEC) && (mask & VM_EXEC)) { -+ if (!(mm->pax_flags & MF_PAX_PAGEEXEC) || address != regs->cr_iip) -+ goto bad_area; -+ -+ up_read(&mm->mmap_sem); -+ pax_report_fault(regs, (void *)regs->cr_iip, (void *)regs->r12); -+ do_group_exit(SIGKILL); -+ } -+#endif -+ - goto bad_area; - -+ } -+ - survive: - /* - * If for any reason at all we couldn't handle the fault, make -diff -urNp linux-2.6.31.1/arch/ia64/mm/init.c linux-2.6.31.1/arch/ia64/mm/init.c ---- linux-2.6.31.1/arch/ia64/mm/init.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/ia64/mm/init.c 2009-10-01 20:12:42.000000000 -0400 -@@ -122,6 +122,19 @@ ia64_init_addr_space (void) - vma->vm_start = current->thread.rbs_bot & PAGE_MASK; - vma->vm_end = vma->vm_start + PAGE_SIZE; - vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (current->mm->pax_flags & MF_PAX_PAGEEXEC) { -+ vma->vm_flags &= ~VM_EXEC; -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (current->mm->pax_flags & MF_PAX_MPROTECT) -+ vma->vm_flags &= ~VM_MAYEXEC; -+#endif -+ -+ } -+#endif -+ - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - down_write(¤t->mm->mmap_sem); - if (insert_vm_struct(current->mm, vma)) { -diff -urNp linux-2.6.31.1/arch/m32r/include/asm/atomic.h linux-2.6.31.1/arch/m32r/include/asm/atomic.h ---- linux-2.6.31.1/arch/m32r/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/m32r/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -29,6 +29,14 @@ - #define atomic_read(v) ((v)->counter) - - /** -+ * atomic_read_unchecked - read atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically reads the value of @v. -+ */ -+#define atomic_read_unchecked(v) ((v)->counter) -+ -+/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value -@@ -38,6 +46,15 @@ - #define atomic_set(v,i) (((v)->counter) = (i)) - - /** -+ * atomic_set_unchecked - set atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * @i: required value -+ * -+ * Atomically sets the value of @v to @i. 
-+ */ -+#define atomic_set_unchecked(v,i) (((v)->counter) = (i)) -+ -+/** - * atomic_add_return - add integer to atomic variable and return it - * @i: integer value to add - * @v: pointer of type atomic_t -@@ -308,6 +325,10 @@ static __inline__ void atomic_set_mask(u - local_irq_restore(flags); - } - -+#define atomic_inc_unchecked(v) atomic_inc((atomic_t *)(v)) -+#define atomic_add_unchecked(i,v) atomic_add((i),(atomic_t *)(v)) -+#define atomic_sub_unchecked(i,v) atomic_sub((i),(atomic_t *)(v)) -+ - /* Atomic operations are already serializing on m32r */ - #define smp_mb__before_atomic_dec() barrier() - #define smp_mb__after_atomic_dec() barrier() -diff -urNp linux-2.6.31.1/arch/m32r/lib/usercopy.c linux-2.6.31.1/arch/m32r/lib/usercopy.c ---- linux-2.6.31.1/arch/m32r/lib/usercopy.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/m32r/lib/usercopy.c 2009-10-01 20:12:42.000000000 -0400 -@@ -14,6 +14,9 @@ - unsigned long - __generic_copy_to_user(void __user *to, const void *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ - prefetch(from); - if (access_ok(VERIFY_WRITE, to, n)) - __copy_user(to,from,n); -@@ -23,6 +26,9 @@ __generic_copy_to_user(void __user *to, - unsigned long - __generic_copy_from_user(void *to, const void __user *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ - prefetchw(to); - if (access_ok(VERIFY_READ, from, n)) - __copy_user_zeroing(to,from,n); -diff -urNp linux-2.6.31.1/arch/m68k/include/asm/atomic_mm.h linux-2.6.31.1/arch/m68k/include/asm/atomic_mm.h ---- linux-2.6.31.1/arch/m68k/include/asm/atomic_mm.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/m68k/include/asm/atomic_mm.h 2009-10-01 20:12:42.000000000 -0400 -@@ -16,23 +16,40 @@ - #define ATOMIC_INIT(i) { (i) } - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic_set(v, i) (((v)->counter) = i) -+#define atomic_set_unchecked(v, i) (((v)->counter) = i) - - static inline void atomic_add(int i, atomic_t *v) - { - __asm__ __volatile__("addl %1,%0" : "+m" (*v) : "id" (i)); - } - -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_add(i, (atomic_t *)v); -+} -+ - static inline void atomic_sub(int i, atomic_t *v) - { - __asm__ __volatile__("subl %1,%0" : "+m" (*v) : "id" (i)); - } - -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub(i, (atomic_t *)v); -+} -+ - static inline void atomic_inc(atomic_t *v) - { - __asm__ __volatile__("addql #1,%0" : "+m" (*v)); - } - -+static inline void atomic_inc_unchecked(atomic_unchecked_t *v) -+{ -+ atomic_inc((atomic_t *)v); -+} -+ - static inline void atomic_dec(atomic_t *v) - { - __asm__ __volatile__("subql #1,%0" : "+m" (*v)); -diff -urNp linux-2.6.31.1/arch/m68k/include/asm/atomic_no.h linux-2.6.31.1/arch/m68k/include/asm/atomic_no.h ---- linux-2.6.31.1/arch/m68k/include/asm/atomic_no.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/m68k/include/asm/atomic_no.h 2009-10-01 20:12:42.000000000 -0400 -@@ -16,7 +16,9 @@ - #define ATOMIC_INIT(i) { (i) } - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic_set(v, i) (((v)->counter) = i) -+#define atomic_set_unchecked(v, i) (((v)->counter) = i) - - static __inline__ void atomic_add(int i, atomic_t *v) - { -@@ -27,6 +29,11 @@ static __inline__ void atomic_add(int i, - #endif - } - -+static __inline__ void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_add(i, (atomic_t *)v); 
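The (long)n < 0 tests added to __generic_copy_{to,from}_user() in the m32r hunk above recur for s390 and powerpc later in this patch: a length computation that underflowed looks like an enormous unsigned copy, so PaX rejects any size with the top bit set before touching memory (the powerpc variant additionally bounds n by INT_MAX and calls check_object_size()). A userspace model of the guard, with memcpy() standing in for __copy_user():

    #include <stdio.h>
    #include <string.h>

    /* Returns the number of bytes NOT copied, like the kernel helpers:
     * 0 means success, n means nothing was copied at all. */
    static unsigned long copy_guarded(void *to, const void *from,
                                      unsigned long n)
    {
            if ((long)n < 0)        /* underflowed length: refuse outright */
                    return n;
            memcpy(to, from, n);    /* stand-in for __copy_user() */
            return 0;
    }

    int main(void)
    {
            char dst[8], src[8] = "1234567";
            printf("sane length:  %lu uncopied\n",
                   copy_guarded(dst, src, sizeof(src)));
            printf("bogus length: %lu uncopied\n",
                   copy_guarded(dst, src, (unsigned long)-4));
            return 0;
    }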
-+} -+ - static __inline__ void atomic_sub(int i, atomic_t *v) - { - #ifdef CONFIG_COLDFIRE -@@ -36,6 +43,11 @@ static __inline__ void atomic_sub(int i, - #endif - } - -+static __inline__ void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub(i, (atomic_t *)v); -+} -+ - static __inline__ int atomic_sub_and_test(int i, atomic_t * v) - { - char c; -@@ -56,6 +68,11 @@ static __inline__ void atomic_inc(volati - __asm__ __volatile__("addql #1,%0" : "+m" (*v)); - } - -+static __inline__ void atomic_inc_unchecked(volatile atomic_unchecked_t *v) -+{ -+ atomic_inc((volatile atomic_t *)v); -+} -+ - /* - * atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t -diff -urNp linux-2.6.31.1/arch/mips/include/asm/atomic.h linux-2.6.31.1/arch/mips/include/asm/atomic.h ---- linux-2.6.31.1/arch/mips/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -32,6 +32,14 @@ - #define atomic_read(v) ((v)->counter) - - /* -+ * atomic_read_unchecked - read atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically reads the value of @v. -+ */ -+#define atomic_read_unchecked(v) ((v)->counter) -+ -+/* - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value -@@ -41,6 +49,15 @@ - #define atomic_set(v, i) ((v)->counter = (i)) - - /* -+ * atomic_set_unchecked - set atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * @i: required value -+ * -+ * Atomically sets the value of @v to @i. -+ */ -+#define atomic_set_unchecked(v, i) ((v)->counter = (i)) -+ -+/* - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t -@@ -381,6 +398,9 @@ static __inline__ int atomic_add_unless( - * Atomically increments @v by 1. - */ - #define atomic_inc(v) atomic_add(1, (v)) -+#define atomic_inc_unchecked(v) atomic_inc((atomic_t *)(v)) -+#define atomic_add_unchecked(i, v) atomic_add((i), (atomic_t *)(v)) -+#define atomic_sub_unchecked(i, v) atomic_sub((i), (atomic_t *)(v)) - - /* - * atomic_dec - decrement and test -diff -urNp linux-2.6.31.1/arch/mips/include/asm/elf.h linux-2.6.31.1/arch/mips/include/asm/elf.h ---- linux-2.6.31.1/arch/mips/include/asm/elf.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -368,4 +368,11 @@ extern int dump_task_fpu(struct task_str - #define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) - #endif - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT_ADDR) ? 0x00400000UL : 0x00400000UL) -+ -+#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) -+#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 
27-PAGE_SHIFT : 36-PAGE_SHIFT) -+#endif -+ - #endif /* _ASM_ELF_H */ -diff -urNp linux-2.6.31.1/arch/mips/include/asm/page.h linux-2.6.31.1/arch/mips/include/asm/page.h ---- linux-2.6.31.1/arch/mips/include/asm/page.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/include/asm/page.h 2009-10-01 20:12:42.000000000 -0400 -@@ -92,7 +92,7 @@ extern void copy_user_highpage(struct pa - #ifdef CONFIG_CPU_MIPS32 - typedef struct { unsigned long pte_low, pte_high; } pte_t; - #define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) -- #define __pte(x) ({ pte_t __pte = {(x), ((unsigned long long)(x)) >> 32}; __pte; }) -+ #define __pte(x) ({ pte_t __pte = {(x), (x) >> 32}; __pte; }) - #else - typedef struct { unsigned long long pte; } pte_t; - #define pte_val(x) ((x).pte) -diff -urNp linux-2.6.31.1/arch/mips/include/asm/system.h linux-2.6.31.1/arch/mips/include/asm/system.h ---- linux-2.6.31.1/arch/mips/include/asm/system.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/include/asm/system.h 2009-10-01 20:12:42.000000000 -0400 -@@ -217,6 +217,6 @@ extern void per_cpu_trap_init(void); - */ - #define __ARCH_WANT_UNLOCKED_CTXSW - --extern unsigned long arch_align_stack(unsigned long sp); -+#define arch_align_stack(x) ((x) & ALMASK) - - #endif /* _ASM_SYSTEM_H */ -diff -urNp linux-2.6.31.1/arch/mips/kernel/binfmt_elfn32.c linux-2.6.31.1/arch/mips/kernel/binfmt_elfn32.c ---- linux-2.6.31.1/arch/mips/kernel/binfmt_elfn32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/kernel/binfmt_elfn32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -50,6 +50,13 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_N - #undef ELF_ET_DYN_BASE - #define ELF_ET_DYN_BASE (TASK32_SIZE / 3 * 2) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT_ADDR) ? 0x00400000UL : 0x00400000UL) -+ -+#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) -+#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) -+#endif -+ - #include <asm/processor.h> - #include <linux/module.h> - #include <linux/elfcore.h> -diff -urNp linux-2.6.31.1/arch/mips/kernel/binfmt_elfo32.c linux-2.6.31.1/arch/mips/kernel/binfmt_elfo32.c ---- linux-2.6.31.1/arch/mips/kernel/binfmt_elfo32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/kernel/binfmt_elfo32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -52,6 +52,13 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_N - #undef ELF_ET_DYN_BASE - #define ELF_ET_DYN_BASE (TASK32_SIZE / 3 * 2) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT_ADDR) ? 0x00400000UL : 0x00400000UL) -+ -+#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) -+#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) -+#endif -+ - #include <asm/processor.h> - - /* -diff -urNp linux-2.6.31.1/arch/mips/kernel/process.c linux-2.6.31.1/arch/mips/kernel/process.c ---- linux-2.6.31.1/arch/mips/kernel/process.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/kernel/process.c 2009-10-01 20:12:42.000000000 -0400 -@@ -470,15 +470,3 @@ unsigned long get_wchan(struct task_stru - out: - return pc; - } -- --/* -- * Don't forget that the stack pointer must be aligned on a 8 bytes -- * boundary for 32-bits ABI and 16 bytes for 64-bits ABI. 
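Two related moves are visible around this point: the PAX_DELTA_MMAP_LEN/PAX_DELTA_STACK_LEN macros in the elf.h hunks express ASLR entropy as a bit count applied at page granularity, and immediately below, the randomized MIPS arch_align_stack() is deleted in favour of a pure alignment mask, since PaX concentrates randomization in its RANDMMAP deltas instead of sprinkling it per call site. A sketch of how such a delta becomes a base displacement; PAGE_SHIFT of 12 and rand() as the entropy source are assumptions of the sketch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    /* 32-bit MIPS value from the hunk above: 27 - PAGE_SHIFT = 15 bits */
    #define PAX_DELTA_MMAP_LEN (27 - PAGE_SHIFT)

    /* Draw LEN random bits, then scale to pages: this models the offset
     * later added to mm->mmap_base under MF_PAX_RANDMMAP. */
    static uintptr_t pax_delta_mmap(void)
    {
            uintptr_t bits = (uintptr_t)rand() &
                             ((1UL << PAX_DELTA_MMAP_LEN) - 1);
            return bits << PAGE_SHIFT;
    }

    int main(void)
    {
            srand(1);
            printf("mmap base displaced by %#lx (max %#lx)\n",
                   (unsigned long)pax_delta_mmap(),
                   (unsigned long)(((1UL << PAX_DELTA_MMAP_LEN) - 1)
                                   << PAGE_SHIFT));
            return 0;
    }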
-- */ --unsigned long arch_align_stack(unsigned long sp) --{ -- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) -- sp -= get_random_int() & ~PAGE_MASK; -- -- return sp & ALMASK; --} -diff -urNp linux-2.6.31.1/arch/mips/kernel/syscall.c linux-2.6.31.1/arch/mips/kernel/syscall.c ---- linux-2.6.31.1/arch/mips/kernel/syscall.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/kernel/syscall.c 2009-10-01 20:12:42.000000000 -0400 -@@ -99,6 +99,11 @@ unsigned long arch_get_unmapped_area(str - do_color_align = 0; - if (filp || (flags & MAP_SHARED)) - do_color_align = 1; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(current->mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - if (addr) { - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); -@@ -109,7 +114,7 @@ unsigned long arch_get_unmapped_area(str - (!vmm || addr + len <= vmm->vm_start)) - return addr; - } -- addr = TASK_UNMAPPED_BASE; -+ addr = current->mm->mmap_base; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); - else -diff -urNp linux-2.6.31.1/arch/mips/mm/fault.c linux-2.6.31.1/arch/mips/mm/fault.c ---- linux-2.6.31.1/arch/mips/mm/fault.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mips/mm/fault.c 2009-10-01 20:12:42.000000000 -0400 -@@ -26,6 +26,23 @@ - #include <asm/ptrace.h> - #include <asm/highmem.h> /* For VMALLOC_END */ - -+#ifdef CONFIG_PAX_PAGEEXEC -+void pax_report_insns(void *pc) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 5; i++) { -+ unsigned int c; -+ if (get_user(c, (unsigned int *)pc+i)) -+ printk(KERN_CONT "???????? "); -+ else -+ printk(KERN_CONT "%08x ", c); -+ } -+ printk("\n"); -+} -+#endif -+ - /* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate -diff -urNp linux-2.6.31.1/arch/mn10300/include/asm/atomic.h linux-2.6.31.1/arch/mn10300/include/asm/atomic.h ---- linux-2.6.31.1/arch/mn10300/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mn10300/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -34,6 +34,15 @@ - #define atomic_read(v) ((v)->counter) - - /** -+ * atomic_read_unchecked - read atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically reads the value of @v. Note that the guaranteed -+ * useful range of an atomic_unchecked_t is only 24 bits. -+ */ -+#define atomic_read_unchecked(v) ((v)->counter) -+ -+/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value -@@ -43,6 +52,16 @@ - */ - #define atomic_set(v, i) (((v)->counter) = (i)) - -+/** -+ * atomic_set_unchecked - set atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * @i: required value -+ * -+ * Atomically sets the value of @v to @i. Note that the guaranteed -+ * useful range of an atomic_unchecked_t is only 24 bits. 
-+ */ -+#define atomic_set_unchecked(v, i) (((v)->counter) = (i)) -+ - #include <asm/system.h> - - /** -@@ -99,16 +118,31 @@ static inline void atomic_add(int i, ato - atomic_add_return(i, v); - } - -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_add_return(i, (atomic_t *)v); -+} -+ - static inline void atomic_sub(int i, atomic_t *v) - { - atomic_sub_return(i, v); - } - -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub_return(i, (atomic_t *)v); -+} -+ - static inline void atomic_inc(atomic_t *v) - { - atomic_add_return(1, v); - } - -+static inline void atomic_inc_unchecked(atomic_unchecked_t *v) -+{ -+ atomic_add_return(1, (atomic_t *)v); -+} -+ - static inline void atomic_dec(atomic_t *v) - { - atomic_sub_return(1, v); -diff -urNp linux-2.6.31.1/arch/mn10300/kernel/setup.c linux-2.6.31.1/arch/mn10300/kernel/setup.c ---- linux-2.6.31.1/arch/mn10300/kernel/setup.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/mn10300/kernel/setup.c 2009-10-01 20:12:42.000000000 -0400 -@@ -285,7 +285,7 @@ static void c_stop(struct seq_file *m, v - { - } - --struct seq_operations cpuinfo_op = { -+const struct seq_operations cpuinfo_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, -diff -urNp linux-2.6.31.1/arch/parisc/include/asm/atomic.h linux-2.6.31.1/arch/parisc/include/asm/atomic.h ---- linux-2.6.31.1/arch/parisc/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/parisc/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -177,6 +177,18 @@ static __inline__ int __atomic_add_retur - return ret; - } - -+static __inline__ int __atomic_add_return_unchecked(int i, atomic_unchecked_t *v) -+{ -+ int ret; -+ unsigned long flags; -+ _atomic_spin_lock_irqsave(v, flags); -+ -+ ret = (v->counter += i); -+ -+ _atomic_spin_unlock_irqrestore(v, flags); -+ return ret; -+} -+ - static __inline__ void atomic_set(atomic_t *v, int i) - { - unsigned long flags; -@@ -187,11 +199,26 @@ static __inline__ void atomic_set(atomic - _atomic_spin_unlock_irqrestore(v, flags); - } - -+static __inline__ void atomic_set_unchecked(atomic_unchecked_t *v, int i) -+{ -+ unsigned long flags; -+ _atomic_spin_lock_irqsave(v, flags); -+ -+ v->counter = i; -+ -+ _atomic_spin_unlock_irqrestore(v, flags); -+} -+ - static __inline__ int atomic_read(const atomic_t *v) - { - return v->counter; - } - -+static __inline__ int atomic_read_unchecked(const atomic_unchecked_t *v) -+{ -+ return v->counter; -+} -+ - /* exported interface */ - #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) - #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) -@@ -223,8 +250,11 @@ static __inline__ int atomic_add_unless( - #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - - #define atomic_add(i,v) ((void)(__atomic_add_return( (i),(v)))) -+#define atomic_add_unchecked(i,v) ((void)(__atomic_add_return_unchecked( ((i),(v)))) - #define atomic_sub(i,v) ((void)(__atomic_add_return(-(i),(v)))) -+#define atomic_sub_unchecked(i,v) ((void)(__atomic_add_return_unchecked(-(i),(v)))) - #define atomic_inc(v) ((void)(__atomic_add_return( 1,(v)))) -+#define atomic_inc_unchecked(v) ((void)(__atomic_add_return_unchecked( 1,(v)))) - #define atomic_dec(v) ((void)(__atomic_add_return( -1,(v)))) - - #define atomic_add_return(i,v) (__atomic_add_return( (i),(v))) -diff -urNp linux-2.6.31.1/arch/parisc/include/asm/elf.h linux-2.6.31.1/arch/parisc/include/asm/elf.h ---- linux-2.6.31.1/arch/parisc/include/asm/elf.h 
2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/parisc/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -343,6 +343,13 @@ struct pt_regs; /* forward declaration.. - - #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x01000000) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE 0x10000UL -+ -+#define PAX_DELTA_MMAP_LEN 16 -+#define PAX_DELTA_STACK_LEN 16 -+#endif -+ - /* This yields a mask that user programs can use to figure out what - instruction set this CPU supports. This could be done in user space, - but it's not easy, and we've already done it here. */ -diff -urNp linux-2.6.31.1/arch/parisc/include/asm/pgtable.h linux-2.6.31.1/arch/parisc/include/asm/pgtable.h ---- linux-2.6.31.1/arch/parisc/include/asm/pgtable.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/parisc/include/asm/pgtable.h 2009-10-01 20:12:42.000000000 -0400 -@@ -207,6 +207,17 @@ - #define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC |_PAGE_ACCESSED) - #define PAGE_COPY PAGE_EXECREAD - #define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED) -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+# define PAGE_SHARED_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_ACCESSED) -+# define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED) -+# define PAGE_READONLY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED) -+#else -+# define PAGE_SHARED_NOEXEC PAGE_SHARED -+# define PAGE_COPY_NOEXEC PAGE_COPY -+# define PAGE_READONLY_NOEXEC PAGE_READONLY -+#endif -+ - #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) - #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL & ~_PAGE_WRITE) - #define PAGE_KERNEL_UNC __pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE) -diff -urNp linux-2.6.31.1/arch/parisc/kernel/module.c linux-2.6.31.1/arch/parisc/kernel/module.c ---- linux-2.6.31.1/arch/parisc/kernel/module.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/parisc/kernel/module.c 2009-10-01 20:12:42.000000000 -0400 -@@ -95,16 +95,38 @@ - - /* three functions to determine where in the module core - * or init pieces the location is */ -+static inline int in_init_rx(struct module *me, void *loc) -+{ -+ return (loc >= me->module_init_rx && -+ loc < (me->module_init_rx + me->init_size_rx)); -+} -+ -+static inline int in_init_rw(struct module *me, void *loc) -+{ -+ return (loc >= me->module_init_rw && -+ loc < (me->module_init_rw + me->init_size_rw)); -+} -+ - static inline int in_init(struct module *me, void *loc) - { -- return (loc >= me->module_init && -- loc <= (me->module_init + me->init_size)); -+ return in_init_rx(me, loc) || in_init_rw(me, loc); -+} -+ -+static inline int in_core_rx(struct module *me, void *loc) -+{ -+ return (loc >= me->module_core_rx && -+ loc < (me->module_core_rx + me->core_size_rx)); -+} -+ -+static inline int in_core_rw(struct module *me, void *loc) -+{ -+ return (loc >= me->module_core_rw && -+ loc < (me->module_core_rw + me->core_size_rw)); - } - - static inline int in_core(struct module *me, void *loc) - { -- return (loc >= me->module_core && -- loc <= (me->module_core + me->core_size)); -+ return in_core_rx(me, loc) || in_core_rw(me, loc); - } - - static inline int in_local(struct module *me, void *loc) -@@ -364,13 +386,13 @@ int module_frob_arch_sections(CONST Elf_ - } - - /* align things a bit */ -- me->core_size = ALIGN(me->core_size, 16); -- me->arch.got_offset = me->core_size; -- me->core_size += gots * sizeof(struct got_entry); -- 
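These parisc module-loader hunks carry the core_size_rx/core_size_rw split used throughout the patch: module text is accumulated into a read-execute region, and data — GOT and function descriptors included — into a separate read-write region, so no module mapping ever needs to be writable and executable at once. A compact model of that bookkeeping; the two-field layout mirrors the patch, everything else is illustrative:

    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    struct module_layout {
            unsigned long core_size_rx;   /* executable text */
            unsigned long core_size_rw;   /* GOT, fdescs, data */
    };

    /* Account one section into the proper region and return its offset,
     * as module_frob_arch_sections() does for got_offset/fdesc_offset. */
    static unsigned long layout_section(struct module_layout *m,
                                        unsigned long size, int executable)
    {
            unsigned long *region = executable ? &m->core_size_rx
                                               : &m->core_size_rw;
            unsigned long off = ALIGN(*region, 16);
            *region = off + size;
            return off;
    }

    int main(void)
    {
            struct module_layout m = { 0, 0 };
            printf(".text at rx+%lu\n", layout_section(&m, 4000, 1));
            printf(".got  at rw+%lu\n", layout_section(&m, 256, 0));
            printf(".opd  at rw+%lu\n", layout_section(&m, 1024, 0));
            return 0;
    }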
-- me->core_size = ALIGN(me->core_size, 16); -- me->arch.fdesc_offset = me->core_size; -- me->core_size += fdescs * sizeof(Elf_Fdesc); -+ me->core_size_rw = ALIGN(me->core_size_rw, 16); -+ me->arch.got_offset = me->core_size_rw; -+ me->core_size_rw += gots * sizeof(struct got_entry); -+ -+ me->core_size_rw = ALIGN(me->core_size_rw, 16); -+ me->arch.fdesc_offset = me->core_size_rw; -+ me->core_size_rw += fdescs * sizeof(Elf_Fdesc); - - me->arch.got_max = gots; - me->arch.fdesc_max = fdescs; -@@ -388,7 +410,7 @@ static Elf64_Word get_got(struct module - - BUG_ON(value == 0); - -- got = me->module_core + me->arch.got_offset; -+ got = me->module_core_rw + me->arch.got_offset; - for (i = 0; got[i].addr; i++) - if (got[i].addr == value) - goto out; -@@ -406,7 +428,7 @@ static Elf64_Word get_got(struct module - #ifdef CONFIG_64BIT - static Elf_Addr get_fdesc(struct module *me, unsigned long value) - { -- Elf_Fdesc *fdesc = me->module_core + me->arch.fdesc_offset; -+ Elf_Fdesc *fdesc = me->module_core_rw + me->arch.fdesc_offset; - - if (!value) { - printk(KERN_ERR "%s: zero OPD requested!\n", me->name); -@@ -424,7 +446,7 @@ static Elf_Addr get_fdesc(struct module - - /* Create new one */ - fdesc->addr = value; -- fdesc->gp = (Elf_Addr)me->module_core + me->arch.got_offset; -+ fdesc->gp = (Elf_Addr)me->module_core_rw + me->arch.got_offset; - return (Elf_Addr)fdesc; - } - #endif /* CONFIG_64BIT */ -@@ -848,7 +870,7 @@ register_unwind_table(struct module *me, - - table = (unsigned char *)sechdrs[me->arch.unwind_section].sh_addr; - end = table + sechdrs[me->arch.unwind_section].sh_size; -- gp = (Elf_Addr)me->module_core + me->arch.got_offset; -+ gp = (Elf_Addr)me->module_core_rw + me->arch.got_offset; - - DEBUGP("register_unwind_table(), sect = %d at 0x%p - 0x%p (gp=0x%lx)\n", - me->arch.unwind_section, table, end, gp); -diff -urNp linux-2.6.31.1/arch/parisc/kernel/sys_parisc.c linux-2.6.31.1/arch/parisc/kernel/sys_parisc.c ---- linux-2.6.31.1/arch/parisc/kernel/sys_parisc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/parisc/kernel/sys_parisc.c 2009-10-01 20:12:42.000000000 -0400 -@@ -98,7 +98,7 @@ unsigned long arch_get_unmapped_area(str - if (flags & MAP_FIXED) - return addr; - if (!addr) -- addr = TASK_UNMAPPED_BASE; -+ addr = current->mm->mmap_base; - - if (filp) { - addr = get_shared_area(filp->f_mapping, addr, len, pgoff); -diff -urNp linux-2.6.31.1/arch/parisc/kernel/traps.c linux-2.6.31.1/arch/parisc/kernel/traps.c ---- linux-2.6.31.1/arch/parisc/kernel/traps.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/parisc/kernel/traps.c 2009-10-01 20:12:42.000000000 -0400 -@@ -733,9 +733,7 @@ void notrace handle_interruption(int cod - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm,regs->iaoq[0]); -- if (vma && (regs->iaoq[0] >= vma->vm_start) -- && (vma->vm_flags & VM_EXEC)) { -- -+ if (vma && (regs->iaoq[0] >= vma->vm_start)) { - fault_address = regs->iaoq[0]; - fault_space = regs->iasq[0]; - -diff -urNp linux-2.6.31.1/arch/parisc/mm/fault.c linux-2.6.31.1/arch/parisc/mm/fault.c ---- linux-2.6.31.1/arch/parisc/mm/fault.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/parisc/mm/fault.c 2009-10-01 20:12:42.000000000 -0400 -@@ -15,6 +15,7 @@ - #include <linux/sched.h> - #include <linux/interrupt.h> - #include <linux/module.h> -+#include <linux/unistd.h> - - #include <asm/uaccess.h> - #include <asm/traps.h> -@@ -52,7 +53,7 @@ DEFINE_PER_CPU(struct exception_data, ex - static unsigned long - parisc_acctyp(unsigned long code, unsigned int inst) - { 
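The fault.c changes that follow teach parisc about PAGEEXEC: interruption code 7 now maps to VM_EXEC alongside 6 and 16, and pax_handle_fetch_fault() pattern-matches the faulting code so that the two legitimate stack-trampoline idioms — unpatched PLT stubs and the rt_sigreturn frame — can be emulated rather than killed. A userspace model of the opcode matching; the constants come from the hunk, the fetch is simulated, and the sigreturn check is abridged:

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for get_user(): the kernel version may fault. */
    static int fetch_u32(uint32_t *dst, const uint32_t *pc)
    {
            *dst = *pc;
            return 0;
    }

    /* Verdicts follow the patch comment: 1 = kill task,
     * 2 = rt_sigreturn trampoline, 3 = unpatched PLT trampoline. */
    static int classify_fetch_fault(const uint32_t *pc)
    {
            uint32_t w0, w1;

            if (fetch_u32(&w0, pc) || fetch_u32(&w1, pc + 1))
                    return 1;
            if (w0 == 0xEA9F1FDDU && w1 == 0xD6801C1EU)
                    return 3;   /* b,l / depwi pair of a PLT stub */
            if (w0 == 0x34190000U || w0 == 0x34190002U)
                    return 2;   /* ldi opening an rt_sigreturn frame
                                 * (the real check matches 4 words) */
            return 1;
    }

    int main(void)
    {
            uint32_t plt[2]  = { 0xEA9F1FDDU, 0xD6801C1EU };
            uint32_t junk[2] = { 0, 0 };
            printf("plt: %d, junk: %d\n",
                   classify_fetch_fault(plt), classify_fetch_fault(junk));
            return 0;
    }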
-- if (code == 6 || code == 16) -+ if (code == 6 || code == 7 || code == 16) - return VM_EXEC; - - switch (inst & 0xf0000000) { -@@ -138,6 +139,116 @@ parisc_acctyp(unsigned long code, unsign - } - #endif - -+#ifdef CONFIG_PAX_PAGEEXEC -+/* -+ * PaX: decide what to do with offenders (instruction_pointer(regs) = fault address) -+ * -+ * returns 1 when task should be killed -+ * 2 when rt_sigreturn trampoline was detected -+ * 3 when unpatched PLT trampoline was detected -+ */ -+static int pax_handle_fetch_fault(struct pt_regs *regs) -+{ -+ -+#ifdef CONFIG_PAX_EMUPLT -+ int err; -+ -+ do { /* PaX: unpatched PLT emulation */ -+ unsigned int bl, depwi; -+ -+ err = get_user(bl, (unsigned int *)instruction_pointer(regs)); -+ err |= get_user(depwi, (unsigned int *)(instruction_pointer(regs)+4)); -+ -+ if (err) -+ break; -+ -+ if (bl == 0xEA9F1FDDU && depwi == 0xD6801C1EU) { -+ unsigned int ldw, bv, ldw2, addr = instruction_pointer(regs)-12; -+ -+ err = get_user(ldw, (unsigned int *)addr); -+ err |= get_user(bv, (unsigned int *)(addr+4)); -+ err |= get_user(ldw2, (unsigned int *)(addr+8)); -+ -+ if (err) -+ break; -+ -+ if (ldw == 0x0E801096U && -+ bv == 0xEAC0C000U && -+ ldw2 == 0x0E881095U) -+ { -+ unsigned int resolver, map; -+ -+ err = get_user(resolver, (unsigned int *)(instruction_pointer(regs)+8)); -+ err |= get_user(map, (unsigned int *)(instruction_pointer(regs)+12)); -+ if (err) -+ break; -+ -+ regs->gr[20] = instruction_pointer(regs)+8; -+ regs->gr[21] = map; -+ regs->gr[22] = resolver; -+ regs->iaoq[0] = resolver | 3UL; -+ regs->iaoq[1] = regs->iaoq[0] + 4; -+ return 3; -+ } -+ } -+ } while (0); -+#endif -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ -+#ifndef CONFIG_PAX_EMUSIGRT -+ if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP)) -+ return 1; -+#endif -+ -+ do { /* PaX: rt_sigreturn emulation */ -+ unsigned int ldi1, ldi2, bel, nop; -+ -+ err = get_user(ldi1, (unsigned int *)instruction_pointer(regs)); -+ err |= get_user(ldi2, (unsigned int *)(instruction_pointer(regs)+4)); -+ err |= get_user(bel, (unsigned int *)(instruction_pointer(regs)+8)); -+ err |= get_user(nop, (unsigned int *)(instruction_pointer(regs)+12)); -+ -+ if (err) -+ break; -+ -+ if ((ldi1 == 0x34190000U || ldi1 == 0x34190002U) && -+ ldi2 == 0x3414015AU && -+ bel == 0xE4008200U && -+ nop == 0x08000240U) -+ { -+ regs->gr[25] = (ldi1 & 2) >> 1; -+ regs->gr[20] = __NR_rt_sigreturn; -+ regs->gr[31] = regs->iaoq[1] + 16; -+ regs->sr[0] = regs->iasq[1]; -+ regs->iaoq[0] = 0x100UL; -+ regs->iaoq[1] = regs->iaoq[0] + 4; -+ regs->iasq[0] = regs->sr[2]; -+ regs->iasq[1] = regs->sr[2]; -+ return 2; -+ } -+ } while (0); -+#endif -+ -+ return 1; -+} -+ -+void pax_report_insns(void *pc, void *sp) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 5; i++) { -+ unsigned int c; -+ if (get_user(c, (unsigned int *)pc+i)) -+ printk(KERN_CONT "???????? 
"); -+ else -+ printk(KERN_CONT "%08x ", c); -+ } -+ printk("\n"); -+} -+#endif -+ - int fixup_exception(struct pt_regs *regs) - { - const struct exception_table_entry *fix; -@@ -192,8 +303,33 @@ good_area: - - acc_type = parisc_acctyp(code,regs->iir); - -- if ((vma->vm_flags & acc_type) != acc_type) -+ if ((vma->vm_flags & acc_type) != acc_type) { -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && (acc_type & VM_EXEC) && -+ (address & ~3UL) == instruction_pointer(regs)) -+ { -+ up_read(&mm->mmap_sem); -+ switch (pax_handle_fetch_fault(regs)) { -+ -+#ifdef CONFIG_PAX_EMUPLT -+ case 3: -+ return; -+#endif -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ case 2: -+ return; -+#endif -+ -+ } -+ pax_report_fault(regs, (void *)instruction_pointer(regs), (void *)regs->gr[30]); -+ do_group_exit(SIGKILL); -+ } -+#endif -+ - goto bad_area; -+ } - - /* - * If for any reason at all we couldn't handle the fault, make -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/atomic.h linux-2.6.31.1/arch/powerpc/include/asm/atomic.h ---- linux-2.6.31.1/arch/powerpc/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -24,11 +24,21 @@ static __inline__ int atomic_read(const - return t; - } - -+static __inline__ int atomic_read_unchecked(const atomic_unchecked_t *v) -+{ -+ return atomic_read((const atomic_t *)v); -+} -+ - static __inline__ void atomic_set(atomic_t *v, int i) - { - __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i)); - } - -+static __inline__ void atomic_set_unchecked(atomic_unchecked_t *v, int i) -+{ -+ atomic_set((atomic_t *)v, i); -+} -+ - static __inline__ void atomic_add(int a, atomic_t *v) - { - int t; -@@ -44,6 +54,11 @@ static __inline__ void atomic_add(int a, - : "cc"); - } - -+static __inline__ void atomic_add_unchecked(int a, atomic_unchecked_t *v) -+{ -+ atomic_add(a, (atomic_t *)v); -+} -+ - static __inline__ int atomic_add_return(int a, atomic_t *v) - { - int t; -@@ -80,6 +95,11 @@ static __inline__ void atomic_sub(int a, - : "cc"); - } - -+static __inline__ void atomic_sub_unchecked(int a, atomic_unchecked_t *v) -+{ -+ atomic_sub(a, (atomic_t *)v); -+} -+ - static __inline__ int atomic_sub_return(int a, atomic_t *v) - { - int t; -@@ -114,6 +134,11 @@ static __inline__ void atomic_inc(atomic - : "cc", "xer"); - } - -+static __inline__ void atomic_inc_unchecked(atomic_unchecked_t *v) -+{ -+ atomic_inc((atomic_t *)v); -+} -+ - static __inline__ int atomic_inc_return(atomic_t *v) - { - int t; -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/elf.h linux-2.6.31.1/arch/powerpc/include/asm/elf.h ---- linux-2.6.31.1/arch/powerpc/include/asm/elf.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -179,8 +179,19 @@ typedef elf_fpreg_t elf_vsrreghalf_t32[E - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - --extern unsigned long randomize_et_dyn(unsigned long base); --#define ELF_ET_DYN_BASE (randomize_et_dyn(0x20000000)) -+#define ELF_ET_DYN_BASE (0x20000000) -+ -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (0x10000000UL) -+ -+#ifdef __powerpc64__ -+#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT) ? 16 : 28) -+#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT) ? 
16 : 28) -+#else -+#define PAX_DELTA_MMAP_LEN 15 -+#define PAX_DELTA_STACK_LEN 15 -+#endif -+#endif - - /* - * Our registers are always unsigned longs, whether we're a 32 bit -@@ -279,9 +290,6 @@ extern int arch_setup_additional_pages(s - (0x7ff >> (PAGE_SHIFT - 12)) : \ - (0x3ffff >> (PAGE_SHIFT - 12))) - --extern unsigned long arch_randomize_brk(struct mm_struct *mm); --#define arch_randomize_brk arch_randomize_brk -- - #endif /* __KERNEL__ */ - - /* -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/kmap_types.h linux-2.6.31.1/arch/powerpc/include/asm/kmap_types.h ---- linux-2.6.31.1/arch/powerpc/include/asm/kmap_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/kmap_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -26,6 +26,7 @@ enum km_type { - KM_SOFTIRQ1, - KM_PPC_SYNC_PAGE, - KM_PPC_SYNC_ICACHE, -+ KM_CLEARPAGE, - KM_TYPE_NR - }; - -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/page_64.h linux-2.6.31.1/arch/powerpc/include/asm/page_64.h ---- linux-2.6.31.1/arch/powerpc/include/asm/page_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/page_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -170,15 +170,18 @@ do { \ - * stack by default, so in the absense of a PT_GNU_STACK program header - * we turn execute permission off. - */ --#define VM_STACK_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ -- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -+#define VM_STACK_DEFAULT_FLAGS32 \ -+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \ -+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - - #define VM_STACK_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -+#ifndef CONFIG_PAX_PAGEEXEC - #define VM_STACK_DEFAULT_FLAGS \ - (test_thread_flag(TIF_32BIT) ? \ - VM_STACK_DEFAULT_FLAGS32 : VM_STACK_DEFAULT_FLAGS64) -+#endif - - #include <asm-generic/getorder.h> - -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/page.h linux-2.6.31.1/arch/powerpc/include/asm/page.h ---- linux-2.6.31.1/arch/powerpc/include/asm/page.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/page.h 2009-10-01 20:12:42.000000000 -0400 -@@ -116,8 +116,9 @@ extern phys_addr_t kernstart_addr; - * and needs to be executable. This means the whole heap ends - * up being executable. - */ --#define VM_DATA_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ -- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -+#define VM_DATA_DEFAULT_FLAGS32 \ -+ (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) | \ -+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - - #define VM_DATA_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/pte-common.h linux-2.6.31.1/arch/powerpc/include/asm/pte-common.h ---- linux-2.6.31.1/arch/powerpc/include/asm/pte-common.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/pte-common.h 2009-10-01 20:12:42.000000000 -0400 -@@ -121,11 +121,11 @@ extern unsigned long bad_call_to_PMD_PAG - */ - #define PAGE_NONE __pgprot(_PAGE_BASE) - #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) --#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) -+#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC | _PAGE_HWEXEC) - #define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) --#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -+#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC) - #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) --#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -+#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC) - - #define __P000 PAGE_NONE - #define __P001 PAGE_READONLY -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/pte-hash32.h linux-2.6.31.1/arch/powerpc/include/asm/pte-hash32.h ---- linux-2.6.31.1/arch/powerpc/include/asm/pte-hash32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/pte-hash32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -21,6 +21,7 @@ - #define _PAGE_FILE 0x004 /* when !present: nonlinear file mapping */ - #define _PAGE_USER 0x004 /* usermode access allowed */ - #define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */ -+#define _PAGE_HWEXEC _PAGE_GUARDED - #define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */ - #define _PAGE_NO_CACHE 0x020 /* I: cache inhibit */ - #define _PAGE_WRITETHRU 0x040 /* W: cache write-through */ -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/reg.h linux-2.6.31.1/arch/powerpc/include/asm/reg.h ---- linux-2.6.31.1/arch/powerpc/include/asm/reg.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/reg.h 2009-10-01 20:12:42.000000000 -0400 -@@ -195,6 +195,7 @@ - #define SPRN_DBCR 0x136 /* e300 Data Breakpoint Control Reg */ - #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ - #define DSISR_NOHPTE 0x40000000 /* no translation found */ -+#define DSISR_GUARDED 0x10000000 /* fetch from guarded storage */ - #define DSISR_PROTFAULT 0x08000000 /* protection fault */ - #define DSISR_ISSTORE 0x02000000 /* access was a store */ - #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ -diff -urNp linux-2.6.31.1/arch/powerpc/include/asm/uaccess.h linux-2.6.31.1/arch/powerpc/include/asm/uaccess.h ---- linux-2.6.31.1/arch/powerpc/include/asm/uaccess.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/include/asm/uaccess.h 2009-10-01 20:12:42.000000000 -0400 -@@ -327,52 +327,6 @@ do { \ - extern unsigned long __copy_tofrom_user(void __user *to, - const void __user *from, unsigned long size); - --#ifndef __powerpc64__ -- --static inline unsigned long copy_from_user(void *to, -- const void __user *from, unsigned long n) --{ -- unsigned long over; -- -- if (access_ok(VERIFY_READ, from, n)) -- return __copy_tofrom_user((__force void __user *)to, from, n); -- if ((unsigned long)from < 
TASK_SIZE) { -- over = (unsigned long)from + n - TASK_SIZE; -- return __copy_tofrom_user((__force void __user *)to, from, -- n - over) + over; -- } -- return n; --} -- --static inline unsigned long copy_to_user(void __user *to, -- const void *from, unsigned long n) --{ -- unsigned long over; -- -- if (access_ok(VERIFY_WRITE, to, n)) -- return __copy_tofrom_user(to, (__force void __user *)from, n); -- if ((unsigned long)to < TASK_SIZE) { -- over = (unsigned long)to + n - TASK_SIZE; -- return __copy_tofrom_user(to, (__force void __user *)from, -- n - over) + over; -- } -- return n; --} -- --#else /* __powerpc64__ */ -- --#define __copy_in_user(to, from, size) \ -- __copy_tofrom_user((to), (from), (size)) -- --extern unsigned long copy_from_user(void *to, const void __user *from, -- unsigned long n); --extern unsigned long copy_to_user(void __user *to, const void *from, -- unsigned long n); --extern unsigned long copy_in_user(void __user *to, const void __user *from, -- unsigned long n); -- --#endif /* __powerpc64__ */ -- - static inline unsigned long __copy_from_user_inatomic(void *to, - const void __user *from, unsigned long n) - { -@@ -396,6 +350,9 @@ static inline unsigned long __copy_from_ - if (ret == 0) - return 0; - } -+ if (!__builtin_constant_p(n)) -+ check_object_size(to, n, false); -+ - return __copy_tofrom_user((__force void __user *)to, from, n); - } - -@@ -422,6 +379,9 @@ static inline unsigned long __copy_to_us - if (ret == 0) - return 0; - } -+ if (!__builtin_constant_p(n)) -+ check_object_size(from, n, true); -+ - return __copy_tofrom_user(to, (__force const void __user *)from, n); - } - -@@ -439,6 +399,97 @@ static inline unsigned long __copy_to_us - return __copy_to_user_inatomic(to, from, size); - } - -+#ifndef __powerpc64__ -+ -+static inline unsigned long __must_check copy_from_user(void *to, -+ const void __user *from, unsigned long n) -+{ -+ unsigned long over; -+ -+ if (((long)n < 0) || (n > INT_MAX)) -+ return n; -+ -+ if (access_ok(VERIFY_READ, from, n)) { -+ if (!__builtin_constant_p(n)) -+ check_object_size(to, n, false); -+ -+ return __copy_tofrom_user((__force void __user *)to, from, n); -+ } -+ if ((unsigned long)from < TASK_SIZE) { -+ over = (unsigned long)from + n - TASK_SIZE; -+ if (!__builtin_constant_p(n - over)) -+ check_object_size(to, n - over, false); -+ return __copy_tofrom_user((__force void __user *)to, from, -+ n - over) + over; -+ } -+ return n; -+} -+ -+static inline unsigned long __must_check copy_to_user(void __user *to, -+ const void *from, unsigned long n) -+{ -+ unsigned long over; -+ -+ if (((long)n < 0) || (n > INT_MAX)) -+ return n; -+ -+ if (access_ok(VERIFY_WRITE, to, n)) { -+ if (!__builtin_constant_p(n)) -+ check_object_size(from, n, true); -+ return __copy_tofrom_user(to, (__force void __user *)from, n); -+ } -+ if ((unsigned long)to < TASK_SIZE) { -+ over = (unsigned long)to + n - TASK_SIZE; -+ if (!__builtin_constant_p(n)) -+ check_object_size(from, n - over, true); -+ return __copy_tofrom_user(to, (__force void __user *)from, -+ n - over) + over; -+ } -+ return n; -+} -+ -+#else /* __powerpc64__ */ -+ -+#define __copy_in_user(to, from, size) \ -+ __copy_tofrom_user((to), (from), (size)) -+ -+static inline unsigned long __must_check copy_from_user(void *to, -+ const void __user *from, unsigned long n) -+{ -+ if (unlikely(((long)n < 0) || (n > INT_MAX))) -+ return n; -+ -+ if (!__builtin_constant_p(n)) -+ check_object_size(to, n, false); -+ -+ if (likely(access_ok(VERIFY_READ, from, n))) -+ n = __copy_from_user(to, from, n); -+ 
else -+ memset(to, 0, n); -+ -+ return n; -+} -+ -+static inline unsigned long __must_check copy_to_user(void __user *to, -+ const void *from, unsigned long n) -+{ -+ if (unlikely(((long)n < 0) || (n > INT_MAX))) -+ return n; -+ -+ if (likely(access_ok(VERIFY_WRITE, to, n))) { -+ if (!__builtin_constant_p(n)) -+ check_object_size(from, n, true); -+ n = __copy_to_user(to, from, n); -+ } -+ -+ return n; -+} -+ -+extern unsigned long copy_in_user(void __user *to, const void __user *from, -+ unsigned long n); -+ -+#endif /* __powerpc64__ */ -+ - extern unsigned long __clear_user(void __user *addr, unsigned long size); - - static inline unsigned long clear_user(void __user *addr, unsigned long size) -diff -urNp linux-2.6.31.1/arch/powerpc/kernel/module_32.c linux-2.6.31.1/arch/powerpc/kernel/module_32.c ---- linux-2.6.31.1/arch/powerpc/kernel/module_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kernel/module_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -162,7 +162,7 @@ int module_frob_arch_sections(Elf32_Ehdr - me->arch.core_plt_section = i; - } - if (!me->arch.core_plt_section || !me->arch.init_plt_section) { -- printk("Module doesn't contain .plt or .init.plt sections.\n"); -+ printk("Module %s doesn't contain .plt or .init.plt sections.\n", me->name); - return -ENOEXEC; - } - -@@ -203,11 +203,16 @@ static uint32_t do_plt_call(void *locati - - DEBUGP("Doing plt for call to 0x%x at 0x%x\n", val, (unsigned int)location); - /* Init, or core PLT? */ -- if (location >= mod->module_core -- && location < mod->module_core + mod->core_size) -+ if ((location >= mod->module_core_rx && location < mod->module_core_rx + mod->core_size_rx) || -+ (location >= mod->module_core_rw && location < mod->module_core_rw + mod->core_size_rw)) - entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr; -- else -+ else if ((location >= mod->module_init_rx && location < mod->module_init_rx + mod->init_size_rx) || -+ (location >= mod->module_init_rw && location < mod->module_init_rw + mod->init_size_rw)) - entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr; -+ else { -+ printk(KERN_ERR "%s: invalid R_PPC_REL24 entry found\n", mod->name); -+ return ~0UL; -+ } - - /* Find this entry, or if that fails, the next avail. 
entry */ - while (entry->jump[0]) { -diff -urNp linux-2.6.31.1/arch/powerpc/kernel/process.c linux-2.6.31.1/arch/powerpc/kernel/process.c ---- linux-2.6.31.1/arch/powerpc/kernel/process.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kernel/process.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1147,36 +1147,3 @@ unsigned long arch_align_stack(unsigned - sp -= get_random_int() & ~PAGE_MASK; - return sp & ~0xf; - } -- --static inline unsigned long brk_rnd(void) --{ -- unsigned long rnd = 0; -- -- /* 8MB for 32bit, 1GB for 64bit */ -- if (is_32bit_task()) -- rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); -- else -- rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); -- -- return rnd << PAGE_SHIFT; --} -- --unsigned long arch_randomize_brk(struct mm_struct *mm) --{ -- unsigned long ret = PAGE_ALIGN(mm->brk + brk_rnd()); -- -- if (ret < mm->brk) -- return mm->brk; -- -- return ret; --} -- --unsigned long randomize_et_dyn(unsigned long base) --{ -- unsigned long ret = PAGE_ALIGN(base + brk_rnd()); -- -- if (ret < base) -- return base; -- -- return ret; --} -diff -urNp linux-2.6.31.1/arch/powerpc/kernel/setup-common.c linux-2.6.31.1/arch/powerpc/kernel/setup-common.c ---- linux-2.6.31.1/arch/powerpc/kernel/setup-common.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kernel/setup-common.c 2009-10-01 20:12:42.000000000 -0400 -@@ -328,7 +328,7 @@ static void c_stop(struct seq_file *m, v - { - } - --struct seq_operations cpuinfo_op = { -+const struct seq_operations cpuinfo_op = { - .start =c_start, - .next = c_next, - .stop = c_stop, -diff -urNp linux-2.6.31.1/arch/powerpc/kernel/signal_32.c linux-2.6.31.1/arch/powerpc/kernel/signal_32.c ---- linux-2.6.31.1/arch/powerpc/kernel/signal_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kernel/signal_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -857,7 +857,7 @@ int handle_rt_signal32(unsigned long sig - /* Save user registers on the stack */ - frame = &rt_sf->uc.uc_mcontext; - addr = frame; -- if (vdso32_rt_sigtramp && current->mm->context.vdso_base) { -+ if (vdso32_rt_sigtramp && current->mm->context.vdso_base != ~0UL) { - if (save_user_regs(regs, frame, 0, 1)) - goto badframe; - regs->link = current->mm->context.vdso_base + vdso32_rt_sigtramp; -diff -urNp linux-2.6.31.1/arch/powerpc/kernel/signal_64.c linux-2.6.31.1/arch/powerpc/kernel/signal_64.c ---- linux-2.6.31.1/arch/powerpc/kernel/signal_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kernel/signal_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -429,7 +429,7 @@ int handle_rt_signal64(int signr, struct - current->thread.fpscr.val = 0; - - /* Set up to return from userspace. 
*/ -- if (vdso64_rt_sigtramp && current->mm->context.vdso_base) { -+ if (vdso64_rt_sigtramp && current->mm->context.vdso_base != ~0UL) { - regs->link = current->mm->context.vdso_base + vdso64_rt_sigtramp; - } else { - err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]); -diff -urNp linux-2.6.31.1/arch/powerpc/kernel/sys_ppc32.c linux-2.6.31.1/arch/powerpc/kernel/sys_ppc32.c ---- linux-2.6.31.1/arch/powerpc/kernel/sys_ppc32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kernel/sys_ppc32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -552,10 +552,10 @@ asmlinkage long compat_sys_sysctl(struct - if (oldlenp) { - if (!error) { - if (get_user(oldlen, oldlenp) || -- put_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp))) -+ put_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp)) || -+ copy_to_user(args->__unused, tmp.__unused, sizeof(tmp.__unused))) - error = -EFAULT; - } -- copy_to_user(args->__unused, tmp.__unused, sizeof(tmp.__unused)); - } - return error; - } -diff -urNp linux-2.6.31.1/arch/powerpc/kernel/vdso.c linux-2.6.31.1/arch/powerpc/kernel/vdso.c ---- linux-2.6.31.1/arch/powerpc/kernel/vdso.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kernel/vdso.c 2009-10-01 20:12:42.000000000 -0400 -@@ -35,6 +35,7 @@ - #include <asm/firmware.h> - #include <asm/vdso.h> - #include <asm/vdso_datapage.h> -+#include <asm/mman.h> - - #include "setup.h" - -@@ -211,7 +212,7 @@ int arch_setup_additional_pages(struct l - vdso_base = VDSO32_MBASE; - #endif - -- current->mm->context.vdso_base = 0; -+ current->mm->context.vdso_base = ~0UL; - - /* vDSO has a problem and was disabled, just don't "enable" it for the - * process -@@ -228,7 +229,7 @@ int arch_setup_additional_pages(struct l - */ - down_write(&mm->mmap_sem); - vdso_base = get_unmapped_area(NULL, vdso_base, -- vdso_pages << PAGE_SHIFT, 0, 0); -+ vdso_pages << PAGE_SHIFT, 0, MAP_PRIVATE | MAP_EXECUTABLE); - if (IS_ERR_VALUE(vdso_base)) { - rc = vdso_base; - goto fail_mmapsem; -diff -urNp linux-2.6.31.1/arch/powerpc/kvm/timing.c linux-2.6.31.1/arch/powerpc/kvm/timing.c ---- linux-2.6.31.1/arch/powerpc/kvm/timing.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/kvm/timing.c 2009-10-01 20:12:42.000000000 -0400 -@@ -201,7 +201,7 @@ static int kvmppc_exit_timing_open(struc - return single_open(file, kvmppc_exit_timing_show, inode->i_private); - } - --static struct file_operations kvmppc_exit_timing_fops = { -+static const struct file_operations kvmppc_exit_timing_fops = { - .owner = THIS_MODULE, - .open = kvmppc_exit_timing_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/arch/powerpc/lib/usercopy_64.c linux-2.6.31.1/arch/powerpc/lib/usercopy_64.c ---- linux-2.6.31.1/arch/powerpc/lib/usercopy_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/lib/usercopy_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -9,22 +9,6 @@ - #include <linux/module.h> - #include <asm/uaccess.h> - --unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) --{ -- if (likely(access_ok(VERIFY_READ, from, n))) -- n = __copy_from_user(to, from, n); -- else -- memset(to, 0, n); -- return n; --} -- --unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) --{ -- if (likely(access_ok(VERIFY_WRITE, to, n))) -- n = __copy_to_user(to, from, n); -- return n; --} -- - unsigned long copy_in_user(void __user *to, const void __user *from, - unsigned long n) - { -@@ -35,7 +19,5 @@ unsigned long copy_in_user(void __user * - return n; - 
} - --EXPORT_SYMBOL(copy_from_user); --EXPORT_SYMBOL(copy_to_user); - EXPORT_SYMBOL(copy_in_user); - -diff -urNp linux-2.6.31.1/arch/powerpc/mm/fault.c linux-2.6.31.1/arch/powerpc/mm/fault.c ---- linux-2.6.31.1/arch/powerpc/mm/fault.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/mm/fault.c 2009-10-01 20:12:42.000000000 -0400 -@@ -30,6 +30,10 @@ - #include <linux/kprobes.h> - #include <linux/kdebug.h> - #include <linux/perf_counter.h> -+#include <linux/slab.h> -+#include <linux/pagemap.h> -+#include <linux/compiler.h> -+#include <linux/unistd.h> - - #include <asm/firmware.h> - #include <asm/page.h> -@@ -40,6 +44,7 @@ - #include <asm/uaccess.h> - #include <asm/tlbflush.h> - #include <asm/siginfo.h> -+#include <asm/ptrace.h> - - - #ifdef CONFIG_KPROBES -@@ -64,6 +69,33 @@ static inline int notify_page_fault(stru - } - #endif - -+#ifdef CONFIG_PAX_PAGEEXEC -+/* -+ * PaX: decide what to do with offenders (regs->nip = fault address) -+ * -+ * returns 1 when task should be killed -+ */ -+static int pax_handle_fetch_fault(struct pt_regs *regs) -+{ -+ return 1; -+} -+ -+void pax_report_insns(void *pc, void *sp) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 5; i++) { -+ unsigned int c; -+ if (get_user(c, (unsigned int *)pc+i)) -+ printk(KERN_CONT "???????? "); -+ else -+ printk(KERN_CONT "%08x ", c); -+ } -+ printk("\n"); -+} -+#endif -+ - /* - * Check whether the instruction at regs->nip is a store using - * an update addressing form which will update r1. -@@ -134,7 +166,7 @@ int __kprobes do_page_fault(struct pt_re - * indicate errors in DSISR but can validly be set in SRR1. - */ - if (trap == 0x400) -- error_code &= 0x48200000; -+ error_code &= 0x58200000; - else - is_write = error_code & DSISR_ISSTORE; - #else -@@ -250,7 +282,7 @@ good_area: - * "undefined". Of those that can be set, this is the only - * one which seems bad. - */ -- if (error_code & 0x10000000) -+ if (error_code & DSISR_GUARDED) - /* Guarded storage error. */ - goto bad_area; - #endif /* CONFIG_8xx */ -@@ -265,7 +297,7 @@ good_area: - * processors use the same I/D cache coherency mechanism - * as embedded. 
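On hash-MMU powerpc, PaX derives PAGEEXEC from the "guarded" page attribute: an instruction fetch from a guarded page sets DSISR bit 0x10000000, which the widened 0x58200000 SRR1 mask in this fault.c hunk now preserves, and which the handler then treats like a protection fault so the PAGEEXEC logic can decide the task's fate. A small decoder over the DSISR constants as the patch's reg.h hunk defines them:

    #include <stdio.h>
    #include <stdint.h>

    #define DSISR_NOHPTE     0x40000000u  /* no translation found */
    #define DSISR_GUARDED    0x10000000u  /* fetch from guarded storage */
    #define DSISR_PROTFAULT  0x08000000u  /* protection fault */
    #define DSISR_ISSTORE    0x02000000u  /* access was a store */

    static const char *classify_dsisr(uint32_t dsisr)
    {
            if (dsisr & DSISR_GUARDED)
                    return "instruction fetch from guarded (no-exec) page";
            if (dsisr & DSISR_PROTFAULT)
                    return "protection fault";
            if (dsisr & DSISR_NOHPTE)
                    return "no hash table entry";
            return (dsisr & DSISR_ISSTORE) ? "store fault" : "other";
    }

    int main(void)
    {
            printf("%s\n", classify_dsisr(DSISR_GUARDED));
            printf("%s\n", classify_dsisr(DSISR_PROTFAULT | DSISR_ISSTORE));
            return 0;
    }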
- */ -- if (error_code & DSISR_PROTFAULT) -+ if (error_code & (DSISR_PROTFAULT | DSISR_GUARDED)) - goto bad_area; - #endif /* CONFIG_PPC_STD_MMU */ - -@@ -335,6 +367,23 @@ bad_area: - bad_area_nosemaphore: - /* User mode accesses cause a SIGSEGV */ - if (user_mode(regs)) { -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (mm->pax_flags & MF_PAX_PAGEEXEC) { -+#ifdef CONFIG_PPC_STD_MMU -+ if (is_exec && (error_code & (DSISR_PROTFAULT | DSISR_GUARDED))) { -+#else -+ if (is_exec && regs->nip == address) { -+#endif -+ switch (pax_handle_fetch_fault(regs)) { -+ } -+ -+ pax_report_fault(regs, (void *)regs->nip, (void *)regs->gpr[PT_R1]); -+ do_group_exit(SIGKILL); -+ } -+ } -+#endif -+ - _exception(SIGSEGV, regs, code, address); - return 0; - } -diff -urNp linux-2.6.31.1/arch/powerpc/mm/mmap_64.c linux-2.6.31.1/arch/powerpc/mm/mmap_64.c ---- linux-2.6.31.1/arch/powerpc/mm/mmap_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/mm/mmap_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -99,10 +99,22 @@ void arch_pick_mmap_layout(struct mm_str - */ - if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } else { - mm->mmap_base = mmap_base(); -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base -= mm->delta_mmap + mm->delta_stack; -+#endif -+ - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; - } -diff -urNp linux-2.6.31.1/arch/powerpc/mm/slice.c linux-2.6.31.1/arch/powerpc/mm/slice.c ---- linux-2.6.31.1/arch/powerpc/mm/slice.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/mm/slice.c 2009-10-01 20:12:42.000000000 -0400 -@@ -426,6 +426,11 @@ unsigned long slice_get_unmapped_area(un - if (fixed && addr > (mm->task_size - len)) - return -EINVAL; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!fixed && (mm->pax_flags & MF_PAX_RANDMMAP)) -+ addr = 0; -+#endif -+ - /* If hint, make sure it matches our alignment restrictions */ - if (!fixed && addr) { - addr = _ALIGN_UP(addr, 1ul << pshift); -diff -urNp linux-2.6.31.1/arch/powerpc/platforms/cell/spufs/file.c linux-2.6.31.1/arch/powerpc/platforms/cell/spufs/file.c ---- linux-2.6.31.1/arch/powerpc/platforms/cell/spufs/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/platforms/cell/spufs/file.c 2009-10-01 20:12:42.000000000 -0400 -@@ -147,7 +147,7 @@ static int __fops ## _open(struct inode - __simple_attr_check_format(__fmt, 0ull); \ - return spufs_attr_open(inode, file, __get, __set, __fmt); \ - } \ --static struct file_operations __fops = { \ -+static const struct file_operations __fops = { \ - .owner = THIS_MODULE, \ - .open = __fops ## _open, \ - .release = spufs_attr_release, \ -@@ -309,7 +309,7 @@ static int spufs_mem_mmap_access(struct - return len; - } - --static struct vm_operations_struct spufs_mem_mmap_vmops = { -+static const struct vm_operations_struct spufs_mem_mmap_vmops = { - .fault = spufs_mem_mmap_fault, - .access = spufs_mem_mmap_access, - }; -@@ -436,7 +436,7 @@ static int spufs_cntl_mmap_fault(struct - return spufs_ps_fault(vma, vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); - } - --static struct vm_operations_struct spufs_cntl_mmap_vmops = { -+static const struct vm_operations_struct spufs_cntl_mmap_vmops = { - .fault = spufs_cntl_mmap_fault, - }; - -@@ -1143,7 +1143,7 @@ spufs_signal1_mmap_fault(struct 
vm_area_ - #endif - } - --static struct vm_operations_struct spufs_signal1_mmap_vmops = { -+static const struct vm_operations_struct spufs_signal1_mmap_vmops = { - .fault = spufs_signal1_mmap_fault, - }; - -@@ -1279,7 +1279,7 @@ spufs_signal2_mmap_fault(struct vm_area_ - #endif - } - --static struct vm_operations_struct spufs_signal2_mmap_vmops = { -+static const struct vm_operations_struct spufs_signal2_mmap_vmops = { - .fault = spufs_signal2_mmap_fault, - }; - -@@ -1397,7 +1397,7 @@ spufs_mss_mmap_fault(struct vm_area_stru - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_MSS_MAP_SIZE); - } - --static struct vm_operations_struct spufs_mss_mmap_vmops = { -+static const struct vm_operations_struct spufs_mss_mmap_vmops = { - .fault = spufs_mss_mmap_fault, - }; - -@@ -1458,7 +1458,7 @@ spufs_psmap_mmap_fault(struct vm_area_st - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_PS_MAP_SIZE); - } - --static struct vm_operations_struct spufs_psmap_mmap_vmops = { -+static const struct vm_operations_struct spufs_psmap_mmap_vmops = { - .fault = spufs_psmap_mmap_fault, - }; - -@@ -1517,7 +1517,7 @@ spufs_mfc_mmap_fault(struct vm_area_stru - return spufs_ps_fault(vma, vmf, 0x3000, SPUFS_MFC_MAP_SIZE); - } - --static struct vm_operations_struct spufs_mfc_mmap_vmops = { -+static const struct vm_operations_struct spufs_mfc_mmap_vmops = { - .fault = spufs_mfc_mmap_fault, - }; - -diff -urNp linux-2.6.31.1/arch/powerpc/platforms/pseries/dtl.c linux-2.6.31.1/arch/powerpc/platforms/pseries/dtl.c ---- linux-2.6.31.1/arch/powerpc/platforms/pseries/dtl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/platforms/pseries/dtl.c 2009-10-01 20:12:42.000000000 -0400 -@@ -209,7 +209,7 @@ static ssize_t dtl_file_read(struct file - return n_read * sizeof(struct dtl_entry); - } - --static struct file_operations dtl_fops = { -+static const struct file_operations dtl_fops = { - .open = dtl_file_open, - .release = dtl_file_release, - .read = dtl_file_read, -diff -urNp linux-2.6.31.1/arch/powerpc/platforms/pseries/hvCall_inst.c linux-2.6.31.1/arch/powerpc/platforms/pseries/hvCall_inst.c ---- linux-2.6.31.1/arch/powerpc/platforms/pseries/hvCall_inst.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/powerpc/platforms/pseries/hvCall_inst.c 2009-10-01 20:12:42.000000000 -0400 -@@ -71,7 +71,7 @@ static int hc_show(struct seq_file *m, v - return 0; - } - --static struct seq_operations hcall_inst_seq_ops = { -+static const struct seq_operations hcall_inst_seq_ops = { - .start = hc_start, - .next = hc_next, - .stop = hc_stop, -diff -urNp linux-2.6.31.1/arch/s390/hypfs/inode.c linux-2.6.31.1/arch/s390/hypfs/inode.c ---- linux-2.6.31.1/arch/s390/hypfs/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/s390/hypfs/inode.c 2009-10-01 20:12:42.000000000 -0400 -@@ -41,7 +41,7 @@ struct hypfs_sb_info { - - static const struct file_operations hypfs_file_ops; - static struct file_system_type hypfs_type; --static struct super_operations hypfs_s_ops; -+static const struct super_operations hypfs_s_ops; - - /* start of list of all dentries, which have to be deleted on update */ - static struct dentry *hypfs_last_dentry; -@@ -476,7 +476,7 @@ static struct file_system_type hypfs_typ - .kill_sb = hypfs_kill_super - }; - --static struct super_operations hypfs_s_ops = { -+static const struct super_operations hypfs_s_ops = { - .statfs = simple_statfs, - .drop_inode = hypfs_drop_inode, - .show_options = hypfs_show_options, -diff -urNp linux-2.6.31.1/arch/s390/include/asm/atomic.h 
linux-2.6.31.1/arch/s390/include/asm/atomic.h ---- linux-2.6.31.1/arch/s390/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/s390/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -71,19 +71,31 @@ static inline int atomic_read(const atom - return v->counter; - } - -+static inline int atomic_read_unchecked(const atomic_unchecked_t *v) -+{ -+ return atomic_read((const atomic_t *)v); -+} -+ - static inline void atomic_set(atomic_t *v, int i) - { - v->counter = i; - barrier(); - } - -+static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i) -+{ -+ atomic_set((atomic_t *)v, i); -+} -+ - static __inline__ int atomic_add_return(int i, atomic_t * v) - { - return __CS_LOOP(v, i, "ar"); - } - #define atomic_add(_i, _v) atomic_add_return(_i, _v) -+#define atomic_add_unchecked(_i, _v) atomic_add((_i), (atomic_t *)(_v)) - #define atomic_add_negative(_i, _v) (atomic_add_return(_i, _v) < 0) - #define atomic_inc(_v) atomic_add_return(1, _v) -+#define atomic_inc_unchecked(_v) atomic_inc((atomic_t *)(_v)) - #define atomic_inc_return(_v) atomic_add_return(1, _v) - #define atomic_inc_and_test(_v) (atomic_add_return(1, _v) == 0) - -@@ -92,6 +104,7 @@ static __inline__ int atomic_sub_return( - return __CS_LOOP(v, i, "sr"); - } - #define atomic_sub(_i, _v) atomic_sub_return(_i, _v) -+#define atomic_sub_unchecked(_i, _v) atomic_sub((_i), (atomic_t *)(_v)) - #define atomic_sub_and_test(_i, _v) (atomic_sub_return(_i, _v) == 0) - #define atomic_dec(_v) atomic_sub_return(1, _v) - #define atomic_dec_return(_v) atomic_sub_return(1, _v) -diff -urNp linux-2.6.31.1/arch/s390/include/asm/uaccess.h linux-2.6.31.1/arch/s390/include/asm/uaccess.h ---- linux-2.6.31.1/arch/s390/include/asm/uaccess.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/s390/include/asm/uaccess.h 2009-10-01 20:12:42.000000000 -0400 -@@ -232,6 +232,10 @@ static inline unsigned long __must_check - copy_to_user(void __user *to, const void *from, unsigned long n) - { - might_fault(); -+ -+ if ((long)n < 0) -+ return n; -+ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -@@ -257,6 +261,9 @@ copy_to_user(void __user *to, const void - static inline unsigned long __must_check - __copy_from_user(void *to, const void __user *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ - if (__builtin_constant_p(n) && (n <= 256)) - return uaccess.copy_from_user_small(n, from, to); - else -@@ -283,6 +290,10 @@ static inline unsigned long __must_check - copy_from_user(void *to, const void __user *from, unsigned long n) - { - might_fault(); -+ -+ if ((long)n < 0) -+ return n; -+ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else -diff -urNp linux-2.6.31.1/arch/s390/kernel/module.c linux-2.6.31.1/arch/s390/kernel/module.c ---- linux-2.6.31.1/arch/s390/kernel/module.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/s390/kernel/module.c 2009-10-01 20:12:42.000000000 -0400 -@@ -164,11 +164,11 @@ module_frob_arch_sections(Elf_Ehdr *hdr, - - /* Increase core size by size of got & plt and set start - offsets for got and plt. 
*/ -- me->core_size = ALIGN(me->core_size, 4); -- me->arch.got_offset = me->core_size; -- me->core_size += me->arch.got_size; -- me->arch.plt_offset = me->core_size; -- me->core_size += me->arch.plt_size; -+ me->core_size_rw = ALIGN(me->core_size_rw, 4); -+ me->arch.got_offset = me->core_size_rw; -+ me->core_size_rw += me->arch.got_size; -+ me->arch.plt_offset = me->core_size_rx; -+ me->core_size_rx += me->arch.plt_size; - return 0; - } - -@@ -254,7 +254,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base - if (info->got_initialized == 0) { - Elf_Addr *gotent; - -- gotent = me->module_core + me->arch.got_offset + -+ gotent = me->module_core_rw + me->arch.got_offset + - info->got_offset; - *gotent = val; - info->got_initialized = 1; -@@ -278,7 +278,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base - else if (r_type == R_390_GOTENT || - r_type == R_390_GOTPLTENT) - *(unsigned int *) loc = -- (val + (Elf_Addr) me->module_core - loc) >> 1; -+ (val + (Elf_Addr) me->module_core_rw - loc) >> 1; - else if (r_type == R_390_GOT64 || - r_type == R_390_GOTPLT64) - *(unsigned long *) loc = val; -@@ -292,7 +292,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base - case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ - if (info->plt_initialized == 0) { - unsigned int *ip; -- ip = me->module_core + me->arch.plt_offset + -+ ip = me->module_core_rx + me->arch.plt_offset + - info->plt_offset; - #ifndef CONFIG_64BIT - ip[0] = 0x0d105810; /* basr 1,0; l 1,6(1); br 1 */ -@@ -317,7 +317,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base - val - loc + 0xffffUL < 0x1ffffeUL) || - (r_type == R_390_PLT32DBL && - val - loc + 0xffffffffULL < 0x1fffffffeULL))) -- val = (Elf_Addr) me->module_core + -+ val = (Elf_Addr) me->module_core_rx + - me->arch.plt_offset + - info->plt_offset; - val += rela->r_addend - loc; -@@ -339,7 +339,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base - case R_390_GOTOFF32: /* 32 bit offset to GOT. */ - case R_390_GOTOFF64: /* 64 bit offset to GOT. */ - val = val + rela->r_addend - -- ((Elf_Addr) me->module_core + me->arch.got_offset); -+ ((Elf_Addr) me->module_core_rw + me->arch.got_offset); - if (r_type == R_390_GOTOFF16) - *(unsigned short *) loc = val; - else if (r_type == R_390_GOTOFF32) -@@ -349,7 +349,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base - break; - case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ - case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. 
*/ -- val = (Elf_Addr) me->module_core + me->arch.got_offset + -+ val = (Elf_Addr) me->module_core_rw + me->arch.got_offset + - rela->r_addend - loc; - if (r_type == R_390_GOTPC) - *(unsigned int *) loc = val; -diff -urNp linux-2.6.31.1/arch/sh/include/asm/atomic.h linux-2.6.31.1/arch/sh/include/asm/atomic.h ---- linux-2.6.31.1/arch/sh/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sh/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -14,7 +14,9 @@ - #define ATOMIC_INIT(i) ( (atomic_t) { (i) } ) - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic_set(v,i) ((v)->counter = (i)) -+#define atomic_set_unchecked(v,i) ((v)->counter = (i)) - - #if defined(CONFIG_GUSA_RB) - #include <asm/atomic-grb.h> -@@ -43,6 +45,9 @@ - #define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0) - - #define atomic_inc(v) atomic_add(1,(v)) -+#define atomic_inc_unchecked(v) atomic_inc((atomic_t *)(v)) -+#define atomic_add_unchecked(i,v) atomic_add((i),(atomic_t *)(v)) -+#define atomic_sub_unchecked(i,v) atomic_sub((i),(atomic_t *)(v)) - #define atomic_dec(v) atomic_sub(1,(v)) - - #if !defined(CONFIG_GUSA_RB) && !defined(CONFIG_CPU_SH4A) -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/atomic_32.h linux-2.6.31.1/arch/sparc/include/asm/atomic_32.h ---- linux-2.6.31.1/arch/sparc/include/asm/atomic_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/atomic_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -24,12 +24,17 @@ extern int atomic_cmpxchg(atomic_t *, in - #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) - extern int atomic_add_unless(atomic_t *, int, int); - extern void atomic_set(atomic_t *, int); -+extern void atomic_set_unchecked(atomic_unchecked_t *, int); - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - - #define atomic_add(i, v) ((void)__atomic_add_return( (int)(i), (v))) -+#define atomic_add_unchecked(i, v) atomic_add((i), (atomic_t *)(v)) - #define atomic_sub(i, v) ((void)__atomic_add_return(-(int)(i), (v))) -+#define atomic_sub_unchecked(i, v) atomic_sub((i), (atomic_t *)(v)) - #define atomic_inc(v) ((void)__atomic_add_return( 1, (v))) -+#define atomic_inc_unchecked(v) atomic_inc((atomic_t *)(v)) - #define atomic_dec(v) ((void)__atomic_add_return( -1, (v))) - - #define atomic_add_return(i, v) (__atomic_add_return( (int)(i), (v))) -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/atomic_64.h linux-2.6.31.1/arch/sparc/include/asm/atomic_64.h ---- linux-2.6.31.1/arch/sparc/include/asm/atomic_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/atomic_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -14,14 +14,18 @@ - #define ATOMIC64_INIT(i) { (i) } - - #define atomic_read(v) ((v)->counter) -+#define atomic_read_unchecked(v) ((v)->counter) - #define atomic64_read(v) ((v)->counter) - - #define atomic_set(v, i) (((v)->counter) = i) -+#define atomic_set_unchecked(v, i) (((v)->counter) = i) - #define atomic64_set(v, i) (((v)->counter) = i) - - extern void atomic_add(int, atomic_t *); -+extern void atomic_add_unchecked(int, atomic_unchecked_t *); - extern void atomic64_add(int, atomic64_t *); - extern void atomic_sub(int, atomic_t *); -+extern void atomic_sub_unchecked(int, atomic_unchecked_t *); - extern void atomic64_sub(int, atomic64_t *); - - extern int atomic_add_ret(int, atomic_t *); -@@ -59,6 +63,7 @@ extern int atomic64_sub_ret(int, atomic6 - #define atomic64_dec_and_test(v) 
(atomic64_sub_ret(1, v) == 0) - - #define atomic_inc(v) atomic_add(1, v) -+#define atomic_inc_unchecked(v) atomic_add_unchecked(1, v) - #define atomic64_inc(v) atomic64_add(1, v) - - #define atomic_dec(v) atomic_sub(1, v) -@@ -72,17 +77,28 @@ extern int atomic64_sub_ret(int, atomic6 - - static inline int atomic_add_unless(atomic_t *v, int a, int u) - { -- int c, old; -+ int c, old, new; - c = atomic_read(v); - for (;;) { -- if (unlikely(c == (u))) -+ if (unlikely(c == u)) - break; -- old = atomic_cmpxchg((v), c, c + (a)); -+ -+ asm volatile("addcc %2, %0, %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "tvs %%icc, 6\n" -+#endif -+ -+ : "=r" (new) -+ : "0" (c), "ir" (a) -+ : "cc"); -+ -+ old = atomic_cmpxchg(v, c, new); - if (likely(old == c)) - break; - c = old; - } -- return c != (u); -+ return c != u; - } - - #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) -@@ -93,17 +109,28 @@ static inline int atomic_add_unless(atom - - static inline int atomic64_add_unless(atomic64_t *v, long a, long u) - { -- long c, old; -+ long c, old, new; - c = atomic64_read(v); - for (;;) { -- if (unlikely(c == (u))) -+ if (unlikely(c == u)) - break; -- old = atomic64_cmpxchg((v), c, c + (a)); -+ -+ asm volatile("addcc %2, %0, %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "tvs %%xcc, 6\n" -+#endif -+ -+ : "=r" (new) -+ : "0" (c), "ir" (a) -+ : "cc"); -+ -+ old = atomic64_cmpxchg(v, c, new); - if (likely(old == c)) - break; - c = old; - } -- return c != (u); -+ return c != u; - } - - #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/elf_32.h linux-2.6.31.1/arch/sparc/include/asm/elf_32.h ---- linux-2.6.31.1/arch/sparc/include/asm/elf_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/elf_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -116,6 +116,13 @@ typedef struct { - - #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE) - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE 0x10000UL -+ -+#define PAX_DELTA_MMAP_LEN 16 -+#define PAX_DELTA_STACK_LEN 16 -+#endif -+ - /* This yields a mask that user programs can use to figure out what - instruction set this cpu supports. This can NOT be done in userspace - on Sparc. */ -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/elf_64.h linux-2.6.31.1/arch/sparc/include/asm/elf_64.h ---- linux-2.6.31.1/arch/sparc/include/asm/elf_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/elf_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -163,6 +163,12 @@ typedef struct { - #define ELF_ET_DYN_BASE 0x0000010000000000UL - #define COMPAT_ELF_ET_DYN_BASE 0x0000000070000000UL - -+#ifdef CONFIG_PAX_ASLR -+#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT) ? 0x10000UL : 0x100000UL) -+ -+#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT) ? 14 : 28 ) -+#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT) ? 15 : 29 ) -+#endif - - /* This yields a mask that user programs can use to figure out what - instruction set this cpu supports. 
*/ -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/pgtable_32.h linux-2.6.31.1/arch/sparc/include/asm/pgtable_32.h ---- linux-2.6.31.1/arch/sparc/include/asm/pgtable_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/pgtable_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -43,6 +43,13 @@ BTFIXUPDEF_SIMM13(user_ptrs_per_pgd) - BTFIXUPDEF_INT(page_none) - BTFIXUPDEF_INT(page_copy) - BTFIXUPDEF_INT(page_readonly) -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+BTFIXUPDEF_INT(page_shared_noexec) -+BTFIXUPDEF_INT(page_copy_noexec) -+BTFIXUPDEF_INT(page_readonly_noexec) -+#endif -+ - BTFIXUPDEF_INT(page_kernel) - - #define PMD_SHIFT SUN4C_PMD_SHIFT -@@ -64,6 +71,16 @@ extern pgprot_t PAGE_SHARED; - #define PAGE_COPY __pgprot(BTFIXUP_INT(page_copy)) - #define PAGE_READONLY __pgprot(BTFIXUP_INT(page_readonly)) - -+#ifdef CONFIG_PAX_PAGEEXEC -+extern pgprot_t PAGE_SHARED_NOEXEC; -+# define PAGE_COPY_NOEXEC __pgprot(BTFIXUP_INT(page_copy_noexec)) -+# define PAGE_READONLY_NOEXEC __pgprot(BTFIXUP_INT(page_readonly_noexec)) -+#else -+# define PAGE_SHARED_NOEXEC PAGE_SHARED -+# define PAGE_COPY_NOEXEC PAGE_COPY -+# define PAGE_READONLY_NOEXEC PAGE_READONLY -+#endif -+ - extern unsigned long page_kernel; - - #ifdef MODULE -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/pgtsrmmu.h linux-2.6.31.1/arch/sparc/include/asm/pgtsrmmu.h ---- linux-2.6.31.1/arch/sparc/include/asm/pgtsrmmu.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/pgtsrmmu.h 2009-10-01 20:12:42.000000000 -0400 -@@ -115,6 +115,13 @@ - SRMMU_EXEC | SRMMU_REF) - #define SRMMU_PAGE_RDONLY __pgprot(SRMMU_VALID | SRMMU_CACHE | \ - SRMMU_EXEC | SRMMU_REF) -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+#define SRMMU_PAGE_SHARED_NOEXEC __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_WRITE | SRMMU_REF) -+#define SRMMU_PAGE_COPY_NOEXEC __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_REF) -+#define SRMMU_PAGE_RDONLY_NOEXEC __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_REF) -+#endif -+ - #define SRMMU_PAGE_KERNEL __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_PRIV | \ - SRMMU_DIRTY | SRMMU_REF) - -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/spinlock_64.h linux-2.6.31.1/arch/sparc/include/asm/spinlock_64.h ---- linux-2.6.31.1/arch/sparc/include/asm/spinlock_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/spinlock_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -99,7 +99,12 @@ static void inline __read_lock(raw_rwloc - __asm__ __volatile__ ( - "1: ldsw [%2], %0\n" - " brlz,pn %0, 2f\n" --"4: add %0, 1, %1\n" -+"4: addcc %0, 1, %1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+" tvs %%icc, 6\n" -+#endif -+ - " cas [%2], %0, %1\n" - " cmp %0, %1\n" - " bne,pn %%icc, 1b\n" -@@ -112,7 +117,7 @@ static void inline __read_lock(raw_rwloc - " .previous" - : "=&r" (tmp1), "=&r" (tmp2) - : "r" (lock) -- : "memory"); -+ : "memory", "cc"); - } - - static int inline __read_trylock(raw_rwlock_t *lock) -@@ -123,7 +128,12 @@ static int inline __read_trylock(raw_rwl - "1: ldsw [%2], %0\n" - " brlz,a,pn %0, 2f\n" - " mov 0, %0\n" --" add %0, 1, %1\n" -+" addcc %0, 1, %1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+" tvs %%icc, 6\n" -+#endif -+ - " cas [%2], %0, %1\n" - " cmp %0, %1\n" - " bne,pn %%icc, 1b\n" -@@ -142,7 +152,12 @@ static void inline __read_unlock(raw_rwl - - __asm__ __volatile__( - "1: lduw [%2], %0\n" --" sub %0, 1, %1\n" -+" subcc %0, 1, %1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+" tvs %%icc, 6\n" -+#endif -+ - " cas [%2], %0, %1\n" - " cmp %0, %1\n" - " bne,pn %%xcc, 1b\n" -diff -urNp 
linux-2.6.31.1/arch/sparc/include/asm/uaccess_32.h linux-2.6.31.1/arch/sparc/include/asm/uaccess_32.h ---- linux-2.6.31.1/arch/sparc/include/asm/uaccess_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/uaccess_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -249,27 +249,49 @@ extern unsigned long __copy_user(void __ - - static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) - { -- if (n && __access_ok((unsigned long) to, n)) -+ if ((long)n < 0) -+ return n; -+ -+ if (n && __access_ok((unsigned long) to, n)) { -+ if (!__builtin_constant_p(n)) -+ check_object_size(from, n, true); - return __copy_user(to, (__force void __user *) from, n); -- else -+ } else - return n; - } - - static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ -+ if (!__builtin_constant_p(n)) -+ check_object_size(from, n, true); -+ - return __copy_user(to, (__force void __user *) from, n); - } - - static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) - { -- if (n && __access_ok((unsigned long) from, n)) -+ if ((long)n < 0) -+ return n; -+ -+ if (n && __access_ok((unsigned long) from, n)) { -+ if (!__builtin_constant_p(n)) -+ check_object_size(to, n, false); - return __copy_user((__force void __user *) to, from, n); -- else -+ } else - return n; - } - - static inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ -+ if (!__builtin_constant_p(n)) -+ check_object_size(to, n, false); -+ - return __copy_user((__force void __user *) to, from, n); - } - -diff -urNp linux-2.6.31.1/arch/sparc/include/asm/uaccess_64.h linux-2.6.31.1/arch/sparc/include/asm/uaccess_64.h ---- linux-2.6.31.1/arch/sparc/include/asm/uaccess_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/include/asm/uaccess_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -212,7 +212,15 @@ extern unsigned long copy_from_user_fixu - static inline unsigned long __must_check - copy_from_user(void *to, const void __user *from, unsigned long size) - { -- unsigned long ret = ___copy_from_user(to, from, size); -+ unsigned long ret; -+ -+ if (unlikely(((long)size > INT_MAX) || ((long)size < 0))) -+ return size; -+ -+ if (!__builtin_constant_p(size)) -+ check_object_size(to, size, false); -+ -+ ret = ___copy_from_user(to, from, size); - - if (unlikely(ret)) - ret = copy_from_user_fixup(to, from, size); -@@ -228,7 +236,15 @@ extern unsigned long copy_to_user_fixup( - static inline unsigned long __must_check - copy_to_user(void __user *to, const void *from, unsigned long size) - { -- unsigned long ret = ___copy_to_user(to, from, size); -+ unsigned long ret; -+ -+ if (unlikely(((long)size > INT_MAX) || ((long)size < 0))) -+ return size; -+ -+ if (!__builtin_constant_p(size)) -+ check_object_size(from, size, true); -+ -+ ret = ___copy_to_user(to, from, size); - - if (unlikely(ret)) - ret = copy_to_user_fixup(to, from, size); -diff -urNp linux-2.6.31.1/arch/sparc/kernel/Makefile linux-2.6.31.1/arch/sparc/kernel/Makefile ---- linux-2.6.31.1/arch/sparc/kernel/Makefile 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/kernel/Makefile 2009-10-01 20:12:42.000000000 -0400 -@@ -3,7 +3,7 @@ - # - - asflags-y := -ansi --ccflags-y := -Werror -+#ccflags-y := -Werror - - extra-y := head_$(BITS).o - extra-y += init_task.o -diff -urNp linux-2.6.31.1/arch/sparc/kernel/sys_sparc_32.c 
linux-2.6.31.1/arch/sparc/kernel/sys_sparc_32.c ---- linux-2.6.31.1/arch/sparc/kernel/sys_sparc_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/kernel/sys_sparc_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -56,7 +56,7 @@ unsigned long arch_get_unmapped_area(str - if (ARCH_SUN4C && len > 0x20000000) - return -ENOMEM; - if (!addr) -- addr = TASK_UNMAPPED_BASE; -+ addr = current->mm->mmap_base; - - if (flags & MAP_SHARED) - addr = COLOUR_ALIGN(addr); -diff -urNp linux-2.6.31.1/arch/sparc/kernel/sys_sparc_64.c linux-2.6.31.1/arch/sparc/kernel/sys_sparc_64.c ---- linux-2.6.31.1/arch/sparc/kernel/sys_sparc_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/kernel/sys_sparc_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -125,7 +125,7 @@ unsigned long arch_get_unmapped_area(str - /* We do not accept a shared mapping if it would violate - * cache aliasing constraints. - */ -- if ((flags & MAP_SHARED) && -+ if ((filp || (flags & MAP_SHARED)) && - ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) - return -EINVAL; - return addr; -@@ -140,6 +140,10 @@ unsigned long arch_get_unmapped_area(str - if (filp || (flags & MAP_SHARED)) - do_color_align = 1; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - if (addr) { - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); -@@ -153,9 +157,9 @@ unsigned long arch_get_unmapped_area(str - } - - if (len > mm->cached_hole_size) { -- start_addr = addr = mm->free_area_cache; -+ start_addr = addr = mm->free_area_cache; - } else { -- start_addr = addr = TASK_UNMAPPED_BASE; -+ start_addr = addr = mm->mmap_base; - mm->cached_hole_size = 0; - } - -@@ -175,8 +179,8 @@ full_search: - vma = find_vma(mm, VA_EXCLUDE_END); - } - if (unlikely(task_size < addr)) { -- if (start_addr != TASK_UNMAPPED_BASE) { -- start_addr = addr = TASK_UNMAPPED_BASE; -+ if (start_addr != mm->mmap_base) { -+ start_addr = addr = mm->mmap_base; - mm->cached_hole_size = 0; - goto full_search; - } -@@ -216,7 +220,7 @@ arch_get_unmapped_area_topdown(struct fi - /* We do not accept a shared mapping if it would violate - * cache aliasing constraints. 
- */ -- if ((flags & MAP_SHARED) && -+ if ((filp || (flags & MAP_SHARED)) && - ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) - return -EINVAL; - return addr; -@@ -380,6 +384,12 @@ void arch_pick_mmap_layout(struct mm_str - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY || - sysctl_legacy_va_layout) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } else { -@@ -394,6 +404,12 @@ void arch_pick_mmap_layout(struct mm_str - gap = (task_size / 6 * 5); - - mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base -= mm->delta_mmap + mm->delta_stack; -+#endif -+ - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; - } -diff -urNp linux-2.6.31.1/arch/sparc/kernel/traps_64.c linux-2.6.31.1/arch/sparc/kernel/traps_64.c ---- linux-2.6.31.1/arch/sparc/kernel/traps_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/kernel/traps_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -93,6 +93,12 @@ void bad_trap(struct pt_regs *regs, long - - lvl -= 0x100; - if (regs->tstate & TSTATE_PRIV) { -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ if (lvl == 6) -+ pax_report_refcount_overflow(regs); -+#endif -+ - sprintf(buffer, "Kernel bad sw trap %lx", lvl); - die_if_kernel(buffer, regs); - } -@@ -111,11 +117,16 @@ void bad_trap(struct pt_regs *regs, long - void bad_trap_tl1(struct pt_regs *regs, long lvl) - { - char buffer[32]; -- -+ - if (notify_die(DIE_TRAP_TL1, "bad trap tl1", regs, - 0, lvl, SIGTRAP) == NOTIFY_STOP) - return; - -+#ifdef CONFIG_PAX_REFCOUNT -+ if (lvl == 6) -+ pax_report_refcount_overflow(regs); -+#endif -+ - dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); - - sprintf (buffer, "Bad trap %lx at tl>0", lvl); -diff -urNp linux-2.6.31.1/arch/sparc/lib/atomic32.c linux-2.6.31.1/arch/sparc/lib/atomic32.c ---- linux-2.6.31.1/arch/sparc/lib/atomic32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/lib/atomic32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -80,6 +80,12 @@ void atomic_set(atomic_t *v, int i) - } - EXPORT_SYMBOL(atomic_set); - -+void atomic_set_unchecked(atomic_unchecked_t *v, int i) -+{ -+ atomic_set((atomic_t *)v, i); -+} -+EXPORT_SYMBOL(atomic_set_unchecked); -+ - unsigned long ___set_bit(unsigned long *addr, unsigned long mask) - { - unsigned long old, flags; -diff -urNp linux-2.6.31.1/arch/sparc/lib/atomic_64.S linux-2.6.31.1/arch/sparc/lib/atomic_64.S ---- linux-2.6.31.1/arch/sparc/lib/atomic_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/lib/atomic_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -18,7 +18,12 @@ - atomic_add: /* %o0 = increment, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: lduw [%o1], %g1 -- add %g1, %o0, %g7 -+ addcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %icc, 2f -@@ -28,12 +33,32 @@ atomic_add: /* %o0 = increment, %o1 = at - 2: BACKOFF_SPIN(%o2, %o3, 1b) - .size atomic_add, .-atomic_add - -+ .globl atomic_add_unchecked -+ .type atomic_add_unchecked,#function -+atomic_add_unchecked: /* %o0 = increment, %o1 = atomic_ptr */ -+ BACKOFF_SETUP(%o2) -+1: lduw [%o1], %g1 -+ add %g1, %o0, %g7 -+ cas [%o1], %g1, %g7 -+ cmp %g1, %g7 -+ bne,pn %icc, 2f -+ nop -+ retl -+ nop -+2: 
BACKOFF_SPIN(%o2, %o3, 1b) -+ .size atomic_add_unchecked, .-atomic_add_unchecked -+ - .globl atomic_sub - .type atomic_sub,#function - atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: lduw [%o1], %g1 -- sub %g1, %o0, %g7 -+ subcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %icc, 2f -@@ -43,12 +68,32 @@ atomic_sub: /* %o0 = decrement, %o1 = at - 2: BACKOFF_SPIN(%o2, %o3, 1b) - .size atomic_sub, .-atomic_sub - -+ .globl atomic_sub_unchecked -+ .type atomic_sub_unchecked,#function -+atomic_sub_unchecked: /* %o0 = decrement, %o1 = atomic_ptr */ -+ BACKOFF_SETUP(%o2) -+1: lduw [%o1], %g1 -+ sub %g1, %o0, %g7 -+ cas [%o1], %g1, %g7 -+ cmp %g1, %g7 -+ bne,pn %icc, 2f -+ nop -+ retl -+ nop -+2: BACKOFF_SPIN(%o2, %o3, 1b) -+ .size atomic_sub_unchecked, .-atomic_sub_unchecked -+ - .globl atomic_add_ret - .type atomic_add_ret,#function - atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: lduw [%o1], %g1 -- add %g1, %o0, %g7 -+ addcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %icc, 2f -@@ -64,7 +109,12 @@ atomic_add_ret: /* %o0 = increment, %o1 - atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: lduw [%o1], %g1 -- sub %g1, %o0, %g7 -+ subcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %icc, 2f -@@ -80,7 +130,12 @@ atomic_sub_ret: /* %o0 = decrement, %o1 - atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: ldx [%o1], %g1 -- add %g1, %o0, %g7 -+ addcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %xcc, 6 -+#endif -+ - casx [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %xcc, 2f -@@ -95,7 +150,12 @@ atomic64_add: /* %o0 = increment, %o1 = - atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: ldx [%o1], %g1 -- sub %g1, %o0, %g7 -+ subcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %xcc, 6 -+#endif -+ - casx [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %xcc, 2f -@@ -110,7 +170,12 @@ atomic64_sub: /* %o0 = decrement, %o1 = - atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: ldx [%o1], %g1 -- add %g1, %o0, %g7 -+ addcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %xcc, 6 -+#endif -+ - casx [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %xcc, 2f -@@ -126,7 +191,12 @@ atomic64_add_ret: /* %o0 = increment, %o - atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ - BACKOFF_SETUP(%o2) - 1: ldx [%o1], %g1 -- sub %g1, %o0, %g7 -+ subcc %g1, %o0, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %xcc, 6 -+#endif -+ - casx [%o1], %g1, %g7 - cmp %g1, %g7 - bne,pn %xcc, 2f -diff -urNp linux-2.6.31.1/arch/sparc/lib/ksyms.c linux-2.6.31.1/arch/sparc/lib/ksyms.c ---- linux-2.6.31.1/arch/sparc/lib/ksyms.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/lib/ksyms.c 2009-10-01 20:12:42.000000000 -0400 -@@ -144,8 +144,10 @@ EXPORT_SYMBOL(__downgrade_write); - - /* Atomic counter implementation. 
*/ - EXPORT_SYMBOL(atomic_add); -+EXPORT_SYMBOL(atomic_add_unchecked); - EXPORT_SYMBOL(atomic_add_ret); - EXPORT_SYMBOL(atomic_sub); -+EXPORT_SYMBOL(atomic_sub_unchecked); - EXPORT_SYMBOL(atomic_sub_ret); - EXPORT_SYMBOL(atomic64_add); - EXPORT_SYMBOL(atomic64_add_ret); -diff -urNp linux-2.6.31.1/arch/sparc/lib/rwsem_64.S linux-2.6.31.1/arch/sparc/lib/rwsem_64.S ---- linux-2.6.31.1/arch/sparc/lib/rwsem_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/lib/rwsem_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -11,7 +11,12 @@ - .globl __down_read - __down_read: - 1: lduw [%o0], %g1 -- add %g1, 1, %g7 -+ addcc %g1, 1, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o0], %g1, %g7 - cmp %g1, %g7 - bne,pn %icc, 1b -@@ -33,7 +38,12 @@ __down_read: - .globl __down_read_trylock - __down_read_trylock: - 1: lduw [%o0], %g1 -- add %g1, 1, %g7 -+ addcc %g1, 1, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cmp %g7, 0 - bl,pn %icc, 2f - mov 0, %o1 -@@ -51,7 +61,12 @@ __down_write: - or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 - 1: - lduw [%o0], %g3 -- add %g3, %g1, %g7 -+ addcc %g3, %g1, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o0], %g3, %g7 - cmp %g3, %g7 - bne,pn %icc, 1b -@@ -77,7 +92,12 @@ __down_write_trylock: - cmp %g3, 0 - bne,pn %icc, 2f - mov 0, %o1 -- add %g3, %g1, %g7 -+ addcc %g3, %g1, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o0], %g3, %g7 - cmp %g3, %g7 - bne,pn %icc, 1b -@@ -90,7 +110,12 @@ __down_write_trylock: - __up_read: - 1: - lduw [%o0], %g1 -- sub %g1, 1, %g7 -+ subcc %g1, 1, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o0], %g1, %g7 - cmp %g1, %g7 - bne,pn %icc, 1b -@@ -118,7 +143,12 @@ __up_write: - or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 - 1: - lduw [%o0], %g3 -- sub %g3, %g1, %g7 -+ subcc %g3, %g1, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o0], %g3, %g7 - cmp %g3, %g7 - bne,pn %icc, 1b -@@ -143,7 +173,12 @@ __downgrade_write: - or %g1, %lo(RWSEM_WAITING_BIAS), %g1 - 1: - lduw [%o0], %g3 -- sub %g3, %g1, %g7 -+ subcc %g3, %g1, %g7 -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ tvs %icc, 6 -+#endif -+ - cas [%o0], %g3, %g7 - cmp %g3, %g7 - bne,pn %icc, 1b -diff -urNp linux-2.6.31.1/arch/sparc/Makefile linux-2.6.31.1/arch/sparc/Makefile ---- linux-2.6.31.1/arch/sparc/Makefile 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/Makefile 2009-10-01 20:12:42.000000000 -0400 -@@ -81,7 +81,7 @@ drivers-$(CONFIG_OPROFILE) += arch/sparc - # Export what is needed by arch/sparc/boot/Makefile - export VMLINUX_INIT VMLINUX_MAIN - VMLINUX_INIT := $(head-y) $(init-y) --VMLINUX_MAIN := $(core-y) kernel/ mm/ fs/ ipc/ security/ crypto/ block/ -+VMLINUX_MAIN := $(core-y) kernel/ mm/ fs/ ipc/ security/ crypto/ block/ grsecurity/ - VMLINUX_MAIN += $(patsubst %/, %/lib.a, $(libs-y)) $(libs-y) - VMLINUX_MAIN += $(drivers-y) $(net-y) - -diff -urNp linux-2.6.31.1/arch/sparc/mm/fault_32.c linux-2.6.31.1/arch/sparc/mm/fault_32.c ---- linux-2.6.31.1/arch/sparc/mm/fault_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/mm/fault_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -21,6 +21,9 @@ - #include <linux/interrupt.h> - #include <linux/module.h> - #include <linux/kdebug.h> -+#include <linux/slab.h> -+#include <linux/pagemap.h> -+#include <linux/compiler.h> - - #include <asm/system.h> - #include <asm/page.h> -@@ -167,6 +170,267 @@ static unsigned long compute_si_addr(str - return 
safe_compute_effective_address(regs, insn); - } - -+#ifdef CONFIG_PAX_PAGEEXEC -+#ifdef CONFIG_PAX_DLRESOLVE -+void pax_emuplt_close(struct vm_area_struct *vma) -+{ -+ vma->vm_mm->call_dl_resolve = 0UL; -+} -+ -+static int pax_emuplt_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -+{ -+ unsigned int *kaddr; -+ -+ vmf->page = alloc_page(GFP_HIGHUSER); -+ if (!vmf->page) -+ return VM_FAULT_OOM; -+ -+ kaddr = kmap(vmf->page); -+ memset(kaddr, 0, PAGE_SIZE); -+ kaddr[0] = 0x9DE3BFA8U; /* save */ -+ flush_dcache_page(vmf->page); -+ kunmap(vmf->page); -+ return VM_FAULT_MAJOR; -+} -+ -+static const struct vm_operations_struct pax_vm_ops = { -+ .close = pax_emuplt_close, -+ .fault = pax_emuplt_fault -+}; -+ -+static int pax_insert_vma(struct vm_area_struct *vma, unsigned long addr) -+{ -+ int ret; -+ -+ vma->vm_mm = current->mm; -+ vma->vm_start = addr; -+ vma->vm_end = addr + PAGE_SIZE; -+ vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; -+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); -+ vma->vm_ops = &pax_vm_ops; -+ -+ ret = insert_vm_struct(current->mm, vma); -+ if (ret) -+ return ret; -+ -+ ++current->mm->total_vm; -+ return 0; -+} -+#endif -+ -+/* -+ * PaX: decide what to do with offenders (regs->pc = fault address) -+ * -+ * returns 1 when task should be killed -+ * 2 when patched PLT trampoline was detected -+ * 3 when unpatched PLT trampoline was detected -+ */ -+static int pax_handle_fetch_fault(struct pt_regs *regs) -+{ -+ -+#ifdef CONFIG_PAX_EMUPLT -+ int err; -+ -+ do { /* PaX: patched PLT emulation #1 */ -+ unsigned int sethi1, sethi2, jmpl; -+ -+ err = get_user(sethi1, (unsigned int *)regs->pc); -+ err |= get_user(sethi2, (unsigned int *)(regs->pc+4)); -+ err |= get_user(jmpl, (unsigned int *)(regs->pc+8)); -+ -+ if (err) -+ break; -+ -+ if ((sethi1 & 0xFFC00000U) == 0x03000000U && -+ (sethi2 & 0xFFC00000U) == 0x03000000U && -+ (jmpl & 0xFFFFE000U) == 0x81C06000U) -+ { -+ unsigned int addr; -+ -+ regs->u_regs[UREG_G1] = (sethi2 & 0x003FFFFFU) << 10; -+ addr = regs->u_regs[UREG_G1]; -+ addr += (((jmpl | 0xFFFFE000U) ^ 0x00001000U) + 0x00001000U); -+ regs->pc = addr; -+ regs->npc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+ { /* PaX: patched PLT emulation #2 */ -+ unsigned int ba; -+ -+ err = get_user(ba, (unsigned int *)regs->pc); -+ -+ if (!err && (ba & 0xFFC00000U) == 0x30800000U) { -+ unsigned int addr; -+ -+ addr = regs->pc + ((((ba | 0xFFC00000U) ^ 0x00200000U) + 0x00200000U) << 2); -+ regs->pc = addr; -+ regs->npc = addr+4; -+ return 2; -+ } -+ } -+ -+ do { /* PaX: patched PLT emulation #3 */ -+ unsigned int sethi, jmpl, nop; -+ -+ err = get_user(sethi, (unsigned int *)regs->pc); -+ err |= get_user(jmpl, (unsigned int *)(regs->pc+4)); -+ err |= get_user(nop, (unsigned int *)(regs->pc+8)); -+ -+ if (err) -+ break; -+ -+ if ((sethi & 0xFFC00000U) == 0x03000000U && -+ (jmpl & 0xFFFFE000U) == 0x81C06000U && -+ nop == 0x01000000U) -+ { -+ unsigned int addr; -+ -+ addr = (sethi & 0x003FFFFFU) << 10; -+ regs->u_regs[UREG_G1] = addr; -+ addr += (((jmpl | 0xFFFFE000U) ^ 0x00001000U) + 0x00001000U); -+ regs->pc = addr; -+ regs->npc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: unpatched PLT emulation step 1 */ -+ unsigned int sethi, ba, nop; -+ -+ err = get_user(sethi, (unsigned int *)regs->pc); -+ err |= get_user(ba, (unsigned int *)(regs->pc+4)); -+ err |= get_user(nop, (unsigned int *)(regs->pc+8)); -+ -+ if (err) -+ break; -+ -+ if ((sethi & 0xFFC00000U) == 0x03000000U && -+ ((ba & 0xFFC00000U) == 0x30800000U || (ba & 0xFFF80000U) == 
0x30680000U) && -+ nop == 0x01000000U) -+ { -+ unsigned int addr, save, call; -+ -+ if ((ba & 0xFFC00000U) == 0x30800000U) -+ addr = regs->pc + 4 + ((((ba | 0xFFC00000U) ^ 0x00200000U) + 0x00200000U) << 2); -+ else -+ addr = regs->pc + 4 + ((((ba | 0xFFF80000U) ^ 0x00040000U) + 0x00040000U) << 2); -+ -+ err = get_user(save, (unsigned int *)addr); -+ err |= get_user(call, (unsigned int *)(addr+4)); -+ err |= get_user(nop, (unsigned int *)(addr+8)); -+ if (err) -+ break; -+ -+#ifdef CONFIG_PAX_DLRESOLVE -+ if (save == 0x9DE3BFA8U && -+ (call & 0xC0000000U) == 0x40000000U && -+ nop == 0x01000000U) -+ { -+ struct vm_area_struct *vma; -+ unsigned long call_dl_resolve; -+ -+ down_read(&current->mm->mmap_sem); -+ call_dl_resolve = current->mm->call_dl_resolve; -+ up_read(&current->mm->mmap_sem); -+ if (likely(call_dl_resolve)) -+ goto emulate; -+ -+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); -+ -+ down_write(&current->mm->mmap_sem); -+ if (current->mm->call_dl_resolve) { -+ call_dl_resolve = current->mm->call_dl_resolve; -+ up_write(&current->mm->mmap_sem); -+ if (vma) -+ kmem_cache_free(vm_area_cachep, vma); -+ goto emulate; -+ } -+ -+ call_dl_resolve = get_unmapped_area(NULL, 0UL, PAGE_SIZE, 0UL, MAP_PRIVATE); -+ if (!vma || (call_dl_resolve & ~PAGE_MASK)) { -+ up_write(&current->mm->mmap_sem); -+ if (vma) -+ kmem_cache_free(vm_area_cachep, vma); -+ return 1; -+ } -+ -+ if (pax_insert_vma(vma, call_dl_resolve)) { -+ up_write(&current->mm->mmap_sem); -+ kmem_cache_free(vm_area_cachep, vma); -+ return 1; -+ } -+ -+ current->mm->call_dl_resolve = call_dl_resolve; -+ up_write(&current->mm->mmap_sem); -+ -+emulate: -+ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; -+ regs->pc = call_dl_resolve; -+ regs->npc = addr+4; -+ return 3; -+ } -+#endif -+ -+ /* PaX: glibc 2.4+ generates sethi/jmpl instead of save/call */ -+ if ((save & 0xFFC00000U) == 0x05000000U && -+ (call & 0xFFFFE000U) == 0x85C0A000U && -+ nop == 0x01000000U) -+ { -+ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; -+ regs->u_regs[UREG_G2] = addr + 4; -+ addr = (save & 0x003FFFFFU) << 10; -+ addr += (((call | 0xFFFFE000U) ^ 0x00001000U) + 0x00001000U); -+ regs->pc = addr; -+ regs->npc = addr+4; -+ return 3; -+ } -+ } -+ } while (0); -+ -+ do { /* PaX: unpatched PLT emulation step 2 */ -+ unsigned int save, call, nop; -+ -+ err = get_user(save, (unsigned int *)(regs->pc-4)); -+ err |= get_user(call, (unsigned int *)regs->pc); -+ err |= get_user(nop, (unsigned int *)(regs->pc+4)); -+ if (err) -+ break; -+ -+ if (save == 0x9DE3BFA8U && -+ (call & 0xC0000000U) == 0x40000000U && -+ nop == 0x01000000U) -+ { -+ unsigned int dl_resolve = regs->pc + ((((call | 0xC0000000U) ^ 0x20000000U) + 0x20000000U) << 2); -+ -+ regs->u_regs[UREG_RETPC] = regs->pc; -+ regs->pc = dl_resolve; -+ regs->npc = dl_resolve+4; -+ return 3; -+ } -+ } while (0); -+#endif -+ -+ return 1; -+} -+ -+void pax_report_insns(void *pc, void *sp) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 5; i++) { -+ unsigned int c; -+ if (get_user(c, (unsigned int *)pc+i)) -+ printk(KERN_CONT "???????? 
"); -+ else -+ printk(KERN_CONT "%08x ", c); -+ } -+ printk("\n"); -+} -+#endif -+ - asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, - unsigned long address) - { -@@ -231,6 +495,24 @@ good_area: - if(!(vma->vm_flags & VM_WRITE)) - goto bad_area; - } else { -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && text_fault && !(vma->vm_flags & VM_EXEC)) { -+ up_read(&mm->mmap_sem); -+ switch (pax_handle_fetch_fault(regs)) { -+ -+#ifdef CONFIG_PAX_EMUPLT -+ case 2: -+ case 3: -+ return; -+#endif -+ -+ } -+ pax_report_fault(regs, (void *)regs->pc, (void *)regs->u_regs[UREG_FP]); -+ do_group_exit(SIGKILL); -+ } -+#endif -+ - /* Allow reads even for write-only mappings */ - if(!(vma->vm_flags & (VM_READ | VM_EXEC))) - goto bad_area; -diff -urNp linux-2.6.31.1/arch/sparc/mm/fault_64.c linux-2.6.31.1/arch/sparc/mm/fault_64.c ---- linux-2.6.31.1/arch/sparc/mm/fault_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/mm/fault_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -20,6 +20,9 @@ - #include <linux/kprobes.h> - #include <linux/kdebug.h> - #include <linux/percpu.h> -+#include <linux/slab.h> -+#include <linux/pagemap.h> -+#include <linux/compiler.h> - - #include <asm/page.h> - #include <asm/pgtable.h> -@@ -249,6 +252,416 @@ static void noinline bogus_32bit_fault_a - show_regs(regs); - } - -+#ifdef CONFIG_PAX_PAGEEXEC -+#ifdef CONFIG_PAX_DLRESOLVE -+static void pax_emuplt_close(struct vm_area_struct *vma) -+{ -+ vma->vm_mm->call_dl_resolve = 0UL; -+} -+ -+static int pax_emuplt_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -+{ -+ unsigned int *kaddr; -+ -+ vmf->page = alloc_page(GFP_HIGHUSER); -+ if (!vmf->page) -+ return VM_FAULT_OOM; -+ -+ kaddr = kmap(vmf->page); -+ memset(kaddr, 0, PAGE_SIZE); -+ kaddr[0] = 0x9DE3BFA8U; /* save */ -+ flush_dcache_page(vmf->page); -+ kunmap(vmf->page); -+ return VM_FAULT_MAJOR; -+} -+ -+static const struct vm_operations_struct pax_vm_ops = { -+ .close = pax_emuplt_close, -+ .fault = pax_emuplt_fault -+}; -+ -+static int pax_insert_vma(struct vm_area_struct *vma, unsigned long addr) -+{ -+ int ret; -+ -+ vma->vm_mm = current->mm; -+ vma->vm_start = addr; -+ vma->vm_end = addr + PAGE_SIZE; -+ vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; -+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); -+ vma->vm_ops = &pax_vm_ops; -+ -+ ret = insert_vm_struct(current->mm, vma); -+ if (ret) -+ return ret; -+ -+ ++current->mm->total_vm; -+ return 0; -+} -+#endif -+ -+/* -+ * PaX: decide what to do with offenders (regs->tpc = fault address) -+ * -+ * returns 1 when task should be killed -+ * 2 when patched PLT trampoline was detected -+ * 3 when unpatched PLT trampoline was detected -+ */ -+static int pax_handle_fetch_fault(struct pt_regs *regs) -+{ -+ -+#ifdef CONFIG_PAX_EMUPLT -+ int err; -+ -+ do { /* PaX: patched PLT emulation #1 */ -+ unsigned int sethi1, sethi2, jmpl; -+ -+ err = get_user(sethi1, (unsigned int *)regs->tpc); -+ err |= get_user(sethi2, (unsigned int *)(regs->tpc+4)); -+ err |= get_user(jmpl, (unsigned int *)(regs->tpc+8)); -+ -+ if (err) -+ break; -+ -+ if ((sethi1 & 0xFFC00000U) == 0x03000000U && -+ (sethi2 & 0xFFC00000U) == 0x03000000U && -+ (jmpl & 0xFFFFE000U) == 0x81C06000U) -+ { -+ unsigned long addr; -+ -+ regs->u_regs[UREG_G1] = (sethi2 & 0x003FFFFFU) << 10; -+ addr = regs->u_regs[UREG_G1]; -+ addr += (((jmpl | 0xFFFFFFFFFFFFE000UL) ^ 0x00001000UL) + 0x00001000UL); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ addr &= 0xFFFFFFFFUL; -+ -+ regs->tpc = addr; 
-+ regs->tnpc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+ { /* PaX: patched PLT emulation #2 */ -+ unsigned int ba; -+ -+ err = get_user(ba, (unsigned int *)regs->tpc); -+ -+ if (!err && (ba & 0xFFC00000U) == 0x30800000U) { -+ unsigned long addr; -+ -+ addr = regs->tpc + ((((ba | 0xFFFFFFFFFFC00000UL) ^ 0x00200000UL) + 0x00200000UL) << 2); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ addr &= 0xFFFFFFFFUL; -+ -+ regs->tpc = addr; -+ regs->tnpc = addr+4; -+ return 2; -+ } -+ } -+ -+ do { /* PaX: patched PLT emulation #3 */ -+ unsigned int sethi, jmpl, nop; -+ -+ err = get_user(sethi, (unsigned int *)regs->tpc); -+ err |= get_user(jmpl, (unsigned int *)(regs->tpc+4)); -+ err |= get_user(nop, (unsigned int *)(regs->tpc+8)); -+ -+ if (err) -+ break; -+ -+ if ((sethi & 0xFFC00000U) == 0x03000000U && -+ (jmpl & 0xFFFFE000U) == 0x81C06000U && -+ nop == 0x01000000U) -+ { -+ unsigned long addr; -+ -+ addr = (sethi & 0x003FFFFFU) << 10; -+ regs->u_regs[UREG_G1] = addr; -+ addr += (((jmpl | 0xFFFFFFFFFFFFE000UL) ^ 0x00001000UL) + 0x00001000UL); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ addr &= 0xFFFFFFFFUL; -+ -+ regs->tpc = addr; -+ regs->tnpc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: patched PLT emulation #4 */ -+ unsigned int mov1, call, mov2; -+ -+ err = get_user(mov1, (unsigned int *)regs->tpc); -+ err |= get_user(call, (unsigned int *)(regs->tpc+4)); -+ err |= get_user(mov2, (unsigned int *)(regs->tpc+8)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0x8210000FU && -+ (call & 0xC0000000U) == 0x40000000U && -+ mov2 == 0x9E100001U) -+ { -+ unsigned long addr; -+ -+ regs->u_regs[UREG_G1] = regs->u_regs[UREG_RETPC]; -+ addr = regs->tpc + 4 + ((((call | 0xFFFFFFFFC0000000UL) ^ 0x20000000UL) + 0x20000000UL) << 2); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ addr &= 0xFFFFFFFFUL; -+ -+ regs->tpc = addr; -+ regs->tnpc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: patched PLT emulation #5 */ -+ unsigned int sethi1, sethi2, or1, or2, sllx, jmpl, nop; -+ -+ err = get_user(sethi1, (unsigned int *)regs->tpc); -+ err |= get_user(sethi2, (unsigned int *)(regs->tpc+4)); -+ err |= get_user(or1, (unsigned int *)(regs->tpc+8)); -+ err |= get_user(or2, (unsigned int *)(regs->tpc+12)); -+ err |= get_user(sllx, (unsigned int *)(regs->tpc+16)); -+ err |= get_user(jmpl, (unsigned int *)(regs->tpc+20)); -+ err |= get_user(nop, (unsigned int *)(regs->tpc+24)); -+ -+ if (err) -+ break; -+ -+ if ((sethi1 & 0xFFC00000U) == 0x03000000U && -+ (sethi2 & 0xFFC00000U) == 0x0B000000U && -+ (or1 & 0xFFFFE000U) == 0x82106000U && -+ (or2 & 0xFFFFE000U) == 0x8A116000U && -+ sllx == 0x83287020 && -+ jmpl == 0x81C04005U && -+ nop == 0x01000000U) -+ { -+ unsigned long addr; -+ -+ regs->u_regs[UREG_G1] = ((sethi1 & 0x003FFFFFU) << 10) | (or1 & 0x000003FFU); -+ regs->u_regs[UREG_G1] <<= 32; -+ regs->u_regs[UREG_G5] = ((sethi2 & 0x003FFFFFU) << 10) | (or2 & 0x000003FFU); -+ addr = regs->u_regs[UREG_G1] + regs->u_regs[UREG_G5]; -+ regs->tpc = addr; -+ regs->tnpc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: patched PLT emulation #6 */ -+ unsigned int sethi1, sethi2, sllx, or, jmpl, nop; -+ -+ err = get_user(sethi1, (unsigned int *)regs->tpc); -+ err |= get_user(sethi2, (unsigned int *)(regs->tpc+4)); -+ err |= get_user(sllx, (unsigned int *)(regs->tpc+8)); -+ err |= get_user(or, (unsigned int *)(regs->tpc+12)); -+ err |= get_user(jmpl, (unsigned int *)(regs->tpc+16)); -+ err |= get_user(nop, (unsigned int *)(regs->tpc+20)); -+ -+ if (err) -+ break; -+ -+ if ((sethi1 & 0xFFC00000U) == 
0x03000000U && -+ (sethi2 & 0xFFC00000U) == 0x0B000000U && -+ sllx == 0x83287020 && -+ (or & 0xFFFFE000U) == 0x8A116000U && -+ jmpl == 0x81C04005U && -+ nop == 0x01000000U) -+ { -+ unsigned long addr; -+ -+ regs->u_regs[UREG_G1] = (sethi1 & 0x003FFFFFU) << 10; -+ regs->u_regs[UREG_G1] <<= 32; -+ regs->u_regs[UREG_G5] = ((sethi2 & 0x003FFFFFU) << 10) | (or & 0x3FFU); -+ addr = regs->u_regs[UREG_G1] + regs->u_regs[UREG_G5]; -+ regs->tpc = addr; -+ regs->tnpc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: unpatched PLT emulation step 1 */ -+ unsigned int sethi, ba, nop; -+ -+ err = get_user(sethi, (unsigned int *)regs->tpc); -+ err |= get_user(ba, (unsigned int *)(regs->tpc+4)); -+ err |= get_user(nop, (unsigned int *)(regs->tpc+8)); -+ -+ if (err) -+ break; -+ -+ if ((sethi & 0xFFC00000U) == 0x03000000U && -+ ((ba & 0xFFC00000U) == 0x30800000U || (ba & 0xFFF80000U) == 0x30680000U) && -+ nop == 0x01000000U) -+ { -+ unsigned long addr; -+ unsigned int save, call; -+ -+ if ((ba & 0xFFC00000U) == 0x30800000U) -+ addr = regs->tpc + 4 + ((((ba | 0xFFFFFFFFFFC00000UL) ^ 0x00200000UL) + 0x00200000UL) << 2); -+ else -+ addr = regs->tpc + 4 + ((((ba | 0xFFFFFFFFFFF80000UL) ^ 0x00040000UL) + 0x00040000UL) << 2); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ addr &= 0xFFFFFFFFUL; -+ -+ err = get_user(save, (unsigned int *)addr); -+ err |= get_user(call, (unsigned int *)(addr+4)); -+ err |= get_user(nop, (unsigned int *)(addr+8)); -+ if (err) -+ break; -+ -+#ifdef CONFIG_PAX_DLRESOLVE -+ if (save == 0x9DE3BFA8U && -+ (call & 0xC0000000U) == 0x40000000U && -+ nop == 0x01000000U) -+ { -+ struct vm_area_struct *vma; -+ unsigned long call_dl_resolve; -+ -+ down_read(&current->mm->mmap_sem); -+ call_dl_resolve = current->mm->call_dl_resolve; -+ up_read(&current->mm->mmap_sem); -+ if (likely(call_dl_resolve)) -+ goto emulate; -+ -+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); -+ -+ down_write(&current->mm->mmap_sem); -+ if (current->mm->call_dl_resolve) { -+ call_dl_resolve = current->mm->call_dl_resolve; -+ up_write(&current->mm->mmap_sem); -+ if (vma) -+ kmem_cache_free(vm_area_cachep, vma); -+ goto emulate; -+ } -+ -+ call_dl_resolve = get_unmapped_area(NULL, 0UL, PAGE_SIZE, 0UL, MAP_PRIVATE); -+ if (!vma || (call_dl_resolve & ~PAGE_MASK)) { -+ up_write(&current->mm->mmap_sem); -+ if (vma) -+ kmem_cache_free(vm_area_cachep, vma); -+ return 1; -+ } -+ -+ if (pax_insert_vma(vma, call_dl_resolve)) { -+ up_write(&current->mm->mmap_sem); -+ kmem_cache_free(vm_area_cachep, vma); -+ return 1; -+ } -+ -+ current->mm->call_dl_resolve = call_dl_resolve; -+ up_write(&current->mm->mmap_sem); -+ -+emulate: -+ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; -+ regs->tpc = call_dl_resolve; -+ regs->tnpc = addr+4; -+ return 3; -+ } -+#endif -+ -+ /* PaX: glibc 2.4+ generates sethi/jmpl instead of save/call */ -+ if ((save & 0xFFC00000U) == 0x05000000U && -+ (call & 0xFFFFE000U) == 0x85C0A000U && -+ nop == 0x01000000U) -+ { -+ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; -+ regs->u_regs[UREG_G2] = addr + 4; -+ addr = (save & 0x003FFFFFU) << 10; -+ addr += (((call | 0xFFFFFFFFFFFFE000UL) ^ 0x00001000UL) + 0x00001000UL); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ addr &= 0xFFFFFFFFUL; -+ -+ regs->tpc = addr; -+ regs->tnpc = addr+4; -+ return 3; -+ } -+ } -+ } while (0); -+ -+#ifdef CONFIG_PAX_DLRESOLVE -+ do { /* PaX: unpatched PLT emulation step 2 */ -+ unsigned int save, call, nop; -+ -+ err = get_user(save, (unsigned int *)(regs->tpc-4)); -+ err |= get_user(call, (unsigned int *)regs->tpc); -+ err |= get_user(nop, (unsigned int 
*)(regs->tpc+4)); -+ if (err) -+ break; -+ -+ if (save == 0x9DE3BFA8U && -+ (call & 0xC0000000U) == 0x40000000U && -+ nop == 0x01000000U) -+ { -+ unsigned long dl_resolve = regs->tpc + ((((call | 0xFFFFFFFFC0000000UL) ^ 0x20000000UL) + 0x20000000UL) << 2); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ dl_resolve &= 0xFFFFFFFFUL; -+ -+ regs->u_regs[UREG_RETPC] = regs->tpc; -+ regs->tpc = dl_resolve; -+ regs->tnpc = dl_resolve+4; -+ return 3; -+ } -+ } while (0); -+#endif -+ -+ do { /* PaX: patched PLT emulation #7, must be AFTER the unpatched PLT emulation */ -+ unsigned int sethi, ba, nop; -+ -+ err = get_user(sethi, (unsigned int *)regs->tpc); -+ err |= get_user(ba, (unsigned int *)(regs->tpc+4)); -+ err |= get_user(nop, (unsigned int *)(regs->tpc+8)); -+ -+ if (err) -+ break; -+ -+ if ((sethi & 0xFFC00000U) == 0x03000000U && -+ (ba & 0xFFF00000U) == 0x30600000U && -+ nop == 0x01000000U) -+ { -+ unsigned long addr; -+ -+ addr = (sethi & 0x003FFFFFU) << 10; -+ regs->u_regs[UREG_G1] = addr; -+ addr = regs->tpc + ((((ba | 0xFFFFFFFFFFF80000UL) ^ 0x00040000UL) + 0x00040000UL) << 2); -+ -+ if (test_thread_flag(TIF_32BIT)) -+ addr &= 0xFFFFFFFFUL; -+ -+ regs->tpc = addr; -+ regs->tnpc = addr+4; -+ return 2; -+ } -+ } while (0); -+ -+#endif -+ -+ return 1; -+} -+ -+void pax_report_insns(void *pc, void *sp) -+{ -+ unsigned long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 5; i++) { -+ unsigned int c; -+ if (get_user(c, (unsigned int *)pc+i)) -+ printk(KERN_CONT "???????? "); -+ else -+ printk(KERN_CONT "%08x ", c); -+ } -+ printk("\n"); -+} -+#endif -+ - asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) - { - struct mm_struct *mm = current->mm; -@@ -315,6 +728,29 @@ asmlinkage void __kprobes do_sparc64_fau - if (!vma) - goto bad_area; - -+#ifdef CONFIG_PAX_PAGEEXEC -+ /* PaX: detect ITLB misses on non-exec pages */ -+ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && vma->vm_start <= address && -+ !(vma->vm_flags & VM_EXEC) && (fault_code & FAULT_CODE_ITLB)) -+ { -+ if (address != regs->tpc) -+ goto good_area; -+ -+ up_read(&mm->mmap_sem); -+ switch (pax_handle_fetch_fault(regs)) { -+ -+#ifdef CONFIG_PAX_EMUPLT -+ case 2: -+ case 3: -+ return; -+#endif -+ -+ } -+ pax_report_fault(regs, (void *)regs->tpc, (void *)(regs->u_regs[UREG_FP] + STACK_BIAS)); -+ do_group_exit(SIGKILL); -+ } -+#endif -+ - /* Pure DTLB misses do not tell us whether the fault causing - * load/store/atomic was a write or not, it only says that there - * was no match. So in such a case we (carefully) read the -diff -urNp linux-2.6.31.1/arch/sparc/mm/init_32.c linux-2.6.31.1/arch/sparc/mm/init_32.c ---- linux-2.6.31.1/arch/sparc/mm/init_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/mm/init_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -316,6 +316,9 @@ extern void device_scan(void); - pgprot_t PAGE_SHARED __read_mostly; - EXPORT_SYMBOL(PAGE_SHARED); - -+pgprot_t PAGE_SHARED_NOEXEC __read_mostly; -+EXPORT_SYMBOL(PAGE_SHARED_NOEXEC); -+ - void __init paging_init(void) - { - switch(sparc_cpu_model) { -@@ -341,17 +344,17 @@ void __init paging_init(void) - - /* Initialize the protection map with non-constant, MMU dependent values. 
*/ - protection_map[0] = PAGE_NONE; -- protection_map[1] = PAGE_READONLY; -- protection_map[2] = PAGE_COPY; -- protection_map[3] = PAGE_COPY; -+ protection_map[1] = PAGE_READONLY_NOEXEC; -+ protection_map[2] = PAGE_COPY_NOEXEC; -+ protection_map[3] = PAGE_COPY_NOEXEC; - protection_map[4] = PAGE_READONLY; - protection_map[5] = PAGE_READONLY; - protection_map[6] = PAGE_COPY; - protection_map[7] = PAGE_COPY; - protection_map[8] = PAGE_NONE; -- protection_map[9] = PAGE_READONLY; -- protection_map[10] = PAGE_SHARED; -- protection_map[11] = PAGE_SHARED; -+ protection_map[9] = PAGE_READONLY_NOEXEC; -+ protection_map[10] = PAGE_SHARED_NOEXEC; -+ protection_map[11] = PAGE_SHARED_NOEXEC; - protection_map[12] = PAGE_READONLY; - protection_map[13] = PAGE_READONLY; - protection_map[14] = PAGE_SHARED; -diff -urNp linux-2.6.31.1/arch/sparc/mm/Makefile linux-2.6.31.1/arch/sparc/mm/Makefile ---- linux-2.6.31.1/arch/sparc/mm/Makefile 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/mm/Makefile 2009-10-01 20:12:42.000000000 -0400 -@@ -2,7 +2,7 @@ - # - - asflags-y := -ansi --ccflags-y := -Werror -+#ccflags-y := -Werror - - obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o - obj-y += fault_$(BITS).o -diff -urNp linux-2.6.31.1/arch/sparc/mm/srmmu.c linux-2.6.31.1/arch/sparc/mm/srmmu.c ---- linux-2.6.31.1/arch/sparc/mm/srmmu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/sparc/mm/srmmu.c 2009-10-01 20:12:42.000000000 -0400 -@@ -2149,6 +2149,13 @@ void __init ld_mmu_srmmu(void) - PAGE_SHARED = pgprot_val(SRMMU_PAGE_SHARED); - BTFIXUPSET_INT(page_copy, pgprot_val(SRMMU_PAGE_COPY)); - BTFIXUPSET_INT(page_readonly, pgprot_val(SRMMU_PAGE_RDONLY)); -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ PAGE_SHARED_NOEXEC = pgprot_val(SRMMU_PAGE_SHARED_NOEXEC); -+ BTFIXUPSET_INT(page_copy_noexec, pgprot_val(SRMMU_PAGE_COPY_NOEXEC)); -+ BTFIXUPSET_INT(page_readonly_noexec, pgprot_val(SRMMU_PAGE_RDONLY_NOEXEC)); -+#endif -+ - BTFIXUPSET_INT(page_kernel, pgprot_val(SRMMU_PAGE_KERNEL)); - page_kernel = pgprot_val(SRMMU_PAGE_KERNEL); - -diff -urNp linux-2.6.31.1/arch/um/include/asm/kmap_types.h linux-2.6.31.1/arch/um/include/asm/kmap_types.h ---- linux-2.6.31.1/arch/um/include/asm/kmap_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/um/include/asm/kmap_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -23,6 +23,7 @@ enum km_type { - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, -+ KM_CLEARPAGE, - KM_TYPE_NR - }; - -diff -urNp linux-2.6.31.1/arch/um/include/asm/page.h linux-2.6.31.1/arch/um/include/asm/page.h ---- linux-2.6.31.1/arch/um/include/asm/page.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/um/include/asm/page.h 2009-10-01 20:12:42.000000000 -0400 -@@ -14,6 +14,9 @@ - #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) - #define PAGE_MASK (~(PAGE_SIZE-1)) - -+#define ktla_ktva(addr) (addr) -+#define ktva_ktla(addr) (addr) -+ - #ifndef __ASSEMBLY__ - - struct page; -diff -urNp linux-2.6.31.1/arch/um/sys-i386/syscalls.c linux-2.6.31.1/arch/um/sys-i386/syscalls.c ---- linux-2.6.31.1/arch/um/sys-i386/syscalls.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/um/sys-i386/syscalls.c 2009-10-01 20:12:42.000000000 -0400 -@@ -11,6 +11,21 @@ - #include "asm/uaccess.h" - #include "asm/unistd.h" - -+int i386_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) -+{ -+ unsigned long pax_task_size = TASK_SIZE; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif -+ -+ if (len > pax_task_size || 
addr > pax_task_size - len) -+ return -EINVAL; -+ -+ return 0; -+} -+ - /* - * Perform the select(nd, in, out, ex, tv) and mmap() system - * calls. Linux/i386 didn't use to be able to handle more than -diff -urNp linux-2.6.31.1/arch/x86/boot/bitops.h linux-2.6.31.1/arch/x86/boot/bitops.h ---- linux-2.6.31.1/arch/x86/boot/bitops.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/bitops.h 2009-10-01 20:12:42.000000000 -0400 -@@ -26,7 +26,7 @@ static inline int variable_test_bit(int - u8 v; - const u32 *p = (const u32 *)addr; - -- asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); -+ asm volatile("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); - return v; - } - -@@ -37,7 +37,7 @@ static inline int variable_test_bit(int - - static inline void set_bit(int nr, void *addr) - { -- asm("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr)); -+ asm volatile("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr)); - } - - #endif /* BOOT_BITOPS_H */ -diff -urNp linux-2.6.31.1/arch/x86/boot/boot.h linux-2.6.31.1/arch/x86/boot/boot.h ---- linux-2.6.31.1/arch/x86/boot/boot.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/boot.h 2009-10-01 20:12:42.000000000 -0400 -@@ -82,7 +82,7 @@ static inline void io_delay(void) - static inline u16 ds(void) - { - u16 seg; -- asm("movw %%ds,%0" : "=rm" (seg)); -+ asm volatile("movw %%ds,%0" : "=rm" (seg)); - return seg; - } - -@@ -178,7 +178,7 @@ static inline void wrgs32(u32 v, addr_t - static inline int memcmp(const void *s1, const void *s2, size_t len) - { - u8 diff; -- asm("repe; cmpsb; setnz %0" -+ asm volatile("repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); - return diff; - } -diff -urNp linux-2.6.31.1/arch/x86/boot/compressed/head_32.S linux-2.6.31.1/arch/x86/boot/compressed/head_32.S ---- linux-2.6.31.1/arch/x86/boot/compressed/head_32.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/compressed/head_32.S 2009-10-01 20:12:42.000000000 -0400 -@@ -75,7 +75,7 @@ ENTRY(startup_32) - notl %eax - andl %eax, %ebx - #else -- movl $LOAD_PHYSICAL_ADDR, %ebx -+ movl $____LOAD_PHYSICAL_ADDR, %ebx - #endif - - /* Target address to relocate to for decompression */ -@@ -148,7 +148,7 @@ relocated: - * and where it was actually loaded. - */ - movl %ebp, %ebx -- subl $LOAD_PHYSICAL_ADDR, %ebx -+ subl $____LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ - /* - * Process relocations. 
-@@ -156,8 +156,7 @@ relocated: - - 1: subl $4, %edi - movl (%edi), %ecx -- testl %ecx, %ecx -- jz 2f -+ jecxz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b - 2: -diff -urNp linux-2.6.31.1/arch/x86/boot/compressed/head_64.S linux-2.6.31.1/arch/x86/boot/compressed/head_64.S ---- linux-2.6.31.1/arch/x86/boot/compressed/head_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/compressed/head_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -90,7 +90,7 @@ ENTRY(startup_32) - notl %eax - andl %eax, %ebx - #else -- movl $LOAD_PHYSICAL_ADDR, %ebx -+ movl $____LOAD_PHYSICAL_ADDR, %ebx - #endif - - /* Target address to relocate to for decompression */ -@@ -233,7 +233,7 @@ ENTRY(startup_64) - notq %rax - andq %rax, %rbp - #else -- movq $LOAD_PHYSICAL_ADDR, %rbp -+ movq $____LOAD_PHYSICAL_ADDR, %rbp - #endif - - /* Target address to relocate to for decompression */ -diff -urNp linux-2.6.31.1/arch/x86/boot/compressed/misc.c linux-2.6.31.1/arch/x86/boot/compressed/misc.c ---- linux-2.6.31.1/arch/x86/boot/compressed/misc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/compressed/misc.c 2009-10-01 20:12:42.000000000 -0400 -@@ -288,7 +288,7 @@ static void parse_elf(void *output) - case PT_LOAD: - #ifdef CONFIG_RELOCATABLE - dest = output; -- dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR); -+ dest += (phdr->p_paddr - ____LOAD_PHYSICAL_ADDR); - #else - dest = (void *)(phdr->p_paddr); - #endif -@@ -335,7 +335,7 @@ asmlinkage void decompress_kernel(void * - error("Destination address too large"); - #endif - #ifndef CONFIG_RELOCATABLE -- if ((unsigned long)output != LOAD_PHYSICAL_ADDR) -+ if ((unsigned long)output != ____LOAD_PHYSICAL_ADDR) - error("Wrong destination address"); - #endif - -diff -urNp linux-2.6.31.1/arch/x86/boot/compressed/mkpiggy.c linux-2.6.31.1/arch/x86/boot/compressed/mkpiggy.c ---- linux-2.6.31.1/arch/x86/boot/compressed/mkpiggy.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/compressed/mkpiggy.c 2009-10-01 20:12:42.000000000 -0400 -@@ -74,7 +74,7 @@ int main(int argc, char *argv[]) - - offs = (olen > ilen) ? 
olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ -- offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ -+ offs += 64*1024; /* Add 64K bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ - - printf(".section ".rodata.compressed","a",@progbits\n"); -diff -urNp linux-2.6.31.1/arch/x86/boot/compressed/relocs.c linux-2.6.31.1/arch/x86/boot/compressed/relocs.c ---- linux-2.6.31.1/arch/x86/boot/compressed/relocs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/compressed/relocs.c 2009-10-01 20:12:42.000000000 -0400 -@@ -10,8 +10,11 @@ - #define USE_BSD - #include <endian.h> - -+#include "../../../../include/linux/autoconf.h" -+ - #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - static Elf32_Ehdr ehdr; -+static Elf32_Phdr *phdr; - static unsigned long reloc_count, reloc_idx; - static unsigned long *relocs; - -@@ -37,7 +40,7 @@ static const char* safe_abs_relocs[] = { - - static int is_safe_abs_reloc(const char* sym_name) - { -- int i; -+ unsigned int i; - - for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { - if (!strcmp(sym_name, safe_abs_relocs[i])) -@@ -245,9 +248,39 @@ static void read_ehdr(FILE *fp) - } - } - -+static void read_phdrs(FILE *fp) -+{ -+ unsigned int i; -+ -+ phdr = calloc(ehdr.e_phnum, sizeof(Elf32_Phdr)); -+ if (!phdr) { -+ die("Unable to allocate %d program headers\n", -+ ehdr.e_phnum); -+ } -+ if (fseek(fp, ehdr.e_phoff, SEEK_SET) < 0) { -+ die("Seek to %d failed: %s\n", -+ ehdr.e_phoff, strerror(errno)); -+ } -+ if (fread(phdr, sizeof(*phdr), ehdr.e_phnum, fp) != ehdr.e_phnum) { -+ die("Cannot read ELF program headers: %s\n", -+ strerror(errno)); -+ } -+ for(i = 0; i < ehdr.e_phnum; i++) { -+ phdr[i].p_type = elf32_to_cpu(phdr[i].p_type); -+ phdr[i].p_offset = elf32_to_cpu(phdr[i].p_offset); -+ phdr[i].p_vaddr = elf32_to_cpu(phdr[i].p_vaddr); -+ phdr[i].p_paddr = elf32_to_cpu(phdr[i].p_paddr); -+ phdr[i].p_filesz = elf32_to_cpu(phdr[i].p_filesz); -+ phdr[i].p_memsz = elf32_to_cpu(phdr[i].p_memsz); -+ phdr[i].p_flags = elf32_to_cpu(phdr[i].p_flags); -+ phdr[i].p_align = elf32_to_cpu(phdr[i].p_align); -+ } -+ -+} -+ - static void read_shdrs(FILE *fp) - { -- int i; -+ unsigned int i; - Elf32_Shdr shdr; - - secs = calloc(ehdr.e_shnum, sizeof(struct section)); -@@ -282,7 +315,7 @@ static void read_shdrs(FILE *fp) - - static void read_strtabs(FILE *fp) - { -- int i; -+ unsigned int i; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_STRTAB) { -@@ -307,7 +340,7 @@ static void read_strtabs(FILE *fp) - - static void read_symtabs(FILE *fp) - { -- int i,j; -+ unsigned int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_SYMTAB) { -@@ -340,7 +373,9 @@ static void read_symtabs(FILE *fp) - - static void read_relocs(FILE *fp) - { -- int i,j; -+ unsigned int i,j; -+ uint32_t base; -+ - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_REL) { -@@ -360,9 +395,18 @@ static void read_relocs(FILE *fp) - die("Cannot read symbol table: %s\n", - strerror(errno)); - } -+ base = 0; -+ for (j = 0; j < ehdr.e_phnum; j++) { -+ if (phdr[j].p_type != PT_LOAD ) -+ continue; -+ if (secs[sec->shdr.sh_info].shdr.sh_offset < phdr[j].p_offset || secs[sec->shdr.sh_info].shdr.sh_offset >= phdr[j].p_offset + phdr[j].p_filesz) -+ continue; -+ base = CONFIG_PAGE_OFFSET + phdr[j].p_paddr - phdr[j].p_vaddr; -+ break; -+ } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); 
j++) { - Elf32_Rel *rel = &sec->reltab[j]; -- rel->r_offset = elf32_to_cpu(rel->r_offset); -+ rel->r_offset = elf32_to_cpu(rel->r_offset) + base; - rel->r_info = elf32_to_cpu(rel->r_info); - } - } -@@ -371,14 +415,14 @@ static void read_relocs(FILE *fp) - - static void print_absolute_symbols(void) - { -- int i; -+ unsigned int i; - printf("Absolute symbols\n"); - printf(" Num: Value Size Type Bind Visibility Name\n"); - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - char *sym_strtab; - Elf32_Sym *sh_symtab; -- int j; -+ unsigned int j; - - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; -@@ -406,14 +450,14 @@ static void print_absolute_symbols(void) - - static void print_absolute_relocs(void) - { -- int i, printed = 0; -+ unsigned int i, printed = 0; - - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - struct section *sec_applies, *sec_symtab; - char *sym_strtab; - Elf32_Sym *sh_symtab; -- int j; -+ unsigned int j; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } -@@ -474,13 +518,13 @@ static void print_absolute_relocs(void) - - static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) - { -- int i; -+ unsigned int i; - /* Walk through the relocations */ - for (i = 0; i < ehdr.e_shnum; i++) { - char *sym_strtab; - Elf32_Sym *sh_symtab; - struct section *sec_applies, *sec_symtab; -- int j; -+ unsigned int j; - struct section *sec = &secs[i]; - - if (sec->shdr.sh_type != SHT_REL) { -@@ -504,6 +548,19 @@ static void walk_relocs(void (*visit)(El - if (sym->st_shndx == SHN_ABS) { - continue; - } -+ /* Don't relocate actual per-cpu variables, they are absolute indices, not addresses */ -+ if (!strcmp(sec_name(sym->st_shndx), ".data.percpu") && strcmp(sym_name(sym_strtab, sym), "__per_cpu_load")) -+ continue; -+ -+#if defined(CONFIG_PAX_KERNEXEC) && defined(CONFIG_X86_32) -+ /* Don't relocate actual code, they are relocated implicitly by the base address of KERNEL_CS */ -+ if (!strcmp(sec_name(sym->st_shndx), ".init.text")) -+ continue; -+ if (!strcmp(sec_name(sym->st_shndx), ".exit.text")) -+ continue; -+ if (!strcmp(sec_name(sym->st_shndx), ".text") && strcmp(sym_name(sym_strtab, sym), "__LOAD_PHYSICAL_ADDR")) -+ continue; -+#endif - if (r_type == R_386_NONE || r_type == R_386_PC32) { - /* - * NONE can be ignored and and PC relative -@@ -541,7 +598,7 @@ static int cmp_relocs(const void *va, co - - static void emit_relocs(int as_text) - { -- int i; -+ unsigned int i; - /* Count how many relocations I have and allocate space for them. 
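
The read_phdrs() helper added above mirrors the existing read_shdrs(): it slurps the ELF program header table so that read_relocs() can map a relocation section's file offset back to the PT_LOAD segment containing it. A stripped-down sketch of that header-table read, assuming a little-endian ELF32 input (error reporting and the elf32_to_cpu byte-swapping trimmed for brevity):

#include <elf.h>
#include <stdio.h>
#include <stdlib.h>

static Elf32_Phdr *load_phdrs(FILE *fp, const Elf32_Ehdr *eh)
{
        Elf32_Phdr *ph = calloc(eh->e_phnum, sizeof(*ph));

        if (!ph)
                return NULL;
        if (fseek(fp, eh->e_phoff, SEEK_SET) < 0 ||
            fread(ph, sizeof(*ph), eh->e_phnum, fp) != eh->e_phnum) {
                free(ph);
                return NULL;     /* truncated or unreadable table */
        }
        return ph;
}

The interesting consumer is the loop in read_relocs(): for each SHT_REL section it finds the PT_LOAD segment whose file range covers the section and adds CONFIG_PAGE_OFFSET + p_paddr - p_vaddr to every r_offset, so later passes see each relocation at its load-based address rather than its link-time virtual address.
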
*/ - reloc_count = 0; - walk_relocs(count_reloc); -@@ -634,6 +691,7 @@ int main(int argc, char **argv) - fname, strerror(errno)); - } - read_ehdr(fp); -+ read_phdrs(fp); - read_shdrs(fp); - read_strtabs(fp); - read_symtabs(fp); -diff -urNp linux-2.6.31.1/arch/x86/boot/cpucheck.c linux-2.6.31.1/arch/x86/boot/cpucheck.c ---- linux-2.6.31.1/arch/x86/boot/cpucheck.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/cpucheck.c 2009-10-01 20:12:42.000000000 -0400 -@@ -74,7 +74,7 @@ static int has_fpu(void) - u16 fcw = -1, fsw = -1; - u32 cr0; - -- asm("movl %%cr0,%0" : "=r" (cr0)); -+ asm volatile("movl %%cr0,%0" : "=r" (cr0)); - if (cr0 & (X86_CR0_EM|X86_CR0_TS)) { - cr0 &= ~(X86_CR0_EM|X86_CR0_TS); - asm volatile("movl %0,%%cr0" : : "r" (cr0)); -@@ -90,7 +90,7 @@ static int has_eflag(u32 mask) - { - u32 f0, f1; - -- asm("pushfl ; " -+ asm volatile("pushfl ; " - "pushfl ; " - "popl %0 ; " - "movl %0,%1 ; " -@@ -115,7 +115,7 @@ static void get_flags(void) - set_bit(X86_FEATURE_FPU, cpu.flags); - - if (has_eflag(X86_EFLAGS_ID)) { -- asm("cpuid" -+ asm volatile("cpuid" - : "=a" (max_intel_level), - "=b" (cpu_vendor[0]), - "=d" (cpu_vendor[1]), -@@ -124,7 +124,7 @@ static void get_flags(void) - - if (max_intel_level >= 0x00000001 && - max_intel_level <= 0x0000ffff) { -- asm("cpuid" -+ asm volatile("cpuid" - : "=a" (tfms), - "=c" (cpu.flags[4]), - "=d" (cpu.flags[0]) -@@ -136,7 +136,7 @@ static void get_flags(void) - cpu.model += ((tfms >> 16) & 0xf) << 4; - } - -- asm("cpuid" -+ asm volatile("cpuid" - : "=a" (max_amd_level) - : "a" (0x80000000) - : "ebx", "ecx", "edx"); -@@ -144,7 +144,7 @@ static void get_flags(void) - if (max_amd_level >= 0x80000001 && - max_amd_level <= 0x8000ffff) { - u32 eax = 0x80000001; -- asm("cpuid" -+ asm volatile("cpuid" - : "+a" (eax), - "=c" (cpu.flags[6]), - "=d" (cpu.flags[1]) -@@ -203,9 +203,9 @@ int check_cpu(int *cpu_level_ptr, int *r - u32 ecx = MSR_K7_HWCR; - u32 eax, edx; - -- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); -+ asm volatile("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - eax &= ~(1 << 15); -- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); -+ asm volatile("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); - - get_flags(); /* Make sure it really did something */ - err = check_flags(); -@@ -218,9 +218,9 @@ int check_cpu(int *cpu_level_ptr, int *r - u32 ecx = MSR_VIA_FCR; - u32 eax, edx; - -- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); -+ asm volatile("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - eax |= (1<<1)|(1<<7); -- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); -+ asm volatile("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); - - set_bit(X86_FEATURE_CX8, cpu.flags); - err = check_flags(); -@@ -231,12 +231,12 @@ int check_cpu(int *cpu_level_ptr, int *r - u32 eax, edx; - u32 level = 1; - -- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); -- asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx)); -- asm("cpuid" -+ asm volatile("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); -+ asm volatile("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx)); -+ asm volatile("cpuid" - : "+a" (level), "=d" (cpu.flags[0]) - : : "ecx", "ebx"); -- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); -+ asm volatile("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); - - err = check_flags(); - } -diff -urNp linux-2.6.31.1/arch/x86/boot/header.S linux-2.6.31.1/arch/x86/boot/header.S ---- linux-2.6.31.1/arch/x86/boot/header.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/header.S 2009-10-01 20:12:42.000000000 -0400 -@@ -224,7 +224,7 @@ 
setup_data: .quad 0 # 64-bit physical - # single linked list of - # struct setup_data - --pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr -+pref_address: .quad ____LOAD_PHYSICAL_ADDR # preferred load addr - - #define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) - #define VO_INIT_SIZE (VO__end - VO__text) -diff -urNp linux-2.6.31.1/arch/x86/boot/video-vesa.c linux-2.6.31.1/arch/x86/boot/video-vesa.c ---- linux-2.6.31.1/arch/x86/boot/video-vesa.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/boot/video-vesa.c 2009-10-01 20:12:42.000000000 -0400 -@@ -205,6 +205,7 @@ static void vesa_store_pm_info(void) - - boot_params.screen_info.vesapm_seg = oreg.es; - boot_params.screen_info.vesapm_off = oreg.di; -+ boot_params.screen_info.vesapm_size = oreg.cx; - } - - /* -diff -urNp linux-2.6.31.1/arch/x86/ia32/ia32_signal.c linux-2.6.31.1/arch/x86/ia32/ia32_signal.c ---- linux-2.6.31.1/arch/x86/ia32/ia32_signal.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/ia32/ia32_signal.c 2009-10-01 20:12:42.000000000 -0400 -@@ -403,7 +403,7 @@ static void __user *get_sigframe(struct - sp -= frame_size; - /* Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. */ -- sp = ((sp + 4) & -16ul) - 4; -+ sp = ((sp - 12) & -16ul) - 4; - return (void __user *) sp; - } - -@@ -503,7 +503,7 @@ int ia32_setup_rt_frame(int sig, struct - 0xb8, - __NR_ia32_rt_sigreturn, - 0x80cd, -- 0, -+ 0 - }; - - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); -diff -urNp linux-2.6.31.1/arch/x86/include/asm/alternative.h linux-2.6.31.1/arch/x86/include/asm/alternative.h ---- linux-2.6.31.1/arch/x86/include/asm/alternative.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/alternative.h 2009-10-01 20:12:42.000000000 -0400 -@@ -87,7 +87,7 @@ const unsigned char *const *find_nop_tab - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ -- ".section .altinstr_replacement, "ax"\n" \ -+ ".section .altinstr_replacement, "a"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/apm.h linux-2.6.31.1/arch/x86/include/asm/apm.h ---- linux-2.6.31.1/arch/x86/include/asm/apm.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/apm.h 2009-10-01 20:12:42.000000000 -0400 -@@ -34,7 +34,7 @@ static inline void apm_bios_call_asm(u32 - __asm__ __volatile__(APM_DO_ZERO_SEGS - "pushl %%edi\n\t" - "pushl %%ebp\n\t" -- "lcall *%%cs:apm_bios_entry\n\t" -+ "lcall *%%ss:apm_bios_entry\n\t" - "setc %%al\n\t" - "popl %%ebp\n\t" - "popl %%edi\n\t" -@@ -58,7 +58,7 @@ static inline u8 apm_bios_call_simple_as - __asm__ __volatile__(APM_DO_ZERO_SEGS - "pushl %%edi\n\t" - "pushl %%ebp\n\t" -- "lcall *%%cs:apm_bios_entry\n\t" -+ "lcall *%%ss:apm_bios_entry\n\t" - "setc %%bl\n\t" - "popl %%ebp\n\t" - "popl %%edi\n\t" -diff -urNp linux-2.6.31.1/arch/x86/include/asm/atomic_32.h linux-2.6.31.1/arch/x86/include/asm/atomic_32.h ---- linux-2.6.31.1/arch/x86/include/asm/atomic_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/atomic_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -25,6 +25,17 @@ static inline int atomic_read(const atom - } - - /** -+ * atomic_read_unchecked - read atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically reads the value of @v. 
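
From here on the patch forks each x86 atomic into a checked default and an *_unchecked variant: under CONFIG_PAX_REFCOUNT the normal atomic_add() traps on signed overflow and undoes the update, while atomic_unchecked_t keeps the old wrapping behaviour for counters that legitimately overflow (statistics and the like). In portable C the checked half of that contract looks roughly like the sketch below; the real code is LOCK-prefixed inline asm, and the names here are hypothetical:

#include <stdatomic.h>
#include <stdlib.h>

/* Add i to *v, but refuse to let a refcount wrap: on signed overflow
 * the update is undone, analogous to the patch's
 * "jno 0f; <undo>; into" sequence. */
static void refcount_add_checked(int i, _Atomic int *v)
{
        int old = atomic_fetch_add(v, i);
        int sum;

        if (__builtin_add_overflow(old, i, &sum)) {
                atomic_fetch_sub(v, i);   /* roll the counter back */
                abort();                  /* the kernel raises #OF instead */
        }
}

The rollback is the point of the exercise: the counter is put back and the offending task can be killed, instead of the count wrapping to a small value and opening the door to a use-after-free.
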
-+ */ -+static inline int atomic_read_unchecked(const atomic_unchecked_t *v) -+{ -+ return v->counter; -+} -+ -+/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value -@@ -37,6 +48,18 @@ static inline void atomic_set(atomic_t * - } - - /** -+ * atomic_set_unchecked - set atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * @i: required value -+ * -+ * Atomically sets the value of @v to @i. -+ */ -+static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i) -+{ -+ v->counter = i; -+} -+ -+/** - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t -@@ -45,7 +68,29 @@ static inline void atomic_set(atomic_t * - */ - static inline void atomic_add(int i, atomic_t *v) - { -- asm volatile(LOCK_PREFIX "addl %1,%0" -+ asm volatile(LOCK_PREFIX "addl %1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "subl %1,%0\n" -+ "into\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ : "+m" (v->counter) -+ : "ir" (i)); -+} -+ -+/** -+ * atomic_add_unchecked - add integer to atomic variable -+ * @i: integer value to add -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically adds @i to @v. -+ */ -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ asm volatile(LOCK_PREFIX "addl %1,%0\n" - : "+m" (v->counter) - : "ir" (i)); - } -@@ -59,7 +104,29 @@ static inline void atomic_add(int i, ato - */ - static inline void atomic_sub(int i, atomic_t *v) - { -- asm volatile(LOCK_PREFIX "subl %1,%0" -+ asm volatile(LOCK_PREFIX "subl %1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "addl %1,%0\n" -+ "into\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ : "+m" (v->counter) -+ : "ir" (i)); -+} -+ -+/** -+ * atomic_sub_unchecked - subtract integer from atomic variable -+ * @i: integer value to subtract -+ * @v: pointer of type atomic_t -+ * -+ * Atomically subtracts @i from @v. -+ */ -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ asm volatile(LOCK_PREFIX "subl %1,%0\n" - : "+m" (v->counter) - : "ir" (i)); - } -@@ -77,7 +144,16 @@ static inline int atomic_sub_and_test(in - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" -+ asm volatile(LOCK_PREFIX "subl %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "addl %2,%0\n" -+ "into\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ "sete %1\n" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -@@ -91,7 +167,30 @@ static inline int atomic_sub_and_test(in - */ - static inline void atomic_inc(atomic_t *v) - { -- asm volatile(LOCK_PREFIX "incl %0" -+ asm volatile(LOCK_PREFIX "incl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "into\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "decl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ : "+m" (v->counter)); -+} -+ -+/** -+ * atomic_inc_unchecked - increment atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically increments @v by 1. 
-+ */ -+static inline void atomic_inc_unchecked(atomic_unchecked_t *v) -+{ -+ asm volatile(LOCK_PREFIX "incl %0\n" - : "+m" (v->counter)); - } - -@@ -103,7 +202,18 @@ static inline void atomic_inc(atomic_t * - */ - static inline void atomic_dec(atomic_t *v) - { -- asm volatile(LOCK_PREFIX "decl %0" -+ asm volatile(LOCK_PREFIX "decl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "into\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "incl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+m" (v->counter)); - } - -@@ -119,7 +229,19 @@ static inline int atomic_dec_and_test(at - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "decl %0; sete %1" -+ asm volatile(LOCK_PREFIX "decl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "into\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "incl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -@@ -137,7 +259,19 @@ static inline int atomic_inc_and_test(at - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "incl %0; sete %1" -+ asm volatile(LOCK_PREFIX "incl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "into\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "decl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -@@ -156,7 +290,16 @@ static inline int atomic_add_negative(in - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" -+ asm volatile(LOCK_PREFIX "addl %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "subl %2,%0\n" -+ "into\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ "sets %1\n" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -@@ -179,7 +322,15 @@ static inline int atomic_add_return(int - #endif - /* Modern 486+ processor */ - __i = i; -- asm volatile(LOCK_PREFIX "xaddl %0, %1" -+ asm volatile(LOCK_PREFIX "xaddl %0, %1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "movl %0, %1\n" -+ "into\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; -@@ -227,17 +378,28 @@ static inline int atomic_xchg(atomic_t * - */ - static inline int atomic_add_unless(atomic_t *v, int a, int u) - { -- int c, old; -+ int c, old, new; - c = atomic_read(v); - for (;;) { -- if (unlikely(c == (u))) -+ if (unlikely(c == u)) - break; -- old = atomic_cmpxchg((v), c, c + (a)); -+ -+ asm volatile("addl %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "into\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ : "=r" (new) -+ : "0" (c), "ir" (a)); -+ -+ old = atomic_cmpxchg(v, c, new); - if (likely(old == c)) - break; - c = old; - } -- return c != (u); -+ return c != u; - } - - #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) -diff -urNp linux-2.6.31.1/arch/x86/include/asm/atomic_64.h linux-2.6.31.1/arch/x86/include/asm/atomic_64.h ---- linux-2.6.31.1/arch/x86/include/asm/atomic_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/atomic_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -24,6 +24,17 @@ static inline int atomic_read(const atom - } - - /** -+ * atomic_read_unchecked - read atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically reads the value of @v. 
-+ */ -+static inline int atomic_read_unchecked(const atomic_unchecked_t *v) -+{ -+ return v->counter; -+} -+ -+/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value -@@ -36,6 +47,18 @@ static inline void atomic_set(atomic_t * - } - - /** -+ * atomic_set_unchecked - set atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * @i: required value -+ * -+ * Atomically sets the value of @v to @i. -+ */ -+static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i) -+{ -+ v->counter = i; -+} -+ -+/** - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t -@@ -44,7 +67,29 @@ static inline void atomic_set(atomic_t * - */ - static inline void atomic_add(int i, atomic_t *v) - { -- asm volatile(LOCK_PREFIX "addl %1,%0" -+ asm volatile(LOCK_PREFIX "addl %1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "subl %1,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ : "=m" (v->counter) -+ : "ir" (i), "m" (v->counter)); -+} -+ -+/** -+ * atomic_add_unchecked - add integer to atomic variable -+ * @i: integer value to add -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically adds @i to @v. -+ */ -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ asm volatile(LOCK_PREFIX "addl %1,%0\n" - : "=m" (v->counter) - : "ir" (i), "m" (v->counter)); - } -@@ -58,7 +103,29 @@ static inline void atomic_add(int i, ato - */ - static inline void atomic_sub(int i, atomic_t *v) - { -- asm volatile(LOCK_PREFIX "subl %1,%0" -+ asm volatile(LOCK_PREFIX "subl %1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "addl %1,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ : "=m" (v->counter) -+ : "ir" (i), "m" (v->counter)); -+} -+ -+/** -+ * atomic_sub_unchecked - subtract the atomic variable -+ * @i: integer value to subtract -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically subtracts @i from @v. -+ */ -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ asm volatile(LOCK_PREFIX "subl %1,%0\n" - : "=m" (v->counter) - : "ir" (i), "m" (v->counter)); - } -@@ -76,7 +143,16 @@ static inline int atomic_sub_and_test(in - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" -+ asm volatile(LOCK_PREFIX "subl %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "addl %2,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ "sete %1\n" - : "=m" (v->counter), "=qm" (c) - : "ir" (i), "m" (v->counter) : "memory"); - return c; -@@ -90,7 +166,32 @@ static inline int atomic_sub_and_test(in - */ - static inline void atomic_inc(atomic_t *v) - { -- asm volatile(LOCK_PREFIX "incl %0" -+ asm volatile(LOCK_PREFIX "incl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "decl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ : "=m" (v->counter) -+ : "m" (v->counter)); -+} -+ -+/** -+ * atomic_inc_unchecked - increment atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically increments @v by 1. 
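
One detail worth noticing in these atomic_64.h hunks: the 32-bit variants trap with "into", while the 64-bit variants use "jno 0f; int $4". The INTO instruction was removed in 64-bit mode, so the patch open-codes the same #OF (vector 4) trap. A minimal GCC inline-asm sketch of the 64-bit pattern, only meaningful where a #OF handler exists (PaX installs one in the kernel):

static inline void checked_inc(int *v)
{
        asm volatile("lock incl %0\n\t"
                     "jno 1f\n\t"        /* no signed overflow: skip */
                     "int $4\n"          /* raise #OF, vector 4 */
                     "1:"
                     : "+m" (*v));
}

The real patch additionally registers the trap site in the exception table via _ASM_EXTABLE, pointing at a .fixup stub that undoes the increment before control returns.
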
-+ */ -+static inline void atomic_inc_unchecked(atomic_unchecked_t *v) -+{ -+ asm volatile(LOCK_PREFIX "incl %0\n" - : "=m" (v->counter) - : "m" (v->counter)); - } -@@ -103,7 +204,19 @@ static inline void atomic_inc(atomic_t * - */ - static inline void atomic_dec(atomic_t *v) - { -- asm volatile(LOCK_PREFIX "decl %0" -+ asm volatile(LOCK_PREFIX "decl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "incl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "=m" (v->counter) - : "m" (v->counter)); - } -@@ -120,7 +233,20 @@ static inline int atomic_dec_and_test(at - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "decl %0; sete %1" -+ asm volatile(LOCK_PREFIX "decl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "incl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -@@ -138,7 +264,20 @@ static inline int atomic_inc_and_test(at - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "incl %0; sete %1" -+ asm volatile(LOCK_PREFIX "incl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "decl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -@@ -157,7 +296,16 @@ static inline int atomic_add_negative(in - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" -+ asm volatile(LOCK_PREFIX "addl %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "subl %2,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ "sets %1\n" - : "=m" (v->counter), "=qm" (c) - : "ir" (i), "m" (v->counter) : "memory"); - return c; -@@ -173,7 +321,15 @@ static inline int atomic_add_negative(in - static inline int atomic_add_return(int i, atomic_t *v) - { - int __i = i; -- asm volatile(LOCK_PREFIX "xaddl %0, %1" -+ asm volatile(LOCK_PREFIX "xaddl %0, %1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "movl %0, %1\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; -@@ -224,7 +380,15 @@ static inline void atomic64_set(atomic64 - */ - static inline void atomic64_add(long i, atomic64_t *v) - { -- asm volatile(LOCK_PREFIX "addq %1,%0" -+ asm volatile(LOCK_PREFIX "addq %1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "subq %1,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ - : "=m" (v->counter) - : "er" (i), "m" (v->counter)); - } -@@ -238,7 +402,15 @@ static inline void atomic64_add(long i, - */ - static inline void atomic64_sub(long i, atomic64_t *v) - { -- asm volatile(LOCK_PREFIX "subq %1,%0" -+ asm volatile(LOCK_PREFIX "subq %1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "addq %1,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ - : "=m" (v->counter) - : "er" (i), "m" (v->counter)); - } -@@ -256,7 +428,16 @@ static inline int atomic64_sub_and_test( - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" -+ asm volatile(LOCK_PREFIX "subq %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "addq %2,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ "sete %1\n" - : "=m" (v->counter), "=qm" 
(c) - : "er" (i), "m" (v->counter) : "memory"); - return c; -@@ -270,7 +451,19 @@ static inline int atomic64_sub_and_test( - */ - static inline void atomic64_inc(atomic64_t *v) - { -- asm volatile(LOCK_PREFIX "incq %0" -+ asm volatile(LOCK_PREFIX "incq %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "decq %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "=m" (v->counter) - : "m" (v->counter)); - } -@@ -283,7 +476,19 @@ static inline void atomic64_inc(atomic64 - */ - static inline void atomic64_dec(atomic64_t *v) - { -- asm volatile(LOCK_PREFIX "decq %0" -+ asm volatile(LOCK_PREFIX "decq %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "incq %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "=m" (v->counter) - : "m" (v->counter)); - } -@@ -300,7 +505,20 @@ static inline int atomic64_dec_and_test( - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "decq %0; sete %1" -+ asm volatile(LOCK_PREFIX "decq %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "incq %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -@@ -318,7 +536,20 @@ static inline int atomic64_inc_and_test( - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "incq %0; sete %1" -+ asm volatile(LOCK_PREFIX "incq %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ ".pushsection .fixup,"ax"\n" -+ "1: \n" -+ LOCK_PREFIX "decq %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -@@ -337,7 +568,16 @@ static inline int atomic64_add_negative( - { - unsigned char c; - -- asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" -+ asm volatile(LOCK_PREFIX "addq %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ LOCK_PREFIX "subq %2,%0\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ "sets %1\n" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; -@@ -353,7 +593,15 @@ static inline int atomic64_add_negative( - static inline long atomic64_add_return(long i, atomic64_t *v) - { - long __i = i; -- asm volatile(LOCK_PREFIX "xaddq %0, %1;" -+ asm volatile(LOCK_PREFIX "xaddq %0, %1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "movq %0, %1\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; -@@ -398,17 +646,29 @@ static inline long atomic_xchg(atomic_t - */ - static inline int atomic_add_unless(atomic_t *v, int a, int u) - { -- int c, old; -+ int c, old, new; - c = atomic_read(v); - for (;;) { -- if (unlikely(c == (u))) -+ if (unlikely(c == u)) - break; -- old = atomic_cmpxchg((v), c, c + (a)); -+ -+ asm volatile("addl %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ : "=r" (new) -+ : "0" (c), "ir" (a)); -+ -+ old = atomic_cmpxchg(v, c, new); - if (likely(old == c)) - break; - c = old; - } -- return c != (u); -+ return c != u; - } - - #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) -@@ -424,17 +684,29 @@ static inline int atomic_add_unless(atom - */ - static inline int atomic64_add_unless(atomic64_t *v, 
long a, long u) - { -- long c, old; -+ long c, old, new; - c = atomic64_read(v); - for (;;) { -- if (unlikely(c == (u))) -+ if (unlikely(c == u)) - break; -- old = atomic64_cmpxchg((v), c, c + (a)); -+ -+ asm volatile("addq %2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ "jno 0f\n" -+ "int $4\n0:\n" -+ _ASM_EXTABLE(0b, 0b) -+#endif -+ -+ : "=r" (new) -+ : "0" (c), "er" (a)); -+ -+ old = atomic64_cmpxchg((v), c, new); - if (likely(old == c)) - break; - c = old; - } -- return c != (u); -+ return c != u; - } - - /** -diff -urNp linux-2.6.31.1/arch/x86/include/asm/boot.h linux-2.6.31.1/arch/x86/include/asm/boot.h ---- linux-2.6.31.1/arch/x86/include/asm/boot.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/boot.h 2009-10-01 20:12:42.000000000 -0400 -@@ -11,10 +11,15 @@ - #include <asm/pgtable_types.h> - - /* Physical address where kernel should be loaded. */ --#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ -+#define ____LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ - + (CONFIG_PHYSICAL_ALIGN - 1)) \ - & ~(CONFIG_PHYSICAL_ALIGN - 1)) - -+#ifndef __ASSEMBLY__ -+extern unsigned char __LOAD_PHYSICAL_ADDR[]; -+#define LOAD_PHYSICAL_ADDR ((unsigned long)__LOAD_PHYSICAL_ADDR) -+#endif -+ - /* Minimum kernel alignment, as a power of two */ - #ifdef CONFIG_X86_64 - #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT -diff -urNp linux-2.6.31.1/arch/x86/include/asm/cache.h linux-2.6.31.1/arch/x86/include/asm/cache.h ---- linux-2.6.31.1/arch/x86/include/asm/cache.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/cache.h 2009-10-01 20:12:42.000000000 -0400 -@@ -6,6 +6,7 @@ - #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) - - #define __read_mostly __attribute__((__section__(".data.read_mostly"))) -+#define __read_only __attribute__((__section__(".data.read_only"))) - - #ifdef CONFIG_X86_VSMP - /* vSMP Internode cacheline shift */ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/checksum_32.h linux-2.6.31.1/arch/x86/include/asm/checksum_32.h ---- linux-2.6.31.1/arch/x86/include/asm/checksum_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/checksum_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -31,6 +31,14 @@ asmlinkage __wsum csum_partial_copy_gene - int len, __wsum sum, - int *src_err_ptr, int *dst_err_ptr); - -+asmlinkage __wsum csum_partial_copy_generic_to_user(const void *src, void *dst, -+ int len, __wsum sum, -+ int *src_err_ptr, int *dst_err_ptr); -+ -+asmlinkage __wsum csum_partial_copy_generic_from_user(const void *src, void *dst, -+ int len, __wsum sum, -+ int *src_err_ptr, int *dst_err_ptr); -+ - /* - * Note: when you get a NULL pointer exception here this means someone - * passed in an incorrect kernel address to one of these functions. 
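
The atomic_add_unless()/atomic64_add_unless() rewrites above keep the usual compare-and-swap retry loop but compute c + a through a checked asm add instead of plain C arithmetic, so even this path honours CONFIG_PAX_REFCOUNT. The loop skeleton in portable C11, as a sketch with the overflow check elided:

#include <stdatomic.h>
#include <stdbool.h>

/* Add a to *v unless *v == u; returns true if the add happened. */
static bool add_unless(_Atomic int *v, int a, int u)
{
        int c = atomic_load(v);

        while (c != u) {
                /* on failure, c is reloaded with the current value */
                if (atomic_compare_exchange_weak(v, &c, c + a))
                        return true;
        }
        return false;
}

atomic_inc_not_zero(), the common caller, is then just add_unless(v, 1, 0): it refuses to bump a reference count that has already dropped to zero, which is what makes refcount-resurrection races detectable.
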
-@@ -50,7 +58,7 @@ static inline __wsum csum_partial_copy_f - int *err_ptr) - { - might_sleep(); -- return csum_partial_copy_generic((__force void *)src, dst, -+ return csum_partial_copy_generic_from_user((__force void *)src, dst, - len, sum, err_ptr, NULL); - } - -@@ -177,7 +185,7 @@ static inline __wsum csum_and_copy_to_us - { - might_sleep(); - if (access_ok(VERIFY_WRITE, dst, len)) -- return csum_partial_copy_generic(src, (__force void *)dst, -+ return csum_partial_copy_generic_to_user(src, (__force void *)dst, - len, sum, NULL, err_ptr); - - if (len) -diff -urNp linux-2.6.31.1/arch/x86/include/asm/desc.h linux-2.6.31.1/arch/x86/include/asm/desc.h ---- linux-2.6.31.1/arch/x86/include/asm/desc.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/desc.h 2009-10-01 20:12:42.000000000 -0400 -@@ -15,6 +15,7 @@ static inline void fill_ldt(struct desc_ - desc->base1 = (info->base_addr & 0x00ff0000) >> 16; - desc->type = (info->read_exec_only ^ 1) << 1; - desc->type |= info->contents << 2; -+ desc->type |= info->seg_not_present ^ 1; - desc->s = 1; - desc->dpl = 0x3; - desc->p = info->seg_not_present ^ 1; -@@ -31,16 +32,12 @@ static inline void fill_ldt(struct desc_ - } - - extern struct desc_ptr idt_descr; --extern gate_desc idt_table[]; -- --struct gdt_page { -- struct desc_struct gdt[GDT_ENTRIES]; --} __attribute__((aligned(PAGE_SIZE))); --DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); -+extern gate_desc idt_table[256]; - -+extern struct desc_struct cpu_gdt_table[NR_CPUS][PAGE_SIZE / sizeof(struct desc_struct)]; - static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) - { -- return per_cpu(gdt_page, cpu).gdt; -+ return cpu_gdt_table[cpu]; - } - - #ifdef CONFIG_X86_64 -@@ -115,19 +112,48 @@ static inline void paravirt_free_ldt(str - static inline void native_write_idt_entry(gate_desc *idt, int entry, - const gate_desc *gate) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - memcpy(&idt[entry], gate, sizeof(*gate)); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, - const void *desc) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - memcpy(&ldt[entry], desc, 8); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, - const void *desc, int type) - { - unsigned int size; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - switch (type) { - case DESC_TSS: - size = sizeof(tss_desc); -@@ -139,7 +165,17 @@ static inline void native_write_gdt_entr - size = sizeof(struct desc_struct); - break; - } -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - memcpy(&gdt[entry], desc, size); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, -@@ -211,7 +247,19 @@ static inline void native_set_ldt(const - - static inline void native_load_tr_desc(void) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - static inline void native_load_gdt(const struct desc_ptr *dtr) -@@ -246,8 +294,19 @@ static inline void native_load_tls(struc - unsigned int i; - 
struct desc_struct *gdt = get_cpu_gdt_table(cpu); - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - #define _LDT_empty(info) \ -@@ -379,4 +438,16 @@ static inline void set_system_intr_gate_ - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); - } - -+#ifdef CONFIG_X86_32 -+static inline void set_user_cs(unsigned long base, unsigned long limit, int cpu) -+{ -+ struct desc_struct d; -+ -+ if (likely(limit)) -+ limit = (limit - 1UL) >> PAGE_SHIFT; -+ pack_descriptor(&d, base, limit, 0xFB, 0xC); -+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_DEFAULT_USER_CS, &d, DESCTYPE_S); -+} -+#endif -+ - #endif /* _ASM_X86_DESC_H */ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/e820.h linux-2.6.31.1/arch/x86/include/asm/e820.h ---- linux-2.6.31.1/arch/x86/include/asm/e820.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/e820.h 2009-10-01 20:12:42.000000000 -0400 -@@ -135,7 +135,7 @@ extern char *memory_setup(void); - #define ISA_END_ADDRESS 0x100000 - #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) - --#define BIOS_BEGIN 0x000a0000 -+#define BIOS_BEGIN 0x000c0000 - #define BIOS_END 0x00100000 - - #ifdef __KERNEL__ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/elf.h linux-2.6.31.1/arch/x86/include/asm/elf.h ---- linux-2.6.31.1/arch/x86/include/asm/elf.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/elf.h 2009-10-01 20:12:42.000000000 -0400 -@@ -263,7 +263,25 @@ extern int force_personality32; - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -+#ifdef CONFIG_PAX_SEGMEXEC -+#define ELF_ET_DYN_BASE ((current->mm->pax_flags & MF_PAX_SEGMEXEC) ? SEGMEXEC_TASK_SIZE/3*2 : TASK_SIZE/3*2) -+#else - #define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) -+#endif -+ -+#ifdef CONFIG_PAX_ASLR -+#ifdef CONFIG_X86_32 -+#define PAX_ELF_ET_DYN_BASE 0x10000000UL -+ -+#define PAX_DELTA_MMAP_LEN (current->mm->pax_flags & MF_PAX_SEGMEXEC ? 15 : 16) -+#define PAX_DELTA_STACK_LEN (current->mm->pax_flags & MF_PAX_SEGMEXEC ? 15 : 16) -+#else -+#define PAX_ELF_ET_DYN_BASE 0x400000UL -+ -+#define PAX_DELTA_MMAP_LEN ((test_thread_flag(TIF_IA32)) ? 16 : 32) -+#define PAX_DELTA_STACK_LEN ((test_thread_flag(TIF_IA32)) ? 16 : 32) -+#endif -+#endif - - /* This yields a mask that user programs can use to figure out what - instruction set this CPU supports. 
This could be done in user space, -@@ -315,8 +333,7 @@ do { \ - #define ARCH_DLINFO \ - do { \ - if (vdso_enabled) \ -- NEW_AUX_ENT(AT_SYSINFO_EHDR, \ -- (unsigned long)current->mm->context.vdso); \ -+ NEW_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso);\ - } while (0) - - #define AT_SYSINFO 32 -@@ -327,7 +344,7 @@ do { \ - - #endif /* !CONFIG_X86_32 */ - --#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) -+#define VDSO_CURRENT_BASE (current->mm->context.vdso) - - #define VDSO_ENTRY \ - ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall)) -@@ -341,7 +358,4 @@ extern int arch_setup_additional_pages(s - extern int syscall32_setup_pages(struct linux_binprm *, int exstack); - #define compat_arch_setup_additional_pages syscall32_setup_pages - --extern unsigned long arch_randomize_brk(struct mm_struct *mm); --#define arch_randomize_brk arch_randomize_brk -- - #endif /* _ASM_X86_ELF_H */ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/futex.h linux-2.6.31.1/arch/x86/include/asm/futex.h ---- linux-2.6.31.1/arch/x86/include/asm/futex.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/futex.h 2009-10-01 20:12:42.000000000 -0400 -@@ -11,6 +11,40 @@ - #include <asm/processor.h> - #include <asm/system.h> - -+#ifdef CONFIG_X86_32 -+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ -+ asm volatile( \ -+ "movw\t%w6, %%ds\n" \ -+ "1:\t" insn "\n" \ -+ "2:\tpushl\t%%ss\n" \ -+ "\tpopl\t%%ds\n" \ -+ "\t.section .fixup,"ax"\n" \ -+ "3:\tmov\t%3, %1\n" \ -+ "\tjmp\t2b\n" \ -+ "\t.previous\n" \ -+ _ASM_EXTABLE(1b, 3b) \ -+ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ -+ : "i" (-EFAULT), "0" (oparg), "1" (0), "r" (__USER_DS)) -+ -+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \ -+ asm volatile("movw\t%w7, %%es\n" \ -+ "1:\tmovl\t%%es:%2, %0\n" \ -+ "\tmovl\t%0, %3\n" \ -+ "\t" insn "\n" \ -+ "2:\t" LOCK_PREFIX "cmpxchgl %3, %%es:%2\n"\ -+ "\tjnz\t1b\n" \ -+ "3:\tpushl\t%%ss\n" \ -+ "\tpopl\t%%es\n" \ -+ "\t.section .fixup,"ax"\n" \ -+ "4:\tmov\t%5, %1\n" \ -+ "\tjmp\t3b\n" \ -+ "\t.previous\n" \ -+ _ASM_EXTABLE(1b, 4b) \ -+ _ASM_EXTABLE(2b, 4b) \ -+ : "=&a" (oldval), "=&r" (ret), \ -+ "+m" (*uaddr), "=&r" (tem) \ -+ : "r" (oparg), "i" (-EFAULT), "1" (0), "r" (__USER_DS)) -+#else - #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ - asm volatile("1:\t" insn "\n" \ - "2:\t.section .fixup,"ax"\n" \ -@@ -36,8 +70,9 @@ - : "=&a" (oldval), "=&r" (ret), \ - "+m" (*uaddr), "=&r" (tem) \ - : "r" (oparg), "i" (-EFAULT), "1" (0)) -+#endif - --static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) -+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) - { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; -@@ -61,11 +96,20 @@ static inline int futex_atomic_op_inuser - - switch (op) { - case FUTEX_OP_SET: -+#ifdef CONFIG_X86_32 -+ __futex_atomic_op1("xchgl %0, %%ds:%2", ret, oldval, uaddr, oparg); -+#else - __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg); -+#endif - break; - case FUTEX_OP_ADD: -+#ifdef CONFIG_X86_32 -+ __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %%ds:%2", ret, oldval, -+ uaddr, oparg); -+#else - __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval, - uaddr, oparg); -+#endif - break; - case FUTEX_OP_OR: - __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg); -@@ -109,7 +153,7 @@ static inline int futex_atomic_op_inuser - return ret; - } - --static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, 
-+static inline int futex_atomic_cmpxchg_inatomic(u32 __user *uaddr, int oldval, - int newval) - { - -@@ -122,14 +166,27 @@ static inline int futex_atomic_cmpxchg_i - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - -- asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" -+ asm volatile( -+#ifdef CONFIG_X86_32 -+ "\tmovw %w5, %%ds\n" -+ "1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" -+ "2:\tpushl %%ss\n" -+ "\tpopl %%ds\n" -+ "\t.section .fixup, "ax"\n" -+#else -+ "1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" - "2:\t.section .fixup, "ax"\n" -+#endif - "3:\tmov %2, %0\n" - "\tjmp 2b\n" - "\t.previous\n" - _ASM_EXTABLE(1b, 3b) - : "=a" (oldval), "+m" (*uaddr) -+#ifdef CONFIG_X86_32 -+ : "i" (-EFAULT), "r" (newval), "0" (oldval), "r" (__USER_DS) -+#else - : "i" (-EFAULT), "r" (newval), "0" (oldval) -+#endif - : "memory" - ); - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/i387.h linux-2.6.31.1/arch/x86/include/asm/i387.h ---- linux-2.6.31.1/arch/x86/include/asm/i387.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/i387.h 2009-10-01 20:12:42.000000000 -0400 -@@ -194,13 +194,8 @@ static inline int fxrstor_checking(struc - } - - /* We need a safe address that is cheap to find and that is already -- in L1 during context switch. The best choices are unfortunately -- different for UP and SMP */ --#ifdef CONFIG_SMP --#define safe_address (__per_cpu_offset[0]) --#else --#define safe_address (kstat_cpu(0).cpustat.user) --#endif -+ in L1 during context switch. */ -+#define safe_address (init_tss[smp_processor_id()].x86_tss.sp0) - - /* - * These must be called with preempt disabled -diff -urNp linux-2.6.31.1/arch/x86/include/asm/io_64.h linux-2.6.31.1/arch/x86/include/asm/io_64.h ---- linux-2.6.31.1/arch/x86/include/asm/io_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/io_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -140,6 +140,17 @@ __OUTS(l) - - #include <linux/vmalloc.h> - -+#define ARCH_HAS_VALID_PHYS_ADDR_RANGE -+static inline int valid_phys_addr_range (unsigned long addr, size_t count) -+{ -+ return ((addr + count + PAGE_SIZE - 1) >> PAGE_SHIFT) < (1 << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) ? 1 : 0; -+} -+ -+static inline int valid_mmap_phys_addr_range (unsigned long pfn, size_t count) -+{ -+ return (pfn + (count >> PAGE_SHIFT)) < (1 << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) ? 
1 : 0; -+} -+ - #include <asm-generic/iomap.h> - - void __memcpy_fromio(void *, unsigned long, unsigned); -diff -urNp linux-2.6.31.1/arch/x86/include/asm/irqflags.h linux-2.6.31.1/arch/x86/include/asm/irqflags.h ---- linux-2.6.31.1/arch/x86/include/asm/irqflags.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/irqflags.h 2009-10-01 20:12:42.000000000 -0400 -@@ -147,6 +147,8 @@ static inline unsigned long __raw_local_ - #define INTERRUPT_RETURN iret - #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit - #define GET_CR0_INTO_EAX movl %cr0, %eax -+#define GET_CR0_INTO_EDX movl %cr0, %edx -+#define SET_CR0_FROM_EDX movl %edx, %cr0 - #endif - - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/kvm_host.h linux-2.6.31.1/arch/x86/include/asm/kvm_host.h ---- linux-2.6.31.1/arch/x86/include/asm/kvm_host.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/kvm_host.h 2009-10-01 20:12:42.000000000 -0400 -@@ -528,7 +528,7 @@ struct kvm_x86_ops { - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - }; - --extern struct kvm_x86_ops *kvm_x86_ops; -+extern const struct kvm_x86_ops *kvm_x86_ops; - - int kvm_mmu_module_init(void); - void kvm_mmu_module_exit(void); -diff -urNp linux-2.6.31.1/arch/x86/include/asm/local.h linux-2.6.31.1/arch/x86/include/asm/local.h ---- linux-2.6.31.1/arch/x86/include/asm/local.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/local.h 2009-10-01 20:12:42.000000000 -0400 -@@ -18,26 +18,90 @@ typedef struct { - - static inline void local_inc(local_t *l) - { -- asm volatile(_ASM_INC "%0" -+ asm volatile(_ASM_INC "%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_DEC "%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+m" (l->a.counter)); - } - - static inline void local_dec(local_t *l) - { -- asm volatile(_ASM_DEC "%0" -+ asm volatile(_ASM_DEC "%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_INC "%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+m" (l->a.counter)); - } - - static inline void local_add(long i, local_t *l) - { -- asm volatile(_ASM_ADD "%1,%0" -+ asm volatile(_ASM_ADD "%1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_SUB "%1,%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+m" (l->a.counter) - : "ir" (i)); - } - - static inline void local_sub(long i, local_t *l) - { -- asm volatile(_ASM_SUB "%1,%0" -+ asm volatile(_ASM_SUB "%1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_ADD "%1,%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+m" (l->a.counter) - : "ir" (i)); - } -@@ -55,7 +119,24 @@ static inline int local_sub_and_test(lon - { - unsigned char c; - -- asm volatile(_ASM_SUB "%2,%0; sete %1" -+ asm volatile(_ASM_SUB "%2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_ADD "%2,%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ 
_ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "+m" (l->a.counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -@@ -73,7 +154,24 @@ static inline int local_dec_and_test(loc - { - unsigned char c; - -- asm volatile(_ASM_DEC "%0; sete %1" -+ asm volatile(_ASM_DEC "%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_INC "%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "+m" (l->a.counter), "=qm" (c) - : : "memory"); - return c != 0; -@@ -91,7 +189,24 @@ static inline int local_inc_and_test(loc - { - unsigned char c; - -- asm volatile(_ASM_INC "%0; sete %1" -+ asm volatile(_ASM_INC "%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_DEC "%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sete %1\n" - : "+m" (l->a.counter), "=qm" (c) - : : "memory"); - return c != 0; -@@ -110,7 +225,24 @@ static inline int local_add_negative(lon - { - unsigned char c; - -- asm volatile(_ASM_ADD "%2,%0; sets %1" -+ asm volatile(_ASM_ADD "%2,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_SUB "%2,%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "sets %1\n" - : "+m" (l->a.counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -@@ -133,7 +265,23 @@ static inline long local_add_return(long - #endif - /* Modern 486+ processor */ - __i = i; -- asm volatile(_ASM_XADD "%0, %1;" -+ asm volatile(_ASM_XADD "%0, %1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ _ASM_MOV "%0,%1\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+r" (i), "+m" (l->a.counter) - : : "memory"); - return i + __i; -diff -urNp linux-2.6.31.1/arch/x86/include/asm/mman.h linux-2.6.31.1/arch/x86/include/asm/mman.h ---- linux-2.6.31.1/arch/x86/include/asm/mman.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/mman.h 2009-10-01 20:12:42.000000000 -0400 -@@ -17,4 +17,14 @@ - #define MCL_CURRENT 1 /* lock all current mappings */ - #define MCL_FUTURE 2 /* lock all future mappings */ - -+#ifdef __KERNEL__ -+#ifndef __ASSEMBLY__ -+#ifdef CONFIG_X86_32 -+#define arch_mmap_check i386_mmap_check -+int i386_mmap_check(unsigned long addr, unsigned long len, -+ unsigned long flags); -+#endif -+#endif -+#endif -+ - #endif /* _ASM_X86_MMAN_H */ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/mmu_context.h linux-2.6.31.1/arch/x86/include/asm/mmu_context.h ---- linux-2.6.31.1/arch/x86/include/asm/mmu_context.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/mmu_context.h 2009-10-01 20:12:42.000000000 -0400 -@@ -34,11 +34,17 @@ static inline void switch_mm(struct mm_s - struct task_struct *tsk) - { - unsigned cpu = smp_processor_id(); -+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) -+ int tlbstate = TLBSTATE_OK; -+#endif - - if (likely(prev != next)) { - /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); - #ifdef CONFIG_SMP -+#ifdef CONFIG_X86_32 -+ tlbstate = percpu_read(cpu_tlbstate.state); -+#endif - percpu_write(cpu_tlbstate.state, TLBSTATE_OK); - 
percpu_write(cpu_tlbstate.active_mm, next); - #endif -@@ -52,6 +58,26 @@ static inline void switch_mm(struct mm_s - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_LDT_nolock(&next->context); -+ -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_SMP) -+ if (!nx_enabled) { -+ smp_mb__before_clear_bit(); -+ cpu_clear(cpu, prev->context.cpu_user_cs_mask); -+ smp_mb__after_clear_bit(); -+ cpu_set(cpu, next->context.cpu_user_cs_mask); -+ } -+#endif -+ -+#if defined(CONFIG_X86_32) && (defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)) -+ if (unlikely(prev->context.user_cs_base != next->context.user_cs_base || -+ prev->context.user_cs_limit != next->context.user_cs_limit -+#ifdef CONFIG_SMP -+ || tlbstate != TLBSTATE_OK -+#endif -+ )) -+ set_user_cs(next->context.user_cs_base, next->context.user_cs_limit, cpu); -+#endif -+ - } - #ifdef CONFIG_SMP - else { -@@ -65,6 +91,19 @@ static inline void switch_mm(struct mm_s - */ - load_cr3(next->pgd); - load_LDT_nolock(&next->context); -+ -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) -+ if (!nx_enabled) -+ cpu_set(cpu, next->context.cpu_user_cs_mask); -+#endif -+ -+#if defined(CONFIG_X86_32) && (defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)) -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!((next->pax_flags & MF_PAX_PAGEEXEC) && nx_enabled)) -+#endif -+ set_user_cs(next->context.user_cs_base, next->context.user_cs_limit, cpu); -+#endif -+ - } - } - #endif -diff -urNp linux-2.6.31.1/arch/x86/include/asm/mmu.h linux-2.6.31.1/arch/x86/include/asm/mmu.h ---- linux-2.6.31.1/arch/x86/include/asm/mmu.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/mmu.h 2009-10-01 20:12:42.000000000 -0400 -@@ -9,10 +9,23 @@ - * we put the segment information here. 
- */ - typedef struct { -- void *ldt; -+ struct desc_struct *ldt; - int size; - struct mutex lock; -- void *vdso; -+ unsigned long vdso; -+ -+#ifdef CONFIG_X86_32 -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+ unsigned long user_cs_base; -+ unsigned long user_cs_limit; -+ -+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_SMP) -+ cpumask_t cpu_user_cs_mask; -+#endif -+ -+#endif -+#endif -+ - } mm_context_t; - - #ifdef CONFIG_SMP -diff -urNp linux-2.6.31.1/arch/x86/include/asm/module.h linux-2.6.31.1/arch/x86/include/asm/module.h ---- linux-2.6.31.1/arch/x86/include/asm/module.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/module.h 2009-10-01 20:12:42.000000000 -0400 -@@ -74,7 +74,12 @@ struct mod_arch_specific {}; - # else - # define MODULE_STACKSIZE "" - # endif --# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE -+# ifdef CONFIG_GRKERNSEC -+# define MODULE_GRSEC "GRSECURITY " -+# else -+# define MODULE_GRSEC "" -+# endif -+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE MODULE_GRSEC - #endif - - #endif /* _ASM_X86_MODULE_H */ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/page_32_types.h linux-2.6.31.1/arch/x86/include/asm/page_32_types.h ---- linux-2.6.31.1/arch/x86/include/asm/page_32_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/page_32_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -15,6 +15,10 @@ - */ - #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) - -+#ifdef CONFIG_PAX_PAGEEXEC -+#define CONFIG_ARCH_TRACK_EXEC_LIMIT 1 -+#endif -+ - #ifdef CONFIG_4KSTACKS - #define THREAD_ORDER 0 - #else -diff -urNp linux-2.6.31.1/arch/x86/include/asm/page_64_types.h linux-2.6.31.1/arch/x86/include/asm/page_64_types.h ---- linux-2.6.31.1/arch/x86/include/asm/page_64_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/page_64_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -39,6 +39,9 @@ - #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) - #define __START_KERNEL_map _AC(0xffffffff80000000, UL) - -+#define ktla_ktva(addr) (addr) -+#define ktva_ktla(addr) (addr) -+ - /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
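
The ktla_ktva()/ktva_ktla() pair threaded through these headers converts between the kernel text's linear address and its virtual address. On amd64 (page_64_types.h above) and on i386 without KERNEXEC they are the identity; the KERNEXEC definitions in pgtable_32_types.h further down in this diff shift by LOAD_PHYSICAL_ADDR + PAGE_OFFSET. A self-contained sketch of the 32-bit KERNEXEC case with made-up constants, showing that the pair is a round trip:

#include <assert.h>

#define PAGE_OFFSET         0xc0000000UL   /* illustrative value */
#define LOAD_PHYSICAL_ADDR  0x01000000UL   /* illustrative value */

#define ktla_ktva(addr) ((addr) + LOAD_PHYSICAL_ADDR + PAGE_OFFSET)
#define ktva_ktla(addr) ((addr) - LOAD_PHYSICAL_ADDR - PAGE_OFFSET)

int main(void)
{
        unsigned long linear = 0x00100000UL;

        assert(ktva_ktla(ktla_ktva(linear)) == linear);
        return 0;
}

Call sites can then translate unconditionally; where KERNEXEC is off, the macros compile away entirely.
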
*/ - #define __PHYSICAL_MASK_SHIFT 46 - #define __VIRTUAL_MASK_SHIFT 47 -diff -urNp linux-2.6.31.1/arch/x86/include/asm/paravirt.h linux-2.6.31.1/arch/x86/include/asm/paravirt.h ---- linux-2.6.31.1/arch/x86/include/asm/paravirt.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/paravirt.h 2009-10-01 20:12:42.000000000 -0400 -@@ -1688,7 +1688,7 @@ static inline unsigned long __raw_local_ - - #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) - #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) --#define PARA_INDIRECT(addr) *%cs:addr -+#define PARA_INDIRECT(addr) *%ss:addr - #endif - - #define INTERRUPT_RETURN \ -@@ -1718,6 +1718,18 @@ static inline unsigned long __raw_local_ - call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ - pop %edx; pop %ecx - -+#define GET_CR0_INTO_EDX \ -+ push %eax; push %ecx; \ -+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ -+ mov %eax, %edx; \ -+ pop %ecx; pop %eax -+ -+#define SET_CR0_FROM_EDX \ -+ push %eax; push %ecx; \ -+ mov %edx, %eax; \ -+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_write_cr0);\ -+ pop %ecx; pop %eax -+ - #define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/pgalloc.h linux-2.6.31.1/arch/x86/include/asm/pgalloc.h ---- linux-2.6.31.1/arch/x86/include/asm/pgalloc.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgalloc.h 2009-10-01 20:12:42.000000000 -0400 -@@ -58,6 +58,13 @@ static inline void pmd_populate_kernel(s - pmd_t *pmd, pte_t *pte) - { - paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); -+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); -+} -+ -+static inline void pmd_populate_user(struct mm_struct *mm, -+ pmd_t *pmd, pte_t *pte) -+{ -+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); - set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); - } - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/pgtable-2level.h linux-2.6.31.1/arch/x86/include/asm/pgtable-2level.h ---- linux-2.6.31.1/arch/x86/include/asm/pgtable-2level.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgtable-2level.h 2009-10-01 20:12:42.000000000 -0400 -@@ -18,7 +18,19 @@ static inline void native_set_pte(pte_t - - static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - *pmdp = pmd; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) -diff -urNp linux-2.6.31.1/arch/x86/include/asm/pgtable_32.h linux-2.6.31.1/arch/x86/include/asm/pgtable_32.h ---- linux-2.6.31.1/arch/x86/include/asm/pgtable_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgtable_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -26,8 +26,6 @@ - struct mm_struct; - struct vm_area_struct; - --extern pgd_t swapper_pg_dir[1024]; -- - static inline void pgtable_cache_init(void) { } - static inline void check_pgt_cache(void) { } - void paging_init(void); -@@ -48,6 +46,11 @@ extern void set_pmd_pfn(unsigned long, u - # include <asm/pgtable-2level.h> - #endif - -+extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; -+#ifdef CONFIG_X86_PAE -+extern pmd_t swapper_pm_dir[PTRS_PER_PGD][PTRS_PER_PMD]; -+#endif -+ - #if defined(CONFIG_HIGHPTE) - #define __KM_PTE \ - (in_nmi() ? 
KM_NMI_PTE : \ -@@ -84,6 +87,9 @@ do { \ - - #endif /* !__ASSEMBLY__ */ - -+#define HAVE_ARCH_UNMAPPED_AREA -+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -+ - /* - * kern_addr_valid() is (1) for FLATMEM and (0) for - * SPARSEMEM and DISCONTIGMEM -diff -urNp linux-2.6.31.1/arch/x86/include/asm/pgtable_32_types.h linux-2.6.31.1/arch/x86/include/asm/pgtable_32_types.h ---- linux-2.6.31.1/arch/x86/include/asm/pgtable_32_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgtable_32_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -8,7 +8,7 @@ - */ - #ifdef CONFIG_X86_PAE - # include <asm/pgtable-3level_types.h> --# define PMD_SIZE (1UL << PMD_SHIFT) -+# define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) - # define PMD_MASK (~(PMD_SIZE - 1)) - #else - # include <asm/pgtable-2level_types.h> -@@ -46,6 +46,19 @@ extern bool __vmalloc_start_set; /* set - # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) - #endif - -+#ifdef CONFIG_PAX_KERNEXEC -+#ifndef __ASSEMBLY__ -+extern unsigned char MODULES_EXEC_VADDR[]; -+extern unsigned char MODULES_EXEC_END[]; -+#endif -+#include <asm/boot.h> -+#define ktla_ktva(addr) (addr + LOAD_PHYSICAL_ADDR + PAGE_OFFSET) -+#define ktva_ktla(addr) (addr - LOAD_PHYSICAL_ADDR - PAGE_OFFSET) -+#else -+#define ktla_ktva(addr) (addr) -+#define ktva_ktla(addr) (addr) -+#endif -+ - #define MODULES_VADDR VMALLOC_START - #define MODULES_END VMALLOC_END - #define MODULES_LEN (MODULES_VADDR - MODULES_END) -diff -urNp linux-2.6.31.1/arch/x86/include/asm/pgtable-3level.h linux-2.6.31.1/arch/x86/include/asm/pgtable-3level.h ---- linux-2.6.31.1/arch/x86/include/asm/pgtable-3level.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgtable-3level.h 2009-10-01 20:12:42.000000000 -0400 -@@ -38,12 +38,36 @@ static inline void native_set_pte_atomic - - static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd)); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - static inline void native_set_pud(pud_t *pudp, pud_t pud) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - /* -diff -urNp linux-2.6.31.1/arch/x86/include/asm/pgtable_64.h linux-2.6.31.1/arch/x86/include/asm/pgtable_64.h ---- linux-2.6.31.1/arch/x86/include/asm/pgtable_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgtable_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -16,9 +16,12 @@ - - extern pud_t level3_kernel_pgt[512]; - extern pud_t level3_ident_pgt[512]; -+extern pud_t level3_vmalloc_pgt[512]; -+extern pud_t level3_vmemmap_pgt[512]; -+extern pud_t level2_vmemmap_pgt[512]; - extern pmd_t level2_kernel_pgt[512]; - extern pmd_t level2_fixmap_pgt[512]; --extern pmd_t level2_ident_pgt[512]; -+extern pmd_t level2_ident_pgt[512*4]; - extern pgd_t init_level4_pgt[]; - - #define swapper_pg_dir init_level4_pgt -@@ -74,7 +77,19 @@ static inline pte_t native_ptep_get_and_ - - static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - *pmdp = pmd; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - static inline void native_pmd_clear(pmd_t *pmd) -diff -urNp 
linux-2.6.31.1/arch/x86/include/asm/pgtable.h linux-2.6.31.1/arch/x86/include/asm/pgtable.h ---- linux-2.6.31.1/arch/x86/include/asm/pgtable.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgtable.h 2009-10-01 20:12:42.000000000 -0400 -@@ -90,6 +90,11 @@ static inline void __init paravirt_paget - * The following only work if pte_present() is true. - * Undefined behaviour if not.. - */ -+static inline int pte_user(pte_t pte) -+{ -+ return pte_val(pte) & _PAGE_USER; -+} -+ - static inline int pte_dirty(pte_t pte) - { - return pte_flags(pte) & _PAGE_DIRTY; -@@ -172,9 +177,29 @@ static inline pte_t pte_wrprotect(pte_t - return pte_clear_flags(pte, _PAGE_RW); - } - -+static inline pte_t pte_mkread(pte_t pte) -+{ -+ return __pte(pte_val(pte) | _PAGE_USER); -+} -+ - static inline pte_t pte_mkexec(pte_t pte) - { -- return pte_clear_flags(pte, _PAGE_NX); -+#ifdef CONFIG_X86_PAE -+ if (__supported_pte_mask & _PAGE_NX) -+ return pte_clear_flags(pte, _PAGE_NX); -+ else -+#endif -+ return pte_set_flags(pte, _PAGE_USER); -+} -+ -+static inline pte_t pte_exprotect(pte_t pte) -+{ -+#ifdef CONFIG_X86_PAE -+ if (__supported_pte_mask & _PAGE_NX) -+ return pte_set_flags(pte, _PAGE_NX); -+ else -+#endif -+ return pte_clear_flags(pte, _PAGE_USER); - } - - static inline pte_t pte_mkdirty(pte_t pte) -@@ -482,7 +507,7 @@ static inline pud_t *pud_offset(pgd_t *p - - static inline int pgd_bad(pgd_t pgd) - { -- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; -+ return (pgd_flags(pgd) & ~(_PAGE_USER | _PAGE_NX)) != _KERNPG_TABLE; - } - - static inline int pgd_none(pgd_t pgd) -@@ -623,7 +648,19 @@ static inline void ptep_set_wrprotect(st - */ - static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) - { -- memcpy(dst, src, count * sizeof(pgd_t)); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ -+ memcpy(dst, src, count * sizeof(pgd_t)); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } - - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/pgtable_types.h linux-2.6.31.1/arch/x86/include/asm/pgtable_types.h ---- linux-2.6.31.1/arch/x86/include/asm/pgtable_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/pgtable_types.h 2009-10-01 20:12:42.000000000 -0400 -@@ -16,12 +16,11 @@ - #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ - #define _PAGE_BIT_PAT 7 /* on 4KB pages */ - #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ --#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -+#define _PAGE_BIT_SPECIAL 9 /* special mappings, no associated struct page */ - #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ - #define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ - #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ --#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 --#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 -+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SPECIAL - #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ - - /* If _PAGE_BIT_PRESENT is clear, we use these: */ -@@ -39,7 +38,6 @@ - #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) - #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) - #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) --#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) - #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) - #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) - #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) -@@ -55,8 +53,10 @@ - - 
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) - #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) --#else -+#elif defined(CONFIG_KMEMCHECK) - #define _PAGE_NX (_AT(pteval_t, 0)) -+#else -+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) - #endif - - #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) -@@ -93,6 +93,9 @@ - #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_ACCESSED) - -+#define PAGE_READONLY_NOEXEC PAGE_READONLY -+#define PAGE_SHARED_NOEXEC PAGE_SHARED -+ - #define __PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) - #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) -@@ -103,8 +106,8 @@ - #define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) - #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) - #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) --#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) --#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) -+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RO | _PAGE_USER) -+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_RO | _PAGE_PCD | _PAGE_PWT | _PAGE_USER) - #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) - #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) - #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) -@@ -163,8 +166,8 @@ - * bits are combined, this will alow user to access the high address mapped - * VDSO in the presence of CONFIG_COMPAT_VDSO - */ --#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ --#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ -+#define PTE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ -+#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ - #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ - #endif - -@@ -277,7 +280,15 @@ static inline pteval_t pte_flags(pte_t p - typedef struct page *pgtable_t; - - extern pteval_t __supported_pte_mask; -+#ifdef CONFIG_X86_32 -+#ifdef CONFIG_X86_PAE - extern int nx_enabled; -+#else -+#define nx_enabled (0) -+#endif -+#else -+#define nx_enabled (1) -+#endif - - #define pgprot_writecombine pgprot_writecombine - extern pgprot_t pgprot_writecombine(pgprot_t prot); -diff -urNp linux-2.6.31.1/arch/x86/include/asm/processor.h linux-2.6.31.1/arch/x86/include/asm/processor.h ---- linux-2.6.31.1/arch/x86/include/asm/processor.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/processor.h 2009-10-01 20:12:42.000000000 -0400 -@@ -271,7 +271,7 @@ struct tss_struct { - - } ____cacheline_aligned; - --DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); -+extern struct tss_struct init_tss[NR_CPUS]; - - /* - * Save the original ist values for checking stack pointers during debugging -@@ -900,8 +900,17 @@ static inline void spin_lock_prefetch(co - */ - #define TASK_SIZE PAGE_OFFSET - #define TASK_SIZE_MAX TASK_SIZE -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+#define SEGMEXEC_TASK_SIZE (TASK_SIZE / 2) -+#endif -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+#define STACK_TOP ((current->mm->pax_flags & MF_PAX_SEGMEXEC)?SEGMEXEC_TASK_SIZE:TASK_SIZE) -+#else - #define STACK_TOP TASK_SIZE --#define STACK_TOP_MAX STACK_TOP -+#endif -+#define STACK_TOP_MAX TASK_SIZE - - #define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ -@@ -918,7 +927,7 @@ static inline void spin_lock_prefetch(co - */ - #define INIT_TSS { \ - .x86_tss = { \ -- .sp0 = sizeof(init_stack) + (long)&init_stack, \ -+ .sp0 = 
sizeof(init_stack) + (long)&init_stack - 8, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ -@@ -929,11 +938,7 @@ static inline void spin_lock_prefetch(co - extern unsigned long thread_saved_pc(struct task_struct *tsk); - - #define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) --#define KSTK_TOP(info) \ --({ \ -- unsigned long *__ptr = (unsigned long *)(info); \ -- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ --}) -+#define KSTK_TOP(info) ((info)->task.thread.sp0) - - /* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. -@@ -948,7 +953,7 @@ extern unsigned long thread_saved_pc(str - #define task_pt_regs(task) \ - ({ \ - struct pt_regs *__regs__; \ -- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ -+ __regs__ = (struct pt_regs *)((task)->thread.sp0); \ - __regs__ - 1; \ - }) - -@@ -964,7 +969,7 @@ extern unsigned long thread_saved_pc(str - * space during mmap's. - */ - #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ -- 0xc0000000 : 0xFFFFe000) -+ 0xc0000000 : 0xFFFFf000) - - #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE_MAX) -@@ -1001,6 +1006,10 @@ extern void start_thread(struct pt_regs - */ - #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) - -+#ifdef CONFIG_PAX_SEGMEXEC -+#define SEGMEXEC_TASK_UNMAPPED_BASE (PAGE_ALIGN(SEGMEXEC_TASK_SIZE / 3)) -+#endif -+ - #define KSTK_EIP(task) (task_pt_regs(task)->ip) - - /* Get/set a process' ability to use the timestamp counter instruction */ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/ptrace.h linux-2.6.31.1/arch/x86/include/asm/ptrace.h ---- linux-2.6.31.1/arch/x86/include/asm/ptrace.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/ptrace.h 2009-10-01 20:12:42.000000000 -0400 -@@ -151,28 +151,29 @@ static inline unsigned long regs_return_ - } - - /* -- * user_mode_vm(regs) determines whether a register set came from user mode. -+ * user_mode(regs) determines whether a register set came from user mode. - * This is true if V8086 mode was enabled OR if the register set was from - * protected mode with RPL-3 CS value. This tricky test checks that with - * one comparison. Many places in the kernel can bypass this full check -- * if they have already ruled out V8086 mode, so user_mode(regs) can be used. -+ * if they have already ruled out V8086 mode, so user_mode_novm(regs) can -+ * be used. 
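The asm/ptrace.h hunk just below swaps the helper names: the strict RPL-only test becomes user_mode_novm(), and the V8086-aware test takes over the plain user_mode() name, so callers get the safe variant by default; on 64-bit the check also masks with SEGMENT_RPL_MASK rather than a bare 3. The two tests, written out as standalone C with the constants as defined on x86:

    #define SEGMENT_RPL_MASK 0x3        /* low two bits of %cs: priv level */
    #define USER_RPL         0x3        /* ring 3 */
    #define X86_VM_MASK      0x00020000 /* EFLAGS.VM: virtual-8086 mode */

    /* Strict test (renamed user_mode_novm): ring-3 code segment only. */
    static int from_user_novm(unsigned long cs)
    {
            return (cs & SEGMENT_RPL_MASK) == USER_RPL;
    }

    /* Full test (now plain user_mode): also treat vm86 mode as user. */
    static int from_user(unsigned long cs, unsigned long eflags)
    {
            return ((cs & SEGMENT_RPL_MASK) | (eflags & X86_VM_MASK)) >= USER_RPL;
    }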
- */ --static inline int user_mode(struct pt_regs *regs) -+static inline int user_mode_novm(struct pt_regs *regs) - { - #ifdef CONFIG_X86_32 - return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; - #else -- return !!(regs->cs & 3); -+ return !!(regs->cs & SEGMENT_RPL_MASK); - #endif - } - --static inline int user_mode_vm(struct pt_regs *regs) -+static inline int user_mode(struct pt_regs *regs) - { - #ifdef CONFIG_X86_32 - return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= - USER_RPL; - #else -- return user_mode(regs); -+ return user_mode_novm(regs); - #endif - } - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/reboot.h linux-2.6.31.1/arch/x86/include/asm/reboot.h ---- linux-2.6.31.1/arch/x86/include/asm/reboot.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/reboot.h 2009-10-01 20:12:42.000000000 -0400 -@@ -18,7 +18,7 @@ extern struct machine_ops machine_ops; - - void native_machine_crash_shutdown(struct pt_regs *regs); - void native_machine_shutdown(void); --void machine_real_restart(const unsigned char *code, int length); -+void machine_real_restart(const unsigned char *code, unsigned int length); - - typedef void (*nmi_shootdown_cb)(int, struct die_args*); - void nmi_shootdown_cpus(nmi_shootdown_cb callback); -diff -urNp linux-2.6.31.1/arch/x86/include/asm/rwsem.h linux-2.6.31.1/arch/x86/include/asm/rwsem.h ---- linux-2.6.31.1/arch/x86/include/asm/rwsem.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/rwsem.h 2009-10-01 20:12:42.000000000 -0400 -@@ -106,10 +106,26 @@ static inline void __down_read(struct rw - { - asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " incl (%%eax)\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "decl (%%eax)\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - /* adds 0x00000001, returns the old value */ -- " jns 1f\n" -+ " jns 2f\n" - " call call_rwsem_down_read_failed\n" -- "1:\n\t" -+ "2:\n\t" - "# ending down_read\n\t" - : "+m" (sem->count) - : "a" (sem) -@@ -124,13 +140,29 @@ static inline int __down_read_trylock(st - __s32 result, tmp; - asm volatile("# beginning __down_read_trylock\n\t" - " movl %0,%1\n\t" -- "1:\n\t" -+ "2:\n\t" - " movl %1,%2\n\t" - " addl %3,%2\n\t" -- " jle 2f\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ "subl %3,%2\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ " jle 3f\n\t" - LOCK_PREFIX " cmpxchgl %2,%0\n\t" -- " jnz 1b\n\t" -- "2:\n\t" -+ " jnz 2b\n\t" -+ "3:\n\t" - "# ending __down_read_trylock\n\t" - : "+m" (sem->count), "=&a" (result), "=&r" (tmp) - : "i" (RWSEM_ACTIVE_READ_BIAS) -@@ -148,12 +180,28 @@ static inline void __down_write_nested(s - tmp = RWSEM_ACTIVE_WRITE_BIAS; - asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ "movl %%edx,(%%eax)\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - /* subtract 0x0000ffff, returns the old value */ - " testl %%edx,%%edx\n\t" - /* was the count 0 before? 
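The rwsem fast paths here (and the rwlock/spinlock paths further below) gain CONFIG_PAX_REFCOUNT instrumentation: after each LOCK-prefixed arithmetic instruction the patch executes "into" (32-bit) or "jno 0f; int $4" (64-bit) so a signed overflow raises an overflow exception, a .fixup stub undoes the operation before the handler runs, and the existing local labels are renumbered (1: becomes 2:, and so on) to make room. A userspace analogue of the idea, using the compiler's overflow builtin in place of the OF-flag trap that the kernel's inline asm above relies on:

    #include <stdio.h>
    #include <stdlib.h>

    /* Analogue of a PAX_REFCOUNT-checked increment: detect signed
     * overflow on the hot path and bail out instead of wrapping,
     * which is the exploit primitive the instrumentation removes. */
    static int checked_inc(int *count)
    {
            int next;

            if (__builtin_add_overflow(*count, 1, &next)) {
                    /* the kernel raises #OF via "into"/"int $4" here */
                    fprintf(stderr, "refcount overflow detected\n");
                    abort();
            }
            *count = next;
            return next;
    }

    int main(void)
    {
            int refs = 0x7ffffffe;

            printf("%d\n", checked_inc(&refs)); /* 0x7fffffff, still fine */
            checked_inc(&refs);                 /* overflow: aborts */
            return 0;
    }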
*/ -- " jz 1f\n" -+ " jz 2f\n" - " call call_rwsem_down_write_failed\n" -- "1:\n" -+ "2:\n" - "# ending down_write" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) -@@ -186,10 +234,26 @@ static inline void __up_read(struct rw_s - __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; - asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ "movl %%edx,(%%eax)\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - /* subtracts 1, returns the old value */ -- " jns 1f\n\t" -+ " jns 2f\n\t" - " call call_rwsem_wake\n" -- "1:\n" -+ "2:\n" - "# ending __up_read\n" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) -@@ -204,11 +268,27 @@ static inline void __up_write(struct rw_ - asm volatile("# beginning __up_write\n\t" - " movl %2,%%edx\n\t" - LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ "movl %%edx,(%%eax)\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - /* tries to transition - 0xffff0001 -> 0x00000000 */ -- " jz 1f\n" -+ " jz 2f\n" - " call call_rwsem_wake\n" -- "1:\n\t" -+ "2:\n\t" - "# ending __up_write\n" - : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) -@@ -222,10 +302,26 @@ static inline void __downgrade_write(str - { - asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " addl %2,(%%eax)\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "subl %2,(%%eax)\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ -- " jns 1f\n\t" -+ " jns 2f\n\t" - " call call_rwsem_downgrade_wake\n" -- "1:\n\t" -+ "2:\n\t" - "# ending __downgrade_write\n" - : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_WAITING_BIAS) -@@ -237,7 +333,23 @@ static inline void __downgrade_write(str - */ - static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) - { -- asm volatile(LOCK_PREFIX "addl %1,%0" -+ asm volatile(LOCK_PREFIX "addl %1,%0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "subl %1,%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+m" (sem->count) - : "ir" (delta)); - } -@@ -249,7 +361,23 @@ static inline int rwsem_atomic_update(in - { - int tmp = delta; - -- asm volatile(LOCK_PREFIX "xadd %0,%1" -+ asm volatile(LOCK_PREFIX "xadd %0,%1\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ "movl %0,%1\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+r" (tmp), "+m" (sem->count) - : : "memory"); - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/segment.h linux-2.6.31.1/arch/x86/include/asm/segment.h ---- linux-2.6.31.1/arch/x86/include/asm/segment.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/segment.h 2009-10-01 20:12:42.000000000 -0400 -@@ -88,7 +88,7 @@ - #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) - #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS 
* 8) - --#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) -+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) - #ifdef CONFIG_SMP - #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) - #else -@@ -102,6 +102,12 @@ - #define __KERNEL_STACK_CANARY 0 - #endif - -+#define GDT_ENTRY_PCIBIOS_CS (GDT_ENTRY_KERNEL_BASE + 17) -+#define __PCIBIOS_CS (GDT_ENTRY_PCIBIOS_CS * 8) -+ -+#define GDT_ENTRY_PCIBIOS_DS (GDT_ENTRY_KERNEL_BASE + 18) -+#define __PCIBIOS_DS (GDT_ENTRY_PCIBIOS_DS * 8) -+ - #define GDT_ENTRY_DOUBLEFAULT_TSS 31 - - /* -@@ -139,7 +145,7 @@ - */ - - /* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ --#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) -+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xFFFCU) == PNP_CS32 || ((x) & 0xFFFCU) == PNP_CS16) - - - #else -diff -urNp linux-2.6.31.1/arch/x86/include/asm/spinlock.h linux-2.6.31.1/arch/x86/include/asm/spinlock.h ---- linux-2.6.31.1/arch/x86/include/asm/spinlock.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/spinlock.h 2009-10-01 20:12:42.000000000 -0400 -@@ -249,18 +249,50 @@ static inline int __raw_write_can_lock(r - static inline void __raw_read_lock(raw_rwlock_t *rw) - { - asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" -- "jns 1f\n" -- "call __read_lock_failed\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" - "1:\n" -+ LOCK_PREFIX " addl $1,(%0)\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "jns 2f\n" -+ "call __read_lock_failed\n\t" -+ "2:\n" - ::LOCK_PTR_REG (rw) : "memory"); - } - - static inline void __raw_write_lock(raw_rwlock_t *rw) - { - asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" -- "jz 1f\n" -- "call __write_lock_failed\n\t" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" - "1:\n" -+ LOCK_PREFIX " addl %1,(%0)\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ "jz 2f\n" -+ "call __write_lock_failed\n\t" -+ "2:\n" - ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); - } - -@@ -286,12 +318,45 @@ static inline int __raw_write_trylock(ra - - static inline void __raw_read_unlock(raw_rwlock_t *rw) - { -- asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); -+ asm volatile(LOCK_PREFIX "incl %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "decl %0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ -+ :"+m" (rw->lock) : : "memory"); - } - - static inline void __raw_write_unlock(raw_rwlock_t *rw) - { -- asm volatile(LOCK_PREFIX "addl %1, %0" -+ asm volatile(LOCK_PREFIX "addl %1, %0\n" -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#ifdef CONFIG_X86_32 -+ "into\n0:\n" -+#else -+ "jno 0f\n" -+ "int $4\n0:\n" -+#endif -+ ".pushsection .fixup,"ax"\n" -+ "1:\n" -+ LOCK_PREFIX "subl %1,%0\n" -+ "jmp 0b\n" -+ ".popsection\n" -+ _ASM_EXTABLE(0b, 1b) -+#endif -+ - : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); - } - -diff -urNp linux-2.6.31.1/arch/x86/include/asm/system.h linux-2.6.31.1/arch/x86/include/asm/system.h ---- linux-2.6.31.1/arch/x86/include/asm/system.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/system.h 2009-10-01 20:12:42.000000000 -0400 -@@ -227,7 +227,7 @@ static inline unsigned long get_limit(un - { - 
unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); -- return __limit + 1; -+ return __limit; - } - - static inline void native_clts(void) -@@ -353,6 +353,23 @@ static inline void native_wbinvd(void) - - #define stts() write_cr0(read_cr0() | X86_CR0_TS) - -+#define pax_open_kernel(cr0) \ -+do { \ -+ typecheck(unsigned long, cr0); \ -+ preempt_disable(); \ -+ barrier(); \ -+ cr0 = read_cr0(); \ -+ write_cr0(cr0 & ~X86_CR0_WP); \ -+} while (0) -+ -+#define pax_close_kernel(cr0) \ -+do { \ -+ typecheck(unsigned long, cr0); \ -+ write_cr0(cr0); \ -+ barrier(); \ -+ preempt_enable_no_resched(); \ -+} while (0) -+ - #endif /* __KERNEL__ */ - - static inline void clflush(volatile void *__p) -@@ -367,7 +384,7 @@ void enable_hlt(void); - - void cpu_idle_wait(void); - --extern unsigned long arch_align_stack(unsigned long sp); -+#define arch_align_stack(x) ((x) & ~0xfUL) - extern void free_init_pages(char *what, unsigned long begin, unsigned long end); - - void default_idle(void); -diff -urNp linux-2.6.31.1/arch/x86/include/asm/uaccess_32.h linux-2.6.31.1/arch/x86/include/asm/uaccess_32.h ---- linux-2.6.31.1/arch/x86/include/asm/uaccess_32.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/uaccess_32.h 2009-10-01 20:12:42.000000000 -0400 -@@ -44,6 +44,9 @@ unsigned long __must_check __copy_from_u - static __always_inline unsigned long __must_check - __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ - if (__builtin_constant_p(n)) { - unsigned long ret; - -@@ -62,6 +65,8 @@ __copy_to_user_inatomic(void __user *to, - return ret; - } - } -+ if (!__builtin_constant_p(n)) -+ check_object_size(from, n, true); - return __copy_to_user_ll(to, from, n); - } - -@@ -89,6 +94,9 @@ __copy_to_user(void __user *to, const vo - static __always_inline unsigned long - __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) - { -+ if ((long)n < 0) -+ return n; -+ - /* Avoid zeroing the tail if the copy fails.. - * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, - * but as the zeroing behaviour is only significant when n is not -@@ -138,6 +146,10 @@ static __always_inline unsigned long - __copy_from_user(void *to, const void __user *from, unsigned long n) - { - might_fault(); -+ -+ if ((long)n < 0) -+ return n; -+ - if (__builtin_constant_p(n)) { - unsigned long ret; - -@@ -153,6 +165,8 @@ __copy_from_user(void *to, const void __ - return ret; - } - } -+ if (!__builtin_constant_p(n)) -+ check_object_size(to, n, false); - return __copy_from_user_ll(to, from, n); - } - -@@ -160,6 +174,10 @@ static __always_inline unsigned long __c - const void __user *from, unsigned long n) - { - might_fault(); -+ -+ if ((long)n < 0) -+ return n; -+ - if (__builtin_constant_p(n)) { - unsigned long ret; - -@@ -182,14 +200,62 @@ static __always_inline unsigned long - __copy_from_user_inatomic_nocache(void *to, const void __user *from, - unsigned long n) - { -- return __copy_from_user_ll_nocache_nozero(to, from, n); -+ if ((long)n < 0) -+ return n; -+ -+ return __copy_from_user_ll_nocache_nozero(to, from, n); -+} -+ -+/** -+ * copy_to_user: - Copy a block of data into user space. -+ * @to: Destination address, in user space. -+ * @from: Source address, in kernel space. -+ * @n: Number of bytes to copy. -+ * -+ * Context: User context only. This function may sleep. -+ * -+ * Copy data from kernel space to user space. -+ * -+ * Returns number of bytes that could not be copied. 
-+ * On success, this will be zero. -+ */ -+static __always_inline unsigned long __must_check -+copy_to_user(void __user *to, const void *from, unsigned long n) -+{ -+ if (access_ok(VERIFY_WRITE, to, n)) -+ n = __copy_to_user(to, from, n); -+ return n; -+} -+ -+/** -+ * copy_from_user: - Copy a block of data from user space. -+ * @to: Destination address, in kernel space. -+ * @from: Source address, in user space. -+ * @n: Number of bytes to copy. -+ * -+ * Context: User context only. This function may sleep. -+ * -+ * Copy data from user space to kernel space. -+ * -+ * Returns number of bytes that could not be copied. -+ * On success, this will be zero. -+ * -+ * If some data could not be copied, this function will pad the copied -+ * data to the requested size using zero bytes. -+ */ -+static __always_inline unsigned long __must_check -+copy_from_user(void *to, const void __user *from, unsigned long n) -+{ -+ if (access_ok(VERIFY_READ, from, n)) -+ n = __copy_from_user(to, from, n); -+ else if ((long)n > 0) { -+ if (!__builtin_constant_p(n)) -+ check_object_size(to, n, false); -+ memset(to, 0, n); -+ } -+ return n; - } - --unsigned long __must_check copy_to_user(void __user *to, -- const void *from, unsigned long n); --unsigned long __must_check copy_from_user(void *to, -- const void __user *from, -- unsigned long n); - long __must_check strncpy_from_user(char *dst, const char __user *src, - long count); - long __must_check __strncpy_from_user(char *dst, -diff -urNp linux-2.6.31.1/arch/x86/include/asm/uaccess_64.h linux-2.6.31.1/arch/x86/include/asm/uaccess_64.h ---- linux-2.6.31.1/arch/x86/include/asm/uaccess_64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/uaccess_64.h 2009-10-01 20:12:42.000000000 -0400 -@@ -10,6 +10,8 @@ - #include <linux/lockdep.h> - #include <asm/page.h> - -+#define set_fs(x) (current_thread_info()->addr_limit = (x)) -+ - /* - * Copy To/From Userspace - */ -@@ -19,20 +21,22 @@ __must_check unsigned long - copy_user_generic(void *to, const void *from, unsigned len); - - __must_check unsigned long --copy_to_user(void __user *to, const void *from, unsigned len); --__must_check unsigned long --copy_from_user(void *to, const void __user *from, unsigned len); --__must_check unsigned long - copy_in_user(void __user *to, const void __user *from, unsigned len); - - static __always_inline __must_check --int __copy_from_user(void *dst, const void __user *src, unsigned size) -+unsigned long __copy_from_user(void *dst, const void __user *src, unsigned size) - { -- int ret = 0; -+ unsigned ret = 0; - - might_fault(); -- if (!__builtin_constant_p(size)) -+ -+ if ((int)size < 0) -+ return size; -+ -+ if (!__builtin_constant_p(size)) { -+ check_object_size(dst, size, false); - return copy_user_generic(dst, (__force void *)src, size); -+ } - switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, - ret, "b", "b", "=q", 1); -@@ -70,13 +74,19 @@ int __copy_from_user(void *dst, const vo - } - - static __always_inline __must_check --int __copy_to_user(void __user *dst, const void *src, unsigned size) -+unsigned long __copy_to_user(void __user *dst, const void *src, unsigned size) - { -- int ret = 0; -+ unsigned ret = 0; - - might_fault(); -- if (!__builtin_constant_p(size)) -+ -+ if ((int)size < 0) -+ return size; -+ -+ if (!__builtin_constant_p(size)) { -+ check_object_size(src, size, true); - return copy_user_generic((__force void *)dst, src, size); -+ } - switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, - ret, "b", 
"b", "iq", 1); -@@ -114,11 +124,39 @@ int __copy_to_user(void __user *dst, con - } - - static __always_inline __must_check --int __copy_in_user(void __user *dst, const void __user *src, unsigned size) -+unsigned long copy_to_user(void __user *to, const void *from, unsigned len) - { -- int ret = 0; -+ if (access_ok(VERIFY_WRITE, to, len)) -+ len = __copy_to_user(to, from, len); -+ return len; -+} -+ -+static __always_inline __must_check -+unsigned long copy_from_user(void *to, const void __user *from, unsigned len) -+{ -+ if ((int)len < 0) -+ return len; -+ -+ if (access_ok(VERIFY_READ, from, len)) -+ len = __copy_from_user(to, from, len); -+ else if ((int)len > 0) { -+ if (!__builtin_constant_p(len)) -+ check_object_size(to, len, false); -+ memset(to, 0, len); -+ } -+ return len; -+} -+ -+static __always_inline __must_check -+unsigned long __copy_in_user(void __user *dst, const void __user *src, unsigned size) -+{ -+ unsigned ret = 0; - - might_fault(); -+ -+ if ((int)size < 0) -+ return size; -+ - if (!__builtin_constant_p(size)) - return copy_user_generic((__force void *)dst, - (__force void *)src, size); -@@ -179,30 +217,38 @@ __must_check unsigned long __clear_user( - __must_check long __copy_from_user_inatomic(void *dst, const void __user *src, - unsigned size); - --static __must_check __always_inline int -+static __must_check __always_inline unsigned long - __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) - { -+ if ((int)size < 0) -+ return size; -+ - return copy_user_generic((__force void *)dst, src, size); - } - --extern long __copy_user_nocache(void *dst, const void __user *src, -+extern unsigned long __copy_user_nocache(void *dst, const void __user *src, - unsigned size, int zerorest); - --static inline int --__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) -+static inline unsigned long __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) - { - might_sleep(); -+ -+ if ((int)size < 0) -+ return size; -+ - return __copy_user_nocache(dst, src, size, 1); - } - --static inline int --__copy_from_user_inatomic_nocache(void *dst, const void __user *src, -+static inline unsigned long __copy_from_user_inatomic_nocache(void *dst, const void __user *src, - unsigned size) - { -+ if ((int)size < 0) -+ return size; -+ - return __copy_user_nocache(dst, src, size, 0); - } - --unsigned long -+extern unsigned long - copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest); - - #endif /* _ASM_X86_UACCESS_64_H */ -diff -urNp linux-2.6.31.1/arch/x86/include/asm/uaccess.h linux-2.6.31.1/arch/x86/include/asm/uaccess.h ---- linux-2.6.31.1/arch/x86/include/asm/uaccess.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/uaccess.h 2009-10-01 20:12:42.000000000 -0400 -@@ -8,8 +8,11 @@ - #include <linux/thread_info.h> - #include <linux/prefetch.h> - #include <linux/string.h> -+#include <linux/sched.h> -+#include <linux/slab.h> - #include <asm/asm.h> - #include <asm/page.h> -+#include <asm/segment.h> - - #define VERIFY_READ 0 - #define VERIFY_WRITE 1 -@@ -29,7 +32,12 @@ - - #define get_ds() (KERNEL_DS) - #define get_fs() (current_thread_info()->addr_limit) -+#ifdef CONFIG_X86_32 -+void __set_fs(mm_segment_t x, int cpu); -+void set_fs(mm_segment_t x); -+#else - #define set_fs(x) (current_thread_info()->addr_limit = (x)) -+#endif - - #define segment_eq(a, b) ((a).seg == (b).seg) - -@@ -77,7 +85,26 @@ - * checks that the pointer is in the user space range - after calling - * this function, 
memory access functions may still return -EFAULT. - */ --#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) -+#define access_ok(type, addr, size) \ -+({ \ -+ bool __ret_ao = __range_not_ok(addr, size) == 0; \ -+ unsigned long __addr_ao = (unsigned long)addr & PAGE_MASK; \ -+ unsigned long __end_ao = (unsigned long)addr + size - 1; \ -+ if (__ret_ao && unlikely((__end_ao ^ __addr_ao) & PAGE_MASK)) { \ -+ for (; __addr_ao <= __end_ao; __addr_ao += PAGE_SIZE) { \ -+ char __c_ao; \ -+ if (size > PAGE_SIZE) \ -+ cond_resched(); \ -+ if (__get_user(__c_ao, (char __user *)__addr_ao))\ -+ break; \ -+ if (type != VERIFY_WRITE) \ -+ continue; \ -+ if (__put_user(__c_ao, (char __user *)__addr_ao))\ -+ break; \ -+ } \ -+ } \ -+ __ret_ao; \ -+}) - - /* - * The exception table consists of pairs of addresses: the first is the -@@ -183,13 +210,21 @@ extern int __get_user_bad(void); - asm volatile("call __put_user_" #size : "=a" (__ret_pu) \ - : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") - -- -+#ifdef CONFIG_X86_32 -+#define _ASM_LOAD_USER_DS(ds) "movw %w" #ds ",%%ds\n" -+#define _ASM_LOAD_KERNEL_DS "pushl %%ss; popl %%ds\n" -+#else -+#define _ASM_LOAD_USER_DS(ds) -+#define _ASM_LOAD_KERNEL_DS -+#endif - - #ifdef CONFIG_X86_32 - #define __put_user_asm_u64(x, addr, err, errret) \ -- asm volatile("1: movl %%eax,0(%2)\n" \ -- "2: movl %%edx,4(%2)\n" \ -+ asm volatile(_ASM_LOAD_USER_DS(5) \ -+ "1: movl %%eax,%%ds:0(%2)\n" \ -+ "2: movl %%edx,%%ds:4(%2)\n" \ - "3:\n" \ -+ _ASM_LOAD_KERNEL_DS \ - ".section .fixup,"ax"\n" \ - "4: movl %3,%0\n" \ - " jmp 3b\n" \ -@@ -197,15 +232,18 @@ extern int __get_user_bad(void); - _ASM_EXTABLE(1b, 4b) \ - _ASM_EXTABLE(2b, 4b) \ - : "=r" (err) \ -- : "A" (x), "r" (addr), "i" (errret), "0" (err)) -+ : "A" (x), "r" (addr), "i" (errret), "0" (err), \ -+ "r"(__USER_DS)) - - #define __put_user_asm_ex_u64(x, addr) \ -- asm volatile("1: movl %%eax,0(%1)\n" \ -- "2: movl %%edx,4(%1)\n" \ -+ asm volatile(_ASM_LOAD_USER_DS(2) \ -+ "1: movl %%eax,%%ds:0(%1)\n" \ -+ "2: movl %%edx,%%ds:4(%1)\n" \ - "3:\n" \ -+ _ASM_LOAD_KERNEL_DS \ - _ASM_EXTABLE(1b, 2b - 1b) \ - _ASM_EXTABLE(2b, 3b - 2b) \ -- : : "A" (x), "r" (addr)) -+ : : "A" (x), "r" (addr), "r"(__USER_DS)) - - #define __put_user_x8(x, ptr, __ret_pu) \ - asm volatile("call __put_user_8" : "=a" (__ret_pu) \ -@@ -374,16 +412,18 @@ do { \ - } while (0) - - #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ -- asm volatile("1: mov"itype" %2,%"rtype"1\n" \ -+ asm volatile(_ASM_LOAD_USER_DS(5) \ -+ "1: mov"itype" %%ds:%2,%"rtype"1\n" \ - "2:\n" \ -+ _ASM_LOAD_KERNEL_DS \ - ".section .fixup,"ax"\n" \ - "3: mov %3,%0\n" \ - " xor"itype" %"rtype"1,%"rtype"1\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ -- : "=r" (err), ltype(x) \ -- : "m" (__m(addr)), "i" (errret), "0" (err)) -+ : "=r" (err), ltype (x) \ -+ : "m" (__m(addr)), "i" (errret), "0" (err), "r"(__USER_DS)) - - #define __get_user_size_ex(x, ptr, size) \ - do { \ -@@ -407,10 +447,12 @@ do { \ - } while (0) - - #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ -- asm volatile("1: mov"itype" %1,%"rtype"0\n" \ -+ asm volatile(_ASM_LOAD_USER_DS(2) \ -+ "1: mov"itype" %%ds:%1,%"rtype"0\n" \ - "2:\n" \ -+ _ASM_LOAD_KERNEL_DS \ - _ASM_EXTABLE(1b, 2b - 1b) \ -- : ltype(x) : "m" (__m(addr))) -+ : ltype(x) : "m" (__m(addr)), "r"(__USER_DS)) - - #define __put_user_nocheck(x, ptr, size) \ - ({ \ -@@ -438,21 +480,26 @@ struct __large_struct { unsigned long bu - * aliasing issues. 
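The uaccess hunks above replace the out-of-line copy_to_user()/copy_from_user() with inline wrappers that reject sizes whose sign bit is set (a common integer-overflow symptom), verify the range with access_ok(), route non-constant sizes through check_object_size(), and zero the destination when the check fails so no stale kernel memory leaks back to the caller; the reworked access_ok() itself probes the range one byte per page, writing the byte back for VERIFY_WRITE. A compact userspace model of the copy_from_user() control flow (range_ok stands in for access_ok(), which this sketch does not remodel):

    #include <stdio.h>
    #include <string.h>

    /* Model of the hardened copy_from_user() added above:
     * returns the number of bytes NOT copied, zero on success. */
    static unsigned long model_copy_from_user(void *to, const void *from,
                                              unsigned long n, int range_ok)
    {
            if ((long)n < 0)        /* sign-flipped size: refuse outright */
                    return n;
            if (range_ok) {         /* access_ok(VERIFY_READ, from, n) */
                    memcpy(to, from, n);  /* __copy_from_user() stand-in */
                    return 0;
            }
            memset(to, 0, n);       /* failed check: zero-fill, no leak */
            return n;
    }

    int main(void)
    {
            char src[8] = "secret", dst[8] = "XXXXXXX";

            printf("%lu left\n", model_copy_from_user(dst, src, 7, 0));
            printf("dst %szeroed\n", dst[0] ? "NOT " : ""); /* zeroed */
            return 0;
    }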
- */ - #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ -- asm volatile("1: mov"itype" %"rtype"1,%2\n" \ -+ asm volatile(_ASM_LOAD_USER_DS(5) \ -+ "1: mov"itype" %"rtype"1,%%ds:%2\n" \ - "2:\n" \ -+ _ASM_LOAD_KERNEL_DS \ - ".section .fixup,"ax"\n" \ - "3: mov %3,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : "=r"(err) \ -- : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) -+ : ltype (x), "m" (__m(addr)), "i" (errret), "0" (err),\ -+ "r"(__USER_DS)) - - #define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ -- asm volatile("1: mov"itype" %"rtype"0,%1\n" \ -+ asm volatile(_ASM_LOAD_USER_DS(2) \ -+ "1: mov"itype" %"rtype"0,%%ds:%1\n" \ - "2:\n" \ -+ _ASM_LOAD_KERNEL_DS \ - _ASM_EXTABLE(1b, 2b - 1b) \ -- : : ltype(x), "m" (__m(addr))) -+ : : ltype(x), "m" (__m(addr)), "r"(__USER_DS)) - - /* - * uaccess_try and catch -@@ -567,6 +614,7 @@ extern struct movsl_mask { - - #define ARCH_HAS_NOCACHE_UACCESS 1 - -+#define ARCH_HAS_SORT_EXTABLE - #ifdef CONFIG_X86_32 - # include "uaccess_32.h" - #else -diff -urNp linux-2.6.31.1/arch/x86/include/asm/vgtod.h linux-2.6.31.1/arch/x86/include/asm/vgtod.h ---- linux-2.6.31.1/arch/x86/include/asm/vgtod.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/vgtod.h 2009-10-01 20:12:42.000000000 -0400 -@@ -14,6 +14,7 @@ struct vsyscall_gtod_data { - int sysctl_enabled; - struct timezone sys_tz; - struct { /* extract of a clocksource struct */ -+ char name[8]; - cycle_t (*vread)(void); - cycle_t cycle_last; - cycle_t mask; -diff -urNp linux-2.6.31.1/arch/x86/include/asm/vsyscall.h linux-2.6.31.1/arch/x86/include/asm/vsyscall.h ---- linux-2.6.31.1/arch/x86/include/asm/vsyscall.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/include/asm/vsyscall.h 2009-10-01 20:12:42.000000000 -0400 -@@ -15,9 +15,10 @@ enum vsyscall_num { - - #ifdef __KERNEL__ - #include <linux/seqlock.h> -+#include <linux/getcpu.h> -+#include <linux/time.h> - - #define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) --#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) - - /* Definitions for CONFIG_GENERIC_TIME definitions */ - #define __section_vsyscall_gtod_data __attribute__ \ -@@ -31,7 +32,6 @@ enum vsyscall_num { - #define VGETCPU_LSL 2 - - extern int __vgetcpu_mode; --extern volatile unsigned long __jiffies; - - /* kernel space (writeable) */ - extern int vgetcpu_mode; -@@ -39,6 +39,9 @@ extern struct timezone sys_tz; - - extern void map_vsyscall(void); - -+extern int vgettimeofday(struct timeval * tv, struct timezone * tz); -+extern time_t vtime(time_t *t); -+extern long vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache); - #endif /* __KERNEL__ */ - - #endif /* _ASM_X86_VSYSCALL_H */ -diff -urNp linux-2.6.31.1/arch/x86/Kconfig linux-2.6.31.1/arch/x86/Kconfig ---- linux-2.6.31.1/arch/x86/Kconfig 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/Kconfig 2009-10-01 20:12:42.000000000 -0400 -@@ -1098,7 +1098,7 @@ config PAGE_OFFSET - hex - default 0xB0000000 if VMSPLIT_3G_OPT - default 0x80000000 if VMSPLIT_2G -- default 0x78000000 if VMSPLIT_2G_OPT -+ default 0x70000000 if VMSPLIT_2G_OPT - default 0x40000000 if VMSPLIT_1G - default 0xC0000000 - depends on X86_32 -@@ -1416,7 +1416,7 @@ config X86_PAT - - config EFI - bool "EFI runtime service support" -- depends on ACPI -+ depends on ACPI && !PAX_KERNEXEC - ---help--- - This enables the kernel to use EFI runtime services that are - available 
(such as the EFI variable services). -@@ -1602,9 +1602,10 @@ config HOTPLUG_CPU - Say N if you want to disable CPU hotplug. - - config COMPAT_VDSO -- def_bool y -+ def_bool n - prompt "Compat VDSO support" - depends on X86_32 || IA32_EMULATION -+ depends on !PAX_NOEXEC && !PAX_MEMORY_UDEREF - ---help--- - Map the 32-bit VDSO to the predictable old-style address too. - ---help--- -diff -urNp linux-2.6.31.1/arch/x86/Kconfig.cpu linux-2.6.31.1/arch/x86/Kconfig.cpu ---- linux-2.6.31.1/arch/x86/Kconfig.cpu 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/Kconfig.cpu 2009-10-01 20:12:42.000000000 -0400 -@@ -331,7 +331,7 @@ config X86_PPRO_FENCE - - config X86_F00F_BUG - def_bool y -- depends on M586MMX || M586TSC || M586 || M486 || M386 -+ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !PAX_KERNEXEC - - config X86_WP_WORKS_OK - def_bool y -@@ -351,7 +351,7 @@ config X86_POPAD_OK - - config X86_ALIGNMENT_16 - def_bool y -- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK8 || MK7 || MK6 || MCORE2 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 - - config X86_INTEL_USERCOPY - def_bool y -@@ -397,7 +397,7 @@ config X86_CMPXCHG64 - # generates cmov. - config X86_CMOV - def_bool y -- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) -+ depends on (MK8 || MK7 || MCORE2 || MPSC || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) - - config X86_MINIMUM_CPU_FAMILY - int -diff -urNp linux-2.6.31.1/arch/x86/Kconfig.debug linux-2.6.31.1/arch/x86/Kconfig.debug ---- linux-2.6.31.1/arch/x86/Kconfig.debug 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/Kconfig.debug 2009-10-01 20:12:42.000000000 -0400 -@@ -99,7 +99,7 @@ config X86_PTDUMP - config DEBUG_RODATA - bool "Write protect kernel read-only data structures" - default y -- depends on DEBUG_KERNEL -+ depends on DEBUG_KERNEL && BROKEN - ---help--- - Mark the kernel read-only data as write-protected in the pagetables, - in order to catch accidental (and incorrect) writes to such const -diff -urNp linux-2.6.31.1/arch/x86/kernel/acpi/boot.c linux-2.6.31.1/arch/x86/kernel/acpi/boot.c ---- linux-2.6.31.1/arch/x86/kernel/acpi/boot.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/acpi/boot.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1609,7 +1609,7 @@ static struct dmi_system_id __initdata a - DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), - }, - }, -- {} -+ { NULL, NULL, {{0, {0}}}, NULL} - }; - - /* -diff -urNp linux-2.6.31.1/arch/x86/kernel/acpi/realmode/wakeup.S linux-2.6.31.1/arch/x86/kernel/acpi/realmode/wakeup.S ---- linux-2.6.31.1/arch/x86/kernel/acpi/realmode/wakeup.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/acpi/realmode/wakeup.S 2009-10-01 20:12:42.000000000 -0400 -@@ -104,7 +104,7 @@ _start: - movl %eax, %ecx - orl %edx, %ecx - jz 1f -- movl $0xc0000080, %ecx -+ mov $MSR_EFER, %ecx - wrmsr - 1: - -diff -urNp linux-2.6.31.1/arch/x86/kernel/acpi/sleep.c linux-2.6.31.1/arch/x86/kernel/acpi/sleep.c ---- linux-2.6.31.1/arch/x86/kernel/acpi/sleep.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/acpi/sleep.c 2009-10-01 20:12:42.000000000 -0400 -@@ -11,11 
+11,12 @@ - #include <linux/cpumask.h> - #include <asm/segment.h> - #include <asm/desc.h> -+#include <asm/e820.h> - - #include "realmode/wakeup.h" - #include "sleep.h" - --unsigned long acpi_wakeup_address; -+unsigned long acpi_wakeup_address = 0x2000; - unsigned long acpi_realmode_flags; - - /* address in low memory of the wakeup routine. */ -@@ -37,6 +38,10 @@ int acpi_save_state_mem(void) - { - struct wakeup_header *header; - -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) && defined(CONFIG_PAX_KERNEXEC) -+ unsigned long cr0; -+#endif -+ - if (!acpi_realmode) { - printk(KERN_ERR "Could not allocate memory during boot, " - "S3 disabled\n"); -@@ -99,8 +104,18 @@ int acpi_save_state_mem(void) - header->trampoline_segment = setup_trampoline() >> 4; - #ifdef CONFIG_SMP - stack_start.sp = temp_stack + sizeof(temp_stack); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - early_gdt_descr.address = - (unsigned long)get_cpu_gdt_table(smp_processor_id()); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - initial_gs = per_cpu_offset(smp_processor_id()); - #endif - initial_code = (unsigned long)wakeup_long64; -@@ -134,14 +149,8 @@ void __init acpi_reserve_bootmem(void) - return; - } - -- acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); -- -- if (!acpi_realmode) { -- printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); -- return; -- } -- -- acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); -+ reserve_early(acpi_wakeup_address, acpi_wakeup_address + WAKEUP_SIZE, "ACPI Wakeup Code"); -+ acpi_realmode = (unsigned long)__va(acpi_wakeup_address);; - } - - -diff -urNp linux-2.6.31.1/arch/x86/kernel/acpi/wakeup_32.S linux-2.6.31.1/arch/x86/kernel/acpi/wakeup_32.S ---- linux-2.6.31.1/arch/x86/kernel/acpi/wakeup_32.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/acpi/wakeup_32.S 2009-10-01 20:12:42.000000000 -0400 -@@ -30,13 +30,11 @@ wakeup_pmode_return: - # and restore the stack ... but you need gdt for this to work - movl saved_context_esp, %esp - -- movl %cs:saved_magic, %eax -- cmpl $0x12345678, %eax -+ cmpl $0x12345678, saved_magic - jne bogus_magic - - # jump to place where we left off -- movl saved_eip, %eax -- jmp *%eax -+ jmp *(saved_eip) - - bogus_magic: - jmp bogus_magic -diff -urNp linux-2.6.31.1/arch/x86/kernel/alternative.c linux-2.6.31.1/arch/x86/kernel/alternative.c ---- linux-2.6.31.1/arch/x86/kernel/alternative.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/alternative.c 2009-10-01 20:12:42.000000000 -0400 -@@ -400,7 +400,7 @@ void apply_paravirt(struct paravirt_patc - - BUG_ON(p->len > MAX_PATCH_LEN); - /* prep the buffer with the original instructions */ -- memcpy(insnbuf, p->instr, p->len); -+ memcpy(insnbuf, ktla_ktva(p->instr), p->len); - used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, - (unsigned long)p->instr, p->len); - -@@ -485,11 +485,26 @@ void __init alternative_instructions(voi - * instructions. And on the local CPU you need to be protected again NMI or MCE - * handlers seeing an inconsistent instruction while you patch. 
- */ --void *text_poke_early(void *addr, const void *opcode, size_t len) -+void *__kprobes text_poke_early(void *addr, const void *opcode, size_t len) - { - unsigned long flags; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - local_irq_save(flags); -- memcpy(addr, opcode, len); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ memcpy(ktla_ktva(addr), opcode, len); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - local_irq_restore(flags); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but -@@ -512,35 +527,27 @@ void *text_poke_early(void *addr, const - */ - void *__kprobes text_poke(void *addr, const void *opcode, size_t len) - { -- unsigned long flags; -- char *vaddr; -+ unsigned char *vaddr = ktla_ktva(addr); - struct page *pages[2]; -- int i; -+ size_t i; -+ -+ if (!core_kernel_text((unsigned long)addr) - -- if (!core_kernel_text((unsigned long)addr)) { -- pages[0] = vmalloc_to_page(addr); -- pages[1] = vmalloc_to_page(addr + PAGE_SIZE); -+#if defined(CONFIG_X86_32) && defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ && (vaddr < MODULES_EXEC_VADDR || MODULES_EXEC_END < vaddr) -+#endif -+ -+ ) { -+ pages[0] = vmalloc_to_page(vaddr); -+ pages[1] = vmalloc_to_page(vaddr + PAGE_SIZE); - } else { -- pages[0] = virt_to_page(addr); -+ pages[0] = virt_to_page(vaddr); - WARN_ON(!PageReserved(pages[0])); -- pages[1] = virt_to_page(addr + PAGE_SIZE); -+ pages[1] = virt_to_page(vaddr + PAGE_SIZE); - } - BUG_ON(!pages[0]); -- local_irq_save(flags); -- set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); -- if (pages[1]) -- set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); -- vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); -- memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); -- clear_fixmap(FIX_TEXT_POKE0); -- if (pages[1]) -- clear_fixmap(FIX_TEXT_POKE1); -- local_flush_tlb(); -- sync_core(); -- /* Could also do a CLFLUSH here to speed up CPU recovery; but -- that causes hangs on some VIA CPUs. 
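Under CONFIG_PAX_KERNEXEC the kernel's own text is write-protected, so alternative.c has to lift that protection around patching: text_poke_early() above now brackets its memcpy() with pax_open_kernel()/pax_close_kernel() (the CR0.WP toggle defined in the asm/system.h hunk earlier in this diff) and writes through ktla_ktva(), the kernel-text to kernel-virtual alias, while text_poke() (continued just below) collapses onto the same path. A userspace analogue of the open/patch/close pattern, with mprotect() standing in for the CR0.WP flip; this is an illustration, not the patch's actual mechanism:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            unsigned char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (buf == MAP_FAILED)
                    return 1;
            memcpy(buf, "old", 4);
            mprotect(buf, page, PROT_READ);  /* "kernel text" is now RO */

            mprotect(buf, page, PROT_READ | PROT_WRITE); /* pax_open_kernel */
            memcpy(buf, "new", 4);                       /* the actual poke */
            mprotect(buf, page, PROT_READ);              /* pax_close_kernel */

            printf("%s\n", buf);                         /* prints "new" */
            return 0;
    }

Note why the real macros also disable preemption: CR0.WP is per-CPU state, so the task must not migrate between the open and the close.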
*/ -+ text_poke_early(addr, opcode, len); - for (i = 0; i < len; i++) -- BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); -- local_irq_restore(flags); -+ BUG_ON(((char *)vaddr)[i] != ((char *)opcode)[i]); - return addr; - } -diff -urNp linux-2.6.31.1/arch/x86/kernel/apm_32.c linux-2.6.31.1/arch/x86/kernel/apm_32.c ---- linux-2.6.31.1/arch/x86/kernel/apm_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/apm_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -403,7 +403,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitq - static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); - static struct apm_user *user_list; - static DEFINE_SPINLOCK(user_list_lock); --static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; -+static const struct desc_struct bad_bios_desc = { { { 0, 0x00409300 } } }; - - static const char driver_version[] = "1.16ac"; /* no spaces */ - -@@ -576,12 +576,25 @@ static long __apm_bios_call(void *_call) - struct desc_struct *gdt; - struct apm_bios_call *call = _call; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - cpu = get_cpu(); - BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); - save_desc_40 = gdt[0x40 / 8]; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - gdt[0x40 / 8] = bad_bios_desc; - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - apm_irq_save(flags); - APM_DO_SAVE_SEGS; - apm_bios_call_asm(call->func, call->ebx, call->ecx, -@@ -589,7 +602,17 @@ static long __apm_bios_call(void *_call) - &call->esi); - APM_DO_RESTORE_SEGS; - apm_irq_restore(flags); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - gdt[0x40 / 8] = save_desc_40; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - put_cpu(); - - return call->eax & 0xff; -@@ -652,19 +675,42 @@ static long __apm_bios_call_simple(void - struct desc_struct *gdt; - struct apm_bios_call *call = _call; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - cpu = get_cpu(); - BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); - save_desc_40 = gdt[0x40 / 8]; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - gdt[0x40 / 8] = bad_bios_desc; - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - apm_irq_save(flags); - APM_DO_SAVE_SEGS; - error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, - &call->eax); - APM_DO_RESTORE_SEGS; - apm_irq_restore(flags); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - gdt[0x40 / 8] = save_desc_40; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - put_cpu(); - return error; - } -@@ -967,7 +1013,7 @@ recalc: - - static void apm_power_off(void) - { -- unsigned char po_bios_call[] = { -+ const unsigned char po_bios_call[] = { - 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ - 0x8e, 0xd0, /* movw ax,ss */ - 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ -@@ -1925,7 +1971,10 @@ static const struct file_operations apm_ - static struct miscdevice apm_device = { - APM_MINOR_DEV, - "apm_bios", -- &apm_bios_fops -+ &apm_bios_fops, -+ {NULL, NULL}, -+ NULL, -+ NULL - }; - - -@@ -2246,7 +2295,7 @@ static struct dmi_system_id __initdata a - { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, - }, - -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL} - }; - - /* -@@ -2264,6 +2313,10 @@ static int __init apm_init(void) - struct desc_struct *gdt; - int err; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - dmi_check_system(apm_dmi_table); - - if (apm_info.bios.version == 0 || 
paravirt_enabled() || machine_is_olpc()) { -@@ -2337,9 +2390,18 @@ static int __init apm_init(void) - * This is for buggy BIOS's that refer to (real mode) segment 0x40 - * even though they are called in protected mode. - */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - /* - * Set up the long jump entry point to the APM BIOS, which is called - * from inline assembly. -@@ -2358,6 +2420,11 @@ static int __init apm_init(void) - * code to that CPU. - */ - gdt = get_cpu_gdt_table(0); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], -@@ -2365,6 +2432,10 @@ static int __init apm_init(void) - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - proc_create("apm", 0, NULL, &apm_file_ops); - - kapmd_task = kthread_create(apm, NULL, "kapmd"); -diff -urNp linux-2.6.31.1/arch/x86/kernel/asm-offsets_32.c linux-2.6.31.1/arch/x86/kernel/asm-offsets_32.c ---- linux-2.6.31.1/arch/x86/kernel/asm-offsets_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/asm-offsets_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -115,6 +115,7 @@ void foo(void) - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); - OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); -+ OFFSET(PV_CPU_write_cr0, pv_cpu_ops, write_cr0); - #endif - - #ifdef CONFIG_XEN -diff -urNp linux-2.6.31.1/arch/x86/kernel/asm-offsets_64.c linux-2.6.31.1/arch/x86/kernel/asm-offsets_64.c ---- linux-2.6.31.1/arch/x86/kernel/asm-offsets_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/asm-offsets_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -114,6 +114,7 @@ int main(void) - ENTRY(cr8); - BLANK(); - #undef ENTRY -+ DEFINE(TSS_size, sizeof(struct tss_struct)); - DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); - BLANK(); - DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); -diff -urNp linux-2.6.31.1/arch/x86/kernel/cpu/common.c linux-2.6.31.1/arch/x86/kernel/cpu/common.c ---- linux-2.6.31.1/arch/x86/kernel/cpu/common.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/cpu/common.c 2009-10-01 20:12:42.000000000 -0400 -@@ -84,60 +84,6 @@ static const struct cpu_dev __cpuinitcon - - static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - --DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { --#ifdef CONFIG_X86_64 -- /* -- * We need valid kernel segments for data and code in long mode too -- * IRET will check the segment types kkeil 2000/10/28 -- * Also sysret mandates a special GDT layout -- * -- * TLS descriptors are currently at a different place compared to i386. -- * Hopefully nobody expects them at a fixed place (Wine?) 
-- */
-- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
--#else
-- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
-- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
-- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
-- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
-- /*
-- * Segments used for calling PnP BIOS have byte granularity.
-- * They code segments and data segments have fixed 64k limits,
-- * the transfer segment sizes are set at run time.
-- */
-- /* 32-bit code */
-- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
-- /* 16-bit code */
-- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
-- /* 16-bit data */
-- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
-- /* 16-bit data */
-- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
-- /* 16-bit data */
-- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
-- /*
-- * The APM segments have byte granularity and their bases
-- * are set at run time. All have 64k limits.
-- */
-- /* 32-bit code */
-- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
-- /* 16-bit code */
-- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
-- /* data */
-- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
--
-- [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
-- [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
-- GDT_STACK_CANARY_INIT
--#endif
--} };
--EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
--
- static int __init x86_xsave_setup(char *s)
- {
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-@@ -345,7 +291,7 @@ void switch_to_new_gdt(int cpu)
- {
- struct desc_ptr gdt_descr;
-
-- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
-+ gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
- /* Reload the per-cpu base */
-@@ -799,6 +745,10 @@ static void __cpuinit identify_cpu(struc
- /* Filter out anything that depends on CPUID levels we don't have */
- filter_cpuid_features(c, true);
-+
-+#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_KERNEXEC) || defined(CONFIG_PAX_MEMORY_UDEREF)
-+ setup_clear_cpu_cap(X86_FEATURE_SEP);
-+#endif
-+
- /* If the model name is still unset, do table lookup. */
- if (!c->x86_model_id[0]) {
- const char *p;
-@@ -982,7 +932,7 @@ static __init int setup_disablecpuid(cha
- __setup("clearcpuid=", setup_disablecpuid);
-
- #ifdef CONFIG_X86_64
--struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-+struct desc_ptr idt_descr __read_only = { 256 * 16 - 1, (unsigned long) idt_table };
-
- DEFINE_PER_CPU_FIRST(union irq_stack_union,
- irq_stack_union) __aligned(PAGE_SIZE);
-@@ -1092,7 +1042,7 @@ void __cpuinit cpu_init(void)
- int i;
-
- cpu = stack_smp_processor_id();
-- t = &per_cpu(init_tss, cpu);
-+ t = init_tss + cpu;
- orig_ist = &per_cpu(orig_ist, cpu);
-
- #ifdef CONFIG_NUMA
-@@ -1190,7 +1140,7 @@ void __cpuinit cpu_init(void)
- {
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
-- struct tss_struct *t = &per_cpu(init_tss, cpu);
-+ struct tss_struct *t = init_tss + cpu;
- struct thread_struct *thread = &curr->thread;
-
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
-diff -urNp linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
---- linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2009-10-01 20:12:42.000000000 -0400
-@@ -586,7 +586,7 @@ static const struct dmi_system_id sw_any
- DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
- },
- },
-- { }
-+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL }
- };
- #endif
-
-diff -urNp linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
---- linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c 2009-10-01 20:12:42.000000000 -0400
-@@ -225,7 +225,7 @@ static struct cpu_model models[] =
- { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
-
-- { NULL, }
-+ { NULL, NULL, 0, NULL}
- };
- #undef _BANIAS
- #undef BANIAS
-diff -urNp linux-2.6.31.1/arch/x86/kernel/cpu/intel.c linux-2.6.31.1/arch/x86/kernel/cpu/intel.c
---- linux-2.6.31.1/arch/x86/kernel/cpu/intel.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/cpu/intel.c 2009-10-01 20:12:42.000000000 -0400
-@@ -140,7 +140,7 @@ static void __cpuinit trap_init_f00f_bug
- * Update the IDT descriptor and reload the IDT so that
- * it uses the read-only mapped virtual address.
- */
-- idt_descr.address = fix_to_virt(FIX_F00F_IDT);
-+ idt_descr.address = (struct desc_struct *)fix_to_virt(FIX_F00F_IDT);
- load_idt(&idt_descr);
- }
- #endif
-diff -urNp linux-2.6.31.1/arch/x86/kernel/cpu/Makefile linux-2.6.31.1/arch/x86/kernel/cpu/Makefile
---- linux-2.6.31.1/arch/x86/kernel/cpu/Makefile 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/cpu/Makefile 2009-10-01 20:12:42.000000000 -0400
-@@ -7,10 +7,6 @@ ifdef CONFIG_FUNCTION_TRACER
- CFLAGS_REMOVE_common.o = -pg
- endif
-
--# Make sure load_percpu_segment has no stackprotector
--nostackp := $(call cc-option, -fno-stack-protector)
--CFLAGS_common.o := $(nostackp)
--
- obj-y := intel_cacheinfo.o addon_cpuid_features.o
- obj-y += proc.o capflags.o powerflags.o common.o
- obj-y += vmware.o hypervisor.o
-diff -urNp linux-2.6.31.1/arch/x86/kernel/cpu/mcheck/mce.c linux-2.6.31.1/arch/x86/kernel/cpu/mcheck/mce.c
---- linux-2.6.31.1/arch/x86/kernel/cpu/mcheck/mce.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/cpu/mcheck/mce.c 2009-10-01 20:12:42.000000000 -0400
-@@ -1370,14 +1370,14 @@ void __cpuinit mcheck_init(struct cpuinf
- */
-
- static DEFINE_SPINLOCK(mce_state_lock);
--static int open_count; /* #times opened */
-+static atomic_t open_count; /* #times opened */
- static int open_exclu; /* already open exclusive? */
-
- static int mce_open(struct inode *inode, struct file *file)
- {
- spin_lock(&mce_state_lock);
-
-- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
-+ if (open_exclu || (atomic_read(&open_count) && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
-
- return -EBUSY;
-@@ -1385,7 +1385,7 @@ static int mce_open(struct inode *inode,
-
- if (file->f_flags & O_EXCL)
- open_exclu = 1;
-- open_count++;
-+ atomic_inc(&open_count);
-
- spin_unlock(&mce_state_lock);
-
-@@ -1396,7 +1396,7 @@ static int mce_release(struct inode *ino
- {
- spin_lock(&mce_state_lock);
-
-- open_count--;
-+ atomic_dec(&open_count);
- open_exclu = 0;
-
- spin_unlock(&mce_state_lock);
-@@ -1536,6 +1536,7 @@ static struct miscdevice mce_log_device
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-+ {NULL, NULL}, NULL, NULL
- };
-
- /*
-diff -urNp linux-2.6.31.1/arch/x86/kernel/cpu/mtrr/generic.c linux-2.6.31.1/arch/x86/kernel/cpu/mtrr/generic.c
---- linux-2.6.31.1/arch/x86/kernel/cpu/mtrr/generic.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/cpu/mtrr/generic.c 2009-10-01 20:12:42.000000000 -0400
-@@ -23,14 +23,14 @@ static struct fixed_range_block fixed_ra
- { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
- { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
- { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
-- {}
-+ { 0, 0 }
- };
-
- static unsigned long smp_changes_mask;
- static int mtrr_state_set;
- u64 mtrr_tom2;
-
--struct mtrr_state_type mtrr_state = {};
-+struct mtrr_state_type mtrr_state;
- EXPORT_SYMBOL_GPL(mtrr_state);
-
- /**
-diff -urNp linux-2.6.31.1/arch/x86/kernel/crash.c linux-2.6.31.1/arch/x86/kernel/crash.c
---- linux-2.6.31.1/arch/x86/kernel/crash.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/crash.c 2009-10-01 20:12:42.000000000 -0400
-@@ -42,7 +42,7 @@ static void kdump_nmi_callback(int cpu,
- regs = args->regs;
-
- #ifdef CONFIG_X86_32
-- if (!user_mode_vm(regs)) {
-+ if (!user_mode(regs)) {
- crash_fixup_ss_esp(&fixed_regs, regs);
- regs = &fixed_regs;
- }
-diff -urNp linux-2.6.31.1/arch/x86/kernel/doublefault_32.c linux-2.6.31.1/arch/x86/kernel/doublefault_32.c
---- linux-2.6.31.1/arch/x86/kernel/doublefault_32.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/doublefault_32.c 2009-10-01 20:12:42.000000000 -0400
-@@ -11,7 +11,7 @@
-
- #define DOUBLEFAULT_STACKSIZE (1024)
- static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
--#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-+#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE-2)
-
- #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
-
-@@ -21,7 +21,7 @@ static void doublefault_fn(void)
- unsigned long gdt, tss;
-
- store_gdt(&gdt_desc);
-- gdt = gdt_desc.address;
-+ gdt = (unsigned long)gdt_desc.address;
-
- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
-
-@@ -60,10 +60,10 @@ struct tss_struct doublefault_tss __cach
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
-- .es = __USER_DS,
-+ .es = __KERNEL_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
-- .ds = __USER_DS,
-+ .ds = __KERNEL_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __pa_nodebug(swapper_pg_dir),
-diff -urNp linux-2.6.31.1/arch/x86/kernel/dumpstack_32.c linux-2.6.31.1/arch/x86/kernel/dumpstack_32.c
---- linux-2.6.31.1/arch/x86/kernel/dumpstack_32.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/dumpstack_32.c 2009-10-01 20:12:42.000000000 -0400
-@@ -113,11 +113,12 @@ void show_registers(struct pt_regs *regs
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
- */
-- if (!user_mode_vm(regs)) {
-+ if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-+ unsigned long cs_base = get_desc_base(&get_cpu_gdt_table(smp_processor_id())[(0xffff & regs->cs) >> 3]);
-
- printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp,
-@@ -125,10 +126,10 @@ void show_registers(struct pt_regs *regs
-
- printk(KERN_EMERG "Code: ");
-
-- ip = (u8 *)regs->ip - code_prologue;
-+ ip = (u8 *)regs->ip - code_prologue + cs_base;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
-- ip = (u8 *)regs->ip;
-+ ip = (u8 *)regs->ip + cs_base;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
-@@ -137,7 +138,7 @@ void show_registers(struct pt_regs *regs
- printk(" Bad EIP value.");
- break;
- }
-- if (ip == (u8 *)regs->ip)
-+ if (ip == (u8 *)regs->ip + cs_base)
- printk("<%02x> ", c);
- else
- printk("%02x ", c);
-@@ -150,6 +151,7 @@ int is_valid_bugaddr(unsigned long ip)
- {
- unsigned short ud2;
-
-+ ip = ktla_ktva(ip);
- if (ip < PAGE_OFFSET)
- return 0;
- if (probe_kernel_address((unsigned short *)ip, ud2))
-diff -urNp linux-2.6.31.1/arch/x86/kernel/dumpstack.c linux-2.6.31.1/arch/x86/kernel/dumpstack.c
---- linux-2.6.31.1/arch/x86/kernel/dumpstack.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/dumpstack.c 2009-10-01 20:12:42.000000000 -0400
-@@ -181,7 +181,7 @@ void dump_stack(void)
- #endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-- current->pid, current->comm, print_tainted(),
-+ task_pid_nr(current), current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
-@@ -242,7 +242,7 @@ void __kprobes oops_end(unsigned long fl
- panic("Fatal exception in interrupt");
- if (panic_on_oops)
- panic("Fatal exception");
-- do_exit(signr);
-+ do_group_exit(signr);
- }
-
- int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-@@ -296,7 +296,7 @@ void die(const char *str, struct pt_regs
- unsigned long flags = oops_begin();
- int sig = SIGSEGV;
-
-- if (!user_mode_vm(regs))
-+ if (!user_mode(regs))
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
-diff -urNp linux-2.6.31.1/arch/x86/kernel/e820.c linux-2.6.31.1/arch/x86/kernel/e820.c
---- linux-2.6.31.1/arch/x86/kernel/e820.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/e820.c 2009-10-01 20:12:42.000000000 -0400
-@@ -733,7 +733,10 @@ struct early_res {
- };
- static struct early_res early_res[MAX_EARLY_RES] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-- {}
-+#ifdef CONFIG_VM86
-+ { PAGE_SIZE, ISA_START_ADDRESS, "V86 mode memory", 1 },
-+#endif
-+ { 0, 0, {0}, 0 }
- };
-
- static int __init find_overlapped_early(u64 start, u64 end)
-diff -urNp linux-2.6.31.1/arch/x86/kernel/efi_32.c linux-2.6.31.1/arch/x86/kernel/efi_32.c
---- linux-2.6.31.1/arch/x86/kernel/efi_32.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/efi_32.c 2009-10-01 20:12:42.000000000 -0400
-@@ -38,70 +38,38 @@
- */
-
- static unsigned long efi_rt_eflags;
--static pgd_t efi_bak_pg_dir_pointer[2];
-+static pgd_t __initdata efi_bak_pg_dir_pointer[KERNEL_PGD_PTRS];
-
--void efi_call_phys_prelog(void)
-+void __init efi_call_phys_prelog(void)
- {
-- unsigned long cr4;
-- unsigned long temp;
- struct desc_ptr gdt_descr;
-
- local_irq_save(efi_rt_eflags);
-
-- /*
-- * If I don't have PAE, I should just duplicate two entries in page
-- * directory. If I have PAE, I just need to duplicate one entry in
-- * page directory.
-- */
-- cr4 = read_cr4_safe();
-
-- if (cr4 & X86_CR4_PAE) {
-- efi_bak_pg_dir_pointer[0].pgd =
-- swapper_pg_dir[pgd_index(0)].pgd;
-- swapper_pg_dir[0].pgd =
-- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
-- } else {
-- efi_bak_pg_dir_pointer[0].pgd =
-- swapper_pg_dir[pgd_index(0)].pgd;
-- efi_bak_pg_dir_pointer[1].pgd =
-- swapper_pg_dir[pgd_index(0x400000)].pgd;
-- swapper_pg_dir[pgd_index(0)].pgd =
-- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
-- temp = PAGE_OFFSET + 0x400000;
-- swapper_pg_dir[pgd_index(0x400000)].pgd =
-- swapper_pg_dir[pgd_index(temp)].pgd;
-- }
-+ clone_pgd_range(efi_bak_pg_dir_pointer, swapper_pg_dir, KERNEL_PGD_PTRS);
-+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
-+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
-
- /*
- * After the lock is released, the original page table is restored.
- */
- __flush_tlb_all();
-
-- gdt_descr.address = __pa(get_cpu_gdt_table(0));
-+ gdt_descr.address = (struct desc_struct *)__pa(get_cpu_gdt_table(0));
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
- }
-
--void efi_call_phys_epilog(void)
-+void __init efi_call_phys_epilog(void)
- {
-- unsigned long cr4;
- struct desc_ptr gdt_descr;
-
-- gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
-+ gdt_descr.address = get_cpu_gdt_table(0);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
-
-- cr4 = read_cr4_safe();
--
-- if (cr4 & X86_CR4_PAE) {
-- swapper_pg_dir[pgd_index(0)].pgd =
-- efi_bak_pg_dir_pointer[0].pgd;
-- } else {
-- swapper_pg_dir[pgd_index(0)].pgd =
-- efi_bak_pg_dir_pointer[0].pgd;
-- swapper_pg_dir[pgd_index(0x400000)].pgd =
-- efi_bak_pg_dir_pointer[1].pgd;
-- }
-+ clone_pgd_range(swapper_pg_dir, efi_bak_pg_dir_pointer, KERNEL_PGD_PTRS);
-
- /*
- * After the lock is released, the original page table is restored.
-diff -urNp linux-2.6.31.1/arch/x86/kernel/efi_stub_32.S linux-2.6.31.1/arch/x86/kernel/efi_stub_32.S
---- linux-2.6.31.1/arch/x86/kernel/efi_stub_32.S 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/efi_stub_32.S 2009-10-01 20:12:42.000000000 -0400
-@@ -6,6 +6,7 @@
- */
-
- #include <linux/linkage.h>
-+#include <linux/init.h>
- #include <asm/page_types.h>
-
- /*
-@@ -20,7 +21,7 @@
- * service functions will comply with gcc calling convention, too.
- */
-
--.text
-+__INIT
- ENTRY(efi_call_phys)
- /*
- * 0. The function can only be called in Linux kernel. So CS has been
-@@ -36,9 +37,7 @@ ENTRY(efi_call_phys)
- * The mapping of lower virtual memory has been created in prelog and
- * epilog.
- */
-- movl $1f, %edx
-- subl $__PAGE_OFFSET, %edx
-- jmp *%edx
-+ jmp 1f-__PAGE_OFFSET
- 1:
-
- /*
-@@ -47,14 +46,8 @@ ENTRY(efi_call_phys)
- * parameter 2, ..., param n. To make things easy, we save the return
- * address of efi_call_phys in a global variable.
- */
-- popl %edx
-- movl %edx, saved_return_addr
-- /* get the function pointer into ECX*/
-- popl %ecx
-- movl %ecx, efi_rt_function_ptr
-- movl $2f, %edx
-- subl $__PAGE_OFFSET, %edx
-- pushl %edx
-+ popl (saved_return_addr)
-+ popl (efi_rt_function_ptr)
-
- /*
- * 3. Clear PG bit in %CR0.
-@@ -73,9 +66,8 @@ ENTRY(efi_call_phys)
- /*
- * 5. Call the physical function.
- */
-- jmp *%ecx
-+ call *(efi_rt_function_ptr-__PAGE_OFFSET)
-
--2:
- /*
- * 6. After EFI runtime service returns, control will return to
- * following instruction. We'd better readjust stack pointer first.
-@@ -88,35 +80,28 @@ ENTRY(efi_call_phys)
- movl %cr0, %edx
- orl $0x80000000, %edx
- movl %edx, %cr0
-- jmp 1f
--1:
-+
- /*
- * 8. Now restore the virtual mode from flat mode by
- * adding EIP with PAGE_OFFSET.
- */
-- movl $1f, %edx
-- jmp *%edx
-+ jmp 1f+__PAGE_OFFSET
- 1:
-
- /*
- * 9. Balance the stack. And because EAX contain the return value,
- * we'd better not clobber it.
- */
-- leal efi_rt_function_ptr, %edx
-- movl (%edx), %ecx
-- pushl %ecx
-+ pushl (efi_rt_function_ptr)
-
- /*
-- * 10. Push the saved return address onto the stack and return.
-+ * 10. Return to the saved return address.
- */
-- leal saved_return_addr, %edx
-- movl (%edx), %ecx
-- pushl %ecx
-- ret
-+ jmpl *(saved_return_addr)
- ENDPROC(efi_call_phys)
- .previous
-
--.data
-+__INITDATA
- saved_return_addr:
- .long 0
- efi_rt_function_ptr:
-diff -urNp linux-2.6.31.1/arch/x86/kernel/entry_32.S linux-2.6.31.1/arch/x86/kernel/entry_32.S
---- linux-2.6.31.1/arch/x86/kernel/entry_32.S 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/entry_32.S 2009-10-01 20:12:42.000000000 -0400
-@@ -191,7 +191,7 @@
-
- #endif /* CONFIG_X86_32_LAZY_GS */
-
--.macro SAVE_ALL
-+.macro __SAVE_ALL _DS
- cld
- PUSH_GS
- pushl %fs
-@@ -224,7 +224,7 @@
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
-- movl $(__USER_DS), %edx
-+ movl $_DS, %edx
- movl %edx, %ds
- movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
-@@ -232,6 +232,21 @@
- SET_KERNEL_GS %edx
- .endm
-
-+.macro SAVE_ALL
-+#ifdef CONFIG_PAX_KERNEXEC
-+ __SAVE_ALL __KERNEL_DS
-+ GET_CR0_INTO_EDX;
-+ movl %edx, %esi;
-+ orl $X86_CR0_WP, %edx;
-+ xorl %edx, %esi;
-+ SET_CR0_FROM_EDX
-+#elif defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_MEMORY_UDEREF)
-+ __SAVE_ALL __KERNEL_DS
-+#else
-+ __SAVE_ALL __USER_DS
-+#endif
-+.endm
-+
- .macro RESTORE_INT_REGS
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
-@@ -329,6 +344,11 @@ ENTRY(ret_from_fork)
- CFI_ADJUST_CFA_OFFSET 4
- popfl
- CFI_ADJUST_CFA_OFFSET -4
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ xorl %esi, %esi
-+#endif
-+
- jmp syscall_exit
- CFI_ENDPROC
- END(ret_from_fork)
-@@ -352,7 +372,17 @@ check_userspace:
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
- cmpl $USER_RPL, %eax
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ jae resume_userspace
-+
-+ GET_CR0_INTO_EDX
-+ xorl %esi, %edx
-+ SET_CR0_FROM_EDX
-+ jmp resume_kernel
-+#else
- jb resume_kernel # not returning to v8086 or userspace
-+#endif
-
- ENTRY(resume_userspace)
- LOCKDEP_SYS_EXIT
-@@ -414,10 +444,9 @@ sysenter_past_esp:
- /*CFI_REL_OFFSET cs, 0*/
- /*
- * Push current_thread_info()->sysenter_return to the stack.
-- * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
-- * pushed above; +8 corresponds to copy_thread's esp0 setting.
- */
-- pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
-+ GET_THREAD_INFO(%ebp)
-+ pushl TI_sysenter_return(%ebp)
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eip, 0
-
-@@ -430,9 +459,19 @@ sysenter_past_esp:
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
-+ movl PT_OLDESP(%esp),%ebp
-+
-+#ifdef CONFIG_PAX_MEMORY_UDEREF
-+ mov PT_OLDSS(%esp),%ds
-+1: movl %ds:(%ebp),%ebp
-+ push %ss
-+ pop %ds
-+#else
- cmpl $__PAGE_OFFSET-3,%ebp
- jae syscall_fault
- 1: movl (%ebp),%ebp
-+#endif
-+
- movl %ebp,PT_EBP(%esp)
- .section __ex_table,"a"
- .align 4
-@@ -455,12 +494,23 @@ sysenter_do_call:
- testl $_TIF_ALLWORK_MASK, %ecx
- jne sysexit_audit
- sysenter_exit:
-+
-+#ifdef CONFIG_PAX_RANDKSTACK
-+ pushl %eax
-+ CFI_ADJUST_CFA_OFFSET 4
-+ call pax_randomize_kstack
-+ popl %eax
-+ CFI_ADJUST_CFA_OFFSET -4
-+#endif
-+
- /* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp,%ebp
- TRACE_IRQS_ON
- 1: mov PT_FS(%esp), %fs
-+2: mov PT_DS(%esp), %ds
-+3: mov PT_ES(%esp), %es
- PTGS_TO_GS
- ENABLE_INTERRUPTS_SYSEXIT
-
-@@ -504,11 +554,17 @@ sysexit_audit:
-
- CFI_ENDPROC
- .pushsection .fixup,"ax"
--2: movl $0,PT_FS(%esp)
-+4: movl $0,PT_FS(%esp)
-+ jmp 1b
-+5: movl $0,PT_DS(%esp)
-+ jmp 1b
-+6: movl $0,PT_ES(%esp)
- jmp 1b
- .section __ex_table,"a"
- .align 4
-- .long 1b,2b
-+ .long 1b,4b
-+ .long 2b,5b
-+ .long 3b,6b
- .popsection
- PTGS_TO_GS_EX
- ENDPROC(ia32_sysenter_target)
-@@ -538,6 +594,10 @@ syscall_exit:
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jne syscall_exit_work
-
-+#ifdef CONFIG_PAX_RANDKSTACK
-+ call pax_randomize_kstack
-+#endif
-+
- restore_all:
- TRACE_IRQS_IRET
- restore_all_notrace:
-@@ -602,7 +662,13 @@ ldt_ss:
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
-- PER_CPU(gdt_page, %ebx)
-+#ifdef CONFIG_SMP
-+ movl PER_CPU_VAR(cpu_number), %ebx
-+ shll $PAGE_SHIFT_asm, %ebx
-+ addl $cpu_gdt_table, %ebx
-+#else
-+ movl $cpu_gdt_table, %ebx
-+#endif
- shr $16, %edx
- mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
- mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
-@@ -642,25 +708,19 @@ work_resched:
-
- work_notifysig: # deal with pending signals and
- # notify-resume requests
-+ movl %esp, %eax
- #ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
-- movl %esp, %eax
-- jne work_notifysig_v86 # returning to kernel-space or
-+ jz 1f # returning to kernel-space or
- # vm86-space
-- xorl %edx, %edx
-- call do_notify_resume
-- jmp resume_userspace_sig
-
-- ALIGN
--work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- CFI_ADJUST_CFA_OFFSET 4
- call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- movl %eax, %esp
--#else
-- movl %esp, %eax
-+1:
- #endif
- xorl %edx, %edx
- call do_notify_resume
-@@ -695,6 +755,10 @@ END(syscall_exit_work)
-
- RING0_INT_FRAME # can't unwind into user space anyway
- syscall_fault:
-+#ifdef CONFIG_PAX_MEMORY_UDEREF
-+ push %ss
-+ pop %ds
-+#endif
- GET_THREAD_INFO(%ebp)
- movl $-EFAULT,PT_EAX(%esp)
- jmp resume_userspace
-@@ -735,7 +799,13 @@ PTREGSCALL(vm86old)
- * normal stack and adjusts ESP with the matching offset.
- */
- /* fixup the stack */
-- PER_CPU(gdt_page, %ebx)
-+#ifdef CONFIG_SMP
-+ movl PER_CPU_VAR(cpu_number), %ebx
-+ shll $PAGE_SHIFT_asm, %ebx
-+ addl $cpu_gdt_table, %ebx
-+#else
-+ movl $cpu_gdt_table, %ebx
-+#endif
- mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
- mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
- shl $16, %eax
-@@ -1198,7 +1268,6 @@ return_to_handler:
- ret
- #endif
-
--.section .rodata,"a"
- #include "syscall_table_32.S"
-
- syscall_table_size=(.-sys_call_table)
-@@ -1250,12 +1319,21 @@ error_code:
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- GS_TO_REG %ecx
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ GET_CR0_INTO_EDX
-+ movl %edx, %esi
-+ orl $X86_CR0_WP, %edx
-+ xorl %edx, %esi
-+ SET_CR0_FROM_EDX
-+#endif
-+
- movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
-- movl $(__USER_DS), %ecx
-+ movl $(__KERNEL_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
-@@ -1351,6 +1429,13 @@ nmi_stack_correct:
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ GET_CR0_INTO_EDX
-+ xorl %esi, %edx
-+ SET_CR0_FROM_EDX
-+#endif
-+
- jmp restore_all_notrace
- CFI_ENDPROC
-
-@@ -1391,6 +1476,13 @@ nmi_espfix_stack:
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ GET_CR0_INTO_EDX
-+ xorl %esi, %edx
-+ SET_CR0_FROM_EDX
-+#endif
-+
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
-diff -urNp linux-2.6.31.1/arch/x86/kernel/entry_64.S linux-2.6.31.1/arch/x86/kernel/entry_64.S
---- linux-2.6.31.1/arch/x86/kernel/entry_64.S 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/entry_64.S 2009-10-01 20:12:42.000000000 -0400
-@@ -1074,7 +1074,12 @@ ENTRY(\sym)
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
-- PER_CPU(init_tss, %rbp)
-+#ifdef CONFIG_SMP
-+ imul $TSS_size, PER_CPU_VAR(cpu_number), %ebp
-+ lea init_tss(%rbp), %rbp
-+#else
-+ lea init_tss(%rip), %rbp
-+#endif
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
- call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
-diff -urNp linux-2.6.31.1/arch/x86/kernel/ftrace.c linux-2.6.31.1/arch/x86/kernel/ftrace.c
---- linux-2.6.31.1/arch/x86/kernel/ftrace.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/ftrace.c 2009-10-01 20:12:42.000000000 -0400
-@@ -284,9 +284,9 @@ int ftrace_update_ftrace_func(ftrace_fun
- unsigned char old[MCOUNT_INSN_SIZE], *new;
- int ret;
-
-- memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
-+ memcpy(old, (void *)ktla_ktva((unsigned long)ftrace_call), MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, (unsigned long)func);
-- ret = ftrace_modify_code(ip, old, new);
-+ ret = ftrace_modify_code(ktla_ktva(ip), old, new);
-
- return ret;
- }
-diff -urNp linux-2.6.31.1/arch/x86/kernel/head32.c linux-2.6.31.1/arch/x86/kernel/head32.c
---- linux-2.6.31.1/arch/x86/kernel/head32.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/head32.c 2009-10-01 20:12:42.000000000 -0400
-@@ -13,12 +13,13 @@
- #include <asm/e820.h>
- #include <asm/bios_ebda.h>
- #include <asm/trampoline.h>
-+#include <asm/boot.h>
-
- void __init i386_start_kernel(void)
- {
- reserve_trampoline_memory();
-
-- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
-+ reserve_early(LOAD_PHYSICAL_ADDR, __pa_symbol(&__bss_stop), "TEXT DATA BSS");
-
- #ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
-diff -urNp linux-2.6.31.1/arch/x86/kernel/head_32.S linux-2.6.31.1/arch/x86/kernel/head_32.S
---- linux-2.6.31.1/arch/x86/kernel/head_32.S 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/head_32.S 2009-10-01 20:12:42.000000000 -0400
-@@ -19,6 +19,7 @@
- #include <asm/setup.h>
- #include <asm/processor-flags.h>
- #include <asm/percpu.h>
-+#include <asm/msr-index.h>
-
- /* Physical address */
- #define pa(X) ((X) - __PAGE_OFFSET)
-@@ -52,11 +53,7 @@
- * and small than max_low_pfn, otherwise will waste some page table entries
- */
-
--#if PTRS_PER_PMD > 1
--#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
--#else
--#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
--#endif
-+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PTE)
-
- /* Enough space to fit pagetables for the low memory linear map */
- MAPPING_BEYOND_END = \
-@@ -73,6 +70,12 @@ INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_P
- RESERVE_BRK(pagetables, INIT_MAP_SIZE)
-
- /*
-+ * Real beginning of normal "text" segment
-+ */
-+ENTRY(stext)
-+ENTRY(_stext)
-+
-+/*
- * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
- * %esi points to the real-mode code as a 32-bit pointer.
- * CS and DS must be 4 GB flat segments, but we don't depend on
-@@ -80,6 +83,13 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
- * can.
- */
- .section .text.head,"ax",@progbits
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ jmp startup_32
-+/* PaX: fill first page in .text with int3 to catch NULL derefs in kernel mode */
-+.fill PAGE_SIZE-5,1,0xcc
-+#endif
-+
- ENTRY(startup_32)
- /* test KEEP_SEGMENTS flag to see if the bootloader is asking
- us to not reload segments */
-@@ -97,6 +107,48 @@ ENTRY(startup_32)
- movl %eax,%gs
- 2:
-
-+#ifdef CONFIG_SMP
-+ movl $pa(cpu_gdt_table),%edi
-+ movl $__per_cpu_load,%eax
-+ movw %ax,__KERNEL_PERCPU + 2(%edi)
-+ rorl $16,%eax
-+ movb %al,__KERNEL_PERCPU + 4(%edi)
-+ movb %ah,__KERNEL_PERCPU + 7(%edi)
-+ movl $__per_cpu_end - 1,%eax
-+ subl $__per_cpu_load,%eax
-+ movw %ax,__KERNEL_PERCPU + 0(%edi)
-+#endif
-+
-+#ifdef CONFIG_PAX_MEMORY_UDEREF
-+ movl $NR_CPUS,%ecx
-+ movl $pa(cpu_gdt_table),%edi
-+1:
-+ movl $((((__PAGE_OFFSET-1) & 0xf0000000) >> 12) | 0x00c09700),GDT_ENTRY_KERNEL_DS * 8 + 4(%edi)
-+ addl $PAGE_SIZE_asm,%edi
-+ loop 1b
-+#endif
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ movl $pa(boot_gdt),%edi
-+ movl $__LOAD_PHYSICAL_ADDR + __PAGE_OFFSET,%eax
-+ movw %ax,__BOOT_CS + 2(%edi)
-+ rorl $16,%eax
-+ movb %al,__BOOT_CS + 4(%edi)
-+ movb %ah,__BOOT_CS + 7(%edi)
-+ rorl $16,%eax
-+
-+ movl $NR_CPUS,%ecx
-+ movl $pa(cpu_gdt_table),%edi
-+1:
-+ movw %ax,__KERNEL_CS + 2(%edi)
-+ rorl $16,%eax
-+ movb %al,__KERNEL_CS + 4(%edi)
-+ movb %ah,__KERNEL_CS + 7(%edi)
-+ rorl $16,%eax
-+ addl $PAGE_SIZE_asm,%edi
-+ loop 1b
-+#endif
-+
- /*
- * Clear BSS first so that there are no surprises...
- */
-@@ -140,9 +192,7 @@ ENTRY(startup_32)
- cmpl $num_subarch_entries, %eax
- jae bad_subarch
-
-- movl pa(subarch_entries)(,%eax,4), %eax
-- subl $__PAGE_OFFSET, %eax
-- jmp *%eax
-+ jmp *pa(subarch_entries)(,%eax,4)
-
- bad_subarch:
- WEAK(lguest_entry)
-@@ -154,9 +204,9 @@ WEAK(xen_entry)
- __INITDATA
-
- subarch_entries:
-- .long default_entry /* normal x86/PC */
-- .long lguest_entry /* lguest hypervisor */
-- .long xen_entry /* Xen hypervisor */
-+ .long pa(default_entry) /* normal x86/PC */
-+ .long pa(lguest_entry) /* lguest hypervisor */
-+ .long pa(xen_entry) /* Xen hypervisor */
- num_subarch_entries = (. - subarch_entries) / 4
- .previous
- #endif /* CONFIG_PARAVIRT */
-@@ -217,8 +267,11 @@ default_entry:
- movl %eax, pa(max_pfn_mapped)
-
- /* Do early initialization of the fixmap area */
-- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
-- movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
-+#ifdef CONFIG_COMPAT_VDSO
-+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR+_PAGE_USER,pa(swapper_pg_pmd+0x1000*KPMDS-8)
-+#else
-+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,pa(swapper_pg_pmd+0x1000*KPMDS-8)
-+#endif
- #else /* Not PAE */
-
- page_pde_offset = (__PAGE_OFFSET >> 20);
-@@ -248,8 +301,11 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
- movl %eax, pa(max_pfn_mapped)
-
- /* Do early initialization of the fixmap area */
-- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
-- movl %eax,pa(swapper_pg_dir+0xffc)
-+#ifdef CONFIG_COMPAT_VDSO
-+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR+_PAGE_USER,pa(swapper_pg_dir+0xffc)
-+#else
-+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,pa(swapper_pg_dir+0xffc)
-+#endif
- #endif
- jmp 3f
- /*
-@@ -296,6 +352,7 @@ ENTRY(startup_32_smp)
- orl %edx,%eax
- movl %eax,%cr4
-
-+#ifdef CONFIG_X86_PAE
- btl $5, %eax # check if PAE is enabled
- jnc 6f
-
-@@ -311,13 +368,16 @@ ENTRY(startup_32_smp)
- jnc 6f
-
- /* Setup EFER (Extended Feature Enable Register) */
-- movl $0xc0000080, %ecx
-+ movl $MSR_EFER, %ecx
- rdmsr
-
- btsl $11, %eax
- /* Make changes effective */
- wrmsr
-
-+ btsl $_PAGE_BIT_NX-32,pa(__supported_pte_mask+4)
-+ movl $1,pa(nx_enabled)
-+#endif
- 6:
-
- /*
-@@ -343,9 +403,7 @@ ENTRY(startup_32_smp)
-
- #ifdef CONFIG_SMP
- cmpb $0, ready
-- jz 1f /* Initial CPU cleans BSS */
-- jmp checkCPUtype
--1:
-+ jnz checkCPUtype /* Initial CPU cleans BSS */
- #endif /* CONFIG_SMP */
-
- /*
-@@ -423,7 +481,7 @@ is386: movl $2,%ecx # set MP
- 1: movl $(__KERNEL_DS),%eax # reload all the segment registers
- movl %eax,%ss # after changing gdt.
-
-- movl $(__USER_DS),%eax # DS/ES contains default USER segment
-+# movl $(__KERNEL_DS),%eax # DS/ES contains default KERNEL segment
- movl %eax,%ds
- movl %eax,%es
-
-@@ -437,8 +495,11 @@ is386: movl $2,%ecx # set MP
- */
- cmpb $0,ready
- jne 1f
-- movl $per_cpu__gdt_page,%eax
-+ movl $cpu_gdt_table,%eax
- movl $per_cpu__stack_canary,%ecx
-+#ifdef CONFIG_SMP
-+ addl $__per_cpu_load,%ecx
-+#endif
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
-@@ -456,10 +517,6 @@ is386: movl $2,%ecx # set MP
- #ifdef CONFIG_SMP
- movb ready, %cl
- movb $1, ready
-- cmpb $0,%cl # the first CPU calls start_kernel
-- je 1f
-- movl (stack_start), %esp
--1:
- #endif /* CONFIG_SMP */
- jmp *(initial_code)
-
-@@ -545,22 +602,22 @@ early_page_fault:
- jmp early_fault
-
- early_fault:
-- cld
- #ifdef CONFIG_PRINTK
-+ cmpl $1,%ss:early_recursion_flag
-+ je hlt_loop
-+ incl %ss:early_recursion_flag
-+ cld
- pusha
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
-- cmpl $2,early_recursion_flag
-- je hlt_loop
-- incl early_recursion_flag
- movl %cr2,%eax
- pushl %eax
- pushl %edx /* trapno */
- pushl $fault_msg
- call printk
-+; call dump_stack
- #endif
-- call dump_stack
- hlt_loop:
- hlt
- jmp hlt_loop
-@@ -568,8 +625,11 @@ hlt_loop:
- /* This is the default interrupt "handler" :-) */
- ALIGN
- ignore_int:
-- cld
- #ifdef CONFIG_PRINTK
-+ cmpl $2,%ss:early_recursion_flag
-+ je hlt_loop
-+ incl %ss:early_recursion_flag
-+ cld
- pushl %eax
- pushl %ecx
- pushl %edx
-@@ -578,9 +638,6 @@ ignore_int:
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
-- cmpl $2,early_recursion_flag
-- je hlt_loop
-- incl early_recursion_flag
- pushl 16(%esp)
- pushl 24(%esp)
- pushl 32(%esp)
-@@ -607,27 +664,37 @@ ENTRY(initial_code)
- /*
- * BSS section
- */
--.section ".bss.page_aligned","wa"
-- .align PAGE_SIZE_asm
- #ifdef CONFIG_X86_PAE
-+.section .swapper_pg_pmd,"a",@progbits
- swapper_pg_pmd:
- .fill 1024*KPMDS,4,0
- #else
-+.section .swapper_pg_dir,"a",@progbits
- ENTRY(swapper_pg_dir)
- .fill 1024,4,0
- #endif
-+
- swapper_pg_fixmap:
- .fill 1024,4,0
-+
-+.section .empty_zero_page,"a",@progbits
- ENTRY(empty_zero_page)
- .fill 4096,1,0
-
- /*
-+ * The IDT has to be page-aligned to simplify the Pentium
-+ * F0 0F bug workaround.. We have a special link segment
-+ * for this.
-+ */
-+.section .idt,"a",@progbits
-+ENTRY(idt_table)
-+ .fill 256,8,0
-+
-+/*
- * This starts the data section.
- */
- #ifdef CONFIG_X86_PAE
--.section ".data.page_aligned","wa"
-- /* Page-aligned for the benefit of paravirt? */
-- .align PAGE_SIZE_asm
-+.section .swapper_pg_dir,"a",@progbits
- ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
- # if KPMDS == 3
-@@ -650,11 +717,12 @@ ENTRY(swapper_pg_dir)
-
- .data
- ENTRY(stack_start)
-- .long init_thread_union+THREAD_SIZE
-+ .long init_thread_union+THREAD_SIZE-8
- .long __BOOT_DS
-
- ready: .byte 0
-
-+.section .rodata,"a",@progbits
- early_recursion_flag:
- .long 0
-
-@@ -690,7 +758,7 @@ fault_msg:
- .word 0 # 32 bit align gdt_desc.address
- boot_gdt_descr:
- .word __BOOT_DS+7
-- .long boot_gdt - __PAGE_OFFSET
-+ .long pa(boot_gdt)
-
- .word 0 # 32-bit align idt_desc.address
- idt_descr:
-@@ -701,7 +769,7 @@ idt_descr:
- .word 0 # 32 bit align gdt_desc.address
- ENTRY(early_gdt_descr)
- .word GDT_ENTRIES*8-1
-- .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
-+ .long cpu_gdt_table /* Overwritten for secondary CPUs */
-
- /*
- * The boot_gdt must mirror the equivalent in setup.S and is
-@@ -710,5 +778,59 @@ ENTRY(early_gdt_descr)
- .align L1_CACHE_BYTES
- ENTRY(boot_gdt)
- .fill GDT_ENTRY_BOOT_CS,8,0
-- .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
-- .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
-+ .quad 0x00cf9b000000ffff /* kernel 4GB code at 0x00000000 */
-+ .quad 0x00cf93000000ffff /* kernel 4GB data at 0x00000000 */
-+
-+ .align PAGE_SIZE_asm
-+ENTRY(cpu_gdt_table)
-+ .rept NR_CPUS
-+ .quad 0x0000000000000000 /* NULL descriptor */
-+ .quad 0x0000000000000000 /* 0x0b reserved */
-+ .quad 0x0000000000000000 /* 0x13 reserved */
-+ .quad 0x0000000000000000 /* 0x1b reserved */
-+ .quad 0x0000000000000000 /* 0x20 unused */
-+ .quad 0x0000000000000000 /* 0x28 unused */
-+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
-+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
-+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
-+ .quad 0x0000000000000000 /* 0x4b reserved */
-+ .quad 0x0000000000000000 /* 0x53 reserved */
-+ .quad 0x0000000000000000 /* 0x5b reserved */
-+
-+ .quad 0x00cf9b000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
-+ .quad 0x00cf93000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
-+ .quad 0x00cffb000000ffff /* 0x73 user 4GB code at 0x00000000 */
-+ .quad 0x00cff3000000ffff /* 0x7b user 4GB data at 0x00000000 */
-+
-+ .quad 0x0000000000000000 /* 0x80 TSS descriptor */
-+ .quad 0x0000000000000000 /* 0x88 LDT descriptor */
-+
-+ /*
-+ * Segments used for calling PnP BIOS have byte granularity.
-+ * The code segments and data segments have fixed 64k limits,
-+ * the transfer segment sizes are set at run time.
-+ */
-+ .quad 0x00409b000000ffff /* 0x90 32-bit code */
-+ .quad 0x00009b000000ffff /* 0x98 16-bit code */
-+ .quad 0x000093000000ffff /* 0xa0 16-bit data */
-+ .quad 0x0000930000000000 /* 0xa8 16-bit data */
-+ .quad 0x0000930000000000 /* 0xb0 16-bit data */
-+
-+ /*
-+ * The APM segments have byte granularity and their bases
-+ * are set at run time. All have 64k limits.
-+ */
-+ .quad 0x00409b000000ffff /* 0xb8 APM CS code */
-+ .quad 0x00009b000000ffff /* 0xc0 APM CS 16 code (16 bit) */
-+ .quad 0x004093000000ffff /* 0xc8 APM DS data */
-+
-+ .quad 0x00c0930000000000 /* 0xd0 - ESPFIX SS */
-+ .quad 0x0040930000000000 /* 0xd8 - PERCPU */
-+ .quad 0x0040930000000018 /* 0xe0 - STACK_CANARY */
-+ .quad 0x0000000000000000 /* 0xe8 - PCIBIOS_CS */
-+ .quad 0x0000000000000000 /* 0xf0 - PCIBIOS_DS */
-+ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
-+
-+ /* Be sure this is zeroed to avoid false validations in Xen */
-+ .fill PAGE_SIZE_asm - GDT_SIZE,1,0
-+ .endr
-diff -urNp linux-2.6.31.1/arch/x86/kernel/head_64.S linux-2.6.31.1/arch/x86/kernel/head_64.S
---- linux-2.6.31.1/arch/x86/kernel/head_64.S 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/head_64.S 2009-10-01 20:12:42.000000000 -0400
-@@ -38,6 +38,10 @@ L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET
- L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
- L4_START_KERNEL = pgd_index(__START_KERNEL_map)
- L3_START_KERNEL = pud_index(__START_KERNEL_map)
-+L4_VMALLOC_START = pgd_index(VMALLOC_START)
-+L3_VMALLOC_START = pud_index(VMALLOC_START)
-+L4_VMEMMAP_START = pgd_index(VMEMMAP_START)
-+L3_VMEMMAP_START = pud_index(VMEMMAP_START)
-
- .text
- .section .text.head
-@@ -85,35 +89,22 @@ startup_64:
- */
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-+ addq %rbp, init_level4_pgt + (L4_VMALLOC_START*8)(%rip)
-+ addq %rbp, init_level4_pgt + (L4_VMEMMAP_START*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
-
- addq %rbp, level3_ident_pgt + 0(%rip)
-+ addq %rbp, level3_ident_pgt + 8(%rip)
-+ addq %rbp, level3_ident_pgt + 16(%rip)
-+ addq %rbp, level3_ident_pgt + 24(%rip)
-
-- addq %rbp, level3_kernel_pgt + (510*8)(%rip)
-- addq %rbp, level3_kernel_pgt + (511*8)(%rip)
-+ addq %rbp, level3_vmemmap_pgt + (L3_VMEMMAP_START*8)(%rip)
-
-- addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
-+ addq %rbp, level3_kernel_pgt + (L3_START_KERNEL*8)(%rip)
-+ addq %rbp, level3_kernel_pgt + (L3_START_KERNEL*8+8)(%rip)
-
-- /* Add an Identity mapping if I am above 1G */
-- leaq _text(%rip), %rdi
-- andq $PMD_PAGE_MASK, %rdi
--
-- movq %rdi, %rax
-- shrq $PUD_SHIFT, %rax
-- andq $(PTRS_PER_PUD - 1), %rax
-- jz ident_complete
--
-- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-- leaq level3_ident_pgt(%rip), %rbx
-- movq %rdx, 0(%rbx, %rax, 8)
--
-- movq %rdi, %rax
-- shrq $PMD_SHIFT, %rax
-- andq $(PTRS_PER_PMD - 1), %rax
-- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-- leaq level2_spare_pgt(%rip), %rbx
-- movq %rdx, 0(%rbx, %rax, 8)
--ident_complete:
-+ addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
-+ addq %rbp, level2_fixmap_pgt + (507*8)(%rip)
-
- /*
- * Fixup the kernel text+data virtual addresses. Note that
-@@ -187,6 +178,10 @@ ENTRY(secondary_startup_64)
- btl $20,%edi /* No Execute supported? */
- jnc 1f
- btsl $_EFER_NX, %eax
-+ leaq init_level4_pgt(%rip), %rdi
-+ btsq $_PAGE_BIT_NX, 8*L4_PAGE_OFFSET(%rdi)
-+ btsq $_PAGE_BIT_NX, 8*L4_VMALLOC_START(%rdi)
-+ btsq $_PAGE_BIT_NX, 8*L4_VMEMMAP_START(%rdi)
- 1: wrmsr /* Make changes effective */
-
- /* Setup cr0 */
-@@ -262,16 +257,16 @@ ENTRY(secondary_startup_64)
- .quad x86_64_start_kernel
- ENTRY(initial_gs)
- .quad INIT_PER_CPU_VAR(irq_stack_union)
-- __FINITDATA
-
- ENTRY(stack_start)
- .quad init_thread_union+THREAD_SIZE-8
- .word 0
-+ __FINITDATA
-
- bad_address:
- jmp bad_address
-
-- .section ".init.text","ax"
-+ __INIT
- #ifdef CONFIG_EARLY_PRINTK
- .globl early_idt_handlers
- early_idt_handlers:
-@@ -316,18 +311,23 @@ ENTRY(early_idt_handler)
- #endif /* EARLY_PRINTK */
- 1: hlt
- jmp 1b
-+ .previous
-
- #ifdef CONFIG_EARLY_PRINTK
-+ __INITDATA
- early_recursion_flag:
- .long 0
-+ .previous
-
-+ .section .rodata,"a",@progbits
- early_idt_msg:
- .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
- early_idt_ripmsg:
- .asciz "RIP %s\n"
---#endif /* CONFIG_EARLY_PRINTK */
- .previous
-+#endif /* CONFIG_EARLY_PRINTK */
-
-+ .section .rodata,"a",@progbits
- #define NEXT_PAGE(name) \
- .balign PAGE_SIZE; \
- ENTRY(name)
-@@ -350,13 +350,31 @@ NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-+ .org init_level4_pgt + L4_VMALLOC_START*8, 0
-+ .quad level3_vmalloc_pgt - __START_KERNEL_map + _KERNPG_TABLE
-+ .org init_level4_pgt + L4_VMEMMAP_START*8, 0
-+ .quad level3_vmemmap_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
-
- NEXT_PAGE(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-+#ifdef CONFIG_XEN
- .fill 511,8,0
-+#else
-+ .quad level2_ident_pgt + PAGE_SIZE - __START_KERNEL_map + _KERNPG_TABLE
-+ .quad level2_ident_pgt + 2*PAGE_SIZE - __START_KERNEL_map + _KERNPG_TABLE
-+ .quad level2_ident_pgt + 3*PAGE_SIZE - __START_KERNEL_map + _KERNPG_TABLE
-+ .fill 508,8,0
-+#endif
-+
-+NEXT_PAGE(level3_vmalloc_pgt)
-+ .fill 512,8,0
-+
-+NEXT_PAGE(level3_vmemmap_pgt)
-+ .fill L3_VMEMMAP_START,8,0
-+ .quad level2_vmemmap_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
- NEXT_PAGE(level3_kernel_pgt)
- .fill L3_START_KERNEL,8,0
-@@ -364,20 +382,23 @@ NEXT_PAGE(level3_kernel_pgt)
- .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-
-+NEXT_PAGE(level2_vmemmap_pgt)
-+ .fill 512,8,0
-+
- NEXT_PAGE(level2_fixmap_pgt)
-- .fill 506,8,0
-- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-- .fill 5,8,0
-+ .fill 507,8,0
-+ .quad level1_vsyscall_pgt - __START_KERNEL_map + _PAGE_TABLE
-+ /* 6MB reserved for vsyscalls + a 2MB hole = 3 + 1 entries */
-+ .fill 4,8,0
-
--NEXT_PAGE(level1_fixmap_pgt)
-+NEXT_PAGE(level1_vsyscall_pgt)
- .fill 512,8,0
-
--NEXT_PAGE(level2_ident_pgt)
-- /* Since I easily can, map the first 1G.
-+ /* Since I easily can, map the first 4G.
- * Don't set NX because code runs from these pages.
- */
-- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-+NEXT_PAGE(level2_ident_pgt)
-+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, 4*PTRS_PER_PMD)
-
- NEXT_PAGE(level2_kernel_pgt)
- /*
-@@ -390,33 +411,49 @@ NEXT_PAGE(level2_kernel_pgt)
- * If you want to increase this then increase MODULES_VADDR
- * too.)
- */
-- PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
-- KERNEL_IMAGE_SIZE/PMD_SIZE)
--
--NEXT_PAGE(level2_spare_pgt)
-- .fill 512, 8, 0
-+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
-
- #undef PMDS
- #undef NEXT_PAGE
-
-- .data
-+ .align PAGE_SIZE
-+ENTRY(cpu_gdt_table)
-+ .rept NR_CPUS
-+ .quad 0x0000000000000000 /* NULL descriptor */
-+ .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
-+ .quad 0x00af9b000000ffff /* __KERNEL_CS */
-+ .quad 0x00cf93000000ffff /* __KERNEL_DS */
-+ .quad 0x00cffb000000ffff /* __USER32_CS */
-+ .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
-+ .quad 0x00affb000000ffff /* __USER_CS */
-+ .quad 0x0 /* unused */
-+ .quad 0,0 /* TSS */
-+ .quad 0,0 /* LDT */
-+ .quad 0,0,0 /* three TLS descriptors */
-+ .quad 0x0000f40000000000 /* node/CPU stored in limit */
-+ /* asm/segment.h:GDT_ENTRIES must match this */
-+
-+ /* zero the remaining page */
-+ .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
-+ .endr
-+
- .align 16
- .globl early_gdt_descr
- early_gdt_descr:
- .word GDT_ENTRIES*8-1
- early_gdt_descr_base:
-- .quad INIT_PER_CPU_VAR(gdt_page)
-+ .quad cpu_gdt_table
-
- ENTRY(phys_base)
- /* This must match the first entry in level2_kernel_pgt */
- .quad 0x0000000000000000
-
- #include "../../x86/xen/xen-head.S"
--
-- .section .bss, "aw", @nobits
-+
-+ .section .rodata,"a",@progbits
- .align L1_CACHE_BYTES
- ENTRY(idt_table)
-- .skip IDT_ENTRIES * 16
-+ .fill 512,8,0
-
- .section .bss.page_aligned, "aw", @nobits
- .align PAGE_SIZE
-diff -urNp linux-2.6.31.1/arch/x86/kernel/i386_ksyms_32.c linux-2.6.31.1/arch/x86/kernel/i386_ksyms_32.c
---- linux-2.6.31.1/arch/x86/kernel/i386_ksyms_32.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/i386_ksyms_32.c 2009-10-01 20:12:42.000000000 -0400
-@@ -10,8 +10,12 @@
- EXPORT_SYMBOL(mcount);
- #endif
-
-+EXPORT_SYMBOL_GPL(cpu_gdt_table);
-+
- /* Networking helper routines. */
- EXPORT_SYMBOL(csum_partial_copy_generic);
-+EXPORT_SYMBOL(csum_partial_copy_generic_to_user);
-+EXPORT_SYMBOL(csum_partial_copy_generic_from_user);
-
- EXPORT_SYMBOL(__get_user_1);
- EXPORT_SYMBOL(__get_user_2);
-@@ -26,3 +30,7 @@ EXPORT_SYMBOL(strstr);
-
- EXPORT_SYMBOL(csum_partial);
- EXPORT_SYMBOL(empty_zero_page);
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+EXPORT_SYMBOL(__LOAD_PHYSICAL_ADDR);
-+#endif
-diff -urNp linux-2.6.31.1/arch/x86/kernel/init_task.c linux-2.6.31.1/arch/x86/kernel/init_task.c
---- linux-2.6.31.1/arch/x86/kernel/init_task.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/init_task.c 2009-10-01 20:12:42.000000000 -0400
-@@ -39,5 +39,5 @@ EXPORT_SYMBOL(init_task);
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
--DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
--
-+struct tss_struct init_tss[NR_CPUS] ____cacheline_internodealigned_in_smp = { [0 ... NR_CPUS-1] = INIT_TSS };
-+EXPORT_SYMBOL(init_tss);
-diff -urNp linux-2.6.31.1/arch/x86/kernel/ioport.c linux-2.6.31.1/arch/x86/kernel/ioport.c
---- linux-2.6.31.1/arch/x86/kernel/ioport.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/ioport.c 2009-10-01 20:12:42.000000000 -0400
-@@ -6,6 +6,7 @@
- #include <linux/sched.h>
- #include <linux/kernel.h>
- #include <linux/capability.h>
-+#include <linux/security.h>
- #include <linux/errno.h>
- #include <linux/types.h>
- #include <linux/ioport.h>
-@@ -41,6 +42,12 @@ asmlinkage long sys_ioperm(unsigned long
-
- if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
- return -EINVAL;
-+#ifdef CONFIG_GRKERNSEC_IO
-+ if (turn_on) {
-+ gr_handle_ioperm();
-+ return -EPERM;
-+ }
-+#endif
- if (turn_on && !capable(CAP_SYS_RAWIO))
- return -EPERM;
-
-@@ -67,7 +74,7 @@ asmlinkage long sys_ioperm(unsigned long
- * because the ->io_bitmap_max value must match the bitmap
- * contents:
- */
-- tss = &per_cpu(init_tss, get_cpu());
-+ tss = init_tss + get_cpu();
-
- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
-
-@@ -111,8 +118,13 @@ static int do_iopl(unsigned int level, s
- return -EINVAL;
- /* Trying to gain more privileges? */
- if (level > old) {
-+#ifdef CONFIG_GRKERNSEC_IO
-+ gr_handle_iopl();
-+ return -EPERM;
-+#else
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
-+#endif
- }
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
-
-diff -urNp linux-2.6.31.1/arch/x86/kernel/irq_32.c linux-2.6.31.1/arch/x86/kernel/irq_32.c
---- linux-2.6.31.1/arch/x86/kernel/irq_32.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/irq_32.c 2009-10-01 20:12:42.000000000 -0400
-@@ -94,7 +94,7 @@ execute_on_irq_stack(int overflow, struc
- return 0;
-
- /* build the stack frame on the IRQ stack */
-- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
-+ isp = (u32 *) ((char *)irqctx + sizeof(*irqctx) - 8);
- irqctx->tinfo.task = curctx->tinfo.task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
-
-@@ -175,7 +175,7 @@ asmlinkage void do_softirq(void)
- irqctx->tinfo.previous_esp = current_stack_pointer;
-
- /* build the stack frame on the softirq stack */
-- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
-+ isp = (u32 *) ((char *)irqctx + sizeof(*irqctx) - 8);
-
- call_on_stack(__do_softirq, isp);
- /*
-diff -urNp linux-2.6.31.1/arch/x86/kernel/kprobes.c linux-2.6.31.1/arch/x86/kernel/kprobes.c
---- linux-2.6.31.1/arch/x86/kernel/kprobes.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/kprobes.c 2009-10-01 20:12:42.000000000 -0400
-@@ -166,9 +166,24 @@ static void __kprobes set_jmp_op(void *f
- char op;
- s32 raddr;
- } __attribute__((packed)) * jop;
-- jop = (struct __arch_jmp_op *)from;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ unsigned long cr0;
-+#endif
-+
-+ jop = (struct __arch_jmp_op *)(ktla_ktva(from));
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
- jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
- jop->op = RELATIVEJUMP_INSTRUCTION;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-+
- }
-
- /*
-@@ -345,16 +360,29 @@ static void __kprobes fix_riprel(struct
-
- static void __kprobes arch_copy_kprobe(struct kprobe *p)
- {
-- memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ unsigned long cr0;
-+#endif
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
-+ memcpy(p->ainsn.insn, ktla_ktva(p->addr), MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-
- fix_riprel(p);
-
-- if (can_boost(p->addr))
-+ if (can_boost(ktla_ktva(p->addr)))
- p->ainsn.boostable = 0;
- else
- p->ainsn.boostable = -1;
-
-- p->opcode = *p->addr;
-+ p->opcode = *(ktla_ktva(p->addr));
- }
-
- int __kprobes arch_prepare_kprobe(struct kprobe *p)
-@@ -432,7 +460,7 @@ static void __kprobes prepare_singlestep
- if (p->opcode == BREAKPOINT_INSTRUCTION)
- regs->ip = (unsigned long)p->addr;
- else
-- regs->ip = (unsigned long)p->ainsn.insn;
-+ regs->ip = ktva_ktla((unsigned long)p->ainsn.insn);
- }
-
- void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-@@ -453,7 +481,7 @@ static void __kprobes setup_singlestep(s
- if (p->ainsn.boostable == 1 && !p->post_handler) {
- /* Boost up -- we can execute copied instructions directly */
- reset_current_kprobe();
-- regs->ip = (unsigned long)p->ainsn.insn;
-+ regs->ip = ktva_ktla((unsigned long)p->ainsn.insn);
- preempt_enable_no_resched();
- return;
- }
-@@ -523,7 +551,7 @@ static int __kprobes kprobe_handler(stru
- struct kprobe_ctlblk *kcb;
-
- addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
-- if (*addr != BREAKPOINT_INSTRUCTION) {
-+ if (*(kprobe_opcode_t *)ktla_ktva((unsigned long)addr) != BREAKPOINT_INSTRUCTION) {
- /*
- * The breakpoint instruction was removed right
- * after we hit it. Another cpu has removed
-@@ -775,7 +803,7 @@ static void __kprobes resume_execution(s
- struct pt_regs *regs, struct kprobe_ctlblk *kcb)
- {
- unsigned long *tos = stack_addr(regs);
-- unsigned long copy_ip = (unsigned long)p->ainsn.insn;
-+ unsigned long copy_ip = ktva_ktla((unsigned long)p->ainsn.insn);
- unsigned long orig_ip = (unsigned long)p->addr;
- kprobe_opcode_t *insn = p->ainsn.insn;
-
-@@ -958,7 +986,7 @@ int __kprobes kprobe_exceptions_notify(s
- struct die_args *args = data;
- int ret = NOTIFY_DONE;
-
-- if (args->regs && user_mode_vm(args->regs))
-+ if (args->regs && user_mode(args->regs))
- return ret;
-
- switch (val) {
-diff -urNp linux-2.6.31.1/arch/x86/kernel/ldt.c linux-2.6.31.1/arch/x86/kernel/ldt.c
---- linux-2.6.31.1/arch/x86/kernel/ldt.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/ldt.c 2009-10-01 20:12:42.000000000 -0400
-@@ -66,13 +66,13 @@ static int alloc_ldt(mm_context_t *pc, i
- if (reload) {
- #ifdef CONFIG_SMP
- preempt_disable();
-- load_LDT(pc);
-+ load_LDT_nolock(pc);
- if (!cpus_equal(current->mm->cpu_vm_mask,
- cpumask_of_cpu(smp_processor_id())))
- smp_call_function(flush_ldt, current->mm, 1);
- preempt_enable();
- #else
-- load_LDT(pc);
-+ load_LDT_nolock(pc);
- #endif
- }
- if (oldsize) {
-@@ -94,7 +94,7 @@ static inline int copy_ldt(mm_context_t
- return err;
-
- for (i = 0; i < old->size; i++)
-- write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
-+ write_ldt_entry(new->ldt, i, old->ldt + i);
- return 0;
- }
-
-@@ -115,6 +115,24 @@ int init_new_context(struct task_struct
- retval = copy_ldt(&mm->context, &old_mm->context);
- mutex_unlock(&old_mm->context.lock);
- }
-+
-+ if (tsk == current) {
-+ mm->context.vdso = ~0UL;
-+
-+#ifdef CONFIG_X86_32
-+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)
-+ mm->context.user_cs_base = 0UL;
-+ mm->context.user_cs_limit = ~0UL;
-+
-+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_SMP)
-+ cpus_clear(mm->context.cpu_user_cs_mask);
-+#endif
-+
-+#endif
-+#endif
-+
-+ }
-+
- return retval;
- }
-
-@@ -229,6 +247,13 @@ static int write_ldt(void __user *ptr, u
- }
- }
-+
-+#ifdef CONFIG_PAX_SEGMEXEC
-+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (ldt_info.contents & MODIFY_LDT_CONTENTS_CODE)) {
-+ error = -EINVAL;
-+ goto out_unlock;
-+ }
-+#endif
-+
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
-diff -urNp linux-2.6.31.1/arch/x86/kernel/machine_kexec_32.c linux-2.6.31.1/arch/x86/kernel/machine_kexec_32.c
---- linux-2.6.31.1/arch/x86/kernel/machine_kexec_32.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/machine_kexec_32.c 2009-10-01 20:12:42.000000000 -0400
-@@ -26,7 +26,7 @@
- #include <asm/system.h>
- #include <asm/cacheflush.h>
-
--static void set_idt(void *newidt, __u16 limit)
-+static void set_idt(struct desc_struct *newidt, __u16 limit)
- {
- struct desc_ptr curidt;
-
-@@ -38,7 +38,7 @@ static void set_idt(void *newidt, __u16
- }
-
-
--static void set_gdt(void *newgdt, __u16 limit)
-+static void set_gdt(struct desc_struct *newgdt, __u16 limit)
- {
- struct desc_ptr curgdt;
-
-@@ -217,7 +217,7 @@ void machine_kexec(struct kimage *image)
- }
-
- control_page = page_address(image->control_code_page);
-- memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
-+ memcpy(control_page, (void *)ktla_ktva((unsigned long)relocate_kernel), KEXEC_CONTROL_CODE_MAX_SIZE);
-
- relocate_kernel_ptr = control_page;
- page_list[PA_CONTROL_PAGE] = __pa(control_page);
-diff -urNp linux-2.6.31.1/arch/x86/kernel/module.c linux-2.6.31.1/arch/x86/kernel/module.c
---- linux-2.6.31.1/arch/x86/kernel/module.c 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/arch/x86/kernel/module.c 2009-10-01 20:12:42.000000000 -0400
-@@ -27,6 +27,7 @@
- #include <asm/system.h>
- #include <asm/page.h>
- #include <asm/pgtable.h>
-+#include <asm/desc.h>
-
- #if 0
- #define DEBUGP printk
-@@ -34,7 +35,7 @@
- #define DEBUGP(fmt...)
- #endif
-
--void *module_alloc(unsigned long size)
-+static void *__module_alloc(unsigned long size, pgprot_t prot)
- {
- struct vm_struct *area;
-
-@@ -48,9 +49,92 @@ void *module_alloc(unsigned long size)
- if (!area)
- return NULL;
-
-- return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
-- PAGE_KERNEL_EXEC);
-+ return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, prot);
-+}
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+#ifdef CONFIG_X86_32
-+void *module_alloc(unsigned long size)
-+{
-+ return __module_alloc(size, PAGE_KERNEL);
-+}
-+
-+void *module_alloc_exec(unsigned long size)
-+{
-+ struct vm_struct *area;
-+
-+ if (size == 0)
-+ return NULL;
-+
-+ area = __get_vm_area(size, VM_ALLOC, (unsigned long)&MODULES_EXEC_VADDR, (unsigned long)&MODULES_EXEC_END);
-+ if (area)
-+ return area->addr;
-+
-+ return NULL;
-+}
-+EXPORT_SYMBOL(module_alloc_exec);
-+
-+void module_free_exec(struct module *mod, void *module_region)
-+{
-+ struct vm_struct **p, *tmp;
-+
-+ if (!module_region)
-+ return;
-+
-+ if ((PAGE_SIZE-1) & (unsigned long)module_region) {
-+ printk(KERN_ERR "Trying to module_free_exec() bad address (%p)\n", module_region);
-+ WARN_ON(1);
-+ return;
-+ }
-+
-+ write_lock(&vmlist_lock);
-+ for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next)
-+ if (tmp->addr == module_region)
-+ break;
-+
-+ if (tmp) {
-+ unsigned long cr0;
-+
-+ pax_open_kernel(cr0);
-+ memset(tmp->addr, 0xCC, tmp->size);
-+ pax_close_kernel(cr0);
-+
-+ *p = tmp->next;
-+ kfree(tmp);
-+ }
-+ write_unlock(&vmlist_lock);
-+
-+ if (!tmp) {
-+ printk(KERN_ERR "Trying to module_free_exec() nonexistent vm area (%p)\n",
-+ module_region);
-+ WARN_ON(1);
-+ }
-+}
-+EXPORT_SYMBOL(module_free_exec);
-+#else
-+void *module_alloc(unsigned long size)
-+{
-+ return __module_alloc(size, PAGE_KERNEL);
-+}
-+
-+void module_free_exec(struct module *mod, void *module_region)
-+{
-+ module_free(mod, module_region);
-+}
-+EXPORT_SYMBOL(module_free_exec);
-+
-+void *module_alloc_exec(unsigned long size)
-+{
-+ return __module_alloc(size, PAGE_KERNEL_RX);
-+}
-+EXPORT_SYMBOL(module_alloc_exec);
-+#endif
-+#else
-+void *module_alloc(unsigned long size)
-+{
-+ return __module_alloc(size, PAGE_KERNEL_EXEC);
- }
-+#endif
-
- /* Free memory returned from module_alloc */
- void module_free(struct module *mod, void *module_region)
-@@ -77,14 +161,20 @@ int apply_relocate(Elf32_Shdr *sechdrs,
- unsigned int i;
- Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
- Elf32_Sym *sym;
-- uint32_t *location;
-+ uint32_t *plocation, location;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ unsigned long cr0;
-+#endif
-
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
- for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
- /* This is where to make the change */
-- location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
-- + rel[i].r_offset;
-+ plocation = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + rel[i].r_offset;
-+ location = (uint32_t)plocation;
-+ if (sechdrs[sechdrs[relsec].sh_info].sh_flags & SHF_EXECINSTR)
-+ plocation = ktla_ktva((void *)plocation);
- /* This is the symbol it is referring to. Note that all
- undefined symbols have been resolved. */
- sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
-@@ -93,11 +183,31 @@ int apply_relocate(Elf32_Shdr *sechdrs,
- switch (ELF32_R_TYPE(rel[i].r_info)) {
- case R_386_32:
- /* We add the value into the location given */
-- *location += sym->st_value;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
-+ *plocation += sym->st_value;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-+
- break;
- case R_386_PC32:
- /* Add the value, subtract its postition */
-- *location += sym->st_value - (uint32_t)location;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
-+ *plocation += sym->st_value - location;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-+
- break;
- default:
- printk(KERN_ERR "module %s: Unknown relocation: %u\n",
-@@ -131,6 +241,10 @@ int apply_relocate_add(Elf64_Shdr *sechd
- void *loc;
- u64 val;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ unsigned long cr0;
-+#endif
-+
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
- for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
-@@ -153,21 +267,61 @@ int apply_relocate_add(Elf64_Shdr *sechd
- case R_X86_64_NONE:
- break;
- case R_X86_64_64:
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
- *(u64 *)loc = val;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-+
- break;
- case R_X86_64_32:
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
- *(u32 *)loc = val;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-+
- if (val != *(u32 *)loc)
- goto overflow;
- break;
- case R_X86_64_32S:
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
- *(s32 *)loc = val;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-+
- if ((s64)val != *(s32 *)loc)
- goto overflow;
- break;
- case R_X86_64_PC32:
- val -= (u64)loc;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_open_kernel(cr0);
-+#endif
-+
- *(u32 *)loc = val;
-+
-+#ifdef CONFIG_PAX_KERNEXEC
-+ pax_close_kernel(cr0);
-+#endif
-+
- #if 0
- if ((s64)val != *(s32 *)loc)
- goto overflow;
-diff -urNp
linux-2.6.31.1/arch/x86/kernel/paravirt.c linux-2.6.31.1/arch/x86/kernel/paravirt.c ---- linux-2.6.31.1/arch/x86/kernel/paravirt.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/paravirt.c 2009-10-01 20:12:42.000000000 -0400 -@@ -54,7 +54,7 @@ u64 _paravirt_ident_64(u64 x) - return x; - } - --static void __init default_banner(void) -+static void default_banner(void) - { - printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - pv_info.name); -@@ -125,9 +125,9 @@ unsigned paravirt_patch_jmp(void *insnbu - - /* Neat trick to map patch type back to the call within the - * corresponding structure. */ --static void *get_call_destination(u8 type) -+static const void *get_call_destination(u8 type) - { -- struct paravirt_patch_template tmpl = { -+ const struct paravirt_patch_template tmpl = { - .pv_init_ops = pv_init_ops, - .pv_time_ops = pv_time_ops, - .pv_cpu_ops = pv_cpu_ops, -@@ -138,13 +138,13 @@ static void *get_call_destination(u8 typ - .pv_lock_ops = pv_lock_ops, - #endif - }; -- return *((void **)&tmpl + type); -+ return *((const void **)&tmpl + type); - } - - unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) - { -- void *opfunc = get_call_destination(type); -+ const void *opfunc = get_call_destination(type); - unsigned ret; - - if (opfunc == NULL) -@@ -183,7 +183,7 @@ unsigned paravirt_patch_insns(void *insn - if (insn_len > len || start == NULL) - insn_len = len; - else -- memcpy(insnbuf, start, insn_len); -+ memcpy(insnbuf, ktla_ktva(start), insn_len); - - return insn_len; - } -@@ -311,21 +311,21 @@ void arch_flush_lazy_mmu_mode(void) - preempt_enable(); - } - --struct pv_info pv_info = { -+struct pv_info pv_info __read_only = { - .name = "bare hardware", - .paravirt_enabled = 0, - .kernel_rpl = 0, - .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ - }; - --struct pv_init_ops pv_init_ops = { -+struct pv_init_ops pv_init_ops __read_only = { - .patch = native_patch, - .banner = default_banner, - .arch_setup = paravirt_nop, - .memory_setup = machine_specific_memory_setup, - }; - --struct pv_time_ops pv_time_ops = { -+struct pv_time_ops pv_time_ops __read_only = { - .time_init = hpet_time_init, - .get_wallclock = native_get_wallclock, - .set_wallclock = native_set_wallclock, -@@ -333,7 +333,7 @@ struct pv_time_ops pv_time_ops = { - .get_tsc_khz = native_calibrate_tsc, - }; - --struct pv_irq_ops pv_irq_ops = { -+struct pv_irq_ops pv_irq_ops __read_only = { - .init_IRQ = native_init_IRQ, - .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), -@@ -346,7 +346,7 @@ struct pv_irq_ops pv_irq_ops = { - #endif - }; - --struct pv_cpu_ops pv_cpu_ops = { -+struct pv_cpu_ops pv_cpu_ops __read_only = { - .cpuid = native_cpuid, - .get_debugreg = native_get_debugreg, - .set_debugreg = native_set_debugreg, -@@ -406,7 +406,7 @@ struct pv_cpu_ops pv_cpu_ops = { - .end_context_switch = paravirt_nop, - }; - --struct pv_apic_ops pv_apic_ops = { -+struct pv_apic_ops pv_apic_ops __read_only = { - #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = setup_boot_APIC_clock, - .setup_secondary_clock = setup_secondary_APIC_clock, -@@ -422,7 +422,7 @@ struct pv_apic_ops pv_apic_ops = { - #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) - #endif - --struct pv_mmu_ops pv_mmu_ops = { -+struct pv_mmu_ops pv_mmu_ops __read_only = { - #ifndef CONFIG_X86_64 - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, 
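
The kprobes, module and paravirt hunks above all funnel writes to otherwise read-only kernel text through pax_open_kernel()/pax_close_kernel() pairs. A minimal sketch of that idiom -- assuming the stock x86 CR0 accessors, and not taken from the patch itself -- looks like this:

    #include <linux/preempt.h>            /* preempt_disable()/preempt_enable() */
    #include <asm/processor-flags.h>      /* X86_CR0_WP */
    #include <asm/system.h>               /* read_cr0()/write_cr0() on 2.6.31-era kernels */

    /* Open a short write window into kernel text by clearing CR0.WP,
     * the bit that makes supervisor-mode writes honour page protections. */
    static inline unsigned long sketch_open_kernel(void)
    {
            unsigned long cr0;

            preempt_disable();            /* CR0 is per-CPU state */
            cr0 = read_cr0();
            write_cr0(cr0 & ~X86_CR0_WP); /* let the patching write through */
            return cr0;
    }

    static inline void sketch_close_kernel(unsigned long cr0)
    {
            write_cr0(cr0);               /* restore write protection */
            preempt_enable();
    }

The real pax_open_kernel()/pax_close_kernel() macros pass cr0 by name rather than by value, but the effect is the same: the instruction patching in set_jmp_op(), arch_copy_kprobe() and the relocation loops happens inside such a window, while the text mapping stays read-only the rest of the time.
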
-diff -urNp linux-2.6.31.1/arch/x86/kernel/paravirt-spinlocks.c linux-2.6.31.1/arch/x86/kernel/paravirt-spinlocks.c ---- linux-2.6.31.1/arch/x86/kernel/paravirt-spinlocks.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/paravirt-spinlocks.c 2009-10-01 20:12:42.000000000 -0400 -@@ -13,7 +13,7 @@ default_spin_lock_flags(raw_spinlock_t * - __raw_spin_lock(lock); - } - --struct pv_lock_ops pv_lock_ops = { -+struct pv_lock_ops pv_lock_ops __read_only = { - #ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, -diff -urNp linux-2.6.31.1/arch/x86/kernel/process_32.c linux-2.6.31.1/arch/x86/kernel/process_32.c ---- linux-2.6.31.1/arch/x86/kernel/process_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/process_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -70,6 +70,7 @@ EXPORT_PER_CPU_SYMBOL(current_task); - unsigned long thread_saved_pc(struct task_struct *tsk) - { - return ((unsigned long *)tsk->thread.sp)[3]; -+//XXX return tsk->thread.eip; - } - - #ifndef CONFIG_SMP -@@ -132,7 +133,7 @@ void __show_regs(struct pt_regs *regs, i - unsigned short ss, gs; - const char *board; - -- if (user_mode_vm(regs)) { -+ if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - gs = get_user_gs(regs); -@@ -213,8 +214,8 @@ int kernel_thread(int (*fn)(void *), voi - regs.bx = (unsigned long) fn; - regs.dx = (unsigned long) arg; - -- regs.ds = __USER_DS; -- regs.es = __USER_DS; -+ regs.ds = __KERNEL_DS; -+ regs.es = __KERNEL_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; - regs.orig_ax = -1; -@@ -250,7 +251,7 @@ int copy_thread(unsigned long clone_flag - struct task_struct *tsk; - int err; - -- childregs = task_pt_regs(p); -+ childregs = task_stack_page(p) + THREAD_SIZE - sizeof(struct pt_regs) - 8; - *childregs = *regs; - childregs->ax = 0; - childregs->sp = sp; -@@ -279,6 +280,7 @@ int copy_thread(unsigned long clone_flag - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) -+//XXX needs set_fs()? - err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); - -@@ -349,7 +351,7 @@ __switch_to(struct task_struct *prev_p, - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; - int cpu = smp_processor_id(); -- struct tss_struct *tss = &per_cpu(init_tss, cpu); -+ struct tss_struct *tss = init_tss + cpu; - - /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - -@@ -377,6 +379,11 @@ __switch_to(struct task_struct *prev_p, - */ - lazy_save_gs(prev->gs); - -+#ifdef CONFIG_PAX_MEMORY_UDEREF -+ if (!segment_eq(task_thread_info(prev_p)->addr_limit, task_thread_info(next_p)->addr_limit)) -+ __set_fs(task_thread_info(next_p)->addr_limit, cpu); -+#endif -+ - /* - * Load the per-thread Thread-Local Storage descriptor. 
- */ -@@ -495,3 +502,27 @@ unsigned long get_wchan(struct task_stru - return 0; - } - -+#ifdef CONFIG_PAX_RANDKSTACK -+asmlinkage void pax_randomize_kstack(void) -+{ -+ struct thread_struct *thread = &current->thread; -+ unsigned long time; -+ -+ if (!randomize_va_space) -+ return; -+ -+ rdtscl(time); -+ -+ /* P4 seems to return a 0 LSB, ignore it */ -+#ifdef CONFIG_MPENTIUM4 -+ time &= 0x1EUL; -+ time <<= 2; -+#else -+ time &= 0xFUL; -+ time <<= 3; -+#endif -+ -+ thread->sp0 ^= time; -+ load_sp0(init_tss + smp_processor_id(), thread); -+} -+#endif -diff -urNp linux-2.6.31.1/arch/x86/kernel/process_64.c linux-2.6.31.1/arch/x86/kernel/process_64.c ---- linux-2.6.31.1/arch/x86/kernel/process_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/process_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -94,7 +94,7 @@ static void __exit_idle(void) - void exit_idle(void) - { - /* idle loop has pid 0 */ -- if (current->pid) -+ if (task_pid_nr(current)) - return; - __exit_idle(); - } -@@ -173,7 +173,7 @@ void __show_regs(struct pt_regs *regs, i - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", -- current->pid, current->comm, print_tainted(), -+ task_pid_nr(current), current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); -@@ -384,7 +384,7 @@ __switch_to(struct task_struct *prev_p, - struct thread_struct *prev = &prev_p->thread; - struct thread_struct *next = &next_p->thread; - int cpu = smp_processor_id(); -- struct tss_struct *tss = &per_cpu(init_tss, cpu); -+ struct tss_struct *tss = init_tss + cpu; - unsigned fsindex, gsindex; - - /* we're going to use this soon, after a few expensive things */ -@@ -543,12 +543,11 @@ unsigned long get_wchan(struct task_stru - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - stack = (unsigned long)task_stack_page(p); -- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) -+ if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE-8-sizeof(u64)) - return 0; - fp = *(u64 *)(p->thread.sp); - do { -- if (fp < (unsigned long)stack || -- fp >= (unsigned long)stack+THREAD_SIZE) -+ if (fp < stack || fp > stack+THREAD_SIZE-8-sizeof(u64)) - return 0; - ip = *(u64 *)(fp+8); - if (!in_sched_functions(ip)) -diff -urNp linux-2.6.31.1/arch/x86/kernel/process.c linux-2.6.31.1/arch/x86/kernel/process.c ---- linux-2.6.31.1/arch/x86/kernel/process.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/process.c 2009-10-01 20:12:42.000000000 -0400 -@@ -76,7 +76,7 @@ void exit_thread(void) - unsigned long *bp = t->io_bitmap_ptr; - - if (bp) { -- struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); -+ struct tss_struct *tss = init_tss + get_cpu(); - - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); -@@ -108,6 +108,9 @@ void flush_thread(void) - - clear_tsk_thread_flag(tsk, TIF_DEBUG); - -+#if defined(CONFIG_X86_32) && !defined(CONFIG_CC_STACKPROTECTOR) -+ loadsegment(gs, 0); -+#endif - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; -@@ -611,17 +614,3 @@ static int __init idle_setup(char *str) - return 0; - } - early_param("idle", idle_setup); -- --unsigned long arch_align_stack(unsigned long sp) --{ -- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) -- sp -= get_random_int() % 8192; -- return sp & ~0xf; --} -- --unsigned long arch_randomize_brk(struct mm_struct *mm) --{ -- unsigned long range_end = mm->brk + 0x02000000; - 
return randomize_range(mm->brk, range_end, 0) ? : mm->brk; --} -- -diff -urNp linux-2.6.31.1/arch/x86/kernel/ptrace.c linux-2.6.31.1/arch/x86/kernel/ptrace.c ---- linux-2.6.31.1/arch/x86/kernel/ptrace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/ptrace.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1454,7 +1454,7 @@ void send_sigtrap(struct task_struct *ts - info.si_code = si_code; - - /* User-mode ip? */ -- info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; -+ info.si_addr = user_mode(regs) ? (void __user *) regs->ip : NULL; - - /* Send us the fake SIGTRAP */ - force_sig_info(SIGTRAP, &info, tsk); -diff -urNp linux-2.6.31.1/arch/x86/kernel/reboot.c linux-2.6.31.1/arch/x86/kernel/reboot.c ---- linux-2.6.31.1/arch/x86/kernel/reboot.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/reboot.c 2009-10-01 20:12:42.000000000 -0400 -@@ -31,7 +31,7 @@ void (*pm_power_off)(void); - EXPORT_SYMBOL(pm_power_off); - - static const struct desc_ptr no_idt = {}; --static int reboot_mode; -+static unsigned short reboot_mode; - enum reboot_type reboot_type = BOOT_KBD; - int reboot_force; - -@@ -257,7 +257,7 @@ static struct dmi_system_id __initdata r - DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), - }, - }, -- { } -+ { NULL, NULL, {{0, {0}}}, NULL} - }; - - static int __init reboot_init(void) -@@ -273,12 +273,12 @@ core_initcall(reboot_init); - controller to pulse the CPU reset line, which is more thorough, but - doesn't work with at least one type of 486 motherboard. It is easy - to stop this code working; hence the copious comments. */ --static const unsigned long long --real_mode_gdt_entries [3] = -+static struct desc_struct -+real_mode_gdt_entries [3] __read_only = - { -- 0x0000000000000000ULL, /* Null descriptor */ -- 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ -- 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ -+ {{{0x00000000, 0x00000000}}}, /* Null descriptor */ -+ {{{0x0000ffff, 0x00009b00}}}, /* 16-bit real-mode 64k code at 0x00000000 */ -+ {{{0x0100ffff, 0x00009300}}} /* 16-bit real-mode 64k data at 0x00000100 */ - }; - - static const struct desc_ptr -@@ -327,7 +327,7 @@ static const unsigned char jump_to_bios - * specified by the code and length parameters. - * We assume that length will aways be less that 100! - */ --void machine_real_restart(const unsigned char *code, int length) -+void machine_real_restart(const unsigned char *code, unsigned int length) - { - local_irq_disable(); - -@@ -347,8 +347,8 @@ void machine_real_restart(const unsigned - /* Remap the kernel at virtual address zero, as well as offset zero - from the kernel segment. This assumes the kernel segment starts at - virtual address PAGE_OFFSET. */ -- memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, -- sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); -+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, -+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); - - /* - * Use `swapper_pg_dir' as our page directory. -@@ -360,16 +360,15 @@ void machine_real_restart(const unsigned - boot)". This seems like a fairly standard thing that gets set by - REBOOT.COM programs, and the previous reset routine did this - too. */ -- *((unsigned short *)0x472) = reboot_mode; -+ *(unsigned short *)(__va(0x472)) = reboot_mode; - - /* For the switch to real mode, copy some code to low memory. 
It has - to be in the first 64k because it is running in 16-bit mode, and it - has to have the same physical and virtual address, because it turns - off paging. Copy it near the end of the first page, out of the way - of BIOS variables. */ -- memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), -- real_mode_switch, sizeof (real_mode_switch)); -- memcpy((void *)(0x1000 - 100), code, length); -+ memcpy(__va(0x1000 - sizeof (real_mode_switch) - 100), real_mode_switch, sizeof (real_mode_switch)); -+ memcpy(__va(0x1000 - 100), code, length); - - /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); -diff -urNp linux-2.6.31.1/arch/x86/kernel/setup.c linux-2.6.31.1/arch/x86/kernel/setup.c ---- linux-2.6.31.1/arch/x86/kernel/setup.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/setup.c 2009-10-01 20:12:42.000000000 -0400 -@@ -768,14 +768,14 @@ void __init setup_arch(char **cmdline_p) - - if (!boot_params.hdr.root_flags) - root_mountflags &= ~MS_RDONLY; -- init_mm.start_code = (unsigned long) _text; -- init_mm.end_code = (unsigned long) _etext; -+ init_mm.start_code = ktla_ktva((unsigned long) _text); -+ init_mm.end_code = ktla_ktva((unsigned long) _etext); - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = _brk_end; - -- code_resource.start = virt_to_phys(_text); -- code_resource.end = virt_to_phys(_etext)-1; -- data_resource.start = virt_to_phys(_etext); -+ code_resource.start = virt_to_phys(ktla_ktva(_text)); -+ code_resource.end = virt_to_phys(ktla_ktva(_etext))-1; -+ data_resource.start = virt_to_phys(_sdata); - data_resource.end = virt_to_phys(_edata)-1; - bss_resource.start = virt_to_phys(&__bss_start); - bss_resource.end = virt_to_phys(&__bss_stop)-1; -diff -urNp linux-2.6.31.1/arch/x86/kernel/setup_percpu.c linux-2.6.31.1/arch/x86/kernel/setup_percpu.c ---- linux-2.6.31.1/arch/x86/kernel/setup_percpu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/setup_percpu.c 2009-10-01 20:12:42.000000000 -0400 -@@ -25,19 +25,17 @@ - # define DBG(x...) - #endif - -+#ifdef CONFIG_SMP - DEFINE_PER_CPU(int, cpu_number); - EXPORT_PER_CPU_SYMBOL(cpu_number); -+#endif - --#ifdef CONFIG_X86_64 - #define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) --#else --#define BOOT_PERCPU_OFFSET 0 --#endif - - DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; - EXPORT_PER_CPU_SYMBOL(this_cpu_off); - --unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { -+unsigned long __per_cpu_offset[NR_CPUS] __read_only = { - [0 ... 
NR_CPUS-1] = BOOT_PERCPU_OFFSET, - }; - EXPORT_SYMBOL(__per_cpu_offset); -@@ -429,13 +427,15 @@ early_param("percpu_alloc", percpu_alloc - static inline void setup_percpu_segment(int cpu) - { - #ifdef CONFIG_X86_32 -- struct desc_struct gdt; -- -- pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, -- 0x2 | DESCTYPE_S, 0x8); -- gdt.s = 1; -- write_gdt_entry(get_cpu_gdt_table(cpu), -- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); -+ struct desc_struct d, *gdt = get_cpu_gdt_table(cpu); -+ unsigned long base = per_cpu_offset(cpu); -+ const unsigned long limit = VMALLOC_END - base - 1; -+ -+ if (limit < 64*1024) -+ pack_descriptor(&d, base, limit, 0x80 | DESCTYPE_S | 0x3, 0x4); -+ else -+ pack_descriptor(&d, base, limit >> PAGE_SHIFT, 0x80 | DESCTYPE_S | 0x3, 0xC); -+ write_gdt_entry(gdt, GDT_ENTRY_PERCPU, &d, DESCTYPE_S); - #endif - } - -@@ -486,6 +486,11 @@ void __init setup_per_cpu_areas(void) - /* alrighty, percpu areas up and running */ - delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; - for_each_possible_cpu(cpu) { -+#ifdef CONFIG_CC_STACKPROTECTOR -+#ifdef CONFIG_x86_32 -+ unsigned long canary = per_cpu(stack_canary, cpu); -+#endif -+#endif - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; - per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(cpu_number, cpu) = cpu; -@@ -513,6 +518,12 @@ void __init setup_per_cpu_areas(void) - early_per_cpu_map(x86_cpu_to_node_map, cpu); - #endif - #endif -+#ifdef CONFIG_CC_STACKPROTECTOR -+#ifdef CONFIG_x86_32 -+ if (cpu == boot_cpu_id) -+ per_cpu(stack_canary, cpu) = canary; -+#endif -+#endif - /* - * Up to this point, the boot CPU has been using .data.init - * area. Reload any changed state for the boot CPU. -diff -urNp linux-2.6.31.1/arch/x86/kernel/signal.c linux-2.6.31.1/arch/x86/kernel/signal.c ---- linux-2.6.31.1/arch/x86/kernel/signal.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/signal.c 2009-10-01 20:12:42.000000000 -0400 -@@ -197,7 +197,7 @@ static unsigned long align_sigframe(unsi - * Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. - */ -- sp = ((sp + 4) & -16ul) - 4; -+ sp = ((sp - 12) & -16ul) - 4; - #else /* !CONFIG_X86_32 */ - sp = round_down(sp, 16) - 8; - #endif -@@ -307,9 +307,9 @@ __setup_frame(int sig, struct k_sigactio - } - - if (current->mm->context.vdso) -- restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); -+ restorer = (void __user *)VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); - else -- restorer = &frame->retcode; -+ restorer = (void __user *)&frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - -@@ -377,7 +377,7 @@ static int __setup_rt_frame(int sig, str - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - /* Set up to return from userspace. */ -- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); -+ restorer = (void __user *)VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - put_user_ex(restorer, &frame->pretcode); -@@ -789,7 +789,7 @@ static void do_signal(struct pt_regs *re - * X86_32: vm86 regs switched out by assembly code before reaching - * here, so testing against kernel CS suffices. 
- */ -- if (!user_mode(regs)) -+ if (!user_mode_novm(regs)) - return; - - if (current_thread_info()->status & TS_RESTORE_SIGMASK) -diff -urNp linux-2.6.31.1/arch/x86/kernel/smpboot.c linux-2.6.31.1/arch/x86/kernel/smpboot.c ---- linux-2.6.31.1/arch/x86/kernel/smpboot.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/smpboot.c 2009-10-01 20:12:42.000000000 -0400 -@@ -685,6 +685,10 @@ static int __cpuinit do_boot_cpu(int api - .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), - }; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - INIT_WORK(&c_idle.work, do_fork_idle); - - alternatives_smp_switch(1); -@@ -727,7 +731,17 @@ do_rest: - (unsigned long)task_stack_page(c_idle.idle) - - KERNEL_STACK_OFFSET + THREAD_SIZE; - #endif -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - initial_code = (unsigned long)start_secondary; - stack_start.sp = (void *) c_idle.idle->thread.sp; - -diff -urNp linux-2.6.31.1/arch/x86/kernel/step.c linux-2.6.31.1/arch/x86/kernel/step.c ---- linux-2.6.31.1/arch/x86/kernel/step.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/step.c 2009-10-01 20:12:42.000000000 -0400 -@@ -23,22 +23,20 @@ unsigned long convert_ip_to_linear(struc - * and APM bios ones we just ignore here. - */ - if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { -- u32 *desc; -+ struct desc_struct *desc; - unsigned long base; - -- seg &= ~7UL; -+ seg >>= 3; - - mutex_lock(&child->mm->context.lock); -- if (unlikely((seg >> 3) >= child->mm->context.size)) -- addr = -1L; /* bogus selector, access would fault */ -+ if (unlikely(seg >= child->mm->context.size)) -+ addr = -EINVAL; - else { -- desc = child->mm->context.ldt + seg; -- base = ((desc[0] >> 16) | -- ((desc[1] & 0xff) << 16) | -- (desc[1] & 0xff000000)); -+ desc = &child->mm->context.ldt[seg]; -+ base = (desc->a >> 16) | ((desc->b & 0xff) << 16) | (desc->b & 0xff000000); - - /* 16-bit code segment? */ -- if (!((desc[1] >> 22) & 1)) -+ if (!((desc->b >> 22) & 1)) - addr &= 0xffff; - addr += base; - } -@@ -54,6 +52,9 @@ static int is_setting_trap_flag(struct t - unsigned char opcode[15]; - unsigned long addr = convert_ip_to_linear(child, regs); - -+ if (addr == -EINVAL) -+ return 0; -+ - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { -@@ -75,7 +76,7 @@ static int is_setting_trap_flag(struct t - - #ifdef CONFIG_X86_64 - case 0x40 ... 
0x4f: -- if (regs->cs != __USER_CS) -+ if ((regs->cs & 0xffff) != __USER_CS) - /* 32-bit mode: register increment */ - return 0; - /* 64-bit mode: REX prefix */ -diff -urNp linux-2.6.31.1/arch/x86/kernel/syscall_table_32.S linux-2.6.31.1/arch/x86/kernel/syscall_table_32.S ---- linux-2.6.31.1/arch/x86/kernel/syscall_table_32.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/syscall_table_32.S 2009-10-01 20:12:42.000000000 -0400 -@@ -1,3 +1,4 @@ -+.section .rodata,"a",@progbits - ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit -diff -urNp linux-2.6.31.1/arch/x86/kernel/sys_i386_32.c linux-2.6.31.1/arch/x86/kernel/sys_i386_32.c ---- linux-2.6.31.1/arch/x86/kernel/sys_i386_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/sys_i386_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -24,6 +24,21 @@ - - #include <asm/syscalls.h> - -+int i386_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) -+{ -+ unsigned long pax_task_size = TASK_SIZE; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif -+ -+ if (len > pax_task_size || addr > pax_task_size - len) -+ return -EINVAL; -+ -+ return 0; -+} -+ - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -@@ -83,6 +98,205 @@ out: - return err; - } - -+unsigned long -+arch_get_unmapped_area(struct file *filp, unsigned long addr, -+ unsigned long len, unsigned long pgoff, unsigned long flags) -+{ -+ struct mm_struct *mm = current->mm; -+ struct vm_area_struct *vma; -+ unsigned long start_addr, pax_task_size = TASK_SIZE; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif -+ -+ if (len > pax_task_size) -+ return -ENOMEM; -+ -+ if (flags & MAP_FIXED) -+ return addr; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ -+ if (addr) { -+ addr = PAGE_ALIGN(addr); -+ vma = find_vma(mm, addr); -+ if (pax_task_size - len >= addr && -+ (!vma || addr + len <= vma->vm_start)) -+ return addr; -+ } -+ if (len > mm->cached_hole_size) { -+ start_addr = addr = mm->free_area_cache; -+ } else { -+ start_addr = addr = mm->mmap_base; -+ mm->cached_hole_size = 0; -+ } -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!nx_enabled && (mm->pax_flags & MF_PAX_PAGEEXEC) && (flags & MAP_EXECUTABLE) && start_addr >= mm->mmap_base) { -+ start_addr = 0x00110000UL; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ start_addr += mm->delta_mmap & 0x03FFF000UL; -+#endif -+ -+ if (mm->start_brk <= start_addr && start_addr < mm->mmap_base) -+ start_addr = addr = mm->mmap_base; -+ else -+ addr = start_addr; -+ } -+#endif -+ -+full_search: -+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { -+ /* At this point: (!vma || addr < vma->vm_end). */ -+ if (pax_task_size - len < addr) { -+ /* -+ * Start a new search - just in case we missed -+ * some holes. 
-+ */ -+ if (start_addr != mm->mmap_base) { -+ start_addr = addr = mm->mmap_base; -+ mm->cached_hole_size = 0; -+ goto full_search; -+ } -+ return -ENOMEM; -+ } -+ if (!vma || addr + len <= vma->vm_start) { -+ /* -+ * Remember the place where we stopped the search: -+ */ -+ mm->free_area_cache = addr + len; -+ return addr; -+ } -+ if (addr + mm->cached_hole_size < vma->vm_start) -+ mm->cached_hole_size = vma->vm_start - addr; -+ addr = vma->vm_end; -+ if (mm->start_brk <= addr && addr < mm->mmap_base) { -+ start_addr = addr = mm->mmap_base; -+ mm->cached_hole_size = 0; -+ goto full_search; -+ } -+ } -+} -+ -+unsigned long -+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, -+ const unsigned long len, const unsigned long pgoff, -+ const unsigned long flags) -+{ -+ struct vm_area_struct *vma; -+ struct mm_struct *mm = current->mm; -+ unsigned long base = mm->mmap_base, addr = addr0, pax_task_size = TASK_SIZE; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif -+ -+ /* requested length too big for entire address space */ -+ if (len > pax_task_size) -+ return -ENOMEM; -+ -+ if (flags & MAP_FIXED) -+ return addr; -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!nx_enabled && (mm->pax_flags & MF_PAX_PAGEEXEC) && (flags & MAP_EXECUTABLE)) -+ goto bottomup; -+#endif -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ -+ /* requesting a specific address */ -+ if (addr) { -+ addr = PAGE_ALIGN(addr); -+ vma = find_vma(mm, addr); -+ if (pax_task_size - len >= addr && -+ (!vma || addr + len <= vma->vm_start)) -+ return addr; -+ } -+ -+ /* check if free_area_cache is useful for us */ -+ if (len <= mm->cached_hole_size) { -+ mm->cached_hole_size = 0; -+ mm->free_area_cache = mm->mmap_base; -+ } -+ -+ /* either no address requested or can't fit in requested address hole */ -+ addr = mm->free_area_cache; -+ -+ /* make sure it can fit in the remaining address space */ -+ if (addr > len) { -+ vma = find_vma(mm, addr-len); -+ if (!vma || addr <= vma->vm_start) -+ /* remember the address as a hint for next time */ -+ return (mm->free_area_cache = addr-len); -+ } -+ -+ if (mm->mmap_base < len) -+ goto bottomup; -+ -+ addr = mm->mmap_base-len; -+ -+ do { -+ /* -+ * Lookup failure means no vma is above this address, -+ * else if new region fits below vma->vm_start, -+ * return with success: -+ */ -+ vma = find_vma(mm, addr); -+ if (!vma || addr+len <= vma->vm_start) -+ /* remember the address as a hint for next time */ -+ return (mm->free_area_cache = addr); -+ -+ /* remember the largest hole we saw so far */ -+ if (addr + mm->cached_hole_size < vma->vm_start) -+ mm->cached_hole_size = vma->vm_start - addr; -+ -+ /* try just below the current vma->vm_start */ -+ addr = vma->vm_start-len; -+ } while (len < vma->vm_start); -+ -+bottomup: -+ /* -+ * A failed mmap() very likely causes application failure, -+ * so fall back to the bottom-up function here. This scenario -+ * can happen with large stack limits and large mmap() -+ * allocations. 
-+ */ -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ mm->mmap_base = SEGMEXEC_TASK_UNMAPPED_BASE; -+ else -+#endif -+ -+ mm->mmap_base = TASK_UNMAPPED_BASE; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ -+ mm->free_area_cache = mm->mmap_base; -+ mm->cached_hole_size = ~0UL; -+ addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); -+ /* -+ * Restore the topdown base: -+ */ -+ mm->mmap_base = base; -+ mm->free_area_cache = base; -+ mm->cached_hole_size = ~0UL; -+ -+ return addr; -+} - - struct sel_arg_struct { - unsigned long n; -diff -urNp linux-2.6.31.1/arch/x86/kernel/sys_x86_64.c linux-2.6.31.1/arch/x86/kernel/sys_x86_64.c ---- linux-2.6.31.1/arch/x86/kernel/sys_x86_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/sys_x86_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -47,8 +47,8 @@ out: - return error; - } - --static void find_start_end(unsigned long flags, unsigned long *begin, -- unsigned long *end) -+static void find_start_end(struct mm_struct *mm, unsigned long flags, -+ unsigned long *begin, unsigned long *end) - { - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { - unsigned long new_begin; -@@ -67,7 +67,7 @@ static void find_start_end(unsigned long - *begin = new_begin; - } - } else { -- *begin = TASK_UNMAPPED_BASE; -+ *begin = mm->mmap_base; - *end = TASK_SIZE; - } - } -@@ -84,11 +84,15 @@ arch_get_unmapped_area(struct file *filp - if (flags & MAP_FIXED) - return addr; - -- find_start_end(flags, &begin, &end); -+ find_start_end(mm, flags, &begin, &end); - - if (len > end) - return -ENOMEM; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); -@@ -143,7 +147,7 @@ arch_get_unmapped_area_topdown(struct fi - { - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; -- unsigned long addr = addr0; -+ unsigned long base = mm->mmap_base, addr = addr0; - - /* requested length too big for entire address space */ - if (len > TASK_SIZE) -@@ -156,6 +160,10 @@ arch_get_unmapped_area_topdown(struct fi - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) - goto bottomup; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - /* requesting a specific address */ - if (addr) { - addr = PAGE_ALIGN(addr); -@@ -213,13 +221,21 @@ bottomup: - * can happen with large stack limits and large mmap() - * allocations. 
*/ -+ mm->mmap_base = TASK_UNMAPPED_BASE; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ -+ mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; -- mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ -- mm->free_area_cache = mm->mmap_base; -+ mm->mmap_base = base; -+ mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; - - return addr; -diff -urNp linux-2.6.31.1/arch/x86/kernel/time_32.c linux-2.6.31.1/arch/x86/kernel/time_32.c ---- linux-2.6.31.1/arch/x86/kernel/time_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/time_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -47,22 +47,32 @@ unsigned long profile_pc(struct pt_regs - unsigned long pc = instruction_pointer(regs); - - #ifdef CONFIG_SMP -- if (!user_mode_vm(regs) && in_lock_functions(pc)) { -+ if (!user_mode(regs) && in_lock_functions(pc)) { - #ifdef CONFIG_FRAME_POINTER -- return *(unsigned long *)(regs->bp + sizeof(long)); -+ return ktla_ktva(*(unsigned long *)(regs->bp + sizeof(long))); - #else - unsigned long *sp = (unsigned long *)&regs->sp; - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ return ktla_ktva(sp[0]); -+#else - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; - #endif -+ -+#endif - } - #endif -+ -+ if (!user_mode(regs)) -+ pc = ktla_ktva(pc); -+ - return pc; - } - EXPORT_SYMBOL(profile_pc); -diff -urNp linux-2.6.31.1/arch/x86/kernel/time_64.c linux-2.6.31.1/arch/x86/kernel/time_64.c ---- linux-2.6.31.1/arch/x86/kernel/time_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/time_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -25,8 +25,6 @@ - #include <asm/time.h> - #include <asm/timer.h> - --volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -- - unsigned long profile_pc(struct pt_regs *regs) - { - unsigned long pc = instruction_pointer(regs); -@@ -34,7 +32,7 @@ unsigned long profile_pc(struct pt_regs - /* Assume the lock function has either no stack frame or a copy - of flags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ -- if (!user_mode_vm(regs) && in_lock_functions(pc)) { -+ if (!user_mode(regs) && in_lock_functions(pc)) { - #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); - #else -diff -urNp linux-2.6.31.1/arch/x86/kernel/tls.c linux-2.6.31.1/arch/x86/kernel/tls.c ---- linux-2.6.31.1/arch/x86/kernel/tls.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/tls.c 2009-10-01 20:12:42.000000000 -0400 -@@ -85,6 +85,11 @@ int do_set_thread_area(struct task_struc - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((p->mm->pax_flags & MF_PAX_SEGMEXEC) && (info.contents & MODIFY_LDT_CONTENTS_CODE)) -+ return -EINVAL; -+#endif -+ - set_tls_desc(p, idx, &info, 1); - - return 0; -diff -urNp linux-2.6.31.1/arch/x86/kernel/traps.c linux-2.6.31.1/arch/x86/kernel/traps.c ---- linux-2.6.31.1/arch/x86/kernel/traps.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/traps.c 2009-10-01 20:12:42.000000000 -0400 -@@ -70,14 +70,6 @@ asmlinkage int system_call(void); - - /* Do we ignore FPU interrupts ? 
*/ - char ignore_fpu_irq; -- --/* -- * The IDT has to be page-aligned to simplify the Pentium -- * F0 0F bug workaround.. We have a special link segment -- * for this. -- */ --gate_desc idt_table[256] -- __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; - #endif - - DECLARE_BITMAP(used_vectors, NR_VECTORS); -@@ -115,7 +107,7 @@ static inline void preempt_conditional_c - static inline void - die_if_kernel(const char *str, struct pt_regs *regs, long err) - { -- if (!user_mode_vm(regs)) -+ if (!user_mode(regs)) - die(str, regs, err); - } - #endif -@@ -127,7 +119,7 @@ do_trap(int trapnr, int signr, char *str - struct task_struct *tsk = current; - - #ifdef CONFIG_X86_32 -- if (regs->flags & X86_VM_MASK) { -+ if (v8086_mode(regs)) { - /* - * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. - * On nmi (interrupt 2), do_trap should not be called. -@@ -138,7 +130,7 @@ do_trap(int trapnr, int signr, char *str - } - #endif - -- if (!user_mode(regs)) -+ if (!user_mode_novm(regs)) - goto kernel_trap; - - #ifdef CONFIG_X86_32 -@@ -161,7 +153,7 @@ trap_signal: - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", -- tsk->comm, tsk->pid, str, -+ tsk->comm, task_pid_nr(tsk), str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); -@@ -180,6 +172,12 @@ kernel_trap: - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } -+ -+#ifdef CONFIG_PAX_REFCOUNT -+ if (trapnr == 4) -+ pax_report_refcount_overflow(regs); -+#endif -+ - return; - - #ifdef CONFIG_X86_32 -@@ -268,14 +266,30 @@ do_general_protection(struct pt_regs *re - conditional_sti(regs); - - #ifdef CONFIG_X86_32 -- if (regs->flags & X86_VM_MASK) -+ if (v8086_mode(regs)) - goto gp_in_vm86; - #endif - - tsk = current; -- if (!user_mode(regs)) -+ if (!user_mode_novm(regs)) - goto gp_in_kernel; - -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) -+ if (!nx_enabled && tsk->mm && (tsk->mm->pax_flags & MF_PAX_PAGEEXEC)) { -+ struct mm_struct *mm = tsk->mm; -+ unsigned long limit; -+ -+ down_write(&mm->mmap_sem); -+ limit = mm->context.user_cs_limit; -+ if (limit < TASK_SIZE) { -+ track_exec_limit(mm, limit, TASK_SIZE, VM_EXEC); -+ up_write(&mm->mmap_sem); -+ return; -+ } -+ up_write(&mm->mmap_sem); -+ } -+#endif -+ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - -@@ -308,6 +322,13 @@ gp_in_kernel: - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; -+ -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+ if ((regs->cs & 0xFFFF) == __KERNEL_CS) -+ die("PAX: suspicious general protection fault", regs, error_code); -+ else -+#endif -+ - die("general protection fault", regs, error_code); - } - -@@ -561,7 +582,7 @@ dotraplinkage void __kprobes do_debug(st - } - - #ifdef CONFIG_X86_32 -- if (regs->flags & X86_VM_MASK) -+ if (v8086_mode(regs)) - goto debug_vm86; - #endif - -@@ -573,7 +594,7 @@ dotraplinkage void __kprobes do_debug(st - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { -- if (!user_mode(regs)) -+ if (!user_mode_novm(regs)) - goto clear_TF_reenable; - } - -@@ -760,7 +781,7 @@ do_simd_coprocessor_error(struct pt_regs - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. 
- */ -- if (regs->flags & X86_VM_MASK) { -+ if (v8086_mode(regs)) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } -@@ -789,19 +810,14 @@ do_spurious_interrupt_bug(struct pt_regs - #ifdef CONFIG_X86_32 - unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) - { -- struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; -- __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; -+ struct desc_struct ss; - - /* Set up base for espfix segment */ -- desc &= 0x00f0ff0000000000ULL; -- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | -- ((((__u64)base) << 32) & 0xff00000000000000ULL) | -- ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | -- (lim_pages & 0xffff); -- *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; -+ pack_descriptor(&ss, base, lim_pages, 0x93, 0xC); -+ write_gdt_entry(get_cpu_gdt_table(smp_processor_id()), GDT_ENTRY_ESPFIX_SS, &ss, DESCTYPE_S); - - return new_kesp; - } -diff -urNp linux-2.6.31.1/arch/x86/kernel/tsc.c linux-2.6.31.1/arch/x86/kernel/tsc.c ---- linux-2.6.31.1/arch/x86/kernel/tsc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/tsc.c 2009-10-01 20:12:42.000000000 -0400 -@@ -790,7 +790,7 @@ static struct dmi_system_id __initdata b - DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), - }, - }, -- {} -+ { NULL, NULL, {{0, {0}}}, NULL} - }; - - static void __init check_system_tsc_reliable(void) -diff -urNp linux-2.6.31.1/arch/x86/kernel/vm86_32.c linux-2.6.31.1/arch/x86/kernel/vm86_32.c ---- linux-2.6.31.1/arch/x86/kernel/vm86_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/vm86_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -148,7 +148,7 @@ struct pt_regs *save_v86_state(struct ke - do_exit(SIGSEGV); - } - -- tss = &per_cpu(init_tss, get_cpu()); -+ tss = init_tss + get_cpu(); - current->thread.sp0 = current->thread.saved_sp0; - current->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &current->thread); -@@ -324,7 +324,7 @@ static void do_sys_vm86(struct kernel_vm - tsk->thread.saved_fs = info->regs32->fs; - tsk->thread.saved_gs = get_user_gs(info->regs32); - -- tss = &per_cpu(init_tss, get_cpu()); -+ tss = init_tss + get_cpu(); - tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; - if (cpu_has_sep) - tsk->thread.sysenter_cs = 0; -diff -urNp linux-2.6.31.1/arch/x86/kernel/vmi_32.c linux-2.6.31.1/arch/x86/kernel/vmi_32.c ---- linux-2.6.31.1/arch/x86/kernel/vmi_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/vmi_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -102,18 +102,43 @@ static unsigned patch_internal(int call, - { - u64 reloc; - struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - reloc = call_vrom_long_func(vmi_rom, get_reloc, call); - switch(rel->type) { - case VMI_RELOCATION_CALL_REL: - BUG_ON(len < 5); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - *(char *)insnbuf = MNEM_CALL; - patch_offset(insnbuf, ip, (unsigned long)rel->eip); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - return 5; - - case VMI_RELOCATION_JUMP_REL: - BUG_ON(len < 5); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - *(char *)insnbuf = MNEM_JMP; - patch_offset(insnbuf, ip, (unsigned long)rel->eip); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ 
pax_close_kernel(cr0); -+#endif -+ - return 5; - - case VMI_RELOCATION_NOP: -@@ -404,13 +429,13 @@ static void vmi_set_pud(pud_t *pudp, pud - - static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) - { -- const pte_t pte = { .pte = 0 }; -+ const pte_t pte = __pte(0ULL); - vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); - } - - static void vmi_pmd_clear(pmd_t *pmd) - { -- const pte_t pte = { .pte = 0 }; -+ const pte_t pte = __pte(0ULL); - vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); - } - #endif -@@ -438,8 +463,8 @@ vmi_startup_ipi_hook(int phys_apicid, un - ap.ss = __KERNEL_DS; - ap.esp = (unsigned long) start_esp; - -- ap.ds = __USER_DS; -- ap.es = __USER_DS; -+ ap.ds = __KERNEL_DS; -+ ap.es = __KERNEL_DS; - ap.fs = __KERNEL_PERCPU; - ap.gs = __KERNEL_STACK_CANARY; - -@@ -640,12 +665,20 @@ static inline int __init activate_vmi(vo - u64 reloc; - const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - if (call_vrom_func(vmi_rom, vmi_init) != 0) { - printk(KERN_ERR "VMI ROM failed to initialize!"); - return 0; - } - savesegment(cs, kernel_cs); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - pv_info.paravirt_enabled = 1; - pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi"; -@@ -836,6 +869,10 @@ static inline int __init activate_vmi(vo - - para_fill(pv_irq_ops.safe_halt, Halt); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - /* - * Alternative instruction rewriting doesn't happen soon enough - * to convert VMI_IRET to a call instead of a jump; so we have -diff -urNp linux-2.6.31.1/arch/x86/kernel/vmlinux.lds.S linux-2.6.31.1/arch/x86/kernel/vmlinux.lds.S ---- linux-2.6.31.1/arch/x86/kernel/vmlinux.lds.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/vmlinux.lds.S 2009-10-01 20:12:42.000000000 -0400 -@@ -26,6 +26,22 @@ - #include <asm/page_types.h> - #include <asm/cache.h> - #include <asm/boot.h> -+#include <asm/segment.h> -+ -+#undef PMD_SIZE -+#undef PMD_SHIFT -+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -+#define PMD_SHIFT 21 -+#else -+#define PMD_SHIFT 22 -+#endif -+#define PMD_SIZE (1 << PMD_SHIFT) -+ -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+#define __KERNEL_TEXT_OFFSET (LOAD_OFFSET + ____LOAD_PHYSICAL_ADDR) -+#else -+#define __KERNEL_TEXT_OFFSET 0 -+#endif - - #undef i386 /* in case the preprocessor is a 32bit one */ - -@@ -34,46 +50,52 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONF - #ifdef CONFIG_X86_32 - OUTPUT_ARCH(i386) - ENTRY(phys_startup_32) --jiffies = jiffies_64; - #else - OUTPUT_ARCH(i386:x86-64) - ENTRY(phys_startup_64) --jiffies_64 = jiffies; - #endif - -+jiffies = jiffies_64; -+ - PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ -- data PT_LOAD FLAGS(7); /* RWE */ -+ rodata PT_LOAD FLAGS(4); /* R__ */ -+ data PT_LOAD FLAGS(6); /* RW_ */ - #ifdef CONFIG_X86_64 -- user PT_LOAD FLAGS(7); /* RWE */ -+ user PT_LOAD FLAGS(5); /* R_E */ -+#endif -+ init.begin PT_LOAD FLAGS(6); /* RW_ */ - #ifdef CONFIG_SMP -- percpu PT_LOAD FLAGS(7); /* RWE */ -+ percpu PT_LOAD FLAGS(6); /* RW_ */ - #endif -+ text.init PT_LOAD FLAGS(5); /* R_E */ -+ text.exit PT_LOAD FLAGS(5); /* R_E */ - init PT_LOAD FLAGS(7); /* RWE */ --#endif - note PT_NOTE FLAGS(0); /* ___ */ - } - - SECTIONS - { - #ifdef CONFIG_X86_32 -- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; -- phys_startup_32 = startup_32 - LOAD_OFFSET; -+ . 
= LOAD_OFFSET + ____LOAD_PHYSICAL_ADDR; - #else -- . = __START_KERNEL; -- phys_startup_64 = startup_64 - LOAD_OFFSET; -+ . = __START_KERNEL; - #endif - - /* Text and read-only data */ - -- /* bootstrapping code */ -- .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { -+ .text (. - __KERNEL_TEXT_OFFSET): AT(ADDR(.text) - LOAD_OFFSET + __KERNEL_TEXT_OFFSET) { -+ /* bootstrapping code */ -+#ifdef CONFIG_X86_32 -+ phys_startup_32 = startup_32 - LOAD_OFFSET + __KERNEL_TEXT_OFFSET; -+#else -+ phys_startup_64 = startup_64 - LOAD_OFFSET + __KERNEL_TEXT_OFFSET; -+#endif -+ __LOAD_PHYSICAL_ADDR = . - LOAD_OFFSET + __KERNEL_TEXT_OFFSET; - _text = .; - *(.text.head) -- } :text = 0x9090 - -- /* The rest of the text */ -- .text : AT(ADDR(.text) - LOAD_OFFSET) { -+ /* The rest of the text */ - #ifdef CONFIG_X86_32 - /* not really needed, already page aligned */ - . = ALIGN(PAGE_SIZE); -@@ -92,7 +114,10 @@ SECTIONS - _etext = .; - } :text = 0x9090 - -- NOTES :text :note -+ . += __KERNEL_TEXT_OFFSET; -+ -+ . = ALIGN(PAGE_SIZE); -+ NOTES :rodata :note - - /* Exception table */ - . = ALIGN(16); -@@ -100,22 +125,53 @@ SECTIONS - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; -- } :text = 0x9090 -+ } :rodata - - RO_DATA(PAGE_SIZE) - -+#ifdef CONFIG_X86_32 -+ . = ALIGN(PAGE_SIZE); -+ .rodata.page_aligned : AT(ADDR(.rodata.page_aligned) - LOAD_OFFSET) { -+ *(.idt) -+ . = ALIGN(PAGE_SIZE); -+ *(.empty_zero_page) -+ *(.swapper_pg_pmd) -+ *(.swapper_pg_dir) -+ -+#if defined(CONFIG_PAX_KERNEXEC) && !defined(CONFIG_MODULES) -+ . = ALIGN(PMD_SIZE); -+#endif -+ -+ } -+ -+#if defined(CONFIG_PAX_KERNEXEC) && defined(CONFIG_MODULES) -+ . = ALIGN(PAGE_SIZE); -+ .module.text : AT(ADDR(.module.text) - LOAD_OFFSET) { -+ MODULES_EXEC_VADDR = .; -+ BYTE(0) -+ . += (8 * 1024 * 1024); -+ . = ALIGN(PMD_SIZE); -+ MODULES_EXEC_END = . - 1; -+ } -+#endif -+#endif -+ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ . = ALIGN(PMD_SIZE); -+#else -+ . = ALIGN(PAGE_SIZE); -+#endif -+ - /* Start of data section */ - _sdata = .; - - /* init_task */ - INIT_TASK_DATA(THREAD_SIZE) - --#ifdef CONFIG_X86_32 -- /* 32 bit has nosave before _edata */ - NOSAVE_DATA --#endif - - PAGE_ALIGNED_DATA(PAGE_SIZE) - *(.data.idt) -@@ -182,12 +238,6 @@ SECTIONS - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - -- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -- .jiffies : AT(VLOAD(.jiffies)) { -- *(.jiffies) -- } -- jiffies = VVIRT(.jiffies); -- - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } -@@ -205,12 +255,19 @@ SECTIONS - #endif /* CONFIG_X86_64 */ - - /* Init code and data - will be freed after init */ -- . = ALIGN(PAGE_SIZE); - .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { -+ BYTE(0) -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ . = ALIGN(PMD_SIZE); -+#else -+ . = ALIGN(PAGE_SIZE); -+#endif -+ - __init_begin = .; /* paired with __init_end */ -- } -+ } :init.begin - --#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) -+#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - .init.text - should -@@ -219,18 +276,26 @@ SECTIONS - PERCPU_VADDR(0, :percpu) - #endif - -- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { -+ init_begin = .; -+ .init.text (. 
- __KERNEL_TEXT_OFFSET): AT(init_begin - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; -- } --#ifdef CONFIG_X86_64 -- :init --#endif -+ } :text.init -+ -+ /* -+ * .exit.text is discard at runtime, not link time, to deal with -+ * references from .altinstructions and .eh_frame -+ */ -+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { -+ EXIT_TEXT -+ . = ALIGN(16); -+ } :text.exit -+ . = init_begin + SIZEOF(.init.text) + SIZEOF(.exit.text); - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA -- } -+ } :init - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { -@@ -276,14 +341,6 @@ SECTIONS - *(.altinstr_replacement) - } - -- /* -- * .exit.text is discard at runtime, not link time, to deal with -- * references from .altinstructions and .eh_frame -- */ -- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { -- EXIT_TEXT -- } -- - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } -@@ -297,7 +354,7 @@ SECTIONS - } - #endif - --#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) -+#ifndef CONFIG_SMP - PERCPU(PAGE_SIZE) - #endif - -@@ -320,12 +377,6 @@ SECTIONS - . = ALIGN(PAGE_SIZE); - } - --#ifdef CONFIG_X86_64 -- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { -- NOSAVE_DATA -- } --#endif -- - /* BSS */ - . = ALIGN(PAGE_SIZE); - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { -@@ -341,6 +392,7 @@ SECTIONS - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ -+ . = ALIGN(PMD_SIZE); - __brk_limit = .; - } - -@@ -369,13 +421,12 @@ SECTIONS - * for the boot processor. - */ - #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load --INIT_PER_CPU(gdt_page); - INIT_PER_CPU(irq_stack_union); - - /* - * Build-time check on the image size: - */ --. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), -+. = ASSERT((_end - _text - __KERNEL_TEXT_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); - - #ifdef CONFIG_SMP -diff -urNp linux-2.6.31.1/arch/x86/kernel/vsyscall_64.c linux-2.6.31.1/arch/x86/kernel/vsyscall_64.c ---- linux-2.6.31.1/arch/x86/kernel/vsyscall_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/vsyscall_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -79,6 +79,7 @@ void update_vsyscall(struct timespec *wa - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); - /* copy vsyscall data */ -+ strlcpy(vsyscall_gtod_data.clock.name, clock->name, sizeof vsyscall_gtod_data.clock.name); - vsyscall_gtod_data.clock.vread = clock->vread; - vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; - vsyscall_gtod_data.clock.mask = clock->mask; -@@ -201,7 +202,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s - We do this here because otherwise user space would do it on - its own in a likely inferior way (no access to jiffies). - If you don't like it pass NULL. 
*/ -- if (tcache && tcache->blob[0] == (j = __jiffies)) { -+ if (tcache && tcache->blob[0] == (j = jiffies)) { - p = tcache->blob[1]; - } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ -@@ -240,13 +241,13 @@ static ctl_table kernel_table2[] = { - .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static ctl_table kernel_root_table2[] = { - { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, - .child = kernel_table2 }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - #endif - -diff -urNp linux-2.6.31.1/arch/x86/kernel/x8664_ksyms_64.c linux-2.6.31.1/arch/x86/kernel/x8664_ksyms_64.c ---- linux-2.6.31.1/arch/x86/kernel/x8664_ksyms_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kernel/x8664_ksyms_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -30,8 +30,6 @@ EXPORT_SYMBOL(__put_user_8); - - EXPORT_SYMBOL(copy_user_generic); - EXPORT_SYMBOL(__copy_user_nocache); --EXPORT_SYMBOL(copy_from_user); --EXPORT_SYMBOL(copy_to_user); - EXPORT_SYMBOL(__copy_from_user_inatomic); - - EXPORT_SYMBOL(copy_page); -diff -urNp linux-2.6.31.1/arch/x86/kvm/svm.c linux-2.6.31.1/arch/x86/kvm/svm.c ---- linux-2.6.31.1/arch/x86/kvm/svm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kvm/svm.c 2009-10-01 20:12:42.000000000 -0400 -@@ -2289,7 +2289,19 @@ static void reload_tss(struct kvm_vcpu * - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - load_TR_desc(); - } - -@@ -2673,7 +2685,7 @@ static u64 svm_get_mt_mask(struct kvm_vc - return 0; - } - --static struct kvm_x86_ops svm_x86_ops = { -+static const struct kvm_x86_ops svm_x86_ops = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, - .hardware_setup = svm_hardware_setup, -diff -urNp linux-2.6.31.1/arch/x86/kvm/vmx.c linux-2.6.31.1/arch/x86/kvm/vmx.c ---- linux-2.6.31.1/arch/x86/kvm/vmx.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kvm/vmx.c 2009-10-01 20:12:42.000000000 -0400 -@@ -519,9 +519,23 @@ static void reload_tss(void) - struct descriptor_table gdt; - struct desc_struct *descs; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - kvm_get_gdt(&gdt); - descs = (void *)gdt.base; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - load_TR_desc(); - } - -@@ -1321,6 +1335,11 @@ static __init int alloc_kvm_area(void) - - static __init int hardware_setup(void) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - -@@ -1336,8 +1355,19 @@ static __init int hardware_setup(void) - if (!cpu_has_vmx_flexpriority()) - flexpriority_enabled = 0; - -- if (!cpu_has_vmx_tpr_shadow()) -- kvm_x86_ops->update_cr8_intercept = NULL; -+ if (!cpu_has_vmx_tpr_shadow()) { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ *(void **)&kvm_x86_ops->update_cr8_intercept = NULL; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ } - - return 
alloc_kvm_area(); - } -@@ -2239,7 +2269,7 @@ static int vmx_vcpu_setup(struct vcpu_vm - vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - - asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); -- vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ -+ vmcs_writel(HOST_RIP, ktla_ktva(kvm_vmx_return)); /* 22.2.5 */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); -@@ -3493,6 +3523,12 @@ static void vmx_vcpu_run(struct kvm_vcpu - "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" - ".Lkvm_vmx_return: " -+ -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+ "ljmp %[cs],$.Lkvm_vmx_return2\n\t" -+ ".Lkvm_vmx_return2: " -+#endif -+ - /* Save guest registers, load host registers, keep flags */ - "xchg %0, (%%"R"sp) \n\t" - "mov %%"R"ax, %c[rax](%0) \n\t" -@@ -3539,6 +3575,11 @@ static void vmx_vcpu_run(struct kvm_vcpu - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), - #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) -+ -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+ ,[cs]"i"(__KERNEL_CS) -+#endif -+ - : "cc", "memory" - , R"bx", R"di", R"si" - #ifdef CONFIG_X86_64 -@@ -3555,7 +3596,7 @@ static void vmx_vcpu_run(struct kvm_vcpu - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); - -- asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); -+ asm("mov %0, %%ds; mov %0, %%es" : : "r"(__KERNEL_DS)); - vmx->launched = 1; - - vmx_complete_interrupts(vmx); -@@ -3698,7 +3739,7 @@ static u64 vmx_get_mt_mask(struct kvm_vc - return ret; - } - --static struct kvm_x86_ops vmx_x86_ops = { -+static const struct kvm_x86_ops vmx_x86_ops = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .hardware_setup = hardware_setup, -diff -urNp linux-2.6.31.1/arch/x86/kvm/x86.c linux-2.6.31.1/arch/x86/kvm/x86.c ---- linux-2.6.31.1/arch/x86/kvm/x86.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/kvm/x86.c 2009-10-01 20:12:42.000000000 -0400 -@@ -73,42 +73,42 @@ static int kvm_dev_ioctl_get_supported_c - struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); - --struct kvm_x86_ops *kvm_x86_ops; -+const struct kvm_x86_ops *kvm_x86_ops; - EXPORT_SYMBOL_GPL(kvm_x86_ops); - - struct kvm_stats_debugfs_item debugfs_entries[] = { -- { "pf_fixed", VCPU_STAT(pf_fixed) }, -- { "pf_guest", VCPU_STAT(pf_guest) }, -- { "tlb_flush", VCPU_STAT(tlb_flush) }, -- { "invlpg", VCPU_STAT(invlpg) }, -- { "exits", VCPU_STAT(exits) }, -- { "io_exits", VCPU_STAT(io_exits) }, -- { "mmio_exits", VCPU_STAT(mmio_exits) }, -- { "signal_exits", VCPU_STAT(signal_exits) }, -- { "irq_window", VCPU_STAT(irq_window_exits) }, -- { "nmi_window", VCPU_STAT(nmi_window_exits) }, -- { "halt_exits", VCPU_STAT(halt_exits) }, -- { "halt_wakeup", VCPU_STAT(halt_wakeup) }, -- { "hypercalls", VCPU_STAT(hypercalls) }, -- { "request_irq", VCPU_STAT(request_irq_exits) }, -- { "irq_exits", VCPU_STAT(irq_exits) }, -- { "host_state_reload", VCPU_STAT(host_state_reload) }, -- { "efer_reload", VCPU_STAT(efer_reload) }, -- { "fpu_reload", VCPU_STAT(fpu_reload) }, -- { "insn_emulation", VCPU_STAT(insn_emulation) }, -- { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, -- { "irq_injections", VCPU_STAT(irq_injections) }, -- { "nmi_injections", VCPU_STAT(nmi_injections) }, -- { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, -- { "mmu_pte_write", VM_STAT(mmu_pte_write) }, -- { "mmu_pte_updated", 
VM_STAT(mmu_pte_updated) }, -- { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, -- { "mmu_flooded", VM_STAT(mmu_flooded) }, -- { "mmu_recycled", VM_STAT(mmu_recycled) }, -- { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, -- { "mmu_unsync", VM_STAT(mmu_unsync) }, -- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, -- { "largepages", VM_STAT(lpages) }, -+ { "pf_fixed", VCPU_STAT(pf_fixed), NULL }, -+ { "pf_guest", VCPU_STAT(pf_guest), NULL }, -+ { "tlb_flush", VCPU_STAT(tlb_flush), NULL }, -+ { "invlpg", VCPU_STAT(invlpg), NULL }, -+ { "exits", VCPU_STAT(exits), NULL }, -+ { "io_exits", VCPU_STAT(io_exits), NULL }, -+ { "mmio_exits", VCPU_STAT(mmio_exits), NULL }, -+ { "signal_exits", VCPU_STAT(signal_exits), NULL }, -+ { "irq_window", VCPU_STAT(irq_window_exits), NULL }, -+ { "nmi_window", VCPU_STAT(nmi_window_exits), NULL }, -+ { "halt_exits", VCPU_STAT(halt_exits), NULL }, -+ { "halt_wakeup", VCPU_STAT(halt_wakeup), NULL }, -+ { "hypercalls", VCPU_STAT(hypercalls), NULL }, -+ { "request_irq", VCPU_STAT(request_irq_exits), NULL }, -+ { "irq_exits", VCPU_STAT(irq_exits), NULL }, -+ { "host_state_reload", VCPU_STAT(host_state_reload), NULL }, -+ { "efer_reload", VCPU_STAT(efer_reload), NULL }, -+ { "fpu_reload", VCPU_STAT(fpu_reload), NULL }, -+ { "insn_emulation", VCPU_STAT(insn_emulation), NULL }, -+ { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail), NULL }, -+ { "irq_injections", VCPU_STAT(irq_injections), NULL }, -+ { "nmi_injections", VCPU_STAT(nmi_injections), NULL }, -+ { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped), NULL }, -+ { "mmu_pte_write", VM_STAT(mmu_pte_write), NULL }, -+ { "mmu_pte_updated", VM_STAT(mmu_pte_updated), NULL }, -+ { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped), NULL }, -+ { "mmu_flooded", VM_STAT(mmu_flooded), NULL }, -+ { "mmu_recycled", VM_STAT(mmu_recycled), NULL }, -+ { "mmu_cache_miss", VM_STAT(mmu_cache_miss), NULL }, -+ { "mmu_unsync", VM_STAT(mmu_unsync), NULL }, -+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush), NULL }, -+ { "largepages", VM_STAT(lpages), NULL }, - { NULL } - }; - -@@ -1485,7 +1485,7 @@ static int kvm_vcpu_ioctl_set_lapic(stru - static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) - { -- if (irq->irq < 0 || irq->irq >= 256) -+ if (irq->irq >= 256) - return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) - return -ENXIO; -@@ -2810,10 +2810,10 @@ static struct notifier_block kvmclock_cp - .notifier_call = kvmclock_cpufreq_notifier - }; - --int kvm_arch_init(void *opaque) -+int kvm_arch_init(const void *opaque) - { - int r, cpu; -- struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; -+ const struct kvm_x86_ops *ops = (const struct kvm_x86_ops *)opaque; - - if (kvm_x86_ops) { - printk(KERN_ERR "kvm: already loaded the other module\n"); -diff -urNp linux-2.6.31.1/arch/x86/lib/checksum_32.S linux-2.6.31.1/arch/x86/lib/checksum_32.S ---- linux-2.6.31.1/arch/x86/lib/checksum_32.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/checksum_32.S 2009-10-01 20:12:42.000000000 -0400 -@@ -28,7 +28,8 @@ - #include <linux/linkage.h> - #include <asm/dwarf2.h> - #include <asm/errno.h> -- -+#include <asm/segment.h> -+ - /* - * computes a partial checksum, e.g. 
for TCP/UDP fragments - */ -@@ -304,9 +305,22 @@ unsigned int csum_partial_copy_generic ( - - #define ARGBASE 16 - #define FP 12 -- --ENTRY(csum_partial_copy_generic) -+ -+ENTRY(csum_partial_copy_generic_to_user) - CFI_STARTPROC -+ pushl $(__USER_DS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %es -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp csum_partial_copy_generic -+ -+ENTRY(csum_partial_copy_generic_from_user) -+ pushl $(__USER_DS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %ds -+ CFI_ADJUST_CFA_OFFSET -4 -+ -+ENTRY(csum_partial_copy_generic) - subl $4,%esp - CFI_ADJUST_CFA_OFFSET 4 - pushl %edi -@@ -331,7 +345,7 @@ ENTRY(csum_partial_copy_generic) - jmp 4f - SRC(1: movw (%esi), %bx ) - addl $2, %esi --DST( movw %bx, (%edi) ) -+DST( movw %bx, %es:(%edi) ) - addl $2, %edi - addw %bx, %ax - adcl $0, %eax -@@ -343,30 +357,30 @@ DST( movw %bx, (%edi) ) - SRC(1: movl (%esi), %ebx ) - SRC( movl 4(%esi), %edx ) - adcl %ebx, %eax --DST( movl %ebx, (%edi) ) -+DST( movl %ebx, %es:(%edi) ) - adcl %edx, %eax --DST( movl %edx, 4(%edi) ) -+DST( movl %edx, %es:4(%edi) ) - - SRC( movl 8(%esi), %ebx ) - SRC( movl 12(%esi), %edx ) - adcl %ebx, %eax --DST( movl %ebx, 8(%edi) ) -+DST( movl %ebx, %es:8(%edi) ) - adcl %edx, %eax --DST( movl %edx, 12(%edi) ) -+DST( movl %edx, %es:12(%edi) ) - - SRC( movl 16(%esi), %ebx ) - SRC( movl 20(%esi), %edx ) - adcl %ebx, %eax --DST( movl %ebx, 16(%edi) ) -+DST( movl %ebx, %es:16(%edi) ) - adcl %edx, %eax --DST( movl %edx, 20(%edi) ) -+DST( movl %edx, %es:20(%edi) ) - - SRC( movl 24(%esi), %ebx ) - SRC( movl 28(%esi), %edx ) - adcl %ebx, %eax --DST( movl %ebx, 24(%edi) ) -+DST( movl %ebx, %es:24(%edi) ) - adcl %edx, %eax --DST( movl %edx, 28(%edi) ) -+DST( movl %edx, %es:28(%edi) ) - - lea 32(%esi), %esi - lea 32(%edi), %edi -@@ -380,7 +394,7 @@ DST( movl %edx, 28(%edi) ) - shrl $2, %edx # This clears CF - SRC(3: movl (%esi), %ebx ) - adcl %ebx, %eax --DST( movl %ebx, (%edi) ) -+DST( movl %ebx, %es:(%edi) ) - lea 4(%esi), %esi - lea 4(%edi), %edi - dec %edx -@@ -392,12 +406,12 @@ DST( movl %ebx, (%edi) ) - jb 5f - SRC( movw (%esi), %cx ) - leal 2(%esi), %esi --DST( movw %cx, (%edi) ) -+DST( movw %cx, %es:(%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%ecx - SRC(5: movb (%esi), %cl ) --DST( movb %cl, (%edi) ) -+DST( movb %cl, %es:(%edi) ) - 6: addl %ecx, %eax - adcl $0, %eax - 7: -@@ -408,7 +422,7 @@ DST( movb %cl, (%edi) ) - - 6001: - movl ARGBASE+20(%esp), %ebx # src_err_ptr -- movl $-EFAULT, (%ebx) -+ movl $-EFAULT, %ss:(%ebx) - - # zero the complete destination - computing the rest - # is too much work -@@ -421,11 +435,19 @@ DST( movb %cl, (%edi) ) - - 6002: - movl ARGBASE+24(%esp), %ebx # dst_err_ptr -- movl $-EFAULT,(%ebx) -+ movl $-EFAULT,%ss:(%ebx) - jmp 5000b - - .previous - -+ pushl %ss -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %ds -+ CFI_ADJUST_CFA_OFFSET -4 -+ pushl %ss -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %es -+ CFI_ADJUST_CFA_OFFSET -4 - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ebx -@@ -439,26 +461,41 @@ DST( movb %cl, (%edi) ) - CFI_ADJUST_CFA_OFFSET -4 - ret - CFI_ENDPROC --ENDPROC(csum_partial_copy_generic) -+ENDPROC(csum_partial_copy_generic_to_user) - - #else - - /* Version for PentiumII/PPro */ - - #define ROUND1(x) \ -+ nop; nop; nop; \ - SRC(movl x(%esi), %ebx ) ; \ - addl %ebx, %eax ; \ -- DST(movl %ebx, x(%edi) ) ; -+ DST(movl %ebx, %es:x(%edi)) ; - - #define ROUND(x) \ -+ nop; nop; nop; \ - SRC(movl x(%esi), %ebx ) ; \ - adcl %ebx, %eax ; \ -- DST(movl %ebx, x(%edi) ) ; -+ DST(movl %ebx, %es:x(%edi)) ; - - #define ARGBASE 12 -- --ENTRY(csum_partial_copy_generic) -+ 
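As in the 486 variant above, the patch splits csum_partial_copy_generic into _to_user/_from_user entry stubs that load __USER_DS into %es or %ds before falling through to the shared body, so the user-space side of the copy is bounded by the user data segment. The arithmetic both variants perform while copying is the 16-bit one's-complement sum used for TCP/UDP checksums; a minimal, standalone C sketch of that folding (illustrative only — the kernel does this in hand-tuned assembly, interleaved with the copy):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint16_t csum_fold_sketch(const uint8_t *buf, size_t len)
{
    uint32_t sum = 0;

    while (len > 1) {                     /* sum 16-bit big-endian words */
        sum += ((uint32_t)buf[0] << 8) | buf[1];
        buf += 2;
        len -= 2;
    }
    if (len)                              /* odd trailing byte, zero-padded */
        sum += (uint32_t)buf[0] << 8;
    while (sum >> 16)                     /* fold end-around carries back in */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}

int main(void)
{
    const uint8_t frag[] = { 0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46 };
    printf("csum: 0x%04x\n", csum_fold_sketch(frag, sizeof(frag)));
    return 0;
}
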
-+ENTRY(csum_partial_copy_generic_to_user) - CFI_STARTPROC -+ pushl $(__USER_DS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %es -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp csum_partial_copy_generic -+ -+ENTRY(csum_partial_copy_generic_from_user) -+ pushl $(__USER_DS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %ds -+ CFI_ADJUST_CFA_OFFSET -4 -+ -+ENTRY(csum_partial_copy_generic) - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebx, 0 -@@ -482,7 +519,7 @@ ENTRY(csum_partial_copy_generic) - subl %ebx, %edi - lea -1(%esi),%edx - andl $-32,%edx -- lea 3f(%ebx,%ebx), %ebx -+ lea 3f(%ebx,%ebx,2), %ebx - testl %esi, %esi - jmp *%ebx - 1: addl $64,%esi -@@ -503,19 +540,19 @@ ENTRY(csum_partial_copy_generic) - jb 5f - SRC( movw (%esi), %dx ) - leal 2(%esi), %esi --DST( movw %dx, (%edi) ) -+DST( movw %dx, %es:(%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%edx - 5: - SRC( movb (%esi), %dl ) --DST( movb %dl, (%edi) ) -+DST( movb %dl, %es:(%edi) ) - 6: addl %edx, %eax - adcl $0, %eax - 7: - .section .fixup, "ax" - 6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr -- movl $-EFAULT, (%ebx) -+ movl $-EFAULT, %ss:(%ebx) - # zero the complete destination (computing the rest is too much work) - movl ARGBASE+8(%esp),%edi # dst - movl ARGBASE+12(%esp),%ecx # len -@@ -523,10 +560,18 @@ DST( movb %dl, (%edi) ) - rep; stosb - jmp 7b - 6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr -- movl $-EFAULT, (%ebx) -+ movl $-EFAULT, %ss:(%ebx) - jmp 7b - .previous - -+ pushl %ss -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %ds -+ CFI_ADJUST_CFA_OFFSET -4 -+ pushl %ss -+ CFI_ADJUST_CFA_OFFSET 4 -+ popl %es -+ CFI_ADJUST_CFA_OFFSET -4 - popl %esi - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE esi -@@ -538,7 +583,7 @@ DST( movb %dl, (%edi) ) - CFI_RESTORE ebx - ret - CFI_ENDPROC --ENDPROC(csum_partial_copy_generic) -+ENDPROC(csum_partial_copy_generic_to_user) - - #undef ROUND - #undef ROUND1 -diff -urNp linux-2.6.31.1/arch/x86/lib/clear_page_64.S linux-2.6.31.1/arch/x86/lib/clear_page_64.S ---- linux-2.6.31.1/arch/x86/lib/clear_page_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/clear_page_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -43,7 +43,7 @@ ENDPROC(clear_page) - - #include <asm/cpufeature.h> - -- .section .altinstr_replacement,"ax" -+ .section .altinstr_replacement,"a" - 1: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ - 2: -diff -urNp linux-2.6.31.1/arch/x86/lib/copy_page_64.S linux-2.6.31.1/arch/x86/lib/copy_page_64.S ---- linux-2.6.31.1/arch/x86/lib/copy_page_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/copy_page_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -104,7 +104,7 @@ ENDPROC(copy_page) - - #include <asm/cpufeature.h> - -- .section .altinstr_replacement,"ax" -+ .section .altinstr_replacement,"a" - 1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ - 2: -diff -urNp linux-2.6.31.1/arch/x86/lib/copy_user_64.S linux-2.6.31.1/arch/x86/lib/copy_user_64.S ---- linux-2.6.31.1/arch/x86/lib/copy_user_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/copy_user_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -21,7 +21,7 @@ - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ - 1: -- .section .altinstr_replacement,"ax" -+ .section .altinstr_replacement,"a" - 2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ - .previous -@@ -64,32 +64,6 @@ - #endif - .endm - --/* Standard copy_to_user with segment limit checking */ 
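The copy_to_user/copy_from_user stubs removed just below performed the classic segment-limit check before dispatching to the unrolled or string copy: compute addr + len, bail out on carry (address wraparound, the jc), or when the end falls at or beyond TI_addr_limit (the cmpq/jae). A standalone C sketch of that check — TASK_LIMIT is an arbitrary stand-in for the per-thread limit, and __builtin_add_overflow is a GCC/Clang builtin:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define TASK_LIMIT 0x00007ffffffff000ULL  /* stand-in for TI_addr_limit */

static bool access_ok_sketch(uint64_t addr, uint64_t len)
{
    uint64_t end;

    if (__builtin_add_overflow(addr, len, &end))  /* the 'jc bad_*_user' case */
        return false;
    return end < TASK_LIMIT;              /* the 'cmpq ...; jae bad_*_user' case */
}

int main(void)
{
    printf("%d\n", access_ok_sketch(0x1000, 64));          /* 1: in range */
    printf("%d\n", access_ok_sketch(UINT64_MAX - 8, 64));  /* 0: wraps around */
    return 0;
}
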
--ENTRY(copy_to_user) -- CFI_STARTPROC -- GET_THREAD_INFO(%rax) -- movq %rdi,%rcx -- addq %rdx,%rcx -- jc bad_to_user -- cmpq TI_addr_limit(%rax),%rcx -- jae bad_to_user -- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string -- CFI_ENDPROC --ENDPROC(copy_to_user) -- --/* Standard copy_from_user with segment limit checking */ --ENTRY(copy_from_user) -- CFI_STARTPROC -- GET_THREAD_INFO(%rax) -- movq %rsi,%rcx -- addq %rdx,%rcx -- jc bad_from_user -- cmpq TI_addr_limit(%rax),%rcx -- jae bad_from_user -- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string -- CFI_ENDPROC --ENDPROC(copy_from_user) -- - ENTRY(copy_user_generic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string -@@ -107,6 +81,8 @@ ENDPROC(__copy_from_user_inatomic) - ENTRY(bad_from_user) - bad_from_user: - CFI_STARTPROC -+ testl %edx,%edx -+ js bad_to_user - movl %edx,%ecx - xorl %eax,%eax - rep -diff -urNp linux-2.6.31.1/arch/x86/lib/getuser.S linux-2.6.31.1/arch/x86/lib/getuser.S ---- linux-2.6.31.1/arch/x86/lib/getuser.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/getuser.S 2009-10-01 20:12:42.000000000 -0400 -@@ -33,6 +33,7 @@ - #include <asm/asm-offsets.h> - #include <asm/thread_info.h> - #include <asm/asm.h> -+#include <asm/segment.h> - - .text - ENTRY(__get_user_1) -@@ -40,7 +41,19 @@ ENTRY(__get_user_1) - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX - jae bad_get_user -+ -+#ifdef CONFIG_X86_32 -+ pushl $(__USER_DS) -+ popl %ds -+#endif -+ - 1: movzb (%_ASM_AX),%edx -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ pop %ds -+#endif -+ - xor %eax,%eax - ret - CFI_ENDPROC -@@ -53,7 +66,19 @@ ENTRY(__get_user_2) - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX - jae bad_get_user -+ -+#ifdef CONFIG_X86_32 -+ pushl $(__USER_DS) -+ popl %ds -+#endif -+ - 2: movzwl -1(%_ASM_AX),%edx -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ pop %ds -+#endif -+ - xor %eax,%eax - ret - CFI_ENDPROC -@@ -66,7 +91,19 @@ ENTRY(__get_user_4) - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX - jae bad_get_user -+ -+#ifdef CONFIG_X86_32 -+ pushl $(__USER_DS) -+ popl %ds -+#endif -+ - 3: mov -3(%_ASM_AX),%edx -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ pop %ds -+#endif -+ - xor %eax,%eax - ret - CFI_ENDPROC -@@ -89,6 +126,12 @@ ENDPROC(__get_user_8) - - bad_get_user: - CFI_STARTPROC -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ pop %ds -+#endif -+ - xor %edx,%edx - mov $(-EFAULT),%_ASM_AX - ret -diff -urNp linux-2.6.31.1/arch/x86/lib/memcpy_64.S linux-2.6.31.1/arch/x86/lib/memcpy_64.S ---- linux-2.6.31.1/arch/x86/lib/memcpy_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/memcpy_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -128,7 +128,7 @@ ENDPROC(__memcpy) - * It is also a lot simpler. 
Use this when possible: - */ - -- .section .altinstr_replacement, "ax" -+ .section .altinstr_replacement, "a" - 1: .byte 0xeb /* jmp <disp8> */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ - 2: -diff -urNp linux-2.6.31.1/arch/x86/lib/memset_64.S linux-2.6.31.1/arch/x86/lib/memset_64.S ---- linux-2.6.31.1/arch/x86/lib/memset_64.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/memset_64.S 2009-10-01 20:12:42.000000000 -0400 -@@ -118,7 +118,7 @@ ENDPROC(__memset) - - #include <asm/cpufeature.h> - -- .section .altinstr_replacement,"ax" -+ .section .altinstr_replacement,"a" - 1: .byte 0xeb /* jmp <disp8> */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ - 2: -diff -urNp linux-2.6.31.1/arch/x86/lib/mmx_32.c linux-2.6.31.1/arch/x86/lib/mmx_32.c ---- linux-2.6.31.1/arch/x86/lib/mmx_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/mmx_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -29,6 +29,7 @@ void *_mmx_memcpy(void *to, const void * - { - void *p; - int i; -+ unsigned long cr0; - - if (unlikely(in_interrupt())) - return __memcpy(to, from, len); -@@ -39,44 +40,72 @@ void *_mmx_memcpy(void *to, const void * - kernel_fpu_begin(); - - __asm__ __volatile__ ( -- "1: prefetch (%0)\n" /* This set is 28 bytes */ -- " prefetch 64(%0)\n" -- " prefetch 128(%0)\n" -- " prefetch 192(%0)\n" -- " prefetch 256(%0)\n" -+ "1: prefetch (%1)\n" /* This set is 28 bytes */ -+ " prefetch 64(%1)\n" -+ " prefetch 128(%1)\n" -+ " prefetch 192(%1)\n" -+ " prefetch 256(%1)\n" - "2: \n" - ".section .fixup, "ax"\n" -- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ -+ "3: \n" -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %%cr0, %0\n" -+ " movl %0, %%eax\n" -+ " andl $0xFFFEFFFF, %%eax\n" -+ " movl %%eax, %%cr0\n" -+#endif -+ -+ " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %0, %%cr0\n" -+#endif -+ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) -- : : "r" (from)); -+ : "=&r" (cr0) : "r" (from) : "ax"); - - for ( ; i > 5; i--) { - __asm__ __volatile__ ( -- "1: prefetch 320(%0)\n" -- "2: movq (%0), %%mm0\n" -- " movq 8(%0), %%mm1\n" -- " movq 16(%0), %%mm2\n" -- " movq 24(%0), %%mm3\n" -- " movq %%mm0, (%1)\n" -- " movq %%mm1, 8(%1)\n" -- " movq %%mm2, 16(%1)\n" -- " movq %%mm3, 24(%1)\n" -- " movq 32(%0), %%mm0\n" -- " movq 40(%0), %%mm1\n" -- " movq 48(%0), %%mm2\n" -- " movq 56(%0), %%mm3\n" -- " movq %%mm0, 32(%1)\n" -- " movq %%mm1, 40(%1)\n" -- " movq %%mm2, 48(%1)\n" -- " movq %%mm3, 56(%1)\n" -+ "1: prefetch 320(%1)\n" -+ "2: movq (%1), %%mm0\n" -+ " movq 8(%1), %%mm1\n" -+ " movq 16(%1), %%mm2\n" -+ " movq 24(%1), %%mm3\n" -+ " movq %%mm0, (%2)\n" -+ " movq %%mm1, 8(%2)\n" -+ " movq %%mm2, 16(%2)\n" -+ " movq %%mm3, 24(%2)\n" -+ " movq 32(%1), %%mm0\n" -+ " movq 40(%1), %%mm1\n" -+ " movq 48(%1), %%mm2\n" -+ " movq 56(%1), %%mm3\n" -+ " movq %%mm0, 32(%2)\n" -+ " movq %%mm1, 40(%2)\n" -+ " movq %%mm2, 48(%2)\n" -+ " movq %%mm3, 56(%2)\n" - ".section .fixup, "ax"\n" -- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ -+ "3:\n" -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %%cr0, %0\n" -+ " movl %0, %%eax\n" -+ " andl $0xFFFEFFFF, %%eax\n" -+ " movl %%eax, %%cr0\n" -+#endif -+ -+ " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %0, %%cr0\n" -+#endif -+ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) -- : : "r" (from), "r" (to) : "memory"); -+ : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); - - from += 64; - to += 64; -@@ -158,6 +187,7 @@ static void fast_clear_page(void *page) - 
static void fast_copy_page(void *to, void *from) - { - int i; -+ unsigned long cr0; - - kernel_fpu_begin(); - -@@ -166,42 +196,70 @@ static void fast_copy_page(void *to, voi - * but that is for later. -AV - */ - __asm__ __volatile__( -- "1: prefetch (%0)\n" -- " prefetch 64(%0)\n" -- " prefetch 128(%0)\n" -- " prefetch 192(%0)\n" -- " prefetch 256(%0)\n" -+ "1: prefetch (%1)\n" -+ " prefetch 64(%1)\n" -+ " prefetch 128(%1)\n" -+ " prefetch 192(%1)\n" -+ " prefetch 256(%1)\n" - "2: \n" - ".section .fixup, "ax"\n" -- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ -+ "3: \n" -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %%cr0, %0\n" -+ " movl %0, %%eax\n" -+ " andl $0xFFFEFFFF, %%eax\n" -+ " movl %%eax, %%cr0\n" -+#endif -+ -+ " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %0, %%cr0\n" -+#endif -+ - " jmp 2b\n" - ".previous\n" -- _ASM_EXTABLE(1b, 3b) : : "r" (from)); -+ _ASM_EXTABLE(1b, 3b) : "=&r" (cr0) : "r" (from) : "ax"); - - for (i = 0; i < (4096-320)/64; i++) { - __asm__ __volatile__ ( -- "1: prefetch 320(%0)\n" -- "2: movq (%0), %%mm0\n" -- " movntq %%mm0, (%1)\n" -- " movq 8(%0), %%mm1\n" -- " movntq %%mm1, 8(%1)\n" -- " movq 16(%0), %%mm2\n" -- " movntq %%mm2, 16(%1)\n" -- " movq 24(%0), %%mm3\n" -- " movntq %%mm3, 24(%1)\n" -- " movq 32(%0), %%mm4\n" -- " movntq %%mm4, 32(%1)\n" -- " movq 40(%0), %%mm5\n" -- " movntq %%mm5, 40(%1)\n" -- " movq 48(%0), %%mm6\n" -- " movntq %%mm6, 48(%1)\n" -- " movq 56(%0), %%mm7\n" -- " movntq %%mm7, 56(%1)\n" -+ "1: prefetch 320(%1)\n" -+ "2: movq (%1), %%mm0\n" -+ " movntq %%mm0, (%2)\n" -+ " movq 8(%1), %%mm1\n" -+ " movntq %%mm1, 8(%2)\n" -+ " movq 16(%1), %%mm2\n" -+ " movntq %%mm2, 16(%2)\n" -+ " movq 24(%1), %%mm3\n" -+ " movntq %%mm3, 24(%2)\n" -+ " movq 32(%1), %%mm4\n" -+ " movntq %%mm4, 32(%2)\n" -+ " movq 40(%1), %%mm5\n" -+ " movntq %%mm5, 40(%2)\n" -+ " movq 48(%1), %%mm6\n" -+ " movntq %%mm6, 48(%2)\n" -+ " movq 56(%1), %%mm7\n" -+ " movntq %%mm7, 56(%2)\n" - ".section .fixup, "ax"\n" -- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ -+ "3:\n" -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %%cr0, %0\n" -+ " movl %0, %%eax\n" -+ " andl $0xFFFEFFFF, %%eax\n" -+ " movl %%eax, %%cr0\n" -+#endif -+ -+ " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %0, %%cr0\n" -+#endif -+ - " jmp 2b\n" - ".previous\n" -- _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory"); -+ _ASM_EXTABLE(1b, 3b) : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); - - from += 64; - to += 64; -@@ -280,47 +338,76 @@ static void fast_clear_page(void *page) - static void fast_copy_page(void *to, void *from) - { - int i; -+ unsigned long cr0; - - kernel_fpu_begin(); - - __asm__ __volatile__ ( -- "1: prefetch (%0)\n" -- " prefetch 64(%0)\n" -- " prefetch 128(%0)\n" -- " prefetch 192(%0)\n" -- " prefetch 256(%0)\n" -+ "1: prefetch (%1)\n" -+ " prefetch 64(%1)\n" -+ " prefetch 128(%1)\n" -+ " prefetch 192(%1)\n" -+ " prefetch 256(%1)\n" - "2: \n" - ".section .fixup, "ax"\n" -- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ -+ "3: \n" -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %%cr0, %0\n" -+ " movl %0, %%eax\n" -+ " andl $0xFFFEFFFF, %%eax\n" -+ " movl %%eax, %%cr0\n" -+#endif -+ -+ " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %0, %%cr0\n" -+#endif -+ - " jmp 2b\n" - ".previous\n" -- _ASM_EXTABLE(1b, 3b) : : "r" (from)); -+ _ASM_EXTABLE(1b, 3b) : "=&r" (cr0) : "r" (from) : "ax"); - - for (i = 0; i < 4096/64; i++) { - __asm__ __volatile__ ( -- "1: prefetch 320(%0)\n" -- 
"2: movq (%0), %%mm0\n" -- " movq 8(%0), %%mm1\n" -- " movq 16(%0), %%mm2\n" -- " movq 24(%0), %%mm3\n" -- " movq %%mm0, (%1)\n" -- " movq %%mm1, 8(%1)\n" -- " movq %%mm2, 16(%1)\n" -- " movq %%mm3, 24(%1)\n" -- " movq 32(%0), %%mm0\n" -- " movq 40(%0), %%mm1\n" -- " movq 48(%0), %%mm2\n" -- " movq 56(%0), %%mm3\n" -- " movq %%mm0, 32(%1)\n" -- " movq %%mm1, 40(%1)\n" -- " movq %%mm2, 48(%1)\n" -- " movq %%mm3, 56(%1)\n" -+ "1: prefetch 320(%1)\n" -+ "2: movq (%1), %%mm0\n" -+ " movq 8(%1), %%mm1\n" -+ " movq 16(%1), %%mm2\n" -+ " movq 24(%1), %%mm3\n" -+ " movq %%mm0, (%2)\n" -+ " movq %%mm1, 8(%2)\n" -+ " movq %%mm2, 16(%2)\n" -+ " movq %%mm3, 24(%2)\n" -+ " movq 32(%1), %%mm0\n" -+ " movq 40(%1), %%mm1\n" -+ " movq 48(%1), %%mm2\n" -+ " movq 56(%1), %%mm3\n" -+ " movq %%mm0, 32(%2)\n" -+ " movq %%mm1, 40(%2)\n" -+ " movq %%mm2, 48(%2)\n" -+ " movq %%mm3, 56(%2)\n" - ".section .fixup, "ax"\n" -- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ -+ "3:\n" -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %%cr0, %0\n" -+ " movl %0, %%eax\n" -+ " andl $0xFFFEFFFF, %%eax\n" -+ " movl %%eax, %%cr0\n" -+#endif -+ -+ " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ " movl %0, %%cr0\n" -+#endif -+ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) -- : : "r" (from), "r" (to) : "memory"); -+ : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); - - from += 64; - to += 64; -diff -urNp linux-2.6.31.1/arch/x86/lib/putuser.S linux-2.6.31.1/arch/x86/lib/putuser.S ---- linux-2.6.31.1/arch/x86/lib/putuser.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/putuser.S 2009-10-01 20:12:42.000000000 -0400 -@@ -15,6 +15,7 @@ - #include <asm/thread_info.h> - #include <asm/errno.h> - #include <asm/asm.h> -+#include <asm/segment.h> - - - /* -@@ -39,7 +40,19 @@ ENTRY(__put_user_1) - ENTER - cmp TI_addr_limit(%_ASM_BX),%_ASM_CX - jae bad_put_user -+ -+#ifdef CONFIG_X86_32 -+ pushl $(__USER_DS) -+ popl %ds -+#endif -+ - 1: movb %al,(%_ASM_CX) -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ popl %ds -+#endif -+ - xor %eax,%eax - EXIT - ENDPROC(__put_user_1) -@@ -50,7 +63,19 @@ ENTRY(__put_user_2) - sub $1,%_ASM_BX - cmp %_ASM_BX,%_ASM_CX - jae bad_put_user -+ -+#ifdef CONFIG_X86_32 -+ pushl $(__USER_DS) -+ popl %ds -+#endif -+ - 2: movw %ax,(%_ASM_CX) -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ popl %ds -+#endif -+ - xor %eax,%eax - EXIT - ENDPROC(__put_user_2) -@@ -61,7 +86,19 @@ ENTRY(__put_user_4) - sub $3,%_ASM_BX - cmp %_ASM_BX,%_ASM_CX - jae bad_put_user -+ -+#ifdef CONFIG_X86_32 -+ pushl $(__USER_DS) -+ popl %ds -+#endif -+ - 3: movl %eax,(%_ASM_CX) -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ popl %ds -+#endif -+ - xor %eax,%eax - EXIT - ENDPROC(__put_user_4) -@@ -72,16 +109,34 @@ ENTRY(__put_user_8) - sub $7,%_ASM_BX - cmp %_ASM_BX,%_ASM_CX - jae bad_put_user -+ -+#ifdef CONFIG_X86_32 -+ pushl $(__USER_DS) -+ popl %ds -+#endif -+ - 4: mov %_ASM_AX,(%_ASM_CX) - #ifdef CONFIG_X86_32 - 5: movl %edx,4(%_ASM_CX) - #endif -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ popl %ds -+#endif -+ - xor %eax,%eax - EXIT - ENDPROC(__put_user_8) - - bad_put_user: - CFI_STARTPROC -+ -+#ifdef CONFIG_X86_32 -+ pushl %ss -+ popl %ds -+#endif -+ - movl $-EFAULT,%eax - EXIT - END(bad_put_user) -diff -urNp linux-2.6.31.1/arch/x86/lib/usercopy_32.c linux-2.6.31.1/arch/x86/lib/usercopy_32.c ---- linux-2.6.31.1/arch/x86/lib/usercopy_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/lib/usercopy_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -36,31 +36,38 @@ static inline int 
__movsl_is_ok(unsigned - * Copy a null terminated string from userspace. - */ - --#define __do_strncpy_from_user(dst, src, count, res) \ --do { \ -- int __d0, __d1, __d2; \ -- might_fault(); \ -- __asm__ __volatile__( \ -- " testl %1,%1\n" \ -- " jz 2f\n" \ -- "0: lodsb\n" \ -- " stosb\n" \ -- " testb %%al,%%al\n" \ -- " jz 1f\n" \ -- " decl %1\n" \ -- " jnz 0b\n" \ -- "1: subl %1,%0\n" \ -- "2:\n" \ -- ".section .fixup,"ax"\n" \ -- "3: movl %5,%0\n" \ -- " jmp 2b\n" \ -- ".previous\n" \ -- _ASM_EXTABLE(0b,3b) \ -- : "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \ -- "=&D" (__d2) \ -- : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ -- : "memory"); \ --} while (0) -+static long __do_strncpy_from_user(char *dst, const char __user *src, long count) -+{ -+ int __d0, __d1, __d2; -+ long res = -EFAULT; -+ -+ might_fault(); -+ __asm__ __volatile__( -+ " movw %w10,%%ds\n" -+ " testl %1,%1\n" -+ " jz 2f\n" -+ "0: lodsb\n" -+ " stosb\n" -+ " testb %%al,%%al\n" -+ " jz 1f\n" -+ " decl %1\n" -+ " jnz 0b\n" -+ "1: subl %1,%0\n" -+ "2:\n" -+ " pushl %%ss\n" -+ " popl %%ds\n" -+ ".section .fixup,"ax"\n" -+ "3: movl %5,%0\n" -+ " jmp 2b\n" -+ ".previous\n" -+ _ASM_EXTABLE(0b,3b) -+ : "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), -+ "=&D" (__d2) -+ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst), -+ "r"(__USER_DS) -+ : "memory"); -+ return res; -+} - - /** - * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking. -@@ -85,9 +92,7 @@ do { \ - long - __strncpy_from_user(char *dst, const char __user *src, long count) - { -- long res; -- __do_strncpy_from_user(dst, src, count, res); -- return res; -+ return __do_strncpy_from_user(dst, src, count); - } - EXPORT_SYMBOL(__strncpy_from_user); - -@@ -114,7 +119,7 @@ strncpy_from_user(char *dst, const char - { - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) -- __do_strncpy_from_user(dst, src, count, res); -+ res = __do_strncpy_from_user(dst, src, count); - return res; - } - EXPORT_SYMBOL(strncpy_from_user); -@@ -123,24 +128,30 @@ EXPORT_SYMBOL(strncpy_from_user); - * Zero Userspace - */ - --#define __do_clear_user(addr,size) \ --do { \ -- int __d0; \ -- might_fault(); \ -- __asm__ __volatile__( \ -- "0: rep; stosl\n" \ -- " movl %2,%0\n" \ -- "1: rep; stosb\n" \ -- "2:\n" \ -- ".section .fixup,"ax"\n" \ -- "3: lea 0(%2,%0,4),%0\n" \ -- " jmp 2b\n" \ -- ".previous\n" \ -- _ASM_EXTABLE(0b,3b) \ -- _ASM_EXTABLE(1b,2b) \ -- : "=&c"(size), "=&D" (__d0) \ -- : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ --} while (0) -+static unsigned long __do_clear_user(void __user *addr, unsigned long size) -+{ -+ int __d0; -+ -+ might_fault(); -+ __asm__ __volatile__( -+ " movw %w6,%%es\n" -+ "0: rep; stosl\n" -+ " movl %2,%0\n" -+ "1: rep; stosb\n" -+ "2:\n" -+ " pushl %%ss\n" -+ " popl %%es\n" -+ ".section .fixup,"ax"\n" -+ "3: lea 0(%2,%0,4),%0\n" -+ " jmp 2b\n" -+ ".previous\n" -+ _ASM_EXTABLE(0b,3b) -+ _ASM_EXTABLE(1b,2b) -+ : "=&c"(size), "=&D" (__d0) -+ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0), -+ "r"(__USER_DS)); -+ return size; -+} - - /** - * clear_user: - Zero a block of memory in user space. 
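A recurring refactor in this file: the do { ... } while (0) statement macros (__do_strncpy_from_user above, __do_clear_user here, and the __copy_user pair further down) become static functions that return the residual byte count instead of updating an argument in place — which is what lets the callers below switch to the form n = __do_clear_user(to, n). A small standalone C illustration of the two shapes, with a trivial stand-in body and hypothetical names:

#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Statement-macro shape used by the old code: the residual count comes
 * back through the writable 'size' argument, so call sites cannot
 * compose it into an expression. */
#define DO_CLEAR_MACRO(addr, size)      \
do {                                    \
    memset((addr), 0, (size));          \
    (size) = 0; /* bytes NOT cleared */ \
} while (0)

/* Function shape the patch switches to: same work, but the residual
 * count is the return value, which composes cleanly at call sites. */
static size_t do_clear_func(void *addr, size_t size)
{
    memset(addr, 0, size);
    return 0; /* bytes NOT cleared */
}

int main(void)
{
    char buf[16];
    size_t n = sizeof(buf);

    DO_CLEAR_MACRO(buf, n);               /* n is updated in place */
    printf("macro residual: %zu\n", n);

    n = do_clear_func(buf, sizeof(buf));  /* residual is returned */
    printf("func residual:  %zu\n", n);
    return 0;
}
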
-@@ -157,7 +168,7 @@ clear_user(void __user *to, unsigned lon - { - might_fault(); - if (access_ok(VERIFY_WRITE, to, n)) -- __do_clear_user(to, n); -+ n = __do_clear_user(to, n); - return n; - } - EXPORT_SYMBOL(clear_user); -@@ -176,8 +187,7 @@ EXPORT_SYMBOL(clear_user); - unsigned long - __clear_user(void __user *to, unsigned long n) - { -- __do_clear_user(to, n); -- return n; -+ return __do_clear_user(to, n); - } - EXPORT_SYMBOL(__clear_user); - -@@ -200,14 +210,17 @@ long strnlen_user(const char __user *s, - might_fault(); - - __asm__ __volatile__( -+ " movw %w8,%%es\n" - " testl %0, %0\n" - " jz 3f\n" -- " andl %0,%%ecx\n" -+ " movl %0,%%ecx\n" - "0: repne; scasb\n" - " setne %%al\n" - " subl %%ecx,%0\n" - " addl %0,%%eax\n" - "1:\n" -+ " pushl %%ss\n" -+ " popl %%es\n" - ".section .fixup,"ax"\n" - "2: xorl %%eax,%%eax\n" - " jmp 1b\n" -@@ -219,7 +232,7 @@ long strnlen_user(const char __user *s, - " .long 0b,2b\n" - ".previous" - :"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp) -- :"0" (n), "1" (s), "2" (0), "3" (mask) -+ :"0" (n), "1" (s), "2" (0), "3" (mask), "r" (__USER_DS) - :"cc"); - return res & mask; - } -@@ -227,10 +240,121 @@ EXPORT_SYMBOL(strnlen_user); - - #ifdef CONFIG_X86_INTEL_USERCOPY - static unsigned long --__copy_user_intel(void __user *to, const void *from, unsigned long size) -+__generic_copy_to_user_intel(void __user *to, const void *from, unsigned long size) -+{ -+ int d0, d1; -+ __asm__ __volatile__( -+ " movw %w6, %%es\n" -+ " .align 2,0x90\n" -+ "1: movl 32(%4), %%eax\n" -+ " cmpl $67, %0\n" -+ " jbe 3f\n" -+ "2: movl 64(%4), %%eax\n" -+ " .align 2,0x90\n" -+ "3: movl 0(%4), %%eax\n" -+ "4: movl 4(%4), %%edx\n" -+ "5: movl %%eax, %%es:0(%3)\n" -+ "6: movl %%edx, %%es:4(%3)\n" -+ "7: movl 8(%4), %%eax\n" -+ "8: movl 12(%4),%%edx\n" -+ "9: movl %%eax, %%es:8(%3)\n" -+ "10: movl %%edx, %%es:12(%3)\n" -+ "11: movl 16(%4), %%eax\n" -+ "12: movl 20(%4), %%edx\n" -+ "13: movl %%eax, %%es:16(%3)\n" -+ "14: movl %%edx, %%es:20(%3)\n" -+ "15: movl 24(%4), %%eax\n" -+ "16: movl 28(%4), %%edx\n" -+ "17: movl %%eax, %%es:24(%3)\n" -+ "18: movl %%edx, %%es:28(%3)\n" -+ "19: movl 32(%4), %%eax\n" -+ "20: movl 36(%4), %%edx\n" -+ "21: movl %%eax, %%es:32(%3)\n" -+ "22: movl %%edx, %%es:36(%3)\n" -+ "23: movl 40(%4), %%eax\n" -+ "24: movl 44(%4), %%edx\n" -+ "25: movl %%eax, %%es:40(%3)\n" -+ "26: movl %%edx, %%es:44(%3)\n" -+ "27: movl 48(%4), %%eax\n" -+ "28: movl 52(%4), %%edx\n" -+ "29: movl %%eax, %%es:48(%3)\n" -+ "30: movl %%edx, %%es:52(%3)\n" -+ "31: movl 56(%4), %%eax\n" -+ "32: movl 60(%4), %%edx\n" -+ "33: movl %%eax, %%es:56(%3)\n" -+ "34: movl %%edx, %%es:60(%3)\n" -+ " addl $-64, %0\n" -+ " addl $64, %4\n" -+ " addl $64, %3\n" -+ " cmpl $63, %0\n" -+ " ja 1b\n" -+ "35: movl %0, %%eax\n" -+ " shrl $2, %0\n" -+ " andl $3, %%eax\n" -+ " cld\n" -+ "99: rep; movsl\n" -+ "36: movl %%eax, %0\n" -+ "37: rep; movsb\n" -+ "100:\n" -+ " pushl %%ss\n" -+ " popl %%es\n" -+ ".section .fixup,"ax"\n" -+ "101: lea 0(%%eax,%0,4),%0\n" -+ " jmp 100b\n" -+ ".previous\n" -+ ".section __ex_table,"a"\n" -+ " .align 4\n" -+ " .long 1b,100b\n" -+ " .long 2b,100b\n" -+ " .long 3b,100b\n" -+ " .long 4b,100b\n" -+ " .long 5b,100b\n" -+ " .long 6b,100b\n" -+ " .long 7b,100b\n" -+ " .long 8b,100b\n" -+ " .long 9b,100b\n" -+ " .long 10b,100b\n" -+ " .long 11b,100b\n" -+ " .long 12b,100b\n" -+ " .long 13b,100b\n" -+ " .long 14b,100b\n" -+ " .long 15b,100b\n" -+ " .long 16b,100b\n" -+ " .long 17b,100b\n" -+ " .long 18b,100b\n" -+ " .long 19b,100b\n" -+ " .long 20b,100b\n" -+ " .long 21b,100b\n" 
-+ " .long 22b,100b\n" -+ " .long 23b,100b\n" -+ " .long 24b,100b\n" -+ " .long 25b,100b\n" -+ " .long 26b,100b\n" -+ " .long 27b,100b\n" -+ " .long 28b,100b\n" -+ " .long 29b,100b\n" -+ " .long 30b,100b\n" -+ " .long 31b,100b\n" -+ " .long 32b,100b\n" -+ " .long 33b,100b\n" -+ " .long 34b,100b\n" -+ " .long 35b,100b\n" -+ " .long 36b,100b\n" -+ " .long 37b,100b\n" -+ " .long 99b,101b\n" -+ ".previous" -+ : "=&c"(size), "=&D" (d0), "=&S" (d1) -+ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) -+ : "eax", "edx", "memory"); -+ return size; -+} -+ -+static unsigned long -+__generic_copy_from_user_intel(void *to, const void __user *from, unsigned long size) - { - int d0, d1; - __asm__ __volatile__( -+ " movw %w6, %%ds\n" - " .align 2,0x90\n" - "1: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" -@@ -239,36 +363,36 @@ __copy_user_intel(void __user *to, const - " .align 2,0x90\n" - "3: movl 0(%4), %%eax\n" - "4: movl 4(%4), %%edx\n" -- "5: movl %%eax, 0(%3)\n" -- "6: movl %%edx, 4(%3)\n" -+ "5: movl %%eax, %%es:0(%3)\n" -+ "6: movl %%edx, %%es:4(%3)\n" - "7: movl 8(%4), %%eax\n" - "8: movl 12(%4),%%edx\n" -- "9: movl %%eax, 8(%3)\n" -- "10: movl %%edx, 12(%3)\n" -+ "9: movl %%eax, %%es:8(%3)\n" -+ "10: movl %%edx, %%es:12(%3)\n" - "11: movl 16(%4), %%eax\n" - "12: movl 20(%4), %%edx\n" -- "13: movl %%eax, 16(%3)\n" -- "14: movl %%edx, 20(%3)\n" -+ "13: movl %%eax, %%es:16(%3)\n" -+ "14: movl %%edx, %%es:20(%3)\n" - "15: movl 24(%4), %%eax\n" - "16: movl 28(%4), %%edx\n" -- "17: movl %%eax, 24(%3)\n" -- "18: movl %%edx, 28(%3)\n" -+ "17: movl %%eax, %%es:24(%3)\n" -+ "18: movl %%edx, %%es:28(%3)\n" - "19: movl 32(%4), %%eax\n" - "20: movl 36(%4), %%edx\n" -- "21: movl %%eax, 32(%3)\n" -- "22: movl %%edx, 36(%3)\n" -+ "21: movl %%eax, %%es:32(%3)\n" -+ "22: movl %%edx, %%es:36(%3)\n" - "23: movl 40(%4), %%eax\n" - "24: movl 44(%4), %%edx\n" -- "25: movl %%eax, 40(%3)\n" -- "26: movl %%edx, 44(%3)\n" -+ "25: movl %%eax, %%es:40(%3)\n" -+ "26: movl %%edx, %%es:44(%3)\n" - "27: movl 48(%4), %%eax\n" - "28: movl 52(%4), %%edx\n" -- "29: movl %%eax, 48(%3)\n" -- "30: movl %%edx, 52(%3)\n" -+ "29: movl %%eax, %%es:48(%3)\n" -+ "30: movl %%edx, %%es:52(%3)\n" - "31: movl 56(%4), %%eax\n" - "32: movl 60(%4), %%edx\n" -- "33: movl %%eax, 56(%3)\n" -- "34: movl %%edx, 60(%3)\n" -+ "33: movl %%eax, %%es:56(%3)\n" -+ "34: movl %%edx, %%es:60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" -@@ -282,6 +406,8 @@ __copy_user_intel(void __user *to, const - "36: movl %%eax, %0\n" - "37: rep; movsb\n" - "100:\n" -+ " pushl %%ss\n" -+ " popl %%ds\n" - ".section .fixup,"ax"\n" - "101: lea 0(%%eax,%0,4),%0\n" - " jmp 100b\n" -@@ -328,7 +454,7 @@ __copy_user_intel(void __user *to, const - " .long 99b,101b\n" - ".previous" - : "=&c"(size), "=&D" (d0), "=&S" (d1) -- : "1"(to), "2"(from), "0"(size) -+ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) - : "eax", "edx", "memory"); - return size; - } -@@ -338,6 +464,7 @@ __copy_user_zeroing_intel(void *to, cons - { - int d0, d1; - __asm__ __volatile__( -+ " movw %w6, %%ds\n" - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" -@@ -346,36 +473,36 @@ __copy_user_zeroing_intel(void *to, cons - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" -- " movl %%eax, 0(%3)\n" -- " movl %%edx, 4(%3)\n" -+ " movl %%eax, %%es:0(%3)\n" -+ " movl %%edx, %%es:4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" -- " movl %%eax, 8(%3)\n" -- " movl %%edx, 12(%3)\n" -+ " movl %%eax, %%es:8(%3)\n" -+ " movl %%edx, %%es:12(%3)\n" - 
"4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" -- " movl %%eax, 16(%3)\n" -- " movl %%edx, 20(%3)\n" -+ " movl %%eax, %%es:16(%3)\n" -+ " movl %%edx, %%es:20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" -- " movl %%eax, 24(%3)\n" -- " movl %%edx, 28(%3)\n" -+ " movl %%eax, %%es:24(%3)\n" -+ " movl %%edx, %%es:28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" -- " movl %%eax, 32(%3)\n" -- " movl %%edx, 36(%3)\n" -+ " movl %%eax, %%es:32(%3)\n" -+ " movl %%edx, %%es:36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" -- " movl %%eax, 40(%3)\n" -- " movl %%edx, 44(%3)\n" -+ " movl %%eax, %%es:40(%3)\n" -+ " movl %%edx, %%es:44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" -- " movl %%eax, 48(%3)\n" -- " movl %%edx, 52(%3)\n" -+ " movl %%eax, %%es:48(%3)\n" -+ " movl %%edx, %%es:52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" -- " movl %%eax, 56(%3)\n" -- " movl %%edx, 60(%3)\n" -+ " movl %%eax, %%es:56(%3)\n" -+ " movl %%edx, %%es:60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" -@@ -389,6 +516,8 @@ __copy_user_zeroing_intel(void *to, cons - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" -+ " pushl %%ss\n" -+ " popl %%ds\n" - ".section .fixup,"ax"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" -@@ -423,7 +552,7 @@ __copy_user_zeroing_intel(void *to, cons - " .long 7b,16b\n" - ".previous" - : "=&c"(size), "=&D" (d0), "=&S" (d1) -- : "1"(to), "2"(from), "0"(size) -+ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) - : "eax", "edx", "memory"); - return size; - } -@@ -439,6 +568,7 @@ static unsigned long __copy_user_zeroing - int d0, d1; - - __asm__ __volatile__( -+ " movw %w6, %%ds\n" - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" -@@ -447,36 +577,36 @@ static unsigned long __copy_user_zeroing - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" -- " movnti %%eax, 0(%3)\n" -- " movnti %%edx, 4(%3)\n" -+ " movnti %%eax, %%es:0(%3)\n" -+ " movnti %%edx, %%es:4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" -- " movnti %%eax, 8(%3)\n" -- " movnti %%edx, 12(%3)\n" -+ " movnti %%eax, %%es:8(%3)\n" -+ " movnti %%edx, %%es:12(%3)\n" - "4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" -- " movnti %%eax, 16(%3)\n" -- " movnti %%edx, 20(%3)\n" -+ " movnti %%eax, %%es:16(%3)\n" -+ " movnti %%edx, %%es:20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" -- " movnti %%eax, 24(%3)\n" -- " movnti %%edx, 28(%3)\n" -+ " movnti %%eax, %%es:24(%3)\n" -+ " movnti %%edx, %%es:28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" -- " movnti %%eax, 32(%3)\n" -- " movnti %%edx, 36(%3)\n" -+ " movnti %%eax, %%es:32(%3)\n" -+ " movnti %%edx, %%es:36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" -- " movnti %%eax, 40(%3)\n" -- " movnti %%edx, 44(%3)\n" -+ " movnti %%eax, %%es:40(%3)\n" -+ " movnti %%edx, %%es:44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" -- " movnti %%eax, 48(%3)\n" -- " movnti %%edx, 52(%3)\n" -+ " movnti %%eax, %%es:48(%3)\n" -+ " movnti %%edx, %%es:52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" -- " movnti %%eax, 56(%3)\n" -- " movnti %%edx, 60(%3)\n" -+ " movnti %%eax, %%es:56(%3)\n" -+ " movnti %%edx, %%es:60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" -@@ -491,6 +621,8 @@ static unsigned long __copy_user_zeroing - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" -+ " pushl %%ss\n" -+ " popl 
%%ds\n" - ".section .fixup,"ax"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" -@@ -525,7 +657,7 @@ static unsigned long __copy_user_zeroing - " .long 7b,16b\n" - ".previous" - : "=&c"(size), "=&D" (d0), "=&S" (d1) -- : "1"(to), "2"(from), "0"(size) -+ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) - : "eax", "edx", "memory"); - return size; - } -@@ -536,6 +668,7 @@ static unsigned long __copy_user_intel_n - int d0, d1; - - __asm__ __volatile__( -+ " movw %w6, %%ds\n" - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" -@@ -544,36 +677,36 @@ static unsigned long __copy_user_intel_n - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" -- " movnti %%eax, 0(%3)\n" -- " movnti %%edx, 4(%3)\n" -+ " movnti %%eax, %%es:0(%3)\n" -+ " movnti %%edx, %%es:4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" -- " movnti %%eax, 8(%3)\n" -- " movnti %%edx, 12(%3)\n" -+ " movnti %%eax, %%es:8(%3)\n" -+ " movnti %%edx, %%es:12(%3)\n" - "4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" -- " movnti %%eax, 16(%3)\n" -- " movnti %%edx, 20(%3)\n" -+ " movnti %%eax, %%es:16(%3)\n" -+ " movnti %%edx, %%es:20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" -- " movnti %%eax, 24(%3)\n" -- " movnti %%edx, 28(%3)\n" -+ " movnti %%eax, %%es:24(%3)\n" -+ " movnti %%edx, %%es:28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" -- " movnti %%eax, 32(%3)\n" -- " movnti %%edx, 36(%3)\n" -+ " movnti %%eax, %%es:32(%3)\n" -+ " movnti %%edx, %%es:36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" -- " movnti %%eax, 40(%3)\n" -- " movnti %%edx, 44(%3)\n" -+ " movnti %%eax, %%es:40(%3)\n" -+ " movnti %%edx, %%es:44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" -- " movnti %%eax, 48(%3)\n" -- " movnti %%edx, 52(%3)\n" -+ " movnti %%eax, %%es:48(%3)\n" -+ " movnti %%edx, %%es:52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" -- " movnti %%eax, 56(%3)\n" -- " movnti %%edx, 60(%3)\n" -+ " movnti %%eax, %%es:56(%3)\n" -+ " movnti %%edx, %%es:60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" -@@ -588,6 +721,8 @@ static unsigned long __copy_user_intel_n - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" -+ " pushl %%ss\n" -+ " popl %%ds\n" - ".section .fixup,"ax"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: jmp 8b\n" -@@ -616,7 +751,7 @@ static unsigned long __copy_user_intel_n - " .long 7b,16b\n" - ".previous" - : "=&c"(size), "=&D" (d0), "=&S" (d1) -- : "1"(to), "2"(from), "0"(size) -+ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) - : "eax", "edx", "memory"); - return size; - } -@@ -629,90 +764,146 @@ static unsigned long __copy_user_intel_n - */ - unsigned long __copy_user_zeroing_intel(void *to, const void __user *from, - unsigned long size); --unsigned long __copy_user_intel(void __user *to, const void *from, -+unsigned long __generic_copy_to_user_intel(void __user *to, const void *from, -+ unsigned long size); -+unsigned long __generic_copy_from_user_intel(void *to, const void __user *from, - unsigned long size); - unsigned long __copy_user_zeroing_intel_nocache(void *to, - const void __user *from, unsigned long size); - #endif /* CONFIG_X86_INTEL_USERCOPY */ - - /* Generic arbitrary sized copy. 
*/ --#define __copy_user(to, from, size) \ --do { \ -- int __d0, __d1, __d2; \ -- __asm__ __volatile__( \ -- " cmp $7,%0\n" \ -- " jbe 1f\n" \ -- " movl %1,%0\n" \ -- " negl %0\n" \ -- " andl $7,%0\n" \ -- " subl %0,%3\n" \ -- "4: rep; movsb\n" \ -- " movl %3,%0\n" \ -- " shrl $2,%0\n" \ -- " andl $3,%3\n" \ -- " .align 2,0x90\n" \ -- "0: rep; movsl\n" \ -- " movl %3,%0\n" \ -- "1: rep; movsb\n" \ -- "2:\n" \ -- ".section .fixup,"ax"\n" \ -- "5: addl %3,%0\n" \ -- " jmp 2b\n" \ -- "3: lea 0(%3,%0,4),%0\n" \ -- " jmp 2b\n" \ -- ".previous\n" \ -- ".section __ex_table,"a"\n" \ -- " .align 4\n" \ -- " .long 4b,5b\n" \ -- " .long 0b,3b\n" \ -- " .long 1b,2b\n" \ -- ".previous" \ -- : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ -- : "3"(size), "0"(size), "1"(to), "2"(from) \ -- : "memory"); \ --} while (0) -- --#define __copy_user_zeroing(to, from, size) \ --do { \ -- int __d0, __d1, __d2; \ -- __asm__ __volatile__( \ -- " cmp $7,%0\n" \ -- " jbe 1f\n" \ -- " movl %1,%0\n" \ -- " negl %0\n" \ -- " andl $7,%0\n" \ -- " subl %0,%3\n" \ -- "4: rep; movsb\n" \ -- " movl %3,%0\n" \ -- " shrl $2,%0\n" \ -- " andl $3,%3\n" \ -- " .align 2,0x90\n" \ -- "0: rep; movsl\n" \ -- " movl %3,%0\n" \ -- "1: rep; movsb\n" \ -- "2:\n" \ -- ".section .fixup,"ax"\n" \ -- "5: addl %3,%0\n" \ -- " jmp 6f\n" \ -- "3: lea 0(%3,%0,4),%0\n" \ -- "6: pushl %0\n" \ -- " pushl %%eax\n" \ -- " xorl %%eax,%%eax\n" \ -- " rep; stosb\n" \ -- " popl %%eax\n" \ -- " popl %0\n" \ -- " jmp 2b\n" \ -- ".previous\n" \ -- ".section __ex_table,"a"\n" \ -- " .align 4\n" \ -- " .long 4b,5b\n" \ -- " .long 0b,3b\n" \ -- " .long 1b,6b\n" \ -- ".previous" \ -- : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ -- : "3"(size), "0"(size), "1"(to), "2"(from) \ -- : "memory"); \ --} while (0) -+static unsigned long -+__generic_copy_to_user(void __user *to, const void *from, unsigned long size) -+{ -+ int __d0, __d1, __d2; -+ -+ __asm__ __volatile__( -+ " movw %w8,%%es\n" -+ " cmp $7,%0\n" -+ " jbe 1f\n" -+ " movl %1,%0\n" -+ " negl %0\n" -+ " andl $7,%0\n" -+ " subl %0,%3\n" -+ "4: rep; movsb\n" -+ " movl %3,%0\n" -+ " shrl $2,%0\n" -+ " andl $3,%3\n" -+ " .align 2,0x90\n" -+ "0: rep; movsl\n" -+ " movl %3,%0\n" -+ "1: rep; movsb\n" -+ "2:\n" -+ " pushl %%ss\n" -+ " popl %%es\n" -+ ".section .fixup,"ax"\n" -+ "5: addl %3,%0\n" -+ " jmp 2b\n" -+ "3: lea 0(%3,%0,4),%0\n" -+ " jmp 2b\n" -+ ".previous\n" -+ ".section __ex_table,"a"\n" -+ " .align 4\n" -+ " .long 4b,5b\n" -+ " .long 0b,3b\n" -+ " .long 1b,2b\n" -+ ".previous" -+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) -+ : "3"(size), "0"(size), "1"(to), "2"(from), "r"(__USER_DS) -+ : "memory"); -+ return size; -+} -+ -+static unsigned long -+__generic_copy_from_user(void *to, const void __user *from, unsigned long size) -+{ -+ int __d0, __d1, __d2; -+ -+ __asm__ __volatile__( -+ " movw %w8,%%ds\n" -+ " cmp $7,%0\n" -+ " jbe 1f\n" -+ " movl %1,%0\n" -+ " negl %0\n" -+ " andl $7,%0\n" -+ " subl %0,%3\n" -+ "4: rep; movsb\n" -+ " movl %3,%0\n" -+ " shrl $2,%0\n" -+ " andl $3,%3\n" -+ " .align 2,0x90\n" -+ "0: rep; movsl\n" -+ " movl %3,%0\n" -+ "1: rep; movsb\n" -+ "2:\n" -+ " pushl %%ss\n" -+ " popl %%ds\n" -+ ".section .fixup,"ax"\n" -+ "5: addl %3,%0\n" -+ " jmp 2b\n" -+ "3: lea 0(%3,%0,4),%0\n" -+ " jmp 2b\n" -+ ".previous\n" -+ ".section __ex_table,"a"\n" -+ " .align 4\n" -+ " .long 4b,5b\n" -+ " .long 0b,3b\n" -+ " .long 1b,2b\n" -+ ".previous" -+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) -+ : "3"(size), "0"(size), "1"(to), "2"(from), "r"(__USER_DS) -+ : 
"memory"); -+ return size; -+} -+ -+static unsigned long -+__copy_user_zeroing(void *to, const void __user *from, unsigned long size) -+{ -+ int __d0, __d1, __d2; -+ -+ __asm__ __volatile__( -+ " movw %w8,%%ds\n" -+ " cmp $7,%0\n" -+ " jbe 1f\n" -+ " movl %1,%0\n" -+ " negl %0\n" -+ " andl $7,%0\n" -+ " subl %0,%3\n" -+ "4: rep; movsb\n" -+ " movl %3,%0\n" -+ " shrl $2,%0\n" -+ " andl $3,%3\n" -+ " .align 2,0x90\n" -+ "0: rep; movsl\n" -+ " movl %3,%0\n" -+ "1: rep; movsb\n" -+ "2:\n" -+ " pushl %%ss\n" -+ " popl %%ds\n" -+ ".section .fixup,"ax"\n" -+ "5: addl %3,%0\n" -+ " jmp 6f\n" -+ "3: lea 0(%3,%0,4),%0\n" -+ "6: pushl %0\n" -+ " pushl %%eax\n" -+ " xorl %%eax,%%eax\n" -+ " rep; stosb\n" -+ " popl %%eax\n" -+ " popl %0\n" -+ " jmp 2b\n" -+ ".previous\n" -+ ".section __ex_table,"a"\n" -+ " .align 4\n" -+ " .long 4b,5b\n" -+ " .long 0b,3b\n" -+ " .long 1b,6b\n" -+ ".previous" -+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) -+ : "3"(size), "0"(size), "1"(to), "2"(from), "r"(__USER_DS) -+ : "memory"); -+ return size; -+} - - unsigned long __copy_to_user_ll(void __user *to, const void *from, - unsigned long n) -@@ -775,9 +966,9 @@ survive: - } - #endif - if (movsl_is_ok(to, from, n)) -- __copy_user(to, from, n); -+ n = __generic_copy_to_user(to, from, n); - else -- n = __copy_user_intel(to, from, n); -+ n = __generic_copy_to_user_intel(to, from, n); - return n; - } - EXPORT_SYMBOL(__copy_to_user_ll); -@@ -786,7 +977,7 @@ unsigned long __copy_from_user_ll(void * - unsigned long n) - { - if (movsl_is_ok(to, from, n)) -- __copy_user_zeroing(to, from, n); -+ n = __copy_user_zeroing(to, from, n); - else - n = __copy_user_zeroing_intel(to, from, n); - return n; -@@ -797,10 +988,9 @@ unsigned long __copy_from_user_ll_nozero - unsigned long n) - { - if (movsl_is_ok(to, from, n)) -- __copy_user(to, from, n); -+ n = __generic_copy_from_user(to, from, n); - else -- n = __copy_user_intel((void __user *)to, -- (const void *)from, n); -+ n = __generic_copy_from_user_intel(to, from, n); - return n; - } - EXPORT_SYMBOL(__copy_from_user_ll_nozero); -@@ -812,9 +1002,9 @@ unsigned long __copy_from_user_ll_nocach - if (n > 64 && cpu_has_xmm2) - n = __copy_user_zeroing_intel_nocache(to, from, n); - else -- __copy_user_zeroing(to, from, n); -+ n = __copy_user_zeroing(to, from, n); - #else -- __copy_user_zeroing(to, from, n); -+ n = __copy_user_zeroing(to, from, n); - #endif - return n; - } -@@ -827,59 +1017,37 @@ unsigned long __copy_from_user_ll_nocach - if (n > 64 && cpu_has_xmm2) - n = __copy_user_intel_nocache(to, from, n); - else -- __copy_user(to, from, n); -+ n = __generic_copy_from_user(to, from, n); - #else -- __copy_user(to, from, n); -+ n = __generic_copy_from_user(to, from, n); - #endif - return n; - } - EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); - --/** -- * copy_to_user: - Copy a block of data into user space. -- * @to: Destination address, in user space. -- * @from: Source address, in kernel space. -- * @n: Number of bytes to copy. -- * -- * Context: User context only. This function may sleep. -- * -- * Copy data from kernel space to user space. -- * -- * Returns number of bytes that could not be copied. -- * On success, this will be zero. 
-- */ --unsigned long --copy_to_user(void __user *to, const void *from, unsigned long n) -+#ifdef CONFIG_PAX_MEMORY_UDEREF -+void __set_fs(mm_segment_t x, int cpu) - { -- if (access_ok(VERIFY_WRITE, to, n)) -- n = __copy_to_user(to, from, n); -- return n; -+ unsigned long limit = x.seg; -+ struct desc_struct d; -+ -+ current_thread_info()->addr_limit = x; -+ if (likely(limit)) -+ limit = (limit - 1UL) >> PAGE_SHIFT; -+ pack_descriptor(&d, 0UL, limit, 0xF3, 0xC); -+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_DEFAULT_USER_DS, &d, DESCTYPE_S); - } --EXPORT_SYMBOL(copy_to_user); - --/** -- * copy_from_user: - Copy a block of data from user space. -- * @to: Destination address, in kernel space. -- * @from: Source address, in user space. -- * @n: Number of bytes to copy. -- * -- * Context: User context only. This function may sleep. -- * -- * Copy data from user space to kernel space. -- * -- * Returns number of bytes that could not be copied. -- * On success, this will be zero. -- * -- * If some data could not be copied, this function will pad the copied -- * data to the requested size using zero bytes. -- */ --unsigned long --copy_from_user(void *to, const void __user *from, unsigned long n) -+void set_fs(mm_segment_t x) - { -- if (access_ok(VERIFY_READ, from, n)) -- n = __copy_from_user(to, from, n); -- else -- memset(to, 0, n); -- return n; -+ __set_fs(x, get_cpu()); -+ put_cpu(); - } --EXPORT_SYMBOL(copy_from_user); -+#else -+void set_fs(mm_segment_t x) -+{ -+ current_thread_info()->addr_limit = x; -+} -+#endif -+ -+EXPORT_SYMBOL(set_fs); -diff -urNp linux-2.6.31.1/arch/x86/Makefile linux-2.6.31.1/arch/x86/Makefile ---- linux-2.6.31.1/arch/x86/Makefile 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/Makefile 2009-10-01 20:12:42.000000000 -0400 -@@ -188,3 +188,12 @@ define archhelp - echo ' FDARGS="..." arguments for the booted kernel' - echo ' FDINITRD=file initrd for the booted kernel' - endef -+ -+define OLD_LD -+ -+*** ${VERSION}.${PATCHLEVEL} PaX kernels no longer build correctly with old versions of binutils. -+*** Please upgrade your binutils to 2.18 or newer -+endef -+ -+archprepare: -+ $(if $(LDFLAGS_BUILD_ID),,$(error $(OLD_LD))) -diff -urNp linux-2.6.31.1/arch/x86/mm/extable.c linux-2.6.31.1/arch/x86/mm/extable.c ---- linux-2.6.31.1/arch/x86/mm/extable.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/extable.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1,14 +1,81 @@ - #include <linux/module.h> - #include <linux/spinlock.h> -+#include <linux/sort.h> - #include <asm/uaccess.h> - -+/* -+ * The exception table needs to be sorted so that the binary -+ * search that we use to find entries in it works properly. -+ * This is used both for the kernel exception table and for -+ * the exception tables of modules that get loaded. 
-+ */ -+static int cmp_ex(const void *a, const void *b) -+{ -+ const struct exception_table_entry *x = a, *y = b; -+ -+ /* avoid overflow */ -+ if (x->insn > y->insn) -+ return 1; -+ if (x->insn < y->insn) -+ return -1; -+ return 0; -+} -+ -+static void swap_ex(void *a, void *b, int size) -+{ -+ struct exception_table_entry t, *x = a, *y = b; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ -+ t = *x; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ *x = *y; -+ *y = t; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+} -+ -+void sort_extable(struct exception_table_entry *start, -+ struct exception_table_entry *finish) -+{ -+ sort(start, finish - start, sizeof(struct exception_table_entry), -+ cmp_ex, swap_ex); -+} -+ -+#ifdef CONFIG_MODULES -+/* -+ * If the exception table is sorted, any referring to the module init -+ * will be at the beginning or the end. -+ */ -+void trim_init_extable(struct module *m) -+{ -+ /*trim the beginning*/ -+ while (m->num_exentries && within_module_init(m->extable[0].insn, m)) { -+ m->extable++; -+ m->num_exentries--; -+ } -+ /*trim the end*/ -+ while (m->num_exentries && -+ within_module_init(m->extable[m->num_exentries-1].insn, m)) -+ m->num_exentries--; -+} -+#endif /* CONFIG_MODULES */ - - int fixup_exception(struct pt_regs *regs) - { - const struct exception_table_entry *fixup; - - #ifdef CONFIG_PNPBIOS -- if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { -+ if (unlikely(!v8086_mode(regs) && SEGMENT_IS_PNP_CODE(regs->cs))) { - extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; - extern u32 pnp_bios_is_utter_crap; - pnp_bios_is_utter_crap = 1; -diff -urNp linux-2.6.31.1/arch/x86/mm/fault.c linux-2.6.31.1/arch/x86/mm/fault.c ---- linux-2.6.31.1/arch/x86/mm/fault.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/fault.c 2009-10-01 20:12:42.000000000 -0400 -@@ -11,10 +11,14 @@ - #include <linux/kprobes.h> /* __kprobes, ... */ - #include <linux/mmiotrace.h> /* kmmio_handler, ... */ - #include <linux/perf_counter.h> /* perf_swcounter_event */ -+#include <linux/unistd.h> -+#include <linux/compiler.h> - - #include <asm/traps.h> /* dotraplinkage, ... */ - #include <asm/pgalloc.h> /* pgd_*(), ... */ - #include <asm/kmemcheck.h> /* kmemcheck_*(), ... 
*/ -+#include <asm/vsyscall.h> -+#include <asm/tlbflush.h> - - /* - * Page fault error code bits: -@@ -51,7 +55,7 @@ static inline int notify_page_fault(stru - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ -- if (kprobes_built_in() && !user_mode_vm(regs)) { -+ if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; -@@ -171,6 +175,30 @@ force_sig_info_fault(int si_signo, int s - force_sig_info(si_signo, &info, tsk); - } - -+#ifdef CONFIG_PAX_EMUTRAMP -+static int pax_handle_fetch_fault(struct pt_regs *regs); -+#endif -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+static inline pmd_t * pax_get_pmd(struct mm_struct *mm, unsigned long address) -+{ -+ pgd_t *pgd; -+ pud_t *pud; -+ pmd_t *pmd; -+ -+ pgd = pgd_offset(mm, address); -+ if (!pgd_present(*pgd)) -+ return NULL; -+ pud = pud_offset(pgd, address); -+ if (!pud_present(*pud)) -+ return NULL; -+ pmd = pmd_offset(pud, address); -+ if (!pmd_present(*pmd)) -+ return NULL; -+ return pmd; -+} -+#endif -+ - DEFINE_SPINLOCK(pgd_lock); - LIST_HEAD(pgd_list); - -@@ -543,7 +571,7 @@ static int is_errata93(struct pt_regs *r - static int is_errata100(struct pt_regs *regs, unsigned long address) - { - #ifdef CONFIG_X86_64 -- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) -+ if ((regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT)) && (address >> 32)) - return 1; - #endif - return 0; -@@ -570,7 +598,7 @@ static int is_f00f_bug(struct pt_regs *r - } - - static const char nx_warning[] = KERN_CRIT --"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; -+"kernel tried to execute NX-protected page - exploit attempt? (uid: %d, task: %s, pid: %d)\n"; - - static void - show_fault_oops(struct pt_regs *regs, unsigned long error_code, -@@ -579,15 +607,31 @@ show_fault_oops(struct pt_regs *regs, un - if (!oops_may_print()) - return; - -- if (error_code & PF_INSTR) { -+ if (nx_enabled && (error_code & PF_INSTR)) { - unsigned int level; - - pte_t *pte = lookup_address(address, &level); - - if (pte && pte_present(*pte) && !pte_exec(*pte)) -- printk(nx_warning, current_uid()); -+ printk(nx_warning, current_uid(), current->comm, task_pid_nr(current)); - } - -+#ifdef CONFIG_PAX_KERNEXEC -+#if defined(CONFIG_x86_32) && defined(CONFIG_MODULES) -+ if (init_mm.start_code <= address && address < (unsigned long)&MODULES_EXEC_END) -+#else -+ if (init_mm.start_code <= address && address < init_mm.end_code) -+#endif -+ { -+ if (current->signal->curr_ip) -+ printk(KERN_ERR "PAX: From %u.%u.%u.%u: %s:%d, uid/euid: %u/%u, attempted to modify kernel code\n", -+ NIPQUAD(current->signal->curr_ip), current->comm, task_pid_nr(current), current_uid(), current_euid()); -+ else -+ printk(KERN_ERR "PAX: %s:%d, uid/euid: %u/%u, attempted to modify kernel code\n", -+ current->comm, task_pid_nr(current), current_uid(), current_euid()); -+ } -+#endif -+ - printk(KERN_ALERT "BUG: unable to handle kernel "); - if (address < PAGE_SIZE) - printk(KERN_CONT "NULL pointer dereference"); -@@ -712,6 +756,68 @@ __bad_area_nosemaphore(struct pt_regs *r - unsigned long address, int si_code) - { - struct task_struct *tsk = current; -+ struct mm_struct *mm = tsk->mm; -+ -+#ifdef CONFIG_X86_64 -+ if (mm && (error_code & PF_INSTR)) { -+ if (regs->ip == (unsigned long)vgettimeofday) { -+ regs->ip = (unsigned long)VDSO64_SYMBOL(mm->context.vdso, fallback_gettimeofday); -+ return; -+ } else if (regs->ip == (unsigned long)vtime) { -+ regs->ip = (unsigned 
long)VDSO64_SYMBOL(mm->context.vdso, fallback_time); -+ return; -+ } else if (regs->ip == (unsigned long)vgetcpu) { -+ regs->ip = (unsigned long)VDSO64_SYMBOL(mm->context.vdso, getcpu); -+ return; -+ } -+ } -+#endif -+ -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+ if (mm && (error_code & PF_USER)) { -+ unsigned long ip = regs->ip; -+ -+ if (v8086_mode(regs)) -+ ip = ((regs->cs & 0xffff) << 4) + (regs->ip & 0xffff); -+ -+ /* -+ * It's possible to have interrupts off here: -+ */ -+ local_irq_enable(); -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && -+ ((nx_enabled && (error_code & PF_INSTR)) || (!(error_code & (PF_PROT | PF_WRITE)) && regs->ip == address))) { -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ switch (pax_handle_fetch_fault(regs)) { -+ case 2: -+ return; -+ } -+#endif -+ -+ pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp); -+ do_group_exit(SIGKILL); -+ } -+#endif -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && !(error_code & (PF_PROT | PF_WRITE)) && (regs->ip + SEGMEXEC_TASK_SIZE == address)) { -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ switch (pax_handle_fetch_fault(regs)) { -+ case 2: -+ return; -+ } -+#endif -+ -+ pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp); -+ do_group_exit(SIGKILL); -+ } -+#endif -+ -+ } -+#endif - - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { -@@ -846,6 +952,106 @@ static int spurious_fault_check(unsigned - return 1; - } - -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) -+static int pax_handle_pageexec_fault(struct pt_regs *regs, struct mm_struct *mm, unsigned long address, unsigned long error_code) -+{ -+ pte_t *pte; -+ pmd_t *pmd; -+ spinlock_t *ptl; -+ unsigned char pte_mask; -+ -+ if (nx_enabled || (error_code & (PF_PROT|PF_USER)) != (PF_PROT|PF_USER) || v8086_mode(regs) || -+ !(mm->pax_flags & MF_PAX_PAGEEXEC)) -+ return 0; -+ -+ /* PaX: it's our fault, let's handle it if we can */ -+ -+ /* PaX: take a look at read faults before acquiring any locks */ -+ if (unlikely(!(error_code & PF_WRITE) && (regs->ip == address))) { -+ /* instruction fetch attempt from a protected page in user mode */ -+ up_read(&mm->mmap_sem); -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ switch (pax_handle_fetch_fault(regs)) { -+ case 2: -+ return 1; -+ } -+#endif -+ -+ pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp); -+ do_group_exit(SIGKILL); -+ } -+ -+ pmd = pax_get_pmd(mm, address); -+ if (unlikely(!pmd)) -+ return 0; -+ -+ pte = pte_offset_map_lock(mm, pmd, address, &ptl); -+ if (unlikely(!(pte_val(*pte) & _PAGE_PRESENT) || pte_user(*pte))) { -+ pte_unmap_unlock(pte, ptl); -+ return 0; -+ } -+ -+ if (unlikely((error_code & PF_WRITE) && !pte_write(*pte))) { -+ /* write attempt to a protected page in user mode */ -+ pte_unmap_unlock(pte, ptl); -+ return 0; -+ } -+ -+#ifdef CONFIG_SMP -+ if (likely(address > get_limit(regs->cs) && cpu_isset(smp_processor_id(), mm->context.cpu_user_cs_mask))) -+#else -+ if (likely(address > get_limit(regs->cs))) -+#endif -+ { -+ set_pte(pte, pte_mkread(*pte)); -+ __flush_tlb_one(address); -+ pte_unmap_unlock(pte, ptl); -+ up_read(&mm->mmap_sem); -+ return 1; -+ } -+ -+ pte_mask = _PAGE_ACCESSED | _PAGE_USER | ((error_code & PF_WRITE) << (_PAGE_BIT_DIRTY-1)); -+ -+ /* -+ * PaX: fill DTLB with user rights and retry -+ */ -+ __asm__ __volatile__ ( -+#ifdef CONFIG_PAX_MEMORY_UDEREF -+ "movw %w4,%%es\n" -+#endif -+ "orb %2,(%1)\n" -+#if defined(CONFIG_M586) || defined(CONFIG_M586TSC) -+/* -+ * PaX: let this uncommented 
'invlpg' remind us on the behaviour of Intel's -+ * (and AMD's) TLBs. namely, they do not cache PTEs that would raise *any* -+ * page fault when examined during a TLB load attempt. this is true not only -+ * for PTEs holding a non-present entry but also present entries that will -+ * raise a page fault (such as those set up by PaX, or the copy-on-write -+ * mechanism). in effect it means that we do *not* need to flush the TLBs -+ * for our target pages since their PTEs are simply not in the TLBs at all. -+ -+ * the best thing in omitting it is that we gain around 15-20% speed in the -+ * fast path of the page fault handler and can get rid of tracing since we -+ * can no longer flush unintended entries. -+ */ -+ "invlpg (%0)\n" -+#endif -+ "testb $0,%%es:(%0)\n" -+ "xorb %3,(%1)\n" -+#ifdef CONFIG_PAX_MEMORY_UDEREF -+ "pushl %%ss\n" -+ "popl %%es\n" -+#endif -+ : -+ : "r" (address), "r" (pte), "q" (pte_mask), "i" (_PAGE_USER), "r" (__USER_DS) -+ : "memory", "cc"); -+ pte_unmap_unlock(pte, ptl); -+ up_read(&mm->mmap_sem); -+ return 1; -+} -+#endif -+ - /* - * Handle a spurious fault caused by a stale TLB entry. - * -@@ -912,6 +1118,9 @@ int show_unhandled_signals = 1; - static inline int - access_error(unsigned long error_code, int write, struct vm_area_struct *vma) - { -+ if (nx_enabled && (error_code & PF_INSTR) && !(vma->vm_flags & VM_EXEC)) -+ return 1; -+ - if (write) { - /* write, present and write, not present: */ - if (unlikely(!(vma->vm_flags & VM_WRITE))) -@@ -945,17 +1154,16 @@ do_page_fault(struct pt_regs *regs, unsi - { - struct vm_area_struct *vma; - struct task_struct *tsk; -- unsigned long address; - struct mm_struct *mm; - int write; - int fault; - -+ /* Get the faulting address: */ -+ const unsigned long address = read_cr2(); -+ - tsk = current; - mm = tsk->mm; - -- /* Get the faulting address: */ -- address = read_cr2(); -- - /* - * Detect and handle instructions that would cause a page fault for - * both a tracked kernel page and a userspace page. -@@ -1015,7 +1223,7 @@ do_page_fault(struct pt_regs *regs, unsi - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: - */ -- if (user_mode_vm(regs)) { -+ if (user_mode(regs)) { - local_irq_enable(); - error_code |= PF_USER; - } else { -@@ -1069,6 +1277,11 @@ do_page_fault(struct pt_regs *regs, unsi - might_sleep(); - } - -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) -+ if (pax_handle_pageexec_fault(regs, mm, address, error_code)) -+ return; -+#endif -+ - vma = find_vma(mm, address); - if (unlikely(!vma)) { - bad_area(regs, error_code, address); -@@ -1080,18 +1293,24 @@ do_page_fault(struct pt_regs *regs, unsi - bad_area(regs, error_code, address); - return; - } -- if (error_code & PF_USER) { -- /* -- * Accessing the stack below %sp is always a bug. -- * The large cushion allows instructions like enter -- * and pusha to work. ("enter $65535, $31" pushes -- * 32 pointers and then decrements %sp by 65535.) -- */ -- if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { -- bad_area(regs, error_code, address); -- return; -- } -+ /* -+ * Accessing the stack below %sp is always a bug. -+ * The large cushion allows instructions like enter -+ * and pusha to work. ("enter $65535, $31" pushes -+ * 32 pointers and then decrements %sp by 65535.) 
-+ */ -+ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < task_pt_regs(tsk)->sp)) { -+ bad_area(regs, error_code, address); -+ return; -+ } -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (unlikely((mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_end - SEGMEXEC_TASK_SIZE - 1 < address - SEGMEXEC_TASK_SIZE - 1)) { -+ bad_area(regs, error_code, address); -+ return; - } -+#endif -+ - if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); - return; -@@ -1135,3 +1354,174 @@ good_area: - - up_read(&mm->mmap_sem); - } -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+static int pax_handle_fetch_fault_32(struct pt_regs *regs) -+{ -+ int err; -+ -+ do { /* PaX: gcc trampoline emulation #1 */ -+ unsigned char mov1, mov2; -+ unsigned short jmp; -+ unsigned int addr1, addr2; -+ -+#ifdef CONFIG_X86_64 -+ if ((regs->ip + 11) >> 32) -+ break; -+#endif -+ -+ err = get_user(mov1, (unsigned char __user *)regs->ip); -+ err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1)); -+ err |= get_user(mov2, (unsigned char __user *)(regs->ip + 5)); -+ err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6)); -+ err |= get_user(jmp, (unsigned short __user *)(regs->ip + 10)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0xB9 && mov2 == 0xB8 && jmp == 0xE0FF) { -+ regs->cx = addr1; -+ regs->ax = addr2; -+ regs->ip = addr2; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: gcc trampoline emulation #2 */ -+ unsigned char mov, jmp; -+ unsigned int addr1, addr2; -+ -+#ifdef CONFIG_X86_64 -+ if ((regs->ip + 9) >> 32) -+ break; -+#endif -+ -+ err = get_user(mov, (unsigned char __user *)regs->ip); -+ err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1)); -+ err |= get_user(jmp, (unsigned char __user *)(regs->ip + 5)); -+ err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6)); -+ -+ if (err) -+ break; -+ -+ if (mov == 0xB9 && jmp == 0xE9) { -+ regs->cx = addr1; -+ regs->ip = (unsigned int)(regs->ip + addr2 + 10); -+ return 2; -+ } -+ } while (0); -+ -+ return 1; /* PaX in action */ -+} -+ -+#ifdef CONFIG_X86_64 -+static int pax_handle_fetch_fault_64(struct pt_regs *regs) -+{ -+ int err; -+ -+ do { /* PaX: gcc trampoline emulation #1 */ -+ unsigned short mov1, mov2, jmp1; -+ unsigned char jmp2; -+ unsigned int addr1; -+ unsigned long addr2; -+ -+ err = get_user(mov1, (unsigned short __user *)regs->ip); -+ err |= get_user(addr1, (unsigned int __user *)(regs->ip + 2)); -+ err |= get_user(mov2, (unsigned short __user *)(regs->ip + 6)); -+ err |= get_user(addr2, (unsigned long __user *)(regs->ip + 8)); -+ err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 16)); -+ err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 18)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0xBB41 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) { -+ regs->r11 = addr1; -+ regs->r10 = addr2; -+ regs->ip = addr1; -+ return 2; -+ } -+ } while (0); -+ -+ do { /* PaX: gcc trampoline emulation #2 */ -+ unsigned short mov1, mov2, jmp1; -+ unsigned char jmp2; -+ unsigned long addr1, addr2; -+ -+ err = get_user(mov1, (unsigned short __user *)regs->ip); -+ err |= get_user(addr1, (unsigned long __user *)(regs->ip + 2)); -+ err |= get_user(mov2, (unsigned short __user *)(regs->ip + 10)); -+ err |= get_user(addr2, (unsigned long __user *)(regs->ip + 12)); -+ err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 20)); -+ err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 22)); -+ -+ if (err) -+ break; -+ -+ if (mov1 == 0xBB49 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) { -+ regs->r11 = addr1; -+ 
regs->r10 = addr2; -+ regs->ip = addr1; -+ return 2; -+ } -+ } while (0); -+ -+ return 1; /* PaX in action */ -+} -+#endif -+ -+/* -+ * PaX: decide what to do with offenders (regs->ip = fault address) -+ * -+ * returns 1 when task should be killed -+ * 2 when gcc trampoline was detected -+ */ -+static int pax_handle_fetch_fault(struct pt_regs *regs) -+{ -+ if (v8086_mode(regs)) -+ return 1; -+ -+ if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP)) -+ return 1; -+ -+#ifdef CONFIG_X86_32 -+ return pax_handle_fetch_fault_32(regs); -+#else -+ if (regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT)) -+ return pax_handle_fetch_fault_32(regs); -+ else -+ return pax_handle_fetch_fault_64(regs); -+#endif -+} -+#endif -+ -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+void pax_report_insns(void *pc, void *sp) -+{ -+ long i; -+ -+ printk(KERN_ERR "PAX: bytes at PC: "); -+ for (i = 0; i < 20; i++) { -+ unsigned char c; -+ if (get_user(c, (unsigned char __user *)pc+i)) -+ printk(KERN_CONT "?? "); -+ else -+ printk(KERN_CONT "%02x ", c); -+ } -+ printk("\n"); -+ -+ printk(KERN_ERR "PAX: bytes at SP-%lu: ", (unsigned long)sizeof(long)); -+ for (i = -1; i < 80 / sizeof(long); i++) { -+ unsigned long c; -+ if (get_user(c, (unsigned long __user *)sp+i)) -+#ifdef CONFIG_X86_32 -+ printk(KERN_CONT "???????? "); -+#else -+ printk(KERN_CONT "???????????????? "); -+#endif -+ else -+ printk(KERN_CONT "%0*lx ", 2 * (int)sizeof(long), c); -+ } -+ printk("\n"); -+} -+#endif -diff -urNp linux-2.6.31.1/arch/x86/mm/highmem_32.c linux-2.6.31.1/arch/x86/mm/highmem_32.c ---- linux-2.6.31.1/arch/x86/mm/highmem_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/highmem_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -32,6 +32,10 @@ void *kmap_atomic_prot(struct page *page - enum fixed_addresses idx; - unsigned long vaddr; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ - pagefault_disable(); - -@@ -43,8 +47,17 @@ void *kmap_atomic_prot(struct page *page - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - set_pte(kmap_pte-idx, mk_pte(page, prot)); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - return (void *)vaddr; - } - -@@ -58,15 +71,29 @@ void kunmap_atomic(void *kvaddr, enum km - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. 
- */ -- if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) -+ if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - kpte_clear_flush(kmap_pte-idx, vaddr); -- else { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ } else { - #ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); -diff -urNp linux-2.6.31.1/arch/x86/mm/hugetlbpage.c linux-2.6.31.1/arch/x86/mm/hugetlbpage.c ---- linux-2.6.31.1/arch/x86/mm/hugetlbpage.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/hugetlbpage.c 2009-10-01 20:12:42.000000000 -0400 -@@ -267,13 +267,18 @@ static unsigned long hugetlb_get_unmappe - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; -- unsigned long start_addr; -+ unsigned long start_addr, pax_task_size = TASK_SIZE; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif - - if (len > mm->cached_hole_size) { -- start_addr = mm->free_area_cache; -+ start_addr = mm->free_area_cache; - } else { -- start_addr = TASK_UNMAPPED_BASE; -- mm->cached_hole_size = 0; -+ start_addr = mm->mmap_base; -+ mm->cached_hole_size = 0; - } - - full_search: -@@ -281,13 +286,13 @@ full_search: - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ -- if (TASK_SIZE - len < addr) { -+ if (pax_task_size - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ -- if (start_addr != TASK_UNMAPPED_BASE) { -- start_addr = TASK_UNMAPPED_BASE; -+ if (start_addr != mm->mmap_base) { -+ start_addr = mm->mmap_base; - mm->cached_hole_size = 0; - goto full_search; - } -@@ -310,9 +315,8 @@ static unsigned long hugetlb_get_unmappe - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev_vma; -- unsigned long base = mm->mmap_base, addr = addr0; -+ unsigned long base = mm->mmap_base, addr; - unsigned long largest_hole = mm->cached_hole_size; -- int first_time = 1; - - /* don't allow allocations above current base */ - if (mm->free_area_cache > base) -@@ -322,7 +326,7 @@ static unsigned long hugetlb_get_unmappe - largest_hole = 0; - mm->free_area_cache = base; - } --try_again: -+ - /* make sure it can fit in the remaining address space */ - if (mm->free_area_cache < len) - goto fail; -@@ -364,22 +368,26 @@ try_again: - - fail: - /* -- * if hint left us with no space for the requested -- * mapping then try again: -- */ -- if (first_time) { -- mm->free_area_cache = base; -- largest_hole = 0; -- first_time = 0; -- goto try_again; -- } -- /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ -- mm->free_area_cache = TASK_UNMAPPED_BASE; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ mm->mmap_base = SEGMEXEC_TASK_UNMAPPED_BASE; -+ else -+#endif -+ -+ mm->mmap_base = TASK_UNMAPPED_BASE; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ -+ mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; - addr = hugetlb_get_unmapped_area_bottomup(file, addr0, - len, pgoff, flags); -@@ -387,6 +395,7 @@ fail: - /* - * Restore the topdown base: - */ -+ mm->mmap_base = base; - mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; - -@@ -400,10 +409,17 @@ hugetlb_get_unmapped_area(struct file *f - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; -+ unsigned long pax_task_size = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; -- if (len > TASK_SIZE) -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif -+ -+ if (len > pax_task_size) - return -ENOMEM; - - if (flags & MAP_FIXED) { -@@ -415,7 +431,7 @@ hugetlb_get_unmapped_area(struct file *f - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); -- if (TASK_SIZE - len >= addr && -+ if (pax_task_size - len >= addr && - (!vma || addr + len <= vma->vm_start)) - return addr; - } -diff -urNp linux-2.6.31.1/arch/x86/mm/init_32.c linux-2.6.31.1/arch/x86/mm/init_32.c ---- linux-2.6.31.1/arch/x86/mm/init_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/init_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -51,6 +51,7 @@ - #include <asm/cacheflush.h> - #include <asm/page_types.h> - #include <asm/init.h> -+#include <asm/desc.h> - - unsigned long highstart_pfn, highend_pfn; - -@@ -72,36 +73,6 @@ static __init void *alloc_low_page(void) - } - - /* -- * Creates a middle page table and puts a pointer to it in the -- * given global directory entry. This only returns the gd entry -- * in non-PAE compilation mode, since the middle layer is folded. 
-- */ --static pmd_t * __init one_md_table_init(pgd_t *pgd) --{ -- pud_t *pud; -- pmd_t *pmd_table; -- --#ifdef CONFIG_X86_PAE -- if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { -- if (after_bootmem) -- pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); -- else -- pmd_table = (pmd_t *)alloc_low_page(); -- paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); -- set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); -- pud = pud_offset(pgd, 0); -- BUG_ON(pmd_table != pmd_offset(pud, 0)); -- -- return pmd_table; -- } --#endif -- pud = pud_offset(pgd, 0); -- pmd_table = pmd_offset(pud, 0); -- -- return pmd_table; --} -- --/* - * Create a page table and place a pointer to it in a middle page - * directory entry: - */ -@@ -121,13 +92,28 @@ static pte_t * __init one_page_table_ini - page_table = (pte_t *)alloc_low_page(); - - paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+ set_pmd(pmd, __pmd(__pa(page_table) | _KERNPG_TABLE)); -+#else - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); -+#endif - BUG_ON(page_table != pte_offset_kernel(pmd, 0)); - } - - return pte_offset_kernel(pmd, 0); - } - -+static pmd_t * __init one_md_table_init(pgd_t *pgd) -+{ -+ pud_t *pud; -+ pmd_t *pmd_table; -+ -+ pud = pud_offset(pgd, 0); -+ pmd_table = pmd_offset(pud, 0); -+ -+ return pmd_table; -+} -+ - pmd_t * __init populate_extra_pmd(unsigned long vaddr) - { - int pgd_idx = pgd_index(vaddr); -@@ -201,6 +187,7 @@ page_table_range_init(unsigned long star - int pgd_idx, pmd_idx; - unsigned long vaddr; - pgd_t *pgd; -+ pud_t *pud; - pmd_t *pmd; - pte_t *pte = NULL; - -@@ -210,8 +197,13 @@ page_table_range_init(unsigned long star - pgd = pgd_base + pgd_idx; - - for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { -- pmd = one_md_table_init(pgd); -- pmd = pmd + pmd_index(vaddr); -+ pud = pud_offset(pgd, vaddr); -+ pmd = pmd_offset(pud, vaddr); -+ -+#ifdef CONFIG_X86_PAE -+ paravirt_alloc_pmd(&init_mm, __pa(pmd) >> PAGE_SHIFT); -+#endif -+ - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); - pmd++, pmd_idx++) { - pte = page_table_kmap_check(one_page_table_init(pmd), -@@ -223,11 +215,23 @@ page_table_range_init(unsigned long star - } - } - --static inline int is_kernel_text(unsigned long addr) -+static inline int is_kernel_text(unsigned long start, unsigned long end) - { -- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) -- return 1; -- return 0; -+ unsigned long etext; -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ etext = ktva_ktla((unsigned long)&MODULES_EXEC_END); -+#else -+ etext = (unsigned long)&_etext; -+#endif -+ -+ if ((start > ktla_ktva(etext) || -+ end <= ktla_ktva((unsigned long)_stext)) && -+ (start > ktla_ktva((unsigned long)_einittext) || -+ end <= ktla_ktva((unsigned long)_sinittext)) && -+ (start > (unsigned long)__va(0xfffff) || end <= (unsigned long)__va(0xc0000))) -+ return 0; -+ return 1; - } - - /* -@@ -243,9 +247,10 @@ kernel_physical_mapping_init(unsigned lo - int use_pse = page_size_mask == (1<<PG_LEVEL_2M); - unsigned long start_pfn, end_pfn; - pgd_t *pgd_base = swapper_pg_dir; -- int pgd_idx, pmd_idx, pte_ofs; -+ unsigned int pgd_idx, pmd_idx, pte_ofs; - unsigned long pfn; - pgd_t *pgd; -+ pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned pages_2m, pages_4k; -@@ -278,8 +283,13 @@ repeat: - pfn = start_pfn; - pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); - pgd = pgd_base + pgd_idx; -- for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { -- pmd 
= one_md_table_init(pgd); -+ for (; pgd_idx < PTRS_PER_PGD && pfn < max_low_pfn; pgd++, pgd_idx++) { -+ pud = pud_offset(pgd, 0); -+ pmd = pmd_offset(pud, 0); -+ -+#ifdef CONFIG_X86_PAE -+ paravirt_alloc_pmd(&init_mm, __pa(pmd) >> PAGE_SHIFT); -+#endif - - if (pfn >= end_pfn) - continue; -@@ -291,14 +301,13 @@ repeat: - #endif - for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn; - pmd++, pmd_idx++) { -- unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; -+ unsigned long address = pfn * PAGE_SIZE + PAGE_OFFSET; - - /* - * Map with big pages if possible, otherwise - * create normal page tables: - */ - if (use_pse) { -- unsigned int addr2; - pgprot_t prot = PAGE_KERNEL_LARGE; - /* - * first pass will use the same initial -@@ -308,11 +317,7 @@ repeat: - __pgprot(PTE_IDENT_ATTR | - _PAGE_PSE); - -- addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + -- PAGE_OFFSET + PAGE_SIZE-1; -- -- if (is_kernel_text(addr) || -- is_kernel_text(addr2)) -+ if (is_kernel_text(address, address + PMD_SIZE)) - prot = PAGE_KERNEL_LARGE_EXEC; - - pages_2m++; -@@ -329,7 +334,7 @@ repeat: - pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); - pte += pte_ofs; - for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; -- pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { -+ pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { - pgprot_t prot = PAGE_KERNEL; - /* - * first pass will use the same initial -@@ -337,7 +342,7 @@ repeat: - */ - pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); - -- if (is_kernel_text(addr)) -+ if (is_kernel_text(address, address + PAGE_SIZE)) - prot = PAGE_KERNEL_EXEC; - - pages_4k++; -@@ -489,7 +494,7 @@ void __init native_pagetable_setup_start - - pud = pud_offset(pgd, va); - pmd = pmd_offset(pud, va); -- if (!pmd_present(*pmd)) -+ if (!pmd_present(*pmd) || pmd_huge(*pmd)) - break; - - pte = pte_offset_kernel(pmd, va); -@@ -541,9 +546,7 @@ void __init early_ioremap_page_table_ran - - static void __init pagetable_init(void) - { -- pgd_t *pgd_base = swapper_pg_dir; -- -- permanent_kmaps_init(pgd_base); -+ permanent_kmaps_init(swapper_pg_dir); - } - - #ifdef CONFIG_ACPI_SLEEP -@@ -551,12 +554,12 @@ static void __init pagetable_init(void) - * ACPI suspend needs this for resume, because things like the intel-agp - * driver might have split up a kernel 4MB mapping. 
- */ --char swsusp_pg_dir[PAGE_SIZE] -+pgd_t swsusp_pg_dir[PTRS_PER_PGD] - __attribute__ ((aligned(PAGE_SIZE))); - - static inline void save_pg_dir(void) - { -- memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); -+ clone_pgd_range(swsusp_pg_dir, swapper_pg_dir, PTRS_PER_PGD); - } - #else /* !CONFIG_ACPI_SLEEP */ - static inline void save_pg_dir(void) -@@ -588,7 +591,7 @@ void zap_low_mappings(bool early) - flush_tlb_all(); - } - --pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); -+pteval_t __supported_pte_mask __read_only = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); - EXPORT_SYMBOL_GPL(__supported_pte_mask); - - /* user-defined highmem size */ -@@ -883,7 +886,7 @@ void __init mem_init(void) - set_highmem_pages_init(); - - codesize = (unsigned long) &_etext - (unsigned long) &_text; -- datasize = (unsigned long) &_edata - (unsigned long) &_etext; -+ datasize = (unsigned long) &_edata - (unsigned long) &_sdata; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); -@@ -929,10 +932,10 @@ void __init mem_init(void) - ((unsigned long)&__init_end - - (unsigned long)&__init_begin) >> 10, - -- (unsigned long)&_etext, (unsigned long)&_edata, -- ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, -+ (unsigned long)&_sdata, (unsigned long)&_edata, -+ ((unsigned long)&_edata - (unsigned long)&_sdata) >> 10, - -- (unsigned long)&_text, (unsigned long)&_etext, -+ ktla_ktva((unsigned long)&_text), ktla_ktva((unsigned long)&_etext), - ((unsigned long)&_etext - (unsigned long)&_text) >> 10); - - /* -diff -urNp linux-2.6.31.1/arch/x86/mm/init_64.c linux-2.6.31.1/arch/x86/mm/init_64.c ---- linux-2.6.31.1/arch/x86/mm/init_64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/init_64.c 2009-10-01 20:12:42.000000000 -0400 -@@ -159,12 +159,24 @@ void set_pte_vaddr_pud(pud_t *pud_page, - pmd_t *pmd; - pte_t *pte; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - pud = pud_page + pud_index(vaddr); - pmd = fill_pmd(pud, vaddr); - pte = fill_pte(pmd, vaddr); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - set_pte(pte, new_pte); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - /* - * It's enough to flush this one mapping. 
- * (PGE mappings get flushed as well) -@@ -222,14 +234,12 @@ static void __init __init_extra_mapping( - pgd = pgd_offset_k((unsigned long)__va(phys)); - if (pgd_none(*pgd)) { - pud = (pud_t *) spp_getpage(); -- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | -- _PAGE_USER)); -+ set_pgd(pgd, __pgd(__pa(pud) | _PAGE_TABLE)); - } - pud = pud_offset(pgd, (unsigned long)__va(phys)); - if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); -- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | -- _PAGE_USER)); -+ set_pud(pud, __pud(__pa(pmd) | _PAGE_TABLE)); - } - pmd = pmd_offset(pud, phys); - BUG_ON(!pmd_none(*pmd)); -@@ -848,8 +858,8 @@ int kern_addr_valid(unsigned long addr) - static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), -- .vm_page_prot = PAGE_READONLY_EXEC, -- .vm_flags = VM_READ | VM_EXEC -+ .vm_page_prot = PAGE_READONLY, -+ .vm_flags = VM_READ - }; - - struct vm_area_struct *get_gate_vma(struct task_struct *tsk) -@@ -883,7 +893,7 @@ int in_gate_area_no_task(unsigned long a - - const char *arch_vma_name(struct vm_area_struct *vma) - { -- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) -+ if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso) - return "[vdso]"; - if (vma == &gate_vma) - return "[vsyscall]"; -diff -urNp linux-2.6.31.1/arch/x86/mm/init.c linux-2.6.31.1/arch/x86/mm/init.c ---- linux-2.6.31.1/arch/x86/mm/init.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/init.c 2009-10-01 20:12:42.000000000 -0400 -@@ -28,11 +28,10 @@ int direct_gbpages - #endif - ; - -+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE) - int nx_enabled; - --#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) --static int disable_nx __cpuinitdata; -- -+#ifndef CONFIG_PAX_PAGEEXEC - /* - * noexec = on|off - * -@@ -46,32 +45,26 @@ static int __init noexec_setup(char *str - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { -- __supported_pte_mask |= _PAGE_NX; -- disable_nx = 0; -+ nx_enabled = 1; - } else if (!strncmp(str, "off", 3)) { -- disable_nx = 1; -- __supported_pte_mask &= ~_PAGE_NX; -+ nx_enabled = 0; - } - return 0; - } - early_param("noexec", noexec_setup); - #endif -+#endif - - #ifdef CONFIG_X86_PAE - static void __init set_nx(void) - { -- unsigned int v[4], l, h; -- -- if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { -- cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); -+ if (!nx_enabled && cpu_has_nx) { -+ unsigned l, h; - -- if ((v[3] & (1 << 20)) && !disable_nx) { -- rdmsr(MSR_EFER, l, h); -- l |= EFER_NX; -- wrmsr(MSR_EFER, l, h); -- nx_enabled = 1; -- __supported_pte_mask |= _PAGE_NX; -- } -+ __supported_pte_mask &= ~_PAGE_NX; -+ rdmsr(MSR_EFER, l, h); -+ l &= ~EFER_NX; -+ wrmsr(MSR_EFER, l, h); - } - } - #else -@@ -86,7 +79,7 @@ void __cpuinit check_efer(void) - unsigned long efer; - - rdmsrl(MSR_EFER, efer); -- if (!(efer & EFER_NX) || disable_nx) -+ if (!(efer & EFER_NX) || !nx_enabled) - __supported_pte_mask &= ~_PAGE_NX; - } - #endif -@@ -394,7 +387,13 @@ unsigned long __init_refok init_memory_m - */ - int devmem_is_allowed(unsigned long pagenr) - { -- if (pagenr <= 256) -+ if (!pagenr) -+ return 1; -+#ifdef CONFIG_VM86 -+ if (pagenr < (ISA_START_ADDRESS >> PAGE_SHIFT)) -+ return 1; -+#endif -+ if ((ISA_START_ADDRESS >> PAGE_SHIFT) <= pagenr && pagenr < (ISA_END_ADDRESS >> PAGE_SHIFT)) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; -@@ -442,6 +441,76 @@ void free_init_pages(char *what, unsigne - - void 
free_initmem(void) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pgd_t *pgd; -+ pud_t *pud; -+ pmd_t *pmd; -+ -+#ifdef CONFIG_X86_32 -+ /* PaX: limit KERNEL_CS to actual size */ -+ unsigned long addr, limit; -+ struct desc_struct d; -+ int cpu; -+ -+#ifdef CONFIG_MODULES -+ limit = ktva_ktla((unsigned long)&MODULES_EXEC_END); -+#else -+ limit = (unsigned long)&_etext; -+#endif -+ limit = (limit - 1UL) >> PAGE_SHIFT; -+ -+ memset(__LOAD_PHYSICAL_ADDR + PAGE_OFFSET, POISON_FREE_INITMEM, PAGE_SIZE); -+ for (cpu = 0; cpu < NR_CPUS; cpu++) { -+ pack_descriptor(&d, get_desc_base(&get_cpu_gdt_table(cpu)[GDT_ENTRY_KERNEL_CS]), limit, 0x9B, 0xC); -+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_KERNEL_CS, &d, DESCTYPE_S); -+ } -+ -+ /* PaX: make KERNEL_CS read-only */ -+ for (addr = ktla_ktva((unsigned long)&_text); addr < (unsigned long)&_sdata; addr += PMD_SIZE) { -+ pgd = pgd_offset_k(addr); -+ pud = pud_offset(pgd, addr); -+ pmd = pmd_offset(pud, addr); -+ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_RW)); -+ } -+#ifdef CONFIG_X86_PAE -+ for (addr = (unsigned long)&__init_begin; addr < (unsigned long)&__init_end; addr += PMD_SIZE) { -+ pgd = pgd_offset_k(addr); -+ pud = pud_offset(pgd, addr); -+ pmd = pmd_offset(pud, addr); -+ set_pmd(pmd, __pmd(pmd_val(*pmd) | (_PAGE_NX & __supported_pte_mask))); -+ } -+#endif -+#else -+ unsigned long addr, end; -+ -+ /* PaX: make kernel code/rodata read-only, rest non-executable */ -+ for (addr = __START_KERNEL_map; addr < __START_KERNEL_map + KERNEL_IMAGE_SIZE; addr += PMD_SIZE) { -+ pgd = pgd_offset_k(addr); -+ pud = pud_offset(pgd, addr); -+ pmd = pmd_offset(pud, addr); -+ if ((unsigned long)_text <= addr && addr < (unsigned long)_sdata) -+ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_RW)); -+ else -+ set_pmd(pmd, __pmd(pmd_val(*pmd) | (_PAGE_NX & __supported_pte_mask))); -+ } -+ -+ addr = (unsigned long)__va(__pa(__START_KERNEL_map)); -+ end = addr + KERNEL_IMAGE_SIZE; -+ for (; addr < end; addr += PMD_SIZE) { -+ pgd = pgd_offset_k(addr); -+ pud = pud_offset(pgd, addr); -+ pmd = pmd_offset(pud, addr); -+ if ((unsigned long)__va(__pa(_text)) <= addr && addr < (unsigned long)__va(__pa(_sdata))) -+ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_RW)); -+ else -+ set_pmd(pmd, __pmd(pmd_val(*pmd) | (_PAGE_NX & __supported_pte_mask))); -+ } -+#endif -+ -+ flush_tlb_all(); -+#endif -+ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -diff -urNp linux-2.6.31.1/arch/x86/mm/iomap_32.c linux-2.6.31.1/arch/x86/mm/iomap_32.c ---- linux-2.6.31.1/arch/x86/mm/iomap_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/iomap_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -37,12 +37,26 @@ void *kmap_atomic_prot_pfn(unsigned long - enum fixed_addresses idx; - unsigned long vaddr; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - pagefault_disable(); - - debug_kmap_atomic(type); - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -diff -urNp linux-2.6.31.1/arch/x86/mm/ioremap.c linux-2.6.31.1/arch/x86/mm/ioremap.c ---- linux-2.6.31.1/arch/x86/mm/ioremap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/ioremap.c 2009-10-01 20:12:42.000000000 -0400 -@@ -111,8 +111,8 @@ int page_is_ram(unsigned long 
pagenr) - * Second special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. - */ -- if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && -- pagenr < (BIOS_END >> PAGE_SHIFT)) -+ if (pagenr >= (ISA_START_ADDRESS >> PAGE_SHIFT) && -+ pagenr < (ISA_END_ADDRESS >> PAGE_SHIFT)) - return 0; - - for (i = 0; i < e820.nr_map; i++) { -@@ -207,10 +207,7 @@ static void __iomem *__ioremap_caller(re - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ -- for (pfn = phys_addr >> PAGE_SHIFT; -- (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); -- pfn++) { -- -+ for (pfn = phys_addr >> PAGE_SHIFT; ((resource_size_t)pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); pfn++) { - int is_ram = page_is_ram(pfn); - - if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) -@@ -272,6 +269,8 @@ static void __iomem *__ioremap_caller(re - break; - } - -+ prot = canon_pgprot(prot); -+ - /* - * Ok, go for it.. - */ -@@ -489,7 +488,7 @@ static int __init early_ioremap_debug_se - early_param("early_ioremap_debug", early_ioremap_debug_setup); - - static __initdata int after_paging_init; --static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; -+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __read_only __aligned(PAGE_SIZE); - - static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) - { -@@ -521,8 +520,7 @@ void __init early_ioremap_init(void) - slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); - - pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); -- memset(bm_pte, 0, sizeof(bm_pte)); -- pmd_populate_kernel(&init_mm, pmd, bm_pte); -+ pmd_populate_user(&init_mm, pmd, bm_pte); - - /* - * The boot-ioremap range spans multiple pmds, for which -diff -urNp linux-2.6.31.1/arch/x86/mm/mmap.c linux-2.6.31.1/arch/x86/mm/mmap.c ---- linux-2.6.31.1/arch/x86/mm/mmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/mmap.c 2009-10-01 20:12:42.000000000 -0400 -@@ -36,7 +36,7 @@ - * Leave an at least ~128 MB hole. 
- */ - #define MIN_GAP (128*1024*1024) --#define MAX_GAP (TASK_SIZE/6*5) -+#define MAX_GAP (pax_task_size/6*5) - - /* - * True on X86_32 or when emulating IA32 on X86_64 -@@ -81,27 +81,40 @@ static unsigned long mmap_rnd(void) - return rnd << PAGE_SHIFT; - } - --static unsigned long mmap_base(void) -+static unsigned long mmap_base(struct mm_struct *mm) - { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; -+ unsigned long pax_task_size = TASK_SIZE; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - -- return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); -+ return PAGE_ALIGN(pax_task_size - gap - mmap_rnd()); - } - - /* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ --static unsigned long mmap_legacy_base(void) -+static unsigned long mmap_legacy_base(struct mm_struct *mm) - { -- if (mmap_is_ia32()) -+ if (mmap_is_ia32()) { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) -+ return SEGMEXEC_TASK_UNMAPPED_BASE; -+ else -+#endif -+ - return TASK_UNMAPPED_BASE; -- else -+ } else - return TASK_UNMAPPED_BASE + mmap_rnd(); - } - -@@ -112,11 +125,23 @@ static unsigned long mmap_legacy_base(vo - void arch_pick_mmap_layout(struct mm_struct *mm) - { - if (mmap_is_legacy()) { -- mm->mmap_base = mmap_legacy_base(); -+ mm->mmap_base = mmap_legacy_base(mm); -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } else { -- mm->mmap_base = mmap_base(); -+ mm->mmap_base = mmap_base(mm); -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base -= mm->delta_mmap + mm->delta_stack; -+#endif -+ - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; - } -diff -urNp linux-2.6.31.1/arch/x86/mm/numa_32.c linux-2.6.31.1/arch/x86/mm/numa_32.c ---- linux-2.6.31.1/arch/x86/mm/numa_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/numa_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -98,7 +98,6 @@ unsigned long node_memmap_size_bytes(int - } - #endif - --extern unsigned long find_max_low_pfn(void); - extern unsigned long highend_pfn, highstart_pfn; - - #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -diff -urNp linux-2.6.31.1/arch/x86/mm/pageattr.c linux-2.6.31.1/arch/x86/mm/pageattr.c ---- linux-2.6.31.1/arch/x86/mm/pageattr.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/pageattr.c 2009-10-01 20:12:42.000000000 -0400 -@@ -22,6 +22,7 @@ - #include <asm/pgalloc.h> - #include <asm/proto.h> - #include <asm/pat.h> -+#include <asm/desc.h> - - /* - * The current flushing context - we pass it instead of 5 arguments: -@@ -266,9 +267,10 @@ static inline pgprot_t static_protection - * Does not cover __inittext since that is gone later on. On - * 64bit we do not enforce !NX on the low mapping - */ -- if (within(address, (unsigned long)_text, (unsigned long)_etext)) -+ if (within(address, ktla_ktva((unsigned long)_text), ktla_ktva((unsigned long)_etext))) - pgprot_val(forbidden) |= _PAGE_NX; - -+#ifdef CONFIG_DEBUG_RODATA - /* - * The .rodata section needs to be read-only. Using the pfn - * catches all aliases. 
-@@ -276,6 +278,7 @@ static inline pgprot_t static_protection - if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, - __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) - pgprot_val(forbidden) |= _PAGE_RW; -+#endif - - prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); - -@@ -328,8 +331,20 @@ EXPORT_SYMBOL_GPL(lookup_address); - */ - static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) - { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+#endif -+ - /* change init_mm */ - set_pte_atomic(kpte, pte); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - #ifdef CONFIG_X86_32 - if (!SHARED_KERNEL_PMD) { - struct page *page; -diff -urNp linux-2.6.31.1/arch/x86/mm/pageattr-test.c linux-2.6.31.1/arch/x86/mm/pageattr-test.c ---- linux-2.6.31.1/arch/x86/mm/pageattr-test.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/pageattr-test.c 2009-10-01 20:12:42.000000000 -0400 -@@ -36,7 +36,7 @@ enum { - - static int pte_testbit(pte_t pte) - { -- return pte_flags(pte) & _PAGE_UNUSED1; -+ return pte_flags(pte) & _PAGE_CPA_TEST; - } - - struct split_state { -diff -urNp linux-2.6.31.1/arch/x86/mm/pat.c linux-2.6.31.1/arch/x86/mm/pat.c ---- linux-2.6.31.1/arch/x86/mm/pat.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/pat.c 2009-10-01 20:12:42.000000000 -0400 -@@ -213,7 +213,7 @@ chk_conflict(struct memtype *new, struct - - conflict: - printk(KERN_INFO "%s:%d conflicting memory types " -- "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, -+ "%Lx-%Lx %s<->%s\n", current->comm, task_pid_nr(current), new->start, - new->end, cattr_name(new->type), cattr_name(entry->type)); - return -EBUSY; - } -@@ -487,7 +487,7 @@ int free_memtype(u64 start, u64 end) - - if (err) { - printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", -- current->comm, current->pid, start, end); -+ current->comm, task_pid_nr(current), start, end); - } - - dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); -@@ -588,7 +588,7 @@ int kernel_map_sync_memtype(u64 base, un - printk(KERN_INFO - "%s:%d ioremap_change_attr failed %s " - "for %Lx-%Lx\n", -- current->comm, current->pid, -+ current->comm, task_pid_nr(current), - cattr_name(flags), - base, (unsigned long long)(base + size)); - return -EINVAL; -@@ -628,7 +628,7 @@ static int reserve_pfn_range(u64 paddr, - free_memtype(paddr, paddr + size); - printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for %Lx-%Lx, got %s\n", -- current->comm, current->pid, -+ current->comm, task_pid_nr(current), - cattr_name(want_flags), - (unsigned long long)paddr, - (unsigned long long)(paddr + size), -@@ -827,7 +827,7 @@ static int memtype_seq_show(struct seq_f - return 0; - } - --static struct seq_operations memtype_seq_ops = { -+static const struct seq_operations memtype_seq_ops = { - .start = memtype_seq_start, - .next = memtype_seq_next, - .stop = memtype_seq_stop, -diff -urNp linux-2.6.31.1/arch/x86/mm/pgtable_32.c linux-2.6.31.1/arch/x86/mm/pgtable_32.c ---- linux-2.6.31.1/arch/x86/mm/pgtable_32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/pgtable_32.c 2009-10-01 20:12:42.000000000 -0400 -@@ -33,6 +33,10 @@ void set_pte_vaddr(unsigned long vaddr, - pmd_t *pmd; - pte_t *pte; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); -@@ -49,11 +53,20 @@ void set_pte_vaddr(unsigned long vaddr, - return; - } - pte = 
pte_offset_kernel(pmd, vaddr); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - if (pte_val(pteval)) - set_pte_at(&init_mm, vaddr, pte, pteval); - else - pte_clear(&init_mm, vaddr, pte); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) -diff -urNp linux-2.6.31.1/arch/x86/mm/tlb.c linux-2.6.31.1/arch/x86/mm/tlb.c ---- linux-2.6.31.1/arch/x86/mm/tlb.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/mm/tlb.c 2009-10-01 20:12:42.000000000 -0400 -@@ -12,7 +12,7 @@ - #include <asm/uv/uv.h> - - DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) -- = { &init_mm, 0, }; -+ = { &init_mm, 0 }; - - /* - * Smarter SMP flushing macros. -diff -urNp linux-2.6.31.1/arch/x86/oprofile/backtrace.c linux-2.6.31.1/arch/x86/oprofile/backtrace.c ---- linux-2.6.31.1/arch/x86/oprofile/backtrace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/oprofile/backtrace.c 2009-10-01 20:12:42.000000000 -0400 -@@ -37,7 +37,7 @@ static void backtrace_address(void *data - unsigned int *depth = data; - - if ((*depth)--) -- oprofile_add_trace(addr); -+ oprofile_add_trace(ktla_ktva(addr)); - } - - static struct stacktrace_ops backtrace_ops = { -@@ -77,7 +77,7 @@ x86_backtrace(struct pt_regs * const reg - { - struct frame_head *head = (struct frame_head *)frame_pointer(regs); - -- if (!user_mode_vm(regs)) { -+ if (!user_mode(regs)) { - unsigned long stack = kernel_stack_pointer(regs); - if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, -diff -urNp linux-2.6.31.1/arch/x86/oprofile/op_model_p4.c linux-2.6.31.1/arch/x86/oprofile/op_model_p4.c ---- linux-2.6.31.1/arch/x86/oprofile/op_model_p4.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/oprofile/op_model_p4.c 2009-10-01 20:12:42.000000000 -0400 -@@ -48,7 +48,7 @@ static inline void setup_num_counters(vo - #endif - } - --static int inline addr_increment(void) -+static inline int addr_increment(void) - { - #ifdef CONFIG_SMP - return smp_num_siblings == 2 ? 
2 : 1; -diff -urNp linux-2.6.31.1/arch/x86/pci/common.c linux-2.6.31.1/arch/x86/pci/common.c ---- linux-2.6.31.1/arch/x86/pci/common.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/pci/common.c 2009-10-01 20:12:42.000000000 -0400 -@@ -370,7 +370,7 @@ static const struct dmi_system_id __devi - DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), - }, - }, -- {} -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL} - }; - - void __init dmi_check_pciprobe(void) -diff -urNp linux-2.6.31.1/arch/x86/pci/fixup.c linux-2.6.31.1/arch/x86/pci/fixup.c ---- linux-2.6.31.1/arch/x86/pci/fixup.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/pci/fixup.c 2009-10-01 20:12:42.000000000 -0400 -@@ -364,7 +364,7 @@ static const struct dmi_system_id __devi - DMI_MATCH(DMI_PRODUCT_NAME, "MS-6702E"), - }, - }, -- {} -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - - /* -@@ -435,7 +435,7 @@ static const struct dmi_system_id __devi - DMI_MATCH(DMI_PRODUCT_VERSION, "PSA40U"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - - static void __devinit pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev) -diff -urNp linux-2.6.31.1/arch/x86/pci/i386.c linux-2.6.31.1/arch/x86/pci/i386.c ---- linux-2.6.31.1/arch/x86/pci/i386.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/pci/i386.c 2009-10-01 20:12:42.000000000 -0400 -@@ -266,7 +266,7 @@ void pcibios_set_master(struct pci_dev * - pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); - } - --static struct vm_operations_struct pci_mmap_ops = { -+static const struct vm_operations_struct pci_mmap_ops = { - .access = generic_access_phys, - }; - -diff -urNp linux-2.6.31.1/arch/x86/pci/irq.c linux-2.6.31.1/arch/x86/pci/irq.c ---- linux-2.6.31.1/arch/x86/pci/irq.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/pci/irq.c 2009-10-01 20:12:42.000000000 -0400 -@@ -543,7 +543,7 @@ static __init int intel_router_probe(str - static struct pci_device_id __initdata pirq_440gx[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) }, -- { }, -+ { PCI_DEVICE(0, 0) } - }; - - /* 440GX has a proprietary PIRQ router -- don't use it */ -@@ -1107,7 +1107,7 @@ static struct dmi_system_id __initdata p - DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - - int __init pcibios_irq_init(void) -diff -urNp linux-2.6.31.1/arch/x86/pci/pcbios.c linux-2.6.31.1/arch/x86/pci/pcbios.c ---- linux-2.6.31.1/arch/x86/pci/pcbios.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/pci/pcbios.c 2009-10-01 20:12:42.000000000 -0400 -@@ -56,50 +56,120 @@ union bios32 { - static struct { - unsigned long address; - unsigned short segment; --} bios32_indirect = { 0, __KERNEL_CS }; -+} bios32_indirect __read_only = { 0, __PCIBIOS_CS }; - - /* - * Returns the entry point for the given service, NULL on error - */ - --static unsigned long bios32_service(unsigned long service) -+static unsigned long __devinit bios32_service(unsigned long service) - { - unsigned char return_code; /* %al */ - unsigned long address; /* %ebx */ - unsigned long length; /* %ecx */ - unsigned long entry; /* %edx */ - unsigned long flags; -+ struct desc_struct d, *gdt; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif - - local_irq_save(flags); -- __asm__("lcall *(%%edi); cld" -+ -+ gdt = get_cpu_gdt_table(smp_processor_id()); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ 
pax_open_kernel(cr0); -+#endif -+ -+ pack_descriptor(&d, 0UL, 0xFFFFFUL, 0x9B, 0xC); -+ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_CS, &d, DESCTYPE_S); -+ pack_descriptor(&d, 0UL, 0xFFFFFUL, 0x93, 0xC); -+ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_DS, &d, DESCTYPE_S); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ __asm__("movw %w7, %%ds; lcall *(%%edi); push %%ss; pop %%ds; cld" - : "=a" (return_code), - "=b" (address), - "=c" (length), - "=d" (entry) - : "0" (service), - "1" (0), -- "D" (&bios32_indirect)); -+ "D" (&bios32_indirect), -+ "r"(__PCIBIOS_DS) -+ : "memory"); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ gdt[GDT_ENTRY_PCIBIOS_CS].a = 0; -+ gdt[GDT_ENTRY_PCIBIOS_CS].b = 0; -+ gdt[GDT_ENTRY_PCIBIOS_DS].a = 0; -+ gdt[GDT_ENTRY_PCIBIOS_DS].b = 0; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - local_irq_restore(flags); - - switch (return_code) { -- case 0: -- return address + entry; -- case 0x80: /* Not present */ -- printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service); -- return 0; -- default: /* Shouldn't happen */ -- printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n", -- service, return_code); -+ case 0: { -+ int cpu; -+ unsigned char flags; -+ -+ printk(KERN_INFO "bios32_service: base:%08lx length:%08lx entry:%08lx\n", address, length, entry); -+ if (address >= 0xFFFF0 || length > 0x100000 - address || length <= entry) { -+ printk(KERN_WARNING "bios32_service: not valid\n"); - return 0; -+ } -+ address = address + PAGE_OFFSET; -+ length += 16UL; /* some BIOSs underreport this... */ -+ flags = 4; -+ if (length >= 64*1024*1024) { -+ length >>= PAGE_SHIFT; -+ flags |= 8; -+ } -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ for (cpu = 0; cpu < NR_CPUS; cpu++) { -+ gdt = get_cpu_gdt_table(cpu); -+ pack_descriptor(&d, address, length, 0x9b, flags); -+ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_CS, &d, DESCTYPE_S); -+ pack_descriptor(&d, address, length, 0x93, flags); -+ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_DS, &d, DESCTYPE_S); -+ } -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ return entry; -+ } -+ case 0x80: /* Not present */ -+ printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service); -+ return 0; -+ default: /* Shouldn't happen */ -+ printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n", -+ service, return_code); -+ return 0; - } - } - - static struct { - unsigned long address; - unsigned short segment; --} pci_indirect = { 0, __KERNEL_CS }; -+} pci_indirect __read_only = { 0, __PCIBIOS_CS }; - --static int pci_bios_present; -+static int pci_bios_present __read_only; - - static int __devinit check_pcibios(void) - { -@@ -108,11 +178,13 @@ static int __devinit check_pcibios(void) - unsigned long flags, pcibios_entry; - - if ((pcibios_entry = bios32_service(PCI_SERVICE))) { -- pci_indirect.address = pcibios_entry + PAGE_OFFSET; -+ pci_indirect.address = pcibios_entry; - - local_irq_save(flags); -- __asm__( -- "lcall *(%%edi); cld\n\t" -+ __asm__("movw %w6, %%ds\n\t" -+ "lcall *%%ss:(%%edi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -121,7 +193,8 @@ static int __devinit check_pcibios(void) - "=b" (ebx), - "=c" (ecx) - : "1" (PCIBIOS_PCI_BIOS_PRESENT), -- "D" (&pci_indirect) -+ "D" (&pci_indirect), -+ "r" (__PCIBIOS_DS) - : "memory"); - local_irq_restore(flags); - -@@ -165,7 +238,10 @@ static int pci_bios_read(unsigned int se - - switch (len) { - 
case 1: -- __asm__("lcall *(%%esi); cld\n\t" -+ __asm__("movw %w6, %%ds\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -174,7 +250,8 @@ static int pci_bios_read(unsigned int se - : "1" (PCIBIOS_READ_CONFIG_BYTE), - "b" (bx), - "D" ((long)reg), -- "S" (&pci_indirect)); -+ "S" (&pci_indirect), -+ "r" (__PCIBIOS_DS)); - /* - * Zero-extend the result beyond 8 bits, do not trust the - * BIOS having done it: -@@ -182,7 +259,10 @@ static int pci_bios_read(unsigned int se - *value &= 0xff; - break; - case 2: -- __asm__("lcall *(%%esi); cld\n\t" -+ __asm__("movw %w6, %%ds\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -191,7 +271,8 @@ static int pci_bios_read(unsigned int se - : "1" (PCIBIOS_READ_CONFIG_WORD), - "b" (bx), - "D" ((long)reg), -- "S" (&pci_indirect)); -+ "S" (&pci_indirect), -+ "r" (__PCIBIOS_DS)); - /* - * Zero-extend the result beyond 16 bits, do not trust the - * BIOS having done it: -@@ -199,7 +280,10 @@ static int pci_bios_read(unsigned int se - *value &= 0xffff; - break; - case 4: -- __asm__("lcall *(%%esi); cld\n\t" -+ __asm__("movw %w6, %%ds\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -208,7 +292,8 @@ static int pci_bios_read(unsigned int se - : "1" (PCIBIOS_READ_CONFIG_DWORD), - "b" (bx), - "D" ((long)reg), -- "S" (&pci_indirect)); -+ "S" (&pci_indirect), -+ "r" (__PCIBIOS_DS)); - break; - } - -@@ -231,7 +316,10 @@ static int pci_bios_write(unsigned int s - - switch (len) { - case 1: -- __asm__("lcall *(%%esi); cld\n\t" -+ __asm__("movw %w6, %%ds\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -240,10 +328,14 @@ static int pci_bios_write(unsigned int s - "c" (value), - "b" (bx), - "D" ((long)reg), -- "S" (&pci_indirect)); -+ "S" (&pci_indirect), -+ "r" (__PCIBIOS_DS)); - break; - case 2: -- __asm__("lcall *(%%esi); cld\n\t" -+ __asm__("movw %w6, %%ds\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -252,10 +344,14 @@ static int pci_bios_write(unsigned int s - "c" (value), - "b" (bx), - "D" ((long)reg), -- "S" (&pci_indirect)); -+ "S" (&pci_indirect), -+ "r" (__PCIBIOS_DS)); - break; - case 4: -- __asm__("lcall *(%%esi); cld\n\t" -+ __asm__("movw %w6, %%ds\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -264,7 +360,8 @@ static int pci_bios_write(unsigned int s - "c" (value), - "b" (bx), - "D" ((long)reg), -- "S" (&pci_indirect)); -+ "S" (&pci_indirect), -+ "r" (__PCIBIOS_DS)); - break; - } - -@@ -368,10 +465,13 @@ struct irq_routing_table * pcibios_get_i - - DBG("PCI: Fetching IRQ routing table... 
"); - __asm__("push %%es\n\t" -+ "movw %w8, %%ds\n\t" - "push %%ds\n\t" - "pop %%es\n\t" -- "lcall *(%%esi); cld\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" - "pop %%es\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -382,7 +482,8 @@ struct irq_routing_table * pcibios_get_i - "1" (0), - "D" ((long) &opt), - "S" (&pci_indirect), -- "m" (opt) -+ "m" (opt), -+ "r" (__PCIBIOS_DS) - : "memory"); - DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map); - if (ret & 0xff00) -@@ -406,7 +507,10 @@ int pcibios_set_irq_routing(struct pci_d - { - int ret; - -- __asm__("lcall *(%%esi); cld\n\t" -+ __asm__("movw %w5, %%ds\n\t" -+ "lcall *%%ss:(%%esi); cld\n\t" -+ "push %%ss\n\t" -+ "pop %%ds\n" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" -@@ -414,7 +518,8 @@ int pcibios_set_irq_routing(struct pci_d - : "0" (PCIBIOS_SET_PCI_HW_INT), - "b" ((dev->bus->number << 8) | dev->devfn), - "c" ((irq << 8) | (pin + 10)), -- "S" (&pci_indirect)); -+ "S" (&pci_indirect), -+ "r" (__PCIBIOS_DS)); - return !(ret & 0xff00); - } - EXPORT_SYMBOL(pcibios_set_irq_routing); -diff -urNp linux-2.6.31.1/arch/x86/power/cpu.c linux-2.6.31.1/arch/x86/power/cpu.c ---- linux-2.6.31.1/arch/x86/power/cpu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/power/cpu.c 2009-10-01 20:12:42.000000000 -0400 -@@ -126,7 +126,11 @@ static void do_fpu_end(void) - static void fix_processor_context(void) - { - int cpu = smp_processor_id(); -- struct tss_struct *t = &per_cpu(init_tss, cpu); -+ struct tss_struct *t = init_tss + cpu; -+ -+#if defined(CONFIG_X86_64) && defined(CONFIG_PAX_KERNEXEC) -+ unsigned long cr0; -+#endif - - set_tss_desc(cpu, t); /* - * This just modifies memory; should not be -@@ -136,8 +140,17 @@ static void fix_processor_context(void) - */ - - #ifdef CONFIG_X86_64 -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - syscall_init(); /* This sets MSR_*STAR and related */ - #endif - load_TR_desc(); /* This does ltr */ -diff -urNp linux-2.6.31.1/arch/x86/vdso/Makefile linux-2.6.31.1/arch/x86/vdso/Makefile ---- linux-2.6.31.1/arch/x86/vdso/Makefile 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/vdso/Makefile 2009-10-01 20:12:42.000000000 -0400 -@@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@ - $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) - --VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) -+VDSO_LDFLAGS = -fPIC -shared --no-undefined $(call ld-option, -Wl$(comma)--hash-style=sysv) - GCOV_PROFILE := n - - # -diff -urNp linux-2.6.31.1/arch/x86/vdso/vclock_gettime.c linux-2.6.31.1/arch/x86/vdso/vclock_gettime.c ---- linux-2.6.31.1/arch/x86/vdso/vclock_gettime.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/vdso/vclock_gettime.c 2009-10-01 20:12:42.000000000 -0400 -@@ -22,24 +22,48 @@ - #include <asm/hpet.h> - #include <asm/unistd.h> - #include <asm/io.h> -+#include <asm/fixmap.h> - #include "vextern.h" - - #define gtod vdso_vsyscall_gtod_data - -+notrace noinline long __vdso_fallback_time(long *t) -+{ -+ long secs; -+ asm volatile("syscall" -+ : "=a" (secs) -+ : "0" (__NR_time),"D" (t) : "r11", "cx", "memory"); -+ return secs; -+} -+ - notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) - { - long ret; - asm("syscall" : "=a" (ret) : -- "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); -+ "0" 
(__NR_clock_gettime),"D" (clock), "S" (ts) : "r11", "cx", "memory"); - return ret; - } - -+notrace static inline cycle_t __vdso_vread_hpet(void) -+{ -+ return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); -+} -+ -+notrace static inline cycle_t __vdso_vread_tsc(void) -+{ -+ cycle_t ret = (cycle_t)vget_cycles(); -+ -+ return ret >= gtod->clock.cycle_last ? ret : gtod->clock.cycle_last; -+} -+ - notrace static inline long vgetns(void) - { - long v; -- cycles_t (*vread)(void); -- vread = gtod->clock.vread; -- v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; -+ if (gtod->clock.name[0] == 't' && gtod->clock.name[1] == 's' && gtod->clock.name[2] == 'c' && !gtod->clock.name[3]) -+ v = __vdso_vread_tsc(); -+ else -+ v = __vdso_vread_hpet(); -+ v = (v - gtod->clock.cycle_last) & gtod->clock.mask; - return (v * gtod->clock.mult) >> gtod->clock.shift; - } - -@@ -88,7 +112,9 @@ notrace static noinline int do_monotonic - - notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) - { -- if (likely(gtod->sysctl_enabled && gtod->clock.vread)) -+ if (likely(gtod->sysctl_enabled && -+ ((gtod->clock.name[0] == 'h' && gtod->clock.name[1] == 'p' && gtod->clock.name[2] == 'e' && gtod->clock.name[3] == 't' && !gtod->clock.name[4]) || -+ (gtod->clock.name[0] == 't' && gtod->clock.name[1] == 's' && gtod->clock.name[2] == 'c' && !gtod->clock.name[3])))) - switch (clock) { - case CLOCK_REALTIME: - return do_realtime(ts); -@@ -100,10 +126,20 @@ notrace int __vdso_clock_gettime(clockid - int clock_gettime(clockid_t, struct timespec *) - __attribute__((weak, alias("__vdso_clock_gettime"))); - --notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) -+notrace noinline int __vdso_fallback_gettimeofday(struct timeval *tv, struct timezone *tz) - { - long ret; -- if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { -+ asm("syscall" : "=a" (ret) : -+ "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "r11", "cx", "memory"); -+ return ret; -+} -+ -+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) -+{ -+ if (likely(gtod->sysctl_enabled && -+ ((gtod->clock.name[0] == 'h' && gtod->clock.name[1] == 'p' && gtod->clock.name[2] == 'e' && gtod->clock.name[3] == 't' && !gtod->clock.name[4]) || -+ (gtod->clock.name[0] == 't' && gtod->clock.name[1] == 's' && gtod->clock.name[2] == 'c' && !gtod->clock.name[3])))) -+ { - if (likely(tv != NULL)) { - BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != - offsetof(struct timespec, tv_nsec) || -@@ -118,9 +154,7 @@ notrace int __vdso_gettimeofday(struct t - } - return 0; - } -- asm("syscall" : "=a" (ret) : -- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); -- return ret; -+ return __vdso_fallback_gettimeofday(tv, tz); - } - int gettimeofday(struct timeval *, struct timezone *) - __attribute__((weak, alias("__vdso_gettimeofday"))); -diff -urNp linux-2.6.31.1/arch/x86/vdso/vdso32-setup.c linux-2.6.31.1/arch/x86/vdso/vdso32-setup.c ---- linux-2.6.31.1/arch/x86/vdso/vdso32-setup.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/vdso/vdso32-setup.c 2009-10-01 20:12:42.000000000 -0400 -@@ -25,6 +25,7 @@ - #include <asm/tlbflush.h> - #include <asm/vdso.h> - #include <asm/proto.h> -+#include <asm/mman.h> - - enum { - VDSO_DISABLED = 0, -@@ -226,7 +227,7 @@ static inline void map_compat_vdso(int m - void enable_sep_cpu(void) - { - int cpu = get_cpu(); -- struct tss_struct *tss = &per_cpu(init_tss, cpu); -+ struct tss_struct *tss = init_tss + cpu; - - if (!boot_cpu_has(X86_FEATURE_SEP)) { - 
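/*
 * In vclock_gettime.c above, vgetns() and the gettime entry points no
 * longer call through gtod->clock.vread: that field points into kernel
 * text, which the user-mode vDSO cannot keep calling once KERNEXEC
 * tightens the user/kernel split, so the clocksource is identified by
 * name and the TSC/HPET reads are inlined instead. The open-coded
 * character tests stand in for strcmp(), which is equally out of reach
 * inside the vDSO; as a plain helper (sketch_* name illustrative; the
 * enable_sep_cpu() hunk resumes below):
 */
static inline int sketch_name_is(const char *name, const char *want)
{
	/* equality test up to and including the NUL, no library calls */
	while (*name == *want && *want)
		name++, want++;
	return *name == *want;
}

/* e.g. sketch_name_is(gtod->clock.name, "tsc") replaces the pointer call */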
put_cpu(); -@@ -249,7 +250,7 @@ static int __init gate_vma_init(void) - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; -- gate_vma.vm_page_prot = __P101; -+ gate_vma.vm_page_prot = vm_get_page_prot(gate_vma.vm_flags); - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later -@@ -331,14 +332,14 @@ int arch_setup_additional_pages(struct l - if (compat) - addr = VDSO_HIGH_BASE; - else { -- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); -+ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, MAP_EXECUTABLE); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - } - -- current->mm->context.vdso = (void *)addr; -+ current->mm->context.vdso = addr; - - if (compat_uses_vma || !compat) { - /* -@@ -365,7 +366,7 @@ int arch_setup_additional_pages(struct l - - up_fail: - if (ret) -- current->mm->context.vdso = NULL; -+ current->mm->context.vdso = 0; - - up_write(&mm->mmap_sem); - -@@ -388,7 +389,7 @@ static ctl_table abi_table2[] = { - .mode = 0644, - .proc_handler = proc_dointvec - }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static ctl_table abi_root_table2[] = { -@@ -398,7 +399,7 @@ static ctl_table abi_root_table2[] = { - .mode = 0555, - .child = abi_table2 - }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static __init int ia32_binfmt_init(void) -@@ -413,8 +414,14 @@ __initcall(ia32_binfmt_init); - - const char *arch_vma_name(struct vm_area_struct *vma) - { -- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) -+ if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso) - return "[vdso]"; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma->vm_mm && vma->vm_mirror && vma->vm_mirror->vm_start == vma->vm_mm->context.vdso) -+ return "[vdso]"; -+#endif -+ - return NULL; - } - -@@ -423,7 +430,7 @@ struct vm_area_struct *get_gate_vma(stru - struct mm_struct *mm = tsk->mm; - - /* Check to see if this task was created in compat vdso mode */ -- if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) -+ if (mm && mm->context.vdso == VDSO_HIGH_BASE) - return &gate_vma; - return NULL; - } -diff -urNp linux-2.6.31.1/arch/x86/vdso/vdso.lds.S linux-2.6.31.1/arch/x86/vdso/vdso.lds.S ---- linux-2.6.31.1/arch/x86/vdso/vdso.lds.S 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/vdso/vdso.lds.S 2009-10-01 20:12:42.000000000 -0400 -@@ -35,3 +35,9 @@ VDSO64_PRELINK = VDSO_PRELINK; - #define VEXTERN(x) VDSO64_ ## x = vdso_ ## x; - #include "vextern.h" - #undef VEXTERN -+ -+#define VEXTERN(x) VDSO64_ ## x = __vdso_ ## x; -+VEXTERN(fallback_gettimeofday) -+VEXTERN(fallback_time) -+VEXTERN(getcpu) -+#undef VEXTERN -diff -urNp linux-2.6.31.1/arch/x86/vdso/vextern.h linux-2.6.31.1/arch/x86/vdso/vextern.h ---- linux-2.6.31.1/arch/x86/vdso/vextern.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/vdso/vextern.h 2009-10-01 20:12:42.000000000 -0400 -@@ -11,6 +11,5 @@ - put into vextern.h and be referenced as a pointer with vdso prefix. - The main kernel later fills in the values. 
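/*
 * Also in vclock_gettime.c, every syscall fallback now lists "r11" and
 * "cx" as clobbers: the x86-64 syscall instruction destroys %rcx (the
 * return address) and %r11 (the saved flags), so omitting them lets
 * the compiler keep live values in registers the kernel will trash. A
 * standalone, correctly annotated wrapper (sketch_* name illustrative;
 * the vextern.h hunk closes just below):
 */
static long sketch_vdso_syscall2(long nr, long arg1, long arg2)
{
	long ret;

	asm volatile("syscall"
		     : "=a" (ret)
		     : "0" (nr), "D" (arg1), "S" (arg2)
		     : "rcx", "r11", "memory");
	return ret;
}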
*/ - --VEXTERN(jiffies) - VEXTERN(vgetcpu_mode) - VEXTERN(vsyscall_gtod_data) -diff -urNp linux-2.6.31.1/arch/x86/vdso/vma.c linux-2.6.31.1/arch/x86/vdso/vma.c ---- linux-2.6.31.1/arch/x86/vdso/vma.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/vdso/vma.c 2009-10-01 20:12:42.000000000 -0400 -@@ -57,7 +57,7 @@ static int __init init_vdso_vars(void) - if (!vbase) - goto oom; - -- if (memcmp(vbase, "\177ELF", 4)) { -+ if (memcmp(vbase, ELFMAG, SELFMAG)) { - printk("VDSO: I'm broken; not ELF\n"); - vdso_enabled = 0; - } -@@ -66,6 +66,7 @@ static int __init init_vdso_vars(void) - *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; - #include "vextern.h" - #undef VEXTERN -+ vunmap(vbase); - return 0; - - oom: -@@ -116,7 +117,7 @@ int arch_setup_additional_pages(struct l - goto up_fail; - } - -- current->mm->context.vdso = (void *)addr; -+ current->mm->context.vdso = addr; - - ret = install_special_mapping(mm, addr, vdso_size, - VM_READ|VM_EXEC| -@@ -124,7 +125,7 @@ int arch_setup_additional_pages(struct l - VM_ALWAYSDUMP, - vdso_pages); - if (ret) { -- current->mm->context.vdso = NULL; -+ current->mm->context.vdso = 0; - goto up_fail; - } - -@@ -132,10 +133,3 @@ up_fail: - up_write(&mm->mmap_sem); - return ret; - } -- --static __init int vdso_setup(char *s) --{ -- vdso_enabled = simple_strtoul(s, NULL, 0); -- return 0; --} --__setup("vdso=", vdso_setup); -diff -urNp linux-2.6.31.1/arch/x86/xen/debugfs.c linux-2.6.31.1/arch/x86/xen/debugfs.c ---- linux-2.6.31.1/arch/x86/xen/debugfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/xen/debugfs.c 2009-10-01 20:12:42.000000000 -0400 -@@ -100,7 +100,7 @@ static int xen_array_release(struct inod - return 0; - } - --static struct file_operations u32_array_fops = { -+static const struct file_operations u32_array_fops = { - .owner = THIS_MODULE, - .open = u32_array_open, - .release= xen_array_release, -diff -urNp linux-2.6.31.1/arch/x86/xen/enlighten.c linux-2.6.31.1/arch/x86/xen/enlighten.c ---- linux-2.6.31.1/arch/x86/xen/enlighten.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/xen/enlighten.c 2009-10-01 20:12:42.000000000 -0400 -@@ -69,8 +69,6 @@ EXPORT_SYMBOL_GPL(xen_start_info); - - struct shared_info xen_dummy_shared_info; - --void *xen_initial_gdt; -- - /* - * Point at some empty memory to start with. We map the real shared_info - * page as soon as fixmap is up and running. -@@ -490,7 +488,7 @@ static void xen_write_idt_entry(gate_des - - preempt_disable(); - -- start = __get_cpu_var(idt_desc).address; -+ start = (unsigned long)__get_cpu_var(idt_desc).address; - end = start + __get_cpu_var(idt_desc).size + 1; - - xen_mc_flush(); -@@ -1010,13 +1008,6 @@ asmlinkage void __init xen_start_kernel( - - machine_ops = xen_machine_ops; - -- /* -- * The only reliable way to retain the initial address of the -- * percpu gdt_page is to remember it here, so we can go and -- * mark it RW later, when the initial percpu area is freed. 
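/*
 * Two small cleanups in the vma.c init_vdso_vars() hunk above: the ELF
 * magic check uses the ELFMAG/SELFMAG constants instead of a bare
 * "\177ELF"/4 pair, and the temporary mapping of the vDSO image is
 * released with vunmap() once the symbols are resolved rather than
 * leaked. The magic check in isolation (sketch; the xen_start_kernel()
 * comment removal concludes below):
 */
#include <string.h>

#define SKETCH_ELFMAG	"\177ELF"	/* 0x7f 'E' 'L' 'F' */
#define SKETCH_SELFMAG	4		/* length of the magic */

static int sketch_is_elf(const void *image)
{
	return memcmp(image, SKETCH_ELFMAG, SKETCH_SELFMAG) == 0;
}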
-- */ -- xen_initial_gdt = &per_cpu(gdt_page, 0); -- - xen_smp_init(); - - /* Get mfn list */ -diff -urNp linux-2.6.31.1/arch/x86/xen/mmu.c linux-2.6.31.1/arch/x86/xen/mmu.c ---- linux-2.6.31.1/arch/x86/xen/mmu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/xen/mmu.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1707,6 +1707,8 @@ __init pgd_t *xen_setup_kernel_pagetable - convert_pfn_mfn(init_level4_pgt); - convert_pfn_mfn(level3_ident_pgt); - convert_pfn_mfn(level3_kernel_pgt); -+ convert_pfn_mfn(level3_vmalloc_pgt); -+ convert_pfn_mfn(level3_vmemmap_pgt); - - l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); -@@ -1725,7 +1727,10 @@ __init pgd_t *xen_setup_kernel_pagetable - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); -+ set_page_prot(level3_vmalloc_pgt, PAGE_KERNEL_RO); -+ set_page_prot(level3_vmemmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); -+ set_page_prot(level2_vmemmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - -diff -urNp linux-2.6.31.1/arch/x86/xen/smp.c linux-2.6.31.1/arch/x86/xen/smp.c ---- linux-2.6.31.1/arch/x86/xen/smp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/xen/smp.c 2009-10-01 20:12:42.000000000 -0400 -@@ -167,11 +167,6 @@ static void __init xen_smp_prepare_boot_ - { - BUG_ON(smp_processor_id() != 0); - native_smp_prepare_boot_cpu(); -- -- /* We've switched to the "real" per-cpu gdt, so make sure the -- old memory can be recycled */ -- make_lowmem_page_readwrite(xen_initial_gdt); -- - xen_setup_vcpu_info_placement(); - } - -@@ -231,8 +226,8 @@ cpu_initialize_context(unsigned int cpu, - gdt = get_cpu_gdt_table(cpu); - - ctxt->flags = VGCF_IN_KERNEL; -- ctxt->user_regs.ds = __USER_DS; -- ctxt->user_regs.es = __USER_DS; -+ ctxt->user_regs.ds = __KERNEL_DS; -+ ctxt->user_regs.es = __KERNEL_DS; - ctxt->user_regs.ss = __KERNEL_DS; - #ifdef CONFIG_X86_32 - ctxt->user_regs.fs = __KERNEL_PERCPU; -diff -urNp linux-2.6.31.1/arch/x86/xen/xen-ops.h linux-2.6.31.1/arch/x86/xen/xen-ops.h ---- linux-2.6.31.1/arch/x86/xen/xen-ops.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/x86/xen/xen-ops.h 2009-10-01 20:12:42.000000000 -0400 -@@ -10,8 +10,6 @@ - extern const char xen_hypervisor_callback[]; - extern const char xen_failsafe_callback[]; - --extern void *xen_initial_gdt; -- - struct trap_info; - void xen_copy_trap_info(struct trap_info *traps); - -diff -urNp linux-2.6.31.1/arch/xtensa/include/asm/atomic.h linux-2.6.31.1/arch/xtensa/include/asm/atomic.h ---- linux-2.6.31.1/arch/xtensa/include/asm/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/arch/xtensa/include/asm/atomic.h 2009-10-01 20:12:42.000000000 -0400 -@@ -49,6 +49,14 @@ - #define atomic_read(v) ((v)->counter) - - /** -+ * atomic_read_unchecked - read atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically reads the value of @v. -+ */ -+#define atomic_read_unchecked(v) ((v)->counter) -+ -+/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value -@@ -58,6 +66,15 @@ - #define atomic_set(v,i) ((v)->counter = (i)) - - /** -+ * atomic_set_unchecked - set atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * @i: required value -+ * -+ * Atomically sets the value of @v to @i. 
-+ */ -+#define atomic_set_unchecked(v,i) ((v)->counter = (i)) -+ -+/** - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t -@@ -81,6 +98,11 @@ static inline void atomic_add(int i, ato - ); - } - -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t * v) -+{ -+ atomic_add(i, (atomic_t *)v); -+} -+ - /** - * atomic_sub - subtract the atomic variable - * @i: integer value to subtract -@@ -105,6 +127,11 @@ static inline void atomic_sub(int i, ato - ); - } - -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub(i, (atomic_t *)v); -+} -+ - /* - * We use atomic_{add|sub}_return to define other functions. - */ -@@ -165,6 +192,7 @@ static inline int atomic_sub_return(int - * Atomically increments @v by 1. - */ - #define atomic_inc(v) atomic_add(1,(v)) -+#define atomic_inc_unchecked(v) atomic_add_unchecked(1,(v)) - - /** - * atomic_inc - increment atomic variable -diff -urNp linux-2.6.31.1/crypto/lrw.c linux-2.6.31.1/crypto/lrw.c ---- linux-2.6.31.1/crypto/lrw.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/crypto/lrw.c 2009-10-01 20:12:42.000000000 -0400 -@@ -60,7 +60,7 @@ static int setkey(struct crypto_tfm *par - struct priv *ctx = crypto_tfm_ctx(parent); - struct crypto_cipher *child = ctx->child; - int err, i; -- be128 tmp = { 0 }; -+ be128 tmp = { 0, 0 }; - int bsize = crypto_cipher_blocksize(child); - - crypto_cipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); -diff -urNp linux-2.6.31.1/Documentation/dontdiff linux-2.6.31.1/Documentation/dontdiff ---- linux-2.6.31.1/Documentation/dontdiff 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/Documentation/dontdiff 2009-10-01 20:12:42.000000000 -0400 -@@ -3,6 +3,7 @@ - *.bin - *.cpio - *.csp -+*.dbg - *.dsp - *.dvi - *.elf -@@ -49,11 +50,16 @@ - 53c700_d.h - CVS - ChangeSet -+GPATH -+GRTAGS -+GSYMS -+GTAGS - Image - Kerntypes - Module.markers - Module.symvers - PENDING -+PERF* - SCCS - System.map* - TAGS -@@ -76,7 +82,9 @@ btfixupprep - build - bvmlinux - bzImage* -+capflags.c - classlist.h* -+common-cmds.h - comp*.log - compile.h* - conf -@@ -103,13 +111,14 @@ gen_crc32table - gen_init_cpio - genksyms - *_gray256.c -+hash - ihex2fw - ikconfig.h* - initramfs_data.cpio -+initramfs_data.cpio.bz2 - initramfs_data.cpio.gz - initramfs_list - kallsyms --kconfig - keywords.c - ksym.c* - ksym.h* -@@ -133,6 +142,7 @@ mkboot - mkbugboot - mkcpustr - mkdep -+mkpiggy - mkprep - mktables - mktree -@@ -149,6 +159,7 @@ patches* - pca200e.bin - pca200e_ecd.bin2 - piggy.gz -+piggy.S - piggyback - pnmtologo - ppc_defs.h* -@@ -164,6 +175,7 @@ setup - setup.bin - setup.elf - sImage -+slabinfo - sm_tbl* - split-include - syscalltab.h -@@ -187,14 +199,20 @@ version.h* - vmlinux - vmlinux-* - vmlinux.aout -+vmlinux.bin.all -+vmlinux.bin.bz2 - vmlinux.lds -+vmlinux.relocs -+voffset.h - vsyscall.lds - vsyscall_32.lds - wanxlfw.inc - uImage - unifdef -+utsrelease.h - wakeup.bin - wakeup.elf - wakeup.lds - zImage* - zconf.hash.c -+zoffset.h -diff -urNp linux-2.6.31.1/Documentation/kernel-parameters.txt linux-2.6.31.1/Documentation/kernel-parameters.txt ---- linux-2.6.31.1/Documentation/kernel-parameters.txt 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/Documentation/kernel-parameters.txt 2009-10-01 20:12:42.000000000 -0400 -@@ -1776,6 +1776,12 @@ and is between 256 and 4096 characters. - the specified number of seconds. This is to be used if - your oopses keep scrolling off the screen. - -+ pax_nouderef [X86-32] disables UDEREF. 
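/*
 * (The pax_nouderef parameter description continues below.) In the
 * drivers/acpi/osl.c hunks just ahead, acpi_os_read_memory() and
 * acpi_os_write_memory() finally check the ioremap() result before
 * dereferencing it, returning AE_NO_MEMORY on failure, since ioremap()
 * can return NULL. The idiom in kernel context (sketch with an
 * illustrative name):
 */
static int sketch_mmio_read32(unsigned long phys, unsigned int *value)
{
	void __iomem *virt = ioremap(phys, 4);

	if (!virt)			/* ioremap() can fail */
		return -ENOMEM;
	*value = readl(virt);
	iounmap(virt);
	return 0;
}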
Most likely needed under certain -+ virtualization environments that don't cope well with the -+ expand down segment used by UDEREF on X86-32. -+ -+ pax_softmode= [X86-32] 0/1 to disable/enable PaX softmode on boot already. -+ - pcbit= [HW,ISDN] - - pcd. [PARIDE] -diff -urNp linux-2.6.31.1/drivers/acpi/blacklist.c linux-2.6.31.1/drivers/acpi/blacklist.c ---- linux-2.6.31.1/drivers/acpi/blacklist.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/acpi/blacklist.c 2009-10-01 20:12:42.000000000 -0400 -@@ -71,7 +71,7 @@ static struct acpi_blacklist_item acpi_b - {"IBM ", "TP600E ", 0x00000105, ACPI_SIG_DSDT, less_than_or_equal, - "Incorrect _ADR", 1}, - -- {""} -+ {"", "", 0, 0, 0, all_versions, 0} - }; - - #if CONFIG_ACPI_BLACKLIST_YEAR -diff -urNp linux-2.6.31.1/drivers/acpi/osl.c linux-2.6.31.1/drivers/acpi/osl.c ---- linux-2.6.31.1/drivers/acpi/osl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/acpi/osl.c 2009-10-01 20:12:42.000000000 -0400 -@@ -521,6 +521,8 @@ acpi_os_read_memory(acpi_physical_addres - void __iomem *virt_addr; - - virt_addr = ioremap(phys_addr, width); -+ if (!virt_addr) -+ return AE_NO_MEMORY; - if (!value) - value = &dummy; - -@@ -549,6 +551,8 @@ acpi_os_write_memory(acpi_physical_addre - void __iomem *virt_addr; - - virt_addr = ioremap(phys_addr, width); -+ if (!virt_addr) -+ return AE_NO_MEMORY; - - switch (width) { - case 8: -diff -urNp linux-2.6.31.1/drivers/acpi/processor_core.c linux-2.6.31.1/drivers/acpi/processor_core.c ---- linux-2.6.31.1/drivers/acpi/processor_core.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/acpi/processor_core.c 2009-10-01 20:12:42.000000000 -0400 -@@ -712,7 +712,7 @@ static int __cpuinit acpi_processor_star - return 0; - } - -- BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0)); -+ BUG_ON(pr->id >= nr_cpu_ids); - - /* - * Buggy BIOS check -diff -urNp linux-2.6.31.1/drivers/acpi/processor_idle.c linux-2.6.31.1/drivers/acpi/processor_idle.c ---- linux-2.6.31.1/drivers/acpi/processor_idle.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/acpi/processor_idle.c 2009-10-01 20:12:42.000000000 -0400 -@@ -108,7 +108,7 @@ static struct dmi_system_id __cpuinitdat - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")}, - (void *)2}, -- {}, -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL}, - }; - - -diff -urNp linux-2.6.31.1/drivers/acpi/video.c linux-2.6.31.1/drivers/acpi/video.c ---- linux-2.6.31.1/drivers/acpi/video.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/acpi/video.c 2009-10-01 20:12:42.000000000 -0400 -@@ -283,7 +283,7 @@ static int acpi_video_device_brightness_ - struct file *file); - static ssize_t acpi_video_device_write_brightness(struct file *file, - const char __user *buffer, size_t count, loff_t *data); --static struct file_operations acpi_video_device_brightness_fops = { -+static const struct file_operations acpi_video_device_brightness_fops = { - .owner = THIS_MODULE, - .open = acpi_video_device_brightness_open_fs, - .read = seq_read, -diff -urNp linux-2.6.31.1/drivers/ata/ahci.c linux-2.6.31.1/drivers/ata/ahci.c ---- linux-2.6.31.1/drivers/ata/ahci.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ata/ahci.c 2009-10-01 20:12:42.000000000 -0400 -@@ -629,7 +629,7 @@ static const struct pci_device_id ahci_p - { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_STORAGE_SATA_AHCI, 0xffffff, board_ahci }, - -- { } /* terminate list */ -+ { 0, 0, 0, 0, 0, 0, 0 } 
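/*
 * A pattern worth noting across the whole patch: list-terminating
 * entries written upstream as "{ }" or "{ 0, }" are spelled out field
 * by field, as in the ahci sentinel just above (its "terminate list"
 * comment follows below). The sentinel's value is unchanged, every
 * field is still zero; only the initializer is explicit. Sketch of the
 * equivalence with illustrative sketch_* names:
 */
struct sketch_id {
	unsigned int vendor, device;
	const char *name;
};

static const struct sketch_id sketch_table[] = {
	{ 0x8086, 0x1237, "82441FX" },
	{ 0, 0, NULL }		/* identical bytes to a bare "{ }" */
};

static const char *sketch_lookup(unsigned int vendor, unsigned int device)
{
	const struct sketch_id *p;

	/* walkers stop at the first all-zero entry either way */
	for (p = sketch_table; p->vendor; p++)
		if (p->vendor == vendor && p->device == device)
			return p->name;
	return NULL;
}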
/* terminate list */ - }; - - -diff -urNp linux-2.6.31.1/drivers/ata/ata_piix.c linux-2.6.31.1/drivers/ata/ata_piix.c ---- linux-2.6.31.1/drivers/ata/ata_piix.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ata/ata_piix.c 2009-10-01 20:12:42.000000000 -0400 -@@ -291,7 +291,7 @@ static const struct pci_device_id piix_p - { 0x8086, 0x3b2d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata }, - /* SATA Controller IDE (PCH) */ - { 0x8086, 0x3b2e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_sata }, -- { } /* terminate list */ -+ { 0, 0, 0, 0, 0, 0, 0 } /* terminate list */ - }; - - static struct pci_driver piix_pci_driver = { -@@ -608,7 +608,7 @@ static const struct ich_laptop ich_lapto - { 0x2653, 0x1043, 0x82D8 }, /* ICH6M on Asus Eee 701 */ - { 0x27df, 0x104d, 0x900e }, /* ICH7 on Sony TZ-90 */ - /* end marker */ -- { 0, } -+ { 0, 0, 0 } - }; - - /** -@@ -1086,7 +1086,7 @@ static int piix_broken_suspend(void) - }, - }, - -- { } /* terminate list */ -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } /* terminate list */ - }; - static const char *oemstrs[] = { - "Tecra M3,", -diff -urNp linux-2.6.31.1/drivers/ata/libata-core.c linux-2.6.31.1/drivers/ata/libata-core.c ---- linux-2.6.31.1/drivers/ata/libata-core.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ata/libata-core.c 2009-10-01 20:12:42.000000000 -0400 -@@ -896,7 +896,7 @@ static const struct ata_xfer_ent { - { ATA_SHIFT_PIO, ATA_NR_PIO_MODES, XFER_PIO_0 }, - { ATA_SHIFT_MWDMA, ATA_NR_MWDMA_MODES, XFER_MW_DMA_0 }, - { ATA_SHIFT_UDMA, ATA_NR_UDMA_MODES, XFER_UDMA_0 }, -- { -1, }, -+ { -1, 0, 0 } - }; - - /** -@@ -3141,7 +3141,7 @@ static const struct ata_timing ata_timin - { XFER_UDMA_5, 0, 0, 0, 0, 0, 0, 0, 0, 20 }, - { XFER_UDMA_6, 0, 0, 0, 0, 0, 0, 0, 0, 15 }, - -- { 0xFF } -+ { 0xFF, 0, 0, 0, 0, 0, 0, 0, 0 } - }; - - #define ENOUGH(v, unit) (((v)-1)/(unit)+1) -@@ -4339,7 +4339,7 @@ static const struct ata_blacklist_entry - { "PIONEER DVD-RW DVRTD08", "1.00", ATA_HORKAGE_NOSETXFER }, - - /* End Marker */ -- { } -+ { NULL, NULL, 0 } - }; - - static int strn_pattern_cmp(const char *patt, const char *name, int wildchar) -diff -urNp linux-2.6.31.1/drivers/atm/adummy.c linux-2.6.31.1/drivers/atm/adummy.c ---- linux-2.6.31.1/drivers/atm/adummy.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/adummy.c 2009-10-01 20:12:42.000000000 -0400 -@@ -77,7 +77,7 @@ adummy_send(struct atm_vcc *vcc, struct - vcc->pop(vcc, skb); - else - dev_kfree_skb_any(skb); -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - - return 0; - } -diff -urNp linux-2.6.31.1/drivers/atm/ambassador.c linux-2.6.31.1/drivers/atm/ambassador.c ---- linux-2.6.31.1/drivers/atm/ambassador.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/ambassador.c 2009-10-01 20:12:42.000000000 -0400 -@@ -453,7 +453,7 @@ static void tx_complete (amb_dev * dev, - PRINTD (DBG_FLOW|DBG_TX, "tx_complete %p %p", dev, tx); - - // VC layer stats -- atomic_inc(&ATM_SKB(skb)->vcc->stats->tx); -+ atomic_inc_unchecked(&ATM_SKB(skb)->vcc->stats->tx); - - // free the descriptor - kfree (tx_descr); -@@ -494,7 +494,7 @@ static void rx_complete (amb_dev * dev, - dump_skb ("<<<", vc, skb); - - // VC layer stats -- atomic_inc(&atm_vcc->stats->rx); -+ atomic_inc_unchecked(&atm_vcc->stats->rx); - __net_timestamp(skb); - // end of our responsability - atm_vcc->push (atm_vcc, skb); -@@ -509,7 +509,7 @@ static void rx_complete (amb_dev * dev, - } else { - PRINTK (KERN_INFO, "dropped over-size frame"); - // should we count this? 
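/*
 * The long run of atomic_inc() -> atomic_inc_unchecked() conversions
 * in these ATM drivers (continuing below) is PaX REFCOUNT fallout:
 * once atomic_t operations trap on overflow, to block reference-count
 * overflow exploits, counters whose wraparound is harmless, i.e. pure
 * statistics such as the vcc->stats tx/rx/err/drop fields here, are
 * moved to atomic_unchecked_t and keep the old wrapping behaviour. On
 * architectures with no checked implementation the unchecked ops
 * simply alias the plain ones, as the xtensa hunk earlier showed.
 * Sketched with illustrative sketch_* names (the ambassador.c hunk
 * resumes below):
 */
typedef struct { volatile int counter; } sketch_atomic_unchecked_t;

#define sketch_atomic_read_unchecked(v)		((v)->counter)
#define sketch_atomic_set_unchecked(v, i)	((v)->counter = (i))

static inline void sketch_atomic_inc_unchecked(sketch_atomic_unchecked_t *v)
{
	/* plain wrapping increment; overflow is deliberately allowed */
	__sync_fetch_and_add(&v->counter, 1);
}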
-- atomic_inc(&atm_vcc->stats->rx_drop); -+ atomic_inc_unchecked(&atm_vcc->stats->rx_drop); - } - - } else { -@@ -1349,7 +1349,7 @@ static int amb_send (struct atm_vcc * at - } - - if (check_area (skb->data, skb->len)) { -- atomic_inc(&atm_vcc->stats->tx_err); -+ atomic_inc_unchecked(&atm_vcc->stats->tx_err); - return -ENOMEM; // ? - } - -diff -urNp linux-2.6.31.1/drivers/atm/atmtcp.c linux-2.6.31.1/drivers/atm/atmtcp.c ---- linux-2.6.31.1/drivers/atm/atmtcp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/atmtcp.c 2009-10-01 20:12:42.000000000 -0400 -@@ -206,7 +206,7 @@ static int atmtcp_v_send(struct atm_vcc - if (vcc->pop) vcc->pop(vcc,skb); - else dev_kfree_skb(skb); - if (dev_data) return 0; -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - return -ENOLINK; - } - size = skb->len+sizeof(struct atmtcp_hdr); -@@ -214,7 +214,7 @@ static int atmtcp_v_send(struct atm_vcc - if (!new_skb) { - if (vcc->pop) vcc->pop(vcc,skb); - else dev_kfree_skb(skb); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - return -ENOBUFS; - } - hdr = (void *) skb_put(new_skb,sizeof(struct atmtcp_hdr)); -@@ -225,8 +225,8 @@ static int atmtcp_v_send(struct atm_vcc - if (vcc->pop) vcc->pop(vcc,skb); - else dev_kfree_skb(skb); - out_vcc->push(out_vcc,new_skb); -- atomic_inc(&vcc->stats->tx); -- atomic_inc(&out_vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->tx); -+ atomic_inc_unchecked(&out_vcc->stats->rx); - return 0; - } - -@@ -300,7 +300,7 @@ static int atmtcp_c_send(struct atm_vcc - out_vcc = find_vcc(dev, ntohs(hdr->vpi), ntohs(hdr->vci)); - read_unlock(&vcc_sklist_lock); - if (!out_vcc) { -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - goto done; - } - skb_pull(skb,sizeof(struct atmtcp_hdr)); -@@ -312,8 +312,8 @@ static int atmtcp_c_send(struct atm_vcc - __net_timestamp(new_skb); - skb_copy_from_linear_data(skb, skb_put(new_skb, skb->len), skb->len); - out_vcc->push(out_vcc,new_skb); -- atomic_inc(&vcc->stats->tx); -- atomic_inc(&out_vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->tx); -+ atomic_inc_unchecked(&out_vcc->stats->rx); - done: - if (vcc->pop) vcc->pop(vcc,skb); - else dev_kfree_skb(skb); -diff -urNp linux-2.6.31.1/drivers/atm/eni.c linux-2.6.31.1/drivers/atm/eni.c ---- linux-2.6.31.1/drivers/atm/eni.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/eni.c 2009-10-01 20:12:42.000000000 -0400 -@@ -525,7 +525,7 @@ static int rx_aal0(struct atm_vcc *vcc) - DPRINTK(DEV_LABEL "(itf %d): trashing empty cell\n", - vcc->dev->number); - length = 0; -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - } - else { - length = ATM_CELL_SIZE-1; /* no HEC */ -@@ -580,7 +580,7 @@ static int rx_aal5(struct atm_vcc *vcc) - size); - } - eff = length = 0; -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - } - else { - size = (descr & MID_RED_COUNT)*(ATM_CELL_PAYLOAD >> 2); -@@ -597,7 +597,7 @@ static int rx_aal5(struct atm_vcc *vcc) - "(VCI=%d,length=%ld,size=%ld (descr 0x%lx))\n", - vcc->dev->number,vcc->vci,length,size << 2,descr); - length = eff = 0; -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - } - } - skb = eff ? 
atm_alloc_charge(vcc,eff << 2,GFP_ATOMIC) : NULL; -@@ -770,7 +770,7 @@ rx_dequeued++; - vcc->push(vcc,skb); - pushed++; - } -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - } - wake_up(&eni_dev->rx_wait); - } -@@ -1227,7 +1227,7 @@ static void dequeue_tx(struct atm_dev *d - PCI_DMA_TODEVICE); - if (vcc->pop) vcc->pop(vcc,skb); - else dev_kfree_skb_irq(skb); -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - wake_up(&eni_dev->tx_wait); - dma_complete++; - } -diff -urNp linux-2.6.31.1/drivers/atm/firestream.c linux-2.6.31.1/drivers/atm/firestream.c ---- linux-2.6.31.1/drivers/atm/firestream.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/firestream.c 2009-10-01 20:12:42.000000000 -0400 -@@ -748,7 +748,7 @@ static void process_txdone_queue (struct - } - } - -- atomic_inc(&ATM_SKB(skb)->vcc->stats->tx); -+ atomic_inc_unchecked(&ATM_SKB(skb)->vcc->stats->tx); - - fs_dprintk (FS_DEBUG_TXMEM, "i"); - fs_dprintk (FS_DEBUG_ALLOC, "Free t-skb: %p\n", skb); -@@ -815,7 +815,7 @@ static void process_incoming (struct fs_ - #endif - skb_put (skb, qe->p1 & 0xffff); - ATM_SKB(skb)->vcc = atm_vcc; -- atomic_inc(&atm_vcc->stats->rx); -+ atomic_inc_unchecked(&atm_vcc->stats->rx); - __net_timestamp(skb); - fs_dprintk (FS_DEBUG_ALLOC, "Free rec-skb: %p (pushed)\n", skb); - atm_vcc->push (atm_vcc, skb); -@@ -836,12 +836,12 @@ static void process_incoming (struct fs_ - kfree (pe); - } - if (atm_vcc) -- atomic_inc(&atm_vcc->stats->rx_drop); -+ atomic_inc_unchecked(&atm_vcc->stats->rx_drop); - break; - case 0x1f: /* Reassembly abort: no buffers. */ - /* Silently increment error counter. */ - if (atm_vcc) -- atomic_inc(&atm_vcc->stats->rx_drop); -+ atomic_inc_unchecked(&atm_vcc->stats->rx_drop); - break; - default: /* Hmm. Haven't written the code to handle the others yet... 
-- REW */ - printk (KERN_WARNING "Don't know what to do with RX status %x: %s.\n", -diff -urNp linux-2.6.31.1/drivers/atm/fore200e.c linux-2.6.31.1/drivers/atm/fore200e.c ---- linux-2.6.31.1/drivers/atm/fore200e.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/fore200e.c 2009-10-01 20:12:42.000000000 -0400 -@@ -931,9 +931,9 @@ fore200e_tx_irq(struct fore200e* fore200 - #endif - /* check error condition */ - if (*entry->status & STATUS_ERROR) -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - else -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - } - } - -@@ -1082,7 +1082,7 @@ fore200e_push_rpd(struct fore200e* fore2 - if (skb == NULL) { - DPRINTK(2, "unable to alloc new skb, rx PDU length = %d\n", pdu_len); - -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - return -ENOMEM; - } - -@@ -1125,14 +1125,14 @@ fore200e_push_rpd(struct fore200e* fore2 - - dev_kfree_skb_any(skb); - -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - return -ENOMEM; - } - - ASSERT(atomic_read(&sk_atm(vcc)->sk_wmem_alloc) >= 0); - - vcc->push(vcc, skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - - ASSERT(atomic_read(&sk_atm(vcc)->sk_wmem_alloc) >= 0); - -@@ -1210,7 +1210,7 @@ fore200e_rx_irq(struct fore200e* fore200 - DPRINTK(2, "damaged PDU on %d.%d.%d\n", - fore200e->atm_dev->number, - entry->rpd->atm_header.vpi, entry->rpd->atm_header.vci); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - } - } - -@@ -1655,7 +1655,7 @@ fore200e_send(struct atm_vcc *vcc, struc - goto retry_here; - } - -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - - fore200e->tx_sat++; - DPRINTK(2, "tx queue of device %s is saturated, PDU dropped - heartbeat is %08x\n", -diff -urNp linux-2.6.31.1/drivers/atm/he.c linux-2.6.31.1/drivers/atm/he.c ---- linux-2.6.31.1/drivers/atm/he.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/he.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1728,7 +1728,7 @@ he_service_rbrq(struct he_dev *he_dev, i - - if (RBRQ_HBUF_ERR(he_dev->rbrq_head)) { - hprintk("HBUF_ERR! (cid 0x%x)\n", cid); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - goto return_host_buffers; - } - -@@ -1761,7 +1761,7 @@ he_service_rbrq(struct he_dev *he_dev, i - RBRQ_LEN_ERR(he_dev->rbrq_head) - ? 
"LEN_ERR" : "", - vcc->vpi, vcc->vci); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - goto return_host_buffers; - } - -@@ -1820,7 +1820,7 @@ he_service_rbrq(struct he_dev *he_dev, i - vcc->push(vcc, skb); - spin_lock(&he_dev->global_lock); - -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - - return_host_buffers: - ++pdus_assembled; -@@ -2165,7 +2165,7 @@ __enqueue_tpd(struct he_dev *he_dev, str - tpd->vcc->pop(tpd->vcc, tpd->skb); - else - dev_kfree_skb_any(tpd->skb); -- atomic_inc(&tpd->vcc->stats->tx_err); -+ atomic_inc_unchecked(&tpd->vcc->stats->tx_err); - } - pci_pool_free(he_dev->tpd_pool, tpd, TPD_ADDR(tpd->status)); - return; -@@ -2577,7 +2577,7 @@ he_send(struct atm_vcc *vcc, struct sk_b - vcc->pop(vcc, skb); - else - dev_kfree_skb_any(skb); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - return -EINVAL; - } - -@@ -2588,7 +2588,7 @@ he_send(struct atm_vcc *vcc, struct sk_b - vcc->pop(vcc, skb); - else - dev_kfree_skb_any(skb); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - return -EINVAL; - } - #endif -@@ -2600,7 +2600,7 @@ he_send(struct atm_vcc *vcc, struct sk_b - vcc->pop(vcc, skb); - else - dev_kfree_skb_any(skb); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - spin_unlock_irqrestore(&he_dev->global_lock, flags); - return -ENOMEM; - } -@@ -2642,7 +2642,7 @@ he_send(struct atm_vcc *vcc, struct sk_b - vcc->pop(vcc, skb); - else - dev_kfree_skb_any(skb); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - spin_unlock_irqrestore(&he_dev->global_lock, flags); - return -ENOMEM; - } -@@ -2673,7 +2673,7 @@ he_send(struct atm_vcc *vcc, struct sk_b - __enqueue_tpd(he_dev, tpd, cid); - spin_unlock_irqrestore(&he_dev->global_lock, flags); - -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - - return 0; - } -diff -urNp linux-2.6.31.1/drivers/atm/horizon.c linux-2.6.31.1/drivers/atm/horizon.c ---- linux-2.6.31.1/drivers/atm/horizon.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/horizon.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1033,7 +1033,7 @@ static void rx_schedule (hrz_dev * dev, - { - struct atm_vcc * vcc = ATM_SKB(skb)->vcc; - // VC layer stats -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - __net_timestamp(skb); - // end of our responsability - vcc->push (vcc, skb); -@@ -1185,7 +1185,7 @@ static void tx_schedule (hrz_dev * const - dev->tx_iovec = NULL; - - // VC layer stats -- atomic_inc(&ATM_SKB(skb)->vcc->stats->tx); -+ atomic_inc_unchecked(&ATM_SKB(skb)->vcc->stats->tx); - - // free the skb - hrz_kfree_skb (skb); -diff -urNp linux-2.6.31.1/drivers/atm/idt77252.c linux-2.6.31.1/drivers/atm/idt77252.c ---- linux-2.6.31.1/drivers/atm/idt77252.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/idt77252.c 2009-10-01 20:12:42.000000000 -0400 -@@ -810,7 +810,7 @@ drain_scq(struct idt77252_dev *card, str - else - dev_kfree_skb(skb); - -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - } - - atomic_dec(&scq->used); -@@ -1073,13 +1073,13 @@ dequeue_rx(struct idt77252_dev *card, st - if ((sb = dev_alloc_skb(64)) == NULL) { - printk("%s: Can't allocate buffers for aal0.\n", - card->name); -- atomic_add(i, &vcc->stats->rx_drop); -+ atomic_add_unchecked(i, &vcc->stats->rx_drop); - break; - } - if (!atm_charge(vcc, sb->truesize)) { - RXPRINTK("%s: atm_charge() dropped aal0 
packets.\n", - card->name); -- atomic_add(i - 1, &vcc->stats->rx_drop); -+ atomic_add_unchecked(i - 1, &vcc->stats->rx_drop); - dev_kfree_skb(sb); - break; - } -@@ -1096,7 +1096,7 @@ dequeue_rx(struct idt77252_dev *card, st - ATM_SKB(sb)->vcc = vcc; - __net_timestamp(sb); - vcc->push(vcc, sb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - - cell += ATM_CELL_PAYLOAD; - } -@@ -1133,13 +1133,13 @@ dequeue_rx(struct idt77252_dev *card, st - "(CDC: %08x)\n", - card->name, len, rpp->len, readl(SAR_REG_CDC)); - recycle_rx_pool_skb(card, rpp); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - return; - } - if (stat & SAR_RSQE_CRC) { - RXPRINTK("%s: AAL5 CRC error.\n", card->name); - recycle_rx_pool_skb(card, rpp); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - return; - } - if (skb_queue_len(&rpp->queue) > 1) { -@@ -1150,7 +1150,7 @@ dequeue_rx(struct idt77252_dev *card, st - RXPRINTK("%s: Can't alloc RX skb.\n", - card->name); - recycle_rx_pool_skb(card, rpp); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - return; - } - if (!atm_charge(vcc, skb->truesize)) { -@@ -1169,7 +1169,7 @@ dequeue_rx(struct idt77252_dev *card, st - __net_timestamp(skb); - - vcc->push(vcc, skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - - return; - } -@@ -1191,7 +1191,7 @@ dequeue_rx(struct idt77252_dev *card, st - __net_timestamp(skb); - - vcc->push(vcc, skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - - if (skb->truesize > SAR_FB_SIZE_3) - add_rx_skb(card, 3, SAR_FB_SIZE_3, 1); -@@ -1303,14 +1303,14 @@ idt77252_rx_raw(struct idt77252_dev *car - if (vcc->qos.aal != ATM_AAL0) { - RPRINTK("%s: raw cell for non AAL0 vc %u.%u\n", - card->name, vpi, vci); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - goto drop; - } - - if ((sb = dev_alloc_skb(64)) == NULL) { - printk("%s: Can't allocate buffers for AAL0.\n", - card->name); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - goto drop; - } - -@@ -1329,7 +1329,7 @@ idt77252_rx_raw(struct idt77252_dev *car - ATM_SKB(sb)->vcc = vcc; - __net_timestamp(sb); - vcc->push(vcc, sb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - - drop: - skb_pull(queue, 64); -@@ -1954,13 +1954,13 @@ idt77252_send_skb(struct atm_vcc *vcc, s - - if (vc == NULL) { - printk("%s: NULL connection in send().\n", card->name); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb(skb); - return -EINVAL; - } - if (!test_bit(VCF_TX, &vc->flags)) { - printk("%s: Trying to transmit on a non-tx VC.\n", card->name); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb(skb); - return -EINVAL; - } -@@ -1972,14 +1972,14 @@ idt77252_send_skb(struct atm_vcc *vcc, s - break; - default: - printk("%s: Unsupported AAL: %d\n", card->name, vcc->qos.aal); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb(skb); - return -EINVAL; - } - - if (skb_shinfo(skb)->nr_frags != 0) { - printk("%s: No scatter-gather yet.\n", card->name); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb(skb); - return -EINVAL; - } -@@ -1987,7 +1987,7 @@ idt77252_send_skb(struct atm_vcc *vcc, s - - err = queue_skb(card, vc, skb, oam); - if (err) { -- 
atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb(skb); - return err; - } -@@ -2010,7 +2010,7 @@ idt77252_send_oam(struct atm_vcc *vcc, v - skb = dev_alloc_skb(64); - if (!skb) { - printk("%s: Out of memory in send_oam().\n", card->name); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - return -ENOMEM; - } - atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); -diff -urNp linux-2.6.31.1/drivers/atm/iphase.c linux-2.6.31.1/drivers/atm/iphase.c ---- linux-2.6.31.1/drivers/atm/iphase.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/iphase.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1123,7 +1123,7 @@ static int rx_pkt(struct atm_dev *dev) - status = (u_short) (buf_desc_ptr->desc_mode); - if (status & (RX_CER | RX_PTE | RX_OFL)) - { -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - IF_ERR(printk("IA: bad packet, dropping it");) - if (status & RX_CER) { - IF_ERR(printk(" cause: packet CRC error\n");) -@@ -1146,7 +1146,7 @@ static int rx_pkt(struct atm_dev *dev) - len = dma_addr - buf_addr; - if (len > iadev->rx_buf_sz) { - printk("Over %d bytes sdu received, dropped!!!\n", iadev->rx_buf_sz); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - goto out_free_desc; - } - -@@ -1296,7 +1296,7 @@ static void rx_dle_intr(struct atm_dev * - ia_vcc = INPH_IA_VCC(vcc); - if (ia_vcc == NULL) - { -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - dev_kfree_skb_any(skb); - atm_return(vcc, atm_guess_pdu2truesize(len)); - goto INCR_DLE; -@@ -1308,7 +1308,7 @@ static void rx_dle_intr(struct atm_dev * - if ((length > iadev->rx_buf_sz) || (length > - (skb->len - sizeof(struct cpcs_trailer)))) - { -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - IF_ERR(printk("rx_dle_intr: Bad AAL5 trailer %d (skb len %d)", - length, skb->len);) - dev_kfree_skb_any(skb); -@@ -1324,7 +1324,7 @@ static void rx_dle_intr(struct atm_dev * - - IF_RX(printk("rx_dle_intr: skb push");) - vcc->push(vcc,skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - iadev->rx_pkt_cnt++; - } - INCR_DLE: -@@ -2806,15 +2806,15 @@ static int ia_ioctl(struct atm_dev *dev, - { - struct k_sonet_stats *stats; - stats = &PRIV(_ia_dev[board])->sonet_stats; -- printk("section_bip: %d\n", atomic_read(&stats->section_bip)); -- printk("line_bip : %d\n", atomic_read(&stats->line_bip)); -- printk("path_bip : %d\n", atomic_read(&stats->path_bip)); -- printk("line_febe : %d\n", atomic_read(&stats->line_febe)); -- printk("path_febe : %d\n", atomic_read(&stats->path_febe)); -- printk("corr_hcs : %d\n", atomic_read(&stats->corr_hcs)); -- printk("uncorr_hcs : %d\n", atomic_read(&stats->uncorr_hcs)); -- printk("tx_cells : %d\n", atomic_read(&stats->tx_cells)); -- printk("rx_cells : %d\n", atomic_read(&stats->rx_cells)); -+ printk("section_bip: %d\n", atomic_read_unchecked(&stats->section_bip)); -+ printk("line_bip : %d\n", atomic_read_unchecked(&stats->line_bip)); -+ printk("path_bip : %d\n", atomic_read_unchecked(&stats->path_bip)); -+ printk("line_febe : %d\n", atomic_read_unchecked(&stats->line_febe)); -+ printk("path_febe : %d\n", atomic_read_unchecked(&stats->path_febe)); -+ printk("corr_hcs : %d\n", atomic_read_unchecked(&stats->corr_hcs)); -+ printk("uncorr_hcs : %d\n", atomic_read_unchecked(&stats->uncorr_hcs)); -+ printk("tx_cells : %d\n", atomic_read_unchecked(&stats->tx_cells)); -+ printk("rx_cells : 
%d\n", atomic_read_unchecked(&stats->rx_cells)); - } - ia_cmds.status = 0; - break; -@@ -2919,7 +2919,7 @@ static int ia_pkt_tx (struct atm_vcc *vc - if ((desc == 0) || (desc > iadev->num_tx_desc)) - { - IF_ERR(printk(DEV_LABEL "invalid desc for send: %d\n", desc);) -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - if (vcc->pop) - vcc->pop(vcc, skb); - else -@@ -3024,14 +3024,14 @@ static int ia_pkt_tx (struct atm_vcc *vc - ATM_DESC(skb) = vcc->vci; - skb_queue_tail(&iadev->tx_dma_q, skb); - -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - iadev->tx_pkt_cnt++; - /* Increment transaction counter */ - writel(2, iadev->dma+IPHASE5575_TX_COUNTER); - - #if 0 - /* add flow control logic */ -- if (atomic_read(&vcc->stats->tx) % 20 == 0) { -+ if (atomic_read_unchecked(&vcc->stats->tx) % 20 == 0) { - if (iavcc->vc_desc_cnt > 10) { - vcc->tx_quota = vcc->tx_quota * 3 / 4; - printk("Tx1: vcc->tx_quota = %d \n", (u32)vcc->tx_quota ); -diff -urNp linux-2.6.31.1/drivers/atm/lanai.c linux-2.6.31.1/drivers/atm/lanai.c ---- linux-2.6.31.1/drivers/atm/lanai.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/lanai.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1305,7 +1305,7 @@ static void lanai_send_one_aal5(struct l - vcc_tx_add_aal5_trailer(lvcc, skb->len, 0, 0); - lanai_endtx(lanai, lvcc); - lanai_free_skb(lvcc->tx.atmvcc, skb); -- atomic_inc(&lvcc->tx.atmvcc->stats->tx); -+ atomic_inc_unchecked(&lvcc->tx.atmvcc->stats->tx); - } - - /* Try to fill the buffer - don't call unless there is backlog */ -@@ -1428,7 +1428,7 @@ static void vcc_rx_aal5(struct lanai_vcc - ATM_SKB(skb)->vcc = lvcc->rx.atmvcc; - __net_timestamp(skb); - lvcc->rx.atmvcc->push(lvcc->rx.atmvcc, skb); -- atomic_inc(&lvcc->rx.atmvcc->stats->rx); -+ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx); - out: - lvcc->rx.buf.ptr = end; - cardvcc_write(lvcc, endptr, vcc_rxreadptr); -@@ -1670,7 +1670,7 @@ static int handle_service(struct lanai_d - DPRINTK("(itf %d) got RX service entry 0x%X for non-AAL5 " - "vcc %d\n", lanai->number, (unsigned int) s, vci); - lanai->stats.service_rxnotaal5++; -- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); -+ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); - return 0; - } - if (likely(!(s & (SERVICE_TRASH | SERVICE_STREAM | SERVICE_CRCERR)))) { -@@ -1682,7 +1682,7 @@ static int handle_service(struct lanai_d - int bytes; - read_unlock(&vcc_sklist_lock); - DPRINTK("got trashed rx pdu on vci %d\n", vci); -- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); -+ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); - lvcc->stats.x.aal5.service_trash++; - bytes = (SERVICE_GET_END(s) * 16) - - (((unsigned long) lvcc->rx.buf.ptr) - -@@ -1694,7 +1694,7 @@ static int handle_service(struct lanai_d - } - if (s & SERVICE_STREAM) { - read_unlock(&vcc_sklist_lock); -- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); -+ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); - lvcc->stats.x.aal5.service_stream++; - printk(KERN_ERR DEV_LABEL "(itf %d): Got AAL5 stream " - "PDU on VCI %d!\n", lanai->number, vci); -@@ -1702,7 +1702,7 @@ static int handle_service(struct lanai_d - return 0; - } - DPRINTK("got rx crc error on vci %d\n", vci); -- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); -+ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); - lvcc->stats.x.aal5.service_rxcrc++; - lvcc->rx.buf.ptr = &lvcc->rx.buf.start[SERVICE_GET_END(s) * 4]; - cardvcc_write(lvcc, SERVICE_GET_END(s), vcc_rxreadptr); -diff -urNp linux-2.6.31.1/drivers/atm/nicstar.c 
linux-2.6.31.1/drivers/atm/nicstar.c ---- linux-2.6.31.1/drivers/atm/nicstar.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/nicstar.c 2009-10-01 20:12:42.000000000 -0400 -@@ -1723,7 +1723,7 @@ static int ns_send(struct atm_vcc *vcc, - if ((vc = (vc_map *) vcc->dev_data) == NULL) - { - printk("nicstar%d: vcc->dev_data == NULL on ns_send().\n", card->index); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb_any(skb); - return -EINVAL; - } -@@ -1731,7 +1731,7 @@ static int ns_send(struct atm_vcc *vcc, - if (!vc->tx) - { - printk("nicstar%d: Trying to transmit on a non-tx VC.\n", card->index); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb_any(skb); - return -EINVAL; - } -@@ -1739,7 +1739,7 @@ static int ns_send(struct atm_vcc *vcc, - if (vcc->qos.aal != ATM_AAL5 && vcc->qos.aal != ATM_AAL0) - { - printk("nicstar%d: Only AAL0 and AAL5 are supported.\n", card->index); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb_any(skb); - return -EINVAL; - } -@@ -1747,7 +1747,7 @@ static int ns_send(struct atm_vcc *vcc, - if (skb_shinfo(skb)->nr_frags != 0) - { - printk("nicstar%d: No scatter-gather yet.\n", card->index); -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb_any(skb); - return -EINVAL; - } -@@ -1792,11 +1792,11 @@ static int ns_send(struct atm_vcc *vcc, - - if (push_scqe(card, vc, scq, &scqe, skb) != 0) - { -- atomic_inc(&vcc->stats->tx_err); -+ atomic_inc_unchecked(&vcc->stats->tx_err); - dev_kfree_skb_any(skb); - return -EIO; - } -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - - return 0; - } -@@ -2111,14 +2111,14 @@ static void dequeue_rx(ns_dev *card, ns_ - { - printk("nicstar%d: Can't allocate buffers for aal0.\n", - card->index); -- atomic_add(i,&vcc->stats->rx_drop); -+ atomic_add_unchecked(i,&vcc->stats->rx_drop); - break; - } - if (!atm_charge(vcc, sb->truesize)) - { - RXPRINTK("nicstar%d: atm_charge() dropped aal0 packets.\n", - card->index); -- atomic_add(i-1,&vcc->stats->rx_drop); /* already increased by 1 */ -+ atomic_add_unchecked(i-1,&vcc->stats->rx_drop); /* already increased by 1 */ - dev_kfree_skb_any(sb); - break; - } -@@ -2133,7 +2133,7 @@ static void dequeue_rx(ns_dev *card, ns_ - ATM_SKB(sb)->vcc = vcc; - __net_timestamp(sb); - vcc->push(vcc, sb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - cell += ATM_CELL_PAYLOAD; - } - -@@ -2152,7 +2152,7 @@ static void dequeue_rx(ns_dev *card, ns_ - if (iovb == NULL) - { - printk("nicstar%d: Out of iovec buffers.\n", card->index); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - recycle_rx_buf(card, skb); - return; - } -@@ -2182,7 +2182,7 @@ static void dequeue_rx(ns_dev *card, ns_ - else if (NS_SKB(iovb)->iovcnt >= NS_MAX_IOVECS) - { - printk("nicstar%d: received too big AAL5 SDU.\n", card->index); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, NS_MAX_IOVECS); - NS_SKB(iovb)->iovcnt = 0; - iovb->len = 0; -@@ -2202,7 +2202,7 @@ static void dequeue_rx(ns_dev *card, ns_ - printk("nicstar%d: Expected a small buffer, and this is not one.\n", - card->index); - which_list(card, skb); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - recycle_rx_buf(card, skb); - vc->rx_iov = NULL; - recycle_iov_buf(card, 
iovb); -@@ -2216,7 +2216,7 @@ static void dequeue_rx(ns_dev *card, ns_ - printk("nicstar%d: Expected a large buffer, and this is not one.\n", - card->index); - which_list(card, skb); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, - NS_SKB(iovb)->iovcnt); - vc->rx_iov = NULL; -@@ -2240,7 +2240,7 @@ static void dequeue_rx(ns_dev *card, ns_ - printk(" - PDU size mismatch.\n"); - else - printk(".\n"); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, - NS_SKB(iovb)->iovcnt); - vc->rx_iov = NULL; -@@ -2256,7 +2256,7 @@ static void dequeue_rx(ns_dev *card, ns_ - if (!atm_charge(vcc, skb->truesize)) - { - push_rxbufs(card, skb); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - } - else - { -@@ -2268,7 +2268,7 @@ static void dequeue_rx(ns_dev *card, ns_ - ATM_SKB(skb)->vcc = vcc; - __net_timestamp(skb); - vcc->push(vcc, skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - } - } - else if (NS_SKB(iovb)->iovcnt == 2) /* One small plus one large buffer */ -@@ -2283,7 +2283,7 @@ static void dequeue_rx(ns_dev *card, ns_ - if (!atm_charge(vcc, sb->truesize)) - { - push_rxbufs(card, sb); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - } - else - { -@@ -2295,7 +2295,7 @@ static void dequeue_rx(ns_dev *card, ns_ - ATM_SKB(sb)->vcc = vcc; - __net_timestamp(sb); - vcc->push(vcc, sb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - } - - push_rxbufs(card, skb); -@@ -2306,7 +2306,7 @@ static void dequeue_rx(ns_dev *card, ns_ - if (!atm_charge(vcc, skb->truesize)) - { - push_rxbufs(card, skb); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - } - else - { -@@ -2320,7 +2320,7 @@ static void dequeue_rx(ns_dev *card, ns_ - ATM_SKB(skb)->vcc = vcc; - __net_timestamp(skb); - vcc->push(vcc, skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - } - - push_rxbufs(card, sb); -@@ -2342,7 +2342,7 @@ static void dequeue_rx(ns_dev *card, ns_ - if (hb == NULL) - { - printk("nicstar%d: Out of huge buffers.\n", card->index); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, - NS_SKB(iovb)->iovcnt); - vc->rx_iov = NULL; -@@ -2393,7 +2393,7 @@ static void dequeue_rx(ns_dev *card, ns_ - } - else - dev_kfree_skb_any(hb); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - } - else - { -@@ -2427,7 +2427,7 @@ static void dequeue_rx(ns_dev *card, ns_ - #endif /* NS_USE_DESTRUCTORS */ - __net_timestamp(hb); - vcc->push(vcc, hb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - } - } - -diff -urNp linux-2.6.31.1/drivers/atm/solos-pci.c linux-2.6.31.1/drivers/atm/solos-pci.c ---- linux-2.6.31.1/drivers/atm/solos-pci.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/solos-pci.c 2009-10-01 20:12:42.000000000 -0400 -@@ -663,7 +663,7 @@ void solos_bh(unsigned long card_arg) - } - atm_charge(vcc, skb->truesize); - vcc->push(vcc, skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - break; - - case PKT_STATUS: -@@ -966,7 +966,7 @@ static uint32_t fpga_tx(struct solos_car - vcc = SKB_CB(oldskb)->vcc; - - if (vcc) { -- atomic_inc(&vcc->stats->tx); -+ 
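/*
 * Ahead in suni.c and uPD98402.c, the ADD_LIMITED macros keep their
 * saturation logic while moving to the unchecked ops: a SONET counter
 * may wrap past INT_MAX, is observed negative, and is then pinned at
 * INT_MAX, so userspace sees a saturated value instead of a negative
 * one. The same logic as a plain function (sketch; well defined only
 * under the kernel's wrapping signed-arithmetic build flags; the
 * fpga_tx() hunk resumes below):
 */
#include <limits.h>

static void sketch_add_limited(int *counter, int delta)
{
	*counter += delta;		/* may wrap past INT_MAX */
	if (*counter < 0)
		*counter = INT_MAX;	/* pin at the maximum */
}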
atomic_inc_unchecked(&vcc->stats->tx); - solos_pop(vcc, oldskb); - } else - dev_kfree_skb_irq(oldskb); -diff -urNp linux-2.6.31.1/drivers/atm/suni.c linux-2.6.31.1/drivers/atm/suni.c ---- linux-2.6.31.1/drivers/atm/suni.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/suni.c 2009-10-01 20:12:42.000000000 -0400 -@@ -49,8 +49,8 @@ static DEFINE_SPINLOCK(sunis_lock); - - - #define ADD_LIMITED(s,v) \ -- atomic_add((v),&stats->s); \ -- if (atomic_read(&stats->s) < 0) atomic_set(&stats->s,INT_MAX); -+ atomic_add_unchecked((v),&stats->s); \ -+ if (atomic_read_unchecked(&stats->s) < 0) atomic_set_unchecked(&stats->s,INT_MAX); - - - static void suni_hz(unsigned long from_timer) -diff -urNp linux-2.6.31.1/drivers/atm/uPD98402.c linux-2.6.31.1/drivers/atm/uPD98402.c ---- linux-2.6.31.1/drivers/atm/uPD98402.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/uPD98402.c 2009-10-01 20:12:42.000000000 -0400 -@@ -41,7 +41,7 @@ static int fetch_stats(struct atm_dev *d - struct sonet_stats tmp; - int error = 0; - -- atomic_add(GET(HECCT),&PRIV(dev)->sonet_stats.uncorr_hcs); -+ atomic_add_unchecked(GET(HECCT),&PRIV(dev)->sonet_stats.uncorr_hcs); - sonet_copy_stats(&PRIV(dev)->sonet_stats,&tmp); - if (arg) error = copy_to_user(arg,&tmp,sizeof(tmp)); - if (zero && !error) { -@@ -160,9 +160,9 @@ static int uPD98402_ioctl(struct atm_dev - - - #define ADD_LIMITED(s,v) \ -- { atomic_add(GET(v),&PRIV(dev)->sonet_stats.s); \ -- if (atomic_read(&PRIV(dev)->sonet_stats.s) < 0) \ -- atomic_set(&PRIV(dev)->sonet_stats.s,INT_MAX); } -+ { atomic_add_unchecked(GET(v),&PRIV(dev)->sonet_stats.s); \ -+ if (atomic_read_unchecked(&PRIV(dev)->sonet_stats.s) < 0) \ -+ atomic_set_unchecked(&PRIV(dev)->sonet_stats.s,INT_MAX); } - - - static void stat_event(struct atm_dev *dev) -@@ -193,7 +193,7 @@ static void uPD98402_int(struct atm_dev - if (reason & uPD98402_INT_PFM) stat_event(dev); - if (reason & uPD98402_INT_PCO) { - (void) GET(PCOCR); /* clear interrupt cause */ -- atomic_add(GET(HECCT), -+ atomic_add_unchecked(GET(HECCT), - &PRIV(dev)->sonet_stats.uncorr_hcs); - } - if ((reason & uPD98402_INT_RFO) && -@@ -221,9 +221,9 @@ static int uPD98402_start(struct atm_dev - PUT(~(uPD98402_INT_PFM | uPD98402_INT_ALM | uPD98402_INT_RFO | - uPD98402_INT_LOS),PIMR); /* enable them */ - (void) fetch_stats(dev,NULL,1); /* clear kernel counters */ -- atomic_set(&PRIV(dev)->sonet_stats.corr_hcs,-1); -- atomic_set(&PRIV(dev)->sonet_stats.tx_cells,-1); -- atomic_set(&PRIV(dev)->sonet_stats.rx_cells,-1); -+ atomic_set_unchecked(&PRIV(dev)->sonet_stats.corr_hcs,-1); -+ atomic_set_unchecked(&PRIV(dev)->sonet_stats.tx_cells,-1); -+ atomic_set_unchecked(&PRIV(dev)->sonet_stats.rx_cells,-1); - return 0; - } - -diff -urNp linux-2.6.31.1/drivers/atm/zatm.c linux-2.6.31.1/drivers/atm/zatm.c ---- linux-2.6.31.1/drivers/atm/zatm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/atm/zatm.c 2009-10-01 20:12:42.000000000 -0400 -@@ -458,7 +458,7 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy - } - if (!size) { - dev_kfree_skb_irq(skb); -- if (vcc) atomic_inc(&vcc->stats->rx_err); -+ if (vcc) atomic_inc_unchecked(&vcc->stats->rx_err); - continue; - } - if (!atm_charge(vcc,skb->truesize)) { -@@ -468,7 +468,7 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy - skb->len = size; - ATM_SKB(skb)->vcc = vcc; - vcc->push(vcc,skb); -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - } - zout(pos & 0xffff,MTA(mbx)); - #if 0 /* probably a stupid idea */ -@@ -732,7 +732,7 @@ if (*ZATM_PRV_DSC(skb) != 
(uPD98401_TXPD - skb_queue_head(&zatm_vcc->backlog,skb); - break; - } -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - wake_up(&zatm_vcc->tx_wait); - } - -diff -urNp linux-2.6.31.1/drivers/block/cciss.c linux-2.6.31.1/drivers/block/cciss.c ---- linux-2.6.31.1/drivers/block/cciss.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/block/cciss.c 2009-10-01 20:12:42.000000000 -0400 -@@ -363,7 +363,7 @@ static void cciss_seq_stop(struct seq_fi - h->busy_configuring = 0; - } - --static struct seq_operations cciss_seq_ops = { -+static const struct seq_operations cciss_seq_ops = { - .start = cciss_seq_start, - .show = cciss_seq_show, - .next = cciss_seq_next, -@@ -426,7 +426,7 @@ out: - return err; - } - --static struct file_operations cciss_proc_fops = { -+static const struct file_operations cciss_proc_fops = { - .owner = THIS_MODULE, - .open = cciss_seq_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/drivers/char/agp/agp.h linux-2.6.31.1/drivers/char/agp/agp.h ---- linux-2.6.31.1/drivers/char/agp/agp.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/agp/agp.h 2009-10-01 20:12:42.000000000 -0400 -@@ -126,7 +126,7 @@ struct agp_bridge_driver { - struct agp_bridge_data { - const struct agp_version *version; - const struct agp_bridge_driver *driver; -- struct vm_operations_struct *vm_ops; -+ const struct vm_operations_struct *vm_ops; - void *previous_size; - void *current_size; - void *dev_private_data; -diff -urNp linux-2.6.31.1/drivers/char/agp/alpha-agp.c linux-2.6.31.1/drivers/char/agp/alpha-agp.c ---- linux-2.6.31.1/drivers/char/agp/alpha-agp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/agp/alpha-agp.c 2009-10-01 20:12:42.000000000 -0400 -@@ -40,7 +40,7 @@ static struct aper_size_info_fixed alpha - { 0, 0, 0 }, /* filled in by alpha_core_agp_setup */ - }; - --struct vm_operations_struct alpha_core_agp_vm_ops = { -+const struct vm_operations_struct alpha_core_agp_vm_ops = { - .fault = alpha_core_agp_vm_fault, - }; - -diff -urNp linux-2.6.31.1/drivers/char/agp/frontend.c linux-2.6.31.1/drivers/char/agp/frontend.c ---- linux-2.6.31.1/drivers/char/agp/frontend.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/agp/frontend.c 2009-10-01 20:12:42.000000000 -0400 -@@ -824,7 +824,7 @@ static int agpioc_reserve_wrap(struct ag - if (copy_from_user(&reserve, arg, sizeof(struct agp_region))) - return -EFAULT; - -- if ((unsigned) reserve.seg_count >= ~0U/sizeof(struct agp_segment)) -+ if ((unsigned) reserve.seg_count >= ~0U/sizeof(struct agp_segment_priv)) - return -EFAULT; - - client = agp_find_client_by_pid(reserve.pid); -diff -urNp linux-2.6.31.1/drivers/char/agp/intel-agp.c linux-2.6.31.1/drivers/char/agp/intel-agp.c ---- linux-2.6.31.1/drivers/char/agp/intel-agp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/agp/intel-agp.c 2009-10-01 20:12:42.000000000 -0400 -@@ -2395,7 +2395,7 @@ static struct pci_device_id agp_intel_pc - ID(PCI_DEVICE_ID_INTEL_IGDNG_D_HB), - ID(PCI_DEVICE_ID_INTEL_IGDNG_M_HB), - ID(PCI_DEVICE_ID_INTEL_IGDNG_MA_HB), -- { } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, agp_intel_pci_table); -diff -urNp linux-2.6.31.1/drivers/char/apm-emulation.c linux-2.6.31.1/drivers/char/apm-emulation.c ---- linux-2.6.31.1/drivers/char/apm-emulation.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/apm-emulation.c 2009-10-01 20:12:42.000000000 -0400 -@@ -393,7 +393,7 @@ static int apm_open(struct inode * inode - return as ? 
0 : -ENOMEM; - } - --static struct file_operations apm_bios_fops = { -+static const struct file_operations apm_bios_fops = { - .owner = THIS_MODULE, - .read = apm_read, - .poll = apm_poll, -diff -urNp linux-2.6.31.1/drivers/char/bfin-otp.c linux-2.6.31.1/drivers/char/bfin-otp.c ---- linux-2.6.31.1/drivers/char/bfin-otp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/bfin-otp.c 2009-10-01 20:12:42.000000000 -0400 -@@ -133,7 +133,7 @@ static ssize_t bfin_otp_write(struct fil - # define bfin_otp_write NULL - #endif - --static struct file_operations bfin_otp_fops = { -+static const struct file_operations bfin_otp_fops = { - .owner = THIS_MODULE, - .read = bfin_otp_read, - .write = bfin_otp_write, -diff -urNp linux-2.6.31.1/drivers/char/hpet.c linux-2.6.31.1/drivers/char/hpet.c ---- linux-2.6.31.1/drivers/char/hpet.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/hpet.c 2009-10-01 20:12:42.000000000 -0400 -@@ -995,7 +995,7 @@ static struct acpi_driver hpet_acpi_driv - }, - }; - --static struct miscdevice hpet_misc = { HPET_MINOR, "hpet", &hpet_fops }; -+static struct miscdevice hpet_misc = { HPET_MINOR, "hpet", &hpet_fops, {NULL, NULL}, NULL, NULL }; - - static int __init hpet_init(void) - { -diff -urNp linux-2.6.31.1/drivers/char/hvcs.c linux-2.6.31.1/drivers/char/hvcs.c ---- linux-2.6.31.1/drivers/char/hvcs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/hvcs.c 2009-10-01 20:12:42.000000000 -0400 -@@ -269,7 +269,7 @@ struct hvcs_struct { - unsigned int index; - - struct tty_struct *tty; -- int open_count; -+ atomic_t open_count; - - /* - * Used to tell the driver kernel_thread what operations need to take -@@ -419,7 +419,7 @@ static ssize_t hvcs_vterm_state_store(st - - spin_lock_irqsave(&hvcsd->lock, flags); - -- if (hvcsd->open_count > 0) { -+ if (atomic_read(&hvcsd->open_count) > 0) { - spin_unlock_irqrestore(&hvcsd->lock, flags); - printk(KERN_INFO "HVCS: vterm state unchanged. 
" - "The hvcs device node is still in use.\n"); -@@ -1135,7 +1135,7 @@ static int hvcs_open(struct tty_struct * - if ((retval = hvcs_partner_connect(hvcsd))) - goto error_release; - -- hvcsd->open_count = 1; -+ atomic_set(&hvcsd->open_count, 1); - hvcsd->tty = tty; - tty->driver_data = hvcsd; - -@@ -1169,7 +1169,7 @@ fast_open: - - spin_lock_irqsave(&hvcsd->lock, flags); - kref_get(&hvcsd->kref); -- hvcsd->open_count++; -+ atomic_inc(&hvcsd->open_count); - hvcsd->todo_mask |= HVCS_SCHED_READ; - spin_unlock_irqrestore(&hvcsd->lock, flags); - -@@ -1213,7 +1213,7 @@ static void hvcs_close(struct tty_struct - hvcsd = tty->driver_data; - - spin_lock_irqsave(&hvcsd->lock, flags); -- if (--hvcsd->open_count == 0) { -+ if (atomic_dec_and_test(&hvcsd->open_count)) { - - vio_disable_interrupts(hvcsd->vdev); - -@@ -1239,10 +1239,10 @@ static void hvcs_close(struct tty_struct - free_irq(irq, hvcsd); - kref_put(&hvcsd->kref, destroy_hvcs_struct); - return; -- } else if (hvcsd->open_count < 0) { -+ } else if (atomic_read(&hvcsd->open_count) < 0) { - printk(KERN_ERR "HVCS: vty-server@%X open_count: %d" - " is missmanaged.\n", -- hvcsd->vdev->unit_address, hvcsd->open_count); -+ hvcsd->vdev->unit_address, atomic_read(&hvcsd->open_count)); - } - - spin_unlock_irqrestore(&hvcsd->lock, flags); -@@ -1258,7 +1258,7 @@ static void hvcs_hangup(struct tty_struc - - spin_lock_irqsave(&hvcsd->lock, flags); - /* Preserve this so that we know how many kref refs to put */ -- temp_open_count = hvcsd->open_count; -+ temp_open_count = atomic_read(&hvcsd->open_count); - - /* - * Don't kref put inside the spinlock because the destruction -@@ -1273,7 +1273,7 @@ static void hvcs_hangup(struct tty_struc - hvcsd->tty->driver_data = NULL; - hvcsd->tty = NULL; - -- hvcsd->open_count = 0; -+ atomic_set(&hvcsd->open_count, 0); - - /* This will drop any buffered data on the floor which is OK in a hangup - * scenario. */ -@@ -1344,7 +1344,7 @@ static int hvcs_write(struct tty_struct - * the middle of a write operation? This is a crummy place to do this - * but we want to keep it all in the spinlock. 
- */ -- if (hvcsd->open_count <= 0) { -+ if (atomic_read(&hvcsd->open_count) <= 0) { - spin_unlock_irqrestore(&hvcsd->lock, flags); - return -ENODEV; - } -@@ -1418,7 +1418,7 @@ static int hvcs_write_room(struct tty_st - { - struct hvcs_struct *hvcsd = tty->driver_data; - -- if (!hvcsd || hvcsd->open_count <= 0) -+ if (!hvcsd || atomic_read(&hvcsd->open_count) <= 0) - return 0; - - return HVCS_BUFF_LEN - hvcsd->chars_in_buffer; -diff -urNp linux-2.6.31.1/drivers/char/ipmi/ipmi_msghandler.c linux-2.6.31.1/drivers/char/ipmi/ipmi_msghandler.c ---- linux-2.6.31.1/drivers/char/ipmi/ipmi_msghandler.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/ipmi/ipmi_msghandler.c 2009-10-01 20:12:42.000000000 -0400 -@@ -413,7 +413,7 @@ struct ipmi_smi { - struct proc_dir_entry *proc_dir; - char proc_dir_name[10]; - -- atomic_t stats[IPMI_NUM_STATS]; -+ atomic_unchecked_t stats[IPMI_NUM_STATS]; - - /* - * run_to_completion duplicate of smb_info, smi_info -@@ -446,9 +446,9 @@ static DEFINE_MUTEX(smi_watchers_mutex); - - - #define ipmi_inc_stat(intf, stat) \ -- atomic_inc(&(intf)->stats[IPMI_STAT_ ## stat]) -+ atomic_inc_unchecked(&(intf)->stats[IPMI_STAT_ ## stat]) - #define ipmi_get_stat(intf, stat) \ -- ((unsigned int) atomic_read(&(intf)->stats[IPMI_STAT_ ## stat])) -+ ((unsigned int) atomic_read_unchecked(&(intf)->stats[IPMI_STAT_ ## stat])) - - static int is_lan_addr(struct ipmi_addr *addr) - { -@@ -2807,7 +2807,7 @@ int ipmi_register_smi(struct ipmi_smi_ha - INIT_LIST_HEAD(&intf->cmd_rcvrs); - init_waitqueue_head(&intf->waitq); - for (i = 0; i < IPMI_NUM_STATS; i++) -- atomic_set(&intf->stats[i], 0); -+ atomic_set_unchecked(&intf->stats[i], 0); - - intf->proc_dir = NULL; - -diff -urNp linux-2.6.31.1/drivers/char/ipmi/ipmi_si_intf.c linux-2.6.31.1/drivers/char/ipmi/ipmi_si_intf.c ---- linux-2.6.31.1/drivers/char/ipmi/ipmi_si_intf.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/ipmi/ipmi_si_intf.c 2009-10-01 20:12:42.000000000 -0400 -@@ -277,7 +277,7 @@ struct smi_info { - unsigned char slave_addr; - - /* Counters and things for the proc filesystem. 
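The ATM and IPMI hunks in this stretch convert pure statistics counters from atomic_t to atomic_unchecked_t and switch the accessors to atomic_inc_unchecked()/atomic_read_unchecked(). Under the PaX REFCOUNT hardening that this grsecurity patch carries, plain atomic_t arithmetic is instrumented to detect overflow so that reference counts cannot wrap; counters that only count events, where wrapping is harmless, are opted out through the *_unchecked variants. A rough userspace C11 analogue of that distinction, with illustrative names (the kernel API is atomic_inc() vs. atomic_inc_unchecked(), not the functions below):

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    static atomic_int refcount;     /* lifetime-critical: a wrap is exploitable */
    static atomic_uint rx_packets;  /* pure statistic: a wrap is harmless */

    /* "checked" increment: detect the wrap past INT_MAX and bail out,
     * roughly what the REFCOUNT instrumentation does for atomic_inc() */
    static void ref_get(void)
    {
            if (atomic_fetch_add(&refcount, 1) == INT_MAX)
                    abort();
    }

    /* "unchecked" increment: plain modular arithmetic, the analogue of
     * atomic_inc_unchecked() on an atomic_unchecked_t */
    static void count_rx(void)
    {
            atomic_fetch_add(&rx_packets, 1);
    }

    int main(void)
    {
            ref_get();
            count_rx();
            return 0;
    }

That split is why the stats arrays here change type while the surrounding locking stays untouched.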
*/ -- atomic_t stats[SI_NUM_STATS]; -+ atomic_unchecked_t stats[SI_NUM_STATS]; - - struct task_struct *thread; - -@@ -285,9 +285,9 @@ struct smi_info { - }; - - #define smi_inc_stat(smi, stat) \ -- atomic_inc(&(smi)->stats[SI_STAT_ ## stat]) -+ atomic_inc_unchecked(&(smi)->stats[SI_STAT_ ## stat]) - #define smi_get_stat(smi, stat) \ -- ((unsigned int) atomic_read(&(smi)->stats[SI_STAT_ ## stat])) -+ ((unsigned int) atomic_read_unchecked(&(smi)->stats[SI_STAT_ ## stat])) - - #define SI_MAX_PARMS 4 - -@@ -2926,7 +2926,7 @@ static int try_smi_init(struct smi_info - atomic_set(&new_smi->req_events, 0); - new_smi->run_to_completion = 0; - for (i = 0; i < SI_NUM_STATS; i++) -- atomic_set(&new_smi->stats[i], 0); -+ atomic_set_unchecked(&new_smi->stats[i], 0); - - new_smi->interrupt_disabled = 0; - atomic_set(&new_smi->stop_operation, 0); -diff -urNp linux-2.6.31.1/drivers/char/keyboard.c linux-2.6.31.1/drivers/char/keyboard.c ---- linux-2.6.31.1/drivers/char/keyboard.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/keyboard.c 2009-10-01 20:12:42.000000000 -0400 -@@ -635,6 +635,16 @@ static void k_spec(struct vc_data *vc, u - kbd->kbdmode == VC_MEDIUMRAW) && - value != KVAL(K_SAK)) - return; /* SAK is allowed even in raw mode */ -+ -+#if defined(CONFIG_GRKERNSEC_PROC) || defined(CONFIG_GRKERNSEC_PROC_MEMMAP) -+ { -+ void *func = fn_handler[value]; -+ if (func == fn_show_state || func == fn_show_ptregs || -+ func == fn_show_mem) -+ return; -+ } -+#endif -+ - fn_handler[value](vc); - } - -@@ -1386,7 +1396,7 @@ static const struct input_device_id kbd_ - .evbit = { BIT_MASK(EV_SND) }, - }, - -- { }, /* Terminating entry */ -+ { 0 }, /* Terminating entry */ - }; - - MODULE_DEVICE_TABLE(input, kbd_ids); -diff -urNp linux-2.6.31.1/drivers/char/mem.c linux-2.6.31.1/drivers/char/mem.c ---- linux-2.6.31.1/drivers/char/mem.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/mem.c 2009-10-01 20:12:42.000000000 -0400 -@@ -18,6 +18,7 @@ - #include <linux/raw.h> - #include <linux/tty.h> - #include <linux/capability.h> -+#include <linux/security.h> - #include <linux/ptrace.h> - #include <linux/device.h> - #include <linux/highmem.h> -@@ -35,6 +36,10 @@ - # include <linux/efi.h> - #endif - -+#if defined(CONFIG_GRKERNSEC) && !defined(CONFIG_GRKERNSEC_NO_RBAC) -+extern struct file_operations grsec_fops; -+#endif -+ - /* - * Architectures vary in how they handle caching for addresses - * outside of main memory. 
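The drivers/char/mem.c hunks below put grsecurity's gr_handle_mem_write()/gr_handle_kmem_write() hooks at the very top of the /dev/mem and /dev/kmem write paths, and give /dev/mem its own open_mem() instead of reusing open_port(), so under CONFIG_GRKERNSEC_KMEM a write is logged and refused with -EPERM before any address validation or copying takes place. A minimal sketch of that early-gate shape; CONFIG_HARDENED_MEM and audit_mem_write() are hypothetical stand-ins for the grsecurity config option and hook:

    #include <errno.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <sys/types.h>

    #define CONFIG_HARDENED_MEM 1   /* stand-in for CONFIG_GRKERNSEC_KMEM */

    /* hypothetical audit hook, in the spirit of gr_handle_mem_write() */
    static void audit_mem_write(void)
    {
            fprintf(stderr, "denied write to /dev/mem\n");
    }

    static ssize_t write_mem(const char *buf, size_t count, off_t *ppos)
    {
            (void)buf;
            (void)ppos;
    #ifdef CONFIG_HARDENED_MEM
            /* refuse before touching the target range at all */
            (void)count;
            audit_mem_write();
            return -EPERM;
    #else
            return (ssize_t)count;  /* pretend the write went through */
    #endif
    }

    int main(void)
    {
            off_t pos = 0;
            printf("write_mem() -> %zd\n", write_mem("x", 1, &pos));
            return 0;
    }

In the actual hunk the gate is simply prepended under #ifdef, leaving the original function body to run when the option is off.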
-@@ -192,6 +197,11 @@ static ssize_t write_mem(struct file * f - if (!valid_phys_addr_range(p, count)) - return -EFAULT; - -+#ifdef CONFIG_GRKERNSEC_KMEM -+ gr_handle_mem_write(); -+ return -EPERM; -+#endif -+ - written = 0; - - #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED -@@ -301,7 +311,7 @@ static inline int private_mapping_ok(str - } - #endif - --static struct vm_operations_struct mmap_mem_ops = { -+static const struct vm_operations_struct mmap_mem_ops = { - #ifdef CONFIG_HAVE_IOREMAP_PROT - .access = generic_access_phys - #endif -@@ -324,6 +334,11 @@ static int mmap_mem(struct file * file, - &vma->vm_page_prot)) - return -EINVAL; - -+#ifdef CONFIG_GRKERNSEC_KMEM -+ if (gr_handle_mem_mmap(vma->vm_pgoff << PAGE_SHIFT, vma)) -+ return -EPERM; -+#endif -+ - vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, - size, - vma->vm_page_prot); -@@ -558,6 +573,11 @@ static ssize_t write_kmem(struct file * - ssize_t written; - char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ - -+#ifdef CONFIG_GRKERNSEC_KMEM -+ gr_handle_kmem_write(); -+ return -EPERM; -+#endif -+ - if (p < (unsigned long) high_memory) { - - wrote = count; -@@ -763,6 +783,16 @@ static loff_t memory_lseek(struct file * - - static int open_port(struct inode * inode, struct file * filp) - { -+#ifdef CONFIG_GRKERNSEC_KMEM -+ gr_handle_open_port(); -+ return -EPERM; -+#endif -+ -+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -+} -+ -+static int open_mem(struct inode * inode, struct file * filp) -+{ - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; - } - -@@ -770,7 +800,6 @@ static int open_port(struct inode * inod - #define full_lseek null_lseek - #define write_zero write_null - #define read_full read_zero --#define open_mem open_port - #define open_kmem open_mem - #define open_oldmem open_mem - -@@ -888,6 +917,9 @@ static const struct { - #ifdef CONFIG_CRASH_DUMP - {12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops, NULL}, - #endif -+#if defined(CONFIG_GRKERNSEC) && !defined(CONFIG_GRKERNSEC_NO_RBAC) -+ {13,"grsec", S_IRUSR | S_IWUGO, &grsec_fops}, -+#endif - }; - - static int memory_open(struct inode *inode, struct file *filp) -diff -urNp linux-2.6.31.1/drivers/char/misc.c linux-2.6.31.1/drivers/char/misc.c ---- linux-2.6.31.1/drivers/char/misc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/misc.c 2009-10-01 20:12:42.000000000 -0400 -@@ -91,7 +91,7 @@ static int misc_seq_show(struct seq_file - } - - --static struct seq_operations misc_seq_ops = { -+static const struct seq_operations misc_seq_ops = { - .start = misc_seq_start, - .next = misc_seq_next, - .stop = misc_seq_stop, -diff -urNp linux-2.6.31.1/drivers/char/mspec.c linux-2.6.31.1/drivers/char/mspec.c ---- linux-2.6.31.1/drivers/char/mspec.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/mspec.c 2009-10-01 20:12:43.000000000 -0400 -@@ -239,7 +239,7 @@ mspec_fault(struct vm_area_struct *vma, - return VM_FAULT_NOPAGE; - } - --static struct vm_operations_struct mspec_vm_ops = { -+static const struct vm_operations_struct mspec_vm_ops = { - .open = mspec_open, - .close = mspec_close, - .fault = mspec_fault, -diff -urNp linux-2.6.31.1/drivers/char/nvram.c linux-2.6.31.1/drivers/char/nvram.c ---- linux-2.6.31.1/drivers/char/nvram.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/nvram.c 2009-10-01 20:12:43.000000000 -0400 -@@ -429,7 +429,10 @@ static const struct file_operations nvra - static struct miscdevice nvram_dev = { - NVRAM_MINOR, - "nvram", -- &nvram_fops -+ &nvram_fops, -+ {NULL, NULL}, 
-+ NULL, -+ NULL - }; - - static int __init nvram_init(void) -diff -urNp linux-2.6.31.1/drivers/char/pcmcia/ipwireless/tty.c linux-2.6.31.1/drivers/char/pcmcia/ipwireless/tty.c ---- linux-2.6.31.1/drivers/char/pcmcia/ipwireless/tty.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/pcmcia/ipwireless/tty.c 2009-10-01 20:12:43.000000000 -0400 -@@ -51,7 +51,7 @@ struct ipw_tty { - int tty_type; - struct ipw_network *network; - struct tty_struct *linux_tty; -- int open_count; -+ atomic_t open_count; - unsigned int control_lines; - struct mutex ipw_tty_mutex; - int tx_bytes_queued; -@@ -127,10 +127,10 @@ static int ipw_open(struct tty_struct *l - mutex_unlock(&tty->ipw_tty_mutex); - return -ENODEV; - } -- if (tty->open_count == 0) -+ if (atomic_read(&tty->open_count) == 0) - tty->tx_bytes_queued = 0; - -- tty->open_count++; -+ atomic_inc(&tty->open_count); - - tty->linux_tty = linux_tty; - linux_tty->driver_data = tty; -@@ -146,9 +146,7 @@ static int ipw_open(struct tty_struct *l - - static void do_ipw_close(struct ipw_tty *tty) - { -- tty->open_count--; -- -- if (tty->open_count == 0) { -+ if (atomic_dec_return(&tty->open_count) == 0) { - struct tty_struct *linux_tty = tty->linux_tty; - - if (linux_tty != NULL) { -@@ -169,7 +167,7 @@ static void ipw_hangup(struct tty_struct - return; - - mutex_lock(&tty->ipw_tty_mutex); -- if (tty->open_count == 0) { -+ if (atomic_read(&tty->open_count) == 0) { - mutex_unlock(&tty->ipw_tty_mutex); - return; - } -@@ -198,7 +196,7 @@ void ipwireless_tty_received(struct ipw_ - return; - } - -- if (!tty->open_count) { -+ if (!atomic_read(&tty->open_count)) { - mutex_unlock(&tty->ipw_tty_mutex); - return; - } -@@ -240,7 +238,7 @@ static int ipw_write(struct tty_struct * - return -ENODEV; - - mutex_lock(&tty->ipw_tty_mutex); -- if (!tty->open_count) { -+ if (!atomic_read(&tty->open_count)) { - mutex_unlock(&tty->ipw_tty_mutex); - return -EINVAL; - } -@@ -280,7 +278,7 @@ static int ipw_write_room(struct tty_str - if (!tty) - return -ENODEV; - -- if (!tty->open_count) -+ if (!atomic_read(&tty->open_count)) - return -EINVAL; - - room = IPWIRELESS_TX_QUEUE_SIZE - tty->tx_bytes_queued; -@@ -322,7 +320,7 @@ static int ipw_chars_in_buffer(struct tt - if (!tty) - return 0; - -- if (!tty->open_count) -+ if (!atomic_read(&tty->open_count)) - return 0; - - return tty->tx_bytes_queued; -@@ -403,7 +401,7 @@ static int ipw_tiocmget(struct tty_struc - if (!tty) - return -ENODEV; - -- if (!tty->open_count) -+ if (!atomic_read(&tty->open_count)) - return -EINVAL; - - return get_control_lines(tty); -@@ -419,7 +417,7 @@ ipw_tiocmset(struct tty_struct *linux_tt - if (!tty) - return -ENODEV; - -- if (!tty->open_count) -+ if (!atomic_read(&tty->open_count)) - return -EINVAL; - - return set_control_lines(tty, set, clear); -@@ -433,7 +431,7 @@ static int ipw_ioctl(struct tty_struct * - if (!tty) - return -ENODEV; - -- if (!tty->open_count) -+ if (!atomic_read(&tty->open_count)) - return -EINVAL; - - /* FIXME: Exactly how is the tty object locked here .. 
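The ipwireless hunks around this point, like the hvcs ones above and the sonypi and gigaset ones further down, retype a driver's open_count from a plain int (often guarded by a spinlock or mutex) to atomic_t, with atomic_inc_return() == 1 detecting the first open and atomic_dec_and_test() the last close. A compact userspace C11 analogue of that lifecycle pattern (the kernel helpers differ in name; fetch-style atomics are used here so the sketch stays standard C):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int open_count;

    static void dev_open(void)
    {
            /* old value 0 means this caller took the count from 0 to 1,
             * so only it runs the first-open initialization */
            if (atomic_fetch_add(&open_count, 1) == 0)
                    puts("first open: set up device state");
    }

    static void dev_close(void)
    {
            /* old value 1 means this caller dropped the last reference */
            if (atomic_fetch_sub(&open_count, 1) == 1)
                    puts("last close: tear down device state");
    }

    int main(void)
    {
            dev_open();
            dev_open();     /* fast path, no re-initialization */
            dev_close();
            dev_close();    /* triggers the teardown branch */
            return 0;
    }

The apparent point of the conversion is that reading or bumping the counter no longer requires holding the surrounding lock.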
*/ -@@ -591,7 +589,7 @@ void ipwireless_tty_free(struct ipw_tty - against a parallel ioctl etc */ - mutex_lock(&ttyj->ipw_tty_mutex); - } -- while (ttyj->open_count) -+ while (atomic_read(&ttyj->open_count)) - do_ipw_close(ttyj); - ipwireless_disassociate_network_ttys(network, - ttyj->channel_idx); -diff -urNp linux-2.6.31.1/drivers/char/random.c linux-2.6.31.1/drivers/char/random.c ---- linux-2.6.31.1/drivers/char/random.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/random.c 2009-10-01 20:12:43.000000000 -0400 -@@ -253,8 +253,13 @@ - /* - * Configuration information - */ -+#ifdef CONFIG_GRKERNSEC_RANDNET -+#define INPUT_POOL_WORDS 512 -+#define OUTPUT_POOL_WORDS 128 -+#else - #define INPUT_POOL_WORDS 128 - #define OUTPUT_POOL_WORDS 32 -+#endif - #define SEC_XFER_SIZE 512 - - /* -@@ -291,10 +296,17 @@ static struct poolinfo { - int poolwords; - int tap1, tap2, tap3, tap4, tap5; - } poolinfo_table[] = { -+#ifdef CONFIG_GRKERNSEC_RANDNET -+ /* x^512 + x^411 + x^308 + x^208 +x^104 + x + 1 -- 225 */ -+ { 512, 411, 308, 208, 104, 1 }, -+ /* x^128 + x^103 + x^76 + x^51 + x^25 + x + 1 -- 105 */ -+ { 128, 103, 76, 51, 25, 1 }, -+#else - /* x^128 + x^103 + x^76 + x^51 +x^25 + x + 1 -- 105 */ - { 128, 103, 76, 51, 25, 1 }, - /* x^32 + x^26 + x^20 + x^14 + x^7 + x + 1 -- 15 */ - { 32, 26, 20, 14, 7, 1 }, -+#endif - #if 0 - /* x^2048 + x^1638 + x^1231 + x^819 + x^411 + x + 1 -- 115 */ - { 2048, 1638, 1231, 819, 411, 1 }, -@@ -1204,7 +1216,7 @@ EXPORT_SYMBOL(generate_random_uuid); - #include <linux/sysctl.h> - - static int min_read_thresh = 8, min_write_thresh; --static int max_read_thresh = INPUT_POOL_WORDS * 32; -+static int max_read_thresh = OUTPUT_POOL_WORDS * 32; - static int max_write_thresh = INPUT_POOL_WORDS * 32; - static char sysctl_bootid[16]; - -diff -urNp linux-2.6.31.1/drivers/char/sonypi.c linux-2.6.31.1/drivers/char/sonypi.c ---- linux-2.6.31.1/drivers/char/sonypi.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/sonypi.c 2009-10-01 20:12:43.000000000 -0400 -@@ -490,7 +490,7 @@ static struct sonypi_device { - spinlock_t fifo_lock; - wait_queue_head_t fifo_proc_list; - struct fasync_struct *fifo_async; -- int open_count; -+ atomic_t open_count; - int model; - struct input_dev *input_jog_dev; - struct input_dev *input_key_dev; -@@ -894,7 +894,7 @@ static int sonypi_misc_fasync(int fd, st - static int sonypi_misc_release(struct inode *inode, struct file *file) - { - mutex_lock(&sonypi_device.lock); -- sonypi_device.open_count--; -+ atomic_dec(&sonypi_device.open_count); - mutex_unlock(&sonypi_device.lock); - return 0; - } -@@ -904,9 +904,9 @@ static int sonypi_misc_open(struct inode - lock_kernel(); - mutex_lock(&sonypi_device.lock); - /* Flush input queue on first open */ -- if (!sonypi_device.open_count) -+ if (!atomic_read(&sonypi_device.open_count)) - kfifo_reset(sonypi_device.fifo); -- sonypi_device.open_count++; -+ atomic_inc(&sonypi_device.open_count); - mutex_unlock(&sonypi_device.lock); - unlock_kernel(); - return 0; -diff -urNp linux-2.6.31.1/drivers/char/tpm/tpm_bios.c linux-2.6.31.1/drivers/char/tpm/tpm_bios.c ---- linux-2.6.31.1/drivers/char/tpm/tpm_bios.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/tpm/tpm_bios.c 2009-10-01 20:12:43.000000000 -0400 -@@ -343,14 +343,14 @@ static int tpm_ascii_bios_measurements_s - return 0; - } - --static struct seq_operations tpm_ascii_b_measurments_seqops = { -+static const struct seq_operations tpm_ascii_b_measurments_seqops = { - .start = tpm_bios_measurements_start, - 
.next = tpm_bios_measurements_next, - .stop = tpm_bios_measurements_stop, - .show = tpm_ascii_bios_measurements_show, - }; - --static struct seq_operations tpm_binary_b_measurments_seqops = { -+static const struct seq_operations tpm_binary_b_measurments_seqops = { - .start = tpm_bios_measurements_start, - .next = tpm_bios_measurements_next, - .stop = tpm_bios_measurements_stop, -diff -urNp linux-2.6.31.1/drivers/char/tty_ldisc.c linux-2.6.31.1/drivers/char/tty_ldisc.c ---- linux-2.6.31.1/drivers/char/tty_ldisc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/tty_ldisc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -73,7 +73,7 @@ static void put_ldisc(struct tty_ldisc * - if (atomic_dec_and_lock(&ld->users, &tty_ldisc_lock)) { - struct tty_ldisc_ops *ldo = ld->ops; - -- ldo->refcount--; -+ atomic_dec(&ldo->refcount); - module_put(ldo->owner); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - -@@ -107,7 +107,7 @@ int tty_register_ldisc(int disc, struct - spin_lock_irqsave(&tty_ldisc_lock, flags); - tty_ldiscs[disc] = new_ldisc; - new_ldisc->num = disc; -- new_ldisc->refcount = 0; -+ atomic_set(&new_ldisc->refcount, 0); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - - return ret; -@@ -135,7 +135,7 @@ int tty_unregister_ldisc(int disc) - return -EINVAL; - - spin_lock_irqsave(&tty_ldisc_lock, flags); -- if (tty_ldiscs[disc]->refcount) -+ if (atomic_read(&tty_ldiscs[disc]->refcount)) - ret = -EBUSY; - else - tty_ldiscs[disc] = NULL; -@@ -175,7 +175,7 @@ static struct tty_ldisc *tty_ldisc_try_g - err = -EAGAIN; - else { - /* lock it */ -- ldops->refcount++; -+ atomic_inc(&ldops->refcount); - ld->ops = ldops; - atomic_set(&ld->users, 1); - err = 0; -diff -urNp linux-2.6.31.1/drivers/char/vt_ioctl.c linux-2.6.31.1/drivers/char/vt_ioctl.c ---- linux-2.6.31.1/drivers/char/vt_ioctl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/vt_ioctl.c 2009-10-01 20:12:43.000000000 -0400 -@@ -97,6 +97,12 @@ do_kdsk_ioctl(int cmd, struct kbentry __ - case KDSKBENT: - if (!perm) - return -EPERM; -+ -+#ifdef CONFIG_GRKERNSEC -+ if (!capable(CAP_SYS_TTY_CONFIG)) -+ return -EPERM; -+#endif -+ - if (!i && v == K_NOSUCHMAP) { - /* deallocate map */ - key_map = key_maps[s]; -@@ -237,6 +243,13 @@ do_kdgkb_ioctl(int cmd, struct kbsentry - goto reterr; - } - -+#ifdef CONFIG_GRKERNSEC -+ if (!capable(CAP_SYS_TTY_CONFIG)) { -+ ret = -EPERM; -+ goto reterr; -+ } -+#endif -+ - q = func_table[i]; - first_free = funcbufptr + (funcbufsize - funcbufleft); - for (j = i+1; j < MAX_NR_FUNC && !func_table[j]; j++) -diff -urNp linux-2.6.31.1/drivers/char/xilinx_hwicap/xilinx_hwicap.c linux-2.6.31.1/drivers/char/xilinx_hwicap/xilinx_hwicap.c ---- linux-2.6.31.1/drivers/char/xilinx_hwicap/xilinx_hwicap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/char/xilinx_hwicap/xilinx_hwicap.c 2009-10-01 20:12:43.000000000 -0400 -@@ -559,7 +559,7 @@ static int hwicap_release(struct inode * - return status; - } - --static struct file_operations hwicap_fops = { -+static const struct file_operations hwicap_fops = { - .owner = THIS_MODULE, - .write = hwicap_write, - .read = hwicap_read, -diff -urNp linux-2.6.31.1/drivers/edac/edac_core.h linux-2.6.31.1/drivers/edac/edac_core.h ---- linux-2.6.31.1/drivers/edac/edac_core.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/edac/edac_core.h 2009-10-01 20:12:43.000000000 -0400 -@@ -99,11 +99,11 @@ extern int edac_debug_level; - - #else /* !CONFIG_EDAC_DEBUG */ - --#define debugf0( ... ) --#define debugf1( ... 
) --#define debugf2( ... ) --#define debugf3( ... ) --#define debugf4( ... ) -+#define debugf0( ... ) do {} while (0) -+#define debugf1( ... ) do {} while (0) -+#define debugf2( ... ) do {} while (0) -+#define debugf3( ... ) do {} while (0) -+#define debugf4( ... ) do {} while (0) - - #endif /* !CONFIG_EDAC_DEBUG */ - -diff -urNp linux-2.6.31.1/drivers/firmware/dmi_scan.c linux-2.6.31.1/drivers/firmware/dmi_scan.c ---- linux-2.6.31.1/drivers/firmware/dmi_scan.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/firmware/dmi_scan.c 2009-10-01 20:12:43.000000000 -0400 -@@ -391,11 +391,6 @@ void __init dmi_scan_machine(void) - } - } - else { -- /* -- * no iounmap() for that ioremap(); it would be a no-op, but -- * it's so early in setup that sucker gets confused into doing -- * what it shouldn't if we actually call it. -- */ - p = dmi_ioremap(0xF0000, 0x10000); - if (p == NULL) - goto error; -diff -urNp linux-2.6.31.1/drivers/gpio/gpiolib.c linux-2.6.31.1/drivers/gpio/gpiolib.c ---- linux-2.6.31.1/drivers/gpio/gpiolib.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpio/gpiolib.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1244,7 +1244,7 @@ static int gpiolib_open(struct inode *in - return single_open(file, gpiolib_show, NULL); - } - --static struct file_operations gpiolib_operations = { -+static const struct file_operations gpiolib_operations = { - .open = gpiolib_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/drivers/gpu/drm/drm_drv.c linux-2.6.31.1/drivers/gpu/drm/drm_drv.c ---- linux-2.6.31.1/drivers/gpu/drm/drm_drv.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/drm_drv.c 2009-10-01 20:12:43.000000000 -0400 -@@ -417,7 +417,7 @@ int drm_ioctl(struct inode *inode, struc - char *kdata = NULL; - - atomic_inc(&dev->ioctl_count); -- atomic_inc(&dev->counts[_DRM_STAT_IOCTLS]); -+ atomic_inc_unchecked(&dev->counts[_DRM_STAT_IOCTLS]); - ++file_priv->ioctl_count; - - DRM_DEBUG("pid=%d, cmd=0x%02x, nr=0x%02x, dev 0x%lx, auth=%d\n", -diff -urNp linux-2.6.31.1/drivers/gpu/drm/drm_fops.c linux-2.6.31.1/drivers/gpu/drm/drm_fops.c ---- linux-2.6.31.1/drivers/gpu/drm/drm_fops.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/drm_fops.c 2009-10-01 20:12:43.000000000 -0400 -@@ -66,7 +66,7 @@ static int drm_setup(struct drm_device * - } - - for (i = 0; i < ARRAY_SIZE(dev->counts); i++) -- atomic_set(&dev->counts[i], 0); -+ atomic_set_unchecked(&dev->counts[i], 0); - - dev->sigdata.lock = NULL; - -@@ -130,9 +130,9 @@ int drm_open(struct inode *inode, struct - - retcode = drm_open_helper(inode, filp, dev); - if (!retcode) { -- atomic_inc(&dev->counts[_DRM_STAT_OPENS]); -+ atomic_inc_unchecked(&dev->counts[_DRM_STAT_OPENS]); - spin_lock(&dev->count_lock); -- if (!dev->open_count++) { -+ if (atomic_inc_return(&dev->open_count) == 1) { - spin_unlock(&dev->count_lock); - retcode = drm_setup(dev); - goto out; -@@ -433,7 +433,7 @@ int drm_release(struct inode *inode, str - - lock_kernel(); - -- DRM_DEBUG("open_count = %d\n", dev->open_count); -+ DRM_DEBUG("open_count = %d\n", atomic_read(&dev->open_count)); - - if (dev->driver->preclose) - dev->driver->preclose(dev, file_priv); -@@ -445,7 +445,7 @@ int drm_release(struct inode *inode, str - DRM_DEBUG("pid = %d, device = 0x%lx, open_count = %d\n", - task_pid_nr(current), - (long)old_encode_dev(file_priv->minor->device), -- dev->open_count); -+ atomic_read(&dev->open_count)); - - /* if the master has gone away we can't do anything with the lock */ - if 
(file_priv->minor->master) -@@ -522,9 +522,9 @@ int drm_release(struct inode *inode, str - * End inline drm_release - */ - -- atomic_inc(&dev->counts[_DRM_STAT_CLOSES]); -+ atomic_inc_unchecked(&dev->counts[_DRM_STAT_CLOSES]); - spin_lock(&dev->count_lock); -- if (!--dev->open_count) { -+ if (atomic_dec_and_test(&dev->open_count)) { - if (atomic_read(&dev->ioctl_count)) { - DRM_ERROR("Device busy: %d\n", - atomic_read(&dev->ioctl_count)); -diff -urNp linux-2.6.31.1/drivers/gpu/drm/drm_ioctl.c linux-2.6.31.1/drivers/gpu/drm/drm_ioctl.c ---- linux-2.6.31.1/drivers/gpu/drm/drm_ioctl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/drm_ioctl.c 2009-10-01 20:12:43.000000000 -0400 -@@ -283,7 +283,7 @@ int drm_getstats(struct drm_device *dev, - stats->data[i].value = - (file_priv->master->lock.hw_lock ? file_priv->master->lock.hw_lock->lock : 0); - else -- stats->data[i].value = atomic_read(&dev->counts[i]); -+ stats->data[i].value = atomic_read_unchecked(&dev->counts[i]); - stats->data[i].type = dev->types[i]; - } - -diff -urNp linux-2.6.31.1/drivers/gpu/drm/drm_lock.c linux-2.6.31.1/drivers/gpu/drm/drm_lock.c ---- linux-2.6.31.1/drivers/gpu/drm/drm_lock.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/drm_lock.c 2009-10-01 20:12:43.000000000 -0400 -@@ -87,7 +87,7 @@ int drm_lock(struct drm_device *dev, voi - if (drm_lock_take(&master->lock, lock->context)) { - master->lock.file_priv = file_priv; - master->lock.lock_time = jiffies; -- atomic_inc(&dev->counts[_DRM_STAT_LOCKS]); -+ atomic_inc_unchecked(&dev->counts[_DRM_STAT_LOCKS]); - break; /* Got lock */ - } - -@@ -165,7 +165,7 @@ int drm_unlock(struct drm_device *dev, v - return -EINVAL; - } - -- atomic_inc(&dev->counts[_DRM_STAT_UNLOCKS]); -+ atomic_inc_unchecked(&dev->counts[_DRM_STAT_UNLOCKS]); - - /* kernel_context_switch isn't used by any of the x86 drm - * modules but is required by the Sparc driver. 
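The drm_vm.c hunk that follows is one instance of a change repeated throughout this patch: operations tables (vm_operations_struct here, and the file_operations/seq_operations instances in the char and isdn hunks earlier) that are never modified at runtime gain a const qualifier. That moves the function-pointer tables into read-only data, so a stray or attacker-controlled kernel write faults instead of silently redirecting a callback. The same idea in plain C, with a stand-in struct rather than the kernel's:

    #include <stdio.h>

    /* stand-in for an operations table like vm_operations_struct */
    struct vm_ops {
            void (*open)(void);
            void (*close)(void);
    };

    static void my_open(void)  { puts("open");  }
    static void my_close(void) { puts("close"); }

    /* const puts the table in .rodata; overwriting a pointer in it
     * now faults instead of hijacking the callback */
    static const struct vm_ops my_vm_ops = {
            .open  = my_open,
            .close = my_close,
    };

    int main(void)
    {
            my_vm_ops.open();
            my_vm_ops.close();
            return 0;
    }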
-diff -urNp linux-2.6.31.1/drivers/gpu/drm/drm_vm.c linux-2.6.31.1/drivers/gpu/drm/drm_vm.c ---- linux-2.6.31.1/drivers/gpu/drm/drm_vm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/drm_vm.c 2009-10-01 20:12:43.000000000 -0400 -@@ -369,28 +369,28 @@ static int drm_vm_sg_fault(struct vm_are - } - - /** AGP virtual memory operations */ --static struct vm_operations_struct drm_vm_ops = { -+static const struct vm_operations_struct drm_vm_ops = { - .fault = drm_vm_fault, - .open = drm_vm_open, - .close = drm_vm_close, - }; - - /** Shared virtual memory operations */ --static struct vm_operations_struct drm_vm_shm_ops = { -+static const struct vm_operations_struct drm_vm_shm_ops = { - .fault = drm_vm_shm_fault, - .open = drm_vm_open, - .close = drm_vm_shm_close, - }; - - /** DMA virtual memory operations */ --static struct vm_operations_struct drm_vm_dma_ops = { -+static const struct vm_operations_struct drm_vm_dma_ops = { - .fault = drm_vm_dma_fault, - .open = drm_vm_open, - .close = drm_vm_close, - }; - - /** Scatter-gather virtual memory operations */ --static struct vm_operations_struct drm_vm_sg_ops = { -+static const struct vm_operations_struct drm_vm_sg_ops = { - .fault = drm_vm_sg_fault, - .open = drm_vm_open, - .close = drm_vm_close, -diff -urNp linux-2.6.31.1/drivers/gpu/drm/i810/i810_dma.c linux-2.6.31.1/drivers/gpu/drm/i810/i810_dma.c ---- linux-2.6.31.1/drivers/gpu/drm/i810/i810_dma.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/i810/i810_dma.c 2009-10-01 20:12:43.000000000 -0400 -@@ -952,8 +952,8 @@ static int i810_dma_vertex(struct drm_de - dma->buflist[vertex->idx], - vertex->discard, vertex->used); - -- atomic_add(vertex->used, &dev->counts[_DRM_STAT_SECONDARY]); -- atomic_inc(&dev->counts[_DRM_STAT_DMA]); -+ atomic_add_unchecked(vertex->used, &dev->counts[_DRM_STAT_SECONDARY]); -+ atomic_inc_unchecked(&dev->counts[_DRM_STAT_DMA]); - sarea_priv->last_enqueue = dev_priv->counter - 1; - sarea_priv->last_dispatch = (int)hw_status[5]; - -@@ -1115,8 +1115,8 @@ static int i810_dma_mc(struct drm_device - i810_dma_dispatch_mc(dev, dma->buflist[mc->idx], mc->used, - mc->last_render); - -- atomic_add(mc->used, &dev->counts[_DRM_STAT_SECONDARY]); -- atomic_inc(&dev->counts[_DRM_STAT_DMA]); -+ atomic_add_unchecked(mc->used, &dev->counts[_DRM_STAT_SECONDARY]); -+ atomic_inc_unchecked(&dev->counts[_DRM_STAT_DMA]); - sarea_priv->last_enqueue = dev_priv->counter - 1; - sarea_priv->last_dispatch = (int)hw_status[5]; - -diff -urNp linux-2.6.31.1/drivers/gpu/drm/i915/i915_drv.c linux-2.6.31.1/drivers/gpu/drm/i915/i915_drv.c ---- linux-2.6.31.1/drivers/gpu/drm/i915/i915_drv.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/i915/i915_drv.c 2009-10-01 20:12:43.000000000 -0400 -@@ -154,7 +154,7 @@ i915_pci_resume(struct pci_dev *pdev) - return i915_resume(dev); - } - --static struct vm_operations_struct i915_gem_vm_ops = { -+static const struct vm_operations_struct i915_gem_vm_ops = { - .fault = i915_gem_fault, - .open = drm_gem_vm_open, - .close = drm_gem_vm_close, -diff -urNp linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_atombios.c linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_atombios.c ---- linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_atombios.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_atombios.c 2009-10-01 20:12:43.000000000 -0400 -@@ -425,13 +425,13 @@ bool radeon_get_atom_connector_info_from - return true; - } - --struct bios_connector { -+static struct bios_connector { - 
bool valid; - uint8_t line_mux; - uint16_t devices; - int connector_type; - struct radeon_i2c_bus_rec ddc_bus; --}; -+} bios_connectors[ATOM_MAX_SUPPORTED_DEVICE];; - - bool radeon_get_atom_connector_info_from_supported_devices_table(struct - drm_device -@@ -447,7 +447,6 @@ bool radeon_get_atom_connector_info_from - uint8_t dac; - union atom_supported_devices *supported_devices; - int i, j; -- struct bios_connector bios_connectors[ATOM_MAX_SUPPORTED_DEVICE]; - - atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset); - -diff -urNp linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_state.c linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_state.c ---- linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_state.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_state.c 2009-10-01 20:12:43.000000000 -0400 -@@ -3007,7 +3007,7 @@ static int radeon_cp_getparam(struct drm - { - drm_radeon_private_t *dev_priv = dev->dev_private; - drm_radeon_getparam_t *param = data; -- int value; -+ int value = 0; - - DRM_DEBUG("pid=%d\n", DRM_CURRENTPID); - -diff -urNp linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_ttm.c linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_ttm.c ---- linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_ttm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/radeon/radeon_ttm.c 2009-10-01 20:12:43.000000000 -0400 -@@ -500,27 +500,10 @@ void radeon_ttm_fini(struct radeon_devic - DRM_INFO("radeon: ttm finalized\n"); - } - --static struct vm_operations_struct radeon_ttm_vm_ops; --static struct vm_operations_struct *ttm_vm_ops = NULL; -- --static int radeon_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) --{ -- struct ttm_buffer_object *bo; -- int r; -- -- bo = (struct ttm_buffer_object *)vma->vm_private_data; -- if (bo == NULL) { -- return VM_FAULT_NOPAGE; -- } -- r = ttm_vm_ops->fault(vma, vmf); -- return r; --} -- - int radeon_mmap(struct file *filp, struct vm_area_struct *vma) - { - struct drm_file *file_priv; - struct radeon_device *rdev; -- int r; - - if (unlikely(vma->vm_pgoff < DRM_FILE_PAGE_OFFSET)) { - return drm_mmap(filp, vma); -@@ -528,20 +511,9 @@ int radeon_mmap(struct file *filp, struc - - file_priv = (struct drm_file *)filp->private_data; - rdev = file_priv->minor->dev->dev_private; -- if (rdev == NULL) { -+ if (!rdev) - return -EINVAL; -- } -- r = ttm_bo_mmap(filp, vma, &rdev->mman.bdev); -- if (unlikely(r != 0)) { -- return r; -- } -- if (unlikely(ttm_vm_ops == NULL)) { -- ttm_vm_ops = vma->vm_ops; -- radeon_ttm_vm_ops = *ttm_vm_ops; -- radeon_ttm_vm_ops.fault = &radeon_ttm_fault; -- } -- vma->vm_ops = &radeon_ttm_vm_ops; -- return 0; -+ return ttm_bo_mmap(filp, vma, &rdev->mman.bdev); - } - - -diff -urNp linux-2.6.31.1/drivers/gpu/drm/ttm/ttm_bo_vm.c linux-2.6.31.1/drivers/gpu/drm/ttm/ttm_bo_vm.c ---- linux-2.6.31.1/drivers/gpu/drm/ttm/ttm_bo_vm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/gpu/drm/ttm/ttm_bo_vm.c 2009-10-01 20:12:43.000000000 -0400 -@@ -73,7 +73,7 @@ static int ttm_bo_vm_fault(struct vm_are - { - struct ttm_buffer_object *bo = (struct ttm_buffer_object *) - vma->vm_private_data; -- struct ttm_bo_device *bdev = bo->bdev; -+ struct ttm_bo_device *bdev; - unsigned long bus_base; - unsigned long bus_offset; - unsigned long bus_size; -@@ -88,6 +88,10 @@ static int ttm_bo_vm_fault(struct vm_are - unsigned long address = (unsigned long)vmf->virtual_address; - int retval = VM_FAULT_NOPAGE; - -+ if (!bo) -+ return VM_FAULT_NOPAGE; -+ bdev = bo->bdev; -+ - /* - * Work around locking order 
reversal in fault / nopfn - * between mmap_sem and bo_reserve: Perform a trylock operation -@@ -228,7 +232,7 @@ static void ttm_bo_vm_close(struct vm_ar - vma->vm_private_data = NULL; - } - --static struct vm_operations_struct ttm_bo_vm_ops = { -+static const struct vm_operations_struct ttm_bo_vm_ops = { - .fault = ttm_bo_vm_fault, - .open = ttm_bo_vm_open, - .close = ttm_bo_vm_close -diff -urNp linux-2.6.31.1/drivers/hwmon/fschmd.c linux-2.6.31.1/drivers/hwmon/fschmd.c ---- linux-2.6.31.1/drivers/hwmon/fschmd.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/hwmon/fschmd.c 2009-10-01 20:12:43.000000000 -0400 -@@ -915,7 +915,7 @@ static int watchdog_ioctl(struct inode * - return ret; - } - --static struct file_operations watchdog_fops = { -+static const struct file_operations watchdog_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .open = watchdog_open, -diff -urNp linux-2.6.31.1/drivers/hwmon/fscpos.c linux-2.6.31.1/drivers/hwmon/fscpos.c ---- linux-2.6.31.1/drivers/hwmon/fscpos.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/hwmon/fscpos.c 2009-10-01 20:12:43.000000000 -0400 -@@ -240,7 +240,6 @@ static ssize_t set_pwm(struct i2c_client - unsigned long v = simple_strtoul(buf, NULL, 10); - - /* Range: 0..255 */ -- if (v < 0) v = 0; - if (v > 255) v = 255; - - mutex_lock(&data->update_lock); -diff -urNp linux-2.6.31.1/drivers/hwmon/k8temp.c linux-2.6.31.1/drivers/hwmon/k8temp.c ---- linux-2.6.31.1/drivers/hwmon/k8temp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/hwmon/k8temp.c 2009-10-01 20:12:43.000000000 -0400 -@@ -138,7 +138,7 @@ static DEVICE_ATTR(name, S_IRUGO, show_n - - static struct pci_device_id k8temp_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, -- { 0 }, -+ { 0, 0, 0, 0, 0, 0, 0 }, - }; - - MODULE_DEVICE_TABLE(pci, k8temp_ids); -diff -urNp linux-2.6.31.1/drivers/hwmon/sis5595.c linux-2.6.31.1/drivers/hwmon/sis5595.c ---- linux-2.6.31.1/drivers/hwmon/sis5595.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/hwmon/sis5595.c 2009-10-01 20:12:43.000000000 -0400 -@@ -699,7 +699,7 @@ static struct sis5595_data *sis5595_upda - - static struct pci_device_id sis5595_pci_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_503) }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, sis5595_pci_ids); -diff -urNp linux-2.6.31.1/drivers/hwmon/via686a.c linux-2.6.31.1/drivers/hwmon/via686a.c ---- linux-2.6.31.1/drivers/hwmon/via686a.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/hwmon/via686a.c 2009-10-01 20:12:43.000000000 -0400 -@@ -769,7 +769,7 @@ static struct via686a_data *via686a_upda - - static struct pci_device_id via686a_pci_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686_4) }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, via686a_pci_ids); -diff -urNp linux-2.6.31.1/drivers/hwmon/vt8231.c linux-2.6.31.1/drivers/hwmon/vt8231.c ---- linux-2.6.31.1/drivers/hwmon/vt8231.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/hwmon/vt8231.c 2009-10-01 20:12:43.000000000 -0400 -@@ -699,7 +699,7 @@ static struct platform_driver vt8231_dri - - static struct pci_device_id vt8231_pci_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8231_4) }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, vt8231_pci_ids); -diff -urNp linux-2.6.31.1/drivers/hwmon/w83791d.c linux-2.6.31.1/drivers/hwmon/w83791d.c ---- linux-2.6.31.1/drivers/hwmon/w83791d.c 
2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/hwmon/w83791d.c 2009-10-01 20:12:43.000000000 -0400 -@@ -330,8 +330,8 @@ static int w83791d_detect(struct i2c_cli - struct i2c_board_info *info); - static int w83791d_remove(struct i2c_client *client); - --static int w83791d_read(struct i2c_client *client, u8 register); --static int w83791d_write(struct i2c_client *client, u8 register, u8 value); -+static int w83791d_read(struct i2c_client *client, u8 reg); -+static int w83791d_write(struct i2c_client *client, u8 reg, u8 value); - static struct w83791d_data *w83791d_update_device(struct device *dev); - - #ifdef DEBUG -diff -urNp linux-2.6.31.1/drivers/i2c/busses/i2c-i801.c linux-2.6.31.1/drivers/i2c/busses/i2c-i801.c ---- linux-2.6.31.1/drivers/i2c/busses/i2c-i801.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/i2c/busses/i2c-i801.c 2009-10-01 20:12:43.000000000 -0400 -@@ -578,7 +578,7 @@ static struct pci_device_id i801_ids[] = - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH10_4) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH10_5) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PCH_SMBUS) }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE (pci, i801_ids); -diff -urNp linux-2.6.31.1/drivers/i2c/busses/i2c-piix4.c linux-2.6.31.1/drivers/i2c/busses/i2c-piix4.c ---- linux-2.6.31.1/drivers/i2c/busses/i2c-piix4.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/i2c/busses/i2c-piix4.c 2009-10-01 20:12:43.000000000 -0400 -@@ -123,7 +123,7 @@ static struct dmi_system_id __devinitdat - .ident = "IBM", - .matches = { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, - }, -- { }, -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, NULL)}, NULL }, - }; - - static int __devinit piix4_setup(struct pci_dev *PIIX4_dev, -@@ -489,7 +489,7 @@ static struct pci_device_id piix4_ids[] - PCI_DEVICE_ID_SERVERWORKS_HT1000SB) }, - { PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, - PCI_DEVICE_ID_SERVERWORKS_HT1100LD) }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE (pci, piix4_ids); -diff -urNp linux-2.6.31.1/drivers/i2c/busses/i2c-sis630.c linux-2.6.31.1/drivers/i2c/busses/i2c-sis630.c ---- linux-2.6.31.1/drivers/i2c/busses/i2c-sis630.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/i2c/busses/i2c-sis630.c 2009-10-01 20:12:43.000000000 -0400 -@@ -471,7 +471,7 @@ static struct i2c_adapter sis630_adapter - static struct pci_device_id sis630_ids[] __devinitdata = { - { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_503) }, - { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_LPC) }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE (pci, sis630_ids); -diff -urNp linux-2.6.31.1/drivers/i2c/busses/i2c-sis96x.c linux-2.6.31.1/drivers/i2c/busses/i2c-sis96x.c ---- linux-2.6.31.1/drivers/i2c/busses/i2c-sis96x.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/i2c/busses/i2c-sis96x.c 2009-10-01 20:12:43.000000000 -0400 -@@ -247,7 +247,7 @@ static struct i2c_adapter sis96x_adapter - - static struct pci_device_id sis96x_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_SMBUS) }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE (pci, sis96x_ids); -diff -urNp linux-2.6.31.1/drivers/ieee1394/dma.c linux-2.6.31.1/drivers/ieee1394/dma.c ---- linux-2.6.31.1/drivers/ieee1394/dma.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/dma.c 2009-10-01 20:12:43.000000000 -0400 -@@ -247,7 +247,7 @@ static int dma_region_pagefault(struct v - return 0; - } - 
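The hwmon and i2c hunks just above, and the ieee1394 ones below, rewrite the "{ }" or "{ 0, }" sentinels that terminate pci_device_id, dmi_system_id and similar tables as fully spelled-out zero entries such as { 0, 0, 0, 0, 0, 0, 0 }. Both spellings produce an identical all-zero terminator; the explicit form just initializes every member by hand, which the patch appears to prefer for uniformity with its other initializer changes. A small sketch of how such a sentinel ends a table scan (dev_id mimics pci_device_id; the field list is illustrative):

    #include <stdio.h>

    /* mimics struct pci_device_id; the seven fields match the
     * { 0, 0, 0, 0, 0, 0, 0 } terminators in the patch */
    struct dev_id {
            unsigned int vendor, device, subvendor, subdevice;
            unsigned int dev_class, class_mask;
            unsigned long driver_data;
    };

    static const struct dev_id ids[] = {
            { 0x8086, 0x1237, 0, 0, 0, 0, 0 },
            { 0, 0, 0, 0, 0, 0, 0 },       /* explicit all-zero sentinel */
    };

    int main(void)
    {
            /* walk until the zeroed sentinel, as device-table scanners do */
            for (const struct dev_id *p = ids; p->vendor; p++)
                    printf("entry: vendor 0x%04x device 0x%04x\n",
                           p->vendor, p->device);
            return 0;
    }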
--static struct vm_operations_struct dma_region_vm_ops = { -+static const struct vm_operations_struct dma_region_vm_ops = { - .fault = dma_region_pagefault, - }; - -diff -urNp linux-2.6.31.1/drivers/ieee1394/dv1394.c linux-2.6.31.1/drivers/ieee1394/dv1394.c ---- linux-2.6.31.1/drivers/ieee1394/dv1394.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/dv1394.c 2009-10-01 20:12:43.000000000 -0400 -@@ -739,7 +739,7 @@ static void frame_prepare(struct video_c - based upon DIF section and sequence - */ - --static void inline -+static inline void - frame_put_packet (struct frame *f, struct packet *p) - { - int section_type = p->data[0] >> 5; /* section type is in bits 5 - 7 */ -@@ -2178,7 +2178,7 @@ static const struct ieee1394_device_id d - .specifier_id = AVC_UNIT_SPEC_ID_ENTRY & 0xffffff, - .version = AVC_SW_VERSION_ENTRY & 0xffffff - }, -- { } -+ { 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(ieee1394, dv1394_id_table); -diff -urNp linux-2.6.31.1/drivers/ieee1394/eth1394.c linux-2.6.31.1/drivers/ieee1394/eth1394.c ---- linux-2.6.31.1/drivers/ieee1394/eth1394.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/eth1394.c 2009-10-01 20:12:43.000000000 -0400 -@@ -445,7 +445,7 @@ static const struct ieee1394_device_id e - .specifier_id = ETHER1394_GASP_SPECIFIER_ID, - .version = ETHER1394_GASP_VERSION, - }, -- {} -+ { 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(ieee1394, eth1394_id_table); -diff -urNp linux-2.6.31.1/drivers/ieee1394/hosts.c linux-2.6.31.1/drivers/ieee1394/hosts.c ---- linux-2.6.31.1/drivers/ieee1394/hosts.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/hosts.c 2009-10-01 20:12:43.000000000 -0400 -@@ -78,6 +78,7 @@ static int dummy_isoctl(struct hpsb_iso - } - - static struct hpsb_host_driver dummy_driver = { -+ .name = "dummy", - .transmit_packet = dummy_transmit_packet, - .devctl = dummy_devctl, - .isoctl = dummy_isoctl -diff -urNp linux-2.6.31.1/drivers/ieee1394/ohci1394.c linux-2.6.31.1/drivers/ieee1394/ohci1394.c ---- linux-2.6.31.1/drivers/ieee1394/ohci1394.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/ohci1394.c 2009-10-01 20:12:43.000000000 -0400 -@@ -147,9 +147,9 @@ printk(level "%s: " fmt "\n" , OHCI1394_ - printk(level "%s: fw-host%d: " fmt "\n" , OHCI1394_DRIVER_NAME, ohci->host->id , ## args) - - /* Module Parameters */ --static int phys_dma = 1; -+static int phys_dma; - module_param(phys_dma, int, 0444); --MODULE_PARM_DESC(phys_dma, "Enable physical DMA (default = 1)."); -+MODULE_PARM_DESC(phys_dma, "Enable physical DMA (default = 0)."); - - static void dma_trm_tasklet(unsigned long data); - static void dma_trm_reset(struct dma_trm_ctx *d); -@@ -3449,7 +3449,7 @@ static struct pci_device_id ohci1394_pci - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, - }, -- { 0, }, -+ { 0, 0, 0, 0, 0, 0, 0 }, - }; - - MODULE_DEVICE_TABLE(pci, ohci1394_pci_tbl); -diff -urNp linux-2.6.31.1/drivers/ieee1394/raw1394.c linux-2.6.31.1/drivers/ieee1394/raw1394.c ---- linux-2.6.31.1/drivers/ieee1394/raw1394.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/raw1394.c 2009-10-01 20:12:43.000000000 -0400 -@@ -2999,7 +2999,7 @@ static const struct ieee1394_device_id r - .match_flags = IEEE1394_MATCH_SPECIFIER_ID | IEEE1394_MATCH_VERSION, - .specifier_id = CAMERA_UNIT_SPEC_ID_ENTRY & 0xffffff, - .version = (CAMERA_SW_VERSION_ENTRY + 2) & 0xffffff}, -- {} -+ { 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(ieee1394, raw1394_id_table); -diff -urNp 
linux-2.6.31.1/drivers/ieee1394/sbp2.c linux-2.6.31.1/drivers/ieee1394/sbp2.c ---- linux-2.6.31.1/drivers/ieee1394/sbp2.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/sbp2.c 2009-10-01 20:12:43.000000000 -0400 -@@ -290,7 +290,7 @@ static const struct ieee1394_device_id s - .match_flags = IEEE1394_MATCH_SPECIFIER_ID | IEEE1394_MATCH_VERSION, - .specifier_id = SBP2_UNIT_SPEC_ID_ENTRY & 0xffffff, - .version = SBP2_SW_VERSION_ENTRY & 0xffffff}, -- {} -+ { 0, 0, 0, 0, 0, 0 } - }; - MODULE_DEVICE_TABLE(ieee1394, sbp2_id_table); - -@@ -2112,7 +2112,7 @@ MODULE_DESCRIPTION("IEEE-1394 SBP-2 prot - MODULE_SUPPORTED_DEVICE(SBP2_DEVICE_NAME); - MODULE_LICENSE("GPL"); - --static int sbp2_module_init(void) -+static int __init sbp2_module_init(void) - { - int ret; - -diff -urNp linux-2.6.31.1/drivers/ieee1394/video1394.c linux-2.6.31.1/drivers/ieee1394/video1394.c ---- linux-2.6.31.1/drivers/ieee1394/video1394.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/ieee1394/video1394.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1310,7 +1310,7 @@ static const struct ieee1394_device_id v - .specifier_id = CAMERA_UNIT_SPEC_ID_ENTRY & 0xffffff, - .version = (CAMERA_SW_VERSION_ENTRY + 2) & 0xffffff - }, -- { } -+ { 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(ieee1394, video1394_id_table); -diff -urNp linux-2.6.31.1/drivers/infiniband/hw/ehca/ehca_uverbs.c linux-2.6.31.1/drivers/infiniband/hw/ehca/ehca_uverbs.c ---- linux-2.6.31.1/drivers/infiniband/hw/ehca/ehca_uverbs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/infiniband/hw/ehca/ehca_uverbs.c 2009-10-01 20:12:43.000000000 -0400 -@@ -95,7 +95,7 @@ static void ehca_mm_close(struct vm_area - vma->vm_start, vma->vm_end, *count); - } - --static struct vm_operations_struct vm_ops = { -+static const struct vm_operations_struct vm_ops = { - .open = ehca_mm_open, - .close = ehca_mm_close, - }; -diff -urNp linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_file_ops.c linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_file_ops.c ---- linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_file_ops.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_file_ops.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1151,7 +1151,7 @@ static int ipath_file_vma_fault(struct v - return 0; - } - --static struct vm_operations_struct ipath_file_vm_ops = { -+static const struct vm_operations_struct ipath_file_vm_ops = { - .fault = ipath_file_vma_fault, - }; - -diff -urNp linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_mmap.c linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_mmap.c ---- linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_mmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/infiniband/hw/ipath/ipath_mmap.c 2009-10-01 20:12:43.000000000 -0400 -@@ -74,7 +74,7 @@ static void ipath_vma_close(struct vm_ar - kref_put(&ip->ref, ipath_release_mmap_info); - } - --static struct vm_operations_struct ipath_vm_ops = { -+static const struct vm_operations_struct ipath_vm_ops = { - .open = ipath_vma_open, - .close = ipath_vma_close, - }; -diff -urNp linux-2.6.31.1/drivers/input/keyboard/atkbd.c linux-2.6.31.1/drivers/input/keyboard/atkbd.c ---- linux-2.6.31.1/drivers/input/keyboard/atkbd.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/input/keyboard/atkbd.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1188,7 +1188,7 @@ static struct serio_device_id atkbd_seri - .id = SERIO_ANY, - .extra = SERIO_ANY, - }, -- { 0 } -+ { 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(serio, 
atkbd_serio_ids); -diff -urNp linux-2.6.31.1/drivers/input/mouse/lifebook.c linux-2.6.31.1/drivers/input/mouse/lifebook.c ---- linux-2.6.31.1/drivers/input/mouse/lifebook.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/input/mouse/lifebook.c 2009-10-01 20:12:43.000000000 -0400 -@@ -116,7 +116,7 @@ static const struct dmi_system_id lifebo - DMI_MATCH(DMI_PRODUCT_NAME, "LifeBook B142"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL} - }; - - static psmouse_ret_t lifebook_process_byte(struct psmouse *psmouse) -diff -urNp linux-2.6.31.1/drivers/input/mouse/psmouse-base.c linux-2.6.31.1/drivers/input/mouse/psmouse-base.c ---- linux-2.6.31.1/drivers/input/mouse/psmouse-base.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/input/mouse/psmouse-base.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1380,7 +1380,7 @@ static struct serio_device_id psmouse_se - .id = SERIO_ANY, - .extra = SERIO_ANY, - }, -- { 0 } -+ { 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(serio, psmouse_serio_ids); -diff -urNp linux-2.6.31.1/drivers/input/mouse/synaptics.c linux-2.6.31.1/drivers/input/mouse/synaptics.c ---- linux-2.6.31.1/drivers/input/mouse/synaptics.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/input/mouse/synaptics.c 2009-10-01 20:12:43.000000000 -0400 -@@ -437,7 +437,7 @@ static void synaptics_process_packet(str - break; - case 2: - if (SYN_MODEL_PEN(priv->model_id)) -- ; /* Nothing, treat a pen as a single finger */ -+ break; /* Nothing, treat a pen as a single finger */ - break; - case 4 ... 15: - if (SYN_CAP_PALMDETECT(priv->capabilities)) -@@ -653,7 +653,7 @@ static const struct dmi_system_id toshib - DMI_MATCH(DMI_PRODUCT_NAME, "PORTEGE M300"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - #endif - -diff -urNp linux-2.6.31.1/drivers/input/mousedev.c linux-2.6.31.1/drivers/input/mousedev.c ---- linux-2.6.31.1/drivers/input/mousedev.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/input/mousedev.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1056,7 +1056,7 @@ static struct input_handler mousedev_han - - #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX - static struct miscdevice psaux_mouse = { -- PSMOUSE_MINOR, "psaux", &mousedev_fops -+ PSMOUSE_MINOR, "psaux", &mousedev_fops, {NULL, NULL}, NULL, NULL - }; - static int psaux_registered; - #endif -diff -urNp linux-2.6.31.1/drivers/input/serio/i8042-x86ia64io.h linux-2.6.31.1/drivers/input/serio/i8042-x86ia64io.h ---- linux-2.6.31.1/drivers/input/serio/i8042-x86ia64io.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/input/serio/i8042-x86ia64io.h 2009-10-01 20:12:43.000000000 -0400 -@@ -167,7 +167,7 @@ static struct dmi_system_id __initdata i - DMI_MATCH(DMI_PRODUCT_VERSION, "Rev 1"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - - /* -@@ -390,7 +390,7 @@ static struct dmi_system_id __initdata i - DMI_MATCH(DMI_PRODUCT_VERSION, "0100"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - - static struct dmi_system_id __initdata i8042_dmi_reset_table[] = { -@@ -436,7 +436,7 @@ static struct dmi_system_id __initdata i - DMI_MATCH(DMI_PRODUCT_NAME, "N10"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - - #ifdef CONFIG_PNP -@@ -455,7 +455,7 @@ static struct dmi_system_id __initdata i - DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - #endif - -@@ -522,7 +522,7 @@ static struct 
dmi_system_id __initdata i - DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 4280"), - }, - }, -- { } -+ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } - }; - - #endif /* CONFIG_X86 */ -diff -urNp linux-2.6.31.1/drivers/input/serio/serio_raw.c linux-2.6.31.1/drivers/input/serio/serio_raw.c ---- linux-2.6.31.1/drivers/input/serio/serio_raw.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/input/serio/serio_raw.c 2009-10-01 20:12:43.000000000 -0400 -@@ -376,7 +376,7 @@ static struct serio_device_id serio_raw_ - .id = SERIO_ANY, - .extra = SERIO_ANY, - }, -- { 0 } -+ { 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(serio, serio_raw_serio_ids); -diff -urNp linux-2.6.31.1/drivers/isdn/capi/kcapi_proc.c linux-2.6.31.1/drivers/isdn/capi/kcapi_proc.c ---- linux-2.6.31.1/drivers/isdn/capi/kcapi_proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/isdn/capi/kcapi_proc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -89,14 +89,14 @@ static int contrstats_show(struct seq_fi - return 0; - } - --static struct seq_operations seq_controller_ops = { -+static const struct seq_operations seq_controller_ops = { - .start = controller_start, - .next = controller_next, - .stop = controller_stop, - .show = controller_show, - }; - --static struct seq_operations seq_contrstats_ops = { -+static const struct seq_operations seq_contrstats_ops = { - .start = controller_start, - .next = controller_next, - .stop = controller_stop, -@@ -194,14 +194,14 @@ applstats_show(struct seq_file *seq, voi - return 0; - } - --static struct seq_operations seq_applications_ops = { -+static const struct seq_operations seq_applications_ops = { - .start = applications_start, - .next = applications_next, - .stop = applications_stop, - .show = applications_show, - }; - --static struct seq_operations seq_applstats_ops = { -+static const struct seq_operations seq_applstats_ops = { - .start = applications_start, - .next = applications_next, - .stop = applications_stop, -@@ -264,7 +264,7 @@ static int capi_driver_show(struct seq_f - return 0; - } - --static struct seq_operations seq_capi_driver_ops = { -+static const struct seq_operations seq_capi_driver_ops = { - .start = capi_driver_start, - .next = capi_driver_next, - .stop = capi_driver_stop, -diff -urNp linux-2.6.31.1/drivers/isdn/gigaset/common.c linux-2.6.31.1/drivers/isdn/gigaset/common.c ---- linux-2.6.31.1/drivers/isdn/gigaset/common.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/isdn/gigaset/common.c 2009-10-01 20:12:43.000000000 -0400 -@@ -665,7 +665,7 @@ struct cardstate *gigaset_initcs(struct - cs->commands_pending = 0; - cs->cur_at_seq = 0; - cs->gotfwver = -1; -- cs->open_count = 0; -+ atomic_set(&cs->open_count, 0); - cs->dev = NULL; - cs->tty = NULL; - cs->tty_dev = NULL; -diff -urNp linux-2.6.31.1/drivers/isdn/gigaset/gigaset.h linux-2.6.31.1/drivers/isdn/gigaset/gigaset.h ---- linux-2.6.31.1/drivers/isdn/gigaset/gigaset.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/isdn/gigaset/gigaset.h 2009-10-01 20:12:43.000000000 -0400 -@@ -446,7 +446,7 @@ struct cardstate { - spinlock_t cmdlock; - unsigned curlen, cmdbytes; - -- unsigned open_count; -+ atomic_t open_count; - struct tty_struct *tty; - struct tasklet_struct if_wake_tasklet; - unsigned control_state; -diff -urNp linux-2.6.31.1/drivers/isdn/gigaset/interface.c linux-2.6.31.1/drivers/isdn/gigaset/interface.c ---- linux-2.6.31.1/drivers/isdn/gigaset/interface.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/isdn/gigaset/interface.c 2009-10-01 
20:12:43.000000000 -0400 -@@ -165,9 +165,7 @@ static int if_open(struct tty_struct *tt - return -ERESTARTSYS; // FIXME -EINTR? - tty->driver_data = cs; - -- ++cs->open_count; -- -- if (cs->open_count == 1) { -+ if (atomic_inc_return(&cs->open_count) == 1) { - spin_lock_irqsave(&cs->lock, flags); - cs->tty = tty; - spin_unlock_irqrestore(&cs->lock, flags); -@@ -195,10 +193,10 @@ static void if_close(struct tty_struct * - - if (!cs->connected) - gig_dbg(DEBUG_IF, "not connected"); /* nothing to do */ -- else if (!cs->open_count) -+ else if (!atomic_read(&cs->open_count)) - dev_warn(cs->dev, "%s: device not opened\n", __func__); - else { -- if (!--cs->open_count) { -+ if (!atomic_dec_return(&cs->open_count)) { - spin_lock_irqsave(&cs->lock, flags); - cs->tty = NULL; - spin_unlock_irqrestore(&cs->lock, flags); -@@ -233,7 +231,7 @@ static int if_ioctl(struct tty_struct *t - if (!cs->connected) { - gig_dbg(DEBUG_IF, "not connected"); - retval = -ENODEV; -- } else if (!cs->open_count) -+ } else if (!atomic_read(&cs->open_count)) - dev_warn(cs->dev, "%s: device not opened\n", __func__); - else { - retval = 0; -@@ -361,7 +359,7 @@ static int if_write(struct tty_struct *t - if (!cs->connected) { - gig_dbg(DEBUG_IF, "not connected"); - retval = -ENODEV; -- } else if (!cs->open_count) -+ } else if (!atomic_read(&cs->open_count)) - dev_warn(cs->dev, "%s: device not opened\n", __func__); - else if (cs->mstate != MS_LOCKED) { - dev_warn(cs->dev, "can't write to unlocked device\n"); -@@ -395,7 +393,7 @@ static int if_write_room(struct tty_stru - if (!cs->connected) { - gig_dbg(DEBUG_IF, "not connected"); - retval = -ENODEV; -- } else if (!cs->open_count) -+ } else if (!atomic_read(&cs->open_count)) - dev_warn(cs->dev, "%s: device not opened\n", __func__); - else if (cs->mstate != MS_LOCKED) { - dev_warn(cs->dev, "can't write to unlocked device\n"); -@@ -429,7 +427,7 @@ static int if_chars_in_buffer(struct tty - if (!cs->connected) { - gig_dbg(DEBUG_IF, "not connected"); - retval = -ENODEV; -- } else if (!cs->open_count) -+ } else if (!atomic_read(&cs->open_count)) - dev_warn(cs->dev, "%s: device not opened\n", __func__); - else if (cs->mstate != MS_LOCKED) { - dev_warn(cs->dev, "can't write to unlocked device\n"); -@@ -458,7 +456,7 @@ static void if_throttle(struct tty_struc - - if (!cs->connected) - gig_dbg(DEBUG_IF, "not connected"); /* nothing to do */ -- else if (!cs->open_count) -+ else if (!atomic_read(&cs->open_count)) - dev_warn(cs->dev, "%s: device not opened\n", __func__); - else { - //FIXME -@@ -483,7 +481,7 @@ static void if_unthrottle(struct tty_str - - if (!cs->connected) - gig_dbg(DEBUG_IF, "not connected"); /* nothing to do */ -- else if (!cs->open_count) -+ else if (!atomic_read(&cs->open_count)) - dev_warn(cs->dev, "%s: device not opened\n", __func__); - else { - //FIXME -@@ -515,7 +513,7 @@ static void if_set_termios(struct tty_st - goto out; - } - -- if (!cs->open_count) { -+ if (!atomic_read(&cs->open_count)) { - dev_warn(cs->dev, "%s: device not opened\n", __func__); - goto out; - } -diff -urNp linux-2.6.31.1/drivers/lguest/core.c linux-2.6.31.1/drivers/lguest/core.c ---- linux-2.6.31.1/drivers/lguest/core.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/lguest/core.c 2009-10-01 20:12:43.000000000 -0400 -@@ -92,9 +92,17 @@ static __init int map_switcher(void) - * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. 
- */ -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, -+ VM_ALLOC | VM_KERNEXEC, SWITCHER_ADDR, SWITCHER_ADDR -+ + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); -+#else - switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, - VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR - + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); -+#endif -+ - if (!switcher_vma) { - err = -ENOMEM; - printk("lguest: could not map switcher pages high\n"); -diff -urNp linux-2.6.31.1/drivers/lguest/lguest_user.c linux-2.6.31.1/drivers/lguest/lguest_user.c ---- linux-2.6.31.1/drivers/lguest/lguest_user.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/lguest/lguest_user.c 2009-10-01 20:12:43.000000000 -0400 -@@ -508,7 +508,7 @@ static int close(struct inode *inode, st - * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: - */ --static struct file_operations lguest_fops = { -+static const struct file_operations lguest_fops = { - .owner = THIS_MODULE, - .release = close, - .write = write, -diff -urNp linux-2.6.31.1/drivers/md/bitmap.c linux-2.6.31.1/drivers/md/bitmap.c ---- linux-2.6.31.1/drivers/md/bitmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/md/bitmap.c 2009-10-01 20:12:43.000000000 -0400 -@@ -58,7 +58,7 @@ - # if DEBUG > 0 - # define PRINTK(x...) printk(KERN_DEBUG x) - # else --# define PRINTK(x...) -+# define PRINTK(x...) do {} while (0) - # endif - #endif - -diff -urNp linux-2.6.31.1/drivers/md/dm-table.c linux-2.6.31.1/drivers/md/dm-table.c ---- linux-2.6.31.1/drivers/md/dm-table.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/md/dm-table.c 2009-10-01 20:12:43.000000000 -0400 -@@ -359,7 +359,7 @@ static int device_area_is_invalid(struct - if (!dev_size) - return 0; - -- if ((start >= dev_size) || (start + len > dev_size)) { -+ if ((start >= dev_size) || (len > dev_size - start)) { - DMWARN("%s: %s too small for target: " - "start=%llu, len=%llu, dev_size=%llu", - dm_device_name(ti->table->md), bdevname(bdev, b), -diff -urNp linux-2.6.31.1/drivers/md/md.c linux-2.6.31.1/drivers/md/md.c ---- linux-2.6.31.1/drivers/md/md.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/md/md.c 2009-10-01 20:12:43.000000000 -0400 -@@ -5963,7 +5963,7 @@ static int md_seq_show(struct seq_file * - chunk_kb ? "KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); -- seq_path(seq, &bitmap->file->f_path, " \t\n"); -+ seq_path(seq, &bitmap->file->f_path, " \t\n\\"); - } - - seq_printf(seq, "\n"); -@@ -6057,7 +6057,7 @@ static int is_mddev_idle(mddev_t *mddev, - struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - -- atomic_read(&disk->sync_io); -+ atomic_read_unchecked(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. 
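The device_area_is_invalid() hunk above replaces "start + len > dev_size" with "len > dev_size - start" because the sum can wrap around the top of the integer type and let an out-of-range request pass. A minimal user-space sketch of the difference, not kernel code; area_is_invalid() and the sample values are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* rewritten form: once start < dev_size holds, dev_size - start
 * cannot underflow, so the comparison cannot wrap */
static int area_is_invalid(uint64_t start, uint64_t len, uint64_t dev_size)
{
	if (start >= dev_size)
		return 1;
	return len > dev_size - start;
}

int main(void)
{
	uint64_t dev_size = 1000;
	uint64_t start = 999, len = UINT64_MAX; /* start + len wraps to 998 */

	/* naive check wrongly accepts the request: 998 > 1000 is false */
	printf("naive check flags it: %d\n", start + len > dev_size);
	/* overflow-safe check rejects it */
	printf("safe check flags it: %d\n", area_is_invalid(start, len, dev_size));
	return 0;
}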
-diff -urNp linux-2.6.31.1/drivers/md/md.h linux-2.6.31.1/drivers/md/md.h ---- linux-2.6.31.1/drivers/md/md.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/md/md.h 2009-10-01 20:12:43.000000000 -0400 -@@ -303,7 +303,7 @@ static inline void rdev_dec_pending(mdk_ - - static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) - { -- atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -+ atomic_add_unchecked(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); - } - - struct mdk_personality -diff -urNp linux-2.6.31.1/drivers/media/dvb/dvb-core/dmxdev.c linux-2.6.31.1/drivers/media/dvb/dvb-core/dmxdev.c ---- linux-2.6.31.1/drivers/media/dvb/dvb-core/dmxdev.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/dvb/dvb-core/dmxdev.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1086,7 +1086,7 @@ static unsigned int dvb_dvr_poll(struct - return mask; - } - --static struct file_operations dvb_dvr_fops = { -+static const struct file_operations dvb_dvr_fops = { - .owner = THIS_MODULE, - .read = dvb_dvr_read, - .write = dvb_dvr_write, -diff -urNp linux-2.6.31.1/drivers/media/dvb/firewire/firedtv-ci.c linux-2.6.31.1/drivers/media/dvb/firewire/firedtv-ci.c ---- linux-2.6.31.1/drivers/media/dvb/firewire/firedtv-ci.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/dvb/firewire/firedtv-ci.c 2009-10-01 20:12:43.000000000 -0400 -@@ -215,7 +215,7 @@ static unsigned int fdtv_ca_io_poll(stru - return POLLIN; - } - --static struct file_operations fdtv_ca_fops = { -+static const struct file_operations fdtv_ca_fops = { - .owner = THIS_MODULE, - .ioctl = dvb_generic_ioctl, - .open = dvb_generic_open, -diff -urNp linux-2.6.31.1/drivers/media/video/cafe_ccic.c linux-2.6.31.1/drivers/media/video/cafe_ccic.c ---- linux-2.6.31.1/drivers/media/video/cafe_ccic.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/cafe_ccic.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1326,7 +1326,7 @@ static void cafe_v4l_vm_close(struct vm_ - mutex_unlock(&sbuf->cam->s_mutex); - } - --static struct vm_operations_struct cafe_v4l_vm_ops = { -+static const struct vm_operations_struct cafe_v4l_vm_ops = { - .open = cafe_v4l_vm_open, - .close = cafe_v4l_vm_close - }; -diff -urNp linux-2.6.31.1/drivers/media/video/et61x251/et61x251_core.c linux-2.6.31.1/drivers/media/video/et61x251/et61x251_core.c ---- linux-2.6.31.1/drivers/media/video/et61x251/et61x251_core.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/et61x251/et61x251_core.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1494,7 +1494,7 @@ static void et61x251_vm_close(struct vm_ - } - - --static struct vm_operations_struct et61x251_vm_ops = { -+static const struct vm_operations_struct et61x251_vm_ops = { - .open = et61x251_vm_open, - .close = et61x251_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/gspca/gspca.c linux-2.6.31.1/drivers/media/video/gspca/gspca.c ---- linux-2.6.31.1/drivers/media/video/gspca/gspca.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/gspca/gspca.c 2009-10-01 20:12:43.000000000 -0400 -@@ -99,7 +99,7 @@ static void gspca_vm_close(struct vm_are - frame->v4l2_buf.flags &= ~V4L2_BUF_FLAG_MAPPED; - } - --static struct vm_operations_struct gspca_vm_ops = { -+static const struct vm_operations_struct gspca_vm_ops = { - .open = gspca_vm_open, - .close = gspca_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/meye.c linux-2.6.31.1/drivers/media/video/meye.c ---- 
linux-2.6.31.1/drivers/media/video/meye.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/meye.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1589,7 +1589,7 @@ static void meye_vm_close(struct vm_area - meye.vma_use_count[idx]--; - } - --static struct vm_operations_struct meye_vm_ops = { -+static const struct vm_operations_struct meye_vm_ops = { - .open = meye_vm_open, - .close = meye_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/sn9c102/sn9c102_core.c linux-2.6.31.1/drivers/media/video/sn9c102/sn9c102_core.c ---- linux-2.6.31.1/drivers/media/video/sn9c102/sn9c102_core.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/sn9c102/sn9c102_core.c 2009-10-01 20:12:43.000000000 -0400 -@@ -2075,7 +2075,7 @@ static void sn9c102_vm_close(struct vm_a - } - - --static struct vm_operations_struct sn9c102_vm_ops = { -+static const struct vm_operations_struct sn9c102_vm_ops = { - .open = sn9c102_vm_open, - .close = sn9c102_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/stk-webcam.c linux-2.6.31.1/drivers/media/video/stk-webcam.c ---- linux-2.6.31.1/drivers/media/video/stk-webcam.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/stk-webcam.c 2009-10-01 20:12:43.000000000 -0400 -@@ -790,7 +790,7 @@ static void stk_v4l_vm_close(struct vm_a - if (sbuf->mapcount == 0) - sbuf->v4lbuf.flags &= ~V4L2_BUF_FLAG_MAPPED; - } --static struct vm_operations_struct stk_v4l_vm_ops = { -+static const struct vm_operations_struct stk_v4l_vm_ops = { - .open = stk_v4l_vm_open, - .close = stk_v4l_vm_close - }; -diff -urNp linux-2.6.31.1/drivers/media/video/usbvideo/konicawc.c linux-2.6.31.1/drivers/media/video/usbvideo/konicawc.c ---- linux-2.6.31.1/drivers/media/video/usbvideo/konicawc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/usbvideo/konicawc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -225,7 +225,7 @@ static void konicawc_register_input(stru - int error; - - usb_make_path(dev, cam->input_physname, sizeof(cam->input_physname)); -- strncat(cam->input_physname, "/input0", sizeof(cam->input_physname)); -+ strlcat(cam->input_physname, "/input0", sizeof(cam->input_physname)); - - cam->input = input_dev = input_allocate_device(); - if (!input_dev) { -diff -urNp linux-2.6.31.1/drivers/media/video/usbvideo/quickcam_messenger.c linux-2.6.31.1/drivers/media/video/usbvideo/quickcam_messenger.c ---- linux-2.6.31.1/drivers/media/video/usbvideo/quickcam_messenger.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/usbvideo/quickcam_messenger.c 2009-10-01 20:12:43.000000000 -0400 -@@ -89,7 +89,7 @@ static void qcm_register_input(struct qc - int error; - - usb_make_path(dev, cam->input_physname, sizeof(cam->input_physname)); -- strncat(cam->input_physname, "/input0", sizeof(cam->input_physname)); -+ strlcat(cam->input_physname, "/input0", sizeof(cam->input_physname)); - - cam->input = input_dev = input_allocate_device(); - if (!input_dev) { -diff -urNp linux-2.6.31.1/drivers/media/video/uvc/uvc_v4l2.c linux-2.6.31.1/drivers/media/video/uvc/uvc_v4l2.c ---- linux-2.6.31.1/drivers/media/video/uvc/uvc_v4l2.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/uvc/uvc_v4l2.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1063,7 +1063,7 @@ static void uvc_vm_close(struct vm_area_ - buffer->vma_use_count--; - } - --static struct vm_operations_struct uvc_vm_ops = { -+static const struct vm_operations_struct uvc_vm_ops = { - .open = uvc_vm_open, - .close = 
uvc_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/videobuf-dma-contig.c linux-2.6.31.1/drivers/media/video/videobuf-dma-contig.c ---- linux-2.6.31.1/drivers/media/video/videobuf-dma-contig.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/videobuf-dma-contig.c 2009-10-01 20:12:43.000000000 -0400 -@@ -105,7 +105,7 @@ static void videobuf_vm_close(struct vm_ - } - } - --static struct vm_operations_struct videobuf_vm_ops = { -+static const struct vm_operations_struct videobuf_vm_ops = { - .open = videobuf_vm_open, - .close = videobuf_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/vino.c linux-2.6.31.1/drivers/media/video/vino.c ---- linux-2.6.31.1/drivers/media/video/vino.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/vino.c 2009-10-01 20:12:43.000000000 -0400 -@@ -3858,7 +3858,7 @@ static void vino_vm_close(struct vm_area - dprintk("vino_vm_close(): count = %d\n", fb->map_count); - } - --static struct vm_operations_struct vino_vm_ops = { -+static const struct vm_operations_struct vino_vm_ops = { - .open = vino_vm_open, - .close = vino_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/zc0301/zc0301_core.c linux-2.6.31.1/drivers/media/video/zc0301/zc0301_core.c ---- linux-2.6.31.1/drivers/media/video/zc0301/zc0301_core.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/zc0301/zc0301_core.c 2009-10-01 20:12:43.000000000 -0400 -@@ -933,7 +933,7 @@ static void zc0301_vm_close(struct vm_ar - } - - --static struct vm_operations_struct zc0301_vm_ops = { -+static const struct vm_operations_struct zc0301_vm_ops = { - .open = zc0301_vm_open, - .close = zc0301_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/media/video/zoran/zoran_driver.c linux-2.6.31.1/drivers/media/video/zoran/zoran_driver.c ---- linux-2.6.31.1/drivers/media/video/zoran/zoran_driver.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/media/video/zoran/zoran_driver.c 2009-10-01 20:12:43.000000000 -0400 -@@ -3172,7 +3172,7 @@ zoran_vm_close (struct vm_area_struct *v - mutex_unlock(&zr->resource_lock); - } - --static struct vm_operations_struct zoran_vm_ops = { -+static const struct vm_operations_struct zoran_vm_ops = { - .open = zoran_vm_open, - .close = zoran_vm_close, - }; -diff -urNp linux-2.6.31.1/drivers/message/i2o/i2o_proc.c linux-2.6.31.1/drivers/message/i2o/i2o_proc.c ---- linux-2.6.31.1/drivers/message/i2o/i2o_proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/message/i2o/i2o_proc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -259,13 +259,6 @@ static char *scsi_devices[] = { - "Array Controller Device" - }; - --static char *chtostr(u8 * chars, int n) --{ -- char tmp[256]; -- tmp[0] = 0; -- return strncat(tmp, (char *)chars, n); --} -- - static int i2o_report_query_status(struct seq_file *seq, int block_status, - char *group) - { -@@ -842,8 +835,7 @@ static int i2o_seq_show_ddm_table(struct - - seq_printf(seq, "%-#7x", ddm_table.i2o_vendor_id); - seq_printf(seq, "%-#8x", ddm_table.module_id); -- seq_printf(seq, "%-29s", -- chtostr(ddm_table.module_name_version, 28)); -+ seq_printf(seq, "%-.28s", ddm_table.module_name_version); - seq_printf(seq, "%9d ", ddm_table.data_size); - seq_printf(seq, "%8d", ddm_table.code_size); - -@@ -944,8 +936,8 @@ static int i2o_seq_show_drivers_stored(s - - seq_printf(seq, "%-#7x", dst->i2o_vendor_id); - seq_printf(seq, "%-#8x", dst->module_id); -- seq_printf(seq, "%-29s", chtostr(dst->module_name_version, 28)); -- seq_printf(seq, "%-9s", 
chtostr(dst->date, 8)); -+ seq_printf(seq, "%-.28s", dst->module_name_version); -+ seq_printf(seq, "%-.8s", dst->date); - seq_printf(seq, "%8d ", dst->module_size); - seq_printf(seq, "%8d ", dst->mpb_size); - seq_printf(seq, "0x%04x", dst->module_flags); -@@ -1276,14 +1268,10 @@ static int i2o_seq_show_dev_identity(str - seq_printf(seq, "Device Class : %s\n", i2o_get_class_name(work16[0])); - seq_printf(seq, "Owner TID : %0#5x\n", work16[2]); - seq_printf(seq, "Parent TID : %0#5x\n", work16[3]); -- seq_printf(seq, "Vendor info : %s\n", -- chtostr((u8 *) (work32 + 2), 16)); -- seq_printf(seq, "Product info : %s\n", -- chtostr((u8 *) (work32 + 6), 16)); -- seq_printf(seq, "Description : %s\n", -- chtostr((u8 *) (work32 + 10), 16)); -- seq_printf(seq, "Product rev. : %s\n", -- chtostr((u8 *) (work32 + 14), 8)); -+ seq_printf(seq, "Vendor info : %.16s\n", (u8 *) (work32 + 2)); -+ seq_printf(seq, "Product info : %.16s\n", (u8 *) (work32 + 6)); -+ seq_printf(seq, "Description : %.16s\n", (u8 *) (work32 + 10)); -+ seq_printf(seq, "Product rev. : %.8s\n", (u8 *) (work32 + 14)); - - seq_printf(seq, "Serial number : "); - print_serial_number(seq, (u8 *) (work32 + 16), -@@ -1328,10 +1316,8 @@ static int i2o_seq_show_ddm_identity(str - } - - seq_printf(seq, "Registering DDM TID : 0x%03x\n", result.ddm_tid); -- seq_printf(seq, "Module name : %s\n", -- chtostr(result.module_name, 24)); -- seq_printf(seq, "Module revision : %s\n", -- chtostr(result.module_rev, 8)); -+ seq_printf(seq, "Module name : %.24s\n", result.module_name); -+ seq_printf(seq, "Module revision : %.8s\n", result.module_rev); - - seq_printf(seq, "Serial number : "); - print_serial_number(seq, result.serial_number, sizeof(result) - 36); -@@ -1362,14 +1348,10 @@ static int i2o_seq_show_uinfo(struct seq - return 0; - } - -- seq_printf(seq, "Device name : %s\n", -- chtostr(result.device_name, 64)); -- seq_printf(seq, "Service name : %s\n", -- chtostr(result.service_name, 64)); -- seq_printf(seq, "Physical name : %s\n", -- chtostr(result.physical_location, 64)); -- seq_printf(seq, "Instance number : %s\n", -- chtostr(result.instance_number, 4)); -+ seq_printf(seq, "Device name : %.64s\n", result.device_name); -+ seq_printf(seq, "Service name : %.64s\n", result.service_name); -+ seq_printf(seq, "Physical name : %.64s\n", result.physical_location); -+ seq_printf(seq, "Instance number : %.4s\n", result.instance_number); - - return 0; - } -diff -urNp linux-2.6.31.1/drivers/misc/ibmasm/ibmasmfs.c linux-2.6.31.1/drivers/misc/ibmasm/ibmasmfs.c ---- linux-2.6.31.1/drivers/misc/ibmasm/ibmasmfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/misc/ibmasm/ibmasmfs.c 2009-10-01 20:12:43.000000000 -0400 -@@ -97,7 +97,7 @@ static int ibmasmfs_get_super(struct fil - return get_sb_single(fst, flags, data, ibmasmfs_fill_super, mnt); - } - --static struct super_operations ibmasmfs_s_ops = { -+static const struct super_operations ibmasmfs_s_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - }; -diff -urNp linux-2.6.31.1/drivers/misc/phantom.c linux-2.6.31.1/drivers/misc/phantom.c ---- linux-2.6.31.1/drivers/misc/phantom.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/misc/phantom.c 2009-10-01 20:12:43.000000000 -0400 -@@ -271,7 +271,7 @@ static unsigned int phantom_poll(struct - return mask; - } - --static struct file_operations phantom_file_ops = { -+static const struct file_operations phantom_file_ops = { - .open = phantom_open, - .release = phantom_release, - .unlocked_ioctl = phantom_ioctl, -diff 
-urNp linux-2.6.31.1/drivers/misc/sgi-gru/grufile.c linux-2.6.31.1/drivers/misc/sgi-gru/grufile.c ---- linux-2.6.31.1/drivers/misc/sgi-gru/grufile.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/misc/sgi-gru/grufile.c 2009-10-01 20:12:43.000000000 -0400 -@@ -53,7 +53,7 @@ struct gru_stats_s gru_stats; - /* Guaranteed user available resources on each node */ - static int max_user_cbrs, max_user_dsr_bytes; - --static struct file_operations gru_fops; -+static const struct file_operations gru_fops; - static struct miscdevice gru_miscdev; - - -@@ -426,7 +426,7 @@ static void __exit gru_exit(void) - gru_proc_exit(); - } - --static struct file_operations gru_fops = { -+static const struct file_operations gru_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = gru_file_unlocked_ioctl, - .mmap = gru_file_mmap, -@@ -438,7 +438,7 @@ static struct miscdevice gru_miscdev = { - .fops = &gru_fops, - }; - --struct vm_operations_struct gru_vm_ops = { -+const struct vm_operations_struct gru_vm_ops = { - .close = gru_vma_close, - .fault = gru_fault, - }; -diff -urNp linux-2.6.31.1/drivers/misc/sgi-gru/grutables.h linux-2.6.31.1/drivers/misc/sgi-gru/grutables.h ---- linux-2.6.31.1/drivers/misc/sgi-gru/grutables.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/misc/sgi-gru/grutables.h 2009-10-01 20:12:43.000000000 -0400 -@@ -624,7 +624,7 @@ static inline int is_kernel_context(stru - */ - struct gru_unload_context_req; - --extern struct vm_operations_struct gru_vm_ops; -+extern const struct vm_operations_struct gru_vm_ops; - extern struct device *grudev; - - extern struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma, -diff -urNp linux-2.6.31.1/drivers/mmc/core/debugfs.c linux-2.6.31.1/drivers/mmc/core/debugfs.c ---- linux-2.6.31.1/drivers/mmc/core/debugfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/mmc/core/debugfs.c 2009-10-01 20:12:43.000000000 -0400 -@@ -240,7 +240,7 @@ static int mmc_ext_csd_release(struct in - return 0; - } - --static struct file_operations mmc_dbg_ext_csd_fops = { -+static const struct file_operations mmc_dbg_ext_csd_fops = { - .open = mmc_ext_csd_open, - .read = mmc_ext_csd_read, - .release = mmc_ext_csd_release, -diff -urNp linux-2.6.31.1/drivers/mtd/devices/doc2000.c linux-2.6.31.1/drivers/mtd/devices/doc2000.c ---- linux-2.6.31.1/drivers/mtd/devices/doc2000.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/mtd/devices/doc2000.c 2009-10-01 20:12:43.000000000 -0400 -@@ -776,7 +776,7 @@ static int doc_write(struct mtd_info *mt - - /* The ECC will not be calculated correctly if less than 512 is written */ - /* DBB- -- if (len != 0x200 && eccbuf) -+ if (len != 0x200) - printk(KERN_WARNING - "ECC needs a full sector write (adr: %lx size %lx)\n", - (long) to, (long) len); -diff -urNp linux-2.6.31.1/drivers/mtd/devices/doc2001.c linux-2.6.31.1/drivers/mtd/devices/doc2001.c ---- linux-2.6.31.1/drivers/mtd/devices/doc2001.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/mtd/devices/doc2001.c 2009-10-01 20:12:43.000000000 -0400 -@@ -395,6 +395,8 @@ static int doc_read (struct mtd_info *mt - /* Don't allow read past end of device */ - if (from >= this->totlen) - return -EINVAL; -+ if (!len) -+ return -EINVAL; - - /* Don't allow a single read to cross a 512-byte block boundary */ - if (from + len > ((from | 0x1ff) + 1)) -diff -urNp linux-2.6.31.1/drivers/mtd/ubi/build.c linux-2.6.31.1/drivers/mtd/ubi/build.c ---- linux-2.6.31.1/drivers/mtd/ubi/build.c 2009-09-24 11:45:25.000000000 -0400 -+++ 
linux-2.6.31.1/drivers/mtd/ubi/build.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1257,7 +1257,7 @@ static int __init bytes_str_to_int(const - unsigned long result; - - result = simple_strtoul(str, &endp, 0); -- if (str == endp || result < 0) { -+ if (str == endp) { - printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", - str); - return -EINVAL; -diff -urNp linux-2.6.31.1/drivers/net/irda/vlsi_ir.c linux-2.6.31.1/drivers/net/irda/vlsi_ir.c ---- linux-2.6.31.1/drivers/net/irda/vlsi_ir.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/net/irda/vlsi_ir.c 2009-10-01 20:12:43.000000000 -0400 -@@ -906,13 +906,12 @@ static int vlsi_hard_start_xmit(struct s - /* no race - tx-ring already empty */ - vlsi_set_baud(idev, iobase); - netif_wake_queue(ndev); -- } -- else -- ; -+ } else { - /* keep the speed change pending like it would - * for any len>0 packet. tx completion interrupt - * will apply it when the tx ring becomes empty. - */ -+ } - spin_unlock_irqrestore(&idev->lock, flags); - dev_kfree_skb_any(skb); - return 0; -diff -urNp linux-2.6.31.1/drivers/net/pcnet32.c linux-2.6.31.1/drivers/net/pcnet32.c ---- linux-2.6.31.1/drivers/net/pcnet32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/net/pcnet32.c 2009-10-01 20:12:43.000000000 -0400 -@@ -78,7 +78,7 @@ static int cards_found; - /* - * VLB I/O addresses - */ --static unsigned int pcnet32_portlist[] __initdata = -+static unsigned int pcnet32_portlist[] __devinitdata = - { 0x300, 0x320, 0x340, 0x360, 0 }; - - static int pcnet32_debug = 0; -diff -urNp linux-2.6.31.1/drivers/net/tg3.h linux-2.6.31.1/drivers/net/tg3.h ---- linux-2.6.31.1/drivers/net/tg3.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/net/tg3.h 2009-10-01 20:12:43.000000000 -0400 -@@ -89,6 +89,7 @@ - #define CHIPREV_ID_5750_A0 0x4000 - #define CHIPREV_ID_5750_A1 0x4001 - #define CHIPREV_ID_5750_A3 0x4003 -+#define CHIPREV_ID_5750_C1 0x4201 - #define CHIPREV_ID_5750_C2 0x4202 - #define CHIPREV_ID_5752_A0_HW 0x5000 - #define CHIPREV_ID_5752_A0 0x6000 -diff -urNp linux-2.6.31.1/drivers/net/usb/hso.c linux-2.6.31.1/drivers/net/usb/hso.c ---- linux-2.6.31.1/drivers/net/usb/hso.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/net/usb/hso.c 2009-10-01 20:12:43.000000000 -0400 -@@ -258,7 +258,7 @@ struct hso_serial { - - /* from usb_serial_port */ - struct tty_struct *tty; -- int open_count; -+ atomic_t open_count; - spinlock_t serial_lock; - - int (*write_data) (struct hso_serial *serial); -@@ -1179,7 +1179,7 @@ static void put_rxbuf_data_and_resubmit_ - struct urb *urb; - - urb = serial->rx_urb[0]; -- if (serial->open_count > 0) { -+ if (atomic_read(&serial->open_count) > 0) { - count = put_rxbuf_data(urb, serial); - if (count == -1) - return; -@@ -1215,7 +1215,7 @@ static void hso_std_serial_read_bulk_cal - DUMP1(urb->transfer_buffer, urb->actual_length); - - /* Anyone listening? 
*/ -- if (serial->open_count == 0) -+ if (atomic_read(&serial->open_count) == 0) - return; - - if (status == 0) { -@@ -1310,8 +1310,7 @@ static int hso_serial_open(struct tty_st - spin_unlock_irq(&serial->serial_lock); - - /* check for port already opened, if not set the termios */ -- serial->open_count++; -- if (serial->open_count == 1) { -+ if (atomic_inc_return(&serial->open_count) == 1) { - tty->low_latency = 1; - serial->rx_state = RX_IDLE; - /* Force default termio settings */ -@@ -1324,7 +1323,7 @@ static int hso_serial_open(struct tty_st - result = hso_start_serial_device(serial->parent, GFP_KERNEL); - if (result) { - hso_stop_serial_device(serial->parent); -- serial->open_count--; -+ atomic_dec(&serial->open_count); - kref_put(&serial->parent->ref, hso_serial_ref_free); - } - } else { -@@ -1361,10 +1360,10 @@ static void hso_serial_close(struct tty_ - - /* reset the rts and dtr */ - /* do the actual close */ -- serial->open_count--; -+ atomic_dec(&serial->open_count); - kref_put(&serial->parent->ref, hso_serial_ref_free); -- if (serial->open_count <= 0) { -- serial->open_count = 0; -+ if (atomic_read(&serial->open_count) <= 0) { -+ atomic_set(&serial->open_count, 0); - spin_lock_irq(&serial->serial_lock); - if (serial->tty == tty) { - serial->tty->driver_data = NULL; -@@ -1444,7 +1443,7 @@ static void hso_serial_set_termios(struc - - /* the actual setup */ - spin_lock_irqsave(&serial->serial_lock, flags); -- if (serial->open_count) -+ if (atomic_read(&serial->open_count)) - _hso_serial_set_termios(tty, old); - else - tty->termios = old; -@@ -3087,7 +3086,7 @@ static int hso_resume(struct usb_interfa - /* Start all serial ports */ - for (i = 0; i < HSO_SERIAL_TTY_MINORS; i++) { - if (serial_table[i] && (serial_table[i]->interface == iface)) { -- if (dev2ser(serial_table[i])->open_count) { -+ if (atomic_read(&dev2ser(serial_table[i])->open_count)) { - result = - hso_start_serial_device(serial_table[i], GFP_NOIO); - hso_kick_transmit(dev2ser(serial_table[i])); -diff -urNp linux-2.6.31.1/drivers/oprofile/buffer_sync.c linux-2.6.31.1/drivers/oprofile/buffer_sync.c ---- linux-2.6.31.1/drivers/oprofile/buffer_sync.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/oprofile/buffer_sync.c 2009-10-01 20:12:43.000000000 -0400 -@@ -341,7 +341,7 @@ static void add_data(struct op_entry *en - if (cookie == NO_COOKIE) - offset = pc; - if (cookie == INVALID_COOKIE) { -- atomic_inc(&oprofile_stats.sample_lost_no_mapping); -+ atomic_inc_unchecked(&oprofile_stats.sample_lost_no_mapping); - offset = pc; - } - if (cookie != last_cookie) { -@@ -385,14 +385,14 @@ add_sample(struct mm_struct *mm, struct - /* add userspace sample */ - - if (!mm) { -- atomic_inc(&oprofile_stats.sample_lost_no_mm); -+ atomic_inc_unchecked(&oprofile_stats.sample_lost_no_mm); - return 0; - } - - cookie = lookup_dcookie(mm, s->eip, &offset); - - if (cookie == INVALID_COOKIE) { -- atomic_inc(&oprofile_stats.sample_lost_no_mapping); -+ atomic_inc_unchecked(&oprofile_stats.sample_lost_no_mapping); - return 0; - } - -@@ -561,7 +561,7 @@ void sync_buffer(int cpu) - /* ignore backtraces if failed to add a sample */ - if (state == sb_bt_start) { - state = sb_bt_ignore; -- atomic_inc(&oprofile_stats.bt_lost_no_mapping); -+ atomic_inc_unchecked(&oprofile_stats.bt_lost_no_mapping); - } - } - release_mm(mm); -diff -urNp linux-2.6.31.1/drivers/oprofile/event_buffer.c linux-2.6.31.1/drivers/oprofile/event_buffer.c ---- linux-2.6.31.1/drivers/oprofile/event_buffer.c 2009-09-24 11:45:25.000000000 -0400 -+++ 
linux-2.6.31.1/drivers/oprofile/event_buffer.c 2009-10-01 20:12:43.000000000 -0400 -@@ -42,7 +42,7 @@ static atomic_t buffer_ready = ATOMIC_IN - void add_event_entry(unsigned long value) - { - if (buffer_pos == buffer_size) { -- atomic_inc(&oprofile_stats.event_lost_overflow); -+ atomic_inc_unchecked(&oprofile_stats.event_lost_overflow); - return; - } - -diff -urNp linux-2.6.31.1/drivers/oprofile/oprofilefs.c linux-2.6.31.1/drivers/oprofile/oprofilefs.c ---- linux-2.6.31.1/drivers/oprofile/oprofilefs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/oprofile/oprofilefs.c 2009-10-01 20:12:43.000000000 -0400 -@@ -35,7 +35,7 @@ static struct inode *oprofilefs_get_inod - } - - --static struct super_operations s_ops = { -+static const struct super_operations s_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - }; -@@ -187,7 +187,7 @@ static const struct file_operations atom - - - int oprofilefs_create_ro_atomic(struct super_block *sb, struct dentry *root, -- char const *name, atomic_t *val) -+ char const *name, atomic_unchecked_t *val) - { - struct dentry *d = __oprofilefs_create_file(sb, root, name, - &atomic_ro_fops, 0444); -diff -urNp linux-2.6.31.1/drivers/oprofile/oprofile_stats.c linux-2.6.31.1/drivers/oprofile/oprofile_stats.c ---- linux-2.6.31.1/drivers/oprofile/oprofile_stats.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/oprofile/oprofile_stats.c 2009-10-01 20:12:43.000000000 -0400 -@@ -30,10 +30,10 @@ void oprofile_reset_stats(void) - cpu_buf->sample_invalid_eip = 0; - } - -- atomic_set(&oprofile_stats.sample_lost_no_mm, 0); -- atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); -- atomic_set(&oprofile_stats.event_lost_overflow, 0); -- atomic_set(&oprofile_stats.bt_lost_no_mapping, 0); -+ atomic_set_unchecked(&oprofile_stats.sample_lost_no_mm, 0); -+ atomic_set_unchecked(&oprofile_stats.sample_lost_no_mapping, 0); -+ atomic_set_unchecked(&oprofile_stats.event_lost_overflow, 0); -+ atomic_set_unchecked(&oprofile_stats.bt_lost_no_mapping, 0); - } - - -diff -urNp linux-2.6.31.1/drivers/oprofile/oprofile_stats.h linux-2.6.31.1/drivers/oprofile/oprofile_stats.h ---- linux-2.6.31.1/drivers/oprofile/oprofile_stats.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/oprofile/oprofile_stats.h 2009-10-01 20:12:43.000000000 -0400 -@@ -13,10 +13,10 @@ - #include <asm/atomic.h> - - struct oprofile_stat_struct { -- atomic_t sample_lost_no_mm; -- atomic_t sample_lost_no_mapping; -- atomic_t bt_lost_no_mapping; -- atomic_t event_lost_overflow; -+ atomic_unchecked_t sample_lost_no_mm; -+ atomic_unchecked_t sample_lost_no_mapping; -+ atomic_unchecked_t bt_lost_no_mapping; -+ atomic_unchecked_t event_lost_overflow; - }; - - extern struct oprofile_stat_struct oprofile_stats; -diff -urNp linux-2.6.31.1/drivers/pci/hotplug/cpqphp_nvram.c linux-2.6.31.1/drivers/pci/hotplug/cpqphp_nvram.c ---- linux-2.6.31.1/drivers/pci/hotplug/cpqphp_nvram.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pci/hotplug/cpqphp_nvram.c 2009-10-01 20:12:43.000000000 -0400 -@@ -428,9 +428,13 @@ static u32 store_HRT (void __iomem *rom_ - - void compaq_nvram_init (void __iomem *rom_start) - { -+ -+#ifndef CONFIG_PAX_KERNEXEC - if (rom_start) { - compaq_int15_entry_point = (rom_start + ROM_INT15_PHY_ADDR - ROM_PHY_ADDR); - } -+#endif -+ - dbg("int15 entry = %p\n", compaq_int15_entry_point); - - /* initialize our int15 lock */ -diff -urNp linux-2.6.31.1/drivers/pci/pcie/portdrv_pci.c linux-2.6.31.1/drivers/pci/pcie/portdrv_pci.c ---- 
linux-2.6.31.1/drivers/pci/pcie/portdrv_pci.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pci/pcie/portdrv_pci.c 2009-10-01 20:12:43.000000000 -0400 -@@ -249,7 +249,7 @@ static void pcie_portdrv_err_resume(stru - static const struct pci_device_id port_pci_ids[] = { { - /* handle any PCI-Express port */ - PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x00), ~0), -- }, { /* end: all zeroes */ } -+ }, { 0, 0, 0, 0, 0, 0, 0 } - }; - MODULE_DEVICE_TABLE(pci, port_pci_ids); - -diff -urNp linux-2.6.31.1/drivers/pci/proc.c linux-2.6.31.1/drivers/pci/proc.c ---- linux-2.6.31.1/drivers/pci/proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pci/proc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -480,7 +480,16 @@ static const struct file_operations proc - static int __init pci_proc_init(void) - { - struct pci_dev *dev = NULL; -+ -+#ifdef CONFIG_GRKERNSEC_PROC_ADD -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ proc_bus_pci_dir = proc_mkdir_mode("bus/pci", S_IRUSR | S_IXUSR, NULL); -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ proc_bus_pci_dir = proc_mkdir_mode("bus/pci", S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP, NULL); -+#endif -+#else - proc_bus_pci_dir = proc_mkdir("bus/pci", NULL); -+#endif - proc_create("devices", 0, proc_bus_pci_dir, - &proc_bus_pci_dev_operations); - proc_initialized = 1; -diff -urNp linux-2.6.31.1/drivers/pcmcia/ti113x.h linux-2.6.31.1/drivers/pcmcia/ti113x.h ---- linux-2.6.31.1/drivers/pcmcia/ti113x.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pcmcia/ti113x.h 2009-10-01 20:12:43.000000000 -0400 -@@ -903,7 +903,7 @@ static struct pci_device_id ene_tune_tbl - DEVID(PCI_VENDOR_ID_MOTOROLA, 0x3410, 0xECC0, PCI_ANY_ID, - ENE_TEST_C9_TLTENABLE | ENE_TEST_C9_PFENABLE, ENE_TEST_C9_TLTENABLE), - -- {} -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - static void ene_tune_bridge(struct pcmcia_socket *sock, struct pci_bus *bus) -diff -urNp linux-2.6.31.1/drivers/pcmcia/yenta_socket.c linux-2.6.31.1/drivers/pcmcia/yenta_socket.c ---- linux-2.6.31.1/drivers/pcmcia/yenta_socket.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pcmcia/yenta_socket.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1366,7 +1366,7 @@ static struct pci_device_id yenta_table - - /* match any cardbus bridge */ - CB_ID(PCI_ANY_ID, PCI_ANY_ID, DEFAULT), -- { /* all zeroes */ } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - MODULE_DEVICE_TABLE(pci, yenta_table); - -diff -urNp linux-2.6.31.1/drivers/pnp/pnpbios/bioscalls.c linux-2.6.31.1/drivers/pnp/pnpbios/bioscalls.c ---- linux-2.6.31.1/drivers/pnp/pnpbios/bioscalls.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pnp/pnpbios/bioscalls.c 2009-10-01 20:12:43.000000000 -0400 -@@ -60,7 +60,7 @@ set_base(gdt[(selname) >> 3], (u32)(addr - set_limit(gdt[(selname) >> 3], size); \ - } while(0) - --static struct desc_struct bad_bios_desc; -+static struct desc_struct bad_bios_desc __read_only; - - /* - * At some point we want to use this stack frame pointer to unwind -@@ -87,6 +87,10 @@ static inline u16 call_pnp_bios(u16 func - struct desc_struct save_desc_40; - int cpu; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - /* - * PnP BIOSes are generally not terribly re-entrant. - * Also, don't rely on them to save everything correctly. 
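A note on the spelled-out table terminators in the port_pci_ids, ene_tune_tbl and yenta_table hunks above: in C, an empty "{ }" or "{ 0 }" initializer already zero-fills every member, so replacing it with "{ 0, 0, 0, 0, 0, 0, 0 }" is behaviour-neutral; it only makes the all-zero sentinel explicit, presumably to satisfy stricter warning flags such as -Wmissing-field-initializers. A stand-alone sketch of the idiom, with struct sample_id and its entries invented for illustration:

#include <stdio.h>

struct sample_id {
	unsigned int vendor;
	unsigned int device;
	unsigned int dev_class;
	unsigned int class_mask;
};

static const struct sample_id sample_ids[] = {
	{ 0x8086, 0x1237, 0x0600, 0xffff },
	{ 0x10de, 0x0020, 0x0200, 0xffff },
	{ 0, 0, 0, 0 } /* explicit all-zero terminator */
};

int main(void)
{
	const struct sample_id *id;

	/* match loops walk the table until the zeroed sentinel,
	 * much as the kernel's device-ID lookups do */
	for (id = sample_ids; id->vendor; id++)
		printf("%04x:%04x\n", id->vendor, id->device);
	return 0;
}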
-@@ -96,8 +100,17 @@ static inline u16 call_pnp_bios(u16 func - - cpu = get_cpu(); - save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - /* On some boxes IRQ's during PnP BIOS calls are deadly. */ - spin_lock_irqsave(&pnp_bios_lock, flags); - -@@ -134,7 +147,16 @@ static inline u16 call_pnp_bios(u16 func - :"memory"); - spin_unlock_irqrestore(&pnp_bios_lock, flags); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - put_cpu(); - - /* If we get here and this is set then the PnP BIOS faulted on us. */ -@@ -468,16 +490,24 @@ int pnp_bios_read_escd(char *data, u32 n - return status; - } - --void pnpbios_calls_init(union pnp_bios_install_struct *header) -+void __init pnpbios_calls_init(union pnp_bios_install_struct *header) - { - int i; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - spin_lock_init(&pnp_bios_lock); - pnp_bios_callpoint.offset = header->fields.pm16offset; - pnp_bios_callpoint.segment = PNP_CS16; - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - bad_bios_desc.a = 0; -- bad_bios_desc.b = 0x00409200; -+ bad_bios_desc.b = 0x00409300; - - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); -@@ -491,4 +521,9 @@ void pnpbios_calls_init(union pnp_bios_i - set_base(gdt[GDT_ENTRY_PNPBIOS_DS], - __va(header->fields.pm16dseg)); - } -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - } -diff -urNp linux-2.6.31.1/drivers/pnp/quirks.c linux-2.6.31.1/drivers/pnp/quirks.c ---- linux-2.6.31.1/drivers/pnp/quirks.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pnp/quirks.c 2009-10-01 20:12:43.000000000 -0400 -@@ -327,7 +327,7 @@ static struct pnp_fixup pnp_fixups[] = { - /* PnP resources that might overlap PCI BARs */ - {"PNP0c01", quirk_system_pci_resources}, - {"PNP0c02", quirk_system_pci_resources}, -- {""} -+ {"", NULL} - }; - - void pnp_fixup_device(struct pnp_dev *dev) -diff -urNp linux-2.6.31.1/drivers/pnp/resource.c linux-2.6.31.1/drivers/pnp/resource.c ---- linux-2.6.31.1/drivers/pnp/resource.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/pnp/resource.c 2009-10-01 20:12:43.000000000 -0400 -@@ -355,7 +355,7 @@ int pnp_check_irq(struct pnp_dev *dev, s - return 1; - - /* check if the resource is valid */ -- if (*irq < 0 || *irq > 15) -+ if (*irq > 15) - return 0; - - /* check if the resource is reserved */ -@@ -419,7 +419,7 @@ int pnp_check_dma(struct pnp_dev *dev, s - return 1; - - /* check if the resource is valid */ -- if (*dma < 0 || *dma == 4 || *dma > 7) -+ if (*dma == 4 || *dma > 7) - return 0; - - /* check if the resource is reserved */ -diff -urNp linux-2.6.31.1/drivers/s390/cio/qdio_debug.c linux-2.6.31.1/drivers/s390/cio/qdio_debug.c ---- linux-2.6.31.1/drivers/s390/cio/qdio_debug.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/s390/cio/qdio_debug.c 2009-10-01 20:12:43.000000000 -0400 -@@ -144,7 +144,7 @@ static void remove_debugfs_entry(struct - } - } - --static struct file_operations debugfs_fops = { -+static const struct file_operations debugfs_fops = { - .owner = THIS_MODULE, - .open = qstat_seq_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/drivers/s390/cio/qdio_perf.c 
linux-2.6.31.1/drivers/s390/cio/qdio_perf.c ---- linux-2.6.31.1/drivers/s390/cio/qdio_perf.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/s390/cio/qdio_perf.c 2009-10-01 20:12:43.000000000 -0400 -@@ -84,7 +84,7 @@ static int qdio_perf_seq_open(struct ino - return single_open(filp, qdio_perf_proc_show, NULL); - } - --static struct file_operations qdio_perf_proc_fops = { -+static const struct file_operations qdio_perf_proc_fops = { - .owner = THIS_MODULE, - .open = qdio_perf_seq_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/drivers/scsi/libfc/fc_exch.c linux-2.6.31.1/drivers/scsi/libfc/fc_exch.c ---- linux-2.6.31.1/drivers/scsi/libfc/fc_exch.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/scsi/libfc/fc_exch.c 2009-10-01 20:12:43.000000000 -0400 -@@ -73,12 +73,12 @@ struct fc_exch_mgr { - * all together if not used XXX - */ - struct { -- atomic_t no_free_exch; -- atomic_t no_free_exch_xid; -- atomic_t xid_not_found; -- atomic_t xid_busy; -- atomic_t seq_not_found; -- atomic_t non_bls_resp; -+ atomic_unchecked_t no_free_exch; -+ atomic_unchecked_t no_free_exch_xid; -+ atomic_unchecked_t xid_not_found; -+ atomic_unchecked_t xid_busy; -+ atomic_unchecked_t seq_not_found; -+ atomic_unchecked_t non_bls_resp; - } stats; - struct fc_exch **exches; /* for exch pointers indexed by xid */ - }; -@@ -523,7 +523,7 @@ struct fc_exch *fc_exch_alloc(struct fc_ - /* allocate memory for exchange */ - ep = mempool_alloc(mp->ep_pool, GFP_ATOMIC); - if (!ep) { -- atomic_inc(&mp->stats.no_free_exch); -+ atomic_inc_unchecked(&mp->stats.no_free_exch); - goto out; - } - memset(ep, 0, sizeof(*ep)); -@@ -568,7 +568,7 @@ out: - return ep; - err: - spin_unlock_bh(&mp->em_lock); -- atomic_inc(&mp->stats.no_free_exch_xid); -+ atomic_inc_unchecked(&mp->stats.no_free_exch_xid); - mempool_free(ep, mp->ep_pool); - return NULL; - } -@@ -671,7 +671,7 @@ static enum fc_pf_rjt_reason fc_seq_look - xid = ntohs(fh->fh_ox_id); /* we originated exch */ - ep = fc_exch_find(mp, xid); - if (!ep) { -- atomic_inc(&mp->stats.xid_not_found); -+ atomic_inc_unchecked(&mp->stats.xid_not_found); - reject = FC_RJT_OX_ID; - goto out; - } -@@ -701,7 +701,7 @@ static enum fc_pf_rjt_reason fc_seq_look - ep = fc_exch_find(mp, xid); - if ((f_ctl & FC_FC_FIRST_SEQ) && fc_sof_is_init(fr_sof(fp))) { - if (ep) { -- atomic_inc(&mp->stats.xid_busy); -+ atomic_inc_unchecked(&mp->stats.xid_busy); - reject = FC_RJT_RX_ID; - goto rel; - } -@@ -712,7 +712,7 @@ static enum fc_pf_rjt_reason fc_seq_look - } - xid = ep->xid; /* get our XID */ - } else if (!ep) { -- atomic_inc(&mp->stats.xid_not_found); -+ atomic_inc_unchecked(&mp->stats.xid_not_found); - reject = FC_RJT_RX_ID; /* XID not found */ - goto out; - } -@@ -733,7 +733,7 @@ static enum fc_pf_rjt_reason fc_seq_look - } else { - sp = &ep->seq; - if (sp->id != fh->fh_seq_id) { -- atomic_inc(&mp->stats.seq_not_found); -+ atomic_inc_unchecked(&mp->stats.seq_not_found); - reject = FC_RJT_SEQ_ID; /* sequence/exch should exist */ - goto rel; - } -@@ -1145,22 +1145,22 @@ static void fc_exch_recv_seq_resp(struct - - ep = fc_exch_find(mp, ntohs(fh->fh_ox_id)); - if (!ep) { -- atomic_inc(&mp->stats.xid_not_found); -+ atomic_inc_unchecked(&mp->stats.xid_not_found); - goto out; - } - if (ep->esb_stat & ESB_ST_COMPLETE) { -- atomic_inc(&mp->stats.xid_not_found); -+ atomic_inc_unchecked(&mp->stats.xid_not_found); - goto out; - } - if (ep->rxid == FC_XID_UNKNOWN) - ep->rxid = ntohs(fh->fh_rx_id); - if (ep->sid != 0 && ep->sid != ntoh24(fh->fh_d_id)) { -- 
atomic_inc(&mp->stats.xid_not_found); -+ atomic_inc_unchecked(&mp->stats.xid_not_found); - goto rel; - } - if (ep->did != ntoh24(fh->fh_s_id) && - ep->did != FC_FID_FLOGI) { -- atomic_inc(&mp->stats.xid_not_found); -+ atomic_inc_unchecked(&mp->stats.xid_not_found); - goto rel; - } - sof = fr_sof(fp); -@@ -1171,7 +1171,7 @@ static void fc_exch_recv_seq_resp(struct - } else { - sp = &ep->seq; - if (sp->id != fh->fh_seq_id) { -- atomic_inc(&mp->stats.seq_not_found); -+ atomic_inc_unchecked(&mp->stats.seq_not_found); - goto rel; - } - } -@@ -1230,10 +1230,10 @@ static void fc_exch_recv_resp(struct fc_ - - sp = fc_seq_lookup_orig(mp, fp); /* doesn't hold sequence */ - if (!sp) { -- atomic_inc(&mp->stats.xid_not_found); -+ atomic_inc_unchecked(&mp->stats.xid_not_found); - FC_EM_DBG(mp, "seq lookup failed\n"); - } else { -- atomic_inc(&mp->stats.non_bls_resp); -+ atomic_inc_unchecked(&mp->stats.non_bls_resp); - FC_EM_DBG(mp, "non-BLS response to sequence"); - } - fc_frame_free(fp); -diff -urNp linux-2.6.31.1/drivers/scsi/scsi_logging.h linux-2.6.31.1/drivers/scsi/scsi_logging.h ---- linux-2.6.31.1/drivers/scsi/scsi_logging.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/scsi/scsi_logging.h 2009-10-01 20:12:43.000000000 -0400 -@@ -51,7 +51,7 @@ do { \ - } while (0); \ - } while (0) - #else --#define SCSI_CHECK_LOGGING(SHIFT, BITS, LEVEL, CMD) -+#define SCSI_CHECK_LOGGING(SHIFT, BITS, LEVEL, CMD) do {} while (0) - #endif /* CONFIG_SCSI_LOGGING */ - - /* -diff -urNp linux-2.6.31.1/drivers/scsi/sg.c linux-2.6.31.1/drivers/scsi/sg.c ---- linux-2.6.31.1/drivers/scsi/sg.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/scsi/sg.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1185,7 +1185,7 @@ sg_vma_fault(struct vm_area_struct *vma, - return VM_FAULT_SIGBUS; - } - --static struct vm_operations_struct sg_mmap_vm_ops = { -+static const struct vm_operations_struct sg_mmap_vm_ops = { - .fault = sg_vma_fault, - }; - -@@ -1317,7 +1317,7 @@ static void sg_rq_end_io(struct request - } - } - --static struct file_operations sg_fops = { -+static const struct file_operations sg_fops = { - .owner = THIS_MODULE, - .read = sg_read, - .write = sg_write, -@@ -2194,8 +2194,11 @@ static int sg_proc_seq_show_int(struct s - static int sg_proc_single_open_adio(struct inode *inode, struct file *file); - static ssize_t sg_proc_write_adio(struct file *filp, const char __user *buffer, - size_t count, loff_t *off); --static struct file_operations adio_fops = { -- /* .owner, .read and .llseek added in sg_proc_init() */ -+ -+static const struct file_operations adio_fops = { -+ .owner = THIS_MODULE, -+ .read = seq_read, -+ .llseek = seq_lseek, - .open = sg_proc_single_open_adio, - .write = sg_proc_write_adio, - .release = single_release, -@@ -2204,7 +2207,10 @@ static struct file_operations adio_fops - static int sg_proc_single_open_dressz(struct inode *inode, struct file *file); - static ssize_t sg_proc_write_dressz(struct file *filp, - const char __user *buffer, size_t count, loff_t *off); --static struct file_operations dressz_fops = { -+static const struct file_operations dressz_fops = { -+ .owner = THIS_MODULE, -+ .read = seq_read, -+ .llseek = seq_lseek, - .open = sg_proc_single_open_dressz, - .write = sg_proc_write_dressz, - .release = single_release, -@@ -2212,14 +2218,20 @@ static struct file_operations dressz_fop - - static int sg_proc_seq_show_version(struct seq_file *s, void *v); - static int sg_proc_single_open_version(struct inode *inode, struct file *file); --static struct file_operations 
version_fops = { -+static const struct file_operations version_fops = { -+ .owner = THIS_MODULE, -+ .read = seq_read, -+ .llseek = seq_lseek, - .open = sg_proc_single_open_version, - .release = single_release, - }; - - static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v); - static int sg_proc_single_open_devhdr(struct inode *inode, struct file *file); --static struct file_operations devhdr_fops = { -+static const struct file_operations devhdr_fops = { -+ .owner = THIS_MODULE, -+ .read = seq_read, -+ .llseek = seq_lseek, - .open = sg_proc_single_open_devhdr, - .release = single_release, - }; -@@ -2229,11 +2241,14 @@ static int sg_proc_open_dev(struct inode - static void * dev_seq_start(struct seq_file *s, loff_t *pos); - static void * dev_seq_next(struct seq_file *s, void *v, loff_t *pos); - static void dev_seq_stop(struct seq_file *s, void *v); --static struct file_operations dev_fops = { -+static const struct file_operations dev_fops = { -+ .owner = THIS_MODULE, -+ .read = seq_read, -+ .llseek = seq_lseek, - .open = sg_proc_open_dev, - .release = seq_release, - }; --static struct seq_operations dev_seq_ops = { -+static const struct seq_operations dev_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, -@@ -2242,11 +2257,14 @@ static struct seq_operations dev_seq_ops - - static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v); - static int sg_proc_open_devstrs(struct inode *inode, struct file *file); --static struct file_operations devstrs_fops = { -+static const struct file_operations devstrs_fops = { -+ .owner = THIS_MODULE, -+ .read = seq_read, -+ .llseek = seq_lseek, - .open = sg_proc_open_devstrs, - .release = seq_release, - }; --static struct seq_operations devstrs_seq_ops = { -+static const struct seq_operations devstrs_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, -@@ -2255,11 +2273,14 @@ static struct seq_operations devstrs_seq - - static int sg_proc_seq_show_debug(struct seq_file *s, void *v); - static int sg_proc_open_debug(struct inode *inode, struct file *file); --static struct file_operations debug_fops = { -+static const struct file_operations debug_fops = { -+ .owner = THIS_MODULE, -+ .read = seq_read, -+ .llseek = seq_lseek, - .open = sg_proc_open_debug, - .release = seq_release, - }; --static struct seq_operations debug_seq_ops = { -+static const struct seq_operations debug_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, -@@ -2269,7 +2290,7 @@ static struct seq_operations debug_seq_o - - struct sg_proc_leaf { - const char * name; -- struct file_operations * fops; -+ const struct file_operations * fops; - }; - - static struct sg_proc_leaf sg_proc_leaf_arr[] = { -@@ -2295,9 +2316,6 @@ sg_proc_init(void) - for (k = 0; k < num_leaves; ++k) { - leaf = &sg_proc_leaf_arr[k]; - mask = leaf->fops->write ? 
S_IRUGO | S_IWUSR : S_IRUGO; -- leaf->fops->owner = THIS_MODULE; -- leaf->fops->read = seq_read; -- leaf->fops->llseek = seq_lseek; - proc_create(leaf->name, mask, sg_proc_sgp, leaf->fops); - } - return 0; -diff -urNp linux-2.6.31.1/drivers/serial/8250_pci.c linux-2.6.31.1/drivers/serial/8250_pci.c ---- linux-2.6.31.1/drivers/serial/8250_pci.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/serial/8250_pci.c 2009-10-01 20:12:43.000000000 -0400 -@@ -3580,7 +3580,7 @@ static struct pci_device_id serial_pci_t - PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_COMMUNICATION_MULTISERIAL << 8, - 0xffff00, pbn_default }, -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - static struct pci_driver serial_pci_driver = { -diff -urNp linux-2.6.31.1/drivers/spi/spidev.c linux-2.6.31.1/drivers/spi/spidev.c ---- linux-2.6.31.1/drivers/spi/spidev.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/spi/spidev.c 2009-10-01 20:12:43.000000000 -0400 -@@ -537,7 +537,7 @@ static int spidev_release(struct inode * - return status; - } - --static struct file_operations spidev_fops = { -+static const struct file_operations spidev_fops = { - .owner = THIS_MODULE, - /* REVISIT switch to aio primitives, so that userspace - * gets more complete API coverage. It'll simplify things -diff -urNp linux-2.6.31.1/drivers/staging/android/binder.c linux-2.6.31.1/drivers/staging/android/binder.c ---- linux-2.6.31.1/drivers/staging/android/binder.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/android/binder.c 2009-10-01 20:12:43.000000000 -0400 -@@ -2717,7 +2717,7 @@ static void binder_vma_close(struct vm_a - binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); - } - --static struct vm_operations_struct binder_vm_ops = { -+static const struct vm_operations_struct binder_vm_ops = { - .open = binder_vma_open, - .close = binder_vma_close, - }; -diff -urNp linux-2.6.31.1/drivers/staging/b3dfg/b3dfg.c linux-2.6.31.1/drivers/staging/b3dfg/b3dfg.c ---- linux-2.6.31.1/drivers/staging/b3dfg/b3dfg.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/b3dfg/b3dfg.c 2009-10-01 20:12:43.000000000 -0400 -@@ -454,7 +454,7 @@ static int b3dfg_vma_fault(struct vm_are - return VM_FAULT_NOPAGE; - } - --static struct vm_operations_struct b3dfg_vm_ops = { -+static const struct vm_operations_struct b3dfg_vm_ops = { - .fault = b3dfg_vma_fault, - }; - -@@ -854,7 +854,7 @@ static int b3dfg_mmap(struct file *filp, - return r; - } - --static struct file_operations b3dfg_fops = { -+static const struct file_operations b3dfg_fops = { - .owner = THIS_MODULE, - .open = b3dfg_open, - .release = b3dfg_release, -diff -urNp linux-2.6.31.1/drivers/staging/comedi/comedi_fops.c linux-2.6.31.1/drivers/staging/comedi/comedi_fops.c ---- linux-2.6.31.1/drivers/staging/comedi/comedi_fops.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/comedi/comedi_fops.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1370,7 +1370,7 @@ void comedi_unmap(struct vm_area_struct - mutex_unlock(&dev->mutex); - } - --static struct vm_operations_struct comedi_vm_ops = { -+static const struct vm_operations_struct comedi_vm_ops = { - .close = comedi_unmap, - }; - -diff -urNp linux-2.6.31.1/drivers/staging/cpc-usb/cpc-usb_drv.c linux-2.6.31.1/drivers/staging/cpc-usb/cpc-usb_drv.c ---- linux-2.6.31.1/drivers/staging/cpc-usb/cpc-usb_drv.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/cpc-usb/cpc-usb_drv.c 2009-10-01 20:12:43.000000000 -0400 -@@ -104,7 +104,7 @@ static void 
cpcusb_read_interrupt_callba - - static int cpcusb_setup_intrep(CPC_USB_T *card); - --static struct file_operations cpcusb_fops = { -+static const struct file_operations cpcusb_fops = { - /* - * The owner field is part of the module-locking - * mechanism. The idea is that the kernel knows -diff -urNp linux-2.6.31.1/drivers/staging/epl/EplApiLinuxKernel.c linux-2.6.31.1/drivers/staging/epl/EplApiLinuxKernel.c ---- linux-2.6.31.1/drivers/staging/epl/EplApiLinuxKernel.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/epl/EplApiLinuxKernel.c 2009-10-01 20:12:43.000000000 -0400 -@@ -203,7 +203,7 @@ static int EplLinIoctl(struct inode *pDe - module_init(EplLinInit); - module_exit(EplLinExit); - --static struct file_operations EplLinFileOps_g = { -+static const struct file_operations EplLinFileOps_g = { - .owner = THIS_MODULE, - .open = EplLinOpen, - .release = EplLinRelease, -diff -urNp linux-2.6.31.1/drivers/staging/go7007/go7007-v4l2.c linux-2.6.31.1/drivers/staging/go7007/go7007-v4l2.c ---- linux-2.6.31.1/drivers/staging/go7007/go7007-v4l2.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/go7007/go7007-v4l2.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1717,7 +1717,7 @@ static int go7007_vm_fault(struct vm_are - return 0; - } - --static struct vm_operations_struct go7007_vm_ops = { -+static const struct vm_operations_struct go7007_vm_ops = { - .open = go7007_vm_open, - .close = go7007_vm_close, - .fault = go7007_vm_fault, -diff -urNp linux-2.6.31.1/drivers/staging/panel/panel.c linux-2.6.31.1/drivers/staging/panel/panel.c ---- linux-2.6.31.1/drivers/staging/panel/panel.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/panel/panel.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1263,7 +1263,7 @@ static int lcd_release(struct inode *ino - return 0; - } - --static struct file_operations lcd_fops = { -+static const struct file_operations lcd_fops = { - .write = lcd_write, - .open = lcd_open, - .release = lcd_release, -@@ -1519,7 +1519,7 @@ static int keypad_release(struct inode * - return 0; - } - --static struct file_operations keypad_fops = { -+static const struct file_operations keypad_fops = { - .read = keypad_read, /* read */ - .open = keypad_open, /* open */ - .release = keypad_release, /* close */ -diff -urNp linux-2.6.31.1/drivers/staging/poch/poch.c linux-2.6.31.1/drivers/staging/poch/poch.c ---- linux-2.6.31.1/drivers/staging/poch/poch.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/poch/poch.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1056,7 +1056,7 @@ static int poch_ioctl(struct inode *inod - return 0; - } - --static struct file_operations poch_fops = { -+static const struct file_operations poch_fops = { - .owner = THIS_MODULE, - .open = poch_open, - .release = poch_release, -diff -urNp linux-2.6.31.1/drivers/staging/rtl8192su/ieee80211/proc.c linux-2.6.31.1/drivers/staging/rtl8192su/ieee80211/proc.c ---- linux-2.6.31.1/drivers/staging/rtl8192su/ieee80211/proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/staging/rtl8192su/ieee80211/proc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -87,7 +87,7 @@ static int c_show(struct seq_file *m, vo - return 0; - } - --static struct seq_operations crypto_seq_ops = { -+static const struct seq_operations crypto_seq_ops = { - .start = c_start, - .next = c_next, - .stop = c_stop, -@@ -99,7 +99,7 @@ static int crypto_info_open(struct inode - return seq_open(file, &crypto_seq_ops); - } - --static struct file_operations proc_crypto_ops = { -+static 
const struct file_operations proc_crypto_ops = { - .open = crypto_info_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/drivers/uio/uio.c linux-2.6.31.1/drivers/uio/uio.c ---- linux-2.6.31.1/drivers/uio/uio.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/uio/uio.c 2009-10-01 20:12:43.000000000 -0400 -@@ -658,7 +658,7 @@ static int uio_vma_fault(struct vm_area_ - return 0; - } - --static struct vm_operations_struct uio_vm_ops = { -+static const struct vm_operations_struct uio_vm_ops = { - .open = uio_vma_open, - .close = uio_vma_close, - .fault = uio_vma_fault, -diff -urNp linux-2.6.31.1/drivers/usb/atm/usbatm.c linux-2.6.31.1/drivers/usb/atm/usbatm.c ---- linux-2.6.31.1/drivers/usb/atm/usbatm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/atm/usbatm.c 2009-10-01 20:12:43.000000000 -0400 -@@ -333,7 +333,7 @@ static void usbatm_extract_one_cell(stru - if (printk_ratelimit()) - atm_warn(instance, "%s: OAM not supported (vpi %d, vci %d)!\n", - __func__, vpi, vci); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - return; - } - -@@ -361,7 +361,7 @@ static void usbatm_extract_one_cell(stru - if (length > ATM_MAX_AAL5_PDU) { - atm_rldbg(instance, "%s: bogus length %u (vcc: 0x%p)!\n", - __func__, length, vcc); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - goto out; - } - -@@ -370,14 +370,14 @@ static void usbatm_extract_one_cell(stru - if (sarb->len < pdu_length) { - atm_rldbg(instance, "%s: bogus pdu_length %u (sarb->len: %u, vcc: 0x%p)!\n", - __func__, pdu_length, sarb->len, vcc); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - goto out; - } - - if (crc32_be(~0, skb_tail_pointer(sarb) - pdu_length, pdu_length) != 0xc704dd7b) { - atm_rldbg(instance, "%s: packet failed crc check (vcc: 0x%p)!\n", - __func__, vcc); -- atomic_inc(&vcc->stats->rx_err); -+ atomic_inc_unchecked(&vcc->stats->rx_err); - goto out; - } - -@@ -387,7 +387,7 @@ static void usbatm_extract_one_cell(stru - if (printk_ratelimit()) - atm_err(instance, "%s: no memory for skb (length: %u)!\n", - __func__, length); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - goto out; - } - -@@ -412,7 +412,7 @@ static void usbatm_extract_one_cell(stru - - vcc->push(vcc, skb); - -- atomic_inc(&vcc->stats->rx); -+ atomic_inc_unchecked(&vcc->stats->rx); - out: - skb_trim(sarb, 0); - } -@@ -616,7 +616,7 @@ static void usbatm_tx_process(unsigned l - struct atm_vcc *vcc = UDSL_SKB(skb)->atm.vcc; - - usbatm_pop(vcc, skb); -- atomic_inc(&vcc->stats->tx); -+ atomic_inc_unchecked(&vcc->stats->tx); - - skb = skb_dequeue(&instance->sndqueue); - } -@@ -775,11 +775,11 @@ static int usbatm_atm_proc_read(struct a - if (!left--) - return sprintf(page, - "AAL5: tx %d ( %d err ), rx %d ( %d err, %d drop )\n", -- atomic_read(&atm_dev->stats.aal5.tx), -- atomic_read(&atm_dev->stats.aal5.tx_err), -- atomic_read(&atm_dev->stats.aal5.rx), -- atomic_read(&atm_dev->stats.aal5.rx_err), -- atomic_read(&atm_dev->stats.aal5.rx_drop)); -+ atomic_read_unchecked(&atm_dev->stats.aal5.tx), -+ atomic_read_unchecked(&atm_dev->stats.aal5.tx_err), -+ atomic_read_unchecked(&atm_dev->stats.aal5.rx), -+ atomic_read_unchecked(&atm_dev->stats.aal5.rx_err), -+ atomic_read_unchecked(&atm_dev->stats.aal5.rx_drop)); - - if (!left--) { - if (instance->disconnected) -diff -urNp linux-2.6.31.1/drivers/usb/class/cdc-acm.c linux-2.6.31.1/drivers/usb/class/cdc-acm.c ---- 
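The usbatm.c hunks just above swap atomic_inc() for atomic_inc_unchecked() on the ATM statistics counters. In PaX terms these are pure statistics that may legitimately wrap, so they are moved to an unchecked atomic type exempt from the patch set's atomic overflow checking, while real reference counts keep the checked type. A standalone sketch of that split, with C11 atomics standing in for the kernel's (the names and the overflow policy below are illustrative, not PaX's implementation):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <limits.h>

    typedef struct { atomic_int v; } atomic_checked_t;   /* refcounts   */
    typedef struct { atomic_int v; } atomic_unchecked_t; /* statistics  */

    static void atomic_inc_checked(atomic_checked_t *a)
    {
            int old = atomic_fetch_add(&a->v, 1);
            if (old == INT_MAX)            /* overflow: a refcount bug   */
                    fprintf(stderr, "refcount overflow detected\n");
    }

    static void atomic_inc_unchecked_(atomic_unchecked_t *a)
    {
            atomic_fetch_add(&a->v, 1);    /* counters may wrap freely   */
    }

    int main(void)
    {
            atomic_checked_t   refs   = { 0 };
            atomic_unchecked_t rx_err = { 0 };

            atomic_inc_checked(&refs);
            atomic_inc_unchecked_(&rx_err);   /* like vcc->stats->rx_err */
            printf("refs=%d rx_err=%d\n",
                   atomic_load(&refs.v), atomic_load(&rx_err.v));
            return 0;
    }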
linux-2.6.31.1/drivers/usb/class/cdc-acm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/class/cdc-acm.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1529,7 +1529,7 @@ static struct usb_device_id acm_ids[] = - USB_CDC_ACM_PROTO_AT_CDMA) }, - - /* NOTE: COMM/ACM/0xff is likely MSFT RNDIS ... NOT a modem!! */ -- { } -+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(usb, acm_ids); -diff -urNp linux-2.6.31.1/drivers/usb/class/usblp.c linux-2.6.31.1/drivers/usb/class/usblp.c ---- linux-2.6.31.1/drivers/usb/class/usblp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/class/usblp.c 2009-10-01 20:12:43.000000000 -0400 -@@ -228,7 +228,7 @@ static const struct quirk_printer_struct - { 0x0482, 0x0010, USBLP_QUIRK_BIDIR }, /* Kyocera Mita FS 820, by zut kernel@zut.de */ - { 0x04f9, 0x000d, USBLP_QUIRK_BIDIR }, /* Brother Industries, Ltd HL-1440 Laser Printer */ - { 0x04b8, 0x0202, USBLP_QUIRK_BAD_CLASS }, /* Seiko Epson Receipt Printer M129C */ -- { 0, 0 } -+ { 0, 0, 0 } - }; - - static int usblp_wwait(struct usblp *usblp, int nonblock); -@@ -1412,7 +1412,7 @@ static struct usb_device_id usblp_ids [] - { USB_INTERFACE_INFO(7, 1, 2) }, - { USB_INTERFACE_INFO(7, 1, 3) }, - { USB_DEVICE(0x04b8, 0x0202) }, /* Seiko Epson Receipt Printer M129C */ -- { } /* Terminating entry */ -+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* Terminating entry */ - }; - - MODULE_DEVICE_TABLE (usb, usblp_ids); -diff -urNp linux-2.6.31.1/drivers/usb/class/usbtmc.c linux-2.6.31.1/drivers/usb/class/usbtmc.c ---- linux-2.6.31.1/drivers/usb/class/usbtmc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/class/usbtmc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -956,7 +956,7 @@ static long usbtmc_ioctl(struct file *fi - return retval; - } - --static struct file_operations fops = { -+static const struct file_operations fops = { - .owner = THIS_MODULE, - .read = usbtmc_read, - .write = usbtmc_write, -diff -urNp linux-2.6.31.1/drivers/usb/core/hub.c linux-2.6.31.1/drivers/usb/core/hub.c ---- linux-2.6.31.1/drivers/usb/core/hub.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/core/hub.c 2009-10-01 20:12:43.000000000 -0400 -@@ -3284,7 +3284,7 @@ static struct usb_device_id hub_id_table - .bDeviceClass = USB_CLASS_HUB}, - { .match_flags = USB_DEVICE_ID_MATCH_INT_CLASS, - .bInterfaceClass = USB_CLASS_HUB}, -- { } /* Terminating entry */ -+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* Terminating entry */ - }; - - MODULE_DEVICE_TABLE (usb, hub_id_table); -diff -urNp linux-2.6.31.1/drivers/usb/core/inode.c linux-2.6.31.1/drivers/usb/core/inode.c ---- linux-2.6.31.1/drivers/usb/core/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/core/inode.c 2009-10-01 20:12:43.000000000 -0400 -@@ -48,7 +48,7 @@ - #define USBFS_DEFAULT_BUSMODE (S_IXUGO | S_IRUGO) - #define USBFS_DEFAULT_LISTMODE S_IRUGO - --static struct super_operations usbfs_ops; -+static const struct super_operations usbfs_ops; - static const struct file_operations default_file_operations; - static struct vfsmount *usbfs_mount; - static int usbfs_mount_count; /* = 0 */ -@@ -449,7 +449,7 @@ static const struct file_operations defa - .llseek = default_file_lseek, - }; - --static struct super_operations usbfs_ops = { -+static const struct super_operations usbfs_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .remount_fs = remount, -diff -urNp linux-2.6.31.1/drivers/usb/core/message.c linux-2.6.31.1/drivers/usb/core/message.c ---- 
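The { } to { 0, 0, ... } rewrites of the terminating entries in acm_ids, usblp_ids, hub_id_table and similar device ID tables above do not change the compiled tables: in C both spellings zero-initialize every member, and the matching code stops at the first all-zero entry either way. Spelling the sentinel out merely makes termination explicit and silences missing-field-initializer warnings at stricter warning levels. A compact illustration with a hypothetical table type:

    #include <stdio.h>

    /* Analogous to usb_device_id: a match table walked until the sentinel. */
    struct dev_id {
            unsigned short vendor;
            unsigned short product;
            unsigned long  driver_info;
    };

    static const struct dev_id id_table[] = {
            { 0x04b8, 0x0202, 1 },
            { 0x0482, 0x0010, 2 },
            { 0, 0, 0 }          /* terminating entry, explicitly zeroed */
    };

    int main(void)
    {
            /* Stop at the all-zero sentinel -- identical behaviour whether
             * it was written "{ }" or "{ 0, 0, 0 }". */
            for (const struct dev_id *id = id_table; id->vendor; id++)
                    printf("%04x:%04x -> %lu\n",
                           id->vendor, id->product, id->driver_info);
            return 0;
    }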
linux-2.6.31.1/drivers/usb/core/message.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/core/message.c 2009-10-01 20:12:43.000000000 -0400 -@@ -926,8 +926,8 @@ char *usb_cache_string(struct usb_device - buf = kmalloc(MAX_USB_STRING_SIZE, GFP_KERNEL); - if (buf) { - len = usb_string(udev, index, buf, MAX_USB_STRING_SIZE); -- if (len > 0) { -- smallbuf = kmalloc(++len, GFP_KERNEL); -+ if (len++ > 0) { -+ smallbuf = kmalloc(len, GFP_KERNEL); - if (!smallbuf) - return buf; - memcpy(smallbuf, buf, len); -diff -urNp linux-2.6.31.1/drivers/usb/gadget/inode.c linux-2.6.31.1/drivers/usb/gadget/inode.c ---- linux-2.6.31.1/drivers/usb/gadget/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/gadget/inode.c 2009-10-01 20:12:43.000000000 -0400 -@@ -2033,7 +2033,7 @@ gadgetfs_create_file (struct super_block - return inode; - } - --static struct super_operations gadget_fs_operations = { -+static const struct super_operations gadget_fs_operations = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - }; -diff -urNp linux-2.6.31.1/drivers/usb/gadget/printer.c linux-2.6.31.1/drivers/usb/gadget/printer.c ---- linux-2.6.31.1/drivers/usb/gadget/printer.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/gadget/printer.c 2009-10-01 20:12:43.000000000 -0400 -@@ -875,7 +875,7 @@ printer_ioctl(struct file *fd, unsigned - } - - /* used after endpoint configuration */ --static struct file_operations printer_io_operations = { -+static const struct file_operations printer_io_operations = { - .owner = THIS_MODULE, - .open = printer_open, - .read = printer_read, -diff -urNp linux-2.6.31.1/drivers/usb/host/ehci-pci.c linux-2.6.31.1/drivers/usb/host/ehci-pci.c ---- linux-2.6.31.1/drivers/usb/host/ehci-pci.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/host/ehci-pci.c 2009-10-01 20:12:43.000000000 -0400 -@@ -416,7 +416,7 @@ static const struct pci_device_id pci_id - PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_EHCI, ~0), - .driver_data = (unsigned long) &ehci_pci_hc_driver, - }, -- { /* end: all zeroes */ } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - MODULE_DEVICE_TABLE(pci, pci_ids); - -diff -urNp linux-2.6.31.1/drivers/usb/host/uhci-hcd.c linux-2.6.31.1/drivers/usb/host/uhci-hcd.c ---- linux-2.6.31.1/drivers/usb/host/uhci-hcd.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/host/uhci-hcd.c 2009-10-01 20:12:43.000000000 -0400 -@@ -927,7 +927,7 @@ static const struct pci_device_id uhci_p - /* handle any USB UHCI controller */ - PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_UHCI, ~0), - .driver_data = (unsigned long) &uhci_driver, -- }, { /* end: all zeroes */ } -+ }, { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, uhci_pci_ids); -diff -urNp linux-2.6.31.1/drivers/usb/host/whci/debug.c linux-2.6.31.1/drivers/usb/host/whci/debug.c ---- linux-2.6.31.1/drivers/usb/host/whci/debug.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/host/whci/debug.c 2009-10-01 20:12:43.000000000 -0400 -@@ -134,7 +134,7 @@ static int pzl_open(struct inode *inode, - return single_open(file, pzl_print, inode->i_private); - } - --static struct file_operations di_fops = { -+static const struct file_operations di_fops = { - .open = di_open, - .read = seq_read, - .llseek = seq_lseek, -@@ -142,7 +142,7 @@ static struct file_operations di_fops = - .owner = THIS_MODULE, - }; - --static struct file_operations asl_fops = { -+static const struct file_operations asl_fops = { - .open = asl_open, - .read = seq_read, - .llseek = 
seq_lseek, -@@ -150,7 +150,7 @@ static struct file_operations asl_fops = - .owner = THIS_MODULE, - }; - --static struct file_operations pzl_fops = { -+static const struct file_operations pzl_fops = { - .open = pzl_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/drivers/usb/mon/mon_bin.c linux-2.6.31.1/drivers/usb/mon/mon_bin.c ---- linux-2.6.31.1/drivers/usb/mon/mon_bin.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/mon/mon_bin.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1184,7 +1184,7 @@ static int mon_bin_vma_fault(struct vm_a - return 0; - } - --static struct vm_operations_struct mon_bin_vm_ops = { -+static const struct vm_operations_struct mon_bin_vm_ops = { - .open = mon_bin_vma_open, - .close = mon_bin_vma_close, - .fault = mon_bin_vma_fault, -diff -urNp linux-2.6.31.1/drivers/usb/storage/debug.h linux-2.6.31.1/drivers/usb/storage/debug.h ---- linux-2.6.31.1/drivers/usb/storage/debug.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/storage/debug.h 2009-10-01 20:12:43.000000000 -0400 -@@ -54,9 +54,9 @@ void usb_stor_show_sense( unsigned char - #define US_DEBUGPX(x...) printk( x ) - #define US_DEBUG(x) x - #else --#define US_DEBUGP(x...) --#define US_DEBUGPX(x...) --#define US_DEBUG(x) -+#define US_DEBUGP(x...) do {} while (0) -+#define US_DEBUGPX(x...) do {} while (0) -+#define US_DEBUG(x) do {} while (0) - #endif - - #endif -diff -urNp linux-2.6.31.1/drivers/usb/storage/usb.c linux-2.6.31.1/drivers/usb/storage/usb.c ---- linux-2.6.31.1/drivers/usb/storage/usb.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/storage/usb.c 2009-10-01 20:12:43.000000000 -0400 -@@ -118,7 +118,7 @@ MODULE_PARM_DESC(quirks, "supplemental l - - static struct us_unusual_dev us_unusual_dev_list[] = { - # include "unusual_devs.h" -- { } /* Terminating entry */ -+ { NULL, NULL, 0, 0, NULL } /* Terminating entry */ - }; - - #undef UNUSUAL_DEV -diff -urNp linux-2.6.31.1/drivers/usb/storage/usual-tables.c linux-2.6.31.1/drivers/usb/storage/usual-tables.c ---- linux-2.6.31.1/drivers/usb/storage/usual-tables.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/usb/storage/usual-tables.c 2009-10-01 20:12:43.000000000 -0400 -@@ -48,7 +48,7 @@ - - struct usb_device_id usb_storage_usb_ids[] = { - # include "unusual_devs.h" -- { } /* Terminating entry */ -+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* Terminating entry */ - }; - EXPORT_SYMBOL_GPL(usb_storage_usb_ids); - -diff -urNp linux-2.6.31.1/drivers/uwb/uwb-debug.c linux-2.6.31.1/drivers/uwb/uwb-debug.c ---- linux-2.6.31.1/drivers/uwb/uwb-debug.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/uwb/uwb-debug.c 2009-10-01 20:12:43.000000000 -0400 -@@ -205,7 +205,7 @@ static ssize_t command_write(struct file - return ret < 0 ? 
ret : len; - } - --static struct file_operations command_fops = { -+static const struct file_operations command_fops = { - .open = command_open, - .write = command_write, - .read = NULL, -@@ -255,7 +255,7 @@ static int reservations_open(struct inod - return single_open(file, reservations_print, inode->i_private); - } - --static struct file_operations reservations_fops = { -+static const struct file_operations reservations_fops = { - .open = reservations_open, - .read = seq_read, - .llseek = seq_lseek, -@@ -283,7 +283,7 @@ static int drp_avail_open(struct inode * - return single_open(file, drp_avail_print, inode->i_private); - } - --static struct file_operations drp_avail_fops = { -+static const struct file_operations drp_avail_fops = { - .open = drp_avail_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/drivers/uwb/wlp/messages.c linux-2.6.31.1/drivers/uwb/wlp/messages.c ---- linux-2.6.31.1/drivers/uwb/wlp/messages.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/uwb/wlp/messages.c 2009-10-01 20:12:43.000000000 -0400 -@@ -903,7 +903,7 @@ int wlp_parse_f0(struct wlp *wlp, struct - size_t len = skb->len; - size_t used; - ssize_t result; -- struct wlp_nonce enonce, rnonce; -+ struct wlp_nonce enonce = {{0}}, rnonce = {{0}}; - enum wlp_assc_error assc_err; - char enonce_buf[WLP_WSS_NONCE_STRSIZE]; - char rnonce_buf[WLP_WSS_NONCE_STRSIZE]; -diff -urNp linux-2.6.31.1/drivers/video/fb_defio.c linux-2.6.31.1/drivers/video/fb_defio.c ---- linux-2.6.31.1/drivers/video/fb_defio.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/fb_defio.c 2009-10-01 20:12:43.000000000 -0400 -@@ -125,7 +125,7 @@ page_already_added: - return 0; - } - --static struct vm_operations_struct fb_deferred_io_vm_ops = { -+static const struct vm_operations_struct fb_deferred_io_vm_ops = { - .fault = fb_deferred_io_fault, - .page_mkwrite = fb_deferred_io_mkwrite, - }; -diff -urNp linux-2.6.31.1/drivers/video/fbmem.c linux-2.6.31.1/drivers/video/fbmem.c ---- linux-2.6.31.1/drivers/video/fbmem.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/fbmem.c 2009-10-01 20:12:43.000000000 -0400 -@@ -403,7 +403,7 @@ static void fb_do_show_logo(struct fb_in - image->dx += image->width + 8; - } - } else if (rotate == FB_ROTATE_UD) { -- for (x = 0; x < num && image->dx >= 0; x++) { -+ for (x = 0; x < num && (__s32)image->dx >= 0; x++) { - info->fbops->fb_imageblit(info, image); - image->dx -= image->width + 8; - } -@@ -415,7 +415,7 @@ static void fb_do_show_logo(struct fb_in - image->dy += image->height + 8; - } - } else if (rotate == FB_ROTATE_CCW) { -- for (x = 0; x < num && image->dy >= 0; x++) { -+ for (x = 0; x < num && (__s32)image->dy >= 0; x++) { - info->fbops->fb_imageblit(info, image); - image->dy -= image->height + 8; - } -@@ -1108,7 +1108,7 @@ static long do_fb_ioctl(struct fb_info * - return -EFAULT; - if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) - return -EINVAL; -- if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) -+ if (con2fb.framebuffer >= FB_MAX) - return -EINVAL; - if (!registered_fb[con2fb.framebuffer]) - request_module("fb%d", con2fb.framebuffer); -diff -urNp linux-2.6.31.1/drivers/video/fbmon.c linux-2.6.31.1/drivers/video/fbmon.c ---- linux-2.6.31.1/drivers/video/fbmon.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/fbmon.c 2009-10-01 20:12:43.000000000 -0400 -@@ -45,7 +45,7 @@ - #ifdef DEBUG - #define DPRINTK(fmt, args...) printk(fmt,## args) - #else --#define DPRINTK(fmt, args...) 
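The US_DEBUGP/US_DEBUGPX/US_DEBUG hunk above, and the DPRINTK hunk that follows, replace empty debug macros with do {} while (0). The do-while form makes the macro behave as exactly one statement: it demands the trailing semicolon (a missing one now fails to compile instead of being silently absorbed) and keeps an if-body from collapsing into a bare empty statement that -Wempty-body would flag. A small demonstration with shortened macro names:

    #include <stdio.h>

    #define DBG_EMPTY(fmt, ...)              /* expands to nothing       */
    #define DBG(fmt, ...) do {} while (0)    /* the patched form         */

    int main(void)
    {
            int err = 1;

            /* With DBG_EMPTY this reduces to "if (err) ;", a bare empty
             * statement compilers flag, and DBG_EMPTY("x") without a
             * trailing ";" would still compile, hiding typos.  DBG()
             * is always one real statement and requires the ";". */
            if (err)
                    DBG("error %d\n", err);

            printf("done\n");
            return 0;
    }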
-+#define DPRINTK(fmt, args...) do {} while (0) - #endif - - #define FBMON_FIX_HEADER 1 -diff -urNp linux-2.6.31.1/drivers/video/i810/i810_accel.c linux-2.6.31.1/drivers/video/i810/i810_accel.c ---- linux-2.6.31.1/drivers/video/i810/i810_accel.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/i810/i810_accel.c 2009-10-01 20:12:43.000000000 -0400 -@@ -73,6 +73,7 @@ static inline int wait_for_space(struct - } - } - printk("ringbuffer lockup!!!\n"); -+ printk("head:%u tail:%u iring.size:%u space:%u\n", head, tail, par->iring.size, space); - i810_report_error(mmio); - par->dev_flags |= LOCKUP; - info->pixmap.scan_align = 1; -diff -urNp linux-2.6.31.1/drivers/video/i810/i810_main.c linux-2.6.31.1/drivers/video/i810/i810_main.c ---- linux-2.6.31.1/drivers/video/i810/i810_main.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/i810/i810_main.c 2009-10-01 20:12:43.000000000 -0400 -@@ -120,7 +120,7 @@ static struct pci_device_id i810fb_pci_t - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4 }, - { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_CGC, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 5 }, -- { 0 }, -+ { 0, 0, 0, 0, 0, 0, 0 }, - }; - - static struct pci_driver i810fb_driver = { -diff -urNp linux-2.6.31.1/drivers/video/modedb.c linux-2.6.31.1/drivers/video/modedb.c ---- linux-2.6.31.1/drivers/video/modedb.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/modedb.c 2009-10-01 20:12:43.000000000 -0400 -@@ -38,240 +38,240 @@ static const struct fb_videomode modedb[ - { - /* 640x400 @ 70 Hz, 31.5 kHz hsync */ - NULL, 70, 640, 400, 39721, 40, 24, 39, 9, 96, 2, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 640x480 @ 60 Hz, 31.5 kHz hsync */ - NULL, 60, 640, 480, 39721, 40, 24, 32, 11, 96, 2, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 800x600 @ 56 Hz, 35.15 kHz hsync */ - NULL, 56, 800, 600, 27777, 128, 24, 22, 1, 72, 2, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1024x768 @ 87 Hz interlaced, 35.5 kHz hsync */ - NULL, 87, 1024, 768, 22271, 56, 24, 33, 8, 160, 8, -- 0, FB_VMODE_INTERLACED -+ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 640x400 @ 85 Hz, 37.86 kHz hsync */ - NULL, 85, 640, 400, 31746, 96, 32, 41, 1, 64, 3, -- FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 640x480 @ 72 Hz, 36.5 kHz hsync */ - NULL, 72, 640, 480, 31746, 144, 40, 30, 8, 40, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 640x480 @ 75 Hz, 37.50 kHz hsync */ - NULL, 75, 640, 480, 31746, 120, 16, 16, 1, 64, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 800x600 @ 60 Hz, 37.8 kHz hsync */ - NULL, 60, 800, 600, 25000, 88, 40, 23, 1, 128, 4, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 640x480 @ 85 Hz, 43.27 kHz hsync */ - NULL, 85, 640, 480, 27777, 80, 56, 25, 1, 56, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1152x864 @ 89 Hz interlaced, 44 kHz hsync */ - NULL, 89, 1152, 864, 15384, 96, 16, 110, 1, 216, 10, -- 0, FB_VMODE_INTERLACED -+ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 800x600 @ 72 Hz, 48.0 kHz hsync */ - NULL, 72, 800, 600, 20000, 64, 56, 23, 37, 120, 6, -- 
FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1024x768 @ 60 Hz, 48.4 kHz hsync */ - NULL, 60, 1024, 768, 15384, 168, 8, 29, 3, 144, 6, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 640x480 @ 100 Hz, 53.01 kHz hsync */ - NULL, 100, 640, 480, 21834, 96, 32, 36, 8, 96, 6, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1152x864 @ 60 Hz, 53.5 kHz hsync */ - NULL, 60, 1152, 864, 11123, 208, 64, 16, 4, 256, 8, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 800x600 @ 85 Hz, 55.84 kHz hsync */ - NULL, 85, 800, 600, 16460, 160, 64, 36, 16, 64, 5, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1024x768 @ 70 Hz, 56.5 kHz hsync */ - NULL, 70, 1024, 768, 13333, 144, 24, 29, 3, 136, 6, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x1024 @ 87 Hz interlaced, 51 kHz hsync */ - NULL, 87, 1280, 1024, 12500, 56, 16, 128, 1, 216, 12, -- 0, FB_VMODE_INTERLACED -+ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 800x600 @ 100 Hz, 64.02 kHz hsync */ - NULL, 100, 800, 600, 14357, 160, 64, 30, 4, 64, 6, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1024x768 @ 76 Hz, 62.5 kHz hsync */ - NULL, 76, 1024, 768, 11764, 208, 8, 36, 16, 120, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1152x864 @ 70 Hz, 62.4 kHz hsync */ - NULL, 70, 1152, 864, 10869, 106, 56, 20, 1, 160, 10, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x1024 @ 61 Hz, 64.2 kHz hsync */ - NULL, 61, 1280, 1024, 9090, 200, 48, 26, 1, 184, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1400x1050 @ 60Hz, 63.9 kHz hsync */ - NULL, 60, 1400, 1050, 9259, 136, 40, 13, 1, 112, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1400x1050 @ 75,107 Hz, 82,392 kHz +hsync +vsync*/ - NULL, 75, 1400, 1050, 7190, 120, 56, 23, 10, 112, 13, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1400x1050 @ 60 Hz, ? 
kHz +hsync +vsync*/ - NULL, 60, 1400, 1050, 9259, 128, 40, 12, 0, 112, 3, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1024x768 @ 85 Hz, 70.24 kHz hsync */ - NULL, 85, 1024, 768, 10111, 192, 32, 34, 14, 160, 6, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1152x864 @ 78 Hz, 70.8 kHz hsync */ - NULL, 78, 1152, 864, 9090, 228, 88, 32, 0, 84, 12, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x1024 @ 70 Hz, 74.59 kHz hsync */ - NULL, 70, 1280, 1024, 7905, 224, 32, 28, 8, 160, 8, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1600x1200 @ 60Hz, 75.00 kHz hsync */ - NULL, 60, 1600, 1200, 6172, 304, 64, 46, 1, 192, 3, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1152x864 @ 84 Hz, 76.0 kHz hsync */ - NULL, 84, 1152, 864, 7407, 184, 312, 32, 0, 128, 12, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x1024 @ 74 Hz, 78.85 kHz hsync */ - NULL, 74, 1280, 1024, 7407, 256, 32, 34, 3, 144, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1024x768 @ 100Hz, 80.21 kHz hsync */ - NULL, 100, 1024, 768, 8658, 192, 32, 21, 3, 192, 10, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x1024 @ 76 Hz, 81.13 kHz hsync */ - NULL, 76, 1280, 1024, 7407, 248, 32, 34, 3, 104, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1600x1200 @ 70 Hz, 87.50 kHz hsync */ - NULL, 70, 1600, 1200, 5291, 304, 64, 46, 1, 192, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1152x864 @ 100 Hz, 89.62 kHz hsync */ - NULL, 100, 1152, 864, 7264, 224, 32, 17, 2, 128, 19, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x1024 @ 85 Hz, 91.15 kHz hsync */ - NULL, 85, 1280, 1024, 6349, 224, 64, 44, 1, 160, 3, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1600x1200 @ 75 Hz, 93.75 kHz hsync */ - NULL, 75, 1600, 1200, 4938, 304, 64, 46, 1, 192, 3, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1680x1050 @ 60 Hz, 65.191 kHz hsync */ - NULL, 60, 1680, 1050, 6848, 280, 104, 30, 3, 176, 6, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1600x1200 @ 85 Hz, 105.77 kHz hsync */ - NULL, 85, 1600, 1200, 4545, 272, 16, 37, 4, 192, 3, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x1024 @ 100 Hz, 107.16 kHz hsync */ - NULL, 100, 1280, 1024, 5502, 256, 32, 26, 7, 128, 15, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1800x1440 @ 64Hz, 96.15 kHz hsync */ - NULL, 64, 1800, 1440, 4347, 304, 96, 46, 1, 192, 3, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ 
FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1800x1440 @ 70Hz, 104.52 kHz hsync */ - NULL, 70, 1800, 1440, 4000, 304, 96, 46, 1, 192, 3, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 512x384 @ 78 Hz, 31.50 kHz hsync */ - NULL, 78, 512, 384, 49603, 48, 16, 16, 1, 64, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 512x384 @ 85 Hz, 34.38 kHz hsync */ - NULL, 85, 512, 384, 45454, 48, 16, 16, 1, 64, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 320x200 @ 70 Hz, 31.5 kHz hsync, 8:5 aspect ratio */ - NULL, 70, 320, 200, 79440, 16, 16, 20, 4, 48, 1, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 320x240 @ 60 Hz, 31.5 kHz hsync, 4:3 aspect ratio */ - NULL, 60, 320, 240, 79440, 16, 16, 16, 5, 48, 1, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 320x240 @ 72 Hz, 36.5 kHz hsync */ - NULL, 72, 320, 240, 63492, 16, 16, 16, 4, 48, 2, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 400x300 @ 56 Hz, 35.2 kHz hsync, 4:3 aspect ratio */ - NULL, 56, 400, 300, 55555, 64, 16, 10, 1, 32, 1, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 400x300 @ 60 Hz, 37.8 kHz hsync */ - NULL, 60, 400, 300, 50000, 48, 16, 11, 1, 64, 2, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 400x300 @ 72 Hz, 48.0 kHz hsync */ - NULL, 72, 400, 300, 40000, 32, 24, 11, 19, 64, 3, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 480x300 @ 56 Hz, 35.2 kHz hsync, 8:5 aspect ratio */ - NULL, 56, 480, 300, 46176, 80, 16, 10, 1, 40, 1, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 480x300 @ 60 Hz, 37.8 kHz hsync */ - NULL, 60, 480, 300, 41858, 56, 16, 11, 1, 80, 2, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 480x300 @ 63 Hz, 39.6 kHz hsync */ - NULL, 63, 480, 300, 40000, 56, 16, 11, 1, 80, 2, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 480x300 @ 72 Hz, 48.0 kHz hsync */ - NULL, 72, 480, 300, 33386, 40, 24, 11, 19, 80, 3, -- 0, FB_VMODE_DOUBLE -+ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN - }, { - /* 1920x1200 @ 60 Hz, 74.5 Khz hsync */ - NULL, 60, 1920, 1200, 5177, 128, 336, 1, 38, 208, 3, - FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, -- FB_VMODE_NONINTERLACED -+ FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1152x768, 60 Hz, PowerBook G4 Titanium I and II */ - NULL, 60, 1152, 768, 14047, 158, 26, 29, 3, 136, 6, -- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED -+ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1366x768, 60 Hz, 47.403 kHz hsync, WXGA 16:9 aspect ratio */ - NULL, 60, 1366, 768, 13806, 120, 10, 14, 3, 32, 5, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 1280x800, 60 Hz, 47.403 kHz hsync, WXGA 16:10 aspect ratio */ - NULL, 60, 1280, 800, 12048, 200, 64, 24, 1, 136, 3, -- 0, FB_VMODE_NONINTERLACED -+ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 720x576i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ - NULL, 50, 720, 576, 74074, 64, 16, 39, 5, 64, 5, -- 0, FB_VMODE_INTERLACED -+ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN - }, { - /* 800x520i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ - 
NULL, 50, 800, 520, 58823, 144, 64, 72, 28, 80, 5, -- 0, FB_VMODE_INTERLACED -+ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN - }, - }; - -diff -urNp linux-2.6.31.1/drivers/video/omap/dispc.c linux-2.6.31.1/drivers/video/omap/dispc.c ---- linux-2.6.31.1/drivers/video/omap/dispc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/omap/dispc.c 2009-10-01 20:12:43.000000000 -0400 -@@ -1013,7 +1013,7 @@ static void mmap_user_close(struct vm_ar - atomic_dec(&dispc.map_count[plane]); - } - --static struct vm_operations_struct mmap_user_ops = { -+static const struct vm_operations_struct mmap_user_ops = { - .open = mmap_user_open, - .close = mmap_user_close, - }; -diff -urNp linux-2.6.31.1/drivers/video/uvesafb.c linux-2.6.31.1/drivers/video/uvesafb.c ---- linux-2.6.31.1/drivers/video/uvesafb.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/uvesafb.c 2009-10-01 20:12:43.000000000 -0400 -@@ -18,6 +18,7 @@ - #include <linux/fb.h> - #include <linux/io.h> - #include <linux/mutex.h> -+#include <linux/moduleloader.h> - #include <video/edid.h> - #include <video/uvesafb.h> - #ifdef CONFIG_X86 -@@ -118,7 +119,7 @@ static int uvesafb_helper_start(void) - NULL, - }; - -- return call_usermodehelper(v86d_path, argv, envp, 1); -+ return call_usermodehelper(v86d_path, argv, envp, UMH_WAIT_PROC); - } - - /* -@@ -566,10 +567,34 @@ static int __devinit uvesafb_vbe_getpmi( - if ((task->t.regs.eax & 0xffff) != 0x4f || task->t.regs.es < 0xc000) { - par->pmi_setpal = par->ypan = 0; - } else { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+#ifdef CONFIG_MODULES -+ unsigned long cr0; -+ -+ par->pmi_code = module_alloc_exec((u16)task->t.regs.ecx); -+#endif -+ if (!par->pmi_code) { -+ par->pmi_setpal = par->ypan = 0; -+ return 0; -+ } -+#endif -+ - par->pmi_base = (u16 *)phys_to_virt(((u32)task->t.regs.es << 4) - + task->t.regs.edi); -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ pax_open_kernel(cr0); -+ memcpy(par->pmi_code, par->pmi_base, (u16)task->t.regs.ecx); -+ pax_close_kernel(cr0); -+ -+ par->pmi_start = ktva_ktla(par->pmi_code + par->pmi_base[1]); -+ par->pmi_pal = ktva_ktla(par->pmi_code + par->pmi_base[2]); -+#else - par->pmi_start = (u8 *)par->pmi_base + par->pmi_base[1]; - par->pmi_pal = (u8 *)par->pmi_base + par->pmi_base[2]; -+#endif -+ - printk(KERN_INFO "uvesafb: protected mode interface info at " - "%04x:%04x\n", - (u16)task->t.regs.es, (u16)task->t.regs.edi); -@@ -1825,6 +1850,11 @@ out: - if (par->vbe_modes) - kfree(par->vbe_modes); - -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ if (par->pmi_code) -+ module_free_exec(NULL, par->pmi_code); -+#endif -+ - framebuffer_release(info); - return err; - } -@@ -1851,6 +1881,12 @@ static int uvesafb_remove(struct platfor - kfree(par->vbe_state_orig); - if (par->vbe_state_saved) - kfree(par->vbe_state_saved); -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ if (par->pmi_code) -+ module_free_exec(NULL, par->pmi_code); -+#endif -+ - } - - framebuffer_release(info); -diff -urNp linux-2.6.31.1/drivers/video/vesafb.c linux-2.6.31.1/drivers/video/vesafb.c ---- linux-2.6.31.1/drivers/video/vesafb.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/drivers/video/vesafb.c 2009-10-01 20:12:43.000000000 -0400 -@@ -9,6 +9,7 @@ - */ - - #include <linux/module.h> -+#include <linux/moduleloader.h> - #include <linux/kernel.h> - #include <linux/errno.h> - #include <linux/string.h> -@@ -53,8 +54,8 @@ static int vram_remap __initdata; /* - static int vram_total __initdata; /* Set total amount 
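The long modedb[] rewrite above appends FB_MODE_IS_UNKNOWN as an explicit final initializer to every entry. Omitted trailing members are zero-initialized in C, and FB_MODE_IS_UNKNOWN is the zero value of that field, so the generated table should be identical; the change only makes every field explicit for stricter builds. Designated initializers express the same intent while also surviving member reordering, as in this sketch (struct cut down, names hypothetical):

    #include <stdio.h>

    /* Cut-down analogue of struct fb_videomode. */
    struct videomode {
            const char  *name;
            unsigned int refresh, xres, yres;
            unsigned int sync, vmode;
            unsigned int flag;              /* FB_MODE_IS_UNKNOWN == 0 */
    };

    #define MODE_IS_UNKNOWN 0

    static const struct videomode modedb[] = {
            /* Positional form, final field spelled out as in the patch: */
            { NULL, 60, 640, 480, 0, 0, MODE_IS_UNKNOWN },
            /* Designated form: unnamed members are zeroed automatically: */
            { .refresh = 75, .xres = 1024, .yres = 768,
              .flag = MODE_IS_UNKNOWN },
    };

    int main(void)
    {
            for (size_t i = 0; i < sizeof(modedb) / sizeof(modedb[0]); i++)
                    printf("%ux%u@%u flag=%u\n", modedb[i].xres,
                           modedb[i].yres, modedb[i].refresh, modedb[i].flag);
            return 0;
    }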
of memory */ - static int pmi_setpal __read_mostly = 1; /* pmi for palette changes ??? */ - static int ypan __read_mostly; /* 0..nothing, 1..ypan, 2..ywrap */ --static void (*pmi_start)(void) __read_mostly; --static void (*pmi_pal) (void) __read_mostly; -+static void (*pmi_start)(void) __read_only; -+static void (*pmi_pal) (void) __read_only; - static int depth __read_mostly; - static int vga_compat __read_mostly; - /* --------------------------------------------------------------------- */ -@@ -233,6 +234,7 @@ static int __init vesafb_probe(struct pl - unsigned int size_vmode; - unsigned int size_remap; - unsigned int size_total; -+ void *pmi_code = NULL; - - if (screen_info.orig_video_isVGA != VIDEO_TYPE_VLFB) - return -ENODEV; -@@ -275,10 +277,6 @@ static int __init vesafb_probe(struct pl - size_remap = size_total; - vesafb_fix.smem_len = size_remap; - --#ifndef __i386__ -- screen_info.vesapm_seg = 0; --#endif -- - if (!request_mem_region(vesafb_fix.smem_start, size_total, "vesafb")) { - printk(KERN_WARNING - "vesafb: cannot reserve video memory at 0x%lx\n", -@@ -315,9 +313,21 @@ static int __init vesafb_probe(struct pl - printk(KERN_INFO "vesafb: mode is %dx%dx%d, linelength=%d, pages=%d\n", - vesafb_defined.xres, vesafb_defined.yres, vesafb_defined.bits_per_pixel, vesafb_fix.line_length, screen_info.pages); - -+#ifdef __i386__ -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ pmi_code = module_alloc_exec(screen_info.vesapm_size); -+ if (!pmi_code) -+#elif !defined(CONFIG_PAX_KERNEXEC) -+ if (0) -+#endif -+ -+#endif -+ screen_info.vesapm_seg = 0; -+ - if (screen_info.vesapm_seg) { -- printk(KERN_INFO "vesafb: protected mode interface info at %04x:%04x\n", -- screen_info.vesapm_seg,screen_info.vesapm_off); -+ printk(KERN_INFO "vesafb: protected mode interface info at %04x:%04x %04x bytes\n", -+ screen_info.vesapm_seg,screen_info.vesapm_off,screen_info.vesapm_size); - } - - if (screen_info.vesapm_seg < 0xc000) -@@ -325,9 +335,29 @@ static int __init vesafb_probe(struct pl - - if (ypan || pmi_setpal) { - unsigned short *pmi_base; -- pmi_base = (unsigned short*)phys_to_virt(((unsigned long)screen_info.vesapm_seg << 4) + screen_info.vesapm_off); -- pmi_start = (void*)((char*)pmi_base + pmi_base[1]); -- pmi_pal = (void*)((char*)pmi_base + pmi_base[2]); -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ unsigned long cr0; -+#endif -+ -+ pmi_base = (unsigned short*)phys_to_virt(((unsigned long)screen_info.vesapm_seg << 4) + screen_info.vesapm_off); -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ pax_open_kernel(cr0); -+ memcpy(pmi_code, pmi_base, screen_info.vesapm_size); -+#else -+ pmi_code = pmi_base; -+#endif -+ -+ pmi_start = (void*)((char*)pmi_code + pmi_base[1]); -+ pmi_pal = (void*)((char*)pmi_code + pmi_base[2]); -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ pmi_start = ktva_ktla(pmi_start); -+ pmi_pal = ktva_ktla(pmi_pal); -+ pax_close_kernel(cr0); -+#endif -+ - printk(KERN_INFO "vesafb: pmi: set display start = %p, set palette = %p\n",pmi_start,pmi_pal); - if (pmi_base[3]) { - printk(KERN_INFO "vesafb: pmi: ports = "); -@@ -469,6 +499,11 @@ static int __init vesafb_probe(struct pl - info->node, info->fix.id); - return 0; - err: -+ -+#if defined(__i386__) && defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC) -+ module_free_exec(NULL, pmi_code); -+#endif -+ - if (info->screen_base) - iounmap(info->screen_base); - framebuffer_release(info); -diff -urNp linux-2.6.31.1/fs/9p/vfs_inode.c 
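The uvesafb/vesafb hunks above handle the VESA protected-mode interface code under PAX_KERNEXEC: kernel text must not stay writable, so the BIOS-provided routine is copied into memory obtained from module_alloc_exec() while pax_open_kernel()/pax_close_kernel() briefly lift write protection. The userspace analogue of that dance is staging bytes in a writable mapping and then flipping it to read-execute with mprotect(2), sketched below (illustrative only; nothing here executes the staged bytes):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long psz = sysconf(_SC_PAGESIZE);
            unsigned char stub[] = { 0xc3 };  /* placeholder byte, x86 "ret" */

            /* Stage: writable while the code is copied in... */
            void *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            memcpy(p, stub, sizeof(stub));

            /* ...then drop write permission before it becomes executable,
             * mirroring the pax_open_kernel()/pax_close_kernel() window. */
            if (mprotect(p, psz, PROT_READ | PROT_EXEC) != 0)
                    return 1;

            printf("staged %zu byte(s) into an R-X page at %p\n",
                   sizeof(stub), p);
            munmap(p, psz);
            return 0;
    }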
linux-2.6.31.1/fs/9p/vfs_inode.c ---- linux-2.6.31.1/fs/9p/vfs_inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/9p/vfs_inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1025,7 +1025,7 @@ static void *v9fs_vfs_follow_link(struct - static void - v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) - { -- char *s = nd_get_link(nd); -+ const char *s = nd_get_link(nd); - - P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, - IS_ERR(s) ? "<error>" : s); -diff -urNp linux-2.6.31.1/fs/afs/proc.c linux-2.6.31.1/fs/afs/proc.c ---- linux-2.6.31.1/fs/afs/proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/afs/proc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -28,7 +28,7 @@ static int afs_proc_cells_show(struct se - static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, - size_t size, loff_t *_pos); - --static struct seq_operations afs_proc_cells_ops = { -+static const struct seq_operations afs_proc_cells_ops = { - .start = afs_proc_cells_start, - .next = afs_proc_cells_next, - .stop = afs_proc_cells_stop, -@@ -70,7 +70,7 @@ static void *afs_proc_cell_volumes_next( - static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); - static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); - --static struct seq_operations afs_proc_cell_volumes_ops = { -+static const struct seq_operations afs_proc_cell_volumes_ops = { - .start = afs_proc_cell_volumes_start, - .next = afs_proc_cell_volumes_next, - .stop = afs_proc_cell_volumes_stop, -@@ -95,7 +95,7 @@ static void *afs_proc_cell_vlservers_nex - static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); - static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); - --static struct seq_operations afs_proc_cell_vlservers_ops = { -+static const struct seq_operations afs_proc_cell_vlservers_ops = { - .start = afs_proc_cell_vlservers_start, - .next = afs_proc_cell_vlservers_next, - .stop = afs_proc_cell_vlservers_stop, -@@ -119,7 +119,7 @@ static void *afs_proc_cell_servers_next( - static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); - static int afs_proc_cell_servers_show(struct seq_file *m, void *v); - --static struct seq_operations afs_proc_cell_servers_ops = { -+static const struct seq_operations afs_proc_cell_servers_ops = { - .start = afs_proc_cell_servers_start, - .next = afs_proc_cell_servers_next, - .stop = afs_proc_cell_servers_stop, -diff -urNp linux-2.6.31.1/fs/aio.c linux-2.6.31.1/fs/aio.c ---- linux-2.6.31.1/fs/aio.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/aio.c 2009-10-01 20:12:44.000000000 -0400 -@@ -114,7 +114,7 @@ static int aio_setup_ring(struct kioctx - size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; - -- if (nr_pages < 0) -+ if (nr_pages <= 0) - return -EINVAL; - - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); -diff -urNp linux-2.6.31.1/fs/autofs/root.c linux-2.6.31.1/fs/autofs/root.c ---- linux-2.6.31.1/fs/autofs/root.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/autofs/root.c 2009-10-01 20:12:44.000000000 -0400 -@@ -299,7 +299,8 @@ static int autofs_root_symlink(struct in - set_bit(n,sbi->symlink_bitmap); - sl = &sbi->symlink[n]; - sl->len = strlen(symname); -- sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL); -+ slsize = sl->len+1; -+ sl->data = kmalloc(slsize, GFP_KERNEL); - if (!sl->data) { - clear_bit(n,sbi->symlink_bitmap); - unlock_kernel(); -diff -urNp 
linux-2.6.31.1/fs/autofs4/symlink.c linux-2.6.31.1/fs/autofs4/symlink.c ---- linux-2.6.31.1/fs/autofs4/symlink.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/autofs4/symlink.c 2009-10-01 20:12:44.000000000 -0400 -@@ -15,7 +15,7 @@ - static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) - { - struct autofs_info *ino = autofs4_dentry_ino(dentry); -- nd_set_link(nd, (char *)ino->u.symlink); -+ nd_set_link(nd, ino->u.symlink); - return NULL; - } - -diff -urNp linux-2.6.31.1/fs/befs/linuxvfs.c linux-2.6.31.1/fs/befs/linuxvfs.c ---- linux-2.6.31.1/fs/befs/linuxvfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/befs/linuxvfs.c 2009-10-01 20:12:44.000000000 -0400 -@@ -493,7 +493,7 @@ static void befs_put_link(struct dentry - { - befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); - if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { -- char *link = nd_get_link(nd); -+ const char *link = nd_get_link(nd); - if (!IS_ERR(link)) - kfree(link); - } -diff -urNp linux-2.6.31.1/fs/binfmt_aout.c linux-2.6.31.1/fs/binfmt_aout.c ---- linux-2.6.31.1/fs/binfmt_aout.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/binfmt_aout.c 2009-10-01 20:12:44.000000000 -0400 -@@ -16,6 +16,7 @@ - #include <linux/string.h> - #include <linux/fs.h> - #include <linux/file.h> -+#include <linux/security.h> - #include <linux/stat.h> - #include <linux/fcntl.h> - #include <linux/ptrace.h> -@@ -113,10 +114,12 @@ static int aout_core_dump(long signr, st - - /* If the size of the dump file exceeds the rlimit, then see what would happen - if we wrote the stack, but not the data area. */ -+ gr_learn_resource(current, RLIMIT_CORE, (dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE, 1); - if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) - dump.u_dsize = 0; - - /* Make sure we have enough room to write the stack and data areas. 
*/ -+ gr_learn_resource(current, RLIMIT_CORE, (dump.u_ssize + 1) * PAGE_SIZE, 1); - if ((dump.u_ssize + 1) * PAGE_SIZE > limit) - dump.u_ssize = 0; - -@@ -249,6 +252,8 @@ static int load_aout_binary(struct linux - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim >= RLIM_INFINITY) - rlim = ~0; -+ -+ gr_learn_resource(current, RLIMIT_DATA, ex.a_data + ex.a_bss, 1); - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - -@@ -276,6 +281,27 @@ static int load_aout_binary(struct linux - install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; - -+#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) -+ current->mm->pax_flags = 0UL; -+#endif -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!(N_FLAGS(ex) & F_PAX_PAGEEXEC)) { -+ current->mm->pax_flags |= MF_PAX_PAGEEXEC; -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ if (N_FLAGS(ex) & F_PAX_EMUTRAMP) -+ current->mm->pax_flags |= MF_PAX_EMUTRAMP; -+#endif -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (!(N_FLAGS(ex) & F_PAX_MPROTECT)) -+ current->mm->pax_flags |= MF_PAX_MPROTECT; -+#endif -+ -+ } -+#endif -+ - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - loff_t pos; -@@ -348,7 +374,7 @@ static int load_aout_binary(struct linux - - down_write(&current->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, -- PROT_READ | PROT_WRITE | PROT_EXEC, -+ PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, - fd_offset + ex.a_text); - up_write(&current->mm->mmap_sem); -diff -urNp linux-2.6.31.1/fs/binfmt_elf.c linux-2.6.31.1/fs/binfmt_elf.c ---- linux-2.6.31.1/fs/binfmt_elf.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/binfmt_elf.c 2009-10-01 20:12:44.000000000 -0400 -@@ -35,6 +35,10 @@ - #include <asm/param.h> - #include <asm/page.h> - -+#ifdef CONFIG_PAX_SEGMEXEC -+#include <asm/desc.h> -+#endif -+ - static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); - static int load_elf_library(struct file *); - static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, -@@ -50,6 +54,10 @@ static int elf_core_dump(long signr, str - #define elf_core_dump NULL - #endif - -+#ifdef CONFIG_PAX_MPROTECT -+static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags); -+#endif -+ - #if ELF_EXEC_PAGESIZE > PAGE_SIZE - #define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE - #else -@@ -69,6 +77,11 @@ static struct linux_binfmt elf_format = - .load_binary = load_elf_binary, - .load_shlib = load_elf_library, - .core_dump = elf_core_dump, -+ -+#ifdef CONFIG_PAX_MPROTECT -+ .handle_mprotect= elf_handle_mprotect, -+#endif -+ - .min_coredump = ELF_EXEC_PAGESIZE, - .hasvdso = 1 - }; -@@ -77,6 +90,8 @@ static struct linux_binfmt elf_format = - - static int set_brk(unsigned long start, unsigned long end) - { -+ unsigned long e = end; -+ - start = ELF_PAGEALIGN(start); - end = ELF_PAGEALIGN(end); - if (end > start) { -@@ -87,7 +102,7 @@ static int set_brk(unsigned long start, - if (BAD_ADDR(addr)) - return addr; - } -- current->mm->start_brk = current->mm->brk = end; -+ current->mm->start_brk = current->mm->brk = e; - return 0; - } - -@@ -148,7 +163,7 @@ create_elf_tables(struct linux_binprm *b - elf_addr_t __user *u_rand_bytes; - const char *k_platform = ELF_PLATFORM; - const char *k_base_platform = ELF_BASE_PLATFORM; -- unsigned char k_rand_bytes[16]; -+ u32 k_rand_bytes[4]; - int items; - elf_addr_t *elf_info; - int ei_index = 0; -@@ -195,6 +210,10 @@ create_elf_tables(struct linux_binprm *b - * Generate 16 random bytes for userspace PRNG seeding.
- */ - get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); -+ srandom32(k_rand_bytes[0] ^ random32()); -+ srandom32(k_rand_bytes[1] ^ random32()); -+ srandom32(k_rand_bytes[2] ^ random32()); -+ srandom32(k_rand_bytes[3] ^ random32()); - u_rand_bytes = (elf_addr_t __user *) - STACK_ALLOC(p, sizeof(k_rand_bytes)); - if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) -@@ -385,10 +404,10 @@ static unsigned long load_elf_interp(str - { - struct elf_phdr *elf_phdata; - struct elf_phdr *eppnt; -- unsigned long load_addr = 0; -+ unsigned long load_addr = 0, pax_task_size = TASK_SIZE; - int load_addr_set = 0; - unsigned long last_bss = 0, elf_bss = 0; -- unsigned long error = ~0UL; -+ unsigned long error = -EINVAL; - unsigned long total_size; - int retval, i, size; - -@@ -434,6 +453,11 @@ static unsigned long load_elf_interp(str - goto out_close; - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif -+ - eppnt = elf_phdata; - for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { - if (eppnt->p_type == PT_LOAD) { -@@ -477,8 +501,8 @@ static unsigned long load_elf_interp(str - k = load_addr + eppnt->p_vaddr; - if (BAD_ADDR(k) || - eppnt->p_filesz > eppnt->p_memsz || -- eppnt->p_memsz > TASK_SIZE || -- TASK_SIZE - eppnt->p_memsz < k) { -+ eppnt->p_memsz > pax_task_size || -+ pax_task_size - eppnt->p_memsz < k) { - error = -ENOMEM; - goto out_close; - } -@@ -532,6 +556,177 @@ out: - return error; - } - -+#if (defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS)) && defined(CONFIG_PAX_SOFTMODE) -+static unsigned long pax_parse_softmode(const struct elf_phdr * const elf_phdata) -+{ -+ unsigned long pax_flags = 0UL; -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (elf_phdata->p_flags & PF_PAGEEXEC) -+ pax_flags |= MF_PAX_PAGEEXEC; -+#endif -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (elf_phdata->p_flags & PF_SEGMEXEC) -+ pax_flags |= MF_PAX_SEGMEXEC; -+#endif -+ -+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_PAX_SEGMEXEC) -+ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ if (nx_enabled) -+ pax_flags &= ~MF_PAX_SEGMEXEC; -+ else -+ pax_flags &= ~MF_PAX_PAGEEXEC; -+ } -+#endif -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ if (elf_phdata->p_flags & PF_EMUTRAMP) -+ pax_flags |= MF_PAX_EMUTRAMP; -+#endif -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (elf_phdata->p_flags & PF_MPROTECT) -+ pax_flags |= MF_PAX_MPROTECT; -+#endif -+ -+#if defined(CONFIG_PAX_RANDMMAP) || defined(CONFIG_PAX_RANDUSTACK) -+ if (randomize_va_space && (elf_phdata->p_flags & PF_RANDMMAP)) -+ pax_flags |= MF_PAX_RANDMMAP; -+#endif -+ -+ return pax_flags; -+} -+#endif -+ -+#ifdef CONFIG_PAX_PT_PAX_FLAGS -+static unsigned long pax_parse_hardmode(const struct elf_phdr * const elf_phdata) -+{ -+ unsigned long pax_flags = 0UL; -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!(elf_phdata->p_flags & PF_NOPAGEEXEC)) -+ pax_flags |= MF_PAX_PAGEEXEC; -+#endif -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (!(elf_phdata->p_flags & PF_NOSEGMEXEC)) -+ pax_flags |= MF_PAX_SEGMEXEC; -+#endif -+ -+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_PAX_SEGMEXEC) -+ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ if (nx_enabled) -+ pax_flags &= ~MF_PAX_SEGMEXEC; -+ else -+ pax_flags &= ~MF_PAX_PAGEEXEC; -+ } -+#endif -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ if (!(elf_phdata->p_flags & PF_NOEMUTRAMP)) -+ pax_flags |= MF_PAX_EMUTRAMP; -+#endif -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if 
(!(elf_phdata->p_flags & PF_NOMPROTECT)) -+ pax_flags |= MF_PAX_MPROTECT; -+#endif -+ -+#if defined(CONFIG_PAX_RANDMMAP) || defined(CONFIG_PAX_RANDUSTACK) -+ if (randomize_va_space && !(elf_phdata->p_flags & PF_NORANDMMAP)) -+ pax_flags |= MF_PAX_RANDMMAP; -+#endif -+ -+ return pax_flags; -+} -+#endif -+ -+#ifdef CONFIG_PAX_EI_PAX -+static unsigned long pax_parse_ei_pax(const struct elfhdr * const elf_ex) -+{ -+ unsigned long pax_flags = 0UL; -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ if (!(elf_ex->e_ident[EI_PAX] & EF_PAX_PAGEEXEC)) -+ pax_flags |= MF_PAX_PAGEEXEC; -+#endif -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (!(elf_ex->e_ident[EI_PAX] & EF_PAX_SEGMEXEC)) -+ pax_flags |= MF_PAX_SEGMEXEC; -+#endif -+ -+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_PAX_SEGMEXEC) -+ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ if (nx_enabled) -+ pax_flags &= ~MF_PAX_SEGMEXEC; -+ else -+ pax_flags &= ~MF_PAX_PAGEEXEC; -+ } -+#endif -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && (elf_ex->e_ident[EI_PAX] & EF_PAX_EMUTRAMP)) -+ pax_flags |= MF_PAX_EMUTRAMP; -+#endif -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && !(elf_ex->e_ident[EI_PAX] & EF_PAX_MPROTECT)) -+ pax_flags |= MF_PAX_MPROTECT; -+#endif -+ -+#ifdef CONFIG_PAX_ASLR -+ if (randomize_va_space && !(elf_ex->e_ident[EI_PAX] & EF_PAX_RANDMMAP)) -+ pax_flags |= MF_PAX_RANDMMAP; -+#endif -+ -+ return pax_flags; -+} -+#endif -+ -+#if defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS) -+static long pax_parse_elf_flags(const struct elfhdr * const elf_ex, const struct elf_phdr * const elf_phdata) -+{ -+ unsigned long pax_flags = 0UL; -+ -+#ifdef CONFIG_PAX_PT_PAX_FLAGS -+ unsigned long i; -+#endif -+ -+#ifdef CONFIG_PAX_EI_PAX -+ pax_flags = pax_parse_ei_pax(elf_ex); -+#endif -+ -+#ifdef CONFIG_PAX_PT_PAX_FLAGS -+ for (i = 0UL; i < elf_ex->e_phnum; i++) -+ if (elf_phdata[i].p_type == PT_PAX_FLAGS) { -+ if (((elf_phdata[i].p_flags & PF_PAGEEXEC) && (elf_phdata[i].p_flags & PF_NOPAGEEXEC)) || -+ ((elf_phdata[i].p_flags & PF_SEGMEXEC) && (elf_phdata[i].p_flags & PF_NOSEGMEXEC)) || -+ ((elf_phdata[i].p_flags & PF_EMUTRAMP) && (elf_phdata[i].p_flags & PF_NOEMUTRAMP)) || -+ ((elf_phdata[i].p_flags & PF_MPROTECT) && (elf_phdata[i].p_flags & PF_NOMPROTECT)) || -+ ((elf_phdata[i].p_flags & PF_RANDMMAP) && (elf_phdata[i].p_flags & PF_NORANDMMAP))) -+ return -EINVAL; -+ -+#ifdef CONFIG_PAX_SOFTMODE -+ if (pax_softmode) -+ pax_flags = pax_parse_softmode(&elf_phdata[i]); -+ else -+#endif -+ -+ pax_flags = pax_parse_hardmode(&elf_phdata[i]); -+ break; -+ } -+#endif -+ -+ if (0 > pax_check_flags(&pax_flags)) -+ return -EINVAL; -+ -+ current->mm->pax_flags = pax_flags; -+ return 0; -+} -+#endif -+ - /* - * These are the functions used to load ELF style executables and shared - * libraries. There is no binary dependent code anywhere else. 
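The pax_parse_softmode/pax_parse_hardmode/pax_parse_ei_pax helpers introduced above derive a process's PaX flags from ELF markings, and pax_parse_elf_flags rejects program headers that set both the on and the off bit for the same feature. A compact model of that consistency check and of the soft/hard default split (flag names shortened, logic illustrative rather than the patch's exact code):

    #include <stdio.h>

    /* Paired on/off markings, as in PF_PAGEEXEC / PF_NOPAGEEXEC etc. */
    #define F_PAGEEXEC    (1u << 0)
    #define F_NOPAGEEXEC  (1u << 1)
    #define F_MPROTECT    (1u << 2)
    #define F_NOMPROTECT  (1u << 3)

    /* Returns -1 for contradictory markings, else the enabled-feature
     * mask.  Hard mode (the default) treats a feature as on unless its
     * NO bit is set; soft mode only enables what is explicitly asked. */
    static int parse_flags(unsigned int p_flags, int softmode)
    {
            if ((p_flags & F_PAGEEXEC) && (p_flags & F_NOPAGEEXEC))
                    return -1;
            if ((p_flags & F_MPROTECT) && (p_flags & F_NOMPROTECT))
                    return -1;

            unsigned int on = 0;
            if (softmode) {
                    if (p_flags & F_PAGEEXEC)      on |= F_PAGEEXEC;
                    if (p_flags & F_MPROTECT)      on |= F_MPROTECT;
            } else {
                    if (!(p_flags & F_NOPAGEEXEC)) on |= F_PAGEEXEC;
                    if (!(p_flags & F_NOMPROTECT)) on |= F_MPROTECT;
            }
            return (int)on;
    }

    int main(void)
    {
            printf("hard, no marks -> %d\n", parse_flags(0, 0));
            printf("contradiction  -> %d\n",
                   parse_flags(F_PAGEEXEC | F_NOPAGEEXEC, 0));
            return 0;
    }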
-@@ -548,6 +743,11 @@ static unsigned long randomize_stack_top - { - unsigned int random_variable = 0; - -+#ifdef CONFIG_PAX_RANDUSTACK -+ if (randomize_va_space) -+ return stack_top - current->mm->delta_stack; -+#endif -+ - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = get_random_int() & STACK_RND_MASK; -@@ -566,7 +766,7 @@ static int load_elf_binary(struct linux_ - unsigned long load_addr = 0, load_bias = 0; - int load_addr_set = 0; - char * elf_interpreter = NULL; -- unsigned long error; -+ unsigned long error = 0; - struct elf_phdr *elf_ppnt, *elf_phdata; - unsigned long elf_bss, elf_brk; - int retval, i; -@@ -576,11 +776,11 @@ static int load_elf_binary(struct linux_ - unsigned long start_code, end_code, start_data, end_data; - unsigned long reloc_func_desc = 0; - int executable_stack = EXSTACK_DEFAULT; -- unsigned long def_flags = 0; - struct { - struct elfhdr elf_ex; - struct elfhdr interp_elf_ex; - } *loc; -+ unsigned long pax_task_size = TASK_SIZE; - - loc = kmalloc(sizeof(*loc), GFP_KERNEL); - if (!loc) { -@@ -742,11 +942,80 @@ static int load_elf_binary(struct linux_ - - /* OK, This is the point of no return */ - current->flags &= ~PF_FORKNOEXEC; -- current->mm->def_flags = def_flags; -+ -+#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) -+ current->mm->pax_flags = 0UL; -+#endif -+ -+#ifdef CONFIG_PAX_DLRESOLVE -+ current->mm->call_dl_resolve = 0UL; -+#endif -+ -+#if defined(CONFIG_PPC32) && defined(CONFIG_PAX_EMUSIGRT) -+ current->mm->call_syscall = 0UL; -+#endif -+ -+#ifdef CONFIG_PAX_ASLR -+ current->mm->delta_mmap = 0UL; -+ current->mm->delta_stack = 0UL; -+#endif -+ -+ current->mm->def_flags = 0; -+ -+#if defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS) -+ if (0 > pax_parse_elf_flags(&loc->elf_ex, elf_phdata)) { -+ send_sig(SIGKILL, current, 0); -+ goto out_free_dentry; -+ } -+#endif -+ -+#ifdef CONFIG_PAX_HAVE_ACL_FLAGS -+ pax_set_initial_flags(bprm); -+#elif defined(CONFIG_PAX_HOOK_ACL_FLAGS) -+ if (pax_set_initial_flags_func) -+ (pax_set_initial_flags_func)(bprm); -+#endif -+ -+#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT -+ if ((current->mm->pax_flags & MF_PAX_PAGEEXEC) && !nx_enabled) { -+ current->mm->context.user_cs_limit = PAGE_SIZE; -+ current->mm->def_flags |= VM_PAGEEXEC; -+ } -+#endif -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { -+ current->mm->context.user_cs_base = SEGMEXEC_TASK_SIZE; -+ current->mm->context.user_cs_limit = TASK_SIZE-SEGMEXEC_TASK_SIZE; -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+ } -+#endif -+ -+#if defined(CONFIG_ARCH_TRACK_EXEC_LIMIT) || defined(CONFIG_PAX_SEGMEXEC) -+ if (current->mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ set_user_cs(current->mm->context.user_cs_base, current->mm->context.user_cs_limit, get_cpu()); -+ put_cpu(); -+ } -+#endif -+ -+#ifdef CONFIG_PAX_ASLR -+ if (current->mm->pax_flags & MF_PAX_RANDMMAP) { -+ current->mm->delta_mmap = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN)-1)) << PAGE_SHIFT; -+ current->mm->delta_stack = (pax_get_random_long() & ((1UL << PAX_DELTA_STACK_LEN)-1)) << PAGE_SHIFT; -+ } -+#endif - - /* Do this immediately, since STACK_TOP as used in setup_arg_pages - may depend on the personality. 
*/ - SET_PERSONALITY(loc->elf_ex); -+ -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+ if (current->mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ executable_stack = EXSTACK_DISABLE_X; -+ current->personality &= ~READ_IMPLIES_EXEC; -+ } else -+#endif -+ - if (elf_read_implies_exec(loc->elf_ex, executable_stack)) - current->personality |= READ_IMPLIES_EXEC; - -@@ -827,6 +1096,20 @@ static int load_elf_binary(struct linux_ - #else - load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); - #endif -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ /* PaX: randomize base address at the default exe base if requested */ -+ if ((current->mm->pax_flags & MF_PAX_RANDMMAP) && elf_interpreter) { -+#ifdef CONFIG_SPARC64 -+ load_bias = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN) - 1)) << (PAGE_SHIFT+1); -+#else -+ load_bias = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN) - 1)) << PAGE_SHIFT; -+#endif -+ load_bias = ELF_PAGESTART(PAX_ELF_ET_DYN_BASE - vaddr + load_bias); -+ elf_flags |= MAP_FIXED; -+ } -+#endif -+ - } - - error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, -@@ -859,9 +1142,9 @@ static int load_elf_binary(struct linux_ - * allowed task size. Note that p_filesz must always be - * <= p_memsz so it is only necessary to check p_memsz. - */ -- if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || -- elf_ppnt->p_memsz > TASK_SIZE || -- TASK_SIZE - elf_ppnt->p_memsz < k) { -+ if (k >= pax_task_size || elf_ppnt->p_filesz > elf_ppnt->p_memsz || -+ elf_ppnt->p_memsz > pax_task_size || -+ pax_task_size - elf_ppnt->p_memsz < k) { - /* set_brk can never work. Avoid overflows. */ - send_sig(SIGKILL, current, 0); - retval = -EINVAL; -@@ -889,6 +1172,11 @@ static int load_elf_binary(struct linux_ - start_data += load_bias; - end_data += load_bias; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (current->mm->pax_flags & MF_PAX_RANDMMAP) -+ elf_brk += PAGE_SIZE + ((pax_get_random_long() & ~PAGE_MASK) << 4); -+#endif -+ - /* Calling set_brk effectively mmaps the pages that we need - * for the bss and break sections. We must do this before - * mapping in the interpreter, to make sure it doesn't wind -@@ -900,9 +1188,11 @@ static int load_elf_binary(struct linux_ - goto out_free_dentry; - } - if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { -- send_sig(SIGSEGV, current, 0); -- retval = -EFAULT; /* Nobody gets to see this, but.. */ -- goto out_free_dentry; -+ /* -+ * This bss-zeroing can fail if the ELF -+ * file specifies odd protections. So -+ * we don't check the return value -+ */ - } - - if (elf_interpreter) { -@@ -1135,8 +1425,10 @@ static int dump_seek(struct file *file, - unsigned long n = off; - if (n > PAGE_SIZE) - n = PAGE_SIZE; -- if (!dump_write(file, buf, n)) -+ if (!dump_write(file, buf, n)) { -+ free_page((unsigned long)buf); - return 0; -+ } - off -= n; - } - free_page((unsigned long)buf); -@@ -1148,7 +1440,7 @@ static int dump_seek(struct file *file, - * Decide what to dump of a segment, part, all or none. 
- */ - static unsigned long vma_dump_size(struct vm_area_struct *vma, -- unsigned long mm_flags) -+ unsigned long mm_flags, long signr) - { - #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - -@@ -1182,7 +1474,7 @@ static unsigned long vma_dump_size(struc - if (vma->vm_file == NULL) - return 0; - -- if (FILTER(MAPPED_PRIVATE)) -+ if (signr == SIGKILL || FILTER(MAPPED_PRIVATE)) - goto whole; - - /* -@@ -1278,8 +1570,11 @@ static int writenote(struct memelfnote * - #undef DUMP_WRITE - - #define DUMP_WRITE(addr, nr) \ -+ do { \ -+ gr_learn_resource(current, RLIMIT_CORE, size + (nr), 1); \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ -- goto end_coredump; -+ goto end_coredump; \ -+ } while (0); - #define DUMP_SEEK(off) \ - if (!dump_seek(file, (off))) \ - goto end_coredump; -@@ -1991,7 +2286,7 @@ static int elf_core_dump(long signr, str - phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; - phdr.p_paddr = 0; -- phdr.p_filesz = vma_dump_size(vma, mm_flags); -+ phdr.p_filesz = vma_dump_size(vma, mm_flags, signr); - phdr.p_memsz = vma->vm_end - vma->vm_start; - offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; -@@ -2023,7 +2318,7 @@ static int elf_core_dump(long signr, str - unsigned long addr; - unsigned long end; - -- end = vma->vm_start + vma_dump_size(vma, mm_flags); -+ end = vma->vm_start + vma_dump_size(vma, mm_flags, signr); - - for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { - struct page *page; -@@ -2043,6 +2338,7 @@ static int elf_core_dump(long signr, str - flush_cache_page(tmp_vma, addr, - page_to_pfn(page)); - kaddr = kmap(page); -+ gr_learn_resource(current, RLIMIT_CORE, size + PAGE_SIZE, 1); - if ((size += PAGE_SIZE) > limit || - !dump_write(file, kaddr, - PAGE_SIZE)) { -@@ -2073,6 +2369,97 @@ out: - - #endif /* USE_ELF_CORE_DUMP */ - -+#ifdef CONFIG_PAX_MPROTECT -+/* PaX: non-PIC ELF libraries need relocations on their executable segments -+ * therefore we'll grant them VM_MAYWRITE once during their life. Similarly -+ * we'll remove VM_MAYWRITE for good on RELRO segments. -+ * -+ * The checks favour ld-linux.so behaviour which operates on a per ELF segment -+ * basis because we want to allow the common case and not the special ones. 
-+ */ -+static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags) -+{ -+ struct elfhdr elf_h; -+ struct elf_phdr elf_p; -+ unsigned long i; -+ unsigned long oldflags; -+ bool is_textrel_rw, is_textrel_rx, is_relro; -+ -+ if (!(vma->vm_mm->pax_flags & MF_PAX_MPROTECT)) -+ return; -+ -+ oldflags = vma->vm_flags & (VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ); -+ newflags &= VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ; -+ -+#ifdef CONFIG_PAX_NOELFRELOCS -+ is_textrel_rw = false; -+ is_textrel_rx = false; -+#else -+ /* possible TEXTREL */ -+ is_textrel_rw = vma->vm_file && !vma->anon_vma && oldflags == (VM_MAYEXEC | VM_MAYREAD | VM_EXEC | VM_READ) && newflags == (VM_WRITE | VM_READ); -+ is_textrel_rx = vma->vm_file && vma->anon_vma && oldflags == (VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_WRITE | VM_READ) && newflags == (VM_EXEC | VM_READ); -+#endif -+ -+ /* possible RELRO */ -+ is_relro = vma->vm_file && vma->anon_vma && oldflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ) && newflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ); -+ -+ if (!is_textrel_rw && !is_textrel_rx && !is_relro) -+ return; -+ -+ if (sizeof(elf_h) != kernel_read(vma->vm_file, 0UL, (char *)&elf_h, sizeof(elf_h)) || -+ memcmp(elf_h.e_ident, ELFMAG, SELFMAG) || -+ -+#ifdef CONFIG_PAX_ETEXECRELOCS -+ ((is_textrel_rw || is_textrel_rx) && (elf_h.e_type != ET_DYN && elf_h.e_type != ET_EXEC)) || -+#else -+ ((is_textrel_rw || is_textrel_rx) && elf_h.e_type != ET_DYN) || -+#endif -+ -+ (is_relro && (elf_h.e_type != ET_DYN && elf_h.e_type != ET_EXEC)) || -+ !elf_check_arch(&elf_h) || -+ elf_h.e_phentsize != sizeof(struct elf_phdr) || -+ elf_h.e_phnum > 65536UL / sizeof(struct elf_phdr)) -+ return; -+ -+ for (i = 0UL; i < elf_h.e_phnum; i++) { -+ if (sizeof(elf_p) != kernel_read(vma->vm_file, elf_h.e_phoff + i*sizeof(elf_p), (char *)&elf_p, sizeof(elf_p))) -+ return; -+ switch (elf_p.p_type) { -+ case PT_DYNAMIC: -+ if (!is_textrel_rw && !is_textrel_rx) -+ continue; -+ i = 0UL; -+ while ((i+1) * sizeof(elf_dyn) <= elf_p.p_filesz) { -+ elf_dyn dyn; -+ -+ if (sizeof(dyn) != kernel_read(vma->vm_file, elf_p.p_offset + i*sizeof(dyn), (char *)&dyn, sizeof(dyn))) -+ return; -+ if (dyn.d_tag == DT_NULL) -+ return; -+ if (dyn.d_tag == DT_TEXTREL || (dyn.d_tag == DT_FLAGS && (dyn.d_un.d_val & DF_TEXTREL))) { -+ gr_log_textrel(vma); -+ if (is_textrel_rw) -+ vma->vm_flags |= VM_MAYWRITE; -+ else -+ /* PaX: disallow write access after relocs are done, hopefully noone else needs it... 
*/ -+ vma->vm_flags &= ~VM_MAYWRITE; -+ return; -+ } -+ i++; -+ } -+ return; -+ -+ case PT_GNU_RELRO: -+ if (!is_relro) -+ continue; -+ if ((elf_p.p_offset >> PAGE_SHIFT) == vma->vm_pgoff && ELF_PAGEALIGN(elf_p.p_memsz) == vma->vm_end - vma->vm_start) -+ vma->vm_flags &= ~VM_MAYWRITE; -+ return; -+ } -+ } -+} -+#endif -+ - static int __init init_elf_binfmt(void) - { - return register_binfmt(&elf_format); -diff -urNp linux-2.6.31.1/fs/binfmt_flat.c linux-2.6.31.1/fs/binfmt_flat.c ---- linux-2.6.31.1/fs/binfmt_flat.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/binfmt_flat.c 2009-10-01 20:12:44.000000000 -0400 -@@ -565,7 +565,9 @@ static int load_flat_file(struct linux_b - realdatastart = (unsigned long) -ENOMEM; - printk("Unable to allocate RAM for process data, errno %d\n", - (int)-realdatastart); -+ down_write(&current->mm->mmap_sem); - do_munmap(current->mm, textpos, text_len); -+ up_write(&current->mm->mmap_sem); - ret = realdatastart; - goto err; - } -@@ -589,8 +591,10 @@ static int load_flat_file(struct linux_b - } - if (result >= (unsigned long)-4096) { - printk("Unable to read data+bss, errno %d\n", (int)-result); -+ down_write(&current->mm->mmap_sem); - do_munmap(current->mm, textpos, text_len); - do_munmap(current->mm, realdatastart, data_len + extra); -+ up_write(&current->mm->mmap_sem); - ret = result; - goto err; - } -@@ -659,8 +663,10 @@ static int load_flat_file(struct linux_b - } - if (result >= (unsigned long)-4096) { - printk("Unable to read code+data+bss, errno %d\n",(int)-result); -+ down_write(&current->mm->mmap_sem); - do_munmap(current->mm, textpos, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(unsigned long)); -+ up_write(&current->mm->mmap_sem); - ret = result; - goto err; - } -diff -urNp linux-2.6.31.1/fs/binfmt_misc.c linux-2.6.31.1/fs/binfmt_misc.c ---- linux-2.6.31.1/fs/binfmt_misc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/binfmt_misc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -693,7 +693,7 @@ static int bm_fill_super(struct super_bl - static struct tree_descr bm_files[] = { - [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, - [3] = {"register", &bm_register_operations, S_IWUSR}, -- /* last one */ {""} -+ /* last one */ {"", NULL, 0} - }; - int err = simple_fill_super(sb, 0x42494e4d, bm_files); - if (!err) -diff -urNp linux-2.6.31.1/fs/btrfs/ctree.h linux-2.6.31.1/fs/btrfs/ctree.h ---- linux-2.6.31.1/fs/btrfs/ctree.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/btrfs/ctree.h 2009-10-01 20:12:44.000000000 -0400 -@@ -2286,7 +2286,7 @@ int btrfs_sync_file(struct file *file, s - int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned); - int btrfs_check_file(struct btrfs_root *root, struct inode *inode); --extern struct file_operations btrfs_file_operations; -+extern const struct file_operations btrfs_file_operations; - int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 locked_end, -diff -urNp linux-2.6.31.1/fs/btrfs/disk-io.c linux-2.6.31.1/fs/btrfs/disk-io.c ---- linux-2.6.31.1/fs/btrfs/disk-io.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/btrfs/disk-io.c 2009-10-01 20:12:44.000000000 -0400 -@@ -772,7 +772,7 @@ static void btree_invalidatepage(struct - } - } - --static struct address_space_operations btree_aops = { -+static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, - .writepage = btree_writepage, - .writepages = btree_writepages, -diff -urNp
linux-2.6.31.1/fs/btrfs/file.c linux-2.6.31.1/fs/btrfs/file.c ---- linux-2.6.31.1/fs/btrfs/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/btrfs/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1203,7 +1203,7 @@ out: - return ret > 0 ? EIO : ret; - } - --static struct vm_operations_struct btrfs_file_vm_ops = { -+static const struct vm_operations_struct btrfs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = btrfs_page_mkwrite, - }; -@@ -1215,7 +1215,7 @@ static int btrfs_file_mmap(struct file * - return 0; - } - --struct file_operations btrfs_file_operations = { -+const struct file_operations btrfs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, -diff -urNp linux-2.6.31.1/fs/btrfs/inode.c linux-2.6.31.1/fs/btrfs/inode.c ---- linux-2.6.31.1/fs/btrfs/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/btrfs/inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -55,14 +55,14 @@ struct btrfs_iget_args { - struct btrfs_root *root; - }; - --static struct inode_operations btrfs_dir_inode_operations; --static struct inode_operations btrfs_symlink_inode_operations; --static struct inode_operations btrfs_dir_ro_inode_operations; --static struct inode_operations btrfs_special_inode_operations; --static struct inode_operations btrfs_file_inode_operations; --static struct address_space_operations btrfs_aops; --static struct address_space_operations btrfs_symlink_aops; --static struct file_operations btrfs_dir_file_operations; -+static const struct inode_operations btrfs_dir_inode_operations; -+static const struct inode_operations btrfs_symlink_inode_operations; -+static const struct inode_operations btrfs_dir_ro_inode_operations; -+static const struct inode_operations btrfs_special_inode_operations; -+static const struct inode_operations btrfs_file_inode_operations; -+static const struct address_space_operations btrfs_aops; -+static const struct address_space_operations btrfs_symlink_aops; -+static const struct file_operations btrfs_dir_file_operations; - static struct extent_io_ops btrfs_extent_io_ops; - - static struct kmem_cache *btrfs_inode_cachep; -@@ -5201,7 +5201,7 @@ static int btrfs_permission(struct inode - return generic_permission(inode, mask, btrfs_check_acl); - } - --static struct inode_operations btrfs_dir_inode_operations = { -+static const struct inode_operations btrfs_dir_inode_operations = { - .getattr = btrfs_getattr, - .lookup = btrfs_lookup, - .create = btrfs_create, -@@ -5219,11 +5219,11 @@ static struct inode_operations btrfs_dir - .removexattr = btrfs_removexattr, - .permission = btrfs_permission, - }; --static struct inode_operations btrfs_dir_ro_inode_operations = { -+static const struct inode_operations btrfs_dir_ro_inode_operations = { - .lookup = btrfs_lookup, - .permission = btrfs_permission, - }; --static struct file_operations btrfs_dir_file_operations = { -+static const struct file_operations btrfs_dir_file_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = btrfs_real_readdir, -@@ -5259,7 +5259,7 @@ static struct extent_io_ops btrfs_extent - * - * For now we're avoiding this by dropping bmap. 
- */ --static struct address_space_operations btrfs_aops = { -+static const struct address_space_operations btrfs_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .writepages = btrfs_writepages, -@@ -5271,14 +5271,14 @@ static struct address_space_operations b - .set_page_dirty = btrfs_set_page_dirty, - }; - --static struct address_space_operations btrfs_symlink_aops = { -+static const struct address_space_operations btrfs_symlink_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, - }; - --static struct inode_operations btrfs_file_inode_operations = { -+static const struct inode_operations btrfs_file_inode_operations = { - .truncate = btrfs_truncate, - .getattr = btrfs_getattr, - .setattr = btrfs_setattr, -@@ -5290,7 +5290,7 @@ static struct inode_operations btrfs_fil - .fallocate = btrfs_fallocate, - .fiemap = btrfs_fiemap, - }; --static struct inode_operations btrfs_special_inode_operations = { -+static const struct inode_operations btrfs_special_inode_operations = { - .getattr = btrfs_getattr, - .setattr = btrfs_setattr, - .permission = btrfs_permission, -@@ -5299,7 +5299,7 @@ static struct inode_operations btrfs_spe - .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, - }; --static struct inode_operations btrfs_symlink_inode_operations = { -+static const struct inode_operations btrfs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -diff -urNp linux-2.6.31.1/fs/btrfs/super.c linux-2.6.31.1/fs/btrfs/super.c ---- linux-2.6.31.1/fs/btrfs/super.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/btrfs/super.c 2009-10-01 20:12:44.000000000 -0400 -@@ -51,7 +51,7 @@ - #include "export.h" - #include "compression.h" - --static struct super_operations btrfs_super_ops; -+static const struct super_operations btrfs_super_ops; - - static void btrfs_put_super(struct super_block *sb) - { -@@ -675,7 +675,7 @@ static int btrfs_unfreeze(struct super_b - return 0; - } - --static struct super_operations btrfs_super_ops = { -+static const struct super_operations btrfs_super_ops = { - .delete_inode = btrfs_delete_inode, - .put_super = btrfs_put_super, - .sync_fs = btrfs_sync_fs, -diff -urNp linux-2.6.31.1/fs/buffer.c linux-2.6.31.1/fs/buffer.c ---- linux-2.6.31.1/fs/buffer.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/buffer.c 2009-10-01 20:12:44.000000000 -0400 -@@ -25,6 +25,7 @@ - #include <linux/percpu.h> - #include <linux/slab.h> - #include <linux/capability.h> -+#include <linux/security.h> - #include <linux/blkdev.h> - #include <linux/file.h> - #include <linux/quotaops.h> -@@ -2233,6 +2234,7 @@ int generic_cont_expand_simple(struct in - - err = -EFBIG; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; -+ gr_learn_resource(current, RLIMIT_FSIZE, (unsigned long) size, 1); - if (limit != RLIM_INFINITY && size > (loff_t)limit) { - send_sig(SIGXFSZ, current, 0); - goto out; -diff -urNp linux-2.6.31.1/fs/cifs/cifs_dfs_ref.c linux-2.6.31.1/fs/cifs/cifs_dfs_ref.c ---- linux-2.6.31.1/fs/cifs/cifs_dfs_ref.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/cifs/cifs_dfs_ref.c 2009-10-01 20:12:44.000000000 -0400 -@@ -385,7 +385,7 @@ out_err: - goto out; - } - --struct inode_operations cifs_dfs_referral_inode_operations = { -+const struct inode_operations cifs_dfs_referral_inode_operations = { - .follow_link = cifs_dfs_follow_mountpoint, - }; - -diff -urNp 
linux-2.6.31.1/fs/cifs/cifsfs.h linux-2.6.31.1/fs/cifs/cifsfs.h ---- linux-2.6.31.1/fs/cifs/cifsfs.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/cifs/cifsfs.h 2009-10-01 20:12:44.000000000 -0400 -@@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, - - extern const struct inode_operations cifs_file_inode_ops; - extern const struct inode_operations cifs_symlink_inode_ops; --extern struct inode_operations cifs_dfs_referral_inode_operations; -+extern const struct inode_operations cifs_dfs_referral_inode_operations; - - - /* Functions related to files and directories */ -diff -urNp linux-2.6.31.1/fs/cifs/cifs_uniupr.h linux-2.6.31.1/fs/cifs/cifs_uniupr.h ---- linux-2.6.31.1/fs/cifs/cifs_uniupr.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/cifs/cifs_uniupr.h 2009-10-01 20:12:44.000000000 -0400 -@@ -132,7 +132,7 @@ const struct UniCaseRange CifsUniUpperRa - {0x0490, 0x04cc, UniCaseRangeU0490}, - {0x1e00, 0x1ffc, UniCaseRangeU1e00}, - {0xff40, 0xff5a, UniCaseRangeUff40}, -- {0} -+ {0, 0, NULL} - }; - #endif - -diff -urNp linux-2.6.31.1/fs/cifs/link.c linux-2.6.31.1/fs/cifs/link.c ---- linux-2.6.31.1/fs/cifs/link.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/cifs/link.c 2009-10-01 20:12:44.000000000 -0400 -@@ -215,7 +215,7 @@ cifs_symlink(struct inode *inode, struct - - void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) - { -- char *p = nd_get_link(nd); -+ const char *p = nd_get_link(nd); - if (!IS_ERR(p)) - kfree(p); - } -diff -urNp linux-2.6.31.1/fs/compat_binfmt_elf.c linux-2.6.31.1/fs/compat_binfmt_elf.c ---- linux-2.6.31.1/fs/compat_binfmt_elf.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/compat_binfmt_elf.c 2009-10-01 20:12:44.000000000 -0400 -@@ -29,10 +29,12 @@ - #undef elfhdr - #undef elf_phdr - #undef elf_note -+#undef elf_dyn - #undef elf_addr_t - #define elfhdr elf32_hdr - #define elf_phdr elf32_phdr - #define elf_note elf32_note -+#define elf_dyn Elf32_Dyn - #define elf_addr_t Elf32_Addr - - /* -diff -urNp linux-2.6.31.1/fs/compat.c linux-2.6.31.1/fs/compat.c ---- linux-2.6.31.1/fs/compat.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/compat.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1417,14 +1417,12 @@ static int compat_copy_strings(int argc, - if (!kmapped_page || kpos != (pos & PAGE_MASK)) { - struct page *page; - --#ifdef CONFIG_STACK_GROWSUP - ret = expand_stack_downwards(bprm->vma, pos); - if (ret < 0) { - /* We've exceed the stack rlimit. 
*/ - ret = -E2BIG; - goto out; - } --#endif - ret = get_user_pages(current, bprm->mm, pos, - 1, 1, 1, &page, NULL); - if (ret <= 0) { -@@ -1470,6 +1468,11 @@ int compat_do_execve(char * filename, - compat_uptr_t __user *envp, - struct pt_regs * regs) - { -+#ifdef CONFIG_GRKERNSEC -+ struct file *old_exec_file; -+ struct acl_subject_label *old_acl; -+ struct rlimit old_rlim[RLIM_NLIMITS]; -+#endif - struct linux_binprm *bprm; - struct file *file; - struct files_struct *displaced; -@@ -1506,6 +1509,14 @@ int compat_do_execve(char * filename, - bprm->filename = filename; - bprm->interp = filename; - -+ gr_learn_resource(current, RLIMIT_NPROC, atomic_read(&current->cred->user->processes), 1); -+ retval = -EAGAIN; -+ if (gr_handle_nproc()) -+ goto out_file; -+ retval = -EACCES; -+ if (!gr_acl_handle_execve(file->f_dentry, file->f_vfsmnt)) -+ goto out_file; -+ - retval = bprm_mm_init(bprm); - if (retval) - goto out_file; -@@ -1535,9 +1546,40 @@ int compat_do_execve(char * filename, - if (retval < 0) - goto out; - -+ if (!gr_tpe_allow(file)) { -+ retval = -EACCES; -+ goto out; -+ } -+ -+ if (gr_check_crash_exec(file)) { -+ retval = -EACCES; -+ goto out; -+ } -+ -+ gr_log_chroot_exec(file->f_dentry, file->f_vfsmnt); -+ -+ gr_handle_exec_args(bprm, (char __user * __user *)argv); -+ -+#ifdef CONFIG_GRKERNSEC -+ old_acl = current->acl; -+ memcpy(old_rlim, current->signal->rlim, sizeof(old_rlim)); -+ old_exec_file = current->exec_file; -+ get_file(file); -+ current->exec_file = file; -+#endif -+ -+ retval = gr_set_proc_label(file->f_dentry, file->f_vfsmnt, -+ bprm->unsafe & LSM_UNSAFE_SHARE); -+ if (retval < 0) -+ goto out_fail; -+ - retval = search_binary_handler(bprm, regs); - if (retval < 0) -- goto out; -+ goto out_fail; -+#ifdef CONFIG_GRKERNSEC -+ if (old_exec_file) -+ fput(old_exec_file); -+#endif - - /* execve succeeded */ - current->fs->in_exec = 0; -@@ -1548,6 +1590,14 @@ int compat_do_execve(char * filename, - put_files_struct(displaced); - return retval; - -+out_fail: -+#ifdef CONFIG_GRKERNSEC -+ current->acl = old_acl; -+ memcpy(current->signal->rlim, old_rlim, sizeof(old_rlim)); -+ fput(current->exec_file); -+ current->exec_file = old_exec_file; -+#endif -+ - out: - if (bprm->mm) - mmput(bprm->mm); -diff -urNp linux-2.6.31.1/fs/compat_ioctl.c linux-2.6.31.1/fs/compat_ioctl.c ---- linux-2.6.31.1/fs/compat_ioctl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/compat_ioctl.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1827,15 +1827,15 @@ struct ioctl_trans { - }; - - #define HANDLE_IOCTL(cmd,handler) \ -- { (cmd), (ioctl_trans_handler_t)(handler) }, -+ { (cmd), (ioctl_trans_handler_t)(handler), NULL }, - - /* pointer to compatible structure or no argument */ - #define COMPATIBLE_IOCTL(cmd) \ -- { (cmd), do_ioctl32_pointer }, -+ { (cmd), do_ioctl32_pointer, NULL }, - - /* argument is an unsigned long integer, not a pointer */ - #define ULONG_IOCTL(cmd) \ -- { (cmd), (ioctl_trans_handler_t)sys_ioctl }, -+ { (cmd), (ioctl_trans_handler_t)sys_ioctl, NULL }, - - /* ioctl should not be warned about even if it's not implemented.
- Valid reasons to use this: -diff -urNp linux-2.6.31.1/fs/debugfs/inode.c linux-2.6.31.1/fs/debugfs/inode.c ---- linux-2.6.31.1/fs/debugfs/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/debugfs/inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -118,7 +118,7 @@ static inline int debugfs_positive(struc - - static int debug_fill_super(struct super_block *sb, void *data, int silent) - { -- static struct tree_descr debug_files[] = {{""}}; -+ static struct tree_descr debug_files[] = {{"", NULL, 0}}; - - return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); - } -diff -urNp linux-2.6.31.1/fs/dlm/debug_fs.c linux-2.6.31.1/fs/dlm/debug_fs.c ---- linux-2.6.31.1/fs/dlm/debug_fs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/dlm/debug_fs.c 2009-10-01 20:12:44.000000000 -0400 -@@ -386,9 +386,9 @@ static int table_seq_show(struct seq_fil - return rv; - } - --static struct seq_operations format1_seq_ops; --static struct seq_operations format2_seq_ops; --static struct seq_operations format3_seq_ops; -+static const struct seq_operations format1_seq_ops; -+static const struct seq_operations format2_seq_ops; -+static const struct seq_operations format3_seq_ops; - - static void *table_seq_start(struct seq_file *seq, loff_t *pos) - { -@@ -534,21 +534,21 @@ static void table_seq_stop(struct seq_fi - } - } - --static struct seq_operations format1_seq_ops = { -+static const struct seq_operations format1_seq_ops = { - .start = table_seq_start, - .next = table_seq_next, - .stop = table_seq_stop, - .show = table_seq_show, - }; - --static struct seq_operations format2_seq_ops = { -+static const struct seq_operations format2_seq_ops = { - .start = table_seq_start, - .next = table_seq_next, - .stop = table_seq_stop, - .show = table_seq_show, - }; - --static struct seq_operations format3_seq_ops = { -+static const struct seq_operations format3_seq_ops = { - .start = table_seq_start, - .next = table_seq_next, - .stop = table_seq_stop, -diff -urNp linux-2.6.31.1/fs/ecryptfs/ecryptfs_kernel.h linux-2.6.31.1/fs/ecryptfs/ecryptfs_kernel.h ---- linux-2.6.31.1/fs/ecryptfs/ecryptfs_kernel.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ecryptfs/ecryptfs_kernel.h 2009-10-01 20:12:44.000000000 -0400 -@@ -582,7 +582,7 @@ extern const struct inode_operations ecr - extern const struct inode_operations ecryptfs_symlink_iops; - extern const struct super_operations ecryptfs_sops; - extern const struct dentry_operations ecryptfs_dops; --extern struct address_space_operations ecryptfs_aops; -+extern const struct address_space_operations ecryptfs_aops; - extern int ecryptfs_verbosity; - extern unsigned int ecryptfs_message_buf_len; - extern signed long ecryptfs_message_wait_timeout; -diff -urNp linux-2.6.31.1/fs/ecryptfs/mmap.c linux-2.6.31.1/fs/ecryptfs/mmap.c ---- linux-2.6.31.1/fs/ecryptfs/mmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ecryptfs/mmap.c 2009-10-01 20:12:44.000000000 -0400 -@@ -545,7 +545,7 @@ static sector_t ecryptfs_bmap(struct add - return rc; - } - --struct address_space_operations ecryptfs_aops = { -+const struct address_space_operations ecryptfs_aops = { - .writepage = ecryptfs_writepage, - .readpage = ecryptfs_readpage, - .write_begin = ecryptfs_write_begin, -diff -urNp linux-2.6.31.1/fs/exec.c linux-2.6.31.1/fs/exec.c ---- linux-2.6.31.1/fs/exec.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/exec.c 2009-10-01 20:12:44.000000000 -0400 -@@ -55,12 +55,24 @@ - #include <linux/kmod.h> - #include <linux/fsnotify.h> - #include 
<linux/fs_struct.h> -+#include <linux/random.h> -+#include <linux/seq_file.h> -+ -+#ifdef CONFIG_PAX_REFCOUNT -+#include <linux/kallsyms.h> -+#include <linux/kdebug.h> -+#endif - - #include <asm/uaccess.h> - #include <asm/mmu_context.h> - #include <asm/tlb.h> - #include "internal.h" - -+#ifdef CONFIG_PAX_HOOK_ACL_FLAGS -+void (*pax_set_initial_flags_func)(struct linux_binprm *bprm); -+EXPORT_SYMBOL(pax_set_initial_flags_func); -+#endif -+ - int core_uses_pid; - char core_pattern[CORENAME_MAX_SIZE] = "core"; - int suid_dumpable = 0; -@@ -113,7 +125,7 @@ SYSCALL_DEFINE1(uselib, const char __use - goto out; - - file = do_filp_open(AT_FDCWD, tmp, -- O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, -+ O_LARGEFILE | O_RDONLY | FMODE_EXEC | FMODE_GREXEC, 0, - MAY_READ | MAY_EXEC | MAY_OPEN); - putname(tmp); - error = PTR_ERR(file); -@@ -161,18 +173,10 @@ static struct page *get_arg_page(struct - int write) - { - struct page *page; -- int ret; - --#ifdef CONFIG_STACK_GROWSUP -- if (write) { -- ret = expand_stack_downwards(bprm->vma, pos); -- if (ret < 0) -- return NULL; -- } --#endif -- ret = get_user_pages(current, bprm->mm, pos, -- 1, write, 1, &page, NULL); -- if (ret <= 0) -+ if (0 > expand_stack_downwards(bprm->vma, pos)) -+ return NULL; -+ if (0 >= get_user_pages(current, bprm->mm, pos, 1, write, 1, &page, NULL)) - return NULL; - - if (write) { -@@ -244,6 +248,11 @@ static int __bprm_mm_init(struct linux_b - vma->vm_end = STACK_TOP_MAX; - vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ vma->vm_flags &= ~(VM_EXEC | VM_MAYEXEC); -+#endif -+ - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - err = insert_vm_struct(mm, vma); - if (err) -@@ -252,6 +261,12 @@ static int __bprm_mm_init(struct linux_b - mm->stack_vm = mm->total_vm = 1; - up_write(&mm->mmap_sem); - bprm->p = vma->vm_end - sizeof(void *); -+ -+#ifdef CONFIG_PAX_RANDUSTACK -+ if (randomize_va_space) -+ bprm->p ^= (pax_get_random_long() & ~15) & ~PAGE_MASK; -+#endif -+ - return 0; - err: - up_write(&mm->mmap_sem); -@@ -503,7 +518,8 @@ static int shift_arg_pages(struct vm_are - unsigned long new_end = old_end - shift; - struct mmu_gather *tlb; - -- BUG_ON(new_start > new_end); -+ if (new_start >= new_end || new_start < mmap_min_addr) -+ return -EFAULT; - - /* - * ensure there are no vmas between where we want to go -@@ -512,6 +528,10 @@ static int shift_arg_pages(struct vm_are - if (vma != find_vma(mm, new_start)) - return -EFAULT; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ BUG_ON(pax_find_mirror_vma(vma)); -+#endif -+ - /* - * cover the whole range: [new_start, old_end) - */ -@@ -600,6 +620,14 @@ int setup_arg_pages(struct linux_binprm - bprm->exec -= stack_shift; - - down_write(&mm->mmap_sem); -+ -+ /* Move stack pages down in memory. */ -+ if (stack_shift) { -+ ret = shift_arg_pages(vma, stack_shift); -+ if (ret) -+ goto out_unlock; -+ } -+ - vm_flags = VM_STACK_FLAGS; - - /* -@@ -613,21 +641,24 @@ int setup_arg_pages(struct linux_binprm - vm_flags &= ~VM_EXEC; - vm_flags |= mm->def_flags; - -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+ if (mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ vm_flags &= ~VM_EXEC; -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (mm->pax_flags & MF_PAX_MPROTECT) -+ vm_flags &= ~VM_MAYEXEC; -+#endif -+ -+ } -+#endif -+ - ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, - vm_flags); - if (ret) - goto out_unlock; - BUG_ON(prev != vma); - -- /* Move stack pages down in memory. 
*/ -- if (stack_shift) { -- ret = shift_arg_pages(vma, stack_shift); -- if (ret) { -- up_write(&mm->mmap_sem); -- return ret; -- } -- } -- - #ifdef CONFIG_STACK_GROWSUP - stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; - #else -@@ -639,7 +670,7 @@ int setup_arg_pages(struct linux_binprm - - out_unlock: - up_write(&mm->mmap_sem); -- return 0; -+ return ret; - } - EXPORT_SYMBOL(setup_arg_pages); - -@@ -651,7 +682,7 @@ struct file *open_exec(const char *name) - int err; - - file = do_filp_open(AT_FDCWD, name, -- O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, -+ O_LARGEFILE | O_RDONLY | FMODE_EXEC | FMODE_GREXEC, 0, - MAY_EXEC | MAY_OPEN); - if (IS_ERR(file)) - goto out; -@@ -1085,7 +1116,7 @@ int check_unsafe_exec(struct linux_binpr - } - rcu_read_unlock(); - -- if (p->fs->users > n_fs) { -+ if (atomic_read(&p->fs->users) > n_fs) { - bprm->unsafe |= LSM_UNSAFE_SHARE; - } else { - res = -EAGAIN; -@@ -1284,6 +1315,11 @@ int do_execve(char * filename, - char __user *__user *envp, - struct pt_regs * regs) - { -+#ifdef CONFIG_GRKERNSEC -+ struct file *old_exec_file; -+ struct acl_subject_label *old_acl; -+ struct rlimit old_rlim[RLIM_NLIMITS]; -+#endif - struct linux_binprm *bprm; - struct file *file; - struct files_struct *displaced; -@@ -1320,6 +1356,18 @@ int do_execve(char * filename, - bprm->filename = filename; - bprm->interp = filename; - -+ gr_learn_resource(current, RLIMIT_NPROC, atomic_read(&current->cred->user->processes), 1); -+ -+ if (gr_handle_nproc()) { -+ retval = -EAGAIN; -+ goto out_file; -+ } -+ -+ if (!gr_acl_handle_execve(file->f_dentry, file->f_vfsmnt)) { -+ retval = -EACCES; -+ goto out_file; -+ } -+ - retval = bprm_mm_init(bprm); - if (retval) - goto out_file; -@@ -1349,10 +1397,41 @@ int do_execve(char * filename, - if (retval < 0) - goto out; - -+ if (!gr_tpe_allow(file)) { -+ retval = -EACCES; -+ goto out; -+ } -+ -+ if (gr_check_crash_exec(file)) { -+ retval = -EACCES; -+ goto out; -+ } -+ -+ gr_log_chroot_exec(file->f_dentry, file->f_vfsmnt); -+ -+ gr_handle_exec_args(bprm, argv); -+ -+#ifdef CONFIG_GRKERNSEC -+ old_acl = current->acl; -+ memcpy(old_rlim, current->signal->rlim, sizeof(old_rlim)); -+ old_exec_file = current->exec_file; -+ get_file(file); -+ current->exec_file = file; -+#endif -+ -+ retval = gr_set_proc_label(file->f_dentry, file->f_vfsmnt, -+ bprm->unsafe & LSM_UNSAFE_SHARE); -+ if (retval < 0) -+ goto out_fail; -+ - current->flags &= ~PF_KTHREAD; - retval = search_binary_handler(bprm,regs); - if (retval < 0) -- goto out; -+ goto out_fail; -+#ifdef CONFIG_GRKERNSEC -+ if (old_exec_file) -+ fput(old_exec_file); -+#endif - - /* execve succeeded */ - current->fs->in_exec = 0; -@@ -1363,6 +1442,14 @@ int do_execve(char * filename, - put_files_struct(displaced); - return retval; - -+out_fail: -+#ifdef CONFIG_GRKERNSEC -+ current->acl = old_acl; -+ memcpy(current->signal->rlim, old_rlim, sizeof(old_rlim)); -+ fput(current->exec_file); -+ current->exec_file = old_exec_file; -+#endif -+ - out: - if (bprm->mm) - mmput (bprm->mm); -@@ -1528,6 +1615,164 @@ out: - return ispipe; - } - -+int pax_check_flags(unsigned long *flags) -+{ -+ int retval = 0; -+ -+#if !defined(CONFIG_X86_32) || !defined(CONFIG_PAX_SEGMEXEC) -+ if (*flags & MF_PAX_SEGMEXEC) -+ { -+ *flags &= ~MF_PAX_SEGMEXEC; -+ retval = -EINVAL; -+ } -+#endif -+ -+ if ((*flags & MF_PAX_PAGEEXEC) -+ -+#ifdef CONFIG_PAX_PAGEEXEC -+ && (*flags & MF_PAX_SEGMEXEC) -+#endif -+ -+ ) -+ { -+ *flags &= ~MF_PAX_PAGEEXEC; -+ retval = -EINVAL; -+ } -+ -+ if ((*flags & MF_PAX_MPROTECT) -+ -+#ifdef CONFIG_PAX_MPROTECT -+
&& !(*flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) -+#endif -+ -+ ) -+ { -+ *flags &= ~MF_PAX_MPROTECT; -+ retval = -EINVAL; -+ } -+ -+ if ((*flags & MF_PAX_EMUTRAMP) -+ -+#ifdef CONFIG_PAX_EMUTRAMP -+ && !(*flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) -+#endif -+ -+ ) -+ { -+ *flags &= ~MF_PAX_EMUTRAMP; -+ retval = -EINVAL; -+ } -+ -+ return retval; -+} -+ -+EXPORT_SYMBOL(pax_check_flags); -+ -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+void pax_report_fault(struct pt_regs *regs, void *pc, void *sp) -+{ -+ struct task_struct *tsk = current; -+ struct mm_struct *mm = current->mm; -+ char *buffer_exec = (char *)__get_free_page(GFP_KERNEL); -+ char *buffer_fault = (char *)__get_free_page(GFP_KERNEL); -+ char *path_exec = NULL; -+ char *path_fault = NULL; -+ unsigned long start = 0UL, end = 0UL, offset = 0UL; -+ -+ if (buffer_exec && buffer_fault) { -+ struct vm_area_struct *vma, *vma_exec = NULL, *vma_fault = NULL; -+ -+ down_read(&mm->mmap_sem); -+ vma = mm->mmap; -+ while (vma && (!vma_exec || !vma_fault)) { -+ if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) -+ vma_exec = vma; -+ if (vma->vm_start <= (unsigned long)pc && (unsigned long)pc < vma->vm_end) -+ vma_fault = vma; -+ vma = vma->vm_next; -+ } -+ if (vma_exec) { -+ path_exec = d_path(&vma_exec->vm_file->f_path, buffer_exec, PAGE_SIZE); -+ if (IS_ERR(path_exec)) -+ path_exec = "<path too long>"; -+ else { -+ path_exec = mangle_path(buffer_exec, path_exec, "\t\n\\"); -+ if (path_exec) { -+ *path_exec = 0; -+ path_exec = buffer_exec; -+ } else -+ path_exec = "<path too long>"; -+ } -+ } -+ if (vma_fault) { -+ start = vma_fault->vm_start; -+ end = vma_fault->vm_end; -+ offset = vma_fault->vm_pgoff << PAGE_SHIFT; -+ if (vma_fault->vm_file) { -+ path_fault = d_path(&vma_fault->vm_file->f_path, buffer_fault, PAGE_SIZE); -+ if (IS_ERR(path_fault)) -+ path_fault = "<path too long>"; -+ else { -+ path_fault = mangle_path(buffer_fault, path_fault, "\t\n\\"); -+ if (path_fault) { -+ *path_fault = 0; -+ path_fault = buffer_fault; -+ } else -+ path_fault = "<path too long>"; -+ } -+ } else -+ path_fault = "<anonymous mapping>"; -+ } -+ up_read(&mm->mmap_sem); -+ } -+ if (tsk->signal->curr_ip) -+ printk(KERN_ERR "PAX: From %u.%u.%u.%u: execution attempt in: %s, %08lx-%08lx %08lx\n", NIPQUAD(tsk->signal->curr_ip), path_fault, start, end, offset); -+ else -+ printk(KERN_ERR "PAX: execution attempt in: %s, %08lx-%08lx %08lx\n", path_fault, start, end, offset); -+ printk(KERN_ERR "PAX: terminating task: %s(%s):%d, uid/euid: %u/%u, " -+ "PC: %p, SP: %p\n", path_exec, tsk->comm, task_pid_nr(tsk), -+ task_uid(tsk), task_euid(tsk), pc, sp); -+ free_page((unsigned long)buffer_exec); -+ free_page((unsigned long)buffer_fault); -+ pax_report_insns(pc, sp); -+ do_coredump(SIGKILL, SIGKILL, regs); -+} -+#endif -+ -+#ifdef CONFIG_PAX_REFCOUNT -+void pax_report_refcount_overflow(struct pt_regs *regs) -+{ -+ if (current->signal->curr_ip) -+ printk(KERN_ERR "PAX: From %u.%u.%u.%u: refcount overflow detected in: %s:%d, uid/euid: %u/%u\n", -+ NIPQUAD(current->signal->curr_ip), current->comm, task_pid_nr(current), current_uid(), current_euid()); -+ else -+ printk(KERN_ERR "PAX: refcount overflow detected in: %s:%d, uid/euid: %u/%u\n", -+ current->comm, task_pid_nr(current), current_uid(), current_euid()); -+ print_symbol(KERN_ERR "PAX: refcount overflow occured at: %s\n", instruction_pointer(regs)); -+ show_regs(regs); -+ force_sig_specific(SIGKILL, current); -+} -+#endif -+ -+#ifdef CONFIG_PAX_USERCOPY -+void pax_report_leak_to_user(const
void *ptr, unsigned long len) -+{ -+ if (current->signal->curr_ip) -+ printk(KERN_ERR "PAX: From %u.%u.%u.%u: kernel memory leak attempt detected from %p (%lu bytes)\n", NIPQUAD(current->signal->curr_ip), ptr, len); -+ else -+ printk(KERN_ERR "PAX: kernel memory leak attempt detected from %p (%lu bytes)\n", ptr, len); -+ dump_stack(); -+ do_group_exit(SIGKILL); -+} -+ -+void pax_report_overflow_from_user(const void *ptr, unsigned long len) -+{ -+ printk(KERN_ERR "PAX: kernel memory overflow attempt detected to %p (%lu bytes)\n", ptr, len); -+ dump_stack(); -+ do_group_exit(SIGKILL); -+} -+#endif -+ - static int zap_process(struct task_struct *start) - { - struct task_struct *t; -@@ -1787,6 +2032,10 @@ void do_coredump(long signr, int exit_co - */ - clear_thread_flag(TIF_SIGPENDING); - -+ if (signr == SIGKILL || signr == SIGILL) -+ gr_handle_brute_attach(current); -+ gr_learn_resource(current, RLIMIT_CORE, binfmt->min_coredump, 1); -+ - /* - * lock_kernel() because format_corename() is controlled by sysctl, which - * uses lock_kernel() -diff -urNp linux-2.6.31.1/fs/ext2/balloc.c linux-2.6.31.1/fs/ext2/balloc.c ---- linux-2.6.31.1/fs/ext2/balloc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext2/balloc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1192,7 +1192,7 @@ static int ext2_has_free_blocks(struct e - - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); -- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && -+ if (free_blocks < root_blocks + 1 && !capable_nolog(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { - return 0; -diff -urNp linux-2.6.31.1/fs/ext3/balloc.c linux-2.6.31.1/fs/ext3/balloc.c ---- linux-2.6.31.1/fs/ext3/balloc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext3/balloc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1421,7 +1421,7 @@ static int ext3_has_free_blocks(struct e - - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); -- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && -+ if (free_blocks < root_blocks + 1 && !capable_nolog(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { - return 0; -diff -urNp linux-2.6.31.1/fs/ext3/namei.c linux-2.6.31.1/fs/ext3/namei.c ---- linux-2.6.31.1/fs/ext3/namei.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext3/namei.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1168,7 +1168,7 @@ static struct ext3_dir_entry_2 *do_split - char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size; - struct ext3_dir_entry_2 *de = NULL, *de2; -- int err = 0, i; -+ int i, err = 0; - - bh2 = ext3_append (handle, dir, &newblock, &err); - if (!(bh2)) { -diff -urNp linux-2.6.31.1/fs/ext3/xattr.c linux-2.6.31.1/fs/ext3/xattr.c ---- linux-2.6.31.1/fs/ext3/xattr.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext3/xattr.c 2009-10-01 20:12:44.000000000 -0400 -@@ -89,8 +89,8 @@ - printk("\n"); \ - } while (0) - #else --# define ea_idebug(f...) --# define ea_bdebug(f...) -+# define ea_idebug(f...) do {} while (0) -+# define ea_bdebug(f...) 
do {} while (0) - #endif - - static void ext3_xattr_cache_insert(struct buffer_head *); -diff -urNp linux-2.6.31.1/fs/ext4/balloc.c linux-2.6.31.1/fs/ext4/balloc.c ---- linux-2.6.31.1/fs/ext4/balloc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext4/balloc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -573,7 +573,7 @@ int ext4_has_free_blocks(struct ext4_sb_ - /* Hm, nope. Are (enough) root reserved blocks available? */ - if (sbi->s_resuid == current_fsuid() || - ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || -- capable(CAP_SYS_RESOURCE)) { -+ capable_nolog(CAP_SYS_RESOURCE)) { - if (free_blocks >= (nblocks + dirty_blocks)) - return 1; - } -diff -urNp linux-2.6.31.1/fs/ext4/file.c linux-2.6.31.1/fs/ext4/file.c ---- linux-2.6.31.1/fs/ext4/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext4/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -130,7 +130,7 @@ force_commit: - return ret; - } - --static struct vm_operations_struct ext4_file_vm_ops = { -+static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = ext4_page_mkwrite, - }; -diff -urNp linux-2.6.31.1/fs/ext4/mballoc.c linux-2.6.31.1/fs/ext4/mballoc.c ---- linux-2.6.31.1/fs/ext4/mballoc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext4/mballoc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -2205,7 +2205,7 @@ static void ext4_mb_seq_history_stop(str - { - } - --static struct seq_operations ext4_mb_seq_history_ops = { -+static const struct seq_operations ext4_mb_seq_history_ops = { - .start = ext4_mb_seq_history_start, - .next = ext4_mb_seq_history_next, - .stop = ext4_mb_seq_history_stop, -@@ -2287,7 +2287,7 @@ static ssize_t ext4_mb_seq_history_write - return count; - } - --static struct file_operations ext4_mb_seq_history_fops = { -+static const struct file_operations ext4_mb_seq_history_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_history_open, - .read = seq_read, -@@ -2366,7 +2366,7 @@ static void ext4_mb_seq_groups_stop(stru - { - } - --static struct seq_operations ext4_mb_seq_groups_ops = { -+static const struct seq_operations ext4_mb_seq_groups_ops = { - .start = ext4_mb_seq_groups_start, - .next = ext4_mb_seq_groups_next, - .stop = ext4_mb_seq_groups_stop, -@@ -2387,7 +2387,7 @@ static int ext4_mb_seq_groups_open(struc - - } - --static struct file_operations ext4_mb_seq_groups_fops = { -+static const struct file_operations ext4_mb_seq_groups_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_groups_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/fs/ext4/namei.c linux-2.6.31.1/fs/ext4/namei.c ---- linux-2.6.31.1/fs/ext4/namei.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ext4/namei.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1203,7 +1203,7 @@ static struct ext4_dir_entry_2 *do_split - char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size; - struct ext4_dir_entry_2 *de = NULL, *de2; -- int err = 0, i; -+ int i, err = 0; - - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) { -diff -urNp linux-2.6.31.1/fs/fcntl.c linux-2.6.31.1/fs/fcntl.c ---- linux-2.6.31.1/fs/fcntl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/fcntl.c 2009-10-01 20:12:44.000000000 -0400 -@@ -271,6 +271,7 @@ static long do_fcntl(int fd, unsigned in - switch (cmd) { - case F_DUPFD: - case F_DUPFD_CLOEXEC: -+ gr_learn_resource(current, RLIMIT_NOFILE, arg, 0); - if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - break; - err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? 
O_CLOEXEC : 0); -@@ -421,7 +422,8 @@ static inline int sigio_perm(struct task - ret = ((fown->euid == 0 || - fown->euid == cred->suid || fown->euid == cred->uid || - fown->uid == cred->suid || fown->uid == cred->uid) && -- !security_file_send_sigiotask(p, fown, sig)); -+ !security_file_send_sigiotask(p, fown, sig) && -+ !gr_check_protected_task(p) && !gr_pid_is_chrooted(p)); - rcu_read_unlock(); - return ret; - } -diff -urNp linux-2.6.31.1/fs/file.c linux-2.6.31.1/fs/file.c ---- linux-2.6.31.1/fs/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -13,6 +13,7 @@ - #include <linux/slab.h> - #include <linux/vmalloc.h> - #include <linux/file.h> -+#include <linux/security.h> - #include <linux/fdtable.h> - #include <linux/bitops.h> - #include <linux/interrupt.h> -@@ -256,6 +257,8 @@ int expand_files(struct files_struct *fi - * N.B. For clone tasks sharing a files structure, this test - * will limit the total number of files that can be opened. - */ -+ -+ gr_learn_resource(current, RLIMIT_NOFILE, nr, 0); - if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - return -EMFILE; - -diff -urNp linux-2.6.31.1/fs/fs_struct.c linux-2.6.31.1/fs/fs_struct.c ---- linux-2.6.31.1/fs/fs_struct.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/fs_struct.c 2009-10-01 20:12:44.000000000 -0400 -@@ -89,7 +89,7 @@ void exit_fs(struct task_struct *tsk) - task_lock(tsk); - write_lock(&fs->lock); - tsk->fs = NULL; -- kill = !--fs->users; -+ kill = !atomic_dec_return(&fs->users); - write_unlock(&fs->lock); - task_unlock(tsk); - if (kill) -@@ -102,7 +102,7 @@ struct fs_struct *copy_fs_struct(struct - struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); - /* We don't need to lock fs - think why ;-) */ - if (fs) { -- fs->users = 1; -+ atomic_set(&fs->users, 1); - fs->in_exec = 0; - rwlock_init(&fs->lock); - fs->umask = old->umask; -@@ -127,7 +127,7 @@ int unshare_fs_struct(void) - - task_lock(current); - write_lock(&fs->lock); -- kill = !--fs->users; -+ kill = !atomic_dec_return(&fs->users); - current->fs = new_fs; - write_unlock(&fs->lock); - task_unlock(current); -@@ -147,7 +147,7 @@ EXPORT_SYMBOL(current_umask); - - /* to be mentioned only in INIT_TASK */ - struct fs_struct init_fs = { -- .users = 1, -+ .users = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, - }; -@@ -162,12 +162,12 @@ void daemonize_fs_struct(void) - task_lock(current); - - write_lock(&init_fs.lock); -- init_fs.users++; -+ atomic_inc(&init_fs.users); - write_unlock(&init_fs.lock); - - write_lock(&fs->lock); - current->fs = &init_fs; -- kill = !--fs->users; -+ kill = !atomic_dec_return(&fs->users); - write_unlock(&fs->lock); - - task_unlock(current); -diff -urNp linux-2.6.31.1/fs/fuse/control.c linux-2.6.31.1/fs/fuse/control.c ---- linux-2.6.31.1/fs/fuse/control.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/fuse/control.c 2009-10-01 20:12:44.000000000 -0400 -@@ -161,7 +161,7 @@ void fuse_ctl_remove_conn(struct fuse_co - - static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) - { -- struct tree_descr empty_descr = {""}; -+ struct tree_descr empty_descr = {"", NULL, 0}; - struct fuse_conn *fc; - int err; - -diff -urNp linux-2.6.31.1/fs/fuse/dev.c linux-2.6.31.1/fs/fuse/dev.c ---- linux-2.6.31.1/fs/fuse/dev.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/fuse/dev.c 2009-10-01 20:12:44.000000000 -0400 -@@ -885,7 +885,7 @@ static int fuse_notify_inval_entry(struc - { - struct 
fuse_notify_inval_entry_out outarg; - int err = -EINVAL; -- char buf[FUSE_NAME_MAX+1]; -+ char *buf = NULL; - struct qstr name; - - if (size < sizeof(outarg)) -@@ -899,6 +899,11 @@ static int fuse_notify_inval_entry(struc - if (outarg.namelen > FUSE_NAME_MAX) - goto err; - -+ err = -ENOMEM; -+ buf = kmalloc(FUSE_NAME_MAX+1, GFP_KERNEL); -+ if (!buf) -+ goto err; -+ - name.name = buf; - name.len = outarg.namelen; - err = fuse_copy_one(cs, buf, outarg.namelen + 1); -@@ -910,17 +915,15 @@ static int fuse_notify_inval_entry(struc - - down_read(&fc->killsb); - err = -ENOENT; -- if (!fc->sb) -- goto err_unlock; -- -- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); -- --err_unlock: -+ if (fc->sb) -+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); - up_read(&fc->killsb); -+ kfree(buf); - return err; - - err: - fuse_copy_finish(cs); -+ kfree(buf); - return err; - } - -diff -urNp linux-2.6.31.1/fs/fuse/dir.c linux-2.6.31.1/fs/fuse/dir.c ---- linux-2.6.31.1/fs/fuse/dir.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/fuse/dir.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1122,7 +1122,7 @@ static char *read_link(struct dentry *de - return link; - } - --static void free_link(char *link) -+static void free_link(const char *link) - { - if (!IS_ERR(link)) - free_page((unsigned long) link); -diff -urNp linux-2.6.31.1/fs/fuse/file.c linux-2.6.31.1/fs/fuse/file.c ---- linux-2.6.31.1/fs/fuse/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/fuse/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1313,7 +1313,7 @@ static int fuse_page_mkwrite(struct vm_a - return 0; - } - --static struct vm_operations_struct fuse_file_vm_ops = { -+static const struct vm_operations_struct fuse_file_vm_ops = { - .close = fuse_vma_close, - .fault = filemap_fault, - .page_mkwrite = fuse_page_mkwrite, -diff -urNp linux-2.6.31.1/fs/gfs2/file.c linux-2.6.31.1/fs/gfs2/file.c ---- linux-2.6.31.1/fs/gfs2/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/gfs2/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -419,7 +419,7 @@ out: - return ret; - } - --static struct vm_operations_struct gfs2_vm_ops = { -+static const struct vm_operations_struct gfs2_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = gfs2_page_mkwrite, - }; -diff -urNp linux-2.6.31.1/fs/hfs/inode.c linux-2.6.31.1/fs/hfs/inode.c ---- linux-2.6.31.1/fs/hfs/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/hfs/inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -423,7 +423,7 @@ int hfs_write_inode(struct inode *inode, - - if (S_ISDIR(main_inode->i_mode)) { - if (fd.entrylength < sizeof(struct hfs_cat_dir)) -- /* panic? */; -+ {/* panic? */} - hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, - sizeof(struct hfs_cat_dir)); - if (rec.type != HFS_CDR_DIR || -@@ -444,7 +444,7 @@ int hfs_write_inode(struct inode *inode, - sizeof(struct hfs_cat_file)); - } else { - if (fd.entrylength < sizeof(struct hfs_cat_file)) -- /* panic? */; -+ {/* panic? */} - hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, - sizeof(struct hfs_cat_file)); - if (rec.type != HFS_CDR_FIL || -diff -urNp linux-2.6.31.1/fs/hfsplus/inode.c linux-2.6.31.1/fs/hfsplus/inode.c ---- linux-2.6.31.1/fs/hfsplus/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/hfsplus/inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -406,7 +406,7 @@ int hfsplus_cat_read_inode(struct inode - struct hfsplus_cat_folder *folder = &entry.folder; - - if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) -- /* panic? */; -+ {/* panic? 
*/} - hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, - sizeof(struct hfsplus_cat_folder)); - hfsplus_get_perms(inode, &folder->permissions, 1); -@@ -423,7 +423,7 @@ int hfsplus_cat_read_inode(struct inode - struct hfsplus_cat_file *file = &entry.file; - - if (fd->entrylength < sizeof(struct hfsplus_cat_file)) -- /* panic? */; -+ {/* panic? */} - hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, - sizeof(struct hfsplus_cat_file)); - -@@ -479,7 +479,7 @@ int hfsplus_cat_write_inode(struct inode - struct hfsplus_cat_folder *folder = &entry.folder; - - if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) -- /* panic? */; -+ {/* panic? */} - hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_folder)); - /* simple node checks? */ -@@ -501,7 +501,7 @@ int hfsplus_cat_write_inode(struct inode - struct hfsplus_cat_file *file = &entry.file; - - if (fd.entrylength < sizeof(struct hfsplus_cat_file)) -- /* panic? */; -+ {/* panic? */} - hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_file)); - hfsplus_inode_write_fork(inode, &file->data_fork); -diff -urNp linux-2.6.31.1/fs/jbd2/journal.c linux-2.6.31.1/fs/jbd2/journal.c ---- linux-2.6.31.1/fs/jbd2/journal.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/jbd2/journal.c 2009-10-01 20:12:44.000000000 -0400 -@@ -768,7 +768,7 @@ static void jbd2_seq_history_stop(struct - { - } - --static struct seq_operations jbd2_seq_history_ops = { -+static const struct seq_operations jbd2_seq_history_ops = { - .start = jbd2_seq_history_start, - .next = jbd2_seq_history_next, - .stop = jbd2_seq_history_stop, -@@ -818,7 +818,7 @@ static int jbd2_seq_history_release(stru - return seq_release(inode, file); - } - --static struct file_operations jbd2_seq_history_fops = { -+static const struct file_operations jbd2_seq_history_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_history_open, - .read = seq_read, -@@ -872,7 +872,7 @@ static void jbd2_seq_info_stop(struct se - { - } - --static struct seq_operations jbd2_seq_info_ops = { -+static const struct seq_operations jbd2_seq_info_ops = { - .start = jbd2_seq_info_start, - .next = jbd2_seq_info_next, - .stop = jbd2_seq_info_stop, -@@ -920,7 +920,7 @@ static int jbd2_seq_info_release(struct - return seq_release(inode, file); - } - --static struct file_operations jbd2_seq_info_fops = { -+static const struct file_operations jbd2_seq_info_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_info_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/fs/jffs2/debug.h linux-2.6.31.1/fs/jffs2/debug.h ---- linux-2.6.31.1/fs/jffs2/debug.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/jffs2/debug.h 2009-10-01 20:12:44.000000000 -0400 -@@ -52,13 +52,13 @@ - #if CONFIG_JFFS2_FS_DEBUG > 0 - #define D1(x) x - #else --#define D1(x) -+#define D1(x) do {} while (0); - #endif - - #if CONFIG_JFFS2_FS_DEBUG > 1 - #define D2(x) x - #else --#define D2(x) -+#define D2(x) do {} while (0); - #endif - - /* The prefixes of JFFS2 messages */ -@@ -114,73 +114,73 @@ - #ifdef JFFS2_DBG_READINODE_MESSAGES - #define dbg_readinode(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_readinode(fmt, ...) -+#define dbg_readinode(fmt, ...) do {} while (0) - #endif - #ifdef JFFS2_DBG_READINODE2_MESSAGES - #define dbg_readinode2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_readinode2(fmt, ...) -+#define dbg_readinode2(fmt, ...) 
do {} while (0) - #endif - - /* Fragtree build debugging messages */ - #ifdef JFFS2_DBG_FRAGTREE_MESSAGES - #define dbg_fragtree(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_fragtree(fmt, ...) -+#define dbg_fragtree(fmt, ...) do {} while (0) - #endif - #ifdef JFFS2_DBG_FRAGTREE2_MESSAGES - #define dbg_fragtree2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_fragtree2(fmt, ...) -+#define dbg_fragtree2(fmt, ...) do {} while (0) - #endif - - /* Directory entry list manilulation debugging messages */ - #ifdef JFFS2_DBG_DENTLIST_MESSAGES - #define dbg_dentlist(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_dentlist(fmt, ...) -+#define dbg_dentlist(fmt, ...) do {} while (0) - #endif - - /* Print the messages about manipulating node_refs */ - #ifdef JFFS2_DBG_NODEREF_MESSAGES - #define dbg_noderef(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_noderef(fmt, ...) -+#define dbg_noderef(fmt, ...) do {} while (0) - #endif - - /* Manipulations with the list of inodes (JFFS2 inocache) */ - #ifdef JFFS2_DBG_INOCACHE_MESSAGES - #define dbg_inocache(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_inocache(fmt, ...) -+#define dbg_inocache(fmt, ...) do {} while (0) - #endif - - /* Summary debugging messages */ - #ifdef JFFS2_DBG_SUMMARY_MESSAGES - #define dbg_summary(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_summary(fmt, ...) -+#define dbg_summary(fmt, ...) do {} while (0) - #endif - - /* File system build messages */ - #ifdef JFFS2_DBG_FSBUILD_MESSAGES - #define dbg_fsbuild(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_fsbuild(fmt, ...) -+#define dbg_fsbuild(fmt, ...) do {} while (0) - #endif - - /* Watch the object allocations */ - #ifdef JFFS2_DBG_MEMALLOC_MESSAGES - #define dbg_memalloc(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_memalloc(fmt, ...) -+#define dbg_memalloc(fmt, ...) do {} while (0) - #endif - - /* Watch the XATTR subsystem */ - #ifdef JFFS2_DBG_XATTR_MESSAGES - #define dbg_xattr(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) - #else --#define dbg_xattr(fmt, ...) -+#define dbg_xattr(fmt, ...) 
do {} while (0) - #endif - - /* "Sanity" checks */ -diff -urNp linux-2.6.31.1/fs/jffs2/erase.c linux-2.6.31.1/fs/jffs2/erase.c ---- linux-2.6.31.1/fs/jffs2/erase.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/jffs2/erase.c 2009-10-01 20:12:44.000000000 -0400 -@@ -434,7 +434,8 @@ static void jffs2_mark_erased_block(stru - struct jffs2_unknown_node marker = { - .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), - .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), -- .totlen = cpu_to_je32(c->cleanmarker_size) -+ .totlen = cpu_to_je32(c->cleanmarker_size), -+ .hdr_crc = cpu_to_je32(0) - }; - - jffs2_prealloc_raw_node_refs(c, jeb, 1); -diff -urNp linux-2.6.31.1/fs/jffs2/summary.h linux-2.6.31.1/fs/jffs2/summary.h ---- linux-2.6.31.1/fs/jffs2/summary.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/jffs2/summary.h 2009-10-01 20:12:44.000000000 -0400 -@@ -194,18 +194,18 @@ int jffs2_sum_scan_sumnode(struct jffs2_ - - #define jffs2_sum_active() (0) - #define jffs2_sum_init(a) (0) --#define jffs2_sum_exit(a) --#define jffs2_sum_disable_collecting(a) -+#define jffs2_sum_exit(a) do {} while (0) -+#define jffs2_sum_disable_collecting(a) do {} while (0) - #define jffs2_sum_is_disabled(a) (0) --#define jffs2_sum_reset_collected(a) -+#define jffs2_sum_reset_collected(a) do {} while (0) - #define jffs2_sum_add_kvec(a,b,c,d) (0) --#define jffs2_sum_move_collected(a,b) -+#define jffs2_sum_move_collected(a,b) do {} while (0) - #define jffs2_sum_write_sumnode(a) (0) --#define jffs2_sum_add_padding_mem(a,b) --#define jffs2_sum_add_inode_mem(a,b,c) --#define jffs2_sum_add_dirent_mem(a,b,c) --#define jffs2_sum_add_xattr_mem(a,b,c) --#define jffs2_sum_add_xref_mem(a,b,c) -+#define jffs2_sum_add_padding_mem(a,b) do {} while (0) -+#define jffs2_sum_add_inode_mem(a,b,c) do {} while (0) -+#define jffs2_sum_add_dirent_mem(a,b,c) do {} while (0) -+#define jffs2_sum_add_xattr_mem(a,b,c) do {} while (0) -+#define jffs2_sum_add_xref_mem(a,b,c) do {} while (0) - #define jffs2_sum_scan_sumnode(a,b,c,d,e) (0) - - #endif /* CONFIG_JFFS2_SUMMARY */ -diff -urNp linux-2.6.31.1/fs/jffs2/wbuf.c linux-2.6.31.1/fs/jffs2/wbuf.c ---- linux-2.6.31.1/fs/jffs2/wbuf.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/jffs2/wbuf.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1012,7 +1012,8 @@ static const struct jffs2_unknown_node o - { - .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK), - .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), -- .totlen = constant_cpu_to_je32(8) -+ .totlen = constant_cpu_to_je32(8), -+ .hdr_crc = constant_cpu_to_je32(0) - }; - - /* -diff -urNp linux-2.6.31.1/fs/locks.c linux-2.6.31.1/fs/locks.c ---- linux-2.6.31.1/fs/locks.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/locks.c 2009-10-01 20:12:44.000000000 -0400 -@@ -2007,16 +2007,16 @@ void locks_remove_flock(struct file *fil - return; - - if (filp->f_op && filp->f_op->flock) { -- struct file_lock fl = { -+ struct file_lock flock = { - .fl_pid = current->tgid, - .fl_file = filp, - .fl_flags = FL_FLOCK, - .fl_type = F_UNLCK, - .fl_end = OFFSET_MAX, - }; -- filp->f_op->flock(filp, F_SETLKW, &fl); -- if (fl.fl_ops && fl.fl_ops->fl_release_private) -- fl.fl_ops->fl_release_private(&fl); -+ filp->f_op->flock(filp, F_SETLKW, &flock); -+ if (flock.fl_ops && flock.fl_ops->fl_release_private) -+ flock.fl_ops->fl_release_private(&flock); - } - - lock_kernel(); -diff -urNp linux-2.6.31.1/fs/namei.c linux-2.6.31.1/fs/namei.c ---- linux-2.6.31.1/fs/namei.c 2009-09-24 11:45:25.000000000 -0400 -+++ 
linux-2.6.31.1/fs/namei.c 2009-10-01 20:12:44.000000000 -0400 -@@ -631,7 +631,7 @@ static __always_inline int __do_follow_l - cookie = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(cookie); - if (!IS_ERR(cookie)) { -- char *s = nd_get_link(nd); -+ const char *s = nd_get_link(nd); - error = 0; - if (s) - error = __vfs_follow_link(nd, s); -@@ -662,6 +662,13 @@ static inline int do_follow_link(struct - err = security_inode_follow_link(path->dentry, nd); - if (err) - goto loop; -+ -+ if (gr_handle_follow_link(path->dentry->d_parent->d_inode, -+ path->dentry->d_inode, path->dentry, nd->path.mnt)) { -+ err = -EACCES; -+ goto loop; -+ } -+ - current->link_count++; - current->total_link_count++; - nd->depth++; -@@ -1005,11 +1012,18 @@ return_reval: - break; - } - return_base: -+ if (!gr_acl_handle_hidden_file(nd->path.dentry, nd->path.mnt)) { -+ path_put(&nd->path); -+ return -ENOENT; -+ } - return 0; - out_dput: - path_put_conditional(&next, nd); - break; - } -+ if (!gr_acl_handle_hidden_file(nd->path.dentry, nd->path.mnt)) -+ err = -ENOENT; -+ - path_put(&nd->path); - return_err: - return err; -@@ -1608,12 +1622,19 @@ static int __open_namei_create(struct na - int error; - struct dentry *dir = nd->path.dentry; - -+ if (!gr_acl_handle_creat(path->dentry, nd->path.dentry, nd->path.mnt, flag, mode)) { -+ error = -EACCES; -+ goto out_unlock; -+ } -+ - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&nd->path, path->dentry, mode, 0); - if (error) - goto out_unlock; - error = vfs_create(dir->d_inode, path->dentry, mode, nd); -+ if (!error) -+ gr_handle_create(path->dentry, nd->path.mnt); - out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); - dput(nd->path.dentry); -@@ -1696,6 +1717,17 @@ struct file *do_filp_open(int dfd, const - &nd, flag); - if (error) - return ERR_PTR(error); -+ -+ if (gr_handle_rawio(nd.path.dentry->d_inode)) { -+ error = -EPERM; -+ goto exit; -+ } -+ -+ if (!gr_acl_handle_open(nd.path.dentry, nd.path.mnt, flag)) { -+ error = -EACCES; -+ goto exit; -+ } -+ - goto ok; - } - -@@ -1782,6 +1814,20 @@ do_last: - /* - * It already exists. - */ -+ -+ if (gr_handle_rawio(path.dentry->d_inode)) { -+ error = -EPERM; -+ goto exit_mutex_unlock; -+ } -+ if (!gr_acl_handle_open(path.dentry, nd.path.mnt, flag)) { -+ error = -EACCES; -+ goto exit_mutex_unlock; -+ } -+ if (gr_handle_fifo(path.dentry, nd.path.mnt, dir, flag, acc_mode)) { -+ error = -EACCES; -+ goto exit_mutex_unlock; -+ } -+ - mutex_unlock(&dir->d_inode->i_mutex); - audit_inode(pathname, path.dentry); - -@@ -1874,6 +1920,13 @@ do_link: - error = security_inode_follow_link(path.dentry, &nd); - if (error) - goto exit_dput; -+ -+ if (gr_handle_follow_link(path.dentry->d_parent->d_inode, path.dentry->d_inode, -+ path.dentry, nd.path.mnt)) { -+ error = -EACCES; -+ goto exit_dput; -+ } -+ - error = __do_follow_link(&path, &nd); - if (error) { - /* Does someone understand code flow here? 
Or it is only -@@ -2048,6 +2101,17 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const - error = may_mknod(mode); - if (error) - goto out_dput; -+ -+ if (gr_handle_chroot_mknod(dentry, nd.path.mnt, mode)) { -+ error = -EPERM; -+ goto out_dput; -+ } -+ -+ if (!gr_acl_handle_mknod(dentry, nd.path.dentry, nd.path.mnt, mode)) { -+ error = -EACCES; -+ goto out_dput; -+ } -+ - error = mnt_want_write(nd.path.mnt); - if (error) - goto out_dput; -@@ -2068,6 +2132,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const - } - out_drop_write: - mnt_drop_write(nd.path.mnt); -+ -+ if (!error) -+ gr_handle_create(dentry, nd.path.mnt); - out_dput: - dput(dentry); - out_unlock: -@@ -2121,6 +2188,11 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const - if (IS_ERR(dentry)) - goto out_unlock; - -+ if (!gr_acl_handle_mkdir(dentry, nd.path.dentry, nd.path.mnt)) { -+ error = -EACCES; -+ goto out_dput; -+ } -+ - if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current_umask(); - error = mnt_want_write(nd.path.mnt); -@@ -2132,6 +2204,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const - error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); - out_drop_write: - mnt_drop_write(nd.path.mnt); -+ -+ if (!error) -+ gr_handle_create(dentry, nd.path.mnt); -+ - out_dput: - dput(dentry); - out_unlock: -@@ -2213,6 +2289,8 @@ static long do_rmdir(int dfd, const char - char * name; - struct dentry *dentry; - struct nameidata nd; -+ ino_t saved_ino = 0; -+ dev_t saved_dev = 0; - - error = user_path_parent(dfd, pathname, &nd, &name); - if (error) -@@ -2237,6 +2315,19 @@ static long do_rmdir(int dfd, const char - error = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto exit2; -+ -+ if (dentry->d_inode != NULL) { -+ if (dentry->d_inode->i_nlink <= 1) { -+ saved_ino = dentry->d_inode->i_ino; -+ saved_dev = dentry->d_inode->i_sb->s_dev; -+ } -+ -+ if (!gr_acl_handle_rmdir(dentry, nd.path.mnt)) { -+ error = -EACCES; -+ goto exit3; -+ } -+ } -+ - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit3; -@@ -2244,6 +2335,8 @@ static long do_rmdir(int dfd, const char - if (error) - goto exit4; - error = vfs_rmdir(nd.path.dentry->d_inode, dentry); -+ if (!error && (saved_dev || saved_ino)) -+ gr_handle_delete(saved_ino, saved_dev); - exit4: - mnt_drop_write(nd.path.mnt); - exit3: -@@ -2305,6 +2398,8 @@ static long do_unlinkat(int dfd, const c - struct dentry *dentry; - struct nameidata nd; - struct inode *inode = NULL; -+ ino_t saved_ino = 0; -+ dev_t saved_dev = 0; - - error = user_path_parent(dfd, pathname, &nd, &name); - if (error) -@@ -2324,8 +2419,19 @@ static long do_unlinkat(int dfd, const c - if (nd.last.name[nd.last.len]) - goto slashes; - inode = dentry->d_inode; -- if (inode) -+ if (inode) { -+ if (inode->i_nlink <= 1) { -+ saved_ino = inode->i_ino; -+ saved_dev = inode->i_sb->s_dev; -+ } -+ - atomic_inc(&inode->i_count); -+ -+ if (!gr_acl_handle_unlink(dentry, nd.path.mnt)) { -+ error = -EACCES; -+ goto exit2; -+ } -+ } - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit2; -@@ -2333,6 +2439,8 @@ static long do_unlinkat(int dfd, const c - if (error) - goto exit3; - error = vfs_unlink(nd.path.dentry->d_inode, dentry); -+ if (!error && (saved_ino || saved_dev)) -+ gr_handle_delete(saved_ino, saved_dev); - exit3: - mnt_drop_write(nd.path.mnt); - exit2: -@@ -2411,6 +2519,11 @@ SYSCALL_DEFINE3(symlinkat, const char __ - if (IS_ERR(dentry)) - goto out_unlock; - -+ if (!gr_acl_handle_symlink(dentry, nd.path.dentry, nd.path.mnt, from)) { -+ error = -EACCES; -+ goto out_dput; -+ } -+ - error = mnt_want_write(nd.path.mnt); - if (error) - goto 
out_dput; -@@ -2418,6 +2531,8 @@ SYSCALL_DEFINE3(symlinkat, const char __ - if (error) - goto out_drop_write; - error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); -+ if (!error) -+ gr_handle_create(dentry, nd.path.mnt); - out_drop_write: - mnt_drop_write(nd.path.mnt); - out_dput: -@@ -2511,6 +2626,20 @@ SYSCALL_DEFINE5(linkat, int, olddfd, con - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto out_unlock; -+ -+ if (gr_handle_hardlink(old_path.dentry, old_path.mnt, -+ old_path.dentry->d_inode, -+ old_path.dentry->d_inode->i_mode, to)) { -+ error = -EACCES; -+ goto out_dput; -+ } -+ -+ if (!gr_acl_handle_link(new_dentry, nd.path.dentry, nd.path.mnt, -+ old_path.dentry, old_path.mnt, to)) { -+ error = -EACCES; -+ goto out_dput; -+ } -+ - error = mnt_want_write(nd.path.mnt); - if (error) - goto out_dput; -@@ -2518,6 +2647,8 @@ SYSCALL_DEFINE5(linkat, int, olddfd, con - if (error) - goto out_drop_write; - error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); -+ if (!error) -+ gr_handle_create(new_dentry, nd.path.mnt); - out_drop_write: - mnt_drop_write(nd.path.mnt); - out_dput: -@@ -2751,6 +2882,12 @@ SYSCALL_DEFINE4(renameat, int, olddfd, c - if (new_dentry == trap) - goto exit5; - -+ error = gr_acl_handle_rename(new_dentry, new_dir, newnd.path.mnt, -+ old_dentry, old_dir->d_inode, oldnd.path.mnt, -+ to); -+ if (error) -+ goto exit5; -+ - error = mnt_want_write(oldnd.path.mnt); - if (error) - goto exit5; -@@ -2760,6 +2897,9 @@ SYSCALL_DEFINE4(renameat, int, olddfd, c - goto exit6; - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); -+ if (!error) -+ gr_handle_rename(old_dir->d_inode, new_dir->d_inode, old_dentry, -+ new_dentry, oldnd.path.mnt, new_dentry->d_inode ? 1 : 0); - exit6: - mnt_drop_write(oldnd.path.mnt); - exit5: -diff -urNp linux-2.6.31.1/fs/namespace.c linux-2.6.31.1/fs/namespace.c ---- linux-2.6.31.1/fs/namespace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/namespace.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1083,6 +1083,9 @@ static int do_umount(struct vfsmount *mn - if (!(sb->s_flags & MS_RDONLY)) - retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); - up_write(&sb->s_umount); -+ -+ gr_log_remount(mnt->mnt_devname, retval); -+ - return retval; - } - -@@ -1104,6 +1107,9 @@ static int do_umount(struct vfsmount *mn - security_sb_umount_busy(mnt); - up_write(&namespace_sem); - release_mounts(&umount_list); -+ -+ gr_log_unmount(mnt->mnt_devname, retval); -+ - return retval; - } - -@@ -1940,6 +1946,11 @@ long do_mount(char *dev_name, char *dir_ - if (retval) - goto dput_out; - -+ if (gr_handle_chroot_mount(path.dentry, path.mnt, dev_name)) { -+ retval = -EPERM; -+ goto dput_out; -+ } -+ - if (flags & MS_REMOUNT) - retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, - data_page); -@@ -1954,6 +1965,9 @@ long do_mount(char *dev_name, char *dir_ - dev_name, data_page); - dput_out: - path_put(&path); -+ -+ gr_log_mount(dev_name, dir_name, retval); -+ - return retval; - } - -@@ -2158,6 +2172,12 @@ SYSCALL_DEFINE2(pivot_root, const char _ - goto out1; - } - -+ if (gr_handle_chroot_pivot()) { -+ error = -EPERM; -+ path_put(&old); -+ goto out1; -+ } -+ - read_lock(&current->fs->lock); - root = current->fs->root; - path_get(&current->fs->root); -diff -urNp linux-2.6.31.1/fs/nfs/client.c linux-2.6.31.1/fs/nfs/client.c ---- linux-2.6.31.1/fs/nfs/client.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nfs/client.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1533,7 +1533,7 @@ static void
*nfs_server_list_next(struct - static void nfs_server_list_stop(struct seq_file *p, void *v); - static int nfs_server_list_show(struct seq_file *m, void *v); - --static struct seq_operations nfs_server_list_ops = { -+static const struct seq_operations nfs_server_list_ops = { - .start = nfs_server_list_start, - .next = nfs_server_list_next, - .stop = nfs_server_list_stop, -@@ -1554,7 +1554,7 @@ static void *nfs_volume_list_next(struct - static void nfs_volume_list_stop(struct seq_file *p, void *v); - static int nfs_volume_list_show(struct seq_file *m, void *v); - --static struct seq_operations nfs_volume_list_ops = { -+static const struct seq_operations nfs_volume_list_ops = { - .start = nfs_volume_list_start, - .next = nfs_volume_list_next, - .stop = nfs_volume_list_stop, -diff -urNp linux-2.6.31.1/fs/nfs/file.c linux-2.6.31.1/fs/nfs/file.c ---- linux-2.6.31.1/fs/nfs/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nfs/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -59,7 +59,7 @@ static int nfs_lock(struct file *filp, i - static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); - static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); - --static struct vm_operations_struct nfs_file_vm_ops; -+static const struct vm_operations_struct nfs_file_vm_ops; - - const struct file_operations nfs_file_operations = { - .llseek = nfs_file_llseek, -@@ -526,7 +526,7 @@ out_unlock: - return VM_FAULT_SIGBUS; - } - --static struct vm_operations_struct nfs_file_vm_ops = { -+static const struct vm_operations_struct nfs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = nfs_vm_page_mkwrite, - }; -diff -urNp linux-2.6.31.1/fs/nfs/nfs4proc.c linux-2.6.31.1/fs/nfs/nfs4proc.c ---- linux-2.6.31.1/fs/nfs/nfs4proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nfs/nfs4proc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1123,7 +1123,7 @@ static int _nfs4_do_open_reclaim(struct - static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) - { - struct nfs_server *server = NFS_SERVER(state->inode); -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = _nfs4_do_open_reclaim(ctx, state); -@@ -1165,7 +1165,7 @@ static int _nfs4_open_delegation_recall( - - int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - struct nfs_server *server = NFS_SERVER(state->inode); - int err; - do { -@@ -1481,7 +1481,7 @@ static int _nfs4_open_expired(struct nfs - static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) - { - struct nfs_server *server = NFS_SERVER(state->inode); -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - - do { -@@ -1579,7 +1579,7 @@ out_err: - - static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - struct nfs4_state *res; - int status; - -@@ -1670,7 +1670,7 @@ static int nfs4_do_setattr(struct inode - struct nfs4_state *state) - { - struct nfs_server *server = NFS_SERVER(inode); -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(server, -@@ -2014,7 +2014,7 @@ 
static int _nfs4_server_capabilities(str - - int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(server, -@@ -2048,7 +2048,7 @@ static int _nfs4_lookup_root(struct nfs_ - static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(server, -@@ -2137,7 +2137,7 @@ static int _nfs4_proc_getattr(struct nfs - - static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(server, -@@ -2225,7 +2225,7 @@ static int nfs4_proc_lookupfh(struct nfs - struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); -@@ -2254,7 +2254,7 @@ static int _nfs4_proc_lookup(struct inod - - static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(dir), -@@ -2318,7 +2318,7 @@ static int _nfs4_proc_access(struct inod - - static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(inode), -@@ -2374,7 +2374,7 @@ static int _nfs4_proc_readlink(struct in - static int nfs4_proc_readlink(struct inode *inode, struct page *page, - unsigned int pgbase, unsigned int pglen) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(inode), -@@ -2472,7 +2472,7 @@ static int _nfs4_proc_remove(struct inod - - static int nfs4_proc_remove(struct inode *dir, struct qstr *name) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(dir), -@@ -2546,7 +2546,7 @@ static int _nfs4_proc_rename(struct inod - static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(old_dir), -@@ -2593,7 +2593,7 @@ static int _nfs4_proc_link(struct inode - - static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(inode), -@@ -2685,7 +2685,7 @@ out: - static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(dir), -@@ -2716,7 +2716,7 @@ out: - static int nfs4_proc_mkdir(struct inode *dir, struct 
dentry *dentry, - struct iattr *sattr) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(dir), -@@ -2765,7 +2765,7 @@ static int _nfs4_proc_readdir(struct den - static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), -@@ -2813,7 +2813,7 @@ out: - static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, dev_t rdev) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(dir), -@@ -2845,7 +2845,7 @@ static int _nfs4_proc_statfs(struct nfs_ - - static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(server, -@@ -2876,7 +2876,7 @@ static int _nfs4_do_fsinfo(struct nfs_se - - static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - - do { -@@ -2922,7 +2922,7 @@ static int _nfs4_proc_pathconf(struct nf - static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_pathconf *pathconf) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - - do { -@@ -3224,7 +3224,7 @@ out_free: - - static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - ssize_t ret; - do { - ret = __nfs4_get_acl_uncached(inode, buf, buflen); -@@ -3280,7 +3280,7 @@ static int __nfs4_proc_set_acl(struct in - - static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(inode), -@@ -3545,7 +3545,7 @@ out: - int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) - { - struct nfs_server *server = NFS_SERVER(inode); -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - do { - err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); -@@ -3618,7 +3618,7 @@ out: - - static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - - do { -@@ -3992,7 +3992,7 @@ static int _nfs4_do_setlk(struct nfs4_st - static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) - { - struct nfs_server *server = NFS_SERVER(state->inode); -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - - do { -@@ -4010,7 +4010,7 @@ static int nfs4_lock_reclaim(struct nfs4 - static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) - { - struct nfs_server *server = NFS_SERVER(state->inode); -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - 
int err; - - err = nfs4_set_lock_state(state, request); -@@ -4065,7 +4065,7 @@ out: - - static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) - { -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - - do { -@@ -4125,7 +4125,7 @@ nfs4_proc_lock(struct file *filp, int cm - int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) - { - struct nfs_server *server = NFS_SERVER(state->inode); -- struct nfs4_exception exception = { }; -+ struct nfs4_exception exception = {0, 0}; - int err; - - err = nfs4_set_lock_state(state, fl); -diff -urNp linux-2.6.31.1/fs/nfsd/export.c linux-2.6.31.1/fs/nfsd/export.c ---- linux-2.6.31.1/fs/nfsd/export.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nfsd/export.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1505,7 +1505,7 @@ static int e_show(struct seq_file *m, vo - return svc_export_show(m, &svc_export_cache, cp); - } - --struct seq_operations nfs_exports_op = { -+const struct seq_operations nfs_exports_op = { - .start = e_start, - .next = e_next, - .stop = e_stop, -diff -urNp linux-2.6.31.1/fs/nfsd/nfsctl.c linux-2.6.31.1/fs/nfsd/nfsctl.c ---- linux-2.6.31.1/fs/nfsd/nfsctl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nfsd/nfsctl.c 2009-10-01 20:12:44.000000000 -0400 -@@ -174,7 +174,7 @@ static const struct file_operations expo - - extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); - --static struct file_operations pool_stats_operations = { -+static const struct file_operations pool_stats_operations = { - .open = nfsd_pool_stats_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/fs/nilfs2/btnode.c linux-2.6.31.1/fs/nilfs2/btnode.c ---- linux-2.6.31.1/fs/nilfs2/btnode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/btnode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -46,7 +46,7 @@ void nilfs_btnode_cache_init_once(struct - INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); - } - --static struct address_space_operations def_btnode_aops = { -+static const struct address_space_operations def_btnode_aops = { - .sync_page = block_sync_page, - }; - -diff -urNp linux-2.6.31.1/fs/nilfs2/dir.c linux-2.6.31.1/fs/nilfs2/dir.c ---- linux-2.6.31.1/fs/nilfs2/dir.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/dir.c 2009-10-01 20:12:44.000000000 -0400 -@@ -697,7 +697,7 @@ not_empty: - return 0; - } - --struct file_operations nilfs_dir_operations = { -+const struct file_operations nilfs_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = nilfs_readdir, -diff -urNp linux-2.6.31.1/fs/nilfs2/file.c linux-2.6.31.1/fs/nilfs2/file.c ---- linux-2.6.31.1/fs/nilfs2/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -117,7 +117,7 @@ static int nilfs_page_mkwrite(struct vm_ - return 0; - } - --struct vm_operations_struct nilfs_file_vm_ops = { -+const struct vm_operations_struct nilfs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = nilfs_page_mkwrite, - }; -@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file * - * We have mostly NULL's here: the current defaults are ok for - * the nilfs filesystem. 
- */ --struct file_operations nilfs_file_operations = { -+const struct file_operations nilfs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, -@@ -151,7 +151,7 @@ struct file_operations nilfs_file_operat - .splice_read = generic_file_splice_read, - }; - --struct inode_operations nilfs_file_inode_operations = { -+const struct inode_operations nilfs_file_inode_operations = { - .truncate = nilfs_truncate, - .setattr = nilfs_setattr, - .permission = nilfs_permission, -diff -urNp linux-2.6.31.1/fs/nilfs2/gcinode.c linux-2.6.31.1/fs/nilfs2/gcinode.c ---- linux-2.6.31.1/fs/nilfs2/gcinode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/gcinode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -52,7 +52,7 @@ - #include "dat.h" - #include "ifile.h" - --static struct address_space_operations def_gcinode_aops = { -+static const struct address_space_operations def_gcinode_aops = { - .sync_page = block_sync_page, - }; - -diff -urNp linux-2.6.31.1/fs/nilfs2/inode.c linux-2.6.31.1/fs/nilfs2/inode.c ---- linux-2.6.31.1/fs/nilfs2/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *io - return size; - } - --struct address_space_operations nilfs_aops = { -+const struct address_space_operations nilfs_aops = { - .writepage = nilfs_writepage, - .readpage = nilfs_readpage, - .sync_page = block_sync_page, -diff -urNp linux-2.6.31.1/fs/nilfs2/mdt.c linux-2.6.31.1/fs/nilfs2/mdt.c ---- linux-2.6.31.1/fs/nilfs2/mdt.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/mdt.c 2009-10-01 20:12:44.000000000 -0400 -@@ -430,7 +430,7 @@ nilfs_mdt_write_page(struct page *page, - } - - --static struct address_space_operations def_mdt_aops = { -+static const struct address_space_operations def_mdt_aops = { - .writepage = nilfs_mdt_write_page, - .sync_page = block_sync_page, - }; -diff -urNp linux-2.6.31.1/fs/nilfs2/namei.c linux-2.6.31.1/fs/nilfs2/namei.c ---- linux-2.6.31.1/fs/nilfs2/namei.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/namei.c 2009-10-01 20:12:44.000000000 -0400 -@@ -448,7 +448,7 @@ out: - return err; - } - --struct inode_operations nilfs_dir_inode_operations = { -+const struct inode_operations nilfs_dir_inode_operations = { - .create = nilfs_create, - .lookup = nilfs_lookup, - .link = nilfs_link, -@@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_ - .permission = nilfs_permission, - }; - --struct inode_operations nilfs_special_inode_operations = { -+const struct inode_operations nilfs_special_inode_operations = { - .setattr = nilfs_setattr, - .permission = nilfs_permission, - }; - --struct inode_operations nilfs_symlink_inode_operations = { -+const struct inode_operations nilfs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -diff -urNp linux-2.6.31.1/fs/nilfs2/nilfs.h linux-2.6.31.1/fs/nilfs2/nilfs.h ---- linux-2.6.31.1/fs/nilfs2/nilfs.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/nilfs.h 2009-10-01 20:12:44.000000000 -0400 -@@ -294,13 +294,13 @@ void nilfs_clear_gcdat_inode(struct the_ - /* - * Inodes and files operations - */ --extern struct file_operations nilfs_dir_operations; --extern struct inode_operations nilfs_file_inode_operations; --extern struct file_operations nilfs_file_operations; --extern struct address_space_operations nilfs_aops; --extern struct 
inode_operations nilfs_dir_inode_operations; --extern struct inode_operations nilfs_special_inode_operations; --extern struct inode_operations nilfs_symlink_inode_operations; -+extern const struct file_operations nilfs_dir_operations; -+extern const struct inode_operations nilfs_file_inode_operations; -+extern const struct file_operations nilfs_file_operations; -+extern const struct address_space_operations nilfs_aops; -+extern const struct inode_operations nilfs_dir_inode_operations; -+extern const struct inode_operations nilfs_special_inode_operations; -+extern const struct inode_operations nilfs_symlink_inode_operations; - - /* - * filesystem type -diff -urNp linux-2.6.31.1/fs/nilfs2/super.c linux-2.6.31.1/fs/nilfs2/super.c ---- linux-2.6.31.1/fs/nilfs2/super.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nilfs2/super.c 2009-10-01 20:12:44.000000000 -0400 -@@ -529,7 +529,7 @@ static int nilfs_statfs(struct dentry *d - return 0; - } - --static struct super_operations nilfs_sops = { -+static const struct super_operations nilfs_sops = { - .alloc_inode = nilfs_alloc_inode, - .destroy_inode = nilfs_destroy_inode, - .dirty_inode = nilfs_dirty_inode, -diff -urNp linux-2.6.31.1/fs/nls/nls_base.c linux-2.6.31.1/fs/nls/nls_base.c ---- linux-2.6.31.1/fs/nls/nls_base.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/nls/nls_base.c 2009-10-01 20:12:44.000000000 -0400 -@@ -41,7 +41,7 @@ static const struct utf8_table utf8_tabl - {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, - {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, - {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, -- {0, /* end of table */} -+ {0, 0, 0, 0, 0, /* end of table */} - }; - - #define UNICODE_MAX 0x0010ffff -diff -urNp linux-2.6.31.1/fs/ntfs/file.c linux-2.6.31.1/fs/ntfs/file.c ---- linux-2.6.31.1/fs/ntfs/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ntfs/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -2291,6 +2291,6 @@ const struct inode_operations ntfs_file_ - #endif /* NTFS_RW */ - }; - --const struct file_operations ntfs_empty_file_ops = {}; -+const struct file_operations ntfs_empty_file_ops; - --const struct inode_operations ntfs_empty_inode_ops = {}; -+const struct inode_operations ntfs_empty_inode_ops; -diff -urNp linux-2.6.31.1/fs/ocfs2/cluster/heartbeat.c linux-2.6.31.1/fs/ocfs2/cluster/heartbeat.c ---- linux-2.6.31.1/fs/ocfs2/cluster/heartbeat.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/cluster/heartbeat.c 2009-10-01 20:12:44.000000000 -0400 -@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct fi - } - #endif /* CONFIG_DEBUG_FS */ - --static struct file_operations o2hb_debug_fops = { -+static const struct file_operations o2hb_debug_fops = { - .open = o2hb_debug_open, - .release = o2hb_debug_release, - .read = o2hb_debug_read, -diff -urNp linux-2.6.31.1/fs/ocfs2/cluster/netdebug.c linux-2.6.31.1/fs/ocfs2/cluster/netdebug.c ---- linux-2.6.31.1/fs/ocfs2/cluster/netdebug.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/cluster/netdebug.c 2009-10-01 20:12:44.000000000 -0400 -@@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file - { - } - --static struct seq_operations nst_seq_ops = { -+static const struct seq_operations nst_seq_ops = { - .start = nst_seq_start, - .next = nst_seq_next, - .stop = nst_seq_stop, -@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode - return seq_release_private(inode, file); - } - --static struct file_operations nst_seq_fops = { -+static const 
struct file_operations nst_seq_fops = { - .open = nst_fop_open, - .read = seq_read, - .llseek = seq_lseek, -@@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file - { - } - --static struct seq_operations sc_seq_ops = { -+static const struct seq_operations sc_seq_ops = { - .start = sc_seq_start, - .next = sc_seq_next, - .stop = sc_seq_stop, -@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode * - return seq_release_private(inode, file); - } - --static struct file_operations sc_seq_fops = { -+static const struct file_operations sc_seq_fops = { - .open = sc_fop_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/fs/ocfs2/dlm/dlmdebug.c linux-2.6.31.1/fs/ocfs2/dlm/dlmdebug.c ---- linux-2.6.31.1/fs/ocfs2/dlm/dlmdebug.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/dlm/dlmdebug.c 2009-10-01 20:12:44.000000000 -0400 -@@ -479,7 +479,7 @@ bail: - return -ENOMEM; - } - --static struct file_operations debug_purgelist_fops = { -+static const struct file_operations debug_purgelist_fops = { - .open = debug_purgelist_open, - .release = debug_buffer_release, - .read = debug_buffer_read, -@@ -539,7 +539,7 @@ bail: - return -ENOMEM; - } - --static struct file_operations debug_mle_fops = { -+static const struct file_operations debug_mle_fops = { - .open = debug_mle_open, - .release = debug_buffer_release, - .read = debug_buffer_read, -@@ -683,7 +683,7 @@ static int lockres_seq_show(struct seq_f - return 0; - } - --static struct seq_operations debug_lockres_ops = { -+static const struct seq_operations debug_lockres_ops = { - .start = lockres_seq_start, - .stop = lockres_seq_stop, - .next = lockres_seq_next, -@@ -742,7 +742,7 @@ static int debug_lockres_release(struct - return seq_release_private(inode, file); - } - --static struct file_operations debug_lockres_fops = { -+static const struct file_operations debug_lockres_fops = { - .open = debug_lockres_open, - .release = debug_lockres_release, - .read = seq_read, -@@ -926,7 +926,7 @@ bail: - return -ENOMEM; - } - --static struct file_operations debug_state_fops = { -+static const struct file_operations debug_state_fops = { - .open = debug_state_open, - .release = debug_buffer_release, - .read = debug_buffer_read, -diff -urNp linux-2.6.31.1/fs/ocfs2/localalloc.c linux-2.6.31.1/fs/ocfs2/localalloc.c ---- linux-2.6.31.1/fs/ocfs2/localalloc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/localalloc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1186,7 +1186,7 @@ static int ocfs2_local_alloc_slide_windo - goto bail; - } - -- atomic_inc(&osb->alloc_stats.moves); -+ atomic_inc_unchecked(&osb->alloc_stats.moves); - - status = 0; - bail: -diff -urNp linux-2.6.31.1/fs/ocfs2/mmap.c linux-2.6.31.1/fs/ocfs2/mmap.c ---- linux-2.6.31.1/fs/ocfs2/mmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/mmap.c 2009-10-01 20:12:44.000000000 -0400 -@@ -202,7 +202,7 @@ out: - return ret; - } - --static struct vm_operations_struct ocfs2_file_vm_ops = { -+static const struct vm_operations_struct ocfs2_file_vm_ops = { - .fault = ocfs2_fault, - .page_mkwrite = ocfs2_page_mkwrite, - }; -diff -urNp linux-2.6.31.1/fs/ocfs2/ocfs2.h linux-2.6.31.1/fs/ocfs2/ocfs2.h ---- linux-2.6.31.1/fs/ocfs2/ocfs2.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/ocfs2.h 2009-10-01 20:12:44.000000000 -0400 -@@ -191,11 +191,11 @@ enum ocfs2_vol_state - - struct ocfs2_alloc_stats - { -- atomic_t moves; -- atomic_t local_data; -- atomic_t bitmap_data; -- atomic_t bg_allocs; -- atomic_t bg_extends; -+ 
atomic_unchecked_t moves; -+ atomic_unchecked_t local_data; -+ atomic_unchecked_t bitmap_data; -+ atomic_unchecked_t bg_allocs; -+ atomic_unchecked_t bg_extends; - }; - - enum ocfs2_local_alloc_state -diff -urNp linux-2.6.31.1/fs/ocfs2/suballoc.c linux-2.6.31.1/fs/ocfs2/suballoc.c ---- linux-2.6.31.1/fs/ocfs2/suballoc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/suballoc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -620,7 +620,7 @@ static int ocfs2_reserve_suballoc_bits(s - mlog_errno(status); - goto bail; - } -- atomic_inc(&osb->alloc_stats.bg_extends); -+ atomic_inc_unchecked(&osb->alloc_stats.bg_extends); - - /* You should never ask for this much metadata */ - BUG_ON(bits_wanted > -@@ -1650,7 +1650,7 @@ int ocfs2_claim_metadata(struct ocfs2_su - mlog_errno(status); - goto bail; - } -- atomic_inc(&osb->alloc_stats.bg_allocs); -+ atomic_inc_unchecked(&osb->alloc_stats.bg_allocs); - - *blkno_start = bg_blkno + (u64) *suballoc_bit_start; - ac->ac_bits_given += (*num_bits); -@@ -1724,7 +1724,7 @@ int ocfs2_claim_new_inode(struct ocfs2_s - mlog_errno(status); - goto bail; - } -- atomic_inc(&osb->alloc_stats.bg_allocs); -+ atomic_inc_unchecked(&osb->alloc_stats.bg_allocs); - - BUG_ON(num_bits != 1); - -@@ -1826,7 +1826,7 @@ int __ocfs2_claim_clusters(struct ocfs2_ - cluster_start, - num_clusters); - if (!status) -- atomic_inc(&osb->alloc_stats.local_data); -+ atomic_inc_unchecked(&osb->alloc_stats.local_data); - } else { - if (min_clusters > (osb->bitmap_cpg - 1)) { - /* The only paths asking for contiguousness -@@ -1854,7 +1854,7 @@ int __ocfs2_claim_clusters(struct ocfs2_ - ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, - bg_blkno, - bg_bit_off); -- atomic_inc(&osb->alloc_stats.bitmap_data); -+ atomic_inc_unchecked(&osb->alloc_stats.bitmap_data); - } - } - if (status < 0) { -diff -urNp linux-2.6.31.1/fs/ocfs2/super.c linux-2.6.31.1/fs/ocfs2/super.c ---- linux-2.6.31.1/fs/ocfs2/super.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ocfs2/super.c 2009-10-01 20:12:44.000000000 -0400 -@@ -284,11 +284,11 @@ static int ocfs2_osb_dump(struct ocfs2_s - "%10s => GlobalAllocs: %d LocalAllocs: %d " - "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", - "Stats", -- atomic_read(&osb->alloc_stats.bitmap_data), -- atomic_read(&osb->alloc_stats.local_data), -- atomic_read(&osb->alloc_stats.bg_allocs), -- atomic_read(&osb->alloc_stats.moves), -- atomic_read(&osb->alloc_stats.bg_extends)); -+ atomic_read_unchecked(&osb->alloc_stats.bitmap_data), -+ atomic_read_unchecked(&osb->alloc_stats.local_data), -+ atomic_read_unchecked(&osb->alloc_stats.bg_allocs), -+ atomic_read_unchecked(&osb->alloc_stats.moves), -+ atomic_read_unchecked(&osb->alloc_stats.bg_extends)); - - out += snprintf(buf + out, len - out, - "%10s => State: %u Descriptor: %llu Size: %u bits " -@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct f - } - #endif /* CONFIG_DEBUG_FS */ - --static struct file_operations ocfs2_osb_debug_fops = { -+static const struct file_operations ocfs2_osb_debug_fops = { - .open = ocfs2_osb_debug_open, - .release = ocfs2_debug_release, - .read = ocfs2_debug_read, -@@ -1991,11 +1991,11 @@ static int ocfs2_initialize_super(struct - spin_lock_init(&osb->osb_xattr_lock); - ocfs2_init_inode_steal_slot(osb); - -- atomic_set(&osb->alloc_stats.moves, 0); -- atomic_set(&osb->alloc_stats.local_data, 0); -- atomic_set(&osb->alloc_stats.bitmap_data, 0); -- atomic_set(&osb->alloc_stats.bg_allocs, 0); -- atomic_set(&osb->alloc_stats.bg_extends, 0); -+ atomic_set_unchecked(&osb->alloc_stats.moves, 
0); -+ atomic_set_unchecked(&osb->alloc_stats.local_data, 0); -+ atomic_set_unchecked(&osb->alloc_stats.bitmap_data, 0); -+ atomic_set_unchecked(&osb->alloc_stats.bg_allocs, 0); -+ atomic_set_unchecked(&osb->alloc_stats.bg_extends, 0); - - /* Copy the blockcheck stats from the superblock probe */ - osb->osb_ecc_stats = *stats; -diff -urNp linux-2.6.31.1/fs/omfs/dir.c linux-2.6.31.1/fs/omfs/dir.c ---- linux-2.6.31.1/fs/omfs/dir.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/omfs/dir.c 2009-10-01 20:12:44.000000000 -0400 -@@ -489,7 +489,7 @@ out: - return ret; - } - --struct inode_operations omfs_dir_inops = { -+const struct inode_operations omfs_dir_inops = { - .lookup = omfs_lookup, - .mkdir = omfs_mkdir, - .rename = omfs_rename, -@@ -498,7 +498,7 @@ struct inode_operations omfs_dir_inops = - .rmdir = omfs_rmdir, - }; - --struct file_operations omfs_dir_operations = { -+const struct file_operations omfs_dir_operations = { - .read = generic_read_dir, - .readdir = omfs_readdir, - .llseek = generic_file_llseek, -diff -urNp linux-2.6.31.1/fs/omfs/file.c linux-2.6.31.1/fs/omfs/file.c ---- linux-2.6.31.1/fs/omfs/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/omfs/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address - return generic_block_bmap(mapping, block, omfs_get_block); - } - --struct file_operations omfs_file_operations = { -+const struct file_operations omfs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, -@@ -333,11 +333,11 @@ struct file_operations omfs_file_operati - .splice_read = generic_file_splice_read, - }; - --struct inode_operations omfs_file_inops = { -+const struct inode_operations omfs_file_inops = { - .truncate = omfs_truncate - }; - --struct address_space_operations omfs_aops = { -+const struct address_space_operations omfs_aops = { - .readpage = omfs_readpage, - .readpages = omfs_readpages, - .writepage = omfs_writepage, -diff -urNp linux-2.6.31.1/fs/omfs/inode.c linux-2.6.31.1/fs/omfs/inode.c ---- linux-2.6.31.1/fs/omfs/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/omfs/inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *de - return 0; - } - --static struct super_operations omfs_sops = { -+static const struct super_operations omfs_sops = { - .write_inode = omfs_write_inode, - .delete_inode = omfs_delete_inode, - .put_super = omfs_put_super, -diff -urNp linux-2.6.31.1/fs/omfs/omfs.h linux-2.6.31.1/fs/omfs/omfs.h ---- linux-2.6.31.1/fs/omfs/omfs.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/omfs/omfs.h 2009-10-01 20:12:44.000000000 -0400 -@@ -44,16 +44,16 @@ extern int omfs_allocate_range(struct su - extern int omfs_clear_range(struct super_block *sb, u64 block, int count); - - /* dir.c */ --extern struct file_operations omfs_dir_operations; --extern struct inode_operations omfs_dir_inops; -+extern const struct file_operations omfs_dir_operations; -+extern const struct inode_operations omfs_dir_inops; - extern int omfs_make_empty(struct inode *inode, struct super_block *sb); - extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, - u64 fsblock); - - /* file.c */ --extern struct file_operations omfs_file_operations; --extern struct inode_operations omfs_file_inops; --extern struct address_space_operations omfs_aops; -+extern const struct file_operations omfs_file_operations; -+extern const struct inode_operations omfs_file_inops; 
-+extern const struct address_space_operations omfs_aops; - extern void omfs_make_empty_table(struct buffer_head *bh, int offset); - extern int omfs_shrink_inode(struct inode *inode); - -diff -urNp linux-2.6.31.1/fs/open.c linux-2.6.31.1/fs/open.c ---- linux-2.6.31.1/fs/open.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/open.c 2009-10-01 20:12:44.000000000 -0400 -@@ -206,6 +206,9 @@ int do_truncate(struct dentry *dentry, l - if (length < 0) - return -EINVAL; - -+ if (filp && !gr_acl_handle_truncate(dentry, filp->f_path.mnt)) -+ return -EACCES; -+ - newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | time_attrs; - if (filp) { -@@ -510,6 +513,9 @@ SYSCALL_DEFINE3(faccessat, int, dfd, con - if (__mnt_is_readonly(path.mnt)) - res = -EROFS; - -+ if (!res && !gr_acl_handle_access(path.dentry, path.mnt, mode)) -+ res = -EACCES; -+ - out_path_release: - path_put(&path); - out: -@@ -536,6 +542,8 @@ SYSCALL_DEFINE1(chdir, const char __user - if (error) - goto dput_and_out; - -+ gr_log_chdir(path.dentry, path.mnt); -+ - set_fs_pwd(current->fs, &path); - - dput_and_out: -@@ -562,6 +570,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd - goto out_putf; - - error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); -+ -+ if (!error && !gr_chroot_fchdir(file->f_path.dentry, file->f_path.mnt)) -+ error = -EPERM; -+ -+ if (!error) -+ gr_log_chdir(file->f_path.dentry, file->f_path.mnt); -+ - if (!error) - set_fs_pwd(current->fs, &file->f_path); - out_putf: -@@ -587,7 +602,18 @@ SYSCALL_DEFINE1(chroot, const char __use - if (!capable(CAP_SYS_CHROOT)) - goto dput_and_out; - -+ if (gr_handle_chroot_chroot(path.dentry, path.mnt)) -+ goto dput_and_out; -+ -+ if (gr_handle_chroot_caps(&path)) { -+ error = -ENOMEM; -+ goto dput_and_out; -+ } -+ - set_fs_root(current->fs, &path); -+ -+ gr_handle_chroot_chdir(&path); -+ - error = 0; - dput_and_out: - path_put(&path); -@@ -615,13 +641,28 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd - err = mnt_want_write_file(file); - if (err) - goto out_putf; -+ -+ if (!gr_acl_handle_fchmod(dentry, file->f_path.mnt, mode)) { -+ err = -EACCES; -+ goto out_drop_write; -+ } -+ - mutex_lock(&inode->i_mutex); - if (mode == (mode_t) -1) - mode = inode->i_mode; -+ -+ if (gr_handle_chroot_chmod(dentry, file->f_path.mnt, mode)) { -+ err = -EPERM; -+ mutex_unlock(&inode->i_mutex); -+ goto out_drop_write; -+ } -+ - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); - mutex_unlock(&inode->i_mutex); -+ -+out_drop_write: - mnt_drop_write(file->f_path.mnt); - out_putf: - fput(file); -@@ -644,13 +685,28 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, cons - error = mnt_want_write(path.mnt); - if (error) - goto dput_and_out; -+ -+ if (!gr_acl_handle_chmod(path.dentry, path.mnt, mode)) { -+ error = -EACCES; -+ goto out_drop_write; -+ } -+ - mutex_lock(&inode->i_mutex); - if (mode == (mode_t) -1) - mode = inode->i_mode; -+ -+ if (gr_handle_chroot_chmod(path.dentry, path.mnt, mode)) { -+ error = -EACCES; -+ mutex_unlock(&inode->i_mutex); -+ goto out_drop_write; -+ } -+ - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path.dentry, &newattrs); - mutex_unlock(&inode->i_mutex); -+ -+out_drop_write: - mnt_drop_write(path.mnt); - dput_and_out: - path_put(&path); -@@ -663,12 +719,15 @@ SYSCALL_DEFINE2(chmod, const char __user - return sys_fchmodat(AT_FDCWD, filename, mode); - } - --static int 
chown_common(struct dentry * dentry, uid_t user, gid_t group) -+static int chown_common(struct dentry * dentry, uid_t user, gid_t group, struct vfsmount *mnt) - { - struct inode *inode = dentry->d_inode; - int error; - struct iattr newattrs; - -+ if (!gr_acl_handle_chown(dentry, mnt)) -+ return -EACCES; -+ - newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - newattrs.ia_valid |= ATTR_UID; -@@ -699,7 +758,7 @@ SYSCALL_DEFINE3(chown, const char __user - error = mnt_want_write(path.mnt); - if (error) - goto out_release; -- error = chown_common(path.dentry, user, group); -+ error = chown_common(path.dentry, user, group, path.mnt); - mnt_drop_write(path.mnt); - out_release: - path_put(&path); -@@ -724,7 +783,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, cons - error = mnt_want_write(path.mnt); - if (error) - goto out_release; -- error = chown_common(path.dentry, user, group); -+ error = chown_common(path.dentry, user, group, path.mnt); - mnt_drop_write(path.mnt); - out_release: - path_put(&path); -@@ -743,7 +802,7 @@ SYSCALL_DEFINE3(lchown, const char __use - error = mnt_want_write(path.mnt); - if (error) - goto out_release; -- error = chown_common(path.dentry, user, group); -+ error = chown_common(path.dentry, user, group, path.mnt); - mnt_drop_write(path.mnt); - out_release: - path_put(&path); -@@ -766,7 +825,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd - goto out_fput; - dentry = file->f_path.dentry; - audit_inode(NULL, dentry); -- error = chown_common(dentry, user, group); -+ error = chown_common(dentry, user, group, file->f_path.mnt); - mnt_drop_write(file->f_path.mnt); - out_fput: - fput(file); -diff -urNp linux-2.6.31.1/fs/pipe.c linux-2.6.31.1/fs/pipe.c ---- linux-2.6.31.1/fs/pipe.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/pipe.c 2009-10-01 20:12:44.000000000 -0400 -@@ -886,7 +886,7 @@ void free_pipe_info(struct inode *inode) - inode->i_pipe = NULL; - } - --static struct vfsmount *pipe_mnt __read_mostly; -+struct vfsmount *pipe_mnt __read_mostly; - static int pipefs_delete_dentry(struct dentry *dentry) - { - /* -diff -urNp linux-2.6.31.1/fs/proc/array.c linux-2.6.31.1/fs/proc/array.c ---- linux-2.6.31.1/fs/proc/array.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/array.c 2009-10-01 20:12:44.000000000 -0400 -@@ -321,6 +321,21 @@ static inline void task_context_switch_c - p->nivcsw); - } - -+#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) -+static inline void task_pax(struct seq_file *m, struct task_struct *p) -+{ -+ if (p->mm) -+ seq_printf(m, "PaX:\t%c%c%c%c%c\n", -+ p->mm->pax_flags & MF_PAX_PAGEEXEC ? 'P' : 'p', -+ p->mm->pax_flags & MF_PAX_EMUTRAMP ? 'E' : 'e', -+ p->mm->pax_flags & MF_PAX_MPROTECT ? 'M' : 'm', -+ p->mm->pax_flags & MF_PAX_RANDMMAP ? 'R' : 'r', -+ p->mm->pax_flags & MF_PAX_SEGMEXEC ? 
'S' : 's'); -+ else -+ seq_printf(m, "PaX:\t-----\n"); -+} -+#endif -+ - int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task) - { -@@ -340,9 +355,20 @@ int proc_pid_status(struct seq_file *m, - task_show_regs(m, task); - #endif - task_context_switch_counts(m, task); -+ -+#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) -+ task_pax(m, task); -+#endif -+ - return 0; - } - -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+#define PAX_RAND_FLAGS(_mm) (_mm != NULL && _mm != current->mm && \ -+ (_mm->pax_flags & MF_PAX_RANDMMAP || \ -+ _mm->pax_flags & MF_PAX_SEGMEXEC)) -+#endif -+ - static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task, int whole) - { -@@ -439,6 +465,19 @@ static int do_task_stat(struct seq_file - gtime = task_gtime(task); - } - -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ if (PAX_RAND_FLAGS(mm)) { -+ eip = 0; -+ esp = 0; -+ wchan = 0; -+ } -+#endif -+#ifdef CONFIG_GRKERNSEC_HIDESYM -+ wchan = 0; -+ eip =0; -+ esp =0; -+#endif -+ - /* scale priority and nice values from timeslices to -20..20 */ - /* to make it look like a "normal" Unix priority/nice value */ - priority = task_prio(task); -@@ -479,9 +518,15 @@ static int do_task_stat(struct seq_file - vsize, - mm ? get_mm_rss(mm) : 0, - rsslim, -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ PAX_RAND_FLAGS(mm) ? 1 : (mm ? mm->start_code : 0), -+ PAX_RAND_FLAGS(mm) ? 1 : (mm ? mm->end_code : 0), -+ PAX_RAND_FLAGS(mm) ? 0 : ((permitted && mm) ? mm->start_stack : 0), -+#else - mm ? mm->start_code : 0, - mm ? mm->end_code : 0, - (permitted && mm) ? mm->start_stack : 0, -+#endif - esp, - eip, - /* The signal information here is obsolete. -@@ -534,3 +579,10 @@ int proc_pid_statm(struct seq_file *m, s - - return 0; - } -+ -+#ifdef CONFIG_GRKERNSEC_PROC_IPADDR -+int proc_pid_ipaddr(struct task_struct *task, char *buffer) -+{ -+ return sprintf(buffer, "%u.%u.%u.%u\n", NIPQUAD(task->signal->curr_ip)); -+} -+#endif -diff -urNp linux-2.6.31.1/fs/proc/base.c linux-2.6.31.1/fs/proc/base.c ---- linux-2.6.31.1/fs/proc/base.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/base.c 2009-10-01 20:12:44.000000000 -0400 -@@ -213,6 +213,9 @@ static int check_mem_permission(struct t - if (task == current) - return 0; - -+ if (gr_handle_proc_ptrace(task) || gr_acl_handle_procpidmem(task)) -+ return -EPERM; -+ - /* - * If current is actively ptrace'ing, and would also be - * permitted to freshly attach with ptrace now, permit it. -@@ -260,6 +263,9 @@ static int proc_pid_cmdline(struct task_ - if (!mm->arg_end) - goto out_mm; /* Shh! 
No looking before we're done */ - -+ if (gr_acl_handle_procpidmem(task)) -+ goto out_mm; -+ - len = mm->arg_end - mm->arg_start; - - if (len > PAGE_SIZE) -@@ -287,12 +293,26 @@ out: - return res; - } - -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+#define PAX_RAND_FLAGS(_mm) (_mm != NULL && _mm != current->mm && \ -+ (_mm->pax_flags & MF_PAX_RANDMMAP || \ -+ _mm->pax_flags & MF_PAX_SEGMEXEC)) -+#endif -+ - static int proc_pid_auxv(struct task_struct *task, char *buffer) - { - int res = 0; - struct mm_struct *mm = get_task_mm(task); - if (mm) { - unsigned int nwords = 0; -+ -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ if (PAX_RAND_FLAGS(mm)) { -+ mmput(mm); -+ return res; -+ } -+#endif -+ - do { - nwords += 2; - } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ -@@ -328,7 +348,7 @@ static int proc_pid_wchan(struct task_st - } - #endif /* CONFIG_KALLSYMS */ - --#ifdef CONFIG_STACKTRACE -+#if defined(CONFIG_STACKTRACE) && !defined(CONFIG_GRKERNSEC_HIDESYM) - - #define MAX_STACK_TRACE_DEPTH 64 - -@@ -521,7 +541,7 @@ static int proc_pid_limits(struct task_s - return count; - } - --#ifdef CONFIG_HAVE_ARCH_TRACEHOOK -+#if defined(CONFIG_HAVE_ARCH_TRACEHOOK) && !defined(CONFIG_GRKERNSEC_PROC_MEMMAP) - static int proc_pid_syscall(struct task_struct *task, char *buffer) - { - long nr; -@@ -935,6 +955,9 @@ static ssize_t environ_read(struct file - if (!task) - goto out_no_task; - -+ if (gr_acl_handle_procpidmem(task)) -+ goto out; -+ - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - -@@ -1438,7 +1461,11 @@ static struct inode *proc_pid_make_inode - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; -+#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP -+ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; -+#else - inode->i_gid = cred->egid; -+#endif - rcu_read_unlock(); - } - security_task_to_inode(task, inode); -@@ -1456,6 +1483,9 @@ static int pid_getattr(struct vfsmount * - struct inode *inode = dentry->d_inode; - struct task_struct *task; - const struct cred *cred; -+#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ const struct cred *tmpcred = current_cred(); -+#endif - - generic_fillattr(inode, stat); - -@@ -1463,12 +1493,34 @@ static int pid_getattr(struct vfsmount * - stat->uid = 0; - stat->gid = 0; - task = pid_task(proc_pid(inode), PIDTYPE_PID); -+ -+ if (task && (gr_pid_is_chrooted(task) || gr_check_hidden_task(task))) { -+ rcu_read_unlock(); -+ return -ENOENT; -+ } -+ - if (task) { -+ cred = __task_cred(task); -+#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ if (!tmpcred->uid || (tmpcred->uid == cred->uid) -+#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP -+ || in_group_p(CONFIG_GRKERNSEC_PROC_GID) -+#endif -+ ) -+#endif - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ (inode->i_mode == (S_IFDIR|S_IRUSR|S_IXUSR)) || -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ (inode->i_mode == (S_IFDIR|S_IRUSR|S_IRGRP|S_IXUSR|S_IXGRP)) || -+#endif - task_dumpable(task)) { -- cred = __task_cred(task); - stat->uid = cred->euid; -+#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP -+ stat->gid = CONFIG_GRKERNSEC_PROC_GID; -+#else - stat->gid = cred->egid; -+#endif - } - } - rcu_read_unlock(); -@@ -1500,11 +1552,20 @@ static int pid_revalidate(struct dentry - - if (task) { - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ (inode->i_mode == (S_IFDIR|S_IRUSR|S_IXUSR)) || -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ (inode->i_mode == 
(S_IFDIR|S_IRUSR|S_IRGRP|S_IXUSR|S_IXGRP)) || -+#endif - task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; -+#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP -+ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; -+#else - inode->i_gid = cred->egid; -+#endif - rcu_read_unlock(); - } else { - inode->i_uid = 0; -@@ -1625,7 +1686,8 @@ static int proc_fd_info(struct inode *in - int fd = proc_fd(inode); - - if (task) { -- files = get_files_struct(task); -+ if (!gr_acl_handle_procpidmem(task)) -+ files = get_files_struct(task); - put_task_struct(task); - } - if (files) { -@@ -1877,12 +1939,22 @@ static const struct file_operations proc - static int proc_fd_permission(struct inode *inode, int mask) - { - int rv; -+ struct task_struct *task; - - rv = generic_permission(inode, mask, NULL); -- if (rv == 0) -- return 0; -+ - if (task_pid(current) == proc_pid(inode)) - rv = 0; -+ -+ task = get_proc_task(inode); -+ if (task == NULL) -+ return rv; -+ -+ if (gr_acl_handle_procpidmem(task)) -+ rv = -EACCES; -+ -+ put_task_struct(task); -+ - return rv; - } - -@@ -1991,6 +2063,9 @@ static struct dentry *proc_pident_lookup - if (!task) - goto out_no_task; - -+ if (gr_pid_is_chrooted(task) || gr_check_hidden_task(task)) -+ goto out; -+ - /* - * Yes, it does not scale. And it should not. Don't add - * new entries into /proc/<tgid>/ without very good reasons. -@@ -2035,6 +2110,9 @@ static int proc_pident_readdir(struct fi - if (!task) - goto out_no_task; - -+ if (gr_pid_is_chrooted(task) || gr_check_hidden_task(task)) -+ goto out; -+ - ret = 0; - i = filp->f_pos; - switch (i) { -@@ -2401,6 +2479,9 @@ static struct dentry *proc_base_lookup(s - if (p > last) - goto out; - -+ if (gr_pid_is_chrooted(task) || gr_check_hidden_task(task)) -+ goto out; -+ - error = proc_base_instantiate(dir, dentry, task, p); - - out: -@@ -2487,7 +2568,7 @@ static const struct pid_entry tgid_base_ - #ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), - #endif --#ifdef CONFIG_HAVE_ARCH_TRACEHOOK -+#if defined(CONFIG_HAVE_ARCH_TRACEHOOK) && !defined(CONFIG_GRKERNSEC_PROC_MEMMAP) - INF("syscall", S_IRUSR, proc_pid_syscall), - #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), -@@ -2515,7 +2596,7 @@ static const struct pid_entry tgid_base_ - #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), - #endif --#ifdef CONFIG_STACKTRACE -+#if defined(CONFIG_STACKTRACE) && !defined(CONFIG_GRKERNSEC_HIDESYM) - ONE("stack", S_IRUSR, proc_pid_stack), - #endif - #ifdef CONFIG_SCHEDSTATS -@@ -2545,6 +2626,9 @@ static const struct pid_entry tgid_base_ - #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tgid_io_accounting), - #endif -+#ifdef CONFIG_GRKERNSEC_PROC_IPADDR -+ INF("ipaddr", S_IRUSR, proc_pid_ipaddr), -+#endif - }; - - static int proc_tgid_base_readdir(struct file * filp, -@@ -2674,7 +2758,14 @@ static struct dentry *proc_pid_instantia - if (!inode) - goto out; - -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ inode->i_mode = S_IFDIR|S_IRUSR|S_IXUSR; -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; -+ inode->i_mode = S_IFDIR|S_IRUSR|S_IRGRP|S_IXUSR|S_IXGRP; -+#else - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; -+#endif - inode->i_op = &proc_tgid_base_inode_operations; - inode->i_fop = &proc_tgid_base_operations; - inode->i_flags|=S_IMMUTABLE; -@@ -2716,7 +2807,11 @@ struct dentry *proc_pid_lookup(struct in - if (!task) - goto out; - -+ if (gr_check_hidden_task(task)) -+ goto out_put_task; -+ - result = proc_pid_instantiate(dir, 
dentry, task, NULL); -+out_put_task: - put_task_struct(task); - out: - return result; -@@ -2781,6 +2876,10 @@ int proc_pid_readdir(struct file * filp, - { - unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); -+#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ const struct cred *tmpcred = current_cred(); -+ const struct cred *itercred; -+#endif - struct tgid_iter iter; - struct pid_namespace *ns; - -@@ -2799,6 +2898,20 @@ int proc_pid_readdir(struct file * filp, - for (iter = next_tgid(ns, iter); - iter.task; - iter.tgid += 1, iter = next_tgid(ns, iter)) { -+#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ itercred = __task_cred(iter.task); -+#endif -+ if (gr_pid_is_chrooted(iter.task) || gr_check_hidden_task(iter.task) -+#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ || (tmpcred->uid && (itercred->uid != tmpcred->uid) -+#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP -+ && !in_group_p(CONFIG_GRKERNSEC_PROC_GID) -+#endif -+ ) -+#endif -+ ) -+ continue; -+ - filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { - put_task_struct(iter.task); -@@ -2826,7 +2939,7 @@ static const struct pid_entry tid_base_s - #ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), - #endif --#ifdef CONFIG_HAVE_ARCH_TRACEHOOK -+#if defined(CONFIG_HAVE_ARCH_TRACEHOOK) && !defined(CONFIG_GRKERNSEC_PROC_MEMMAP) - INF("syscall", S_IRUSR, proc_pid_syscall), - #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), -@@ -2853,7 +2966,7 @@ static const struct pid_entry tid_base_s - #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), - #endif --#ifdef CONFIG_STACKTRACE -+#if defined(CONFIG_STACKTRACE) && !defined(CONFIG_GRKERNSEC_HIDESYM) - ONE("stack", S_IRUSR, proc_pid_stack), - #endif - #ifdef CONFIG_SCHEDSTATS -diff -urNp linux-2.6.31.1/fs/proc/cmdline.c linux-2.6.31.1/fs/proc/cmdline.c ---- linux-2.6.31.1/fs/proc/cmdline.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/cmdline.c 2009-10-01 20:12:44.000000000 -0400 -@@ -23,7 +23,11 @@ static const struct file_operations cmdl - - static int __init proc_cmdline_init(void) - { -+#ifdef CONFIG_GRKERNSEC_PROC_ADD -+ proc_create_grsec("cmdline", 0, NULL, &cmdline_proc_fops); -+#else - proc_create("cmdline", 0, NULL, &cmdline_proc_fops); -+#endif - return 0; - } - module_init(proc_cmdline_init); -diff -urNp linux-2.6.31.1/fs/proc/devices.c linux-2.6.31.1/fs/proc/devices.c ---- linux-2.6.31.1/fs/proc/devices.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/devices.c 2009-10-01 20:12:44.000000000 -0400 -@@ -64,7 +64,11 @@ static const struct file_operations proc - - static int __init proc_devices_init(void) - { -+#ifdef CONFIG_GRKERNSEC_PROC_ADD -+ proc_create_grsec("devices", 0, NULL, &proc_devinfo_operations); -+#else - proc_create("devices", 0, NULL, &proc_devinfo_operations); -+#endif - return 0; - } - module_init(proc_devices_init); -diff -urNp linux-2.6.31.1/fs/proc/inode.c linux-2.6.31.1/fs/proc/inode.c ---- linux-2.6.31.1/fs/proc/inode.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/inode.c 2009-10-01 20:12:44.000000000 -0400 -@@ -457,7 +457,11 @@ struct inode *proc_get_inode(struct supe - if (de->mode) { - inode->i_mode = de->mode; - inode->i_uid = de->uid; -+#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP -+ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; -+#else - 
inode->i_gid = de->gid; -+#endif - } - if (de->size) - inode->i_size = de->size; -diff -urNp linux-2.6.31.1/fs/proc/internal.h linux-2.6.31.1/fs/proc/internal.h ---- linux-2.6.31.1/fs/proc/internal.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/internal.h 2009-10-01 20:12:44.000000000 -0400 -@@ -51,6 +51,9 @@ extern int proc_pid_status(struct seq_fi - struct pid *pid, struct task_struct *task); - extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -+#ifdef CONFIG_GRKERNSEC_PROC_IPADDR -+extern int proc_pid_ipaddr(struct task_struct *task, char *buffer); -+#endif - extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); - - extern const struct file_operations proc_maps_operations; -diff -urNp linux-2.6.31.1/fs/proc/Kconfig linux-2.6.31.1/fs/proc/Kconfig ---- linux-2.6.31.1/fs/proc/Kconfig 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/Kconfig 2009-10-01 20:12:44.000000000 -0400 -@@ -30,12 +30,12 @@ config PROC_FS - - config PROC_KCORE - bool "/proc/kcore support" if !ARM -- depends on PROC_FS && MMU -+ depends on PROC_FS && MMU && !GRKERNSEC_PROC_ADD - - config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" -- depends on PROC_FS && CRASH_DUMP -- default y -+ depends on PROC_FS && CRASH_DUMP && !GRKERNSEC -+ default n - help - Exports the dump image of crashed kernel in ELF format. - -@@ -59,8 +59,8 @@ config PROC_SYSCTL - limited in memory. - - config PROC_PAGE_MONITOR -- default y -- depends on PROC_FS && MMU -+ default n -+ depends on PROC_FS && MMU && !GRKERNSEC - bool "Enable /proc page monitoring" if EMBEDDED - help - Various /proc files exist to monitor process memory utilization: -diff -urNp linux-2.6.31.1/fs/proc/kcore.c linux-2.6.31.1/fs/proc/kcore.c ---- linux-2.6.31.1/fs/proc/kcore.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/kcore.c 2009-10-01 20:12:44.000000000 -0400 -@@ -404,10 +404,12 @@ read_kcore(struct file *file, char __use - - static int __init proc_kcore_init(void) - { -+#if !defined(CONFIG_GRKERNSEC_PROC_ADD) && !defined(CONFIG_GRKERNSEC_HIDESYM) - proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); - if (proc_root_kcore) - proc_root_kcore->size = - (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; -+#endif - return 0; - } - module_init(proc_kcore_init); -diff -urNp linux-2.6.31.1/fs/proc/nommu.c linux-2.6.31.1/fs/proc/nommu.c ---- linux-2.6.31.1/fs/proc/nommu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/nommu.c 2009-10-01 20:12:44.000000000 -0400 -@@ -67,7 +67,7 @@ static int nommu_region_show(struct seq_ - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); -- seq_path(m, &file->f_path, ""); -+ seq_path(m, &file->f_path, "\n\"); - } - - seq_putc(m, '\n'); -@@ -109,7 +109,7 @@ static void *nommu_region_list_next(stru - return rb_next((struct rb_node *) v); - } - --static struct seq_operations proc_nommu_region_list_seqop = { -+static const struct seq_operations proc_nommu_region_list_seqop = { - .start = nommu_region_list_start, - .next = nommu_region_list_next, - .stop = nommu_region_list_stop, -diff -urNp linux-2.6.31.1/fs/proc/proc_net.c linux-2.6.31.1/fs/proc/proc_net.c ---- linux-2.6.31.1/fs/proc/proc_net.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/proc_net.c 2009-10-01 20:12:44.000000000 -0400 -@@ -104,6 +104,17 @@ static struct net *get_proc_task_net(str - struct task_struct *task; - struct nsproxy *ns; - struct net *net = NULL; -+#if 
defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ const struct cred *cred = current_cred(); -+#endif -+ -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ if (cred->fsuid) -+ return net; -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ if (cred->fsuid && !in_group_p(CONFIG_GRKERNSEC_PROC_GID)) -+ return net; -+#endif - - rcu_read_lock(); - task = pid_task(proc_pid(dir), PIDTYPE_PID); -diff -urNp linux-2.6.31.1/fs/proc/proc_sysctl.c linux-2.6.31.1/fs/proc/proc_sysctl.c ---- linux-2.6.31.1/fs/proc/proc_sysctl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/proc_sysctl.c 2009-10-01 20:12:44.000000000 -0400 -@@ -7,6 +7,8 @@ - #include <linux/security.h> - #include "internal.h" - -+extern __u32 gr_handle_sysctl(const struct ctl_table *table, const int op); -+ - static const struct dentry_operations proc_sys_dentry_operations; - static const struct file_operations proc_sys_file_operations; - static const struct inode_operations proc_sys_inode_operations; -@@ -109,6 +111,9 @@ static struct dentry *proc_sys_lookup(st - if (!p) - goto out; - -+ if (gr_handle_sysctl(p, MAY_EXEC)) -+ goto out; -+ - err = ERR_PTR(-ENOMEM); - inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (h) -@@ -228,6 +233,9 @@ static int scan(struct ctl_table_header - if (*pos < file->f_pos) - continue; - -+ if (gr_handle_sysctl(table, 0)) -+ continue; -+ - res = proc_sys_fill_cache(file, dirent, filldir, head, table); - if (res) - return res; -@@ -344,6 +352,9 @@ static int proc_sys_getattr(struct vfsmo - if (IS_ERR(head)) - return PTR_ERR(head); - -+ if (table && gr_handle_sysctl(table, MAY_EXEC)) -+ return -ENOENT; -+ - generic_fillattr(inode, stat); - if (table) - stat->mode = (stat->mode & S_IFMT) | table->mode; -diff -urNp linux-2.6.31.1/fs/proc/root.c linux-2.6.31.1/fs/proc/root.c ---- linux-2.6.31.1/fs/proc/root.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/root.c 2009-10-01 20:12:44.000000000 -0400 -@@ -134,7 +134,15 @@ void __init proc_root_init(void) - #ifdef CONFIG_PROC_DEVICETREE - proc_device_tree_init(); - #endif -+#ifdef CONFIG_GRKERNSEC_PROC_ADD -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ proc_mkdir_mode("bus", S_IRUSR | S_IXUSR, NULL); -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ proc_mkdir_mode("bus", S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP, NULL); -+#endif -+#else - proc_mkdir("bus", NULL); -+#endif - proc_sys_init(); - } - -diff -urNp linux-2.6.31.1/fs/proc/task_mmu.c linux-2.6.31.1/fs/proc/task_mmu.c ---- linux-2.6.31.1/fs/proc/task_mmu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/task_mmu.c 2009-10-01 20:12:44.000000000 -0400 -@@ -46,15 +46,26 @@ void task_mem(struct seq_file *m, struct - "VmStk:\t%8lu kB\n" - "VmExe:\t%8lu kB\n" - "VmLib:\t%8lu kB\n" -- "VmPTE:\t%8lu kB\n", -- hiwater_vm << (PAGE_SHIFT-10), -+ "VmPTE:\t%8lu kB\n" -+ -+#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT -+ "CsBase:\t%8lx\nCsLim:\t%8lx\n" -+#endif -+ -+ ,hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), text, lib, -- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); -+ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10 -+ -+#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT -+ , mm->context.user_cs_base, mm->context.user_cs_limit -+#endif -+ -+ ); - } - - unsigned long task_vsize(struct mm_struct *mm) -@@ -199,6 +210,12 @@ static int do_maps_open(struct inode *in - return 
ret; - } - -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+#define PAX_RAND_FLAGS(_mm) (_mm != NULL && _mm != current->mm && \ -+ (_mm->pax_flags & MF_PAX_RANDMMAP || \ -+ _mm->pax_flags & MF_PAX_SEGMEXEC)) -+#endif -+ - static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) - { - struct mm_struct *mm = vma->vm_mm; -@@ -217,13 +234,22 @@ static void show_map_vma(struct seq_file - } - - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ PAX_RAND_FLAGS(mm) ? 0UL : vma->vm_start, -+ PAX_RAND_FLAGS(mm) ? 0UL : vma->vm_end, -+#else - vma->vm_start, - vma->vm_end, -+#endif - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ PAX_RAND_FLAGS(mm) ? 0UL : pgoff, -+#else - pgoff, -+#endif - MAJOR(dev), MINOR(dev), ino, &len); - - /* -@@ -232,16 +258,16 @@ static void show_map_vma(struct seq_file - */ - if (file) { - pad_len_spaces(m, len); -- seq_path(m, &file->f_path, "\n"); -+ seq_path(m, &file->f_path, "\n\"); - } else { - const char *name = arch_vma_name(vma); - if (!name) { - if (mm) { -- if (vma->vm_start <= mm->start_brk && -- vma->vm_end >= mm->brk) { -+ if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { - name = "[heap]"; -- } else if (vma->vm_start <= mm->start_stack && -- vma->vm_end >= mm->start_stack) { -+ } else if ((vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) || -+ (vma->vm_start <= mm->start_stack && -+ vma->vm_end >= mm->start_stack)) { - name = "[stack]"; - } - } else { -@@ -384,9 +410,16 @@ static int show_smap(struct seq_file *m, - }; - - memset(&mss, 0, sizeof mss); -- mss.vma = vma; -- if (vma->vm_mm && !is_vm_hugetlb_page(vma)) -- walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); -+ -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ if (!PAX_RAND_FLAGS(vma->vm_mm)) { -+#endif -+ mss.vma = vma; -+ if (vma->vm_mm && !is_vm_hugetlb_page(vma)) -+ walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ } -+#endif - - show_map_vma(m, vma); - -@@ -402,7 +435,11 @@ static int show_smap(struct seq_file *m, - "Swap: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ PAX_RAND_FLAGS(vma->vm_mm) ? 
0UL : (vma->vm_end - vma->vm_start) >> 10, -+#else - (vma->vm_end - vma->vm_start) >> 10, -+#endif - mss.resident >> 10, - (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), - mss.shared_clean >> 10, -diff -urNp linux-2.6.31.1/fs/proc/task_nommu.c linux-2.6.31.1/fs/proc/task_nommu.c ---- linux-2.6.31.1/fs/proc/task_nommu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/proc/task_nommu.c 2009-10-01 20:12:44.000000000 -0400 -@@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct - else - bytes += kobjsize(mm); - -- if (current->fs && current->fs->users > 1) -+ if (current->fs && atomic_read(¤t->fs->users) > 1) - sbytes += kobjsize(current->fs); - else - bytes += kobjsize(current->fs); -@@ -154,7 +154,7 @@ static int nommu_vma_show(struct seq_fil - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); -- seq_path(m, &file->f_path, ""); -+ seq_path(m, &file->f_path, "\n\"); - } - - seq_putc(m, '\n'); -diff -urNp linux-2.6.31.1/fs/readdir.c linux-2.6.31.1/fs/readdir.c ---- linux-2.6.31.1/fs/readdir.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/readdir.c 2009-10-01 20:12:44.000000000 -0400 -@@ -16,6 +16,7 @@ - #include <linux/security.h> - #include <linux/syscalls.h> - #include <linux/unistd.h> -+#include <linux/namei.h> - - #include <asm/uaccess.h> - -@@ -67,6 +68,7 @@ struct old_linux_dirent { - - struct readdir_callback { - struct old_linux_dirent __user * dirent; -+ struct file * file; - int result; - }; - -@@ -84,6 +86,10 @@ static int fillonedir(void * __buf, cons - buf->result = -EOVERFLOW; - return -EOVERFLOW; - } -+ -+ if (!gr_acl_handle_filldir(buf->file, name, namlen, ino)) -+ return 0; -+ - buf->result++; - dirent = buf->dirent; - if (!access_ok(VERIFY_WRITE, dirent, -@@ -116,6 +122,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned in - - buf.result = 0; - buf.dirent = dirent; -+ buf.file = file; - - error = vfs_readdir(file, fillonedir, &buf); - if (buf.result) -@@ -142,6 +149,7 @@ struct linux_dirent { - struct getdents_callback { - struct linux_dirent __user * current_dir; - struct linux_dirent __user * previous; -+ struct file * file; - int count; - int error; - }; -@@ -162,6 +170,10 @@ static int filldir(void * __buf, const c - buf->error = -EOVERFLOW; - return -EOVERFLOW; - } -+ -+ if (!gr_acl_handle_filldir(buf->file, name, namlen, ino)) -+ return 0; -+ - dirent = buf->previous; - if (dirent) { - if (__put_user(offset, &dirent->d_off)) -@@ -209,6 +221,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, - buf.previous = NULL; - buf.count = count; - buf.error = 0; -+ buf.file = file; - - error = vfs_readdir(file, filldir, &buf); - if (error >= 0) -@@ -228,6 +241,7 @@ out: - struct getdents_callback64 { - struct linux_dirent64 __user * current_dir; - struct linux_dirent64 __user * previous; -+ struct file *file; - int count; - int error; - }; -@@ -242,6 +256,10 @@ static int filldir64(void * __buf, const - buf->error = -EINVAL; /* only used if we fail.. 
*/ - if (reclen > buf->count) - return -EINVAL; -+ -+ if (!gr_acl_handle_filldir(buf->file, name, namlen, ino)) -+ return 0; -+ - dirent = buf->previous; - if (dirent) { - if (__put_user(offset, &dirent->d_off)) -@@ -289,6 +307,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int - - buf.current_dir = dirent; - buf.previous = NULL; -+ buf.file = file; - buf.count = count; - buf.error = 0; - -diff -urNp linux-2.6.31.1/fs/reiserfs/do_balan.c linux-2.6.31.1/fs/reiserfs/do_balan.c ---- linux-2.6.31.1/fs/reiserfs/do_balan.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/reiserfs/do_balan.c 2009-10-01 20:12:44.000000000 -0400 -@@ -2058,7 +2058,7 @@ void do_balance(struct tree_balance *tb, - return; - } - -- atomic_inc(&(fs_generation(tb->tb_sb))); -+ atomic_inc_unchecked(&(fs_generation(tb->tb_sb))); - do_balance_starts(tb); - - /* balance leaf returns 0 except if combining L R and S into -diff -urNp linux-2.6.31.1/fs/reiserfs/procfs.c linux-2.6.31.1/fs/reiserfs/procfs.c ---- linux-2.6.31.1/fs/reiserfs/procfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/reiserfs/procfs.c 2009-10-01 20:12:44.000000000 -0400 -@@ -123,7 +123,7 @@ static int show_super(struct seq_file *m - "SMALL_TAILS " : "NO_TAILS ", - replay_only(sb) ? "REPLAY_ONLY " : "", - convert_reiserfs(sb) ? "CONV " : "", -- atomic_read(&r->s_generation_counter), -+ atomic_read_unchecked(&r->s_generation_counter), - SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), - SF(s_do_balance), SF(s_unneeded_left_neighbor), - SF(s_good_search_by_key_reada), SF(s_bmaps), -diff -urNp linux-2.6.31.1/fs/romfs/super.c linux-2.6.31.1/fs/romfs/super.c ---- linux-2.6.31.1/fs/romfs/super.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/romfs/super.c 2009-10-01 20:12:44.000000000 -0400 -@@ -284,7 +284,7 @@ static const struct file_operations romf - .readdir = romfs_readdir, - }; - --static struct inode_operations romfs_dir_inode_operations = { -+static const struct inode_operations romfs_dir_inode_operations = { - .lookup = romfs_lookup, - }; - -diff -urNp linux-2.6.31.1/fs/select.c linux-2.6.31.1/fs/select.c ---- linux-2.6.31.1/fs/select.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/select.c 2009-10-01 20:12:44.000000000 -0400 -@@ -19,6 +19,7 @@ - #include <linux/module.h> - #include <linux/slab.h> - #include <linux/poll.h> -+#include <linux/security.h> - #include <linux/personality.h> /* for STICKY_TIMEOUTS */ - #include <linux/file.h> - #include <linux/fdtable.h> -@@ -814,6 +815,7 @@ int do_sys_poll(struct pollfd __user *uf - struct poll_list *walk = head; - unsigned long todo = nfds; - -+ gr_learn_resource(current, RLIMIT_NOFILE, nfds, 1); - if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - return -EINVAL; - -diff -urNp linux-2.6.31.1/fs/seq_file.c linux-2.6.31.1/fs/seq_file.c ---- linux-2.6.31.1/fs/seq_file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/seq_file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -76,7 +76,8 @@ static int traverse(struct seq_file *m, - return 0; - } - if (!m->buf) { -- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); -+ m->size = PAGE_SIZE; -+ m->buf = kmalloc(m->size, GFP_KERNEL); - if (!m->buf) - return -ENOMEM; - } -@@ -116,7 +117,8 @@ static int traverse(struct seq_file *m, - Eoverflow: - m->op->stop(m, p); - kfree(m->buf); -- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); -+ m->size <<= 1; -+ m->buf = kmalloc(m->size, GFP_KERNEL); - return !m->buf ? 
-ENOMEM : -EAGAIN; - } - -@@ -169,7 +171,8 @@ ssize_t seq_read(struct file *file, char - m->version = file->f_version; - /* grab buffer if we didn't have one */ - if (!m->buf) { -- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); -+ m->size = PAGE_SIZE; -+ m->buf = kmalloc(m->size, GFP_KERNEL); - if (!m->buf) - goto Enomem; - } -@@ -210,7 +213,8 @@ ssize_t seq_read(struct file *file, char - goto Fill; - m->op->stop(m, p); - kfree(m->buf); -- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); -+ m->size <<= 1; -+ m->buf = kmalloc(m->size, GFP_KERNEL); - if (!m->buf) - goto Enomem; - m->count = 0; -diff -urNp linux-2.6.31.1/fs/smbfs/symlink.c linux-2.6.31.1/fs/smbfs/symlink.c ---- linux-2.6.31.1/fs/smbfs/symlink.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/smbfs/symlink.c 2009-10-01 20:12:44.000000000 -0400 -@@ -55,7 +55,7 @@ static void *smb_follow_link(struct dent - - static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p) - { -- char *s = nd_get_link(nd); -+ const char *s = nd_get_link(nd); - if (!IS_ERR(s)) - __putname(s); - } -diff -urNp linux-2.6.31.1/fs/squashfs/super.c linux-2.6.31.1/fs/squashfs/super.c ---- linux-2.6.31.1/fs/squashfs/super.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/squashfs/super.c 2009-10-01 20:12:44.000000000 -0400 -@@ -44,7 +44,7 @@ - #include "squashfs.h" - - static struct file_system_type squashfs_fs_type; --static struct super_operations squashfs_super_ops; -+static const struct super_operations squashfs_super_ops; - - static int supported_squashfs_filesystem(short major, short minor, short comp) - { -@@ -444,7 +444,7 @@ static struct file_system_type squashfs_ - .fs_flags = FS_REQUIRES_DEV - }; - --static struct super_operations squashfs_super_ops = { -+static const struct super_operations squashfs_super_ops = { - .alloc_inode = squashfs_alloc_inode, - .destroy_inode = squashfs_destroy_inode, - .statfs = squashfs_statfs, -diff -urNp linux-2.6.31.1/fs/sysfs/bin.c linux-2.6.31.1/fs/sysfs/bin.c ---- linux-2.6.31.1/fs/sysfs/bin.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/sysfs/bin.c 2009-10-01 20:12:44.000000000 -0400 -@@ -40,7 +40,7 @@ struct bin_buffer { - struct mutex mutex; - void *buffer; - int mmapped; -- struct vm_operations_struct *vm_ops; -+ const struct vm_operations_struct *vm_ops; - struct file *file; - struct hlist_node list; - }; -@@ -331,7 +331,7 @@ static int bin_migrate(struct vm_area_st - } - #endif - --static struct vm_operations_struct bin_vm_ops = { -+static const struct vm_operations_struct bin_vm_ops = { - .open = bin_vma_open, - .close = bin_vma_close, - .fault = bin_fault, -diff -urNp linux-2.6.31.1/fs/sysfs/symlink.c linux-2.6.31.1/fs/sysfs/symlink.c ---- linux-2.6.31.1/fs/sysfs/symlink.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/sysfs/symlink.c 2009-10-01 20:12:44.000000000 -0400 -@@ -203,7 +203,7 @@ static void *sysfs_follow_link(struct de - - static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) - { -- char *page = nd_get_link(nd); -+ const char *page = nd_get_link(nd); - if (!IS_ERR(page)) - free_page((unsigned long)page); - } -diff -urNp linux-2.6.31.1/fs/ubifs/file.c linux-2.6.31.1/fs/ubifs/file.c ---- linux-2.6.31.1/fs/ubifs/file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/ubifs/file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1536,7 +1536,7 @@ out_unlock: - return err; - } - --static struct vm_operations_struct ubifs_file_vm_ops = { -+static const struct vm_operations_struct ubifs_file_vm_ops = { 
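/*
 * Aside, illustrative only (not part of the patch): the fs/seq_file.c
 * hunks above rewrite "m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);" as a
 * separate doubling step followed by the allocation, so the size update
 * becomes an explicit statement instead of a side effect buried in an
 * argument list. The same pattern as a self-contained userspace stand-in,
 * with hypothetical names:
 */
#include <stdlib.h>

static void *grow_buffer(void *old, size_t *size)
{
	free(old);            /* drop the too-small buffer, as traverse() does */
	*size <<= 1;          /* double the size first, as its own statement */
	return malloc(*size); /* then allocate using the updated size */
}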
- .fault = filemap_fault, - .page_mkwrite = ubifs_vm_page_mkwrite, - }; -diff -urNp linux-2.6.31.1/fs/udf/balloc.c linux-2.6.31.1/fs/udf/balloc.c ---- linux-2.6.31.1/fs/udf/balloc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/udf/balloc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -172,9 +172,7 @@ static void udf_bitmap_free_blocks(struc - - mutex_lock(&sbi->s_alloc_mutex); - partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; -- if (bloc->logicalBlockNum < 0 || -- (bloc->logicalBlockNum + count) > -- partmap->s_partition_len) { -+ if ((bloc->logicalBlockNum + count) > partmap->s_partition_len) { - udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, - count, partmap->s_partition_len); -@@ -436,9 +434,7 @@ static void udf_table_free_blocks(struct - - mutex_lock(&sbi->s_alloc_mutex); - partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; -- if (bloc->logicalBlockNum < 0 || -- (bloc->logicalBlockNum + count) > -- partmap->s_partition_len) { -+ if ((bloc->logicalBlockNum + count) > partmap->s_partition_len) { - udf_debug("%d < %d || %d + %d > %d\n", - bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, - partmap->s_partition_len); -diff -urNp linux-2.6.31.1/fs/utimes.c linux-2.6.31.1/fs/utimes.c ---- linux-2.6.31.1/fs/utimes.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/utimes.c 2009-10-01 20:12:44.000000000 -0400 -@@ -1,6 +1,7 @@ - #include <linux/compiler.h> - #include <linux/file.h> - #include <linux/fs.h> -+#include <linux/security.h> - #include <linux/linkage.h> - #include <linux/mount.h> - #include <linux/namei.h> -@@ -101,6 +102,12 @@ static int utimes_common(struct path *pa - goto mnt_drop_write_and_out; - } - } -+ -+ if (!gr_acl_handle_utime(path->dentry, path->mnt)) { -+ error = -EACCES; -+ goto mnt_drop_write_and_out; -+ } -+ - mutex_lock(&inode->i_mutex); - error = notify_change(path->dentry, &newattrs); - mutex_unlock(&inode->i_mutex); -diff -urNp linux-2.6.31.1/fs/xfs/linux-2.6/xfs_file.c linux-2.6.31.1/fs/xfs/linux-2.6/xfs_file.c ---- linux-2.6.31.1/fs/xfs/linux-2.6/xfs_file.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/xfs/linux-2.6/xfs_file.c 2009-10-01 20:12:44.000000000 -0400 -@@ -42,7 +42,7 @@ - - #include <linux/dcache.h> - --static struct vm_operations_struct xfs_file_vm_ops; -+static const struct vm_operations_struct xfs_file_vm_ops; - - STATIC ssize_t - xfs_file_aio_read( -@@ -271,7 +271,7 @@ const struct file_operations xfs_dir_fil - .fsync = xfs_file_fsync, - }; - --static struct vm_operations_struct xfs_file_vm_ops = { -+static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = xfs_vm_page_mkwrite, - }; -diff -urNp linux-2.6.31.1/fs/xfs/linux-2.6/xfs_iops.c linux-2.6.31.1/fs/xfs/linux-2.6/xfs_iops.c ---- linux-2.6.31.1/fs/xfs/linux-2.6/xfs_iops.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/xfs/linux-2.6/xfs_iops.c 2009-10-01 20:12:44.000000000 -0400 -@@ -478,7 +478,7 @@ xfs_vn_put_link( - struct nameidata *nd, - void *p) - { -- char *s = nd_get_link(nd); -+ const char *s = nd_get_link(nd); - - if (!IS_ERR(s)) - kfree(s); -diff -urNp linux-2.6.31.1/fs/xfs/linux-2.6/xfs_super.c linux-2.6.31.1/fs/xfs/linux-2.6/xfs_super.c ---- linux-2.6.31.1/fs/xfs/linux-2.6/xfs_super.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/xfs/linux-2.6/xfs_super.c 2009-10-01 20:12:44.000000000 -0400 -@@ -67,7 +67,7 @@ - #include <linux/freezer.h> - #include <linux/parser.h> - --static struct super_operations xfs_super_operations; 
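/*
 * Aside, illustrative only: this hunk and the romfs/squashfs/sysfs/ubifs
 * hunks above all apply one idiom -- operation tables full of function
 * pointers (super_operations, inode_operations, vm_operations_struct,
 * seq_operations) gain a const qualifier so the table lands in read-only
 * data and its pointers cannot be retargeted at runtime. A minimal sketch
 * with hypothetical names:
 */
struct example_ops {
	int (*open)(void);
};

static int example_open(void)
{
	return 0;
}

/* const places the table in .rodata; a stray write now faults instead of
   silently redirecting the function pointer */
static const struct example_ops example_ops_table = {
	.open = example_open,
};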
-+static const struct super_operations xfs_super_operations; - static kmem_zone_t *xfs_ioend_zone; - mempool_t *xfs_ioend_pool; - -@@ -1532,7 +1532,7 @@ xfs_fs_get_sb( - mnt); - } - --static struct super_operations xfs_super_operations = { -+static const struct super_operations xfs_super_operations = { - .alloc_inode = xfs_fs_alloc_inode, - .destroy_inode = xfs_fs_destroy_inode, - .write_inode = xfs_fs_write_inode, -diff -urNp linux-2.6.31.1/fs/xfs/xfs_bmap.c linux-2.6.31.1/fs/xfs/xfs_bmap.c ---- linux-2.6.31.1/fs/xfs/xfs_bmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/fs/xfs/xfs_bmap.c 2009-10-01 20:12:44.000000000 -0400 -@@ -360,7 +360,7 @@ xfs_bmap_validate_ret( - int nmap, - int ret_nmap); - #else --#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) -+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do {} while (0) - #endif /* DEBUG */ - - #if defined(XFS_RW_TRACE) -diff -urNp linux-2.6.31.1/grsecurity/gracl_alloc.c linux-2.6.31.1/grsecurity/gracl_alloc.c ---- linux-2.6.31.1/grsecurity/gracl_alloc.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl_alloc.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,105 @@ -+#include <linux/kernel.h> -+#include <linux/mm.h> -+#include <linux/slab.h> -+#include <linux/vmalloc.h> -+#include <linux/gracl.h> -+#include <linux/grsecurity.h> -+ -+static unsigned long alloc_stack_next = 1; -+static unsigned long alloc_stack_size = 1; -+static void **alloc_stack; -+ -+static __inline__ int -+alloc_pop(void) -+{ -+ if (alloc_stack_next == 1) -+ return 0; -+ -+ kfree(alloc_stack[alloc_stack_next - 2]); -+ -+ alloc_stack_next--; -+ -+ return 1; -+} -+ -+static __inline__ int -+alloc_push(void *buf) -+{ -+ if (alloc_stack_next >= alloc_stack_size) -+ return 1; -+ -+ alloc_stack[alloc_stack_next - 1] = buf; -+ -+ alloc_stack_next++; -+ -+ return 0; -+} -+ -+void * -+acl_alloc(unsigned long len) -+{ -+ void *ret = NULL; -+ -+ if (!len || len > PAGE_SIZE) -+ goto out; -+ -+ ret = kmalloc(len, GFP_KERNEL); -+ -+ if (ret) { -+ if (alloc_push(ret)) { -+ kfree(ret); -+ ret = NULL; -+ } -+ } -+ -+out: -+ return ret; -+} -+ -+void * -+acl_alloc_num(unsigned long num, unsigned long len) -+{ -+ if (!len || (num > (PAGE_SIZE / len))) -+ return NULL; -+ -+ return acl_alloc(num * len); -+} -+ -+void -+acl_free_all(void) -+{ -+ if (gr_acl_is_enabled() || !alloc_stack) -+ return; -+ -+ while (alloc_pop()) ; -+ -+ if (alloc_stack) { -+ if ((alloc_stack_size * sizeof (void *)) <= PAGE_SIZE) -+ kfree(alloc_stack); -+ else -+ vfree(alloc_stack); -+ } -+ -+ alloc_stack = NULL; -+ alloc_stack_size = 1; -+ alloc_stack_next = 1; -+ -+ return; -+} -+ -+int -+acl_alloc_stack_init(unsigned long size) -+{ -+ if ((size * sizeof (void *)) <= PAGE_SIZE) -+ alloc_stack = -+ (void **) kmalloc(size * sizeof (void *), GFP_KERNEL); -+ else -+ alloc_stack = (void **) vmalloc(size * sizeof (void *)); -+ -+ alloc_stack_size = size; -+ -+ if (!alloc_stack) -+ return 0; -+ else -+ return 1; -+} -diff -urNp linux-2.6.31.1/grsecurity/gracl.c linux-2.6.31.1/grsecurity/gracl.c ---- linux-2.6.31.1/grsecurity/gracl.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,3912 @@ -+#include <linux/kernel.h> -+#include <linux/module.h> -+#include <linux/sched.h> -+#include <linux/mm.h> -+#include <linux/file.h> -+#include <linux/fs.h> -+#include <linux/namei.h> -+#include <linux/mount.h> -+#include <linux/tty.h> -+#include <linux/proc_fs.h> -+#include <linux/smp_lock.h> -+#include 
<linux/slab.h> -+#include <linux/vmalloc.h> -+#include <linux/types.h> -+#include <linux/sysctl.h> -+#include <linux/netdevice.h> -+#include <linux/ptrace.h> -+#include <linux/gracl.h> -+#include <linux/gralloc.h> -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+#include <linux/pid_namespace.h> -+#include <linux/fdtable.h> -+#include <linux/percpu.h> -+ -+#include <asm/uaccess.h> -+#include <asm/errno.h> -+#include <asm/mman.h> -+ -+static struct acl_role_db acl_role_set; -+static struct name_db name_set; -+static struct inodev_db inodev_set; -+ -+/* for keeping track of userspace pointers used for subjects, so we -+ can share references in the kernel as well -+*/ -+ -+static struct dentry *real_root; -+static struct vfsmount *real_root_mnt; -+ -+static struct acl_subj_map_db subj_map_set; -+ -+static struct acl_role_label *default_role; -+ -+static u16 acl_sp_role_value; -+ -+extern char *gr_shared_page[4]; -+static DECLARE_MUTEX(gr_dev_sem); -+DEFINE_RWLOCK(gr_inode_lock); -+ -+struct gr_arg *gr_usermode; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+static unsigned int gr_status __read_only = GR_STATUS_INIT; -+#else -+static unsigned int gr_status = GR_STATUS_INIT; -+#endif -+ -+extern int chkpw(struct gr_arg *entry, unsigned char *salt, unsigned char *sum); -+extern void gr_clear_learn_entries(void); -+ -+#ifdef CONFIG_GRKERNSEC_RESLOG -+extern void gr_log_resource(const struct task_struct *task, -+ const int res, const unsigned long wanted, const int gt); -+#endif -+ -+unsigned char *gr_system_salt; -+unsigned char *gr_system_sum; -+ -+static struct sprole_pw **acl_special_roles = NULL; -+static __u16 num_sprole_pws = 0; -+ -+static struct acl_role_label *kernel_role = NULL; -+ -+static unsigned int gr_auth_attempts = 0; -+static unsigned long gr_auth_expires = 0UL; -+ -+extern struct vfsmount *sock_mnt; -+extern struct vfsmount *pipe_mnt; -+extern struct vfsmount *shm_mnt; -+static struct acl_object_label *fakefs_obj; -+ -+extern int gr_init_uidset(void); -+extern void gr_free_uidset(void); -+extern void gr_remove_uid(uid_t uid); -+extern int gr_find_uid(uid_t uid); -+ -+__inline__ int -+gr_acl_is_enabled(void) -+{ -+ return (gr_status & GR_READY); -+} -+ -+char gr_roletype_to_char(void) -+{ -+ switch (current->role->roletype & -+ (GR_ROLE_DEFAULT | GR_ROLE_USER | GR_ROLE_GROUP | -+ GR_ROLE_SPECIAL)) { -+ case GR_ROLE_DEFAULT: -+ return 'D'; -+ case GR_ROLE_USER: -+ return 'U'; -+ case GR_ROLE_GROUP: -+ return 'G'; -+ case GR_ROLE_SPECIAL: -+ return 'S'; -+ } -+ -+ return 'X'; -+} -+ -+__inline__ int -+gr_acl_tpe_check(void) -+{ -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+ if (current->role->roletype & GR_ROLE_TPE) -+ return 1; -+ else -+ return 0; -+} -+ -+int -+gr_handle_rawio(const struct inode *inode) -+{ -+#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS -+ if (inode && S_ISBLK(inode->i_mode) && -+ grsec_enable_chroot_caps && proc_is_chrooted(current) && -+ !capable(CAP_SYS_RAWIO)) -+ return 1; -+#endif -+ return 0; -+} -+ -+static int -+gr_streq(const char *a, const char *b, const unsigned int lena, const unsigned int lenb) -+{ -+ int i; -+ unsigned long *l1; -+ unsigned long *l2; -+ unsigned char *c1; -+ unsigned char *c2; -+ int num_longs; -+ -+ if (likely(lena != lenb)) -+ return 0; -+ -+ l1 = (unsigned long *)a; -+ l2 = (unsigned long *)b; -+ -+ num_longs = lena / sizeof(unsigned long); -+ -+ for (i = num_longs; i--; l1++, l2++) { -+ if (unlikely(*l1 != *l2)) -+ return 0; -+ } -+ -+ c1 = (unsigned char *) l1; -+ c2 = (unsigned char *) l2; -+ -+ i = lena - (num_longs * 
sizeof(unsigned long)); -+ -+ for (; i--; c1++, c2++) { -+ if (unlikely(*c1 != *c2)) -+ return 0; -+ } -+ -+ return 1; -+} -+ -+static char * __our_d_path(struct dentry *dentry, struct vfsmount *vfsmnt, -+ struct dentry *root, struct vfsmount *rootmnt, -+ char *buffer, int buflen) -+{ -+ char * end = buffer+buflen; -+ char * retval; -+ int namelen; -+ -+ *--end = '\0'; -+ buflen--; -+ -+ if (buflen < 1) -+ goto Elong; -+ /* Get '/' right */ -+ retval = end-1; -+ *retval = '/'; -+ -+ for (;;) { -+ struct dentry * parent; -+ -+ if (dentry == root && vfsmnt == rootmnt) -+ break; -+ if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { -+ /* Global root? */ -+ spin_lock(&vfsmount_lock); -+ if (vfsmnt->mnt_parent == vfsmnt) { -+ spin_unlock(&vfsmount_lock); -+ goto global_root; -+ } -+ dentry = vfsmnt->mnt_mountpoint; -+ vfsmnt = vfsmnt->mnt_parent; -+ spin_unlock(&vfsmount_lock); -+ continue; -+ } -+ parent = dentry->d_parent; -+ prefetch(parent); -+ namelen = dentry->d_name.len; -+ buflen -= namelen + 1; -+ if (buflen < 0) -+ goto Elong; -+ end -= namelen; -+ memcpy(end, dentry->d_name.name, namelen); -+ *--end = '/'; -+ retval = end; -+ dentry = parent; -+ } -+ -+ return retval; -+ -+global_root: -+ namelen = dentry->d_name.len; -+ buflen -= namelen; -+ if (buflen < 0) -+ goto Elong; -+ retval -= namelen-1; /* hit the slash */ -+ memcpy(retval, dentry->d_name.name, namelen); -+ return retval; -+Elong: -+ return ERR_PTR(-ENAMETOOLONG); -+} -+ -+static char * -+gen_full_path(struct dentry *dentry, struct vfsmount *vfsmnt, -+ struct dentry *root, struct vfsmount *rootmnt, char *buf, int buflen) -+{ -+ char *retval; -+ -+ retval = __our_d_path(dentry, vfsmnt, root, rootmnt, buf, buflen); -+ if (unlikely(IS_ERR(retval))) -+ retval = strcpy(buf, "<path too long>"); -+ else if (unlikely(retval[1] == '/' && retval[2] == '\0')) -+ retval[1] = '\0'; -+ -+ return retval; -+} -+ -+static char * -+__d_real_path(const struct dentry *dentry, const struct vfsmount *vfsmnt, -+ char *buf, int buflen) -+{ -+ char *res; -+ -+ /* we can use real_root, real_root_mnt, because this is only called -+ by the RBAC system */ -+ res = gen_full_path((struct dentry *)dentry, (struct vfsmount *)vfsmnt, real_root, real_root_mnt, buf, buflen); -+ -+ return res; -+} -+ -+static char * -+d_real_path(const struct dentry *dentry, const struct vfsmount *vfsmnt, -+ char *buf, int buflen) -+{ -+ char *res; -+ struct dentry *root; -+ struct vfsmount *rootmnt; -+ struct task_struct *reaper = &init_task; -+ -+ /* we can't use real_root, real_root_mnt, because they belong only to the RBAC system */ -+ read_lock(&reaper->fs->lock); -+ root = dget(reaper->fs->root.dentry); -+ rootmnt = mntget(reaper->fs->root.mnt); -+ read_unlock(&reaper->fs->lock); -+ -+ spin_lock(&dcache_lock); -+ res = gen_full_path((struct dentry *)dentry, (struct vfsmount *)vfsmnt, root, rootmnt, buf, buflen); -+ spin_unlock(&dcache_lock); -+ -+ dput(root); -+ mntput(rootmnt); -+ return res; -+} -+ -+static char * -+gr_to_filename_rbac(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ char *ret; -+ spin_lock(&dcache_lock); -+ ret = __d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0],smp_processor_id()), -+ PAGE_SIZE); -+ spin_unlock(&dcache_lock); -+ return ret; -+} -+ -+char * -+gr_to_filename_nolock(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return __d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0],smp_processor_id()), -+ PAGE_SIZE); -+} -+ -+char * -+gr_to_filename(const struct dentry *dentry, const struct vfsmount 
*mnt) -+{ -+ return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0], smp_processor_id()), -+ PAGE_SIZE); -+} -+ -+char * -+gr_to_filename1(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[1], smp_processor_id()), -+ PAGE_SIZE); -+} -+ -+char * -+gr_to_filename2(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[2], smp_processor_id()), -+ PAGE_SIZE); -+} -+ -+char * -+gr_to_filename3(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[3], smp_processor_id()), -+ PAGE_SIZE); -+} -+ -+__inline__ __u32 -+to_gr_audit(const __u32 reqmode) -+{ -+ /* masks off auditable permission flags, then shifts them to create -+ auditing flags, and adds the special case of append auditing if -+ we're requesting write */ -+ return (((reqmode & ~GR_AUDITS) << 10) | ((reqmode & GR_WRITE) ? GR_AUDIT_APPEND : 0)); -+} -+ -+struct acl_subject_label * -+lookup_subject_map(const struct acl_subject_label *userp) -+{ -+ unsigned int index = shash(userp, subj_map_set.s_size); -+ struct subject_map *match; -+ -+ match = subj_map_set.s_hash[index]; -+ -+ while (match && match->user != userp) -+ match = match->next; -+ -+ if (match != NULL) -+ return match->kernel; -+ else -+ return NULL; -+} -+ -+static void -+insert_subj_map_entry(struct subject_map *subjmap) -+{ -+ unsigned int index = shash(subjmap->user, subj_map_set.s_size); -+ struct subject_map **curr; -+ -+ subjmap->prev = NULL; -+ -+ curr = &subj_map_set.s_hash[index]; -+ if (*curr != NULL) -+ (*curr)->prev = subjmap; -+ -+ subjmap->next = *curr; -+ *curr = subjmap; -+ -+ return; -+} -+ -+static struct acl_role_label * -+lookup_acl_role_label(const struct task_struct *task, const uid_t uid, -+ const gid_t gid) -+{ -+ unsigned int index = rhash(uid, GR_ROLE_USER, acl_role_set.r_size); -+ struct acl_role_label *match; -+ struct role_allowed_ip *ipp; -+ unsigned int x; -+ -+ match = acl_role_set.r_hash[index]; -+ -+ while (match) { -+ if ((match->roletype & (GR_ROLE_DOMAIN | GR_ROLE_USER)) == (GR_ROLE_DOMAIN | GR_ROLE_USER)) { -+ for (x = 0; x < match->domain_child_num; x++) { -+ if (match->domain_children[x] == uid) -+ goto found; -+ } -+ } else if (match->uidgid == uid && match->roletype & GR_ROLE_USER) -+ break; -+ match = match->next; -+ } -+found: -+ if (match == NULL) { -+ try_group: -+ index = rhash(gid, GR_ROLE_GROUP, acl_role_set.r_size); -+ match = acl_role_set.r_hash[index]; -+ -+ while (match) { -+ if ((match->roletype & (GR_ROLE_DOMAIN | GR_ROLE_GROUP)) == (GR_ROLE_DOMAIN | GR_ROLE_GROUP)) { -+ for (x = 0; x < match->domain_child_num; x++) { -+ if (match->domain_children[x] == gid) -+ goto found2; -+ } -+ } else if (match->uidgid == gid && match->roletype & GR_ROLE_GROUP) -+ break; -+ match = match->next; -+ } -+found2: -+ if (match == NULL) -+ match = default_role; -+ if (match->allowed_ips == NULL) -+ return match; -+ else { -+ for (ipp = match->allowed_ips; ipp; ipp = ipp->next) { -+ if (likely -+ ((ntohl(task->signal->curr_ip) & ipp->netmask) == -+ (ntohl(ipp->addr) & ipp->netmask))) -+ return match; -+ } -+ match = default_role; -+ } -+ } else if (match->allowed_ips == NULL) { -+ return match; -+ } else { -+ for (ipp = match->allowed_ips; ipp; ipp = ipp->next) { -+ if (likely -+ ((ntohl(task->signal->curr_ip) & ipp->netmask) == -+ (ntohl(ipp->addr) & ipp->netmask))) -+ return match; -+ } -+ goto try_group; -+ } -+ -+ return 
match; -+} -+ -+struct acl_subject_label * -+lookup_acl_subj_label(const ino_t ino, const dev_t dev, -+ const struct acl_role_label *role) -+{ -+ unsigned int index = fhash(ino, dev, role->subj_hash_size); -+ struct acl_subject_label *match; -+ -+ match = role->subj_hash[index]; -+ -+ while (match && (match->inode != ino || match->device != dev || -+ (match->mode & GR_DELETED))) { -+ match = match->next; -+ } -+ -+ if (match && !(match->mode & GR_DELETED)) -+ return match; -+ else -+ return NULL; -+} -+ -+struct acl_subject_label * -+lookup_acl_subj_label_deleted(const ino_t ino, const dev_t dev, -+ const struct acl_role_label *role) -+{ -+ unsigned int index = fhash(ino, dev, role->subj_hash_size); -+ struct acl_subject_label *match; -+ -+ match = role->subj_hash[index]; -+ -+ while (match && (match->inode != ino || match->device != dev || -+ !(match->mode & GR_DELETED))) { -+ match = match->next; -+ } -+ -+ if (match && (match->mode & GR_DELETED)) -+ return match; -+ else -+ return NULL; -+} -+ -+static struct acl_object_label * -+lookup_acl_obj_label(const ino_t ino, const dev_t dev, -+ const struct acl_subject_label *subj) -+{ -+ unsigned int index = fhash(ino, dev, subj->obj_hash_size); -+ struct acl_object_label *match; -+ -+ match = subj->obj_hash[index]; -+ -+ while (match && (match->inode != ino || match->device != dev || -+ (match->mode & GR_DELETED))) { -+ match = match->next; -+ } -+ -+ if (match && !(match->mode & GR_DELETED)) -+ return match; -+ else -+ return NULL; -+} -+ -+static struct acl_object_label * -+lookup_acl_obj_label_create(const ino_t ino, const dev_t dev, -+ const struct acl_subject_label *subj) -+{ -+ unsigned int index = fhash(ino, dev, subj->obj_hash_size); -+ struct acl_object_label *match; -+ -+ match = subj->obj_hash[index]; -+ -+ while (match && (match->inode != ino || match->device != dev || -+ !(match->mode & GR_DELETED))) { -+ match = match->next; -+ } -+ -+ if (match && (match->mode & GR_DELETED)) -+ return match; -+ -+ match = subj->obj_hash[index]; -+ -+ while (match && (match->inode != ino || match->device != dev || -+ (match->mode & GR_DELETED))) { -+ match = match->next; -+ } -+ -+ if (match && !(match->mode & GR_DELETED)) -+ return match; -+ else -+ return NULL; -+} -+ -+static struct name_entry * -+lookup_name_entry(const char *name) -+{ -+ unsigned int len = strlen(name); -+ unsigned int key = full_name_hash(name, len); -+ unsigned int index = key % name_set.n_size; -+ struct name_entry *match; -+ -+ match = name_set.n_hash[index]; -+ -+ while (match && (match->key != key || !gr_streq(match->name, name, match->len, len))) -+ match = match->next; -+ -+ return match; -+} -+ -+static struct name_entry * -+lookup_name_entry_create(const char *name) -+{ -+ unsigned int len = strlen(name); -+ unsigned int key = full_name_hash(name, len); -+ unsigned int index = key % name_set.n_size; -+ struct name_entry *match; -+ -+ match = name_set.n_hash[index]; -+ -+ while (match && (match->key != key || !gr_streq(match->name, name, match->len, len) || -+ !match->deleted)) -+ match = match->next; -+ -+ if (match && match->deleted) -+ return match; -+ -+ match = name_set.n_hash[index]; -+ -+ while (match && (match->key != key || !gr_streq(match->name, name, match->len, len) || -+ match->deleted)) -+ match = match->next; -+ -+ if (match && !match->deleted) -+ return match; -+ else -+ return NULL; -+} -+ -+static struct inodev_entry * -+lookup_inodev_entry(const ino_t ino, const dev_t dev) -+{ -+ unsigned int index = fhash(ino, dev, inodev_set.i_size); -+ struct 
inodev_entry *match; -+ -+ match = inodev_set.i_hash[index]; -+ -+ while (match && (match->nentry->inode != ino || match->nentry->device != dev)) -+ match = match->next; -+ -+ return match; -+} -+ -+static void -+insert_inodev_entry(struct inodev_entry *entry) -+{ -+ unsigned int index = fhash(entry->nentry->inode, entry->nentry->device, -+ inodev_set.i_size); -+ struct inodev_entry **curr; -+ -+ entry->prev = NULL; -+ -+ curr = &inodev_set.i_hash[index]; -+ if (*curr != NULL) -+ (*curr)->prev = entry; -+ -+ entry->next = *curr; -+ *curr = entry; -+ -+ return; -+} -+ -+static void -+__insert_acl_role_label(struct acl_role_label *role, uid_t uidgid) -+{ -+ unsigned int index = -+ rhash(uidgid, role->roletype & (GR_ROLE_USER | GR_ROLE_GROUP), acl_role_set.r_size); -+ struct acl_role_label **curr; -+ -+ role->prev = NULL; -+ -+ curr = &acl_role_set.r_hash[index]; -+ if (*curr != NULL) -+ (*curr)->prev = role; -+ -+ role->next = *curr; -+ *curr = role; -+ -+ return; -+} -+ -+static void -+insert_acl_role_label(struct acl_role_label *role) -+{ -+ int i; -+ -+ if (role->roletype & GR_ROLE_DOMAIN) { -+ for (i = 0; i < role->domain_child_num; i++) -+ __insert_acl_role_label(role, role->domain_children[i]); -+ } else -+ __insert_acl_role_label(role, role->uidgid); -+} -+ -+static int -+insert_name_entry(char *name, const ino_t inode, const dev_t device, __u8 deleted) -+{ -+ struct name_entry **curr, *nentry; -+ struct inodev_entry *ientry; -+ unsigned int len = strlen(name); -+ unsigned int key = full_name_hash(name, len); -+ unsigned int index = key % name_set.n_size; -+ -+ curr = &name_set.n_hash[index]; -+ -+ while (*curr && ((*curr)->key != key || !gr_streq((*curr)->name, name, (*curr)->len, len))) -+ curr = &((*curr)->next); -+ -+ if (*curr != NULL) -+ return 1; -+ -+ nentry = acl_alloc(sizeof (struct name_entry)); -+ if (nentry == NULL) -+ return 0; -+ ientry = acl_alloc(sizeof (struct inodev_entry)); -+ if (ientry == NULL) -+ return 0; -+ ientry->nentry = nentry; -+ -+ nentry->key = key; -+ nentry->name = name; -+ nentry->inode = inode; -+ nentry->device = device; -+ nentry->len = len; -+ nentry->deleted = deleted; -+ -+ nentry->prev = NULL; -+ curr = &name_set.n_hash[index]; -+ if (*curr != NULL) -+ (*curr)->prev = nentry; -+ nentry->next = *curr; -+ *curr = nentry; -+ -+ /* insert us into the table searchable by inode/dev */ -+ insert_inodev_entry(ientry); -+ -+ return 1; -+} -+ -+static void -+insert_acl_obj_label(struct acl_object_label *obj, -+ struct acl_subject_label *subj) -+{ -+ unsigned int index = -+ fhash(obj->inode, obj->device, subj->obj_hash_size); -+ struct acl_object_label **curr; -+ -+ -+ obj->prev = NULL; -+ -+ curr = &subj->obj_hash[index]; -+ if (*curr != NULL) -+ (*curr)->prev = obj; -+ -+ obj->next = *curr; -+ *curr = obj; -+ -+ return; -+} -+ -+static void -+insert_acl_subj_label(struct acl_subject_label *obj, -+ struct acl_role_label *role) -+{ -+ unsigned int index = fhash(obj->inode, obj->device, role->subj_hash_size); -+ struct acl_subject_label **curr; -+ -+ obj->prev = NULL; -+ -+ curr = &role->subj_hash[index]; -+ if (*curr != NULL) -+ (*curr)->prev = obj; -+ -+ obj->next = *curr; -+ *curr = obj; -+ -+ return; -+} -+ -+/* allocating chained hash tables, so optimal size is where lambda ~ 1 */ -+ -+static void * -+create_table(__u32 * len, int elementsize) -+{ -+ unsigned int table_sizes[] = { -+ 7, 13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, -+ 32749, 65521, 131071, 262139, 524287, 1048573, 2097143, -+ 4194301, 8388593, 16777213, 33554393, 67108859 
-+ }; -+ void *newtable = NULL; -+ unsigned int pwr = 0; -+ -+ while ((pwr < ((sizeof (table_sizes) / sizeof (table_sizes[0])) - 1)) && -+ table_sizes[pwr] <= *len) -+ pwr++; -+ -+ if (table_sizes[pwr] <= *len || (table_sizes[pwr] > ULONG_MAX / elementsize)) -+ return newtable; -+ -+ if ((table_sizes[pwr] * elementsize) <= PAGE_SIZE) -+ newtable = -+ kmalloc(table_sizes[pwr] * elementsize, GFP_KERNEL); -+ else -+ newtable = vmalloc(table_sizes[pwr] * elementsize); -+ -+ *len = table_sizes[pwr]; -+ -+ return newtable; -+} -+ -+static int -+init_variables(const struct gr_arg *arg) -+{ -+ struct task_struct *reaper = &init_task; -+ unsigned int stacksize; -+ -+ subj_map_set.s_size = arg->role_db.num_subjects; -+ acl_role_set.r_size = arg->role_db.num_roles + arg->role_db.num_domain_children; -+ name_set.n_size = arg->role_db.num_objects; -+ inodev_set.i_size = arg->role_db.num_objects; -+ -+ if (!subj_map_set.s_size || !acl_role_set.r_size || -+ !name_set.n_size || !inodev_set.i_size) -+ return 1; -+ -+ if (!gr_init_uidset()) -+ return 1; -+ -+ /* set up the stack that holds allocation info */ -+ -+ stacksize = arg->role_db.num_pointers + 5; -+ -+ if (!acl_alloc_stack_init(stacksize)) -+ return 1; -+ -+ /* grab reference for the real root dentry and vfsmount */ -+ read_lock(&reaper->fs->lock); -+ real_root_mnt = mntget(reaper->fs->root.mnt); -+ real_root = dget(reaper->fs->root.dentry); -+ read_unlock(&reaper->fs->lock); -+ -+ fakefs_obj = acl_alloc(sizeof(struct acl_object_label)); -+ if (fakefs_obj == NULL) -+ return 1; -+ fakefs_obj->mode = GR_FIND | GR_READ | GR_WRITE | GR_EXEC; -+ -+ subj_map_set.s_hash = -+ (struct subject_map **) create_table(&subj_map_set.s_size, sizeof(void *)); -+ acl_role_set.r_hash = -+ (struct acl_role_label **) create_table(&acl_role_set.r_size, sizeof(void *)); -+ name_set.n_hash = (struct name_entry **) create_table(&name_set.n_size, sizeof(void *)); -+ inodev_set.i_hash = -+ (struct inodev_entry **) create_table(&inodev_set.i_size, sizeof(void *)); -+ -+ if (!subj_map_set.s_hash || !acl_role_set.r_hash || -+ !name_set.n_hash || !inodev_set.i_hash) -+ return 1; -+ -+ memset(subj_map_set.s_hash, 0, -+ sizeof(struct subject_map *) * subj_map_set.s_size); -+ memset(acl_role_set.r_hash, 0, -+ sizeof (struct acl_role_label *) * acl_role_set.r_size); -+ memset(name_set.n_hash, 0, -+ sizeof (struct name_entry *) * name_set.n_size); -+ memset(inodev_set.i_hash, 0, -+ sizeof (struct inodev_entry *) * inodev_set.i_size); -+ -+ return 0; -+} -+ -+/* free information not needed after startup -+ currently contains user->kernel pointer mappings for subjects -+*/ -+ -+static void -+free_init_variables(void) -+{ -+ __u32 i; -+ -+ if (subj_map_set.s_hash) { -+ for (i = 0; i < subj_map_set.s_size; i++) { -+ if (subj_map_set.s_hash[i]) { -+ kfree(subj_map_set.s_hash[i]); -+ subj_map_set.s_hash[i] = NULL; -+ } -+ } -+ -+ if ((subj_map_set.s_size * sizeof (struct subject_map *)) <= -+ PAGE_SIZE) -+ kfree(subj_map_set.s_hash); -+ else -+ vfree(subj_map_set.s_hash); -+ } -+ -+ return; -+} -+ -+static void -+free_variables(void) -+{ -+ struct acl_subject_label *s; -+ struct acl_role_label *r; -+ struct task_struct *task, *task2; -+ unsigned int i, x; -+ -+ gr_clear_learn_entries(); -+ -+ read_lock(&tasklist_lock); -+ do_each_thread(task2, task) { -+ task->acl_sp_role = 0; -+ task->acl_role_id = 0; -+ task->acl = NULL; -+ task->role = NULL; -+ } while_each_thread(task2, task); -+ read_unlock(&tasklist_lock); -+ -+ /* release the reference to the real root dentry and vfsmount */ -+ if 
(real_root) -+ dput(real_root); -+ real_root = NULL; -+ if (real_root_mnt) -+ mntput(real_root_mnt); -+ real_root_mnt = NULL; -+ -+ /* free all object hash tables */ -+ -+ FOR_EACH_ROLE_START(r, i) -+ if (r->subj_hash == NULL) -+ break; -+ FOR_EACH_SUBJECT_START(r, s, x) -+ if (s->obj_hash == NULL) -+ break; -+ if ((s->obj_hash_size * sizeof (struct acl_object_label *)) <= PAGE_SIZE) -+ kfree(s->obj_hash); -+ else -+ vfree(s->obj_hash); -+ FOR_EACH_SUBJECT_END(s, x) -+ FOR_EACH_NESTED_SUBJECT_START(r, s) -+ if (s->obj_hash == NULL) -+ break; -+ if ((s->obj_hash_size * sizeof (struct acl_object_label *)) <= PAGE_SIZE) -+ kfree(s->obj_hash); -+ else -+ vfree(s->obj_hash); -+ FOR_EACH_NESTED_SUBJECT_END(s) -+ if ((r->subj_hash_size * sizeof (struct acl_subject_label *)) <= PAGE_SIZE) -+ kfree(r->subj_hash); -+ else -+ vfree(r->subj_hash); -+ r->subj_hash = NULL; -+ FOR_EACH_ROLE_END(r,i) -+ -+ acl_free_all(); -+ -+ if (acl_role_set.r_hash) { -+ if ((acl_role_set.r_size * sizeof (struct acl_role_label *)) <= -+ PAGE_SIZE) -+ kfree(acl_role_set.r_hash); -+ else -+ vfree(acl_role_set.r_hash); -+ } -+ if (name_set.n_hash) { -+ if ((name_set.n_size * sizeof (struct name_entry *)) <= -+ PAGE_SIZE) -+ kfree(name_set.n_hash); -+ else -+ vfree(name_set.n_hash); -+ } -+ -+ if (inodev_set.i_hash) { -+ if ((inodev_set.i_size * sizeof (struct inodev_entry *)) <= -+ PAGE_SIZE) -+ kfree(inodev_set.i_hash); -+ else -+ vfree(inodev_set.i_hash); -+ } -+ -+ gr_free_uidset(); -+ -+ memset(&name_set, 0, sizeof (struct name_db)); -+ memset(&inodev_set, 0, sizeof (struct inodev_db)); -+ memset(&acl_role_set, 0, sizeof (struct acl_role_db)); -+ memset(&subj_map_set, 0, sizeof (struct acl_subj_map_db)); -+ -+ default_role = NULL; -+ -+ return; -+} -+ -+static __u32 -+count_user_objs(struct acl_object_label *userp) -+{ -+ struct acl_object_label o_tmp; -+ __u32 num = 0; -+ -+ while (userp) { -+ if (copy_from_user(&o_tmp, userp, -+ sizeof (struct acl_object_label))) -+ break; -+ -+ userp = o_tmp.prev; -+ num++; -+ } -+ -+ return num; -+} -+ -+static struct acl_subject_label * -+do_copy_user_subj(struct acl_subject_label *userp, struct acl_role_label *role); -+ -+static int -+copy_user_glob(struct acl_object_label *obj) -+{ -+ struct acl_object_label *g_tmp, **guser; -+ unsigned int len; -+ char *tmp; -+ -+ if (obj->globbed == NULL) -+ return 0; -+ -+ guser = &obj->globbed; -+ while (*guser) { -+ g_tmp = (struct acl_object_label *) -+ acl_alloc(sizeof (struct acl_object_label)); -+ if (g_tmp == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(g_tmp, *guser, -+ sizeof (struct acl_object_label))) -+ return -EFAULT; -+ -+ len = strnlen_user(g_tmp->filename, PATH_MAX); -+ -+ if (!len || len >= PATH_MAX) -+ return -EINVAL; -+ -+ if ((tmp = (char *) acl_alloc(len)) == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(tmp, g_tmp->filename, len)) -+ return -EFAULT; -+ tmp[len-1] = '\0'; -+ g_tmp->filename = tmp; -+ -+ *guser = g_tmp; -+ guser = &(g_tmp->next); -+ } -+ -+ return 0; -+} -+ -+static int -+copy_user_objs(struct acl_object_label *userp, struct acl_subject_label *subj, -+ struct acl_role_label *role) -+{ -+ struct acl_object_label *o_tmp; -+ unsigned int len; -+ int ret; -+ char *tmp; -+ -+ while (userp) { -+ if ((o_tmp = (struct acl_object_label *) -+ acl_alloc(sizeof (struct acl_object_label))) == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(o_tmp, userp, -+ sizeof (struct acl_object_label))) -+ return -EFAULT; -+ -+ userp = o_tmp->prev; -+ -+ len = strnlen_user(o_tmp->filename, PATH_MAX); -+ -+ if (!len || 
len >= PATH_MAX) -+ return -EINVAL; -+ -+ if ((tmp = (char *) acl_alloc(len)) == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(tmp, o_tmp->filename, len)) -+ return -EFAULT; -+ tmp[len-1] = '\0'; -+ o_tmp->filename = tmp; -+ -+ insert_acl_obj_label(o_tmp, subj); -+ if (!insert_name_entry(o_tmp->filename, o_tmp->inode, -+ o_tmp->device, (o_tmp->mode & GR_DELETED) ? 1 : 0)) -+ return -ENOMEM; -+ -+ ret = copy_user_glob(o_tmp); -+ if (ret) -+ return ret; -+ -+ if (o_tmp->nested) { -+ o_tmp->nested = do_copy_user_subj(o_tmp->nested, role); -+ if (IS_ERR(o_tmp->nested)) -+ return PTR_ERR(o_tmp->nested); -+ -+ /* insert into nested subject list */ -+ o_tmp->nested->next = role->hash->first; -+ role->hash->first = o_tmp->nested; -+ } -+ } -+ -+ return 0; -+} -+ -+static __u32 -+count_user_subjs(struct acl_subject_label *userp) -+{ -+ struct acl_subject_label s_tmp; -+ __u32 num = 0; -+ -+ while (userp) { -+ if (copy_from_user(&s_tmp, userp, -+ sizeof (struct acl_subject_label))) -+ break; -+ -+ userp = s_tmp.prev; -+ /* do not count nested subjects against this count, since -+ they are not included in the hash table, but are -+ attached to objects. We have already counted -+ the subjects in userspace for the allocation -+ stack -+ */ -+ if (!(s_tmp.mode & GR_NESTED)) -+ num++; -+ } -+ -+ return num; -+} -+ -+static int -+copy_user_allowedips(struct acl_role_label *rolep) -+{ -+ struct role_allowed_ip *ruserip, *rtmp = NULL, *rlast; -+ -+ ruserip = rolep->allowed_ips; -+ -+ while (ruserip) { -+ rlast = rtmp; -+ -+ if ((rtmp = (struct role_allowed_ip *) -+ acl_alloc(sizeof (struct role_allowed_ip))) == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(rtmp, ruserip, -+ sizeof (struct role_allowed_ip))) -+ return -EFAULT; -+ -+ ruserip = rtmp->prev; -+ -+ if (!rlast) { -+ rtmp->prev = NULL; -+ rolep->allowed_ips = rtmp; -+ } else { -+ rlast->next = rtmp; -+ rtmp->prev = rlast; -+ } -+ -+ if (!ruserip) -+ rtmp->next = NULL; -+ } -+ -+ return 0; -+} -+ -+static int -+copy_user_transitions(struct acl_role_label *rolep) -+{ -+ struct role_transition *rusertp, *rtmp = NULL, *rlast; -+ -+ unsigned int len; -+ char *tmp; -+ -+ rusertp = rolep->transitions; -+ -+ while (rusertp) { -+ rlast = rtmp; -+ -+ if ((rtmp = (struct role_transition *) -+ acl_alloc(sizeof (struct role_transition))) == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(rtmp, rusertp, -+ sizeof (struct role_transition))) -+ return -EFAULT; -+ -+ rusertp = rtmp->prev; -+ -+ len = strnlen_user(rtmp->rolename, GR_SPROLE_LEN); -+ -+ if (!len || len >= GR_SPROLE_LEN) -+ return -EINVAL; -+ -+ if ((tmp = (char *) acl_alloc(len)) == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(tmp, rtmp->rolename, len)) -+ return -EFAULT; -+ tmp[len-1] = '\0'; -+ rtmp->rolename = tmp; -+ -+ if (!rlast) { -+ rtmp->prev = NULL; -+ rolep->transitions = rtmp; -+ } else { -+ rlast->next = rtmp; -+ rtmp->prev = rlast; -+ } -+ -+ if (!rusertp) -+ rtmp->next = NULL; -+ } -+ -+ return 0; -+} -+ -+static struct acl_subject_label * -+do_copy_user_subj(struct acl_subject_label *userp, struct acl_role_label *role) -+{ -+ struct acl_subject_label *s_tmp = NULL, *s_tmp2; -+ unsigned int len; -+ char *tmp; -+ __u32 num_objs; -+ struct acl_ip_label **i_tmp, *i_utmp2; -+ struct gr_hash_struct ghash; -+ struct subject_map *subjmap; -+ unsigned int i_num; -+ int err; -+ -+ s_tmp = lookup_subject_map(userp); -+ -+ /* we've already copied this subject into the kernel, just return -+ the reference to it, and don't copy it over again -+ */ -+ if (s_tmp) -+ return(s_tmp); -+ -+ 
if ((s_tmp = (struct acl_subject_label *) -+ acl_alloc(sizeof (struct acl_subject_label))) == NULL) -+ return ERR_PTR(-ENOMEM); -+ -+ subjmap = (struct subject_map *)kmalloc(sizeof (struct subject_map), GFP_KERNEL); -+ if (subjmap == NULL) -+ return ERR_PTR(-ENOMEM); -+ -+ subjmap->user = userp; -+ subjmap->kernel = s_tmp; -+ insert_subj_map_entry(subjmap); -+ -+ if (copy_from_user(s_tmp, userp, -+ sizeof (struct acl_subject_label))) -+ return ERR_PTR(-EFAULT); -+ -+ len = strnlen_user(s_tmp->filename, PATH_MAX); -+ -+ if (!len || len >= PATH_MAX) -+ return ERR_PTR(-EINVAL); -+ -+ if ((tmp = (char *) acl_alloc(len)) == NULL) -+ return ERR_PTR(-ENOMEM); -+ -+ if (copy_from_user(tmp, s_tmp->filename, len)) -+ return ERR_PTR(-EFAULT); -+ tmp[len-1] = '\0'; -+ s_tmp->filename = tmp; -+ -+ if (!strcmp(s_tmp->filename, "/")) -+ role->root_label = s_tmp; -+ -+ if (copy_from_user(&ghash, s_tmp->hash, sizeof(struct gr_hash_struct))) -+ return ERR_PTR(-EFAULT); -+ -+ /* copy user and group transition tables */ -+ -+ if (s_tmp->user_trans_num) { -+ uid_t *uidlist; -+ -+ uidlist = (uid_t *)acl_alloc_num(s_tmp->user_trans_num, sizeof(uid_t)); -+ if (uidlist == NULL) -+ return ERR_PTR(-ENOMEM); -+ if (copy_from_user(uidlist, s_tmp->user_transitions, s_tmp->user_trans_num * sizeof(uid_t))) -+ return ERR_PTR(-EFAULT); -+ -+ s_tmp->user_transitions = uidlist; -+ } -+ -+ if (s_tmp->group_trans_num) { -+ gid_t *gidlist; -+ -+ gidlist = (gid_t *)acl_alloc_num(s_tmp->group_trans_num, sizeof(gid_t)); -+ if (gidlist == NULL) -+ return ERR_PTR(-ENOMEM); -+ if (copy_from_user(gidlist, s_tmp->group_transitions, s_tmp->group_trans_num * sizeof(gid_t))) -+ return ERR_PTR(-EFAULT); -+ -+ s_tmp->group_transitions = gidlist; -+ } -+ -+ /* set up object hash table */ -+ num_objs = count_user_objs(ghash.first); -+ -+ s_tmp->obj_hash_size = num_objs; -+ s_tmp->obj_hash = -+ (struct acl_object_label **) -+ create_table(&(s_tmp->obj_hash_size), sizeof(void *)); -+ -+ if (!s_tmp->obj_hash) -+ return ERR_PTR(-ENOMEM); -+ -+ memset(s_tmp->obj_hash, 0, -+ s_tmp->obj_hash_size * -+ sizeof (struct acl_object_label *)); -+ -+ /* add in objects */ -+ err = copy_user_objs(ghash.first, s_tmp, role); -+ -+ if (err) -+ return ERR_PTR(err); -+ -+ /* set pointer for parent subject */ -+ if (s_tmp->parent_subject) { -+ s_tmp2 = do_copy_user_subj(s_tmp->parent_subject, role); -+ -+ if (IS_ERR(s_tmp2)) -+ return s_tmp2; -+ -+ s_tmp->parent_subject = s_tmp2; -+ } -+ -+ /* add in ip acls */ -+ -+ if (!s_tmp->ip_num) { -+ s_tmp->ips = NULL; -+ goto insert; -+ } -+ -+ i_tmp = -+ (struct acl_ip_label **) acl_alloc_num(s_tmp->ip_num, -+ sizeof (struct acl_ip_label *)); -+ -+ if (!i_tmp) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i_num = 0; i_num < s_tmp->ip_num; i_num++) { -+ *(i_tmp + i_num) = -+ (struct acl_ip_label *) -+ acl_alloc(sizeof (struct acl_ip_label)); -+ if (!*(i_tmp + i_num)) -+ return ERR_PTR(-ENOMEM); -+ -+ if (copy_from_user -+ (&i_utmp2, s_tmp->ips + i_num, -+ sizeof (struct acl_ip_label *))) -+ return ERR_PTR(-EFAULT); -+ -+ if (copy_from_user -+ (*(i_tmp + i_num), i_utmp2, -+ sizeof (struct acl_ip_label))) -+ return ERR_PTR(-EFAULT); -+ -+ if ((*(i_tmp + i_num))->iface == NULL) -+ continue; -+ -+ len = strnlen_user((*(i_tmp + i_num))->iface, IFNAMSIZ); -+ if (!len || len >= IFNAMSIZ) -+ return ERR_PTR(-EINVAL); -+ tmp = acl_alloc(len); -+ if (tmp == NULL) -+ return ERR_PTR(-ENOMEM); -+ if (copy_from_user(tmp, (*(i_tmp + i_num))->iface, len)) -+ return ERR_PTR(-EFAULT); -+ (*(i_tmp + i_num))->iface = tmp; -+ } -+ -+ s_tmp->ips = 
i_tmp; -+ -+insert: -+ if (!insert_name_entry(s_tmp->filename, s_tmp->inode, -+ s_tmp->device, (s_tmp->mode & GR_DELETED) ? 1 : 0)) -+ return ERR_PTR(-ENOMEM); -+ -+ return s_tmp; -+} -+ -+static int -+copy_user_subjs(struct acl_subject_label *userp, struct acl_role_label *role) -+{ -+ struct acl_subject_label s_pre; -+ struct acl_subject_label * ret; -+ int err; -+ -+ while (userp) { -+ if (copy_from_user(&s_pre, userp, -+ sizeof (struct acl_subject_label))) -+ return -EFAULT; -+ -+ /* do not add nested subjects here, add -+ while parsing objects -+ */ -+ -+ if (s_pre.mode & GR_NESTED) { -+ userp = s_pre.prev; -+ continue; -+ } -+ -+ ret = do_copy_user_subj(userp, role); -+ -+ err = PTR_ERR(ret); -+ if (IS_ERR(ret)) -+ return err; -+ -+ insert_acl_subj_label(ret, role); -+ -+ userp = s_pre.prev; -+ } -+ -+ return 0; -+} -+ -+static int -+copy_user_acl(struct gr_arg *arg) -+{ -+ struct acl_role_label *r_tmp = NULL, **r_utmp, *r_utmp2; -+ struct sprole_pw *sptmp; -+ struct gr_hash_struct *ghash; -+ uid_t *domainlist; -+ unsigned int r_num; -+ unsigned int len; -+ char *tmp; -+ int err = 0; -+ __u16 i; -+ __u32 num_subjs; -+ -+ /* we need a default and kernel role */ -+ if (arg->role_db.num_roles < 2) -+ return -EINVAL; -+ -+ /* copy special role authentication info from userspace */ -+ -+ num_sprole_pws = arg->num_sprole_pws; -+ acl_special_roles = (struct sprole_pw **) acl_alloc_num(num_sprole_pws, sizeof(struct sprole_pw *)); -+ -+ if (!acl_special_roles) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ -+ for (i = 0; i < num_sprole_pws; i++) { -+ sptmp = (struct sprole_pw *) acl_alloc(sizeof(struct sprole_pw)); -+ if (!sptmp) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ if (copy_from_user(sptmp, arg->sprole_pws + i, -+ sizeof (struct sprole_pw))) { -+ err = -EFAULT; -+ goto cleanup; -+ } -+ -+ len = -+ strnlen_user(sptmp->rolename, GR_SPROLE_LEN); -+ -+ if (!len || len >= GR_SPROLE_LEN) { -+ err = -EINVAL; -+ goto cleanup; -+ } -+ -+ if ((tmp = (char *) acl_alloc(len)) == NULL) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ -+ if (copy_from_user(tmp, sptmp->rolename, len)) { -+ err = -EFAULT; -+ goto cleanup; -+ } -+ tmp[len-1] = '\0'; -+#ifdef CONFIG_GRKERNSEC_ACL_DEBUG -+ printk(KERN_ALERT "Copying special role %s\n", tmp); -+#endif -+ sptmp->rolename = tmp; -+ acl_special_roles[i] = sptmp; -+ } -+ -+ r_utmp = (struct acl_role_label **) arg->role_db.r_table; -+ -+ for (r_num = 0; r_num < arg->role_db.num_roles; r_num++) { -+ r_tmp = acl_alloc(sizeof (struct acl_role_label)); -+ -+ if (!r_tmp) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ -+ if (copy_from_user(&r_utmp2, r_utmp + r_num, -+ sizeof (struct acl_role_label *))) { -+ err = -EFAULT; -+ goto cleanup; -+ } -+ -+ if (copy_from_user(r_tmp, r_utmp2, -+ sizeof (struct acl_role_label))) { -+ err = -EFAULT; -+ goto cleanup; -+ } -+ -+ len = strnlen_user(r_tmp->rolename, GR_SPROLE_LEN); -+ -+ if (!len || len >= PATH_MAX) { -+ err = -EINVAL; -+ goto cleanup; -+ } -+ -+ if ((tmp = (char *) acl_alloc(len)) == NULL) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ if (copy_from_user(tmp, r_tmp->rolename, len)) { -+ err = -EFAULT; -+ goto cleanup; -+ } -+ tmp[len-1] = '\0'; -+ r_tmp->rolename = tmp; -+ -+ if (!strcmp(r_tmp->rolename, "default") -+ && (r_tmp->roletype & GR_ROLE_DEFAULT)) { -+ default_role = r_tmp; -+ } else if (!strcmp(r_tmp->rolename, ":::kernel:::")) { -+ kernel_role = r_tmp; -+ } -+ -+ if ((ghash = (struct gr_hash_struct *) acl_alloc(sizeof(struct gr_hash_struct))) == NULL) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ if 
(copy_from_user(ghash, r_tmp->hash, sizeof(struct gr_hash_struct))) { -+ err = -EFAULT; -+ goto cleanup; -+ } -+ -+ r_tmp->hash = ghash; -+ -+ num_subjs = count_user_subjs(r_tmp->hash->first); -+ -+ r_tmp->subj_hash_size = num_subjs; -+ r_tmp->subj_hash = -+ (struct acl_subject_label **) -+ create_table(&(r_tmp->subj_hash_size), sizeof(void *)); -+ -+ if (!r_tmp->subj_hash) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ -+ err = copy_user_allowedips(r_tmp); -+ if (err) -+ goto cleanup; -+ -+ /* copy domain info */ -+ if (r_tmp->domain_children != NULL) { -+ domainlist = acl_alloc_num(r_tmp->domain_child_num, sizeof(uid_t)); -+ if (domainlist == NULL) { -+ err = -ENOMEM; -+ goto cleanup; -+ } -+ if (copy_from_user(domainlist, r_tmp->domain_children, r_tmp->domain_child_num * sizeof(uid_t))) { -+ err = -EFAULT; -+ goto cleanup; -+ } -+ r_tmp->domain_children = domainlist; -+ } -+ -+ err = copy_user_transitions(r_tmp); -+ if (err) -+ goto cleanup; -+ -+ memset(r_tmp->subj_hash, 0, -+ r_tmp->subj_hash_size * -+ sizeof (struct acl_subject_label *)); -+ -+ err = copy_user_subjs(r_tmp->hash->first, r_tmp); -+ -+ if (err) -+ goto cleanup; -+ -+ /* set nested subject list to null */ -+ r_tmp->hash->first = NULL; -+ -+ insert_acl_role_label(r_tmp); -+ } -+ -+ goto return_err; -+ cleanup: -+ free_variables(); -+ return_err: -+ return err; -+ -+} -+ -+static int -+gracl_init(struct gr_arg *args) -+{ -+ int error = 0; -+ -+ memcpy(gr_system_salt, args->salt, GR_SALT_LEN); -+ memcpy(gr_system_sum, args->sum, GR_SHA_LEN); -+ -+ if (init_variables(args)) { -+ gr_log_str(GR_DONT_AUDIT_GOOD, GR_INITF_ACL_MSG, GR_VERSION); -+ error = -ENOMEM; -+ free_variables(); -+ goto out; -+ } -+ -+ error = copy_user_acl(args); -+ free_init_variables(); -+ if (error) { -+ free_variables(); -+ goto out; -+ } -+ -+ if ((error = gr_set_acls(0))) { -+ free_variables(); -+ goto out; -+ } -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ { -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+ gr_status |= GR_READY; -+ pax_close_kernel(cr0); -+ } -+#else -+ gr_status |= GR_READY; -+#endif -+ -+ out: -+ return error; -+} -+ -+/* derived from glibc fnmatch() 0: match, 1: no match*/ -+ -+static int -+glob_match(const char *p, const char *n) -+{ -+ char c; -+ -+ while ((c = *p++) != '\0') { -+ switch (c) { -+ case '?': -+ if (*n == '\0') -+ return 1; -+ else if (*n == '/') -+ return 1; -+ break; -+ case '\\': -+ if (*n != c) -+ return 1; -+ break; -+ case '*': -+ for (c = *p++; c == '?' || c == '*'; c = *p++) { -+ if (*n == '/') -+ return 1; -+ else if (c == '?') { -+ if (*n == '\0') -+ return 1; -+ else -+ ++n; -+ } -+ } -+ if (c == '\0') { -+ return 0; -+ } else { -+ const char *endp; -+ -+ if ((endp = strchr(n, '/')) == NULL) -+ endp = n + strlen(n); -+ -+ if (c == '[') { -+ for (--p; n < endp; ++n) -+ if (!glob_match(p, n)) -+ return 0; -+ } else if (c == '/') { -+ while (*n != '\0' && *n != '/') -+ ++n; -+ if (*n == '/' && !glob_match(p, n + 1)) -+ return 0; -+ } else { -+ for (--p; n < endp; ++n) -+ if (*n == c && !glob_match(p, n)) -+ return 0; -+ } -+ -+ return 1; -+ } -+ case '[': -+ { -+ int not; -+ char cold; -+ -+ if (*n == '\0' || *n == '/') -+ return 1; -+ -+ not = (*p == '!'
|| *p == '^'); -+ if (not) -+ ++p; -+ -+ c = *p++; -+ for (;;) { -+ unsigned char fn = (unsigned char)*n; -+ -+ if (c == '\0') -+ return 1; -+ else { -+ if (c == fn) -+ goto matched; -+ cold = c; -+ c = *p++; -+ -+ if (c == '-' && *p != ']') { -+ unsigned char cend = *p++; -+ -+ if (cend == '\0') -+ return 1; -+ -+ if (cold <= fn && fn <= cend) -+ goto matched; -+ -+ c = *p++; -+ } -+ } -+ -+ if (c == ']') -+ break; -+ } -+ if (!not) -+ return 1; -+ break; -+ matched: -+ while (c != ']') { -+ if (c == '\0') -+ return 1; -+ -+ c = *p++; -+ } -+ if (not) -+ return 1; -+ } -+ break; -+ default: -+ if (c != *n) -+ return 1; -+ } -+ -+ ++n; -+ } -+ -+ if (*n == '\0') -+ return 0; -+ -+ if (*n == '/') -+ return 0; -+ -+ return 1; -+} -+ -+static struct acl_object_label * -+chk_glob_label(struct acl_object_label *globbed, -+ struct dentry *dentry, struct vfsmount *mnt, char **path) -+{ -+ struct acl_object_label *tmp; -+ -+ if (*path == NULL) -+ *path = gr_to_filename_nolock(dentry, mnt); -+ -+ tmp = globbed; -+ -+ while (tmp) { -+ if (!glob_match(tmp->filename, *path)) -+ return tmp; -+ tmp = tmp->next; -+ } -+ -+ return NULL; -+} -+ -+static struct acl_object_label * -+__full_lookup(const struct dentry *orig_dentry, const struct vfsmount *orig_mnt, -+ const ino_t curr_ino, const dev_t curr_dev, -+ const struct acl_subject_label *subj, char **path, const int checkglob) -+{ -+ struct acl_subject_label *tmpsubj; -+ struct acl_object_label *retval; -+ struct acl_object_label *retval2; -+ -+ tmpsubj = (struct acl_subject_label *) subj; -+ read_lock(&gr_inode_lock); -+ do { -+ retval = lookup_acl_obj_label(curr_ino, curr_dev, tmpsubj); -+ if (retval) { -+ if (checkglob && retval->globbed) { -+ retval2 = chk_glob_label(retval->globbed, (struct dentry *)orig_dentry, -+ (struct vfsmount *)orig_mnt, path); -+ if (retval2) -+ retval = retval2; -+ } -+ break; -+ } -+ } while ((tmpsubj = tmpsubj->parent_subject)); -+ read_unlock(&gr_inode_lock); -+ -+ return retval; -+} -+ -+static __inline__ struct acl_object_label * -+full_lookup(const struct dentry *orig_dentry, const struct vfsmount *orig_mnt, -+ const struct dentry *curr_dentry, -+ const struct acl_subject_label *subj, char **path, const int checkglob) -+{ -+ return __full_lookup(orig_dentry, orig_mnt, -+ curr_dentry->d_inode->i_ino, -+ curr_dentry->d_inode->i_sb->s_dev, subj, path, checkglob); -+} -+ -+static struct acl_object_label * -+__chk_obj_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, -+ const struct acl_subject_label *subj, char *path, const int checkglob) -+{ -+ struct dentry *dentry = (struct dentry *) l_dentry; -+ struct vfsmount *mnt = (struct vfsmount *) l_mnt; -+ struct acl_object_label *retval; -+ -+ spin_lock(&dcache_lock); -+ -+ if (unlikely(mnt == shm_mnt || mnt == pipe_mnt || mnt == sock_mnt || -+ /* ignore Eric Biederman */ -+ IS_PRIVATE(l_dentry->d_inode))) { -+ retval = fakefs_obj; -+ goto out; -+ } -+ -+ for (;;) { -+ if (dentry == real_root && mnt == real_root_mnt) -+ break; -+ -+ if (dentry == mnt->mnt_root || IS_ROOT(dentry)) { -+ if (mnt->mnt_parent == mnt) -+ break; -+ -+ retval = full_lookup(l_dentry, l_mnt, dentry, subj, &path, checkglob); -+ if (retval != NULL) -+ goto out; -+ -+ dentry = mnt->mnt_mountpoint; -+ mnt = mnt->mnt_parent; -+ continue; -+ } -+ -+ retval = full_lookup(l_dentry, l_mnt, dentry, subj, &path, checkglob); -+ if (retval != NULL) -+ goto out; -+ -+ dentry = dentry->d_parent; -+ } -+ -+ retval = full_lookup(l_dentry, l_mnt, dentry, subj, &path, checkglob); -+ -+ if (retval == 
NULL) -+ retval = full_lookup(l_dentry, l_mnt, real_root, subj, &path, checkglob); -+out: -+ spin_unlock(&dcache_lock); -+ return retval; -+} -+ -+static __inline__ struct acl_object_label * -+chk_obj_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, -+ const struct acl_subject_label *subj) -+{ -+ char *path = NULL; -+ return __chk_obj_label(l_dentry, l_mnt, subj, path, 1); -+} -+ -+static __inline__ struct acl_object_label * -+chk_obj_label_noglob(const struct dentry *l_dentry, const struct vfsmount *l_mnt, -+ const struct acl_subject_label *subj) -+{ -+ char *path = NULL; -+ return __chk_obj_label(l_dentry, l_mnt, subj, path, 0); -+} -+ -+static __inline__ struct acl_object_label * -+chk_obj_create_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, -+ const struct acl_subject_label *subj, char *path) -+{ -+ return __chk_obj_label(l_dentry, l_mnt, subj, path, 1); -+} -+ -+static struct acl_subject_label * -+chk_subj_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, -+ const struct acl_role_label *role) -+{ -+ struct dentry *dentry = (struct dentry *) l_dentry; -+ struct vfsmount *mnt = (struct vfsmount *) l_mnt; -+ struct acl_subject_label *retval; -+ -+ spin_lock(&dcache_lock); -+ -+ for (;;) { -+ if (dentry == real_root && mnt == real_root_mnt) -+ break; -+ if (dentry == mnt->mnt_root || IS_ROOT(dentry)) { -+ if (mnt->mnt_parent == mnt) -+ break; -+ -+ read_lock(&gr_inode_lock); -+ retval = -+ lookup_acl_subj_label(dentry->d_inode->i_ino, -+ dentry->d_inode->i_sb->s_dev, role); -+ read_unlock(&gr_inode_lock); -+ if (retval != NULL) -+ goto out; -+ -+ dentry = mnt->mnt_mountpoint; -+ mnt = mnt->mnt_parent; -+ continue; -+ } -+ -+ read_lock(&gr_inode_lock); -+ retval = lookup_acl_subj_label(dentry->d_inode->i_ino, -+ dentry->d_inode->i_sb->s_dev, role); -+ read_unlock(&gr_inode_lock); -+ if (retval != NULL) -+ goto out; -+ -+ dentry = dentry->d_parent; -+ } -+ -+ read_lock(&gr_inode_lock); -+ retval = lookup_acl_subj_label(dentry->d_inode->i_ino, -+ dentry->d_inode->i_sb->s_dev, role); -+ read_unlock(&gr_inode_lock); -+ -+ if (unlikely(retval == NULL)) { -+ read_lock(&gr_inode_lock); -+ retval = lookup_acl_subj_label(real_root->d_inode->i_ino, -+ real_root->d_inode->i_sb->s_dev, role); -+ read_unlock(&gr_inode_lock); -+ } -+out: -+ spin_unlock(&dcache_lock); -+ -+ return retval; -+} -+ -+static void -+gr_log_learn(const struct dentry *dentry, const struct vfsmount *mnt, const __u32 mode) -+{ -+ struct task_struct *task = current; -+ const struct cred *cred = current_cred(); -+ -+ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, task->role->roletype, -+ cred->uid, cred->gid, task->exec_file ? gr_to_filename1(task->exec_file->f_path.dentry, -+ task->exec_file->f_path.mnt) : task->acl->filename, task->acl->filename, -+ 1UL, 1UL, gr_to_filename(dentry, mnt), (unsigned long) mode, NIPQUAD(task->signal->curr_ip)); -+ -+ return; -+} -+ -+static void -+gr_log_learn_sysctl(const char *path, const __u32 mode) -+{ -+ struct task_struct *task = current; -+ const struct cred *cred = current_cred(); -+ -+ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, task->role->roletype, -+ cred->uid, cred->gid, task->exec_file ? 
gr_to_filename1(task->exec_file->f_path.dentry, -+ task->exec_file->f_path.mnt) : task->acl->filename, task->acl->filename, -+ 1UL, 1UL, path, (unsigned long) mode, NIPQUAD(task->signal->curr_ip)); -+ -+ return; -+} -+ -+static void -+gr_log_learn_id_change(const char type, const unsigned int real, -+ const unsigned int effective, const unsigned int fs) -+{ -+ struct task_struct *task = current; -+ const struct cred *cred = current_cred(); -+ -+ security_learn(GR_ID_LEARN_MSG, task->role->rolename, task->role->roletype, -+ cred->uid, cred->gid, task->exec_file ? gr_to_filename1(task->exec_file->f_path.dentry, -+ task->exec_file->f_path.mnt) : task->acl->filename, task->acl->filename, -+ type, real, effective, fs, NIPQUAD(task->signal->curr_ip)); -+ -+ return; -+} -+ -+__u32 -+gr_check_link(const struct dentry * new_dentry, -+ const struct dentry * parent_dentry, -+ const struct vfsmount * parent_mnt, -+ const struct dentry * old_dentry, const struct vfsmount * old_mnt) -+{ -+ struct acl_object_label *obj; -+ __u32 oldmode, newmode; -+ __u32 needmode; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return (GR_CREATE | GR_LINK); -+ -+ obj = chk_obj_label(old_dentry, old_mnt, current->acl); -+ oldmode = obj->mode; -+ -+ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) -+ oldmode |= (GR_CREATE | GR_LINK); -+ -+ needmode = GR_CREATE | GR_AUDIT_CREATE | GR_SUPPRESS; -+ if (old_dentry->d_inode->i_mode & (S_ISUID | S_ISGID)) -+ needmode |= GR_SETID | GR_AUDIT_SETID; -+ -+ newmode = -+ gr_check_create(new_dentry, parent_dentry, parent_mnt, -+ oldmode | needmode); -+ -+ needmode = newmode & (GR_FIND | GR_APPEND | GR_WRITE | GR_EXEC | -+ GR_SETID | GR_READ | GR_FIND | GR_DELETE | -+ GR_INHERIT | GR_AUDIT_INHERIT); -+ -+ if (old_dentry->d_inode->i_mode & (S_ISUID | S_ISGID) && !(newmode & GR_SETID)) -+ goto bad; -+ -+ if ((oldmode & needmode) != needmode) -+ goto bad; -+ -+ needmode = oldmode & (GR_NOPTRACE | GR_PTRACERD | GR_INHERIT | GR_AUDITS); -+ if ((newmode & needmode) != needmode) -+ goto bad; -+ -+ if ((newmode & (GR_CREATE | GR_LINK)) == (GR_CREATE | GR_LINK)) -+ return newmode; -+bad: -+ needmode = oldmode; -+ if (old_dentry->d_inode->i_mode & (S_ISUID | S_ISGID)) -+ needmode |= GR_SETID; -+ -+ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) { -+ gr_log_learn(old_dentry, old_mnt, needmode); -+ return (GR_CREATE | GR_LINK); -+ } else if (newmode & GR_SUPPRESS) -+ return GR_SUPPRESS; -+ else -+ return 0; -+} -+ -+__u32 -+gr_search_file(const struct dentry * dentry, const __u32 mode, -+ const struct vfsmount * mnt) -+{ -+ __u32 retval = mode; -+ struct acl_subject_label *curracl; -+ struct acl_object_label *currobj; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return (mode & ~GR_AUDITS); -+ -+ curracl = current->acl; -+ -+ currobj = chk_obj_label(dentry, mnt, curracl); -+ retval = currobj->mode & mode; -+ -+ if (unlikely -+ ((curracl->mode & (GR_LEARN | GR_INHERITLEARN)) && !(mode & GR_NOPTRACE) -+ && (retval != (mode & ~(GR_AUDITS | GR_SUPPRESS))))) { -+ __u32 new_mode = mode; -+ -+ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); -+ -+ retval = new_mode; -+ -+ if (new_mode & GR_EXEC && curracl->mode & GR_INHERITLEARN) -+ new_mode |= GR_INHERIT; -+ -+ if (!(mode & GR_NOLEARN)) -+ gr_log_learn(dentry, mnt, new_mode); -+ } -+ -+ return retval; -+} -+ -+__u32 -+gr_check_create(const struct dentry * new_dentry, const struct dentry * parent, -+ const struct vfsmount * mnt, const __u32 mode) -+{ -+ struct name_entry *match; -+ struct acl_object_label *matchpo; -+ struct acl_subject_label 
*curracl; -+ char *path; -+ __u32 retval; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return (mode & ~GR_AUDITS); -+ -+ preempt_disable(); -+ path = gr_to_filename_rbac(new_dentry, mnt); -+ match = lookup_name_entry_create(path); -+ -+ if (!match) -+ goto check_parent; -+ -+ curracl = current->acl; -+ -+ read_lock(&gr_inode_lock); -+ matchpo = lookup_acl_obj_label_create(match->inode, match->device, curracl); -+ read_unlock(&gr_inode_lock); -+ -+ if (matchpo) { -+ if ((matchpo->mode & mode) != -+ (mode & ~(GR_AUDITS | GR_SUPPRESS)) -+ && curracl->mode & (GR_LEARN | GR_INHERITLEARN)) { -+ __u32 new_mode = mode; -+ -+ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); -+ -+ gr_log_learn(new_dentry, mnt, new_mode); -+ -+ preempt_enable(); -+ return new_mode; -+ } -+ preempt_enable(); -+ return (matchpo->mode & mode); -+ } -+ -+ check_parent: -+ curracl = current->acl; -+ -+ matchpo = chk_obj_create_label(parent, mnt, curracl, path); -+ retval = matchpo->mode & mode; -+ -+ if ((retval != (mode & ~(GR_AUDITS | GR_SUPPRESS))) -+ && (curracl->mode & (GR_LEARN | GR_INHERITLEARN))) { -+ __u32 new_mode = mode; -+ -+ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); -+ -+ gr_log_learn(new_dentry, mnt, new_mode); -+ preempt_enable(); -+ return new_mode; -+ } -+ -+ preempt_enable(); -+ return retval; -+} -+ -+int -+gr_check_hidden_task(const struct task_struct *task) -+{ -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+ -+ if (!(task->acl->mode & GR_PROCFIND) && !(current->acl->mode & GR_VIEW)) -+ return 1; -+ -+ return 0; -+} -+ -+int -+gr_check_protected_task(const struct task_struct *task) -+{ -+ if (unlikely(!(gr_status & GR_READY) || !task)) -+ return 0; -+ -+ if ((task->acl->mode & GR_PROTECTED) && !(current->acl->mode & GR_KILL) && -+ task->acl != current->acl) -+ return 1; -+ -+ return 0; -+} -+ -+void -+gr_copy_label(struct task_struct *tsk) -+{ -+ tsk->signal->used_accept = 0; -+ tsk->acl_sp_role = 0; -+ tsk->acl_role_id = current->acl_role_id; -+ tsk->acl = current->acl; -+ tsk->role = current->role; -+ tsk->signal->curr_ip = current->signal->curr_ip; -+ if (current->exec_file) -+ get_file(current->exec_file); -+ tsk->exec_file = current->exec_file; -+ tsk->is_writable = current->is_writable; -+ if (unlikely(current->signal->used_accept)) -+ current->signal->curr_ip = 0; -+ -+ return; -+} -+ -+static void -+gr_set_proc_res(struct task_struct *task) -+{ -+ struct acl_subject_label *proc; -+ unsigned short i; -+ -+ proc = task->acl; -+ -+ if (proc->mode & (GR_LEARN | GR_INHERITLEARN)) -+ return; -+ -+ for (i = 0; i < RLIM_NLIMITS; i++) { -+ if (!(proc->resmask & (1 << i))) -+ continue; -+ -+ task->signal->rlim[i].rlim_cur = proc->res[i].rlim_cur; -+ task->signal->rlim[i].rlim_max = proc->res[i].rlim_max; -+ } -+ -+ return; -+} -+ -+int -+gr_check_user_change(int real, int effective, int fs) -+{ -+ unsigned int i; -+ __u16 num; -+ uid_t *uidlist; -+ int curuid; -+ int realok = 0; -+ int effectiveok = 0; -+ int fsok = 0; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+ -+ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) -+ gr_log_learn_id_change('u', real, effective, fs); -+ -+ num = current->acl->user_trans_num; -+ uidlist = current->acl->user_transitions; -+ -+ if (uidlist == NULL) -+ return 0; -+ -+ if (real == -1) -+ realok = 1; -+ if (effective == -1) -+ effectiveok = 1; -+ if (fs == -1) -+ fsok = 1; -+ -+ if (current->acl->user_trans_type & GR_ID_ALLOW) { -+ for (i = 0; i < num; i++) { -+ curuid = (int)uidlist[i]; -+ if (real == curuid) -+ realok = 1; -+ if (effective == curuid) 
-+ effectiveok = 1; -+ if (fs == curuid) -+ fsok = 1; -+ } -+ } else if (current->acl->user_trans_type & GR_ID_DENY) { -+ for (i = 0; i < num; i++) { -+ curuid = (int)uidlist[i]; -+ if (real == curuid) -+ break; -+ if (effective == curuid) -+ break; -+ if (fs == curuid) -+ break; -+ } -+ /* not in deny list */ -+ if (i == num) { -+ realok = 1; -+ effectiveok = 1; -+ fsok = 1; -+ } -+ } -+ -+ if (realok && effectiveok && fsok) -+ return 0; -+ else { -+ gr_log_int(GR_DONT_AUDIT, GR_USRCHANGE_ACL_MSG, realok ? (effectiveok ? (fsok ? 0 : fs) : effective) : real); -+ return 1; -+ } -+} -+ -+int -+gr_check_group_change(int real, int effective, int fs) -+{ -+ unsigned int i; -+ __u16 num; -+ gid_t *gidlist; -+ int curgid; -+ int realok = 0; -+ int effectiveok = 0; -+ int fsok = 0; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+ -+ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) -+ gr_log_learn_id_change('g', real, effective, fs); -+ -+ num = current->acl->group_trans_num; -+ gidlist = current->acl->group_transitions; -+ -+ if (gidlist == NULL) -+ return 0; -+ -+ if (real == -1) -+ realok = 1; -+ if (effective == -1) -+ effectiveok = 1; -+ if (fs == -1) -+ fsok = 1; -+ -+ if (current->acl->group_trans_type & GR_ID_ALLOW) { -+ for (i = 0; i < num; i++) { -+ curgid = (int)gidlist[i]; -+ if (real == curgid) -+ realok = 1; -+ if (effective == curgid) -+ effectiveok = 1; -+ if (fs == curgid) -+ fsok = 1; -+ } -+ } else if (current->acl->group_trans_type & GR_ID_DENY) { -+ for (i = 0; i < num; i++) { -+ curgid = (int)gidlist[i]; -+ if (real == curgid) -+ break; -+ if (effective == curgid) -+ break; -+ if (fs == curgid) -+ break; -+ } -+ /* not in deny list */ -+ if (i == num) { -+ realok = 1; -+ effectiveok = 1; -+ fsok = 1; -+ } -+ } -+ -+ if (realok && effectiveok && fsok) -+ return 0; -+ else { -+ gr_log_int(GR_DONT_AUDIT, GR_GRPCHANGE_ACL_MSG, realok ? (effectiveok ? (fsok ? 
0 : fs) : effective) : real); -+ return 1; -+ } -+} -+ -+void -+gr_set_role_label(struct task_struct *task, const uid_t uid, const uid_t gid) -+{ -+ struct acl_role_label *role = task->role; -+ struct acl_subject_label *subj = NULL; -+ struct acl_object_label *obj; -+ struct file *filp; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return; -+ -+ filp = task->exec_file; -+ -+ /* kernel process, we'll give them the kernel role */ -+ if (unlikely(!filp)) { -+ task->role = kernel_role; -+ task->acl = kernel_role->root_label; -+ return; -+ } else if (!task->role || !(task->role->roletype & GR_ROLE_SPECIAL)) -+ role = lookup_acl_role_label(task, uid, gid); -+ -+ /* perform subject lookup in possibly new role -+ we can use this result below in the case where role == task->role -+ */ -+ subj = chk_subj_label(filp->f_path.dentry, filp->f_path.mnt, role); -+ -+ /* if we changed uid/gid, but result in the same role -+ and are using inheritance, don't lose the inherited subject -+ if current subject is other than what normal lookup -+ would result in, we arrived via inheritance, don't -+ lose subject -+ */ -+ if (role != task->role || (!(task->acl->mode & GR_INHERITLEARN) && -+ (subj == task->acl))) -+ task->acl = subj; -+ -+ task->role = role; -+ -+ task->is_writable = 0; -+ -+ /* ignore additional mmap checks for processes that are writable -+ by the default ACL */ -+ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ task->is_writable = 1; -+ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, task->role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ task->is_writable = 1; -+ -+#ifdef CONFIG_GRKERNSEC_ACL_DEBUG -+ printk(KERN_ALERT "Set role label for (%s:%d): role:%s, subject:%s\n", task->comm, task->pid, task->role->rolename, task->acl->filename); -+#endif -+ -+ gr_set_proc_res(task); -+ -+ return; -+} -+ -+int -+gr_set_proc_label(const struct dentry *dentry, const struct vfsmount *mnt, -+ const int unsafe_share) -+{ -+ struct task_struct *task = current; -+ struct acl_subject_label *newacl; -+ struct acl_object_label *obj; -+ __u32 retmode; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+ -+ newacl = chk_subj_label(dentry, mnt, task->role); -+ -+ task_lock(task); -+ if (((task->ptrace & PT_PTRACED) && !(task->acl->mode & -+ GR_POVERRIDE) && (task->acl != newacl) && -+ !(task->role->roletype & GR_ROLE_GOD) && -+ !gr_search_file(dentry, GR_PTRACERD, mnt) && -+ !(task->acl->mode & (GR_LEARN | GR_INHERITLEARN))) -+ || unsafe_share) { -+ task_unlock(task); -+ gr_log_fs_generic(GR_DONT_AUDIT, GR_PTRACE_EXEC_ACL_MSG, dentry, mnt); -+ return -EACCES; -+ } -+ task_unlock(task); -+ -+ obj = chk_obj_label(dentry, mnt, task->acl); -+ retmode = obj->mode & (GR_INHERIT | GR_AUDIT_INHERIT); -+ -+ if (!(task->acl->mode & GR_INHERITLEARN) && -+ ((newacl->mode & GR_LEARN) || !(retmode & GR_INHERIT))) { -+ if (obj->nested) -+ task->acl = obj->nested; -+ else -+ task->acl = newacl; -+ } else if (retmode & GR_INHERIT && retmode & GR_AUDIT_INHERIT) -+ gr_log_str_fs(GR_DO_AUDIT, GR_INHERIT_ACL_MSG, task->acl->filename, dentry, mnt); -+ -+ task->is_writable = 0; -+ -+ /* ignore additional mmap checks for processes that are writable -+ by the default ACL */ -+ obj = chk_obj_label(dentry, mnt, default_role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ task->is_writable = 1; -+ obj = chk_obj_label(dentry, mnt, task->role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ task->is_writable = 1; -+ -+ 
gr_set_proc_res(task); -+ -+#ifdef CONFIG_GRKERNSEC_ACL_DEBUG -+ printk(KERN_ALERT "Set subject label for (%s:%d): role:%s, subject:%s\n", task->comm, task->pid, task->role->rolename, task->acl->filename); -+#endif -+ return 0; -+} -+ -+/* always called with valid inodev ptr */ -+static void -+do_handle_delete(struct inodev_entry *inodev, const ino_t ino, const dev_t dev) -+{ -+ struct acl_object_label *matchpo; -+ struct acl_subject_label *matchps; -+ struct acl_subject_label *subj; -+ struct acl_role_label *role; -+ unsigned int i, x; -+ -+ FOR_EACH_ROLE_START(role, i) -+ FOR_EACH_SUBJECT_START(role, subj, x) -+ if ((matchpo = lookup_acl_obj_label(ino, dev, subj)) != NULL) -+ matchpo->mode |= GR_DELETED; -+ FOR_EACH_SUBJECT_END(subj,x) -+ FOR_EACH_NESTED_SUBJECT_START(role, subj) -+ if (subj->inode == ino && subj->device == dev) -+ subj->mode |= GR_DELETED; -+ FOR_EACH_NESTED_SUBJECT_END(subj) -+ if ((matchps = lookup_acl_subj_label(ino, dev, role)) != NULL) -+ matchps->mode |= GR_DELETED; -+ FOR_EACH_ROLE_END(role,i) -+ -+ inodev->nentry->deleted = 1; -+ -+ return; -+} -+ -+void -+gr_handle_delete(const ino_t ino, const dev_t dev) -+{ -+ struct inodev_entry *inodev; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return; -+ -+ write_lock(&gr_inode_lock); -+ inodev = lookup_inodev_entry(ino, dev); -+ if (inodev != NULL) -+ do_handle_delete(inodev, ino, dev); -+ write_unlock(&gr_inode_lock); -+ -+ return; -+} -+ -+static void -+update_acl_obj_label(const ino_t oldinode, const dev_t olddevice, -+ const ino_t newinode, const dev_t newdevice, -+ struct acl_subject_label *subj) -+{ -+ unsigned int index = fhash(oldinode, olddevice, subj->obj_hash_size); -+ struct acl_object_label *match; -+ -+ match = subj->obj_hash[index]; -+ -+ while (match && (match->inode != oldinode || -+ match->device != olddevice || -+ !(match->mode & GR_DELETED))) -+ match = match->next; -+ -+ if (match && (match->inode == oldinode) -+ && (match->device == olddevice) -+ && (match->mode & GR_DELETED)) { -+ if (match->prev == NULL) { -+ subj->obj_hash[index] = match->next; -+ if (match->next != NULL) -+ match->next->prev = NULL; -+ } else { -+ match->prev->next = match->next; -+ if (match->next != NULL) -+ match->next->prev = match->prev; -+ } -+ match->prev = NULL; -+ match->next = NULL; -+ match->inode = newinode; -+ match->device = newdevice; -+ match->mode &= ~GR_DELETED; -+ -+ insert_acl_obj_label(match, subj); -+ } -+ -+ return; -+} -+ -+static void -+update_acl_subj_label(const ino_t oldinode, const dev_t olddevice, -+ const ino_t newinode, const dev_t newdevice, -+ struct acl_role_label *role) -+{ -+ unsigned int index = fhash(oldinode, olddevice, role->subj_hash_size); -+ struct acl_subject_label *match; -+ -+ match = role->subj_hash[index]; -+ -+ while (match && (match->inode != oldinode || -+ match->device != olddevice || -+ !(match->mode & GR_DELETED))) -+ match = match->next; -+ -+ if (match && (match->inode == oldinode) -+ && (match->device == olddevice) -+ && (match->mode & GR_DELETED)) { -+ if (match->prev == NULL) { -+ role->subj_hash[index] = match->next; -+ if (match->next != NULL) -+ match->next->prev = NULL; -+ } else { -+ match->prev->next = match->next; -+ if (match->next != NULL) -+ match->next->prev = match->prev; -+ } -+ match->prev = NULL; -+ match->next = NULL; -+ match->inode = newinode; -+ match->device = newdevice; -+ match->mode &= ~GR_DELETED; -+ -+ insert_acl_subj_label(match, role); -+ } -+ -+ return; -+} -+ -+static void -+update_inodev_entry(const ino_t oldinode, const dev_t 
olddevice, -+ const ino_t newinode, const dev_t newdevice) -+{ -+ unsigned int index = fhash(oldinode, olddevice, inodev_set.i_size); -+ struct inodev_entry *match; -+ -+ match = inodev_set.i_hash[index]; -+ -+ while (match && (match->nentry->inode != oldinode || -+ match->nentry->device != olddevice || !match->nentry->deleted)) -+ match = match->next; -+ -+ if (match && (match->nentry->inode == oldinode) -+ && (match->nentry->device == olddevice) && -+ match->nentry->deleted) { -+ if (match->prev == NULL) { -+ inodev_set.i_hash[index] = match->next; -+ if (match->next != NULL) -+ match->next->prev = NULL; -+ } else { -+ match->prev->next = match->next; -+ if (match->next != NULL) -+ match->next->prev = match->prev; -+ } -+ match->prev = NULL; -+ match->next = NULL; -+ match->nentry->inode = newinode; -+ match->nentry->device = newdevice; -+ match->nentry->deleted = 0; -+ -+ insert_inodev_entry(match); -+ } -+ -+ return; -+} -+ -+static void -+do_handle_create(const struct name_entry *matchn, const struct dentry *dentry, -+ const struct vfsmount *mnt) -+{ -+ struct acl_subject_label *subj; -+ struct acl_role_label *role; -+ unsigned int i, x; -+ -+ FOR_EACH_ROLE_START(role, i) -+ update_acl_subj_label(matchn->inode, matchn->device, -+ dentry->d_inode->i_ino, -+ dentry->d_inode->i_sb->s_dev, role); -+ -+ FOR_EACH_NESTED_SUBJECT_START(role, subj) -+ if ((subj->inode == dentry->d_inode->i_ino) && -+ (subj->device == dentry->d_inode->i_sb->s_dev)) { -+ subj->inode = dentry->d_inode->i_ino; -+ subj->device = dentry->d_inode->i_sb->s_dev; -+ } -+ FOR_EACH_NESTED_SUBJECT_END(subj) -+ FOR_EACH_SUBJECT_START(role, subj, x) -+ update_acl_obj_label(matchn->inode, matchn->device, -+ dentry->d_inode->i_ino, -+ dentry->d_inode->i_sb->s_dev, subj); -+ FOR_EACH_SUBJECT_END(subj,x) -+ FOR_EACH_ROLE_END(role,i) -+ -+ update_inodev_entry(matchn->inode, matchn->device, -+ dentry->d_inode->i_ino, dentry->d_inode->i_sb->s_dev); -+ -+ return; -+} -+ -+void -+gr_handle_create(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ struct name_entry *matchn; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return; -+ -+ preempt_disable(); -+ matchn = lookup_name_entry(gr_to_filename_rbac(dentry, mnt)); -+ -+ if (unlikely((unsigned long)matchn)) { -+ write_lock(&gr_inode_lock); -+ do_handle_create(matchn, dentry, mnt); -+ write_unlock(&gr_inode_lock); -+ } -+ preempt_enable(); -+ -+ return; -+} -+ -+void -+gr_handle_rename(struct inode *old_dir, struct inode *new_dir, -+ struct dentry *old_dentry, -+ struct dentry *new_dentry, -+ struct vfsmount *mnt, const __u8 replace) -+{ -+ struct name_entry *matchn; -+ struct inodev_entry *inodev; -+ -+ /* vfs_rename swaps the name and parent link for old_dentry and -+ new_dentry -+ at this point, old_dentry has the new name, parent link, and inode -+ for the renamed file -+ if a file is being replaced by a rename, new_dentry has the inode -+ and name for the replaced file -+ */ -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return; -+ -+ preempt_disable(); -+ matchn = lookup_name_entry(gr_to_filename_rbac(old_dentry, mnt)); -+ -+ /* we wouldn't have to check d_inode if it weren't for -+ NFS silly-renaming -+ */ -+ -+ write_lock(&gr_inode_lock); -+ if (unlikely(replace && new_dentry->d_inode)) { -+ inodev = lookup_inodev_entry(new_dentry->d_inode->i_ino, -+ new_dentry->d_inode->i_sb->s_dev); -+ if (inodev != NULL && (new_dentry->d_inode->i_nlink <= 1)) -+ do_handle_delete(inodev, new_dentry->d_inode->i_ino, -+ new_dentry->d_inode->i_sb->s_dev); -+ } -+ -+ inodev = 
lookup_inodev_entry(old_dentry->d_inode->i_ino, -+ old_dentry->d_inode->i_sb->s_dev); -+ if (inodev != NULL && (old_dentry->d_inode->i_nlink <= 1)) -+ do_handle_delete(inodev, old_dentry->d_inode->i_ino, -+ old_dentry->d_inode->i_sb->s_dev); -+ -+ if (unlikely((unsigned long)matchn)) -+ do_handle_create(matchn, old_dentry, mnt); -+ -+ write_unlock(&gr_inode_lock); -+ preempt_enable(); -+ -+ return; -+} -+ -+static int -+lookup_special_role_auth(__u16 mode, const char *rolename, unsigned char **salt, -+ unsigned char **sum) -+{ -+ struct acl_role_label *r; -+ struct role_allowed_ip *ipp; -+ struct role_transition *trans; -+ unsigned int i; -+ int found = 0; -+ -+ /* check transition table */ -+ -+ for (trans = current->role->transitions; trans; trans = trans->next) { -+ if (!strcmp(rolename, trans->rolename)) { -+ found = 1; -+ break; -+ } -+ } -+ -+ if (!found) -+ return 0; -+ -+ /* handle special roles that do not require authentication -+ and check ip */ -+ -+ FOR_EACH_ROLE_START(r, i) -+ if (!strcmp(rolename, r->rolename) && -+ (r->roletype & GR_ROLE_SPECIAL)) { -+ found = 0; -+ if (r->allowed_ips != NULL) { -+ for (ipp = r->allowed_ips; ipp; ipp = ipp->next) { -+ if ((ntohl(current->signal->curr_ip) & ipp->netmask) == -+ (ntohl(ipp->addr) & ipp->netmask)) -+ found = 1; -+ } -+ } else -+ found = 2; -+ if (!found) -+ return 0; -+ -+ if (((mode == GR_SPROLE) && (r->roletype & GR_ROLE_NOPW)) || -+ ((mode == GR_SPROLEPAM) && (r->roletype & GR_ROLE_PAM))) { -+ *salt = NULL; -+ *sum = NULL; -+ return 1; -+ } -+ } -+ FOR_EACH_ROLE_END(r,i) -+ -+ for (i = 0; i < num_sprole_pws; i++) { -+ if (!strcmp(rolename, acl_special_roles[i]->rolename)) { -+ *salt = acl_special_roles[i]->salt; -+ *sum = acl_special_roles[i]->sum; -+ return 1; -+ } -+ } -+ -+ return 0; -+} -+ -+static void -+assign_special_role(char *rolename) -+{ -+ struct acl_object_label *obj; -+ struct acl_role_label *r; -+ struct acl_role_label *assigned = NULL; -+ struct task_struct *tsk; -+ struct file *filp; -+ unsigned int i; -+ -+ FOR_EACH_ROLE_START(r, i) -+ if (!strcmp(rolename, r->rolename) && -+ (r->roletype & GR_ROLE_SPECIAL)) -+ assigned = r; -+ FOR_EACH_ROLE_END(r,i) -+ -+ if (!assigned) -+ return; -+ -+ read_lock(&tasklist_lock); -+ read_lock(&grsec_exec_file_lock); -+ -+ tsk = current->parent; -+ if (tsk == NULL) -+ goto out_unlock; -+ -+ filp = tsk->exec_file; -+ if (filp == NULL) -+ goto out_unlock; -+ -+ tsk->is_writable = 0; -+ -+ tsk->acl_sp_role = 1; -+ tsk->acl_role_id = ++acl_sp_role_value; -+ tsk->role = assigned; -+ tsk->acl = chk_subj_label(filp->f_path.dentry, filp->f_path.mnt, tsk->role); -+ -+ /* ignore additional mmap checks for processes that are writable -+ by the default ACL */ -+ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ tsk->is_writable = 1; -+ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, tsk->role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ tsk->is_writable = 1; -+ -+#ifdef CONFIG_GRKERNSEC_ACL_DEBUG -+ printk(KERN_ALERT "Assigning special role:%s subject:%s to process (%s:%d)\n", tsk->role->rolename, tsk->acl->filename, tsk->comm, tsk->pid); -+#endif -+ -+out_unlock: -+ read_unlock(&grsec_exec_file_lock); -+ read_unlock(&tasklist_lock); -+ return; -+} -+ -+int gr_check_secure_terminal(struct task_struct *task) -+{ -+ struct task_struct *p, *p2, *p3; -+ struct files_struct *files; -+ struct fdtable *fdt; -+ struct file *our_file = NULL, *file; -+ int i; -+ -+ if (task->signal->tty == NULL) 
-+ return 1; -+ -+ files = get_files_struct(task); -+ if (files != NULL) { -+ rcu_read_lock(); -+ fdt = files_fdtable(files); -+ for (i=0; i < fdt->max_fds; i++) { -+ file = fcheck_files(files, i); -+ if (file && (our_file == NULL) && (file->private_data == task->signal->tty)) { -+ get_file(file); -+ our_file = file; -+ } -+ } -+ rcu_read_unlock(); -+ put_files_struct(files); -+ } -+ -+ if (our_file == NULL) -+ return 1; -+ -+ read_lock(&tasklist_lock); -+ do_each_thread(p2, p) { -+ files = get_files_struct(p); -+ if (files == NULL || -+ (p->signal && p->signal->tty == task->signal->tty)) { -+ if (files != NULL) -+ put_files_struct(files); -+ continue; -+ } -+ rcu_read_lock(); -+ fdt = files_fdtable(files); -+ for (i=0; i < fdt->max_fds; i++) { -+ file = fcheck_files(files, i); -+ if (file && S_ISCHR(file->f_path.dentry->d_inode->i_mode) && -+ file->f_path.dentry->d_inode->i_rdev == our_file->f_path.dentry->d_inode->i_rdev) { -+ p3 = task; -+ while (p3->pid > 0) { -+ if (p3 == p) -+ break; -+ p3 = p3->parent; -+ } -+ if (p3 == p) -+ break; -+ gr_log_ttysniff(GR_DONT_AUDIT_GOOD, GR_TTYSNIFF_ACL_MSG, p); -+ gr_handle_alertkill(p); -+ rcu_read_unlock(); -+ put_files_struct(files); -+ read_unlock(&tasklist_lock); -+ fput(our_file); -+ return 0; -+ } -+ } -+ rcu_read_unlock(); -+ put_files_struct(files); -+ } while_each_thread(p2, p); -+ read_unlock(&tasklist_lock); -+ -+ fput(our_file); -+ return 1; -+} -+ -+ssize_t -+write_grsec_handler(struct file *file, const char * buf, size_t count, loff_t *ppos) -+{ -+ struct gr_arg_wrapper uwrap; -+ unsigned char *sprole_salt; -+ unsigned char *sprole_sum; -+ int error = sizeof (struct gr_arg_wrapper); -+ int error2 = 0; -+ -+ down(&gr_dev_sem); -+ -+ if ((gr_status & GR_READY) && !(current->acl->mode & GR_KERNELAUTH)) { -+ error = -EPERM; -+ goto out; -+ } -+ -+ if (count != sizeof (struct gr_arg_wrapper)) { -+ gr_log_int_int(GR_DONT_AUDIT_GOOD, GR_DEV_ACL_MSG, (int)count, (int)sizeof(struct gr_arg_wrapper)); -+ error = -EINVAL; -+ goto out; -+ } -+ -+ -+ if (gr_auth_expires && time_after_eq(get_seconds(), gr_auth_expires)) { -+ gr_auth_expires = 0; -+ gr_auth_attempts = 0; -+ } -+ -+ if (copy_from_user(&uwrap, buf, sizeof (struct gr_arg_wrapper))) { -+ error = -EFAULT; -+ goto out; -+ } -+ -+ if ((uwrap.version != GRSECURITY_VERSION) || (uwrap.size != sizeof(struct gr_arg))) { -+ error = -EINVAL; -+ goto out; -+ } -+ -+ if (copy_from_user(gr_usermode, uwrap.arg, sizeof (struct gr_arg))) { -+ error = -EFAULT; -+ goto out; -+ } -+ -+ if (gr_usermode->mode != GR_SPROLE && gr_usermode->mode != GR_SPROLEPAM && -+ gr_auth_attempts >= CONFIG_GRKERNSEC_ACL_MAXTRIES && -+ time_after(gr_auth_expires, get_seconds())) { -+ error = -EBUSY; -+ goto out; -+ } -+ -+ /* if non-root trying to do anything other than use a special role, -+ do not attempt authentication, do not count towards authentication -+ locking -+ */ -+ -+ if (gr_usermode->mode != GR_SPROLE && gr_usermode->mode != GR_STATUS && -+ gr_usermode->mode != GR_UNSPROLE && gr_usermode->mode != GR_SPROLEPAM && -+ current_uid()) { -+ error = -EPERM; -+ goto out; -+ } -+ -+ /* ensure pw and special role name are null terminated */ -+ -+ gr_usermode->pw[GR_PW_LEN - 1] = '\0'; -+ gr_usermode->sp_role[GR_SPROLE_LEN - 1] = '\0'; -+ -+ /* Okay. -+ * We have our enough of the argument structure..(we have yet -+ * to copy_from_user the tables themselves) . Copy the tables -+ * only if we need them, i.e. for loading operations. 
*/ -+ -+ switch (gr_usermode->mode) { -+ case GR_STATUS: -+ if (gr_status & GR_READY) { -+ error = 1; -+ if (!gr_check_secure_terminal(current)) -+ error = 3; -+ } else -+ error = 2; -+ goto out; -+ case GR_SHUTDOWN: -+ if ((gr_status & GR_READY) -+ && !(chkpw(gr_usermode, gr_system_salt, gr_system_sum))) { -+#ifdef CONFIG_PAX_KERNEXEC -+ { -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+ gr_status &= ~GR_READY; -+ pax_close_kernel(cr0); -+ } -+#else -+ gr_status &= ~GR_READY; -+#endif -+ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SHUTS_ACL_MSG); -+ free_variables(); -+ memset(gr_usermode, 0, sizeof (struct gr_arg)); -+ memset(gr_system_salt, 0, GR_SALT_LEN); -+ memset(gr_system_sum, 0, GR_SHA_LEN); -+ } else if (gr_status & GR_READY) { -+ gr_log_noargs(GR_DONT_AUDIT, GR_SHUTF_ACL_MSG); -+ error = -EPERM; -+ } else { -+ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SHUTI_ACL_MSG); -+ error = -EAGAIN; -+ } -+ break; -+ case GR_ENABLE: -+ if (!(gr_status & GR_READY) && !(error2 = gracl_init(gr_usermode))) -+ gr_log_str(GR_DONT_AUDIT_GOOD, GR_ENABLE_ACL_MSG, GR_VERSION); -+ else { -+ if (gr_status & GR_READY) -+ error = -EAGAIN; -+ else -+ error = error2; -+ gr_log_str(GR_DONT_AUDIT, GR_ENABLEF_ACL_MSG, GR_VERSION); -+ } -+ break; -+ case GR_RELOAD: -+ if (!(gr_status & GR_READY)) { -+ gr_log_str(GR_DONT_AUDIT_GOOD, GR_RELOADI_ACL_MSG, GR_VERSION); -+ error = -EAGAIN; -+ } else if (!(chkpw(gr_usermode, gr_system_salt, gr_system_sum))) { -+ lock_kernel(); -+#ifdef CONFIG_PAX_KERNEXEC -+ { -+ unsigned long cr0; -+ -+ pax_open_kernel(cr0); -+ gr_status &= ~GR_READY; -+ pax_close_kernel(cr0); -+ } -+#else -+ gr_status &= ~GR_READY; -+#endif -+ free_variables(); -+ if (!(error2 = gracl_init(gr_usermode))) { -+ unlock_kernel(); -+ gr_log_str(GR_DONT_AUDIT_GOOD, GR_RELOAD_ACL_MSG, GR_VERSION); -+ } else { -+ unlock_kernel(); -+ error = error2; -+ gr_log_str(GR_DONT_AUDIT, GR_RELOADF_ACL_MSG, GR_VERSION); -+ } -+ } else { -+ gr_log_str(GR_DONT_AUDIT, GR_RELOADF_ACL_MSG, GR_VERSION); -+ error = -EPERM; -+ } -+ break; -+ case GR_SEGVMOD: -+ if (unlikely(!(gr_status & GR_READY))) { -+ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SEGVMODI_ACL_MSG); -+ error = -EAGAIN; -+ break; -+ } -+ -+ if (!(chkpw(gr_usermode, gr_system_salt, gr_system_sum))) { -+ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SEGVMODS_ACL_MSG); -+ if (gr_usermode->segv_device && gr_usermode->segv_inode) { -+ struct acl_subject_label *segvacl; -+ segvacl = -+ lookup_acl_subj_label(gr_usermode->segv_inode, -+ gr_usermode->segv_device, -+ current->role); -+ if (segvacl) { -+ segvacl->crashes = 0; -+ segvacl->expires = 0; -+ } -+ } else if (gr_find_uid(gr_usermode->segv_uid) >= 0) { -+ gr_remove_uid(gr_usermode->segv_uid); -+ } -+ } else { -+ gr_log_noargs(GR_DONT_AUDIT, GR_SEGVMODF_ACL_MSG); -+ error = -EPERM; -+ } -+ break; -+ case GR_SPROLE: -+ case GR_SPROLEPAM: -+ if (unlikely(!(gr_status & GR_READY))) { -+ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SPROLEI_ACL_MSG); -+ error = -EAGAIN; -+ break; -+ } -+ -+ if (current->role->expires && time_after_eq(get_seconds(), current->role->expires)) { -+ current->role->expires = 0; -+ current->role->auth_attempts = 0; -+ } -+ -+ if (current->role->auth_attempts >= CONFIG_GRKERNSEC_ACL_MAXTRIES && -+ time_after(current->role->expires, get_seconds())) { -+ error = -EBUSY; -+ goto out; -+ } -+ -+ if (lookup_special_role_auth -+ (gr_usermode->mode, gr_usermode->sp_role, &sprole_salt, &sprole_sum) -+ && ((!sprole_salt && !sprole_sum) -+ || !(chkpw(gr_usermode, sprole_salt, sprole_sum)))) { -+ char *p = ""; -+ 
assign_special_role(gr_usermode->sp_role); -+ read_lock(&tasklist_lock); -+ if (current->parent) -+ p = current->parent->role->rolename; -+ read_unlock(&tasklist_lock); -+ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_SPROLES_ACL_MSG, -+ p, acl_sp_role_value); -+ } else { -+ gr_log_str(GR_DONT_AUDIT, GR_SPROLEF_ACL_MSG, gr_usermode->sp_role); -+ error = -EPERM; -+ if(!(current->role->auth_attempts++)) -+ current->role->expires = get_seconds() + CONFIG_GRKERNSEC_ACL_TIMEOUT; -+ -+ goto out; -+ } -+ break; -+ case GR_UNSPROLE: -+ if (unlikely(!(gr_status & GR_READY))) { -+ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_UNSPROLEI_ACL_MSG); -+ error = -EAGAIN; -+ break; -+ } -+ -+ if (current->role->roletype & GR_ROLE_SPECIAL) { -+ char *p = ""; -+ int i = 0; -+ -+ read_lock(&tasklist_lock); -+ if (current->parent) { -+ p = current->parent->role->rolename; -+ i = current->parent->acl_role_id; -+ } -+ read_unlock(&tasklist_lock); -+ -+ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_UNSPROLES_ACL_MSG, p, i); -+ gr_set_acls(1); -+ } else { -+ gr_log_str(GR_DONT_AUDIT, GR_UNSPROLEF_ACL_MSG, current->role->rolename); -+ error = -EPERM; -+ goto out; -+ } -+ break; -+ default: -+ gr_log_int(GR_DONT_AUDIT, GR_INVMODE_ACL_MSG, gr_usermode->mode); -+ error = -EINVAL; -+ break; -+ } -+ -+ if (error != -EPERM) -+ goto out; -+ -+ if(!(gr_auth_attempts++)) -+ gr_auth_expires = get_seconds() + CONFIG_GRKERNSEC_ACL_TIMEOUT; -+ -+ out: -+ up(&gr_dev_sem); -+ return error; -+} -+ -+int -+gr_set_acls(const int type) -+{ -+ struct acl_object_label *obj; -+ struct task_struct *task, *task2; -+ struct file *filp; -+ struct acl_role_label *role = current->role; -+ __u16 acl_role_id = current->acl_role_id; -+ const struct cred *cred; -+ char *tmpname; -+ struct name_entry *nmatch; -+ struct acl_subject_label *tmpsubj; -+ -+ read_lock(&tasklist_lock); -+ read_lock(&grsec_exec_file_lock); -+ do_each_thread(task2, task) { -+ /* check to see if we're called from the exit handler, -+ if so, only replace ACLs that have inherited the admin -+ ACL */ -+ -+ if (type && (task->role != role || -+ task->acl_role_id != acl_role_id)) -+ continue; -+ -+ task->acl_role_id = 0; -+ task->acl_sp_role = 0; -+ -+ if ((filp = task->exec_file)) { -+ cred = __task_cred(task); -+ task->role = lookup_acl_role_label(task, cred->uid, cred->gid); -+ -+ /* the following is to apply the correct subject -+ on binaries running when the RBAC system -+ is enabled, when the binaries have been -+ replaced or deleted since their execution -+ ----- -+ when the RBAC system starts, the inode/dev -+ from exec_file will be one the RBAC system -+ is unaware of. It only knows the inode/dev -+ of the present file on disk, or the absence -+ of it. 
-+ */ -+ preempt_disable(); -+ tmpname = gr_to_filename_rbac(filp->f_path.dentry, filp->f_path.mnt); -+ -+ nmatch = lookup_name_entry(tmpname); -+ preempt_enable(); -+ tmpsubj = NULL; -+ if (nmatch) { -+ if (nmatch->deleted) -+ tmpsubj = lookup_acl_subj_label_deleted(nmatch->inode, nmatch->device, task->role); -+ else -+ tmpsubj = lookup_acl_subj_label(nmatch->inode, nmatch->device, task->role); -+ if (tmpsubj != NULL) -+ task->acl = tmpsubj; -+ } -+ if (tmpsubj == NULL) -+ task->acl = chk_subj_label(filp->f_path.dentry, filp->f_path.mnt, -+ task->role); -+ if (task->acl) { -+ struct acl_subject_label *curr; -+ curr = task->acl; -+ -+ task->is_writable = 0; -+ /* ignore additional mmap checks for processes that are writable -+ by the default ACL */ -+ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ task->is_writable = 1; -+ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, task->role->root_label); -+ if (unlikely(obj->mode & GR_WRITE)) -+ task->is_writable = 1; -+ -+ gr_set_proc_res(task); -+ -+#ifdef CONFIG_GRKERNSEC_ACL_DEBUG -+ printk(KERN_ALERT "gr_set_acls for (%s:%d): role:%s, subject:%s\n", task->comm, task->pid, task->role->rolename, task->acl->filename); -+#endif -+ } else { -+ read_unlock(&grsec_exec_file_lock); -+ read_unlock(&tasklist_lock); -+ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_DEFACL_MSG, task->comm, task->pid); -+ return 1; -+ } -+ } else { -+ // it's a kernel process -+ task->role = kernel_role; -+ task->acl = kernel_role->root_label; -+#ifdef CONFIG_GRKERNSEC_ACL_HIDEKERN -+ task->acl->mode &= ~GR_PROCFIND; -+#endif -+ } -+ } while_each_thread(task2, task); -+ read_unlock(&grsec_exec_file_lock); -+ read_unlock(&tasklist_lock); -+ return 0; -+} -+ -+void -+gr_learn_resource(const struct task_struct *task, -+ const int res, const unsigned long wanted, const int gt) -+{ -+ struct acl_subject_label *acl; -+ const struct cred *cred; -+ -+ if (unlikely((gr_status & GR_READY) && -+ task->acl && (task->acl->mode & (GR_LEARN | GR_INHERITLEARN)))) -+ goto skip_reslog; -+ -+#ifdef CONFIG_GRKERNSEC_RESLOG -+ gr_log_resource(task, res, wanted, gt); -+#endif -+ skip_reslog: -+ -+ if (unlikely(!(gr_status & GR_READY) || !wanted || res >= GR_NLIMITS)) -+ return; -+ -+ acl = task->acl; -+ -+ if (likely(!acl || !(acl->mode & (GR_LEARN | GR_INHERITLEARN)) || -+ !(acl->resmask & (1 << (unsigned short) res)))) -+ return; -+ -+ if (wanted >= acl->res[res].rlim_cur) { -+ unsigned long res_add; -+ -+ res_add = wanted; -+ switch (res) { -+ case RLIMIT_CPU: -+ res_add += GR_RLIM_CPU_BUMP; -+ break; -+ case RLIMIT_FSIZE: -+ res_add += GR_RLIM_FSIZE_BUMP; -+ break; -+ case RLIMIT_DATA: -+ res_add += GR_RLIM_DATA_BUMP; -+ break; -+ case RLIMIT_STACK: -+ res_add += GR_RLIM_STACK_BUMP; -+ break; -+ case RLIMIT_CORE: -+ res_add += GR_RLIM_CORE_BUMP; -+ break; -+ case RLIMIT_RSS: -+ res_add += GR_RLIM_RSS_BUMP; -+ break; -+ case RLIMIT_NPROC: -+ res_add += GR_RLIM_NPROC_BUMP; -+ break; -+ case RLIMIT_NOFILE: -+ res_add += GR_RLIM_NOFILE_BUMP; -+ break; -+ case RLIMIT_MEMLOCK: -+ res_add += GR_RLIM_MEMLOCK_BUMP; -+ break; -+ case RLIMIT_AS: -+ res_add += GR_RLIM_AS_BUMP; -+ break; -+ case RLIMIT_LOCKS: -+ res_add += GR_RLIM_LOCKS_BUMP; -+ break; -+ case RLIMIT_SIGPENDING: -+ res_add += GR_RLIM_SIGPENDING_BUMP; -+ break; -+ case RLIMIT_MSGQUEUE: -+ res_add += GR_RLIM_MSGQUEUE_BUMP; -+ break; -+ case RLIMIT_NICE: -+ res_add += GR_RLIM_NICE_BUMP; -+ break; -+ case RLIMIT_RTPRIO: -+ res_add += GR_RLIM_RTPRIO_BUMP; -+ 
break; -+ case RLIMIT_RTTIME: -+ res_add += GR_RLIM_RTTIME_BUMP; -+ break; -+ } -+ -+ acl->res[res].rlim_cur = res_add; -+ -+ if (wanted > acl->res[res].rlim_max) -+ acl->res[res].rlim_max = res_add; -+ -+ /* only log the subject filename, since resource logging is supported for -+ single-subject learning only */ -+ cred = __task_cred(task); -+ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, -+ task->role->roletype, cred->uid, cred->gid, acl->filename, -+ acl->filename, acl->res[res].rlim_cur, acl->res[res].rlim_max, -+ "", (unsigned long) res, NIPQUAD(task->signal->curr_ip)); -+ } -+ -+ return; -+} -+ -+#if defined(CONFIG_PAX_HAVE_ACL_FLAGS) && (defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR)) -+void -+pax_set_initial_flags(struct linux_binprm *bprm) -+{ -+ struct task_struct *task = current; -+ struct acl_subject_label *proc; -+ unsigned long flags; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return; -+ -+ flags = pax_get_flags(task); -+ -+ proc = task->acl; -+ -+ if (proc->pax_flags & GR_PAX_DISABLE_PAGEEXEC) -+ flags &= ~MF_PAX_PAGEEXEC; -+ if (proc->pax_flags & GR_PAX_DISABLE_SEGMEXEC) -+ flags &= ~MF_PAX_SEGMEXEC; -+ if (proc->pax_flags & GR_PAX_DISABLE_RANDMMAP) -+ flags &= ~MF_PAX_RANDMMAP; -+ if (proc->pax_flags & GR_PAX_DISABLE_EMUTRAMP) -+ flags &= ~MF_PAX_EMUTRAMP; -+ if (proc->pax_flags & GR_PAX_DISABLE_MPROTECT) -+ flags &= ~MF_PAX_MPROTECT; -+ -+ if (proc->pax_flags & GR_PAX_ENABLE_PAGEEXEC) -+ flags |= MF_PAX_PAGEEXEC; -+ if (proc->pax_flags & GR_PAX_ENABLE_SEGMEXEC) -+ flags |= MF_PAX_SEGMEXEC; -+ if (proc->pax_flags & GR_PAX_ENABLE_RANDMMAP) -+ flags |= MF_PAX_RANDMMAP; -+ if (proc->pax_flags & GR_PAX_ENABLE_EMUTRAMP) -+ flags |= MF_PAX_EMUTRAMP; -+ if (proc->pax_flags & GR_PAX_ENABLE_MPROTECT) -+ flags |= MF_PAX_MPROTECT; -+ -+ pax_set_flags(task, flags); -+ -+ return; -+} -+#endif -+ -+#ifdef CONFIG_SYSCTL -+/* Eric Biederman likes breaking userland ABI and every inode-based security -+ system to save 35kb of memory */ -+ -+/* we modify the passed in filename, but adjust it back before returning */ -+static struct acl_object_label *gr_lookup_by_name(char *name, unsigned int len) -+{ -+ struct name_entry *nmatch; -+ char *p, *lastp = NULL; -+ struct acl_object_label *obj = NULL, *tmp; -+ struct acl_subject_label *tmpsubj; -+ char c = '\0'; -+ -+ read_lock(&gr_inode_lock); -+ -+ p = name + len - 1; -+ do { -+ nmatch = lookup_name_entry(name); -+ if (lastp != NULL) -+ *lastp = c; -+ -+ if (nmatch == NULL) -+ goto next_component; -+ tmpsubj = current->acl; -+ do { -+ obj = lookup_acl_obj_label(nmatch->inode, nmatch->device, tmpsubj); -+ if (obj != NULL) { -+ tmp = obj->globbed; -+ while (tmp) { -+ if (!glob_match(tmp->filename, name)) { -+ obj = tmp; -+ goto found_obj; -+ } -+ tmp = tmp->next; -+ } -+ goto found_obj; -+ } -+ } while ((tmpsubj = tmpsubj->parent_subject)); -+next_component: -+ /* end case */ -+ if (p == name) -+ break; -+ -+ while (*p != '/') -+ p--; -+ if (p == name) -+ lastp = p + 1; -+ else { -+ lastp = p; -+ p--; -+ } -+ c = *lastp; -+ *lastp = '\0'; -+ } while (1); -+found_obj: -+ read_unlock(&gr_inode_lock); -+ /* obj returned will always be non-null */ -+ return obj; -+} -+ -+/* returns 0 when allowing, non-zero on error -+ op of 0 is used for readdir, so we don't log the names of hidden files -+*/ -+__u32 -+gr_handle_sysctl(const struct ctl_table *table, const int op) -+{ -+ ctl_table *tmp; -+ const char *proc_sys = "/proc/sys"; -+ char *path; -+ struct acl_object_label *obj; -+ unsigned short len = 0, pos = 0, depth = 0, i; -+ 
__u32 err = 0; -+ __u32 mode = 0; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+ -+ /* for now, ignore operations on non-sysctl entries if it's not a -+ readdir*/ -+ if (table->child != NULL && op != 0) -+ return 0; -+ -+ mode |= GR_FIND; -+ /* it's only a read if it's an entry, read on dirs is for readdir */ -+ if (op & MAY_READ) -+ mode |= GR_READ; -+ if (op & MAY_WRITE) -+ mode |= GR_WRITE; -+ -+ preempt_disable(); -+ -+ path = per_cpu_ptr(gr_shared_page[0], smp_processor_id()); -+ -+ /* it's only a read/write if it's an actual entry, not a dir -+ (which are opened for readdir) -+ */ -+ -+ /* convert the requested sysctl entry into a pathname */ -+ -+ for (tmp = (ctl_table *)table; tmp != NULL; tmp = tmp->parent) { -+ len += strlen(tmp->procname); -+ len++; -+ depth++; -+ } -+ -+ if ((len + depth + strlen(proc_sys) + 1) > PAGE_SIZE) { -+ /* deny */ -+ goto out; -+ } -+ -+ memset(path, 0, PAGE_SIZE); -+ -+ memcpy(path, proc_sys, strlen(proc_sys)); -+ -+ pos += strlen(proc_sys); -+ -+ for (; depth > 0; depth--) { -+ path[pos] = '/'; -+ pos++; -+ for (i = 1, tmp = (ctl_table *)table; tmp != NULL; tmp = tmp->parent) { -+ if (depth == i) { -+ memcpy(path + pos, tmp->procname, -+ strlen(tmp->procname)); -+ pos += strlen(tmp->procname); -+ } -+ i++; -+ } -+ } -+ -+ obj = gr_lookup_by_name(path, pos); -+ err = obj->mode & (mode | to_gr_audit(mode) | GR_SUPPRESS); -+ -+ if (unlikely((current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) && -+ ((err & mode) != mode))) { -+ __u32 new_mode = mode; -+ -+ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); -+ -+ err = 0; -+ gr_log_learn_sysctl(path, new_mode); -+ } else if (!(err & GR_FIND) && !(err & GR_SUPPRESS) && op != 0) { -+ gr_log_hidden_sysctl(GR_DONT_AUDIT, GR_HIDDEN_ACL_MSG, path); -+ err = -ENOENT; -+ } else if (!(err & GR_FIND)) { -+ err = -ENOENT; -+ } else if (((err & mode) & ~GR_FIND) != (mode & ~GR_FIND) && !(err & GR_SUPPRESS)) { -+ gr_log_str4(GR_DONT_AUDIT, GR_SYSCTL_ACL_MSG, "denied", -+ path, (mode & GR_READ) ? " reading" : "", -+ (mode & GR_WRITE) ? " writing" : ""); -+ err = -EACCES; -+ } else if ((err & mode) != mode) { -+ err = -EACCES; -+ } else if ((((err & mode) & ~GR_FIND) == (mode & ~GR_FIND)) && (err & GR_AUDITS)) { -+ gr_log_str4(GR_DO_AUDIT, GR_SYSCTL_ACL_MSG, "successful", -+ path, (mode & GR_READ) ? " reading" : "", -+ (mode & GR_WRITE) ? 
" writing" : ""); -+ err = 0; -+ } else -+ err = 0; -+ -+ out: -+ preempt_enable(); -+ -+ return err; -+} -+#endif -+ -+int -+gr_handle_proc_ptrace(struct task_struct *task) -+{ -+ struct file *filp; -+ struct task_struct *tmp = task; -+ struct task_struct *curtemp = current; -+ __u32 retmode; -+ -+#ifndef CONFIG_GRKERNSEC_HARDEN_PTRACE -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+#endif -+ -+ read_lock(&tasklist_lock); -+ read_lock(&grsec_exec_file_lock); -+ filp = task->exec_file; -+ -+ while (tmp->pid > 0) { -+ if (tmp == curtemp) -+ break; -+ tmp = tmp->parent; -+ } -+ -+ if (!filp || (tmp->pid == 0 && ((grsec_enable_harden_ptrace && current_uid() && !(gr_status & GR_READY)) || -+ ((gr_status & GR_READY) && !(current->acl->mode & GR_RELAXPTRACE))))) { -+ read_unlock(&grsec_exec_file_lock); -+ read_unlock(&tasklist_lock); -+ return 1; -+ } -+ -+#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE -+ if (!(gr_status & GR_READY)) { -+ read_unlock(&grsec_exec_file_lock); -+ read_unlock(&tasklist_lock); -+ return 0; -+ } -+#endif -+ -+ retmode = gr_search_file(filp->f_path.dentry, GR_NOPTRACE, filp->f_path.mnt); -+ read_unlock(&grsec_exec_file_lock); -+ read_unlock(&tasklist_lock); -+ -+ if (retmode & GR_NOPTRACE) -+ return 1; -+ -+ if (!(current->acl->mode & GR_POVERRIDE) && !(current->role->roletype & GR_ROLE_GOD) -+ && (current->acl != task->acl || (current->acl != current->role->root_label -+ && current->pid != task->pid))) -+ return 1; -+ -+ return 0; -+} -+ -+int -+gr_handle_ptrace(struct task_struct *task, const long request) -+{ -+ struct task_struct *tmp = task; -+ struct task_struct *curtemp = current; -+ __u32 retmode; -+ -+#ifndef CONFIG_GRKERNSEC_HARDEN_PTRACE -+ if (unlikely(!(gr_status & GR_READY))) -+ return 0; -+#endif -+ -+ read_lock(&tasklist_lock); -+ while (tmp->pid > 0) { -+ if (tmp == curtemp) -+ break; -+ tmp = tmp->parent; -+ } -+ -+ if (tmp->pid == 0 && ((grsec_enable_harden_ptrace && current_uid() && !(gr_status & GR_READY)) || -+ ((gr_status & GR_READY) && !(current->acl->mode & GR_RELAXPTRACE)))) { -+ read_unlock(&tasklist_lock); -+ gr_log_ptrace(GR_DONT_AUDIT, GR_PTRACE_ACL_MSG, task); -+ return 1; -+ } -+ read_unlock(&tasklist_lock); -+ -+#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE -+ if (!(gr_status & GR_READY)) -+ return 0; -+#endif -+ -+ read_lock(&grsec_exec_file_lock); -+ if (unlikely(!task->exec_file)) { -+ read_unlock(&grsec_exec_file_lock); -+ return 0; -+ } -+ -+ retmode = gr_search_file(task->exec_file->f_path.dentry, GR_PTRACERD | GR_NOPTRACE, task->exec_file->f_path.mnt); -+ read_unlock(&grsec_exec_file_lock); -+ -+ if (retmode & GR_NOPTRACE) { -+ gr_log_ptrace(GR_DONT_AUDIT, GR_PTRACE_ACL_MSG, task); -+ return 1; -+ } -+ -+ if (retmode & GR_PTRACERD) { -+ switch (request) { -+ case PTRACE_POKETEXT: -+ case PTRACE_POKEDATA: -+ case PTRACE_POKEUSR: -+#if !defined(CONFIG_PPC32) && !defined(CONFIG_PPC64) && !defined(CONFIG_PARISC) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64) -+ case PTRACE_SETREGS: -+ case PTRACE_SETFPREGS: -+#endif -+#ifdef CONFIG_X86 -+ case PTRACE_SETFPXREGS: -+#endif -+#ifdef CONFIG_ALTIVEC -+ case PTRACE_SETVRREGS: -+#endif -+ return 1; -+ default: -+ return 0; -+ } -+ } else if (!(current->acl->mode & GR_POVERRIDE) && -+ !(current->role->roletype & GR_ROLE_GOD) && -+ (current->acl != task->acl)) { -+ gr_log_ptrace(GR_DONT_AUDIT, GR_PTRACE_ACL_MSG, task); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static int is_writable_mmap(const struct file *filp) -+{ -+ struct task_struct *task = current; -+ struct acl_object_label *obj, *obj2; 
-+ -+ if (gr_status & GR_READY && !(task->acl->mode & GR_OVERRIDE) && -+ !task->is_writable && S_ISREG(filp->f_path.dentry->d_inode->i_mode)) { -+ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); -+ obj2 = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, -+ task->role->root_label); -+ if (unlikely((obj->mode & GR_WRITE) || (obj2->mode & GR_WRITE))) { -+ gr_log_fs_generic(GR_DONT_AUDIT, GR_WRITLIB_ACL_MSG, filp->f_path.dentry, filp->f_path.mnt); -+ return 1; -+ } -+ } -+ return 0; -+} -+ -+int -+gr_acl_handle_mmap(const struct file *file, const unsigned long prot) -+{ -+ __u32 mode; -+ -+ if (unlikely(!file || !(prot & PROT_EXEC))) -+ return 1; -+ -+ if (is_writable_mmap(file)) -+ return 0; -+ -+ mode = -+ gr_search_file(file->f_path.dentry, -+ GR_EXEC | GR_AUDIT_EXEC | GR_SUPPRESS, -+ file->f_path.mnt); -+ -+ if (!gr_tpe_allow(file)) -+ return 0; -+ -+ if (unlikely(!(mode & GR_EXEC) && !(mode & GR_SUPPRESS))) { -+ gr_log_fs_rbac_generic(GR_DONT_AUDIT, GR_MMAP_ACL_MSG, file->f_path.dentry, file->f_path.mnt); -+ return 0; -+ } else if (unlikely(!(mode & GR_EXEC))) { -+ return 0; -+ } else if (unlikely(mode & GR_EXEC && mode & GR_AUDIT_EXEC)) { -+ gr_log_fs_rbac_generic(GR_DO_AUDIT, GR_MMAP_ACL_MSG, file->f_path.dentry, file->f_path.mnt); -+ return 1; -+ } -+ -+ return 1; -+} -+ -+int -+gr_acl_handle_mprotect(const struct file *file, const unsigned long prot) -+{ -+ __u32 mode; -+ -+ if (unlikely(!file || !(prot & PROT_EXEC))) -+ return 1; -+ -+ if (is_writable_mmap(file)) -+ return 0; -+ -+ mode = -+ gr_search_file(file->f_path.dentry, -+ GR_EXEC | GR_AUDIT_EXEC | GR_SUPPRESS, -+ file->f_path.mnt); -+ -+ if (!gr_tpe_allow(file)) -+ return 0; -+ -+ if (unlikely(!(mode & GR_EXEC) && !(mode & GR_SUPPRESS))) { -+ gr_log_fs_rbac_generic(GR_DONT_AUDIT, GR_MPROTECT_ACL_MSG, file->f_path.dentry, file->f_path.mnt); -+ return 0; -+ } else if (unlikely(!(mode & GR_EXEC))) { -+ return 0; -+ } else if (unlikely(mode & GR_EXEC && mode & GR_AUDIT_EXEC)) { -+ gr_log_fs_rbac_generic(GR_DO_AUDIT, GR_MPROTECT_ACL_MSG, file->f_path.dentry, file->f_path.mnt); -+ return 1; -+ } -+ -+ return 1; -+} -+ -+void -+gr_acl_handle_psacct(struct task_struct *task, const long code) -+{ -+ unsigned long runtime; -+ unsigned long cputime; -+ unsigned int wday, cday; -+ __u8 whr, chr; -+ __u8 wmin, cmin; -+ __u8 wsec, csec; -+ struct timespec timeval; -+ -+ if (unlikely(!(gr_status & GR_READY) || !task->acl || -+ !(task->acl->mode & GR_PROCACCT))) -+ return; -+ -+ do_posix_clock_monotonic_gettime(&timeval); -+ runtime = timeval.tv_sec - task->start_time.tv_sec; -+ wday = runtime / (3600 * 24); -+ runtime -= wday * (3600 * 24); -+ whr = runtime / 3600; -+ runtime -= whr * 3600; -+ wmin = runtime / 60; -+ runtime -= wmin * 60; -+ wsec = runtime; -+ -+ cputime = (task->utime + task->stime) / HZ; -+ cday = cputime / (3600 * 24); -+ cputime -= cday * (3600 * 24); -+ chr = cputime / 3600; -+ cputime -= chr * 3600; -+ cmin = cputime / 60; -+ cputime -= cmin * 60; -+ csec = cputime; -+ -+ gr_log_procacct(GR_DO_AUDIT, GR_ACL_PROCACCT_MSG, task, wday, whr, wmin, wsec, cday, chr, cmin, csec, code); -+ -+ return; -+} -+ -+void gr_set_kernel_label(struct task_struct *task) -+{ -+ if (gr_status & GR_READY) { -+ task->role = kernel_role; -+ task->acl = kernel_role->root_label; -+ } -+ return; -+} -+ -+#ifdef CONFIG_TASKSTATS -+int gr_is_taskstats_denied(int pid) -+{ -+ struct task_struct *task; -+#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ const struct cred 
*cred; -+#endif -+ int ret = 0; -+ -+ /* restrict taskstats viewing to un-chrooted root users -+ who have the 'view' subject flag if the RBAC system is enabled -+ */ -+ -+ read_lock(&tasklist_lock); -+ task = find_task_by_vpid(pid); -+ if (task) { -+ task_lock(task); -+#ifdef CONFIG_GRKERNSEC_CHROOT -+ if (proc_is_chrooted(task)) -+ ret = -EACCES; -+#endif -+#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ cred = __task_cred(task); -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ if (cred->uid != 0) -+ ret = -EACCES; -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ if (cred->uid != 0 && !groups_search(cred->group_info, CONFIG_GRKERNSEC_PROC_GID)) -+ ret = -EACCES; -+#endif -+#endif -+ if (gr_status & GR_READY) { -+ if (!(task->acl->mode & GR_VIEW)) -+ ret = -EACCES; -+ } -+ -+ task_unlock(task); -+ } else -+ ret = -ENOENT; -+ -+ read_unlock(&tasklist_lock); -+ -+ return ret; -+} -+#endif -+ -+int gr_acl_handle_filldir(const struct file *file, const char *name, const unsigned int namelen, const ino_t ino) -+{ -+ struct task_struct *task = current; -+ struct dentry *dentry = file->f_path.dentry; -+ struct vfsmount *mnt = file->f_path.mnt; -+ struct acl_object_label *obj, *tmp; -+ struct acl_subject_label *subj; -+ unsigned int bufsize; -+ int is_not_root; -+ char *path; -+ -+ if (unlikely(!(gr_status & GR_READY))) -+ return 1; -+ -+ if (task->acl->mode & (GR_LEARN | GR_INHERITLEARN)) -+ return 1; -+ -+ /* ignore Eric Biederman */ -+ if (IS_PRIVATE(dentry->d_inode)) -+ return 1; -+ -+ subj = task->acl; -+ do { -+ obj = lookup_acl_obj_label(ino, dentry->d_inode->i_sb->s_dev, subj); -+ if (obj != NULL) -+ return (obj->mode & GR_FIND) ? 1 : 0; -+ } while ((subj = subj->parent_subject)); -+ -+ /* this is purely an optimization since we're looking for an object -+ for the directory we're doing a readdir on -+ if it's possible for any globbed object to match the entry we're -+ filling into the directory, then the object we find here will be -+ an anchor point with attached globbed objects -+ */ -+ obj = chk_obj_label_noglob(dentry, mnt, task->acl); -+ if (obj->globbed == NULL) -+ return (obj->mode & GR_FIND) ? 1 : 0; -+ -+ is_not_root = ((obj->filename[0] == '/') && -+ (obj->filename[1] == '\0')) ? 0 : 1; -+ bufsize = PAGE_SIZE - namelen - is_not_root; -+ -+ /* check bufsize > PAGE_SIZE || bufsize == 0 */ -+ if (unlikely((bufsize - 1) > (PAGE_SIZE - 1))) -+ return 1; -+ -+ preempt_disable(); -+ path = d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0], smp_processor_id()), -+ bufsize); -+ -+ bufsize = strlen(path); -+ -+ /* if base is "/", don't append an additional slash */ -+ if (is_not_root) -+ *(path + bufsize) = '/'; -+ memcpy(path + bufsize + is_not_root, name, namelen); -+ *(path + bufsize + namelen + is_not_root) = '\0'; -+ -+ tmp = obj->globbed; -+ while (tmp) { -+ if (!glob_match(tmp->filename, path)) { -+ preempt_enable(); -+ return (tmp->mode & GR_FIND) ? 1 : 0; -+ } -+ tmp = tmp->next; -+ } -+ preempt_enable(); -+ return (obj->mode & GR_FIND) ? 
1 : 0; -+} -+ -+EXPORT_SYMBOL(gr_learn_resource); -+EXPORT_SYMBOL(gr_set_kernel_label); -+#ifdef CONFIG_SECURITY -+EXPORT_SYMBOL(gr_check_user_change); -+EXPORT_SYMBOL(gr_check_group_change); -+#endif -+ -diff -urNp linux-2.6.31.1/grsecurity/gracl_cap.c linux-2.6.31.1/grsecurity/gracl_cap.c ---- linux-2.6.31.1/grsecurity/gracl_cap.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl_cap.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,131 @@ -+#include <linux/kernel.h> -+#include <linux/module.h> -+#include <linux/sched.h> -+#include <linux/gracl.h> -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+ -+static const char *captab_log[] = { -+ "CAP_CHOWN", -+ "CAP_DAC_OVERRIDE", -+ "CAP_DAC_READ_SEARCH", -+ "CAP_FOWNER", -+ "CAP_FSETID", -+ "CAP_KILL", -+ "CAP_SETGID", -+ "CAP_SETUID", -+ "CAP_SETPCAP", -+ "CAP_LINUX_IMMUTABLE", -+ "CAP_NET_BIND_SERVICE", -+ "CAP_NET_BROADCAST", -+ "CAP_NET_ADMIN", -+ "CAP_NET_RAW", -+ "CAP_IPC_LOCK", -+ "CAP_IPC_OWNER", -+ "CAP_SYS_MODULE", -+ "CAP_SYS_RAWIO", -+ "CAP_SYS_CHROOT", -+ "CAP_SYS_PTRACE", -+ "CAP_SYS_PACCT", -+ "CAP_SYS_ADMIN", -+ "CAP_SYS_BOOT", -+ "CAP_SYS_NICE", -+ "CAP_SYS_RESOURCE", -+ "CAP_SYS_TIME", -+ "CAP_SYS_TTY_CONFIG", -+ "CAP_MKNOD", -+ "CAP_LEASE", -+ "CAP_AUDIT_WRITE", -+ "CAP_AUDIT_CONTROL", -+ "CAP_SETFCAP", -+ "CAP_MAC_OVERRIDE", -+ "CAP_MAC_ADMIN" -+}; -+ -+EXPORT_SYMBOL(gr_is_capable); -+EXPORT_SYMBOL(gr_is_capable_nolog); -+ -+int -+gr_is_capable(const int cap) -+{ -+ struct task_struct *task = current; -+ const struct cred *cred = current_cred(); -+ struct acl_subject_label *curracl; -+ kernel_cap_t cap_drop = __cap_empty_set, cap_mask = __cap_empty_set; -+ -+ if (!gr_acl_is_enabled()) -+ return 1; -+ -+ curracl = task->acl; -+ -+ cap_drop = curracl->cap_lower; -+ cap_mask = curracl->cap_mask; -+ -+ while ((curracl = curracl->parent_subject)) { -+ /* if the cap isn't specified in the current computed mask but is specified in the -+ current level subject, and is lowered in the current level subject, then add -+ it to the set of dropped capabilities -+ otherwise, add the current level subject's mask to the current computed mask -+ */ -+ if (!cap_raised(cap_mask, cap) && cap_raised(curracl->cap_mask, cap)) { -+ cap_raise(cap_mask, cap); -+ if (cap_raised(curracl->cap_lower, cap)) -+ cap_raise(cap_drop, cap); -+ } -+ } -+ -+ if (!cap_raised(cap_drop, cap)) -+ return 1; -+ -+ curracl = task->acl; -+ -+ if ((curracl->mode & (GR_LEARN | GR_INHERITLEARN)) -+ && cap_raised(cred->cap_effective, cap)) { -+ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, -+ task->role->roletype, cred->uid, -+ cred->gid, task->exec_file ? 
-+ gr_to_filename(task->exec_file->f_path.dentry, -+ task->exec_file->f_path.mnt) : curracl->filename, -+ curracl->filename, 0UL, -+ 0UL, "", (unsigned long) cap, NIPQUAD(task->signal->curr_ip)); -+ return 1; -+ } -+ -+ if ((cap >= 0) && (cap < (sizeof(captab_log)/sizeof(captab_log[0]))) && cap_raised(cred->cap_effective, cap)) -+ gr_log_cap(GR_DONT_AUDIT, GR_CAP_ACL_MSG, task, captab_log[cap]); -+ return 0; -+} -+ -+int -+gr_is_capable_nolog(const int cap) -+{ -+ struct acl_subject_label *curracl; -+ kernel_cap_t cap_drop = __cap_empty_set, cap_mask = __cap_empty_set; -+ -+ if (!gr_acl_is_enabled()) -+ return 1; -+ -+ curracl = current->acl; -+ -+ cap_drop = curracl->cap_lower; -+ cap_mask = curracl->cap_mask; -+ -+ while ((curracl = curracl->parent_subject)) { -+ /* if the cap isn't specified in the current computed mask but is specified in the -+ current level subject, and is lowered in the current level subject, then add -+ it to the set of dropped capabilities -+ otherwise, add the current level subject's mask to the current computed mask -+ */ -+ if (!cap_raised(cap_mask, cap) && cap_raised(curracl->cap_mask, cap)) { -+ cap_raise(cap_mask, cap); -+ if (cap_raised(curracl->cap_lower, cap)) -+ cap_raise(cap_drop, cap); -+ } -+ } -+ -+ if (!cap_raised(cap_drop, cap)) -+ return 1; -+ -+ return 0; -+} -+ -diff -urNp linux-2.6.31.1/grsecurity/gracl_fs.c linux-2.6.31.1/grsecurity/gracl_fs.c ---- linux-2.6.31.1/grsecurity/gracl_fs.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl_fs.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,424 @@ -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/types.h> -+#include <linux/fs.h> -+#include <linux/file.h> -+#include <linux/stat.h> -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+#include <linux/gracl.h> -+ -+__u32 -+gr_acl_handle_hidden_file(const struct dentry * dentry, -+ const struct vfsmount * mnt) -+{ -+ __u32 mode; -+ -+ if (unlikely(!dentry->d_inode)) -+ return GR_FIND; -+ -+ mode = -+ gr_search_file(dentry, GR_FIND | GR_AUDIT_FIND | GR_SUPPRESS, mnt); -+ -+ if (unlikely(mode & GR_FIND && mode & GR_AUDIT_FIND)) { -+ gr_log_fs_rbac_generic(GR_DO_AUDIT, GR_HIDDEN_ACL_MSG, dentry, mnt); -+ return mode; -+ } else if (unlikely(!(mode & GR_FIND) && !(mode & GR_SUPPRESS))) { -+ gr_log_fs_rbac_generic(GR_DONT_AUDIT, GR_HIDDEN_ACL_MSG, dentry, mnt); -+ return 0; -+ } else if (unlikely(!(mode & GR_FIND))) -+ return 0; -+ -+ return GR_FIND; -+} -+ -+__u32 -+gr_acl_handle_open(const struct dentry * dentry, const struct vfsmount * mnt, -+ const int fmode) -+{ -+ __u32 reqmode = GR_FIND; -+ __u32 mode; -+ -+ if (unlikely(!dentry->d_inode)) -+ return reqmode; -+ -+ if (unlikely(fmode & O_APPEND)) -+ reqmode |= GR_APPEND; -+ else if (unlikely(fmode & FMODE_WRITE)) -+ reqmode |= GR_WRITE; -+ if (likely((fmode & FMODE_READ) && !(fmode & O_DIRECTORY))) -+ reqmode |= GR_READ; -+ if ((fmode & FMODE_GREXEC) && (fmode & FMODE_EXEC)) -+ reqmode &= ~GR_READ; -+ mode = -+ gr_search_file(dentry, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS, -+ mnt); -+ -+ if (unlikely(((mode & reqmode) == reqmode) && mode & GR_AUDITS)) { -+ gr_log_fs_rbac_mode2(GR_DO_AUDIT, GR_OPEN_ACL_MSG, dentry, mnt, -+ reqmode & GR_READ ? " reading" : "", -+ reqmode & GR_WRITE ? " writing" : reqmode & -+ GR_APPEND ? " appending" : ""); -+ return reqmode; -+ } else -+ if (unlikely((mode & reqmode) != reqmode && !(mode & GR_SUPPRESS))) -+ { -+ gr_log_fs_rbac_mode2(GR_DONT_AUDIT, GR_OPEN_ACL_MSG, dentry, mnt, -+ reqmode & GR_READ ? 
" reading" : "", -+ reqmode & GR_WRITE ? " writing" : reqmode & -+ GR_APPEND ? " appending" : ""); -+ return 0; -+ } else if (unlikely((mode & reqmode) != reqmode)) -+ return 0; -+ -+ return reqmode; -+} -+ -+__u32 -+gr_acl_handle_creat(const struct dentry * dentry, -+ const struct dentry * p_dentry, -+ const struct vfsmount * p_mnt, const int fmode, -+ const int imode) -+{ -+ __u32 reqmode = GR_WRITE | GR_CREATE; -+ __u32 mode; -+ -+ if (unlikely(fmode & O_APPEND)) -+ reqmode |= GR_APPEND; -+ if (unlikely((fmode & FMODE_READ) && !(fmode & O_DIRECTORY))) -+ reqmode |= GR_READ; -+ if (unlikely((fmode & O_CREAT) && (imode & (S_ISUID | S_ISGID)))) -+ reqmode |= GR_SETID; -+ -+ mode = -+ gr_check_create(dentry, p_dentry, p_mnt, -+ reqmode | to_gr_audit(reqmode) | GR_SUPPRESS); -+ -+ if (unlikely(((mode & reqmode) == reqmode) && mode & GR_AUDITS)) { -+ gr_log_fs_rbac_mode2(GR_DO_AUDIT, GR_CREATE_ACL_MSG, dentry, p_mnt, -+ reqmode & GR_READ ? " reading" : "", -+ reqmode & GR_WRITE ? " writing" : reqmode & -+ GR_APPEND ? " appending" : ""); -+ return reqmode; -+ } else -+ if (unlikely((mode & reqmode) != reqmode && !(mode & GR_SUPPRESS))) -+ { -+ gr_log_fs_rbac_mode2(GR_DONT_AUDIT, GR_CREATE_ACL_MSG, dentry, p_mnt, -+ reqmode & GR_READ ? " reading" : "", -+ reqmode & GR_WRITE ? " writing" : reqmode & -+ GR_APPEND ? " appending" : ""); -+ return 0; -+ } else if (unlikely((mode & reqmode) != reqmode)) -+ return 0; -+ -+ return reqmode; -+} -+ -+__u32 -+gr_acl_handle_access(const struct dentry * dentry, const struct vfsmount * mnt, -+ const int fmode) -+{ -+ __u32 mode, reqmode = GR_FIND; -+ -+ if ((fmode & S_IXOTH) && !S_ISDIR(dentry->d_inode->i_mode)) -+ reqmode |= GR_EXEC; -+ if (fmode & S_IWOTH) -+ reqmode |= GR_WRITE; -+ if (fmode & S_IROTH) -+ reqmode |= GR_READ; -+ -+ mode = -+ gr_search_file(dentry, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS, -+ mnt); -+ -+ if (unlikely(((mode & reqmode) == reqmode) && mode & GR_AUDITS)) { -+ gr_log_fs_rbac_mode3(GR_DO_AUDIT, GR_ACCESS_ACL_MSG, dentry, mnt, -+ reqmode & GR_READ ? " reading" : "", -+ reqmode & GR_WRITE ? " writing" : "", -+ reqmode & GR_EXEC ? " executing" : ""); -+ return reqmode; -+ } else -+ if (unlikely((mode & reqmode) != reqmode && !(mode & GR_SUPPRESS))) -+ { -+ gr_log_fs_rbac_mode3(GR_DONT_AUDIT, GR_ACCESS_ACL_MSG, dentry, mnt, -+ reqmode & GR_READ ? " reading" : "", -+ reqmode & GR_WRITE ? " writing" : "", -+ reqmode & GR_EXEC ? 
" executing" : ""); -+ return 0; -+ } else if (unlikely((mode & reqmode) != reqmode)) -+ return 0; -+ -+ return reqmode; -+} -+ -+static __u32 generic_fs_handler(const struct dentry *dentry, const struct vfsmount *mnt, __u32 reqmode, const char *fmt) -+{ -+ __u32 mode; -+ -+ mode = gr_search_file(dentry, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS, mnt); -+ -+ if (unlikely(((mode & (reqmode)) == (reqmode)) && mode & GR_AUDITS)) { -+ gr_log_fs_rbac_generic(GR_DO_AUDIT, fmt, dentry, mnt); -+ return mode; -+ } else if (unlikely((mode & (reqmode)) != (reqmode) && !(mode & GR_SUPPRESS))) { -+ gr_log_fs_rbac_generic(GR_DONT_AUDIT, fmt, dentry, mnt); -+ return 0; -+ } else if (unlikely((mode & (reqmode)) != (reqmode))) -+ return 0; -+ -+ return (reqmode); -+} -+ -+__u32 -+gr_acl_handle_rmdir(const struct dentry * dentry, const struct vfsmount * mnt) -+{ -+ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_DELETE , GR_RMDIR_ACL_MSG); -+} -+ -+__u32 -+gr_acl_handle_unlink(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_DELETE , GR_UNLINK_ACL_MSG); -+} -+ -+__u32 -+gr_acl_handle_truncate(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_TRUNCATE_ACL_MSG); -+} -+ -+__u32 -+gr_acl_handle_utime(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_ATIME_ACL_MSG); -+} -+ -+__u32 -+gr_acl_handle_fchmod(const struct dentry *dentry, const struct vfsmount *mnt, -+ mode_t mode) -+{ -+ if (unlikely(dentry->d_inode && S_ISSOCK(dentry->d_inode->i_mode))) -+ return 1; -+ -+ if (unlikely((mode != (mode_t)-1) && (mode & (S_ISUID | S_ISGID)))) { -+ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_SETID, -+ GR_FCHMOD_ACL_MSG); -+ } else { -+ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_FCHMOD_ACL_MSG); -+ } -+} -+ -+__u32 -+gr_acl_handle_chmod(const struct dentry *dentry, const struct vfsmount *mnt, -+ mode_t mode) -+{ -+ if (unlikely((mode != (mode_t)-1) && (mode & (S_ISUID | S_ISGID)))) { -+ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_SETID, -+ GR_CHMOD_ACL_MSG); -+ } else { -+ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_CHMOD_ACL_MSG); -+ } -+} -+ -+__u32 -+gr_acl_handle_chown(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_CHOWN_ACL_MSG); -+} -+ -+__u32 -+gr_acl_handle_execve(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return generic_fs_handler(dentry, mnt, GR_EXEC, GR_EXEC_ACL_MSG); -+} -+ -+__u32 -+gr_acl_handle_unix(const struct dentry *dentry, const struct vfsmount *mnt) -+{ -+ return generic_fs_handler(dentry, mnt, GR_READ | GR_WRITE, -+ GR_UNIXCONNECT_ACL_MSG); -+} -+ -+/* hardlinks require at minimum create permission, -+ any additional privilege required is based on the -+ privilege of the file being linked to -+*/ -+__u32 -+gr_acl_handle_link(const struct dentry * new_dentry, -+ const struct dentry * parent_dentry, -+ const struct vfsmount * parent_mnt, -+ const struct dentry * old_dentry, -+ const struct vfsmount * old_mnt, const char *to) -+{ -+ __u32 mode; -+ __u32 needmode = GR_CREATE | GR_LINK; -+ __u32 needaudit = GR_AUDIT_CREATE | GR_AUDIT_LINK; -+ -+ mode = -+ gr_check_link(new_dentry, parent_dentry, parent_mnt, old_dentry, -+ old_mnt); -+ -+ if (unlikely(((mode & needmode) == needmode) && (mode & needaudit))) { -+ gr_log_fs_rbac_str(GR_DO_AUDIT, GR_LINK_ACL_MSG, old_dentry, old_mnt, 
to); -+ return mode; -+ } else if (unlikely(((mode & needmode) != needmode) && !(mode & GR_SUPPRESS))) { -+ gr_log_fs_rbac_str(GR_DONT_AUDIT, GR_LINK_ACL_MSG, old_dentry, old_mnt, to); -+ return 0; -+ } else if (unlikely((mode & needmode) != needmode)) -+ return 0; -+ -+ return 1; -+} -+ -+__u32 -+gr_acl_handle_symlink(const struct dentry * new_dentry, -+ const struct dentry * parent_dentry, -+ const struct vfsmount * parent_mnt, const char *from) -+{ -+ __u32 needmode = GR_WRITE | GR_CREATE; -+ __u32 mode; -+ -+ mode = -+ gr_check_create(new_dentry, parent_dentry, parent_mnt, -+ GR_CREATE | GR_AUDIT_CREATE | -+ GR_WRITE | GR_AUDIT_WRITE | GR_SUPPRESS); -+ -+ if (unlikely(mode & GR_WRITE && mode & GR_AUDITS)) { -+ gr_log_fs_str_rbac(GR_DO_AUDIT, GR_SYMLINK_ACL_MSG, from, new_dentry, parent_mnt); -+ return mode; -+ } else if (unlikely(((mode & needmode) != needmode) && !(mode & GR_SUPPRESS))) { -+ gr_log_fs_str_rbac(GR_DONT_AUDIT, GR_SYMLINK_ACL_MSG, from, new_dentry, parent_mnt); -+ return 0; -+ } else if (unlikely((mode & needmode) != needmode)) -+ return 0; -+ -+ return (GR_WRITE | GR_CREATE); -+} -+ -+static __u32 generic_fs_create_handler(const struct dentry *new_dentry, const struct dentry *parent_dentry, const struct vfsmount *parent_mnt, __u32 reqmode, const char *fmt) -+{ -+ __u32 mode; -+ -+ mode = gr_check_create(new_dentry, parent_dentry, parent_mnt, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS); -+ -+ if (unlikely(((mode & (reqmode)) == (reqmode)) && mode & GR_AUDITS)) { -+ gr_log_fs_rbac_generic(GR_DO_AUDIT, fmt, new_dentry, parent_mnt); -+ return mode; -+ } else if (unlikely((mode & (reqmode)) != (reqmode) && !(mode & GR_SUPPRESS))) { -+ gr_log_fs_rbac_generic(GR_DONT_AUDIT, fmt, new_dentry, parent_mnt); -+ return 0; -+ } else if (unlikely((mode & (reqmode)) != (reqmode))) -+ return 0; -+ -+ return (reqmode); -+} -+ -+__u32 -+gr_acl_handle_mknod(const struct dentry * new_dentry, -+ const struct dentry * parent_dentry, -+ const struct vfsmount * parent_mnt, -+ const int mode) -+{ -+ __u32 reqmode = GR_WRITE | GR_CREATE; -+ if (unlikely(mode & (S_ISUID | S_ISGID))) -+ reqmode |= GR_SETID; -+ -+ return generic_fs_create_handler(new_dentry, parent_dentry, parent_mnt, -+ reqmode, GR_MKNOD_ACL_MSG); -+} -+ -+__u32 -+gr_acl_handle_mkdir(const struct dentry *new_dentry, -+ const struct dentry *parent_dentry, -+ const struct vfsmount *parent_mnt) -+{ -+ return generic_fs_create_handler(new_dentry, parent_dentry, parent_mnt, -+ GR_WRITE | GR_CREATE, GR_MKDIR_ACL_MSG); -+} -+ -+#define RENAME_CHECK_SUCCESS(old, new) \ -+ (((old & (GR_WRITE | GR_READ)) == (GR_WRITE | GR_READ)) && \ -+ ((new & (GR_WRITE | GR_READ)) == (GR_WRITE | GR_READ))) -+ -+int -+gr_acl_handle_rename(struct dentry *new_dentry, -+ struct dentry *parent_dentry, -+ const struct vfsmount *parent_mnt, -+ struct dentry *old_dentry, -+ struct inode *old_parent_inode, -+ struct vfsmount *old_mnt, const char *newname) -+{ -+ __u32 comp1, comp2; -+ int error = 0; -+ -+ if (unlikely(!gr_acl_is_enabled())) -+ return 0; -+ -+ if (!new_dentry->d_inode) { -+ comp1 = gr_check_create(new_dentry, parent_dentry, parent_mnt, -+ GR_READ | GR_WRITE | GR_CREATE | GR_AUDIT_READ | -+ GR_AUDIT_WRITE | GR_AUDIT_CREATE | GR_SUPPRESS); -+ comp2 = gr_search_file(old_dentry, GR_READ | GR_WRITE | -+ GR_DELETE | GR_AUDIT_DELETE | -+ GR_AUDIT_READ | GR_AUDIT_WRITE | -+ GR_SUPPRESS, old_mnt); -+ } else { -+ comp1 = gr_search_file(new_dentry, GR_READ | GR_WRITE | -+ GR_CREATE | GR_DELETE | -+ GR_AUDIT_CREATE | GR_AUDIT_DELETE | -+ GR_AUDIT_READ | 
GR_AUDIT_WRITE | -+ GR_SUPPRESS, parent_mnt); -+ comp2 = -+ gr_search_file(old_dentry, -+ GR_READ | GR_WRITE | GR_AUDIT_READ | -+ GR_DELETE | GR_AUDIT_DELETE | -+ GR_AUDIT_WRITE | GR_SUPPRESS, old_mnt); -+ } -+ -+ if (RENAME_CHECK_SUCCESS(comp1, comp2) && -+ ((comp1 & GR_AUDITS) || (comp2 & GR_AUDITS))) -+ gr_log_fs_rbac_str(GR_DO_AUDIT, GR_RENAME_ACL_MSG, old_dentry, old_mnt, newname); -+ else if (!RENAME_CHECK_SUCCESS(comp1, comp2) && !(comp1 & GR_SUPPRESS) -+ && !(comp2 & GR_SUPPRESS)) { -+ gr_log_fs_rbac_str(GR_DONT_AUDIT, GR_RENAME_ACL_MSG, old_dentry, old_mnt, newname); -+ error = -EACCES; -+ } else if (unlikely(!RENAME_CHECK_SUCCESS(comp1, comp2))) -+ error = -EACCES; -+ -+ return error; -+} -+ -+void -+gr_acl_handle_exit(void) -+{ -+ u16 id; -+ char *rolename; -+ struct file *exec_file; -+ -+ if (unlikely(current->acl_sp_role && gr_acl_is_enabled())) { -+ id = current->acl_role_id; -+ rolename = current->role->rolename; -+ gr_set_acls(1); -+ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_SPROLEL_ACL_MSG, rolename, id); -+ } -+ -+ write_lock(&grsec_exec_file_lock); -+ exec_file = current->exec_file; -+ current->exec_file = NULL; -+ write_unlock(&grsec_exec_file_lock); -+ -+ if (exec_file) -+ fput(exec_file); -+} -+ -+int -+gr_acl_handle_procpidmem(const struct task_struct *task) -+{ -+ if (unlikely(!gr_acl_is_enabled())) -+ return 0; -+ -+ if (task != current && task->acl->mode & GR_PROTPROCFD) -+ return -EACCES; -+ -+ return 0; -+} -diff -urNp linux-2.6.31.1/grsecurity/gracl_ip.c linux-2.6.31.1/grsecurity/gracl_ip.c ---- linux-2.6.31.1/grsecurity/gracl_ip.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl_ip.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,340 @@ -+#include <linux/kernel.h> -+#include <asm/uaccess.h> -+#include <asm/errno.h> -+#include <net/sock.h> -+#include <linux/file.h> -+#include <linux/fs.h> -+#include <linux/net.h> -+#include <linux/in.h> -+#include <linux/skbuff.h> -+#include <linux/ip.h> -+#include <linux/udp.h> -+#include <linux/smp_lock.h> -+#include <linux/types.h> -+#include <linux/sched.h> -+#include <linux/netdevice.h> -+#include <linux/inetdevice.h> -+#include <linux/gracl.h> -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+ -+#define GR_BIND 0x01 -+#define GR_CONNECT 0x02 -+#define GR_INVERT 0x04 -+#define GR_BINDOVERRIDE 0x08 -+#define GR_CONNECTOVERRIDE 0x10 -+ -+static const char * gr_protocols[256] = { -+ "ip", "icmp", "igmp", "ggp", "ipencap", "st", "tcp", "cbt", -+ "egp", "igp", "bbn-rcc", "nvp", "pup", "argus", "emcon", "xnet", -+ "chaos", "udp", "mux", "dcn", "hmp", "prm", "xns-idp", "trunk-1", -+ "trunk-2", "leaf-1", "leaf-2", "rdp", "irtp", "iso-tp4", "netblt", "mfe-nsp", -+ "merit-inp", "sep", "3pc", "idpr", "xtp", "ddp", "idpr-cmtp", "tp++", -+ "il", "ipv6", "sdrp", "ipv6-route", "ipv6-frag", "idrp", "rsvp", "gre", -+ "mhrp", "bna", "ipv6-crypt", "ipv6-auth", "i-nlsp", "swipe", "narp", "mobile", -+ "tlsp", "skip", "ipv6-icmp", "ipv6-nonxt", "ipv6-opts", "unknown:61", "cftp", "unknown:63", -+ "sat-expak", "kryptolan", "rvd", "ippc", "unknown:68", "sat-mon", "visa", "ipcv", -+ "cpnx", "cphb", "wsn", "pvp", "br-sat-mon", "sun-nd", "wb-mon", "wb-expak", -+ "iso-ip", "vmtp", "secure-vmtp", "vines", "ttp", "nfsnet-igp", "dgp", "tcf", -+ "eigrp", "ospf", "sprite-rpc", "larp", "mtp", "ax.25", "ipip", "micp", -+ "scc-sp", "etherip", "encap", "unknown:99", "gmtp", "ifmp", "pnni", "pim", -+ "aris", "scps", "qnx", "a/n", "ipcomp", "snp", "compaq-peer", "ipx-in-ip", -+ "vrrp", "pgm", "unknown:114", "l2tp", "ddx", 
"iatp", "stp", "srp", -+ "uti", "smp", "sm", "ptp", "isis", "fire", "crtp", "crdup", -+ "sscopmce", "iplt", "sps", "pipe", "sctp", "fc", "unkown:134", "unknown:135", -+ "unknown:136", "unknown:137", "unknown:138", "unknown:139", "unknown:140", "unknown:141", "unknown:142", "unknown:143", -+ "unknown:144", "unknown:145", "unknown:146", "unknown:147", "unknown:148", "unknown:149", "unknown:150", "unknown:151", -+ "unknown:152", "unknown:153", "unknown:154", "unknown:155", "unknown:156", "unknown:157", "unknown:158", "unknown:159", -+ "unknown:160", "unknown:161", "unknown:162", "unknown:163", "unknown:164", "unknown:165", "unknown:166", "unknown:167", -+ "unknown:168", "unknown:169", "unknown:170", "unknown:171", "unknown:172", "unknown:173", "unknown:174", "unknown:175", -+ "unknown:176", "unknown:177", "unknown:178", "unknown:179", "unknown:180", "unknown:181", "unknown:182", "unknown:183", -+ "unknown:184", "unknown:185", "unknown:186", "unknown:187", "unknown:188", "unknown:189", "unknown:190", "unknown:191", -+ "unknown:192", "unknown:193", "unknown:194", "unknown:195", "unknown:196", "unknown:197", "unknown:198", "unknown:199", -+ "unknown:200", "unknown:201", "unknown:202", "unknown:203", "unknown:204", "unknown:205", "unknown:206", "unknown:207", -+ "unknown:208", "unknown:209", "unknown:210", "unknown:211", "unknown:212", "unknown:213", "unknown:214", "unknown:215", -+ "unknown:216", "unknown:217", "unknown:218", "unknown:219", "unknown:220", "unknown:221", "unknown:222", "unknown:223", -+ "unknown:224", "unknown:225", "unknown:226", "unknown:227", "unknown:228", "unknown:229", "unknown:230", "unknown:231", -+ "unknown:232", "unknown:233", "unknown:234", "unknown:235", "unknown:236", "unknown:237", "unknown:238", "unknown:239", -+ "unknown:240", "unknown:241", "unknown:242", "unknown:243", "unknown:244", "unknown:245", "unknown:246", "unknown:247", -+ "unknown:248", "unknown:249", "unknown:250", "unknown:251", "unknown:252", "unknown:253", "unknown:254", "unknown:255", -+ }; -+ -+static const char * gr_socktypes[11] = { -+ "unknown:0", "stream", "dgram", "raw", "rdm", "seqpacket", "unknown:6", -+ "unknown:7", "unknown:8", "unknown:9", "packet" -+ }; -+ -+const char * -+gr_proto_to_name(unsigned char proto) -+{ -+ return gr_protocols[proto]; -+} -+ -+const char * -+gr_socktype_to_name(unsigned char type) -+{ -+ return gr_socktypes[type]; -+} -+ -+int -+gr_search_socket(const int domain, const int type, const int protocol) -+{ -+ struct acl_subject_label *curr; -+ const struct cred *cred = current_cred(); -+ -+ if (unlikely(!gr_acl_is_enabled())) -+ goto exit; -+ -+ if ((domain < 0) || (type < 0) || (protocol < 0) || (domain != PF_INET) -+ || (domain >= NPROTO) || (type >= SOCK_MAX) || (protocol > 255)) -+ goto exit; // let the kernel handle it -+ -+ curr = current->acl; -+ -+ if (!curr->ips) -+ goto exit; -+ -+ if ((curr->ip_type & (1 << type)) && -+ (curr->ip_proto[protocol / 32] & (1 << (protocol % 32)))) -+ goto exit; -+ -+ if (curr->mode & (GR_LEARN | GR_INHERITLEARN)) { -+ /* we don't place acls on raw sockets , and sometimes -+ dgram/ip sockets are opened for ioctl and not -+ bind/connect, so we'll fake a bind learn log */ -+ if (type == SOCK_RAW || type == SOCK_PACKET) { -+ __u32 fakeip = 0; -+ security_learn(GR_IP_LEARN_MSG, current->role->rolename, -+ current->role->roletype, cred->uid, -+ cred->gid, current->exec_file ? 
-+ gr_to_filename(current->exec_file->f_path.dentry, -+ current->exec_file->f_path.mnt) : -+ curr->filename, curr->filename, -+ NIPQUAD(fakeip), 0, type, -+ protocol, GR_CONNECT, -+NIPQUAD(current->signal->curr_ip)); -+ } else if ((type == SOCK_DGRAM) && (protocol == IPPROTO_IP)) { -+ __u32 fakeip = 0; -+ security_learn(GR_IP_LEARN_MSG, current->role->rolename, -+ current->role->roletype, cred->uid, -+ cred->gid, current->exec_file ? -+ gr_to_filename(current->exec_file->f_path.dentry, -+ current->exec_file->f_path.mnt) : -+ curr->filename, curr->filename, -+ NIPQUAD(fakeip), 0, type, -+ protocol, GR_BIND, NIPQUAD(current->signal->curr_ip)); -+ } -+ /* we'll log when they use connect or bind */ -+ goto exit; -+ } -+ -+ gr_log_str3(GR_DONT_AUDIT, GR_SOCK_MSG, "inet", -+ gr_socktype_to_name(type), gr_proto_to_name(protocol)); -+ -+ return 0; -+ exit: -+ return 1; -+} -+ -+int check_ip_policy(struct acl_ip_label *ip, __u32 ip_addr, __u16 ip_port, __u8 protocol, const int mode, const int type, __u32 our_addr, __u32 our_netmask) -+{ -+ if ((ip->mode & mode) && -+ (ip_port >= ip->low) && -+ (ip_port <= ip->high) && -+ ((ntohl(ip_addr) & our_netmask) == -+ (ntohl(our_addr) & our_netmask)) -+ && (ip->proto[protocol / 32] & (1 << (protocol % 32))) -+ && (ip->type & (1 << type))) { -+ if (ip->mode & GR_INVERT) -+ return 2; // specifically denied -+ else -+ return 1; // allowed -+ } -+ -+ return 0; // not specifically allowed, may continue parsing -+} -+ -+static int -+gr_search_connectbind(const int full_mode, struct sock *sk, -+ struct sockaddr_in *addr, const int type) -+{ -+ char iface[IFNAMSIZ] = {0}; -+ struct acl_subject_label *curr; -+ struct acl_ip_label *ip; -+ struct inet_sock *isk; -+ struct net_device *dev; -+ struct in_device *idev; -+ unsigned long i; -+ int ret; -+ int mode = full_mode & (GR_BIND | GR_CONNECT); -+ __u32 ip_addr = 0; -+ __u32 our_addr; -+ __u32 our_netmask; -+ char *p; -+ __u16 ip_port = 0; -+ const struct cred *cred = current_cred(); -+ -+ if (unlikely(!gr_acl_is_enabled() || sk->sk_family != PF_INET)) -+ return 0; -+ -+ curr = current->acl; -+ isk = inet_sk(sk); -+ -+ /* INADDR_ANY overriding for binds, inaddr_any_override is already in network order */ -+ if ((full_mode & GR_BINDOVERRIDE) && addr->sin_addr.s_addr == htonl(INADDR_ANY) && curr->inaddr_any_override != 0) -+ addr->sin_addr.s_addr = curr->inaddr_any_override; -+ if ((full_mode & GR_CONNECT) && isk->saddr == htonl(INADDR_ANY) && curr->inaddr_any_override != 0) { -+ struct sockaddr_in saddr; -+ int err; -+ -+ saddr.sin_family = AF_INET; -+ saddr.sin_addr.s_addr = curr->inaddr_any_override; -+ saddr.sin_port = isk->sport; -+ -+ err = security_socket_bind(sk->sk_socket, (struct sockaddr *)&saddr, sizeof(struct sockaddr_in)); -+ if (err) -+ return err; -+ -+ err = sk->sk_socket->ops->bind(sk->sk_socket, (struct sockaddr *)&saddr, sizeof(struct sockaddr_in)); -+ if (err) -+ return err; -+ } -+ -+ if (!curr->ips) -+ return 0; -+ -+ ip_addr = addr->sin_addr.s_addr; -+ ip_port = ntohs(addr->sin_port); -+ -+ if (curr->mode & (GR_LEARN | GR_INHERITLEARN)) { -+ security_learn(GR_IP_LEARN_MSG, current->role->rolename, -+ current->role->roletype, cred->uid, -+ cred->gid, current->exec_file ? 
-+ gr_to_filename(current->exec_file->f_path.dentry, -+ current->exec_file->f_path.mnt) : -+ curr->filename, curr->filename, -+ NIPQUAD(ip_addr), ip_port, type, -+ sk->sk_protocol, mode, NIPQUAD(current->signal->curr_ip)); -+ return 0; -+ } -+ -+ for (i = 0; i < curr->ip_num; i++) { -+ ip = *(curr->ips + i); -+ if (ip->iface != NULL) { -+ strncpy(iface, ip->iface, IFNAMSIZ - 1); -+ p = strchr(iface, ':'); -+ if (p != NULL) -+ *p = '\0'; -+ dev = dev_get_by_name(sock_net(sk), iface); -+ if (dev == NULL) -+ continue; -+ idev = in_dev_get(dev); -+ if (idev == NULL) { -+ dev_put(dev); -+ continue; -+ } -+ rcu_read_lock(); -+ for_ifa(idev) { -+ if (!strcmp(ip->iface, ifa->ifa_label)) { -+ our_addr = ifa->ifa_address; -+ our_netmask = 0xffffffff; -+ ret = check_ip_policy(ip, ip_addr, ip_port, sk->sk_protocol, mode, type, our_addr, our_netmask); -+ if (ret == 1) { -+ rcu_read_unlock(); -+ in_dev_put(idev); -+ dev_put(dev); -+ return 0; -+ } else if (ret == 2) { -+ rcu_read_unlock(); -+ in_dev_put(idev); -+ dev_put(dev); -+ goto denied; -+ } -+ } -+ } endfor_ifa(idev); -+ rcu_read_unlock(); -+ in_dev_put(idev); -+ dev_put(dev); -+ } else { -+ our_addr = ip->addr; -+ our_netmask = ip->netmask; -+ ret = check_ip_policy(ip, ip_addr, ip_port, sk->sk_protocol, mode, type, our_addr, our_netmask); -+ if (ret == 1) -+ return 0; -+ else if (ret == 2) -+ goto denied; -+ } -+ } -+ -+denied: -+ if (mode == GR_BIND) -+ gr_log_int5_str2(GR_DONT_AUDIT, GR_BIND_ACL_MSG, NIPQUAD(ip_addr), ip_port, gr_socktype_to_name(type), gr_proto_to_name(sk->sk_protocol)); -+ else if (mode == GR_CONNECT) -+ gr_log_int5_str2(GR_DONT_AUDIT, GR_CONNECT_ACL_MSG, NIPQUAD(ip_addr), ip_port, gr_socktype_to_name(type), gr_proto_to_name(sk->sk_protocol)); -+ -+ return -EACCES; -+} -+ -+int -+gr_search_connect(struct socket *sock, struct sockaddr_in *addr) -+{ -+ return gr_search_connectbind(GR_CONNECT | GR_CONNECTOVERRIDE, sock->sk, addr, sock->type); -+} -+ -+int -+gr_search_bind(struct socket *sock, struct sockaddr_in *addr) -+{ -+ return gr_search_connectbind(GR_BIND | GR_BINDOVERRIDE, sock->sk, addr, sock->type); -+} -+ -+int gr_search_listen(struct socket *sock) -+{ -+ struct sock *sk = sock->sk; -+ struct sockaddr_in addr; -+ -+ addr.sin_addr.s_addr = inet_sk(sk)->saddr; -+ addr.sin_port = inet_sk(sk)->sport; -+ -+ return gr_search_connectbind(GR_BIND | GR_CONNECTOVERRIDE, sock->sk, &addr, sock->type); -+} -+ -+int gr_search_accept(struct socket *sock) -+{ -+ struct sock *sk = sock->sk; -+ struct sockaddr_in addr; -+ -+ addr.sin_addr.s_addr = inet_sk(sk)->saddr; -+ addr.sin_port = inet_sk(sk)->sport; -+ -+ return gr_search_connectbind(GR_BIND | GR_CONNECTOVERRIDE, sock->sk, &addr, sock->type); -+} -+ -+int -+gr_search_udp_sendmsg(struct sock *sk, struct sockaddr_in *addr) -+{ -+ if (addr) -+ return gr_search_connectbind(GR_CONNECT, sk, addr, SOCK_DGRAM); -+ else { -+ struct sockaddr_in sin; -+ const struct inet_sock *inet = inet_sk(sk); -+ -+ sin.sin_addr.s_addr = inet->daddr; -+ sin.sin_port = inet->dport; -+ -+ return gr_search_connectbind(GR_CONNECT | GR_CONNECTOVERRIDE, sk, &sin, SOCK_DGRAM); -+ } -+} -+ -+int -+gr_search_udp_recvmsg(struct sock *sk, const struct sk_buff *skb) -+{ -+ struct sockaddr_in sin; -+ -+ if (unlikely(skb->len < sizeof (struct udphdr))) -+ return 0; // skip this packet -+ -+ sin.sin_addr.s_addr = ip_hdr(skb)->saddr; -+ sin.sin_port = udp_hdr(skb)->source; -+ -+ return gr_search_connectbind(GR_CONNECT | GR_CONNECTOVERRIDE, sk, &sin, SOCK_DGRAM); -+} -diff -urNp linux-2.6.31.1/grsecurity/gracl_learn.c 
linux-2.6.31.1/grsecurity/gracl_learn.c ---- linux-2.6.31.1/grsecurity/gracl_learn.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl_learn.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,211 @@ -+#include <linux/kernel.h> -+#include <linux/mm.h> -+#include <linux/sched.h> -+#include <linux/poll.h> -+#include <linux/smp_lock.h> -+#include <linux/string.h> -+#include <linux/file.h> -+#include <linux/types.h> -+#include <linux/vmalloc.h> -+#include <linux/grinternal.h> -+ -+extern ssize_t write_grsec_handler(struct file * file, const char __user * buf, -+ size_t count, loff_t *ppos); -+extern int gr_acl_is_enabled(void); -+ -+static DECLARE_WAIT_QUEUE_HEAD(learn_wait); -+static int gr_learn_attached; -+ -+/* use a 512k buffer */ -+#define LEARN_BUFFER_SIZE (512 * 1024) -+ -+static DEFINE_SPINLOCK(gr_learn_lock); -+static DECLARE_MUTEX(gr_learn_user_sem); -+ -+/* we need to maintain two buffers, so that the kernel context of grlearn -+ uses a semaphore around the userspace copying, and the other kernel contexts -+ use a spinlock when copying into the buffer, since they cannot sleep -+*/ -+static char *learn_buffer; -+static char *learn_buffer_user; -+static int learn_buffer_len; -+static int learn_buffer_user_len; -+ -+static ssize_t -+read_learn(struct file *file, char __user * buf, size_t count, loff_t * ppos) -+{ -+ DECLARE_WAITQUEUE(wait, current); -+ ssize_t retval = 0; -+ -+ add_wait_queue(&learn_wait, &wait); -+ set_current_state(TASK_INTERRUPTIBLE); -+ do { -+ down(&gr_learn_user_sem); -+ spin_lock(&gr_learn_lock); -+ if (learn_buffer_len) -+ break; -+ spin_unlock(&gr_learn_lock); -+ up(&gr_learn_user_sem); -+ if (file->f_flags & O_NONBLOCK) { -+ retval = -EAGAIN; -+ goto out; -+ } -+ if (signal_pending(current)) { -+ retval = -ERESTARTSYS; -+ goto out; -+ } -+ -+ schedule(); -+ } while (1); -+ -+ memcpy(learn_buffer_user, learn_buffer, learn_buffer_len); -+ learn_buffer_user_len = learn_buffer_len; -+ retval = learn_buffer_len; -+ learn_buffer_len = 0; -+ -+ spin_unlock(&gr_learn_lock); -+ -+ if (copy_to_user(buf, learn_buffer_user, learn_buffer_user_len)) -+ retval = -EFAULT; -+ -+ up(&gr_learn_user_sem); -+out: -+ set_current_state(TASK_RUNNING); -+ remove_wait_queue(&learn_wait, &wait); -+ return retval; -+} -+ -+static unsigned int -+poll_learn(struct file * file, poll_table * wait) -+{ -+ poll_wait(file, &learn_wait, wait); -+ -+ if (learn_buffer_len) -+ return (POLLIN | POLLRDNORM); -+ -+ return 0; -+} -+ -+void -+gr_clear_learn_entries(void) -+{ -+ char *tmp; -+ -+ down(&gr_learn_user_sem); -+ if (learn_buffer != NULL) { -+ spin_lock(&gr_learn_lock); -+ tmp = learn_buffer; -+ learn_buffer = NULL; -+ spin_unlock(&gr_learn_lock); -+ vfree(learn_buffer); -+ } -+ if (learn_buffer_user != NULL) { -+ vfree(learn_buffer_user); -+ learn_buffer_user = NULL; -+ } -+ learn_buffer_len = 0; -+ up(&gr_learn_user_sem); -+ -+ return; -+} -+ -+void -+gr_add_learn_entry(const char *fmt, ...) 
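/*
 * [illustrative sketch, not part of the quoted patch] The comment above
 * explains why gracl_learn.c keeps two buffers: contexts that cannot sleep
 * append under a spinlock held only for the memcpy, while the single
 * grlearn reader drains into a private staging buffer under a semaphore
 * before the slow copy to userspace. A userspace analogue of that scheme,
 * with hypothetical names and pthread mutexes standing in for the
 * spinlock/semaphore pair:
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define BUF_SZ 4096

static char learn_buf[BUF_SZ];  /* shared, guarded by fill_lock */
static size_t learn_len;
static char user_buf[BUF_SZ];   /* reader-private staging buffer */
static pthread_mutex_t fill_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER;

/* fast path: producers that must not block hold fill_lock only for memcpy */
static void add_entry(const char *msg)
{
	size_t n = strlen(msg) + 1;
	pthread_mutex_lock(&fill_lock);
	if (learn_len + n <= BUF_SZ) {
		memcpy(learn_buf + learn_len, msg, n);
		learn_len += n;
	}
	pthread_mutex_unlock(&fill_lock);
}

/* slow path: the reader stages the data, then copies out without fill_lock */
static size_t drain(void)
{
	size_t n;
	pthread_mutex_lock(&drain_lock);
	pthread_mutex_lock(&fill_lock);
	n = learn_len;
	memcpy(user_buf, learn_buf, n);
	learn_len = 0;
	pthread_mutex_unlock(&fill_lock);
	/* expensive hand-off to the consumer happens outside fill_lock */
	fwrite(user_buf, 1, n, stdout);
	pthread_mutex_unlock(&drain_lock);
	return n;
}

int main(void)
{
	add_entry("role:user subject:/bin/sh\n");
	drain();
	return 0;
}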
-+{ -+ va_list args; -+ unsigned int len; -+ -+ if (!gr_learn_attached) -+ return; -+ -+ spin_lock(&gr_learn_lock); -+ -+ /* leave a gap at the end so we know when it's "full" but don't have to -+ compute the exact length of the string we're trying to append -+ */ -+ if (learn_buffer_len > LEARN_BUFFER_SIZE - 16384) { -+ spin_unlock(&gr_learn_lock); -+ wake_up_interruptible(&learn_wait); -+ return; -+ } -+ if (learn_buffer == NULL) { -+ spin_unlock(&gr_learn_lock); -+ return; -+ } -+ -+ va_start(args, fmt); -+ len = vsnprintf(learn_buffer + learn_buffer_len, LEARN_BUFFER_SIZE - learn_buffer_len, fmt, args); -+ va_end(args); -+ -+ learn_buffer_len += len + 1; -+ -+ spin_unlock(&gr_learn_lock); -+ wake_up_interruptible(&learn_wait); -+ -+ return; -+} -+ -+static int -+open_learn(struct inode *inode, struct file *file) -+{ -+ if (file->f_mode & FMODE_READ && gr_learn_attached) -+ return -EBUSY; -+ if (file->f_mode & FMODE_READ) { -+ int retval = 0; -+ down(&gr_learn_user_sem); -+ if (learn_buffer == NULL) -+ learn_buffer = vmalloc(LEARN_BUFFER_SIZE); -+ if (learn_buffer_user == NULL) -+ learn_buffer_user = vmalloc(LEARN_BUFFER_SIZE); -+ if (learn_buffer == NULL) { -+ retval = -ENOMEM; -+ goto out_error; -+ } -+ if (learn_buffer_user == NULL) { -+ retval = -ENOMEM; -+ goto out_error; -+ } -+ learn_buffer_len = 0; -+ learn_buffer_user_len = 0; -+ gr_learn_attached = 1; -+out_error: -+ up(&gr_learn_user_sem); -+ return retval; -+ } -+ return 0; -+} -+ -+static int -+close_learn(struct inode *inode, struct file *file) -+{ -+ char *tmp; -+ -+ if (file->f_mode & FMODE_READ) { -+ down(&gr_learn_user_sem); -+ if (learn_buffer != NULL) { -+ spin_lock(&gr_learn_lock); -+ tmp = learn_buffer; -+ learn_buffer = NULL; -+ spin_unlock(&gr_learn_lock); -+ vfree(tmp); -+ } -+ if (learn_buffer_user != NULL) { -+ vfree(learn_buffer_user); -+ learn_buffer_user = NULL; -+ } -+ learn_buffer_len = 0; -+ learn_buffer_user_len = 0; -+ gr_learn_attached = 0; -+ up(&gr_learn_user_sem); -+ } -+ -+ return 0; -+} -+ -+const struct file_operations grsec_fops = { -+ .read = read_learn, -+ .write = write_grsec_handler, -+ .open = open_learn, -+ .release = close_learn, -+ .poll = poll_learn, -+}; -diff -urNp linux-2.6.31.1/grsecurity/gracl_res.c linux-2.6.31.1/grsecurity/gracl_res.c ---- linux-2.6.31.1/grsecurity/gracl_res.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl_res.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,58 @@ -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/gracl.h> -+#include <linux/grinternal.h> -+ -+static const char *restab_log[] = { -+ [RLIMIT_CPU] = "RLIMIT_CPU", -+ [RLIMIT_FSIZE] = "RLIMIT_FSIZE", -+ [RLIMIT_DATA] = "RLIMIT_DATA", -+ [RLIMIT_STACK] = "RLIMIT_STACK", -+ [RLIMIT_CORE] = "RLIMIT_CORE", -+ [RLIMIT_RSS] = "RLIMIT_RSS", -+ [RLIMIT_NPROC] = "RLIMIT_NPROC", -+ [RLIMIT_NOFILE] = "RLIMIT_NOFILE", -+ [RLIMIT_MEMLOCK] = "RLIMIT_MEMLOCK", -+ [RLIMIT_AS] = "RLIMIT_AS", -+ [RLIMIT_LOCKS] = "RLIMIT_LOCKS", -+ [RLIMIT_SIGPENDING] = "RLIMIT_SIGPENDING", -+ [RLIMIT_MSGQUEUE] = "RLIMIT_MSGQUEUE", -+ [RLIMIT_NICE] = "RLIMIT_NICE", -+ [RLIMIT_RTPRIO] = "RLIMIT_RTPRIO", -+ [RLIMIT_RTTIME] = "RLIMIT_RTTIME", -+ [GR_CRASH_RES] = "RLIMIT_CRASH" -+}; -+ -+void -+gr_log_resource(const struct task_struct *task, -+ const int res, const unsigned long wanted, const int gt) -+{ -+ const struct cred *cred = __task_cred(task); -+ -+ if (res == RLIMIT_NPROC && -+ (cap_raised(cred->cap_effective, CAP_SYS_ADMIN) || -+ cap_raised(cred->cap_effective, CAP_SYS_RESOURCE))) 
-+ return; -+ else if (res == RLIMIT_MEMLOCK && -+ cap_raised(cred->cap_effective, CAP_IPC_LOCK)) -+ return; -+ else if (res == RLIMIT_NICE && cap_raised(cred->cap_effective, CAP_SYS_NICE)) -+ return; -+ -+ if (!gr_acl_is_enabled() && !grsec_resource_logging) -+ return; -+ -+ // not yet supported resource -+ if (!restab_log[res]) -+ return; -+ -+ preempt_disable(); -+ -+ if (unlikely(((gt && wanted > task->signal->rlim[res].rlim_cur) || -+ (!gt && wanted >= task->signal->rlim[res].rlim_cur)) && -+ task->signal->rlim[res].rlim_cur != RLIM_INFINITY)) -+ gr_log_res_ulong2_str(GR_DONT_AUDIT, GR_RESOURCE_MSG, task, wanted, restab_log[res], task->signal->rlim[res].rlim_cur); -+ preempt_enable_no_resched(); -+ -+ return; -+} -diff -urNp linux-2.6.31.1/grsecurity/gracl_segv.c linux-2.6.31.1/grsecurity/gracl_segv.c ---- linux-2.6.31.1/grsecurity/gracl_segv.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/gracl_segv.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,307 @@ -+#include <linux/kernel.h> -+#include <linux/mm.h> -+#include <asm/uaccess.h> -+#include <asm/errno.h> -+#include <asm/mman.h> -+#include <net/sock.h> -+#include <linux/file.h> -+#include <linux/fs.h> -+#include <linux/net.h> -+#include <linux/in.h> -+#include <linux/smp_lock.h> -+#include <linux/slab.h> -+#include <linux/types.h> -+#include <linux/sched.h> -+#include <linux/timer.h> -+#include <linux/gracl.h> -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+ -+static struct crash_uid *uid_set; -+static unsigned short uid_used; -+static DEFINE_SPINLOCK(gr_uid_lock); -+extern rwlock_t gr_inode_lock; -+extern struct acl_subject_label * -+ lookup_acl_subj_label(const ino_t inode, const dev_t dev, -+ struct acl_role_label *role); -+extern int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t); -+ -+int -+gr_init_uidset(void) -+{ -+ uid_set = -+ kmalloc(GR_UIDTABLE_MAX * sizeof (struct crash_uid), GFP_KERNEL); -+ uid_used = 0; -+ -+ return uid_set ? 
1 : 0; -+} -+ -+void -+gr_free_uidset(void) -+{ -+ if (uid_set) -+ kfree(uid_set); -+ -+ return; -+} -+ -+int -+gr_find_uid(const uid_t uid) -+{ -+ struct crash_uid *tmp = uid_set; -+ uid_t buid; -+ int low = 0, high = uid_used - 1, mid; -+ -+ while (high >= low) { -+ mid = (low + high) >> 1; -+ buid = tmp[mid].uid; -+ if (buid == uid) -+ return mid; -+ if (buid > uid) -+ high = mid - 1; -+ if (buid < uid) -+ low = mid + 1; -+ } -+ -+ return -1; -+} -+ -+static __inline__ void -+gr_insertsort(void) -+{ -+ unsigned short i, j; -+ struct crash_uid index; -+ -+ for (i = 1; i < uid_used; i++) { -+ index = uid_set[i]; -+ j = i; -+ while ((j > 0) && uid_set[j - 1].uid > index.uid) { -+ uid_set[j] = uid_set[j - 1]; -+ j--; -+ } -+ uid_set[j] = index; -+ } -+ -+ return; -+} -+ -+static __inline__ void -+gr_insert_uid(const uid_t uid, const unsigned long expires) -+{ -+ int loc; -+ -+ if (uid_used == GR_UIDTABLE_MAX) -+ return; -+ -+ loc = gr_find_uid(uid); -+ -+ if (loc >= 0) { -+ uid_set[loc].expires = expires; -+ return; -+ } -+ -+ uid_set[uid_used].uid = uid; -+ uid_set[uid_used].expires = expires; -+ uid_used++; -+ -+ gr_insertsort(); -+ -+ return; -+} -+ -+void -+gr_remove_uid(const unsigned short loc) -+{ -+ unsigned short i; -+ -+ for (i = loc + 1; i < uid_used; i++) -+ uid_set[i - 1] = uid_set[i]; -+ -+ uid_used--; -+ -+ return; -+} -+ -+int -+gr_check_crash_uid(const uid_t uid) -+{ -+ int loc; -+ int ret = 0; -+ -+ if (unlikely(!gr_acl_is_enabled())) -+ return 0; -+ -+ spin_lock(&gr_uid_lock); -+ loc = gr_find_uid(uid); -+ -+ if (loc < 0) -+ goto out_unlock; -+ -+ if (time_before_eq(uid_set[loc].expires, get_seconds())) -+ gr_remove_uid(loc); -+ else -+ ret = 1; -+ -+out_unlock: -+ spin_unlock(&gr_uid_lock); -+ return ret; -+} -+ -+static __inline__ int -+proc_is_setxid(const struct cred *cred) -+{ -+ if (cred->uid != cred->euid || cred->uid != cred->suid || -+ cred->uid != cred->fsuid) -+ return 1; -+ if (cred->gid != cred->egid || cred->gid != cred->sgid || -+ cred->gid != cred->fsgid) -+ return 1; -+ -+ return 0; -+} -+static __inline__ int -+gr_fake_force_sig(int sig, struct task_struct *t) -+{ -+ unsigned long int flags; -+ int ret, blocked, ignored; -+ struct k_sigaction *action; -+ -+ spin_lock_irqsave(&t->sighand->siglock, flags); -+ action = &t->sighand->action[sig-1]; -+ ignored = action->sa.sa_handler == SIG_IGN; -+ blocked = sigismember(&t->blocked, sig); -+ if (blocked || ignored) { -+ action->sa.sa_handler = SIG_DFL; -+ if (blocked) { -+ sigdelset(&t->blocked, sig); -+ recalc_sigpending_and_wake(t); -+ } -+ } -+ if (action->sa.sa_handler == SIG_DFL) -+ t->signal->flags &= ~SIGNAL_UNKILLABLE; -+ ret = specific_send_sig_info(sig, SEND_SIG_PRIV, t); -+ -+ spin_unlock_irqrestore(&t->sighand->siglock, flags); -+ -+ return ret; -+} -+ -+void -+gr_handle_crash(struct task_struct *task, const int sig) -+{ -+ struct acl_subject_label *curr; -+ struct acl_subject_label *curr2; -+ struct task_struct *tsk, *tsk2; -+ const struct cred *cred = __task_cred(task); -+ const struct cred *cred2; -+ -+ if (sig != SIGSEGV && sig != SIGKILL && sig != SIGBUS && sig != SIGILL) -+ return; -+ -+ if (unlikely(!gr_acl_is_enabled())) -+ return; -+ -+ curr = task->acl; -+ -+ if (!(curr->resmask & (1 << GR_CRASH_RES))) -+ return; -+ -+ if (time_before_eq(curr->expires, get_seconds())) { -+ curr->expires = 0; -+ curr->crashes = 0; -+ } -+ -+ curr->crashes++; -+ -+ if (!curr->expires) -+ curr->expires = get_seconds() + curr->res[GR_CRASH_RES].rlim_max; -+ -+ if ((curr->crashes >= 
-+ time_after(curr->expires, get_seconds())) {
-+ if (cred->uid && proc_is_setxid(cred)) {
-+ gr_log_crash1(GR_DONT_AUDIT, GR_SEGVSTART_ACL_MSG, task, curr->res[GR_CRASH_RES].rlim_max);
-+ spin_lock(&gr_uid_lock);
-+ gr_insert_uid(cred->uid, curr->expires);
-+ spin_unlock(&gr_uid_lock);
-+ curr->expires = 0;
-+ curr->crashes = 0;
-+ read_lock(&tasklist_lock);
-+ do_each_thread(tsk2, tsk) {
-+ cred2 = __task_cred(tsk);
-+ if (tsk != task && cred2->uid == cred->uid)
-+ gr_fake_force_sig(SIGKILL, tsk);
-+ } while_each_thread(tsk2, tsk);
-+ read_unlock(&tasklist_lock);
-+ } else {
-+ gr_log_crash2(GR_DONT_AUDIT, GR_SEGVNOSUID_ACL_MSG, task, curr->res[GR_CRASH_RES].rlim_max);
-+ read_lock(&tasklist_lock);
-+ do_each_thread(tsk2, tsk) {
-+ if (likely(tsk != task)) {
-+ curr2 = tsk->acl;
-+
-+ if (curr2->device == curr->device &&
-+ curr2->inode == curr->inode)
-+ gr_fake_force_sig(SIGKILL, tsk);
-+ }
-+ } while_each_thread(tsk2, tsk);
-+ read_unlock(&tasklist_lock);
-+ }
-+ }
-+
-+ return;
-+}
-+
-+int
-+gr_check_crash_exec(const struct file *filp)
-+{
-+ struct acl_subject_label *curr;
-+
-+ if (unlikely(!gr_acl_is_enabled()))
-+ return 0;
-+
-+ read_lock(&gr_inode_lock);
-+ curr = lookup_acl_subj_label(filp->f_path.dentry->d_inode->i_ino,
-+ filp->f_path.dentry->d_inode->i_sb->s_dev,
-+ current->role);
-+ read_unlock(&gr_inode_lock);
-+
-+ if (!curr || !(curr->resmask & (1 << GR_CRASH_RES)) ||
-+ (!curr->crashes && !curr->expires))
-+ return 0;
-+
-+ if ((curr->crashes >= curr->res[GR_CRASH_RES].rlim_cur) &&
-+ time_after(curr->expires, get_seconds()))
-+ return 1;
-+ else if (time_before_eq(curr->expires, get_seconds())) {
-+ curr->crashes = 0;
-+ curr->expires = 0;
-+ }
-+
-+ return 0;
-+}
-+
-+void
-+gr_handle_alertkill(struct task_struct *task)
-+{
-+ struct acl_subject_label *curracl;
-+ __u32 curr_ip;
-+ struct task_struct *p, *p2;
-+
-+ if (unlikely(!gr_acl_is_enabled()))
-+ return;
-+
-+ curracl = task->acl;
-+ curr_ip = task->signal->curr_ip;
-+
-+ if ((curracl->mode & GR_KILLIPPROC) && curr_ip) {
-+ read_lock(&tasklist_lock);
-+ do_each_thread(p2, p) {
-+ if (p->signal->curr_ip == curr_ip)
-+ gr_fake_force_sig(SIGKILL, p);
-+ } while_each_thread(p2, p);
-+ read_unlock(&tasklist_lock);
-+ } else if (curracl->mode & GR_KILLPROC)
-+ gr_fake_force_sig(SIGKILL, task);
-+
-+ return;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/gracl_shm.c linux-2.6.31.1/grsecurity/gracl_shm.c
---- linux-2.6.31.1/grsecurity/gracl_shm.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/gracl_shm.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,37 @@
-+#include <linux/kernel.h>
-+#include <linux/mm.h>
-+#include <linux/sched.h>
-+#include <linux/file.h>
-+#include <linux/ipc.h>
-+#include <linux/gracl.h>
-+#include <linux/grsecurity.h>
-+#include <linux/grinternal.h>
-+
-+int
-+gr_handle_shmat(const pid_t shm_cprid, const pid_t shm_lapid,
-+ const time_t shm_createtime, const uid_t cuid, const int shmid)
-+{
-+ struct task_struct *task;
-+
-+ if (!gr_acl_is_enabled())
-+ return 1;
-+
-+ read_lock(&tasklist_lock);
-+
-+ task = find_task_by_vpid(shm_cprid);
-+
-+ if (unlikely(!task))
-+ task = find_task_by_vpid(shm_lapid);
-+
-+ if (unlikely(task && (time_before_eq((unsigned long)task->start_time.tv_sec, (unsigned long)shm_createtime) ||
-+ (task->pid == shm_lapid)) &&
-+ (task->acl->mode & GR_PROTSHM) &&
-+ (task->acl != current->acl))) {
-+ read_unlock(&tasklist_lock);
-+ gr_log_int3(GR_DONT_AUDIT, GR_SHMAT_ACL_MSG, cuid, shm_cprid, shmid);
-+ return 0;
-+ }
-+ read_unlock(&tasklist_lock);
-+
-+ return 1;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_chdir.c linux-2.6.31.1/grsecurity/grsec_chdir.c
---- linux-2.6.31.1/grsecurity/grsec_chdir.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_chdir.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,19 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/file.h>
-+#include <linux/grsecurity.h>
-+#include <linux/grinternal.h>
-+
-+void
-+gr_log_chdir(const struct dentry *dentry, const struct vfsmount *mnt)
-+{
-+#ifdef CONFIG_GRKERNSEC_AUDIT_CHDIR
-+ if ((grsec_enable_chdir && grsec_enable_group &&
-+ in_group_p(grsec_audit_gid)) || (grsec_enable_chdir &&
-+ !grsec_enable_group)) {
-+ gr_log_fs_generic(GR_DO_AUDIT, GR_CHDIR_AUDIT_MSG, dentry, mnt);
-+ }
-+#endif
-+ return;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_chroot.c linux-2.6.31.1/grsecurity/grsec_chroot.c
---- linux-2.6.31.1/grsecurity/grsec_chroot.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_chroot.c 2009-10-01 21:52:18.000000000 -0400
-@@ -0,0 +1,348 @@
-+#include <linux/kernel.h>
-+#include <linux/module.h>
-+#include <linux/sched.h>
-+#include <linux/file.h>
-+#include <linux/fs.h>
-+#include <linux/mount.h>
-+#include <linux/types.h>
-+#include <linux/pid_namespace.h>
-+#include <linux/grsecurity.h>
-+#include <linux/grinternal.h>
-+
-+int
-+gr_handle_chroot_unix(const pid_t pid)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX
-+ struct pid *spid = NULL;
-+
-+ if (unlikely(!grsec_enable_chroot_unix))
-+ return 1;
-+
-+ if (likely(!proc_is_chrooted(current)))
-+ return 1;
-+
-+ read_lock(&tasklist_lock);
-+
-+ spid = find_vpid(pid);
-+ if (spid) {
-+ struct task_struct *p;
-+ p = pid_task(spid, PIDTYPE_PID);
-+ task_lock(p);
-+ if (unlikely(!have_same_root(current, p))) {
-+ task_unlock(p);
-+ read_unlock(&tasklist_lock);
-+ gr_log_noargs(GR_DONT_AUDIT, GR_UNIX_CHROOT_MSG);
-+ return 0;
-+ }
-+ task_unlock(p);
-+ }
-+ read_unlock(&tasklist_lock);
-+#endif
-+ return 1;
-+}
-+
-+int
-+gr_handle_chroot_nice(void)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_NICE
-+ if (grsec_enable_chroot_nice && proc_is_chrooted(current)) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_NICE_CHROOT_MSG);
-+ return -EPERM;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_chroot_setpriority(struct task_struct *p, const int niceval)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_NICE
-+ if (grsec_enable_chroot_nice && (niceval < task_nice(p))
-+ && proc_is_chrooted(current)) {
-+ gr_log_str_int(GR_DONT_AUDIT, GR_PRIORITY_CHROOT_MSG, p->comm, p->pid);
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_chroot_rawio(const struct inode *inode)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS
-+ if (grsec_enable_chroot_caps && proc_is_chrooted(current) &&
-+ inode && S_ISBLK(inode->i_mode) && !capable(CAP_SYS_RAWIO))
-+ return 1;
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_pid_is_chrooted(struct task_struct *p)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_FINDTASK
-+ if (!grsec_enable_chroot_findtask || !proc_is_chrooted(current) || p == NULL)
-+ return 0;
-+
-+ task_lock(p);
-+ if ((p->exit_state & (EXIT_ZOMBIE | EXIT_DEAD)) ||
-+ !have_same_root(current, p)) {
-+ task_unlock(p);
-+ return 1;
-+ }
-+ task_unlock(p);
-+#endif
-+ return 0;
-+}
-+
-+EXPORT_SYMBOL(gr_pid_is_chrooted);
-+
-+#if defined(CONFIG_GRKERNSEC_CHROOT_DOUBLE) || defined(CONFIG_GRKERNSEC_CHROOT_FCHDIR)
-+int gr_is_outside_chroot(const struct dentry *u_dentry, const struct vfsmount *u_mnt)
-+{
-+ struct dentry *dentry = (struct dentry *)u_dentry;
-+ struct vfsmount *mnt = (struct vfsmount *)u_mnt;
-+ struct dentry *realroot;
-+ struct vfsmount *realrootmnt;
-+ struct dentry *currentroot;
-+ struct vfsmount *currentmnt;
-+ struct task_struct *reaper = &init_task;
-+ int ret = 1;
-+
-+ read_lock(&reaper->fs->lock);
-+ realrootmnt = mntget(reaper->fs->root.mnt);
-+ realroot = dget(reaper->fs->root.dentry);
-+ read_unlock(&reaper->fs->lock);
-+
-+ read_lock(&current->fs->lock);
-+ currentmnt = mntget(current->fs->root.mnt);
-+ currentroot = dget(current->fs->root.dentry);
-+ read_unlock(&current->fs->lock);
-+
-+ spin_lock(&dcache_lock);
-+ for (;;) {
-+ if (unlikely((dentry == realroot && mnt == realrootmnt)
-+ || (dentry == currentroot && mnt == currentmnt)))
-+ break;
-+ if (unlikely(dentry == mnt->mnt_root || IS_ROOT(dentry))) {
-+ if (mnt->mnt_parent == mnt)
-+ break;
-+ dentry = mnt->mnt_mountpoint;
-+ mnt = mnt->mnt_parent;
-+ continue;
-+ }
-+ dentry = dentry->d_parent;
-+ }
-+ spin_unlock(&dcache_lock);
-+
-+ dput(currentroot);
-+ mntput(currentmnt);
-+
-+ /* access is outside of chroot */
-+ if (dentry == realroot && mnt == realrootmnt)
-+ ret = 0;
-+
-+ dput(realroot);
-+ mntput(realrootmnt);
-+ return ret;
-+}
-+#endif
-+
-+int
-+gr_chroot_fchdir(struct dentry *u_dentry, struct vfsmount *u_mnt)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_FCHDIR
-+ if (!grsec_enable_chroot_fchdir)
-+ return 1;
-+
-+ if (!proc_is_chrooted(current))
-+ return 1;
-+ else if (!gr_is_outside_chroot(u_dentry, u_mnt)) {
-+ gr_log_fs_generic(GR_DONT_AUDIT, GR_CHROOT_FCHDIR_MSG, u_dentry, u_mnt);
-+ return 0;
-+ }
-+#endif
-+ return 1;
-+}
-+
-+int
-+gr_chroot_shmat(const pid_t shm_cprid, const pid_t shm_lapid,
-+ const time_t shm_createtime)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_SHMAT
-+ struct pid *pid = NULL;
-+ time_t starttime;
-+
-+ if (unlikely(!grsec_enable_chroot_shmat))
-+ return 1;
-+
-+ if (likely(!proc_is_chrooted(current)))
-+ return 1;
-+
-+ read_lock(&tasklist_lock);
-+
-+ pid = find_vpid(shm_cprid);
-+ if (pid) {
-+ struct task_struct *p;
-+ p = pid_task(pid, PIDTYPE_PID);
-+ task_lock(p);
-+ starttime = p->start_time.tv_sec;
-+ if (unlikely(!have_same_root(current, p) &&
-+ time_before_eq((unsigned long)starttime, (unsigned long)shm_createtime))) {
-+ task_unlock(p);
-+ read_unlock(&tasklist_lock);
-+ gr_log_noargs(GR_DONT_AUDIT, GR_SHMAT_CHROOT_MSG);
-+ return 0;
-+ }
-+ task_unlock(p);
-+ } else {
-+ pid = find_vpid(shm_lapid);
-+ if (pid) {
-+ struct task_struct *p;
-+ p = pid_task(pid, PIDTYPE_PID);
-+ task_lock(p);
-+ if (unlikely(!have_same_root(current, p))) {
-+ task_unlock(p);
-+ read_unlock(&tasklist_lock);
-+ gr_log_noargs(GR_DONT_AUDIT, GR_SHMAT_CHROOT_MSG);
-+ return 0;
-+ }
-+ task_unlock(p);
-+ }
-+ }
-+
-+ read_unlock(&tasklist_lock);
-+#endif
-+ return 1;
-+}
-+
-+void
-+gr_log_chroot_exec(const struct dentry *dentry, const struct vfsmount *mnt)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_EXECLOG
-+ if (grsec_enable_chroot_execlog && proc_is_chrooted(current))
-+ gr_log_fs_generic(GR_DO_AUDIT, GR_EXEC_CHROOT_MSG, dentry, mnt);
-+#endif
-+ return;
-+}
-+
-+int
-+gr_handle_chroot_mknod(const struct dentry *dentry,
-+ const struct vfsmount *mnt, const int mode)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_MKNOD
-+ if (grsec_enable_chroot_mknod && !S_ISFIFO(mode) && !S_ISREG(mode) &&
-+ proc_is_chrooted(current)) {
-+ gr_log_fs_generic(GR_DONT_AUDIT, GR_MKNOD_CHROOT_MSG, dentry, mnt);
-+ return -EPERM;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_chroot_mount(const struct dentry *dentry,
-+ const struct vfsmount *mnt, const char *dev_name)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_MOUNT
-+ if (grsec_enable_chroot_mount && proc_is_chrooted(current)) {
-+ gr_log_str_fs(GR_DONT_AUDIT, GR_MOUNT_CHROOT_MSG, dev_name, dentry, mnt);
-+ return -EPERM;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_chroot_pivot(void)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_PIVOT
-+ if (grsec_enable_chroot_pivot && proc_is_chrooted(current)) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_PIVOT_CHROOT_MSG);
-+ return -EPERM;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_chroot_chroot(const struct dentry *dentry, const struct vfsmount *mnt)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_DOUBLE
-+ if (grsec_enable_chroot_double && proc_is_chrooted(current) &&
-+ !gr_is_outside_chroot(dentry, mnt)) {
-+ gr_log_fs_generic(GR_DONT_AUDIT, GR_CHROOT_CHROOT_MSG, dentry, mnt);
-+ return -EPERM;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_chroot_caps(struct path *path)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS
-+ if (grsec_enable_chroot_caps && current->pid > 1 && current->fs != NULL &&
-+ (init_task.fs->root.dentry != path->dentry) &&
-+ (current->nsproxy->mnt_ns->root->mnt_root != path->dentry)) {
-+
-+ kernel_cap_t chroot_caps = GR_CHROOT_CAPS;
-+ const struct cred *old = current_cred();
-+ struct cred *new = prepare_creds();
-+ if (new == NULL)
-+ return 1;
-+
-+ new->cap_permitted = cap_drop(old->cap_permitted,
-+ chroot_caps);
-+ new->cap_inheritable = cap_drop(old->cap_inheritable,
-+ chroot_caps);
-+ new->cap_effective = cap_drop(old->cap_effective,
-+ chroot_caps);
-+
-+ commit_creds(new);
-+
-+ return 0;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_chroot_sysctl(const int op)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_SYSCTL
-+ if (grsec_enable_chroot_sysctl && proc_is_chrooted(current)
-+ && (op & MAY_WRITE))
-+ return -EACCES;
-+#endif
-+ return 0;
-+}
-+
-+void
-+gr_handle_chroot_chdir(struct path *path)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_CHDIR
-+ if (grsec_enable_chroot_chdir)
-+ set_fs_pwd(current->fs, path);
-+#endif
-+ return;
-+}
-+
-+int
-+gr_handle_chroot_chmod(const struct dentry *dentry,
-+ const struct vfsmount *mnt, const int mode)
-+{
-+#ifdef CONFIG_GRKERNSEC_CHROOT_CHMOD
-+ if (grsec_enable_chroot_chmod &&
-+ ((mode & S_ISUID) || ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) &&
-+ proc_is_chrooted(current)) {
-+ gr_log_fs_generic(GR_DONT_AUDIT, GR_CHMOD_CHROOT_MSG, dentry, mnt);
-+ return -EPERM;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+#ifdef CONFIG_SECURITY
-+EXPORT_SYMBOL(gr_handle_chroot_caps);
-+#endif
-diff -urNp linux-2.6.31.1/grsecurity/grsec_disabled.c linux-2.6.31.1/grsecurity/grsec_disabled.c
---- linux-2.6.31.1/grsecurity/grsec_disabled.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_disabled.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,426 @@
-+#include <linux/kernel.h>
-+#include <linux/module.h>
-+#include <linux/sched.h>
-+#include <linux/file.h>
-+#include <linux/fs.h>
-+#include <linux/kdev_t.h>
-+#include <linux/net.h>
-+#include <linux/in.h>
-+#include <linux/ip.h>
-+#include <linux/skbuff.h>
-+#include <linux/sysctl.h>
-+
-+#ifdef CONFIG_PAX_HAVE_ACL_FLAGS
-+void
-+pax_set_initial_flags(struct linux_binprm *bprm)
-+{
-+ return;
-+}
-+#endif
-+
-+#ifdef CONFIG_SYSCTL
-+__u32
-+gr_handle_sysctl(const struct ctl_table * table, const int op)
-+{
-+ return 0;
-+}
-+#endif
-+
-+#ifdef CONFIG_TASKSTATS
-+int gr_is_taskstats_denied(int pid)
-+{
-+ return 0;
-+}
-+#endif
-+
-+int
-+gr_acl_is_enabled(void)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_handle_rawio(const struct inode *inode)
-+{
-+ return 0;
-+}
-+
-+void
-+gr_acl_handle_psacct(struct task_struct *task, const long code)
-+{
-+ return;
-+}
-+
-+int
-+gr_handle_ptrace(struct task_struct *task, const long request)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_handle_proc_ptrace(struct task_struct *task)
-+{
-+ return 0;
-+}
-+
-+void
-+gr_learn_resource(const struct task_struct *task,
-+ const int res, const unsigned long wanted, const int gt)
-+{
-+ return;
-+}
-+
-+int
-+gr_set_acls(const int type)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_check_hidden_task(const struct task_struct *tsk)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_check_protected_task(const struct task_struct *task)
-+{
-+ return 0;
-+}
-+
-+void
-+gr_copy_label(struct task_struct *tsk)
-+{
-+ return;
-+}
-+
-+void
-+gr_set_pax_flags(struct task_struct *task)
-+{
-+ return;
-+}
-+
-+int
-+gr_set_proc_label(const struct dentry *dentry, const struct vfsmount *mnt,
-+ const int unsafe_share)
-+{
-+ return 0;
-+}
-+
-+void
-+gr_handle_delete(const ino_t ino, const dev_t dev)
-+{
-+ return;
-+}
-+
-+void
-+gr_handle_create(const struct dentry *dentry, const struct vfsmount *mnt)
-+{
-+ return;
-+}
-+
-+void
-+gr_handle_crash(struct task_struct *task, const int sig)
-+{
-+ return;
-+}
-+
-+int
-+gr_check_crash_exec(const struct file *filp)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_check_crash_uid(const uid_t uid)
-+{
-+ return 0;
-+}
-+
-+void
-+gr_handle_rename(struct inode *old_dir, struct inode *new_dir,
-+ struct dentry *old_dentry,
-+ struct dentry *new_dentry,
-+ struct vfsmount *mnt, const __u8 replace)
-+{
-+ return;
-+}
-+
-+int
-+gr_search_socket(const int family, const int type, const int protocol)
-+{
-+ return 1;
-+}
-+
-+int
-+gr_search_connectbind(const int mode, const struct socket *sock,
-+ const struct sockaddr_in *addr)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_is_capable(const int cap)
-+{
-+ return 1;
-+}
-+
-+int
-+gr_is_capable_nolog(const int cap)
-+{
-+ return 1;
-+}
-+
-+void
-+gr_handle_alertkill(struct task_struct *task)
-+{
-+ return;
-+}
-+
-+__u32
-+gr_acl_handle_execve(const struct dentry * dentry, const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_hidden_file(const struct dentry * dentry,
-+ const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_open(const struct dentry * dentry, const struct vfsmount * mnt,
-+ const int fmode)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_rmdir(const struct dentry * dentry, const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_unlink(const struct dentry * dentry, const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+int
-+gr_acl_handle_mmap(const struct file *file, const unsigned long prot,
-+ unsigned int *vm_flags)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_truncate(const struct dentry * dentry,
-+ const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_utime(const struct dentry * dentry, const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_access(const struct dentry * dentry,
-+ const struct vfsmount * mnt, const int fmode)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_fchmod(const struct dentry * dentry, const struct vfsmount * mnt,
-+ mode_t mode)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_chmod(const struct dentry * dentry, const struct vfsmount * mnt,
-+ mode_t mode)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_chown(const struct dentry * dentry, const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+void
-+grsecurity_init(void)
-+{
-+ return;
-+}
-+
-+__u32
-+gr_acl_handle_mknod(const struct dentry * new_dentry,
-+ const struct dentry * parent_dentry,
-+ const struct vfsmount * parent_mnt,
-+ const int mode)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_mkdir(const struct dentry * new_dentry,
-+ const struct dentry * parent_dentry,
-+ const struct vfsmount * parent_mnt)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_symlink(const struct dentry * new_dentry,
-+ const struct dentry * parent_dentry,
-+ const struct vfsmount * parent_mnt, const char *from)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_link(const struct dentry * new_dentry,
-+ const struct dentry * parent_dentry,
-+ const struct vfsmount * parent_mnt,
-+ const struct dentry * old_dentry,
-+ const struct vfsmount * old_mnt, const char *to)
-+{
-+ return 1;
-+}
-+
-+int
-+gr_acl_handle_rename(const struct dentry *new_dentry,
-+ const struct dentry *parent_dentry,
-+ const struct vfsmount *parent_mnt,
-+ const struct dentry *old_dentry,
-+ const struct inode *old_parent_inode,
-+ const struct vfsmount *old_mnt, const char *newname)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_acl_handle_filldir(const struct file *file, const char *name,
-+ const int namelen, const ino_t ino)
-+{
-+ return 1;
-+}
-+
-+int
-+gr_handle_shmat(const pid_t shm_cprid, const pid_t shm_lapid,
-+ const time_t shm_createtime, const uid_t cuid, const int shmid)
-+{
-+ return 1;
-+}
-+
-+int
-+gr_search_bind(const struct socket *sock, const struct sockaddr_in *addr)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_search_accept(const struct socket *sock)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_search_listen(const struct socket *sock)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_search_connect(const struct socket *sock, const struct sockaddr_in *addr)
-+{
-+ return 0;
-+}
-+
-+__u32
-+gr_acl_handle_unix(const struct dentry * dentry, const struct vfsmount * mnt)
-+{
-+ return 1;
-+}
-+
-+__u32
-+gr_acl_handle_creat(const struct dentry * dentry,
-+ const struct dentry * p_dentry,
-+ const struct vfsmount * p_mnt, const int fmode,
-+ const int imode)
-+{
-+ return 1;
-+}
-+
-+void
-+gr_acl_handle_exit(void)
-+{
-+ return;
-+}
-+
-+int
-+gr_acl_handle_mprotect(const struct file *file, const unsigned long prot)
-+{
-+ return 1;
-+}
-+
-+void
-+gr_set_role_label(const uid_t uid, const gid_t gid)
-+{
-+ return;
-+}
-+
-+int
-+gr_acl_handle_procpidmem(const struct task_struct *task)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_search_udp_recvmsg(const struct sock *sk, const struct sk_buff *skb)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_search_udp_sendmsg(const struct sock *sk, const struct sockaddr_in *addr)
-+{
-+ return 0;
-+}
-+
-+void
-+gr_set_kernel_label(struct task_struct *task)
-+{
-+ return;
-+}
-+
-+int
-+gr_check_user_change(int real, int effective, int fs)
-+{
-+ return 0;
-+}
-+
-+int
-+gr_check_group_change(int real, int effective, int fs)
-+{
-+ return 0;
-+}
-+
-+
-+EXPORT_SYMBOL(gr_is_capable);
-+EXPORT_SYMBOL(gr_is_capable_nolog);
-+EXPORT_SYMBOL(gr_learn_resource);
-+EXPORT_SYMBOL(gr_set_kernel_label);
-+#ifdef CONFIG_SECURITY
-+EXPORT_SYMBOL(gr_check_user_change);
-+EXPORT_SYMBOL(gr_check_group_change);
-+#endif
-diff -urNp linux-2.6.31.1/grsecurity/grsec_exec.c linux-2.6.31.1/grsecurity/grsec_exec.c
---- linux-2.6.31.1/grsecurity/grsec_exec.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_exec.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,89 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/file.h>
-+#include <linux/binfmts.h>
-+#include <linux/smp_lock.h>
-+#include <linux/fs.h>
-+#include <linux/types.h>
-+#include <linux/grdefs.h>
-+#include <linux/grinternal.h>
-+#include <linux/capability.h>
-+
-+#include <asm/uaccess.h>
-+
-+#ifdef CONFIG_GRKERNSEC_EXECLOG
-+static char gr_exec_arg_buf[132];
-+static DECLARE_MUTEX(gr_exec_arg_sem);
-+#endif
-+
-+int
-+gr_handle_nproc(void)
-+{
-+#ifdef CONFIG_GRKERNSEC_EXECVE
-+ const struct cred *cred = current_cred();
-+ if (grsec_enable_execve && cred->user &&
-+ (atomic_read(&cred->user->processes) >
-+ current->signal->rlim[RLIMIT_NPROC].rlim_cur) &&
-+ !capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE)) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_NPROC_MSG);
-+ return -EAGAIN;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+void
-+gr_handle_exec_args(struct linux_binprm *bprm, const char __user *__user *argv)
-+{
-+#ifdef CONFIG_GRKERNSEC_EXECLOG
-+ char *grarg = gr_exec_arg_buf;
-+ unsigned int i, x, execlen = 0;
-+ char c;
-+
-+ if (!((grsec_enable_execlog && grsec_enable_group &&
-+ in_group_p(grsec_audit_gid))
-+ || (grsec_enable_execlog && !grsec_enable_group)))
-+ return;
-+
-+ down(&gr_exec_arg_sem);
-+ memset(grarg, 0, sizeof(gr_exec_arg_buf));
-+
-+ if (unlikely(argv == NULL))
-+ goto log;
-+
-+ for (i = 0; i < bprm->argc && execlen < 128; i++) {
-+ const char __user *p;
-+ unsigned int len;
-+
-+ if (copy_from_user(&p, argv + i, sizeof(p)))
-+ goto log;
-+ if (!p)
-+ goto log;
-+ len = strnlen_user(p, 128 - execlen);
-+ if (len > 128 - execlen)
-+ len = 128 - execlen;
-+ else if (len > 0)
-+ len--;
-+ if (copy_from_user(grarg + execlen, p, len))
-+ goto log;
-+
-+ /* rewrite unprintable characters */
-+ for (x = 0; x < len; x++) {
-+ c = *(grarg + execlen + x);
-+ if (c < 32 || c > 126)
-+ *(grarg + execlen + x) = ' ';
-+ }
-+
-+ execlen += len;
-+ *(grarg + execlen) = ' ';
-+ *(grarg + execlen + 1) = '\0';
-+ execlen++;
-+ }
-+
-+ log:
-+ gr_log_fs_str(GR_DO_AUDIT, GR_EXEC_AUDIT_MSG, bprm->file->f_path.dentry,
-+ bprm->file->f_path.mnt, grarg);
-+ up(&gr_exec_arg_sem);
-+#endif
-+ return;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_fifo.c linux-2.6.31.1/grsecurity/grsec_fifo.c
---- linux-2.6.31.1/grsecurity/grsec_fifo.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_fifo.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,24 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/file.h>
-+#include <linux/grinternal.h>
-+
-+int
-+gr_handle_fifo(const struct dentry *dentry, const struct vfsmount *mnt,
-+ const struct dentry *dir, const int flag, const int acc_mode)
-+{
-+#ifdef CONFIG_GRKERNSEC_FIFO
-+ const struct cred *cred = current_cred();
-+
-+ if (grsec_enable_fifo && S_ISFIFO(dentry->d_inode->i_mode) &&
-+ !(flag & O_EXCL) && (dir->d_inode->i_mode & S_ISVTX) &&
-+ (dentry->d_inode->i_uid != dir->d_inode->i_uid) &&
-+ (cred->fsuid != dentry->d_inode->i_uid)) {
-+ if (!generic_permission(dentry->d_inode, acc_mode, NULL))
-+ gr_log_fs_int2(GR_DONT_AUDIT, GR_FIFO_MSG, dentry, mnt, dentry->d_inode->i_uid, dentry->d_inode->i_gid);
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_fork.c linux-2.6.31.1/grsecurity/grsec_fork.c
---- linux-2.6.31.1/grsecurity/grsec_fork.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_fork.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,15 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/grsecurity.h>
-+#include <linux/grinternal.h>
-+#include <linux/errno.h>
-+
-+void
-+gr_log_forkfail(const int retval)
-+{
-+#ifdef CONFIG_GRKERNSEC_FORKFAIL
-+ if (grsec_enable_forkfail && retval != -ERESTARTNOINTR)
-+ gr_log_int(GR_DONT_AUDIT, GR_FAILFORK_MSG, retval);
-+#endif
-+ return;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_init.c linux-2.6.31.1/grsecurity/grsec_init.c
---- linux-2.6.31.1/grsecurity/grsec_init.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_init.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,230 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/mm.h>
-+#include <linux/smp_lock.h>
-+#include <linux/gracl.h>
-+#include <linux/slab.h>
-+#include <linux/vmalloc.h>
-+#include <linux/percpu.h>
-+
-+int grsec_enable_link;
-+int grsec_enable_dmesg;
-+int grsec_enable_harden_ptrace;
-+int grsec_enable_fifo;
-+int grsec_enable_execve;
-+int grsec_enable_execlog;
-+int grsec_enable_signal;
-+int grsec_enable_forkfail;
-+int grsec_enable_time;
-+int grsec_enable_audit_textrel;
-+int grsec_enable_group;
-+int grsec_audit_gid;
-+int grsec_enable_chdir;
-+int grsec_enable_mount;
-+int grsec_enable_chroot_findtask;
-+int grsec_enable_chroot_mount;
-+int grsec_enable_chroot_shmat;
-+int grsec_enable_chroot_fchdir;
-+int grsec_enable_chroot_double;
-+int grsec_enable_chroot_pivot;
-+int grsec_enable_chroot_chdir;
-+int grsec_enable_chroot_chmod;
-+int grsec_enable_chroot_mknod;
-+int grsec_enable_chroot_nice;
-+int grsec_enable_chroot_execlog;
-+int grsec_enable_chroot_caps;
-+int grsec_enable_chroot_sysctl;
-+int grsec_enable_chroot_unix;
-+int grsec_enable_tpe;
-+int grsec_tpe_gid;
-+int grsec_enable_tpe_all;
-+int grsec_enable_socket_all;
-+int grsec_socket_all_gid;
-+int grsec_enable_socket_client;
-+int grsec_socket_client_gid;
-+int grsec_enable_socket_server;
-+int grsec_socket_server_gid;
-+int grsec_resource_logging;
-+int grsec_lock;
-+
-+DEFINE_SPINLOCK(grsec_alert_lock);
-+unsigned long grsec_alert_wtime = 0;
-+unsigned long grsec_alert_fyet = 0;
-+
-+DEFINE_SPINLOCK(grsec_audit_lock);
-+
-+DEFINE_RWLOCK(grsec_exec_file_lock);
-+
-+char *gr_shared_page[4];
-+
-+char *gr_alert_log_fmt;
-+char *gr_audit_log_fmt;
-+char *gr_alert_log_buf;
-+char *gr_audit_log_buf;
-+
-+extern struct gr_arg *gr_usermode;
-+extern unsigned char *gr_system_salt;
-+extern unsigned char *gr_system_sum;
-+
-+void __init
-+grsecurity_init(void)
-+{
-+ int j;
-+ /* create the per-cpu shared pages */
-+
-+#ifdef CONFIG_X86
-+ memset((char *)(0x41a + PAGE_OFFSET), 0, 36);
-+#endif
-+
-+ for (j = 0; j < 4; j++) {
-+ gr_shared_page[j] = (char *)__alloc_percpu(PAGE_SIZE, __alignof__(unsigned long long));
-+ if (gr_shared_page[j] == NULL) {
-+ panic("Unable to allocate grsecurity shared page");
-+ return;
-+ }
-+ }
-+
-+ /* allocate log buffers */
-+ gr_alert_log_fmt = kmalloc(512, GFP_KERNEL);
-+ if (!gr_alert_log_fmt) {
-+ panic("Unable to allocate grsecurity alert log format buffer");
-+ return;
-+ }
-+ gr_audit_log_fmt = kmalloc(512, GFP_KERNEL);
-+ if (!gr_audit_log_fmt) {
-+ panic("Unable to allocate grsecurity audit log format buffer");
-+ return;
-+ }
-+ gr_alert_log_buf = (char *) get_zeroed_page(GFP_KERNEL);
-+ if (!gr_alert_log_buf) {
-+ panic("Unable to allocate grsecurity alert log buffer");
-+ return;
-+ }
-+ gr_audit_log_buf = (char *) get_zeroed_page(GFP_KERNEL);
-+ if (!gr_audit_log_buf) {
-+ panic("Unable to allocate grsecurity audit log buffer");
-+ return;
-+ }
-+
-+ /* allocate memory for authentication structure */
-+ gr_usermode = kmalloc(sizeof(struct gr_arg), GFP_KERNEL);
-+ gr_system_salt = kmalloc(GR_SALT_LEN, GFP_KERNEL);
-+ gr_system_sum = kmalloc(GR_SHA_LEN, GFP_KERNEL);
-+
-+ if (!gr_usermode || !gr_system_salt || !gr_system_sum) {
-+ panic("Unable to allocate grsecurity authentication structure");
-+ return;
-+ }
-+
-+#if !defined(CONFIG_GRKERNSEC_SYSCTL) || defined(CONFIG_GRKERNSEC_SYSCTL_ON)
-+#ifndef CONFIG_GRKERNSEC_SYSCTL
-+ grsec_lock = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_AUDIT_TEXTREL
-+ grsec_enable_audit_textrel = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_AUDIT_GROUP
-+ grsec_enable_group = 1;
-+ grsec_audit_gid = CONFIG_GRKERNSEC_AUDIT_GID;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_AUDIT_CHDIR
-+ grsec_enable_chdir = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE
-+ grsec_enable_harden_ptrace = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT
-+ grsec_enable_mount = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_LINK
-+ grsec_enable_link = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_DMESG
-+ grsec_enable_dmesg = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_FIFO
-+ grsec_enable_fifo = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_EXECVE
-+ grsec_enable_execve = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_EXECLOG
-+ grsec_enable_execlog = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_SIGNAL
-+ grsec_enable_signal = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_FORKFAIL
-+ grsec_enable_forkfail = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_TIME
-+ grsec_enable_time = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_RESLOG
-+ grsec_resource_logging = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_FINDTASK
-+ grsec_enable_chroot_findtask = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX
-+ grsec_enable_chroot_unix = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_MOUNT
-+ grsec_enable_chroot_mount = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_FCHDIR
-+ grsec_enable_chroot_fchdir = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_SHMAT
-+ grsec_enable_chroot_shmat = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_DOUBLE
-+ grsec_enable_chroot_double = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_PIVOT
-+ grsec_enable_chroot_pivot = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_CHDIR
-+ grsec_enable_chroot_chdir = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_CHMOD
-+ grsec_enable_chroot_chmod = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_MKNOD
-+ grsec_enable_chroot_mknod = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_NICE
-+ grsec_enable_chroot_nice = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_EXECLOG
-+ grsec_enable_chroot_execlog = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS
-+ grsec_enable_chroot_caps = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_CHROOT_SYSCTL
-+ grsec_enable_chroot_sysctl = 1;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_TPE
-+ grsec_enable_tpe = 1;
-+ grsec_tpe_gid = CONFIG_GRKERNSEC_TPE_GID;
-+#ifdef CONFIG_GRKERNSEC_TPE_ALL
-+ grsec_enable_tpe_all = 1;
-+#endif
-+#endif
-+#ifdef CONFIG_GRKERNSEC_SOCKET_ALL
-+ grsec_enable_socket_all = 1;
-+ grsec_socket_all_gid = CONFIG_GRKERNSEC_SOCKET_ALL_GID;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_SOCKET_CLIENT
-+ grsec_enable_socket_client = 1;
-+ grsec_socket_client_gid = CONFIG_GRKERNSEC_SOCKET_CLIENT_GID;
-+#endif
-+#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER
-+ grsec_enable_socket_server = 1;
-+ grsec_socket_server_gid = CONFIG_GRKERNSEC_SOCKET_SERVER_GID;
-+#endif
-+#endif
-+
-+ return;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_link.c linux-2.6.31.1/grsecurity/grsec_link.c
---- linux-2.6.31.1/grsecurity/grsec_link.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_link.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,43 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/file.h>
-+#include <linux/grinternal.h>
-+
-+int
-+gr_handle_follow_link(const struct inode *parent,
-+ const struct inode *inode,
-+ const struct dentry *dentry, const struct vfsmount *mnt)
-+{
-+#ifdef CONFIG_GRKERNSEC_LINK
-+ const struct cred *cred = current_cred();
-+
-+ if (grsec_enable_link && S_ISLNK(inode->i_mode) &&
-+ (parent->i_mode & S_ISVTX) && (parent->i_uid != inode->i_uid) &&
-+ (parent->i_mode & S_IWOTH) && (cred->fsuid != inode->i_uid)) {
-+ gr_log_fs_int2(GR_DONT_AUDIT, GR_SYMLINK_MSG, dentry, mnt, inode->i_uid, inode->i_gid);
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_hardlink(const struct dentry *dentry,
-+ const struct vfsmount *mnt,
-+ struct inode *inode, const int mode, const char *to)
-+{
-+#ifdef CONFIG_GRKERNSEC_LINK
-+ const struct cred *cred = current_cred();
-+
-+ if (grsec_enable_link && cred->fsuid != inode->i_uid &&
-+ (!S_ISREG(mode) || (mode & S_ISUID) ||
-+ ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) ||
-+ (generic_permission(inode, MAY_READ | MAY_WRITE, NULL))) &&
-+ !capable(CAP_FOWNER) && cred->uid) {
-+ gr_log_fs_int2_str(GR_DONT_AUDIT, GR_HARDLINK_MSG, dentry, mnt, inode->i_uid, inode->i_gid, to);
-+ return -EPERM;
-+ }
-+#endif
-+ return 0;
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_log.c linux-2.6.31.1/grsecurity/grsec_log.c
---- linux-2.6.31.1/grsecurity/grsec_log.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_log.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,294 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/file.h>
-+#include <linux/tty.h>
-+#include <linux/fs.h>
-+#include <linux/grinternal.h>
-+
-+#define BEGIN_LOCKS(x) \
-+ read_lock(&tasklist_lock); \
-+ read_lock(&grsec_exec_file_lock); \
-+ if (x != GR_DO_AUDIT) \
-+ spin_lock(&grsec_alert_lock); \
-+ else \
-+ spin_lock(&grsec_audit_lock)
-+
-+#define END_LOCKS(x) \
-+ if (x != GR_DO_AUDIT) \
-+ spin_unlock(&grsec_alert_lock); \
-+ else \
-+ spin_unlock(&grsec_audit_lock); \
-+ read_unlock(&grsec_exec_file_lock); \
-+ read_unlock(&tasklist_lock); \
-+ if (x == GR_DONT_AUDIT) \
-+ gr_handle_alertkill(current)
-+
-+enum {
-+ FLOODING,
-+ NO_FLOODING
-+};
-+
-+extern char *gr_alert_log_fmt;
-+extern char *gr_audit_log_fmt;
-+extern char *gr_alert_log_buf;
-+extern char *gr_audit_log_buf;
-+
-+static int gr_log_start(int audit)
-+{
-+ char *loglevel = (audit == GR_DO_AUDIT) ? KERN_INFO : KERN_ALERT;
-+ char *fmt = (audit == GR_DO_AUDIT) ? gr_audit_log_fmt : gr_alert_log_fmt;
-+ char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf;
-+
-+ if (audit == GR_DO_AUDIT)
-+ goto set_fmt;
-+
-+ if (!grsec_alert_wtime || jiffies - grsec_alert_wtime > CONFIG_GRKERNSEC_FLOODTIME * HZ) {
-+ grsec_alert_wtime = jiffies;
-+ grsec_alert_fyet = 0;
-+ } else if ((jiffies - grsec_alert_wtime < CONFIG_GRKERNSEC_FLOODTIME * HZ) && (grsec_alert_fyet < CONFIG_GRKERNSEC_FLOODBURST)) {
-+ grsec_alert_fyet++;
-+ } else if (grsec_alert_fyet == CONFIG_GRKERNSEC_FLOODBURST) {
-+ grsec_alert_wtime = jiffies;
-+ grsec_alert_fyet++;
-+ printk(KERN_ALERT "grsec: more alerts, logging disabled for %d seconds\n", CONFIG_GRKERNSEC_FLOODTIME);
-+ return FLOODING;
-+ } else return FLOODING;
-+
-+set_fmt:
-+ memset(buf, 0, PAGE_SIZE);
-+ if (current->signal->curr_ip && gr_acl_is_enabled()) {
-+ sprintf(fmt, "%s%s", loglevel, "grsec: From %u.%u.%u.%u: (%.64s:%c:%.950s) ");
-+ snprintf(buf, PAGE_SIZE - 1, fmt, NIPQUAD(current->signal->curr_ip), current->role->rolename, gr_roletype_to_char(), current->acl->filename);
-+ } else if (current->signal->curr_ip) {
-+ sprintf(fmt, "%s%s", loglevel, "grsec: From %u.%u.%u.%u: ");
-+ snprintf(buf, PAGE_SIZE - 1, fmt, NIPQUAD(current->signal->curr_ip));
-+ } else if (gr_acl_is_enabled()) {
-+ sprintf(fmt, "%s%s", loglevel, "grsec: (%.64s:%c:%.950s) ");
-+ snprintf(buf, PAGE_SIZE - 1, fmt, current->role->rolename, gr_roletype_to_char(), current->acl->filename);
-+ } else {
-+ sprintf(fmt, "%s%s", loglevel, "grsec: ");
-+ strcpy(buf, fmt);
-+ }
-+
-+ return NO_FLOODING;
-+}
-+
-+static void gr_log_middle(int audit, const char *msg, va_list ap)
-+ __attribute__ ((format (printf, 2, 0)));
-+
-+static void gr_log_middle(int audit, const char *msg, va_list ap)
-+{
-+ char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf;
-+ unsigned int len = strlen(buf);
-+
-+ vsnprintf(buf + len, PAGE_SIZE - len - 1, msg, ap);
-+
-+ return;
-+}
-+
-+static void gr_log_middle_varargs(int audit, const char *msg, ...)
-+ __attribute__ ((format (printf, 2, 3)));
-+
-+static void gr_log_middle_varargs(int audit, const char *msg, ...)
-+{
-+ char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf;
-+ unsigned int len = strlen(buf);
-+ va_list ap;
-+
-+ va_start(ap, msg);
-+ vsnprintf(buf + len, PAGE_SIZE - len - 1, msg, ap);
-+ va_end(ap);
-+
-+ return;
-+}
-+
-+static void gr_log_end(int audit)
-+{
-+ char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf;
-+ unsigned int len = strlen(buf);
-+
-+ snprintf(buf + len, PAGE_SIZE - len - 1, DEFAULTSECMSG, DEFAULTSECARGS(current, current_cred(), __task_cred(current->parent)));
-+ printk("%s\n", buf);
-+
-+ return;
-+}
-+
-+void gr_log_varargs(int audit, const char *msg, int argtypes, ...)
-+{
-+ int logtype;
-+ char *result = (audit == GR_DO_AUDIT) ? "successful" : "denied";
"successful" : "denied"; -+ char *str1, *str2, *str3; -+ void *voidptr; -+ int num1, num2; -+ unsigned long ulong1, ulong2; -+ struct dentry *dentry; -+ struct vfsmount *mnt; -+ struct file *file; -+ struct task_struct *task; -+ const struct cred *cred, *pcred; -+ va_list ap; -+ -+ BEGIN_LOCKS(audit); -+ logtype = gr_log_start(audit); -+ if (logtype == FLOODING) { -+ END_LOCKS(audit); -+ return; -+ } -+ va_start(ap, argtypes); -+ switch (argtypes) { -+ case GR_TTYSNIFF: -+ task = va_arg(ap, struct task_struct *); -+ gr_log_middle_varargs(audit, msg, NIPQUAD(task->signal->curr_ip), gr_task_fullpath0(task), task->comm, task->pid, gr_parent_task_fullpath0(task), task->parent->comm, task->parent->pid); -+ break; -+ case GR_SYSCTL_HIDDEN: -+ str1 = va_arg(ap, char *); -+ gr_log_middle_varargs(audit, msg, result, str1); -+ break; -+ case GR_RBAC: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt)); -+ break; -+ case GR_RBAC_STR: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ str1 = va_arg(ap, char *); -+ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt), str1); -+ break; -+ case GR_STR_RBAC: -+ str1 = va_arg(ap, char *); -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ gr_log_middle_varargs(audit, msg, result, str1, gr_to_filename(dentry, mnt)); -+ break; -+ case GR_RBAC_MODE2: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ str1 = va_arg(ap, char *); -+ str2 = va_arg(ap, char *); -+ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt), str1, str2); -+ break; -+ case GR_RBAC_MODE3: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ str1 = va_arg(ap, char *); -+ str2 = va_arg(ap, char *); -+ str3 = va_arg(ap, char *); -+ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt), str1, str2, str3); -+ break; -+ case GR_FILENAME: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt)); -+ break; -+ case GR_STR_FILENAME: -+ str1 = va_arg(ap, char *); -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ gr_log_middle_varargs(audit, msg, str1, gr_to_filename(dentry, mnt)); -+ break; -+ case GR_FILENAME_STR: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ str1 = va_arg(ap, char *); -+ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt), str1); -+ break; -+ case GR_FILENAME_TWO_INT: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ num1 = va_arg(ap, int); -+ num2 = va_arg(ap, int); -+ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt), num1, num2); -+ break; -+ case GR_FILENAME_TWO_INT_STR: -+ dentry = va_arg(ap, struct dentry *); -+ mnt = va_arg(ap, struct vfsmount *); -+ num1 = va_arg(ap, int); -+ num2 = va_arg(ap, int); -+ str1 = va_arg(ap, char *); -+ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt), num1, num2, str1); -+ break; -+ case GR_TEXTREL: -+ file = va_arg(ap, struct file *); -+ ulong1 = va_arg(ap, unsigned long); -+ ulong2 = va_arg(ap, unsigned long); -+ gr_log_middle_varargs(audit, msg, file ? 
-+ break;
-+ case GR_PTRACE:
-+ task = va_arg(ap, struct task_struct *);
-+ gr_log_middle_varargs(audit, msg, task->exec_file ? gr_to_filename(task->exec_file->f_path.dentry, task->exec_file->f_path.mnt) : "(none)", task->comm, task->pid);
-+ break;
-+ case GR_RESOURCE:
-+ task = va_arg(ap, struct task_struct *);
-+ cred = __task_cred(task);
-+ pcred = __task_cred(task->parent);
-+ ulong1 = va_arg(ap, unsigned long);
-+ str1 = va_arg(ap, char *);
-+ ulong2 = va_arg(ap, unsigned long);
-+ gr_log_middle_varargs(audit, msg, ulong1, str1, ulong2, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid);
-+ break;
-+ case GR_CAP:
-+ task = va_arg(ap, struct task_struct *);
-+ cred = __task_cred(task);
-+ pcred = __task_cred(task->parent);
-+ str1 = va_arg(ap, char *);
-+ gr_log_middle_varargs(audit, msg, str1, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid);
-+ break;
-+ case GR_SIG:
-+ str1 = va_arg(ap, char *);
-+ voidptr = va_arg(ap, void *);
-+ gr_log_middle_varargs(audit, msg, str1, voidptr);
-+ break;
-+ case GR_SIG2:
-+ task = va_arg(ap, struct task_struct *);
-+ cred = __task_cred(task);
-+ pcred = __task_cred(task->parent);
-+ num1 = va_arg(ap, int);
-+ gr_log_middle_varargs(audit, msg, num1, gr_task_fullpath0(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath0(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid);
-+ break;
-+ case GR_CRASH1:
-+ task = va_arg(ap, struct task_struct *);
-+ cred = __task_cred(task);
-+ pcred = __task_cred(task->parent);
-+ ulong1 = va_arg(ap, unsigned long);
-+ gr_log_middle_varargs(audit, msg, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid, cred->uid, ulong1);
-+ break;
-+ case GR_CRASH2:
-+ task = va_arg(ap, struct task_struct *);
-+ cred = __task_cred(task);
-+ pcred = __task_cred(task->parent);
-+ ulong1 = va_arg(ap, unsigned long);
-+ gr_log_middle_varargs(audit, msg, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid, ulong1);
-+ break;
-+ case GR_PSACCT:
-+ {
-+ unsigned int wday, cday;
-+ __u8 whr, chr;
-+ __u8 wmin, cmin;
-+ __u8 wsec, csec;
-+ char cur_tty[64] = { 0 };
-+ char parent_tty[64] = { 0 };
-+
-+ task = va_arg(ap, struct task_struct *);
-+ wday = va_arg(ap, unsigned int);
-+ cday = va_arg(ap, unsigned int);
-+ whr = va_arg(ap, int);
-+ chr = va_arg(ap, int);
-+ wmin = va_arg(ap, int);
-+ cmin = va_arg(ap, int);
-+ wsec = va_arg(ap, int);
-+ csec = va_arg(ap, int);
-+ ulong1 = va_arg(ap, unsigned long);
-+ cred = __task_cred(task);
-+ pcred = __task_cred(task->parent);
-+
-+ gr_log_middle_varargs(audit, msg, gr_task_fullpath(task), task->comm, task->pid, NIPQUAD(task->signal->curr_ip), tty_name(task->signal->tty, cur_tty), cred->uid, cred->euid, cred->gid, cred->egid, wday, whr, wmin, wsec, cday, chr, cmin, csec, (task->flags & PF_SIGNALED) ? "killed by signal" : "exited", ulong1, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, NIPQUAD(task->parent->signal->curr_ip), tty_name(task->parent->signal->tty, parent_tty), pcred->uid, pcred->euid, pcred->gid, pcred->egid);
-+ }
-+ break;
-+ default:
-+ gr_log_middle(audit, msg, ap);
-+ }
-+ va_end(ap);
-+ gr_log_end(audit);
-+ END_LOCKS(audit);
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_mem.c linux-2.6.31.1/grsecurity/grsec_mem.c
---- linux-2.6.31.1/grsecurity/grsec_mem.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_mem.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,79 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/mm.h>
-+#include <linux/mman.h>
-+#include <linux/grinternal.h>
-+
-+void
-+gr_handle_ioperm(void)
-+{
-+ gr_log_noargs(GR_DONT_AUDIT, GR_IOPERM_MSG);
-+ return;
-+}
-+
-+void
-+gr_handle_iopl(void)
-+{
-+ gr_log_noargs(GR_DONT_AUDIT, GR_IOPL_MSG);
-+ return;
-+}
-+
-+void
-+gr_handle_mem_write(void)
-+{
-+ gr_log_noargs(GR_DONT_AUDIT, GR_MEM_WRITE_MSG);
-+ return;
-+}
-+
-+void
-+gr_handle_kmem_write(void)
-+{
-+ gr_log_noargs(GR_DONT_AUDIT, GR_KMEM_MSG);
-+ return;
-+}
-+
-+void
-+gr_handle_open_port(void)
-+{
-+ gr_log_noargs(GR_DONT_AUDIT, GR_PORT_OPEN_MSG);
-+ return;
-+}
-+
-+int
-+gr_handle_mem_mmap(const unsigned long offset, struct vm_area_struct *vma)
-+{
-+ unsigned long start, end;
-+
-+ start = offset;
-+ end = start + vma->vm_end - vma->vm_start;
-+
-+ if (start > end) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_MEM_MMAP_MSG);
-+ return -EPERM;
-+ }
-+
-+ /* allowed ranges : ISA I/O BIOS */
-+ if ((start >= __pa(high_memory))
-+#if defined(CONFIG_X86) || defined(CONFIG_PPC)
-+ || (start >= 0x000a0000 && end <= 0x00100000)
-+ || (start >= 0x00000000 && end <= 0x00001000)
-+#endif
-+ )
-+ return 0;
-+
-+ if (vma->vm_flags & VM_WRITE) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_MEM_MMAP_MSG);
-+ return -EPERM;
-+ } else
-+ vma->vm_flags &= ~VM_MAYWRITE;
-+
-+ return 0;
-+}
-+
-+void
-+gr_log_nonroot_mod_load(const char *modname)
-+{
-+ gr_log_str(GR_DONT_AUDIT, GR_NONROOT_MODLOAD_MSG, modname);
-+ return;
-+}
-+
-diff -urNp linux-2.6.31.1/grsecurity/grsec_mount.c linux-2.6.31.1/grsecurity/grsec_mount.c
---- linux-2.6.31.1/grsecurity/grsec_mount.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_mount.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,34 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/grsecurity.h>
-+#include <linux/grinternal.h>
-+
-+void
-+gr_log_remount(const char *devname, const int retval)
-+{
-+#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT
-+ if (grsec_enable_mount && (retval >= 0))
-+ gr_log_str(GR_DO_AUDIT, GR_REMOUNT_AUDIT_MSG, devname ? devname : "none");
-+#endif
-+ return;
-+}
-+
-+void
-+gr_log_unmount(const char *devname, const int retval)
-+{
-+#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT
-+ if (grsec_enable_mount && (retval >= 0))
-+ gr_log_str(GR_DO_AUDIT, GR_UNMOUNT_AUDIT_MSG, devname ? devname : "none");
devname : "none"); -+#endif -+ return; -+} -+ -+void -+gr_log_mount(const char *from, const char *to, const int retval) -+{ -+#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT -+ if (grsec_enable_mount && (retval >= 0)) -+ gr_log_str_str(GR_DO_AUDIT, GR_MOUNT_AUDIT_MSG, from, to); -+#endif -+ return; -+} -diff -urNp linux-2.6.31.1/grsecurity/grsec_sig.c linux-2.6.31.1/grsecurity/grsec_sig.c ---- linux-2.6.31.1/grsecurity/grsec_sig.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/grsec_sig.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,65 @@ -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/delay.h> -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+ -+char *signames[] = { -+ [SIGSEGV] = "Segmentation fault", -+ [SIGILL] = "Illegal instruction", -+ [SIGABRT] = "Abort", -+ [SIGBUS] = "Invalid alignment/Bus error" -+}; -+ -+void -+gr_log_signal(const int sig, const void *addr, const struct task_struct *t) -+{ -+#ifdef CONFIG_GRKERNSEC_SIGNAL -+ if (grsec_enable_signal && ((sig == SIGSEGV) || (sig == SIGILL) || -+ (sig == SIGABRT) || (sig == SIGBUS))) { -+ if (t->pid == current->pid) { -+ gr_log_sig_addr(GR_DONT_AUDIT_GOOD, GR_UNISIGLOG_MSG, signames[sig], addr); -+ } else { -+ gr_log_sig_task(GR_DONT_AUDIT_GOOD, GR_DUALSIGLOG_MSG, t, sig); -+ } -+ } -+#endif -+ return; -+} -+ -+int -+gr_handle_signal(const struct task_struct *p, const int sig) -+{ -+#ifdef CONFIG_GRKERNSEC -+ if (current->pid > 1 && gr_check_protected_task(p)) { -+ gr_log_sig_task(GR_DONT_AUDIT, GR_SIG_ACL_MSG, p, sig); -+ return -EPERM; -+ } else if (gr_pid_is_chrooted((struct task_struct *)p)) { -+ return -EPERM; -+ } -+#endif -+ return 0; -+} -+ -+void gr_handle_brute_attach(struct task_struct *p) -+{ -+#ifdef CONFIG_GRKERNSEC_BRUTE -+ read_lock(&tasklist_lock); -+ read_lock(&grsec_exec_file_lock); -+ if (p->parent && p->parent->exec_file == p->exec_file) -+ p->parent->brute = 1; -+ read_unlock(&grsec_exec_file_lock); -+ read_unlock(&tasklist_lock); -+#endif -+ return; -+} -+ -+void gr_handle_brute_check(void) -+{ -+#ifdef CONFIG_GRKERNSEC_BRUTE -+ if (current->brute) -+ msleep(30 * 1000); -+#endif -+ return; -+} -+ -diff -urNp linux-2.6.31.1/grsecurity/grsec_sock.c linux-2.6.31.1/grsecurity/grsec_sock.c ---- linux-2.6.31.1/grsecurity/grsec_sock.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/grsec_sock.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,269 @@ -+#include <linux/kernel.h> -+#include <linux/module.h> -+#include <linux/sched.h> -+#include <linux/file.h> -+#include <linux/net.h> -+#include <linux/in.h> -+#include <linux/ip.h> -+#include <net/sock.h> -+#include <net/inet_sock.h> -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+#include <linux/gracl.h> -+ -+kernel_cap_t gr_cap_rtnetlink(struct sock *sock); -+EXPORT_SYMBOL(gr_cap_rtnetlink); -+ -+extern int gr_search_udp_recvmsg(const struct sock *sk, const struct sk_buff *skb); -+extern int gr_search_udp_sendmsg(const struct sock *sk, const struct sockaddr_in *addr); -+ -+EXPORT_SYMBOL(gr_search_udp_recvmsg); -+EXPORT_SYMBOL(gr_search_udp_sendmsg); -+ -+#ifdef CONFIG_UNIX_MODULE -+EXPORT_SYMBOL(gr_acl_handle_unix); -+EXPORT_SYMBOL(gr_acl_handle_mknod); -+EXPORT_SYMBOL(gr_handle_chroot_unix); -+EXPORT_SYMBOL(gr_handle_create); -+#endif -+ -+#ifdef CONFIG_GRKERNSEC -+#define gr_conn_table_size 32749 -+struct conn_table_entry { -+ struct conn_table_entry *next; -+ struct signal_struct *sig; -+}; -+ -+struct conn_table_entry *gr_conn_table[gr_conn_table_size]; 
-+DEFINE_SPINLOCK(gr_conn_table_lock);
-+
-+extern const char * gr_socktype_to_name(unsigned char type);
-+extern const char * gr_proto_to_name(unsigned char proto);
-+
-+static __inline__ int
-+conn_hash(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport, unsigned int size)
-+{
-+ return ((daddr + saddr + (sport << 8) + (dport << 16)) % size);
-+}
-+
-+static __inline__ int
-+conn_match(const struct signal_struct *sig, __u32 saddr, __u32 daddr,
-+ __u16 sport, __u16 dport)
-+{
-+ if (unlikely(sig->gr_saddr == saddr && sig->gr_daddr == daddr &&
-+ sig->gr_sport == sport && sig->gr_dport == dport))
-+ return 1;
-+ else
-+ return 0;
-+}
-+
-+static void gr_add_to_task_ip_table_nolock(struct signal_struct *sig, struct conn_table_entry *newent)
-+{
-+ struct conn_table_entry **match;
-+ unsigned int index;
-+
-+ index = conn_hash(sig->gr_saddr, sig->gr_daddr,
-+ sig->gr_sport, sig->gr_dport,
-+ gr_conn_table_size);
-+
-+ newent->sig = sig;
-+
-+ match = &gr_conn_table[index];
-+ newent->next = *match;
-+ *match = newent;
-+
-+ return;
-+}
-+
-+static void gr_del_task_from_ip_table_nolock(struct signal_struct *sig)
-+{
-+ struct conn_table_entry *match, *last = NULL;
-+ unsigned int index;
-+
-+ index = conn_hash(sig->gr_saddr, sig->gr_daddr,
-+ sig->gr_sport, sig->gr_dport,
-+ gr_conn_table_size);
-+
-+ match = gr_conn_table[index];
-+ while (match && !conn_match(match->sig,
-+ sig->gr_saddr, sig->gr_daddr, sig->gr_sport,
-+ sig->gr_dport)) {
-+ last = match;
-+ match = match->next;
-+ }
-+
-+ if (match) {
-+ if (last)
-+ last->next = match->next;
-+ else
-+ gr_conn_table[index] = NULL;
-+ kfree(match);
-+ }
-+
-+ return;
-+}
-+
-+static struct signal_struct * gr_lookup_task_ip_table(__u32 saddr, __u32 daddr,
-+ __u16 sport, __u16 dport)
-+{
-+ struct conn_table_entry *match;
-+ unsigned int index;
-+
-+ index = conn_hash(saddr, daddr, sport, dport, gr_conn_table_size);
-+
-+ match = gr_conn_table[index];
-+ while (match && !conn_match(match->sig, saddr, daddr, sport, dport))
-+ match = match->next;
-+
-+ if (match)
-+ return match->sig;
-+ else
-+ return NULL;
-+}
-+
-+#endif
-+
-+void gr_update_task_in_ip_table(struct task_struct *task, const struct inet_sock *inet)
-+{
-+#ifdef CONFIG_GRKERNSEC
-+ struct signal_struct *sig = task->signal;
-+ struct conn_table_entry *newent;
-+
-+ newent = kmalloc(sizeof(struct conn_table_entry), GFP_ATOMIC);
-+ if (newent == NULL)
-+ return;
-+ /* no bh lock needed since we are called with bh disabled */
-+ spin_lock(&gr_conn_table_lock);
-+ gr_del_task_from_ip_table_nolock(sig);
-+ sig->gr_saddr = inet->rcv_saddr;
-+ sig->gr_daddr = inet->daddr;
-+ sig->gr_sport = inet->sport;
-+ sig->gr_dport = inet->dport;
-+ gr_add_to_task_ip_table_nolock(sig, newent);
-+ spin_unlock(&gr_conn_table_lock);
-+#endif
-+ return;
-+}
-+
-+void gr_del_task_from_ip_table(struct task_struct *task)
-+{
-+#ifdef CONFIG_GRKERNSEC
-+ spin_lock_bh(&gr_conn_table_lock);
-+ gr_del_task_from_ip_table_nolock(task->signal);
-+ spin_unlock_bh(&gr_conn_table_lock);
-+#endif
-+ return;
-+}
-+
-+void
-+gr_attach_curr_ip(const struct sock *sk)
-+{
-+#ifdef CONFIG_GRKERNSEC
-+ struct signal_struct *p, *set;
-+ const struct inet_sock *inet = inet_sk(sk);
-+
-+ if (unlikely(sk->sk_protocol != IPPROTO_TCP))
-+ return;
-+
-+ set = current->signal;
-+
-+ spin_lock_bh(&gr_conn_table_lock);
-+ p = gr_lookup_task_ip_table(inet->daddr, inet->rcv_saddr,
-+ inet->dport, inet->sport);
-+ if (unlikely(p != NULL)) {
-+ set->curr_ip = p->curr_ip;
-+ set->used_accept = 1;
-+ gr_del_task_from_ip_table_nolock(p);
-+ spin_unlock_bh(&gr_conn_table_lock);
-+ return;
-+ }
-+ spin_unlock_bh(&gr_conn_table_lock);
-+
-+ set->curr_ip = inet->daddr;
-+ set->used_accept = 1;
-+#endif
-+ return;
-+}
-+
-+int
-+gr_handle_sock_all(const int family, const int type, const int protocol)
-+{
-+#ifdef CONFIG_GRKERNSEC_SOCKET_ALL
-+ if (grsec_enable_socket_all && in_group_p(grsec_socket_all_gid) &&
-+ (family != AF_UNIX) && (family != AF_LOCAL)) {
-+ gr_log_int_str2(GR_DONT_AUDIT, GR_SOCK2_MSG, family, gr_socktype_to_name(type), gr_proto_to_name(protocol));
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_sock_server(const struct sockaddr *sck)
-+{
-+#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER
-+ if (grsec_enable_socket_server &&
-+ in_group_p(grsec_socket_server_gid) &&
-+ sck && (sck->sa_family != AF_UNIX) &&
-+ (sck->sa_family != AF_LOCAL)) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_BIND_MSG);
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_sock_server_other(const struct sock *sck)
-+{
-+#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER
-+ if (grsec_enable_socket_server &&
-+ in_group_p(grsec_socket_server_gid) &&
-+ sck && (sck->sk_family != AF_UNIX) &&
-+ (sck->sk_family != AF_LOCAL)) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_BIND_MSG);
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+int
-+gr_handle_sock_client(const struct sockaddr *sck)
-+{
-+#ifdef CONFIG_GRKERNSEC_SOCKET_CLIENT
-+ if (grsec_enable_socket_client && in_group_p(grsec_socket_client_gid) &&
-+ sck && (sck->sa_family != AF_UNIX) &&
-+ (sck->sa_family != AF_LOCAL)) {
-+ gr_log_noargs(GR_DONT_AUDIT, GR_CONNECT_MSG);
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+kernel_cap_t
-+gr_cap_rtnetlink(struct sock *sock)
-+{
-+#ifdef CONFIG_GRKERNSEC
-+ if (!gr_acl_is_enabled())
-+ return current_cap();
-+ else if (sock->sk_protocol == NETLINK_ISCSI &&
-+ cap_raised(current_cap(), CAP_SYS_ADMIN) &&
-+ gr_is_capable(CAP_SYS_ADMIN))
-+ return current_cap();
-+ else if (sock->sk_protocol == NETLINK_AUDIT &&
-+ cap_raised(current_cap(), CAP_AUDIT_WRITE) &&
-+ gr_is_capable(CAP_AUDIT_WRITE) &&
-+ cap_raised(current_cap(), CAP_AUDIT_CONTROL) &&
-+ gr_is_capable(CAP_AUDIT_CONTROL))
-+ return current_cap();
-+ else if (cap_raised(current_cap(), CAP_NET_ADMIN) &&
-+ gr_is_capable(CAP_NET_ADMIN))
-+ return current_cap();
-+ else
-+ return __cap_empty_set;
-+#else
-+ return current_cap();
-+#endif
-+}
-diff -urNp linux-2.6.31.1/grsecurity/grsec_sysctl.c linux-2.6.31.1/grsecurity/grsec_sysctl.c
---- linux-2.6.31.1/grsecurity/grsec_sysctl.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/grsecurity/grsec_sysctl.c 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,403 @@
-+#include <linux/kernel.h>
-+#include <linux/sched.h>
-+#include <linux/sysctl.h>
-+#include <linux/grsecurity.h>
-+#include <linux/grinternal.h>
-+
-+int
-+gr_handle_sysctl_mod(const char *dirname, const char *name, const int op)
-+{
-+#ifdef CONFIG_GRKERNSEC_SYSCTL
-+ if (!strcmp(dirname, "grsecurity") && grsec_lock && (op & MAY_WRITE)) {
-+ gr_log_str(GR_DONT_AUDIT, GR_SYSCTL_MSG, name);
-+ return -EACCES;
-+ }
-+#endif
-+ return 0;
-+}
-+
-+#if defined(CONFIG_GRKERNSEC_SYSCTL)
-+ctl_table grsecurity_table[] = {
-+#ifdef CONFIG_GRKERNSEC_SYSCTL
-+#ifdef CONFIG_GRKERNSEC_LINK
-+ {
-+ .ctl_name = CTL_UNNUMBERED,
-+ .procname = "linking_restrictions",
-+ .data = &grsec_enable_link,
-+ .maxlen = sizeof(int),
-+ .mode = 0600,
-+ .proc_handler = &proc_dointvec,
-+ },
-+#endif
-+#ifdef CONFIG_GRKERNSEC_FIFO
-+ {
-+ .ctl_name = CTL_UNNUMBERED,
CTL_UNNUMBERED, -+ .procname = "fifo_restrictions", -+ .data = &grsec_enable_fifo, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_EXECVE -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "execve_limiting", -+ .data = &grsec_enable_execve, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_EXECLOG -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "exec_logging", -+ .data = &grsec_enable_execlog, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_SIGNAL -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "signal_logging", -+ .data = &grsec_enable_signal, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_FORKFAIL -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "forkfail_logging", -+ .data = &grsec_enable_forkfail, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_TIME -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "timechange_logging", -+ .data = &grsec_enable_time, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_SHMAT -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_shmat", -+ .data = &grsec_enable_chroot_shmat, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_unix", -+ .data = &grsec_enable_chroot_unix, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_MOUNT -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_mount", -+ .data = &grsec_enable_chroot_mount, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_FCHDIR -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_fchdir", -+ .data = &grsec_enable_chroot_fchdir, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_DOUBLE -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_chroot", -+ .data = &grsec_enable_chroot_double, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_PIVOT -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_pivot", -+ .data = &grsec_enable_chroot_pivot, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_CHDIR -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_enforce_chdir", -+ .data = &grsec_enable_chroot_chdir, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_CHMOD -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_chmod", -+ .data = &grsec_enable_chroot_chmod, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_MKNOD -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_mknod", -+ .data = &grsec_enable_chroot_mknod, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_NICE 
-+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_restrict_nice", -+ .data = &grsec_enable_chroot_nice, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_EXECLOG -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_execlog", -+ .data = &grsec_enable_chroot_execlog, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_caps", -+ .data = &grsec_enable_chroot_caps, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_SYSCTL -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_deny_sysctl", -+ .data = &grsec_enable_chroot_sysctl, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_TPE -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "tpe", -+ .data = &grsec_enable_tpe, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "tpe_gid", -+ .data = &grsec_tpe_gid, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_TPE_ALL -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "tpe_restrict_all", -+ .data = &grsec_enable_tpe_all, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_SOCKET_ALL -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "socket_all", -+ .data = &grsec_enable_socket_all, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "socket_all_gid", -+ .data = &grsec_socket_all_gid, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_SOCKET_CLIENT -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "socket_client", -+ .data = &grsec_enable_socket_client, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "socket_client_gid", -+ .data = &grsec_socket_client_gid, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "socket_server", -+ .data = &grsec_enable_socket_server, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "socket_server_gid", -+ .data = &grsec_socket_server_gid, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_AUDIT_GROUP -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "audit_group", -+ .data = &grsec_enable_group, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "audit_gid", -+ .data = &grsec_audit_gid, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_AUDIT_CHDIR -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "audit_chdir", -+ .data = &grsec_enable_chdir, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ 
.procname = "audit_mount", -+ .data = &grsec_enable_mount, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_AUDIT_TEXTREL -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "audit_textrel", -+ .data = &grsec_enable_audit_textrel, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_DMESG -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "dmesg", -+ .data = &grsec_enable_dmesg, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_CHROOT_FINDTASK -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "chroot_findtask", -+ .data = &grsec_enable_chroot_findtask, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_RESLOG -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "resource_logging", -+ .data = &grsec_resource_logging, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "harden_ptrace", -+ .data = &grsec_enable_harden_ptrace, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "grsec_lock", -+ .data = &grsec_lock, -+ .maxlen = sizeof(int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif -+ { .ctl_name = 0 } -+}; -+#endif -diff -urNp linux-2.6.31.1/grsecurity/grsec_textrel.c linux-2.6.31.1/grsecurity/grsec_textrel.c ---- linux-2.6.31.1/grsecurity/grsec_textrel.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/grsec_textrel.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,16 @@ -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/mm.h> -+#include <linux/file.h> -+#include <linux/grinternal.h> -+#include <linux/grsecurity.h> -+ -+void -+gr_log_textrel(struct vm_area_struct * vma) -+{ -+#ifdef CONFIG_GRKERNSEC_AUDIT_TEXTREL -+ if (grsec_enable_audit_textrel) -+ gr_log_textrel_ulong_ulong(GR_DO_AUDIT, GR_TEXTREL_AUDIT_MSG, vma->vm_file, vma->vm_start, vma->vm_pgoff); -+#endif -+ return; -+} -diff -urNp linux-2.6.31.1/grsecurity/grsec_time.c linux-2.6.31.1/grsecurity/grsec_time.c ---- linux-2.6.31.1/grsecurity/grsec_time.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/grsec_time.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,13 @@ -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/grinternal.h> -+ -+void -+gr_log_timechange(void) -+{ -+#ifdef CONFIG_GRKERNSEC_TIME -+ if (grsec_enable_time) -+ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_TIME_MSG); -+#endif -+ return; -+} -diff -urNp linux-2.6.31.1/grsecurity/grsec_tpe.c linux-2.6.31.1/grsecurity/grsec_tpe.c ---- linux-2.6.31.1/grsecurity/grsec_tpe.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/grsec_tpe.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,38 @@ -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/file.h> -+#include <linux/fs.h> -+#include <linux/grinternal.h> -+ -+extern int gr_acl_tpe_check(void); -+ -+int -+gr_tpe_allow(const struct file *file) -+{ -+#ifdef CONFIG_GRKERNSEC -+ struct inode *inode = file->f_path.dentry->d_parent->d_inode; -+ const struct cred *cred = current_cred(); -+ -+ if (cred->uid && ((grsec_enable_tpe && -+#ifdef CONFIG_GRKERNSEC_TPE_INVERT -+ !in_group_p(grsec_tpe_gid) -+#else -+ 
in_group_p(grsec_tpe_gid) -+#endif -+ ) || gr_acl_tpe_check()) && -+ (inode->i_uid || (!inode->i_uid && ((inode->i_mode & S_IWGRP) || -+ (inode->i_mode & S_IWOTH))))) { -+ gr_log_fs_generic(GR_DONT_AUDIT, GR_EXEC_TPE_MSG, file->f_path.dentry, file->f_path.mnt); -+ return 0; -+ } -+#ifdef CONFIG_GRKERNSEC_TPE_ALL -+ if (cred->uid && grsec_enable_tpe && grsec_enable_tpe_all && -+ ((inode->i_uid && (inode->i_uid != cred->uid)) || -+ (inode->i_mode & S_IWGRP) || (inode->i_mode & S_IWOTH))) { -+ gr_log_fs_generic(GR_DONT_AUDIT, GR_EXEC_TPE_MSG, file->f_path.dentry, file->f_path.mnt); -+ return 0; -+ } -+#endif -+#endif -+ return 1; -+} -diff -urNp linux-2.6.31.1/grsecurity/grsum.c linux-2.6.31.1/grsecurity/grsum.c ---- linux-2.6.31.1/grsecurity/grsum.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/grsum.c 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,59 @@ -+#include <linux/err.h> -+#include <linux/kernel.h> -+#include <linux/sched.h> -+#include <linux/mm.h> -+#include <linux/scatterlist.h> -+#include <linux/crypto.h> -+#include <linux/gracl.h> -+ -+ -+#if !defined(CONFIG_CRYPTO) || defined(CONFIG_CRYPTO_MODULE) || !defined(CONFIG_CRYPTO_SHA256) || defined(CONFIG_CRYPTO_SHA256_MODULE) -+#error "crypto and sha256 must be built into the kernel" -+#endif -+ -+int -+chkpw(struct gr_arg *entry, unsigned char *salt, unsigned char *sum) -+{ -+ char *p; -+ struct crypto_hash *tfm; -+ struct hash_desc desc; -+ struct scatterlist sg; -+ unsigned char temp_sum[GR_SHA_LEN]; -+ volatile int retval = 0; -+ volatile int dummy = 0; -+ unsigned int i; -+ -+ tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); -+ if (IS_ERR(tfm)) { -+ /* should never happen, since sha256 should be built in */ -+ return 1; -+ } -+ -+ desc.tfm = tfm; -+ desc.flags = 0; -+ -+ crypto_hash_init(&desc); -+ -+ p = salt; -+ sg_set_buf(&sg, p, GR_SALT_LEN); -+ crypto_hash_update(&desc, &sg, sg.length); -+ -+ p = entry->pw; -+ sg_set_buf(&sg, p, strlen(p)); -+ -+ crypto_hash_update(&desc, &sg, sg.length); -+ -+ crypto_hash_final(&desc, temp_sum); -+ -+ memset(entry->pw, 0, GR_PW_LEN); -+ -+ for (i = 0; i < GR_SHA_LEN; i++) -+ if (sum[i] != temp_sum[i]) -+ retval = 1; -+ else -+ dummy = 1; // waste a cycle -+ -+ crypto_free_hash(tfm); -+ -+ return retval; -+} -diff -urNp linux-2.6.31.1/grsecurity/Kconfig linux-2.6.31.1/grsecurity/Kconfig ---- linux-2.6.31.1/grsecurity/Kconfig 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/Kconfig 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,908 @@ -+# -+# grecurity configuration -+# -+ -+menu "Grsecurity" -+ -+config GRKERNSEC -+ bool "Grsecurity" -+ select CRYPTO -+ select CRYPTO_SHA256 -+ help -+ If you say Y here, you will be able to configure many features -+ that will enhance the security of your system. It is highly -+ recommended that you say Y here and read through the help -+ for each option so that you fully understand the features and -+ can evaluate their usefulness for your machine. 
-+ -+choice -+ prompt "Security Level" -+ depends on GRKERNSEC -+ default GRKERNSEC_CUSTOM -+ -+config GRKERNSEC_LOW -+ bool "Low" -+ select GRKERNSEC_LINK -+ select GRKERNSEC_FIFO -+ select GRKERNSEC_EXECVE -+ select GRKERNSEC_RANDNET -+ select GRKERNSEC_DMESG -+ select GRKERNSEC_CHROOT -+ select GRKERNSEC_CHROOT_CHDIR -+ -+ help -+ If you choose this option, several of the grsecurity options will -+ be enabled that will give you greater protection against a number -+ of attacks, while assuring that none of your software will have any -+ conflicts with the additional security measures. If you run a lot -+ of unusual software, or you are having problems with the higher -+ security levels, you should say Y here. With this option, the -+ following features are enabled: -+ -+ - Linking restrictions -+ - FIFO restrictions -+ - Enforcing RLIMIT_NPROC on execve -+ - Restricted dmesg -+ - Enforced chdir("/") on chroot -+ - Runtime module disabling -+ -+config GRKERNSEC_MEDIUM -+ bool "Medium" -+ select PAX -+ select PAX_EI_PAX -+ select PAX_PT_PAX_FLAGS -+ select PAX_HAVE_ACL_FLAGS -+ select GRKERNSEC_PROC_MEMMAP if (PAX_NOEXEC || PAX_ASLR) -+ select GRKERNSEC_CHROOT -+ select GRKERNSEC_CHROOT_SYSCTL -+ select GRKERNSEC_LINK -+ select GRKERNSEC_FIFO -+ select GRKERNSEC_EXECVE -+ select GRKERNSEC_DMESG -+ select GRKERNSEC_RANDNET -+ select GRKERNSEC_FORKFAIL -+ select GRKERNSEC_TIME -+ select GRKERNSEC_SIGNAL -+ select GRKERNSEC_CHROOT -+ select GRKERNSEC_CHROOT_UNIX -+ select GRKERNSEC_CHROOT_MOUNT -+ select GRKERNSEC_CHROOT_PIVOT -+ select GRKERNSEC_CHROOT_DOUBLE -+ select GRKERNSEC_CHROOT_CHDIR -+ select GRKERNSEC_CHROOT_MKNOD -+ select GRKERNSEC_PROC -+ select GRKERNSEC_PROC_USERGROUP -+ select PAX_RANDUSTACK -+ select PAX_ASLR -+ select PAX_RANDMMAP -+ select PAX_REFCOUNT if (X86 || SPARC64) -+ select PAX_USERCOPY if ((X86 || SPARC32 || SPARC64 || PPC32 || PPC64) && (SLAB || SLUB || SLOB)) -+ -+ help -+ If you say Y here, several features in addition to those included -+ in the low additional security level will be enabled. These -+ features provide even more security to your system, though in rare -+ cases they may be incompatible with very old or poorly written -+ software. If you enable this option, make sure that your auth -+ service (identd) is running as gid 1001. 
With this option, -+ the following features (in addition to those provided in the -+ low additional security level) will be enabled: -+ -+ - Failed fork logging -+ - Time change logging -+ - Signal logging -+ - Deny mounts in chroot -+ - Deny double chrooting -+ - Deny sysctl writes in chroot -+ - Deny mknod in chroot -+ - Deny access to abstract AF_UNIX sockets out of chroot -+ - Deny pivot_root in chroot -+ - Denied writes of /dev/kmem, /dev/mem, and /dev/port -+ - /proc restrictions with special GID set to 10 (usually wheel) -+ - Address Space Layout Randomization (ASLR) -+ - Prevent exploitation of most refcount overflows -+ - Bounds checking of copying between the kernel and userland -+ -+config GRKERNSEC_HIGH -+ bool "High" -+ select GRKERNSEC_LINK -+ select GRKERNSEC_FIFO -+ select GRKERNSEC_EXECVE -+ select GRKERNSEC_DMESG -+ select GRKERNSEC_FORKFAIL -+ select GRKERNSEC_TIME -+ select GRKERNSEC_SIGNAL -+ select GRKERNSEC_CHROOT -+ select GRKERNSEC_CHROOT_SHMAT -+ select GRKERNSEC_CHROOT_UNIX -+ select GRKERNSEC_CHROOT_MOUNT -+ select GRKERNSEC_CHROOT_FCHDIR -+ select GRKERNSEC_CHROOT_PIVOT -+ select GRKERNSEC_CHROOT_DOUBLE -+ select GRKERNSEC_CHROOT_CHDIR -+ select GRKERNSEC_CHROOT_MKNOD -+ select GRKERNSEC_CHROOT_CAPS -+ select GRKERNSEC_CHROOT_SYSCTL -+ select GRKERNSEC_CHROOT_FINDTASK -+ select GRKERNSEC_PROC -+ select GRKERNSEC_PROC_MEMMAP if (PAX_NOEXEC || PAX_ASLR) -+ select GRKERNSEC_HIDESYM -+ select GRKERNSEC_BRUTE -+ select GRKERNSEC_PROC_USERGROUP -+ select GRKERNSEC_KMEM -+ select GRKERNSEC_RESLOG -+ select GRKERNSEC_RANDNET -+ select GRKERNSEC_PROC_ADD -+ select GRKERNSEC_CHROOT_CHMOD -+ select GRKERNSEC_CHROOT_NICE -+ select GRKERNSEC_AUDIT_MOUNT -+ select GRKERNSEC_MODHARDEN if (MODULES) -+ select GRKERNSEC_HARDEN_PTRACE -+ select PAX -+ select PAX_RANDUSTACK -+ select PAX_ASLR -+ select PAX_RANDMMAP -+ select PAX_NOEXEC -+ select PAX_MPROTECT -+ select PAX_EI_PAX -+ select PAX_PT_PAX_FLAGS -+ select PAX_HAVE_ACL_FLAGS -+ select PAX_KERNEXEC if (X86 && (!X86_32 || X86_WP_WORKS_OK)) -+ select PAX_MEMORY_UDEREF if (X86_32) -+ select PAX_RANDKSTACK if (X86_TSC && !X86_64) -+ select PAX_SEGMEXEC if (X86_32) -+ select PAX_PAGEEXEC -+ select PAX_EMUPLT if (ALPHA || PARISC || SPARC32 || SPARC64) -+ select PAX_EMUTRAMP if (PARISC) -+ select PAX_EMUSIGRT if (PARISC) -+ select PAX_ETEXECRELOCS if (ALPHA || IA64 || PARISC) -+ select PAX_REFCOUNT if (X86 || SPARC64) -+ select PAX_USERCOPY if ((X86 || PPC32 || PPC64 || SPARC32 || SPARC64) && (SLAB || SLUB || SLOB)) -+ help -+ If you say Y here, many of the features of grsecurity will be -+ enabled, which will protect you against many kinds of attacks -+ against your system. The heightened security comes at a cost -+ of an increased chance of incompatibilities with rare software -+ on your machine. Since this security level enables PaX, you should -+ view http://pax.grsecurity.net and read about the PaX -+ project. While you are there, download chpax and run it on -+ binaries that cause problems with PaX. Also remember that -+ since the /proc restrictions are enabled, you must run your -+ identd as gid 1001. 
This security level enables the following -+ features in addition to those listed in the low and medium -+ security levels: -+ -+ - Additional /proc restrictions -+ - Chmod restrictions in chroot -+ - No signals, ptrace, or viewing of processes outside of chroot -+ - Capability restrictions in chroot -+ - Deny fchdir out of chroot -+ - Priority restrictions in chroot -+ - Segmentation-based implementation of PaX -+ - Mprotect restrictions -+ - Removal of addresses from /proc/<pid>/[smaps|maps|stat] -+ - Kernel stack randomization -+ - Mount/unmount/remount logging -+ - Kernel symbol hiding -+ - Prevention of memory exhaustion-based exploits -+ - Hardening of module auto-loading -+ - Ptrace restrictions -+ -+config GRKERNSEC_CUSTOM -+ bool "Custom" -+ help -+ If you say Y here, you will be able to configure every grsecurity -+ option, which allows you to enable many more features that aren't -+ covered in the basic security levels. These additional features -+ include TPE, socket restrictions, and the sysctl system for -+ grsecurity. It is advised that you read through the help for -+ each option to determine its usefulness in your situation. -+ -+endchoice -+ -+menu "Address Space Protection" -+depends on GRKERNSEC -+ -+config GRKERNSEC_KMEM -+ bool "Deny writing to /dev/kmem, /dev/mem, and /dev/port" -+ help -+ If you say Y here, /dev/kmem and /dev/mem won't be allowed to -+ be written to via mmap or otherwise to modify the running kernel. -+ /dev/port will also not be allowed to be opened. If you have module -+ support disabled, enabling this will close up four ways that are -+ currently used to insert malicious code into the running kernel. -+ Even with all these features enabled, we still highly recommend that -+ you use the RBAC system, as it is still possible for an attacker to -+ modify the running kernel through privileged I/O granted by ioperm/iopl. -+ If you are not using XFree86, you may be able to stop this additional -+ case by enabling the 'Disable privileged I/O' option. Though nothing -+ legitimately writes to /dev/kmem, XFree86 does need to write to /dev/mem, -+ but only to video memory, which is the only writing we allow in this -+ case. If /dev/kmem or /dev/mem are mmaped without PROT_WRITE, they will -+ not be allowed to mprotect it with PROT_WRITE later. -+ It is highly recommended that you say Y here if you meet all the -+ conditions above. -+ -+config GRKERNSEC_IO -+ bool "Disable privileged I/O" -+ depends on X86 -+ select RTC_CLASS -+ select RTC_INTF_DEV -+ select RTC_DRV_CMOS -+ -+ help -+ If you say Y here, all ioperm and iopl calls will return an error. -+ Ioperm and iopl can be used to modify the running kernel. -+ Unfortunately, some programs need this access to operate properly, -+ the most notable of which are XFree86 and hwclock. hwclock can be -+ remedied by having RTC support in the kernel, so real-time -+ clock support is enabled if this option is enabled, to ensure -+ that hwclock operates correctly. XFree86 still will not -+ operate correctly with this option enabled, so DO NOT CHOOSE Y -+ IF YOU USE XFree86. If you use XFree86 and you still want to -+ protect your kernel against modification, use the RBAC system. 
-+ -+config GRKERNSEC_PROC_MEMMAP -+ bool "Remove addresses from /proc/<pid>/[smaps|maps|stat]" -+ default y if (PAX_NOEXEC || PAX_ASLR) -+ depends on PAX_NOEXEC || PAX_ASLR -+ help -+ If you say Y here, the /proc/<pid>/maps and /proc/<pid>/stat files will -+ give no information about the addresses of its mappings if -+ PaX features that rely on random addresses are enabled on the task. -+ If you use PaX it is greatly recommended that you say Y here as it -+ closes up a hole that makes the full ASLR useless for suid -+ binaries. -+ -+config GRKERNSEC_BRUTE -+ bool "Deter exploit bruteforcing" -+ help -+ If you say Y here, attempts to bruteforce exploits against forking -+ daemons such as apache or sshd will be deterred. When a child of a -+ forking daemon is killed by PaX or crashes due to an illegal -+ instruction, the parent process will be delayed 30 seconds upon every -+ subsequent fork until the administrator is able to assess the -+ situation and restart the daemon. It is recommended that you also -+ enable signal logging in the auditing section so that logs are -+ generated when a process performs an illegal instruction. -+ -+config GRKERNSEC_MODHARDEN -+ bool "Harden module auto-loading" -+ depends on MODULES -+ help -+ If you say Y here, module auto-loading in response to use of some -+ feature implemented by an unloaded module will be restricted to -+ root users. Enabling this option helps defend against attacks -+ by unprivileged users who abuse the auto-loading behavior to -+ cause a vulnerable module to load that is then exploited. -+ -+ If this option prevents a legitimate use of auto-loading for a -+ non-root user, the administrator can execute modprobe manually -+ with the exact name of the module mentioned in the alert log. -+ Alternatively, the administrator can add the module to the list -+ of modules loaded at boot by modifying init scripts. -+ -+ Modification of init scripts will most likely be needed on -+ Ubuntu servers with encrypted home directory support enabled, -+ as the first non-root user logging in will cause the ecb(aes), -+ ecb(aes)-all, cbc(aes), and cbc(aes)-all modules to be loaded. -+ -+config GRKERNSEC_HIDESYM -+ bool "Hide kernel symbols" -+ help -+ If you say Y here, getting information on loaded modules, and -+ displaying all kernel symbols through a syscall will be restricted -+ to users with CAP_SYS_MODULE. For software compatibility reasons, -+ /proc/kallsyms will be restricted to the root user. The RBAC -+ system can hide that entry even from root. Note that this option -+ is only effective provided the following conditions are met: -+ 1) The kernel using grsecurity is not precompiled by some distribution -+ 2) You are using the RBAC system and hiding other files such as your -+ kernel image and System.map. Alternatively, enabling this option -+ causes the permissions on /boot, /lib/modules, and the kernel -+ source directory to change at compile time to prevent -+ reading by non-root users. -+ If the above conditions are met, this option will aid in providing a -+ useful protection against local kernel exploitation of overflows -+ and arbitrary read/write vulnerabilities. -+ -+endmenu -+menu "Role Based Access Control Options" -+depends on GRKERNSEC -+ -+config GRKERNSEC_NO_RBAC -+ bool "Disable RBAC system" -+ help -+ If you say Y here, the /dev/grsec device will be removed from the kernel, -+ preventing the RBAC system from being enabled. 
You should only say Y -+ here if you have no intention of using the RBAC system, so as to prevent -+ an attacker with root access from misusing the RBAC system to hide files -+ and processes when loadable module support and /dev/[k]mem have been -+ locked down. -+ -+config GRKERNSEC_ACL_HIDEKERN -+ bool "Hide kernel processes" -+ help -+ If you say Y here, all kernel threads will be hidden to all -+ processes but those whose subject has the "view hidden processes" -+ flag. -+ -+config GRKERNSEC_ACL_MAXTRIES -+ int "Maximum tries before password lockout" -+ default 3 -+ help -+ This option enforces the maximum number of times a user can attempt -+ to authorize themselves with the grsecurity RBAC system before being -+ denied the ability to attempt authorization again for a specified time. -+ The lower the number, the harder it will be to brute-force a password. -+ -+config GRKERNSEC_ACL_TIMEOUT -+ int "Time to wait after max password tries, in seconds" -+ default 30 -+ help -+ This option specifies the time the user must wait after attempting to -+ authorize to the RBAC system with the maximum number of invalid -+ passwords. The higher the number, the harder it will be to brute-force -+ a password. -+ -+endmenu -+menu "Filesystem Protections" -+depends on GRKERNSEC -+ -+config GRKERNSEC_PROC -+ bool "Proc restrictions" -+ help -+ If you say Y here, the permissions of the /proc filesystem -+ will be altered to enhance system security and privacy. You MUST -+ choose either a user only restriction or a user and group restriction. -+ Depending upon the option you choose, you can either restrict users to -+ see only the processes they themselves run, or choose a group that can -+ view all processes and files normally restricted to root if you choose -+ the "restrict to user only" option. NOTE: If you're running identd as -+ a non-root user, you will have to run it as the group you specify here. -+ -+config GRKERNSEC_PROC_USER -+ bool "Restrict /proc to user only" -+ depends on GRKERNSEC_PROC -+ help -+ If you say Y here, non-root users will only be able to view their own -+ processes, and restricts them from viewing network-related information, -+ and viewing kernel symbol and module information. -+ -+config GRKERNSEC_PROC_USERGROUP -+ bool "Allow special group" -+ depends on GRKERNSEC_PROC && !GRKERNSEC_PROC_USER -+ help -+ If you say Y here, you will be able to select a group that will be -+ able to view all processes, network-related information, and -+ kernel and symbol information. This option is useful if you want -+ to run identd as a non-root user. -+ -+config GRKERNSEC_PROC_GID -+ int "GID for special group" -+ depends on GRKERNSEC_PROC_USERGROUP -+ default 1001 -+ -+config GRKERNSEC_PROC_ADD -+ bool "Additional restrictions" -+ depends on GRKERNSEC_PROC_USER || GRKERNSEC_PROC_USERGROUP -+ help -+ If you say Y here, additional restrictions will be placed on -+ /proc that keep normal users from viewing device information and -+ slabinfo information that could be useful for exploits. -+ -+config GRKERNSEC_LINK -+ bool "Linking restrictions" -+ help -+ If you say Y here, /tmp race exploits will be prevented, since users -+ will no longer be able to follow symlinks owned by other users in -+ world-writable +t directories (i.e. /tmp), unless the owner of the -+ symlink is the owner of the directory. users will also not be -+ able to hardlink to files they do not own. If the sysctl option is -+ enabled, a sysctl option with name "linking_restrictions" is created. 
-+ -+config GRKERNSEC_FIFO -+ bool "FIFO restrictions" -+ help -+ If you say Y here, users will not be able to write to FIFOs they don't -+ own in world-writable +t directories (i.e. /tmp), unless the owner of -+ the FIFO is the same owner of the directory it's held in. If the sysctl -+ option is enabled, a sysctl option with name "fifo_restrictions" is -+ created. -+ -+config GRKERNSEC_CHROOT -+ bool "Chroot jail restrictions" -+ help -+ If you say Y here, you will be able to choose several options that will -+ make breaking out of a chrooted jail much more difficult. If you -+ encounter no software incompatibilities with the following options, it -+ is recommended that you enable each one. -+ -+config GRKERNSEC_CHROOT_MOUNT -+ bool "Deny mounts" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to -+ mount or remount filesystems. If the sysctl option is enabled, a -+ sysctl option with name "chroot_deny_mount" is created. -+ -+config GRKERNSEC_CHROOT_DOUBLE -+ bool "Deny double-chroots" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to chroot -+ again outside the chroot. This is a widely used method of breaking -+ out of a chroot jail and should not be allowed. If the sysctl -+ option is enabled, a sysctl option with name -+ "chroot_deny_chroot" is created. -+ -+config GRKERNSEC_CHROOT_PIVOT -+ bool "Deny pivot_root in chroot" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to use -+ a function called pivot_root() that was introduced in Linux 2.3.41. It -+ works similar to chroot in that it changes the root filesystem. This -+ function could be misused in a chrooted process to attempt to break out -+ of the chroot, and therefore should not be allowed. If the sysctl -+ option is enabled, a sysctl option with name "chroot_deny_pivot" is -+ created. -+ -+config GRKERNSEC_CHROOT_CHDIR -+ bool "Enforce chdir("/") on all chroots" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, the current working directory of all newly-chrooted -+ applications will be set to the the root directory of the chroot. -+ The man page on chroot(2) states: -+ Note that this call does not change the current working -+ directory, so that `.' can be outside the tree rooted at -+ `/'. In particular, the super-user can escape from a -+ `chroot jail' by doing `mkdir foo; chroot foo; cd ..'. -+ -+ It is recommended that you say Y here, since it's not known to break -+ any software. If the sysctl option is enabled, a sysctl option with -+ name "chroot_enforce_chdir" is created. -+ -+config GRKERNSEC_CHROOT_CHMOD -+ bool "Deny (f)chmod +s" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to chmod -+ or fchmod files to make them have suid or sgid bits. This protects -+ against another published method of breaking a chroot. If the sysctl -+ option is enabled, a sysctl option with name "chroot_deny_chmod" is -+ created. -+ -+config GRKERNSEC_CHROOT_FCHDIR -+ bool "Deny fchdir out of chroot" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, a well-known method of breaking chroots by fchdir'ing -+ to a file descriptor of the chrooting process that points to a directory -+ outside the filesystem will be stopped. If the sysctl option -+ is enabled, a sysctl option with name "chroot_deny_fchdir" is created. 
-+ -+config GRKERNSEC_CHROOT_MKNOD -+ bool "Deny mknod" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be allowed to -+ mknod. The problem with using mknod inside a chroot is that it -+ would allow an attacker to create a device entry that is the same -+ as one on the physical root of your system, which could range from -+ anything from the console device to a device for your harddrive (which -+ they could then use to wipe the drive or steal data). It is recommended -+ that you say Y here, unless you run into software incompatibilities. -+ If the sysctl option is enabled, a sysctl option with name -+ "chroot_deny_mknod" is created. -+ -+config GRKERNSEC_CHROOT_SHMAT -+ bool "Deny shmat() out of chroot" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to attach -+ to shared memory segments that were created outside of the chroot jail. -+ It is recommended that you say Y here. If the sysctl option is enabled, -+ a sysctl option with name "chroot_deny_shmat" is created. -+ -+config GRKERNSEC_CHROOT_UNIX -+ bool "Deny access to abstract AF_UNIX sockets out of chroot" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to -+ connect to abstract (meaning not belonging to a filesystem) Unix -+ domain sockets that were bound outside of a chroot. It is recommended -+ that you say Y here. If the sysctl option is enabled, a sysctl option -+ with name "chroot_deny_unix" is created. -+ -+config GRKERNSEC_CHROOT_FINDTASK -+ bool "Protect outside processes" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to -+ kill, send signals with fcntl, ptrace, capget, getpgid, setpgid, -+ getsid, or view any process outside of the chroot. If the sysctl -+ option is enabled, a sysctl option with name "chroot_findtask" is -+ created. -+ -+config GRKERNSEC_CHROOT_NICE -+ bool "Restrict priority changes" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, processes inside a chroot will not be able to raise -+ the priority of processes in the chroot, or alter the priority of -+ processes outside the chroot. This provides more security than simply -+ removing CAP_SYS_NICE from the process' capability set. If the -+ sysctl option is enabled, a sysctl option with name "chroot_restrict_nice" -+ is created. -+ -+config GRKERNSEC_CHROOT_SYSCTL -+ bool "Deny sysctl writes" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, an attacker in a chroot will not be able to -+ write to sysctl entries, either by sysctl(2) or through a /proc -+ interface. It is strongly recommended that you say Y here. If the -+ sysctl option is enabled, a sysctl option with name -+ "chroot_deny_sysctl" is created. -+ -+config GRKERNSEC_CHROOT_CAPS -+ bool "Capability restrictions" -+ depends on GRKERNSEC_CHROOT -+ help -+ If you say Y here, the capabilities on all root processes within a -+ chroot jail will be lowered to stop module insertion, raw i/o, -+ system and net admin tasks, rebooting the system, modifying immutable -+ files, modifying IPC owned by another, and changing the system time. -+ This is left an option because it can break some apps. Disable this -+ if your chrooted apps are having problems performing those kinds of -+ tasks. If the sysctl option is enabled, a sysctl option with -+ name "chroot_caps" is created. 
-+ -+endmenu -+menu "Kernel Auditing" -+depends on GRKERNSEC -+ -+config GRKERNSEC_AUDIT_GROUP -+ bool "Single group for auditing" -+ help -+ If you say Y here, the exec, chdir, and (un)mount logging features -+ will only operate on a group you specify. This option is recommended -+ if you only want to watch certain users instead of having a large -+ amount of logs from the entire system. If the sysctl option is enabled, -+ a sysctl option with name "audit_group" is created. -+ -+config GRKERNSEC_AUDIT_GID -+ int "GID for auditing" -+ depends on GRKERNSEC_AUDIT_GROUP -+ default 1007 -+ -+config GRKERNSEC_EXECLOG -+ bool "Exec logging" -+ help -+ If you say Y here, all execve() calls will be logged (since the -+ other exec*() calls are frontends to execve(), all execution -+ will be logged). Useful for shell-servers that like to keep track -+ of their users. If the sysctl option is enabled, a sysctl option with -+ name "exec_logging" is created. -+ WARNING: This option when enabled will produce a LOT of logs, especially -+ on an active system. -+ -+config GRKERNSEC_RESLOG -+ bool "Resource logging" -+ help -+ If you say Y here, all attempts to overstep resource limits will -+ be logged with the resource name, the requested size, and the current -+ limit. It is highly recommended that you say Y here. If the sysctl -+ option is enabled, a sysctl option with name "resource_logging" is -+ created. If the RBAC system is enabled, the sysctl value is ignored. -+ -+config GRKERNSEC_CHROOT_EXECLOG -+ bool "Log execs within chroot" -+ help -+ If you say Y here, all executions inside a chroot jail will be logged -+ to syslog. This can cause a large amount of logs if certain -+ applications (eg. djb's daemontools) are installed on the system, and -+ is therefore left as an option. If the sysctl option is enabled, a -+ sysctl option with name "chroot_execlog" is created. -+ -+config GRKERNSEC_AUDIT_CHDIR -+ bool "Chdir logging" -+ help -+ If you say Y here, all chdir() calls will be logged. If the sysctl -+ option is enabled, a sysctl option with name "audit_chdir" is created. -+ -+config GRKERNSEC_AUDIT_MOUNT -+ bool "(Un)Mount logging" -+ help -+ If you say Y here, all mounts and unmounts will be logged. If the -+ sysctl option is enabled, a sysctl option with name "audit_mount" is -+ created. -+ -+config GRKERNSEC_SIGNAL -+ bool "Signal logging" -+ help -+ If you say Y here, certain important signals will be logged, such as -+ SIGSEGV, which will as a result inform you of when a error in a program -+ occurred, which in some cases could mean a possible exploit attempt. -+ If the sysctl option is enabled, a sysctl option with name -+ "signal_logging" is created. -+ -+config GRKERNSEC_FORKFAIL -+ bool "Fork failure logging" -+ help -+ If you say Y here, all failed fork() attempts will be logged. -+ This could suggest a fork bomb, or someone attempting to overstep -+ their process limit. If the sysctl option is enabled, a sysctl option -+ with name "forkfail_logging" is created. -+ -+config GRKERNSEC_TIME -+ bool "Time change logging" -+ help -+ If you say Y here, any changes of the system clock will be logged. -+ If the sysctl option is enabled, a sysctl option with name -+ "timechange_logging" is created. -+ -+config GRKERNSEC_PROC_IPADDR -+ bool "/proc/<pid>/ipaddr support" -+ help -+ If you say Y here, a new entry will be added to each /proc/<pid> -+ directory that contains the IP address of the person using the task. -+ The IP is carried across local TCP and AF_UNIX stream sockets. 
-+ This information can be useful for IDS/IPSes to perform remote response -+ to a local attack. The entry is readable by only the owner of the -+ process (and root if he has CAP_DAC_OVERRIDE, which can be removed via -+ the RBAC system), and thus does not create privacy concerns. -+ -+config GRKERNSEC_AUDIT_TEXTREL -+ bool 'ELF text relocations logging (READ HELP)' -+ depends on PAX_MPROTECT -+ help -+ If you say Y here, text relocations will be logged with the filename -+ of the offending library or binary. The purpose of the feature is -+ to help Linux distribution developers get rid of libraries and -+ binaries that need text relocations which hinder the future progress -+ of PaX. Only Linux distribution developers should say Y here, and -+ never on a production machine, as this option creates an information -+ leak that could aid an attacker in defeating the randomization of -+ a single memory region. If the sysctl option is enabled, a sysctl -+ option with name "audit_textrel" is created. -+ -+endmenu -+ -+menu "Executable Protections" -+depends on GRKERNSEC -+ -+config GRKERNSEC_EXECVE -+ bool "Enforce RLIMIT_NPROC on execs" -+ help -+ If you say Y here, users with a resource limit on processes will -+ have the value checked during execve() calls. The current system -+ only checks the system limit during fork() calls. If the sysctl option -+ is enabled, a sysctl option with name "execve_limiting" is created. -+ -+config GRKERNSEC_DMESG -+ bool "Dmesg(8) restriction" -+ help -+ If you say Y here, non-root users will not be able to use dmesg(8) -+ to view up to the last 4kb of messages in the kernel's log buffer. -+ If the sysctl option is enabled, a sysctl option with name "dmesg" is -+ created. -+ -+config GRKERNSEC_HARDEN_PTRACE -+ bool "Deter ptrace-based process snooping" -+ help -+ If you say Y here, TTY sniffers and other malicious monitoring -+ programs implemented through ptrace will be defeated. If you -+ have been using the RBAC system, this option has already been -+ enabled for several years for all users, with the ability to make -+ fine-grained exceptions. -+ -+ This option only affects the ability of non-root users to ptrace -+ processes that are not a descendent of the ptracing process. -+ This means that strace ./binary and gdb ./binary will still work, -+ but attaching to arbitrary processes will not. If the sysctl -+ option is enabled, a sysctl option with name "harden_ptrace" is -+ created. -+ -+config GRKERNSEC_TPE -+ bool "Trusted Path Execution (TPE)" -+ help -+ If you say Y here, you will be able to choose a gid to add to the -+ supplementary groups of users you want to mark as "untrusted." -+ These users will not be able to execute any files that are not in -+ root-owned directories writable only by root. If the sysctl option -+ is enabled, a sysctl option with name "tpe" is created. -+ -+config GRKERNSEC_TPE_ALL -+ bool "Partially restrict non-root users" -+ depends on GRKERNSEC_TPE -+ help -+ If you say Y here, All non-root users other than the ones in the -+ group specified in the main TPE option will only be allowed to -+ execute files in directories they own that are not group or -+ world-writable, or in directories owned by root and writable only by -+ root. If the sysctl option is enabled, a sysctl option with name -+ "tpe_restrict_all" is created. 
-+ -+config GRKERNSEC_TPE_INVERT -+ bool "Invert GID option" -+ depends on GRKERNSEC_TPE -+ help -+ If you say Y here, the group you specify in the TPE configuration will -+ decide what group TPE restrictions will be *disabled* for. This -+ option is useful if you want TPE restrictions to be applied to most -+ users on the system. -+ -+config GRKERNSEC_TPE_GID -+ int "GID for untrusted users" -+ depends on GRKERNSEC_TPE && !GRKERNSEC_TPE_INVERT -+ default 1005 -+ help -+ If you have selected the "Invert GID option" above, setting this -+ GID determines what group TPE restrictions will be *disabled* for. -+ If you have not selected the "Invert GID option" above, setting this -+ GID determines what group TPE restrictions will be *enabled* for. -+ If the sysctl option is enabled, a sysctl option with name "tpe_gid" -+ is created. -+ -+config GRKERNSEC_TPE_GID -+ int "GID for trusted users" -+ depends on GRKERNSEC_TPE && GRKERNSEC_TPE_INVERT -+ default 1005 -+ help -+ If you have selected the "Invert GID option" above, setting this -+ GID determines what group TPE restrictions will be *disabled* for. -+ If you have not selected the "Invert GID option" above, setting this -+ GID determines what group TPE restrictions will be *enabled* for. -+ If the sysctl option is enabled, a sysctl option with name "tpe_gid" -+ is created. -+ -+endmenu -+menu "Network Protections" -+depends on GRKERNSEC -+ -+config GRKERNSEC_RANDNET -+ bool "Larger entropy pools" -+ help -+ If you say Y here, the entropy pools used for many features of Linux -+ and grsecurity will be doubled in size. Since several grsecurity -+ features use additional randomness, it is recommended that you say Y -+ here. Saying Y here has a similar effect as modifying -+ /proc/sys/kernel/random/poolsize. -+ -+config GRKERNSEC_BLACKHOLE -+ bool "TCP/UDP blackhole" -+ help -+ If you say Y here, neither TCP resets nor ICMP -+ destination-unreachable packets will be sent in response to packets -+ send to ports for which no associated listening process exists. -+ This feature supports both IPV4 and IPV6 and exempts the -+ loopback interface from blackholing. Enabling this feature -+ makes a host more resilient to DoS attacks and reduces network -+ visibility against scanners. -+ -+config GRKERNSEC_SOCKET -+ bool "Socket restrictions" -+ help -+ If you say Y here, you will be able to choose from several options. -+ If you assign a GID on your system and add it to the supplementary -+ groups of users you want to restrict socket access to, this patch -+ will perform up to three things, based on the option(s) you choose. -+ -+config GRKERNSEC_SOCKET_ALL -+ bool "Deny any sockets to group" -+ depends on GRKERNSEC_SOCKET -+ help -+ If you say Y here, you will be able to choose a GID of whose users will -+ be unable to connect to other hosts from your machine or run server -+ applications from your machine. If the sysctl option is enabled, a -+ sysctl option with name "socket_all" is created. -+ -+config GRKERNSEC_SOCKET_ALL_GID -+ int "GID to deny all sockets for" -+ depends on GRKERNSEC_SOCKET_ALL -+ default 1004 -+ help -+ Here you can choose the GID to disable socket access for. Remember to -+ add the users you want socket access disabled for to the GID -+ specified here. If the sysctl option is enabled, a sysctl option -+ with name "socket_all_gid" is created. 
-+ -+config GRKERNSEC_SOCKET_CLIENT -+ bool "Deny client sockets to group" -+ depends on GRKERNSEC_SOCKET -+ help -+ If you say Y here, you will be able to choose a GID of whose users will -+ be unable to connect to other hosts from your machine, but will be -+ able to run servers. If this option is enabled, all users in the group -+ you specify will have to use passive mode when initiating ftp transfers -+ from the shell on your machine. If the sysctl option is enabled, a -+ sysctl option with name "socket_client" is created. -+ -+config GRKERNSEC_SOCKET_CLIENT_GID -+ int "GID to deny client sockets for" -+ depends on GRKERNSEC_SOCKET_CLIENT -+ default 1003 -+ help -+ Here you can choose the GID to disable client socket access for. -+ Remember to add the users you want client socket access disabled for to -+ the GID specified here. If the sysctl option is enabled, a sysctl -+ option with name "socket_client_gid" is created. -+ -+config GRKERNSEC_SOCKET_SERVER -+ bool "Deny server sockets to group" -+ depends on GRKERNSEC_SOCKET -+ help -+ If you say Y here, you will be able to choose a GID of whose users will -+ be unable to run server applications from your machine. If the sysctl -+ option is enabled, a sysctl option with name "socket_server" is created. -+ -+config GRKERNSEC_SOCKET_SERVER_GID -+ int "GID to deny server sockets for" -+ depends on GRKERNSEC_SOCKET_SERVER -+ default 1002 -+ help -+ Here you can choose the GID to disable server socket access for. -+ Remember to add the users you want server socket access disabled for to -+ the GID specified here. If the sysctl option is enabled, a sysctl -+ option with name "socket_server_gid" is created. -+ -+endmenu -+menu "Sysctl support" -+depends on GRKERNSEC && SYSCTL -+ -+config GRKERNSEC_SYSCTL -+ bool "Sysctl support" -+ help -+ If you say Y here, you will be able to change the options that -+ grsecurity runs with at bootup, without having to recompile your -+ kernel. You can echo values to files in /proc/sys/kernel/grsecurity -+ to enable (1) or disable (0) various features. All the sysctl entries -+ are mutable until the "grsec_lock" entry is set to a non-zero value. -+ All features enabled in the kernel configuration are disabled at boot -+ if you do not say Y to the "Turn on features by default" option. -+ All options should be set at startup, and the grsec_lock entry should -+ be set to a non-zero value after all the options are set. -+ *THIS IS EXTREMELY IMPORTANT* -+ -+config GRKERNSEC_SYSCTL_ON -+ bool "Turn on features by default" -+ depends on GRKERNSEC_SYSCTL -+ help -+ If you say Y here, instead of having all features enabled in the -+ kernel configuration disabled at boot time, the features will be -+ enabled at boot time. It is recommended you say Y here unless -+ there is some reason you would want all sysctl-tunable features to -+ be disabled by default. As mentioned elsewhere, it is important -+ to enable the grsec_lock entry once you have finished modifying -+ the sysctl entries. -+ -+endmenu -+menu "Logging Options" -+depends on GRKERNSEC -+ -+config GRKERNSEC_FLOODTIME -+ int "Seconds in between log messages (minimum)" -+ default 10 -+ help -+ This option allows you to enforce the number of seconds between -+ grsecurity log messages. The default should be suitable for most -+ people, however, if you choose to change it, choose a value small enough -+ to allow informative logs to be produced, but large enough to -+ prevent flooding. 
-+ -+config GRKERNSEC_FLOODBURST -+ int "Number of messages in a burst (maximum)" -+ default 4 -+ help -+ This option allows you to choose the maximum number of messages allowed -+ within the flood time interval you chose in a separate option. The -+ default should be suitable for most people, however if you find that -+ many of your logs are being interpreted as flooding, you may want to -+ raise this value. -+ -+endmenu -+ -+endmenu -diff -urNp linux-2.6.31.1/grsecurity/Makefile linux-2.6.31.1/grsecurity/Makefile ---- linux-2.6.31.1/grsecurity/Makefile 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/grsecurity/Makefile 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,29 @@ -+# grsecurity's ACL system was originally written in 2001 by Michael Dalton -+# during 2001-2009 it has been completely redesigned by Brad Spengler -+# into an RBAC system -+# -+# All code in this directory and various hooks inserted throughout the kernel -+# are copyright Brad Spengler - Open Source Security, Inc., and released -+# under the GPL v2 or higher -+ -+obj-y = grsec_chdir.o grsec_chroot.o grsec_exec.o grsec_fifo.o grsec_fork.o \ -+ grsec_mount.o grsec_sig.o grsec_sock.o grsec_sysctl.o \ -+ grsec_time.o grsec_tpe.o grsec_link.o grsec_textrel.o -+ -+obj-$(CONFIG_GRKERNSEC) += grsec_init.o grsum.o gracl.o gracl_ip.o gracl_segv.o \ -+ gracl_cap.o gracl_alloc.o gracl_shm.o grsec_mem.o gracl_fs.o \ -+ gracl_learn.o grsec_log.o -+obj-$(CONFIG_GRKERNSEC_RESLOG) += gracl_res.o -+ -+ifndef CONFIG_GRKERNSEC -+obj-y += grsec_disabled.o -+endif -+ -+ifdef CONFIG_GRKERNSEC_HIDESYM -+extra-y := grsec_hidesym.o -+$(obj)/grsec_hidesym.o: -+ @-chmod -f 500 /boot -+ @-chmod -f 500 /lib/modules -+ @-chmod -f 700 . -+ @echo ' grsec: protected kernel image paths' -+endif -diff -urNp linux-2.6.31.1/include/asm-generic/atomic.h linux-2.6.31.1/include/asm-generic/atomic.h ---- linux-2.6.31.1/include/asm-generic/atomic.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/asm-generic/atomic.h 2009-10-01 20:12:44.000000000 -0400 -@@ -36,6 +36,15 @@ - #define atomic_read(v) ((v)->counter) - - /** -+ * atomic_read_unchecked - read atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * -+ * Atomically reads the value of @v. Note that the guaranteed -+ * useful range of an atomic_unchecked_t is only 24 bits. -+ */ -+#define atomic_read_unchecked(v) ((v)->counter) -+ -+/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value -@@ -45,6 +54,16 @@ - */ - #define atomic_set(v, i) (((v)->counter) = (i)) - -+/** -+ * atomic_set_unchecked - set atomic variable -+ * @v: pointer of type atomic_unchecked_t -+ * @i: required value -+ * -+ * Atomically sets the value of @v to @i. Note that the guaranteed -+ * useful range of an atomic_unchecked_t is only 24 bits. 
-+ */ -+#define atomic_set_unchecked(v, i) (((v)->counter) = (i)) -+ - #include <asm/system.h> - - /** -@@ -101,16 +120,31 @@ static inline void atomic_add(int i, ato - atomic_add_return(i, v); - } - -+static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_add_return(i, (atomic_t *)v); -+} -+ - static inline void atomic_sub(int i, atomic_t *v) - { - atomic_sub_return(i, v); - } - -+static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) -+{ -+ atomic_sub_return(i, (atomic_t *)v); -+} -+ - static inline void atomic_inc(atomic_t *v) - { - atomic_add_return(1, v); - } - -+static inline void atomic_inc_unchecked(atomic_unchecked_t *v) -+{ -+ atomic_add_return(1, (atomic_t *)v); -+} -+ - static inline void atomic_dec(atomic_t *v) - { - atomic_sub_return(1, v); -diff -urNp linux-2.6.31.1/include/asm-generic/futex.h linux-2.6.31.1/include/asm-generic/futex.h ---- linux-2.6.31.1/include/asm-generic/futex.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/asm-generic/futex.h 2009-10-01 20:12:44.000000000 -0400 -@@ -6,7 +6,7 @@ - #include <asm/errno.h> - - static inline int --futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -+futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) - { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; -@@ -48,7 +48,7 @@ futex_atomic_op_inuser (int encoded_op, - } - - static inline int --futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) -+futex_atomic_cmpxchg_inatomic(u32 __user *uaddr, int oldval, int newval) - { - return -ENOSYS; - } -diff -urNp linux-2.6.31.1/include/asm-generic/int-l64.h linux-2.6.31.1/include/asm-generic/int-l64.h ---- linux-2.6.31.1/include/asm-generic/int-l64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/asm-generic/int-l64.h 2009-10-01 20:12:44.000000000 -0400 -@@ -46,6 +46,8 @@ typedef unsigned int u32; - typedef signed long s64; - typedef unsigned long u64; - -+typedef unsigned int intoverflow_t __attribute__ ((mode(TI))); -+ - #define S8_C(x) x - #define U8_C(x) x ## U - #define S16_C(x) x -diff -urNp linux-2.6.31.1/include/asm-generic/int-ll64.h linux-2.6.31.1/include/asm-generic/int-ll64.h ---- linux-2.6.31.1/include/asm-generic/int-ll64.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/asm-generic/int-ll64.h 2009-10-01 20:12:44.000000000 -0400 -@@ -51,6 +51,8 @@ typedef unsigned int u32; - typedef signed long long s64; - typedef unsigned long long u64; - -+typedef unsigned long long intoverflow_t; -+ - #define S8_C(x) x - #define U8_C(x) x ## U - #define S16_C(x) x -diff -urNp linux-2.6.31.1/include/asm-generic/kmap_types.h linux-2.6.31.1/include/asm-generic/kmap_types.h ---- linux-2.6.31.1/include/asm-generic/kmap_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/asm-generic/kmap_types.h 2009-10-01 20:12:44.000000000 -0400 -@@ -27,7 +27,8 @@ D(15) KM_UML_USERCOPY, /* UML specific, - D(16) KM_IRQ_PTE, - D(17) KM_NMI, - D(18) KM_NMI_PTE, --D(19) KM_TYPE_NR -+D(19) KM_CLEARPAGE, -+D(20) KM_TYPE_NR - }; - - #undef D -diff -urNp linux-2.6.31.1/include/asm-generic/vmlinux.lds.h linux-2.6.31.1/include/asm-generic/vmlinux.lds.h ---- linux-2.6.31.1/include/asm-generic/vmlinux.lds.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/asm-generic/vmlinux.lds.h 2009-10-01 20:12:44.000000000 -0400 -@@ -201,6 +201,7 @@ - .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__start_rodata) = .; \ - *(.rodata) *(.rodata.*) \ -+ *(.data.read_only) \ - 
*(__vermagic) /* Kernel version magic */ \
- *(__markers_strings) /* Markers: strings */ \
- *(__tracepoints_strings)/* Tracepoints: strings */ \
-@@ -641,22 +642,24 @@
- * section in the linker script will go there too. @phdr should have
- * a leading colon.
- *
-- * Note that this macros defines __per_cpu_load as an absolute symbol.
-+ * Note that this macros defines per_cpu_load as an absolute symbol.
- * If there is no need to put the percpu section at a predetermined
- * address, use PERCPU().
- */
- #define PERCPU_VADDR(vaddr, phdr) \
-- VMLINUX_SYMBOL(__per_cpu_load) = .; \
-- .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \
-+ per_cpu_load = .; \
-+ .data.percpu vaddr : AT(VMLINUX_SYMBOL(per_cpu_load) \
- - LOAD_OFFSET) { \
-+ VMLINUX_SYMBOL(__per_cpu_load) = . + per_cpu_load; \
- VMLINUX_SYMBOL(__per_cpu_start) = .; \
- *(.data.percpu.first) \
-- *(.data.percpu.page_aligned) \
- *(.data.percpu) \
-+ . = ALIGN(PAGE_SIZE); \
-+ *(.data.percpu.page_aligned) \
- *(.data.percpu.shared_aligned) \
- VMLINUX_SYMBOL(__per_cpu_end) = .; \
- } phdr \
-- . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
-+ . = VMLINUX_SYMBOL(per_cpu_load) + SIZEOF(.data.percpu);
-
- /**
- * PERCPU - define output section for percpu area, simple version
-diff -urNp linux-2.6.31.1/include/drm/drm_pciids.h linux-2.6.31.1/include/drm/drm_pciids.h
---- linux-2.6.31.1/include/drm/drm_pciids.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/drm/drm_pciids.h 2009-10-01 20:12:44.000000000 -0400
-@@ -375,7 +375,7 @@
- {0x1002, 0x9712, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \
- {0x1002, 0x9713, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \
- {0x1002, 0x9714, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define r128_PCI_IDS \
- {0x1002, 0x4c45, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-@@ -415,14 +415,14 @@
- {0x1002, 0x5446, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x1002, 0x544C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x1002, 0x5452, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define mga_PCI_IDS \
- {0x102b, 0x0520, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G200}, \
- {0x102b, 0x0521, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G200}, \
- {0x102b, 0x0525, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G400}, \
- {0x102b, 0x2527, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G550}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define mach64_PCI_IDS \
- {0x1002, 0x4749, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-@@ -445,7 +445,7 @@
- {0x1002, 0x4c53, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x1002, 0x4c4d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x1002, 0x4c4e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define sisdrv_PCI_IDS \
- {0x1039, 0x0300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-@@ -456,7 +456,7 @@
- {0x1039, 0x7300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x18CA, 0x0040, PCI_ANY_ID, PCI_ANY_ID, 0, 0, SIS_CHIP_315}, \
- {0x18CA, 0x0042, PCI_ANY_ID, PCI_ANY_ID, 0, 0, SIS_CHIP_315}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define tdfx_PCI_IDS \
- {0x121a, 0x0003, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-@@ -465,7 +465,7 @@
- {0x121a, 0x0007, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x121a, 0x0009, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x121a, 0x000b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define viadrv_PCI_IDS \
- {0x1106, 0x3022, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-@@ -477,14 +477,14 @@
- {0x1106, 0x3343, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x1106, 0x3230, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VIA_DX9_0}, \
- {0x1106, 0x3157, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VIA_PRO_GROUP_A}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define i810_PCI_IDS \
- {0x8086, 0x7121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x8086, 0x7123, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x8086, 0x7125, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x8086, 0x1132, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define i830_PCI_IDS \
- {0x8086, 0x3577, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-@@ -492,11 +492,11 @@
- {0x8086, 0x3582, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x8086, 0x2572, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
- {0x8086, 0x358e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define gamma_PCI_IDS \
- {0x3d3d, 0x0008, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define savage_PCI_IDS \
- {0x5333, 0x8a20, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_SAVAGE3D}, \
-@@ -522,10 +522,10 @@
- {0x5333, 0x8d02, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_TWISTER}, \
- {0x5333, 0x8d03, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_PROSAVAGEDDR}, \
- {0x5333, 0x8d04, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_PROSAVAGEDDR}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define ffb_PCI_IDS \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-
- #define i915_PCI_IDS \
- {0x8086, 0x3577, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
-@@ -557,4 +557,4 @@
- {0x8086, 0x35e8, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
- {0x8086, 0x0042, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
- {0x8086, 0x0046, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \
-- {0, 0, 0}
-+ {0, 0, 0, 0, 0, 0}
-diff -urNp linux-2.6.31.1/include/drm/drmP.h linux-2.6.31.1/include/drm/drmP.h
---- linux-2.6.31.1/include/drm/drmP.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/drm/drmP.h 2009-10-01 20:12:44.000000000 -0400
-@@ -787,7 +787,7 @@ struct drm_driver {
- void (*gem_free_object) (struct drm_gem_object *obj);
-
- /* Driver private ops for this object */
-- struct vm_operations_struct *gem_vm_ops;
-+ const struct vm_operations_struct *gem_vm_ops;
-
- int major;
- int minor;
-@@ -890,7 +890,7 @@ struct drm_device {
-
- /** \name Usage Counters */
- /*@{ */
-- int open_count; /**< Outstanding files open */
-+ atomic_t open_count; /**< Outstanding files open */
- atomic_t ioctl_count; /**< Outstanding IOCTLs pending */
- atomic_t vma_count; /**< Outstanding vma areas open */
- int buf_use; /**< Buffers in use -- cannot alloc */
-@@ -901,7 +901,7 @@ struct drm_device {
- /*@{ */
- unsigned long counters;
- enum drm_stat_type types[15];
-- atomic_t counts[15];
-+ atomic_unchecked_t counts[15];
- /*@} */
-
- struct list_head filelist;
-diff -urNp linux-2.6.31.1/include/linux/agp_backend.h linux-2.6.31.1/include/linux/agp_backend.h
---- linux-2.6.31.1/include/linux/agp_backend.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/agp_backend.h 2009-10-01 20:12:44.000000000 -0400
-@@ -53,7 +53,7 @@ struct agp_kern_info {
- int current_memory;
- bool cant_use_aperture;
- unsigned long page_mask;
-- struct vm_operations_struct *vm_ops;
-+ const struct vm_operations_struct *vm_ops;
- };
-
- /*
-diff -urNp linux-2.6.31.1/include/linux/a.out.h linux-2.6.31.1/include/linux/a.out.h
---- linux-2.6.31.1/include/linux/a.out.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/a.out.h 2009-10-01 20:12:44.000000000 -0400
-@@ -39,6 +39,14 @@ enum machine_type {
- M_MIPS2 = 152 /* MIPS R6000/R4000 binary */
- };
-
-+/* Constants for the N_FLAGS field */
-+#define F_PAX_PAGEEXEC 1 /* Paging based non-executable pages */
-+#define F_PAX_EMUTRAMP 2 /* Emulate trampolines */
-+#define F_PAX_MPROTECT 4 /* Restrict mprotect() */
-+#define F_PAX_RANDMMAP 8 /* Randomize mmap() base */
-+/*#define F_PAX_RANDEXEC 16*/ /* Randomize ET_EXEC base */
-+#define F_PAX_SEGMEXEC 32 /* Segmentation based non-executable pages */
-+
- #if !defined (N_MAGIC)
- #define N_MAGIC(exec) ((exec).a_info & 0xffff)
- #endif
-diff -urNp linux-2.6.31.1/include/linux/atmdev.h linux-2.6.31.1/include/linux/atmdev.h
---- linux-2.6.31.1/include/linux/atmdev.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/atmdev.h 2009-10-01 20:12:44.000000000 -0400
-@@ -237,7 +237,7 @@ struct compat_atm_iobuf {
- #endif
-
- struct k_atm_aal_stats {
--#define __HANDLE_ITEM(i) atomic_t i
-+#define __HANDLE_ITEM(i) atomic_unchecked_t i
- __AAL_STAT_ITEMS
- #undef __HANDLE_ITEM
- };
-diff -urNp linux-2.6.31.1/include/linux/binfmts.h linux-2.6.31.1/include/linux/binfmts.h
---- linux-2.6.31.1/include/linux/binfmts.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/binfmts.h 2009-10-01 20:12:44.000000000 -0400
-@@ -78,6 +78,7 @@ struct linux_binfmt {
- int (*load_binary)(struct linux_binprm *, struct pt_regs * regs);
- int (*load_shlib)(struct file *);
- int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
-+ void (*handle_mprotect)(struct vm_area_struct *vma, unsigned long newflags);
- unsigned long min_coredump; /* minimal dump size */
- int hasvdso;
- };
-diff -urNp linux-2.6.31.1/include/linux/cache.h linux-2.6.31.1/include/linux/cache.h
---- linux-2.6.31.1/include/linux/cache.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/cache.h 2009-10-01 20:12:44.000000000 -0400
-@@ -16,6 +16,10 @@
- #define __read_mostly
- #endif
-
-+#ifndef __read_only
-+#define __read_only __read_mostly
-+#endif
-+
- #ifndef ____cacheline_aligned
- #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
- #endif
-diff -urNp linux-2.6.31.1/include/linux/capability.h linux-2.6.31.1/include/linux/capability.h
---- linux-2.6.31.1/include/linux/capability.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/capability.h 2009-10-01 20:12:44.000000000 -0400
-@@ -563,6 +563,7 @@ extern const kernel_cap_t __cap_init_eff
- (security_real_capable_noaudit((t), (cap)) == 0)
-
- extern int capable(int cap);
-+int capable_nolog(int cap);
-
- /* audit system wants to get cap info from files as well */
- struct dentry;
-diff -urNp linux-2.6.31.1/include/linux/cgroup.h linux-2.6.31.1/include/linux/cgroup.h
---- linux-2.6.31.1/include/linux/cgroup.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/cgroup.h 2009-10-01 20:12:44.000000000 -0400
-@@ -37,7 +37,7 @@ extern void cgroup_exit(struct task_stru
- extern int cgroupstats_build(struct cgroupstats *stats,
- struct dentry *dentry);
-
--extern struct file_operations proc_cgroup_operations;
-+extern const struct file_operations proc_cgroup_operations;
-
- /* Define the enumeration of all cgroup subsystems */
- #define SUBSYS(_x) _x ## _subsys_id,
-diff -urNp linux-2.6.31.1/include/linux/compiler-gcc4.h linux-2.6.31.1/include/linux/compiler-gcc4.h
---- linux-2.6.31.1/include/linux/compiler-gcc4.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/compiler-gcc4.h 2009-10-01 20:12:44.000000000 -0400
-@@ -36,4 +36,8 @@
- the kernel context */
- #define __cold __attribute__((__cold__))
-
-+#define __alloc_size(...) __attribute((alloc_size(__VA_ARGS__)))
-+#define __bos(ptr, arg) __builtin_object_size((ptr), (arg))
-+#define __bos0(ptr) __bos((ptr), 0)
-+#define __bos1(ptr) __bos((ptr), 1)
- #endif
-diff -urNp linux-2.6.31.1/include/linux/compiler.h linux-2.6.31.1/include/linux/compiler.h
---- linux-2.6.31.1/include/linux/compiler.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/compiler.h 2009-10-01 20:12:44.000000000 -0400
-@@ -256,6 +256,22 @@ void ftrace_likely_update(struct ftrace_
- #define __cold
- #endif
-
-+#ifndef __alloc_size
-+#define __alloc_size
-+#endif
-+
-+#ifndef __bos
-+#define __bos
-+#endif
-+
-+#ifndef __bos0
-+#define __bos0
-+#endif
-+
-+#ifndef __bos1
-+#define __bos1
-+#endif
-+
- /* Simple shorthand for a section definition */
- #ifndef __section
- # define __section(S) __attribute__ ((__section__(#S)))
-diff -urNp linux-2.6.31.1/include/linux/cpumask.h linux-2.6.31.1/include/linux/cpumask.h
---- linux-2.6.31.1/include/linux/cpumask.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/cpumask.h 2009-10-01 20:12:44.000000000 -0400
-@@ -142,7 +142,6 @@
- #include <linux/bitmap.h>
-
- typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
--extern cpumask_t _unused_cpumask_arg_;
-
- #ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
- #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
-diff -urNp linux-2.6.31.1/include/linux/decompress/mm.h linux-2.6.31.1/include/linux/decompress/mm.h
---- linux-2.6.31.1/include/linux/decompress/mm.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/decompress/mm.h 2009-10-01 20:12:44.000000000 -0400
-@@ -68,7 +68,7 @@ static void free(void *where)
- * warnings when not needed (indeed large_malloc / large_free are not
- * needed by inflate */
-
--#define malloc(a) kmalloc(a, GFP_KERNEL)
-+#define malloc(a) kmalloc((a), GFP_KERNEL)
- #define free(a) kfree(a)
-
- #define large_malloc(a) vmalloc(a)
-diff -urNp linux-2.6.31.1/include/linux/elf.h linux-2.6.31.1/include/linux/elf.h
---- linux-2.6.31.1/include/linux/elf.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/elf.h 2009-10-01 20:12:44.000000000 -0400
-@@ -49,6 +49,17 @@ typedef __s64 Elf64_Sxword;
- #define PT_GNU_EH_FRAME 0x6474e550
-
- #define PT_GNU_STACK (PT_LOOS + 0x474e551)
-+#define PT_GNU_RELRO (PT_LOOS + 0x474e552)
-+
-+#define PT_PAX_FLAGS (PT_LOOS + 0x5041580)
-+
-+/* Constants for the e_flags field */
-+#define EF_PAX_PAGEEXEC 1 /* Paging based non-executable pages */
-+#define EF_PAX_EMUTRAMP 2 /* Emulate trampolines */
-+#define EF_PAX_MPROTECT 4 /* Restrict mprotect() */
-+#define EF_PAX_RANDMMAP 8 /* Randomize mmap() base */
-+/*#define EF_PAX_RANDEXEC 16*/ /* Randomize ET_EXEC base */
-+#define EF_PAX_SEGMEXEC 32 /* Segmentation based non-executable pages */
-
- /* These constants define the different elf file types */
- #define ET_NONE 0
-@@ -84,6 +95,8 @@ typedef __s64 Elf64_Sxword;
- #define DT_DEBUG 21
- #define DT_TEXTREL 22
- #define DT_JMPREL 23
-+#define DT_FLAGS 30
-+ #define DF_TEXTREL 0x00000004
- #define DT_ENCODING 32
- #define OLD_DT_LOOS 0x60000000
- #define DT_LOOS 0x6000000d
-@@ -230,6 +243,19 @@ typedef struct elf64_hdr {
- #define PF_W 0x2
- #define PF_X 0x1
-
-+#define PF_PAGEEXEC (1U << 4) /* Enable PAGEEXEC */
-+#define PF_NOPAGEEXEC (1U << 5) /* Disable PAGEEXEC */
-+#define PF_SEGMEXEC (1U << 6) /* Enable SEGMEXEC */
-+#define PF_NOSEGMEXEC (1U << 7) /* Disable SEGMEXEC */
-+#define PF_MPROTECT (1U << 8) /* Enable MPROTECT */
-+#define PF_NOMPROTECT (1U << 9) /* Disable MPROTECT */
-+/*#define PF_RANDEXEC (1U << 10)*/ /* Enable RANDEXEC */
-+/*#define PF_NORANDEXEC (1U << 11)*/ /* Disable RANDEXEC */
-+#define PF_EMUTRAMP (1U << 12) /* Enable EMUTRAMP */
-+#define PF_NOEMUTRAMP (1U << 13) /* Disable EMUTRAMP */
-+#define PF_RANDMMAP (1U << 14) /* Enable RANDMMAP */
-+#define PF_NORANDMMAP (1U << 15) /* Disable RANDMMAP */
-+
- typedef struct elf32_phdr{
- Elf32_Word p_type;
- Elf32_Off p_offset;
-@@ -322,6 +348,8 @@ typedef struct elf64_shdr {
- #define EI_OSABI 7
- #define EI_PAD 8
-
-+#define EI_PAX 14
-+
- #define ELFMAG0 0x7f /* EI_MAG */
- #define ELFMAG1 'E'
- #define ELFMAG2 'L'
-@@ -385,6 +413,7 @@ extern Elf32_Dyn _DYNAMIC [];
- #define elf_phdr elf32_phdr
- #define elf_note elf32_note
- #define elf_addr_t Elf32_Off
-+#define elf_dyn Elf32_Dyn
-
- #else
-
-@@ -393,6 +422,7 @@ extern Elf64_Dyn _DYNAMIC [];
- #define elf_phdr elf64_phdr
- #define elf_note elf64_note
- #define elf_addr_t Elf64_Off
-+#define elf_dyn Elf64_Dyn
-
- #endif
-
-diff -urNp linux-2.6.31.1/include/linux/fs.h linux-2.6.31.1/include/linux/fs.h
---- linux-2.6.31.1/include/linux/fs.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/fs.h 2009-10-01 20:12:44.000000000 -0400
-@@ -87,6 +87,10 @@ struct inodes_stat_t {
- */
- #define FMODE_NOCMTIME ((__force fmode_t)2048)
-
-+/* Hack for grsec so as not to require read permission simply to execute
-+ a binary */
-+#define FMODE_GREXEC ((__force fmode_t)8192)
-+
- /*
- * The below are the various read and write types that we support. Some of
- * them include behavioral modifiers that send information down to the
-@@ -2430,7 +2434,7 @@ static int __fops ## _open(struct inode
- __simple_attr_check_format(__fmt, 0ull); \
- return simple_attr_open(inode, file, __get, __set, __fmt); \
- } \
--static struct file_operations __fops = { \
-+static const struct file_operations __fops = { \
- .owner = THIS_MODULE, \
- .open = __fops ## _open, \
- .release = simple_attr_release, \
-diff -urNp linux-2.6.31.1/include/linux/fs_struct.h linux-2.6.31.1/include/linux/fs_struct.h
---- linux-2.6.31.1/include/linux/fs_struct.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/fs_struct.h 2009-10-01 20:12:44.000000000 -0400
-@@ -4,7 +4,7 @@
- #include <linux/path.h>
-
- struct fs_struct {
-- int users;
-+ atomic_t users;
- rwlock_t lock;
- int umask;
- int in_exec;
-diff -urNp linux-2.6.31.1/include/linux/genhd.h linux-2.6.31.1/include/linux/genhd.h
---- linux-2.6.31.1/include/linux/genhd.h 2009-09-24 11:45:25.000000000 -0400
-+++ linux-2.6.31.1/include/linux/genhd.h 2009-10-01 20:12:44.000000000 -0400
-@@ -161,7 +161,7 @@ struct gendisk {
-
- struct timer_rand_state *random;
-
-- atomic_t sync_io; /* RAID */
-+ atomic_unchecked_t sync_io; /* RAID */
- struct work_struct async_notify;
- #ifdef CONFIG_BLK_DEV_INTEGRITY
- struct blk_integrity *integrity;
-diff -urNp linux-2.6.31.1/include/linux/gracl.h linux-2.6.31.1/include/linux/gracl.h
---- linux-2.6.31.1/include/linux/gracl.h 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/include/linux/gracl.h 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,318 @@
-+#ifndef GR_ACL_H
-+#define GR_ACL_H
-+
-+#include <linux/grdefs.h>
-+#include <linux/resource.h>
-+#include <linux/capability.h>
-+#include <linux/dcache.h>
-+#include <asm/resource.h>
-+
-+/* Major status information */
-+
-+#define GR_VERSION "grsecurity 2.1.14"
-+#define GRSECURITY_VERSION 0x2114
-+
-+enum {
-+ GR_SHUTDOWN = 0,
-+ GR_ENABLE = 1,
-+ GR_SPROLE = 2,
-+ GR_RELOAD = 3,
-+ GR_SEGVMOD = 4,
-+ GR_STATUS = 5,
-+ GR_UNSPROLE = 6,
-+ GR_PASSSET = 7,
-+ GR_SPROLEPAM = 8,
-+};
-+
-+/* Password setup definitions
-+ * kernel/grhash.c */
-+enum {
-+ GR_PW_LEN = 128,
-+ GR_SALT_LEN = 16,
-+ GR_SHA_LEN = 32,
-+};
-+
-+enum {
-+ GR_SPROLE_LEN = 64,
-+};
-+
-+#define GR_NLIMITS 32
-+
-+/* Begin Data Structures */
-+
-+struct sprole_pw {
-+ unsigned char *rolename;
-+ unsigned char salt[GR_SALT_LEN];
-+ unsigned char sum[GR_SHA_LEN]; /* 256-bit SHA hash of the password */
-+};
-+
-+struct name_entry {
-+ __u32 key;
-+ ino_t inode;
-+ dev_t device;
-+ char *name;
-+ __u16 len;
-+ __u8 deleted;
-+ struct name_entry *prev;
-+ struct name_entry *next;
-+};
-+
-+struct inodev_entry {
-+ struct name_entry *nentry;
-+ struct inodev_entry *prev;
-+ struct inodev_entry *next;
-+};
-+
-+struct acl_role_db {
-+ struct acl_role_label **r_hash;
-+ __u32 r_size;
-+};
-+
-+struct inodev_db {
-+ struct inodev_entry **i_hash;
-+ __u32 i_size;
-+};
-+
-+struct name_db {
-+ struct name_entry **n_hash;
-+ __u32 n_size;
-+};
-+
-+struct crash_uid {
-+ uid_t uid;
-+ unsigned long expires;
-+};
-+
-+struct gr_hash_struct {
-+ void **table;
-+ void **nametable;
-+ void *first;
-+ __u32 table_size;
-+ __u32 used_size;
-+ int type;
-+};
-+
-+/* Userspace Grsecurity ACL data structures */
-+
-+struct acl_subject_label {
-+ char *filename;
-+ ino_t inode;
-+ dev_t device;
-+ __u32 mode;
-+ kernel_cap_t cap_mask;
-+ kernel_cap_t cap_lower;
-+
-+ struct rlimit res[GR_NLIMITS];
-+ __u32 resmask;
-+
-+ __u8 user_trans_type;
-+ __u8 group_trans_type;
-+ uid_t *user_transitions;
-+ gid_t *group_transitions;
-+ __u16 user_trans_num;
-+ __u16 group_trans_num;
-+
-+ __u32 ip_proto[8];
-+ __u32 ip_type;
-+ struct acl_ip_label **ips;
-+ __u32 ip_num;
-+ __u32 inaddr_any_override;
-+
-+ __u32 crashes;
-+ unsigned long expires;
-+
-+ struct acl_subject_label *parent_subject;
-+ struct gr_hash_struct *hash;
-+ struct acl_subject_label *prev;
-+ struct acl_subject_label *next;
-+
-+ struct acl_object_label **obj_hash;
-+ __u32 obj_hash_size;
-+ __u16 pax_flags;
-+};
-+
-+struct role_allowed_ip {
-+ __u32 addr;
-+ __u32 netmask;
-+
-+ struct role_allowed_ip *prev;
-+ struct role_allowed_ip *next;
-+};
-+
-+struct role_transition {
-+ char *rolename;
-+
-+ struct role_transition *prev;
-+ struct role_transition *next;
-+};
-+
-+struct acl_role_label {
-+ char *rolename;
-+ uid_t uidgid;
-+ __u16 roletype;
-+
-+ __u16 auth_attempts;
-+ unsigned long expires;
-+
-+ struct acl_subject_label *root_label;
-+ struct gr_hash_struct *hash;
-+
-+ struct acl_role_label *prev;
-+ struct acl_role_label *next;
-+
-+ struct role_transition *transitions;
-+ struct role_allowed_ip *allowed_ips;
-+ uid_t *domain_children;
-+ __u16 domain_child_num;
-+
-+ struct acl_subject_label **subj_hash;
-+ __u32 subj_hash_size;
-+};
-+
-+struct user_acl_role_db {
-+ struct acl_role_label **r_table;
-+ __u32 num_pointers; /* Number of allocations to track */
-+ __u32 num_roles; /* Number of roles */
-+ __u32 num_domain_children; /* Number of domain children */
-+ __u32 num_subjects; /* Number of subjects */
-+ __u32 num_objects; /* Number of objects */
-+};
-+
-+struct acl_object_label {
-+ char *filename;
-+ ino_t inode;
-+ dev_t device;
-+ __u32 mode;
-+
-+ struct acl_subject_label *nested;
-+ struct acl_object_label *globbed;
-+
-+ /* next two structures not used */
-+
-+ struct acl_object_label *prev;
-+ struct acl_object_label *next;
-+};
-+
-+struct acl_ip_label {
-+ char *iface;
-+ __u32 addr;
-+ __u32 netmask;
-+ __u16 low, high;
-+ __u8 mode;
-+ __u32 type;
-+ __u32 proto[8];
-+
-+ /* next two structures not used */
-+
-+ struct acl_ip_label *prev;
-+ struct acl_ip_label *next;
-+};
-+
-+struct gr_arg {
-+ struct user_acl_role_db role_db;
-+ unsigned char pw[GR_PW_LEN];
-+ unsigned char salt[GR_SALT_LEN];
-+ unsigned char sum[GR_SHA_LEN];
-+ unsigned char sp_role[GR_SPROLE_LEN];
-+ struct sprole_pw *sprole_pws;
-+ dev_t segv_device;
-+ ino_t segv_inode;
-+ uid_t segv_uid;
-+ __u16 num_sprole_pws;
-+ __u16 mode;
-+};
-+
-+struct gr_arg_wrapper {
-+ struct gr_arg *arg;
-+ __u32 version;
-+ __u32 size;
-+};
-+
-+struct subject_map {
-+ struct acl_subject_label *user;
-+ struct acl_subject_label *kernel;
-+ struct subject_map *prev;
-+ struct subject_map *next;
-+};
-+
-+struct acl_subj_map_db {
-+ struct subject_map **s_hash;
-+ __u32 s_size;
-+};
-+
-+/* End Data Structures Section */
-+
-+/* Hash functions generated by empirical testing by Brad Spengler
-+ Makes good use of the low bits of the inode. Generally 0-1 times
-+ in loop for successful match. 0-3 for unsuccessful match.
-+ Shift/add algorithm with modulus of table size and an XOR*/
-+
-+static __inline__ unsigned int
-+rhash(const uid_t uid, const __u16 type, const unsigned int sz)
-+{
-+ return (((uid << type) + (uid ^ type)) % sz);
-+}
-+
-+ static __inline__ unsigned int
-+shash(const struct acl_subject_label *userp, const unsigned int sz)
-+{
-+ return ((const unsigned long)userp % sz);
-+}
-+
-+static __inline__ unsigned int
-+fhash(const ino_t ino, const dev_t dev, const unsigned int sz)
-+{
-+ return (((ino + dev) ^ ((ino << 13) + (ino << 23) + (dev << 9))) % sz);
-+}
-+
-+static __inline__ unsigned int
-+nhash(const char *name, const __u16 len, const unsigned int sz)
-+{
-+ return full_name_hash((const unsigned char *)name, len) % sz;
-+}
-+
-+#define FOR_EACH_ROLE_START(role,iter) \
-+ role = NULL; \
-+ iter = 0; \
-+ while (iter < acl_role_set.r_size) { \
-+ if (role == NULL) \
-+ role = acl_role_set.r_hash[iter]; \
-+ if (role == NULL) { \
-+ iter++; \
-+ continue; \
-+ }
-+
-+#define FOR_EACH_ROLE_END(role,iter) \
-+ role = role->next; \
-+ if (role == NULL) \
-+ iter++; \
-+ }
-+
-+#define FOR_EACH_SUBJECT_START(role,subj,iter) \
-+ subj = NULL; \
-+ iter = 0; \
-+ while (iter < role->subj_hash_size) { \
-+ if (subj == NULL) \
-+ subj = role->subj_hash[iter]; \
-+ if (subj == NULL) { \
-+ iter++; \
-+ continue; \
-+ }
-+
-+#define FOR_EACH_SUBJECT_END(subj,iter) \
-+ subj = subj->next; \
-+ if (subj == NULL) \
-+ iter++; \
-+ }
-+
-+
-+#define FOR_EACH_NESTED_SUBJECT_START(role,subj) \
-+ subj = role->hash->first; \
-+ while (subj != NULL) {
-+
-+#define FOR_EACH_NESTED_SUBJECT_END(subj) \
-+ subj = subj->next; \
-+ }
-+
-+#endif
-+
-diff -urNp linux-2.6.31.1/include/linux/gralloc.h linux-2.6.31.1/include/linux/gralloc.h
---- linux-2.6.31.1/include/linux/gralloc.h 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/include/linux/gralloc.h 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,9 @@
-+#ifndef __GRALLOC_H
-+#define __GRALLOC_H
-+
-+void acl_free_all(void);
-+int acl_alloc_stack_init(unsigned long size);
-+void *acl_alloc(unsigned long len);
-+void *acl_alloc_num(unsigned long num, unsigned long len);
-+
-+#endif
-diff -urNp linux-2.6.31.1/include/linux/grdefs.h linux-2.6.31.1/include/linux/grdefs.h
---- linux-2.6.31.1/include/linux/grdefs.h 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/include/linux/grdefs.h 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,136 @@
-+#ifndef GRDEFS_H
-+#define GRDEFS_H
-+
-+/* Begin grsecurity status declarations */
-+
-+enum {
-+ GR_READY = 0x01,
-+ GR_STATUS_INIT = 0x00 // disabled state
-+};
-+
-+/* Begin ACL declarations */
-+
-+/* Role flags */
-+
-+enum {
-+ GR_ROLE_USER = 0x0001,
-+ GR_ROLE_GROUP = 0x0002,
-+ GR_ROLE_DEFAULT = 0x0004,
-+ GR_ROLE_SPECIAL = 0x0008,
-+ GR_ROLE_AUTH = 0x0010,
-+ GR_ROLE_NOPW = 0x0020,
-+ GR_ROLE_GOD = 0x0040,
-+ GR_ROLE_LEARN = 0x0080,
-+ GR_ROLE_TPE = 0x0100,
-+ GR_ROLE_DOMAIN = 0x0200,
-+ GR_ROLE_PAM = 0x0400
-+};
-+
-+/* ACL Subject and Object mode flags */
-+enum {
-+ GR_DELETED = 0x80000000
-+};
-+
-+/* ACL Object-only mode flags */
-+enum {
-+ GR_READ = 0x00000001,
-+ GR_APPEND = 0x00000002,
-+ GR_WRITE = 0x00000004,
-+ GR_EXEC = 0x00000008,
-+ GR_FIND = 0x00000010,
-+ GR_INHERIT = 0x00000020,
-+ GR_SETID = 0x00000040,
-+ GR_CREATE = 0x00000080,
-+ GR_DELETE = 0x00000100,
-+ GR_LINK = 0x00000200,
-+ GR_AUDIT_READ = 0x00000400,
-+ GR_AUDIT_APPEND = 0x00000800,
-+ GR_AUDIT_WRITE = 0x00001000,
-+ GR_AUDIT_EXEC = 0x00002000,
-+ GR_AUDIT_FIND = 0x00004000,
-+ GR_AUDIT_INHERIT= 0x00008000,
-+ GR_AUDIT_SETID = 0x00010000,
-+ GR_AUDIT_CREATE = 0x00020000,
-+ GR_AUDIT_DELETE = 0x00040000,
-+ GR_AUDIT_LINK = 0x00080000,
-+ GR_PTRACERD = 0x00100000,
-+ GR_NOPTRACE = 0x00200000,
-+ GR_SUPPRESS = 0x00400000,
-+ GR_NOLEARN = 0x00800000
-+};
-+
-+#define GR_AUDITS (GR_AUDIT_READ | GR_AUDIT_WRITE | GR_AUDIT_APPEND | GR_AUDIT_EXEC | \
-+ GR_AUDIT_FIND | GR_AUDIT_INHERIT | GR_AUDIT_SETID | \
-+ GR_AUDIT_CREATE | GR_AUDIT_DELETE | GR_AUDIT_LINK)
-+
-+/* ACL subject-only mode flags */
-+enum {
-+ GR_KILL = 0x00000001,
-+ GR_VIEW = 0x00000002,
-+ GR_PROTECTED = 0x00000004,
-+ GR_LEARN = 0x00000008,
-+ GR_OVERRIDE = 0x00000010,
-+ /* just a placeholder, this mode is only used in userspace */
-+ GR_DUMMY = 0x00000020,
-+ GR_PROTSHM = 0x00000040,
-+ GR_KILLPROC = 0x00000080,
-+ GR_KILLIPPROC = 0x00000100,
-+ /* just a placeholder, this mode is only used in userspace */
-+ GR_NOTROJAN = 0x00000200,
-+ GR_PROTPROCFD = 0x00000400,
-+ GR_PROCACCT = 0x00000800,
-+ GR_RELAXPTRACE = 0x00001000,
-+ GR_NESTED = 0x00002000,
-+ GR_INHERITLEARN = 0x00004000,
-+ GR_PROCFIND = 0x00008000,
-+ GR_POVERRIDE = 0x00010000,
-+ GR_KERNELAUTH = 0x00020000,
-+};
-+
-+enum {
-+ GR_PAX_ENABLE_SEGMEXEC = 0x0001,
-+ GR_PAX_ENABLE_PAGEEXEC = 0x0002,
-+ GR_PAX_ENABLE_MPROTECT = 0x0004,
-+ GR_PAX_ENABLE_RANDMMAP = 0x0008,
-+ GR_PAX_ENABLE_EMUTRAMP = 0x0010,
-+ GR_PAX_DISABLE_SEGMEXEC = 0x0100,
-+ GR_PAX_DISABLE_PAGEEXEC = 0x0200,
-+ GR_PAX_DISABLE_MPROTECT = 0x0400,
-+ GR_PAX_DISABLE_RANDMMAP = 0x0800,
-+ GR_PAX_DISABLE_EMUTRAMP = 0x1000,
-+};
-+
-+enum {
-+ GR_ID_USER = 0x01,
-+ GR_ID_GROUP = 0x02,
-+};
-+
-+enum {
-+ GR_ID_ALLOW = 0x01,
-+ GR_ID_DENY = 0x02,
-+};
-+
-+#define GR_CRASH_RES 31
-+#define GR_UIDTABLE_MAX 500
-+
-+/* begin resource learning section */
-+enum {
-+ GR_RLIM_CPU_BUMP = 60,
-+ GR_RLIM_FSIZE_BUMP = 50000,
-+ GR_RLIM_DATA_BUMP = 10000,
-+ GR_RLIM_STACK_BUMP = 1000,
-+ GR_RLIM_CORE_BUMP = 10000,
-+ GR_RLIM_RSS_BUMP = 500000,
-+ GR_RLIM_NPROC_BUMP = 1,
-+ GR_RLIM_NOFILE_BUMP = 5,
-+ GR_RLIM_MEMLOCK_BUMP = 50000,
-+ GR_RLIM_AS_BUMP = 500000,
-+ GR_RLIM_LOCKS_BUMP = 2,
-+ GR_RLIM_SIGPENDING_BUMP = 5,
-+ GR_RLIM_MSGQUEUE_BUMP = 10000,
-+ GR_RLIM_NICE_BUMP = 1,
-+ GR_RLIM_RTPRIO_BUMP = 1,
-+ GR_RLIM_RTTIME_BUMP = 1000000
-+};
-+
-+#endif
-diff -urNp linux-2.6.31.1/include/linux/grinternal.h linux-2.6.31.1/include/linux/grinternal.h
---- linux-2.6.31.1/include/linux/grinternal.h 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/include/linux/grinternal.h 2009-10-01 21:50:27.000000000 -0400
-@@ -0,0 +1,211 @@
-+#ifndef __GRINTERNAL_H
-+#define __GRINTERNAL_H
-+
-+#ifdef CONFIG_GRKERNSEC
-+
-+#include <linux/fs.h>
-+#include <linux/mnt_namespace.h>
-+#include <linux/nsproxy.h>
-+#include <linux/gracl.h>
-+#include <linux/grdefs.h>
-+#include <linux/grmsg.h>
-+
-+void gr_add_learn_entry(const char *fmt, ...)
-+ __attribute__ ((format (printf, 1, 2)));
-+__u32 gr_search_file(const struct dentry *dentry, const __u32 mode,
-+ const struct vfsmount *mnt);
-+__u32 gr_check_create(const struct dentry *new_dentry,
-+ const struct dentry *parent,
-+ const struct vfsmount *mnt, const __u32 mode);
-+int gr_check_protected_task(const struct task_struct *task);
-+__u32 to_gr_audit(const __u32 reqmode);
-+int gr_set_acls(const int type);
-+
-+int gr_acl_is_enabled(void);
-+char gr_roletype_to_char(void);
-+
-+void gr_handle_alertkill(struct task_struct *task);
-+char *gr_to_filename(const struct dentry *dentry,
-+ const struct vfsmount *mnt);
-+char *gr_to_filename1(const struct dentry *dentry,
-+ const struct vfsmount *mnt);
-+char *gr_to_filename2(const struct dentry *dentry,
-+ const struct vfsmount *mnt);
-+char *gr_to_filename3(const struct dentry *dentry,
-+ const struct vfsmount *mnt);
-+
-+extern int grsec_enable_harden_ptrace;
-+extern int grsec_enable_link;
-+extern int grsec_enable_fifo;
-+extern int grsec_enable_execve;
-+extern int grsec_enable_shm;
-+extern int grsec_enable_execlog;
-+extern int grsec_enable_signal;
-+extern int grsec_enable_forkfail;
-+extern int grsec_enable_time;
-+extern int grsec_enable_chroot_shmat;
-+extern int grsec_enable_chroot_findtask;
-+extern int grsec_enable_chroot_mount;
-+extern int grsec_enable_chroot_double;
-+extern int grsec_enable_chroot_pivot;
-+extern int grsec_enable_chroot_chdir;
-+extern int grsec_enable_chroot_chmod;
-+extern int grsec_enable_chroot_mknod;
-+extern int grsec_enable_chroot_fchdir;
-+extern int grsec_enable_chroot_nice;
-+extern int grsec_enable_chroot_execlog;
-+extern int grsec_enable_chroot_caps;
-+extern int grsec_enable_chroot_sysctl;
-+extern int grsec_enable_chroot_unix;
-+extern int grsec_enable_tpe;
-+extern int grsec_tpe_gid;
-+extern int grsec_enable_tpe_all;
-+extern int grsec_enable_sidcaps;
-+extern int grsec_enable_socket_all;
-+extern int grsec_socket_all_gid;
-+extern int grsec_enable_socket_client;
-+extern int grsec_socket_client_gid;
-+extern int grsec_enable_socket_server;
-+extern int grsec_socket_server_gid;
-+extern int grsec_audit_gid;
-+extern int grsec_enable_group;
-+extern int grsec_enable_audit_textrel;
-+extern int grsec_enable_mount;
-+extern int grsec_enable_chdir;
-+extern int grsec_resource_logging;
-+extern int grsec_lock;
-+
-+extern spinlock_t grsec_alert_lock;
-+extern unsigned long grsec_alert_wtime;
-+extern unsigned long grsec_alert_fyet;
-+
-+extern spinlock_t grsec_audit_lock;
-+
-+extern rwlock_t grsec_exec_file_lock;
-+
-+#define gr_task_fullpath(tsk) (tsk->exec_file ? \
-+ gr_to_filename2(tsk->exec_file->f_path.dentry, \
-+ tsk->exec_file->f_vfsmnt) : "/")
-+
-+#define gr_parent_task_fullpath(tsk) (tsk->parent->exec_file ? \
-+ gr_to_filename3(tsk->parent->exec_file->f_path.dentry, \
-+ tsk->parent->exec_file->f_vfsmnt) : "/")
-+
-+#define gr_task_fullpath0(tsk) (tsk->exec_file ? \
-+ gr_to_filename(tsk->exec_file->f_path.dentry, \
-+ tsk->exec_file->f_vfsmnt) : "/")
-+
-+#define gr_parent_task_fullpath0(tsk) (tsk->parent->exec_file ? \
-+ gr_to_filename1(tsk->parent->exec_file->f_path.dentry, \
-+ tsk->parent->exec_file->f_vfsmnt) : "/")
-+
-+#define proc_is_chrooted(tsk_a) ((tsk_a->pid > 1) && (tsk_a->fs != NULL) && \
-+ ((init_task.fs->root.dentry != tsk_a->fs->root.dentry) && \
-+ (tsk_a->nsproxy->mnt_ns->root->mnt_root != \
-+ tsk_a->fs->root.dentry)))
-+
-+#define have_same_root(tsk_a,tsk_b) ((tsk_a->fs != NULL) && (tsk_b->fs != NULL) && \
-+ (tsk_a->fs->root.dentry == tsk_b->fs->root.dentry))
-+
-+#define DEFAULTSECARGS(task, cred, pcred) gr_task_fullpath(task), task->comm, \
-+ task->pid, cred->uid, \
-+ cred->euid, cred->gid, cred->egid, \
-+ gr_parent_task_fullpath(task), \
-+ task->parent->comm, task->parent->pid, \
-+ pcred->uid, pcred->euid, \
-+ pcred->gid, pcred->egid
-+
-+#define GR_CHROOT_CAPS {{ \
-+ CAP_TO_MASK(CAP_LINUX_IMMUTABLE) | CAP_TO_MASK(CAP_NET_ADMIN) | \
-+ CAP_TO_MASK(CAP_SYS_MODULE) | CAP_TO_MASK(CAP_SYS_RAWIO) | \
-+ CAP_TO_MASK(CAP_SYS_PACCT) | CAP_TO_MASK(CAP_SYS_ADMIN) | \
-+ CAP_TO_MASK(CAP_SYS_BOOT) | CAP_TO_MASK(CAP_SYS_TIME) | \
-+ CAP_TO_MASK(CAP_NET_RAW) | CAP_TO_MASK(CAP_SYS_TTY_CONFIG) | \
-+ CAP_TO_MASK(CAP_IPC_OWNER) , 0 }}
-+
-+#define security_learn(normal_msg,args...) \
-+({ \
-+ read_lock(&grsec_exec_file_lock); \
-+ gr_add_learn_entry(normal_msg "\n", ## args); \
-+ read_unlock(&grsec_exec_file_lock); \
-+})
-+
-+enum {
-+ GR_DO_AUDIT,
-+ GR_DONT_AUDIT,
-+ GR_DONT_AUDIT_GOOD
-+};
-+
-+enum {
-+ GR_TTYSNIFF,
-+ GR_RBAC,
-+ GR_RBAC_STR,
-+ GR_STR_RBAC,
-+ GR_RBAC_MODE2,
-+ GR_RBAC_MODE3,
-+ GR_FILENAME,
-+ GR_SYSCTL_HIDDEN,
-+ GR_NOARGS,
-+ GR_ONE_INT,
-+ GR_ONE_INT_TWO_STR,
-+ GR_ONE_STR,
-+ GR_STR_INT,
-+ GR_TWO_INT,
-+ GR_THREE_INT,
-+ GR_FIVE_INT_TWO_STR,
-+ GR_TWO_STR,
-+ GR_THREE_STR,
-+ GR_FOUR_STR,
-+ GR_STR_FILENAME,
-+ GR_FILENAME_STR,
-+ GR_FILENAME_TWO_INT,
-+ GR_FILENAME_TWO_INT_STR,
-+ GR_TEXTREL,
-+ GR_PTRACE,
-+ GR_RESOURCE,
-+ GR_CAP,
-+ GR_SIG,
-+ GR_SIG2,
-+ GR_CRASH1,
-+ GR_CRASH2,
-+ GR_PSACCT
-+};
-+
-+#define gr_log_hidden_sysctl(audit, msg, str) gr_log_varargs(audit, msg, GR_SYSCTL_HIDDEN, str)
-+#define gr_log_ttysniff(audit, msg, task) gr_log_varargs(audit, msg, GR_TTYSNIFF, task)
-+#define gr_log_fs_rbac_generic(audit, msg, dentry, mnt) gr_log_varargs(audit, msg, GR_RBAC, dentry, mnt)
-+#define gr_log_fs_rbac_str(audit, msg, dentry, mnt, str) gr_log_varargs(audit, msg, GR_RBAC_STR, dentry, mnt, str)
-+#define gr_log_fs_str_rbac(audit, msg, str, dentry, mnt) gr_log_varargs(audit, msg, GR_STR_RBAC, str, dentry, mnt)
-+#define gr_log_fs_rbac_mode2(audit, msg, dentry, mnt, str1, str2) gr_log_varargs(audit, msg, GR_RBAC_MODE2, dentry, mnt, str1, str2)
-+#define gr_log_fs_rbac_mode3(audit, msg, dentry, mnt, str1, str2, str3) gr_log_varargs(audit, msg, GR_RBAC_MODE3, dentry, mnt, str1, str2, str3)
-+#define gr_log_fs_generic(audit, msg, dentry, mnt) gr_log_varargs(audit, msg, GR_FILENAME, dentry, mnt)
-+#define gr_log_noargs(audit, msg) gr_log_varargs(audit, msg, GR_NOARGS)
-+#define gr_log_int(audit, msg, num) gr_log_varargs(audit, msg, GR_ONE_INT, num)
-+#define gr_log_int_str2(audit, msg, num, str1, str2) gr_log_varargs(audit, msg, GR_ONE_INT_TWO_STR, num, str1, str2)
-+#define gr_log_str(audit, msg, str) gr_log_varargs(audit, msg, GR_ONE_STR, str)
-+#define gr_log_str_int(audit, msg, str, num) gr_log_varargs(audit, msg, GR_STR_INT, str, num)
-+#define gr_log_int_int(audit, msg, num1, num2) gr_log_varargs(audit, msg, GR_TWO_INT, num1, num2)
-+#define gr_log_int3(audit, msg, num1, num2, num3) gr_log_varargs(audit, msg, GR_THREE_INT, num1, num2, num3)
-+#define gr_log_int5_str2(audit, msg, num1, num2, str1, str2) gr_log_varargs(audit, msg, GR_FIVE_INT_TWO_STR, num1, num2, str1, str2)
-+#define gr_log_str_str(audit, msg, str1, str2) gr_log_varargs(audit, msg, GR_TWO_STR, str1, str2)
-+#define gr_log_str3(audit, msg, str1, str2, str3) gr_log_varargs(audit, msg, GR_THREE_STR, str1, str2, str3)
-+#define gr_log_str4(audit, msg, str1, str2, str3, str4) gr_log_varargs(audit, msg, GR_FOUR_STR, str1, str2, str3, str4)
-+#define gr_log_str_fs(audit, msg, str, dentry, mnt) gr_log_varargs(audit, msg, GR_STR_FILENAME, str, dentry, mnt)
-+#define gr_log_fs_str(audit, msg, dentry, mnt, str) gr_log_varargs(audit, msg, GR_FILENAME_STR, dentry, mnt, str)
-+#define gr_log_fs_int2(audit, msg, dentry, mnt, num1, num2) gr_log_varargs(audit, msg, GR_FILENAME_TWO_INT, dentry, mnt, num1, num2)
-+#define gr_log_fs_int2_str(audit, msg, dentry, mnt, num1, num2, str) gr_log_varargs(audit, msg, GR_FILENAME_TWO_INT_STR, dentry, mnt, num1, num2, str)
-+#define gr_log_textrel_ulong_ulong(audit, msg, file, ulong1, ulong2) gr_log_varargs(audit, msg, GR_TEXTREL, file, ulong1, ulong2)
-+#define gr_log_ptrace(audit, msg, task) gr_log_varargs(audit, msg, GR_PTRACE, task)
-+#define gr_log_res_ulong2_str(audit, msg, task, ulong1, str, ulong2) gr_log_varargs(audit, msg, GR_RESOURCE, task, ulong1, str, ulong2)
-+#define gr_log_cap(audit, msg, task, str) gr_log_varargs(audit, msg, GR_CAP, task, str)
-+#define gr_log_sig_addr(audit, msg, str, addr) gr_log_varargs(audit, msg, GR_SIG, str, addr)
-+#define gr_log_sig_task(audit, msg, task, num) gr_log_varargs(audit, msg, GR_SIG2, task, num)
-+#define gr_log_crash1(audit, msg, task, ulong) gr_log_varargs(audit, msg, GR_CRASH1, task, ulong)
-+#define gr_log_crash2(audit, msg, task, ulong1) gr_log_varargs(audit, msg, GR_CRASH2, task, ulong1)
-+#define gr_log_procacct(audit, msg, task, num1, num2, num3, num4, num5, num6, num7, num8, num9) gr_log_varargs(audit, msg, GR_PSACCT, task, num1, num2, num3, num4, num5, num6, num7, num8, num9)
-+
-+void gr_log_varargs(int audit, const char *msg, int argtypes, ...);
-+
-+#endif
-+
-+#endif
-diff -urNp linux-2.6.31.1/include/linux/grmsg.h linux-2.6.31.1/include/linux/grmsg.h
---- linux-2.6.31.1/include/linux/grmsg.h 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.31.1/include/linux/grmsg.h 2009-10-01 20:12:44.000000000 -0400
-@@ -0,0 +1,103 @@
-+#define DEFAULTSECMSG "%.256s[%.16s:%d] uid/euid:%u/%u gid/egid:%u/%u, parent %.256s[%.16s:%d] uid/euid:%u/%u gid/egid:%u/%u"
-+#define GR_ACL_PROCACCT_MSG "%.256s[%.16s:%d] IP:%u.%u.%u.%u TTY:%.64s uid/euid:%u/%u gid/egid:%u/%u run time:[%ud %uh %um %us] cpu time:[%ud %uh %um %us] %s with exit code %ld, parent %.256s[%.16s:%d] IP:%u.%u.%u.%u TTY:%.64s uid/euid:%u/%u gid/egid:%u/%u"
-+#define GR_PTRACE_ACL_MSG "denied ptrace of %.950s(%.16s:%d) by "
-+#define GR_STOPMOD_MSG "denied modification of module state by "
-+#define GR_IOPERM_MSG "denied use of ioperm() by "
-+#define GR_IOPL_MSG "denied use of iopl() by "
-+#define GR_SHMAT_ACL_MSG "denied attach of shared memory of UID %u, PID %d, ID %u by "
-+#define GR_UNIX_CHROOT_MSG "denied connect() to abstract AF_UNIX socket outside of chroot by "
-+#define GR_SHMAT_CHROOT_MSG "denied attach of shared memory outside of chroot by "
-+#define GR_KMEM_MSG "denied write of /dev/kmem by "
-+#define GR_PORT_OPEN_MSG "denied open of /dev/port by "
-+#define GR_MEM_WRITE_MSG "denied write of /dev/mem by " -+#define GR_MEM_MMAP_MSG "denied mmap write of /dev/[k]mem by " -+#define GR_SYMLINK_MSG "not following symlink %.950s owned by %d.%d by " -+#define GR_LEARN_AUDIT_MSG "%s\t%u\t%u\t%u\t%.4095s\t%.4095s\t%lu\t%lu\t%.4095s\t%lu\t%u.%u.%u.%u" -+#define GR_ID_LEARN_MSG "%s\t%u\t%u\t%u\t%.4095s\t%.4095s\t%c\t%d\t%d\t%d\t%u.%u.%u.%u" -+#define GR_HIDDEN_ACL_MSG "%s access to hidden file %.950s by " -+#define GR_OPEN_ACL_MSG "%s open of %.950s for%s%s by " -+#define GR_CREATE_ACL_MSG "%s create of %.950s for%s%s by " -+#define GR_FIFO_MSG "denied writing FIFO %.950s of %d.%d by " -+#define GR_MKNOD_CHROOT_MSG "denied mknod of %.950s from chroot by " -+#define GR_MKNOD_ACL_MSG "%s mknod of %.950s by " -+#define GR_UNIXCONNECT_ACL_MSG "%s connect() to the unix domain socket %.950s by " -+#define GR_TTYSNIFF_ACL_MSG "terminal being sniffed by IP:%u.%u.%u.%u %.480s[%.16s:%d], parent %.480s[%.16s:%d] against " -+#define GR_MKDIR_ACL_MSG "%s mkdir of %.950s by " -+#define GR_RMDIR_ACL_MSG "%s rmdir of %.950s by " -+#define GR_UNLINK_ACL_MSG "%s unlink of %.950s by " -+#define GR_SYMLINK_ACL_MSG "%s symlink from %.480s to %.480s by " -+#define GR_HARDLINK_MSG "denied hardlink of %.930s (owned by %d.%d) to %.30s for " -+#define GR_LINK_ACL_MSG "%s link of %.480s to %.480s by " -+#define GR_INHERIT_ACL_MSG "successful inherit of %.480s's ACL for %.480s by " -+#define GR_RENAME_ACL_MSG "%s rename of %.480s to %.480s by " -+#define GR_PTRACE_EXEC_ACL_MSG "denied ptrace of %.950s by " -+#define GR_NPROC_MSG "denied overstep of process limit by " -+#define GR_EXEC_ACL_MSG "%s execution of %.950s by " -+#define GR_EXEC_TPE_MSG "denied untrusted exec of %.950s by " -+#define GR_SEGVSTART_ACL_MSG "possible exploit bruteforcing on " DEFAULTSECMSG " banning uid %u from login for %lu seconds" -+#define GR_SEGVNOSUID_ACL_MSG "possible exploit bruteforcing on " DEFAULTSECMSG " banning execution for %lu seconds" -+#define GR_MOUNT_CHROOT_MSG "denied mount of %.256s as %.930s from chroot by " -+#define GR_PIVOT_CHROOT_MSG "denied pivot_root from chroot by " -+#define GR_TRUNCATE_ACL_MSG "%s truncate of %.950s by " -+#define GR_ATIME_ACL_MSG "%s access time change of %.950s by " -+#define GR_ACCESS_ACL_MSG "%s access of %.950s for%s%s%s by " -+#define GR_CHROOT_CHROOT_MSG "denied double chroot to %.950s by " -+#define GR_FCHMOD_ACL_MSG "%s fchmod of %.950s by " -+#define GR_CHMOD_CHROOT_MSG "denied chmod +s of %.950s by " -+#define GR_CHMOD_ACL_MSG "%s chmod of %.950s by " -+#define GR_CHROOT_FCHDIR_MSG "denied fchdir outside of chroot to %.950s by " -+#define GR_CHOWN_ACL_MSG "%s chown of %.950s by " -+#define GR_WRITLIB_ACL_MSG "denied load of writable library %.950s by " -+#define GR_INITF_ACL_MSG "init_variables() failed %s by " -+#define GR_DISABLED_ACL_MSG "Error loading %s, trying to run kernel with acls disabled. 
To disable acls at startup use <kernel image name> gracl=off from your boot loader" -+#define GR_DEV_ACL_MSG "/dev/grsec: %d bytes sent %d required, being fed garbaged by " -+#define GR_SHUTS_ACL_MSG "shutdown auth success for " -+#define GR_SHUTF_ACL_MSG "shutdown auth failure for " -+#define GR_SHUTI_ACL_MSG "ignoring shutdown for disabled RBAC system for " -+#define GR_SEGVMODS_ACL_MSG "segvmod auth success for " -+#define GR_SEGVMODF_ACL_MSG "segvmod auth failure for " -+#define GR_SEGVMODI_ACL_MSG "ignoring segvmod for disabled RBAC system for " -+#define GR_ENABLE_ACL_MSG "%s RBAC system loaded by " -+#define GR_ENABLEF_ACL_MSG "unable to load %s for " -+#define GR_RELOADI_ACL_MSG "ignoring reload request for disabled RBAC system" -+#define GR_RELOAD_ACL_MSG "%s RBAC system reloaded by " -+#define GR_RELOADF_ACL_MSG "failed reload of %s for " -+#define GR_SPROLEI_ACL_MSG "ignoring change to special role for disabled RBAC system for " -+#define GR_SPROLES_ACL_MSG "successful change to special role %s (id %d) by " -+#define GR_SPROLEL_ACL_MSG "special role %s (id %d) exited by " -+#define GR_SPROLEF_ACL_MSG "special role %s failure for " -+#define GR_UNSPROLEI_ACL_MSG "ignoring unauth of special role for disabled RBAC system for " -+#define GR_UNSPROLES_ACL_MSG "successful unauth of special role %s (id %d) by " -+#define GR_UNSPROLEF_ACL_MSG "special role unauth of %s failure for " -+#define GR_INVMODE_ACL_MSG "invalid mode %d by " -+#define GR_PRIORITY_CHROOT_MSG "denied priority change of process (%.16s:%d) by " -+#define GR_FAILFORK_MSG "failed fork with errno %d by " -+#define GR_NICE_CHROOT_MSG "denied priority change by " -+#define GR_UNISIGLOG_MSG "%.32s occurred at %p in " -+#define GR_DUALSIGLOG_MSG "signal %d sent to " DEFAULTSECMSG " by " -+#define GR_SIG_ACL_MSG "denied send of signal %d to protected task " DEFAULTSECMSG " by " -+#define GR_SYSCTL_MSG "denied modification of grsecurity sysctl value : %.32s by " -+#define GR_SYSCTL_ACL_MSG "%s sysctl of %.950s for%s%s by " -+#define GR_TIME_MSG "time set by " -+#define GR_DEFACL_MSG "fatal: unable to find subject for (%.16s:%d), loaded by " -+#define GR_MMAP_ACL_MSG "%s executable mmap of %.950s by " -+#define GR_MPROTECT_ACL_MSG "%s executable mprotect of %.950s by " -+#define GR_SOCK_MSG "denied socket(%.16s,%.16s,%.16s) by " -+#define GR_SOCK2_MSG "denied socket(%d,%.16s,%.16s) by " -+#define GR_BIND_MSG "denied bind() by " -+#define GR_CONNECT_MSG "denied connect() by " -+#define GR_BIND_ACL_MSG "denied bind() to %u.%u.%u.%u port %u sock type %.16s protocol %.16s by " -+#define GR_CONNECT_ACL_MSG "denied connect() to %u.%u.%u.%u port %u sock type %.16s protocol %.16s by " -+#define GR_IP_LEARN_MSG "%s\t%u\t%u\t%u\t%.4095s\t%.4095s\t%u.%u.%u.%u\t%u\t%u\t%u\t%u\t%u.%u.%u.%u" -+#define GR_EXEC_CHROOT_MSG "exec of %.980s within chroot by process " -+#define GR_CAP_ACL_MSG "use of %s denied for " -+#define GR_USRCHANGE_ACL_MSG "change to uid %u denied for " -+#define GR_GRPCHANGE_ACL_MSG "change to gid %u denied for " -+#define GR_REMOUNT_AUDIT_MSG "remount of %.256s by " -+#define GR_UNMOUNT_AUDIT_MSG "unmount of %.256s by " -+#define GR_MOUNT_AUDIT_MSG "mount of %.256s to %.256s by " -+#define GR_CHDIR_AUDIT_MSG "chdir to %.980s by " -+#define GR_EXEC_AUDIT_MSG "exec of %.930s (%.128s) by " -+#define GR_RESOURCE_MSG "denied resource overstep by requesting %lu for %.16s against limit %lu for " -+#define GR_TEXTREL_AUDIT_MSG "text relocation in %s, VMA:0x%08lx 0x%08lx by " -+#define GR_NONROOT_MODLOAD_MSG "denied kernel 
module auto-load of %.64s by " -diff -urNp linux-2.6.31.1/include/linux/grsecurity.h linux-2.6.31.1/include/linux/grsecurity.h ---- linux-2.6.31.1/include/linux/grsecurity.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/include/linux/grsecurity.h 2009-10-01 20:12:44.000000000 -0400 -@@ -0,0 +1,197 @@ -+#ifndef GR_SECURITY_H -+#define GR_SECURITY_H -+#include <linux/fs.h> -+#include <linux/fs_struct.h> -+#include <linux/binfmts.h> -+#include <linux/gracl.h> -+ -+/* notify of brain-dead configs */ -+#if defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_PAGEEXEC) && !defined(CONFIG_PAX_SEGMEXEC) && !defined(CONFIG_PAX_KERNEXEC) -+#error "CONFIG_PAX_NOEXEC enabled, but PAGEEXEC, SEGMEXEC, and KERNEXEC are disabled." -+#endif -+#if defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_EI_PAX) && !defined(CONFIG_PAX_PT_PAX_FLAGS) -+#error "CONFIG_PAX_NOEXEC enabled, but neither CONFIG_PAX_EI_PAX nor CONFIG_PAX_PT_PAX_FLAGS are enabled." -+#endif -+#if defined(CONFIG_PAX_ASLR) && (defined(CONFIG_PAX_RANDMMAP) || defined(CONFIG_PAX_RANDUSTACK)) && !defined(CONFIG_PAX_EI_PAX) && !defined(CONFIG_PAX_PT_PAX_FLAGS) -+#error "CONFIG_PAX_ASLR enabled, but neither CONFIG_PAX_EI_PAX nor CONFIG_PAX_PT_PAX_FLAGS are enabled." -+#endif -+#if defined(CONFIG_PAX_ASLR) && !defined(CONFIG_PAX_RANDKSTACK) && !defined(CONFIG_PAX_RANDUSTACK) && !defined(CONFIG_PAX_RANDMMAP) -+#error "CONFIG_PAX_ASLR enabled, but RANDKSTACK, RANDUSTACK, and RANDMMAP are disabled." -+#endif -+#if defined(CONFIG_PAX) && !defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_ASLR) -+#error "CONFIG_PAX enabled, but no PaX options are enabled." -+#endif -+ -+void gr_handle_brute_attach(struct task_struct *p); -+void gr_handle_brute_check(void); -+ -+char gr_roletype_to_char(void); -+ -+int gr_check_user_change(int real, int effective, int fs); -+int gr_check_group_change(int real, int effective, int fs); -+ -+void gr_del_task_from_ip_table(struct task_struct *p); -+ -+int gr_pid_is_chrooted(struct task_struct *p); -+int gr_handle_chroot_nice(void); -+int gr_handle_chroot_sysctl(const int op); -+int gr_handle_chroot_setpriority(struct task_struct *p, -+ const int niceval); -+int gr_chroot_fchdir(struct dentry *u_dentry, struct vfsmount *u_mnt); -+int gr_handle_chroot_chroot(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+int gr_handle_chroot_caps(struct path *path); -+void gr_handle_chroot_chdir(struct path *path); -+int gr_handle_chroot_chmod(const struct dentry *dentry, -+ const struct vfsmount *mnt, const int mode); -+int gr_handle_chroot_mknod(const struct dentry *dentry, -+ const struct vfsmount *mnt, const int mode); -+int gr_handle_chroot_mount(const struct dentry *dentry, -+ const struct vfsmount *mnt, -+ const char *dev_name); -+int gr_handle_chroot_pivot(void); -+int gr_handle_chroot_unix(const pid_t pid); -+ -+int gr_handle_rawio(const struct inode *inode); -+int gr_handle_nproc(void); -+ -+void gr_handle_ioperm(void); -+void gr_handle_iopl(void); -+ -+int gr_tpe_allow(const struct file *file); -+ -+int gr_random_pid(void); -+ -+void gr_log_forkfail(const int retval); -+void gr_log_timechange(void); -+void gr_log_signal(const int sig, const void *addr, const struct task_struct *t); -+void gr_log_chdir(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+void gr_log_chroot_exec(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+void gr_handle_exec_args(struct linux_binprm *bprm, char **argv); -+void gr_log_remount(const char *devname, const int retval); -+void gr_log_unmount(const char 
*devname, const int retval); -+void gr_log_mount(const char *from, const char *to, const int retval); -+void gr_log_textrel(struct vm_area_struct *vma); -+ -+int gr_handle_follow_link(const struct inode *parent, -+ const struct inode *inode, -+ const struct dentry *dentry, -+ const struct vfsmount *mnt); -+int gr_handle_fifo(const struct dentry *dentry, -+ const struct vfsmount *mnt, -+ const struct dentry *dir, const int flag, -+ const int acc_mode); -+int gr_handle_hardlink(const struct dentry *dentry, -+ const struct vfsmount *mnt, -+ struct inode *inode, -+ const int mode, const char *to); -+ -+int gr_is_capable(const int cap); -+int gr_is_capable_nolog(const int cap); -+void gr_learn_resource(const struct task_struct *task, const int limit, -+ const unsigned long wanted, const int gt); -+void gr_copy_label(struct task_struct *tsk); -+void gr_handle_crash(struct task_struct *task, const int sig); -+int gr_handle_signal(const struct task_struct *p, const int sig); -+int gr_check_crash_uid(const uid_t uid); -+int gr_check_protected_task(const struct task_struct *task); -+int gr_acl_handle_mmap(const struct file *file, -+ const unsigned long prot); -+int gr_acl_handle_mprotect(const struct file *file, -+ const unsigned long prot); -+int gr_check_hidden_task(const struct task_struct *tsk); -+__u32 gr_acl_handle_truncate(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+__u32 gr_acl_handle_utime(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+__u32 gr_acl_handle_access(const struct dentry *dentry, -+ const struct vfsmount *mnt, const int fmode); -+__u32 gr_acl_handle_fchmod(const struct dentry *dentry, -+ const struct vfsmount *mnt, mode_t mode); -+__u32 gr_acl_handle_chmod(const struct dentry *dentry, -+ const struct vfsmount *mnt, mode_t mode); -+__u32 gr_acl_handle_chown(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+int gr_handle_ptrace(struct task_struct *task, const long request); -+int gr_handle_proc_ptrace(struct task_struct *task); -+__u32 gr_acl_handle_execve(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+int gr_check_crash_exec(const struct file *filp); -+int gr_acl_is_enabled(void); -+void gr_set_kernel_label(struct task_struct *task); -+void gr_set_role_label(struct task_struct *task, const uid_t uid, -+ const gid_t gid); -+int gr_set_proc_label(const struct dentry *dentry, -+ const struct vfsmount *mnt, -+ const int unsafe_share); -+__u32 gr_acl_handle_hidden_file(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+__u32 gr_acl_handle_open(const struct dentry *dentry, -+ const struct vfsmount *mnt, const int fmode); -+__u32 gr_acl_handle_creat(const struct dentry *dentry, -+ const struct dentry *p_dentry, -+ const struct vfsmount *p_mnt, const int fmode, -+ const int imode); -+void gr_handle_create(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+__u32 gr_acl_handle_mknod(const struct dentry *new_dentry, -+ const struct dentry *parent_dentry, -+ const struct vfsmount *parent_mnt, -+ const int mode); -+__u32 gr_acl_handle_mkdir(const struct dentry *new_dentry, -+ const struct dentry *parent_dentry, -+ const struct vfsmount *parent_mnt); -+__u32 gr_acl_handle_rmdir(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+void gr_handle_delete(const ino_t ino, const dev_t dev); -+__u32 gr_acl_handle_unlink(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+__u32 gr_acl_handle_symlink(const struct dentry *new_dentry, -+ const struct dentry *parent_dentry, -+ const struct vfsmount 
*parent_mnt, -+ const char *from); -+__u32 gr_acl_handle_link(const struct dentry *new_dentry, -+ const struct dentry *parent_dentry, -+ const struct vfsmount *parent_mnt, -+ const struct dentry *old_dentry, -+ const struct vfsmount *old_mnt, const char *to); -+int gr_acl_handle_rename(struct dentry *new_dentry, -+ struct dentry *parent_dentry, -+ const struct vfsmount *parent_mnt, -+ struct dentry *old_dentry, -+ struct inode *old_parent_inode, -+ struct vfsmount *old_mnt, const char *newname); -+void gr_handle_rename(struct inode *old_dir, struct inode *new_dir, -+ struct dentry *old_dentry, -+ struct dentry *new_dentry, -+ struct vfsmount *mnt, const __u8 replace); -+__u32 gr_check_link(const struct dentry *new_dentry, -+ const struct dentry *parent_dentry, -+ const struct vfsmount *parent_mnt, -+ const struct dentry *old_dentry, -+ const struct vfsmount *old_mnt); -+int gr_acl_handle_filldir(const struct file *file, const char *name, -+ const unsigned int namelen, const ino_t ino); -+ -+__u32 gr_acl_handle_unix(const struct dentry *dentry, -+ const struct vfsmount *mnt); -+void gr_acl_handle_exit(void); -+void gr_acl_handle_psacct(struct task_struct *task, const long code); -+int gr_acl_handle_procpidmem(const struct task_struct *task); -+ -+#ifdef CONFIG_GRKERNSEC -+void gr_log_nonroot_mod_load(const char *modname); -+void gr_handle_mem_write(void); -+void gr_handle_kmem_write(void); -+void gr_handle_open_port(void); -+int gr_handle_mem_mmap(const unsigned long offset, -+ struct vm_area_struct *vma); -+ -+extern int grsec_enable_dmesg; -+extern int grsec_enable_randsrc; -+extern int grsec_enable_shm; -+#endif -+ -+#endif -diff -urNp linux-2.6.31.1/include/linux/hdpu_features.h linux-2.6.31.1/include/linux/hdpu_features.h ---- linux-2.6.31.1/include/linux/hdpu_features.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/hdpu_features.h 2009-10-01 20:12:44.000000000 -0400 -@@ -3,7 +3,7 @@ - struct cpustate_t { - spinlock_t lock; - int excl; -- int open_count; -+ atomic_t open_count; - unsigned char cached_val; - int inited; - unsigned long *set_addr; -diff -urNp linux-2.6.31.1/include/linux/highmem.h linux-2.6.31.1/include/linux/highmem.h ---- linux-2.6.31.1/include/linux/highmem.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/highmem.h 2009-10-01 20:12:44.000000000 -0400 -@@ -137,6 +137,18 @@ static inline void clear_highpage(struct - kunmap_atomic(kaddr, KM_USER0); - } - -+static inline void sanitize_highpage(struct page *page) -+{ -+ void *kaddr; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ kaddr = kmap_atomic(page, KM_CLEARPAGE); -+ clear_page(kaddr); -+ kunmap_atomic(kaddr, KM_CLEARPAGE); -+ local_irq_restore(flags); -+} -+ - static inline void zero_user_segments(struct page *page, - unsigned start1, unsigned end1, - unsigned start2, unsigned end2) -diff -urNp linux-2.6.31.1/include/linux/hugetlb.h linux-2.6.31.1/include/linux/hugetlb.h ---- linux-2.6.31.1/include/linux/hugetlb.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/hugetlb.h 2009-10-01 20:12:44.000000000 -0400 -@@ -146,7 +146,7 @@ static inline struct hugetlbfs_sb_info * - } - - extern const struct file_operations hugetlbfs_file_operations; --extern struct vm_operations_struct hugetlb_vm_ops; -+extern const struct vm_operations_struct hugetlb_vm_ops; - struct file *hugetlb_file_setup(const char *name, size_t size, int acct, - struct user_struct **user); - int hugetlb_get_quota(struct address_space *mapping, long delta); -diff -urNp 
linux-2.6.31.1/include/linux/jbd2.h linux-2.6.31.1/include/linux/jbd2.h ---- linux-2.6.31.1/include/linux/jbd2.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/jbd2.h 2009-10-01 20:12:44.000000000 -0400 -@@ -66,7 +66,7 @@ extern u8 jbd2_journal_enable_debug; - } \ - } while (0) - #else --#define jbd_debug(f, a...) /**/ -+#define jbd_debug(f, a...) do {} while (0) - #endif - - static inline void *jbd2_alloc(size_t size, gfp_t flags) -diff -urNp linux-2.6.31.1/include/linux/jbd.h linux-2.6.31.1/include/linux/jbd.h ---- linux-2.6.31.1/include/linux/jbd.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/jbd.h 2009-10-01 20:12:44.000000000 -0400 -@@ -66,7 +66,7 @@ extern u8 journal_enable_debug; - } \ - } while (0) - #else --#define jbd_debug(f, a...) /**/ -+#define jbd_debug(f, a...) do {} while (0) - #endif - - static inline void *jbd_alloc(size_t size, gfp_t flags) -diff -urNp linux-2.6.31.1/include/linux/kallsyms.h linux-2.6.31.1/include/linux/kallsyms.h ---- linux-2.6.31.1/include/linux/kallsyms.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/kallsyms.h 2009-10-01 20:12:44.000000000 -0400 -@@ -15,7 +15,8 @@ - - struct module; - --#ifdef CONFIG_KALLSYMS -+#ifndef __INCLUDED_BY_HIDESYM -+#if defined(CONFIG_KALLSYMS) && !defined(CONFIG_GRKERNSEC_HIDESYM) - /* Lookup the address for a symbol. Returns 0 if not found. */ - unsigned long kallsyms_lookup_name(const char *name); - -@@ -92,6 +93,9 @@ static inline int lookup_symbol_attrs(un - /* Stupid that this does nothing, but I didn't create this mess. */ - #define __print_symbol(fmt, addr) - #endif /*CONFIG_KALLSYMS*/ -+#else /* when included by kallsyms.c, with HIDESYM enabled */ -+extern void __print_symbol(const char *fmt, unsigned long address); -+#endif - - /* This macro allows us to keep printk typechecking */ - static void __check_printsym_format(const char *fmt, ...) -diff -urNp linux-2.6.31.1/include/linux/kvm_host.h linux-2.6.31.1/include/linux/kvm_host.h ---- linux-2.6.31.1/include/linux/kvm_host.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/kvm_host.h 2009-10-01 20:12:44.000000000 -0400 -@@ -173,7 +173,7 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vc - void vcpu_load(struct kvm_vcpu *vcpu); - void vcpu_put(struct kvm_vcpu *vcpu); - --int kvm_init(void *opaque, unsigned int vcpu_size, -+int kvm_init(const void *opaque, unsigned int vcpu_size, - struct module *module); - void kvm_exit(void); - -@@ -280,7 +280,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug( - struct kvm_guest_debug *dbg); - int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); - --int kvm_arch_init(void *opaque); -+int kvm_arch_init(const void *opaque); - void kvm_arch_exit(void); - - int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); -diff -urNp linux-2.6.31.1/include/linux/libata.h linux-2.6.31.1/include/linux/libata.h ---- linux-2.6.31.1/include/linux/libata.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/libata.h 2009-10-01 20:12:44.000000000 -0400 -@@ -64,11 +64,11 @@ - #ifdef ATA_VERBOSE_DEBUG - #define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) - #else --#define VPRINTK(fmt, args...) -+#define VPRINTK(fmt, args...) do {} while (0) - #endif /* ATA_VERBOSE_DEBUG */ - #else --#define DPRINTK(fmt, args...) --#define VPRINTK(fmt, args...) -+#define DPRINTK(fmt, args...) do {} while (0) -+#define VPRINTK(fmt, args...) do {} while (0) - #endif /* ATA_DEBUG */ - - #define BPRINTK(fmt, args...) 
if (ap->flags & ATA_FLAG_DEBUGMSG) printk(KERN_ERR "%s: " fmt, __func__, ## args) -diff -urNp linux-2.6.31.1/include/linux/mm.h linux-2.6.31.1/include/linux/mm.h ---- linux-2.6.31.1/include/linux/mm.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/mm.h 2009-10-01 20:12:44.000000000 -0400 -@@ -104,6 +104,10 @@ extern unsigned int kobjsize(const void - #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ - #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ - -+#ifdef CONFIG_PAX_PAGEEXEC -+#define VM_PAGEEXEC 0x80000000 /* vma->vm_page_prot needs special handling */ -+#endif -+ - #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ - #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS - #endif -@@ -871,6 +875,8 @@ struct shrinker { - extern void register_shrinker(struct shrinker *); - extern void unregister_shrinker(struct shrinker *); - -+pgprot_t vm_get_page_prot(unsigned long vm_flags); -+ - int vma_wants_writenotify(struct vm_area_struct *vma); - - extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); -@@ -1141,6 +1147,7 @@ out: - } - - extern int do_munmap(struct mm_struct *, unsigned long, size_t); -+extern int __do_munmap(struct mm_struct *, unsigned long, size_t); - - extern unsigned long do_brk(unsigned long, unsigned long); - -@@ -1195,6 +1202,10 @@ extern struct vm_area_struct * find_vma( - extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, - struct vm_area_struct **pprev); - -+extern struct vm_area_struct *pax_find_mirror_vma(struct vm_area_struct *vma); -+extern void pax_mirror_vma(struct vm_area_struct *vma_m, struct vm_area_struct *vma); -+extern void pax_mirror_file_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m, spinlock_t *ptl); -+ - /* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ - static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) -@@ -1211,7 +1222,6 @@ static inline unsigned long vma_pages(st - return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - } - --pgprot_t vm_get_page_prot(unsigned long vm_flags); - struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); - int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -@@ -1303,5 +1313,12 @@ void vmemmap_populate_print_last(void); - extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, - size_t size); - extern void refund_locked_memory(struct mm_struct *mm, size_t size); -+ -+#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT -+extern void track_exec_limit(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long prot); -+#else -+static inline void track_exec_limit(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long prot) {} -+#endif -+ - #endif /* __KERNEL__ */ - #endif /* _LINUX_MM_H */ -diff -urNp linux-2.6.31.1/include/linux/mm_types.h linux-2.6.31.1/include/linux/mm_types.h ---- linux-2.6.31.1/include/linux/mm_types.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/mm_types.h 2009-10-01 20:12:44.000000000 -0400 -@@ -171,7 +171,7 @@ struct vm_area_struct { - struct anon_vma *anon_vma; /* Serialized by page_table_lock */ - - /* Function pointers to deal with this struct. 
*/ -- struct vm_operations_struct * vm_ops; -+ const struct vm_operations_struct * vm_ops; - - /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE -@@ -186,6 +186,8 @@ struct vm_area_struct { - #ifdef CONFIG_NUMA - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ - #endif -+ -+ struct vm_area_struct *vm_mirror;/* PaX: mirror vma or NULL */ - }; - - struct core_thread { -@@ -286,6 +288,24 @@ struct mm_struct { - #ifdef CONFIG_MMU_NOTIFIER - struct mmu_notifier_mm *mmu_notifier_mm; - #endif -+ -+#if defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS) || defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) -+ unsigned long pax_flags; -+#endif -+ -+#ifdef CONFIG_PAX_DLRESOLVE -+ unsigned long call_dl_resolve; -+#endif -+ -+#if defined(CONFIG_PPC32) && defined(CONFIG_PAX_EMUSIGRT) -+ unsigned long call_syscall; -+#endif -+ -+#ifdef CONFIG_PAX_ASLR -+ unsigned long delta_mmap; /* randomized offset */ -+ unsigned long delta_stack; /* randomized offset */ -+#endif -+ - }; - - /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ -diff -urNp linux-2.6.31.1/include/linux/mod_devicetable.h linux-2.6.31.1/include/linux/mod_devicetable.h ---- linux-2.6.31.1/include/linux/mod_devicetable.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/mod_devicetable.h 2009-10-01 20:12:44.000000000 -0400 -@@ -12,7 +12,7 @@ - typedef unsigned long kernel_ulong_t; - #endif - --#define PCI_ANY_ID (~0) -+#define PCI_ANY_ID ((__u16)~0) - - struct pci_device_id { - __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ -@@ -131,7 +131,7 @@ struct usb_device_id { - #define USB_DEVICE_ID_MATCH_INT_SUBCLASS 0x0100 - #define USB_DEVICE_ID_MATCH_INT_PROTOCOL 0x0200 - --#define HID_ANY_ID (~0) -+#define HID_ANY_ID (~0U) - - struct hid_device_id { - __u16 bus; -diff -urNp linux-2.6.31.1/include/linux/module.h linux-2.6.31.1/include/linux/module.h ---- linux-2.6.31.1/include/linux/module.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/module.h 2009-10-01 20:12:44.000000000 -0400 -@@ -283,16 +283,16 @@ struct module - int (*init)(void); - - /* If this is non-NULL, vfree after init() returns */ -- void *module_init; -+ void *module_init_rx, *module_init_rw; - - /* Here is the actual code + data, vfree'd on unload. */ -- void *module_core; -+ void *module_core_rx, *module_core_rw; - - /* Here are the sizes of the init and core sections */ -- unsigned int init_size, core_size; -+ unsigned int init_size_rw, core_size_rw; - - /* The size of the executable code in each section. 
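About the PCI_ANY_ID and HID_ANY_ID changes earlier in this hunk: a bare ~0 has type int (value -1), so comparing it with a narrower or unsigned field goes through the usual arithmetic conversions and can silently fail to match; casting the constant to the intended width removes the ambiguity. A sketch of the pitfall, with hypothetical names:

    #include <stdio.h>

    #define ANY_ID_BAD  (~0)                   /* int, value -1       */
    #define ANY_ID_GOOD ((unsigned short)~0)   /* 0xFFFF, field-sized */

    int main(void)
    {
        unsigned short dev = 0xFFFF;   /* wildcard stored in a 16-bit field */

        /* dev promotes to int 65535, ~0 is int -1: no match */
        printf("bad:  %s\n", dev == ANY_ID_BAD  ? "match" : "no match");
        /* both sides promote to int 65535: matches as intended */
        printf("good: %s\n", dev == ANY_ID_GOOD ? "match" : "no match");
        return 0;
    }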
*/ -- unsigned int init_text_size, core_text_size; -+ unsigned int init_size_rx, core_size_rx; - - /* Arch-specific module values */ - struct mod_arch_specific arch; -@@ -389,16 +389,46 @@ struct module *__module_address(unsigned - bool is_module_address(unsigned long addr); - bool is_module_text_address(unsigned long addr); - -+static inline int within_module_range(unsigned long addr, void *start, unsigned long size) -+{ -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ if (ktla_ktva(addr) >= (unsigned long)start && -+ ktla_ktva(addr) < (unsigned long)start + size) -+ return 1; -+#endif -+ -+ return ((void *)addr >= start && (void *)addr < start + size); -+} -+ -+static inline int within_module_core_rx(unsigned long addr, struct module *mod) -+{ -+ return within_module_range(addr, mod->module_core_rx, mod->core_size_rx); -+} -+ -+static inline int within_module_core_rw(unsigned long addr, struct module *mod) -+{ -+ return within_module_range(addr, mod->module_core_rw, mod->core_size_rw); -+} -+ -+static inline int within_module_init_rx(unsigned long addr, struct module *mod) -+{ -+ return within_module_range(addr, mod->module_init_rx, mod->init_size_rx); -+} -+ -+static inline int within_module_init_rw(unsigned long addr, struct module *mod) -+{ -+ return within_module_range(addr, mod->module_init_rw, mod->init_size_rw); -+} -+ - static inline int within_module_core(unsigned long addr, struct module *mod) - { -- return (unsigned long)mod->module_core <= addr && -- addr < (unsigned long)mod->module_core + mod->core_size; -+ return within_module_core_rx(addr, mod) || within_module_core_rw(addr, mod); - } - - static inline int within_module_init(unsigned long addr, struct module *mod) - { -- return (unsigned long)mod->module_init <= addr && -- addr < (unsigned long)mod->module_init + mod->init_size; -+ return within_module_init_rx(addr, mod) || within_module_init_rw(addr, mod); - } - - /* Search for module by name: must hold module_mutex. */ -@@ -451,7 +481,11 @@ void symbol_put_addr(void *addr); - static inline local_t *__module_ref_addr(struct module *mod, int cpu) - { - #ifdef CONFIG_SMP -+#ifdef CONFIG_X86_32 -+ return (local_t *) (mod->refptr + __per_cpu_offset[cpu]); -+#else - return (local_t *) (mod->refptr + per_cpu_offset(cpu)); -+#endif - #else - return &mod->ref; - #endif -diff -urNp linux-2.6.31.1/include/linux/moduleloader.h linux-2.6.31.1/include/linux/moduleloader.h ---- linux-2.6.31.1/include/linux/moduleloader.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/moduleloader.h 2009-10-01 20:12:44.000000000 -0400 -@@ -20,9 +20,21 @@ unsigned int arch_mod_section_prepend(st - sections. Returns NULL on failure. */ - void *module_alloc(unsigned long size); - -+#ifdef CONFIG_PAX_KERNEXEC -+void *module_alloc_exec(unsigned long size); -+#else -+#define module_alloc_exec(x) module_alloc(x) -+#endif -+ - /* Free memory returned from module_alloc. */ - void module_free(struct module *mod, void *module_region); - -+#ifdef CONFIG_PAX_KERNEXEC -+void module_free_exec(struct module *mod, void *module_region); -+#else -+#define module_free_exec(x, y) module_free(x, y) -+#endif -+ - /* Apply the given relocation to the (simplified) ELF. Return -error - or 0. 
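The module.h hunk above splits each module's allocation into separate RX (code) and RW (data) regions under PAX_KERNEXEC and rebuilds the membership helpers so an address counts as inside the module if it falls in either half. A simplified userspace model of the check (ktla_ktva() and the KERNEXEC special case omitted):

    #include <stdbool.h>
    #include <stdio.h>

    struct module_mem {
        char *core_rx; unsigned long size_rx;  /* executable mapping */
        char *core_rw; unsigned long size_rw;  /* writable mapping   */
    };

    static bool within(unsigned long addr, const char *start, unsigned long size)
    {
        return addr >= (unsigned long)start &&
               addr <  (unsigned long)start + size;
    }

    static bool within_module_core(unsigned long addr, const struct module_mem *m)
    {
        return within(addr, m->core_rx, m->size_rx) ||
               within(addr, m->core_rw, m->size_rw);
    }

    int main(void)
    {
        static char rx[64], rw[128];
        struct module_mem m = { rx, sizeof(rx), rw, sizeof(rw) };

        printf("%d %d\n",
               within_module_core((unsigned long)(rx + 10), &m),   /* 1 */
               within_module_core((unsigned long)(rw + 200), &m)); /* 0 */
        return 0;
    }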
*/ - int apply_relocate(Elf_Shdr *sechdrs, -diff -urNp linux-2.6.31.1/include/linux/moduleparam.h linux-2.6.31.1/include/linux/moduleparam.h ---- linux-2.6.31.1/include/linux/moduleparam.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/moduleparam.h 2009-10-01 20:12:44.000000000 -0400 -@@ -37,7 +37,6 @@ typedef int (*param_set_fn)(const char * - typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp); - - /* Flag bits for kernel_param.flags */ --#define KPARAM_KMALLOCED 1 - #define KPARAM_ISBOOL 2 - - struct kernel_param { -diff -urNp linux-2.6.31.1/include/linux/namei.h linux-2.6.31.1/include/linux/namei.h ---- linux-2.6.31.1/include/linux/namei.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/namei.h 2009-10-01 20:12:44.000000000 -0400 -@@ -22,7 +22,7 @@ struct nameidata { - unsigned int flags; - int last_type; - unsigned depth; -- char *saved_names[MAX_NESTED_LINKS + 1]; -+ const char *saved_names[MAX_NESTED_LINKS + 1]; - - /* Intent data */ - union { -@@ -84,12 +84,12 @@ extern int follow_up(struct path *); - extern struct dentry *lock_rename(struct dentry *, struct dentry *); - extern void unlock_rename(struct dentry *, struct dentry *); - --static inline void nd_set_link(struct nameidata *nd, char *path) -+static inline void nd_set_link(struct nameidata *nd, const char *path) - { - nd->saved_names[nd->depth] = path; - } - --static inline char *nd_get_link(struct nameidata *nd) -+static inline const char *nd_get_link(struct nameidata *nd) - { - return nd->saved_names[nd->depth]; - } -diff -urNp linux-2.6.31.1/include/linux/nfsd/nfsd.h linux-2.6.31.1/include/linux/nfsd/nfsd.h ---- linux-2.6.31.1/include/linux/nfsd/nfsd.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/nfsd/nfsd.h 2009-10-01 20:12:44.000000000 -0400 -@@ -57,7 +57,7 @@ extern u32 nfsd_supported_minorversion - extern struct mutex nfsd_mutex; - extern struct svc_serv *nfsd_serv; - --extern struct seq_operations nfs_exports_op; -+extern const struct seq_operations nfs_exports_op; - - /* - * Function prototypes. -diff -urNp linux-2.6.31.1/include/linux/nodemask.h linux-2.6.31.1/include/linux/nodemask.h ---- linux-2.6.31.1/include/linux/nodemask.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/nodemask.h 2009-10-01 20:12:44.000000000 -0400 -@@ -464,11 +464,11 @@ static inline int num_node_state(enum no - - #define any_online_node(mask) \ - ({ \ -- int node; \ -- for_each_node_mask(node, (mask)) \ -- if (node_online(node)) \ -+ int __node; \ -+ for_each_node_mask(__node, (mask)) \ -+ if (node_online(__node)) \ - break; \ -- node; \ -+ __node; \ - }) - - #define num_online_nodes() num_node_state(N_ONLINE) -diff -urNp linux-2.6.31.1/include/linux/oprofile.h linux-2.6.31.1/include/linux/oprofile.h ---- linux-2.6.31.1/include/linux/oprofile.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/oprofile.h 2009-10-01 20:12:44.000000000 -0400 -@@ -128,7 +128,7 @@ int oprofilefs_create_ro_ulong(struct su - - /** Create a file for read-only access to an atomic_t. 
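The any_online_node() rename above (node to __node) is statement-expression hygiene: if a macro's internal variable shares its name with a variable used in the caller's argument, the argument silently binds to the macro's own local. A GNU C sketch of the failure mode, with hypothetical macros:

    #include <stdio.h>

    #define FIRST_SET_BAD(mask)                   \
    ({                                            \
        int node;                                 \
        for (node = 0; node < 32; node++)         \
            if ((mask) & (1u << node))            \
                break;                            \
        node;                                     \
    })

    #define FIRST_SET_GOOD(mask)                  \
    ({                                            \
        int __node;                               \
        for (__node = 0; __node < 32; __node++)   \
            if ((mask) & (1u << __node))          \
                break;                            \
        __node;                                   \
    })

    int main(void)
    {
        int node = 5;

        /* "1u << node" is captured by the macro's own loop counter,
         * so the very first iteration matches and 0 comes back       */
        printf("bad:  %d\n", FIRST_SET_BAD(1u << node));
        /* the reserved-style name cannot collide: prints 5 */
        printf("good: %d\n", FIRST_SET_GOOD(1u << node));
        return 0;
    }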
*/ - int oprofilefs_create_ro_atomic(struct super_block * sb, struct dentry * root, -- char const * name, atomic_t * val); -+ char const * name, atomic_unchecked_t * val); - - /** create a directory */ - struct dentry * oprofilefs_mkdir(struct super_block * sb, struct dentry * root, -diff -urNp linux-2.6.31.1/include/linux/poison.h linux-2.6.31.1/include/linux/poison.h ---- linux-2.6.31.1/include/linux/poison.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/poison.h 2009-10-01 20:12:44.000000000 -0400 -@@ -7,8 +7,8 @@ - * under normal circumstances, used to verify that nobody uses - * non-initialized list entries. - */ --#define LIST_POISON1 ((void *) 0x00100100) --#define LIST_POISON2 ((void *) 0x00200200) -+#define LIST_POISON1 ((void *) 0xFF1001FFFF1001FFULL) -+#define LIST_POISON2 ((void *) 0xFF2002FFFF2002FFULL) - - /********** include/linux/timer.h **********/ - /* -diff -urNp linux-2.6.31.1/include/linux/proc_fs.h linux-2.6.31.1/include/linux/proc_fs.h ---- linux-2.6.31.1/include/linux/proc_fs.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/proc_fs.h 2009-10-01 20:12:44.000000000 -0400 -@@ -146,6 +146,19 @@ static inline struct proc_dir_entry *pro - return proc_create_data(name, mode, parent, proc_fops, NULL); - } - -+static inline struct proc_dir_entry *proc_create_grsec(const char *name, mode_t mode, -+ struct proc_dir_entry *parent, const struct file_operations *proc_fops) -+{ -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ return proc_create_data(name, S_IRUSR, parent, proc_fops, NULL); -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ return proc_create_data(name, S_IRUSR | S_IRGRP, parent, proc_fops, NULL); -+#else -+ return proc_create_data(name, mode, parent, proc_fops, NULL); -+#endif -+} -+ -+ - static inline struct proc_dir_entry *create_proc_read_entry(const char *name, - mode_t mode, struct proc_dir_entry *base, - read_proc_t *read_proc, void * data) -diff -urNp linux-2.6.31.1/include/linux/random.h linux-2.6.31.1/include/linux/random.h ---- linux-2.6.31.1/include/linux/random.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/random.h 2009-10-01 20:12:44.000000000 -0400 -@@ -74,6 +74,11 @@ unsigned long randomize_range(unsigned l - u32 random32(void); - void srandom32(u32 seed); - -+static inline unsigned long pax_get_random_long(void) -+{ -+ return random32() + (sizeof(long) > 4 ? 
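The poison.h hunk above moves LIST_POISON1/2 from low addresses to high 64-bit values. The plausible reading: low poison addresses fall inside the range userspace can map, so a kernel dereference of a poisoned pointer could land in attacker-controlled memory, while the new values can never be mapped. A sketch of how list poisoning is used on unlink (simplified list, 64-bit build assumed):

    #include <stdio.h>

    #define LIST_POISON1 ((void *)0xFF1001FFFF1001FFULL)
    #define LIST_POISON2 ((void *)0xFF2002FFFF2002FFULL)

    struct node { struct node *next, *prev; };

    static void list_del(struct node *n)
    {
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = (struct node *)LIST_POISON1;  /* catch use-after-unlink */
        n->prev = (struct node *)LIST_POISON2;
    }

    int main(void)
    {
        struct node a, b, c;
        a.next = &b; b.next = &c; c.next = &a;
        a.prev = &c; b.prev = &a; c.prev = &b;

        list_del(&b);
        printf("b.next = %p\n", (void *)b.next);  /* poisoned; faults if used */
        return 0;
    }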
(unsigned long)random32() << 32 : 0); -+} -+ - #endif /* __KERNEL___ */ - - #endif /* _LINUX_RANDOM_H */ -diff -urNp linux-2.6.31.1/include/linux/reiserfs_fs.h linux-2.6.31.1/include/linux/reiserfs_fs.h ---- linux-2.6.31.1/include/linux/reiserfs_fs.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/reiserfs_fs.h 2009-10-01 20:12:44.000000000 -0400 -@@ -1326,7 +1326,7 @@ static inline loff_t max_reiserfs_offset - #define REISERFS_USER_MEM 1 /* reiserfs user memory mode */ - - #define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) --#define get_generation(s) atomic_read (&fs_generation(s)) -+#define get_generation(s) atomic_read_unchecked (&fs_generation(s)) - #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) - #define __fs_changed(gen,s) (gen != get_generation (s)) - #define fs_changed(gen,s) ({cond_resched(); __fs_changed(gen, s);}) -diff -urNp linux-2.6.31.1/include/linux/reiserfs_fs_sb.h linux-2.6.31.1/include/linux/reiserfs_fs_sb.h ---- linux-2.6.31.1/include/linux/reiserfs_fs_sb.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/reiserfs_fs_sb.h 2009-10-01 20:12:44.000000000 -0400 -@@ -377,7 +377,7 @@ struct reiserfs_sb_info { - /* Comment? -Hans */ - wait_queue_head_t s_wait; - /* To be obsoleted soon by per buffer seals.. -Hans */ -- atomic_t s_generation_counter; // increased by one every time the -+ atomic_unchecked_t s_generation_counter; // increased by one every time the - // tree gets re-balanced - unsigned long s_properties; /* File system properties. Currently holds - on-disk FS format */ -diff -urNp linux-2.6.31.1/include/linux/sched.h linux-2.6.31.1/include/linux/sched.h ---- linux-2.6.31.1/include/linux/sched.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/sched.h 2009-10-01 20:12:44.000000000 -0400 -@@ -99,6 +99,7 @@ struct bio; - struct fs_struct; - struct bts_context; - struct perf_counter_context; -+struct linux_binprm; - - /* - * List of flags we want to share for kernel threads, -@@ -629,6 +630,15 @@ struct signal_struct { - unsigned audit_tty; - struct tty_audit_buf *tty_audit_buf; - #endif -+ -+#ifdef CONFIG_GRKERNSEC -+ u32 curr_ip; -+ u32 gr_saddr; -+ u32 gr_daddr; -+ u16 gr_sport; -+ u16 gr_dport; -+ u8 used_accept:1; -+#endif - }; - - /* Context switch must be unlocked if interrupts are to be enabled */ -@@ -1165,7 +1175,7 @@ struct sched_rt_entity { - - struct task_struct { - volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ -- void *stack; -+ struct thread_info *stack; - atomic_t usage; - unsigned int flags; /* per process flags, defined below */ - unsigned int ptrace; -@@ -1269,8 +1279,8 @@ struct task_struct { - struct list_head thread_group; - - struct completion *vfork_done; /* for vfork() */ -- int __user *set_child_tid; /* CLONE_CHILD_SETTID */ -- int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ -+ pid_t __user *set_child_tid; /* CLONE_CHILD_SETTID */ -+ pid_t __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - - cputime_t utime, stime, utimescaled, stimescaled; - cputime_t gtime; -@@ -1284,15 +1294,6 @@ struct task_struct { - struct task_cputime cputime_expires; - struct list_head cpu_timers[3]; - --/* process credentials */ -- const struct cred *real_cred; /* objective and real subjective task -- * credentials (COW) */ -- const struct cred *cred; /* effective (overridable) subjective task -- * credentials (COW) */ -- struct mutex cred_guard_mutex; /* guard against foreign influences on -- * credential calculations -- * 
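The pax_get_random_long() helper completed above builds a full-width random long from a 32-bit source: one draw fills the low half, and on 64-bit builds a second draw is shifted into the high half. A userspace sketch of the same composition, with rand() as a stand-in of much lower quality than random32():

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    static unsigned long wide_random(void)
    {
        unsigned long r = (unsigned int)rand();          /* low 32 bits  */
    #if ULONG_MAX > 0xffffffffUL
        r += (unsigned long)(unsigned int)rand() << 32;  /* high 32 bits */
    #endif
        return r;
    }

    int main(void)
    {
        srand(42);
        printf("%#lx\n", wide_random());
        return 0;
    }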
(notably. ptrace) */ -- - char comm[TASK_COMM_LEN]; /* executable name excluding path - - access with [gs]et_task_comm (which lock - it with task_lock()) -@@ -1429,6 +1430,16 @@ struct task_struct { - struct mutex perf_counter_mutex; - struct list_head perf_counter_list; - #endif -+ -+/* process credentials */ -+ const struct cred *real_cred; /* objective and real subjective task -+ * credentials (COW) */ -+ const struct cred *cred; /* effective (overridable) subjective task -+ * credentials (COW) */ -+ struct mutex cred_guard_mutex; /* guard against foreign influences on -+ * credential calculations -+ * (notably. ptrace) */ -+ - #ifdef CONFIG_NUMA - struct mempolicy *mempolicy; /* Protected by alloc_lock */ - short il_next; -@@ -1480,8 +1491,66 @@ struct task_struct { - /* bitmask of trace recursion */ - unsigned long trace_recursion; - #endif /* CONFIG_TRACING */ -+ -+#ifdef CONFIG_GRKERNSEC -+ /* grsecurity */ -+ struct acl_subject_label *acl; -+ struct acl_role_label *role; -+ struct file *exec_file; -+ u16 acl_role_id; -+ u8 acl_sp_role; -+ u8 is_writable; -+ u8 brute; -+#endif -+ - }; - -+#define MF_PAX_PAGEEXEC 0x01000000 /* Paging based non-executable pages */ -+#define MF_PAX_EMUTRAMP 0x02000000 /* Emulate trampolines */ -+#define MF_PAX_MPROTECT 0x04000000 /* Restrict mprotect() */ -+#define MF_PAX_RANDMMAP 0x08000000 /* Randomize mmap() base */ -+/*#define MF_PAX_RANDEXEC 0x10000000*/ /* Randomize ET_EXEC base */ -+#define MF_PAX_SEGMEXEC 0x20000000 /* Segmentation based non-executable pages */ -+ -+#ifdef CONFIG_PAX_SOFTMODE -+extern unsigned int pax_softmode; -+#endif -+ -+extern int pax_check_flags(unsigned long *); -+ -+/* if tsk != current then task_lock must be held on it */ -+#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) -+static inline unsigned long pax_get_flags(struct task_struct *tsk) -+{ -+ if (likely(tsk->mm)) -+ return tsk->mm->pax_flags; -+ else -+ return 0UL; -+} -+ -+/* if tsk != current then task_lock must be held on it */ -+static inline long pax_set_flags(struct task_struct *tsk, unsigned long flags) -+{ -+ if (likely(tsk->mm)) { -+ tsk->mm->pax_flags = flags; -+ return 0; -+ } -+ return -EINVAL; -+} -+#endif -+ -+#ifdef CONFIG_PAX_HAVE_ACL_FLAGS -+extern void pax_set_initial_flags(struct linux_binprm *bprm); -+#elif defined(CONFIG_PAX_HOOK_ACL_FLAGS) -+extern void (*pax_set_initial_flags_func)(struct linux_binprm *bprm); -+#endif -+ -+void pax_report_fault(struct pt_regs *regs, void *pc, void *sp); -+void pax_report_insns(void *pc, void *sp); -+void pax_report_refcount_overflow(struct pt_regs *regs); -+void pax_report_leak_to_user(const void *ptr, unsigned long len); -+void pax_report_overflow_from_user(const void *ptr, unsigned long len); -+ - /* Future-safe accessor for struct task_struct's cpus_allowed. 
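The MF_PAX_* constants defined above pack each PaX feature into one bit of the new mm->pax_flags word, which pax_get_flags()/pax_set_flags() read or replace under task_lock. A compact model of the flag word (bit values copied from the hunk, the struct reduced to the one field):

    #include <stdio.h>

    #define MF_PAX_PAGEEXEC 0x01000000UL  /* paging-based non-exec pages */
    #define MF_PAX_EMUTRAMP 0x02000000UL  /* emulate trampolines         */
    #define MF_PAX_MPROTECT 0x04000000UL  /* restrict mprotect()         */
    #define MF_PAX_RANDMMAP 0x08000000UL  /* randomize mmap() base       */
    #define MF_PAX_SEGMEXEC 0x20000000UL  /* segmentation-based non-exec */

    struct mm { unsigned long pax_flags; };

    int main(void)
    {
        struct mm mm = { 0 };

        mm.pax_flags |= MF_PAX_PAGEEXEC | MF_PAX_RANDMMAP;

        if (mm.pax_flags & MF_PAX_RANDMMAP)
            printf("mmap base randomization on\n");
        if (!(mm.pax_flags & MF_PAX_MPROTECT))
            printf("mprotect() restrictions off\n");
        return 0;
    }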
*/ - #define tsk_cpumask(tsk) (&(tsk)->cpus_allowed) - -@@ -2046,7 +2115,7 @@ extern void __cleanup_sighand(struct sig - extern void exit_itimers(struct signal_struct *); - extern void flush_itimer_signals(void); - --extern NORET_TYPE void do_group_exit(int); -+extern NORET_TYPE void do_group_exit(int) ATTRIB_NORET; - - extern void daemonize(const char *, ...); - extern int allow_signal(int); -@@ -2159,8 +2228,8 @@ static inline void unlock_task_sighand(s - - #ifndef __HAVE_THREAD_FUNCTIONS - --#define task_thread_info(task) ((struct thread_info *)(task)->stack) --#define task_stack_page(task) ((task)->stack) -+#define task_thread_info(task) ((task)->stack) -+#define task_stack_page(task) ((void *)(task)->stack) - - static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) - { -@@ -2175,7 +2244,7 @@ static inline unsigned long *end_of_stac - - #endif - --static inline int object_is_on_stack(void *obj) -+static inline int object_is_on_stack(const void *obj) - { - void *stack = task_stack_page(current); - -diff -urNp linux-2.6.31.1/include/linux/screen_info.h linux-2.6.31.1/include/linux/screen_info.h ---- linux-2.6.31.1/include/linux/screen_info.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/screen_info.h 2009-10-01 20:12:44.000000000 -0400 -@@ -42,7 +42,8 @@ struct screen_info { - __u16 pages; /* 0x32 */ - __u16 vesa_attributes; /* 0x34 */ - __u32 capabilities; /* 0x36 */ -- __u8 _reserved[6]; /* 0x3a */ -+ __u16 vesapm_size; /* 0x3a */ -+ __u8 _reserved[4]; /* 0x3c */ - } __attribute__((packed)); - - #define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ -diff -urNp linux-2.6.31.1/include/linux/security.h linux-2.6.31.1/include/linux/security.h ---- linux-2.6.31.1/include/linux/security.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/security.h 2009-10-01 20:12:44.000000000 -0400 -@@ -34,6 +34,7 @@ - #include <linux/key.h> - #include <linux/xfrm.h> - #include <linux/gfp.h> -+#include <linux/grsecurity.h> - #include <net/flow.h> - - /* Maximum number of letters for an LSM name string */ -diff -urNp linux-2.6.31.1/include/linux/shm.h linux-2.6.31.1/include/linux/shm.h ---- linux-2.6.31.1/include/linux/shm.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/shm.h 2009-10-01 20:12:44.000000000 -0400 -@@ -95,6 +95,10 @@ struct shmid_kernel /* private to the ke - pid_t shm_cprid; - pid_t shm_lprid; - struct user_struct *mlock_user; -+#ifdef CONFIG_GRKERNSEC -+ time_t shm_createtime; -+ pid_t shm_lapid; -+#endif - }; - - /* shm_mode upper byte flags */ -diff -urNp linux-2.6.31.1/include/linux/slab.h linux-2.6.31.1/include/linux/slab.h ---- linux-2.6.31.1/include/linux/slab.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/slab.h 2009-10-01 20:12:44.000000000 -0400 -@@ -82,10 +82,9 @@ - * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can. - * Both make kfree a no-op. - */ --#define ZERO_SIZE_PTR ((void *)16) -+#define ZERO_SIZE_PTR ((void *)-1024L) - --#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ -- (unsigned long)ZERO_SIZE_PTR) -+#define ZERO_OR_NULL_PTR(x) (!(x) || (x) == ZERO_SIZE_PTR) - - /* - * struct kmem_cache related prototypes -@@ -138,6 +137,7 @@ void * __must_check krealloc(const void - void kfree(const void *); - void kzfree(const void *); - size_t ksize(const void *); -+void check_object_size(const void *ptr, unsigned long n, bool to); - - /* - * Allocator specific definitions. 
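The slab.h hunk above relocates ZERO_SIZE_PTR, the non-NULL cookie returned for zero-byte allocations, from address 16 to -1024, presumably for the same reason as the list-poison change earlier: the magic value moves out of anything userspace could map, and ZERO_OR_NULL_PTR() is rewritten to match. The test in isolation:

    #include <stdbool.h>
    #include <stdio.h>

    #define ZERO_SIZE_PTR ((void *)-1024L)   /* top of the address space */

    static bool zero_or_null(const void *p)
    {
        return !p || p == ZERO_SIZE_PTR;
    }

    int main(void)
    {
        int x = 0;
        printf("%d %d %d\n",
               zero_or_null(NULL),            /* 1 */
               zero_or_null(ZERO_SIZE_PTR),   /* 1 */
               zero_or_null(&x));             /* 0 */
        return 0;
    }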
These are mainly used to establish optimized -@@ -328,4 +328,37 @@ static inline void *kzalloc_node(size_t - - void __init kmem_cache_init_late(void); - -+#define kmalloc(x, y) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "kmalloc size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = kmalloc((size_t)___x, (y)); \ -+ ___retval; \ -+}) -+ -+#define kmalloc_node(x, y, z) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "kmalloc_node size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = kmalloc_node((size_t)___x, (y), (z));\ -+ ___retval; \ -+}) -+ -+#define kzalloc(x, y) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "kzalloc size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = kzalloc((size_t)___x, (y)); \ -+ ___retval; \ -+}) -+ - #endif /* _LINUX_SLAB_H */ -diff -urNp linux-2.6.31.1/include/linux/slub_def.h linux-2.6.31.1/include/linux/slub_def.h ---- linux-2.6.31.1/include/linux/slub_def.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/slub_def.h 2009-10-01 20:12:44.000000000 -0400 -@@ -86,7 +86,7 @@ struct kmem_cache { - struct kmem_cache_order_objects max; - struct kmem_cache_order_objects min; - gfp_t allocflags; /* gfp flags to use on each alloc */ -- int refcount; /* Refcount for slab cache destroy */ -+ atomic_t refcount; /* Refcount for slab cache destroy */ - void (*ctor)(void *); - int inuse; /* Offset to metadata */ - int align; /* Alignment */ -diff -urNp linux-2.6.31.1/include/linux/sonet.h linux-2.6.31.1/include/linux/sonet.h ---- linux-2.6.31.1/include/linux/sonet.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/sonet.h 2009-10-01 20:12:44.000000000 -0400 -@@ -61,7 +61,7 @@ struct sonet_stats { - #include <asm/atomic.h> - - struct k_sonet_stats { --#define __HANDLE_ITEM(i) atomic_t i -+#define __HANDLE_ITEM(i) atomic_unchecked_t i - __SONET_ITEMS - #undef __HANDLE_ITEM - }; -diff -urNp linux-2.6.31.1/include/linux/sysctl.h linux-2.6.31.1/include/linux/sysctl.h ---- linux-2.6.31.1/include/linux/sysctl.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/sysctl.h 2009-10-01 20:12:44.000000000 -0400 -@@ -165,7 +165,11 @@ enum - KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ - }; - -- -+#ifdef CONFIG_PAX_SOFTMODE -+enum { -+ PAX_SOFTMODE=1 /* PaX: disable/enable soft mode */ -+}; -+#endif - - /* CTL_VM names: */ - enum -diff -urNp linux-2.6.31.1/include/linux/thread_info.h linux-2.6.31.1/include/linux/thread_info.h ---- linux-2.6.31.1/include/linux/thread_info.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/thread_info.h 2009-10-01 20:12:44.000000000 -0400 -@@ -23,7 +23,7 @@ struct restart_block { - }; - /* For futex_wait and futex_wait_requeue_pi */ - struct { -- u32 *uaddr; -+ u32 __user *uaddr; - u32 val; - u32 flags; - u32 bitset; -diff -urNp linux-2.6.31.1/include/linux/tty_ldisc.h linux-2.6.31.1/include/linux/tty_ldisc.h ---- linux-2.6.31.1/include/linux/tty_ldisc.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/tty_ldisc.h 2009-10-01 20:12:44.000000000 -0400 -@@ -139,7 +139,7 @@ struct tty_ldisc_ops { - - struct module *owner; - -- int refcount; -+ atomic_t refcount; - }; - - struct tty_ldisc { -diff -urNp linux-2.6.31.1/include/linux/types.h linux-2.6.31.1/include/linux/types.h ---- linux-2.6.31.1/include/linux/types.h 
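The kmalloc/kmalloc_node/kzalloc wrapper macros above share one pattern: widen the requested size to intoverflow_t, reject it if it cannot fit the allocator's limits, and only then forward to the real function; the self-reference works because a function-like macro is not re-expanded inside its own replacement text. A hedged userspace sketch of the pattern (checked_malloc and the KMALLOC_MAX cap are stand-ins; GNU C statement expressions assumed):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef int64_t intoverflow_t;
    #define KMALLOC_MAX ((intoverflow_t)(4UL << 20))  /* stand-in cap: 4 MiB */

    #define checked_malloc(x)                                   \
    ({                                                          \
        void *___ret;                                           \
        intoverflow_t ___x = (intoverflow_t)(x);                \
        if (___x < 0 || ___x > KMALLOC_MAX)                     \
            ___ret = NULL;  /* refuse overflowed/huge sizes */  \
        else                                                    \
            ___ret = malloc((size_t)___x);                      \
        ___ret;                                                 \
    })

    int main(void)
    {
        void *ok  = checked_malloc(64);
        void *bad = checked_malloc((intoverflow_t)1 << 40);  /* rejected */

        printf("ok=%p bad=%p\n", ok, bad);
        free(ok);
        return 0;
    }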
2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/types.h 2009-10-01 20:12:44.000000000 -0400 -@@ -191,10 +191,26 @@ typedef struct { - volatile int counter; - } atomic_t; - -+#ifdef CONFIG_PAX_REFCOUNT -+typedef struct { -+ volatile int counter; -+} atomic_unchecked_t; -+#else -+typedef atomic_t atomic_unchecked_t; -+#endif -+ - #ifdef CONFIG_64BIT - typedef struct { - volatile long counter; - } atomic64_t; -+ -+#ifdef CONFIG_PAX_REFCOUNT -+typedef struct { -+ volatile long counter; -+} atomic64_unchecked_t; -+#else -+typedef atomic64_t atomic64_unchecked_t; -+#endif - #endif - - struct ustat { -diff -urNp linux-2.6.31.1/include/linux/uaccess.h linux-2.6.31.1/include/linux/uaccess.h ---- linux-2.6.31.1/include/linux/uaccess.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/uaccess.h 2009-10-01 20:12:44.000000000 -0400 -@@ -76,11 +76,11 @@ static inline unsigned long __copy_from_ - long ret; \ - mm_segment_t old_fs = get_fs(); \ - \ -- set_fs(KERNEL_DS); \ - pagefault_disable(); \ -+ set_fs(KERNEL_DS); \ - ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ -- pagefault_enable(); \ - set_fs(old_fs); \ -+ pagefault_enable(); \ - ret; \ - }) - -diff -urNp linux-2.6.31.1/include/linux/vmalloc.h linux-2.6.31.1/include/linux/vmalloc.h ---- linux-2.6.31.1/include/linux/vmalloc.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/linux/vmalloc.h 2009-10-01 20:12:44.000000000 -0400 -@@ -13,6 +13,11 @@ struct vm_area_struct; /* vma defining - #define VM_MAP 0x00000004 /* vmap()ed pages */ - #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ - #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+#define VM_KERNEXEC 0x00000020 /* allocate from executable kernel memory range */ -+#endif -+ - /* bits [20..32] reserved for arch specific ioremap internals */ - - /* -@@ -115,4 +120,81 @@ extern rwlock_t vmlist_lock; - extern struct vm_struct *vmlist; - extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); - -+#define vmalloc(x) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "vmalloc size overflow\n")) \ -+ ___retval = NULL; \ -+ else \ -+ ___retval = vmalloc((unsigned long)___x); \ -+ ___retval; \ -+}) -+ -+#define __vmalloc(x, y, z) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "__vmalloc size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = __vmalloc((unsigned long)___x, (y), (z));\ -+ ___retval; \ -+}) -+ -+#define vmalloc_user(x) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "vmalloc_user size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = vmalloc_user((unsigned long)___x); \ -+ ___retval; \ -+}) -+ -+#define vmalloc_exec(x) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "vmalloc_exec size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = vmalloc_exec((unsigned long)___x); \ -+ ___retval; \ -+}) -+ -+#define vmalloc_node(x, y) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "vmalloc_node size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = vmalloc_node((unsigned long)___x, (y));\ -+ ___retval; \ -+}) -+ 
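The types.h hunk just below introduces atomic_unchecked_t as a deliberate escape hatch: under PAX_REFCOUNT, ordinary atomic_t operations detect reference-count overflow, so counters that may legitimately wrap (statistics and similar) get the unchecked twin with plain wrapping behaviour. A userspace model using C11 atomics, where a printed report stands in for the real overflow response:

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdio.h>

    typedef struct { atomic_int counter; } atomic_t;            /* checked  */
    typedef struct { atomic_int counter; } atomic_unchecked_t;  /* may wrap */

    static void atomic_inc(atomic_t *v)
    {
        if (atomic_fetch_add(&v->counter, 1) == INT_MAX)
            fprintf(stderr, "refcount overflow detected\n");
    }

    static void atomic_inc_unchecked(atomic_unchecked_t *v)
    {
        atomic_fetch_add(&v->counter, 1);   /* wrapping is fine here */
    }

    int main(void)
    {
        atomic_t ref = { INT_MAX };
        atomic_unchecked_t stat = { INT_MAX };

        atomic_inc(&ref);             /* reports the overflow */
        atomic_inc_unchecked(&stat);  /* silently wraps       */
        printf("stat=%d\n", atomic_load(&stat.counter));
        return 0;
    }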
-+#define vmalloc_32(x) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "vmalloc_32 size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = vmalloc_32((unsigned long)___x); \ -+ ___retval; \ -+}) -+ -+#define vmalloc_32_user(x) \ -+({ \ -+ void *___retval; \ -+ intoverflow_t ___x = (intoverflow_t)x; \ -+ if (WARN(___x > ULONG_MAX, "vmalloc_32_user size overflow\n"))\ -+ ___retval = NULL; \ -+ else \ -+ ___retval = vmalloc_32_user((unsigned long)___x);\ -+ ___retval; \ -+}) -+ - #endif /* _LINUX_VMALLOC_H */ -diff -urNp linux-2.6.31.1/include/net/irda/ircomm_tty.h linux-2.6.31.1/include/net/irda/ircomm_tty.h ---- linux-2.6.31.1/include/net/irda/ircomm_tty.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/net/irda/ircomm_tty.h 2009-10-01 20:12:44.000000000 -0400 -@@ -105,8 +105,8 @@ struct ircomm_tty_cb { - unsigned short close_delay; - unsigned short closing_wait; /* time to wait before closing */ - -- int open_count; -- int blocked_open; /* # of blocked opens */ -+ atomic_t open_count; -+ atomic_t blocked_open; /* # of blocked opens */ - - /* Protect concurent access to : - * o self->open_count -diff -urNp linux-2.6.31.1/include/net/sctp/sctp.h linux-2.6.31.1/include/net/sctp/sctp.h ---- linux-2.6.31.1/include/net/sctp/sctp.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/net/sctp/sctp.h 2009-10-01 20:12:44.000000000 -0400 -@@ -305,8 +305,8 @@ extern int sctp_debug_flag; - - #else /* SCTP_DEBUG */ - --#define SCTP_DEBUG_PRINTK(whatever...) --#define SCTP_DEBUG_PRINTK_IPADDR(whatever...) -+#define SCTP_DEBUG_PRINTK(whatever...) do {} while (0) -+#define SCTP_DEBUG_PRINTK_IPADDR(whatever...) do {} while (0) - #define SCTP_ENABLE_DEBUG - #define SCTP_DISABLE_DEBUG - #define SCTP_ASSERT(expr, str, func) -diff -urNp linux-2.6.31.1/include/sound/core.h linux-2.6.31.1/include/sound/core.h ---- linux-2.6.31.1/include/sound/core.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/sound/core.h 2009-10-01 20:12:44.000000000 -0400 -@@ -430,7 +430,7 @@ static inline int __snd_bug_on(int cond) - */ - #define snd_printdd(format, args...) snd_printk(format, ##args) - #else --#define snd_printdd(format, args...) /* nothing */ -+#define snd_printdd(format, args...) 
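The ircomm_tty hunk above turns open_count and blocked_open into atomic_t; the surrounding comment in the original header already conceded the fields needed protection. The root problem is that ++ on a plain int is a non-atomic read-modify-write. A userspace demonstration (POSIX threads, build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static int plain_count;          /* racy: increments can be lost */
    static atomic_int safe_count;    /* atomic_t stand-in            */

    static void *opener(void *arg)
    {
        (void)arg;
        for (int i = 0; i < 100000; i++) {
            plain_count++;
            atomic_fetch_add(&safe_count, 1);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;
        pthread_create(&a, NULL, opener, NULL);
        pthread_create(&b, NULL, opener, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* plain is typically < 200000; atomic is exactly 200000 */
        printf("plain=%d atomic=%d\n", plain_count,
               atomic_load(&safe_count));
        return 0;
    }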
do {} while (0) - #endif - - -diff -urNp linux-2.6.31.1/include/video/uvesafb.h linux-2.6.31.1/include/video/uvesafb.h ---- linux-2.6.31.1/include/video/uvesafb.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/include/video/uvesafb.h 2009-10-01 20:12:44.000000000 -0400 -@@ -177,6 +177,7 @@ struct uvesafb_par { - u8 ypan; /* 0 - nothing, 1 - ypan, 2 - ywrap */ - u8 pmi_setpal; /* PMI for palette changes */ - u16 *pmi_base; /* protected mode interface location */ -+ u8 *pmi_code; /* protected mode code location */ - void *pmi_start; - void *pmi_pal; - u8 *vbe_state_orig; /* -diff -urNp linux-2.6.31.1/init/do_mounts.c linux-2.6.31.1/init/do_mounts.c ---- linux-2.6.31.1/init/do_mounts.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/do_mounts.c 2009-10-01 20:12:44.000000000 -0400 -@@ -216,11 +216,11 @@ static void __init get_fs_names(char *pa - - static int __init do_mount_root(char *name, char *fs, int flags, void *data) - { -- int err = sys_mount(name, "/root", fs, flags, data); -+ int err = sys_mount((char __user *)name, (char __user *)"/root", (char __user *)fs, flags, (void __user *)data); - if (err) - return err; - -- sys_chdir("/root"); -+ sys_chdir((char __user *)"/root"); - ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev; - printk("VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", - current->fs->pwd.mnt->mnt_sb->s_type->name, -@@ -311,18 +311,18 @@ void __init change_floppy(char *fmt, ... - va_start(args, fmt); - vsprintf(buf, fmt, args); - va_end(args); -- fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0); -+ fd = sys_open((char __user *)"/dev/root", O_RDWR | O_NDELAY, 0); - if (fd >= 0) { - sys_ioctl(fd, FDEJECT, 0); - sys_close(fd); - } - printk(KERN_NOTICE "VFS: Insert %s and press ENTER\n", buf); -- fd = sys_open("/dev/console", O_RDWR, 0); -+ fd = sys_open((char __user *)"/dev/console", O_RDWR, 0); - if (fd >= 0) { - sys_ioctl(fd, TCGETS, (long)&termios); - termios.c_lflag &= ~ICANON; - sys_ioctl(fd, TCSETSF, (long)&termios); -- sys_read(fd, &c, 1); -+ sys_read(fd, (char __user *)&c, 1); - termios.c_lflag |= ICANON; - sys_ioctl(fd, TCSETSF, (long)&termios); - sys_close(fd); -@@ -415,7 +415,7 @@ void __init prepare_namespace(void) - - mount_root(); - out: -- sys_mount(".", "/", NULL, MS_MOVE, NULL); -- sys_chroot("."); -+ sys_mount((char __user *)".", (char __user *)"/", NULL, MS_MOVE, NULL); -+ sys_chroot((char __user *)"."); - } - -diff -urNp linux-2.6.31.1/init/do_mounts.h linux-2.6.31.1/init/do_mounts.h ---- linux-2.6.31.1/init/do_mounts.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/do_mounts.h 2009-10-01 20:12:44.000000000 -0400 -@@ -15,15 +15,15 @@ extern int root_mountflags; - - static inline int create_dev(char *name, dev_t dev) - { -- sys_unlink(name); -- return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); -+ sys_unlink((char __user *)name); -+ return sys_mknod((char __user *)name, S_IFBLK|0600, new_encode_dev(dev)); - } - - #if BITS_PER_LONG == 32 - static inline u32 bstat(char *name) - { - struct stat64 stat; -- if (sys_stat64(name, &stat) != 0) -+ if (sys_stat64((char __user *)name, (struct stat64 __user *)&stat) != 0) - return 0; - if (!S_ISBLK(stat.st_mode)) - return 0; -diff -urNp linux-2.6.31.1/init/do_mounts_initrd.c linux-2.6.31.1/init/do_mounts_initrd.c ---- linux-2.6.31.1/init/do_mounts_initrd.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/do_mounts_initrd.c 2009-10-01 20:12:44.000000000 -0400 -@@ -32,7 +32,7 @@ static int __init do_linuxrc(void * shel - sys_close(old_fd);sys_close(root_fd); - 
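Most of the init/ churn above and below is one change repeated: kernel strings handed to sys_* helpers gain explicit (char __user *) casts. __user is a sparse annotation (empty in a normal compile) that tags the address space a pointer belongs to, and the cast records that the in-kernel caller is knowingly crossing that boundary. A self-contained illustration (demo function, not a real syscall):

    #ifdef __CHECKER__
    # define __user __attribute__((noderef, address_space(1)))
    #else
    # define __user
    #endif

    #include <stdio.h>

    /* a syscall-like API that expects a user-space pointer */
    static long sys_open_demo(const char __user *name, int flags)
    {
        (void)name; (void)flags;
        return 3;
    }

    int main(void)
    {
        /* without the cast, sparse flags the address-space mismatch */
        long fd = sys_open_demo((const char __user *)"/dev/console", 0);
        printf("fd=%ld\n", fd);
        return 0;
    }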
sys_close(0);sys_close(1);sys_close(2); - sys_setsid(); -- (void) sys_open("/dev/console",O_RDWR,0); -+ (void) sys_open((const char __user *)"/dev/console",O_RDWR,0); - (void) sys_dup(0); - (void) sys_dup(0); - return kernel_execve(shell, argv, envp_init); -@@ -47,13 +47,13 @@ static void __init handle_initrd(void) - create_dev("/dev/root.old", Root_RAM0); - /* mount initrd on rootfs' /root */ - mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); -- sys_mkdir("/old", 0700); -- root_fd = sys_open("/", 0, 0); -- old_fd = sys_open("/old", 0, 0); -+ sys_mkdir((const char __user *)"/old", 0700); -+ root_fd = sys_open((const char __user *)"/", 0, 0); -+ old_fd = sys_open((const char __user *)"/old", 0, 0); - /* move initrd over / and chdir/chroot in initrd root */ -- sys_chdir("/root"); -- sys_mount(".", "/", NULL, MS_MOVE, NULL); -- sys_chroot("."); -+ sys_chdir((const char __user *)"/root"); -+ sys_mount((char __user *)".", (char __user *)"/", NULL, MS_MOVE, NULL); -+ sys_chroot((const char __user *)"."); - - /* - * In case that a resume from disk is carried out by linuxrc or one of -@@ -70,15 +70,15 @@ static void __init handle_initrd(void) - - /* move initrd to rootfs' /old */ - sys_fchdir(old_fd); -- sys_mount("/", ".", NULL, MS_MOVE, NULL); -+ sys_mount((char __user *)"/", (char __user *)".", NULL, MS_MOVE, NULL); - /* switch root and cwd back to / of rootfs */ - sys_fchdir(root_fd); -- sys_chroot("."); -+ sys_chroot((const char __user *)"."); - sys_close(old_fd); - sys_close(root_fd); - - if (new_decode_dev(real_root_dev) == Root_RAM0) { -- sys_chdir("/old"); -+ sys_chdir((const char __user *)"/old"); - return; - } - -@@ -86,17 +86,17 @@ static void __init handle_initrd(void) - mount_root(); - - printk(KERN_NOTICE "Trying to move old root to /initrd ... "); -- error = sys_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); -+ error = sys_mount((char __user *)"/old", (char __user *)"/root/initrd", NULL, MS_MOVE, NULL); - if (!error) - printk("okay\n"); - else { -- int fd = sys_open("/dev/root.old", O_RDWR, 0); -+ int fd = sys_open((const char __user *)"/dev/root.old", O_RDWR, 0); - if (error == -ENOENT) - printk("/initrd does not exist. Ignored.\n"); - else - printk("failed\n"); - printk(KERN_NOTICE "Unmounting old root\n"); -- sys_umount("/old", MNT_DETACH); -+ sys_umount((char __user *)"/old", MNT_DETACH); - printk(KERN_NOTICE "Trying to free ramdisk memory ... "); - if (fd < 0) { - error = fd; -@@ -119,11 +119,11 @@ int __init initrd_load(void) - * mounted in the normal path. - */ - if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { -- sys_unlink("/initrd.image"); -+ sys_unlink((const char __user *)"/initrd.image"); - handle_initrd(); - return 1; - } - } -- sys_unlink("/initrd.image"); -+ sys_unlink((const char __user *)"/initrd.image"); - return 0; - } -diff -urNp linux-2.6.31.1/init/do_mounts_md.c linux-2.6.31.1/init/do_mounts_md.c ---- linux-2.6.31.1/init/do_mounts_md.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/do_mounts_md.c 2009-10-01 20:12:44.000000000 -0400 -@@ -170,7 +170,7 @@ static void __init md_setup_drive(void) - partitioned ? 
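handle_initrd() above performs the classic root switch twice over: chdir into the new root, mount-move it over "/", chroot into ".". The equivalent userspace syscall sequence, shown as a sketch (must run as root, paths assumed):

    #include <stdio.h>
    #include <sys/mount.h>
    #include <unistd.h>

    int main(void)
    {
        if (chdir("/root") ||                        /* enter the new root */
            mount(".", "/", NULL, MS_MOVE, NULL) ||  /* splice it over "/" */
            chroot("."))                             /* and switch into it */
            perror("root switch");
        else
            puts("root switched");
        return 0;
    }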
"_d" : "", minor, - md_setup_args[ent].device_names); - -- fd = sys_open(name, 0, 0); -+ fd = sys_open((char __user *)name, 0, 0); - if (fd < 0) { - printk(KERN_ERR "md: open failed - cannot start " - "array %s\n", name); -@@ -233,7 +233,7 @@ static void __init md_setup_drive(void) - * array without it - */ - sys_close(fd); -- fd = sys_open(name, 0, 0); -+ fd = sys_open((char __user *)name, 0, 0); - sys_ioctl(fd, BLKRRPART, 0); - } - sys_close(fd); -@@ -283,7 +283,7 @@ static void __init autodetect_raid(void) - - wait_for_device_probe(); - -- fd = sys_open("/dev/md0", 0, 0); -+ fd = sys_open((char __user *)"/dev/md0", 0, 0); - if (fd >= 0) { - sys_ioctl(fd, RAID_AUTORUN, raid_autopart); - sys_close(fd); -diff -urNp linux-2.6.31.1/init/initramfs.c linux-2.6.31.1/init/initramfs.c ---- linux-2.6.31.1/init/initramfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/initramfs.c 2009-10-01 20:12:44.000000000 -0400 -@@ -271,7 +271,7 @@ static int __init maybe_link(void) - if (nlink >= 2) { - char *old = find_link(major, minor, ino, mode, collected); - if (old) -- return (sys_link(old, collected) < 0) ? -1 : 1; -+ return (sys_link((char __user *)old, (char __user *)collected) < 0) ? -1 : 1; - } - return 0; - } -@@ -280,11 +280,11 @@ static void __init clean_path(char *path - { - struct stat st; - -- if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) { -+ if (!sys_newlstat((char __user *)path, (struct stat __user *)&st) && (st.st_mode^mode) & S_IFMT) { - if (S_ISDIR(st.st_mode)) -- sys_rmdir(path); -+ sys_rmdir((char __user *)path); - else -- sys_unlink(path); -+ sys_unlink((char __user *)path); - } - } - -@@ -305,7 +305,7 @@ static int __init do_name(void) - int openflags = O_WRONLY|O_CREAT; - if (ml != 1) - openflags |= O_TRUNC; -- wfd = sys_open(collected, openflags, mode); -+ wfd = sys_open((char __user *)collected, openflags, mode); - - if (wfd >= 0) { - sys_fchown(wfd, uid, gid); -@@ -317,16 +317,16 @@ static int __init do_name(void) - } - } - } else if (S_ISDIR(mode)) { -- sys_mkdir(collected, mode); -- sys_chown(collected, uid, gid); -- sys_chmod(collected, mode); -+ sys_mkdir((char __user *)collected, mode); -+ sys_chown((char __user *)collected, uid, gid); -+ sys_chmod((char __user *)collected, mode); - dir_add(collected, mtime); - } else if (S_ISBLK(mode) || S_ISCHR(mode) || - S_ISFIFO(mode) || S_ISSOCK(mode)) { - if (maybe_link() == 0) { -- sys_mknod(collected, mode, rdev); -- sys_chown(collected, uid, gid); -- sys_chmod(collected, mode); -+ sys_mknod((char __user *)collected, mode, rdev); -+ sys_chown((char __user *)collected, uid, gid); -+ sys_chmod((char __user *)collected, mode); - do_utime(collected, mtime); - } - } -@@ -336,7 +336,7 @@ static int __init do_name(void) - static int __init do_copy(void) - { - if (count >= body_len) { -- sys_write(wfd, victim, body_len); -+ sys_write(wfd, (char __user *)victim, body_len); - sys_close(wfd); - do_utime(vcollected, mtime); - kfree(vcollected); -@@ -344,7 +344,7 @@ static int __init do_copy(void) - state = SkipIt; - return 0; - } else { -- sys_write(wfd, victim, count); -+ sys_write(wfd, (char __user *)victim, count); - body_len -= count; - eat(count); - return 1; -@@ -355,8 +355,8 @@ static int __init do_symlink(void) - { - collected[N_ALIGN(name_len) + body_len] = '\0'; - clean_path(collected, 0); -- sys_symlink(collected + N_ALIGN(name_len), collected); -- sys_lchown(collected, uid, gid); -+ sys_symlink((char __user *)collected + N_ALIGN(name_len), (char __user *)collected); -+ sys_lchown((char __user *)collected, 
uid, gid); - do_utime(collected, mtime); - state = SkipIt; - next_state = Reset; -diff -urNp linux-2.6.31.1/init/Kconfig linux-2.6.31.1/init/Kconfig ---- linux-2.6.31.1/init/Kconfig 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/Kconfig 2009-10-01 20:12:44.000000000 -0400 -@@ -1014,7 +1014,7 @@ config STRIP_ASM_SYMS - - config COMPAT_BRK - bool "Disable heap randomization" -- default y -+ default n - help - Randomizing heap placement makes heap exploits harder, but it - also breaks ancient binaries (including anything libc5 based). -@@ -1101,9 +1101,9 @@ config HAVE_GENERIC_DMA_COHERENT - - config SLABINFO - bool -- depends on PROC_FS -+ depends on PROC_FS && !GRKERNSEC_PROC_ADD - depends on SLAB || SLUB_DEBUG -- default y -+ default n - - config RT_MUTEXES - boolean -diff -urNp linux-2.6.31.1/init/main.c linux-2.6.31.1/init/main.c ---- linux-2.6.31.1/init/main.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/main.c 2009-10-01 20:12:44.000000000 -0400 -@@ -96,6 +96,7 @@ static inline void mark_rodata_ro(void) - #ifdef CONFIG_TC - extern void tc_init(void); - #endif -+extern void grsecurity_init(void); - - enum system_states system_state __read_mostly; - EXPORT_SYMBOL(system_state); -@@ -182,6 +183,35 @@ static int __init set_reset_devices(char - - __setup("reset_devices", set_reset_devices); - -+#if defined(CONFIG_PAX_MEMORY_UDEREF) && defined(CONFIG_X86_32) -+static int __init setup_pax_nouderef(char *str) -+{ -+ unsigned int cpu; -+ -+ for (cpu = 0; cpu < NR_CPUS; cpu++) { -+ get_cpu_gdt_table(cpu)[GDT_ENTRY_KERNEL_DS].type = 3; -+ get_cpu_gdt_table(cpu)[GDT_ENTRY_KERNEL_DS].limit = 0xf; -+ } -+ asm("mov %0, %%ds" : : "r" (__KERNEL_DS) : "memory"); -+ asm("mov %0, %%es" : : "r" (__KERNEL_DS) : "memory"); -+ asm("mov %0, %%ss" : : "r" (__KERNEL_DS) : "memory"); -+ -+ return 0; -+} -+early_param("pax_nouderef", setup_pax_nouderef); -+#endif -+ -+#ifdef CONFIG_PAX_SOFTMODE -+unsigned int pax_softmode; -+ -+static int __init setup_pax_softmode(char *str) -+{ -+ get_option(&str, &pax_softmode); -+ return 1; -+} -+__setup("pax_softmode=", setup_pax_softmode); -+#endif -+ - static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; - char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; - static const char *panic_later, *panic_param; -@@ -375,7 +405,7 @@ static void __init setup_nr_cpu_ids(void - } - - #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA --unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; -+unsigned long __per_cpu_offset[NR_CPUS] __read_only; - - EXPORT_SYMBOL(__per_cpu_offset); - -@@ -741,6 +771,7 @@ int do_one_initcall(initcall_t fn) - { - int count = preempt_count(); - ktime_t calltime, delta, rettime; -+ const char *msg1 = "", *msg2 = ""; - - if (initcall_debug) { - call.caller = task_pid_nr(current); -@@ -768,15 +799,15 @@ int do_one_initcall(initcall_t fn) - sprintf(msgbuf, "error code %d ", ret.result); - - if (preempt_count() != count) { -- strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); -+ msg1 = " preemption imbalance"; - preempt_count() = count; - } - if (irqs_disabled()) { -- strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); -+ msg2 = " disabled interrupts"; - local_irq_enable(); - } -- if (msgbuf[0]) { -- printk("initcall %pF returned with %s\n", fn, msgbuf); -+ if (msgbuf[0] || *msg1 || *msg2) { -+ printk("initcall %pF returned with %s%s%s\n", fn, msgbuf, msg1, msg2); - } - - return ret.result; -@@ -923,6 +954,8 @@ static int __init kernel_init(void * unu - prepare_namespace(); - } - -+ grsecurity_init(); -+ 
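setup_pax_softmode() in the init/main.c hunk above is registered with __setup("pax_softmode=", ...), which hooks it to the kernel command line; get_option() parses the integer value. A userspace model of just the parse step (the cmdline string and lookup are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static unsigned int pax_softmode;

    static void setup_pax_softmode(const char *str)
    {
        pax_softmode = (unsigned int)strtoul(str, NULL, 0);
    }

    int main(void)
    {
        const char *cmdline = "ro root=/dev/sda1 pax_softmode=1";
        const char *key = "pax_softmode=";
        const char *opt = strstr(cmdline, key);

        if (opt)
            setup_pax_softmode(opt + strlen(key));
        printf("pax_softmode=%u\n", pax_softmode);
        return 0;
    }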
- /* - * Ok, we have completed the initial bootup, and - * we're essentially up and running. Get rid of the -diff -urNp linux-2.6.31.1/init/noinitramfs.c linux-2.6.31.1/init/noinitramfs.c ---- linux-2.6.31.1/init/noinitramfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/init/noinitramfs.c 2009-10-01 20:12:44.000000000 -0400 -@@ -29,7 +29,7 @@ static int __init default_rootfs(void) - { - int err; - -- err = sys_mkdir("/dev", 0755); -+ err = sys_mkdir((const char __user *)"/dev", 0755); - if (err < 0) - goto out; - -@@ -39,7 +39,7 @@ static int __init default_rootfs(void) - if (err < 0) - goto out; - -- err = sys_mkdir("/root", 0700); -+ err = sys_mkdir((const char __user *)"/root", 0700); - if (err < 0) - goto out; - -diff -urNp linux-2.6.31.1/ipc/ipc_sysctl.c linux-2.6.31.1/ipc/ipc_sysctl.c ---- linux-2.6.31.1/ipc/ipc_sysctl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/ipc/ipc_sysctl.c 2009-10-01 20:12:44.000000000 -0400 -@@ -267,7 +267,7 @@ static struct ctl_table ipc_kern_table[] - .extra1 = &zero, - .extra2 = &one, - }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static struct ctl_table ipc_root_table[] = { -@@ -277,7 +277,7 @@ static struct ctl_table ipc_root_table[] - .mode = 0555, - .child = ipc_kern_table, - }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static int __init ipc_sysctl_init(void) -diff -urNp linux-2.6.31.1/ipc/mqueue.c linux-2.6.31.1/ipc/mqueue.c ---- linux-2.6.31.1/ipc/mqueue.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/ipc/mqueue.c 2009-10-01 20:12:44.000000000 -0400 -@@ -77,7 +77,7 @@ struct mqueue_inode_info { - - static const struct inode_operations mqueue_dir_inode_operations; - static const struct file_operations mqueue_file_operations; --static struct super_operations mqueue_super_ops; -+static const struct super_operations mqueue_super_ops; - static void remove_notification(struct mqueue_inode_info *info); - - static struct kmem_cache *mqueue_inode_cachep; -@@ -150,6 +150,7 @@ static struct inode *mqueue_get_inode(st - mq_bytes = (mq_msg_tblsz + - (info->attr.mq_maxmsg * info->attr.mq_msgsize)); - -+ gr_learn_resource(current, RLIMIT_MSGQUEUE, u->mq_bytes + mq_bytes, 1); - spin_lock(&mq_lock); - if (u->mq_bytes + mq_bytes < u->mq_bytes || - u->mq_bytes + mq_bytes > -@@ -1224,7 +1225,7 @@ static const struct file_operations mque - .read = mqueue_read_file, - }; - --static struct super_operations mqueue_super_ops = { -+static const struct super_operations mqueue_super_ops = { - .alloc_inode = mqueue_alloc_inode, - .destroy_inode = mqueue_destroy_inode, - .statfs = simple_statfs, -diff -urNp linux-2.6.31.1/ipc/shm.c linux-2.6.31.1/ipc/shm.c ---- linux-2.6.31.1/ipc/shm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/ipc/shm.c 2009-10-01 20:12:44.000000000 -0400 -@@ -55,7 +55,7 @@ struct shm_file_data { - #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) - - static const struct file_operations shm_file_operations; --static struct vm_operations_struct shm_vm_ops; -+static const struct vm_operations_struct shm_vm_ops; - - #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) - -@@ -70,6 +70,14 @@ static void shm_destroy (struct ipc_name - static int sysvipc_shm_proc_show(struct seq_file *s, void *it); - #endif - -+#ifdef CONFIG_GRKERNSEC -+extern int gr_handle_shmat(const pid_t shm_cprid, const pid_t shm_lapid, -+ const time_t shm_createtime, const uid_t cuid, -+ const int shmid); -+extern int gr_chroot_shmat(const pid_t shm_cprid, 
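The ipc_sysctl.c change above spells out the zero terminator of each ctl_table instead of using empty braces; both forms zero-fill the sentinel entry, and table walkers stop at the first all-zero element either way. The idiom in miniature:

    #include <stdio.h>

    struct ctl_entry {
        int ctl_name;
        const char *procname;
    };

    static const struct ctl_entry table[] = {
        { 1, "shmmax" },
        { 2, "shmall" },
        { 0, NULL },          /* explicit zero terminator */
    };

    int main(void)
    {
        for (const struct ctl_entry *e = table; e->procname; e++)
            printf("%d: %s\n", e->ctl_name, e->procname);
        return 0;
    }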
const pid_t shm_lapid, -+ const time_t shm_createtime); -+#endif -+ - void shm_init_ns(struct ipc_namespace *ns) - { - ns->shm_ctlmax = SHMMAX; -@@ -312,7 +320,7 @@ static const struct file_operations shm_ - .get_unmapped_area = shm_get_unmapped_area, - }; - --static struct vm_operations_struct shm_vm_ops = { -+static const struct vm_operations_struct shm_vm_ops = { - .open = shm_open, /* callback for a new vm-area open */ - .close = shm_close, /* callback for when the vm-area is released */ - .fault = shm_fault, -@@ -395,6 +403,14 @@ static int newseg(struct ipc_namespace * - shp->shm_lprid = 0; - shp->shm_atim = shp->shm_dtim = 0; - shp->shm_ctim = get_seconds(); -+#ifdef CONFIG_GRKERNSEC -+ { -+ struct timespec timeval; -+ do_posix_clock_monotonic_gettime(&timeval); -+ -+ shp->shm_createtime = timeval.tv_sec; -+ } -+#endif - shp->shm_segsz = size; - shp->shm_nattch = 0; - shp->shm_file = file; -@@ -878,9 +894,21 @@ long do_shmat(int shmid, char __user *sh - if (err) - goto out_unlock; - -+#ifdef CONFIG_GRKERNSEC -+ if (!gr_handle_shmat(shp->shm_cprid, shp->shm_lapid, shp->shm_createtime, -+ shp->shm_perm.cuid, shmid) || -+ !gr_chroot_shmat(shp->shm_cprid, shp->shm_lapid, shp->shm_createtime)) { -+ err = -EACCES; -+ goto out_unlock; -+ } -+#endif -+ - path.dentry = dget(shp->shm_file->f_path.dentry); - path.mnt = shp->shm_file->f_path.mnt; - shp->shm_nattch++; -+#ifdef CONFIG_GRKERNSEC -+ shp->shm_lapid = current->pid; -+#endif - size = i_size_read(path.dentry->d_inode); - shm_unlock(shp); - -diff -urNp linux-2.6.31.1/ipc/util.c linux-2.6.31.1/ipc/util.c ---- linux-2.6.31.1/ipc/util.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/ipc/util.c 2009-10-01 20:12:44.000000000 -0400 -@@ -942,7 +942,7 @@ static int sysvipc_proc_show(struct seq_ - return iface->show(s, it); - } - --static struct seq_operations sysvipc_proc_seqops = { -+static const struct seq_operations sysvipc_proc_seqops = { - .start = sysvipc_proc_start, - .stop = sysvipc_proc_stop, - .next = sysvipc_proc_next, -diff -urNp linux-2.6.31.1/kernel/acct.c linux-2.6.31.1/kernel/acct.c ---- linux-2.6.31.1/kernel/acct.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/acct.c 2009-10-01 20:12:44.000000000 -0400 -@@ -574,7 +574,7 @@ static void do_acct_process(struct bsd_a - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; -- file->f_op->write(file, (char *)&ac, -+ file->f_op->write(file, (char __user *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); -diff -urNp linux-2.6.31.1/kernel/capability.c linux-2.6.31.1/kernel/capability.c ---- linux-2.6.31.1/kernel/capability.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/capability.c 2009-10-01 20:12:44.000000000 -0400 -@@ -306,10 +306,21 @@ int capable(int cap) - BUG(); - } - -- if (security_capable(cap) == 0) { -+ if (security_capable(cap) == 0 && gr_is_capable(cap)) { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; - } -+ -+int capable_nolog(int cap) -+{ -+ if (security_capable(cap) == 0 && gr_is_capable_nolog(cap)) { -+ current->flags |= PF_SUPERPRIV; -+ return 1; -+ } -+ return 0; -+} -+ - EXPORT_SYMBOL(capable); -+EXPORT_SYMBOL(capable_nolog); -diff -urNp linux-2.6.31.1/kernel/cgroup.c linux-2.6.31.1/kernel/cgroup.c ---- linux-2.6.31.1/kernel/cgroup.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/cgroup.c 2009-10-01 20:12:44.000000000 -0400 -@@ -596,8 +596,8 @@ void cgroup_unlock(void) 
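kernel/capability.c below layers the grsecurity RBAC verdict on top of the LSM one: capable() now requires both to agree, and capable_nolog() performs the same test for callers that should not generate audit noise. A stubbed sketch of the control flow (all verdict functions faked):

    #include <stdbool.h>
    #include <stdio.h>

    static int  security_capable(int cap)    { (void)cap; return 0;    }
    static bool gr_is_capable(int cap)       { (void)cap; return true; }
    static bool gr_is_capable_nolog(int cap) { (void)cap; return true; }

    static int capable(int cap)
    {
        /* both the LSM and the RBAC layer must grant the capability */
        return security_capable(cap) == 0 && gr_is_capable(cap);
    }

    static int capable_nolog(int cap)
    {
        return security_capable(cap) == 0 && gr_is_capable_nolog(cap);
    }

    int main(void)
    {
        printf("capable=%d nolog=%d\n", capable(21), capable_nolog(21));
        return 0;
    }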
- static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); - static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); - static int cgroup_populate_dir(struct cgroup *cgrp); --static struct inode_operations cgroup_dir_inode_operations; --static struct file_operations proc_cgroupstats_operations; -+static const struct inode_operations cgroup_dir_inode_operations; -+static const struct file_operations proc_cgroupstats_operations; - - static struct backing_dev_info cgroup_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -@@ -960,7 +960,7 @@ static int cgroup_remount(struct super_b - return ret; - } - --static struct super_operations cgroup_ops = { -+static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, -@@ -1643,7 +1643,7 @@ static int cgroup_seqfile_release(struct - return single_release(inode, file); - } - --static struct file_operations cgroup_seqfile_operations = { -+static const struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, -@@ -1702,7 +1702,7 @@ static int cgroup_rename(struct inode *o - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); - } - --static struct file_operations cgroup_file_operations = { -+static const struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, -@@ -1710,7 +1710,7 @@ static struct file_operations cgroup_fil - .release = cgroup_file_release, - }; - --static struct inode_operations cgroup_dir_inode_operations = { -+static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, -@@ -2313,7 +2313,7 @@ static int cgroup_tasks_show(struct seq_ - return seq_printf(s, "%d\n", *(int *)v); - } - --static struct seq_operations cgroup_tasks_seq_operations = { -+static const struct seq_operations cgroup_tasks_seq_operations = { - .start = cgroup_tasks_start, - .stop = cgroup_tasks_stop, - .next = cgroup_tasks_next, -@@ -2350,7 +2350,7 @@ static int cgroup_tasks_release(struct i - return seq_release(inode, file); - } - --static struct file_operations cgroup_tasks_operations = { -+static const struct file_operations cgroup_tasks_operations = { - .read = seq_read, - .llseek = seq_lseek, - .write = cgroup_file_write, -@@ -3016,7 +3016,7 @@ static int cgroup_open(struct inode *ino - return single_open(file, proc_cgroup_show, pid); - } - --struct file_operations proc_cgroup_operations = { -+const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, -@@ -3045,7 +3045,7 @@ static int cgroupstats_open(struct inode - return single_open(file, proc_cgroupstats_show, NULL); - } - --static struct file_operations proc_cgroupstats_operations = { -+static const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/kernel/configs.c linux-2.6.31.1/kernel/configs.c ---- linux-2.6.31.1/kernel/configs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/configs.c 2009-10-01 20:12:44.000000000 -0400 -@@ -73,8 +73,19 @@ static int __init ikconfig_init(void) - struct proc_dir_entry *entry; - - /* create the current config file */ -+#if defined(CONFIG_GRKERNSEC_PROC_ADD) || defined(CONFIG_GRKERNSEC_HIDESYM) -+#if 
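Both proc_create_grsec() earlier and the ikconfig hunk straddling this point narrow /proc file modes when the GRKERNSEC_PROC_* options are set: world-readable 0444 drops to root-only 0400 or root-plus-group 0440. The mode arithmetic, for reference:

    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
        printf("S_IRUGO         = %04o\n", S_IRUSR | S_IRGRP | S_IROTH); /* 0444 */
        printf("S_IRUSR         = %04o\n", S_IRUSR);                     /* 0400 */
        printf("S_IRUSR|S_IRGRP = %04o\n", S_IRUSR | S_IRGRP);           /* 0440 */
        return 0;
    }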
defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_HIDESYM) -+ entry = proc_create("config.gz", S_IFREG | S_IRUSR, NULL, -+ &ikconfig_file_ops); -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ entry = proc_create("config.gz", S_IFREG | S_IRUSR | S_IRGRP, NULL, -+ &ikconfig_file_ops); -+#endif -+#else - entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, - &ikconfig_file_ops); -+#endif -+ - if (!entry) - return -ENOMEM; - -diff -urNp linux-2.6.31.1/kernel/cpu.c linux-2.6.31.1/kernel/cpu.c ---- linux-2.6.31.1/kernel/cpu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/cpu.c 2009-10-01 20:12:44.000000000 -0400 -@@ -19,7 +19,7 @@ - /* Serializes the updates to cpu_online_mask, cpu_present_mask */ - static DEFINE_MUTEX(cpu_add_remove_lock); - --static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); -+static RAW_NOTIFIER_HEAD(cpu_chain); - - /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. - * Should always be manipulated under cpu_add_remove_lock -diff -urNp linux-2.6.31.1/kernel/cred.c linux-2.6.31.1/kernel/cred.c ---- linux-2.6.31.1/kernel/cred.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/cred.c 2009-10-01 20:12:44.000000000 -0400 -@@ -366,6 +366,8 @@ int commit_creds(struct cred *new) - - get_cred(new); /* we will require a ref for the subj creds too */ - -+ gr_set_role_label(task, new->uid, new->gid); -+ - /* dumpability changes */ - if (old->euid != new->euid || - old->egid != new->egid || -diff -urNp linux-2.6.31.1/kernel/exit.c linux-2.6.31.1/kernel/exit.c ---- linux-2.6.31.1/kernel/exit.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/exit.c 2009-10-01 20:12:44.000000000 -0400 -@@ -56,6 +56,10 @@ - #include <asm/mmu_context.h> - #include "cred-internals.h" - -+#ifdef CONFIG_GRKERNSEC -+extern rwlock_t grsec_exec_file_lock; -+#endif -+ - static void exit_mm(struct task_struct * tsk); - - static void __unhash_process(struct task_struct *p) -@@ -167,6 +171,8 @@ void release_task(struct task_struct * p - struct task_struct *leader; - int zap_leader; - repeat: -+ gr_del_task_from_ip_table(p); -+ - tracehook_prepare_release_task(p); - /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials */ -@@ -334,11 +340,22 @@ static void reparent_to_kthreadd(void) - { - write_lock_irq(&tasklist_lock); - -+#ifdef CONFIG_GRKERNSEC -+ write_lock(&grsec_exec_file_lock); -+ if (current->exec_file) { -+ fput(current->exec_file); -+ current->exec_file = NULL; -+ } -+ write_unlock(&grsec_exec_file_lock); -+#endif -+ - ptrace_unlink(current); - /* Reparent to init */ - current->real_parent = current->parent = kthreadd_task; - list_move_tail(¤t->sibling, ¤t->real_parent->children); - -+ gr_set_kernel_label(current); -+ - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - -@@ -426,6 +443,17 @@ void daemonize(const char *name, ...) - vsnprintf(current->comm, sizeof(current->comm), name, args); - va_end(args); - -+#ifdef CONFIG_GRKERNSEC -+ write_lock(&grsec_exec_file_lock); -+ if (current->exec_file) { -+ fput(current->exec_file); -+ current->exec_file = NULL; -+ } -+ write_unlock(&grsec_exec_file_lock); -+#endif -+ -+ gr_set_kernel_label(current); -+ - /* - * If we were started as result of loading a module, close all of the - * user space pages. 
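kernel/exit.c above gains the same block in reparent_to_kthreadd() and daemonize(): take grsec_exec_file_lock for writing, drop the exec_file reference, and clear the pointer, so a task becoming a kernel thread stops pinning the binary it was launched from. A userspace rendering with a POSIX rwlock standing in for the kernel lock:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct file { int refcnt; };

    static pthread_rwlock_t grsec_exec_file_lock = PTHREAD_RWLOCK_INITIALIZER;
    static struct file *exec_file;

    static void fput(struct file *f)
    {
        if (--f->refcnt == 0)
            free(f);
    }

    static void drop_exec_file(void)
    {
        pthread_rwlock_wrlock(&grsec_exec_file_lock);
        if (exec_file) {
            fput(exec_file);
            exec_file = NULL;   /* stop pinning the original binary */
        }
        pthread_rwlock_unlock(&grsec_exec_file_lock);
    }

    int main(void)
    {
        exec_file = calloc(1, sizeof(*exec_file));
        exec_file->refcnt = 1;
        drop_exec_file();
        printf("exec_file=%p\n", (void *)exec_file);
        return 0;
    }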
We don't need them, and if we didn't close them -@@ -953,6 +981,9 @@ NORET_TYPE void do_exit(long code) - tsk->exit_code = code; - taskstats_exit(tsk, group_dead); - -+ gr_acl_handle_psacct(tsk, code); -+ gr_acl_handle_exit(); -+ - exit_mm(tsk); - - if (group_dead) -@@ -1171,7 +1202,7 @@ static int wait_task_zombie(struct wait_ - - if (unlikely(wo->wo_flags & WNOWAIT)) { - int exit_code = p->exit_code; -- int why, status; -+ int why; - - get_task_struct(p); - read_unlock(&tasklist_lock); -diff -urNp linux-2.6.31.1/kernel/fork.c linux-2.6.31.1/kernel/fork.c ---- linux-2.6.31.1/kernel/fork.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/fork.c 2009-10-01 20:12:45.000000000 -0400 -@@ -244,7 +244,7 @@ static struct task_struct *dup_task_stru - *stackend = STACK_END_MAGIC; /* for overflow detection */ - - #ifdef CONFIG_CC_STACKPROTECTOR -- tsk->stack_canary = get_random_int(); -+ tsk->stack_canary = pax_get_random_long(); - #endif - - /* One for us, one for whoever does the "release_task()" (usually parent) */ -@@ -281,8 +281,8 @@ static int dup_mmap(struct mm_struct *mm - mm->locked_vm = 0; - mm->mmap = NULL; - mm->mmap_cache = NULL; -- mm->free_area_cache = oldmm->mmap_base; -- mm->cached_hole_size = ~0UL; -+ mm->free_area_cache = oldmm->free_area_cache; -+ mm->cached_hole_size = oldmm->cached_hole_size; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; -@@ -319,6 +319,7 @@ static int dup_mmap(struct mm_struct *mm - tmp->vm_flags &= ~VM_LOCKED; - tmp->vm_mm = mm; - tmp->vm_next = NULL; -+ tmp->vm_mirror = NULL; - anon_vma_link(tmp); - file = tmp->vm_file; - if (file) { -@@ -366,6 +367,31 @@ static int dup_mmap(struct mm_struct *mm - if (retval) - goto out; - } -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (oldmm->pax_flags & MF_PAX_SEGMEXEC) { -+ struct vm_area_struct *mpnt_m; -+ -+ for (mpnt = oldmm->mmap, mpnt_m = mm->mmap; mpnt; mpnt = mpnt->vm_next, mpnt_m = mpnt_m->vm_next) { -+ BUG_ON(!mpnt_m || mpnt_m->vm_mirror || mpnt->vm_mm != oldmm || mpnt_m->vm_mm != mm); -+ -+ if (!mpnt->vm_mirror) -+ continue; -+ -+ if (mpnt->vm_end <= SEGMEXEC_TASK_SIZE) { -+ BUG_ON(mpnt->vm_mirror->vm_mirror != mpnt); -+ mpnt->vm_mirror = mpnt_m; -+ } else { -+ BUG_ON(mpnt->vm_mirror->vm_mirror == mpnt || mpnt->vm_mirror->vm_mirror->vm_mm != mm); -+ mpnt_m->vm_mirror = mpnt->vm_mirror->vm_mirror; -+ mpnt_m->vm_mirror->vm_mirror = mpnt_m; -+ mpnt->vm_mirror->vm_mirror = mpnt; -+ } -+ } -+ BUG_ON(mpnt_m); -+ } -+#endif -+ - /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; -@@ -546,9 +572,11 @@ void mm_release(struct task_struct *tsk, - #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -+ tsk->robust_list = NULL; - #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -+ tsk->compat_robust_list = NULL; - #endif - #endif - -@@ -567,6 +595,7 @@ void mm_release(struct task_struct *tsk, - * the value intact in a core dump, and to save the unnecessary - * trouble otherwise. Userland only wants this done for a sys_exit. 
- */ -+ - if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && - atomic_read(&mm->mm_users) > 1) { -@@ -576,7 +605,7 @@ void mm_release(struct task_struct *tsk, - */ - put_user(0, tsk->clear_child_tid); - sys_futex(tsk->clear_child_tid, FUTEX_WAKE, -- 1, NULL, NULL, 0); -+ 1, NULL, NULL, 0); - } - tsk->clear_child_tid = NULL; - } -@@ -694,7 +723,7 @@ static int copy_fs(unsigned long clone_f - write_unlock(&fs->lock); - return -EAGAIN; - } -- fs->users++; -+ atomic_inc(&fs->users); - write_unlock(&fs->lock); - return 0; - } -@@ -977,6 +1006,9 @@ static struct task_struct *copy_process( - DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); - #endif - retval = -EAGAIN; -+ -+ gr_learn_resource(p, RLIMIT_NPROC, atomic_read(&p->real_cred->user->processes), 0); -+ - if (atomic_read(&p->real_cred->user->processes) >= - p->signal->rlim[RLIMIT_NPROC].rlim_cur) { - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && -@@ -1133,6 +1165,8 @@ static struct task_struct *copy_process( - goto bad_fork_free_pid; - } - -+ gr_copy_label(p); -+ - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? -@@ -1302,6 +1336,8 @@ bad_fork_cleanup_count: - bad_fork_free: - free_task(p); - fork_out: -+ gr_log_forkfail(retval); -+ - return ERR_PTR(retval); - } - -@@ -1395,6 +1431,8 @@ long do_fork(unsigned long clone_flags, - if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); - -+ gr_handle_brute_check(); -+ - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); -@@ -1527,7 +1565,7 @@ static int unshare_fs(unsigned long unsh - return 0; - - /* don't need lock here; in the worst case we'll do useless copy */ -- if (fs->users == 1) -+ if (atomic_read(&fs->users) == 1) - return 0; - - *new_fsp = copy_fs_struct(fs); -@@ -1650,7 +1688,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, - fs = current->fs; - write_lock(&fs->lock); - current->fs = new_fs; -- if (--fs->users) -+ if (atomic_dec_return(&fs->users)) - new_fs = NULL; - else - new_fs = fs; -diff -urNp linux-2.6.31.1/kernel/futex.c linux-2.6.31.1/kernel/futex.c ---- linux-2.6.31.1/kernel/futex.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/futex.c 2009-10-01 20:12:45.000000000 -0400 -@@ -218,6 +218,11 @@ get_futex_key(u32 __user *uaddr, int fsh - struct page *page; - int err; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && address >= SEGMEXEC_TASK_SIZE) -+ return -EFAULT; -+#endif -+ - /* - * The futex address must be "naturally" aligned. 
- */ -@@ -1788,7 +1793,7 @@ static int futex_wait(u32 __user *uaddr, - - restart = &current_thread_info()->restart_block; - restart->fn = futex_wait_restart; -- restart->futex.uaddr = (u32 *)uaddr; -+ restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = abs_time->tv64; - restart->futex.bitset = bitset; -@@ -2403,7 +2408,7 @@ retry: - */ - static inline int fetch_robust_entry(struct robust_list __user **entry, - struct robust_list __user * __user *head, -- int *pi) -+ unsigned int *pi) - { - unsigned long uentry; - -diff -urNp linux-2.6.31.1/kernel/gcov/base.c linux-2.6.31.1/kernel/gcov/base.c ---- linux-2.6.31.1/kernel/gcov/base.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/gcov/base.c 2009-10-01 20:12:45.000000000 -0400 -@@ -102,11 +102,6 @@ void gcov_enable_events(void) - } - - #ifdef CONFIG_MODULES --static inline int within(void *addr, void *start, unsigned long size) --{ -- return ((addr >= start) && (addr < start + size)); --} -- - /* Update list and generate events when modules are unloaded. */ - static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, - void *data) -@@ -121,7 +116,7 @@ static int gcov_module_notifier(struct n - prev = NULL; - /* Remove entries located in module from linked list. */ - for (info = gcov_info_head; info; info = info->next) { -- if (within(info, mod->module_core, mod->core_size)) { -+ if (within_module_core_rw((unsigned long)info, mod)) { - if (prev) - prev->next = info->next; - else -diff -urNp linux-2.6.31.1/kernel/kallsyms.c linux-2.6.31.1/kernel/kallsyms.c ---- linux-2.6.31.1/kernel/kallsyms.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/kallsyms.c 2009-10-01 20:12:45.000000000 -0400 -@@ -11,6 +11,9 @@ - * Changed the compression method from stem compression to "table lookup" - * compression (see scripts/kallsyms.c for a more complete description) - */ -+#ifdef CONFIG_GRKERNSEC_HIDESYM -+#define __INCLUDED_BY_HIDESYM 1 -+#endif - #include <linux/kallsyms.h> - #include <linux/module.h> - #include <linux/init.h> -@@ -51,6 +54,9 @@ extern const unsigned long kallsyms_mark - - static inline int is_kernel_inittext(unsigned long addr) - { -+ if (system_state != SYSTEM_BOOTING) -+ return 0; -+ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; -@@ -66,6 +72,16 @@ static inline int is_kernel_text(unsigne - - static inline int is_kernel(unsigned long addr) - { -+ -+#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) && defined(CONFIG_MODULES) -+ if ((unsigned long)&MODULES_EXEC_VADDR <= ktla_ktva(addr) && -+ ktla_ktva(addr) < (unsigned long)&MODULES_EXEC_END) -+ return 0; -+#endif -+ -+ if (is_kernel_inittext(addr)) -+ return 1; -+ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_task(addr); -@@ -412,7 +428,6 @@ static unsigned long get_ksymbol_core(st - - static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) - { -- iter->name[0] = '\0'; - iter->nameoff = get_symbol_offset(new_pos); - iter->pos = new_pos; - } -@@ -500,7 +515,7 @@ static int kallsyms_open(struct inode *i - struct kallsym_iter *iter; - int ret; - -- iter = kmalloc(sizeof(*iter), GFP_KERNEL); -+ iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - reset_iter(iter, 0); -@@ -522,7 +537,15 @@ static const struct file_operations kall - - static int __init kallsyms_init(void) - { -+#if defined(CONFIG_GRKERNSEC_PROC_ADD) || defined(CONFIG_GRKERNSEC_HIDESYM) -+#if 
defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_HIDESYM) -+ proc_create("kallsyms", S_IFREG | S_IRUSR, NULL, &kallsyms_operations); -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ proc_create("kallsyms", S_IFREG | S_IRUSR | S_IRGRP, NULL, &kallsyms_operations); -+#endif -+#else - proc_create("kallsyms", 0444, NULL, &kallsyms_operations); -+#endif - return 0; - } - device_initcall(kallsyms_init); -diff -urNp linux-2.6.31.1/kernel/kmod.c linux-2.6.31.1/kernel/kmod.c ---- linux-2.6.31.1/kernel/kmod.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/kmod.c 2009-10-01 20:12:45.000000000 -0400 -@@ -84,6 +84,18 @@ int __request_module(bool wait, const ch - if (ret >= MODULE_NAME_LEN) - return -ENAMETOOLONG; - -+#ifdef CONFIG_GRKERNSEC_MODHARDEN -+ /* we could do a tighter check here, but some distros -+ are taking it upon themselves to remove CAP_SYS_MODULE -+ from even root-running apps which cause modules to be -+ auto-loaded -+ */ -+ if (current_uid()) { -+ gr_log_nonroot_mod_load(module_name); -+ return -EPERM; -+ } -+#endif -+ - /* If modprobe needs a service that is in a module, we get a recursive - * loop. Limit the number of running kmod threads to max_threads/2 or - * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method -diff -urNp linux-2.6.31.1/kernel/kprobes.c linux-2.6.31.1/kernel/kprobes.c ---- linux-2.6.31.1/kernel/kprobes.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/kprobes.c 2009-10-01 20:12:45.000000000 -0400 -@@ -184,7 +184,7 @@ static kprobe_opcode_t __kprobes *__get_ - * kernel image and loaded module images reside. This is required - * so x86_64 can correctly handle the %rip-relative fixups. - */ -- kip->insns = module_alloc(PAGE_SIZE); -+ kip->insns = module_alloc_exec(PAGE_SIZE); - if (!kip->insns) { - kfree(kip); - return NULL; -@@ -225,7 +225,7 @@ static int __kprobes collect_one_slot(st - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { -- module_free(NULL, kip->insns); -+ module_free_exec(NULL, kip->insns); - kfree(kip); - } - return 1; -@@ -1329,7 +1329,7 @@ static int __kprobes show_kprobe_addr(st - return 0; - } - --static struct seq_operations kprobes_seq_ops = { -+static const struct seq_operations kprobes_seq_ops = { - .start = kprobe_seq_start, - .next = kprobe_seq_next, - .stop = kprobe_seq_stop, -@@ -1341,7 +1341,7 @@ static int __kprobes kprobes_open(struct - return seq_open(filp, &kprobes_seq_ops); - } - --static struct file_operations debugfs_kprobes_operations = { -+static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, -@@ -1523,7 +1523,7 @@ static ssize_t write_enabled_file_bool(s - return count; - } - --static struct file_operations fops_kp = { -+static const struct file_operations fops_kp = { - .read = read_enabled_file_bool, - .write = write_enabled_file_bool, - }; -diff -urNp linux-2.6.31.1/kernel/lockdep.c linux-2.6.31.1/kernel/lockdep.c ---- linux-2.6.31.1/kernel/lockdep.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/lockdep.c 2009-10-01 20:12:45.000000000 -0400 -@@ -630,6 +630,10 @@ static int static_obj(void *obj) - int i; - #endif - -+#ifdef CONFIG_PAX_KERNEXEC -+ start = (unsigned long )&_sdata; -+#endif -+ - /* - * static variable? - */ -@@ -641,9 +645,12 @@ static int static_obj(void *obj) - * percpu var? 
- */ - for_each_possible_cpu(i) { -+#ifdef CONFIG_X86_32 -+ start = per_cpu_offset(i); -+#else - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); -- end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM -- + per_cpu_offset(i); -+#endif -+ end = start + PERCPU_ENOUGH_ROOM; - - if ((addr >= start) && (addr < end)) - return 1; -diff -urNp linux-2.6.31.1/kernel/lockdep_proc.c linux-2.6.31.1/kernel/lockdep_proc.c ---- linux-2.6.31.1/kernel/lockdep_proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/lockdep_proc.c 2009-10-01 20:12:45.000000000 -0400 -@@ -670,7 +670,7 @@ static int ls_show(struct seq_file *m, v - return 0; - } - --static struct seq_operations lockstat_ops = { -+static const struct seq_operations lockstat_ops = { - .start = ls_start, - .next = ls_next, - .stop = ls_stop, -diff -urNp linux-2.6.31.1/kernel/module.c linux-2.6.31.1/kernel/module.c ---- linux-2.6.31.1/kernel/module.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/module.c 2009-10-01 20:12:45.000000000 -0400 -@@ -47,6 +47,11 @@ - #include <linux/rculist.h> - #include <asm/uaccess.h> - #include <asm/cacheflush.h> -+ -+#ifdef CONFIG_PAX_KERNEXEC -+#include <asm/desc.h> -+#endif -+ - #include <linux/license.h> - #include <asm/sections.h> - #include <linux/tracepoint.h> -@@ -83,7 +88,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq - static BLOCKING_NOTIFIER_HEAD(module_notify_list); - - /* Bounds of module allocation, for speeding __module_address */ --static unsigned long module_addr_min = -1UL, module_addr_max = 0; -+static unsigned long module_addr_min_rw = -1UL, module_addr_max_rw = 0; -+static unsigned long module_addr_min_rx = -1UL, module_addr_max_rx = 0; - - int register_module_notifier(struct notifier_block * nb) - { -@@ -239,7 +245,7 @@ bool each_symbol(bool (*fn)(const struct - return true; - - list_for_each_entry_rcu(mod, &modules, list) { -- struct symsearch arr[] = { -+ struct symsearch modarr[] = { - { mod->syms, mod->syms + mod->num_syms, mod->crcs, - NOT_GPL_ONLY, false }, - { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, -@@ -261,7 +267,7 @@ bool each_symbol(bool (*fn)(const struct - #endif - }; - -- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) -+ if (each_symbol_in_section(modarr, ARRAY_SIZE(modarr), mod, fn, data)) - return true; - } - return false; -@@ -436,7 +442,7 @@ static void *percpu_modalloc(unsigned lo - void *ptr; - int cpu; - -- if (align > PAGE_SIZE) { -+ if (align-1 >= PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); - align = PAGE_SIZE; -@@ -549,7 +555,11 @@ static void percpu_modcopy(void *pcpudes - int cpu; - - for_each_possible_cpu(cpu) -+#ifdef CONFIG_X86_32 -+ memcpy(pcpudest + __per_cpu_offset[cpu], from, size); -+#else - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -+#endif - } - - #else /* ... 
!CONFIG_SMP */ -@@ -1513,7 +1523,8 @@ static void free_module(struct module *m - destroy_params(mod->kp, mod->num_kp); - - /* This may be NULL, but that's OK */ -- module_free(mod, mod->module_init); -+ module_free(mod, mod->module_init_rw); -+ module_free_exec(mod, mod->module_init_rx); - kfree(mod->args); - if (mod->percpu) - percpu_modfree(mod->percpu); -@@ -1522,10 +1533,12 @@ static void free_module(struct module *m - percpu_modfree(mod->refptr); - #endif - /* Free lock-classes: */ -- lockdep_free_key_range(mod->module_core, mod->core_size); -+ lockdep_free_key_range(mod->module_core_rx, mod->core_size_rx); -+ lockdep_free_key_range(mod->module_core_rw, mod->core_size_rw); - - /* Finally, free the core (containing the module structure) */ -- module_free(mod, mod->module_core); -+ module_free_exec(mod, mod->module_core_rx); -+ module_free(mod, mod->module_core_rw); - } - - void *__symbol_get(const char *symbol) -@@ -1593,6 +1606,10 @@ static int simplify_symbols(Elf_Shdr *se - int ret = 0; - const struct kernel_symbol *ksym; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - for (i = 1; i < n; i++) { - switch (sym[i].st_shndx) { - case SHN_COMMON: -@@ -1615,7 +1632,17 @@ static int simplify_symbols(Elf_Shdr *se - strtab + sym[i].st_name, mod); - /* Ok if resolved. */ - if (ksym) { -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - sym[i].st_value = ksym->value; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - break; - } - -@@ -1634,7 +1661,17 @@ static int simplify_symbols(Elf_Shdr *se - secbase = (unsigned long)mod->percpu; - else - secbase = sechdrs[sym[i].st_shndx].sh_addr; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - sym[i].st_value += secbase; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - break; - } - } -@@ -1695,11 +1732,12 @@ static void layout_sections(struct modul - || s->sh_entsize != ~0UL - || strstarts(secstrings + s->sh_name, ".init")) - continue; -- s->sh_entsize = get_offset(mod, &mod->core_size, s, i); -+ if ((s->sh_flags & SHF_WRITE) || !(s->sh_flags & SHF_ALLOC)) -+ s->sh_entsize = get_offset(mod, &mod->core_size_rw, s, i); -+ else -+ s->sh_entsize = get_offset(mod, &mod->core_size_rx, s, i); - DEBUGP("\t%s\n", secstrings + s->sh_name); - } -- if (m == 0) -- mod->core_text_size = mod->core_size; - } - - DEBUGP("Init section allocation order:\n"); -@@ -1712,12 +1750,13 @@ static void layout_sections(struct modul - || s->sh_entsize != ~0UL - || !strstarts(secstrings + s->sh_name, ".init")) - continue; -- s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) -- | INIT_OFFSET_MASK); -+ if ((s->sh_flags & SHF_WRITE) || !(s->sh_flags & SHF_ALLOC)) -+ s->sh_entsize = get_offset(mod, &mod->init_size_rw, s, i); -+ else -+ s->sh_entsize = get_offset(mod, &mod->init_size_rx, s, i); -+ s->sh_entsize |= INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + s->sh_name); - } -- if (m == 0) -- mod->init_text_size = mod->init_size; - } - } - -@@ -1856,14 +1895,31 @@ static void add_kallsyms(struct module * - { - unsigned int i; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - mod->symtab = (void *)sechdrs[symindex].sh_addr; - mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - mod->strtab = (void *)sechdrs[strindex].sh_addr; - - /* Set types up while we still have access to sections. 
*/ -- for (i = 0; i < mod->num_symtab; i++) -- mod->symtab[i].st_info -- = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); -+ -+ for (i = 0; i < mod->num_symtab; i++) { -+ char type = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ mod->symtab[i].st_info = type; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ } -+ - } - #else - static inline void add_kallsyms(struct module *mod, -@@ -1884,16 +1940,30 @@ static void dynamic_debug_setup(struct _ - #endif - } - --static void *module_alloc_update_bounds(unsigned long size) -+static void *module_alloc_update_bounds_rw(unsigned long size) - { - void *ret = module_alloc(size); - - if (ret) { - /* Update module bounds. */ -- if ((unsigned long)ret < module_addr_min) -- module_addr_min = (unsigned long)ret; -- if ((unsigned long)ret + size > module_addr_max) -- module_addr_max = (unsigned long)ret + size; -+ if ((unsigned long)ret < module_addr_min_rw) -+ module_addr_min_rw = (unsigned long)ret; -+ if ((unsigned long)ret + size > module_addr_max_rw) -+ module_addr_max_rw = (unsigned long)ret + size; -+ } -+ return ret; -+} -+ -+static void *module_alloc_update_bounds_rx(unsigned long size) -+{ -+ void *ret = module_alloc_exec(size); -+ -+ if (ret) { -+ /* Update module bounds. */ -+ if ((unsigned long)ret < module_addr_min_rx) -+ module_addr_min_rx = (unsigned long)ret; -+ if ((unsigned long)ret + size > module_addr_max_rx) -+ module_addr_max_rx = (unsigned long)ret + size; - } - return ret; - } -@@ -1905,8 +1975,8 @@ static void kmemleak_load_module(struct - unsigned int i; - - /* only scan the sections containing data */ -- kmemleak_scan_area(mod->module_core, (unsigned long)mod - -- (unsigned long)mod->module_core, -+ kmemleak_scan_area(mod->module_core_rw, (unsigned long)mod - -+ (unsigned long)mod->module_core_rw, - sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < hdr->e_shnum; i++) { -@@ -1916,8 +1986,8 @@ static void kmemleak_load_module(struct - && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) - continue; - -- kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - -- (unsigned long)mod->module_core, -+ kmemleak_scan_area(mod->module_core_rw, sechdrs[i].sh_addr - -+ (unsigned long)mod->module_core_rw, - sechdrs[i].sh_size, GFP_KERNEL); - } - } -@@ -1947,6 +2017,10 @@ static noinline struct module *load_modu - void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - mm_segment_t old_fs; - -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif -+ - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); - if (len < sizeof(*hdr)) -@@ -2097,7 +2171,7 @@ static noinline struct module *load_modu - layout_sections(mod, hdr, sechdrs, secstrings); - - /* Do the allocs. */ -- ptr = module_alloc_update_bounds(mod->core_size); -+ ptr = module_alloc_update_bounds_rw(mod->core_size_rw); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. Just mark it as not being a -@@ -2108,23 +2182,61 @@ static noinline struct module *load_modu - err = -ENOMEM; - goto free_percpu; - } -- memset(ptr, 0, mod->core_size); -- mod->module_core = ptr; -+ memset(ptr, 0, mod->core_size_rw); -+ mod->module_core_rw = ptr; - -- ptr = module_alloc_update_bounds(mod->init_size); -+ ptr = module_alloc_update_bounds_rw(mod->init_size_rw); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. 
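
[The hunks above funnel every module allocation through a per-region bounds helper (module_alloc_update_bounds_rw/_rx), so that __module_address() further down can reject out-of-range pointers before walking the module list. A minimal userspace sketch of that pattern, simplified to one region, with malloc() standing in for module_alloc(); not the patch's exact code:]

    /* Track the [min, max) span covered by one class of allocations so
     * address lookups can bail out early without a list walk.  The patch
     * keeps two such spans: one for RW data, one for RX code. */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static uintptr_t region_min = UINTPTR_MAX;
    static uintptr_t region_max;

    static void *alloc_update_bounds(size_t size)
    {
        void *ret = malloc(size);       /* stand-in for module_alloc() */

        if (ret) {
            if ((uintptr_t)ret < region_min)
                region_min = (uintptr_t)ret;
            if ((uintptr_t)ret + size > region_max)
                region_max = (uintptr_t)ret + size;
        }
        return ret;
    }

    /* Fast rejection test, as in the patched __module_address(). */
    static int maybe_in_region(uintptr_t addr)
    {
        return addr >= region_min && addr < region_max;
    }

    int main(void)
    {
        char *p = alloc_update_bounds(64);

        printf("%d %d\n", maybe_in_region((uintptr_t)p + 8),
               maybe_in_region(0));     /* prints "1 0" */
        free(p);
        return 0;
    }
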
This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. - */ -- kmemleak_ignore(ptr); -- if (!ptr && mod->init_size) { -+ kmemleak_not_leak(ptr); -+ if (!ptr && mod->init_size_rw) { - err = -ENOMEM; -- goto free_core; -+ goto free_core_rw; - } -- memset(ptr, 0, mod->init_size); -- mod->module_init = ptr; -+ memset(ptr, 0, mod->init_size_rw); -+ mod->module_init_rw = ptr; -+ -+ ptr = module_alloc_update_bounds_rx(mod->core_size_rx); -+ kmemleak_not_leak(ptr); -+ if (!ptr) { -+ err = -ENOMEM; -+ goto free_init_rw; -+ } -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ memset(ptr, 0, mod->core_size_rx); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ mod->module_core_rx = ptr; -+ -+ ptr = module_alloc_update_bounds_rx(mod->init_size_rx); -+ kmemleak_not_leak(ptr); -+ if (!ptr && mod->init_size_rx) { -+ err = -ENOMEM; -+ goto free_core_rx; -+ } -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ -+ memset(ptr, 0, mod->init_size_rx); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ mod->module_init_rx = ptr; - - /* Transfer each section which specifies SHF_ALLOC */ - DEBUGP("final section addresses:\n"); -@@ -2134,17 +2246,41 @@ static noinline struct module *load_modu - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - -- if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) -- dest = mod->module_init -- + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); -- else -- dest = mod->module_core + sechdrs[i].sh_entsize; -+ if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) { -+ if ((sechdrs[i].sh_flags & SHF_WRITE) || !(sechdrs[i].sh_flags & SHF_ALLOC)) -+ dest = mod->module_init_rw -+ + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); -+ else -+ dest = mod->module_init_rx -+ + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); -+ } else { -+ if ((sechdrs[i].sh_flags & SHF_WRITE) || !(sechdrs[i].sh_flags & SHF_ALLOC)) -+ dest = mod->module_core_rw + sechdrs[i].sh_entsize; -+ else -+ dest = mod->module_core_rx + sechdrs[i].sh_entsize; -+ } -+ -+ if (sechdrs[i].sh_type != SHT_NOBITS) { - -- if (sechdrs[i].sh_type != SHT_NOBITS) -- memcpy(dest, (void *)sechdrs[i].sh_addr, -- sechdrs[i].sh_size); -+#ifdef CONFIG_PAX_KERNEXEC -+ if (!(sechdrs[i].sh_flags & SHF_WRITE) && (sechdrs[i].sh_flags & SHF_ALLOC)) { -+ pax_open_kernel(cr0); -+ memcpy(dest, (void *)sechdrs[i].sh_addr, sechdrs[i].sh_size); -+ pax_close_kernel(cr0); -+ } else -+#endif -+ -+ memcpy(dest, (void *)sechdrs[i].sh_addr, sechdrs[i].sh_size); -+ } - /* Update sh_addr to point to copy in image. */ -- sechdrs[i].sh_addr = (unsigned long)dest; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ if (sechdrs[i].sh_flags & SHF_EXECINSTR) -+ sechdrs[i].sh_addr = ktva_ktla((unsigned long)dest); -+ else -+#endif -+ -+ sechdrs[i].sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); - } - /* Module has been moved. */ -@@ -2156,7 +2292,7 @@ static noinline struct module *load_modu - mod->name); - if (!mod->refptr) { - err = -ENOMEM; -- goto free_init; -+ goto free_init_rx; - } - #endif - /* Now we've moved module, initialize linked lists, etc. */ -@@ -2269,8 +2405,8 @@ static noinline struct module *load_modu - - /* Now do relocations. */ - for (i = 1; i < hdr->e_shnum; i++) { -- const char *strtab = (char *)sechdrs[strindex].sh_addr; - unsigned int info = sechdrs[i].sh_info; -+ strtab = (char *)sechdrs[strindex].sh_addr; - - /* Not a valid relocation section? 
*/ - if (info >= hdr->e_shnum) -@@ -2328,12 +2464,12 @@ static noinline struct module *load_modu - * Do it before processing of module parameters, so the module - * can provide parameter accessor functions of its own. - */ -- if (mod->module_init) -- flush_icache_range((unsigned long)mod->module_init, -- (unsigned long)mod->module_init -- + mod->init_size); -- flush_icache_range((unsigned long)mod->module_core, -- (unsigned long)mod->module_core + mod->core_size); -+ if (mod->module_init_rx) -+ flush_icache_range((unsigned long)mod->module_init_rx, -+ (unsigned long)mod->module_init_rx -+ + mod->init_size_rx); -+ flush_icache_range((unsigned long)mod->module_core_rx, -+ (unsigned long)mod->module_core_rx + mod->core_size_rx); - - set_fs(old_fs); - -@@ -2378,12 +2514,16 @@ static noinline struct module *load_modu - free_unload: - module_unload_free(mod); - #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) -- free_init: -+ free_init_rx: - percpu_modfree(mod->refptr); - #endif -- module_free(mod, mod->module_init); -- free_core: -- module_free(mod, mod->module_core); -+ module_free_exec(mod, mod->module_init_rx); -+ free_core_rx: -+ module_free_exec(mod, mod->module_core_rx); -+ free_init_rw: -+ module_free(mod, mod->module_init_rw); -+ free_core_rw: -+ module_free(mod, mod->module_core_rw); - /* mod will be freed with core. Don't access it beyond this line! */ - free_percpu: - if (percpu) -@@ -2479,10 +2619,12 @@ SYSCALL_DEFINE3(init_module, void __user - /* Drop initial reference. */ - module_put(mod); - trim_init_extable(mod); -- module_free(mod, mod->module_init); -- mod->module_init = NULL; -- mod->init_size = 0; -- mod->init_text_size = 0; -+ module_free(mod, mod->module_init_rw); -+ module_free_exec(mod, mod->module_init_rx); -+ mod->module_init_rw = NULL; -+ mod->module_init_rx = NULL; -+ mod->init_size_rw = 0; -+ mod->init_size_rx = 0; - mutex_unlock(&module_mutex); - - return 0; -@@ -2513,10 +2655,16 @@ static const char *get_ksymbol(struct mo - unsigned long nextval; - - /* At worse, next value is at end of module */ -- if (within_module_init(addr, mod)) -- nextval = (unsigned long)mod->module_init+mod->init_text_size; -+ if (within_module_init_rx(addr, mod)) -+ nextval = (unsigned long)mod->module_init_rx+mod->init_size_rx; -+ else if (within_module_init_rw(addr, mod)) -+ nextval = (unsigned long)mod->module_init_rw+mod->init_size_rw; -+ else if (within_module_core_rx(addr, mod)) -+ nextval = (unsigned long)mod->module_core_rx+mod->core_size_rx; -+ else if (within_module_core_rw(addr, mod)) -+ nextval = (unsigned long)mod->module_core_rw+mod->core_size_rw; - else -- nextval = (unsigned long)mod->module_core+mod->core_text_size; -+ return NULL; - - /* Scan for closest preceeding symbol, and next symbol. (ELF - starts real symbols at 1). */ -@@ -2762,7 +2910,7 @@ static int m_show(struct seq_file *m, vo - char buf[8]; - - seq_printf(m, "%s %u", -- mod->name, mod->init_size + mod->core_size); -+ mod->name, mod->init_size_rx + mod->init_size_rw + mod->core_size_rx + mod->core_size_rw); - print_unload_info(m, mod); - - /* Informative for users. */ -@@ -2771,7 +2919,7 @@ static int m_show(struct seq_file *m, vo - mod->state == MODULE_STATE_COMING ? "Loading": - "Live"); - /* Used by oprofile and other similar tools. 
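
[The relabelled error path just above (free_init_rx, free_core_rx, free_init_rw, free_core_rw) follows the usual kernel goto-unwind idiom: one label per acquired resource, released in reverse order of acquisition, so each new allocation adds exactly one label. A compact illustration with hypothetical resources, not the patch's code:]

    #include <stdlib.h>

    struct ctx {
        void *core_rw;
        void *init_rw;
        void *core_rx;
    };

    /* Acquire three resources; on any failure, unwind only what was
     * already acquired, in reverse order. */
    static int setup(struct ctx *c)
    {
        c->core_rw = malloc(128);
        if (!c->core_rw)
            goto out;
        c->init_rw = malloc(64);
        if (!c->init_rw)
            goto free_core_rw;
        c->core_rx = malloc(256);
        if (!c->core_rx)
            goto free_init_rw;
        return 0;               /* success: caller owns all three */

    free_init_rw:
        free(c->init_rw);
    free_core_rw:
        free(c->core_rw);
    out:
        return -1;              /* stands in for -ENOMEM */
    }
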
*/ -- seq_printf(m, " 0x%p", mod->module_core); -+ seq_printf(m, " 0x%p 0x%p", mod->module_core_rx, mod->module_core_rw); - - /* Taints info */ - if (mod->taints) -@@ -2807,7 +2955,17 @@ static const struct file_operations proc - - static int __init proc_modules_init(void) - { -+#ifndef CONFIG_GRKERNSEC_HIDESYM -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ proc_create("modules", S_IRUSR, NULL, &proc_modules_operations); -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ proc_create("modules", S_IRUSR | S_IRGRP, NULL, &proc_modules_operations); -+#else - proc_create("modules", 0, NULL, &proc_modules_operations); -+#endif -+#else -+ proc_create("modules", S_IRUSR, NULL, &proc_modules_operations); -+#endif - return 0; - } - module_init(proc_modules_init); -@@ -2866,12 +3024,12 @@ struct module *__module_address(unsigned - { - struct module *mod; - -- if (addr < module_addr_min || addr > module_addr_max) -+ if ((addr < module_addr_min_rx || addr > module_addr_max_rx) && -+ (addr < module_addr_min_rw || addr > module_addr_max_rw)) - return NULL; - - list_for_each_entry_rcu(mod, &modules, list) -- if (within_module_core(addr, mod) -- || within_module_init(addr, mod)) -+ if (within_module_init(addr, mod) || within_module_core(addr, mod)) - return mod; - return NULL; - } -@@ -2905,11 +3063,20 @@ bool is_module_text_address(unsigned lon - */ - struct module *__module_text_address(unsigned long addr) - { -- struct module *mod = __module_address(addr); -+ struct module *mod; -+ -+#ifdef CONFIG_X86_32 -+ addr = ktla_ktva(addr); -+#endif -+ -+ if (addr < module_addr_min_rx || addr > module_addr_max_rx) -+ return NULL; -+ -+ mod = __module_address(addr); -+ - if (mod) { - /* Make sure it's within the text section. */ -- if (!within(addr, mod->module_init, mod->init_text_size) -- && !within(addr, mod->module_core, mod->core_text_size)) -+ if (!within_module_init_rx(addr, mod) && !within_module_core_rx(addr, mod)) - mod = NULL; - } - return mod; -diff -urNp linux-2.6.31.1/kernel/panic.c linux-2.6.31.1/kernel/panic.c ---- linux-2.6.31.1/kernel/panic.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/panic.c 2009-10-01 20:12:45.000000000 -0400 -@@ -391,7 +391,8 @@ EXPORT_SYMBOL(warn_slowpath_null); - */ - void __stack_chk_fail(void) - { -- panic("stack-protector: Kernel stack is corrupted in: %p\n", -+ dump_stack(); -+ panic("stack-protector: Kernel stack is corrupted in: %pS\n", - __builtin_return_address(0)); - } - EXPORT_SYMBOL(__stack_chk_fail); -diff -urNp linux-2.6.31.1/kernel/params.c linux-2.6.31.1/kernel/params.c ---- linux-2.6.31.1/kernel/params.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/params.c 2009-10-01 20:12:45.000000000 -0400 -@@ -217,13 +217,9 @@ int param_set_charp(const char *val, str - return -ENOSPC; - } - -- if (kp->flags & KPARAM_KMALLOCED) -- kfree(*(char **)kp->arg); -- - /* This is a hack. We can't need to strdup in early boot, and we - * don't need to; this mangled commandline is preserved. 
*/ - if (slab_is_available()) { -- kp->flags |= KPARAM_KMALLOCED; - *(char **)kp->arg = kstrdup(val, GFP_KERNEL); - if (!kp->arg) - return -ENOMEM; -@@ -607,7 +603,7 @@ void destroy_params(const struct kernel_ - unsigned int i; - - for (i = 0; i < num; i++) -- if (params[i].flags & KPARAM_KMALLOCED) -+ if (params[i].set == param_set_charp) - kfree(*(char **)params[i].arg); - } - -diff -urNp linux-2.6.31.1/kernel/perf_counter.c linux-2.6.31.1/kernel/perf_counter.c ---- linux-2.6.31.1/kernel/perf_counter.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/perf_counter.c 2009-10-01 20:12:45.000000000 -0400 -@@ -2231,7 +2231,7 @@ static void perf_mmap_close(struct vm_ar - } - } - --static struct vm_operations_struct perf_mmap_vmops = { -+static const struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, -@@ -4181,7 +4181,7 @@ static int perf_copy_attr(struct perf_co - end = PTR_ALIGN((void __user *)uattr + size, - sizeof(unsigned long)); - -- for (; addr < end; addr += sizeof(unsigned long)) { -+ for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; -diff -urNp linux-2.6.31.1/kernel/pid.c linux-2.6.31.1/kernel/pid.c ---- linux-2.6.31.1/kernel/pid.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/pid.c 2009-10-01 20:12:45.000000000 -0400 -@@ -33,6 +33,7 @@ - #include <linux/rculist.h> - #include <linux/bootmem.h> - #include <linux/hash.h> -+#include <linux/security.h> - #include <linux/pid_namespace.h> - #include <linux/init_task.h> - #include <linux/syscalls.h> -@@ -45,7 +46,7 @@ struct pid init_struct_pid = INIT_STRUCT - - int pid_max = PID_MAX_DEFAULT; - --#define RESERVED_PIDS 300 -+#define RESERVED_PIDS 500 - - int pid_max_min = RESERVED_PIDS + 1; - int pid_max_max = PID_MAX_LIMIT; -@@ -380,7 +381,14 @@ EXPORT_SYMBOL(pid_task); - */ - struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) - { -- return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); -+ struct task_struct *task; -+ -+ task = pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); -+ -+ if (gr_pid_is_chrooted(task)) -+ return NULL; -+ -+ return task; - } - - struct task_struct *find_task_by_vpid(pid_t vnr) -diff -urNp linux-2.6.31.1/kernel/posix-cpu-timers.c linux-2.6.31.1/kernel/posix-cpu-timers.c ---- linux-2.6.31.1/kernel/posix-cpu-timers.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/posix-cpu-timers.c 2009-10-01 20:12:45.000000000 -0400 -@@ -6,6 +6,7 @@ - #include <linux/posix-timers.h> - #include <linux/errno.h> - #include <linux/math64.h> -+#include <linux/security.h> - #include <asm/uaccess.h> - #include <linux/kernel_stat.h> - -@@ -1041,6 +1042,7 @@ static void check_thread_timers(struct t - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } -+ gr_learn_resource(tsk, RLIMIT_RTTIME, tsk->rt.timeout, 1); - if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { - /* - * At the soft limit, send a SIGXCPU every second. -@@ -1196,6 +1198,7 @@ static void check_process_timers(struct - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } -+ gr_learn_resource(tsk, RLIMIT_CPU, psecs, 0); - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { - /* - * At the soft limit, send a SIGXCPU every second. 
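
[Alongside the grsecurity hooks, a large share of these hunks (cgroup_ops, the seq_operations tables, perf_mmap_vmops just above) only add const to function-pointer tables, which places them in read-only data so a stray or attacker-controlled kernel write can no longer redirect the pointers. A minimal sketch of the idea, with illustrative types:]

    #include <stdio.h>

    struct ops {
        void (*handler)(void);
    };

    static void real_handler(void)
    {
        puts("ok");
    }

    /* const moves the table into .rodata; overwriting my_ops.handler at
     * run time now requires defeating write protection, and doing it in
     * source is a compile-time error. */
    static const struct ops my_ops = {
        .handler = real_handler,
    };

    int main(void)
    {
        my_ops.handler();
        return 0;
    }
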
-diff -urNp linux-2.6.31.1/kernel/power/poweroff.c linux-2.6.31.1/kernel/power/poweroff.c ---- linux-2.6.31.1/kernel/power/poweroff.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/power/poweroff.c 2009-10-01 20:12:45.000000000 -0400 -@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_powerof - .enable_mask = SYSRQ_ENABLE_BOOT, - }; - --static int pm_sysrq_init(void) -+static int __init pm_sysrq_init(void) - { - register_sysrq_key('o', &sysrq_poweroff_op); - return 0; -diff -urNp linux-2.6.31.1/kernel/power/process.c linux-2.6.31.1/kernel/power/process.c ---- linux-2.6.31.1/kernel/power/process.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/power/process.c 2009-10-01 20:12:45.000000000 -0400 -@@ -36,12 +36,15 @@ static int try_to_freeze_tasks(bool sig_ - struct timeval start, end; - u64 elapsed_csecs64; - unsigned int elapsed_csecs; -+ bool timedout = false; - - do_gettimeofday(&start); - - end_time = jiffies + TIMEOUT; - do { - todo = 0; -+ if (time_after(jiffies, end_time)) -+ timedout = true; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (frozen(p) || !freezeable(p)) -@@ -56,15 +59,17 @@ static int try_to_freeze_tasks(bool sig_ - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - */ -- if (!task_is_stopped_or_traced(p) && -- !freezer_should_skip(p)) -+ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) { - todo++; -+ if (timedout) { -+ printk(KERN_ERR "Task refusing to freeze:\n"); -+ sched_show_task(p); -+ } -+ } - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - yield(); /* Yield is okay here */ -- if (time_after(jiffies, end_time)) -- break; -- } while (todo); -+ } while (todo && !timedout); - - do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); -diff -urNp linux-2.6.31.1/kernel/printk.c linux-2.6.31.1/kernel/printk.c ---- linux-2.6.31.1/kernel/printk.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/printk.c 2009-10-01 20:12:45.000000000 -0400 -@@ -272,6 +272,11 @@ int do_syslog(int type, char __user *buf - char c; - int error = 0; - -+#ifdef CONFIG_GRKERNSEC_DMESG -+ if (grsec_enable_dmesg && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+#endif -+ - error = security_syslog(type); - if (error) - return error; -diff -urNp linux-2.6.31.1/kernel/ptrace.c linux-2.6.31.1/kernel/ptrace.c ---- linux-2.6.31.1/kernel/ptrace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/ptrace.c 2009-10-01 20:12:45.000000000 -0400 -@@ -141,7 +141,7 @@ int __ptrace_may_access(struct task_stru - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && -- !capable(CAP_SYS_PTRACE)) { -+ !capable_nolog(CAP_SYS_PTRACE)) { - rcu_read_unlock(); - return -EPERM; - } -@@ -149,7 +149,7 @@ int __ptrace_may_access(struct task_stru - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); -- if (!dumpable && !capable(CAP_SYS_PTRACE)) -+ if (!dumpable && !capable_nolog(CAP_SYS_PTRACE)) - return -EPERM; - - return security_ptrace_may_access(task, mode); -@@ -199,7 +199,7 @@ int ptrace_attach(struct task_struct *ta - goto unlock_tasklist; - - task->ptrace = PT_PTRACED; -- if (capable(CAP_SYS_PTRACE)) -+ if (capable_nolog(CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; - - __ptrace_link(task, current); -@@ -633,6 +633,11 @@ SYSCALL_DEFINE4(ptrace, long, request, l - if (ret < 0) - goto out_put_task_struct; - -+ if (gr_handle_ptrace(child, request)) { -+ ret = -EPERM; -+ goto out_put_task_struct; -+ } -+ - 
ret = arch_ptrace(child, request, addr, data); - - out_put_task_struct: -diff -urNp linux-2.6.31.1/kernel/rcupreempt_trace.c linux-2.6.31.1/kernel/rcupreempt_trace.c ---- linux-2.6.31.1/kernel/rcupreempt_trace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/rcupreempt_trace.c 2009-10-01 20:12:45.000000000 -0400 -@@ -261,17 +261,17 @@ static ssize_t rcuctrs_read(struct file - return bcount; - } - --static struct file_operations rcustats_fops = { -+static const struct file_operations rcustats_fops = { - .owner = THIS_MODULE, - .read = rcustats_read, - }; - --static struct file_operations rcugp_fops = { -+static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .read = rcugp_read, - }; - --static struct file_operations rcuctrs_fops = { -+static const struct file_operations rcuctrs_fops = { - .owner = THIS_MODULE, - .read = rcuctrs_read, - }; -diff -urNp linux-2.6.31.1/kernel/rcutree_trace.c linux-2.6.31.1/kernel/rcutree_trace.c ---- linux-2.6.31.1/kernel/rcutree_trace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/rcutree_trace.c 2009-10-01 20:12:45.000000000 -0400 -@@ -88,7 +88,7 @@ static int rcudata_open(struct inode *in - return single_open(file, show_rcudata, NULL); - } - --static struct file_operations rcudata_fops = { -+static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, -@@ -136,7 +136,7 @@ static int rcudata_csv_open(struct inode - return single_open(file, show_rcudata_csv, NULL); - } - --static struct file_operations rcudata_csv_fops = { -+static const struct file_operations rcudata_csv_fops = { - .owner = THIS_MODULE, - .open = rcudata_csv_open, - .read = seq_read, -@@ -183,7 +183,7 @@ static int rcuhier_open(struct inode *in - return single_open(file, show_rcuhier, NULL); - } - --static struct file_operations rcuhier_fops = { -+static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, -@@ -205,7 +205,7 @@ static int rcugp_open(struct inode *inod - return single_open(file, show_rcugp, NULL); - } - --static struct file_operations rcugp_fops = { -+static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, -@@ -255,7 +255,7 @@ static int rcu_pending_open(struct inode - return single_open(file, show_rcu_pending, NULL); - } - --static struct file_operations rcu_pending_fops = { -+static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/kernel/relay.c linux-2.6.31.1/kernel/relay.c ---- linux-2.6.31.1/kernel/relay.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/relay.c 2009-10-01 20:12:45.000000000 -0400 -@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_are - /* - * vm_ops for relay file mappings. 
- */ --static struct vm_operations_struct relay_file_mmap_ops = { -+static const struct vm_operations_struct relay_file_mmap_ops = { - .fault = relay_buf_fault, - .close = relay_file_mmap_close, - }; -@@ -1292,7 +1292,7 @@ static int subbuf_splice_actor(struct fi - return 0; - - ret = *nonpad_ret = splice_to_pipe(pipe, &spd); -- if (ret < 0 || ret < total_len) -+ if ((int)ret < 0 || ret < total_len) - return ret; - - if (read_start + ret == nonpad_end) -diff -urNp linux-2.6.31.1/kernel/resource.c linux-2.6.31.1/kernel/resource.c ---- linux-2.6.31.1/kernel/resource.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/resource.c 2009-10-01 20:12:45.000000000 -0400 -@@ -132,8 +132,18 @@ static const struct file_operations proc - - static int __init ioresources_init(void) - { -+#ifdef CONFIG_GRKERNSEC_PROC_ADD -+#ifdef CONFIG_GRKERNSEC_PROC_USER -+ proc_create("ioports", S_IRUSR, NULL, &proc_ioports_operations); -+ proc_create("iomem", S_IRUSR, NULL, &proc_iomem_operations); -+#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) -+ proc_create("ioports", S_IRUSR | S_IRGRP, NULL, &proc_ioports_operations); -+ proc_create("iomem", S_IRUSR | S_IRGRP, NULL, &proc_iomem_operations); -+#endif -+#else - proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); -+#endif - return 0; - } - __initcall(ioresources_init); -diff -urNp linux-2.6.31.1/kernel/sched.c linux-2.6.31.1/kernel/sched.c ---- linux-2.6.31.1/kernel/sched.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/sched.c 2009-10-01 20:12:45.000000000 -0400 -@@ -820,7 +820,7 @@ static int sched_feat_open(struct inode - return single_open(filp, sched_feat_show, NULL); - } - --static struct file_operations sched_feat_fops = { -+static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, -@@ -5978,6 +5978,8 @@ int can_nice(const struct task_struct *p - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; - -+ gr_learn_resource(p, RLIMIT_NICE, nice_rlim, 1); -+ - return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || - capable(CAP_SYS_NICE)); - } -@@ -6011,7 +6013,8 @@ SYSCALL_DEFINE1(nice, int, increment) - if (nice > 19) - nice = 19; - -- if (increment < 0 && !can_nice(current, nice)) -+ if (increment < 0 && (!can_nice(current, nice) || -+ gr_handle_chroot_nice())) - return -EPERM; - - retval = security_task_setnice(current, nice); -@@ -6153,6 +6156,8 @@ recheck: - if (rt_policy(policy)) { - unsigned long rlim_rtprio; - -+ gr_learn_resource(p, RLIMIT_RTPRIO, param->sched_priority, 1); -+ - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; -@@ -7300,7 +7305,7 @@ static struct ctl_table sd_ctl_dir[] = { - .procname = "sched_domain", - .mode = 0555, - }, -- {0, }, -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static struct ctl_table sd_ctl_root[] = { -@@ -7310,7 +7315,7 @@ static struct ctl_table sd_ctl_root[] = - .mode = 0555, - .child = sd_ctl_dir, - }, -- {0, }, -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static struct ctl_table *sd_alloc_ctl_entry(int n) -diff -urNp linux-2.6.31.1/kernel/signal.c linux-2.6.31.1/kernel/signal.c ---- linux-2.6.31.1/kernel/signal.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/signal.c 2009-10-01 20:12:45.000000000 -0400 -@@ -207,6 +207,9 @@ static struct sigqueue *__sigqueue_alloc - */ - user = 
get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); -+ -+ if (!override_rlimit) -+ gr_learn_resource(t, RLIMIT_SIGPENDING, atomic_read(&user->sigpending), 1); - if (override_rlimit || - atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) -@@ -625,6 +628,9 @@ static int check_kill_permission(int sig - } - } - -+ if (gr_handle_signal(t, sig)) -+ return -EPERM; -+ - return security_task_kill(t, info, sig, 0); - } - -@@ -939,8 +945,8 @@ static void print_fatal_signal(struct pt - for (i = 0; i < 16; i++) { - unsigned char insn; - -- __get_user(insn, (unsigned char *)(regs->ip + i)); -- printk("%02x ", insn); -+ if (!get_user(insn, (unsigned char __user *)(regs->ip + i))) -+ printk("%02x ", insn); - } - } - #endif -@@ -965,7 +971,7 @@ __group_send_sig_info(int sig, struct si - return send_signal(sig, info, p, 1); - } - --static int -+int - specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) - { - return send_signal(sig, info, t, 0); -@@ -1005,6 +1011,9 @@ force_sig_info(int sig, struct siginfo * - ret = specific_send_sig_info(sig, info, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); - -+ gr_log_signal(sig, !is_si_special(info) ? info->si_addr : NULL, t); -+ gr_handle_crash(t, sig); -+ - return ret; - } - -@@ -1079,6 +1088,8 @@ int group_send_sig_info(int sig, struct - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } -+ if (!ret) -+ gr_log_signal(sig, !is_si_special(info) ? info->si_addr : NULL, p); - } - - return ret; -diff -urNp linux-2.6.31.1/kernel/sys.c linux-2.6.31.1/kernel/sys.c ---- linux-2.6.31.1/kernel/sys.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/sys.c 2009-10-01 20:12:45.000000000 -0400 -@@ -133,6 +133,12 @@ static int set_one_prio(struct task_stru - error = -EACCES; - goto out; - } -+ -+ if (gr_handle_chroot_setpriority(p, niceval)) { -+ error = -EACCES; -+ goto out; -+ } -+ - no_nice = security_task_setnice(p, niceval); - if (no_nice) { - error = no_nice; -@@ -509,6 +515,9 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, g - goto error; - } - -+ if (gr_check_group_change(new->gid, new->egid, -1)) -+ goto error; -+ - if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old->gid)) - new->sgid = new->egid; -@@ -542,6 +551,10 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) - goto error; - - retval = -EPERM; -+ -+ if (gr_check_group_change(gid, gid, gid)) -+ goto error; -+ - if (capable(CAP_SETGID)) - new->gid = new->egid = new->sgid = new->fsgid = gid; - else if (gid == old->gid || gid == old->sgid) -@@ -632,6 +645,9 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, u - goto error; - } - -+ if (gr_check_user_change(new->uid, new->euid, -1)) -+ goto error; -+ - if (new->uid != old->uid) { - retval = set_user(new); - if (retval < 0) -@@ -680,6 +696,12 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) - goto error; - - retval = -EPERM; -+ -+ if (gr_check_crash_uid(uid)) -+ goto error; -+ if (gr_check_user_change(uid, uid, uid)) -+ goto error; -+ - if (capable(CAP_SETUID)) { - new->suid = new->uid = uid; - if (uid != old->uid) { -@@ -737,6 +759,9 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, - goto error; - } - -+ if (gr_check_user_change(ruid, euid, -1)) -+ goto error; -+ - if (ruid != (uid_t) -1) { - new->uid = ruid; - if (ruid != old->uid) { -@@ -805,6 +830,9 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, - goto error; - } - -+ if (gr_check_group_change(rgid, egid, -1)) -+ goto error; -+ - if (rgid != (gid_t) -1) - new->gid = rgid; - if (egid != (gid_t) -1) -@@ -854,6 +882,9 @@ 
SYSCALL_DEFINE1(setfsuid, uid_t, uid) - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) - goto error; - -+ if (gr_check_user_change(-1, -1, uid)) -+ goto error; -+ - if (uid == old->uid || uid == old->euid || - uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { -@@ -894,6 +925,9 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) - if (gid == old->gid || gid == old->egid || - gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { -+ if (gr_check_group_change(-1, -1, gid)) -+ goto error; -+ - if (gid != old_fsgid) { - new->fsgid = gid; - goto change_okay; -@@ -1443,7 +1477,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsi - error = get_dumpable(me->mm); - break; - case PR_SET_DUMPABLE: -- if (arg2 < 0 || arg2 > 1) { -+ if (arg2 > 1) { - error = -EINVAL; - break; - } -diff -urNp linux-2.6.31.1/kernel/sysctl.c linux-2.6.31.1/kernel/sysctl.c ---- linux-2.6.31.1/kernel/sysctl.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/sysctl.c 2009-10-01 20:12:45.000000000 -0400 -@@ -65,6 +65,13 @@ - static int deprecated_sysctl_warning(struct __sysctl_args *args); - - #if defined(CONFIG_SYSCTL) -+#include <linux/grsecurity.h> -+#include <linux/grinternal.h> -+ -+extern __u32 gr_handle_sysctl(const ctl_table *table, const int op); -+extern int gr_handle_sysctl_mod(const char *dirname, const char *name, -+ const int op); -+extern int gr_handle_chroot_sysctl(const int op); - - /* External variables not in a header file. */ - extern int C_A_D; -@@ -163,6 +170,7 @@ static int proc_do_cad_pid(struct ctl_ta - static int proc_taint(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); - #endif -+extern ctl_table grsecurity_table[]; - - static struct ctl_table root_table[]; - static struct ctl_table_root sysctl_table_root; -@@ -195,6 +203,21 @@ extern struct ctl_table epoll_table[]; - int sysctl_legacy_va_layout; - #endif - -+#ifdef CONFIG_PAX_SOFTMODE -+static ctl_table pax_table[] = { -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "softmode", -+ .data = &pax_softmode, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0600, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = 0 } -+}; -+#endif -+ - extern int prove_locking; - extern int lock_stat; - -@@ -246,6 +269,24 @@ static int max_wakeup_granularity_ns = N - #endif - - static struct ctl_table kern_table[] = { -+#if defined(CONFIG_GRKERNSEC_SYSCTL) || defined(CONFIG_GRKERNSEC_MODSTOP) -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "grsecurity", -+ .mode = 0500, -+ .child = grsecurity_table, -+ }, -+#endif -+ -+#ifdef CONFIG_PAX_SOFTMODE -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "pax", -+ .mode = 0500, -+ .child = pax_table, -+ }, -+#endif -+ - #ifdef CONFIG_SCHED_DEBUG - { - .ctl_name = CTL_UNNUMBERED, -@@ -1734,6 +1775,8 @@ static int do_sysctl_strategy(struct ctl - return 0; - } - -+static int sysctl_perm_nochk(struct ctl_table_root *root, struct ctl_table *table, int op); -+ - static int parse_table(int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, -@@ -1752,7 +1795,7 @@ repeat: - if (n == table->ctl_name) { - int error; - if (table->child) { -- if (sysctl_perm(root, table, MAY_EXEC)) -+ if (sysctl_perm_nochk(root, table, MAY_EXEC)) - return -EPERM; - name++; - nlen--; -@@ -1837,6 +1880,33 @@ int sysctl_perm(struct ctl_table_root *r - int error; - int mode; - -+ if (table->parent != NULL && table->parent->procname != NULL && -+ table->procname != NULL && -+ 
gr_handle_sysctl_mod(table->parent->procname, table->procname, op)) -+ return -EACCES; -+ if (gr_handle_chroot_sysctl(op)) -+ return -EACCES; -+ error = gr_handle_sysctl(table, op); -+ if (error) -+ return error; -+ -+ error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); -+ if (error) -+ return error; -+ -+ if (root->permissions) -+ mode = root->permissions(root, current->nsproxy, table); -+ else -+ mode = table->mode; -+ -+ return test_perm(mode, op); -+} -+ -+int sysctl_perm_nochk(struct ctl_table_root *root, struct ctl_table *table, int op) -+{ -+ int error; -+ int mode; -+ - error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); - if (error) - return error; -diff -urNp linux-2.6.31.1/kernel/taskstats.c linux-2.6.31.1/kernel/taskstats.c ---- linux-2.6.31.1/kernel/taskstats.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/taskstats.c 2009-10-01 20:12:45.000000000 -0400 -@@ -26,9 +26,12 @@ - #include <linux/cgroup.h> - #include <linux/fs.h> - #include <linux/file.h> -+#include <linux/grsecurity.h> - #include <net/genetlink.h> - #include <asm/atomic.h> - -+extern int gr_is_taskstats_denied(int pid); -+ - /* - * Maximum length of a cpumask that can be specified in - * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute -@@ -433,6 +436,9 @@ static int taskstats_user_cmd(struct sk_ - size_t size; - cpumask_var_t mask; - -+ if (gr_is_taskstats_denied(current->pid)) -+ return -EACCES; -+ - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - -diff -urNp linux-2.6.31.1/kernel/time/tick-broadcast.c linux-2.6.31.1/kernel/time/tick-broadcast.c ---- linux-2.6.31.1/kernel/time/tick-broadcast.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/time/tick-broadcast.c 2009-10-01 20:12:45.000000000 -0400 -@@ -116,7 +116,7 @@ int tick_device_uses_broadcast(struct cl - * then clear the broadcast bit. 
- */ - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { -- int cpu = smp_processor_id(); -+ cpu = smp_processor_id(); - - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); - tick_broadcast_clear_oneshot(cpu); -diff -urNp linux-2.6.31.1/kernel/time/timer_list.c linux-2.6.31.1/kernel/time/timer_list.c ---- linux-2.6.31.1/kernel/time/timer_list.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/time/timer_list.c 2009-10-01 20:12:45.000000000 -0400 -@@ -275,7 +275,7 @@ static int timer_list_open(struct inode - return single_open(filp, timer_list_show, NULL); - } - --static struct file_operations timer_list_fops = { -+static const struct file_operations timer_list_fops = { - .open = timer_list_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/kernel/time/timer_stats.c linux-2.6.31.1/kernel/time/timer_stats.c ---- linux-2.6.31.1/kernel/time/timer_stats.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/time/timer_stats.c 2009-10-01 20:12:45.000000000 -0400 -@@ -395,7 +395,7 @@ static int tstats_open(struct inode *ino - return single_open(filp, tstats_show, NULL); - } - --static struct file_operations tstats_fops = { -+static const struct file_operations tstats_fops = { - .open = tstats_open, - .read = seq_read, - .write = tstats_write, -diff -urNp linux-2.6.31.1/kernel/time.c linux-2.6.31.1/kernel/time.c ---- linux-2.6.31.1/kernel/time.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/time.c 2009-10-01 20:12:45.000000000 -0400 -@@ -94,6 +94,9 @@ SYSCALL_DEFINE1(stime, time_t __user *, - return err; - - do_settimeofday(&tv); -+ -+ gr_log_timechange(); -+ - return 0; - } - -@@ -202,6 +205,8 @@ SYSCALL_DEFINE2(settimeofday, struct tim - return -EFAULT; - } - -+ gr_log_timechange(); -+ - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); - } - -@@ -240,7 +245,7 @@ EXPORT_SYMBOL(current_fs_time); - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ --unsigned int inline jiffies_to_msecs(const unsigned long j) -+inline unsigned int jiffies_to_msecs(const unsigned long j) - { - #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -@@ -256,7 +261,7 @@ unsigned int inline jiffies_to_msecs(con - } - EXPORT_SYMBOL(jiffies_to_msecs); - --unsigned int inline jiffies_to_usecs(const unsigned long j) -+inline unsigned int jiffies_to_usecs(const unsigned long j) - { - #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -diff -urNp linux-2.6.31.1/kernel/trace/ftrace.c linux-2.6.31.1/kernel/trace/ftrace.c ---- linux-2.6.31.1/kernel/trace/ftrace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/trace/ftrace.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1567,7 +1567,7 @@ static int t_show(struct seq_file *m, vo - return 0; - } - --static struct seq_operations show_ftrace_seq_ops = { -+static const struct seq_operations show_ftrace_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, -@@ -2565,7 +2565,7 @@ static int g_show(struct seq_file *m, vo - return 0; - } - --static struct seq_operations ftrace_graph_seq_ops = { -+static const struct seq_operations ftrace_graph_seq_ops = { - .start = g_start, - .next = g_next, - .stop = g_stop, -diff -urNp linux-2.6.31.1/kernel/trace/Kconfig linux-2.6.31.1/kernel/trace/Kconfig ---- linux-2.6.31.1/kernel/trace/Kconfig 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/trace/Kconfig 2009-10-01 20:12:45.000000000 -0400 -@@ -111,6 +111,7 @@ if FTRACE - config FUNCTION_TRACER - bool "Kernel Function Tracer" - depends on HAVE_FUNCTION_TRACER -+ depends on !PAX_KERNEXEC - select FRAME_POINTER - select KALLSYMS - select GENERIC_TRACER -@@ -326,6 +327,7 @@ config POWER_TRACER - config STACK_TRACER - bool "Trace max stack" - depends on HAVE_FUNCTION_TRACER -+ depends on !PAX_KERNEXEC - select FUNCTION_TRACER - select STACKTRACE - select KALLSYMS -diff -urNp linux-2.6.31.1/kernel/trace/trace.c linux-2.6.31.1/kernel/trace/trace.c ---- linux-2.6.31.1/kernel/trace/trace.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/trace/trace.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1885,7 +1885,7 @@ static int s_show(struct seq_file *m, vo - return 0; - } - --static struct seq_operations tracer_seq_ops = { -+static const struct seq_operations tracer_seq_ops = { - .start = s_start, - .next = s_next, - .stop = s_stop, -@@ -2097,7 +2097,7 @@ static int t_show(struct seq_file *m, vo - return 0; - } - --static struct seq_operations show_traces_seq_ops = { -+static const struct seq_operations show_traces_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, -diff -urNp linux-2.6.31.1/kernel/trace/trace_output.c linux-2.6.31.1/kernel/trace/trace_output.c ---- linux-2.6.31.1/kernel/trace/trace_output.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/trace/trace_output.c 2009-10-01 20:12:45.000000000 -0400 -@@ -234,7 +234,7 @@ int trace_seq_path(struct trace_seq *s, - return 0; - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { -- p = mangle_path(s->buffer + s->len, p, "\n"); -+ p = mangle_path(s->buffer + s->len, p, "\n\\"); - if (p) { - s->len = p - s->buffer; - return 1; -diff -urNp linux-2.6.31.1/kernel/utsname_sysctl.c linux-2.6.31.1/kernel/utsname_sysctl.c ---- linux-2.6.31.1/kernel/utsname_sysctl.c 
2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/kernel/utsname_sysctl.c 2009-10-01 20:12:45.000000000 -0400 -@@ -123,7 +123,7 @@ static struct ctl_table uts_kern_table[] - .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, - }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static struct ctl_table uts_root_table[] = { -@@ -133,7 +133,7 @@ static struct ctl_table uts_root_table[] - .mode = 0555, - .child = uts_kern_table, - }, -- {} -+ { 0, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL } - }; - - static int __init utsname_sysctl_init(void) -diff -urNp linux-2.6.31.1/lib/inflate.c linux-2.6.31.1/lib/inflate.c ---- linux-2.6.31.1/lib/inflate.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/lib/inflate.c 2009-10-01 20:12:45.000000000 -0400 -@@ -266,7 +266,7 @@ static void free(void *where) - malloc_ptr = free_mem_ptr; - } - #else --#define malloc(a) kmalloc(a, GFP_KERNEL) -+#define malloc(a) kmalloc((a), GFP_KERNEL) - #define free(a) kfree(a) - #endif - -diff -urNp linux-2.6.31.1/lib/Kconfig.debug linux-2.6.31.1/lib/Kconfig.debug ---- linux-2.6.31.1/lib/Kconfig.debug 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/lib/Kconfig.debug 2009-10-01 20:12:45.000000000 -0400 -@@ -866,7 +866,7 @@ config LATENCYTOP - select STACKTRACE - select SCHEDSTATS - select SCHED_DEBUG -- depends on HAVE_LATENCYTOP_SUPPORT -+ depends on HAVE_LATENCYTOP_SUPPORT && !GRKERNSEC_HIDESYM - help - Enable this option if you want to use the LatencyTOP tool - to find out which userspace is blocking on what kernel operations. -diff -urNp linux-2.6.31.1/lib/parser.c linux-2.6.31.1/lib/parser.c ---- linux-2.6.31.1/lib/parser.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/lib/parser.c 2009-10-01 20:12:45.000000000 -0400 -@@ -126,7 +126,7 @@ static int match_number(substring_t *s, - char *buf; - int ret; - -- buf = kmalloc(s->to - s->from + 1, GFP_KERNEL); -+ buf = kmalloc((s->to - s->from) + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - memcpy(buf, s->from, s->to - s->from); -diff -urNp linux-2.6.31.1/lib/radix-tree.c linux-2.6.31.1/lib/radix-tree.c ---- linux-2.6.31.1/lib/radix-tree.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/lib/radix-tree.c 2009-10-01 20:12:45.000000000 -0400 -@@ -81,7 +81,7 @@ struct radix_tree_preload { - int nr; - struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; - }; --static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; -+static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads); - - static inline gfp_t root_gfp_mask(struct radix_tree_root *root) - { -diff -urNp linux-2.6.31.1/lib/random32.c linux-2.6.31.1/lib/random32.c ---- linux-2.6.31.1/lib/random32.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/lib/random32.c 2009-10-01 20:12:45.000000000 -0400 -@@ -61,7 +61,7 @@ static u32 __random32(struct rnd_state * - */ - static inline u32 __seed(u32 x, u32 m) - { -- return (x < m) ? x + m : x; -+ return (x <= m) ? 
x + m + 1 : x; - } - - /** -diff -urNp linux-2.6.31.1/localversion-grsec linux-2.6.31.1/localversion-grsec ---- linux-2.6.31.1/localversion-grsec 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.31.1/localversion-grsec 2009-10-01 20:12:45.000000000 -0400 -@@ -0,0 +1 @@ -+-grsec -diff -urNp linux-2.6.31.1/Makefile linux-2.6.31.1/Makefile ---- linux-2.6.31.1/Makefile 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/Makefile 2009-10-01 20:12:45.000000000 -0400 -@@ -221,8 +221,8 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" - - HOSTCC = gcc - HOSTCXX = g++ --HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer --HOSTCXXFLAGS = -O2 -+HOSTCFLAGS = -Wall -W -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-delete-null-pointer-checks -+HOSTCXXFLAGS = -O2 -fno-delete-null-pointer-checks - - # Decide whether to build built-in, modular, or both. - # Normally, just do built-in. -@@ -639,7 +639,7 @@ export mod_strip_cmd - - - ifeq ($(KBUILD_EXTMOD),) --core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ -+core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ grsecurity/ - - vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ - $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ -diff -urNp linux-2.6.31.1/mm/filemap.c linux-2.6.31.1/mm/filemap.c ---- linux-2.6.31.1/mm/filemap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/filemap.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1648,7 +1648,7 @@ page_not_uptodate: - } - EXPORT_SYMBOL(filemap_fault); - --struct vm_operations_struct generic_file_vm_ops = { -+const struct vm_operations_struct generic_file_vm_ops = { - .fault = filemap_fault, - }; - -@@ -1659,7 +1659,7 @@ int generic_file_mmap(struct file * file - struct address_space *mapping = file->f_mapping; - - if (!mapping->a_ops->readpage) -- return -ENOEXEC; -+ return -ENODEV; - file_accessed(file); - vma->vm_ops = &generic_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; -@@ -2019,6 +2019,7 @@ inline int generic_write_checks(struct f - *pos = i_size_read(inode); - - if (limit != RLIM_INFINITY) { -+ gr_learn_resource(current, RLIMIT_FSIZE,*pos, 0); - if (*pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; -diff -urNp linux-2.6.31.1/mm/filemap_xip.c linux-2.6.31.1/mm/filemap_xip.c ---- linux-2.6.31.1/mm/filemap_xip.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/filemap_xip.c 2009-10-01 20:12:45.000000000 -0400 -@@ -296,7 +296,7 @@ out: - } - } - --static struct vm_operations_struct xip_file_vm_ops = { -+static const struct vm_operations_struct xip_file_vm_ops = { - .fault = xip_file_fault, - }; - -diff -urNp linux-2.6.31.1/mm/fremap.c linux-2.6.31.1/mm/fremap.c ---- linux-2.6.31.1/mm/fremap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/fremap.c 2009-10-01 20:12:45.000000000 -0400 -@@ -153,6 +153,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsign - retry: - vma = find_vma(mm, start); - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma && (mm->pax_flags & MF_PAX_SEGMEXEC) && (vma->vm_flags & VM_MAYEXEC)) -+ goto out; -+#endif -+ - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within -diff -urNp linux-2.6.31.1/mm/highmem.c linux-2.6.31.1/mm/highmem.c ---- linux-2.6.31.1/mm/highmem.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/highmem.c 2009-10-01 20:12:45.000000000 -0400 -@@ -94,6 +94,9 @@ static void flush_all_zero_pkmaps(void) - - for (i = 0; i < LAST_PKMAP; i++) { - struct page *page; -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif 
- - /* - * zero means we don't have anything to do, -@@ -116,9 +119,18 @@ static void flush_all_zero_pkmaps(void) - * So no dangers, even with speculative execution. - */ - page = pte_page(pkmap_page_table[i]); -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - pte_clear(&init_mm, (unsigned long)page_address(page), - &pkmap_page_table[i]); - -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ - set_page_address(page, NULL); - need_flush = 1; - } -@@ -140,6 +152,9 @@ static inline unsigned long map_new_virt - { - unsigned long vaddr; - int count; -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif - - start: - count = LAST_PKMAP; -@@ -177,8 +192,14 @@ start: - } - } - vaddr = PKMAP_ADDR(last_pkmap_nr); -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif - set_pte_at(&init_mm, vaddr, - &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif - - pkmap_count[last_pkmap_nr] = 1; - set_page_address(page, (void *)vaddr); -diff -urNp linux-2.6.31.1/mm/hugetlb.c linux-2.6.31.1/mm/hugetlb.c ---- linux-2.6.31.1/mm/hugetlb.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/hugetlb.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1689,7 +1689,7 @@ static int hugetlb_vm_op_fault(struct vm - return 0; - } - --struct vm_operations_struct hugetlb_vm_ops = { -+const struct vm_operations_struct hugetlb_vm_ops = { - .fault = hugetlb_vm_op_fault, - .open = hugetlb_vm_op_open, - .close = hugetlb_vm_op_close, -@@ -1892,6 +1892,26 @@ static int unmap_ref_private(struct mm_s - return 1; - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+static void pax_mirror_huge_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ struct vm_area_struct *vma_m; -+ unsigned long address_m; -+ pte_t *ptep_m; -+ -+ vma_m = pax_find_mirror_vma(vma); -+ if (!vma_m) -+ return; -+ -+ BUG_ON(address >= SEGMEXEC_TASK_SIZE); -+ address_m = address + SEGMEXEC_TASK_SIZE; -+ ptep_m = huge_pte_offset(mm, address_m & HPAGE_MASK); -+ get_page(page_m); -+ set_huge_pte_at(mm, address_m, ptep_m, make_huge_pte(vma_m, page_m, 0)); -+} -+#endif -+ - static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page) -@@ -1963,6 +1983,11 @@ retry_avoidcopy: - huge_ptep_clear_flush(vma, address, ptep); - set_huge_pte_at(mm, address, ptep, - make_huge_pte(vma, new_page, 1)); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ pax_mirror_huge_pte(vma, address, new_page); -+#endif -+ - /* Make the old page be freed below */ - new_page = old_page; - } -@@ -2072,6 +2097,10 @@ retry: - && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, address, ptep, new_pte); - -+#ifdef CONFIG_PAX_SEGMEXEC -+ pax_mirror_huge_pte(vma, address, page); -+#endif -+ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); -@@ -2100,6 +2129,28 @@ int hugetlb_fault(struct mm_struct *mm, - static DEFINE_MUTEX(hugetlb_instantiation_mutex); - struct hstate *h = hstate_vma(vma); - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m; -+ -+ vma_m = pax_find_mirror_vma(vma); -+ if (vma_m) { -+ unsigned long address_m; -+ -+ if (vma->vm_start > vma_m->vm_start) { -+ address_m = address; -+ address -= SEGMEXEC_TASK_SIZE; -+ vma = vma_m; -+ h = hstate_vma(vma); -+ } else -+ address_m = address + 
SEGMEXEC_TASK_SIZE; -+ -+ if (!huge_pte_alloc(mm, address_m, huge_page_size(h))) -+ return VM_FAULT_OOM; -+ address_m &= HPAGE_MASK; -+ unmap_hugepage_range(vma, address_m, address_m + HPAGE_SIZE, NULL); -+ } -+#endif -+ - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; -diff -urNp linux-2.6.31.1/mm/Kconfig linux-2.6.31.1/mm/Kconfig ---- linux-2.6.31.1/mm/Kconfig 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/Kconfig 2009-10-01 20:12:45.000000000 -0400 -@@ -216,7 +216,7 @@ config MMU_NOTIFIER - - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" -- default 4096 -+ default 65536 - help - This is the portion of low virtual memory which should be protected - from userspace allocation. Keeping a user from writing to low pages -diff -urNp linux-2.6.31.1/mm/madvise.c linux-2.6.31.1/mm/madvise.c ---- linux-2.6.31.1/mm/madvise.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/madvise.c 2009-10-01 20:12:45.000000000 -0400 -@@ -43,6 +43,10 @@ static long madvise_behavior(struct vm_a - pgoff_t pgoff; - int new_flags = vma->vm_flags; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m; -+#endif -+ - switch (behavior) { - case MADV_NORMAL: - new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; -@@ -92,6 +96,13 @@ success: - /* - * vm_flags is protected by the mmap_sem held in write mode. - */ -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ vma_m = pax_find_mirror_vma(vma); -+ if (vma_m) -+ vma_m->vm_flags = new_flags & ~(VM_WRITE | VM_MAYWRITE | VM_ACCOUNT); -+#endif -+ - vma->vm_flags = new_flags; - - out: -@@ -235,6 +246,17 @@ madvise_vma(struct vm_area_struct *vma, - - case MADV_DONTNEED: - error = madvise_dontneed(vma, prev, start, end); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (!error) { -+ struct vm_area_struct *vma_m, *prev_m; -+ -+ vma_m = pax_find_mirror_vma(vma); -+ if (vma_m) -+ error = madvise_dontneed(vma_m, &prev_m, start + SEGMEXEC_TASK_SIZE, end + SEGMEXEC_TASK_SIZE); -+ } -+#endif -+ - break; - - default: -@@ -328,6 +350,16 @@ SYSCALL_DEFINE3(madvise, unsigned long, - if (end < start) - goto out; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { -+ if (end > SEGMEXEC_TASK_SIZE) -+ goto out; -+ } else -+#endif -+ -+ if (end > TASK_SIZE) -+ goto out; -+ - error = 0; - if (end == start) - goto out; -diff -urNp linux-2.6.31.1/mm/memory.c linux-2.6.31.1/mm/memory.c ---- linux-2.6.31.1/mm/memory.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/memory.c 2009-10-01 20:12:45.000000000 -0400 -@@ -47,6 +47,7 @@ - #include <linux/pagemap.h> - #include <linux/rmap.h> - #include <linux/module.h> -+#include <linux/security.h> - #include <linux/delayacct.h> - #include <linux/init.h> - #include <linux/writeback.h> -@@ -1228,11 +1229,11 @@ int __get_user_pages(struct task_struct - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - i = 0; - -- do { -+ while (nr_pages) { - struct vm_area_struct *vma; - unsigned int foll_flags; - -- vma = find_extend_vma(mm, start); -+ vma = find_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { - unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); -@@ -1274,7 +1275,7 @@ int __get_user_pages(struct task_struct - continue; - } - -- if (!vma || -+ if (!vma || start < vma->vm_start || - (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - (!ignore && !(vm_flags & vma->vm_flags))) - return i ? 
: -EFAULT; -@@ -1360,7 +1361,7 @@ int __get_user_pages(struct task_struct - start += PAGE_SIZE; - nr_pages--; - } while (nr_pages && start < vma->vm_end); -- } while (nr_pages); -+ } - return i; - } - -@@ -1926,6 +1927,186 @@ static inline void cow_user_page(struct - copy_user_highpage(dst, src, va, vma); - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+static void pax_unmap_mirror_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ spinlock_t *ptl; -+ pte_t *pte, entry; -+ -+ pte = pte_offset_map_lock(mm, pmd, address, &ptl); -+ entry = *pte; -+ if (!pte_present(entry)) { -+ if (!pte_none(entry)) { -+ BUG_ON(pte_file(entry)); -+ free_swap_and_cache(pte_to_swp_entry(entry)); -+ pte_clear_not_present_full(mm, address, pte, 0); -+ } -+ } else { -+ struct page *page; -+ -+ flush_cache_page(vma, address, pte_pfn(entry)); -+ entry = ptep_clear_flush(vma, address, pte); -+ BUG_ON(pte_dirty(entry)); -+ page = vm_normal_page(vma, address, entry); -+ if (page) { -+ update_hiwater_rss(mm); -+ if (PageAnon(page)) -+ dec_mm_counter(mm, anon_rss); -+ else -+ dec_mm_counter(mm, file_rss); -+ page_remove_rmap(page); -+ page_cache_release(page); -+ } -+ } -+ pte_unmap_unlock(pte, ptl); -+} -+ -+/* PaX: if vma is mirrored, synchronize the mirror's PTE -+ * -+ * the ptl of the lower mapped page is held on entry and is not released on exit -+ * or inside to ensure atomic changes to the PTE states (swapout, mremap, munmap, etc) -+ */ -+static void pax_mirror_anon_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m, spinlock_t *ptl) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long address_m; -+ spinlock_t *ptl_m; -+ struct vm_area_struct *vma_m; -+ pmd_t *pmd_m; -+ pte_t *pte_m, entry_m; -+ -+ BUG_ON(!page_m || !PageAnon(page_m)); -+ -+ vma_m = pax_find_mirror_vma(vma); -+ if (!vma_m) -+ return; -+ -+ BUG_ON(!PageLocked(page_m)); -+ BUG_ON(address >= SEGMEXEC_TASK_SIZE); -+ address_m = address + SEGMEXEC_TASK_SIZE; -+ pmd_m = pmd_offset(pud_offset(pgd_offset(mm, address_m), address_m), address_m); -+ pte_m = pte_offset_map_nested(pmd_m, address_m); -+ ptl_m = pte_lockptr(mm, pmd_m); -+ if (ptl != ptl_m) { -+ spin_lock_nested(ptl_m, SINGLE_DEPTH_NESTING); -+ if (!pte_none(*pte_m)) -+ goto out; -+ } -+ -+ entry_m = pfn_pte(page_to_pfn(page_m), vma_m->vm_page_prot); -+ page_cache_get(page_m); -+ page_add_anon_rmap(page_m, vma_m, address_m); -+ inc_mm_counter(mm, anon_rss); -+ set_pte_at(mm, address_m, pte_m, entry_m); -+ update_mmu_cache(vma_m, address_m, entry_m); -+out: -+ if (ptl != ptl_m) -+ spin_unlock(ptl_m); -+ pte_unmap_nested(pte_m); -+ unlock_page(page_m); -+} -+ -+void pax_mirror_file_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m, spinlock_t *ptl) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long address_m; -+ spinlock_t *ptl_m; -+ struct vm_area_struct *vma_m; -+ pmd_t *pmd_m; -+ pte_t *pte_m, entry_m; -+ -+ BUG_ON(!page_m || PageAnon(page_m)); -+ -+ vma_m = pax_find_mirror_vma(vma); -+ if (!vma_m) -+ return; -+ -+ BUG_ON(address >= SEGMEXEC_TASK_SIZE); -+ address_m = address + SEGMEXEC_TASK_SIZE; -+ pmd_m = pmd_offset(pud_offset(pgd_offset(mm, address_m), address_m), address_m); -+ pte_m = pte_offset_map_nested(pmd_m, address_m); -+ ptl_m = pte_lockptr(mm, pmd_m); -+ if (ptl != ptl_m) { -+ spin_lock_nested(ptl_m, SINGLE_DEPTH_NESTING); -+ if (!pte_none(*pte_m)) -+ goto out; -+ } -+ -+ entry_m = pfn_pte(page_to_pfn(page_m), vma_m->vm_page_prot); -+ page_cache_get(page_m); -+ 
page_add_file_rmap(page_m); -+ inc_mm_counter(mm, file_rss); -+ set_pte_at(mm, address_m, pte_m, entry_m); -+ update_mmu_cache(vma_m, address_m, entry_m); -+out: -+ if (ptl != ptl_m) -+ spin_unlock(ptl_m); -+ pte_unmap_nested(pte_m); -+} -+ -+static void pax_mirror_pfn_pte(struct vm_area_struct *vma, unsigned long address, unsigned long pfn_m, spinlock_t *ptl) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long address_m; -+ spinlock_t *ptl_m; -+ struct vm_area_struct *vma_m; -+ pmd_t *pmd_m; -+ pte_t *pte_m, entry_m; -+ -+ vma_m = pax_find_mirror_vma(vma); -+ if (!vma_m) -+ return; -+ -+ BUG_ON(address >= SEGMEXEC_TASK_SIZE); -+ address_m = address + SEGMEXEC_TASK_SIZE; -+ pmd_m = pmd_offset(pud_offset(pgd_offset(mm, address_m), address_m), address_m); -+ pte_m = pte_offset_map_nested(pmd_m, address_m); -+ ptl_m = pte_lockptr(mm, pmd_m); -+ if (ptl != ptl_m) { -+ spin_lock_nested(ptl_m, SINGLE_DEPTH_NESTING); -+ if (!pte_none(*pte_m)) -+ goto out; -+ } -+ -+ entry_m = pfn_pte(pfn_m, vma_m->vm_page_prot); -+ set_pte_at(mm, address_m, pte_m, entry_m); -+out: -+ if (ptl != ptl_m) -+ spin_unlock(ptl_m); -+ pte_unmap_nested(pte_m); -+} -+ -+static void pax_mirror_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, spinlock_t *ptl) -+{ -+ struct page *page_m; -+ pte_t entry; -+ -+ if (!(vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC)) -+ goto out; -+ -+ entry = *pte; -+ page_m = vm_normal_page(vma, address, entry); -+ if (!page_m) -+ pax_mirror_pfn_pte(vma, address, pte_pfn(entry), ptl); -+ else if (PageAnon(page_m)) { -+ if (pax_find_mirror_vma(vma)) { -+ pte_unmap_unlock(pte, ptl); -+ lock_page(page_m); -+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); -+ if (pte_same(entry, *pte)) -+ pax_mirror_anon_pte(vma, address, page_m, ptl); -+ else -+ unlock_page(page_m); -+ } -+ } else -+ pax_mirror_file_pte(vma, address, page_m, ptl); -+ -+out: -+ pte_unmap_unlock(pte, ptl); -+} -+#endif -+ - /* - * This routine handles present pages, when users try to write - * to a shared page. It is done by copying the page to a new address -@@ -2098,6 +2279,12 @@ gotten: - */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (pax_find_mirror_vma(vma)) -+ BUG_ON(!trylock_page(new_page)); -+#endif -+ - if (old_page) { - if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); -@@ -2144,6 +2331,10 @@ gotten: - page_remove_rmap(old_page); - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+ pax_mirror_anon_pte(vma, address, new_page, ptl); -+#endif -+ - /* Free the old page.. 
*/ - new_page = old_page; - ret |= VM_FAULT_WRITE; -@@ -2425,6 +2616,7 @@ int vmtruncate(struct inode * inode, lof - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; -+ gr_learn_resource(current, RLIMIT_FSIZE, offset, 1); - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) -@@ -2587,6 +2779,11 @@ static int do_swap_page(struct mm_struct - swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - try_to_free_swap(page); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((flags & FAULT_FLAG_WRITE) || !pax_find_mirror_vma(vma)) -+#endif -+ - unlock_page(page); - - if (flags & FAULT_FLAG_WRITE) { -@@ -2598,6 +2795,11 @@ static int do_swap_page(struct mm_struct - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ pax_mirror_anon_pte(vma, address, page, ptl); -+#endif -+ - unlock: - pte_unmap_unlock(page_table, ptl); - out: -@@ -2643,12 +2845,23 @@ static int do_anonymous_page(struct mm_s - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) - goto release; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (pax_find_mirror_vma(vma)) -+ BUG_ON(!trylock_page(page)); -+#endif -+ - inc_mm_counter(mm, anon_rss); - page_add_new_anon_rmap(page, vma, address); - set_pte_at(mm, address, page_table, entry); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ pax_mirror_anon_pte(vma, address, page, ptl); -+#endif -+ - unlock: - pte_unmap_unlock(page_table, ptl); - return 0; -@@ -2785,6 +2998,12 @@ static int __do_fault(struct mm_struct * - */ - /* Only go through if we didn't race with anybody else... 
*/ - if (likely(pte_same(*page_table, orig_pte))) { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (anon && pax_find_mirror_vma(vma)) -+ BUG_ON(!trylock_page(page)); -+#endif -+ - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) -@@ -2804,6 +3023,14 @@ static int __do_fault(struct mm_struct * - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, entry); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (anon) -+ pax_mirror_anon_pte(vma, address, page, ptl); -+ else -+ pax_mirror_file_pte(vma, address, page, ptl); -+#endif -+ - } else { - if (charged) - mem_cgroup_uncharge_page(page); -@@ -2951,6 +3178,12 @@ static inline int handle_pte_fault(struc - if (flags & FAULT_FLAG_WRITE) - flush_tlb_page(vma, address); - } -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ pax_mirror_pte(vma, address, pte, pmd, ptl); -+ return 0; -+#endif -+ - unlock: - pte_unmap_unlock(pte, ptl); - return 0; -@@ -2967,6 +3200,10 @@ int handle_mm_fault(struct mm_struct *mm - pmd_t *pmd; - pte_t *pte; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m; -+#endif -+ - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); -@@ -2974,6 +3211,34 @@ int handle_mm_fault(struct mm_struct *mm - if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, flags); - -+#ifdef CONFIG_PAX_SEGMEXEC -+ vma_m = pax_find_mirror_vma(vma); -+ if (vma_m) { -+ unsigned long address_m; -+ pgd_t *pgd_m; -+ pud_t *pud_m; -+ pmd_t *pmd_m; -+ -+ if (vma->vm_start > vma_m->vm_start) { -+ address_m = address; -+ address -= SEGMEXEC_TASK_SIZE; -+ vma = vma_m; -+ } else -+ address_m = address + SEGMEXEC_TASK_SIZE; -+ -+ pgd_m = pgd_offset(mm, address_m); -+ pud_m = pud_alloc(mm, pgd_m, address_m); -+ if (!pud_m) -+ return VM_FAULT_OOM; -+ pmd_m = pmd_alloc(mm, pud_m, address_m); -+ if (!pmd_m) -+ return VM_FAULT_OOM; -+ if (!pmd_present(*pmd_m) && __pte_alloc(mm, pmd_m, address_m)) -+ return VM_FAULT_OOM; -+ pax_unmap_mirror_pte(vma_m, address_m, pmd_m); -+ } -+#endif -+ - pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) -@@ -3071,7 +3336,7 @@ static int __init gate_vma_init(void) - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; -- gate_vma.vm_page_prot = __P101; -+ gate_vma.vm_page_prot = vm_get_page_prot(gate_vma.vm_flags); - /* - * Make sure the vDSO gets into every core dump. 
- * Dumping its contents makes post-mortem fully interpretable later -diff -urNp linux-2.6.31.1/mm/mempolicy.c linux-2.6.31.1/mm/mempolicy.c ---- linux-2.6.31.1/mm/mempolicy.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/mempolicy.c 2009-10-01 20:12:45.000000000 -0400 -@@ -573,6 +573,10 @@ static int mbind_range(struct vm_area_st - struct vm_area_struct *next; - int err; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m; -+#endif -+ - err = 0; - for (; vma && vma->vm_start < end; vma = next) { - next = vma->vm_next; -@@ -584,6 +588,16 @@ static int mbind_range(struct vm_area_st - err = policy_vma(vma, new); - if (err) - break; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ vma_m = pax_find_mirror_vma(vma); -+ if (vma_m) { -+ err = policy_vma(vma_m, new); -+ if (err) -+ break; -+ } -+#endif -+ - } - return err; - } -@@ -1002,6 +1016,17 @@ static long do_mbind(unsigned long start - - if (end < start) - return -EINVAL; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (mm->pax_flags & MF_PAX_SEGMEXEC) { -+ if (end > SEGMEXEC_TASK_SIZE) -+ return -EINVAL; -+ } else -+#endif -+ -+ if (end > TASK_SIZE) -+ return -EINVAL; -+ - if (end == start) - return 0; - -@@ -1206,6 +1231,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi - if (!mm) - return -EINVAL; - -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ if (mm != current->mm && -+ (mm->pax_flags & MF_PAX_RANDMMAP || mm->pax_flags & MF_PAX_SEGMEXEC)) { -+ err = -EPERM; -+ goto out; -+ } -+#endif -+ - /* - * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative -@@ -1215,8 +1248,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi - rcu_read_lock(); - tcred = __task_cred(task); - if (cred->euid != tcred->suid && cred->euid != tcred->uid && -- cred->uid != tcred->suid && cred->uid != tcred->uid && -- !capable(CAP_SYS_NICE)) { -+ cred->uid != tcred->suid && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - err = -EPERM; - goto out; -@@ -2385,7 +2417,7 @@ int show_numa_map(struct seq_file *m, vo - - if (file) { - seq_printf(m, " file="); -- seq_path(m, &file->f_path, "\n\t= "); -+ seq_path(m, &file->f_path, "\n\t\\= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { - seq_printf(m, " heap"); - } else if (vma->vm_start <= mm->start_stack && -diff -urNp linux-2.6.31.1/mm/migrate.c linux-2.6.31.1/mm/migrate.c ---- linux-2.6.31.1/mm/migrate.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/migrate.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1087,6 +1087,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, - if (!mm) - return -EINVAL; - -+#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP -+ if (mm != current->mm && -+ (mm->pax_flags & MF_PAX_RANDMMAP || mm->pax_flags & MF_PAX_SEGMEXEC)) { -+ err = -EPERM; -+ goto out; -+ } -+#endif -+ - /* - * Check if this process has the right to modify the specified - * process. 
The right exists if the process has administrative -@@ -1096,8 +1104,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, - rcu_read_lock(); - tcred = __task_cred(task); - if (cred->euid != tcred->suid && cred->euid != tcred->uid && -- cred->uid != tcred->suid && cred->uid != tcred->uid && -- !capable(CAP_SYS_NICE)) { -+ cred->uid != tcred->suid && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - err = -EPERM; - goto out; -diff -urNp linux-2.6.31.1/mm/mlock.c linux-2.6.31.1/mm/mlock.c ---- linux-2.6.31.1/mm/mlock.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/mlock.c 2009-10-01 20:12:45.000000000 -0400 -@@ -13,6 +13,7 @@ - #include <linux/pagemap.h> - #include <linux/mempolicy.h> - #include <linux/syscalls.h> -+#include <linux/security.h> - #include <linux/sched.h> - #include <linux/module.h> - #include <linux/rmap.h> -@@ -431,6 +432,17 @@ static int do_mlock(unsigned long start, - return -EINVAL; - if (end == start) - return 0; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { -+ if (end > SEGMEXEC_TASK_SIZE) -+ return -EINVAL; -+ } else -+#endif -+ -+ if (end > TASK_SIZE) -+ return -EINVAL; -+ - vma = find_vma_prev(current->mm, start, &prev); - if (!vma || vma->vm_start > start) - return -ENOMEM; -@@ -490,6 +502,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, st - lock_limit >>= PAGE_SHIFT; - - /* check against resource limits */ -+ gr_learn_resource(current, RLIMIT_MEMLOCK, (current->mm->locked_vm << PAGE_SHIFT) + len, 1); - if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) - error = do_mlock(start, len, 1); - up_write(&current->mm->mmap_sem); -@@ -511,10 +524,10 @@ SYSCALL_DEFINE2(munlock, unsigned long, - static int do_mlockall(int flags) - { - struct vm_area_struct * vma, * prev = NULL; -- unsigned int def_flags = 0; -+ unsigned int def_flags = current->mm->def_flags & ~VM_LOCKED; - - if (flags & MCL_FUTURE) -- def_flags = VM_LOCKED; -+ def_flags |= VM_LOCKED; - current->mm->def_flags = def_flags; - if (flags == MCL_FUTURE) - goto out; -@@ -522,6 +535,12 @@ static int do_mlockall(int flags) - for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { - unsigned int newflags; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((current->mm->pax_flags & MF_PAX_SEGMEXEC) && (vma->vm_start >= SEGMEXEC_TASK_SIZE)) -+ break; -+#endif -+ -+ BUG_ON(vma->vm_end > TASK_SIZE); - newflags = vma->vm_flags | VM_LOCKED; - if (!(flags & MCL_CURRENT)) - newflags &= ~VM_LOCKED; -@@ -553,6 +572,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) - lock_limit >>= PAGE_SHIFT; - - ret = -ENOMEM; -+ gr_learn_resource(current, RLIMIT_MEMLOCK, current->mm->total_vm, 1); - if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || - capable(CAP_IPC_LOCK)) - ret = do_mlockall(flags); -diff -urNp linux-2.6.31.1/mm/mmap.c linux-2.6.31.1/mm/mmap.c ---- linux-2.6.31.1/mm/mmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/mmap.c 2009-10-01 20:12:45.000000000 -0400 -@@ -45,6 +45,16 @@ - #define arch_rebalance_pgtables(addr, len) (addr) - #endif - -+static inline void verify_mm_writelocked(struct mm_struct *mm) -+{ -+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAX) -+ if (unlikely(down_read_trylock(&mm->mmap_sem))) { -+ up_read(&mm->mmap_sem); -+ BUG(); -+ } -+#endif -+} -+ - static void unmap_region(struct mm_struct *mm, - struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long start, unsigned long end); -@@ -70,16 +80,25 @@ static void unmap_region(struct mm_struc - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - */ --pgprot_t protection_map[16] = { 
-+pgprot_t protection_map[16] __read_only = { - __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, - __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 - }; - - pgprot_t vm_get_page_prot(unsigned long vm_flags) - { -- return __pgprot(pgprot_val(protection_map[vm_flags & -+ pgprot_t prot = __pgprot(pgprot_val(protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | - pgprot_val(arch_vm_get_page_prot(vm_flags))); -+ -+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_X86_32) -+ if (!nx_enabled && -+ (vm_flags & (VM_PAGEEXEC | VM_EXEC)) == VM_PAGEEXEC && -+ (vm_flags & (VM_READ | VM_WRITE))) -+ prot = __pgprot(pte_val(pte_exprotect(__pte(pgprot_val(prot))))); -+#endif -+ -+ return prot; - } - EXPORT_SYMBOL(vm_get_page_prot); - -@@ -231,6 +250,7 @@ static struct vm_area_struct *remove_vma - struct vm_area_struct *next = vma->vm_next; - - might_sleep(); -+ BUG_ON(vma->vm_mirror); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - if (vma->vm_file) { -@@ -267,6 +287,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) - * not page aligned -Ram Gupta - */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; -+ gr_learn_resource(current, RLIMIT_DATA, (brk - mm->start_brk) + (mm->end_data - mm->start_data), 1); - if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) - goto out; -@@ -696,6 +717,12 @@ static int - can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) - { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_start == SEGMEXEC_TASK_SIZE) -+ return 0; -+#endif -+ - if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { - if (vma->vm_pgoff == vm_pgoff) -@@ -715,6 +742,12 @@ static int - can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) - { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_end == SEGMEXEC_TASK_SIZE) -+ return 0; -+#endif -+ - if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { - pgoff_t vm_pglen; -@@ -757,12 +790,19 @@ can_vma_merge_after(struct vm_area_struc - struct vm_area_struct *vma_merge(struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, -- struct anon_vma *anon_vma, struct file *file, -+ struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) - { - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ unsigned long addr_m = addr + SEGMEXEC_TASK_SIZE, end_m = end + SEGMEXEC_TASK_SIZE; -+ struct vm_area_struct *area_m = NULL, *next_m = NULL, *prev_m = NULL; -+ -+ BUG_ON((mm->pax_flags & MF_PAX_SEGMEXEC) && SEGMEXEC_TASK_SIZE < end); -+#endif -+ - /* - * We later require that vma->vm_flags == vm_flags, - * so this tests vma->vm_flags & VM_SPECIAL, too. -@@ -778,6 +818,15 @@ struct vm_area_struct *vma_merge(struct - if (next && next->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (prev) -+ prev_m = pax_find_mirror_vma(prev); -+ if (area) -+ area_m = pax_find_mirror_vma(area); -+ if (next) -+ next_m = pax_find_mirror_vma(next); -+#endif -+ - /* - * Can it merge with the predecessor? 
- */ -@@ -797,9 +846,24 @@ struct vm_area_struct *vma_merge(struct - /* cases 1, 6 */ - vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL); -- } else /* cases 2, 5, 7 */ -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (prev_m) -+ vma_adjust(prev_m, prev_m->vm_start, -+ next_m->vm_end, prev_m->vm_pgoff, NULL); -+#endif -+ -+ } else { /* cases 2, 5, 7 */ - vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (prev_m) -+ vma_adjust(prev_m, prev_m->vm_start, -+ end_m, prev_m->vm_pgoff, NULL); -+#endif -+ -+ } - return prev; - } - -@@ -810,12 +874,27 @@ struct vm_area_struct *vma_merge(struct - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { -- if (prev && addr < prev->vm_end) /* case 4 */ -+ if (prev && addr < prev->vm_end) { /* case 4 */ - vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL); -- else /* cases 3, 8 */ -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (prev_m) -+ vma_adjust(prev_m, prev_m->vm_start, -+ addr_m, prev_m->vm_pgoff, NULL); -+#endif -+ -+ } else { /* cases 3, 8 */ - vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (area_m) -+ vma_adjust(area_m, addr_m, next_m->vm_end, -+ next_m->vm_pgoff - pglen, NULL); -+#endif -+ -+ } - return area; - } - -@@ -890,14 +969,11 @@ none: - void vm_stat_account(struct mm_struct *mm, unsigned long flags, - struct file *file, long pages) - { -- const unsigned long stack_flags -- = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); -- - if (file) { - mm->shared_vm += pages; - if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) - mm->exec_vm += pages; -- } else if (flags & stack_flags) -+ } else if (flags & (VM_GROWSUP|VM_GROWSDOWN)) - mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; -@@ -924,7 +1000,7 @@ unsigned long do_mmap_pgoff(struct file - * (the exception is when the underlying filesystem is noexec - * mounted, in which case we dont add PROT_EXEC.) - */ -- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) -+ if ((prot & (PROT_READ | PROT_WRITE)) && (current->personality & READ_IMPLIES_EXEC)) - if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) - prot |= PROT_EXEC; - -@@ -934,15 +1010,15 @@ unsigned long do_mmap_pgoff(struct file - if (!(flags & MAP_FIXED)) - addr = round_hint_to_min(addr); - -- error = arch_mmap_check(addr, len, flags); -- if (error) -- return error; -- - /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) - return -ENOMEM; - -+ error = arch_mmap_check(addr, len, flags); -+ if (error) -+ return error; -+ - /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) - return -EOVERFLOW; -@@ -954,7 +1030,7 @@ unsigned long do_mmap_pgoff(struct file - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. - */ -- addr = get_unmapped_area(file, addr, len, pgoff, flags); -+ addr = get_unmapped_area(file, addr, len, pgoff, flags | ((prot & PROT_EXEC) ? 
MAP_EXECUTABLE : 0)); - if (addr & ~PAGE_MASK) - return addr; - -@@ -965,6 +1041,26 @@ unsigned long do_mmap_pgoff(struct file - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+ if (mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (mm->pax_flags & MF_PAX_MPROTECT) { -+ if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC) -+ vm_flags &= ~(VM_EXEC | VM_MAYEXEC); -+ else -+ vm_flags &= ~(VM_WRITE | VM_MAYWRITE); -+ } -+#endif -+ -+ } -+#endif -+ -+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_X86_32) -+ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && file) -+ vm_flags &= ~VM_PAGEEXEC; -+#endif -+ - if (flags & MAP_LOCKED) { - if (!can_do_mlock()) - return -EPERM; -@@ -978,6 +1074,7 @@ unsigned long do_mmap_pgoff(struct file - locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; -+ gr_learn_resource(current, RLIMIT_MEMLOCK, locked << PAGE_SHIFT, 1); - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } -@@ -1051,6 +1148,9 @@ unsigned long do_mmap_pgoff(struct file - if (error) - return error; - -+ if (!gr_acl_handle_mmap(file, prot)) -+ return -EACCES; -+ - return mmap_region(file, addr, len, flags, vm_flags, pgoff); - } - EXPORT_SYMBOL(do_mmap_pgoff); -@@ -1063,10 +1163,10 @@ EXPORT_SYMBOL(do_mmap_pgoff); - */ - int vma_wants_writenotify(struct vm_area_struct *vma) - { -- unsigned int vm_flags = vma->vm_flags; -+ unsigned long vm_flags = vma->vm_flags; - - /* If it was private or non-writable, the write bit is already clear */ -- if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) -+ if ((vm_flags & (VM_WRITE|VM_SHARED)) != (VM_WRITE|VM_SHARED)) - return 0; - - /* The backer wishes to know when pages are first written to? */ -@@ -1115,14 +1215,24 @@ unsigned long mmap_region(struct file *f - unsigned long charged = 0; - struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m = NULL; -+#endif -+ -+ /* -+ * mm->mmap_sem is required to protect against another thread -+ * changing the mappings in case we sleep. -+ */ -+ verify_mm_writelocked(mm); -+ - /* Clear old maps */ - error = -ENOMEM; --munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { - if (do_munmap(mm, addr, len)) - return -ENOMEM; -- goto munmap_back; -+ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); -+ BUG_ON(vma && vma->vm_start < addr + len); - } - - /* Check against address space limit. 
*/ -@@ -1171,6 +1281,16 @@ munmap_back: - goto unacct_error; - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (vm_flags & VM_EXEC)) { -+ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); -+ if (!vma_m) { -+ error = -ENOMEM; -+ goto free_vma; -+ } -+ } -+#endif -+ - vma->vm_mm = mm; - vma->vm_start = addr; - vma->vm_end = addr + len; -@@ -1193,6 +1313,19 @@ munmap_back: - error = file->f_op->mmap(file, vma); - if (error) - goto unmap_and_free_vma; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma_m && (vm_flags & VM_EXECUTABLE)) -+ added_exe_file_vma(mm); -+#endif -+ -+#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_X86_32) -+ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && !(vma->vm_flags & VM_SPECIAL)) { -+ vma->vm_flags |= VM_PAGEEXEC; -+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); -+ } -+#endif -+ - if (vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } else if (vm_flags & VM_SHARED) { -@@ -1216,6 +1349,11 @@ munmap_back: - vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma_m) -+ pax_mirror_vma(vma_m, vma); -+#endif -+ - /* Once vma denies write, undo our temporary denial count */ - if (correct_wcount) - atomic_inc(&inode->i_writecount); -@@ -1224,6 +1362,7 @@ out: - - mm->total_vm += len >> PAGE_SHIFT; - vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); -+ track_exec_limit(mm, addr, addr + len, vm_flags); - if (vm_flags & VM_LOCKED) { - /* - * makes pages present; downgrades, drops, reacquires mmap_sem -@@ -1246,6 +1385,12 @@ unmap_and_free_vma: - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; - free_vma: -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma_m) -+ kmem_cache_free(vm_area_cachep, vma_m); -+#endif -+ - kmem_cache_free(vm_area_cachep, vma); - unacct_error: - if (charged) -@@ -1279,6 +1424,10 @@ arch_get_unmapped_area(struct file *filp - if (flags & MAP_FIXED) - return addr; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); -@@ -1287,10 +1436,10 @@ arch_get_unmapped_area(struct file *filp - return addr; - } - if (len > mm->cached_hole_size) { -- start_addr = addr = mm->free_area_cache; -+ start_addr = addr = mm->free_area_cache; - } else { -- start_addr = addr = TASK_UNMAPPED_BASE; -- mm->cached_hole_size = 0; -+ start_addr = addr = mm->mmap_base; -+ mm->cached_hole_size = 0; - } - - full_search: -@@ -1301,9 +1450,8 @@ full_search: - * Start a new search - just in case we missed - * some holes. - */ -- if (start_addr != TASK_UNMAPPED_BASE) { -- addr = TASK_UNMAPPED_BASE; -- start_addr = addr; -+ if (start_addr != mm->mmap_base) { -+ start_addr = addr = mm->mmap_base; - mm->cached_hole_size = 0; - goto full_search; - } -@@ -1325,10 +1473,16 @@ full_search: - - void arch_unmap_area(struct mm_struct *mm, unsigned long addr) - { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && SEGMEXEC_TASK_SIZE <= addr) -+ return; -+#endif -+ - /* - * Is this a new hole at the lowest possible address? 
- */ -- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { -+ if (addr >= mm->mmap_base && addr < mm->free_area_cache) { - mm->free_area_cache = addr; - mm->cached_hole_size = ~0UL; - } -@@ -1346,7 +1500,7 @@ arch_get_unmapped_area_topdown(struct fi - { - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; -- unsigned long addr = addr0; -+ unsigned long base = mm->mmap_base, addr = addr0; - - /* requested length too big for entire address space */ - if (len > TASK_SIZE) -@@ -1355,6 +1509,10 @@ arch_get_unmapped_area_topdown(struct fi - if (flags & MAP_FIXED) - return addr; - -+#ifdef CONFIG_PAX_RANDMMAP -+ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) -+#endif -+ - /* requesting a specific address */ - if (addr) { - addr = PAGE_ALIGN(addr); -@@ -1412,13 +1570,21 @@ bottomup: - * can happen with large stack limits and large mmap() - * allocations. - */ -+ mm->mmap_base = TASK_UNMAPPED_BASE; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ -+ mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; -- mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ -- mm->free_area_cache = mm->mmap_base; -+ mm->mmap_base = base; -+ mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; - - return addr; -@@ -1427,6 +1593,12 @@ bottomup: - - void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) - { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && SEGMEXEC_TASK_SIZE <= addr) -+ return; -+#endif -+ - /* - * Is this a new hole at the highest possible address? - */ -@@ -1434,8 +1606,10 @@ void arch_unmap_area_topdown(struct mm_s - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ -- if (mm->free_area_cache > mm->mmap_base) -+ if (mm->free_area_cache > mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; -+ mm->cached_hole_size = ~0UL; -+ } - } - - unsigned long -@@ -1535,6 +1709,27 @@ out: - return prev ? prev->vm_next : vma; - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+struct vm_area_struct *pax_find_mirror_vma(struct vm_area_struct *vma) -+{ -+ struct vm_area_struct *vma_m; -+ -+ BUG_ON(!vma || vma->vm_start >= vma->vm_end); -+ if (!(vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) || !(vma->vm_flags & VM_EXEC)) { -+ BUG_ON(vma->vm_mirror); -+ return NULL; -+ } -+ BUG_ON(vma->vm_start < SEGMEXEC_TASK_SIZE && SEGMEXEC_TASK_SIZE < vma->vm_end); -+ vma_m = vma->vm_mirror; -+ BUG_ON(!vma_m || vma_m->vm_mirror != vma); -+ BUG_ON(vma->vm_file != vma_m->vm_file); -+ BUG_ON(vma->vm_end - vma->vm_start != vma_m->vm_end - vma_m->vm_start); -+ BUG_ON(vma->vm_pgoff != vma_m->vm_pgoff || vma->anon_vma != vma_m->anon_vma); -+ BUG_ON((vma->vm_flags ^ vma_m->vm_flags) & ~(VM_WRITE | VM_MAYWRITE | VM_ACCOUNT | VM_LOCKED)); -+ return vma_m; -+} -+#endif -+ - /* - * Verify that the stack growth is acceptable and - * update accounting. 
This is shared with both the -@@ -1551,6 +1746,7 @@ static int acct_stack_growth(struct vm_a - return -ENOMEM; - - /* Stack limit test */ -+ gr_learn_resource(current, RLIMIT_STACK, size, 1); - if (size > rlim[RLIMIT_STACK].rlim_cur) - return -ENOMEM; - -@@ -1560,6 +1756,7 @@ static int acct_stack_growth(struct vm_a - unsigned long limit; - locked = mm->locked_vm + grow; - limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; -+ gr_learn_resource(current, RLIMIT_MEMLOCK, locked << PAGE_SHIFT, 1); - if (locked > limit && !capable(CAP_IPC_LOCK)) - return -ENOMEM; - } -@@ -1595,35 +1792,40 @@ static - #endif - int expand_upwards(struct vm_area_struct *vma, unsigned long address) - { -- int error; -+ int error, locknext; - - if (!(vma->vm_flags & VM_GROWSUP)) - return -EFAULT; - -+ /* Also guard against wrapping around to address 0. */ -+ if (address < PAGE_ALIGN(address+1)) -+ address = PAGE_ALIGN(address+1); -+ else -+ return -ENOMEM; -+ - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; -+ locknext = vma->vm_next && (vma->vm_next->vm_flags & VM_GROWSDOWN); -+ if (locknext && unlikely(anon_vma_prepare(vma->vm_next))) -+ return -ENOMEM; - anon_vma_lock(vma); -+ if (locknext) -+ anon_vma_lock(vma->vm_next); - - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the -- * anon_vma lock to serialize against concurrent expand_stacks. -- * Also guard against wrapping around to address 0. -+ * anon_vma locks to serialize against concurrent expand_stacks -+ * and expand_upwards. - */ -- if (address < PAGE_ALIGN(address+4)) -- address = PAGE_ALIGN(address+4); -- else { -- anon_vma_unlock(vma); -- return -ENOMEM; -- } - error = 0; - - /* Somebody else might have raced and expanded it already */ -- if (address > vma->vm_end) { -+ if (address > vma->vm_end && (!locknext || vma->vm_next->vm_start >= address)) { - unsigned long size, grow; - - size = address - vma->vm_start; -@@ -1633,6 +1835,8 @@ int expand_upwards(struct vm_area_struct - if (!error) - vma->vm_end = address; - } -+ if (locknext) -+ anon_vma_unlock(vma->vm_next); - anon_vma_unlock(vma); - return error; - } -@@ -1644,7 +1848,8 @@ int expand_upwards(struct vm_area_struct - static int expand_downwards(struct vm_area_struct *vma, - unsigned long address) - { -- int error; -+ int error, lockprev = 0; -+ struct vm_area_struct *prev = NULL; - - /* - * We must make sure the anon_vma is allocated -@@ -1658,6 +1863,15 @@ static int expand_downwards(struct vm_ar - if (error) - return error; - -+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) -+ find_vma_prev(vma->vm_mm, address, &prev); -+ lockprev = prev && (prev->vm_flags & VM_GROWSUP); -+#endif -+ if (lockprev && unlikely(anon_vma_prepare(prev))) -+ return -ENOMEM; -+ if (lockprev) -+ anon_vma_lock(prev); -+ - anon_vma_lock(vma); - - /* -@@ -1667,9 +1881,15 @@ static int expand_downwards(struct vm_ar - */ - - /* Somebody else might have raced and expanded it already */ -- if (address < vma->vm_start) { -+ if (address < vma->vm_start && (!lockprev || prev->vm_end <= address)) { - unsigned long size, grow; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m; -+ -+ vma_m = pax_find_mirror_vma(vma); -+#endif -+ - size = vma->vm_end - address; - grow = (vma->vm_start - address) >> PAGE_SHIFT; - -@@ -1677,9 +1897,20 @@ static int expand_downwards(struct vm_ar - if (!error) { - vma->vm_start = 
address; - vma->vm_pgoff -= grow; -+ track_exec_limit(vma->vm_mm, vma->vm_start, vma->vm_end, vma->vm_flags); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma_m) { -+ vma_m->vm_start -= grow << PAGE_SHIFT; -+ vma_m->vm_pgoff -= grow; -+ } -+#endif -+ - } - } - anon_vma_unlock(vma); -+ if (lockprev) -+ anon_vma_unlock(prev); - return error; - } - -@@ -1755,6 +1986,13 @@ static void remove_vma_list(struct mm_st - do { - long nrpages = vma_pages(vma); - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (vma->vm_start >= SEGMEXEC_TASK_SIZE)) { -+ vma = remove_vma(vma); -+ continue; -+ } -+#endif -+ - mm->total_vm -= nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); - vma = remove_vma(vma); -@@ -1799,6 +2037,16 @@ detach_vmas_to_be_unmapped(struct mm_str - - insertion_point = (prev ? &prev->vm_next : &mm->mmap); - do { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma->vm_mirror) { -+ BUG_ON(!vma->vm_mirror->vm_mirror || vma->vm_mirror->vm_mirror != vma); -+ vma->vm_mirror->vm_mirror = NULL; -+ vma->vm_mirror->vm_flags &= ~VM_EXEC; -+ vma->vm_mirror = NULL; -+ } -+#endif -+ - rb_erase(&vma->vm_rb, &mm->mm_rb); - mm->map_count--; - tail_vma = vma; -@@ -1818,6 +2066,108 @@ detach_vmas_to_be_unmapped(struct mm_str - * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the tail. - */ -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, -+ unsigned long addr, int new_below) -+{ -+ struct mempolicy *pol; -+ struct vm_area_struct *new, *vma_m, *new_m = NULL; -+ unsigned long addr_m = addr + SEGMEXEC_TASK_SIZE; -+ -+ if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) -+ return -EINVAL; -+ -+ vma_m = pax_find_mirror_vma(vma); -+ if (vma_m) { -+ BUG_ON(vma->vm_end > SEGMEXEC_TASK_SIZE); -+ if (mm->map_count >= sysctl_max_map_count-1) -+ return -ENOMEM; -+ } else if (mm->map_count >= sysctl_max_map_count) -+ return -ENOMEM; -+ -+ new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); -+ if (!new) -+ return -ENOMEM; -+ -+ if (vma_m) { -+ new_m = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); -+ if (!new_m) { -+ kmem_cache_free(vm_area_cachep, new); -+ return -ENOMEM; -+ } -+ } -+ -+ /* most fields are the same, copy all, and then fixup */ -+ *new = *vma; -+ -+ if (new_below) -+ new->vm_end = addr; -+ else { -+ new->vm_start = addr; -+ new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); -+ } -+ -+ if (vma_m) { -+ *new_m = *vma_m; -+ new_m->vm_mirror = new; -+ new->vm_mirror = new_m; -+ -+ if (new_below) -+ new_m->vm_end = addr_m; -+ else { -+ new_m->vm_start = addr_m; -+ new_m->vm_pgoff += ((addr_m - vma_m->vm_start) >> PAGE_SHIFT); -+ } -+ } -+ -+ pol = mpol_dup(vma_policy(vma)); -+ if (IS_ERR(pol)) { -+ if (new_m) -+ kmem_cache_free(vm_area_cachep, new_m); -+ kmem_cache_free(vm_area_cachep, new); -+ return PTR_ERR(pol); -+ } -+ vma_set_policy(new, pol); -+ -+ if (new->vm_file) { -+ get_file(new->vm_file); -+ if (vma->vm_flags & VM_EXECUTABLE) -+ added_exe_file_vma(mm); -+ } -+ -+ if (new->vm_ops && new->vm_ops->open) -+ new->vm_ops->open(new); -+ -+ if (new_below) -+ vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + -+ ((addr - new->vm_start) >> PAGE_SHIFT), new); -+ else -+ vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); -+ -+ if (vma_m) { -+ mpol_get(pol); -+ vma_set_policy(new_m, pol); -+ -+ if (new_m->vm_file) { -+ get_file(new_m->vm_file); -+ if (vma_m->vm_flags & VM_EXECUTABLE) -+ added_exe_file_vma(mm); -+ } -+ -+ if (new_m->vm_ops && new_m->vm_ops->open) 
-+ new_m->vm_ops->open(new_m); -+ -+ if (new_below) -+ vma_adjust(vma_m, addr_m, vma_m->vm_end, vma_m->vm_pgoff + -+ ((addr_m - new_m->vm_start) >> PAGE_SHIFT), new_m); -+ else -+ vma_adjust(vma_m, vma_m->vm_start, addr_m, vma_m->vm_pgoff, new_m); -+ } -+ -+ return 0; -+} -+#else - int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long addr, int new_below) - { -@@ -1869,17 +2219,37 @@ int split_vma(struct mm_struct * mm, str - - return 0; - } -+#endif - - /* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the - * work. This now handles partial unmappings. - * Jeremy Fitzhardinge jeremy@goop.org - */ -+#ifdef CONFIG_PAX_SEGMEXEC - int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) - { -+ int ret = __do_munmap(mm, start, len); -+ if (ret || !(mm->pax_flags & MF_PAX_SEGMEXEC)) -+ return ret; -+ -+ return __do_munmap(mm, start + SEGMEXEC_TASK_SIZE, len); -+} -+ -+int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len) -+#else -+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) -+#endif -+{ - unsigned long end; - struct vm_area_struct *vma, *prev, *last; - -+ /* -+ * mm->mmap_sem is required to protect against another thread -+ * changing the mappings in case we sleep. -+ */ -+ verify_mm_writelocked(mm); -+ - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; - -@@ -1943,6 +2313,8 @@ int do_munmap(struct mm_struct *mm, unsi - /* Fix up all other VM information */ - remove_vma_list(mm, vma); - -+ track_exec_limit(mm, start, end, 0UL); -+ - return 0; - } - -@@ -1955,22 +2327,18 @@ SYSCALL_DEFINE2(munmap, unsigned long, a - - profile_munmap(addr); - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && -+ (len > SEGMEXEC_TASK_SIZE || addr > SEGMEXEC_TASK_SIZE-len)) -+ return -EINVAL; -+#endif -+ - down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); - up_write(&mm->mmap_sem); - return ret; - } - --static inline void verify_mm_writelocked(struct mm_struct *mm) --{ --#ifdef CONFIG_DEBUG_VM -- if (unlikely(down_read_trylock(&mm->mmap_sem))) { -- WARN_ON(1); -- up_read(&mm->mmap_sem); -- } --#endif --} -- - /* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some -@@ -1984,6 +2352,11 @@ unsigned long do_brk(unsigned long addr, - struct rb_node ** rb_link, * rb_parent; - pgoff_t pgoff = addr >> PAGE_SHIFT; - int error; -+ unsigned long charged; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m = NULL; -+#endif - - len = PAGE_ALIGN(len); - if (!len) -@@ -2001,19 +2374,34 @@ unsigned long do_brk(unsigned long addr, - - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - -+#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) -+ if (mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { -+ flags &= ~VM_EXEC; -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (mm->pax_flags & MF_PAX_MPROTECT) -+ flags &= ~VM_MAYEXEC; -+#endif -+ -+ } -+#endif -+ - error = arch_mmap_check(addr, len, flags); - if (error) - return error; - -+ charged = len >> PAGE_SHIFT; -+ - /* - * mlock MCL_FUTURE? 
- */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked, lock_limit; -- locked = len >> PAGE_SHIFT; -+ locked = charged; - locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; -+ gr_learn_resource(current, RLIMIT_MEMLOCK, locked << PAGE_SHIFT, 1); - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } -@@ -2027,22 +2415,22 @@ unsigned long do_brk(unsigned long addr, - /* - * Clear old maps. this also does some error checking for us - */ -- munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { - if (do_munmap(mm, addr, len)) - return -ENOMEM; -- goto munmap_back; -+ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); -+ BUG_ON(vma && vma->vm_start < addr + len); - } - - /* Check against address space limits *after* clearing old maps... */ -- if (!may_expand_vm(mm, len >> PAGE_SHIFT)) -+ if (!may_expand_vm(mm, charged)) - return -ENOMEM; - - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - -- if (security_vm_enough_memory(len >> PAGE_SHIFT)) -+ if (security_vm_enough_memory(charged)) - return -ENOMEM; - - /* Can we just expand an old private anonymous mapping? */ -@@ -2056,10 +2444,21 @@ unsigned long do_brk(unsigned long addr, - */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!vma) { -- vm_unacct_memory(len >> PAGE_SHIFT); -+ vm_unacct_memory(charged); - return -ENOMEM; - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (flags & VM_EXEC)) { -+ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); -+ if (!vma_m) { -+ kmem_cache_free(vm_area_cachep, vma); -+ vm_unacct_memory(charged); -+ return -ENOMEM; -+ } -+ } -+#endif -+ - vma->vm_mm = mm; - vma->vm_start = addr; - vma->vm_end = addr + len; -@@ -2068,11 +2467,12 @@ unsigned long do_brk(unsigned long addr, - vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); - out: -- mm->total_vm += len >> PAGE_SHIFT; -+ mm->total_vm += charged; - if (flags & VM_LOCKED) { - if (!mlock_vma_pages_range(vma, addr, addr + len)) -- mm->locked_vm += (len >> PAGE_SHIFT); -+ mm->locked_vm += charged; - } -+ track_exec_limit(mm, addr, addr + len, flags); - return addr; - } - -@@ -2118,8 +2518,10 @@ void exit_mmap(struct mm_struct *mm) - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. 
- */ -- while (vma) -+ while (vma) { -+ vma->vm_mirror = NULL; - vma = remove_vma(vma); -+ } - - BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); - } -@@ -2133,6 +2535,10 @@ int insert_vm_struct(struct mm_struct * - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m = NULL; -+#endif -+ - /* - * The vm_pgoff of a purely anonymous vma should be irrelevant - * until its first write fault, when page's anon_vma and index -@@ -2155,7 +2561,22 @@ int insert_vm_struct(struct mm_struct * - if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, vma_pages(vma))) - return -ENOMEM; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (vma->vm_flags & VM_EXEC)) { -+ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); -+ if (!vma_m) -+ return -ENOMEM; -+ } -+#endif -+ - vma_link(mm, vma, prev, rb_link, rb_parent); -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (vma_m) -+ pax_mirror_vma(vma_m, vma); -+#endif -+ - return 0; - } - -@@ -2173,6 +2594,8 @@ struct vm_area_struct *copy_vma(struct v - struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; - -+ BUG_ON(vma->vm_mirror); -+ - /* - * If anonymous vma has not yet been faulted, update new pgoff - * to match new location, to increase its chance of merging. -@@ -2216,6 +2639,35 @@ struct vm_area_struct *copy_vma(struct v - return new_vma; - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+void pax_mirror_vma(struct vm_area_struct *vma_m, struct vm_area_struct *vma) -+{ -+ struct vm_area_struct *prev_m; -+ struct rb_node **rb_link_m, *rb_parent_m; -+ struct mempolicy *pol_m; -+ -+ BUG_ON(!(vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) || !(vma->vm_flags & VM_EXEC)); -+ BUG_ON(vma->vm_mirror || vma_m->vm_mirror); -+ BUG_ON(!mpol_equal(vma_policy(vma), vma_policy(vma_m))); -+ *vma_m = *vma; -+ pol_m = vma_policy(vma_m); -+ mpol_get(pol_m); -+ vma_set_policy(vma_m, pol_m); -+ vma_m->vm_start += SEGMEXEC_TASK_SIZE; -+ vma_m->vm_end += SEGMEXEC_TASK_SIZE; -+ vma_m->vm_flags &= ~(VM_WRITE | VM_MAYWRITE | VM_ACCOUNT | VM_LOCKED); -+ vma_m->vm_page_prot = vm_get_page_prot(vma_m->vm_flags); -+ if (vma_m->vm_file) -+ get_file(vma_m->vm_file); -+ if (vma_m->vm_ops && vma_m->vm_ops->open) -+ vma_m->vm_ops->open(vma_m); -+ find_vma_prepare(vma->vm_mm, vma_m->vm_start, &prev_m, &rb_link_m, &rb_parent_m); -+ vma_link(vma->vm_mm, vma_m, prev_m, rb_link_m, rb_parent_m); -+ vma_m->vm_mirror = vma; -+ vma->vm_mirror = vma_m; -+} -+#endif -+ - /* - * Return true if the calling process may expand its vm space by the passed - * number of pages -@@ -2226,7 +2678,7 @@ int may_expand_vm(struct mm_struct *mm, - unsigned long lim; - - lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; -- -+ gr_learn_resource(current, RLIMIT_AS, (cur + npages) << PAGE_SHIFT, 1); - if (cur + npages > lim) - return 0; - return 1; -@@ -2267,7 +2719,7 @@ static void special_mapping_close(struct - { - } - --static struct vm_operations_struct special_mapping_vmops = { -+static const struct vm_operations_struct special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, - }; -@@ -2295,6 +2747,15 @@ int install_special_mapping(struct mm_st - vma->vm_start = addr; - vma->vm_end = addr + len; - -+#ifdef CONFIG_PAX_MPROTECT -+ if (mm->pax_flags & MF_PAX_MPROTECT) { -+ if ((vm_flags & (VM_WRITE | VM_EXEC)) != VM_EXEC) -+ vm_flags &= ~(VM_EXEC | VM_MAYEXEC); -+ else -+ vm_flags &= ~(VM_WRITE | VM_MAYWRITE); -+ } -+#endif -+ - 
vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - -diff -urNp linux-2.6.31.1/mm/mprotect.c linux-2.6.31.1/mm/mprotect.c ---- linux-2.6.31.1/mm/mprotect.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/mprotect.c 2009-10-01 20:12:45.000000000 -0400 -@@ -24,10 +24,16 @@ - #include <linux/mmu_notifier.h> - #include <linux/migrate.h> - #include <linux/perf_counter.h> -+ -+#ifdef CONFIG_PAX_MPROTECT -+#include <linux/elf.h> -+#endif -+ - #include <asm/uaccess.h> - #include <asm/pgtable.h> - #include <asm/cacheflush.h> - #include <asm/tlbflush.h> -+#include <asm/mmu_context.h> - - #ifndef pgprot_modify - static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) -@@ -132,6 +138,48 @@ static void change_protection(struct vm_ - flush_tlb_range(vma, start, end); - } - -+#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT -+/* called while holding the mmap semaphor for writing except stack expansion */ -+void track_exec_limit(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long prot) -+{ -+ unsigned long oldlimit, newlimit = 0UL; -+ -+ if (!(mm->pax_flags & MF_PAX_PAGEEXEC) || nx_enabled) -+ return; -+ -+ spin_lock(&mm->page_table_lock); -+ oldlimit = mm->context.user_cs_limit; -+ if ((prot & VM_EXEC) && oldlimit < end) -+ /* USER_CS limit moved up */ -+ newlimit = end; -+ else if (!(prot & VM_EXEC) && start < oldlimit && oldlimit <= end) -+ /* USER_CS limit moved down */ -+ newlimit = start; -+ -+ if (newlimit) { -+ mm->context.user_cs_limit = newlimit; -+ -+#ifdef CONFIG_SMP -+ wmb(); -+ cpus_clear(mm->context.cpu_user_cs_mask); -+ cpu_set(smp_processor_id(), mm->context.cpu_user_cs_mask); -+#endif -+ -+ set_user_cs(mm->context.user_cs_base, mm->context.user_cs_limit, smp_processor_id()); -+ } -+ spin_unlock(&mm->page_table_lock); -+ if (newlimit == end) { -+ struct vm_area_struct *vma = find_vma(mm, oldlimit); -+ -+ for (; vma && vma->vm_start < end; vma = vma->vm_next) -+ if (is_vm_hugetlb_page(vma)) -+ hugetlb_change_protection(vma, vma->vm_start, vma->vm_end, vma->vm_page_prot); -+ else -+ change_protection(vma, vma->vm_start, vma->vm_end, vma->vm_page_prot, vma_wants_writenotify(vma)); -+ } -+} -+#endif -+ - int - mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned long newflags) -@@ -144,6 +192,14 @@ mprotect_fixup(struct vm_area_struct *vm - int error; - int dirty_accountable = 0; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m = NULL; -+ unsigned long start_m, end_m; -+ -+ start_m = start + SEGMEXEC_TASK_SIZE; -+ end_m = end + SEGMEXEC_TASK_SIZE; -+#endif -+ - if (newflags == oldflags) { - *pprev = vma; - return 0; -@@ -165,6 +221,38 @@ mprotect_fixup(struct vm_area_struct *vm - } - } - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && ((oldflags ^ newflags) & VM_EXEC)) { -+ if (start != vma->vm_start) { -+ error = split_vma(mm, vma, start, 1); -+ if (error) -+ goto fail; -+ BUG_ON(!*pprev || (*pprev)->vm_next == vma); -+ *pprev = (*pprev)->vm_next; -+ } -+ -+ if (end != vma->vm_end) { -+ error = split_vma(mm, vma, end, 0); -+ if (error) -+ goto fail; -+ } -+ -+ if (pax_find_mirror_vma(vma)) { -+ error = __do_munmap(mm, start_m, end_m - start_m); -+ if (error) -+ goto fail; -+ } else { -+ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); -+ if (!vma_m) { -+ error = -ENOMEM; -+ goto fail; -+ } -+ vma->vm_flags = newflags; -+ pax_mirror_vma(vma_m, vma); -+ } -+ } -+#endif -+ - /* - 
* First try to merge with previous and/or next vma. - */ -@@ -196,8 +284,14 @@ success: - * held in write mode. - */ - vma->vm_flags = newflags; -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (current->binfmt && current->binfmt->handle_mprotect) -+ current->binfmt->handle_mprotect(vma, newflags); -+#endif -+ - vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, -- vm_get_page_prot(newflags)); -+ vm_get_page_prot(vma->vm_flags)); - - if (vma_wants_writenotify(vma)) { - vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED); -@@ -238,6 +332,17 @@ SYSCALL_DEFINE3(mprotect, unsigned long, - end = start + len; - if (end <= start) - return -ENOMEM; -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { -+ if (end > SEGMEXEC_TASK_SIZE) -+ return -EINVAL; -+ } else -+#endif -+ -+ if (end > TASK_SIZE) -+ return -EINVAL; -+ - if (!arch_validate_prot(prot)) - return -EINVAL; - -@@ -245,7 +350,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, - /* - * Does the application expect PROT_READ to imply PROT_EXEC: - */ -- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) -+ if ((prot & (PROT_READ | PROT_WRITE)) && (current->personality & READ_IMPLIES_EXEC)) - prot |= PROT_EXEC; - - vm_flags = calc_vm_prot_bits(prot); -@@ -277,6 +382,16 @@ SYSCALL_DEFINE3(mprotect, unsigned long, - if (start > vma->vm_start) - prev = vma; - -+ if (!gr_acl_handle_mprotect(vma->vm_file, prot)) { -+ error = -EACCES; -+ goto out; -+ } -+ -+#ifdef CONFIG_PAX_MPROTECT -+ if (current->binfmt && current->binfmt->handle_mprotect) -+ current->binfmt->handle_mprotect(vma, vm_flags); -+#endif -+ - for (nstart = start ; ; ) { - unsigned long newflags; - -@@ -301,6 +416,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, - if (error) - goto out; - perf_counter_mmap(vma); -+ -+ track_exec_limit(current->mm, nstart, tmp, vm_flags); -+ - nstart = tmp; - - if (nstart < prev->vm_end) -diff -urNp linux-2.6.31.1/mm/mremap.c linux-2.6.31.1/mm/mremap.c ---- linux-2.6.31.1/mm/mremap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/mremap.c 2009-10-01 20:12:45.000000000 -0400 -@@ -113,6 +113,12 @@ static void move_ptes(struct vm_area_str - continue; - pte = ptep_clear_flush(vma, old_addr, old_pte); - pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); -+ -+#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT -+ if (!nx_enabled && (new_vma->vm_flags & (VM_PAGEEXEC | VM_EXEC)) == VM_PAGEEXEC) -+ pte = pte_exprotect(pte); -+#endif -+ - set_pte_at(mm, new_addr, new_pte, pte); - } - -@@ -262,6 +268,7 @@ unsigned long do_mremap(unsigned long ad - struct vm_area_struct *vma; - unsigned long ret = -EINVAL; - unsigned long charged = 0; -+ unsigned long pax_task_size = TASK_SIZE; - - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) - goto out; -@@ -280,6 +287,15 @@ unsigned long do_mremap(unsigned long ad - if (!new_len) - goto out; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) -+ pax_task_size = SEGMEXEC_TASK_SIZE; -+#endif -+ -+ if (new_len > pax_task_size || addr > pax_task_size-new_len || -+ old_len > pax_task_size || addr > pax_task_size-old_len) -+ goto out; -+ - /* new_addr is only valid if MREMAP_FIXED is specified */ - if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) -@@ -287,16 +303,13 @@ unsigned long do_mremap(unsigned long ad - if (!(flags & MREMAP_MAYMOVE)) - goto out; - -- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) -+ if (new_addr > pax_task_size - new_len) - goto out; - - /* Check if the location we're moving into overlaps the - * old location at 
all, and fail if it does. - */ -- if ((new_addr <= addr) && (new_addr+new_len) > addr) -- goto out; -- -- if ((addr <= new_addr) && (addr+old_len) > new_addr) -+ if (addr + old_len > new_addr && new_addr + new_len > addr) - goto out; - - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); -@@ -334,6 +347,14 @@ unsigned long do_mremap(unsigned long ad - ret = -EINVAL; - goto out; - } -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ if (pax_find_mirror_vma(vma)) { -+ ret = -EINVAL; -+ goto out; -+ } -+#endif -+ - /* We can't remap across vm area boundaries */ - if (old_len > vma->vm_end - addr) - goto out; -@@ -367,7 +388,7 @@ unsigned long do_mremap(unsigned long ad - if (old_len == vma->vm_end - addr && - !((flags & MREMAP_FIXED) && (addr != new_addr)) && - (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { -- unsigned long max_addr = TASK_SIZE; -+ unsigned long max_addr = pax_task_size; - if (vma->vm_next) - max_addr = vma->vm_next->vm_start; - /* can we just expand the current mapping? */ -@@ -385,6 +406,7 @@ unsigned long do_mremap(unsigned long ad - addr + new_len); - } - ret = addr; -+ track_exec_limit(vma->vm_mm, vma->vm_start, addr + new_len, vma->vm_flags); - goto out; - } - } -@@ -395,8 +417,8 @@ unsigned long do_mremap(unsigned long ad - */ - ret = -ENOMEM; - if (flags & MREMAP_MAYMOVE) { -+ unsigned long map_flags = 0; - if (!(flags & MREMAP_FIXED)) { -- unsigned long map_flags = 0; - if (vma->vm_flags & VM_MAYSHARE) - map_flags |= MAP_SHARED; - -@@ -411,7 +433,12 @@ unsigned long do_mremap(unsigned long ad - if (ret) - goto out; - } -+ map_flags = vma->vm_flags; - ret = move_vma(vma, addr, old_len, new_len, new_addr); -+ if (!(ret & ~PAGE_MASK)) { -+ track_exec_limit(current->mm, addr, addr + old_len, 0UL); -+ track_exec_limit(current->mm, new_addr, new_addr + new_len, map_flags); -+ } - } - out: - if (ret & ~PAGE_MASK) -diff -urNp linux-2.6.31.1/mm/nommu.c linux-2.6.31.1/mm/nommu.c ---- linux-2.6.31.1/mm/nommu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/nommu.c 2009-10-01 20:12:45.000000000 -0400 -@@ -79,7 +79,7 @@ static struct kmem_cache *vm_region_jar; - struct rb_root nommu_region_tree = RB_ROOT; - DECLARE_RWSEM(nommu_region_sem); - --struct vm_operations_struct generic_file_vm_ops = { -+const struct vm_operations_struct generic_file_vm_ops = { - }; - - /* -@@ -780,15 +780,6 @@ struct vm_area_struct *find_vma(struct m - EXPORT_SYMBOL(find_vma); - - /* -- * find a VMA -- * - we don't extend stack VMAs under NOMMU conditions -- */ --struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) --{ -- return find_vma(mm, addr); --} -- --/* - * expand a stack to a given address - * - not supported under NOMMU conditions - */ -diff -urNp linux-2.6.31.1/mm/page_alloc.c linux-2.6.31.1/mm/page_alloc.c ---- linux-2.6.31.1/mm/page_alloc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/page_alloc.c 2009-10-01 20:12:45.000000000 -0400 -@@ -559,6 +559,10 @@ static void __free_pages_ok(struct page - int bad = 0; - int wasMlocked = TestClearPageMlocked(page); - -+#ifdef CONFIG_PAX_MEMORY_SANITIZE -+ unsigned long index = 1UL << order; -+#endif -+ - kmemcheck_free_shadow(page, order); - - for (i = 0 ; i < (1 << order) ; ++i) -@@ -571,6 +575,12 @@ static void __free_pages_ok(struct page - debug_check_no_obj_freed(page_address(page), - PAGE_SIZE << order); - } -+ -+#ifdef CONFIG_PAX_MEMORY_SANITIZE -+ for (; index; --index) -+ sanitize_highpage(page + index - 1); -+#endif -+ - arch_free_page(page, order); - kernel_map_pages(page, 1 << order, 0); - -@@ 
-662,8 +672,10 @@ static int prep_new_page(struct page *pa - arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); - -+#ifndef CONFIG_PAX_MEMORY_SANITIZE - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); -+#endif - - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); -@@ -1039,6 +1051,11 @@ static void free_hot_cold_page(struct pa - debug_check_no_locks_freed(page_address(page), PAGE_SIZE); - debug_check_no_obj_freed(page_address(page), PAGE_SIZE); - } -+ -+#ifdef CONFIG_PAX_MEMORY_SANITIZE -+ sanitize_highpage(page); -+#endif -+ - arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - -diff -urNp linux-2.6.31.1/mm/percpu.c linux-2.6.31.1/mm/percpu.c ---- linux-2.6.31.1/mm/percpu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/percpu.c 2009-10-01 20:12:45.000000000 -0400 -@@ -105,7 +105,7 @@ static int pcpu_nr_slots __read_mostly; - static size_t pcpu_chunk_struct_size __read_mostly; - - /* the address of the first chunk which starts with the kernel static area */ --void *pcpu_base_addr __read_mostly; -+void *pcpu_base_addr __read_only; - EXPORT_SYMBOL_GPL(pcpu_base_addr); - - /* -diff -urNp linux-2.6.31.1/mm/rmap.c linux-2.6.31.1/mm/rmap.c ---- linux-2.6.31.1/mm/rmap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/rmap.c 2009-10-01 20:12:45.000000000 -0400 -@@ -103,6 +103,10 @@ int anon_vma_prepare(struct vm_area_stru - struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated; - -+#ifdef CONFIG_PAX_SEGMEXEC -+ struct vm_area_struct *vma_m; -+#endif -+ - anon_vma = find_mergeable_anon_vma(vma); - allocated = NULL; - if (!anon_vma) { -@@ -116,6 +120,15 @@ int anon_vma_prepare(struct vm_area_stru - /* page_table_lock to protect against threads */ - spin_lock(&mm->page_table_lock); - if (likely(!vma->anon_vma)) { -+ -+#ifdef CONFIG_PAX_SEGMEXEC -+ vma_m = pax_find_mirror_vma(vma); -+ if (vma_m) { -+ vma_m->anon_vma = anon_vma; -+ __anon_vma_link(vma_m); -+ } -+#endif -+ - vma->anon_vma = anon_vma; - list_add_tail(&vma->anon_vma_node, &anon_vma->head); - allocated = NULL; -diff -urNp linux-2.6.31.1/mm/shmem.c linux-2.6.31.1/mm/shmem.c ---- linux-2.6.31.1/mm/shmem.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/shmem.c 2009-10-01 20:12:45.000000000 -0400 -@@ -31,7 +31,7 @@ - #include <linux/swap.h> - #include <linux/ima.h> - --static struct vfsmount *shm_mnt; -+struct vfsmount *shm_mnt; - - #ifdef CONFIG_SHMEM - /* -@@ -219,7 +219,7 @@ static const struct file_operations shme - static const struct inode_operations shmem_inode_operations; - static const struct inode_operations shmem_dir_inode_operations; - static const struct inode_operations shmem_special_inode_operations; --static struct vm_operations_struct shmem_vm_ops; -+static const struct vm_operations_struct shmem_vm_ops; - - static struct backing_dev_info shmem_backing_dev_info __read_mostly = { - .ra_pages = 0, /* No readahead */ -@@ -2497,7 +2497,7 @@ static const struct super_operations shm - .put_super = shmem_put_super, - }; - --static struct vm_operations_struct shmem_vm_ops = { -+static const struct vm_operations_struct shmem_vm_ops = { - .fault = shmem_fault, - #ifdef CONFIG_NUMA - .set_policy = shmem_set_policy, -diff -urNp linux-2.6.31.1/mm/slab.c linux-2.6.31.1/mm/slab.c ---- linux-2.6.31.1/mm/slab.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/slab.c 2009-10-01 20:12:45.000000000 -0400 -@@ -308,7 +308,7 @@ struct kmem_list3 { - * Need this for bootstrapping a per node allocator. 
- */ - #define NUM_INIT_LISTS (3 * MAX_NUMNODES) --struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; -+struct kmem_list3 initkmem_list3[NUM_INIT_LISTS]; - #define CACHE_CACHE 0 - #define SIZE_AC MAX_NUMNODES - #define SIZE_L3 (2 * MAX_NUMNODES) -@@ -558,7 +558,7 @@ static inline void *index_to_obj(struct - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ - static inline unsigned int obj_to_index(const struct kmem_cache *cache, -- const struct slab *slab, void *obj) -+ const struct slab *slab, const void *obj) - { - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -@@ -584,14 +584,14 @@ struct cache_names { - static struct cache_names __initdata cache_names[] = { - #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, - #include <linux/kmalloc_sizes.h> -- {NULL,} -+ {NULL, NULL} - #undef CACHE - }; - - static struct arraycache_init initarray_cache __initdata = -- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; -+ { {0, BOOT_CPUCACHE_ENTRIES, 1, 0}, {NULL} }; - static struct arraycache_init initarray_generic = -- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; -+ { {0, BOOT_CPUCACHE_ENTRIES, 1, 0}, {NULL} }; - - /* internal cache of cache description objs */ - static struct kmem_cache cache_cache = { -@@ -4473,15 +4473,64 @@ static const struct file_operations proc - - static int __init slab_proc_init(void) - { -+#if !defined(CONFIG_GRKERNSEC_PROC_ADD) - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); - #ifdef CONFIG_DEBUG_SLAB_LEAK - proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); - #endif -+#endif - return 0; - } - module_init(slab_proc_init); - #endif - -+void check_object_size(const void *ptr, unsigned long n, bool to) -+{ -+ -+#ifdef CONFIG_PAX_USERCOPY -+ struct kmem_cache *cachep; -+ struct slab *slabp; -+ struct page *page; -+ unsigned int objnr; -+ unsigned long offset; -+ -+ if (!n) -+ return; -+ -+ if (ZERO_OR_NULL_PTR(ptr)) -+ goto report; -+ -+ if (!virt_addr_valid(ptr)) -+ return; -+ -+ page = virt_to_head_page(ptr); -+ -+ /* XXX: can get a little tighter with this stack check */ -+ if (!PageSlab(page) && object_is_on_stack(ptr) && -+ (n > ((unsigned long)task_stack_page(current) + THREAD_SIZE - -+ (unsigned long)ptr))) -+ goto report; -+ else -+ return; -+ -+ cachep = page_get_cache(page); -+ slabp = page_get_slab(page); -+ objnr = obj_to_index(cachep, slabp, ptr); -+ BUG_ON(objnr >= cachep->num); -+ offset = ptr - index_to_obj(cachep, slabp, objnr) - obj_offset(cachep); -+ if (offset <= obj_size(cachep) && n <= obj_size(cachep) - offset) -+ return; -+ -+report: -+ if (to) -+ pax_report_leak_to_user(ptr, n); -+ else -+ pax_report_overflow_from_user(ptr, n); -+#endif -+ -+} -+EXPORT_SYMBOL(check_object_size); -+ - /** - * ksize - get the actual amount of memory allocated for a given object - * @objp: Pointer to the object -diff -urNp linux-2.6.31.1/mm/slob.c linux-2.6.31.1/mm/slob.c ---- linux-2.6.31.1/mm/slob.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/slob.c 2009-10-01 20:12:45.000000000 -0400 -@@ -29,7 +29,7 @@ - * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls - * alloc_pages() directly, allocating compound pages so the page order - * does not have to be separately tracked, and also stores the exact -- * allocation size in page->private so that it can be used to accurately -+ * allocation size in slob_page->size so that it can be used to accurately - * provide ksize(). 
These objects are detected in kfree() because slob_page() - * is false for them. - * -@@ -58,6 +58,7 @@ - */ - - #include <linux/kernel.h> -+#include <linux/sched.h> - #include <linux/slab.h> - #include <linux/mm.h> - #include <linux/swap.h> /* struct reclaim_state */ -@@ -100,7 +101,8 @@ struct slob_page { - unsigned long flags; /* mandatory */ - atomic_t _count; /* mandatory */ - slobidx_t units; /* free units left in page */ -- unsigned long pad[2]; -+ unsigned long pad[1]; -+ unsigned long size; /* size when >=PAGE_SIZE */ - slob_t *free; /* first free slob_t in page */ - struct list_head list; /* linked list of free pages */ - }; -@@ -133,7 +135,7 @@ static LIST_HEAD(free_slob_large); - */ - static inline int is_slob_page(struct slob_page *sp) - { -- return PageSlab((struct page *)sp); -+ return PageSlab((struct page *)sp) && !sp->size; - } - - static inline void set_slob_page(struct slob_page *sp) -@@ -148,7 +150,7 @@ static inline void clear_slob_page(struc - - static inline struct slob_page *slob_page(const void *addr) - { -- return (struct slob_page *)virt_to_page(addr); -+ return (struct slob_page *)virt_to_head_page(addr); - } - - /* -@@ -208,7 +210,7 @@ static void set_slob(slob_t *s, slobidx_ - /* - * Return the size of a slob block. - */ --static slobidx_t slob_units(slob_t *s) -+static slobidx_t slob_units(const slob_t *s) - { - if (s->units > 0) - return s->units; -@@ -218,7 +220,7 @@ static slobidx_t slob_units(slob_t *s) - /* - * Return the next free slob block pointer after this one. - */ --static slob_t *slob_next(slob_t *s) -+static slob_t *slob_next(const slob_t *s) - { - slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); - slobidx_t next; -@@ -233,7 +235,7 @@ static slob_t *slob_next(slob_t *s) - /* - * Returns true if s is the last free block in its page. 
- */ --static int slob_last(slob_t *s) -+static int slob_last(const slob_t *s) - { - return !((unsigned long)slob_next(s) & ~PAGE_MASK); - } -@@ -252,6 +254,7 @@ static void *slob_new_pages(gfp_t gfp, i - if (!page) - return NULL; - -+ set_slob_page(page); - return page_address(page); - } - -@@ -368,11 +371,11 @@ static void *slob_alloc(size_t size, gfp - if (!b) - return NULL; - sp = slob_page(b); -- set_slob_page(sp); - - spin_lock_irqsave(&slob_lock, flags); - sp->units = SLOB_UNITS(PAGE_SIZE); - sp->free = b; -+ sp->size = 0; - INIT_LIST_HEAD(&sp->list); - set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); - set_slob_page_free(sp, slob_list); -@@ -475,10 +478,9 @@ out: - #define ARCH_SLAB_MINALIGN __alignof__(unsigned long) - #endif - --void *__kmalloc_node(size_t size, gfp_t gfp, int node) -+static void *__kmalloc_node_align(size_t size, gfp_t gfp, int node, int align) - { -- unsigned int *m; -- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); -+ slob_t *m; - void *ret; - - lockdep_trace_alloc(gfp); -@@ -491,7 +493,10 @@ void *__kmalloc_node(size_t size, gfp_t - - if (!m) - return NULL; -- *m = size; -+ BUILD_BUG_ON(ARCH_KMALLOC_MINALIGN < 2 * SLOB_UNIT); -+ BUILD_BUG_ON(ARCH_SLAB_MINALIGN < 2 * SLOB_UNIT); -+ m[0].units = size; -+ m[1].units = align; - ret = (void *)m + align; - - trace_kmalloc_node(_RET_IP_, ret, -@@ -501,9 +506,9 @@ void *__kmalloc_node(size_t size, gfp_t - - ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); - if (ret) { -- struct page *page; -- page = virt_to_page(ret); -- page->private = size; -+ struct slob_page *sp; -+ sp = slob_page(ret); -+ sp->size = size; - } - - trace_kmalloc_node(_RET_IP_, ret, -@@ -513,6 +518,13 @@ void *__kmalloc_node(size_t size, gfp_t - kmemleak_alloc(ret, size, 1, gfp); - return ret; - } -+ -+void *__kmalloc_node(size_t size, gfp_t gfp, int node) -+{ -+ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); -+ -+ return __kmalloc_node_align(size, gfp, node, align); -+} - EXPORT_SYMBOL(__kmalloc_node); - - void kfree(const void *block) -@@ -528,13 +540,86 @@ void kfree(const void *block) - sp = slob_page(block); - if (is_slob_page(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); -- unsigned int *m = (unsigned int *)(block - align); -- slob_free(m, *m + align); -- } else -+ slob_t *m = (slob_t *)(block - align); -+ slob_free(m, m[0].units + align); -+ } else { -+ clear_slob_page(sp); -+ free_slob_page(sp); -+ sp->size = 0; - put_page(&sp->page); -+ } - } - EXPORT_SYMBOL(kfree); - -+void check_object_size(const void *ptr, unsigned long n, bool to) -+{ -+ -+#ifdef CONFIG_PAX_USERCOPY -+ struct slob_page *sp; -+ const slob_t *free; -+ const void *base; -+ -+ if (!n) -+ return; -+ -+ if (ZERO_OR_NULL_PTR(ptr)) -+ goto report; -+ -+ if (!virt_addr_valid(ptr)) -+ return; -+ -+ sp = slob_page(ptr); -+ /* XXX: can get a little tighter with this stack check */ -+ if (!PageSlobPage((struct page*)sp) && object_is_on_stack(ptr) && -+ (n > ((unsigned long)task_stack_page(current) + THREAD_SIZE - -+ (unsigned long)ptr))) -+ goto report; -+ else -+ return; -+ -+ if (sp->size) { -+ base = page_address(&sp->page); -+ if (base <= ptr && n <= sp->size - (ptr - base)) -+ return; -+ goto report; -+ } -+ -+ /* some tricky double walking to find the chunk */ -+ base = (void *)((unsigned long)ptr & PAGE_MASK); -+ free = sp->free; -+ -+ while (!slob_last(free) && (void *)free <= ptr) { -+ base = free + slob_units(free); -+ free = slob_next(free); -+ } -+ -+ while (base < (void *)free) { -+ slobidx_t 
m = ((slob_t *)base)[0].units, align = ((slob_t *)base)[1].units; -+ int size = SLOB_UNIT * SLOB_UNITS(m + align); -+ int offset; -+ -+ if (ptr < base + align) -+ goto report; -+ -+ offset = ptr - base - align; -+ if (offset < m) { -+ if (n <= m - offset) -+ return; -+ goto report; -+ } -+ base += size; -+ } -+ -+report: -+ if (to) -+ pax_report_leak_to_user(ptr, n); -+ else -+ pax_report_overflow_from_user(ptr, n); -+#endif -+ -+} -+EXPORT_SYMBOL(check_object_size); -+ - /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ - size_t ksize(const void *block) - { -@@ -547,10 +632,10 @@ size_t ksize(const void *block) - sp = slob_page(block); - if (is_slob_page(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); -- unsigned int *m = (unsigned int *)(block - align); -- return SLOB_UNITS(*m) * SLOB_UNIT; -+ slob_t *m = (slob_t *)(block - align); -+ return SLOB_UNITS(m[0].units) * SLOB_UNIT; - } else -- return sp->page.private; -+ return sp->size; - } - EXPORT_SYMBOL(ksize); - -@@ -605,17 +690,25 @@ void *kmem_cache_alloc_node(struct kmem_ - { - void *b; - -+#ifdef CONFIG_PAX_USERCOPY -+ b = __kmalloc_node_align(c->size, flags, node, c->align); -+#else - if (c->size < PAGE_SIZE) { - b = slob_alloc(c->size, flags, c->align, node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); - } else { -+ struct slob_page *sp; -+ - b = slob_new_pages(flags, get_order(c->size), node); -+ sp = slob_page(b); -+ sp->size = c->size; - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, - PAGE_SIZE << get_order(c->size), - flags, node); - } -+#endif - - if (c->ctor) - c->ctor(b); -@@ -627,10 +720,16 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); - - static void __kmem_cache_free(void *b, int size) - { -- if (size < PAGE_SIZE) -+ struct slob_page *sp = slob_page(b); -+ -+ if (is_slob_page(sp)) - slob_free(b, size); -- else -+ else { -+ clear_slob_page(sp); -+ free_slob_page(sp); -+ sp->size = 0; - slob_free_pages(b, get_order(size)); -+ } - } - - static void kmem_rcu_free(struct rcu_head *head) -@@ -643,15 +742,24 @@ static void kmem_rcu_free(struct rcu_hea - - void kmem_cache_free(struct kmem_cache *c, void *b) - { -+ int size = c->size; -+ -+#ifdef CONFIG_PAX_USERCOPY -+ if (size + c->align < PAGE_SIZE) { -+ size += c->align; -+ b -= c->align; -+ } -+#endif -+ - kmemleak_free_recursive(b, c->flags); - if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { - struct slob_rcu *slob_rcu; -- slob_rcu = b + (c->size - sizeof(struct slob_rcu)); -+ slob_rcu = b + (size - sizeof(struct slob_rcu)); - INIT_RCU_HEAD(&slob_rcu->head); -- slob_rcu->size = c->size; -+ slob_rcu->size = size; - call_rcu(&slob_rcu->head, kmem_rcu_free); - } else { -- __kmem_cache_free(b, c->size); -+ __kmem_cache_free(b, size); - } - - trace_kmem_cache_free(_RET_IP_, b); -diff -urNp linux-2.6.31.1/mm/slub.c linux-2.6.31.1/mm/slub.c ---- linux-2.6.31.1/mm/slub.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/slub.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1915,7 +1915,7 @@ static int slub_min_objects; - * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) - */ --static int slub_nomerge; -+static int slub_nomerge = 1; - - /* - * Calculate the order of allocation given an slab object size. -@@ -2458,7 +2458,7 @@ static int kmem_cache_open(struct kmem_c - * list to avoid pounding the page allocator excessively. 
- */ - set_min_partial(s, ilog2(s->size)); -- s->refcount = 1; -+ atomic_set(&s->refcount, 1); - #ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; - #endif -@@ -2595,8 +2595,7 @@ static inline int kmem_cache_close(struc - void kmem_cache_destroy(struct kmem_cache *s) - { - down_write(&slub_lock); -- s->refcount--; -- if (!s->refcount) { -+ if (atomic_dec_and_test(&s->refcount)) { - list_del(&s->list); - up_write(&slub_lock); - if (kmem_cache_close(s)) { -@@ -2875,6 +2874,48 @@ void *__kmalloc_node(size_t size, gfp_t - EXPORT_SYMBOL(__kmalloc_node); - #endif - -+void check_object_size(const void *ptr, unsigned long n, bool to) -+{ -+ -+#ifdef CONFIG_PAX_USERCOPY -+ struct page *page; -+ struct kmem_cache *s; -+ unsigned long offset; -+ -+ if (!n) -+ return; -+ -+ if (ZERO_OR_NULL_PTR(ptr)) -+ goto report; -+ -+ if (!virt_addr_valid(ptr)) -+ return; -+ -+ page = get_object_page(ptr); -+ -+ /* XXX: can get a little tighter with this stack check */ -+ if (!page && object_is_on_stack(ptr) && -+ (n > ((unsigned long)task_stack_page(current) + THREAD_SIZE - -+ (unsigned long)ptr))) -+ goto report; -+ else -+ return; -+ -+ s = page->slab; -+ offset = (ptr - page_address(page)) % s->size; -+ if (offset <= s->objsize && n <= s->objsize - offset) -+ return; -+ -+report: -+ if (to) -+ pax_report_leak_to_user(ptr, n); -+ else -+ pax_report_overflow_from_user(ptr, n); -+#endif -+ -+} -+EXPORT_SYMBOL(check_object_size); -+ - size_t ksize(const void *object) - { - struct page *page; -@@ -3146,7 +3187,7 @@ void __init kmem_cache_init(void) - */ - create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_NOWAIT); -- kmalloc_caches[0].refcount = -1; -+ atomic_set(&kmalloc_caches[0].refcount, -1); - caches++; - - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); -@@ -3240,7 +3281,7 @@ static int slab_unmergeable(struct kmem_ - /* - * We may have set a slab to be unmergeable during bootstrap. - */ -- if (s->refcount < 0) -+ if (atomic_read(&s->refcount) < 0) - return 1; - - return 0; -@@ -3297,7 +3338,7 @@ struct kmem_cache *kmem_cache_create(con - if (s) { - int cpu; - -- s->refcount++; -+ atomic_inc(&s->refcount); - /* - * Adjust the object sizes so that we clear - * the complete object on kzalloc. 
-@@ -3316,7 +3357,7 @@ struct kmem_cache *kmem_cache_create(con - - if (sysfs_slab_alias(s, name)) { - down_write(&slub_lock); -- s->refcount--; -+ atomic_dec(&s->refcount); - up_write(&slub_lock); - goto err; - } -@@ -4045,7 +4086,7 @@ SLAB_ATTR_RO(ctor); - - static ssize_t aliases_show(struct kmem_cache *s, char *buf) - { -- return sprintf(buf, "%d\n", s->refcount - 1); -+ return sprintf(buf, "%d\n", atomic_read(&s->refcount) - 1); - } - SLAB_ATTR_RO(aliases); - -@@ -4726,7 +4767,9 @@ static const struct file_operations proc - - static int __init slab_proc_init(void) - { -+#if !defined(CONFIG_GRKERNSEC_PROC_ADD) - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); -+#endif - return 0; - } - module_init(slab_proc_init); -diff -urNp linux-2.6.31.1/mm/util.c linux-2.6.31.1/mm/util.c ---- linux-2.6.31.1/mm/util.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/util.c 2009-10-01 20:12:45.000000000 -0400 -@@ -224,6 +224,12 @@ EXPORT_SYMBOL(strndup_user); - void arch_pick_mmap_layout(struct mm_struct *mm) - { - mm->mmap_base = TASK_UNMAPPED_BASE; -+ -+#ifdef CONFIG_PAX_RANDMMAP -+ if (mm->pax_flags & MF_PAX_RANDMMAP) -+ mm->mmap_base += mm->delta_mmap; -+#endif -+ - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } -diff -urNp linux-2.6.31.1/mm/vmalloc.c linux-2.6.31.1/mm/vmalloc.c ---- linux-2.6.31.1/mm/vmalloc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/mm/vmalloc.c 2009-10-01 20:12:45.000000000 -0400 -@@ -91,6 +91,11 @@ static int vmap_pte_range(pmd_t *pmd, un - unsigned long end, pgprot_t prot, struct page **pages, int *nr) - { - pte_t *pte; -+ int ret = -ENOMEM; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ unsigned long cr0; -+#endif - - /* - * nr is a running index into the array which helps higher level -@@ -100,17 +105,33 @@ static int vmap_pte_range(pmd_t *pmd, un - pte = pte_alloc_kernel(pmd, addr); - if (!pte) - return -ENOMEM; -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_open_kernel(cr0); -+#endif -+ - do { - struct page *page = pages[*nr]; - -- if (WARN_ON(!pte_none(*pte))) -- return -EBUSY; -- if (WARN_ON(!page)) -- return -ENOMEM; -+ if (WARN_ON(!pte_none(*pte))) { -+ ret = -EBUSY; -+ goto out; -+ } -+ if (WARN_ON(!page)) { -+ ret = -ENOMEM; -+ goto out; -+ } - set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); - (*nr)++; - } while (pte++, addr += PAGE_SIZE, addr != end); -- return 0; -+ ret = 0; -+out: -+ -+#ifdef CONFIG_PAX_KERNEXEC -+ pax_close_kernel(cr0); -+#endif -+ -+ return ret; - } - - static int vmap_pmd_range(pud_t *pud, unsigned long addr, -@@ -1132,6 +1153,16 @@ static struct vm_struct *__get_vm_area_n - unsigned long align = 1; - - BUG_ON(in_interrupt()); -+ -+#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+ if (flags & VM_KERNEXEC) { -+ if (start != VMALLOC_START || end != VMALLOC_END) -+ return NULL; -+ start = (unsigned long)&MODULES_EXEC_VADDR; -+ end = (unsigned long)&MODULES_EXEC_END; -+ } -+#endif -+ - if (flags & VM_IOREMAP) { - int bit = fls(size); - -@@ -1371,6 +1402,11 @@ void *vmap(struct page **pages, unsigned - if (count > num_physpages) - return NULL; - -+#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+ if (!(pgprot_val(prot) & _PAGE_NX)) -+ flags |= VM_KERNEXEC; -+#endif -+ - area = get_vm_area_caller((count << PAGE_SHIFT), flags, - __builtin_return_address(0)); - if (!area) -@@ -1478,6 +1514,13 @@ static void *__vmalloc_node(unsigned lon - if (!size || (size >> PAGE_SHIFT) > num_physpages) - return 
NULL; - -+#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) -+ if (!(pgprot_val(prot) & _PAGE_NX)) -+ area = __get_vm_area_node(size, VM_ALLOC | VM_KERNEXEC, VMALLOC_START, VMALLOC_END, -+ node, gfp_mask, caller); -+ else -+#endif -+ - area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, - node, gfp_mask, caller); - -@@ -1496,6 +1539,7 @@ static void *__vmalloc_node(unsigned lon - return addr; - } - -+#undef __vmalloc - void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) - { - return __vmalloc_node(size, gfp_mask, prot, -1, -@@ -1512,6 +1556,7 @@ EXPORT_SYMBOL(__vmalloc); - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ -+#undef vmalloc - void *vmalloc(unsigned long size) - { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, -@@ -1526,6 +1571,7 @@ EXPORT_SYMBOL(vmalloc); - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - */ -+#undef vmalloc_user - void *vmalloc_user(unsigned long size) - { - struct vm_struct *area; -@@ -1552,6 +1598,7 @@ EXPORT_SYMBOL(vmalloc_user); - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ -+#undef vmalloc_node - void *vmalloc_node(unsigned long size, int node) - { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, -@@ -1574,10 +1621,10 @@ EXPORT_SYMBOL(vmalloc_node); - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ -- -+#undef vmalloc_exec - void *vmalloc_exec(unsigned long size) - { -- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, -+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL_EXEC, - -1, __builtin_return_address(0)); - } - -@@ -1596,6 +1643,7 @@ void *vmalloc_exec(unsigned long size) - * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into contiguous kernel virtual space. - */ -+#undef vmalloc_32 - void *vmalloc_32(unsigned long size) - { - return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, -@@ -1610,6 +1658,7 @@ EXPORT_SYMBOL(vmalloc_32); - * The resulting memory area is 32bit addressable and zeroed so it can be - * mapped to userspace without leaking data. 
- */ -+#undef vmalloc_32_user - void *vmalloc_32_user(unsigned long size) - { - struct vm_struct *area; -diff -urNp linux-2.6.31.1/net/atm/atm_misc.c linux-2.6.31.1/net/atm/atm_misc.c ---- linux-2.6.31.1/net/atm/atm_misc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/atm/atm_misc.c 2009-10-01 20:12:45.000000000 -0400 -@@ -19,7 +19,7 @@ int atm_charge(struct atm_vcc *vcc,int t - if (atomic_read(&sk_atm(vcc)->sk_rmem_alloc) <= sk_atm(vcc)->sk_rcvbuf) - return 1; - atm_return(vcc,truesize); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - return 0; - } - -@@ -41,7 +41,7 @@ struct sk_buff *atm_alloc_charge(struct - } - } - atm_return(vcc,guess); -- atomic_inc(&vcc->stats->rx_drop); -+ atomic_inc_unchecked(&vcc->stats->rx_drop); - return NULL; - } - -@@ -88,7 +88,7 @@ int atm_pcr_goal(const struct atm_trafpr - - void sonet_copy_stats(struct k_sonet_stats *from,struct sonet_stats *to) - { --#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) -+#define __HANDLE_ITEM(i) to->i = atomic_read_unchecked(&from->i) - __SONET_ITEMS - #undef __HANDLE_ITEM - } -@@ -96,7 +96,7 @@ void sonet_copy_stats(struct k_sonet_sta - - void sonet_subtract_stats(struct k_sonet_stats *from,struct sonet_stats *to) - { --#define __HANDLE_ITEM(i) atomic_sub(to->i,&from->i) -+#define __HANDLE_ITEM(i) atomic_sub_unchecked(to->i,&from->i) - __SONET_ITEMS - #undef __HANDLE_ITEM - } -diff -urNp linux-2.6.31.1/net/atm/proc.c linux-2.6.31.1/net/atm/proc.c ---- linux-2.6.31.1/net/atm/proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/atm/proc.c 2009-10-01 20:12:45.000000000 -0400 -@@ -43,9 +43,9 @@ static void add_stats(struct seq_file *s - const struct k_atm_aal_stats *stats) - { - seq_printf(seq, "%s ( %d %d %d %d %d )", aal, -- atomic_read(&stats->tx),atomic_read(&stats->tx_err), -- atomic_read(&stats->rx),atomic_read(&stats->rx_err), -- atomic_read(&stats->rx_drop)); -+ atomic_read_unchecked(&stats->tx),atomic_read_unchecked(&stats->tx_err), -+ atomic_read_unchecked(&stats->rx),atomic_read_unchecked(&stats->rx_err), -+ atomic_read_unchecked(&stats->rx_drop)); - } - - static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev) -diff -urNp linux-2.6.31.1/net/atm/resources.c linux-2.6.31.1/net/atm/resources.c ---- linux-2.6.31.1/net/atm/resources.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/atm/resources.c 2009-10-01 20:12:45.000000000 -0400 -@@ -161,7 +161,7 @@ void atm_dev_deregister(struct atm_dev * - static void copy_aal_stats(struct k_atm_aal_stats *from, - struct atm_aal_stats *to) - { --#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) -+#define __HANDLE_ITEM(i) to->i = atomic_read_unchecked(&from->i) - __AAL_STAT_ITEMS - #undef __HANDLE_ITEM - } -@@ -170,7 +170,7 @@ static void copy_aal_stats(struct k_atm_ - static void subtract_aal_stats(struct k_atm_aal_stats *from, - struct atm_aal_stats *to) - { --#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i) -+#define __HANDLE_ITEM(i) atomic_sub_unchecked(to->i, &from->i) - __AAL_STAT_ITEMS - #undef __HANDLE_ITEM - } -diff -urNp linux-2.6.31.1/net/bridge/br_stp_if.c linux-2.6.31.1/net/bridge/br_stp_if.c ---- linux-2.6.31.1/net/bridge/br_stp_if.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/bridge/br_stp_if.c 2009-10-01 20:12:45.000000000 -0400 -@@ -146,7 +146,7 @@ static void br_stp_stop(struct net_bridg - char *envp[] = { NULL }; - - if (br->stp_enabled == BR_USER_STP) { -- r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); -+ r = 
call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); - printk(KERN_INFO "%s: userspace STP stopped, return code %d\n", - br->dev->name, r); - -diff -urNp linux-2.6.31.1/net/core/flow.c linux-2.6.31.1/net/core/flow.c ---- linux-2.6.31.1/net/core/flow.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/core/flow.c 2009-10-01 20:12:45.000000000 -0400 -@@ -39,7 +39,7 @@ atomic_t flow_cache_genid = ATOMIC_INIT( - - static u32 flow_hash_shift; - #define flow_hash_size (1 << flow_hash_shift) --static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; -+static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables); - - #define flow_table(cpu) (per_cpu(flow_tables, cpu)) - -@@ -52,7 +52,7 @@ struct flow_percpu_info { - u32 hash_rnd; - int count; - }; --static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; -+static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info); - - #define flow_hash_rnd_recalc(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) -@@ -69,7 +69,7 @@ struct flow_flush_info { - atomic_t cpuleft; - struct completion completion; - }; --static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; -+static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets); - - #define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) - -diff -urNp linux-2.6.31.1/net/dccp/ccids/ccid3.c linux-2.6.31.1/net/dccp/ccids/ccid3.c ---- linux-2.6.31.1/net/dccp/ccids/ccid3.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/dccp/ccids/ccid3.c 2009-10-01 20:12:45.000000000 -0400 -@@ -43,7 +43,7 @@ - static int ccid3_debug; - #define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) - #else --#define ccid3_pr_debug(format, a...) -+#define ccid3_pr_debug(format, a...) do {} while (0) - #endif - - /* -diff -urNp linux-2.6.31.1/net/dccp/dccp.h linux-2.6.31.1/net/dccp/dccp.h ---- linux-2.6.31.1/net/dccp/dccp.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/dccp/dccp.h 2009-10-01 20:12:45.000000000 -0400 -@@ -44,9 +44,9 @@ extern int dccp_debug; - #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) - #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) - #else --#define dccp_pr_debug(format, a...) --#define dccp_pr_debug_cat(format, a...) --#define dccp_debug(format, a...) -+#define dccp_pr_debug(format, a...) do {} while (0) -+#define dccp_pr_debug_cat(format, a...) do {} while (0) -+#define dccp_debug(format, a...) do {} while (0) - #endif - - extern struct inet_hashinfo dccp_hashinfo; -diff -urNp linux-2.6.31.1/net/ipv4/inet_hashtables.c linux-2.6.31.1/net/ipv4/inet_hashtables.c ---- linux-2.6.31.1/net/ipv4/inet_hashtables.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv4/inet_hashtables.c 2009-10-01 20:12:45.000000000 -0400 -@@ -18,11 +18,14 @@ - #include <linux/sched.h> - #include <linux/slab.h> - #include <linux/wait.h> -+#include <linux/security.h> - - #include <net/inet_connection_sock.h> - #include <net/inet_hashtables.h> - #include <net/ip.h> - -+extern void gr_update_task_in_ip_table(struct task_struct *task, const struct inet_sock *inet); -+ - /* - * Allocate and initialize a new local port bind bucket. - * The bindhash mutex for snum's hash chain must be held here. 
-@@ -490,6 +493,8 @@ ok: - } - spin_unlock(&head->lock); - -+ gr_update_task_in_ip_table(current, inet_sk(sk)); -+ - if (tw) { - inet_twsk_deschedule(tw, death_row); - inet_twsk_put(tw); -diff -urNp linux-2.6.31.1/net/ipv4/netfilter/nf_nat_snmp_basic.c linux-2.6.31.1/net/ipv4/netfilter/nf_nat_snmp_basic.c ---- linux-2.6.31.1/net/ipv4/netfilter/nf_nat_snmp_basic.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv4/netfilter/nf_nat_snmp_basic.c 2009-10-01 20:12:45.000000000 -0400 -@@ -397,7 +397,7 @@ static unsigned char asn1_octets_decode( - - *len = 0; - -- *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); -+ *octets = kmalloc((eoc - ctx->pointer), GFP_ATOMIC); - if (*octets == NULL) { - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); -diff -urNp linux-2.6.31.1/net/ipv4/tcp_ipv4.c linux-2.6.31.1/net/ipv4/tcp_ipv4.c ---- linux-2.6.31.1/net/ipv4/tcp_ipv4.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv4/tcp_ipv4.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1504,6 +1504,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc - return 0; - - reset: -+#ifdef CONFIG_GRKERNSEC_BLACKHOLE -+ if (!skb->dev || (skb->dev->flags & IFF_LOOPBACK)) -+#endif - tcp_v4_send_reset(rsk, skb); - discard: - kfree_skb(skb); -@@ -1612,6 +1615,9 @@ no_tcp_socket: - bad_packet: - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); - } else { -+#ifdef CONFIG_GRKERNSEC_BLACKHOLE -+ if (skb->dev->flags & IFF_LOOPBACK) -+#endif - tcp_v4_send_reset(NULL, skb); - } - -diff -urNp linux-2.6.31.1/net/ipv4/tcp_minisocks.c linux-2.6.31.1/net/ipv4/tcp_minisocks.c ---- linux-2.6.31.1/net/ipv4/tcp_minisocks.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv4/tcp_minisocks.c 2009-10-01 20:12:45.000000000 -0400 -@@ -695,8 +695,11 @@ listen_overflow: - - embryonic_reset: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); -+ -+#ifndef CONFIG_GRKERNSEC_BLACKHOLE - if (!(flg & TCP_FLAG_RST)) - req->rsk_ops->send_reset(sk, skb); -+#endif - - inet_csk_reqsk_queue_drop(sk, req, prev); - return NULL; -diff -urNp linux-2.6.31.1/net/ipv4/udp.c linux-2.6.31.1/net/ipv4/udp.c ---- linux-2.6.31.1/net/ipv4/udp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv4/udp.c 2009-10-01 20:12:45.000000000 -0400 -@@ -86,6 +86,7 @@ - #include <linux/types.h> - #include <linux/fcntl.h> - #include <linux/module.h> -+#include <linux/security.h> - #include <linux/socket.h> - #include <linux/sockios.h> - #include <linux/igmp.h> -@@ -369,6 +370,9 @@ found: - return s; - } - -+extern int gr_search_udp_recvmsg(struct sock *sk, const struct sk_buff *skb); -+extern int gr_search_udp_sendmsg(struct sock *sk, struct sockaddr_in *addr); -+ - /* - * This routine is called by the ICMP module when it gets some - * sort of error condition. If err < 0 then the socket should -@@ -631,9 +635,18 @@ int udp_sendmsg(struct kiocb *iocb, stru - dport = usin->sin_port; - if (dport == 0) - return -EINVAL; -+ -+ err = gr_search_udp_sendmsg(sk, usin); -+ if (err) -+ return err; - } else { - if (sk->sk_state != TCP_ESTABLISHED) - return -EDESTADDRREQ; -+ -+ err = gr_search_udp_sendmsg(sk, NULL); -+ if (err) -+ return err; -+ - daddr = inet->daddr; - dport = inet->dport; - /* Open fast path for connected socket. 
-@@ -903,6 +916,10 @@ try_again: - if (!skb) - goto out; - -+ err = gr_search_udp_recvmsg(sk, skb); -+ if (err) -+ goto out_free; -+ - ulen = skb->len - sizeof(struct udphdr); - copied = len; - if (copied > ulen) -@@ -1293,6 +1310,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, - goto csum_error; - - UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); -+#ifdef CONFIG_GRKERNSEC_BLACKHOLE -+ if (skb->dev->flags & IFF_LOOPBACK) -+#endif - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - - /* -diff -urNp linux-2.6.31.1/net/ipv6/exthdrs.c linux-2.6.31.1/net/ipv6/exthdrs.c ---- linux-2.6.31.1/net/ipv6/exthdrs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv6/exthdrs.c 2009-10-01 20:12:45.000000000 -0400 -@@ -630,7 +630,7 @@ static struct tlvtype_proc tlvprochopopt - .type = IPV6_TLV_JUMBO, - .func = ipv6_hop_jumbo, - }, -- { -1, } -+ { -1, NULL } - }; - - int ipv6_parse_hopopts(struct sk_buff *skb) -diff -urNp linux-2.6.31.1/net/ipv6/ip6mr.c linux-2.6.31.1/net/ipv6/ip6mr.c ---- linux-2.6.31.1/net/ipv6/ip6mr.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv6/ip6mr.c 2009-10-01 20:12:45.000000000 -0400 -@@ -204,7 +204,7 @@ static int ip6mr_vif_seq_show(struct seq - return 0; - } - --static struct seq_operations ip6mr_vif_seq_ops = { -+static const struct seq_operations ip6mr_vif_seq_ops = { - .start = ip6mr_vif_seq_start, - .next = ip6mr_vif_seq_next, - .stop = ip6mr_vif_seq_stop, -@@ -217,7 +217,7 @@ static int ip6mr_vif_open(struct inode * - sizeof(struct ipmr_vif_iter)); - } - --static struct file_operations ip6mr_vif_fops = { -+static const struct file_operations ip6mr_vif_fops = { - .owner = THIS_MODULE, - .open = ip6mr_vif_open, - .read = seq_read, -@@ -328,7 +328,7 @@ static int ipmr_mfc_seq_show(struct seq_ - return 0; - } - --static struct seq_operations ipmr_mfc_seq_ops = { -+static const struct seq_operations ipmr_mfc_seq_ops = { - .start = ipmr_mfc_seq_start, - .next = ipmr_mfc_seq_next, - .stop = ipmr_mfc_seq_stop, -@@ -341,7 +341,7 @@ static int ipmr_mfc_open(struct inode *i - sizeof(struct ipmr_mfc_iter)); - } - --static struct file_operations ip6mr_mfc_fops = { -+static const struct file_operations ip6mr_mfc_fops = { - .owner = THIS_MODULE, - .open = ipmr_mfc_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/net/ipv6/raw.c linux-2.6.31.1/net/ipv6/raw.c ---- linux-2.6.31.1/net/ipv6/raw.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv6/raw.c 2009-10-01 20:12:45.000000000 -0400 -@@ -600,7 +600,7 @@ out: - return err; - } - --static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, -+static int rawv6_send_hdrinc(struct sock *sk, void *from, unsigned int length, - struct flowi *fl, struct rt6_info *rt, - unsigned int flags) - { -diff -urNp linux-2.6.31.1/net/ipv6/tcp_ipv6.c linux-2.6.31.1/net/ipv6/tcp_ipv6.c ---- linux-2.6.31.1/net/ipv6/tcp_ipv6.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv6/tcp_ipv6.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1577,6 +1577,9 @@ static int tcp_v6_do_rcv(struct sock *sk - return 0; - - reset: -+#ifdef CONFIG_GRKERNSEC_BLACKHOLE -+ if (!skb->dev || (skb->dev->flags & IFF_LOOPBACK)) -+#endif - tcp_v6_send_reset(sk, skb); - discard: - if (opt_skb) -@@ -1699,6 +1702,9 @@ no_tcp_socket: - bad_packet: - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); - } else { -+#ifdef CONFIG_GRKERNSEC_BLACKHOLE -+ if (skb->dev->flags & IFF_LOOPBACK) -+#endif - tcp_v6_send_reset(NULL, skb); - } - -diff -urNp linux-2.6.31.1/net/ipv6/udp.c linux-2.6.31.1/net/ipv6/udp.c ---- 
linux-2.6.31.1/net/ipv6/udp.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/ipv6/udp.c 2009-10-01 20:12:45.000000000 -0400 -@@ -589,6 +589,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, - UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, - proto == IPPROTO_UDPLITE); - -+#ifdef CONFIG_GRKERNSEC_BLACKHOLE -+ if (skb->dev->flags & IFF_LOOPBACK) -+#endif - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); - - kfree_skb(skb); -diff -urNp linux-2.6.31.1/net/irda/ircomm/ircomm_tty.c linux-2.6.31.1/net/irda/ircomm/ircomm_tty.c ---- linux-2.6.31.1/net/irda/ircomm/ircomm_tty.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/irda/ircomm/ircomm_tty.c 2009-10-01 20:12:45.000000000 -0400 -@@ -280,16 +280,16 @@ static int ircomm_tty_block_til_ready(st - add_wait_queue(&self->open_wait, &wait); - - IRDA_DEBUG(2, "%s(%d):block_til_ready before block on %s open_count=%d\n", -- __FILE__,__LINE__, tty->driver->name, self->open_count ); -+ __FILE__,__LINE__, tty->driver->name, atomic_read(&self->open_count) ); - - /* As far as I can see, we protect open_count - Jean II */ - spin_lock_irqsave(&self->spinlock, flags); - if (!tty_hung_up_p(filp)) { - extra_count = 1; -- self->open_count--; -+ atomic_dec(&self->open_count); - } - spin_unlock_irqrestore(&self->spinlock, flags); -- self->blocked_open++; -+ atomic_inc(&self->blocked_open); - - while (1) { - if (tty->termios->c_cflag & CBAUD) { -@@ -329,7 +329,7 @@ static int ircomm_tty_block_til_ready(st - } - - IRDA_DEBUG(1, "%s(%d):block_til_ready blocking on %s open_count=%d\n", -- __FILE__,__LINE__, tty->driver->name, self->open_count ); -+ __FILE__,__LINE__, tty->driver->name, atomic_read(&self->open_count) ); - - schedule(); - } -@@ -340,13 +340,13 @@ static int ircomm_tty_block_til_ready(st - if (extra_count) { - /* ++ is not atomic, so this should be protected - Jean II */ - spin_lock_irqsave(&self->spinlock, flags); -- self->open_count++; -+ atomic_inc(&self->open_count); - spin_unlock_irqrestore(&self->spinlock, flags); - } -- self->blocked_open--; -+ atomic_dec(&self->blocked_open); - - IRDA_DEBUG(1, "%s(%d):block_til_ready after blocking on %s open_count=%d\n", -- __FILE__,__LINE__, tty->driver->name, self->open_count); -+ __FILE__,__LINE__, tty->driver->name, atomic_read(&self->open_count)); - - if (!retval) - self->flags |= ASYNC_NORMAL_ACTIVE; -@@ -415,14 +415,14 @@ static int ircomm_tty_open(struct tty_st - } - /* ++ is not atomic, so this should be protected - Jean II */ - spin_lock_irqsave(&self->spinlock, flags); -- self->open_count++; -+ atomic_inc(&self->open_count); - - tty->driver_data = self; - self->tty = tty; - spin_unlock_irqrestore(&self->spinlock, flags); - - IRDA_DEBUG(1, "%s(), %s%d, count = %d\n", __func__ , tty->driver->name, -- self->line, self->open_count); -+ self->line, atomic_read(&self->open_count)); - - /* Not really used by us, but lets do it anyway */ - self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 1 : 0; -@@ -511,7 +511,7 @@ static void ircomm_tty_close(struct tty_ - return; - } - -- if ((tty->count == 1) && (self->open_count != 1)) { -+ if ((tty->count == 1) && (atomic_read(&self->open_count) != 1)) { - /* - * Uh, oh. tty->count is 1, which means that the tty - * structure will be freed. 
state->count should always -@@ -521,16 +521,16 @@ static void ircomm_tty_close(struct tty_ - */ - IRDA_DEBUG(0, "%s(), bad serial port count; " - "tty->count is 1, state->count is %d\n", __func__ , -- self->open_count); -- self->open_count = 1; -+ atomic_read(&self->open_count)); -+ atomic_set(&self->open_count, 1); - } - -- if (--self->open_count < 0) { -+ if (atomic_dec_return(&self->open_count) < 0) { - IRDA_ERROR("%s(), bad serial port count for ttys%d: %d\n", -- __func__, self->line, self->open_count); -- self->open_count = 0; -+ __func__, self->line, atomic_read(&self->open_count)); -+ atomic_set(&self->open_count, 0); - } -- if (self->open_count) { -+ if (atomic_read(&self->open_count)) { - spin_unlock_irqrestore(&self->spinlock, flags); - - IRDA_DEBUG(0, "%s(), open count > 0\n", __func__ ); -@@ -562,7 +562,7 @@ static void ircomm_tty_close(struct tty_ - tty->closing = 0; - self->tty = NULL; - -- if (self->blocked_open) { -+ if (atomic_read(&self->blocked_open)) { - if (self->close_delay) - schedule_timeout_interruptible(self->close_delay); - wake_up_interruptible(&self->open_wait); -@@ -1017,7 +1017,7 @@ static void ircomm_tty_hangup(struct tty - spin_lock_irqsave(&self->spinlock, flags); - self->flags &= ~ASYNC_NORMAL_ACTIVE; - self->tty = NULL; -- self->open_count = 0; -+ atomic_set(&self->open_count, 0); - spin_unlock_irqrestore(&self->spinlock, flags); - - wake_up_interruptible(&self->open_wait); -@@ -1369,7 +1369,7 @@ static void ircomm_tty_line_info(struct - seq_putc(m, '\n'); - - seq_printf(m, "Role: %s\n", self->client ? "client" : "server"); -- seq_printf(m, "Open count: %d\n", self->open_count); -+ seq_printf(m, "Open count: %d\n", atomic_read(&self->open_count)); - seq_printf(m, "Max data size: %d\n", self->max_data_size); - seq_printf(m, "Max header size: %d\n", self->max_header_size); - -diff -urNp linux-2.6.31.1/net/key/af_key.c linux-2.6.31.1/net/key/af_key.c ---- linux-2.6.31.1/net/key/af_key.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/key/af_key.c 2009-10-01 20:12:45.000000000 -0400 -@@ -3705,7 +3705,7 @@ static void pfkey_seq_stop(struct seq_fi - read_unlock(&pfkey_table_lock); - } - --static struct seq_operations pfkey_seq_ops = { -+static const struct seq_operations pfkey_seq_ops = { - .start = pfkey_seq_start, - .next = pfkey_seq_next, - .stop = pfkey_seq_stop, -@@ -3718,7 +3718,7 @@ static int pfkey_seq_open(struct inode * - sizeof(struct seq_net_private)); - } - --static struct file_operations pfkey_proc_ops = { -+static const struct file_operations pfkey_proc_ops = { - .open = pfkey_seq_open, - .read = seq_read, - .llseek = seq_lseek, -diff -urNp linux-2.6.31.1/net/mac80211/ieee80211_i.h linux-2.6.31.1/net/mac80211/ieee80211_i.h ---- linux-2.6.31.1/net/mac80211/ieee80211_i.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/ieee80211_i.h 2009-10-01 20:12:45.000000000 -0400 -@@ -609,7 +609,7 @@ struct ieee80211_local { - spinlock_t queue_stop_reason_lock; - - struct net_device *mdev; /* wmaster# - "master" 802.11 device */ -- int open_count; -+ atomic_t open_count; - int monitors, cooked_mntrs; - /* number of interfaces with corresponding FIF_ flags */ - int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss; -diff -urNp linux-2.6.31.1/net/mac80211/iface.c linux-2.6.31.1/net/mac80211/iface.c ---- linux-2.6.31.1/net/mac80211/iface.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/iface.c 2009-10-01 20:12:45.000000000 -0400 -@@ -164,7 +164,7 @@ static int ieee80211_open(struct net_dev - break; - } - 
-- if (local->open_count == 0) { -+ if (atomic_read(&local->open_count) == 0) { - res = drv_start(local); - if (res) - goto err_del_bss; -@@ -198,7 +198,7 @@ static int ieee80211_open(struct net_dev - * Validate the MAC address for this device. - */ - if (!is_valid_ether_addr(dev->dev_addr)) { -- if (!local->open_count) -+ if (!atomic_read(&local->open_count)) - drv_stop(local); - return -EADDRNOTAVAIL; - } -@@ -281,7 +281,7 @@ static int ieee80211_open(struct net_dev - } - } - -- if (local->open_count == 0) { -+ if (atomic_read(&local->open_count) == 0) { - res = dev_open(local->mdev); - WARN_ON(res); - if (res) -@@ -303,7 +303,7 @@ static int ieee80211_open(struct net_dev - - hw_reconf_flags |= __ieee80211_recalc_idle(local); - -- local->open_count++; -+ atomic_inc(&local->open_count); - if (hw_reconf_flags) { - ieee80211_hw_config(local, hw_reconf_flags); - /* -@@ -331,7 +331,7 @@ static int ieee80211_open(struct net_dev - err_del_interface: - drv_remove_interface(local, &conf); - err_stop: -- if (!local->open_count) -+ if (!atomic_read(&local->open_count)) - drv_stop(local); - err_del_bss: - sdata->bss = NULL; -@@ -429,7 +429,7 @@ static int ieee80211_stop(struct net_dev - WARN_ON(!list_empty(&sdata->u.ap.vlans)); - } - -- local->open_count--; -+ atomic_dec(&local->open_count); - - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP_VLAN: -@@ -554,7 +554,7 @@ static int ieee80211_stop(struct net_dev - - ieee80211_recalc_ps(local, -1); - -- if (local->open_count == 0) { -+ if (atomic_read(&local->open_count) == 0) { - if (netif_running(local->mdev)) - dev_close(local->mdev); - -diff -urNp linux-2.6.31.1/net/mac80211/main.c linux-2.6.31.1/net/mac80211/main.c ---- linux-2.6.31.1/net/mac80211/main.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/main.c 2009-10-01 20:12:45.000000000 -0400 -@@ -193,7 +193,7 @@ int ieee80211_hw_config(struct ieee80211 - local->hw.conf.power_level = power; - } - -- if (changed && local->open_count) { -+ if (changed && atomic_read(&local->open_count)) { - ret = drv_config(local, changed); - /* - * Goal: -diff -urNp linux-2.6.31.1/net/mac80211/pm.c linux-2.6.31.1/net/mac80211/pm.c ---- linux-2.6.31.1/net/mac80211/pm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/pm.c 2009-10-01 20:12:45.000000000 -0400 -@@ -103,7 +103,7 @@ int __ieee80211_suspend(struct ieee80211 - } - - /* stop hardware - this must stop RX */ -- if (local->open_count) { -+ if (atomic_read(&local->open_count)) { - ieee80211_led_radio(local, false); - drv_stop(local); - } -diff -urNp linux-2.6.31.1/net/mac80211/rate.c linux-2.6.31.1/net/mac80211/rate.c ---- linux-2.6.31.1/net/mac80211/rate.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/rate.c 2009-10-01 20:12:45.000000000 -0400 -@@ -258,7 +258,7 @@ int ieee80211_init_rate_ctrl_alg(struct - struct rate_control_ref *ref, *old; - - ASSERT_RTNL(); -- if (local->open_count || netif_running(local->mdev)) -+ if (atomic_read(&local->open_count) || netif_running(local->mdev)) - return -EBUSY; - - ref = rate_control_alloc(name, local); -diff -urNp linux-2.6.31.1/net/mac80211/rc80211_minstrel_debugfs.c linux-2.6.31.1/net/mac80211/rc80211_minstrel_debugfs.c ---- linux-2.6.31.1/net/mac80211/rc80211_minstrel_debugfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/rc80211_minstrel_debugfs.c 2009-10-01 20:12:45.000000000 -0400 -@@ -139,7 +139,7 @@ minstrel_stats_release(struct inode *ino - return 0; - } - --static struct file_operations minstrel_stat_fops = { -+static 
const struct file_operations minstrel_stat_fops = { - .owner = THIS_MODULE, - .open = minstrel_stats_open, - .read = minstrel_stats_read, -diff -urNp linux-2.6.31.1/net/mac80211/rc80211_pid_debugfs.c linux-2.6.31.1/net/mac80211/rc80211_pid_debugfs.c ---- linux-2.6.31.1/net/mac80211/rc80211_pid_debugfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/rc80211_pid_debugfs.c 2009-10-01 20:12:45.000000000 -0400 -@@ -198,7 +198,7 @@ static ssize_t rate_control_pid_events_r - - #undef RC_PID_PRINT_BUF_SIZE - --static struct file_operations rc_pid_fop_events = { -+static const struct file_operations rc_pid_fop_events = { - .owner = THIS_MODULE, - .read = rate_control_pid_events_read, - .poll = rate_control_pid_events_poll, -diff -urNp linux-2.6.31.1/net/mac80211/util.c linux-2.6.31.1/net/mac80211/util.c ---- linux-2.6.31.1/net/mac80211/util.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/mac80211/util.c 2009-10-01 20:12:45.000000000 -0400 -@@ -991,7 +991,7 @@ int ieee80211_reconfig(struct ieee80211_ - local->suspended = false; - - /* restart hardware */ -- if (local->open_count) { -+ if (atomic_read(&local->open_count)) { - res = drv_start(local); - - ieee80211_led_radio(local, true); -diff -urNp linux-2.6.31.1/net/packet/af_packet.c linux-2.6.31.1/net/packet/af_packet.c ---- linux-2.6.31.1/net/packet/af_packet.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/packet/af_packet.c 2009-10-01 20:12:45.000000000 -0400 -@@ -2086,7 +2086,7 @@ static void packet_mm_close(struct vm_ar - atomic_dec(&pkt_sk(sk)->mapped); - } - --static struct vm_operations_struct packet_mmap_ops = { -+static const struct vm_operations_struct packet_mmap_ops = { - .open = packet_mm_open, - .close =packet_mm_close, - }; -diff -urNp linux-2.6.31.1/net/sctp/socket.c linux-2.6.31.1/net/sctp/socket.c ---- linux-2.6.31.1/net/sctp/socket.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/sctp/socket.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1471,7 +1471,7 @@ SCTP_STATIC int sctp_sendmsg(struct kioc - struct sctp_sndrcvinfo *sinfo; - struct sctp_initmsg *sinit; - sctp_assoc_t associd = 0; -- sctp_cmsgs_t cmsgs = { NULL }; -+ sctp_cmsgs_t cmsgs = { NULL, NULL }; - int err; - sctp_scope_t scope; - long timeo; -@@ -5790,7 +5790,6 @@ pp_found: - */ - int reuse = sk->sk_reuse; - struct sock *sk2; -- struct hlist_node *node; - - SCTP_DEBUG_PRINTK("sctp_get_port() found a possible match\n"); - if (pp->fastreuse && sk->sk_reuse && -diff -urNp linux-2.6.31.1/net/socket.c linux-2.6.31.1/net/socket.c ---- linux-2.6.31.1/net/socket.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/socket.c 2009-10-01 20:12:45.000000000 -0400 -@@ -86,6 +86,7 @@ - #include <linux/audit.h> - #include <linux/wireless.h> - #include <linux/nsproxy.h> -+#include <linux/in.h> - - #include <asm/uaccess.h> - #include <asm/unistd.h> -@@ -96,6 +97,21 @@ - #include <net/sock.h> - #include <linux/netfilter.h> - -+extern void gr_attach_curr_ip(const struct sock *sk); -+extern int gr_handle_sock_all(const int family, const int type, -+ const int protocol); -+extern int gr_handle_sock_server(const struct sockaddr *sck); -+extern int gr_handle_sock_server_other(const struct socket *sck); -+extern int gr_handle_sock_client(const struct sockaddr *sck); -+extern int gr_search_connect(struct socket * sock, -+ struct sockaddr_in * addr); -+extern int gr_search_bind(struct socket * sock, -+ struct sockaddr_in * addr); -+extern int gr_search_listen(struct socket * sock); -+extern int gr_search_accept(struct socket 
* sock); -+extern int gr_search_socket(const int domain, const int type, -+ const int protocol); -+ - static int sock_no_open(struct inode *irrelevant, struct file *dontcare); - static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -@@ -285,7 +301,7 @@ static int init_inodecache(void) - return 0; - } - --static struct super_operations sockfs_ops = { -+static const struct super_operations sockfs_ops = { - .alloc_inode = sock_alloc_inode, - .destroy_inode =sock_destroy_inode, - .statfs = simple_statfs, -@@ -299,7 +315,7 @@ static int sockfs_get_sb(struct file_sys - mnt); - } - --static struct vfsmount *sock_mnt __read_mostly; -+struct vfsmount *sock_mnt __read_mostly; - - static struct file_system_type sock_fs_type = { - .name = "sockfs", -@@ -1283,6 +1299,16 @@ SYSCALL_DEFINE3(socket, int, family, int - if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) - flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - -+ if(!gr_search_socket(family, type, protocol)) { -+ retval = -EACCES; -+ goto out; -+ } -+ -+ if (gr_handle_sock_all(family, type, protocol)) { -+ retval = -EACCES; -+ goto out; -+ } -+ - retval = sock_create(family, type, protocol, &sock); - if (retval < 0) - goto out; -@@ -1415,6 +1441,14 @@ SYSCALL_DEFINE3(bind, int, fd, struct so - if (sock) { - err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address); - if (err >= 0) { -+ if (gr_handle_sock_server((struct sockaddr *)&address)) { -+ err = -EACCES; -+ goto error; -+ } -+ err = gr_search_bind(sock, (struct sockaddr_in *)&address); -+ if (err) -+ goto error; -+ - err = security_socket_bind(sock, - (struct sockaddr *)&address, - addrlen); -@@ -1423,6 +1457,7 @@ SYSCALL_DEFINE3(bind, int, fd, struct so - (struct sockaddr *) - &address, addrlen); - } -+error: - fput_light(sock->file, fput_needed); - } - return err; -@@ -1446,10 +1481,20 @@ SYSCALL_DEFINE2(listen, int, fd, int, ba - if ((unsigned)backlog > somaxconn) - backlog = somaxconn; - -+ if (gr_handle_sock_server_other(sock)) { -+ err = -EPERM; -+ goto error; -+ } -+ -+ err = gr_search_listen(sock); -+ if (err) -+ goto error; -+ - err = security_socket_listen(sock, backlog); - if (!err) - err = sock->ops->listen(sock, backlog); - -+error: - fput_light(sock->file, fput_needed); - } - return err; -@@ -1492,6 +1537,18 @@ SYSCALL_DEFINE4(accept4, int, fd, struct - newsock->type = sock->type; - newsock->ops = sock->ops; - -+ if (gr_handle_sock_server_other(sock)) { -+ err = -EPERM; -+ sock_release(newsock); -+ goto out_put; -+ } -+ -+ err = gr_search_accept(sock); -+ if (err) { -+ sock_release(newsock); -+ goto out_put; -+ } -+ - /* - * We don't need try_module_get here, as the listening socket (sock) - * has the protocol module (sock->ops->owner) held. 
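The socket(), bind(), listen() and accept4() hunks above all wire in the same call shape: a policy hook runs before the real operation and, on refusal, the syscall bails out through an error label with -EACCES or -EPERM. A schematic, runnable sketch of that shape (policy_allows() and its return values are invented for illustration; this is not grsecurity's actual hook implementation):

    #include <errno.h>
    #include <stdio.h>

    /* Invented stand-in for a gr_handle_sock_* hook: non-zero allows
     * the operation, zero denies it. */
    static int policy_allows(int family, int type, int protocol)
    {
        (void)family; (void)type; (void)protocol;
        return 1;               /* allow everything in this sketch */
    }

    static int do_socket(int family, int type, int protocol)
    {
        int retval;

        /* Same shape as the patched sys_socket(): consult the hook
         * first and bail out early if it refuses. */
        if (!policy_allows(family, type, protocol)) {
            retval = -EACCES;
            goto out;
        }

        retval = 3;             /* pretend this is the new socket fd */
    out:
        return retval;
    }

    int main(void)
    {
        printf("do_socket() -> %d\n", do_socket(2, 1, 0));
        return 0;
    }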
-@@ -1534,6 +1591,8 @@ SYSCALL_DEFINE4(accept4, int, fd, struct - fd_install(newfd, newfile); - err = newfd; - -+ gr_attach_curr_ip(newsock->sk); -+ - out_put: - fput_light(sock->file, fput_needed); - out: -@@ -1571,6 +1630,7 @@ SYSCALL_DEFINE3(connect, int, fd, struct - int, addrlen) - { - struct socket *sock; -+ struct sockaddr *sck; - struct sockaddr_storage address; - int err, fput_needed; - -@@ -1581,6 +1641,17 @@ SYSCALL_DEFINE3(connect, int, fd, struct - if (err < 0) - goto out_put; - -+ sck = (struct sockaddr *)&address; -+ -+ if (gr_handle_sock_client(sck)) { -+ err = -EACCES; -+ goto out_put; -+ } -+ -+ err = gr_search_connect(sock, (struct sockaddr_in *)sck); -+ if (err) -+ goto out_put; -+ - err = - security_socket_connect(sock, (struct sockaddr *)&address, addrlen); - if (err) -diff -urNp linux-2.6.31.1/net/sunrpc/rpc_pipe.c linux-2.6.31.1/net/sunrpc/rpc_pipe.c ---- linux-2.6.31.1/net/sunrpc/rpc_pipe.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/sunrpc/rpc_pipe.c 2009-10-01 20:12:45.000000000 -0400 -@@ -858,7 +858,7 @@ EXPORT_SYMBOL_GPL(rpc_unlink); - /* - * populate the filesystem - */ --static struct super_operations s_ops = { -+static const struct super_operations s_ops = { - .alloc_inode = rpc_alloc_inode, - .destroy_inode = rpc_destroy_inode, - .statfs = simple_statfs, -diff -urNp linux-2.6.31.1/net/unix/af_unix.c linux-2.6.31.1/net/unix/af_unix.c ---- linux-2.6.31.1/net/unix/af_unix.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/unix/af_unix.c 2009-10-01 20:12:45.000000000 -0400 -@@ -734,6 +734,12 @@ static struct sock *unix_find_other(stru - err = -ECONNREFUSED; - if (!S_ISSOCK(inode->i_mode)) - goto put_fail; -+ -+ if (!gr_acl_handle_unix(path.dentry, path.mnt)) { -+ err = -EACCES; -+ goto put_fail; -+ } -+ - u = unix_find_socket_byinode(net, inode); - if (!u) - goto put_fail; -@@ -754,6 +760,13 @@ static struct sock *unix_find_other(stru - if (u) { - struct dentry *dentry; - dentry = unix_sk(u)->dentry; -+ -+ if (!gr_handle_chroot_unix(u->sk_peercred.pid)) { -+ err = -EPERM; -+ sock_put(u); -+ goto fail; -+ } -+ - if (dentry) - touch_atime(unix_sk(u)->mnt, dentry); - } else -@@ -839,11 +852,18 @@ static int unix_bind(struct socket *sock - err = security_path_mknod(&nd.path, dentry, mode, 0); - if (err) - goto out_mknod_drop_write; -+ if (!gr_acl_handle_mknod(dentry, nd.path.dentry, nd.path.mnt, mode)) { -+ err = -EACCES; -+ goto out_mknod_drop_write; -+ } - err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0); - out_mknod_drop_write: - mnt_drop_write(nd.path.mnt); - if (err) - goto out_mknod_dput; -+ -+ gr_handle_create(dentry, nd.path.mnt); -+ - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - dput(nd.path.dentry); - nd.path.dentry = dentry; -@@ -861,6 +881,10 @@ out_mknod_drop_write: - goto out_unlock; - } - -+#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX -+ sk->sk_peercred.pid = current->pid; -+#endif -+ - list = &unix_socket_table[addr->hash]; - } else { - list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; -diff -urNp linux-2.6.31.1/net/xfrm/xfrm_proc.c linux-2.6.31.1/net/xfrm/xfrm_proc.c ---- linux-2.6.31.1/net/xfrm/xfrm_proc.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/net/xfrm/xfrm_proc.c 2009-10-01 20:12:45.000000000 -0400 -@@ -60,7 +60,7 @@ static int xfrm_statistics_seq_open(stru - return single_open_net(inode, file, xfrm_statistics_seq_show); - } - --static struct file_operations xfrm_statistics_seq_fops = { -+static const struct file_operations xfrm_statistics_seq_fops = { - .owner = 
THIS_MODULE, - .open = xfrm_statistics_seq_open, - .read = seq_read, -diff -urNp linux-2.6.31.1/samples/markers/marker-example.c linux-2.6.31.1/samples/markers/marker-example.c ---- linux-2.6.31.1/samples/markers/marker-example.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/samples/markers/marker-example.c 2009-10-01 20:12:45.000000000 -0400 -@@ -26,7 +26,7 @@ static int my_open(struct inode *inode, - return -EPERM; - } - --static struct file_operations mark_ops = { -+static const struct file_operations mark_ops = { - .open = my_open, - }; - -diff -urNp linux-2.6.31.1/samples/tracepoints/tracepoint-sample.c linux-2.6.31.1/samples/tracepoints/tracepoint-sample.c ---- linux-2.6.31.1/samples/tracepoints/tracepoint-sample.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/samples/tracepoints/tracepoint-sample.c 2009-10-01 20:12:45.000000000 -0400 -@@ -28,7 +28,7 @@ static int my_open(struct inode *inode, - return -EPERM; - } - --static struct file_operations mark_ops = { -+static const struct file_operations mark_ops = { - .open = my_open, - }; - -diff -urNp linux-2.6.31.1/scripts/basic/fixdep.c linux-2.6.31.1/scripts/basic/fixdep.c ---- linux-2.6.31.1/scripts/basic/fixdep.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/basic/fixdep.c 2009-10-01 20:12:45.000000000 -0400 -@@ -224,9 +224,9 @@ void use_config(char *m, int slen) - - void parse_config_file(char *map, size_t len) - { -- int *end = (int *) (map + len); -+ unsigned int *end = (unsigned int *) (map + len); - /* start at +1, so that p can never be < map */ -- int *m = (int *) map + 1; -+ unsigned int *m = (unsigned int *) map + 1; - char *p, *q; - - for (; m < end; m++) { -@@ -373,7 +373,7 @@ void print_deps(void) - void traps(void) - { - static char test[] __attribute__((aligned(sizeof(int)))) = "CONF"; -- int *p = (int *)test; -+ unsigned int *p = (unsigned int *)test; - - if (*p != INT_CONF) { - fprintf(stderr, "fixdep: sizeof(int) != 4 or wrong endianess? 
%#x\n", -diff -urNp linux-2.6.31.1/scripts/kallsyms.c linux-2.6.31.1/scripts/kallsyms.c ---- linux-2.6.31.1/scripts/kallsyms.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/kallsyms.c 2009-10-01 20:12:45.000000000 -0400 -@@ -43,10 +43,10 @@ struct text_range { - - static unsigned long long _text; - static struct text_range text_ranges[] = { -- { "_stext", "_etext" }, -- { "_sinittext", "_einittext" }, -- { "_stext_l1", "_etext_l1" }, /* Blackfin on-chip L1 inst SRAM */ -- { "_stext_l2", "_etext_l2" }, /* Blackfin on-chip L2 SRAM */ -+ { "_stext", "_etext", 0, 0 }, -+ { "_sinittext", "_einittext", 0, 0 }, -+ { "_stext_l1", "_etext_l1", 0, 0 }, /* Blackfin on-chip L1 inst SRAM */ -+ { "_stext_l2", "_etext_l2", 0, 0 }, /* Blackfin on-chip L2 SRAM */ - }; - #define text_range_text (&text_ranges[0]) - #define text_range_inittext (&text_ranges[1]) -diff -urNp linux-2.6.31.1/scripts/kconfig/lkc.h linux-2.6.31.1/scripts/kconfig/lkc.h ---- linux-2.6.31.1/scripts/kconfig/lkc.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/kconfig/lkc.h 2009-10-01 20:12:45.000000000 -0400 -@@ -97,7 +97,7 @@ void menu_add_expr(enum prop_type type, - void menu_add_symbol(enum prop_type type, struct symbol *sym, struct expr *dep); - void menu_add_option(int token, char *arg); - void menu_finalize(struct menu *parent); --void menu_set_type(int type); -+void menu_set_type(unsigned int type); - - /* util.c */ - struct file *file_lookup(const char *name); -diff -urNp linux-2.6.31.1/scripts/kconfig/mconf.c linux-2.6.31.1/scripts/kconfig/mconf.c ---- linux-2.6.31.1/scripts/kconfig/mconf.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/kconfig/mconf.c 2009-10-01 20:12:45.000000000 -0400 -@@ -361,7 +361,7 @@ static char filename[PATH_MAX+1]; - static void set_config_filename(const char *config_filename) - { - static char menu_backtitle[PATH_MAX+128]; -- int size; -+ unsigned int size; - struct symbol *sym; - - sym = sym_lookup("KERNELVERSION", 0); -diff -urNp linux-2.6.31.1/scripts/kconfig/menu.c linux-2.6.31.1/scripts/kconfig/menu.c ---- linux-2.6.31.1/scripts/kconfig/menu.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/kconfig/menu.c 2009-10-01 20:12:45.000000000 -0400 -@@ -104,7 +104,7 @@ void menu_add_dep(struct expr *dep) - current_entry->dep = expr_alloc_and(current_entry->dep, menu_check_dep(dep)); - } - --void menu_set_type(int type) -+void menu_set_type(unsigned int type) - { - struct symbol *sym = current_entry->sym; - -diff -urNp linux-2.6.31.1/scripts/mod/file2alias.c linux-2.6.31.1/scripts/mod/file2alias.c ---- linux-2.6.31.1/scripts/mod/file2alias.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/mod/file2alias.c 2009-10-01 20:12:45.000000000 -0400 -@@ -72,7 +72,7 @@ static void device_id_check(const char * - unsigned long size, unsigned long id_size, - void *symval) - { -- int i; -+ unsigned int i; - - if (size % id_size || size < id_size) { - if (cross_build != 0) -@@ -102,7 +102,7 @@ static void device_id_check(const char * - /* USB is special because the bcdDevice can be matched against a numeric range */ - /* Looks like "usb:vNpNdNdcNdscNdpNicNiscNipN" */ - static void do_usb_entry(struct usb_device_id *id, -- unsigned int bcdDevice_initial, int bcdDevice_initial_digits, -+ unsigned int bcdDevice_initial, unsigned int bcdDevice_initial_digits, - unsigned char range_lo, unsigned char range_hi, - struct module *mod) - { -@@ -368,7 +368,7 @@ static void do_pnp_device_entry(void *sy - for (i = 0; i < count; i++) { - const char 
*id = (char *)devs[i].id; - char acpi_id[sizeof(devs[0].id)]; -- int j; -+ unsigned int j; - - buf_printf(&mod->dev_table_buf, - "MODULE_ALIAS("pnp:d%s*");\n", id); -@@ -398,7 +398,7 @@ static void do_pnp_card_entries(void *sy - - for (j = 0; j < PNP_MAX_DEVICES; j++) { - const char *id = (char *)card->devs[j].id; -- int i2, j2; -+ unsigned int i2, j2; - int dup = 0; - - if (!id[0]) -@@ -424,7 +424,7 @@ static void do_pnp_card_entries(void *sy - /* add an individual alias for every device entry */ - if (!dup) { - char acpi_id[sizeof(card->devs[0].id)]; -- int k; -+ unsigned int k; - - buf_printf(&mod->dev_table_buf, - "MODULE_ALIAS("pnp:d%s*");\n", id); -@@ -690,7 +690,7 @@ static void dmi_ascii_filter(char *d, co - static int do_dmi_entry(const char *filename, struct dmi_system_id *id, - char *alias) - { -- int i, j; -+ unsigned int i, j; - - sprintf(alias, "dmi*"); - -diff -urNp linux-2.6.31.1/scripts/mod/modpost.c linux-2.6.31.1/scripts/mod/modpost.c ---- linux-2.6.31.1/scripts/mod/modpost.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/mod/modpost.c 2009-10-01 20:12:45.000000000 -0400 -@@ -835,6 +835,7 @@ enum mismatch { - INIT_TO_EXIT, - EXIT_TO_INIT, - EXPORT_TO_INIT_EXIT, -+ DATA_TO_TEXT - }; - - struct sectioncheck { -@@ -920,6 +921,12 @@ const struct sectioncheck sectioncheck[] - .fromsec = { "__ksymtab*", NULL }, - .tosec = { INIT_SECTIONS, EXIT_SECTIONS, NULL }, - .mismatch = EXPORT_TO_INIT_EXIT -+}, -+/* Do not reference code from writable data */ -+{ -+ .fromsec = { DATA_SECTIONS, NULL }, -+ .tosec = { TEXT_SECTIONS, NULL }, -+ .mismatch = DATA_TO_TEXT - } - }; - -@@ -1024,10 +1031,10 @@ static Elf_Sym *find_elf_symbol(struct e - continue; - if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) - continue; -- if (sym->st_value == addr) -- return sym; - /* Find a symbol nearby - addr are maybe negative */ - d = sym->st_value - addr; -+ if (d == 0) -+ return sym; - if (d < 0) - d = addr - sym->st_value; - if (d < distance) { -@@ -1268,6 +1275,14 @@ static void report_sec_mismatch(const ch - "Fix this by removing the %sannotation of %s " - "or drop the export.\n", - tosym, sec2annotation(tosec), sec2annotation(tosec), tosym); -+ case DATA_TO_TEXT: -+/* -+ fprintf(stderr, -+ "The variable %s references\n" -+ "the %s %s%s%s\n", -+ fromsym, to, sec2annotation(tosec), tosym, to_p); -+*/ -+ break; - case NO_MISMATCH: - /* To get warnings on missing members */ - break; -@@ -1651,7 +1666,7 @@ void __attribute__((format(printf, 2, 3) - va_end(ap); - } - --void buf_write(struct buffer *buf, const char *s, int len) -+void buf_write(struct buffer *buf, const char *s, unsigned int len) - { - if (buf->size - buf->pos < len) { - buf->size += len + SZ; -@@ -1863,7 +1878,7 @@ static void write_if_changed(struct buff - if (fstat(fileno(file), &st) < 0) - goto close_write; - -- if (st.st_size != b->pos) -+ if (st.st_size != (off_t)b->pos) - goto close_write; - - tmp = NOFAIL(malloc(b->pos)); -diff -urNp linux-2.6.31.1/scripts/mod/modpost.h linux-2.6.31.1/scripts/mod/modpost.h ---- linux-2.6.31.1/scripts/mod/modpost.h 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/mod/modpost.h 2009-10-01 20:12:45.000000000 -0400 -@@ -92,15 +92,15 @@ void *do_nofail(void *ptr, const char *e - - struct buffer { - char *p; -- int pos; -- int size; -+ unsigned int pos; -+ unsigned int size; - }; - - void __attribute__((format(printf, 2, 3))) - buf_printf(struct buffer *buf, const char *fmt, ...); - - void --buf_write(struct buffer *buf, const char *s, int len); -+buf_write(struct buffer 
*buf, const char *s, unsigned int len); - - struct module { - struct module *next; -diff -urNp linux-2.6.31.1/scripts/mod/sumversion.c linux-2.6.31.1/scripts/mod/sumversion.c ---- linux-2.6.31.1/scripts/mod/sumversion.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/mod/sumversion.c 2009-10-01 20:12:45.000000000 -0400 -@@ -457,7 +457,7 @@ static void write_version(const char *fi - goto out; - } - -- if (write(fd, sum, strlen(sum)+1) != strlen(sum)+1) { -+ if (write(fd, sum, strlen(sum)+1) != (ssize_t)strlen(sum)+1) { - warn("writing sum in %s failed: %s\n", - filename, strerror(errno)); - goto out; -diff -urNp linux-2.6.31.1/scripts/pnmtologo.c linux-2.6.31.1/scripts/pnmtologo.c ---- linux-2.6.31.1/scripts/pnmtologo.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/scripts/pnmtologo.c 2009-10-01 20:12:45.000000000 -0400 -@@ -237,14 +237,14 @@ static void write_header(void) - fprintf(out, " * Linux logo %s\n", logoname); - fputs(" */\n\n", out); - fputs("#include <linux/linux_logo.h>\n\n", out); -- fprintf(out, "static unsigned char %s_data[] __initdata = {\n", -+ fprintf(out, "static unsigned char %s_data[] = {\n", - logoname); - } - - static void write_footer(void) - { - fputs("\n};\n\n", out); -- fprintf(out, "const struct linux_logo %s __initconst = {\n", logoname); -+ fprintf(out, "const struct linux_logo %s = {\n", logoname); - fprintf(out, "\t.type\t\t= %s,\n", logo_types[logo_type]); - fprintf(out, "\t.width\t\t= %d,\n", logo_width); - fprintf(out, "\t.height\t\t= %d,\n", logo_height); -@@ -374,7 +374,7 @@ static void write_logo_clut224(void) - fputs("\n};\n\n", out); - - /* write logo clut */ -- fprintf(out, "static unsigned char %s_clut[] __initdata = {\n", -+ fprintf(out, "static unsigned char %s_clut[] = {\n", - logoname); - write_hex_cnt = 0; - for (i = 0; i < logo_clutsize; i++) { -diff -urNp linux-2.6.31.1/security/commoncap.c linux-2.6.31.1/security/commoncap.c ---- linux-2.6.31.1/security/commoncap.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/security/commoncap.c 2009-10-01 20:12:45.000000000 -0400 -@@ -27,7 +27,7 @@ - #include <linux/sched.h> - #include <linux/prctl.h> - #include <linux/securebits.h> -- -+#include <net/sock.h> - /* - * If a non-root user executes a setuid-root binary in - * !secure(SECURE_NOROOT) mode, then we raise capabilities. 
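Several of the build-tool hunks above (fixdep, file2alias, modpost, sumversion) are plain signedness fixes: loop indices and length fields move to unsigned types, and mixed comparisons gain explicit casts such as (ssize_t)strlen(sum) + 1. The pitfall they close is ordinary C integer promotion; a small standalone illustration:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *sum = "abc123";
        ssize_t ret = write(-1, sum, strlen(sum) + 1); /* bad fd: returns -1 */

        /* In a mixed signed/unsigned comparison the signed operand is
         * converted to unsigned, so ret == -1 becomes SIZE_MAX and an
         * uncast "ret < strlen(sum) + 1" is silently false even though
         * the write failed. Casting the unsigned side, as the patch
         * does in sumversion.c, keeps the comparison signed. */
        if (ret < (ssize_t)(strlen(sum) + 1))
            fprintf(stderr, "short or failed write (ret=%zd)\n", ret);
        return 0;
    }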
-@@ -50,9 +50,11 @@ static void warn_setuid_and_fcaps_mixed( - } - } - -+extern kernel_cap_t gr_cap_rtnetlink(struct sock *sk); -+ - int cap_netlink_send(struct sock *sk, struct sk_buff *skb) - { -- NETLINK_CB(skb).eff_cap = current_cap(); -+ NETLINK_CB(skb).eff_cap = gr_cap_rtnetlink(sk); - return 0; - } - -diff -urNp linux-2.6.31.1/security/integrity/ima/ima_fs.c linux-2.6.31.1/security/integrity/ima/ima_fs.c ---- linux-2.6.31.1/security/integrity/ima/ima_fs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/security/integrity/ima/ima_fs.c 2009-10-01 20:12:45.000000000 -0400 -@@ -43,7 +43,7 @@ static ssize_t ima_show_htable_violation - return ima_show_htable_value(buf, count, ppos, &ima_htable.violations); - } - --static struct file_operations ima_htable_violations_ops = { -+static const struct file_operations ima_htable_violations_ops = { - .read = ima_show_htable_violations - }; - -@@ -55,7 +55,7 @@ static ssize_t ima_show_measurements_cou - - } - --static struct file_operations ima_measurements_count_ops = { -+static const struct file_operations ima_measurements_count_ops = { - .read = ima_show_measurements_count - }; - -@@ -146,7 +146,7 @@ static int ima_measurements_show(struct - return 0; - } - --static struct seq_operations ima_measurments_seqops = { -+static const struct seq_operations ima_measurments_seqops = { - .start = ima_measurements_start, - .next = ima_measurements_next, - .stop = ima_measurements_stop, -@@ -158,7 +158,7 @@ static int ima_measurements_open(struct - return seq_open(file, &ima_measurments_seqops); - } - --static struct file_operations ima_measurements_ops = { -+static const struct file_operations ima_measurements_ops = { - .open = ima_measurements_open, - .read = seq_read, - .llseek = seq_lseek, -@@ -221,7 +221,7 @@ static int ima_ascii_measurements_show(s - return 0; - } - --static struct seq_operations ima_ascii_measurements_seqops = { -+static const struct seq_operations ima_ascii_measurements_seqops = { - .start = ima_measurements_start, - .next = ima_measurements_next, - .stop = ima_measurements_stop, -@@ -233,7 +233,7 @@ static int ima_ascii_measurements_open(s - return seq_open(file, &ima_ascii_measurements_seqops); - } - --static struct file_operations ima_ascii_measurements_ops = { -+static const struct file_operations ima_ascii_measurements_ops = { - .open = ima_ascii_measurements_open, - .read = seq_read, - .llseek = seq_lseek, -@@ -313,7 +313,7 @@ static int ima_release_policy(struct ino - return 0; - } - --static struct file_operations ima_measure_policy_ops = { -+static const struct file_operations ima_measure_policy_ops = { - .open = ima_open_policy, - .write = ima_write_policy, - .release = ima_release_policy -diff -urNp linux-2.6.31.1/security/Kconfig linux-2.6.31.1/security/Kconfig ---- linux-2.6.31.1/security/Kconfig 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/security/Kconfig 2009-10-01 20:12:45.000000000 -0400 -@@ -4,6 +4,465 @@ - - menu "Security options" - -+source grsecurity/Kconfig -+ -+menu "PaX" -+ -+config PAX -+ bool "Enable various PaX features" -+ depends on GRKERNSEC && (ALPHA || ARM || AVR32 || IA64 || MIPS32 || MIPS64 || PARISC || PPC32 || PPC64 || SPARC32 || SPARC64 || X86) -+ help -+ This allows you to enable various PaX features. PaX adds -+ intrusion prevention mechanisms to the kernel that reduce -+ the risks posed by exploitable memory corruption bugs. 
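Before the individual options below, a concrete taste of the headline mechanism (enforced by the PAX_NOEXEC family further down) as seen from userland. This sketch is x86-64-only and assumes a POSIX system; the single 0xc3 byte is a bare "ret" instruction, and the mapping deliberately lacks PROT_EXEC:

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static const unsigned char code[] = { 0xc3 };   /* x86-64 "ret" */

    static void on_segv(int sig)
    {
        (void)sig;
        write(1, "faulted: page was not executable\n", 33);
        _exit(0);
    }

    int main(void)
    {
        signal(SIGSEGV, on_segv);

        /* Readable and writable, but not executable. */
        void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED) { perror("mmap"); return 1; }

        memcpy(page, code, sizeof(code));

        /* With non-executable pages enforced this call faults; on a
         * legacy setup without NX it quietly returns. */
        ((void (*)(void))page)();
        puts("code on a non-executable page ran to completion");
        return 0;
    }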
-+ -+menu "PaX Control" -+ depends on PAX -+ -+config PAX_SOFTMODE -+ bool 'Support soft mode' -+ help -+ Enabling this option will allow you to run PaX in soft mode, that -+ is, PaX features will not be enforced by default, only on executables -+ marked explicitly. You must also enable PT_PAX_FLAGS support as it -+ is the only way to mark executables for soft mode use. -+ -+ Soft mode can be activated by using the "pax_softmode=1" kernel command -+ line option on boot. Furthermore you can control various PaX features -+ at runtime via the entries in /proc/sys/kernel/pax. -+ -+config PAX_EI_PAX -+ bool 'Use legacy ELF header marking' -+ help -+ Enabling this option will allow you to control PaX features on -+ a per executable basis via the 'chpax' utility available at -+ http://pax.grsecurity.net/. The control flags will be read from -+ an otherwise reserved part of the ELF header. This marking has -+ numerous drawbacks (no support for soft-mode, toolchain does not -+ know about the non-standard use of the ELF header) therefore it -+ has been deprecated in favour of PT_PAX_FLAGS support. -+ -+ If you have applications not marked by the PT_PAX_FLAGS ELF -+ program header then you MUST enable this option otherwise they -+ will not get any protection. -+ -+ Note that if you enable PT_PAX_FLAGS marking support as well, -+ the PT_PAX_FLAG marks will override the legacy EI_PAX marks. -+ -+config PAX_PT_PAX_FLAGS -+ bool 'Use ELF program header marking' -+ help -+ Enabling this option will allow you to control PaX features on -+ a per executable basis via the 'paxctl' utility available at -+ http://pax.grsecurity.net/. The control flags will be read from -+ a PaX specific ELF program header (PT_PAX_FLAGS). This marking -+ has the benefits of supporting both soft mode and being fully -+ integrated into the toolchain (the binutils patch is available -+ from http://pax.grsecurity.net). -+ -+ If you have applications not marked by the PT_PAX_FLAGS ELF -+ program header then you MUST enable the EI_PAX marking support -+ otherwise they will not get any protection. -+ -+ Note that if you enable the legacy EI_PAX marking support as well, -+ the EI_PAX marks will be overridden by the PT_PAX_FLAGS marks. -+ -+choice -+ prompt 'MAC system integration' -+ default PAX_HAVE_ACL_FLAGS -+ help -+ Mandatory Access Control systems have the option of controlling -+ PaX flags on a per executable basis, choose the method supported -+ by your particular system. -+ -+ - "none": if your MAC system does not interact with PaX, -+ - "direct": if your MAC system defines pax_set_initial_flags() itself, -+ - "hook": if your MAC system uses the pax_set_initial_flags_func callback. -+ -+ NOTE: this option is for developers/integrators only. -+ -+ config PAX_NO_ACL_FLAGS -+ bool 'none' -+ -+ config PAX_HAVE_ACL_FLAGS -+ bool 'direct' -+ -+ config PAX_HOOK_ACL_FLAGS -+ bool 'hook' -+endchoice -+ -+endmenu -+ -+menu "Non-executable pages" -+ depends on PAX -+ -+config PAX_NOEXEC -+ bool "Enforce non-executable pages" -+ depends on (PAX_EI_PAX || PAX_PT_PAX_FLAGS || PAX_HAVE_ACL_FLAGS || PAX_HOOK_ACL_FLAGS) && (ALPHA || IA64 || MIPS32 || MIPS64 || PARISC || PPC32 || PPC64 || SPARC32 || SPARC64 || X86) -+ help -+ By design some architectures do not allow for protecting memory -+ pages against execution or even if they do, Linux does not make -+ use of this feature. In practice this means that if a page is -+ readable (such as the stack or heap) it is also executable. 
-+ -+ There is a well known exploit technique that makes use of this -+ fact and a common programming mistake where an attacker can -+ introduce code of his choice somewhere in the attacked program's -+ memory (typically the stack or the heap) and then execute it. -+ -+ If the attacked program was running with different (typically -+ higher) privileges than that of the attacker, then he can elevate -+ his own privilege level (e.g. get a root shell, write to files for -+ which he does not have write access to, etc). -+ -+ Enabling this option will let you choose from various features -+ that prevent the injection and execution of 'foreign' code in -+ a program. -+ -+ This will also break programs that rely on the old behaviour and -+ expect that dynamically allocated memory via the malloc() family -+ of functions is executable (which it is not). Notable examples -+ are the XFree86 4.x server, the java runtime and wine. -+ -+config PAX_PAGEEXEC -+ bool "Paging based non-executable pages" -+ depends on PAX_NOEXEC && (!X86_32 || M586 || M586TSC || M586MMX || M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC || MK7 || MK8 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MVIAC3_2 || MVIAC7) -+ help -+ This implementation is based on the paging feature of the CPU. -+ On i386 without hardware non-executable bit support there is a -+ variable but usually low performance impact, however on Intel's -+ P4 core based CPUs it is very high so you should not enable this -+ for kernels meant to be used on such CPUs. -+ -+ On alpha, avr32, ia64, parisc, sparc, sparc64, x86_64 and i386 -+ with hardware non-executable bit support there is no performance -+ impact, on ppc the impact is negligible. -+ -+ Note that several architectures require various emulations due to -+ badly designed userland ABIs, this will cause a performance impact -+ but will disappear as soon as userland is fixed. For example, ppc -+ userland MUST have been built with secure-plt by a recent toolchain. -+ -+config PAX_SEGMEXEC -+ bool "Segmentation based non-executable pages" -+ depends on PAX_NOEXEC && X86_32 -+ help -+ This implementation is based on the segmentation feature of the -+ CPU and has a very small performance impact, however applications -+ will be limited to a 1.5 GB address space instead of the normal -+ 3 GB. -+ -+config PAX_EMUTRAMP -+ bool "Emulate trampolines" if (PAX_PAGEEXEC || PAX_SEGMEXEC) && (PARISC || X86) -+ default y if PARISC -+ help -+ There are some programs and libraries that for one reason or -+ another attempt to execute special small code snippets from -+ non-executable memory pages. Most notable examples are the -+ signal handler return code generated by the kernel itself and -+ the GCC trampolines. -+ -+ If you enabled CONFIG_PAX_PAGEEXEC or CONFIG_PAX_SEGMEXEC then -+ such programs will no longer work under your kernel. -+ -+ As a remedy you can say Y here and use the 'chpax' or 'paxctl' -+ utilities to enable trampoline emulation for the affected programs -+ yet still have the protection provided by the non-executable pages. -+ -+ On parisc you MUST enable this option and EMUSIGRT as well, otherwise -+ your system will not even boot. -+ -+ Alternatively you can say N here and use the 'chpax' or 'paxctl' -+ utilities to disable CONFIG_PAX_PAGEEXEC and CONFIG_PAX_SEGMEXEC -+ for the affected files. -+ -+ NOTE: enabling this feature *may* open up a loophole in the -+ protection provided by non-executable pages that an attacker -+ could abuse. 
Therefore the best solution is to not have any -+ files on your system that would require this option. This can -+ be achieved by not using libc5 (which relies on the kernel -+ signal handler return code) and not using or rewriting programs -+ that make use of the nested function implementation of GCC. -+ Skilled users can just fix GCC itself so that it implements -+ nested function calls in a way that does not interfere with PaX. -+ -+config PAX_EMUSIGRT -+ bool "Automatically emulate sigreturn trampolines" -+ depends on PAX_EMUTRAMP && PARISC -+ default y -+ help -+ Enabling this option will have the kernel automatically detect -+ and emulate signal return trampolines executing on the stack -+ that would otherwise lead to task termination. -+ -+ This solution is intended as a temporary one for users with -+ legacy versions of libc (libc5, glibc 2.0, uClibc before 0.9.17, -+ Modula-3 runtime, etc) or executables linked to such, basically -+ everything that does not specify its own SA_RESTORER function in -+ normal executable memory like glibc 2.1+ does. -+ -+ On parisc you MUST enable this option, otherwise your system will -+ not even boot. -+ -+ NOTE: this feature cannot be disabled on a per executable basis -+ and since it *does* open up a loophole in the protection provided -+ by non-executable pages, the best solution is to not have any -+ files on your system that would require this option. -+ -+config PAX_MPROTECT -+ bool "Restrict mprotect()" -+ depends on (PAX_PAGEEXEC || PAX_SEGMEXEC) -+ help -+ Enabling this option will prevent programs from -+ - changing the executable status of memory pages that were -+ not originally created as executable, -+ - making read-only executable pages writable again, -+ - creating executable pages from anonymous memory. -+ -+ You should say Y here to complete the protection provided by -+ the enforcement of non-executable pages. -+ -+ NOTE: you can use the 'chpax' or 'paxctl' utilities to control -+ this feature on a per file basis. -+ -+config PAX_NOELFRELOCS -+ bool "Disallow ELF text relocations" -+ depends on PAX_MPROTECT && !PAX_ETEXECRELOCS && (IA64 || PPC || X86) -+ help -+ Non-executable pages and mprotect() restrictions are effective -+ in preventing the introduction of new executable code into an -+ attacked task's address space. There remain only two venues -+ for this kind of attack: if the attacker can execute already -+ existing code in the attacked task then he can either have it -+ create and mmap() a file containing his code or have it mmap() -+ an already existing ELF library that does not have position -+ independent code in it and use mprotect() on it to make it -+ writable and copy his code there. While protecting against -+ the former approach is beyond PaX, the latter can be prevented -+ by having only PIC ELF libraries on one's system (which do not -+ need to relocate their code). If you are sure this is your case, -+ then enable this option otherwise be careful as you may not even -+ be able to boot or log on your system (for example, some PAM -+ modules are erroneously compiled as non-PIC by default). -+ -+ NOTE: if you are using dynamic ELF executables (as suggested -+ when using ASLR) then you must have made sure that you linked -+ your files using the PIC version of crt1 (the et_dyn.tar.gz package -+ referenced there has already been updated to support this). 
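Of the restrictions PAX_MPROTECT lists above, the easiest to probe is the last one, creating executable pages from anonymous memory. On a stock kernel the mprotect() below succeeds; under PAX_MPROTECT it is expected to fail (a minimal probe, assuming a POSIX system):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED) { perror("mmap"); return 1; }

        /* Flipping a writable anonymous mapping to executable is the
         * transition PAX_MPROTECT exists to deny. */
        if (mprotect(page, 4096, PROT_READ | PROT_EXEC) != 0)
            perror("mprotect denied (as PAX_MPROTECT intends)");
        else
            puts("mprotect to PROT_EXEC succeeded (no restriction)");
        return 0;
    }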
-+ -+config PAX_ETEXECRELOCS -+ bool "Allow ELF ET_EXEC text relocations" -+ depends on PAX_MPROTECT && (ALPHA || IA64 || PARISC) -+ default y -+ help -+ On some architectures there are incorrectly created applications -+ that require text relocations and would not work without enabling -+ this option. If you are an alpha, ia64 or parisc user, you should -+ enable this option and disable it once you have made sure that -+ none of your applications need it. -+ -+config PAX_EMUPLT -+ bool "Automatically emulate ELF PLT" -+ depends on PAX_MPROTECT && (ALPHA || PARISC || SPARC32 || SPARC64) -+ default y -+ help -+ Enabling this option will have the kernel automatically detect -+ and emulate the Procedure Linkage Table entries in ELF files. -+ On some architectures such entries are in writable memory, and -+ become non-executable leading to task termination. Therefore -+ it is mandatory that you enable this option on alpha, parisc, -+ sparc and sparc64, otherwise your system would not even boot. -+ -+ NOTE: this feature *does* open up a loophole in the protection -+ provided by the non-executable pages, therefore the proper -+ solution is to modify the toolchain to produce a PLT that does -+ not need to be writable. -+ -+config PAX_DLRESOLVE -+ bool 'Emulate old glibc resolver stub' -+ depends on PAX_EMUPLT && (SPARC32 || SPARC64) -+ default n -+ help -+ This option is needed if userland has an old glibc (before 2.4) -+ that puts a 'save' instruction into the runtime generated resolver -+ stub that needs special emulation. -+ -+config PAX_KERNEXEC -+ bool "Enforce non-executable kernel pages" -+ depends on PAX_NOEXEC && X86 && (!X86_32 || X86_WP_WORKS_OK) -+ help -+ This is the kernel land equivalent of PAGEEXEC and MPROTECT, -+ that is, enabling this option will make it harder to inject -+ and execute 'foreign' code in kernel memory itself. -+ -+endmenu -+ -+menu "Address Space Layout Randomization" -+ depends on PAX -+ -+config PAX_ASLR -+ bool "Address Space Layout Randomization" -+ depends on PAX_EI_PAX || PAX_PT_PAX_FLAGS || PAX_HAVE_ACL_FLAGS || PAX_HOOK_ACL_FLAGS -+ help -+ Many if not most exploit techniques rely on the knowledge of -+ certain addresses in the attacked program. The following options -+ will allow the kernel to apply a certain amount of randomization -+ to specific parts of the program thereby forcing an attacker to -+ guess them in most cases. Any failed guess will most likely crash -+ the attacked program which allows the kernel to detect such attempts -+ and react on them. PaX itself provides no reaction mechanisms, -+ instead it is strongly encouraged that you make use of Nergal's -+ segvguard (ftp://ftp.pl.openwall.com/misc/segvguard/) or grsecurity's -+ (http://www.grsecurity.net/) built-in crash detection features or -+ develop one yourself. -+ -+ By saying Y here you can choose to randomize the following areas: -+ - top of the task's kernel stack -+ - top of the task's userland stack -+ - base address for mmap() requests that do not specify one -+ (this includes all libraries) -+ - base address of the main executable -+ -+ It is strongly recommended to say Y here as address space layout -+ randomization has negligible impact on performance yet it provides -+ a very effective protection. -+ -+ NOTE: you can use the 'chpax' or 'paxctl' utilities to control -+ this feature on a per file basis. 
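The randomization PAX_ASLR describes is easy to observe from userland: run the program below a few times and compare the addresses. With randomization active the stack and mmap() addresses move between runs; this is only a quick probe, not a measure of entropy quality:

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        int stack_var;
        void *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map == MAP_FAILED) { perror("mmap"); return 1; }

        /* Under ASLR both of these change from run to run, which is
         * what forces an attacker to guess and most likely crash the
         * target instead. */
        printf("stack object: %p\n", (void *)&stack_var);
        printf("mmap region : %p\n", map);
        return 0;
    }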
-+ -+config PAX_RANDKSTACK -+ bool "Randomize kernel stack base" -+ depends on PAX_ASLR && X86_TSC && X86_32 -+ help -+ By saying Y here the kernel will randomize every task's kernel -+ stack on every system call. This will not only force an attacker -+ to guess it but also prevent him from making use of possible -+ leaked information about it. -+ -+ Since the kernel stack is a rather scarce resource, randomization -+ may cause unexpected stack overflows, therefore you should very -+ carefully test your system. Note that once enabled in the kernel -+ configuration, this feature cannot be disabled on a per file basis. -+ -+config PAX_RANDUSTACK -+ bool "Randomize user stack base" -+ depends on PAX_ASLR -+ help -+ By saying Y here the kernel will randomize every task's userland -+ stack. The randomization is done in two steps where the second -+ one may apply a big amount of shift to the top of the stack and -+ cause problems for programs that want to use lots of memory (more -+ than 2.5 GB if SEGMEXEC is not active, or 1.25 GB when it is). -+ For this reason the second step can be controlled by 'chpax' or -+ 'paxctl' on a per file basis. -+ -+config PAX_RANDMMAP -+ bool "Randomize mmap() base" -+ depends on PAX_ASLR -+ help -+ By saying Y here the kernel will use a randomized base address for -+ mmap() requests that do not specify one themselves. As a result -+ all dynamically loaded libraries will appear at random addresses -+ and therefore be harder to exploit by a technique where an attacker -+ attempts to execute library code for his purposes (e.g. spawn a -+ shell from an exploited program that is running at an elevated -+ privilege level). -+ -+ Furthermore, if a program is relinked as a dynamic ELF file, its -+ base address will be randomized as well, completing the full -+ randomization of the address space layout. Attacking such programs -+ becomes a guess game. You can find an example of doing this at -+ http://pax.grsecurity.net/et_dyn.tar.gz and practical samples at -+ http://www.grsecurity.net/grsec-gcc-specs.tar.gz . -+ -+ NOTE: you can use the 'chpax' or 'paxctl' utilities to control this -+ feature on a per file basis. -+ -+endmenu -+ -+menu "Miscellaneous hardening features" -+ -+config PAX_MEMORY_SANITIZE -+ bool "Sanitize all freed memory" -+ help -+ By saying Y here the kernel will erase memory pages as soon as they -+ are freed. This in turn reduces the lifetime of data stored in the -+ pages, making it less likely that sensitive information such as -+ passwords, cryptographic secrets, etc stay in memory for too long. -+ -+ This is especially useful for programs whose runtime is short, long -+ lived processes and the kernel itself benefit from this as long as -+ they operate on whole memory pages and ensure timely freeing of pages -+ that may hold sensitive information. -+ -+ The tradeoff is performance impact, on a single CPU system kernel -+ compilation sees a 3% slowdown, other systems and workloads may vary -+ and you are advised to test this feature on your expected workload -+ before deploying it. -+ -+ Note that this feature does not protect data stored in live pages, -+ e.g., process memory swapped to disk may stay there for a long time. -+ -+config PAX_MEMORY_UDEREF -+ bool "Prevent invalid userland pointer dereference" -+ depends on X86_32 && !UML_X86 -+ help -+ By saying Y here the kernel will be prevented from dereferencing -+ userland pointers in contexts where the kernel expects only kernel -+ pointers. 
This is both a useful runtime debugging feature and a -+ security measure that prevents exploiting a class of kernel bugs. -+ -+ The tradeoff is that some virtualization solutions may experience -+ a huge slowdown and therefore you should not enable this feature -+ for kernels meant to run in such environments. Whether a given VM -+ solution is affected or not is best determined by simply trying it -+ out, the performance impact will be obvious right on boot as this -+ mechanism engages from very early on. A good rule of thumb is that -+ VMs running on CPUs without hardware virtualization support (i.e., -+ the majority of IA-32 CPUs) will likely experience the slowdown. -+ -+config PAX_REFCOUNT -+ bool "Prevent various kernel object reference counter overflows" -+ depends on GRKERNSEC && (X86 || SPARC64) -+ help -+ By saying Y here the kernel will detect and prevent overflowing -+ various (but not all) kinds of object reference counters. Such -+ overflows can normally occur due to bugs only and are often, if -+ not always, exploitable. -+ -+ The tradeoff is that data structures protected by an overflowed -+ refcount will never be freed and therefore will leak memory. Note -+ that this leak also happens even without this protection but in -+ that case the overflow can eventually trigger the freeing of the -+ data structure while it is still being used elsewhere, resulting -+ in the exploitable situation that this feature prevents. -+ -+ Since this has a negligible performance impact, you should enable -+ this feature. -+ -+config PAX_USERCOPY -+ bool "Bounds check heap object copies between kernel and userland" -+ depends on X86 || PPC32 || PPC64 || SPARC32 || SPARC64 -+ depends on GRKERNSEC && (SLAB || SLUB || SLOB) -+ help -+ By saying Y here the kernel will enforce the size of heap objects -+ when they are copied in either direction between the kernel and -+ userland, even if only a part of the heap object is copied. -+ -+ Specifically, this checking prevents information leaking from the -+ kernel heap during kernel to userland copies (if the kernel heap -+ object is otherwise fully initialized) and prevents kernel heap -+ overflows during userland to kernel copies. -+ -+ Note that the current implementation provides the strictest checks -+ for the SLUB allocator. -+ -+ Since this has a negligible performance impact, you should enable -+ this feature. 
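PAX_USERCOPY's help text above boils down to: know the true size of the heap object and refuse any copy that would pass its end. A toy model of that check in plain C (tracked_buf and checked_copy_to_user() are inventions for illustration, not the patch's API; the real feature derives object sizes from the slab allocator):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Toy object whose true size travels with it. */
    struct tracked_buf {
        size_t size;
        char   data[64];
    };

    static int checked_copy_to_user(char *dst,
                                    const struct tracked_buf *obj,
                                    size_t len)
    {
        if (len > obj->size)    /* would read past the object: refuse */
            return -EFAULT;
        memcpy(dst, obj->data, len);
        return 0;
    }

    int main(void)
    {
        struct tracked_buf obj = { .size = sizeof(obj.data), .data = "hi" };
        char user[128];

        printf("copy 64 bytes  -> %d\n", checked_copy_to_user(user, &obj, 64));
        printf("copy 100 bytes -> %d\n", checked_copy_to_user(user, &obj, 100));
        return 0;
    }

The same check is why the help text can promise both directions: on kernel-to-user copies it stops leaks past the object, and on user-to-kernel copies it stops heap overflows.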
-+endmenu -+ -+endmenu -+ - config KEYS - bool "Enable access key retention support" - help -diff -urNp linux-2.6.31.1/security/min_addr.c linux-2.6.31.1/security/min_addr.c ---- linux-2.6.31.1/security/min_addr.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/security/min_addr.c 2009-10-01 20:12:45.000000000 -0400 -@@ -14,6 +14,7 @@ unsigned long dac_mmap_min_addr = CONFIG - */ - static void update_mmap_min_addr(void) - { -+#ifndef SPARC - #ifdef CONFIG_LSM_MMAP_MIN_ADDR - if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) - mmap_min_addr = dac_mmap_min_addr; -@@ -22,6 +23,7 @@ static void update_mmap_min_addr(void) - #else - mmap_min_addr = dac_mmap_min_addr; - #endif -+#endif - } - - /* -diff -urNp linux-2.6.31.1/security/smack/smackfs.c linux-2.6.31.1/security/smack/smackfs.c ---- linux-2.6.31.1/security/smack/smackfs.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/security/smack/smackfs.c 2009-10-01 20:12:45.000000000 -0400 -@@ -187,7 +187,7 @@ static void load_seq_stop(struct seq_fil - /* No-op */ - } - --static struct seq_operations load_seq_ops = { -+static const struct seq_operations load_seq_ops = { - .start = load_seq_start, - .next = load_seq_next, - .show = load_seq_show, -@@ -503,7 +503,7 @@ static void cipso_seq_stop(struct seq_fi - /* No-op */ - } - --static struct seq_operations cipso_seq_ops = { -+static const struct seq_operations cipso_seq_ops = { - .start = cipso_seq_start, - .stop = cipso_seq_stop, - .next = cipso_seq_next, -@@ -697,7 +697,7 @@ static void netlbladdr_seq_stop(struct s - /* No-op */ - } - --static struct seq_operations netlbladdr_seq_ops = { -+static const struct seq_operations netlbladdr_seq_ops = { - .start = netlbladdr_seq_start, - .stop = netlbladdr_seq_stop, - .next = netlbladdr_seq_next, -diff -urNp linux-2.6.31.1/sound/aoa/codecs/onyx.c linux-2.6.31.1/sound/aoa/codecs/onyx.c ---- linux-2.6.31.1/sound/aoa/codecs/onyx.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/aoa/codecs/onyx.c 2009-10-01 20:12:45.000000000 -0400 -@@ -53,7 +53,7 @@ struct onyx { - spdif_locked:1, - analog_locked:1, - original_mute:2; -- int open_count; -+ atomic_t open_count; - struct codec_info *codec_info; - - /* mutex serializes concurrent access to the device -@@ -752,7 +752,7 @@ static int onyx_open(struct codec_info_i - struct onyx *onyx = cii->codec_data; - - mutex_lock(&onyx->mutex); -- onyx->open_count++; -+ atomic_inc(&onyx->open_count); - mutex_unlock(&onyx->mutex); - - return 0; -@@ -764,8 +764,7 @@ static int onyx_close(struct codec_info_ - struct onyx *onyx = cii->codec_data; - - mutex_lock(&onyx->mutex); -- onyx->open_count--; -- if (!onyx->open_count) -+ if (atomic_dec_and_test(&onyx->open_count)) - onyx->spdif_locked = onyx->analog_locked = 0; - mutex_unlock(&onyx->mutex); - -diff -urNp linux-2.6.31.1/sound/core/oss/pcm_oss.c linux-2.6.31.1/sound/core/oss/pcm_oss.c ---- linux-2.6.31.1/sound/core/oss/pcm_oss.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/core/oss/pcm_oss.c 2009-10-01 20:12:45.000000000 -0400 -@@ -2943,8 +2943,8 @@ static void snd_pcm_oss_proc_done(struct - } - } - #else /* !CONFIG_SND_VERBOSE_PROCFS */ --#define snd_pcm_oss_proc_init(pcm) --#define snd_pcm_oss_proc_done(pcm) -+#define snd_pcm_oss_proc_init(pcm) do {} while (0) -+#define snd_pcm_oss_proc_done(pcm) do {} while (0) - #endif /* CONFIG_SND_VERBOSE_PROCFS */ - - /* -diff -urNp linux-2.6.31.1/sound/core/seq/seq_lock.h linux-2.6.31.1/sound/core/seq/seq_lock.h ---- linux-2.6.31.1/sound/core/seq/seq_lock.h 2009-09-24 11:45:25.000000000 
-0400 -+++ linux-2.6.31.1/sound/core/seq/seq_lock.h 2009-10-01 20:12:45.000000000 -0400 -@@ -23,10 +23,10 @@ void snd_use_lock_sync_helper(snd_use_lo - #else /* SMP || CONFIG_SND_DEBUG */ - - typedef spinlock_t snd_use_lock_t; /* dummy */ --#define snd_use_lock_init(lockp) /**/ --#define snd_use_lock_use(lockp) /**/ --#define snd_use_lock_free(lockp) /**/ --#define snd_use_lock_sync(lockp) /**/ -+#define snd_use_lock_init(lockp) do {} while (0) -+#define snd_use_lock_use(lockp) do {} while (0) -+#define snd_use_lock_free(lockp) do {} while (0) -+#define snd_use_lock_sync(lockp) do {} while (0) - - #endif /* SMP || CONFIG_SND_DEBUG */ - -diff -urNp linux-2.6.31.1/sound/drivers/mts64.c linux-2.6.31.1/sound/drivers/mts64.c ---- linux-2.6.31.1/sound/drivers/mts64.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/drivers/mts64.c 2009-10-01 20:12:45.000000000 -0400 -@@ -65,7 +65,7 @@ struct mts64 { - struct pardevice *pardev; - int pardev_claimed; - -- int open_count; -+ atomic_t open_count; - int current_midi_output_port; - int current_midi_input_port; - u8 mode[MTS64_NUM_INPUT_PORTS]; -@@ -695,7 +695,7 @@ static int snd_mts64_rawmidi_open(struct - { - struct mts64 *mts = substream->rmidi->private_data; - -- if (mts->open_count == 0) { -+ if (atomic_read(&mts->open_count) == 0) { - /* We don't need a spinlock here, because this is just called - if the device has not been opened before. - So there aren't any IRQs from the device */ -@@ -703,7 +703,7 @@ static int snd_mts64_rawmidi_open(struct - - msleep(50); - } -- ++(mts->open_count); -+ atomic_inc(&mts->open_count); - - return 0; - } -@@ -713,8 +713,7 @@ static int snd_mts64_rawmidi_close(struc - struct mts64 *mts = substream->rmidi->private_data; - unsigned long flags; - -- --(mts->open_count); -- if (mts->open_count == 0) { -+ if (atomic_dec_return(&mts->open_count) == 0) { - /* We need the spinlock_irqsave here because we can still - have IRQs at this point */ - spin_lock_irqsave(&mts->lock, flags); -@@ -723,8 +722,8 @@ static int snd_mts64_rawmidi_close(struc - - msleep(500); - -- } else if (mts->open_count < 0) -- mts->open_count = 0; -+ } else if (atomic_read(&mts->open_count) < 0) -+ atomic_set(&mts->open_count, 0); - - return 0; - } -diff -urNp linux-2.6.31.1/sound/drivers/portman2x4.c linux-2.6.31.1/sound/drivers/portman2x4.c ---- linux-2.6.31.1/sound/drivers/portman2x4.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/drivers/portman2x4.c 2009-10-01 20:12:45.000000000 -0400 -@@ -83,7 +83,7 @@ struct portman { - struct pardevice *pardev; - int pardev_claimed; - -- int open_count; -+ atomic_t open_count; - int mode[PORTMAN_NUM_INPUT_PORTS]; - struct snd_rawmidi_substream *midi_input[PORTMAN_NUM_INPUT_PORTS]; - }; -diff -urNp linux-2.6.31.1/sound/pci/ac97/ac97_patch.c linux-2.6.31.1/sound/pci/ac97/ac97_patch.c ---- linux-2.6.31.1/sound/pci/ac97/ac97_patch.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/pci/ac97/ac97_patch.c 2009-10-01 20:12:45.000000000 -0400 -@@ -1501,7 +1501,7 @@ static const struct snd_ac97_res_table a - { AC97_VIDEO, 0x9f1f }, - { AC97_AUX, 0x9f1f }, - { AC97_PCM, 0x9f1f }, -- { } /* terminator */ -+ { 0, 0 } /* terminator */ - }; - - static int patch_ad1819(struct snd_ac97 * ac97) -@@ -3876,7 +3876,7 @@ static struct snd_ac97_res_table lm4550_ - { AC97_AUX, 0x1f1f }, - { AC97_PCM, 0x1f1f }, - { AC97_REC_GAIN, 0x0f0f }, -- { } /* terminator */ -+ { 0, 0 } /* terminator */ - }; - - static int patch_lm4550(struct snd_ac97 *ac97) -diff -urNp linux-2.6.31.1/sound/pci/ens1370.c 
linux-2.6.31.1/sound/pci/ens1370.c ---- linux-2.6.31.1/sound/pci/ens1370.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/pci/ens1370.c 2009-10-01 20:12:45.000000000 -0400 -@@ -452,7 +452,7 @@ static struct pci_device_id snd_audiopci - { PCI_VDEVICE(ENSONIQ, 0x5880), 0, }, /* ES1373 - CT5880 */ - { PCI_VDEVICE(ECTIVA, 0x8938), 0, }, /* Ectiva EV1938 */ - #endif -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, snd_audiopci_ids); -diff -urNp linux-2.6.31.1/sound/pci/intel8x0.c linux-2.6.31.1/sound/pci/intel8x0.c ---- linux-2.6.31.1/sound/pci/intel8x0.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/pci/intel8x0.c 2009-10-01 20:12:45.000000000 -0400 -@@ -444,7 +444,7 @@ static struct pci_device_id snd_intel8x0 - { PCI_VDEVICE(AMD, 0x746d), DEVICE_INTEL }, /* AMD8111 */ - { PCI_VDEVICE(AMD, 0x7445), DEVICE_INTEL }, /* AMD768 */ - { PCI_VDEVICE(AL, 0x5455), DEVICE_ALI }, /* Ali5455 */ -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, snd_intel8x0_ids); -@@ -2105,7 +2105,7 @@ static struct ac97_quirk ac97_quirks[] _ - .type = AC97_TUNE_HP_ONLY - }, - #endif -- { } /* terminator */ -+ { 0, 0, 0, 0, NULL, 0 } /* terminator */ - }; - - static int __devinit snd_intel8x0_mixer(struct intel8x0 *chip, int ac97_clock, -diff -urNp linux-2.6.31.1/sound/pci/intel8x0m.c linux-2.6.31.1/sound/pci/intel8x0m.c ---- linux-2.6.31.1/sound/pci/intel8x0m.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/pci/intel8x0m.c 2009-10-01 20:12:45.000000000 -0400 -@@ -239,7 +239,7 @@ static struct pci_device_id snd_intel8x0 - { PCI_VDEVICE(AMD, 0x746d), DEVICE_INTEL }, /* AMD8111 */ - { PCI_VDEVICE(AL, 0x5455), DEVICE_ALI }, /* Ali5455 */ - #endif -- { 0, } -+ { 0, 0, 0, 0, 0, 0, 0 } - }; - - MODULE_DEVICE_TABLE(pci, snd_intel8x0m_ids); -@@ -1264,7 +1264,7 @@ static struct shortname_table { - { 0x5455, "ALi M5455" }, - { 0x746d, "AMD AMD8111" }, - #endif -- { 0 }, -+ { 0, NULL }, - }; - - static int __devinit snd_intel8x0m_probe(struct pci_dev *pci, -diff -urNp linux-2.6.31.1/sound/usb/usx2y/us122l.c linux-2.6.31.1/sound/usb/usx2y/us122l.c ---- linux-2.6.31.1/sound/usb/usx2y/us122l.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/usb/usx2y/us122l.c 2009-10-01 20:12:45.000000000 -0400 -@@ -154,7 +154,7 @@ static void usb_stream_hwdep_vm_close(st - snd_printdd(KERN_DEBUG "%i\n", atomic_read(&us122l->mmap_count)); - } - --static struct vm_operations_struct usb_stream_hwdep_vm_ops = { -+static const struct vm_operations_struct usb_stream_hwdep_vm_ops = { - .open = usb_stream_hwdep_vm_open, - .fault = usb_stream_hwdep_vm_fault, - .close = usb_stream_hwdep_vm_close, -diff -urNp linux-2.6.31.1/sound/usb/usx2y/usX2Yhwdep.c linux-2.6.31.1/sound/usb/usx2y/usX2Yhwdep.c ---- linux-2.6.31.1/sound/usb/usx2y/usX2Yhwdep.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/usb/usx2y/usX2Yhwdep.c 2009-10-01 20:12:45.000000000 -0400 -@@ -53,7 +53,7 @@ static int snd_us428ctls_vm_fault(struct - return 0; - } - --static struct vm_operations_struct us428ctls_vm_ops = { -+static const struct vm_operations_struct us428ctls_vm_ops = { - .fault = snd_us428ctls_vm_fault, - }; - -diff -urNp linux-2.6.31.1/sound/usb/usx2y/usx2yhwdeppcm.c linux-2.6.31.1/sound/usb/usx2y/usx2yhwdeppcm.c ---- linux-2.6.31.1/sound/usb/usx2y/usx2yhwdeppcm.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/sound/usb/usx2y/usx2yhwdeppcm.c 2009-10-01 20:12:45.000000000 -0400 -@@ -697,7 +697,7 @@ static int snd_usX2Y_hwdep_pcm_vm_fault( - } - - --static struct 
vm_operations_struct snd_usX2Y_hwdep_pcm_vm_ops = { -+static const struct vm_operations_struct snd_usX2Y_hwdep_pcm_vm_ops = { - .open = snd_usX2Y_hwdep_pcm_vm_open, - .close = snd_usX2Y_hwdep_pcm_vm_close, - .fault = snd_usX2Y_hwdep_pcm_vm_fault, -diff -urNp linux-2.6.31.1/usr/gen_init_cpio.c linux-2.6.31.1/usr/gen_init_cpio.c ---- linux-2.6.31.1/usr/gen_init_cpio.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/usr/gen_init_cpio.c 2009-10-01 20:12:45.000000000 -0400 -@@ -299,7 +299,7 @@ static int cpio_mkfile(const char *name, - int retval; - int rc = -1; - int namesize; -- int i; -+ unsigned int i; - - mode |= S_IFREG; - -@@ -383,9 +383,10 @@ static char *cpio_replace_env(char *new_ - *env_var = *expanded = '\0'; - strncat(env_var, start + 2, end - start - 2); - strncat(expanded, new_location, start - new_location); -- strncat(expanded, getenv(env_var), PATH_MAX); -- strncat(expanded, end + 1, PATH_MAX); -+ strncat(expanded, getenv(env_var), PATH_MAX - strlen(expanded)); -+ strncat(expanded, end + 1, PATH_MAX - strlen(expanded)); - strncpy(new_location, expanded, PATH_MAX); -+ new_location[PATH_MAX] = 0; - } else - break; - } -diff -urNp linux-2.6.31.1/virt/kvm/kvm_main.c linux-2.6.31.1/virt/kvm/kvm_main.c ---- linux-2.6.31.1/virt/kvm/kvm_main.c 2009-09-24 11:45:25.000000000 -0400 -+++ linux-2.6.31.1/virt/kvm/kvm_main.c 2009-10-01 20:12:45.000000000 -0400 -@@ -2353,6 +2353,9 @@ static struct miscdevice kvm_dev = { - KVM_MINOR, - "kvm", - &kvm_chardev_ops, -+ {NULL, NULL}, -+ NULL, -+ NULL - }; - - static void hardware_enable(void *junk) -@@ -2512,7 +2515,7 @@ static int vcpu_stat_get(void *_offset, - - DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); - --static struct file_operations *stat_fops[] = { -+static const struct file_operations *stat_fops[] = { - [KVM_STAT_VCPU] = &vcpu_stat_fops, - [KVM_STAT_VM] = &vm_stat_fops, - }; -@@ -2584,7 +2587,7 @@ static void kvm_sched_out(struct preempt - kvm_arch_vcpu_put(vcpu); - } - --int kvm_init(void *opaque, unsigned int vcpu_size, -+int kvm_init(const void *opaque, unsigned int vcpu_size, - struct module *module) - { - int r; diff --git a/pkgs/core/kernel/patches/grsecurity-2.1.14-2.6.33.1-201003201735.patch b/pkgs/core/kernel/patches/grsecurity-2.1.14-2.6.33.1-201003201735.patch new file mode 100644 index 0000000..1f037ba --- /dev/null +++ b/pkgs/core/kernel/patches/grsecurity-2.1.14-2.6.33.1-201003201735.patch @@ -0,0 +1,53619 @@ +diff -urNp linux-2.6.33.1/arch/alpha/include/asm/elf.h linux-2.6.33.1/arch/alpha/include/asm/elf.h +--- linux-2.6.33.1/arch/alpha/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/alpha/include/asm/elf.h 2010-03-20 16:58:38.417757561 -0400 +@@ -90,6 +90,13 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_N + + #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (current->personality & ADDR_LIMIT_32BIT ? 0x10000 : 0x120000000UL) ++ ++#define PAX_DELTA_MMAP_LEN (current->personality & ADDR_LIMIT_32BIT ? 14 : 28) ++#define PAX_DELTA_STACK_LEN (current->personality & ADDR_LIMIT_32BIT ? 14 : 19) ++#endif ++ + /* $0 is set by ld.so to a pointer to a function which might be + registered using atexit. 
This provides a mean for the dynamic + linker to call DT_FINI functions for shared libraries that have +diff -urNp linux-2.6.33.1/arch/alpha/include/asm/pgtable.h linux-2.6.33.1/arch/alpha/include/asm/pgtable.h +--- linux-2.6.33.1/arch/alpha/include/asm/pgtable.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/alpha/include/asm/pgtable.h 2010-03-20 16:58:38.417757561 -0400 +@@ -101,6 +101,17 @@ struct vm_area_struct; + #define PAGE_SHARED __pgprot(_PAGE_VALID | __ACCESS_BITS) + #define PAGE_COPY __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW) + #define PAGE_READONLY __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW) ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++# define PAGE_SHARED_NOEXEC __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOE) ++# define PAGE_COPY_NOEXEC __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW | _PAGE_FOE) ++# define PAGE_READONLY_NOEXEC __pgprot(_PAGE_VALID | __ACCESS_BITS | _PAGE_FOW | _PAGE_FOE) ++#else ++# define PAGE_SHARED_NOEXEC PAGE_SHARED ++# define PAGE_COPY_NOEXEC PAGE_COPY ++# define PAGE_READONLY_NOEXEC PAGE_READONLY ++#endif ++ + #define PAGE_KERNEL __pgprot(_PAGE_VALID | _PAGE_ASM | _PAGE_KRE | _PAGE_KWE) + + #define _PAGE_NORMAL(x) __pgprot(_PAGE_VALID | __ACCESS_BITS | (x)) +diff -urNp linux-2.6.33.1/arch/alpha/kernel/module.c linux-2.6.33.1/arch/alpha/kernel/module.c +--- linux-2.6.33.1/arch/alpha/kernel/module.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/alpha/kernel/module.c 2010-03-20 16:58:38.417757561 -0400 +@@ -182,7 +182,7 @@ apply_relocate_add(Elf64_Shdr *sechdrs, + + /* The small sections were sorted to the end of the segment. + The following should definitely cover them. */ +- gp = (u64)me->module_core + me->core_size - 0x8000; ++ gp = (u64)me->module_core_rw + me->core_size_rw - 0x8000; + got = sechdrs[me->arch.gotsecindex].sh_addr; + + for (i = 0; i < n; i++) { +diff -urNp linux-2.6.33.1/arch/alpha/kernel/osf_sys.c linux-2.6.33.1/arch/alpha/kernel/osf_sys.c +--- linux-2.6.33.1/arch/alpha/kernel/osf_sys.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/alpha/kernel/osf_sys.c 2010-03-20 16:58:38.417757561 -0400 +@@ -1205,6 +1205,10 @@ arch_get_unmapped_area(struct file *filp + merely specific addresses, but regions of memory -- perhaps + this feature should be incorporated into all ports? */ + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(current->mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + if (addr) { + addr = arch_get_unmapped_area_1 (PAGE_ALIGN(addr), len, limit); + if (addr != (unsigned long) -ENOMEM) +@@ -1212,8 +1216,8 @@ arch_get_unmapped_area(struct file *filp + } + + /* Next, try allocating at TASK_UNMAPPED_BASE. 
*/ +- addr = arch_get_unmapped_area_1 (PAGE_ALIGN(TASK_UNMAPPED_BASE), +- len, limit); ++ addr = arch_get_unmapped_area_1 (PAGE_ALIGN(current->mm->mmap_base), len, limit); ++ + if (addr != (unsigned long) -ENOMEM) + return addr; + +diff -urNp linux-2.6.33.1/arch/alpha/mm/fault.c linux-2.6.33.1/arch/alpha/mm/fault.c +--- linux-2.6.33.1/arch/alpha/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/alpha/mm/fault.c 2010-03-20 16:58:38.420782159 -0400 +@@ -54,6 +54,124 @@ __load_new_mm_context(struct mm_struct * + __reload_thread(pcb); + } + ++#ifdef CONFIG_PAX_PAGEEXEC ++/* ++ * PaX: decide what to do with offenders (regs->pc = fault address) ++ * ++ * returns 1 when task should be killed ++ * 2 when patched PLT trampoline was detected ++ * 3 when unpatched PLT trampoline was detected ++ */ ++static int pax_handle_fetch_fault(struct pt_regs *regs) ++{ ++ ++#ifdef CONFIG_PAX_EMUPLT ++ int err; ++ ++ do { /* PaX: patched PLT emulation #1 */ ++ unsigned int ldah, ldq, jmp; ++ ++ err = get_user(ldah, (unsigned int *)regs->pc); ++ err |= get_user(ldq, (unsigned int *)(regs->pc+4)); ++ err |= get_user(jmp, (unsigned int *)(regs->pc+8)); ++ ++ if (err) ++ break; ++ ++ if ((ldah & 0xFFFF0000U) == 0x277B0000U && ++ (ldq & 0xFFFF0000U) == 0xA77B0000U && ++ jmp == 0x6BFB0000U) ++ { ++ unsigned long r27, addr; ++ unsigned long addrh = (ldah | 0xFFFFFFFFFFFF0000UL) << 16; ++ unsigned long addrl = ldq | 0xFFFFFFFFFFFF0000UL; ++ ++ addr = regs->r27 + ((addrh ^ 0x80000000UL) + 0x80000000UL) + ((addrl ^ 0x8000UL) + 0x8000UL); ++ err = get_user(r27, (unsigned long *)addr); ++ if (err) ++ break; ++ ++ regs->r27 = r27; ++ regs->pc = r27; ++ return 2; ++ } ++ } while (0); ++ ++ do { /* PaX: patched PLT emulation #2 */ ++ unsigned int ldah, lda, br; ++ ++ err = get_user(ldah, (unsigned int *)regs->pc); ++ err |= get_user(lda, (unsigned int *)(regs->pc+4)); ++ err |= get_user(br, (unsigned int *)(regs->pc+8)); ++ ++ if (err) ++ break; ++ ++ if ((ldah & 0xFFFF0000U) == 0x277B0000U && ++ (lda & 0xFFFF0000U) == 0xA77B0000U && ++ (br & 0xFFE00000U) == 0xC3E00000U) ++ { ++ unsigned long addr = br | 0xFFFFFFFFFFE00000UL; ++ unsigned long addrh = (ldah | 0xFFFFFFFFFFFF0000UL) << 16; ++ unsigned long addrl = lda | 0xFFFFFFFFFFFF0000UL; ++ ++ regs->r27 += ((addrh ^ 0x80000000UL) + 0x80000000UL) + ((addrl ^ 0x8000UL) + 0x8000UL); ++ regs->pc += 12 + (((addr ^ 0x00100000UL) + 0x00100000UL) << 2); ++ return 2; ++ } ++ } while (0); ++ ++ do { /* PaX: unpatched PLT emulation */ ++ unsigned int br; ++ ++ err = get_user(br, (unsigned int *)regs->pc); ++ ++ if (!err && (br & 0xFFE00000U) == 0xC3800000U) { ++ unsigned int br2, ldq, nop, jmp; ++ unsigned long addr = br | 0xFFFFFFFFFFE00000UL, resolver; ++ ++ addr = regs->pc + 4 + (((addr ^ 0x00100000UL) + 0x00100000UL) << 2); ++ err = get_user(br2, (unsigned int *)addr); ++ err |= get_user(ldq, (unsigned int *)(addr+4)); ++ err |= get_user(nop, (unsigned int *)(addr+8)); ++ err |= get_user(jmp, (unsigned int *)(addr+12)); ++ err |= get_user(resolver, (unsigned long *)(addr+16)); ++ ++ if (err) ++ break; ++ ++ if (br2 == 0xC3600000U && ++ ldq == 0xA77B000CU && ++ nop == 0x47FF041FU && ++ jmp == 0x6B7B0000U) ++ { ++ regs->r28 = regs->pc+4; ++ regs->r27 = addr+16; ++ regs->pc = resolver; ++ return 3; ++ } ++ } ++ } while (0); ++#endif ++ ++ return 1; ++} ++ ++void pax_report_insns(void *pc, void *sp) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 5; i++) { ++ unsigned int c; ++ if (get_user(c, (unsigned int *)pc+i)) ++ 
printk(KERN_CONT "???????? "); ++ else ++ printk(KERN_CONT "%08x ", c); ++ } ++ printk("\n"); ++} ++#endif + + /* + * This routine handles page faults. It determines the address, +@@ -131,8 +249,29 @@ do_page_fault(unsigned long address, uns + good_area: + si_code = SEGV_ACCERR; + if (cause < 0) { +- if (!(vma->vm_flags & VM_EXEC)) ++ if (!(vma->vm_flags & VM_EXEC)) { ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (!(mm->pax_flags & MF_PAX_PAGEEXEC) || address != regs->pc) ++ goto bad_area; ++ ++ up_read(&mm->mmap_sem); ++ switch (pax_handle_fetch_fault(regs)) { ++ ++#ifdef CONFIG_PAX_EMUPLT ++ case 2: ++ case 3: ++ return; ++#endif ++ ++ } ++ pax_report_fault(regs, (void *)regs->pc, (void *)rdusp()); ++ do_group_exit(SIGKILL); ++#else + goto bad_area; ++#endif ++ ++ } + } else if (!cause) { + /* Allow reads even for write-only mappings */ + if (!(vma->vm_flags & (VM_READ | VM_WRITE))) +diff -urNp linux-2.6.33.1/arch/arm/include/asm/elf.h linux-2.6.33.1/arch/arm/include/asm/elf.h +--- linux-2.6.33.1/arch/arm/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/include/asm/elf.h 2010-03-20 16:58:38.440745685 -0400 +@@ -108,7 +108,14 @@ int dump_task_regs(struct task_struct *t + the loader. We need to make sure that it is out of the way of the program + that it will "exec", and that there is sufficient room for the brk. */ + +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) ++#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) ++ ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE 0x00008000UL ++ ++#define PAX_DELTA_MMAP_LEN ((current->personality == PER_LINUX_32BIT) ? 16 : 10) ++#define PAX_DELTA_STACK_LEN ((current->personality == PER_LINUX_32BIT) ? 16 : 10) ++#endif + + /* When the program starts, a1 contains a pointer to a function to be + registered with atexit, as per the SVR4 ABI. A value of 0 means we +diff -urNp linux-2.6.33.1/arch/arm/include/asm/kmap_types.h linux-2.6.33.1/arch/arm/include/asm/kmap_types.h +--- linux-2.6.33.1/arch/arm/include/asm/kmap_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/include/asm/kmap_types.h 2010-03-20 16:58:38.444581130 -0400 +@@ -19,6 +19,7 @@ enum km_type { + KM_SOFTIRQ0, + KM_SOFTIRQ1, + KM_L2_CACHE, ++ KM_CLEARPAGE, + KM_TYPE_NR + }; + +diff -urNp linux-2.6.33.1/arch/arm/include/asm/uaccess.h linux-2.6.33.1/arch/arm/include/asm/uaccess.h +--- linux-2.6.33.1/arch/arm/include/asm/uaccess.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/include/asm/uaccess.h 2010-03-20 16:58:38.444581130 -0400 +@@ -403,6 +403,9 @@ extern unsigned long __must_check __strn + + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + if (access_ok(VERIFY_READ, from, n)) + n = __copy_from_user(to, from, n); + else /* security hole - plug it */ +@@ -412,6 +415,9 @@ static inline unsigned long __must_check + + static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + if (access_ok(VERIFY_WRITE, to, n)) + n = __copy_to_user(to, from, n); + return n; +diff -urNp linux-2.6.33.1/arch/arm/kernel/kgdb.c linux-2.6.33.1/arch/arm/kernel/kgdb.c +--- linux-2.6.33.1/arch/arm/kernel/kgdb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/kernel/kgdb.c 2010-03-20 16:58:38.468885430 -0400 +@@ -190,7 +190,7 @@ void kgdb_arch_exit(void) + * and we handle the normal undef case within the do_undefinstr + * handler. 
+ */ +-struct kgdb_arch arch_kgdb_ops = { ++const struct kgdb_arch arch_kgdb_ops = { + #ifndef __ARMEB__ + .gdb_bpt_instr = {0xfe, 0xde, 0xff, 0xe7} + #else /* ! __ARMEB__ */ +diff -urNp linux-2.6.33.1/arch/arm/mach-at91/pm.c linux-2.6.33.1/arch/arm/mach-at91/pm.c +--- linux-2.6.33.1/arch/arm/mach-at91/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mach-at91/pm.c 2010-03-20 16:58:38.468885430 -0400 +@@ -294,7 +294,7 @@ static void at91_pm_end(void) + } + + +-static struct platform_suspend_ops at91_pm_ops ={ ++static const struct platform_suspend_ops at91_pm_ops ={ + .valid = at91_pm_valid_state, + .begin = at91_pm_begin, + .enter = at91_pm_enter, +diff -urNp linux-2.6.33.1/arch/arm/mach-omap1/pm.c linux-2.6.33.1/arch/arm/mach-omap1/pm.c +--- linux-2.6.33.1/arch/arm/mach-omap1/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mach-omap1/pm.c 2010-03-20 16:58:38.472778666 -0400 +@@ -647,7 +647,7 @@ static struct irqaction omap_wakeup_irq + + + +-static struct platform_suspend_ops omap_pm_ops ={ ++static const struct platform_suspend_ops omap_pm_ops ={ + .prepare = omap_pm_prepare, + .enter = omap_pm_enter, + .finish = omap_pm_finish, +diff -urNp linux-2.6.33.1/arch/arm/mach-omap2/pm24xx.c linux-2.6.33.1/arch/arm/mach-omap2/pm24xx.c +--- linux-2.6.33.1/arch/arm/mach-omap2/pm24xx.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mach-omap2/pm24xx.c 2010-03-20 16:58:38.476775080 -0400 +@@ -326,7 +326,7 @@ static void omap2_pm_finish(void) + enable_hlt(); + } + +-static struct platform_suspend_ops omap_pm_ops = { ++static const struct platform_suspend_ops omap_pm_ops = { + .prepare = omap2_pm_prepare, + .enter = omap2_pm_enter, + .finish = omap2_pm_finish, +diff -urNp linux-2.6.33.1/arch/arm/mach-omap2/pm34xx.c linux-2.6.33.1/arch/arm/mach-omap2/pm34xx.c +--- linux-2.6.33.1/arch/arm/mach-omap2/pm34xx.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mach-omap2/pm34xx.c 2010-03-20 16:58:38.484767476 -0400 +@@ -650,7 +650,7 @@ static void omap3_pm_end(void) + return; + } + +-static struct platform_suspend_ops omap_pm_ops = { ++static const struct platform_suspend_ops omap_pm_ops = { + .begin = omap3_pm_begin, + .end = omap3_pm_end, + .prepare = omap3_pm_prepare, +diff -urNp linux-2.6.33.1/arch/arm/mach-pnx4008/pm.c linux-2.6.33.1/arch/arm/mach-pnx4008/pm.c +--- linux-2.6.33.1/arch/arm/mach-pnx4008/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mach-pnx4008/pm.c 2010-03-20 16:58:38.488749331 -0400 +@@ -116,7 +116,7 @@ static int pnx4008_pm_valid(suspend_stat + (state == PM_SUSPEND_MEM); + } + +-static struct platform_suspend_ops pnx4008_pm_ops = { ++static const struct platform_suspend_ops pnx4008_pm_ops = { + .enter = pnx4008_pm_enter, + .valid = pnx4008_pm_valid, + }; +diff -urNp linux-2.6.33.1/arch/arm/mach-pxa/pm.c linux-2.6.33.1/arch/arm/mach-pxa/pm.c +--- linux-2.6.33.1/arch/arm/mach-pxa/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mach-pxa/pm.c 2010-03-20 16:58:38.492744784 -0400 +@@ -95,7 +95,7 @@ void pxa_pm_finish(void) + pxa_cpu_pm_fns->finish(); + } + +-static struct platform_suspend_ops pxa_pm_ops = { ++static const struct platform_suspend_ops pxa_pm_ops = { + .valid = pxa_pm_valid, + .enter = pxa_pm_enter, + .prepare = pxa_pm_prepare, +diff -urNp linux-2.6.33.1/arch/arm/mach-pxa/sharpsl_pm.c linux-2.6.33.1/arch/arm/mach-pxa/sharpsl_pm.c +--- linux-2.6.33.1/arch/arm/mach-pxa/sharpsl_pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ 
linux-2.6.33.1/arch/arm/mach-pxa/sharpsl_pm.c 2010-03-20 16:58:38.500761827 -0400 +@@ -892,7 +892,7 @@ static void sharpsl_apm_get_power_status + } + + #ifdef CONFIG_PM +-static struct platform_suspend_ops sharpsl_pm_ops = { ++static const struct platform_suspend_ops sharpsl_pm_ops = { + .prepare = pxa_pm_prepare, + .finish = pxa_pm_finish, + .enter = corgi_pxa_pm_enter, +diff -urNp linux-2.6.33.1/arch/arm/mach-sa1100/pm.c linux-2.6.33.1/arch/arm/mach-sa1100/pm.c +--- linux-2.6.33.1/arch/arm/mach-sa1100/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mach-sa1100/pm.c 2010-03-20 16:58:38.504745508 -0400 +@@ -120,7 +120,7 @@ unsigned long sleep_phys_sp(void *sp) + return virt_to_phys(sp); + } + +-static struct platform_suspend_ops sa11x0_pm_ops = { ++static const struct platform_suspend_ops sa11x0_pm_ops = { + .enter = sa11x0_pm_enter, + .valid = suspend_valid_only_mem, + }; +diff -urNp linux-2.6.33.1/arch/arm/mm/fault.c linux-2.6.33.1/arch/arm/mm/fault.c +--- linux-2.6.33.1/arch/arm/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mm/fault.c 2010-03-20 16:58:38.512762145 -0400 +@@ -166,6 +166,13 @@ __do_user_fault(struct task_struct *tsk, + } + #endif + ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (fsr & FSR_LNX_PF) { ++ pax_report_fault(regs, (void *)regs->ARM_pc, (void *)regs->ARM_sp); ++ do_group_exit(SIGKILL); ++ } ++#endif ++ + tsk->thread.address = addr; + tsk->thread.error_code = fsr; + tsk->thread.trap_no = 14; +@@ -357,6 +364,33 @@ do_page_fault(unsigned long addr, unsign + } + #endif /* CONFIG_MMU */ + ++#ifdef CONFIG_PAX_PAGEEXEC ++void pax_report_insns(void *pc, void *sp) ++{ ++ long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 20; i++) { ++ unsigned char c; ++ if (get_user(c, (__force unsigned char __user *)pc+i)) ++ printk(KERN_CONT "?? "); ++ else ++ printk(KERN_CONT "%02x ", c); ++ } ++ printk("\n"); ++ ++ printk(KERN_ERR "PAX: bytes at SP-4: "); ++ for (i = -1; i < 20; i++) { ++ unsigned long c; ++ if (get_user(c, (__force unsigned long __user *)sp+i)) ++ printk(KERN_CONT "???????? "); ++ else ++ printk(KERN_CONT "%08lx ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + /* + * First Level Translation Fault Handler + * +diff -urNp linux-2.6.33.1/arch/arm/mm/mmap.c linux-2.6.33.1/arch/arm/mm/mmap.c +--- linux-2.6.33.1/arch/arm/mm/mmap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/mm/mmap.c 2010-03-20 16:58:38.512762145 -0400 +@@ -63,6 +63,10 @@ arch_get_unmapped_area(struct file *filp + if (len > TASK_SIZE) + return -ENOMEM; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + if (addr) { + if (do_align) + addr = COLOUR_ALIGN(addr, pgoff); +@@ -75,10 +79,10 @@ arch_get_unmapped_area(struct file *filp + return addr; + } + if (len > mm->cached_hole_size) { +- start_addr = addr = mm->free_area_cache; ++ start_addr = addr = mm->free_area_cache; + } else { +- start_addr = addr = TASK_UNMAPPED_BASE; +- mm->cached_hole_size = 0; ++ start_addr = addr = mm->mmap_base; ++ mm->cached_hole_size = 0; + } + + full_search: +@@ -94,8 +98,8 @@ full_search: + * Start a new search - just in case we missed + * some holes. 
+ */ +- if (start_addr != TASK_UNMAPPED_BASE) { +- start_addr = addr = TASK_UNMAPPED_BASE; ++ if (start_addr != mm->mmap_base) { ++ start_addr = addr = mm->mmap_base; + mm->cached_hole_size = 0; + goto full_search; + } +diff -urNp linux-2.6.33.1/arch/arm/plat-s3c/pm.c linux-2.6.33.1/arch/arm/plat-s3c/pm.c +--- linux-2.6.33.1/arch/arm/plat-s3c/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/arm/plat-s3c/pm.c 2010-03-20 16:58:38.533259132 -0400 +@@ -355,7 +355,7 @@ static void s3c_pm_finish(void) + s3c_pm_check_cleanup(); + } + +-static struct platform_suspend_ops s3c_pm_ops = { ++static const struct platform_suspend_ops s3c_pm_ops = { + .enter = s3c_pm_enter, + .prepare = s3c_pm_prepare, + .finish = s3c_pm_finish, +diff -urNp linux-2.6.33.1/arch/avr32/include/asm/elf.h linux-2.6.33.1/arch/avr32/include/asm/elf.h +--- linux-2.6.33.1/arch/avr32/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/avr32/include/asm/elf.h 2010-03-20 16:58:38.533259132 -0400 +@@ -84,8 +84,14 @@ typedef struct user_fpu_struct elf_fpreg + the loader. We need to make sure that it is out of the way of the program + that it will "exec", and that there is sufficient room for the brk. */ + +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) ++#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE 0x00001000UL ++ ++#define PAX_DELTA_MMAP_LEN 15 ++#define PAX_DELTA_STACK_LEN 15 ++#endif + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +diff -urNp linux-2.6.33.1/arch/avr32/include/asm/kmap_types.h linux-2.6.33.1/arch/avr32/include/asm/kmap_types.h +--- linux-2.6.33.1/arch/avr32/include/asm/kmap_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/avr32/include/asm/kmap_types.h 2010-03-20 16:58:38.533259132 -0400 +@@ -22,7 +22,8 @@ D(10) KM_IRQ0, + D(11) KM_IRQ1, + D(12) KM_SOFTIRQ0, + D(13) KM_SOFTIRQ1, +-D(14) KM_TYPE_NR ++D(14) KM_CLEARPAGE, ++D(15) KM_TYPE_NR + }; + + #undef D +diff -urNp linux-2.6.33.1/arch/avr32/mach-at32ap/pm.c linux-2.6.33.1/arch/avr32/mach-at32ap/pm.c +--- linux-2.6.33.1/arch/avr32/mach-at32ap/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/avr32/mach-at32ap/pm.c 2010-03-20 16:58:38.533259132 -0400 +@@ -176,7 +176,7 @@ out: + return 0; + } + +-static struct platform_suspend_ops avr32_pm_ops = { ++static const struct platform_suspend_ops avr32_pm_ops = { + .valid = avr32_pm_valid_state, + .enter = avr32_pm_enter, + }; +diff -urNp linux-2.6.33.1/arch/avr32/mm/fault.c linux-2.6.33.1/arch/avr32/mm/fault.c +--- linux-2.6.33.1/arch/avr32/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/avr32/mm/fault.c 2010-03-20 16:58:38.533259132 -0400 +@@ -41,6 +41,23 @@ static inline int notify_page_fault(stru + + int exception_trace = 1; + ++#ifdef CONFIG_PAX_PAGEEXEC ++void pax_report_insns(void *pc, void *sp) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 20; i++) { ++ unsigned char c; ++ if (get_user(c, (unsigned char *)pc+i)) ++ printk(KERN_CONT "???????? "); ++ else ++ printk(KERN_CONT "%02x ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + /* + * This routine handles page faults. It determines the address and the + * problem, and then passes it off to one of the appropriate routines. 
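A note on the pax_report_insns() helpers this patch adds for alpha, arm, and avr32 above (and for more architectures further down): they all follow one idiom, namely dumping the instruction bytes around the faulting PC with get_user() so that a task killed for an executable-memory violation leaves a forensic record in the kernel log. Here is a minimal, self-contained sketch of that idiom; the function name report_insns and the count parameter are illustrative, not names taken from the patch:

	#include <linux/kernel.h>
	#include <linux/uaccess.h>

	static void report_insns(const void __user *pc, unsigned int count)
	{
		unsigned int i;

		printk(KERN_ERR "PAX: bytes at PC: ");
		for (i = 0; i < count; i++) {
			unsigned char c;

			/* get_user() fails cleanly if the PC page is unmapped */
			if (get_user(c, (const unsigned char __user *)pc + i))
				printk(KERN_CONT "?? ");
			else
				printk(KERN_CONT "%02x ", c);
		}
		printk(KERN_CONT "\n");
	}

The per-architecture variants in the patch differ only in the unit they read (bytes on arm, 32-bit words on alpha and avr32) and in how many units they dump.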
+@@ -157,6 +174,16 @@ bad_area: + up_read(&mm->mmap_sem); + + if (user_mode(regs)) { ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (mm->pax_flags & MF_PAX_PAGEEXEC) { ++ if (ecr == ECR_PROTECTION_X || ecr == ECR_TLB_MISS_X) { ++ pax_report_fault(regs, (void *)regs->pc, (void *)regs->sp); ++ do_group_exit(SIGKILL); ++ } ++ } ++#endif ++ + if (exception_trace && printk_ratelimit()) + printk("%s%s[%d]: segfault at %08lx pc %08lx " + "sp %08lx ecr %lu\n", +diff -urNp linux-2.6.33.1/arch/blackfin/kernel/kgdb.c linux-2.6.33.1/arch/blackfin/kernel/kgdb.c +--- linux-2.6.33.1/arch/blackfin/kernel/kgdb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/blackfin/kernel/kgdb.c 2010-03-20 16:58:38.533259132 -0400 +@@ -397,7 +397,7 @@ int kgdb_arch_handle_exception(int vecto + return -1; /* this means that we do not want to exit from the handler */ + } + +-struct kgdb_arch arch_kgdb_ops = { ++const struct kgdb_arch arch_kgdb_ops = { + .gdb_bpt_instr = {0xa1}, + #ifdef CONFIG_SMP + .flags = KGDB_HW_BREAKPOINT|KGDB_THR_PROC_SWAP, +diff -urNp linux-2.6.33.1/arch/blackfin/mach-common/pm.c linux-2.6.33.1/arch/blackfin/mach-common/pm.c +--- linux-2.6.33.1/arch/blackfin/mach-common/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/blackfin/mach-common/pm.c 2010-03-20 16:58:38.533259132 -0400 +@@ -255,7 +255,7 @@ static int bfin_pm_enter(suspend_state_t + return 0; + } + +-struct platform_suspend_ops bfin_pm_ops = { ++const struct platform_suspend_ops bfin_pm_ops = { + .enter = bfin_pm_enter, + .valid = bfin_pm_valid, + }; +diff -urNp linux-2.6.33.1/arch/blackfin/mm/maccess.c linux-2.6.33.1/arch/blackfin/mm/maccess.c +--- linux-2.6.33.1/arch/blackfin/mm/maccess.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/blackfin/mm/maccess.c 2010-03-20 16:58:38.536529376 -0400 +@@ -16,7 +16,7 @@ static int validate_memory_access_addres + return bfin_mem_access_type(addr, size); + } + +-long probe_kernel_read(void *dst, void *src, size_t size) ++long probe_kernel_read(void *dst, const void *src, size_t size) + { + unsigned long lsrc = (unsigned long)src; + int mem_type; +@@ -55,7 +55,7 @@ long probe_kernel_read(void *dst, void * + return -EFAULT; + } + +-long probe_kernel_write(void *dst, void *src, size_t size) ++long probe_kernel_write(void *dst, const void *src, size_t size) + { + unsigned long ldst = (unsigned long)dst; + int mem_type; +diff -urNp linux-2.6.33.1/arch/frv/include/asm/kmap_types.h linux-2.6.33.1/arch/frv/include/asm/kmap_types.h +--- linux-2.6.33.1/arch/frv/include/asm/kmap_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/frv/include/asm/kmap_types.h 2010-03-20 16:58:38.536529376 -0400 +@@ -23,6 +23,7 @@ enum km_type { + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, ++ KM_CLEARPAGE, + KM_TYPE_NR + }; + +diff -urNp linux-2.6.33.1/arch/ia64/hp/common/hwsw_iommu.c linux-2.6.33.1/arch/ia64/hp/common/hwsw_iommu.c +--- linux-2.6.33.1/arch/ia64/hp/common/hwsw_iommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/hp/common/hwsw_iommu.c 2010-03-20 16:58:38.544753181 -0400 +@@ -17,7 +17,7 @@ + #include <linux/swiotlb.h> + #include <asm/machvec.h> + +-extern struct dma_map_ops sba_dma_ops, swiotlb_dma_ops; ++extern const struct dma_map_ops sba_dma_ops, swiotlb_dma_ops; + + /* swiotlb declarations & definitions: */ + extern int swiotlb_late_init_with_default_size (size_t size); +@@ -33,7 +33,7 @@ static inline int use_swiotlb(struct dev + !sba_dma_ops.dma_supported(dev, *dev->dma_mask); + } + +-struct dma_map_ops *hwsw_dma_get_ops(struct device *dev) 
++const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev) + { + if (use_swiotlb(dev)) + return &swiotlb_dma_ops; +diff -urNp linux-2.6.33.1/arch/ia64/hp/common/sba_iommu.c linux-2.6.33.1/arch/ia64/hp/common/sba_iommu.c +--- linux-2.6.33.1/arch/ia64/hp/common/sba_iommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/hp/common/sba_iommu.c 2010-03-20 16:58:38.552757823 -0400 +@@ -2097,7 +2097,7 @@ static struct acpi_driver acpi_sba_ioc_d + }, + }; + +-extern struct dma_map_ops swiotlb_dma_ops; ++extern const struct dma_map_ops swiotlb_dma_ops; + + static int __init + sba_init(void) +@@ -2211,7 +2211,7 @@ sba_page_override(char *str) + + __setup("sbapagesize=",sba_page_override); + +-struct dma_map_ops sba_dma_ops = { ++const struct dma_map_ops sba_dma_ops = { + .alloc_coherent = sba_alloc_coherent, + .free_coherent = sba_free_coherent, + .map_page = sba_map_page, +diff -urNp linux-2.6.33.1/arch/ia64/ia32/binfmt_elf32.c linux-2.6.33.1/arch/ia64/ia32/binfmt_elf32.c +--- linux-2.6.33.1/arch/ia64/ia32/binfmt_elf32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/ia32/binfmt_elf32.c 2010-03-20 16:58:38.556766787 -0400 +@@ -45,6 +45,13 @@ randomize_stack_top(unsigned long stack_ + + #define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack)) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (current->personality == PER_LINUX32 ? 0x08048000UL : 0x4000000000000000UL) ++ ++#define PAX_DELTA_MMAP_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) ++#define PAX_DELTA_STACK_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) ++#endif ++ + /* Ugly but avoids duplication */ + #include "../../../fs/binfmt_elf.c" + +diff -urNp linux-2.6.33.1/arch/ia64/ia32/ia32priv.h linux-2.6.33.1/arch/ia64/ia32/ia32priv.h +--- linux-2.6.33.1/arch/ia64/ia32/ia32priv.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/ia32/ia32priv.h 2010-03-20 16:58:38.556766787 -0400 +@@ -296,7 +296,14 @@ typedef struct compat_siginfo { + #define ELF_DATA ELFDATA2LSB + #define ELF_ARCH EM_386 + +-#define IA32_STACK_TOP IA32_PAGE_OFFSET ++#ifdef CONFIG_PAX_RANDUSTACK ++#define __IA32_DELTA_STACK (current->mm->delta_stack) ++#else ++#define __IA32_DELTA_STACK 0UL ++#endif ++ ++#define IA32_STACK_TOP (IA32_PAGE_OFFSET - __IA32_DELTA_STACK) ++ + #define IA32_GATE_OFFSET IA32_PAGE_OFFSET + #define IA32_GATE_END IA32_PAGE_OFFSET + PAGE_SIZE + +diff -urNp linux-2.6.33.1/arch/ia64/include/asm/dma-mapping.h linux-2.6.33.1/arch/ia64/include/asm/dma-mapping.h +--- linux-2.6.33.1/arch/ia64/include/asm/dma-mapping.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/include/asm/dma-mapping.h 2010-03-20 16:58:38.564763179 -0400 +@@ -12,7 +12,7 @@ + + #define ARCH_HAS_DMA_GET_REQUIRED_MASK + +-extern struct dma_map_ops *dma_ops; ++extern const struct dma_map_ops *dma_ops; + extern struct ia64_machine_vector ia64_mv; + extern void set_iommu_machvec(void); + +@@ -24,7 +24,7 @@ extern void machvec_dma_sync_sg(struct d + static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *daddr, gfp_t gfp) + { +- struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ const struct dma_map_ops *ops = platform_dma_get_ops(dev); + void *caddr; + + caddr = ops->alloc_coherent(dev, size, daddr, gfp); +@@ -35,7 +35,7 @@ static inline void *dma_alloc_coherent(s + static inline void dma_free_coherent(struct device *dev, size_t size, + void *caddr, dma_addr_t daddr) + { +- struct dma_map_ops *ops = 
platform_dma_get_ops(dev); ++ const struct dma_map_ops *ops = platform_dma_get_ops(dev); + debug_dma_free_coherent(dev, size, caddr, daddr); + ops->free_coherent(dev, size, caddr, daddr); + } +@@ -49,13 +49,13 @@ static inline void dma_free_coherent(str + + static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr) + { +- struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ const struct dma_map_ops *ops = platform_dma_get_ops(dev); + return ops->mapping_error(dev, daddr); + } + + static inline int dma_supported(struct device *dev, u64 mask) + { +- struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ const struct dma_map_ops *ops = platform_dma_get_ops(dev); + return ops->dma_supported(dev, mask); + } + +diff -urNp linux-2.6.33.1/arch/ia64/include/asm/elf.h linux-2.6.33.1/arch/ia64/include/asm/elf.h +--- linux-2.6.33.1/arch/ia64/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/include/asm/elf.h 2010-03-20 16:58:38.564763179 -0400 +@@ -42,6 +42,13 @@ + */ + #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x800000000UL) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (current->personality == PER_LINUX32 ? 0x08048000UL : 0x4000000000000000UL) ++ ++#define PAX_DELTA_MMAP_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) ++#define PAX_DELTA_STACK_LEN (current->personality == PER_LINUX32 ? 16 : 3*PAGE_SHIFT - 13) ++#endif ++ + #define PT_IA_64_UNWIND 0x70000001 + + /* IA-64 relocations: */ +diff -urNp linux-2.6.33.1/arch/ia64/include/asm/machvec.h linux-2.6.33.1/arch/ia64/include/asm/machvec.h +--- linux-2.6.33.1/arch/ia64/include/asm/machvec.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/include/asm/machvec.h 2010-03-20 16:58:38.568641931 -0400 +@@ -45,7 +45,7 @@ typedef void ia64_mv_kernel_launch_event + /* DMA-mapping interface: */ + typedef void ia64_mv_dma_init (void); + typedef u64 ia64_mv_dma_get_required_mask (struct device *); +-typedef struct dma_map_ops *ia64_mv_dma_get_ops(struct device *); ++typedef const struct dma_map_ops *ia64_mv_dma_get_ops(struct device *); + + /* + * WARNING: The legacy I/O space is _architected_. 
Platforms are +@@ -251,7 +251,7 @@ extern void machvec_init_from_cmdline(co + # endif /* CONFIG_IA64_GENERIC */ + + extern void swiotlb_dma_init(void); +-extern struct dma_map_ops *dma_get_ops(struct device *); ++extern const struct dma_map_ops *dma_get_ops(struct device *); + + /* + * Define default versions so we can extend machvec for new platforms without having +diff -urNp linux-2.6.33.1/arch/ia64/include/asm/pgtable.h linux-2.6.33.1/arch/ia64/include/asm/pgtable.h +--- linux-2.6.33.1/arch/ia64/include/asm/pgtable.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/include/asm/pgtable.h 2010-03-20 16:58:38.568641931 -0400 +@@ -143,6 +143,17 @@ + #define PAGE_READONLY __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) + #define PAGE_COPY __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) + #define PAGE_COPY_EXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX) ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++# define PAGE_SHARED_NOEXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RW) ++# define PAGE_READONLY_NOEXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) ++# define PAGE_COPY_NOEXEC __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R) ++#else ++# define PAGE_SHARED_NOEXEC PAGE_SHARED ++# define PAGE_READONLY_NOEXEC PAGE_READONLY ++# define PAGE_COPY_NOEXEC PAGE_COPY ++#endif ++ + #define PAGE_GATE __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX) + #define PAGE_KERNEL __pgprot(__DIRTY_BITS | _PAGE_PL_0 | _PAGE_AR_RWX) + #define PAGE_KERNELRX __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX) +diff -urNp linux-2.6.33.1/arch/ia64/include/asm/uaccess.h linux-2.6.33.1/arch/ia64/include/asm/uaccess.h +--- linux-2.6.33.1/arch/ia64/include/asm/uaccess.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/include/asm/uaccess.h 2010-03-20 16:58:38.568641931 -0400 +@@ -257,7 +257,7 @@ __copy_from_user (void *to, const void _ + const void *__cu_from = (from); \ + long __cu_len = (n); \ + \ +- if (__access_ok(__cu_to, __cu_len, get_fs())) \ ++ if (__cu_len > 0 && __cu_len <= INT_MAX && __access_ok(__cu_to, __cu_len, get_fs())) \ + __cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len); \ + __cu_len; \ + }) +@@ -269,7 +269,7 @@ __copy_from_user (void *to, const void _ + long __cu_len = (n); \ + \ + __chk_user_ptr(__cu_from); \ +- if (__access_ok(__cu_from, __cu_len, get_fs())) \ ++ if (__cu_len > 0 && __cu_len <= INT_MAX && __access_ok(__cu_from, __cu_len, get_fs())) \ + __cu_len = __copy_user((__force void __user *) __cu_to, __cu_from, __cu_len); \ + __cu_len; \ + }) +diff -urNp linux-2.6.33.1/arch/ia64/kernel/dma-mapping.c linux-2.6.33.1/arch/ia64/kernel/dma-mapping.c +--- linux-2.6.33.1/arch/ia64/kernel/dma-mapping.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/kernel/dma-mapping.c 2010-03-20 16:58:38.568641931 -0400 +@@ -3,7 +3,7 @@ + /* Set this to 1 if there is a HW IOMMU in the system */ + int iommu_detected __read_mostly; + +-struct dma_map_ops *dma_ops; ++const struct dma_map_ops *dma_ops; + EXPORT_SYMBOL(dma_ops); + + #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) +@@ -16,7 +16,7 @@ static int __init dma_init(void) + } + fs_initcall(dma_init); + +-struct dma_map_ops *dma_get_ops(struct device *dev) ++const struct dma_map_ops *dma_get_ops(struct device *dev) + { + return dma_ops; + } +diff -urNp linux-2.6.33.1/arch/ia64/kernel/module.c linux-2.6.33.1/arch/ia64/kernel/module.c +--- linux-2.6.33.1/arch/ia64/kernel/module.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/kernel/module.c 2010-03-20 
16:58:38.576761283 -0400 +@@ -315,8 +315,7 @@ module_alloc (unsigned long size) + void + module_free (struct module *mod, void *module_region) + { +- if (mod && mod->arch.init_unw_table && +- module_region == mod->module_init) { ++ if (mod && mod->arch.init_unw_table && module_region == mod->module_init_rx) { + unw_remove_unwind_table(mod->arch.init_unw_table); + mod->arch.init_unw_table = NULL; + } +@@ -502,15 +501,39 @@ module_frob_arch_sections (Elf_Ehdr *ehd + } + + static inline int ++in_init_rx (const struct module *mod, uint64_t addr) ++{ ++ return addr - (uint64_t) mod->module_init_rx < mod->init_size_rx; ++} ++ ++static inline int ++in_init_rw (const struct module *mod, uint64_t addr) ++{ ++ return addr - (uint64_t) mod->module_init_rw < mod->init_size_rw; ++} ++ ++static inline int + in_init (const struct module *mod, uint64_t addr) + { +- return addr - (uint64_t) mod->module_init < mod->init_size; ++ return in_init_rx(mod, addr) || in_init_rw(mod, addr); ++} ++ ++static inline int ++in_core_rx (const struct module *mod, uint64_t addr) ++{ ++ return addr - (uint64_t) mod->module_core_rx < mod->core_size_rx; ++} ++ ++static inline int ++in_core_rw (const struct module *mod, uint64_t addr) ++{ ++ return addr - (uint64_t) mod->module_core_rw < mod->core_size_rw; + } + + static inline int + in_core (const struct module *mod, uint64_t addr) + { +- return addr - (uint64_t) mod->module_core < mod->core_size; ++ return in_core_rx(mod, addr) || in_core_rw(mod, addr); + } + + static inline int +@@ -693,7 +716,14 @@ do_reloc (struct module *mod, uint8_t r_ + break; + + case RV_BDREL: +- val -= (uint64_t) (in_init(mod, val) ? mod->module_init : mod->module_core); ++ if (in_init_rx(mod, val)) ++ val -= (uint64_t) mod->module_init_rx; ++ else if (in_init_rw(mod, val)) ++ val -= (uint64_t) mod->module_init_rw; ++ else if (in_core_rx(mod, val)) ++ val -= (uint64_t) mod->module_core_rx; ++ else if (in_core_rw(mod, val)) ++ val -= (uint64_t) mod->module_core_rw; + break; + + case RV_LTV: +@@ -828,15 +858,15 @@ apply_relocate_add (Elf64_Shdr *sechdrs, + * addresses have been selected... + */ + uint64_t gp; +- if (mod->core_size > MAX_LTOFF) ++ if (mod->core_size_rx + mod->core_size_rw > MAX_LTOFF) + /* + * This takes advantage of fact that SHF_ARCH_SMALL gets allocated + * at the end of the module. 
+ */ +- gp = mod->core_size - MAX_LTOFF / 2; ++ gp = mod->core_size_rx + mod->core_size_rw - MAX_LTOFF / 2; + else +- gp = mod->core_size / 2; +- gp = (uint64_t) mod->module_core + ((gp + 7) & -8); ++ gp = (mod->core_size_rx + mod->core_size_rw) / 2; ++ gp = (uint64_t) mod->module_core_rx + ((gp + 7) & -8); + mod->arch.gp = gp; + DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp); + } +diff -urNp linux-2.6.33.1/arch/ia64/kernel/pci-dma.c linux-2.6.33.1/arch/ia64/kernel/pci-dma.c +--- linux-2.6.33.1/arch/ia64/kernel/pci-dma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/kernel/pci-dma.c 2010-03-20 16:58:38.576761283 -0400 +@@ -43,7 +43,7 @@ struct device fallback_dev = { + .dma_mask = &fallback_dev.coherent_dma_mask, + }; + +-extern struct dma_map_ops intel_dma_ops; ++extern const struct dma_map_ops intel_dma_ops; + + static int __init pci_iommu_init(void) + { +diff -urNp linux-2.6.33.1/arch/ia64/kernel/pci-swiotlb.c linux-2.6.33.1/arch/ia64/kernel/pci-swiotlb.c +--- linux-2.6.33.1/arch/ia64/kernel/pci-swiotlb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/kernel/pci-swiotlb.c 2010-03-20 16:58:38.576761283 -0400 +@@ -21,7 +21,7 @@ static void *ia64_swiotlb_alloc_coherent + return swiotlb_alloc_coherent(dev, size, dma_handle, gfp); + } + +-struct dma_map_ops swiotlb_dma_ops = { ++const struct dma_map_ops swiotlb_dma_ops = { + .alloc_coherent = ia64_swiotlb_alloc_coherent, + .free_coherent = swiotlb_free_coherent, + .map_page = swiotlb_map_page, +diff -urNp linux-2.6.33.1/arch/ia64/kernel/sys_ia64.c linux-2.6.33.1/arch/ia64/kernel/sys_ia64.c +--- linux-2.6.33.1/arch/ia64/kernel/sys_ia64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/kernel/sys_ia64.c 2010-03-20 16:58:38.584549337 -0400 +@@ -43,6 +43,13 @@ arch_get_unmapped_area (struct file *fil + if (REGION_NUMBER(addr) == RGN_HPAGE) + addr = 0; + #endif ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ addr = mm->free_area_cache; ++ else ++#endif ++ + if (!addr) + addr = mm->free_area_cache; + +@@ -61,9 +68,9 @@ arch_get_unmapped_area (struct file *fil + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) { +- if (start_addr != TASK_UNMAPPED_BASE) { ++ if (start_addr != mm->mmap_base) { + /* Start a new search --- just in case we missed some holes. */ +- addr = TASK_UNMAPPED_BASE; ++ addr = mm->mmap_base; + goto full_search; + } + return -ENOMEM; +diff -urNp linux-2.6.33.1/arch/ia64/kernel/topology.c linux-2.6.33.1/arch/ia64/kernel/topology.c +--- linux-2.6.33.1/arch/ia64/kernel/topology.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/kernel/topology.c 2010-03-20 16:58:38.584549337 -0400 +@@ -282,7 +282,7 @@ static ssize_t cache_show(struct kobject + return ret; + } + +-static struct sysfs_ops cache_sysfs_ops = { ++static const struct sysfs_ops cache_sysfs_ops = { + .show = cache_show + }; + +diff -urNp linux-2.6.33.1/arch/ia64/kernel/vmlinux.lds.S linux-2.6.33.1/arch/ia64/kernel/vmlinux.lds.S +--- linux-2.6.33.1/arch/ia64/kernel/vmlinux.lds.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/kernel/vmlinux.lds.S 2010-03-20 16:58:38.584549337 -0400 +@@ -196,7 +196,7 @@ SECTIONS + /* Per-cpu data: */ + . = ALIGN(PERCPU_PAGE_SIZE); + PERCPU_VADDR(PERCPU_ADDR, :percpu) +- __phys_per_cpu_start = __per_cpu_load; ++ __phys_per_cpu_start = per_cpu_load; + . 
= __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits + * into percpu page size + */ +diff -urNp linux-2.6.33.1/arch/ia64/mm/fault.c linux-2.6.33.1/arch/ia64/mm/fault.c +--- linux-2.6.33.1/arch/ia64/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/mm/fault.c 2010-03-20 16:58:38.584549337 -0400 +@@ -72,6 +72,23 @@ mapped_kernel_page_is_present (unsigned + return pte_present(pte); + } + ++#ifdef CONFIG_PAX_PAGEEXEC ++void pax_report_insns(void *pc, void *sp) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 8; i++) { ++ unsigned int c; ++ if (get_user(c, (unsigned int *)pc+i)) ++ printk(KERN_CONT "???????? "); ++ else ++ printk(KERN_CONT "%08x ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + void __kprobes + ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs) + { +@@ -145,9 +162,23 @@ ia64_do_page_fault (unsigned long addres + mask = ( (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) + | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); + +- if ((vma->vm_flags & mask) != mask) ++ if ((vma->vm_flags & mask) != mask) { ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (!(vma->vm_flags & VM_EXEC) && (mask & VM_EXEC)) { ++ if (!(mm->pax_flags & MF_PAX_PAGEEXEC) || address != regs->cr_iip) ++ goto bad_area; ++ ++ up_read(&mm->mmap_sem); ++ pax_report_fault(regs, (void *)regs->cr_iip, (void *)regs->r12); ++ do_group_exit(SIGKILL); ++ } ++#endif ++ + goto bad_area; + ++ } ++ + survive: + /* + * If for any reason at all we couldn't handle the fault, make +diff -urNp linux-2.6.33.1/arch/ia64/mm/init.c linux-2.6.33.1/arch/ia64/mm/init.c +--- linux-2.6.33.1/arch/ia64/mm/init.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/mm/init.c 2010-03-20 16:58:38.584549337 -0400 +@@ -122,6 +122,19 @@ ia64_init_addr_space (void) + vma->vm_start = current->thread.rbs_bot & PAGE_MASK; + vma->vm_end = vma->vm_start + PAGE_SIZE; + vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (current->mm->pax_flags & MF_PAX_PAGEEXEC) { ++ vma->vm_flags &= ~VM_EXEC; ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (current->mm->pax_flags & MF_PAX_MPROTECT) ++ vma->vm_flags &= ~VM_MAYEXEC; ++#endif ++ ++ } ++#endif ++ + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + down_write(¤t->mm->mmap_sem); + if (insert_vm_struct(current->mm, vma)) { +diff -urNp linux-2.6.33.1/arch/ia64/sn/pci/pci_dma.c linux-2.6.33.1/arch/ia64/sn/pci/pci_dma.c +--- linux-2.6.33.1/arch/ia64/sn/pci/pci_dma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/ia64/sn/pci/pci_dma.c 2010-03-20 16:58:38.584549337 -0400 +@@ -464,7 +464,7 @@ int sn_pci_legacy_write(struct pci_bus * + return ret; + } + +-static struct dma_map_ops sn_dma_ops = { ++static const struct dma_map_ops sn_dma_ops = { + .alloc_coherent = sn_dma_alloc_coherent, + .free_coherent = sn_dma_free_coherent, + .map_page = sn_dma_map_page, +diff -urNp linux-2.6.33.1/arch/m32r/lib/usercopy.c linux-2.6.33.1/arch/m32r/lib/usercopy.c +--- linux-2.6.33.1/arch/m32r/lib/usercopy.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/m32r/lib/usercopy.c 2010-03-20 16:58:38.584549337 -0400 +@@ -14,6 +14,9 @@ + unsigned long + __generic_copy_to_user(void __user *to, const void *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + prefetch(from); + if (access_ok(VERIFY_WRITE, to, n)) + __copy_user(to,from,n); +@@ -23,6 +26,9 @@ __generic_copy_to_user(void __user *to, + unsigned long + __generic_copy_from_user(void *to, 
const void __user *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + prefetchw(to); + if (access_ok(VERIFY_READ, from, n)) + __copy_user_zeroing(to,from,n); +diff -urNp linux-2.6.33.1/arch/mips/alchemy/devboards/pm.c linux-2.6.33.1/arch/mips/alchemy/devboards/pm.c +--- linux-2.6.33.1/arch/mips/alchemy/devboards/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/alchemy/devboards/pm.c 2010-03-20 16:58:38.588679980 -0400 +@@ -78,7 +78,7 @@ static void db1x_pm_end(void) + + } + +-static struct platform_suspend_ops db1x_pm_ops = { ++static const struct platform_suspend_ops db1x_pm_ops = { + .valid = suspend_valid_only_mem, + .begin = db1x_pm_begin, + .enter = db1x_pm_enter, +diff -urNp linux-2.6.33.1/arch/mips/include/asm/elf.h linux-2.6.33.1/arch/mips/include/asm/elf.h +--- linux-2.6.33.1/arch/mips/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/include/asm/elf.h 2010-03-20 16:58:38.588679980 -0400 +@@ -367,4 +367,11 @@ extern int dump_task_fpu(struct task_str + #define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) + #endif + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT_ADDR) ? 0x00400000UL : 0x00400000UL) ++ ++#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) ++#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) ++#endif ++ + #endif /* _ASM_ELF_H */ +diff -urNp linux-2.6.33.1/arch/mips/include/asm/page.h linux-2.6.33.1/arch/mips/include/asm/page.h +--- linux-2.6.33.1/arch/mips/include/asm/page.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/include/asm/page.h 2010-03-20 16:58:38.592758361 -0400 +@@ -93,7 +93,7 @@ extern void copy_user_highpage(struct pa + #ifdef CONFIG_CPU_MIPS32 + typedef struct { unsigned long pte_low, pte_high; } pte_t; + #define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) +- #define __pte(x) ({ pte_t __pte = {(x), ((unsigned long long)(x)) >> 32}; __pte; }) ++ #define __pte(x) ({ pte_t __pte = {(x), (x) >> 32}; __pte; }) + #else + typedef struct { unsigned long long pte; } pte_t; + #define pte_val(x) ((x).pte) +diff -urNp linux-2.6.33.1/arch/mips/include/asm/system.h linux-2.6.33.1/arch/mips/include/asm/system.h +--- linux-2.6.33.1/arch/mips/include/asm/system.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/include/asm/system.h 2010-03-20 16:58:38.592758361 -0400 +@@ -230,6 +230,6 @@ extern void per_cpu_trap_init(void); + */ + #define __ARCH_WANT_UNLOCKED_CTXSW + +-extern unsigned long arch_align_stack(unsigned long sp); ++#define arch_align_stack(x) ((x) & ALMASK) + + #endif /* _ASM_SYSTEM_H */ +diff -urNp linux-2.6.33.1/arch/mips/kernel/binfmt_elfn32.c linux-2.6.33.1/arch/mips/kernel/binfmt_elfn32.c +--- linux-2.6.33.1/arch/mips/kernel/binfmt_elfn32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/kernel/binfmt_elfn32.c 2010-03-20 16:58:38.592758361 -0400 +@@ -50,6 +50,13 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_N + #undef ELF_ET_DYN_BASE + #define ELF_ET_DYN_BASE (TASK32_SIZE / 3 * 2) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT_ADDR) ? 0x00400000UL : 0x00400000UL) ++ ++#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) ++#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 
27-PAGE_SHIFT : 36-PAGE_SHIFT) ++#endif ++ + #include <asm/processor.h> + #include <linux/module.h> + #include <linux/elfcore.h> +diff -urNp linux-2.6.33.1/arch/mips/kernel/binfmt_elfo32.c linux-2.6.33.1/arch/mips/kernel/binfmt_elfo32.c +--- linux-2.6.33.1/arch/mips/kernel/binfmt_elfo32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/kernel/binfmt_elfo32.c 2010-03-20 16:58:38.592758361 -0400 +@@ -52,6 +52,13 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_N + #undef ELF_ET_DYN_BASE + #define ELF_ET_DYN_BASE (TASK32_SIZE / 3 * 2) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT_ADDR) ? 0x00400000UL : 0x00400000UL) ++ ++#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) ++#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT_ADDR) ? 27-PAGE_SHIFT : 36-PAGE_SHIFT) ++#endif ++ + #include <asm/processor.h> + + /* +diff -urNp linux-2.6.33.1/arch/mips/kernel/kgdb.c linux-2.6.33.1/arch/mips/kernel/kgdb.c +--- linux-2.6.33.1/arch/mips/kernel/kgdb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/kernel/kgdb.c 2010-03-20 16:58:38.592758361 -0400 +@@ -245,6 +245,7 @@ int kgdb_arch_handle_exception(int vecto + return -1; + } + ++/* cannot be const */ + struct kgdb_arch arch_kgdb_ops; + + /* +diff -urNp linux-2.6.33.1/arch/mips/kernel/process.c linux-2.6.33.1/arch/mips/kernel/process.c +--- linux-2.6.33.1/arch/mips/kernel/process.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/kernel/process.c 2010-03-20 16:58:38.592758361 -0400 +@@ -470,15 +470,3 @@ unsigned long get_wchan(struct task_stru + out: + return pc; + } +- +-/* +- * Don't forget that the stack pointer must be aligned on a 8 bytes +- * boundary for 32-bits ABI and 16 bytes for 64-bits ABI. 
+- */ +-unsigned long arch_align_stack(unsigned long sp) +-{ +- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) +- sp -= get_random_int() & ~PAGE_MASK; +- +- return sp & ALMASK; +-} +diff -urNp linux-2.6.33.1/arch/mips/kernel/syscall.c linux-2.6.33.1/arch/mips/kernel/syscall.c +--- linux-2.6.33.1/arch/mips/kernel/syscall.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/kernel/syscall.c 2010-03-20 16:58:38.592758361 -0400 +@@ -102,6 +102,11 @@ unsigned long arch_get_unmapped_area(str + do_color_align = 0; + if (filp || (flags & MAP_SHARED)) + do_color_align = 1; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(current->mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + if (addr) { + if (do_color_align) + addr = COLOUR_ALIGN(addr, pgoff); +@@ -112,7 +117,7 @@ unsigned long arch_get_unmapped_area(str + (!vmm || addr + len <= vmm->vm_start)) + return addr; + } +- addr = TASK_UNMAPPED_BASE; ++ addr = current->mm->mmap_base; + if (do_color_align) + addr = COLOUR_ALIGN(addr, pgoff); + else +diff -urNp linux-2.6.33.1/arch/mips/loongson/common/pm.c linux-2.6.33.1/arch/mips/loongson/common/pm.c +--- linux-2.6.33.1/arch/mips/loongson/common/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/loongson/common/pm.c 2010-03-20 16:58:38.608773091 -0400 +@@ -147,7 +147,7 @@ static int loongson_pm_valid_state(suspe + } + } + +-static struct platform_suspend_ops loongson_pm_ops = { ++static const struct platform_suspend_ops loongson_pm_ops = { + .valid = loongson_pm_valid_state, + .enter = loongson_pm_enter, + }; +diff -urNp linux-2.6.33.1/arch/mips/mm/fault.c linux-2.6.33.1/arch/mips/mm/fault.c +--- linux-2.6.33.1/arch/mips/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/mips/mm/fault.c 2010-03-20 16:58:38.612819864 -0400 +@@ -26,6 +26,23 @@ + #include <asm/ptrace.h> + #include <asm/highmem.h> /* For VMALLOC_END */ + ++#ifdef CONFIG_PAX_PAGEEXEC ++void pax_report_insns(void *pc) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 5; i++) { ++ unsigned int c; ++ if (get_user(c, (unsigned int *)pc+i)) ++ printk(KERN_CONT "???????? "); ++ else ++ printk(KERN_CONT "%08x ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + /* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate +diff -urNp linux-2.6.33.1/arch/parisc/include/asm/elf.h linux-2.6.33.1/arch/parisc/include/asm/elf.h +--- linux-2.6.33.1/arch/parisc/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/parisc/include/asm/elf.h 2010-03-20 16:58:38.624763350 -0400 +@@ -342,6 +342,13 @@ struct pt_regs; /* forward declaration.. + + #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x01000000) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE 0x10000UL ++ ++#define PAX_DELTA_MMAP_LEN 16 ++#define PAX_DELTA_STACK_LEN 16 ++#endif ++ + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, + but it's not easy, and we've already done it here. 
*/ +diff -urNp linux-2.6.33.1/arch/parisc/include/asm/pgtable.h linux-2.6.33.1/arch/parisc/include/asm/pgtable.h +--- linux-2.6.33.1/arch/parisc/include/asm/pgtable.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/parisc/include/asm/pgtable.h 2010-03-20 16:58:38.628643492 -0400 +@@ -207,6 +207,17 @@ + #define PAGE_EXECREAD __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_EXEC |_PAGE_ACCESSED) + #define PAGE_COPY PAGE_EXECREAD + #define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED) ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++# define PAGE_SHARED_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_ACCESSED) ++# define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED) ++# define PAGE_READONLY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_ACCESSED) ++#else ++# define PAGE_SHARED_NOEXEC PAGE_SHARED ++# define PAGE_COPY_NOEXEC PAGE_COPY ++# define PAGE_READONLY_NOEXEC PAGE_READONLY ++#endif ++ + #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) + #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL & ~_PAGE_WRITE) + #define PAGE_KERNEL_UNC __pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE) +diff -urNp linux-2.6.33.1/arch/parisc/kernel/module.c linux-2.6.33.1/arch/parisc/kernel/module.c +--- linux-2.6.33.1/arch/parisc/kernel/module.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/parisc/kernel/module.c 2010-03-20 16:58:38.636778509 -0400 +@@ -95,16 +95,38 @@ + + /* three functions to determine where in the module core + * or init pieces the location is */ ++static inline int in_init_rx(struct module *me, void *loc) ++{ ++ return (loc >= me->module_init_rx && ++ loc < (me->module_init_rx + me->init_size_rx)); ++} ++ ++static inline int in_init_rw(struct module *me, void *loc) ++{ ++ return (loc >= me->module_init_rw && ++ loc < (me->module_init_rw + me->init_size_rw)); ++} ++ + static inline int in_init(struct module *me, void *loc) + { +- return (loc >= me->module_init && +- loc <= (me->module_init + me->init_size)); ++ return in_init_rx(me, loc) || in_init_rw(me, loc); ++} ++ ++static inline int in_core_rx(struct module *me, void *loc) ++{ ++ return (loc >= me->module_core_rx && ++ loc < (me->module_core_rx + me->core_size_rx)); ++} ++ ++static inline int in_core_rw(struct module *me, void *loc) ++{ ++ return (loc >= me->module_core_rw && ++ loc < (me->module_core_rw + me->core_size_rw)); + } + + static inline int in_core(struct module *me, void *loc) + { +- return (loc >= me->module_core && +- loc <= (me->module_core + me->core_size)); ++ return in_core_rx(me, loc) || in_core_rw(me, loc); + } + + static inline int in_local(struct module *me, void *loc) +@@ -364,13 +386,13 @@ int module_frob_arch_sections(CONST Elf_ + } + + /* align things a bit */ +- me->core_size = ALIGN(me->core_size, 16); +- me->arch.got_offset = me->core_size; +- me->core_size += gots * sizeof(struct got_entry); +- +- me->core_size = ALIGN(me->core_size, 16); +- me->arch.fdesc_offset = me->core_size; +- me->core_size += fdescs * sizeof(Elf_Fdesc); ++ me->core_size_rw = ALIGN(me->core_size_rw, 16); ++ me->arch.got_offset = me->core_size_rw; ++ me->core_size_rw += gots * sizeof(struct got_entry); ++ ++ me->core_size_rw = ALIGN(me->core_size_rw, 16); ++ me->arch.fdesc_offset = me->core_size_rw; ++ me->core_size_rw += fdescs * sizeof(Elf_Fdesc); + + me->arch.got_max = gots; + me->arch.fdesc_max = fdescs; +@@ -388,7 +410,7 @@ static Elf64_Word get_got(struct module + + BUG_ON(value == 0); + +- got 
= me->module_core + me->arch.got_offset; ++ got = me->module_core_rw + me->arch.got_offset; + for (i = 0; got[i].addr; i++) + if (got[i].addr == value) + goto out; +@@ -406,7 +428,7 @@ static Elf64_Word get_got(struct module + #ifdef CONFIG_64BIT + static Elf_Addr get_fdesc(struct module *me, unsigned long value) + { +- Elf_Fdesc *fdesc = me->module_core + me->arch.fdesc_offset; ++ Elf_Fdesc *fdesc = me->module_core_rw + me->arch.fdesc_offset; + + if (!value) { + printk(KERN_ERR "%s: zero OPD requested!\n", me->name); +@@ -424,7 +446,7 @@ static Elf_Addr get_fdesc(struct module + + /* Create new one */ + fdesc->addr = value; +- fdesc->gp = (Elf_Addr)me->module_core + me->arch.got_offset; ++ fdesc->gp = (Elf_Addr)me->module_core_rw + me->arch.got_offset; + return (Elf_Addr)fdesc; + } + #endif /* CONFIG_64BIT */ +@@ -848,7 +870,7 @@ register_unwind_table(struct module *me, + + table = (unsigned char *)sechdrs[me->arch.unwind_section].sh_addr; + end = table + sechdrs[me->arch.unwind_section].sh_size; +- gp = (Elf_Addr)me->module_core + me->arch.got_offset; ++ gp = (Elf_Addr)me->module_core_rw + me->arch.got_offset; + + DEBUGP("register_unwind_table(), sect = %d at 0x%p - 0x%p (gp=0x%lx)\n", + me->arch.unwind_section, table, end, gp); +diff -urNp linux-2.6.33.1/arch/parisc/kernel/sys_parisc.c linux-2.6.33.1/arch/parisc/kernel/sys_parisc.c +--- linux-2.6.33.1/arch/parisc/kernel/sys_parisc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/parisc/kernel/sys_parisc.c 2010-03-20 16:58:38.636778509 -0400 +@@ -98,7 +98,7 @@ unsigned long arch_get_unmapped_area(str + if (flags & MAP_FIXED) + return addr; + if (!addr) +- addr = TASK_UNMAPPED_BASE; ++ addr = current->mm->mmap_base; + + if (filp) { + addr = get_shared_area(filp->f_mapping, addr, len, pgoff); +diff -urNp linux-2.6.33.1/arch/parisc/kernel/traps.c linux-2.6.33.1/arch/parisc/kernel/traps.c +--- linux-2.6.33.1/arch/parisc/kernel/traps.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/parisc/kernel/traps.c 2010-03-20 16:58:38.644755446 -0400 +@@ -733,9 +733,7 @@ void notrace handle_interruption(int cod + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm,regs->iaoq[0]); +- if (vma && (regs->iaoq[0] >= vma->vm_start) +- && (vma->vm_flags & VM_EXEC)) { +- ++ if (vma && (regs->iaoq[0] >= vma->vm_start)) { + fault_address = regs->iaoq[0]; + fault_space = regs->iasq[0]; + +diff -urNp linux-2.6.33.1/arch/parisc/mm/fault.c linux-2.6.33.1/arch/parisc/mm/fault.c +--- linux-2.6.33.1/arch/parisc/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/parisc/mm/fault.c 2010-03-20 16:58:38.644755446 -0400 +@@ -15,6 +15,7 @@ + #include <linux/sched.h> + #include <linux/interrupt.h> + #include <linux/module.h> ++#include <linux/unistd.h> + + #include <asm/uaccess.h> + #include <asm/traps.h> +@@ -52,7 +53,7 @@ DEFINE_PER_CPU(struct exception_data, ex + static unsigned long + parisc_acctyp(unsigned long code, unsigned int inst) + { +- if (code == 6 || code == 16) ++ if (code == 6 || code == 7 || code == 16) + return VM_EXEC; + + switch (inst & 0xf0000000) { +@@ -138,6 +139,116 @@ parisc_acctyp(unsigned long code, unsign + } + #endif + ++#ifdef CONFIG_PAX_PAGEEXEC ++/* ++ * PaX: decide what to do with offenders (instruction_pointer(regs) = fault address) ++ * ++ * returns 1 when task should be killed ++ * 2 when rt_sigreturn trampoline was detected ++ * 3 when unpatched PLT trampoline was detected ++ */ ++static int pax_handle_fetch_fault(struct pt_regs *regs) ++{ ++ ++#ifdef CONFIG_PAX_EMUPLT ++ int err; ++ ++ 
do { /* PaX: unpatched PLT emulation */ ++ unsigned int bl, depwi; ++ ++ err = get_user(bl, (unsigned int *)instruction_pointer(regs)); ++ err |= get_user(depwi, (unsigned int *)(instruction_pointer(regs)+4)); ++ ++ if (err) ++ break; ++ ++ if (bl == 0xEA9F1FDDU && depwi == 0xD6801C1EU) { ++ unsigned int ldw, bv, ldw2, addr = instruction_pointer(regs)-12; ++ ++ err = get_user(ldw, (unsigned int *)addr); ++ err |= get_user(bv, (unsigned int *)(addr+4)); ++ err |= get_user(ldw2, (unsigned int *)(addr+8)); ++ ++ if (err) ++ break; ++ ++ if (ldw == 0x0E801096U && ++ bv == 0xEAC0C000U && ++ ldw2 == 0x0E881095U) ++ { ++ unsigned int resolver, map; ++ ++ err = get_user(resolver, (unsigned int *)(instruction_pointer(regs)+8)); ++ err |= get_user(map, (unsigned int *)(instruction_pointer(regs)+12)); ++ if (err) ++ break; ++ ++ regs->gr[20] = instruction_pointer(regs)+8; ++ regs->gr[21] = map; ++ regs->gr[22] = resolver; ++ regs->iaoq[0] = resolver | 3UL; ++ regs->iaoq[1] = regs->iaoq[0] + 4; ++ return 3; ++ } ++ } ++ } while (0); ++#endif ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ ++#ifndef CONFIG_PAX_EMUSIGRT ++ if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP)) ++ return 1; ++#endif ++ ++ do { /* PaX: rt_sigreturn emulation */ ++ unsigned int ldi1, ldi2, bel, nop; ++ ++ err = get_user(ldi1, (unsigned int *)instruction_pointer(regs)); ++ err |= get_user(ldi2, (unsigned int *)(instruction_pointer(regs)+4)); ++ err |= get_user(bel, (unsigned int *)(instruction_pointer(regs)+8)); ++ err |= get_user(nop, (unsigned int *)(instruction_pointer(regs)+12)); ++ ++ if (err) ++ break; ++ ++ if ((ldi1 == 0x34190000U || ldi1 == 0x34190002U) && ++ ldi2 == 0x3414015AU && ++ bel == 0xE4008200U && ++ nop == 0x08000240U) ++ { ++ regs->gr[25] = (ldi1 & 2) >> 1; ++ regs->gr[20] = __NR_rt_sigreturn; ++ regs->gr[31] = regs->iaoq[1] + 16; ++ regs->sr[0] = regs->iasq[1]; ++ regs->iaoq[0] = 0x100UL; ++ regs->iaoq[1] = regs->iaoq[0] + 4; ++ regs->iasq[0] = regs->sr[2]; ++ regs->iasq[1] = regs->sr[2]; ++ return 2; ++ } ++ } while (0); ++#endif ++ ++ return 1; ++} ++ ++void pax_report_insns(void *pc, void *sp) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 5; i++) { ++ unsigned int c; ++ if (get_user(c, (unsigned int *)pc+i)) ++ printk(KERN_CONT "???????? 
"); ++ else ++ printk(KERN_CONT "%08x ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + int fixup_exception(struct pt_regs *regs) + { + const struct exception_table_entry *fix; +@@ -192,8 +303,33 @@ good_area: + + acc_type = parisc_acctyp(code,regs->iir); + +- if ((vma->vm_flags & acc_type) != acc_type) ++ if ((vma->vm_flags & acc_type) != acc_type) { ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && (acc_type & VM_EXEC) && ++ (address & ~3UL) == instruction_pointer(regs)) ++ { ++ up_read(&mm->mmap_sem); ++ switch (pax_handle_fetch_fault(regs)) { ++ ++#ifdef CONFIG_PAX_EMUPLT ++ case 3: ++ return; ++#endif ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ case 2: ++ return; ++#endif ++ ++ } ++ pax_report_fault(regs, (void *)instruction_pointer(regs), (void *)regs->gr[30]); ++ do_group_exit(SIGKILL); ++ } ++#endif ++ + goto bad_area; ++ } + + /* + * If for any reason at all we couldn't handle the fault, make +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/device.h linux-2.6.33.1/arch/powerpc/include/asm/device.h +--- linux-2.6.33.1/arch/powerpc/include/asm/device.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/device.h 2010-03-20 16:58:38.652757516 -0400 +@@ -14,7 +14,7 @@ struct dev_archdata { + struct device_node *of_node; + + /* DMA operations on that device */ +- struct dma_map_ops *dma_ops; ++ const struct dma_map_ops *dma_ops; + + /* + * When an iommu is in use, dma_data is used as a ptr to the base of the +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/dma-mapping.h linux-2.6.33.1/arch/powerpc/include/asm/dma-mapping.h +--- linux-2.6.33.1/arch/powerpc/include/asm/dma-mapping.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/dma-mapping.h 2010-03-20 16:58:38.664764295 -0400 +@@ -69,9 +69,9 @@ static inline unsigned long device_to_ma + #ifdef CONFIG_PPC64 + extern struct dma_map_ops dma_iommu_ops; + #endif +-extern struct dma_map_ops dma_direct_ops; ++extern const struct dma_map_ops dma_direct_ops; + +-static inline struct dma_map_ops *get_dma_ops(struct device *dev) ++static inline const struct dma_map_ops *get_dma_ops(struct device *dev) + { + /* We don't handle the NULL dev case for ISA for now. We could + * do it via an out of line call but it is not needed for now. 
The +@@ -84,7 +84,7 @@ static inline struct dma_map_ops *get_dm + return dev->archdata.dma_ops; + } + +-static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) ++static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops) + { + dev->archdata.dma_ops = ops; + } +@@ -118,7 +118,7 @@ static inline void set_dma_offset(struct + + static inline int dma_supported(struct device *dev, u64 mask) + { +- struct dma_map_ops *dma_ops = get_dma_ops(dev); ++ const struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (unlikely(dma_ops == NULL)) + return 0; +@@ -132,7 +132,7 @@ static inline int dma_supported(struct d + + static inline int dma_set_mask(struct device *dev, u64 dma_mask) + { +- struct dma_map_ops *dma_ops = get_dma_ops(dev); ++ const struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (unlikely(dma_ops == NULL)) + return -EIO; +@@ -147,7 +147,7 @@ static inline int dma_set_mask(struct de + static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) + { +- struct dma_map_ops *dma_ops = get_dma_ops(dev); ++ const struct dma_map_ops *dma_ops = get_dma_ops(dev); + void *cpu_addr; + + BUG_ON(!dma_ops); +@@ -162,7 +162,7 @@ static inline void *dma_alloc_coherent(s + static inline void dma_free_coherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) + { +- struct dma_map_ops *dma_ops = get_dma_ops(dev); ++ const struct dma_map_ops *dma_ops = get_dma_ops(dev); + + BUG_ON(!dma_ops); + +@@ -173,7 +173,7 @@ static inline void dma_free_coherent(str + + static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + { +- struct dma_map_ops *dma_ops = get_dma_ops(dev); ++ const struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops->mapping_error) + return dma_ops->mapping_error(dev, dma_addr); +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/elf.h linux-2.6.33.1/arch/powerpc/include/asm/elf.h +--- linux-2.6.33.1/arch/powerpc/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/elf.h 2010-03-20 16:58:38.664764295 -0400 +@@ -178,8 +178,19 @@ typedef elf_fpreg_t elf_vsrreghalf_t32[E + the loader. We need to make sure that it is out of the way of the program + that it will "exec", and that there is sufficient room for the brk. */ + +-extern unsigned long randomize_et_dyn(unsigned long base); +-#define ELF_ET_DYN_BASE (randomize_et_dyn(0x20000000)) ++#define ELF_ET_DYN_BASE (0x20000000) ++ ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (0x10000000UL) ++ ++#ifdef __powerpc64__ ++#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT) ? 16 : 28) ++#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT) ? 
16 : 28) ++#else ++#define PAX_DELTA_MMAP_LEN 15 ++#define PAX_DELTA_STACK_LEN 15 ++#endif ++#endif + + /* + * Our registers are always unsigned longs, whether we're a 32 bit +@@ -274,9 +285,6 @@ extern int arch_setup_additional_pages(s + (0x7ff >> (PAGE_SHIFT - 12)) : \ + (0x3ffff >> (PAGE_SHIFT - 12))) + +-extern unsigned long arch_randomize_brk(struct mm_struct *mm); +-#define arch_randomize_brk arch_randomize_brk +- + #endif /* __KERNEL__ */ + + /* +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/iommu.h linux-2.6.33.1/arch/powerpc/include/asm/iommu.h +--- linux-2.6.33.1/arch/powerpc/include/asm/iommu.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/iommu.h 2010-03-20 16:58:38.664764295 -0400 +@@ -116,6 +116,9 @@ extern void iommu_init_early_iSeries(voi + extern void iommu_init_early_dart(void); + extern void iommu_init_early_pasemi(void); + ++/* dma-iommu.c */ ++extern int dma_iommu_dma_supported(struct device *dev, u64 mask); ++ + #ifdef CONFIG_PCI + extern void pci_iommu_init(void); + extern void pci_direct_iommu_init(void); +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/kmap_types.h linux-2.6.33.1/arch/powerpc/include/asm/kmap_types.h +--- linux-2.6.33.1/arch/powerpc/include/asm/kmap_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/kmap_types.h 2010-03-20 16:58:38.664764295 -0400 +@@ -26,6 +26,7 @@ enum km_type { + KM_SOFTIRQ1, + KM_PPC_SYNC_PAGE, + KM_PPC_SYNC_ICACHE, ++ KM_CLEARPAGE, + KM_TYPE_NR + }; + +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/page_64.h linux-2.6.33.1/arch/powerpc/include/asm/page_64.h +--- linux-2.6.33.1/arch/powerpc/include/asm/page_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/page_64.h 2010-03-20 16:58:38.664764295 -0400 +@@ -180,15 +180,18 @@ do { \ + * stack by default, so in the absense of a PT_GNU_STACK program header + * we turn execute permission off. + */ +-#define VM_STACK_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ +- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) ++#define VM_STACK_DEFAULT_FLAGS32 \ ++ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \ ++ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + + #define VM_STACK_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + ++#ifndef CONFIG_PAX_PAGEEXEC + #define VM_STACK_DEFAULT_FLAGS \ + (test_thread_flag(TIF_32BIT) ? \ + VM_STACK_DEFAULT_FLAGS32 : VM_STACK_DEFAULT_FLAGS64) ++#endif + + #include <asm-generic/getorder.h> + +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/page.h linux-2.6.33.1/arch/powerpc/include/asm/page.h +--- linux-2.6.33.1/arch/powerpc/include/asm/page.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/page.h 2010-03-20 16:58:38.668753880 -0400 +@@ -116,8 +116,9 @@ extern phys_addr_t kernstart_addr; + * and needs to be executable. This means the whole heap ends + * up being executable. + */ +-#define VM_DATA_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ +- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) ++#define VM_DATA_DEFAULT_FLAGS32 \ ++ (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) | \ ++ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + + #define VM_DATA_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +@@ -145,6 +146,9 @@ extern phys_addr_t kernstart_addr; + #define is_kernel_addr(x) ((x) >= PAGE_OFFSET) + #endif + ++#define ktla_ktva(addr) (addr) ++#define ktva_ktla(addr) (addr) ++ + #ifndef __ASSEMBLY__ + + #undef STRICT_MM_TYPECHECKS +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/pci.h linux-2.6.33.1/arch/powerpc/include/asm/pci.h +--- linux-2.6.33.1/arch/powerpc/include/asm/pci.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/pci.h 2010-03-20 16:58:38.668753880 -0400 +@@ -65,8 +65,8 @@ static inline int pci_get_legacy_ide_irq + } + + #ifdef CONFIG_PCI +-extern void set_pci_dma_ops(struct dma_map_ops *dma_ops); +-extern struct dma_map_ops *get_pci_dma_ops(void); ++extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops); ++extern const struct dma_map_ops *get_pci_dma_ops(void); + #else /* CONFIG_PCI */ + #define set_pci_dma_ops(d) + #define get_pci_dma_ops() NULL +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/pte-hash32.h linux-2.6.33.1/arch/powerpc/include/asm/pte-hash32.h +--- linux-2.6.33.1/arch/powerpc/include/asm/pte-hash32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/pte-hash32.h 2010-03-20 16:58:38.668753880 -0400 +@@ -21,6 +21,7 @@ + #define _PAGE_FILE 0x004 /* when !present: nonlinear file mapping */ + #define _PAGE_USER 0x004 /* usermode access allowed */ + #define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */ ++#define _PAGE_EXEC _PAGE_GUARDED + #define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */ + #define _PAGE_NO_CACHE 0x020 /* I: cache inhibit */ + #define _PAGE_WRITETHRU 0x040 /* W: cache write-through */ +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/reg.h linux-2.6.33.1/arch/powerpc/include/asm/reg.h +--- linux-2.6.33.1/arch/powerpc/include/asm/reg.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/reg.h 2010-03-20 16:58:38.672753614 -0400 +@@ -191,6 +191,7 @@ + #define SPRN_DBCR 0x136 /* e300 Data Breakpoint Control Reg */ + #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ + #define DSISR_NOHPTE 0x40000000 /* no translation found */ ++#define DSISR_GUARDED 0x10000000 /* fetch from guarded storage */ + #define DSISR_PROTFAULT 0x08000000 /* protection fault */ + #define DSISR_ISSTORE 0x02000000 /* access was a store */ + #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/swiotlb.h linux-2.6.33.1/arch/powerpc/include/asm/swiotlb.h +--- linux-2.6.33.1/arch/powerpc/include/asm/swiotlb.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/swiotlb.h 2010-03-20 16:58:38.672753614 -0400 +@@ -13,7 +13,7 @@ + + #include <linux/swiotlb.h> + +-extern struct dma_map_ops swiotlb_dma_ops; ++extern const struct dma_map_ops swiotlb_dma_ops; + + static inline void dma_mark_clean(void *addr, size_t size) {} + +diff -urNp linux-2.6.33.1/arch/powerpc/include/asm/uaccess.h linux-2.6.33.1/arch/powerpc/include/asm/uaccess.h +--- linux-2.6.33.1/arch/powerpc/include/asm/uaccess.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/include/asm/uaccess.h 2010-03-20 16:58:38.676578811 -0400 +@@ -327,52 +327,6 @@ do { \ + extern unsigned long __copy_tofrom_user(void __user *to, + const void __user *from, unsigned long size); + 
+-#ifndef __powerpc64__ +- +-static inline unsigned long copy_from_user(void *to, +- const void __user *from, unsigned long n) +-{ +- unsigned long over; +- +- if (access_ok(VERIFY_READ, from, n)) +- return __copy_tofrom_user((__force void __user *)to, from, n); +- if ((unsigned long)from < TASK_SIZE) { +- over = (unsigned long)from + n - TASK_SIZE; +- return __copy_tofrom_user((__force void __user *)to, from, +- n - over) + over; +- } +- return n; +-} +- +-static inline unsigned long copy_to_user(void __user *to, +- const void *from, unsigned long n) +-{ +- unsigned long over; +- +- if (access_ok(VERIFY_WRITE, to, n)) +- return __copy_tofrom_user(to, (__force void __user *)from, n); +- if ((unsigned long)to < TASK_SIZE) { +- over = (unsigned long)to + n - TASK_SIZE; +- return __copy_tofrom_user(to, (__force void __user *)from, +- n - over) + over; +- } +- return n; +-} +- +-#else /* __powerpc64__ */ +- +-#define __copy_in_user(to, from, size) \ +- __copy_tofrom_user((to), (from), (size)) +- +-extern unsigned long copy_from_user(void *to, const void __user *from, +- unsigned long n); +-extern unsigned long copy_to_user(void __user *to, const void *from, +- unsigned long n); +-extern unsigned long copy_in_user(void __user *to, const void __user *from, +- unsigned long n); +- +-#endif /* __powerpc64__ */ +- + static inline unsigned long __copy_from_user_inatomic(void *to, + const void __user *from, unsigned long n) + { +@@ -396,6 +350,10 @@ static inline unsigned long __copy_from_ + if (ret == 0) + return 0; + } ++ ++ if (!__builtin_constant_p(n)) ++ check_object_size(to, n, false); ++ + return __copy_tofrom_user((__force void __user *)to, from, n); + } + +@@ -422,6 +380,10 @@ static inline unsigned long __copy_to_us + if (ret == 0) + return 0; + } ++ ++ if (!__builtin_constant_p(n)) ++ check_object_size(from, n, true); ++ + return __copy_tofrom_user(to, (__force const void __user *)from, n); + } + +@@ -439,6 +401,92 @@ static inline unsigned long __copy_to_us + return __copy_to_user_inatomic(to, from, size); + } + ++#ifndef __powerpc64__ ++ ++static inline unsigned long __must_check copy_from_user(void *to, ++ const void __user *from, unsigned long n) ++{ ++ unsigned long over; ++ ++ if ((long)n < 0) ++ return n; ++ ++ if (access_ok(VERIFY_READ, from, n)) { ++ if (!__builtin_constant_p(n)) ++ check_object_size(to, n, false); ++ return __copy_tofrom_user((__force void __user *)to, from, n); ++ } ++ if ((unsigned long)from < TASK_SIZE) { ++ over = (unsigned long)from + n - TASK_SIZE; ++ if (!__builtin_constant_p(n - over)) ++ check_object_size(to, n - over, false); ++ return __copy_tofrom_user((__force void __user *)to, from, ++ n - over) + over; ++ } ++ return n; ++} ++ ++static inline unsigned long __must_check copy_to_user(void __user *to, ++ const void *from, unsigned long n) ++{ ++ unsigned long over; ++ ++ if ((long)n < 0) ++ return n; ++ ++ if (access_ok(VERIFY_WRITE, to, n)) { ++ if (!__builtin_constant_p(n)) ++ check_object_size(from, n, true); ++ return __copy_tofrom_user(to, (__force void __user *)from, n); ++ } ++ if ((unsigned long)to < TASK_SIZE) { ++ over = (unsigned long)to + n - TASK_SIZE; ++ if (!__builtin_constant_p(n)) ++ check_object_size(from, n - over, true); ++ return __copy_tofrom_user(to, (__force void __user *)from, ++ n - over) + over; ++ } ++ return n; ++} ++ ++#else /* __powerpc64__ */ ++ ++#define __copy_in_user(to, from, size) \ ++ __copy_tofrom_user((to), (from), (size)) ++ ++static inline unsigned long __must_check copy_from_user(void *to, const void __user 
*from, unsigned long n) ++{ ++ if ((long)n < 0 || n > INT_MAX) ++ return n; ++ ++ if (!__builtin_constant_p(n)) ++ check_object_size(to, n, false); ++ ++ if (likely(access_ok(VERIFY_READ, from, n))) ++ n = __copy_from_user(to, from, n); ++ else ++ memset(to, 0, n); ++ return n; ++} ++ ++static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) ++{ ++ if ((long)n < 0 || n > INT_MAX) ++ return n; ++ ++ if (likely(access_ok(VERIFY_WRITE, to, n))) { ++ if (!__builtin_constant_p(n)) ++ check_object_size(from, n, true); ++ n = __copy_to_user(to, from, n); ++ } ++ return n; ++} ++ ++extern unsigned long copy_in_user(void __user *to, const void __user *from, ++ unsigned long n); ++ ++#endif /* __powerpc64__ */ ++ + extern unsigned long __clear_user(void __user *addr, unsigned long size); + + static inline unsigned long clear_user(void __user *addr, unsigned long size) +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/cacheinfo.c linux-2.6.33.1/arch/powerpc/kernel/cacheinfo.c +--- linux-2.6.33.1/arch/powerpc/kernel/cacheinfo.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/cacheinfo.c 2010-03-20 16:58:38.684761974 -0400 +@@ -642,7 +642,7 @@ static struct kobj_attribute *cache_inde + &cache_assoc_attr, + }; + +-static struct sysfs_ops cache_index_ops = { ++static const struct sysfs_ops cache_index_ops = { + .show = cache_index_show, + }; + +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/dma.c linux-2.6.33.1/arch/powerpc/kernel/dma.c +--- linux-2.6.33.1/arch/powerpc/kernel/dma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/dma.c 2010-03-20 16:58:38.700769723 -0400 +@@ -134,7 +134,7 @@ static inline void dma_direct_sync_singl + } + #endif + +-struct dma_map_ops dma_direct_ops = { ++const struct dma_map_ops dma_direct_ops = { + .alloc_coherent = dma_direct_alloc_coherent, + .free_coherent = dma_direct_free_coherent, + .map_sg = dma_direct_map_sg, +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/dma-iommu.c linux-2.6.33.1/arch/powerpc/kernel/dma-iommu.c +--- linux-2.6.33.1/arch/powerpc/kernel/dma-iommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/dma-iommu.c 2010-03-20 16:58:38.708797745 -0400 +@@ -70,7 +70,7 @@ static void dma_iommu_unmap_sg(struct de + } + + /* We support DMA to/from any memory page via the iommu */ +-static int dma_iommu_dma_supported(struct device *dev, u64 mask) ++int dma_iommu_dma_supported(struct device *dev, u64 mask) + { + struct iommu_table *tbl = get_iommu_table_base(dev); + +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/dma-swiotlb.c linux-2.6.33.1/arch/powerpc/kernel/dma-swiotlb.c +--- linux-2.6.33.1/arch/powerpc/kernel/dma-swiotlb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/dma-swiotlb.c 2010-03-20 16:58:38.720772371 -0400 +@@ -30,7 +30,7 @@ unsigned int ppc_swiotlb_enable; + * map_page, and unmap_page on highmem, use normal dma_ops + * for everything else. 
+ */ +-struct dma_map_ops swiotlb_dma_ops = { ++const struct dma_map_ops swiotlb_dma_ops = { + .alloc_coherent = dma_direct_alloc_coherent, + .free_coherent = dma_direct_free_coherent, + .map_sg = swiotlb_map_sg_attrs, +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/exceptions-64e.S linux-2.6.33.1/arch/powerpc/kernel/exceptions-64e.S +--- linux-2.6.33.1/arch/powerpc/kernel/exceptions-64e.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/exceptions-64e.S 2010-03-20 16:58:38.720772371 -0400 +@@ -455,6 +455,7 @@ storage_fault_common: + std r14,_DAR(r1) + std r15,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD ++ bl .save_nvgprs + mr r4,r14 + mr r5,r15 + ld r14,PACA_EXGEN+EX_R14(r13) +@@ -464,8 +465,7 @@ storage_fault_common: + cmpdi r3,0 + bne- 1f + b .ret_from_except_lite +-1: bl .save_nvgprs +- mr r5,r3 ++1: mr r5,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + ld r4,_DAR(r1) + bl .bad_page_fault +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/exceptions-64s.S linux-2.6.33.1/arch/powerpc/kernel/exceptions-64s.S +--- linux-2.6.33.1/arch/powerpc/kernel/exceptions-64s.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/exceptions-64s.S 2010-03-20 16:58:38.720772371 -0400 +@@ -829,10 +829,10 @@ handle_page_fault: + 11: ld r4,_DAR(r1) + ld r5,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD ++ bl .save_nvgprs + bl .do_page_fault + cmpdi r3,0 + beq+ 13f +- bl .save_nvgprs + mr r5,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + lwz r4,_DAR(r1) +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/ibmebus.c linux-2.6.33.1/arch/powerpc/kernel/ibmebus.c +--- linux-2.6.33.1/arch/powerpc/kernel/ibmebus.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/ibmebus.c 2010-03-20 16:58:38.720772371 -0400 +@@ -127,7 +127,7 @@ static int ibmebus_dma_supported(struct + return 1; + } + +-static struct dma_map_ops ibmebus_dma_ops = { ++static const struct dma_map_ops ibmebus_dma_ops = { + .alloc_coherent = ibmebus_alloc_coherent, + .free_coherent = ibmebus_free_coherent, + .map_sg = ibmebus_map_sg, +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/kgdb.c linux-2.6.33.1/arch/powerpc/kernel/kgdb.c +--- linux-2.6.33.1/arch/powerpc/kernel/kgdb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/kgdb.c 2010-03-20 16:58:38.720772371 -0400 +@@ -126,7 +126,7 @@ static int kgdb_handle_breakpoint(struct + if (kgdb_handle_exception(0, SIGTRAP, 0, regs) != 0) + return 0; + +- if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) ++ if (*(u32 *) (regs->nip) == *(const u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) + regs->nip += 4; + + return 1; +@@ -353,7 +353,7 @@ int kgdb_arch_handle_exception(int vecto + /* + * Global data + */ +-struct kgdb_arch arch_kgdb_ops = { ++const struct kgdb_arch arch_kgdb_ops = { + .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, + }; + +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/module_32.c linux-2.6.33.1/arch/powerpc/kernel/module_32.c +--- linux-2.6.33.1/arch/powerpc/kernel/module_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/module_32.c 2010-03-20 16:58:38.724667855 -0400 +@@ -162,7 +162,7 @@ int module_frob_arch_sections(Elf32_Ehdr + me->arch.core_plt_section = i; + } + if (!me->arch.core_plt_section || !me->arch.init_plt_section) { +- printk("Module doesn't contain .plt or .init.plt sections.\n"); ++ printk("Module %s doesn't contain .plt or .init.plt sections.\n", me->name); + return -ENOEXEC; + } + +@@ -203,11 +203,16 @@ static uint32_t do_plt_call(void *locati + + DEBUGP("Doing 
plt for call to 0x%x at 0x%x\n", val, (unsigned int)location); + /* Init, or core PLT? */ +- if (location >= mod->module_core +- && location < mod->module_core + mod->core_size) ++ if ((location >= mod->module_core_rx && location < mod->module_core_rx + mod->core_size_rx) || ++ (location >= mod->module_core_rw && location < mod->module_core_rw + mod->core_size_rw)) + entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr; +- else ++ else if ((location >= mod->module_init_rx && location < mod->module_init_rx + mod->init_size_rx) || ++ (location >= mod->module_init_rw && location < mod->module_init_rw + mod->init_size_rw)) + entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr; ++ else { ++ printk(KERN_ERR "%s: invalid R_PPC_REL24 entry found\n", mod->name); ++ return ~0UL; ++ } + + /* Find this entry, or if that fails, the next avail. entry */ + while (entry->jump[0]) { +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/module.c linux-2.6.33.1/arch/powerpc/kernel/module.c +--- linux-2.6.33.1/arch/powerpc/kernel/module.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/module.c 2010-03-20 16:58:38.724667855 -0400 +@@ -31,11 +31,24 @@ + + LIST_HEAD(module_bug_list); + ++#ifdef CONFIG_PAX_KERNEXEC + void *module_alloc(unsigned long size) + { + if (size == 0) + return NULL; + ++ return vmalloc(size); ++} ++ ++void *module_alloc_exec(unsigned long size) ++#else ++void *module_alloc(unsigned long size) ++#endif ++ ++{ ++ if (size == 0) ++ return NULL; ++ + return vmalloc_exec(size); + } + +@@ -45,6 +58,13 @@ void module_free(struct module *mod, voi + vfree(module_region); + } + ++#ifdef CONFIG_PAX_KERNEXEC ++void module_free_exec(struct module *mod, void *module_region) ++{ ++ module_free(mod, module_region); ++} ++#endif ++ + static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + const char *name) +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/pci-common.c linux-2.6.33.1/arch/powerpc/kernel/pci-common.c +--- linux-2.6.33.1/arch/powerpc/kernel/pci-common.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/pci-common.c 2010-03-20 16:58:38.736620382 -0400 +@@ -50,14 +50,14 @@ resource_size_t isa_mem_base; + unsigned int ppc_pci_flags = 0; + + +-static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; ++static const struct dma_map_ops *pci_dma_ops = &dma_direct_ops; + +-void set_pci_dma_ops(struct dma_map_ops *dma_ops) ++void set_pci_dma_ops(const struct dma_map_ops *dma_ops) + { + pci_dma_ops = dma_ops; + } + +-struct dma_map_ops *get_pci_dma_ops(void) ++const struct dma_map_ops *get_pci_dma_ops(void) + { + return pci_dma_ops; + } +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/process.c linux-2.6.33.1/arch/powerpc/kernel/process.c +--- linux-2.6.33.1/arch/powerpc/kernel/process.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/process.c 2010-03-20 16:58:38.740772809 -0400 +@@ -1141,51 +1141,3 @@ unsigned long arch_align_stack(unsigned + sp -= get_random_int() & ~PAGE_MASK; + return sp & ~0xf; + } +- +-static inline unsigned long brk_rnd(void) +-{ +- unsigned long rnd = 0; +- +- /* 8MB for 32bit, 1GB for 64bit */ +- if (is_32bit_task()) +- rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); +- else +- rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); +- +- return rnd << PAGE_SHIFT; +-} +- +-unsigned long arch_randomize_brk(struct mm_struct *mm) +-{ +- unsigned long base = mm->brk; +- unsigned long ret; +- +-#ifdef CONFIG_PPC_STD_MMU_64 +- /* +- * If we are using 1TB 
segments and we are allowed to randomise +- * the heap, we can put it above 1TB so it is backed by a 1TB +- * segment. Otherwise the heap will be in the bottom 1TB +- * which always uses 256MB segments and this may result in a +- * performance penalty. +- */ +- if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) +- base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); +-#endif +- +- ret = PAGE_ALIGN(base + brk_rnd()); +- +- if (ret < mm->brk) +- return mm->brk; +- +- return ret; +-} +- +-unsigned long randomize_et_dyn(unsigned long base) +-{ +- unsigned long ret = PAGE_ALIGN(base + brk_rnd()); +- +- if (ret < base) +- return base; +- +- return ret; +-} +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/signal_32.c linux-2.6.33.1/arch/powerpc/kernel/signal_32.c +--- linux-2.6.33.1/arch/powerpc/kernel/signal_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/signal_32.c 2010-03-20 16:58:38.740772809 -0400 +@@ -857,7 +857,7 @@ int handle_rt_signal32(unsigned long sig + /* Save user registers on the stack */ + frame = &rt_sf->uc.uc_mcontext; + addr = frame; +- if (vdso32_rt_sigtramp && current->mm->context.vdso_base) { ++ if (vdso32_rt_sigtramp && current->mm->context.vdso_base != ~0UL) { + if (save_user_regs(regs, frame, 0, 1)) + goto badframe; + regs->link = current->mm->context.vdso_base + vdso32_rt_sigtramp; +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/signal_64.c linux-2.6.33.1/arch/powerpc/kernel/signal_64.c +--- linux-2.6.33.1/arch/powerpc/kernel/signal_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/signal_64.c 2010-03-20 16:58:38.740772809 -0400 +@@ -429,7 +429,7 @@ int handle_rt_signal64(int signr, struct + current->thread.fpscr.val = 0; + + /* Set up to return from userspace. 
*/ +- if (vdso64_rt_sigtramp && current->mm->context.vdso_base) { ++ if (vdso64_rt_sigtramp && current->mm->context.vdso_base != ~0UL) { + regs->link = current->mm->context.vdso_base + vdso64_rt_sigtramp; + } else { + err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]); +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/vdso.c linux-2.6.33.1/arch/powerpc/kernel/vdso.c +--- linux-2.6.33.1/arch/powerpc/kernel/vdso.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/vdso.c 2010-03-20 16:58:38.740772809 -0400 +@@ -36,6 +36,7 @@ + #include <asm/firmware.h> + #include <asm/vdso.h> + #include <asm/vdso_datapage.h> ++#include <asm/mman.h> + + #include "setup.h" + +@@ -220,7 +221,7 @@ int arch_setup_additional_pages(struct l + vdso_base = VDSO32_MBASE; + #endif + +- current->mm->context.vdso_base = 0; ++ current->mm->context.vdso_base = ~0UL; + + /* vDSO has a problem and was disabled, just don't "enable" it for the + * process +@@ -240,7 +241,7 @@ int arch_setup_additional_pages(struct l + vdso_base = get_unmapped_area(NULL, vdso_base, + (vdso_pages << PAGE_SHIFT) + + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), +- 0, 0); ++ 0, MAP_PRIVATE | MAP_EXECUTABLE); + if (IS_ERR_VALUE(vdso_base)) { + rc = vdso_base; + goto fail_mmapsem; +diff -urNp linux-2.6.33.1/arch/powerpc/kernel/vio.c linux-2.6.33.1/arch/powerpc/kernel/vio.c +--- linux-2.6.33.1/arch/powerpc/kernel/vio.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/kernel/vio.c 2010-03-20 16:58:38.744562669 -0400 +@@ -601,11 +601,12 @@ static void vio_dma_iommu_unmap_sg(struc + vio_cmo_dealloc(viodev, alloc_size); + } + +-struct dma_map_ops vio_dma_mapping_ops = { ++static const struct dma_map_ops vio_dma_mapping_ops = { + .alloc_coherent = vio_dma_iommu_alloc_coherent, + .free_coherent = vio_dma_iommu_free_coherent, + .map_sg = vio_dma_iommu_map_sg, + .unmap_sg = vio_dma_iommu_unmap_sg, ++ .dma_supported = dma_iommu_dma_supported, + .map_page = vio_dma_iommu_map_page, + .unmap_page = vio_dma_iommu_unmap_page, + +@@ -857,7 +858,6 @@ static void vio_cmo_bus_remove(struct vi + + static void vio_cmo_set_dma_ops(struct vio_dev *viodev) + { +- vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported; + viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops; + } + +diff -urNp linux-2.6.33.1/arch/powerpc/lib/usercopy_64.c linux-2.6.33.1/arch/powerpc/lib/usercopy_64.c +--- linux-2.6.33.1/arch/powerpc/lib/usercopy_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/lib/usercopy_64.c 2010-03-20 16:58:38.748775088 -0400 +@@ -9,22 +9,6 @@ + #include <linux/module.h> + #include <asm/uaccess.h> + +-unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) +-{ +- if (likely(access_ok(VERIFY_READ, from, n))) +- n = __copy_from_user(to, from, n); +- else +- memset(to, 0, n); +- return n; +-} +- +-unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) +-{ +- if (likely(access_ok(VERIFY_WRITE, to, n))) +- n = __copy_to_user(to, from, n); +- return n; +-} +- + unsigned long copy_in_user(void __user *to, const void __user *from, + unsigned long n) + { +@@ -35,7 +19,5 @@ unsigned long copy_in_user(void __user * + return n; + } + +-EXPORT_SYMBOL(copy_from_user); +-EXPORT_SYMBOL(copy_to_user); + EXPORT_SYMBOL(copy_in_user); + +diff -urNp linux-2.6.33.1/arch/powerpc/mm/fault.c linux-2.6.33.1/arch/powerpc/mm/fault.c +--- linux-2.6.33.1/arch/powerpc/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/mm/fault.c 2010-03-20 
16:58:38.748775088 -0400 +@@ -30,6 +30,10 @@ + #include <linux/kprobes.h> + #include <linux/kdebug.h> + #include <linux/perf_event.h> ++#include <linux/slab.h> ++#include <linux/pagemap.h> ++#include <linux/compiler.h> ++#include <linux/unistd.h> + + #include <asm/firmware.h> + #include <asm/page.h> +@@ -41,6 +45,7 @@ + #include <asm/tlbflush.h> + #include <asm/siginfo.h> + #include <mm/mmu_decl.h> ++#include <asm/ptrace.h> + + #ifdef CONFIG_KPROBES + static inline int notify_page_fault(struct pt_regs *regs) +@@ -64,6 +69,33 @@ static inline int notify_page_fault(stru + } + #endif + ++#ifdef CONFIG_PAX_PAGEEXEC ++/* ++ * PaX: decide what to do with offenders (regs->nip = fault address) ++ * ++ * returns 1 when task should be killed ++ */ ++static int pax_handle_fetch_fault(struct pt_regs *regs) ++{ ++ return 1; ++} ++ ++void pax_report_insns(void *pc, void *sp) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 5; i++) { ++ unsigned int c; ++ if (get_user(c, (unsigned int __user *)pc+i)) ++ printk(KERN_CONT "???????? "); ++ else ++ printk(KERN_CONT "%08x ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + /* + * Check whether the instruction at regs->nip is a store using + * an update addressing form which will update r1. +@@ -134,7 +166,7 @@ int __kprobes do_page_fault(struct pt_re + * indicate errors in DSISR but can validly be set in SRR1. + */ + if (trap == 0x400) +- error_code &= 0x48200000; ++ error_code &= 0x58200000; + else + is_write = error_code & DSISR_ISSTORE; + #else +@@ -256,7 +288,7 @@ good_area: + * "undefined". Of those that can be set, this is the only + * one which seems bad. + */ +- if (error_code & 0x10000000) ++ if (error_code & DSISR_GUARDED) + /* Guarded storage error. */ + goto bad_area; + #endif /* CONFIG_8xx */ +@@ -271,7 +303,7 @@ good_area: + * processors use the same I/D cache coherency mechanism + * as embedded. 
+ */ +- if (error_code & DSISR_PROTFAULT) ++ if (error_code & (DSISR_PROTFAULT | DSISR_GUARDED)) + goto bad_area; + #endif /* CONFIG_PPC_STD_MMU */ + +@@ -341,6 +373,23 @@ bad_area: + bad_area_nosemaphore: + /* User mode accesses cause a SIGSEGV */ + if (user_mode(regs)) { ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (mm->pax_flags & MF_PAX_PAGEEXEC) { ++#ifdef CONFIG_PPC_STD_MMU ++ if (is_exec && (error_code & (DSISR_PROTFAULT | DSISR_GUARDED))) { ++#else ++ if (is_exec && regs->nip == address) { ++#endif ++ switch (pax_handle_fetch_fault(regs)) { ++ } ++ ++ pax_report_fault(regs, (void *)regs->nip, (void *)regs->gpr[PT_R1]); ++ do_group_exit(SIGKILL); ++ } ++ } ++#endif ++ + _exception(SIGSEGV, regs, code, address); + return 0; + } +diff -urNp linux-2.6.33.1/arch/powerpc/mm/mmap_64.c linux-2.6.33.1/arch/powerpc/mm/mmap_64.c +--- linux-2.6.33.1/arch/powerpc/mm/mmap_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/mm/mmap_64.c 2010-03-20 16:58:38.748775088 -0400 +@@ -99,10 +99,22 @@ void arch_pick_mmap_layout(struct mm_str + */ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base -= mm->delta_mmap + mm->delta_stack; ++#endif ++ + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +diff -urNp linux-2.6.33.1/arch/powerpc/mm/slice.c linux-2.6.33.1/arch/powerpc/mm/slice.c +--- linux-2.6.33.1/arch/powerpc/mm/slice.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/mm/slice.c 2010-03-20 16:58:38.764771645 -0400 +@@ -426,6 +426,11 @@ unsigned long slice_get_unmapped_area(un + if (fixed && addr > (mm->task_size - len)) + return -EINVAL; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!fixed && (mm->pax_flags & MF_PAX_RANDMMAP)) ++ addr = 0; ++#endif ++ + /* If hint, make sure it matches our alignment restrictions */ + if (!fixed && addr) { + addr = _ALIGN_UP(addr, 1ul << pshift); +diff -urNp linux-2.6.33.1/arch/powerpc/platforms/52xx/lite5200_pm.c linux-2.6.33.1/arch/powerpc/platforms/52xx/lite5200_pm.c +--- linux-2.6.33.1/arch/powerpc/platforms/52xx/lite5200_pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/platforms/52xx/lite5200_pm.c 2010-03-20 16:58:38.776769493 -0400 +@@ -235,7 +235,7 @@ static void lite5200_pm_end(void) + lite5200_pm_target_state = PM_SUSPEND_ON; + } + +-static struct platform_suspend_ops lite5200_pm_ops = { ++static const struct platform_suspend_ops lite5200_pm_ops = { + .valid = lite5200_pm_valid, + .begin = lite5200_pm_begin, + .prepare = lite5200_pm_prepare, +diff -urNp linux-2.6.33.1/arch/powerpc/platforms/52xx/mpc52xx_pm.c linux-2.6.33.1/arch/powerpc/platforms/52xx/mpc52xx_pm.c +--- linux-2.6.33.1/arch/powerpc/platforms/52xx/mpc52xx_pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/platforms/52xx/mpc52xx_pm.c 2010-03-20 16:58:38.776769493 -0400 +@@ -180,7 +180,7 @@ void mpc52xx_pm_finish(void) + iounmap(mbar); + } + +-static struct platform_suspend_ops mpc52xx_pm_ops = { ++static const struct platform_suspend_ops mpc52xx_pm_ops = { + .valid = mpc52xx_pm_valid, + .prepare = mpc52xx_pm_prepare, + .enter = mpc52xx_pm_enter, +diff -urNp linux-2.6.33.1/arch/powerpc/platforms/83xx/suspend.c 
linux-2.6.33.1/arch/powerpc/platforms/83xx/suspend.c +--- linux-2.6.33.1/arch/powerpc/platforms/83xx/suspend.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/platforms/83xx/suspend.c 2010-03-20 16:58:38.776769493 -0400 +@@ -311,7 +311,7 @@ static int mpc83xx_is_pci_agent(void) + return ret; + } + +-static struct platform_suspend_ops mpc83xx_suspend_ops = { ++static const struct platform_suspend_ops mpc83xx_suspend_ops = { + .valid = mpc83xx_suspend_valid, + .begin = mpc83xx_suspend_begin, + .enter = mpc83xx_suspend_enter, +diff -urNp linux-2.6.33.1/arch/powerpc/platforms/cell/iommu.c linux-2.6.33.1/arch/powerpc/platforms/cell/iommu.c +--- linux-2.6.33.1/arch/powerpc/platforms/cell/iommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/platforms/cell/iommu.c 2010-03-20 16:58:38.776769493 -0400 +@@ -642,7 +642,7 @@ static int dma_fixed_dma_supported(struc + + static int dma_set_mask_and_switch(struct device *dev, u64 dma_mask); + +-struct dma_map_ops dma_iommu_fixed_ops = { ++const struct dma_map_ops dma_iommu_fixed_ops = { + .alloc_coherent = dma_fixed_alloc_coherent, + .free_coherent = dma_fixed_free_coherent, + .map_sg = dma_fixed_map_sg, +diff -urNp linux-2.6.33.1/arch/powerpc/platforms/ps3/system-bus.c linux-2.6.33.1/arch/powerpc/platforms/ps3/system-bus.c +--- linux-2.6.33.1/arch/powerpc/platforms/ps3/system-bus.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/platforms/ps3/system-bus.c 2010-03-20 16:58:38.776769493 -0400 +@@ -694,7 +694,7 @@ static int ps3_dma_supported(struct devi + return mask >= DMA_BIT_MASK(32); + } + +-static struct dma_map_ops ps3_sb_dma_ops = { ++static const struct dma_map_ops ps3_sb_dma_ops = { + .alloc_coherent = ps3_alloc_coherent, + .free_coherent = ps3_free_coherent, + .map_sg = ps3_sb_map_sg, +@@ -704,7 +704,7 @@ static struct dma_map_ops ps3_sb_dma_ops + .unmap_page = ps3_unmap_page, + }; + +-static struct dma_map_ops ps3_ioc0_dma_ops = { ++static const struct dma_map_ops ps3_ioc0_dma_ops = { + .alloc_coherent = ps3_alloc_coherent, + .free_coherent = ps3_free_coherent, + .map_sg = ps3_ioc0_map_sg, +diff -urNp linux-2.6.33.1/arch/powerpc/sysdev/fsl_pmc.c linux-2.6.33.1/arch/powerpc/sysdev/fsl_pmc.c +--- linux-2.6.33.1/arch/powerpc/sysdev/fsl_pmc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/powerpc/sysdev/fsl_pmc.c 2010-03-20 16:58:38.784767469 -0400 +@@ -53,7 +53,7 @@ static int pmc_suspend_valid(suspend_sta + return 1; + } + +-static struct platform_suspend_ops pmc_suspend_ops = { ++static const struct platform_suspend_ops pmc_suspend_ops = { + .valid = pmc_suspend_valid, + .enter = pmc_suspend_enter, + }; +diff -urNp linux-2.6.33.1/arch/s390/include/asm/elf.h linux-2.6.33.1/arch/s390/include/asm/elf.h +--- linux-2.6.33.1/arch/s390/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/s390/include/asm/elf.h 2010-03-20 16:58:38.792763717 -0400 +@@ -163,6 +163,13 @@ extern unsigned int vdso_enabled; + that it will "exec", and that there is sufficient room for the brk. */ + #define ELF_ET_DYN_BASE (STACK_TOP / 3 * 2) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_31BIT) ? 0x10000UL : 0x80000000UL) ++ ++#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_31BIT) ? 15 : 26 ) ++#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_31BIT) ? 15 : 26 ) ++#endif ++ + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. 
*/ + +diff -urNp linux-2.6.33.1/arch/s390/include/asm/uaccess.h linux-2.6.33.1/arch/s390/include/asm/uaccess.h +--- linux-2.6.33.1/arch/s390/include/asm/uaccess.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/s390/include/asm/uaccess.h 2010-03-20 16:58:38.800777709 -0400 +@@ -234,6 +234,10 @@ static inline unsigned long __must_check + copy_to_user(void __user *to, const void *from, unsigned long n) + { + might_fault(); ++ ++ if ((long)n < 0) ++ return n; ++ + if (access_ok(VERIFY_WRITE, to, n)) + n = __copy_to_user(to, from, n); + return n; +@@ -259,6 +263,9 @@ copy_to_user(void __user *to, const void + static inline unsigned long __must_check + __copy_from_user(void *to, const void __user *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + if (__builtin_constant_p(n) && (n <= 256)) + return uaccess.copy_from_user_small(n, from, to); + else +@@ -285,6 +292,10 @@ static inline unsigned long __must_check + copy_from_user(void *to, const void __user *from, unsigned long n) + { + might_fault(); ++ ++ if ((long)n < 0) ++ return n; ++ + if (access_ok(VERIFY_READ, from, n)) + n = __copy_from_user(to, from, n); + else +diff -urNp linux-2.6.33.1/arch/s390/Kconfig linux-2.6.33.1/arch/s390/Kconfig +--- linux-2.6.33.1/arch/s390/Kconfig 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/s390/Kconfig 2010-03-20 16:58:38.812766228 -0400 +@@ -222,13 +222,12 @@ config AUDIT_ARCH + + config S390_EXEC_PROTECT + bool "Data execute protection" ++ default y + help + This option allows to enable a buffer overflow protection for user +- space programs and it also selects the addressing mode option above. +- The kernel parameter noexec=on will enable this feature and also +- switch the addressing modes, default is disabled. Enabling this (via +- kernel parameter) on machines earlier than IBM System z9-109 EC/BC +- will reduce system performance. ++ space programs. ++ Enabling this on machines earlier than IBM System z9-109 EC/BC will ++ reduce system performance. + + comment "Code generation options" + +diff -urNp linux-2.6.33.1/arch/s390/kernel/module.c linux-2.6.33.1/arch/s390/kernel/module.c +--- linux-2.6.33.1/arch/s390/kernel/module.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/s390/kernel/module.c 2010-03-20 16:58:38.824764313 -0400 +@@ -166,11 +166,11 @@ module_frob_arch_sections(Elf_Ehdr *hdr, + + /* Increase core size by size of got & plt and set start + offsets for got and plt. 
*/ +- me->core_size = ALIGN(me->core_size, 4); +- me->arch.got_offset = me->core_size; +- me->core_size += me->arch.got_size; +- me->arch.plt_offset = me->core_size; +- me->core_size += me->arch.plt_size; ++ me->core_size_rw = ALIGN(me->core_size_rw, 4); ++ me->arch.got_offset = me->core_size_rw; ++ me->core_size_rw += me->arch.got_size; ++ me->arch.plt_offset = me->core_size_rx; ++ me->core_size_rx += me->arch.plt_size; + return 0; + } + +@@ -256,7 +256,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base + if (info->got_initialized == 0) { + Elf_Addr *gotent; + +- gotent = me->module_core + me->arch.got_offset + ++ gotent = me->module_core_rw + me->arch.got_offset + + info->got_offset; + *gotent = val; + info->got_initialized = 1; +@@ -280,7 +280,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base + else if (r_type == R_390_GOTENT || + r_type == R_390_GOTPLTENT) + *(unsigned int *) loc = +- (val + (Elf_Addr) me->module_core - loc) >> 1; ++ (val + (Elf_Addr) me->module_core_rw - loc) >> 1; + else if (r_type == R_390_GOT64 || + r_type == R_390_GOTPLT64) + *(unsigned long *) loc = val; +@@ -294,7 +294,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base + case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ + if (info->plt_initialized == 0) { + unsigned int *ip; +- ip = me->module_core + me->arch.plt_offset + ++ ip = me->module_core_rx + me->arch.plt_offset + + info->plt_offset; + #ifndef CONFIG_64BIT + ip[0] = 0x0d105810; /* basr 1,0; l 1,6(1); br 1 */ +@@ -319,7 +319,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base + val - loc + 0xffffUL < 0x1ffffeUL) || + (r_type == R_390_PLT32DBL && + val - loc + 0xffffffffULL < 0x1fffffffeULL))) +- val = (Elf_Addr) me->module_core + ++ val = (Elf_Addr) me->module_core_rx + + me->arch.plt_offset + + info->plt_offset; + val += rela->r_addend - loc; +@@ -341,7 +341,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base + case R_390_GOTOFF32: /* 32 bit offset to GOT. */ + case R_390_GOTOFF64: /* 64 bit offset to GOT. */ + val = val + rela->r_addend - +- ((Elf_Addr) me->module_core + me->arch.got_offset); ++ ((Elf_Addr) me->module_core_rw + me->arch.got_offset); + if (r_type == R_390_GOTOFF16) + *(unsigned short *) loc = val; + else if (r_type == R_390_GOTOFF32) +@@ -351,7 +351,7 @@ apply_rela(Elf_Rela *rela, Elf_Addr base + break; + case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ + case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */ +- val = (Elf_Addr) me->module_core + me->arch.got_offset + ++ val = (Elf_Addr) me->module_core_rw + me->arch.got_offset + + rela->r_addend - loc; + if (r_type == R_390_GOTPC) + *(unsigned int *) loc = val; +diff -urNp linux-2.6.33.1/arch/s390/kernel/setup.c linux-2.6.33.1/arch/s390/kernel/setup.c +--- linux-2.6.33.1/arch/s390/kernel/setup.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/s390/kernel/setup.c 2010-03-20 16:58:38.824764313 -0400 +@@ -298,7 +298,7 @@ static int __init early_parse_mem(char * + } + early_param("mem", early_parse_mem); + +-unsigned int user_mode = HOME_SPACE_MODE; ++unsigned int user_mode = SECONDARY_SPACE_MODE; + EXPORT_SYMBOL_GPL(user_mode); + + static int set_amode_and_uaccess(unsigned long user_amode, +@@ -327,17 +327,6 @@ static int set_amode_and_uaccess(unsigne + } + } + +-/* +- * Switch kernel/user addressing modes? 
+- */ +-static int __init early_parse_switch_amode(char *p) +-{ +- if (user_mode != SECONDARY_SPACE_MODE) +- user_mode = PRIMARY_SPACE_MODE; +- return 0; +-} +-early_param("switch_amode", early_parse_switch_amode); +- + static int __init early_parse_user_mode(char *p) + { + if (p && strcmp(p, "primary") == 0) +@@ -354,20 +343,6 @@ static int __init early_parse_user_mode( + } + early_param("user_mode", early_parse_user_mode); + +-#ifdef CONFIG_S390_EXEC_PROTECT +-/* +- * Enable execute protection? +- */ +-static int __init early_parse_noexec(char *p) +-{ +- if (!strncmp(p, "off", 3)) +- return 0; +- user_mode = SECONDARY_SPACE_MODE; +- return 0; +-} +-early_param("noexec", early_parse_noexec); +-#endif /* CONFIG_S390_EXEC_PROTECT */ +- + static void setup_addressing_mode(void) + { + if (user_mode == SECONDARY_SPACE_MODE) { +diff -urNp linux-2.6.33.1/arch/s390/mm/maccess.c linux-2.6.33.1/arch/s390/mm/maccess.c +--- linux-2.6.33.1/arch/s390/mm/maccess.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/s390/mm/maccess.c 2010-03-20 16:58:38.828581240 -0400 +@@ -45,7 +45,7 @@ static long probe_kernel_write_odd(void + return rc ? rc : count; + } + +-long probe_kernel_write(void *dst, void *src, size_t size) ++long probe_kernel_write(void *dst, const void *src, size_t size) + { + long copied = 0; + +diff -urNp linux-2.6.33.1/arch/s390/mm/mmap.c linux-2.6.33.1/arch/s390/mm/mmap.c +--- linux-2.6.33.1/arch/s390/mm/mmap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/s390/mm/mmap.c 2010-03-20 16:58:38.828581240 -0400 +@@ -78,10 +78,22 @@ void arch_pick_mmap_layout(struct mm_str + */ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base -= mm->delta_mmap + mm->delta_stack; ++#endif ++ + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +@@ -153,10 +165,22 @@ void arch_pick_mmap_layout(struct mm_str + */ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ + mm->get_unmapped_area = s390_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base -= mm->delta_mmap + mm->delta_stack; ++#endif ++ + mm->get_unmapped_area = s390_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +diff -urNp linux-2.6.33.1/arch/sh/boards/mach-hp6xx/pm.c linux-2.6.33.1/arch/sh/boards/mach-hp6xx/pm.c +--- linux-2.6.33.1/arch/sh/boards/mach-hp6xx/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sh/boards/mach-hp6xx/pm.c 2010-03-20 16:58:38.840778404 -0400 +@@ -143,7 +143,7 @@ static int hp6x0_pm_enter(suspend_state_ + return 0; + } + +-static struct platform_suspend_ops hp6x0_pm_ops = { ++static const struct platform_suspend_ops hp6x0_pm_ops = { + .enter = hp6x0_pm_enter, + .valid = suspend_valid_only_mem, + }; +diff -urNp linux-2.6.33.1/arch/sh/include/asm/dma-mapping.h linux-2.6.33.1/arch/sh/include/asm/dma-mapping.h +--- linux-2.6.33.1/arch/sh/include/asm/dma-mapping.h 2010-03-15 12:09:39.000000000 -0400 ++++ 
linux-2.6.33.1/arch/sh/include/asm/dma-mapping.h 2010-03-20 16:58:38.844777279 -0400 +@@ -1,10 +1,10 @@ + #ifndef __ASM_SH_DMA_MAPPING_H + #define __ASM_SH_DMA_MAPPING_H + +-extern struct dma_map_ops *dma_ops; ++extern const struct dma_map_ops *dma_ops; + extern void no_iommu_init(void); + +-static inline struct dma_map_ops *get_dma_ops(struct device *dev) ++static inline const struct dma_map_ops *get_dma_ops(struct device *dev) + { + return dma_ops; + } +@@ -14,7 +14,7 @@ static inline struct dma_map_ops *get_dm + + static inline int dma_supported(struct device *dev, u64 mask) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->dma_supported) + return ops->dma_supported(dev, mask); +@@ -24,7 +24,7 @@ static inline int dma_supported(struct d + + static inline int dma_set_mask(struct device *dev, u64 mask) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; +@@ -59,7 +59,7 @@ static inline int dma_get_cache_alignmen + + static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); +@@ -70,7 +70,7 @@ static inline int dma_mapping_error(stru + static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + void *memory; + + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) +@@ -87,7 +87,7 @@ static inline void *dma_alloc_coherent(s + static inline void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + WARN_ON(irqs_disabled()); /* for portability */ + +diff -urNp linux-2.6.33.1/arch/sh/kernel/cpu/sh4/sq.c linux-2.6.33.1/arch/sh/kernel/cpu/sh4/sq.c +--- linux-2.6.33.1/arch/sh/kernel/cpu/sh4/sq.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sh/kernel/cpu/sh4/sq.c 2010-03-20 16:58:38.848768890 -0400 +@@ -327,7 +327,7 @@ static struct attribute *sq_sysfs_attrs[ + NULL, + }; + +-static struct sysfs_ops sq_sysfs_ops = { ++static const struct sysfs_ops sq_sysfs_ops = { + .show = sq_sysfs_show, + .store = sq_sysfs_store, + }; +diff -urNp linux-2.6.33.1/arch/sh/kernel/cpu/shmobile/pm.c linux-2.6.33.1/arch/sh/kernel/cpu/shmobile/pm.c +--- linux-2.6.33.1/arch/sh/kernel/cpu/shmobile/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sh/kernel/cpu/shmobile/pm.c 2010-03-20 16:58:38.852765354 -0400 +@@ -140,7 +140,7 @@ static int sh_pm_enter(suspend_state_t s + return 0; + } + +-static struct platform_suspend_ops sh_pm_ops = { ++static const struct platform_suspend_ops sh_pm_ops = { + .enter = sh_pm_enter, + .valid = suspend_valid_only_mem, + }; +diff -urNp linux-2.6.33.1/arch/sh/kernel/dma-nommu.c linux-2.6.33.1/arch/sh/kernel/dma-nommu.c +--- linux-2.6.33.1/arch/sh/kernel/dma-nommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sh/kernel/dma-nommu.c 2010-03-20 16:58:38.852765354 -0400 +@@ -62,7 +62,7 @@ static void nommu_sync_sg(struct device + } + #endif + +-struct dma_map_ops nommu_dma_ops = { ++const struct dma_map_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + 
.free_coherent = dma_generic_free_coherent, + .map_page = nommu_map_page, +diff -urNp linux-2.6.33.1/arch/sh/kernel/kgdb.c linux-2.6.33.1/arch/sh/kernel/kgdb.c +--- linux-2.6.33.1/arch/sh/kernel/kgdb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sh/kernel/kgdb.c 2010-03-20 16:58:38.856769610 -0400 +@@ -271,7 +271,7 @@ void kgdb_arch_exit(void) + { + } + +-struct kgdb_arch arch_kgdb_ops = { ++const struct kgdb_arch arch_kgdb_ops = { + /* Breakpoint instruction: trapa #0x3c */ + #ifdef CONFIG_CPU_LITTLE_ENDIAN + .gdb_bpt_instr = { 0x3c, 0xc3 }, +diff -urNp linux-2.6.33.1/arch/sh/mm/consistent.c linux-2.6.33.1/arch/sh/mm/consistent.c +--- linux-2.6.33.1/arch/sh/mm/consistent.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sh/mm/consistent.c 2010-03-20 16:58:38.860748595 -0400 +@@ -21,7 +21,7 @@ + + #define PREALLOC_DMA_DEBUG_ENTRIES 4096 + +-struct dma_map_ops *dma_ops; ++const struct dma_map_ops *dma_ops; + EXPORT_SYMBOL(dma_ops); + + static int __init dma_init(void) +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/atomic_64.h linux-2.6.33.1/arch/sparc/include/asm/atomic_64.h +--- linux-2.6.33.1/arch/sparc/include/asm/atomic_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/atomic_64.h 2010-03-20 16:58:38.860748595 -0400 +@@ -14,18 +14,38 @@ + #define ATOMIC64_INIT(i) { (i) } + + #define atomic_read(v) ((v)->counter) ++static inline int atomic_read_unchecked(const atomic_unchecked_t *v) ++{ ++ return v->counter; ++} + #define atomic64_read(v) ((v)->counter) ++static inline long atomic64_read_unchecked(const atomic64_unchecked_t *v) ++{ ++ return v->counter; ++} + + #define atomic_set(v, i) (((v)->counter) = i) ++static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i) ++{ ++ v->counter = i; ++} + #define atomic64_set(v, i) (((v)->counter) = i) ++static inline void atomic64_set_unchecked(atomic64_unchecked_t *v, long i) ++{ ++ v->counter = i; ++} + + extern void atomic_add(int, atomic_t *); ++extern void atomic_add_unchecked(int, atomic_unchecked_t *); + extern void atomic64_add(int, atomic64_t *); ++extern void atomic64_add_unchecked(int, atomic64_unchecked_t *); + extern void atomic_sub(int, atomic_t *); ++extern void atomic_sub_unchecked(int, atomic_unchecked_t *); + extern void atomic64_sub(int, atomic64_t *); + + extern int atomic_add_ret(int, atomic_t *); + extern int atomic64_add_ret(int, atomic64_t *); ++extern int atomic64_add_ret_unchecked(int, atomic64_unchecked_t *); + extern int atomic_sub_ret(int, atomic_t *); + extern int atomic64_sub_ret(int, atomic64_t *); + +@@ -34,6 +54,7 @@ extern int atomic64_sub_ret(int, atomic6 + + #define atomic_inc_return(v) atomic_add_ret(1, v) + #define atomic64_inc_return(v) atomic64_add_ret(1, v) ++#define atomic64_inc_return_unchecked(v) atomic64_add_ret_unchecked(1, v) + + #define atomic_sub_return(i, v) atomic_sub_ret(i, v) + #define atomic64_sub_return(i, v) atomic64_sub_ret(i, v) +@@ -59,7 +80,15 @@ extern int atomic64_sub_ret(int, atomic6 + #define atomic64_dec_and_test(v) (atomic64_sub_ret(1, v) == 0) + + #define atomic_inc(v) atomic_add(1, v) ++static inline void atomic_inc_unchecked(atomic_unchecked_t *v) ++{ ++ atomic_add_unchecked(1, v); ++} + #define atomic64_inc(v) atomic64_add(1, v) ++static inline void atomic64_inc_unchecked(atomic64_unchecked_t *v) ++{ ++ atomic64_add_unchecked(1, v); ++} + + #define atomic_dec(v) atomic_sub(1, v) + #define atomic64_dec(v) atomic64_sub(1, v) +@@ -72,17 +101,28 @@ extern int atomic64_sub_ret(int, atomic6 + + static 
inline int atomic_add_unless(atomic_t *v, int a, int u) + { +- int c, old; ++ int c, old, new; + c = atomic_read(v); + for (;;) { +- if (unlikely(c == (u))) ++ if (unlikely(c == u)) + break; +- old = atomic_cmpxchg((v), c, c + (a)); ++ ++ asm volatile("addcc %2, %0, %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "tvs %%icc, 6\n" ++#endif ++ ++ : "=r" (new) ++ : "0" (c), "ir" (a) ++ : "cc"); ++ ++ old = atomic_cmpxchg(v, c, new); + if (likely(old == c)) + break; + c = old; + } +- return c != (u); ++ return c != u; + } + + #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) +@@ -93,17 +133,28 @@ static inline int atomic_add_unless(atom + + static inline int atomic64_add_unless(atomic64_t *v, long a, long u) + { +- long c, old; ++ long c, old, new; + c = atomic64_read(v); + for (;;) { +- if (unlikely(c == (u))) ++ if (unlikely(c == u)) + break; +- old = atomic64_cmpxchg((v), c, c + (a)); ++ ++ asm volatile("addcc %2, %0, %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "tvs %%xcc, 6\n" ++#endif ++ ++ : "=r" (new) ++ : "0" (c), "ir" (a) ++ : "cc"); ++ ++ old = atomic64_cmpxchg(v, c, new); + if (likely(old == c)) + break; + c = old; + } +- return c != (u); ++ return c != u; + } + + #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/dma-mapping.h linux-2.6.33.1/arch/sparc/include/asm/dma-mapping.h +--- linux-2.6.33.1/arch/sparc/include/asm/dma-mapping.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/dma-mapping.h 2010-03-20 16:58:38.881211488 -0400 +@@ -14,10 +14,10 @@ extern int dma_set_mask(struct device *d + #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) + #define dma_is_consistent(d, h) (1) + +-extern struct dma_map_ops *dma_ops, pci32_dma_ops; ++extern const struct dma_map_ops *dma_ops, pci32_dma_ops; + extern struct bus_type pci_bus_type; + +-static inline struct dma_map_ops *get_dma_ops(struct device *dev) ++static inline const struct dma_map_ops *get_dma_ops(struct device *dev) + { + #if defined(CONFIG_SPARC32) && defined(CONFIG_PCI) + if (dev->bus == &pci_bus_type) +@@ -31,7 +31,7 @@ static inline struct dma_map_ops *get_dm + static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + void *cpu_addr; + + cpu_addr = ops->alloc_coherent(dev, size, dma_handle, flag); +@@ -42,7 +42,7 @@ static inline void *dma_alloc_coherent(s + static inline void dma_free_coherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); + ops->free_coherent(dev, size, cpu_addr, dma_handle); +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/elf_32.h linux-2.6.33.1/arch/sparc/include/asm/elf_32.h +--- linux-2.6.33.1/arch/sparc/include/asm/elf_32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/elf_32.h 2010-03-20 16:58:38.884769570 -0400 +@@ -114,6 +114,13 @@ typedef struct { + + #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE) + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE 0x10000UL ++ ++#define PAX_DELTA_MMAP_LEN 16 ++#define PAX_DELTA_STACK_LEN 16 ++#endif ++ + /* This yields a mask that user programs can use to figure out what + instruction set this cpu supports. This can NOT be done in userspace + on Sparc. 
*/ +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/elf_64.h linux-2.6.33.1/arch/sparc/include/asm/elf_64.h +--- linux-2.6.33.1/arch/sparc/include/asm/elf_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/elf_64.h 2010-03-20 16:58:38.892779813 -0400 +@@ -162,6 +162,12 @@ typedef struct { + #define ELF_ET_DYN_BASE 0x0000010000000000UL + #define COMPAT_ELF_ET_DYN_BASE 0x0000000070000000UL + ++#ifdef CONFIG_PAX_ASLR ++#define PAX_ELF_ET_DYN_BASE (test_thread_flag(TIF_32BIT) ? 0x10000UL : 0x100000UL) ++ ++#define PAX_DELTA_MMAP_LEN (test_thread_flag(TIF_32BIT) ? 14 : 28) ++#define PAX_DELTA_STACK_LEN (test_thread_flag(TIF_32BIT) ? 15 : 29) ++#endif + + /* This yields a mask that user programs can use to figure out what + instruction set this cpu supports. */ +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/pgtable_32.h linux-2.6.33.1/arch/sparc/include/asm/pgtable_32.h +--- linux-2.6.33.1/arch/sparc/include/asm/pgtable_32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/pgtable_32.h 2010-03-20 16:58:38.892779813 -0400 +@@ -43,6 +43,13 @@ BTFIXUPDEF_SIMM13(user_ptrs_per_pgd) + BTFIXUPDEF_INT(page_none) + BTFIXUPDEF_INT(page_copy) + BTFIXUPDEF_INT(page_readonly) ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++BTFIXUPDEF_INT(page_shared_noexec) ++BTFIXUPDEF_INT(page_copy_noexec) ++BTFIXUPDEF_INT(page_readonly_noexec) ++#endif ++ + BTFIXUPDEF_INT(page_kernel) + + #define PMD_SHIFT SUN4C_PMD_SHIFT +@@ -64,6 +71,16 @@ extern pgprot_t PAGE_SHARED; + #define PAGE_COPY __pgprot(BTFIXUP_INT(page_copy)) + #define PAGE_READONLY __pgprot(BTFIXUP_INT(page_readonly)) + ++#ifdef CONFIG_PAX_PAGEEXEC ++extern pgprot_t PAGE_SHARED_NOEXEC; ++# define PAGE_COPY_NOEXEC __pgprot(BTFIXUP_INT(page_copy_noexec)) ++# define PAGE_READONLY_NOEXEC __pgprot(BTFIXUP_INT(page_readonly_noexec)) ++#else ++# define PAGE_SHARED_NOEXEC PAGE_SHARED ++# define PAGE_COPY_NOEXEC PAGE_COPY ++# define PAGE_READONLY_NOEXEC PAGE_READONLY ++#endif ++ + extern unsigned long page_kernel; + + #ifdef MODULE +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/pgtsrmmu.h linux-2.6.33.1/arch/sparc/include/asm/pgtsrmmu.h +--- linux-2.6.33.1/arch/sparc/include/asm/pgtsrmmu.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/pgtsrmmu.h 2010-03-20 16:58:38.917006269 -0400 +@@ -115,6 +115,13 @@ + SRMMU_EXEC | SRMMU_REF) + #define SRMMU_PAGE_RDONLY __pgprot(SRMMU_VALID | SRMMU_CACHE | \ + SRMMU_EXEC | SRMMU_REF) ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++#define SRMMU_PAGE_SHARED_NOEXEC __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_WRITE | SRMMU_REF) ++#define SRMMU_PAGE_COPY_NOEXEC __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_REF) ++#define SRMMU_PAGE_RDONLY_NOEXEC __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_REF) ++#endif ++ + #define SRMMU_PAGE_KERNEL __pgprot(SRMMU_VALID | SRMMU_CACHE | SRMMU_PRIV | \ + SRMMU_DIRTY | SRMMU_REF) + +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/spinlock_64.h linux-2.6.33.1/arch/sparc/include/asm/spinlock_64.h +--- linux-2.6.33.1/arch/sparc/include/asm/spinlock_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/spinlock_64.h 2010-03-20 16:58:38.917006269 -0400 +@@ -99,7 +99,12 @@ static void inline arch_read_lock(arch_r + __asm__ __volatile__ ( + "1: ldsw [%2], %0\n" + " brlz,pn %0, 2f\n" +-"4: add %0, 1, %1\n" ++"4: addcc %0, 1, %1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++" tvs %%icc, 6\n" ++#endif ++ + " cas [%2], %0, %1\n" + " cmp %0, %1\n" + " bne,pn %%icc, 1b\n" +@@ -112,7 +117,7 @@ static void inline 
arch_read_lock(arch_r + " .previous" + : "=&r" (tmp1), "=&r" (tmp2) + : "r" (lock) +- : "memory"); ++ : "memory", "cc"); + } + + static int inline arch_read_trylock(arch_rwlock_t *lock) +@@ -123,7 +128,12 @@ static int inline arch_read_trylock(arch + "1: ldsw [%2], %0\n" + " brlz,a,pn %0, 2f\n" + " mov 0, %0\n" +-" add %0, 1, %1\n" ++" addcc %0, 1, %1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++" tvs %%icc, 6\n" ++#endif ++ + " cas [%2], %0, %1\n" + " cmp %0, %1\n" + " bne,pn %%icc, 1b\n" +@@ -142,7 +152,12 @@ static void inline arch_read_unlock(arch + + __asm__ __volatile__( + "1: lduw [%2], %0\n" +-" sub %0, 1, %1\n" ++" subcc %0, 1, %1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++" tvs %%icc, 6\n" ++#endif ++ + " cas [%2], %0, %1\n" + " cmp %0, %1\n" + " bne,pn %%xcc, 1b\n" +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/uaccess_32.h linux-2.6.33.1/arch/sparc/include/asm/uaccess_32.h +--- linux-2.6.33.1/arch/sparc/include/asm/uaccess_32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/uaccess_32.h 2010-03-20 16:58:38.917006269 -0400 +@@ -249,14 +249,25 @@ extern unsigned long __copy_user(void __ + + static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) + { +- if (n && __access_ok((unsigned long) to, n)) ++ if ((long)n < 0) ++ return n; ++ ++ if (n && __access_ok((unsigned long) to, n)) { ++ if (!__builtin_constant_p(n)) ++ check_object_size(from, n, true); + return __copy_user(to, (__force void __user *) from, n); +- else ++ } else + return n; + } + + static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ ++ if (!__builtin_constant_p(n)) ++ check_object_size(from, n, true); ++ + return __copy_user(to, (__force void __user *) from, n); + } + +@@ -272,19 +283,27 @@ static inline unsigned long copy_from_us + { + int sz = __compiletime_object_size(to); + ++ if ((long)n < 0) ++ return n; ++ + if (unlikely(sz != -1 && sz < n)) { + copy_from_user_overflow(); + return n; + } + +- if (n && __access_ok((unsigned long) from, n)) ++ if (n && __access_ok((unsigned long) from, n)) { ++ if (!__builtin_constant_p(n)) ++ check_object_size(to, n, false); + return __copy_user((__force void __user *) to, from, n); +- else ++ } else + return n; + } + + static inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + return __copy_user((__force void __user *) to, from, n); + } + +diff -urNp linux-2.6.33.1/arch/sparc/include/asm/uaccess_64.h linux-2.6.33.1/arch/sparc/include/asm/uaccess_64.h +--- linux-2.6.33.1/arch/sparc/include/asm/uaccess_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/include/asm/uaccess_64.h 2010-03-20 16:58:38.924784570 -0400 +@@ -10,6 +10,7 @@ + #include <linux/compiler.h> + #include <linux/string.h> + #include <linux/thread_info.h> ++#include <linux/kernel.h> + #include <asm/asi.h> + #include <asm/system.h> + #include <asm/spitfire.h> +@@ -204,6 +205,7 @@ __asm__ __volatile__( \ + : "=r" (x) : "r" (__m(addr)), "i" (retval)) + + extern int __get_user_bad(void); ++extern void check_object_size(const void *ptr, unsigned long n, bool to); + + extern void copy_from_user_overflow(void) + #ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS +@@ -224,6 +226,12 @@ copy_from_user(void *to, const void __us + int sz = __compiletime_object_size(to); + unsigned long ret = size; + ++ if ((long)size < 0 || size > INT_MAX) ++ return size; ++ ++ if (!__builtin_constant_p(size)) 
++ check_object_size(to, size, false); ++ + if (likely(sz == -1 || sz >= size)) { + ret = ___copy_from_user(to, from, size); + if (unlikely(ret)) +@@ -243,8 +251,15 @@ extern unsigned long copy_to_user_fixup( + static inline unsigned long __must_check + copy_to_user(void __user *to, const void *from, unsigned long size) + { +- unsigned long ret = ___copy_to_user(to, from, size); ++ unsigned long ret; ++ ++ if ((long)size < 0 || size > INT_MAX) ++ return size; ++ ++ if (!__builtin_constant_p(size)) ++ check_object_size(from, size, true); + ++ ret = ___copy_to_user(to, from, size); + if (unlikely(ret)) + ret = copy_to_user_fixup(to, from, size); + return ret; +diff -urNp linux-2.6.33.1/arch/sparc/kernel/iommu.c linux-2.6.33.1/arch/sparc/kernel/iommu.c +--- linux-2.6.33.1/arch/sparc/kernel/iommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/iommu.c 2010-03-20 16:58:38.928788728 -0400 +@@ -827,7 +827,7 @@ static void dma_4u_sync_sg_for_cpu(struc + spin_unlock_irqrestore(&iommu->lock, flags); + } + +-static struct dma_map_ops sun4u_dma_ops = { ++static const struct dma_map_ops sun4u_dma_ops = { + .alloc_coherent = dma_4u_alloc_coherent, + .free_coherent = dma_4u_free_coherent, + .map_page = dma_4u_map_page, +@@ -838,7 +838,7 @@ static struct dma_map_ops sun4u_dma_ops + .sync_sg_for_cpu = dma_4u_sync_sg_for_cpu, + }; + +-struct dma_map_ops *dma_ops = &sun4u_dma_ops; ++const struct dma_map_ops *dma_ops = &sun4u_dma_ops; + EXPORT_SYMBOL(dma_ops); + + extern int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask); +diff -urNp linux-2.6.33.1/arch/sparc/kernel/ioport.c linux-2.6.33.1/arch/sparc/kernel/ioport.c +--- linux-2.6.33.1/arch/sparc/kernel/ioport.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/ioport.c 2010-03-20 16:58:38.940791948 -0400 +@@ -397,7 +397,7 @@ static void sbus_sync_sg_for_device(stru + BUG(); + } + +-struct dma_map_ops sbus_dma_ops = { ++const struct dma_map_ops sbus_dma_ops = { + .alloc_coherent = sbus_alloc_coherent, + .free_coherent = sbus_free_coherent, + .map_page = sbus_map_page, +@@ -408,7 +408,7 @@ struct dma_map_ops sbus_dma_ops = { + .sync_sg_for_device = sbus_sync_sg_for_device, + }; + +-struct dma_map_ops *dma_ops = &sbus_dma_ops; ++const struct dma_map_ops *dma_ops = &sbus_dma_ops; + EXPORT_SYMBOL(dma_ops); + + static int __init sparc_register_ioport(void) +@@ -645,7 +645,7 @@ static void pci32_sync_sg_for_device(str + } + } + +-struct dma_map_ops pci32_dma_ops = { ++const struct dma_map_ops pci32_dma_ops = { + .alloc_coherent = pci32_alloc_coherent, + .free_coherent = pci32_free_coherent, + .map_page = pci32_map_page, +diff -urNp linux-2.6.33.1/arch/sparc/kernel/kgdb_32.c linux-2.6.33.1/arch/sparc/kernel/kgdb_32.c +--- linux-2.6.33.1/arch/sparc/kernel/kgdb_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/kgdb_32.c 2010-03-20 16:58:38.940791948 -0400 +@@ -158,7 +158,7 @@ void kgdb_arch_exit(void) + { + } + +-struct kgdb_arch arch_kgdb_ops = { ++const struct kgdb_arch arch_kgdb_ops = { + /* Breakpoint instruction: ta 0x7d */ + .gdb_bpt_instr = { 0x91, 0xd0, 0x20, 0x7d }, + }; +diff -urNp linux-2.6.33.1/arch/sparc/kernel/kgdb_64.c linux-2.6.33.1/arch/sparc/kernel/kgdb_64.c +--- linux-2.6.33.1/arch/sparc/kernel/kgdb_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/kgdb_64.c 2010-03-20 16:58:38.940791948 -0400 +@@ -180,7 +180,7 @@ void kgdb_arch_exit(void) + { + } + +-struct kgdb_arch arch_kgdb_ops = { ++const struct kgdb_arch arch_kgdb_ops = { + 
/* Breakpoint instruction: ta 0x72 */ + .gdb_bpt_instr = { 0x91, 0xd0, 0x20, 0x72 }, + }; +diff -urNp linux-2.6.33.1/arch/sparc/kernel/Makefile linux-2.6.33.1/arch/sparc/kernel/Makefile +--- linux-2.6.33.1/arch/sparc/kernel/Makefile 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/Makefile 2010-03-20 16:58:38.944776638 -0400 +@@ -3,7 +3,7 @@ + # + + asflags-y := -ansi +-ccflags-y := -Werror ++#ccflags-y := -Werror + + extra-y := head_$(BITS).o + extra-y += init_task.o +diff -urNp linux-2.6.33.1/arch/sparc/kernel/pci_sun4v.c linux-2.6.33.1/arch/sparc/kernel/pci_sun4v.c +--- linux-2.6.33.1/arch/sparc/kernel/pci_sun4v.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/pci_sun4v.c 2010-03-20 16:58:38.944776638 -0400 +@@ -525,7 +525,7 @@ static void dma_4v_unmap_sg(struct devic + spin_unlock_irqrestore(&iommu->lock, flags); + } + +-static struct dma_map_ops sun4v_dma_ops = { ++static const struct dma_map_ops sun4v_dma_ops = { + .alloc_coherent = dma_4v_alloc_coherent, + .free_coherent = dma_4v_free_coherent, + .map_page = dma_4v_map_page, +diff -urNp linux-2.6.33.1/arch/sparc/kernel/sys_sparc_32.c linux-2.6.33.1/arch/sparc/kernel/sys_sparc_32.c +--- linux-2.6.33.1/arch/sparc/kernel/sys_sparc_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/sys_sparc_32.c 2010-03-20 16:58:38.944776638 -0400 +@@ -57,7 +57,7 @@ unsigned long arch_get_unmapped_area(str + if (ARCH_SUN4C && len > 0x20000000) + return -ENOMEM; + if (!addr) +- addr = TASK_UNMAPPED_BASE; ++ addr = current->mm->mmap_base; + + if (flags & MAP_SHARED) + addr = COLOUR_ALIGN(addr); +diff -urNp linux-2.6.33.1/arch/sparc/kernel/sys_sparc_64.c linux-2.6.33.1/arch/sparc/kernel/sys_sparc_64.c +--- linux-2.6.33.1/arch/sparc/kernel/sys_sparc_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/sys_sparc_64.c 2010-03-20 16:58:38.948763345 -0400 +@@ -125,7 +125,7 @@ unsigned long arch_get_unmapped_area(str + /* We do not accept a shared mapping if it would violate + * cache aliasing constraints. + */ +- if ((flags & MAP_SHARED) && ++ if ((filp || (flags & MAP_SHARED)) && + ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) + return -EINVAL; + return addr; +@@ -140,6 +140,10 @@ unsigned long arch_get_unmapped_area(str + if (filp || (flags & MAP_SHARED)) + do_color_align = 1; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + if (addr) { + if (do_color_align) + addr = COLOUR_ALIGN(addr, pgoff); +@@ -153,9 +157,9 @@ unsigned long arch_get_unmapped_area(str + } + + if (len > mm->cached_hole_size) { +- start_addr = addr = mm->free_area_cache; ++ start_addr = addr = mm->free_area_cache; + } else { +- start_addr = addr = TASK_UNMAPPED_BASE; ++ start_addr = addr = mm->mmap_base; + mm->cached_hole_size = 0; + } + +@@ -175,8 +179,8 @@ full_search: + vma = find_vma(mm, VA_EXCLUDE_END); + } + if (unlikely(task_size < addr)) { +- if (start_addr != TASK_UNMAPPED_BASE) { +- start_addr = addr = TASK_UNMAPPED_BASE; ++ if (start_addr != mm->mmap_base) { ++ start_addr = addr = mm->mmap_base; + mm->cached_hole_size = 0; + goto full_search; + } +@@ -216,7 +220,7 @@ arch_get_unmapped_area_topdown(struct fi + /* We do not accept a shared mapping if it would violate + * cache aliasing constraints. 
+ */ +- if ((flags & MAP_SHARED) && ++ if ((filp || (flags & MAP_SHARED)) && + ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) + return -EINVAL; + return addr; +@@ -386,6 +390,12 @@ void arch_pick_mmap_layout(struct mm_str + gap == RLIM_INFINITY || + sysctl_legacy_va_layout) { + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { +@@ -398,6 +408,12 @@ void arch_pick_mmap_layout(struct mm_str + gap = (task_size / 6 * 5); + + mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base -= mm->delta_mmap + mm->delta_stack; ++#endif ++ + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +diff -urNp linux-2.6.33.1/arch/sparc/kernel/traps_64.c linux-2.6.33.1/arch/sparc/kernel/traps_64.c +--- linux-2.6.33.1/arch/sparc/kernel/traps_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/kernel/traps_64.c 2010-03-20 16:58:38.948763345 -0400 +@@ -93,6 +93,12 @@ void bad_trap(struct pt_regs *regs, long + + lvl -= 0x100; + if (regs->tstate & TSTATE_PRIV) { ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ if (lvl == 6) ++ pax_report_refcount_overflow(regs); ++#endif ++ + sprintf(buffer, "Kernel bad sw trap %lx", lvl); + die_if_kernel(buffer, regs); + } +@@ -111,11 +117,16 @@ void bad_trap(struct pt_regs *regs, long + void bad_trap_tl1(struct pt_regs *regs, long lvl) + { + char buffer[32]; +- ++ + if (notify_die(DIE_TRAP_TL1, "bad trap tl1", regs, + 0, lvl, SIGTRAP) == NOTIFY_STOP) + return; + ++#ifdef CONFIG_PAX_REFCOUNT ++ if (lvl == 6) ++ pax_report_refcount_overflow(regs); ++#endif ++ + dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); + + sprintf (buffer, "Bad trap %lx at tl>0", lvl); +diff -urNp linux-2.6.33.1/arch/sparc/lib/atomic_64.S linux-2.6.33.1/arch/sparc/lib/atomic_64.S +--- linux-2.6.33.1/arch/sparc/lib/atomic_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/lib/atomic_64.S 2010-03-20 16:58:38.952780321 -0400 +@@ -18,7 +18,12 @@ + atomic_add: /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: lduw [%o1], %g1 +- add %g1, %o0, %g7 ++ addcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 2f +@@ -28,12 +33,32 @@ atomic_add: /* %o0 = increment, %o1 = at + 2: BACKOFF_SPIN(%o2, %o3, 1b) + .size atomic_add, .-atomic_add + ++ .globl atomic_add_unchecked ++ .type atomic_add_unchecked,#function ++atomic_add_unchecked: /* %o0 = increment, %o1 = atomic_ptr */ ++ BACKOFF_SETUP(%o2) ++1: lduw [%o1], %g1 ++ add %g1, %o0, %g7 ++ cas [%o1], %g1, %g7 ++ cmp %g1, %g7 ++ bne,pn %icc, 2f ++ nop ++ retl ++ nop ++2: BACKOFF_SPIN(%o2, %o3, 1b) ++ .size atomic_add_unchecked, .-atomic_add_unchecked ++ + .globl atomic_sub + .type atomic_sub,#function + atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: lduw [%o1], %g1 +- sub %g1, %o0, %g7 ++ subcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 2f +@@ -43,12 +68,32 @@ atomic_sub: /* %o0 = decrement, %o1 = at + 2: BACKOFF_SPIN(%o2, %o3, 1b) + .size atomic_sub, .-atomic_sub + ++ .globl atomic_sub_unchecked ++ .type atomic_sub_unchecked,#function ++atomic_sub_unchecked: /* %o0 = decrement, %o1 = 
atomic_ptr */ ++ BACKOFF_SETUP(%o2) ++1: lduw [%o1], %g1 ++ sub %g1, %o0, %g7 ++ cas [%o1], %g1, %g7 ++ cmp %g1, %g7 ++ bne,pn %icc, 2f ++ nop ++ retl ++ nop ++2: BACKOFF_SPIN(%o2, %o3, 1b) ++ .size atomic_sub_unchecked, .-atomic_sub_unchecked ++ + .globl atomic_add_ret + .type atomic_add_ret,#function + atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: lduw [%o1], %g1 +- add %g1, %o0, %g7 ++ addcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 2f +@@ -64,7 +109,12 @@ atomic_add_ret: /* %o0 = increment, %o1 + atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: lduw [%o1], %g1 +- sub %g1, %o0, %g7 ++ subcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 2f +@@ -80,7 +130,12 @@ atomic_sub_ret: /* %o0 = decrement, %o1 + atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: ldx [%o1], %g1 +- add %g1, %o0, %g7 ++ addcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %xcc, 6 ++#endif ++ + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, 2f +@@ -90,12 +145,32 @@ atomic64_add: /* %o0 = increment, %o1 = + 2: BACKOFF_SPIN(%o2, %o3, 1b) + .size atomic64_add, .-atomic64_add + ++ .globl atomic64_add_unchecked ++ .type atomic64_add_unchecked,#function ++atomic64_add_unchecked: /* %o0 = increment, %o1 = atomic_ptr */ ++ BACKOFF_SETUP(%o2) ++1: ldx [%o1], %g1 ++ addcc %g1, %o0, %g7 ++ casx [%o1], %g1, %g7 ++ cmp %g1, %g7 ++ bne,pn %xcc, 2f ++ nop ++ retl ++ nop ++2: BACKOFF_SPIN(%o2, %o3, 1b) ++ .size atomic64_add_unchecked, .-atomic64_add_unchecked ++ + .globl atomic64_sub + .type atomic64_sub,#function + atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: ldx [%o1], %g1 +- sub %g1, %o0, %g7 ++ subcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %xcc, 6 ++#endif ++ + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, 2f +@@ -110,7 +185,12 @@ atomic64_sub: /* %o0 = decrement, %o1 = + atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: ldx [%o1], %g1 +- add %g1, %o0, %g7 ++ addcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %xcc, 6 ++#endif ++ + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, 2f +@@ -121,12 +201,33 @@ atomic64_add_ret: /* %o0 = increment, %o + 2: BACKOFF_SPIN(%o2, %o3, 1b) + .size atomic64_add_ret, .-atomic64_add_ret + ++ .globl atomic64_add_ret_unchecked ++ .type atomic64_add_ret_unchecked,#function ++atomic64_add_ret_unchecked: /* %o0 = increment, %o1 = atomic_ptr */ ++ BACKOFF_SETUP(%o2) ++1: ldx [%o1], %g1 ++ addcc %g1, %o0, %g7 ++ casx [%o1], %g1, %g7 ++ cmp %g1, %g7 ++ bne,pn %xcc, 2f ++ add %g7, %o0, %g7 ++ mov %g7, %o0 ++ retl ++ nop ++2: BACKOFF_SPIN(%o2, %o3, 1b) ++ .size atomic64_add_ret_unchecked, .-atomic64_add_ret_unchecked ++ + .globl atomic64_sub_ret + .type atomic64_sub_ret,#function + atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) + 1: ldx [%o1], %g1 +- sub %g1, %o0, %g7 ++ subcc %g1, %o0, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %xcc, 6 ++#endif ++ + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, 2f +diff -urNp linux-2.6.33.1/arch/sparc/lib/ksyms.c linux-2.6.33.1/arch/sparc/lib/ksyms.c +--- linux-2.6.33.1/arch/sparc/lib/ksyms.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/lib/ksyms.c 2010-03-20 16:58:38.956579787 -0400 +@@ -142,8 +142,10 @@ EXPORT_SYMBOL(__downgrade_write); + + /* Atomic 
counter implementation. */ + EXPORT_SYMBOL(atomic_add); ++EXPORT_SYMBOL(atomic_add_unchecked); + EXPORT_SYMBOL(atomic_add_ret); + EXPORT_SYMBOL(atomic_sub); ++EXPORT_SYMBOL(atomic_sub_unchecked); + EXPORT_SYMBOL(atomic_sub_ret); + EXPORT_SYMBOL(atomic64_add); + EXPORT_SYMBOL(atomic64_add_ret); +diff -urNp linux-2.6.33.1/arch/sparc/lib/rwsem_64.S linux-2.6.33.1/arch/sparc/lib/rwsem_64.S +--- linux-2.6.33.1/arch/sparc/lib/rwsem_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/lib/rwsem_64.S 2010-03-20 16:58:38.956579787 -0400 +@@ -11,7 +11,12 @@ + .globl __down_read + __down_read: + 1: lduw [%o0], %g1 +- add %g1, 1, %g7 ++ addcc %g1, 1, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o0], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 1b +@@ -33,7 +38,12 @@ __down_read: + .globl __down_read_trylock + __down_read_trylock: + 1: lduw [%o0], %g1 +- add %g1, 1, %g7 ++ addcc %g1, 1, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cmp %g7, 0 + bl,pn %icc, 2f + mov 0, %o1 +@@ -51,7 +61,12 @@ __down_write: + or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 + 1: + lduw [%o0], %g3 +- add %g3, %g1, %g7 ++ addcc %g3, %g1, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b +@@ -77,7 +92,12 @@ __down_write_trylock: + cmp %g3, 0 + bne,pn %icc, 2f + mov 0, %o1 +- add %g3, %g1, %g7 ++ addcc %g3, %g1, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b +@@ -90,7 +110,12 @@ __down_write_trylock: + __up_read: + 1: + lduw [%o0], %g1 +- sub %g1, 1, %g7 ++ subcc %g1, 1, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o0], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, 1b +@@ -118,7 +143,12 @@ __up_write: + or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 + 1: + lduw [%o0], %g3 +- sub %g3, %g1, %g7 ++ subcc %g3, %g1, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b +@@ -143,7 +173,12 @@ __downgrade_write: + or %g1, %lo(RWSEM_WAITING_BIAS), %g1 + 1: + lduw [%o0], %g3 +- sub %g3, %g1, %g7 ++ subcc %g3, %g1, %g7 ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ tvs %icc, 6 ++#endif ++ + cas [%o0], %g3, %g7 + cmp %g3, %g7 + bne,pn %icc, 1b +diff -urNp linux-2.6.33.1/arch/sparc/Makefile linux-2.6.33.1/arch/sparc/Makefile +--- linux-2.6.33.1/arch/sparc/Makefile 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/Makefile 2010-03-20 16:58:38.956579787 -0400 +@@ -75,7 +75,7 @@ drivers-$(CONFIG_OPROFILE) += arch/sparc + # Export what is needed by arch/sparc/boot/Makefile + export VMLINUX_INIT VMLINUX_MAIN + VMLINUX_INIT := $(head-y) $(init-y) +-VMLINUX_MAIN := $(core-y) kernel/ mm/ fs/ ipc/ security/ crypto/ block/ ++VMLINUX_MAIN := $(core-y) kernel/ mm/ fs/ ipc/ security/ crypto/ block/ grsecurity/ + VMLINUX_MAIN += $(patsubst %/, %/lib.a, $(libs-y)) $(libs-y) + VMLINUX_MAIN += $(drivers-y) $(net-y) + +diff -urNp linux-2.6.33.1/arch/sparc/mm/fault_32.c linux-2.6.33.1/arch/sparc/mm/fault_32.c +--- linux-2.6.33.1/arch/sparc/mm/fault_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/mm/fault_32.c 2010-03-20 16:58:38.956579787 -0400 +@@ -22,6 +22,9 @@ + #include <linux/interrupt.h> + #include <linux/module.h> + #include <linux/kdebug.h> ++#include <linux/slab.h> ++#include <linux/pagemap.h> ++#include <linux/compiler.h> + + #include <asm/system.h> + #include <asm/page.h> +@@ -168,6 +171,267 @@ static unsigned long compute_si_addr(str + return 
safe_compute_effective_address(regs, insn); + } + ++#ifdef CONFIG_PAX_PAGEEXEC ++#ifdef CONFIG_PAX_DLRESOLVE ++static void pax_emuplt_close(struct vm_area_struct *vma) ++{ ++ vma->vm_mm->call_dl_resolve = 0UL; ++} ++ ++static int pax_emuplt_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ unsigned int *kaddr; ++ ++ vmf->page = alloc_page(GFP_HIGHUSER); ++ if (!vmf->page) ++ return VM_FAULT_OOM; ++ ++ kaddr = kmap(vmf->page); ++ memset(kaddr, 0, PAGE_SIZE); ++ kaddr[0] = 0x9DE3BFA8U; /* save */ ++ flush_dcache_page(vmf->page); ++ kunmap(vmf->page); ++ return VM_FAULT_MAJOR; ++} ++ ++static const struct vm_operations_struct pax_vm_ops = { ++ .close = pax_emuplt_close, ++ .fault = pax_emuplt_fault ++}; ++ ++static int pax_insert_vma(struct vm_area_struct *vma, unsigned long addr) ++{ ++ int ret; ++ ++ vma->vm_mm = current->mm; ++ vma->vm_start = addr; ++ vma->vm_end = addr + PAGE_SIZE; ++ vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; ++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); ++ vma->vm_ops = &pax_vm_ops; ++ ++ ret = insert_vm_struct(current->mm, vma); ++ if (ret) ++ return ret; ++ ++ ++current->mm->total_vm; ++ return 0; ++} ++#endif ++ ++/* ++ * PaX: decide what to do with offenders (regs->pc = fault address) ++ * ++ * returns 1 when task should be killed ++ * 2 when patched PLT trampoline was detected ++ * 3 when unpatched PLT trampoline was detected ++ */ ++static int pax_handle_fetch_fault(struct pt_regs *regs) ++{ ++ ++#ifdef CONFIG_PAX_EMUPLT ++ int err; ++ ++ do { /* PaX: patched PLT emulation #1 */ ++ unsigned int sethi1, sethi2, jmpl; ++ ++ err = get_user(sethi1, (unsigned int *)regs->pc); ++ err |= get_user(sethi2, (unsigned int *)(regs->pc+4)); ++ err |= get_user(jmpl, (unsigned int *)(regs->pc+8)); ++ ++ if (err) ++ break; ++ ++ if ((sethi1 & 0xFFC00000U) == 0x03000000U && ++ (sethi2 & 0xFFC00000U) == 0x03000000U && ++ (jmpl & 0xFFFFE000U) == 0x81C06000U) ++ { ++ unsigned int addr; ++ ++ regs->u_regs[UREG_G1] = (sethi2 & 0x003FFFFFU) << 10; ++ addr = regs->u_regs[UREG_G1]; ++ addr += (((jmpl | 0xFFFFE000U) ^ 0x00001000U) + 0x00001000U); ++ regs->pc = addr; ++ regs->npc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++ { /* PaX: patched PLT emulation #2 */ ++ unsigned int ba; ++ ++ err = get_user(ba, (unsigned int *)regs->pc); ++ ++ if (!err && (ba & 0xFFC00000U) == 0x30800000U) { ++ unsigned int addr; ++ ++ addr = regs->pc + ((((ba | 0xFFC00000U) ^ 0x00200000U) + 0x00200000U) << 2); ++ regs->pc = addr; ++ regs->npc = addr+4; ++ return 2; ++ } ++ } ++ ++ do { /* PaX: patched PLT emulation #3 */ ++ unsigned int sethi, jmpl, nop; ++ ++ err = get_user(sethi, (unsigned int *)regs->pc); ++ err |= get_user(jmpl, (unsigned int *)(regs->pc+4)); ++ err |= get_user(nop, (unsigned int *)(regs->pc+8)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ (jmpl & 0xFFFFE000U) == 0x81C06000U && ++ nop == 0x01000000U) ++ { ++ unsigned int addr; ++ ++ addr = (sethi & 0x003FFFFFU) << 10; ++ regs->u_regs[UREG_G1] = addr; ++ addr += (((jmpl | 0xFFFFE000U) ^ 0x00001000U) + 0x00001000U); ++ regs->pc = addr; ++ regs->npc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++ do { /* PaX: unpatched PLT emulation step 1 */ ++ unsigned int sethi, ba, nop; ++ ++ err = get_user(sethi, (unsigned int *)regs->pc); ++ err |= get_user(ba, (unsigned int *)(regs->pc+4)); ++ err |= get_user(nop, (unsigned int *)(regs->pc+8)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ ((ba & 0xFFC00000U) == 0x30800000U || (ba & 
0xFFF80000U) == 0x30680000U) && ++ nop == 0x01000000U) ++ { ++ unsigned int addr, save, call; ++ ++ if ((ba & 0xFFC00000U) == 0x30800000U) ++ addr = regs->pc + 4 + ((((ba | 0xFFC00000U) ^ 0x00200000U) + 0x00200000U) << 2); ++ else ++ addr = regs->pc + 4 + ((((ba | 0xFFF80000U) ^ 0x00040000U) + 0x00040000U) << 2); ++ ++ err = get_user(save, (unsigned int *)addr); ++ err |= get_user(call, (unsigned int *)(addr+4)); ++ err |= get_user(nop, (unsigned int *)(addr+8)); ++ if (err) ++ break; ++ ++#ifdef CONFIG_PAX_DLRESOLVE ++ if (save == 0x9DE3BFA8U && ++ (call & 0xC0000000U) == 0x40000000U && ++ nop == 0x01000000U) ++ { ++ struct vm_area_struct *vma; ++ unsigned long call_dl_resolve; ++ ++ down_read(&current->mm->mmap_sem); ++ call_dl_resolve = current->mm->call_dl_resolve; ++ up_read(&current->mm->mmap_sem); ++ if (likely(call_dl_resolve)) ++ goto emulate; ++ ++ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ ++ down_write(&current->mm->mmap_sem); ++ if (current->mm->call_dl_resolve) { ++ call_dl_resolve = current->mm->call_dl_resolve; ++ up_write(&current->mm->mmap_sem); ++ if (vma) ++ kmem_cache_free(vm_area_cachep, vma); ++ goto emulate; ++ } ++ ++ call_dl_resolve = get_unmapped_area(NULL, 0UL, PAGE_SIZE, 0UL, MAP_PRIVATE); ++ if (!vma || (call_dl_resolve & ~PAGE_MASK)) { ++ up_write(&current->mm->mmap_sem); ++ if (vma) ++ kmem_cache_free(vm_area_cachep, vma); ++ return 1; ++ } ++ ++ if (pax_insert_vma(vma, call_dl_resolve)) { ++ up_write(&current->mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, vma); ++ return 1; ++ } ++ ++ current->mm->call_dl_resolve = call_dl_resolve; ++ up_write(&current->mm->mmap_sem); ++ ++emulate: ++ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; ++ regs->pc = call_dl_resolve; ++ regs->npc = addr+4; ++ return 3; ++ } ++#endif ++ ++ /* PaX: glibc 2.4+ generates sethi/jmpl instead of save/call */ ++ if ((save & 0xFFC00000U) == 0x05000000U && ++ (call & 0xFFFFE000U) == 0x85C0A000U && ++ nop == 0x01000000U) ++ { ++ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; ++ regs->u_regs[UREG_G2] = addr + 4; ++ addr = (save & 0x003FFFFFU) << 10; ++ addr += (((call | 0xFFFFE000U) ^ 0x00001000U) + 0x00001000U); ++ regs->pc = addr; ++ regs->npc = addr+4; ++ return 3; ++ } ++ } ++ } while (0); ++ ++ do { /* PaX: unpatched PLT emulation step 2 */ ++ unsigned int save, call, nop; ++ ++ err = get_user(save, (unsigned int *)(regs->pc-4)); ++ err |= get_user(call, (unsigned int *)regs->pc); ++ err |= get_user(nop, (unsigned int *)(regs->pc+4)); ++ if (err) ++ break; ++ ++ if (save == 0x9DE3BFA8U && ++ (call & 0xC0000000U) == 0x40000000U && ++ nop == 0x01000000U) ++ { ++ unsigned int dl_resolve = regs->pc + ((((call | 0xC0000000U) ^ 0x20000000U) + 0x20000000U) << 2); ++ ++ regs->u_regs[UREG_RETPC] = regs->pc; ++ regs->pc = dl_resolve; ++ regs->npc = dl_resolve+4; ++ return 3; ++ } ++ } while (0); ++#endif ++ ++ return 1; ++} ++ ++void pax_report_insns(void *pc, void *sp) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 8; i++) { ++ unsigned int c; ++ if (get_user(c, (unsigned int *)pc+i)) ++ printk(KERN_CONT "???????? 
"); ++ else ++ printk(KERN_CONT "%08x ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, + unsigned long address) + { +@@ -234,6 +498,24 @@ good_area: + if(!(vma->vm_flags & VM_WRITE)) + goto bad_area; + } else { ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && text_fault && !(vma->vm_flags & VM_EXEC)) { ++ up_read(&mm->mmap_sem); ++ switch (pax_handle_fetch_fault(regs)) { ++ ++#ifdef CONFIG_PAX_EMUPLT ++ case 2: ++ case 3: ++ return; ++#endif ++ ++ } ++ pax_report_fault(regs, (void *)regs->pc, (void *)regs->u_regs[UREG_FP]); ++ do_group_exit(SIGKILL); ++ } ++#endif ++ + /* Allow reads even for write-only mappings */ + if(!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; +diff -urNp linux-2.6.33.1/arch/sparc/mm/fault_64.c linux-2.6.33.1/arch/sparc/mm/fault_64.c +--- linux-2.6.33.1/arch/sparc/mm/fault_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/mm/fault_64.c 2010-03-20 16:58:38.956579787 -0400 +@@ -21,6 +21,9 @@ + #include <linux/kprobes.h> + #include <linux/kdebug.h> + #include <linux/percpu.h> ++#include <linux/slab.h> ++#include <linux/pagemap.h> ++#include <linux/compiler.h> + + #include <asm/page.h> + #include <asm/pgtable.h> +@@ -244,6 +247,456 @@ static void noinline __kprobes bogus_32b + show_regs(regs); + } + ++#ifdef CONFIG_PAX_PAGEEXEC ++#ifdef CONFIG_PAX_DLRESOLVE ++static void pax_emuplt_close(struct vm_area_struct *vma) ++{ ++ vma->vm_mm->call_dl_resolve = 0UL; ++} ++ ++static int pax_emuplt_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ unsigned int *kaddr; ++ ++ vmf->page = alloc_page(GFP_HIGHUSER); ++ if (!vmf->page) ++ return VM_FAULT_OOM; ++ ++ kaddr = kmap(vmf->page); ++ memset(kaddr, 0, PAGE_SIZE); ++ kaddr[0] = 0x9DE3BFA8U; /* save */ ++ flush_dcache_page(vmf->page); ++ kunmap(vmf->page); ++ return VM_FAULT_MAJOR; ++} ++ ++static const struct vm_operations_struct pax_vm_ops = { ++ .close = pax_emuplt_close, ++ .fault = pax_emuplt_fault ++}; ++ ++static int pax_insert_vma(struct vm_area_struct *vma, unsigned long addr) ++{ ++ int ret; ++ ++ vma->vm_mm = current->mm; ++ vma->vm_start = addr; ++ vma->vm_end = addr + PAGE_SIZE; ++ vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; ++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); ++ vma->vm_ops = &pax_vm_ops; ++ ++ ret = insert_vm_struct(current->mm, vma); ++ if (ret) ++ return ret; ++ ++ ++current->mm->total_vm; ++ return 0; ++} ++#endif ++ ++/* ++ * PaX: decide what to do with offenders (regs->tpc = fault address) ++ * ++ * returns 1 when task should be killed ++ * 2 when patched PLT trampoline was detected ++ * 3 when unpatched PLT trampoline was detected ++ */ ++static int pax_handle_fetch_fault(struct pt_regs *regs) ++{ ++ ++#ifdef CONFIG_PAX_EMUPLT ++ int err; ++ ++ do { /* PaX: patched PLT emulation #1 */ ++ unsigned int sethi1, sethi2, jmpl; ++ ++ err = get_user(sethi1, (unsigned int *)regs->tpc); ++ err |= get_user(sethi2, (unsigned int *)(regs->tpc+4)); ++ err |= get_user(jmpl, (unsigned int *)(regs->tpc+8)); ++ ++ if (err) ++ break; ++ ++ if ((sethi1 & 0xFFC00000U) == 0x03000000U && ++ (sethi2 & 0xFFC00000U) == 0x03000000U && ++ (jmpl & 0xFFFFE000U) == 0x81C06000U) ++ { ++ unsigned long addr; ++ ++ regs->u_regs[UREG_G1] = (sethi2 & 0x003FFFFFU) << 10; ++ addr = regs->u_regs[UREG_G1]; ++ addr += (((jmpl | 0xFFFFFFFFFFFFE000UL) ^ 0x00001000UL) + 0x00001000UL); ++ ++ if (test_thread_flag(TIF_32BIT)) ++ addr &= 0xFFFFFFFFUL; ++ ++ regs->tpc = addr; 
++ regs->tnpc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++ { /* PaX: patched PLT emulation #2 */ ++ unsigned int ba; ++ ++ err = get_user(ba, (unsigned int *)regs->tpc); ++ ++ if (!err && (ba & 0xFFC00000U) == 0x30800000U) { ++ unsigned long addr; ++ ++ addr = regs->tpc + ((((ba | 0xFFFFFFFFFFC00000UL) ^ 0x00200000UL) + 0x00200000UL) << 2); ++ ++ if (test_thread_flag(TIF_32BIT)) ++ addr &= 0xFFFFFFFFUL; ++ ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 2; ++ } ++ } ++ ++ do { /* PaX: patched PLT emulation #3 */ ++ unsigned int sethi, jmpl, nop; ++ ++ err = get_user(sethi, (unsigned int *)regs->tpc); ++ err |= get_user(jmpl, (unsigned int *)(regs->tpc+4)); ++ err |= get_user(nop, (unsigned int *)(regs->tpc+8)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ (jmpl & 0xFFFFE000U) == 0x81C06000U && ++ nop == 0x01000000U) ++ { ++ unsigned long addr; ++ ++ addr = (sethi & 0x003FFFFFU) << 10; ++ regs->u_regs[UREG_G1] = addr; ++ addr += (((jmpl | 0xFFFFFFFFFFFFE000UL) ^ 0x00001000UL) + 0x00001000UL); ++ ++ if (test_thread_flag(TIF_32BIT)) ++ addr &= 0xFFFFFFFFUL; ++ ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++ do { /* PaX: patched PLT emulation #4 */ ++ unsigned int sethi, mov1, call, mov2; ++ ++ err = get_user(sethi, (unsigned int *)regs->tpc); ++ err |= get_user(mov1, (unsigned int *)(regs->tpc+4)); ++ err |= get_user(call, (unsigned int *)(regs->tpc+8)); ++ err |= get_user(mov2, (unsigned int *)(regs->tpc+12)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ mov1 == 0x8210000FU && ++ (call & 0xC0000000U) == 0x40000000U && ++ mov2 == 0x9E100001U) ++ { ++ unsigned long addr; ++ ++ regs->u_regs[UREG_G1] = regs->u_regs[UREG_RETPC]; ++ addr = regs->tpc + 4 + ((((call | 0xFFFFFFFFC0000000UL) ^ 0x20000000UL) + 0x20000000UL) << 2); ++ ++ if (test_thread_flag(TIF_32BIT)) ++ addr &= 0xFFFFFFFFUL; ++ ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++ do { /* PaX: patched PLT emulation #5 */ ++ unsigned int sethi, sethi1, sethi2, or1, or2, sllx, jmpl, nop; ++ ++ err = get_user(sethi, (unsigned int *)regs->tpc); ++ err |= get_user(sethi1, (unsigned int *)(regs->tpc+4)); ++ err |= get_user(sethi2, (unsigned int *)(regs->tpc+8)); ++ err |= get_user(or1, (unsigned int *)(regs->tpc+12)); ++ err |= get_user(or2, (unsigned int *)(regs->tpc+16)); ++ err |= get_user(sllx, (unsigned int *)(regs->tpc+20)); ++ err |= get_user(jmpl, (unsigned int *)(regs->tpc+24)); ++ err |= get_user(nop, (unsigned int *)(regs->tpc+28)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ (sethi1 & 0xFFC00000U) == 0x03000000U && ++ (sethi2 & 0xFFC00000U) == 0x0B000000U && ++ (or1 & 0xFFFFE000U) == 0x82106000U && ++ (or2 & 0xFFFFE000U) == 0x8A116000U && ++ sllx == 0x83287020U && ++ jmpl == 0x81C04005U && ++ nop == 0x01000000U) ++ { ++ unsigned long addr; ++ ++ regs->u_regs[UREG_G1] = ((sethi1 & 0x003FFFFFU) << 10) | (or1 & 0x000003FFU); ++ regs->u_regs[UREG_G1] <<= 32; ++ regs->u_regs[UREG_G5] = ((sethi2 & 0x003FFFFFU) << 10) | (or2 & 0x000003FFU); ++ addr = regs->u_regs[UREG_G1] + regs->u_regs[UREG_G5]; ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++ do { /* PaX: patched PLT emulation #6 */ ++ unsigned int sethi, sethi1, sethi2, sllx, or, jmpl, nop; ++ ++ err = get_user(sethi, (unsigned int *)regs->tpc); ++ err |= get_user(sethi1, (unsigned int *)(regs->tpc+4)); ++ err |= get_user(sethi2, (unsigned int *)(regs->tpc+8)); ++ err 
|= get_user(sllx, (unsigned int *)(regs->tpc+12)); ++ err |= get_user(or, (unsigned int *)(regs->tpc+16)); ++ err |= get_user(jmpl, (unsigned int *)(regs->tpc+20)); ++ err |= get_user(nop, (unsigned int *)(regs->tpc+24)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ (sethi1 & 0xFFC00000U) == 0x03000000U && ++ (sethi2 & 0xFFC00000U) == 0x0B000000U && ++ sllx == 0x83287020U && ++ (or & 0xFFFFE000U) == 0x8A116000U && ++ jmpl == 0x81C04005U && ++ nop == 0x01000000U) ++ { ++ unsigned long addr; ++ ++ regs->u_regs[UREG_G1] = (sethi1 & 0x003FFFFFU) << 10; ++ regs->u_regs[UREG_G1] <<= 32; ++ regs->u_regs[UREG_G5] = ((sethi2 & 0x003FFFFFU) << 10) | (or & 0x3FFU); ++ addr = regs->u_regs[UREG_G1] + regs->u_regs[UREG_G5]; ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++ do { /* PaX: unpatched PLT emulation step 1 */ ++ unsigned int sethi, ba, nop; ++ ++ err = get_user(sethi, (unsigned int *)regs->tpc); ++ err |= get_user(ba, (unsigned int *)(regs->tpc+4)); ++ err |= get_user(nop, (unsigned int *)(regs->tpc+8)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ ((ba & 0xFFC00000U) == 0x30800000U || (ba & 0xFFF80000U) == 0x30680000U) && ++ nop == 0x01000000U) ++ { ++ unsigned long addr; ++ unsigned int save, call; ++ unsigned int sethi1, sethi2, or1, or2, sllx, add, jmpl; ++ ++ if ((ba & 0xFFC00000U) == 0x30800000U) ++ addr = regs->tpc + 4 + ((((ba | 0xFFFFFFFFFFC00000UL) ^ 0x00200000UL) + 0x00200000UL) << 2); ++ else ++ addr = regs->tpc + 4 + ((((ba | 0xFFFFFFFFFFF80000UL) ^ 0x00040000UL) + 0x00040000UL) << 2); ++ ++ if (test_thread_flag(TIF_32BIT)) ++ addr &= 0xFFFFFFFFUL; ++ ++ err = get_user(save, (unsigned int *)addr); ++ err |= get_user(call, (unsigned int *)(addr+4)); ++ err |= get_user(nop, (unsigned int *)(addr+8)); ++ if (err) ++ break; ++ ++#ifdef CONFIG_PAX_DLRESOLVE ++ if (save == 0x9DE3BFA8U && ++ (call & 0xC0000000U) == 0x40000000U && ++ nop == 0x01000000U) ++ { ++ struct vm_area_struct *vma; ++ unsigned long call_dl_resolve; ++ ++ down_read(&current->mm->mmap_sem); ++ call_dl_resolve = current->mm->call_dl_resolve; ++ up_read(&current->mm->mmap_sem); ++ if (likely(call_dl_resolve)) ++ goto emulate; ++ ++ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ ++ down_write(&current->mm->mmap_sem); ++ if (current->mm->call_dl_resolve) { ++ call_dl_resolve = current->mm->call_dl_resolve; ++ up_write(&current->mm->mmap_sem); ++ if (vma) ++ kmem_cache_free(vm_area_cachep, vma); ++ goto emulate; ++ } ++ ++ call_dl_resolve = get_unmapped_area(NULL, 0UL, PAGE_SIZE, 0UL, MAP_PRIVATE); ++ if (!vma || (call_dl_resolve & ~PAGE_MASK)) { ++ up_write(&current->mm->mmap_sem); ++ if (vma) ++ kmem_cache_free(vm_area_cachep, vma); ++ return 1; ++ } ++ ++ if (pax_insert_vma(vma, call_dl_resolve)) { ++ up_write(&current->mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, vma); ++ return 1; ++ } ++ ++ current->mm->call_dl_resolve = call_dl_resolve; ++ up_write(&current->mm->mmap_sem); ++ ++emulate: ++ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; ++ regs->tpc = call_dl_resolve; ++ regs->tnpc = addr+4; ++ return 3; ++ } ++#endif ++ ++ /* PaX: glibc 2.4+ generates sethi/jmpl instead of save/call */ ++ if ((save & 0xFFC00000U) == 0x05000000U && ++ (call & 0xFFFFE000U) == 0x85C0A000U && ++ nop == 0x01000000U) ++ { ++ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; ++ regs->u_regs[UREG_G2] = addr + 4; ++ addr = (save & 0x003FFFFFU) << 10; ++ addr += (((call | 0xFFFFFFFFFFFFE000UL) ^ 0x00001000UL) + 0x00001000UL); ++ ++ if 
(test_thread_flag(TIF_32BIT)) ++ addr &= 0xFFFFFFFFUL; ++ ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 3; ++ } ++ ++ /* PaX: 64-bit PLT stub */ ++ err = get_user(sethi1, (unsigned int *)addr); ++ err |= get_user(sethi2, (unsigned int *)(addr+4)); ++ err |= get_user(or1, (unsigned int *)(addr+8)); ++ err |= get_user(or2, (unsigned int *)(addr+12)); ++ err |= get_user(sllx, (unsigned int *)(addr+16)); ++ err |= get_user(add, (unsigned int *)(addr+20)); ++ err |= get_user(jmpl, (unsigned int *)(addr+24)); ++ err |= get_user(nop, (unsigned int *)(addr+28)); ++ if (err) ++ break; ++ ++ if ((sethi1 & 0xFFC00000U) == 0x09000000U && ++ (sethi2 & 0xFFC00000U) == 0x0B000000U && ++ (or1 & 0xFFFFE000U) == 0x88112000U && ++ (or2 & 0xFFFFE000U) == 0x8A116000U && ++ sllx == 0x89293020U && ++ add == 0x8A010005U && ++ jmpl == 0x89C14000U && ++ nop == 0x01000000U) ++ { ++ regs->u_regs[UREG_G1] = (sethi & 0x003FFFFFU) << 10; ++ regs->u_regs[UREG_G4] = ((sethi1 & 0x003FFFFFU) << 10) | (or1 & 0x000003FFU); ++ regs->u_regs[UREG_G4] <<= 32; ++ regs->u_regs[UREG_G5] = ((sethi2 & 0x003FFFFFU) << 10) | (or2 & 0x000003FFU); ++ regs->u_regs[UREG_G5] += regs->u_regs[UREG_G4]; ++ regs->u_regs[UREG_G4] = addr + 24; ++ addr = regs->u_regs[UREG_G5]; ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 3; ++ } ++ } ++ } while (0); ++ ++#ifdef CONFIG_PAX_DLRESOLVE ++ do { /* PaX: unpatched PLT emulation step 2 */ ++ unsigned int save, call, nop; ++ ++ err = get_user(save, (unsigned int *)(regs->tpc-4)); ++ err |= get_user(call, (unsigned int *)regs->tpc); ++ err |= get_user(nop, (unsigned int *)(regs->tpc+4)); ++ if (err) ++ break; ++ ++ if (save == 0x9DE3BFA8U && ++ (call & 0xC0000000U) == 0x40000000U && ++ nop == 0x01000000U) ++ { ++ unsigned long dl_resolve = regs->tpc + ((((call | 0xFFFFFFFFC0000000UL) ^ 0x20000000UL) + 0x20000000UL) << 2); ++ ++ if (test_thread_flag(TIF_32BIT)) ++ dl_resolve &= 0xFFFFFFFFUL; ++ ++ regs->u_regs[UREG_RETPC] = regs->tpc; ++ regs->tpc = dl_resolve; ++ regs->tnpc = dl_resolve+4; ++ return 3; ++ } ++ } while (0); ++#endif ++ ++ do { /* PaX: patched PLT emulation #7, must be AFTER the unpatched PLT emulation */ ++ unsigned int sethi, ba, nop; ++ ++ err = get_user(sethi, (unsigned int *)regs->tpc); ++ err |= get_user(ba, (unsigned int *)(regs->tpc+4)); ++ err |= get_user(nop, (unsigned int *)(regs->tpc+8)); ++ ++ if (err) ++ break; ++ ++ if ((sethi & 0xFFC00000U) == 0x03000000U && ++ (ba & 0xFFF00000U) == 0x30600000U && ++ nop == 0x01000000U) ++ { ++ unsigned long addr; ++ ++ addr = (sethi & 0x003FFFFFU) << 10; ++ regs->u_regs[UREG_G1] = addr; ++ addr = regs->tpc + ((((ba | 0xFFFFFFFFFFF80000UL) ^ 0x00040000UL) + 0x00040000UL) << 2); ++ ++ if (test_thread_flag(TIF_32BIT)) ++ addr &= 0xFFFFFFFFUL; ++ ++ regs->tpc = addr; ++ regs->tnpc = addr+4; ++ return 2; ++ } ++ } while (0); ++ ++#endif ++ ++ return 1; ++} ++ ++void pax_report_insns(void *pc, void *sp) ++{ ++ unsigned long i; ++ ++ printk(KERN_ERR "PAX: bytes at PC: "); ++ for (i = 0; i < 8; i++) { ++ unsigned int c; ++ if (get_user(c, (unsigned int *)pc+i)) ++ printk(KERN_CONT "???????? 
"); ++ else ++ printk(KERN_CONT "%08x ", c); ++ } ++ printk("\n"); ++} ++#endif ++ + asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) + { + struct mm_struct *mm = current->mm; +@@ -312,6 +765,29 @@ asmlinkage void __kprobes do_sparc64_fau + if (!vma) + goto bad_area; + ++#ifdef CONFIG_PAX_PAGEEXEC ++ /* PaX: detect ITLB misses on non-exec pages */ ++ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && vma->vm_start <= address && ++ !(vma->vm_flags & VM_EXEC) && (fault_code & FAULT_CODE_ITLB)) ++ { ++ if (address != regs->tpc) ++ goto good_area; ++ ++ up_read(&mm->mmap_sem); ++ switch (pax_handle_fetch_fault(regs)) { ++ ++#ifdef CONFIG_PAX_EMUPLT ++ case 2: ++ case 3: ++ return; ++#endif ++ ++ } ++ pax_report_fault(regs, (void *)regs->tpc, (void *)(regs->u_regs[UREG_FP] + STACK_BIAS)); ++ do_group_exit(SIGKILL); ++ } ++#endif ++ + /* Pure DTLB misses do not tell us whether the fault causing + * load/store/atomic was a write or not, it only says that there + * was no match. So in such a case we (carefully) read the +diff -urNp linux-2.6.33.1/arch/sparc/mm/init_32.c linux-2.6.33.1/arch/sparc/mm/init_32.c +--- linux-2.6.33.1/arch/sparc/mm/init_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/mm/init_32.c 2010-03-20 16:58:38.956579787 -0400 +@@ -317,6 +317,9 @@ extern void device_scan(void); + pgprot_t PAGE_SHARED __read_mostly; + EXPORT_SYMBOL(PAGE_SHARED); + ++pgprot_t PAGE_SHARED_NOEXEC __read_mostly; ++EXPORT_SYMBOL(PAGE_SHARED_NOEXEC); ++ + void __init paging_init(void) + { + switch(sparc_cpu_model) { +@@ -345,17 +348,17 @@ void __init paging_init(void) + + /* Initialize the protection map with non-constant, MMU dependent values. */ + protection_map[0] = PAGE_NONE; +- protection_map[1] = PAGE_READONLY; +- protection_map[2] = PAGE_COPY; +- protection_map[3] = PAGE_COPY; ++ protection_map[1] = PAGE_READONLY_NOEXEC; ++ protection_map[2] = PAGE_COPY_NOEXEC; ++ protection_map[3] = PAGE_COPY_NOEXEC; + protection_map[4] = PAGE_READONLY; + protection_map[5] = PAGE_READONLY; + protection_map[6] = PAGE_COPY; + protection_map[7] = PAGE_COPY; + protection_map[8] = PAGE_NONE; +- protection_map[9] = PAGE_READONLY; +- protection_map[10] = PAGE_SHARED; +- protection_map[11] = PAGE_SHARED; ++ protection_map[9] = PAGE_READONLY_NOEXEC; ++ protection_map[10] = PAGE_SHARED_NOEXEC; ++ protection_map[11] = PAGE_SHARED_NOEXEC; + protection_map[12] = PAGE_READONLY; + protection_map[13] = PAGE_READONLY; + protection_map[14] = PAGE_SHARED; +diff -urNp linux-2.6.33.1/arch/sparc/mm/Makefile linux-2.6.33.1/arch/sparc/mm/Makefile +--- linux-2.6.33.1/arch/sparc/mm/Makefile 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/mm/Makefile 2010-03-20 16:58:38.968781407 -0400 +@@ -2,7 +2,7 @@ + # + + asflags-y := -ansi +-ccflags-y := -Werror ++#ccflags-y := -Werror + + obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o + obj-y += fault_$(BITS).o +diff -urNp linux-2.6.33.1/arch/sparc/mm/srmmu.c linux-2.6.33.1/arch/sparc/mm/srmmu.c +--- linux-2.6.33.1/arch/sparc/mm/srmmu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/sparc/mm/srmmu.c 2010-03-20 16:58:38.968781407 -0400 +@@ -2198,6 +2198,13 @@ void __init ld_mmu_srmmu(void) + PAGE_SHARED = pgprot_val(SRMMU_PAGE_SHARED); + BTFIXUPSET_INT(page_copy, pgprot_val(SRMMU_PAGE_COPY)); + BTFIXUPSET_INT(page_readonly, pgprot_val(SRMMU_PAGE_RDONLY)); ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ PAGE_SHARED_NOEXEC = pgprot_val(SRMMU_PAGE_SHARED_NOEXEC); ++ BTFIXUPSET_INT(page_copy_noexec, pgprot_val(SRMMU_PAGE_COPY_NOEXEC)); ++ 
BTFIXUPSET_INT(page_readonly_noexec, pgprot_val(SRMMU_PAGE_RDONLY_NOEXEC)); ++#endif ++ + BTFIXUPSET_INT(page_kernel, pgprot_val(SRMMU_PAGE_KERNEL)); + page_kernel = pgprot_val(SRMMU_PAGE_KERNEL); + +diff -urNp linux-2.6.33.1/arch/um/include/asm/kmap_types.h linux-2.6.33.1/arch/um/include/asm/kmap_types.h +--- linux-2.6.33.1/arch/um/include/asm/kmap_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/um/include/asm/kmap_types.h 2010-03-20 16:58:38.968781407 -0400 +@@ -23,6 +23,7 @@ enum km_type { + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, ++ KM_CLEARPAGE, + KM_TYPE_NR + }; + +diff -urNp linux-2.6.33.1/arch/um/include/asm/page.h linux-2.6.33.1/arch/um/include/asm/page.h +--- linux-2.6.33.1/arch/um/include/asm/page.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/um/include/asm/page.h 2010-03-20 16:58:38.968781407 -0400 +@@ -14,6 +14,9 @@ + #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) + #define PAGE_MASK (~(PAGE_SIZE-1)) + ++#define ktla_ktva(addr) (addr) ++#define ktva_ktla(addr) (addr) ++ + #ifndef __ASSEMBLY__ + + struct page; +diff -urNp linux-2.6.33.1/arch/um/sys-i386/syscalls.c linux-2.6.33.1/arch/um/sys-i386/syscalls.c +--- linux-2.6.33.1/arch/um/sys-i386/syscalls.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/um/sys-i386/syscalls.c 2010-03-20 16:58:38.968781407 -0400 +@@ -11,6 +11,21 @@ + #include "asm/uaccess.h" + #include "asm/unistd.h" + ++int i386_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) ++{ ++ unsigned long pax_task_size = TASK_SIZE; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) ++ pax_task_size = SEGMEXEC_TASK_SIZE; ++#endif ++ ++ if (len > pax_task_size || addr > pax_task_size - len) ++ return -EINVAL; ++ ++ return 0; ++} ++ + /* + * Perform the select(nd, in, out, ex, tv) and mmap() system + * calls. 
Linux/i386 didn't use to be able to handle more than +diff -urNp linux-2.6.33.1/arch/x86/boot/bitops.h linux-2.6.33.1/arch/x86/boot/bitops.h +--- linux-2.6.33.1/arch/x86/boot/bitops.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/bitops.h 2010-03-20 16:58:38.968781407 -0400 +@@ -26,7 +26,7 @@ static inline int variable_test_bit(int + u8 v; + const u32 *p = (const u32 *)addr; + +- asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); ++ asm volatile("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); + return v; + } + +@@ -37,7 +37,7 @@ static inline int variable_test_bit(int + + static inline void set_bit(int nr, void *addr) + { +- asm("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr)); ++ asm volatile("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr)); + } + + #endif /* BOOT_BITOPS_H */ +diff -urNp linux-2.6.33.1/arch/x86/boot/boot.h linux-2.6.33.1/arch/x86/boot/boot.h +--- linux-2.6.33.1/arch/x86/boot/boot.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/boot.h 2010-03-20 16:58:38.972657826 -0400 +@@ -82,7 +82,7 @@ static inline void io_delay(void) + static inline u16 ds(void) + { + u16 seg; +- asm("movw %%ds,%0" : "=rm" (seg)); ++ asm volatile("movw %%ds,%0" : "=rm" (seg)); + return seg; + } + +@@ -178,7 +178,7 @@ static inline void wrgs32(u32 v, addr_t + static inline int memcmp(const void *s1, const void *s2, size_t len) + { + u8 diff; +- asm("repe; cmpsb; setnz %0" ++ asm volatile("repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; + } +diff -urNp linux-2.6.33.1/arch/x86/boot/compressed/head_32.S linux-2.6.33.1/arch/x86/boot/compressed/head_32.S +--- linux-2.6.33.1/arch/x86/boot/compressed/head_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/compressed/head_32.S 2010-03-20 16:58:38.972657826 -0400 +@@ -76,7 +76,7 @@ ENTRY(startup_32) + notl %eax + andl %eax, %ebx + #else +- movl $LOAD_PHYSICAL_ADDR, %ebx ++ movl $____LOAD_PHYSICAL_ADDR, %ebx + #endif + + /* Target address to relocate to for decompression */ +@@ -149,7 +149,7 @@ relocated: + * and where it was actually loaded. + */ + movl %ebp, %ebx +- subl $LOAD_PHYSICAL_ADDR, %ebx ++ subl $____LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ + /* + * Process relocations. 
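
Nearly all of the arch/x86/boot changes in this stretch just add "volatile" to extended asm. The reason is worth spelling out: GCC may delete an asm statement whose outputs are never used, or move it, which is unsafe for instructions with side effects (cpuid, rdmsr/wrmsr, control-register reads). A minimal sketch, assuming GCC extended asm on x86:

    /* Without volatile, a caller that ignores the return value lets the
     * compiler discard the instruction entirely; volatile pins it. */
    static inline unsigned short current_ds(void)
    {
        unsigned short seg;
        asm volatile("movw %%ds,%0" : "=rm" (seg));
        return seg;
    }

For the rdmsr/wrmsr pairs in cpucheck.c the ordering guarantee matters as much as non-deletion: volatile asm statements are not reordered relative to one another, so the write cannot drift past the read.
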
+@@ -157,8 +157,7 @@ relocated: + + 1: subl $4, %edi + movl (%edi), %ecx +- testl %ecx, %ecx +- jz 2f ++ jecxz 2f + addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) + jmp 1b + 2: +diff -urNp linux-2.6.33.1/arch/x86/boot/compressed/head_64.S linux-2.6.33.1/arch/x86/boot/compressed/head_64.S +--- linux-2.6.33.1/arch/x86/boot/compressed/head_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/compressed/head_64.S 2010-03-20 16:58:38.972657826 -0400 +@@ -91,7 +91,7 @@ ENTRY(startup_32) + notl %eax + andl %eax, %ebx + #else +- movl $LOAD_PHYSICAL_ADDR, %ebx ++ movl $____LOAD_PHYSICAL_ADDR, %ebx + #endif + + /* Target address to relocate to for decompression */ +@@ -233,7 +233,7 @@ ENTRY(startup_64) + notq %rax + andq %rax, %rbp + #else +- movq $LOAD_PHYSICAL_ADDR, %rbp ++ movq $____LOAD_PHYSICAL_ADDR, %rbp + #endif + + /* Target address to relocate to for decompression */ +diff -urNp linux-2.6.33.1/arch/x86/boot/compressed/misc.c linux-2.6.33.1/arch/x86/boot/compressed/misc.c +--- linux-2.6.33.1/arch/x86/boot/compressed/misc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/compressed/misc.c 2010-03-20 16:58:38.972657826 -0400 +@@ -292,7 +292,7 @@ static void parse_elf(void *output) + case PT_LOAD: + #ifdef CONFIG_RELOCATABLE + dest = output; +- dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR); ++ dest += (phdr->p_paddr - ____LOAD_PHYSICAL_ADDR); + #else + dest = (void *)(phdr->p_paddr); + #endif +@@ -339,7 +339,7 @@ asmlinkage void decompress_kernel(void * + error("Destination address too large"); + #endif + #ifndef CONFIG_RELOCATABLE +- if ((unsigned long)output != LOAD_PHYSICAL_ADDR) ++ if ((unsigned long)output != ____LOAD_PHYSICAL_ADDR) + error("Wrong destination address"); + #endif + +diff -urNp linux-2.6.33.1/arch/x86/boot/compressed/mkpiggy.c linux-2.6.33.1/arch/x86/boot/compressed/mkpiggy.c +--- linux-2.6.33.1/arch/x86/boot/compressed/mkpiggy.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/compressed/mkpiggy.c 2010-03-20 16:58:38.972657826 -0400 +@@ -74,7 +74,7 @@ int main(int argc, char *argv[]) + + offs = (olen > ilen) ? 
olen - ilen : 0; + offs += olen >> 12; /* Add 8 bytes for each 32K block */ +- offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ ++ offs += 64*1024; /* Add 64K bytes slack */ + offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + + printf(".section ".rodata.compressed","a",@progbits\n"); +diff -urNp linux-2.6.33.1/arch/x86/boot/compressed/relocs.c linux-2.6.33.1/arch/x86/boot/compressed/relocs.c +--- linux-2.6.33.1/arch/x86/boot/compressed/relocs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/compressed/relocs.c 2010-03-20 16:58:38.972657826 -0400 +@@ -13,8 +13,11 @@ + + static void die(char *fmt, ...); + ++#include "../../../../include/generated/autoconf.h" ++ + #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + static Elf32_Ehdr ehdr; ++static Elf32_Phdr *phdr; + static unsigned long reloc_count, reloc_idx; + static unsigned long *relocs; + +@@ -270,9 +273,39 @@ static void read_ehdr(FILE *fp) + } + } + ++static void read_phdrs(FILE *fp) ++{ ++ unsigned int i; ++ ++ phdr = calloc(ehdr.e_phnum, sizeof(Elf32_Phdr)); ++ if (!phdr) { ++ die("Unable to allocate %d program headers\n", ++ ehdr.e_phnum); ++ } ++ if (fseek(fp, ehdr.e_phoff, SEEK_SET) < 0) { ++ die("Seek to %d failed: %s\n", ++ ehdr.e_phoff, strerror(errno)); ++ } ++ if (fread(phdr, sizeof(*phdr), ehdr.e_phnum, fp) != ehdr.e_phnum) { ++ die("Cannot read ELF program headers: %s\n", ++ strerror(errno)); ++ } ++ for(i = 0; i < ehdr.e_phnum; i++) { ++ phdr[i].p_type = elf32_to_cpu(phdr[i].p_type); ++ phdr[i].p_offset = elf32_to_cpu(phdr[i].p_offset); ++ phdr[i].p_vaddr = elf32_to_cpu(phdr[i].p_vaddr); ++ phdr[i].p_paddr = elf32_to_cpu(phdr[i].p_paddr); ++ phdr[i].p_filesz = elf32_to_cpu(phdr[i].p_filesz); ++ phdr[i].p_memsz = elf32_to_cpu(phdr[i].p_memsz); ++ phdr[i].p_flags = elf32_to_cpu(phdr[i].p_flags); ++ phdr[i].p_align = elf32_to_cpu(phdr[i].p_align); ++ } ++ ++} ++ + static void read_shdrs(FILE *fp) + { +- int i; ++ unsigned int i; + Elf32_Shdr shdr; + + secs = calloc(ehdr.e_shnum, sizeof(struct section)); +@@ -307,7 +340,7 @@ static void read_shdrs(FILE *fp) + + static void read_strtabs(FILE *fp) + { +- int i; ++ unsigned int i; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_STRTAB) { +@@ -332,7 +365,7 @@ static void read_strtabs(FILE *fp) + + static void read_symtabs(FILE *fp) + { +- int i,j; ++ unsigned int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_SYMTAB) { +@@ -365,7 +398,9 @@ static void read_symtabs(FILE *fp) + + static void read_relocs(FILE *fp) + { +- int i,j; ++ unsigned int i,j; ++ uint32_t base; ++ + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_REL) { +@@ -385,9 +420,18 @@ static void read_relocs(FILE *fp) + die("Cannot read symbol table: %s\n", + strerror(errno)); + } ++ base = 0; ++ for (j = 0; j < ehdr.e_phnum; j++) { ++ if (phdr[j].p_type != PT_LOAD ) ++ continue; ++ if (secs[sec->shdr.sh_info].shdr.sh_offset < phdr[j].p_offset || secs[sec->shdr.sh_info].shdr.sh_offset >= phdr[j].p_offset + phdr[j].p_filesz) ++ continue; ++ base = CONFIG_PAGE_OFFSET + phdr[j].p_paddr - phdr[j].p_vaddr; ++ break; ++ } + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel = &sec->reltab[j]; +- rel->r_offset = elf32_to_cpu(rel->r_offset); ++ rel->r_offset = elf32_to_cpu(rel->r_offset) + base; + rel->r_info = elf32_to_cpu(rel->r_info); + } + } +@@ -396,14 +440,14 @@ static void read_relocs(FILE 
*fp) + + static void print_absolute_symbols(void) + { +- int i; ++ unsigned int i; + printf("Absolute symbols\n"); + printf(" Num: Value Size Type Bind Visibility Name\n"); + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + char *sym_strtab; + Elf32_Sym *sh_symtab; +- int j; ++ unsigned int j; + + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; +@@ -431,14 +475,14 @@ static void print_absolute_symbols(void) + + static void print_absolute_relocs(void) + { +- int i, printed = 0; ++ unsigned int i, printed = 0; + + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + struct section *sec_applies, *sec_symtab; + char *sym_strtab; + Elf32_Sym *sh_symtab; +- int j; ++ unsigned int j; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } +@@ -499,13 +543,13 @@ static void print_absolute_relocs(void) + + static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) + { +- int i; ++ unsigned int i; + /* Walk through the relocations */ + for (i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + struct section *sec_applies, *sec_symtab; +- int j; ++ unsigned int j; + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL) { +@@ -530,6 +574,22 @@ static void walk_relocs(void (*visit)(El + !is_rel_reloc(sym_name(sym_strtab, sym))) { + continue; + } ++ /* Don't relocate actual per-cpu variables, they are absolute indices, not addresses */ ++ if (!strcmp(sec_name(sym->st_shndx), ".data.percpu") && strcmp(sym_name(sym_strtab, sym), "__per_cpu_load")) ++ continue; ++ ++#if defined(CONFIG_PAX_KERNEXEC) && defined(CONFIG_X86_32) ++ /* Don't relocate actual code, they are relocated implicitly by the base address of KERNEL_CS */ ++ if (!strcmp(sec_name(sym->st_shndx), ".data") && !strcmp(sym_name(sym_strtab, sym), "_etext")) ++ continue; ++ if (!strcmp(sec_name(sym->st_shndx), ".init.text")) ++ continue; ++ if (!strcmp(sec_name(sym->st_shndx), ".exit.text")) ++ continue; ++ if (!strcmp(sec_name(sym->st_shndx), ".text") && strcmp(sym_name(sym_strtab, sym), "__LOAD_PHYSICAL_ADDR")) ++ continue; ++#endif ++ + switch (r_type) { + case R_386_NONE: + case R_386_PC32: +@@ -571,7 +631,7 @@ static int cmp_relocs(const void *va, co + + static void emit_relocs(int as_text) + { +- int i; ++ unsigned int i; + /* Count how many relocations I have and allocate space for them. 
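
read_phdrs() above follows the file's existing read_shdrs() pattern: allocate e_phnum records, fseek to e_phoff, fread them in, then convert every field from the ELF file's byte order. A stripped-down host-side sketch of the same shape, assuming a Linux build host with <elf.h> and little-endian byte order so the swap step collapses to a comment:

    #include <elf.h>
    #include <stdio.h>
    #include <stdlib.h>

    static Elf32_Phdr *load_phdrs(FILE *fp, const Elf32_Ehdr *eh)
    {
        Elf32_Phdr *ph = calloc(eh->e_phnum, sizeof(*ph));

        if (!ph)
            return NULL;
        if (fseek(fp, eh->e_phoff, SEEK_SET) != 0 ||
            fread(ph, sizeof(*ph), eh->e_phnum, fp) != eh->e_phnum) {
            free(ph);
            return NULL;
        }
        /* a big-endian host would elf32_to_cpu() each field here */
        return ph;
    }

The program headers are what let the patched read_relocs() map a section's file offset to the PT_LOAD segment containing it, and from there compute the paddr/vaddr bias it folds into every relocation offset.
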
*/ + reloc_count = 0; + walk_relocs(count_reloc); +@@ -665,6 +725,7 @@ int main(int argc, char **argv) + fname, strerror(errno)); + } + read_ehdr(fp); ++ read_phdrs(fp); + read_shdrs(fp); + read_strtabs(fp); + read_symtabs(fp); +diff -urNp linux-2.6.33.1/arch/x86/boot/cpucheck.c linux-2.6.33.1/arch/x86/boot/cpucheck.c +--- linux-2.6.33.1/arch/x86/boot/cpucheck.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/cpucheck.c 2010-03-20 16:58:38.972657826 -0400 +@@ -74,7 +74,7 @@ static int has_fpu(void) + u16 fcw = -1, fsw = -1; + u32 cr0; + +- asm("movl %%cr0,%0" : "=r" (cr0)); ++ asm volatile("movl %%cr0,%0" : "=r" (cr0)); + if (cr0 & (X86_CR0_EM|X86_CR0_TS)) { + cr0 &= ~(X86_CR0_EM|X86_CR0_TS); + asm volatile("movl %0,%%cr0" : : "r" (cr0)); +@@ -90,7 +90,7 @@ static int has_eflag(u32 mask) + { + u32 f0, f1; + +- asm("pushfl ; " ++ asm volatile("pushfl ; " + "pushfl ; " + "popl %0 ; " + "movl %0,%1 ; " +@@ -115,7 +115,7 @@ static void get_flags(void) + set_bit(X86_FEATURE_FPU, cpu.flags); + + if (has_eflag(X86_EFLAGS_ID)) { +- asm("cpuid" ++ asm volatile("cpuid" + : "=a" (max_intel_level), + "=b" (cpu_vendor[0]), + "=d" (cpu_vendor[1]), +@@ -124,7 +124,7 @@ static void get_flags(void) + + if (max_intel_level >= 0x00000001 && + max_intel_level <= 0x0000ffff) { +- asm("cpuid" ++ asm volatile("cpuid" + : "=a" (tfms), + "=c" (cpu.flags[4]), + "=d" (cpu.flags[0]) +@@ -136,7 +136,7 @@ static void get_flags(void) + cpu.model += ((tfms >> 16) & 0xf) << 4; + } + +- asm("cpuid" ++ asm volatile("cpuid" + : "=a" (max_amd_level) + : "a" (0x80000000) + : "ebx", "ecx", "edx"); +@@ -144,7 +144,7 @@ static void get_flags(void) + if (max_amd_level >= 0x80000001 && + max_amd_level <= 0x8000ffff) { + u32 eax = 0x80000001; +- asm("cpuid" ++ asm volatile("cpuid" + : "+a" (eax), + "=c" (cpu.flags[6]), + "=d" (cpu.flags[1]) +@@ -203,9 +203,9 @@ int check_cpu(int *cpu_level_ptr, int *r + u32 ecx = MSR_K7_HWCR; + u32 eax, edx; + +- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); ++ asm volatile("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); + eax &= ~(1 << 15); +- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); ++ asm volatile("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + + get_flags(); /* Make sure it really did something */ + err = check_flags(); +@@ -218,9 +218,9 @@ int check_cpu(int *cpu_level_ptr, int *r + u32 ecx = MSR_VIA_FCR; + u32 eax, edx; + +- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); ++ asm volatile("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); + eax |= (1<<1)|(1<<7); +- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); ++ asm volatile("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + + set_bit(X86_FEATURE_CX8, cpu.flags); + err = check_flags(); +@@ -231,12 +231,12 @@ int check_cpu(int *cpu_level_ptr, int *r + u32 eax, edx; + u32 level = 1; + +- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); +- asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx)); +- asm("cpuid" ++ asm volatile("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); ++ asm volatile("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx)); ++ asm volatile("cpuid" + : "+a" (level), "=d" (cpu.flags[0]) + : : "ecx", "ebx"); +- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); ++ asm volatile("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + + err = check_flags(); + } +diff -urNp linux-2.6.33.1/arch/x86/boot/header.S linux-2.6.33.1/arch/x86/boot/header.S +--- linux-2.6.33.1/arch/x86/boot/header.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/header.S 2010-03-20 16:58:38.972657826 -0400 +@@ -224,7 +224,7 @@ 
setup_data: .quad 0 # 64-bit physical + # single linked list of + # struct setup_data + +-pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr ++pref_address: .quad ____LOAD_PHYSICAL_ADDR # preferred load addr + + #define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) + #define VO_INIT_SIZE (VO__end - VO__text) +diff -urNp linux-2.6.33.1/arch/x86/boot/video-vesa.c linux-2.6.33.1/arch/x86/boot/video-vesa.c +--- linux-2.6.33.1/arch/x86/boot/video-vesa.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/boot/video-vesa.c 2010-03-20 16:58:38.972657826 -0400 +@@ -200,6 +200,7 @@ static void vesa_store_pm_info(void) + + boot_params.screen_info.vesapm_seg = oreg.es; + boot_params.screen_info.vesapm_off = oreg.di; ++ boot_params.screen_info.vesapm_size = oreg.cx; + } + + /* +diff -urNp linux-2.6.33.1/arch/x86/ia32/ia32_signal.c linux-2.6.33.1/arch/x86/ia32/ia32_signal.c +--- linux-2.6.33.1/arch/x86/ia32/ia32_signal.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/ia32/ia32_signal.c 2010-03-20 16:58:38.972657826 -0400 +@@ -403,7 +403,7 @@ static void __user *get_sigframe(struct + sp -= frame_size; + /* Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. */ +- sp = ((sp + 4) & -16ul) - 4; ++ sp = ((sp - 12) & -16ul) - 4; + return (void __user *) sp; + } + +@@ -503,7 +503,7 @@ int ia32_setup_rt_frame(int sig, struct + 0xb8, + __NR_ia32_rt_sigreturn, + 0x80cd, +- 0, ++ 0 + }; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); +diff -urNp linux-2.6.33.1/arch/x86/include/asm/alternative.h linux-2.6.33.1/arch/x86/include/asm/alternative.h +--- linux-2.6.33.1/arch/x86/include/asm/alternative.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/alternative.h 2010-03-20 16:58:38.972657826 -0400 +@@ -86,7 +86,7 @@ static inline void alternatives_smp_swit + " .byte 664f-663f\n" /* replacementlen */ \ + " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ + ".previous\n" \ +- ".section .altinstr_replacement, "ax"\n" \ ++ ".section .altinstr_replacement, "a"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/apm.h linux-2.6.33.1/arch/x86/include/asm/apm.h +--- linux-2.6.33.1/arch/x86/include/asm/apm.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/apm.h 2010-03-20 16:58:38.972657826 -0400 +@@ -34,7 +34,7 @@ static inline void apm_bios_call_asm(u32 + __asm__ __volatile__(APM_DO_ZERO_SEGS + "pushl %%edi\n\t" + "pushl %%ebp\n\t" +- "lcall *%%cs:apm_bios_entry\n\t" ++ "lcall *%%ss:apm_bios_entry\n\t" + "setc %%al\n\t" + "popl %%ebp\n\t" + "popl %%edi\n\t" +@@ -58,7 +58,7 @@ static inline u8 apm_bios_call_simple_as + __asm__ __volatile__(APM_DO_ZERO_SEGS + "pushl %%edi\n\t" + "pushl %%ebp\n\t" +- "lcall *%%cs:apm_bios_entry\n\t" ++ "lcall *%%ss:apm_bios_entry\n\t" + "setc %%bl\n\t" + "popl %%ebp\n\t" + "popl %%edi\n\t" +diff -urNp linux-2.6.33.1/arch/x86/include/asm/atomic_32.h linux-2.6.33.1/arch/x86/include/asm/atomic_32.h +--- linux-2.6.33.1/arch/x86/include/asm/atomic_32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/atomic_32.h 2010-03-20 16:58:38.972657826 -0400 +@@ -25,6 +25,17 @@ static inline int atomic_read(const atom + } + + /** ++ * atomic_read_unchecked - read atomic variable ++ * @v: pointer of type atomic_unchecked_t ++ * ++ * Atomically reads the value of @v. 
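
The get_sigframe() change above deserves a note: both the old ((sp + 4) & -16ul) - 4 and the new ((sp - 12) & -16ul) - 4 satisfy the i386 ABI rule that (sp + 4) be 16-byte aligned at function entry, but only the new form is guaranteed to land strictly below the incoming stack pointer (the old expression returned sp unchanged whenever sp was already suitably aligned). A quick self-check, purely illustrative:

    #include <assert.h>

    int main(void)
    {
        unsigned long sp;

        for (sp = 0x1000; sp < 0x1040; sp++) {
            unsigned long new_sp = ((sp - 12) & -16ul) - 4;

            assert((new_sp + 4) % 16 == 0);  /* ABI alignment on entry */
            assert(new_sp < sp);             /* always moves down-stack */
        }
        return 0;
    }
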
++ */ ++static inline int atomic_read_unchecked(const atomic_unchecked_t *v) ++{ ++ return v->counter; ++} ++ ++/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value +@@ -37,6 +48,18 @@ static inline void atomic_set(atomic_t * + } + + /** ++ * atomic_set_unchecked - set atomic variable ++ * @v: pointer of type atomic_unchecked_t ++ * @i: required value ++ * ++ * Atomically sets the value of @v to @i. ++ */ ++static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i) ++{ ++ v->counter = i; ++} ++ ++/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t +@@ -45,7 +68,29 @@ static inline void atomic_set(atomic_t * + */ + static inline void atomic_add(int i, atomic_t *v) + { +- asm volatile(LOCK_PREFIX "addl %1,%0" ++ asm volatile(LOCK_PREFIX "addl %1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "subl %1,%0\n" ++ "into\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "+m" (v->counter) ++ : "ir" (i)); ++} ++ ++/** ++ * atomic_add_unchecked - add integer to atomic variable ++ * @i: integer value to add ++ * @v: pointer of type atomic_unchecked_t ++ * ++ * Atomically adds @i to @v. ++ */ ++static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) ++{ ++ asm volatile(LOCK_PREFIX "addl %1,%0\n" + : "+m" (v->counter) + : "ir" (i)); + } +@@ -59,7 +104,29 @@ static inline void atomic_add(int i, ato + */ + static inline void atomic_sub(int i, atomic_t *v) + { +- asm volatile(LOCK_PREFIX "subl %1,%0" ++ asm volatile(LOCK_PREFIX "subl %1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "addl %1,%0\n" ++ "into\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "+m" (v->counter) ++ : "ir" (i)); ++} ++ ++/** ++ * atomic_sub_unchecked - subtract integer from atomic variable ++ * @i: integer value to subtract ++ * @v: pointer of type atomic_t ++ * ++ * Atomically subtracts @i from @v. ++ */ ++static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) ++{ ++ asm volatile(LOCK_PREFIX "subl %1,%0\n" + : "+m" (v->counter) + : "ir" (i)); + } +@@ -77,7 +144,16 @@ static inline int atomic_sub_and_test(in + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" ++ asm volatile(LOCK_PREFIX "subl %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "addl %2,%0\n" ++ "into\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ "sete %1\n" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +@@ -91,7 +167,30 @@ static inline int atomic_sub_and_test(in + */ + static inline void atomic_inc(atomic_t *v) + { +- asm volatile(LOCK_PREFIX "incl %0" ++ asm volatile(LOCK_PREFIX "incl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "into\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "decl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ : "+m" (v->counter)); ++} ++ ++/** ++ * atomic_inc_unchecked - increment atomic variable ++ * @v: pointer of type atomic_unchecked_t ++ * ++ * Atomically increments @v by 1. 
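
The CONFIG_PAX_REFCOUNT blocks threaded through atomic_32.h all instantiate one pattern: perform the locked operation, and if the signed result overflowed (OF set), undo it and raise #OF via "into" so the PaX handler can report and kill the offender; the *_unchecked twins keep plain wrapping semantics for counters where overflow is benign. A portable C11 illustration of the checked behaviour; the function name is invented, and abort() stands in for the kernel's trap-and-report path:

    #include <stdatomic.h>
    #include <stdlib.h>

    static void refcount_add_checked(atomic_int *v, int i)
    {
        int old = atomic_load(v), next;

        do {
            if (__builtin_add_overflow(old, i, &next))
                abort();  /* kernel: undo, trap to #OF, SIGKILL the task */
        } while (!atomic_compare_exchange_weak(v, &old, next));
    }

The split into checked and unchecked types is the price of the scheme: call sites that legitimately wrap (sequence numbers, statistics) must be converted to atomic_unchecked_t or they will trip the trap.
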
++ */ ++static inline void atomic_inc_unchecked(atomic_unchecked_t *v) ++{ ++ asm volatile(LOCK_PREFIX "incl %0\n" + : "+m" (v->counter)); + } + +@@ -103,7 +202,18 @@ static inline void atomic_inc(atomic_t * + */ + static inline void atomic_dec(atomic_t *v) + { +- asm volatile(LOCK_PREFIX "decl %0" ++ asm volatile(LOCK_PREFIX "decl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "into\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "incl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+m" (v->counter)); + } + +@@ -119,7 +229,19 @@ static inline int atomic_dec_and_test(at + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "decl %0; sete %1" ++ asm volatile(LOCK_PREFIX "decl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "into\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "incl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +@@ -137,7 +259,19 @@ static inline int atomic_inc_and_test(at + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "incl %0; sete %1" ++ asm volatile(LOCK_PREFIX "incl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "into\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "decl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +@@ -156,7 +290,16 @@ static inline int atomic_add_negative(in + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" ++ asm volatile(LOCK_PREFIX "addl %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "subl %2,%0\n" ++ "into\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ "sets %1\n" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +@@ -179,6 +322,46 @@ static inline int atomic_add_return(int + #endif + /* Modern 486+ processor */ + __i = i; ++ asm volatile(LOCK_PREFIX "xaddl %0, %1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "movl %0, %1\n" ++ "into\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "+r" (i), "+m" (v->counter) ++ : : "memory"); ++ return i + __i; ++ ++#ifdef CONFIG_M386 ++no_xadd: /* Legacy 386 processor */ ++ local_irq_save(flags); ++ __i = atomic_read(v); ++ atomic_set(v, i + __i); ++ local_irq_restore(flags); ++ return i + __i; ++#endif ++} ++ ++/** ++ * atomic_add_return_unchecked - add integer and return ++ * @v: pointer of type atomic_unchecked_t ++ * @i: integer value to add ++ * ++ * Atomically adds @i to @v and returns @i + @v ++ */ ++static inline int atomic_add_return_unchecked(int i, atomic_unchecked_t *v) ++{ ++ int __i; ++#ifdef CONFIG_M386 ++ unsigned long flags; ++ if (unlikely(boot_cpu_data.x86 <= 3)) ++ goto no_xadd; ++#endif ++ /* Modern 486+ processor */ ++ __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); +@@ -227,22 +410,34 @@ static inline int atomic_xchg(atomic_t * + */ + static inline int atomic_add_unless(atomic_t *v, int a, int u) + { +- int c, old; ++ int c, old, new; + c = atomic_read(v); + for (;;) { +- if (unlikely(c == (u))) ++ if (unlikely(c == u)) + break; +- old = atomic_cmpxchg((v), c, c + (a)); ++ ++ asm volatile("addl %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "into\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "=r" (new) ++ : "0" (c), "ir" (a)); ++ ++ old = atomic_cmpxchg(v, c, new); + if (likely(old == c)) + break; + c = old; + } +- return c != (u); ++ return c != u; + } + + #define 
atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + + #define atomic_inc_return(v) (atomic_add_return(1, v)) ++#define atomic_inc_return_unchecked(v) (atomic_add_return_unchecked(1, v)) + #define atomic_dec_return(v) (atomic_sub_return(1, v)) + + /* These are x86-specific, used by some header files */ +@@ -266,6 +461,14 @@ typedef struct { + u64 __aligned(8) counter; + } atomic64_t; + ++#ifdef CONFIG_PAX_REFCOUNT ++typedef struct { ++ u64 __aligned(8) counter; ++} atomic64_unchecked_t; ++#else ++typedef atomic64_t atomic64_unchecked_t; ++#endif ++ + #define ATOMIC64_INIT(val) { (val) } + + extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); +diff -urNp linux-2.6.33.1/arch/x86/include/asm/atomic_64.h linux-2.6.33.1/arch/x86/include/asm/atomic_64.h +--- linux-2.6.33.1/arch/x86/include/asm/atomic_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/atomic_64.h 2010-03-20 16:58:38.972657826 -0400 +@@ -24,6 +24,17 @@ static inline int atomic_read(const atom + } + + /** ++ * atomic_read_unchecked - read atomic variable ++ * @v: pointer of type atomic_unchecked_t ++ * ++ * Atomically reads the value of @v. ++ */ ++static inline int atomic_read_unchecked(const atomic_unchecked_t *v) ++{ ++ return v->counter; ++} ++ ++/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value +@@ -36,6 +47,18 @@ static inline void atomic_set(atomic_t * + } + + /** ++ * atomic_set_unchecked - set atomic variable ++ * @v: pointer of type atomic_unchecked_t ++ * @i: required value ++ * ++ * Atomically sets the value of @v to @i. ++ */ ++static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i) ++{ ++ v->counter = i; ++} ++ ++/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t +@@ -44,7 +67,29 @@ static inline void atomic_set(atomic_t * + */ + static inline void atomic_add(int i, atomic_t *v) + { +- asm volatile(LOCK_PREFIX "addl %1,%0" ++ asm volatile(LOCK_PREFIX "addl %1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "subl %1,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "=m" (v->counter) ++ : "ir" (i), "m" (v->counter)); ++} ++ ++/** ++ * atomic_add_unchecked - add integer to atomic variable ++ * @i: integer value to add ++ * @v: pointer of type atomic_unchecked_t ++ * ++ * Atomically adds @i to @v. ++ */ ++static inline void atomic_add_unchecked(int i, atomic_unchecked_t *v) ++{ ++ asm volatile(LOCK_PREFIX "addl %1,%0\n" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); + } +@@ -58,7 +103,29 @@ static inline void atomic_add(int i, ato + */ + static inline void atomic_sub(int i, atomic_t *v) + { +- asm volatile(LOCK_PREFIX "subl %1,%0" ++ asm volatile(LOCK_PREFIX "subl %1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "addl %1,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "=m" (v->counter) ++ : "ir" (i), "m" (v->counter)); ++} ++ ++/** ++ * atomic_sub_unchecked - subtract the atomic variable ++ * @i: integer value to subtract ++ * @v: pointer of type atomic_unchecked_t ++ * ++ * Atomically subtracts @i from @v. 
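
The atomic_add_unless() rewrite above routes the addition through the overflow-checked path before handing the candidate value to cmpxchg; the enclosing retry loop is the standard compare-and-swap idiom. Its shape in portable C11, with the overflow check elided for brevity:

    #include <stdatomic.h>

    /* add a to *v unless *v == u; returns nonzero if the add happened */
    static int add_unless(atomic_int *v, int a, int u)
    {
        int c = atomic_load(v);

        while (c != u) {
            /* on failure, c is refreshed with the current value of *v */
            if (atomic_compare_exchange_weak(v, &c, c + a))
                return 1;
        }
        return 0;
    }

The dropped parentheses around (u) and (a) in the patch are safe because these are parameters of a static inline function, not macro arguments.
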
++ */ ++static inline void atomic_sub_unchecked(int i, atomic_unchecked_t *v) ++{ ++ asm volatile(LOCK_PREFIX "subl %1,%0\n" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); + } +@@ -76,7 +143,16 @@ static inline int atomic_sub_and_test(in + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" ++ asm volatile(LOCK_PREFIX "subl %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "addl %2,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ "sete %1\n" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +@@ -90,7 +166,32 @@ static inline int atomic_sub_and_test(in + */ + static inline void atomic_inc(atomic_t *v) + { +- asm volatile(LOCK_PREFIX "incl %0" ++ asm volatile(LOCK_PREFIX "incl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "decl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ : "=m" (v->counter) ++ : "m" (v->counter)); ++} ++ ++/** ++ * atomic_inc_unchecked - increment atomic variable ++ * @v: pointer of type atomic_unchecked_t ++ * ++ * Atomically increments @v by 1. ++ */ ++static inline void atomic_inc_unchecked(atomic_unchecked_t *v) ++{ ++ asm volatile(LOCK_PREFIX "incl %0\n" + : "=m" (v->counter) + : "m" (v->counter)); + } +@@ -103,7 +204,19 @@ static inline void atomic_inc(atomic_t * + */ + static inline void atomic_dec(atomic_t *v) + { +- asm volatile(LOCK_PREFIX "decl %0" ++ asm volatile(LOCK_PREFIX "decl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "incl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "=m" (v->counter) + : "m" (v->counter)); + } +@@ -120,7 +233,20 @@ static inline int atomic_dec_and_test(at + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "decl %0; sete %1" ++ asm volatile(LOCK_PREFIX "decl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "incl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +@@ -138,7 +264,20 @@ static inline int atomic_inc_and_test(at + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "incl %0; sete %1" ++ asm volatile(LOCK_PREFIX "incl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "decl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +@@ -157,7 +296,16 @@ static inline int atomic_add_negative(in + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" ++ asm volatile(LOCK_PREFIX "addl %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "subl %2,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ "sets %1\n" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +@@ -173,7 +321,15 @@ static inline int atomic_add_negative(in + static inline int atomic_add_return(int i, atomic_t *v) + { + int __i = i; +- asm volatile(LOCK_PREFIX "xaddl %0, %1" ++ asm volatile(LOCK_PREFIX "xaddl %0, %1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "movl %0, %1\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ + : "+r" (i), "+m" (v->counter) + 
: : "memory"); + return i + __i; +@@ -204,6 +360,18 @@ static inline long atomic64_read(const a + } + + /** ++ * atomic64_read_unchecked - read atomic64 variable ++ * @v: pointer of type atomic64_unchecked_t ++ * ++ * Atomically reads the value of @v. ++ * Doesn't imply a read memory barrier. ++ */ ++static inline long atomic64_read_unchecked(const atomic64_unchecked_t *v) ++{ ++ return v->counter; ++} ++ ++/** + * atomic64_set - set atomic64 variable + * @v: pointer to type atomic64_t + * @i: required value +@@ -216,6 +384,18 @@ static inline void atomic64_set(atomic64 + } + + /** ++ * atomic64_set_unchecked - set atomic64 variable ++ * @v: pointer to type atomic64_unchecked_t ++ * @i: required value ++ * ++ * Atomically sets the value of @v to @i. ++ */ ++static inline void atomic64_set_unchecked(atomic64_unchecked_t *v, long i) ++{ ++ v->counter = i; ++} ++ ++/** + * atomic64_add - add integer to atomic64 variable + * @i: integer value to add + * @v: pointer to type atomic64_t +@@ -224,6 +404,28 @@ static inline void atomic64_set(atomic64 + */ + static inline void atomic64_add(long i, atomic64_t *v) + { ++ asm volatile(LOCK_PREFIX "addq %1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "subq %1,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "=m" (v->counter) ++ : "er" (i), "m" (v->counter)); ++} ++ ++/** ++ * atomic64_add_unchecked - add integer to atomic64 variable ++ * @i: integer value to add ++ * @v: pointer to type atomic64_unchecked_t ++ * ++ * Atomically adds @i to @v. ++ */ ++static inline void atomic64_add_unchecked(long i, atomic64_unchecked_t *v) ++{ + asm volatile(LOCK_PREFIX "addq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +@@ -238,7 +440,15 @@ static inline void atomic64_add(long i, + */ + static inline void atomic64_sub(long i, atomic64_t *v) + { +- asm volatile(LOCK_PREFIX "subq %1,%0" ++ asm volatile(LOCK_PREFIX "subq %1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "addq %1,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); + } +@@ -256,7 +466,16 @@ static inline int atomic64_sub_and_test( + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" ++ asm volatile(LOCK_PREFIX "subq %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "addq %2,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ "sete %1\n" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +@@ -270,6 +489,31 @@ static inline int atomic64_sub_and_test( + */ + static inline void atomic64_inc(atomic64_t *v) + { ++ asm volatile(LOCK_PREFIX "incq %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "decq %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ : "=m" (v->counter) ++ : "m" (v->counter)); ++} ++ ++/** ++ * atomic64_inc_unchecked - increment atomic64 variable ++ * @v: pointer to type atomic64_unchecked_t ++ * ++ * Atomically increments @v by 1. 
++ */ ++static inline void atomic64_inc_unchecked(atomic64_unchecked_t *v) ++{ + asm volatile(LOCK_PREFIX "incq %0" + : "=m" (v->counter) + : "m" (v->counter)); +@@ -283,7 +527,19 @@ static inline void atomic64_inc(atomic64 + */ + static inline void atomic64_dec(atomic64_t *v) + { +- asm volatile(LOCK_PREFIX "decq %0" ++ asm volatile(LOCK_PREFIX "decq %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "incq %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "=m" (v->counter) + : "m" (v->counter)); + } +@@ -300,7 +556,20 @@ static inline int atomic64_dec_and_test( + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "decq %0; sete %1" ++ asm volatile(LOCK_PREFIX "decq %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "incq %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +@@ -318,7 +587,20 @@ static inline int atomic64_inc_and_test( + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "incq %0; sete %1" ++ asm volatile(LOCK_PREFIX "incq %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ ".pushsection .fixup,"ax"\n" ++ "1: \n" ++ LOCK_PREFIX "decq %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +@@ -337,7 +619,16 @@ static inline int atomic64_add_negative( + { + unsigned char c; + +- asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" ++ asm volatile(LOCK_PREFIX "addq %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ LOCK_PREFIX "subq %2,%0\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ "sets %1\n" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +@@ -353,7 +644,31 @@ static inline int atomic64_add_negative( + static inline long atomic64_add_return(long i, atomic64_t *v) + { + long __i = i; +- asm volatile(LOCK_PREFIX "xaddq %0, %1;" ++ asm volatile(LOCK_PREFIX "xaddq %0, %1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "movq %0, %1\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "+r" (i), "+m" (v->counter) ++ : : "memory"); ++ return i + __i; ++} ++ ++/** ++ * atomic64_add_return_unchecked - add and return ++ * @i: integer value to add ++ * @v: pointer to type atomic64_unchecked_t ++ * ++ * Atomically adds @i to @v and returns @i + @v ++ */ ++static inline long atomic64_add_return_unchecked(long i, atomic64_unchecked_t *v) ++{ ++ long __i = i; ++ asm volatile(LOCK_PREFIX "xaddq %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +@@ -365,6 +680,7 @@ static inline long atomic64_sub_return(l + } + + #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) ++#define atomic64_inc_return_unchecked(v) (atomic64_add_return_unchecked(1, (v))) + #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) + + static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +@@ -398,17 +714,29 @@ static inline long atomic_xchg(atomic_t + */ + static inline int atomic_add_unless(atomic_t *v, int a, int u) + { +- int c, old; ++ int c, old, new; + c = atomic_read(v); + for (;;) { +- if (unlikely(c == (u))) ++ if (unlikely(c == u)) + break; +- old = atomic_cmpxchg((v), c, c + (a)); ++ ++ asm volatile("addl %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ 
"jno 0f\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "=r" (new) ++ : "0" (c), "ir" (a)); ++ ++ old = atomic_cmpxchg(v, c, new); + if (likely(old == c)) + break; + c = old; + } +- return c != (u); ++ return c != u; + } + + #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) +@@ -424,17 +752,29 @@ static inline int atomic_add_unless(atom + */ + static inline int atomic64_add_unless(atomic64_t *v, long a, long u) + { +- long c, old; ++ long c, old, new; + c = atomic64_read(v); + for (;;) { +- if (unlikely(c == (u))) ++ if (unlikely(c == u)) + break; +- old = atomic64_cmpxchg((v), c, c + (a)); ++ ++ asm volatile("addq %2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ "jno 0f\n" ++ "int $4\n0:\n" ++ _ASM_EXTABLE(0b, 0b) ++#endif ++ ++ : "=r" (new) ++ : "0" (c), "er" (a)); ++ ++ old = atomic64_cmpxchg((v), c, new); + if (likely(old == c)) + break; + c = old; + } +- return c != (u); ++ return c != u; + } + + /** +diff -urNp linux-2.6.33.1/arch/x86/include/asm/boot.h linux-2.6.33.1/arch/x86/include/asm/boot.h +--- linux-2.6.33.1/arch/x86/include/asm/boot.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/boot.h 2010-03-20 16:58:38.972657826 -0400 +@@ -11,10 +11,15 @@ + #include <asm/pgtable_types.h> + + /* Physical address where kernel should be loaded. */ +-#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ ++#define ____LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + + (CONFIG_PHYSICAL_ALIGN - 1)) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) + ++#ifndef __ASSEMBLY__ ++extern unsigned char __LOAD_PHYSICAL_ADDR[]; ++#define LOAD_PHYSICAL_ADDR ((unsigned long)__LOAD_PHYSICAL_ADDR) ++#endif ++ + /* Minimum kernel alignment, as a power of two */ + #ifdef CONFIG_X86_64 + #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +diff -urNp linux-2.6.33.1/arch/x86/include/asm/cache.h linux-2.6.33.1/arch/x86/include/asm/cache.h +--- linux-2.6.33.1/arch/x86/include/asm/cache.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/cache.h 2010-03-20 16:58:38.976510592 -0400 +@@ -8,6 +8,7 @@ + #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) + + #define __read_mostly __attribute__((__section__(".data.read_mostly"))) ++#define __read_only __attribute__((__section__(".data.read_only"))) + + #define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT + #define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT) +diff -urNp linux-2.6.33.1/arch/x86/include/asm/checksum_32.h linux-2.6.33.1/arch/x86/include/asm/checksum_32.h +--- linux-2.6.33.1/arch/x86/include/asm/checksum_32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/checksum_32.h 2010-03-20 16:58:38.976510592 -0400 +@@ -31,6 +31,14 @@ asmlinkage __wsum csum_partial_copy_gene + int len, __wsum sum, + int *src_err_ptr, int *dst_err_ptr); + ++asmlinkage __wsum csum_partial_copy_generic_to_user(const void *src, void *dst, ++ int len, __wsum sum, ++ int *src_err_ptr, int *dst_err_ptr); ++ ++asmlinkage __wsum csum_partial_copy_generic_from_user(const void *src, void *dst, ++ int len, __wsum sum, ++ int *src_err_ptr, int *dst_err_ptr); ++ + /* + * Note: when you get a NULL pointer exception here this means someone + * passed in an incorrect kernel address to one of these functions. 
+@@ -50,7 +58,7 @@ static inline __wsum csum_partial_copy_f + int *err_ptr) + { + might_sleep(); +- return csum_partial_copy_generic((__force void *)src, dst, ++ return csum_partial_copy_generic_from_user((__force void *)src, dst, + len, sum, err_ptr, NULL); + } + +@@ -178,7 +186,7 @@ static inline __wsum csum_and_copy_to_us + { + might_sleep(); + if (access_ok(VERIFY_WRITE, dst, len)) +- return csum_partial_copy_generic(src, (__force void *)dst, ++ return csum_partial_copy_generic_to_user(src, (__force void *)dst, + len, sum, NULL, err_ptr); + + if (len) +diff -urNp linux-2.6.33.1/arch/x86/include/asm/desc.h linux-2.6.33.1/arch/x86/include/asm/desc.h +--- linux-2.6.33.1/arch/x86/include/asm/desc.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/desc.h 2010-03-20 16:58:38.976510592 -0400 +@@ -4,6 +4,7 @@ + #include <asm/desc_defs.h> + #include <asm/ldt.h> + #include <asm/mmu.h> ++#include <asm/pgtable.h> + #include <linux/smp.h> + + static inline void fill_ldt(struct desc_struct *desc, +@@ -15,6 +16,7 @@ static inline void fill_ldt(struct desc_ + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; ++ desc->type |= info->seg_not_present ^ 1; + desc->s = 1; + desc->dpl = 0x3; + desc->p = info->seg_not_present ^ 1; +@@ -31,16 +33,12 @@ static inline void fill_ldt(struct desc_ + } + + extern struct desc_ptr idt_descr; +-extern gate_desc idt_table[]; +- +-struct gdt_page { +- struct desc_struct gdt[GDT_ENTRIES]; +-} __attribute__((aligned(PAGE_SIZE))); +-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); ++extern gate_desc idt_table[256]; + ++extern struct desc_struct cpu_gdt_table[NR_CPUS][PAGE_SIZE / sizeof(struct desc_struct)]; + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) + { +- return per_cpu(gdt_page, cpu).gdt; ++ return cpu_gdt_table[cpu]; + } + + #ifdef CONFIG_X86_64 +@@ -115,19 +113,24 @@ static inline void paravirt_free_ldt(str + static inline void native_write_idt_entry(gate_desc *idt, int entry, + const gate_desc *gate) + { ++ pax_open_kernel(); + memcpy(&idt[entry], gate, sizeof(*gate)); ++ pax_close_kernel(); + } + + static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc) + { ++ pax_open_kernel(); + memcpy(&ldt[entry], desc, 8); ++ pax_close_kernel(); + } + + static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type) + { + unsigned int size; ++ + switch (type) { + case DESC_TSS: + size = sizeof(tss_desc); +@@ -139,7 +142,10 @@ static inline void native_write_gdt_entr + size = sizeof(struct desc_struct); + break; + } ++ ++ pax_open_kernel(); + memcpy(&gdt[entry], desc, size); ++ pax_close_kernel(); + } + + static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, +@@ -211,7 +217,9 @@ static inline void native_set_ldt(const + + static inline void native_load_tr_desc(void) + { ++ pax_open_kernel(); + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); ++ pax_close_kernel(); + } + + static inline void native_load_gdt(const struct desc_ptr *dtr) +@@ -246,8 +254,10 @@ static inline void native_load_tls(struc + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + ++ pax_open_kernel(); + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; ++ pax_close_kernel(); + } + + #define _LDT_empty(info) \ +@@ -392,4 +402,16 @@ static inline void set_system_intr_gate_ + _set_gate(n, 
GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); + } + ++#ifdef CONFIG_X86_32 ++static inline void set_user_cs(unsigned long base, unsigned long limit, int cpu) ++{ ++ struct desc_struct d; ++ ++ if (likely(limit)) ++ limit = (limit - 1UL) >> PAGE_SHIFT; ++ pack_descriptor(&d, base, limit, 0xFB, 0xC); ++ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_DEFAULT_USER_CS, &d, DESCTYPE_S); ++} ++#endif ++ + #endif /* _ASM_X86_DESC_H */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/device.h linux-2.6.33.1/arch/x86/include/asm/device.h +--- linux-2.6.33.1/arch/x86/include/asm/device.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/device.h 2010-03-20 16:58:38.976510592 -0400 +@@ -6,7 +6,7 @@ struct dev_archdata { + void *acpi_handle; + #endif + #ifdef CONFIG_X86_64 +-struct dma_map_ops *dma_ops; ++ const struct dma_map_ops *dma_ops; + #endif + #if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU) + void *iommu; /* hook for IOMMU specific extension */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/dma-mapping.h linux-2.6.33.1/arch/x86/include/asm/dma-mapping.h +--- linux-2.6.33.1/arch/x86/include/asm/dma-mapping.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/dma-mapping.h 2010-03-20 16:58:38.976510592 -0400 +@@ -26,9 +26,9 @@ extern int iommu_merge; + extern struct device x86_dma_fallback_dev; + extern int panic_on_overflow; + +-extern struct dma_map_ops *dma_ops; ++extern const struct dma_map_ops *dma_ops; + +-static inline struct dma_map_ops *get_dma_ops(struct device *dev) ++static inline const struct dma_map_ops *get_dma_ops(struct device *dev) + { + #ifdef CONFIG_X86_32 + return dma_ops; +@@ -45,7 +45,7 @@ static inline struct dma_map_ops *get_dm + /* Make sure we keep the same behaviour */ + static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); + +@@ -123,7 +123,7 @@ static inline void * + dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + void *memory; + + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); +@@ -150,7 +150,7 @@ dma_alloc_coherent(struct device *dev, s + static inline void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + WARN_ON(irqs_disabled()); /* for portability */ + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/e820.h linux-2.6.33.1/arch/x86/include/asm/e820.h +--- linux-2.6.33.1/arch/x86/include/asm/e820.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/e820.h 2010-03-20 16:58:38.976510592 -0400 +@@ -64,7 +64,7 @@ struct e820map { + #define ISA_START_ADDRESS 0xa0000 + #define ISA_END_ADDRESS 0x100000 + +-#define BIOS_BEGIN 0x000a0000 ++#define BIOS_BEGIN 0x000c0000 + #define BIOS_END 0x00100000 + + #ifdef __KERNEL__ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/elf.h linux-2.6.33.1/arch/x86/include/asm/elf.h +--- linux-2.6.33.1/arch/x86/include/asm/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/elf.h 2010-03-20 16:58:38.976510592 -0400 +@@ -237,7 +237,25 @@ extern int force_personality32; + the loader. 
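
The device.h/dma-mapping.h/iommu.h hunks above are all one idea: function-pointer tables that are never legitimately modified at runtime become const, so they are emitted into .rodata instead of writable data and stop being a convenient hijack target for a kernel-write primitive. The pattern in miniature, with types invented for the sketch:

    struct demo_ops {
        int  (*open)(void);
        void (*close)(void);
    };

    static int  demo_open(void)  { return 0; }
    static void demo_close(void) { }

    /* const lands the table in .rodata; later stores through it are
     * rejected at compile time and fault at run time */
    static const struct demo_ops demo_ops_table = {
        .open  = demo_open,
        .close = demo_close,
    };

The cost is visible in the same hunks: every declaration and accessor that passes such a table around has to grow a const qualifier as well.
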
We need to make sure that it is out of the way of the program + that it will "exec", and that there is sufficient room for the brk. */ + ++#ifdef CONFIG_PAX_SEGMEXEC ++#define ELF_ET_DYN_BASE ((current->mm->pax_flags & MF_PAX_SEGMEXEC) ? SEGMEXEC_TASK_SIZE/3*2 : TASK_SIZE/3*2) ++#else + #define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) ++#endif ++ ++#ifdef CONFIG_PAX_ASLR ++#ifdef CONFIG_X86_32 ++#define PAX_ELF_ET_DYN_BASE 0x10000000UL ++ ++#define PAX_DELTA_MMAP_LEN (current->mm->pax_flags & MF_PAX_SEGMEXEC ? 15 : 16) ++#define PAX_DELTA_STACK_LEN (current->mm->pax_flags & MF_PAX_SEGMEXEC ? 15 : 16) ++#else ++#define PAX_ELF_ET_DYN_BASE 0x400000UL ++ ++#define PAX_DELTA_MMAP_LEN ((test_thread_flag(TIF_IA32)) ? 16 : 32) ++#define PAX_DELTA_STACK_LEN ((test_thread_flag(TIF_IA32)) ? 16 : 32) ++#endif ++#endif + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +@@ -291,8 +309,7 @@ do { \ + #define ARCH_DLINFO \ + do { \ + if (vdso_enabled) \ +- NEW_AUX_ENT(AT_SYSINFO_EHDR, \ +- (unsigned long)current->mm->context.vdso); \ ++ NEW_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso);\ + } while (0) + + #define AT_SYSINFO 32 +@@ -303,7 +320,7 @@ do { \ + + #endif /* !CONFIG_X86_32 */ + +-#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) ++#define VDSO_CURRENT_BASE (current->mm->context.vdso) + + #define VDSO_ENTRY \ + ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall)) +@@ -317,7 +334,4 @@ extern int arch_setup_additional_pages(s + extern int syscall32_setup_pages(struct linux_binprm *, int exstack); + #define compat_arch_setup_additional_pages syscall32_setup_pages + +-extern unsigned long arch_randomize_brk(struct mm_struct *mm); +-#define arch_randomize_brk arch_randomize_brk +- + #endif /* _ASM_X86_ELF_H */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/futex.h linux-2.6.33.1/arch/x86/include/asm/futex.h +--- linux-2.6.33.1/arch/x86/include/asm/futex.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/futex.h 2010-03-20 16:58:38.976510592 -0400 +@@ -11,6 +11,40 @@ + #include <asm/processor.h> + #include <asm/system.h> + ++#ifdef CONFIG_X86_32 ++#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ ++ asm volatile( \ ++ "movw\t%w6, %%ds\n" \ ++ "1:\t" insn "\n" \ ++ "2:\tpushl\t%%ss\n" \ ++ "\tpopl\t%%ds\n" \ ++ "\t.section .fixup,"ax"\n" \ ++ "3:\tmov\t%3, %1\n" \ ++ "\tjmp\t2b\n" \ ++ "\t.previous\n" \ ++ _ASM_EXTABLE(1b, 3b) \ ++ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ ++ : "i" (-EFAULT), "0" (oparg), "1" (0), "r" (__USER_DS)) ++ ++#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \ ++ asm volatile("movw\t%w7, %%es\n" \ ++ "1:\tmovl\t%%es:%2, %0\n" \ ++ "\tmovl\t%0, %3\n" \ ++ "\t" insn "\n" \ ++ "2:\t" LOCK_PREFIX "cmpxchgl %3, %%es:%2\n"\ ++ "\tjnz\t1b\n" \ ++ "3:\tpushl\t%%ss\n" \ ++ "\tpopl\t%%es\n" \ ++ "\t.section .fixup,"ax"\n" \ ++ "4:\tmov\t%5, %1\n" \ ++ "\tjmp\t3b\n" \ ++ "\t.previous\n" \ ++ _ASM_EXTABLE(1b, 4b) \ ++ _ASM_EXTABLE(2b, 4b) \ ++ : "=&a" (oldval), "=&r" (ret), \ ++ "+m" (*uaddr), "=&r" (tem) \ ++ : "r" (oparg), "i" (-EFAULT), "1" (0), "r" (__USER_DS)) ++#else + #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ + asm volatile("1:\t" insn "\n" \ + "2:\t.section .fixup,"ax"\n" \ +@@ -36,8 +70,9 @@ + : "=&a" (oldval), "=&r" (ret), \ + "+m" (*uaddr), "=&r" (tem) \ + : "r" (oparg), "i" (-EFAULT), "1" (0)) ++#endif + +-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 
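
The signature change at this point also fixes the futex word's type from int __user * to u32 __user *: a futex is a fixed 32-bit object whatever the kernel's int happens to be. For orientation, the function begins by unpacking the packed operation word; a standalone decode of the two fields visible in the context lines (the low 24 bits carry the operation arguments and are omitted here), with a struct invented for the sketch:

    #include <stdint.h>

    struct futex_op_fields {
        int op;   /* FUTEX_OP_SET, FUTEX_OP_ADD, ... */
        int cmp;  /* FUTEX_OP_CMP_EQ, ...            */
    };

    static struct futex_op_fields futex_decode(uint32_t encoded_op)
    {
        struct futex_op_fields f = {
            .op  = (encoded_op >> 28) & 7,
            .cmp = (encoded_op >> 24) & 15,
        };
        return f;
    }

The __USER_DS loads threaded through the 32-bit variants are the UDEREF side of the patch: user memory is reached through a segment that cannot overlap kernel space.
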
++static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) + { + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; +@@ -61,11 +96,20 @@ static inline int futex_atomic_op_inuser + + switch (op) { + case FUTEX_OP_SET: ++#ifdef CONFIG_X86_32 ++ __futex_atomic_op1("xchgl %0, %%ds:%2", ret, oldval, uaddr, oparg); ++#else + __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg); ++#endif + break; + case FUTEX_OP_ADD: ++#ifdef CONFIG_X86_32 ++ __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %%ds:%2", ret, oldval, ++ uaddr, oparg); ++#else + __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval, + uaddr, oparg); ++#endif + break; + case FUTEX_OP_OR: + __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg); +@@ -109,7 +153,7 @@ static inline int futex_atomic_op_inuser + return ret; + } + +-static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, ++static inline int futex_atomic_cmpxchg_inatomic(u32 __user *uaddr, int oldval, + int newval) + { + +@@ -122,14 +166,27 @@ static inline int futex_atomic_cmpxchg_i + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + +- asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" ++ asm volatile( ++#ifdef CONFIG_X86_32 ++ "\tmovw %w5, %%ds\n" ++ "1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" ++ "2:\tpushl %%ss\n" ++ "\tpopl %%ds\n" ++ "\t.section .fixup, "ax"\n" ++#else ++ "1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" + "2:\t.section .fixup, "ax"\n" ++#endif + "3:\tmov %2, %0\n" + "\tjmp 2b\n" + "\t.previous\n" + _ASM_EXTABLE(1b, 3b) + : "=a" (oldval), "+m" (*uaddr) ++#ifdef CONFIG_X86_32 ++ : "i" (-EFAULT), "r" (newval), "0" (oldval), "r" (__USER_DS) ++#else + : "i" (-EFAULT), "r" (newval), "0" (oldval) ++#endif + : "memory" + ); + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/i387.h linux-2.6.33.1/arch/x86/include/asm/i387.h +--- linux-2.6.33.1/arch/x86/include/asm/i387.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/i387.h 2010-03-20 16:58:38.976510592 -0400 +@@ -197,13 +197,8 @@ static inline int fxrstor_checking(struc + } + + /* We need a safe address that is cheap to find and that is already +- in L1 during context switch. The best choices are unfortunately +- different for UP and SMP */ +-#ifdef CONFIG_SMP +-#define safe_address (__per_cpu_offset[0]) +-#else +-#define safe_address (kstat_cpu(0).cpustat.user) +-#endif ++ in L1 during context switch. */ ++#define safe_address (init_tss[smp_processor_id()].x86_tss.sp0) + + /* + * These must be called with preempt disabled +diff -urNp linux-2.6.33.1/arch/x86/include/asm/io_64.h linux-2.6.33.1/arch/x86/include/asm/io_64.h +--- linux-2.6.33.1/arch/x86/include/asm/io_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/io_64.h 2010-03-20 16:58:38.976510592 -0400 +@@ -140,6 +140,17 @@ __OUTS(l) + + #include <linux/vmalloc.h> + ++#define ARCH_HAS_VALID_PHYS_ADDR_RANGE ++static inline int valid_phys_addr_range(unsigned long addr, size_t count) ++{ ++ return ((addr + count + PAGE_SIZE - 1) >> PAGE_SHIFT) < (1 << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) ? 1 : 0; ++} ++ ++static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) ++{ ++ return (pfn + (count >> PAGE_SHIFT)) < (1 << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) ? 
1 : 0; ++} ++ + #include <asm-generic/iomap.h> + + void __memcpy_fromio(void *, unsigned long, unsigned); +diff -urNp linux-2.6.33.1/arch/x86/include/asm/iommu.h linux-2.6.33.1/arch/x86/include/asm/iommu.h +--- linux-2.6.33.1/arch/x86/include/asm/iommu.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/iommu.h 2010-03-20 16:58:38.976510592 -0400 +@@ -1,7 +1,7 @@ + #ifndef _ASM_X86_IOMMU_H + #define _ASM_X86_IOMMU_H + +-extern struct dma_map_ops nommu_dma_ops; ++extern const struct dma_map_ops nommu_dma_ops; + extern int force_iommu, no_iommu; + extern int iommu_detected; + extern int iommu_pass_through; +diff -urNp linux-2.6.33.1/arch/x86/include/asm/irqflags.h linux-2.6.33.1/arch/x86/include/asm/irqflags.h +--- linux-2.6.33.1/arch/x86/include/asm/irqflags.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/irqflags.h 2010-03-20 17:06:47.204705877 -0400 +@@ -142,10 +142,77 @@ static inline unsigned long __raw_local_ + sti; \ + sysexit + ++/* PaX: special register usage in entry_64.S, beware */ ++#ifdef CONFIG_PAX_KERNEXEC ++ .macro ljmpq sel, off ++ .byte 0x48; ljmp *1234f(%rip) ++ .pushsection .rodata ++ .align 16 ++ 1234: .quad \off; .word \sel ++ .popsection ++ .endm ++ ++#define PAX_EXIT_KERNEL \ ++ push %rsi; \ ++ mov %cs, %rsi; \ ++ cmp $__KERNEXEC_KERNEL_CS, %esi;\ ++ jnz 2f; \ ++ mov %cr0, %rsi; \ ++ btc $16, %rsi; \ ++ ljmpq __KERNEL_CS, 1f; \ ++1: mov %rsi, %cr0; \ ++2: pop %rsi ++ ++#define PAX_ENTER_KERNEL \ ++ push %rsi; \ ++ mov %cr0, %rsi; \ ++ bts $16, %rsi; \ ++ jnc 1f; \ ++ mov %cs, %esi; \ ++ cmp $__KERNEL_CS, %esi; \ ++ jz 3f; \ ++ ljmpq __KERNEL_CS, 3f; \ ++1: ljmpq __KERNEXEC_KERNEL_CS, 2f; \ ++2: mov %rsi, %cr0; \ ++3: pop %rsi ++#else ++#define PAX_EXIT_KERNEL ++#define PAX_ENTER_KERNEL ++#endif ++ + #else + #define INTERRUPT_RETURN iret + #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit + #define GET_CR0_INTO_EAX movl %cr0, %eax ++ ++/* PaX: special register usage in entry_32.S, beware */ ++#ifdef CONFIG_PAX_KERNEXEC ++#define PAX_EXIT_KERNEL \ ++ mov %cs, %esi; \ ++ cmp $__KERNEXEC_KERNEL_CS, %esi;\ ++ jnz 2f; \ ++ mov %cr0, %esi; \ ++ btc $16, %esi; \ ++ ljmp $__KERNEL_CS, $1f; \ ++1: mov %esi, %cr0; \ ++2: ++ ++#define PAX_ENTER_KERNEL \ ++ mov %cr0, %esi; \ ++ bts $16, %esi; \ ++ jnc 1f; \ ++ mov %cs, %esi; \ ++ cmp $__KERNEL_CS, %esi; \ ++ jz 3f; \ ++ ljmp $__KERNEL_CS, $3f; \ ++1: ljmp $__KERNEXEC_KERNEL_CS, $2f;\ ++2: mov %esi, %cr0; \ ++3: ++#else ++#define PAX_EXIT_KERNEL ++#define PAX_ENTER_KERNEL ++#endif ++ + #endif + + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/kvm_host.h linux-2.6.33.1/arch/x86/include/asm/kvm_host.h +--- linux-2.6.33.1/arch/x86/include/asm/kvm_host.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/kvm_host.h 2010-03-20 16:58:38.976510592 -0400 +@@ -536,7 +536,7 @@ struct kvm_x86_ops { + const struct trace_print_flags *exit_reasons_str; + }; + +-extern struct kvm_x86_ops *kvm_x86_ops; ++extern const struct kvm_x86_ops *kvm_x86_ops; + + int kvm_mmu_module_init(void); + void kvm_mmu_module_exit(void); +diff -urNp linux-2.6.33.1/arch/x86/include/asm/local.h linux-2.6.33.1/arch/x86/include/asm/local.h +--- linux-2.6.33.1/arch/x86/include/asm/local.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/local.h 2010-03-20 16:58:38.976510592 -0400 +@@ -18,26 +18,90 @@ typedef struct { + + static inline void local_inc(local_t *l) + { +- asm volatile(_ASM_INC "%0" ++ asm volatile(_ASM_INC "%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT 
++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_DEC "%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+m" (l->a.counter)); + } + + static inline void local_dec(local_t *l) + { +- asm volatile(_ASM_DEC "%0" ++ asm volatile(_ASM_DEC "%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_INC "%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+m" (l->a.counter)); + } + + static inline void local_add(long i, local_t *l) + { +- asm volatile(_ASM_ADD "%1,%0" ++ asm volatile(_ASM_ADD "%1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_SUB "%1,%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+m" (l->a.counter) + : "ir" (i)); + } + + static inline void local_sub(long i, local_t *l) + { +- asm volatile(_ASM_SUB "%1,%0" ++ asm volatile(_ASM_SUB "%1,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_ADD "%1,%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+m" (l->a.counter) + : "ir" (i)); + } +@@ -55,7 +119,24 @@ static inline int local_sub_and_test(lon + { + unsigned char c; + +- asm volatile(_ASM_SUB "%2,%0; sete %1" ++ asm volatile(_ASM_SUB "%2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_ADD "%2,%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "+m" (l->a.counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +@@ -73,7 +154,24 @@ static inline int local_dec_and_test(loc + { + unsigned char c; + +- asm volatile(_ASM_DEC "%0; sete %1" ++ asm volatile(_ASM_DEC "%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_INC "%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "+m" (l->a.counter), "=qm" (c) + : : "memory"); + return c != 0; +@@ -91,7 +189,24 @@ static inline int local_inc_and_test(loc + { + unsigned char c; + +- asm volatile(_ASM_INC "%0; sete %1" ++ asm volatile(_ASM_INC "%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_DEC "%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sete %1\n" + : "+m" (l->a.counter), "=qm" (c) + : : "memory"); + return c != 0; +@@ -110,7 +225,24 @@ static inline int local_add_negative(lon + { + unsigned char c; + +- asm volatile(_ASM_ADD "%2,%0; sets %1" ++ asm volatile(_ASM_ADD "%2,%0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_SUB "%2,%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "sets %1\n" + : "+m" (l->a.counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +@@ -133,7 +265,23 @@ static inline long local_add_return(long + #endif + /* 
Modern 486+ processor */ + __i = i; +- asm volatile(_ASM_XADD "%0, %1;" ++ asm volatile(_ASM_XADD "%0, %1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ _ASM_MOV "%0,%1\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+r" (i), "+m" (l->a.counter) + : : "memory"); + return i + __i; +diff -urNp linux-2.6.33.1/arch/x86/include/asm/microcode.h linux-2.6.33.1/arch/x86/include/asm/microcode.h +--- linux-2.6.33.1/arch/x86/include/asm/microcode.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/microcode.h 2010-03-20 16:58:38.976510592 -0400 +@@ -12,13 +12,13 @@ struct device; + enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; + + struct microcode_ops { +- enum ucode_state (*request_microcode_user) (int cpu, ++ enum ucode_state (* const request_microcode_user) (int cpu, + const void __user *buf, size_t size); + +- enum ucode_state (*request_microcode_fw) (int cpu, ++ enum ucode_state (* const request_microcode_fw) (int cpu, + struct device *device); + +- void (*microcode_fini_cpu) (int cpu); ++ void (* const microcode_fini_cpu) (int cpu); + + /* + * The generic 'microcode_core' part guarantees that +@@ -38,18 +38,18 @@ struct ucode_cpu_info { + extern struct ucode_cpu_info ucode_cpu_info[]; + + #ifdef CONFIG_MICROCODE_INTEL +-extern struct microcode_ops * __init init_intel_microcode(void); ++extern const struct microcode_ops * __init init_intel_microcode(void); + #else +-static inline struct microcode_ops * __init init_intel_microcode(void) ++static inline const struct microcode_ops * __init init_intel_microcode(void) + { + return NULL; + } + #endif /* CONFIG_MICROCODE_INTEL */ + + #ifdef CONFIG_MICROCODE_AMD +-extern struct microcode_ops * __init init_amd_microcode(void); ++extern const struct microcode_ops * __init init_amd_microcode(void); + #else +-static inline struct microcode_ops * __init init_amd_microcode(void) ++static inline const struct microcode_ops * __init init_amd_microcode(void) + { + return NULL; + } +diff -urNp linux-2.6.33.1/arch/x86/include/asm/mman.h linux-2.6.33.1/arch/x86/include/asm/mman.h +--- linux-2.6.33.1/arch/x86/include/asm/mman.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/mman.h 2010-03-20 16:58:38.976510592 -0400 +@@ -5,4 +5,14 @@ + + #include <asm-generic/mman.h> + ++#ifdef __KERNEL__ ++#ifndef __ASSEMBLY__ ++#ifdef CONFIG_X86_32 ++#define arch_mmap_check i386_mmap_check ++int i386_mmap_check(unsigned long addr, unsigned long len, ++ unsigned long flags); ++#endif ++#endif ++#endif ++ + #endif /* _ASM_X86_MMAN_H */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/mmu_context.h linux-2.6.33.1/arch/x86/include/asm/mmu_context.h +--- linux-2.6.33.1/arch/x86/include/asm/mmu_context.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/mmu_context.h 2010-03-20 16:58:38.976510592 -0400 +@@ -34,11 +34,17 @@ static inline void switch_mm(struct mm_s + struct task_struct *tsk) + { + unsigned cpu = smp_processor_id(); ++#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) ++ int tlbstate = TLBSTATE_OK; ++#endif + + if (likely(prev != next)) { + /* stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + #ifdef CONFIG_SMP ++#ifdef CONFIG_X86_32 ++ tlbstate = percpu_read(cpu_tlbstate.state); ++#endif + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); + #endif +@@ -52,6 
+58,26 @@ static inline void switch_mm(struct mm_s + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_LDT_nolock(&next->context); ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_SMP) ++ if (!(__supported_pte_mask & _PAGE_NX)) { ++ smp_mb__before_clear_bit(); ++ cpu_clear(cpu, prev->context.cpu_user_cs_mask); ++ smp_mb__after_clear_bit(); ++ cpu_set(cpu, next->context.cpu_user_cs_mask); ++ } ++#endif ++ ++#if defined(CONFIG_X86_32) && (defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)) ++ if (unlikely(prev->context.user_cs_base != next->context.user_cs_base || ++ prev->context.user_cs_limit != next->context.user_cs_limit)) ++ set_user_cs(next->context.user_cs_base, next->context.user_cs_limit, cpu); ++#ifdef CONFIG_SMP ++ else if (unlikely(tlbstate != TLBSTATE_OK)) ++ set_user_cs(next->context.user_cs_base, next->context.user_cs_limit, cpu); ++#endif ++#endif ++ + } + #ifdef CONFIG_SMP + else { +@@ -65,6 +91,19 @@ static inline void switch_mm(struct mm_s + */ + load_cr3(next->pgd); + load_LDT_nolock(&next->context); ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) ++ if (!(__supported_pte_mask & _PAGE_NX)) ++ cpu_set(cpu, next->context.cpu_user_cs_mask); ++#endif ++ ++#if defined(CONFIG_X86_32) && (defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)) ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (!((next->pax_flags & MF_PAX_PAGEEXEC) && (__supported_pte_mask & _PAGE_NX))) ++#endif ++ set_user_cs(next->context.user_cs_base, next->context.user_cs_limit, cpu); ++#endif ++ + } + } + #endif +diff -urNp linux-2.6.33.1/arch/x86/include/asm/mmu.h linux-2.6.33.1/arch/x86/include/asm/mmu.h +--- linux-2.6.33.1/arch/x86/include/asm/mmu.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/mmu.h 2010-03-20 16:58:38.976510592 -0400 +@@ -9,10 +9,23 @@ + * we put the segment information here. 
+ */ + typedef struct { +- void *ldt; ++ struct desc_struct *ldt; + int size; + struct mutex lock; +- void *vdso; ++ unsigned long vdso; ++ ++#ifdef CONFIG_X86_32 ++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) ++ unsigned long user_cs_base; ++ unsigned long user_cs_limit; ++ ++#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_SMP) ++ cpumask_t cpu_user_cs_mask; ++#endif ++ ++#endif ++#endif ++ + } mm_context_t; + + #ifdef CONFIG_SMP +diff -urNp linux-2.6.33.1/arch/x86/include/asm/module.h linux-2.6.33.1/arch/x86/include/asm/module.h +--- linux-2.6.33.1/arch/x86/include/asm/module.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/module.h 2010-03-20 16:58:38.976510592 -0400 +@@ -65,7 +65,12 @@ + # else + # define MODULE_STACKSIZE "" + # endif +-# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE ++# ifdef CONFIG_GRKERNSEC ++# define MODULE_GRSEC "GRSECURITY " ++# else ++# define MODULE_GRSEC "" ++# endif ++# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE MODULE_GRSEC + #endif + + #endif /* _ASM_X86_MODULE_H */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/page_32_types.h linux-2.6.33.1/arch/x86/include/asm/page_32_types.h +--- linux-2.6.33.1/arch/x86/include/asm/page_32_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/page_32_types.h 2010-03-20 16:58:38.976510592 -0400 +@@ -15,6 +15,10 @@ + */ + #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) + ++#ifdef CONFIG_PAX_PAGEEXEC ++#define CONFIG_ARCH_TRACK_EXEC_LIMIT 1 ++#endif ++ + #ifdef CONFIG_4KSTACKS + #define THREAD_ORDER 0 + #else +diff -urNp linux-2.6.33.1/arch/x86/include/asm/page_64_types.h linux-2.6.33.1/arch/x86/include/asm/page_64_types.h +--- linux-2.6.33.1/arch/x86/include/asm/page_64_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/page_64_types.h 2010-03-20 16:58:38.980670389 -0400 +@@ -39,6 +39,9 @@ + #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) + #define __START_KERNEL_map _AC(0xffffffff80000000, UL) + ++#define ktla_ktva(addr) (addr) ++#define ktva_ktla(addr) (addr) ++ + /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ + #define __PHYSICAL_MASK_SHIFT 46 + #define __VIRTUAL_MASK_SHIFT 47 +diff -urNp linux-2.6.33.1/arch/x86/include/asm/paravirt.h linux-2.6.33.1/arch/x86/include/asm/paravirt.h +--- linux-2.6.33.1/arch/x86/include/asm/paravirt.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/paravirt.h 2010-03-20 16:58:38.980670389 -0400 +@@ -729,6 +729,21 @@ static inline void __set_fixmap(unsigned + pv_mmu_ops.set_fixmap(idx, phys, flags); + } + ++#ifdef CONFIG_PAX_KERNEXEC ++static inline unsigned long pax_open_kernel(void) ++{ ++ return pv_mmu_ops.pax_open_kernel(); ++} ++ ++static inline unsigned long pax_close_kernel(void) ++{ ++ return pv_mmu_ops.pax_close_kernel(); ++} ++#else ++static inline unsigned long pax_open_kernel(void) { return 0; } ++static inline unsigned long pax_close_kernel(void) { return 0; } ++#endif ++ + #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) + + static inline int arch_spin_is_locked(struct arch_spinlock *lock) +@@ -945,7 +960,7 @@ extern void default_banner(void); + + #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) + #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) +-#define PARA_INDIRECT(addr) *%cs:addr ++#define PARA_INDIRECT(addr) *%ss:addr + #endif + + #define INTERRUPT_RETURN \ +@@ -980,6 +995,34 @@ extern void default_banner(void); + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) + ++#ifdef CONFIG_PAX_KERNEXEC ++#define PAX_EXIT_KERNEL \ ++ push %eax; push %ecx; \ ++ mov %cs, %eax; \ ++ cmp $__KERNEXEC_KERNEL_CS, %eax; \ ++ jnz 2f; \ ++ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ ++ btc $16, %eax; \ ++ ljmp $__KERNEL_CS, $1f; \ ++1: call PARA_INDIRECT(pv_cpu_ops+PV_CPU_write_cr0);\ ++2: pop %ecx; pop %eax; \ ++ ++#define PAX_ENTER_KERNEL \ ++ push %eax; push %ecx; \ ++ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ ++ bts $16, %eax; \ ++ jnc 1f; \ ++ mov %cs, %ecx; \ ++ cmp $__KERNEL_CS, %ecx; \ ++ jz 3f; \ ++ ljmp $__KERNEL_CS, $3f; \ ++1: ljmp $__KERNEXEC_KERNEL_CS, $2f; \ ++2: call PARA_INDIRECT(pv_cpu_ops+PV_CPU_write_cr0);\ ++3: pop %ecx; pop %eax; ++#else ++#define PAX_EXIT_KERNEL ++#define PAX_ENTER_KERNEL ++#endif + + #else /* !CONFIG_X86_32 */ + +@@ -1022,6 +1065,46 @@ extern void default_banner(void); + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ .macro ljmpq sel, off ++ .byte 0x48; ljmp *1234f(%rip) ++ .pushsection .rodata ++ .align 16 ++ 1234: .quad \off; .word \sel ++ .popsection ++ .endm ++ ++#define PAX_EXIT_KERNEL \ ++ PV_SAVE_REGS(CLBR_NONE); \ ++ mov %cs, %rax; \ ++ cmp $__KERNEXEC_KERNEL_CS, %eax; \ ++ jnz 2f; \ ++ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ ++ btc $16, %rax; \ ++ mov %rax, %rdi; \ ++ ljmpq __KERNEL_CS, 1f; \ ++1: call PARA_INDIRECT(pv_cpu_ops+PV_CPU_write_cr0);\ ++2: PV_RESTORE_REGS(CLBR_NONE); ++ ++#define PAX_ENTER_KERNEL \ ++ PV_SAVE_REGS(CLBR_NONE); \ ++ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ ++ bts $16, %rax; \ ++ jnc 1f; \ ++ mov %cs, %rax; \ ++ cmp $__KERNEL_CS, %eax; \ ++ jz 3f; \ ++ ljmpq __KERNEL_CS, 3f; \ ++1: mov %rax, %rdi; \ ++ ljmpq __KERNEXEC_KERNEL_CS, 2f; \ ++2: call PARA_INDIRECT(pv_cpu_ops+PV_CPU_write_cr0);\ ++3: PV_RESTORE_REGS(CLBR_NONE); ++#else ++#define PAX_EXIT_KERNEL ++#define PAX_ENTER_KERNEL ++#endif ++ + #endif /* CONFIG_X86_32 */ + + #endif /* __ASSEMBLY__ */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/paravirt_types.h 
linux-2.6.33.1/arch/x86/include/asm/paravirt_types.h +--- linux-2.6.33.1/arch/x86/include/asm/paravirt_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/paravirt_types.h 2010-03-20 16:58:38.980670389 -0400 +@@ -316,6 +316,12 @@ struct pv_mmu_ops { + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags); ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ unsigned long (*pax_open_kernel)(void); ++ unsigned long (*pax_close_kernel)(void); ++#endif ++ + }; + + struct arch_spinlock; +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pci_x86.h linux-2.6.33.1/arch/x86/include/asm/pci_x86.h +--- linux-2.6.33.1/arch/x86/include/asm/pci_x86.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pci_x86.h 2010-03-20 16:58:38.980670389 -0400 +@@ -89,16 +89,16 @@ extern int (*pcibios_enable_irq)(struct + extern void (*pcibios_disable_irq)(struct pci_dev *dev); + + struct pci_raw_ops { +- int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, ++ int (* const read)(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 *val); +- int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn, ++ int (* const write)(unsigned int domain, unsigned int bus, unsigned int devfn, + int reg, int len, u32 val); + }; + +-extern struct pci_raw_ops *raw_pci_ops; +-extern struct pci_raw_ops *raw_pci_ext_ops; ++extern const struct pci_raw_ops *raw_pci_ops; ++extern const struct pci_raw_ops *raw_pci_ext_ops; + +-extern struct pci_raw_ops pci_direct_conf1; ++extern const struct pci_raw_ops pci_direct_conf1; + extern bool port_cf9_safe; + + /* arch_initcall level */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgalloc.h linux-2.6.33.1/arch/x86/include/asm/pgalloc.h +--- linux-2.6.33.1/arch/x86/include/asm/pgalloc.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgalloc.h 2010-03-20 16:58:38.980670389 -0400 +@@ -63,6 +63,13 @@ static inline void pmd_populate_kernel(s + pmd_t *pmd, pte_t *pte) + { + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); ++ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); ++} ++ ++static inline void pmd_populate_user(struct mm_struct *mm, ++ pmd_t *pmd, pte_t *pte) ++{ ++ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + } + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgtable-2level.h linux-2.6.33.1/arch/x86/include/asm/pgtable-2level.h +--- linux-2.6.33.1/arch/x86/include/asm/pgtable-2level.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgtable-2level.h 2010-03-20 16:58:38.980670389 -0400 +@@ -18,7 +18,9 @@ static inline void native_set_pte(pte_t + + static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) + { ++ pax_open_kernel(); + *pmdp = pmd; ++ pax_close_kernel(); + } + + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgtable_32.h linux-2.6.33.1/arch/x86/include/asm/pgtable_32.h +--- linux-2.6.33.1/arch/x86/include/asm/pgtable_32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgtable_32.h 2010-03-20 16:58:38.980670389 -0400 +@@ -26,8 +26,6 @@ + struct mm_struct; + struct vm_area_struct; + +-extern pgd_t swapper_pg_dir[1024]; +- + static inline void pgtable_cache_init(void) { } + static inline void check_pgt_cache(void) { } + void paging_init(void); +@@ -48,6 +46,11 @@ extern void 
set_pmd_pfn(unsigned long, u + # include <asm/pgtable-2level.h> + #endif + ++extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; ++#ifdef CONFIG_X86_PAE ++extern pmd_t swapper_pm_dir[PTRS_PER_PGD][PTRS_PER_PMD]; ++#endif ++ + #if defined(CONFIG_HIGHPTE) + #define __KM_PTE \ + (in_nmi() ? KM_NMI_PTE : \ +@@ -72,7 +75,9 @@ extern void set_pmd_pfn(unsigned long, u + /* Clear a kernel PTE and flush it from the TLB */ + #define kpte_clear_flush(ptep, vaddr) \ + do { \ ++ pax_open_kernel(); \ + pte_clear(&init_mm, (vaddr), (ptep)); \ ++ pax_close_kernel(); \ + __flush_tlb_one((vaddr)); \ + } while (0) + +@@ -84,6 +89,9 @@ do { \ + + #endif /* !__ASSEMBLY__ */ + ++#define HAVE_ARCH_UNMAPPED_AREA ++#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN ++ + /* + * kern_addr_valid() is (1) for FLATMEM and (0) for + * SPARSEMEM and DISCONTIGMEM +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgtable_32_types.h linux-2.6.33.1/arch/x86/include/asm/pgtable_32_types.h +--- linux-2.6.33.1/arch/x86/include/asm/pgtable_32_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgtable_32_types.h 2010-03-20 16:58:38.980670389 -0400 +@@ -8,7 +8,7 @@ + */ + #ifdef CONFIG_X86_PAE + # include <asm/pgtable-3level_types.h> +-# define PMD_SIZE (1UL << PMD_SHIFT) ++# define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) + # define PMD_MASK (~(PMD_SIZE - 1)) + #else + # include <asm/pgtable-2level_types.h> +@@ -46,6 +46,19 @@ extern bool __vmalloc_start_set; /* set + # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) + #endif + ++#ifdef CONFIG_PAX_KERNEXEC ++#ifndef __ASSEMBLY__ ++extern unsigned char MODULES_EXEC_VADDR[]; ++extern unsigned char MODULES_EXEC_END[]; ++#endif ++#include <asm/boot.h> ++#define ktla_ktva(addr) (addr + LOAD_PHYSICAL_ADDR + PAGE_OFFSET) ++#define ktva_ktla(addr) (addr - LOAD_PHYSICAL_ADDR - PAGE_OFFSET) ++#else ++#define ktla_ktva(addr) (addr) ++#define ktva_ktla(addr) (addr) ++#endif ++ + #define MODULES_VADDR VMALLOC_START + #define MODULES_END VMALLOC_END + #define MODULES_LEN (MODULES_VADDR - MODULES_END) +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgtable-3level.h linux-2.6.33.1/arch/x86/include/asm/pgtable-3level.h +--- linux-2.6.33.1/arch/x86/include/asm/pgtable-3level.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgtable-3level.h 2010-03-20 16:58:38.980670389 -0400 +@@ -38,12 +38,16 @@ static inline void native_set_pte_atomic + + static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) + { ++ pax_open_kernel(); + set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd)); ++ pax_close_kernel(); + } + + static inline void native_set_pud(pud_t *pudp, pud_t pud) + { ++ pax_open_kernel(); + set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); ++ pax_close_kernel(); + } + + /* +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgtable_64.h linux-2.6.33.1/arch/x86/include/asm/pgtable_64.h +--- linux-2.6.33.1/arch/x86/include/asm/pgtable_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgtable_64.h 2010-03-20 16:58:38.980670389 -0400 +@@ -16,9 +16,12 @@ + + extern pud_t level3_kernel_pgt[512]; + extern pud_t level3_ident_pgt[512]; ++extern pud_t level3_vmalloc_pgt[512]; ++extern pud_t level3_vmemmap_pgt[512]; ++extern pud_t level2_vmemmap_pgt[512]; + extern pmd_t level2_kernel_pgt[512]; + extern pmd_t level2_fixmap_pgt[512]; +-extern pmd_t level2_ident_pgt[512]; ++extern pmd_t level2_ident_pgt[512*2]; + extern pgd_t init_level4_pgt[]; + + #define swapper_pg_dir init_level4_pgt +@@ -74,7 +77,9 @@ static inline pte_t 
native_ptep_get_and_ + + static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) + { ++ pax_open_kernel(); + *pmdp = pmd; ++ pax_close_kernel(); + } + + static inline void native_pmd_clear(pmd_t *pmd) +@@ -94,7 +99,9 @@ static inline void native_pud_clear(pud_ + + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { ++ pax_open_kernel(); + *pgdp = pgd; ++ pax_close_kernel(); + } + + static inline void native_pgd_clear(pgd_t *pgd) +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgtable.h linux-2.6.33.1/arch/x86/include/asm/pgtable.h +--- linux-2.6.33.1/arch/x86/include/asm/pgtable.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgtable.h 2010-03-20 16:58:38.980670389 -0400 +@@ -76,12 +76,51 @@ extern struct list_head pgd_list; + + #define arch_end_context_switch(prev) do {} while(0) + ++#define pax_open_kernel() native_pax_open_kernel() ++#define pax_close_kernel() native_pax_close_kernel() + #endif /* CONFIG_PARAVIRT */ + ++#define __HAVE_ARCH_PAX_OPEN_KERNEL ++#define __HAVE_ARCH_PAX_CLOSE_KERNEL ++ ++#ifdef CONFIG_PAX_KERNEXEC ++static inline unsigned long native_pax_open_kernel(void) ++{ ++ unsigned long cr0; ++ ++ preempt_disable(); ++ barrier(); ++ cr0 = read_cr0() ^ X86_CR0_WP; ++ BUG_ON(unlikely(cr0 & X86_CR0_WP)); ++ write_cr0(cr0); ++ return cr0 ^ X86_CR0_WP; ++} ++ ++static inline unsigned long native_pax_close_kernel(void) ++{ ++ unsigned long cr0; ++ ++ cr0 = read_cr0() ^ X86_CR0_WP; ++ BUG_ON(unlikely(!(cr0 & X86_CR0_WP))); ++ write_cr0(cr0); ++ barrier(); ++ preempt_enable_no_resched(); ++ return cr0 ^ X86_CR0_WP; ++} ++#else ++static inline unsigned long native_pax_open_kernel(void) { return 0; } ++static inline unsigned long native_pax_close_kernel(void) { return 0; } ++#endif ++ + /* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ ++static inline int pte_user(pte_t pte) ++{ ++ return pte_val(pte) & _PAGE_USER; ++} ++ + static inline int pte_dirty(pte_t pte) + { + return pte_flags(pte) & _PAGE_DIRTY; +@@ -169,9 +208,29 @@ static inline pte_t pte_wrprotect(pte_t + return pte_clear_flags(pte, _PAGE_RW); + } + ++static inline pte_t pte_mkread(pte_t pte) ++{ ++ return __pte(pte_val(pte) | _PAGE_USER); ++} ++ + static inline pte_t pte_mkexec(pte_t pte) + { +- return pte_clear_flags(pte, _PAGE_NX); ++#ifdef CONFIG_X86_PAE ++ if (__supported_pte_mask & _PAGE_NX) ++ return pte_clear_flags(pte, _PAGE_NX); ++ else ++#endif ++ return pte_set_flags(pte, _PAGE_USER); ++} ++ ++static inline pte_t pte_exprotect(pte_t pte) ++{ ++#ifdef CONFIG_X86_PAE ++ if (__supported_pte_mask & _PAGE_NX) ++ return pte_set_flags(pte, _PAGE_NX); ++ else ++#endif ++ return pte_clear_flags(pte, _PAGE_USER); + } + + static inline pte_t pte_mkdirty(pte_t pte) +@@ -474,7 +533,7 @@ static inline pud_t *pud_offset(pgd_t *p + + static inline int pgd_bad(pgd_t pgd) + { +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ return (pgd_flags(pgd) & ~(_PAGE_USER | _PAGE_NX)) != _KERNPG_TABLE; + } + + static inline int pgd_none(pgd_t pgd) +@@ -613,9 +672,12 @@ static inline void ptep_set_wrprotect(st + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. 
+ */ +-static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) ++static inline void clone_pgd_range(pgd_t *dst, const pgd_t *src, int count) + { +- memcpy(dst, src, count * sizeof(pgd_t)); ++ pax_open_kernel(); ++ while (count--) ++ *dst++ = *src++; ++ pax_close_kernel(); + } + + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/pgtable_types.h linux-2.6.33.1/arch/x86/include/asm/pgtable_types.h +--- linux-2.6.33.1/arch/x86/include/asm/pgtable_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/pgtable_types.h 2010-03-20 16:58:38.980670389 -0400 +@@ -16,12 +16,11 @@ + #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ + #define _PAGE_BIT_PAT 7 /* on 4KB pages */ + #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +-#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ ++#define _PAGE_BIT_SPECIAL 9 /* special mappings, no associated struct page */ + #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ + #define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ + #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +-#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +-#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 ++#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SPECIAL + #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + + /* If _PAGE_BIT_PRESENT is clear, we use these: */ +@@ -39,7 +38,6 @@ + #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) + #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) + #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +-#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) + #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) + #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) + #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +@@ -55,8 +53,10 @@ + + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +-#else ++#elif defined(CONFIG_KMEMCHECK) + #define _PAGE_NX (_AT(pteval_t, 0)) ++#else ++#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) + #endif + + #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +@@ -93,6 +93,9 @@ + #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) + ++#define PAGE_READONLY_NOEXEC PAGE_READONLY ++#define PAGE_SHARED_NOEXEC PAGE_SHARED ++ + #define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) + #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) +@@ -103,8 +106,8 @@ + #define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) + #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) + #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +-#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) ++#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RO | _PAGE_USER) ++#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_RO | _PAGE_PCD | _PAGE_PWT | _PAGE_USER) + #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) + #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) + #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +@@ -163,8 +166,8 @@ + * bits are combined, this will alow user to access the high address mapped + * VDSO in the presence of CONFIG_COMPAT_VDSO + */ +-#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +-#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ ++#define PTE_IDENT_ATTR 0x063 /* 
PRESENT+RW+DIRTY+ACCESSED */ ++#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ + #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ + #endif + +@@ -278,7 +281,6 @@ typedef struct page *pgtable_t; + + extern pteval_t __supported_pte_mask; + extern void set_nx(void); +-extern int nx_enabled; + + #define pgprot_writecombine pgprot_writecombine + extern pgprot_t pgprot_writecombine(pgprot_t prot); +diff -urNp linux-2.6.33.1/arch/x86/include/asm/processor.h linux-2.6.33.1/arch/x86/include/asm/processor.h +--- linux-2.6.33.1/arch/x86/include/asm/processor.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/processor.h 2010-03-20 16:58:38.980670389 -0400 +@@ -273,7 +273,7 @@ struct tss_struct { + + } ____cacheline_aligned; + +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); ++extern struct tss_struct init_tss[NR_CPUS]; + + /* + * Save the original ist values for checking stack pointers during debugging +@@ -913,8 +913,15 @@ static inline void spin_lock_prefetch(co + */ + #define TASK_SIZE PAGE_OFFSET + #define TASK_SIZE_MAX TASK_SIZE ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++#define SEGMEXEC_TASK_SIZE (TASK_SIZE / 2) ++#define STACK_TOP ((current->mm->pax_flags & MF_PAX_SEGMEXEC)?SEGMEXEC_TASK_SIZE:TASK_SIZE) ++#else + #define STACK_TOP TASK_SIZE +-#define STACK_TOP_MAX STACK_TOP ++#endif ++ ++#define STACK_TOP_MAX TASK_SIZE + + #define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ +@@ -931,7 +938,7 @@ static inline void spin_lock_prefetch(co + */ + #define INIT_TSS { \ + .x86_tss = { \ +- .sp0 = sizeof(init_stack) + (long)&init_stack, \ ++ .sp0 = sizeof(init_stack) + (long)&init_stack - 8, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ +@@ -942,11 +949,7 @@ static inline void spin_lock_prefetch(co + extern unsigned long thread_saved_pc(struct task_struct *tsk); + + #define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +-#define KSTK_TOP(info) \ +-({ \ +- unsigned long *__ptr = (unsigned long *)(info); \ +- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +-}) ++#define KSTK_TOP(info) ((info)->task.thread.sp0) + + /* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. +@@ -961,7 +964,7 @@ extern unsigned long thread_saved_pc(str + #define task_pt_regs(task) \ + ({ \ + struct pt_regs *__regs__; \ +- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ ++ __regs__ = (struct pt_regs *)((task)->thread.sp0); \ + __regs__ - 1; \ + }) + +@@ -977,7 +980,7 @@ extern unsigned long thread_saved_pc(str + * space during mmap's. + */ + #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ +- 0xc0000000 : 0xFFFFe000) ++ 0xc0000000 : 0xFFFFf000) + + #define TASK_SIZE (test_thread_flag(TIF_IA32) ? 
\ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +@@ -1014,6 +1017,10 @@ extern void start_thread(struct pt_regs + */ + #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + ++#ifdef CONFIG_PAX_SEGMEXEC ++#define SEGMEXEC_TASK_UNMAPPED_BASE (PAGE_ALIGN(SEGMEXEC_TASK_SIZE / 3)) ++#endif ++ + #define KSTK_EIP(task) (task_pt_regs(task)->ip) + + /* Get/set a process' ability to use the timestamp counter instruction */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/ptrace.h linux-2.6.33.1/arch/x86/include/asm/ptrace.h +--- linux-2.6.33.1/arch/x86/include/asm/ptrace.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/ptrace.h 2010-03-20 16:58:38.984539447 -0400 +@@ -152,28 +152,29 @@ static inline unsigned long regs_return_ + } + + /* +- * user_mode_vm(regs) determines whether a register set came from user mode. ++ * user_mode(regs) determines whether a register set came from user mode. + * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. This tricky test checks that with + * one comparison. Many places in the kernel can bypass this full check +- * if they have already ruled out V8086 mode, so user_mode(regs) can be used. ++ * if they have already ruled out V8086 mode, so user_mode_novm(regs) can ++ * be used. + */ +-static inline int user_mode(struct pt_regs *regs) ++static inline int user_mode_novm(struct pt_regs *regs) + { + #ifdef CONFIG_X86_32 + return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; + #else +- return !!(regs->cs & 3); ++ return !!(regs->cs & SEGMENT_RPL_MASK); + #endif + } + +-static inline int user_mode_vm(struct pt_regs *regs) ++static inline int user_mode(struct pt_regs *regs) + { + #ifdef CONFIG_X86_32 + return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= + USER_RPL; + #else +- return user_mode(regs); ++ return user_mode_novm(regs); + #endif + } + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/reboot.h linux-2.6.33.1/arch/x86/include/asm/reboot.h +--- linux-2.6.33.1/arch/x86/include/asm/reboot.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/reboot.h 2010-03-20 16:58:38.984539447 -0400 +@@ -18,7 +18,7 @@ extern struct machine_ops machine_ops; + + void native_machine_crash_shutdown(struct pt_regs *regs); + void native_machine_shutdown(void); +-void machine_real_restart(const unsigned char *code, int length); ++void machine_real_restart(const unsigned char *code, unsigned int length); + + typedef void (*nmi_shootdown_cb)(int, struct die_args*); + void nmi_shootdown_cpus(nmi_shootdown_cb callback); +diff -urNp linux-2.6.33.1/arch/x86/include/asm/rwsem.h linux-2.6.33.1/arch/x86/include/asm/rwsem.h +--- linux-2.6.33.1/arch/x86/include/asm/rwsem.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/rwsem.h 2010-03-20 16:58:38.984539447 -0400 +@@ -106,10 +106,26 @@ static inline void __down_read(struct rw + { + asm volatile("# beginning down_read\n\t" + LOCK_PREFIX " incl (%%eax)\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "decl (%%eax)\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + /* adds 0x00000001, returns the old value */ +- " jns 1f\n" ++ " jns 2f\n" + " call call_rwsem_down_read_failed\n" +- "1:\n\t" ++ "2:\n\t" + "# ending down_read\n\t" + : "+m" (sem->count) + : "a" (sem) +@@ -124,13 +140,29 @@ static inline int __down_read_trylock(st + __s32 result, tmp; 
+ asm volatile("# beginning __down_read_trylock\n\t" + " movl %0,%1\n\t" +- "1:\n\t" ++ "2:\n\t" + " movl %1,%2\n\t" + " addl %3,%2\n\t" +- " jle 2f\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ "subl %3,%2\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ " jle 3f\n\t" + LOCK_PREFIX " cmpxchgl %2,%0\n\t" +- " jnz 1b\n\t" +- "2:\n\t" ++ " jnz 2b\n\t" ++ "3:\n\t" + "# ending __down_read_trylock\n\t" + : "+m" (sem->count), "=&a" (result), "=&r" (tmp) + : "i" (RWSEM_ACTIVE_READ_BIAS) +@@ -148,12 +180,28 @@ static inline void __down_write_nested(s + tmp = RWSEM_ACTIVE_WRITE_BIAS; + asm volatile("# beginning down_write\n\t" + LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ "movl %%edx,(%%eax)\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + /* subtract 0x0000ffff, returns the old value */ + " testl %%edx,%%edx\n\t" + /* was the count 0 before? */ +- " jz 1f\n" ++ " jz 2f\n" + " call call_rwsem_down_write_failed\n" +- "1:\n" ++ "2:\n" + "# ending down_write" + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (tmp) +@@ -186,10 +234,26 @@ static inline void __up_read(struct rw_s + __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + asm volatile("# beginning __up_read\n\t" + LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ "movl %%edx,(%%eax)\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + /* subtracts 1, returns the old value */ +- " jns 1f\n\t" ++ " jns 2f\n\t" + " call call_rwsem_wake\n" +- "1:\n" ++ "2:\n" + "# ending __up_read\n" + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (tmp) +@@ -204,11 +268,27 @@ static inline void __up_write(struct rw_ + asm volatile("# beginning __up_write\n\t" + " movl %2,%%edx\n\t" + LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ "movl %%edx,(%%eax)\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + /* tries to transition + 0xffff0001 -> 0x00000000 */ +- " jz 1f\n" ++ " jz 2f\n" + " call call_rwsem_wake\n" +- "1:\n\t" ++ "2:\n\t" + "# ending __up_write\n" + : "+m" (sem->count) + : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) +@@ -222,10 +302,26 @@ static inline void __downgrade_write(str + { + asm volatile("# beginning __downgrade_write\n\t" + LOCK_PREFIX " addl %2,(%%eax)\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "subl %2,(%%eax)\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ +- " jns 1f\n\t" ++ " jns 2f\n\t" + " call call_rwsem_downgrade_wake\n" +- "1:\n\t" ++ "2:\n\t" + "# ending __downgrade_write\n" + : "+m" (sem->count) + : "a" (sem), "i" (-RWSEM_WAITING_BIAS) +@@ -237,7 +333,23 @@ static inline void __downgrade_write(str + */ + static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) + { +- asm volatile(LOCK_PREFIX "addl %1,%0" ++ asm volatile(LOCK_PREFIX "addl %1,%0\n" 
++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "subl %1,%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+m" (sem->count) + : "ir" (delta)); + } +@@ -249,7 +361,23 @@ static inline int rwsem_atomic_update(in + { + int tmp = delta; + +- asm volatile(LOCK_PREFIX "xadd %0,%1" ++ asm volatile(LOCK_PREFIX "xadd %0,%1\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ "movl %0,%1\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+r" (tmp), "+m" (sem->count) + : : "memory"); + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/segment.h linux-2.6.33.1/arch/x86/include/asm/segment.h +--- linux-2.6.33.1/arch/x86/include/asm/segment.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/segment.h 2010-03-20 16:58:38.984539447 -0400 +@@ -62,8 +62,8 @@ + * 26 - ESPFIX small SS + * 27 - per-cpu [ offset to per-cpu data area ] + * 28 - stack_canary-20 [ for stack protector ] +- * 29 - unused +- * 30 - unused ++ * 29 - PCI BIOS CS ++ * 30 - PCI BIOS DS + * 31 - TSS for double fault handler + */ + #define GDT_ENTRY_TLS_MIN 6 +@@ -77,6 +77,8 @@ + + #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) + ++#define GDT_ENTRY_KERNEXEC_KERNEL_CS (4) ++ + #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) + + #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) +@@ -88,7 +90,7 @@ + #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) + #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) + +-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) ++#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) + #ifdef CONFIG_SMP + #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) + #else +@@ -102,6 +104,12 @@ + #define __KERNEL_STACK_CANARY 0 + #endif + ++#define GDT_ENTRY_PCIBIOS_CS (GDT_ENTRY_KERNEL_BASE + 17) ++#define __PCIBIOS_CS (GDT_ENTRY_PCIBIOS_CS * 8) ++ ++#define GDT_ENTRY_PCIBIOS_DS (GDT_ENTRY_KERNEL_BASE + 18) ++#define __PCIBIOS_DS (GDT_ENTRY_PCIBIOS_DS * 8) ++ + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 + + /* +@@ -139,7 +147,7 @@ + */ + + /* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ +-#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) ++#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xFFFCU) == PNP_CS32 || ((x) & 0xFFFCU) == PNP_CS16) + + + #else +@@ -163,6 +171,8 @@ + #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3) + #define __USER32_DS __USER_DS + ++#define GDT_ENTRY_KERNEXEC_KERNEL_CS 7 ++ + #define GDT_ENTRY_TSS 8 /* needs two entries */ + #define GDT_ENTRY_LDT 10 /* needs two entries */ + #define GDT_ENTRY_TLS_MIN 12 +@@ -183,6 +193,7 @@ + #endif + + #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) ++#define __KERNEXEC_KERNEL_CS (GDT_ENTRY_KERNEXEC_KERNEL_CS * 8) + #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) + #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) + #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) +diff -urNp linux-2.6.33.1/arch/x86/include/asm/spinlock.h linux-2.6.33.1/arch/x86/include/asm/spinlock.h +--- linux-2.6.33.1/arch/x86/include/asm/spinlock.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/spinlock.h 2010-03-20 16:58:38.984539447 -0400 +@@ -249,18 +249,50 @@ static inline int arch_write_can_lock(ar + static inline void arch_read_lock(arch_rwlock_t *rw) + { + asm volatile(LOCK_PREFIX " 
subl $1,(%0)\n\t" +- "jns 1f\n" +- "call __read_lock_failed\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" + "1:\n" ++ LOCK_PREFIX " addl $1,(%0)\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "jns 2f\n" ++ "call __read_lock_failed\n\t" ++ "2:\n" + ::LOCK_PTR_REG (rw) : "memory"); + } + + static inline void arch_write_lock(arch_rwlock_t *rw) + { + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" +- "jz 1f\n" +- "call __write_lock_failed\n\t" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" + "1:\n" ++ LOCK_PREFIX " addl %1,(%0)\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ "jz 2f\n" ++ "call __write_lock_failed\n\t" ++ "2:\n" + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); + } + +@@ -286,12 +318,45 @@ static inline int arch_write_trylock(arc + + static inline void arch_read_unlock(arch_rwlock_t *rw) + { +- asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); ++ asm volatile(LOCK_PREFIX "incl %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "decl %0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ ++ :"+m" (rw->lock) : : "memory"); + } + + static inline void arch_write_unlock(arch_rwlock_t *rw) + { +- asm volatile(LOCK_PREFIX "addl %1, %0" ++ asm volatile(LOCK_PREFIX "addl %1, %0\n" ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#ifdef CONFIG_X86_32 ++ "into\n0:\n" ++#else ++ "jno 0f\n" ++ "int $4\n0:\n" ++#endif ++ ".pushsection .fixup,"ax"\n" ++ "1:\n" ++ LOCK_PREFIX "subl %1,%0\n" ++ "jmp 0b\n" ++ ".popsection\n" ++ _ASM_EXTABLE(0b, 1b) ++#endif ++ + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); + } + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/system.h linux-2.6.33.1/arch/x86/include/asm/system.h +--- linux-2.6.33.1/arch/x86/include/asm/system.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/system.h 2010-03-20 16:58:38.984539447 -0400 +@@ -202,7 +202,7 @@ static inline unsigned long get_limit(un + { + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); +- return __limit + 1; ++ return __limit; + } + + static inline void native_clts(void) +@@ -342,7 +342,7 @@ void enable_hlt(void); + + void cpu_idle_wait(void); + +-extern unsigned long arch_align_stack(unsigned long sp); ++#define arch_align_stack(x) ((x) & ~0xfUL) + extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + + void default_idle(void); +diff -urNp linux-2.6.33.1/arch/x86/include/asm/uaccess_32.h linux-2.6.33.1/arch/x86/include/asm/uaccess_32.h +--- linux-2.6.33.1/arch/x86/include/asm/uaccess_32.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/uaccess_32.h 2010-03-20 16:58:38.984539447 -0400 +@@ -44,6 +44,9 @@ unsigned long __must_check __copy_from_u + static __always_inline unsigned long __must_check + __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + if (__builtin_constant_p(n)) { + unsigned long ret; + +@@ -62,6 +65,8 @@ __copy_to_user_inatomic(void __user *to, + return ret; + } + } ++ if (!__builtin_constant_p(n)) ++ check_object_size(from, n, true); + return __copy_to_user_ll(to, from, n); + } + +@@ -89,6 +94,9 
@@ __copy_to_user(void __user *to, const vo + static __always_inline unsigned long + __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) + { ++ if ((long)n < 0) ++ return n; ++ + /* Avoid zeroing the tail if the copy fails.. + * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, + * but as the zeroing behaviour is only significant when n is not +@@ -138,6 +146,10 @@ static __always_inline unsigned long + __copy_from_user(void *to, const void __user *from, unsigned long n) + { + might_fault(); ++ ++ if ((long)n < 0) ++ return n; ++ + if (__builtin_constant_p(n)) { + unsigned long ret; + +@@ -153,6 +165,8 @@ __copy_from_user(void *to, const void __ + return ret; + } + } ++ if (!__builtin_constant_p(n)) ++ check_object_size(to, n, false); + return __copy_from_user_ll(to, from, n); + } + +@@ -160,6 +174,10 @@ static __always_inline unsigned long __c + const void __user *from, unsigned long n) + { + might_fault(); ++ ++ if ((long)n < 0) ++ return n; ++ + if (__builtin_constant_p(n)) { + unsigned long ret; + +@@ -182,15 +200,19 @@ static __always_inline unsigned long + __copy_from_user_inatomic_nocache(void *to, const void __user *from, + unsigned long n) + { +- return __copy_from_user_ll_nocache_nozero(to, from, n); +-} ++ if ((long)n < 0) ++ return n; + +-unsigned long __must_check copy_to_user(void __user *to, +- const void *from, unsigned long n); +-unsigned long __must_check _copy_from_user(void *to, +- const void __user *from, +- unsigned long n); ++ return __copy_from_user_ll_nocache_nozero(to, from, n); ++} + ++extern void copy_to_user_overflow(void) ++#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS ++ __compiletime_error("copy_to_user() buffer size is not provably correct") ++#else ++ __compiletime_warning("copy_to_user() buffer size is not provably correct") ++#endif ++; + + extern void copy_from_user_overflow(void) + #ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS +@@ -200,17 +222,61 @@ extern void copy_from_user_overflow(void + #endif + ; + +-static inline unsigned long __must_check copy_from_user(void *to, +- const void __user *from, +- unsigned long n) ++/** ++ * copy_to_user: - Copy a block of data into user space. ++ * @to: Destination address, in user space. ++ * @from: Source address, in kernel space. ++ * @n: Number of bytes to copy. ++ * ++ * Context: User context only. This function may sleep. ++ * ++ * Copy data from kernel space to user space. ++ * ++ * Returns number of bytes that could not be copied. ++ * On success, this will be zero. ++ */ ++static inline unsigned long __must_check ++copy_to_user(void __user *to, const void *from, unsigned long n) ++{ ++ int sz = __compiletime_object_size(from); ++ ++ if (unlikely(sz != -1 && sz < n)) ++ copy_to_user_overflow(); ++ else if (access_ok(VERIFY_WRITE, to, n)) ++ n = __copy_to_user(to, from, n); ++ return n; ++} ++ ++/** ++ * copy_from_user: - Copy a block of data from user space. ++ * @to: Destination address, in kernel space. ++ * @from: Source address, in user space. ++ * @n: Number of bytes to copy. ++ * ++ * Context: User context only. This function may sleep. ++ * ++ * Copy data from user space to kernel space. ++ * ++ * Returns number of bytes that could not be copied. ++ * On success, this will be zero. ++ * ++ * If some data could not be copied, this function will pad the copied ++ * data to the requested size using zero bytes. 
++ */ ++static inline unsigned long __must_check ++copy_from_user(void *to, const void __user *from, unsigned long n) + { + int sz = __compiletime_object_size(to); + +- if (likely(sz == -1 || sz >= n)) +- n = _copy_from_user(to, from, n); +- else ++ if (unlikely(sz != -1 && sz < n)) + copy_from_user_overflow(); +- ++ else if (access_ok(VERIFY_READ, from, n)) ++ n = __copy_from_user(to, from, n); ++ else if ((long)n > 0) { ++ if (!__builtin_constant_p(n)) ++ check_object_size(to, n, false); ++ memset(to, 0, n); ++ } + return n; + } + +diff -urNp linux-2.6.33.1/arch/x86/include/asm/uaccess_64.h linux-2.6.33.1/arch/x86/include/asm/uaccess_64.h +--- linux-2.6.33.1/arch/x86/include/asm/uaccess_64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/uaccess_64.h 2010-03-20 16:58:38.984539447 -0400 +@@ -10,6 +10,8 @@ + #include <linux/lockdep.h> + #include <asm/page.h> + ++#define set_fs(x) (current_thread_info()->addr_limit = (x)) ++ + /* + * Copy To/From Userspace + */ +@@ -17,27 +19,26 @@ + /* Handles exceptions in both to and from, but doesn't do access_ok */ + __must_check unsigned long + copy_user_generic(void *to, const void *from, unsigned len); +- +-__must_check unsigned long +-_copy_to_user(void __user *to, const void *from, unsigned len); +-__must_check unsigned long +-_copy_from_user(void *to, const void __user *from, unsigned len); ++static __always_inline __must_check unsigned long ++__copy_to_user(void __user *to, const void *from, unsigned len); ++static __always_inline __must_check unsigned long ++__copy_from_user(void *to, const void __user *from, unsigned len); + __must_check unsigned long + copy_in_user(void __user *to, const void __user *from, unsigned len); + + static inline unsigned long __must_check copy_from_user(void *to, + const void __user *from, +- unsigned long n) ++ unsigned n) + { +- int sz = __compiletime_object_size(to); +- + might_fault(); +- if (likely(sz == -1 || sz >= n)) +- n = _copy_from_user(to, from, n); +-#ifdef CONFIG_DEBUG_VM +- else +- WARN(1, "Buffer overflow detected!\n"); +-#endif ++ ++ if (access_ok(VERIFY_READ, from, n)) ++ n = __copy_from_user(to, from, n); ++ else if ((int)n > 0) { ++ if (!__builtin_constant_p(n)) ++ check_object_size(to, n, false); ++ memset(to, 0, n); ++ } + return n; + } + +@@ -46,17 +47,33 @@ int copy_to_user(void __user *dst, const + { + might_fault(); + +- return _copy_to_user(dst, src, size); ++ if (access_ok(VERIFY_WRITE, dst, size)) ++ size = __copy_to_user(dst, src, size); ++ return size; + } + + static __always_inline __must_check +-int __copy_from_user(void *dst, const void __user *src, unsigned size) ++unsigned long __copy_from_user(void *dst, const void __user *src, unsigned size) + { +- int ret = 0; ++ int sz = __compiletime_object_size(dst); ++ unsigned ret = 0; + + might_fault(); +- if (!__builtin_constant_p(size)) ++ ++ if ((int)size < 0) ++ return size; ++ ++ if (unlikely(sz != -1 && sz < size)) { ++#ifdef CONFIG_DEBUG_VM ++ WARN(1, "Buffer overflow detected!\n"); ++#endif ++ return size; ++ } ++ ++ if (!__builtin_constant_p(size)) { ++ check_object_size(dst, size, false); + return copy_user_generic(dst, (__force void *)src, size); ++ } + switch (size) { + case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + ret, "b", "b", "=q", 1); +@@ -94,13 +111,27 @@ int __copy_from_user(void *dst, const vo + } + + static __always_inline __must_check +-int __copy_to_user(void __user *dst, const void *src, unsigned size) ++unsigned long __copy_to_user(void __user *dst, const void *src, unsigned 
size) + { +- int ret = 0; ++ int sz = __compiletime_object_size(src); ++ unsigned ret = 0; + + might_fault(); +- if (!__builtin_constant_p(size)) ++ ++ if ((int)size < 0) ++ return size; ++ ++ if (unlikely(sz != -1 && sz < size)) { ++#ifdef CONFIG_DEBUG_VM ++ WARN(1, "Buffer overflow detected!\n"); ++#endif ++ return size; ++ } ++ ++ if (!__builtin_constant_p(size)) { ++ check_object_size(src, size, true); + return copy_user_generic((__force void *)dst, src, size); ++ } + switch (size) { + case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + ret, "b", "b", "iq", 1); +@@ -138,11 +169,15 @@ int __copy_to_user(void __user *dst, con + } + + static __always_inline __must_check +-int __copy_in_user(void __user *dst, const void __user *src, unsigned size) ++unsigned long __copy_in_user(void __user *dst, const void __user *src, unsigned size) + { +- int ret = 0; ++ unsigned ret = 0; + + might_fault(); ++ ++ if ((int)size < 0) ++ return size; ++ + if (!__builtin_constant_p(size)) + return copy_user_generic((__force void *)dst, + (__force void *)src, size); +@@ -206,30 +241,38 @@ __copy_from_user_inatomic(void *dst, con + return copy_user_generic(dst, (__force const void *)src, size); + } + +-static __must_check __always_inline int ++static __must_check __always_inline unsigned long + __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) + { ++ if ((int)size < 0) ++ return size; ++ + return copy_user_generic((__force void *)dst, src, size); + } + +-extern long __copy_user_nocache(void *dst, const void __user *src, ++extern unsigned long __copy_user_nocache(void *dst, const void __user *src, + unsigned size, int zerorest); + +-static inline int +-__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) ++static inline unsigned long __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) + { + might_sleep(); ++ ++ if ((int)size < 0) ++ return size; ++ + return __copy_user_nocache(dst, src, size, 1); + } + +-static inline int +-__copy_from_user_inatomic_nocache(void *dst, const void __user *src, ++static inline unsigned long __copy_from_user_inatomic_nocache(void *dst, const void __user *src, + unsigned size) + { ++ if ((int)size < 0) ++ return size; ++ + return __copy_user_nocache(dst, src, size, 0); + } + +-unsigned long ++extern unsigned long + copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest); + + #endif /* _ASM_X86_UACCESS_64_H */ +diff -urNp linux-2.6.33.1/arch/x86/include/asm/uaccess.h linux-2.6.33.1/arch/x86/include/asm/uaccess.h +--- linux-2.6.33.1/arch/x86/include/asm/uaccess.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/include/asm/uaccess.h 2010-03-20 16:58:38.984539447 -0400 +@@ -8,8 +8,11 @@ + #include <linux/thread_info.h> + #include <linux/prefetch.h> + #include <linux/string.h> ++#include <linux/sched.h> ++#include <linux/slab.h> + #include <asm/asm.h> + #include <asm/page.h> ++#include <asm/segment.h> + + #define VERIFY_READ 0 + #define VERIFY_WRITE 1 +@@ -29,7 +32,12 @@ + + #define get_ds() (KERNEL_DS) + #define get_fs() (current_thread_info()->addr_limit) ++#ifdef CONFIG_X86_32 ++void __set_fs(mm_segment_t x, int cpu); ++void set_fs(mm_segment_t x); ++#else + #define set_fs(x) (current_thread_info()->addr_limit = (x)) ++#endif + + #define segment_eq(a, b) ((a).seg == (b).seg) + +@@ -77,7 +85,33 @@ + * checks that the pointer is in the user space range - after calling + * this function, memory access functions may still return -EFAULT. 
+ */
+-#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
++#define __access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
++#define access_ok(type, addr, size) \
++({ \
++ long __size = size; \
++ unsigned long __addr = (unsigned long)addr; \
++ unsigned long __addr_ao = __addr & PAGE_MASK; \
++ unsigned long __end_ao = __addr + __size - 1; \
++ bool __ret_ao = __range_not_ok(__addr, __size) == 0; \
++ if (__ret_ao && unlikely((__end_ao ^ __addr_ao) & PAGE_MASK)) { \
++ while(__addr_ao <= __end_ao) { \
++ char __c_ao; \
++ __addr_ao += PAGE_SIZE; \
++ if (__size > PAGE_SIZE) \
++ cond_resched(); \
++ if (__get_user(__c_ao, (char __user *)__addr)) \
++ break; \
++ if (type != VERIFY_WRITE) { \
++ __addr = __addr_ao; \
++ continue; \
++ } \
++ if (__put_user(__c_ao, (char __user *)__addr)) \
++ break; \
++ __addr = __addr_ao; \
++ } \
++ } \
++ __ret_ao; \
++})
+
+ /*
+ * The exception table consists of pairs of addresses: the first is the
+@@ -183,13 +217,21 @@ extern int __get_user_bad(void);
+ asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
+ : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+
+-
++#ifdef CONFIG_X86_32
++#define _ASM_LOAD_USER_DS(ds) "movw %w" #ds ",%%ds\n"
++#define _ASM_LOAD_KERNEL_DS "pushl %%ss; popl %%ds\n"
++#else
++#define _ASM_LOAD_USER_DS(ds)
++#define _ASM_LOAD_KERNEL_DS
++#endif
+
+ #ifdef CONFIG_X86_32
+ #define __put_user_asm_u64(x, addr, err, errret) \
+- asm volatile("1: movl %%eax,0(%2)\n" \
+- "2: movl %%edx,4(%2)\n" \
++ asm volatile(_ASM_LOAD_USER_DS(5) \
++ "1: movl %%eax,%%ds:0(%2)\n" \
++ "2: movl %%edx,%%ds:4(%2)\n" \
+ "3:\n" \
++ _ASM_LOAD_KERNEL_DS \
+ ".section .fixup,\"ax\"\n" \
+ "4: movl %3,%0\n" \
+ " jmp 3b\n" \
+@@ -197,15 +239,18 @@ extern int __get_user_bad(void);
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=r" (err) \
+- : "A" (x), "r" (addr), "i" (errret), "0" (err))
++ : "A" (x), "r" (addr), "i" (errret), "0" (err), \
++ "r"(__USER_DS))
+
+ #define __put_user_asm_ex_u64(x, addr) \
+- asm volatile("1: movl %%eax,0(%1)\n" \
+- "2: movl %%edx,4(%1)\n" \
++ asm volatile(_ASM_LOAD_USER_DS(2) \
++ "1: movl %%eax,%%ds:0(%1)\n" \
++ "2: movl %%edx,%%ds:4(%1)\n" \
+ "3:\n" \
++ _ASM_LOAD_KERNEL_DS \
+ _ASM_EXTABLE(1b, 2b - 1b) \
+ _ASM_EXTABLE(2b, 3b - 2b) \
+- : : "A" (x), "r" (addr))
++ : : "A" (x), "r" (addr), "r"(__USER_DS))
+
+ #define __put_user_x8(x, ptr, __ret_pu) \
+ asm volatile("call __put_user_8" : "=a" (__ret_pu) \
+@@ -374,16 +419,18 @@ do { \
+ } while (0)
+
+ #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+- asm volatile("1: mov"itype" %2,%"rtype"1\n" \
++ asm volatile(_ASM_LOAD_USER_DS(5) \
++ "1: mov"itype" %%ds:%2,%"rtype"1\n" \
+ "2:\n" \
++ _ASM_LOAD_KERNEL_DS \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " xor"itype" %"rtype"1,%"rtype"1\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+- : "=r" (err), ltype(x) \
+- : "m" (__m(addr)), "i" (errret), "0" (err))
++ : "=r" (err), ltype (x) \
++ : "m" (__m(addr)), "i" (errret), "0" (err), "r"(__USER_DS))
+
+ #define __get_user_size_ex(x, ptr, size) \
+ do { \
+@@ -407,10 +454,12 @@ do { \
+ } while (0)
+
+ #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \
+- asm volatile("1: mov"itype" %1,%"rtype"0\n" \
++ asm volatile(_ASM_LOAD_USER_DS(2) \
++ "1: mov"itype" %%ds:%1,%"rtype"0\n" \
+ "2:\n" \
++ _ASM_LOAD_KERNEL_DS \
+ _ASM_EXTABLE(1b, 2b - 1b) \
+- : ltype(x) : "m" (__m(addr)))
++ : ltype(x) : "m" (__m(addr)), "r"(__USER_DS))
+
+ #define __put_user_nocheck(x, ptr, size) \
+ ({ \
+@@ -424,7 +473,7 @@ do { \
+ int __gu_err; \
+ unsigned long __gu_val; \
+ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+- (x) = (__force __typeof__(*(ptr)))__gu_val; \
++ (x) = (__typeof__(*(ptr)))__gu_val; \
+ __gu_err; \
+ })
+
+@@ -438,21 +487,26 @@ struct __large_struct { unsigned long bu
+ * aliasing issues.
+ */
+ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+- asm volatile("1: mov"itype" %"rtype"1,%2\n" \
++ asm volatile(_ASM_LOAD_USER_DS(5) \
++ "1: mov"itype" %"rtype"1,%%ds:%2\n" \
+ "2:\n" \
++ _ASM_LOAD_KERNEL_DS \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r"(err) \
+- : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
++ : ltype (x), "m" (__m(addr)), "i" (errret), "0" (err),\
++ "r"(__USER_DS))
+
+ #define __put_user_asm_ex(x, addr, itype, rtype, ltype) \
+- asm volatile("1: mov"itype" %"rtype"0,%1\n" \
++ asm volatile(_ASM_LOAD_USER_DS(2) \
++ "1: mov"itype" %"rtype"0,%%ds:%1\n" \
+ "2:\n" \
++ _ASM_LOAD_KERNEL_DS \
+ _ASM_EXTABLE(1b, 2b - 1b) \
+- : : ltype(x), "m" (__m(addr)))
++ : : ltype(x), "m" (__m(addr)), "r"(__USER_DS))
+
+ /*
+ * uaccess_try and catch
+@@ -530,7 +584,7 @@ struct __large_struct { unsigned long bu
+ #define get_user_ex(x, ptr) do { \
+ unsigned long __gue_val; \
+ __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \
+- (x) = (__force __typeof__(*(ptr)))__gue_val; \
++ (x) = (__typeof__(*(ptr)))__gue_val; \
+ } while (0)
+
+ #ifdef CONFIG_X86_WP_WORKS_OK
+@@ -567,6 +621,7 @@ extern struct movsl_mask {
+
+ #define ARCH_HAS_NOCACHE_UACCESS 1
+
++#define ARCH_HAS_SORT_EXTABLE
+ #ifdef CONFIG_X86_32
+ # include "uaccess_32.h"
+ #else
+diff -urNp linux-2.6.33.1/arch/x86/include/asm/vgtod.h linux-2.6.33.1/arch/x86/include/asm/vgtod.h
+--- linux-2.6.33.1/arch/x86/include/asm/vgtod.h 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/include/asm/vgtod.h 2010-03-20 16:58:38.984539447 -0400
+@@ -14,6 +14,7 @@ struct vsyscall_gtod_data {
+ int sysctl_enabled;
+ struct timezone sys_tz;
+ struct { /* extract of a clocksource struct */
++ char name[8];
+ cycle_t (*vread)(void);
+ cycle_t cycle_last;
+ cycle_t mask;
+diff -urNp linux-2.6.33.1/arch/x86/include/asm/vmi.h linux-2.6.33.1/arch/x86/include/asm/vmi.h
+--- linux-2.6.33.1/arch/x86/include/asm/vmi.h 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/include/asm/vmi.h 2010-03-20 16:58:38.984539447 -0400
+@@ -191,6 +191,7 @@ struct vrom_header {
+ u8 reserved[96]; /* Reserved for headers */
+ char vmi_init[8]; /* VMI_Init jump point */
+ char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
++ char rom_data[8048]; /* rest of the option ROM */
+ } __attribute__((packed));
+
+ struct pnp_header {
+diff -urNp linux-2.6.33.1/arch/x86/include/asm/vsyscall.h linux-2.6.33.1/arch/x86/include/asm/vsyscall.h
+--- linux-2.6.33.1/arch/x86/include/asm/vsyscall.h 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/include/asm/vsyscall.h 2010-03-20 16:58:38.984539447 -0400
+@@ -15,9 +15,10 @@ enum vsyscall_num {
+
+ #ifdef __KERNEL__
+ #include <linux/seqlock.h>
++#include <linux/getcpu.h>
++#include <linux/time.h>
+
+ #define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
+-#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
+
+ /* Definitions for CONFIG_GENERIC_TIME definitions */
+ #define __section_vsyscall_gtod_data __attribute__ \
+@@ -31,7 +32,6 @@ enum vsyscall_num {
+ #define VGETCPU_LSL 2
+
+ extern int __vgetcpu_mode;
+-extern volatile unsigned long __jiffies;
+
+ /* kernel space (writeable) */
+ extern int vgetcpu_mode;
+@@ -39,6 +39,9 @@ extern struct timezone sys_tz;
+
+ extern void map_vsyscall(void);
+
++extern int vgettimeofday(struct timeval * tv, struct timezone * tz);
++extern time_t vtime(time_t *t);
++extern long vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache);
+ #endif /* __KERNEL__ */
+
+ #endif /* _ASM_X86_VSYSCALL_H */
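Aside, to make the access_ok() hunk above easier to follow: under this hardening, access_ok() no longer just range-checks. When a range crosses a page boundary it also touches one byte per page, so that any fault is taken up front rather than halfway through a copy; a failed probe merely ends the walk, since __ret_ao is left untouched. Below is a rough user-space rendering of that walk, not part of the patch: probe_byte() is a hypothetical stand-in for __get_user(), the limit test stands in for __range_not_ok(), and the VERIFY_WRITE write-back and cond_resched() throttling of the real macro are omitted.

    /* Sketch only -- simplified illustration of the prefault walk. */
    #include <stdbool.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* hypothetical: returns 0 if the byte at addr is readable */
    extern int probe_byte(unsigned long addr, char *out);

    static bool range_ok(unsigned long addr, unsigned long size,
                         unsigned long limit)
    {
        unsigned long page = addr & PAGE_MASK;
        unsigned long end  = addr + size - 1;
        /* stand-in for __range_not_ok() == 0 */
        bool ok = size <= limit && addr <= limit - size;
        char c;

        /* prefault only when the range crosses a page boundary:
         * touch one byte per page of [addr, end]; a failed probe
         * just stops the walk and does not change the result */
        if (ok && ((end ^ page) & PAGE_MASK)) {
            while (page <= end) {
                page += PAGE_SIZE;
                if (probe_byte(addr, &c))
                    break;
                addr = page;
            }
        }
        return ok;
    }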
+diff -urNp linux-2.6.33.1/arch/x86/Kconfig linux-2.6.33.1/arch/x86/Kconfig
+--- linux-2.6.33.1/arch/x86/Kconfig 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/Kconfig 2010-03-20 16:58:38.984539447 -0400
+@@ -1088,7 +1088,7 @@ config PAGE_OFFSET
+ hex
+ default 0xB0000000 if VMSPLIT_3G_OPT
+ default 0x80000000 if VMSPLIT_2G
+- default 0x78000000 if VMSPLIT_2G_OPT
++ default 0x70000000 if VMSPLIT_2G_OPT
+ default 0x40000000 if VMSPLIT_1G
+ default 0xC0000000
+ depends on X86_32
+@@ -1422,7 +1422,7 @@ config ARCH_USES_PG_UNCACHED
+
+ config EFI
+ bool "EFI runtime service support"
+- depends on ACPI
++ depends on ACPI && !PAX_KERNEXEC
+ ---help---
+ This enables the kernel to use EFI runtime services that are
+ available (such as the EFI variable services).
+@@ -1509,6 +1509,7 @@ config KEXEC_JUMP
+ config PHYSICAL_START
+ hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+ default "0x1000000"
++ range 0x400000 0x40000000
+ ---help---
+ This gives the physical address where the kernel is loaded.
+
+@@ -1573,6 +1574,7 @@ config PHYSICAL_ALIGN
+ hex
+ prompt "Alignment value to which kernel should be aligned" if X86_32
+ default "0x1000000"
++ range 0x400000 0x1000000 if PAX_KERNEXEC
+ range 0x2000 0x1000000
+ ---help---
+ This value puts the alignment restrictions on physical address
+@@ -1604,9 +1606,10 @@ config HOTPLUG_CPU
+ Say N if you want to disable CPU hotplug.
+
+ config COMPAT_VDSO
+- def_bool y
++ def_bool n
+ prompt "Compat VDSO support"
+ depends on X86_32 || IA32_EMULATION
++ depends on !PAX_NOEXEC && !PAX_MEMORY_UDEREF
+ ---help---
+ Map the 32-bit VDSO to the predictable old-style address too.
+
+diff -urNp linux-2.6.33.1/arch/x86/Kconfig.cpu linux-2.6.33.1/arch/x86/Kconfig.cpu
+--- linux-2.6.33.1/arch/x86/Kconfig.cpu 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/Kconfig.cpu 2010-03-20 16:58:38.984539447 -0400
+@@ -336,7 +336,7 @@ config X86_PPRO_FENCE
+
+ config X86_F00F_BUG
+ def_bool y
+- depends on M586MMX || M586TSC || M586 || M486 || M386
++ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !PAX_KERNEXEC
+
+ config X86_WP_WORKS_OK
+ def_bool y
+@@ -356,7 +356,7 @@ config X86_POPAD_OK
+
+ config X86_ALIGNMENT_16
+ def_bool y
+- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK8 || MK7 || MK6 || MCORE2 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+
+ config X86_INTEL_USERCOPY
+ def_bool y
+@@ -402,7 +402,7 @@ config X86_CMPXCHG64
+ # generates cmov.
+ config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK7 || MCORE2 || MPSC || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + + config X86_MINIMUM_CPU_FAMILY + int +diff -urNp linux-2.6.33.1/arch/x86/Kconfig.debug linux-2.6.33.1/arch/x86/Kconfig.debug +--- linux-2.6.33.1/arch/x86/Kconfig.debug 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/Kconfig.debug 2010-03-20 16:58:38.984539447 -0400 +@@ -99,7 +99,7 @@ config X86_PTDUMP + config DEBUG_RODATA + bool "Write protect kernel read-only data structures" + default y +- depends on DEBUG_KERNEL ++ depends on DEBUG_KERNEL && BROKEN + ---help--- + Mark the kernel read-only data as write-protected in the pagetables, + in order to catch accidental (and incorrect) writes to such const +diff -urNp linux-2.6.33.1/arch/x86/kernel/acpi/boot.c linux-2.6.33.1/arch/x86/kernel/acpi/boot.c +--- linux-2.6.33.1/arch/x86/kernel/acpi/boot.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/acpi/boot.c 2010-03-20 16:58:38.988525305 -0400 +@@ -1503,7 +1503,7 @@ static struct dmi_system_id __initdata a + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, +- {} ++ { NULL, NULL, {{0, {0}}}, NULL} + }; + + /* +diff -urNp linux-2.6.33.1/arch/x86/kernel/acpi/realmode/wakeup.S linux-2.6.33.1/arch/x86/kernel/acpi/realmode/wakeup.S +--- linux-2.6.33.1/arch/x86/kernel/acpi/realmode/wakeup.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/acpi/realmode/wakeup.S 2010-03-20 16:58:38.988525305 -0400 +@@ -104,7 +104,7 @@ _start: + movl %eax, %ecx + orl %edx, %ecx + jz 1f +- movl $0xc0000080, %ecx ++ mov $MSR_EFER, %ecx + wrmsr + 1: + +diff -urNp linux-2.6.33.1/arch/x86/kernel/acpi/sleep.c linux-2.6.33.1/arch/x86/kernel/acpi/sleep.c +--- linux-2.6.33.1/arch/x86/kernel/acpi/sleep.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/acpi/sleep.c 2010-03-20 16:58:38.988525305 -0400 +@@ -11,11 +11,12 @@ + #include <linux/cpumask.h> + #include <asm/segment.h> + #include <asm/desc.h> ++#include <asm/e820.h> + + #include "realmode/wakeup.h" + #include "sleep.h" + +-unsigned long acpi_wakeup_address; ++unsigned long acpi_wakeup_address = 0x2000; + unsigned long acpi_realmode_flags; + + /* address in low memory of the wakeup routine. */ +@@ -96,8 +97,12 @@ int acpi_save_state_mem(void) + header->trampoline_segment = setup_trampoline() >> 4; + #ifdef CONFIG_SMP + stack_start.sp = temp_stack + sizeof(temp_stack); ++ ++ pax_open_kernel(); + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); ++ pax_close_kernel(); ++ + initial_gs = per_cpu_offset(smp_processor_id()); + #endif + initial_code = (unsigned long)wakeup_long64; +diff -urNp linux-2.6.33.1/arch/x86/kernel/acpi/wakeup_32.S linux-2.6.33.1/arch/x86/kernel/acpi/wakeup_32.S +--- linux-2.6.33.1/arch/x86/kernel/acpi/wakeup_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/acpi/wakeup_32.S 2010-03-20 16:58:38.988525305 -0400 +@@ -30,13 +30,11 @@ wakeup_pmode_return: + # and restore the stack ... 
but you need gdt for this to work + movl saved_context_esp, %esp + +- movl %cs:saved_magic, %eax +- cmpl $0x12345678, %eax ++ cmpl $0x12345678, saved_magic + jne bogus_magic + + # jump to place where we left off +- movl saved_eip, %eax +- jmp *%eax ++ jmp *(saved_eip) + + bogus_magic: + jmp bogus_magic +diff -urNp linux-2.6.33.1/arch/x86/kernel/alternative.c linux-2.6.33.1/arch/x86/kernel/alternative.c +--- linux-2.6.33.1/arch/x86/kernel/alternative.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/alternative.c 2010-03-20 16:58:38.988525305 -0400 +@@ -407,7 +407,7 @@ void __init_or_module apply_paravirt(str + + BUG_ON(p->len > MAX_PATCH_LEN); + /* prep the buffer with the original instructions */ +- memcpy(insnbuf, p->instr, p->len); ++ memcpy(insnbuf, ktla_ktva(p->instr), p->len); + used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, + (unsigned long)p->instr, p->len); + +@@ -492,12 +492,16 @@ void __init alternative_instructions(voi + * instructions. And on the local CPU you need to be protected again NMI or MCE + * handlers seeing an inconsistent instruction while you patch. + */ +-static void *__init_or_module text_poke_early(void *addr, const void *opcode, ++static void *__kprobes text_poke_early(void *addr, const void *opcode, + size_t len) + { + unsigned long flags; + local_irq_save(flags); +- memcpy(addr, opcode, len); ++ ++ pax_open_kernel(); ++ memcpy(ktla_ktva(addr), opcode, len); ++ pax_close_kernel(); ++ + sync_core(); + local_irq_restore(flags); + /* Could also do a CLFLUSH here to speed up CPU recovery; but +@@ -520,35 +524,21 @@ static void *__init_or_module text_poke_ + */ + void *__kprobes text_poke(void *addr, const void *opcode, size_t len) + { +- unsigned long flags; +- char *vaddr; ++ unsigned char *vaddr = ktla_ktva(addr); + struct page *pages[2]; +- int i; ++ size_t i; + + if (!core_kernel_text((unsigned long)addr)) { +- pages[0] = vmalloc_to_page(addr); +- pages[1] = vmalloc_to_page(addr + PAGE_SIZE); ++ pages[0] = vmalloc_to_page(vaddr); ++ pages[1] = vmalloc_to_page(vaddr + PAGE_SIZE); + } else { +- pages[0] = virt_to_page(addr); ++ pages[0] = virt_to_page(vaddr); + WARN_ON(!PageReserved(pages[0])); +- pages[1] = virt_to_page(addr + PAGE_SIZE); ++ pages[1] = virt_to_page(vaddr + PAGE_SIZE); + } + BUG_ON(!pages[0]); +- local_irq_save(flags); +- set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); +- if (pages[1]) +- set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); +- vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); +- memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); +- clear_fixmap(FIX_TEXT_POKE0); +- if (pages[1]) +- clear_fixmap(FIX_TEXT_POKE1); +- local_flush_tlb(); +- sync_core(); +- /* Could also do a CLFLUSH here to speed up CPU recovery; but +- that causes hangs on some VIA CPUs. 
*/ ++ text_poke_early(addr, opcode, len); + for (i = 0; i < len; i++) +- BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); +- local_irq_restore(flags); ++ BUG_ON(((char *)vaddr)[i] != ((char *)opcode)[i]); + return addr; + } +diff -urNp linux-2.6.33.1/arch/x86/kernel/amd_iommu.c linux-2.6.33.1/arch/x86/kernel/amd_iommu.c +--- linux-2.6.33.1/arch/x86/kernel/amd_iommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/amd_iommu.c 2010-03-20 16:58:38.988525305 -0400 +@@ -2210,7 +2210,7 @@ static void prealloc_protection_domains( + } + } + +-static struct dma_map_ops amd_iommu_dma_ops = { ++static const struct dma_map_ops amd_iommu_dma_ops = { + .alloc_coherent = alloc_coherent, + .free_coherent = free_coherent, + .map_page = map_page, +diff -urNp linux-2.6.33.1/arch/x86/kernel/apic/io_apic.c linux-2.6.33.1/arch/x86/kernel/apic/io_apic.c +--- linux-2.6.33.1/arch/x86/kernel/apic/io_apic.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/apic/io_apic.c 2010-03-20 16:58:38.988525305 -0400 +@@ -701,7 +701,7 @@ struct IO_APIC_route_entry **alloc_ioapi + ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, + GFP_ATOMIC); + if (!ioapic_entries) +- return 0; ++ return NULL; + + for (apic = 0; apic < nr_ioapics; apic++) { + ioapic_entries[apic] = +@@ -718,7 +718,7 @@ nomem: + kfree(ioapic_entries[apic]); + kfree(ioapic_entries); + +- return 0; ++ return NULL; + } + + /* +@@ -1135,7 +1135,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, + } + EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +-void lock_vector_lock(void) ++void lock_vector_lock(void) __acquires(vector_lock) + { + /* Used to the online set of cpus does not change + * during assign_irq_vector. +@@ -1143,7 +1143,7 @@ void lock_vector_lock(void) + spin_lock(&vector_lock); + } + +-void unlock_vector_lock(void) ++void unlock_vector_lock(void) __releases(vector_lock) + { + spin_unlock(&vector_lock); + } +diff -urNp linux-2.6.33.1/arch/x86/kernel/apm_32.c linux-2.6.33.1/arch/x86/kernel/apm_32.c +--- linux-2.6.33.1/arch/x86/kernel/apm_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/apm_32.c 2010-03-20 16:58:38.988525305 -0400 +@@ -410,7 +410,7 @@ static DEFINE_MUTEX(apm_mutex); + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. 
+ */ +-static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, ++static const struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4093, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); + + static const char driver_version[] = "1.16ac"; /* no spaces */ +@@ -588,7 +588,10 @@ static long __apm_bios_call(void *_call) + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; ++ ++ pax_open_kernel(); + gdt[0x40 / 8] = bad_bios_desc; ++ pax_close_kernel(); + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; +@@ -597,7 +600,11 @@ static long __apm_bios_call(void *_call) + &call->esi); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); ++ ++ pax_open_kernel(); + gdt[0x40 / 8] = save_desc_40; ++ pax_close_kernel(); ++ + put_cpu(); + + return call->eax & 0xff; +@@ -664,7 +671,10 @@ static long __apm_bios_call_simple(void + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; ++ ++ pax_open_kernel(); + gdt[0x40 / 8] = bad_bios_desc; ++ pax_close_kernel(); + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; +@@ -672,7 +682,11 @@ static long __apm_bios_call_simple(void + &call->eax); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); ++ ++ pax_open_kernel(); + gdt[0x40 / 8] = save_desc_40; ++ pax_close_kernel(); ++ + put_cpu(); + return error; + } +@@ -975,7 +989,7 @@ recalc: + + static void apm_power_off(void) + { +- unsigned char po_bios_call[] = { ++ const unsigned char po_bios_call[] = { + 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ + 0x8e, 0xd0, /* movw ax,ss */ + 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ +@@ -1931,7 +1945,10 @@ static const struct file_operations apm_ + static struct miscdevice apm_device = { + APM_MINOR_DEV, + "apm_bios", +- &apm_bios_fops ++ &apm_bios_fops, ++ {NULL, NULL}, ++ NULL, ++ NULL + }; + + +@@ -2252,7 +2269,7 @@ static struct dmi_system_id __initdata a + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, + }, + +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL} + }; + + /* +@@ -2355,12 +2372,15 @@ static int __init apm_init(void) + * code to that CPU. 
+ */ + gdt = get_cpu_gdt_table(0); ++ ++ pax_open_kernel(); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); ++ pax_close_kernel(); + + proc_create("apm", 0, NULL, &apm_file_ops); + +diff -urNp linux-2.6.33.1/arch/x86/kernel/asm-offsets_32.c linux-2.6.33.1/arch/x86/kernel/asm-offsets_32.c +--- linux-2.6.33.1/arch/x86/kernel/asm-offsets_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/asm-offsets_32.c 2010-03-20 16:58:38.988525305 -0400 +@@ -115,6 +115,11 @@ void foo(void) + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ OFFSET(PV_CPU_write_cr0, pv_cpu_ops, write_cr0); ++#endif ++ + #endif + + #ifdef CONFIG_XEN +diff -urNp linux-2.6.33.1/arch/x86/kernel/asm-offsets_64.c linux-2.6.33.1/arch/x86/kernel/asm-offsets_64.c +--- linux-2.6.33.1/arch/x86/kernel/asm-offsets_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/asm-offsets_64.c 2010-03-20 16:58:38.988525305 -0400 +@@ -63,6 +63,12 @@ int main(void) + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); ++ OFFSET(PV_CPU_write_cr0, pv_cpu_ops, write_cr0); ++#endif ++ + #endif + + +@@ -115,6 +121,7 @@ int main(void) + ENTRY(cr8); + BLANK(); + #undef ENTRY ++ DEFINE(TSS_size, sizeof(struct tss_struct)); + DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); + BLANK(); + DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); +diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/common.c linux-2.6.33.1/arch/x86/kernel/cpu/common.c +--- linux-2.6.33.1/arch/x86/kernel/cpu/common.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/cpu/common.c 2010-03-20 16:58:38.992510812 -0400 +@@ -83,60 +83,6 @@ static const struct cpu_dev __cpuinitcon + + static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; + +-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +-#ifdef CONFIG_X86_64 +- /* +- * We need valid kernel segments for data and code in long mode too +- * IRET will check the segment types kkeil 2000/10/28 +- * Also sysret mandates a special GDT layout +- * +- * TLS descriptors are currently at a different place compared to i386. +- * Hopefully nobody expects them at a fixed place (Wine?) +- */ +- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), +- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), +- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), +- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), +- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), +-#else +- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), +- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), +- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), +- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), +- /* +- * Segments used for calling PnP BIOS have byte granularity. 
+- * They code segments and data segments have fixed 64k limits, +- * the transfer segment sizes are set at run time. +- */ +- /* 32-bit code */ +- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), +- /* 16-bit code */ +- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), +- /* 16-bit data */ +- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), +- /* 16-bit data */ +- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), +- /* 16-bit data */ +- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), +- /* +- * The APM segments have byte granularity and their bases +- * are set at run time. All have 64k limits. +- */ +- /* 32-bit code */ +- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), +- /* 16-bit code */ +- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), +- /* data */ +- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), +- +- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), +- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), +- GDT_STACK_CANARY_INIT +-#endif +-} }; +-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +- + static int __init x86_xsave_setup(char *s) + { + setup_clear_cpu_cap(X86_FEATURE_XSAVE); +@@ -344,7 +290,7 @@ void switch_to_new_gdt(int cpu) + { + struct desc_ptr gdt_descr; + +- gdt_descr.address = (long)get_cpu_gdt_table(cpu); ++ gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + /* Reload the per-cpu base */ +@@ -802,6 +748,10 @@ static void __cpuinit identify_cpu(struc + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + ++#if defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_KERNEXEC) || (defined(CONFIG_PAX_MEMORY_UDEREF) && defined(CONFIG_X86_32)) ++ setup_clear_cpu_cap(X86_FEATURE_SEP); ++#endif ++ + /* If the model name is still unset, do table lookup. 
*/ + if (!c->x86_model_id[0]) { + const char *p; +@@ -1103,7 +1053,7 @@ void __cpuinit cpu_init(void) + int i; + + cpu = stack_smp_processor_id(); +- t = &per_cpu(init_tss, cpu); ++ t = init_tss + cpu; + oist = &per_cpu(orig_ist, cpu); + + #ifdef CONFIG_NUMA +@@ -1201,7 +1151,7 @@ void __cpuinit cpu_init(void) + { + int cpu = smp_processor_id(); + struct task_struct *curr = current; +- struct tss_struct *t = &per_cpu(init_tss, cpu); ++ struct tss_struct *t = init_tss + cpu; + struct thread_struct *thread = &curr->thread; + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { +diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +--- linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2010-03-20 16:58:38.992510812 -0400 +@@ -523,7 +523,7 @@ static const struct dmi_system_id sw_any + DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + + static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +--- linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c 2010-03-20 16:58:38.992510812 -0400 +@@ -225,7 +225,7 @@ static struct cpu_model models[] = + { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, + { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, + +- { NULL, } ++ { NULL, NULL, 0, NULL} + }; + #undef _BANIAS + #undef BANIAS +diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/intel.c linux-2.6.33.1/arch/x86/kernel/cpu/intel.c +--- linux-2.6.33.1/arch/x86/kernel/cpu/intel.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/cpu/intel.c 2010-03-20 16:58:38.992510812 -0400 +@@ -139,7 +139,7 @@ static void __cpuinit trap_init_f00f_bug + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. 
+ */
+- idt_descr.address = fix_to_virt(FIX_F00F_IDT);
++ idt_descr.address = (struct desc_struct *)fix_to_virt(FIX_F00F_IDT);
+ load_idt(&idt_descr);
+ }
+ #endif
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/intel_cacheinfo.c linux-2.6.33.1/arch/x86/kernel/cpu/intel_cacheinfo.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/intel_cacheinfo.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/intel_cacheinfo.c 2010-03-20 16:58:38.992510812 -0400
+@@ -848,7 +848,7 @@ static ssize_t store(struct kobject *kob
+ return ret;
+ }
+
+-static struct sysfs_ops sysfs_ops = {
++static const struct sysfs_ops sysfs_ops = {
+ .show = show,
+ .store = store,
+ };
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/Makefile linux-2.6.33.1/arch/x86/kernel/cpu/Makefile
+--- linux-2.6.33.1/arch/x86/kernel/cpu/Makefile 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/Makefile 2010-03-20 16:58:38.992510812 -0400
+@@ -8,10 +8,6 @@ CFLAGS_REMOVE_common.o = -pg
+ CFLAGS_REMOVE_perf_event.o = -pg
+ endif
+
+-# Make sure load_percpu_segment has no stackprotector
+-nostackp := $(call cc-option, -fno-stack-protector)
+-CFLAGS_common.o := $(nostackp)
+-
+ obj-y := intel_cacheinfo.o addon_cpuid_features.o
+ obj-y += proc.o capflags.o powerflags.o common.o
+ obj-y += vmware.o hypervisor.o sched.o
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce_amd.c linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce_amd.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce_amd.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce_amd.c 2010-03-20 16:58:38.992510812 -0400
+@@ -388,7 +388,7 @@ static ssize_t store(struct kobject *kob
+ return ret;
+ }
+
+-static struct sysfs_ops threshold_ops = {
++static const struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+ };
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce.c linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mcheck/mce.c 2010-03-20 16:58:38.992510812 -0400
+@@ -201,7 +201,7 @@ static void print_mce(struct mce *m)
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+
+- if (m->cs == __KERNEL_CS)
++ if (m->cs == __KERNEL_CS || m->cs == __KERNEXEC_KERNEL_CS)
+ print_symbol("{%s}", m->ip);
+ pr_cont("\n");
+ }
+@@ -1444,14 +1444,14 @@ void __cpuinit mcheck_cpu_init(struct cp
+ */
+
+ static DEFINE_SPINLOCK(mce_state_lock);
+-static int open_count; /* #times opened */
++static atomic_t open_count; /* #times opened */
+ static int open_exclu; /* already open exclusive? */
+
+ static int mce_open(struct inode *inode, struct file *file)
+ {
+ spin_lock(&mce_state_lock);
+
+- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
++ if (open_exclu || (atomic_read(&open_count) && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_state_lock);
+
+ return -EBUSY;
+@@ -1459,7 +1459,7 @@ static int mce_open(struct inode *inode,
+
+ if (file->f_flags & O_EXCL)
+ open_exclu = 1;
+- open_count++;
++ atomic_inc(&open_count);
+
+ spin_unlock(&mce_state_lock);
+
+@@ -1470,7 +1470,7 @@ static int mce_release(struct inode *ino
+ {
+ spin_lock(&mce_state_lock);
+
+- open_count--;
++ atomic_dec(&open_count);
+ open_exclu = 0;
+
+ spin_unlock(&mce_state_lock);
+@@ -1610,6 +1610,7 @@ static struct miscdevice mce_log_device
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
++ {NULL, NULL}, NULL, NULL
+ };
+
+ /*
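Aside on the mce.c hunks just above: the open counter moves from a plain int to atomic_t so reads and updates of the count cannot tear even if a path ever touches it outside mce_state_lock. A minimal sketch of the same pattern follows, not from the patch; it uses C11 atomics as a user-space stand-in for the kernel's atomic_t API, and the device/function names are hypothetical.

    /* Sketch only -- atomic open counting in the style of the hunk. */
    #include <stdatomic.h>
    #include <errno.h>
    #include <stdbool.h>

    static atomic_int open_count;   /* kernel: static atomic_t open_count; */
    static bool open_exclusive;     /* kernel: open_exclu, under the lock */

    static int dev_open(bool want_exclusive)
    {
        /* kernel: atomic_read(&open_count) under mce_state_lock */
        if (open_exclusive ||
            (atomic_load(&open_count) && want_exclusive))
            return -EBUSY;                 /* already in use */

        if (want_exclusive)
            open_exclusive = true;
        atomic_fetch_add(&open_count, 1);  /* kernel: atomic_inc() */
        return 0;
    }

    static void dev_release(void)
    {
        atomic_fetch_sub(&open_count, 1);  /* kernel: atomic_dec() */
        open_exclusive = false;
    }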
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/amd.c linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/amd.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/amd.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/amd.c 2010-03-20 16:58:38.992510812 -0400
+@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base
+ return 0;
+ }
+
+-static struct mtrr_ops amd_mtrr_ops = {
++static const struct mtrr_ops amd_mtrr_ops = {
+ .vendor = X86_VENDOR_AMD,
+ .set = amd_set_mtrr,
+ .get = amd_get_mtrr,
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/centaur.c linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/centaur.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/centaur.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/centaur.c 2010-03-20 16:58:38.992510812 -0400
+@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long
+ return 0;
+ }
+
+-static struct mtrr_ops centaur_mtrr_ops = {
++static const struct mtrr_ops centaur_mtrr_ops = {
+ .vendor = X86_VENDOR_CENTAUR,
+ .set = centaur_set_mcr,
+ .get = centaur_get_mcr,
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/cyrix.c linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/cyrix.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/cyrix.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/cyrix.c 2010-03-20 16:58:38.992510812 -0400
+@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
+ post_set();
+ }
+
+-static struct mtrr_ops cyrix_mtrr_ops = {
++static const struct mtrr_ops cyrix_mtrr_ops = {
+ .vendor = X86_VENDOR_CYRIX,
+ .set_all = cyrix_set_all,
+ .set = cyrix_set_arr,
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/generic.c linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/generic.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/generic.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/generic.c 2010-03-20 16:58:38.992510812 -0400
+@@ -29,7 +29,7 @@ static struct fixed_range_block fixed_ra
+ { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
+ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
+ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
+- {}
++ { 0, 0 }
+ };
+
+ static unsigned long smp_changes_mask;
+@@ -752,7 +752,7 @@ int positive_have_wrcomb(void)
+ /*
+ * Generic structure...
+ */
+-struct mtrr_ops generic_mtrr_ops = {
++const struct mtrr_ops generic_mtrr_ops = {
+ .use_intel_if = 1,
+ .set_all = generic_set_all,
+ .get = generic_get_mtrr,
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/main.c linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/main.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/main.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/main.c 2010-03-20 16:58:38.996547732 -0400
+@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
+ u64 size_or_mask, size_and_mask;
+ static bool mtrr_aps_delayed_init;
+
+-static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
++static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __read_only;
+
+-struct mtrr_ops *mtrr_if;
++const struct mtrr_ops *mtrr_if;
+
+ static void set_mtrr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type);
+
+-void set_mtrr_ops(struct mtrr_ops *ops)
++void set_mtrr_ops(const struct mtrr_ops *ops)
+ {
+ if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
+ mtrr_ops[ops->vendor] = ops;
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/mtrr.h linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/mtrr.h
+--- linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/mtrr.h 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/mtrr/mtrr.h 2010-03-20 16:58:38.996547732 -0400
+@@ -12,19 +12,19 @@
+ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+
+ struct mtrr_ops {
+- u32 vendor;
+- u32 use_intel_if;
+- void (*set)(unsigned int reg, unsigned long base,
++ const u32 vendor;
++ const u32 use_intel_if;
++ void (* const set)(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type);
+- void (*set_all)(void);
++ void (* const set_all)(void);
+
+- void (*get)(unsigned int reg, unsigned long *base,
++ void (* const get)(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type);
+- int (*get_free_region)(unsigned long base, unsigned long size,
++ int (* const get_free_region)(unsigned long base, unsigned long size,
+ int replace_reg);
+- int (*validate_add_page)(unsigned long base, unsigned long size,
++ int (* const validate_add_page)(unsigned long base, unsigned long size,
+ unsigned int type);
+- int (*have_wrcomb)(void);
++ int (* const have_wrcomb)(void);
+ };
+
+ extern int generic_get_free_region(unsigned long base, unsigned long size,
+@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsig
+ extern int generic_validate_add_page(unsigned long base, unsigned long size,
+ unsigned int type);
+
+-extern struct mtrr_ops generic_mtrr_ops;
++extern const struct mtrr_ops generic_mtrr_ops;
+
+ extern int positive_have_wrcomb(void);
+
+@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int in
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
+ void get_mtrr_state(void);
+
+-extern void set_mtrr_ops(struct mtrr_ops *ops);
++extern void set_mtrr_ops(const struct mtrr_ops *ops);
+
+ extern u64 size_or_mask, size_and_mask;
+-extern struct mtrr_ops *mtrr_if;
++extern const struct mtrr_ops *mtrr_if;
+
+ #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
+ #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
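Aside on the mtrr_ops changes above: const-qualifying a table of function pointers lets the compiler place it in a read-only section, so a stray or attacker-controlled kernel write can no longer silently retarget the hooks. A minimal stand-alone sketch of the idiom follows, not from the patch; the types and functions are hypothetical stand-ins.

    /* Sketch only -- read-only ops table in the style of the hunk. */
    struct ops {
        int  (* const probe)(void);     /* const members: set once, */
        void (* const shutdown)(void);  /* at initialization only   */
    };

    static int  my_probe(void)    { return 0; }
    static void my_shutdown(void) { }

    /* Lands in .rodata: "my_ops.probe = evil;" is a compile-time
     * error, and a stray runtime store faults instead of working. */
    static const struct ops my_ops = {
        .probe    = my_probe,
        .shutdown = my_shutdown,
    };

    static int run(void)
    {
        return my_ops.probe();  /* calls through the fixed pointer */
    }

The patch's "cannot be const" comment on intel_arch_wd_ops in the next file shows the trade-off: a table that is legitimately rewritten at runtime has to stay writable.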
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/perfctr-watchdog.c linux-2.6.33.1/arch/x86/kernel/cpu/perfctr-watchdog.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/perfctr-watchdog.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/perfctr-watchdog.c 2010-03-20 16:58:38.996547732 -0400
+@@ -30,11 +30,11 @@ struct nmi_watchdog_ctlblk {
+
+ /* Interface defining a CPU specific perfctr watchdog */
+ struct wd_ops {
+- int (*reserve)(void);
+- void (*unreserve)(void);
+- int (*setup)(unsigned nmi_hz);
+- void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
+- void (*stop)(void);
++ int (* const reserve)(void);
++ void (* const unreserve)(void);
++ int (* const setup)(unsigned nmi_hz);
++ void (* const rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
++ void (* const stop)(void);
+ unsigned perfctr;
+ unsigned evntsel;
+ u64 checkbit;
+@@ -645,6 +645,7 @@ static const struct wd_ops p4_wd_ops = {
+ #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+ #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
++/* cannot be const */
+ static struct wd_ops intel_arch_wd_ops;
+
+ static int setup_intel_arch_watchdog(unsigned nmi_hz)
+@@ -697,6 +698,7 @@ static int setup_intel_arch_watchdog(uns
+ return 1;
+ }
+
++/* cannot be const */
+ static struct wd_ops intel_arch_wd_ops __read_mostly = {
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+diff -urNp linux-2.6.33.1/arch/x86/kernel/cpu/perf_event.c linux-2.6.33.1/arch/x86/kernel/cpu/perf_event.c
+--- linux-2.6.33.1/arch/x86/kernel/cpu/perf_event.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/cpu/perf_event.c 2010-03-20 16:58:38.996547732 -0400
+@@ -2426,7 +2426,7 @@ perf_callchain_user(struct pt_regs *regs
+ break;
+
+ callchain_store(entry, frame.return_address);
+- fp = frame.next_frame;
++ fp = (__force const void __user *)frame.next_frame;
+ }
+ }
+
+diff -urNp linux-2.6.33.1/arch/x86/kernel/crash.c linux-2.6.33.1/arch/x86/kernel/crash.c
+--- linux-2.6.33.1/arch/x86/kernel/crash.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/crash.c 2010-03-20 16:58:38.996547732 -0400
+@@ -41,7 +41,7 @@ static void kdump_nmi_callback(int cpu,
+ regs = args->regs;
+
+ #ifdef CONFIG_X86_32
+- if (!user_mode_vm(regs)) {
++ if (!user_mode(regs)) {
+ crash_fixup_ss_esp(&fixed_regs, regs);
+ regs = &fixed_regs;
+ }
+diff -urNp linux-2.6.33.1/arch/x86/kernel/doublefault_32.c linux-2.6.33.1/arch/x86/kernel/doublefault_32.c
+--- linux-2.6.33.1/arch/x86/kernel/doublefault_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/doublefault_32.c 2010-03-20 16:58:38.996547732 -0400
+@@ -11,7 +11,7 @@
+
+ #define DOUBLEFAULT_STACKSIZE (1024)
+ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
+-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
++#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE-2)
+
+ #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+
+@@ -21,7 +21,7 @@ static void doublefault_fn(void)
+ unsigned long gdt, tss;
+
+ store_gdt(&gdt_desc);
+- gdt = gdt_desc.address;
++ gdt = (unsigned long)gdt_desc.address;
+
+ printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+
+@@ -58,10 +58,10 @@ struct tss_struct doublefault_tss __cach
+ /* 0x2 bit is always set */
+ .flags = X86_EFLAGS_SF | 0x2,
+ .sp = STACK_START,
+- .es = __USER_DS,
++ .es = __KERNEL_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+- .ds = __USER_DS,
++ .ds = __KERNEL_DS,
+ .fs = __KERNEL_PERCPU,
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
+diff -urNp linux-2.6.33.1/arch/x86/kernel/dumpstack_32.c linux-2.6.33.1/arch/x86/kernel/dumpstack_32.c
+--- linux-2.6.33.1/arch/x86/kernel/dumpstack_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/kernel/dumpstack_32.c 2010-03-20
16:58:38.996547732 -0400 +@@ -112,11 +112,12 @@ void show_registers(struct pt_regs *regs + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ +- if (!user_mode_vm(regs)) { ++ if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; ++ unsigned long cs_base = get_desc_base(&get_cpu_gdt_table(smp_processor_id())[(0xffff & regs->cs) >> 3]); + + printk(KERN_EMERG "Stack:\n"); + show_stack_log_lvl(NULL, regs, ®s->sp, +@@ -124,10 +125,10 @@ void show_registers(struct pt_regs *regs + + printk(KERN_EMERG "Code: "); + +- ip = (u8 *)regs->ip - code_prologue; ++ ip = (u8 *)regs->ip - code_prologue + cs_base; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ +- ip = (u8 *)regs->ip; ++ ip = (u8 *)regs->ip + cs_base; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { +@@ -136,7 +137,7 @@ void show_registers(struct pt_regs *regs + printk(" Bad EIP value."); + break; + } +- if (ip == (u8 *)regs->ip) ++ if (ip == (u8 *)regs->ip + cs_base) + printk("<%02x> ", c); + else + printk("%02x ", c); +@@ -149,6 +150,7 @@ int is_valid_bugaddr(unsigned long ip) + { + unsigned short ud2; + ++ ip = ktla_ktva(ip); + if (ip < PAGE_OFFSET) + return 0; + if (probe_kernel_address((unsigned short *)ip, ud2)) +diff -urNp linux-2.6.33.1/arch/x86/kernel/dumpstack.c linux-2.6.33.1/arch/x86/kernel/dumpstack.c +--- linux-2.6.33.1/arch/x86/kernel/dumpstack.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/dumpstack.c 2010-03-20 16:58:38.996547732 -0400 +@@ -207,7 +207,7 @@ void dump_stack(void) + #endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", +- current->pid, current->comm, print_tainted(), ++ task_pid_nr(current), current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +@@ -268,7 +268,7 @@ void __kprobes oops_end(unsigned long fl + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); +- do_exit(signr); ++ do_group_exit(signr); + } + + int __kprobes __die(const char *str, struct pt_regs *regs, long err) +@@ -295,7 +295,7 @@ int __kprobes __die(const char *str, str + + show_registers(regs); + #ifdef CONFIG_X86_32 +- if (user_mode_vm(regs)) { ++ if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } else { +@@ -323,7 +323,7 @@ void die(const char *str, struct pt_regs + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + +- if (!user_mode_vm(regs)) ++ if (!user_mode(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) +diff -urNp linux-2.6.33.1/arch/x86/kernel/e820.c linux-2.6.33.1/arch/x86/kernel/e820.c +--- linux-2.6.33.1/arch/x86/kernel/e820.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/e820.c 2010-03-20 16:58:38.996547732 -0400 +@@ -28,6 +28,8 @@ + #include <asm/setup.h> + #include <asm/trampoline.h> + ++#include "acpi/realmode/wakeup.h" ++ + /* + * The e820 map is the map that gets modified e.g. 
with command line parameters + * and that is also registered with modifications in the kernel resource tree +@@ -741,8 +743,7 @@ static struct early_res early_res[MAX_EA + */ + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, + #endif +- +- {} ++ { 0, 0, {0}, 0 } + }; + + static int __init find_overlapped_early(u64 start, u64 end) +diff -urNp linux-2.6.33.1/arch/x86/kernel/efi_32.c linux-2.6.33.1/arch/x86/kernel/efi_32.c +--- linux-2.6.33.1/arch/x86/kernel/efi_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/efi_32.c 2010-03-20 16:58:38.996547732 -0400 +@@ -38,70 +38,38 @@ + */ + + static unsigned long efi_rt_eflags; +-static pgd_t efi_bak_pg_dir_pointer[2]; ++static pgd_t __initdata efi_bak_pg_dir_pointer[KERNEL_PGD_PTRS]; + +-void efi_call_phys_prelog(void) ++void __init efi_call_phys_prelog(void) + { +- unsigned long cr4; +- unsigned long temp; + struct desc_ptr gdt_descr; + + local_irq_save(efi_rt_eflags); + +- /* +- * If I don't have PAE, I should just duplicate two entries in page +- * directory. If I have PAE, I just need to duplicate one entry in +- * page directory. +- */ +- cr4 = read_cr4_safe(); + +- if (cr4 & X86_CR4_PAE) { +- efi_bak_pg_dir_pointer[0].pgd = +- swapper_pg_dir[pgd_index(0)].pgd; +- swapper_pg_dir[0].pgd = +- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; +- } else { +- efi_bak_pg_dir_pointer[0].pgd = +- swapper_pg_dir[pgd_index(0)].pgd; +- efi_bak_pg_dir_pointer[1].pgd = +- swapper_pg_dir[pgd_index(0x400000)].pgd; +- swapper_pg_dir[pgd_index(0)].pgd = +- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; +- temp = PAGE_OFFSET + 0x400000; +- swapper_pg_dir[pgd_index(0x400000)].pgd = +- swapper_pg_dir[pgd_index(temp)].pgd; +- } ++ clone_pgd_range(efi_bak_pg_dir_pointer, swapper_pg_dir, KERNEL_PGD_PTRS); ++ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, ++ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); + + /* + * After the lock is released, the original page table is restored. + */ + __flush_tlb_all(); + +- gdt_descr.address = __pa(get_cpu_gdt_table(0)); ++ gdt_descr.address = (struct desc_struct *)__pa(get_cpu_gdt_table(0)); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + } + +-void efi_call_phys_epilog(void) ++void __init efi_call_phys_epilog(void) + { +- unsigned long cr4; + struct desc_ptr gdt_descr; + +- gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); ++ gdt_descr.address = get_cpu_gdt_table(0); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + +- cr4 = read_cr4_safe(); +- +- if (cr4 & X86_CR4_PAE) { +- swapper_pg_dir[pgd_index(0)].pgd = +- efi_bak_pg_dir_pointer[0].pgd; +- } else { +- swapper_pg_dir[pgd_index(0)].pgd = +- efi_bak_pg_dir_pointer[0].pgd; +- swapper_pg_dir[pgd_index(0x400000)].pgd = +- efi_bak_pg_dir_pointer[1].pgd; +- } ++ clone_pgd_range(swapper_pg_dir, efi_bak_pg_dir_pointer, KERNEL_PGD_PTRS); + + /* + * After the lock is released, the original page table is restored. +diff -urNp linux-2.6.33.1/arch/x86/kernel/efi_stub_32.S linux-2.6.33.1/arch/x86/kernel/efi_stub_32.S +--- linux-2.6.33.1/arch/x86/kernel/efi_stub_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/efi_stub_32.S 2010-03-20 16:58:38.996547732 -0400 +@@ -6,6 +6,7 @@ + */ + + #include <linux/linkage.h> ++#include <linux/init.h> + #include <asm/page_types.h> + + /* +@@ -20,7 +21,7 @@ + * service functions will comply with gcc calling convention, too. + */ + +-.text ++__INIT + ENTRY(efi_call_phys) + /* + * 0. The function can only be called in Linux kernel. 
So CS has been +@@ -36,9 +37,7 @@ ENTRY(efi_call_phys) + * The mapping of lower virtual memory has been created in prelog and + * epilog. + */ +- movl $1f, %edx +- subl $__PAGE_OFFSET, %edx +- jmp *%edx ++ jmp 1f-__PAGE_OFFSET + 1: + + /* +@@ -47,14 +46,8 @@ ENTRY(efi_call_phys) + * parameter 2, ..., param n. To make things easy, we save the return + * address of efi_call_phys in a global variable. + */ +- popl %edx +- movl %edx, saved_return_addr +- /* get the function pointer into ECX*/ +- popl %ecx +- movl %ecx, efi_rt_function_ptr +- movl $2f, %edx +- subl $__PAGE_OFFSET, %edx +- pushl %edx ++ popl (saved_return_addr) ++ popl (efi_rt_function_ptr) + + /* + * 3. Clear PG bit in %CR0. +@@ -73,9 +66,8 @@ ENTRY(efi_call_phys) + /* + * 5. Call the physical function. + */ +- jmp *%ecx ++ call *(efi_rt_function_ptr-__PAGE_OFFSET) + +-2: + /* + * 6. After EFI runtime service returns, control will return to + * following instruction. We'd better readjust stack pointer first. +@@ -88,35 +80,28 @@ ENTRY(efi_call_phys) + movl %cr0, %edx + orl $0x80000000, %edx + movl %edx, %cr0 +- jmp 1f +-1: ++ + /* + * 8. Now restore the virtual mode from flat mode by + * adding EIP with PAGE_OFFSET. + */ +- movl $1f, %edx +- jmp *%edx ++ jmp 1f+__PAGE_OFFSET + 1: + + /* + * 9. Balance the stack. And because EAX contain the return value, + * we'd better not clobber it. + */ +- leal efi_rt_function_ptr, %edx +- movl (%edx), %ecx +- pushl %ecx ++ pushl (efi_rt_function_ptr) + + /* +- * 10. Push the saved return address onto the stack and return. ++ * 10. Return to the saved return address. + */ +- leal saved_return_addr, %edx +- movl (%edx), %ecx +- pushl %ecx +- ret ++ jmpl *(saved_return_addr) + ENDPROC(efi_call_phys) + .previous + +-.data ++__INITDATA + saved_return_addr: + .long 0 + efi_rt_function_ptr: +diff -urNp linux-2.6.33.1/arch/x86/kernel/entry_32.S linux-2.6.33.1/arch/x86/kernel/entry_32.S +--- linux-2.6.33.1/arch/x86/kernel/entry_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/entry_32.S 2010-03-20 16:58:39.000574964 -0400 +@@ -191,7 +191,7 @@ + + #endif /* CONFIG_X86_32_LAZY_GS */ + +-.macro SAVE_ALL ++.macro __SAVE_ALL _DS + cld + PUSH_GS + pushl %fs +@@ -224,7 +224,7 @@ + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 +- movl $(__USER_DS), %edx ++ movl $_DS, %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx +@@ -232,6 +232,15 @@ + SET_KERNEL_GS %edx + .endm + ++.macro SAVE_ALL ++#if defined(CONFIG_PAX_KERNEXEC) || defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) || defined(CONFIG_PAX_MEMORY_UDEREF) ++ __SAVE_ALL __KERNEL_DS ++ PAX_ENTER_KERNEL ++#else ++ __SAVE_ALL __USER_DS ++#endif ++.endm ++ + .macro RESTORE_INT_REGS + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 +@@ -356,7 +365,15 @@ check_userspace: + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ jae resume_userspace ++ ++ PAX_EXIT_KERNEL ++ jmp resume_kernel ++#else + jb resume_kernel # not returning to v8086 or userspace ++#endif + + ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT +@@ -422,10 +439,9 @@ sysenter_past_esp: + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. +- * A tiny bit of offset fixup is necessary - 4*4 means the 4 words +- * pushed above; +8 corresponds to copy_thread's esp0 setting. 
+ */ +- pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) ++ GET_THREAD_INFO(%ebp) ++ pushl TI_sysenter_return(%ebp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + +@@ -438,9 +454,19 @@ sysenter_past_esp: + * Load the potential sixth argument from user stack. + * Careful about security. + */ ++ movl PT_OLDESP(%esp),%ebp ++ ++#ifdef CONFIG_PAX_MEMORY_UDEREF ++ mov PT_OLDSS(%esp),%ds ++1: movl %ds:(%ebp),%ebp ++ push %ss ++ pop %ds ++#else + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault + 1: movl (%ebp),%ebp ++#endif ++ + movl %ebp,PT_EBP(%esp) + .section __ex_table,"a" + .align 4 +@@ -463,12 +489,23 @@ sysenter_do_call: + testl $_TIF_ALLWORK_MASK, %ecx + jne sysexit_audit + sysenter_exit: ++ ++#ifdef CONFIG_PAX_RANDKSTACK ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ call pax_randomize_kstack ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++#endif ++ + /* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp + TRACE_IRQS_ON + 1: mov PT_FS(%esp), %fs ++2: mov PT_DS(%esp), %ds ++3: mov PT_ES(%esp), %es + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +@@ -512,11 +549,17 @@ sysexit_audit: + + CFI_ENDPROC + .pushsection .fixup,"ax" +-2: movl $0,PT_FS(%esp) ++4: movl $0,PT_FS(%esp) ++ jmp 1b ++5: movl $0,PT_DS(%esp) ++ jmp 1b ++6: movl $0,PT_ES(%esp) + jmp 1b + .section __ex_table,"a" + .align 4 +- .long 1b,2b ++ .long 1b,4b ++ .long 2b,5b ++ .long 3b,6b + .popsection + PTGS_TO_GS_EX + ENDPROC(ia32_sysenter_target) +@@ -550,6 +593,10 @@ syscall_exit: + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jne syscall_exit_work + ++#ifdef CONFIG_PAX_RANDKSTACK ++ call pax_randomize_kstack ++#endif ++ + restore_all: + TRACE_IRQS_IRET + restore_all_notrace: +@@ -614,7 +661,13 @@ ldt_ss: + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ +- PER_CPU(gdt_page, %ebx) ++#ifdef CONFIG_SMP ++ movl PER_CPU_VAR(cpu_number), %ebx ++ shll $PAGE_SHIFT_asm, %ebx ++ addl $cpu_gdt_table, %ebx ++#else ++ movl $cpu_gdt_table, %ebx ++#endif + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ +@@ -654,25 +707,19 @@ work_resched: + + work_notifysig: # deal with pending signals and + # notify-resume requests ++ movl %esp, %eax + #ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) +- movl %esp, %eax +- jne work_notifysig_v86 # returning to kernel-space or ++ jz 1f # returning to kernel-space or + # vm86-space +- xorl %edx, %edx +- call do_notify_resume +- jmp resume_userspace_sig + +- ALIGN +-work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + movl %eax, %esp +-#else +- movl %esp, %eax ++1: + #endif + xorl %edx, %edx + call do_notify_resume +@@ -707,6 +754,10 @@ END(syscall_exit_work) + + RING0_INT_FRAME # can't unwind into user space anyway + syscall_fault: ++#ifdef CONFIG_PAX_MEMORY_UDEREF ++ push %ss ++ pop %ds ++#endif + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +@@ -790,7 +841,13 @@ ptregs_clone: + * normal stack and adjusts ESP with the matching offset. 
+ */ + /* fixup the stack */ +- PER_CPU(gdt_page, %ebx) ++#ifdef CONFIG_SMP ++ movl PER_CPU_VAR(cpu_number), %ebx ++ shll $PAGE_SHIFT_asm, %ebx ++ addl $cpu_gdt_table, %ebx ++#else ++ movl $cpu_gdt_table, %ebx ++#endif + mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ + mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + shl $16, %eax +@@ -1254,7 +1311,6 @@ return_to_handler: + jmp *%ecx + #endif + +-.section .rodata,"a" + #include "syscall_table_32.S" + + syscall_table_size=(.-sys_call_table) +@@ -1306,12 +1362,15 @@ error_code: + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx ++ ++ PAX_ENTER_KERNEL ++ + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx +- movl $(__USER_DS), %ecx ++ movl $(__KERNEL_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF +@@ -1407,6 +1466,9 @@ nmi_stack_correct: + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi ++ ++ PAX_EXIT_KERNEL ++ + jmp restore_all_notrace + CFI_ENDPROC + +@@ -1447,6 +1509,9 @@ nmi_espfix_stack: + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi ++ ++ PAX_EXIT_KERNEL ++ + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 +diff -urNp linux-2.6.33.1/arch/x86/kernel/entry_64.S linux-2.6.33.1/arch/x86/kernel/entry_64.S +--- linux-2.6.33.1/arch/x86/kernel/entry_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/entry_64.S 2010-03-20 16:58:39.000574964 -0400 +@@ -53,6 +53,7 @@ + #include <asm/paravirt.h> + #include <asm/ftrace.h> + #include <asm/percpu.h> ++#include <asm/pgtable.h> + + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. 
*/ + #include <linux/elf-em.h> +@@ -800,6 +801,7 @@ END(interrupt) + CFI_ADJUST_CFA_OFFSET 10*8 + call save_args + PARTIAL_FRAME 0 ++ PAX_ENTER_KERNEL + call \func + .endm + +@@ -825,6 +827,7 @@ ret_from_intr: + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + exit_intr: ++ PAX_EXIT_KERNEL + GET_THREAD_INFO(%rcx) + testl $3,CS-ARGOFFSET(%rsp) + je retint_kernel +@@ -1040,6 +1043,7 @@ ENTRY(\sym) + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + DEFAULT_FRAME 0 ++ PAX_ENTER_KERNEL + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym +@@ -1057,6 +1061,7 @@ ENTRY(\sym) + subq $15*8, %rsp + call save_paranoid + TRACE_IRQS_OFF ++ PAX_ENTER_KERNEL + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym +@@ -1074,9 +1079,15 @@ ENTRY(\sym) + subq $15*8, %rsp + call save_paranoid + TRACE_IRQS_OFF ++ PAX_ENTER_KERNEL + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ +- PER_CPU(init_tss, %r12) ++#ifdef CONFIG_SMP ++ imul $TSS_size, PER_CPU_VAR(cpu_number), %r12d ++ lea init_tss(%r12), %r12 ++#else ++ lea init_tss(%rip), %r12 ++#endif + subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + call \do_sym + addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) +@@ -1093,6 +1104,7 @@ ENTRY(\sym) + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + DEFAULT_FRAME 0 ++ PAX_ENTER_KERNEL + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ +@@ -1112,6 +1124,7 @@ ENTRY(\sym) + call save_paranoid + DEFAULT_FRAME 0 + TRACE_IRQS_OFF ++ PAX_ENTER_KERNEL + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ +@@ -1373,11 +1386,13 @@ ENTRY(paranoid_exit) + testl $3,CS(%rsp) + jnz paranoid_userspace + paranoid_swapgs: ++ PAX_EXIT_KERNEL + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK + RESTORE_ALL 8 + jmp irq_return + paranoid_restore: ++ PAX_EXIT_KERNEL + TRACE_IRQS_IRETQ 0 + RESTORE_ALL 8 + jmp irq_return +@@ -1499,6 +1514,7 @@ ENTRY(nmi) + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 ++ PAX_ENTER_KERNEL + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi +@@ -1514,6 +1530,7 @@ ENTRY(nmi) + nmi_swapgs: + SWAPGS_UNSAFE_STACK + nmi_restore: ++ PAX_EXIT_KERNEL + RESTORE_ALL 8 + jmp irq_return + nmi_userspace: +diff -urNp linux-2.6.33.1/arch/x86/kernel/ftrace.c linux-2.6.33.1/arch/x86/kernel/ftrace.c +--- linux-2.6.33.1/arch/x86/kernel/ftrace.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/ftrace.c 2010-03-20 16:58:39.000574964 -0400 +@@ -151,7 +151,9 @@ void ftrace_nmi_enter(void) + { + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { + smp_rmb(); ++ pax_open_kernel(); + ftrace_mod_code(); ++ pax_close_kernel(); + atomic_inc(&nmi_update_count); + } + /* Must have previous changes seen before executions */ +@@ -234,7 +236,7 @@ do_ftrace_mod_code(unsigned long ip, voi + + + +-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; ++static unsigned char ftrace_nop[MCOUNT_INSN_SIZE] __read_only; + + static unsigned char *ftrace_nop_replace(void) + { +@@ -247,6 +249,8 @@ ftrace_modify_code(unsigned long ip, uns + { + unsigned char replaced[MCOUNT_INSN_SIZE]; + ++ ip = ktla_ktva(ip); ++ + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting +@@ -303,7 +307,7 @@ int ftrace_update_ftrace_func(ftrace_fun + 
unsigned char old[MCOUNT_INSN_SIZE], *new; + int ret; + +- memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); ++ memcpy(old, (void *)ktla_ktva((unsigned long)ftrace_call), MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + +@@ -356,15 +360,15 @@ int __init ftrace_dyn_arch_init(void *da + switch (faulted) { + case 0: + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); +- memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); ++ memcpy(ftrace_nop, ktla_ktva(ftrace_test_p6nop), MCOUNT_INSN_SIZE); + break; + case 1: + pr_info("converting mcount calls to 66 66 66 66 90\n"); +- memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); ++ memcpy(ftrace_nop, ktla_ktva(ftrace_test_nop5), MCOUNT_INSN_SIZE); + break; + case 2: + pr_info("converting mcount calls to jmp . + 5\n"); +- memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); ++ memcpy(ftrace_nop, ktla_ktva(ftrace_test_jmp), MCOUNT_INSN_SIZE); + break; + } + +@@ -385,6 +389,8 @@ static int ftrace_mod_jmp(unsigned long + { + unsigned char code[MCOUNT_INSN_SIZE]; + ++ ip = ktla_ktva(ip); ++ + if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + +diff -urNp linux-2.6.33.1/arch/x86/kernel/head32.c linux-2.6.33.1/arch/x86/kernel/head32.c +--- linux-2.6.33.1/arch/x86/kernel/head32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/head32.c 2010-03-20 16:58:39.000574964 -0400 +@@ -16,6 +16,7 @@ + #include <asm/apic.h> + #include <asm/io_apic.h> + #include <asm/bios_ebda.h> ++#include <asm/boot.h> + + static void __init i386_default_early_setup(void) + { +@@ -29,7 +30,7 @@ static void __init i386_default_early_se + + void __init i386_start_kernel(void) + { +- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); ++ reserve_early(LOAD_PHYSICAL_ADDR, __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + + #ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ +diff -urNp linux-2.6.33.1/arch/x86/kernel/head_32.S linux-2.6.33.1/arch/x86/kernel/head_32.S +--- linux-2.6.33.1/arch/x86/kernel/head_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/head_32.S 2010-03-20 16:58:39.000574964 -0400 +@@ -21,10 +21,17 @@ + #include <asm/msr-index.h> + #include <asm/cpufeature.h> + #include <asm/percpu.h> ++#include <asm/msr-index.h> + + /* Physical address */ + #define pa(X) ((X) - __PAGE_OFFSET) + ++#ifdef CONFIG_PAX_KERNEXEC ++#define ta(X) (X) ++#else ++#define ta(X) ((X) - __PAGE_OFFSET) ++#endif ++ + /* + * References to members of the new_cpu_data structure. + */ +@@ -54,11 +61,7 @@ + * and small than max_low_pfn, otherwise will waste some page table entries + */ + +-#if PTRS_PER_PMD > 1 +-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) +-#else +-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) +-#endif ++#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PTE) + + /* Enough space to fit pagetables for the low memory linear map */ + MAPPING_BEYOND_END = \ +@@ -75,6 +78,12 @@ INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_P + RESERVE_BRK(pagetables, INIT_MAP_SIZE) + + /* ++ * Real beginning of normal "text" segment ++ */ ++ENTRY(stext) ++ENTRY(_stext) ++ ++/* + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, + * %esi points to the real-mode code as a 32-bit pointer. + * CS and DS must be 4 GB flat segments, but we don't depend on +@@ -82,6 +91,13 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) + * can. 
+ */ + __HEAD ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ jmp startup_32 ++/* PaX: fill first page in .text with int3 to catch NULL derefs in kernel mode */ ++.fill PAGE_SIZE-5,1,0xcc ++#endif ++ + ENTRY(startup_32) + /* test KEEP_SEGMENTS flag to see if the bootloader is asking + us to not reload segments */ +@@ -99,6 +115,55 @@ ENTRY(startup_32) + movl %eax,%gs + 2: + ++#ifdef CONFIG_SMP ++ movl $pa(cpu_gdt_table),%edi ++ movl $__per_cpu_load,%eax ++ movw %ax,__KERNEL_PERCPU + 2(%edi) ++ rorl $16,%eax ++ movb %al,__KERNEL_PERCPU + 4(%edi) ++ movb %ah,__KERNEL_PERCPU + 7(%edi) ++ movl $__per_cpu_end - 1,%eax ++ subl $__per_cpu_start,%eax ++ movw %ax,__KERNEL_PERCPU + 0(%edi) ++#endif ++ ++#ifdef CONFIG_PAX_MEMORY_UDEREF ++ movl $NR_CPUS,%ecx ++ movl $pa(cpu_gdt_table),%edi ++1: ++ movl $((((__PAGE_OFFSET-1) & 0xf0000000) >> 12) | 0x00c09700),GDT_ENTRY_KERNEL_DS * 8 + 4(%edi) ++ addl $PAGE_SIZE_asm,%edi ++ loop 1b ++#endif ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ movl $pa(boot_gdt),%edi ++ movl $__LOAD_PHYSICAL_ADDR,%eax ++ movw %ax,__BOOT_CS + 2(%edi) ++ rorl $16,%eax ++ movb %al,__BOOT_CS + 4(%edi) ++ movb %ah,__BOOT_CS + 7(%edi) ++ rorl $16,%eax ++ ++ ljmp $(__BOOT_CS),$1f ++1: ++ ++ movl $NR_CPUS,%ecx ++ movl $pa(cpu_gdt_table),%edi ++ addl $__PAGE_OFFSET,%eax ++1: ++ movw %ax,__KERNEL_CS + 2(%edi) ++ movw %ax,__KERNEXEC_KERNEL_CS + 2(%edi) ++ rorl $16,%eax ++ movb %al,__KERNEL_CS + 4(%edi) ++ movb %al,__KERNEXEC_KERNEL_CS + 4(%edi) ++ movb %ah,__KERNEL_CS + 7(%edi) ++ movb %ah,__KERNEXEC_KERNEL_CS + 7(%edi) ++ rorl $16,%eax ++ addl $PAGE_SIZE_asm,%edi ++ loop 1b ++#endif ++ + /* + * Clear BSS first so that there are no surprises... + */ +@@ -142,9 +207,7 @@ ENTRY(startup_32) + cmpl $num_subarch_entries, %eax + jae bad_subarch + +- movl pa(subarch_entries)(,%eax,4), %eax +- subl $__PAGE_OFFSET, %eax +- jmp *%eax ++ jmp *pa(subarch_entries)(,%eax,4) + + bad_subarch: + WEAK(lguest_entry) +@@ -156,10 +219,10 @@ WEAK(xen_entry) + __INITDATA + + subarch_entries: +- .long default_entry /* normal x86/PC */ +- .long lguest_entry /* lguest hypervisor */ +- .long xen_entry /* Xen hypervisor */ +- .long default_entry /* Moorestown MID */ ++ .long ta(default_entry) /* normal x86/PC */ ++ .long ta(lguest_entry) /* lguest hypervisor */ ++ .long ta(xen_entry) /* Xen hypervisor */ ++ .long ta(default_entry) /* Moorestown MID */ + num_subarch_entries = (. 
- subarch_entries) / 4 + .previous + #endif /* CONFIG_PARAVIRT */ +@@ -220,8 +283,11 @@ default_entry: + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ +- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax +- movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) ++#ifdef CONFIG_COMPAT_VDSO ++ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR+_PAGE_USER,pa(swapper_pg_pmd+0x1000*KPMDS-8) ++#else ++ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,pa(swapper_pg_pmd+0x1000*KPMDS-8) ++#endif + #else /* Not PAE */ + + page_pde_offset = (__PAGE_OFFSET >> 20); +@@ -251,8 +317,11 @@ page_pde_offset = (__PAGE_OFFSET >> 20); + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ +- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax +- movl %eax,pa(swapper_pg_dir+0xffc) ++#ifdef CONFIG_COMPAT_VDSO ++ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR+_PAGE_USER,pa(swapper_pg_dir+0xffc) ++#else ++ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,pa(swapper_pg_dir+0xffc) ++#endif + #endif + jmp 3f + /* +@@ -299,6 +368,7 @@ ENTRY(startup_32_smp) + orl %edx,%eax + movl %eax,%cr4 + ++#ifdef CONFIG_X86_PAE + testb $X86_CR4_PAE, %al # check if PAE is enabled + jz 6f + +@@ -323,6 +393,9 @@ ENTRY(startup_32_smp) + /* Make changes effective */ + wrmsr + ++ btsl $_PAGE_BIT_NX-32,pa(__supported_pte_mask+4) ++#endif ++ + 6: + + /* +@@ -348,9 +421,7 @@ ENTRY(startup_32_smp) + + #ifdef CONFIG_SMP + cmpb $0, ready +- jz 1f /* Initial CPU cleans BSS */ +- jmp checkCPUtype +-1: ++ jnz checkCPUtype /* Initial CPU cleans BSS */ + #endif /* CONFIG_SMP */ + + /* +@@ -428,7 +499,7 @@ is386: movl $2,%ecx # set MP + 1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ss # after changing gdt. + +- movl $(__USER_DS),%eax # DS/ES contains default USER segment ++# movl $(__KERNEL_DS),%eax # DS/ES contains default KERNEL segment + movl %eax,%ds + movl %eax,%es + +@@ -442,8 +513,11 @@ is386: movl $2,%ecx # set MP + */ + cmpb $0,ready + jne 1f +- movl $per_cpu__gdt_page,%eax ++ movl $cpu_gdt_table,%eax + movl $per_cpu__stack_canary,%ecx ++#ifdef CONFIG_SMP ++ addl $__per_cpu_load,%ecx ++#endif + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) +@@ -461,10 +535,6 @@ is386: movl $2,%ecx # set MP + #ifdef CONFIG_SMP + movb ready, %cl + movb $1, ready +- cmpb $0,%cl # the first CPU calls start_kernel +- je 1f +- movl (stack_start), %esp +-1: + #endif /* CONFIG_SMP */ + jmp *(initial_code) + +@@ -550,22 +620,22 @@ early_page_fault: + jmp early_fault + + early_fault: +- cld + #ifdef CONFIG_PRINTK ++ cmpl $1,%ss:early_recursion_flag ++ je hlt_loop ++ incl %ss:early_recursion_flag ++ cld + pusha + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es +- cmpl $2,early_recursion_flag +- je hlt_loop +- incl early_recursion_flag + movl %cr2,%eax + pushl %eax + pushl %edx /* trapno */ + pushl $fault_msg + call printk ++; call dump_stack + #endif +- call dump_stack + hlt_loop: + hlt + jmp hlt_loop +@@ -573,8 +643,11 @@ hlt_loop: + /* This is the default interrupt "handler" :-) */ + ALIGN + ignore_int: +- cld + #ifdef CONFIG_PRINTK ++ cmpl $2,%ss:early_recursion_flag ++ je hlt_loop ++ incl %ss:early_recursion_flag ++ cld + pushl %eax + pushl %ecx + pushl %edx +@@ -583,9 +656,6 @@ ignore_int: + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es +- cmpl $2,early_recursion_flag +- je hlt_loop +- incl early_recursion_flag + pushl 16(%esp) + pushl 24(%esp) + pushl 32(%esp) +@@ -612,27 +682,37 @@ ENTRY(initial_code) + /* + * BSS 
section + */ +-__PAGE_ALIGNED_BSS +- .align PAGE_SIZE_asm + #ifdef CONFIG_X86_PAE ++.section .swapper_pg_pmd,"a",@progbits + swapper_pg_pmd: + .fill 1024*KPMDS,4,0 + #else ++.section .swapper_pg_dir,"a",@progbits + ENTRY(swapper_pg_dir) + .fill 1024,4,0 + #endif ++ + swapper_pg_fixmap: + .fill 1024,4,0 ++ ++.section .empty_zero_page,"a",@progbits + ENTRY(empty_zero_page) + .fill 4096,1,0 + + /* ++ * The IDT has to be page-aligned to simplify the Pentium ++ * F0 0F bug workaround.. We have a special link segment ++ * for this. ++ */ ++.section .idt,"a",@progbits ++ENTRY(idt_table) ++ .fill 256,8,0 ++ ++/* + * This starts the data section. + */ + #ifdef CONFIG_X86_PAE +-__PAGE_ALIGNED_DATA +- /* Page-aligned for the benefit of paravirt? */ +- .align PAGE_SIZE_asm ++.section .swapper_pg_dir,"a",@progbits + ENTRY(swapper_pg_dir) + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ + # if KPMDS == 3 +@@ -655,11 +735,12 @@ ENTRY(swapper_pg_dir) + + .data + ENTRY(stack_start) +- .long init_thread_union+THREAD_SIZE ++ .long init_thread_union+THREAD_SIZE-8 + .long __BOOT_DS + + ready: .byte 0 + ++.section .rodata,"a",@progbits + early_recursion_flag: + .long 0 + +@@ -695,7 +776,7 @@ fault_msg: + .word 0 # 32 bit align gdt_desc.address + boot_gdt_descr: + .word __BOOT_DS+7 +- .long boot_gdt - __PAGE_OFFSET ++ .long pa(boot_gdt) + + .word 0 # 32-bit align idt_desc.address + idt_descr: +@@ -706,7 +787,7 @@ idt_descr: + .word 0 # 32 bit align gdt_desc.address + ENTRY(early_gdt_descr) + .word GDT_ENTRIES*8-1 +- .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ ++ .long cpu_gdt_table /* Overwritten for secondary CPUs */ + + /* + * The boot_gdt must mirror the equivalent in setup.S and is +@@ -715,5 +796,65 @@ ENTRY(early_gdt_descr) + .align L1_CACHE_BYTES + ENTRY(boot_gdt) + .fill GDT_ENTRY_BOOT_CS,8,0 +- .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ +- .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ ++ .quad 0x00cf9b000000ffff /* kernel 4GB code at 0x00000000 */ ++ .quad 0x00cf93000000ffff /* kernel 4GB data at 0x00000000 */ ++ ++ .align PAGE_SIZE_asm ++ENTRY(cpu_gdt_table) ++ .rept NR_CPUS ++ .quad 0x0000000000000000 /* NULL descriptor */ ++ .quad 0x0000000000000000 /* 0x0b reserved */ ++ .quad 0x0000000000000000 /* 0x13 reserved */ ++ .quad 0x0000000000000000 /* 0x1b reserved */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ .quad 0x00cf9b000000ffff /* 0x20 alternate kernel 4GB code at 0x00000000 */ ++#else ++ .quad 0x0000000000000000 /* 0x20 unused */ ++#endif ++ ++ .quad 0x0000000000000000 /* 0x28 unused */ ++ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ ++ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ ++ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ ++ .quad 0x0000000000000000 /* 0x4b reserved */ ++ .quad 0x0000000000000000 /* 0x53 reserved */ ++ .quad 0x0000000000000000 /* 0x5b reserved */ ++ ++ .quad 0x00cf9b000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ ++ .quad 0x00cf93000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ ++ .quad 0x00cffb000000ffff /* 0x73 user 4GB code at 0x00000000 */ ++ .quad 0x00cff3000000ffff /* 0x7b user 4GB data at 0x00000000 */ ++ ++ .quad 0x0000000000000000 /* 0x80 TSS descriptor */ ++ .quad 0x0000000000000000 /* 0x88 LDT descriptor */ ++ ++ /* ++ * Segments used for calling PnP BIOS have byte granularity. ++ * The code segments and data segments have fixed 64k limits, ++ * the transfer segment sizes are set at run time. 
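
An aside on the cpu_gdt_table entries above: each .quad packs base, limit and access bits in the legacy descriptor layout, so values like 0x00cf9b000000ffff are easier to sanity-check with a small decoder (a sketch, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    static void decode_gdt_entry(uint64_t d)
    {
            uint32_t base   = (uint32_t)(((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24));
            uint32_t limit  = (uint32_t)((d & 0xffff) | (((d >> 48) & 0xf) << 16));
            unsigned access = (unsigned)((d >> 40) & 0xff);  /* P, DPL, type */
            unsigned flags  = (unsigned)((d >> 52) & 0xf);   /* G, D/B, L, AVL */

            if (flags & 0x8)                 /* G=1: limit counts 4 KiB pages */
                    limit = (limit << 12) | 0xfff;
            printf("base=%#010x limit=%#010x access=%#04x flags=%#x\n",
                   base, limit, access, flags);
    }

    int main(void)
    {
            decode_gdt_entry(0x00cf9b000000ffffULL);  /* kernel 4GB code, DPL 0 */
            decode_gdt_entry(0x00cff3000000ffffULL);  /* user 4GB data, DPL 3 */
            return 0;
    }

Note the boot_gdt hunk flips the "accessed" bit on in the kernel descriptors (0x9a -> 0x9b, 0x92 -> 0x93), presumably so the CPU never needs to write that bit itself once the GDT lives in read-only memory.
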
++ */ ++ .quad 0x00409b000000ffff /* 0x90 32-bit code */ ++ .quad 0x00009b000000ffff /* 0x98 16-bit code */ ++ .quad 0x000093000000ffff /* 0xa0 16-bit data */ ++ .quad 0x0000930000000000 /* 0xa8 16-bit data */ ++ .quad 0x0000930000000000 /* 0xb0 16-bit data */ ++ ++ /* ++ * The APM segments have byte granularity and their bases ++ * are set at run time. All have 64k limits. ++ */ ++ .quad 0x00409b000000ffff /* 0xb8 APM CS code */ ++ .quad 0x00009b000000ffff /* 0xc0 APM CS 16 code (16 bit) */ ++ .quad 0x004093000000ffff /* 0xc8 APM DS data */ ++ ++ .quad 0x00c0930000000000 /* 0xd0 - ESPFIX SS */ ++ .quad 0x0040930000000000 /* 0xd8 - PERCPU */ ++ .quad 0x0040910000000018 /* 0xe0 - STACK_CANARY */ ++ .quad 0x0000000000000000 /* 0xe8 - PCIBIOS_CS */ ++ .quad 0x0000000000000000 /* 0xf0 - PCIBIOS_DS */ ++ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ ++ ++ /* Be sure this is zeroed to avoid false validations in Xen */ ++ .fill PAGE_SIZE_asm - GDT_SIZE,1,0 ++ .endr +diff -urNp linux-2.6.33.1/arch/x86/kernel/head_64.S linux-2.6.33.1/arch/x86/kernel/head_64.S +--- linux-2.6.33.1/arch/x86/kernel/head_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/head_64.S 2010-03-20 16:58:39.000574964 -0400 +@@ -19,6 +19,7 @@ + #include <asm/cache.h> + #include <asm/processor-flags.h> + #include <asm/percpu.h> ++#include <asm/cpufeature.h> + + #ifdef CONFIG_PARAVIRT + #include <asm/asm-offsets.h> +@@ -38,6 +39,10 @@ L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET + L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) + L4_START_KERNEL = pgd_index(__START_KERNEL_map) + L3_START_KERNEL = pud_index(__START_KERNEL_map) ++L4_VMALLOC_START = pgd_index(VMALLOC_START) ++L3_VMALLOC_START = pud_index(VMALLOC_START) ++L4_VMEMMAP_START = pgd_index(VMEMMAP_START) ++L3_VMEMMAP_START = pud_index(VMEMMAP_START) + + .text + __HEAD +@@ -85,35 +90,22 @@ startup_64: + */ + addq %rbp, init_level4_pgt + 0(%rip) + addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) ++ addq %rbp, init_level4_pgt + (L4_VMALLOC_START*8)(%rip) ++ addq %rbp, init_level4_pgt + (L4_VMEMMAP_START*8)(%rip) + addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + + addq %rbp, level3_ident_pgt + 0(%rip) ++#ifndef CONFIG_XEN ++ addq %rbp, level3_ident_pgt + 8(%rip) ++#endif + +- addq %rbp, level3_kernel_pgt + (510*8)(%rip) +- addq %rbp, level3_kernel_pgt + (511*8)(%rip) ++ addq %rbp, level3_vmemmap_pgt + (L3_VMEMMAP_START*8)(%rip) + +- addq %rbp, level2_fixmap_pgt + (506*8)(%rip) ++ addq %rbp, level3_kernel_pgt + (L3_START_KERNEL*8)(%rip) ++ addq %rbp, level3_kernel_pgt + (L3_START_KERNEL*8+8)(%rip) + +- /* Add an Identity mapping if I am above 1G */ +- leaq _text(%rip), %rdi +- andq $PMD_PAGE_MASK, %rdi +- +- movq %rdi, %rax +- shrq $PUD_SHIFT, %rax +- andq $(PTRS_PER_PUD - 1), %rax +- jz ident_complete +- +- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx +- leaq level3_ident_pgt(%rip), %rbx +- movq %rdx, 0(%rbx, %rax, 8) +- +- movq %rdi, %rax +- shrq $PMD_SHIFT, %rax +- andq $(PTRS_PER_PMD - 1), %rax +- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx +- leaq level2_spare_pgt(%rip), %rbx +- movq %rdx, 0(%rbx, %rax, 8) +-ident_complete: ++ addq %rbp, level2_fixmap_pgt + (506*8)(%rip) ++ addq %rbp, level2_fixmap_pgt + (507*8)(%rip) + + /* + * Fixup the kernel text+data virtual addresses. Note that +@@ -184,9 +176,14 @@ ENTRY(secondary_startup_64) + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_SCE, %eax /* Enable System Call */ +- btl $20,%edi /* No Execute supported? 
*/ ++ btl $(X86_FEATURE_NX & 31),%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax ++ leaq init_level4_pgt(%rip), %rdi ++ btsq $_PAGE_BIT_NX, 8*L4_PAGE_OFFSET(%rdi) ++ btsq $_PAGE_BIT_NX, 8*L4_VMALLOC_START(%rdi) ++ btsq $_PAGE_BIT_NX, 8*L4_VMEMMAP_START(%rdi) ++ btsq $_PAGE_BIT_NX, __supported_pte_mask(%rip) + 1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ +@@ -271,7 +268,7 @@ ENTRY(secondary_startup_64) + bad_address: + jmp bad_address + +- .section ".init.text","ax" ++ __INIT + #ifdef CONFIG_EARLY_PRINTK + .globl early_idt_handlers + early_idt_handlers: +@@ -316,18 +313,23 @@ ENTRY(early_idt_handler) + #endif /* EARLY_PRINTK */ + 1: hlt + jmp 1b ++ .previous + + #ifdef CONFIG_EARLY_PRINTK ++ __INITDATA + early_recursion_flag: + .long 0 ++ .previous + ++ .section .rodata,"a",@progbits + early_idt_msg: + .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" + early_idt_ripmsg: + .asciz "RIP %s\n" +-#endif /* CONFIG_EARLY_PRINTK */ + .previous ++#endif /* CONFIG_EARLY_PRINTK */ + ++ .section .rodata,"a",@progbits + #define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ + ENTRY(name) +@@ -351,13 +353,29 @@ NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE ++ .org init_level4_pgt + L4_VMALLOC_START*8, 0 ++ .quad level3_vmalloc_pgt - __START_KERNEL_map + _KERNPG_TABLE ++ .org init_level4_pgt + L4_VMEMMAP_START*8, 0 ++ .quad level3_vmemmap_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + + NEXT_PAGE(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE ++#ifdef CONFIG_XEN + .fill 511,8,0 ++#else ++ .quad level2_ident_pgt + PAGE_SIZE - __START_KERNEL_map + _KERNPG_TABLE ++ .fill 510,8,0 ++#endif ++ ++NEXT_PAGE(level3_vmalloc_pgt) ++ .fill 512,8,0 ++ ++NEXT_PAGE(level3_vmemmap_pgt) ++ .fill L3_VMEMMAP_START,8,0 ++ .quad level2_vmemmap_pgt - __START_KERNEL_map + _KERNPG_TABLE + + NEXT_PAGE(level3_kernel_pgt) + .fill L3_START_KERNEL,8,0 +@@ -365,20 +383,23 @@ NEXT_PAGE(level3_kernel_pgt) + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + ++NEXT_PAGE(level2_vmemmap_pgt) ++ .fill 512,8,0 ++ + NEXT_PAGE(level2_fixmap_pgt) +- .fill 506,8,0 +- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE +- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ +- .fill 5,8,0 ++ .fill 507,8,0 ++ .quad level1_vsyscall_pgt - __START_KERNEL_map + _PAGE_TABLE ++ /* 6MB reserved for vsyscalls + a 2MB hole = 3 + 1 entries */ ++ .fill 4,8,0 + +-NEXT_PAGE(level1_fixmap_pgt) ++NEXT_PAGE(level1_vsyscall_pgt) + .fill 512,8,0 + +-NEXT_PAGE(level2_ident_pgt) +- /* Since I easily can, map the first 1G. ++ /* Since I easily can, map the first 2G. + * Don't set NX because code runs from these pages. + */ +- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) ++NEXT_PAGE(level2_ident_pgt) ++ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, 2*PTRS_PER_PMD) + + NEXT_PAGE(level2_kernel_pgt) + /* +@@ -391,33 +412,55 @@ NEXT_PAGE(level2_kernel_pgt) + * If you want to increase this then increase MODULES_VADDR + * too.) 
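
On the btl change above: X86_FEATURE_NX is defined as (1*32 + 20) in the 2.6.33 cpufeature tables, so $(X86_FEATURE_NX & 31) still evaluates to bit 20 of the CPUID 0x80000001 EDX word that was previously hard-coded. A quick userspace probe of the same bit (sketch):

    #include <stdio.h>
    #include <cpuid.h>    /* GCC's __get_cpuid() */

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
                    return 1;                /* leaf not supported */
            printf("NX (EDX bit 20): %s\n", (edx >> 20) & 1 ? "yes" : "no");
            return 0;
    }
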
+ */ +- PMDS(0, __PAGE_KERNEL_LARGE_EXEC, +- KERNEL_IMAGE_SIZE/PMD_SIZE) +- +-NEXT_PAGE(level2_spare_pgt) +- .fill 512, 8, 0 ++ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) + + #undef PMDS + #undef NEXT_PAGE + +- .data ++ .align PAGE_SIZE ++ENTRY(cpu_gdt_table) ++ .rept NR_CPUS ++ .quad 0x0000000000000000 /* NULL descriptor */ ++ .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ ++ .quad 0x00af9b000000ffff /* __KERNEL_CS */ ++ .quad 0x00cf93000000ffff /* __KERNEL_DS */ ++ .quad 0x00cffb000000ffff /* __USER32_CS */ ++ .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ ++ .quad 0x00affb000000ffff /* __USER_CS */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ .quad 0x00af9b000000ffff /* __KERNEXEC_KERNEL_CS */ ++#else ++ .quad 0x0 /* unused */ ++#endif ++ ++ .quad 0,0 /* TSS */ ++ .quad 0,0 /* LDT */ ++ .quad 0,0,0 /* three TLS descriptors */ ++ .quad 0x0000f40000000000 /* node/CPU stored in limit */ ++ /* asm/segment.h:GDT_ENTRIES must match this */ ++ ++ /* zero the remaining page */ ++ .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 ++ .endr ++ + .align 16 + .globl early_gdt_descr + early_gdt_descr: + .word GDT_ENTRIES*8-1 + early_gdt_descr_base: +- .quad INIT_PER_CPU_VAR(gdt_page) ++ .quad cpu_gdt_table + + ENTRY(phys_base) + /* This must match the first entry in level2_kernel_pgt */ + .quad 0x0000000000000000 + + #include "../../x86/xen/xen-head.S" +- +- .section .bss, "aw", @nobits ++ ++ .section .rodata,"a",@progbits + .align L1_CACHE_BYTES + ENTRY(idt_table) +- .skip IDT_ENTRIES * 16 ++ .fill 512,8,0 + + __PAGE_ALIGNED_BSS + .align PAGE_SIZE +diff -urNp linux-2.6.33.1/arch/x86/kernel/i386_ksyms_32.c linux-2.6.33.1/arch/x86/kernel/i386_ksyms_32.c +--- linux-2.6.33.1/arch/x86/kernel/i386_ksyms_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/i386_ksyms_32.c 2010-03-20 16:58:39.000574964 -0400 +@@ -20,8 +20,12 @@ extern void cmpxchg8b_emu(void); + EXPORT_SYMBOL(cmpxchg8b_emu); + #endif + ++EXPORT_SYMBOL_GPL(cpu_gdt_table); ++ + /* Networking helper routines. */ + EXPORT_SYMBOL(csum_partial_copy_generic); ++EXPORT_SYMBOL(csum_partial_copy_generic_to_user); ++EXPORT_SYMBOL(csum_partial_copy_generic_from_user); + + EXPORT_SYMBOL(__get_user_1); + EXPORT_SYMBOL(__get_user_2); +@@ -36,3 +40,7 @@ EXPORT_SYMBOL(strstr); + + EXPORT_SYMBOL(csum_partial); + EXPORT_SYMBOL(empty_zero_page); ++ ++#ifdef CONFIG_PAX_KERNEXEC ++EXPORT_SYMBOL(__LOAD_PHYSICAL_ADDR); ++#endif +diff -urNp linux-2.6.33.1/arch/x86/kernel/init_task.c linux-2.6.33.1/arch/x86/kernel/init_task.c +--- linux-2.6.33.1/arch/x86/kernel/init_task.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/init_task.c 2010-03-20 16:58:39.000574964 -0400 +@@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task); + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +- ++struct tss_struct init_tss[NR_CPUS] ____cacheline_internodealigned_in_smp = { [0 ... 
NR_CPUS-1] = INIT_TSS }; ++EXPORT_SYMBOL(init_tss); +diff -urNp linux-2.6.33.1/arch/x86/kernel/ioport.c linux-2.6.33.1/arch/x86/kernel/ioport.c +--- linux-2.6.33.1/arch/x86/kernel/ioport.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/ioport.c 2010-03-20 16:58:39.000574964 -0400 +@@ -6,6 +6,7 @@ + #include <linux/sched.h> + #include <linux/kernel.h> + #include <linux/capability.h> ++#include <linux/security.h> + #include <linux/errno.h> + #include <linux/types.h> + #include <linux/ioport.h> +@@ -41,6 +42,12 @@ asmlinkage long sys_ioperm(unsigned long + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; ++#ifdef CONFIG_GRKERNSEC_IO ++ if (turn_on) { ++ gr_handle_ioperm(); ++ return -EPERM; ++ } ++#endif + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + +@@ -67,7 +74,7 @@ asmlinkage long sys_ioperm(unsigned long + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ +- tss = &per_cpu(init_tss, get_cpu()); ++ tss = init_tss + get_cpu(); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + +@@ -112,8 +119,13 @@ long sys_iopl(unsigned int level, struct + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { ++#ifdef CONFIG_GRKERNSEC_IO ++ gr_handle_iopl(); ++ return -EPERM; ++#else + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; ++#endif + } + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); + t->iopl = level << 12; +diff -urNp linux-2.6.33.1/arch/x86/kernel/irq_32.c linux-2.6.33.1/arch/x86/kernel/irq_32.c +--- linux-2.6.33.1/arch/x86/kernel/irq_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/irq_32.c 2010-03-20 16:58:39.000574964 -0400 +@@ -94,7 +94,7 @@ execute_on_irq_stack(int overflow, struc + return 0; + + /* build the stack frame on the IRQ stack */ +- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); ++ isp = (u32 *) ((char *)irqctx + sizeof(*irqctx) - 8); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + +@@ -175,7 +175,7 @@ asmlinkage void do_softirq(void) + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* build the stack frame on the softirq stack */ +- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); ++ isp = (u32 *) ((char *)irqctx + sizeof(*irqctx) - 8); + + call_on_stack(__do_softirq, isp); + /* +diff -urNp linux-2.6.33.1/arch/x86/kernel/kgdb.c linux-2.6.33.1/arch/x86/kernel/kgdb.c +--- linux-2.6.33.1/arch/x86/kernel/kgdb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/kgdb.c 2010-03-20 16:58:39.004585163 -0400 +@@ -89,7 +89,7 @@ void pt_regs_to_gdb_regs(unsigned long * + gdb_regs[GDB_CS] = regs->cs; + gdb_regs[GDB_FS] = 0xFFFF; + gdb_regs[GDB_GS] = 0xFFFF; +- if (user_mode_vm(regs)) { ++ if (user_mode(regs)) { + gdb_regs[GDB_SS] = regs->ss; + gdb_regs[GDB_SP] = regs->sp; + } else { +@@ -690,7 +690,7 @@ unsigned long kgdb_arch_pc(int exception + return instruction_pointer(regs); + } + +-struct kgdb_arch arch_kgdb_ops = { ++const struct kgdb_arch arch_kgdb_ops = { + /* Breakpoint instruction: */ + .gdb_bpt_instr = { 0xcc }, + .flags = KGDB_HW_BREAKPOINT, +diff -urNp linux-2.6.33.1/arch/x86/kernel/kprobes.c linux-2.6.33.1/arch/x86/kernel/kprobes.c +--- linux-2.6.33.1/arch/x86/kernel/kprobes.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/kprobes.c 2010-03-20 16:58:39.004585163 -0400 +@@ -113,9 +113,13 @@ static void __kprobes set_jmp_op(void *f + char op; + s32 raddr; + } __attribute__((packed)) * jop; +- jop = 
(struct __arch_jmp_op *)from; ++ ++ jop = (struct __arch_jmp_op *)(ktla_ktva(from)); ++ ++ pax_open_kernel(); + jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); + jop->op = RELATIVEJUMP_INSTRUCTION; ++ pax_close_kernel(); + } + + /* +@@ -323,16 +327,18 @@ static void __kprobes fix_riprel(struct + + static void __kprobes arch_copy_kprobe(struct kprobe *p) + { +- memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); ++ pax_open_kernel(); ++ memcpy(p->ainsn.insn, ktla_ktva(p->addr), MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); ++ pax_close_kernel(); + + fix_riprel(p); + +- if (can_boost(p->addr)) ++ if (can_boost(ktla_ktva(p->addr))) + p->ainsn.boostable = 0; + else + p->ainsn.boostable = -1; + +- p->opcode = *p->addr; ++ p->opcode = *(ktla_ktva(p->addr)); + } + + int __kprobes arch_prepare_kprobe(struct kprobe *p) +@@ -412,7 +418,7 @@ static void __kprobes prepare_singlestep + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else +- regs->ip = (unsigned long)p->ainsn.insn; ++ regs->ip = ktva_ktla((unsigned long)p->ainsn.insn); + } + + void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, +@@ -433,7 +439,7 @@ static void __kprobes setup_singlestep(s + if (p->ainsn.boostable == 1 && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + reset_current_kprobe(); +- regs->ip = (unsigned long)p->ainsn.insn; ++ regs->ip = ktva_ktla((unsigned long)p->ainsn.insn); + preempt_enable_no_resched(); + return; + } +@@ -490,7 +496,7 @@ static int __kprobes kprobe_handler(stru + struct kprobe_ctlblk *kcb; + + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); +- if (*addr != BREAKPOINT_INSTRUCTION) { ++ if (*(kprobe_opcode_t *)ktla_ktva((unsigned long)addr) != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. 
Another cpu has removed +@@ -742,7 +748,7 @@ static void __kprobes resume_execution(s + struct pt_regs *regs, struct kprobe_ctlblk *kcb) + { + unsigned long *tos = stack_addr(regs); +- unsigned long copy_ip = (unsigned long)p->ainsn.insn; ++ unsigned long copy_ip = ktva_ktla((unsigned long)p->ainsn.insn); + unsigned long orig_ip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + +@@ -925,7 +931,7 @@ int __kprobes kprobe_exceptions_notify(s + struct die_args *args = data; + int ret = NOTIFY_DONE; + +- if (args->regs && user_mode_vm(args->regs)) ++ if (args->regs && user_mode(args->regs)) + return ret; + + switch (val) { +diff -urNp linux-2.6.33.1/arch/x86/kernel/ldt.c linux-2.6.33.1/arch/x86/kernel/ldt.c +--- linux-2.6.33.1/arch/x86/kernel/ldt.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/ldt.c 2010-03-20 16:58:39.004585163 -0400 +@@ -66,13 +66,13 @@ static int alloc_ldt(mm_context_t *pc, i + if (reload) { + #ifdef CONFIG_SMP + preempt_disable(); +- load_LDT(pc); ++ load_LDT_nolock(pc); + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) + smp_call_function(flush_ldt, current->mm, 1); + preempt_enable(); + #else +- load_LDT(pc); ++ load_LDT_nolock(pc); + #endif + } + if (oldsize) { +@@ -94,7 +94,7 @@ static inline int copy_ldt(mm_context_t + return err; + + for (i = 0; i < old->size; i++) +- write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); ++ write_ldt_entry(new->ldt, i, old->ldt + i); + return 0; + } + +@@ -115,6 +115,24 @@ int init_new_context(struct task_struct + retval = copy_ldt(&mm->context, &old_mm->context); + mutex_unlock(&old_mm->context.lock); + } ++ ++ if (tsk == current) { ++ mm->context.vdso = ~0UL; ++ ++#ifdef CONFIG_X86_32 ++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) ++ mm->context.user_cs_base = 0UL; ++ mm->context.user_cs_limit = ~0UL; ++ ++#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_SMP) ++ cpus_clear(mm->context.cpu_user_cs_mask); ++#endif ++ ++#endif ++#endif ++ ++ } ++ + return retval; + } + +@@ -229,6 +247,13 @@ static int write_ldt(void __user *ptr, u + } + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (ldt_info.contents & MODIFY_LDT_CONTENTS_CODE)) { ++ error = -EINVAL; ++ goto out_unlock; ++ } ++#endif ++ + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; +diff -urNp linux-2.6.33.1/arch/x86/kernel/machine_kexec_32.c linux-2.6.33.1/arch/x86/kernel/machine_kexec_32.c +--- linux-2.6.33.1/arch/x86/kernel/machine_kexec_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/machine_kexec_32.c 2010-03-20 16:58:39.004585163 -0400 +@@ -27,7 +27,7 @@ + #include <asm/cacheflush.h> + #include <asm/debugreg.h> + +-static void set_idt(void *newidt, __u16 limit) ++static void set_idt(struct desc_struct *newidt, __u16 limit) + { + struct desc_ptr curidt; + +@@ -39,7 +39,7 @@ static void set_idt(void *newidt, __u16 + } + + +-static void set_gdt(void *newgdt, __u16 limit) ++static void set_gdt(struct desc_struct *newgdt, __u16 limit) + { + struct desc_ptr curgdt; + +@@ -217,7 +217,7 @@ void machine_kexec(struct kimage *image) + } + + control_page = page_address(image->control_code_page); +- memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); ++ memcpy(control_page, (void *)ktla_ktva((unsigned long)relocate_kernel), KEXEC_CONTROL_CODE_MAX_SIZE); + + relocate_kernel_ptr = control_page; + page_list[PA_CONTROL_PAGE] = __pa(control_page); +diff -urNp linux-2.6.33.1/arch/x86/kernel/microcode_amd.c 
linux-2.6.33.1/arch/x86/kernel/microcode_amd.c +--- linux-2.6.33.1/arch/x86/kernel/microcode_amd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/microcode_amd.c 2010-03-20 16:58:39.004585163 -0400 +@@ -331,7 +331,7 @@ static void microcode_fini_cpu_amd(int c + uci->mc = NULL; + } + +-static struct microcode_ops microcode_amd_ops = { ++static const struct microcode_ops microcode_amd_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info_amd, +@@ -339,7 +339,7 @@ static struct microcode_ops microcode_am + .microcode_fini_cpu = microcode_fini_cpu_amd, + }; + +-struct microcode_ops * __init init_amd_microcode(void) ++const struct microcode_ops * __init init_amd_microcode(void) + { + return &microcode_amd_ops; + } +diff -urNp linux-2.6.33.1/arch/x86/kernel/microcode_core.c linux-2.6.33.1/arch/x86/kernel/microcode_core.c +--- linux-2.6.33.1/arch/x86/kernel/microcode_core.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/microcode_core.c 2010-03-20 16:58:39.004585163 -0400 +@@ -92,7 +92,7 @@ MODULE_LICENSE("GPL"); + + #define MICROCODE_VERSION "2.00" + +-static struct microcode_ops *microcode_ops; ++static const struct microcode_ops *microcode_ops; + + /* + * Synchronization. +diff -urNp linux-2.6.33.1/arch/x86/kernel/microcode_intel.c linux-2.6.33.1/arch/x86/kernel/microcode_intel.c +--- linux-2.6.33.1/arch/x86/kernel/microcode_intel.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/microcode_intel.c 2010-03-20 16:58:39.011824892 -0400 +@@ -436,13 +436,13 @@ static enum ucode_state request_microcod + + static int get_ucode_user(void *to, const void *from, size_t n) + { +- return copy_from_user(to, from, n); ++ return copy_from_user(to, (__force const void __user *)from, n); + } + + static enum ucode_state + request_microcode_user(int cpu, const void __user *buf, size_t size) + { +- return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); ++ return generic_load_microcode(cpu, (__force void *)buf, size, &get_ucode_user); + } + + static void microcode_fini_cpu(int cpu) +@@ -453,7 +453,7 @@ static void microcode_fini_cpu(int cpu) + uci->mc = NULL; + } + +-static struct microcode_ops microcode_intel_ops = { ++static const struct microcode_ops microcode_intel_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, +@@ -461,7 +461,7 @@ static struct microcode_ops microcode_in + .microcode_fini_cpu = microcode_fini_cpu, + }; + +-struct microcode_ops * __init init_intel_microcode(void) ++const struct microcode_ops * __init init_intel_microcode(void) + { + return &microcode_intel_ops; + } +diff -urNp linux-2.6.33.1/arch/x86/kernel/module.c linux-2.6.33.1/arch/x86/kernel/module.c +--- linux-2.6.33.1/arch/x86/kernel/module.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/module.c 2010-03-20 16:58:39.011824892 -0400 +@@ -34,7 +34,7 @@ + #define DEBUGP(fmt...) 
+ #endif + +-void *module_alloc(unsigned long size) ++static void *__module_alloc(unsigned long size, pgprot_t prot) + { + struct vm_struct *area; + +@@ -48,8 +48,18 @@ void *module_alloc(unsigned long size) + if (!area) + return NULL; + +- return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, +- PAGE_KERNEL_EXEC); ++ return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, prot); ++} ++ ++void *module_alloc(unsigned long size) ++{ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ return __module_alloc(size, PAGE_KERNEL); ++#else ++ return __module_alloc(size, PAGE_KERNEL_EXEC); ++#endif ++ + } + + /* Free memory returned from module_alloc */ +@@ -58,6 +68,40 @@ void module_free(struct module *mod, voi + vfree(module_region); + } + ++#ifdef CONFIG_PAX_KERNEXEC ++#ifdef CONFIG_X86_32 ++void *module_alloc_exec(unsigned long size) ++{ ++ struct vm_struct *area; ++ ++ if (size == 0) ++ return NULL; ++ ++ area = __get_vm_area(size, VM_ALLOC, (unsigned long)&MODULES_EXEC_VADDR, (unsigned long)&MODULES_EXEC_END); ++ return area ? area->addr : NULL; ++} ++EXPORT_SYMBOL(module_alloc_exec); ++ ++void module_free_exec(struct module *mod, void *module_region) ++{ ++ vunmap(module_region); ++} ++EXPORT_SYMBOL(module_free_exec); ++#else ++void module_free_exec(struct module *mod, void *module_region) ++{ ++ module_free(mod, module_region); ++} ++EXPORT_SYMBOL(module_free_exec); ++ ++void *module_alloc_exec(unsigned long size) ++{ ++ return __module_alloc(size, PAGE_KERNEL_RX); ++} ++EXPORT_SYMBOL(module_alloc_exec); ++#endif ++#endif ++ + /* We don't need anything special. */ + int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, +@@ -77,14 +121,16 @@ int apply_relocate(Elf32_Shdr *sechdrs, + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; +- uint32_t *location; ++ uint32_t *plocation, location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ +- location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr +- + rel[i].r_offset; ++ plocation = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + rel[i].r_offset; ++ location = (uint32_t)plocation; ++ if (sechdrs[sechdrs[relsec].sh_info].sh_flags & SHF_EXECINSTR) ++ plocation = ktla_ktva((void *)plocation); + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. 
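
For readers of the relocation switch just below: the two i386 relocation types being wrapped in pax_open_kernel()/pax_close_kernel() compute S + A and S + A - P respectively. A freestanding sketch of that arithmetic (not the patched function itself):

    #include <stdint.h>

    /* R_386_32: add the symbol value into the location (S + A) */
    static void apply_r_386_32(uint32_t *loc, uint32_t sym_value)
    {
            *loc += sym_value;
    }

    /* R_386_PC32: PC-relative (S + A - P). The patch splits "where we write"
     * (plocation, possibly a ktla_ktva() writable alias) from "where the code
     * will run" (location), so P stays the execution address. */
    static void apply_r_386_pc32(uint32_t *write_addr, uint32_t run_addr,
                                 uint32_t sym_value)
    {
            *write_addr += sym_value - run_addr;
    }
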
*/ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr +@@ -93,11 +139,15 @@ int apply_relocate(Elf32_Shdr *sechdrs, + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ +- *location += sym->st_value; ++ pax_open_kernel(); ++ *plocation += sym->st_value; ++ pax_close_kernel(); + break; + case R_386_PC32: + /* Add the value, subtract its postition */ +- *location += sym->st_value - (uint32_t)location; ++ pax_open_kernel(); ++ *plocation += sym->st_value - location; ++ pax_close_kernel(); + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", +@@ -153,21 +203,30 @@ int apply_relocate_add(Elf64_Shdr *sechd + case R_X86_64_NONE: + break; + case R_X86_64_64: ++ pax_open_kernel(); + *(u64 *)loc = val; ++ pax_close_kernel(); + break; + case R_X86_64_32: ++ pax_open_kernel(); + *(u32 *)loc = val; ++ pax_close_kernel(); + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: ++ pax_open_kernel(); + *(s32 *)loc = val; ++ pax_close_kernel(); + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + val -= (u64)loc; ++ pax_open_kernel(); + *(u32 *)loc = val; ++ pax_close_kernel(); ++ + #if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +diff -urNp linux-2.6.33.1/arch/x86/kernel/paravirt.c linux-2.6.33.1/arch/x86/kernel/paravirt.c +--- linux-2.6.33.1/arch/x86/kernel/paravirt.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/paravirt.c 2010-03-20 16:58:39.012867487 -0400 +@@ -120,9 +120,9 @@ unsigned paravirt_patch_jmp(void *insnbu + + /* Neat trick to map patch type back to the call within the + * corresponding structure. */ +-static void *get_call_destination(u8 type) ++static const void *get_call_destination(u8 type) + { +- struct paravirt_patch_template tmpl = { ++ const struct paravirt_patch_template tmpl = { + .pv_init_ops = pv_init_ops, + .pv_time_ops = pv_time_ops, + .pv_cpu_ops = pv_cpu_ops, +@@ -133,13 +133,13 @@ static void *get_call_destination(u8 typ + .pv_lock_ops = pv_lock_ops, + #endif + }; +- return *((void **)&tmpl + type); ++ return *((const void **)&tmpl + type); + } + + unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len) + { +- void *opfunc = get_call_destination(type); ++ const void *opfunc = get_call_destination(type); + unsigned ret; + + if (opfunc == NULL) +@@ -178,7 +178,7 @@ unsigned paravirt_patch_insns(void *insn + if (insn_len > len || start == NULL) + insn_len = len; + else +- memcpy(insnbuf, start, insn_len); ++ memcpy(insnbuf, ktla_ktva(start), insn_len); + + return insn_len; + } +@@ -294,22 +294,22 @@ void arch_flush_lazy_mmu_mode(void) + preempt_enable(); + } + +-struct pv_info pv_info = { ++struct pv_info pv_info __read_only = { + .name = "bare hardware", + .paravirt_enabled = 0, + .kernel_rpl = 0, + .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ + }; + +-struct pv_init_ops pv_init_ops = { ++struct pv_init_ops pv_init_ops __read_only = { + .patch = native_patch, + }; + +-struct pv_time_ops pv_time_ops = { ++struct pv_time_ops pv_time_ops __read_only = { + .sched_clock = native_sched_clock, + }; + +-struct pv_irq_ops pv_irq_ops = { ++struct pv_irq_ops pv_irq_ops __read_only = { + .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), + .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), + .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), +@@ -321,7 +321,7 @@ struct pv_irq_ops pv_irq_ops = { + #endif + }; + +-struct pv_cpu_ops pv_cpu_ops = { ++struct pv_cpu_ops 
pv_cpu_ops __read_only = { + .cpuid = native_cpuid, + .get_debugreg = native_get_debugreg, + .set_debugreg = native_set_debugreg, +@@ -382,7 +382,7 @@ struct pv_cpu_ops pv_cpu_ops = { + .end_context_switch = paravirt_nop, + }; + +-struct pv_apic_ops pv_apic_ops = { ++struct pv_apic_ops pv_apic_ops __read_only = { + #ifdef CONFIG_X86_LOCAL_APIC + .startup_ipi_hook = paravirt_nop, + #endif +@@ -396,7 +396,7 @@ struct pv_apic_ops pv_apic_ops = { + #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) + #endif + +-struct pv_mmu_ops pv_mmu_ops = { ++struct pv_mmu_ops pv_mmu_ops __read_only = { + + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, +@@ -467,6 +467,12 @@ struct pv_mmu_ops pv_mmu_ops = { + }, + + .set_fixmap = native_set_fixmap, ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ .pax_open_kernel = native_pax_open_kernel, ++ .pax_close_kernel = native_pax_close_kernel, ++#endif ++ + }; + + EXPORT_SYMBOL_GPL(pv_time_ops); +diff -urNp linux-2.6.33.1/arch/x86/kernel/paravirt-spinlocks.c linux-2.6.33.1/arch/x86/kernel/paravirt-spinlocks.c +--- linux-2.6.33.1/arch/x86/kernel/paravirt-spinlocks.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/paravirt-spinlocks.c 2010-03-20 16:58:39.012867487 -0400 +@@ -13,7 +13,7 @@ default_spin_lock_flags(arch_spinlock_t + arch_spin_lock(lock); + } + +-struct pv_lock_ops pv_lock_ops = { ++struct pv_lock_ops pv_lock_ops __read_only = { + #ifdef CONFIG_SMP + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, +diff -urNp linux-2.6.33.1/arch/x86/kernel/pci-calgary_64.c linux-2.6.33.1/arch/x86/kernel/pci-calgary_64.c +--- linux-2.6.33.1/arch/x86/kernel/pci-calgary_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/pci-calgary_64.c 2010-03-20 16:58:39.012867487 -0400 +@@ -470,7 +470,7 @@ static void calgary_free_coherent(struct + free_pages((unsigned long)vaddr, get_order(size)); + } + +-static struct dma_map_ops calgary_dma_ops = { ++static const struct dma_map_ops calgary_dma_ops = { + .alloc_coherent = calgary_alloc_coherent, + .free_coherent = calgary_free_coherent, + .map_sg = calgary_map_sg, +diff -urNp linux-2.6.33.1/arch/x86/kernel/pci-dma.c linux-2.6.33.1/arch/x86/kernel/pci-dma.c +--- linux-2.6.33.1/arch/x86/kernel/pci-dma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/pci-dma.c 2010-03-20 16:58:39.012867487 -0400 +@@ -15,7 +15,7 @@ + + static int forbid_dac __read_mostly; + +-struct dma_map_ops *dma_ops = &nommu_dma_ops; ++const struct dma_map_ops *dma_ops = &nommu_dma_ops; + EXPORT_SYMBOL(dma_ops); + + static int iommu_sac_force __read_mostly; +@@ -240,7 +240,7 @@ early_param("iommu", iommu_setup); + + int dma_supported(struct device *dev, u64 mask) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + #ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { +diff -urNp linux-2.6.33.1/arch/x86/kernel/pci-gart_64.c linux-2.6.33.1/arch/x86/kernel/pci-gart_64.c +--- linux-2.6.33.1/arch/x86/kernel/pci-gart_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/pci-gart_64.c 2010-03-20 16:58:39.012867487 -0400 +@@ -695,7 +695,7 @@ static __init int init_k8_gatt(struct ag + return -1; + } + +-static struct dma_map_ops gart_dma_ops = { ++static const struct dma_map_ops gart_dma_ops = { + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, + .map_page = gart_map_page, +diff -urNp linux-2.6.33.1/arch/x86/kernel/pci-nommu.c 
linux-2.6.33.1/arch/x86/kernel/pci-nommu.c +--- linux-2.6.33.1/arch/x86/kernel/pci-nommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/pci-nommu.c 2010-03-20 16:58:39.012867487 -0400 +@@ -94,7 +94,7 @@ static void nommu_sync_sg_for_device(str + flush_write_buffers(); + } + +-struct dma_map_ops nommu_dma_ops = { ++const struct dma_map_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, + .map_sg = nommu_map_sg, +diff -urNp linux-2.6.33.1/arch/x86/kernel/pci-swiotlb.c linux-2.6.33.1/arch/x86/kernel/pci-swiotlb.c +--- linux-2.6.33.1/arch/x86/kernel/pci-swiotlb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/pci-swiotlb.c 2010-03-20 16:58:39.012867487 -0400 +@@ -25,7 +25,7 @@ static void *x86_swiotlb_alloc_coherent( + return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); + } + +-static struct dma_map_ops swiotlb_dma_ops = { ++static const struct dma_map_ops swiotlb_dma_ops = { + .mapping_error = swiotlb_dma_mapping_error, + .alloc_coherent = x86_swiotlb_alloc_coherent, + .free_coherent = swiotlb_free_coherent, +diff -urNp linux-2.6.33.1/arch/x86/kernel/process_32.c linux-2.6.33.1/arch/x86/kernel/process_32.c +--- linux-2.6.33.1/arch/x86/kernel/process_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/process_32.c 2010-03-20 16:58:39.012867487 -0400 +@@ -66,6 +66,7 @@ asmlinkage void ret_from_fork(void) __as + unsigned long thread_saved_pc(struct task_struct *tsk) + { + return ((unsigned long *)tsk->thread.sp)[3]; ++//XXX return tsk->thread.eip; + } + + #ifndef CONFIG_SMP +@@ -127,7 +128,7 @@ void __show_regs(struct pt_regs *regs, i + unsigned long sp; + unsigned short ss, gs; + +- if (user_mode_vm(regs)) { ++ if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + gs = get_user_gs(regs); +@@ -203,7 +204,7 @@ int copy_thread(unsigned long clone_flag + struct task_struct *tsk; + int err; + +- childregs = task_pt_regs(p); ++ childregs = task_stack_page(p) + THREAD_SIZE - sizeof(struct pt_regs) - 8; + *childregs = *regs; + childregs->ax = 0; + childregs->sp = sp; +@@ -237,6 +238,7 @@ int copy_thread(unsigned long clone_flag + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) ++//XXX needs set_fs()? + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + +@@ -307,7 +309,7 @@ __switch_to(struct task_struct *prev_p, + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(init_tss, cpu); ++ struct tss_struct *tss = init_tss + cpu; + bool preload_fpu; + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ +@@ -342,6 +344,11 @@ __switch_to(struct task_struct *prev_p, + */ + lazy_save_gs(prev->gs); + ++#ifdef CONFIG_PAX_MEMORY_UDEREF ++ if (!segment_eq(task_thread_info(prev_p)->addr_limit, task_thread_info(next_p)->addr_limit)) ++ __set_fs(task_thread_info(next_p)->addr_limit, cpu); ++#endif ++ + /* + * Load the per-thread Thread-Local Storage descriptor. 
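
A note on the init_tss conversion visible above and in the init_task.c hunk earlier: once init_tss is an ordinary NR_CPUS-sized array rather than a per-CPU variable, the lookup is plain pointer arithmetic. Sketch with stand-in types (the apparent motivation is a fixed, statically addressable TSS array):

    #define NR_CPUS 8                       /* stand-in for the Kconfig value */

    struct tss_struct { unsigned long sp0; /* ... */ };

    struct tss_struct init_tss[NR_CPUS];

    static struct tss_struct *tss_of(int cpu)
    {
            return init_tss + cpu;          /* identical to &init_tss[cpu] */
    }

The trade-off is NR_CPUS * sizeof(struct tss_struct) of static memory instead of per-CPU allocation.
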
+ */ +@@ -418,3 +425,27 @@ unsigned long get_wchan(struct task_stru + return 0; + } + ++#ifdef CONFIG_PAX_RANDKSTACK ++asmlinkage void pax_randomize_kstack(void) ++{ ++ struct thread_struct *thread = &current->thread; ++ unsigned long time; ++ ++ if (!randomize_va_space) ++ return; ++ ++ rdtscl(time); ++ ++ /* P4 seems to return a 0 LSB, ignore it */ ++#ifdef CONFIG_MPENTIUM4 ++ time &= 0x1EUL; ++ time <<= 2; ++#else ++ time &= 0xFUL; ++ time <<= 3; ++#endif ++ ++ thread->sp0 ^= time; ++ load_sp0(init_tss + smp_processor_id(), thread); ++} ++#endif +diff -urNp linux-2.6.33.1/arch/x86/kernel/process_64.c linux-2.6.33.1/arch/x86/kernel/process_64.c +--- linux-2.6.33.1/arch/x86/kernel/process_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/process_64.c 2010-03-20 16:58:39.012867487 -0400 +@@ -88,7 +88,7 @@ static void __exit_idle(void) + void exit_idle(void) + { + /* idle loop has pid 0 */ +- if (current->pid) ++ if (task_pid_nr(current)) + return; + __exit_idle(); + } +@@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(init_tss, cpu); ++ struct tss_struct *tss = init_tss + cpu; + unsigned fsindex, gsindex; + bool preload_fpu; + +@@ -542,12 +542,11 @@ unsigned long get_wchan(struct task_stru + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); +- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) ++ if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE-8-sizeof(u64)) + return 0; + fp = *(u64 *)(p->thread.sp); + do { +- if (fp < (unsigned long)stack || +- fp >= (unsigned long)stack+THREAD_SIZE) ++ if (fp < stack || fp > stack+THREAD_SIZE-8-sizeof(u64)) + return 0; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) +diff -urNp linux-2.6.33.1/arch/x86/kernel/process.c linux-2.6.33.1/arch/x86/kernel/process.c +--- linux-2.6.33.1/arch/x86/kernel/process.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/process.c 2010-03-20 16:58:39.012867487 -0400 +@@ -78,7 +78,7 @@ void exit_thread(void) + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { +- struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); ++ struct tss_struct *tss = init_tss + get_cpu(); + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); +@@ -105,7 +105,7 @@ void show_regs_common(void) + + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", +- current->pid, current->comm, print_tainted(), ++ task_pid_nr(current), current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board, product); +@@ -115,6 +115,9 @@ void flush_thread(void) + { + struct task_struct *tsk = current; + ++#if defined(CONFIG_X86_32) && !defined(CONFIG_CC_STACKPROTECTOR) ++ loadsegment(gs, 0); ++#endif + flush_ptrace_hw_breakpoint(tsk); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* +@@ -272,8 +275,8 @@ int kernel_thread(int (*fn)(void *), voi + regs.di = (unsigned long) arg; + + #ifdef CONFIG_X86_32 +- regs.ds = __USER_DS; +- regs.es = __USER_DS; ++ regs.ds = __KERNEL_DS; ++ regs.es = __KERNEL_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; + #else +@@ -664,17 +667,3 @@ static int __init idle_setup(char *str) + return 0; + } + early_param("idle", idle_setup); +- +-unsigned long arch_align_stack(unsigned 
long sp) +-{ +- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) +- sp -= get_random_int() % 8192; +- return sp & ~0xf; +-} +- +-unsigned long arch_randomize_brk(struct mm_struct *mm) +-{ +- unsigned long range_end = mm->brk + 0x02000000; +- return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +-} +- +diff -urNp linux-2.6.33.1/arch/x86/kernel/ptrace.c linux-2.6.33.1/arch/x86/kernel/ptrace.c +--- linux-2.6.33.1/arch/x86/kernel/ptrace.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/ptrace.c 2010-03-20 16:58:39.012867487 -0400 +@@ -1167,7 +1167,7 @@ static const struct user_regset_view use + long arch_ptrace(struct task_struct *child, long request, long addr, long data) + { + int ret; +- unsigned long __user *datap = (unsigned long __user *)data; ++ unsigned long __user *datap = (__force unsigned long __user *)data; + + switch (request) { + /* read the word at location addr in the USER area. */ +@@ -1254,14 +1254,14 @@ long arch_ptrace(struct task_struct *chi + if (addr < 0) + return -EIO; + ret = do_get_thread_area(child, addr, +- (struct user_desc __user *) data); ++ (__force struct user_desc __user *) data); + break; + + case PTRACE_SET_THREAD_AREA: + if (addr < 0) + return -EIO; + ret = do_set_thread_area(child, addr, +- (struct user_desc __user *) data, 0); ++ (__force struct user_desc __user *) data, 0); + break; + #endif + +@@ -1280,12 +1280,12 @@ long arch_ptrace(struct task_struct *chi + #ifdef CONFIG_X86_PTRACE_BTS + case PTRACE_BTS_CONFIG: + ret = ptrace_bts_config +- (child, data, (struct ptrace_bts_config __user *)addr); ++ (child, data, (__force struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_STATUS: + ret = ptrace_bts_status +- (child, data, (struct ptrace_bts_config __user *)addr); ++ (child, data, (__force struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_SIZE: +@@ -1294,7 +1294,7 @@ long arch_ptrace(struct task_struct *chi + + case PTRACE_BTS_GET: + ret = ptrace_bts_read_record +- (child, data, (struct bts_struct __user *) addr); ++ (child, data, (__force struct bts_struct __user *) addr); + break; + + case PTRACE_BTS_CLEAR: +@@ -1303,7 +1303,7 @@ long arch_ptrace(struct task_struct *chi + + case PTRACE_BTS_DRAIN: + ret = ptrace_bts_drain +- (child, data, (struct bts_struct __user *) addr); ++ (child, data, (__force struct bts_struct __user *) addr); + break; + #endif /* CONFIG_X86_PTRACE_BTS */ + +@@ -1690,7 +1690,7 @@ static void fill_sigtrap_info(struct tas + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = si_code; +- info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; ++ info->si_addr = user_mode(regs) ? 
(__force void __user *)regs->ip : NULL; + } + + void user_single_step_siginfo(struct task_struct *tsk, +diff -urNp linux-2.6.33.1/arch/x86/kernel/reboot.c linux-2.6.33.1/arch/x86/kernel/reboot.c +--- linux-2.6.33.1/arch/x86/kernel/reboot.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/reboot.c 2010-03-20 16:58:39.016512700 -0400 +@@ -33,7 +33,7 @@ void (*pm_power_off)(void); + EXPORT_SYMBOL(pm_power_off); + + static const struct desc_ptr no_idt = {}; +-static int reboot_mode; ++static unsigned short reboot_mode; + enum reboot_type reboot_type = BOOT_KBD; + int reboot_force; + +@@ -276,7 +276,7 @@ static struct dmi_system_id __initdata r + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, +- { } ++ { NULL, NULL, {{0, {0}}}, NULL} + }; + + static int __init reboot_init(void) +@@ -292,12 +292,12 @@ core_initcall(reboot_init); + controller to pulse the CPU reset line, which is more thorough, but + doesn't work with at least one type of 486 motherboard. It is easy + to stop this code working; hence the copious comments. */ +-static const unsigned long long +-real_mode_gdt_entries [3] = ++static struct desc_struct ++real_mode_gdt_entries [3] __read_only = + { +- 0x0000000000000000ULL, /* Null descriptor */ +- 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ +- 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ ++ GDT_ENTRY_INIT(0, 0, 0), /* Null descriptor */ ++ GDT_ENTRY_INIT(0x9b, 0, 0xffff), /* 16-bit real-mode 64k code at 0x00000000 */ ++ GDT_ENTRY_INIT(0x93, 0x100, 0xffff) /* 16-bit real-mode 64k data at 0x00000100 */ + }; + + static const struct desc_ptr +@@ -346,7 +346,7 @@ static const unsigned char jump_to_bios + * specified by the code and length parameters. + * We assume that length will aways be less that 100! + */ +-void machine_real_restart(const unsigned char *code, int length) ++void machine_real_restart(const unsigned char *code, unsigned int length) + { + local_irq_disable(); + +@@ -366,8 +366,8 @@ void machine_real_restart(const unsigned + /* Remap the kernel at virtual address zero, as well as offset zero + from the kernel segment. This assumes the kernel segment starts at + virtual address PAGE_OFFSET. */ +- memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, +- sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); ++ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, ++ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); + + /* + * Use `swapper_pg_dir' as our page directory. +@@ -379,16 +379,15 @@ void machine_real_restart(const unsigned + boot)". This seems like a fairly standard thing that gets set by + REBOOT.COM programs, and the previous reset routine did this + too. */ +- *((unsigned short *)0x472) = reboot_mode; ++ *(unsigned short *)(__va(0x472)) = reboot_mode; + + /* For the switch to real mode, copy some code to low memory. It has + to be in the first 64k because it is running in 16-bit mode, and it + has to have the same physical and virtual address, because it turns + off paging. Copy it near the end of the first page, out of the way + of BIOS variables. */ +- memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), +- real_mode_switch, sizeof (real_mode_switch)); +- memcpy((void *)(0x1000 - 100), code, length); ++ memcpy(__va(0x1000 - sizeof (real_mode_switch) - 100), real_mode_switch, sizeof (real_mode_switch)); ++ memcpy(__va(0x1000 - 100), code, length); + + /* Set up the IDT for real mode. 
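
On the __va(0x472) change above: 0x472 is the physical address of the BIOS warm-boot flag, and with paging enabled it is only reachable through the kernel's linear mapping, not via a raw physical pointer. A sketch of the translation, assuming the default 32-bit 3G/1G split (macro names here are hypothetical):

    #include <stdint.h>

    #define PAGE_OFFSET_SKETCH 0xC0000000UL
    #define __va_sketch(paddr) ((void *)((unsigned long)(paddr) + PAGE_OFFSET_SKETCH))

    static void set_warm_boot_flag(uint16_t mode)
    {
            /* 0x1234 here requests a warm boot, 0 a cold boot */
            *(volatile uint16_t *)__va_sketch(0x472) = mode;
    }
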
*/ + load_idt(&real_mode_idt); +diff -urNp linux-2.6.33.1/arch/x86/kernel/setup.c linux-2.6.33.1/arch/x86/kernel/setup.c +--- linux-2.6.33.1/arch/x86/kernel/setup.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/setup.c 2010-03-20 16:58:39.016512700 -0400 +@@ -749,14 +749,14 @@ void __init setup_arch(char **cmdline_p) + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; +- init_mm.start_code = (unsigned long) _text; +- init_mm.end_code = (unsigned long) _etext; ++ init_mm.start_code = ktla_ktva((unsigned long) _text); ++ init_mm.end_code = ktla_ktva((unsigned long) _etext); + init_mm.end_data = (unsigned long) _edata; + init_mm.brk = _brk_end; + +- code_resource.start = virt_to_phys(_text); +- code_resource.end = virt_to_phys(_etext)-1; +- data_resource.start = virt_to_phys(_etext); ++ code_resource.start = virt_to_phys(ktla_ktva(_text)); ++ code_resource.end = virt_to_phys(ktla_ktva(_etext))-1; ++ data_resource.start = virt_to_phys(_sdata); + data_resource.end = virt_to_phys(_edata)-1; + bss_resource.start = virt_to_phys(&__bss_start); + bss_resource.end = virt_to_phys(&__bss_stop)-1; +diff -urNp linux-2.6.33.1/arch/x86/kernel/setup_percpu.c linux-2.6.33.1/arch/x86/kernel/setup_percpu.c +--- linux-2.6.33.1/arch/x86/kernel/setup_percpu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/setup_percpu.c 2010-03-20 16:58:39.016512700 -0400 +@@ -27,19 +27,17 @@ + # define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) + #endif + ++#ifdef CONFIG_SMP + DEFINE_PER_CPU(int, cpu_number); + EXPORT_PER_CPU_SYMBOL(cpu_number); ++#endif + +-#ifdef CONFIG_X86_64 + #define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) +-#else +-#define BOOT_PERCPU_OFFSET 0 +-#endif + + DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; + EXPORT_PER_CPU_SYMBOL(this_cpu_off); + +-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { ++unsigned long __per_cpu_offset[NR_CPUS] __read_only = { + [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, + }; + EXPORT_SYMBOL(__per_cpu_offset); +@@ -161,10 +159,10 @@ static inline void setup_percpu_segment( + { + #ifdef CONFIG_X86_32 + struct desc_struct gdt; ++ unsigned long base = per_cpu_offset(cpu); + +- pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, +- 0x2 | DESCTYPE_S, 0x8); +- gdt.s = 1; ++ pack_descriptor(&gdt, base, (VMALLOC_END - base - 1) >> PAGE_SHIFT, ++ 0x83 | DESCTYPE_S, 0xC); + write_gdt_entry(get_cpu_gdt_table(cpu), + GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); + #endif +@@ -213,6 +211,11 @@ void __init setup_per_cpu_areas(void) + /* alrighty, percpu areas up and running */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { ++#ifdef CONFIG_CC_STACKPROTECTOR ++#ifdef CONFIG_x86_32 ++ unsigned long canary = per_cpu(stack_canary, cpu); ++#endif ++#endif + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; + per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); + per_cpu(cpu_number, cpu) = cpu; +@@ -240,6 +243,12 @@ void __init setup_per_cpu_areas(void) + early_per_cpu_map(x86_cpu_to_node_map, cpu); + #endif + #endif ++#ifdef CONFIG_CC_STACKPROTECTOR ++#ifdef CONFIG_x86_32 ++ if (cpu == boot_cpu_id) ++ per_cpu(stack_canary, cpu) = canary; ++#endif ++#endif + /* + * Up to this point, the boot CPU has been using .data.init + * area. Reload any changed state for the boot CPU. 
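
On the pack_descriptor change above: instead of a flat 0xFFFFF-page limit, the per-CPU segment is clamped so it cannot reach past VMALLOC_END. The limit field of a page-granular descriptor is just "number of pages minus one", so the computation is (sketch, with a hypothetical VMALLOC_END):

    #define PAGE_SHIFT 12
    #define VMALLOC_END 0xf7ffe000UL        /* hypothetical 32-bit layout */

    static unsigned long percpu_seg_limit(unsigned long base)
    {
            /* pages from base up to (but not past) VMALLOC_END, minus one */
            return (VMALLOC_END - base - 1) >> PAGE_SHIFT;
    }
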
+diff -urNp linux-2.6.33.1/arch/x86/kernel/signal.c linux-2.6.33.1/arch/x86/kernel/signal.c +--- linux-2.6.33.1/arch/x86/kernel/signal.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/signal.c 2010-03-20 16:58:39.016512700 -0400 +@@ -198,7 +198,7 @@ static unsigned long align_sigframe(unsi + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ +- sp = ((sp + 4) & -16ul) - 4; ++ sp = ((sp - 12) & -16ul) - 4; + #else /* !CONFIG_X86_32 */ + sp = round_down(sp, 16) - 8; + #endif +@@ -249,11 +249,11 @@ get_sigframe(struct k_sigaction *ka, str + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (onsigstack && !likely(on_sig_stack(sp))) +- return (void __user *)-1L; ++ return (__force void __user *)-1L; + + /* save i387 state */ + if (used_math() && save_i387_xstate(*fpstate) < 0) +- return (void __user *)-1L; ++ return (__force void __user *)-1L; + + return (void __user *)sp; + } +@@ -308,9 +308,9 @@ __setup_frame(int sig, struct k_sigactio + } + + if (current->mm->context.vdso) +- restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); ++ restorer = (__force void __user *)VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); + else +- restorer = &frame->retcode; ++ restorer = (void __user *)&frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + +@@ -324,7 +324,7 @@ __setup_frame(int sig, struct k_sigactio + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ +- err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); ++ err |= __put_user(*((u64 *)&retcode), (u64 __user *)frame->retcode); + + if (err) + return -EFAULT; +@@ -378,7 +378,7 @@ static int __setup_rt_frame(int sig, str + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + /* Set up to return from userspace. */ +- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); ++ restorer = (__force void __user *)VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + put_user_ex(restorer, &frame->pretcode); +@@ -390,7 +390,7 @@ static int __setup_rt_frame(int sig, str + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ +- put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); ++ put_user_ex(*((u64 *)&rt_retcode), (u64 __user *)frame->retcode); + } put_user_catch(err); + + if (err) +@@ -780,7 +780,7 @@ static void do_signal(struct pt_regs *re + * X86_32: vm86 regs switched out by assembly code before reaching + * here, so testing against kernel CS suffices. 
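The align_sigframe() change above is easy to sanity-check: both expressions keep the i386 ABI invariant that ((sp + 4) & 15) == 0 on handler entry, and since (sp - 12) == (sp + 4) - 16, the new form simply places the frame one full 16-byte slot lower, which guarantees the result is strictly below the incoming sp. A small standalone check (a sketch, not kernel code):

    #include <assert.h>

    int main(void)
    {
        for (unsigned long sp = 64; sp < 128; sp++) {
            unsigned long before = ((sp + 4) & -16ul) - 4;  /* old expression */
            unsigned long after  = ((sp - 12) & -16ul) - 4; /* new expression */
            assert(((before + 4) & 15) == 0); /* ABI alignment held before */
            assert(((after + 4) & 15) == 0);  /* ...and still holds after */
            assert(after == before - 16);     /* frame moved one slot down */
        }
        return 0;
    }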
+ */ +- if (!user_mode(regs)) ++ if (!user_mode_novm(regs)) + return; + + if (current_thread_info()->status & TS_RESTORE_SIGMASK) +diff -urNp linux-2.6.33.1/arch/x86/kernel/smpboot.c linux-2.6.33.1/arch/x86/kernel/smpboot.c +--- linux-2.6.33.1/arch/x86/kernel/smpboot.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/smpboot.c 2010-03-20 16:58:39.016512700 -0400 +@@ -750,7 +750,11 @@ do_rest: + (unsigned long)task_stack_page(c_idle.idle) - + KERNEL_STACK_OFFSET + THREAD_SIZE; + #endif ++ ++ pax_open_kernel(); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); ++ pax_close_kernel(); ++ + initial_code = (unsigned long)start_secondary; + stack_start.sp = (void *) c_idle.idle->thread.sp; + +diff -urNp linux-2.6.33.1/arch/x86/kernel/step.c linux-2.6.33.1/arch/x86/kernel/step.c +--- linux-2.6.33.1/arch/x86/kernel/step.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/step.c 2010-03-20 16:58:39.016512700 -0400 +@@ -27,10 +27,10 @@ unsigned long convert_ip_to_linear(struc + struct desc_struct *desc; + unsigned long base; + +- seg &= ~7UL; ++ seg >>= 3; + + mutex_lock(&child->mm->context.lock); +- if (unlikely((seg >> 3) >= child->mm->context.size)) ++ if (unlikely(seg >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; +@@ -53,6 +53,9 @@ static int is_setting_trap_flag(struct t + unsigned char opcode[15]; + unsigned long addr = convert_ip_to_linear(child, regs); + ++ if (addr == -EINVAL) ++ return 0; ++ + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { +@@ -74,7 +77,7 @@ static int is_setting_trap_flag(struct t + + #ifdef CONFIG_X86_64 + case 0x40 ... 0x4f: +- if (regs->cs != __USER_CS) ++ if ((regs->cs & 0xffff) != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ +diff -urNp linux-2.6.33.1/arch/x86/kernel/syscall_table_32.S linux-2.6.33.1/arch/x86/kernel/syscall_table_32.S +--- linux-2.6.33.1/arch/x86/kernel/syscall_table_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/syscall_table_32.S 2010-03-20 16:58:39.016512700 -0400 +@@ -1,3 +1,4 @@ ++.section .rodata,"a",@progbits + ENTRY(sys_call_table) + .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ + .long sys_exit +diff -urNp linux-2.6.33.1/arch/x86/kernel/sys_i386_32.c linux-2.6.33.1/arch/x86/kernel/sys_i386_32.c +--- linux-2.6.33.1/arch/x86/kernel/sys_i386_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/sys_i386_32.c 2010-03-20 16:58:39.016512700 -0400 +@@ -24,6 +24,21 @@ + + #include <asm/syscalls.h> + ++int i386_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) ++{ ++ unsigned long pax_task_size = TASK_SIZE; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) ++ pax_task_size = SEGMEXEC_TASK_SIZE; ++#endif ++ ++ if (len > pax_task_size || addr > pax_task_size - len) ++ return -EINVAL; ++ ++ return 0; ++} ++ + /* + * Perform the select(nd, in, out, ex, tv) and mmap() system + * calls. 
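The step.c change above folds the old mask-then-shift pair (seg &= ~7UL followed by seg >> 3) into a single shift before the LDT bounds check. A plain >> 3 suffices because an x86 selector keeps its descriptor index in the top 13 bits, above the TI and RPL bits. A one-line standalone illustration (not the kernel's code):

    #include <stdint.h>

    /* selector layout: | index (13 bits) | TI (1) | RPL (2) |
     * so the descriptor-table index is simply selector >> 3, which
     * convert_ip_to_linear() now compares against context.size. */
    static inline unsigned int selector_index(uint16_t selector)
    {
        return selector >> 3;
    }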
Linux/i386 didn't use to be able to handle more than +@@ -58,6 +73,205 @@ out: + return err; + } + ++unsigned long ++arch_get_unmapped_area(struct file *filp, unsigned long addr, ++ unsigned long len, unsigned long pgoff, unsigned long flags) ++{ ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long start_addr, pax_task_size = TASK_SIZE; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (mm->pax_flags & MF_PAX_SEGMEXEC) ++ pax_task_size = SEGMEXEC_TASK_SIZE; ++#endif ++ ++ if (len > pax_task_size) ++ return -ENOMEM; ++ ++ if (flags & MAP_FIXED) ++ return addr; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (pax_task_size - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) ++ return addr; ++ } ++ if (len > mm->cached_hole_size) { ++ start_addr = addr = mm->free_area_cache; ++ } else { ++ start_addr = addr = mm->mmap_base; ++ mm->cached_hole_size = 0; ++ } ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (!(__supported_pte_mask & _PAGE_NX) && (mm->pax_flags & MF_PAX_PAGEEXEC) && (flags & MAP_EXECUTABLE) && start_addr >= mm->mmap_base) { ++ start_addr = 0x00110000UL; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ start_addr += mm->delta_mmap & 0x03FFF000UL; ++#endif ++ ++ if (mm->start_brk <= start_addr && start_addr < mm->mmap_base) ++ start_addr = addr = mm->mmap_base; ++ else ++ addr = start_addr; ++ } ++#endif ++ ++full_search: ++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { ++ /* At this point: (!vma || addr < vma->vm_end). */ ++ if (pax_task_size - len < addr) { ++ /* ++ * Start a new search - just in case we missed ++ * some holes. ++ */ ++ if (start_addr != mm->mmap_base) { ++ start_addr = addr = mm->mmap_base; ++ mm->cached_hole_size = 0; ++ goto full_search; ++ } ++ return -ENOMEM; ++ } ++ if (!vma || addr + len <= vma->vm_start) { ++ /* ++ * Remember the place where we stopped the search: ++ */ ++ mm->free_area_cache = addr + len; ++ return addr; ++ } ++ if (addr + mm->cached_hole_size < vma->vm_start) ++ mm->cached_hole_size = vma->vm_start - addr; ++ addr = vma->vm_end; ++ if (mm->start_brk <= addr && addr < mm->mmap_base) { ++ start_addr = addr = mm->mmap_base; ++ mm->cached_hole_size = 0; ++ goto full_search; ++ } ++ } ++} ++ ++unsigned long ++arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, ++ const unsigned long len, const unsigned long pgoff, ++ const unsigned long flags) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm = current->mm; ++ unsigned long base = mm->mmap_base, addr = addr0, pax_task_size = TASK_SIZE; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (mm->pax_flags & MF_PAX_SEGMEXEC) ++ pax_task_size = SEGMEXEC_TASK_SIZE; ++#endif ++ ++ /* requested length too big for entire address space */ ++ if (len > pax_task_size) ++ return -ENOMEM; ++ ++ if (flags & MAP_FIXED) ++ return addr; ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (!(__supported_pte_mask & _PAGE_NX) && (mm->pax_flags & MF_PAX_PAGEEXEC) && (flags & MAP_EXECUTABLE)) ++ goto bottomup; ++#endif ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ ++ /* requesting a specific address */ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (pax_task_size - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) ++ return addr; ++ } ++ ++ /* check if free_area_cache is useful for us */ ++ if (len <= mm->cached_hole_size) { ++ mm->cached_hole_size = 0; ++ 
mm->free_area_cache = mm->mmap_base; ++ } ++ ++ /* either no address requested or can't fit in requested address hole */ ++ addr = mm->free_area_cache; ++ ++ /* make sure it can fit in the remaining address space */ ++ if (addr > len) { ++ vma = find_vma(mm, addr-len); ++ if (!vma || addr <= vma->vm_start) ++ /* remember the address as a hint for next time */ ++ return (mm->free_area_cache = addr-len); ++ } ++ ++ if (mm->mmap_base < len) ++ goto bottomup; ++ ++ addr = mm->mmap_base-len; ++ ++ do { ++ /* ++ * Lookup failure means no vma is above this address, ++ * else if new region fits below vma->vm_start, ++ * return with success: ++ */ ++ vma = find_vma(mm, addr); ++ if (!vma || addr+len <= vma->vm_start) ++ /* remember the address as a hint for next time */ ++ return (mm->free_area_cache = addr); ++ ++ /* remember the largest hole we saw so far */ ++ if (addr + mm->cached_hole_size < vma->vm_start) ++ mm->cached_hole_size = vma->vm_start - addr; ++ ++ /* try just below the current vma->vm_start */ ++ addr = vma->vm_start-len; ++ } while (len < vma->vm_start); ++ ++bottomup: ++ /* ++ * A failed mmap() very likely causes application failure, ++ * so fall back to the bottom-up function here. This scenario ++ * can happen with large stack limits and large mmap() ++ * allocations. ++ */ ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (mm->pax_flags & MF_PAX_SEGMEXEC) ++ mm->mmap_base = SEGMEXEC_TASK_UNMAPPED_BASE; ++ else ++#endif ++ ++ mm->mmap_base = TASK_UNMAPPED_BASE; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ ++ mm->free_area_cache = mm->mmap_base; ++ mm->cached_hole_size = ~0UL; ++ addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); ++ /* ++ * Restore the topdown base: ++ */ ++ mm->mmap_base = base; ++ mm->free_area_cache = base; ++ mm->cached_hole_size = ~0UL; ++ ++ return addr; ++} + + struct sel_arg_struct { + unsigned long n; +@@ -93,7 +307,7 @@ asmlinkage int sys_ipc(uint call, int fi + return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); + case SEMTIMEDOP: + return sys_semtimedop(first, (struct sembuf __user *)ptr, second, +- (const struct timespec __user *)fifth); ++ (__force const struct timespec __user *)fifth); + + case SEMGET: + return sys_semget(first, second, third); +@@ -140,7 +354,7 @@ asmlinkage int sys_ipc(uint call, int fi + ret = do_shmat(first, (char __user *) ptr, second, &raddr); + if (ret) + return ret; +- return put_user(raddr, (ulong __user *) third); ++ return put_user(raddr, (__force ulong __user *) third); + } + case 1: /* iBCS2 emulator entry point */ + if (!segment_eq(get_fs(), get_ds())) +diff -urNp linux-2.6.33.1/arch/x86/kernel/sys_x86_64.c linux-2.6.33.1/arch/x86/kernel/sys_x86_64.c +--- linux-2.6.33.1/arch/x86/kernel/sys_x86_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/sys_x86_64.c 2010-03-20 16:58:39.016512700 -0400 +@@ -32,8 +32,8 @@ out: + return error; + } + +-static void find_start_end(unsigned long flags, unsigned long *begin, +- unsigned long *end) ++static void find_start_end(struct mm_struct *mm, unsigned long flags, ++ unsigned long *begin, unsigned long *end) + { + if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { + unsigned long new_begin; +@@ -52,7 +52,7 @@ static void find_start_end(unsigned long + *begin = new_begin; + } + } else { +- *begin = TASK_UNMAPPED_BASE; ++ *begin = mm->mmap_base; + *end = TASK_SIZE; + } + } +@@ -69,11 +69,15 @@ arch_get_unmapped_area(struct file *filp + if (flags & 
MAP_FIXED) + return addr; + +- find_start_end(flags, &begin, &end); ++ find_start_end(mm, flags, &begin, &end); + + if (len > end) + return -ENOMEM; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); +@@ -128,7 +132,7 @@ arch_get_unmapped_area_topdown(struct fi + { + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; +- unsigned long addr = addr0; ++ unsigned long base = mm->mmap_base, addr = addr0; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) +@@ -141,6 +145,10 @@ arch_get_unmapped_area_topdown(struct fi + if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) + goto bottomup; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); +@@ -198,13 +206,21 @@ bottomup: + * can happen with large stack limits and large mmap() + * allocations. + */ ++ mm->mmap_base = TASK_UNMAPPED_BASE; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ ++ mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; +- mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ +- mm->free_area_cache = mm->mmap_base; ++ mm->mmap_base = base; ++ mm->free_area_cache = base; + mm->cached_hole_size = ~0UL; + + return addr; +diff -urNp linux-2.6.33.1/arch/x86/kernel/time.c linux-2.6.33.1/arch/x86/kernel/time.c +--- linux-2.6.33.1/arch/x86/kernel/time.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/time.c 2010-03-20 16:58:39.020693949 -0400 +@@ -26,17 +26,13 @@ + int timer_ack; + #endif + +-#ifdef CONFIG_X86_64 +-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +-#endif +- + unsigned long profile_pc(struct pt_regs *regs) + { + unsigned long pc = instruction_pointer(regs); + +- if (!user_mode_vm(regs) && in_lock_functions(pc)) { ++ if (!user_mode(regs) && in_lock_functions(pc)) { + #ifdef CONFIG_FRAME_POINTER +- return *(unsigned long *)(regs->bp + sizeof(long)); ++ return ktla_ktva(*(unsigned long *)(regs->bp + sizeof(long))); + #else + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); +@@ -45,11 +41,17 @@ unsigned long profile_pc(struct pt_regs + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. 
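The arch_get_unmapped_area_topdown() hunks (the new 32-bit version earlier and the 64-bit one above) share one pattern on the bottom-up fallback path: instead of searching from a bare TASK_UNMAPPED_BASE, they rebuild mmap_base with the PaX randomization delta, run the bottom-up search, then restore the saved top-down state. A reduced, compilable model of that control flow follows; all types and names here (mm_model, bottomup_search, the constant) are stand-ins invented for this sketch:

    /* Reduced model of the fallback pattern; not kernel code. */
    struct mm_model {
        unsigned long mmap_base;
        unsigned long free_area_cache;
        unsigned long cached_hole_size;
        unsigned long delta_mmap; /* models the PaX randomization delta */
        int randmmap;             /* models MF_PAX_RANDMMAP */
    };

    #define MODEL_TASK_UNMAPPED_BASE 0x40000000UL

    static unsigned long bottomup_search(struct mm_model *mm, unsigned long len)
    {
        (void)len;
        return mm->free_area_cache; /* placeholder for the real search */
    }

    static unsigned long topdown_with_fallback(struct mm_model *mm, unsigned long len)
    {
        unsigned long base = mm->mmap_base, addr;

        /* ...the top-down search would run here and fail... */

        mm->mmap_base = MODEL_TASK_UNMAPPED_BASE;
        if (mm->randmmap)
            mm->mmap_base += mm->delta_mmap; /* keep the randomized gap */
        mm->free_area_cache = mm->mmap_base;
        mm->cached_hole_size = ~0UL;
        addr = bottomup_search(mm, len);

        mm->mmap_base = base;       /* restore the top-down state */
        mm->free_area_cache = base;
        mm->cached_hole_size = ~0UL;
        return addr;
    }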
+ */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ return ktla_ktva(sp[0]); ++#else + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; + #endif ++ ++#endif + } + return pc; + } +diff -urNp linux-2.6.33.1/arch/x86/kernel/tls.c linux-2.6.33.1/arch/x86/kernel/tls.c +--- linux-2.6.33.1/arch/x86/kernel/tls.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/tls.c 2010-03-20 16:58:39.020693949 -0400 +@@ -85,6 +85,11 @@ int do_set_thread_area(struct task_struc + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((p->mm->pax_flags & MF_PAX_SEGMEXEC) && (info.contents & MODIFY_LDT_CONTENTS_CODE)) ++ return -EINVAL; ++#endif ++ + set_tls_desc(p, idx, &info, 1); + + return 0; +diff -urNp linux-2.6.33.1/arch/x86/kernel/trampoline_32.S linux-2.6.33.1/arch/x86/kernel/trampoline_32.S +--- linux-2.6.33.1/arch/x86/kernel/trampoline_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/trampoline_32.S 2010-03-20 16:58:39.020693949 -0400 +@@ -32,6 +32,12 @@ + #include <asm/segment.h> + #include <asm/page_types.h> + ++#ifdef CONFIG_PAX_KERNEXEC ++#define ta(X) (X) ++#else ++#define ta(X) ((X) - __PAGE_OFFSET) ++#endif ++ + /* We can free up trampoline after bootup if cpu hotplug is not supported. */ + __CPUINITRODATA + .code16 +@@ -60,7 +66,7 @@ r_base = . + inc %ax # protected mode (PE) bit + lmsw %ax # into protected mode + # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S +- ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET) ++ ljmpl $__BOOT_CS, $ta(startup_32_smp) + + # These need to be in the same 64K segment as the above; + # hence we don't use the boot_gdt_descr defined in head.S +diff -urNp linux-2.6.33.1/arch/x86/kernel/traps.c linux-2.6.33.1/arch/x86/kernel/traps.c +--- linux-2.6.33.1/arch/x86/kernel/traps.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/traps.c 2010-03-20 16:58:39.020693949 -0400 +@@ -69,12 +69,6 @@ asmlinkage int system_call(void); + + /* Do we ignore FPU interrupts ? */ + char ignore_fpu_irq; +- +-/* +- * The IDT has to be page-aligned to simplify the Pentium +- * F0 0F bug workaround. +- */ +-gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; + #endif + + DECLARE_BITMAP(used_vectors, NR_VECTORS); +@@ -112,19 +106,19 @@ static inline void preempt_conditional_c + static inline void + die_if_kernel(const char *str, struct pt_regs *regs, long err) + { +- if (!user_mode_vm(regs)) ++ if (!user_mode(regs)) + die(str, regs, err); + } + #endif + + static void __kprobes +-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, ++do_trap(int trapnr, int signr, const char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) + { + struct task_struct *tsk = current; + + #ifdef CONFIG_X86_32 +- if (regs->flags & X86_VM_MASK) { ++ if (v8086_mode(regs)) { + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. 
+@@ -135,7 +129,7 @@ do_trap(int trapnr, int signr, char *str + } + #endif + +- if (!user_mode(regs)) ++ if (!user_mode_novm(regs)) + goto kernel_trap; + + #ifdef CONFIG_X86_32 +@@ -158,7 +152,7 @@ trap_signal: + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", +- tsk->comm, tsk->pid, str, ++ tsk->comm, task_pid_nr(tsk), str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); +@@ -175,8 +169,20 @@ kernel_trap: + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if (trapnr == 12 && ((regs->cs & 0xFFFF) == __KERNEL_CS || (regs->cs & 0xFFFF) == __KERNEXEC_KERNEL_CS)) ++ str = "PAX: suspicious stack segment fault"; ++#endif ++ + die(str, regs, error_code); + } ++ ++#ifdef CONFIG_PAX_REFCOUNT ++ if (trapnr == 4) ++ pax_report_refcount_overflow(regs); ++#endif ++ + return; + + #ifdef CONFIG_X86_32 +@@ -265,14 +271,30 @@ do_general_protection(struct pt_regs *re + conditional_sti(regs); + + #ifdef CONFIG_X86_32 +- if (regs->flags & X86_VM_MASK) ++ if (v8086_mode(regs)) + goto gp_in_vm86; + #endif + + tsk = current; +- if (!user_mode(regs)) ++ if (!user_mode_novm(regs)) + goto gp_in_kernel; + ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) ++ if (!(__supported_pte_mask & _PAGE_NX) && tsk->mm && (tsk->mm->pax_flags & MF_PAX_PAGEEXEC)) { ++ struct mm_struct *mm = tsk->mm; ++ unsigned long limit; ++ ++ down_write(&mm->mmap_sem); ++ limit = mm->context.user_cs_limit; ++ if (limit < TASK_SIZE) { ++ track_exec_limit(mm, limit, TASK_SIZE, VM_EXEC); ++ up_write(&mm->mmap_sem); ++ return; ++ } ++ up_write(&mm->mmap_sem); ++ } ++#endif ++ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + +@@ -305,6 +327,13 @@ gp_in_kernel: + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if ((regs->cs & 0xFFFF) == __KERNEL_CS || (regs->cs & 0xFFFF) == __KERNEXEC_KERNEL_CS) ++ die("PAX: suspicious general protection fault", regs, error_code); ++ else ++#endif ++ + die("general protection fault", regs, error_code); + } + +@@ -556,7 +585,7 @@ dotraplinkage void __kprobes do_debug(st + /* It's safe to allow irq's after DR6 has been saved */ + preempt_conditional_sti(regs); + +- if (regs->flags & X86_VM_MASK) { ++ if (v8086_mode(regs)) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; +@@ -569,7 +598,7 @@ dotraplinkage void __kprobes do_debug(st + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. + */ +- if ((dr6 & DR_STEP) && !user_mode(regs)) { ++ if ((dr6 & DR_STEP) && !user_mode_novm(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; +@@ -736,7 +765,7 @@ do_simd_coprocessor_error(struct pt_regs + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. 
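Throughout these traps.c hunks, open-coded regs->flags & X86_VM_MASK tests become calls to the v8086_mode() helper, which reads more clearly and evaluates to false on 64-bit kernels. Its gist, as a standalone sketch (struct and names reduced for illustration; see asm/ptrace.h for the real helper):

    #include <stdbool.h>

    struct pt_regs_model { unsigned long flags; }; /* stand-in for pt_regs */
    #define X86_EFLAGS_VM 0x00020000UL             /* EFLAGS.VM, virtual-8086 */

    static inline bool v8086_mode_model(const struct pt_regs_model *regs)
    {
        return regs->flags & X86_EFLAGS_VM; /* always false on 64-bit */
    }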
+ */ +- if (regs->flags & X86_VM_MASK) { ++ if (v8086_mode(regs)) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); + return; + } +diff -urNp linux-2.6.33.1/arch/x86/kernel/tsc.c linux-2.6.33.1/arch/x86/kernel/tsc.c +--- linux-2.6.33.1/arch/x86/kernel/tsc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/tsc.c 2010-03-20 16:58:39.020693949 -0400 +@@ -795,7 +795,7 @@ static struct dmi_system_id __initdata b + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, +- {} ++ { NULL, NULL, {{0, {0}}}, NULL} + }; + + static void __init check_system_tsc_reliable(void) +diff -urNp linux-2.6.33.1/arch/x86/kernel/vm86_32.c linux-2.6.33.1/arch/x86/kernel/vm86_32.c +--- linux-2.6.33.1/arch/x86/kernel/vm86_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/vm86_32.c 2010-03-20 16:58:39.020693949 -0400 +@@ -41,6 +41,7 @@ + #include <linux/ptrace.h> + #include <linux/audit.h> + #include <linux/stddef.h> ++#include <linux/grsecurity.h> + + #include <asm/uaccess.h> + #include <asm/io.h> +@@ -148,7 +149,7 @@ struct pt_regs *save_v86_state(struct ke + do_exit(SIGSEGV); + } + +- tss = &per_cpu(init_tss, get_cpu()); ++ tss = init_tss + get_cpu(); + current->thread.sp0 = current->thread.saved_sp0; + current->thread.sysenter_cs = __KERNEL_CS; + load_sp0(tss, ¤t->thread); +@@ -207,6 +208,13 @@ int sys_vm86old(struct vm86_struct __use + struct task_struct *tsk; + int tmp, ret = -EPERM; + ++#ifdef CONFIG_GRKERNSEC_VM86 ++ if (!capable(CAP_SYS_RAWIO)) { ++ gr_handle_vm86(); ++ goto out; ++ } ++#endif ++ + tsk = current; + if (tsk->thread.saved_sp0) + goto out; +@@ -237,6 +245,14 @@ int sys_vm86(unsigned long cmd, unsigned + int tmp, ret; + struct vm86plus_struct __user *v86; + ++#ifdef CONFIG_GRKERNSEC_VM86 ++ if (!capable(CAP_SYS_RAWIO)) { ++ gr_handle_vm86(); ++ ret = -EPERM; ++ goto out; ++ } ++#endif ++ + tsk = current; + switch (cmd) { + case VM86_REQUEST_IRQ: +@@ -323,7 +339,7 @@ static void do_sys_vm86(struct kernel_vm + tsk->thread.saved_fs = info->regs32->fs; + tsk->thread.saved_gs = get_user_gs(info->regs32); + +- tss = &per_cpu(init_tss, get_cpu()); ++ tss = init_tss + get_cpu(); + tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; + if (cpu_has_sep) + tsk->thread.sysenter_cs = 0; +@@ -528,7 +544,7 @@ static void do_int(struct kernel_vm86_re + goto cannot_handle; + if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) + goto cannot_handle; +- intr_ptr = (unsigned long __user *) (i << 2); ++ intr_ptr = (__force unsigned long __user *) (i << 2); + if (get_user(segoffs, intr_ptr)) + goto cannot_handle; + if ((segoffs >> 16) == BIOSSEG) +diff -urNp linux-2.6.33.1/arch/x86/kernel/vmi_32.c linux-2.6.33.1/arch/x86/kernel/vmi_32.c +--- linux-2.6.33.1/arch/x86/kernel/vmi_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/vmi_32.c 2010-03-20 16:58:39.020693949 -0400 +@@ -44,12 +44,17 @@ typedef u32 __attribute__((regparm(1))) + typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); + + #define call_vrom_func(rom,func) \ +- (((VROMFUNC *)(rom->func))()) ++ (((VROMFUNC *)(ktva_ktla(rom.func)))()) + + #define call_vrom_long_func(rom,func,arg) \ +- (((VROMLONGFUNC *)(rom->func)) (arg)) ++({\ ++ u64 __reloc = ((VROMLONGFUNC *)(ktva_ktla(rom.func))) (arg);\ ++ struct vmi_relocation_info *const __rel = (struct vmi_relocation_info *)&__reloc;\ ++ __rel->eip = (unsigned char *)ktva_ktla((unsigned long)__rel->eip);\ ++ __reloc;\ ++}) + +-static struct vrom_header *vmi_rom; ++static struct vrom_header vmi_rom 
__attribute((__section__(".vmi.rom"), __aligned__(PAGE_SIZE))); + static int disable_pge; + static int disable_pse; + static int disable_sep; +@@ -76,10 +81,10 @@ static struct { + void (*set_initial_ap_state)(int, int); + void (*halt)(void); + void (*set_lazy_mode)(int mode); +-} vmi_ops; ++} vmi_ops __read_only; + + /* Cached VMI operations */ +-struct vmi_timer_ops vmi_timer_ops; ++struct vmi_timer_ops vmi_timer_ops __read_only; + + /* + * VMI patching routines. +@@ -94,7 +99,7 @@ struct vmi_timer_ops vmi_timer_ops; + static inline void patch_offset(void *insnbuf, + unsigned long ip, unsigned long dest) + { +- *(unsigned long *)(insnbuf+1) = dest-ip-5; ++ *(unsigned long *)(insnbuf+1) = dest-ip-5; + } + + static unsigned patch_internal(int call, unsigned len, void *insnbuf, +@@ -102,6 +107,7 @@ static unsigned patch_internal(int call, + { + u64 reloc; + struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; ++ + reloc = call_vrom_long_func(vmi_rom, get_reloc, call); + switch(rel->type) { + case VMI_RELOCATION_CALL_REL: +@@ -404,13 +410,13 @@ static void vmi_set_pud(pud_t *pudp, pud + + static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) + { +- const pte_t pte = { .pte = 0 }; ++ const pte_t pte = __pte(0ULL); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); + } + + static void vmi_pmd_clear(pmd_t *pmd) + { +- const pte_t pte = { .pte = 0 }; ++ const pte_t pte = __pte(0ULL); + vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); + } + #endif +@@ -438,8 +444,8 @@ vmi_startup_ipi_hook(int phys_apicid, un + ap.ss = __KERNEL_DS; + ap.esp = (unsigned long) start_esp; + +- ap.ds = __USER_DS; +- ap.es = __USER_DS; ++ ap.ds = __KERNEL_DS; ++ ap.es = __KERNEL_DS; + ap.fs = __KERNEL_PERCPU; + ap.gs = __KERNEL_STACK_CANARY; + +@@ -486,6 +492,18 @@ static void vmi_leave_lazy_mmu(void) + paravirt_leave_lazy_mmu(); + } + ++#ifdef CONFIG_PAX_KERNEXEC ++static unsigned long vmi_pax_open_kernel(void) ++{ ++ return 0; ++} ++ ++static unsigned long vmi_pax_close_kernel(void) ++{ ++ return 0; ++} ++#endif ++ + static inline int __init check_vmi_rom(struct vrom_header *rom) + { + struct pci_header *pci; +@@ -498,6 +516,10 @@ static inline int __init check_vmi_rom(s + return 0; + if (rom->vrom_signature != VMI_SIGNATURE) + return 0; ++ if (rom->rom_length * 512 > sizeof(*rom)) { ++ printk(KERN_WARNING "PAX: VMI: ROM size too big: %x\n", rom->rom_length * 512); ++ return 0; ++ } + if (rom->api_version_maj != VMI_API_REV_MAJOR || + rom->api_version_min+1 < VMI_API_REV_MINOR+1) { + printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", +@@ -562,7 +584,7 @@ static inline int __init probe_vmi_rom(v + struct vrom_header *romstart; + romstart = (struct vrom_header *)isa_bus_to_virt(base); + if (check_vmi_rom(romstart)) { +- vmi_rom = romstart; ++ vmi_rom = *romstart; + return 1; + } + } +@@ -836,6 +858,11 @@ static inline int __init activate_vmi(vo + + para_fill(pv_irq_ops.safe_halt, Halt); + ++#ifdef CONFIG_PAX_KERNEXEC ++ pv_mmu_ops.pax_open_kernel = vmi_pax_open_kernel; ++ pv_mmu_ops.pax_close_kernel = vmi_pax_close_kernel; ++#endif ++ + /* + * Alternative instruction rewriting doesn't happen soon enough + * to convert VMI_IRET to a call instead of a jump; so we have +@@ -853,16 +880,16 @@ static inline int __init activate_vmi(vo + + void __init vmi_init(void) + { +- if (!vmi_rom) ++ if (!vmi_rom.rom_signature) + probe_vmi_rom(); + else +- check_vmi_rom(vmi_rom); ++ check_vmi_rom(&vmi_rom); + + /* In case probing for or validating the 
ROM failed, basil */ +- if (!vmi_rom) ++ if (!vmi_rom.rom_signature) + return; + +- reserve_top_address(-vmi_rom->virtual_top); ++ reserve_top_address(-vmi_rom.virtual_top); + + #ifdef CONFIG_X86_IO_APIC + /* This is virtual hardware; timer routing is wired correctly */ +@@ -874,7 +901,7 @@ void __init vmi_activate(void) + { + unsigned long flags; + +- if (!vmi_rom) ++ if (!vmi_rom.rom_signature) + return; + + local_irq_save(flags); +diff -urNp linux-2.6.33.1/arch/x86/kernel/vmlinux.lds.S linux-2.6.33.1/arch/x86/kernel/vmlinux.lds.S +--- linux-2.6.33.1/arch/x86/kernel/vmlinux.lds.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/vmlinux.lds.S 2010-03-20 16:58:39.020693949 -0400 +@@ -26,6 +26,22 @@ + #include <asm/page_types.h> + #include <asm/cache.h> + #include <asm/boot.h> ++#include <asm/segment.h> ++ ++#undef PMD_SIZE ++#undef PMD_SHIFT ++#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) ++#define PMD_SHIFT 21 ++#else ++#define PMD_SHIFT 22 ++#endif ++#define PMD_SIZE (1 << PMD_SHIFT) ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++#define __KERNEL_TEXT_OFFSET (LOAD_OFFSET + ____LOAD_PHYSICAL_ADDR) ++#else ++#define __KERNEL_TEXT_OFFSET 0 ++#endif + + #undef i386 /* in case the preprocessor is a 32bit one */ + +@@ -34,13 +50,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONF + #ifdef CONFIG_X86_32 + OUTPUT_ARCH(i386) + ENTRY(phys_startup_32) +-jiffies = jiffies_64; + #else + OUTPUT_ARCH(i386:x86-64) + ENTRY(phys_startup_64) +-jiffies_64 = jiffies; + #endif + ++jiffies = jiffies_64; ++ + #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) + /* + * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA +@@ -69,31 +85,46 @@ jiffies_64 = jiffies; + + PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ +- data PT_LOAD FLAGS(7); /* RWE */ ++#ifdef CONFIG_XEN ++ rodata PT_LOAD FLAGS(5); /* R_E */ ++#else ++ rodata PT_LOAD FLAGS(4); /* R__ */ ++#endif ++#ifdef CONFIG_X86_32 ++ module PT_LOAD FLAGS(5); /* R_E */ ++#endif ++ data PT_LOAD FLAGS(6); /* RW_ */ + #ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(5); /* R_E */ ++#endif ++ init.begin PT_LOAD FLAGS(6); /* RW_ */ + #ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(6); /* RW_ */ + #endif ++ text.init PT_LOAD FLAGS(5); /* R_E */ ++ text.exit PT_LOAD FLAGS(5); /* R_E */ + init PT_LOAD FLAGS(7); /* RWE */ +-#endif + note PT_NOTE FLAGS(0); /* ___ */ + } + + SECTIONS + { + #ifdef CONFIG_X86_32 +- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; +- phys_startup_32 = startup_32 - LOAD_OFFSET; ++ . = LOAD_OFFSET + ____LOAD_PHYSICAL_ADDR; + #else +- . = __START_KERNEL; +- phys_startup_64 = startup_64 - LOAD_OFFSET; ++ . = __START_KERNEL; + #endif + + /* Text and read-only data */ +- .text : AT(ADDR(.text) - LOAD_OFFSET) { +- _text = .; ++ .text (. - __KERNEL_TEXT_OFFSET): AT(ADDR(.text) - LOAD_OFFSET + __KERNEL_TEXT_OFFSET) { + /* bootstrapping code */ ++#ifdef CONFIG_X86_32 ++ phys_startup_32 = startup_32 - LOAD_OFFSET + __KERNEL_TEXT_OFFSET; ++#else ++ phys_startup_64 = startup_64 - LOAD_OFFSET + __KERNEL_TEXT_OFFSET; ++#endif ++ __LOAD_PHYSICAL_ADDR = . - LOAD_OFFSET + __KERNEL_TEXT_OFFSET; ++ _text = .; + HEAD_TEXT + #ifdef CONFIG_X86_32 + . = ALIGN(PAGE_SIZE); +@@ -108,30 +139,66 @@ SECTIONS + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) +- /* End of text section */ +- _etext = .; + } :text = 0x9090 + +- NOTES :text :note ++ . += __KERNEL_TEXT_OFFSET; ++ ++ . 
= ALIGN(PAGE_SIZE); ++ NOTES :rodata :note + +- EXCEPTION_TABLE(16) :text = 0x9090 ++ EXCEPTION_TABLE(16) :rodata + + X64_ALIGN_DEBUG_RODATA_BEGIN + RO_DATA(PAGE_SIZE) + X64_ALIGN_DEBUG_RODATA_END + ++#ifdef CONFIG_X86_32 ++ . = ALIGN(PAGE_SIZE); ++ .rodata.page_aligned : AT(ADDR(.rodata.page_aligned) - LOAD_OFFSET) { ++ *(.idt) ++ . = ALIGN(PAGE_SIZE); ++ *(.empty_zero_page) ++ *(.swapper_pg_pmd) ++ *(.swapper_pg_dir) ++ } ++ ++ . = ALIGN(PAGE_SIZE); ++ .vmi.rom : AT(ADDR(.vmi.rom) - LOAD_OFFSET) { ++ *(.vmi.rom) ++ } :module ++ ++ . = ALIGN(PAGE_SIZE); ++ .module.text : AT(ADDR(.module.text) - LOAD_OFFSET) { ++ ++#if defined(CONFIG_PAX_KERNEXEC) && defined(CONFIG_MODULES) ++ MODULES_EXEC_VADDR = .; ++ BYTE(0) ++ . += (CONFIG_PAX_KERNEXEC_MODULE_TEXT * 1024 * 1024); ++ . = ALIGN(PMD_SIZE); ++ MODULES_EXEC_END = . - 1; ++#endif ++ ++ } :module ++#endif ++ + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { ++ /* End of text section */ ++ _etext = . - __KERNEL_TEXT_OFFSET; ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ . = ALIGN(PMD_SIZE); ++#else ++ . = ALIGN(PAGE_SIZE); ++#endif ++ + /* Start of data section */ + _sdata = .; + + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) + +-#ifdef CONFIG_X86_32 +- /* 32 bit has nosave before _edata */ + NOSAVE_DATA +-#endif + + PAGE_ALIGNED_DATA(PAGE_SIZE) + +@@ -194,12 +261,6 @@ SECTIONS + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + +- . = ALIGN(L1_CACHE_BYTES); +- .jiffies : AT(VLOAD(.jiffies)) { +- *(.jiffies) +- } +- jiffies = VVIRT(.jiffies); +- + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } +@@ -215,12 +276,19 @@ SECTIONS + #endif /* CONFIG_X86_64 */ + + /* Init code and data - will be freed after init */ +- . = ALIGN(PAGE_SIZE); + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { ++ BYTE(0) ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ . = ALIGN(PMD_SIZE); ++#else ++ . = ALIGN(PAGE_SIZE); ++#endif ++ + __init_begin = .; /* paired with __init_end */ +- } ++ } :init.begin + +-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) ++#ifdef CONFIG_SMP + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should +@@ -229,12 +297,27 @@ SECTIONS + PERCPU_VADDR(0, :percpu) + #endif + +- INIT_TEXT_SECTION(PAGE_SIZE) +-#ifdef CONFIG_X86_64 +- :init +-#endif ++ . = ALIGN(PAGE_SIZE); ++ init_begin = .; ++ .init.text (. - __KERNEL_TEXT_OFFSET): AT(init_begin - LOAD_OFFSET) { ++ VMLINUX_SYMBOL(_sinittext) = .; ++ INIT_TEXT ++ VMLINUX_SYMBOL(_einittext) = .; ++ . = ALIGN(PAGE_SIZE); ++ } :text.init ++ ++ /* ++ * .exit.text is discard at runtime, not link time, to deal with ++ * references from .altinstructions and .eh_frame ++ */ ++ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { ++ EXIT_TEXT ++ . = ALIGN(16); ++ } :text.exit ++ . = init_begin + SIZEOF(.init.text) + SIZEOF(.exit.text); + +- INIT_DATA_SECTION(16) ++ . = ALIGN(PAGE_SIZE); ++ INIT_DATA_SECTION(16) :init + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; +@@ -260,19 +343,11 @@ SECTIONS + *(.altinstr_replacement) + } + +- /* +- * .exit.text is discard at runtime, not link time, to deal with +- * references from .altinstructions and .eh_frame +- */ +- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { +- EXIT_TEXT +- } +- + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +-#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) ++#ifndef CONFIG_SMP + PERCPU(PAGE_SIZE) + #endif + +@@ -295,12 +370,6 @@ SECTIONS + . 
= ALIGN(PAGE_SIZE); + } + +-#ifdef CONFIG_X86_64 +- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { +- NOSAVE_DATA +- } +-#endif +- + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { +@@ -316,6 +385,7 @@ SECTIONS + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ ++ . = ALIGN(PMD_SIZE); + __brk_limit = .; + } + +@@ -342,13 +412,12 @@ SECTIONS + * for the boot processor. + */ + #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +-INIT_PER_CPU(gdt_page); + INIT_PER_CPU(irq_stack_union); + + /* + * Build-time check on the image size: + */ +-. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), ++. = ASSERT((_end - _text - __KERNEL_TEXT_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); + + #ifdef CONFIG_SMP +diff -urNp linux-2.6.33.1/arch/x86/kernel/vsyscall_64.c linux-2.6.33.1/arch/x86/kernel/vsyscall_64.c +--- linux-2.6.33.1/arch/x86/kernel/vsyscall_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/vsyscall_64.c 2010-03-20 16:58:39.020693949 -0400 +@@ -80,6 +80,7 @@ void update_vsyscall(struct timespec *wa + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ ++ strlcpy(vsyscall_gtod_data.clock.name, clock->name, sizeof vsyscall_gtod_data.clock.name); + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; +@@ -203,7 +204,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ +- if (tcache && tcache->blob[0] == (j = __jiffies)) { ++ if (tcache && tcache->blob[0] == (j = jiffies)) { + p = tcache->blob[1]; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ +diff -urNp linux-2.6.33.1/arch/x86/kernel/x8664_ksyms_64.c linux-2.6.33.1/arch/x86/kernel/x8664_ksyms_64.c +--- linux-2.6.33.1/arch/x86/kernel/x8664_ksyms_64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/x8664_ksyms_64.c 2010-03-20 16:58:39.020693949 -0400 +@@ -28,8 +28,6 @@ EXPORT_SYMBOL(__put_user_8); + + EXPORT_SYMBOL(copy_user_generic); + EXPORT_SYMBOL(__copy_user_nocache); +-EXPORT_SYMBOL(_copy_from_user); +-EXPORT_SYMBOL(_copy_to_user); + + EXPORT_SYMBOL(copy_page); + EXPORT_SYMBOL(clear_page); +diff -urNp linux-2.6.33.1/arch/x86/kernel/xsave.c linux-2.6.33.1/arch/x86/kernel/xsave.c +--- linux-2.6.33.1/arch/x86/kernel/xsave.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kernel/xsave.c 2010-03-20 16:58:39.020693949 -0400 +@@ -54,7 +54,7 @@ int check_for_xstate(struct i387_fxsave_ + fx_sw_user->xstate_size > fx_sw_user->extended_size) + return -1; + +- err = __get_user(magic2, (__u32 *) (((void *)fpstate) + ++ err = __get_user(magic2, (__u32 __user *) (((void __user *)fpstate) + + fx_sw_user->extended_size - + FP_XSTATE_MAGIC2_SIZE)); + /* +@@ -196,7 +196,7 @@ fx_only: + * the other extended state. 
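A large share of the hunks in this patch, such as the xsave.c ones above, only add __user and __force casts. These are sparse annotations with no runtime effect; when the code is run through sparse they expand to roughly the following (empty macros otherwise), which is what lets the tool flag kernel/user address-space mixups:

    #ifdef __CHECKER__ /* defined only while sparse is running */
    # define __user  __attribute__((noderef, address_space(1)))
    # define __force __attribute__((force)) /* permit address-space casts */
    #else
    # define __user
    # define __force
    #endif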
+ */ + xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); +- return fxrstor_checking((__force struct i387_fxsave_struct *)buf); ++ return fxrstor_checking((struct i387_fxsave_struct __user *)buf); + } + + /* +@@ -228,7 +228,7 @@ int restore_i387_xstate(void __user *buf + if (task_thread_info(tsk)->status & TS_XSAVE) + err = restore_user_xstate(buf); + else +- err = fxrstor_checking((__force struct i387_fxsave_struct *) ++ err = fxrstor_checking((struct i387_fxsave_struct __user *) + buf); + if (unlikely(err)) { + /* +diff -urNp linux-2.6.33.1/arch/x86/kvm/emulate.c linux-2.6.33.1/arch/x86/kvm/emulate.c +--- linux-2.6.33.1/arch/x86/kvm/emulate.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kvm/emulate.c 2010-03-20 16:58:39.024538149 -0400 +@@ -413,6 +413,7 @@ static u32 group2_table[] = { + + #define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ + do { \ ++ unsigned long _tmp; \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op _suffix " %"_x"3,%1; " \ +@@ -426,8 +427,6 @@ static u32 group2_table[] = { + /* Raw emulation: instruction has two explicit operands. */ + #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ +- unsigned long _tmp; \ +- \ + switch ((_dst).bytes) { \ + case 2: \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ +@@ -443,7 +442,6 @@ static u32 group2_table[] = { + + #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ +- unsigned long _tmp; \ + switch ((_dst).bytes) { \ + case 1: \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ +diff -urNp linux-2.6.33.1/arch/x86/kvm/svm.c linux-2.6.33.1/arch/x86/kvm/svm.c +--- linux-2.6.33.1/arch/x86/kvm/svm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kvm/svm.c 2010-03-20 16:58:39.024538149 -0400 +@@ -2428,7 +2428,11 @@ static void reload_tss(struct kvm_vcpu * + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); ++ ++ pax_open_kernel(); + sd->tss_desc->type = 9; /* available 32/64-bit TSS */ ++ pax_close_kernel(); ++ + load_TR_desc(); + } + +@@ -2910,7 +2914,7 @@ static bool svm_gb_page_enable(void) + return true; + } + +-static struct kvm_x86_ops svm_x86_ops = { ++static const struct kvm_x86_ops svm_x86_ops = { + .cpu_has_kvm_support = has_svm, + .disabled_by_bios = is_disabled, + .hardware_setup = svm_hardware_setup, +diff -urNp linux-2.6.33.1/arch/x86/kvm/vmx.c linux-2.6.33.1/arch/x86/kvm/vmx.c +--- linux-2.6.33.1/arch/x86/kvm/vmx.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kvm/vmx.c 2010-03-20 16:58:39.024538149 -0400 +@@ -580,7 +580,11 @@ static void reload_tss(void) + + kvm_get_gdt(&gdt); + descs = (void *)gdt.base; ++ ++ pax_open_kernel(); + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ ++ pax_close_kernel(); ++ + load_TR_desc(); + } + +@@ -1385,8 +1389,11 @@ static __init int hardware_setup(void) + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + +- if (!cpu_has_vmx_tpr_shadow()) +- kvm_x86_ops->update_cr8_intercept = NULL; ++ if (!cpu_has_vmx_tpr_shadow()) { ++ pax_open_kernel(); ++ *(void **)&kvm_x86_ops->update_cr8_intercept = NULL; ++ pax_close_kernel(); ++ } + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); +@@ -2357,7 +2364,7 @@ static int vmx_vcpu_setup(struct vcpu_vm + vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + + asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); +- vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ ++ 
vmcs_writel(HOST_RIP, ktla_ktva(kvm_vmx_return)); /* 22.2.5 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); +@@ -3742,6 +3749,12 @@ static void vmx_vcpu_run(struct kvm_vcpu + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" + ".Lkvm_vmx_return: " ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ "ljmp %[cs],$.Lkvm_vmx_return2\n\t" ++ ".Lkvm_vmx_return2: " ++#endif ++ + /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" +@@ -3788,6 +3801,11 @@ static void vmx_vcpu_run(struct kvm_vcpu + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), + #endif + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ ,[cs]"i"(__KERNEL_CS) ++#endif ++ + : "cc", "memory" + , R"bx", R"di", R"si" + #ifdef CONFIG_X86_64 +@@ -3806,7 +3824,7 @@ static void vmx_vcpu_run(struct kvm_vcpu + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); + +- asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); ++ asm("mov %0, %%ds; mov %0, %%es" : : "r"(__KERNEL_DS)); + vmx->launched = 1; + + vmx_complete_interrupts(vmx); +@@ -3974,7 +3992,7 @@ static bool vmx_gb_page_enable(void) + return false; + } + +-static struct kvm_x86_ops vmx_x86_ops = { ++static const struct kvm_x86_ops vmx_x86_ops = { + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .hardware_setup = hardware_setup, +diff -urNp linux-2.6.33.1/arch/x86/kvm/x86.c linux-2.6.33.1/arch/x86/kvm/x86.c +--- linux-2.6.33.1/arch/x86/kvm/x86.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/kvm/x86.c 2010-03-20 16:58:39.024538149 -0400 +@@ -83,7 +83,7 @@ static void update_cr8_intercept(struct + static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); + +-struct kvm_x86_ops *kvm_x86_ops; ++const struct kvm_x86_ops *kvm_x86_ops; + EXPORT_SYMBOL_GPL(kvm_x86_ops); + + int ignore_msrs = 0; +@@ -109,38 +109,38 @@ static struct kvm_shared_msrs_global __r + static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); + + struct kvm_stats_debugfs_item debugfs_entries[] = { +- { "pf_fixed", VCPU_STAT(pf_fixed) }, +- { "pf_guest", VCPU_STAT(pf_guest) }, +- { "tlb_flush", VCPU_STAT(tlb_flush) }, +- { "invlpg", VCPU_STAT(invlpg) }, +- { "exits", VCPU_STAT(exits) }, +- { "io_exits", VCPU_STAT(io_exits) }, +- { "mmio_exits", VCPU_STAT(mmio_exits) }, +- { "signal_exits", VCPU_STAT(signal_exits) }, +- { "irq_window", VCPU_STAT(irq_window_exits) }, +- { "nmi_window", VCPU_STAT(nmi_window_exits) }, +- { "halt_exits", VCPU_STAT(halt_exits) }, +- { "halt_wakeup", VCPU_STAT(halt_wakeup) }, +- { "hypercalls", VCPU_STAT(hypercalls) }, +- { "request_irq", VCPU_STAT(request_irq_exits) }, +- { "irq_exits", VCPU_STAT(irq_exits) }, +- { "host_state_reload", VCPU_STAT(host_state_reload) }, +- { "efer_reload", VCPU_STAT(efer_reload) }, +- { "fpu_reload", VCPU_STAT(fpu_reload) }, +- { "insn_emulation", VCPU_STAT(insn_emulation) }, +- { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, +- { "irq_injections", VCPU_STAT(irq_injections) }, +- { "nmi_injections", VCPU_STAT(nmi_injections) }, +- { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, +- { "mmu_pte_write", VM_STAT(mmu_pte_write) }, +- { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, +- { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, +- { "mmu_flooded", 
VM_STAT(mmu_flooded) }, +- { "mmu_recycled", VM_STAT(mmu_recycled) }, +- { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, +- { "mmu_unsync", VM_STAT(mmu_unsync) }, +- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, +- { "largepages", VM_STAT(lpages) }, ++ { "pf_fixed", VCPU_STAT(pf_fixed), NULL }, ++ { "pf_guest", VCPU_STAT(pf_guest), NULL }, ++ { "tlb_flush", VCPU_STAT(tlb_flush), NULL }, ++ { "invlpg", VCPU_STAT(invlpg), NULL }, ++ { "exits", VCPU_STAT(exits), NULL }, ++ { "io_exits", VCPU_STAT(io_exits), NULL }, ++ { "mmio_exits", VCPU_STAT(mmio_exits), NULL }, ++ { "signal_exits", VCPU_STAT(signal_exits), NULL }, ++ { "irq_window", VCPU_STAT(irq_window_exits), NULL }, ++ { "nmi_window", VCPU_STAT(nmi_window_exits), NULL }, ++ { "halt_exits", VCPU_STAT(halt_exits), NULL }, ++ { "halt_wakeup", VCPU_STAT(halt_wakeup), NULL }, ++ { "hypercalls", VCPU_STAT(hypercalls), NULL }, ++ { "request_irq", VCPU_STAT(request_irq_exits), NULL }, ++ { "irq_exits", VCPU_STAT(irq_exits), NULL }, ++ { "host_state_reload", VCPU_STAT(host_state_reload), NULL }, ++ { "efer_reload", VCPU_STAT(efer_reload), NULL }, ++ { "fpu_reload", VCPU_STAT(fpu_reload), NULL }, ++ { "insn_emulation", VCPU_STAT(insn_emulation), NULL }, ++ { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail), NULL }, ++ { "irq_injections", VCPU_STAT(irq_injections), NULL }, ++ { "nmi_injections", VCPU_STAT(nmi_injections), NULL }, ++ { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped), NULL }, ++ { "mmu_pte_write", VM_STAT(mmu_pte_write), NULL }, ++ { "mmu_pte_updated", VM_STAT(mmu_pte_updated), NULL }, ++ { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped), NULL }, ++ { "mmu_flooded", VM_STAT(mmu_flooded), NULL }, ++ { "mmu_recycled", VM_STAT(mmu_recycled), NULL }, ++ { "mmu_cache_miss", VM_STAT(mmu_cache_miss), NULL }, ++ { "mmu_unsync", VM_STAT(mmu_unsync), NULL }, ++ { "remote_tlb_flush", VM_STAT(remote_tlb_flush), NULL }, ++ { "largepages", VM_STAT(lpages), NULL }, + { NULL } + }; + +@@ -1405,6 +1405,8 @@ long kvm_arch_dev_ioctl(struct file *fil + if (n < msr_list.nmsrs) + goto out; + r = -EFAULT; ++ if (num_msrs_to_save > ARRAY_SIZE(msrs_to_save)) ++ goto out; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + goto out; +@@ -1787,7 +1789,7 @@ static int kvm_vcpu_ioctl_set_lapic(stru + static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) + { +- if (irq->irq < 0 || irq->irq >= 256) ++ if (irq->irq >= 256) + return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; +@@ -3414,10 +3416,10 @@ static void kvm_timer_init(void) + } + } + +-int kvm_arch_init(void *opaque) ++int kvm_arch_init(const void *opaque) + { + int r; +- struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; ++ const struct kvm_x86_ops *ops = (const struct kvm_x86_ops *)opaque; + + if (kvm_x86_ops) { + printk(KERN_ERR "kvm: already loaded the other module\n"); +diff -urNp linux-2.6.33.1/arch/x86/lib/checksum_32.S linux-2.6.33.1/arch/x86/lib/checksum_32.S +--- linux-2.6.33.1/arch/x86/lib/checksum_32.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/checksum_32.S 2010-03-20 16:58:39.024538149 -0400 +@@ -28,7 +28,8 @@ + #include <linux/linkage.h> + #include <asm/dwarf2.h> + #include <asm/errno.h> +- ++#include <asm/segment.h> ++ + /* + * computes a partial checksum, e.g. 
for TCP/UDP fragments + */ +@@ -304,9 +305,22 @@ unsigned int csum_partial_copy_generic ( + + #define ARGBASE 16 + #define FP 12 +- +-ENTRY(csum_partial_copy_generic) ++ ++ENTRY(csum_partial_copy_generic_to_user) + CFI_STARTPROC ++ pushl $(__USER_DS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %es ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp csum_partial_copy_generic ++ ++ENTRY(csum_partial_copy_generic_from_user) ++ pushl $(__USER_DS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %ds ++ CFI_ADJUST_CFA_OFFSET -4 ++ ++ENTRY(csum_partial_copy_generic) + subl $4,%esp + CFI_ADJUST_CFA_OFFSET 4 + pushl %edi +@@ -331,7 +345,7 @@ ENTRY(csum_partial_copy_generic) + jmp 4f + SRC(1: movw (%esi), %bx ) + addl $2, %esi +-DST( movw %bx, (%edi) ) ++DST( movw %bx, %es:(%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +@@ -343,30 +357,30 @@ DST( movw %bx, (%edi) ) + SRC(1: movl (%esi), %ebx ) + SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +-DST( movl %ebx, (%edi) ) ++DST( movl %ebx, %es:(%edi) ) + adcl %edx, %eax +-DST( movl %edx, 4(%edi) ) ++DST( movl %edx, %es:4(%edi) ) + + SRC( movl 8(%esi), %ebx ) + SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +-DST( movl %ebx, 8(%edi) ) ++DST( movl %ebx, %es:8(%edi) ) + adcl %edx, %eax +-DST( movl %edx, 12(%edi) ) ++DST( movl %edx, %es:12(%edi) ) + + SRC( movl 16(%esi), %ebx ) + SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +-DST( movl %ebx, 16(%edi) ) ++DST( movl %ebx, %es:16(%edi) ) + adcl %edx, %eax +-DST( movl %edx, 20(%edi) ) ++DST( movl %edx, %es:20(%edi) ) + + SRC( movl 24(%esi), %ebx ) + SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +-DST( movl %ebx, 24(%edi) ) ++DST( movl %ebx, %es:24(%edi) ) + adcl %edx, %eax +-DST( movl %edx, 28(%edi) ) ++DST( movl %edx, %es:28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi +@@ -380,7 +394,7 @@ DST( movl %edx, 28(%edi) ) + shrl $2, %edx # This clears CF + SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +-DST( movl %ebx, (%edi) ) ++DST( movl %ebx, %es:(%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx +@@ -392,12 +406,12 @@ DST( movl %ebx, (%edi) ) + jb 5f + SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +-DST( movw %cx, (%edi) ) ++DST( movw %cx, %es:(%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx + SRC(5: movb (%esi), %cl ) +-DST( movb %cl, (%edi) ) ++DST( movb %cl, %es:(%edi) ) + 6: addl %ecx, %eax + adcl $0, %eax + 7: +@@ -408,7 +422,7 @@ DST( movb %cl, (%edi) ) + + 6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr +- movl $-EFAULT, (%ebx) ++ movl $-EFAULT, %ss:(%ebx) + + # zero the complete destination - computing the rest + # is too much work +@@ -421,11 +435,19 @@ DST( movb %cl, (%edi) ) + + 6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr +- movl $-EFAULT,(%ebx) ++ movl $-EFAULT,%ss:(%ebx) + jmp 5000b + + .previous + ++ pushl %ss ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %ds ++ CFI_ADJUST_CFA_OFFSET -4 ++ pushl %ss ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %es ++ CFI_ADJUST_CFA_OFFSET -4 + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx +@@ -439,26 +461,41 @@ DST( movb %cl, (%edi) ) + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC +-ENDPROC(csum_partial_copy_generic) ++ENDPROC(csum_partial_copy_generic_to_user) + + #else + + /* Version for PentiumII/PPro */ + + #define ROUND1(x) \ ++ nop; nop; nop; \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, %eax ; \ +- DST(movl %ebx, x(%edi) ) ; ++ DST(movl %ebx, %es:x(%edi)) ; + + #define ROUND(x) \ ++ nop; nop; nop; \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ +- DST(movl %ebx, x(%edi) ) ; ++ DST(movl %ebx, %es:x(%edi)) ; + + #define ARGBASE 12 +- +-ENTRY(csum_partial_copy_generic) ++ 
++ENTRY(csum_partial_copy_generic_to_user) + CFI_STARTPROC ++ pushl $(__USER_DS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %es ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp csum_partial_copy_generic ++ ++ENTRY(csum_partial_copy_generic_from_user) ++ pushl $(__USER_DS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %ds ++ CFI_ADJUST_CFA_OFFSET -4 ++ ++ENTRY(csum_partial_copy_generic) + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 +@@ -482,7 +519,7 @@ ENTRY(csum_partial_copy_generic) + subl %ebx, %edi + lea -1(%esi),%edx + andl $-32,%edx +- lea 3f(%ebx,%ebx), %ebx ++ lea 3f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + 1: addl $64,%esi +@@ -503,19 +540,19 @@ ENTRY(csum_partial_copy_generic) + jb 5f + SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +-DST( movw %dx, (%edi) ) ++DST( movw %dx, %es:(%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx + 5: + SRC( movb (%esi), %dl ) +-DST( movb %dl, (%edi) ) ++DST( movb %dl, %es:(%edi) ) + 6: addl %edx, %eax + adcl $0, %eax + 7: + .section .fixup, "ax" + 6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr +- movl $-EFAULT, (%ebx) ++ movl $-EFAULT, %ss:(%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len +@@ -523,10 +560,18 @@ DST( movb %dl, (%edi) ) + rep; stosb + jmp 7b + 6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr +- movl $-EFAULT, (%ebx) ++ movl $-EFAULT, %ss:(%ebx) + jmp 7b + .previous + ++ pushl %ss ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %ds ++ CFI_ADJUST_CFA_OFFSET -4 ++ pushl %ss ++ CFI_ADJUST_CFA_OFFSET 4 ++ popl %es ++ CFI_ADJUST_CFA_OFFSET -4 + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi +@@ -538,7 +583,7 @@ DST( movb %dl, (%edi) ) + CFI_RESTORE ebx + ret + CFI_ENDPROC +-ENDPROC(csum_partial_copy_generic) ++ENDPROC(csum_partial_copy_generic_to_user) + + #undef ROUND + #undef ROUND1 +diff -urNp linux-2.6.33.1/arch/x86/lib/clear_page_64.S linux-2.6.33.1/arch/x86/lib/clear_page_64.S +--- linux-2.6.33.1/arch/x86/lib/clear_page_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/clear_page_64.S 2010-03-20 16:58:39.028522513 -0400 +@@ -43,7 +43,7 @@ ENDPROC(clear_page) + + #include <asm/cpufeature.h> + +- .section .altinstr_replacement,"ax" ++ .section .altinstr_replacement,"a" + 1: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ + 2: +diff -urNp linux-2.6.33.1/arch/x86/lib/copy_page_64.S linux-2.6.33.1/arch/x86/lib/copy_page_64.S +--- linux-2.6.33.1/arch/x86/lib/copy_page_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/copy_page_64.S 2010-03-20 16:58:39.028522513 -0400 +@@ -104,7 +104,7 @@ ENDPROC(copy_page) + + #include <asm/cpufeature.h> + +- .section .altinstr_replacement,"ax" ++ .section .altinstr_replacement,"a" + 1: .byte 0xeb /* jmp <disp8> */ + .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ + 2: +diff -urNp linux-2.6.33.1/arch/x86/lib/copy_user_64.S linux-2.6.33.1/arch/x86/lib/copy_user_64.S +--- linux-2.6.33.1/arch/x86/lib/copy_user_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/copy_user_64.S 2010-03-20 16:58:39.028522513 -0400 +@@ -21,7 +21,7 @@ + .byte 0xe9 /* 32bit jump */ + .long \orig-1f /* by default jump to orig */ + 1: +- .section .altinstr_replacement,"ax" ++ .section .altinstr_replacement,"a" + 2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt-1b /* offset */ /* or alternatively to alt */ + .previous +@@ -64,32 +64,6 @@ + #endif + .endm + +-/* Standard copy_to_user with segment limit checking */ 
+-ENTRY(_copy_to_user) +- CFI_STARTPROC +- GET_THREAD_INFO(%rax) +- movq %rdi,%rcx +- addq %rdx,%rcx +- jc bad_to_user +- cmpq TI_addr_limit(%rax),%rcx +- jae bad_to_user +- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string +- CFI_ENDPROC +-ENDPROC(_copy_to_user) +- +-/* Standard copy_from_user with segment limit checking */ +-ENTRY(_copy_from_user) +- CFI_STARTPROC +- GET_THREAD_INFO(%rax) +- movq %rsi,%rcx +- addq %rdx,%rcx +- jc bad_from_user +- cmpq TI_addr_limit(%rax),%rcx +- jae bad_from_user +- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string +- CFI_ENDPROC +-ENDPROC(_copy_from_user) +- + ENTRY(copy_user_generic) + CFI_STARTPROC + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string +@@ -101,6 +75,8 @@ ENDPROC(copy_user_generic) + ENTRY(bad_from_user) + bad_from_user: + CFI_STARTPROC ++ testl %edx,%edx ++ js bad_to_user + movl %edx,%ecx + xorl %eax,%eax + rep +diff -urNp linux-2.6.33.1/arch/x86/lib/getuser.S linux-2.6.33.1/arch/x86/lib/getuser.S +--- linux-2.6.33.1/arch/x86/lib/getuser.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/getuser.S 2010-03-20 16:58:39.028522513 -0400 +@@ -33,14 +33,28 @@ + #include <asm/asm-offsets.h> + #include <asm/thread_info.h> + #include <asm/asm.h> ++#include <asm/segment.h> + + .text + ENTRY(__get_user_1) + CFI_STARTPROC ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl $(__USER_DS) ++ popl %ds ++#else + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++#endif ++ + 1: movzb (%_ASM_AX),%edx ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ pop %ds ++#endif ++ + xor %eax,%eax + ret + CFI_ENDPROC +@@ -49,11 +63,24 @@ ENDPROC(__get_user_1) + ENTRY(__get_user_2) + CFI_STARTPROC + add $1,%_ASM_AX ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl $(__USER_DS) ++ popl %ds ++#else + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++#endif ++ + 2: movzwl -1(%_ASM_AX),%edx ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ pop %ds ++#endif ++ + xor %eax,%eax + ret + CFI_ENDPROC +@@ -62,11 +89,24 @@ ENDPROC(__get_user_2) + ENTRY(__get_user_4) + CFI_STARTPROC + add $3,%_ASM_AX ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl $(__USER_DS) ++ popl %ds ++#else + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++#endif ++ + 3: mov -3(%_ASM_AX),%edx ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ pop %ds ++#endif ++ + xor %eax,%eax + ret + CFI_ENDPROC +@@ -89,6 +129,12 @@ ENDPROC(__get_user_8) + + bad_get_user: + CFI_STARTPROC ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ pop %ds ++#endif ++ + xor %edx,%edx + mov $(-EFAULT),%_ASM_AX + ret +diff -urNp linux-2.6.33.1/arch/x86/lib/memcpy_64.S linux-2.6.33.1/arch/x86/lib/memcpy_64.S +--- linux-2.6.33.1/arch/x86/lib/memcpy_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/memcpy_64.S 2010-03-20 16:58:39.028522513 -0400 +@@ -128,7 +128,7 @@ ENDPROC(__memcpy) + * It is also a lot simpler. 
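The copy_user_64.S hunk above drops the _copy_to_user/_copy_from_user entry points (matching the export removals in x8664_ksyms_64.c earlier) and adds a sign test so that a negative length reaching bad_from_user is treated as an error instead of feeding the destination-zeroing rep stos a huge count. In C terms, the range check the removed assembly performed is roughly the following sketch, where addr_limit stands in for the thread's TI_addr_limit:

    #include <stdbool.h>

    static bool range_ok(unsigned long addr, unsigned long size,
                         unsigned long addr_limit)
    {
        unsigned long end = addr + size;

        return end >= addr       /* no wraparound: the asm's "jc" test  */
            && end < addr_limit; /* below the limit: the "jae" test    */
    }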
Use this when possible: + */ + +- .section .altinstr_replacement, "ax" ++ .section .altinstr_replacement, "a" + 1: .byte 0xeb /* jmp <disp8> */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ + 2: +diff -urNp linux-2.6.33.1/arch/x86/lib/memset_64.S linux-2.6.33.1/arch/x86/lib/memset_64.S +--- linux-2.6.33.1/arch/x86/lib/memset_64.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/memset_64.S 2010-03-20 16:58:39.028522513 -0400 +@@ -118,7 +118,7 @@ ENDPROC(__memset) + + #include <asm/cpufeature.h> + +- .section .altinstr_replacement,"ax" ++ .section .altinstr_replacement,"a" + 1: .byte 0xeb /* jmp <disp8> */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ + 2: +diff -urNp linux-2.6.33.1/arch/x86/lib/mmx_32.c linux-2.6.33.1/arch/x86/lib/mmx_32.c +--- linux-2.6.33.1/arch/x86/lib/mmx_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/mmx_32.c 2010-03-20 16:58:39.028522513 -0400 +@@ -29,6 +29,7 @@ void *_mmx_memcpy(void *to, const void * + { + void *p; + int i; ++ unsigned long cr0; + + if (unlikely(in_interrupt())) + return __memcpy(to, from, len); +@@ -39,44 +40,72 @@ void *_mmx_memcpy(void *to, const void * + kernel_fpu_begin(); + + __asm__ __volatile__ ( +- "1: prefetch (%0)\n" /* This set is 28 bytes */ +- " prefetch 64(%0)\n" +- " prefetch 128(%0)\n" +- " prefetch 192(%0)\n" +- " prefetch 256(%0)\n" ++ "1: prefetch (%1)\n" /* This set is 28 bytes */ ++ " prefetch 64(%1)\n" ++ " prefetch 128(%1)\n" ++ " prefetch 192(%1)\n" ++ " prefetch 256(%1)\n" + "2: \n" + ".section .fixup, "ax"\n" +- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ ++ "3: \n" ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %%cr0, %0\n" ++ " movl %0, %%eax\n" ++ " andl $0xFFFEFFFF, %%eax\n" ++ " movl %%eax, %%cr0\n" ++#endif ++ ++ " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %0, %%cr0\n" ++#endif ++ + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) +- : : "r" (from)); ++ : "=&r" (cr0) : "r" (from) : "ax"); + + for ( ; i > 5; i--) { + __asm__ __volatile__ ( +- "1: prefetch 320(%0)\n" +- "2: movq (%0), %%mm0\n" +- " movq 8(%0), %%mm1\n" +- " movq 16(%0), %%mm2\n" +- " movq 24(%0), %%mm3\n" +- " movq %%mm0, (%1)\n" +- " movq %%mm1, 8(%1)\n" +- " movq %%mm2, 16(%1)\n" +- " movq %%mm3, 24(%1)\n" +- " movq 32(%0), %%mm0\n" +- " movq 40(%0), %%mm1\n" +- " movq 48(%0), %%mm2\n" +- " movq 56(%0), %%mm3\n" +- " movq %%mm0, 32(%1)\n" +- " movq %%mm1, 40(%1)\n" +- " movq %%mm2, 48(%1)\n" +- " movq %%mm3, 56(%1)\n" ++ "1: prefetch 320(%1)\n" ++ "2: movq (%1), %%mm0\n" ++ " movq 8(%1), %%mm1\n" ++ " movq 16(%1), %%mm2\n" ++ " movq 24(%1), %%mm3\n" ++ " movq %%mm0, (%2)\n" ++ " movq %%mm1, 8(%2)\n" ++ " movq %%mm2, 16(%2)\n" ++ " movq %%mm3, 24(%2)\n" ++ " movq 32(%1), %%mm0\n" ++ " movq 40(%1), %%mm1\n" ++ " movq 48(%1), %%mm2\n" ++ " movq 56(%1), %%mm3\n" ++ " movq %%mm0, 32(%2)\n" ++ " movq %%mm1, 40(%2)\n" ++ " movq %%mm2, 48(%2)\n" ++ " movq %%mm3, 56(%2)\n" + ".section .fixup, "ax"\n" +- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ ++ "3:\n" ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %%cr0, %0\n" ++ " movl %0, %%eax\n" ++ " andl $0xFFFEFFFF, %%eax\n" ++ " movl %%eax, %%cr0\n" ++#endif ++ ++ " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %0, %%cr0\n" ++#endif ++ + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) +- : : "r" (from), "r" (to) : "memory"); ++ : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); + + from += 64; + to += 64; +@@ -158,6 +187,7 @@ static void fast_clear_page(void *page) + 
static void fast_copy_page(void *to, void *from) + { + int i; ++ unsigned long cr0; + + kernel_fpu_begin(); + +@@ -166,42 +196,70 @@ static void fast_copy_page(void *to, voi + * but that is for later. -AV + */ + __asm__ __volatile__( +- "1: prefetch (%0)\n" +- " prefetch 64(%0)\n" +- " prefetch 128(%0)\n" +- " prefetch 192(%0)\n" +- " prefetch 256(%0)\n" ++ "1: prefetch (%1)\n" ++ " prefetch 64(%1)\n" ++ " prefetch 128(%1)\n" ++ " prefetch 192(%1)\n" ++ " prefetch 256(%1)\n" + "2: \n" + ".section .fixup, "ax"\n" +- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ ++ "3: \n" ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %%cr0, %0\n" ++ " movl %0, %%eax\n" ++ " andl $0xFFFEFFFF, %%eax\n" ++ " movl %%eax, %%cr0\n" ++#endif ++ ++ " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %0, %%cr0\n" ++#endif ++ + " jmp 2b\n" + ".previous\n" +- _ASM_EXTABLE(1b, 3b) : : "r" (from)); ++ _ASM_EXTABLE(1b, 3b) : "=&r" (cr0) : "r" (from) : "ax"); + + for (i = 0; i < (4096-320)/64; i++) { + __asm__ __volatile__ ( +- "1: prefetch 320(%0)\n" +- "2: movq (%0), %%mm0\n" +- " movntq %%mm0, (%1)\n" +- " movq 8(%0), %%mm1\n" +- " movntq %%mm1, 8(%1)\n" +- " movq 16(%0), %%mm2\n" +- " movntq %%mm2, 16(%1)\n" +- " movq 24(%0), %%mm3\n" +- " movntq %%mm3, 24(%1)\n" +- " movq 32(%0), %%mm4\n" +- " movntq %%mm4, 32(%1)\n" +- " movq 40(%0), %%mm5\n" +- " movntq %%mm5, 40(%1)\n" +- " movq 48(%0), %%mm6\n" +- " movntq %%mm6, 48(%1)\n" +- " movq 56(%0), %%mm7\n" +- " movntq %%mm7, 56(%1)\n" ++ "1: prefetch 320(%1)\n" ++ "2: movq (%1), %%mm0\n" ++ " movntq %%mm0, (%2)\n" ++ " movq 8(%1), %%mm1\n" ++ " movntq %%mm1, 8(%2)\n" ++ " movq 16(%1), %%mm2\n" ++ " movntq %%mm2, 16(%2)\n" ++ " movq 24(%1), %%mm3\n" ++ " movntq %%mm3, 24(%2)\n" ++ " movq 32(%1), %%mm4\n" ++ " movntq %%mm4, 32(%2)\n" ++ " movq 40(%1), %%mm5\n" ++ " movntq %%mm5, 40(%2)\n" ++ " movq 48(%1), %%mm6\n" ++ " movntq %%mm6, 48(%2)\n" ++ " movq 56(%1), %%mm7\n" ++ " movntq %%mm7, 56(%2)\n" + ".section .fixup, "ax"\n" +- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ ++ "3:\n" ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %%cr0, %0\n" ++ " movl %0, %%eax\n" ++ " andl $0xFFFEFFFF, %%eax\n" ++ " movl %%eax, %%cr0\n" ++#endif ++ ++ " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %0, %%cr0\n" ++#endif ++ + " jmp 2b\n" + ".previous\n" +- _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory"); ++ _ASM_EXTABLE(1b, 3b) : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); + + from += 64; + to += 64; +@@ -280,47 +338,76 @@ static void fast_clear_page(void *page) + static void fast_copy_page(void *to, void *from) + { + int i; ++ unsigned long cr0; + + kernel_fpu_begin(); + + __asm__ __volatile__ ( +- "1: prefetch (%0)\n" +- " prefetch 64(%0)\n" +- " prefetch 128(%0)\n" +- " prefetch 192(%0)\n" +- " prefetch 256(%0)\n" ++ "1: prefetch (%1)\n" ++ " prefetch 64(%1)\n" ++ " prefetch 128(%1)\n" ++ " prefetch 192(%1)\n" ++ " prefetch 256(%1)\n" + "2: \n" + ".section .fixup, "ax"\n" +- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ ++ "3: \n" ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %%cr0, %0\n" ++ " movl %0, %%eax\n" ++ " andl $0xFFFEFFFF, %%eax\n" ++ " movl %%eax, %%cr0\n" ++#endif ++ ++ " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %0, %%cr0\n" ++#endif ++ + " jmp 2b\n" + ".previous\n" +- _ASM_EXTABLE(1b, 3b) : : "r" (from)); ++ _ASM_EXTABLE(1b, 3b) : "=&r" (cr0) : "r" (from) : "ax"); + + for (i = 0; i < 4096/64; i++) { + __asm__ __volatile__ ( +- "1: prefetch 320(%0)\n" +- 
"2: movq (%0), %%mm0\n" +- " movq 8(%0), %%mm1\n" +- " movq 16(%0), %%mm2\n" +- " movq 24(%0), %%mm3\n" +- " movq %%mm0, (%1)\n" +- " movq %%mm1, 8(%1)\n" +- " movq %%mm2, 16(%1)\n" +- " movq %%mm3, 24(%1)\n" +- " movq 32(%0), %%mm0\n" +- " movq 40(%0), %%mm1\n" +- " movq 48(%0), %%mm2\n" +- " movq 56(%0), %%mm3\n" +- " movq %%mm0, 32(%1)\n" +- " movq %%mm1, 40(%1)\n" +- " movq %%mm2, 48(%1)\n" +- " movq %%mm3, 56(%1)\n" ++ "1: prefetch 320(%1)\n" ++ "2: movq (%1), %%mm0\n" ++ " movq 8(%1), %%mm1\n" ++ " movq 16(%1), %%mm2\n" ++ " movq 24(%1), %%mm3\n" ++ " movq %%mm0, (%2)\n" ++ " movq %%mm1, 8(%2)\n" ++ " movq %%mm2, 16(%2)\n" ++ " movq %%mm3, 24(%2)\n" ++ " movq 32(%1), %%mm0\n" ++ " movq 40(%1), %%mm1\n" ++ " movq 48(%1), %%mm2\n" ++ " movq 56(%1), %%mm3\n" ++ " movq %%mm0, 32(%2)\n" ++ " movq %%mm1, 40(%2)\n" ++ " movq %%mm2, 48(%2)\n" ++ " movq %%mm3, 56(%2)\n" + ".section .fixup, "ax"\n" +- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ ++ "3:\n" ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %%cr0, %0\n" ++ " movl %0, %%eax\n" ++ " andl $0xFFFEFFFF, %%eax\n" ++ " movl %%eax, %%cr0\n" ++#endif ++ ++ " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ " movl %0, %%cr0\n" ++#endif ++ + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) +- : : "r" (from), "r" (to) : "memory"); ++ : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); + + from += 64; + to += 64; +diff -urNp linux-2.6.33.1/arch/x86/lib/putuser.S linux-2.6.33.1/arch/x86/lib/putuser.S +--- linux-2.6.33.1/arch/x86/lib/putuser.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/putuser.S 2010-03-20 16:58:39.028522513 -0400 +@@ -15,6 +15,7 @@ + #include <asm/thread_info.h> + #include <asm/errno.h> + #include <asm/asm.h> ++#include <asm/segment.h> + + + /* +@@ -29,59 +30,120 @@ + * as they get called from within inline assembly. 
+ */ + +-#define ENTER CFI_STARTPROC ; \ +- GET_THREAD_INFO(%_ASM_BX) ++#define ENTER CFI_STARTPROC + #define EXIT ret ; \ + CFI_ENDPROC + + .text + ENTRY(__put_user_1) + ENTER ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl $(__USER_DS) ++ popl %ds ++#else ++ GET_THREAD_INFO(%_ASM_BX) + cmp TI_addr_limit(%_ASM_BX),%_ASM_CX + jae bad_put_user ++#endif ++ + 1: movb %al,(%_ASM_CX) ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ popl %ds ++#endif ++ + xor %eax,%eax + EXIT + ENDPROC(__put_user_1) + + ENTRY(__put_user_2) + ENTER ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl $(__USER_DS) ++ popl %ds ++#else ++ GET_THREAD_INFO(%_ASM_BX) + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $1,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX + jae bad_put_user ++#endif ++ + 2: movw %ax,(%_ASM_CX) ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ popl %ds ++#endif ++ + xor %eax,%eax + EXIT + ENDPROC(__put_user_2) + + ENTRY(__put_user_4) + ENTER ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl $(__USER_DS) ++ popl %ds ++#else ++ GET_THREAD_INFO(%_ASM_BX) + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $3,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX + jae bad_put_user ++#endif ++ + 3: movl %eax,(%_ASM_CX) ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ popl %ds ++#endif ++ + xor %eax,%eax + EXIT + ENDPROC(__put_user_4) + + ENTRY(__put_user_8) + ENTER ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl $(__USER_DS) ++ popl %ds ++#else ++ GET_THREAD_INFO(%_ASM_BX) + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $7,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX + jae bad_put_user ++#endif ++ + 4: mov %_ASM_AX,(%_ASM_CX) + #ifdef CONFIG_X86_32 + 5: movl %edx,4(%_ASM_CX) + #endif ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ popl %ds ++#endif ++ + xor %eax,%eax + EXIT + ENDPROC(__put_user_8) + + bad_put_user: + CFI_STARTPROC ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_MEMORY_UDEREF) ++ pushl %ss ++ popl %ds ++#endif ++ + movl $-EFAULT,%eax + EXIT + END(bad_put_user) +diff -urNp linux-2.6.33.1/arch/x86/lib/usercopy_32.c linux-2.6.33.1/arch/x86/lib/usercopy_32.c +--- linux-2.6.33.1/arch/x86/lib/usercopy_32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/lib/usercopy_32.c 2010-03-20 16:58:39.028522513 -0400 +@@ -36,31 +36,38 @@ static inline int __movsl_is_ok(unsigned + * Copy a null terminated string from userspace. 
+ */ + +-#define __do_strncpy_from_user(dst, src, count, res) \ +-do { \ +- int __d0, __d1, __d2; \ +- might_fault(); \ +- __asm__ __volatile__( \ +- " testl %1,%1\n" \ +- " jz 2f\n" \ +- "0: lodsb\n" \ +- " stosb\n" \ +- " testb %%al,%%al\n" \ +- " jz 1f\n" \ +- " decl %1\n" \ +- " jnz 0b\n" \ +- "1: subl %1,%0\n" \ +- "2:\n" \ +- ".section .fixup,"ax"\n" \ +- "3: movl %5,%0\n" \ +- " jmp 2b\n" \ +- ".previous\n" \ +- _ASM_EXTABLE(0b,3b) \ +- : "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \ +- "=&D" (__d2) \ +- : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ +- : "memory"); \ +-} while (0) ++static long __do_strncpy_from_user(char *dst, const char __user *src, long count) ++{ ++ int __d0, __d1, __d2; ++ long res = -EFAULT; ++ ++ might_fault(); ++ __asm__ __volatile__( ++ " movw %w10,%%ds\n" ++ " testl %1,%1\n" ++ " jz 2f\n" ++ "0: lodsb\n" ++ " stosb\n" ++ " testb %%al,%%al\n" ++ " jz 1f\n" ++ " decl %1\n" ++ " jnz 0b\n" ++ "1: subl %1,%0\n" ++ "2:\n" ++ " pushl %%ss\n" ++ " popl %%ds\n" ++ ".section .fixup,"ax"\n" ++ "3: movl %5,%0\n" ++ " jmp 2b\n" ++ ".previous\n" ++ _ASM_EXTABLE(0b,3b) ++ : "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), ++ "=&D" (__d2) ++ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst), ++ "r"(__USER_DS) ++ : "memory"); ++ return res; ++} + + /** + * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking. +@@ -85,9 +92,7 @@ do { \ + long + __strncpy_from_user(char *dst, const char __user *src, long count) + { +- long res; +- __do_strncpy_from_user(dst, src, count, res); +- return res; ++ return __do_strncpy_from_user(dst, src, count); + } + EXPORT_SYMBOL(__strncpy_from_user); + +@@ -114,7 +119,7 @@ strncpy_from_user(char *dst, const char + { + long res = -EFAULT; + if (access_ok(VERIFY_READ, src, 1)) +- __do_strncpy_from_user(dst, src, count, res); ++ res = __do_strncpy_from_user(dst, src, count); + return res; + } + EXPORT_SYMBOL(strncpy_from_user); +@@ -123,24 +128,30 @@ EXPORT_SYMBOL(strncpy_from_user); + * Zero Userspace + */ + +-#define __do_clear_user(addr,size) \ +-do { \ +- int __d0; \ +- might_fault(); \ +- __asm__ __volatile__( \ +- "0: rep; stosl\n" \ +- " movl %2,%0\n" \ +- "1: rep; stosb\n" \ +- "2:\n" \ +- ".section .fixup,"ax"\n" \ +- "3: lea 0(%2,%0,4),%0\n" \ +- " jmp 2b\n" \ +- ".previous\n" \ +- _ASM_EXTABLE(0b,3b) \ +- _ASM_EXTABLE(1b,2b) \ +- : "=&c"(size), "=&D" (__d0) \ +- : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ +-} while (0) ++static unsigned long __do_clear_user(void __user *addr, unsigned long size) ++{ ++ int __d0; ++ ++ might_fault(); ++ __asm__ __volatile__( ++ " movw %w6,%%es\n" ++ "0: rep; stosl\n" ++ " movl %2,%0\n" ++ "1: rep; stosb\n" ++ "2:\n" ++ " pushl %%ss\n" ++ " popl %%es\n" ++ ".section .fixup,"ax"\n" ++ "3: lea 0(%2,%0,4),%0\n" ++ " jmp 2b\n" ++ ".previous\n" ++ _ASM_EXTABLE(0b,3b) ++ _ASM_EXTABLE(1b,2b) ++ : "=&c"(size), "=&D" (__d0) ++ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0), ++ "r"(__USER_DS)); ++ return size; ++} + + /** + * clear_user: - Zero a block of memory in user space. 
+@@ -157,7 +168,7 @@ clear_user(void __user *to, unsigned lon + { + might_fault(); + if (access_ok(VERIFY_WRITE, to, n)) +- __do_clear_user(to, n); ++ n = __do_clear_user(to, n); + return n; + } + EXPORT_SYMBOL(clear_user); +@@ -176,8 +187,7 @@ EXPORT_SYMBOL(clear_user); + unsigned long + __clear_user(void __user *to, unsigned long n) + { +- __do_clear_user(to, n); +- return n; ++ return __do_clear_user(to, n); + } + EXPORT_SYMBOL(__clear_user); + +@@ -200,14 +210,17 @@ long strnlen_user(const char __user *s, + might_fault(); + + __asm__ __volatile__( ++ " movw %w8,%%es\n" + " testl %0, %0\n" + " jz 3f\n" +- " andl %0,%%ecx\n" ++ " movl %0,%%ecx\n" + "0: repne; scasb\n" + " setne %%al\n" + " subl %%ecx,%0\n" + " addl %0,%%eax\n" + "1:\n" ++ " pushl %%ss\n" ++ " popl %%es\n" + ".section .fixup,"ax"\n" + "2: xorl %%eax,%%eax\n" + " jmp 1b\n" +@@ -219,7 +232,7 @@ long strnlen_user(const char __user *s, + " .long 0b,2b\n" + ".previous" + :"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp) +- :"0" (n), "1" (s), "2" (0), "3" (mask) ++ :"0" (n), "1" (s), "2" (0), "3" (mask), "r" (__USER_DS) + :"cc"); + return res & mask; + } +@@ -227,10 +240,121 @@ EXPORT_SYMBOL(strnlen_user); + + #ifdef CONFIG_X86_INTEL_USERCOPY + static unsigned long +-__copy_user_intel(void __user *to, const void *from, unsigned long size) ++__generic_copy_to_user_intel(void __user *to, const void *from, unsigned long size) ++{ ++ int d0, d1; ++ __asm__ __volatile__( ++ " movw %w6, %%es\n" ++ " .align 2,0x90\n" ++ "1: movl 32(%4), %%eax\n" ++ " cmpl $67, %0\n" ++ " jbe 3f\n" ++ "2: movl 64(%4), %%eax\n" ++ " .align 2,0x90\n" ++ "3: movl 0(%4), %%eax\n" ++ "4: movl 4(%4), %%edx\n" ++ "5: movl %%eax, %%es:0(%3)\n" ++ "6: movl %%edx, %%es:4(%3)\n" ++ "7: movl 8(%4), %%eax\n" ++ "8: movl 12(%4),%%edx\n" ++ "9: movl %%eax, %%es:8(%3)\n" ++ "10: movl %%edx, %%es:12(%3)\n" ++ "11: movl 16(%4), %%eax\n" ++ "12: movl 20(%4), %%edx\n" ++ "13: movl %%eax, %%es:16(%3)\n" ++ "14: movl %%edx, %%es:20(%3)\n" ++ "15: movl 24(%4), %%eax\n" ++ "16: movl 28(%4), %%edx\n" ++ "17: movl %%eax, %%es:24(%3)\n" ++ "18: movl %%edx, %%es:28(%3)\n" ++ "19: movl 32(%4), %%eax\n" ++ "20: movl 36(%4), %%edx\n" ++ "21: movl %%eax, %%es:32(%3)\n" ++ "22: movl %%edx, %%es:36(%3)\n" ++ "23: movl 40(%4), %%eax\n" ++ "24: movl 44(%4), %%edx\n" ++ "25: movl %%eax, %%es:40(%3)\n" ++ "26: movl %%edx, %%es:44(%3)\n" ++ "27: movl 48(%4), %%eax\n" ++ "28: movl 52(%4), %%edx\n" ++ "29: movl %%eax, %%es:48(%3)\n" ++ "30: movl %%edx, %%es:52(%3)\n" ++ "31: movl 56(%4), %%eax\n" ++ "32: movl 60(%4), %%edx\n" ++ "33: movl %%eax, %%es:56(%3)\n" ++ "34: movl %%edx, %%es:60(%3)\n" ++ " addl $-64, %0\n" ++ " addl $64, %4\n" ++ " addl $64, %3\n" ++ " cmpl $63, %0\n" ++ " ja 1b\n" ++ "35: movl %0, %%eax\n" ++ " shrl $2, %0\n" ++ " andl $3, %%eax\n" ++ " cld\n" ++ "99: rep; movsl\n" ++ "36: movl %%eax, %0\n" ++ "37: rep; movsb\n" ++ "100:\n" ++ " pushl %%ss\n" ++ " popl %%es\n" ++ ".section .fixup,"ax"\n" ++ "101: lea 0(%%eax,%0,4),%0\n" ++ " jmp 100b\n" ++ ".previous\n" ++ ".section __ex_table,"a"\n" ++ " .align 4\n" ++ " .long 1b,100b\n" ++ " .long 2b,100b\n" ++ " .long 3b,100b\n" ++ " .long 4b,100b\n" ++ " .long 5b,100b\n" ++ " .long 6b,100b\n" ++ " .long 7b,100b\n" ++ " .long 8b,100b\n" ++ " .long 9b,100b\n" ++ " .long 10b,100b\n" ++ " .long 11b,100b\n" ++ " .long 12b,100b\n" ++ " .long 13b,100b\n" ++ " .long 14b,100b\n" ++ " .long 15b,100b\n" ++ " .long 16b,100b\n" ++ " .long 17b,100b\n" ++ " .long 18b,100b\n" ++ " .long 19b,100b\n" ++ " .long 20b,100b\n" ++ " .long 21b,100b\n" 
++ " .long 22b,100b\n" ++ " .long 23b,100b\n" ++ " .long 24b,100b\n" ++ " .long 25b,100b\n" ++ " .long 26b,100b\n" ++ " .long 27b,100b\n" ++ " .long 28b,100b\n" ++ " .long 29b,100b\n" ++ " .long 30b,100b\n" ++ " .long 31b,100b\n" ++ " .long 32b,100b\n" ++ " .long 33b,100b\n" ++ " .long 34b,100b\n" ++ " .long 35b,100b\n" ++ " .long 36b,100b\n" ++ " .long 37b,100b\n" ++ " .long 99b,101b\n" ++ ".previous" ++ : "=&c"(size), "=&D" (d0), "=&S" (d1) ++ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) ++ : "eax", "edx", "memory"); ++ return size; ++} ++ ++static unsigned long ++__generic_copy_from_user_intel(void *to, const void __user *from, unsigned long size) + { + int d0, d1; + __asm__ __volatile__( ++ " movw %w6, %%ds\n" + " .align 2,0x90\n" + "1: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" +@@ -239,36 +363,36 @@ __copy_user_intel(void __user *to, const + " .align 2,0x90\n" + "3: movl 0(%4), %%eax\n" + "4: movl 4(%4), %%edx\n" +- "5: movl %%eax, 0(%3)\n" +- "6: movl %%edx, 4(%3)\n" ++ "5: movl %%eax, %%es:0(%3)\n" ++ "6: movl %%edx, %%es:4(%3)\n" + "7: movl 8(%4), %%eax\n" + "8: movl 12(%4),%%edx\n" +- "9: movl %%eax, 8(%3)\n" +- "10: movl %%edx, 12(%3)\n" ++ "9: movl %%eax, %%es:8(%3)\n" ++ "10: movl %%edx, %%es:12(%3)\n" + "11: movl 16(%4), %%eax\n" + "12: movl 20(%4), %%edx\n" +- "13: movl %%eax, 16(%3)\n" +- "14: movl %%edx, 20(%3)\n" ++ "13: movl %%eax, %%es:16(%3)\n" ++ "14: movl %%edx, %%es:20(%3)\n" + "15: movl 24(%4), %%eax\n" + "16: movl 28(%4), %%edx\n" +- "17: movl %%eax, 24(%3)\n" +- "18: movl %%edx, 28(%3)\n" ++ "17: movl %%eax, %%es:24(%3)\n" ++ "18: movl %%edx, %%es:28(%3)\n" + "19: movl 32(%4), %%eax\n" + "20: movl 36(%4), %%edx\n" +- "21: movl %%eax, 32(%3)\n" +- "22: movl %%edx, 36(%3)\n" ++ "21: movl %%eax, %%es:32(%3)\n" ++ "22: movl %%edx, %%es:36(%3)\n" + "23: movl 40(%4), %%eax\n" + "24: movl 44(%4), %%edx\n" +- "25: movl %%eax, 40(%3)\n" +- "26: movl %%edx, 44(%3)\n" ++ "25: movl %%eax, %%es:40(%3)\n" ++ "26: movl %%edx, %%es:44(%3)\n" + "27: movl 48(%4), %%eax\n" + "28: movl 52(%4), %%edx\n" +- "29: movl %%eax, 48(%3)\n" +- "30: movl %%edx, 52(%3)\n" ++ "29: movl %%eax, %%es:48(%3)\n" ++ "30: movl %%edx, %%es:52(%3)\n" + "31: movl 56(%4), %%eax\n" + "32: movl 60(%4), %%edx\n" +- "33: movl %%eax, 56(%3)\n" +- "34: movl %%edx, 60(%3)\n" ++ "33: movl %%eax, %%es:56(%3)\n" ++ "34: movl %%edx, %%es:60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" +@@ -282,6 +406,8 @@ __copy_user_intel(void __user *to, const + "36: movl %%eax, %0\n" + "37: rep; movsb\n" + "100:\n" ++ " pushl %%ss\n" ++ " popl %%ds\n" + ".section .fixup,"ax"\n" + "101: lea 0(%%eax,%0,4),%0\n" + " jmp 100b\n" +@@ -328,7 +454,7 @@ __copy_user_intel(void __user *to, const + " .long 99b,101b\n" + ".previous" + : "=&c"(size), "=&D" (d0), "=&S" (d1) +- : "1"(to), "2"(from), "0"(size) ++ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) + : "eax", "edx", "memory"); + return size; + } +@@ -338,6 +464,7 @@ __copy_user_zeroing_intel(void *to, cons + { + int d0, d1; + __asm__ __volatile__( ++ " movw %w6, %%ds\n" + " .align 2,0x90\n" + "0: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" +@@ -346,36 +473,36 @@ __copy_user_zeroing_intel(void *to, cons + " .align 2,0x90\n" + "2: movl 0(%4), %%eax\n" + "21: movl 4(%4), %%edx\n" +- " movl %%eax, 0(%3)\n" +- " movl %%edx, 4(%3)\n" ++ " movl %%eax, %%es:0(%3)\n" ++ " movl %%edx, %%es:4(%3)\n" + "3: movl 8(%4), %%eax\n" + "31: movl 12(%4),%%edx\n" +- " movl %%eax, 8(%3)\n" +- " movl %%edx, 12(%3)\n" ++ " movl %%eax, %%es:8(%3)\n" ++ " movl %%edx, %%es:12(%3)\n" + 
"4: movl 16(%4), %%eax\n" + "41: movl 20(%4), %%edx\n" +- " movl %%eax, 16(%3)\n" +- " movl %%edx, 20(%3)\n" ++ " movl %%eax, %%es:16(%3)\n" ++ " movl %%edx, %%es:20(%3)\n" + "10: movl 24(%4), %%eax\n" + "51: movl 28(%4), %%edx\n" +- " movl %%eax, 24(%3)\n" +- " movl %%edx, 28(%3)\n" ++ " movl %%eax, %%es:24(%3)\n" ++ " movl %%edx, %%es:28(%3)\n" + "11: movl 32(%4), %%eax\n" + "61: movl 36(%4), %%edx\n" +- " movl %%eax, 32(%3)\n" +- " movl %%edx, 36(%3)\n" ++ " movl %%eax, %%es:32(%3)\n" ++ " movl %%edx, %%es:36(%3)\n" + "12: movl 40(%4), %%eax\n" + "71: movl 44(%4), %%edx\n" +- " movl %%eax, 40(%3)\n" +- " movl %%edx, 44(%3)\n" ++ " movl %%eax, %%es:40(%3)\n" ++ " movl %%edx, %%es:44(%3)\n" + "13: movl 48(%4), %%eax\n" + "81: movl 52(%4), %%edx\n" +- " movl %%eax, 48(%3)\n" +- " movl %%edx, 52(%3)\n" ++ " movl %%eax, %%es:48(%3)\n" ++ " movl %%edx, %%es:52(%3)\n" + "14: movl 56(%4), %%eax\n" + "91: movl 60(%4), %%edx\n" +- " movl %%eax, 56(%3)\n" +- " movl %%edx, 60(%3)\n" ++ " movl %%eax, %%es:56(%3)\n" ++ " movl %%edx, %%es:60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" +@@ -389,6 +516,8 @@ __copy_user_zeroing_intel(void *to, cons + " movl %%eax,%0\n" + "7: rep; movsb\n" + "8:\n" ++ " pushl %%ss\n" ++ " popl %%ds\n" + ".section .fixup,"ax"\n" + "9: lea 0(%%eax,%0,4),%0\n" + "16: pushl %0\n" +@@ -423,7 +552,7 @@ __copy_user_zeroing_intel(void *to, cons + " .long 7b,16b\n" + ".previous" + : "=&c"(size), "=&D" (d0), "=&S" (d1) +- : "1"(to), "2"(from), "0"(size) ++ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) + : "eax", "edx", "memory"); + return size; + } +@@ -439,6 +568,7 @@ static unsigned long __copy_user_zeroing + int d0, d1; + + __asm__ __volatile__( ++ " movw %w6, %%ds\n" + " .align 2,0x90\n" + "0: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" +@@ -447,36 +577,36 @@ static unsigned long __copy_user_zeroing + " .align 2,0x90\n" + "2: movl 0(%4), %%eax\n" + "21: movl 4(%4), %%edx\n" +- " movnti %%eax, 0(%3)\n" +- " movnti %%edx, 4(%3)\n" ++ " movnti %%eax, %%es:0(%3)\n" ++ " movnti %%edx, %%es:4(%3)\n" + "3: movl 8(%4), %%eax\n" + "31: movl 12(%4),%%edx\n" +- " movnti %%eax, 8(%3)\n" +- " movnti %%edx, 12(%3)\n" ++ " movnti %%eax, %%es:8(%3)\n" ++ " movnti %%edx, %%es:12(%3)\n" + "4: movl 16(%4), %%eax\n" + "41: movl 20(%4), %%edx\n" +- " movnti %%eax, 16(%3)\n" +- " movnti %%edx, 20(%3)\n" ++ " movnti %%eax, %%es:16(%3)\n" ++ " movnti %%edx, %%es:20(%3)\n" + "10: movl 24(%4), %%eax\n" + "51: movl 28(%4), %%edx\n" +- " movnti %%eax, 24(%3)\n" +- " movnti %%edx, 28(%3)\n" ++ " movnti %%eax, %%es:24(%3)\n" ++ " movnti %%edx, %%es:28(%3)\n" + "11: movl 32(%4), %%eax\n" + "61: movl 36(%4), %%edx\n" +- " movnti %%eax, 32(%3)\n" +- " movnti %%edx, 36(%3)\n" ++ " movnti %%eax, %%es:32(%3)\n" ++ " movnti %%edx, %%es:36(%3)\n" + "12: movl 40(%4), %%eax\n" + "71: movl 44(%4), %%edx\n" +- " movnti %%eax, 40(%3)\n" +- " movnti %%edx, 44(%3)\n" ++ " movnti %%eax, %%es:40(%3)\n" ++ " movnti %%edx, %%es:44(%3)\n" + "13: movl 48(%4), %%eax\n" + "81: movl 52(%4), %%edx\n" +- " movnti %%eax, 48(%3)\n" +- " movnti %%edx, 52(%3)\n" ++ " movnti %%eax, %%es:48(%3)\n" ++ " movnti %%edx, %%es:52(%3)\n" + "14: movl 56(%4), %%eax\n" + "91: movl 60(%4), %%edx\n" +- " movnti %%eax, 56(%3)\n" +- " movnti %%edx, 60(%3)\n" ++ " movnti %%eax, %%es:56(%3)\n" ++ " movnti %%edx, %%es:60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" +@@ -491,6 +621,8 @@ static unsigned long __copy_user_zeroing + " movl %%eax,%0\n" + "7: rep; movsb\n" + "8:\n" ++ " pushl %%ss\n" ++ " popl 
%%ds\n" + ".section .fixup,"ax"\n" + "9: lea 0(%%eax,%0,4),%0\n" + "16: pushl %0\n" +@@ -525,7 +657,7 @@ static unsigned long __copy_user_zeroing + " .long 7b,16b\n" + ".previous" + : "=&c"(size), "=&D" (d0), "=&S" (d1) +- : "1"(to), "2"(from), "0"(size) ++ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) + : "eax", "edx", "memory"); + return size; + } +@@ -536,6 +668,7 @@ static unsigned long __copy_user_intel_n + int d0, d1; + + __asm__ __volatile__( ++ " movw %w6, %%ds\n" + " .align 2,0x90\n" + "0: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" +@@ -544,36 +677,36 @@ static unsigned long __copy_user_intel_n + " .align 2,0x90\n" + "2: movl 0(%4), %%eax\n" + "21: movl 4(%4), %%edx\n" +- " movnti %%eax, 0(%3)\n" +- " movnti %%edx, 4(%3)\n" ++ " movnti %%eax, %%es:0(%3)\n" ++ " movnti %%edx, %%es:4(%3)\n" + "3: movl 8(%4), %%eax\n" + "31: movl 12(%4),%%edx\n" +- " movnti %%eax, 8(%3)\n" +- " movnti %%edx, 12(%3)\n" ++ " movnti %%eax, %%es:8(%3)\n" ++ " movnti %%edx, %%es:12(%3)\n" + "4: movl 16(%4), %%eax\n" + "41: movl 20(%4), %%edx\n" +- " movnti %%eax, 16(%3)\n" +- " movnti %%edx, 20(%3)\n" ++ " movnti %%eax, %%es:16(%3)\n" ++ " movnti %%edx, %%es:20(%3)\n" + "10: movl 24(%4), %%eax\n" + "51: movl 28(%4), %%edx\n" +- " movnti %%eax, 24(%3)\n" +- " movnti %%edx, 28(%3)\n" ++ " movnti %%eax, %%es:24(%3)\n" ++ " movnti %%edx, %%es:28(%3)\n" + "11: movl 32(%4), %%eax\n" + "61: movl 36(%4), %%edx\n" +- " movnti %%eax, 32(%3)\n" +- " movnti %%edx, 36(%3)\n" ++ " movnti %%eax, %%es:32(%3)\n" ++ " movnti %%edx, %%es:36(%3)\n" + "12: movl 40(%4), %%eax\n" + "71: movl 44(%4), %%edx\n" +- " movnti %%eax, 40(%3)\n" +- " movnti %%edx, 44(%3)\n" ++ " movnti %%eax, %%es:40(%3)\n" ++ " movnti %%edx, %%es:44(%3)\n" + "13: movl 48(%4), %%eax\n" + "81: movl 52(%4), %%edx\n" +- " movnti %%eax, 48(%3)\n" +- " movnti %%edx, 52(%3)\n" ++ " movnti %%eax, %%es:48(%3)\n" ++ " movnti %%edx, %%es:52(%3)\n" + "14: movl 56(%4), %%eax\n" + "91: movl 60(%4), %%edx\n" +- " movnti %%eax, 56(%3)\n" +- " movnti %%edx, 60(%3)\n" ++ " movnti %%eax, %%es:56(%3)\n" ++ " movnti %%edx, %%es:60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" +@@ -588,6 +721,8 @@ static unsigned long __copy_user_intel_n + " movl %%eax,%0\n" + "7: rep; movsb\n" + "8:\n" ++ " pushl %%ss\n" ++ " popl %%ds\n" + ".section .fixup,"ax"\n" + "9: lea 0(%%eax,%0,4),%0\n" + "16: jmp 8b\n" +@@ -616,7 +751,7 @@ static unsigned long __copy_user_intel_n + " .long 7b,16b\n" + ".previous" + : "=&c"(size), "=&D" (d0), "=&S" (d1) +- : "1"(to), "2"(from), "0"(size) ++ : "1"(to), "2"(from), "0"(size), "r"(__USER_DS) + : "eax", "edx", "memory"); + return size; + } +@@ -629,90 +764,146 @@ static unsigned long __copy_user_intel_n + */ + unsigned long __copy_user_zeroing_intel(void *to, const void __user *from, + unsigned long size); +-unsigned long __copy_user_intel(void __user *to, const void *from, ++unsigned long __generic_copy_to_user_intel(void __user *to, const void *from, ++ unsigned long size); ++unsigned long __generic_copy_from_user_intel(void *to, const void __user *from, + unsigned long size); + unsigned long __copy_user_zeroing_intel_nocache(void *to, + const void __user *from, unsigned long size); + #endif /* CONFIG_X86_INTEL_USERCOPY */ + + /* Generic arbitrary sized copy. 
*/ +-#define __copy_user(to, from, size) \ +-do { \ +- int __d0, __d1, __d2; \ +- __asm__ __volatile__( \ +- " cmp $7,%0\n" \ +- " jbe 1f\n" \ +- " movl %1,%0\n" \ +- " negl %0\n" \ +- " andl $7,%0\n" \ +- " subl %0,%3\n" \ +- "4: rep; movsb\n" \ +- " movl %3,%0\n" \ +- " shrl $2,%0\n" \ +- " andl $3,%3\n" \ +- " .align 2,0x90\n" \ +- "0: rep; movsl\n" \ +- " movl %3,%0\n" \ +- "1: rep; movsb\n" \ +- "2:\n" \ +- ".section .fixup,"ax"\n" \ +- "5: addl %3,%0\n" \ +- " jmp 2b\n" \ +- "3: lea 0(%3,%0,4),%0\n" \ +- " jmp 2b\n" \ +- ".previous\n" \ +- ".section __ex_table,"a"\n" \ +- " .align 4\n" \ +- " .long 4b,5b\n" \ +- " .long 0b,3b\n" \ +- " .long 1b,2b\n" \ +- ".previous" \ +- : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ +- : "3"(size), "0"(size), "1"(to), "2"(from) \ +- : "memory"); \ +-} while (0) +- +-#define __copy_user_zeroing(to, from, size) \ +-do { \ +- int __d0, __d1, __d2; \ +- __asm__ __volatile__( \ +- " cmp $7,%0\n" \ +- " jbe 1f\n" \ +- " movl %1,%0\n" \ +- " negl %0\n" \ +- " andl $7,%0\n" \ +- " subl %0,%3\n" \ +- "4: rep; movsb\n" \ +- " movl %3,%0\n" \ +- " shrl $2,%0\n" \ +- " andl $3,%3\n" \ +- " .align 2,0x90\n" \ +- "0: rep; movsl\n" \ +- " movl %3,%0\n" \ +- "1: rep; movsb\n" \ +- "2:\n" \ +- ".section .fixup,"ax"\n" \ +- "5: addl %3,%0\n" \ +- " jmp 6f\n" \ +- "3: lea 0(%3,%0,4),%0\n" \ +- "6: pushl %0\n" \ +- " pushl %%eax\n" \ +- " xorl %%eax,%%eax\n" \ +- " rep; stosb\n" \ +- " popl %%eax\n" \ +- " popl %0\n" \ +- " jmp 2b\n" \ +- ".previous\n" \ +- ".section __ex_table,"a"\n" \ +- " .align 4\n" \ +- " .long 4b,5b\n" \ +- " .long 0b,3b\n" \ +- " .long 1b,6b\n" \ +- ".previous" \ +- : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ +- : "3"(size), "0"(size), "1"(to), "2"(from) \ +- : "memory"); \ +-} while (0) ++static unsigned long ++__generic_copy_to_user(void __user *to, const void *from, unsigned long size) ++{ ++ int __d0, __d1, __d2; ++ ++ __asm__ __volatile__( ++ " movw %w8,%%es\n" ++ " cmp $7,%0\n" ++ " jbe 1f\n" ++ " movl %1,%0\n" ++ " negl %0\n" ++ " andl $7,%0\n" ++ " subl %0,%3\n" ++ "4: rep; movsb\n" ++ " movl %3,%0\n" ++ " shrl $2,%0\n" ++ " andl $3,%3\n" ++ " .align 2,0x90\n" ++ "0: rep; movsl\n" ++ " movl %3,%0\n" ++ "1: rep; movsb\n" ++ "2:\n" ++ " pushl %%ss\n" ++ " popl %%es\n" ++ ".section .fixup,"ax"\n" ++ "5: addl %3,%0\n" ++ " jmp 2b\n" ++ "3: lea 0(%3,%0,4),%0\n" ++ " jmp 2b\n" ++ ".previous\n" ++ ".section __ex_table,"a"\n" ++ " .align 4\n" ++ " .long 4b,5b\n" ++ " .long 0b,3b\n" ++ " .long 1b,2b\n" ++ ".previous" ++ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) ++ : "3"(size), "0"(size), "1"(to), "2"(from), "r"(__USER_DS) ++ : "memory"); ++ return size; ++} ++ ++static unsigned long ++__generic_copy_from_user(void *to, const void __user *from, unsigned long size) ++{ ++ int __d0, __d1, __d2; ++ ++ __asm__ __volatile__( ++ " movw %w8,%%ds\n" ++ " cmp $7,%0\n" ++ " jbe 1f\n" ++ " movl %1,%0\n" ++ " negl %0\n" ++ " andl $7,%0\n" ++ " subl %0,%3\n" ++ "4: rep; movsb\n" ++ " movl %3,%0\n" ++ " shrl $2,%0\n" ++ " andl $3,%3\n" ++ " .align 2,0x90\n" ++ "0: rep; movsl\n" ++ " movl %3,%0\n" ++ "1: rep; movsb\n" ++ "2:\n" ++ " pushl %%ss\n" ++ " popl %%ds\n" ++ ".section .fixup,"ax"\n" ++ "5: addl %3,%0\n" ++ " jmp 2b\n" ++ "3: lea 0(%3,%0,4),%0\n" ++ " jmp 2b\n" ++ ".previous\n" ++ ".section __ex_table,"a"\n" ++ " .align 4\n" ++ " .long 4b,5b\n" ++ " .long 0b,3b\n" ++ " .long 1b,2b\n" ++ ".previous" ++ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) ++ : "3"(size), "0"(size), "1"(to), "2"(from), "r"(__USER_DS) ++ : 
"memory"); ++ return size; ++} ++ ++static unsigned long ++__copy_user_zeroing(void *to, const void __user *from, unsigned long size) ++{ ++ int __d0, __d1, __d2; ++ ++ __asm__ __volatile__( ++ " movw %w8,%%ds\n" ++ " cmp $7,%0\n" ++ " jbe 1f\n" ++ " movl %1,%0\n" ++ " negl %0\n" ++ " andl $7,%0\n" ++ " subl %0,%3\n" ++ "4: rep; movsb\n" ++ " movl %3,%0\n" ++ " shrl $2,%0\n" ++ " andl $3,%3\n" ++ " .align 2,0x90\n" ++ "0: rep; movsl\n" ++ " movl %3,%0\n" ++ "1: rep; movsb\n" ++ "2:\n" ++ " pushl %%ss\n" ++ " popl %%ds\n" ++ ".section .fixup,"ax"\n" ++ "5: addl %3,%0\n" ++ " jmp 6f\n" ++ "3: lea 0(%3,%0,4),%0\n" ++ "6: pushl %0\n" ++ " pushl %%eax\n" ++ " xorl %%eax,%%eax\n" ++ " rep; stosb\n" ++ " popl %%eax\n" ++ " popl %0\n" ++ " jmp 2b\n" ++ ".previous\n" ++ ".section __ex_table,"a"\n" ++ " .align 4\n" ++ " .long 4b,5b\n" ++ " .long 0b,3b\n" ++ " .long 1b,6b\n" ++ ".previous" ++ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) ++ : "3"(size), "0"(size), "1"(to), "2"(from), "r"(__USER_DS) ++ : "memory"); ++ return size; ++} + + unsigned long __copy_to_user_ll(void __user *to, const void *from, + unsigned long n) +@@ -775,9 +966,9 @@ survive: + } + #endif + if (movsl_is_ok(to, from, n)) +- __copy_user(to, from, n); ++ n = __generic_copy_to_user(to, from, n); + else +- n = __copy_user_intel(to, from, n); ++ n = __generic_copy_to_user_intel(to, from, n); + return n; + } + EXPORT_SYMBOL(__copy_to_user_ll); +@@ -786,7 +977,7 @@ unsigned long __copy_from_user_ll(void * + unsigned long n) + { + if (movsl_is_ok(to, from, n)) +- __copy_user_zeroing(to, from, n); ++ n = __copy_user_zeroing(to, from, n); + else + n = __copy_user_zeroing_intel(to, from, n); + return n; +@@ -797,10 +988,9 @@ unsigned long __copy_from_user_ll_nozero + unsigned long n) + { + if (movsl_is_ok(to, from, n)) +- __copy_user(to, from, n); ++ n = __generic_copy_from_user(to, from, n); + else +- n = __copy_user_intel((void __user *)to, +- (const void *)from, n); ++ n = __generic_copy_from_user_intel(to, from, n); + return n; + } + EXPORT_SYMBOL(__copy_from_user_ll_nozero); +@@ -812,9 +1002,9 @@ unsigned long __copy_from_user_ll_nocach + if (n > 64 && cpu_has_xmm2) + n = __copy_user_zeroing_intel_nocache(to, from, n); + else +- __copy_user_zeroing(to, from, n); ++ n = __copy_user_zeroing(to, from, n); + #else +- __copy_user_zeroing(to, from, n); ++ n = __copy_user_zeroing(to, from, n); + #endif + return n; + } +@@ -827,65 +1017,53 @@ unsigned long __copy_from_user_ll_nocach + if (n > 64 && cpu_has_xmm2) + n = __copy_user_intel_nocache(to, from, n); + else +- __copy_user(to, from, n); ++ n = __generic_copy_from_user(to, from, n); + #else +- __copy_user(to, from, n); ++ n = __generic_copy_from_user(to, from, n); + #endif + return n; + } + EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); + +-/** +- * copy_to_user: - Copy a block of data into user space. +- * @to: Destination address, in user space. +- * @from: Source address, in kernel space. +- * @n: Number of bytes to copy. +- * +- * Context: User context only. This function may sleep. +- * +- * Copy data from kernel space to user space. +- * +- * Returns number of bytes that could not be copied. +- * On success, this will be zero. 
+- */ +-unsigned long +-copy_to_user(void __user *to, const void *from, unsigned long n) ++void copy_from_user_overflow(void) + { +- if (access_ok(VERIFY_WRITE, to, n)) +- n = __copy_to_user(to, from, n); +- return n; ++ WARN(1, "Buffer overflow detected!\n"); + } +-EXPORT_SYMBOL(copy_to_user); ++EXPORT_SYMBOL(copy_from_user_overflow); + +-/** +- * copy_from_user: - Copy a block of data from user space. +- * @to: Destination address, in kernel space. +- * @from: Source address, in user space. +- * @n: Number of bytes to copy. +- * +- * Context: User context only. This function may sleep. +- * +- * Copy data from user space to kernel space. +- * +- * Returns number of bytes that could not be copied. +- * On success, this will be zero. +- * +- * If some data could not be copied, this function will pad the copied +- * data to the requested size using zero bytes. +- */ +-unsigned long +-_copy_from_user(void *to, const void __user *from, unsigned long n) ++void copy_to_user_overflow(void) + { +- if (access_ok(VERIFY_READ, from, n)) +- n = __copy_from_user(to, from, n); +- else +- memset(to, 0, n); +- return n; ++ WARN(1, "Buffer overflow detected!\n"); + } +-EXPORT_SYMBOL(_copy_from_user); ++EXPORT_SYMBOL(copy_to_user_overflow); + +-void copy_from_user_overflow(void) ++#ifdef CONFIG_PAX_MEMORY_UDEREF ++void __set_fs(mm_segment_t x, int cpu) + { +- WARN(1, "Buffer overflow detected!\n"); ++ unsigned long limit = x.seg; ++ struct desc_struct d; ++ ++ current_thread_info()->addr_limit = x; ++ if (unlikely(paravirt_enabled())) ++ return; ++ ++ if (likely(limit)) ++ limit = (limit - 1UL) >> PAGE_SHIFT; ++ pack_descriptor(&d, 0UL, limit, 0xF3, 0xC); ++ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_DEFAULT_USER_DS, &d, DESCTYPE_S); + } +-EXPORT_SYMBOL(copy_from_user_overflow); ++ ++void set_fs(mm_segment_t x) ++{ ++ __set_fs(x, get_cpu()); ++ put_cpu(); ++} ++EXPORT_SYMBOL(copy_from_user); ++#else ++void set_fs(mm_segment_t x) ++{ ++ current_thread_info()->addr_limit = x; ++} ++#endif ++ ++EXPORT_SYMBOL(set_fs); +diff -urNp linux-2.6.33.1/arch/x86/Makefile linux-2.6.33.1/arch/x86/Makefile +--- linux-2.6.33.1/arch/x86/Makefile 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/Makefile 2010-03-20 16:58:39.028522513 -0400 +@@ -192,3 +192,12 @@ define archhelp + echo ' FDARGS="..." arguments for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' + endef ++ ++define OLD_LD ++ ++*** ${VERSION}.${PATCHLEVEL} PaX kernels no longer build correctly with old versions of binutils. ++*** Please upgrade your binutils to 2.18 or newer ++endef ++ ++archprepare: ++ $(if $(LDFLAGS_BUILD_ID),,$(error $(OLD_LD))) +diff -urNp linux-2.6.33.1/arch/x86/mm/extable.c linux-2.6.33.1/arch/x86/mm/extable.c +--- linux-2.6.33.1/arch/x86/mm/extable.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/mm/extable.c 2010-03-20 16:58:39.032549699 -0400 +@@ -1,14 +1,71 @@ + #include <linux/module.h> + #include <linux/spinlock.h> ++#include <linux/sort.h> + #include <asm/uaccess.h> ++#include <asm/pgtable.h> + ++/* ++ * The exception table needs to be sorted so that the binary ++ * search that we use to find entries in it works properly. ++ * This is used both for the kernel exception table and for ++ * the exception tables of modules that get loaded. 
++ */ ++static int cmp_ex(const void *a, const void *b) ++{ ++ const struct exception_table_entry *x = a, *y = b; ++ ++ /* avoid overflow */ ++ if (x->insn > y->insn) ++ return 1; ++ if (x->insn < y->insn) ++ return -1; ++ return 0; ++} ++ ++static void swap_ex(void *a, void *b, int size) ++{ ++ struct exception_table_entry t, *x = a, *y = b; ++ ++ t = *x; ++ ++ pax_open_kernel(); ++ *x = *y; ++ *y = t; ++ pax_close_kernel(); ++} ++ ++void sort_extable(struct exception_table_entry *start, ++ struct exception_table_entry *finish) ++{ ++ sort(start, finish - start, sizeof(struct exception_table_entry), ++ cmp_ex, swap_ex); ++} ++ ++#ifdef CONFIG_MODULES ++/* ++ * If the exception table is sorted, any referring to the module init ++ * will be at the beginning or the end. ++ */ ++void trim_init_extable(struct module *m) ++{ ++ /*trim the beginning*/ ++ while (m->num_exentries && within_module_init(m->extable[0].insn, m)) { ++ m->extable++; ++ m->num_exentries--; ++ } ++ /*trim the end*/ ++ while (m->num_exentries && ++ within_module_init(m->extable[m->num_exentries-1].insn, m)) ++ m->num_exentries--; ++} ++#endif /* CONFIG_MODULES */ + + int fixup_exception(struct pt_regs *regs) + { + const struct exception_table_entry *fixup; + + #ifdef CONFIG_PNPBIOS +- if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { ++ if (unlikely(!v8086_mode(regs) && SEGMENT_IS_PNP_CODE(regs->cs))) { + extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; + extern u32 pnp_bios_is_utter_crap; + pnp_bios_is_utter_crap = 1; +diff -urNp linux-2.6.33.1/arch/x86/mm/fault.c linux-2.6.33.1/arch/x86/mm/fault.c +--- linux-2.6.33.1/arch/x86/mm/fault.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/mm/fault.c 2010-03-20 16:58:39.032549699 -0400 +@@ -11,10 +11,14 @@ + #include <linux/kprobes.h> /* __kprobes, ... */ + #include <linux/mmiotrace.h> /* kmmio_handler, ... */ + #include <linux/perf_event.h> /* perf_sw_event */ ++#include <linux/unistd.h> ++#include <linux/compiler.h> + + #include <asm/traps.h> /* dotraplinkage, ... */ + #include <asm/pgalloc.h> /* pgd_*(), ... */ + #include <asm/kmemcheck.h> /* kmemcheck_*(), ... 
*/ ++#include <asm/vsyscall.h> ++#include <asm/tlbflush.h> + + /* + * Page fault error code bits: +@@ -52,7 +56,7 @@ static inline int __kprobes notify_page_ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ +- if (kprobes_built_in() && !user_mode_vm(regs)) { ++ if (kprobes_built_in() && !user_mode(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; +@@ -173,6 +177,30 @@ force_sig_info_fault(int si_signo, int s + force_sig_info(si_signo, &info, tsk); + } + ++#ifdef CONFIG_PAX_EMUTRAMP ++static int pax_handle_fetch_fault(struct pt_regs *regs); ++#endif ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++static inline pmd_t * pax_get_pmd(struct mm_struct *mm, unsigned long address) ++{ ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ ++ pgd = pgd_offset(mm, address); ++ if (!pgd_present(*pgd)) ++ return NULL; ++ pud = pud_offset(pgd, address); ++ if (!pud_present(*pud)) ++ return NULL; ++ pmd = pmd_offset(pud, address); ++ if (!pmd_present(*pmd)) ++ return NULL; ++ return pmd; ++} ++#endif ++ + DEFINE_SPINLOCK(pgd_lock); + LIST_HEAD(pgd_list); + +@@ -536,7 +564,7 @@ static int is_errata93(struct pt_regs *r + static int is_errata100(struct pt_regs *regs, unsigned long address) + { + #ifdef CONFIG_X86_64 +- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) ++ if ((regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT)) && (address >> 32)) + return 1; + #endif + return 0; +@@ -563,7 +591,7 @@ static int is_f00f_bug(struct pt_regs *r + } + + static const char nx_warning[] = KERN_CRIT +-"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; ++"kernel tried to execute NX-protected page - exploit attempt? (uid: %d, task: %s, pid: %d)\n"; + + static void + show_fault_oops(struct pt_regs *regs, unsigned long error_code, +@@ -572,15 +600,26 @@ show_fault_oops(struct pt_regs *regs, un + if (!oops_may_print()) + return; + +- if (error_code & PF_INSTR) { ++ if ((__supported_pte_mask & _PAGE_NX) && (error_code & PF_INSTR)) { + unsigned int level; + + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) +- printk(nx_warning, current_uid()); ++ printk(nx_warning, current_uid(), current->comm, task_pid_nr(current)); + } + ++#ifdef CONFIG_PAX_KERNEXEC ++ if (init_mm.start_code <= address && address < init_mm.end_code) { ++ if (current->signal->curr_ip) ++ printk(KERN_ERR "PAX: From %pI4: %s:%d, uid/euid: %u/%u, attempted to modify kernel code\n", ++ &current->signal->curr_ip, current->comm, task_pid_nr(current), current_uid(), current_euid()); ++ else ++ printk(KERN_ERR "PAX: %s:%d, uid/euid: %u/%u, attempted to modify kernel code\n", ++ current->comm, task_pid_nr(current), current_uid(), current_euid()); ++ } ++#endif ++ + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); +@@ -705,6 +744,68 @@ __bad_area_nosemaphore(struct pt_regs *r + unsigned long address, int si_code) + { + struct task_struct *tsk = current; ++ struct mm_struct *mm = tsk->mm; ++ ++#ifdef CONFIG_X86_64 ++ if (mm && (error_code & PF_INSTR)) { ++ if (regs->ip == (unsigned long)vgettimeofday) { ++ regs->ip = (unsigned long)VDSO64_SYMBOL(mm->context.vdso, fallback_gettimeofday); ++ return; ++ } else if (regs->ip == (unsigned long)vtime) { ++ regs->ip = (unsigned long)VDSO64_SYMBOL(mm->context.vdso, fallback_time); ++ return; ++ } else if (regs->ip == (unsigned long)vgetcpu) { ++ regs->ip = (unsigned long)VDSO64_SYMBOL(mm->context.vdso, getcpu);
++ return; ++ } ++ } ++#endif ++ ++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) ++ if (mm && (error_code & PF_USER)) { ++ unsigned long ip = regs->ip; ++ ++ if (v8086_mode(regs)) ++ ip = ((regs->cs & 0xffff) << 4) + (regs->ip & 0xffff); ++ ++ /* ++ * It's possible to have interrupts off here: ++ */ ++ local_irq_enable(); ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && ++ (((__supported_pte_mask & _PAGE_NX) && (error_code & PF_INSTR)) || (!(error_code & (PF_PROT | PF_WRITE)) && regs->ip == address))) { ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ switch (pax_handle_fetch_fault(regs)) { ++ case 2: ++ return; ++ } ++#endif ++ ++ pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp); ++ do_group_exit(SIGKILL); ++ } ++#endif ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && !(error_code & (PF_PROT | PF_WRITE)) && (regs->ip + SEGMEXEC_TASK_SIZE == address)) { ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ switch (pax_handle_fetch_fault(regs)) { ++ case 2: ++ return; ++ } ++#endif ++ ++ pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp); ++ do_group_exit(SIGKILL); ++ } ++#endif ++ ++ } ++#endif + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { +@@ -849,6 +950,106 @@ static int spurious_fault_check(unsigned + return 1; + } + ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) ++static int pax_handle_pageexec_fault(struct pt_regs *regs, struct mm_struct *mm, unsigned long address, unsigned long error_code) ++{ ++ pte_t *pte; ++ pmd_t *pmd; ++ spinlock_t *ptl; ++ unsigned char pte_mask; ++ ++ if ((__supported_pte_mask & _PAGE_NX) || (error_code & (PF_PROT|PF_USER)) != (PF_PROT|PF_USER) || v8086_mode(regs) || ++ !(mm->pax_flags & MF_PAX_PAGEEXEC)) ++ return 0; ++ ++ /* PaX: it's our fault, let's handle it if we can */ ++ ++ /* PaX: take a look at read faults before acquiring any locks */ ++ if (unlikely(!(error_code & PF_WRITE) && (regs->ip == address))) { ++ /* instruction fetch attempt from a protected page in user mode */ ++ up_read(&mm->mmap_sem); ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ switch (pax_handle_fetch_fault(regs)) { ++ case 2: ++ return 1; ++ } ++#endif ++ ++ pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp); ++ do_group_exit(SIGKILL); ++ } ++ ++ pmd = pax_get_pmd(mm, address); ++ if (unlikely(!pmd)) ++ return 0; ++ ++ pte = pte_offset_map_lock(mm, pmd, address, &ptl); ++ if (unlikely(!(pte_val(*pte) & _PAGE_PRESENT) || pte_user(*pte))) { ++ pte_unmap_unlock(pte, ptl); ++ return 0; ++ } ++ ++ if (unlikely((error_code & PF_WRITE) && !pte_write(*pte))) { ++ /* write attempt to a protected page in user mode */ ++ pte_unmap_unlock(pte, ptl); ++ return 0; ++ } ++ ++#ifdef CONFIG_SMP ++ if (likely(address > get_limit(regs->cs) && cpu_isset(smp_processor_id(), mm->context.cpu_user_cs_mask))) ++#else ++ if (likely(address > get_limit(regs->cs))) ++#endif ++ { ++ set_pte(pte, pte_mkread(*pte)); ++ __flush_tlb_one(address); ++ pte_unmap_unlock(pte, ptl); ++ up_read(&mm->mmap_sem); ++ return 1; ++ } ++ ++ pte_mask = _PAGE_ACCESSED | _PAGE_USER | ((error_code & PF_WRITE) << (_PAGE_BIT_DIRTY-1)); ++ ++ /* ++ * PaX: fill DTLB with user rights and retry ++ */ ++ __asm__ __volatile__ ( ++#ifdef CONFIG_PAX_MEMORY_UDEREF ++ "movw %w4,%%es\n" ++#endif ++ "orb %2,(%1)\n" ++#if defined(CONFIG_M586) || defined(CONFIG_M586TSC) ++/* ++ * PaX: let this uncommented 'invlpg' remind us on the behaviour of Intel's ++ * (and AMD's) TLBs. 
namely, they do not cache PTEs that would raise *any* ++ * page fault when examined during a TLB load attempt. this is true not only ++ * for PTEs holding a non-present entry but also present entries that will ++ * raise a page fault (such as those set up by PaX, or the copy-on-write ++ * mechanism). in effect it means that we do *not* need to flush the TLBs ++ * for our target pages since their PTEs are simply not in the TLBs at all. ++ ++ * the best thing in omitting it is that we gain around 15-20% speed in the ++ * fast path of the page fault handler and can get rid of tracing since we ++ * can no longer flush unintended entries. ++ */ ++ "invlpg (%0)\n" ++#endif ++ "testb $0,%%es:(%0)\n" ++ "xorb %3,(%1)\n" ++#ifdef CONFIG_PAX_MEMORY_UDEREF ++ "pushl %%ss\n" ++ "popl %%es\n" ++#endif ++ : ++ : "r" (address), "r" (pte), "q" (pte_mask), "i" (_PAGE_USER), "r" (__USER_DS) ++ : "memory", "cc"); ++ pte_unmap_unlock(pte, ptl); ++ up_read(&mm->mmap_sem); ++ return 1; ++} ++#endif ++ + /* + * Handle a spurious fault caused by a stale TLB entry. + * +@@ -915,6 +1116,9 @@ int show_unhandled_signals = 1; + static inline int + access_error(unsigned long error_code, int write, struct vm_area_struct *vma) + { ++ if ((__supported_pte_mask & _PAGE_NX) && (error_code & PF_INSTR) && !(vma->vm_flags & VM_EXEC)) ++ return 1; ++ + if (write) { + /* write, present and write, not present: */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) +@@ -948,17 +1152,16 @@ do_page_fault(struct pt_regs *regs, unsi + { + struct vm_area_struct *vma; + struct task_struct *tsk; +- unsigned long address; + struct mm_struct *mm; + int write; + int fault; + ++ /* Get the faulting address: */ ++ const unsigned long address = read_cr2(); ++ + tsk = current; + mm = tsk->mm; + +- /* Get the faulting address: */ +- address = read_cr2(); +- + /* + * Detect and handle instructions that would cause a page fault for + * both a tracked kernel page and a userspace page. +@@ -1018,7 +1221,7 @@ do_page_fault(struct pt_regs *regs, unsi + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet: + */ +- if (user_mode_vm(regs)) { ++ if (user_mode(regs)) { + local_irq_enable(); + error_code |= PF_USER; + } else { +@@ -1072,6 +1275,11 @@ do_page_fault(struct pt_regs *regs, unsi + might_sleep(); + } + ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_PAGEEXEC) ++ if (pax_handle_pageexec_fault(regs, mm, address, error_code)) ++ return; ++#endif ++ + vma = find_vma(mm, address); + if (unlikely(!vma)) { + bad_area(regs, error_code, address); +@@ -1083,18 +1291,24 @@ do_page_fault(struct pt_regs *regs, unsi + bad_area(regs, error_code, address); + return; + } +- if (error_code & PF_USER) { +- /* +- * Accessing the stack below %sp is always a bug. +- * The large cushion allows instructions like enter +- * and pusha to work. ("enter $65535, $31" pushes +- * 32 pointers and then decrements %sp by 65535.) +- */ +- if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { +- bad_area(regs, error_code, address); +- return; +- } ++ /* ++ * Accessing the stack below %sp is always a bug. ++ * The large cushion allows instructions like enter ++ * and pusha to work. ("enter $65535, $31" pushes ++ * 32 pointers and then decrements %sp by 65535.) 
++ */
++ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < task_pt_regs(tsk)->sp)) {
++ bad_area(regs, error_code, address);
++ return;
++ }
++
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (unlikely((mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_end - SEGMEXEC_TASK_SIZE - 1 < address - SEGMEXEC_TASK_SIZE - 1)) {
++ bad_area(regs, error_code, address);
++ return;
+ }
++#endif
++
+ if (unlikely(expand_stack(vma, address))) {
+ bad_area(regs, error_code, address);
+ return;
+@@ -1138,3 +1352,199 @@ good_area:
+
+ up_read(&mm->mmap_sem);
+ }
++
++#ifdef CONFIG_PAX_EMUTRAMP
++static int pax_handle_fetch_fault_32(struct pt_regs *regs)
++{
++ int err;
++
++ do { /* PaX: gcc trampoline emulation #1 */
++ unsigned char mov1, mov2;
++ unsigned short jmp;
++ unsigned int addr1, addr2;
++
++#ifdef CONFIG_X86_64
++ if ((regs->ip + 11) >> 32)
++ break;
++#endif
++
++ err = get_user(mov1, (unsigned char __user *)regs->ip);
++ err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
++ err |= get_user(mov2, (unsigned char __user *)(regs->ip + 5));
++ err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
++ err |= get_user(jmp, (unsigned short __user *)(regs->ip + 10));
++
++ if (err)
++ break;
++
++ if (mov1 == 0xB9 && mov2 == 0xB8 && jmp == 0xE0FF) {
++ regs->cx = addr1;
++ regs->ax = addr2;
++ regs->ip = addr2;
++ return 2;
++ }
++ } while (0);
++
++ do { /* PaX: gcc trampoline emulation #2 */
++ unsigned char mov, jmp;
++ unsigned int addr1, addr2;
++
++#ifdef CONFIG_X86_64
++ if ((regs->ip + 9) >> 32)
++ break;
++#endif
++
++ err = get_user(mov, (unsigned char __user *)regs->ip);
++ err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
++ err |= get_user(jmp, (unsigned char __user *)(regs->ip + 5));
++ err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
++
++ if (err)
++ break;
++
++ if (mov == 0xB9 && jmp == 0xE9) {
++ regs->cx = addr1;
++ regs->ip = (unsigned int)(regs->ip + addr2 + 10);
++ return 2;
++ }
++ } while (0);
++
++ return 1; /* PaX in action */
++}
++
++#ifdef CONFIG_X86_64
++static int pax_handle_fetch_fault_64(struct pt_regs *regs)
++{
++ int err;
++
++ do { /* PaX: gcc trampoline emulation #1 */
++ unsigned short mov1, mov2, jmp1;
++ unsigned char jmp2;
++ unsigned int addr1;
++ unsigned long addr2;
++
++ err = get_user(mov1, (unsigned short __user *)regs->ip);
++ err |= get_user(addr1, (unsigned int __user *)(regs->ip + 2));
++ err |= get_user(mov2, (unsigned short __user *)(regs->ip + 6));
++ err |= get_user(addr2, (unsigned long __user *)(regs->ip + 8));
++ err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 16));
++ err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 18));
++
++ if (err)
++ break;
++
++ if (mov1 == 0xBB41 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) {
++ regs->r11 = addr1;
++ regs->r10 = addr2;
++ regs->ip = addr1;
++ return 2;
++ }
++ } while (0);
++
++ do { /* PaX: gcc trampoline emulation #2 */
++ unsigned short mov1, mov2, jmp1;
++ unsigned char jmp2;
++ unsigned long addr1, addr2;
++
++ err = get_user(mov1, (unsigned short __user *)regs->ip);
++ err |= get_user(addr1, (unsigned long __user *)(regs->ip + 2));
++ err |= get_user(mov2, (unsigned short __user *)(regs->ip + 10));
++ err |= get_user(addr2, (unsigned long __user *)(regs->ip + 12));
++ err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 20));
++ err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 22));
++
++ if (err)
++ break;
++
++ if (mov1 == 0xBB49 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) {
++ regs->r11 = addr1;
++ regs->r10 = addr2;
++ regs->ip = addr1;
++ return 2;
++ }
++ } while (0);
++
++ return 1; /* PaX in action */
++}
++#endif
++
++/*
++ * PaX: decide what to do with offenders (regs->ip = fault address)
++ *
++ * returns 1 when task should be killed
++ * 2 when gcc trampoline was detected
++ */
++static int pax_handle_fetch_fault(struct pt_regs *regs)
++{
++ if (v8086_mode(regs))
++ return 1;
++
++ if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP))
++ return 1;
++
++#ifdef CONFIG_X86_32
++ return pax_handle_fetch_fault_32(regs);
++#else
++ if (regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT))
++ return pax_handle_fetch_fault_32(regs);
++ else
++ return pax_handle_fetch_fault_64(regs);
++#endif
++}
++#endif
++
++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)
++void pax_report_insns(void *pc, void *sp)
++{
++ long i;
++
++ printk(KERN_ERR "PAX: bytes at PC: ");
++ for (i = 0; i < 20; i++) {
++ unsigned char c;
++ if (get_user(c, (__force unsigned char __user *)pc+i))
++ printk(KERN_CONT "?? ");
++ else
++ printk(KERN_CONT "%02x ", c);
++ }
++ printk("\n");
++
++ printk(KERN_ERR "PAX: bytes at SP-%lu: ", (unsigned long)sizeof(long));
++ for (i = -1; i < 80 / (long)sizeof(long); i++) {
++ unsigned long c;
++ if (get_user(c, (__force unsigned long __user *)sp+i))
++#ifdef CONFIG_X86_32
++ printk(KERN_CONT "???????? ");
++#else
++ printk(KERN_CONT "???????????????? ");
++#endif
++ else
++ printk(KERN_CONT "%0*lx ", 2 * (int)sizeof(long), c);
++ }
++ printk("\n");
++}
++#endif
++
++/**
++ * probe_kernel_write(): safely attempt to write to a location
++ * @dst: address to write to
++ * @src: pointer to the data that shall be written
++ * @size: size of the data chunk
++ *
++ * Safely write to address @dst from the buffer at @src. If a kernel fault
++ * happens, handle that and return -EFAULT.
++ */
++long notrace probe_kernel_write(void *dst, const void *src, size_t size)
++{
++ long ret;
++ mm_segment_t old_fs = get_fs();
++
++ set_fs(KERNEL_DS);
++ pagefault_disable();
++ pax_open_kernel();
++ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
++ pax_close_kernel();
++ pagefault_enable();
++ set_fs(old_fs);
++
++ return ret ? -EFAULT : 0;
++}
+diff -urNp linux-2.6.33.1/arch/x86/mm/gup.c linux-2.6.33.1/arch/x86/mm/gup.c
+--- linux-2.6.33.1/arch/x86/mm/gup.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/gup.c 2010-03-20 16:58:39.032549699 -0400
+@@ -237,7 +237,7 @@ int __get_user_pages_fast(unsigned long
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
++ if (unlikely(!__access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ (void __user *)start, len)))
+ return 0;
+
+diff -urNp linux-2.6.33.1/arch/x86/mm/highmem_32.c linux-2.6.33.1/arch/x86/mm/highmem_32.c
+--- linux-2.6.33.1/arch/x86/mm/highmem_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/highmem_32.c 2010-03-20 16:58:39.032549699 -0400
+@@ -43,7 +43,10 @@ void *kmap_atomic_prot(struct page *page
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ BUG_ON(!pte_none(*(kmap_pte-idx)));
++
++ pax_open_kernel();
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
++ pax_close_kernel();
+
+ return (void *)vaddr;
+ }
+diff -urNp linux-2.6.33.1/arch/x86/mm/hugetlbpage.c linux-2.6.33.1/arch/x86/mm/hugetlbpage.c
+--- linux-2.6.33.1/arch/x86/mm/hugetlbpage.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/hugetlbpage.c 2010-03-20 16:58:39.032549699 -0400
+@@ -267,13 +267,18 @@ static unsigned long hugetlb_get_unmappe
+ struct hstate *h = hstate_file(file);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+- unsigned long start_addr;
++ unsigned long start_addr, pax_task_size = TASK_SIZE;
++
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (mm->pax_flags & MF_PAX_SEGMEXEC)
++ pax_task_size = SEGMEXEC_TASK_SIZE;
++#endif
+
+ if (len > mm->cached_hole_size) {
+- start_addr = mm->free_area_cache;
++ start_addr = mm->free_area_cache;
+ } else {
+- start_addr = TASK_UNMAPPED_BASE;
+- mm->cached_hole_size = 0;
++ start_addr = mm->mmap_base;
++ mm->cached_hole_size = 0;
+ }
+
+ full_search:
+@@ -281,13 +286,13 @@ full_search:
+
+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ /* At this point: (!vma || addr < vma->vm_end). */
+- if (TASK_SIZE - len < addr) {
++ if (pax_task_size - len < addr) {
+ /*
+ * Start a new search - just in case we missed
+ * some holes.
+ */
+- if (start_addr != TASK_UNMAPPED_BASE) {
+- start_addr = TASK_UNMAPPED_BASE;
++ if (start_addr != mm->mmap_base) {
++ start_addr = mm->mmap_base;
+ mm->cached_hole_size = 0;
+ goto full_search;
+ }
+@@ -310,9 +315,8 @@ static unsigned long hugetlb_get_unmappe
+ struct hstate *h = hstate_file(file);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev_vma;
+- unsigned long base = mm->mmap_base, addr = addr0;
++ unsigned long base = mm->mmap_base, addr;
+ unsigned long largest_hole = mm->cached_hole_size;
+- int first_time = 1;
+
+ /* don't allow allocations above current base */
+ if (mm->free_area_cache > base)
+@@ -322,7 +326,7 @@ static unsigned long hugetlb_get_unmappe
+ largest_hole = 0;
+ mm->free_area_cache = base;
+ }
+-try_again:
++
+ /* make sure it can fit in the remaining address space */
+ if (mm->free_area_cache < len)
+ goto fail;
+@@ -364,22 +368,26 @@ try_again:
+
+ fail:
+ /*
+- * if hint left us with no space for the requested
+- * mapping then try again:
+- */
+- if (first_time) {
+- mm->free_area_cache = base;
+- largest_hole = 0;
+- first_time = 0;
+- goto try_again;
+- }
+- /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+- mm->free_area_cache = TASK_UNMAPPED_BASE;
++
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (mm->pax_flags & MF_PAX_SEGMEXEC)
++ mm->mmap_base = SEGMEXEC_TASK_UNMAPPED_BASE;
++ else
++#endif
++
++ mm->mmap_base = TASK_UNMAPPED_BASE;
++
++#ifdef CONFIG_PAX_RANDMMAP
++ if (mm->pax_flags & MF_PAX_RANDMMAP)
++ mm->mmap_base += mm->delta_mmap;
++#endif
++
++ mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = ~0UL;
+ addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
+ len, pgoff, flags);
+@@ -387,6 +395,7 @@ fail:
+ /*
+ * Restore the topdown base:
+ */
++ mm->mmap_base = base;
+ mm->free_area_cache = base;
+ mm->cached_hole_size = ~0UL;
+
+@@ -400,10 +409,17 @@ hugetlb_get_unmapped_area(struct file *f
+ struct hstate *h = hstate_file(file);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
++ unsigned long pax_task_size = TASK_SIZE;
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+- if (len > TASK_SIZE)
++
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (mm->pax_flags & MF_PAX_SEGMEXEC)
++ pax_task_size = SEGMEXEC_TASK_SIZE;
++#endif
++
++ if (len > pax_task_size)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED) {
+@@ -415,7 +431,7 @@ hugetlb_get_unmapped_area(struct file *f
+ if (addr) {
+ addr = ALIGN(addr, huge_page_size(h));
+ vma = find_vma(mm, addr);
+- if (TASK_SIZE - len >= addr &&
++ if (pax_task_size - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+diff -urNp linux-2.6.33.1/arch/x86/mm/init_32.c linux-2.6.33.1/arch/x86/mm/init_32.c
+--- linux-2.6.33.1/arch/x86/mm/init_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/init_32.c 2010-03-20 16:58:39.032549699 -0400
+@@ -72,36 +72,6 @@ static __init void *alloc_low_page(void)
+ }
+
+ /*
+- * Creates a middle page table and puts a pointer to it in the
+- * given global directory entry. This only returns the gd entry
+- * in non-PAE compilation mode, since the middle layer is folded.
+- */
+-static pmd_t * __init one_md_table_init(pgd_t *pgd)
+-{
+- pud_t *pud;
+- pmd_t *pmd_table;
+-
+-#ifdef CONFIG_X86_PAE
+- if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+- if (after_bootmem)
+- pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
+- else
+- pmd_table = (pmd_t *)alloc_low_page();
+- paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+- set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+- pud = pud_offset(pgd, 0);
+- BUG_ON(pmd_table != pmd_offset(pud, 0));
+-
+- return pmd_table;
+- }
+-#endif
+- pud = pud_offset(pgd, 0);
+- pmd_table = pmd_offset(pud, 0);
+-
+- return pmd_table;
+-}
+-
+-/*
+ * Create a page table and place a pointer to it in a middle page
+ * directory entry:
+ */
+@@ -121,13 +91,28 @@ static pte_t * __init one_page_table_ini
+ page_table = (pte_t *)alloc_low_page();
+
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)
++ set_pmd(pmd, __pmd(__pa(page_table) | _KERNPG_TABLE));
++#else
+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
++#endif
+ BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+ }
+
+ return pte_offset_kernel(pmd, 0);
+ }
+
++static pmd_t * __init one_md_table_init(pgd_t *pgd)
++{
++ pud_t *pud;
++ pmd_t *pmd_table;
++
++ pud = pud_offset(pgd, 0);
++ pmd_table = pmd_offset(pud, 0);
++
++ return pmd_table;
++}
++
+ pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+ {
+ int pgd_idx = pgd_index(vaddr);
+@@ -201,6 +186,7 @@ page_table_range_init(unsigned long star
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+ pgd_t *pgd;
++ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte = NULL;
+
+@@ -210,8 +196,13 @@ page_table_range_init(unsigned long star
+ pgd = pgd_base + pgd_idx;
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+- pmd = one_md_table_init(pgd);
+- pmd = pmd + pmd_index(vaddr);
++ pud = pud_offset(pgd, vaddr);
++ pmd = pmd_offset(pud, vaddr);
++
++#ifdef CONFIG_X86_PAE
++ paravirt_alloc_pmd(&init_mm, __pa(pmd) >> PAGE_SHIFT);
++#endif
++
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd++, pmd_idx++) {
+ pte = page_table_kmap_check(one_page_table_init(pmd),
+@@ -223,11 +214,20 @@ page_table_range_init(unsigned long star
+ }
+ }
+
+-static inline int is_kernel_text(unsigned long addr)
++static inline int is_kernel_text(unsigned long start, unsigned long end)
+ {
+- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+- return 1;
+- return 0;
++ if ((start > ktla_ktva((unsigned long)_etext) ||
++ end <= ktla_ktva((unsigned long)_stext)) &&
++ (start > ktla_ktva((unsigned long)_einittext) ||
++ end <= ktla_ktva((unsigned long)_sinittext)) &&
++
++#ifdef CONFIG_ACPI_SLEEP
++ (start > (unsigned long)__va(acpi_wakeup_address) + 0x4000 || end <= (unsigned long)__va(acpi_wakeup_address)) &&
++#endif
++
++ (start > (unsigned long)__va(0xfffff) || end <= (unsigned long)__va(0xc0000)))
++ return 0;
++ return 1;
+ }
+
+ /*
+@@ -243,9 +243,10 @@ kernel_physical_mapping_init(unsigned lo
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
+- int pgd_idx, pmd_idx, pte_ofs;
++ unsigned int pgd_idx, pmd_idx, pte_ofs;
+ unsigned long pfn;
+ pgd_t *pgd;
++ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned pages_2m, pages_4k;
+@@ -278,8 +279,13 @@ repeat:
+ pfn = start_pfn;
+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pgd = pgd_base + pgd_idx;
+- for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+- pmd = one_md_table_init(pgd);
++ for (; pgd_idx < PTRS_PER_PGD && pfn < max_low_pfn; pgd++, pgd_idx++) {
++ pud = pud_offset(pgd, 0);
++ pmd = pmd_offset(pud, 0);
++
++#ifdef CONFIG_X86_PAE
++ paravirt_alloc_pmd(&init_mm, __pa(pmd) >> PAGE_SHIFT);
++#endif
+
+ if (pfn >= end_pfn)
+ continue;
+@@ -291,14 +297,13 @@ repeat:
+ #endif
+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
+ pmd++, pmd_idx++) {
+- unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
++ unsigned long address = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+ /*
+ * Map with big pages if possible, otherwise
+ * create normal page tables:
+ */
+ if (use_pse) {
+- unsigned int addr2;
+ pgprot_t prot = PAGE_KERNEL_LARGE;
+ /*
+ * first pass will use the same initial
+@@ -308,11 +313,7 @@ repeat:
+ __pgprot(PTE_IDENT_ATTR |
+ _PAGE_PSE);
+
+- addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+- PAGE_OFFSET + PAGE_SIZE-1;
+-
+- if (is_kernel_text(addr) ||
+- is_kernel_text(addr2))
++ if (is_kernel_text(address, address + PMD_SIZE))
+ prot = PAGE_KERNEL_LARGE_EXEC;
+
+ pages_2m++;
+@@ -329,7 +330,7 @@ repeat:
+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
+- pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
++ pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
+ pgprot_t prot = PAGE_KERNEL;
+ /*
+ * first pass will use the same initial
+@@ -337,7 +338,7 @@ repeat:
+ */
+ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
+
+- if (is_kernel_text(addr))
++ if (is_kernel_text(address, address + PAGE_SIZE))
+ prot = PAGE_KERNEL_EXEC;
+
+ pages_4k++;
+@@ -489,7 +490,7 @@ void __init native_pagetable_setup_start
+
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+- if (!pmd_present(*pmd))
++ if (!pmd_present(*pmd) || pmd_huge(*pmd))
+ break;
+
+ pte = pte_offset_kernel(pmd, va);
+@@ -541,9 +542,7 @@ void __init early_ioremap_page_table_ran
+
+ static void __init pagetable_init(void)
+ {
+- pgd_t *pgd_base = swapper_pg_dir;
+-
+- permanent_kmaps_init(pgd_base);
++ permanent_kmaps_init(swapper_pg_dir);
+ }
+
+ #ifdef CONFIG_ACPI_SLEEP
+@@ -551,12 +550,12 @@ static void __init pagetable_init(void)
+ * ACPI suspend needs this for resume, because things like the intel-agp
+ * driver might have split up a kernel 4MB mapping.
+ */
+-char swsusp_pg_dir[PAGE_SIZE]
++pgd_t swsusp_pg_dir[PTRS_PER_PGD]
+ __attribute__ ((aligned(PAGE_SIZE)));
+
+ static inline void save_pg_dir(void)
+ {
+- memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
++ clone_pgd_range(swsusp_pg_dir, swapper_pg_dir, PTRS_PER_PGD);
+ }
+ #else /* !CONFIG_ACPI_SLEEP */
+ static inline void save_pg_dir(void)
+@@ -588,7 +587,7 @@ void zap_low_mappings(bool early)
+ flush_tlb_all();
+ }
+
+-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
++pteval_t __supported_pte_mask __read_only = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
+ EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+ /* user-defined highmem size */
+@@ -777,7 +776,7 @@ void __init setup_bootmem_allocator(void
+ * Initialize the boot-time allocator (with low memory only):
+ */
+ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+- bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
++ bootmap = find_e820_area(0x100000, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+@@ -881,7 +880,7 @@ void __init mem_init(void)
+ set_highmem_pages_init();
+
+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
+- datasize = (unsigned long) &_edata - (unsigned long) &_etext;
++ datasize = (unsigned long) &_edata - (unsigned long) &_sdata;
+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
+@@ -922,10 +921,10 @@ void __init mem_init(void)
+ ((unsigned long)&__init_end -
+ (unsigned long)&__init_begin) >> 10,
+
+- (unsigned long)&_etext, (unsigned long)&_edata,
+- ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
++ (unsigned long)&_sdata, (unsigned long)&_edata,
++ ((unsigned long)&_edata - (unsigned long)&_sdata) >> 10,
+
+- (unsigned long)&_text, (unsigned long)&_etext,
++ ktla_ktva((unsigned long)&_text), ktla_ktva((unsigned long)&_etext),
+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+ /*
+@@ -1006,6 +1005,7 @@ void set_kernel_text_rw(void)
+ if (!kernel_set_to_readonly)
+ return;
+
++ start = ktla_ktva(start);
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, start+size);
+
+@@ -1020,6 +1020,7 @@ void set_kernel_text_ro(void)
+ if (!kernel_set_to_readonly)
+ return;
+
++ start = ktla_ktva(start);
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, start+size);
+
+@@ -1031,6 +1032,7 @@ void mark_rodata_ro(void)
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
++ start = ktla_ktva(start);
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
+diff -urNp linux-2.6.33.1/arch/x86/mm/init_64.c linux-2.6.33.1/arch/x86/mm/init_64.c
+--- linux-2.6.33.1/arch/x86/mm/init_64.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/init_64.c 2010-03-20 16:58:39.032549699 -0400
+@@ -73,7 +73,7 @@ early_param("gbpages", parse_direct_gbpa
+ * around without checking the pgd every time.
+ */
+
+-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
++pteval_t __supported_pte_mask __read_only = ~(_PAGE_NX | _PAGE_IOMAP);
+ EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+ int force_personality32;
+@@ -164,7 +164,9 @@ void set_pte_vaddr_pud(pud_t *pud_page,
+ pmd = fill_pmd(pud, vaddr);
+ pte = fill_pte(pmd, vaddr);
+
++ pax_open_kernel();
+ set_pte(pte, new_pte);
++ pax_close_kernel();
+
+ /*
+ * It's enough to flush this one mapping.
+@@ -223,14 +225,12 @@ static void __init __init_extra_mapping(
+ pgd = pgd_offset_k((unsigned long)__va(phys));
+ if (pgd_none(*pgd)) {
+ pud = (pud_t *) spp_getpage();
+- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+- _PAGE_USER));
++ set_pgd(pgd, __pgd(__pa(pud) | _PAGE_TABLE));
+ }
+ pud = pud_offset(pgd, (unsigned long)__va(phys));
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+- _PAGE_USER));
++ set_pud(pud, __pud(__pa(pmd) | _PAGE_TABLE));
+ }
+ pmd = pmd_offset(pud, phys);
+ BUG_ON(!pmd_none(*pmd));
+@@ -882,8 +882,8 @@ int kern_addr_valid(unsigned long addr)
+ static struct vm_area_struct gate_vma = {
+ .vm_start = VSYSCALL_START,
+ .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+- .vm_page_prot = PAGE_READONLY_EXEC,
+- .vm_flags = VM_READ | VM_EXEC
++ .vm_page_prot = PAGE_READONLY,
++ .vm_flags = VM_READ
+ };
+
+ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+@@ -917,7 +917,7 @@ int in_gate_area_no_task(unsigned long a
+
+ const char *arch_vma_name(struct vm_area_struct *vma)
+ {
+- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
++ if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso)
+ return "[vdso]";
+ if (vma == &gate_vma)
+ return "[vsyscall]";
+diff -urNp linux-2.6.33.1/arch/x86/mm/init.c linux-2.6.33.1/arch/x86/mm/init.c
+--- linux-2.6.33.1/arch/x86/mm/init.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/init.c 2010-03-20 16:58:39.036535097 -0400
+@@ -69,11 +69,7 @@ static void __init find_early_table_spac
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+-#ifdef CONFIG_X86_32
+- start = 0x7000;
+-#else
+- start = 0x8000;
+-#endif
++ start = 0x100000;
+ e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
+ if (e820_table_start == -1UL)
+@@ -327,7 +323,13 @@ unsigned long __init_refok init_memory_m
+ */
+ int devmem_is_allowed(unsigned long pagenr)
+ {
+- if (pagenr <= 256)
++ if (!pagenr)
++ return 1;
++#ifdef CONFIG_VM86
++ if (pagenr < (ISA_START_ADDRESS >> PAGE_SHIFT))
++ return 1;
++#endif
++ if ((ISA_START_ADDRESS >> PAGE_SHIFT) <= pagenr && pagenr < (ISA_END_ADDRESS >> PAGE_SHIFT))
+ return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
+@@ -375,6 +377,87 @@ void free_init_pages(char *what, unsigne
+
+ void free_initmem(void)
+ {
++
++#ifdef CONFIG_PAX_KERNEXEC
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++
++#ifdef CONFIG_X86_32
++ /* PaX: limit KERNEL_CS to actual size */
++ unsigned long addr, limit;
++ struct desc_struct d;
++ int cpu;
++
++ limit = paravirt_enabled() ? ktva_ktla(0xffffffff) : (unsigned long)&_etext;
++ limit = (limit - 1UL) >> PAGE_SHIFT;
++
++ memset(__LOAD_PHYSICAL_ADDR + PAGE_OFFSET, POISON_FREE_INITMEM, PAGE_SIZE);
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ pack_descriptor(&d, get_desc_base(&get_cpu_gdt_table(cpu)[GDT_ENTRY_KERNEL_CS]), limit, 0x9B, 0xC);
++ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_KERNEL_CS, &d, DESCTYPE_S);
++ }
++
++ /* PaX: make KERNEL_CS read-only */
++ addr = PFN_ALIGN(ktla_ktva((unsigned long)&_text));
++ if (!paravirt_enabled())
++ set_memory_ro(addr, (PFN_ALIGN(_sdata) - addr) >> PAGE_SHIFT);
++/*
++ for (addr = ktla_ktva((unsigned long)&_text); addr < (unsigned long)&_sdata; addr += PMD_SIZE) {
++ pgd = pgd_offset_k(addr);
++ pud = pud_offset(pgd, addr);
++ pmd = pmd_offset(pud, addr);
++ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_RW));
++ }
++*/
++#ifdef CONFIG_X86_PAE
++ set_memory_nx(PFN_ALIGN(__init_begin), (PFN_ALIGN(__init_end) - PFN_ALIGN(__init_begin)) >> PAGE_SHIFT);
++ for (addr = (unsigned long)&__init_begin; addr < (unsigned long)&__init_end; addr += PMD_SIZE) {
++ pgd = pgd_offset_k(addr);
++ pud = pud_offset(pgd, addr);
++ pmd = pmd_offset(pud, addr);
++ set_pmd(pmd, __pmd(pmd_val(*pmd) | (_PAGE_NX & __supported_pte_mask)));
++ }
++#endif
++
++#ifdef CONFIG_MODULES
++ set_memory_4k((unsigned long)MODULES_EXEC_VADDR, (MODULES_EXEC_END - MODULES_EXEC_VADDR) >> PAGE_SHIFT);
++#endif
++
++#else
++ unsigned long addr, end;
++
++ /* PaX: make kernel code/rodata read-only, rest non-executable */
++ for (addr = __START_KERNEL_map; addr < __START_KERNEL_map + KERNEL_IMAGE_SIZE; addr += PMD_SIZE) {
++ pgd = pgd_offset_k(addr);
++ pud = pud_offset(pgd, addr);
++ pmd = pmd_offset(pud, addr);
++ if (!pmd_present(*pmd))
++ continue;
++ if ((unsigned long)_text <= addr && addr < (unsigned long)_sdata)
++ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_RW));
++ else
++ set_pmd(pmd, __pmd(pmd_val(*pmd) | (_PAGE_NX & __supported_pte_mask)));
++ }
++
++ addr = (unsigned long)__va(__pa(__START_KERNEL_map));
++ end = addr + KERNEL_IMAGE_SIZE;
++ for (; addr < end; addr += PMD_SIZE) {
++ pgd = pgd_offset_k(addr);
++ pud = pud_offset(pgd, addr);
++ pmd = pmd_offset(pud, addr);
++ if (!pmd_present(*pmd))
++ continue;
++ if ((unsigned long)__va(__pa(_text)) <= addr && addr < (unsigned long)__va(__pa(_sdata)))
++ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_RW));
++ else
++ set_pmd(pmd, __pmd(pmd_val(*pmd) | (_PAGE_NX & __supported_pte_mask)));
++ }
++#endif
++
++ flush_tlb_all();
++#endif
++
+ free_init_pages("unused kernel memory",
+ (unsigned long)(&__init_begin),
+ (unsigned long)(&__init_end));
+diff -urNp linux-2.6.33.1/arch/x86/mm/iomap_32.c linux-2.6.33.1/arch/x86/mm/iomap_32.c
+--- linux-2.6.33.1/arch/x86/mm/iomap_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/iomap_32.c 2010-03-20 16:58:39.036535097 -0400
+@@ -65,7 +65,11 @@ void *kmap_atomic_prot_pfn(unsigned long
+ debug_kmap_atomic(type);
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
++
++ pax_open_kernel();
+ set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
++ pax_close_kernel();
++
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+diff -urNp linux-2.6.33.1/arch/x86/mm/ioremap.c linux-2.6.33.1/arch/x86/mm/ioremap.c
+--- linux-2.6.33.1/arch/x86/mm/ioremap.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/ioremap.c 2010-03-20 16:58:39.036535097 -0400
+@@ -41,8 +41,8 @@ int page_is_ram(unsigned long pagenr)
+ * Second special case: Some BIOSen report the PC BIOS
+ * area (640->1Mb) as ram even though it is not.
+ */
+- if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
+- pagenr < (BIOS_END >> PAGE_SHIFT))
++ if (pagenr >= (ISA_START_ADDRESS >> PAGE_SHIFT) &&
++ pagenr < (ISA_END_ADDRESS >> PAGE_SHIFT))
+ return 0;
+
+ for (i = 0; i < e820.nr_map; i++) {
+@@ -137,13 +137,10 @@ static void __iomem *__ioremap_caller(re
+ /*
+ * Don't allow anybody to remap normal RAM that we're using..
+ */
+- for (pfn = phys_addr >> PAGE_SHIFT;
+- (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
+- pfn++) {
+-
++ for (pfn = phys_addr >> PAGE_SHIFT; ((resource_size_t)pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); pfn++) {
+ int is_ram = page_is_ram(pfn);
+
+- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
++ if (is_ram && pfn_valid(pfn) && (pfn >= 0x100 || !PageReserved(pfn_to_page(pfn))))
+ return NULL;
+ WARN_ON_ONCE(is_ram);
+ }
+@@ -383,7 +380,7 @@ static int __init early_ioremap_debug_se
+ early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+ static __initdata int after_paging_init;
+-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
++static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __read_only __aligned(PAGE_SIZE);
+
+ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+ {
+@@ -415,8 +412,7 @@ void __init early_ioremap_init(void)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+- memset(bm_pte, 0, sizeof(bm_pte));
+- pmd_populate_kernel(&init_mm, pmd, bm_pte);
++ pmd_populate_user(&init_mm, pmd, bm_pte);
+
+ /*
+ * The boot-ioremap range spans multiple pmds, for which
+diff -urNp linux-2.6.33.1/arch/x86/mm/kmemcheck/kmemcheck.c linux-2.6.33.1/arch/x86/mm/kmemcheck/kmemcheck.c
+--- linux-2.6.33.1/arch/x86/mm/kmemcheck/kmemcheck.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/kmemcheck/kmemcheck.c 2010-03-20 16:58:39.036535097 -0400
+@@ -622,9 +622,9 @@ bool kmemcheck_fault(struct pt_regs *reg
+ * memory (e.g. tracked pages)? For now, we need this to avoid
+ * invoking kmemcheck for PnP BIOS calls.
+ */
+- if (regs->flags & X86_VM_MASK)
++ if (v8086_mode(regs))
+ return false;
+- if (regs->cs != __KERNEL_CS)
++ if (regs->cs != __KERNEL_CS && regs->cs != __KERNEXEC_KERNEL_CS)
+ return false;
+
+ pte = kmemcheck_pte_lookup(address);
+diff -urNp linux-2.6.33.1/arch/x86/mm/mmap.c linux-2.6.33.1/arch/x86/mm/mmap.c
+--- linux-2.6.33.1/arch/x86/mm/mmap.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/mmap.c 2010-03-20 16:58:39.036535097 -0400
+@@ -49,7 +49,7 @@ static unsigned int stack_maxrandom_size
+ * Leave an at least ~128 MB hole with possible stack randomization.
+ */
+ #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
+-#define MAX_GAP (TASK_SIZE/6*5)
++#define MAX_GAP (pax_task_size/6*5)
+
+ /*
+ * True on X86_32 or when emulating IA32 on X86_64
+@@ -94,27 +94,40 @@ static unsigned long mmap_rnd(void)
+ return rnd << PAGE_SHIFT;
+ }
+
+-static unsigned long mmap_base(void)
++static unsigned long mmap_base(struct mm_struct *mm)
+ {
+ unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
++ unsigned long pax_task_size = TASK_SIZE;
++
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (mm->pax_flags & MF_PAX_SEGMEXEC)
++ pax_task_size = SEGMEXEC_TASK_SIZE;
++#endif
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+- return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
++ return PAGE_ALIGN(pax_task_size - gap - mmap_rnd());
+ }
+
+ /*
+ * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
+ * does, but not when emulating X86_32
+ */
+-static unsigned long mmap_legacy_base(void)
++static unsigned long mmap_legacy_base(struct mm_struct *mm)
+ {
+- if (mmap_is_ia32())
++ if (mmap_is_ia32()) {
++
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (mm->pax_flags & MF_PAX_SEGMEXEC)
++ return SEGMEXEC_TASK_UNMAPPED_BASE;
++ else
++#endif
++
+ return TASK_UNMAPPED_BASE;
+- else
++ } else
+ return TASK_UNMAPPED_BASE + mmap_rnd();
+ }
+
+@@ -125,11 +138,23 @@ static unsigned long mmap_legacy_base(vo
+ void arch_pick_mmap_layout(struct mm_struct *mm)
+ {
+ if (mmap_is_legacy()) {
+- mm->mmap_base = mmap_legacy_base();
++ mm->mmap_base = mmap_legacy_base(mm);
++
++#ifdef CONFIG_PAX_RANDMMAP
++ if (mm->pax_flags & MF_PAX_RANDMMAP)
++ mm->mmap_base += mm->delta_mmap;
++#endif
++
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->unmap_area = arch_unmap_area;
+ } else {
+- mm->mmap_base = mmap_base();
++ mm->mmap_base = mmap_base(mm);
++
++#ifdef CONFIG_PAX_RANDMMAP
++ if (mm->pax_flags & MF_PAX_RANDMMAP)
++ mm->mmap_base -= mm->delta_mmap + mm->delta_stack;
++#endif
++
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ mm->unmap_area = arch_unmap_area_topdown;
+ }
+diff -urNp linux-2.6.33.1/arch/x86/mm/numa_32.c linux-2.6.33.1/arch/x86/mm/numa_32.c
+--- linux-2.6.33.1/arch/x86/mm/numa_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/numa_32.c 2010-03-20 16:58:39.036535097 -0400
+@@ -98,7 +98,6 @@ unsigned long node_memmap_size_bytes(int
+ }
+ #endif
+
+-extern unsigned long find_max_low_pfn(void);
+ extern unsigned long highend_pfn, highstart_pfn;
+
+ #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
+diff -urNp linux-2.6.33.1/arch/x86/mm/pageattr.c linux-2.6.33.1/arch/x86/mm/pageattr.c
+--- linux-2.6.33.1/arch/x86/mm/pageattr.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/pageattr.c 2010-03-20 16:58:39.036535097 -0400
+@@ -268,9 +268,10 @@ static inline pgprot_t static_protection
+ * Does not cover __inittext since that is gone later on. On
+ * 64bit we do not enforce !NX on the low mapping
+ */
+- if (within(address, (unsigned long)_text, (unsigned long)_etext))
++ if (within(address, ktla_ktva((unsigned long)_text), ktla_ktva((unsigned long)_etext)))
+ pgprot_val(forbidden) |= _PAGE_NX;
+
++#ifdef CONFIG_DEBUG_RODATA
+ /*
+ * The .rodata section needs to be read-only. Using the pfn
+ * catches all aliases.
+@@ -278,6 +279,7 @@ static inline pgprot_t static_protection
+ if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
+ __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+ pgprot_val(forbidden) |= _PAGE_RW;
++#endif
+
+ #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+ /*
+@@ -347,7 +349,10 @@ EXPORT_SYMBOL_GPL(lookup_address);
+ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+ {
+ /* change init_mm */
++ pax_open_kernel();
+ set_pte_atomic(kpte, pte);
++ pax_close_kernel();
++
+ #ifdef CONFIG_X86_32
+ if (!SHARED_KERNEL_PMD) {
+ struct page *page;
+diff -urNp linux-2.6.33.1/arch/x86/mm/pageattr-test.c linux-2.6.33.1/arch/x86/mm/pageattr-test.c
+--- linux-2.6.33.1/arch/x86/mm/pageattr-test.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/pageattr-test.c 2010-03-20 16:58:39.036535097 -0400
+@@ -36,7 +36,7 @@ enum {
+
+ static int pte_testbit(pte_t pte)
+ {
+- return pte_flags(pte) & _PAGE_UNUSED1;
++ return pte_flags(pte) & _PAGE_CPA_TEST;
+ }
+
+ struct split_state {
+diff -urNp linux-2.6.33.1/arch/x86/mm/pat.c linux-2.6.33.1/arch/x86/mm/pat.c
+--- linux-2.6.33.1/arch/x86/mm/pat.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/pat.c 2010-03-20 16:58:39.036535097 -0400
+@@ -259,7 +259,7 @@ chk_conflict(struct memtype *new, struct
+
+ conflict:
+ printk(KERN_INFO "%s:%d conflicting memory types "
+- "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
++ "%Lx-%Lx %s<->%s\n", current->comm, task_pid_nr(current), new->start,
+ new->end, cattr_name(new->type), cattr_name(entry->type));
+ return -EBUSY;
+ }
+@@ -555,7 +555,7 @@ unlock_ret:
+
+ if (err) {
+ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
+- current->comm, current->pid, start, end);
++ current->comm, task_pid_nr(current), start, end);
+ }
+
+ dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+@@ -750,7 +750,7 @@ int kernel_map_sync_memtype(u64 base, un
+ printk(KERN_INFO
+ "%s:%d ioremap_change_attr failed %s "
+ "for %Lx-%Lx\n",
+- current->comm, current->pid,
++ current->comm, task_pid_nr(current),
+ cattr_name(flags),
+ base, (unsigned long long)(base + size));
+ return -EINVAL;
+@@ -808,7 +808,7 @@ static int reserve_pfn_range(u64 paddr,
+ free_memtype(paddr, paddr + size);
+ printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
+ " for %Lx-%Lx, got %s\n",
+- current->comm, current->pid,
++ current->comm, task_pid_nr(current),
+ cattr_name(want_flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size),
+diff -urNp linux-2.6.33.1/arch/x86/mm/pgtable_32.c linux-2.6.33.1/arch/x86/mm/pgtable_32.c
+--- linux-2.6.33.1/arch/x86/mm/pgtable_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/pgtable_32.c 2010-03-20 16:58:39.036535097 -0400
+@@ -49,10 +49,13 @@ void set_pte_vaddr(unsigned long vaddr,
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
++
++ pax_open_kernel();
+ if (pte_val(pteval))
+ set_pte_at(&init_mm, vaddr, pte, pteval);
+ else
+ pte_clear(&init_mm, vaddr, pte);
++ pax_close_kernel();
+
+ /*
+ * It's enough to flush this one mapping.
+diff -urNp linux-2.6.33.1/arch/x86/mm/tlb.c linux-2.6.33.1/arch/x86/mm/tlb.c
+--- linux-2.6.33.1/arch/x86/mm/tlb.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/mm/tlb.c 2010-03-20 16:58:39.036535097 -0400
+@@ -13,7 +13,7 @@
+ #include <asm/uv/uv.h>
+
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+- = { &init_mm, 0, };
++ = { &init_mm, 0 };
+
+ /*
+ * Smarter SMP flushing macros.
+diff -urNp linux-2.6.33.1/arch/x86/oprofile/backtrace.c linux-2.6.33.1/arch/x86/oprofile/backtrace.c
+--- linux-2.6.33.1/arch/x86/oprofile/backtrace.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/oprofile/backtrace.c 2010-03-20 17:06:47.204705877 -0400
+@@ -58,7 +58,7 @@ static struct frame_head *dump_user_back
+ struct frame_head bufhead[2];
+
+ /* Also check accessibility of one struct frame_head beyond */
+- if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
++ if (!__access_ok(VERIFY_READ, head, sizeof(bufhead)))
+ return NULL;
+ if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ return NULL;
+@@ -78,7 +78,7 @@ x86_backtrace(struct pt_regs * const reg
+ {
+ struct frame_head *head = (struct frame_head *)frame_pointer(regs);
+
+- if (!user_mode_vm(regs)) {
++ if (!user_mode(regs)) {
+ unsigned long stack = kernel_stack_pointer(regs);
+ if (depth)
+ dump_trace(NULL, regs, (unsigned long *)stack, 0,
+diff -urNp linux-2.6.33.1/arch/x86/oprofile/op_model_p4.c linux-2.6.33.1/arch/x86/oprofile/op_model_p4.c
+--- linux-2.6.33.1/arch/x86/oprofile/op_model_p4.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/oprofile/op_model_p4.c 2010-03-20 16:58:39.036535097 -0400
+@@ -50,7 +50,7 @@ static inline void setup_num_counters(vo
+ #endif
+ }
+
+-static int inline addr_increment(void)
++static inline int addr_increment(void)
+ {
+ #ifdef CONFIG_SMP
+ return smp_num_siblings == 2 ? 2 : 1;
+diff -urNp linux-2.6.33.1/arch/x86/pci/common.c linux-2.6.33.1/arch/x86/pci/common.c
+--- linux-2.6.33.1/arch/x86/pci/common.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/common.c 2010-03-20 16:58:39.036535097 -0400
+@@ -31,8 +31,8 @@ int noioapicreroute = 1;
+ int pcibios_last_bus = -1;
+ unsigned long pirq_table_addr;
+ struct pci_bus *pci_root_bus;
+-struct pci_raw_ops *raw_pci_ops;
+-struct pci_raw_ops *raw_pci_ext_ops;
++const struct pci_raw_ops *raw_pci_ops;
++const struct pci_raw_ops *raw_pci_ext_ops;
+
+ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
+ int reg, int len, u32 *val)
+@@ -370,7 +370,7 @@ static const struct dmi_system_id __devi
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"),
+ },
+ },
+- {}
++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL}
+ };
+
+ void __init dmi_check_pciprobe(void)
+diff -urNp linux-2.6.33.1/arch/x86/pci/direct.c linux-2.6.33.1/arch/x86/pci/direct.c
+--- linux-2.6.33.1/arch/x86/pci/direct.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/direct.c 2010-03-20 16:58:39.040721575 -0400
+@@ -79,7 +79,7 @@ static int pci_conf1_write(unsigned int
+
+ #undef PCI_CONF1_ADDRESS
+
+-struct pci_raw_ops pci_direct_conf1 = {
++const struct pci_raw_ops pci_direct_conf1 = {
+ .read = pci_conf1_read,
+ .write = pci_conf1_write,
+ };
+@@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int
+
+ #undef PCI_CONF2_ADDRESS
+
+-struct pci_raw_ops pci_direct_conf2 = {
++const struct pci_raw_ops pci_direct_conf2 = {
+ .read = pci_conf2_read,
+ .write = pci_conf2_write,
+ };
+@@ -189,7 +189,7 @@ struct pci_raw_ops pci_direct_conf2 = {
+ * This should be close to trivial, but it isn't, because there are buggy
+ * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID.
+ */
+-static int __init pci_sanity_check(struct pci_raw_ops *o)
++static int __init pci_sanity_check(const struct pci_raw_ops *o)
+ {
+ u32 x = 0;
+ int year, devfn;
+diff -urNp linux-2.6.33.1/arch/x86/pci/fixup.c linux-2.6.33.1/arch/x86/pci/fixup.c
+--- linux-2.6.33.1/arch/x86/pci/fixup.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/fixup.c 2010-03-20 16:58:39.040721575 -0400
+@@ -364,7 +364,7 @@ static const struct dmi_system_id __devi
+ DMI_MATCH(DMI_PRODUCT_NAME, "MS-6702E"),
+ },
+ },
+- {}
++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL }
+ };
+
+ /*
+@@ -435,7 +435,7 @@ static const struct dmi_system_id __devi
+ DMI_MATCH(DMI_PRODUCT_VERSION, "PSA40U"),
+ },
+ },
+- { }
++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL }
+ };
+
+ static void __devinit pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev)
+diff -urNp linux-2.6.33.1/arch/x86/pci/irq.c linux-2.6.33.1/arch/x86/pci/irq.c
+--- linux-2.6.33.1/arch/x86/pci/irq.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/irq.c 2010-03-20 16:58:39.040721575 -0400
+@@ -543,7 +543,7 @@ static __init int intel_router_probe(str
+ static struct pci_device_id __initdata pirq_440gx[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
+- { },
++ { PCI_DEVICE(0, 0) }
+ };
+
+ /* 440GX has a proprietary PIRQ router -- don't use it */
+@@ -1107,7 +1107,7 @@ static struct dmi_system_id __initdata p
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+ },
+ },
+- { }
++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL }
+ };
+
+ int __init pcibios_irq_init(void)
+diff -urNp linux-2.6.33.1/arch/x86/pci/mmconfig_32.c linux-2.6.33.1/arch/x86/pci/mmconfig_32.c
+--- linux-2.6.33.1/arch/x86/pci/mmconfig_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/mmconfig_32.c 2010-03-20 16:58:39.040721575 -0400
+@@ -117,7 +117,7 @@ static int pci_mmcfg_write(unsigned int
+ return 0;
+ }
+
+-static struct pci_raw_ops pci_mmcfg = {
++static const struct pci_raw_ops pci_mmcfg = {
+ .read = pci_mmcfg_read,
+ .write = pci_mmcfg_write,
+ };
+diff -urNp linux-2.6.33.1/arch/x86/pci/mmconfig_64.c linux-2.6.33.1/arch/x86/pci/mmconfig_64.c
+--- linux-2.6.33.1/arch/x86/pci/mmconfig_64.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/mmconfig_64.c 2010-03-20 16:58:39.040721575 -0400
+@@ -81,7 +81,7 @@ static int pci_mmcfg_write(unsigned int
+ return 0;
+ }
+
+-static struct pci_raw_ops pci_mmcfg = {
++static const struct pci_raw_ops pci_mmcfg = {
+ .read = pci_mmcfg_read,
+ .write = pci_mmcfg_write,
+ };
+diff -urNp linux-2.6.33.1/arch/x86/pci/numaq_32.c linux-2.6.33.1/arch/x86/pci/numaq_32.c
+--- linux-2.6.33.1/arch/x86/pci/numaq_32.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/numaq_32.c 2010-03-20 16:58:39.040721575 -0400
+@@ -112,7 +112,7 @@ static int pci_conf1_mq_write(unsigned i
+
+ #undef PCI_CONF1_MQ_ADDRESS
+
+-static struct pci_raw_ops pci_direct_conf1_mq = {
++static const struct pci_raw_ops pci_direct_conf1_mq = {
+ .read = pci_conf1_mq_read,
+ .write = pci_conf1_mq_write
+ };
+diff -urNp linux-2.6.33.1/arch/x86/pci/olpc.c linux-2.6.33.1/arch/x86/pci/olpc.c
+--- linux-2.6.33.1/arch/x86/pci/olpc.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/olpc.c 2010-03-20 16:58:39.040721575 -0400
+@@ -297,7 +297,7 @@ static int pci_olpc_write(unsigned int s
+ return 0;
+ }
+
+-static struct pci_raw_ops pci_olpc_conf = {
++static const struct pci_raw_ops pci_olpc_conf = {
+ .read = pci_olpc_read,
+ .write = pci_olpc_write,
+ };
+diff -urNp linux-2.6.33.1/arch/x86/pci/pcbios.c linux-2.6.33.1/arch/x86/pci/pcbios.c
+--- linux-2.6.33.1/arch/x86/pci/pcbios.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/pci/pcbios.c 2010-03-20 16:58:39.040721575 -0400
+@@ -56,50 +56,93 @@ union bios32 {
+ static struct {
+ unsigned long address;
+ unsigned short segment;
+-} bios32_indirect = { 0, __KERNEL_CS };
++} bios32_indirect __read_only = { 0, __PCIBIOS_CS };
+
+ /*
+ * Returns the entry point for the given service, NULL on error
+ */
+
+-static unsigned long bios32_service(unsigned long service)
++static unsigned long __devinit bios32_service(unsigned long service)
+ {
+ unsigned char return_code; /* %al */
+ unsigned long address; /* %ebx */
+ unsigned long length; /* %ecx */
+ unsigned long entry; /* %edx */
+ unsigned long flags;
++ struct desc_struct d, *gdt;
+
+ local_irq_save(flags);
+- __asm__("lcall *(%%edi); cld"
++
++ gdt = get_cpu_gdt_table(smp_processor_id());
++
++ pack_descriptor(&d, 0UL, 0xFFFFFUL, 0x9B, 0xC);
++ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_CS, &d, DESCTYPE_S);
++ pack_descriptor(&d, 0UL, 0xFFFFFUL, 0x93, 0xC);
++ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_DS, &d, DESCTYPE_S);
++
++ __asm__("movw %w7, %%ds; lcall *(%%edi); push %%ss; pop %%ds; cld"
+ : "=a" (return_code),
+ "=b" (address),
+ "=c" (length),
+ "=d" (entry)
+ : "0" (service),
+ "1" (0),
+- "D" (&bios32_indirect));
++ "D" (&bios32_indirect),
++ "r"(__PCIBIOS_DS)
++ : "memory");
++
++ pax_open_kernel();
++ gdt[GDT_ENTRY_PCIBIOS_CS].a = 0;
++ gdt[GDT_ENTRY_PCIBIOS_CS].b = 0;
++ gdt[GDT_ENTRY_PCIBIOS_DS].a = 0;
++ gdt[GDT_ENTRY_PCIBIOS_DS].b = 0;
++ pax_close_kernel();
++
+ local_irq_restore(flags);
+
+ switch (return_code) {
+- case 0:
+- return address + entry;
+- case 0x80: /* Not present */
+- printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
+- return 0;
+- default: /* Shouldn't happen */
+- printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
+- service, return_code);
++ case 0: {
++ int cpu;
++ unsigned char flags;
++
++ printk(KERN_INFO "bios32_service: base:%08lx length:%08lx entry:%08lx\n", address, length, entry);
++ if (address >= 0xFFFF0 || length > 0x100000 - address || length <= entry) {
++ printk(KERN_WARNING "bios32_service: not valid\n");
+ return 0;
++ }
++ address = address + PAGE_OFFSET;
++ length += 16UL; /* some BIOSs underreport this... */
++ flags = 4;
++ if (length >= 64*1024*1024) {
++ length >>= PAGE_SHIFT;
++ flags |= 8;
++ }
++
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ gdt = get_cpu_gdt_table(cpu);
++ pack_descriptor(&d, address, length, 0x9b, flags);
++ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_CS, &d, DESCTYPE_S);
++ pack_descriptor(&d, address, length, 0x93, flags);
++ write_gdt_entry(gdt, GDT_ENTRY_PCIBIOS_DS, &d, DESCTYPE_S);
++ }
++ return entry;
++ }
++ case 0x80: /* Not present */
++ printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
++ return 0;
++ default: /* Shouldn't happen */
++ printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
++ service, return_code);
++ return 0;
+ }
+ }
+
+ static struct {
+ unsigned long address;
+ unsigned short segment;
+-} pci_indirect = { 0, __KERNEL_CS };
++} pci_indirect __read_only = { 0, __PCIBIOS_CS };
+
+-static int pci_bios_present;
++static int pci_bios_present __read_only;
+
+ static int __devinit check_pcibios(void)
+ {
+@@ -108,11 +151,13 @@ static int __devinit check_pcibios(void)
+ unsigned long flags, pcibios_entry;
+
+ if ((pcibios_entry = bios32_service(PCI_SERVICE))) {
+- pci_indirect.address = pcibios_entry + PAGE_OFFSET;
++ pci_indirect.address = pcibios_entry;
+
+ local_irq_save(flags);
+- __asm__(
+- "lcall *(%%edi); cld\n\t"
++ __asm__("movw %w6, %%ds\n\t"
++ "lcall *%%ss:(%%edi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -121,7 +166,8 @@ static int __devinit check_pcibios(void)
+ "=b" (ebx),
+ "=c" (ecx)
+ : "1" (PCIBIOS_PCI_BIOS_PRESENT),
+- "D" (&pci_indirect)
++ "D" (&pci_indirect),
++ "r" (__PCIBIOS_DS)
+ : "memory");
+ local_irq_restore(flags);
+
+@@ -165,7 +211,10 @@ static int pci_bios_read(unsigned int se
+
+ switch (len) {
+ case 1:
+- __asm__("lcall *(%%esi); cld\n\t"
++ __asm__("movw %w6, %%ds\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -174,7 +223,8 @@ static int pci_bios_read(unsigned int se
+ : "1" (PCIBIOS_READ_CONFIG_BYTE),
+ "b" (bx),
+ "D" ((long)reg),
+- "S" (&pci_indirect));
++ "S" (&pci_indirect),
++ "r" (__PCIBIOS_DS));
+ /*
+ * Zero-extend the result beyond 8 bits, do not trust the
+ * BIOS having done it:
+@@ -182,7 +232,10 @@ static int pci_bios_read(unsigned int se
+ *value &= 0xff;
+ break;
+ case 2:
+- __asm__("lcall *(%%esi); cld\n\t"
++ __asm__("movw %w6, %%ds\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -191,7 +244,8 @@ static int pci_bios_read(unsigned int se
+ : "1" (PCIBIOS_READ_CONFIG_WORD),
+ "b" (bx),
+ "D" ((long)reg),
+- "S" (&pci_indirect));
++ "S" (&pci_indirect),
++ "r" (__PCIBIOS_DS));
+ /*
+ * Zero-extend the result beyond 16 bits, do not trust the
+ * BIOS having done it:
+@@ -199,7 +253,10 @@ static int pci_bios_read(unsigned int se
+ *value &= 0xffff;
+ break;
+ case 4:
+- __asm__("lcall *(%%esi); cld\n\t"
++ __asm__("movw %w6, %%ds\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -208,7 +265,8 @@ static int pci_bios_read(unsigned int se
+ : "1" (PCIBIOS_READ_CONFIG_DWORD),
+ "b" (bx),
+ "D" ((long)reg),
+- "S" (&pci_indirect));
++ "S" (&pci_indirect),
++ "r" (__PCIBIOS_DS));
+ break;
+ }
+
+@@ -231,7 +289,10 @@ static int pci_bios_write(unsigned int s
+
+ switch (len) {
+ case 1:
+- __asm__("lcall *(%%esi); cld\n\t"
++ __asm__("movw %w6, %%ds\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -240,10 +301,14 @@ static int pci_bios_write(unsigned int s
+ "c" (value),
+ "b" (bx),
+ "D" ((long)reg),
+- "S" (&pci_indirect));
++ "S" (&pci_indirect),
++ "r" (__PCIBIOS_DS));
+ break;
+ case 2:
+- __asm__("lcall *(%%esi); cld\n\t"
++ __asm__("movw %w6, %%ds\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -252,10 +317,14 @@ static int pci_bios_write(unsigned int s
+ "c" (value),
+ "b" (bx),
+ "D" ((long)reg),
+- "S" (&pci_indirect));
++ "S" (&pci_indirect),
++ "r" (__PCIBIOS_DS));
+ break;
+ case 4:
+- __asm__("lcall *(%%esi); cld\n\t"
++ __asm__("movw %w6, %%ds\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -264,7 +333,8 @@ static int pci_bios_write(unsigned int s
+ "c" (value),
+ "b" (bx),
+ "D" ((long)reg),
+- "S" (&pci_indirect));
++ "S" (&pci_indirect),
++ "r" (__PCIBIOS_DS));
+ break;
+ }
+
+@@ -278,7 +348,7 @@ static int pci_bios_write(unsigned int s
+ * Function table for BIOS32 access
+ */
+
+-static struct pci_raw_ops pci_bios_access = {
++static const struct pci_raw_ops pci_bios_access = {
+ .read = pci_bios_read,
+ .write = pci_bios_write
+ };
+@@ -287,7 +357,7 @@ static struct pci_raw_ops pci_bios_acces
+ * Try to find PCI BIOS.
+ */
+
+-static struct pci_raw_ops * __devinit pci_find_bios(void)
++static const struct pci_raw_ops * __devinit pci_find_bios(void)
+ {
+ union bios32 *check;
+ unsigned char sum;
+@@ -368,10 +438,13 @@ struct irq_routing_table * pcibios_get_i
+
+ DBG("PCI: Fetching IRQ routing table... ");
+ __asm__("push %%es\n\t"
++ "movw %w8, %%ds\n\t"
+ "push %%ds\n\t"
+ "pop %%es\n\t"
+- "lcall *(%%esi); cld\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
+ "pop %%es\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -382,7 +455,8 @@ struct irq_routing_table * pcibios_get_i
+ "1" (0),
+ "D" ((long) &opt),
+ "S" (&pci_indirect),
+- "m" (opt)
++ "m" (opt),
++ "r" (__PCIBIOS_DS)
+ : "memory");
+ DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map);
+ if (ret & 0xff00)
+@@ -406,7 +480,10 @@ int pcibios_set_irq_routing(struct pci_d
+ {
+ int ret;
+
+- __asm__("lcall *(%%esi); cld\n\t"
++ __asm__("movw %w5, %%ds\n\t"
++ "lcall *%%ss:(%%esi); cld\n\t"
++ "push %%ss\n\t"
++ "pop %%ds\n"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+@@ -414,7 +491,8 @@ int pcibios_set_irq_routing(struct pci_d
+ : "0" (PCIBIOS_SET_PCI_HW_INT),
+ "b" ((dev->bus->number << 8) | dev->devfn),
+ "c" ((irq << 8) | (pin + 10)),
+- "S" (&pci_indirect));
++ "S" (&pci_indirect),
++ "r" (__PCIBIOS_DS));
+ return !(ret & 0xff00);
+ }
+ EXPORT_SYMBOL(pcibios_set_irq_routing);
+diff -urNp linux-2.6.33.1/arch/x86/power/cpu.c linux-2.6.33.1/arch/x86/power/cpu.c
+--- linux-2.6.33.1/arch/x86/power/cpu.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/power/cpu.c 2010-03-20 16:58:39.040721575 -0400
+@@ -127,7 +127,7 @@ static void do_fpu_end(void)
+ static void fix_processor_context(void)
+ {
+ int cpu = smp_processor_id();
+- struct tss_struct *t = &per_cpu(init_tss, cpu);
++ struct tss_struct *t = init_tss + cpu;
+
+ set_tss_desc(cpu, t); /*
+ * This just modifies memory; should not be
+@@ -137,7 +137,9 @@ static void fix_processor_context(void)
+ */
+
+ #ifdef CONFIG_X86_64
++ pax_open_kernel();
+ get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
++ pax_close_kernel();
+
+ syscall_init(); /* This sets MSR_*STAR and related */
+ #endif
+diff -urNp linux-2.6.33.1/arch/x86/vdso/Makefile linux-2.6.33.1/arch/x86/vdso/Makefile
+--- linux-2.6.33.1/arch/x86/vdso/Makefile 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/vdso/Makefile 2010-03-20 16:58:39.040721575 -0400
+@@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@
+ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
+
+-VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
++VDSO_LDFLAGS = -fPIC -shared --no-undefined $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+ GCOV_PROFILE := n
+
+ #
+diff -urNp linux-2.6.33.1/arch/x86/vdso/vclock_gettime.c linux-2.6.33.1/arch/x86/vdso/vclock_gettime.c
+--- linux-2.6.33.1/arch/x86/vdso/vclock_gettime.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/vdso/vclock_gettime.c 2010-03-20 16:58:39.044545939 -0400
+@@ -22,24 +22,48 @@
+ #include <asm/hpet.h>
+ #include <asm/unistd.h>
+ #include <asm/io.h>
++#include <asm/fixmap.h>
+ #include "vextern.h"
+
+ #define gtod vdso_vsyscall_gtod_data
+
++notrace noinline long __vdso_fallback_time(long *t)
++{
++ long secs;
++ asm volatile("syscall"
++ : "=a" (secs)
++ : "0" (__NR_time),"D" (t) : "r11", "cx", "memory");
++ return secs;
++}
++
+ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+ {
+ long ret;
+ asm("syscall" : "=a" (ret) :
+- "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
++ "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "r11", "cx", "memory");
+ return ret;
+ }
+
++notrace static inline cycle_t __vdso_vread_hpet(void)
++{
++ return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
++}
++
++notrace static inline cycle_t __vdso_vread_tsc(void)
++{
++ cycle_t ret = (cycle_t)vget_cycles();
++
++ return ret >= gtod->clock.cycle_last ? ret : gtod->clock.cycle_last;
++}
++
+ notrace static inline long vgetns(void)
+ {
+ long v;
+- cycles_t (*vread)(void);
+- vread = gtod->clock.vread;
+- v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
++ if (gtod->clock.name[0] == 't' && gtod->clock.name[1] == 's' && gtod->clock.name[2] == 'c' && !gtod->clock.name[3])
++ v = __vdso_vread_tsc();
++ else
++ v = __vdso_vread_hpet();
++ v = (v - gtod->clock.cycle_last) & gtod->clock.mask;
+ return (v * gtod->clock.mult) >> gtod->clock.shift;
+ }
+
+@@ -113,7 +137,9 @@ notrace static noinline int do_monotonic
+
+ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+ {
+- if (likely(gtod->sysctl_enabled))
++ if (likely(gtod->sysctl_enabled &&
++ ((gtod->clock.name[0] == 'h' && gtod->clock.name[1] == 'p' && gtod->clock.name[2] == 'e' && gtod->clock.name[3] == 't' && !gtod->clock.name[4]) ||
++ (gtod->clock.name[0] == 't' && gtod->clock.name[1] == 's' && gtod->clock.name[2] == 'c' && !gtod->clock.name[3]))))
+ switch (clock) {
+ case CLOCK_REALTIME:
+ if (likely(gtod->clock.vread))
+@@ -133,10 +159,20 @@ notrace int __vdso_clock_gettime(clockid
+ int clock_gettime(clockid_t, struct timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+-notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
++notrace noinline int __vdso_fallback_gettimeofday(struct timeval *tv, struct timezone *tz)
+ {
+ long ret;
+- if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
++ asm("syscall" : "=a" (ret) :
++ "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "r11", "cx", "memory");
++ return ret;
++}
++
++notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
++{
++ if (likely(gtod->sysctl_enabled &&
++ ((gtod->clock.name[0] == 'h' && gtod->clock.name[1] == 'p' && gtod->clock.name[2] == 'e' && gtod->clock.name[3] == 't' && !gtod->clock.name[4]) ||
++ (gtod->clock.name[0] == 't' && gtod->clock.name[1] == 's' && gtod->clock.name[2] == 'c' && !gtod->clock.name[3]))))
++ {
+ if (likely(tv != NULL)) {
+ BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
+ offsetof(struct timespec, tv_nsec) ||
+@@ -151,9 +187,7 @@ notrace int __vdso_gettimeofday(struct t
+ }
+ return 0;
+ }
+- asm("syscall" : "=a" (ret) :
+- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+- return ret;
++ return __vdso_fallback_gettimeofday(tv, tz);
+ }
+ int gettimeofday(struct timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
+diff -urNp linux-2.6.33.1/arch/x86/vdso/vdso32-setup.c linux-2.6.33.1/arch/x86/vdso/vdso32-setup.c
+--- linux-2.6.33.1/arch/x86/vdso/vdso32-setup.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/arch/x86/vdso/vdso32-setup.c 2010-03-20 16:58:39.044545939 -0400
+@@ -25,6 +25,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/vdso.h>
+ #include <asm/proto.h>
++#include <asm/mman.h>
+
+ enum {
+ VDSO_DISABLED = 0,
+@@ -226,7 +227,7 @@ static inline void map_compat_vdso(int m
+ void enable_sep_cpu(void)
+ {
+ int cpu = get_cpu();
+- struct tss_struct *tss = &per_cpu(init_tss, cpu);
++ struct tss_struct *tss = init_tss + cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ put_cpu();
+@@ -249,7 +250,7 @@ static int __init gate_vma_init(void)
+ gate_vma.vm_start = FIXADDR_USER_START;
+ gate_vma.vm_end = FIXADDR_USER_END;
+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+- gate_vma.vm_page_prot = __P101;
++ gate_vma.vm_page_prot = vm_get_page_prot(gate_vma.vm_flags);
+ /*
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully interpretable later
+ * Dumping its contents makes post-mortem fully interpretable later +@@ -331,14 +332,14 @@ int arch_setup_additional_pages(struct l + if (compat) + addr = VDSO_HIGH_BASE; + else { +- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); ++ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, MAP_EXECUTABLE); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + } + +- current->mm->context.vdso = (void *)addr; ++ current->mm->context.vdso = addr; + + if (compat_uses_vma || !compat) { + /* +@@ -361,11 +362,11 @@ int arch_setup_additional_pages(struct l + } + + current_thread_info()->sysenter_return = +- VDSO32_SYMBOL(addr, SYSENTER_RETURN); ++ (__force void __user *)VDSO32_SYMBOL(addr, SYSENTER_RETURN); + + up_fail: + if (ret) +- current->mm->context.vdso = NULL; ++ current->mm->context.vdso = 0; + + up_write(&mm->mmap_sem); + +@@ -412,8 +413,14 @@ __initcall(ia32_binfmt_init); + + const char *arch_vma_name(struct vm_area_struct *vma) + { +- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) ++ if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso) + return "[vdso]"; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma->vm_mm && vma->vm_mirror && vma->vm_mirror->vm_start == vma->vm_mm->context.vdso) ++ return "[vdso]"; ++#endif ++ + return NULL; + } + +@@ -422,7 +429,7 @@ struct vm_area_struct *get_gate_vma(stru + struct mm_struct *mm = tsk->mm; + + /* Check to see if this task was created in compat vdso mode */ +- if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) ++ if (mm && mm->context.vdso == VDSO_HIGH_BASE) + return &gate_vma; + return NULL; + } +diff -urNp linux-2.6.33.1/arch/x86/vdso/vdso.lds.S linux-2.6.33.1/arch/x86/vdso/vdso.lds.S +--- linux-2.6.33.1/arch/x86/vdso/vdso.lds.S 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/vdso/vdso.lds.S 2010-03-20 16:58:39.044545939 -0400 +@@ -35,3 +35,9 @@ VDSO64_PRELINK = VDSO_PRELINK; + #define VEXTERN(x) VDSO64_ ## x = vdso_ ## x; + #include "vextern.h" + #undef VEXTERN ++ ++#define VEXTERN(x) VDSO64_ ## x = __vdso_ ## x; ++VEXTERN(fallback_gettimeofday) ++VEXTERN(fallback_time) ++VEXTERN(getcpu) ++#undef VEXTERN +diff -urNp linux-2.6.33.1/arch/x86/vdso/vextern.h linux-2.6.33.1/arch/x86/vdso/vextern.h +--- linux-2.6.33.1/arch/x86/vdso/vextern.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/vdso/vextern.h 2010-03-20 16:58:39.044545939 -0400 +@@ -11,6 +11,5 @@ + put into vextern.h and be referenced as a pointer with vdso prefix. + The main kernel later fills in the values. 
*/ + +-VEXTERN(jiffies) + VEXTERN(vgetcpu_mode) + VEXTERN(vsyscall_gtod_data) +diff -urNp linux-2.6.33.1/arch/x86/vdso/vma.c linux-2.6.33.1/arch/x86/vdso/vma.c +--- linux-2.6.33.1/arch/x86/vdso/vma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/vdso/vma.c 2010-03-20 16:58:39.044545939 -0400 +@@ -57,7 +57,7 @@ static int __init init_vdso_vars(void) + if (!vbase) + goto oom; + +- if (memcmp(vbase, "\177ELF", 4)) { ++ if (memcmp(vbase, ELFMAG, SELFMAG)) { + printk("VDSO: I'm broken; not ELF\n"); + vdso_enabled = 0; + } +@@ -66,6 +66,7 @@ static int __init init_vdso_vars(void) + *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; + #include "vextern.h" + #undef VEXTERN ++ vunmap(vbase); + return 0; + + oom: +@@ -116,7 +117,7 @@ int arch_setup_additional_pages(struct l + goto up_fail; + } + +- current->mm->context.vdso = (void *)addr; ++ current->mm->context.vdso = addr; + + ret = install_special_mapping(mm, addr, vdso_size, + VM_READ|VM_EXEC| +@@ -124,7 +125,7 @@ int arch_setup_additional_pages(struct l + VM_ALWAYSDUMP, + vdso_pages); + if (ret) { +- current->mm->context.vdso = NULL; ++ current->mm->context.vdso = 0; + goto up_fail; + } + +@@ -132,10 +133,3 @@ up_fail: + up_write(&mm->mmap_sem); + return ret; + } +- +-static __init int vdso_setup(char *s) +-{ +- vdso_enabled = simple_strtoul(s, NULL, 0); +- return 0; +-} +-__setup("vdso=", vdso_setup); +diff -urNp linux-2.6.33.1/arch/x86/xen/enlighten.c linux-2.6.33.1/arch/x86/xen/enlighten.c +--- linux-2.6.33.1/arch/x86/xen/enlighten.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/xen/enlighten.c 2010-03-20 16:58:39.044545939 -0400 +@@ -73,8 +73,6 @@ EXPORT_SYMBOL_GPL(xen_start_info); + + struct shared_info xen_dummy_shared_info; + +-void *xen_initial_gdt; +- + /* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. +@@ -550,7 +548,7 @@ static void xen_write_idt_entry(gate_des + + preempt_disable(); + +- start = __get_cpu_var(idt_desc).address; ++ start = (unsigned long)__get_cpu_var(idt_desc).address; + end = start + __get_cpu_var(idt_desc).size + 1; + + xen_mc_flush(); +@@ -1133,13 +1131,6 @@ asmlinkage void __init xen_start_kernel( + + machine_ops = xen_machine_ops; + +- /* +- * The only reliable way to retain the initial address of the +- * percpu gdt_page is to remember it here, so we can go and +- * mark it RW later, when the initial percpu area is freed. 
+- */ +- xen_initial_gdt = &per_cpu(gdt_page, 0); +- + xen_smp_init(); + + pgd = (pgd_t *)xen_start_info->pt_base; +diff -urNp linux-2.6.33.1/arch/x86/xen/mmu.c linux-2.6.33.1/arch/x86/xen/mmu.c +--- linux-2.6.33.1/arch/x86/xen/mmu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/xen/mmu.c 2010-03-20 16:58:39.044545939 -0400 +@@ -1711,6 +1711,8 @@ __init pgd_t *xen_setup_kernel_pagetable + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); ++ convert_pfn_mfn(level3_vmalloc_pgt); ++ convert_pfn_mfn(level3_vmemmap_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); +@@ -1729,7 +1731,10 @@ __init pgd_t *xen_setup_kernel_pagetable + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); ++ set_page_prot(level3_vmalloc_pgt, PAGE_KERNEL_RO); ++ set_page_prot(level3_vmemmap_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); ++ set_page_prot(level2_vmemmap_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + +diff -urNp linux-2.6.33.1/arch/x86/xen/smp.c linux-2.6.33.1/arch/x86/xen/smp.c +--- linux-2.6.33.1/arch/x86/xen/smp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/xen/smp.c 2010-03-20 16:58:39.044545939 -0400 +@@ -168,11 +168,6 @@ static void __init xen_smp_prepare_boot_ + { + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); +- +- /* We've switched to the "real" per-cpu gdt, so make sure the +- old memory can be recycled */ +- make_lowmem_page_readwrite(xen_initial_gdt); +- + xen_setup_vcpu_info_placement(); + } + +@@ -232,8 +227,8 @@ cpu_initialize_context(unsigned int cpu, + gdt = get_cpu_gdt_table(cpu); + + ctxt->flags = VGCF_IN_KERNEL; +- ctxt->user_regs.ds = __USER_DS; +- ctxt->user_regs.es = __USER_DS; ++ ctxt->user_regs.ds = __KERNEL_DS; ++ ctxt->user_regs.es = __KERNEL_DS; + ctxt->user_regs.ss = __KERNEL_DS; + #ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; +diff -urNp linux-2.6.33.1/arch/x86/xen/xen-ops.h linux-2.6.33.1/arch/x86/xen/xen-ops.h +--- linux-2.6.33.1/arch/x86/xen/xen-ops.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/arch/x86/xen/xen-ops.h 2010-03-20 16:58:39.044545939 -0400 +@@ -10,8 +10,6 @@ + extern const char xen_hypervisor_callback[]; + extern const char xen_failsafe_callback[]; + +-extern void *xen_initial_gdt; +- + struct trap_info; + void xen_copy_trap_info(struct trap_info *traps); + +diff -urNp linux-2.6.33.1/block/blk-integrity.c linux-2.6.33.1/block/blk-integrity.c +--- linux-2.6.33.1/block/blk-integrity.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/block/blk-integrity.c 2010-03-20 16:58:39.052902082 -0400 +@@ -278,7 +278,7 @@ static struct attribute *integrity_attrs + NULL, + }; + +-static struct sysfs_ops integrity_ops = { ++static const struct sysfs_ops integrity_ops = { + .show = &integrity_attr_show, + .store = &integrity_attr_store, + }; +diff -urNp linux-2.6.33.1/block/blk-iopoll.c linux-2.6.33.1/block/blk-iopoll.c +--- linux-2.6.33.1/block/blk-iopoll.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/block/blk-iopoll.c 2010-03-20 16:58:39.068779508 -0400 +@@ -77,7 +77,7 @@ void blk_iopoll_complete(struct blk_iopo + } + EXPORT_SYMBOL(blk_iopoll_complete); + +-static void blk_iopoll_softirq(struct softirq_action *h) ++static void 
blk_iopoll_softirq(void) + { + struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); + int rearm = 0, budget = blk_iopoll_budget; +diff -urNp linux-2.6.33.1/block/blk-map.c linux-2.6.33.1/block/blk-map.c +--- linux-2.6.33.1/block/blk-map.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/block/blk-map.c 2010-03-20 16:58:39.068779508 -0400 +@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct requ + * direct dma. else, set up kernel bounce buffers + */ + uaddr = (unsigned long) ubuf; +- if (blk_rq_aligned(q, ubuf, len) && !map_data) ++ if (blk_rq_aligned(q, (__force void *)ubuf, len) && !map_data) + bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); + else + bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); +@@ -297,7 +297,7 @@ int blk_rq_map_kern(struct request_queue + if (!len || !kbuf) + return -EINVAL; + +- do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); ++ do_copy = !blk_rq_aligned(q, kbuf, len) || object_starts_on_stack(kbuf); + if (do_copy) + bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); + else +diff -urNp linux-2.6.33.1/block/blk-softirq.c linux-2.6.33.1/block/blk-softirq.c +--- linux-2.6.33.1/block/blk-softirq.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/block/blk-softirq.c 2010-03-20 16:58:39.068779508 -0400 +@@ -17,7 +17,7 @@ static DEFINE_PER_CPU(struct list_head, + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +-static void blk_done_softirq(struct softirq_action *h) ++static void blk_done_softirq(void) + { + struct list_head *cpu_list, local_list; + +diff -urNp linux-2.6.33.1/block/blk-sysfs.c linux-2.6.33.1/block/blk-sysfs.c +--- linux-2.6.33.1/block/blk-sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/block/blk-sysfs.c 2010-03-20 16:58:39.068779508 -0400 +@@ -447,7 +447,7 @@ static void blk_release_queue(struct kob + kmem_cache_free(blk_requestq_cachep, q); + } + +-static struct sysfs_ops queue_sysfs_ops = { ++static const struct sysfs_ops queue_sysfs_ops = { + .show = queue_attr_show, + .store = queue_attr_store, + }; +diff -urNp linux-2.6.33.1/block/elevator.c linux-2.6.33.1/block/elevator.c +--- linux-2.6.33.1/block/elevator.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/block/elevator.c 2010-03-20 16:58:39.073372227 -0400 +@@ -883,7 +883,7 @@ elv_attr_store(struct kobject *kobj, str + return error; + } + +-static struct sysfs_ops elv_sysfs_ops = { ++static const struct sysfs_ops elv_sysfs_ops = { + .show = elv_attr_show, + .store = elv_attr_store, + }; +diff -urNp linux-2.6.33.1/crypto/lrw.c linux-2.6.33.1/crypto/lrw.c +--- linux-2.6.33.1/crypto/lrw.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/crypto/lrw.c 2010-03-20 16:58:39.077284706 -0400 +@@ -60,7 +60,7 @@ static int setkey(struct crypto_tfm *par + struct priv *ctx = crypto_tfm_ctx(parent); + struct crypto_cipher *child = ctx->child; + int err, i; +- be128 tmp = { 0 }; ++ be128 tmp = { 0, 0 }; + int bsize = crypto_cipher_blocksize(child); + + crypto_cipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); +diff -urNp linux-2.6.33.1/Documentation/dontdiff linux-2.6.33.1/Documentation/dontdiff +--- linux-2.6.33.1/Documentation/dontdiff 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/Documentation/dontdiff 2010-03-20 16:58:39.077284706 -0400 +@@ -3,6 +3,7 @@ + *.bin + *.cpio + *.csp ++*.dbg + *.dsp + *.dvi + *.elf +@@ -40,6 +41,7 @@ + *.ver + *.xml + *_MODULES ++*_reg_safe.h + *_vga16.c + *~ + *.9 +@@ -49,11 +51,16 @@ + 
53c700_d.h + CVS + ChangeSet ++GPATH ++GRTAGS ++GSYMS ++GTAGS + Image + Kerntypes + Module.markers + Module.symvers + PENDING ++PERF* + SCCS + System.map* + TAGS +@@ -77,7 +84,9 @@ btfixupprep + build + bvmlinux + bzImage* ++capflags.c + classlist.h* ++common-cmds.h + comp*.log + compile.h* + conf +@@ -107,13 +116,15 @@ generated + genheaders + genksyms + *_gray256.c ++hash + ihex2fw + ikconfig.h* ++inat-tables.c + initramfs_data.cpio ++initramfs_data.cpio.bz2 + initramfs_data.cpio.gz + initramfs_list + kallsyms +-kconfig + keywords.c + ksym.c* + ksym.h* +@@ -137,10 +148,13 @@ mkboot + mkbugboot + mkcpustr + mkdep ++mkpiggy + mkprep ++mkregtable + mktables + mktree + modpost ++modules.builtin + modules.order + modversions.h* + ncscope.* +@@ -153,6 +167,7 @@ patches* + pca200e.bin + pca200e_ecd.bin2 + piggy.gz ++piggy.S + piggyback + pnmtologo + ppc_defs.h* +@@ -167,6 +182,7 @@ setup + setup.bin + setup.elf + sImage ++slabinfo + sm_tbl* + split-include + syscalltab.h +@@ -190,14 +206,20 @@ version.h* + vmlinux + vmlinux-* + vmlinux.aout ++vmlinux.bin.all ++vmlinux.bin.bz2 + vmlinux.lds ++vmlinux.relocs ++voffset.h + vsyscall.lds + vsyscall_32.lds + wanxlfw.inc + uImage + unifdef ++utsrelease.h + wakeup.bin + wakeup.elf + wakeup.lds + zImage* + zconf.hash.c ++zoffset.h +diff -urNp linux-2.6.33.1/Documentation/filesystems/sysfs.txt linux-2.6.33.1/Documentation/filesystems/sysfs.txt +--- linux-2.6.33.1/Documentation/filesystems/sysfs.txt 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/Documentation/filesystems/sysfs.txt 2010-03-20 16:58:39.077284706 -0400 +@@ -123,8 +123,8 @@ set of sysfs operations for forwarding r + show and store methods of the attribute owners. + + struct sysfs_ops { +- ssize_t (*show)(struct kobject *, struct attribute *, char *); +- ssize_t (*store)(struct kobject *, struct attribute *, const char *); ++ ssize_t (* const show)(struct kobject *, struct attribute *, char *); ++ ssize_t (* const store)(struct kobject *, struct attribute *, const char *); + }; + + [ Subsystems should have already defined a struct kobj_type as a +diff -urNp linux-2.6.33.1/Documentation/kernel-parameters.txt linux-2.6.33.1/Documentation/kernel-parameters.txt +--- linux-2.6.33.1/Documentation/kernel-parameters.txt 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/Documentation/kernel-parameters.txt 2010-03-20 16:58:39.081342736 -0400 +@@ -1865,6 +1865,12 @@ and is between 256 and 4096 characters. + the specified number of seconds. This is to be used if + your oopses keep scrolling off the screen. + ++ pax_nouderef [X86-32] disables UDEREF. Most likely needed under certain ++ virtualization environments that don't cope well with the ++ expand down segment used by UDEREF on X86-32. ++ ++ pax_softmode= [X86-32] 0/1 to disable/enable PaX softmode on boot already. ++ + pcbit= [HW,ISDN] + + pcd. 
[PARIDE] +diff -urNp linux-2.6.33.1/drivers/acpi/battery.c linux-2.6.33.1/drivers/acpi/battery.c +--- linux-2.6.33.1/drivers/acpi/battery.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/battery.c 2010-03-20 16:58:39.097283097 -0400 +@@ -763,7 +763,7 @@ DECLARE_FILE_FUNCTIONS(alarm); + } + + static struct battery_file { +- struct file_operations ops; ++ const struct file_operations ops; + mode_t mode; + const char *name; + } acpi_battery_file[] = { +diff -urNp linux-2.6.33.1/drivers/acpi/blacklist.c linux-2.6.33.1/drivers/acpi/blacklist.c +--- linux-2.6.33.1/drivers/acpi/blacklist.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/blacklist.c 2010-03-20 16:58:39.097283097 -0400 +@@ -73,7 +73,7 @@ static struct acpi_blacklist_item acpi_b + {"IBM ", "TP600E ", 0x00000105, ACPI_SIG_DSDT, less_than_or_equal, + "Incorrect _ADR", 1}, + +- {""} ++ {"", "", 0, NULL, all_versions, NULL, 0} + }; + + #if CONFIG_ACPI_BLACKLIST_YEAR +diff -urNp linux-2.6.33.1/drivers/acpi/dock.c linux-2.6.33.1/drivers/acpi/dock.c +--- linux-2.6.33.1/drivers/acpi/dock.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/dock.c 2010-03-20 16:58:39.097283097 -0400 +@@ -76,7 +76,7 @@ struct dock_dependent_device { + struct list_head list; + struct list_head hotplug_list; + acpi_handle handle; +- struct acpi_dock_ops *ops; ++ const struct acpi_dock_ops *ops; + void *context; + }; + +@@ -588,7 +588,7 @@ EXPORT_SYMBOL_GPL(unregister_dock_notifi + * the dock driver after _DCK is executed. + */ + int +-register_hotplug_dock_device(acpi_handle handle, struct acpi_dock_ops *ops, ++register_hotplug_dock_device(acpi_handle handle, const struct acpi_dock_ops *ops, + void *context) + { + struct dock_dependent_device *dd; +diff -urNp linux-2.6.33.1/drivers/acpi/osl.c linux-2.6.33.1/drivers/acpi/osl.c +--- linux-2.6.33.1/drivers/acpi/osl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/osl.c 2010-03-20 16:58:39.101703980 -0400 +@@ -523,6 +523,8 @@ acpi_os_read_memory(acpi_physical_addres + void __iomem *virt_addr; + + virt_addr = ioremap(phys_addr, width); ++ if (!virt_addr) ++ return AE_NO_MEMORY; + if (!value) + value = &dummy; + +@@ -551,6 +553,8 @@ acpi_os_write_memory(acpi_physical_addre + void __iomem *virt_addr; + + virt_addr = ioremap(phys_addr, width); ++ if (!virt_addr) ++ return AE_NO_MEMORY; + + switch (width) { + case 8: +diff -urNp linux-2.6.33.1/drivers/acpi/processor_core.c linux-2.6.33.1/drivers/acpi/processor_core.c +--- linux-2.6.33.1/drivers/acpi/processor_core.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/processor_core.c 2010-03-20 16:58:39.109290821 -0400 +@@ -734,7 +734,7 @@ static int __cpuinit acpi_processor_add( + return 0; + } + +- BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0)); ++ BUG_ON(pr->id >= nr_cpu_ids); + + /* + * Buggy BIOS check +diff -urNp linux-2.6.33.1/drivers/acpi/processor_idle.c linux-2.6.33.1/drivers/acpi/processor_idle.c +--- linux-2.6.33.1/drivers/acpi/processor_idle.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/processor_idle.c 2010-03-20 16:58:39.109290821 -0400 +@@ -118,7 +118,7 @@ static struct dmi_system_id __cpuinitdat + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME,"L8400B series Notebook PC")}, + (void *)1}, +- {}, ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL}, + }; + + +diff -urNp linux-2.6.33.1/drivers/acpi/sleep.c linux-2.6.33.1/drivers/acpi/sleep.c +--- linux-2.6.33.1/drivers/acpi/sleep.c 2010-03-15 
12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/sleep.c 2010-03-20 16:58:39.113276130 -0400 +@@ -302,7 +302,7 @@ static int acpi_suspend_state_valid(susp + } + } + +-static struct platform_suspend_ops acpi_suspend_ops = { ++static const struct platform_suspend_ops acpi_suspend_ops = { + .valid = acpi_suspend_state_valid, + .begin = acpi_suspend_begin, + .prepare_late = acpi_pm_prepare, +@@ -330,7 +330,7 @@ static int acpi_suspend_begin_old(suspen + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. + */ +-static struct platform_suspend_ops acpi_suspend_ops_old = { ++static const struct platform_suspend_ops acpi_suspend_ops_old = { + .valid = acpi_suspend_state_valid, + .begin = acpi_suspend_begin_old, + .prepare_late = acpi_pm_disable_gpes, +@@ -557,7 +557,7 @@ static void acpi_pm_enable_gpes(void) + acpi_enable_all_runtime_gpes(); + } + +-static struct platform_hibernation_ops acpi_hibernation_ops = { ++static const struct platform_hibernation_ops acpi_hibernation_ops = { + .begin = acpi_hibernation_begin, + .end = acpi_pm_end, + .pre_snapshot = acpi_hibernation_pre_snapshot, +@@ -610,7 +610,7 @@ static int acpi_hibernation_pre_snapshot + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. + */ +-static struct platform_hibernation_ops acpi_hibernation_ops_old = { ++static const struct platform_hibernation_ops acpi_hibernation_ops_old = { + .begin = acpi_hibernation_begin_old, + .end = acpi_pm_end, + .pre_snapshot = acpi_hibernation_pre_snapshot_old, +diff -urNp linux-2.6.33.1/drivers/acpi/video.c linux-2.6.33.1/drivers/acpi/video.c +--- linux-2.6.33.1/drivers/acpi/video.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/acpi/video.c 2010-03-20 16:58:39.136890536 -0400 +@@ -366,7 +366,7 @@ static int acpi_video_set_brightness(str + vd->brightness->levels[request_level]); + } + +-static struct backlight_ops acpi_backlight_ops = { ++static const struct backlight_ops acpi_backlight_ops = { + .get_brightness = acpi_video_get_brightness, + .update_status = acpi_video_set_brightness, + }; +diff -urNp linux-2.6.33.1/drivers/ata/ahci.c linux-2.6.33.1/drivers/ata/ahci.c +--- linux-2.6.33.1/drivers/ata/ahci.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/ahci.c 2010-03-20 16:58:39.153286662 -0400 +@@ -387,7 +387,7 @@ static struct scsi_host_template ahci_sh + .sdev_attrs = ahci_sdev_attrs, + }; + +-static struct ata_port_operations ahci_ops = { ++static const struct ata_port_operations ahci_ops = { + .inherits = &sata_pmp_port_ops, + + .qc_defer = sata_pmp_qc_defer_cmd_switch, +@@ -424,17 +424,17 @@ static struct ata_port_operations ahci_o + .port_stop = ahci_port_stop, + }; + +-static struct ata_port_operations ahci_vt8251_ops = { ++static const struct ata_port_operations ahci_vt8251_ops = { + .inherits = &ahci_ops, + .hardreset = ahci_vt8251_hardreset, + }; + +-static struct ata_port_operations ahci_p5wdh_ops = { ++static const struct ata_port_operations ahci_p5wdh_ops = { + .inherits = &ahci_ops, + .hardreset = ahci_p5wdh_hardreset, + }; + +-static struct ata_port_operations ahci_sb600_ops = { ++static const struct ata_port_operations ahci_sb600_ops = { + .inherits = &ahci_ops, + .softreset = ahci_sb600_softreset, + .pmp_softreset = ahci_sb600_softreset, +@@ -681,7 +681,7 @@ static const struct pci_device_id ahci_p + { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_STORAGE_SATA_AHCI, 0xffffff, board_ahci }, + +- { } /* terminate list */ ++ { 0, 0, 0, 
0, 0, 0, 0 } /* terminate list */ + }; + + +diff -urNp linux-2.6.33.1/drivers/ata/ata_generic.c linux-2.6.33.1/drivers/ata/ata_generic.c +--- linux-2.6.33.1/drivers/ata/ata_generic.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/ata_generic.c 2010-03-20 16:58:39.156781730 -0400 +@@ -95,7 +95,7 @@ static struct scsi_host_template generic + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations generic_port_ops = { ++static const struct ata_port_operations generic_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_unknown, + .set_mode = generic_set_mode, +diff -urNp linux-2.6.33.1/drivers/ata/ata_piix.c linux-2.6.33.1/drivers/ata/ata_piix.c +--- linux-2.6.33.1/drivers/ata/ata_piix.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/ata_piix.c 2010-03-20 16:58:39.160672594 -0400 +@@ -291,7 +291,7 @@ static const struct pci_device_id piix_p + { 0x8086, 0x3b2d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata }, + /* SATA Controller IDE (PCH) */ + { 0x8086, 0x3b2e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_sata }, +- { } /* terminate list */ ++ { 0, 0, 0, 0, 0, 0, 0 } /* terminate list */ + }; + + static struct pci_driver piix_pci_driver = { +@@ -309,7 +309,7 @@ static struct scsi_host_template piix_sh + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations piix_pata_ops = { ++static const struct ata_port_operations piix_pata_ops = { + .inherits = &ata_bmdma32_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = piix_set_piomode, +@@ -317,22 +317,22 @@ static struct ata_port_operations piix_p + .prereset = piix_pata_prereset, + }; + +-static struct ata_port_operations piix_vmw_ops = { ++static const struct ata_port_operations piix_vmw_ops = { + .inherits = &piix_pata_ops, + .bmdma_status = piix_vmw_bmdma_status, + }; + +-static struct ata_port_operations ich_pata_ops = { ++static const struct ata_port_operations ich_pata_ops = { + .inherits = &piix_pata_ops, + .cable_detect = ich_pata_cable_detect, + .set_dmamode = ich_set_dmamode, + }; + +-static struct ata_port_operations piix_sata_ops = { ++static const struct ata_port_operations piix_sata_ops = { + .inherits = &ata_bmdma32_port_ops, + }; + +-static struct ata_port_operations piix_sidpr_sata_ops = { ++static const struct ata_port_operations piix_sidpr_sata_ops = { + .inherits = &piix_sata_ops, + .hardreset = sata_std_hardreset, + .scr_read = piix_sidpr_scr_read, +@@ -608,7 +608,7 @@ static const struct ich_laptop ich_lapto + { 0x2653, 0x1043, 0x82D8 }, /* ICH6M on Asus Eee 701 */ + { 0x27df, 0x104d, 0x900e }, /* ICH7 on Sony TZ-90 */ + /* end marker */ +- { 0, } ++ { 0, 0, 0 } + }; + + /** +@@ -1086,7 +1086,7 @@ static int piix_broken_suspend(void) + }, + }, + +- { } /* terminate list */ ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } /* terminate list */ + }; + static const char *oemstrs[] = { + "Tecra M3,", +diff -urNp linux-2.6.33.1/drivers/ata/libata-acpi.c linux-2.6.33.1/drivers/ata/libata-acpi.c +--- linux-2.6.33.1/drivers/ata/libata-acpi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/libata-acpi.c 2010-03-20 16:58:39.164792430 -0400 +@@ -223,12 +223,12 @@ static void ata_acpi_dev_uevent(acpi_han + ata_acpi_uevent(dev->link->ap, dev, event); + } + +-static struct acpi_dock_ops ata_acpi_dev_dock_ops = { ++static const struct acpi_dock_ops ata_acpi_dev_dock_ops = { + .handler = ata_acpi_dev_notify_dock, + .uevent = ata_acpi_dev_uevent, + }; + +-static struct acpi_dock_ops ata_acpi_ap_dock_ops = { ++static const struct 
acpi_dock_ops ata_acpi_ap_dock_ops = { + .handler = ata_acpi_ap_notify_dock, + .uevent = ata_acpi_ap_uevent, + }; +diff -urNp linux-2.6.33.1/drivers/ata/libata-core.c linux-2.6.33.1/drivers/ata/libata-core.c +--- linux-2.6.33.1/drivers/ata/libata-core.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/libata-core.c 2010-03-20 16:58:39.176796856 -0400 +@@ -896,7 +896,7 @@ static const struct ata_xfer_ent { + { ATA_SHIFT_PIO, ATA_NR_PIO_MODES, XFER_PIO_0 }, + { ATA_SHIFT_MWDMA, ATA_NR_MWDMA_MODES, XFER_MW_DMA_0 }, + { ATA_SHIFT_UDMA, ATA_NR_UDMA_MODES, XFER_UDMA_0 }, +- { -1, }, ++ { -1, 0, 0 } + }; + + /** +@@ -3163,7 +3163,7 @@ static const struct ata_timing ata_timin + { XFER_UDMA_5, 0, 0, 0, 0, 0, 0, 0, 0, 20 }, + { XFER_UDMA_6, 0, 0, 0, 0, 0, 0, 0, 0, 15 }, + +- { 0xFF } ++ { 0xFF, 0, 0, 0, 0, 0, 0, 0, 0 } + }; + + #define ENOUGH(v, unit) (((v)-1)/(unit)+1) +@@ -4385,7 +4385,7 @@ static const struct ata_blacklist_entry + { "PIONEER DVD-RW DVRTD08", "1.00", ATA_HORKAGE_NOSETXFER }, + + /* End Marker */ +- { } ++ { NULL, NULL, 0 } + }; + + static int strn_pattern_cmp(const char *patt, const char *name, int wildchar) +@@ -5961,7 +5961,7 @@ static void ata_host_stop(struct device + * LOCKING: + * None. + */ +-static void ata_finalize_port_ops(struct ata_port_operations *ops) ++static void ata_finalize_port_ops(const struct ata_port_operations *ops) + { + static DEFINE_SPINLOCK(lock); + const struct ata_port_operations *cur; +@@ -5973,6 +5973,7 @@ static void ata_finalize_port_ops(struct + return; + + spin_lock(&lock); ++ pax_open_kernel(); + + for (cur = ops->inherits; cur; cur = cur->inherits) { + void **inherit = (void **)cur; +@@ -5986,8 +5987,9 @@ static void ata_finalize_port_ops(struct + if (IS_ERR(*pp)) + *pp = NULL; + +- ops->inherits = NULL; ++ ((struct ata_port_operations *)ops)->inherits = NULL; + ++ pax_close_kernel(); + spin_unlock(&lock); + } + +@@ -6084,7 +6086,7 @@ int ata_host_start(struct ata_host *host + */ + /* KILLME - the only user left is ipr */ + void ata_host_init(struct ata_host *host, struct device *dev, +- unsigned long flags, struct ata_port_operations *ops) ++ unsigned long flags, const struct ata_port_operations *ops) + { + spin_lock_init(&host->lock); + host->dev = dev; +@@ -6754,7 +6756,7 @@ static void ata_dummy_error_handler(stru + /* truly dummy */ + } + +-struct ata_port_operations ata_dummy_port_ops = { ++const struct ata_port_operations ata_dummy_port_ops = { + .qc_prep = ata_noop_qc_prep, + .qc_issue = ata_dummy_qc_issue, + .error_handler = ata_dummy_error_handler, +diff -urNp linux-2.6.33.1/drivers/ata/libata-eh.c linux-2.6.33.1/drivers/ata/libata-eh.c +--- linux-2.6.33.1/drivers/ata/libata-eh.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/libata-eh.c 2010-03-20 16:58:39.180707680 -0400 +@@ -3675,7 +3675,7 @@ void ata_do_eh(struct ata_port *ap, ata_ + */ + void ata_std_error_handler(struct ata_port *ap) + { +- struct ata_port_operations *ops = ap->ops; ++ const struct ata_port_operations *ops = ap->ops; + ata_reset_fn_t hardreset = ops->hardreset; + + /* ignore built-in hardreset if SCR access is not available */ +diff -urNp linux-2.6.33.1/drivers/ata/libata-pmp.c linux-2.6.33.1/drivers/ata/libata-pmp.c +--- linux-2.6.33.1/drivers/ata/libata-pmp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/libata-pmp.c 2010-03-20 16:58:39.180707680 -0400 +@@ -841,7 +841,7 @@ static int sata_pmp_handle_link_fail(str + */ + static int sata_pmp_eh_recover(struct ata_port *ap) + { +- struct 
ata_port_operations *ops = ap->ops; ++ const struct ata_port_operations *ops = ap->ops; + int pmp_tries, link_tries[SATA_PMP_MAX_PORTS]; + struct ata_link *pmp_link = &ap->link; + struct ata_device *pmp_dev = pmp_link->device; +diff -urNp linux-2.6.33.1/drivers/ata/pata_acpi.c linux-2.6.33.1/drivers/ata/pata_acpi.c +--- linux-2.6.33.1/drivers/ata/pata_acpi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_acpi.c 2010-03-20 16:58:39.188784803 -0400 +@@ -215,7 +215,7 @@ static struct scsi_host_template pacpi_s + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations pacpi_ops = { ++static const struct ata_port_operations pacpi_ops = { + .inherits = &ata_bmdma_port_ops, + .qc_issue = pacpi_qc_issue, + .cable_detect = pacpi_cable_detect, +diff -urNp linux-2.6.33.1/drivers/ata/pata_ali.c linux-2.6.33.1/drivers/ata/pata_ali.c +--- linux-2.6.33.1/drivers/ata/pata_ali.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_ali.c 2010-03-20 16:58:39.188784803 -0400 +@@ -365,7 +365,7 @@ static struct scsi_host_template ali_sht + * Port operations for PIO only ALi + */ + +-static struct ata_port_operations ali_early_port_ops = { ++static const struct ata_port_operations ali_early_port_ops = { + .inherits = &ata_sff_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = ali_set_piomode, +@@ -382,7 +382,7 @@ static const struct ata_port_operations + * Port operations for DMA capable ALi without cable + * detect + */ +-static struct ata_port_operations ali_20_port_ops = { ++static const struct ata_port_operations ali_20_port_ops = { + .inherits = &ali_dma_base_ops, + .cable_detect = ata_cable_40wire, + .mode_filter = ali_20_filter, +@@ -393,7 +393,7 @@ static struct ata_port_operations ali_20 + /* + * Port operations for DMA capable ALi with cable detect + */ +-static struct ata_port_operations ali_c2_port_ops = { ++static const struct ata_port_operations ali_c2_port_ops = { + .inherits = &ali_dma_base_ops, + .check_atapi_dma = ali_check_atapi_dma, + .cable_detect = ali_c2_cable_detect, +@@ -404,7 +404,7 @@ static struct ata_port_operations ali_c2 + /* + * Port operations for DMA capable ALi with cable detect + */ +-static struct ata_port_operations ali_c4_port_ops = { ++static const struct ata_port_operations ali_c4_port_ops = { + .inherits = &ali_dma_base_ops, + .check_atapi_dma = ali_check_atapi_dma, + .cable_detect = ali_c2_cable_detect, +@@ -414,7 +414,7 @@ static struct ata_port_operations ali_c4 + /* + * Port operations for DMA capable ALi with cable detect and LBA48 + */ +-static struct ata_port_operations ali_c5_port_ops = { ++static const struct ata_port_operations ali_c5_port_ops = { + .inherits = &ali_dma_base_ops, + .check_atapi_dma = ali_check_atapi_dma, + .dev_config = ali_warn_atapi_dma, +diff -urNp linux-2.6.33.1/drivers/ata/pata_amd.c linux-2.6.33.1/drivers/ata/pata_amd.c +--- linux-2.6.33.1/drivers/ata/pata_amd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_amd.c 2010-03-20 16:58:39.214454463 -0400 +@@ -397,28 +397,28 @@ static const struct ata_port_operations + .prereset = amd_pre_reset, + }; + +-static struct ata_port_operations amd33_port_ops = { ++static const struct ata_port_operations amd33_port_ops = { + .inherits = &amd_base_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = amd33_set_piomode, + .set_dmamode = amd33_set_dmamode, + }; + +-static struct ata_port_operations amd66_port_ops = { ++static const struct ata_port_operations amd66_port_ops = { + .inherits = 
&amd_base_port_ops, + .cable_detect = ata_cable_unknown, + .set_piomode = amd66_set_piomode, + .set_dmamode = amd66_set_dmamode, + }; + +-static struct ata_port_operations amd100_port_ops = { ++static const struct ata_port_operations amd100_port_ops = { + .inherits = &amd_base_port_ops, + .cable_detect = ata_cable_unknown, + .set_piomode = amd100_set_piomode, + .set_dmamode = amd100_set_dmamode, + }; + +-static struct ata_port_operations amd133_port_ops = { ++static const struct ata_port_operations amd133_port_ops = { + .inherits = &amd_base_port_ops, + .cable_detect = amd_cable_detect, + .set_piomode = amd133_set_piomode, +@@ -433,13 +433,13 @@ static const struct ata_port_operations + .host_stop = nv_host_stop, + }; + +-static struct ata_port_operations nv100_port_ops = { ++static const struct ata_port_operations nv100_port_ops = { + .inherits = &nv_base_port_ops, + .set_piomode = nv100_set_piomode, + .set_dmamode = nv100_set_dmamode, + }; + +-static struct ata_port_operations nv133_port_ops = { ++static const struct ata_port_operations nv133_port_ops = { + .inherits = &nv_base_port_ops, + .set_piomode = nv133_set_piomode, + .set_dmamode = nv133_set_dmamode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_artop.c linux-2.6.33.1/drivers/ata/pata_artop.c +--- linux-2.6.33.1/drivers/ata/pata_artop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_artop.c 2010-03-20 16:58:39.220796895 -0400 +@@ -311,7 +311,7 @@ static struct scsi_host_template artop_s + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations artop6210_ops = { ++static const struct ata_port_operations artop6210_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = artop6210_set_piomode, +@@ -320,7 +320,7 @@ static struct ata_port_operations artop6 + .qc_defer = artop6210_qc_defer, + }; + +-static struct ata_port_operations artop6260_ops = { ++static const struct ata_port_operations artop6260_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = artop6260_cable_detect, + .set_piomode = artop6260_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_at32.c linux-2.6.33.1/drivers/ata/pata_at32.c +--- linux-2.6.33.1/drivers/ata/pata_at32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_at32.c 2010-03-20 16:58:39.220796895 -0400 +@@ -172,7 +172,7 @@ static struct scsi_host_template at32_sh + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations at32_port_ops = { ++static const struct ata_port_operations at32_port_ops = { + .inherits = &ata_sff_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = pata_at32_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_at91.c linux-2.6.33.1/drivers/ata/pata_at91.c +--- linux-2.6.33.1/drivers/ata/pata_at91.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_at91.c 2010-03-20 16:58:39.224661770 -0400 +@@ -195,7 +195,7 @@ static struct scsi_host_template pata_at + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations pata_at91_port_ops = { ++static const struct ata_port_operations pata_at91_port_ops = { + .inherits = &ata_sff_port_ops, + + .sff_data_xfer = pata_at91_data_xfer_noirq, +diff -urNp linux-2.6.33.1/drivers/ata/pata_atiixp.c linux-2.6.33.1/drivers/ata/pata_atiixp.c +--- linux-2.6.33.1/drivers/ata/pata_atiixp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_atiixp.c 2010-03-20 16:58:39.224661770 -0400 +@@ -205,7 +205,7 @@ static struct scsi_host_template atiixp_ + .sg_tablesize = 
LIBATA_DUMB_MAX_PRD, + }; + +-static struct ata_port_operations atiixp_port_ops = { ++static const struct ata_port_operations atiixp_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .qc_prep = ata_sff_dumb_qc_prep, +diff -urNp linux-2.6.33.1/drivers/ata/pata_atp867x.c linux-2.6.33.1/drivers/ata/pata_atp867x.c +--- linux-2.6.33.1/drivers/ata/pata_atp867x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_atp867x.c 2010-03-20 16:58:39.224661770 -0400 +@@ -274,7 +274,7 @@ static struct scsi_host_template atp867x + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations atp867x_ops = { ++static const struct ata_port_operations atp867x_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = atp867x_cable_detect, + .set_piomode = atp867x_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_bf54x.c linux-2.6.33.1/drivers/ata/pata_bf54x.c +--- linux-2.6.33.1/drivers/ata/pata_bf54x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_bf54x.c 2010-03-20 16:58:39.236808133 -0400 +@@ -1464,7 +1464,7 @@ static struct scsi_host_template bfin_sh + .dma_boundary = ATA_DMA_BOUNDARY, + }; + +-static struct ata_port_operations bfin_pata_ops = { ++static const struct ata_port_operations bfin_pata_ops = { + .inherits = &ata_sff_port_ops, + + .set_piomode = bfin_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_cmd640.c linux-2.6.33.1/drivers/ata/pata_cmd640.c +--- linux-2.6.33.1/drivers/ata/pata_cmd640.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_cmd640.c 2010-03-20 16:58:39.244791590 -0400 +@@ -168,7 +168,7 @@ static struct scsi_host_template cmd640_ + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations cmd640_port_ops = { ++static const struct ata_port_operations cmd640_port_ops = { + .inherits = &ata_bmdma_port_ops, + /* In theory xfer_noirq is not needed once we kill the prefetcher */ + .sff_data_xfer = ata_sff_data_xfer_noirq, +diff -urNp linux-2.6.33.1/drivers/ata/pata_cmd64x.c linux-2.6.33.1/drivers/ata/pata_cmd64x.c +--- linux-2.6.33.1/drivers/ata/pata_cmd64x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_cmd64x.c 2010-03-20 16:58:39.252794847 -0400 +@@ -275,18 +275,18 @@ static const struct ata_port_operations + .set_dmamode = cmd64x_set_dmamode, + }; + +-static struct ata_port_operations cmd64x_port_ops = { ++static const struct ata_port_operations cmd64x_port_ops = { + .inherits = &cmd64x_base_ops, + .cable_detect = ata_cable_40wire, + }; + +-static struct ata_port_operations cmd646r1_port_ops = { ++static const struct ata_port_operations cmd646r1_port_ops = { + .inherits = &cmd64x_base_ops, + .bmdma_stop = cmd646r1_bmdma_stop, + .cable_detect = ata_cable_40wire, + }; + +-static struct ata_port_operations cmd648_port_ops = { ++static const struct ata_port_operations cmd648_port_ops = { + .inherits = &cmd64x_base_ops, + .bmdma_stop = cmd648_bmdma_stop, + .cable_detect = cmd648_cable_detect, +diff -urNp linux-2.6.33.1/drivers/ata/pata_cs5520.c linux-2.6.33.1/drivers/ata/pata_cs5520.c +--- linux-2.6.33.1/drivers/ata/pata_cs5520.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_cs5520.c 2010-03-20 16:58:39.268797665 -0400 +@@ -108,7 +108,7 @@ static struct scsi_host_template cs5520_ + .sg_tablesize = LIBATA_DUMB_MAX_PRD, + }; + +-static struct ata_port_operations cs5520_port_ops = { ++static const struct ata_port_operations cs5520_port_ops = { + .inherits = &ata_bmdma_port_ops, + .qc_prep = ata_sff_dumb_qc_prep, + .cable_detect = 
ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pata_cs5530.c linux-2.6.33.1/drivers/ata/pata_cs5530.c +--- linux-2.6.33.1/drivers/ata/pata_cs5530.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_cs5530.c 2010-03-20 16:58:39.272512019 -0400 +@@ -164,7 +164,7 @@ static struct scsi_host_template cs5530_ + .sg_tablesize = LIBATA_DUMB_MAX_PRD, + }; + +-static struct ata_port_operations cs5530_port_ops = { ++static const struct ata_port_operations cs5530_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .qc_prep = ata_sff_dumb_qc_prep, +diff -urNp linux-2.6.33.1/drivers/ata/pata_cs5535.c linux-2.6.33.1/drivers/ata/pata_cs5535.c +--- linux-2.6.33.1/drivers/ata/pata_cs5535.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_cs5535.c 2010-03-20 16:58:39.272512019 -0400 +@@ -160,7 +160,7 @@ static struct scsi_host_template cs5535_ + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations cs5535_port_ops = { ++static const struct ata_port_operations cs5535_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = cs5535_cable_detect, + .set_piomode = cs5535_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_cs5536.c linux-2.6.33.1/drivers/ata/pata_cs5536.c +--- linux-2.6.33.1/drivers/ata/pata_cs5536.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_cs5536.c 2010-03-20 16:58:39.272512019 -0400 +@@ -223,7 +223,7 @@ static struct scsi_host_template cs5536_ + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations cs5536_port_ops = { ++static const struct ata_port_operations cs5536_port_ops = { + .inherits = &ata_bmdma32_port_ops, + .cable_detect = cs5536_cable_detect, + .set_piomode = cs5536_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_cypress.c linux-2.6.33.1/drivers/ata/pata_cypress.c +--- linux-2.6.33.1/drivers/ata/pata_cypress.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_cypress.c 2010-03-20 16:58:39.272512019 -0400 +@@ -113,7 +113,7 @@ static struct scsi_host_template cy82c69 + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations cy82c693_port_ops = { ++static const struct ata_port_operations cy82c693_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = cy82c693_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_efar.c linux-2.6.33.1/drivers/ata/pata_efar.c +--- linux-2.6.33.1/drivers/ata/pata_efar.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_efar.c 2010-03-20 16:58:39.272512019 -0400 +@@ -223,7 +223,7 @@ static struct scsi_host_template efar_sh + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations efar_ops = { ++static const struct ata_port_operations efar_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = efar_cable_detect, + .set_piomode = efar_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_hpt366.c linux-2.6.33.1/drivers/ata/pata_hpt366.c +--- linux-2.6.33.1/drivers/ata/pata_hpt366.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_hpt366.c 2010-03-20 16:58:39.272512019 -0400 +@@ -280,7 +280,7 @@ static struct scsi_host_template hpt36x_ + * Configuration for HPT366/68 + */ + +-static struct ata_port_operations hpt366_port_ops = { ++static const struct ata_port_operations hpt366_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = hpt36x_cable_detect, + .mode_filter = hpt366_filter, +diff -urNp linux-2.6.33.1/drivers/ata/pata_hpt37x.c 
linux-2.6.33.1/drivers/ata/pata_hpt37x.c +--- linux-2.6.33.1/drivers/ata/pata_hpt37x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_hpt37x.c 2010-03-20 16:58:39.272512019 -0400 +@@ -583,7 +583,7 @@ static struct scsi_host_template hpt37x_ + * Configuration for HPT370 + */ + +-static struct ata_port_operations hpt370_port_ops = { ++static const struct ata_port_operations hpt370_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .bmdma_stop = hpt370_bmdma_stop, +@@ -599,7 +599,7 @@ static struct ata_port_operations hpt370 + * Configuration for HPT370A. Close to 370 but less filters + */ + +-static struct ata_port_operations hpt370a_port_ops = { ++static const struct ata_port_operations hpt370a_port_ops = { + .inherits = &hpt370_port_ops, + .mode_filter = hpt370a_filter, + }; +@@ -609,7 +609,7 @@ static struct ata_port_operations hpt370 + * and DMA mode setting functionality. + */ + +-static struct ata_port_operations hpt372_port_ops = { ++static const struct ata_port_operations hpt372_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .bmdma_stop = hpt37x_bmdma_stop, +@@ -625,7 +625,7 @@ static struct ata_port_operations hpt372 + * but we have a different cable detection procedure for function 1. + */ + +-static struct ata_port_operations hpt374_fn1_port_ops = { ++static const struct ata_port_operations hpt374_fn1_port_ops = { + .inherits = &hpt372_port_ops, + .cable_detect = hpt374_fn1_cable_detect, + .prereset = hpt37x_pre_reset, +diff -urNp linux-2.6.33.1/drivers/ata/pata_hpt3x2n.c linux-2.6.33.1/drivers/ata/pata_hpt3x2n.c +--- linux-2.6.33.1/drivers/ata/pata_hpt3x2n.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_hpt3x2n.c 2010-03-20 16:58:39.276654661 -0400 +@@ -339,7 +339,7 @@ static struct scsi_host_template hpt3x2n + * Configuration for HPT3x2n. 
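These libata hunks all make the same two-part change: each ata_port_operations table becomes const, and because libata resolves the .inherits chain by writing resolved methods back into the table (see the ata_finalize_port_ops() hunk further up, which gains pax_open_kernel()/pax_close_kernel() around exactly those writes), the briefly-writable window is made explicit. A loose userspace sketch of slot lookup through an inherits chain; libata actually flattens the chain in place, so this shows the idea, not the implementation:

    #include <stdio.h>
    #include <stddef.h>

    struct port_ops {
            const struct port_ops *inherits;
            int (*cable_detect)(void);
            int (*set_piomode)(void);
    };

    static int detect_40wire(void) { return 40; }
    static int pio_fast(void) { return 4; }

    static const struct port_ops base_ops = { NULL, detect_40wire, NULL };
    /* the child overrides one slot and inherits the other */
    static const struct port_ops chip_ops = { &base_ops, NULL, pio_fast };

    static int (*resolve_detect(const struct port_ops *o))(void)
    {
            for (; o; o = o->inherits)
                    if (o->cable_detect)
                            return o->cable_detect;
            return NULL;
    }

    int main(void)
    {
            printf("cable: %d\n", resolve_detect(&chip_ops)());  /* 40, found in base_ops */
            return 0;
    }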
+ */ + +-static struct ata_port_operations hpt3x2n_port_ops = { ++static const struct ata_port_operations hpt3x2n_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .bmdma_stop = hpt3x2n_bmdma_stop, +diff -urNp linux-2.6.33.1/drivers/ata/pata_hpt3x3.c linux-2.6.33.1/drivers/ata/pata_hpt3x3.c +--- linux-2.6.33.1/drivers/ata/pata_hpt3x3.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_hpt3x3.c 2010-03-20 16:58:39.284801841 -0400 +@@ -141,7 +141,7 @@ static struct scsi_host_template hpt3x3_ + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations hpt3x3_port_ops = { ++static const struct ata_port_operations hpt3x3_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = hpt3x3_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_icside.c linux-2.6.33.1/drivers/ata/pata_icside.c +--- linux-2.6.33.1/drivers/ata/pata_icside.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_icside.c 2010-03-20 16:58:39.284801841 -0400 +@@ -319,7 +319,7 @@ static void pata_icside_postreset(struct + } + } + +-static struct ata_port_operations pata_icside_port_ops = { ++static const struct ata_port_operations pata_icside_port_ops = { + .inherits = &ata_sff_port_ops, + /* no need to build any PRD tables for DMA */ + .qc_prep = ata_noop_qc_prep, +diff -urNp linux-2.6.33.1/drivers/ata/pata_isapnp.c linux-2.6.33.1/drivers/ata/pata_isapnp.c +--- linux-2.6.33.1/drivers/ata/pata_isapnp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_isapnp.c 2010-03-20 16:58:39.292806212 -0400 +@@ -23,12 +23,12 @@ static struct scsi_host_template isapnp_ + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations isapnp_port_ops = { ++static const struct ata_port_operations isapnp_port_ops = { + .inherits = &ata_sff_port_ops, + .cable_detect = ata_cable_40wire, + }; + +-static struct ata_port_operations isapnp_noalt_port_ops = { ++static const struct ata_port_operations isapnp_noalt_port_ops = { + .inherits = &ata_sff_port_ops, + .cable_detect = ata_cable_40wire, + /* No altstatus so we don't want to use the lost interrupt poll */ +diff -urNp linux-2.6.33.1/drivers/ata/pata_it8213.c linux-2.6.33.1/drivers/ata/pata_it8213.c +--- linux-2.6.33.1/drivers/ata/pata_it8213.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_it8213.c 2010-03-20 16:58:39.296567577 -0400 +@@ -233,7 +233,7 @@ static struct scsi_host_template it8213_ + }; + + +-static struct ata_port_operations it8213_ops = { ++static const struct ata_port_operations it8213_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = it8213_cable_detect, + .set_piomode = it8213_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_it821x.c linux-2.6.33.1/drivers/ata/pata_it821x.c +--- linux-2.6.33.1/drivers/ata/pata_it821x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_it821x.c 2010-03-20 16:58:39.296567577 -0400 +@@ -800,7 +800,7 @@ static struct scsi_host_template it821x_ + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations it821x_smart_port_ops = { ++static const struct ata_port_operations it821x_smart_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .check_atapi_dma= it821x_check_atapi_dma, +@@ -814,7 +814,7 @@ static struct ata_port_operations it821x + .port_start = it821x_port_start, + }; + +-static struct ata_port_operations it821x_passthru_port_ops = { ++static const struct ata_port_operations it821x_passthru_port_ops = { + .inherits = &ata_bmdma_port_ops, + 
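As for what the const actually buys: a static table of function pointers is a classic kernel exploitation target, and qualifying it const lets the compiler place it in .rodata, where the pointers can be mapped read-only and cannot be silently overwritten at runtime. A minimal illustration with a made-up single-member ops struct (the real ata_port_operations has far more fields):

    #include <stdio.h>

    struct example_ops {
            int (*cable_detect)(void);
    };

    static int cable_unknown(void) { return 0; }

    /* const places the table in .rodata */
    static const struct example_ops example_port_ops = {
            .cable_detect = cable_unknown,
    };

    int main(void)
    {
            printf("%d\n", example_port_ops.cable_detect());
            /* example_port_ops.cable_detect = NULL;
               error: assignment of member in read-only object */
            return 0;
    }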
+ .check_atapi_dma= it821x_check_atapi_dma, +@@ -830,7 +830,7 @@ static struct ata_port_operations it821x + .port_start = it821x_port_start, + }; + +-static struct ata_port_operations it821x_rdc_port_ops = { ++static const struct ata_port_operations it821x_rdc_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .check_atapi_dma= it821x_check_atapi_dma, +diff -urNp linux-2.6.33.1/drivers/ata/pata_ixp4xx_cf.c linux-2.6.33.1/drivers/ata/pata_ixp4xx_cf.c +--- linux-2.6.33.1/drivers/ata/pata_ixp4xx_cf.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_ixp4xx_cf.c 2010-03-20 16:58:39.296567577 -0400 +@@ -89,7 +89,7 @@ static struct scsi_host_template ixp4xx_ + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations ixp4xx_port_ops = { ++static const struct ata_port_operations ixp4xx_port_ops = { + .inherits = &ata_sff_port_ops, + .sff_data_xfer = ixp4xx_mmio_data_xfer, + .cable_detect = ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pata_jmicron.c linux-2.6.33.1/drivers/ata/pata_jmicron.c +--- linux-2.6.33.1/drivers/ata/pata_jmicron.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_jmicron.c 2010-03-20 16:58:39.296567577 -0400 +@@ -111,7 +111,7 @@ static struct scsi_host_template jmicron + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations jmicron_ops = { ++static const struct ata_port_operations jmicron_ops = { + .inherits = &ata_bmdma_port_ops, + .prereset = jmicron_pre_reset, + }; +diff -urNp linux-2.6.33.1/drivers/ata/pata_legacy.c linux-2.6.33.1/drivers/ata/pata_legacy.c +--- linux-2.6.33.1/drivers/ata/pata_legacy.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_legacy.c 2010-03-20 16:58:39.296567577 -0400 +@@ -113,7 +113,7 @@ struct legacy_probe { + + struct legacy_controller { + const char *name; +- struct ata_port_operations *ops; ++ const struct ata_port_operations *ops; + unsigned int pio_mask; + unsigned int flags; + unsigned int pflags; +@@ -230,12 +230,12 @@ static const struct ata_port_operations + * pio_mask as well. 
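One more pattern recurs throughout the patch: list terminators written as { } or { 0, } are spelled out in full, e.g. { 0, 0, 0, 0, 0, 0, 0 } for the pci_device_id tables or { NULL, NULL, 0 } for the libata blacklist. In C the forms are equivalent, since omitted members are zero-initialized, so this appears to be an explicitness choice that also silences missing-field-initializer warnings; behavior is unchanged. A miniature of the sentinel-terminated table idea, with a hypothetical two-field ID struct:

    #include <stdio.h>

    struct pci_id {
            unsigned int vendor;
            unsigned int device;
    };

    static const struct pci_id ids[] = {
            { 0x8086, 0x3b2e },
            { 0x8086, 0x3b2d },
            { 0, 0 }        /* explicit zero terminator, equivalent to { } */
    };

    int main(void)
    {
            for (const struct pci_id *p = ids; p->vendor; p++)
                    printf("%04x:%04x\n", p->vendor, p->device);
            return 0;
    }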
+ */ + +-static struct ata_port_operations simple_port_ops = { ++static const struct ata_port_operations simple_port_ops = { + .inherits = &legacy_base_port_ops, + .sff_data_xfer = ata_sff_data_xfer_noirq, + }; + +-static struct ata_port_operations legacy_port_ops = { ++static const struct ata_port_operations legacy_port_ops = { + .inherits = &legacy_base_port_ops, + .sff_data_xfer = ata_sff_data_xfer_noirq, + .set_mode = legacy_set_mode, +@@ -331,7 +331,7 @@ static unsigned int pdc_data_xfer_vlb(st + return buflen; + } + +-static struct ata_port_operations pdc20230_port_ops = { ++static const struct ata_port_operations pdc20230_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = pdc20230_set_piomode, + .sff_data_xfer = pdc_data_xfer_vlb, +@@ -364,7 +364,7 @@ static void ht6560a_set_piomode(struct a + ioread8(ap->ioaddr.status_addr); + } + +-static struct ata_port_operations ht6560a_port_ops = { ++static const struct ata_port_operations ht6560a_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = ht6560a_set_piomode, + }; +@@ -407,7 +407,7 @@ static void ht6560b_set_piomode(struct a + ioread8(ap->ioaddr.status_addr); + } + +-static struct ata_port_operations ht6560b_port_ops = { ++static const struct ata_port_operations ht6560b_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = ht6560b_set_piomode, + }; +@@ -506,7 +506,7 @@ static void opti82c611a_set_piomode(stru + } + + +-static struct ata_port_operations opti82c611a_port_ops = { ++static const struct ata_port_operations opti82c611a_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = opti82c611a_set_piomode, + }; +@@ -616,7 +616,7 @@ static unsigned int opti82c46x_qc_issue( + return ata_sff_qc_issue(qc); + } + +-static struct ata_port_operations opti82c46x_port_ops = { ++static const struct ata_port_operations opti82c46x_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = opti82c46x_set_piomode, + .qc_issue = opti82c46x_qc_issue, +@@ -778,20 +778,20 @@ static int qdi_port(struct platform_devi + return 0; + } + +-static struct ata_port_operations qdi6500_port_ops = { ++static const struct ata_port_operations qdi6500_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = qdi6500_set_piomode, + .qc_issue = qdi_qc_issue, + .sff_data_xfer = vlb32_data_xfer, + }; + +-static struct ata_port_operations qdi6580_port_ops = { ++static const struct ata_port_operations qdi6580_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = qdi6580_set_piomode, + .sff_data_xfer = vlb32_data_xfer, + }; + +-static struct ata_port_operations qdi6580dp_port_ops = { ++static const struct ata_port_operations qdi6580dp_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = qdi6580dp_set_piomode, + .qc_issue = qdi_qc_issue, +@@ -863,7 +863,7 @@ static int winbond_port(struct platform_ + return 0; + } + +-static struct ata_port_operations winbond_port_ops = { ++static const struct ata_port_operations winbond_port_ops = { + .inherits = &legacy_base_port_ops, + .set_piomode = winbond_set_piomode, + .sff_data_xfer = vlb32_data_xfer, +@@ -986,7 +986,7 @@ static __init int legacy_init_one(struct + int pio_modes = controller->pio_mask; + unsigned long io = probe->port; + u32 mask = (1 << probe->slot); +- struct ata_port_operations *ops = controller->ops; ++ const struct ata_port_operations *ops = controller->ops; + struct legacy_data *ld = &legacy_data[probe->slot]; + struct ata_host *host = NULL; + struct ata_port *ap; +diff -urNp 
linux-2.6.33.1/drivers/ata/pata_macio.c linux-2.6.33.1/drivers/ata/pata_macio.c +--- linux-2.6.33.1/drivers/ata/pata_macio.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_macio.c 2010-03-20 16:58:39.300646647 -0400 +@@ -915,7 +915,7 @@ static struct scsi_host_template pata_ma + .slave_configure = pata_macio_slave_config, + }; + +-static struct ata_port_operations pata_macio_ops = { ++static const struct ata_port_operations pata_macio_ops = { + .inherits = &ata_sff_port_ops, + + .freeze = pata_macio_freeze, +diff -urNp linux-2.6.33.1/drivers/ata/pata_marvell.c linux-2.6.33.1/drivers/ata/pata_marvell.c +--- linux-2.6.33.1/drivers/ata/pata_marvell.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_marvell.c 2010-03-20 16:58:39.300646647 -0400 +@@ -100,7 +100,7 @@ static struct scsi_host_template marvell + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations marvell_ops = { ++static const struct ata_port_operations marvell_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = marvell_cable_detect, + .prereset = marvell_pre_reset, +diff -urNp linux-2.6.33.1/drivers/ata/pata_mpc52xx.c linux-2.6.33.1/drivers/ata/pata_mpc52xx.c +--- linux-2.6.33.1/drivers/ata/pata_mpc52xx.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_mpc52xx.c 2010-03-20 16:58:39.300646647 -0400 +@@ -609,7 +609,7 @@ static struct scsi_host_template mpc52xx + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations mpc52xx_ata_port_ops = { ++static const struct ata_port_operations mpc52xx_ata_port_ops = { + .inherits = &ata_sff_port_ops, + .sff_dev_select = mpc52xx_ata_dev_select, + .set_piomode = mpc52xx_ata_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_mpiix.c linux-2.6.33.1/drivers/ata/pata_mpiix.c +--- linux-2.6.33.1/drivers/ata/pata_mpiix.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_mpiix.c 2010-03-20 16:58:39.300646647 -0400 +@@ -140,7 +140,7 @@ static struct scsi_host_template mpiix_s + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations mpiix_port_ops = { ++static const struct ata_port_operations mpiix_port_ops = { + .inherits = &ata_sff_port_ops, + .qc_issue = mpiix_qc_issue, + .cable_detect = ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pata_netcell.c linux-2.6.33.1/drivers/ata/pata_netcell.c +--- linux-2.6.33.1/drivers/ata/pata_netcell.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_netcell.c 2010-03-20 16:58:39.300646647 -0400 +@@ -34,7 +34,7 @@ static struct scsi_host_template netcell + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations netcell_ops = { ++static const struct ata_port_operations netcell_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_80wire, + .read_id = netcell_read_id, +diff -urNp linux-2.6.33.1/drivers/ata/pata_ninja32.c linux-2.6.33.1/drivers/ata/pata_ninja32.c +--- linux-2.6.33.1/drivers/ata/pata_ninja32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_ninja32.c 2010-03-20 16:58:39.304517814 -0400 +@@ -81,7 +81,7 @@ static struct scsi_host_template ninja32 + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations ninja32_port_ops = { ++static const struct ata_port_operations ninja32_port_ops = { + .inherits = &ata_bmdma_port_ops, + .sff_dev_select = ninja32_dev_select, + .cable_detect = ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pata_ns87410.c linux-2.6.33.1/drivers/ata/pata_ns87410.c +--- 
linux-2.6.33.1/drivers/ata/pata_ns87410.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_ns87410.c 2010-03-20 16:58:39.304517814 -0400 +@@ -132,7 +132,7 @@ static struct scsi_host_template ns87410 + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations ns87410_port_ops = { ++static const struct ata_port_operations ns87410_port_ops = { + .inherits = &ata_sff_port_ops, + .qc_issue = ns87410_qc_issue, + .cable_detect = ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pata_ns87415.c linux-2.6.33.1/drivers/ata/pata_ns87415.c +--- linux-2.6.33.1/drivers/ata/pata_ns87415.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_ns87415.c 2010-03-20 16:58:39.304517814 -0400 +@@ -299,7 +299,7 @@ static u8 ns87560_bmdma_status(struct at + } + #endif /* 87560 SuperIO Support */ + +-static struct ata_port_operations ns87415_pata_ops = { ++static const struct ata_port_operations ns87415_pata_ops = { + .inherits = &ata_bmdma_port_ops, + + .check_atapi_dma = ns87415_check_atapi_dma, +@@ -313,7 +313,7 @@ static struct ata_port_operations ns8741 + }; + + #if defined(CONFIG_SUPERIO) +-static struct ata_port_operations ns87560_pata_ops = { ++static const struct ata_port_operations ns87560_pata_ops = { + .inherits = &ns87415_pata_ops, + .sff_tf_read = ns87560_tf_read, + .sff_check_status = ns87560_check_status, +diff -urNp linux-2.6.33.1/drivers/ata/pata_octeon_cf.c linux-2.6.33.1/drivers/ata/pata_octeon_cf.c +--- linux-2.6.33.1/drivers/ata/pata_octeon_cf.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_octeon_cf.c 2010-03-20 16:58:39.304517814 -0400 +@@ -801,6 +801,7 @@ static unsigned int octeon_cf_qc_issue(s + return 0; + } + ++/* cannot be const */ + static struct ata_port_operations octeon_cf_ops = { + .inherits = &ata_sff_port_ops, + .check_atapi_dma = octeon_cf_check_atapi_dma, +diff -urNp linux-2.6.33.1/drivers/ata/pata_oldpiix.c linux-2.6.33.1/drivers/ata/pata_oldpiix.c +--- linux-2.6.33.1/drivers/ata/pata_oldpiix.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_oldpiix.c 2010-03-20 16:58:39.304517814 -0400 +@@ -208,7 +208,7 @@ static struct scsi_host_template oldpiix + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations oldpiix_pata_ops = { ++static const struct ata_port_operations oldpiix_pata_ops = { + .inherits = &ata_bmdma_port_ops, + .qc_issue = oldpiix_qc_issue, + .cable_detect = ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pata_opti.c linux-2.6.33.1/drivers/ata/pata_opti.c +--- linux-2.6.33.1/drivers/ata/pata_opti.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_opti.c 2010-03-20 16:58:39.304517814 -0400 +@@ -152,7 +152,7 @@ static struct scsi_host_template opti_sh + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations opti_port_ops = { ++static const struct ata_port_operations opti_port_ops = { + .inherits = &ata_sff_port_ops, + .cable_detect = ata_cable_40wire, + .set_piomode = opti_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_optidma.c linux-2.6.33.1/drivers/ata/pata_optidma.c +--- linux-2.6.33.1/drivers/ata/pata_optidma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_optidma.c 2010-03-20 16:58:39.308536639 -0400 +@@ -337,7 +337,7 @@ static struct scsi_host_template optidma + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations optidma_port_ops = { ++static const struct ata_port_operations optidma_port_ops = { + .inherits = &ata_bmdma_port_ops, + 
.cable_detect = ata_cable_40wire, + .set_piomode = optidma_set_pio_mode, +@@ -346,7 +346,7 @@ static struct ata_port_operations optidm + .prereset = optidma_pre_reset, + }; + +-static struct ata_port_operations optiplus_port_ops = { ++static const struct ata_port_operations optiplus_port_ops = { + .inherits = &optidma_port_ops, + .set_piomode = optiplus_set_pio_mode, + .set_dmamode = optiplus_set_dma_mode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_palmld.c linux-2.6.33.1/drivers/ata/pata_palmld.c +--- linux-2.6.33.1/drivers/ata/pata_palmld.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_palmld.c 2010-03-20 16:58:39.308536639 -0400 +@@ -37,7 +37,7 @@ static struct scsi_host_template palmld_ + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations palmld_port_ops = { ++static const struct ata_port_operations palmld_port_ops = { + .inherits = &ata_sff_port_ops, + .sff_data_xfer = ata_sff_data_xfer_noirq, + .cable_detect = ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pata_pcmcia.c linux-2.6.33.1/drivers/ata/pata_pcmcia.c +--- linux-2.6.33.1/drivers/ata/pata_pcmcia.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_pcmcia.c 2010-03-20 16:58:39.308536639 -0400 +@@ -162,14 +162,14 @@ static struct scsi_host_template pcmcia_ + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations pcmcia_port_ops = { ++static const struct ata_port_operations pcmcia_port_ops = { + .inherits = &ata_sff_port_ops, + .sff_data_xfer = ata_sff_data_xfer_noirq, + .cable_detect = ata_cable_40wire, + .set_mode = pcmcia_set_mode, + }; + +-static struct ata_port_operations pcmcia_8bit_port_ops = { ++static const struct ata_port_operations pcmcia_8bit_port_ops = { + .inherits = &ata_sff_port_ops, + .sff_data_xfer = ata_data_xfer_8bit, + .cable_detect = ata_cable_40wire, +@@ -253,7 +253,7 @@ static int pcmcia_init_one(struct pcmcia + unsigned long io_base, ctl_base; + void __iomem *io_addr, *ctl_addr; + int n_ports = 1; +- struct ata_port_operations *ops = &pcmcia_port_ops; ++ const struct ata_port_operations *ops = &pcmcia_port_ops; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info == NULL) +diff -urNp linux-2.6.33.1/drivers/ata/pata_pdc2027x.c linux-2.6.33.1/drivers/ata/pata_pdc2027x.c +--- linux-2.6.33.1/drivers/ata/pata_pdc2027x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_pdc2027x.c 2010-03-20 16:58:39.308536639 -0400 +@@ -132,14 +132,14 @@ static struct scsi_host_template pdc2027 + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations pdc2027x_pata100_ops = { ++static const struct ata_port_operations pdc2027x_pata100_ops = { + .inherits = &ata_bmdma_port_ops, + .check_atapi_dma = pdc2027x_check_atapi_dma, + .cable_detect = pdc2027x_cable_detect, + .prereset = pdc2027x_prereset, + }; + +-static struct ata_port_operations pdc2027x_pata133_ops = { ++static const struct ata_port_operations pdc2027x_pata133_ops = { + .inherits = &pdc2027x_pata100_ops, + .mode_filter = pdc2027x_mode_filter, + .set_piomode = pdc2027x_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_pdc202xx_old.c linux-2.6.33.1/drivers/ata/pata_pdc202xx_old.c +--- linux-2.6.33.1/drivers/ata/pata_pdc202xx_old.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_pdc202xx_old.c 2010-03-20 16:58:39.308536639 -0400 +@@ -265,7 +265,7 @@ static struct scsi_host_template pdc202x + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations pdc2024x_port_ops = { ++static const struct 
ata_port_operations pdc2024x_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .cable_detect = ata_cable_40wire, +@@ -273,7 +273,7 @@ static struct ata_port_operations pdc202 + .set_dmamode = pdc202xx_set_dmamode, + }; + +-static struct ata_port_operations pdc2026x_port_ops = { ++static const struct ata_port_operations pdc2026x_port_ops = { + .inherits = &pdc2024x_port_ops, + + .check_atapi_dma = pdc2026x_check_atapi_dma, +diff -urNp linux-2.6.33.1/drivers/ata/pata_piccolo.c linux-2.6.33.1/drivers/ata/pata_piccolo.c +--- linux-2.6.33.1/drivers/ata/pata_piccolo.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_piccolo.c 2010-03-20 16:58:39.308536639 -0400 +@@ -67,7 +67,7 @@ static struct scsi_host_template tosh_sh + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations tosh_port_ops = { ++static const struct ata_port_operations tosh_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_unknown, + .set_piomode = tosh_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_platform.c linux-2.6.33.1/drivers/ata/pata_platform.c +--- linux-2.6.33.1/drivers/ata/pata_platform.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_platform.c 2010-03-20 16:58:39.312510954 -0400 +@@ -48,7 +48,7 @@ static struct scsi_host_template pata_pl + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations pata_platform_port_ops = { ++static const struct ata_port_operations pata_platform_port_ops = { + .inherits = &ata_sff_port_ops, + .sff_data_xfer = ata_sff_data_xfer_noirq, + .cable_detect = ata_cable_unknown, +diff -urNp linux-2.6.33.1/drivers/ata/pata_qdi.c linux-2.6.33.1/drivers/ata/pata_qdi.c +--- linux-2.6.33.1/drivers/ata/pata_qdi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_qdi.c 2010-03-20 16:58:39.312510954 -0400 +@@ -157,7 +157,7 @@ static struct scsi_host_template qdi_sht + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations qdi6500_port_ops = { ++static const struct ata_port_operations qdi6500_port_ops = { + .inherits = &ata_sff_port_ops, + .qc_issue = qdi_qc_issue, + .sff_data_xfer = qdi_data_xfer, +@@ -165,7 +165,7 @@ static struct ata_port_operations qdi650 + .set_piomode = qdi6500_set_piomode, + }; + +-static struct ata_port_operations qdi6580_port_ops = { ++static const struct ata_port_operations qdi6580_port_ops = { + .inherits = &qdi6500_port_ops, + .set_piomode = qdi6580_set_piomode, + }; +diff -urNp linux-2.6.33.1/drivers/ata/pata_radisys.c linux-2.6.33.1/drivers/ata/pata_radisys.c +--- linux-2.6.33.1/drivers/ata/pata_radisys.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_radisys.c 2010-03-20 16:58:39.312510954 -0400 +@@ -187,7 +187,7 @@ static struct scsi_host_template radisys + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations radisys_pata_ops = { ++static const struct ata_port_operations radisys_pata_ops = { + .inherits = &ata_bmdma_port_ops, + .qc_issue = radisys_qc_issue, + .cable_detect = ata_cable_unknown, +diff -urNp linux-2.6.33.1/drivers/ata/pata_rb532_cf.c linux-2.6.33.1/drivers/ata/pata_rb532_cf.c +--- linux-2.6.33.1/drivers/ata/pata_rb532_cf.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_rb532_cf.c 2010-03-20 16:58:39.312510954 -0400 +@@ -68,7 +68,7 @@ static irqreturn_t rb532_pata_irq_handle + return IRQ_HANDLED; + } + +-static struct ata_port_operations rb532_pata_port_ops = { ++static const struct ata_port_operations rb532_pata_port_ops = { + .inherits = 
&ata_sff_port_ops, + .sff_data_xfer = ata_sff_data_xfer32, + }; +diff -urNp linux-2.6.33.1/drivers/ata/pata_rdc.c linux-2.6.33.1/drivers/ata/pata_rdc.c +--- linux-2.6.33.1/drivers/ata/pata_rdc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_rdc.c 2010-03-20 16:58:39.312510954 -0400 +@@ -272,7 +272,7 @@ static void rdc_set_dmamode(struct ata_p + pci_write_config_byte(dev, 0x48, udma_enable); + } + +-static struct ata_port_operations rdc_pata_ops = { ++static const struct ata_port_operations rdc_pata_ops = { + .inherits = &ata_bmdma32_port_ops, + .cable_detect = rdc_pata_cable_detect, + .set_piomode = rdc_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_rz1000.c linux-2.6.33.1/drivers/ata/pata_rz1000.c +--- linux-2.6.33.1/drivers/ata/pata_rz1000.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_rz1000.c 2010-03-20 16:58:39.312510954 -0400 +@@ -54,7 +54,7 @@ static struct scsi_host_template rz1000_ + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations rz1000_port_ops = { ++static const struct ata_port_operations rz1000_port_ops = { + .inherits = &ata_sff_port_ops, + .cable_detect = ata_cable_40wire, + .set_mode = rz1000_set_mode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_sc1200.c linux-2.6.33.1/drivers/ata/pata_sc1200.c +--- linux-2.6.33.1/drivers/ata/pata_sc1200.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_sc1200.c 2010-03-20 16:58:39.312510954 -0400 +@@ -207,7 +207,7 @@ static struct scsi_host_template sc1200_ + .sg_tablesize = LIBATA_DUMB_MAX_PRD, + }; + +-static struct ata_port_operations sc1200_port_ops = { ++static const struct ata_port_operations sc1200_port_ops = { + .inherits = &ata_bmdma_port_ops, + .qc_prep = ata_sff_dumb_qc_prep, + .qc_issue = sc1200_qc_issue, +diff -urNp linux-2.6.33.1/drivers/ata/pata_scc.c linux-2.6.33.1/drivers/ata/pata_scc.c +--- linux-2.6.33.1/drivers/ata/pata_scc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_scc.c 2010-03-20 16:58:39.316539767 -0400 +@@ -965,7 +965,7 @@ static struct scsi_host_template scc_sht + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations scc_pata_ops = { ++static const struct ata_port_operations scc_pata_ops = { + .inherits = &ata_bmdma_port_ops, + + .set_piomode = scc_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_sch.c linux-2.6.33.1/drivers/ata/pata_sch.c +--- linux-2.6.33.1/drivers/ata/pata_sch.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_sch.c 2010-03-20 16:58:39.316539767 -0400 +@@ -75,7 +75,7 @@ static struct scsi_host_template sch_sht + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations sch_pata_ops = { ++static const struct ata_port_operations sch_pata_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_unknown, + .set_piomode = sch_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_serverworks.c linux-2.6.33.1/drivers/ata/pata_serverworks.c +--- linux-2.6.33.1/drivers/ata/pata_serverworks.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_serverworks.c 2010-03-20 16:58:39.316539767 -0400 +@@ -299,7 +299,7 @@ static struct scsi_host_template serverw + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations serverworks_osb4_port_ops = { ++static const struct ata_port_operations serverworks_osb4_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = serverworks_cable_detect, + .mode_filter = serverworks_osb4_filter, +@@ -307,7 +307,7 @@ static struct 
ata_port_operations server + .set_dmamode = serverworks_set_dmamode, + }; + +-static struct ata_port_operations serverworks_csb_port_ops = { ++static const struct ata_port_operations serverworks_csb_port_ops = { + .inherits = &serverworks_osb4_port_ops, + .mode_filter = serverworks_csb_filter, + }; +diff -urNp linux-2.6.33.1/drivers/ata/pata_sil680.c linux-2.6.33.1/drivers/ata/pata_sil680.c +--- linux-2.6.33.1/drivers/ata/pata_sil680.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_sil680.c 2010-03-20 16:58:39.316539767 -0400 +@@ -194,7 +194,7 @@ static struct scsi_host_template sil680_ + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations sil680_port_ops = { ++static const struct ata_port_operations sil680_port_ops = { + .inherits = &ata_bmdma32_port_ops, + .cable_detect = sil680_cable_detect, + .set_piomode = sil680_set_piomode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_sis.c linux-2.6.33.1/drivers/ata/pata_sis.c +--- linux-2.6.33.1/drivers/ata/pata_sis.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_sis.c 2010-03-20 16:58:39.320616216 -0400 +@@ -503,47 +503,47 @@ static struct scsi_host_template sis_sht + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations sis_133_for_sata_ops = { ++static const struct ata_port_operations sis_133_for_sata_ops = { + .inherits = &ata_bmdma_port_ops, + .set_piomode = sis_133_set_piomode, + .set_dmamode = sis_133_set_dmamode, + .cable_detect = sis_133_cable_detect, + }; + +-static struct ata_port_operations sis_base_ops = { ++static const struct ata_port_operations sis_base_ops = { + .inherits = &ata_bmdma_port_ops, + .prereset = sis_pre_reset, + }; + +-static struct ata_port_operations sis_133_ops = { ++static const struct ata_port_operations sis_133_ops = { + .inherits = &sis_base_ops, + .set_piomode = sis_133_set_piomode, + .set_dmamode = sis_133_set_dmamode, + .cable_detect = sis_133_cable_detect, + }; + +-static struct ata_port_operations sis_133_early_ops = { ++static const struct ata_port_operations sis_133_early_ops = { + .inherits = &sis_base_ops, + .set_piomode = sis_100_set_piomode, + .set_dmamode = sis_133_early_set_dmamode, + .cable_detect = sis_66_cable_detect, + }; + +-static struct ata_port_operations sis_100_ops = { ++static const struct ata_port_operations sis_100_ops = { + .inherits = &sis_base_ops, + .set_piomode = sis_100_set_piomode, + .set_dmamode = sis_100_set_dmamode, + .cable_detect = sis_66_cable_detect, + }; + +-static struct ata_port_operations sis_66_ops = { ++static const struct ata_port_operations sis_66_ops = { + .inherits = &sis_base_ops, + .set_piomode = sis_old_set_piomode, + .set_dmamode = sis_66_set_dmamode, + .cable_detect = sis_66_cable_detect, + }; + +-static struct ata_port_operations sis_old_ops = { ++static const struct ata_port_operations sis_old_ops = { + .inherits = &sis_base_ops, + .set_piomode = sis_old_set_piomode, + .set_dmamode = sis_old_set_dmamode, +diff -urNp linux-2.6.33.1/drivers/ata/pata_sl82c105.c linux-2.6.33.1/drivers/ata/pata_sl82c105.c +--- linux-2.6.33.1/drivers/ata/pata_sl82c105.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_sl82c105.c 2010-03-20 16:58:39.320616216 -0400 +@@ -231,7 +231,7 @@ static struct scsi_host_template sl82c10 + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations sl82c105_port_ops = { ++static const struct ata_port_operations sl82c105_port_ops = { + .inherits = &ata_bmdma_port_ops, + .qc_defer = sl82c105_qc_defer, + .bmdma_start = 
sl82c105_bmdma_start, +diff -urNp linux-2.6.33.1/drivers/ata/pata_triflex.c linux-2.6.33.1/drivers/ata/pata_triflex.c +--- linux-2.6.33.1/drivers/ata/pata_triflex.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_triflex.c 2010-03-20 16:58:39.320616216 -0400 +@@ -178,7 +178,7 @@ static struct scsi_host_template triflex + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations triflex_port_ops = { ++static const struct ata_port_operations triflex_port_ops = { + .inherits = &ata_bmdma_port_ops, + .bmdma_start = triflex_bmdma_start, + .bmdma_stop = triflex_bmdma_stop, +diff -urNp linux-2.6.33.1/drivers/ata/pata_via.c linux-2.6.33.1/drivers/ata/pata_via.c +--- linux-2.6.33.1/drivers/ata/pata_via.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_via.c 2010-03-20 16:58:39.320616216 -0400 +@@ -452,7 +452,7 @@ static struct scsi_host_template via_sht + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations via_port_ops = { ++static const struct ata_port_operations via_port_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = via_cable_detect, + .set_piomode = via_set_piomode, +@@ -463,7 +463,7 @@ static struct ata_port_operations via_po + .mode_filter = via_mode_filter, + }; + +-static struct ata_port_operations via_port_ops_noirq = { ++static const struct ata_port_operations via_port_ops_noirq = { + .inherits = &via_port_ops, + .sff_data_xfer = ata_sff_data_xfer_noirq, + }; +diff -urNp linux-2.6.33.1/drivers/ata/pata_winbond.c linux-2.6.33.1/drivers/ata/pata_winbond.c +--- linux-2.6.33.1/drivers/ata/pata_winbond.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pata_winbond.c 2010-03-20 16:58:39.346062317 -0400 +@@ -125,7 +125,7 @@ static struct scsi_host_template winbond + ATA_PIO_SHT(DRV_NAME), + }; + +-static struct ata_port_operations winbond_port_ops = { ++static const struct ata_port_operations winbond_port_ops = { + .inherits = &ata_sff_port_ops, + .sff_data_xfer = winbond_data_xfer, + .cable_detect = ata_cable_40wire, +diff -urNp linux-2.6.33.1/drivers/ata/pdc_adma.c linux-2.6.33.1/drivers/ata/pdc_adma.c +--- linux-2.6.33.1/drivers/ata/pdc_adma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/pdc_adma.c 2010-03-20 16:58:39.348522755 -0400 +@@ -145,7 +145,7 @@ static struct scsi_host_template adma_at + .dma_boundary = ADMA_DMA_BOUNDARY, + }; + +-static struct ata_port_operations adma_ata_ops = { ++static const struct ata_port_operations adma_ata_ops = { + .inherits = &ata_sff_port_ops, + + .lost_interrupt = ATA_OP_NULL, +diff -urNp linux-2.6.33.1/drivers/ata/sata_fsl.c linux-2.6.33.1/drivers/ata/sata_fsl.c +--- linux-2.6.33.1/drivers/ata/sata_fsl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_fsl.c 2010-03-20 16:58:39.348522755 -0400 +@@ -1260,7 +1260,7 @@ static struct scsi_host_template sata_fs + .dma_boundary = ATA_DMA_BOUNDARY, + }; + +-static struct ata_port_operations sata_fsl_ops = { ++static const struct ata_port_operations sata_fsl_ops = { + .inherits = &sata_pmp_port_ops, + + .qc_defer = ata_std_qc_defer, +diff -urNp linux-2.6.33.1/drivers/ata/sata_inic162x.c linux-2.6.33.1/drivers/ata/sata_inic162x.c +--- linux-2.6.33.1/drivers/ata/sata_inic162x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_inic162x.c 2010-03-20 16:58:39.348522755 -0400 +@@ -721,7 +721,7 @@ static int inic_port_start(struct ata_po + return 0; + } + +-static struct ata_port_operations inic_port_ops = { ++static const struct 
ata_port_operations inic_port_ops = { + .inherits = &sata_port_ops, + + .check_atapi_dma = inic_check_atapi_dma, +diff -urNp linux-2.6.33.1/drivers/ata/sata_mv.c linux-2.6.33.1/drivers/ata/sata_mv.c +--- linux-2.6.33.1/drivers/ata/sata_mv.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_mv.c 2010-03-20 16:58:39.352614445 -0400 +@@ -662,7 +662,7 @@ static struct scsi_host_template mv6_sht + .dma_boundary = MV_DMA_BOUNDARY, + }; + +-static struct ata_port_operations mv5_ops = { ++static const struct ata_port_operations mv5_ops = { + .inherits = &ata_sff_port_ops, + + .lost_interrupt = ATA_OP_NULL, +@@ -684,7 +684,7 @@ static struct ata_port_operations mv5_op + .port_stop = mv_port_stop, + }; + +-static struct ata_port_operations mv6_ops = { ++static const struct ata_port_operations mv6_ops = { + .inherits = &mv5_ops, + .dev_config = mv6_dev_config, + .scr_read = mv_scr_read, +@@ -704,7 +704,7 @@ static struct ata_port_operations mv6_op + .bmdma_status = mv_bmdma_status, + }; + +-static struct ata_port_operations mv_iie_ops = { ++static const struct ata_port_operations mv_iie_ops = { + .inherits = &mv6_ops, + .dev_config = ATA_OP_NULL, + .qc_prep = mv_qc_prep_iie, +diff -urNp linux-2.6.33.1/drivers/ata/sata_nv.c linux-2.6.33.1/drivers/ata/sata_nv.c +--- linux-2.6.33.1/drivers/ata/sata_nv.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_nv.c 2010-03-20 16:58:39.352614445 -0400 +@@ -464,7 +464,7 @@ static struct scsi_host_template nv_swnc + * cases. Define nv_hardreset() which only kicks in for post-boot + * probing and use it for all variants. + */ +-static struct ata_port_operations nv_generic_ops = { ++static const struct ata_port_operations nv_generic_ops = { + .inherits = &ata_bmdma_port_ops, + .lost_interrupt = ATA_OP_NULL, + .scr_read = nv_scr_read, +@@ -472,20 +472,20 @@ static struct ata_port_operations nv_gen + .hardreset = nv_hardreset, + }; + +-static struct ata_port_operations nv_nf2_ops = { ++static const struct ata_port_operations nv_nf2_ops = { + .inherits = &nv_generic_ops, + .freeze = nv_nf2_freeze, + .thaw = nv_nf2_thaw, + }; + +-static struct ata_port_operations nv_ck804_ops = { ++static const struct ata_port_operations nv_ck804_ops = { + .inherits = &nv_generic_ops, + .freeze = nv_ck804_freeze, + .thaw = nv_ck804_thaw, + .host_stop = nv_ck804_host_stop, + }; + +-static struct ata_port_operations nv_adma_ops = { ++static const struct ata_port_operations nv_adma_ops = { + .inherits = &nv_ck804_ops, + + .check_atapi_dma = nv_adma_check_atapi_dma, +@@ -509,7 +509,7 @@ static struct ata_port_operations nv_adm + .host_stop = nv_adma_host_stop, + }; + +-static struct ata_port_operations nv_swncq_ops = { ++static const struct ata_port_operations nv_swncq_ops = { + .inherits = &nv_generic_ops, + + .qc_defer = ata_std_qc_defer, +diff -urNp linux-2.6.33.1/drivers/ata/sata_promise.c linux-2.6.33.1/drivers/ata/sata_promise.c +--- linux-2.6.33.1/drivers/ata/sata_promise.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_promise.c 2010-03-20 16:58:39.352614445 -0400 +@@ -195,7 +195,7 @@ static const struct ata_port_operations + .error_handler = pdc_error_handler, + }; + +-static struct ata_port_operations pdc_sata_ops = { ++static const struct ata_port_operations pdc_sata_ops = { + .inherits = &pdc_common_ops, + .cable_detect = pdc_sata_cable_detect, + .freeze = pdc_sata_freeze, +@@ -208,14 +208,14 @@ static struct ata_port_operations pdc_sa + + /* First-generation chips need a more restrictive ->check_atapi_dma 
op, + and ->freeze/thaw that ignore the hotplug controls. */ +-static struct ata_port_operations pdc_old_sata_ops = { ++static const struct ata_port_operations pdc_old_sata_ops = { + .inherits = &pdc_sata_ops, + .freeze = pdc_freeze, + .thaw = pdc_thaw, + .check_atapi_dma = pdc_old_sata_check_atapi_dma, + }; + +-static struct ata_port_operations pdc_pata_ops = { ++static const struct ata_port_operations pdc_pata_ops = { + .inherits = &pdc_common_ops, + .cable_detect = pdc_pata_cable_detect, + .freeze = pdc_freeze, +diff -urNp linux-2.6.33.1/drivers/ata/sata_qstor.c linux-2.6.33.1/drivers/ata/sata_qstor.c +--- linux-2.6.33.1/drivers/ata/sata_qstor.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_qstor.c 2010-03-20 16:58:39.356512831 -0400 +@@ -132,7 +132,7 @@ static struct scsi_host_template qs_ata_ + .dma_boundary = QS_DMA_BOUNDARY, + }; + +-static struct ata_port_operations qs_ata_ops = { ++static const struct ata_port_operations qs_ata_ops = { + .inherits = &ata_sff_port_ops, + + .check_atapi_dma = qs_check_atapi_dma, +diff -urNp linux-2.6.33.1/drivers/ata/sata_sil24.c linux-2.6.33.1/drivers/ata/sata_sil24.c +--- linux-2.6.33.1/drivers/ata/sata_sil24.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_sil24.c 2010-03-20 16:58:39.364799637 -0400 +@@ -388,7 +388,7 @@ static struct scsi_host_template sil24_s + .dma_boundary = ATA_DMA_BOUNDARY, + }; + +-static struct ata_port_operations sil24_ops = { ++static const struct ata_port_operations sil24_ops = { + .inherits = &sata_pmp_port_ops, + + .qc_defer = sil24_qc_defer, +diff -urNp linux-2.6.33.1/drivers/ata/sata_sil.c linux-2.6.33.1/drivers/ata/sata_sil.c +--- linux-2.6.33.1/drivers/ata/sata_sil.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_sil.c 2010-03-20 16:58:39.368722008 -0400 +@@ -182,7 +182,7 @@ static struct scsi_host_template sil_sht + .sg_tablesize = ATA_MAX_PRD + }; + +-static struct ata_port_operations sil_ops = { ++static const struct ata_port_operations sil_ops = { + .inherits = &ata_bmdma32_port_ops, + .dev_config = sil_dev_config, + .set_mode = sil_set_mode, +diff -urNp linux-2.6.33.1/drivers/ata/sata_sis.c linux-2.6.33.1/drivers/ata/sata_sis.c +--- linux-2.6.33.1/drivers/ata/sata_sis.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_sis.c 2010-03-20 16:58:39.368722008 -0400 +@@ -89,7 +89,7 @@ static struct scsi_host_template sis_sht + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations sis_ops = { ++static const struct ata_port_operations sis_ops = { + .inherits = &ata_bmdma_port_ops, + .scr_read = sis_scr_read, + .scr_write = sis_scr_write, +diff -urNp linux-2.6.33.1/drivers/ata/sata_svw.c linux-2.6.33.1/drivers/ata/sata_svw.c +--- linux-2.6.33.1/drivers/ata/sata_svw.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_svw.c 2010-03-20 16:58:39.368722008 -0400 +@@ -344,7 +344,7 @@ static struct scsi_host_template k2_sata + }; + + +-static struct ata_port_operations k2_sata_ops = { ++static const struct ata_port_operations k2_sata_ops = { + .inherits = &ata_bmdma_port_ops, + .sff_tf_load = k2_sata_tf_load, + .sff_tf_read = k2_sata_tf_read, +diff -urNp linux-2.6.33.1/drivers/ata/sata_sx4.c linux-2.6.33.1/drivers/ata/sata_sx4.c +--- linux-2.6.33.1/drivers/ata/sata_sx4.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_sx4.c 2010-03-20 16:58:39.368722008 -0400 +@@ -248,7 +248,7 @@ static struct scsi_host_template pdc_sat + }; + + /* TODO: inherit from base port_ops 
after converting to new EH */ +-static struct ata_port_operations pdc_20621_ops = { ++static const struct ata_port_operations pdc_20621_ops = { + .inherits = &ata_sff_port_ops, + + .check_atapi_dma = pdc_check_atapi_dma, +diff -urNp linux-2.6.33.1/drivers/ata/sata_uli.c linux-2.6.33.1/drivers/ata/sata_uli.c +--- linux-2.6.33.1/drivers/ata/sata_uli.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_uli.c 2010-03-20 16:58:39.372783452 -0400 +@@ -79,7 +79,7 @@ static struct scsi_host_template uli_sht + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations uli_ops = { ++static const struct ata_port_operations uli_ops = { + .inherits = &ata_bmdma_port_ops, + .scr_read = uli_scr_read, + .scr_write = uli_scr_write, +diff -urNp linux-2.6.33.1/drivers/ata/sata_via.c linux-2.6.33.1/drivers/ata/sata_via.c +--- linux-2.6.33.1/drivers/ata/sata_via.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_via.c 2010-03-20 16:58:39.372783452 -0400 +@@ -112,31 +112,31 @@ static struct scsi_host_template svia_sh + ATA_BMDMA_SHT(DRV_NAME), + }; + +-static struct ata_port_operations svia_base_ops = { ++static const struct ata_port_operations svia_base_ops = { + .inherits = &ata_bmdma_port_ops, + .sff_tf_load = svia_tf_load, + }; + +-static struct ata_port_operations vt6420_sata_ops = { ++static const struct ata_port_operations vt6420_sata_ops = { + .inherits = &svia_base_ops, + .freeze = svia_noop_freeze, + .prereset = vt6420_prereset, + }; + +-static struct ata_port_operations vt6421_pata_ops = { ++static const struct ata_port_operations vt6421_pata_ops = { + .inherits = &svia_base_ops, + .cable_detect = vt6421_pata_cable_detect, + .set_piomode = vt6421_set_pio_mode, + .set_dmamode = vt6421_set_dma_mode, + }; + +-static struct ata_port_operations vt6421_sata_ops = { ++static const struct ata_port_operations vt6421_sata_ops = { + .inherits = &svia_base_ops, + .scr_read = svia_scr_read, + .scr_write = svia_scr_write, + }; + +-static struct ata_port_operations vt8251_ops = { ++static const struct ata_port_operations vt8251_ops = { + .inherits = &svia_base_ops, + .hardreset = sata_std_hardreset, + .scr_read = vt8251_scr_read, +diff -urNp linux-2.6.33.1/drivers/ata/sata_vsc.c linux-2.6.33.1/drivers/ata/sata_vsc.c +--- linux-2.6.33.1/drivers/ata/sata_vsc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ata/sata_vsc.c 2010-03-20 16:58:39.372783452 -0400 +@@ -306,7 +306,7 @@ static struct scsi_host_template vsc_sat + }; + + +-static struct ata_port_operations vsc_sata_ops = { ++static const struct ata_port_operations vsc_sata_ops = { + .inherits = &ata_bmdma_port_ops, + /* The IRQ handling is not quite standard SFF behaviour so we + cannot use the default lost interrupt handler */ +diff -urNp linux-2.6.33.1/drivers/atm/adummy.c linux-2.6.33.1/drivers/atm/adummy.c +--- linux-2.6.33.1/drivers/atm/adummy.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/adummy.c 2010-03-20 16:58:39.380807655 -0400 +@@ -77,7 +77,7 @@ adummy_send(struct atm_vcc *vcc, struct + vcc->pop(vcc, skb); + else + dev_kfree_skb_any(skb); +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + + return 0; + } +diff -urNp linux-2.6.33.1/drivers/atm/ambassador.c linux-2.6.33.1/drivers/atm/ambassador.c +--- linux-2.6.33.1/drivers/atm/ambassador.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/ambassador.c 2010-03-20 16:58:39.414343197 -0400 +@@ -453,7 +453,7 @@ static void tx_complete (amb_dev * dev, + PRINTD 
(DBG_FLOW|DBG_TX, "tx_complete %p %p", dev, tx); + + // VC layer stats +- atomic_inc(&ATM_SKB(skb)->vcc->stats->tx); ++ atomic_inc_unchecked(&ATM_SKB(skb)->vcc->stats->tx); + + // free the descriptor + kfree (tx_descr); +@@ -494,7 +494,7 @@ static void rx_complete (amb_dev * dev, + dump_skb ("<<<", vc, skb); + + // VC layer stats +- atomic_inc(&atm_vcc->stats->rx); ++ atomic_inc_unchecked(&atm_vcc->stats->rx); + __net_timestamp(skb); + // end of our responsability + atm_vcc->push (atm_vcc, skb); +@@ -509,7 +509,7 @@ static void rx_complete (amb_dev * dev, + } else { + PRINTK (KERN_INFO, "dropped over-size frame"); + // should we count this? +- atomic_inc(&atm_vcc->stats->rx_drop); ++ atomic_inc_unchecked(&atm_vcc->stats->rx_drop); + } + + } else { +@@ -1341,7 +1341,7 @@ static int amb_send (struct atm_vcc * at + } + + if (check_area (skb->data, skb->len)) { +- atomic_inc(&atm_vcc->stats->tx_err); ++ atomic_inc_unchecked(&atm_vcc->stats->tx_err); + return -ENOMEM; // ? + } + +diff -urNp linux-2.6.33.1/drivers/atm/atmtcp.c linux-2.6.33.1/drivers/atm/atmtcp.c +--- linux-2.6.33.1/drivers/atm/atmtcp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/atmtcp.c 2010-03-20 16:58:39.414343197 -0400 +@@ -206,7 +206,7 @@ static int atmtcp_v_send(struct atm_vcc + if (vcc->pop) vcc->pop(vcc,skb); + else dev_kfree_skb(skb); + if (dev_data) return 0; +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + return -ENOLINK; + } + size = skb->len+sizeof(struct atmtcp_hdr); +@@ -214,7 +214,7 @@ static int atmtcp_v_send(struct atm_vcc + if (!new_skb) { + if (vcc->pop) vcc->pop(vcc,skb); + else dev_kfree_skb(skb); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + return -ENOBUFS; + } + hdr = (void *) skb_put(new_skb,sizeof(struct atmtcp_hdr)); +@@ -225,8 +225,8 @@ static int atmtcp_v_send(struct atm_vcc + if (vcc->pop) vcc->pop(vcc,skb); + else dev_kfree_skb(skb); + out_vcc->push(out_vcc,new_skb); +- atomic_inc(&vcc->stats->tx); +- atomic_inc(&out_vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->tx); ++ atomic_inc_unchecked(&out_vcc->stats->rx); + return 0; + } + +@@ -300,7 +300,7 @@ static int atmtcp_c_send(struct atm_vcc + out_vcc = find_vcc(dev, ntohs(hdr->vpi), ntohs(hdr->vci)); + read_unlock(&vcc_sklist_lock); + if (!out_vcc) { +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + goto done; + } + skb_pull(skb,sizeof(struct atmtcp_hdr)); +@@ -312,8 +312,8 @@ static int atmtcp_c_send(struct atm_vcc + __net_timestamp(new_skb); + skb_copy_from_linear_data(skb, skb_put(new_skb, skb->len), skb->len); + out_vcc->push(out_vcc,new_skb); +- atomic_inc(&vcc->stats->tx); +- atomic_inc(&out_vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->tx); ++ atomic_inc_unchecked(&out_vcc->stats->rx); + done: + if (vcc->pop) vcc->pop(vcc,skb); + else dev_kfree_skb(skb); +diff -urNp linux-2.6.33.1/drivers/atm/eni.c linux-2.6.33.1/drivers/atm/eni.c +--- linux-2.6.33.1/drivers/atm/eni.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/eni.c 2010-03-20 16:58:39.441488287 -0400 +@@ -525,7 +525,7 @@ static int rx_aal0(struct atm_vcc *vcc) + DPRINTK(DEV_LABEL "(itf %d): trashing empty cell\n", + vcc->dev->number); + length = 0; +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + } + else { + length = ATM_CELL_SIZE-1; /* no HEC */ +@@ -580,7 +580,7 @@ static int rx_aal5(struct atm_vcc *vcc) + size); + } + eff = length = 0; +- atomic_inc(&vcc->stats->rx_err); ++ 
atomic_inc_unchecked(&vcc->stats->rx_err); + } + else { + size = (descr & MID_RED_COUNT)*(ATM_CELL_PAYLOAD >> 2); +@@ -597,7 +597,7 @@ static int rx_aal5(struct atm_vcc *vcc) + "(VCI=%d,length=%ld,size=%ld (descr 0x%lx))\n", + vcc->dev->number,vcc->vci,length,size << 2,descr); + length = eff = 0; +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + } + } + skb = eff ? atm_alloc_charge(vcc,eff << 2,GFP_ATOMIC) : NULL; +@@ -770,7 +770,7 @@ rx_dequeued++; + vcc->push(vcc,skb); + pushed++; + } +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + } + wake_up(&eni_dev->rx_wait); + } +@@ -1227,7 +1227,7 @@ static void dequeue_tx(struct atm_dev *d + PCI_DMA_TODEVICE); + if (vcc->pop) vcc->pop(vcc,skb); + else dev_kfree_skb_irq(skb); +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + wake_up(&eni_dev->tx_wait); + dma_complete++; + } +diff -urNp linux-2.6.33.1/drivers/atm/firestream.c linux-2.6.33.1/drivers/atm/firestream.c +--- linux-2.6.33.1/drivers/atm/firestream.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/firestream.c 2010-03-20 16:58:39.441488287 -0400 +@@ -748,7 +748,7 @@ static void process_txdone_queue (struct + } + } + +- atomic_inc(&ATM_SKB(skb)->vcc->stats->tx); ++ atomic_inc_unchecked(&ATM_SKB(skb)->vcc->stats->tx); + + fs_dprintk (FS_DEBUG_TXMEM, "i"); + fs_dprintk (FS_DEBUG_ALLOC, "Free t-skb: %p\n", skb); +@@ -815,7 +815,7 @@ static void process_incoming (struct fs_ + #endif + skb_put (skb, qe->p1 & 0xffff); + ATM_SKB(skb)->vcc = atm_vcc; +- atomic_inc(&atm_vcc->stats->rx); ++ atomic_inc_unchecked(&atm_vcc->stats->rx); + __net_timestamp(skb); + fs_dprintk (FS_DEBUG_ALLOC, "Free rec-skb: %p (pushed)\n", skb); + atm_vcc->push (atm_vcc, skb); +@@ -836,12 +836,12 @@ static void process_incoming (struct fs_ + kfree (pe); + } + if (atm_vcc) +- atomic_inc(&atm_vcc->stats->rx_drop); ++ atomic_inc_unchecked(&atm_vcc->stats->rx_drop); + break; + case 0x1f: /* Reassembly abort: no buffers. */ + /* Silently increment error counter. */ + if (atm_vcc) +- atomic_inc(&atm_vcc->stats->rx_drop); ++ atomic_inc_unchecked(&atm_vcc->stats->rx_drop); + break; + default: /* Hmm. Haven't written the code to handle the others yet... 
-- REW */ + printk (KERN_WARNING "Don't know what to do with RX status %x: %s.\n", +diff -urNp linux-2.6.33.1/drivers/atm/fore200e.c linux-2.6.33.1/drivers/atm/fore200e.c +--- linux-2.6.33.1/drivers/atm/fore200e.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/fore200e.c 2010-03-20 16:58:39.457804294 -0400 +@@ -931,9 +931,9 @@ fore200e_tx_irq(struct fore200e* fore200 + #endif + /* check error condition */ + if (*entry->status & STATUS_ERROR) +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + else +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + } + } + +@@ -1082,7 +1082,7 @@ fore200e_push_rpd(struct fore200e* fore2 + if (skb == NULL) { + DPRINTK(2, "unable to alloc new skb, rx PDU length = %d\n", pdu_len); + +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + return -ENOMEM; + } + +@@ -1125,14 +1125,14 @@ fore200e_push_rpd(struct fore200e* fore2 + + dev_kfree_skb_any(skb); + +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + return -ENOMEM; + } + + ASSERT(atomic_read(&sk_atm(vcc)->sk_wmem_alloc) >= 0); + + vcc->push(vcc, skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + + ASSERT(atomic_read(&sk_atm(vcc)->sk_wmem_alloc) >= 0); + +@@ -1210,7 +1210,7 @@ fore200e_rx_irq(struct fore200e* fore200 + DPRINTK(2, "damaged PDU on %d.%d.%d\n", + fore200e->atm_dev->number, + entry->rpd->atm_header.vpi, entry->rpd->atm_header.vci); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + } + } + +@@ -1655,7 +1655,7 @@ fore200e_send(struct atm_vcc *vcc, struc + goto retry_here; + } + +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + + fore200e->tx_sat++; + DPRINTK(2, "tx queue of device %s is saturated, PDU dropped - heartbeat is %08x\n", +diff -urNp linux-2.6.33.1/drivers/atm/he.c linux-2.6.33.1/drivers/atm/he.c +--- linux-2.6.33.1/drivers/atm/he.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/he.c 2010-03-20 16:58:39.488806819 -0400 +@@ -1769,7 +1769,7 @@ he_service_rbrq(struct he_dev *he_dev, i + + if (RBRQ_HBUF_ERR(he_dev->rbrq_head)) { + hprintk("HBUF_ERR! (cid 0x%x)\n", cid); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + goto return_host_buffers; + } + +@@ -1802,7 +1802,7 @@ he_service_rbrq(struct he_dev *he_dev, i + RBRQ_LEN_ERR(he_dev->rbrq_head) + ? 
"LEN_ERR" : "", + vcc->vpi, vcc->vci); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + goto return_host_buffers; + } + +@@ -1861,7 +1861,7 @@ he_service_rbrq(struct he_dev *he_dev, i + vcc->push(vcc, skb); + spin_lock(&he_dev->global_lock); + +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + + return_host_buffers: + ++pdus_assembled; +@@ -2206,7 +2206,7 @@ __enqueue_tpd(struct he_dev *he_dev, str + tpd->vcc->pop(tpd->vcc, tpd->skb); + else + dev_kfree_skb_any(tpd->skb); +- atomic_inc(&tpd->vcc->stats->tx_err); ++ atomic_inc_unchecked(&tpd->vcc->stats->tx_err); + } + pci_pool_free(he_dev->tpd_pool, tpd, TPD_ADDR(tpd->status)); + return; +@@ -2618,7 +2618,7 @@ he_send(struct atm_vcc *vcc, struct sk_b + vcc->pop(vcc, skb); + else + dev_kfree_skb_any(skb); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + return -EINVAL; + } + +@@ -2629,7 +2629,7 @@ he_send(struct atm_vcc *vcc, struct sk_b + vcc->pop(vcc, skb); + else + dev_kfree_skb_any(skb); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + return -EINVAL; + } + #endif +@@ -2641,7 +2641,7 @@ he_send(struct atm_vcc *vcc, struct sk_b + vcc->pop(vcc, skb); + else + dev_kfree_skb_any(skb); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + spin_unlock_irqrestore(&he_dev->global_lock, flags); + return -ENOMEM; + } +@@ -2683,7 +2683,7 @@ he_send(struct atm_vcc *vcc, struct sk_b + vcc->pop(vcc, skb); + else + dev_kfree_skb_any(skb); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + spin_unlock_irqrestore(&he_dev->global_lock, flags); + return -ENOMEM; + } +@@ -2714,7 +2714,7 @@ he_send(struct atm_vcc *vcc, struct sk_b + __enqueue_tpd(he_dev, tpd, cid); + spin_unlock_irqrestore(&he_dev->global_lock, flags); + +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + + return 0; + } +diff -urNp linux-2.6.33.1/drivers/atm/horizon.c linux-2.6.33.1/drivers/atm/horizon.c +--- linux-2.6.33.1/drivers/atm/horizon.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/horizon.c 2010-03-20 16:58:39.496821217 -0400 +@@ -1033,7 +1033,7 @@ static void rx_schedule (hrz_dev * dev, + { + struct atm_vcc * vcc = ATM_SKB(skb)->vcc; + // VC layer stats +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + __net_timestamp(skb); + // end of our responsability + vcc->push (vcc, skb); +@@ -1185,7 +1185,7 @@ static void tx_schedule (hrz_dev * const + dev->tx_iovec = NULL; + + // VC layer stats +- atomic_inc(&ATM_SKB(skb)->vcc->stats->tx); ++ atomic_inc_unchecked(&ATM_SKB(skb)->vcc->stats->tx); + + // free the skb + hrz_kfree_skb (skb); +diff -urNp linux-2.6.33.1/drivers/atm/idt77252.c linux-2.6.33.1/drivers/atm/idt77252.c +--- linux-2.6.33.1/drivers/atm/idt77252.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/idt77252.c 2010-03-20 16:58:39.496821217 -0400 +@@ -810,7 +810,7 @@ drain_scq(struct idt77252_dev *card, str + else + dev_kfree_skb(skb); + +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + } + + atomic_dec(&scq->used); +@@ -1073,13 +1073,13 @@ dequeue_rx(struct idt77252_dev *card, st + if ((sb = dev_alloc_skb(64)) == NULL) { + printk("%s: Can't allocate buffers for aal0.\n", + card->name); +- atomic_add(i, &vcc->stats->rx_drop); ++ atomic_add_unchecked(i, &vcc->stats->rx_drop); + break; + } + if (!atm_charge(vcc, sb->truesize)) { + RXPRINTK("%s: atm_charge() dropped aal0 
packets.\n", + card->name); +- atomic_add(i - 1, &vcc->stats->rx_drop); ++ atomic_add_unchecked(i - 1, &vcc->stats->rx_drop); + dev_kfree_skb(sb); + break; + } +@@ -1096,7 +1096,7 @@ dequeue_rx(struct idt77252_dev *card, st + ATM_SKB(sb)->vcc = vcc; + __net_timestamp(sb); + vcc->push(vcc, sb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + + cell += ATM_CELL_PAYLOAD; + } +@@ -1133,13 +1133,13 @@ dequeue_rx(struct idt77252_dev *card, st + "(CDC: %08x)\n", + card->name, len, rpp->len, readl(SAR_REG_CDC)); + recycle_rx_pool_skb(card, rpp); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + return; + } + if (stat & SAR_RSQE_CRC) { + RXPRINTK("%s: AAL5 CRC error.\n", card->name); + recycle_rx_pool_skb(card, rpp); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + return; + } + if (skb_queue_len(&rpp->queue) > 1) { +@@ -1150,7 +1150,7 @@ dequeue_rx(struct idt77252_dev *card, st + RXPRINTK("%s: Can't alloc RX skb.\n", + card->name); + recycle_rx_pool_skb(card, rpp); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + return; + } + if (!atm_charge(vcc, skb->truesize)) { +@@ -1169,7 +1169,7 @@ dequeue_rx(struct idt77252_dev *card, st + __net_timestamp(skb); + + vcc->push(vcc, skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + + return; + } +@@ -1191,7 +1191,7 @@ dequeue_rx(struct idt77252_dev *card, st + __net_timestamp(skb); + + vcc->push(vcc, skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + + if (skb->truesize > SAR_FB_SIZE_3) + add_rx_skb(card, 3, SAR_FB_SIZE_3, 1); +@@ -1303,14 +1303,14 @@ idt77252_rx_raw(struct idt77252_dev *car + if (vcc->qos.aal != ATM_AAL0) { + RPRINTK("%s: raw cell for non AAL0 vc %u.%u\n", + card->name, vpi, vci); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + goto drop; + } + + if ((sb = dev_alloc_skb(64)) == NULL) { + printk("%s: Can't allocate buffers for AAL0.\n", + card->name); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + goto drop; + } + +@@ -1329,7 +1329,7 @@ idt77252_rx_raw(struct idt77252_dev *car + ATM_SKB(sb)->vcc = vcc; + __net_timestamp(sb); + vcc->push(vcc, sb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + + drop: + skb_pull(queue, 64); +@@ -1954,13 +1954,13 @@ idt77252_send_skb(struct atm_vcc *vcc, s + + if (vc == NULL) { + printk("%s: NULL connection in send().\n", card->name); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb(skb); + return -EINVAL; + } + if (!test_bit(VCF_TX, &vc->flags)) { + printk("%s: Trying to transmit on a non-tx VC.\n", card->name); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb(skb); + return -EINVAL; + } +@@ -1972,14 +1972,14 @@ idt77252_send_skb(struct atm_vcc *vcc, s + break; + default: + printk("%s: Unsupported AAL: %d\n", card->name, vcc->qos.aal); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb(skb); + return -EINVAL; + } + + if (skb_shinfo(skb)->nr_frags != 0) { + printk("%s: No scatter-gather yet.\n", card->name); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb(skb); + return -EINVAL; + } +@@ -1987,7 +1987,7 @@ idt77252_send_skb(struct atm_vcc *vcc, s + + err = queue_skb(card, vc, skb, oam); + if (err) { +- 
atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb(skb); + return err; + } +@@ -2010,7 +2010,7 @@ idt77252_send_oam(struct atm_vcc *vcc, v + skb = dev_alloc_skb(64); + if (!skb) { + printk("%s: Out of memory in send_oam().\n", card->name); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + return -ENOMEM; + } + atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); +diff -urNp linux-2.6.33.1/drivers/atm/iphase.c linux-2.6.33.1/drivers/atm/iphase.c +--- linux-2.6.33.1/drivers/atm/iphase.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/iphase.c 2010-03-20 16:58:39.500808414 -0400 +@@ -1123,7 +1123,7 @@ static int rx_pkt(struct atm_dev *dev) + status = (u_short) (buf_desc_ptr->desc_mode); + if (status & (RX_CER | RX_PTE | RX_OFL)) + { +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + IF_ERR(printk("IA: bad packet, dropping it");) + if (status & RX_CER) { + IF_ERR(printk(" cause: packet CRC error\n");) +@@ -1146,7 +1146,7 @@ static int rx_pkt(struct atm_dev *dev) + len = dma_addr - buf_addr; + if (len > iadev->rx_buf_sz) { + printk("Over %d bytes sdu received, dropped!!!\n", iadev->rx_buf_sz); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + goto out_free_desc; + } + +@@ -1296,7 +1296,7 @@ static void rx_dle_intr(struct atm_dev * + ia_vcc = INPH_IA_VCC(vcc); + if (ia_vcc == NULL) + { +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + dev_kfree_skb_any(skb); + atm_return(vcc, atm_guess_pdu2truesize(len)); + goto INCR_DLE; +@@ -1308,7 +1308,7 @@ static void rx_dle_intr(struct atm_dev * + if ((length > iadev->rx_buf_sz) || (length > + (skb->len - sizeof(struct cpcs_trailer)))) + { +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + IF_ERR(printk("rx_dle_intr: Bad AAL5 trailer %d (skb len %d)", + length, skb->len);) + dev_kfree_skb_any(skb); +@@ -1324,7 +1324,7 @@ static void rx_dle_intr(struct atm_dev * + + IF_RX(printk("rx_dle_intr: skb push");) + vcc->push(vcc,skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + iadev->rx_pkt_cnt++; + } + INCR_DLE: +@@ -2806,15 +2806,15 @@ static int ia_ioctl(struct atm_dev *dev, + { + struct k_sonet_stats *stats; + stats = &PRIV(_ia_dev[board])->sonet_stats; +- printk("section_bip: %d\n", atomic_read(&stats->section_bip)); +- printk("line_bip : %d\n", atomic_read(&stats->line_bip)); +- printk("path_bip : %d\n", atomic_read(&stats->path_bip)); +- printk("line_febe : %d\n", atomic_read(&stats->line_febe)); +- printk("path_febe : %d\n", atomic_read(&stats->path_febe)); +- printk("corr_hcs : %d\n", atomic_read(&stats->corr_hcs)); +- printk("uncorr_hcs : %d\n", atomic_read(&stats->uncorr_hcs)); +- printk("tx_cells : %d\n", atomic_read(&stats->tx_cells)); +- printk("rx_cells : %d\n", atomic_read(&stats->rx_cells)); ++ printk("section_bip: %d\n", atomic_read_unchecked(&stats->section_bip)); ++ printk("line_bip : %d\n", atomic_read_unchecked(&stats->line_bip)); ++ printk("path_bip : %d\n", atomic_read_unchecked(&stats->path_bip)); ++ printk("line_febe : %d\n", atomic_read_unchecked(&stats->line_febe)); ++ printk("path_febe : %d\n", atomic_read_unchecked(&stats->path_febe)); ++ printk("corr_hcs : %d\n", atomic_read_unchecked(&stats->corr_hcs)); ++ printk("uncorr_hcs : %d\n", atomic_read_unchecked(&stats->uncorr_hcs)); ++ printk("tx_cells : %d\n", atomic_read_unchecked(&stats->tx_cells)); ++ printk("rx_cells : 
%d\n", atomic_read_unchecked(&stats->rx_cells)); + } + ia_cmds.status = 0; + break; +@@ -2919,7 +2919,7 @@ static int ia_pkt_tx (struct atm_vcc *vc + if ((desc == 0) || (desc > iadev->num_tx_desc)) + { + IF_ERR(printk(DEV_LABEL "invalid desc for send: %d\n", desc);) +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + if (vcc->pop) + vcc->pop(vcc, skb); + else +@@ -3024,14 +3024,14 @@ static int ia_pkt_tx (struct atm_vcc *vc + ATM_DESC(skb) = vcc->vci; + skb_queue_tail(&iadev->tx_dma_q, skb); + +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + iadev->tx_pkt_cnt++; + /* Increment transaction counter */ + writel(2, iadev->dma+IPHASE5575_TX_COUNTER); + + #if 0 + /* add flow control logic */ +- if (atomic_read(&vcc->stats->tx) % 20 == 0) { ++ if (atomic_read_unchecked(&vcc->stats->tx) % 20 == 0) { + if (iavcc->vc_desc_cnt > 10) { + vcc->tx_quota = vcc->tx_quota * 3 / 4; + printk("Tx1: vcc->tx_quota = %d \n", (u32)vcc->tx_quota ); +diff -urNp linux-2.6.33.1/drivers/atm/lanai.c linux-2.6.33.1/drivers/atm/lanai.c +--- linux-2.6.33.1/drivers/atm/lanai.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/lanai.c 2010-03-20 16:58:39.504511968 -0400 +@@ -1305,7 +1305,7 @@ static void lanai_send_one_aal5(struct l + vcc_tx_add_aal5_trailer(lvcc, skb->len, 0, 0); + lanai_endtx(lanai, lvcc); + lanai_free_skb(lvcc->tx.atmvcc, skb); +- atomic_inc(&lvcc->tx.atmvcc->stats->tx); ++ atomic_inc_unchecked(&lvcc->tx.atmvcc->stats->tx); + } + + /* Try to fill the buffer - don't call unless there is backlog */ +@@ -1428,7 +1428,7 @@ static void vcc_rx_aal5(struct lanai_vcc + ATM_SKB(skb)->vcc = lvcc->rx.atmvcc; + __net_timestamp(skb); + lvcc->rx.atmvcc->push(lvcc->rx.atmvcc, skb); +- atomic_inc(&lvcc->rx.atmvcc->stats->rx); ++ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx); + out: + lvcc->rx.buf.ptr = end; + cardvcc_write(lvcc, endptr, vcc_rxreadptr); +@@ -1670,7 +1670,7 @@ static int handle_service(struct lanai_d + DPRINTK("(itf %d) got RX service entry 0x%X for non-AAL5 " + "vcc %d\n", lanai->number, (unsigned int) s, vci); + lanai->stats.service_rxnotaal5++; +- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); ++ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); + return 0; + } + if (likely(!(s & (SERVICE_TRASH | SERVICE_STREAM | SERVICE_CRCERR)))) { +@@ -1682,7 +1682,7 @@ static int handle_service(struct lanai_d + int bytes; + read_unlock(&vcc_sklist_lock); + DPRINTK("got trashed rx pdu on vci %d\n", vci); +- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); ++ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); + lvcc->stats.x.aal5.service_trash++; + bytes = (SERVICE_GET_END(s) * 16) - + (((unsigned long) lvcc->rx.buf.ptr) - +@@ -1694,7 +1694,7 @@ static int handle_service(struct lanai_d + } + if (s & SERVICE_STREAM) { + read_unlock(&vcc_sklist_lock); +- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); ++ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); + lvcc->stats.x.aal5.service_stream++; + printk(KERN_ERR DEV_LABEL "(itf %d): Got AAL5 stream " + "PDU on VCI %d!\n", lanai->number, vci); +@@ -1702,7 +1702,7 @@ static int handle_service(struct lanai_d + return 0; + } + DPRINTK("got rx crc error on vci %d\n", vci); +- atomic_inc(&lvcc->rx.atmvcc->stats->rx_err); ++ atomic_inc_unchecked(&lvcc->rx.atmvcc->stats->rx_err); + lvcc->stats.x.aal5.service_rxcrc++; + lvcc->rx.buf.ptr = &lvcc->rx.buf.start[SERVICE_GET_END(s) * 4]; + cardvcc_write(lvcc, SERVICE_GET_END(s), vcc_rxreadptr); +diff -urNp linux-2.6.33.1/drivers/atm/nicstar.c 
linux-2.6.33.1/drivers/atm/nicstar.c +--- linux-2.6.33.1/drivers/atm/nicstar.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/nicstar.c 2010-03-20 16:58:39.504511968 -0400 +@@ -1723,7 +1723,7 @@ static int ns_send(struct atm_vcc *vcc, + if ((vc = (vc_map *) vcc->dev_data) == NULL) + { + printk("nicstar%d: vcc->dev_data == NULL on ns_send().\n", card->index); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb_any(skb); + return -EINVAL; + } +@@ -1731,7 +1731,7 @@ static int ns_send(struct atm_vcc *vcc, + if (!vc->tx) + { + printk("nicstar%d: Trying to transmit on a non-tx VC.\n", card->index); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb_any(skb); + return -EINVAL; + } +@@ -1739,7 +1739,7 @@ static int ns_send(struct atm_vcc *vcc, + if (vcc->qos.aal != ATM_AAL5 && vcc->qos.aal != ATM_AAL0) + { + printk("nicstar%d: Only AAL0 and AAL5 are supported.\n", card->index); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb_any(skb); + return -EINVAL; + } +@@ -1747,7 +1747,7 @@ static int ns_send(struct atm_vcc *vcc, + if (skb_shinfo(skb)->nr_frags != 0) + { + printk("nicstar%d: No scatter-gather yet.\n", card->index); +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb_any(skb); + return -EINVAL; + } +@@ -1792,11 +1792,11 @@ static int ns_send(struct atm_vcc *vcc, + + if (push_scqe(card, vc, scq, &scqe, skb) != 0) + { +- atomic_inc(&vcc->stats->tx_err); ++ atomic_inc_unchecked(&vcc->stats->tx_err); + dev_kfree_skb_any(skb); + return -EIO; + } +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + + return 0; + } +@@ -2111,14 +2111,14 @@ static void dequeue_rx(ns_dev *card, ns_ + { + printk("nicstar%d: Can't allocate buffers for aal0.\n", + card->index); +- atomic_add(i,&vcc->stats->rx_drop); ++ atomic_add_unchecked(i,&vcc->stats->rx_drop); + break; + } + if (!atm_charge(vcc, sb->truesize)) + { + RXPRINTK("nicstar%d: atm_charge() dropped aal0 packets.\n", + card->index); +- atomic_add(i-1,&vcc->stats->rx_drop); /* already increased by 1 */ ++ atomic_add_unchecked(i-1,&vcc->stats->rx_drop); /* already increased by 1 */ + dev_kfree_skb_any(sb); + break; + } +@@ -2133,7 +2133,7 @@ static void dequeue_rx(ns_dev *card, ns_ + ATM_SKB(sb)->vcc = vcc; + __net_timestamp(sb); + vcc->push(vcc, sb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + cell += ATM_CELL_PAYLOAD; + } + +@@ -2152,7 +2152,7 @@ static void dequeue_rx(ns_dev *card, ns_ + if (iovb == NULL) + { + printk("nicstar%d: Out of iovec buffers.\n", card->index); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + recycle_rx_buf(card, skb); + return; + } +@@ -2182,7 +2182,7 @@ static void dequeue_rx(ns_dev *card, ns_ + else if (NS_SKB(iovb)->iovcnt >= NS_MAX_IOVECS) + { + printk("nicstar%d: received too big AAL5 SDU.\n", card->index); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, NS_MAX_IOVECS); + NS_SKB(iovb)->iovcnt = 0; + iovb->len = 0; +@@ -2202,7 +2202,7 @@ static void dequeue_rx(ns_dev *card, ns_ + printk("nicstar%d: Expected a small buffer, and this is not one.\n", + card->index); + which_list(card, skb); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + recycle_rx_buf(card, skb); + vc->rx_iov = NULL; + recycle_iov_buf(card, 
iovb); +@@ -2216,7 +2216,7 @@ static void dequeue_rx(ns_dev *card, ns_ + printk("nicstar%d: Expected a large buffer, and this is not one.\n", + card->index); + which_list(card, skb); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, + NS_SKB(iovb)->iovcnt); + vc->rx_iov = NULL; +@@ -2240,7 +2240,7 @@ static void dequeue_rx(ns_dev *card, ns_ + printk(" - PDU size mismatch.\n"); + else + printk(".\n"); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, + NS_SKB(iovb)->iovcnt); + vc->rx_iov = NULL; +@@ -2256,7 +2256,7 @@ static void dequeue_rx(ns_dev *card, ns_ + if (!atm_charge(vcc, skb->truesize)) + { + push_rxbufs(card, skb); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + } + else + { +@@ -2268,7 +2268,7 @@ static void dequeue_rx(ns_dev *card, ns_ + ATM_SKB(skb)->vcc = vcc; + __net_timestamp(skb); + vcc->push(vcc, skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + } + } + else if (NS_SKB(iovb)->iovcnt == 2) /* One small plus one large buffer */ +@@ -2283,7 +2283,7 @@ static void dequeue_rx(ns_dev *card, ns_ + if (!atm_charge(vcc, sb->truesize)) + { + push_rxbufs(card, sb); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + } + else + { +@@ -2295,7 +2295,7 @@ static void dequeue_rx(ns_dev *card, ns_ + ATM_SKB(sb)->vcc = vcc; + __net_timestamp(sb); + vcc->push(vcc, sb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + } + + push_rxbufs(card, skb); +@@ -2306,7 +2306,7 @@ static void dequeue_rx(ns_dev *card, ns_ + if (!atm_charge(vcc, skb->truesize)) + { + push_rxbufs(card, skb); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + } + else + { +@@ -2320,7 +2320,7 @@ static void dequeue_rx(ns_dev *card, ns_ + ATM_SKB(skb)->vcc = vcc; + __net_timestamp(skb); + vcc->push(vcc, skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + } + + push_rxbufs(card, sb); +@@ -2342,7 +2342,7 @@ static void dequeue_rx(ns_dev *card, ns_ + if (hb == NULL) + { + printk("nicstar%d: Out of huge buffers.\n", card->index); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + recycle_iovec_rx_bufs(card, (struct iovec *) iovb->data, + NS_SKB(iovb)->iovcnt); + vc->rx_iov = NULL; +@@ -2393,7 +2393,7 @@ static void dequeue_rx(ns_dev *card, ns_ + } + else + dev_kfree_skb_any(hb); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + } + else + { +@@ -2427,7 +2427,7 @@ static void dequeue_rx(ns_dev *card, ns_ + #endif /* NS_USE_DESTRUCTORS */ + __net_timestamp(hb); + vcc->push(vcc, hb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + } + } + +diff -urNp linux-2.6.33.1/drivers/atm/solos-pci.c linux-2.6.33.1/drivers/atm/solos-pci.c +--- linux-2.6.33.1/drivers/atm/solos-pci.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/solos-pci.c 2010-03-20 16:58:39.512795633 -0400 +@@ -714,7 +714,7 @@ void solos_bh(unsigned long card_arg) + } + atm_charge(vcc, skb->truesize); + vcc->push(vcc, skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + break; + + case PKT_STATUS: +@@ -1017,7 +1017,7 @@ static uint32_t fpga_tx(struct solos_car + vcc = SKB_CB(oldskb)->vcc; + + if (vcc) { +- atomic_inc(&vcc->stats->tx); ++ 
atomic_inc_unchecked(&vcc->stats->tx); + solos_pop(vcc, oldskb); + } else + dev_kfree_skb_irq(oldskb); +diff -urNp linux-2.6.33.1/drivers/atm/suni.c linux-2.6.33.1/drivers/atm/suni.c +--- linux-2.6.33.1/drivers/atm/suni.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/suni.c 2010-03-20 16:58:39.516795609 -0400 +@@ -49,8 +49,8 @@ static DEFINE_SPINLOCK(sunis_lock); + + + #define ADD_LIMITED(s,v) \ +- atomic_add((v),&stats->s); \ +- if (atomic_read(&stats->s) < 0) atomic_set(&stats->s,INT_MAX); ++ atomic_add_unchecked((v),&stats->s); \ ++ if (atomic_read_unchecked(&stats->s) < 0) atomic_set_unchecked(&stats->s,INT_MAX); + + + static void suni_hz(unsigned long from_timer) +diff -urNp linux-2.6.33.1/drivers/atm/uPD98402.c linux-2.6.33.1/drivers/atm/uPD98402.c +--- linux-2.6.33.1/drivers/atm/uPD98402.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/uPD98402.c 2010-03-20 16:58:39.516795609 -0400 +@@ -41,7 +41,7 @@ static int fetch_stats(struct atm_dev *d + struct sonet_stats tmp; + int error = 0; + +- atomic_add(GET(HECCT),&PRIV(dev)->sonet_stats.uncorr_hcs); ++ atomic_add_unchecked(GET(HECCT),&PRIV(dev)->sonet_stats.uncorr_hcs); + sonet_copy_stats(&PRIV(dev)->sonet_stats,&tmp); + if (arg) error = copy_to_user(arg,&tmp,sizeof(tmp)); + if (zero && !error) { +@@ -160,9 +160,9 @@ static int uPD98402_ioctl(struct atm_dev + + + #define ADD_LIMITED(s,v) \ +- { atomic_add(GET(v),&PRIV(dev)->sonet_stats.s); \ +- if (atomic_read(&PRIV(dev)->sonet_stats.s) < 0) \ +- atomic_set(&PRIV(dev)->sonet_stats.s,INT_MAX); } ++ { atomic_add_unchecked(GET(v),&PRIV(dev)->sonet_stats.s); \ ++ if (atomic_read_unchecked(&PRIV(dev)->sonet_stats.s) < 0) \ ++ atomic_set_unchecked(&PRIV(dev)->sonet_stats.s,INT_MAX); } + + + static void stat_event(struct atm_dev *dev) +@@ -193,7 +193,7 @@ static void uPD98402_int(struct atm_dev + if (reason & uPD98402_INT_PFM) stat_event(dev); + if (reason & uPD98402_INT_PCO) { + (void) GET(PCOCR); /* clear interrupt cause */ +- atomic_add(GET(HECCT), ++ atomic_add_unchecked(GET(HECCT), + &PRIV(dev)->sonet_stats.uncorr_hcs); + } + if ((reason & uPD98402_INT_RFO) && +@@ -221,9 +221,9 @@ static int uPD98402_start(struct atm_dev + PUT(~(uPD98402_INT_PFM | uPD98402_INT_ALM | uPD98402_INT_RFO | + uPD98402_INT_LOS),PIMR); /* enable them */ + (void) fetch_stats(dev,NULL,1); /* clear kernel counters */ +- atomic_set(&PRIV(dev)->sonet_stats.corr_hcs,-1); +- atomic_set(&PRIV(dev)->sonet_stats.tx_cells,-1); +- atomic_set(&PRIV(dev)->sonet_stats.rx_cells,-1); ++ atomic_set_unchecked(&PRIV(dev)->sonet_stats.corr_hcs,-1); ++ atomic_set_unchecked(&PRIV(dev)->sonet_stats.tx_cells,-1); ++ atomic_set_unchecked(&PRIV(dev)->sonet_stats.rx_cells,-1); + return 0; + } + +diff -urNp linux-2.6.33.1/drivers/atm/zatm.c linux-2.6.33.1/drivers/atm/zatm.c +--- linux-2.6.33.1/drivers/atm/zatm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/atm/zatm.c 2010-03-20 16:58:39.516795609 -0400 +@@ -458,7 +458,7 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy + } + if (!size) { + dev_kfree_skb_irq(skb); +- if (vcc) atomic_inc(&vcc->stats->rx_err); ++ if (vcc) atomic_inc_unchecked(&vcc->stats->rx_err); + continue; + } + if (!atm_charge(vcc,skb->truesize)) { +@@ -468,7 +468,7 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy + skb->len = size; + ATM_SKB(skb)->vcc = vcc; + vcc->push(vcc,skb); +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + } + zout(pos & 0xffff,MTA(mbx)); + #if 0 /* probably a stupid idea */ +@@ -732,7 +732,7 @@ if (*ZATM_PRV_DSC(skb) != 
(uPD98401_TXPD + skb_queue_head(&zatm_vcc->backlog,skb); + break; + } +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + wake_up(&zatm_vcc->tx_wait); + } + +diff -urNp linux-2.6.33.1/drivers/base/bus.c linux-2.6.33.1/drivers/base/bus.c +--- linux-2.6.33.1/drivers/base/bus.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/base/bus.c 2010-03-20 16:58:39.548804768 -0400 +@@ -70,7 +70,7 @@ static ssize_t drv_attr_store(struct kob + return ret; + } + +-static struct sysfs_ops driver_sysfs_ops = { ++static const struct sysfs_ops driver_sysfs_ops = { + .show = drv_attr_show, + .store = drv_attr_store, + }; +@@ -115,7 +115,7 @@ static ssize_t bus_attr_store(struct kob + return ret; + } + +-static struct sysfs_ops bus_sysfs_ops = { ++static const struct sysfs_ops bus_sysfs_ops = { + .show = bus_attr_show, + .store = bus_attr_store, + }; +@@ -154,7 +154,7 @@ static int bus_uevent_filter(struct kset + return 0; + } + +-static struct kset_uevent_ops bus_uevent_ops = { ++static const struct kset_uevent_ops bus_uevent_ops = { + .filter = bus_uevent_filter, + }; + +diff -urNp linux-2.6.33.1/drivers/base/class.c linux-2.6.33.1/drivers/base/class.c +--- linux-2.6.33.1/drivers/base/class.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/base/class.c 2010-03-20 16:58:39.548804768 -0400 +@@ -63,7 +63,7 @@ static void class_release(struct kobject + kfree(cp); + } + +-static struct sysfs_ops class_sysfs_ops = { ++static const struct sysfs_ops class_sysfs_ops = { + .show = class_attr_show, + .store = class_attr_store, + }; +diff -urNp linux-2.6.33.1/drivers/base/core.c linux-2.6.33.1/drivers/base/core.c +--- linux-2.6.33.1/drivers/base/core.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/base/core.c 2010-03-20 16:58:39.548804768 -0400 +@@ -100,7 +100,7 @@ static ssize_t dev_attr_store(struct kob + return ret; + } + +-static struct sysfs_ops dev_sysfs_ops = { ++static const struct sysfs_ops dev_sysfs_ops = { + .show = dev_attr_show, + .store = dev_attr_store, + }; +@@ -252,7 +252,7 @@ static int dev_uevent(struct kset *kset, + return retval; + } + +-static struct kset_uevent_ops device_uevent_ops = { ++static const struct kset_uevent_ops device_uevent_ops = { + .filter = dev_uevent_filter, + .name = dev_uevent_name, + .uevent = dev_uevent, +diff -urNp linux-2.6.33.1/drivers/base/memory.c linux-2.6.33.1/drivers/base/memory.c +--- linux-2.6.33.1/drivers/base/memory.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/base/memory.c 2010-03-20 16:58:39.553060820 -0400 +@@ -44,7 +44,7 @@ static int memory_uevent(struct kset *ks + return retval; + } + +-static struct kset_uevent_ops memory_uevent_ops = { ++static const struct kset_uevent_ops memory_uevent_ops = { + .name = memory_uevent_name, + .uevent = memory_uevent, + }; +diff -urNp linux-2.6.33.1/drivers/base/sys.c linux-2.6.33.1/drivers/base/sys.c +--- linux-2.6.33.1/drivers/base/sys.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/base/sys.c 2010-03-20 16:58:39.553060820 -0400 +@@ -54,7 +54,7 @@ sysdev_store(struct kobject *kobj, struc + return -EIO; + } + +-static struct sysfs_ops sysfs_ops = { ++static const struct sysfs_ops sysfs_ops = { + .show = sysdev_show, + .store = sysdev_store, + }; +@@ -104,7 +104,7 @@ static ssize_t sysdev_class_store(struct + return -EIO; + } + +-static struct sysfs_ops sysfs_class_ops = { ++static const struct sysfs_ops sysfs_class_ops = { + .show = sysdev_class_show, + .store = sysdev_class_store, + }; +diff -urNp 
linux-2.6.33.1/drivers/block/pktcdvd.c linux-2.6.33.1/drivers/block/pktcdvd.c +--- linux-2.6.33.1/drivers/block/pktcdvd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/block/pktcdvd.c 2010-03-20 16:58:39.568810035 -0400 +@@ -284,7 +284,7 @@ static ssize_t kobj_pkt_store(struct kob + return len; + } + +-static struct sysfs_ops kobj_pkt_ops = { ++static const struct sysfs_ops kobj_pkt_ops = { + .show = kobj_pkt_show, + .store = kobj_pkt_store + }; +diff -urNp linux-2.6.33.1/drivers/char/agp/frontend.c linux-2.6.33.1/drivers/char/agp/frontend.c +--- linux-2.6.33.1/drivers/char/agp/frontend.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/agp/frontend.c 2010-03-20 16:58:39.572812162 -0400 +@@ -818,7 +818,7 @@ static int agpioc_reserve_wrap(struct ag + if (copy_from_user(&reserve, arg, sizeof(struct agp_region))) + return -EFAULT; + +- if ((unsigned) reserve.seg_count >= ~0U/sizeof(struct agp_segment)) ++ if ((unsigned) reserve.seg_count >= ~0U/sizeof(struct agp_segment_priv)) + return -EFAULT; + + client = agp_find_client_by_pid(reserve.pid); +diff -urNp linux-2.6.33.1/drivers/char/agp/intel-agp.c linux-2.6.33.1/drivers/char/agp/intel-agp.c +--- linux-2.6.33.1/drivers/char/agp/intel-agp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/agp/intel-agp.c 2010-03-20 16:58:39.588814817 -0400 +@@ -2575,7 +2575,7 @@ static struct pci_device_id agp_intel_pc + ID(PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB), + ID(PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB), + ID(PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB), +- { } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(pci, agp_intel_pci_table); +diff -urNp linux-2.6.33.1/drivers/char/hpet.c linux-2.6.33.1/drivers/char/hpet.c +--- linux-2.6.33.1/drivers/char/hpet.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hpet.c 2010-03-20 16:58:39.592808427 -0400 +@@ -995,7 +995,7 @@ static struct acpi_driver hpet_acpi_driv + }, + }; + +-static struct miscdevice hpet_misc = { HPET_MINOR, "hpet", &hpet_fops }; ++static struct miscdevice hpet_misc = { HPET_MINOR, "hpet", &hpet_fops, {NULL, NULL}, NULL, NULL }; + + static int __init hpet_init(void) + { +diff -urNp linux-2.6.33.1/drivers/char/hvc_beat.c linux-2.6.33.1/drivers/char/hvc_beat.c +--- linux-2.6.33.1/drivers/char/hvc_beat.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_beat.c 2010-03-20 16:58:39.596613651 -0400 +@@ -84,7 +84,7 @@ static int hvc_beat_put_chars(uint32_t v + return cnt; + } + +-static struct hv_ops hvc_beat_get_put_ops = { ++static const struct hv_ops hvc_beat_get_put_ops = { + .get_chars = hvc_beat_get_chars, + .put_chars = hvc_beat_put_chars, + }; +diff -urNp linux-2.6.33.1/drivers/char/hvc_console.c linux-2.6.33.1/drivers/char/hvc_console.c +--- linux-2.6.33.1/drivers/char/hvc_console.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_console.c 2010-03-20 16:58:39.596613651 -0400 +@@ -125,7 +125,7 @@ static struct hvc_struct *hvc_get_by_ind + * console interfaces but can still be used as a tty device. This has to be + * static because kmalloc will not work during early console init. + */ +-static struct hv_ops *cons_ops[MAX_NR_HVC_CONSOLES]; ++static const struct hv_ops *cons_ops[MAX_NR_HVC_CONSOLES]; + static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] = + {[0 ... MAX_NR_HVC_CONSOLES - 1] = -1}; + +@@ -247,7 +247,7 @@ static void destroy_hvc_struct(struct kr + * vty adapters do NOT get an hvc_instantiate() callback since they + * appear after early console init. 
+ */ +-int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops *ops) ++int hvc_instantiate(uint32_t vtermno, int index, const struct hv_ops *ops) + { + struct hvc_struct *hp; + +@@ -749,7 +749,7 @@ static const struct tty_operations hvc_o + }; + + struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int data, +- struct hv_ops *ops, int outbuf_size) ++ const struct hv_ops *ops, int outbuf_size) + { + struct hvc_struct *hp; + int i; +diff -urNp linux-2.6.33.1/drivers/char/hvc_console.h linux-2.6.33.1/drivers/char/hvc_console.h +--- linux-2.6.33.1/drivers/char/hvc_console.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_console.h 2010-03-20 16:58:39.596613651 -0400 +@@ -55,7 +55,7 @@ struct hvc_struct { + int outbuf_size; + int n_outbuf; + uint32_t vtermno; +- struct hv_ops *ops; ++ const struct hv_ops *ops; + int irq_requested; + int data; + struct winsize ws; +@@ -76,11 +76,11 @@ struct hv_ops { + }; + + /* Register a vterm and a slot index for use as a console (console_init) */ +-extern int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops *ops); ++extern int hvc_instantiate(uint32_t vtermno, int index, const struct hv_ops *ops); + + /* register a vterm for hvc tty operation (module_init or hotplug add) */ + extern struct hvc_struct * __devinit hvc_alloc(uint32_t vtermno, int data, +- struct hv_ops *ops, int outbuf_size); ++ const struct hv_ops *ops, int outbuf_size); + /* remove a vterm from hvc tty operation (module_exit or hotplug remove) */ + extern int hvc_remove(struct hvc_struct *hp); + +diff -urNp linux-2.6.33.1/drivers/char/hvc_iseries.c linux-2.6.33.1/drivers/char/hvc_iseries.c +--- linux-2.6.33.1/drivers/char/hvc_iseries.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_iseries.c 2010-03-20 16:58:39.596613651 -0400 +@@ -197,7 +197,7 @@ done: + return sent; + } + +-static struct hv_ops hvc_get_put_ops = { ++static const struct hv_ops hvc_get_put_ops = { + .get_chars = get_chars, + .put_chars = put_chars, + .notifier_add = notifier_add_irq, +diff -urNp linux-2.6.33.1/drivers/char/hvc_iucv.c linux-2.6.33.1/drivers/char/hvc_iucv.c +--- linux-2.6.33.1/drivers/char/hvc_iucv.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_iucv.c 2010-03-20 16:58:39.596613651 -0400 +@@ -922,7 +922,7 @@ static int hvc_iucv_pm_restore_thaw(stru + + + /* HVC operations */ +-static struct hv_ops hvc_iucv_ops = { ++static const struct hv_ops hvc_iucv_ops = { + .get_chars = hvc_iucv_get_chars, + .put_chars = hvc_iucv_put_chars, + .notifier_add = hvc_iucv_notifier_add, +diff -urNp linux-2.6.33.1/drivers/char/hvc_rtas.c linux-2.6.33.1/drivers/char/hvc_rtas.c +--- linux-2.6.33.1/drivers/char/hvc_rtas.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_rtas.c 2010-03-20 16:58:39.600817341 -0400 +@@ -71,7 +71,7 @@ static int hvc_rtas_read_console(uint32_ + return i; + } + +-static struct hv_ops hvc_rtas_get_put_ops = { ++static const struct hv_ops hvc_rtas_get_put_ops = { + .get_chars = hvc_rtas_read_console, + .put_chars = hvc_rtas_write_console, + }; +diff -urNp linux-2.6.33.1/drivers/char/hvcs.c linux-2.6.33.1/drivers/char/hvcs.c +--- linux-2.6.33.1/drivers/char/hvcs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvcs.c 2010-03-20 16:58:39.600817341 -0400 +@@ -269,7 +269,7 @@ struct hvcs_struct { + unsigned int index; + + struct tty_struct *tty; +- int open_count; ++ atomic_t open_count; + + /* + * Used to tell the driver kernel_thread what operations need to take +@@ 
-419,7 +419,7 @@ static ssize_t hvcs_vterm_state_store(st + + spin_lock_irqsave(&hvcsd->lock, flags); + +- if (hvcsd->open_count > 0) { ++ if (atomic_read(&hvcsd->open_count) > 0) { + spin_unlock_irqrestore(&hvcsd->lock, flags); + printk(KERN_INFO "HVCS: vterm state unchanged. " + "The hvcs device node is still in use.\n"); +@@ -1135,7 +1135,7 @@ static int hvcs_open(struct tty_struct * + if ((retval = hvcs_partner_connect(hvcsd))) + goto error_release; + +- hvcsd->open_count = 1; ++ atomic_set(&hvcsd->open_count, 1); + hvcsd->tty = tty; + tty->driver_data = hvcsd; + +@@ -1169,7 +1169,7 @@ fast_open: + + spin_lock_irqsave(&hvcsd->lock, flags); + kref_get(&hvcsd->kref); +- hvcsd->open_count++; ++ atomic_inc(&hvcsd->open_count); + hvcsd->todo_mask |= HVCS_SCHED_READ; + spin_unlock_irqrestore(&hvcsd->lock, flags); + +@@ -1213,7 +1213,7 @@ static void hvcs_close(struct tty_struct + hvcsd = tty->driver_data; + + spin_lock_irqsave(&hvcsd->lock, flags); +- if (--hvcsd->open_count == 0) { ++ if (atomic_dec_and_test(&hvcsd->open_count)) { + + vio_disable_interrupts(hvcsd->vdev); + +@@ -1239,10 +1239,10 @@ static void hvcs_close(struct tty_struct + free_irq(irq, hvcsd); + kref_put(&hvcsd->kref, destroy_hvcs_struct); + return; +- } else if (hvcsd->open_count < 0) { ++ } else if (atomic_read(&hvcsd->open_count) < 0) { + printk(KERN_ERR "HVCS: vty-server@%X open_count: %d" + " is missmanaged.\n", +- hvcsd->vdev->unit_address, hvcsd->open_count); ++ hvcsd->vdev->unit_address, atomic_read(&hvcsd->open_count)); + } + + spin_unlock_irqrestore(&hvcsd->lock, flags); +@@ -1258,7 +1258,7 @@ static void hvcs_hangup(struct tty_struc + + spin_lock_irqsave(&hvcsd->lock, flags); + /* Preserve this so that we know how many kref refs to put */ +- temp_open_count = hvcsd->open_count; ++ temp_open_count = atomic_read(&hvcsd->open_count); + + /* + * Don't kref put inside the spinlock because the destruction +@@ -1273,7 +1273,7 @@ static void hvcs_hangup(struct tty_struc + hvcsd->tty->driver_data = NULL; + hvcsd->tty = NULL; + +- hvcsd->open_count = 0; ++ atomic_set(&hvcsd->open_count, 0); + + /* This will drop any buffered data on the floor which is OK in a hangup + * scenario. */ +@@ -1344,7 +1344,7 @@ static int hvcs_write(struct tty_struct + * the middle of a write operation? This is a crummy place to do this + * but we want to keep it all in the spinlock. 
+ */ +- if (hvcsd->open_count <= 0) { ++ if (atomic_read(&hvcsd->open_count) <= 0) { + spin_unlock_irqrestore(&hvcsd->lock, flags); + return -ENODEV; + } +@@ -1418,7 +1418,7 @@ static int hvcs_write_room(struct tty_st + { + struct hvcs_struct *hvcsd = tty->driver_data; + +- if (!hvcsd || hvcsd->open_count <= 0) ++ if (!hvcsd || atomic_read(&hvcsd->open_count) <= 0) + return 0; + + return HVCS_BUFF_LEN - hvcsd->chars_in_buffer; +diff -urNp linux-2.6.33.1/drivers/char/hvc_udbg.c linux-2.6.33.1/drivers/char/hvc_udbg.c +--- linux-2.6.33.1/drivers/char/hvc_udbg.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_udbg.c 2010-03-20 16:58:39.600817341 -0400 +@@ -58,7 +58,7 @@ static int hvc_udbg_get(uint32_t vtermno + return i; + } + +-static struct hv_ops hvc_udbg_ops = { ++static const struct hv_ops hvc_udbg_ops = { + .get_chars = hvc_udbg_get, + .put_chars = hvc_udbg_put, + }; +diff -urNp linux-2.6.33.1/drivers/char/hvc_vio.c linux-2.6.33.1/drivers/char/hvc_vio.c +--- linux-2.6.33.1/drivers/char/hvc_vio.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_vio.c 2010-03-20 16:58:39.600817341 -0400 +@@ -77,7 +77,7 @@ static int filtered_get_chars(uint32_t v + return got; + } + +-static struct hv_ops hvc_get_put_ops = { ++static const struct hv_ops hvc_get_put_ops = { + .get_chars = filtered_get_chars, + .put_chars = hvc_put_chars, + .notifier_add = notifier_add_irq, +diff -urNp linux-2.6.33.1/drivers/char/hvc_xen.c linux-2.6.33.1/drivers/char/hvc_xen.c +--- linux-2.6.33.1/drivers/char/hvc_xen.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/hvc_xen.c 2010-03-20 16:58:39.604556535 -0400 +@@ -122,7 +122,7 @@ static int read_console(uint32_t vtermno + return recv; + } + +-static struct hv_ops hvc_ops = { ++static const struct hv_ops hvc_ops = { + .get_chars = read_console, + .put_chars = write_console, + .notifier_add = notifier_add_irq, +diff -urNp linux-2.6.33.1/drivers/char/ipmi/ipmi_msghandler.c linux-2.6.33.1/drivers/char/ipmi/ipmi_msghandler.c +--- linux-2.6.33.1/drivers/char/ipmi/ipmi_msghandler.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/ipmi/ipmi_msghandler.c 2010-03-20 16:58:39.604556535 -0400 +@@ -414,7 +414,7 @@ struct ipmi_smi { + struct proc_dir_entry *proc_dir; + char proc_dir_name[10]; + +- atomic_t stats[IPMI_NUM_STATS]; ++ atomic_unchecked_t stats[IPMI_NUM_STATS]; + + /* + * run_to_completion duplicate of smb_info, smi_info +@@ -447,9 +447,9 @@ static DEFINE_MUTEX(smi_watchers_mutex); + + + #define ipmi_inc_stat(intf, stat) \ +- atomic_inc(&(intf)->stats[IPMI_STAT_ ## stat]) ++ atomic_inc_unchecked(&(intf)->stats[IPMI_STAT_ ## stat]) + #define ipmi_get_stat(intf, stat) \ +- ((unsigned int) atomic_read(&(intf)->stats[IPMI_STAT_ ## stat])) ++ ((unsigned int) atomic_read_unchecked(&(intf)->stats[IPMI_STAT_ ## stat])) + + static int is_lan_addr(struct ipmi_addr *addr) + { +@@ -2808,7 +2808,7 @@ int ipmi_register_smi(struct ipmi_smi_ha + INIT_LIST_HEAD(&intf->cmd_rcvrs); + init_waitqueue_head(&intf->waitq); + for (i = 0; i < IPMI_NUM_STATS; i++) +- atomic_set(&intf->stats[i], 0); ++ atomic_set_unchecked(&intf->stats[i], 0); + + intf->proc_dir = NULL; + +diff -urNp linux-2.6.33.1/drivers/char/ipmi/ipmi_si_intf.c linux-2.6.33.1/drivers/char/ipmi/ipmi_si_intf.c +--- linux-2.6.33.1/drivers/char/ipmi/ipmi_si_intf.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/ipmi/ipmi_si_intf.c 2010-03-20 16:58:39.608700771 -0400 +@@ -278,7 +278,7 @@ struct smi_info { + unsigned char 
slave_addr; + + /* Counters and things for the proc filesystem. */ +- atomic_t stats[SI_NUM_STATS]; ++ atomic_unchecked_t stats[SI_NUM_STATS]; + + struct task_struct *thread; + +@@ -286,9 +286,9 @@ struct smi_info { + }; + + #define smi_inc_stat(smi, stat) \ +- atomic_inc(&(smi)->stats[SI_STAT_ ## stat]) ++ atomic_inc_unchecked(&(smi)->stats[SI_STAT_ ## stat]) + #define smi_get_stat(smi, stat) \ +- ((unsigned int) atomic_read(&(smi)->stats[SI_STAT_ ## stat])) ++ ((unsigned int) atomic_read_unchecked(&(smi)->stats[SI_STAT_ ## stat])) + + #define SI_MAX_PARMS 4 + +@@ -3020,7 +3020,7 @@ static int try_smi_init(struct smi_info + atomic_set(&new_smi->req_events, 0); + new_smi->run_to_completion = 0; + for (i = 0; i < SI_NUM_STATS; i++) +- atomic_set(&new_smi->stats[i], 0); ++ atomic_set_unchecked(&new_smi->stats[i], 0); + + new_smi->interrupt_disabled = 0; + atomic_set(&new_smi->stop_operation, 0); +diff -urNp linux-2.6.33.1/drivers/char/keyboard.c linux-2.6.33.1/drivers/char/keyboard.c +--- linux-2.6.33.1/drivers/char/keyboard.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/keyboard.c 2010-03-20 16:58:39.608700771 -0400 +@@ -652,6 +652,16 @@ static void k_spec(struct vc_data *vc, u + kbd->kbdmode == VC_MEDIUMRAW) && + value != KVAL(K_SAK)) + return; /* SAK is allowed even in raw mode */ ++ ++#if defined(CONFIG_GRKERNSEC_PROC) || defined(CONFIG_GRKERNSEC_PROC_MEMMAP) ++ { ++ void *func = fn_handler[value]; ++ if (func == fn_show_state || func == fn_show_ptregs || ++ func == fn_show_mem) ++ return; ++ } ++#endif ++ + fn_handler[value](vc); + } + +@@ -1405,7 +1415,7 @@ static const struct input_device_id kbd_ + .evbit = { BIT_MASK(EV_SND) }, + }, + +- { }, /* Terminating entry */ ++ { 0 }, /* Terminating entry */ + }; + + MODULE_DEVICE_TABLE(input, kbd_ids); +diff -urNp linux-2.6.33.1/drivers/char/mem.c linux-2.6.33.1/drivers/char/mem.c +--- linux-2.6.33.1/drivers/char/mem.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/mem.c 2010-03-20 16:58:39.608700771 -0400 +@@ -18,6 +18,7 @@ + #include <linux/raw.h> + #include <linux/tty.h> + #include <linux/capability.h> ++#include <linux/security.h> + #include <linux/ptrace.h> + #include <linux/device.h> + #include <linux/highmem.h> +@@ -34,6 +35,10 @@ + # include <linux/efi.h> + #endif + ++#if defined(CONFIG_GRKERNSEC) && !defined(CONFIG_GRKERNSEC_NO_RBAC) ++extern struct file_operations grsec_fops; ++#endif ++ + static inline unsigned long size_inside_page(unsigned long start, + unsigned long size) + { +@@ -191,6 +196,11 @@ static ssize_t write_mem(struct file * f + if (!valid_phys_addr_range(p, count)) + return -EFAULT; + ++#ifdef CONFIG_GRKERNSEC_KMEM ++ gr_handle_mem_write(); ++ return -EPERM; ++#endif ++ + written = 0; + + #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED +@@ -311,6 +321,11 @@ static int mmap_mem(struct file * file, + &vma->vm_page_prot)) + return -EINVAL; + ++#ifdef CONFIG_GRKERNSEC_KMEM ++ if (gr_handle_mem_mmap(vma->vm_pgoff << PAGE_SHIFT, vma)) ++ return -EPERM; ++#endif ++ + vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, + size, + vma->vm_page_prot); +@@ -527,6 +542,11 @@ static ssize_t write_kmem(struct file * + char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ + int err = 0; + ++#ifdef CONFIG_GRKERNSEC_KMEM ++ gr_handle_kmem_write(); ++ return -EPERM; ++#endif ++ + if (p < (unsigned long) high_memory) { + unsigned long to_write = min_t(unsigned long, count, + (unsigned long)high_memory - p); +@@ -727,6 +747,16 @@ static loff_t memory_lseek(struct file * + + 
static int open_port(struct inode * inode, struct file * filp) + { ++#ifdef CONFIG_GRKERNSEC_KMEM ++ gr_handle_open_port(); ++ return -EPERM; ++#endif ++ ++ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; ++} ++ ++static int open_mem(struct inode * inode, struct file * filp) ++{ + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; + } + +@@ -734,7 +764,6 @@ static int open_port(struct inode * inod + #define full_lseek null_lseek + #define write_zero write_null + #define read_full read_zero +-#define open_mem open_port + #define open_kmem open_mem + #define open_oldmem open_mem + +@@ -850,6 +879,9 @@ static const struct memdev { + #ifdef CONFIG_CRASH_DUMP + [12] = { "oldmem", 0, &oldmem_fops, NULL }, + #endif ++#if defined(CONFIG_GRKERNSEC) && !defined(CONFIG_GRKERNSEC_NO_RBAC) ++ [13] = { "grsec",S_IRUSR | S_IWUGO, &grsec_fops, NULL }, ++#endif + }; + + static int memory_open(struct inode *inode, struct file *filp) +diff -urNp linux-2.6.33.1/drivers/char/nvram.c linux-2.6.33.1/drivers/char/nvram.c +--- linux-2.6.33.1/drivers/char/nvram.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/nvram.c 2010-03-20 16:58:39.612523711 -0400 +@@ -246,7 +246,7 @@ static ssize_t nvram_read(struct file *f + + spin_unlock_irq(&rtc_lock); + +- if (copy_to_user(buf, contents, tmp - contents)) ++ if (tmp - contents > sizeof(contents) || copy_to_user(buf, contents, tmp - contents)) + return -EFAULT; + + *ppos = i; +@@ -434,7 +434,10 @@ static const struct file_operations nvra + static struct miscdevice nvram_dev = { + NVRAM_MINOR, + "nvram", +- &nvram_fops ++ &nvram_fops, ++ {NULL, NULL}, ++ NULL, ++ NULL + }; + + static int __init nvram_init(void) +diff -urNp linux-2.6.33.1/drivers/char/pcmcia/ipwireless/tty.c linux-2.6.33.1/drivers/char/pcmcia/ipwireless/tty.c +--- linux-2.6.33.1/drivers/char/pcmcia/ipwireless/tty.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/pcmcia/ipwireless/tty.c 2010-03-20 16:58:39.612523711 -0400 +@@ -51,7 +51,7 @@ struct ipw_tty { + int tty_type; + struct ipw_network *network; + struct tty_struct *linux_tty; +- int open_count; ++ atomic_t open_count; + unsigned int control_lines; + struct mutex ipw_tty_mutex; + int tx_bytes_queued; +@@ -127,10 +127,10 @@ static int ipw_open(struct tty_struct *l + mutex_unlock(&tty->ipw_tty_mutex); + return -ENODEV; + } +- if (tty->open_count == 0) ++ if (atomic_read(&tty->open_count) == 0) + tty->tx_bytes_queued = 0; + +- tty->open_count++; ++ atomic_inc(&tty->open_count); + + tty->linux_tty = linux_tty; + linux_tty->driver_data = tty; +@@ -146,9 +146,7 @@ static int ipw_open(struct tty_struct *l + + static void do_ipw_close(struct ipw_tty *tty) + { +- tty->open_count--; +- +- if (tty->open_count == 0) { ++ if (atomic_dec_return(&tty->open_count) == 0) { + struct tty_struct *linux_tty = tty->linux_tty; + + if (linux_tty != NULL) { +@@ -169,7 +167,7 @@ static void ipw_hangup(struct tty_struct + return; + + mutex_lock(&tty->ipw_tty_mutex); +- if (tty->open_count == 0) { ++ if (atomic_read(&tty->open_count) == 0) { + mutex_unlock(&tty->ipw_tty_mutex); + return; + } +@@ -198,7 +196,7 @@ void ipwireless_tty_received(struct ipw_ + return; + } + +- if (!tty->open_count) { ++ if (!atomic_read(&tty->open_count)) { + mutex_unlock(&tty->ipw_tty_mutex); + return; + } +@@ -240,7 +238,7 @@ static int ipw_write(struct tty_struct * + return -ENODEV; + + mutex_lock(&tty->ipw_tty_mutex); +- if (!tty->open_count) { ++ if (!atomic_read(&tty->open_count)) { + mutex_unlock(&tty->ipw_tty_mutex); + return -EINVAL; + } +@@ -280,7 +278,7 @@ 
static int ipw_write_room(struct tty_str + if (!tty) + return -ENODEV; + +- if (!tty->open_count) ++ if (!atomic_read(&tty->open_count)) + return -EINVAL; + + room = IPWIRELESS_TX_QUEUE_SIZE - tty->tx_bytes_queued; +@@ -322,7 +320,7 @@ static int ipw_chars_in_buffer(struct tt + if (!tty) + return 0; + +- if (!tty->open_count) ++ if (!atomic_read(&tty->open_count)) + return 0; + + return tty->tx_bytes_queued; +@@ -403,7 +401,7 @@ static int ipw_tiocmget(struct tty_struc + if (!tty) + return -ENODEV; + +- if (!tty->open_count) ++ if (!atomic_read(&tty->open_count)) + return -EINVAL; + + return get_control_lines(tty); +@@ -419,7 +417,7 @@ ipw_tiocmset(struct tty_struct *linux_tt + if (!tty) + return -ENODEV; + +- if (!tty->open_count) ++ if (!atomic_read(&tty->open_count)) + return -EINVAL; + + return set_control_lines(tty, set, clear); +@@ -433,7 +431,7 @@ static int ipw_ioctl(struct tty_struct * + if (!tty) + return -ENODEV; + +- if (!tty->open_count) ++ if (!atomic_read(&tty->open_count)) + return -EINVAL; + + /* FIXME: Exactly how is the tty object locked here .. */ +@@ -591,7 +589,7 @@ void ipwireless_tty_free(struct ipw_tty + against a parallel ioctl etc */ + mutex_lock(&ttyj->ipw_tty_mutex); + } +- while (ttyj->open_count) ++ while (atomic_read(&ttyj->open_count)) + do_ipw_close(ttyj); + ipwireless_disassociate_network_ttys(network, + ttyj->channel_idx); +diff -urNp linux-2.6.33.1/drivers/char/pty.c linux-2.6.33.1/drivers/char/pty.c +--- linux-2.6.33.1/drivers/char/pty.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/pty.c 2010-03-20 16:58:39.612523711 -0400 +@@ -676,7 +676,18 @@ static int ptmx_open(struct inode *inode + return ret; + } + +-static struct file_operations ptmx_fops; ++static const struct file_operations ptmx_fops = { ++ .llseek = no_llseek, ++ .read = tty_read, ++ .write = tty_write, ++ .poll = tty_poll, ++ .unlocked_ioctl = tty_ioctl, ++ .compat_ioctl = tty_compat_ioctl, ++ .open = ptmx_open, ++ .release = tty_release, ++ .fasync = tty_fasync, ++}; ++ + + static void __init unix98_pty_init(void) + { +@@ -730,9 +741,6 @@ static void __init unix98_pty_init(void) + register_sysctl_table(pty_root_table); + + /* Now create the /dev/ptmx special device */ +- tty_default_fops(&ptmx_fops); +- ptmx_fops.open = ptmx_open; +- + cdev_init(&ptmx_cdev, &ptmx_fops); + if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || + register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) +diff -urNp linux-2.6.33.1/drivers/char/random.c linux-2.6.33.1/drivers/char/random.c +--- linux-2.6.33.1/drivers/char/random.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/random.c 2010-03-20 16:58:39.620815282 -0400 +@@ -254,8 +254,13 @@ + /* + * Configuration information + */ ++#ifdef CONFIG_GRKERNSEC_RANDNET ++#define INPUT_POOL_WORDS 512 ++#define OUTPUT_POOL_WORDS 128 ++#else + #define INPUT_POOL_WORDS 128 + #define OUTPUT_POOL_WORDS 32 ++#endif + #define SEC_XFER_SIZE 512 + + /* +@@ -292,10 +297,17 @@ static struct poolinfo { + int poolwords; + int tap1, tap2, tap3, tap4, tap5; + } poolinfo_table[] = { ++#ifdef CONFIG_GRKERNSEC_RANDNET ++ /* x^512 + x^411 + x^308 + x^208 +x^104 + x + 1 -- 225 */ ++ { 512, 411, 308, 208, 104, 1 }, ++ /* x^128 + x^103 + x^76 + x^51 + x^25 + x + 1 -- 105 */ ++ { 128, 103, 76, 51, 25, 1 }, ++#else + /* x^128 + x^103 + x^76 + x^51 +x^25 + x + 1 -- 105 */ + { 128, 103, 76, 51, 25, 1 }, + /* x^32 + x^26 + x^20 + x^14 + x^7 + x + 1 -- 15 */ + { 32, 26, 20, 14, 7, 1 }, ++#endif + #if 0 + /* x^2048 + x^1638 + x^1231 + 
x^819 + x^411 + x + 1 -- 115 */ + { 2048, 1638, 1231, 819, 411, 1 }, +@@ -903,7 +915,7 @@ static ssize_t extract_entropy_user(stru + + extract_buf(r, tmp); + i = min_t(int, nbytes, EXTRACT_SIZE); +- if (copy_to_user(buf, tmp, i)) { ++ if (i > sizeof(tmp) || copy_to_user(buf, tmp, i)) { + ret = -EFAULT; + break; + } +@@ -1209,7 +1221,7 @@ EXPORT_SYMBOL(generate_random_uuid); + #include <linux/sysctl.h> + + static int min_read_thresh = 8, min_write_thresh; +-static int max_read_thresh = INPUT_POOL_WORDS * 32; ++static int max_read_thresh = OUTPUT_POOL_WORDS * 32; + static int max_write_thresh = INPUT_POOL_WORDS * 32; + static char sysctl_bootid[16]; + +diff -urNp linux-2.6.33.1/drivers/char/sonypi.c linux-2.6.33.1/drivers/char/sonypi.c +--- linux-2.6.33.1/drivers/char/sonypi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/sonypi.c 2010-03-20 16:58:39.624516836 -0400 +@@ -490,7 +490,7 @@ static struct sonypi_device { + spinlock_t fifo_lock; + wait_queue_head_t fifo_proc_list; + struct fasync_struct *fifo_async; +- int open_count; ++ atomic_t open_count; + int model; + struct input_dev *input_jog_dev; + struct input_dev *input_key_dev; +@@ -897,7 +897,7 @@ static int sonypi_misc_fasync(int fd, st + static int sonypi_misc_release(struct inode *inode, struct file *file) + { + mutex_lock(&sonypi_device.lock); +- sonypi_device.open_count--; ++ atomic_dec(&sonypi_device.open_count); + mutex_unlock(&sonypi_device.lock); + return 0; + } +@@ -906,9 +906,9 @@ static int sonypi_misc_open(struct inode + { + mutex_lock(&sonypi_device.lock); + /* Flush input queue on first open */ +- if (!sonypi_device.open_count) ++ if (!atomic_read(&sonypi_device.open_count)) + kfifo_reset(&sonypi_device.fifo); +- sonypi_device.open_count++; ++ atomic_inc(&sonypi_device.open_count); + mutex_unlock(&sonypi_device.lock); + + return 0; +diff -urNp linux-2.6.33.1/drivers/char/tpm/tpm_bios.c linux-2.6.33.1/drivers/char/tpm/tpm_bios.c +--- linux-2.6.33.1/drivers/char/tpm/tpm_bios.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/tpm/tpm_bios.c 2010-03-20 16:58:39.624516836 -0400 +@@ -172,7 +172,7 @@ static void *tpm_bios_measurements_start + event = addr; + + if ((event->event_type == 0 && event->event_size == 0) || +- ((addr + sizeof(struct tcpa_event) + event->event_size) >= limit)) ++ (event->event_size >= limit - addr - sizeof(struct tcpa_event))) + return NULL; + + return addr; +@@ -197,7 +197,7 @@ static void *tpm_bios_measurements_next( + return NULL; + + if ((event->event_type == 0 && event->event_size == 0) || +- ((v + sizeof(struct tcpa_event) + event->event_size) >= limit)) ++ (event->event_size >= limit - v - sizeof(struct tcpa_event))) + return NULL; + + (*pos)++; +@@ -290,7 +290,8 @@ static int tpm_binary_bios_measurements_ + int i; + + for (i = 0; i < sizeof(struct tcpa_event) + event->event_size; i++) +- seq_putc(m, data[i]); ++ if (!seq_putc(m, data[i])) ++ return -EFAULT; + + return 0; + } +@@ -409,6 +410,11 @@ static int read_log(struct tpm_bios_log + log->bios_event_log_end = log->bios_event_log + len; + + virt = acpi_os_map_memory(start, len); ++ if (!virt) { ++ kfree(log->bios_event_log); ++ log->bios_event_log = NULL; ++ return -EFAULT; ++ } + + memcpy(log->bios_event_log, virt, len); + +diff -urNp linux-2.6.33.1/drivers/char/tty_io.c linux-2.6.33.1/drivers/char/tty_io.c +--- linux-2.6.33.1/drivers/char/tty_io.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/tty_io.c 2010-03-20 16:58:39.628796519 -0400 +@@ -136,20 +136,10 @@ 
LIST_HEAD(tty_drivers); /* linked list + DEFINE_MUTEX(tty_mutex); + EXPORT_SYMBOL(tty_mutex); + +-static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *); +-static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *); + ssize_t redirected_tty_write(struct file *, const char __user *, + size_t, loff_t *); +-static unsigned int tty_poll(struct file *, poll_table *); + static int tty_open(struct inode *, struct file *); + long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +-#ifdef CONFIG_COMPAT +-static long tty_compat_ioctl(struct file *file, unsigned int cmd, +- unsigned long arg); +-#else +-#define tty_compat_ioctl NULL +-#endif +-static int tty_fasync(int fd, struct file *filp, int on); + static void release_tty(struct tty_struct *tty, int idx); + static void __proc_set_tty(struct task_struct *tsk, struct tty_struct *tty); + static void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty); +@@ -871,7 +861,7 @@ EXPORT_SYMBOL(start_tty); + * read calls may be outstanding in parallel. + */ + +-static ssize_t tty_read(struct file *file, char __user *buf, size_t count, ++ssize_t tty_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) + { + int i; +@@ -899,6 +889,8 @@ static ssize_t tty_read(struct file *fil + return i; + } + ++EXPORT_SYMBOL(tty_read); ++ + void tty_write_unlock(struct tty_struct *tty) + { + mutex_unlock(&tty->atomic_write_lock); +@@ -1048,7 +1040,7 @@ void tty_write_message(struct tty_struct + * write method will not be invoked in parallel for each device. + */ + +-static ssize_t tty_write(struct file *file, const char __user *buf, ++ssize_t tty_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { + struct tty_struct *tty; +@@ -1075,6 +1067,8 @@ static ssize_t tty_write(struct file *fi + return ret; + } + ++EXPORT_SYMBOL(tty_write); ++ + ssize_t redirected_tty_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { +@@ -1894,6 +1888,8 @@ got_driver: + + + ++EXPORT_SYMBOL(tty_release); ++ + /** + * tty_poll - check tty status + * @filp: file being polled +@@ -1906,7 +1902,7 @@ got_driver: + * may be re-entered freely by other callers. 
+ */ + +-static unsigned int tty_poll(struct file *filp, poll_table *wait) ++unsigned int tty_poll(struct file *filp, poll_table *wait) + { + struct tty_struct *tty; + struct tty_ldisc *ld; +@@ -1923,7 +1919,9 @@ static unsigned int tty_poll(struct file + return ret; + } + +-static int tty_fasync(int fd, struct file *filp, int on) ++EXPORT_SYMBOL(tty_poll); ++ ++int tty_fasync(int fd, struct file *filp, int on) + { + struct tty_struct *tty; + unsigned long flags; +@@ -1967,6 +1965,8 @@ out: + return retval; + } + ++EXPORT_SYMBOL(tty_fasync); ++ + /** + * tiocsti - fake input character + * @tty: tty to fake input into +@@ -2599,8 +2599,10 @@ long tty_ioctl(struct file *file, unsign + return retval; + } + ++EXPORT_SYMBOL(tty_ioctl); ++ + #ifdef CONFIG_COMPAT +-static long tty_compat_ioctl(struct file *file, unsigned int cmd, ++long tty_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) + { + struct inode *inode = file->f_dentry->d_inode; +@@ -2624,6 +2626,9 @@ static long tty_compat_ioctl(struct file + + return retval; + } ++ ++EXPORT_SYMBOL(tty_compat_ioctl); ++ + #endif + + /* +@@ -3067,11 +3072,6 @@ struct tty_struct *get_current_tty(void) + } + EXPORT_SYMBOL_GPL(get_current_tty); + +-void tty_default_fops(struct file_operations *fops) +-{ +- *fops = tty_fops; +-} +- + /* + * Initialize the console device. This is called *early*, so + * we can't necessarily depend on lots of kernel help here. +diff -urNp linux-2.6.33.1/drivers/char/tty_ldisc.c linux-2.6.33.1/drivers/char/tty_ldisc.c +--- linux-2.6.33.1/drivers/char/tty_ldisc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/tty_ldisc.c 2010-03-20 16:58:39.628796519 -0400 +@@ -75,7 +75,7 @@ static void put_ldisc(struct tty_ldisc * + if (atomic_dec_and_lock(&ld->users, &tty_ldisc_lock)) { + struct tty_ldisc_ops *ldo = ld->ops; + +- ldo->refcount--; ++ atomic_dec(&ldo->refcount); + module_put(ldo->owner); + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + +@@ -109,7 +109,7 @@ int tty_register_ldisc(int disc, struct + spin_lock_irqsave(&tty_ldisc_lock, flags); + tty_ldiscs[disc] = new_ldisc; + new_ldisc->num = disc; +- new_ldisc->refcount = 0; ++ atomic_set(&new_ldisc->refcount, 0); + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + + return ret; +@@ -137,7 +137,7 @@ int tty_unregister_ldisc(int disc) + return -EINVAL; + + spin_lock_irqsave(&tty_ldisc_lock, flags); +- if (tty_ldiscs[disc]->refcount) ++ if (atomic_read(&tty_ldiscs[disc]->refcount)) + ret = -EBUSY; + else + tty_ldiscs[disc] = NULL; +@@ -158,7 +158,7 @@ static struct tty_ldisc_ops *get_ldops(i + if (ldops) { + ret = ERR_PTR(-EAGAIN); + if (try_module_get(ldops->owner)) { +- ldops->refcount++; ++ atomic_inc(&ldops->refcount); + ret = ldops; + } + } +@@ -171,7 +171,7 @@ static void put_ldops(struct tty_ldisc_o + unsigned long flags; + + spin_lock_irqsave(&tty_ldisc_lock, flags); +- ldops->refcount--; ++ atomic_dec(&ldops->refcount); + module_put(ldops->owner); + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + } +diff -urNp linux-2.6.33.1/drivers/char/virtio_console.c linux-2.6.33.1/drivers/char/virtio_console.c +--- linux-2.6.33.1/drivers/char/virtio_console.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/virtio_console.c 2010-03-20 16:58:39.628796519 -0400 +@@ -44,6 +44,7 @@ static unsigned int in_len; + static char *in, *inbuf; + + /* The operations for our console. 
*/ ++/* cannot be const */ + static struct hv_ops virtio_cons; + + /* The hvc device */ +diff -urNp linux-2.6.33.1/drivers/char/vt_ioctl.c linux-2.6.33.1/drivers/char/vt_ioctl.c +--- linux-2.6.33.1/drivers/char/vt_ioctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/char/vt_ioctl.c 2010-03-20 16:58:39.632763861 -0400 +@@ -226,6 +226,12 @@ do_kdsk_ioctl(int cmd, struct kbentry __ + case KDSKBENT: + if (!perm) + return -EPERM; ++ ++#ifdef CONFIG_GRKERNSEC ++ if (!capable(CAP_SYS_TTY_CONFIG)) ++ return -EPERM; ++#endif ++ + if (!i && v == K_NOSUCHMAP) { + /* deallocate map */ + key_map = key_maps[s]; +@@ -366,6 +372,13 @@ do_kdgkb_ioctl(int cmd, struct kbsentry + goto reterr; + } + ++#ifdef CONFIG_GRKERNSEC ++ if (!capable(CAP_SYS_TTY_CONFIG)) { ++ ret = -EPERM; ++ goto reterr; ++ } ++#endif ++ + q = func_table[i]; + first_free = funcbufptr + (funcbufsize - funcbufleft); + for (j = i+1; j < MAX_NR_FUNC && !func_table[j]; j++) +diff -urNp linux-2.6.33.1/drivers/cpufreq/cpufreq.c linux-2.6.33.1/drivers/cpufreq/cpufreq.c +--- linux-2.6.33.1/drivers/cpufreq/cpufreq.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/cpufreq/cpufreq.c 2010-03-20 16:58:39.632763861 -0400 +@@ -766,7 +766,7 @@ static void cpufreq_sysfs_release(struct + complete(&policy->kobj_unregister); + } + +-static struct sysfs_ops sysfs_ops = { ++static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, + }; +diff -urNp linux-2.6.33.1/drivers/cpuidle/sysfs.c linux-2.6.33.1/drivers/cpuidle/sysfs.c +--- linux-2.6.33.1/drivers/cpuidle/sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/cpuidle/sysfs.c 2010-03-20 16:58:39.640816327 -0400 +@@ -191,7 +191,7 @@ static ssize_t cpuidle_store(struct kobj + return ret; + } + +-static struct sysfs_ops cpuidle_sysfs_ops = { ++static const struct sysfs_ops cpuidle_sysfs_ops = { + .show = cpuidle_show, + .store = cpuidle_store, + }; +@@ -277,7 +277,7 @@ static ssize_t cpuidle_state_show(struct + return ret; + } + +-static struct sysfs_ops cpuidle_state_sysfs_ops = { ++static const struct sysfs_ops cpuidle_state_sysfs_ops = { + .show = cpuidle_state_show, + }; + +diff -urNp linux-2.6.33.1/drivers/dma/ioat/dma.c linux-2.6.33.1/drivers/dma/ioat/dma.c +--- linux-2.6.33.1/drivers/dma/ioat/dma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/dma/ioat/dma.c 2010-03-20 16:58:39.664561584 -0400 +@@ -1146,7 +1146,7 @@ ioat_attr_show(struct kobject *kobj, str + return entry->show(&chan->common, page); + } + +-struct sysfs_ops ioat_sysfs_ops = { ++const struct sysfs_ops ioat_sysfs_ops = { + .show = ioat_attr_show, + }; + +diff -urNp linux-2.6.33.1/drivers/dma/ioat/dma.h linux-2.6.33.1/drivers/dma/ioat/dma.h +--- linux-2.6.33.1/drivers/dma/ioat/dma.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/dma/ioat/dma.h 2010-03-20 16:58:39.664561584 -0400 +@@ -347,7 +347,7 @@ bool ioat_cleanup_preamble(struct ioat_c + unsigned long *phys_complete); + void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type); + void ioat_kobject_del(struct ioatdma_device *device); +-extern struct sysfs_ops ioat_sysfs_ops; ++extern const struct sysfs_ops ioat_sysfs_ops; + extern struct ioat_sysfs_entry ioat_version_attr; + extern struct ioat_sysfs_entry ioat_cap_attr; + #endif /* IOATDMA_H */ +diff -urNp linux-2.6.33.1/drivers/edac/edac_core.h linux-2.6.33.1/drivers/edac/edac_core.h +--- linux-2.6.33.1/drivers/edac/edac_core.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/edac/edac_core.h 
2010-03-20 16:58:39.668813885 -0400 +@@ -100,11 +100,11 @@ extern const char *edac_mem_types[]; + + #else /* !CONFIG_EDAC_DEBUG */ + +-#define debugf0( ... ) +-#define debugf1( ... ) +-#define debugf2( ... ) +-#define debugf3( ... ) +-#define debugf4( ... ) ++#define debugf0( ... ) do {} while (0) ++#define debugf1( ... ) do {} while (0) ++#define debugf2( ... ) do {} while (0) ++#define debugf3( ... ) do {} while (0) ++#define debugf4( ... ) do {} while (0) + + #endif /* !CONFIG_EDAC_DEBUG */ + +diff -urNp linux-2.6.33.1/drivers/edac/edac_device_sysfs.c linux-2.6.33.1/drivers/edac/edac_device_sysfs.c +--- linux-2.6.33.1/drivers/edac/edac_device_sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/edac/edac_device_sysfs.c 2010-03-20 16:58:39.672639279 -0400 +@@ -137,7 +137,7 @@ static ssize_t edac_dev_ctl_info_store(s + } + + /* edac_dev file operations for an 'ctl_info' */ +-static struct sysfs_ops device_ctl_info_ops = { ++static const struct sysfs_ops device_ctl_info_ops = { + .show = edac_dev_ctl_info_show, + .store = edac_dev_ctl_info_store + }; +@@ -373,7 +373,7 @@ static ssize_t edac_dev_instance_store(s + } + + /* edac_dev file operations for an 'instance' */ +-static struct sysfs_ops device_instance_ops = { ++static const struct sysfs_ops device_instance_ops = { + .show = edac_dev_instance_show, + .store = edac_dev_instance_store + }; +@@ -476,7 +476,7 @@ static ssize_t edac_dev_block_store(stru + } + + /* edac_dev file operations for a 'block' */ +-static struct sysfs_ops device_block_ops = { ++static const struct sysfs_ops device_block_ops = { + .show = edac_dev_block_show, + .store = edac_dev_block_store + }; +diff -urNp linux-2.6.33.1/drivers/edac/edac_mc_sysfs.c linux-2.6.33.1/drivers/edac/edac_mc_sysfs.c +--- linux-2.6.33.1/drivers/edac/edac_mc_sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/edac/edac_mc_sysfs.c 2010-03-20 16:58:39.676823381 -0400 +@@ -245,7 +245,7 @@ static ssize_t csrowdev_store(struct kob + return -EIO; + } + +-static struct sysfs_ops csrowfs_ops = { ++static const struct sysfs_ops csrowfs_ops = { + .show = csrowdev_show, + .store = csrowdev_store + }; +@@ -575,7 +575,7 @@ static ssize_t mcidev_store(struct kobje + } + + /* Intermediate show/store table */ +-static struct sysfs_ops mci_ops = { ++static const struct sysfs_ops mci_ops = { + .show = mcidev_show, + .store = mcidev_store + }; +diff -urNp linux-2.6.33.1/drivers/edac/edac_pci_sysfs.c linux-2.6.33.1/drivers/edac/edac_pci_sysfs.c +--- linux-2.6.33.1/drivers/edac/edac_pci_sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/edac/edac_pci_sysfs.c 2010-03-20 16:58:39.680817937 -0400 +@@ -121,7 +121,7 @@ static ssize_t edac_pci_instance_store(s + } + + /* fs_ops table */ +-static struct sysfs_ops pci_instance_ops = { ++static const struct sysfs_ops pci_instance_ops = { + .show = edac_pci_instance_show, + .store = edac_pci_instance_store + }; +@@ -261,7 +261,7 @@ static ssize_t edac_pci_dev_store(struct + return -EIO; + } + +-static struct sysfs_ops edac_pci_sysfs_ops = { ++static const struct sysfs_ops edac_pci_sysfs_ops = { + .show = edac_pci_dev_show, + .store = edac_pci_dev_store + }; +diff -urNp linux-2.6.33.1/drivers/firmware/dmi_scan.c linux-2.6.33.1/drivers/firmware/dmi_scan.c +--- linux-2.6.33.1/drivers/firmware/dmi_scan.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/firmware/dmi_scan.c 2010-03-20 16:58:39.680817937 -0400 +@@ -388,11 +388,6 @@ void __init dmi_scan_machine(void) + } + } + else { +- /* +- * no 
iounmap() for that ioremap(); it would be a no-op, but +- * it's so early in setup that sucker gets confused into doing +- * what it shouldn't if we actually call it. +- */ + p = dmi_ioremap(0xF0000, 0x10000); + if (p == NULL) + goto error; +diff -urNp linux-2.6.33.1/drivers/firmware/edd.c linux-2.6.33.1/drivers/firmware/edd.c +--- linux-2.6.33.1/drivers/firmware/edd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/firmware/edd.c 2010-03-20 16:58:39.680817937 -0400 +@@ -122,7 +122,7 @@ edd_attr_show(struct kobject * kobj, str + return ret; + } + +-static struct sysfs_ops edd_attr_ops = { ++static const struct sysfs_ops edd_attr_ops = { + .show = edd_attr_show, + }; + +diff -urNp linux-2.6.33.1/drivers/firmware/efivars.c linux-2.6.33.1/drivers/firmware/efivars.c +--- linux-2.6.33.1/drivers/firmware/efivars.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/firmware/efivars.c 2010-03-20 16:58:39.680817937 -0400 +@@ -362,7 +362,7 @@ static ssize_t efivar_attr_store(struct + return ret; + } + +-static struct sysfs_ops efivar_attr_ops = { ++static const struct sysfs_ops efivar_attr_ops = { + .show = efivar_attr_show, + .store = efivar_attr_store, + }; +diff -urNp linux-2.6.33.1/drivers/firmware/iscsi_ibft.c linux-2.6.33.1/drivers/firmware/iscsi_ibft.c +--- linux-2.6.33.1/drivers/firmware/iscsi_ibft.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/firmware/iscsi_ibft.c 2010-03-20 16:58:39.680817937 -0400 +@@ -525,7 +525,7 @@ static ssize_t ibft_show_attribute(struc + return ret; + } + +-static struct sysfs_ops ibft_attr_ops = { ++static const struct sysfs_ops ibft_attr_ops = { + .show = ibft_show_attribute, + }; + +diff -urNp linux-2.6.33.1/drivers/firmware/memmap.c linux-2.6.33.1/drivers/firmware/memmap.c +--- linux-2.6.33.1/drivers/firmware/memmap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/firmware/memmap.c 2010-03-20 16:58:39.684538507 -0400 +@@ -74,7 +74,7 @@ static struct attribute *def_attrs[] = { + NULL + }; + +-static struct sysfs_ops memmap_attr_ops = { ++static const struct sysfs_ops memmap_attr_ops = { + .show = memmap_attr_show, + }; + +diff -urNp linux-2.6.33.1/drivers/gpu/drm/drm_drv.c linux-2.6.33.1/drivers/gpu/drm/drm_drv.c +--- linux-2.6.33.1/drivers/gpu/drm/drm_drv.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/drm_drv.c 2010-03-20 16:58:39.696814507 -0400 +@@ -448,7 +448,7 @@ long drm_ioctl(struct file *filp, + + dev = file_priv->minor->dev; + atomic_inc(&dev->ioctl_count); +- atomic_inc(&dev->counts[_DRM_STAT_IOCTLS]); ++ atomic_inc_unchecked(&dev->counts[_DRM_STAT_IOCTLS]); + ++file_priv->ioctl_count; + + DRM_DEBUG("pid=%d, cmd=0x%02x, nr=0x%02x, dev 0x%lx, auth=%d\n", +diff -urNp linux-2.6.33.1/drivers/gpu/drm/drm_fops.c linux-2.6.33.1/drivers/gpu/drm/drm_fops.c +--- linux-2.6.33.1/drivers/gpu/drm/drm_fops.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/drm_fops.c 2010-03-20 16:58:39.704802989 -0400 +@@ -66,7 +66,7 @@ static int drm_setup(struct drm_device * + } + + for (i = 0; i < ARRAY_SIZE(dev->counts); i++) +- atomic_set(&dev->counts[i], 0); ++ atomic_set_unchecked(&dev->counts[i], 0); + + dev->sigdata.lock = NULL; + +@@ -130,9 +130,9 @@ int drm_open(struct inode *inode, struct + + retcode = drm_open_helper(inode, filp, dev); + if (!retcode) { +- atomic_inc(&dev->counts[_DRM_STAT_OPENS]); ++ atomic_inc_unchecked(&dev->counts[_DRM_STAT_OPENS]); + spin_lock(&dev->count_lock); +- if (!dev->open_count++) { ++ if (atomic_inc_return(&dev->open_count) == 
1) { + spin_unlock(&dev->count_lock); + retcode = drm_setup(dev); + goto out; +@@ -472,7 +472,7 @@ int drm_release(struct inode *inode, str + + lock_kernel(); + +- DRM_DEBUG("open_count = %d\n", dev->open_count); ++ DRM_DEBUG("open_count = %d\n", atomic_read(&dev->open_count)); + + if (dev->driver->preclose) + dev->driver->preclose(dev, file_priv); +@@ -484,7 +484,7 @@ int drm_release(struct inode *inode, str + DRM_DEBUG("pid = %d, device = 0x%lx, open_count = %d\n", + task_pid_nr(current), + (long)old_encode_dev(file_priv->minor->device), +- dev->open_count); ++ atomic_read(&dev->open_count)); + + /* if the master has gone away we can't do anything with the lock */ + if (file_priv->minor->master) +@@ -565,9 +565,9 @@ int drm_release(struct inode *inode, str + * End inline drm_release + */ + +- atomic_inc(&dev->counts[_DRM_STAT_CLOSES]); ++ atomic_inc_unchecked(&dev->counts[_DRM_STAT_CLOSES]); + spin_lock(&dev->count_lock); +- if (!--dev->open_count) { ++ if (atomic_dec_and_test(&dev->open_count)) { + if (atomic_read(&dev->ioctl_count)) { + DRM_ERROR("Device busy: %d\n", + atomic_read(&dev->ioctl_count)); +diff -urNp linux-2.6.33.1/drivers/gpu/drm/drm_ioctl.c linux-2.6.33.1/drivers/gpu/drm/drm_ioctl.c +--- linux-2.6.33.1/drivers/gpu/drm/drm_ioctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/drm_ioctl.c 2010-03-20 16:58:39.704802989 -0400 +@@ -283,7 +283,7 @@ int drm_getstats(struct drm_device *dev, + stats->data[i].value = + (file_priv->master->lock.hw_lock ? file_priv->master->lock.hw_lock->lock : 0); + else +- stats->data[i].value = atomic_read(&dev->counts[i]); ++ stats->data[i].value = atomic_read_unchecked(&dev->counts[i]); + stats->data[i].type = dev->types[i]; + } + +diff -urNp linux-2.6.33.1/drivers/gpu/drm/drm_lock.c linux-2.6.33.1/drivers/gpu/drm/drm_lock.c +--- linux-2.6.33.1/drivers/gpu/drm/drm_lock.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/drm_lock.c 2010-03-20 16:58:39.712815966 -0400 +@@ -87,7 +87,7 @@ int drm_lock(struct drm_device *dev, voi + if (drm_lock_take(&master->lock, lock->context)) { + master->lock.file_priv = file_priv; + master->lock.lock_time = jiffies; +- atomic_inc(&dev->counts[_DRM_STAT_LOCKS]); ++ atomic_inc_unchecked(&dev->counts[_DRM_STAT_LOCKS]); + break; /* Got lock */ + } + +@@ -165,7 +165,7 @@ int drm_unlock(struct drm_device *dev, v + return -EINVAL; + } + +- atomic_inc(&dev->counts[_DRM_STAT_UNLOCKS]); ++ atomic_inc_unchecked(&dev->counts[_DRM_STAT_UNLOCKS]); + + /* kernel_context_switch isn't used by any of the x86 drm + * modules but is required by the Sparc driver. 
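Annotation (not part of the patch): the atomic conversions in the hunks above mix two distinct patterns from the PaX REFCOUNT feature bundled in this grsecurity release. Lifetime and open-tracking counters (hvcsd->open_count, ipw_tty/sonypi open_count, drm dev->open_count) are retyped from plain int to atomic_t, so they are manipulated through the kernel's atomic ops, which REFCOUNT instruments with overflow detection. Pure statistics counters that may legitimately wrap — the ATM vcc->stats rx/tx fields, the IPMI stats arrays, the DRM counts[] array — are instead retyped to atomic_unchecked_t so incrementing them does not trip the overflow trap. A minimal sketch of the unchecked variant, assuming GCC atomic builtins; the patch itself defines these per architecture, typically with inline assembly, so this is an illustration rather than the patch's own definition:

typedef struct {
	int counter;
} atomic_unchecked_t;

/* Increment without the REFCOUNT overflow trap; wrapping is deliberate
 * for statistics counters. */
static inline void atomic_inc_unchecked(atomic_unchecked_t *v)
{
	__sync_fetch_and_add(&v->counter, 1);
}

static inline int atomic_read_unchecked(const atomic_unchecked_t *v)
{
	/* mirrors the kernel's atomic_read(): a single volatile load */
	return *(volatile const int *)&v->counter;
}

static inline void atomic_set_unchecked(atomic_unchecked_t *v, int i)
{
	v->counter = i;
}
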
+diff -urNp linux-2.6.33.1/drivers/gpu/drm/i810/i810_dma.c linux-2.6.33.1/drivers/gpu/drm/i810/i810_dma.c +--- linux-2.6.33.1/drivers/gpu/drm/i810/i810_dma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i810/i810_dma.c 2010-03-20 16:58:39.712815966 -0400 +@@ -952,8 +952,8 @@ static int i810_dma_vertex(struct drm_de + dma->buflist[vertex->idx], + vertex->discard, vertex->used); + +- atomic_add(vertex->used, &dev->counts[_DRM_STAT_SECONDARY]); +- atomic_inc(&dev->counts[_DRM_STAT_DMA]); ++ atomic_add_unchecked(vertex->used, &dev->counts[_DRM_STAT_SECONDARY]); ++ atomic_inc_unchecked(&dev->counts[_DRM_STAT_DMA]); + sarea_priv->last_enqueue = dev_priv->counter - 1; + sarea_priv->last_dispatch = (int)hw_status[5]; + +@@ -1115,8 +1115,8 @@ static int i810_dma_mc(struct drm_device + i810_dma_dispatch_mc(dev, dma->buflist[mc->idx], mc->used, + mc->last_render); + +- atomic_add(mc->used, &dev->counts[_DRM_STAT_SECONDARY]); +- atomic_inc(&dev->counts[_DRM_STAT_DMA]); ++ atomic_add_unchecked(mc->used, &dev->counts[_DRM_STAT_SECONDARY]); ++ atomic_inc_unchecked(&dev->counts[_DRM_STAT_DMA]); + sarea_priv->last_enqueue = dev_priv->counter - 1; + sarea_priv->last_dispatch = (int)hw_status[5]; + +diff -urNp linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7017.c linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7017.c +--- linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7017.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7017.c 2010-03-20 16:58:39.720824119 -0400 +@@ -444,7 +444,7 @@ static void ch7017_destroy(struct intel_ + } + } + +-struct intel_dvo_dev_ops ch7017_ops = { ++const struct intel_dvo_dev_ops ch7017_ops = { + .init = ch7017_init, + .detect = ch7017_detect, + .mode_valid = ch7017_mode_valid, +diff -urNp linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7xxx.c linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7xxx.c +--- linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7xxx.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ch7xxx.c 2010-03-20 16:58:39.724540120 -0400 +@@ -358,7 +358,7 @@ static void ch7xxx_destroy(struct intel_ + } + } + +-struct intel_dvo_dev_ops ch7xxx_ops = { ++const struct intel_dvo_dev_ops ch7xxx_ops = { + .init = ch7xxx_init, + .detect = ch7xxx_detect, + .mode_valid = ch7xxx_mode_valid, +diff -urNp linux-2.6.33.1/drivers/gpu/drm/i915/dvo.h linux-2.6.33.1/drivers/gpu/drm/i915/dvo.h +--- linux-2.6.33.1/drivers/gpu/drm/i915/dvo.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i915/dvo.h 2010-03-20 16:58:39.724540120 -0400 +@@ -135,23 +135,23 @@ struct intel_dvo_dev_ops { + * + * \return singly-linked list of modes or NULL if no modes found. 
+ */ +- struct drm_display_mode *(*get_modes)(struct intel_dvo_device *dvo); ++ struct drm_display_mode *(* const get_modes)(struct intel_dvo_device *dvo); + + /** + * Clean up driver-specific bits of the output + */ +- void (*destroy) (struct intel_dvo_device *dvo); ++ void (* const destroy) (struct intel_dvo_device *dvo); + + /** + * Debugging hook to dump device registers to log file + */ +- void (*dump_regs)(struct intel_dvo_device *dvo); ++ void (* const dump_regs)(struct intel_dvo_device *dvo); + }; + +-extern struct intel_dvo_dev_ops sil164_ops; +-extern struct intel_dvo_dev_ops ch7xxx_ops; +-extern struct intel_dvo_dev_ops ivch_ops; +-extern struct intel_dvo_dev_ops tfp410_ops; +-extern struct intel_dvo_dev_ops ch7017_ops; ++extern const struct intel_dvo_dev_ops sil164_ops; ++extern const struct intel_dvo_dev_ops ch7xxx_ops; ++extern const struct intel_dvo_dev_ops ivch_ops; ++extern const struct intel_dvo_dev_ops tfp410_ops; ++extern const struct intel_dvo_dev_ops ch7017_ops; + + #endif /* _INTEL_DVO_H */ +diff -urNp linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ivch.c linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ivch.c +--- linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ivch.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i915/dvo_ivch.c 2010-03-20 16:58:39.724540120 -0400 +@@ -431,7 +431,7 @@ static void ivch_destroy(struct intel_dv + } + } + +-struct intel_dvo_dev_ops ivch_ops= { ++const struct intel_dvo_dev_ops ivch_ops= { + .init = ivch_init, + .dpms = ivch_dpms, + .save = ivch_save, +diff -urNp linux-2.6.33.1/drivers/gpu/drm/i915/dvo_sil164.c linux-2.6.33.1/drivers/gpu/drm/i915/dvo_sil164.c +--- linux-2.6.33.1/drivers/gpu/drm/i915/dvo_sil164.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i915/dvo_sil164.c 2010-03-20 16:58:39.724540120 -0400 +@@ -290,7 +290,7 @@ static void sil164_destroy(struct intel_ + } + } + +-struct intel_dvo_dev_ops sil164_ops = { ++const struct intel_dvo_dev_ops sil164_ops = { + .init = sil164_init, + .detect = sil164_detect, + .mode_valid = sil164_mode_valid, +diff -urNp linux-2.6.33.1/drivers/gpu/drm/i915/dvo_tfp410.c linux-2.6.33.1/drivers/gpu/drm/i915/dvo_tfp410.c +--- linux-2.6.33.1/drivers/gpu/drm/i915/dvo_tfp410.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i915/dvo_tfp410.c 2010-03-20 16:58:39.724540120 -0400 +@@ -325,7 +325,7 @@ static void tfp410_destroy(struct intel_ + } + } + +-struct intel_dvo_dev_ops tfp410_ops = { ++const struct intel_dvo_dev_ops tfp410_ops = { + .init = tfp410_init, + .detect = tfp410_detect, + .mode_valid = tfp410_mode_valid, +diff -urNp linux-2.6.33.1/drivers/gpu/drm/i915/i915_drv.c linux-2.6.33.1/drivers/gpu/drm/i915/i915_drv.c +--- linux-2.6.33.1/drivers/gpu/drm/i915/i915_drv.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/i915/i915_drv.c 2010-03-20 16:58:39.724540120 -0400 +@@ -470,7 +470,7 @@ const struct dev_pm_ops i915_pm_ops = { + .restore = i915_pm_resume, + }; + +-static struct vm_operations_struct i915_gem_vm_ops = { ++static const struct vm_operations_struct i915_gem_vm_ops = { + .fault = i915_gem_fault, + .open = drm_gem_vm_open, + .close = drm_gem_vm_close, +diff -urNp linux-2.6.33.1/drivers/gpu/drm/nouveau/nouveau_backlight.c linux-2.6.33.1/drivers/gpu/drm/nouveau/nouveau_backlight.c +--- linux-2.6.33.1/drivers/gpu/drm/nouveau/nouveau_backlight.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/nouveau/nouveau_backlight.c 2010-03-20 16:58:39.737935163 -0400 +@@ -58,7 +58,7 @@ static int 
nv40_set_intensity(struct bac + return 0; + } + +-static struct backlight_ops nv40_bl_ops = { ++static const struct backlight_ops nv40_bl_ops = { + .options = BL_CORE_SUSPENDRESUME, + .get_brightness = nv40_get_intensity, + .update_status = nv40_set_intensity, +@@ -81,7 +81,7 @@ static int nv50_set_intensity(struct bac + return 0; + } + +-static struct backlight_ops nv50_bl_ops = { ++static const struct backlight_ops nv50_bl_ops = { + .options = BL_CORE_SUSPENDRESUME, + .get_brightness = nv50_get_intensity, + .update_status = nv50_set_intensity, +diff -urNp linux-2.6.33.1/drivers/gpu/drm/radeon/mkregtable.c linux-2.6.33.1/drivers/gpu/drm/radeon/mkregtable.c +--- linux-2.6.33.1/drivers/gpu/drm/radeon/mkregtable.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/radeon/mkregtable.c 2010-03-20 16:58:39.737935163 -0400 +@@ -637,14 +637,14 @@ static int parser_auth(struct table *t, + regex_t mask_rex; + regmatch_t match[4]; + char buf[1024]; +- size_t end; ++ long end; + int len; + int done = 0; + int r; + unsigned o; + struct offset *offset; + char last_reg_s[10]; +- int last_reg; ++ unsigned long last_reg; + + if (regcomp + (&mask_rex, "(0x[0-9a-fA-F]*) *([_a-zA-Z0-9]*)", REG_EXTENDED)) { +diff -urNp linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_atombios.c linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_atombios.c +--- linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_atombios.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_atombios.c 2010-03-20 16:58:39.737935163 -0400 +@@ -637,14 +637,14 @@ static uint16_t atombios_get_connector_o + } + } + +-struct bios_connector { ++static struct bios_connector { + bool valid; + uint16_t line_mux; + uint16_t devices; + int connector_type; + struct radeon_i2c_bus_rec ddc_bus; + struct radeon_hpd hpd; +-}; ++} bios_connectors[ATOM_MAX_SUPPORTED_DEVICE]; + + bool radeon_get_atom_connector_info_from_supported_devices_table(struct + drm_device +@@ -660,7 +660,6 @@ bool radeon_get_atom_connector_info_from + uint8_t dac; + union atom_supported_devices *supported_devices; + int i, j, max_device; +- struct bios_connector bios_connectors[ATOM_MAX_SUPPORTED_DEVICE]; + + atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset); + +diff -urNp linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_state.c linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_state.c +--- linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_state.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_state.c 2010-03-20 16:58:39.757317099 -0400 +@@ -2139,7 +2139,7 @@ static int radeon_cp_clear(struct drm_de + if (sarea_priv->nbox > RADEON_NR_SAREA_CLIPRECTS) + sarea_priv->nbox = RADEON_NR_SAREA_CLIPRECTS; + +- if (DRM_COPY_FROM_USER(&depth_boxes, clear->depth_boxes, ++ if (sarea_priv->nbox > RADEON_NR_SAREA_CLIPRECTS || DRM_COPY_FROM_USER(&depth_boxes, clear->depth_boxes, + sarea_priv->nbox * sizeof(depth_boxes[0]))) + return -EFAULT; + +@@ -3014,7 +3014,7 @@ static int radeon_cp_getparam(struct drm + { + drm_radeon_private_t *dev_priv = dev->dev_private; + drm_radeon_getparam_t *param = data; +- int value; ++ int value = 0; + + DRM_DEBUG("pid=%d\n", DRM_CURRENTPID); + +diff -urNp linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_ttm.c linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_ttm.c +--- linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_ttm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/radeon/radeon_ttm.c 2010-03-20 16:58:39.760772786 -0400 +@@ -564,27 +564,10 @@ void radeon_ttm_fini(struct 
radeon_devic + DRM_INFO("radeon: ttm finalized\n"); + } + +-static struct vm_operations_struct radeon_ttm_vm_ops; +-static const struct vm_operations_struct *ttm_vm_ops = NULL; +- +-static int radeon_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +-{ +- struct ttm_buffer_object *bo; +- int r; +- +- bo = (struct ttm_buffer_object *)vma->vm_private_data; +- if (bo == NULL) { +- return VM_FAULT_NOPAGE; +- } +- r = ttm_vm_ops->fault(vma, vmf); +- return r; +-} +- + int radeon_mmap(struct file *filp, struct vm_area_struct *vma) + { + struct drm_file *file_priv; + struct radeon_device *rdev; +- int r; + + if (unlikely(vma->vm_pgoff < DRM_FILE_PAGE_OFFSET)) { + return drm_mmap(filp, vma); +@@ -592,20 +575,9 @@ int radeon_mmap(struct file *filp, struc + + file_priv = (struct drm_file *)filp->private_data; + rdev = file_priv->minor->dev->dev_private; +- if (rdev == NULL) { ++ if (!rdev) + return -EINVAL; +- } +- r = ttm_bo_mmap(filp, vma, &rdev->mman.bdev); +- if (unlikely(r != 0)) { +- return r; +- } +- if (unlikely(ttm_vm_ops == NULL)) { +- ttm_vm_ops = vma->vm_ops; +- radeon_ttm_vm_ops = *ttm_vm_ops; +- radeon_ttm_vm_ops.fault = &radeon_ttm_fault; +- } +- vma->vm_ops = &radeon_ttm_vm_ops; +- return 0; ++ return ttm_bo_mmap(filp, vma, &rdev->mman.bdev); + } + + +diff -urNp linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo.c linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo.c +--- linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo.c 2010-03-20 16:58:39.760772786 -0400 +@@ -128,7 +128,7 @@ static struct attribute *ttm_bo_global_a + NULL + }; + +-static struct sysfs_ops ttm_bo_global_ops = { ++static const struct sysfs_ops ttm_bo_global_ops = { + .show = &ttm_bo_global_show + }; + +diff -urNp linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo_vm.c linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo_vm.c +--- linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo_vm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_bo_vm.c 2010-03-20 16:58:39.760772786 -0400 +@@ -73,7 +73,7 @@ static int ttm_bo_vm_fault(struct vm_are + { + struct ttm_buffer_object *bo = (struct ttm_buffer_object *) + vma->vm_private_data; +- struct ttm_bo_device *bdev = bo->bdev; ++ struct ttm_bo_device *bdev; + unsigned long bus_base; + unsigned long bus_offset; + unsigned long bus_size; +@@ -88,6 +88,10 @@ static int ttm_bo_vm_fault(struct vm_are + unsigned long address = (unsigned long)vmf->virtual_address; + int retval = VM_FAULT_NOPAGE; + ++ if (!bo) ++ return VM_FAULT_NOPAGE; ++ bdev = bo->bdev; ++ + /* + * Work around locking order reversal in fault / nopfn + * between mmap_sem and bo_reserve: Perform a trylock operation +diff -urNp linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_global.c linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_global.c +--- linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_global.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_global.c 2010-03-20 16:58:39.764713262 -0400 +@@ -36,7 +36,7 @@ + struct ttm_global_item { + struct mutex mutex; + void *object; +- int refcount; ++ atomic_t refcount; + }; + + static struct ttm_global_item glob[TTM_GLOBAL_NUM]; +@@ -49,7 +49,7 @@ void ttm_global_init(void) + struct ttm_global_item *item = &glob[i]; + mutex_init(&item->mutex); + item->object = NULL; +- item->refcount = 0; ++ atomic_set(&item->refcount, 0); + } + } + +@@ -59,7 +59,7 @@ void ttm_global_release(void) + for (i = 0; i < TTM_GLOBAL_NUM; ++i) { + struct ttm_global_item *item = &glob[i]; + BUG_ON(item->object != NULL); +- 
BUG_ON(item->refcount != 0); ++ BUG_ON(atomic_read(&item->refcount) != 0); + } + } + +@@ -70,7 +70,7 @@ int ttm_global_item_ref(struct ttm_globa + void *object; + + mutex_lock(&item->mutex); +- if (item->refcount == 0) { ++ if (atomic_read(&item->refcount) == 0) { + item->object = kzalloc(ref->size, GFP_KERNEL); + if (unlikely(item->object == NULL)) { + ret = -ENOMEM; +@@ -83,7 +83,7 @@ int ttm_global_item_ref(struct ttm_globa + goto out_err; + + } +- ++item->refcount; ++ atomic_inc(&item->refcount); + ref->object = item->object; + object = item->object; + mutex_unlock(&item->mutex); +@@ -100,9 +100,9 @@ void ttm_global_item_unref(struct ttm_gl + struct ttm_global_item *item = &glob[ref->global_type]; + + mutex_lock(&item->mutex); +- BUG_ON(item->refcount == 0); ++ BUG_ON(atomic_read(&item->refcount) == 0); + BUG_ON(ref->object != item->object); +- if (--item->refcount == 0) { ++ if (atomic_dec_and_test(&item->refcount)) { + ref->release(ref); + item->object = NULL; + } +diff -urNp linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_memory.c linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_memory.c +--- linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_memory.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/gpu/drm/ttm/ttm_memory.c 2010-03-20 16:58:39.764713262 -0400 +@@ -152,7 +152,7 @@ static struct attribute *ttm_mem_zone_at + NULL + }; + +-static struct sysfs_ops ttm_mem_zone_ops = { ++static const struct sysfs_ops ttm_mem_zone_ops = { + .show = &ttm_mem_zone_show, + .store = &ttm_mem_zone_store + }; +diff -urNp linux-2.6.33.1/drivers/hwmon/k8temp.c linux-2.6.33.1/drivers/hwmon/k8temp.c +--- linux-2.6.33.1/drivers/hwmon/k8temp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/hwmon/k8temp.c 2010-03-20 16:58:39.764713262 -0400 +@@ -138,7 +138,7 @@ static DEVICE_ATTR(name, S_IRUGO, show_n + + static const struct pci_device_id k8temp_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, +- { 0 }, ++ { 0, 0, 0, 0, 0, 0, 0 }, + }; + + MODULE_DEVICE_TABLE(pci, k8temp_ids); +diff -urNp linux-2.6.33.1/drivers/hwmon/sis5595.c linux-2.6.33.1/drivers/hwmon/sis5595.c +--- linux-2.6.33.1/drivers/hwmon/sis5595.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/hwmon/sis5595.c 2010-03-20 16:58:39.772823109 -0400 +@@ -699,7 +699,7 @@ static struct sis5595_data *sis5595_upda + + static const struct pci_device_id sis5595_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_503) }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(pci, sis5595_pci_ids); +diff -urNp linux-2.6.33.1/drivers/hwmon/via686a.c linux-2.6.33.1/drivers/hwmon/via686a.c +--- linux-2.6.33.1/drivers/hwmon/via686a.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/hwmon/via686a.c 2010-03-20 16:58:39.780824922 -0400 +@@ -769,7 +769,7 @@ static struct via686a_data *via686a_upda + + static const struct pci_device_id via686a_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686_4) }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(pci, via686a_pci_ids); +diff -urNp linux-2.6.33.1/drivers/hwmon/vt8231.c linux-2.6.33.1/drivers/hwmon/vt8231.c +--- linux-2.6.33.1/drivers/hwmon/vt8231.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/hwmon/vt8231.c 2010-03-20 16:58:39.784811184 -0400 +@@ -699,7 +699,7 @@ static struct platform_driver vt8231_dri + + static const struct pci_device_id vt8231_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8231_4) }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + 
MODULE_DEVICE_TABLE(pci, vt8231_pci_ids); +diff -urNp linux-2.6.33.1/drivers/hwmon/w83791d.c linux-2.6.33.1/drivers/hwmon/w83791d.c +--- linux-2.6.33.1/drivers/hwmon/w83791d.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/hwmon/w83791d.c 2010-03-20 16:58:39.796811468 -0400 +@@ -329,8 +329,8 @@ static int w83791d_detect(struct i2c_cli + struct i2c_board_info *info); + static int w83791d_remove(struct i2c_client *client); + +-static int w83791d_read(struct i2c_client *client, u8 register); +-static int w83791d_write(struct i2c_client *client, u8 register, u8 value); ++static int w83791d_read(struct i2c_client *client, u8 reg); ++static int w83791d_write(struct i2c_client *client, u8 reg, u8 value); + static struct w83791d_data *w83791d_update_device(struct device *dev); + + #ifdef DEBUG +diff -urNp linux-2.6.33.1/drivers/i2c/busses/i2c-i801.c linux-2.6.33.1/drivers/i2c/busses/i2c-i801.c +--- linux-2.6.33.1/drivers/i2c/busses/i2c-i801.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/i2c/busses/i2c-i801.c 2010-03-20 16:58:39.804825087 -0400 +@@ -578,7 +578,7 @@ static struct pci_device_id i801_ids[] = + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH10_4) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH10_5) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PCH_SMBUS) }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE (pci, i801_ids); +diff -urNp linux-2.6.33.1/drivers/i2c/busses/i2c-piix4.c linux-2.6.33.1/drivers/i2c/busses/i2c-piix4.c +--- linux-2.6.33.1/drivers/i2c/busses/i2c-piix4.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/i2c/busses/i2c-piix4.c 2010-03-20 16:58:39.821741415 -0400 +@@ -124,7 +124,7 @@ static struct dmi_system_id __devinitdat + .ident = "IBM", + .matches = { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, + }, +- { }, ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + + static int __devinit piix4_setup(struct pci_dev *PIIX4_dev, +@@ -491,7 +491,7 @@ static struct pci_device_id piix4_ids[] + PCI_DEVICE_ID_SERVERWORKS_HT1000SB) }, + { PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, + PCI_DEVICE_ID_SERVERWORKS_HT1100LD) }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE (pci, piix4_ids); +diff -urNp linux-2.6.33.1/drivers/i2c/busses/i2c-sis630.c linux-2.6.33.1/drivers/i2c/busses/i2c-sis630.c +--- linux-2.6.33.1/drivers/i2c/busses/i2c-sis630.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/i2c/busses/i2c-sis630.c 2010-03-20 16:58:39.832567311 -0400 +@@ -471,7 +471,7 @@ static struct i2c_adapter sis630_adapter + static struct pci_device_id sis630_ids[] __devinitdata = { + { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_503) }, + { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_LPC) }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE (pci, sis630_ids); +diff -urNp linux-2.6.33.1/drivers/i2c/busses/i2c-sis96x.c linux-2.6.33.1/drivers/i2c/busses/i2c-sis96x.c +--- linux-2.6.33.1/drivers/i2c/busses/i2c-sis96x.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/i2c/busses/i2c-sis96x.c 2010-03-20 16:58:39.832567311 -0400 +@@ -247,7 +247,7 @@ static struct i2c_adapter sis96x_adapter + + static struct pci_device_id sis96x_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_SMBUS) }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE (pci, sis96x_ids); +diff -urNp linux-2.6.33.1/drivers/ide/ide-cd.c linux-2.6.33.1/drivers/ide/ide-cd.c +--- linux-2.6.33.1/drivers/ide/ide-cd.c 2010-03-15 12:09:39.000000000 
-0400 ++++ linux-2.6.33.1/drivers/ide/ide-cd.c 2010-03-20 16:58:39.844814079 -0400 +@@ -766,7 +766,7 @@ static void cdrom_do_block_pc(ide_drive_ + alignment = queue_dma_alignment(q) | q->dma_pad_mask; + if ((unsigned long)buf & alignment + || blk_rq_bytes(rq) & q->dma_pad_mask +- || object_is_on_stack(buf)) ++ || object_starts_on_stack(buf)) + drive->dma = 0; + } + } +diff -urNp linux-2.6.33.1/drivers/ieee1394/dv1394.c linux-2.6.33.1/drivers/ieee1394/dv1394.c +--- linux-2.6.33.1/drivers/ieee1394/dv1394.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ieee1394/dv1394.c 2010-03-20 16:58:39.856823569 -0400 +@@ -739,7 +739,7 @@ static void frame_prepare(struct video_c + based upon DIF section and sequence + */ + +-static void inline ++static inline void + frame_put_packet (struct frame *f, struct packet *p) + { + int section_type = p->data[0] >> 5; /* section type is in bits 5 - 7 */ +@@ -2178,7 +2178,7 @@ static const struct ieee1394_device_id d + .specifier_id = AVC_UNIT_SPEC_ID_ENTRY & 0xffffff, + .version = AVC_SW_VERSION_ENTRY & 0xffffff + }, +- { } ++ { 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(ieee1394, dv1394_id_table); +diff -urNp linux-2.6.33.1/drivers/ieee1394/eth1394.c linux-2.6.33.1/drivers/ieee1394/eth1394.c +--- linux-2.6.33.1/drivers/ieee1394/eth1394.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ieee1394/eth1394.c 2010-03-20 16:58:39.865827641 -0400 +@@ -446,7 +446,7 @@ static const struct ieee1394_device_id e + .specifier_id = ETHER1394_GASP_SPECIFIER_ID, + .version = ETHER1394_GASP_VERSION, + }, +- {} ++ { 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(ieee1394, eth1394_id_table); +diff -urNp linux-2.6.33.1/drivers/ieee1394/hosts.c linux-2.6.33.1/drivers/ieee1394/hosts.c +--- linux-2.6.33.1/drivers/ieee1394/hosts.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ieee1394/hosts.c 2010-03-20 16:58:39.865827641 -0400 +@@ -78,6 +78,7 @@ static int dummy_isoctl(struct hpsb_iso + } + + static struct hpsb_host_driver dummy_driver = { ++ .name = "dummy", + .transmit_packet = dummy_transmit_packet, + .devctl = dummy_devctl, + .isoctl = dummy_isoctl +diff -urNp linux-2.6.33.1/drivers/ieee1394/ohci1394.c linux-2.6.33.1/drivers/ieee1394/ohci1394.c +--- linux-2.6.33.1/drivers/ieee1394/ohci1394.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ieee1394/ohci1394.c 2010-03-20 16:58:39.873869022 -0400 +@@ -148,9 +148,9 @@ printk(level "%s: " fmt "\n" , OHCI1394_ + printk(level "%s: fw-host%d: " fmt "\n" , OHCI1394_DRIVER_NAME, ohci->host->id , ## args) + + /* Module Parameters */ +-static int phys_dma = 1; ++static int phys_dma; + module_param(phys_dma, int, 0444); +-MODULE_PARM_DESC(phys_dma, "Enable physical DMA (default = 1)."); ++MODULE_PARM_DESC(phys_dma, "Enable physical DMA (default = 0)."); + + static void dma_trm_tasklet(unsigned long data); + static void dma_trm_reset(struct dma_trm_ctx *d); +@@ -3445,7 +3445,7 @@ static struct pci_device_id ohci1394_pci + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + }, +- { 0, }, ++ { 0, 0, 0, 0, 0, 0, 0 }, + }; + + MODULE_DEVICE_TABLE(pci, ohci1394_pci_tbl); +diff -urNp linux-2.6.33.1/drivers/ieee1394/raw1394.c linux-2.6.33.1/drivers/ieee1394/raw1394.c +--- linux-2.6.33.1/drivers/ieee1394/raw1394.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ieee1394/raw1394.c 2010-03-20 16:58:39.873869022 -0400 +@@ -3002,7 +3002,7 @@ static const struct ieee1394_device_id r + .match_flags = IEEE1394_MATCH_SPECIFIER_ID | IEEE1394_MATCH_VERSION, + .specifier_id 
= CAMERA_UNIT_SPEC_ID_ENTRY & 0xffffff, + .version = (CAMERA_SW_VERSION_ENTRY + 2) & 0xffffff}, +- {} ++ { 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(ieee1394, raw1394_id_table); +diff -urNp linux-2.6.33.1/drivers/ieee1394/sbp2.c linux-2.6.33.1/drivers/ieee1394/sbp2.c +--- linux-2.6.33.1/drivers/ieee1394/sbp2.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ieee1394/sbp2.c 2010-03-20 16:58:39.877823938 -0400 +@@ -290,7 +290,7 @@ static const struct ieee1394_device_id s + .match_flags = IEEE1394_MATCH_SPECIFIER_ID | IEEE1394_MATCH_VERSION, + .specifier_id = SBP2_UNIT_SPEC_ID_ENTRY & 0xffffff, + .version = SBP2_SW_VERSION_ENTRY & 0xffffff}, +- {} ++ { 0, 0, 0, 0, 0, 0 } + }; + MODULE_DEVICE_TABLE(ieee1394, sbp2_id_table); + +@@ -2111,7 +2111,7 @@ MODULE_DESCRIPTION("IEEE-1394 SBP-2 prot + MODULE_SUPPORTED_DEVICE(SBP2_DEVICE_NAME); + MODULE_LICENSE("GPL"); + +-static int sbp2_module_init(void) ++static int __init sbp2_module_init(void) + { + int ret; + +diff -urNp linux-2.6.33.1/drivers/ieee1394/video1394.c linux-2.6.33.1/drivers/ieee1394/video1394.c +--- linux-2.6.33.1/drivers/ieee1394/video1394.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/ieee1394/video1394.c 2010-03-20 16:58:39.877823938 -0400 +@@ -1311,7 +1311,7 @@ static const struct ieee1394_device_id v + .specifier_id = CAMERA_UNIT_SPEC_ID_ENTRY & 0xffffff, + .version = (CAMERA_SW_VERSION_ENTRY + 2) & 0xffffff + }, +- { } ++ { 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(ieee1394, video1394_id_table); +diff -urNp linux-2.6.33.1/drivers/infiniband/core/cm.c linux-2.6.33.1/drivers/infiniband/core/cm.c +--- linux-2.6.33.1/drivers/infiniband/core/cm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/infiniband/core/cm.c 2010-03-20 16:58:39.905667556 -0400 +@@ -112,7 +112,7 @@ static char const counter_group_names[CM + + struct cm_counter_group { + struct kobject obj; +- atomic_long_t counter[CM_ATTR_COUNT]; ++ atomic_long_unchecked_t counter[CM_ATTR_COUNT]; + }; + + struct cm_counter_attribute { +@@ -1386,7 +1386,7 @@ static void cm_dup_req_handler(struct cm + struct ib_mad_send_buf *msg = NULL; + int ret; + +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REQ_COUNTER]); + + /* Quick state check to discard duplicate REQs. */ +@@ -1764,7 +1764,7 @@ static void cm_dup_rep_handler(struct cm + if (!cm_id_priv) + return; + +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REP_COUNTER]); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + if (ret) +@@ -1931,7 +1931,7 @@ static int cm_rtu_handler(struct cm_work + if (cm_id_priv->id.state != IB_CM_REP_SENT && + cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_RTU_COUNTER]); + goto out; + } +@@ -2110,7 +2110,7 @@ static int cm_dreq_handler(struct cm_wor + cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id, + dreq_msg->local_comm_id); + if (!cm_id_priv) { +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. 
+ counter[CM_DREQ_COUNTER]); + cm_issue_drep(work->port, work->mad_recv_wc); + return -EINVAL; +@@ -2131,7 +2131,7 @@ static int cm_dreq_handler(struct cm_wor + case IB_CM_MRA_REP_RCVD: + break; + case IB_CM_TIMEWAIT: +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + goto unlock; +@@ -2145,7 +2145,7 @@ static int cm_dreq_handler(struct cm_wor + cm_free_msg(msg); + goto deref; + case IB_CM_DREQ_RCVD: +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + goto unlock; + default: +@@ -2501,7 +2501,7 @@ static int cm_mra_handler(struct cm_work + ib_modify_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg, timeout)) { + if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) +- atomic_long_inc(&work->port-> ++ atomic_long_inc_unchecked(&work->port-> + counter_group[CM_RECV_DUPLICATES]. + counter[CM_MRA_COUNTER]); + goto out; +@@ -2510,7 +2510,7 @@ static int cm_mra_handler(struct cm_work + break; + case IB_CM_MRA_REQ_RCVD: + case IB_CM_MRA_REP_RCVD: +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_MRA_COUNTER]); + /* fall through */ + default: +@@ -2672,7 +2672,7 @@ static int cm_lap_handler(struct cm_work + case IB_CM_LAP_IDLE: + break; + case IB_CM_MRA_LAP_SENT: +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); + if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + goto unlock; +@@ -2688,7 +2688,7 @@ static int cm_lap_handler(struct cm_work + cm_free_msg(msg); + goto deref; + case IB_CM_LAP_RCVD: +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); + goto unlock; + default: +@@ -2972,7 +2972,7 @@ static int cm_sidr_req_handler(struct cm + cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); + if (cur_cm_id_priv) { + spin_unlock_irq(&cm.lock); +- atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. ++ atomic_long_inc_unchecked(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_SIDR_REQ_COUNTER]); + goto out; /* Duplicate message. */ + } +@@ -3183,10 +3183,10 @@ static void cm_send_handler(struct ib_ma + if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) + msg->retries = 1; + +- atomic_long_add(1 + msg->retries, ++ atomic_long_add_unchecked(1 + msg->retries, + &port->counter_group[CM_XMIT].counter[attr_index]); + if (msg->retries) +- atomic_long_add(msg->retries, ++ atomic_long_add_unchecked(msg->retries, + &port->counter_group[CM_XMIT_RETRIES]. + counter[attr_index]); + +@@ -3396,7 +3396,7 @@ static void cm_recv_handler(struct ib_ma + } + + attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); +- atomic_long_inc(&port->counter_group[CM_RECV]. ++ atomic_long_inc_unchecked(&port->counter_group[CM_RECV]. 
+ counter[attr_id - CM_ATTR_ID_OFFSET]); + + work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths, +@@ -3594,10 +3594,10 @@ static ssize_t cm_show_counter(struct ko + cm_attr = container_of(attr, struct cm_counter_attribute, attr); + + return sprintf(buf, "%ld\n", +- atomic_long_read(&group->counter[cm_attr->index])); ++ atomic_long_read_unchecked(&group->counter[cm_attr->index])); + } + +-static struct sysfs_ops cm_counter_ops = { ++static const struct sysfs_ops cm_counter_ops = { + .show = cm_show_counter + }; + +diff -urNp linux-2.6.33.1/drivers/infiniband/core/sysfs.c linux-2.6.33.1/drivers/infiniband/core/sysfs.c +--- linux-2.6.33.1/drivers/infiniband/core/sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/infiniband/core/sysfs.c 2010-03-20 16:58:39.912829502 -0400 +@@ -79,7 +79,7 @@ static ssize_t port_attr_show(struct kob + return port_attr->show(p, port_attr, buf); + } + +-static struct sysfs_ops port_sysfs_ops = { ++static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show + }; + +diff -urNp linux-2.6.33.1/drivers/input/keyboard/atkbd.c linux-2.6.33.1/drivers/input/keyboard/atkbd.c +--- linux-2.6.33.1/drivers/input/keyboard/atkbd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/input/keyboard/atkbd.c 2010-03-20 16:58:39.929160943 -0400 +@@ -1229,7 +1229,7 @@ static struct serio_device_id atkbd_seri + .id = SERIO_ANY, + .extra = SERIO_ANY, + }, +- { 0 } ++ { 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(serio, atkbd_serio_ids); +diff -urNp linux-2.6.33.1/drivers/input/mouse/lifebook.c linux-2.6.33.1/drivers/input/mouse/lifebook.c +--- linux-2.6.33.1/drivers/input/mouse/lifebook.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/input/mouse/lifebook.c 2010-03-20 16:58:39.929160943 -0400 +@@ -122,7 +122,7 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_PRODUCT_NAME, "LifeBook B142"), + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL} + }; + + void __init lifebook_module_init(void) +diff -urNp linux-2.6.33.1/drivers/input/mouse/psmouse-base.c linux-2.6.33.1/drivers/input/mouse/psmouse-base.c +--- linux-2.6.33.1/drivers/input/mouse/psmouse-base.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/input/mouse/psmouse-base.c 2010-03-20 16:58:39.936833108 -0400 +@@ -1442,7 +1442,7 @@ static struct serio_device_id psmouse_se + .id = SERIO_ANY, + .extra = SERIO_ANY, + }, +- { 0 } ++ { 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(serio, psmouse_serio_ids); +diff -urNp linux-2.6.33.1/drivers/input/mouse/synaptics.c linux-2.6.33.1/drivers/input/mouse/synaptics.c +--- linux-2.6.33.1/drivers/input/mouse/synaptics.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/input/mouse/synaptics.c 2010-03-20 16:58:39.944836883 -0400 +@@ -438,7 +438,7 @@ static void synaptics_process_packet(str + break; + case 2: + if (SYN_MODEL_PEN(priv->model_id)) +- ; /* Nothing, treat a pen as a single finger */ ++ break; /* Nothing, treat a pen as a single finger */ + break; + case 4 ... 
15: + if (SYN_CAP_PALMDETECT(priv->capabilities)) +@@ -654,7 +654,6 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), + DMI_MATCH(DMI_PRODUCT_NAME, "PORTEGE M300"), + }, +- + }, + { + /* Toshiba Portege M300 */ +@@ -663,9 +662,8 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_PRODUCT_NAME, "Portable PC"), + DMI_MATCH(DMI_PRODUCT_VERSION, "Version 1.0"), + }, +- + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + #endif + }; + +diff -urNp linux-2.6.33.1/drivers/input/mousedev.c linux-2.6.33.1/drivers/input/mousedev.c +--- linux-2.6.33.1/drivers/input/mousedev.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/input/mousedev.c 2010-03-20 16:58:39.952834604 -0400 +@@ -760,7 +760,7 @@ static ssize_t mousedev_read(struct file + + spin_unlock_irq(&client->packet_lock); + +- if (copy_to_user(buffer, data, count)) ++ if (count > sizeof(data) || copy_to_user(buffer, data, count)) + return -EFAULT; + + return count; +@@ -1057,7 +1057,7 @@ static struct input_handler mousedev_han + + #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX + static struct miscdevice psaux_mouse = { +- PSMOUSE_MINOR, "psaux", &mousedev_fops ++ PSMOUSE_MINOR, "psaux", &mousedev_fops, {NULL, NULL}, NULL, NULL + }; + static int psaux_registered; + #endif +diff -urNp linux-2.6.33.1/drivers/input/serio/i8042-x86ia64io.h linux-2.6.33.1/drivers/input/serio/i8042-x86ia64io.h +--- linux-2.6.33.1/drivers/input/serio/i8042-x86ia64io.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/input/serio/i8042-x86ia64io.h 2010-03-20 16:58:39.964826079 -0400 +@@ -172,7 +172,7 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_PRODUCT_VERSION, "Rev 1"), + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + + /* +@@ -402,7 +402,7 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_PRODUCT_VERSION, "0100"), + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + + static const struct dmi_system_id __initconst i8042_dmi_reset_table[] = { +@@ -469,7 +469,7 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_PRODUCT_NAME, "Vostro 1720"), + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + + #ifdef CONFIG_PNP +@@ -488,7 +488,7 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"), + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + + static const struct dmi_system_id __initconst i8042_dmi_laptop_table[] = { +@@ -512,7 +512,7 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_CHASSIS_TYPE, "14"), /* Sub-Notebook */ + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + #endif + +@@ -586,7 +586,7 @@ static const struct dmi_system_id __init + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 4280"), + }, + }, +- { } ++ { NULL, NULL, {DMI_MATCH(DMI_NONE, {0})}, NULL } + }; + + #endif /* CONFIG_X86 */ +diff -urNp linux-2.6.33.1/drivers/input/serio/serio_raw.c linux-2.6.33.1/drivers/input/serio/serio_raw.c +--- linux-2.6.33.1/drivers/input/serio/serio_raw.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/input/serio/serio_raw.c 2010-03-20 16:58:39.964826079 -0400 +@@ -377,7 +377,7 @@ static struct serio_device_id serio_raw_ + .id = SERIO_ANY, + .extra = SERIO_ANY, + }, +- { 0 } ++ { 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(serio, serio_raw_serio_ids); +diff -urNp linux-2.6.33.1/drivers/isdn/gigaset/common.c linux-2.6.33.1/drivers/isdn/gigaset/common.c +--- 
linux-2.6.33.1/drivers/isdn/gigaset/common.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/isdn/gigaset/common.c 2010-03-20 16:58:39.984838298 -0400 +@@ -732,7 +732,7 @@ struct cardstate *gigaset_initcs(struct + cs->commands_pending = 0; + cs->cur_at_seq = 0; + cs->gotfwver = -1; +- cs->open_count = 0; ++ atomic_set(&cs->open_count, 0); + cs->dev = NULL; + cs->tty = NULL; + cs->tty_dev = NULL; +diff -urNp linux-2.6.33.1/drivers/isdn/gigaset/gigaset.h linux-2.6.33.1/drivers/isdn/gigaset/gigaset.h +--- linux-2.6.33.1/drivers/isdn/gigaset/gigaset.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/isdn/gigaset/gigaset.h 2010-03-20 16:58:39.988819691 -0400 +@@ -440,7 +440,7 @@ struct cardstate { + spinlock_t cmdlock; + unsigned curlen, cmdbytes; + +- unsigned open_count; ++ atomic_t open_count; + struct tty_struct *tty; + struct tasklet_struct if_wake_tasklet; + unsigned control_state; +diff -urNp linux-2.6.33.1/drivers/isdn/gigaset/interface.c linux-2.6.33.1/drivers/isdn/gigaset/interface.c +--- linux-2.6.33.1/drivers/isdn/gigaset/interface.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/isdn/gigaset/interface.c 2010-03-20 16:58:39.996850965 -0400 +@@ -165,9 +165,7 @@ static int if_open(struct tty_struct *tt + return -ERESTARTSYS; + tty->driver_data = cs; + +- ++cs->open_count; +- +- if (cs->open_count == 1) { ++ if (atomic_inc_return(&cs->open_count) == 1) { + spin_lock_irqsave(&cs->lock, flags); + cs->tty = tty; + spin_unlock_irqrestore(&cs->lock, flags); +@@ -195,10 +193,10 @@ static void if_close(struct tty_struct * + + if (!cs->connected) + gig_dbg(DEBUG_IF, "not connected"); /* nothing to do */ +- else if (!cs->open_count) ++ else if (!atomic_read(&cs->open_count)) + dev_warn(cs->dev, "%s: device not opened\n", __func__); + else { +- if (!--cs->open_count) { ++ if (!atomic_dec_return(&cs->open_count)) { + spin_lock_irqsave(&cs->lock, flags); + cs->tty = NULL; + spin_unlock_irqrestore(&cs->lock, flags); +@@ -233,7 +231,7 @@ static int if_ioctl(struct tty_struct *t + if (!cs->connected) { + gig_dbg(DEBUG_IF, "not connected"); + retval = -ENODEV; +- } else if (!cs->open_count) ++ } else if (!atomic_read(&cs->open_count)) + dev_warn(cs->dev, "%s: device not opened\n", __func__); + else { + retval = 0; +@@ -360,7 +358,7 @@ static int if_write(struct tty_struct *t + if (!cs->connected) { + gig_dbg(DEBUG_IF, "not connected"); + retval = -ENODEV; +- } else if (!cs->open_count) ++ } else if (!atomic_read(&cs->open_count)) + dev_warn(cs->dev, "%s: device not opened\n", __func__); + else if (cs->mstate != MS_LOCKED) { + dev_warn(cs->dev, "can't write to unlocked device\n"); +@@ -394,7 +392,7 @@ static int if_write_room(struct tty_stru + if (!cs->connected) { + gig_dbg(DEBUG_IF, "not connected"); + retval = -ENODEV; +- } else if (!cs->open_count) ++ } else if (!atomic_read(&cs->open_count)) + dev_warn(cs->dev, "%s: device not opened\n", __func__); + else if (cs->mstate != MS_LOCKED) { + dev_warn(cs->dev, "can't write to unlocked device\n"); +@@ -424,7 +422,7 @@ static int if_chars_in_buffer(struct tty + + if (!cs->connected) + gig_dbg(DEBUG_IF, "not connected"); +- else if (!cs->open_count) ++ else if (!atomic_read(&cs->open_count)) + dev_warn(cs->dev, "%s: device not opened\n", __func__); + else if (cs->mstate != MS_LOCKED) + dev_warn(cs->dev, "can't write to unlocked device\n"); +@@ -452,7 +450,7 @@ static void if_throttle(struct tty_struc + + if (!cs->connected) + gig_dbg(DEBUG_IF, "not connected"); /* nothing to do */ +- else if 
(!cs->open_count) ++ else if (!atomic_read(&cs->open_count)) + dev_warn(cs->dev, "%s: device not opened\n", __func__); + else + gig_dbg(DEBUG_ANY, "%s: not implemented\n", __func__); +@@ -476,7 +474,7 @@ static void if_unthrottle(struct tty_str + + if (!cs->connected) + gig_dbg(DEBUG_IF, "not connected"); /* nothing to do */ +- else if (!cs->open_count) ++ else if (!atomic_read(&cs->open_count)) + dev_warn(cs->dev, "%s: device not opened\n", __func__); + else + gig_dbg(DEBUG_ANY, "%s: not implemented\n", __func__); +@@ -507,7 +505,7 @@ static void if_set_termios(struct tty_st + goto out; + } + +- if (!cs->open_count) { ++ if (!atomic_read(&cs->open_count)) { + dev_warn(cs->dev, "%s: device not opened\n", __func__); + goto out; + } +diff -urNp linux-2.6.33.1/drivers/isdn/hardware/avm/b1.c linux-2.6.33.1/drivers/isdn/hardware/avm/b1.c +--- linux-2.6.33.1/drivers/isdn/hardware/avm/b1.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/isdn/hardware/avm/b1.c 2010-03-20 16:58:40.004515803 -0400 +@@ -173,7 +173,7 @@ int b1_load_t4file(avmcard *card, capilo + } + if (left) { + if (t4file->user) { +- if (copy_from_user(buf, dp, left)) ++ if (left > sizeof(buf) || copy_from_user(buf, dp, left)) + return -EFAULT; + } else { + memcpy(buf, dp, left); +@@ -221,7 +221,7 @@ int b1_load_config(avmcard *card, capilo + } + if (left) { + if (config->user) { +- if (copy_from_user(buf, dp, left)) ++ if (left > sizeof(buf) || copy_from_user(buf, dp, left)) + return -EFAULT; + } else { + memcpy(buf, dp, left); +diff -urNp linux-2.6.33.1/drivers/isdn/icn/icn.c linux-2.6.33.1/drivers/isdn/icn/icn.c +--- linux-2.6.33.1/drivers/isdn/icn/icn.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/isdn/icn/icn.c 2010-03-20 16:58:40.012833882 -0400 +@@ -1044,7 +1044,7 @@ icn_writecmd(const u_char * buf, int len + if (count > len) + count = len; + if (user) { +- if (copy_from_user(msg, buf, count)) ++ if (count > sizeof(msg) || copy_from_user(msg, buf, count)) + return -EFAULT; + } else + memcpy(msg, buf, count); +diff -urNp linux-2.6.33.1/drivers/lguest/core.c linux-2.6.33.1/drivers/lguest/core.c +--- linux-2.6.33.1/drivers/lguest/core.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/lguest/core.c 2010-03-20 16:58:40.012833882 -0400 +@@ -91,9 +91,17 @@ static __init int map_switcher(void) + * it's worked so far. The end address needs +1 because __get_vm_area + * allocates an extra guard page, so we need space for that. 
+ */ ++ ++#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, ++ VM_ALLOC | VM_KERNEXEC, SWITCHER_ADDR, SWITCHER_ADDR ++ + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); ++#else + switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, + VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); ++#endif ++ + if (!switcher_vma) { + err = -ENOMEM; + printk("lguest: could not map switcher pages high\n"); +diff -urNp linux-2.6.33.1/drivers/macintosh/via-pmu-backlight.c linux-2.6.33.1/drivers/macintosh/via-pmu-backlight.c +--- linux-2.6.33.1/drivers/macintosh/via-pmu-backlight.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/macintosh/via-pmu-backlight.c 2010-03-20 16:58:40.016740354 -0400 +@@ -15,7 +15,7 @@ + + #define MAX_PMU_LEVEL 0xFF + +-static struct backlight_ops pmu_backlight_data; ++static const struct backlight_ops pmu_backlight_data; + static DEFINE_SPINLOCK(pmu_backlight_lock); + static int sleeping, uses_pmu_bl; + static u8 bl_curve[FB_BACKLIGHT_LEVELS]; +@@ -115,7 +115,7 @@ static int pmu_backlight_get_brightness( + return bd->props.brightness; + } + +-static struct backlight_ops pmu_backlight_data = { ++static const struct backlight_ops pmu_backlight_data = { + .get_brightness = pmu_backlight_get_brightness, + .update_status = pmu_backlight_update_status, + +diff -urNp linux-2.6.33.1/drivers/macintosh/via-pmu.c linux-2.6.33.1/drivers/macintosh/via-pmu.c +--- linux-2.6.33.1/drivers/macintosh/via-pmu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/macintosh/via-pmu.c 2010-03-20 16:58:40.028838991 -0400 +@@ -2254,7 +2254,7 @@ static int pmu_sleep_valid(suspend_state + && (pmac_call_feature(PMAC_FTR_SLEEP_STATE, NULL, 0, -1) >= 0); + } + +-static struct platform_suspend_ops pmu_pm_ops = { ++static const struct platform_suspend_ops pmu_pm_ops = { + .enter = powerbook_sleep, + .valid = pmu_sleep_valid, + }; +diff -urNp linux-2.6.33.1/drivers/md/bitmap.c linux-2.6.33.1/drivers/md/bitmap.c +--- linux-2.6.33.1/drivers/md/bitmap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/md/bitmap.c 2010-03-20 16:58:40.032827823 -0400 +@@ -58,7 +58,7 @@ + # if DEBUG > 0 + # define PRINTK(x...) printk(KERN_DEBUG x) + # else +-# define PRINTK(x...) ++# define PRINTK(x...) 
do {} while (0) + # endif + #endif + +diff -urNp linux-2.6.33.1/drivers/md/dm-sysfs.c linux-2.6.33.1/drivers/md/dm-sysfs.c +--- linux-2.6.33.1/drivers/md/dm-sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/md/dm-sysfs.c 2010-03-20 16:58:40.040842627 -0400 +@@ -75,7 +75,7 @@ static struct attribute *dm_attrs[] = { + NULL, + }; + +-static struct sysfs_ops dm_sysfs_ops = { ++static const struct sysfs_ops dm_sysfs_ops = { + .show = dm_attr_show, + }; + +diff -urNp linux-2.6.33.1/drivers/md/dm-table.c linux-2.6.33.1/drivers/md/dm-table.c +--- linux-2.6.33.1/drivers/md/dm-table.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/md/dm-table.c 2010-03-20 16:58:40.048835074 -0400 +@@ -363,7 +363,7 @@ static int device_area_is_invalid(struct + if (!dev_size) + return 0; + +- if ((start >= dev_size) || (start + len > dev_size)) { ++ if ((start >= dev_size) || (len > dev_size - start)) { + DMWARN("%s: %s too small for target: " + "start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), +diff -urNp linux-2.6.33.1/drivers/md/md.c linux-2.6.33.1/drivers/md/md.c +--- linux-2.6.33.1/drivers/md/md.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/md/md.c 2010-03-20 16:58:40.084837077 -0400 +@@ -2642,7 +2642,7 @@ static void rdev_free(struct kobject *ko + mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); + kfree(rdev); + } +-static struct sysfs_ops rdev_sysfs_ops = { ++static const struct sysfs_ops rdev_sysfs_ops = { + .show = rdev_attr_show, + .store = rdev_attr_store, + }; +@@ -4059,7 +4059,7 @@ static void md_free(struct kobject *ko) + kfree(mddev); + } + +-static struct sysfs_ops md_sysfs_ops = { ++static const struct sysfs_ops md_sysfs_ops = { + .show = md_attr_show, + .store = md_attr_store, + }; +@@ -6187,7 +6187,7 @@ static int md_seq_show(struct seq_file * + chunk_kb ? "KB" : "B"); + if (bitmap->file) { + seq_printf(seq, ", file: "); +- seq_path(seq, &bitmap->file->f_path, " \t\n"); ++ seq_path(seq, &bitmap->file->f_path, " \t\n\\"); + } + + seq_printf(seq, "\n"); +@@ -6281,7 +6281,7 @@ static int is_mddev_idle(mddev_t *mddev, + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - +- atomic_read(&disk->sync_io); ++ atomic_read_unchecked(&disk->sync_io); + /* sync IO will cause sync_io to increase before the disk_stats + * as sync_io is counted when a request starts, and + * disk_stats is counted when it completes.
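The dm-table.c hunk above shows the overflow-safe bounds check this patch applies in several places: "start + len > dev_size" can wrap around when len is huge, making an out-of-range request look valid, so it is rewritten as "len > dev_size - start", where the subtraction cannot wrap because start < dev_size has already been established. A minimal userspace sketch of the difference; the sector_t typedef, function names, and test values are illustrative, not the kernel's:

/* Overflow-safe range check, mirroring the dm-table.c hunk above.
 * Illustrative userspace sketch; sector_t is modelled as uint64_t. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Old form: "start + len" can wrap past UINT64_MAX and compare small. */
static int invalid_wrapping(sector_t start, sector_t len, sector_t dev_size)
{
	return (start >= dev_size) || (start + len > dev_size);
}

/* Patched form: once start < dev_size is known, the subtraction
 * cannot wrap, so a huge len is always caught. */
static int invalid_safe(sector_t start, sector_t len, sector_t dev_size)
{
	return (start >= dev_size) || (len > dev_size - start);
}

int main(void)
{
	sector_t dev_size = 1000, start = 10, len = UINT64_MAX - 5;

	/* start + len wraps to 4, so the old check accepts the range */
	printf("wrapping check rejects: %d\n", invalid_wrapping(start, len, dev_size));
	/* the rewritten check catches it */
	printf("safe check rejects:     %d\n", invalid_safe(start, len, dev_size));
	return 0;
}

With these values the wrapping variant prints 0 (range accepted) while the patched variant prints 1 (range rejected), which is the bug class the hunk closes.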
+diff -urNp linux-2.6.33.1/drivers/md/md.h linux-2.6.33.1/drivers/md/md.h +--- linux-2.6.33.1/drivers/md/md.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/md/md.h 2010-03-20 16:58:40.084837077 -0400 +@@ -327,7 +327,7 @@ static inline void rdev_dec_pending(mdk_ + + static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) + { +- atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); ++ atomic_add_unchecked(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); + } + + struct mdk_personality +diff -urNp linux-2.6.33.1/drivers/media/dvb/dvb-core/dvbdev.c linux-2.6.33.1/drivers/media/dvb/dvb-core/dvbdev.c +--- linux-2.6.33.1/drivers/media/dvb/dvb-core/dvbdev.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/media/dvb/dvb-core/dvbdev.c 2010-03-20 16:58:40.088543139 -0400 +@@ -191,6 +191,7 @@ int dvb_register_device(struct dvb_adapt + const struct dvb_device *template, void *priv, int type) + { + struct dvb_device *dvbdev; ++ /* cannot be const */ + struct file_operations *dvbdevfops; + struct device *clsdev; + int minor; +diff -urNp linux-2.6.33.1/drivers/media/radio/radio-cadet.c linux-2.6.33.1/drivers/media/radio/radio-cadet.c +--- linux-2.6.33.1/drivers/media/radio/radio-cadet.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/media/radio/radio-cadet.c 2010-03-20 16:58:40.096832684 -0400 +@@ -347,7 +347,7 @@ static ssize_t cadet_read(struct file *f + while (i < count && dev->rdsin != dev->rdsout) + readbuf[i++] = dev->rdsbuf[dev->rdsout++]; + +- if (copy_to_user(data, readbuf, i)) ++ if (i > sizeof(readbuf) || copy_to_user(data, readbuf, i)) + return -EFAULT; + return i; + } +diff -urNp linux-2.6.33.1/drivers/message/i2o/i2o_proc.c linux-2.6.33.1/drivers/message/i2o/i2o_proc.c +--- linux-2.6.33.1/drivers/message/i2o/i2o_proc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/message/i2o/i2o_proc.c 2010-03-20 16:58:40.104831679 -0400 +@@ -259,13 +259,6 @@ static char *scsi_devices[] = { + "Array Controller Device" + }; + +-static char *chtostr(u8 * chars, int n) +-{ +- char tmp[256]; +- tmp[0] = 0; +- return strncat(tmp, (char *)chars, n); +-} +- + static int i2o_report_query_status(struct seq_file *seq, int block_status, + char *group) + { +@@ -842,8 +835,7 @@ static int i2o_seq_show_ddm_table(struct + + seq_printf(seq, "%-#7x", ddm_table.i2o_vendor_id); + seq_printf(seq, "%-#8x", ddm_table.module_id); +- seq_printf(seq, "%-29s", +- chtostr(ddm_table.module_name_version, 28)); ++ seq_printf(seq, "%-.28s", ddm_table.module_name_version); + seq_printf(seq, "%9d ", ddm_table.data_size); + seq_printf(seq, "%8d", ddm_table.code_size); + +@@ -944,8 +936,8 @@ static int i2o_seq_show_drivers_stored(s + + seq_printf(seq, "%-#7x", dst->i2o_vendor_id); + seq_printf(seq, "%-#8x", dst->module_id); +- seq_printf(seq, "%-29s", chtostr(dst->module_name_version, 28)); +- seq_printf(seq, "%-9s", chtostr(dst->date, 8)); ++ seq_printf(seq, "%-.28s", dst->module_name_version); ++ seq_printf(seq, "%-.8s", dst->date); + seq_printf(seq, "%8d ", dst->module_size); + seq_printf(seq, "%8d ", dst->mpb_size); + seq_printf(seq, "0x%04x", dst->module_flags); +@@ -1276,14 +1268,10 @@ static int i2o_seq_show_dev_identity(str + seq_printf(seq, "Device Class : %s\n", i2o_get_class_name(work16[0])); + seq_printf(seq, "Owner TID : %0#5x\n", work16[2]); + seq_printf(seq, "Parent TID : %0#5x\n", work16[3]); +- seq_printf(seq, "Vendor info : %s\n", +- chtostr((u8 *) (work32 + 2), 16)); +- seq_printf(seq, "Product info : %s\n", +- 
chtostr((u8 *) (work32 + 6), 16)); +- seq_printf(seq, "Description : %s\n", +- chtostr((u8 *) (work32 + 10), 16)); +- seq_printf(seq, "Product rev. : %s\n", +- chtostr((u8 *) (work32 + 14), 8)); ++ seq_printf(seq, "Vendor info : %.16s\n", (u8 *) (work32 + 2)); ++ seq_printf(seq, "Product info : %.16s\n", (u8 *) (work32 + 6)); ++ seq_printf(seq, "Description : %.16s\n", (u8 *) (work32 + 10)); ++ seq_printf(seq, "Product rev. : %.8s\n", (u8 *) (work32 + 14)); + + seq_printf(seq, "Serial number : "); + print_serial_number(seq, (u8 *) (work32 + 16), +@@ -1328,10 +1316,8 @@ static int i2o_seq_show_ddm_identity(str + } + + seq_printf(seq, "Registering DDM TID : 0x%03x\n", result.ddm_tid); +- seq_printf(seq, "Module name : %s\n", +- chtostr(result.module_name, 24)); +- seq_printf(seq, "Module revision : %s\n", +- chtostr(result.module_rev, 8)); ++ seq_printf(seq, "Module name : %.24s\n", result.module_name); ++ seq_printf(seq, "Module revision : %.8s\n", result.module_rev); + + seq_printf(seq, "Serial number : "); + print_serial_number(seq, result.serial_number, sizeof(result) - 36); +@@ -1362,14 +1348,10 @@ static int i2o_seq_show_uinfo(struct seq + return 0; + } + +- seq_printf(seq, "Device name : %s\n", +- chtostr(result.device_name, 64)); +- seq_printf(seq, "Service name : %s\n", +- chtostr(result.service_name, 64)); +- seq_printf(seq, "Physical name : %s\n", +- chtostr(result.physical_location, 64)); +- seq_printf(seq, "Instance number : %s\n", +- chtostr(result.instance_number, 4)); ++ seq_printf(seq, "Device name : %.64s\n", result.device_name); ++ seq_printf(seq, "Service name : %.64s\n", result.service_name); ++ seq_printf(seq, "Physical name : %.64s\n", result.physical_location); ++ seq_printf(seq, "Instance number : %.4s\n", result.instance_number); + + return 0; + } +diff -urNp linux-2.6.33.1/drivers/misc/kgdbts.c linux-2.6.33.1/drivers/misc/kgdbts.c +--- linux-2.6.33.1/drivers/misc/kgdbts.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/misc/kgdbts.c 2010-03-20 16:58:40.120597489 -0400 +@@ -118,7 +118,7 @@ + } while (0) + #define MAX_CONFIG_LEN 40 + +-static struct kgdb_io kgdbts_io_ops; ++static const struct kgdb_io kgdbts_io_ops; + static char get_buf[BUFMAX]; + static int get_buf_cnt; + static char put_buf[BUFMAX]; +@@ -1108,7 +1108,7 @@ static void kgdbts_post_exp_handler(void + module_put(THIS_MODULE); + } + +-static struct kgdb_io kgdbts_io_ops = { ++static const struct kgdb_io kgdbts_io_ops = { + .name = "kgdbts", + .read_char = kgdbts_get_char, + .write_char = kgdbts_put_char, +diff -urNp linux-2.6.33.1/drivers/misc/sgi-gru/gruhandles.c linux-2.6.33.1/drivers/misc/sgi-gru/gruhandles.c +--- linux-2.6.33.1/drivers/misc/sgi-gru/gruhandles.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/misc/sgi-gru/gruhandles.c 2010-03-20 16:58:40.120597489 -0400 +@@ -44,8 +44,8 @@ static void update_mcs_stats(enum mcs_op + unsigned long nsec; + + nsec = CLKS2NSEC(clks); +- atomic_long_inc(&mcs_op_statistics[op].count); +- atomic_long_add(nsec, &mcs_op_statistics[op].total); ++ atomic_long_inc_unchecked(&mcs_op_statistics[op].count); ++ atomic_long_add_unchecked(nsec, &mcs_op_statistics[op].total); + if (mcs_op_statistics[op].max < nsec) + mcs_op_statistics[op].max = nsec; + } +diff -urNp linux-2.6.33.1/drivers/misc/sgi-gru/gruprocfs.c linux-2.6.33.1/drivers/misc/sgi-gru/gruprocfs.c +--- linux-2.6.33.1/drivers/misc/sgi-gru/gruprocfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/misc/sgi-gru/gruprocfs.c 2010-03-20 16:58:40.120597489 -0400 +@@ 
-32,9 +32,9 @@ + + #define printstat(s, f) printstat_val(s, &gru_stats.f, #f) + +-static void printstat_val(struct seq_file *s, atomic_long_t *v, char *id) ++static void printstat_val(struct seq_file *s, atomic_long_unchecked_t *v, char *id) + { +- unsigned long val = atomic_long_read(v); ++ unsigned long val = atomic_long_read_unchecked(v); + + seq_printf(s, "%16lu %s\n", val, id); + } +@@ -134,8 +134,8 @@ static int mcs_statistics_show(struct se + + seq_printf(s, "%-20s%12s%12s%12s\n", "#id", "count", "aver-clks", "max-clks"); + for (op = 0; op < mcsop_last; op++) { +- count = atomic_long_read(&mcs_op_statistics[op].count); +- total = atomic_long_read(&mcs_op_statistics[op].total); ++ count = atomic_long_read_unchecked(&mcs_op_statistics[op].count); ++ total = atomic_long_read_unchecked(&mcs_op_statistics[op].total); + max = mcs_op_statistics[op].max; + seq_printf(s, "%-20s%12ld%12ld%12ld\n", id[op], count, + count ? total / count : 0, max); +diff -urNp linux-2.6.33.1/drivers/misc/sgi-gru/grutables.h linux-2.6.33.1/drivers/misc/sgi-gru/grutables.h +--- linux-2.6.33.1/drivers/misc/sgi-gru/grutables.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/misc/sgi-gru/grutables.h 2010-03-20 16:58:40.124833908 -0400 +@@ -167,82 +167,82 @@ extern unsigned int gru_max_gids; + * GRU statistics. + */ + struct gru_stats_s { +- atomic_long_t vdata_alloc; +- atomic_long_t vdata_free; +- atomic_long_t gts_alloc; +- atomic_long_t gts_free; +- atomic_long_t gms_alloc; +- atomic_long_t gms_free; +- atomic_long_t gts_double_allocate; +- atomic_long_t assign_context; +- atomic_long_t assign_context_failed; +- atomic_long_t free_context; +- atomic_long_t load_user_context; +- atomic_long_t load_kernel_context; +- atomic_long_t lock_kernel_context; +- atomic_long_t unlock_kernel_context; +- atomic_long_t steal_user_context; +- atomic_long_t steal_kernel_context; +- atomic_long_t steal_context_failed; +- atomic_long_t nopfn; +- atomic_long_t asid_new; +- atomic_long_t asid_next; +- atomic_long_t asid_wrap; +- atomic_long_t asid_reuse; +- atomic_long_t intr; +- atomic_long_t intr_cbr; +- atomic_long_t intr_tfh; +- atomic_long_t intr_spurious; +- atomic_long_t intr_mm_lock_failed; +- atomic_long_t call_os; +- atomic_long_t call_os_wait_queue; +- atomic_long_t user_flush_tlb; +- atomic_long_t user_unload_context; +- atomic_long_t user_exception; +- atomic_long_t set_context_option; +- atomic_long_t check_context_retarget_intr; +- atomic_long_t check_context_unload; +- atomic_long_t tlb_dropin; +- atomic_long_t tlb_preload_page; +- atomic_long_t tlb_dropin_fail_no_asid; +- atomic_long_t tlb_dropin_fail_upm; +- atomic_long_t tlb_dropin_fail_invalid; +- atomic_long_t tlb_dropin_fail_range_active; +- atomic_long_t tlb_dropin_fail_idle; +- atomic_long_t tlb_dropin_fail_fmm; +- atomic_long_t tlb_dropin_fail_no_exception; +- atomic_long_t tfh_stale_on_fault; +- atomic_long_t mmu_invalidate_range; +- atomic_long_t mmu_invalidate_page; +- atomic_long_t flush_tlb; +- atomic_long_t flush_tlb_gru; +- atomic_long_t flush_tlb_gru_tgh; +- atomic_long_t flush_tlb_gru_zero_asid; +- +- atomic_long_t copy_gpa; +- atomic_long_t read_gpa; +- +- atomic_long_t mesq_receive; +- atomic_long_t mesq_receive_none; +- atomic_long_t mesq_send; +- atomic_long_t mesq_send_failed; +- atomic_long_t mesq_noop; +- atomic_long_t mesq_send_unexpected_error; +- atomic_long_t mesq_send_lb_overflow; +- atomic_long_t mesq_send_qlimit_reached; +- atomic_long_t mesq_send_amo_nacked; +- atomic_long_t mesq_send_put_nacked; +- atomic_long_t 
mesq_page_overflow; +- atomic_long_t mesq_qf_locked; +- atomic_long_t mesq_qf_noop_not_full; +- atomic_long_t mesq_qf_switch_head_failed; +- atomic_long_t mesq_qf_unexpected_error; +- atomic_long_t mesq_noop_unexpected_error; +- atomic_long_t mesq_noop_lb_overflow; +- atomic_long_t mesq_noop_qlimit_reached; +- atomic_long_t mesq_noop_amo_nacked; +- atomic_long_t mesq_noop_put_nacked; +- atomic_long_t mesq_noop_page_overflow; ++ atomic_long_unchecked_t vdata_alloc; ++ atomic_long_unchecked_t vdata_free; ++ atomic_long_unchecked_t gts_alloc; ++ atomic_long_unchecked_t gts_free; ++ atomic_long_unchecked_t gms_alloc; ++ atomic_long_unchecked_t gms_free; ++ atomic_long_unchecked_t gts_double_allocate; ++ atomic_long_unchecked_t assign_context; ++ atomic_long_unchecked_t assign_context_failed; ++ atomic_long_unchecked_t free_context; ++ atomic_long_unchecked_t load_user_context; ++ atomic_long_unchecked_t load_kernel_context; ++ atomic_long_unchecked_t lock_kernel_context; ++ atomic_long_unchecked_t unlock_kernel_context; ++ atomic_long_unchecked_t steal_user_context; ++ atomic_long_unchecked_t steal_kernel_context; ++ atomic_long_unchecked_t steal_context_failed; ++ atomic_long_unchecked_t nopfn; ++ atomic_long_unchecked_t asid_new; ++ atomic_long_unchecked_t asid_next; ++ atomic_long_unchecked_t asid_wrap; ++ atomic_long_unchecked_t asid_reuse; ++ atomic_long_unchecked_t intr; ++ atomic_long_unchecked_t intr_cbr; ++ atomic_long_unchecked_t intr_tfh; ++ atomic_long_unchecked_t intr_spurious; ++ atomic_long_unchecked_t intr_mm_lock_failed; ++ atomic_long_unchecked_t call_os; ++ atomic_long_unchecked_t call_os_wait_queue; ++ atomic_long_unchecked_t user_flush_tlb; ++ atomic_long_unchecked_t user_unload_context; ++ atomic_long_unchecked_t user_exception; ++ atomic_long_unchecked_t set_context_option; ++ atomic_long_unchecked_t check_context_retarget_intr; ++ atomic_long_unchecked_t check_context_unload; ++ atomic_long_unchecked_t tlb_dropin; ++ atomic_long_unchecked_t tlb_preload_page; ++ atomic_long_unchecked_t tlb_dropin_fail_no_asid; ++ atomic_long_unchecked_t tlb_dropin_fail_upm; ++ atomic_long_unchecked_t tlb_dropin_fail_invalid; ++ atomic_long_unchecked_t tlb_dropin_fail_range_active; ++ atomic_long_unchecked_t tlb_dropin_fail_idle; ++ atomic_long_unchecked_t tlb_dropin_fail_fmm; ++ atomic_long_unchecked_t tlb_dropin_fail_no_exception; ++ atomic_long_unchecked_t tfh_stale_on_fault; ++ atomic_long_unchecked_t mmu_invalidate_range; ++ atomic_long_unchecked_t mmu_invalidate_page; ++ atomic_long_unchecked_t flush_tlb; ++ atomic_long_unchecked_t flush_tlb_gru; ++ atomic_long_unchecked_t flush_tlb_gru_tgh; ++ atomic_long_unchecked_t flush_tlb_gru_zero_asid; ++ ++ atomic_long_unchecked_t copy_gpa; ++ atomic_long_unchecked_t read_gpa; ++ ++ atomic_long_unchecked_t mesq_receive; ++ atomic_long_unchecked_t mesq_receive_none; ++ atomic_long_unchecked_t mesq_send; ++ atomic_long_unchecked_t mesq_send_failed; ++ atomic_long_unchecked_t mesq_noop; ++ atomic_long_unchecked_t mesq_send_unexpected_error; ++ atomic_long_unchecked_t mesq_send_lb_overflow; ++ atomic_long_unchecked_t mesq_send_qlimit_reached; ++ atomic_long_unchecked_t mesq_send_amo_nacked; ++ atomic_long_unchecked_t mesq_send_put_nacked; ++ atomic_long_unchecked_t mesq_page_overflow; ++ atomic_long_unchecked_t mesq_qf_locked; ++ atomic_long_unchecked_t mesq_qf_noop_not_full; ++ atomic_long_unchecked_t mesq_qf_switch_head_failed; ++ atomic_long_unchecked_t mesq_qf_unexpected_error; ++ atomic_long_unchecked_t mesq_noop_unexpected_error; ++ 
atomic_long_unchecked_t mesq_noop_lb_overflow; ++ atomic_long_unchecked_t mesq_noop_qlimit_reached; ++ atomic_long_unchecked_t mesq_noop_amo_nacked; ++ atomic_long_unchecked_t mesq_noop_put_nacked; ++ atomic_long_unchecked_t mesq_noop_page_overflow; + + }; + +@@ -251,8 +251,8 @@ enum mcs_op {cchop_allocate, cchop_start + tghop_invalidate, mcsop_last}; + + struct mcs_op_statistic { +- atomic_long_t count; +- atomic_long_t total; ++ atomic_long_unchecked_t count; ++ atomic_long_unchecked_t total; + unsigned long max; + }; + +@@ -275,7 +275,7 @@ extern struct mcs_op_statistic mcs_op_st + + #define STAT(id) do { \ + if (gru_options & OPT_STATS) \ +- atomic_long_inc(&gru_stats.id); \ ++ atomic_long_inc_unchecked(&gru_stats.id); \ + } while (0) + + #ifdef CONFIG_SGI_GRU_DEBUG +diff -urNp linux-2.6.33.1/drivers/mtd/devices/doc2000.c linux-2.6.33.1/drivers/mtd/devices/doc2000.c +--- linux-2.6.33.1/drivers/mtd/devices/doc2000.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/mtd/devices/doc2000.c 2010-03-20 16:58:40.128534904 -0400 +@@ -776,7 +776,7 @@ static int doc_write(struct mtd_info *mt + + /* The ECC will not be calculated correctly if less than 512 is written */ + /* DBB- +- if (len != 0x200 && eccbuf) ++ if (len != 0x200) + printk(KERN_WARNING + "ECC needs a full sector write (adr: %lx size %lx)\n", + (long) to, (long) len); +diff -urNp linux-2.6.33.1/drivers/mtd/devices/doc2001.c linux-2.6.33.1/drivers/mtd/devices/doc2001.c +--- linux-2.6.33.1/drivers/mtd/devices/doc2001.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/mtd/devices/doc2001.c 2010-03-20 17:13:45.197003644 -0400 +@@ -393,7 +393,7 @@ static int doc_read (struct mtd_info *mt + struct Nand *mychip = &this->chips[from >> (this->chipshift)]; + + /* Don't allow read past end of device */ +- if (from >= this->totlen) ++ if (from >= this->totlen || !len) + return -EINVAL; + + /* Don't allow a single read to cross a 512-byte block boundary */ +diff -urNp linux-2.6.33.1/drivers/mtd/ubi/build.c linux-2.6.33.1/drivers/mtd/ubi/build.c +--- linux-2.6.33.1/drivers/mtd/ubi/build.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/mtd/ubi/build.c 2010-03-20 16:58:40.128534904 -0400 +@@ -1255,7 +1255,7 @@ module_exit(ubi_exit); + static int __init bytes_str_to_int(const char *str) + { + char *endp; +- unsigned long result; ++ unsigned long result, scale = 1; + + result = simple_strtoul(str, &endp, 0); + if (str == endp || result >= INT_MAX) { +@@ -1266,11 +1266,11 @@ static int __init bytes_str_to_int(const + + switch (*endp) { + case 'G': +- result *= 1024; ++ scale *= 1024; + case 'M': +- result *= 1024; ++ scale *= 1024; + case 'K': +- result *= 1024; ++ scale *= 1024; + if (endp[1] == 'i' && endp[2] == 'B') + endp += 2; + case '\0': +@@ -1281,7 +1281,13 @@ static int __init bytes_str_to_int(const + return -EINVAL; + } + +- return result; ++ if ((intoverflow_t)result*scale >= INT_MAX) { ++ printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", ++ str); ++ return -EINVAL; ++ } ++ ++ return result*scale; + } + + /** +diff -urNp linux-2.6.33.1/drivers/net/e1000e/82571.c linux-2.6.33.1/drivers/net/e1000e/82571.c +--- linux-2.6.33.1/drivers/net/e1000e/82571.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/e1000e/82571.c 2010-03-20 16:58:40.152824721 -0400 +@@ -207,6 +207,7 @@ static s32 e1000_init_mac_params_82571(s + { + struct e1000_hw *hw = &adapter->hw; + struct e1000_mac_info *mac = &hw->mac; ++ /* cannot be const */ + struct e1000_mac_operations *func = &mac->ops; +
u32 swsm = 0; + u32 swsm2 = 0; +@@ -1688,7 +1689,7 @@ static void e1000_clear_hw_cntrs_82571(s + er32(ICRXDMTC); + } + +-static struct e1000_mac_operations e82571_mac_ops = { ++static const struct e1000_mac_operations e82571_mac_ops = { + /* .check_mng_mode: mac type dependent */ + /* .check_for_link: media type dependent */ + .id_led_init = e1000e_id_led_init, +@@ -1708,7 +1709,7 @@ static struct e1000_mac_operations e8257 + .setup_led = e1000e_setup_led_generic, + }; + +-static struct e1000_phy_operations e82_phy_ops_igp = { ++static const struct e1000_phy_operations e82_phy_ops_igp = { + .acquire = e1000_get_hw_semaphore_82571, + .check_polarity = e1000_check_polarity_igp, + .check_reset_block = e1000e_check_reset_block_generic, +@@ -1726,7 +1727,7 @@ static struct e1000_phy_operations e82_p + .cfg_on_link_up = NULL, + }; + +-static struct e1000_phy_operations e82_phy_ops_m88 = { ++static const struct e1000_phy_operations e82_phy_ops_m88 = { + .acquire = e1000_get_hw_semaphore_82571, + .check_polarity = e1000_check_polarity_m88, + .check_reset_block = e1000e_check_reset_block_generic, +@@ -1744,7 +1745,7 @@ static struct e1000_phy_operations e82_p + .cfg_on_link_up = NULL, + }; + +-static struct e1000_phy_operations e82_phy_ops_bm = { ++static const struct e1000_phy_operations e82_phy_ops_bm = { + .acquire = e1000_get_hw_semaphore_82571, + .check_polarity = e1000_check_polarity_m88, + .check_reset_block = e1000e_check_reset_block_generic, +@@ -1762,7 +1763,7 @@ static struct e1000_phy_operations e82_p + .cfg_on_link_up = NULL, + }; + +-static struct e1000_nvm_operations e82571_nvm_ops = { ++static const struct e1000_nvm_operations e82571_nvm_ops = { + .acquire = e1000_acquire_nvm_82571, + .read = e1000e_read_nvm_eerd, + .release = e1000_release_nvm_82571, +diff -urNp linux-2.6.33.1/drivers/net/e1000e/e1000.h linux-2.6.33.1/drivers/net/e1000e/e1000.h +--- linux-2.6.33.1/drivers/net/e1000e/e1000.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/e1000e/e1000.h 2010-03-20 16:58:40.152824721 -0400 +@@ -379,9 +379,9 @@ struct e1000_info { + u32 pba; + u32 max_hw_frame_size; + s32 (*get_variants)(struct e1000_adapter *); +- struct e1000_mac_operations *mac_ops; +- struct e1000_phy_operations *phy_ops; +- struct e1000_nvm_operations *nvm_ops; ++ const struct e1000_mac_operations *mac_ops; ++ const struct e1000_phy_operations *phy_ops; ++ const struct e1000_nvm_operations *nvm_ops; + }; + + /* hardware capability, feature, and workaround flags */ +diff -urNp linux-2.6.33.1/drivers/net/e1000e/es2lan.c linux-2.6.33.1/drivers/net/e1000e/es2lan.c +--- linux-2.6.33.1/drivers/net/e1000e/es2lan.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/e1000e/es2lan.c 2010-03-20 16:58:40.152824721 -0400 +@@ -205,6 +205,7 @@ static s32 e1000_init_mac_params_80003es + { + struct e1000_hw *hw = &adapter->hw; + struct e1000_mac_info *mac = &hw->mac; ++ /* cannot be const */ + struct e1000_mac_operations *func = &mac->ops; + + /* Set media type */ +@@ -1402,7 +1403,7 @@ static void e1000_clear_hw_cntrs_80003es + er32(ICRXDMTC); + } + +-static struct e1000_mac_operations es2_mac_ops = { ++static const struct e1000_mac_operations es2_mac_ops = { + .id_led_init = e1000e_id_led_init, + .check_mng_mode = e1000e_check_mng_mode_generic, + /* check_for_link dependent on media type */ +@@ -1422,7 +1423,7 @@ static struct e1000_mac_operations es2_m + .setup_led = e1000e_setup_led_generic, + }; + +-static struct e1000_phy_operations es2_phy_ops = { ++static const struct 
e1000_phy_operations es2_phy_ops = { + .acquire = e1000_acquire_phy_80003es2lan, + .check_polarity = e1000_check_polarity_m88, + .check_reset_block = e1000e_check_reset_block_generic, +@@ -1440,7 +1441,7 @@ static struct e1000_phy_operations es2_p + .cfg_on_link_up = e1000_cfg_on_link_up_80003es2lan, + }; + +-static struct e1000_nvm_operations es2_nvm_ops = { ++static const struct e1000_nvm_operations es2_nvm_ops = { + .acquire = e1000_acquire_nvm_80003es2lan, + .read = e1000e_read_nvm_eerd, + .release = e1000_release_nvm_80003es2lan, +diff -urNp linux-2.6.33.1/drivers/net/e1000e/hw.h linux-2.6.33.1/drivers/net/e1000e/hw.h +--- linux-2.6.33.1/drivers/net/e1000e/hw.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/e1000e/hw.h 2010-03-20 16:58:40.156835598 -0400 +@@ -783,13 +783,13 @@ struct e1000_phy_operations { + + /* Function pointers for the NVM. */ + struct e1000_nvm_operations { +- s32 (*acquire)(struct e1000_hw *); +- s32 (*read)(struct e1000_hw *, u16, u16, u16 *); +- void (*release)(struct e1000_hw *); +- s32 (*update)(struct e1000_hw *); +- s32 (*valid_led_default)(struct e1000_hw *, u16 *); +- s32 (*validate)(struct e1000_hw *); +- s32 (*write)(struct e1000_hw *, u16, u16, u16 *); ++ s32 (* const acquire)(struct e1000_hw *); ++ s32 (* const read)(struct e1000_hw *, u16, u16, u16 *); ++ void (* const release)(struct e1000_hw *); ++ s32 (* const update)(struct e1000_hw *); ++ s32 (* const valid_led_default)(struct e1000_hw *, u16 *); ++ s32 (* const validate)(struct e1000_hw *); ++ s32 (* const write)(struct e1000_hw *, u16, u16, u16 *); + }; + + struct e1000_mac_info { +@@ -864,6 +864,7 @@ struct e1000_phy_info { + }; + + struct e1000_nvm_info { ++ /* cannot be const */ + struct e1000_nvm_operations ops; + + enum e1000_nvm_type type; +diff -urNp linux-2.6.33.1/drivers/net/e1000e/ich8lan.c linux-2.6.33.1/drivers/net/e1000e/ich8lan.c +--- linux-2.6.33.1/drivers/net/e1000e/ich8lan.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/e1000e/ich8lan.c 2010-03-20 16:58:40.168822418 -0400 +@@ -3361,7 +3361,7 @@ static void e1000_clear_hw_cntrs_ich8lan + } + } + +-static struct e1000_mac_operations ich8_mac_ops = { ++static const struct e1000_mac_operations ich8_mac_ops = { + .id_led_init = e1000e_id_led_init, + .check_mng_mode = e1000_check_mng_mode_ich8lan, + .check_for_link = e1000_check_for_copper_link_ich8lan, +@@ -3379,7 +3379,7 @@ static struct e1000_mac_operations ich8_ + /* id_led_init dependent on mac type */ + }; + +-static struct e1000_phy_operations ich8_phy_ops = { ++static const struct e1000_phy_operations ich8_phy_ops = { + .acquire = e1000_acquire_swflag_ich8lan, + .check_reset_block = e1000_check_reset_block_ich8lan, + .commit = NULL, +@@ -3393,7 +3393,7 @@ static struct e1000_phy_operations ich8_ + .write_reg = e1000e_write_phy_reg_igp, + }; + +-static struct e1000_nvm_operations ich8_nvm_ops = { ++static const struct e1000_nvm_operations ich8_nvm_ops = { + .acquire = e1000_acquire_nvm_ich8lan, + .read = e1000_read_nvm_ich8lan, + .release = e1000_release_nvm_ich8lan, +diff -urNp linux-2.6.33.1/drivers/net/ibmveth.c linux-2.6.33.1/drivers/net/ibmveth.c +--- linux-2.6.33.1/drivers/net/ibmveth.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/ibmveth.c 2010-03-20 16:58:40.180852292 -0400 +@@ -1577,7 +1577,7 @@ static struct attribute * veth_pool_attr + NULL, + }; + +-static struct sysfs_ops veth_pool_ops = { ++static const struct sysfs_ops veth_pool_ops = { + .show = veth_pool_show, + .store = veth_pool_store, + }; 
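
The constification hunks above and below (sysfs_ops, e1000_mac_operations, e1000_phy_operations, e1000_nvm_operations, later iwl_ops and the various fops tables) all apply one grsecurity/PaX hardening idea: an ops table declared const is placed in the read-only data section, so its function pointers can no longer be overwritten at runtime to hijack a driver. A minimal user-space sketch of the idea, in plain C with hypothetical names (demo_ops, demo_show) that appear nowhere in the patch:

    #include <stdio.h>

    struct demo_ops {
            void (*show)(void);     /* the pointer an attacker would target */
    };

    static void demo_show(void)
    {
            puts("show called");
    }

    /* 'const' moves the table into .rodata: a stray or malicious write
     * to demo_ops.show now faults instead of redirecting control. */
    static const struct demo_ops demo_ops = {
            .show = demo_show,
    };

    int main(void)
    {
            demo_ops.show();        /* reads are unaffected */
            return 0;
    }

The recurring /* cannot be const */ comments mark the exception: ops structs that are embedded and filled in per device at probe time (e.g. e1000_nvm_info.ops) must stay writable, which is why the patch instead constifies their individual function-pointer members (s32 (* const acquire)(...)).
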
+diff -urNp linux-2.6.33.1/drivers/net/igb/e1000_82575.c linux-2.6.33.1/drivers/net/igb/e1000_82575.c +--- linux-2.6.33.1/drivers/net/igb/e1000_82575.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/igb/e1000_82575.c 2010-03-20 16:58:40.192835070 -0400 +@@ -1583,7 +1583,7 @@ u16 igb_rxpbs_adjust_82580(u32 data) + return ret_val; + } + +-static struct e1000_mac_operations e1000_mac_ops_82575 = { ++static const struct e1000_mac_operations e1000_mac_ops_82575 = { + .init_hw = igb_init_hw_82575, + .check_for_link = igb_check_for_link_82575, + .rar_set = igb_rar_set, +@@ -1591,13 +1591,13 @@ static struct e1000_mac_operations e1000 + .get_speed_and_duplex = igb_get_speed_and_duplex_copper, + }; + +-static struct e1000_phy_operations e1000_phy_ops_82575 = { ++static const struct e1000_phy_operations e1000_phy_ops_82575 = { + .acquire = igb_acquire_phy_82575, + .get_cfg_done = igb_get_cfg_done_82575, + .release = igb_release_phy_82575, + }; + +-static struct e1000_nvm_operations e1000_nvm_ops_82575 = { ++static const struct e1000_nvm_operations e1000_nvm_ops_82575 = { + .acquire = igb_acquire_nvm_82575, + .read = igb_read_nvm_eerd, + .release = igb_release_nvm_82575, +diff -urNp linux-2.6.33.1/drivers/net/igb/e1000_hw.h linux-2.6.33.1/drivers/net/igb/e1000_hw.h +--- linux-2.6.33.1/drivers/net/igb/e1000_hw.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/igb/e1000_hw.h 2010-03-20 16:58:40.192835070 -0400 +@@ -316,17 +316,17 @@ struct e1000_phy_operations { + }; + + struct e1000_nvm_operations { +- s32 (*acquire)(struct e1000_hw *); +- s32 (*read)(struct e1000_hw *, u16, u16, u16 *); +- void (*release)(struct e1000_hw *); +- s32 (*write)(struct e1000_hw *, u16, u16, u16 *); ++ s32 (* const acquire)(struct e1000_hw *); ++ s32 (* const read)(struct e1000_hw *, u16, u16, u16 *); ++ void (* const release)(struct e1000_hw *); ++ s32 (* const write)(struct e1000_hw *, u16, u16, u16 *); + }; + + struct e1000_info { + s32 (*get_invariants)(struct e1000_hw *); +- struct e1000_mac_operations *mac_ops; +- struct e1000_phy_operations *phy_ops; +- struct e1000_nvm_operations *nvm_ops; ++ const struct e1000_mac_operations *mac_ops; ++ const struct e1000_phy_operations *phy_ops; ++ const struct e1000_nvm_operations *nvm_ops; + }; + + extern const struct e1000_info e1000_82575_info; +@@ -412,6 +412,7 @@ struct e1000_phy_info { + }; + + struct e1000_nvm_info { ++ /* cannot be const */ + struct e1000_nvm_operations ops; + + enum e1000_nvm_type type; +diff -urNp linux-2.6.33.1/drivers/net/irda/vlsi_ir.c linux-2.6.33.1/drivers/net/irda/vlsi_ir.c +--- linux-2.6.33.1/drivers/net/irda/vlsi_ir.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/irda/vlsi_ir.c 2010-03-20 16:58:40.204836200 -0400 +@@ -907,13 +907,12 @@ static netdev_tx_t vlsi_hard_start_xmit( + /* no race - tx-ring already empty */ + vlsi_set_baud(idev, iobase); + netif_wake_queue(ndev); +- } +- else +- ; ++ } else { + /* keep the speed change pending like it would + * for any len>0 packet. tx completion interrupt + * will apply it when the tx ring becomes empty. 
+ */ ++ } + spin_unlock_irqrestore(&idev->lock, flags); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +diff -urNp linux-2.6.33.1/drivers/net/iseries_veth.c linux-2.6.33.1/drivers/net/iseries_veth.c +--- linux-2.6.33.1/drivers/net/iseries_veth.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/iseries_veth.c 2010-03-20 16:58:40.227948499 -0400 +@@ -384,7 +384,7 @@ static struct attribute *veth_cnx_defaul + NULL + }; + +-static struct sysfs_ops veth_cnx_sysfs_ops = { ++static const struct sysfs_ops veth_cnx_sysfs_ops = { + .show = veth_cnx_attribute_show + }; + +@@ -441,7 +441,7 @@ static struct attribute *veth_port_defau + NULL + }; + +-static struct sysfs_ops veth_port_sysfs_ops = { ++static const struct sysfs_ops veth_port_sysfs_ops = { + .show = veth_port_attribute_show + }; + +diff -urNp linux-2.6.33.1/drivers/net/pcnet32.c linux-2.6.33.1/drivers/net/pcnet32.c +--- linux-2.6.33.1/drivers/net/pcnet32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/pcnet32.c 2010-03-20 16:58:40.232840159 -0400 +@@ -80,7 +80,7 @@ static int cards_found; + /* + * VLB I/O addresses + */ +-static unsigned int pcnet32_portlist[] __initdata = ++static unsigned int pcnet32_portlist[] __devinitdata = + { 0x300, 0x320, 0x340, 0x360, 0 }; + + static int pcnet32_debug = 0; +diff -urNp linux-2.6.33.1/drivers/net/ppp_generic.c linux-2.6.33.1/drivers/net/ppp_generic.c +--- linux-2.6.33.1/drivers/net/ppp_generic.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/ppp_generic.c 2010-03-20 16:58:40.236519222 -0400 +@@ -988,7 +988,6 @@ ppp_net_ioctl(struct net_device *dev, st + void __user *addr = (void __user *) ifr->ifr_ifru.ifru_data; + struct ppp_stats stats; + struct ppp_comp_stats cstats; +- char *vers; + + switch (cmd) { + case SIOCGPPPSTATS: +@@ -1010,8 +1009,7 @@ ppp_net_ioctl(struct net_device *dev, st + break; + + case SIOCGPPPVER: +- vers = PPP_VERSION; +- if (copy_to_user(addr, vers, strlen(vers) + 1)) ++ if (copy_to_user(addr, PPP_VERSION, sizeof(PPP_VERSION))) + break; + err = 0; + break; +diff -urNp linux-2.6.33.1/drivers/net/tg3.h linux-2.6.33.1/drivers/net/tg3.h +--- linux-2.6.33.1/drivers/net/tg3.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/tg3.h 2010-03-20 16:58:40.244857185 -0400 +@@ -101,6 +101,7 @@ + #define CHIPREV_ID_5750_A0 0x4000 + #define CHIPREV_ID_5750_A1 0x4001 + #define CHIPREV_ID_5750_A3 0x4003 ++#define CHIPREV_ID_5750_C1 0x4201 + #define CHIPREV_ID_5750_C2 0x4202 + #define CHIPREV_ID_5752_A0_HW 0x5000 + #define CHIPREV_ID_5752_A0 0x6000 +diff -urNp linux-2.6.33.1/drivers/net/tulip/de4x5.c linux-2.6.33.1/drivers/net/tulip/de4x5.c +--- linux-2.6.33.1/drivers/net/tulip/de4x5.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/tulip/de4x5.c 2010-03-20 16:58:40.269912385 -0400 +@@ -5472,7 +5472,7 @@ de4x5_ioctl(struct net_device *dev, stru + for (i=0; i<ETH_ALEN; i++) { + tmp.addr[i] = dev->dev_addr[i]; + } +- if (copy_to_user(ioc->data, tmp.addr, ioc->len)) return -EFAULT; ++ if (ioc->len > sizeof(tmp.addr) || copy_to_user(ioc->data, tmp.addr, ioc->len)) return -EFAULT; + break; + + case DE4X5_SET_HWADDR: /* Set the hardware address */ +@@ -5512,7 +5512,7 @@ de4x5_ioctl(struct net_device *dev, stru + spin_lock_irqsave(&lp->lock, flags); + memcpy(&statbuf, &lp->pktStats, ioc->len); + spin_unlock_irqrestore(&lp->lock, flags); +- if (copy_to_user(ioc->data, &statbuf, ioc->len)) ++ if (ioc->len > sizeof(statbuf) || copy_to_user(ioc->data, &statbuf, ioc->len)) + return -EFAULT; + break; + } 
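
The two de4x5_ioctl hunks just above add a bounds check before copy_to_user() because ioc->len comes straight from user space while tmp.addr and statbuf are fixed-size kernel objects; an oversized length would otherwise copy adjacent kernel memory out to the caller. A self-contained sketch of the guard, assuming a hypothetical reply_to_user() helper standing in for the real ioctl path:

    #include <string.h>
    #include <stddef.h>

    #define EFAULT 14                   /* stand-in for the kernel errno */

    static int reply_to_user(void *user_dst, size_t user_len)
    {
            unsigned char addr[6];      /* fixed-size buffer, e.g. a MAC address */

            memset(addr, 0, sizeof(addr));
            if (user_len > sizeof(addr))        /* the guard the patch adds */
                    return -EFAULT;
            memcpy(user_dst, addr, user_len);   /* copy_to_user() in the kernel */
            return 0;
    }

    int main(void)
    {
            unsigned char out[4];
            return reply_to_user(out, sizeof(out));     /* ok: 4 <= 6 */
    }
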
+diff -urNp linux-2.6.33.1/drivers/net/usb/hso.c linux-2.6.33.1/drivers/net/usb/hso.c +--- linux-2.6.33.1/drivers/net/usb/hso.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/usb/hso.c 2010-03-20 16:58:40.272671924 -0400 +@@ -258,7 +258,7 @@ struct hso_serial { + + /* from usb_serial_port */ + struct tty_struct *tty; +- int open_count; ++ atomic_t open_count; + spinlock_t serial_lock; + + int (*write_data) (struct hso_serial *serial); +@@ -1203,7 +1203,7 @@ static void put_rxbuf_data_and_resubmit_ + struct urb *urb; + + urb = serial->rx_urb[0]; +- if (serial->open_count > 0) { ++ if (atomic_read(&serial->open_count) > 0) { + count = put_rxbuf_data(urb, serial); + if (count == -1) + return; +@@ -1239,7 +1239,7 @@ static void hso_std_serial_read_bulk_cal + DUMP1(urb->transfer_buffer, urb->actual_length); + + /* Anyone listening? */ +- if (serial->open_count == 0) ++ if (atomic_read(&serial->open_count) == 0) + return; + + if (status == 0) { +@@ -1334,8 +1334,7 @@ static int hso_serial_open(struct tty_st + spin_unlock_irq(&serial->serial_lock); + + /* check for port already opened, if not set the termios */ +- serial->open_count++; +- if (serial->open_count == 1) { ++ if (atomic_inc_return(&serial->open_count) == 1) { + tty->low_latency = 1; + serial->rx_state = RX_IDLE; + /* Force default termio settings */ +@@ -1348,7 +1347,7 @@ static int hso_serial_open(struct tty_st + result = hso_start_serial_device(serial->parent, GFP_KERNEL); + if (result) { + hso_stop_serial_device(serial->parent); +- serial->open_count--; ++ atomic_dec(&serial->open_count); + kref_put(&serial->parent->ref, hso_serial_ref_free); + } + } else { +@@ -1385,10 +1384,10 @@ static void hso_serial_close(struct tty_ + + /* reset the rts and dtr */ + /* do the actual close */ +- serial->open_count--; ++ atomic_dec(&serial->open_count); + +- if (serial->open_count <= 0) { +- serial->open_count = 0; ++ if (atomic_read(&serial->open_count) <= 0) { ++ atomic_set(&serial->open_count, 0); + spin_lock_irq(&serial->serial_lock); + if (serial->tty == tty) { + serial->tty->driver_data = NULL; +@@ -1470,7 +1469,7 @@ static void hso_serial_set_termios(struc + + /* the actual setup */ + spin_lock_irqsave(&serial->serial_lock, flags); +- if (serial->open_count) ++ if (atomic_read(&serial->open_count)) + _hso_serial_set_termios(tty, old); + else + tty->termios = old; +@@ -1933,7 +1932,7 @@ static void intr_callback(struct urb *ur + D1("Pending read interrupt on port %d\n", i); + spin_lock(&serial->serial_lock); + if (serial->rx_state == RX_IDLE && +- serial->open_count > 0) { ++ atomic_read(&serial->open_count) > 0) { + /* Setup and send a ctrl req read on + * port i */ + if (!serial->rx_urb_filled[0]) { +@@ -3124,7 +3123,7 @@ static int hso_resume(struct usb_interfa + /* Start all serial ports */ + for (i = 0; i < HSO_SERIAL_TTY_MINORS; i++) { + if (serial_table[i] && (serial_table[i]->interface == iface)) { +- if (dev2ser(serial_table[i])->open_count) { ++ if (atomic_read(&dev2ser(serial_table[i])->open_count)) { + result = + hso_start_serial_device(serial_table[i], GFP_NOIO); + hso_kick_transmit(dev2ser(serial_table[i])); +diff -urNp linux-2.6.33.1/drivers/net/wireless/b43/debugfs.c linux-2.6.33.1/drivers/net/wireless/b43/debugfs.c +--- linux-2.6.33.1/drivers/net/wireless/b43/debugfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/b43/debugfs.c 2010-03-20 16:58:40.276838321 -0400 +@@ -43,7 +43,7 @@ static struct dentry *rootdir; + struct b43_debugfs_fops { + ssize_t (*read)(struct 
b43_wldev *dev, char *buf, size_t bufsize); + int (*write)(struct b43_wldev *dev, const char *buf, size_t count); +- struct file_operations fops; ++ const struct file_operations fops; + /* Offset of struct b43_dfs_file in struct b43_dfsentry */ + size_t file_struct_offset; + }; +diff -urNp linux-2.6.33.1/drivers/net/wireless/b43legacy/debugfs.c linux-2.6.33.1/drivers/net/wireless/b43legacy/debugfs.c +--- linux-2.6.33.1/drivers/net/wireless/b43legacy/debugfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/b43legacy/debugfs.c 2010-03-20 16:58:40.292572897 -0400 +@@ -44,7 +44,7 @@ static struct dentry *rootdir; + struct b43legacy_debugfs_fops { + ssize_t (*read)(struct b43legacy_wldev *dev, char *buf, size_t bufsize); + int (*write)(struct b43legacy_wldev *dev, const char *buf, size_t count); +- struct file_operations fops; ++ const struct file_operations fops; + /* Offset of struct b43legacy_dfs_file in struct b43legacy_dfsentry */ + size_t file_struct_offset; + /* Take wl->irq_lock before calling read/write? */ +diff -urNp linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-1000.c linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-1000.c +--- linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-1000.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-1000.c 2010-03-20 16:58:40.296852154 -0400 +@@ -140,7 +140,7 @@ static struct iwl_lib_ops iwl1000_lib = + }, + }; + +-static struct iwl_ops iwl1000_ops = { ++static const struct iwl_ops iwl1000_ops = { + .ucode = &iwl5000_ucode, + .lib = &iwl1000_lib, + .hcmd = &iwl5000_hcmd, +diff -urNp linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-3945.c linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-3945.c +--- linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-3945.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-3945.c 2010-03-20 16:58:40.308852389 -0400 +@@ -2804,7 +2804,7 @@ static struct iwl_hcmd_utils_ops iwl3945 + .rts_tx_cmd_flag = iwlcore_rts_tx_cmd_flag, + }; + +-static struct iwl_ops iwl3945_ops = { ++static const struct iwl_ops iwl3945_ops = { + .ucode = &iwl3945_ucode, + .lib = &iwl3945_lib, + .hcmd = &iwl3945_hcmd, +diff -urNp linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-4965.c linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-4965.c +--- linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-4965.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-4965.c 2010-03-20 16:58:40.316840649 -0400 +@@ -2208,7 +2208,7 @@ static struct iwl_lib_ops iwl4965_lib = + }, + }; + +-static struct iwl_ops iwl4965_ops = { ++static const struct iwl_ops iwl4965_ops = { + .ucode = &iwl4965_ucode, + .lib = &iwl4965_lib, + .hcmd = &iwl4965_hcmd, +diff -urNp linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-5000.c linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-5000.c +--- linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-5000.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-5000.c 2010-03-20 16:58:40.338443044 -0400 +@@ -1553,7 +1553,7 @@ static struct iwl_lib_ops iwl5150_lib = + }, + }; + +-static struct iwl_ops iwl5000_ops = { ++static const struct iwl_ops iwl5000_ops = { + .ucode = &iwl5000_ucode, + .lib = &iwl5000_lib, + .hcmd = &iwl5000_hcmd, +@@ -1561,7 +1561,7 @@ static struct iwl_ops iwl5000_ops = { + .led = &iwlagn_led_ops, + }; + +-static struct iwl_ops iwl5150_ops = { ++static const struct iwl_ops iwl5150_ops = { + .ucode = &iwl5000_ucode, + .lib = &iwl5150_lib, + .hcmd = &iwl5000_hcmd, 
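
Alongside the constification, the patch converts pure statistics counters — the GRU counters at the top of this excerpt, and the oprofile and libfc ones further down — from atomic_t/atomic_long_t to the _unchecked variants. Under PaX's REFCOUNT feature the plain atomic_inc() traps on overflow to stop reference-count wraps; counters that may legitimately wrap opt out via atomic_inc_unchecked() so they cannot raise false alarms. A rough user-space illustration with C11 atomics (checked_inc and unchecked_inc are illustrative helpers, not the kernel API, which does this check in inline assembly):

    #include <stdatomic.h>
    #include <limits.h>
    #include <stdlib.h>

    static void checked_inc(atomic_int *v)
    {
            /* atomic_fetch_add returns the old value; old == INT_MAX means
             * the counter just wrapped, which REFCOUNT treats as fatal. */
            if (atomic_fetch_add(v, 1) == INT_MAX)
                    abort();
    }

    static void unchecked_inc(atomic_int *v)
    {
            atomic_fetch_add(v, 1);     /* wrap-around is harmless for stats */
    }

    int main(void)
    {
            atomic_int refcount = 0;
            atomic_int stat = INT_MAX;

            checked_inc(&refcount);     /* fine: far from overflow */
            unchecked_inc(&stat);       /* wraps, by design */
            return 0;
    }
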
+diff -urNp linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-6000.c linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-6000.c +--- linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-6000.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/iwlwifi/iwl-6000.c 2010-03-20 16:58:40.338443044 -0400 +@@ -252,7 +252,7 @@ static struct iwl_lib_ops iwl6000_lib = + }, + }; + +-static struct iwl_ops iwl6000_ops = { ++static const struct iwl_ops iwl6000_ops = { + .ucode = &iwl5000_ucode, + .lib = &iwl6000_lib, + .hcmd = &iwl5000_hcmd, +@@ -267,7 +267,7 @@ static struct iwl_hcmd_utils_ops iwl6050 + .calc_rssi = iwl5000_calc_rssi, + }; + +-static struct iwl_ops iwl6050_ops = { ++static const struct iwl_ops iwl6050_ops = { + .ucode = &iwl5000_ucode, + .lib = &iwl6000_lib, + .hcmd = &iwl5000_hcmd, +diff -urNp linux-2.6.33.1/drivers/net/wireless/libertas/debugfs.c linux-2.6.33.1/drivers/net/wireless/libertas/debugfs.c +--- linux-2.6.33.1/drivers/net/wireless/libertas/debugfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/net/wireless/libertas/debugfs.c 2010-03-20 16:58:40.348846186 -0400 +@@ -717,7 +717,7 @@ out_unlock: + struct lbs_debugfs_files { + const char *name; + int perm; +- struct file_operations fops; ++ const struct file_operations fops; + }; + + static const struct lbs_debugfs_files debugfs_files[] = { +diff -urNp linux-2.6.33.1/drivers/oprofile/buffer_sync.c linux-2.6.33.1/drivers/oprofile/buffer_sync.c +--- linux-2.6.33.1/drivers/oprofile/buffer_sync.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/oprofile/buffer_sync.c 2010-03-20 16:58:40.357847963 -0400 +@@ -340,7 +340,7 @@ static void add_data(struct op_entry *en + if (cookie == NO_COOKIE) + offset = pc; + if (cookie == INVALID_COOKIE) { +- atomic_inc(&oprofile_stats.sample_lost_no_mapping); ++ atomic_inc_unchecked(&oprofile_stats.sample_lost_no_mapping); + offset = pc; + } + if (cookie != last_cookie) { +@@ -384,14 +384,14 @@ add_sample(struct mm_struct *mm, struct + /* add userspace sample */ + + if (!mm) { +- atomic_inc(&oprofile_stats.sample_lost_no_mm); ++ atomic_inc_unchecked(&oprofile_stats.sample_lost_no_mm); + return 0; + } + + cookie = lookup_dcookie(mm, s->eip, &offset); + + if (cookie == INVALID_COOKIE) { +- atomic_inc(&oprofile_stats.sample_lost_no_mapping); ++ atomic_inc_unchecked(&oprofile_stats.sample_lost_no_mapping); + return 0; + } + +@@ -560,7 +560,7 @@ void sync_buffer(int cpu) + /* ignore backtraces if failed to add a sample */ + if (state == sb_bt_start) { + state = sb_bt_ignore; +- atomic_inc(&oprofile_stats.bt_lost_no_mapping); ++ atomic_inc_unchecked(&oprofile_stats.bt_lost_no_mapping); + } + } + release_mm(mm); +diff -urNp linux-2.6.33.1/drivers/oprofile/event_buffer.c linux-2.6.33.1/drivers/oprofile/event_buffer.c +--- linux-2.6.33.1/drivers/oprofile/event_buffer.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/oprofile/event_buffer.c 2010-03-20 16:58:40.360587811 -0400 +@@ -53,7 +53,7 @@ void add_event_entry(unsigned long value + } + + if (buffer_pos == buffer_size) { +- atomic_inc(&oprofile_stats.event_lost_overflow); ++ atomic_inc_unchecked(&oprofile_stats.event_lost_overflow); + return; + } + +diff -urNp linux-2.6.33.1/drivers/oprofile/oprof.c linux-2.6.33.1/drivers/oprofile/oprof.c +--- linux-2.6.33.1/drivers/oprofile/oprof.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/oprofile/oprof.c 2010-03-20 16:58:40.368849108 -0400 +@@ -110,7 +110,7 @@ static void switch_worker(struct work_st + if 
(oprofile_ops.switch_events()) + return; + +- atomic_inc(&oprofile_stats.multiplex_counter); ++ atomic_inc_unchecked(&oprofile_stats.multiplex_counter); + start_switch_worker(); + } + +diff -urNp linux-2.6.33.1/drivers/oprofile/oprofilefs.c linux-2.6.33.1/drivers/oprofile/oprofilefs.c +--- linux-2.6.33.1/drivers/oprofile/oprofilefs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/oprofile/oprofilefs.c 2010-03-20 16:58:40.372843891 -0400 +@@ -187,7 +187,7 @@ static const struct file_operations atom + + + int oprofilefs_create_ro_atomic(struct super_block *sb, struct dentry *root, +- char const *name, atomic_t *val) ++ char const *name, atomic_unchecked_t *val) + { + struct dentry *d = __oprofilefs_create_file(sb, root, name, + &atomic_ro_fops, 0444); +diff -urNp linux-2.6.33.1/drivers/oprofile/oprofile_stats.c linux-2.6.33.1/drivers/oprofile/oprofile_stats.c +--- linux-2.6.33.1/drivers/oprofile/oprofile_stats.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/oprofile/oprofile_stats.c 2010-03-20 16:58:40.376630331 -0400 +@@ -30,11 +30,11 @@ void oprofile_reset_stats(void) + cpu_buf->sample_invalid_eip = 0; + } + +- atomic_set(&oprofile_stats.sample_lost_no_mm, 0); +- atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); +- atomic_set(&oprofile_stats.event_lost_overflow, 0); +- atomic_set(&oprofile_stats.bt_lost_no_mapping, 0); +- atomic_set(&oprofile_stats.multiplex_counter, 0); ++ atomic_set_unchecked(&oprofile_stats.sample_lost_no_mm, 0); ++ atomic_set_unchecked(&oprofile_stats.sample_lost_no_mapping, 0); ++ atomic_set_unchecked(&oprofile_stats.event_lost_overflow, 0); ++ atomic_set_unchecked(&oprofile_stats.bt_lost_no_mapping, 0); ++ atomic_set_unchecked(&oprofile_stats.multiplex_counter, 0); + } + + +diff -urNp linux-2.6.33.1/drivers/oprofile/oprofile_stats.h linux-2.6.33.1/drivers/oprofile/oprofile_stats.h +--- linux-2.6.33.1/drivers/oprofile/oprofile_stats.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/oprofile/oprofile_stats.h 2010-03-20 16:58:40.376630331 -0400 +@@ -13,11 +13,11 @@ + #include <asm/atomic.h> + + struct oprofile_stat_struct { +- atomic_t sample_lost_no_mm; +- atomic_t sample_lost_no_mapping; +- atomic_t bt_lost_no_mapping; +- atomic_t event_lost_overflow; +- atomic_t multiplex_counter; ++ atomic_unchecked_t sample_lost_no_mm; ++ atomic_unchecked_t sample_lost_no_mapping; ++ atomic_unchecked_t bt_lost_no_mapping; ++ atomic_unchecked_t event_lost_overflow; ++ atomic_unchecked_t multiplex_counter; + }; + + extern struct oprofile_stat_struct oprofile_stats; +diff -urNp linux-2.6.33.1/drivers/parisc/pdc_stable.c linux-2.6.33.1/drivers/parisc/pdc_stable.c +--- linux-2.6.33.1/drivers/parisc/pdc_stable.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/parisc/pdc_stable.c 2010-03-20 16:58:40.392862259 -0400 +@@ -481,7 +481,7 @@ pdcspath_attr_store(struct kobject *kobj + return ret; + } + +-static struct sysfs_ops pdcspath_attr_ops = { ++static const struct sysfs_ops pdcspath_attr_ops = { + .show = pdcspath_attr_show, + .store = pdcspath_attr_store, + }; +diff -urNp linux-2.6.33.1/drivers/parport/procfs.c linux-2.6.33.1/drivers/parport/procfs.c +--- linux-2.6.33.1/drivers/parport/procfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/parport/procfs.c 2010-03-20 16:58:40.392862259 -0400 +@@ -64,7 +64,7 @@ static int do_active_device(ctl_table *t + + *ppos += len; + +- return copy_to_user(result, buffer, len) ? -EFAULT : 0; ++ return (len > sizeof(buffer) || copy_to_user(result, buffer, len)) ? 
-EFAULT : 0; + } + + #ifdef CONFIG_PARPORT_1284 +@@ -106,7 +106,7 @@ static int do_autoprobe(ctl_table *table + + *ppos += len; + +- return copy_to_user (result, buffer, len) ? -EFAULT : 0; ++ return (len > sizeof(buffer) || copy_to_user (result, buffer, len)) ? -EFAULT : 0; + } + #endif /* IEEE1284.3 support. */ + +diff -urNp linux-2.6.33.1/drivers/pci/hotplug/acpiphp_glue.c linux-2.6.33.1/drivers/pci/hotplug/acpiphp_glue.c +--- linux-2.6.33.1/drivers/pci/hotplug/acpiphp_glue.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pci/hotplug/acpiphp_glue.c 2010-03-20 16:58:40.418811417 -0400 +@@ -109,7 +109,7 @@ static int post_dock_fixups(struct notif + } + + +-static struct acpi_dock_ops acpiphp_dock_ops = { ++static const struct acpi_dock_ops acpiphp_dock_ops = { + .handler = handle_hotplug_event_func, + }; + +diff -urNp linux-2.6.33.1/drivers/pci/hotplug/cpqphp_nvram.c linux-2.6.33.1/drivers/pci/hotplug/cpqphp_nvram.c +--- linux-2.6.33.1/drivers/pci/hotplug/cpqphp_nvram.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pci/hotplug/cpqphp_nvram.c 2010-03-20 16:58:40.420729851 -0400 +@@ -428,9 +428,13 @@ static u32 store_HRT (void __iomem *rom_ + + void compaq_nvram_init (void __iomem *rom_start) + { ++ ++#ifndef CONFIG_PAX_KERNEXEC + if (rom_start) { + compaq_int15_entry_point = (rom_start + ROM_INT15_PHY_ADDR - ROM_PHY_ADDR); + } ++#endif ++ + dbg("int15 entry = %p\n", compaq_int15_entry_point); + + /* initialize our int15 lock */ +diff -urNp linux-2.6.33.1/drivers/pci/hotplug/fakephp.c linux-2.6.33.1/drivers/pci/hotplug/fakephp.c +--- linux-2.6.33.1/drivers/pci/hotplug/fakephp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pci/hotplug/fakephp.c 2010-03-20 16:58:40.420729851 -0400 +@@ -73,7 +73,7 @@ static void legacy_release(struct kobjec + } + + static struct kobj_type legacy_ktype = { +- .sysfs_ops = &(struct sysfs_ops){ ++ .sysfs_ops = &(const struct sysfs_ops){ + .store = legacy_store, .show = legacy_show + }, + .release = &legacy_release, +diff -urNp linux-2.6.33.1/drivers/pci/intel-iommu.c linux-2.6.33.1/drivers/pci/intel-iommu.c +--- linux-2.6.33.1/drivers/pci/intel-iommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pci/intel-iommu.c 2010-03-20 16:58:40.428854036 -0400 +@@ -2940,7 +2940,7 @@ static int intel_mapping_error(struct de + return !dma_addr; + } + +-struct dma_map_ops intel_dma_ops = { ++const struct dma_map_ops intel_dma_ops = { + .alloc_coherent = intel_alloc_coherent, + .free_coherent = intel_free_coherent, + .map_sg = intel_map_sg, +diff -urNp linux-2.6.33.1/drivers/pci/pcie/portdrv_pci.c linux-2.6.33.1/drivers/pci/pcie/portdrv_pci.c +--- linux-2.6.33.1/drivers/pci/pcie/portdrv_pci.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pci/pcie/portdrv_pci.c 2010-03-20 16:58:40.432849247 -0400 +@@ -250,7 +250,7 @@ static void pcie_portdrv_err_resume(stru + static const struct pci_device_id port_pci_ids[] = { { + /* handle any PCI-Express port */ + PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x00), ~0), +- }, { /* end: all zeroes */ } ++ }, { 0, 0, 0, 0, 0, 0, 0 } + }; + MODULE_DEVICE_TABLE(pci, port_pci_ids); + +diff -urNp linux-2.6.33.1/drivers/pci/proc.c linux-2.6.33.1/drivers/pci/proc.c +--- linux-2.6.33.1/drivers/pci/proc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pci/proc.c 2010-03-20 16:58:40.432849247 -0400 +@@ -480,7 +480,16 @@ static const struct file_operations proc + static int __init pci_proc_init(void) + { + struct pci_dev *dev = NULL; ++ ++#ifdef 
CONFIG_GRKERNSEC_PROC_ADD ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ proc_bus_pci_dir = proc_mkdir_mode("bus/pci", S_IRUSR | S_IXUSR, NULL); ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ proc_bus_pci_dir = proc_mkdir_mode("bus/pci", S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP, NULL); ++#endif ++#else + proc_bus_pci_dir = proc_mkdir("bus/pci", NULL); ++#endif + proc_create("devices", 0, proc_bus_pci_dir, + &proc_bus_pci_dev_operations); + proc_initialized = 1; +diff -urNp linux-2.6.33.1/drivers/pci/slot.c linux-2.6.33.1/drivers/pci/slot.c +--- linux-2.6.33.1/drivers/pci/slot.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pci/slot.c 2010-03-20 16:58:40.436850488 -0400 +@@ -29,7 +29,7 @@ static ssize_t pci_slot_attr_store(struc + return attribute->store ? attribute->store(slot, buf, len) : -EIO; + } + +-static struct sysfs_ops pci_slot_sysfs_ops = { ++static const struct sysfs_ops pci_slot_sysfs_ops = { + .show = pci_slot_attr_show, + .store = pci_slot_attr_store, + }; +diff -urNp linux-2.6.33.1/drivers/pcmcia/ti113x.h linux-2.6.33.1/drivers/pcmcia/ti113x.h +--- linux-2.6.33.1/drivers/pcmcia/ti113x.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pcmcia/ti113x.h 2010-03-20 16:58:40.444846583 -0400 +@@ -903,7 +903,7 @@ static struct pci_device_id ene_tune_tbl + DEVID(PCI_VENDOR_ID_MOTOROLA, 0x3410, 0xECC0, PCI_ANY_ID, + ENE_TEST_C9_TLTENABLE | ENE_TEST_C9_PFENABLE, ENE_TEST_C9_TLTENABLE), + +- {} ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + static void ene_tune_bridge(struct pcmcia_socket *sock, struct pci_bus *bus) +diff -urNp linux-2.6.33.1/drivers/pcmcia/yenta_socket.c linux-2.6.33.1/drivers/pcmcia/yenta_socket.c +--- linux-2.6.33.1/drivers/pcmcia/yenta_socket.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pcmcia/yenta_socket.c 2010-03-20 16:58:40.460864210 -0400 +@@ -1432,7 +1432,7 @@ static struct pci_device_id yenta_table[ + + /* match any cardbus bridge */ + CB_ID(PCI_ANY_ID, PCI_ANY_ID, DEFAULT), +- { /* all zeroes */ } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + MODULE_DEVICE_TABLE(pci, yenta_table); + +diff -urNp linux-2.6.33.1/drivers/platform/x86/acer-wmi.c linux-2.6.33.1/drivers/platform/x86/acer-wmi.c +--- linux-2.6.33.1/drivers/platform/x86/acer-wmi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/acer-wmi.c 2010-03-20 16:58:40.464856812 -0400 +@@ -915,7 +915,7 @@ static int update_bl_status(struct backl + return 0; + } + +-static struct backlight_ops acer_bl_ops = { ++static const struct backlight_ops acer_bl_ops = { + .get_brightness = read_brightness, + .update_status = update_bl_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/asus_acpi.c linux-2.6.33.1/drivers/platform/x86/asus_acpi.c +--- linux-2.6.33.1/drivers/platform/x86/asus_acpi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/asus_acpi.c 2010-03-20 16:58:40.480866604 -0400 +@@ -1464,7 +1464,7 @@ static int asus_hotk_remove(struct acpi_ + return 0; + } + +-static struct backlight_ops asus_backlight_data = { ++static const struct backlight_ops asus_backlight_data = { + .get_brightness = read_brightness, + .update_status = set_brightness_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/asus-laptop.c linux-2.6.33.1/drivers/platform/x86/asus-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/asus-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/asus-laptop.c 2010-03-20 16:58:40.480866604 -0400 +@@ -251,7 +251,7 @@ static struct backlight_device *asus_bac + */ + static int 
read_brightness(struct backlight_device *bd); + static int update_bl_status(struct backlight_device *bd); +-static struct backlight_ops asusbl_ops = { ++static const struct backlight_ops asusbl_ops = { + .get_brightness = read_brightness, + .update_status = update_bl_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/classmate-laptop.c linux-2.6.33.1/drivers/platform/x86/classmate-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/classmate-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/classmate-laptop.c 2010-03-20 16:58:40.484567094 -0400 +@@ -452,7 +452,7 @@ static int cmpc_bl_update_status(struct + return -1; + } + +-static struct backlight_ops cmpc_bl_ops = { ++static const struct backlight_ops cmpc_bl_ops = { + .get_brightness = cmpc_bl_get_brightness, + .update_status = cmpc_bl_update_status + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/compal-laptop.c linux-2.6.33.1/drivers/platform/x86/compal-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/compal-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/compal-laptop.c 2010-03-20 16:58:40.484567094 -0400 +@@ -162,7 +162,7 @@ static int bl_update_status(struct backl + return set_lcd_level(b->props.brightness); + } + +-static struct backlight_ops compalbl_ops = { ++static const struct backlight_ops compalbl_ops = { + .get_brightness = bl_get_brightness, + .update_status = bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/dell-laptop.c linux-2.6.33.1/drivers/platform/x86/dell-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/dell-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/dell-laptop.c 2010-03-20 16:58:40.484567094 -0400 +@@ -333,7 +333,7 @@ static int dell_get_intensity(struct bac + return buffer.output[1]; + } + +-static struct backlight_ops dell_ops = { ++static const struct backlight_ops dell_ops = { + .get_brightness = dell_get_intensity, + .update_status = dell_send_intensity, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/eeepc-laptop.c linux-2.6.33.1/drivers/platform/x86/eeepc-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/eeepc-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/eeepc-laptop.c 2010-03-20 16:58:40.484567094 -0400 +@@ -1096,7 +1096,7 @@ static int update_bl_status(struct backl + return set_brightness(bd, bd->props.brightness); + } + +-static struct backlight_ops eeepcbl_ops = { ++static const struct backlight_ops eeepcbl_ops = { + .get_brightness = read_brightness, + .update_status = update_bl_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/fujitsu-laptop.c linux-2.6.33.1/drivers/platform/x86/fujitsu-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/fujitsu-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/fujitsu-laptop.c 2010-03-20 16:58:40.488600423 -0400 +@@ -436,7 +436,7 @@ static int bl_update_status(struct backl + return ret; + } + +-static struct backlight_ops fujitsubl_ops = { ++static const struct backlight_ops fujitsubl_ops = { + .get_brightness = bl_get_brightness, + .update_status = bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/msi-laptop.c linux-2.6.33.1/drivers/platform/x86/msi-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/msi-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/msi-laptop.c 2010-03-20 16:58:40.488600423 -0400 +@@ -161,7 +161,7 @@ static int bl_update_status(struct backl + return 
set_lcd_level(b->props.brightness); + } + +-static struct backlight_ops msibl_ops = { ++static const struct backlight_ops msibl_ops = { + .get_brightness = bl_get_brightness, + .update_status = bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/msi-wmi.c linux-2.6.33.1/drivers/platform/x86/msi-wmi.c +--- linux-2.6.33.1/drivers/platform/x86/msi-wmi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/msi-wmi.c 2010-03-20 16:58:40.488600423 -0400 +@@ -138,7 +138,7 @@ static int bl_set_status(struct backligh + return msi_wmi_set_block(0, backlight_map[bright]); + } + +-static struct backlight_ops msi_backlight_ops = { ++static const struct backlight_ops msi_backlight_ops = { + .get_brightness = bl_get, + .update_status = bl_set_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/panasonic-laptop.c linux-2.6.33.1/drivers/platform/x86/panasonic-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/panasonic-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/panasonic-laptop.c 2010-03-20 16:58:40.488600423 -0400 +@@ -352,7 +352,7 @@ static int bl_set_status(struct backligh + return acpi_pcc_write_sset(pcc, SINF_DC_CUR_BRIGHT, bright); + } + +-static struct backlight_ops pcc_backlight_ops = { ++static const struct backlight_ops pcc_backlight_ops = { + .get_brightness = bl_get, + .update_status = bl_set_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/sony-laptop.c linux-2.6.33.1/drivers/platform/x86/sony-laptop.c +--- linux-2.6.33.1/drivers/platform/x86/sony-laptop.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/sony-laptop.c 2010-03-20 16:58:40.508859925 -0400 +@@ -853,7 +853,7 @@ static int sony_backlight_get_brightness + } + + static struct backlight_device *sony_backlight_device; +-static struct backlight_ops sony_backlight_ops = { ++static const struct backlight_ops sony_backlight_ops = { + .update_status = sony_backlight_update_status, + .get_brightness = sony_backlight_get_brightness, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/thinkpad_acpi.c linux-2.6.33.1/drivers/platform/x86/thinkpad_acpi.c +--- linux-2.6.33.1/drivers/platform/x86/thinkpad_acpi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/thinkpad_acpi.c 2010-03-20 16:58:40.516859801 -0400 +@@ -6131,7 +6131,7 @@ static void tpacpi_brightness_notify_cha + BACKLIGHT_UPDATE_HOTKEY); + } + +-static struct backlight_ops ibm_backlight_data = { ++static const struct backlight_ops ibm_backlight_data = { + .get_brightness = brightness_get, + .update_status = brightness_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/platform/x86/toshiba_acpi.c linux-2.6.33.1/drivers/platform/x86/toshiba_acpi.c +--- linux-2.6.33.1/drivers/platform/x86/toshiba_acpi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/platform/x86/toshiba_acpi.c 2010-03-20 16:58:40.520536360 -0400 +@@ -706,7 +706,7 @@ static acpi_status remove_device(void) + return AE_OK; + } + +-static struct backlight_ops toshiba_backlight_data = { ++static const struct backlight_ops toshiba_backlight_data = { + .get_brightness = get_lcd, + .update_status = set_lcd_status, + }; +diff -urNp linux-2.6.33.1/drivers/pnp/pnpbios/bioscalls.c linux-2.6.33.1/drivers/pnp/pnpbios/bioscalls.c +--- linux-2.6.33.1/drivers/pnp/pnpbios/bioscalls.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pnp/pnpbios/bioscalls.c 2010-03-20 16:58:40.524855697 -0400 +@@ -60,7 +60,7 @@ do { \ + 
set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ + } while(0) + +-static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, ++static const struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4093, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); + + /* +@@ -97,7 +97,10 @@ static inline u16 call_pnp_bios(u16 func + + cpu = get_cpu(); + save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; ++ ++ pax_open_kernel(); + get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; ++ pax_close_kernel(); + + /* On some boxes IRQ's during PnP BIOS calls are deadly. */ + spin_lock_irqsave(&pnp_bios_lock, flags); +@@ -135,7 +138,10 @@ static inline u16 call_pnp_bios(u16 func + :"memory"); + spin_unlock_irqrestore(&pnp_bios_lock, flags); + ++ pax_open_kernel(); + get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; ++ pax_close_kernel(); ++ + put_cpu(); + + /* If we get here and this is set then the PnP BIOS faulted on us. */ +@@ -469,7 +475,7 @@ int pnp_bios_read_escd(char *data, u32 n + return status; + } + +-void pnpbios_calls_init(union pnp_bios_install_struct *header) ++void __init pnpbios_calls_init(union pnp_bios_install_struct *header) + { + int i; + +@@ -477,6 +483,8 @@ void pnpbios_calls_init(union pnp_bios_i + pnp_bios_callpoint.offset = header->fields.pm16offset; + pnp_bios_callpoint.segment = PNP_CS16; + ++ pax_open_kernel(); ++ + for_each_possible_cpu(i) { + struct desc_struct *gdt = get_cpu_gdt_table(i); + if (!gdt) +@@ -488,4 +496,6 @@ void pnpbios_calls_init(union pnp_bios_i + set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_DS], + (unsigned long)__va(header->fields.pm16dseg)); + } ++ ++ pax_close_kernel(); + } +diff -urNp linux-2.6.33.1/drivers/pnp/quirks.c linux-2.6.33.1/drivers/pnp/quirks.c +--- linux-2.6.33.1/drivers/pnp/quirks.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pnp/quirks.c 2010-03-20 16:58:40.528847437 -0400 +@@ -322,7 +322,7 @@ static struct pnp_fixup pnp_fixups[] = { + /* PnP resources that might overlap PCI BARs */ + {"PNP0c01", quirk_system_pci_resources}, + {"PNP0c02", quirk_system_pci_resources}, +- {""} ++ {"", NULL} + }; + + void pnp_fixup_device(struct pnp_dev *dev) +diff -urNp linux-2.6.33.1/drivers/pnp/resource.c linux-2.6.33.1/drivers/pnp/resource.c +--- linux-2.6.33.1/drivers/pnp/resource.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/pnp/resource.c 2010-03-20 16:58:40.532529204 -0400 +@@ -355,7 +355,7 @@ int pnp_check_irq(struct pnp_dev *dev, s + return 1; + + /* check if the resource is valid */ +- if (*irq < 0 || *irq > 15) ++ if (*irq > 15) + return 0; + + /* check if the resource is reserved */ +@@ -419,7 +419,7 @@ int pnp_check_dma(struct pnp_dev *dev, s + return 1; + + /* check if the resource is valid */ +- if (*dma < 0 || *dma == 4 || *dma > 7) ++ if (*dma == 4 || *dma > 7) + return 0; + + /* check if the resource is reserved */ +diff -urNp linux-2.6.33.1/drivers/s390/cio/qdio_debug.c linux-2.6.33.1/drivers/s390/cio/qdio_debug.c +--- linux-2.6.33.1/drivers/s390/cio/qdio_debug.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/s390/cio/qdio_debug.c 2010-03-20 16:58:40.552847675 -0400 +@@ -215,7 +215,7 @@ static int qperf_seq_open(struct inode * + filp->f_path.dentry->d_inode->i_private); + } + +-static struct file_operations debugfs_perf_fops = { ++static const struct file_operations debugfs_perf_fops = { + .owner = THIS_MODULE, + .open = qperf_seq_open, + .read = seq_read, +diff -urNp linux-2.6.33.1/drivers/scsi/ipr.c linux-2.6.33.1/drivers/scsi/ipr.c +--- linux-2.6.33.1/drivers/scsi/ipr.c 2010-03-15 
12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/scsi/ipr.c 2010-03-20 16:58:40.596672237 -0400 +@@ -5291,7 +5291,7 @@ static bool ipr_qc_fill_rtf(struct ata_q + return true; + } + +-static struct ata_port_operations ipr_sata_ops = { ++static const struct ata_port_operations ipr_sata_ops = { + .phy_reset = ipr_ata_phy_reset, + .hardreset = ipr_sata_reset, + .post_internal_cmd = ipr_ata_post_internal, +diff -urNp linux-2.6.33.1/drivers/scsi/libfc/fc_exch.c linux-2.6.33.1/drivers/scsi/libfc/fc_exch.c +--- linux-2.6.33.1/drivers/scsi/libfc/fc_exch.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/scsi/libfc/fc_exch.c 2010-03-20 16:58:40.600909572 -0400 +@@ -100,12 +100,12 @@ struct fc_exch_mgr { + * all together if not used XXX + */ + struct { +- atomic_t no_free_exch; +- atomic_t no_free_exch_xid; +- atomic_t xid_not_found; +- atomic_t xid_busy; +- atomic_t seq_not_found; +- atomic_t non_bls_resp; ++ atomic_unchecked_t no_free_exch; ++ atomic_unchecked_t no_free_exch_xid; ++ atomic_unchecked_t xid_not_found; ++ atomic_unchecked_t xid_busy; ++ atomic_unchecked_t seq_not_found; ++ atomic_unchecked_t non_bls_resp; + } stats; + }; + #define fc_seq_exch(sp) container_of(sp, struct fc_exch, seq) +@@ -671,7 +671,7 @@ static struct fc_exch *fc_exch_em_alloc( + /* allocate memory for exchange */ + ep = mempool_alloc(mp->ep_pool, GFP_ATOMIC); + if (!ep) { +- atomic_inc(&mp->stats.no_free_exch); ++ atomic_inc_unchecked(&mp->stats.no_free_exch); + goto out; + } + memset(ep, 0, sizeof(*ep)); +@@ -718,7 +718,7 @@ out: + return ep; + err: + spin_unlock_bh(&pool->lock); +- atomic_inc(&mp->stats.no_free_exch_xid); ++ atomic_inc_unchecked(&mp->stats.no_free_exch_xid); + mempool_free(ep, mp->ep_pool); + return NULL; + } +@@ -868,7 +868,7 @@ static enum fc_pf_rjt_reason fc_seq_look + xid = ntohs(fh->fh_ox_id); /* we originated exch */ + ep = fc_exch_find(mp, xid); + if (!ep) { +- atomic_inc(&mp->stats.xid_not_found); ++ atomic_inc_unchecked(&mp->stats.xid_not_found); + reject = FC_RJT_OX_ID; + goto out; + } +@@ -898,7 +898,7 @@ static enum fc_pf_rjt_reason fc_seq_look + ep = fc_exch_find(mp, xid); + if ((f_ctl & FC_FC_FIRST_SEQ) && fc_sof_is_init(fr_sof(fp))) { + if (ep) { +- atomic_inc(&mp->stats.xid_busy); ++ atomic_inc_unchecked(&mp->stats.xid_busy); + reject = FC_RJT_RX_ID; + goto rel; + } +@@ -909,7 +909,7 @@ static enum fc_pf_rjt_reason fc_seq_look + } + xid = ep->xid; /* get our XID */ + } else if (!ep) { +- atomic_inc(&mp->stats.xid_not_found); ++ atomic_inc_unchecked(&mp->stats.xid_not_found); + reject = FC_RJT_RX_ID; /* XID not found */ + goto out; + } +@@ -930,7 +930,7 @@ static enum fc_pf_rjt_reason fc_seq_look + } else { + sp = &ep->seq; + if (sp->id != fh->fh_seq_id) { +- atomic_inc(&mp->stats.seq_not_found); ++ atomic_inc_unchecked(&mp->stats.seq_not_found); + reject = FC_RJT_SEQ_ID; /* sequence/exch should exist */ + goto rel; + } +@@ -1317,22 +1317,22 @@ static void fc_exch_recv_seq_resp(struct + + ep = fc_exch_find(mp, ntohs(fh->fh_ox_id)); + if (!ep) { +- atomic_inc(&mp->stats.xid_not_found); ++ atomic_inc_unchecked(&mp->stats.xid_not_found); + goto out; + } + if (ep->esb_stat & ESB_ST_COMPLETE) { +- atomic_inc(&mp->stats.xid_not_found); ++ atomic_inc_unchecked(&mp->stats.xid_not_found); + goto out; + } + if (ep->rxid == FC_XID_UNKNOWN) + ep->rxid = ntohs(fh->fh_rx_id); + if (ep->sid != 0 && ep->sid != ntoh24(fh->fh_d_id)) { +- atomic_inc(&mp->stats.xid_not_found); ++ atomic_inc_unchecked(&mp->stats.xid_not_found); + goto rel; + } + if (ep->did != ntoh24(fh->fh_s_id) && + 
ep->did != FC_FID_FLOGI) { +- atomic_inc(&mp->stats.xid_not_found); ++ atomic_inc_unchecked(&mp->stats.xid_not_found); + goto rel; + } + sof = fr_sof(fp); +@@ -1343,7 +1343,7 @@ static void fc_exch_recv_seq_resp(struct + } else { + sp = &ep->seq; + if (sp->id != fh->fh_seq_id) { +- atomic_inc(&mp->stats.seq_not_found); ++ atomic_inc_unchecked(&mp->stats.seq_not_found); + goto rel; + } + } +@@ -1406,9 +1406,9 @@ static void fc_exch_recv_resp(struct fc_ + sp = fc_seq_lookup_orig(mp, fp); /* doesn't hold sequence */ + + if (!sp) +- atomic_inc(&mp->stats.xid_not_found); ++ atomic_inc_unchecked(&mp->stats.xid_not_found); + else +- atomic_inc(&mp->stats.non_bls_resp); ++ atomic_inc_unchecked(&mp->stats.non_bls_resp); + + fc_frame_free(fp); + } +diff -urNp linux-2.6.33.1/drivers/scsi/libsas/sas_ata.c linux-2.6.33.1/drivers/scsi/libsas/sas_ata.c +--- linux-2.6.33.1/drivers/scsi/libsas/sas_ata.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/scsi/libsas/sas_ata.c 2010-03-20 16:58:40.625874020 -0400 +@@ -343,7 +343,7 @@ static int sas_ata_scr_read(struct ata_l + } + } + +-static struct ata_port_operations sas_sata_ops = { ++static const struct ata_port_operations sas_sata_ops = { + .phy_reset = sas_ata_phy_reset, + .post_internal_cmd = sas_ata_post_internal, + .qc_prep = ata_noop_qc_prep, +diff -urNp linux-2.6.33.1/drivers/scsi/scsi_logging.h linux-2.6.33.1/drivers/scsi/scsi_logging.h +--- linux-2.6.33.1/drivers/scsi/scsi_logging.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/scsi/scsi_logging.h 2010-03-20 16:58:40.673358427 -0400 +@@ -51,7 +51,7 @@ do { \ + } while (0); \ + } while (0) + #else +-#define SCSI_CHECK_LOGGING(SHIFT, BITS, LEVEL, CMD) ++#define SCSI_CHECK_LOGGING(SHIFT, BITS, LEVEL, CMD) do {} while (0) + #endif /* CONFIG_SCSI_LOGGING */ + + /* +diff -urNp linux-2.6.33.1/drivers/scsi/sg.c linux-2.6.33.1/drivers/scsi/sg.c +--- linux-2.6.33.1/drivers/scsi/sg.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/scsi/sg.c 2010-03-20 16:58:40.692870926 -0400 +@@ -2292,7 +2292,7 @@ struct sg_proc_leaf { + const struct file_operations * fops; + }; + +-static struct sg_proc_leaf sg_proc_leaf_arr[] = { ++static const struct sg_proc_leaf sg_proc_leaf_arr[] = { + {"allow_dio", &adio_fops}, + {"debug", &debug_fops}, + {"def_reserved_size", &dressz_fops}, +@@ -2307,7 +2307,7 @@ sg_proc_init(void) + { + int k, mask; + int num_leaves = ARRAY_SIZE(sg_proc_leaf_arr); +- struct sg_proc_leaf * leaf; ++ const struct sg_proc_leaf * leaf; + + sg_proc_sgp = proc_mkdir(sg_proc_sg_dirname, NULL); + if (!sg_proc_sgp) +diff -urNp linux-2.6.33.1/drivers/serial/8250_pci.c linux-2.6.33.1/drivers/serial/8250_pci.c +--- linux-2.6.33.1/drivers/serial/8250_pci.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/serial/8250_pci.c 2010-03-20 16:58:40.736223750 -0400 +@@ -3664,7 +3664,7 @@ static struct pci_device_id serial_pci_t + PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_COMMUNICATION_MULTISERIAL << 8, + 0xffff00, pbn_default }, +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + static struct pci_driver serial_pci_driver = { +diff -urNp linux-2.6.33.1/drivers/serial/kgdboc.c linux-2.6.33.1/drivers/serial/kgdboc.c +--- linux-2.6.33.1/drivers/serial/kgdboc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/serial/kgdboc.c 2010-03-20 16:58:40.745176799 -0400 +@@ -18,7 +18,7 @@ + + #define MAX_CONFIG_LEN 40 + +-static struct kgdb_io kgdboc_io_ops; ++static const struct kgdb_io kgdboc_io_ops; + + /* -1 = init not run yet, 0 = unconfigured, 1 = configured. 
*/ + static int configured = -1; +@@ -154,7 +154,7 @@ static void kgdboc_post_exp_handler(void + module_put(THIS_MODULE); + } + +-static struct kgdb_io kgdboc_io_ops = { ++static const struct kgdb_io kgdboc_io_ops = { + .name = "kgdboc", + .read_char = kgdboc_get_char, + .write_char = kgdboc_put_char, +diff -urNp linux-2.6.33.1/drivers/staging/b3dfg/b3dfg.c linux-2.6.33.1/drivers/staging/b3dfg/b3dfg.c +--- linux-2.6.33.1/drivers/staging/b3dfg/b3dfg.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/b3dfg/b3dfg.c 2010-03-20 16:58:40.760695484 -0400 +@@ -455,7 +455,7 @@ static int b3dfg_vma_fault(struct vm_are + return VM_FAULT_NOPAGE; + } + +-static struct vm_operations_struct b3dfg_vm_ops = { ++static const struct vm_operations_struct b3dfg_vm_ops = { + .fault = b3dfg_vma_fault, + }; + +@@ -836,7 +836,7 @@ static int b3dfg_mmap(struct file *filp, + return r; + } + +-static struct file_operations b3dfg_fops = { ++static const struct file_operations b3dfg_fops = { + .owner = THIS_MODULE, + .open = b3dfg_open, + .release = b3dfg_release, +diff -urNp linux-2.6.33.1/drivers/staging/comedi/comedi_fops.c linux-2.6.33.1/drivers/staging/comedi/comedi_fops.c +--- linux-2.6.33.1/drivers/staging/comedi/comedi_fops.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/comedi/comedi_fops.c 2010-03-20 16:58:40.772884102 -0400 +@@ -1384,7 +1384,7 @@ void comedi_unmap(struct vm_area_struct + mutex_unlock(&dev->mutex); + } + +-static struct vm_operations_struct comedi_vm_ops = { ++static const struct vm_operations_struct comedi_vm_ops = { + .close = comedi_unmap, + }; + +diff -urNp linux-2.6.33.1/drivers/staging/dream/pmem.c linux-2.6.33.1/drivers/staging/dream/pmem.c +--- linux-2.6.33.1/drivers/staging/dream/pmem.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/pmem.c 2010-03-20 16:58:40.792529665 -0400 +@@ -174,7 +174,7 @@ static int pmem_mmap(struct file *, stru + static int pmem_open(struct inode *, struct file *); + static long pmem_ioctl(struct file *, unsigned int, unsigned long); + +-struct file_operations pmem_fops = { ++const struct file_operations pmem_fops = { + .release = pmem_release, + .mmap = pmem_mmap, + .open = pmem_open, +@@ -1202,7 +1202,7 @@ static ssize_t debug_read(struct file *f + return simple_read_from_buffer(buf, count, ppos, buffer, n); + } + +-static struct file_operations debug_fops = { ++static const struct file_operations debug_fops = { + .read = debug_read, + .open = debug_open, + }; +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/adsp_driver.c linux-2.6.33.1/drivers/staging/dream/qdsp5/adsp_driver.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/adsp_driver.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/adsp_driver.c 2010-03-20 16:58:40.796888570 -0400 +@@ -576,7 +576,7 @@ static struct adsp_device *inode_to_devi + static dev_t adsp_devno; + static struct class *adsp_class; + +-static struct file_operations adsp_fops = { ++static const struct file_operations adsp_fops = { + .owner = THIS_MODULE, + .open = adsp_open, + .unlocked_ioctl = adsp_ioctl, +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_aac.c linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_aac.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_aac.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_aac.c 2010-03-20 16:58:40.808875958 -0400 +@@ -1022,7 +1022,7 @@ done: + return rc; + } + +-static struct file_operations audio_aac_fops = { 
++static const struct file_operations audio_aac_fops = { + .owner = THIS_MODULE, + .open = audio_open, + .release = audio_release, +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_amrnb.c linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_amrnb.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_amrnb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_amrnb.c 2010-03-20 16:58:40.808875958 -0400 +@@ -833,7 +833,7 @@ done: + return rc; + } + +-static struct file_operations audio_amrnb_fops = { ++static const struct file_operations audio_amrnb_fops = { + .owner = THIS_MODULE, + .open = audamrnb_open, + .release = audamrnb_release, +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_evrc.c linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_evrc.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_evrc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_evrc.c 2010-03-20 16:58:40.812532082 -0400 +@@ -805,7 +805,7 @@ dma_fail: + return rc; + } + +-static struct file_operations audio_evrc_fops = { ++static const struct file_operations audio_evrc_fops = { + .owner = THIS_MODULE, + .open = audevrc_open, + .release = audevrc_release, +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_in.c linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_in.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_in.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_in.c 2010-03-20 16:58:40.812532082 -0400 +@@ -913,7 +913,7 @@ static int audpre_open(struct inode *ino + return 0; + } + +-static struct file_operations audio_fops = { ++static const struct file_operations audio_fops = { + .owner = THIS_MODULE, + .open = audio_in_open, + .release = audio_in_release, +@@ -922,7 +922,7 @@ static struct file_operations audio_fops + .unlocked_ioctl = audio_in_ioctl, + }; + +-static struct file_operations audpre_fops = { ++static const struct file_operations audpre_fops = { + .owner = THIS_MODULE, + .open = audpre_open, + .unlocked_ioctl = audpre_ioctl, +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_mp3.c linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_mp3.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_mp3.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_mp3.c 2010-03-20 16:58:40.812532082 -0400 +@@ -941,7 +941,7 @@ done: + return rc; + } + +-static struct file_operations audio_mp3_fops = { ++static const struct file_operations audio_mp3_fops = { + .owner = THIS_MODULE, + .open = audio_open, + .release = audio_release, +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_out.c linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_out.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_out.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_out.c 2010-03-20 16:58:40.812532082 -0400 +@@ -806,7 +806,7 @@ static int audpp_open(struct inode *inod + return 0; + } + +-static struct file_operations audio_fops = { ++static const struct file_operations audio_fops = { + .owner = THIS_MODULE, + .open = audio_open, + .release = audio_release, +@@ -815,7 +815,7 @@ static struct file_operations audio_fops + .unlocked_ioctl = audio_ioctl, + }; + +-static struct file_operations audpp_fops = { ++static const struct file_operations audpp_fops = { + .owner = THIS_MODULE, + .open = audpp_open, + .unlocked_ioctl = audpp_ioctl, +diff -urNp 
linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_qcelp.c linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_qcelp.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_qcelp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/audio_qcelp.c 2010-03-20 16:58:40.816705807 -0400 +@@ -816,7 +816,7 @@ err: + return rc; + } + +-static struct file_operations audio_qcelp_fops = { ++static const struct file_operations audio_qcelp_fops = { + .owner = THIS_MODULE, + .open = audqcelp_open, + .release = audqcelp_release, +diff -urNp linux-2.6.33.1/drivers/staging/dream/qdsp5/snd.c linux-2.6.33.1/drivers/staging/dream/qdsp5/snd.c +--- linux-2.6.33.1/drivers/staging/dream/qdsp5/snd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/qdsp5/snd.c 2010-03-20 16:58:40.816705807 -0400 +@@ -242,7 +242,7 @@ err: + return rc; + } + +-static struct file_operations snd_fops = { ++static const struct file_operations snd_fops = { + .owner = THIS_MODULE, + .open = snd_open, + .release = snd_release, +diff -urNp linux-2.6.33.1/drivers/staging/dream/smd/smd_qmi.c linux-2.6.33.1/drivers/staging/dream/smd/smd_qmi.c +--- linux-2.6.33.1/drivers/staging/dream/smd/smd_qmi.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/smd/smd_qmi.c 2010-03-20 16:58:40.828884259 -0400 +@@ -788,7 +788,7 @@ static int qmi_release(struct inode *ip, + return 0; + } + +-static struct file_operations qmi_fops = { ++static const struct file_operations qmi_fops = { + .owner = THIS_MODULE, + .read = qmi_read, + .write = qmi_write, +diff -urNp linux-2.6.33.1/drivers/staging/dream/smd/smd_rpcrouter_device.c linux-2.6.33.1/drivers/staging/dream/smd/smd_rpcrouter_device.c +--- linux-2.6.33.1/drivers/staging/dream/smd/smd_rpcrouter_device.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/dream/smd/smd_rpcrouter_device.c 2010-03-20 16:58:40.828884259 -0400 +@@ -214,7 +214,7 @@ static long rpcrouter_ioctl(struct file + return rc; + } + +-static struct file_operations rpcrouter_server_fops = { ++static const struct file_operations rpcrouter_server_fops = { + .owner = THIS_MODULE, + .open = rpcrouter_open, + .release = rpcrouter_release, +@@ -224,7 +224,7 @@ static struct file_operations rpcrouter_ + .unlocked_ioctl = rpcrouter_ioctl, + }; + +-static struct file_operations rpcrouter_router_fops = { ++static const struct file_operations rpcrouter_router_fops = { + .owner = THIS_MODULE, + .open = rpcrouter_open, + .release = rpcrouter_release, +diff -urNp linux-2.6.33.1/drivers/staging/go7007/go7007-v4l2.c linux-2.6.33.1/drivers/staging/go7007/go7007-v4l2.c +--- linux-2.6.33.1/drivers/staging/go7007/go7007-v4l2.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/go7007/go7007-v4l2.c 2010-03-20 16:58:40.858660027 -0400 +@@ -1674,7 +1674,7 @@ static int go7007_vm_fault(struct vm_are + return 0; + } + +-static struct vm_operations_struct go7007_vm_ops = { ++static const struct vm_operations_struct go7007_vm_ops = { + .open = go7007_vm_open, + .close = go7007_vm_close, + .fault = go7007_vm_fault, +diff -urNp linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c +--- linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c 2010-03-20 16:58:40.876878598 -0400 +@@ -153,7 +153,7 @@ static int blkvsc_ringbuffer_size = BLKV + /* The one and only one */ + static struct blkvsc_driver_context g_blkvsc_drv; + +-static 
+diff -urNp linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c
+--- linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/hv/blkvsc_drv.c 2010-03-20 16:58:40.876878598 -0400
+@@ -153,7 +153,7 @@ static int blkvsc_ringbuffer_size = BLKV
+ /* The one and only one */
+ static struct blkvsc_driver_context g_blkvsc_drv;
+
+-static struct block_device_operations block_ops = {
++static const struct block_device_operations block_ops = {
+ .owner = THIS_MODULE,
+ .open = blkvsc_open,
+ .release = blkvsc_release,
+diff -urNp linux-2.6.33.1/drivers/staging/panel/panel.c linux-2.6.33.1/drivers/staging/panel/panel.c
+--- linux-2.6.33.1/drivers/staging/panel/panel.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/panel/panel.c 2010-03-20 16:58:40.888879754 -0400
+@@ -1305,7 +1305,7 @@ static int lcd_release(struct inode *ino
+ return 0;
+ }
+
+-static struct file_operations lcd_fops = {
++static const struct file_operations lcd_fops = {
+ .write = lcd_write,
+ .open = lcd_open,
+ .release = lcd_release,
+@@ -1565,7 +1565,7 @@ static int keypad_release(struct inode *
+ return 0;
+ }
+
+-static struct file_operations keypad_fops = {
++static const struct file_operations keypad_fops = {
+ .read = keypad_read, /* read */
+ .open = keypad_open, /* open */
+ .release = keypad_release, /* close */
+diff -urNp linux-2.6.33.1/drivers/staging/phison/phison.c linux-2.6.33.1/drivers/staging/phison/phison.c
+--- linux-2.6.33.1/drivers/staging/phison/phison.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/phison/phison.c 2010-03-20 16:58:40.896878917 -0400
+@@ -43,7 +43,7 @@ static struct scsi_host_template phison_
+ ATA_BMDMA_SHT(DRV_NAME),
+ };
+
+-static struct ata_port_operations phison_ops = {
++static const struct ata_port_operations phison_ops = {
+ .inherits = &ata_bmdma_port_ops,
+ .prereset = phison_pre_reset,
+ };
+diff -urNp linux-2.6.33.1/drivers/staging/poch/poch.c linux-2.6.33.1/drivers/staging/poch/poch.c
+--- linux-2.6.33.1/drivers/staging/poch/poch.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/poch/poch.c 2010-03-20 16:58:40.920050870 -0400
+@@ -1032,7 +1032,7 @@ static int poch_ioctl(struct inode *inod
+ return 0;
+ }
+
+-static struct file_operations poch_fops = {
++static const struct file_operations poch_fops = {
+ .owner = THIS_MODULE,
+ .open = poch_open,
+ .release = poch_release,
+diff -urNp linux-2.6.33.1/drivers/staging/pohmelfs/inode.c linux-2.6.33.1/drivers/staging/pohmelfs/inode.c
+--- linux-2.6.33.1/drivers/staging/pohmelfs/inode.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/pohmelfs/inode.c 2010-03-20 16:58:40.936891697 -0400
+@@ -1854,7 +1854,7 @@ static int pohmelfs_fill_super(struct su
+ mutex_init(&psb->mcache_lock);
+ psb->mcache_root = RB_ROOT;
+ psb->mcache_timeout = msecs_to_jiffies(5000);
+- atomic_long_set(&psb->mcache_gen, 0);
++ atomic_long_set_unchecked(&psb->mcache_gen, 0);
+
+ psb->trans_max_pages = 100;
+
+diff -urNp linux-2.6.33.1/drivers/staging/pohmelfs/mcache.c linux-2.6.33.1/drivers/staging/pohmelfs/mcache.c
+--- linux-2.6.33.1/drivers/staging/pohmelfs/mcache.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/pohmelfs/mcache.c 2010-03-20 16:58:40.936891697 -0400
+@@ -121,7 +121,7 @@ struct pohmelfs_mcache *pohmelfs_mcache_
+ m->data = data;
+ m->start = start;
+ m->size = size;
+- m->gen = atomic_long_inc_return(&psb->mcache_gen);
++ m->gen = atomic_long_inc_return_unchecked(&psb->mcache_gen);
+
+ mutex_lock(&psb->mcache_lock);
+ err = pohmelfs_mcache_insert(psb, m);
+diff -urNp linux-2.6.33.1/drivers/staging/pohmelfs/netfs.h linux-2.6.33.1/drivers/staging/pohmelfs/netfs.h
+--- linux-2.6.33.1/drivers/staging/pohmelfs/netfs.h 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/pohmelfs/netfs.h 2010-03-20 16:58:40.940661223 -0400
+@@ -571,7 +571,7 @@ struct pohmelfs_config;
+ struct pohmelfs_sb {
+ struct rb_root mcache_root;
+ struct mutex mcache_lock;
+- atomic_long_t mcache_gen;
++ atomic_long_unchecked_t mcache_gen;
+ unsigned long mcache_timeout;
+
+ unsigned int idx;
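The pohmelfs hunks show a second PaX pattern: with the REFCOUNT hardening, atomic_long_t operations trap on overflow to stop reference-count exploits, so counters that are allowed to wrap, like this cache generation number, are switched to an _unchecked twin that keeps plain wrapping arithmetic. Roughly how the twin degrades to the ordinary type when the hardening is off (a simplified sketch of the idea, not the patch's actual header):

    #ifdef CONFIG_PAX_REFCOUNT
    /* checked atomics trap on overflow; the _unchecked twin keeps the
     * pre-hardening wrap-around semantics for pure statistics counters */
    typedef struct {
            long counter;
    } atomic_long_unchecked_t;
    #else
    /* without the hardening the twin is just an alias for the normal type */
    typedef atomic_long_t atomic_long_unchecked_t;
    #define atomic_long_set_unchecked(v, i)     atomic_long_set((v), (i))
    #define atomic_long_inc_return_unchecked(v) atomic_long_inc_return(v)
    #endif

The same split shows up later in the usbatm hunks, where ATM traffic statistics use atomic_inc_unchecked() and atomic_read_unchecked().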
+diff -urNp linux-2.6.33.1/drivers/staging/ramzswap/ramzswap_drv.c linux-2.6.33.1/drivers/staging/ramzswap/ramzswap_drv.c
+--- linux-2.6.33.1/drivers/staging/ramzswap/ramzswap_drv.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/ramzswap/ramzswap_drv.c 2010-03-20 16:58:40.944873774 -0400
+@@ -1288,7 +1288,7 @@ out:
+ return ret;
+ }
+
+-static struct block_device_operations ramzswap_devops = {
++static const struct block_device_operations ramzswap_devops = {
+ .ioctl = ramzswap_ioctl,
+ .owner = THIS_MODULE,
+ };
+diff -urNp linux-2.6.33.1/drivers/staging/rtl8192u/ieee80211/proc.c linux-2.6.33.1/drivers/staging/rtl8192u/ieee80211/proc.c
+--- linux-2.6.33.1/drivers/staging/rtl8192u/ieee80211/proc.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/rtl8192u/ieee80211/proc.c 2010-03-20 16:58:40.944873774 -0400
+@@ -99,7 +99,7 @@ static int crypto_info_open(struct inode
+ return seq_open(file, &crypto_seq_ops);
+ }
+
+-static struct file_operations proc_crypto_ops = {
++static const struct file_operations proc_crypto_ops = {
+ .open = crypto_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+diff -urNp linux-2.6.33.1/drivers/staging/samsung-laptop/samsung-laptop.c linux-2.6.33.1/drivers/staging/samsung-laptop/samsung-laptop.c
+--- linux-2.6.33.1/drivers/staging/samsung-laptop/samsung-laptop.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/samsung-laptop/samsung-laptop.c 2010-03-20 16:58:40.952877179 -0400
+@@ -268,7 +268,7 @@ static int update_status(struct backligh
+ return 0;
+ }
+
+-static struct backlight_ops backlight_ops = {
++static const struct backlight_ops backlight_ops = {
+ .get_brightness = get_brightness,
+ .update_status = update_status,
+ };
+diff -urNp linux-2.6.33.1/drivers/staging/sep/sep_driver.c linux-2.6.33.1/drivers/staging/sep/sep_driver.c
+--- linux-2.6.33.1/drivers/staging/sep/sep_driver.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/sep/sep_driver.c 2010-03-20 16:58:40.956865691 -0400
+@@ -2605,7 +2605,7 @@ static struct pci_driver sep_pci_driver
+ static dev_t sep_devno;
+
+ /* the files operations structure of the driver */
+-static struct file_operations sep_file_operations = {
++static const struct file_operations sep_file_operations = {
+ .owner = THIS_MODULE,
+ .ioctl = sep_ioctl,
+ .poll = sep_poll,
+diff -urNp linux-2.6.33.1/drivers/staging/vme/devices/vme_user.c linux-2.6.33.1/drivers/staging/vme/devices/vme_user.c
+--- linux-2.6.33.1/drivers/staging/vme/devices/vme_user.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/staging/vme/devices/vme_user.c 2010-03-20 16:58:40.956865691 -0400
+@@ -135,7 +135,7 @@ static int vme_user_ioctl(struct inode *
+ static int __init vme_user_probe(struct device *, int, int);
+ static int __exit vme_user_remove(struct device *, int, int);
+
+-static struct file_operations vme_user_fops = {
++static const struct file_operations vme_user_fops = {
+ .open = vme_user_open,
+ .release = vme_user_release,
+ .read = vme_user_read,
+diff -urNp linux-2.6.33.1/drivers/uio/uio.c linux-2.6.33.1/drivers/uio/uio.c
+--- linux-2.6.33.1/drivers/uio/uio.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/uio/uio.c 2010-03-20 16:58:40.964892465 -0400
+@@ -129,7 +129,7 @@ static 
ssize_t map_type_show(struct kobj + return entry->show(mem, buf); + } + +-static struct sysfs_ops map_sysfs_ops = { ++static const struct sysfs_ops map_sysfs_ops = { + .show = map_type_show, + }; + +@@ -217,7 +217,7 @@ static ssize_t portio_type_show(struct k + return entry->show(port, buf); + } + +-static struct sysfs_ops portio_sysfs_ops = { ++static const struct sysfs_ops portio_sysfs_ops = { + .show = portio_type_show, + }; + +diff -urNp linux-2.6.33.1/drivers/usb/atm/usbatm.c linux-2.6.33.1/drivers/usb/atm/usbatm.c +--- linux-2.6.33.1/drivers/usb/atm/usbatm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/usb/atm/usbatm.c 2010-03-20 16:58:40.976907794 -0400 +@@ -333,7 +333,7 @@ static void usbatm_extract_one_cell(stru + if (printk_ratelimit()) + atm_warn(instance, "%s: OAM not supported (vpi %d, vci %d)!\n", + __func__, vpi, vci); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + return; + } + +@@ -361,7 +361,7 @@ static void usbatm_extract_one_cell(stru + if (length > ATM_MAX_AAL5_PDU) { + atm_rldbg(instance, "%s: bogus length %u (vcc: 0x%p)!\n", + __func__, length, vcc); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + goto out; + } + +@@ -370,14 +370,14 @@ static void usbatm_extract_one_cell(stru + if (sarb->len < pdu_length) { + atm_rldbg(instance, "%s: bogus pdu_length %u (sarb->len: %u, vcc: 0x%p)!\n", + __func__, pdu_length, sarb->len, vcc); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + goto out; + } + + if (crc32_be(~0, skb_tail_pointer(sarb) - pdu_length, pdu_length) != 0xc704dd7b) { + atm_rldbg(instance, "%s: packet failed crc check (vcc: 0x%p)!\n", + __func__, vcc); +- atomic_inc(&vcc->stats->rx_err); ++ atomic_inc_unchecked(&vcc->stats->rx_err); + goto out; + } + +@@ -387,7 +387,7 @@ static void usbatm_extract_one_cell(stru + if (printk_ratelimit()) + atm_err(instance, "%s: no memory for skb (length: %u)!\n", + __func__, length); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + goto out; + } + +@@ -412,7 +412,7 @@ static void usbatm_extract_one_cell(stru + + vcc->push(vcc, skb); + +- atomic_inc(&vcc->stats->rx); ++ atomic_inc_unchecked(&vcc->stats->rx); + out: + skb_trim(sarb, 0); + } +@@ -616,7 +616,7 @@ static void usbatm_tx_process(unsigned l + struct atm_vcc *vcc = UDSL_SKB(skb)->atm.vcc; + + usbatm_pop(vcc, skb); +- atomic_inc(&vcc->stats->tx); ++ atomic_inc_unchecked(&vcc->stats->tx); + + skb = skb_dequeue(&instance->sndqueue); + } +@@ -775,11 +775,11 @@ static int usbatm_atm_proc_read(struct a + if (!left--) + return sprintf(page, + "AAL5: tx %d ( %d err ), rx %d ( %d err, %d drop )\n", +- atomic_read(&atm_dev->stats.aal5.tx), +- atomic_read(&atm_dev->stats.aal5.tx_err), +- atomic_read(&atm_dev->stats.aal5.rx), +- atomic_read(&atm_dev->stats.aal5.rx_err), +- atomic_read(&atm_dev->stats.aal5.rx_drop)); ++ atomic_read_unchecked(&atm_dev->stats.aal5.tx), ++ atomic_read_unchecked(&atm_dev->stats.aal5.tx_err), ++ atomic_read_unchecked(&atm_dev->stats.aal5.rx), ++ atomic_read_unchecked(&atm_dev->stats.aal5.rx_err), ++ atomic_read_unchecked(&atm_dev->stats.aal5.rx_drop)); + + if (!left--) { + if (instance->disconnected) +diff -urNp linux-2.6.33.1/drivers/usb/class/cdc-acm.c linux-2.6.33.1/drivers/usb/class/cdc-acm.c +--- linux-2.6.33.1/drivers/usb/class/cdc-acm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/usb/class/cdc-acm.c 2010-03-20 16:58:40.993421809 -0400 +@@ -1590,7 +1590,7 @@ static 
struct usb_device_id acm_ids[] =
+ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
+ USB_CDC_ACM_PROTO_AT_CDMA) },
+
+- { }
++ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+ };
+
+ MODULE_DEVICE_TABLE(usb, acm_ids);
+diff -urNp linux-2.6.33.1/drivers/usb/class/usblp.c linux-2.6.33.1/drivers/usb/class/usblp.c
+--- linux-2.6.33.1/drivers/usb/class/usblp.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/class/usblp.c 2010-03-20 16:58:41.016571934 -0400
+@@ -228,7 +228,7 @@ static const struct quirk_printer_struct
+ { 0x0482, 0x0010, USBLP_QUIRK_BIDIR }, /* Kyocera Mita FS 820, by zut kernel@zut.de */
+ { 0x04f9, 0x000d, USBLP_QUIRK_BIDIR }, /* Brother Industries, Ltd HL-1440 Laser Printer */
+ { 0x04b8, 0x0202, USBLP_QUIRK_BAD_CLASS }, /* Seiko Epson Receipt Printer M129C */
+- { 0, 0 }
++ { 0, 0, 0 }
+ };
+
+ static int usblp_wwait(struct usblp *usblp, int nonblock);
+@@ -1412,7 +1412,7 @@ static struct usb_device_id usblp_ids []
+ { USB_INTERFACE_INFO(7, 1, 2) },
+ { USB_INTERFACE_INFO(7, 1, 3) },
+ { USB_DEVICE(0x04b8, 0x0202) }, /* Seiko Epson Receipt Printer M129C */
+- { } /* Terminating entry */
++ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* Terminating entry */
+ };
+
+ MODULE_DEVICE_TABLE (usb, usblp_ids);
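These device-table hunks change no behaviour at all: in C, members omitted from a braced initializer are guaranteed to be zero-filled, so the empty sentinel { } and the spelled-out { 0, 0, ... } produce identical objects. The patch apparently just writes the zeroes explicitly so that stricter build settings stay quiet. A short illustration with a hypothetical table (IDs invented for the sketch):

    #include <linux/module.h>
    #include <linux/usb.h>

    static const struct usb_device_id demo_ids[] = {
            { USB_DEVICE(0x1234, 0x5678) }, /* hypothetical VID/PID */
            { } /* all-zero terminator: same object code as twelve 0s */
    };
    MODULE_DEVICE_TABLE(usb, demo_ids);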
+diff -urNp linux-2.6.33.1/drivers/usb/core/hcd.c linux-2.6.33.1/drivers/usb/core/hcd.c
+--- linux-2.6.33.1/drivers/usb/core/hcd.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/core/hcd.c 2010-03-20 16:58:41.020890314 -0400
+@@ -2266,7 +2266,7 @@ EXPORT_SYMBOL_GPL(usb_hcd_platform_shutd
+
+ #if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE)
+
+-struct usb_mon_operations *mon_ops;
++const struct usb_mon_operations *mon_ops;
+
+ /*
+ * The registration is unlocked.
+@@ -2276,7 +2276,7 @@ struct usb_mon_operations *mon_ops;
+ * symbols from usbcore, usbcore gets referenced and cannot be unloaded first.
+ */
+
+-int usb_mon_register (struct usb_mon_operations *ops)
++int usb_mon_register (const struct usb_mon_operations *ops)
+ {
+
+ if (mon_ops)
+diff -urNp linux-2.6.33.1/drivers/usb/core/hcd.h linux-2.6.33.1/drivers/usb/core/hcd.h
+--- linux-2.6.33.1/drivers/usb/core/hcd.h 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/core/hcd.h 2010-03-20 16:58:41.032566122 -0400
+@@ -501,13 +501,13 @@ static inline void usbfs_cleanup(void) {
+ #if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE)
+
+ struct usb_mon_operations {
+- void (*urb_submit)(struct usb_bus *bus, struct urb *urb);
+- void (*urb_submit_error)(struct usb_bus *bus, struct urb *urb, int err);
+- void (*urb_complete)(struct usb_bus *bus, struct urb *urb, int status);
++ void (* const urb_submit)(struct usb_bus *bus, struct urb *urb);
++ void (* const urb_submit_error)(struct usb_bus *bus, struct urb *urb, int err);
++ void (* const urb_complete)(struct usb_bus *bus, struct urb *urb, int status);
+ /* void (*urb_unlink)(struct usb_bus *bus, struct urb *urb); */
+ };
+
+-extern struct usb_mon_operations *mon_ops;
++extern const struct usb_mon_operations *mon_ops;
+
+ static inline void usbmon_urb_submit(struct usb_bus *bus, struct urb *urb)
+ {
+@@ -529,7 +529,7 @@ static inline void usbmon_urb_complete(s
+ (*mon_ops->urb_complete)(bus, urb, status);
+ }
+
+-int usb_mon_register(struct usb_mon_operations *ops);
++int usb_mon_register(const struct usb_mon_operations *ops);
+ void usb_mon_deregister(void);
+
+ #else
+diff -urNp linux-2.6.33.1/drivers/usb/core/hub.c linux-2.6.33.1/drivers/usb/core/hub.c
+--- linux-2.6.33.1/drivers/usb/core/hub.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/core/hub.c 2010-03-20 16:58:41.044569139 -0400
+@@ -3462,7 +3462,7 @@ static struct usb_device_id hub_id_table
+ .bDeviceClass = USB_CLASS_HUB},
+ { .match_flags = USB_DEVICE_ID_MATCH_INT_CLASS,
+ .bInterfaceClass = USB_CLASS_HUB},
+- { } /* Terminating entry */
++ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* Terminating entry */
+ };
+
+ MODULE_DEVICE_TABLE (usb, hub_id_table);
+diff -urNp linux-2.6.33.1/drivers/usb/core/message.c linux-2.6.33.1/drivers/usb/core/message.c
+--- linux-2.6.33.1/drivers/usb/core/message.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/core/message.c 2010-03-20 16:58:41.048891316 -0400
+@@ -909,8 +909,8 @@ char *usb_cache_string(struct usb_device
+ buf = kmalloc(MAX_USB_STRING_SIZE, GFP_NOIO);
+ if (buf) {
+ len = usb_string(udev, index, buf, MAX_USB_STRING_SIZE);
+- if (len > 0) {
+- smallbuf = kmalloc(++len, GFP_NOIO);
++ if (len++ > 0) {
++ smallbuf = kmalloc(len, GFP_NOIO);
+ if (!smallbuf)
+ return buf;
+ memcpy(smallbuf, buf, len);
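The usb_cache_string() hunk is also behaviour-preserving: both versions grow len by one to cover the trailing NUL before allocating, the patch merely hoists the increment out of the kmalloc() argument into the test. The resulting shape, sketched with plain malloc() as a hypothetical userspace helper (src is assumed NUL-terminated, len excludes the NUL):

    #include <stdlib.h>
    #include <string.h>

    char *cache_string(const char *src, int len)
    {
            char *copy = NULL;

            if (len++ > 0) {            /* len now includes the terminator */
                    copy = malloc(len);
                    if (copy)
                            memcpy(copy, src, len); /* copies the NUL too */
            }
            return copy;
    }

Keeping the side effect in one obvious place rather than buried in an argument list is the whole point of the change.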
+diff -urNp linux-2.6.33.1/drivers/usb/host/ehci-pci.c linux-2.6.33.1/drivers/usb/host/ehci-pci.c
+--- linux-2.6.33.1/drivers/usb/host/ehci-pci.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/host/ehci-pci.c 2010-03-20 16:58:41.060592249 -0400
+@@ -422,7 +422,7 @@ static const struct pci_device_id pci_id
+ PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_EHCI, ~0),
+ .driver_data = (unsigned long) &ehci_pci_hc_driver,
+ },
+- { /* end: all zeroes */ }
++ { 0, 0, 0, 0, 0, 0, 0 }
+ };
+ MODULE_DEVICE_TABLE(pci, pci_ids);
+
+diff -urNp linux-2.6.33.1/drivers/usb/host/uhci-hcd.c linux-2.6.33.1/drivers/usb/host/uhci-hcd.c
+--- linux-2.6.33.1/drivers/usb/host/uhci-hcd.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/host/uhci-hcd.c 2010-03-20 16:58:41.072592318 -0400
+@@ -941,7 +941,7 @@ static const struct pci_device_id uhci_p
+ /* handle any USB UHCI controller */
+ PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_UHCI, ~0),
+ .driver_data = (unsigned long) &uhci_driver,
+- }, { /* end: all zeroes */ }
++ }, { 0, 0, 0, 0, 0, 0, 0 }
+ };
+
+ MODULE_DEVICE_TABLE(pci, uhci_pci_ids);
+diff -urNp linux-2.6.33.1/drivers/usb/misc/appledisplay.c linux-2.6.33.1/drivers/usb/misc/appledisplay.c
+--- linux-2.6.33.1/drivers/usb/misc/appledisplay.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/misc/appledisplay.c 2010-03-20 16:58:41.076561776 -0400
+@@ -179,7 +179,7 @@ static int appledisplay_bl_get_brightnes
+ return pdata->msgdata[1];
+ }
+
+-static struct backlight_ops appledisplay_bl_data = {
++static const struct backlight_ops appledisplay_bl_data = {
+ .get_brightness = appledisplay_bl_get_brightness,
+ .update_status = appledisplay_bl_update_status,
+ };
+diff -urNp linux-2.6.33.1/drivers/usb/mon/mon_main.c linux-2.6.33.1/drivers/usb/mon/mon_main.c
+--- linux-2.6.33.1/drivers/usb/mon/mon_main.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/mon/mon_main.c 2010-03-20 16:58:41.084588620 -0400
+@@ -238,7 +238,7 @@ static struct notifier_block mon_nb = {
+ /*
+ * Ops
+ */
+-static struct usb_mon_operations mon_ops_0 = {
++static const struct usb_mon_operations mon_ops_0 = {
+ .urb_submit = mon_submit,
+ .urb_submit_error = mon_submit_error,
+ .urb_complete = mon_complete,
+diff -urNp linux-2.6.33.1/drivers/usb/storage/debug.h linux-2.6.33.1/drivers/usb/storage/debug.h
+--- linux-2.6.33.1/drivers/usb/storage/debug.h 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/usb/storage/debug.h 2010-03-20 16:58:41.084588620 -0400
+@@ -54,9 +54,9 @@ void usb_stor_show_sense( unsigned char
+ #define US_DEBUGPX(x...) printk( x )
+ #define US_DEBUG(x) x
+ #else
+-#define US_DEBUGP(x...)
+-#define US_DEBUGPX(x...)
+-#define US_DEBUG(x)
++#define US_DEBUGP(x...) do {} while (0)
++#define US_DEBUGPX(x...) 
do {} while (0) ++#define US_DEBUG(x) do {} while (0) + #endif + + #endif +diff -urNp linux-2.6.33.1/drivers/usb/storage/usb.c linux-2.6.33.1/drivers/usb/storage/usb.c +--- linux-2.6.33.1/drivers/usb/storage/usb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/usb/storage/usb.c 2010-03-20 16:58:41.084588620 -0400 +@@ -122,7 +122,7 @@ MODULE_PARM_DESC(quirks, "supplemental l + + static struct us_unusual_dev us_unusual_dev_list[] = { + # include "unusual_devs.h" +- { } /* Terminating entry */ ++ { NULL, NULL, 0, 0, NULL } /* Terminating entry */ + }; + + #undef UNUSUAL_DEV +diff -urNp linux-2.6.33.1/drivers/usb/storage/usual-tables.c linux-2.6.33.1/drivers/usb/storage/usual-tables.c +--- linux-2.6.33.1/drivers/usb/storage/usual-tables.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/usb/storage/usual-tables.c 2010-03-20 16:58:41.084588620 -0400 +@@ -48,7 +48,7 @@ + + struct usb_device_id usb_storage_usb_ids[] = { + # include "unusual_devs.h" +- { } /* Terminating entry */ ++ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* Terminating entry */ + }; + EXPORT_SYMBOL_GPL(usb_storage_usb_ids); + +diff -urNp linux-2.6.33.1/drivers/uwb/wlp/messages.c linux-2.6.33.1/drivers/uwb/wlp/messages.c +--- linux-2.6.33.1/drivers/uwb/wlp/messages.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/uwb/wlp/messages.c 2010-03-20 16:58:41.096879476 -0400 +@@ -903,7 +903,7 @@ int wlp_parse_f0(struct wlp *wlp, struct + size_t len = skb->len; + size_t used; + ssize_t result; +- struct wlp_nonce enonce, rnonce; ++ struct wlp_nonce enonce = {{0}}, rnonce = {{0}}; + enum wlp_assc_error assc_err; + char enonce_buf[WLP_WSS_NONCE_STRSIZE]; + char rnonce_buf[WLP_WSS_NONCE_STRSIZE]; +diff -urNp linux-2.6.33.1/drivers/uwb/wlp/sysfs.c linux-2.6.33.1/drivers/uwb/wlp/sysfs.c +--- linux-2.6.33.1/drivers/uwb/wlp/sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/uwb/wlp/sysfs.c 2010-03-20 16:58:41.104559489 -0400 +@@ -615,8 +615,7 @@ ssize_t wlp_wss_attr_store(struct kobjec + return ret; + } + +-static +-struct sysfs_ops wss_sysfs_ops = { ++static const struct sysfs_ops wss_sysfs_ops = { + .show = wlp_wss_attr_show, + .store = wlp_wss_attr_store, + }; +diff -urNp linux-2.6.33.1/drivers/video/atmel_lcdfb.c linux-2.6.33.1/drivers/video/atmel_lcdfb.c +--- linux-2.6.33.1/drivers/video/atmel_lcdfb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/atmel_lcdfb.c 2010-03-20 16:58:41.116887355 -0400 +@@ -110,7 +110,7 @@ static int atmel_bl_get_brightness(struc + return lcdc_readl(sinfo, ATMEL_LCDC_CONTRAST_VAL); + } + +-static struct backlight_ops atmel_lcdc_bl_ops = { ++static const struct backlight_ops atmel_lcdc_bl_ops = { + .update_status = atmel_bl_update_status, + .get_brightness = atmel_bl_get_brightness, + }; +diff -urNp linux-2.6.33.1/drivers/video/aty/aty128fb.c linux-2.6.33.1/drivers/video/aty/aty128fb.c +--- linux-2.6.33.1/drivers/video/aty/aty128fb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/aty/aty128fb.c 2010-03-20 16:58:41.141056447 -0400 +@@ -1787,7 +1787,7 @@ static int aty128_bl_get_brightness(stru + return bd->props.brightness; + } + +-static struct backlight_ops aty128_bl_data = { ++static const struct backlight_ops aty128_bl_data = { + .get_brightness = aty128_bl_get_brightness, + .update_status = aty128_bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/video/aty/atyfb_base.c linux-2.6.33.1/drivers/video/aty/atyfb_base.c +--- linux-2.6.33.1/drivers/video/aty/atyfb_base.c 2010-03-15 
12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/aty/atyfb_base.c 2010-03-20 16:58:41.141056447 -0400 +@@ -2225,7 +2225,7 @@ static int aty_bl_get_brightness(struct + return bd->props.brightness; + } + +-static struct backlight_ops aty_bl_data = { ++static const struct backlight_ops aty_bl_data = { + .get_brightness = aty_bl_get_brightness, + .update_status = aty_bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/video/aty/radeon_backlight.c linux-2.6.33.1/drivers/video/aty/radeon_backlight.c +--- linux-2.6.33.1/drivers/video/aty/radeon_backlight.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/aty/radeon_backlight.c 2010-03-20 16:58:41.144609392 -0400 +@@ -127,7 +127,7 @@ static int radeon_bl_get_brightness(stru + return bd->props.brightness; + } + +-static struct backlight_ops radeon_bl_data = { ++static const struct backlight_ops radeon_bl_data = { + .get_brightness = radeon_bl_get_brightness, + .update_status = radeon_bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/video/bf54x-lq043fb.c linux-2.6.33.1/drivers/video/bf54x-lq043fb.c +--- linux-2.6.33.1/drivers/video/bf54x-lq043fb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/bf54x-lq043fb.c 2010-03-20 16:58:41.172899299 -0400 +@@ -463,7 +463,7 @@ static int bl_get_brightness(struct back + return 0; + } + +-static struct backlight_ops bfin_lq043fb_bl_ops = { ++static const struct backlight_ops bfin_lq043fb_bl_ops = { + .get_brightness = bl_get_brightness, + }; + +diff -urNp linux-2.6.33.1/drivers/video/bfin-t350mcqb-fb.c linux-2.6.33.1/drivers/video/bfin-t350mcqb-fb.c +--- linux-2.6.33.1/drivers/video/bfin-t350mcqb-fb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/bfin-t350mcqb-fb.c 2010-03-20 16:58:41.172899299 -0400 +@@ -381,7 +381,7 @@ static int bl_get_brightness(struct back + return 0; + } + +-static struct backlight_ops bfin_lq043fb_bl_ops = { ++static const struct backlight_ops bfin_lq043fb_bl_ops = { + .get_brightness = bl_get_brightness, + }; + +diff -urNp linux-2.6.33.1/drivers/video/fbmem.c linux-2.6.33.1/drivers/video/fbmem.c +--- linux-2.6.33.1/drivers/video/fbmem.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/fbmem.c 2010-03-20 16:58:41.176758490 -0400 +@@ -403,7 +403,7 @@ static void fb_do_show_logo(struct fb_in + image->dx += image->width + 8; + } + } else if (rotate == FB_ROTATE_UD) { +- for (x = 0; x < num && image->dx >= 0; x++) { ++ for (x = 0; x < num && (__s32)image->dx >= 0; x++) { + info->fbops->fb_imageblit(info, image); + image->dx -= image->width + 8; + } +@@ -415,7 +415,7 @@ static void fb_do_show_logo(struct fb_in + image->dy += image->height + 8; + } + } else if (rotate == FB_ROTATE_CCW) { +- for (x = 0; x < num && image->dy >= 0; x++) { ++ for (x = 0; x < num && (__s32)image->dy >= 0; x++) { + info->fbops->fb_imageblit(info, image); + image->dy -= image->height + 8; + } +@@ -1119,7 +1119,7 @@ static long do_fb_ioctl(struct fb_info * + return -EFAULT; + if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) + return -EINVAL; +- if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) ++ if (con2fb.framebuffer >= FB_MAX) + return -EINVAL; + if (!registered_fb[con2fb.framebuffer]) + request_module("fb%d", con2fb.framebuffer); +diff -urNp linux-2.6.33.1/drivers/video/fbmon.c linux-2.6.33.1/drivers/video/fbmon.c +--- linux-2.6.33.1/drivers/video/fbmon.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/fbmon.c 2010-03-20 16:58:41.184890729 -0400 +@@ -45,7 +45,7 
@@ + #ifdef DEBUG + #define DPRINTK(fmt, args...) printk(fmt,## args) + #else +-#define DPRINTK(fmt, args...) ++#define DPRINTK(fmt, args...) do {} while (0) + #endif + + #define FBMON_FIX_HEADER 1 +diff -urNp linux-2.6.33.1/drivers/video/i810/i810_accel.c linux-2.6.33.1/drivers/video/i810/i810_accel.c +--- linux-2.6.33.1/drivers/video/i810/i810_accel.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/i810/i810_accel.c 2010-03-20 16:58:41.188702176 -0400 +@@ -73,6 +73,7 @@ static inline int wait_for_space(struct + } + } + printk("ringbuffer lockup!!!\n"); ++ printk("head:%u tail:%u iring.size:%u space:%u\n", head, tail, par->iring.size, space); + i810_report_error(mmio); + par->dev_flags |= LOCKUP; + info->pixmap.scan_align = 1; +diff -urNp linux-2.6.33.1/drivers/video/i810/i810_main.c linux-2.6.33.1/drivers/video/i810/i810_main.c +--- linux-2.6.33.1/drivers/video/i810/i810_main.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/i810/i810_main.c 2010-03-20 16:58:41.196891711 -0400 +@@ -120,7 +120,7 @@ static struct pci_device_id i810fb_pci_t + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_CGC, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 5 }, +- { 0 }, ++ { 0, 0, 0, 0, 0, 0, 0 }, + }; + + static struct pci_driver i810fb_driver = { +diff -urNp linux-2.6.33.1/drivers/video/modedb.c linux-2.6.33.1/drivers/video/modedb.c +--- linux-2.6.33.1/drivers/video/modedb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/modedb.c 2010-03-20 16:58:41.200859456 -0400 +@@ -39,240 +39,240 @@ static const struct fb_videomode modedb[ + { + /* 640x400 @ 70 Hz, 31.5 kHz hsync */ + NULL, 70, 640, 400, 39721, 40, 24, 39, 9, 96, 2, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 640x480 @ 60 Hz, 31.5 kHz hsync */ + NULL, 60, 640, 480, 39721, 40, 24, 32, 11, 96, 2, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 800x600 @ 56 Hz, 35.15 kHz hsync */ + NULL, 56, 800, 600, 27777, 128, 24, 22, 1, 72, 2, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1024x768 @ 87 Hz interlaced, 35.5 kHz hsync */ + NULL, 87, 1024, 768, 22271, 56, 24, 33, 8, 160, 8, +- 0, FB_VMODE_INTERLACED ++ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 640x400 @ 85 Hz, 37.86 kHz hsync */ + NULL, 85, 640, 400, 31746, 96, 32, 41, 1, 64, 3, +- FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 640x480 @ 72 Hz, 36.5 kHz hsync */ + NULL, 72, 640, 480, 31746, 144, 40, 30, 8, 40, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 640x480 @ 75 Hz, 37.50 kHz hsync */ + NULL, 75, 640, 480, 31746, 120, 16, 16, 1, 64, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 800x600 @ 60 Hz, 37.8 kHz hsync */ + NULL, 60, 800, 600, 25000, 88, 40, 23, 1, 128, 4, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 640x480 @ 85 Hz, 43.27 kHz hsync */ + NULL, 85, 640, 480, 27777, 80, 56, 25, 1, 56, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1152x864 @ 89 Hz interlaced, 44 kHz hsync */ + NULL, 89, 1152, 864, 15384, 96, 16, 110, 1, 216, 10, +- 0, FB_VMODE_INTERLACED ++ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 800x600 
@ 72 Hz, 48.0 kHz hsync */ + NULL, 72, 800, 600, 20000, 64, 56, 23, 37, 120, 6, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1024x768 @ 60 Hz, 48.4 kHz hsync */ + NULL, 60, 1024, 768, 15384, 168, 8, 29, 3, 144, 6, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 640x480 @ 100 Hz, 53.01 kHz hsync */ + NULL, 100, 640, 480, 21834, 96, 32, 36, 8, 96, 6, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1152x864 @ 60 Hz, 53.5 kHz hsync */ + NULL, 60, 1152, 864, 11123, 208, 64, 16, 4, 256, 8, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 800x600 @ 85 Hz, 55.84 kHz hsync */ + NULL, 85, 800, 600, 16460, 160, 64, 36, 16, 64, 5, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1024x768 @ 70 Hz, 56.5 kHz hsync */ + NULL, 70, 1024, 768, 13333, 144, 24, 29, 3, 136, 6, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x1024 @ 87 Hz interlaced, 51 kHz hsync */ + NULL, 87, 1280, 1024, 12500, 56, 16, 128, 1, 216, 12, +- 0, FB_VMODE_INTERLACED ++ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 800x600 @ 100 Hz, 64.02 kHz hsync */ + NULL, 100, 800, 600, 14357, 160, 64, 30, 4, 64, 6, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1024x768 @ 76 Hz, 62.5 kHz hsync */ + NULL, 76, 1024, 768, 11764, 208, 8, 36, 16, 120, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1152x864 @ 70 Hz, 62.4 kHz hsync */ + NULL, 70, 1152, 864, 10869, 106, 56, 20, 1, 160, 10, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x1024 @ 61 Hz, 64.2 kHz hsync */ + NULL, 61, 1280, 1024, 9090, 200, 48, 26, 1, 184, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1400x1050 @ 60Hz, 63.9 kHz hsync */ + NULL, 60, 1400, 1050, 9259, 136, 40, 13, 1, 112, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1400x1050 @ 75,107 Hz, 82,392 kHz +hsync +vsync*/ + NULL, 75, 1400, 1050, 7190, 120, 56, 23, 10, 112, 13, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1400x1050 @ 60 Hz, ? 
kHz +hsync +vsync*/ + NULL, 60, 1400, 1050, 9259, 128, 40, 12, 0, 112, 3, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1024x768 @ 85 Hz, 70.24 kHz hsync */ + NULL, 85, 1024, 768, 10111, 192, 32, 34, 14, 160, 6, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1152x864 @ 78 Hz, 70.8 kHz hsync */ + NULL, 78, 1152, 864, 9090, 228, 88, 32, 0, 84, 12, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x1024 @ 70 Hz, 74.59 kHz hsync */ + NULL, 70, 1280, 1024, 7905, 224, 32, 28, 8, 160, 8, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1600x1200 @ 60Hz, 75.00 kHz hsync */ + NULL, 60, 1600, 1200, 6172, 304, 64, 46, 1, 192, 3, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1152x864 @ 84 Hz, 76.0 kHz hsync */ + NULL, 84, 1152, 864, 7407, 184, 312, 32, 0, 128, 12, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x1024 @ 74 Hz, 78.85 kHz hsync */ + NULL, 74, 1280, 1024, 7407, 256, 32, 34, 3, 144, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1024x768 @ 100Hz, 80.21 kHz hsync */ + NULL, 100, 1024, 768, 8658, 192, 32, 21, 3, 192, 10, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x1024 @ 76 Hz, 81.13 kHz hsync */ + NULL, 76, 1280, 1024, 7407, 248, 32, 34, 3, 104, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1600x1200 @ 70 Hz, 87.50 kHz hsync */ + NULL, 70, 1600, 1200, 5291, 304, 64, 46, 1, 192, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1152x864 @ 100 Hz, 89.62 kHz hsync */ + NULL, 100, 1152, 864, 7264, 224, 32, 17, 2, 128, 19, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x1024 @ 85 Hz, 91.15 kHz hsync */ + NULL, 85, 1280, 1024, 6349, 224, 64, 44, 1, 160, 3, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1600x1200 @ 75 Hz, 93.75 kHz hsync */ + NULL, 75, 1600, 1200, 4938, 304, 64, 46, 1, 192, 3, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1680x1050 @ 60 Hz, 65.191 kHz hsync */ + NULL, 60, 1680, 1050, 6848, 280, 104, 30, 3, 176, 6, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1600x1200 @ 85 Hz, 105.77 kHz hsync */ + NULL, 85, 1600, 1200, 4545, 272, 16, 37, 4, 192, 3, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x1024 @ 100 Hz, 107.16 kHz hsync */ + NULL, 100, 1280, 1024, 5502, 256, 32, 26, 7, 128, 15, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1800x1440 @ 64Hz, 96.15 kHz hsync */ + NULL, 64, 1800, 1440, 4347, 304, 96, 46, 1, 192, 3, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ 
FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1800x1440 @ 70Hz, 104.52 kHz hsync */ + NULL, 70, 1800, 1440, 4000, 304, 96, 46, 1, 192, 3, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 512x384 @ 78 Hz, 31.50 kHz hsync */ + NULL, 78, 512, 384, 49603, 48, 16, 16, 1, 64, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 512x384 @ 85 Hz, 34.38 kHz hsync */ + NULL, 85, 512, 384, 45454, 48, 16, 16, 1, 64, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 320x200 @ 70 Hz, 31.5 kHz hsync, 8:5 aspect ratio */ + NULL, 70, 320, 200, 79440, 16, 16, 20, 4, 48, 1, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 320x240 @ 60 Hz, 31.5 kHz hsync, 4:3 aspect ratio */ + NULL, 60, 320, 240, 79440, 16, 16, 16, 5, 48, 1, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 320x240 @ 72 Hz, 36.5 kHz hsync */ + NULL, 72, 320, 240, 63492, 16, 16, 16, 4, 48, 2, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 400x300 @ 56 Hz, 35.2 kHz hsync, 4:3 aspect ratio */ + NULL, 56, 400, 300, 55555, 64, 16, 10, 1, 32, 1, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 400x300 @ 60 Hz, 37.8 kHz hsync */ + NULL, 60, 400, 300, 50000, 48, 16, 11, 1, 64, 2, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 400x300 @ 72 Hz, 48.0 kHz hsync */ + NULL, 72, 400, 300, 40000, 32, 24, 11, 19, 64, 3, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 480x300 @ 56 Hz, 35.2 kHz hsync, 8:5 aspect ratio */ + NULL, 56, 480, 300, 46176, 80, 16, 10, 1, 40, 1, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 480x300 @ 60 Hz, 37.8 kHz hsync */ + NULL, 60, 480, 300, 41858, 56, 16, 11, 1, 80, 2, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 480x300 @ 63 Hz, 39.6 kHz hsync */ + NULL, 63, 480, 300, 40000, 56, 16, 11, 1, 80, 2, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 480x300 @ 72 Hz, 48.0 kHz hsync */ + NULL, 72, 480, 300, 33386, 40, 24, 11, 19, 80, 3, +- 0, FB_VMODE_DOUBLE ++ 0, FB_VMODE_DOUBLE, FB_MODE_IS_UNKNOWN + }, { + /* 1920x1200 @ 60 Hz, 74.5 Khz hsync */ + NULL, 60, 1920, 1200, 5177, 128, 336, 1, 38, 208, 3, + FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, +- FB_VMODE_NONINTERLACED ++ FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1152x768, 60 Hz, PowerBook G4 Titanium I and II */ + NULL, 60, 1152, 768, 14047, 158, 26, 29, 3, 136, 6, +- FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED ++ FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1366x768, 60 Hz, 47.403 kHz hsync, WXGA 16:9 aspect ratio */ + NULL, 60, 1366, 768, 13806, 120, 10, 14, 3, 32, 5, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 1280x800, 60 Hz, 47.403 kHz hsync, WXGA 16:10 aspect ratio */ + NULL, 60, 1280, 800, 12048, 200, 64, 24, 1, 136, 3, +- 0, FB_VMODE_NONINTERLACED ++ 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 720x576i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ + NULL, 50, 720, 576, 74074, 64, 16, 39, 5, 64, 5, +- 0, FB_VMODE_INTERLACED ++ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN + }, { + /* 800x520i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ + 
NULL, 50, 800, 520, 58823, 144, 64, 72, 28, 80, 5, +- 0, FB_VMODE_INTERLACED ++ 0, FB_VMODE_INTERLACED, FB_MODE_IS_UNKNOWN + }, + }; + +diff -urNp linux-2.6.33.1/drivers/video/nvidia/nv_backlight.c linux-2.6.33.1/drivers/video/nvidia/nv_backlight.c +--- linux-2.6.33.1/drivers/video/nvidia/nv_backlight.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/nvidia/nv_backlight.c 2010-03-20 16:58:41.200859456 -0400 +@@ -87,7 +87,7 @@ static int nvidia_bl_get_brightness(stru + return bd->props.brightness; + } + +-static struct backlight_ops nvidia_bl_ops = { ++static const struct backlight_ops nvidia_bl_ops = { + .get_brightness = nvidia_bl_get_brightness, + .update_status = nvidia_bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/video/omap2/displays/panel-taal.c linux-2.6.33.1/drivers/video/omap2/displays/panel-taal.c +--- linux-2.6.33.1/drivers/video/omap2/displays/panel-taal.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/omap2/displays/panel-taal.c 2010-03-20 16:58:41.208900904 -0400 +@@ -313,7 +313,7 @@ static int taal_bl_get_intensity(struct + return 0; + } + +-static struct backlight_ops taal_bl_ops = { ++static const struct backlight_ops taal_bl_ops = { + .get_brightness = taal_bl_get_intensity, + .update_status = taal_bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/video/omap2/dss/manager.c linux-2.6.33.1/drivers/video/omap2/dss/manager.c +--- linux-2.6.33.1/drivers/video/omap2/dss/manager.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/omap2/dss/manager.c 2010-03-20 16:58:41.220903696 -0400 +@@ -341,7 +341,7 @@ static ssize_t manager_attr_store(struct + return manager_attr->store(manager, buf, size); + } + +-static struct sysfs_ops manager_sysfs_ops = { ++static const struct sysfs_ops manager_sysfs_ops = { + .show = manager_attr_show, + .store = manager_attr_store, + }; +diff -urNp linux-2.6.33.1/drivers/video/omap2/dss/overlay.c linux-2.6.33.1/drivers/video/omap2/dss/overlay.c +--- linux-2.6.33.1/drivers/video/omap2/dss/overlay.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/omap2/dss/overlay.c 2010-03-20 16:58:41.220903696 -0400 +@@ -320,7 +320,7 @@ static ssize_t overlay_attr_store(struct + return overlay_attr->store(overlay, buf, size); + } + +-static struct sysfs_ops overlay_sysfs_ops = { ++static const struct sysfs_ops overlay_sysfs_ops = { + .show = overlay_attr_show, + .store = overlay_attr_store, + }; +diff -urNp linux-2.6.33.1/drivers/video/riva/fbdev.c linux-2.6.33.1/drivers/video/riva/fbdev.c +--- linux-2.6.33.1/drivers/video/riva/fbdev.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/riva/fbdev.c 2010-03-20 16:58:41.252568052 -0400 +@@ -331,7 +331,7 @@ static int riva_bl_get_brightness(struct + return bd->props.brightness; + } + +-static struct backlight_ops riva_bl_ops = { ++static const struct backlight_ops riva_bl_ops = { + .get_brightness = riva_bl_get_brightness, + .update_status = riva_bl_update_status, + }; +diff -urNp linux-2.6.33.1/drivers/video/uvesafb.c linux-2.6.33.1/drivers/video/uvesafb.c +--- linux-2.6.33.1/drivers/video/uvesafb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/drivers/video/uvesafb.c 2010-03-20 16:58:41.260895734 -0400 +@@ -18,6 +18,7 @@ + #include <linux/fb.h> + #include <linux/io.h> + #include <linux/mutex.h> ++#include <linux/moduleloader.h> + #include <video/edid.h> + #include <video/uvesafb.h> + #ifdef CONFIG_X86 +@@ -120,7 +121,7 @@ static int uvesafb_helper_start(void) + NULL, + }; + +- return 
call_usermodehelper(v86d_path, argv, envp, 1);
++ return call_usermodehelper(v86d_path, argv, envp, UMH_WAIT_PROC);
+ }
+
+ /*
+@@ -568,10 +569,32 @@ static int __devinit uvesafb_vbe_getpmi(
+ if ((task->t.regs.eax & 0xffff) != 0x4f || task->t.regs.es < 0xc000) {
+ par->pmi_setpal = par->ypan = 0;
+ } else {
++
++#ifdef CONFIG_PAX_KERNEXEC
++#ifdef CONFIG_MODULES
++ par->pmi_code = module_alloc_exec((u16)task->t.regs.ecx);
++#endif
++ if (!par->pmi_code) {
++ par->pmi_setpal = par->ypan = 0;
++ return 0;
++ }
++#endif
++
+ par->pmi_base = (u16 *)phys_to_virt(((u32)task->t.regs.es << 4) +
+ task->t.regs.edi);
++
++#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC)
++ pax_open_kernel();
++ memcpy(par->pmi_code, par->pmi_base, (u16)task->t.regs.ecx);
++ pax_close_kernel();
++
++ par->pmi_start = ktva_ktla(par->pmi_code + par->pmi_base[1]);
++ par->pmi_pal = ktva_ktla(par->pmi_code + par->pmi_base[2]);
++#else
+ par->pmi_start = (u8 *)par->pmi_base + par->pmi_base[1];
+ par->pmi_pal = (u8 *)par->pmi_base + par->pmi_base[2];
++#endif
++
+ printk(KERN_INFO "uvesafb: protected mode interface info at "
+ "%04x:%04x\n",
+ (u16)task->t.regs.es, (u16)task->t.regs.edi);
+@@ -1799,6 +1822,11 @@ out:
+ if (par->vbe_modes)
+ kfree(par->vbe_modes);
+
++#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC)
++ if (par->pmi_code)
++ module_free_exec(NULL, par->pmi_code);
++#endif
++
+ framebuffer_release(info);
+ return err;
+ }
+@@ -1825,6 +1853,12 @@ static int uvesafb_remove(struct platfor
+ kfree(par->vbe_state_orig);
+ if (par->vbe_state_saved)
+ kfree(par->vbe_state_saved);
++
++#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC)
++ if (par->pmi_code)
++ module_free_exec(NULL, par->pmi_code);
++#endif
++
+ }
+
+ framebuffer_release(info);
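The uvesafb hunks are the PaX KERNEXEC treatment of the VESA protected-mode interface: the BIOS code sits in ordinary data pages, which KERNEXEC maps non-executable, so the patch copies it into an executable allocation and relocates the entry pointers, opening the kernel for writing only for the duration of the copy. A conceptual sketch of that dance; module_alloc_exec(), pax_open_kernel(), pax_close_kernel() and ktva_ktla() exist only with the PaX patch applied, and the declarations below are assumed for illustration:

    #include <linux/string.h>

    /* PaX-only helpers; signatures assumed for this sketch */
    extern void *module_alloc_exec(unsigned long size);
    extern void pax_open_kernel(void);
    extern void pax_close_kernel(void);

    /* Copy a blob of BIOS code somewhere the CPU may still execute it. */
    static void *copy_to_exec(const void *bios_code, unsigned long size)
    {
            void *exec_copy = module_alloc_exec(size); /* executable pages */

            if (!exec_copy)
                    return NULL;

            pax_open_kernel();      /* lift kernel write protection briefly */
            memcpy(exec_copy, bios_code, size);
            pax_close_kernel();     /* and restore it */

            return exec_copy;       /* entry points are computed in the copy */
    }

The vesafb driver below gets the same conversion, and the UMH_WAIT_PROC change simply replaces the magic constant 1 with its named equivalent when waiting for the v86d helper.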
+diff -urNp linux-2.6.33.1/drivers/video/vesafb.c linux-2.6.33.1/drivers/video/vesafb.c
+--- linux-2.6.33.1/drivers/video/vesafb.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/video/vesafb.c 2010-03-20 16:58:41.264903169 -0400
+@@ -9,6 +9,7 @@
+ */
+
+ #include <linux/module.h>
++#include <linux/moduleloader.h>
+ #include <linux/kernel.h>
+ #include <linux/errno.h>
+ #include <linux/string.h>
+@@ -53,8 +54,8 @@ static int vram_remap __initdata; /*
+ static int vram_total __initdata; /* Set total amount of memory */
+ static int pmi_setpal __read_mostly = 1; /* pmi for palette changes ??? */
+ static int ypan __read_mostly; /* 0..nothing, 1..ypan, 2..ywrap */
+-static void (*pmi_start)(void) __read_mostly;
+-static void (*pmi_pal) (void) __read_mostly;
++static void (*pmi_start)(void) __read_only;
++static void (*pmi_pal) (void) __read_only;
+ static int depth __read_mostly;
+ static int vga_compat __read_mostly;
+ /* --------------------------------------------------------------------- */
+@@ -233,6 +234,7 @@ static int __init vesafb_probe(struct pl
+ unsigned int size_vmode;
+ unsigned int size_remap;
+ unsigned int size_total;
++ void *pmi_code = NULL;
+
+ if (screen_info.orig_video_isVGA != VIDEO_TYPE_VLFB)
+ return -ENODEV;
+@@ -275,10 +277,6 @@ static int __init vesafb_probe(struct pl
+ size_remap = size_total;
+ vesafb_fix.smem_len = size_remap;
+
+-#ifndef __i386__
+- screen_info.vesapm_seg = 0;
+-#endif
+-
+ if (!request_mem_region(vesafb_fix.smem_start, size_total, "vesafb")) {
+ printk(KERN_WARNING
+ "vesafb: cannot reserve video memory at 0x%lx\n",
+@@ -315,9 +313,21 @@ static int __init vesafb_probe(struct pl
+ printk(KERN_INFO "vesafb: mode is %dx%dx%d, linelength=%d, pages=%d\n",
+ vesafb_defined.xres, vesafb_defined.yres, vesafb_defined.bits_per_pixel, vesafb_fix.line_length, screen_info.pages);
+
++#ifdef __i386__
++
++#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC)
++ pmi_code = module_alloc_exec(screen_info.vesapm_size);
++ if (!pmi_code)
++#elif !defined(CONFIG_PAX_KERNEXEC)
++ if (0)
++#endif
++
++#endif
++ screen_info.vesapm_seg = 0;
++
+ if (screen_info.vesapm_seg) {
+- printk(KERN_INFO "vesafb: protected mode interface info at %04x:%04x\n",
+- screen_info.vesapm_seg,screen_info.vesapm_off);
++ printk(KERN_INFO "vesafb: protected mode interface info at %04x:%04x %04x bytes\n",
++ screen_info.vesapm_seg,screen_info.vesapm_off,screen_info.vesapm_size);
+ }
+
+ if (screen_info.vesapm_seg < 0xc000)
+@@ -325,9 +335,25 @@ static int __init vesafb_probe(struct pl
+
+ if (ypan || pmi_setpal) {
+ unsigned short *pmi_base;
+- pmi_base = (unsigned short*)phys_to_virt(((unsigned long)screen_info.vesapm_seg << 4) + screen_info.vesapm_off);
+- pmi_start = (void*)((char*)pmi_base + pmi_base[1]);
+- pmi_pal = (void*)((char*)pmi_base + pmi_base[2]);
++
++ pmi_base = (unsigned short*)phys_to_virt(((unsigned long)screen_info.vesapm_seg << 4) + screen_info.vesapm_off);
++
++#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC)
++ pax_open_kernel();
++ memcpy(pmi_code, pmi_base, screen_info.vesapm_size);
++#else
++ pmi_code = pmi_base;
++#endif
++
++ pmi_start = (void*)((char*)pmi_code + pmi_base[1]);
++ pmi_pal = (void*)((char*)pmi_code + pmi_base[2]);
++
++#if defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC)
++ pmi_start = ktva_ktla(pmi_start);
++ pmi_pal = ktva_ktla(pmi_pal);
++ pax_close_kernel();
++#endif
++
+ printk(KERN_INFO "vesafb: pmi: set display start = %p, set palette = %p\n",pmi_start,pmi_pal);
+ if (pmi_base[3]) {
+ printk(KERN_INFO "vesafb: pmi: ports = ");
+@@ -469,6 +495,11 @@ static int __init vesafb_probe(struct pl
+ info->node, info->fix.id);
+ return 0;
+ err:
++
++#if defined(__i386__) && defined(CONFIG_MODULES) && defined(CONFIG_PAX_KERNEXEC)
++ module_free_exec(NULL, pmi_code);
++#endif
++
+ if (info->screen_base)
+ iounmap(info->screen_base);
+ framebuffer_release(info);
+diff -urNp linux-2.6.33.1/drivers/xen/sys-hypervisor.c linux-2.6.33.1/drivers/xen/sys-hypervisor.c
+--- linux-2.6.33.1/drivers/xen/sys-hypervisor.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/drivers/xen/sys-hypervisor.c 2010-03-20 
16:58:41.272886902 -0400 +@@ -426,7 +426,7 @@ static ssize_t hyp_sysfs_store(struct ko + return 0; + } + +-static struct sysfs_ops hyp_sysfs_ops = { ++static const struct sysfs_ops hyp_sysfs_ops = { + .show = hyp_sysfs_show, + .store = hyp_sysfs_store, + }; +diff -urNp linux-2.6.33.1/fs/9p/vfs_inode.c linux-2.6.33.1/fs/9p/vfs_inode.c +--- linux-2.6.33.1/fs/9p/vfs_inode.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/9p/vfs_inode.c 2010-03-20 16:58:41.284893283 -0400 +@@ -1041,7 +1041,7 @@ static void *v9fs_vfs_follow_link(struct + static void + v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) + { +- char *s = nd_get_link(nd); ++ const char *s = nd_get_link(nd); + + P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, + IS_ERR(s) ? "<error>" : s); +diff -urNp linux-2.6.33.1/fs/aio.c linux-2.6.33.1/fs/aio.c +--- linux-2.6.33.1/fs/aio.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/aio.c 2010-03-20 16:58:41.305540612 -0400 +@@ -129,7 +129,7 @@ static int aio_setup_ring(struct kioctx + size += sizeof(struct io_event) * nr_events; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + +- if (nr_pages < 0) ++ if (nr_pages <= 0) + return -EINVAL; + + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); +diff -urNp linux-2.6.33.1/fs/attr.c linux-2.6.33.1/fs/attr.c +--- linux-2.6.33.1/fs/attr.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/attr.c 2010-03-20 16:58:41.305540612 -0400 +@@ -83,6 +83,7 @@ int inode_newsize_ok(const struct inode + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; ++ gr_learn_resource(current, RLIMIT_FSIZE, (unsigned long)offset, 1); + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) +diff -urNp linux-2.6.33.1/fs/autofs/root.c linux-2.6.33.1/fs/autofs/root.c +--- linux-2.6.33.1/fs/autofs/root.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/autofs/root.c 2010-03-20 16:58:41.316888614 -0400 +@@ -299,7 +299,8 @@ static int autofs_root_symlink(struct in + set_bit(n,sbi->symlink_bitmap); + sl = &sbi->symlink[n]; + sl->len = strlen(symname); +- sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL); ++ slsize = sl->len+1; ++ sl->data = kmalloc(slsize, GFP_KERNEL); + if (!sl->data) { + clear_bit(n,sbi->symlink_bitmap); + unlock_kernel(); +diff -urNp linux-2.6.33.1/fs/autofs4/symlink.c linux-2.6.33.1/fs/autofs4/symlink.c +--- linux-2.6.33.1/fs/autofs4/symlink.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/autofs4/symlink.c 2010-03-20 16:58:41.324896891 -0400 +@@ -15,7 +15,7 @@ + static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) + { + struct autofs_info *ino = autofs4_dentry_ino(dentry); +- nd_set_link(nd, (char *)ino->u.symlink); ++ nd_set_link(nd, ino->u.symlink); + return NULL; + } + +diff -urNp linux-2.6.33.1/fs/befs/linuxvfs.c linux-2.6.33.1/fs/befs/linuxvfs.c +--- linux-2.6.33.1/fs/befs/linuxvfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/befs/linuxvfs.c 2010-03-20 16:58:41.332900573 -0400 +@@ -493,7 +493,7 @@ static void befs_put_link(struct dentry + { + befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { +- char *link = nd_get_link(nd); ++ const char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); + } +diff -urNp linux-2.6.33.1/fs/binfmt_aout.c linux-2.6.33.1/fs/binfmt_aout.c +--- linux-2.6.33.1/fs/binfmt_aout.c 2010-03-15 12:09:39.000000000 -0400 ++++ 
linux-2.6.33.1/fs/binfmt_aout.c 2010-03-20 16:58:41.336873242 -0400 +@@ -16,6 +16,7 @@ + #include <linux/string.h> + #include <linux/fs.h> + #include <linux/file.h> ++#include <linux/security.h> + #include <linux/stat.h> + #include <linux/fcntl.h> + #include <linux/ptrace.h> +@@ -114,10 +115,12 @@ static int aout_core_dump(struct coredum + + /* If the size of the dump file exceeds the rlimit, then see what would happen + if we wrote the stack, but not the data area. */ ++ gr_learn_resource(current, RLIMIT_CORE, (dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE, 1); + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) + dump.u_dsize = 0; + + /* Make sure we have enough room to write the stack and data areas. */ ++ gr_learn_resource(current, RLIMIT_CORE, (dump.u_ssize + 1) * PAGE_SIZE, 1); + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) + dump.u_ssize = 0; + +@@ -250,6 +253,8 @@ static int load_aout_binary(struct linux + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + if (rlim >= RLIM_INFINITY) + rlim = ~0; ++ ++ gr_learn_resource(current, RLIMIT_DATA, ex.a_data + ex.a_bss, 1); + if (ex.a_data + ex.a_bss > rlim) + return -ENOMEM; + +@@ -278,6 +283,27 @@ static int load_aout_binary(struct linux + install_exec_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + ++#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) ++ current->mm->pax_flags = 0UL; ++#endif ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (!(N_FLAGS(ex) & F_PAX_PAGEEXEC)) { ++ current->mm->pax_flags |= MF_PAX_PAGEEXEC; ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ if (N_FLAGS(ex) & F_PAX_EMUTRAMP) ++ current->mm->pax_flags |= MF_PAX_EMUTRAMP; ++#endif ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (!(N_FLAGS(ex) & F_PAX_MPROTECT)) ++ current->mm->pax_flags |= MF_PAX_MPROTECT; ++#endif ++ ++ } ++#endif ++ + if (N_MAGIC(ex) == OMAGIC) { + unsigned long text_addr, map_size; + loff_t pos; +@@ -350,7 +376,7 @@ static int load_aout_binary(struct linux + + down_write(¤t->mm->mmap_sem); + error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, +- PROT_READ | PROT_WRITE | PROT_EXEC, ++ PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + fd_offset + ex.a_text); + up_write(¤t->mm->mmap_sem); +diff -urNp linux-2.6.33.1/fs/binfmt_elf.c linux-2.6.33.1/fs/binfmt_elf.c +--- linux-2.6.33.1/fs/binfmt_elf.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/binfmt_elf.c 2010-03-20 16:58:41.348904798 -0400 +@@ -50,6 +50,10 @@ static int elf_core_dump(struct coredump + #define elf_core_dump NULL + #endif + ++#ifdef CONFIG_PAX_MPROTECT ++static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags); ++#endif ++ + #if ELF_EXEC_PAGESIZE > PAGE_SIZE + #define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE + #else +@@ -69,6 +73,11 @@ static struct linux_binfmt elf_format = + .load_binary = load_elf_binary, + .load_shlib = load_elf_library, + .core_dump = elf_core_dump, ++ ++#ifdef CONFIG_PAX_MPROTECT ++ .handle_mprotect= elf_handle_mprotect, ++#endif ++ + .min_coredump = ELF_EXEC_PAGESIZE, + .hasvdso = 1 + }; +@@ -77,6 +86,8 @@ static struct linux_binfmt elf_format = + + static int set_brk(unsigned long start, unsigned long end) + { ++ unsigned long e = end; ++ + start = ELF_PAGEALIGN(start); + end = ELF_PAGEALIGN(end); + if (end > start) { +@@ -87,7 +98,7 @@ static int set_brk(unsigned long start, + if (BAD_ADDR(addr)) + return addr; + } +- current->mm->start_brk = current->mm->brk = end; ++ current->mm->start_brk = current->mm->brk = e; + return 0; + } + +@@ -148,7 +159,7 @@ create_elf_tables(struct linux_binprm *b 
+ elf_addr_t __user *u_rand_bytes;
+ const char *k_platform = ELF_PLATFORM;
+ const char *k_base_platform = ELF_BASE_PLATFORM;
+- unsigned char k_rand_bytes[16];
++ u32 k_rand_bytes[4];
+ int items;
+ elf_addr_t *elf_info;
+ int ei_index = 0;
+@@ -195,6 +206,10 @@ create_elf_tables(struct linux_binprm *b
+ * Generate 16 random bytes for userspace PRNG seeding.
+ */
+ get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
++ srandom32(k_rand_bytes[0] ^ random32());
++ srandom32(k_rand_bytes[1] ^ random32());
++ srandom32(k_rand_bytes[2] ^ random32());
++ srandom32(k_rand_bytes[3] ^ random32());
+ u_rand_bytes = (elf_addr_t __user *)
+ STACK_ALLOC(p, sizeof(k_rand_bytes));
+ if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+@@ -385,10 +400,10 @@ static unsigned long load_elf_interp(str
+ {
+ struct elf_phdr *elf_phdata;
+ struct elf_phdr *eppnt;
+- unsigned long load_addr = 0;
++ unsigned long load_addr = 0, pax_task_size = TASK_SIZE;
+ int load_addr_set = 0;
+ unsigned long last_bss = 0, elf_bss = 0;
+- unsigned long error = ~0UL;
++ unsigned long error = -EINVAL;
+ unsigned long total_size;
+ int retval, i, size;
+
+@@ -434,6 +449,11 @@ static unsigned long load_elf_interp(str
+ goto out_close;
+ }
+
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (current->mm->pax_flags & MF_PAX_SEGMEXEC)
++ pax_task_size = SEGMEXEC_TASK_SIZE;
++#endif
++
+ eppnt = elf_phdata;
+ for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+ if (eppnt->p_type == PT_LOAD) {
+@@ -477,8 +497,8 @@ static unsigned long load_elf_interp(str
+ k = load_addr + eppnt->p_vaddr;
+ if (BAD_ADDR(k) ||
+ eppnt->p_filesz > eppnt->p_memsz ||
+- eppnt->p_memsz > TASK_SIZE ||
+- TASK_SIZE - eppnt->p_memsz < k) {
++ eppnt->p_memsz > pax_task_size ||
++ pax_task_size - eppnt->p_memsz < k) {
+ error = -ENOMEM;
+ goto out_close;
+ }
+@@ -532,6 +552,177 @@ out:
+ return error;
+ }
+
++#if (defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS)) && defined(CONFIG_PAX_SOFTMODE)
++static unsigned long pax_parse_softmode(const struct elf_phdr * const elf_phdata)
++{
++ unsigned long pax_flags = 0UL;
++
++#ifdef CONFIG_PAX_PAGEEXEC
++ if (elf_phdata->p_flags & PF_PAGEEXEC)
++ pax_flags |= MF_PAX_PAGEEXEC;
++#endif
++
++#ifdef CONFIG_PAX_SEGMEXEC
++ if (elf_phdata->p_flags & PF_SEGMEXEC)
++ pax_flags |= MF_PAX_SEGMEXEC;
++#endif
++
++#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_PAX_SEGMEXEC)
++ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) {
++ if ((__supported_pte_mask & _PAGE_NX))
++ pax_flags &= ~MF_PAX_SEGMEXEC;
++ else
++ pax_flags &= ~MF_PAX_PAGEEXEC;
++ }
++#endif
++
++#ifdef CONFIG_PAX_EMUTRAMP
++ if (elf_phdata->p_flags & PF_EMUTRAMP)
++ pax_flags |= MF_PAX_EMUTRAMP;
++#endif
++
++#ifdef CONFIG_PAX_MPROTECT
++ if (elf_phdata->p_flags & PF_MPROTECT)
++ pax_flags |= MF_PAX_MPROTECT;
++#endif
++
++#if defined(CONFIG_PAX_RANDMMAP) || defined(CONFIG_PAX_RANDUSTACK)
++ if (randomize_va_space && (elf_phdata->p_flags & PF_RANDMMAP))
++ pax_flags |= MF_PAX_RANDMMAP;
++#endif
++
++ return pax_flags;
++}
++#endif
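pax_parse_softmode() above and pax_parse_hardmode() below are mirror images: in soft mode a PaX feature is enabled only if the binary's PT_PAX_FLAGS program header opts in (PF_*), while in hard mode it is enabled unless the header explicitly opts out (PF_NO*). A compact userspace model of that precedence, with flag names invented for the sketch:

    #include <stdio.h>

    #define ASK_PAGEEXEC  (1u << 0)  /* stands in for PF_PAGEEXEC */
    #define VETO_PAGEEXEC (1u << 1)  /* stands in for PF_NOPAGEEXEC */

    static unsigned int parse(unsigned int phdr_flags, int softmode)
    {
            unsigned int on = 0;

            if (softmode ? (phdr_flags & ASK_PAGEEXEC) != 0
                         : (phdr_flags & VETO_PAGEEXEC) == 0)
                    on |= ASK_PAGEEXEC; /* feature granted */
            return on;
    }

    int main(void)
    {
            /* an unmarked binary is protected in hard mode only */
            printf("soft:%u hard:%u\n", parse(0, 1), parse(0, 0));
            return 0;
    }

Hard mode is therefore the secure default, and soft mode is the opt-in mode for cautious rollouts.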
++ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { ++ if ((__supported_pte_mask & _PAGE_NX)) ++ pax_flags &= ~MF_PAX_SEGMEXEC; ++ else ++ pax_flags &= ~MF_PAX_PAGEEXEC; ++ } ++#endif ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ if (!(elf_phdata->p_flags & PF_NOEMUTRAMP)) ++ pax_flags |= MF_PAX_EMUTRAMP; ++#endif ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (!(elf_phdata->p_flags & PF_NOMPROTECT)) ++ pax_flags |= MF_PAX_MPROTECT; ++#endif ++ ++#if defined(CONFIG_PAX_RANDMMAP) || defined(CONFIG_PAX_RANDUSTACK) ++ if (randomize_va_space && !(elf_phdata->p_flags & PF_NORANDMMAP)) ++ pax_flags |= MF_PAX_RANDMMAP; ++#endif ++ ++ return pax_flags; ++} ++#endif ++ ++#ifdef CONFIG_PAX_EI_PAX ++static unsigned long pax_parse_ei_pax(const struct elfhdr * const elf_ex) ++{ ++ unsigned long pax_flags = 0UL; ++ ++#ifdef CONFIG_PAX_PAGEEXEC ++ if (!(elf_ex->e_ident[EI_PAX] & EF_PAX_PAGEEXEC)) ++ pax_flags |= MF_PAX_PAGEEXEC; ++#endif ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (!(elf_ex->e_ident[EI_PAX] & EF_PAX_SEGMEXEC)) ++ pax_flags |= MF_PAX_SEGMEXEC; ++#endif ++ ++#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_PAX_SEGMEXEC) ++ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { ++ if ((__supported_pte_mask & _PAGE_NX)) ++ pax_flags &= ~MF_PAX_SEGMEXEC; ++ else ++ pax_flags &= ~MF_PAX_PAGEEXEC; ++ } ++#endif ++ ++#ifdef CONFIG_PAX_EMUTRAMP ++ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && (elf_ex->e_ident[EI_PAX] & EF_PAX_EMUTRAMP)) ++ pax_flags |= MF_PAX_EMUTRAMP; ++#endif ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && !(elf_ex->e_ident[EI_PAX] & EF_PAX_MPROTECT)) ++ pax_flags |= MF_PAX_MPROTECT; ++#endif ++ ++#ifdef CONFIG_PAX_ASLR ++ if (randomize_va_space && !(elf_ex->e_ident[EI_PAX] & EF_PAX_RANDMMAP)) ++ pax_flags |= MF_PAX_RANDMMAP; ++#endif ++ ++ return pax_flags; ++} ++#endif ++ ++#if defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS) ++static long pax_parse_elf_flags(const struct elfhdr * const elf_ex, const struct elf_phdr * const elf_phdata) ++{ ++ unsigned long pax_flags = 0UL; ++ ++#ifdef CONFIG_PAX_PT_PAX_FLAGS ++ unsigned long i; ++#endif ++ ++#ifdef CONFIG_PAX_EI_PAX ++ pax_flags = pax_parse_ei_pax(elf_ex); ++#endif ++ ++#ifdef CONFIG_PAX_PT_PAX_FLAGS ++ for (i = 0UL; i < elf_ex->e_phnum; i++) ++ if (elf_phdata[i].p_type == PT_PAX_FLAGS) { ++ if (((elf_phdata[i].p_flags & PF_PAGEEXEC) && (elf_phdata[i].p_flags & PF_NOPAGEEXEC)) || ++ ((elf_phdata[i].p_flags & PF_SEGMEXEC) && (elf_phdata[i].p_flags & PF_NOSEGMEXEC)) || ++ ((elf_phdata[i].p_flags & PF_EMUTRAMP) && (elf_phdata[i].p_flags & PF_NOEMUTRAMP)) || ++ ((elf_phdata[i].p_flags & PF_MPROTECT) && (elf_phdata[i].p_flags & PF_NOMPROTECT)) || ++ ((elf_phdata[i].p_flags & PF_RANDMMAP) && (elf_phdata[i].p_flags & PF_NORANDMMAP))) ++ return -EINVAL; ++ ++#ifdef CONFIG_PAX_SOFTMODE ++ if (pax_softmode) ++ pax_flags = pax_parse_softmode(&elf_phdata[i]); ++ else ++#endif ++ ++ pax_flags = pax_parse_hardmode(&elf_phdata[i]); ++ break; ++ } ++#endif ++ ++ if (0 > pax_check_flags(&pax_flags)) ++ return -EINVAL; ++ ++ current->mm->pax_flags = pax_flags; ++ return 0; ++} ++#endif ++ + /* + * These are the functions used to load ELF style executables and shared + * libraries. There is no binary dependent code anywhere else. 
+@@ -548,6 +739,11 @@ static unsigned long randomize_stack_top + { + unsigned int random_variable = 0; + ++#ifdef CONFIG_PAX_RANDUSTACK ++ if (randomize_va_space) ++ return stack_top - current->mm->delta_stack; ++#endif ++ + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { + random_variable = get_random_int() & STACK_RND_MASK; +@@ -566,7 +762,7 @@ static int load_elf_binary(struct linux_ + unsigned long load_addr = 0, load_bias = 0; + int load_addr_set = 0; + char * elf_interpreter = NULL; +- unsigned long error; ++ unsigned long error = 0; + struct elf_phdr *elf_ppnt, *elf_phdata; + unsigned long elf_bss, elf_brk; + int retval, i; +@@ -576,11 +772,11 @@ static int load_elf_binary(struct linux_ + unsigned long start_code, end_code, start_data, end_data; + unsigned long reloc_func_desc = 0; + int executable_stack = EXSTACK_DEFAULT; +- unsigned long def_flags = 0; + struct { + struct elfhdr elf_ex; + struct elfhdr interp_elf_ex; + } *loc; ++ unsigned long pax_task_size = TASK_SIZE; + + loc = kmalloc(sizeof(*loc), GFP_KERNEL); + if (!loc) { +@@ -718,11 +914,80 @@ static int load_elf_binary(struct linux_ + + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; +- current->mm->def_flags = def_flags; ++ ++#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) ++ current->mm->pax_flags = 0UL; ++#endif ++ ++#ifdef CONFIG_PAX_DLRESOLVE ++ current->mm->call_dl_resolve = 0UL; ++#endif ++ ++#if defined(CONFIG_PPC32) && defined(CONFIG_PAX_EMUSIGRT) ++ current->mm->call_syscall = 0UL; ++#endif ++ ++#ifdef CONFIG_PAX_ASLR ++ current->mm->delta_mmap = 0UL; ++ current->mm->delta_stack = 0UL; ++#endif ++ ++ current->mm->def_flags = 0; ++ ++#if defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS) ++ if (0 > pax_parse_elf_flags(&loc->elf_ex, elf_phdata)) { ++ send_sig(SIGKILL, current, 0); ++ goto out_free_dentry; ++ } ++#endif ++ ++#ifdef CONFIG_PAX_HAVE_ACL_FLAGS ++ pax_set_initial_flags(bprm); ++#elif defined(CONFIG_PAX_HOOK_ACL_FLAGS) ++ if (pax_set_initial_flags_func) ++ (pax_set_initial_flags_func)(bprm); ++#endif ++ ++#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT ++ if ((current->mm->pax_flags & MF_PAX_PAGEEXEC) && !(__supported_pte_mask & _PAGE_NX)) { ++ current->mm->context.user_cs_limit = PAGE_SIZE; ++ current->mm->def_flags |= VM_PAGEEXEC; ++ } ++#endif ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { ++ current->mm->context.user_cs_base = SEGMEXEC_TASK_SIZE; ++ current->mm->context.user_cs_limit = TASK_SIZE-SEGMEXEC_TASK_SIZE; ++ pax_task_size = SEGMEXEC_TASK_SIZE; ++ } ++#endif ++ ++#if defined(CONFIG_ARCH_TRACK_EXEC_LIMIT) || defined(CONFIG_PAX_SEGMEXEC) ++ if (current->mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { ++ set_user_cs(current->mm->context.user_cs_base, current->mm->context.user_cs_limit, get_cpu()); ++ put_cpu(); ++ } ++#endif + + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. 
*/ + SET_PERSONALITY(loc->elf_ex); ++ ++#ifdef CONFIG_PAX_ASLR ++ if (current->mm->pax_flags & MF_PAX_RANDMMAP) { ++ current->mm->delta_mmap = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN)-1)) << PAGE_SHIFT; ++ current->mm->delta_stack = (pax_get_random_long() & ((1UL << PAX_DELTA_STACK_LEN)-1)) << PAGE_SHIFT; ++ } ++#endif ++ ++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) ++ if (current->mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { ++ executable_stack = EXSTACK_DISABLE_X; ++ current->personality &= ~READ_IMPLIES_EXEC; ++ } else ++#endif ++ + if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + +@@ -804,6 +1069,20 @@ static int load_elf_binary(struct linux_ + #else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); + #endif ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ /* PaX: randomize base address at the default exe base if requested */ ++ if ((current->mm->pax_flags & MF_PAX_RANDMMAP) && elf_interpreter) { ++#ifdef CONFIG_SPARC64 ++ load_bias = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN) - 1)) << (PAGE_SHIFT+1); ++#else ++ load_bias = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN) - 1)) << PAGE_SHIFT; ++#endif ++ load_bias = ELF_PAGESTART(PAX_ELF_ET_DYN_BASE - vaddr + load_bias); ++ elf_flags |= MAP_FIXED; ++ } ++#endif ++ + } + + error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, +@@ -836,9 +1115,9 @@ static int load_elf_binary(struct linux_ + * allowed task size. Note that p_filesz must always be + * <= p_memsz so it is only necessary to check p_memsz. + */ +- if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || +- elf_ppnt->p_memsz > TASK_SIZE || +- TASK_SIZE - elf_ppnt->p_memsz < k) { ++ if (k >= pax_task_size || elf_ppnt->p_filesz > elf_ppnt->p_memsz || ++ elf_ppnt->p_memsz > pax_task_size || ++ pax_task_size - elf_ppnt->p_memsz < k) { + /* set_brk can never work. Avoid overflows. */ + send_sig(SIGKILL, current, 0); + retval = -EINVAL; +@@ -866,6 +1145,11 @@ static int load_elf_binary(struct linux_ + start_data += load_bias; + end_data += load_bias; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (current->mm->pax_flags & MF_PAX_RANDMMAP) ++ elf_brk += PAGE_SIZE + ((pax_get_random_long() & ~PAGE_MASK) << 4); ++#endif ++ + /* Calling set_brk effectively mmaps the pages that we need + * for the bss and break sections. We must do this before + * mapping in the interpreter, to make sure it doesn't wind +@@ -877,9 +1161,11 @@ static int load_elf_binary(struct linux_ + goto out_free_dentry; + } + if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { +- send_sig(SIGSEGV, current, 0); +- retval = -EFAULT; /* Nobody gets to see this, but.. */ +- goto out_free_dentry; ++ /* ++ * This bss-zeroing can fail if the ELF ++ * file specifies odd protections. So ++ * we don't check the return value ++ */ + } + + if (elf_interpreter) { +@@ -1107,8 +1393,10 @@ static int dump_seek(struct file *file, + unsigned long n = off; + if (n > PAGE_SIZE) + n = PAGE_SIZE; +- if (!dump_write(file, buf, n)) ++ if (!dump_write(file, buf, n)) { ++ free_page((unsigned long)buf); + return 0; ++ } + off -= n; + } + free_page((unsigned long)buf); +@@ -1120,7 +1408,7 @@ static int dump_seek(struct file *file, + * Decide what to dump of a segment, part, all or none. 
+ */ + static unsigned long vma_dump_size(struct vm_area_struct *vma, +- unsigned long mm_flags) ++ unsigned long mm_flags, long signr) + { + #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + +@@ -1154,7 +1442,7 @@ static unsigned long vma_dump_size(struc + if (vma->vm_file == NULL) + return 0; + +- if (FILTER(MAPPED_PRIVATE)) ++ if (signr == SIGKILL || FILTER(MAPPED_PRIVATE)) + goto whole; + + /* +@@ -1250,9 +1538,12 @@ static int writenote(struct memelfnote * + #undef DUMP_WRITE + + #define DUMP_WRITE(addr, nr) \ ++ do { \ ++ gr_learn_resource(current, RLIMIT_CORE, size + (nr), 1); \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ +- goto end_coredump; ++ goto end_coredump; \ ++ } while (0); + + static void fill_elf_header(struct elfhdr *elf, int segs, + u16 machine, u32 flags, u8 osabi) +@@ -1381,9 +1672,9 @@ static void fill_auxv_note(struct memelf + { + elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv; + int i = 0; +- do ++ do { + i += 2; +- while (auxv[i - 2] != AT_NULL); ++ } while (auxv[i - 2] != AT_NULL); + fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + } + +@@ -1969,7 +2260,7 @@ static int elf_core_dump(struct coredump + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; +- phdr.p_filesz = vma_dump_size(vma, mm_flags); ++ phdr.p_filesz = vma_dump_size(vma, mm_flags, cprm->signr); + phdr.p_memsz = vma->vm_end - vma->vm_start; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; +@@ -2002,7 +2293,7 @@ static int elf_core_dump(struct coredump + unsigned long addr; + unsigned long end; + +- end = vma->vm_start + vma_dump_size(vma, mm_flags); ++ end = vma->vm_start + vma_dump_size(vma, mm_flags, cprm->signr); + + for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { + struct page *page; +@@ -2011,6 +2302,7 @@ static int elf_core_dump(struct coredump + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); ++ gr_learn_resource(current, RLIMIT_CORE, size + PAGE_SIZE, 1); + stop = ((size += PAGE_SIZE) > cprm->limit) || + !dump_write(cprm->file, kaddr, + PAGE_SIZE); +@@ -2039,6 +2331,97 @@ out: + + #endif /* CONFIG_ELF_CORE */ + ++#ifdef CONFIG_PAX_MPROTECT ++/* PaX: non-PIC ELF libraries need relocations on their executable segments ++ * therefore we'll grant them VM_MAYWRITE once during their life. Similarly ++ * we'll remove VM_MAYWRITE for good on RELRO segments. ++ * ++ * The checks favour ld-linux.so behaviour which operates on a per ELF segment ++ * basis because we want to allow the common case and not the special ones. 
++ */ ++static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags) ++{ ++ struct elfhdr elf_h; ++ struct elf_phdr elf_p; ++ unsigned long i; ++ unsigned long oldflags; ++ bool is_textrel_rw, is_textrel_rx, is_relro; ++ ++ if (!(vma->vm_mm->pax_flags & MF_PAX_MPROTECT)) ++ return; ++ ++ oldflags = vma->vm_flags & (VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ); ++ newflags &= VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ; ++ ++#ifdef CONFIG_PAX_NOELFRELOCS ++ is_textrel_rw = false; ++ is_textrel_rx = false; ++#else ++ /* possible TEXTREL */ ++ is_textrel_rw = vma->vm_file && !vma->anon_vma && oldflags == (VM_MAYEXEC | VM_MAYREAD | VM_EXEC | VM_READ) && newflags == (VM_WRITE | VM_READ); ++ is_textrel_rx = vma->vm_file && vma->anon_vma && oldflags == (VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_WRITE | VM_READ) && newflags == (VM_EXEC | VM_READ); ++#endif ++ ++ /* possible RELRO */ ++ is_relro = vma->vm_file && vma->anon_vma && oldflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ) && newflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ); ++ ++ if (!is_textrel_rw && !is_textrel_rx && !is_relro) ++ return; ++ ++ if (sizeof(elf_h) != kernel_read(vma->vm_file, 0UL, (char *)&elf_h, sizeof(elf_h)) || ++ memcmp(elf_h.e_ident, ELFMAG, SELFMAG) || ++ ++#ifdef CONFIG_PAX_ETEXECRELOCS ++ ((is_textrel_rw || is_textrel_rx) && (elf_h.e_type != ET_DYN && elf_h.e_type != ET_EXEC)) || ++#else ++ ((is_textrel_rw || is_textrel_rx) && elf_h.e_type != ET_DYN) || ++#endif ++ ++ (is_relro && (elf_h.e_type != ET_DYN && elf_h.e_type != ET_EXEC)) || ++ !elf_check_arch(&elf_h) || ++ elf_h.e_phentsize != sizeof(struct elf_phdr) || ++ elf_h.e_phnum > 65536UL / sizeof(struct elf_phdr)) ++ return; ++ ++ for (i = 0UL; i < elf_h.e_phnum; i++) { ++ if (sizeof(elf_p) != kernel_read(vma->vm_file, elf_h.e_phoff + i*sizeof(elf_p), (char *)&elf_p, sizeof(elf_p))) ++ return; ++ switch (elf_p.p_type) { ++ case PT_DYNAMIC: ++ if (!is_textrel_rw && !is_textrel_rx) ++ continue; ++ i = 0UL; ++ while ((i+1) * sizeof(elf_dyn) <= elf_p.p_filesz) { ++ elf_dyn dyn; ++ ++ if (sizeof(dyn) != kernel_read(vma->vm_file, elf_p.p_offset + i*sizeof(dyn), (char *)&dyn, sizeof(dyn))) ++ return; ++ if (dyn.d_tag == DT_NULL) ++ return; ++ if (dyn.d_tag == DT_TEXTREL || (dyn.d_tag == DT_FLAGS && (dyn.d_un.d_val & DF_TEXTREL))) { ++ gr_log_textrel(vma); ++ if (is_textrel_rw) ++ vma->vm_flags |= VM_MAYWRITE; ++ else ++ /* PaX: disallow write access after relocs are done, hopefully noone else needs it... 
*/
++ vma->vm_flags &= ~VM_MAYWRITE;
++ return;
++ }
++ i++;
++ }
++ return;
++
++ case PT_GNU_RELRO:
++ if (!is_relro)
++ continue;
++ if ((elf_p.p_offset >> PAGE_SHIFT) == vma->vm_pgoff && ELF_PAGEALIGN(elf_p.p_memsz) == vma->vm_end - vma->vm_start)
++ vma->vm_flags &= ~VM_MAYWRITE;
++ return;
++ }
++ }
++}
++#endif
++
+ static int __init init_elf_binfmt(void)
+ {
+ return register_binfmt(&elf_format);
+diff -urNp linux-2.6.33.1/fs/binfmt_flat.c linux-2.6.33.1/fs/binfmt_flat.c
+--- linux-2.6.33.1/fs/binfmt_flat.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/binfmt_flat.c 2010-03-20 16:58:41.369218937 -0400
+@@ -564,7 +564,9 @@ static int load_flat_file(struct linux_b
+ realdatastart = (unsigned long) -ENOMEM;
+ printk("Unable to allocate RAM for process data, errno %d\n",
+ (int)-realdatastart);
++ down_write(&current->mm->mmap_sem);
+ do_munmap(current->mm, textpos, text_len);
++ up_write(&current->mm->mmap_sem);
+ ret = realdatastart;
+ goto err;
+ }
+@@ -588,8 +590,10 @@ static int load_flat_file(struct linux_b
+ }
+ if (IS_ERR_VALUE(result)) {
+ printk("Unable to read data+bss, errno %d\n", (int)-result);
++ down_write(&current->mm->mmap_sem);
+ do_munmap(current->mm, textpos, text_len);
+ do_munmap(current->mm, realdatastart, data_len + extra);
++ up_write(&current->mm->mmap_sem);
+ ret = result;
+ goto err;
+ }
+@@ -658,8 +662,10 @@ static int load_flat_file(struct linux_b
+ }
+ if (IS_ERR_VALUE(result)) {
+ printk("Unable to read code+data+bss, errno %d\n",(int)-result);
++ down_write(&current->mm->mmap_sem);
+ do_munmap(current->mm, textpos, text_len + data_len + extra +
+ MAX_SHARED_LIBS * sizeof(unsigned long));
++ up_write(&current->mm->mmap_sem);
+ ret = result;
+ goto err;
+ }
+diff -urNp linux-2.6.33.1/fs/binfmt_misc.c linux-2.6.33.1/fs/binfmt_misc.c
+--- linux-2.6.33.1/fs/binfmt_misc.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/binfmt_misc.c 2010-03-20 16:58:41.384059578 -0400
+@@ -693,7 +693,7 @@ static int bm_fill_super(struct super_bl
+ static struct tree_descr bm_files[] = {
+ [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
+ [3] = {"register", &bm_register_operations, S_IWUSR},
+- /* last one */ {""}
++ /* last one */ {"", NULL, 0}
+ };
+ int err = simple_fill_super(sb, 0x42494e4d, bm_files);
+ if (!err)
+diff -urNp linux-2.6.33.1/fs/bio.c linux-2.6.33.1/fs/bio.c
+--- linux-2.6.33.1/fs/bio.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/bio.c 2010-03-20 16:58:41.384547944 -0400
+@@ -1217,7 +1217,7 @@ static void bio_copy_kern_endio(struct b
+ const int read = bio_data_dir(bio) == READ;
+ struct bio_map_data *bmd = bio->bi_private;
+ int i;
+- char *p = bmd->sgvecs[0].iov_base;
++ char *p = (__force char *)bmd->sgvecs[0].iov_base;
+
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ char *addr = page_address(bvec->bv_page);
+diff -urNp linux-2.6.33.1/fs/btrfs/ctree.c linux-2.6.33.1/fs/btrfs/ctree.c
+--- linux-2.6.33.1/fs/btrfs/ctree.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/btrfs/ctree.c 2010-03-20 16:58:41.388867884 -0400
+@@ -3645,7 +3645,6 @@ setup_items_for_insert(struct btrfs_tran
+
+ ret = 0;
+ if (slot == 0) {
+- struct btrfs_disk_key disk_key;
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ }
+diff -urNp linux-2.6.33.1/fs/btrfs/disk-io.c linux-2.6.33.1/fs/btrfs/disk-io.c
+--- linux-2.6.33.1/fs/btrfs/disk-io.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/btrfs/disk-io.c 2010-03-20 16:58:41.388867884 -0400
+@@ -39,7 +39,7 @@
+ #include "tree-log.h"
+ #include
"free-space-cache.h" + +-static struct extent_io_ops btree_extent_io_ops; ++static const struct extent_io_ops btree_extent_io_ops; + static void end_workqueue_fn(struct btrfs_work *work); + static void free_fs_root(struct btrfs_root *root); + +@@ -2605,7 +2605,7 @@ out: + return 0; + } + +-static struct extent_io_ops btree_extent_io_ops = { ++static const struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, +diff -urNp linux-2.6.33.1/fs/btrfs/extent_io.h linux-2.6.33.1/fs/btrfs/extent_io.h +--- linux-2.6.33.1/fs/btrfs/extent_io.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/btrfs/extent_io.h 2010-03-20 16:58:41.392898040 -0400 +@@ -49,36 +49,36 @@ typedef int (extent_submit_bio_hook_t)(s + struct bio *bio, int mirror_num, + unsigned long bio_flags); + struct extent_io_ops { +- int (*fill_delalloc)(struct inode *inode, struct page *locked_page, ++ int (* const fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); +- int (*writepage_start_hook)(struct page *page, u64 start, u64 end); +- int (*writepage_io_hook)(struct page *page, u64 start, u64 end); ++ int (* const writepage_start_hook)(struct page *page, u64 start, u64 end); ++ int (* const writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; +- int (*merge_bio_hook)(struct page *page, unsigned long offset, ++ int (* const merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); +- int (*readpage_io_hook)(struct page *page, u64 start, u64 end); +- int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, ++ int (* const readpage_io_hook)(struct page *page, u64 start, u64 end); ++ int (* const readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); +- int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, ++ int (* const writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); +- int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, ++ int (* const readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); +- int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, ++ int (* const writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); +- int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, ++ int (* const set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); +- int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, ++ int (* const clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long bits); +- int (*merge_extent_hook)(struct inode *inode, ++ int (* const merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); +- int (*split_extent_hook)(struct inode *inode, ++ int (* const split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); +- int (*write_cache_pages_lock_hook)(struct page *page); ++ int (* const write_cache_pages_lock_hook)(struct page *page); + }; + + struct extent_io_tree { +@@ -88,7 +88,7 @@ struct extent_io_tree { + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; +- struct 
extent_io_ops *ops; ++ const struct extent_io_ops *ops; + }; + + struct extent_state { +diff -urNp linux-2.6.33.1/fs/btrfs/free-space-cache.c linux-2.6.33.1/fs/btrfs/free-space-cache.c +--- linux-2.6.33.1/fs/btrfs/free-space-cache.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/btrfs/free-space-cache.c 2010-03-20 16:58:41.392898040 -0400 +@@ -1074,8 +1074,6 @@ u64 btrfs_alloc_from_cluster(struct btrf + + while(1) { + if (entry->bytes < bytes || entry->offset < min_start) { +- struct rb_node *node; +- + node = rb_next(&entry->offset_index); + if (!node) + break; +@@ -1226,7 +1224,7 @@ again: + */ + while (entry->bitmap || found_bitmap || + (!entry->bitmap && entry->bytes < min_bytes)) { +- struct rb_node *node = rb_next(&entry->offset_index); ++ node = rb_next(&entry->offset_index); + + if (entry->bitmap && entry->bytes > bytes + empty_size) { + ret = btrfs_bitmap_cluster(block_group, entry, cluster, +diff -urNp linux-2.6.33.1/fs/btrfs/inode.c linux-2.6.33.1/fs/btrfs/inode.c +--- linux-2.6.33.1/fs/btrfs/inode.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/btrfs/inode.c 2010-03-20 16:58:41.396902904 -0400 +@@ -63,7 +63,7 @@ static const struct inode_operations btr + static const struct address_space_operations btrfs_aops; + static const struct address_space_operations btrfs_symlink_aops; + static const struct file_operations btrfs_dir_file_operations; +-static struct extent_io_ops btrfs_extent_io_ops; ++static const struct extent_io_ops btrfs_extent_io_ops; + + static struct kmem_cache *btrfs_inode_cachep; + struct kmem_cache *btrfs_trans_handle_cachep; +@@ -5973,7 +5973,7 @@ static const struct file_operations btrf + .fsync = btrfs_sync_file, + }; + +-static struct extent_io_ops btrfs_extent_io_ops = { ++static const struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, +diff -urNp linux-2.6.33.1/fs/btrfs/sysfs.c linux-2.6.33.1/fs/btrfs/sysfs.c +--- linux-2.6.33.1/fs/btrfs/sysfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/btrfs/sysfs.c 2010-03-20 16:58:41.404901120 -0400 +@@ -164,12 +164,12 @@ static void btrfs_root_release(struct ko + complete(&root->kobj_unregister); + } + +-static struct sysfs_ops btrfs_super_attr_ops = { ++static const struct sysfs_ops btrfs_super_attr_ops = { + .show = btrfs_super_attr_show, + .store = btrfs_super_attr_store, + }; + +-static struct sysfs_ops btrfs_root_attr_ops = { ++static const struct sysfs_ops btrfs_root_attr_ops = { + .show = btrfs_root_attr_show, + .store = btrfs_root_attr_store, + }; +diff -urNp linux-2.6.33.1/fs/buffer.c linux-2.6.33.1/fs/buffer.c +--- linux-2.6.33.1/fs/buffer.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/buffer.c 2010-03-20 16:58:41.408897047 -0400 +@@ -25,6 +25,7 @@ + #include <linux/percpu.h> + #include <linux/slab.h> + #include <linux/capability.h> ++#include <linux/security.h> + #include <linux/blkdev.h> + #include <linux/file.h> + #include <linux/quotaops.h> +diff -urNp linux-2.6.33.1/fs/cachefiles/daemon.c linux-2.6.33.1/fs/cachefiles/daemon.c +--- linux-2.6.33.1/fs/cachefiles/daemon.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/cachefiles/daemon.c 2010-03-20 16:58:41.408897047 -0400 +@@ -195,7 +195,7 @@ static ssize_t cachefiles_daemon_read(st + if (n > buflen) + return -EMSGSIZE; + +- if (copy_to_user(_buffer, buffer, n) != 0) ++ if (n > sizeof(buffer) || copy_to_user(_buffer, buffer, n) != 0) + return -EFAULT; + + return n; +diff 
-urNp linux-2.6.33.1/fs/cachefiles/rdwr.c linux-2.6.33.1/fs/cachefiles/rdwr.c +--- linux-2.6.33.1/fs/cachefiles/rdwr.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/cachefiles/rdwr.c 2010-03-20 16:58:41.412812404 -0400 +@@ -944,7 +944,7 @@ int cachefiles_write_page(struct fscache + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = file->f_op->write( +- file, (const void __user *) data, len, &pos); ++ file, (__force const void __user *) data, len, &pos); + set_fs(old_fs); + kunmap(page); + if (ret != len) +diff -urNp linux-2.6.33.1/fs/cifs/cifs_uniupr.h linux-2.6.33.1/fs/cifs/cifs_uniupr.h +--- linux-2.6.33.1/fs/cifs/cifs_uniupr.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/cifs/cifs_uniupr.h 2010-03-20 16:58:41.416893406 -0400 +@@ -132,7 +132,7 @@ const struct UniCaseRange CifsUniUpperRa + {0x0490, 0x04cc, UniCaseRangeU0490}, + {0x1e00, 0x1ffc, UniCaseRangeU1e00}, + {0xff40, 0xff5a, UniCaseRangeUff40}, +- {0} ++ {0, 0, NULL} + }; + #endif + +diff -urNp linux-2.6.33.1/fs/cifs/link.c linux-2.6.33.1/fs/cifs/link.c +--- linux-2.6.33.1/fs/cifs/link.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/cifs/link.c 2010-03-20 16:58:41.416893406 -0400 +@@ -215,7 +215,7 @@ cifs_symlink(struct inode *inode, struct + + void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) + { +- char *p = nd_get_link(nd); ++ const char *p = nd_get_link(nd); + if (!IS_ERR(p)) + kfree(p); + } +diff -urNp linux-2.6.33.1/fs/compat_binfmt_elf.c linux-2.6.33.1/fs/compat_binfmt_elf.c +--- linux-2.6.33.1/fs/compat_binfmt_elf.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/compat_binfmt_elf.c 2010-03-20 16:58:41.424927933 -0400 +@@ -29,10 +29,12 @@ + #undef elfhdr + #undef elf_phdr + #undef elf_note ++#undef elf_dyn + #undef elf_addr_t + #define elfhdr elf32_hdr + #define elf_phdr elf32_phdr + #define elf_note elf32_note ++#define elf_dyn Elf32_Dyn + #define elf_addr_t Elf32_Addr + + /* +diff -urNp linux-2.6.33.1/fs/compat.c linux-2.6.33.1/fs/compat.c +--- linux-2.6.33.1/fs/compat.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/compat.c 2010-03-20 16:58:41.424927933 -0400 +@@ -1408,14 +1408,12 @@ static int compat_copy_strings(int argc, + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + +-#ifdef CONFIG_STACK_GROWSUP + ret = expand_stack_downwards(bprm->vma, pos); + if (ret < 0) { + /* We've exceed the stack rlimit. 
*/
+ ret = -E2BIG;
+ goto out;
+ }
+-#endif
+ ret = get_user_pages(current, bprm->mm, pos,
+ 1, 1, 1, &page, NULL);
+ if (ret <= 0) {
+@@ -1461,6 +1459,11 @@ int compat_do_execve(char * filename,
+ compat_uptr_t __user *envp,
+ struct pt_regs * regs)
+ {
++#ifdef CONFIG_GRKERNSEC
++ struct file *old_exec_file;
++ struct acl_subject_label *old_acl;
++ struct rlimit old_rlim[RLIM_NLIMITS];
++#endif
+ struct linux_binprm *bprm;
+ struct file *file;
+ struct files_struct *displaced;
+@@ -1497,6 +1500,14 @@ int compat_do_execve(char * filename,
+ bprm->filename = filename;
+ bprm->interp = filename;
+
++ gr_learn_resource(current, RLIMIT_NPROC, atomic_read(&current->cred->user->processes), 1);
++ retval = -EAGAIN;
++ if (gr_handle_nproc())
++ goto out_file;
++ retval = -EACCES;
++ if (!gr_acl_handle_execve(file->f_dentry, file->f_vfsmnt))
++ goto out_file;
++
+ retval = bprm_mm_init(bprm);
+ if (retval)
+ goto out_file;
+@@ -1526,9 +1537,40 @@ int compat_do_execve(char * filename,
+ if (retval < 0)
+ goto out;
+
++ if (!gr_tpe_allow(file)) {
++ retval = -EACCES;
++ goto out;
++ }
++
++ if (gr_check_crash_exec(file)) {
++ retval = -EACCES;
++ goto out;
++ }
++
++ gr_log_chroot_exec(file->f_dentry, file->f_vfsmnt);
++
++ gr_handle_exec_args(bprm, (char __user * __user *)argv);
++
++#ifdef CONFIG_GRKERNSEC
++ old_acl = current->acl;
++ memcpy(old_rlim, current->signal->rlim, sizeof(old_rlim));
++ old_exec_file = current->exec_file;
++ get_file(file);
++ current->exec_file = file;
++#endif
++
++ retval = gr_set_proc_label(file->f_dentry, file->f_vfsmnt,
++ bprm->unsafe & LSM_UNSAFE_SHARE);
++ if (retval < 0)
++ goto out_fail;
++
+ retval = search_binary_handler(bprm, regs);
+ if (retval < 0)
+- goto out;
++ goto out_fail;
++#ifdef CONFIG_GRKERNSEC
++ if (old_exec_file)
++ fput(old_exec_file);
++#endif
+
+ current->stack_start = current->mm->start_stack;
+
+@@ -1541,6 +1583,14 @@ int compat_do_execve(char * filename,
+ put_files_struct(displaced);
+ return retval;
+
++out_fail:
++#ifdef CONFIG_GRKERNSEC
++ current->acl = old_acl;
++ memcpy(current->signal->rlim, old_rlim, sizeof(old_rlim));
++ fput(current->exec_file);
++ current->exec_file = old_exec_file;
++#endif
++
+ out:
+ if (bprm->mm)
+ mmput(bprm->mm);
+diff -urNp linux-2.6.33.1/fs/debugfs/inode.c linux-2.6.33.1/fs/debugfs/inode.c
+--- linux-2.6.33.1/fs/debugfs/inode.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/debugfs/inode.c 2010-03-20 16:58:41.432715858 -0400
+@@ -128,7 +128,7 @@ static inline int debugfs_positive(struc
+
+ static int debug_fill_super(struct super_block *sb, void *data, int silent)
+ {
+- static struct tree_descr debug_files[] = {{""}};
++ static struct tree_descr debug_files[] = {{"", NULL, 0}};
+
+ return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
+ }
+diff -urNp linux-2.6.33.1/fs/dlm/lockspace.c linux-2.6.33.1/fs/dlm/lockspace.c
+--- linux-2.6.33.1/fs/dlm/lockspace.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/dlm/lockspace.c 2010-03-20 16:58:41.444895285 -0400
+@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struc
+ kfree(ls);
+ }
+
+-static struct sysfs_ops dlm_attr_ops = {
++static const struct sysfs_ops dlm_attr_ops = {
+ .show = dlm_attr_show,
+ .store = dlm_attr_store,
+ };
+diff -urNp linux-2.6.33.1/fs/ecryptfs/inode.c linux-2.6.33.1/fs/ecryptfs/inode.c
+--- linux-2.6.33.1/fs/ecryptfs/inode.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/ecryptfs/inode.c 2010-03-20 16:58:41.460901500 -0400
+@@ -685,7 +685,7 @@ ecryptfs_readlink(struct dentry *dentry,
+ old_fs
= get_fs(); + set_fs(get_ds()); + rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, +- (char __user *)lower_buf, ++ (__force char __user *)lower_buf, + lower_bufsiz); + set_fs(old_fs); + if (rc >= 0) { +@@ -729,7 +729,7 @@ static void *ecryptfs_follow_link(struct + } + old_fs = get_fs(); + set_fs(get_ds()); +- rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); ++ rc = dentry->d_inode->i_op->readlink(dentry, (__force char __user *)buf, len); + set_fs(old_fs); + if (rc < 0) { + kfree(buf); +@@ -744,7 +744,7 @@ out: + static void + ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) + { +- char *buf = nd_get_link(nd); ++ const char *buf = nd_get_link(nd); + if (!IS_ERR(buf)) { + /* Free the char* */ + kfree(buf); +diff -urNp linux-2.6.33.1/fs/ecryptfs/miscdev.c linux-2.6.33.1/fs/ecryptfs/miscdev.c +--- linux-2.6.33.1/fs/ecryptfs/miscdev.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ecryptfs/miscdev.c 2010-03-20 16:58:41.460901500 -0400 +@@ -327,7 +327,7 @@ check_list: + goto out_unlock_msg_ctx; + i = 5; + if (msg_ctx->msg) { +- if (copy_to_user(&buf[i], packet_length, packet_length_size)) ++ if (packet_length_size > sizeof(packet_length) || copy_to_user(&buf[i], packet_length, packet_length_size)) + goto out_unlock_msg_ctx; + i += packet_length_size; + if (copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size)) +diff -urNp linux-2.6.33.1/fs/exec.c linux-2.6.33.1/fs/exec.c +--- linux-2.6.33.1/fs/exec.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/exec.c 2010-03-20 16:58:41.472962075 -0400 +@@ -55,12 +55,24 @@ + #include <linux/fsnotify.h> + #include <linux/fs_struct.h> + #include <linux/pipe_fs_i.h> ++#include <linux/random.h> ++#include <linux/seq_file.h> ++ ++#ifdef CONFIG_PAX_REFCOUNT ++#include <linux/kallsyms.h> ++#include <linux/kdebug.h> ++#endif + + #include <asm/uaccess.h> + #include <asm/mmu_context.h> + #include <asm/tlb.h> + #include "internal.h" + ++#ifdef CONFIG_PAX_HOOK_ACL_FLAGS ++void (*pax_set_initial_flags_func)(struct linux_binprm *bprm); ++EXPORT_SYMBOL(pax_set_initial_flags_func); ++#endif ++ + int core_uses_pid; + char core_pattern[CORENAME_MAX_SIZE] = "core"; + unsigned int core_pipe_limit; +@@ -114,7 +126,7 @@ SYSCALL_DEFINE1(uselib, const char __use + goto out; + + file = do_filp_open(AT_FDCWD, tmp, +- O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, ++ O_LARGEFILE | O_RDONLY | FMODE_EXEC | FMODE_GREXEC, 0, + MAY_READ | MAY_EXEC | MAY_OPEN); + putname(tmp); + error = PTR_ERR(file); +@@ -162,18 +174,10 @@ static struct page *get_arg_page(struct + int write) + { + struct page *page; +- int ret; + +-#ifdef CONFIG_STACK_GROWSUP +- if (write) { +- ret = expand_stack_downwards(bprm->vma, pos); +- if (ret < 0) +- return NULL; +- } +-#endif +- ret = get_user_pages(current, bprm->mm, pos, +- 1, write, 1, &page, NULL); +- if (ret <= 0) ++ if (0 > expand_stack_downwards(bprm->vma, pos)) ++ return NULL; ++ if (0 >= get_user_pages(current, bprm->mm, pos, 1, write, 1, &page, NULL)) + return NULL; + + if (write) { +@@ -245,6 +249,11 @@ static int __bprm_mm_init(struct linux_b + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vma->vm_flags = VM_STACK_FLAGS; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ vma->vm_flags &= ~(VM_EXEC | VM_MAYEXEC); ++#endif ++ + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + err = insert_vm_struct(mm, vma); + if (err) +@@ -253,6 +262,12 @@ static int __bprm_mm_init(struct linux_b + mm->stack_vm = mm->total_vm = 1; + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end 
- sizeof(void *); ++ ++#ifdef CONFIG_PAX_RANDUSTACK ++ if (randomize_va_space) ++ bprm->p ^= (pax_get_random_long() & ~15) & ~PAGE_MASK; ++#endif ++ + return 0; + err: + up_write(&mm->mmap_sem); +@@ -474,7 +489,7 @@ int copy_strings_kernel(int argc,char ** + int r; + mm_segment_t oldfs = get_fs(); + set_fs(KERNEL_DS); +- r = copy_strings(argc, (char __user * __user *)argv, bprm); ++ r = copy_strings(argc, (__force char __user * __user *)argv, bprm); + set_fs(oldfs); + return r; + } +@@ -504,7 +519,8 @@ static int shift_arg_pages(struct vm_are + unsigned long new_end = old_end - shift; + struct mmu_gather *tlb; + +- BUG_ON(new_start > new_end); ++ if (new_start >= new_end || new_start < mmap_min_addr) ++ return -EFAULT; + + /* + * ensure there are no vmas between where we want to go +@@ -513,6 +529,10 @@ static int shift_arg_pages(struct vm_are + if (vma != find_vma(mm, new_start)) + return -EFAULT; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ BUG_ON(pax_find_mirror_vma(vma)); ++#endif ++ + /* + * cover the whole range: [new_start, old_end) + */ +@@ -604,8 +624,28 @@ int setup_arg_pages(struct linux_binprm + bprm->exec -= stack_shift; + + down_write(&mm->mmap_sem); ++ ++ /* Move stack pages down in memory. */ ++ if (stack_shift) { ++ ret = shift_arg_pages(vma, stack_shift); ++ if (ret) ++ goto out_unlock; ++ } ++ + vm_flags = VM_STACK_FLAGS; + ++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) ++ if (mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { ++ vm_flags &= ~VM_EXEC; ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (mm->pax_flags & MF_PAX_MPROTECT) ++ vm_flags &= ~VM_MAYEXEC; ++#endif ++ ++ } ++#endif ++ + /* + * Adjust stack execute permissions; explicitly enable for + * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone +@@ -623,13 +663,6 @@ int setup_arg_pages(struct linux_binprm + goto out_unlock; + BUG_ON(prev != vma); + +- /* Move stack pages down in memory. 
*/
+- if (stack_shift) {
+- ret = shift_arg_pages(vma, stack_shift);
+- if (ret)
+- goto out_unlock;
+- }
+-
+ stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+ stack_size = vma->vm_end - vma->vm_start;
+ /*
+@@ -666,7 +699,7 @@ struct file *open_exec(const char *name)
+ int err;
+
+ file = do_filp_open(AT_FDCWD, name,
+- O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
++ O_LARGEFILE | O_RDONLY | FMODE_EXEC | FMODE_GREXEC, 0,
+ MAY_EXEC | MAY_OPEN);
+ if (IS_ERR(file))
+ goto out;
+@@ -703,7 +736,7 @@ int kernel_read(struct file *file, loff_
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+- result = vfs_read(file, (void __user *)addr, count, &pos);
++ result = vfs_read(file, (__force void __user *)addr, count, &pos);
+ set_fs(old_fs);
+ return result;
+ }
+@@ -1120,7 +1153,7 @@ int check_unsafe_exec(struct linux_binpr
+ }
+ rcu_read_unlock();
+
+- if (p->fs->users > n_fs) {
++ if (atomic_read(&p->fs->users) > n_fs) {
+ bprm->unsafe |= LSM_UNSAFE_SHARE;
+ } else {
+ res = -EAGAIN;
+@@ -1316,6 +1349,11 @@ int do_execve(char * filename,
+ char __user *__user *envp,
+ struct pt_regs * regs)
+ {
++#ifdef CONFIG_GRKERNSEC
++ struct file *old_exec_file;
++ struct acl_subject_label *old_acl;
++ struct rlimit old_rlim[RLIM_NLIMITS];
++#endif
+ struct linux_binprm *bprm;
+ struct file *file;
+ struct files_struct *displaced;
+@@ -1352,6 +1390,18 @@ int do_execve(char * filename,
+ bprm->filename = filename;
+ bprm->interp = filename;
+
++ gr_learn_resource(current, RLIMIT_NPROC, atomic_read(&current->cred->user->processes), 1);
++
++ if (gr_handle_nproc()) {
++ retval = -EAGAIN;
++ goto out_file;
++ }
++
++ if (!gr_acl_handle_execve(file->f_dentry, file->f_vfsmnt)) {
++ retval = -EACCES;
++ goto out_file;
++ }
++
+ retval = bprm_mm_init(bprm);
+ if (retval)
+ goto out_file;
+@@ -1381,10 +1431,41 @@ int do_execve(char * filename,
+ if (retval < 0)
+ goto out;
+
++ if (!gr_tpe_allow(file)) {
++ retval = -EACCES;
++ goto out;
++ }
++
++ if (gr_check_crash_exec(file)) {
++ retval = -EACCES;
++ goto out;
++ }
++
++ gr_log_chroot_exec(file->f_dentry, file->f_vfsmnt);
++
++ gr_handle_exec_args(bprm, argv);
++
++#ifdef CONFIG_GRKERNSEC
++ old_acl = current->acl;
++ memcpy(old_rlim, current->signal->rlim, sizeof(old_rlim));
++ old_exec_file = current->exec_file;
++ get_file(file);
++ current->exec_file = file;
++#endif
++
++ retval = gr_set_proc_label(file->f_dentry, file->f_vfsmnt,
++ bprm->unsafe & LSM_UNSAFE_SHARE);
++ if (retval < 0)
++ goto out_fail;
++
+ current->flags &= ~PF_KTHREAD;
+ retval = search_binary_handler(bprm,regs);
+ if (retval < 0)
+- goto out;
++ goto out_fail;
++#ifdef CONFIG_GRKERNSEC
++ if (old_exec_file)
++ fput(old_exec_file);
++#endif
+
+ current->stack_start = current->mm->start_stack;
+
+@@ -1397,6 +1478,14 @@ int do_execve(char * filename,
+ put_files_struct(displaced);
+ return retval;
+
++out_fail:
++#ifdef CONFIG_GRKERNSEC
++ current->acl = old_acl;
++ memcpy(current->signal->rlim, old_rlim, sizeof(old_rlim));
++ fput(current->exec_file);
++ current->exec_file = old_exec_file;
++#endif
++
+ out:
+ if (bprm->mm)
+ mmput (bprm->mm);
+@@ -1560,6 +1649,169 @@ out:
+ return ispipe;
+ }
+
++int pax_check_flags(unsigned long *flags)
++{
++ int retval = 0;
++
++#if !defined(CONFIG_X86_32) || !defined(CONFIG_PAX_SEGMEXEC)
++ if (*flags & MF_PAX_SEGMEXEC)
++ {
++ *flags &= ~MF_PAX_SEGMEXEC;
++ retval = -EINVAL;
++ }
++#endif
++
++ if ((*flags & MF_PAX_PAGEEXEC)
++
++#ifdef CONFIG_PAX_PAGEEXEC
++ && (*flags & MF_PAX_SEGMEXEC)
++#endif
++
++ )
++ {
++ *flags &= ~MF_PAX_PAGEEXEC;
++ retval = -EINVAL;
++ }
++
++ if ((*flags & MF_PAX_MPROTECT)
++
++#ifdef CONFIG_PAX_MPROTECT
++ && !(*flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC))
++#endif
++
++ )
++ {
++ *flags &= ~MF_PAX_MPROTECT;
++ retval = -EINVAL;
++ }
++
++ if ((*flags & MF_PAX_EMUTRAMP)
++
++#ifdef CONFIG_PAX_EMUTRAMP
++ && !(*flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC))
++#endif
++
++ )
++ {
++ *flags &= ~MF_PAX_EMUTRAMP;
++ retval = -EINVAL;
++ }
++
++ return retval;
++}
++
++EXPORT_SYMBOL(pax_check_flags);
++
++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC)
++void pax_report_fault(struct pt_regs *regs, void *pc, void *sp)
++{
++ struct task_struct *tsk = current;
++ struct mm_struct *mm = current->mm;
++ char *buffer_exec = (char *)__get_free_page(GFP_KERNEL);
++ char *buffer_fault = (char *)__get_free_page(GFP_KERNEL);
++ char *path_exec = NULL;
++ char *path_fault = NULL;
++ unsigned long start = 0UL, end = 0UL, offset = 0UL;
++
++ if (buffer_exec && buffer_fault) {
++ struct vm_area_struct *vma, *vma_exec = NULL, *vma_fault = NULL;
++
++ down_read(&mm->mmap_sem);
++ vma = mm->mmap;
++ while (vma && (!vma_exec || !vma_fault)) {
++ if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
++ vma_exec = vma;
++ if (vma->vm_start <= (unsigned long)pc && (unsigned long)pc < vma->vm_end)
++ vma_fault = vma;
++ vma = vma->vm_next;
++ }
++ if (vma_exec) {
++ path_exec = d_path(&vma_exec->vm_file->f_path, buffer_exec, PAGE_SIZE);
++ if (IS_ERR(path_exec))
++ path_exec = "<path too long>";
++ else {
++ path_exec = mangle_path(buffer_exec, path_exec, "\t\n\\");
++ if (path_exec) {
++ *path_exec = 0;
++ path_exec = buffer_exec;
++ } else
++ path_exec = "<path too long>";
++ }
++ }
++ if (vma_fault) {
++ start = vma_fault->vm_start;
++ end = vma_fault->vm_end;
++ offset = vma_fault->vm_pgoff << PAGE_SHIFT;
++ if (vma_fault->vm_file) {
++ path_fault = d_path(&vma_fault->vm_file->f_path, buffer_fault, PAGE_SIZE);
++ if (IS_ERR(path_fault))
++ path_fault = "<path too long>";
++ else {
++ path_fault = mangle_path(buffer_fault, path_fault, "\t\n\\");
++ if (path_fault) {
++ *path_fault = 0;
++ path_fault = buffer_fault;
++ } else
++ path_fault = "<path too long>";
++ }
++ } else
++ path_fault = "<anonymous mapping>";
++ }
++ up_read(&mm->mmap_sem);
++ }
++ if (tsk->signal->curr_ip)
++ printk(KERN_ERR "PAX: From %pI4: execution attempt in: %s, %08lx-%08lx %08lx\n", &tsk->signal->curr_ip, path_fault, start, end, offset);
++ else
++ printk(KERN_ERR "PAX: execution attempt in: %s, %08lx-%08lx %08lx\n", path_fault, start, end, offset);
++ printk(KERN_ERR "PAX: terminating task: %s(%s):%d, uid/euid: %u/%u, "
++ "PC: %p, SP: %p\n", path_exec, tsk->comm, task_pid_nr(tsk),
++ task_uid(tsk), task_euid(tsk), pc, sp);
++ free_page((unsigned long)buffer_exec);
++ free_page((unsigned long)buffer_fault);
++ pax_report_insns(pc, sp);
++ do_coredump(SIGKILL, SIGKILL, regs);
++}
++#endif
++
++#ifdef CONFIG_PAX_REFCOUNT
++void pax_report_refcount_overflow(struct pt_regs *regs)
++{
++ if (current->signal->curr_ip)
++ printk(KERN_ERR "PAX: From %pI4: refcount overflow detected in: %s:%d, uid/euid: %u/%u\n",
++ &current->signal->curr_ip, current->comm, task_pid_nr(current), current_uid(), current_euid());
++ else
++ printk(KERN_ERR "PAX: refcount overflow detected in: %s:%d, uid/euid: %u/%u\n",
++ current->comm, task_pid_nr(current), current_uid(), current_euid());
++ print_symbol(KERN_ERR "PAX: refcount overflow occured at: %s\n", instruction_pointer(regs));
++ show_regs(regs);
++ force_sig_info(SIGKILL, SEND_SIG_FORCED, current);
++}
++#endif
++
++#ifdef CONFIG_PAX_USERCOPY
++void pax_report_leak_to_user(const void *ptr, unsigned long len)
++{
++ if (current->signal->curr_ip)
++ printk(KERN_ERR "PAX: From %pI4: kernel memory leak attempt detected from %p (%lu bytes)\n",
++ &current->signal->curr_ip, ptr, len);
++ else
++ printk(KERN_ERR "PAX: kernel memory leak attempt detected from %p (%lu bytes)\n", ptr, len);
++ dump_stack();
++ do_group_exit(SIGKILL);
++}
++
++void pax_report_overflow_from_user(const void *ptr, unsigned long len)
++{
++ if (current->signal->curr_ip)
++ printk(KERN_ERR "PAX: From %pI4: kernel memory overflow attempt detected to %p (%lu bytes)\n",
++ &current->signal->curr_ip, ptr, len);
++ else
++ printk(KERN_ERR "PAX: kernel memory overflow attempt detected to %p (%lu bytes)\n", ptr, len);
++ dump_stack();
++ do_group_exit(SIGKILL);
++}
++#endif
++
+ static int zap_process(struct task_struct *start)
+ {
+ struct task_struct *t;
+@@ -1762,17 +2014,17 @@ static void wait_for_dump_helpers(struct
+ pipe = file->f_path.dentry->d_inode->i_pipe;
+
+ pipe_lock(pipe);
+- pipe->readers++;
+- pipe->writers--;
++ atomic_inc(&pipe->readers);
++ atomic_dec(&pipe->writers);
+
+- while ((pipe->readers > 1) && (!signal_pending(current))) {
++ while ((atomic_read(&pipe->readers) > 1) && (!signal_pending(current))) {
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ pipe_wait(pipe);
+ }
+
+- pipe->readers--;
+- pipe->writers++;
++ atomic_dec(&pipe->readers);
++ atomic_inc(&pipe->writers);
+ pipe_unlock(pipe);
+
+ }
+@@ -1846,6 +2098,10 @@ void do_coredump(long signr, int exit_co
+ */
+ clear_thread_flag(TIF_SIGPENDING);
+
++ if (signr == SIGKILL || signr == SIGILL)
++ gr_handle_brute_attach(current);
++ gr_learn_resource(current, RLIMIT_CORE, binfmt->min_coredump, 1);
++
+ /*
+ * lock_kernel() because format_corename() is controlled by sysctl, which
+ * uses lock_kernel()
+diff -urNp linux-2.6.33.1/fs/ext2/balloc.c linux-2.6.33.1/fs/ext2/balloc.c
+--- linux-2.6.33.1/fs/ext2/balloc.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/ext2/balloc.c 2010-03-20 16:58:41.476618429 -0400
+@@ -1192,7 +1192,7 @@ static int ext2_has_free_blocks(struct e
+
+ free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
+- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
++ if (free_blocks < root_blocks + 1 && !capable_nolog(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current_fsuid() &&
+ (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ return 0;
+diff -urNp linux-2.6.33.1/fs/ext3/balloc.c linux-2.6.33.1/fs/ext3/balloc.c
+--- linux-2.6.33.1/fs/ext3/balloc.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/ext3/balloc.c 2010-03-20 16:58:41.484909301 -0400
+@@ -1421,7 +1421,7 @@ static int ext3_has_free_blocks(struct e
+
+ free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
+- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
++ if (free_blocks < root_blocks + 1 && !capable_nolog(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current_fsuid() &&
+ (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ return 0;
+diff -urNp linux-2.6.33.1/fs/ext3/namei.c linux-2.6.33.1/fs/ext3/namei.c
+--- linux-2.6.33.1/fs/ext3/namei.c 2010-03-15 12:09:39.000000000 -0400
++++ linux-2.6.33.1/fs/ext3/namei.c 2010-03-20 16:58:41.496970241 -0400
+@@ -1168,7 +1168,7
@@ static struct ext3_dir_entry_2 *do_split + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext3_dir_entry_2 *de = NULL, *de2; +- int err = 0, i; ++ int i, err = 0; + + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) { +diff -urNp linux-2.6.33.1/fs/ext3/xattr.c linux-2.6.33.1/fs/ext3/xattr.c +--- linux-2.6.33.1/fs/ext3/xattr.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ext3/xattr.c 2010-03-20 16:58:41.500911319 -0400 +@@ -89,8 +89,8 @@ + printk("\n"); \ + } while (0) + #else +-# define ea_idebug(f...) +-# define ea_bdebug(f...) ++# define ea_idebug(f...) do {} while (0) ++# define ea_bdebug(f...) do {} while (0) + #endif + + static void ext3_xattr_cache_insert(struct buffer_head *); +diff -urNp linux-2.6.33.1/fs/ext4/balloc.c linux-2.6.33.1/fs/ext4/balloc.c +--- linux-2.6.33.1/fs/ext4/balloc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ext4/balloc.c 2010-03-20 16:58:41.500911319 -0400 +@@ -535,7 +535,7 @@ int ext4_has_free_blocks(struct ext4_sb_ + /* Hm, nope. Are (enough) root reserved blocks available? */ + if (sbi->s_resuid == current_fsuid() || + ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || +- capable(CAP_SYS_RESOURCE)) { ++ capable_nolog(CAP_SYS_RESOURCE)) { + if (free_blocks >= (nblocks + dirty_blocks)) + return 1; + } +diff -urNp linux-2.6.33.1/fs/ext4/ioctl.c linux-2.6.33.1/fs/ext4/ioctl.c +--- linux-2.6.33.1/fs/ext4/ioctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ext4/ioctl.c 2010-03-20 16:58:41.500911319 -0400 +@@ -221,6 +221,9 @@ setversion_out: + struct file *donor_filp; + int err; + ++ /* temporary workaround for bugs in here */ ++ return -EOPNOTSUPP; ++ + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; +diff -urNp linux-2.6.33.1/fs/ext4/namei.c linux-2.6.33.1/fs/ext4/namei.c +--- linux-2.6.33.1/fs/ext4/namei.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ext4/namei.c 2010-03-20 16:58:41.504537056 -0400 +@@ -1203,7 +1203,7 @@ static struct ext4_dir_entry_2 *do_split + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext4_dir_entry_2 *de = NULL, *de2; +- int err = 0, i; ++ int i, err = 0; + + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) { +diff -urNp linux-2.6.33.1/fs/ext4/super.c linux-2.6.33.1/fs/ext4/super.c +--- linux-2.6.33.1/fs/ext4/super.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ext4/super.c 2010-03-20 16:58:41.512931908 -0400 +@@ -2292,7 +2292,7 @@ static void ext4_sb_release(struct kobje + } + + +-static struct sysfs_ops ext4_attr_ops = { ++static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, + }; +diff -urNp linux-2.6.33.1/fs/fcntl.c linux-2.6.33.1/fs/fcntl.c +--- linux-2.6.33.1/fs/fcntl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fcntl.c 2010-03-20 16:58:41.516920875 -0400 +@@ -344,6 +344,7 @@ static long do_fcntl(int fd, unsigned in + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: ++ gr_learn_resource(current, RLIMIT_NOFILE, arg, 0); + if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + break; + err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? 
O_CLOEXEC : 0); +@@ -500,7 +501,8 @@ static inline int sigio_perm(struct task + ret = ((fown->euid == 0 || + fown->euid == cred->suid || fown->euid == cred->uid || + fown->uid == cred->suid || fown->uid == cred->uid) && +- !security_file_send_sigiotask(p, fown, sig)); ++ !security_file_send_sigiotask(p, fown, sig) && ++ !gr_check_protected_task(p) && !gr_pid_is_chrooted(p)); + rcu_read_unlock(); + return ret; + } +diff -urNp linux-2.6.33.1/fs/fifo.c linux-2.6.33.1/fs/fifo.c +--- linux-2.6.33.1/fs/fifo.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fifo.c 2010-03-20 16:58:41.516920875 -0400 +@@ -59,10 +59,10 @@ static int fifo_open(struct inode *inode + */ + filp->f_op = &read_pipefifo_fops; + pipe->r_counter++; +- if (pipe->readers++ == 0) ++ if (atomic_inc_return(&pipe->readers) == 1) + wake_up_partner(inode); + +- if (!pipe->writers) { ++ if (!atomic_read(&pipe->writers)) { + if ((filp->f_flags & O_NONBLOCK)) { + /* suppress POLLHUP until we have + * seen a writer */ +@@ -83,15 +83,15 @@ static int fifo_open(struct inode *inode + * errno=ENXIO when there is no process reading the FIFO. + */ + ret = -ENXIO; +- if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) ++ if ((filp->f_flags & O_NONBLOCK) && !atomic_read(&pipe->readers)) + goto err; + + filp->f_op = &write_pipefifo_fops; + pipe->w_counter++; +- if (!pipe->writers++) ++ if (atomic_inc_return(&pipe->writers) == 1) + wake_up_partner(inode); + +- if (!pipe->readers) { ++ if (!atomic_read(&pipe->readers)) { + wait_for_partner(inode, &pipe->r_counter); + if (signal_pending(current)) + goto err_wr; +@@ -107,11 +107,11 @@ static int fifo_open(struct inode *inode + */ + filp->f_op = &rdwr_pipefifo_fops; + +- pipe->readers++; +- pipe->writers++; ++ atomic_inc(&pipe->readers); ++ atomic_inc(&pipe->writers); + pipe->r_counter++; + pipe->w_counter++; +- if (pipe->readers == 1 || pipe->writers == 1) ++ if (atomic_read(&pipe->readers) == 1 || atomic_read(&pipe->writers) == 1) + wake_up_partner(inode); + break; + +@@ -125,19 +125,19 @@ static int fifo_open(struct inode *inode + return 0; + + err_rd: +- if (!--pipe->readers) ++ if (atomic_dec_and_test(&pipe->readers)) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + + err_wr: +- if (!--pipe->writers) ++ if (atomic_dec_and_test(&pipe->writers)) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + + err: +- if (!pipe->readers && !pipe->writers) ++ if (!atomic_read(&pipe->readers) && !atomic_read(&pipe->writers)) + free_pipe_info(inode); + + err_nocleanup: +diff -urNp linux-2.6.33.1/fs/file.c linux-2.6.33.1/fs/file.c +--- linux-2.6.33.1/fs/file.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/file.c 2010-03-20 16:58:41.516920875 -0400 +@@ -14,6 +14,7 @@ + #include <linux/slab.h> + #include <linux/vmalloc.h> + #include <linux/file.h> ++#include <linux/security.h> + #include <linux/fdtable.h> + #include <linux/bitops.h> + #include <linux/interrupt.h> +@@ -257,6 +258,8 @@ int expand_files(struct files_struct *fi + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. 
+ */ ++ ++ gr_learn_resource(current, RLIMIT_NOFILE, nr, 0); + if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + return -EMFILE; + +diff -urNp linux-2.6.33.1/fs/fs_struct.c linux-2.6.33.1/fs/fs_struct.c +--- linux-2.6.33.1/fs/fs_struct.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fs_struct.c 2010-03-20 16:58:41.516920875 -0400 +@@ -45,10 +45,12 @@ void chroot_fs_refs(struct path *old_roo + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; ++ unsigned long flags; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); ++ gr_fs_write_lock_irqsave(p, flags); + fs = p->fs; + if (fs) { + write_lock(&fs->lock); +@@ -66,6 +68,7 @@ void chroot_fs_refs(struct path *old_roo + } + write_unlock(&fs->lock); + } ++ gr_fs_write_unlock_irqrestore(p, flags); + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +@@ -83,14 +86,17 @@ void free_fs_struct(struct fs_struct *fs + void exit_fs(struct task_struct *tsk) + { + struct fs_struct *fs = tsk->fs; ++ unsigned long flags; + + if (fs) { + int kill; + task_lock(tsk); ++ gr_fs_write_lock_irqsave(tsk, flags); + write_lock(&fs->lock); + tsk->fs = NULL; +- kill = !--fs->users; ++ kill = !atomic_dec_return(&fs->users); + write_unlock(&fs->lock); ++ gr_fs_write_unlock_irqrestore(tsk, flags); + task_unlock(tsk); + if (kill) + free_fs_struct(fs); +@@ -102,7 +108,7 @@ struct fs_struct *copy_fs_struct(struct + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { +- fs->users = 1; ++ atomic_set(&fs->users, 1); + fs->in_exec = 0; + rwlock_init(&fs->lock); + fs->umask = old->umask; +@@ -121,15 +127,18 @@ int unshare_fs_struct(void) + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; ++ unsigned long flags; + + if (!new_fs) + return -ENOMEM; + + task_lock(current); ++ gr_fs_write_lock_irqsave(current, flags); + write_lock(&fs->lock); +- kill = !--fs->users; ++ kill = !atomic_dec_return(&fs->users); + current->fs = new_fs; + write_unlock(&fs->lock); ++ gr_fs_write_unlock_irqrestore(current, flags); + task_unlock(current); + + if (kill) +@@ -147,7 +156,7 @@ EXPORT_SYMBOL(current_umask); + + /* to be mentioned only in INIT_TASK */ + struct fs_struct init_fs = { +- .users = 1, ++ .users = ATOMIC_INIT(1), + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, + }; +@@ -155,6 +164,7 @@ struct fs_struct init_fs = { + void daemonize_fs_struct(void) + { + struct fs_struct *fs = current->fs; ++ unsigned long flags; + + if (fs) { + int kill; +@@ -162,13 +172,15 @@ void daemonize_fs_struct(void) + task_lock(current); + + write_lock(&init_fs.lock); +- init_fs.users++; ++ atomic_inc(&init_fs.users); + write_unlock(&init_fs.lock); + ++ gr_fs_write_lock_irqsave(current, flags); + write_lock(&fs->lock); + current->fs = &init_fs; +- kill = !--fs->users; ++ kill = !atomic_dec_return(&fs->users); + write_unlock(&fs->lock); ++ gr_fs_write_unlock_irqrestore(current, flags); + + task_unlock(current); + if (kill) +diff -urNp linux-2.6.33.1/fs/fuse/control.c linux-2.6.33.1/fs/fuse/control.c +--- linux-2.6.33.1/fs/fuse/control.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fuse/control.c 2010-03-20 16:58:41.520615009 -0400 +@@ -293,7 +293,7 @@ void fuse_ctl_remove_conn(struct fuse_co + + static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) + { +- struct tree_descr empty_descr = {""}; ++ struct tree_descr empty_descr = {"", NULL, 0}; + struct fuse_conn 
*fc; + int err; + +diff -urNp linux-2.6.33.1/fs/fuse/cuse.c linux-2.6.33.1/fs/fuse/cuse.c +--- linux-2.6.33.1/fs/fuse/cuse.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fuse/cuse.c 2010-03-20 16:58:41.520615009 -0400 +@@ -528,8 +528,18 @@ static int cuse_channel_release(struct i + return rc; + } + +-static struct file_operations cuse_channel_fops; /* initialized during init */ +- ++static const struct file_operations cuse_channel_fops = { /* initialized during init */ ++ .owner = THIS_MODULE, ++ .llseek = no_llseek, ++ .read = do_sync_read, ++ .aio_read = fuse_dev_read, ++ .write = do_sync_write, ++ .aio_write = fuse_dev_write, ++ .poll = fuse_dev_poll, ++ .open = cuse_channel_open, ++ .release = cuse_channel_release, ++ .fasync = fuse_dev_fasync, ++}; + + /************************************************************************** + * Misc stuff and module initializatiion +@@ -575,12 +585,6 @@ static int __init cuse_init(void) + for (i = 0; i < CUSE_CONNTBL_LEN; i++) + INIT_LIST_HEAD(&cuse_conntbl[i]); + +- /* inherit and extend fuse_dev_operations */ +- cuse_channel_fops = fuse_dev_operations; +- cuse_channel_fops.owner = THIS_MODULE; +- cuse_channel_fops.open = cuse_channel_open; +- cuse_channel_fops.release = cuse_channel_release; +- + cuse_class = class_create(THIS_MODULE, "cuse"); + if (IS_ERR(cuse_class)) + return PTR_ERR(cuse_class); +diff -urNp linux-2.6.33.1/fs/fuse/dev.c linux-2.6.33.1/fs/fuse/dev.c +--- linux-2.6.33.1/fs/fuse/dev.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fuse/dev.c 2010-03-20 16:58:41.524892399 -0400 +@@ -745,7 +745,7 @@ __releases(&fc->lock) + * request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ +-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, ++ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) + { + int err; +@@ -828,6 +828,8 @@ static ssize_t fuse_dev_read(struct kioc + return err; + } + ++EXPORT_SYMBOL_GPL(fuse_dev_read); ++ + static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) + { +@@ -885,7 +887,7 @@ static int fuse_notify_inval_entry(struc + { + struct fuse_notify_inval_entry_out outarg; + int err = -EINVAL; +- char buf[FUSE_NAME_MAX+1]; ++ char *buf = NULL; + struct qstr name; + + if (size < sizeof(outarg)) +@@ -899,6 +901,11 @@ static int fuse_notify_inval_entry(struc + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + ++ err = -ENOMEM; ++ buf = kmalloc(FUSE_NAME_MAX+1, GFP_KERNEL); ++ if (!buf) ++ goto err; ++ + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); +@@ -910,17 +917,15 @@ static int fuse_notify_inval_entry(struc + + down_read(&fc->killsb); + err = -ENOENT; +- if (!fc->sb) +- goto err_unlock; +- +- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); +- +-err_unlock: ++ if (fc->sb) ++ err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); + up_read(&fc->killsb); ++ kfree(buf); + return err; + + err: + fuse_copy_finish(cs); ++ kfree(buf); + return err; + } + +@@ -987,7 +992,7 @@ static int copy_out_args(struct fuse_cop + * it from the list and copy the rest of the buffer to the request. 
+ * The request is finished by calling request_end() + */ +-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, ++ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) + { + int err; +@@ -1084,7 +1089,9 @@ static ssize_t fuse_dev_write(struct kio + return err; + } + +-static unsigned fuse_dev_poll(struct file *file, poll_table *wait) ++EXPORT_SYMBOL_GPL(fuse_dev_write); ++ ++unsigned fuse_dev_poll(struct file *file, poll_table *wait) + { + unsigned mask = POLLOUT | POLLWRNORM; + struct fuse_conn *fc = fuse_get_conn(file); +@@ -1103,6 +1110,8 @@ static unsigned fuse_dev_poll(struct fil + return mask; + } + ++EXPORT_SYMBOL_GPL(fuse_dev_poll); ++ + /* + * Abort all requests on the given list (pending or processing) + * +@@ -1210,7 +1219,7 @@ int fuse_dev_release(struct inode *inode + } + EXPORT_SYMBOL_GPL(fuse_dev_release); + +-static int fuse_dev_fasync(int fd, struct file *file, int on) ++int fuse_dev_fasync(int fd, struct file *file, int on) + { + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) +@@ -1220,6 +1229,8 @@ static int fuse_dev_fasync(int fd, struc + return fasync_helper(fd, file, on, &fc->fasync); + } + ++EXPORT_SYMBOL_GPL(fuse_dev_fasync); ++ + const struct file_operations fuse_dev_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, +diff -urNp linux-2.6.33.1/fs/fuse/dir.c linux-2.6.33.1/fs/fuse/dir.c +--- linux-2.6.33.1/fs/fuse/dir.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fuse/dir.c 2010-03-20 16:58:41.528872982 -0400 +@@ -1127,7 +1127,7 @@ static char *read_link(struct dentry *de + return link; + } + +-static void free_link(char *link) ++static void free_link(const char *link) + { + if (!IS_ERR(link)) + free_page((unsigned long) link); +diff -urNp linux-2.6.33.1/fs/fuse/fuse_i.h linux-2.6.33.1/fs/fuse/fuse_i.h +--- linux-2.6.33.1/fs/fuse/fuse_i.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/fuse/fuse_i.h 2010-03-20 16:58:41.532912913 -0400 +@@ -521,6 +521,16 @@ extern const struct file_operations fuse + + extern const struct dentry_operations fuse_dentry_operations; + ++extern ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, ++ unsigned long nr_segs, loff_t pos); ++ ++extern ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, ++ unsigned long nr_segs, loff_t pos); ++ ++extern unsigned fuse_dev_poll(struct file *file, poll_table *wait); ++ ++extern int fuse_dev_fasync(int fd, struct file *file, int on); ++ + /** + * Inode to nodeid comparison. + */ +diff -urNp linux-2.6.33.1/fs/gfs2/sys.c linux-2.6.33.1/fs/gfs2/sys.c +--- linux-2.6.33.1/fs/gfs2/sys.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/gfs2/sys.c 2010-03-20 16:58:41.563618654 -0400 +@@ -49,7 +49,7 @@ static ssize_t gfs2_attr_store(struct ko + return a->store ? 
a->store(sdp, buf, len) : len; + } + +-static struct sysfs_ops gfs2_attr_ops = { ++static const struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, + }; +@@ -576,7 +576,7 @@ static int gfs2_uevent(struct kset *kset + return 0; + } + +-static struct kset_uevent_ops gfs2_uevent_ops = { ++static const struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, + }; + +diff -urNp linux-2.6.33.1/fs/hfs/inode.c linux-2.6.33.1/fs/hfs/inode.c +--- linux-2.6.33.1/fs/hfs/inode.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/hfs/inode.c 2010-03-20 16:58:41.564527258 -0400 +@@ -423,7 +423,7 @@ int hfs_write_inode(struct inode *inode, + + if (S_ISDIR(main_inode->i_mode)) { + if (fd.entrylength < sizeof(struct hfs_cat_dir)) +- /* panic? */; ++ {/* panic? */} + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_dir)); + if (rec.type != HFS_CDR_DIR || +@@ -444,7 +444,7 @@ int hfs_write_inode(struct inode *inode, + sizeof(struct hfs_cat_file)); + } else { + if (fd.entrylength < sizeof(struct hfs_cat_file)) +- /* panic? */; ++ {/* panic? */} + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + if (rec.type != HFS_CDR_FIL || +diff -urNp linux-2.6.33.1/fs/hfsplus/inode.c linux-2.6.33.1/fs/hfsplus/inode.c +--- linux-2.6.33.1/fs/hfsplus/inode.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/hfsplus/inode.c 2010-03-20 16:58:41.564527258 -0400 +@@ -406,7 +406,7 @@ int hfsplus_cat_read_inode(struct inode + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) +- /* panic? */; ++ {/* panic? */} + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_folder)); + hfsplus_get_perms(inode, &folder->permissions, 1); +@@ -423,7 +423,7 @@ int hfsplus_cat_read_inode(struct inode + struct hfsplus_cat_file *file = &entry.file; + + if (fd->entrylength < sizeof(struct hfsplus_cat_file)) +- /* panic? */; ++ {/* panic? */} + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_file)); + +@@ -479,7 +479,7 @@ int hfsplus_cat_write_inode(struct inode + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) +- /* panic? */; ++ {/* panic? */} + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + /* simple node checks? */ +@@ -501,7 +501,7 @@ int hfsplus_cat_write_inode(struct inode + struct hfsplus_cat_file *file = &entry.file; + + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) +- /* panic? */; ++ {/* panic? 
*/} + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_inode_write_fork(inode, &file->data_fork); +diff -urNp linux-2.6.33.1/fs/ioctl.c linux-2.6.33.1/fs/ioctl.c +--- linux-2.6.33.1/fs/ioctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ioctl.c 2010-03-20 16:58:41.564527258 -0400 +@@ -97,7 +97,7 @@ int fiemap_fill_next_extent(struct fiema + u64 phys, u64 len, u32 flags) + { + struct fiemap_extent extent; +- struct fiemap_extent *dest = fieinfo->fi_extents_start; ++ struct fiemap_extent __user *dest = fieinfo->fi_extents_start; + + /* only count the extents */ + if (fieinfo->fi_extents_max == 0) { +@@ -207,7 +207,7 @@ static int ioctl_fiemap(struct file *fil + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; +- fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); ++ fieinfo.fi_extents_start = (struct fiemap_extent __user *)(arg + sizeof(fiemap)); + + if (fiemap.fm_extent_count != 0 && + !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, +@@ -220,7 +220,7 @@ static int ioctl_fiemap(struct file *fil + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; +- if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) ++ if (copy_to_user((__force char __user *)arg, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +diff -urNp linux-2.6.33.1/fs/jffs2/debug.h linux-2.6.33.1/fs/jffs2/debug.h +--- linux-2.6.33.1/fs/jffs2/debug.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/jffs2/debug.h 2010-03-20 16:58:41.564527258 -0400 +@@ -52,13 +52,13 @@ + #if CONFIG_JFFS2_FS_DEBUG > 0 + #define D1(x) x + #else +-#define D1(x) ++#define D1(x) do {} while (0); + #endif + + #if CONFIG_JFFS2_FS_DEBUG > 1 + #define D2(x) x + #else +-#define D2(x) ++#define D2(x) do {} while (0); + #endif + + /* The prefixes of JFFS2 messages */ +@@ -114,73 +114,73 @@ + #ifdef JFFS2_DBG_READINODE_MESSAGES + #define dbg_readinode(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_readinode(fmt, ...) ++#define dbg_readinode(fmt, ...) do {} while (0) + #endif + #ifdef JFFS2_DBG_READINODE2_MESSAGES + #define dbg_readinode2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_readinode2(fmt, ...) ++#define dbg_readinode2(fmt, ...) do {} while (0) + #endif + + /* Fragtree build debugging messages */ + #ifdef JFFS2_DBG_FRAGTREE_MESSAGES + #define dbg_fragtree(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_fragtree(fmt, ...) ++#define dbg_fragtree(fmt, ...) do {} while (0) + #endif + #ifdef JFFS2_DBG_FRAGTREE2_MESSAGES + #define dbg_fragtree2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_fragtree2(fmt, ...) ++#define dbg_fragtree2(fmt, ...) do {} while (0) + #endif + + /* Directory entry list manilulation debugging messages */ + #ifdef JFFS2_DBG_DENTLIST_MESSAGES + #define dbg_dentlist(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_dentlist(fmt, ...) ++#define dbg_dentlist(fmt, ...) do {} while (0) + #endif + + /* Print the messages about manipulating node_refs */ + #ifdef JFFS2_DBG_NODEREF_MESSAGES + #define dbg_noderef(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_noderef(fmt, ...) ++#define dbg_noderef(fmt, ...) do {} while (0) + #endif + + /* Manipulations with the list of inodes (JFFS2 inocache) */ + #ifdef JFFS2_DBG_INOCACHE_MESSAGES + #define dbg_inocache(fmt, ...) 
JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_inocache(fmt, ...) ++#define dbg_inocache(fmt, ...) do {} while (0) + #endif + + /* Summary debugging messages */ + #ifdef JFFS2_DBG_SUMMARY_MESSAGES + #define dbg_summary(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_summary(fmt, ...) ++#define dbg_summary(fmt, ...) do {} while (0) + #endif + + /* File system build messages */ + #ifdef JFFS2_DBG_FSBUILD_MESSAGES + #define dbg_fsbuild(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_fsbuild(fmt, ...) ++#define dbg_fsbuild(fmt, ...) do {} while (0) + #endif + + /* Watch the object allocations */ + #ifdef JFFS2_DBG_MEMALLOC_MESSAGES + #define dbg_memalloc(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_memalloc(fmt, ...) ++#define dbg_memalloc(fmt, ...) do {} while (0) + #endif + + /* Watch the XATTR subsystem */ + #ifdef JFFS2_DBG_XATTR_MESSAGES + #define dbg_xattr(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) + #else +-#define dbg_xattr(fmt, ...) ++#define dbg_xattr(fmt, ...) do {} while (0) + #endif + + /* "Sanity" checks */ +diff -urNp linux-2.6.33.1/fs/jffs2/erase.c linux-2.6.33.1/fs/jffs2/erase.c +--- linux-2.6.33.1/fs/jffs2/erase.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/jffs2/erase.c 2010-03-20 16:58:41.588752321 -0400 +@@ -434,7 +434,8 @@ static void jffs2_mark_erased_block(stru + struct jffs2_unknown_node marker = { + .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), +- .totlen = cpu_to_je32(c->cleanmarker_size) ++ .totlen = cpu_to_je32(c->cleanmarker_size), ++ .hdr_crc = cpu_to_je32(0) + }; + + jffs2_prealloc_raw_node_refs(c, jeb, 1); +diff -urNp linux-2.6.33.1/fs/jffs2/summary.h linux-2.6.33.1/fs/jffs2/summary.h +--- linux-2.6.33.1/fs/jffs2/summary.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/jffs2/summary.h 2010-03-20 16:58:41.588752321 -0400 +@@ -194,18 +194,18 @@ int jffs2_sum_scan_sumnode(struct jffs2_ + + #define jffs2_sum_active() (0) + #define jffs2_sum_init(a) (0) +-#define jffs2_sum_exit(a) +-#define jffs2_sum_disable_collecting(a) ++#define jffs2_sum_exit(a) do {} while (0) ++#define jffs2_sum_disable_collecting(a) do {} while (0) + #define jffs2_sum_is_disabled(a) (0) +-#define jffs2_sum_reset_collected(a) ++#define jffs2_sum_reset_collected(a) do {} while (0) + #define jffs2_sum_add_kvec(a,b,c,d) (0) +-#define jffs2_sum_move_collected(a,b) ++#define jffs2_sum_move_collected(a,b) do {} while (0) + #define jffs2_sum_write_sumnode(a) (0) +-#define jffs2_sum_add_padding_mem(a,b) +-#define jffs2_sum_add_inode_mem(a,b,c) +-#define jffs2_sum_add_dirent_mem(a,b,c) +-#define jffs2_sum_add_xattr_mem(a,b,c) +-#define jffs2_sum_add_xref_mem(a,b,c) ++#define jffs2_sum_add_padding_mem(a,b) do {} while (0) ++#define jffs2_sum_add_inode_mem(a,b,c) do {} while (0) ++#define jffs2_sum_add_dirent_mem(a,b,c) do {} while (0) ++#define jffs2_sum_add_xattr_mem(a,b,c) do {} while (0) ++#define jffs2_sum_add_xref_mem(a,b,c) do {} while (0) + #define jffs2_sum_scan_sumnode(a,b,c,d,e) (0) + + #endif /* CONFIG_JFFS2_SUMMARY */ +diff -urNp linux-2.6.33.1/fs/jffs2/wbuf.c linux-2.6.33.1/fs/jffs2/wbuf.c +--- linux-2.6.33.1/fs/jffs2/wbuf.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/jffs2/wbuf.c 2010-03-20 16:58:41.588752321 -0400 +@@ -1012,7 +1012,8 @@ static const struct jffs2_unknown_node o + { + .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), +- .totlen = constant_cpu_to_je32(8) ++ 
.totlen = constant_cpu_to_je32(8), ++ .hdr_crc = constant_cpu_to_je32(0) + }; + + /* +diff -urNp linux-2.6.33.1/fs/lockd/svc.c linux-2.6.33.1/fs/lockd/svc.c +--- linux-2.6.33.1/fs/lockd/svc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/lockd/svc.c 2010-03-20 16:58:41.588752321 -0400 +@@ -43,7 +43,7 @@ + + static struct svc_program nlmsvc_program; + +-struct nlmsvc_binding * nlmsvc_ops; ++const struct nlmsvc_binding * nlmsvc_ops; + EXPORT_SYMBOL_GPL(nlmsvc_ops); + + static DEFINE_MUTEX(nlmsvc_mutex); +diff -urNp linux-2.6.33.1/fs/locks.c linux-2.6.33.1/fs/locks.c +--- linux-2.6.33.1/fs/locks.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/locks.c 2010-03-20 16:58:41.600927747 -0400 +@@ -2007,16 +2007,16 @@ void locks_remove_flock(struct file *fil + return; + + if (filp->f_op && filp->f_op->flock) { +- struct file_lock fl = { ++ struct file_lock flock = { + .fl_pid = current->tgid, + .fl_file = filp, + .fl_flags = FL_FLOCK, + .fl_type = F_UNLCK, + .fl_end = OFFSET_MAX, + }; +- filp->f_op->flock(filp, F_SETLKW, &fl); +- if (fl.fl_ops && fl.fl_ops->fl_release_private) +- fl.fl_ops->fl_release_private(&fl); ++ filp->f_op->flock(filp, F_SETLKW, &flock); ++ if (flock.fl_ops && flock.fl_ops->fl_release_private) ++ flock.fl_ops->fl_release_private(&flock); + } + + lock_kernel(); +diff -urNp linux-2.6.33.1/fs/namei.c linux-2.6.33.1/fs/namei.c +--- linux-2.6.33.1/fs/namei.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/namei.c 2010-03-20 16:58:41.612934232 -0400 +@@ -565,7 +565,7 @@ static __always_inline int __do_follow_l + cookie = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(cookie); + if (!IS_ERR(cookie)) { +- char *s = nd_get_link(nd); ++ const char *s = nd_get_link(nd); + error = 0; + if (s) + error = __vfs_follow_link(nd, s); +@@ -599,6 +599,13 @@ static inline int do_follow_link(struct + err = security_inode_follow_link(path->dentry, nd); + if (err) + goto loop; ++ ++ if (gr_handle_follow_link(path->dentry->d_parent->d_inode, ++ path->dentry->d_inode, path->dentry, nd->path.mnt)) { ++ err = -EACCES; ++ goto loop; ++ } ++ + current->link_count++; + current->total_link_count++; + nd->depth++; +@@ -994,11 +1001,18 @@ return_reval: + break; + } + return_base: ++ if (!gr_acl_handle_hidden_file(nd->path.dentry, nd->path.mnt)) { ++ path_put(&nd->path); ++ return -ENOENT; ++ } + return 0; + out_dput: + path_put_conditional(&next, nd); + break; + } ++ if (!gr_acl_handle_hidden_file(nd->path.dentry, nd->path.mnt)) ++ err = -ENOENT; ++ + path_put(&nd->path); + return_err: + return err; +@@ -1552,12 +1566,19 @@ static int __open_namei_create(struct na + int error; + struct dentry *dir = nd->path.dentry; + ++ if (!gr_acl_handle_creat(path->dentry, nd->path.dentry, nd->path.mnt, flag, mode)) { ++ error = -EACCES; ++ goto out_unlock; ++ } ++ + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); + error = security_path_mknod(&nd->path, path->dentry, mode, 0); + if (error) + goto out_unlock; + error = vfs_create(dir->d_inode, path->dentry, mode, nd); ++ if (!error) ++ gr_handle_create(path->dentry, nd->path.mnt); + out_unlock: + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); +@@ -1665,6 +1686,22 @@ struct file *do_filp_open(int dfd, const + release_open_intent(&nd); + if (error) + return ERR_PTR(error); ++ ++ if (gr_handle_rofs_blockwrite(nd.path.dentry, nd.path.mnt, acc_mode)) { ++ error = -EPERM; ++ goto exit; ++ } ++ ++ if (gr_handle_rawio(nd.path.dentry->d_inode)) { ++ error = -EPERM; ++ goto exit; ++ } ++ ++ if 
(!gr_acl_handle_open(nd.path.dentry, nd.path.mnt, flag)) { ++ error = -EACCES; ++ goto exit; ++ } ++ + goto ok; + } + +@@ -1758,6 +1795,24 @@ do_last: + /* + * It already exists. + */ ++ ++ if (gr_handle_rofs_blockwrite(path.dentry, nd.path.mnt, acc_mode)) { ++ error = -EPERM; ++ goto exit_mutex_unlock; ++ } ++ if (gr_handle_rawio(path.dentry->d_inode)) { ++ error = -EPERM; ++ goto exit_mutex_unlock; ++ } ++ if (!gr_acl_handle_open(path.dentry, nd.path.mnt, flag)) { ++ error = -EACCES; ++ goto exit_mutex_unlock; ++ } ++ if (gr_handle_fifo(path.dentry, nd.path.mnt, dir, flag, acc_mode)) { ++ error = -EACCES; ++ goto exit_mutex_unlock; ++ } ++ + mutex_unlock(&dir->d_inode->i_mutex); + audit_inode(pathname, path.dentry); + +@@ -1866,6 +1921,13 @@ do_link: + error = security_inode_follow_link(path.dentry, &nd); + if (error) + goto exit_dput; ++ ++ if (gr_handle_follow_link(path.dentry->d_parent->d_inode, path.dentry->d_inode, ++ path.dentry, nd.path.mnt)) { ++ error = -EACCES; ++ goto exit_dput; ++ } ++ + error = __do_follow_link(&path, &nd); + path_put(&path); + if (error) { +@@ -2045,6 +2107,17 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const + error = may_mknod(mode); + if (error) + goto out_dput; ++ ++ if (gr_handle_chroot_mknod(dentry, nd.path.mnt, mode)) { ++ error = -EPERM; ++ goto out_dput; ++ } ++ ++ if (!gr_acl_handle_mknod(dentry, nd.path.dentry, nd.path.mnt, mode)) { ++ error = -EACCES; ++ goto out_dput; ++ } ++ + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; +@@ -2065,6 +2138,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const + } + out_drop_write: + mnt_drop_write(nd.path.mnt); ++ ++ if (!error) ++ gr_handle_create(dentry, nd.path.mnt); + out_dput: + dput(dentry); + out_unlock: +@@ -2118,6 +2194,11 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const + if (IS_ERR(dentry)) + goto out_unlock; + ++ if (!gr_acl_handle_mkdir(dentry, nd.path.dentry, nd.path.mnt)) { ++ error = -EACCES; ++ goto out_dput; ++ } ++ + if (!IS_POSIXACL(nd.path.dentry->d_inode)) + mode &= ~current_umask(); + error = mnt_want_write(nd.path.mnt); +@@ -2129,6 +2210,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const + error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); + out_drop_write: + mnt_drop_write(nd.path.mnt); ++ ++ if (!error) ++ gr_handle_create(dentry, nd.path.mnt); ++ + out_dput: + dput(dentry); + out_unlock: +@@ -2210,6 +2295,8 @@ static long do_rmdir(int dfd, const char + char * name; + struct dentry *dentry; + struct nameidata nd; ++ ino_t saved_ino = 0; ++ dev_t saved_dev = 0; + + error = user_path_parent(dfd, pathname, &nd, &name); + if (error) +@@ -2234,6 +2321,19 @@ static long do_rmdir(int dfd, const char + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto exit2; ++ ++ if (dentry->d_inode != NULL) { ++ if (dentry->d_inode->i_nlink <= 1) { ++ saved_ino = dentry->d_inode->i_ino; ++ saved_dev = dentry->d_inode->i_sb->s_dev; ++ } ++ ++ if (!gr_acl_handle_rmdir(dentry, nd.path.mnt)) { ++ error = -EACCES; ++ goto exit3; ++ } ++ } ++ + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit3; +@@ -2241,6 +2341,8 @@ static long do_rmdir(int dfd, const char + if (error) + goto exit4; + error = vfs_rmdir(nd.path.dentry->d_inode, dentry); ++ if (!error && (saved_dev || saved_ino)) ++ gr_handle_delete(saved_ino, saved_dev); + exit4: + mnt_drop_write(nd.path.mnt); + exit3: +@@ -2302,6 +2404,8 @@ static long do_unlinkat(int dfd, const c + struct dentry *dentry; + struct nameidata nd; + struct inode *inode = NULL; ++ ino_t saved_ino = 0; ++ dev_t saved_dev = 0; + + error = user_path_parent(dfd, 
pathname, &nd, &name); + if (error) +@@ -2321,8 +2425,19 @@ static long do_unlinkat(int dfd, const c + if (nd.last.name[nd.last.len]) + goto slashes; + inode = dentry->d_inode; +- if (inode) ++ if (inode) { ++ if (inode->i_nlink <= 1) { ++ saved_ino = inode->i_ino; ++ saved_dev = inode->i_sb->s_dev; ++ } ++ + atomic_inc(&inode->i_count); ++ ++ if (!gr_acl_handle_unlink(dentry, nd.path.mnt)) { ++ error = -EACCES; ++ goto exit2; ++ } ++ } + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit2; +@@ -2330,6 +2445,8 @@ static long do_unlinkat(int dfd, const c + if (error) + goto exit3; + error = vfs_unlink(nd.path.dentry->d_inode, dentry); ++ if (!error && (saved_ino || saved_dev)) ++ gr_handle_delete(saved_ino, saved_dev); + exit3: + mnt_drop_write(nd.path.mnt); + exit2: +@@ -2408,6 +2525,11 @@ SYSCALL_DEFINE3(symlinkat, const char __ + if (IS_ERR(dentry)) + goto out_unlock; + ++ if (!gr_acl_handle_symlink(dentry, nd.path.dentry, nd.path.mnt, from)) { ++ error = -EACCES; ++ goto out_dput; ++ } ++ + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; +@@ -2415,6 +2537,8 @@ SYSCALL_DEFINE3(symlinkat, const char __ + if (error) + goto out_drop_write; + error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); ++ if (!error) ++ gr_handle_create(dentry, nd.path.mnt); + out_drop_write: + mnt_drop_write(nd.path.mnt); + out_dput: +@@ -2508,6 +2632,20 @@ SYSCALL_DEFINE5(linkat, int, olddfd, con + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto out_unlock; ++ ++ if (gr_handle_hardlink(old_path.dentry, old_path.mnt, ++ old_path.dentry->d_inode, ++ old_path.dentry->d_inode->i_mode, to)) { ++ error = -EACCES; ++ goto out_dput; ++ } ++ ++ if (!gr_acl_handle_link(new_dentry, nd.path.dentry, nd.path.mnt, ++ old_path.dentry, old_path.mnt, to)) { ++ error = -EACCES; ++ goto out_dput; ++ } ++ + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; +@@ -2515,6 +2653,8 @@ SYSCALL_DEFINE5(linkat, int, olddfd, con + if (error) + goto out_drop_write; + error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); ++ if (!error) ++ gr_handle_create(new_dentry, nd.path.mnt); + out_drop_write: + mnt_drop_write(nd.path.mnt); + out_dput: +@@ -2748,6 +2888,12 @@ SYSCALL_DEFINE4(renameat, int, olddfd, c + if (new_dentry == trap) + goto exit5; + ++ error = gr_acl_handle_rename(new_dentry, new_dir, newnd.path.mnt, ++ old_dentry, old_dir->d_inode, oldnd.path.mnt, ++ to); ++ if (error) ++ goto exit5; ++ + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit5; +@@ -2757,6 +2903,9 @@ SYSCALL_DEFINE4(renameat, int, olddfd, c + goto exit6; + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); ++ if (!error) ++ gr_handle_rename(old_dir->d_inode, new_dir->d_inode, old_dentry, ++ new_dentry, oldnd.path.mnt, new_dentry->d_inode ? 
1 : 0); + exit6: + mnt_drop_write(oldnd.path.mnt); + exit5: +diff -urNp linux-2.6.33.1/fs/namespace.c linux-2.6.33.1/fs/namespace.c +--- linux-2.6.33.1/fs/namespace.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/namespace.c 2010-03-20 16:58:41.649168726 -0400 +@@ -1085,6 +1085,9 @@ static int do_umount(struct vfsmount *mn + if (!(sb->s_flags & MS_RDONLY)) + retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); + up_write(&sb->s_umount); ++ ++ gr_log_remount(mnt->mnt_devname, retval); ++ + return retval; + } + +@@ -1106,6 +1109,9 @@ static int do_umount(struct vfsmount *mn + security_sb_umount_busy(mnt); + up_write(&namespace_sem); + release_mounts(&umount_list); ++ ++ gr_log_unmount(mnt->mnt_devname, retval); ++ + return retval; + } + +@@ -1963,6 +1969,16 @@ long do_mount(char *dev_name, char *dir_ + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); + ++ if (gr_handle_rofs_mount(path.dentry, path.mnt, mnt_flags)) { ++ retval = -EPERM; ++ goto dput_out; ++ } ++ ++ if (gr_handle_chroot_mount(path.dentry, path.mnt, dev_name)) { ++ retval = -EPERM; ++ goto dput_out; ++ } ++ + if (flags & MS_REMOUNT) + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, + data_page); +@@ -1977,6 +1993,9 @@ long do_mount(char *dev_name, char *dir_ + dev_name, data_page); + dput_out: + path_put(&path); ++ ++ gr_log_mount(dev_name, dir_name, retval); ++ + return retval; + } + +@@ -2183,6 +2202,12 @@ SYSCALL_DEFINE2(pivot_root, const char _ + goto out1; + } + ++ if (gr_handle_chroot_pivot()) { ++ error = -EPERM; ++ path_put(&old); ++ goto out1; ++ } ++ + read_lock(&current->fs->lock); + root = current->fs->root; + path_get(&current->fs->root); +diff -urNp linux-2.6.33.1/fs/nfs/inode.c linux-2.6.33.1/fs/nfs/inode.c +--- linux-2.6.33.1/fs/nfs/inode.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/nfs/inode.c 2010-03-20 16:58:41.649168726 -0400 +@@ -965,16 +965,16 @@ static int nfs_size_need_update(const st + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); + } + +-static atomic_long_t nfs_attr_generation_counter; ++static atomic_long_unchecked_t nfs_attr_generation_counter; + + static unsigned long nfs_read_attr_generation_counter(void) + { +- return atomic_long_read(&nfs_attr_generation_counter); ++ return atomic_long_read_unchecked(&nfs_attr_generation_counter); + } + + unsigned long nfs_inc_attr_generation_counter(void) + { +- return atomic_long_inc_return(&nfs_attr_generation_counter); ++ return atomic_long_inc_return_unchecked(&nfs_attr_generation_counter); + } + + void nfs_fattr_init(struct nfs_fattr *fattr) +diff -urNp linux-2.6.33.1/fs/nfs/nfs4proc.c linux-2.6.33.1/fs/nfs/nfs4proc.c +--- linux-2.6.33.1/fs/nfs/nfs4proc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/nfs/nfs4proc.c 2010-03-20 16:58:41.652911113 -0400 +@@ -1159,7 +1159,7 @@ static int _nfs4_do_open_reclaim(struct + static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) + { + struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = _nfs4_do_open_reclaim(ctx, state); +@@ -1201,7 +1201,7 @@ static int _nfs4_open_delegation_recall( + + int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + struct nfs_server *server = NFS_SERVER(state->inode); + int err; + do { +@@ -1572,7 +1572,7 @@ static
int _nfs4_open_expired(struct nfs + static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) + { + struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + do { +@@ -1678,7 +1678,7 @@ out_err: + + static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + struct nfs4_state *res; + int status; + +@@ -1769,7 +1769,7 @@ static int nfs4_do_setattr(struct inode + struct nfs4_state *state) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(server, +@@ -2146,7 +2146,7 @@ static int _nfs4_server_capabilities(str + + int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(server, +@@ -2180,7 +2180,7 @@ static int _nfs4_lookup_root(struct nfs_ + static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(server, +@@ -2269,7 +2269,7 @@ static int _nfs4_proc_getattr(struct nfs + + static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(server, +@@ -2357,7 +2357,7 @@ static int nfs4_proc_lookupfh(struct nfs + struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); +@@ -2386,7 +2386,7 @@ static int _nfs4_proc_lookup(struct inod + + static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), +@@ -2450,7 +2450,7 @@ static int _nfs4_proc_access(struct inod + + static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), +@@ -2506,7 +2506,7 @@ static int _nfs4_proc_readlink(struct in + static int nfs4_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), +@@ -2604,7 +2604,7 @@ static int _nfs4_proc_remove(struct inod + + static int nfs4_proc_remove(struct inode *dir, struct qstr *name) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), +@@ -2677,7 +2677,7 @@ static int _nfs4_proc_rename(struct inod + static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, + 
struct inode *new_dir, struct qstr *new_name) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(old_dir), +@@ -2724,7 +2724,7 @@ static int _nfs4_proc_link(struct inode + + static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), +@@ -2816,7 +2816,7 @@ out: + static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), +@@ -2847,7 +2847,7 @@ out: + static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), +@@ -2896,7 +2896,7 @@ static int _nfs4_proc_readdir(struct den + static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page *page, unsigned int count, int plus) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), +@@ -2944,7 +2944,7 @@ out: + static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, dev_t rdev) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), +@@ -2976,7 +2976,7 @@ static int _nfs4_proc_statfs(struct nfs_ + + static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(server, +@@ -3007,7 +3007,7 @@ static int _nfs4_do_fsinfo(struct nfs_se + + static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + do { +@@ -3053,7 +3053,7 @@ static int _nfs4_proc_pathconf(struct nf + static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *pathconf) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + do { +@@ -3348,7 +3348,7 @@ out_free: + + static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + ssize_t ret; + do { + ret = __nfs4_get_acl_uncached(inode, buf, buflen); +@@ -3404,7 +3404,7 @@ static int __nfs4_proc_set_acl(struct in + + static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), +@@ -3686,7 +3686,7 @@ out: + int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; 
+ do { + err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); +@@ -3759,7 +3759,7 @@ out: + + static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + do { +@@ -4171,7 +4171,7 @@ static int _nfs4_do_setlk(struct nfs4_st + static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) + { + struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + do { +@@ -4189,7 +4189,7 @@ static int nfs4_lock_reclaim(struct nfs4 + static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) + { + struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + err = nfs4_set_lock_state(state, request); +@@ -4253,7 +4253,7 @@ out: + + static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) + { +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + do { +@@ -4313,7 +4313,7 @@ nfs4_proc_lock(struct file *filp, int cm + int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) + { + struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; ++ struct nfs4_exception exception = {0, 0}; + int err; + + err = nfs4_set_lock_state(state, fl); +diff -urNp linux-2.6.33.1/fs/nfsd/lockd.c linux-2.6.33.1/fs/nfsd/lockd.c +--- linux-2.6.33.1/fs/nfsd/lockd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/nfsd/lockd.c 2010-03-20 16:58:41.660926393 -0400 +@@ -61,7 +61,7 @@ nlm_fclose(struct file *filp) + fput(filp); + } + +-static struct nlmsvc_binding nfsd_nlm_ops = { ++static const struct nlmsvc_binding nfsd_nlm_ops = { + .fopen = nlm_fopen, /* open file for locking */ + .fclose = nlm_fclose, /* close file */ + }; +diff -urNp linux-2.6.33.1/fs/nfsd/nfsctl.c linux-2.6.33.1/fs/nfsd/nfsctl.c +--- linux-2.6.33.1/fs/nfsd/nfsctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/nfsd/nfsctl.c 2010-03-20 16:58:41.676914510 -0400 +@@ -159,7 +159,7 @@ static int export_features_open(struct i + return single_open(file, export_features_show, NULL); + } + +-static struct file_operations export_features_operations = { ++static const struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, +diff -urNp linux-2.6.33.1/fs/nfsd/vfs.c linux-2.6.33.1/fs/nfsd/vfs.c +--- linux-2.6.33.1/fs/nfsd/vfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/nfsd/vfs.c 2010-03-20 16:58:41.676914510 -0400 +@@ -945,7 +945,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st + } else { + oldfs = get_fs(); + set_fs(KERNEL_DS); +- host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); ++ host_err = vfs_readv(file, (__force struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + } + +@@ -1068,7 +1068,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, s + + /* Write the data. 
*/ + oldfs = get_fs(); set_fs(KERNEL_DS); +- host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); ++ host_err = vfs_writev(file, (__force struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + if (host_err < 0) + goto out_nfserr; +@@ -1543,7 +1543,7 @@ nfsd_readlink(struct svc_rqst *rqstp, st + */ + + oldfs = get_fs(); set_fs(KERNEL_DS); +- host_err = inode->i_op->readlink(dentry, buf, *lenp); ++ host_err = inode->i_op->readlink(dentry, (__force char __user *)buf, *lenp); + set_fs(oldfs); + + if (host_err < 0) +diff -urNp linux-2.6.33.1/fs/nls/nls_base.c linux-2.6.33.1/fs/nls/nls_base.c +--- linux-2.6.33.1/fs/nls/nls_base.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/nls/nls_base.c 2010-03-20 16:58:41.684912030 -0400 +@@ -41,7 +41,7 @@ static const struct utf8_table utf8_tabl + {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, + {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, + {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, +- {0, /* end of table */} ++ {0, 0, 0, 0, 0, /* end of table */} + }; + + #define UNICODE_MAX 0x0010ffff +diff -urNp linux-2.6.33.1/fs/ntfs/file.c linux-2.6.33.1/fs/ntfs/file.c +--- linux-2.6.33.1/fs/ntfs/file.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ntfs/file.c 2010-03-20 16:58:41.717206210 -0400 +@@ -2243,6 +2243,6 @@ const struct inode_operations ntfs_file_ + #endif /* NTFS_RW */ + }; + +-const struct file_operations ntfs_empty_file_ops = {}; ++const struct file_operations ntfs_empty_file_ops __read_only; + +-const struct inode_operations ntfs_empty_inode_ops = {}; ++const struct inode_operations ntfs_empty_inode_ops __read_only; +diff -urNp linux-2.6.33.1/fs/ocfs2/cluster/masklog.c linux-2.6.33.1/fs/ocfs2/cluster/masklog.c +--- linux-2.6.33.1/fs/ocfs2/cluster/masklog.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ocfs2/cluster/masklog.c 2010-03-20 16:58:41.724925424 -0400 +@@ -135,7 +135,7 @@ static ssize_t mlog_store(struct kobject + return mlog_mask_store(mlog_attr->mask, buf, count); + } + +-static struct sysfs_ops mlog_attr_ops = { ++static const struct sysfs_ops mlog_attr_ops = { + .show = mlog_show, + .store = mlog_store, + }; +diff -urNp linux-2.6.33.1/fs/ocfs2/localalloc.c linux-2.6.33.1/fs/ocfs2/localalloc.c +--- linux-2.6.33.1/fs/ocfs2/localalloc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ocfs2/localalloc.c 2010-03-20 16:58:41.736921327 -0400 +@@ -1188,7 +1188,7 @@ static int ocfs2_local_alloc_slide_windo + goto bail; + } + +- atomic_inc(&osb->alloc_stats.moves); ++ atomic_inc_unchecked(&osb->alloc_stats.moves); + + status = 0; + bail: +diff -urNp linux-2.6.33.1/fs/ocfs2/ocfs2.h linux-2.6.33.1/fs/ocfs2/ocfs2.h +--- linux-2.6.33.1/fs/ocfs2/ocfs2.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ocfs2/ocfs2.h 2010-03-20 16:58:41.753416780 -0400 +@@ -221,11 +221,11 @@ enum ocfs2_vol_state + + struct ocfs2_alloc_stats + { +- atomic_t moves; +- atomic_t local_data; +- atomic_t bitmap_data; +- atomic_t bg_allocs; +- atomic_t bg_extends; ++ atomic_unchecked_t moves; ++ atomic_unchecked_t local_data; ++ atomic_unchecked_t bitmap_data; ++ atomic_unchecked_t bg_allocs; ++ atomic_unchecked_t bg_extends; + }; + + enum ocfs2_local_alloc_state +diff -urNp linux-2.6.33.1/fs/ocfs2/suballoc.c linux-2.6.33.1/fs/ocfs2/suballoc.c +--- linux-2.6.33.1/fs/ocfs2/suballoc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ocfs2/suballoc.c 2010-03-20 16:58:41.773443597 -0400 +@@ -620,7 +620,7 @@ static int 
ocfs2_reserve_suballoc_bits(s + mlog_errno(status); + goto bail; + } +- atomic_inc(&osb->alloc_stats.bg_extends); ++ atomic_inc_unchecked(&osb->alloc_stats.bg_extends); + + /* You should never ask for this much metadata */ + BUG_ON(bits_wanted > +@@ -1651,7 +1651,7 @@ int ocfs2_claim_metadata(struct ocfs2_su + mlog_errno(status); + goto bail; + } +- atomic_inc(&osb->alloc_stats.bg_allocs); ++ atomic_inc_unchecked(&osb->alloc_stats.bg_allocs); + + *blkno_start = bg_blkno + (u64) *suballoc_bit_start; + ac->ac_bits_given += (*num_bits); +@@ -1725,7 +1725,7 @@ int ocfs2_claim_new_inode(struct ocfs2_s + mlog_errno(status); + goto bail; + } +- atomic_inc(&osb->alloc_stats.bg_allocs); ++ atomic_inc_unchecked(&osb->alloc_stats.bg_allocs); + + BUG_ON(num_bits != 1); + +@@ -1827,7 +1827,7 @@ int __ocfs2_claim_clusters(struct ocfs2_ + cluster_start, + num_clusters); + if (!status) +- atomic_inc(&osb->alloc_stats.local_data); ++ atomic_inc_unchecked(&osb->alloc_stats.local_data); + } else { + if (min_clusters > (osb->bitmap_cpg - 1)) { + /* The only paths asking for contiguousness +@@ -1855,7 +1855,7 @@ int __ocfs2_claim_clusters(struct ocfs2_ + ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, + bg_blkno, + bg_bit_off); +- atomic_inc(&osb->alloc_stats.bitmap_data); ++ atomic_inc_unchecked(&osb->alloc_stats.bitmap_data); + } + } + if (status < 0) { +diff -urNp linux-2.6.33.1/fs/ocfs2/super.c linux-2.6.33.1/fs/ocfs2/super.c +--- linux-2.6.33.1/fs/ocfs2/super.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ocfs2/super.c 2010-03-20 16:58:41.781934851 -0400 +@@ -286,11 +286,11 @@ static int ocfs2_osb_dump(struct ocfs2_s + "%10s => GlobalAllocs: %d LocalAllocs: %d " + "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", + "Stats", +- atomic_read(&osb->alloc_stats.bitmap_data), +- atomic_read(&osb->alloc_stats.local_data), +- atomic_read(&osb->alloc_stats.bg_allocs), +- atomic_read(&osb->alloc_stats.moves), +- atomic_read(&osb->alloc_stats.bg_extends)); ++ atomic_read_unchecked(&osb->alloc_stats.bitmap_data), ++ atomic_read_unchecked(&osb->alloc_stats.local_data), ++ atomic_read_unchecked(&osb->alloc_stats.bg_allocs), ++ atomic_read_unchecked(&osb->alloc_stats.moves), ++ atomic_read_unchecked(&osb->alloc_stats.bg_extends)); + + out += snprintf(buf + out, len - out, + "%10s => State: %u Descriptor: %llu Size: %u bits " +@@ -1999,11 +1999,11 @@ static int ocfs2_initialize_super(struct + spin_lock_init(&osb->osb_xattr_lock); + ocfs2_init_inode_steal_slot(osb); + +- atomic_set(&osb->alloc_stats.moves, 0); +- atomic_set(&osb->alloc_stats.local_data, 0); +- atomic_set(&osb->alloc_stats.bitmap_data, 0); +- atomic_set(&osb->alloc_stats.bg_allocs, 0); +- atomic_set(&osb->alloc_stats.bg_extends, 0); ++ atomic_set_unchecked(&osb->alloc_stats.moves, 0); ++ atomic_set_unchecked(&osb->alloc_stats.local_data, 0); ++ atomic_set_unchecked(&osb->alloc_stats.bitmap_data, 0); ++ atomic_set_unchecked(&osb->alloc_stats.bg_allocs, 0); ++ atomic_set_unchecked(&osb->alloc_stats.bg_extends, 0); + + /* Copy the blockcheck stats from the superblock probe */ + osb->osb_ecc_stats = *stats; +diff -urNp linux-2.6.33.1/fs/ocfs2/symlink.c linux-2.6.33.1/fs/ocfs2/symlink.c +--- linux-2.6.33.1/fs/ocfs2/symlink.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/ocfs2/symlink.c 2010-03-20 16:58:41.809220780 -0400 +@@ -148,7 +148,7 @@ bail: + + static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) + { +- char *link = nd_get_link(nd); ++ const char *link = nd_get_link(nd); + if (!IS_ERR(link)) + 
kfree(link); + } +diff -urNp linux-2.6.33.1/fs/open.c linux-2.6.33.1/fs/open.c +--- linux-2.6.33.1/fs/open.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/open.c 2010-03-20 16:58:41.824559234 -0400 +@@ -209,6 +209,9 @@ int do_truncate(struct dentry *dentry, l + if (length < 0) + return -EINVAL; + ++ if (filp && !gr_acl_handle_truncate(dentry, filp->f_path.mnt)) ++ return -EACCES; ++ + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | time_attrs; + if (filp) { +@@ -514,6 +517,9 @@ SYSCALL_DEFINE3(faccessat, int, dfd, con + if (__mnt_is_readonly(path.mnt)) + res = -EROFS; + ++ if (!res && !gr_acl_handle_access(path.dentry, path.mnt, mode)) ++ res = -EACCES; ++ + out_path_release: + path_put(&path); + out: +@@ -540,6 +546,8 @@ SYSCALL_DEFINE1(chdir, const char __user + if (error) + goto dput_and_out; + ++ gr_log_chdir(path.dentry, path.mnt); ++ + set_fs_pwd(current->fs, &path); + + dput_and_out: +@@ -566,6 +574,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd + goto out_putf; + + error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); ++ ++ if (!error && !gr_chroot_fchdir(file->f_path.dentry, file->f_path.mnt)) ++ error = -EPERM; ++ ++ if (!error) ++ gr_log_chdir(file->f_path.dentry, file->f_path.mnt); ++ + if (!error) + set_fs_pwd(current->fs, &file->f_path); + out_putf: +@@ -594,7 +609,18 @@ SYSCALL_DEFINE1(chroot, const char __use + if (error) + goto dput_and_out; + ++ if (gr_handle_chroot_chroot(path.dentry, path.mnt)) ++ goto dput_and_out; ++ ++ if (gr_handle_chroot_caps(&path)) { ++ error = -ENOMEM; ++ goto dput_and_out; ++ } ++ + set_fs_root(current->fs, &path); ++ ++ gr_handle_chroot_chdir(&path); ++ + error = 0; + dput_and_out: + path_put(&path); +@@ -622,6 +648,12 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd + err = mnt_want_write_file(file); + if (err) + goto out_putf; ++ ++ if (!gr_acl_handle_fchmod(dentry, file->f_path.mnt, mode)) { ++ err = -EACCES; ++ goto out_drop_write; ++ } ++ + mutex_lock(&inode->i_mutex); + err = security_path_chmod(dentry, file->f_vfsmnt, mode); + if (err) +@@ -633,6 +665,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd + err = notify_change(dentry, &newattrs); + out_unlock: + mutex_unlock(&inode->i_mutex); ++out_drop_write: + mnt_drop_write(file->f_path.mnt); + out_putf: + fput(file); +@@ -655,17 +688,30 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, cons + error = mnt_want_write(path.mnt); + if (error) + goto dput_and_out; ++ ++ if (!gr_acl_handle_chmod(path.dentry, path.mnt, mode)) { ++ error = -EACCES; ++ goto out_drop_write; ++ } ++ + mutex_lock(&inode->i_mutex); + error = security_path_chmod(path.dentry, path.mnt, mode); + if (error) + goto out_unlock; + if (mode == (mode_t) -1) + mode = inode->i_mode; ++ ++ if (gr_handle_chroot_chmod(path.dentry, path.mnt, mode)) { ++ error = -EACCES; ++ goto out_unlock; ++ } ++ + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + error = notify_change(path.dentry, &newattrs); + out_unlock: + mutex_unlock(&inode->i_mutex); ++out_drop_write: + mnt_drop_write(path.mnt); + dput_and_out: + path_put(&path); +@@ -684,6 +730,9 @@ static int chown_common(struct path *pat + int error; + struct iattr newattrs; + ++ if (!gr_acl_handle_chown(path->dentry, path->mnt)) ++ return -EACCES; ++ + newattrs.ia_valid = ATTR_CTIME; + if (user != (uid_t) -1) { + newattrs.ia_valid |= ATTR_UID; +diff -urNp linux-2.6.33.1/fs/pipe.c linux-2.6.33.1/fs/pipe.c +--- linux-2.6.33.1/fs/pipe.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/pipe.c 2010-03-20 
16:58:41.824559234 -0400 +@@ -401,9 +401,9 @@ redo: + } + if (bufs) /* More to do? */ + continue; +- if (!pipe->writers) ++ if (!atomic_read(&pipe->writers)) + break; +- if (!pipe->waiting_writers) { ++ if (!atomic_read(&pipe->waiting_writers)) { + /* syscall merging: Usually we must not sleep + * if O_NONBLOCK is set, or if we got some data. + * But if a writer sleeps in kernel space, then +@@ -462,7 +462,7 @@ pipe_write(struct kiocb *iocb, const str + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + +- if (!pipe->readers) { ++ if (!atomic_read(&pipe->readers)) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + goto out; +@@ -511,7 +511,7 @@ redo1: + for (;;) { + int bufs; + +- if (!pipe->readers) { ++ if (!atomic_read(&pipe->readers)) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; +@@ -597,9 +597,9 @@ redo2: + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } +- pipe->waiting_writers++; ++ atomic_inc(&pipe->waiting_writers); + pipe_wait(pipe); +- pipe->waiting_writers--; ++ atomic_dec(&pipe->waiting_writers); + } + out: + mutex_unlock(&inode->i_mutex); +@@ -666,7 +666,7 @@ pipe_poll(struct file *filp, poll_table + mask = 0; + if (filp->f_mode & FMODE_READ) { + mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; +- if (!pipe->writers && filp->f_version != pipe->w_counter) ++ if (!atomic_read(&pipe->writers) && filp->f_version != pipe->w_counter) + mask |= POLLHUP; + } + +@@ -676,7 +676,7 @@ pipe_poll(struct file *filp, poll_table + * Most Unices do not set POLLERR for FIFOs but on Linux they + * behave exactly like pipes for poll(). + */ +- if (!pipe->readers) ++ if (!atomic_read(&pipe->readers)) + mask |= POLLERR; + } + +@@ -690,10 +690,10 @@ pipe_release(struct inode *inode, int de + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; +- pipe->readers -= decr; +- pipe->writers -= decw; ++ atomic_sub(decr, &pipe->readers); ++ atomic_sub(decw, &pipe->writers); + +- if (!pipe->readers && !pipe->writers) { ++ if (!atomic_read(&pipe->readers) && !atomic_read(&pipe->writers)) { + free_pipe_info(inode); + } else { + wake_up_interruptible_sync(&pipe->wait); +@@ -783,7 +783,7 @@ pipe_read_open(struct inode *inode, stru + + if (inode->i_pipe) { + ret = 0; +- inode->i_pipe->readers++; ++ atomic_inc(&inode->i_pipe->readers); + } + + mutex_unlock(&inode->i_mutex); +@@ -800,7 +800,7 @@ pipe_write_open(struct inode *inode, str + + if (inode->i_pipe) { + ret = 0; +- inode->i_pipe->writers++; ++ atomic_inc(&inode->i_pipe->writers); + } + + mutex_unlock(&inode->i_mutex); +@@ -818,9 +818,9 @@ pipe_rdwr_open(struct inode *inode, stru + if (inode->i_pipe) { + ret = 0; + if (filp->f_mode & FMODE_READ) +- inode->i_pipe->readers++; ++ atomic_inc(&inode->i_pipe->readers); + if (filp->f_mode & FMODE_WRITE) +- inode->i_pipe->writers++; ++ atomic_inc(&inode->i_pipe->writers); + } + + mutex_unlock(&inode->i_mutex); +@@ -905,7 +905,7 @@ void free_pipe_info(struct inode *inode) + inode->i_pipe = NULL; + } + +-static struct vfsmount *pipe_mnt __read_mostly; ++struct vfsmount *pipe_mnt __read_mostly; + + /* + * pipefs_dname() is called from d_path(). 
+@@ -933,7 +933,8 @@ static struct inode * get_pipe_inode(voi + goto fail_iput; + inode->i_pipe = pipe; + +- pipe->readers = pipe->writers = 1; ++ atomic_set(&pipe->readers, 1); ++ atomic_set(&pipe->writers, 1); + inode->i_fop = &rdwr_pipefifo_fops; + + /* +diff -urNp linux-2.6.33.1/fs/proc/array.c linux-2.6.33.1/fs/proc/array.c +--- linux-2.6.33.1/fs/proc/array.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/array.c 2010-03-20 16:58:41.824559234 -0400 +@@ -337,6 +337,21 @@ static void task_cpus_allowed(struct seq + seq_printf(m, "\n"); + } + ++#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) ++static inline void task_pax(struct seq_file *m, struct task_struct *p) ++{ ++ if (p->mm) ++ seq_printf(m, "PaX:\t%c%c%c%c%c\n", ++ p->mm->pax_flags & MF_PAX_PAGEEXEC ? 'P' : 'p', ++ p->mm->pax_flags & MF_PAX_EMUTRAMP ? 'E' : 'e', ++ p->mm->pax_flags & MF_PAX_MPROTECT ? 'M' : 'm', ++ p->mm->pax_flags & MF_PAX_RANDMMAP ? 'R' : 'r', ++ p->mm->pax_flags & MF_PAX_SEGMEXEC ? 'S' : 's'); ++ else ++ seq_printf(m, "PaX:\t-----\n"); ++} ++#endif ++ + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) + { +@@ -357,9 +372,20 @@ int proc_pid_status(struct seq_file *m, + task_show_regs(m, task); + #endif + task_context_switch_counts(m, task); ++ ++#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) ++ task_pax(m, task); ++#endif ++ + return 0; + } + ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++#define PAX_RAND_FLAGS(_mm) (_mm != NULL && _mm != current->mm && \ ++ (_mm->pax_flags & MF_PAX_RANDMMAP || \ ++ _mm->pax_flags & MF_PAX_SEGMEXEC)) ++#endif ++ + static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task, int whole) + { +@@ -452,6 +478,19 @@ static int do_task_stat(struct seq_file + gtime = task->gtime; + } + ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ if (PAX_RAND_FLAGS(mm)) { ++ eip = 0; ++ esp = 0; ++ wchan = 0; ++ } ++#endif ++#ifdef CONFIG_GRKERNSEC_HIDESYM ++ wchan = 0; ++ eip =0; ++ esp =0; ++#endif ++ + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task_prio(task); +@@ -492,9 +531,15 @@ static int do_task_stat(struct seq_file + vsize, + mm ? get_mm_rss(mm) : 0, + rsslim, ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ PAX_RAND_FLAGS(mm) ? 1 : (mm ? mm->start_code : 0), ++ PAX_RAND_FLAGS(mm) ? 1 : (mm ? mm->end_code : 0), ++ PAX_RAND_FLAGS(mm) ? 0 : ((permitted && mm) ? task->stack_start : 0), ++#else + mm ? mm->start_code : 0, + mm ? mm->end_code : 0, + (permitted && mm) ? task->stack_start : 0, ++#endif + esp, + eip, + /* The signal information here is obsolete. 
+@@ -547,3 +592,10 @@ int proc_pid_statm(struct seq_file *m, s + + return 0; + } ++ ++#ifdef CONFIG_GRKERNSEC_PROC_IPADDR ++int proc_pid_ipaddr(struct task_struct *task, char *buffer) ++{ ++ return sprintf(buffer, "%pI4\n", &task->signal->curr_ip); ++} ++#endif +diff -urNp linux-2.6.33.1/fs/proc/base.c linux-2.6.33.1/fs/proc/base.c +--- linux-2.6.33.1/fs/proc/base.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/base.c 2010-03-20 16:58:41.824559234 -0400 +@@ -102,6 +102,22 @@ struct pid_entry { + union proc_op op; + }; + ++struct getdents_callback { ++ struct linux_dirent __user * current_dir; ++ struct linux_dirent __user * previous; ++ struct file * file; ++ int count; ++ int error; ++}; ++ ++static int gr_fake_filldir(void * __buf, const char *name, int namlen, ++ loff_t offset, u64 ino, unsigned int d_type) ++{ ++ struct getdents_callback * buf = (struct getdents_callback *) __buf; ++ buf->error = -EINVAL; ++ return 0; ++} ++ + #define NOD(NAME, MODE, IOP, FOP, OP) { \ + .name = (NAME), \ + .len = sizeof(NAME) - 1, \ +@@ -213,6 +229,9 @@ static int check_mem_permission(struct t + if (task == current) + return 0; + ++ if (gr_handle_proc_ptrace(task) || gr_acl_handle_procpidmem(task)) ++ return -EPERM; ++ + /* + * If current is actively ptrace'ing, and would also be + * permitted to freshly attach with ptrace now, permit it. +@@ -260,6 +279,9 @@ static int proc_pid_cmdline(struct task_ + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + ++ if (gr_acl_handle_procpidmem(task)) ++ goto out_mm; ++ + len = mm->arg_end - mm->arg_start; + + if (len > PAGE_SIZE) +@@ -287,12 +309,26 @@ out: + return res; + } + ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++#define PAX_RAND_FLAGS(_mm) (_mm != NULL && _mm != current->mm && \ ++ (_mm->pax_flags & MF_PAX_RANDMMAP || \ ++ _mm->pax_flags & MF_PAX_SEGMEXEC)) ++#endif ++ + static int proc_pid_auxv(struct task_struct *task, char *buffer) + { + int res = 0; + struct mm_struct *mm = get_task_mm(task); + if (mm) { + unsigned int nwords = 0; ++ ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ if (PAX_RAND_FLAGS(mm)) { ++ mmput(mm); ++ return res; ++ } ++#endif ++ + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ +@@ -328,7 +364,7 @@ static int proc_pid_wchan(struct task_st + } + #endif /* CONFIG_KALLSYMS */ + +-#ifdef CONFIG_STACKTRACE ++#if defined(CONFIG_STACKTRACE) && !defined(CONFIG_GRKERNSEC_HIDESYM) + + #define MAX_STACK_TRACE_DEPTH 64 + +@@ -521,7 +557,7 @@ static int proc_pid_limits(struct task_s + return count; + } + +-#ifdef CONFIG_HAVE_ARCH_TRACEHOOK ++#if defined(CONFIG_HAVE_ARCH_TRACEHOOK) && !defined(CONFIG_GRKERNSEC_PROC_MEMMAP) + static int proc_pid_syscall(struct task_struct *task, char *buffer) + { + long nr; +@@ -935,6 +971,9 @@ static ssize_t environ_read(struct file + if (!task) + goto out_no_task; + ++ if (gr_acl_handle_procpidmem(task)) ++ goto out; ++ + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + +@@ -1520,7 +1559,11 @@ static struct inode *proc_pid_make_inode + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; ++#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP ++ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; ++#else + inode->i_gid = cred->egid; ++#endif + rcu_read_unlock(); + } + security_task_to_inode(task, inode); +@@ -1538,6 +1581,9 @@ static int pid_getattr(struct vfsmount * + struct inode *inode = dentry->d_inode; + struct task_struct *task; + const struct cred *cred; ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || 
defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ const struct cred *tmpcred = current_cred(); ++#endif + + generic_fillattr(inode, stat); + +@@ -1545,12 +1591,34 @@ static int pid_getattr(struct vfsmount * + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); ++ ++ if (task && (gr_pid_is_chrooted(task) || gr_check_hidden_task(task))) { ++ rcu_read_unlock(); ++ return -ENOENT; ++ } ++ + if (task) { ++ cred = __task_cred(task); ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ if (!tmpcred->uid || (tmpcred->uid == cred->uid) ++#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP ++ || in_group_p(CONFIG_GRKERNSEC_PROC_GID) ++#endif ++ ) ++#endif + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ (inode->i_mode == (S_IFDIR|S_IRUSR|S_IXUSR)) || ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ (inode->i_mode == (S_IFDIR|S_IRUSR|S_IRGRP|S_IXUSR|S_IXGRP)) || ++#endif + task_dumpable(task)) { +- cred = __task_cred(task); + stat->uid = cred->euid; ++#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP ++ stat->gid = CONFIG_GRKERNSEC_PROC_GID; ++#else + stat->gid = cred->egid; ++#endif + } + } + rcu_read_unlock(); +@@ -1582,11 +1650,20 @@ static int pid_revalidate(struct dentry + + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ (inode->i_mode == (S_IFDIR|S_IRUSR|S_IXUSR)) || ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ (inode->i_mode == (S_IFDIR|S_IRUSR|S_IRGRP|S_IXUSR|S_IXGRP)) || ++#endif + task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; ++#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP ++ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; ++#else + inode->i_gid = cred->egid; ++#endif + rcu_read_unlock(); + } else { + inode->i_uid = 0; +@@ -1707,7 +1784,8 @@ static int proc_fd_info(struct inode *in + int fd = proc_fd(inode); + + if (task) { +- files = get_files_struct(task); ++ if (!gr_acl_handle_procpidmem(task)) ++ files = get_files_struct(task); + put_task_struct(task); + } + if (files) { +@@ -1959,12 +2037,22 @@ static const struct file_operations proc + static int proc_fd_permission(struct inode *inode, int mask) + { + int rv; ++ struct task_struct *task; + + rv = generic_permission(inode, mask, NULL); +- if (rv == 0) +- return 0; ++ + if (task_pid(current) == proc_pid(inode)) + rv = 0; ++ ++ task = get_proc_task(inode); ++ if (task == NULL) ++ return rv; ++ ++ if (gr_acl_handle_procpidmem(task)) ++ rv = -EACCES; ++ ++ put_task_struct(task); ++ + return rv; + } + +@@ -2073,6 +2161,9 @@ static struct dentry *proc_pident_lookup + if (!task) + goto out_no_task; + ++ if (gr_pid_is_chrooted(task) || gr_check_hidden_task(task)) ++ goto out; ++ + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc/<tgid>/ without very good reasons. 
+@@ -2117,6 +2208,9 @@ static int proc_pident_readdir(struct fi + if (!task) + goto out_no_task; + ++ if (gr_pid_is_chrooted(task) || gr_check_hidden_task(task)) ++ goto out; ++ + ret = 0; + i = filp->f_pos; + switch (i) { +@@ -2384,7 +2478,7 @@ static void *proc_self_follow_link(struc + static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) + { +- char *s = nd_get_link(nd); ++ const char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); + } +@@ -2497,6 +2591,9 @@ static struct dentry *proc_base_lookup(s + if (p > last) + goto out; + ++ if (gr_pid_is_chrooted(task) || gr_check_hidden_task(task)) ++ goto out; ++ + error = proc_base_instantiate(dir, dentry, task, p); + + out: +@@ -2584,7 +2681,7 @@ static const struct pid_entry tgid_base_ + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), + #endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +-#ifdef CONFIG_HAVE_ARCH_TRACEHOOK ++#if defined(CONFIG_HAVE_ARCH_TRACEHOOK) && !defined(CONFIG_GRKERNSEC_PROC_MEMMAP) + INF("syscall", S_IRUSR, proc_pid_syscall), + #endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), +@@ -2612,7 +2709,7 @@ static const struct pid_entry tgid_base_ + #ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), + #endif +-#ifdef CONFIG_STACKTRACE ++#if defined(CONFIG_STACKTRACE) && !defined(CONFIG_GRKERNSEC_HIDESYM) + ONE("stack", S_IRUSR, proc_pid_stack), + #endif + #ifdef CONFIG_SCHEDSTATS +@@ -2642,6 +2739,9 @@ static const struct pid_entry tgid_base_ + #ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, proc_tgid_io_accounting), + #endif ++#ifdef CONFIG_GRKERNSEC_PROC_IPADDR ++ INF("ipaddr", S_IRUSR, proc_pid_ipaddr), ++#endif + }; + + static int proc_tgid_base_readdir(struct file * filp, +@@ -2766,7 +2866,14 @@ static struct dentry *proc_pid_instantia + if (!inode) + goto out; + ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ inode->i_mode = S_IFDIR|S_IRUSR|S_IXUSR; ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; ++ inode->i_mode = S_IFDIR|S_IRUSR|S_IRGRP|S_IXUSR|S_IXGRP; ++#else + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; ++#endif + inode->i_op = &proc_tgid_base_inode_operations; + inode->i_fop = &proc_tgid_base_operations; + inode->i_flags|=S_IMMUTABLE; +@@ -2808,7 +2915,11 @@ struct dentry *proc_pid_lookup(struct in + if (!task) + goto out; + ++ if (gr_check_hidden_task(task)) ++ goto out_put_task; ++ + result = proc_pid_instantiate(dir, dentry, task, NULL); ++out_put_task: + put_task_struct(task); + out: + return result; +@@ -2873,6 +2984,11 @@ int proc_pid_readdir(struct file * filp, + { + unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; + struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ const struct cred *tmpcred = current_cred(); ++ const struct cred *itercred; ++#endif ++ filldir_t __filldir = filldir; + struct tgid_iter iter; + struct pid_namespace *ns; + +@@ -2891,8 +3007,27 @@ int proc_pid_readdir(struct file * filp, + for (iter = next_tgid(ns, iter); + iter.task; + iter.tgid += 1, iter = next_tgid(ns, iter)) { ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ rcu_read_lock(); ++ itercred = __task_cred(iter.task); ++#endif ++ if (gr_pid_is_chrooted(iter.task) || gr_check_hidden_task(iter.task) ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ || (tmpcred->uid && (itercred->uid != tmpcred->uid) ++#ifdef 
CONFIG_GRKERNSEC_PROC_USERGROUP ++ && !in_group_p(CONFIG_GRKERNSEC_PROC_GID) ++#endif ++ ) ++#endif ++ ) ++ __filldir = &gr_fake_filldir; ++ else ++ __filldir = filldir; ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ rcu_read_unlock(); ++#endif + filp->f_pos = iter.tgid + TGID_OFFSET; +- if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { ++ if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { + put_task_struct(iter.task); + goto out; + } +@@ -2919,7 +3054,7 @@ static const struct pid_entry tid_base_s + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), + #endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +-#ifdef CONFIG_HAVE_ARCH_TRACEHOOK ++#if defined(CONFIG_HAVE_ARCH_TRACEHOOK) && !defined(CONFIG_GRKERNSEC_PROC_MEMMAP) + INF("syscall", S_IRUSR, proc_pid_syscall), + #endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), +@@ -2946,7 +3081,7 @@ static const struct pid_entry tid_base_s + #ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), + #endif +-#ifdef CONFIG_STACKTRACE ++#if defined(CONFIG_STACKTRACE) && !defined(CONFIG_GRKERNSEC_HIDESYM) + ONE("stack", S_IRUSR, proc_pid_stack), + #endif + #ifdef CONFIG_SCHEDSTATS +diff -urNp linux-2.6.33.1/fs/proc/cmdline.c linux-2.6.33.1/fs/proc/cmdline.c +--- linux-2.6.33.1/fs/proc/cmdline.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/cmdline.c 2010-03-20 16:58:41.828775433 -0400 +@@ -23,7 +23,11 @@ static const struct file_operations cmdl + + static int __init proc_cmdline_init(void) + { ++#ifdef CONFIG_GRKERNSEC_PROC_ADD ++ proc_create_grsec("cmdline", 0, NULL, &cmdline_proc_fops); ++#else + proc_create("cmdline", 0, NULL, &cmdline_proc_fops); ++#endif + return 0; + } + module_init(proc_cmdline_init); +diff -urNp linux-2.6.33.1/fs/proc/devices.c linux-2.6.33.1/fs/proc/devices.c +--- linux-2.6.33.1/fs/proc/devices.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/devices.c 2010-03-20 16:58:41.828775433 -0400 +@@ -64,7 +64,11 @@ static const struct file_operations proc + + static int __init proc_devices_init(void) + { ++#ifdef CONFIG_GRKERNSEC_PROC_ADD ++ proc_create_grsec("devices", 0, NULL, &proc_devinfo_operations); ++#else + proc_create("devices", 0, NULL, &proc_devinfo_operations); ++#endif + return 0; + } + module_init(proc_devices_init); +diff -urNp linux-2.6.33.1/fs/proc/inode.c linux-2.6.33.1/fs/proc/inode.c +--- linux-2.6.33.1/fs/proc/inode.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/inode.c 2010-03-20 16:58:41.828775433 -0400 +@@ -434,7 +434,11 @@ struct inode *proc_get_inode(struct supe + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; ++#ifdef CONFIG_GRKERNSEC_PROC_USERGROUP ++ inode->i_gid = CONFIG_GRKERNSEC_PROC_GID; ++#else + inode->i_gid = de->gid; ++#endif + } + if (de->size) + inode->i_size = de->size; +diff -urNp linux-2.6.33.1/fs/proc/internal.h linux-2.6.33.1/fs/proc/internal.h +--- linux-2.6.33.1/fs/proc/internal.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/internal.h 2010-03-20 16:58:41.828775433 -0400 +@@ -51,6 +51,9 @@ extern int proc_pid_status(struct seq_fi + struct pid *pid, struct task_struct *task); + extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); ++#ifdef CONFIG_GRKERNSEC_PROC_IPADDR ++extern int proc_pid_ipaddr(struct task_struct *task, char *buffer); ++#endif + extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); + + extern const struct 
file_operations proc_maps_operations; +diff -urNp linux-2.6.33.1/fs/proc/Kconfig linux-2.6.33.1/fs/proc/Kconfig +--- linux-2.6.33.1/fs/proc/Kconfig 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/Kconfig 2010-03-20 16:58:41.828775433 -0400 +@@ -30,12 +30,12 @@ config PROC_FS + + config PROC_KCORE + bool "/proc/kcore support" if !ARM +- depends on PROC_FS && MMU ++ depends on PROC_FS && MMU && !GRKERNSEC_PROC_ADD + + config PROC_VMCORE + bool "/proc/vmcore support (EXPERIMENTAL)" +- depends on PROC_FS && CRASH_DUMP +- default y ++ depends on PROC_FS && CRASH_DUMP && !GRKERNSEC ++ default n + help + Exports the dump image of crashed kernel in ELF format. + +@@ -59,8 +59,8 @@ config PROC_SYSCTL + limited in memory. + + config PROC_PAGE_MONITOR +- default y +- depends on PROC_FS && MMU ++ default n ++ depends on PROC_FS && MMU && !GRKERNSEC + bool "Enable /proc page monitoring" if EMBEDDED + help + Various /proc files exist to monitor process memory utilization: +diff -urNp linux-2.6.33.1/fs/proc/kcore.c linux-2.6.33.1/fs/proc/kcore.c +--- linux-2.6.33.1/fs/proc/kcore.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/kcore.c 2010-03-20 16:58:41.832562866 -0400 +@@ -541,6 +541,9 @@ read_kcore(struct file *file, char __use + + static int open_kcore(struct inode *inode, struct file *filp) + { ++#if defined(CONFIG_GRKERNSEC_PROC_ADD) || defined(CONFIG_GRKERNSEC_HIDESYM) ++ return -EPERM; ++#endif + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (kcore_need_update) +diff -urNp linux-2.6.33.1/fs/proc/meminfo.c linux-2.6.33.1/fs/proc/meminfo.c +--- linux-2.6.33.1/fs/proc/meminfo.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/meminfo.c 2010-03-20 16:58:41.832562866 -0400 +@@ -149,7 +149,7 @@ static int meminfo_proc_show(struct seq_ + vmi.used >> 10, + vmi.largest_chunk >> 10 + #ifdef CONFIG_MEMORY_FAILURE +- ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) ++ ,atomic_long_read_unchecked(&mce_bad_pages) << (PAGE_SHIFT - 10) + #endif + ); + +diff -urNp linux-2.6.33.1/fs/proc/nommu.c linux-2.6.33.1/fs/proc/nommu.c +--- linux-2.6.33.1/fs/proc/nommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/nommu.c 2010-03-20 16:58:41.832562866 -0400 +@@ -67,7 +67,7 @@ static int nommu_region_show(struct seq_ + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +- seq_path(m, &file->f_path, ""); ++ seq_path(m, &file->f_path, "\n\"); + } + + seq_putc(m, '\n'); +diff -urNp linux-2.6.33.1/fs/proc/proc_net.c linux-2.6.33.1/fs/proc/proc_net.c +--- linux-2.6.33.1/fs/proc/proc_net.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/proc_net.c 2010-03-20 16:58:41.832562866 -0400 +@@ -104,6 +104,17 @@ static struct net *get_proc_task_net(str + struct task_struct *task; + struct nsproxy *ns; + struct net *net = NULL; ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ const struct cred *cred = current_cred(); ++#endif ++ ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ if (cred->fsuid) ++ return net; ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ if (cred->fsuid && !in_group_p(CONFIG_GRKERNSEC_PROC_GID)) ++ return net; ++#endif + + rcu_read_lock(); + task = pid_task(proc_pid(dir), PIDTYPE_PID); +diff -urNp linux-2.6.33.1/fs/proc/proc_sysctl.c linux-2.6.33.1/fs/proc/proc_sysctl.c +--- linux-2.6.33.1/fs/proc/proc_sysctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/proc_sysctl.c 2010-03-20 16:58:41.832562866 -0400 +@@ -7,6 +7,8 @@ + #include <linux/security.h> + #include 
"internal.h" + ++extern __u32 gr_handle_sysctl(const struct ctl_table *table, const int op); ++ + static const struct dentry_operations proc_sys_dentry_operations; + static const struct file_operations proc_sys_file_operations; + static const struct inode_operations proc_sys_inode_operations; +@@ -109,6 +111,9 @@ static struct dentry *proc_sys_lookup(st + if (!p) + goto out; + ++ if (gr_handle_sysctl(p, MAY_EXEC)) ++ goto out; ++ + err = ERR_PTR(-ENOMEM); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (h) +@@ -228,6 +233,9 @@ static int scan(struct ctl_table_header + if (*pos < file->f_pos) + continue; + ++ if (gr_handle_sysctl(table, 0)) ++ continue; ++ + res = proc_sys_fill_cache(file, dirent, filldir, head, table); + if (res) + return res; +@@ -344,6 +352,9 @@ static int proc_sys_getattr(struct vfsmo + if (IS_ERR(head)) + return PTR_ERR(head); + ++ if (table && gr_handle_sysctl(table, MAY_EXEC)) ++ return -ENOENT; ++ + generic_fillattr(inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; +diff -urNp linux-2.6.33.1/fs/proc/root.c linux-2.6.33.1/fs/proc/root.c +--- linux-2.6.33.1/fs/proc/root.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/root.c 2010-03-20 16:58:41.836587389 -0400 +@@ -134,7 +134,15 @@ void __init proc_root_init(void) + #ifdef CONFIG_PROC_DEVICETREE + proc_device_tree_init(); + #endif ++#ifdef CONFIG_GRKERNSEC_PROC_ADD ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ proc_mkdir_mode("bus", S_IRUSR | S_IXUSR, NULL); ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ proc_mkdir_mode("bus", S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP, NULL); ++#endif ++#else + proc_mkdir("bus", NULL); ++#endif + proc_sys_init(); + } + +diff -urNp linux-2.6.33.1/fs/proc/task_mmu.c linux-2.6.33.1/fs/proc/task_mmu.c +--- linux-2.6.33.1/fs/proc/task_mmu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/task_mmu.c 2010-03-20 16:58:41.836587389 -0400 +@@ -46,15 +46,26 @@ void task_mem(struct seq_file *m, struct + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB\n" +- "VmPTE:\t%8lu kB\n", +- hiwater_vm << (PAGE_SHIFT-10), ++ "VmPTE:\t%8lu kB\n" ++ ++#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT ++ "CsBase:\t%8lx\nCsLim:\t%8lx\n" ++#endif ++ ++ ,hiwater_vm << (PAGE_SHIFT-10), + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + mm->locked_vm << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), + data << (PAGE_SHIFT-10), + mm->stack_vm << (PAGE_SHIFT-10), text, lib, +- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); ++ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10 ++ ++#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT ++ , mm->context.user_cs_base, mm->context.user_cs_limit ++#endif ++ ++ ); + } + + unsigned long task_vsize(struct mm_struct *mm) +@@ -199,6 +210,12 @@ static int do_maps_open(struct inode *in + return ret; + } + ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++#define PAX_RAND_FLAGS(_mm) (_mm != NULL && _mm != current->mm && \ ++ (_mm->pax_flags & MF_PAX_RANDMMAP || \ ++ _mm->pax_flags & MF_PAX_SEGMEXEC)) ++#endif ++ + static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) + { + struct mm_struct *mm = vma->vm_mm; +@@ -217,13 +234,22 @@ static void show_map_vma(struct seq_file + } + + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ PAX_RAND_FLAGS(mm) ? 0UL : vma->vm_start, ++ PAX_RAND_FLAGS(mm) ? 0UL : vma->vm_end, ++#else + vma->vm_start, + vma->vm_end, ++#endif + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 
'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? 's' : 'p', ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ PAX_RAND_FLAGS(mm) ? 0UL : pgoff, ++#else + pgoff, ++#endif + MAJOR(dev), MINOR(dev), ino, &len); + + /* +@@ -232,16 +258,16 @@ static void show_map_vma(struct seq_file + */ + if (file) { + pad_len_spaces(m, len); +- seq_path(m, &file->f_path, "\n"); ++ seq_path(m, &file->f_path, "\n\"); + } else { + const char *name = arch_vma_name(vma); + if (!name) { + if (mm) { +- if (vma->vm_start <= mm->start_brk && +- vma->vm_end >= mm->brk) { ++ if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + name = "[heap]"; +- } else if (vma->vm_start <= mm->start_stack && +- vma->vm_end >= mm->start_stack) { ++ } else if ((vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) || ++ (vma->vm_start <= mm->start_stack && ++ vma->vm_end >= mm->start_stack)) { + name = "[stack]"; + } else { + unsigned long stack_start; +@@ -402,9 +428,16 @@ static int show_smap(struct seq_file *m, + }; + + memset(&mss, 0, sizeof mss); +- mss.vma = vma; +- if (vma->vm_mm && !is_vm_hugetlb_page(vma)) +- walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); ++ ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ if (!PAX_RAND_FLAGS(vma->vm_mm)) { ++#endif ++ mss.vma = vma; ++ if (vma->vm_mm && !is_vm_hugetlb_page(vma)) ++ walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ } ++#endif + + show_map_vma(m, vma); + +@@ -420,7 +453,11 @@ static int show_smap(struct seq_file *m, + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n", ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ PAX_RAND_FLAGS(vma->vm_mm) ? 0UL : (vma->vm_end - vma->vm_start) >> 10, ++#else + (vma->vm_end - vma->vm_start) >> 10, ++#endif + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, +diff -urNp linux-2.6.33.1/fs/proc/task_nommu.c linux-2.6.33.1/fs/proc/task_nommu.c +--- linux-2.6.33.1/fs/proc/task_nommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/proc/task_nommu.c 2010-03-20 16:58:41.836587389 -0400 +@@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct + else + bytes += kobjsize(mm); + +- if (current->fs && current->fs->users > 1) ++ if (current->fs && atomic_read(¤t->fs->users) > 1) + sbytes += kobjsize(current->fs); + else + bytes += kobjsize(current->fs); +@@ -158,7 +158,7 @@ static int nommu_vma_show(struct seq_fil + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +- seq_path(m, &file->f_path, ""); ++ seq_path(m, &file->f_path, "\n\"); + } + + seq_putc(m, '\n'); +diff -urNp linux-2.6.33.1/fs/readdir.c linux-2.6.33.1/fs/readdir.c +--- linux-2.6.33.1/fs/readdir.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/readdir.c 2010-03-20 16:58:41.836587389 -0400 +@@ -16,6 +16,7 @@ + #include <linux/security.h> + #include <linux/syscalls.h> + #include <linux/unistd.h> ++#include <linux/namei.h> + + #include <asm/uaccess.h> + +@@ -67,6 +68,7 @@ struct old_linux_dirent { + + struct readdir_callback { + struct old_linux_dirent __user * dirent; ++ struct file * file; + int result; + }; + +@@ -84,6 +86,10 @@ static int fillonedir(void * __buf, cons + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } ++ ++ if (!gr_acl_handle_filldir(buf->file, name, namlen, ino)) ++ return 0; ++ + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, +@@ -116,6 +122,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned in + + buf.result = 0; + buf.dirent = dirent; ++ buf.file = file; + + error = vfs_readdir(file, 
fillonedir, &buf); + if (buf.result) +@@ -142,6 +149,7 @@ struct linux_dirent { + struct getdents_callback { + struct linux_dirent __user * current_dir; + struct linux_dirent __user * previous; ++ struct file * file; + int count; + int error; + }; +@@ -162,6 +170,10 @@ static int filldir(void * __buf, const c + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } ++ ++ if (!gr_acl_handle_filldir(buf->file, name, namlen, ino)) ++ return 0; ++ + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) +@@ -209,6 +221,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, + buf.previous = NULL; + buf.count = count; + buf.error = 0; ++ buf.file = file; + + error = vfs_readdir(file, filldir, &buf); + if (error >= 0) +@@ -228,6 +241,7 @@ out: + struct getdents_callback64 { + struct linux_dirent64 __user * current_dir; + struct linux_dirent64 __user * previous; ++ struct file *file; + int count; + int error; + }; +@@ -242,6 +256,10 @@ static int filldir64(void * __buf, const + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; ++ ++ if (!gr_acl_handle_filldir(buf->file, name, namlen, ino)) ++ return 0; ++ + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) +@@ -289,6 +307,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int + + buf.current_dir = dirent; + buf.previous = NULL; ++ buf.file = file; + buf.count = count; + buf.error = 0; + +diff -urNp linux-2.6.33.1/fs/reiserfs/do_balan.c linux-2.6.33.1/fs/reiserfs/do_balan.c +--- linux-2.6.33.1/fs/reiserfs/do_balan.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/reiserfs/do_balan.c 2010-03-20 16:58:41.848535760 -0400 +@@ -2051,7 +2051,7 @@ void do_balance(struct tree_balance *tb, + return; + } + +- atomic_inc(&(fs_generation(tb->tb_sb))); ++ atomic_inc_unchecked(&(fs_generation(tb->tb_sb))); + do_balance_starts(tb); + + /* balance leaf returns 0 except if combining L R and S into +diff -urNp linux-2.6.33.1/fs/reiserfs/item_ops.c linux-2.6.33.1/fs/reiserfs/item_ops.c +--- linux-2.6.33.1/fs/reiserfs/item_ops.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/reiserfs/item_ops.c 2010-03-20 16:58:41.848535760 -0400 +@@ -102,7 +102,7 @@ static void sd_print_vi(struct virtual_i + vi->vi_index, vi->vi_type, vi->vi_ih); + } + +-static struct item_operations stat_data_ops = { ++static const struct item_operations stat_data_ops = { + .bytes_number = sd_bytes_number, + .decrement_key = sd_decrement_key, + .is_left_mergeable = sd_is_left_mergeable, +@@ -196,7 +196,7 @@ static void direct_print_vi(struct virtu + vi->vi_index, vi->vi_type, vi->vi_ih); + } + +-static struct item_operations direct_ops = { ++static const struct item_operations direct_ops = { + .bytes_number = direct_bytes_number, + .decrement_key = direct_decrement_key, + .is_left_mergeable = direct_is_left_mergeable, +@@ -341,7 +341,7 @@ static void indirect_print_vi(struct vir + vi->vi_index, vi->vi_type, vi->vi_ih); + } + +-static struct item_operations indirect_ops = { ++static const struct item_operations indirect_ops = { + .bytes_number = indirect_bytes_number, + .decrement_key = indirect_decrement_key, + .is_left_mergeable = indirect_is_left_mergeable, +@@ -628,7 +628,7 @@ static void direntry_print_vi(struct vir + printk("\n"); + } + +-static struct item_operations direntry_ops = { ++static const struct item_operations direntry_ops = { + .bytes_number = direntry_bytes_number, + .decrement_key = direntry_decrement_key, + .is_left_mergeable = direntry_is_left_mergeable, +@@ -724,7 +724,7 @@ 
static void errcatch_print_vi(struct vir + "Invalid item type observed, run fsck ASAP"); + } + +-static struct item_operations errcatch_ops = { ++static const struct item_operations errcatch_ops = { + errcatch_bytes_number, + errcatch_decrement_key, + errcatch_is_left_mergeable, +@@ -746,7 +746,7 @@ static struct item_operations errcatch_o + #error Item types must use disk-format assigned values. + #endif + +-struct item_operations *item_ops[TYPE_ANY + 1] = { ++const struct item_operations * const item_ops[TYPE_ANY + 1] = { + &stat_data_ops, + &indirect_ops, + &direct_ops, +diff -urNp linux-2.6.33.1/fs/reiserfs/procfs.c linux-2.6.33.1/fs/reiserfs/procfs.c +--- linux-2.6.33.1/fs/reiserfs/procfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/reiserfs/procfs.c 2010-03-20 16:58:41.848535760 -0400 +@@ -113,7 +113,7 @@ static int show_super(struct seq_file *m + "SMALL_TAILS " : "NO_TAILS ", + replay_only(sb) ? "REPLAY_ONLY " : "", + convert_reiserfs(sb) ? "CONV " : "", +- atomic_read(&r->s_generation_counter), ++ atomic_read_unchecked(&r->s_generation_counter), + SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), + SF(s_do_balance), SF(s_unneeded_left_neighbor), + SF(s_good_search_by_key_reada), SF(s_bmaps), +diff -urNp linux-2.6.33.1/fs/select.c linux-2.6.33.1/fs/select.c +--- linux-2.6.33.1/fs/select.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/select.c 2010-03-20 16:58:41.848535760 -0400 +@@ -20,6 +20,7 @@ + #include <linux/module.h> + #include <linux/slab.h> + #include <linux/poll.h> ++#include <linux/security.h> + #include <linux/personality.h> /* for STICKY_TIMEOUTS */ + #include <linux/file.h> + #include <linux/fdtable.h> +@@ -821,6 +822,7 @@ int do_sys_poll(struct pollfd __user *uf + struct poll_list *walk = head; + unsigned long todo = nfds; + ++ gr_learn_resource(current, RLIMIT_NOFILE, nfds, 1); + if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + return -EINVAL; + +diff -urNp linux-2.6.33.1/fs/seq_file.c linux-2.6.33.1/fs/seq_file.c +--- linux-2.6.33.1/fs/seq_file.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/seq_file.c 2010-03-20 16:58:41.860772579 -0400 +@@ -76,7 +76,8 @@ static int traverse(struct seq_file *m, + return 0; + } + if (!m->buf) { +- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); ++ m->size = PAGE_SIZE; ++ m->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + return -ENOMEM; + } +@@ -116,7 +117,8 @@ static int traverse(struct seq_file *m, + Eoverflow: + m->op->stop(m, p); + kfree(m->buf); +- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); ++ m->size <<= 1; ++ m->buf = kmalloc(m->size, GFP_KERNEL); + return !m->buf ? 
-ENOMEM : -EAGAIN; + } + +@@ -169,7 +171,8 @@ ssize_t seq_read(struct file *file, char + m->version = file->f_version; + /* grab buffer if we didn't have one */ + if (!m->buf) { +- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); ++ m->size = PAGE_SIZE; ++ m->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + goto Enomem; + } +@@ -210,7 +213,8 @@ ssize_t seq_read(struct file *file, char + goto Fill; + m->op->stop(m, p); + kfree(m->buf); +- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); ++ m->size <<= 1; ++ m->buf = kmalloc(m->size, GFP_KERNEL); + if (!m->buf) + goto Enomem; + m->count = 0; +diff -urNp linux-2.6.33.1/fs/smbfs/symlink.c linux-2.6.33.1/fs/smbfs/symlink.c +--- linux-2.6.33.1/fs/smbfs/symlink.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/smbfs/symlink.c 2010-03-20 16:58:41.860772579 -0400 +@@ -55,7 +55,7 @@ static void *smb_follow_link(struct dent + + static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p) + { +- char *s = nd_get_link(nd); ++ const char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); + } +diff -urNp linux-2.6.33.1/fs/splice.c linux-2.6.33.1/fs/splice.c +--- linux-2.6.33.1/fs/splice.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/splice.c 2010-03-20 16:58:41.877542041 -0400 +@@ -185,7 +185,7 @@ ssize_t splice_to_pipe(struct pipe_inode + pipe_lock(pipe); + + for (;;) { +- if (!pipe->readers) { ++ if (!atomic_read(&pipe->readers)) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; +@@ -239,9 +239,9 @@ ssize_t splice_to_pipe(struct pipe_inode + do_wakeup = 0; + } + +- pipe->waiting_writers++; ++ atomic_inc(&pipe->waiting_writers); + pipe_wait(pipe); +- pipe->waiting_writers--; ++ atomic_dec(&pipe->waiting_writers); + } + + pipe_unlock(pipe); +@@ -531,7 +531,7 @@ static ssize_t kernel_readv(struct file + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ +- res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); ++ res = vfs_readv(file, (__force const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +@@ -546,7 +546,7 @@ static ssize_t kernel_write(struct file + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ +- res = vfs_write(file, (const char __user *)buf, count, &pos); ++ res = vfs_write(file, (__force const char __user *)buf, count, &pos); + set_fs(old_fs); + + return res; +@@ -588,7 +588,7 @@ ssize_t default_file_splice_read(struct + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); +- vec[i].iov_base = (void __user *) page_address(page); ++ vec[i].iov_base = (__force void __user *) page_address(page); + vec[i].iov_len = this_len; + pages[i] = page; + spd.nr_pages++; +@@ -810,10 +810,10 @@ EXPORT_SYMBOL(splice_from_pipe_feed); + int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) + { + while (!pipe->nrbufs) { +- if (!pipe->writers) ++ if (!atomic_read(&pipe->writers)) + return 0; + +- if (!pipe->waiting_writers && sd->num_spliced) ++ if (!atomic_read(&pipe->waiting_writers) && sd->num_spliced) + return 0; + + if (sd->flags & SPLICE_F_NONBLOCK) +@@ -1150,7 +1150,7 @@ ssize_t splice_direct_to_actor(struct fi + * out of the pipe right after the splice_to_pipe(). So set + * PIPE_READERS appropriately. 
+ */ +- pipe->readers = 1; ++ atomic_set(&pipe->readers, 1); + + current->splice_pipe = pipe; + } +@@ -1710,9 +1710,9 @@ static int ipipe_prep(struct pipe_inode_ + ret = -ERESTARTSYS; + break; + } +- if (!pipe->writers) ++ if (!atomic_read(&pipe->writers)) + break; +- if (!pipe->waiting_writers) { ++ if (!atomic_read(&pipe->waiting_writers)) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; +@@ -1744,7 +1744,7 @@ static int opipe_prep(struct pipe_inode_ + pipe_lock(pipe); + + while (pipe->nrbufs >= PIPE_BUFFERS) { +- if (!pipe->readers) { ++ if (!atomic_read(&pipe->readers)) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; +@@ -1757,9 +1757,9 @@ static int opipe_prep(struct pipe_inode_ + ret = -ERESTARTSYS; + break; + } +- pipe->waiting_writers++; ++ atomic_inc(&pipe->waiting_writers); + pipe_wait(pipe); +- pipe->waiting_writers--; ++ atomic_dec(&pipe->waiting_writers); + } + + pipe_unlock(pipe); +@@ -1795,14 +1795,14 @@ retry: + pipe_double_lock(ipipe, opipe); + + do { +- if (!opipe->readers) { ++ if (!atomic_read(&opipe->readers)) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + +- if (!ipipe->nrbufs && !ipipe->writers) ++ if (!ipipe->nrbufs && !atomic_read(&ipipe->writers)) + break; + + /* +@@ -1902,7 +1902,7 @@ static int link_pipe(struct pipe_inode_i + pipe_double_lock(ipipe, opipe); + + do { +- if (!opipe->readers) { ++ if (!atomic_read(&opipe->readers)) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; +@@ -1947,7 +1947,7 @@ static int link_pipe(struct pipe_inode_i + * return EAGAIN if we have the potential of some data in the + * future, otherwise just return 0 + */ +- if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) ++ if (!ret && atomic_read(&ipipe->waiting_writers) && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + + pipe_unlock(ipipe); +diff -urNp linux-2.6.33.1/fs/sysfs/file.c linux-2.6.33.1/fs/sysfs/file.c +--- linux-2.6.33.1/fs/sysfs/file.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/sysfs/file.c 2010-03-20 16:58:41.877542041 -0400 +@@ -53,7 +53,7 @@ struct sysfs_buffer { + size_t count; + loff_t pos; + char * page; +- struct sysfs_ops * ops; ++ const struct sysfs_ops * ops; + struct mutex mutex; + int needs_read_fill; + int event; +@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentr + { + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; +- struct sysfs_ops * ops = buffer->ops; ++ const struct sysfs_ops * ops = buffer->ops; + int ret = 0; + ssize_t count; + +@@ -199,7 +199,7 @@ flush_write_buffer(struct dentry * dentr + { + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; +- struct sysfs_ops * ops = buffer->ops; ++ const struct sysfs_ops * ops = buffer->ops; + int rc; + + /* need attr_sd for attr and ops, its parent for kobj */ +@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + struct sysfs_buffer *buffer; +- struct sysfs_ops *ops; ++ const struct sysfs_ops *ops; + int error = -EACCES; + char *p; + +diff -urNp linux-2.6.33.1/fs/sysfs/symlink.c linux-2.6.33.1/fs/sysfs/symlink.c +--- linux-2.6.33.1/fs/sysfs/symlink.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/sysfs/symlink.c 2010-03-20 16:58:41.877542041 -0400 +@@ -204,7 +204,7 @@ static void *sysfs_follow_link(struct de + + static void sysfs_put_link(struct dentry 
*dentry, struct nameidata *nd, void *cookie) + { +- char *page = nd_get_link(nd); ++ const char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); + } +diff -urNp linux-2.6.33.1/fs/udf/balloc.c linux-2.6.33.1/fs/udf/balloc.c +--- linux-2.6.33.1/fs/udf/balloc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/udf/balloc.c 2010-03-20 16:58:41.880865953 -0400 +@@ -172,9 +172,7 @@ static void udf_bitmap_free_blocks(struc + + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; +- if (bloc->logicalBlockNum < 0 || +- (bloc->logicalBlockNum + count) > +- partmap->s_partition_len) { ++ if ((bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, + count, partmap->s_partition_len); +@@ -436,9 +434,7 @@ static void udf_table_free_blocks(struct + + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; +- if (bloc->logicalBlockNum < 0 || +- (bloc->logicalBlockNum + count) > +- partmap->s_partition_len) { ++ if ((bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, + partmap->s_partition_len); +diff -urNp linux-2.6.33.1/fs/utimes.c linux-2.6.33.1/fs/utimes.c +--- linux-2.6.33.1/fs/utimes.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/utimes.c 2010-03-20 16:58:41.884933711 -0400 +@@ -1,6 +1,7 @@ + #include <linux/compiler.h> + #include <linux/file.h> + #include <linux/fs.h> ++#include <linux/security.h> + #include <linux/linkage.h> + #include <linux/mount.h> + #include <linux/namei.h> +@@ -101,6 +102,12 @@ static int utimes_common(struct path *pa + goto mnt_drop_write_and_out; + } + } ++ ++ if (!gr_acl_handle_utime(path->dentry, path->mnt)) { ++ error = -EACCES; ++ goto mnt_drop_write_and_out; ++ } ++ + mutex_lock(&inode->i_mutex); + error = notify_change(path->dentry, &newattrs); + mutex_unlock(&inode->i_mutex); +diff -urNp linux-2.6.33.1/fs/xfs/linux-2.6/xfs_ioctl.c linux-2.6.33.1/fs/xfs/linux-2.6/xfs_ioctl.c +--- linux-2.6.33.1/fs/xfs/linux-2.6/xfs_ioctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/xfs/linux-2.6/xfs_ioctl.c 2010-03-20 16:58:41.884933711 -0400 +@@ -135,7 +135,7 @@ xfs_find_handle( + } + + error = -EFAULT; +- if (copy_to_user(hreq->ohandle, &handle, hsize) || ++ if (hsize > sizeof(handle) || copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + +diff -urNp linux-2.6.33.1/fs/xfs/linux-2.6/xfs_iops.c linux-2.6.33.1/fs/xfs/linux-2.6/xfs_iops.c +--- linux-2.6.33.1/fs/xfs/linux-2.6/xfs_iops.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/xfs/linux-2.6/xfs_iops.c 2010-03-20 16:58:41.884933711 -0400 +@@ -469,7 +469,7 @@ xfs_vn_put_link( + struct nameidata *nd, + void *p) + { +- char *s = nd_get_link(nd); ++ const char *s = nd_get_link(nd); + + if (!IS_ERR(s)) + kfree(s); +diff -urNp linux-2.6.33.1/fs/xfs/xfs_bmap.c linux-2.6.33.1/fs/xfs/xfs_bmap.c +--- linux-2.6.33.1/fs/xfs/xfs_bmap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/fs/xfs/xfs_bmap.c 2010-03-20 16:58:41.888938450 -0400 +@@ -296,7 +296,7 @@ xfs_bmap_validate_ret( + int nmap, + int ret_nmap); + #else +-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) ++#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do {} while (0) + #endif /* DEBUG */ + + STATIC int +diff -urNp 
linux-2.6.33.1/grsecurity/gracl_alloc.c linux-2.6.33.1/grsecurity/gracl_alloc.c +--- linux-2.6.33.1/grsecurity/gracl_alloc.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_alloc.c 2010-03-20 16:58:41.888938450 -0400 +@@ -0,0 +1,105 @@ ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/gracl.h> ++#include <linux/grsecurity.h> ++ ++static unsigned long alloc_stack_next = 1; ++static unsigned long alloc_stack_size = 1; ++static void **alloc_stack; ++ ++static __inline__ int ++alloc_pop(void) ++{ ++ if (alloc_stack_next == 1) ++ return 0; ++ ++ kfree(alloc_stack[alloc_stack_next - 2]); ++ ++ alloc_stack_next--; ++ ++ return 1; ++} ++ ++static __inline__ int ++alloc_push(void *buf) ++{ ++ if (alloc_stack_next >= alloc_stack_size) ++ return 1; ++ ++ alloc_stack[alloc_stack_next - 1] = buf; ++ ++ alloc_stack_next++; ++ ++ return 0; ++} ++ ++void * ++acl_alloc(unsigned long len) ++{ ++ void *ret = NULL; ++ ++ if (!len || len > PAGE_SIZE) ++ goto out; ++ ++ ret = kmalloc(len, GFP_KERNEL); ++ ++ if (ret) { ++ if (alloc_push(ret)) { ++ kfree(ret); ++ ret = NULL; ++ } ++ } ++ ++out: ++ return ret; ++} ++ ++void * ++acl_alloc_num(unsigned long num, unsigned long len) ++{ ++ if (!len || (num > (PAGE_SIZE / len))) ++ return NULL; ++ ++ return acl_alloc(num * len); ++} ++ ++void ++acl_free_all(void) ++{ ++ if (gr_acl_is_enabled() || !alloc_stack) ++ return; ++ ++ while (alloc_pop()) ; ++ ++ if (alloc_stack) { ++ if ((alloc_stack_size * sizeof (void *)) <= PAGE_SIZE) ++ kfree(alloc_stack); ++ else ++ vfree(alloc_stack); ++ } ++ ++ alloc_stack = NULL; ++ alloc_stack_size = 1; ++ alloc_stack_next = 1; ++ ++ return; ++} ++ ++int ++acl_alloc_stack_init(unsigned long size) ++{ ++ if ((size * sizeof (void *)) <= PAGE_SIZE) ++ alloc_stack = ++ (void **) kmalloc(size * sizeof (void *), GFP_KERNEL); ++ else ++ alloc_stack = (void **) vmalloc(size * sizeof (void *)); ++ ++ alloc_stack_size = size; ++ ++ if (!alloc_stack) ++ return 0; ++ else ++ return 1; ++} +diff -urNp linux-2.6.33.1/grsecurity/gracl.c linux-2.6.33.1/grsecurity/gracl.c +--- linux-2.6.33.1/grsecurity/gracl.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl.c 2010-03-20 17:00:48.140865901 -0400 +@@ -0,0 +1,3917 @@ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/file.h> ++#include <linux/fs.h> ++#include <linux/namei.h> ++#include <linux/mount.h> ++#include <linux/tty.h> ++#include <linux/proc_fs.h> ++#include <linux/smp_lock.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/types.h> ++#include <linux/sysctl.h> ++#include <linux/netdevice.h> ++#include <linux/ptrace.h> ++#include <linux/gracl.h> ++#include <linux/gralloc.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++#include <linux/pid_namespace.h> ++#include <linux/fdtable.h> ++#include <linux/percpu.h> ++ ++#include <asm/uaccess.h> ++#include <asm/errno.h> ++#include <asm/mman.h> ++ ++static struct acl_role_db acl_role_set; ++static struct name_db name_set; ++static struct inodev_db inodev_set; ++ ++/* for keeping track of userspace pointers used for subjects, so we ++ can share references in the kernel as well ++*/ ++ ++static struct dentry *real_root; ++static struct vfsmount *real_root_mnt; ++ ++static struct acl_subj_map_db subj_map_set; ++ ++static struct acl_role_label *default_role; ++ ++static struct acl_role_label *role_list; ++ ++static u16 
acl_sp_role_value; ++ ++extern char *gr_shared_page[4]; ++static DECLARE_MUTEX(gr_dev_sem); ++DEFINE_RWLOCK(gr_inode_lock); ++ ++struct gr_arg *gr_usermode; ++ ++static unsigned int gr_status __read_only = GR_STATUS_INIT; ++ ++extern int chkpw(struct gr_arg *entry, unsigned char *salt, unsigned char *sum); ++extern void gr_clear_learn_entries(void); ++ ++#ifdef CONFIG_GRKERNSEC_RESLOG ++extern void gr_log_resource(const struct task_struct *task, ++ const int res, const unsigned long wanted, const int gt); ++#endif ++ ++unsigned char *gr_system_salt; ++unsigned char *gr_system_sum; ++ ++static struct sprole_pw **acl_special_roles = NULL; ++static __u16 num_sprole_pws = 0; ++ ++static struct acl_role_label *kernel_role = NULL; ++ ++static unsigned int gr_auth_attempts = 0; ++static unsigned long gr_auth_expires = 0UL; ++ ++extern struct vfsmount *sock_mnt; ++extern struct vfsmount *pipe_mnt; ++extern struct vfsmount *shm_mnt; ++static struct acl_object_label *fakefs_obj; ++ ++extern int gr_init_uidset(void); ++extern void gr_free_uidset(void); ++extern void gr_remove_uid(uid_t uid); ++extern int gr_find_uid(uid_t uid); ++ ++__inline__ int ++gr_acl_is_enabled(void) ++{ ++ return (gr_status & GR_READY); ++} ++ ++char gr_roletype_to_char(void) ++{ ++ switch (current->role->roletype & ++ (GR_ROLE_DEFAULT | GR_ROLE_USER | GR_ROLE_GROUP | ++ GR_ROLE_SPECIAL)) { ++ case GR_ROLE_DEFAULT: ++ return 'D'; ++ case GR_ROLE_USER: ++ return 'U'; ++ case GR_ROLE_GROUP: ++ return 'G'; ++ case GR_ROLE_SPECIAL: ++ return 'S'; ++ } ++ ++ return 'X'; ++} ++ ++__inline__ int ++gr_acl_tpe_check(void) ++{ ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++ if (current->role->roletype & GR_ROLE_TPE) ++ return 1; ++ else ++ return 0; ++} ++ ++int ++gr_handle_rawio(const struct inode *inode) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS ++ if (inode && S_ISBLK(inode->i_mode) && ++ grsec_enable_chroot_caps && proc_is_chrooted(current) && ++ !capable(CAP_SYS_RAWIO)) ++ return 1; ++#endif ++ return 0; ++} ++ ++static int ++gr_streq(const char *a, const char *b, const unsigned int lena, const unsigned int lenb) ++{ ++ int i; ++ unsigned long *l1; ++ unsigned long *l2; ++ unsigned char *c1; ++ unsigned char *c2; ++ int num_longs; ++ ++ if (likely(lena != lenb)) ++ return 0; ++ ++ l1 = (unsigned long *)a; ++ l2 = (unsigned long *)b; ++ ++ num_longs = lena / sizeof(unsigned long); ++ ++ for (i = num_longs; i--; l1++, l2++) { ++ if (unlikely(*l1 != *l2)) ++ return 0; ++ } ++ ++ c1 = (unsigned char *) l1; ++ c2 = (unsigned char *) l2; ++ ++ i = lena - (num_longs * sizeof(unsigned long)); ++ ++ for (; i--; c1++, c2++) { ++ if (unlikely(*c1 != *c2)) ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static char * __our_d_path(struct dentry *dentry, struct vfsmount *vfsmnt, ++ struct dentry *root, struct vfsmount *rootmnt, ++ char *buffer, int buflen) ++{ ++ char * end = buffer+buflen; ++ char * retval; ++ int namelen; ++ ++ *--end = '\0'; ++ buflen--; ++ ++ if (buflen < 1) ++ goto Elong; ++ /* Get '/' right */ ++ retval = end-1; ++ *retval = '/'; ++ ++ for (;;) { ++ struct dentry * parent; ++ ++ if (dentry == root && vfsmnt == rootmnt) ++ break; ++ if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { ++ /* Global root? 
*/ ++ spin_lock(&vfsmount_lock); ++ if (vfsmnt->mnt_parent == vfsmnt) { ++ spin_unlock(&vfsmount_lock); ++ goto global_root; ++ } ++ dentry = vfsmnt->mnt_mountpoint; ++ vfsmnt = vfsmnt->mnt_parent; ++ spin_unlock(&vfsmount_lock); ++ continue; ++ } ++ parent = dentry->d_parent; ++ prefetch(parent); ++ namelen = dentry->d_name.len; ++ buflen -= namelen + 1; ++ if (buflen < 0) ++ goto Elong; ++ end -= namelen; ++ memcpy(end, dentry->d_name.name, namelen); ++ *--end = '/'; ++ retval = end; ++ dentry = parent; ++ } ++ ++ return retval; ++ ++global_root: ++ namelen = dentry->d_name.len; ++ buflen -= namelen; ++ if (buflen < 0) ++ goto Elong; ++ retval -= namelen-1; /* hit the slash */ ++ memcpy(retval, dentry->d_name.name, namelen); ++ return retval; ++Elong: ++ return ERR_PTR(-ENAMETOOLONG); ++} ++ ++static char * ++gen_full_path(struct dentry *dentry, struct vfsmount *vfsmnt, ++ struct dentry *root, struct vfsmount *rootmnt, char *buf, int buflen) ++{ ++ char *retval; ++ ++ retval = __our_d_path(dentry, vfsmnt, root, rootmnt, buf, buflen); ++ if (unlikely(IS_ERR(retval))) ++ retval = strcpy(buf, "<path too long>"); ++ else if (unlikely(retval[1] == '/' && retval[2] == '\0')) ++ retval[1] = '\0'; ++ ++ return retval; ++} ++ ++static char * ++__d_real_path(const struct dentry *dentry, const struct vfsmount *vfsmnt, ++ char *buf, int buflen) ++{ ++ char *res; ++ ++ /* we can use real_root, real_root_mnt, because this is only called ++ by the RBAC system */ ++ res = gen_full_path((struct dentry *)dentry, (struct vfsmount *)vfsmnt, real_root, real_root_mnt, buf, buflen); ++ ++ return res; ++} ++ ++static char * ++d_real_path(const struct dentry *dentry, const struct vfsmount *vfsmnt, ++ char *buf, int buflen) ++{ ++ char *res; ++ struct dentry *root; ++ struct vfsmount *rootmnt; ++ struct task_struct *reaper = &init_task; ++ ++ /* we can't use real_root, real_root_mnt, because they belong only to the RBAC system */ ++ read_lock(&reaper->fs->lock); ++ root = dget(reaper->fs->root.dentry); ++ rootmnt = mntget(reaper->fs->root.mnt); ++ read_unlock(&reaper->fs->lock); ++ ++ spin_lock(&dcache_lock); ++ res = gen_full_path((struct dentry *)dentry, (struct vfsmount *)vfsmnt, root, rootmnt, buf, buflen); ++ spin_unlock(&dcache_lock); ++ ++ dput(root); ++ mntput(rootmnt); ++ return res; ++} ++ ++static char * ++gr_to_filename_rbac(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ char *ret; ++ spin_lock(&dcache_lock); ++ ret = __d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0],smp_processor_id()), ++ PAGE_SIZE); ++ spin_unlock(&dcache_lock); ++ return ret; ++} ++ ++char * ++gr_to_filename_nolock(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return __d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0],smp_processor_id()), ++ PAGE_SIZE); ++} ++ ++char * ++gr_to_filename(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0], smp_processor_id()), ++ PAGE_SIZE); ++} ++ ++char * ++gr_to_filename1(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[1], smp_processor_id()), ++ PAGE_SIZE); ++} ++ ++char * ++gr_to_filename2(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[2], smp_processor_id()), ++ PAGE_SIZE); ++} ++ ++char * ++gr_to_filename3(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return d_real_path(dentry, mnt, 
per_cpu_ptr(gr_shared_page[3], smp_processor_id()), ++ PAGE_SIZE); ++} ++ ++__inline__ __u32 ++to_gr_audit(const __u32 reqmode) ++{ ++ /* masks off auditable permission flags, then shifts them to create ++ auditing flags, and adds the special case of append auditing if ++ we're requesting write */ ++ return (((reqmode & ~GR_AUDITS) << 10) | ((reqmode & GR_WRITE) ? GR_AUDIT_APPEND : 0)); ++} ++ ++struct acl_subject_label * ++lookup_subject_map(const struct acl_subject_label *userp) ++{ ++ unsigned int index = shash(userp, subj_map_set.s_size); ++ struct subject_map *match; ++ ++ match = subj_map_set.s_hash[index]; ++ ++ while (match && match->user != userp) ++ match = match->next; ++ ++ if (match != NULL) ++ return match->kernel; ++ else ++ return NULL; ++} ++ ++static void ++insert_subj_map_entry(struct subject_map *subjmap) ++{ ++ unsigned int index = shash(subjmap->user, subj_map_set.s_size); ++ struct subject_map **curr; ++ ++ subjmap->prev = NULL; ++ ++ curr = &subj_map_set.s_hash[index]; ++ if (*curr != NULL) ++ (*curr)->prev = subjmap; ++ ++ subjmap->next = *curr; ++ *curr = subjmap; ++ ++ return; ++} ++ ++static struct acl_role_label * ++lookup_acl_role_label(const struct task_struct *task, const uid_t uid, ++ const gid_t gid) ++{ ++ unsigned int index = rhash(uid, GR_ROLE_USER, acl_role_set.r_size); ++ struct acl_role_label *match; ++ struct role_allowed_ip *ipp; ++ unsigned int x; ++ ++ match = acl_role_set.r_hash[index]; ++ ++ while (match) { ++ if ((match->roletype & (GR_ROLE_DOMAIN | GR_ROLE_USER)) == (GR_ROLE_DOMAIN | GR_ROLE_USER)) { ++ for (x = 0; x < match->domain_child_num; x++) { ++ if (match->domain_children[x] == uid) ++ goto found; ++ } ++ } else if (match->uidgid == uid && match->roletype & GR_ROLE_USER) ++ break; ++ match = match->next; ++ } ++found: ++ if (match == NULL) { ++ try_group: ++ index = rhash(gid, GR_ROLE_GROUP, acl_role_set.r_size); ++ match = acl_role_set.r_hash[index]; ++ ++ while (match) { ++ if ((match->roletype & (GR_ROLE_DOMAIN | GR_ROLE_GROUP)) == (GR_ROLE_DOMAIN | GR_ROLE_GROUP)) { ++ for (x = 0; x < match->domain_child_num; x++) { ++ if (match->domain_children[x] == gid) ++ goto found2; ++ } ++ } else if (match->uidgid == gid && match->roletype & GR_ROLE_GROUP) ++ break; ++ match = match->next; ++ } ++found2: ++ if (match == NULL) ++ match = default_role; ++ if (match->allowed_ips == NULL) ++ return match; ++ else { ++ for (ipp = match->allowed_ips; ipp; ipp = ipp->next) { ++ if (likely ++ ((ntohl(task->signal->curr_ip) & ipp->netmask) == ++ (ntohl(ipp->addr) & ipp->netmask))) ++ return match; ++ } ++ match = default_role; ++ } ++ } else if (match->allowed_ips == NULL) { ++ return match; ++ } else { ++ for (ipp = match->allowed_ips; ipp; ipp = ipp->next) { ++ if (likely ++ ((ntohl(task->signal->curr_ip) & ipp->netmask) == ++ (ntohl(ipp->addr) & ipp->netmask))) ++ return match; ++ } ++ goto try_group; ++ } ++ ++ return match; ++} ++ ++struct acl_subject_label * ++lookup_acl_subj_label(const ino_t ino, const dev_t dev, ++ const struct acl_role_label *role) ++{ ++ unsigned int index = fhash(ino, dev, role->subj_hash_size); ++ struct acl_subject_label *match; ++ ++ match = role->subj_hash[index]; ++ ++ while (match && (match->inode != ino || match->device != dev || ++ (match->mode & GR_DELETED))) { ++ match = match->next; ++ } ++ ++ if (match && !(match->mode & GR_DELETED)) ++ return match; ++ else ++ return NULL; ++} ++ ++struct acl_subject_label * ++lookup_acl_subj_label_deleted(const ino_t ino, const dev_t dev, ++ const struct acl_role_label 
*role) ++{ ++ unsigned int index = fhash(ino, dev, role->subj_hash_size); ++ struct acl_subject_label *match; ++ ++ match = role->subj_hash[index]; ++ ++ while (match && (match->inode != ino || match->device != dev || ++ !(match->mode & GR_DELETED))) { ++ match = match->next; ++ } ++ ++ if (match && (match->mode & GR_DELETED)) ++ return match; ++ else ++ return NULL; ++} ++ ++static struct acl_object_label * ++lookup_acl_obj_label(const ino_t ino, const dev_t dev, ++ const struct acl_subject_label *subj) ++{ ++ unsigned int index = fhash(ino, dev, subj->obj_hash_size); ++ struct acl_object_label *match; ++ ++ match = subj->obj_hash[index]; ++ ++ while (match && (match->inode != ino || match->device != dev || ++ (match->mode & GR_DELETED))) { ++ match = match->next; ++ } ++ ++ if (match && !(match->mode & GR_DELETED)) ++ return match; ++ else ++ return NULL; ++} ++ ++static struct acl_object_label * ++lookup_acl_obj_label_create(const ino_t ino, const dev_t dev, ++ const struct acl_subject_label *subj) ++{ ++ unsigned int index = fhash(ino, dev, subj->obj_hash_size); ++ struct acl_object_label *match; ++ ++ match = subj->obj_hash[index]; ++ ++ while (match && (match->inode != ino || match->device != dev || ++ !(match->mode & GR_DELETED))) { ++ match = match->next; ++ } ++ ++ if (match && (match->mode & GR_DELETED)) ++ return match; ++ ++ match = subj->obj_hash[index]; ++ ++ while (match && (match->inode != ino || match->device != dev || ++ (match->mode & GR_DELETED))) { ++ match = match->next; ++ } ++ ++ if (match && !(match->mode & GR_DELETED)) ++ return match; ++ else ++ return NULL; ++} ++ ++static struct name_entry * ++lookup_name_entry(const char *name) ++{ ++ unsigned int len = strlen(name); ++ unsigned int key = full_name_hash(name, len); ++ unsigned int index = key % name_set.n_size; ++ struct name_entry *match; ++ ++ match = name_set.n_hash[index]; ++ ++ while (match && (match->key != key || !gr_streq(match->name, name, match->len, len))) ++ match = match->next; ++ ++ return match; ++} ++ ++static struct name_entry * ++lookup_name_entry_create(const char *name) ++{ ++ unsigned int len = strlen(name); ++ unsigned int key = full_name_hash(name, len); ++ unsigned int index = key % name_set.n_size; ++ struct name_entry *match; ++ ++ match = name_set.n_hash[index]; ++ ++ while (match && (match->key != key || !gr_streq(match->name, name, match->len, len) || ++ !match->deleted)) ++ match = match->next; ++ ++ if (match && match->deleted) ++ return match; ++ ++ match = name_set.n_hash[index]; ++ ++ while (match && (match->key != key || !gr_streq(match->name, name, match->len, len) || ++ match->deleted)) ++ match = match->next; ++ ++ if (match && !match->deleted) ++ return match; ++ else ++ return NULL; ++} ++ ++static struct inodev_entry * ++lookup_inodev_entry(const ino_t ino, const dev_t dev) ++{ ++ unsigned int index = fhash(ino, dev, inodev_set.i_size); ++ struct inodev_entry *match; ++ ++ match = inodev_set.i_hash[index]; ++ ++ while (match && (match->nentry->inode != ino || match->nentry->device != dev)) ++ match = match->next; ++ ++ return match; ++} ++ ++static void ++insert_inodev_entry(struct inodev_entry *entry) ++{ ++ unsigned int index = fhash(entry->nentry->inode, entry->nentry->device, ++ inodev_set.i_size); ++ struct inodev_entry **curr; ++ ++ entry->prev = NULL; ++ ++ curr = &inodev_set.i_hash[index]; ++ if (*curr != NULL) ++ (*curr)->prev = entry; ++ ++ entry->next = *curr; ++ *curr = entry; ++ ++ return; ++} ++ ++static void ++__insert_acl_role_label(struct acl_role_label 
*role, uid_t uidgid) ++{ ++ unsigned int index = ++ rhash(uidgid, role->roletype & (GR_ROLE_USER | GR_ROLE_GROUP), acl_role_set.r_size); ++ struct acl_role_label **curr; ++ struct acl_role_label *tmp; ++ ++ curr = &acl_role_set.r_hash[index]; ++ ++ /* if role was already inserted due to domains and already has ++ a role in the same bucket as it attached, then we need to ++ combine these two buckets ++ */ ++ if (role->next) { ++ tmp = role->next; ++ while (tmp->next) ++ tmp = tmp->next; ++ tmp->next = *curr; ++ } else ++ role->next = *curr; ++ *curr = role; ++ ++ return; ++} ++ ++static void ++insert_acl_role_label(struct acl_role_label *role) ++{ ++ int i; ++ ++ if (role_list == NULL) { ++ role_list = role; ++ role->prev = NULL; ++ } else { ++ role->prev = role_list; ++ role_list = role; ++ } ++ ++ /* used for hash chains */ ++ role->next = NULL; ++ ++ if (role->roletype & GR_ROLE_DOMAIN) { ++ for (i = 0; i < role->domain_child_num; i++) ++ __insert_acl_role_label(role, role->domain_children[i]); ++ } else ++ __insert_acl_role_label(role, role->uidgid); ++} ++ ++static int ++insert_name_entry(char *name, const ino_t inode, const dev_t device, __u8 deleted) ++{ ++ struct name_entry **curr, *nentry; ++ struct inodev_entry *ientry; ++ unsigned int len = strlen(name); ++ unsigned int key = full_name_hash(name, len); ++ unsigned int index = key % name_set.n_size; ++ ++ curr = &name_set.n_hash[index]; ++ ++ while (*curr && ((*curr)->key != key || !gr_streq((*curr)->name, name, (*curr)->len, len))) ++ curr = &((*curr)->next); ++ ++ if (*curr != NULL) ++ return 1; ++ ++ nentry = acl_alloc(sizeof (struct name_entry)); ++ if (nentry == NULL) ++ return 0; ++ ientry = acl_alloc(sizeof (struct inodev_entry)); ++ if (ientry == NULL) ++ return 0; ++ ientry->nentry = nentry; ++ ++ nentry->key = key; ++ nentry->name = name; ++ nentry->inode = inode; ++ nentry->device = device; ++ nentry->len = len; ++ nentry->deleted = deleted; ++ ++ nentry->prev = NULL; ++ curr = &name_set.n_hash[index]; ++ if (*curr != NULL) ++ (*curr)->prev = nentry; ++ nentry->next = *curr; ++ *curr = nentry; ++ ++ /* insert us into the table searchable by inode/dev */ ++ insert_inodev_entry(ientry); ++ ++ return 1; ++} ++ ++static void ++insert_acl_obj_label(struct acl_object_label *obj, ++ struct acl_subject_label *subj) ++{ ++ unsigned int index = ++ fhash(obj->inode, obj->device, subj->obj_hash_size); ++ struct acl_object_label **curr; ++ ++ ++ obj->prev = NULL; ++ ++ curr = &subj->obj_hash[index]; ++ if (*curr != NULL) ++ (*curr)->prev = obj; ++ ++ obj->next = *curr; ++ *curr = obj; ++ ++ return; ++} ++ ++static void ++insert_acl_subj_label(struct acl_subject_label *obj, ++ struct acl_role_label *role) ++{ ++ unsigned int index = fhash(obj->inode, obj->device, role->subj_hash_size); ++ struct acl_subject_label **curr; ++ ++ obj->prev = NULL; ++ ++ curr = &role->subj_hash[index]; ++ if (*curr != NULL) ++ (*curr)->prev = obj; ++ ++ obj->next = *curr; ++ *curr = obj; ++ ++ return; ++} ++ ++/* allocating chained hash tables, so optimal size is where lambda ~ 1 */ ++ ++static void * ++create_table(__u32 * len, int elementsize) ++{ ++ unsigned int table_sizes[] = { ++ 7, 13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, ++ 32749, 65521, 131071, 262139, 524287, 1048573, 2097143, ++ 4194301, 8388593, 16777213, 33554393, 67108859 ++ }; ++ void *newtable = NULL; ++ unsigned int pwr = 0; ++ ++ while ((pwr < ((sizeof (table_sizes) / sizeof (table_sizes[0])) - 1)) && ++ table_sizes[pwr] <= *len) ++ pwr++; ++ ++ if (table_sizes[pwr] <= 
*len || (table_sizes[pwr] > ULONG_MAX / elementsize)) ++ return newtable; ++ ++ if ((table_sizes[pwr] * elementsize) <= PAGE_SIZE) ++ newtable = ++ kmalloc(table_sizes[pwr] * elementsize, GFP_KERNEL); ++ else ++ newtable = vmalloc(table_sizes[pwr] * elementsize); ++ ++ *len = table_sizes[pwr]; ++ ++ return newtable; ++} ++ ++static int ++init_variables(const struct gr_arg *arg) ++{ ++ struct task_struct *reaper = &init_task; ++ unsigned int stacksize; ++ ++ subj_map_set.s_size = arg->role_db.num_subjects; ++ acl_role_set.r_size = arg->role_db.num_roles + arg->role_db.num_domain_children; ++ name_set.n_size = arg->role_db.num_objects; ++ inodev_set.i_size = arg->role_db.num_objects; ++ ++ if (!subj_map_set.s_size || !acl_role_set.r_size || ++ !name_set.n_size || !inodev_set.i_size) ++ return 1; ++ ++ if (!gr_init_uidset()) ++ return 1; ++ ++ /* set up the stack that holds allocation info */ ++ ++ stacksize = arg->role_db.num_pointers + 5; ++ ++ if (!acl_alloc_stack_init(stacksize)) ++ return 1; ++ ++ /* grab reference for the real root dentry and vfsmount */ ++ read_lock(&reaper->fs->lock); ++ real_root_mnt = mntget(reaper->fs->root.mnt); ++ real_root = dget(reaper->fs->root.dentry); ++ read_unlock(&reaper->fs->lock); ++ ++ fakefs_obj = acl_alloc(sizeof(struct acl_object_label)); ++ if (fakefs_obj == NULL) ++ return 1; ++ fakefs_obj->mode = GR_FIND | GR_READ | GR_WRITE | GR_EXEC; ++ ++ subj_map_set.s_hash = ++ (struct subject_map **) create_table(&subj_map_set.s_size, sizeof(void *)); ++ acl_role_set.r_hash = ++ (struct acl_role_label **) create_table(&acl_role_set.r_size, sizeof(void *)); ++ name_set.n_hash = (struct name_entry **) create_table(&name_set.n_size, sizeof(void *)); ++ inodev_set.i_hash = ++ (struct inodev_entry **) create_table(&inodev_set.i_size, sizeof(void *)); ++ ++ if (!subj_map_set.s_hash || !acl_role_set.r_hash || ++ !name_set.n_hash || !inodev_set.i_hash) ++ return 1; ++ ++ memset(subj_map_set.s_hash, 0, ++ sizeof(struct subject_map *) * subj_map_set.s_size); ++ memset(acl_role_set.r_hash, 0, ++ sizeof (struct acl_role_label *) * acl_role_set.r_size); ++ memset(name_set.n_hash, 0, ++ sizeof (struct name_entry *) * name_set.n_size); ++ memset(inodev_set.i_hash, 0, ++ sizeof (struct inodev_entry *) * inodev_set.i_size); ++ ++ return 0; ++} ++ ++/* free information not needed after startup ++ currently contains user->kernel pointer mappings for subjects ++*/ ++ ++static void ++free_init_variables(void) ++{ ++ __u32 i; ++ ++ if (subj_map_set.s_hash) { ++ for (i = 0; i < subj_map_set.s_size; i++) { ++ if (subj_map_set.s_hash[i]) { ++ kfree(subj_map_set.s_hash[i]); ++ subj_map_set.s_hash[i] = NULL; ++ } ++ } ++ ++ if ((subj_map_set.s_size * sizeof (struct subject_map *)) <= ++ PAGE_SIZE) ++ kfree(subj_map_set.s_hash); ++ else ++ vfree(subj_map_set.s_hash); ++ } ++ ++ return; ++} ++ ++static void ++free_variables(void) ++{ ++ struct acl_subject_label *s; ++ struct acl_role_label *r; ++ struct task_struct *task, *task2; ++ unsigned int x; ++ ++ gr_clear_learn_entries(); ++ ++ read_lock(&tasklist_lock); ++ do_each_thread(task2, task) { ++ task->acl_sp_role = 0; ++ task->acl_role_id = 0; ++ task->acl = NULL; ++ task->role = NULL; ++ } while_each_thread(task2, task); ++ read_unlock(&tasklist_lock); ++ ++ /* release the reference to the real root dentry and vfsmount */ ++ if (real_root) ++ dput(real_root); ++ real_root = NULL; ++ if (real_root_mnt) ++ mntput(real_root_mnt); ++ real_root_mnt = NULL; ++ ++ /* free all object hash tables */ ++ ++ FOR_EACH_ROLE_START(r) ++ if 
(r->subj_hash == NULL) ++ goto next_role; ++ FOR_EACH_SUBJECT_START(r, s, x) ++ if (s->obj_hash == NULL) ++ break; ++ if ((s->obj_hash_size * sizeof (struct acl_object_label *)) <= PAGE_SIZE) ++ kfree(s->obj_hash); ++ else ++ vfree(s->obj_hash); ++ FOR_EACH_SUBJECT_END(s, x) ++ FOR_EACH_NESTED_SUBJECT_START(r, s) ++ if (s->obj_hash == NULL) ++ break; ++ if ((s->obj_hash_size * sizeof (struct acl_object_label *)) <= PAGE_SIZE) ++ kfree(s->obj_hash); ++ else ++ vfree(s->obj_hash); ++ FOR_EACH_NESTED_SUBJECT_END(s) ++ if ((r->subj_hash_size * sizeof (struct acl_subject_label *)) <= PAGE_SIZE) ++ kfree(r->subj_hash); ++ else ++ vfree(r->subj_hash); ++ r->subj_hash = NULL; ++next_role: ++ FOR_EACH_ROLE_END(r) ++ ++ acl_free_all(); ++ ++ if (acl_role_set.r_hash) { ++ if ((acl_role_set.r_size * sizeof (struct acl_role_label *)) <= ++ PAGE_SIZE) ++ kfree(acl_role_set.r_hash); ++ else ++ vfree(acl_role_set.r_hash); ++ } ++ if (name_set.n_hash) { ++ if ((name_set.n_size * sizeof (struct name_entry *)) <= ++ PAGE_SIZE) ++ kfree(name_set.n_hash); ++ else ++ vfree(name_set.n_hash); ++ } ++ ++ if (inodev_set.i_hash) { ++ if ((inodev_set.i_size * sizeof (struct inodev_entry *)) <= ++ PAGE_SIZE) ++ kfree(inodev_set.i_hash); ++ else ++ vfree(inodev_set.i_hash); ++ } ++ ++ gr_free_uidset(); ++ ++ memset(&name_set, 0, sizeof (struct name_db)); ++ memset(&inodev_set, 0, sizeof (struct inodev_db)); ++ memset(&acl_role_set, 0, sizeof (struct acl_role_db)); ++ memset(&subj_map_set, 0, sizeof (struct acl_subj_map_db)); ++ ++ default_role = NULL; ++ role_list = NULL; ++ ++ return; ++} ++ ++static __u32 ++count_user_objs(struct acl_object_label *userp) ++{ ++ struct acl_object_label o_tmp; ++ __u32 num = 0; ++ ++ while (userp) { ++ if (copy_from_user(&o_tmp, userp, ++ sizeof (struct acl_object_label))) ++ break; ++ ++ userp = o_tmp.prev; ++ num++; ++ } ++ ++ return num; ++} ++ ++static struct acl_subject_label * ++do_copy_user_subj(struct acl_subject_label *userp, struct acl_role_label *role); ++ ++static int ++copy_user_glob(struct acl_object_label *obj) ++{ ++ struct acl_object_label *g_tmp, **guser; ++ unsigned int len; ++ char *tmp; ++ ++ if (obj->globbed == NULL) ++ return 0; ++ ++ guser = &obj->globbed; ++ while (*guser) { ++ g_tmp = (struct acl_object_label *) ++ acl_alloc(sizeof (struct acl_object_label)); ++ if (g_tmp == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(g_tmp, *guser, ++ sizeof (struct acl_object_label))) ++ return -EFAULT; ++ ++ len = strnlen_user(g_tmp->filename, PATH_MAX); ++ ++ if (!len || len >= PATH_MAX) ++ return -EINVAL; ++ ++ if ((tmp = (char *) acl_alloc(len)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(tmp, g_tmp->filename, len)) ++ return -EFAULT; ++ tmp[len-1] = '\0'; ++ g_tmp->filename = tmp; ++ ++ *guser = g_tmp; ++ guser = &(g_tmp->next); ++ } ++ ++ return 0; ++} ++ ++static int ++copy_user_objs(struct acl_object_label *userp, struct acl_subject_label *subj, ++ struct acl_role_label *role) ++{ ++ struct acl_object_label *o_tmp; ++ unsigned int len; ++ int ret; ++ char *tmp; ++ ++ while (userp) { ++ if ((o_tmp = (struct acl_object_label *) ++ acl_alloc(sizeof (struct acl_object_label))) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(o_tmp, userp, ++ sizeof (struct acl_object_label))) ++ return -EFAULT; ++ ++ userp = o_tmp->prev; ++ ++ len = strnlen_user(o_tmp->filename, PATH_MAX); ++ ++ if (!len || len >= PATH_MAX) ++ return -EINVAL; ++ ++ if ((tmp = (char *) acl_alloc(len)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(tmp, o_tmp->filename, len)) ++ 
return -EFAULT; ++ tmp[len-1] = '\0'; ++ o_tmp->filename = tmp; ++ ++ insert_acl_obj_label(o_tmp, subj); ++ if (!insert_name_entry(o_tmp->filename, o_tmp->inode, ++ o_tmp->device, (o_tmp->mode & GR_DELETED) ? 1 : 0)) ++ return -ENOMEM; ++ ++ ret = copy_user_glob(o_tmp); ++ if (ret) ++ return ret; ++ ++ if (o_tmp->nested) { ++ o_tmp->nested = do_copy_user_subj(o_tmp->nested, role); ++ if (IS_ERR(o_tmp->nested)) ++ return PTR_ERR(o_tmp->nested); ++ ++ /* insert into nested subject list */ ++ o_tmp->nested->next = role->hash->first; ++ role->hash->first = o_tmp->nested; ++ } ++ } ++ ++ return 0; ++} ++ ++static __u32 ++count_user_subjs(struct acl_subject_label *userp) ++{ ++ struct acl_subject_label s_tmp; ++ __u32 num = 0; ++ ++ while (userp) { ++ if (copy_from_user(&s_tmp, userp, ++ sizeof (struct acl_subject_label))) ++ break; ++ ++ userp = s_tmp.prev; ++ /* do not count nested subjects against this count, since ++ they are not included in the hash table, but are ++ attached to objects. We have already counted ++ the subjects in userspace for the allocation ++ stack ++ */ ++ if (!(s_tmp.mode & GR_NESTED)) ++ num++; ++ } ++ ++ return num; ++} ++ ++static int ++copy_user_allowedips(struct acl_role_label *rolep) ++{ ++ struct role_allowed_ip *ruserip, *rtmp = NULL, *rlast; ++ ++ ruserip = rolep->allowed_ips; ++ ++ while (ruserip) { ++ rlast = rtmp; ++ ++ if ((rtmp = (struct role_allowed_ip *) ++ acl_alloc(sizeof (struct role_allowed_ip))) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(rtmp, ruserip, ++ sizeof (struct role_allowed_ip))) ++ return -EFAULT; ++ ++ ruserip = rtmp->prev; ++ ++ if (!rlast) { ++ rtmp->prev = NULL; ++ rolep->allowed_ips = rtmp; ++ } else { ++ rlast->next = rtmp; ++ rtmp->prev = rlast; ++ } ++ ++ if (!ruserip) ++ rtmp->next = NULL; ++ } ++ ++ return 0; ++} ++ ++static int ++copy_user_transitions(struct acl_role_label *rolep) ++{ ++ struct role_transition *rusertp, *rtmp = NULL, *rlast; ++ ++ unsigned int len; ++ char *tmp; ++ ++ rusertp = rolep->transitions; ++ ++ while (rusertp) { ++ rlast = rtmp; ++ ++ if ((rtmp = (struct role_transition *) ++ acl_alloc(sizeof (struct role_transition))) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(rtmp, rusertp, ++ sizeof (struct role_transition))) ++ return -EFAULT; ++ ++ rusertp = rtmp->prev; ++ ++ len = strnlen_user(rtmp->rolename, GR_SPROLE_LEN); ++ ++ if (!len || len >= GR_SPROLE_LEN) ++ return -EINVAL; ++ ++ if ((tmp = (char *) acl_alloc(len)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(tmp, rtmp->rolename, len)) ++ return -EFAULT; ++ tmp[len-1] = '\0'; ++ rtmp->rolename = tmp; ++ ++ if (!rlast) { ++ rtmp->prev = NULL; ++ rolep->transitions = rtmp; ++ } else { ++ rlast->next = rtmp; ++ rtmp->prev = rlast; ++ } ++ ++ if (!rusertp) ++ rtmp->next = NULL; ++ } ++ ++ return 0; ++} ++ ++static struct acl_subject_label * ++do_copy_user_subj(struct acl_subject_label *userp, struct acl_role_label *role) ++{ ++ struct acl_subject_label *s_tmp = NULL, *s_tmp2; ++ unsigned int len; ++ char *tmp; ++ __u32 num_objs; ++ struct acl_ip_label **i_tmp, *i_utmp2; ++ struct gr_hash_struct ghash; ++ struct subject_map *subjmap; ++ unsigned int i_num; ++ int err; ++ ++ s_tmp = lookup_subject_map(userp); ++ ++ /* we've already copied this subject into the kernel, just return ++ the reference to it, and don't copy it over again ++ */ ++ if (s_tmp) ++ return(s_tmp); ++ ++ if ((s_tmp = (struct acl_subject_label *) ++ acl_alloc(sizeof (struct acl_subject_label))) == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ subjmap = (struct 
subject_map *)kmalloc(sizeof (struct subject_map), GFP_KERNEL); ++ if (subjmap == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ subjmap->user = userp; ++ subjmap->kernel = s_tmp; ++ insert_subj_map_entry(subjmap); ++ ++ if (copy_from_user(s_tmp, userp, ++ sizeof (struct acl_subject_label))) ++ return ERR_PTR(-EFAULT); ++ ++ len = strnlen_user(s_tmp->filename, PATH_MAX); ++ ++ if (!len || len >= PATH_MAX) ++ return ERR_PTR(-EINVAL); ++ ++ if ((tmp = (char *) acl_alloc(len)) == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ if (copy_from_user(tmp, s_tmp->filename, len)) ++ return ERR_PTR(-EFAULT); ++ tmp[len-1] = '\0'; ++ s_tmp->filename = tmp; ++ ++ if (!strcmp(s_tmp->filename, "/")) ++ role->root_label = s_tmp; ++ ++ if (copy_from_user(&ghash, s_tmp->hash, sizeof(struct gr_hash_struct))) ++ return ERR_PTR(-EFAULT); ++ ++ /* copy user and group transition tables */ ++ ++ if (s_tmp->user_trans_num) { ++ uid_t *uidlist; ++ ++ uidlist = (uid_t *)acl_alloc_num(s_tmp->user_trans_num, sizeof(uid_t)); ++ if (uidlist == NULL) ++ return ERR_PTR(-ENOMEM); ++ if (copy_from_user(uidlist, s_tmp->user_transitions, s_tmp->user_trans_num * sizeof(uid_t))) ++ return ERR_PTR(-EFAULT); ++ ++ s_tmp->user_transitions = uidlist; ++ } ++ ++ if (s_tmp->group_trans_num) { ++ gid_t *gidlist; ++ ++ gidlist = (gid_t *)acl_alloc_num(s_tmp->group_trans_num, sizeof(gid_t)); ++ if (gidlist == NULL) ++ return ERR_PTR(-ENOMEM); ++ if (copy_from_user(gidlist, s_tmp->group_transitions, s_tmp->group_trans_num * sizeof(gid_t))) ++ return ERR_PTR(-EFAULT); ++ ++ s_tmp->group_transitions = gidlist; ++ } ++ ++ /* set up object hash table */ ++ num_objs = count_user_objs(ghash.first); ++ ++ s_tmp->obj_hash_size = num_objs; ++ s_tmp->obj_hash = ++ (struct acl_object_label **) ++ create_table(&(s_tmp->obj_hash_size), sizeof(void *)); ++ ++ if (!s_tmp->obj_hash) ++ return ERR_PTR(-ENOMEM); ++ ++ memset(s_tmp->obj_hash, 0, ++ s_tmp->obj_hash_size * ++ sizeof (struct acl_object_label *)); ++ ++ /* add in objects */ ++ err = copy_user_objs(ghash.first, s_tmp, role); ++ ++ if (err) ++ return ERR_PTR(err); ++ ++ /* set pointer for parent subject */ ++ if (s_tmp->parent_subject) { ++ s_tmp2 = do_copy_user_subj(s_tmp->parent_subject, role); ++ ++ if (IS_ERR(s_tmp2)) ++ return s_tmp2; ++ ++ s_tmp->parent_subject = s_tmp2; ++ } ++ ++ /* add in ip acls */ ++ ++ if (!s_tmp->ip_num) { ++ s_tmp->ips = NULL; ++ goto insert; ++ } ++ ++ i_tmp = ++ (struct acl_ip_label **) acl_alloc_num(s_tmp->ip_num, ++ sizeof (struct acl_ip_label *)); ++ ++ if (!i_tmp) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i_num = 0; i_num < s_tmp->ip_num; i_num++) { ++ *(i_tmp + i_num) = ++ (struct acl_ip_label *) ++ acl_alloc(sizeof (struct acl_ip_label)); ++ if (!*(i_tmp + i_num)) ++ return ERR_PTR(-ENOMEM); ++ ++ if (copy_from_user ++ (&i_utmp2, s_tmp->ips + i_num, ++ sizeof (struct acl_ip_label *))) ++ return ERR_PTR(-EFAULT); ++ ++ if (copy_from_user ++ (*(i_tmp + i_num), i_utmp2, ++ sizeof (struct acl_ip_label))) ++ return ERR_PTR(-EFAULT); ++ ++ if ((*(i_tmp + i_num))->iface == NULL) ++ continue; ++ ++ len = strnlen_user((*(i_tmp + i_num))->iface, IFNAMSIZ); ++ if (!len || len >= IFNAMSIZ) ++ return ERR_PTR(-EINVAL); ++ tmp = acl_alloc(len); ++ if (tmp == NULL) ++ return ERR_PTR(-ENOMEM); ++ if (copy_from_user(tmp, (*(i_tmp + i_num))->iface, len)) ++ return ERR_PTR(-EFAULT); ++ (*(i_tmp + i_num))->iface = tmp; ++ } ++ ++ s_tmp->ips = i_tmp; ++ ++insert: ++ if (!insert_name_entry(s_tmp->filename, s_tmp->inode, ++ s_tmp->device, (s_tmp->mode & GR_DELETED) ? 
1 : 0)) ++ return ERR_PTR(-ENOMEM); ++ ++ return s_tmp; ++} ++ ++static int ++copy_user_subjs(struct acl_subject_label *userp, struct acl_role_label *role) ++{ ++ struct acl_subject_label s_pre; ++ struct acl_subject_label * ret; ++ int err; ++ ++ while (userp) { ++ if (copy_from_user(&s_pre, userp, ++ sizeof (struct acl_subject_label))) ++ return -EFAULT; ++ ++ /* do not add nested subjects here, add ++ while parsing objects ++ */ ++ ++ if (s_pre.mode & GR_NESTED) { ++ userp = s_pre.prev; ++ continue; ++ } ++ ++ ret = do_copy_user_subj(userp, role); ++ ++ err = PTR_ERR(ret); ++ if (IS_ERR(ret)) ++ return err; ++ ++ insert_acl_subj_label(ret, role); ++ ++ userp = s_pre.prev; ++ } ++ ++ return 0; ++} ++ ++static int ++copy_user_acl(struct gr_arg *arg) ++{ ++ struct acl_role_label *r_tmp = NULL, **r_utmp, *r_utmp2; ++ struct sprole_pw *sptmp; ++ struct gr_hash_struct *ghash; ++ uid_t *domainlist; ++ unsigned int r_num; ++ unsigned int len; ++ char *tmp; ++ int err = 0; ++ __u16 i; ++ __u32 num_subjs; ++ ++ /* we need a default and kernel role */ ++ if (arg->role_db.num_roles < 2) ++ return -EINVAL; ++ ++ /* copy special role authentication info from userspace */ ++ ++ num_sprole_pws = arg->num_sprole_pws; ++ acl_special_roles = (struct sprole_pw **) acl_alloc_num(num_sprole_pws, sizeof(struct sprole_pw *)); ++ ++ if (!acl_special_roles) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ ++ for (i = 0; i < num_sprole_pws; i++) { ++ sptmp = (struct sprole_pw *) acl_alloc(sizeof(struct sprole_pw)); ++ if (!sptmp) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ if (copy_from_user(sptmp, arg->sprole_pws + i, ++ sizeof (struct sprole_pw))) { ++ err = -EFAULT; ++ goto cleanup; ++ } ++ ++ len = ++ strnlen_user(sptmp->rolename, GR_SPROLE_LEN); ++ ++ if (!len || len >= GR_SPROLE_LEN) { ++ err = -EINVAL; ++ goto cleanup; ++ } ++ ++ if ((tmp = (char *) acl_alloc(len)) == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ ++ if (copy_from_user(tmp, sptmp->rolename, len)) { ++ err = -EFAULT; ++ goto cleanup; ++ } ++ tmp[len-1] = '\0'; ++#ifdef CONFIG_GRKERNSEC_ACL_DEBUG ++ printk(KERN_ALERT "Copying special role %s\n", tmp); ++#endif ++ sptmp->rolename = tmp; ++ acl_special_roles[i] = sptmp; ++ } ++ ++ r_utmp = (struct acl_role_label **) arg->role_db.r_table; ++ ++ for (r_num = 0; r_num < arg->role_db.num_roles; r_num++) { ++ r_tmp = acl_alloc(sizeof (struct acl_role_label)); ++ ++ if (!r_tmp) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ ++ if (copy_from_user(&r_utmp2, r_utmp + r_num, ++ sizeof (struct acl_role_label *))) { ++ err = -EFAULT; ++ goto cleanup; ++ } ++ ++ if (copy_from_user(r_tmp, r_utmp2, ++ sizeof (struct acl_role_label))) { ++ err = -EFAULT; ++ goto cleanup; ++ } ++ ++ len = strnlen_user(r_tmp->rolename, GR_SPROLE_LEN); ++ ++ if (!len || len >= PATH_MAX) { ++ err = -EINVAL; ++ goto cleanup; ++ } ++ ++ if ((tmp = (char *) acl_alloc(len)) == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ if (copy_from_user(tmp, r_tmp->rolename, len)) { ++ err = -EFAULT; ++ goto cleanup; ++ } ++ tmp[len-1] = '\0'; ++ r_tmp->rolename = tmp; ++ ++ if (!strcmp(r_tmp->rolename, "default") ++ && (r_tmp->roletype & GR_ROLE_DEFAULT)) { ++ default_role = r_tmp; ++ } else if (!strcmp(r_tmp->rolename, ":::kernel:::")) { ++ kernel_role = r_tmp; ++ } ++ ++ if ((ghash = (struct gr_hash_struct *) acl_alloc(sizeof(struct gr_hash_struct))) == NULL) { ++ err = -ENOMEM; ++ goto cleanup; ++ } ++ if (copy_from_user(ghash, r_tmp->hash, sizeof(struct gr_hash_struct))) { ++ err = -EFAULT; ++ goto cleanup; ++ } ++ ++ r_tmp->hash = 
ghash;
++
++	num_subjs = count_user_subjs(r_tmp->hash->first);
++
++	r_tmp->subj_hash_size = num_subjs;
++	r_tmp->subj_hash =
++	    (struct acl_subject_label **)
++	    create_table(&(r_tmp->subj_hash_size), sizeof(void *));
++
++	if (!r_tmp->subj_hash) {
++		err = -ENOMEM;
++		goto cleanup;
++	}
++
++	err = copy_user_allowedips(r_tmp);
++	if (err)
++		goto cleanup;
++
++	/* copy domain info */
++	if (r_tmp->domain_children != NULL) {
++		domainlist = acl_alloc_num(r_tmp->domain_child_num, sizeof(uid_t));
++		if (domainlist == NULL) {
++			err = -ENOMEM;
++			goto cleanup;
++		}
++		if (copy_from_user(domainlist, r_tmp->domain_children, r_tmp->domain_child_num * sizeof(uid_t))) {
++			err = -EFAULT;
++			goto cleanup;
++		}
++		r_tmp->domain_children = domainlist;
++	}
++
++	err = copy_user_transitions(r_tmp);
++	if (err)
++		goto cleanup;
++
++	memset(r_tmp->subj_hash, 0,
++	       r_tmp->subj_hash_size *
++	       sizeof (struct acl_subject_label *));
++
++	err = copy_user_subjs(r_tmp->hash->first, r_tmp);
++
++	if (err)
++		goto cleanup;
++
++	/* set nested subject list to null */
++	r_tmp->hash->first = NULL;
++
++	insert_acl_role_label(r_tmp);
++	}
++
++	goto return_err;
++      cleanup:
++	free_variables();
++      return_err:
++	return err;
++
++}
++
++static int
++gracl_init(struct gr_arg *args)
++{
++	int error = 0;
++
++	memcpy(gr_system_salt, args->salt, GR_SALT_LEN);
++	memcpy(gr_system_sum, args->sum, GR_SHA_LEN);
++
++	if (init_variables(args)) {
++		gr_log_str(GR_DONT_AUDIT_GOOD, GR_INITF_ACL_MSG, GR_VERSION);
++		error = -ENOMEM;
++		free_variables();
++		goto out;
++	}
++
++	error = copy_user_acl(args);
++	free_init_variables();
++	if (error) {
++		free_variables();
++		goto out;
++	}
++
++	if ((error = gr_set_acls(0))) {
++		free_variables();
++		goto out;
++	}
++
++	pax_open_kernel();
++	gr_status |= GR_READY;
++	pax_close_kernel();
++
++      out:
++	return error;
++}
++
++/* derived from glibc fnmatch(); 0: match, 1: no match */
++
++static int
++glob_match(const char *p, const char *n)
++{
++	char c;
++
++	while ((c = *p++) != '\0') {
++	switch (c) {
++		case '?':
++			if (*n == '\0')
++				return 1;
++			else if (*n == '/')
++				return 1;
++			break;
++		case '\\':
++			if (*n != c)
++				return 1;
++			break;
++		case '*':
++			for (c = *p++; c == '?' || c == '*'; c = *p++) {
++				if (*n == '/')
++					return 1;
++				else if (c == '?') {
++					if (*n == '\0')
++						return 1;
++					else
++						++n;
++				}
++			}
++			if (c == '\0') {
++				return 0;
++			} else {
++				const char *endp;
++
++				if ((endp = strchr(n, '/')) == NULL)
++					endp = n + strlen(n);
++
++				if (c == '[') {
++					for (--p; n < endp; ++n)
++						if (!glob_match(p, n))
++							return 0;
++				} else if (c == '/') {
++					while (*n != '\0' && *n != '/')
++						++n;
++					if (*n == '/' && !glob_match(p, n + 1))
++						return 0;
++				} else {
++					for (--p; n < endp; ++n)
++						if (*n == c && !glob_match(p, n))
++							return 0;
++				}
++
++				return 1;
++			}
++		case '[':
++			{
++			int not;
++			char cold;
++
++			if (*n == '\0' || *n == '/')
++				return 1;
++
++			not = (*p == '!'
|| *p == '^'); ++ if (not) ++ ++p; ++ ++ c = *p++; ++ for (;;) { ++ unsigned char fn = (unsigned char)*n; ++ ++ if (c == '\0') ++ return 1; ++ else { ++ if (c == fn) ++ goto matched; ++ cold = c; ++ c = *p++; ++ ++ if (c == '-' && *p != ']') { ++ unsigned char cend = *p++; ++ ++ if (cend == '\0') ++ return 1; ++ ++ if (cold <= fn && fn <= cend) ++ goto matched; ++ ++ c = *p++; ++ } ++ } ++ ++ if (c == ']') ++ break; ++ } ++ if (!not) ++ return 1; ++ break; ++ matched: ++ while (c != ']') { ++ if (c == '\0') ++ return 1; ++ ++ c = *p++; ++ } ++ if (not) ++ return 1; ++ } ++ break; ++ default: ++ if (c != *n) ++ return 1; ++ } ++ ++ ++n; ++ } ++ ++ if (*n == '\0') ++ return 0; ++ ++ if (*n == '/') ++ return 0; ++ ++ return 1; ++} ++ ++static struct acl_object_label * ++chk_glob_label(struct acl_object_label *globbed, ++ struct dentry *dentry, struct vfsmount *mnt, char **path) ++{ ++ struct acl_object_label *tmp; ++ ++ if (*path == NULL) ++ *path = gr_to_filename_nolock(dentry, mnt); ++ ++ tmp = globbed; ++ ++ while (tmp) { ++ if (!glob_match(tmp->filename, *path)) ++ return tmp; ++ tmp = tmp->next; ++ } ++ ++ return NULL; ++} ++ ++static struct acl_object_label * ++__full_lookup(const struct dentry *orig_dentry, const struct vfsmount *orig_mnt, ++ const ino_t curr_ino, const dev_t curr_dev, ++ const struct acl_subject_label *subj, char **path, const int checkglob) ++{ ++ struct acl_subject_label *tmpsubj; ++ struct acl_object_label *retval; ++ struct acl_object_label *retval2; ++ ++ tmpsubj = (struct acl_subject_label *) subj; ++ read_lock(&gr_inode_lock); ++ do { ++ retval = lookup_acl_obj_label(curr_ino, curr_dev, tmpsubj); ++ if (retval) { ++ if (checkglob && retval->globbed) { ++ retval2 = chk_glob_label(retval->globbed, (struct dentry *)orig_dentry, ++ (struct vfsmount *)orig_mnt, path); ++ if (retval2) ++ retval = retval2; ++ } ++ break; ++ } ++ } while ((tmpsubj = tmpsubj->parent_subject)); ++ read_unlock(&gr_inode_lock); ++ ++ return retval; ++} ++ ++static __inline__ struct acl_object_label * ++full_lookup(const struct dentry *orig_dentry, const struct vfsmount *orig_mnt, ++ const struct dentry *curr_dentry, ++ const struct acl_subject_label *subj, char **path, const int checkglob) ++{ ++ return __full_lookup(orig_dentry, orig_mnt, ++ curr_dentry->d_inode->i_ino, ++ curr_dentry->d_inode->i_sb->s_dev, subj, path, checkglob); ++} ++ ++static struct acl_object_label * ++__chk_obj_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, ++ const struct acl_subject_label *subj, char *path, const int checkglob) ++{ ++ struct dentry *dentry = (struct dentry *) l_dentry; ++ struct vfsmount *mnt = (struct vfsmount *) l_mnt; ++ struct acl_object_label *retval; ++ ++ spin_lock(&dcache_lock); ++ ++ if (unlikely(mnt == shm_mnt || mnt == pipe_mnt || mnt == sock_mnt || ++ /* ignore Eric Biederman */ ++ IS_PRIVATE(l_dentry->d_inode))) { ++ retval = fakefs_obj; ++ goto out; ++ } ++ ++ for (;;) { ++ if (dentry == real_root && mnt == real_root_mnt) ++ break; ++ ++ if (dentry == mnt->mnt_root || IS_ROOT(dentry)) { ++ if (mnt->mnt_parent == mnt) ++ break; ++ ++ retval = full_lookup(l_dentry, l_mnt, dentry, subj, &path, checkglob); ++ if (retval != NULL) ++ goto out; ++ ++ dentry = mnt->mnt_mountpoint; ++ mnt = mnt->mnt_parent; ++ continue; ++ } ++ ++ retval = full_lookup(l_dentry, l_mnt, dentry, subj, &path, checkglob); ++ if (retval != NULL) ++ goto out; ++ ++ dentry = dentry->d_parent; ++ } ++ ++ retval = full_lookup(l_dentry, l_mnt, dentry, subj, &path, checkglob); ++ ++ if (retval == 
NULL) ++ retval = full_lookup(l_dentry, l_mnt, real_root, subj, &path, checkglob); ++out: ++ spin_unlock(&dcache_lock); ++ return retval; ++} ++ ++static __inline__ struct acl_object_label * ++chk_obj_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, ++ const struct acl_subject_label *subj) ++{ ++ char *path = NULL; ++ return __chk_obj_label(l_dentry, l_mnt, subj, path, 1); ++} ++ ++static __inline__ struct acl_object_label * ++chk_obj_label_noglob(const struct dentry *l_dentry, const struct vfsmount *l_mnt, ++ const struct acl_subject_label *subj) ++{ ++ char *path = NULL; ++ return __chk_obj_label(l_dentry, l_mnt, subj, path, 0); ++} ++ ++static __inline__ struct acl_object_label * ++chk_obj_create_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, ++ const struct acl_subject_label *subj, char *path) ++{ ++ return __chk_obj_label(l_dentry, l_mnt, subj, path, 1); ++} ++ ++static struct acl_subject_label * ++chk_subj_label(const struct dentry *l_dentry, const struct vfsmount *l_mnt, ++ const struct acl_role_label *role) ++{ ++ struct dentry *dentry = (struct dentry *) l_dentry; ++ struct vfsmount *mnt = (struct vfsmount *) l_mnt; ++ struct acl_subject_label *retval; ++ ++ spin_lock(&dcache_lock); ++ ++ for (;;) { ++ if (dentry == real_root && mnt == real_root_mnt) ++ break; ++ if (dentry == mnt->mnt_root || IS_ROOT(dentry)) { ++ if (mnt->mnt_parent == mnt) ++ break; ++ ++ read_lock(&gr_inode_lock); ++ retval = ++ lookup_acl_subj_label(dentry->d_inode->i_ino, ++ dentry->d_inode->i_sb->s_dev, role); ++ read_unlock(&gr_inode_lock); ++ if (retval != NULL) ++ goto out; ++ ++ dentry = mnt->mnt_mountpoint; ++ mnt = mnt->mnt_parent; ++ continue; ++ } ++ ++ read_lock(&gr_inode_lock); ++ retval = lookup_acl_subj_label(dentry->d_inode->i_ino, ++ dentry->d_inode->i_sb->s_dev, role); ++ read_unlock(&gr_inode_lock); ++ if (retval != NULL) ++ goto out; ++ ++ dentry = dentry->d_parent; ++ } ++ ++ read_lock(&gr_inode_lock); ++ retval = lookup_acl_subj_label(dentry->d_inode->i_ino, ++ dentry->d_inode->i_sb->s_dev, role); ++ read_unlock(&gr_inode_lock); ++ ++ if (unlikely(retval == NULL)) { ++ read_lock(&gr_inode_lock); ++ retval = lookup_acl_subj_label(real_root->d_inode->i_ino, ++ real_root->d_inode->i_sb->s_dev, role); ++ read_unlock(&gr_inode_lock); ++ } ++out: ++ spin_unlock(&dcache_lock); ++ ++ return retval; ++} ++ ++static void ++gr_log_learn(const struct dentry *dentry, const struct vfsmount *mnt, const __u32 mode) ++{ ++ struct task_struct *task = current; ++ const struct cred *cred = current_cred(); ++ ++ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, task->role->roletype, ++ cred->uid, cred->gid, task->exec_file ? gr_to_filename1(task->exec_file->f_path.dentry, ++ task->exec_file->f_path.mnt) : task->acl->filename, task->acl->filename, ++ 1UL, 1UL, gr_to_filename(dentry, mnt), (unsigned long) mode, &task->signal->curr_ip); ++ ++ return; ++} ++ ++static void ++gr_log_learn_sysctl(const char *path, const __u32 mode) ++{ ++ struct task_struct *task = current; ++ const struct cred *cred = current_cred(); ++ ++ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, task->role->roletype, ++ cred->uid, cred->gid, task->exec_file ? 
gr_to_filename1(task->exec_file->f_path.dentry, ++ task->exec_file->f_path.mnt) : task->acl->filename, task->acl->filename, ++ 1UL, 1UL, path, (unsigned long) mode, &task->signal->curr_ip); ++ ++ return; ++} ++ ++static void ++gr_log_learn_id_change(const char type, const unsigned int real, ++ const unsigned int effective, const unsigned int fs) ++{ ++ struct task_struct *task = current; ++ const struct cred *cred = current_cred(); ++ ++ security_learn(GR_ID_LEARN_MSG, task->role->rolename, task->role->roletype, ++ cred->uid, cred->gid, task->exec_file ? gr_to_filename1(task->exec_file->f_path.dentry, ++ task->exec_file->f_path.mnt) : task->acl->filename, task->acl->filename, ++ type, real, effective, fs, &task->signal->curr_ip); ++ ++ return; ++} ++ ++__u32 ++gr_check_link(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt, ++ const struct dentry * old_dentry, const struct vfsmount * old_mnt) ++{ ++ struct acl_object_label *obj; ++ __u32 oldmode, newmode; ++ __u32 needmode; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return (GR_CREATE | GR_LINK); ++ ++ obj = chk_obj_label(old_dentry, old_mnt, current->acl); ++ oldmode = obj->mode; ++ ++ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) ++ oldmode |= (GR_CREATE | GR_LINK); ++ ++ needmode = GR_CREATE | GR_AUDIT_CREATE | GR_SUPPRESS; ++ if (old_dentry->d_inode->i_mode & (S_ISUID | S_ISGID)) ++ needmode |= GR_SETID | GR_AUDIT_SETID; ++ ++ newmode = ++ gr_check_create(new_dentry, parent_dentry, parent_mnt, ++ oldmode | needmode); ++ ++ needmode = newmode & (GR_FIND | GR_APPEND | GR_WRITE | GR_EXEC | ++ GR_SETID | GR_READ | GR_FIND | GR_DELETE | ++ GR_INHERIT | GR_AUDIT_INHERIT); ++ ++ if (old_dentry->d_inode->i_mode & (S_ISUID | S_ISGID) && !(newmode & GR_SETID)) ++ goto bad; ++ ++ if ((oldmode & needmode) != needmode) ++ goto bad; ++ ++ needmode = oldmode & (GR_NOPTRACE | GR_PTRACERD | GR_INHERIT | GR_AUDITS); ++ if ((newmode & needmode) != needmode) ++ goto bad; ++ ++ if ((newmode & (GR_CREATE | GR_LINK)) == (GR_CREATE | GR_LINK)) ++ return newmode; ++bad: ++ needmode = oldmode; ++ if (old_dentry->d_inode->i_mode & (S_ISUID | S_ISGID)) ++ needmode |= GR_SETID; ++ ++ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) { ++ gr_log_learn(old_dentry, old_mnt, needmode); ++ return (GR_CREATE | GR_LINK); ++ } else if (newmode & GR_SUPPRESS) ++ return GR_SUPPRESS; ++ else ++ return 0; ++} ++ ++__u32 ++gr_search_file(const struct dentry * dentry, const __u32 mode, ++ const struct vfsmount * mnt) ++{ ++ __u32 retval = mode; ++ struct acl_subject_label *curracl; ++ struct acl_object_label *currobj; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return (mode & ~GR_AUDITS); ++ ++ curracl = current->acl; ++ ++ currobj = chk_obj_label(dentry, mnt, curracl); ++ retval = currobj->mode & mode; ++ ++ if (unlikely ++ ((curracl->mode & (GR_LEARN | GR_INHERITLEARN)) && !(mode & GR_NOPTRACE) ++ && (retval != (mode & ~(GR_AUDITS | GR_SUPPRESS))))) { ++ __u32 new_mode = mode; ++ ++ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); ++ ++ retval = new_mode; ++ ++ if (new_mode & GR_EXEC && curracl->mode & GR_INHERITLEARN) ++ new_mode |= GR_INHERIT; ++ ++ if (!(mode & GR_NOLEARN)) ++ gr_log_learn(dentry, mnt, new_mode); ++ } ++ ++ return retval; ++} ++ ++__u32 ++gr_check_create(const struct dentry * new_dentry, const struct dentry * parent, ++ const struct vfsmount * mnt, const __u32 mode) ++{ ++ struct name_entry *match; ++ struct acl_object_label *matchpo; ++ struct acl_subject_label *curracl; ++ char 
*path; ++ __u32 retval; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return (mode & ~GR_AUDITS); ++ ++ preempt_disable(); ++ path = gr_to_filename_rbac(new_dentry, mnt); ++ match = lookup_name_entry_create(path); ++ ++ if (!match) ++ goto check_parent; ++ ++ curracl = current->acl; ++ ++ read_lock(&gr_inode_lock); ++ matchpo = lookup_acl_obj_label_create(match->inode, match->device, curracl); ++ read_unlock(&gr_inode_lock); ++ ++ if (matchpo) { ++ if ((matchpo->mode & mode) != ++ (mode & ~(GR_AUDITS | GR_SUPPRESS)) ++ && curracl->mode & (GR_LEARN | GR_INHERITLEARN)) { ++ __u32 new_mode = mode; ++ ++ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); ++ ++ gr_log_learn(new_dentry, mnt, new_mode); ++ ++ preempt_enable(); ++ return new_mode; ++ } ++ preempt_enable(); ++ return (matchpo->mode & mode); ++ } ++ ++ check_parent: ++ curracl = current->acl; ++ ++ matchpo = chk_obj_create_label(parent, mnt, curracl, path); ++ retval = matchpo->mode & mode; ++ ++ if ((retval != (mode & ~(GR_AUDITS | GR_SUPPRESS))) ++ && (curracl->mode & (GR_LEARN | GR_INHERITLEARN))) { ++ __u32 new_mode = mode; ++ ++ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); ++ ++ gr_log_learn(new_dentry, mnt, new_mode); ++ preempt_enable(); ++ return new_mode; ++ } ++ ++ preempt_enable(); ++ return retval; ++} ++ ++int ++gr_check_hidden_task(const struct task_struct *task) ++{ ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++ ++ if (!(task->acl->mode & GR_PROCFIND) && !(current->acl->mode & GR_VIEW)) ++ return 1; ++ ++ return 0; ++} ++ ++int ++gr_check_protected_task(const struct task_struct *task) ++{ ++ if (unlikely(!(gr_status & GR_READY) || !task)) ++ return 0; ++ ++ if ((task->acl->mode & GR_PROTECTED) && !(current->acl->mode & GR_KILL) && ++ task->acl != current->acl) ++ return 1; ++ ++ return 0; ++} ++ ++void ++gr_copy_label(struct task_struct *tsk) ++{ ++ tsk->signal->used_accept = 0; ++ tsk->acl_sp_role = 0; ++ tsk->acl_role_id = current->acl_role_id; ++ tsk->acl = current->acl; ++ tsk->role = current->role; ++ tsk->signal->curr_ip = current->signal->curr_ip; ++ if (current->exec_file) ++ get_file(current->exec_file); ++ tsk->exec_file = current->exec_file; ++ tsk->is_writable = current->is_writable; ++ if (unlikely(current->signal->used_accept)) ++ current->signal->curr_ip = 0; ++ ++ return; ++} ++ ++static void ++gr_set_proc_res(struct task_struct *task) ++{ ++ struct acl_subject_label *proc; ++ unsigned short i; ++ ++ proc = task->acl; ++ ++ if (proc->mode & (GR_LEARN | GR_INHERITLEARN)) ++ return; ++ ++ for (i = 0; i < RLIM_NLIMITS; i++) { ++ if (!(proc->resmask & (1 << i))) ++ continue; ++ ++ task->signal->rlim[i].rlim_cur = proc->res[i].rlim_cur; ++ task->signal->rlim[i].rlim_max = proc->res[i].rlim_max; ++ } ++ ++ return; ++} ++ ++int ++gr_check_user_change(int real, int effective, int fs) ++{ ++ unsigned int i; ++ __u16 num; ++ uid_t *uidlist; ++ int curuid; ++ int realok = 0; ++ int effectiveok = 0; ++ int fsok = 0; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++ ++ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) ++ gr_log_learn_id_change('u', real, effective, fs); ++ ++ num = current->acl->user_trans_num; ++ uidlist = current->acl->user_transitions; ++ ++ if (uidlist == NULL) ++ return 0; ++ ++ if (real == -1) ++ realok = 1; ++ if (effective == -1) ++ effectiveok = 1; ++ if (fs == -1) ++ fsok = 1; ++ ++ if (current->acl->user_trans_type & GR_ID_ALLOW) { ++ for (i = 0; i < num; i++) { ++ curuid = (int)uidlist[i]; ++ if (real == curuid) ++ realok = 1; ++ if (effective == curuid) ++ effectiveok = 1; 
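
The uid/gid transition checks in gr_check_user_change() and gr_check_group_change() come down to list membership with opposite polarity for GR_ID_ALLOW and GR_ID_DENY lists: a listed id is permitted by an allow-list and refused by a deny-list, and an unlisted id gets the inverse. A minimal userspace sketch of that decision, simplified to a single id; the enum, transition_ok(), and the sample values below are illustrative, not from the patch:

/* gcc -o trans trans.c && ./trans */
#include <stdio.h>

enum trans_type { TRANS_ALLOW, TRANS_DENY };

/* returns 1 when switching to `target` is permitted by the list */
static int transition_ok(enum trans_type type, const unsigned int *list,
                         unsigned int num, unsigned int target)
{
	unsigned int i;

	for (i = 0; i < num; i++)
		if (list[i] == target)
			/* listed: permitted for an allow-list, refused for a deny-list */
			return type == TRANS_ALLOW;

	/* unlisted: the polarity flips */
	return type == TRANS_DENY;
}

int main(void)
{
	unsigned int ids[] = { 33, 48 };

	printf("%d\n", transition_ok(TRANS_ALLOW, ids, 2, 33)); /* 1 */
	printf("%d\n", transition_ok(TRANS_ALLOW, ids, 2, 99)); /* 0 */
	printf("%d\n", transition_ok(TRANS_DENY,  ids, 2, 33)); /* 0 */
	return 0;
}

The kernel version applies this to the real, effective, and fs ids together, with -1 standing for "unchanged", and logs before refusing.
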
++ if (fs == curuid) ++ fsok = 1; ++ } ++ } else if (current->acl->user_trans_type & GR_ID_DENY) { ++ for (i = 0; i < num; i++) { ++ curuid = (int)uidlist[i]; ++ if (real == curuid) ++ break; ++ if (effective == curuid) ++ break; ++ if (fs == curuid) ++ break; ++ } ++ /* not in deny list */ ++ if (i == num) { ++ realok = 1; ++ effectiveok = 1; ++ fsok = 1; ++ } ++ } ++ ++ if (realok && effectiveok && fsok) ++ return 0; ++ else { ++ gr_log_int(GR_DONT_AUDIT, GR_USRCHANGE_ACL_MSG, realok ? (effectiveok ? (fsok ? 0 : fs) : effective) : real); ++ return 1; ++ } ++} ++ ++int ++gr_check_group_change(int real, int effective, int fs) ++{ ++ unsigned int i; ++ __u16 num; ++ gid_t *gidlist; ++ int curgid; ++ int realok = 0; ++ int effectiveok = 0; ++ int fsok = 0; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++ ++ if (current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) ++ gr_log_learn_id_change('g', real, effective, fs); ++ ++ num = current->acl->group_trans_num; ++ gidlist = current->acl->group_transitions; ++ ++ if (gidlist == NULL) ++ return 0; ++ ++ if (real == -1) ++ realok = 1; ++ if (effective == -1) ++ effectiveok = 1; ++ if (fs == -1) ++ fsok = 1; ++ ++ if (current->acl->group_trans_type & GR_ID_ALLOW) { ++ for (i = 0; i < num; i++) { ++ curgid = (int)gidlist[i]; ++ if (real == curgid) ++ realok = 1; ++ if (effective == curgid) ++ effectiveok = 1; ++ if (fs == curgid) ++ fsok = 1; ++ } ++ } else if (current->acl->group_trans_type & GR_ID_DENY) { ++ for (i = 0; i < num; i++) { ++ curgid = (int)gidlist[i]; ++ if (real == curgid) ++ break; ++ if (effective == curgid) ++ break; ++ if (fs == curgid) ++ break; ++ } ++ /* not in deny list */ ++ if (i == num) { ++ realok = 1; ++ effectiveok = 1; ++ fsok = 1; ++ } ++ } ++ ++ if (realok && effectiveok && fsok) ++ return 0; ++ else { ++ gr_log_int(GR_DONT_AUDIT, GR_GRPCHANGE_ACL_MSG, realok ? (effectiveok ? (fsok ? 
0 : fs) : effective) : real); ++ return 1; ++ } ++} ++ ++void ++gr_set_role_label(struct task_struct *task, const uid_t uid, const uid_t gid) ++{ ++ struct acl_role_label *role = task->role; ++ struct acl_subject_label *subj = NULL; ++ struct acl_object_label *obj; ++ struct file *filp; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return; ++ ++ filp = task->exec_file; ++ ++ /* kernel process, we'll give them the kernel role */ ++ if (unlikely(!filp)) { ++ task->role = kernel_role; ++ task->acl = kernel_role->root_label; ++ return; ++ } else if (!task->role || !(task->role->roletype & GR_ROLE_SPECIAL)) ++ role = lookup_acl_role_label(task, uid, gid); ++ ++ /* perform subject lookup in possibly new role ++ we can use this result below in the case where role == task->role ++ */ ++ subj = chk_subj_label(filp->f_path.dentry, filp->f_path.mnt, role); ++ ++ /* if we changed uid/gid, but result in the same role ++ and are using inheritance, don't lose the inherited subject ++ if current subject is other than what normal lookup ++ would result in, we arrived via inheritance, don't ++ lose subject ++ */ ++ if (role != task->role || (!(task->acl->mode & GR_INHERITLEARN) && ++ (subj == task->acl))) ++ task->acl = subj; ++ ++ task->role = role; ++ ++ task->is_writable = 0; ++ ++ /* ignore additional mmap checks for processes that are writable ++ by the default ACL */ ++ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ task->is_writable = 1; ++ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, task->role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ task->is_writable = 1; ++ ++#ifdef CONFIG_GRKERNSEC_ACL_DEBUG ++ printk(KERN_ALERT "Set role label for (%s:%d): role:%s, subject:%s\n", task->comm, task->pid, task->role->rolename, task->acl->filename); ++#endif ++ ++ gr_set_proc_res(task); ++ ++ return; ++} ++ ++int ++gr_set_proc_label(const struct dentry *dentry, const struct vfsmount *mnt, ++ const int unsafe_share) ++{ ++ struct task_struct *task = current; ++ struct acl_subject_label *newacl; ++ struct acl_object_label *obj; ++ __u32 retmode; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++ ++ newacl = chk_subj_label(dentry, mnt, task->role); ++ ++ task_lock(task); ++ if ((((task->ptrace & PT_PTRACED) || unsafe_share) && ++ !(task->acl->mode & GR_POVERRIDE) && (task->acl != newacl) && ++ !(task->role->roletype & GR_ROLE_GOD) && ++ !gr_search_file(dentry, GR_PTRACERD, mnt) && ++ !(task->acl->mode & (GR_LEARN | GR_INHERITLEARN)))) { ++ task_unlock(task); ++ if (unsafe_share) ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_UNSAFESHARE_EXEC_ACL_MSG, dentry, mnt); ++ else ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_PTRACE_EXEC_ACL_MSG, dentry, mnt); ++ return -EACCES; ++ } ++ task_unlock(task); ++ ++ obj = chk_obj_label(dentry, mnt, task->acl); ++ retmode = obj->mode & (GR_INHERIT | GR_AUDIT_INHERIT); ++ ++ if (!(task->acl->mode & GR_INHERITLEARN) && ++ ((newacl->mode & GR_LEARN) || !(retmode & GR_INHERIT))) { ++ if (obj->nested) ++ task->acl = obj->nested; ++ else ++ task->acl = newacl; ++ } else if (retmode & GR_INHERIT && retmode & GR_AUDIT_INHERIT) ++ gr_log_str_fs(GR_DO_AUDIT, GR_INHERIT_ACL_MSG, task->acl->filename, dentry, mnt); ++ ++ task->is_writable = 0; ++ ++ /* ignore additional mmap checks for processes that are writable ++ by the default ACL */ ++ obj = chk_obj_label(dentry, mnt, default_role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ task->is_writable = 1; ++ obj = 
chk_obj_label(dentry, mnt, task->role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ task->is_writable = 1; ++ ++ gr_set_proc_res(task); ++ ++#ifdef CONFIG_GRKERNSEC_ACL_DEBUG ++ printk(KERN_ALERT "Set subject label for (%s:%d): role:%s, subject:%s\n", task->comm, task->pid, task->role->rolename, task->acl->filename); ++#endif ++ return 0; ++} ++ ++/* always called with valid inodev ptr */ ++static void ++do_handle_delete(struct inodev_entry *inodev, const ino_t ino, const dev_t dev) ++{ ++ struct acl_object_label *matchpo; ++ struct acl_subject_label *matchps; ++ struct acl_subject_label *subj; ++ struct acl_role_label *role; ++ unsigned int x; ++ ++ FOR_EACH_ROLE_START(role) ++ FOR_EACH_SUBJECT_START(role, subj, x) ++ if ((matchpo = lookup_acl_obj_label(ino, dev, subj)) != NULL) ++ matchpo->mode |= GR_DELETED; ++ FOR_EACH_SUBJECT_END(subj,x) ++ FOR_EACH_NESTED_SUBJECT_START(role, subj) ++ if (subj->inode == ino && subj->device == dev) ++ subj->mode |= GR_DELETED; ++ FOR_EACH_NESTED_SUBJECT_END(subj) ++ if ((matchps = lookup_acl_subj_label(ino, dev, role)) != NULL) ++ matchps->mode |= GR_DELETED; ++ FOR_EACH_ROLE_END(role) ++ ++ inodev->nentry->deleted = 1; ++ ++ return; ++} ++ ++void ++gr_handle_delete(const ino_t ino, const dev_t dev) ++{ ++ struct inodev_entry *inodev; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return; ++ ++ write_lock(&gr_inode_lock); ++ inodev = lookup_inodev_entry(ino, dev); ++ if (inodev != NULL) ++ do_handle_delete(inodev, ino, dev); ++ write_unlock(&gr_inode_lock); ++ ++ return; ++} ++ ++static void ++update_acl_obj_label(const ino_t oldinode, const dev_t olddevice, ++ const ino_t newinode, const dev_t newdevice, ++ struct acl_subject_label *subj) ++{ ++ unsigned int index = fhash(oldinode, olddevice, subj->obj_hash_size); ++ struct acl_object_label *match; ++ ++ match = subj->obj_hash[index]; ++ ++ while (match && (match->inode != oldinode || ++ match->device != olddevice || ++ !(match->mode & GR_DELETED))) ++ match = match->next; ++ ++ if (match && (match->inode == oldinode) ++ && (match->device == olddevice) ++ && (match->mode & GR_DELETED)) { ++ if (match->prev == NULL) { ++ subj->obj_hash[index] = match->next; ++ if (match->next != NULL) ++ match->next->prev = NULL; ++ } else { ++ match->prev->next = match->next; ++ if (match->next != NULL) ++ match->next->prev = match->prev; ++ } ++ match->prev = NULL; ++ match->next = NULL; ++ match->inode = newinode; ++ match->device = newdevice; ++ match->mode &= ~GR_DELETED; ++ ++ insert_acl_obj_label(match, subj); ++ } ++ ++ return; ++} ++ ++static void ++update_acl_subj_label(const ino_t oldinode, const dev_t olddevice, ++ const ino_t newinode, const dev_t newdevice, ++ struct acl_role_label *role) ++{ ++ unsigned int index = fhash(oldinode, olddevice, role->subj_hash_size); ++ struct acl_subject_label *match; ++ ++ match = role->subj_hash[index]; ++ ++ while (match && (match->inode != oldinode || ++ match->device != olddevice || ++ !(match->mode & GR_DELETED))) ++ match = match->next; ++ ++ if (match && (match->inode == oldinode) ++ && (match->device == olddevice) ++ && (match->mode & GR_DELETED)) { ++ if (match->prev == NULL) { ++ role->subj_hash[index] = match->next; ++ if (match->next != NULL) ++ match->next->prev = NULL; ++ } else { ++ match->prev->next = match->next; ++ if (match->next != NULL) ++ match->next->prev = match->prev; ++ } ++ match->prev = NULL; ++ match->next = NULL; ++ match->inode = newinode; ++ match->device = newdevice; ++ match->mode &= ~GR_DELETED; ++ ++ 
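
The update_acl_obj_label()/update_acl_subj_label()/update_inodev_entry() helpers in this area all follow one unlink-and-rehash pattern: find the matching deleted entry, take it out of its doubly linked bucket chain (fixing up both neighbours, with a special case for the bucket head), rewrite its inode/device key, and reinsert it at the head of the bucket the new key hashes to. A self-contained userspace sketch of the pattern; all names below are illustrative stand-ins:

#include <stdio.h>

struct node {
	unsigned long key;
	struct node *prev, *next;
};

#define NBUCKETS 7
static struct node *buckets[NBUCKETS];

static void insert_head(struct node *n)
{
	struct node **head = &buckets[n->key % NBUCKETS];

	n->prev = NULL;
	if (*head)
		(*head)->prev = n;
	n->next = *head;
	*head = n;
}

static void rekey(struct node *n, unsigned long newkey)
{
	/* unlink from the old chain, fixing up both neighbours */
	if (n->prev == NULL)
		buckets[n->key % NBUCKETS] = n->next;
	else
		n->prev->next = n->next;
	if (n->next)
		n->next->prev = n->prev;

	n->key = newkey;
	insert_head(n);		/* reinsert under the new key */
}

int main(void)
{
	struct node a = { .key = 3 };

	insert_head(&a);
	rekey(&a, 12);
	printf("bucket %lu\n", a.key % NBUCKETS);	/* 5 */
	return 0;
}
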
insert_acl_subj_label(match, role); ++ } ++ ++ return; ++} ++ ++static void ++update_inodev_entry(const ino_t oldinode, const dev_t olddevice, ++ const ino_t newinode, const dev_t newdevice) ++{ ++ unsigned int index = fhash(oldinode, olddevice, inodev_set.i_size); ++ struct inodev_entry *match; ++ ++ match = inodev_set.i_hash[index]; ++ ++ while (match && (match->nentry->inode != oldinode || ++ match->nentry->device != olddevice || !match->nentry->deleted)) ++ match = match->next; ++ ++ if (match && (match->nentry->inode == oldinode) ++ && (match->nentry->device == olddevice) && ++ match->nentry->deleted) { ++ if (match->prev == NULL) { ++ inodev_set.i_hash[index] = match->next; ++ if (match->next != NULL) ++ match->next->prev = NULL; ++ } else { ++ match->prev->next = match->next; ++ if (match->next != NULL) ++ match->next->prev = match->prev; ++ } ++ match->prev = NULL; ++ match->next = NULL; ++ match->nentry->inode = newinode; ++ match->nentry->device = newdevice; ++ match->nentry->deleted = 0; ++ ++ insert_inodev_entry(match); ++ } ++ ++ return; ++} ++ ++static void ++do_handle_create(const struct name_entry *matchn, const struct dentry *dentry, ++ const struct vfsmount *mnt) ++{ ++ struct acl_subject_label *subj; ++ struct acl_role_label *role; ++ unsigned int x; ++ ++ FOR_EACH_ROLE_START(role) ++ update_acl_subj_label(matchn->inode, matchn->device, ++ dentry->d_inode->i_ino, ++ dentry->d_inode->i_sb->s_dev, role); ++ ++ FOR_EACH_NESTED_SUBJECT_START(role, subj) ++ if ((subj->inode == dentry->d_inode->i_ino) && ++ (subj->device == dentry->d_inode->i_sb->s_dev)) { ++ subj->inode = dentry->d_inode->i_ino; ++ subj->device = dentry->d_inode->i_sb->s_dev; ++ } ++ FOR_EACH_NESTED_SUBJECT_END(subj) ++ FOR_EACH_SUBJECT_START(role, subj, x) ++ update_acl_obj_label(matchn->inode, matchn->device, ++ dentry->d_inode->i_ino, ++ dentry->d_inode->i_sb->s_dev, subj); ++ FOR_EACH_SUBJECT_END(subj,x) ++ FOR_EACH_ROLE_END(role) ++ ++ update_inodev_entry(matchn->inode, matchn->device, ++ dentry->d_inode->i_ino, dentry->d_inode->i_sb->s_dev); ++ ++ return; ++} ++ ++void ++gr_handle_create(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ struct name_entry *matchn; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return; ++ ++ preempt_disable(); ++ matchn = lookup_name_entry(gr_to_filename_rbac(dentry, mnt)); ++ ++ if (unlikely((unsigned long)matchn)) { ++ write_lock(&gr_inode_lock); ++ do_handle_create(matchn, dentry, mnt); ++ write_unlock(&gr_inode_lock); ++ } ++ preempt_enable(); ++ ++ return; ++} ++ ++void ++gr_handle_rename(struct inode *old_dir, struct inode *new_dir, ++ struct dentry *old_dentry, ++ struct dentry *new_dentry, ++ struct vfsmount *mnt, const __u8 replace) ++{ ++ struct name_entry *matchn; ++ struct inodev_entry *inodev; ++ ++ /* vfs_rename swaps the name and parent link for old_dentry and ++ new_dentry ++ at this point, old_dentry has the new name, parent link, and inode ++ for the renamed file ++ if a file is being replaced by a rename, new_dentry has the inode ++ and name for the replaced file ++ */ ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return; ++ ++ preempt_disable(); ++ matchn = lookup_name_entry(gr_to_filename_rbac(old_dentry, mnt)); ++ ++ /* we wouldn't have to check d_inode if it weren't for ++ NFS silly-renaming ++ */ ++ ++ write_lock(&gr_inode_lock); ++ if (unlikely(replace && new_dentry->d_inode)) { ++ inodev = lookup_inodev_entry(new_dentry->d_inode->i_ino, ++ new_dentry->d_inode->i_sb->s_dev); ++ if (inodev != NULL && (new_dentry->d_inode->i_nlink 
<= 1)) ++ do_handle_delete(inodev, new_dentry->d_inode->i_ino, ++ new_dentry->d_inode->i_sb->s_dev); ++ } ++ ++ inodev = lookup_inodev_entry(old_dentry->d_inode->i_ino, ++ old_dentry->d_inode->i_sb->s_dev); ++ if (inodev != NULL && (old_dentry->d_inode->i_nlink <= 1)) ++ do_handle_delete(inodev, old_dentry->d_inode->i_ino, ++ old_dentry->d_inode->i_sb->s_dev); ++ ++ if (unlikely((unsigned long)matchn)) ++ do_handle_create(matchn, old_dentry, mnt); ++ ++ write_unlock(&gr_inode_lock); ++ preempt_enable(); ++ ++ return; ++} ++ ++static int ++lookup_special_role_auth(__u16 mode, const char *rolename, unsigned char **salt, ++ unsigned char **sum) ++{ ++ struct acl_role_label *r; ++ struct role_allowed_ip *ipp; ++ struct role_transition *trans; ++ unsigned int i; ++ int found = 0; ++ ++ /* check transition table */ ++ ++ for (trans = current->role->transitions; trans; trans = trans->next) { ++ if (!strcmp(rolename, trans->rolename)) { ++ found = 1; ++ break; ++ } ++ } ++ ++ if (!found) ++ return 0; ++ ++ /* handle special roles that do not require authentication ++ and check ip */ ++ ++ FOR_EACH_ROLE_START(r) ++ if (!strcmp(rolename, r->rolename) && ++ (r->roletype & GR_ROLE_SPECIAL)) { ++ found = 0; ++ if (r->allowed_ips != NULL) { ++ for (ipp = r->allowed_ips; ipp; ipp = ipp->next) { ++ if ((ntohl(current->signal->curr_ip) & ipp->netmask) == ++ (ntohl(ipp->addr) & ipp->netmask)) ++ found = 1; ++ } ++ } else ++ found = 2; ++ if (!found) ++ return 0; ++ ++ if (((mode == GR_SPROLE) && (r->roletype & GR_ROLE_NOPW)) || ++ ((mode == GR_SPROLEPAM) && (r->roletype & GR_ROLE_PAM))) { ++ *salt = NULL; ++ *sum = NULL; ++ return 1; ++ } ++ } ++ FOR_EACH_ROLE_END(r) ++ ++ for (i = 0; i < num_sprole_pws; i++) { ++ if (!strcmp(rolename, acl_special_roles[i]->rolename)) { ++ *salt = acl_special_roles[i]->salt; ++ *sum = acl_special_roles[i]->sum; ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++assign_special_role(char *rolename) ++{ ++ struct acl_object_label *obj; ++ struct acl_role_label *r; ++ struct acl_role_label *assigned = NULL; ++ struct task_struct *tsk; ++ struct file *filp; ++ ++ FOR_EACH_ROLE_START(r) ++ if (!strcmp(rolename, r->rolename) && ++ (r->roletype & GR_ROLE_SPECIAL)) { ++ assigned = r; ++ break; ++ } ++ FOR_EACH_ROLE_END(r) ++ ++ if (!assigned) ++ return; ++ ++ read_lock(&tasklist_lock); ++ read_lock(&grsec_exec_file_lock); ++ ++ tsk = current->parent; ++ if (tsk == NULL) ++ goto out_unlock; ++ ++ filp = tsk->exec_file; ++ if (filp == NULL) ++ goto out_unlock; ++ ++ tsk->is_writable = 0; ++ ++ tsk->acl_sp_role = 1; ++ tsk->acl_role_id = ++acl_sp_role_value; ++ tsk->role = assigned; ++ tsk->acl = chk_subj_label(filp->f_path.dentry, filp->f_path.mnt, tsk->role); ++ ++ /* ignore additional mmap checks for processes that are writable ++ by the default ACL */ ++ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ tsk->is_writable = 1; ++ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, tsk->role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ tsk->is_writable = 1; ++ ++#ifdef CONFIG_GRKERNSEC_ACL_DEBUG ++ printk(KERN_ALERT "Assigning special role:%s subject:%s to process (%s:%d)\n", tsk->role->rolename, tsk->acl->filename, tsk->comm, tsk->pid); ++#endif ++ ++out_unlock: ++ read_unlock(&grsec_exec_file_lock); ++ read_unlock(&tasklist_lock); ++ return; ++} ++ ++int gr_check_secure_terminal(struct task_struct *task) ++{ ++ struct task_struct *p, *p2, *p3; ++ struct files_struct *files; ++ 
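
The allowed-IP test in lookup_special_role_auth() above is a plain masked comparison: a client address matches an entry when the two addresses agree on every bit covered by the netmask. A userspace illustration of the same test; ip_in_net() and the sample addresses are ours, not from the patch:

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <arpa/inet.h>

static int ip_in_net(uint32_t addr_be, uint32_t net_be, uint32_t mask_host)
{
	/* the patch compares ntohl(curr_ip) & netmask against
	   ntohl(entry->addr) & netmask; same idea here */
	return (ntohl(addr_be) & mask_host) == (ntohl(net_be) & mask_host);
}

int main(void)
{
	uint32_t client, net;

	inet_pton(AF_INET, "192.168.1.42", &client);
	inet_pton(AF_INET, "192.168.1.0", &net);

	printf("%d\n", ip_in_net(client, net, 0xffffff00u)); /* 1: inside the /24 */
	printf("%d\n", ip_in_net(client, net, 0xffffffffu)); /* 0: exact match only */
	return 0;
}
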
struct fdtable *fdt; ++ struct file *our_file = NULL, *file; ++ int i; ++ ++ if (task->signal->tty == NULL) ++ return 1; ++ ++ files = get_files_struct(task); ++ if (files != NULL) { ++ rcu_read_lock(); ++ fdt = files_fdtable(files); ++ for (i=0; i < fdt->max_fds; i++) { ++ file = fcheck_files(files, i); ++ if (file && (our_file == NULL) && (file->private_data == task->signal->tty)) { ++ get_file(file); ++ our_file = file; ++ } ++ } ++ rcu_read_unlock(); ++ put_files_struct(files); ++ } ++ ++ if (our_file == NULL) ++ return 1; ++ ++ read_lock(&tasklist_lock); ++ do_each_thread(p2, p) { ++ files = get_files_struct(p); ++ if (files == NULL || ++ (p->signal && p->signal->tty == task->signal->tty)) { ++ if (files != NULL) ++ put_files_struct(files); ++ continue; ++ } ++ rcu_read_lock(); ++ fdt = files_fdtable(files); ++ for (i=0; i < fdt->max_fds; i++) { ++ file = fcheck_files(files, i); ++ if (file && S_ISCHR(file->f_path.dentry->d_inode->i_mode) && ++ file->f_path.dentry->d_inode->i_rdev == our_file->f_path.dentry->d_inode->i_rdev) { ++ p3 = task; ++ while (p3->pid > 0) { ++ if (p3 == p) ++ break; ++ p3 = p3->parent; ++ } ++ if (p3 == p) ++ break; ++ gr_log_ttysniff(GR_DONT_AUDIT_GOOD, GR_TTYSNIFF_ACL_MSG, p); ++ gr_handle_alertkill(p); ++ rcu_read_unlock(); ++ put_files_struct(files); ++ read_unlock(&tasklist_lock); ++ fput(our_file); ++ return 0; ++ } ++ } ++ rcu_read_unlock(); ++ put_files_struct(files); ++ } while_each_thread(p2, p); ++ read_unlock(&tasklist_lock); ++ ++ fput(our_file); ++ return 1; ++} ++ ++ssize_t ++write_grsec_handler(struct file *file, const char * buf, size_t count, loff_t *ppos) ++{ ++ struct gr_arg_wrapper uwrap; ++ unsigned char *sprole_salt = NULL; ++ unsigned char *sprole_sum = NULL; ++ int error = sizeof (struct gr_arg_wrapper); ++ int error2 = 0; ++ ++ down(&gr_dev_sem); ++ ++ if ((gr_status & GR_READY) && !(current->acl->mode & GR_KERNELAUTH)) { ++ error = -EPERM; ++ goto out; ++ } ++ ++ if (count != sizeof (struct gr_arg_wrapper)) { ++ gr_log_int_int(GR_DONT_AUDIT_GOOD, GR_DEV_ACL_MSG, (int)count, (int)sizeof(struct gr_arg_wrapper)); ++ error = -EINVAL; ++ goto out; ++ } ++ ++ ++ if (gr_auth_expires && time_after_eq(get_seconds(), gr_auth_expires)) { ++ gr_auth_expires = 0; ++ gr_auth_attempts = 0; ++ } ++ ++ if (copy_from_user(&uwrap, buf, sizeof (struct gr_arg_wrapper))) { ++ error = -EFAULT; ++ goto out; ++ } ++ ++ if ((uwrap.version != GRSECURITY_VERSION) || (uwrap.size != sizeof(struct gr_arg))) { ++ error = -EINVAL; ++ goto out; ++ } ++ ++ if (copy_from_user(gr_usermode, uwrap.arg, sizeof (struct gr_arg))) { ++ error = -EFAULT; ++ goto out; ++ } ++ ++ if (gr_usermode->mode != GR_SPROLE && gr_usermode->mode != GR_SPROLEPAM && ++ gr_auth_attempts >= CONFIG_GRKERNSEC_ACL_MAXTRIES && ++ time_after(gr_auth_expires, get_seconds())) { ++ error = -EBUSY; ++ goto out; ++ } ++ ++ /* if non-root trying to do anything other than use a special role, ++ do not attempt authentication, do not count towards authentication ++ locking ++ */ ++ ++ if (gr_usermode->mode != GR_SPROLE && gr_usermode->mode != GR_STATUS && ++ gr_usermode->mode != GR_UNSPROLE && gr_usermode->mode != GR_SPROLEPAM && ++ current_uid()) { ++ error = -EPERM; ++ goto out; ++ } ++ ++ /* ensure pw and special role name are null terminated */ ++ ++ gr_usermode->pw[GR_PW_LEN - 1] = '\0'; ++ gr_usermode->sp_role[GR_SPROLE_LEN - 1] = '\0'; ++ ++ /* Okay. ++ * We have our enough of the argument structure..(we have yet ++ * to copy_from_user the tables themselves) . 
Copy the tables ++ * only if we need them, i.e. for loading operations. */ ++ ++ switch (gr_usermode->mode) { ++ case GR_STATUS: ++ if (gr_status & GR_READY) { ++ error = 1; ++ if (!gr_check_secure_terminal(current)) ++ error = 3; ++ } else ++ error = 2; ++ goto out; ++ case GR_SHUTDOWN: ++ if ((gr_status & GR_READY) ++ && !(chkpw(gr_usermode, gr_system_salt, gr_system_sum))) { ++ pax_open_kernel(); ++ gr_status &= ~GR_READY; ++ pax_close_kernel(); ++ ++ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SHUTS_ACL_MSG); ++ free_variables(); ++ memset(gr_usermode, 0, sizeof (struct gr_arg)); ++ memset(gr_system_salt, 0, GR_SALT_LEN); ++ memset(gr_system_sum, 0, GR_SHA_LEN); ++ } else if (gr_status & GR_READY) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_SHUTF_ACL_MSG); ++ error = -EPERM; ++ } else { ++ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SHUTI_ACL_MSG); ++ error = -EAGAIN; ++ } ++ break; ++ case GR_ENABLE: ++ if (!(gr_status & GR_READY) && !(error2 = gracl_init(gr_usermode))) ++ gr_log_str(GR_DONT_AUDIT_GOOD, GR_ENABLE_ACL_MSG, GR_VERSION); ++ else { ++ if (gr_status & GR_READY) ++ error = -EAGAIN; ++ else ++ error = error2; ++ gr_log_str(GR_DONT_AUDIT, GR_ENABLEF_ACL_MSG, GR_VERSION); ++ } ++ break; ++ case GR_RELOAD: ++ if (!(gr_status & GR_READY)) { ++ gr_log_str(GR_DONT_AUDIT_GOOD, GR_RELOADI_ACL_MSG, GR_VERSION); ++ error = -EAGAIN; ++ } else if (!(chkpw(gr_usermode, gr_system_salt, gr_system_sum))) { ++ lock_kernel(); ++ ++ pax_open_kernel(); ++ gr_status &= ~GR_READY; ++ pax_close_kernel(); ++ ++ free_variables(); ++ if (!(error2 = gracl_init(gr_usermode))) { ++ unlock_kernel(); ++ gr_log_str(GR_DONT_AUDIT_GOOD, GR_RELOAD_ACL_MSG, GR_VERSION); ++ } else { ++ unlock_kernel(); ++ error = error2; ++ gr_log_str(GR_DONT_AUDIT, GR_RELOADF_ACL_MSG, GR_VERSION); ++ } ++ } else { ++ gr_log_str(GR_DONT_AUDIT, GR_RELOADF_ACL_MSG, GR_VERSION); ++ error = -EPERM; ++ } ++ break; ++ case GR_SEGVMOD: ++ if (unlikely(!(gr_status & GR_READY))) { ++ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SEGVMODI_ACL_MSG); ++ error = -EAGAIN; ++ break; ++ } ++ ++ if (!(chkpw(gr_usermode, gr_system_salt, gr_system_sum))) { ++ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SEGVMODS_ACL_MSG); ++ if (gr_usermode->segv_device && gr_usermode->segv_inode) { ++ struct acl_subject_label *segvacl; ++ segvacl = ++ lookup_acl_subj_label(gr_usermode->segv_inode, ++ gr_usermode->segv_device, ++ current->role); ++ if (segvacl) { ++ segvacl->crashes = 0; ++ segvacl->expires = 0; ++ } ++ } else if (gr_find_uid(gr_usermode->segv_uid) >= 0) { ++ gr_remove_uid(gr_usermode->segv_uid); ++ } ++ } else { ++ gr_log_noargs(GR_DONT_AUDIT, GR_SEGVMODF_ACL_MSG); ++ error = -EPERM; ++ } ++ break; ++ case GR_SPROLE: ++ case GR_SPROLEPAM: ++ if (unlikely(!(gr_status & GR_READY))) { ++ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_SPROLEI_ACL_MSG); ++ error = -EAGAIN; ++ break; ++ } ++ ++ if (current->role->expires && time_after_eq(get_seconds(), current->role->expires)) { ++ current->role->expires = 0; ++ current->role->auth_attempts = 0; ++ } ++ ++ if (current->role->auth_attempts >= CONFIG_GRKERNSEC_ACL_MAXTRIES && ++ time_after(current->role->expires, get_seconds())) { ++ error = -EBUSY; ++ goto out; ++ } ++ ++ if (lookup_special_role_auth ++ (gr_usermode->mode, gr_usermode->sp_role, &sprole_salt, &sprole_sum) ++ && ((!sprole_salt && !sprole_sum) ++ || !(chkpw(gr_usermode, sprole_salt, sprole_sum)))) { ++ char *p = ""; ++ assign_special_role(gr_usermode->sp_role); ++ read_lock(&tasklist_lock); ++ if (current->parent) ++ p = current->parent->role->rolename; ++ read_unlock(&tasklist_lock); 
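
The special-role cases here rate-limit authentication the same way the device-wide gr_auth_attempts/gr_auth_expires pair does: the first failure arms a timeout window, and once the maximum number of failures accumulates, further attempts are refused until the window lapses. A userspace sketch of that lockout logic; the struct, constants, and function names are ours, standing in for CONFIG_GRKERNSEC_ACL_MAXTRIES/CONFIG_GRKERNSEC_ACL_TIMEOUT:

#include <stdio.h>
#include <time.h>

#define MAXTRIES 3
#define TIMEOUT  30	/* seconds */

struct auth_state {
	unsigned int attempts;
	time_t expires;
};

static int auth_blocked(struct auth_state *st)
{
	time_t now = time(NULL);

	if (st->expires && now >= st->expires) {
		/* window elapsed: forgive earlier failures */
		st->attempts = 0;
		st->expires = 0;
	}
	return st->attempts >= MAXTRIES && now < st->expires;
}

static void auth_failed(struct auth_state *st)
{
	/* first failure in a window arms the expiry timer */
	if (!st->attempts++)
		st->expires = time(NULL) + TIMEOUT;
}

int main(void)
{
	struct auth_state st = { 0, 0 };
	int i;

	for (i = 0; i < 4; i++) {
		printf("blocked=%d\n", auth_blocked(&st));	/* 0,0,0,1 */
		auth_failed(&st);
	}
	return 0;
}
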
++ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_SPROLES_ACL_MSG, ++ p, acl_sp_role_value); ++ } else { ++ gr_log_str(GR_DONT_AUDIT, GR_SPROLEF_ACL_MSG, gr_usermode->sp_role); ++ error = -EPERM; ++ if(!(current->role->auth_attempts++)) ++ current->role->expires = get_seconds() + CONFIG_GRKERNSEC_ACL_TIMEOUT; ++ ++ goto out; ++ } ++ break; ++ case GR_UNSPROLE: ++ if (unlikely(!(gr_status & GR_READY))) { ++ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_UNSPROLEI_ACL_MSG); ++ error = -EAGAIN; ++ break; ++ } ++ ++ if (current->role->roletype & GR_ROLE_SPECIAL) { ++ char *p = ""; ++ int i = 0; ++ ++ read_lock(&tasklist_lock); ++ if (current->parent) { ++ p = current->parent->role->rolename; ++ i = current->parent->acl_role_id; ++ } ++ read_unlock(&tasklist_lock); ++ ++ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_UNSPROLES_ACL_MSG, p, i); ++ gr_set_acls(1); ++ } else { ++ error = -EPERM; ++ goto out; ++ } ++ break; ++ default: ++ gr_log_int(GR_DONT_AUDIT, GR_INVMODE_ACL_MSG, gr_usermode->mode); ++ error = -EINVAL; ++ break; ++ } ++ ++ if (error != -EPERM) ++ goto out; ++ ++ if(!(gr_auth_attempts++)) ++ gr_auth_expires = get_seconds() + CONFIG_GRKERNSEC_ACL_TIMEOUT; ++ ++ out: ++ up(&gr_dev_sem); ++ return error; ++} ++ ++int ++gr_set_acls(const int type) ++{ ++ struct acl_object_label *obj; ++ struct task_struct *task, *task2; ++ struct file *filp; ++ struct acl_role_label *role = current->role; ++ __u16 acl_role_id = current->acl_role_id; ++ const struct cred *cred; ++ char *tmpname; ++ struct name_entry *nmatch; ++ struct acl_subject_label *tmpsubj; ++ ++ rcu_read_lock(); ++ read_lock(&tasklist_lock); ++ read_lock(&grsec_exec_file_lock); ++ do_each_thread(task2, task) { ++ /* check to see if we're called from the exit handler, ++ if so, only replace ACLs that have inherited the admin ++ ACL */ ++ ++ if (type && (task->role != role || ++ task->acl_role_id != acl_role_id)) ++ continue; ++ ++ task->acl_role_id = 0; ++ task->acl_sp_role = 0; ++ ++ if ((filp = task->exec_file)) { ++ cred = __task_cred(task); ++ task->role = lookup_acl_role_label(task, cred->uid, cred->gid); ++ ++ /* the following is to apply the correct subject ++ on binaries running when the RBAC system ++ is enabled, when the binaries have been ++ replaced or deleted since their execution ++ ----- ++ when the RBAC system starts, the inode/dev ++ from exec_file will be one the RBAC system ++ is unaware of. It only knows the inode/dev ++ of the present file on disk, or the absence ++ of it. 
++ */ ++ preempt_disable(); ++ tmpname = gr_to_filename_rbac(filp->f_path.dentry, filp->f_path.mnt); ++ ++ nmatch = lookup_name_entry(tmpname); ++ preempt_enable(); ++ tmpsubj = NULL; ++ if (nmatch) { ++ if (nmatch->deleted) ++ tmpsubj = lookup_acl_subj_label_deleted(nmatch->inode, nmatch->device, task->role); ++ else ++ tmpsubj = lookup_acl_subj_label(nmatch->inode, nmatch->device, task->role); ++ if (tmpsubj != NULL) ++ task->acl = tmpsubj; ++ } ++ if (tmpsubj == NULL) ++ task->acl = chk_subj_label(filp->f_path.dentry, filp->f_path.mnt, ++ task->role); ++ if (task->acl) { ++ struct acl_subject_label *curr; ++ curr = task->acl; ++ ++ task->is_writable = 0; ++ /* ignore additional mmap checks for processes that are writable ++ by the default ACL */ ++ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ task->is_writable = 1; ++ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, task->role->root_label); ++ if (unlikely(obj->mode & GR_WRITE)) ++ task->is_writable = 1; ++ ++ gr_set_proc_res(task); ++ ++#ifdef CONFIG_GRKERNSEC_ACL_DEBUG ++ printk(KERN_ALERT "gr_set_acls for (%s:%d): role:%s, subject:%s\n", task->comm, task->pid, task->role->rolename, task->acl->filename); ++#endif ++ } else { ++ read_unlock(&grsec_exec_file_lock); ++ read_unlock(&tasklist_lock); ++ rcu_read_unlock(); ++ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_DEFACL_MSG, task->comm, task->pid); ++ return 1; ++ } ++ } else { ++ // it's a kernel process ++ task->role = kernel_role; ++ task->acl = kernel_role->root_label; ++#ifdef CONFIG_GRKERNSEC_ACL_HIDEKERN ++ task->acl->mode &= ~GR_PROCFIND; ++#endif ++ } ++ } while_each_thread(task2, task); ++ read_unlock(&grsec_exec_file_lock); ++ read_unlock(&tasklist_lock); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++void ++gr_learn_resource(const struct task_struct *task, ++ const int res, const unsigned long wanted, const int gt) ++{ ++ struct acl_subject_label *acl; ++ const struct cred *cred; ++ ++ if (unlikely((gr_status & GR_READY) && ++ task->acl && (task->acl->mode & (GR_LEARN | GR_INHERITLEARN)))) ++ goto skip_reslog; ++ ++#ifdef CONFIG_GRKERNSEC_RESLOG ++ gr_log_resource(task, res, wanted, gt); ++#endif ++ skip_reslog: ++ ++ if (unlikely(!(gr_status & GR_READY) || !wanted || res >= GR_NLIMITS)) ++ return; ++ ++ acl = task->acl; ++ ++ if (likely(!acl || !(acl->mode & (GR_LEARN | GR_INHERITLEARN)) || ++ !(acl->resmask & (1 << (unsigned short) res)))) ++ return; ++ ++ if (wanted >= acl->res[res].rlim_cur) { ++ unsigned long res_add; ++ ++ res_add = wanted; ++ switch (res) { ++ case RLIMIT_CPU: ++ res_add += GR_RLIM_CPU_BUMP; ++ break; ++ case RLIMIT_FSIZE: ++ res_add += GR_RLIM_FSIZE_BUMP; ++ break; ++ case RLIMIT_DATA: ++ res_add += GR_RLIM_DATA_BUMP; ++ break; ++ case RLIMIT_STACK: ++ res_add += GR_RLIM_STACK_BUMP; ++ break; ++ case RLIMIT_CORE: ++ res_add += GR_RLIM_CORE_BUMP; ++ break; ++ case RLIMIT_RSS: ++ res_add += GR_RLIM_RSS_BUMP; ++ break; ++ case RLIMIT_NPROC: ++ res_add += GR_RLIM_NPROC_BUMP; ++ break; ++ case RLIMIT_NOFILE: ++ res_add += GR_RLIM_NOFILE_BUMP; ++ break; ++ case RLIMIT_MEMLOCK: ++ res_add += GR_RLIM_MEMLOCK_BUMP; ++ break; ++ case RLIMIT_AS: ++ res_add += GR_RLIM_AS_BUMP; ++ break; ++ case RLIMIT_LOCKS: ++ res_add += GR_RLIM_LOCKS_BUMP; ++ break; ++ case RLIMIT_SIGPENDING: ++ res_add += GR_RLIM_SIGPENDING_BUMP; ++ break; ++ case RLIMIT_MSGQUEUE: ++ res_add += GR_RLIM_MSGQUEUE_BUMP; ++ break; ++ case RLIMIT_NICE: ++ res_add += GR_RLIM_NICE_BUMP; ++ break; ++ case 
RLIMIT_RTPRIO: ++ res_add += GR_RLIM_RTPRIO_BUMP; ++ break; ++ case RLIMIT_RTTIME: ++ res_add += GR_RLIM_RTTIME_BUMP; ++ break; ++ } ++ ++ acl->res[res].rlim_cur = res_add; ++ ++ if (wanted > acl->res[res].rlim_max) ++ acl->res[res].rlim_max = res_add; ++ ++ /* only log the subject filename, since resource logging is supported for ++ single-subject learning only */ ++ rcu_read_lock(); ++ cred = __task_cred(task); ++ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, ++ task->role->roletype, cred->uid, cred->gid, acl->filename, ++ acl->filename, acl->res[res].rlim_cur, acl->res[res].rlim_max, ++ "", (unsigned long) res, &task->signal->curr_ip); ++ rcu_read_unlock(); ++ } ++ ++ return; ++} ++ ++#if defined(CONFIG_PAX_HAVE_ACL_FLAGS) && (defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR)) ++void ++pax_set_initial_flags(struct linux_binprm *bprm) ++{ ++ struct task_struct *task = current; ++ struct acl_subject_label *proc; ++ unsigned long flags; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return; ++ ++ flags = pax_get_flags(task); ++ ++ proc = task->acl; ++ ++ if (proc->pax_flags & GR_PAX_DISABLE_PAGEEXEC) ++ flags &= ~MF_PAX_PAGEEXEC; ++ if (proc->pax_flags & GR_PAX_DISABLE_SEGMEXEC) ++ flags &= ~MF_PAX_SEGMEXEC; ++ if (proc->pax_flags & GR_PAX_DISABLE_RANDMMAP) ++ flags &= ~MF_PAX_RANDMMAP; ++ if (proc->pax_flags & GR_PAX_DISABLE_EMUTRAMP) ++ flags &= ~MF_PAX_EMUTRAMP; ++ if (proc->pax_flags & GR_PAX_DISABLE_MPROTECT) ++ flags &= ~MF_PAX_MPROTECT; ++ ++ if (proc->pax_flags & GR_PAX_ENABLE_PAGEEXEC) ++ flags |= MF_PAX_PAGEEXEC; ++ if (proc->pax_flags & GR_PAX_ENABLE_SEGMEXEC) ++ flags |= MF_PAX_SEGMEXEC; ++ if (proc->pax_flags & GR_PAX_ENABLE_RANDMMAP) ++ flags |= MF_PAX_RANDMMAP; ++ if (proc->pax_flags & GR_PAX_ENABLE_EMUTRAMP) ++ flags |= MF_PAX_EMUTRAMP; ++ if (proc->pax_flags & GR_PAX_ENABLE_MPROTECT) ++ flags |= MF_PAX_MPROTECT; ++ ++ pax_set_flags(task, flags); ++ ++ return; ++} ++#endif ++ ++#ifdef CONFIG_SYSCTL ++/* Eric Biederman likes breaking userland ABI and every inode-based security ++ system to save 35kb of memory */ ++ ++/* we modify the passed in filename, but adjust it back before returning */ ++static struct acl_object_label *gr_lookup_by_name(char *name, unsigned int len) ++{ ++ struct name_entry *nmatch; ++ char *p, *lastp = NULL; ++ struct acl_object_label *obj = NULL, *tmp; ++ struct acl_subject_label *tmpsubj; ++ char c = '\0'; ++ ++ read_lock(&gr_inode_lock); ++ ++ p = name + len - 1; ++ do { ++ nmatch = lookup_name_entry(name); ++ if (lastp != NULL) ++ *lastp = c; ++ ++ if (nmatch == NULL) ++ goto next_component; ++ tmpsubj = current->acl; ++ do { ++ obj = lookup_acl_obj_label(nmatch->inode, nmatch->device, tmpsubj); ++ if (obj != NULL) { ++ tmp = obj->globbed; ++ while (tmp) { ++ if (!glob_match(tmp->filename, name)) { ++ obj = tmp; ++ goto found_obj; ++ } ++ tmp = tmp->next; ++ } ++ goto found_obj; ++ } ++ } while ((tmpsubj = tmpsubj->parent_subject)); ++next_component: ++ /* end case */ ++ if (p == name) ++ break; ++ ++ while (*p != '/') ++ p--; ++ if (p == name) ++ lastp = p + 1; ++ else { ++ lastp = p; ++ p--; ++ } ++ c = *lastp; ++ *lastp = '\0'; ++ } while (1); ++found_obj: ++ read_unlock(&gr_inode_lock); ++ /* obj returned will always be non-null */ ++ return obj; ++} ++ ++/* returns 0 when allowing, non-zero on error ++ op of 0 is used for readdir, so we don't log the names of hidden files ++*/ ++__u32 ++gr_handle_sysctl(const struct ctl_table *table, const int op) ++{ ++ ctl_table *tmp; ++ const char *proc_sys = "/proc/sys"; ++ char *path; ++ 
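/* [Editor's note: annotation added for this summary, not part of the patch.
 * The code below rebuilds the full "/proc/sys/..." pathname from the
 * ctl_table parent chain: one pass sums component lengths and counts the
 * depth, then a second pass emits the components root-first by re-walking
 * the chain once per depth level. The same two-pass idea on a hypothetical
 * stand-alone structure: */
#include <string.h>

struct node {
	const char *name;
	struct node *parent;	/* NULL at the root */
};

/* Writes "/a/b/c" into buf; returns the length, or -1 if it cannot fit. */
static int build_path(const struct node *leaf, char *buf, unsigned long size)
{
	unsigned long len = 0, depth = 0, pos = 0, i;
	const struct node *n;

	for (n = leaf; n; n = n->parent) {	/* pass 1: length and depth */
		len += strlen(n->name) + 1;	/* +1 for the '/' separator */
		depth++;
	}
	if (len + 1 > size)			/* +1 for the trailing NUL */
		return -1;
	for (; depth > 0; depth--) {		/* pass 2: root-first output */
		buf[pos++] = '/';
		for (i = 1, n = leaf; n; n = n->parent, i++)
			if (i == depth) {	/* i counts up from the leaf */
				memcpy(buf + pos, n->name, strlen(n->name));
				pos += strlen(n->name);
			}
	}
	buf[pos] = '\0';
	return (int)pos;
}
/* [End of editor's note.] */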
struct acl_object_label *obj; ++ unsigned short len = 0, pos = 0, depth = 0, i; ++ __u32 err = 0; ++ __u32 mode = 0; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++ ++ /* for now, ignore operations on non-sysctl entries if it's not a ++ readdir*/ ++ if (table->child != NULL && op != 0) ++ return 0; ++ ++ mode |= GR_FIND; ++ /* it's only a read if it's an entry, read on dirs is for readdir */ ++ if (op & MAY_READ) ++ mode |= GR_READ; ++ if (op & MAY_WRITE) ++ mode |= GR_WRITE; ++ ++ preempt_disable(); ++ ++ path = per_cpu_ptr(gr_shared_page[0], smp_processor_id()); ++ ++ /* it's only a read/write if it's an actual entry, not a dir ++ (which are opened for readdir) ++ */ ++ ++ /* convert the requested sysctl entry into a pathname */ ++ ++ for (tmp = (ctl_table *)table; tmp != NULL; tmp = tmp->parent) { ++ len += strlen(tmp->procname); ++ len++; ++ depth++; ++ } ++ ++ if ((len + depth + strlen(proc_sys) + 1) > PAGE_SIZE) { ++ /* deny */ ++ goto out; ++ } ++ ++ memset(path, 0, PAGE_SIZE); ++ ++ memcpy(path, proc_sys, strlen(proc_sys)); ++ ++ pos += strlen(proc_sys); ++ ++ for (; depth > 0; depth--) { ++ path[pos] = '/'; ++ pos++; ++ for (i = 1, tmp = (ctl_table *)table; tmp != NULL; tmp = tmp->parent) { ++ if (depth == i) { ++ memcpy(path + pos, tmp->procname, ++ strlen(tmp->procname)); ++ pos += strlen(tmp->procname); ++ } ++ i++; ++ } ++ } ++ ++ obj = gr_lookup_by_name(path, pos); ++ err = obj->mode & (mode | to_gr_audit(mode) | GR_SUPPRESS); ++ ++ if (unlikely((current->acl->mode & (GR_LEARN | GR_INHERITLEARN)) && ++ ((err & mode) != mode))) { ++ __u32 new_mode = mode; ++ ++ new_mode &= ~(GR_AUDITS | GR_SUPPRESS); ++ ++ err = 0; ++ gr_log_learn_sysctl(path, new_mode); ++ } else if (!(err & GR_FIND) && !(err & GR_SUPPRESS) && op != 0) { ++ gr_log_hidden_sysctl(GR_DONT_AUDIT, GR_HIDDEN_ACL_MSG, path); ++ err = -ENOENT; ++ } else if (!(err & GR_FIND)) { ++ err = -ENOENT; ++ } else if (((err & mode) & ~GR_FIND) != (mode & ~GR_FIND) && !(err & GR_SUPPRESS)) { ++ gr_log_str4(GR_DONT_AUDIT, GR_SYSCTL_ACL_MSG, "denied", ++ path, (mode & GR_READ) ? " reading" : "", ++ (mode & GR_WRITE) ? " writing" : ""); ++ err = -EACCES; ++ } else if ((err & mode) != mode) { ++ err = -EACCES; ++ } else if ((((err & mode) & ~GR_FIND) == (mode & ~GR_FIND)) && (err & GR_AUDITS)) { ++ gr_log_str4(GR_DO_AUDIT, GR_SYSCTL_ACL_MSG, "successful", ++ path, (mode & GR_READ) ? " reading" : "", ++ (mode & GR_WRITE) ? 
" writing" : ""); ++ err = 0; ++ } else ++ err = 0; ++ ++ out: ++ preempt_enable(); ++ ++ return err; ++} ++#endif ++ ++int ++gr_handle_proc_ptrace(struct task_struct *task) ++{ ++ struct file *filp; ++ struct task_struct *tmp = task; ++ struct task_struct *curtemp = current; ++ __u32 retmode; ++ ++#ifndef CONFIG_GRKERNSEC_HARDEN_PTRACE ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++#endif ++ ++ read_lock(&tasklist_lock); ++ read_lock(&grsec_exec_file_lock); ++ filp = task->exec_file; ++ ++ while (tmp->pid > 0) { ++ if (tmp == curtemp) ++ break; ++ tmp = tmp->parent; ++ } ++ ++ if (!filp || (tmp->pid == 0 && ((grsec_enable_harden_ptrace && current_uid() && !(gr_status & GR_READY)) || ++ ((gr_status & GR_READY) && !(current->acl->mode & GR_RELAXPTRACE))))) { ++ read_unlock(&grsec_exec_file_lock); ++ read_unlock(&tasklist_lock); ++ return 1; ++ } ++ ++#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE ++ if (!(gr_status & GR_READY)) { ++ read_unlock(&grsec_exec_file_lock); ++ read_unlock(&tasklist_lock); ++ return 0; ++ } ++#endif ++ ++ retmode = gr_search_file(filp->f_path.dentry, GR_NOPTRACE, filp->f_path.mnt); ++ read_unlock(&grsec_exec_file_lock); ++ read_unlock(&tasklist_lock); ++ ++ if (retmode & GR_NOPTRACE) ++ return 1; ++ ++ if (!(current->acl->mode & GR_POVERRIDE) && !(current->role->roletype & GR_ROLE_GOD) ++ && (current->acl != task->acl || (current->acl != current->role->root_label ++ && current->pid != task->pid))) ++ return 1; ++ ++ return 0; ++} ++ ++int ++gr_handle_ptrace(struct task_struct *task, const long request) ++{ ++ struct task_struct *tmp = task; ++ struct task_struct *curtemp = current; ++ __u32 retmode; ++ ++#ifndef CONFIG_GRKERNSEC_HARDEN_PTRACE ++ if (unlikely(!(gr_status & GR_READY))) ++ return 0; ++#endif ++ ++ read_lock(&tasklist_lock); ++ while (tmp->pid > 0) { ++ if (tmp == curtemp) ++ break; ++ tmp = tmp->parent; ++ } ++ ++ if (tmp->pid == 0 && ((grsec_enable_harden_ptrace && current_uid() && !(gr_status & GR_READY)) || ++ ((gr_status & GR_READY) && !(current->acl->mode & GR_RELAXPTRACE)))) { ++ read_unlock(&tasklist_lock); ++ gr_log_ptrace(GR_DONT_AUDIT, GR_PTRACE_ACL_MSG, task); ++ return 1; ++ } ++ read_unlock(&tasklist_lock); ++ ++#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE ++ if (!(gr_status & GR_READY)) ++ return 0; ++#endif ++ ++ read_lock(&grsec_exec_file_lock); ++ if (unlikely(!task->exec_file)) { ++ read_unlock(&grsec_exec_file_lock); ++ return 0; ++ } ++ ++ retmode = gr_search_file(task->exec_file->f_path.dentry, GR_PTRACERD | GR_NOPTRACE, task->exec_file->f_path.mnt); ++ read_unlock(&grsec_exec_file_lock); ++ ++ if (retmode & GR_NOPTRACE) { ++ gr_log_ptrace(GR_DONT_AUDIT, GR_PTRACE_ACL_MSG, task); ++ return 1; ++ } ++ ++ if (retmode & GR_PTRACERD) { ++ switch (request) { ++ case PTRACE_POKETEXT: ++ case PTRACE_POKEDATA: ++ case PTRACE_POKEUSR: ++#if !defined(CONFIG_PPC32) && !defined(CONFIG_PPC64) && !defined(CONFIG_PARISC) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64) ++ case PTRACE_SETREGS: ++ case PTRACE_SETFPREGS: ++#endif ++#ifdef CONFIG_X86 ++ case PTRACE_SETFPXREGS: ++#endif ++#ifdef CONFIG_ALTIVEC ++ case PTRACE_SETVRREGS: ++#endif ++ return 1; ++ default: ++ return 0; ++ } ++ } else if (!(current->acl->mode & GR_POVERRIDE) && ++ !(current->role->roletype & GR_ROLE_GOD) && ++ (current->acl != task->acl)) { ++ gr_log_ptrace(GR_DONT_AUDIT, GR_PTRACE_ACL_MSG, task); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static int is_writable_mmap(const struct file *filp) ++{ ++ struct task_struct *task = current; ++ struct acl_object_label *obj, *obj2; 
++ ++ if (gr_status & GR_READY && !(task->acl->mode & GR_OVERRIDE) && ++ !task->is_writable && S_ISREG(filp->f_path.dentry->d_inode->i_mode)) { ++ obj = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, default_role->root_label); ++ obj2 = chk_obj_label(filp->f_path.dentry, filp->f_path.mnt, ++ task->role->root_label); ++ if (unlikely((obj->mode & GR_WRITE) || (obj2->mode & GR_WRITE))) { ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_WRITLIB_ACL_MSG, filp->f_path.dentry, filp->f_path.mnt); ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++int ++gr_acl_handle_mmap(const struct file *file, const unsigned long prot) ++{ ++ __u32 mode; ++ ++ if (unlikely(!file || !(prot & PROT_EXEC))) ++ return 1; ++ ++ if (is_writable_mmap(file)) ++ return 0; ++ ++ mode = ++ gr_search_file(file->f_path.dentry, ++ GR_EXEC | GR_AUDIT_EXEC | GR_SUPPRESS, ++ file->f_path.mnt); ++ ++ if (!gr_tpe_allow(file)) ++ return 0; ++ ++ if (unlikely(!(mode & GR_EXEC) && !(mode & GR_SUPPRESS))) { ++ gr_log_fs_rbac_generic(GR_DONT_AUDIT, GR_MMAP_ACL_MSG, file->f_path.dentry, file->f_path.mnt); ++ return 0; ++ } else if (unlikely(!(mode & GR_EXEC))) { ++ return 0; ++ } else if (unlikely(mode & GR_EXEC && mode & GR_AUDIT_EXEC)) { ++ gr_log_fs_rbac_generic(GR_DO_AUDIT, GR_MMAP_ACL_MSG, file->f_path.dentry, file->f_path.mnt); ++ return 1; ++ } ++ ++ return 1; ++} ++ ++int ++gr_acl_handle_mprotect(const struct file *file, const unsigned long prot) ++{ ++ __u32 mode; ++ ++ if (unlikely(!file || !(prot & PROT_EXEC))) ++ return 1; ++ ++ if (is_writable_mmap(file)) ++ return 0; ++ ++ mode = ++ gr_search_file(file->f_path.dentry, ++ GR_EXEC | GR_AUDIT_EXEC | GR_SUPPRESS, ++ file->f_path.mnt); ++ ++ if (!gr_tpe_allow(file)) ++ return 0; ++ ++ if (unlikely(!(mode & GR_EXEC) && !(mode & GR_SUPPRESS))) { ++ gr_log_fs_rbac_generic(GR_DONT_AUDIT, GR_MPROTECT_ACL_MSG, file->f_path.dentry, file->f_path.mnt); ++ return 0; ++ } else if (unlikely(!(mode & GR_EXEC))) { ++ return 0; ++ } else if (unlikely(mode & GR_EXEC && mode & GR_AUDIT_EXEC)) { ++ gr_log_fs_rbac_generic(GR_DO_AUDIT, GR_MPROTECT_ACL_MSG, file->f_path.dentry, file->f_path.mnt); ++ return 1; ++ } ++ ++ return 1; ++} ++ ++void ++gr_acl_handle_psacct(struct task_struct *task, const long code) ++{ ++ unsigned long runtime; ++ unsigned long cputime; ++ unsigned int wday, cday; ++ __u8 whr, chr; ++ __u8 wmin, cmin; ++ __u8 wsec, csec; ++ struct timespec timeval; ++ ++ if (unlikely(!(gr_status & GR_READY) || !task->acl || ++ !(task->acl->mode & GR_PROCACCT))) ++ return; ++ ++ do_posix_clock_monotonic_gettime(&timeval); ++ runtime = timeval.tv_sec - task->start_time.tv_sec; ++ wday = runtime / (3600 * 24); ++ runtime -= wday * (3600 * 24); ++ whr = runtime / 3600; ++ runtime -= whr * 3600; ++ wmin = runtime / 60; ++ runtime -= wmin * 60; ++ wsec = runtime; ++ ++ cputime = (task->utime + task->stime) / HZ; ++ cday = cputime / (3600 * 24); ++ cputime -= cday * (3600 * 24); ++ chr = cputime / 3600; ++ cputime -= chr * 3600; ++ cmin = cputime / 60; ++ cputime -= cmin * 60; ++ csec = cputime; ++ ++ gr_log_procacct(GR_DO_AUDIT, GR_ACL_PROCACCT_MSG, task, wday, whr, wmin, wsec, cday, chr, cmin, csec, code); ++ ++ return; ++} ++ ++void gr_set_kernel_label(struct task_struct *task) ++{ ++ if (gr_status & GR_READY) { ++ task->role = kernel_role; ++ task->acl = kernel_role->root_label; ++ } ++ return; ++} ++ ++#ifdef CONFIG_TASKSTATS ++int gr_is_taskstats_denied(int pid) ++{ ++ struct task_struct *task; ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ const struct cred 
*cred; ++#endif ++ int ret = 0; ++ ++ /* restrict taskstats viewing to un-chrooted root users ++ who have the 'view' subject flag if the RBAC system is enabled ++ */ ++ ++ read_lock(&tasklist_lock); ++ task = find_task_by_vpid(pid); ++ if (task) { ++ gr_fs_read_lock(task); ++#ifdef CONFIG_GRKERNSEC_CHROOT ++ if (proc_is_chrooted(task)) ++ ret = -EACCES; ++#endif ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ cred = __task_cred(task); ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ if (cred->uid != 0) ++ ret = -EACCES; ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ if (cred->uid != 0 && !groups_search(cred->group_info, CONFIG_GRKERNSEC_PROC_GID)) ++ ret = -EACCES; ++#endif ++#endif ++ if (gr_status & GR_READY) { ++ if (!(task->acl->mode & GR_VIEW)) ++ ret = -EACCES; ++ } ++ ++ gr_fs_read_unlock(task); ++ } else ++ ret = -ENOENT; ++ ++ read_unlock(&tasklist_lock); ++ ++ return ret; ++} ++#endif ++ ++int gr_acl_handle_filldir(const struct file *file, const char *name, const unsigned int namelen, const ino_t ino) ++{ ++ struct task_struct *task = current; ++ struct dentry *dentry = file->f_path.dentry; ++ struct vfsmount *mnt = file->f_path.mnt; ++ struct acl_object_label *obj, *tmp; ++ struct acl_subject_label *subj; ++ unsigned int bufsize; ++ int is_not_root; ++ char *path; ++ ++ if (unlikely(!(gr_status & GR_READY))) ++ return 1; ++ ++ if (task->acl->mode & (GR_LEARN | GR_INHERITLEARN)) ++ return 1; ++ ++ /* ignore Eric Biederman */ ++ if (IS_PRIVATE(dentry->d_inode)) ++ return 1; ++ ++ subj = task->acl; ++ do { ++ obj = lookup_acl_obj_label(ino, dentry->d_inode->i_sb->s_dev, subj); ++ if (obj != NULL) ++ return (obj->mode & GR_FIND) ? 1 : 0; ++ } while ((subj = subj->parent_subject)); ++ ++ /* this is purely an optimization since we're looking for an object ++ for the directory we're doing a readdir on ++ if it's possible for any globbed object to match the entry we're ++ filling into the directory, then the object we find here will be ++ an anchor point with attached globbed objects ++ */ ++ obj = chk_obj_label_noglob(dentry, mnt, task->acl); ++ if (obj->globbed == NULL) ++ return (obj->mode & GR_FIND) ? 1 : 0; ++ ++ is_not_root = ((obj->filename[0] == '/') && ++ (obj->filename[1] == '\0')) ? 0 : 1; ++ bufsize = PAGE_SIZE - namelen - is_not_root; ++ ++ /* check bufsize > PAGE_SIZE || bufsize == 0 */ ++ if (unlikely((bufsize - 1) > (PAGE_SIZE - 1))) ++ return 1; ++ ++ preempt_disable(); ++ path = d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0], smp_processor_id()), ++ bufsize); ++ ++ bufsize = strlen(path); ++ ++ /* if base is "/", don't append an additional slash */ ++ if (is_not_root) ++ *(path + bufsize) = '/'; ++ memcpy(path + bufsize + is_not_root, name, namelen); ++ *(path + bufsize + namelen + is_not_root) = '\0'; ++ ++ tmp = obj->globbed; ++ while (tmp) { ++ if (!glob_match(tmp->filename, path)) { ++ preempt_enable(); ++ return (tmp->mode & GR_FIND) ? 1 : 0; ++ } ++ tmp = tmp->next; ++ } ++ preempt_enable(); ++ return (obj->mode & GR_FIND) ? 
1 : 0; ++} ++ ++EXPORT_SYMBOL(gr_learn_resource); ++EXPORT_SYMBOL(gr_set_kernel_label); ++#ifdef CONFIG_SECURITY ++EXPORT_SYMBOL(gr_check_user_change); ++EXPORT_SYMBOL(gr_check_group_change); ++#endif ++ +diff -urNp linux-2.6.33.1/grsecurity/gracl_cap.c linux-2.6.33.1/grsecurity/gracl_cap.c +--- linux-2.6.33.1/grsecurity/gracl_cap.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_cap.c 2010-03-20 16:58:41.888938450 -0400 +@@ -0,0 +1,131 @@ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/gracl.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++static const char *captab_log[] = { ++ "CAP_CHOWN", ++ "CAP_DAC_OVERRIDE", ++ "CAP_DAC_READ_SEARCH", ++ "CAP_FOWNER", ++ "CAP_FSETID", ++ "CAP_KILL", ++ "CAP_SETGID", ++ "CAP_SETUID", ++ "CAP_SETPCAP", ++ "CAP_LINUX_IMMUTABLE", ++ "CAP_NET_BIND_SERVICE", ++ "CAP_NET_BROADCAST", ++ "CAP_NET_ADMIN", ++ "CAP_NET_RAW", ++ "CAP_IPC_LOCK", ++ "CAP_IPC_OWNER", ++ "CAP_SYS_MODULE", ++ "CAP_SYS_RAWIO", ++ "CAP_SYS_CHROOT", ++ "CAP_SYS_PTRACE", ++ "CAP_SYS_PACCT", ++ "CAP_SYS_ADMIN", ++ "CAP_SYS_BOOT", ++ "CAP_SYS_NICE", ++ "CAP_SYS_RESOURCE", ++ "CAP_SYS_TIME", ++ "CAP_SYS_TTY_CONFIG", ++ "CAP_MKNOD", ++ "CAP_LEASE", ++ "CAP_AUDIT_WRITE", ++ "CAP_AUDIT_CONTROL", ++ "CAP_SETFCAP", ++ "CAP_MAC_OVERRIDE", ++ "CAP_MAC_ADMIN" ++}; ++ ++EXPORT_SYMBOL(gr_is_capable); ++EXPORT_SYMBOL(gr_is_capable_nolog); ++ ++int ++gr_is_capable(const int cap) ++{ ++ struct task_struct *task = current; ++ const struct cred *cred = current_cred(); ++ struct acl_subject_label *curracl; ++ kernel_cap_t cap_drop = __cap_empty_set, cap_mask = __cap_empty_set; ++ ++ if (!gr_acl_is_enabled()) ++ return 1; ++ ++ curracl = task->acl; ++ ++ cap_drop = curracl->cap_lower; ++ cap_mask = curracl->cap_mask; ++ ++ while ((curracl = curracl->parent_subject)) { ++ /* if the cap isn't specified in the current computed mask but is specified in the ++ current level subject, and is lowered in the current level subject, then add ++ it to the set of dropped capabilities ++ otherwise, add the current level subject's mask to the current computed mask ++ */ ++ if (!cap_raised(cap_mask, cap) && cap_raised(curracl->cap_mask, cap)) { ++ cap_raise(cap_mask, cap); ++ if (cap_raised(curracl->cap_lower, cap)) ++ cap_raise(cap_drop, cap); ++ } ++ } ++ ++ if (!cap_raised(cap_drop, cap)) ++ return 1; ++ ++ curracl = task->acl; ++ ++ if ((curracl->mode & (GR_LEARN | GR_INHERITLEARN)) ++ && cap_raised(cred->cap_effective, cap)) { ++ security_learn(GR_LEARN_AUDIT_MSG, task->role->rolename, ++ task->role->roletype, cred->uid, ++ cred->gid, task->exec_file ? 
++ gr_to_filename(task->exec_file->f_path.dentry, ++ task->exec_file->f_path.mnt) : curracl->filename, ++ curracl->filename, 0UL, ++ 0UL, "", (unsigned long) cap, &task->signal->curr_ip); ++ return 1; ++ } ++ ++ if ((cap >= 0) && (cap < (sizeof(captab_log)/sizeof(captab_log[0]))) && cap_raised(cred->cap_effective, cap)) ++ gr_log_cap(GR_DONT_AUDIT, GR_CAP_ACL_MSG, task, captab_log[cap]); ++ return 0; ++} ++ ++int ++gr_is_capable_nolog(const int cap) ++{ ++ struct acl_subject_label *curracl; ++ kernel_cap_t cap_drop = __cap_empty_set, cap_mask = __cap_empty_set; ++ ++ if (!gr_acl_is_enabled()) ++ return 1; ++ ++ curracl = current->acl; ++ ++ cap_drop = curracl->cap_lower; ++ cap_mask = curracl->cap_mask; ++ ++ while ((curracl = curracl->parent_subject)) { ++ /* if the cap isn't specified in the current computed mask but is specified in the ++ current level subject, and is lowered in the current level subject, then add ++ it to the set of dropped capabilities ++ otherwise, add the current level subject's mask to the current computed mask ++ */ ++ if (!cap_raised(cap_mask, cap) && cap_raised(curracl->cap_mask, cap)) { ++ cap_raise(cap_mask, cap); ++ if (cap_raised(curracl->cap_lower, cap)) ++ cap_raise(cap_drop, cap); ++ } ++ } ++ ++ if (!cap_raised(cap_drop, cap)) ++ return 1; ++ ++ return 0; ++} ++ +diff -urNp linux-2.6.33.1/grsecurity/gracl_fs.c linux-2.6.33.1/grsecurity/gracl_fs.c +--- linux-2.6.33.1/grsecurity/gracl_fs.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_fs.c 2010-03-20 16:58:41.888938450 -0400 +@@ -0,0 +1,424 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/types.h> ++#include <linux/fs.h> ++#include <linux/file.h> ++#include <linux/stat.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++#include <linux/gracl.h> ++ ++__u32 ++gr_acl_handle_hidden_file(const struct dentry * dentry, ++ const struct vfsmount * mnt) ++{ ++ __u32 mode; ++ ++ if (unlikely(!dentry->d_inode)) ++ return GR_FIND; ++ ++ mode = ++ gr_search_file(dentry, GR_FIND | GR_AUDIT_FIND | GR_SUPPRESS, mnt); ++ ++ if (unlikely(mode & GR_FIND && mode & GR_AUDIT_FIND)) { ++ gr_log_fs_rbac_generic(GR_DO_AUDIT, GR_HIDDEN_ACL_MSG, dentry, mnt); ++ return mode; ++ } else if (unlikely(!(mode & GR_FIND) && !(mode & GR_SUPPRESS))) { ++ gr_log_fs_rbac_generic(GR_DONT_AUDIT, GR_HIDDEN_ACL_MSG, dentry, mnt); ++ return 0; ++ } else if (unlikely(!(mode & GR_FIND))) ++ return 0; ++ ++ return GR_FIND; ++} ++ ++__u32 ++gr_acl_handle_open(const struct dentry * dentry, const struct vfsmount * mnt, ++ const int fmode) ++{ ++ __u32 reqmode = GR_FIND; ++ __u32 mode; ++ ++ if (unlikely(!dentry->d_inode)) ++ return reqmode; ++ ++ if (unlikely(fmode & O_APPEND)) ++ reqmode |= GR_APPEND; ++ else if (unlikely(fmode & FMODE_WRITE)) ++ reqmode |= GR_WRITE; ++ if (likely((fmode & FMODE_READ) && !(fmode & O_DIRECTORY))) ++ reqmode |= GR_READ; ++ if ((fmode & FMODE_GREXEC) && (fmode & FMODE_EXEC)) ++ reqmode &= ~GR_READ; ++ mode = ++ gr_search_file(dentry, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS, ++ mnt); ++ ++ if (unlikely(((mode & reqmode) == reqmode) && mode & GR_AUDITS)) { ++ gr_log_fs_rbac_mode2(GR_DO_AUDIT, GR_OPEN_ACL_MSG, dentry, mnt, ++ reqmode & GR_READ ? " reading" : "", ++ reqmode & GR_WRITE ? " writing" : reqmode & ++ GR_APPEND ? " appending" : ""); ++ return reqmode; ++ } else ++ if (unlikely((mode & reqmode) != reqmode && !(mode & GR_SUPPRESS))) ++ { ++ gr_log_fs_rbac_mode2(GR_DONT_AUDIT, GR_OPEN_ACL_MSG, dentry, mnt, ++ reqmode & GR_READ ? 
" reading" : "", ++ reqmode & GR_WRITE ? " writing" : reqmode & ++ GR_APPEND ? " appending" : ""); ++ return 0; ++ } else if (unlikely((mode & reqmode) != reqmode)) ++ return 0; ++ ++ return reqmode; ++} ++ ++__u32 ++gr_acl_handle_creat(const struct dentry * dentry, ++ const struct dentry * p_dentry, ++ const struct vfsmount * p_mnt, const int fmode, ++ const int imode) ++{ ++ __u32 reqmode = GR_WRITE | GR_CREATE; ++ __u32 mode; ++ ++ if (unlikely(fmode & O_APPEND)) ++ reqmode |= GR_APPEND; ++ if (unlikely((fmode & FMODE_READ) && !(fmode & O_DIRECTORY))) ++ reqmode |= GR_READ; ++ if (unlikely((fmode & O_CREAT) && (imode & (S_ISUID | S_ISGID)))) ++ reqmode |= GR_SETID; ++ ++ mode = ++ gr_check_create(dentry, p_dentry, p_mnt, ++ reqmode | to_gr_audit(reqmode) | GR_SUPPRESS); ++ ++ if (unlikely(((mode & reqmode) == reqmode) && mode & GR_AUDITS)) { ++ gr_log_fs_rbac_mode2(GR_DO_AUDIT, GR_CREATE_ACL_MSG, dentry, p_mnt, ++ reqmode & GR_READ ? " reading" : "", ++ reqmode & GR_WRITE ? " writing" : reqmode & ++ GR_APPEND ? " appending" : ""); ++ return reqmode; ++ } else ++ if (unlikely((mode & reqmode) != reqmode && !(mode & GR_SUPPRESS))) ++ { ++ gr_log_fs_rbac_mode2(GR_DONT_AUDIT, GR_CREATE_ACL_MSG, dentry, p_mnt, ++ reqmode & GR_READ ? " reading" : "", ++ reqmode & GR_WRITE ? " writing" : reqmode & ++ GR_APPEND ? " appending" : ""); ++ return 0; ++ } else if (unlikely((mode & reqmode) != reqmode)) ++ return 0; ++ ++ return reqmode; ++} ++ ++__u32 ++gr_acl_handle_access(const struct dentry * dentry, const struct vfsmount * mnt, ++ const int fmode) ++{ ++ __u32 mode, reqmode = GR_FIND; ++ ++ if ((fmode & S_IXOTH) && !S_ISDIR(dentry->d_inode->i_mode)) ++ reqmode |= GR_EXEC; ++ if (fmode & S_IWOTH) ++ reqmode |= GR_WRITE; ++ if (fmode & S_IROTH) ++ reqmode |= GR_READ; ++ ++ mode = ++ gr_search_file(dentry, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS, ++ mnt); ++ ++ if (unlikely(((mode & reqmode) == reqmode) && mode & GR_AUDITS)) { ++ gr_log_fs_rbac_mode3(GR_DO_AUDIT, GR_ACCESS_ACL_MSG, dentry, mnt, ++ reqmode & GR_READ ? " reading" : "", ++ reqmode & GR_WRITE ? " writing" : "", ++ reqmode & GR_EXEC ? " executing" : ""); ++ return reqmode; ++ } else ++ if (unlikely((mode & reqmode) != reqmode && !(mode & GR_SUPPRESS))) ++ { ++ gr_log_fs_rbac_mode3(GR_DONT_AUDIT, GR_ACCESS_ACL_MSG, dentry, mnt, ++ reqmode & GR_READ ? " reading" : "", ++ reqmode & GR_WRITE ? " writing" : "", ++ reqmode & GR_EXEC ? 
" executing" : ""); ++ return 0; ++ } else if (unlikely((mode & reqmode) != reqmode)) ++ return 0; ++ ++ return reqmode; ++} ++ ++static __u32 generic_fs_handler(const struct dentry *dentry, const struct vfsmount *mnt, __u32 reqmode, const char *fmt) ++{ ++ __u32 mode; ++ ++ mode = gr_search_file(dentry, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS, mnt); ++ ++ if (unlikely(((mode & (reqmode)) == (reqmode)) && mode & GR_AUDITS)) { ++ gr_log_fs_rbac_generic(GR_DO_AUDIT, fmt, dentry, mnt); ++ return mode; ++ } else if (unlikely((mode & (reqmode)) != (reqmode) && !(mode & GR_SUPPRESS))) { ++ gr_log_fs_rbac_generic(GR_DONT_AUDIT, fmt, dentry, mnt); ++ return 0; ++ } else if (unlikely((mode & (reqmode)) != (reqmode))) ++ return 0; ++ ++ return (reqmode); ++} ++ ++__u32 ++gr_acl_handle_rmdir(const struct dentry * dentry, const struct vfsmount * mnt) ++{ ++ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_DELETE , GR_RMDIR_ACL_MSG); ++} ++ ++__u32 ++gr_acl_handle_unlink(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_DELETE , GR_UNLINK_ACL_MSG); ++} ++ ++__u32 ++gr_acl_handle_truncate(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_TRUNCATE_ACL_MSG); ++} ++ ++__u32 ++gr_acl_handle_utime(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_ATIME_ACL_MSG); ++} ++ ++__u32 ++gr_acl_handle_fchmod(const struct dentry *dentry, const struct vfsmount *mnt, ++ mode_t mode) ++{ ++ if (unlikely(dentry->d_inode && S_ISSOCK(dentry->d_inode->i_mode))) ++ return 1; ++ ++ if (unlikely((mode != (mode_t)-1) && (mode & (S_ISUID | S_ISGID)))) { ++ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_SETID, ++ GR_FCHMOD_ACL_MSG); ++ } else { ++ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_FCHMOD_ACL_MSG); ++ } ++} ++ ++__u32 ++gr_acl_handle_chmod(const struct dentry *dentry, const struct vfsmount *mnt, ++ mode_t mode) ++{ ++ if (unlikely((mode != (mode_t)-1) && (mode & (S_ISUID | S_ISGID)))) { ++ return generic_fs_handler(dentry, mnt, GR_WRITE | GR_SETID, ++ GR_CHMOD_ACL_MSG); ++ } else { ++ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_CHMOD_ACL_MSG); ++ } ++} ++ ++__u32 ++gr_acl_handle_chown(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return generic_fs_handler(dentry, mnt, GR_WRITE, GR_CHOWN_ACL_MSG); ++} ++ ++__u32 ++gr_acl_handle_execve(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return generic_fs_handler(dentry, mnt, GR_EXEC, GR_EXEC_ACL_MSG); ++} ++ ++__u32 ++gr_acl_handle_unix(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return generic_fs_handler(dentry, mnt, GR_READ | GR_WRITE, ++ GR_UNIXCONNECT_ACL_MSG); ++} ++ ++/* hardlinks require at minimum create permission, ++ any additional privilege required is based on the ++ privilege of the file being linked to ++*/ ++__u32 ++gr_acl_handle_link(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt, ++ const struct dentry * old_dentry, ++ const struct vfsmount * old_mnt, const char *to) ++{ ++ __u32 mode; ++ __u32 needmode = GR_CREATE | GR_LINK; ++ __u32 needaudit = GR_AUDIT_CREATE | GR_AUDIT_LINK; ++ ++ mode = ++ gr_check_link(new_dentry, parent_dentry, parent_mnt, old_dentry, ++ old_mnt); ++ ++ if (unlikely(((mode & needmode) == needmode) && (mode & needaudit))) { ++ gr_log_fs_rbac_str(GR_DO_AUDIT, GR_LINK_ACL_MSG, old_dentry, old_mnt, 
to); ++ return mode; ++ } else if (unlikely(((mode & needmode) != needmode) && !(mode & GR_SUPPRESS))) { ++ gr_log_fs_rbac_str(GR_DONT_AUDIT, GR_LINK_ACL_MSG, old_dentry, old_mnt, to); ++ return 0; ++ } else if (unlikely((mode & needmode) != needmode)) ++ return 0; ++ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_symlink(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt, const char *from) ++{ ++ __u32 needmode = GR_WRITE | GR_CREATE; ++ __u32 mode; ++ ++ mode = ++ gr_check_create(new_dentry, parent_dentry, parent_mnt, ++ GR_CREATE | GR_AUDIT_CREATE | ++ GR_WRITE | GR_AUDIT_WRITE | GR_SUPPRESS); ++ ++ if (unlikely(mode & GR_WRITE && mode & GR_AUDITS)) { ++ gr_log_fs_str_rbac(GR_DO_AUDIT, GR_SYMLINK_ACL_MSG, from, new_dentry, parent_mnt); ++ return mode; ++ } else if (unlikely(((mode & needmode) != needmode) && !(mode & GR_SUPPRESS))) { ++ gr_log_fs_str_rbac(GR_DONT_AUDIT, GR_SYMLINK_ACL_MSG, from, new_dentry, parent_mnt); ++ return 0; ++ } else if (unlikely((mode & needmode) != needmode)) ++ return 0; ++ ++ return (GR_WRITE | GR_CREATE); ++} ++ ++static __u32 generic_fs_create_handler(const struct dentry *new_dentry, const struct dentry *parent_dentry, const struct vfsmount *parent_mnt, __u32 reqmode, const char *fmt) ++{ ++ __u32 mode; ++ ++ mode = gr_check_create(new_dentry, parent_dentry, parent_mnt, reqmode | to_gr_audit(reqmode) | GR_SUPPRESS); ++ ++ if (unlikely(((mode & (reqmode)) == (reqmode)) && mode & GR_AUDITS)) { ++ gr_log_fs_rbac_generic(GR_DO_AUDIT, fmt, new_dentry, parent_mnt); ++ return mode; ++ } else if (unlikely((mode & (reqmode)) != (reqmode) && !(mode & GR_SUPPRESS))) { ++ gr_log_fs_rbac_generic(GR_DONT_AUDIT, fmt, new_dentry, parent_mnt); ++ return 0; ++ } else if (unlikely((mode & (reqmode)) != (reqmode))) ++ return 0; ++ ++ return (reqmode); ++} ++ ++__u32 ++gr_acl_handle_mknod(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt, ++ const int mode) ++{ ++ __u32 reqmode = GR_WRITE | GR_CREATE; ++ if (unlikely(mode & (S_ISUID | S_ISGID))) ++ reqmode |= GR_SETID; ++ ++ return generic_fs_create_handler(new_dentry, parent_dentry, parent_mnt, ++ reqmode, GR_MKNOD_ACL_MSG); ++} ++ ++__u32 ++gr_acl_handle_mkdir(const struct dentry *new_dentry, ++ const struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt) ++{ ++ return generic_fs_create_handler(new_dentry, parent_dentry, parent_mnt, ++ GR_WRITE | GR_CREATE, GR_MKDIR_ACL_MSG); ++} ++ ++#define RENAME_CHECK_SUCCESS(old, new) \ ++ (((old & (GR_WRITE | GR_READ)) == (GR_WRITE | GR_READ)) && \ ++ ((new & (GR_WRITE | GR_READ)) == (GR_WRITE | GR_READ))) ++ ++int ++gr_acl_handle_rename(struct dentry *new_dentry, ++ struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt, ++ struct dentry *old_dentry, ++ struct inode *old_parent_inode, ++ struct vfsmount *old_mnt, const char *newname) ++{ ++ __u32 comp1, comp2; ++ int error = 0; ++ ++ if (unlikely(!gr_acl_is_enabled())) ++ return 0; ++ ++ if (!new_dentry->d_inode) { ++ comp1 = gr_check_create(new_dentry, parent_dentry, parent_mnt, ++ GR_READ | GR_WRITE | GR_CREATE | GR_AUDIT_READ | ++ GR_AUDIT_WRITE | GR_AUDIT_CREATE | GR_SUPPRESS); ++ comp2 = gr_search_file(old_dentry, GR_READ | GR_WRITE | ++ GR_DELETE | GR_AUDIT_DELETE | ++ GR_AUDIT_READ | GR_AUDIT_WRITE | ++ GR_SUPPRESS, old_mnt); ++ } else { ++ comp1 = gr_search_file(new_dentry, GR_READ | GR_WRITE | ++ GR_CREATE | GR_DELETE | ++ GR_AUDIT_CREATE | GR_AUDIT_DELETE | ++ GR_AUDIT_READ | 
GR_AUDIT_WRITE | ++ GR_SUPPRESS, parent_mnt); ++ comp2 = ++ gr_search_file(old_dentry, ++ GR_READ | GR_WRITE | GR_AUDIT_READ | ++ GR_DELETE | GR_AUDIT_DELETE | ++ GR_AUDIT_WRITE | GR_SUPPRESS, old_mnt); ++ } ++ ++ if (RENAME_CHECK_SUCCESS(comp1, comp2) && ++ ((comp1 & GR_AUDITS) || (comp2 & GR_AUDITS))) ++ gr_log_fs_rbac_str(GR_DO_AUDIT, GR_RENAME_ACL_MSG, old_dentry, old_mnt, newname); ++ else if (!RENAME_CHECK_SUCCESS(comp1, comp2) && !(comp1 & GR_SUPPRESS) ++ && !(comp2 & GR_SUPPRESS)) { ++ gr_log_fs_rbac_str(GR_DONT_AUDIT, GR_RENAME_ACL_MSG, old_dentry, old_mnt, newname); ++ error = -EACCES; ++ } else if (unlikely(!RENAME_CHECK_SUCCESS(comp1, comp2))) ++ error = -EACCES; ++ ++ return error; ++} ++ ++void ++gr_acl_handle_exit(void) ++{ ++ u16 id; ++ char *rolename; ++ struct file *exec_file; ++ ++ if (unlikely(current->acl_sp_role && gr_acl_is_enabled())) { ++ id = current->acl_role_id; ++ rolename = current->role->rolename; ++ gr_set_acls(1); ++ gr_log_str_int(GR_DONT_AUDIT_GOOD, GR_SPROLEL_ACL_MSG, rolename, id); ++ } ++ ++ write_lock(&grsec_exec_file_lock); ++ exec_file = current->exec_file; ++ current->exec_file = NULL; ++ write_unlock(&grsec_exec_file_lock); ++ ++ if (exec_file) ++ fput(exec_file); ++} ++ ++int ++gr_acl_handle_procpidmem(const struct task_struct *task) ++{ ++ if (unlikely(!gr_acl_is_enabled())) ++ return 0; ++ ++ if (task != current && task->acl->mode & GR_PROTPROCFD) ++ return -EACCES; ++ ++ return 0; ++} +diff -urNp linux-2.6.33.1/grsecurity/gracl_ip.c linux-2.6.33.1/grsecurity/gracl_ip.c +--- linux-2.6.33.1/grsecurity/gracl_ip.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_ip.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,339 @@ ++#include <linux/kernel.h> ++#include <asm/uaccess.h> ++#include <asm/errno.h> ++#include <net/sock.h> ++#include <linux/file.h> ++#include <linux/fs.h> ++#include <linux/net.h> ++#include <linux/in.h> ++#include <linux/skbuff.h> ++#include <linux/ip.h> ++#include <linux/udp.h> ++#include <linux/smp_lock.h> ++#include <linux/types.h> ++#include <linux/sched.h> ++#include <linux/netdevice.h> ++#include <linux/inetdevice.h> ++#include <linux/gracl.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++#define GR_BIND 0x01 ++#define GR_CONNECT 0x02 ++#define GR_INVERT 0x04 ++#define GR_BINDOVERRIDE 0x08 ++#define GR_CONNECTOVERRIDE 0x10 ++ ++static const char * gr_protocols[256] = { ++ "ip", "icmp", "igmp", "ggp", "ipencap", "st", "tcp", "cbt", ++ "egp", "igp", "bbn-rcc", "nvp", "pup", "argus", "emcon", "xnet", ++ "chaos", "udp", "mux", "dcn", "hmp", "prm", "xns-idp", "trunk-1", ++ "trunk-2", "leaf-1", "leaf-2", "rdp", "irtp", "iso-tp4", "netblt", "mfe-nsp", ++ "merit-inp", "sep", "3pc", "idpr", "xtp", "ddp", "idpr-cmtp", "tp++", ++ "il", "ipv6", "sdrp", "ipv6-route", "ipv6-frag", "idrp", "rsvp", "gre", ++ "mhrp", "bna", "ipv6-crypt", "ipv6-auth", "i-nlsp", "swipe", "narp", "mobile", ++ "tlsp", "skip", "ipv6-icmp", "ipv6-nonxt", "ipv6-opts", "unknown:61", "cftp", "unknown:63", ++ "sat-expak", "kryptolan", "rvd", "ippc", "unknown:68", "sat-mon", "visa", "ipcv", ++ "cpnx", "cphb", "wsn", "pvp", "br-sat-mon", "sun-nd", "wb-mon", "wb-expak", ++ "iso-ip", "vmtp", "secure-vmtp", "vines", "ttp", "nfsnet-igp", "dgp", "tcf", ++ "eigrp", "ospf", "sprite-rpc", "larp", "mtp", "ax.25", "ipip", "micp", ++ "scc-sp", "etherip", "encap", "unknown:99", "gmtp", "ifmp", "pnni", "pim", ++ "aris", "scps", "qnx", "a/n", "ipcomp", "snp", "compaq-peer", "ipx-in-ip", ++ "vrrp", "pgm", "unknown:114", "l2tp", "ddx", 
"iatp", "stp", "srp", ++ "uti", "smp", "sm", "ptp", "isis", "fire", "crtp", "crdup", ++ "sscopmce", "iplt", "sps", "pipe", "sctp", "fc", "unkown:134", "unknown:135", ++ "unknown:136", "unknown:137", "unknown:138", "unknown:139", "unknown:140", "unknown:141", "unknown:142", "unknown:143", ++ "unknown:144", "unknown:145", "unknown:146", "unknown:147", "unknown:148", "unknown:149", "unknown:150", "unknown:151", ++ "unknown:152", "unknown:153", "unknown:154", "unknown:155", "unknown:156", "unknown:157", "unknown:158", "unknown:159", ++ "unknown:160", "unknown:161", "unknown:162", "unknown:163", "unknown:164", "unknown:165", "unknown:166", "unknown:167", ++ "unknown:168", "unknown:169", "unknown:170", "unknown:171", "unknown:172", "unknown:173", "unknown:174", "unknown:175", ++ "unknown:176", "unknown:177", "unknown:178", "unknown:179", "unknown:180", "unknown:181", "unknown:182", "unknown:183", ++ "unknown:184", "unknown:185", "unknown:186", "unknown:187", "unknown:188", "unknown:189", "unknown:190", "unknown:191", ++ "unknown:192", "unknown:193", "unknown:194", "unknown:195", "unknown:196", "unknown:197", "unknown:198", "unknown:199", ++ "unknown:200", "unknown:201", "unknown:202", "unknown:203", "unknown:204", "unknown:205", "unknown:206", "unknown:207", ++ "unknown:208", "unknown:209", "unknown:210", "unknown:211", "unknown:212", "unknown:213", "unknown:214", "unknown:215", ++ "unknown:216", "unknown:217", "unknown:218", "unknown:219", "unknown:220", "unknown:221", "unknown:222", "unknown:223", ++ "unknown:224", "unknown:225", "unknown:226", "unknown:227", "unknown:228", "unknown:229", "unknown:230", "unknown:231", ++ "unknown:232", "unknown:233", "unknown:234", "unknown:235", "unknown:236", "unknown:237", "unknown:238", "unknown:239", ++ "unknown:240", "unknown:241", "unknown:242", "unknown:243", "unknown:244", "unknown:245", "unknown:246", "unknown:247", ++ "unknown:248", "unknown:249", "unknown:250", "unknown:251", "unknown:252", "unknown:253", "unknown:254", "unknown:255", ++ }; ++ ++static const char * gr_socktypes[11] = { ++ "unknown:0", "stream", "dgram", "raw", "rdm", "seqpacket", "unknown:6", ++ "unknown:7", "unknown:8", "unknown:9", "packet" ++ }; ++ ++const char * ++gr_proto_to_name(unsigned char proto) ++{ ++ return gr_protocols[proto]; ++} ++ ++const char * ++gr_socktype_to_name(unsigned char type) ++{ ++ return gr_socktypes[type]; ++} ++ ++int ++gr_search_socket(const int domain, const int type, const int protocol) ++{ ++ struct acl_subject_label *curr; ++ const struct cred *cred = current_cred(); ++ ++ if (unlikely(!gr_acl_is_enabled())) ++ goto exit; ++ ++ if ((domain < 0) || (type < 0) || (protocol < 0) || (domain != PF_INET) ++ || (domain >= NPROTO) || (type >= SOCK_MAX) || (protocol > 255)) ++ goto exit; // let the kernel handle it ++ ++ curr = current->acl; ++ ++ if (!curr->ips) ++ goto exit; ++ ++ if ((curr->ip_type & (1 << type)) && ++ (curr->ip_proto[protocol / 32] & (1 << (protocol % 32)))) ++ goto exit; ++ ++ if (curr->mode & (GR_LEARN | GR_INHERITLEARN)) { ++ /* we don't place acls on raw sockets , and sometimes ++ dgram/ip sockets are opened for ioctl and not ++ bind/connect, so we'll fake a bind learn log */ ++ if (type == SOCK_RAW || type == SOCK_PACKET) { ++ __u32 fakeip = 0; ++ security_learn(GR_IP_LEARN_MSG, current->role->rolename, ++ current->role->roletype, cred->uid, ++ cred->gid, current->exec_file ? 
++			gr_to_filename(current->exec_file->f_path.dentry,
++			current->exec_file->f_path.mnt) :
++			curr->filename, curr->filename,
++			&fakeip, 0, type,
++			protocol, GR_CONNECT, &current->signal->curr_ip);
++		} else if ((type == SOCK_DGRAM) && (protocol == IPPROTO_IP)) {
++			__u32 fakeip = 0;
++			security_learn(GR_IP_LEARN_MSG, current->role->rolename,
++				       current->role->roletype, cred->uid,
++				       cred->gid, current->exec_file ?
++			gr_to_filename(current->exec_file->f_path.dentry,
++			current->exec_file->f_path.mnt) :
++			curr->filename, curr->filename,
++			&fakeip, 0, type,
++			protocol, GR_BIND, &current->signal->curr_ip);
++		}
++		/* we'll log when they use connect or bind */
++		goto exit;
++	}
++
++	gr_log_str3(GR_DONT_AUDIT, GR_SOCK_MSG, "inet",
++		    gr_socktype_to_name(type), gr_proto_to_name(protocol));
++
++	return 0;
++ exit:
++	return 1;
++}
++
++int check_ip_policy(struct acl_ip_label *ip, __u32 ip_addr, __u16 ip_port, __u8 protocol, const int mode, const int type, __u32 our_addr, __u32 our_netmask)
++{
++	if ((ip->mode & mode) &&
++	    (ip_port >= ip->low) &&
++	    (ip_port <= ip->high) &&
++	    ((ntohl(ip_addr) & our_netmask) ==
++	     (ntohl(our_addr) & our_netmask))
++	    && (ip->proto[protocol / 32] & (1 << (protocol % 32)))
++	    && (ip->type & (1 << type))) {
++		if (ip->mode & GR_INVERT)
++			return 2; // specifically denied
++		else
++			return 1; // allowed
++	}
++
++	return 0; // not specifically allowed, may continue parsing
++}
++
++static int
++gr_search_connectbind(const int full_mode, struct sock *sk,
++		      struct sockaddr_in *addr, const int type)
++{
++	char iface[IFNAMSIZ] = {0};
++	struct acl_subject_label *curr;
++	struct acl_ip_label *ip;
++	struct inet_sock *isk;
++	struct net_device *dev;
++	struct in_device *idev;
++	unsigned long i;
++	int ret;
++	int mode = full_mode & (GR_BIND | GR_CONNECT);
++	__u32 ip_addr = 0;
++	__u32 our_addr;
++	__u32 our_netmask;
++	char *p;
++	__u16 ip_port = 0;
++	const struct cred *cred = current_cred();
++
++	if (unlikely(!gr_acl_is_enabled() || sk->sk_family != PF_INET))
++		return 0;
++
++	curr = current->acl;
++	isk = inet_sk(sk);
++
++	/* INADDR_ANY overriding for binds, inaddr_any_override is already in network order */
++	if ((full_mode & GR_BINDOVERRIDE) && addr->sin_addr.s_addr == htonl(INADDR_ANY) && curr->inaddr_any_override != 0)
++		addr->sin_addr.s_addr = curr->inaddr_any_override;
++	if ((full_mode & GR_CONNECT) && isk->inet_saddr == htonl(INADDR_ANY) && curr->inaddr_any_override != 0) {
++		struct sockaddr_in saddr;
++		int err;
++
++		saddr.sin_family = AF_INET;
++		saddr.sin_addr.s_addr = curr->inaddr_any_override;
++		saddr.sin_port = isk->inet_sport;
++
++		err = security_socket_bind(sk->sk_socket, (struct sockaddr *)&saddr, sizeof(struct sockaddr_in));
++		if (err)
++			return err;
++
++		err = sk->sk_socket->ops->bind(sk->sk_socket, (struct sockaddr *)&saddr, sizeof(struct sockaddr_in));
++		if (err)
++			return err;
++	}
++
++	if (!curr->ips)
++		return 0;
++
++	ip_addr = addr->sin_addr.s_addr;
++	ip_port = ntohs(addr->sin_port);
++
++	if (curr->mode & (GR_LEARN | GR_INHERITLEARN)) {
++		security_learn(GR_IP_LEARN_MSG, current->role->rolename,
++			       current->role->roletype, cred->uid,
++			       cred->gid, current->exec_file ?
++			gr_to_filename(current->exec_file->f_path.dentry,
++			current->exec_file->f_path.mnt) :
++			curr->filename, curr->filename,
++			&ip_addr, ip_port, type,
++			sk->sk_protocol, mode, &current->signal->curr_ip);
++		return 0;
++	}
++
++	for (i = 0; i < curr->ip_num; i++) {
++		ip = *(curr->ips + i);
++		if (ip->iface != NULL) {
++			strncpy(iface, ip->iface, IFNAMSIZ - 1);
++			p = strchr(iface, ':');
++			if (p != NULL)
++				*p = '\0';
++			dev = dev_get_by_name(sock_net(sk), iface);
++			if (dev == NULL)
++				continue;
++			idev = in_dev_get(dev);
++			if (idev == NULL) {
++				dev_put(dev);
++				continue;
++			}
++			rcu_read_lock();
++			for_ifa(idev) {
++				if (!strcmp(ip->iface, ifa->ifa_label)) {
++					our_addr = ifa->ifa_address;
++					our_netmask = 0xffffffff;
++					ret = check_ip_policy(ip, ip_addr, ip_port, sk->sk_protocol, mode, type, our_addr, our_netmask);
++					if (ret == 1) {
++						rcu_read_unlock();
++						in_dev_put(idev);
++						dev_put(dev);
++						return 0;
++					} else if (ret == 2) {
++						rcu_read_unlock();
++						in_dev_put(idev);
++						dev_put(dev);
++						goto denied;
++					}
++				}
++			} endfor_ifa(idev);
++			rcu_read_unlock();
++			in_dev_put(idev);
++			dev_put(dev);
++		} else {
++			our_addr = ip->addr;
++			our_netmask = ip->netmask;
++			ret = check_ip_policy(ip, ip_addr, ip_port, sk->sk_protocol, mode, type, our_addr, our_netmask);
++			if (ret == 1)
++				return 0;
++			else if (ret == 2)
++				goto denied;
++		}
++	}
++
++denied:
++	if (mode == GR_BIND)
++		gr_log_int5_str2(GR_DONT_AUDIT, GR_BIND_ACL_MSG, &ip_addr, ip_port, gr_socktype_to_name(type), gr_proto_to_name(sk->sk_protocol));
++	else if (mode == GR_CONNECT)
++		gr_log_int5_str2(GR_DONT_AUDIT, GR_CONNECT_ACL_MSG, &ip_addr, ip_port, gr_socktype_to_name(type), gr_proto_to_name(sk->sk_protocol));
++
++	return -EACCES;
++}
++
++int
++gr_search_connect(struct socket *sock, struct sockaddr_in *addr)
++{
++	return gr_search_connectbind(GR_CONNECT | GR_CONNECTOVERRIDE, sock->sk, addr, sock->type);
++}
++
++int
++gr_search_bind(struct socket *sock, struct sockaddr_in *addr)
++{
++	return gr_search_connectbind(GR_BIND | GR_BINDOVERRIDE, sock->sk, addr, sock->type);
++}
++
++int gr_search_listen(struct socket *sock)
++{
++	struct sock *sk = sock->sk;
++	struct sockaddr_in addr;
++
++	addr.sin_addr.s_addr = inet_sk(sk)->inet_saddr;
++	addr.sin_port = inet_sk(sk)->inet_sport;
++
++	return gr_search_connectbind(GR_BIND | GR_CONNECTOVERRIDE, sock->sk, &addr, sock->type);
++}
++
++int gr_search_accept(struct socket *sock)
++{
++	struct sock *sk = sock->sk;
++	struct sockaddr_in addr;
++
++	addr.sin_addr.s_addr = inet_sk(sk)->inet_saddr;
++	addr.sin_port = inet_sk(sk)->inet_sport;
++
++	return gr_search_connectbind(GR_BIND | GR_CONNECTOVERRIDE, sock->sk, &addr, sock->type);
++}
++
++int
++gr_search_udp_sendmsg(struct sock *sk, struct sockaddr_in *addr)
++{
++	if (addr)
++		return gr_search_connectbind(GR_CONNECT, sk, addr, SOCK_DGRAM);
++	else {
++		struct sockaddr_in sin;
++		const struct inet_sock *inet = inet_sk(sk);
++
++		sin.sin_addr.s_addr = inet->inet_daddr;
++		sin.sin_port = inet->inet_dport;
++
++		return gr_search_connectbind(GR_CONNECT | GR_CONNECTOVERRIDE, sk, &sin, SOCK_DGRAM);
++	}
++}
++
++int
++gr_search_udp_recvmsg(struct sock *sk, const struct sk_buff *skb)
++{
++	struct sockaddr_in sin;
++
++	if (unlikely(skb->len < sizeof (struct udphdr)))
++		return 0; // skip this packet
++
++	sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
++	sin.sin_port = udp_hdr(skb)->source;
++
++	return gr_search_connectbind(GR_CONNECT | GR_CONNECTOVERRIDE, sk, &sin, SOCK_DGRAM);
++}
+diff -urNp linux-2.6.33.1/grsecurity/gracl_learn.c 
linux-2.6.33.1/grsecurity/gracl_learn.c +--- linux-2.6.33.1/grsecurity/gracl_learn.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_learn.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,211 @@ ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <linux/sched.h> ++#include <linux/poll.h> ++#include <linux/smp_lock.h> ++#include <linux/string.h> ++#include <linux/file.h> ++#include <linux/types.h> ++#include <linux/vmalloc.h> ++#include <linux/grinternal.h> ++ ++extern ssize_t write_grsec_handler(struct file * file, const char __user * buf, ++ size_t count, loff_t *ppos); ++extern int gr_acl_is_enabled(void); ++ ++static DECLARE_WAIT_QUEUE_HEAD(learn_wait); ++static int gr_learn_attached; ++ ++/* use a 512k buffer */ ++#define LEARN_BUFFER_SIZE (512 * 1024) ++ ++static DEFINE_SPINLOCK(gr_learn_lock); ++static DECLARE_MUTEX(gr_learn_user_sem); ++ ++/* we need to maintain two buffers, so that the kernel context of grlearn ++ uses a semaphore around the userspace copying, and the other kernel contexts ++ use a spinlock when copying into the buffer, since they cannot sleep ++*/ ++static char *learn_buffer; ++static char *learn_buffer_user; ++static int learn_buffer_len; ++static int learn_buffer_user_len; ++ ++static ssize_t ++read_learn(struct file *file, char __user * buf, size_t count, loff_t * ppos) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ ssize_t retval = 0; ++ ++ add_wait_queue(&learn_wait, &wait); ++ set_current_state(TASK_INTERRUPTIBLE); ++ do { ++ down(&gr_learn_user_sem); ++ spin_lock(&gr_learn_lock); ++ if (learn_buffer_len) ++ break; ++ spin_unlock(&gr_learn_lock); ++ up(&gr_learn_user_sem); ++ if (file->f_flags & O_NONBLOCK) { ++ retval = -EAGAIN; ++ goto out; ++ } ++ if (signal_pending(current)) { ++ retval = -ERESTARTSYS; ++ goto out; ++ } ++ ++ schedule(); ++ } while (1); ++ ++ memcpy(learn_buffer_user, learn_buffer, learn_buffer_len); ++ learn_buffer_user_len = learn_buffer_len; ++ retval = learn_buffer_len; ++ learn_buffer_len = 0; ++ ++ spin_unlock(&gr_learn_lock); ++ ++ if (copy_to_user(buf, learn_buffer_user, learn_buffer_user_len)) ++ retval = -EFAULT; ++ ++ up(&gr_learn_user_sem); ++out: ++ set_current_state(TASK_RUNNING); ++ remove_wait_queue(&learn_wait, &wait); ++ return retval; ++} ++ ++static unsigned int ++poll_learn(struct file * file, poll_table * wait) ++{ ++ poll_wait(file, &learn_wait, wait); ++ ++ if (learn_buffer_len) ++ return (POLLIN | POLLRDNORM); ++ ++ return 0; ++} ++ ++void ++gr_clear_learn_entries(void) ++{ ++ char *tmp; ++ ++ down(&gr_learn_user_sem); ++ if (learn_buffer != NULL) { ++ spin_lock(&gr_learn_lock); ++ tmp = learn_buffer; ++ learn_buffer = NULL; ++ spin_unlock(&gr_learn_lock); ++ vfree(learn_buffer); ++ } ++ if (learn_buffer_user != NULL) { ++ vfree(learn_buffer_user); ++ learn_buffer_user = NULL; ++ } ++ learn_buffer_len = 0; ++ up(&gr_learn_user_sem); ++ ++ return; ++} ++ ++void ++gr_add_learn_entry(const char *fmt, ...) 
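/* [Editor's note: annotation added for this summary, not part of the patch.
 * gr_add_learn_entry() never measures an entry before formatting it.
 * Instead, producers stop appending once the buffer is within 16 KiB of
 * full: on entry learn_buffer_len <= LEARN_BUFFER_SIZE - 16384, and the
 * vsnprintf() below is bounded by LEARN_BUFFER_SIZE - learn_buffer_len, so
 * any single entry shorter than the 16 KiB gap is written whole and the
 * 512 KiB buffer can never overflow.] */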
++{ ++ va_list args; ++ unsigned int len; ++ ++ if (!gr_learn_attached) ++ return; ++ ++ spin_lock(&gr_learn_lock); ++ ++ /* leave a gap at the end so we know when it's "full" but don't have to ++ compute the exact length of the string we're trying to append ++ */ ++ if (learn_buffer_len > LEARN_BUFFER_SIZE - 16384) { ++ spin_unlock(&gr_learn_lock); ++ wake_up_interruptible(&learn_wait); ++ return; ++ } ++ if (learn_buffer == NULL) { ++ spin_unlock(&gr_learn_lock); ++ return; ++ } ++ ++ va_start(args, fmt); ++ len = vsnprintf(learn_buffer + learn_buffer_len, LEARN_BUFFER_SIZE - learn_buffer_len, fmt, args); ++ va_end(args); ++ ++ learn_buffer_len += len + 1; ++ ++ spin_unlock(&gr_learn_lock); ++ wake_up_interruptible(&learn_wait); ++ ++ return; ++} ++ ++static int ++open_learn(struct inode *inode, struct file *file) ++{ ++ if (file->f_mode & FMODE_READ && gr_learn_attached) ++ return -EBUSY; ++ if (file->f_mode & FMODE_READ) { ++ int retval = 0; ++ down(&gr_learn_user_sem); ++ if (learn_buffer == NULL) ++ learn_buffer = vmalloc(LEARN_BUFFER_SIZE); ++ if (learn_buffer_user == NULL) ++ learn_buffer_user = vmalloc(LEARN_BUFFER_SIZE); ++ if (learn_buffer == NULL) { ++ retval = -ENOMEM; ++ goto out_error; ++ } ++ if (learn_buffer_user == NULL) { ++ retval = -ENOMEM; ++ goto out_error; ++ } ++ learn_buffer_len = 0; ++ learn_buffer_user_len = 0; ++ gr_learn_attached = 1; ++out_error: ++ up(&gr_learn_user_sem); ++ return retval; ++ } ++ return 0; ++} ++ ++static int ++close_learn(struct inode *inode, struct file *file) ++{ ++ char *tmp; ++ ++ if (file->f_mode & FMODE_READ) { ++ down(&gr_learn_user_sem); ++ if (learn_buffer != NULL) { ++ spin_lock(&gr_learn_lock); ++ tmp = learn_buffer; ++ learn_buffer = NULL; ++ spin_unlock(&gr_learn_lock); ++ vfree(tmp); ++ } ++ if (learn_buffer_user != NULL) { ++ vfree(learn_buffer_user); ++ learn_buffer_user = NULL; ++ } ++ learn_buffer_len = 0; ++ learn_buffer_user_len = 0; ++ gr_learn_attached = 0; ++ up(&gr_learn_user_sem); ++ } ++ ++ return 0; ++} ++ ++const struct file_operations grsec_fops = { ++ .read = read_learn, ++ .write = write_grsec_handler, ++ .open = open_learn, ++ .release = close_learn, ++ .poll = poll_learn, ++}; +diff -urNp linux-2.6.33.1/grsecurity/gracl_res.c linux-2.6.33.1/grsecurity/gracl_res.c +--- linux-2.6.33.1/grsecurity/gracl_res.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_res.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,65 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/gracl.h> ++#include <linux/grinternal.h> ++ ++static const char *restab_log[] = { ++ [RLIMIT_CPU] = "RLIMIT_CPU", ++ [RLIMIT_FSIZE] = "RLIMIT_FSIZE", ++ [RLIMIT_DATA] = "RLIMIT_DATA", ++ [RLIMIT_STACK] = "RLIMIT_STACK", ++ [RLIMIT_CORE] = "RLIMIT_CORE", ++ [RLIMIT_RSS] = "RLIMIT_RSS", ++ [RLIMIT_NPROC] = "RLIMIT_NPROC", ++ [RLIMIT_NOFILE] = "RLIMIT_NOFILE", ++ [RLIMIT_MEMLOCK] = "RLIMIT_MEMLOCK", ++ [RLIMIT_AS] = "RLIMIT_AS", ++ [RLIMIT_LOCKS] = "RLIMIT_LOCKS", ++ [RLIMIT_SIGPENDING] = "RLIMIT_SIGPENDING", ++ [RLIMIT_MSGQUEUE] = "RLIMIT_MSGQUEUE", ++ [RLIMIT_NICE] = "RLIMIT_NICE", ++ [RLIMIT_RTPRIO] = "RLIMIT_RTPRIO", ++ [RLIMIT_RTTIME] = "RLIMIT_RTTIME", ++ [GR_CRASH_RES] = "RLIMIT_CRASH" ++}; ++ ++void ++gr_log_resource(const struct task_struct *task, ++ const int res, const unsigned long wanted, const int gt) ++{ ++ const struct cred *cred; ++ ++ if (!gr_acl_is_enabled() && !grsec_resource_logging) ++ return; ++ ++ // not yet supported resource ++ if (!restab_log[res]) ++ return; ++ ++ 
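/* [Editor's note: annotation added for this summary, not part of the patch.
 * The rest of gr_log_resource() skips logging when the task holds a
 * capability that legitimately lets it exceed the limit (CAP_SYS_ADMIN or
 * CAP_SYS_RESOURCE for RLIMIT_NPROC, CAP_IPC_LOCK for RLIMIT_MEMLOCK,
 * CAP_SYS_NICE for RLIMIT_NICE), and it logs only when the request actually
 * crosses rlim_cur, strictly or inclusively depending on the caller's 'gt'
 * flag. That comparison, isolated as a hypothetical helper: */
/* Returns 1 when the request crosses the current soft limit. */
static int crosses_limit(unsigned long wanted, unsigned long cur,
			 int gt, unsigned long infinity)
{
	if (cur == infinity)	/* RLIM_INFINITY: nothing to report */
		return 0;
	return gt ? wanted > cur : wanted >= cur;
}
/* [End of editor's note.] */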
rcu_read_lock(); ++ cred = __task_cred(task); ++ ++ if (res == RLIMIT_NPROC && ++ (cap_raised(cred->cap_effective, CAP_SYS_ADMIN) || ++ cap_raised(cred->cap_effective, CAP_SYS_RESOURCE))) ++ goto out_rcu_unlock; ++ else if (res == RLIMIT_MEMLOCK && ++ cap_raised(cred->cap_effective, CAP_IPC_LOCK)) ++ goto out_rcu_unlock; ++ else if (res == RLIMIT_NICE && cap_raised(cred->cap_effective, CAP_SYS_NICE)) ++ goto out_rcu_unlock; ++ rcu_read_unlock(); ++ ++ preempt_disable(); ++ ++ if (unlikely(((gt && wanted > task->signal->rlim[res].rlim_cur) || ++ (!gt && wanted >= task->signal->rlim[res].rlim_cur)) && ++ task->signal->rlim[res].rlim_cur != RLIM_INFINITY)) ++ gr_log_res_ulong2_str(GR_DONT_AUDIT, GR_RESOURCE_MSG, task, wanted, restab_log[res], task->signal->rlim[res].rlim_cur); ++ preempt_enable_no_resched(); ++ ++ return; ++out_rcu_unlock: ++ rcu_read_unlock(); ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/gracl_segv.c linux-2.6.33.1/grsecurity/gracl_segv.c +--- linux-2.6.33.1/grsecurity/gracl_segv.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_segv.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,310 @@ ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <asm/uaccess.h> ++#include <asm/errno.h> ++#include <asm/mman.h> ++#include <net/sock.h> ++#include <linux/file.h> ++#include <linux/fs.h> ++#include <linux/net.h> ++#include <linux/in.h> ++#include <linux/smp_lock.h> ++#include <linux/slab.h> ++#include <linux/types.h> ++#include <linux/sched.h> ++#include <linux/timer.h> ++#include <linux/gracl.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++static struct crash_uid *uid_set; ++static unsigned short uid_used; ++static DEFINE_SPINLOCK(gr_uid_lock); ++extern rwlock_t gr_inode_lock; ++extern struct acl_subject_label * ++ lookup_acl_subj_label(const ino_t inode, const dev_t dev, ++ struct acl_role_label *role); ++extern int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t); ++ ++int ++gr_init_uidset(void) ++{ ++ uid_set = ++ kmalloc(GR_UIDTABLE_MAX * sizeof (struct crash_uid), GFP_KERNEL); ++ uid_used = 0; ++ ++ return uid_set ? 
1 : 0; ++} ++ ++void ++gr_free_uidset(void) ++{ ++ if (uid_set) ++ kfree(uid_set); ++ ++ return; ++} ++ ++int ++gr_find_uid(const uid_t uid) ++{ ++ struct crash_uid *tmp = uid_set; ++ uid_t buid; ++ int low = 0, high = uid_used - 1, mid; ++ ++ while (high >= low) { ++ mid = (low + high) >> 1; ++ buid = tmp[mid].uid; ++ if (buid == uid) ++ return mid; ++ if (buid > uid) ++ high = mid - 1; ++ if (buid < uid) ++ low = mid + 1; ++ } ++ ++ return -1; ++} ++ ++static __inline__ void ++gr_insertsort(void) ++{ ++ unsigned short i, j; ++ struct crash_uid index; ++ ++ for (i = 1; i < uid_used; i++) { ++ index = uid_set[i]; ++ j = i; ++ while ((j > 0) && uid_set[j - 1].uid > index.uid) { ++ uid_set[j] = uid_set[j - 1]; ++ j--; ++ } ++ uid_set[j] = index; ++ } ++ ++ return; ++} ++ ++static __inline__ void ++gr_insert_uid(const uid_t uid, const unsigned long expires) ++{ ++ int loc; ++ ++ if (uid_used == GR_UIDTABLE_MAX) ++ return; ++ ++ loc = gr_find_uid(uid); ++ ++ if (loc >= 0) { ++ uid_set[loc].expires = expires; ++ return; ++ } ++ ++ uid_set[uid_used].uid = uid; ++ uid_set[uid_used].expires = expires; ++ uid_used++; ++ ++ gr_insertsort(); ++ ++ return; ++} ++ ++void ++gr_remove_uid(const unsigned short loc) ++{ ++ unsigned short i; ++ ++ for (i = loc + 1; i < uid_used; i++) ++ uid_set[i - 1] = uid_set[i]; ++ ++ uid_used--; ++ ++ return; ++} ++ ++int ++gr_check_crash_uid(const uid_t uid) ++{ ++ int loc; ++ int ret = 0; ++ ++ if (unlikely(!gr_acl_is_enabled())) ++ return 0; ++ ++ spin_lock(&gr_uid_lock); ++ loc = gr_find_uid(uid); ++ ++ if (loc < 0) ++ goto out_unlock; ++ ++ if (time_before_eq(uid_set[loc].expires, get_seconds())) ++ gr_remove_uid(loc); ++ else ++ ret = 1; ++ ++out_unlock: ++ spin_unlock(&gr_uid_lock); ++ return ret; ++} ++ ++static __inline__ int ++proc_is_setxid(const struct cred *cred) ++{ ++ if (cred->uid != cred->euid || cred->uid != cred->suid || ++ cred->uid != cred->fsuid) ++ return 1; ++ if (cred->gid != cred->egid || cred->gid != cred->sgid || ++ cred->gid != cred->fsgid) ++ return 1; ++ ++ return 0; ++} ++static __inline__ int ++gr_fake_force_sig(int sig, struct task_struct *t) ++{ ++ unsigned long int flags; ++ int ret, blocked, ignored; ++ struct k_sigaction *action; ++ ++ spin_lock_irqsave(&t->sighand->siglock, flags); ++ action = &t->sighand->action[sig-1]; ++ ignored = action->sa.sa_handler == SIG_IGN; ++ blocked = sigismember(&t->blocked, sig); ++ if (blocked || ignored) { ++ action->sa.sa_handler = SIG_DFL; ++ if (blocked) { ++ sigdelset(&t->blocked, sig); ++ recalc_sigpending_and_wake(t); ++ } ++ } ++ if (action->sa.sa_handler == SIG_DFL) ++ t->signal->flags &= ~SIGNAL_UNKILLABLE; ++ ret = specific_send_sig_info(sig, SEND_SIG_PRIV, t); ++ ++ spin_unlock_irqrestore(&t->sighand->siglock, flags); ++ ++ return ret; ++} ++ ++void ++gr_handle_crash(struct task_struct *task, const int sig) ++{ ++ struct acl_subject_label *curr; ++ struct acl_subject_label *curr2; ++ struct task_struct *tsk, *tsk2; ++ const struct cred *cred; ++ const struct cred *cred2; ++ ++ if (sig != SIGSEGV && sig != SIGKILL && sig != SIGBUS && sig != SIGILL) ++ return; ++ ++ if (unlikely(!gr_acl_is_enabled())) ++ return; ++ ++ curr = task->acl; ++ ++ if (!(curr->resmask & (1 << GR_CRASH_RES))) ++ return; ++ ++ if (time_before_eq(curr->expires, get_seconds())) { ++ curr->expires = 0; ++ curr->crashes = 0; ++ } ++ ++ curr->crashes++; ++ ++ if (!curr->expires) ++ curr->expires = get_seconds() + curr->res[GR_CRASH_RES].rlim_max; ++ ++ if ((curr->crashes >= curr->res[GR_CRASH_RES].rlim_cur) && ++ 
time_after(curr->expires, get_seconds())) { ++ rcu_read_lock(); ++ cred = __task_cred(task); ++ if (cred->uid && proc_is_setxid(cred)) { ++ gr_log_crash1(GR_DONT_AUDIT, GR_SEGVSTART_ACL_MSG, task, curr->res[GR_CRASH_RES].rlim_max); ++ spin_lock(&gr_uid_lock); ++ gr_insert_uid(cred->uid, curr->expires); ++ spin_unlock(&gr_uid_lock); ++ curr->expires = 0; ++ curr->crashes = 0; ++ read_lock(&tasklist_lock); ++ do_each_thread(tsk2, tsk) { ++ cred2 = __task_cred(tsk); ++ if (tsk != task && cred2->uid == cred->uid) ++ gr_fake_force_sig(SIGKILL, tsk); ++ } while_each_thread(tsk2, tsk); ++ read_unlock(&tasklist_lock); ++ } else { ++ gr_log_crash2(GR_DONT_AUDIT, GR_SEGVNOSUID_ACL_MSG, task, curr->res[GR_CRASH_RES].rlim_max); ++ read_lock(&tasklist_lock); ++ do_each_thread(tsk2, tsk) { ++ if (likely(tsk != task)) { ++ curr2 = tsk->acl; ++ ++ if (curr2->device == curr->device && ++ curr2->inode == curr->inode) ++ gr_fake_force_sig(SIGKILL, tsk); ++ } ++ } while_each_thread(tsk2, tsk); ++ read_unlock(&tasklist_lock); ++ } ++ rcu_read_unlock(); ++ } ++ ++ return; ++} ++ ++int ++gr_check_crash_exec(const struct file *filp) ++{ ++ struct acl_subject_label *curr; ++ ++ if (unlikely(!gr_acl_is_enabled())) ++ return 0; ++ ++ read_lock(&gr_inode_lock); ++ curr = lookup_acl_subj_label(filp->f_path.dentry->d_inode->i_ino, ++ filp->f_path.dentry->d_inode->i_sb->s_dev, ++ current->role); ++ read_unlock(&gr_inode_lock); ++ ++ if (!curr || !(curr->resmask & (1 << GR_CRASH_RES)) || ++ (!curr->crashes && !curr->expires)) ++ return 0; ++ ++ if ((curr->crashes >= curr->res[GR_CRASH_RES].rlim_cur) && ++ time_after(curr->expires, get_seconds())) ++ return 1; ++ else if (time_before_eq(curr->expires, get_seconds())) { ++ curr->crashes = 0; ++ curr->expires = 0; ++ } ++ ++ return 0; ++} ++ ++void ++gr_handle_alertkill(struct task_struct *task) ++{ ++ struct acl_subject_label *curracl; ++ __u32 curr_ip; ++ struct task_struct *p, *p2; ++ ++ if (unlikely(!gr_acl_is_enabled())) ++ return; ++ ++ curracl = task->acl; ++ curr_ip = task->signal->curr_ip; ++ ++ if ((curracl->mode & GR_KILLIPPROC) && curr_ip) { ++ read_lock(&tasklist_lock); ++ do_each_thread(p2, p) { ++ if (p->signal->curr_ip == curr_ip) ++ gr_fake_force_sig(SIGKILL, p); ++ } while_each_thread(p2, p); ++ read_unlock(&tasklist_lock); ++ } else if (curracl->mode & GR_KILLPROC) ++ gr_fake_force_sig(SIGKILL, task); ++ ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/gracl_shm.c linux-2.6.33.1/grsecurity/gracl_shm.c +--- linux-2.6.33.1/grsecurity/gracl_shm.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/gracl_shm.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,37 @@ ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <linux/sched.h> ++#include <linux/file.h> ++#include <linux/ipc.h> ++#include <linux/gracl.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++int ++gr_handle_shmat(const pid_t shm_cprid, const pid_t shm_lapid, ++ const time_t shm_createtime, const uid_t cuid, const int shmid) ++{ ++ struct task_struct *task; ++ ++ if (!gr_acl_is_enabled()) ++ return 1; ++ ++ read_lock(&tasklist_lock); ++ ++ task = find_task_by_vpid(shm_cprid); ++ ++ if (unlikely(!task)) ++ task = find_task_by_vpid(shm_lapid); ++ ++ if (unlikely(task && (time_before_eq((unsigned long)task->start_time.tv_sec, (unsigned long)shm_createtime) || ++ (task->pid == shm_lapid)) && ++ (task->acl->mode & GR_PROTSHM) && ++ (task->acl != current->acl))) { ++ read_unlock(&tasklist_lock); ++ gr_log_int3(GR_DONT_AUDIT, GR_SHMAT_ACL_MSG, cuid, 
shm_cprid, shmid); ++ return 0; ++ } ++ read_unlock(&tasklist_lock); ++ ++ return 1; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_chdir.c linux-2.6.33.1/grsecurity/grsec_chdir.c +--- linux-2.6.33.1/grsecurity/grsec_chdir.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_chdir.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,19 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/file.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++void ++gr_log_chdir(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++#ifdef CONFIG_GRKERNSEC_AUDIT_CHDIR ++ if ((grsec_enable_chdir && grsec_enable_group && ++ in_group_p(grsec_audit_gid)) || (grsec_enable_chdir && ++ !grsec_enable_group)) { ++ gr_log_fs_generic(GR_DO_AUDIT, GR_CHDIR_AUDIT_MSG, dentry, mnt); ++ } ++#endif ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_chroot.c linux-2.6.33.1/grsecurity/grsec_chroot.c +--- linux-2.6.33.1/grsecurity/grsec_chroot.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_chroot.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,348 @@ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/file.h> ++#include <linux/fs.h> ++#include <linux/mount.h> ++#include <linux/types.h> ++#include <linux/pid_namespace.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++int ++gr_handle_chroot_unix(const pid_t pid) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX ++ struct pid *spid = NULL; ++ ++ if (unlikely(!grsec_enable_chroot_unix)) ++ return 1; ++ ++ if (likely(!proc_is_chrooted(current))) ++ return 1; ++ ++ read_lock(&tasklist_lock); ++ ++ spid = find_vpid(pid); ++ if (spid) { ++ struct task_struct *p; ++ p = pid_task(spid, PIDTYPE_PID); ++ gr_fs_read_lock(p); ++ if (unlikely(!have_same_root(current, p))) { ++ gr_fs_read_unlock(p); ++ read_unlock(&tasklist_lock); ++ gr_log_noargs(GR_DONT_AUDIT, GR_UNIX_CHROOT_MSG); ++ return 0; ++ } ++ gr_fs_read_unlock(p); ++ } ++ read_unlock(&tasklist_lock); ++#endif ++ return 1; ++} ++ ++int ++gr_handle_chroot_nice(void) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_NICE ++ if (grsec_enable_chroot_nice && proc_is_chrooted(current)) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_NICE_CHROOT_MSG); ++ return -EPERM; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_chroot_setpriority(struct task_struct *p, const int niceval) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_NICE ++ if (grsec_enable_chroot_nice && (niceval < task_nice(p)) ++ && proc_is_chrooted(current)) { ++ gr_log_str_int(GR_DONT_AUDIT, GR_PRIORITY_CHROOT_MSG, p->comm, p->pid); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_chroot_rawio(const struct inode *inode) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS ++ if (grsec_enable_chroot_caps && proc_is_chrooted(current) && ++ inode && S_ISBLK(inode->i_mode) && !capable(CAP_SYS_RAWIO)) ++ return 1; ++#endif ++ return 0; ++} ++ ++int ++gr_pid_is_chrooted(struct task_struct *p) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_FINDTASK ++ if (!grsec_enable_chroot_findtask || !proc_is_chrooted(current) || p == NULL) ++ return 0; ++ ++ gr_fs_read_lock(p); ++ if ((p->exit_state & (EXIT_ZOMBIE | EXIT_DEAD)) || ++ !have_same_root(current, p)) { ++ gr_fs_read_unlock(p); ++ return 1; ++ } ++ gr_fs_read_unlock(p); ++#endif ++ return 0; ++} ++ ++EXPORT_SYMBOL(gr_pid_is_chrooted); ++ ++#if defined(CONFIG_GRKERNSEC_CHROOT_DOUBLE) || defined(CONFIG_GRKERNSEC_CHROOT_FCHDIR) ++int gr_is_outside_chroot(const struct 
dentry *u_dentry, const struct vfsmount *u_mnt) ++{ ++ struct dentry *dentry = (struct dentry *)u_dentry; ++ struct vfsmount *mnt = (struct vfsmount *)u_mnt; ++ struct dentry *realroot; ++ struct vfsmount *realrootmnt; ++ struct dentry *currentroot; ++ struct vfsmount *currentmnt; ++ struct task_struct *reaper = &init_task; ++ int ret = 1; ++ ++ read_lock(&reaper->fs->lock); ++ realrootmnt = mntget(reaper->fs->root.mnt); ++ realroot = dget(reaper->fs->root.dentry); ++ read_unlock(&reaper->fs->lock); ++ ++ read_lock(¤t->fs->lock); ++ currentmnt = mntget(current->fs->root.mnt); ++ currentroot = dget(current->fs->root.dentry); ++ read_unlock(¤t->fs->lock); ++ ++ spin_lock(&dcache_lock); ++ for (;;) { ++ if (unlikely((dentry == realroot && mnt == realrootmnt) ++ || (dentry == currentroot && mnt == currentmnt))) ++ break; ++ if (unlikely(dentry == mnt->mnt_root || IS_ROOT(dentry))) { ++ if (mnt->mnt_parent == mnt) ++ break; ++ dentry = mnt->mnt_mountpoint; ++ mnt = mnt->mnt_parent; ++ continue; ++ } ++ dentry = dentry->d_parent; ++ } ++ spin_unlock(&dcache_lock); ++ ++ dput(currentroot); ++ mntput(currentmnt); ++ ++ /* access is outside of chroot */ ++ if (dentry == realroot && mnt == realrootmnt) ++ ret = 0; ++ ++ dput(realroot); ++ mntput(realrootmnt); ++ return ret; ++} ++#endif ++ ++int ++gr_chroot_fchdir(struct dentry *u_dentry, struct vfsmount *u_mnt) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_FCHDIR ++ if (!grsec_enable_chroot_fchdir) ++ return 1; ++ ++ if (!proc_is_chrooted(current)) ++ return 1; ++ else if (!gr_is_outside_chroot(u_dentry, u_mnt)) { ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_CHROOT_FCHDIR_MSG, u_dentry, u_mnt); ++ return 0; ++ } ++#endif ++ return 1; ++} ++ ++int ++gr_chroot_shmat(const pid_t shm_cprid, const pid_t shm_lapid, ++ const time_t shm_createtime) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_SHMAT ++ struct pid *pid = NULL; ++ time_t starttime; ++ ++ if (unlikely(!grsec_enable_chroot_shmat)) ++ return 1; ++ ++ if (likely(!proc_is_chrooted(current))) ++ return 1; ++ ++ read_lock(&tasklist_lock); ++ ++ pid = find_vpid(shm_cprid); ++ if (pid) { ++ struct task_struct *p; ++ p = pid_task(pid, PIDTYPE_PID); ++ gr_fs_read_lock(p); ++ starttime = p->start_time.tv_sec; ++ if (unlikely(!have_same_root(current, p) && ++ time_before_eq((unsigned long)starttime, (unsigned long)shm_createtime))) { ++ gr_fs_read_unlock(p); ++ read_unlock(&tasklist_lock); ++ gr_log_noargs(GR_DONT_AUDIT, GR_SHMAT_CHROOT_MSG); ++ return 0; ++ } ++ gr_fs_read_unlock(p); ++ } else { ++ pid = find_vpid(shm_lapid); ++ if (pid) { ++ struct task_struct *p; ++ p = pid_task(pid, PIDTYPE_PID); ++ gr_fs_read_lock(p); ++ if (unlikely(!have_same_root(current, p))) { ++ gr_fs_read_unlock(p); ++ read_unlock(&tasklist_lock); ++ gr_log_noargs(GR_DONT_AUDIT, GR_SHMAT_CHROOT_MSG); ++ return 0; ++ } ++ gr_fs_read_unlock(p); ++ } ++ } ++ ++ read_unlock(&tasklist_lock); ++#endif ++ return 1; ++} ++ ++void ++gr_log_chroot_exec(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_EXECLOG ++ if (grsec_enable_chroot_execlog && proc_is_chrooted(current)) ++ gr_log_fs_generic(GR_DO_AUDIT, GR_EXEC_CHROOT_MSG, dentry, mnt); ++#endif ++ return; ++} ++ ++int ++gr_handle_chroot_mknod(const struct dentry *dentry, ++ const struct vfsmount *mnt, const int mode) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_MKNOD ++ if (grsec_enable_chroot_mknod && !S_ISFIFO(mode) && !S_ISREG(mode) && ++ proc_is_chrooted(current)) { ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_MKNOD_CHROOT_MSG, dentry, mnt); ++ return -EPERM; ++ } 
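
gr_is_outside_chroot in the grsec_chroot.c hunk above walks the dentry/vfsmount chain upward until it reaches either the calling task's root or the real root owned by init; only the second outcome means the access escaped the jail. The classic userspace analogue of that walk ascends ".." until "." and ".." name the same inode (illustration only; the kernel version never leaves the dentry cache):

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
    struct stat here, up;

    /* climb until "." and ".." are the same inode: that is our root */
    for (;;) {
        if (stat(".", &here) || stat("..", &up))
            return 1;
        if (here.st_dev == up.st_dev && here.st_ino == up.st_ino)
            break;
        if (chdir(".."))
            return 1;
    }
    printf("visible root: dev=%lu ino=%lu\n",
           (unsigned long)here.st_dev, (unsigned long)here.st_ino);
    return 0;
}

Run normally this reports the real "/"; run inside a chroot it reports the jail's root, and comparing the two answers exactly the question the kernel routine asks. The hunk continues below.
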
++#endif ++ return 0; ++} ++ ++int ++gr_handle_chroot_mount(const struct dentry *dentry, ++ const struct vfsmount *mnt, const char *dev_name) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_MOUNT ++ if (grsec_enable_chroot_mount && proc_is_chrooted(current)) { ++ gr_log_str_fs(GR_DONT_AUDIT, GR_MOUNT_CHROOT_MSG, dev_name, dentry, mnt); ++ return -EPERM; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_chroot_pivot(void) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_PIVOT ++ if (grsec_enable_chroot_pivot && proc_is_chrooted(current)) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_PIVOT_CHROOT_MSG); ++ return -EPERM; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_chroot_chroot(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_DOUBLE ++ if (grsec_enable_chroot_double && proc_is_chrooted(current) && ++ !gr_is_outside_chroot(dentry, mnt)) { ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_CHROOT_CHROOT_MSG, dentry, mnt); ++ return -EPERM; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_chroot_caps(struct path *path) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS ++ if (grsec_enable_chroot_caps && current->pid > 1 && current->fs != NULL && ++ (init_task.fs->root.dentry != path->dentry) && ++ (current->nsproxy->mnt_ns->root->mnt_root != path->dentry)) { ++ ++ kernel_cap_t chroot_caps = GR_CHROOT_CAPS; ++ const struct cred *old = current_cred(); ++ struct cred *new = prepare_creds(); ++ if (new == NULL) ++ return 1; ++ ++ new->cap_permitted = cap_drop(old->cap_permitted, ++ chroot_caps); ++ new->cap_inheritable = cap_drop(old->cap_inheritable, ++ chroot_caps); ++ new->cap_effective = cap_drop(old->cap_effective, ++ chroot_caps); ++ ++ commit_creds(new); ++ ++ return 0; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_chroot_sysctl(const int op) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_SYSCTL ++ if (grsec_enable_chroot_sysctl && (op & MAY_WRITE) && ++ proc_is_chrooted(current)) ++ return -EACCES; ++#endif ++ return 0; ++} ++ ++void ++gr_handle_chroot_chdir(struct path *path) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_CHDIR ++ if (grsec_enable_chroot_chdir) ++ set_fs_pwd(current->fs, path); ++#endif ++ return; ++} ++ ++int ++gr_handle_chroot_chmod(const struct dentry *dentry, ++ const struct vfsmount *mnt, const int mode) ++{ ++#ifdef CONFIG_GRKERNSEC_CHROOT_CHMOD ++ if (grsec_enable_chroot_chmod && ++ ((mode & S_ISUID) || ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && ++ proc_is_chrooted(current)) { ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_CHMOD_CHROOT_MSG, dentry, mnt); ++ return -EPERM; ++ } ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_SECURITY ++EXPORT_SYMBOL(gr_handle_chroot_caps); ++#endif +diff -urNp linux-2.6.33.1/grsecurity/grsec_disabled.c linux-2.6.33.1/grsecurity/grsec_disabled.c +--- linux-2.6.33.1/grsecurity/grsec_disabled.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_disabled.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,426 @@ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/file.h> ++#include <linux/fs.h> ++#include <linux/kdev_t.h> ++#include <linux/net.h> ++#include <linux/in.h> ++#include <linux/ip.h> ++#include <linux/skbuff.h> ++#include <linux/sysctl.h> ++ ++#ifdef CONFIG_PAX_HAVE_ACL_FLAGS ++void ++pax_set_initial_flags(struct linux_binprm *bprm) ++{ ++ return; ++} ++#endif ++ ++#ifdef CONFIG_SYSCTL ++__u32 ++gr_handle_sysctl(const struct ctl_table * table, const int op) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_TASKSTATS ++int gr_is_taskstats_denied(int pid) ++{ ++ return 
0; ++} ++#endif ++ ++int ++gr_acl_is_enabled(void) ++{ ++ return 0; ++} ++ ++int ++gr_handle_rawio(const struct inode *inode) ++{ ++ return 0; ++} ++ ++void ++gr_acl_handle_psacct(struct task_struct *task, const long code) ++{ ++ return; ++} ++ ++int ++gr_handle_ptrace(struct task_struct *task, const long request) ++{ ++ return 0; ++} ++ ++int ++gr_handle_proc_ptrace(struct task_struct *task) ++{ ++ return 0; ++} ++ ++void ++gr_learn_resource(const struct task_struct *task, ++ const int res, const unsigned long wanted, const int gt) ++{ ++ return; ++} ++ ++int ++gr_set_acls(const int type) ++{ ++ return 0; ++} ++ ++int ++gr_check_hidden_task(const struct task_struct *tsk) ++{ ++ return 0; ++} ++ ++int ++gr_check_protected_task(const struct task_struct *task) ++{ ++ return 0; ++} ++ ++void ++gr_copy_label(struct task_struct *tsk) ++{ ++ return; ++} ++ ++void ++gr_set_pax_flags(struct task_struct *task) ++{ ++ return; ++} ++ ++int ++gr_set_proc_label(const struct dentry *dentry, const struct vfsmount *mnt, ++ const int unsafe_share) ++{ ++ return 0; ++} ++ ++void ++gr_handle_delete(const ino_t ino, const dev_t dev) ++{ ++ return; ++} ++ ++void ++gr_handle_create(const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++ return; ++} ++ ++void ++gr_handle_crash(struct task_struct *task, const int sig) ++{ ++ return; ++} ++ ++int ++gr_check_crash_exec(const struct file *filp) ++{ ++ return 0; ++} ++ ++int ++gr_check_crash_uid(const uid_t uid) ++{ ++ return 0; ++} ++ ++void ++gr_handle_rename(struct inode *old_dir, struct inode *new_dir, ++ struct dentry *old_dentry, ++ struct dentry *new_dentry, ++ struct vfsmount *mnt, const __u8 replace) ++{ ++ return; ++} ++ ++int ++gr_search_socket(const int family, const int type, const int protocol) ++{ ++ return 1; ++} ++ ++int ++gr_search_connectbind(const int mode, const struct socket *sock, ++ const struct sockaddr_in *addr) ++{ ++ return 0; ++} ++ ++int ++gr_is_capable(const int cap) ++{ ++ return 1; ++} ++ ++int ++gr_is_capable_nolog(const int cap) ++{ ++ return 1; ++} ++ ++void ++gr_handle_alertkill(struct task_struct *task) ++{ ++ return; ++} ++ ++__u32 ++gr_acl_handle_execve(const struct dentry * dentry, const struct vfsmount * mnt) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_hidden_file(const struct dentry * dentry, ++ const struct vfsmount * mnt) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_open(const struct dentry * dentry, const struct vfsmount * mnt, ++ const int fmode) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_rmdir(const struct dentry * dentry, const struct vfsmount * mnt) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_unlink(const struct dentry * dentry, const struct vfsmount * mnt) ++{ ++ return 1; ++} ++ ++int ++gr_acl_handle_mmap(const struct file *file, const unsigned long prot, ++ unsigned int *vm_flags) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_truncate(const struct dentry * dentry, ++ const struct vfsmount * mnt) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_utime(const struct dentry * dentry, const struct vfsmount * mnt) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_access(const struct dentry * dentry, ++ const struct vfsmount * mnt, const int fmode) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_fchmod(const struct dentry * dentry, const struct vfsmount * mnt, ++ mode_t mode) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_chmod(const struct dentry * dentry, const struct vfsmount * mnt, ++ mode_t mode) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_chown(const struct dentry * dentry, const struct 
vfsmount * mnt) ++{ ++ return 1; ++} ++ ++void ++grsecurity_init(void) ++{ ++ return; ++} ++ ++__u32 ++gr_acl_handle_mknod(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt, ++ const int mode) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_mkdir(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_symlink(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt, const char *from) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_link(const struct dentry * new_dentry, ++ const struct dentry * parent_dentry, ++ const struct vfsmount * parent_mnt, ++ const struct dentry * old_dentry, ++ const struct vfsmount * old_mnt, const char *to) ++{ ++ return 1; ++} ++ ++int ++gr_acl_handle_rename(const struct dentry *new_dentry, ++ const struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt, ++ const struct dentry *old_dentry, ++ const struct inode *old_parent_inode, ++ const struct vfsmount *old_mnt, const char *newname) ++{ ++ return 0; ++} ++ ++int ++gr_acl_handle_filldir(const struct file *file, const char *name, ++ const int namelen, const ino_t ino) ++{ ++ return 1; ++} ++ ++int ++gr_handle_shmat(const pid_t shm_cprid, const pid_t shm_lapid, ++ const time_t shm_createtime, const uid_t cuid, const int shmid) ++{ ++ return 1; ++} ++ ++int ++gr_search_bind(const struct socket *sock, const struct sockaddr_in *addr) ++{ ++ return 0; ++} ++ ++int ++gr_search_accept(const struct socket *sock) ++{ ++ return 0; ++} ++ ++int ++gr_search_listen(const struct socket *sock) ++{ ++ return 0; ++} ++ ++int ++gr_search_connect(const struct socket *sock, const struct sockaddr_in *addr) ++{ ++ return 0; ++} ++ ++__u32 ++gr_acl_handle_unix(const struct dentry * dentry, const struct vfsmount * mnt) ++{ ++ return 1; ++} ++ ++__u32 ++gr_acl_handle_creat(const struct dentry * dentry, ++ const struct dentry * p_dentry, ++ const struct vfsmount * p_mnt, const int fmode, ++ const int imode) ++{ ++ return 1; ++} ++ ++void ++gr_acl_handle_exit(void) ++{ ++ return; ++} ++ ++int ++gr_acl_handle_mprotect(const struct file *file, const unsigned long prot) ++{ ++ return 1; ++} ++ ++void ++gr_set_role_label(const uid_t uid, const gid_t gid) ++{ ++ return; ++} ++ ++int ++gr_acl_handle_procpidmem(const struct task_struct *task) ++{ ++ return 0; ++} ++ ++int ++gr_search_udp_recvmsg(const struct sock *sk, const struct sk_buff *skb) ++{ ++ return 0; ++} ++ ++int ++gr_search_udp_sendmsg(const struct sock *sk, const struct sockaddr_in *addr) ++{ ++ return 0; ++} ++ ++void ++gr_set_kernel_label(struct task_struct *task) ++{ ++ return; ++} ++ ++int ++gr_check_user_change(int real, int effective, int fs) ++{ ++ return 0; ++} ++ ++int ++gr_check_group_change(int real, int effective, int fs) ++{ ++ return 0; ++} ++ ++ ++EXPORT_SYMBOL(gr_is_capable); ++EXPORT_SYMBOL(gr_is_capable_nolog); ++EXPORT_SYMBOL(gr_learn_resource); ++EXPORT_SYMBOL(gr_set_kernel_label); ++#ifdef CONFIG_SECURITY ++EXPORT_SYMBOL(gr_check_user_change); ++EXPORT_SYMBOL(gr_check_group_change); ++#endif +diff -urNp linux-2.6.33.1/grsecurity/grsec_exec.c linux-2.6.33.1/grsecurity/grsec_exec.c +--- linux-2.6.33.1/grsecurity/grsec_exec.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_exec.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,89 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include 
<linux/file.h> ++#include <linux/binfmts.h> ++#include <linux/smp_lock.h> ++#include <linux/fs.h> ++#include <linux/types.h> ++#include <linux/grdefs.h> ++#include <linux/grinternal.h> ++#include <linux/capability.h> ++ ++#include <asm/uaccess.h> ++ ++#ifdef CONFIG_GRKERNSEC_EXECLOG ++static char gr_exec_arg_buf[132]; ++static DECLARE_MUTEX(gr_exec_arg_sem); ++#endif ++ ++int ++gr_handle_nproc(void) ++{ ++#ifdef CONFIG_GRKERNSEC_EXECVE ++ const struct cred *cred = current_cred(); ++ if (grsec_enable_execve && cred->user && ++ (atomic_read(&cred->user->processes) > ++ current->signal->rlim[RLIMIT_NPROC].rlim_cur) && ++ !capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE)) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_NPROC_MSG); ++ return -EAGAIN; ++ } ++#endif ++ return 0; ++} ++ ++void ++gr_handle_exec_args(struct linux_binprm *bprm, const char __user *__user *argv) ++{ ++#ifdef CONFIG_GRKERNSEC_EXECLOG ++ char *grarg = gr_exec_arg_buf; ++ unsigned int i, x, execlen = 0; ++ char c; ++ ++ if (!((grsec_enable_execlog && grsec_enable_group && ++ in_group_p(grsec_audit_gid)) ++ || (grsec_enable_execlog && !grsec_enable_group))) ++ return; ++ ++ down(&gr_exec_arg_sem); ++ memset(grarg, 0, sizeof(gr_exec_arg_buf)); ++ ++ if (unlikely(argv == NULL)) ++ goto log; ++ ++ for (i = 0; i < bprm->argc && execlen < 128; i++) { ++ const char __user *p; ++ unsigned int len; ++ ++ if (copy_from_user(&p, argv + i, sizeof(p))) ++ goto log; ++ if (!p) ++ goto log; ++ len = strnlen_user(p, 128 - execlen); ++ if (len > 128 - execlen) ++ len = 128 - execlen; ++ else if (len > 0) ++ len--; ++ if (copy_from_user(grarg + execlen, p, len)) ++ goto log; ++ ++ /* rewrite unprintable characters */ ++ for (x = 0; x < len; x++) { ++ c = *(grarg + execlen + x); ++ if (c < 32 || c > 126) ++ *(grarg + execlen + x) = ' '; ++ } ++ ++ execlen += len; ++ *(grarg + execlen) = ' '; ++ *(grarg + execlen + 1) = '\0'; ++ execlen++; ++ } ++ ++ log: ++ gr_log_fs_str(GR_DO_AUDIT, GR_EXEC_AUDIT_MSG, bprm->file->f_path.dentry, ++ bprm->file->f_path.mnt, grarg); ++ up(&gr_exec_arg_sem); ++#endif ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_fifo.c linux-2.6.33.1/grsecurity/grsec_fifo.c +--- linux-2.6.33.1/grsecurity/grsec_fifo.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_fifo.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,24 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/file.h> ++#include <linux/grinternal.h> ++ ++int ++gr_handle_fifo(const struct dentry *dentry, const struct vfsmount *mnt, ++ const struct dentry *dir, const int flag, const int acc_mode) ++{ ++#ifdef CONFIG_GRKERNSEC_FIFO ++ const struct cred *cred = current_cred(); ++ ++ if (grsec_enable_fifo && S_ISFIFO(dentry->d_inode->i_mode) && ++ !(flag & O_EXCL) && (dir->d_inode->i_mode & S_ISVTX) && ++ (dentry->d_inode->i_uid != dir->d_inode->i_uid) && ++ (cred->fsuid != dentry->d_inode->i_uid)) { ++ if (!generic_permission(dentry->d_inode, acc_mode, NULL)) ++ gr_log_fs_int2(GR_DONT_AUDIT, GR_FIFO_MSG, dentry, mnt, dentry->d_inode->i_uid, dentry->d_inode->i_gid); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_fork.c linux-2.6.33.1/grsecurity/grsec_fork.c +--- linux-2.6.33.1/grsecurity/grsec_fork.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_fork.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,15 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/grsecurity.h> ++#include 
<linux/grinternal.h> ++#include <linux/errno.h> ++ ++void ++gr_log_forkfail(const int retval) ++{ ++#ifdef CONFIG_GRKERNSEC_FORKFAIL ++ if (grsec_enable_forkfail && retval != -ERESTARTNOINTR) ++ gr_log_int(GR_DONT_AUDIT, GR_FAILFORK_MSG, retval); ++#endif ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_init.c linux-2.6.33.1/grsecurity/grsec_init.c +--- linux-2.6.33.1/grsecurity/grsec_init.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_init.c 2010-03-20 17:00:48.140865901 -0400 +@@ -0,0 +1,241 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/smp_lock.h> ++#include <linux/gracl.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/percpu.h> ++ ++int grsec_enable_link; ++int grsec_enable_dmesg; ++int grsec_enable_harden_ptrace; ++int grsec_enable_fifo; ++int grsec_enable_execve; ++int grsec_enable_execlog; ++int grsec_enable_signal; ++int grsec_enable_forkfail; ++int grsec_enable_audit_ptrace; ++int grsec_enable_time; ++int grsec_enable_audit_textrel; ++int grsec_enable_group; ++int grsec_audit_gid; ++int grsec_enable_chdir; ++int grsec_enable_mount; ++int grsec_enable_rofs; ++int grsec_enable_chroot_findtask; ++int grsec_enable_chroot_mount; ++int grsec_enable_chroot_shmat; ++int grsec_enable_chroot_fchdir; ++int grsec_enable_chroot_double; ++int grsec_enable_chroot_pivot; ++int grsec_enable_chroot_chdir; ++int grsec_enable_chroot_chmod; ++int grsec_enable_chroot_mknod; ++int grsec_enable_chroot_nice; ++int grsec_enable_chroot_execlog; ++int grsec_enable_chroot_caps; ++int grsec_enable_chroot_sysctl; ++int grsec_enable_chroot_unix; ++int grsec_enable_tpe; ++int grsec_tpe_gid; ++int grsec_enable_blackhole; ++int grsec_lastack_retries; ++int grsec_enable_tpe_all; ++int grsec_enable_socket_all; ++int grsec_socket_all_gid; ++int grsec_enable_socket_client; ++int grsec_socket_client_gid; ++int grsec_enable_socket_server; ++int grsec_socket_server_gid; ++int grsec_resource_logging; ++int grsec_lock; ++ ++DEFINE_SPINLOCK(grsec_alert_lock); ++unsigned long grsec_alert_wtime = 0; ++unsigned long grsec_alert_fyet = 0; ++ ++DEFINE_SPINLOCK(grsec_audit_lock); ++ ++DEFINE_RWLOCK(grsec_exec_file_lock); ++ ++char *gr_shared_page[4]; ++ ++char *gr_alert_log_fmt; ++char *gr_audit_log_fmt; ++char *gr_alert_log_buf; ++char *gr_audit_log_buf; ++ ++extern struct gr_arg *gr_usermode; ++extern unsigned char *gr_system_salt; ++extern unsigned char *gr_system_sum; ++ ++void __init ++grsecurity_init(void) ++{ ++ int j; ++ /* create the per-cpu shared pages */ ++ ++#ifdef CONFIG_X86 ++ memset((char *)(0x41a + PAGE_OFFSET), 0, 36); ++#endif ++ ++ for (j = 0; j < 4; j++) { ++ gr_shared_page[j] = (char *)__alloc_percpu(PAGE_SIZE, __alignof__(unsigned long long)); ++ if (gr_shared_page[j] == NULL) { ++ panic("Unable to allocate grsecurity shared page"); ++ return; ++ } ++ } ++ ++ /* allocate log buffers */ ++ gr_alert_log_fmt = kmalloc(512, GFP_KERNEL); ++ if (!gr_alert_log_fmt) { ++ panic("Unable to allocate grsecurity alert log format buffer"); ++ return; ++ } ++ gr_audit_log_fmt = kmalloc(512, GFP_KERNEL); ++ if (!gr_audit_log_fmt) { ++ panic("Unable to allocate grsecurity audit log format buffer"); ++ return; ++ } ++ gr_alert_log_buf = (char *) get_zeroed_page(GFP_KERNEL); ++ if (!gr_alert_log_buf) { ++ panic("Unable to allocate grsecurity alert log buffer"); ++ return; ++ } ++ gr_audit_log_buf = (char *) get_zeroed_page(GFP_KERNEL); ++ if (!gr_audit_log_buf) { ++ panic("Unable to allocate grsecurity audit 
log buffer"); ++ return; ++ } ++ ++ /* allocate memory for authentication structure */ ++ gr_usermode = kmalloc(sizeof(struct gr_arg), GFP_KERNEL); ++ gr_system_salt = kmalloc(GR_SALT_LEN, GFP_KERNEL); ++ gr_system_sum = kmalloc(GR_SHA_LEN, GFP_KERNEL); ++ ++ if (!gr_usermode || !gr_system_salt || !gr_system_sum) { ++ panic("Unable to allocate grsecurity authentication structure"); ++ return; ++ } ++ ++#if !defined(CONFIG_GRKERNSEC_SYSCTL) || defined(CONFIG_GRKERNSEC_SYSCTL_ON) ++#ifndef CONFIG_GRKERNSEC_SYSCTL ++ grsec_lock = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_TEXTREL ++ grsec_enable_audit_textrel = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_GROUP ++ grsec_enable_group = 1; ++ grsec_audit_gid = CONFIG_GRKERNSEC_AUDIT_GID; ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_CHDIR ++ grsec_enable_chdir = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE ++ grsec_enable_harden_ptrace = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT ++ grsec_enable_mount = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_LINK ++ grsec_enable_link = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_DMESG ++ grsec_enable_dmesg = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ grsec_enable_blackhole = 1; ++ grsec_lastack_retries = 4; ++#endif ++#ifdef CONFIG_GRKERNSEC_FIFO ++ grsec_enable_fifo = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_EXECVE ++ grsec_enable_execve = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_EXECLOG ++ grsec_enable_execlog = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_SIGNAL ++ grsec_enable_signal = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_FORKFAIL ++ grsec_enable_forkfail = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_TIME ++ grsec_enable_time = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_RESLOG ++ grsec_resource_logging = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_FINDTASK ++ grsec_enable_chroot_findtask = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX ++ grsec_enable_chroot_unix = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_MOUNT ++ grsec_enable_chroot_mount = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_FCHDIR ++ grsec_enable_chroot_fchdir = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_SHMAT ++ grsec_enable_chroot_shmat = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_PTRACE ++ grsec_enable_audit_ptrace = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_DOUBLE ++ grsec_enable_chroot_double = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_PIVOT ++ grsec_enable_chroot_pivot = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_CHDIR ++ grsec_enable_chroot_chdir = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_CHMOD ++ grsec_enable_chroot_chmod = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_MKNOD ++ grsec_enable_chroot_mknod = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_NICE ++ grsec_enable_chroot_nice = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_EXECLOG ++ grsec_enable_chroot_execlog = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS ++ grsec_enable_chroot_caps = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_SYSCTL ++ grsec_enable_chroot_sysctl = 1; ++#endif ++#ifdef CONFIG_GRKERNSEC_TPE ++ grsec_enable_tpe = 1; ++ grsec_tpe_gid = CONFIG_GRKERNSEC_TPE_GID; ++#ifdef CONFIG_GRKERNSEC_TPE_ALL ++ grsec_enable_tpe_all = 1; ++#endif ++#endif ++#ifdef CONFIG_GRKERNSEC_SOCKET_ALL ++ grsec_enable_socket_all = 1; ++ grsec_socket_all_gid = CONFIG_GRKERNSEC_SOCKET_ALL_GID; ++#endif ++#ifdef CONFIG_GRKERNSEC_SOCKET_CLIENT ++ grsec_enable_socket_client = 1; ++ grsec_socket_client_gid = CONFIG_GRKERNSEC_SOCKET_CLIENT_GID; ++#endif ++#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER ++ grsec_enable_socket_server = 1; ++ grsec_socket_server_gid = 
CONFIG_GRKERNSEC_SOCKET_SERVER_GID; ++#endif ++#endif ++ ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_link.c linux-2.6.33.1/grsecurity/grsec_link.c +--- linux-2.6.33.1/grsecurity/grsec_link.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_link.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,43 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/file.h> ++#include <linux/grinternal.h> ++ ++int ++gr_handle_follow_link(const struct inode *parent, ++ const struct inode *inode, ++ const struct dentry *dentry, const struct vfsmount *mnt) ++{ ++#ifdef CONFIG_GRKERNSEC_LINK ++ const struct cred *cred = current_cred(); ++ ++ if (grsec_enable_link && S_ISLNK(inode->i_mode) && ++ (parent->i_mode & S_ISVTX) && (parent->i_uid != inode->i_uid) && ++ (parent->i_mode & S_IWOTH) && (cred->fsuid != inode->i_uid)) { ++ gr_log_fs_int2(GR_DONT_AUDIT, GR_SYMLINK_MSG, dentry, mnt, inode->i_uid, inode->i_gid); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_hardlink(const struct dentry *dentry, ++ const struct vfsmount *mnt, ++ struct inode *inode, const int mode, const char *to) ++{ ++#ifdef CONFIG_GRKERNSEC_LINK ++ const struct cred *cred = current_cred(); ++ ++ if (grsec_enable_link && cred->fsuid != inode->i_uid && ++ (!S_ISREG(mode) || (mode & S_ISUID) || ++ ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) || ++ (generic_permission(inode, MAY_READ | MAY_WRITE, NULL))) && ++ !capable(CAP_FOWNER) && cred->uid) { ++ gr_log_fs_int2_str(GR_DONT_AUDIT, GR_HARDLINK_MSG, dentry, mnt, inode->i_uid, inode->i_gid, to); ++ return -EPERM; ++ } ++#endif ++ return 0; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_log.c linux-2.6.33.1/grsecurity/grsec_log.c +--- linux-2.6.33.1/grsecurity/grsec_log.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_log.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,296 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/file.h> ++#include <linux/tty.h> ++#include <linux/fs.h> ++#include <linux/grinternal.h> ++ ++#define BEGIN_LOCKS(x) \ ++ rcu_read_lock(); \ ++ read_lock(&tasklist_lock); \ ++ read_lock(&grsec_exec_file_lock); \ ++ if (x != GR_DO_AUDIT) \ ++ spin_lock(&grsec_alert_lock); \ ++ else \ ++ spin_lock(&grsec_audit_lock) ++ ++#define END_LOCKS(x) \ ++ if (x != GR_DO_AUDIT) \ ++ spin_unlock(&grsec_alert_lock); \ ++ else \ ++ spin_unlock(&grsec_audit_lock); \ ++ read_unlock(&grsec_exec_file_lock); \ ++ read_unlock(&tasklist_lock); \ ++ rcu_read_unlock(); \ ++ if (x == GR_DONT_AUDIT) \ ++ gr_handle_alertkill(current) ++ ++enum { ++ FLOODING, ++ NO_FLOODING ++}; ++ ++extern char *gr_alert_log_fmt; ++extern char *gr_audit_log_fmt; ++extern char *gr_alert_log_buf; ++extern char *gr_audit_log_buf; ++ ++static int gr_log_start(int audit) ++{ ++ char *loglevel = (audit == GR_DO_AUDIT) ? KERN_INFO : KERN_ALERT; ++ char *fmt = (audit == GR_DO_AUDIT) ? gr_audit_log_fmt : gr_alert_log_fmt; ++ char *buf = (audit == GR_DO_AUDIT) ? 
gr_audit_log_buf : gr_alert_log_buf; ++ ++ if (audit == GR_DO_AUDIT) ++ goto set_fmt; ++ ++ if (!grsec_alert_wtime || jiffies - grsec_alert_wtime > CONFIG_GRKERNSEC_FLOODTIME * HZ) { ++ grsec_alert_wtime = jiffies; ++ grsec_alert_fyet = 0; ++ } else if ((jiffies - grsec_alert_wtime < CONFIG_GRKERNSEC_FLOODTIME * HZ) && (grsec_alert_fyet < CONFIG_GRKERNSEC_FLOODBURST)) { ++ grsec_alert_fyet++; ++ } else if (grsec_alert_fyet == CONFIG_GRKERNSEC_FLOODBURST) { ++ grsec_alert_wtime = jiffies; ++ grsec_alert_fyet++; ++ printk(KERN_ALERT "grsec: more alerts, logging disabled for %d seconds\n", CONFIG_GRKERNSEC_FLOODTIME); ++ return FLOODING; ++ } else return FLOODING; ++ ++set_fmt: ++ memset(buf, 0, PAGE_SIZE); ++ if (current->signal->curr_ip && gr_acl_is_enabled()) { ++ sprintf(fmt, "%s%s", loglevel, "grsec: From %pI4: (%.64s:%c:%.950s) "); ++ snprintf(buf, PAGE_SIZE - 1, fmt, ¤t->signal->curr_ip, current->role->rolename, gr_roletype_to_char(), current->acl->filename); ++ } else if (current->signal->curr_ip) { ++ sprintf(fmt, "%s%s", loglevel, "grsec: From %pI4: "); ++ snprintf(buf, PAGE_SIZE - 1, fmt, ¤t->signal->curr_ip); ++ } else if (gr_acl_is_enabled()) { ++ sprintf(fmt, "%s%s", loglevel, "grsec: (%.64s:%c:%.950s) "); ++ snprintf(buf, PAGE_SIZE - 1, fmt, current->role->rolename, gr_roletype_to_char(), current->acl->filename); ++ } else { ++ sprintf(fmt, "%s%s", loglevel, "grsec: "); ++ strcpy(buf, fmt); ++ } ++ ++ return NO_FLOODING; ++} ++ ++static void gr_log_middle(int audit, const char *msg, va_list ap) ++ __attribute__ ((format (printf, 2, 0))); ++ ++static void gr_log_middle(int audit, const char *msg, va_list ap) ++{ ++ char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf; ++ unsigned int len = strlen(buf); ++ ++ vsnprintf(buf + len, PAGE_SIZE - len - 1, msg, ap); ++ ++ return; ++} ++ ++static void gr_log_middle_varargs(int audit, const char *msg, ...) ++ __attribute__ ((format (printf, 2, 3))); ++ ++static void gr_log_middle_varargs(int audit, const char *msg, ...) ++{ ++ char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf; ++ unsigned int len = strlen(buf); ++ va_list ap; ++ ++ va_start(ap, msg); ++ vsnprintf(buf + len, PAGE_SIZE - len - 1, msg, ap); ++ va_end(ap); ++ ++ return; ++} ++ ++static void gr_log_end(int audit) ++{ ++ char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf; ++ unsigned int len = strlen(buf); ++ ++ snprintf(buf + len, PAGE_SIZE - len - 1, DEFAULTSECMSG, DEFAULTSECARGS(current, current_cred(), __task_cred(current->parent))); ++ printk("%s\n", buf); ++ ++ return; ++} ++ ++void gr_log_varargs(int audit, const char *msg, int argtypes, ...) ++{ ++ int logtype; ++ char *result = (audit == GR_DO_AUDIT) ? 
"successful" : "denied"; ++ char *str1, *str2, *str3; ++ void *voidptr; ++ int num1, num2; ++ unsigned long ulong1, ulong2; ++ struct dentry *dentry; ++ struct vfsmount *mnt; ++ struct file *file; ++ struct task_struct *task; ++ const struct cred *cred, *pcred; ++ va_list ap; ++ ++ BEGIN_LOCKS(audit); ++ logtype = gr_log_start(audit); ++ if (logtype == FLOODING) { ++ END_LOCKS(audit); ++ return; ++ } ++ va_start(ap, argtypes); ++ switch (argtypes) { ++ case GR_TTYSNIFF: ++ task = va_arg(ap, struct task_struct *); ++ gr_log_middle_varargs(audit, msg, &task->signal->curr_ip, gr_task_fullpath0(task), task->comm, task->pid, gr_parent_task_fullpath0(task), task->parent->comm, task->parent->pid); ++ break; ++ case GR_SYSCTL_HIDDEN: ++ str1 = va_arg(ap, char *); ++ gr_log_middle_varargs(audit, msg, result, str1); ++ break; ++ case GR_RBAC: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt)); ++ break; ++ case GR_RBAC_STR: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ str1 = va_arg(ap, char *); ++ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt), str1); ++ break; ++ case GR_STR_RBAC: ++ str1 = va_arg(ap, char *); ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ gr_log_middle_varargs(audit, msg, result, str1, gr_to_filename(dentry, mnt)); ++ break; ++ case GR_RBAC_MODE2: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ str1 = va_arg(ap, char *); ++ str2 = va_arg(ap, char *); ++ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt), str1, str2); ++ break; ++ case GR_RBAC_MODE3: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ str1 = va_arg(ap, char *); ++ str2 = va_arg(ap, char *); ++ str3 = va_arg(ap, char *); ++ gr_log_middle_varargs(audit, msg, result, gr_to_filename(dentry, mnt), str1, str2, str3); ++ break; ++ case GR_FILENAME: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt)); ++ break; ++ case GR_STR_FILENAME: ++ str1 = va_arg(ap, char *); ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ gr_log_middle_varargs(audit, msg, str1, gr_to_filename(dentry, mnt)); ++ break; ++ case GR_FILENAME_STR: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ str1 = va_arg(ap, char *); ++ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt), str1); ++ break; ++ case GR_FILENAME_TWO_INT: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ num1 = va_arg(ap, int); ++ num2 = va_arg(ap, int); ++ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt), num1, num2); ++ break; ++ case GR_FILENAME_TWO_INT_STR: ++ dentry = va_arg(ap, struct dentry *); ++ mnt = va_arg(ap, struct vfsmount *); ++ num1 = va_arg(ap, int); ++ num2 = va_arg(ap, int); ++ str1 = va_arg(ap, char *); ++ gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt), num1, num2, str1); ++ break; ++ case GR_TEXTREL: ++ file = va_arg(ap, struct file *); ++ ulong1 = va_arg(ap, unsigned long); ++ ulong2 = va_arg(ap, unsigned long); ++ gr_log_middle_varargs(audit, msg, file ? 
gr_to_filename(file->f_path.dentry, file->f_path.mnt) : "<anonymous mapping>", ulong1, ulong2); ++ break; ++ case GR_PTRACE: ++ task = va_arg(ap, struct task_struct *); ++ gr_log_middle_varargs(audit, msg, task->exec_file ? gr_to_filename(task->exec_file->f_path.dentry, task->exec_file->f_path.mnt) : "(none)", task->comm, task->pid); ++ break; ++ case GR_RESOURCE: ++ task = va_arg(ap, struct task_struct *); ++ cred = __task_cred(task); ++ pcred = __task_cred(task->parent); ++ ulong1 = va_arg(ap, unsigned long); ++ str1 = va_arg(ap, char *); ++ ulong2 = va_arg(ap, unsigned long); ++ gr_log_middle_varargs(audit, msg, ulong1, str1, ulong2, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid); ++ break; ++ case GR_CAP: ++ task = va_arg(ap, struct task_struct *); ++ cred = __task_cred(task); ++ pcred = __task_cred(task->parent); ++ str1 = va_arg(ap, char *); ++ gr_log_middle_varargs(audit, msg, str1, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid); ++ break; ++ case GR_SIG: ++ str1 = va_arg(ap, char *); ++ voidptr = va_arg(ap, void *); ++ gr_log_middle_varargs(audit, msg, str1, voidptr); ++ break; ++ case GR_SIG2: ++ task = va_arg(ap, struct task_struct *); ++ cred = __task_cred(task); ++ pcred = __task_cred(task->parent); ++ num1 = va_arg(ap, int); ++ gr_log_middle_varargs(audit, msg, num1, gr_task_fullpath0(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath0(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid); ++ break; ++ case GR_CRASH1: ++ task = va_arg(ap, struct task_struct *); ++ cred = __task_cred(task); ++ pcred = __task_cred(task->parent); ++ ulong1 = va_arg(ap, unsigned long); ++ gr_log_middle_varargs(audit, msg, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid, cred->uid, ulong1); ++ break; ++ case GR_CRASH2: ++ task = va_arg(ap, struct task_struct *); ++ cred = __task_cred(task); ++ pcred = __task_cred(task->parent); ++ ulong1 = va_arg(ap, unsigned long); ++ gr_log_middle_varargs(audit, msg, gr_task_fullpath(task), task->comm, task->pid, cred->uid, cred->euid, cred->gid, cred->egid, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, pcred->uid, pcred->euid, pcred->gid, pcred->egid, ulong1); ++ break; ++ case GR_PSACCT: ++ { ++ unsigned int wday, cday; ++ __u8 whr, chr; ++ __u8 wmin, cmin; ++ __u8 wsec, csec; ++ char cur_tty[64] = { 0 }; ++ char parent_tty[64] = { 0 }; ++ ++ task = va_arg(ap, struct task_struct *); ++ wday = va_arg(ap, unsigned int); ++ cday = va_arg(ap, unsigned int); ++ whr = va_arg(ap, int); ++ chr = va_arg(ap, int); ++ wmin = va_arg(ap, int); ++ cmin = va_arg(ap, int); ++ wsec = va_arg(ap, int); ++ csec = va_arg(ap, int); ++ ulong1 = va_arg(ap, unsigned long); ++ cred = __task_cred(task); ++ pcred = __task_cred(task->parent); ++ ++ gr_log_middle_varargs(audit, msg, gr_task_fullpath(task), task->comm, task->pid, &task->signal->curr_ip, tty_name(task->signal->tty, cur_tty), cred->uid, cred->euid, cred->gid, cred->egid, wday, whr, wmin, wsec, cday, chr, cmin, csec, 
(task->flags & PF_SIGNALED) ? "killed by signal" : "exited", ulong1, gr_parent_task_fullpath(task), task->parent->comm, task->parent->pid, &task->parent->signal->curr_ip, tty_name(task->parent->signal->tty, parent_tty), pcred->uid, pcred->euid, pcred->gid, pcred->egid); ++ } ++ break; ++ default: ++ gr_log_middle(audit, msg, ap); ++ } ++ va_end(ap); ++ gr_log_end(audit); ++ END_LOCKS(audit); ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_mem.c linux-2.6.33.1/grsecurity/grsec_mem.c +--- linux-2.6.33.1/grsecurity/grsec_mem.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_mem.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,85 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/grinternal.h> ++ ++void ++gr_handle_ioperm(void) ++{ ++ gr_log_noargs(GR_DONT_AUDIT, GR_IOPERM_MSG); ++ return; ++} ++ ++void ++gr_handle_iopl(void) ++{ ++ gr_log_noargs(GR_DONT_AUDIT, GR_IOPL_MSG); ++ return; ++} ++ ++void ++gr_handle_mem_write(void) ++{ ++ gr_log_noargs(GR_DONT_AUDIT, GR_MEM_WRITE_MSG); ++ return; ++} ++ ++void ++gr_handle_kmem_write(void) ++{ ++ gr_log_noargs(GR_DONT_AUDIT, GR_KMEM_MSG); ++ return; ++} ++ ++void ++gr_handle_open_port(void) ++{ ++ gr_log_noargs(GR_DONT_AUDIT, GR_PORT_OPEN_MSG); ++ return; ++} ++ ++int ++gr_handle_mem_mmap(const unsigned long offset, struct vm_area_struct *vma) ++{ ++ unsigned long start, end; ++ ++ start = offset; ++ end = start + vma->vm_end - vma->vm_start; ++ ++ if (start > end) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_MEM_MMAP_MSG); ++ return -EPERM; ++ } ++ ++ /* allowed ranges : ISA I/O BIOS */ ++ if ((start >= __pa(high_memory)) ++#if defined(CONFIG_X86) || defined(CONFIG_PPC) ++ || (start >= 0x000a0000 && end <= 0x00100000) ++ || (start >= 0x00000000 && end <= 0x00001000) ++#endif ++ ) ++ return 0; ++ ++ if (vma->vm_flags & VM_WRITE) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_MEM_MMAP_MSG); ++ return -EPERM; ++ } else ++ vma->vm_flags &= ~VM_MAYWRITE; ++ ++ return 0; ++} ++ ++void ++gr_log_nonroot_mod_load(const char *modname) ++{ ++ gr_log_str(GR_DONT_AUDIT, GR_NONROOT_MODLOAD_MSG, modname); ++ return; ++} ++ ++void ++gr_handle_vm86(void) ++{ ++ gr_log_noargs(GR_DONT_AUDIT, GR_VM86_MSG); ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_mount.c linux-2.6.33.1/grsecurity/grsec_mount.c +--- linux-2.6.33.1/grsecurity/grsec_mount.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_mount.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,62 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/mount.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++void ++gr_log_remount(const char *devname, const int retval) ++{ ++#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT ++ if (grsec_enable_mount && (retval >= 0)) ++ gr_log_str(GR_DO_AUDIT, GR_REMOUNT_AUDIT_MSG, devname ? devname : "none"); ++#endif ++ return; ++} ++ ++void ++gr_log_unmount(const char *devname, const int retval) ++{ ++#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT ++ if (grsec_enable_mount && (retval >= 0)) ++ gr_log_str(GR_DO_AUDIT, GR_UNMOUNT_AUDIT_MSG, devname ? 
devname : "none"); ++#endif ++ return; ++} ++ ++void ++gr_log_mount(const char *from, const char *to, const int retval) ++{ ++#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT ++ if (grsec_enable_mount && (retval >= 0)) ++ gr_log_str_str(GR_DO_AUDIT, GR_MOUNT_AUDIT_MSG, from, to); ++#endif ++ return; ++} ++ ++int ++gr_handle_rofs_mount(struct dentry *dentry, struct vfsmount *mnt, int mnt_flags) ++{ ++#ifdef CONFIG_GRKERNSEC_ROFS ++ if (grsec_enable_rofs && !(mnt_flags & MNT_READONLY)) { ++ gr_log_fs_generic(GR_DO_AUDIT, GR_ROFS_MOUNT_MSG, dentry, mnt); ++ return -EPERM; ++ } else ++ return 0; ++#endif ++ return 0; ++} ++ ++int ++gr_handle_rofs_blockwrite(struct dentry *dentry, struct vfsmount *mnt, int acc_mode) ++{ ++#ifdef CONFIG_GRKERNSEC_ROFS ++ if (grsec_enable_rofs && (acc_mode & MAY_WRITE) && ++ dentry->d_inode && S_ISBLK(dentry->d_inode->i_mode)) { ++ gr_log_fs_generic(GR_DO_AUDIT, GR_ROFS_BLOCKWRITE_MSG, dentry, mnt); ++ return -EPERM; ++ } else ++ return 0; ++#endif ++ return 0; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_ptrace.c linux-2.6.33.1/grsecurity/grsec_ptrace.c +--- linux-2.6.33.1/grsecurity/grsec_ptrace.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_ptrace.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,14 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/grinternal.h> ++#include <linux/grsecurity.h> ++ ++void ++gr_audit_ptrace(struct task_struct *task) ++{ ++#ifdef CONFIG_GRKERNSEC_AUDIT_PTRACE ++ if (grsec_enable_audit_ptrace) ++ gr_log_ptrace(GR_DO_AUDIT, GR_PTRACE_AUDIT_MSG, task); ++#endif ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_sig.c linux-2.6.33.1/grsecurity/grsec_sig.c +--- linux-2.6.33.1/grsecurity/grsec_sig.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_sig.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,65 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/delay.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++char *signames[] = { ++ [SIGSEGV] = "Segmentation fault", ++ [SIGILL] = "Illegal instruction", ++ [SIGABRT] = "Abort", ++ [SIGBUS] = "Invalid alignment/Bus error" ++}; ++ ++void ++gr_log_signal(const int sig, const void *addr, const struct task_struct *t) ++{ ++#ifdef CONFIG_GRKERNSEC_SIGNAL ++ if (grsec_enable_signal && ((sig == SIGSEGV) || (sig == SIGILL) || ++ (sig == SIGABRT) || (sig == SIGBUS))) { ++ if (t->pid == current->pid) { ++ gr_log_sig_addr(GR_DONT_AUDIT_GOOD, GR_UNISIGLOG_MSG, signames[sig], addr); ++ } else { ++ gr_log_sig_task(GR_DONT_AUDIT_GOOD, GR_DUALSIGLOG_MSG, t, sig); ++ } ++ } ++#endif ++ return; ++} ++ ++int ++gr_handle_signal(const struct task_struct *p, const int sig) ++{ ++#ifdef CONFIG_GRKERNSEC ++ if (current->pid > 1 && gr_check_protected_task(p)) { ++ gr_log_sig_task(GR_DONT_AUDIT, GR_SIG_ACL_MSG, p, sig); ++ return -EPERM; ++ } else if (gr_pid_is_chrooted((struct task_struct *)p)) { ++ return -EPERM; ++ } ++#endif ++ return 0; ++} ++ ++void gr_handle_brute_attach(struct task_struct *p) ++{ ++#ifdef CONFIG_GRKERNSEC_BRUTE ++ read_lock(&tasklist_lock); ++ read_lock(&grsec_exec_file_lock); ++ if (p->parent && p->parent->exec_file == p->exec_file) ++ p->parent->brute = 1; ++ read_unlock(&grsec_exec_file_lock); ++ read_unlock(&tasklist_lock); ++#endif ++ return; ++} ++ ++void gr_handle_brute_check(void) ++{ ++#ifdef CONFIG_GRKERNSEC_BRUTE ++ if (current->brute) ++ msleep(30 * 1000); ++#endif ++ return; ++} ++ +diff -urNp linux-2.6.33.1/grsecurity/grsec_sock.c 
linux-2.6.33.1/grsecurity/grsec_sock.c +--- linux-2.6.33.1/grsecurity/grsec_sock.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_sock.c 2010-03-20 16:58:41.892922620 -0400 +@@ -0,0 +1,271 @@ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/file.h> ++#include <linux/net.h> ++#include <linux/in.h> ++#include <linux/ip.h> ++#include <net/sock.h> ++#include <net/inet_sock.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++#include <linux/gracl.h> ++ ++kernel_cap_t gr_cap_rtnetlink(struct sock *sock); ++EXPORT_SYMBOL(gr_cap_rtnetlink); ++ ++extern int gr_search_udp_recvmsg(const struct sock *sk, const struct sk_buff *skb); ++extern int gr_search_udp_sendmsg(const struct sock *sk, const struct sockaddr_in *addr); ++ ++EXPORT_SYMBOL(gr_search_udp_recvmsg); ++EXPORT_SYMBOL(gr_search_udp_sendmsg); ++ ++#ifdef CONFIG_UNIX_MODULE ++EXPORT_SYMBOL(gr_acl_handle_unix); ++EXPORT_SYMBOL(gr_acl_handle_mknod); ++EXPORT_SYMBOL(gr_handle_chroot_unix); ++EXPORT_SYMBOL(gr_handle_create); ++#endif ++ ++#ifdef CONFIG_GRKERNSEC ++#define gr_conn_table_size 32749 ++struct conn_table_entry { ++ struct conn_table_entry *next; ++ struct signal_struct *sig; ++}; ++ ++struct conn_table_entry *gr_conn_table[gr_conn_table_size]; ++DEFINE_SPINLOCK(gr_conn_table_lock); ++ ++extern const char * gr_socktype_to_name(unsigned char type); ++extern const char * gr_proto_to_name(unsigned char proto); ++ ++static __inline__ int ++conn_hash(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport, unsigned int size) ++{ ++ return ((daddr + saddr + (sport << 8) + (dport << 16)) % size); ++} ++ ++static __inline__ int ++conn_match(const struct signal_struct *sig, __u32 saddr, __u32 daddr, ++ __u16 sport, __u16 dport) ++{ ++ if (unlikely(sig->gr_saddr == saddr && sig->gr_daddr == daddr && ++ sig->gr_sport == sport && sig->gr_dport == dport)) ++ return 1; ++ else ++ return 0; ++} ++ ++static void gr_add_to_task_ip_table_nolock(struct signal_struct *sig, struct conn_table_entry *newent) ++{ ++ struct conn_table_entry **match; ++ unsigned int index; ++ ++ index = conn_hash(sig->gr_saddr, sig->gr_daddr, ++ sig->gr_sport, sig->gr_dport, ++ gr_conn_table_size); ++ ++ newent->sig = sig; ++ ++ match = &gr_conn_table[index]; ++ newent->next = *match; ++ *match = newent; ++ ++ return; ++} ++ ++static void gr_del_task_from_ip_table_nolock(struct signal_struct *sig) ++{ ++ struct conn_table_entry *match, *last = NULL; ++ unsigned int index; ++ ++ index = conn_hash(sig->gr_saddr, sig->gr_daddr, ++ sig->gr_sport, sig->gr_dport, ++ gr_conn_table_size); ++ ++ match = gr_conn_table[index]; ++ while (match && !conn_match(match->sig, ++ sig->gr_saddr, sig->gr_daddr, sig->gr_sport, ++ sig->gr_dport)) { ++ last = match; ++ match = match->next; ++ } ++ ++ if (match) { ++ if (last) ++ last->next = match->next; ++ else ++ gr_conn_table[index] = NULL; ++ kfree(match); ++ } ++ ++ return; ++} ++ ++static struct signal_struct * gr_lookup_task_ip_table(__u32 saddr, __u32 daddr, ++ __u16 sport, __u16 dport) ++{ ++ struct conn_table_entry *match; ++ unsigned int index; ++ ++ index = conn_hash(saddr, daddr, sport, dport, gr_conn_table_size); ++ ++ match = gr_conn_table[index]; ++ while (match && !conn_match(match->sig, saddr, daddr, sport, dport)) ++ match = match->next; ++ ++ if (match) ++ return match->sig; ++ else ++ return NULL; ++} ++ ++#endif ++ ++void gr_update_task_in_ip_table(struct task_struct *task, const struct inet_sock *inet) ++{ ++#ifdef CONFIG_GRKERNSEC ++ 
struct signal_struct *sig = task->signal; ++ struct conn_table_entry *newent; ++ ++ newent = kmalloc(sizeof(struct conn_table_entry), GFP_ATOMIC); ++ if (newent == NULL) ++ return; ++ /* no bh lock needed since we are called with bh disabled */ ++ spin_lock(&gr_conn_table_lock); ++ gr_del_task_from_ip_table_nolock(sig); ++ sig->gr_saddr = inet->inet_rcv_saddr; ++ sig->gr_daddr = inet->inet_daddr; ++ sig->gr_sport = inet->inet_sport; ++ sig->gr_dport = inet->inet_dport; ++ gr_add_to_task_ip_table_nolock(sig, newent); ++ spin_unlock(&gr_conn_table_lock); ++#endif ++ return; ++} ++ ++void gr_del_task_from_ip_table(struct task_struct *task) ++{ ++#ifdef CONFIG_GRKERNSEC ++ spin_lock_bh(&gr_conn_table_lock); ++ gr_del_task_from_ip_table_nolock(task->signal); ++ spin_unlock_bh(&gr_conn_table_lock); ++#endif ++ return; ++} ++ ++void ++gr_attach_curr_ip(const struct sock *sk) ++{ ++#ifdef CONFIG_GRKERNSEC ++ struct signal_struct *p, *set; ++ const struct inet_sock *inet = inet_sk(sk); ++ ++ if (unlikely(sk->sk_protocol != IPPROTO_TCP)) ++ return; ++ ++ set = current->signal; ++ ++ spin_lock_bh(&gr_conn_table_lock); ++ p = gr_lookup_task_ip_table(inet->inet_daddr, inet->inet_rcv_saddr, ++ inet->inet_dport, inet->inet_sport); ++ if (unlikely(p != NULL)) { ++ set->curr_ip = p->curr_ip; ++ set->used_accept = 1; ++ gr_del_task_from_ip_table_nolock(p); ++ spin_unlock_bh(&gr_conn_table_lock); ++ return; ++ } ++ spin_unlock_bh(&gr_conn_table_lock); ++ ++ set->curr_ip = inet->inet_daddr; ++ set->used_accept = 1; ++#endif ++ return; ++} ++ ++int ++gr_handle_sock_all(const int family, const int type, const int protocol) ++{ ++#ifdef CONFIG_GRKERNSEC_SOCKET_ALL ++ if (grsec_enable_socket_all && in_group_p(grsec_socket_all_gid) && ++ (family != AF_UNIX) && (family != AF_LOCAL)) { ++ gr_log_int_str2(GR_DONT_AUDIT, GR_SOCK2_MSG, family, gr_socktype_to_name(type), gr_proto_to_name(protocol)); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_sock_server(const struct sockaddr *sck) ++{ ++#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER ++ if (grsec_enable_socket_server && ++ in_group_p(grsec_socket_server_gid) && ++ sck && (sck->sa_family != AF_UNIX) && ++ (sck->sa_family != AF_LOCAL)) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_BIND_MSG); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_sock_server_other(const struct sock *sck) ++{ ++#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER ++ if (grsec_enable_socket_server && ++ in_group_p(grsec_socket_server_gid) && ++ sck && (sck->sk_family != AF_UNIX) && ++ (sck->sk_family != AF_LOCAL)) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_BIND_MSG); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} ++ ++int ++gr_handle_sock_client(const struct sockaddr *sck) ++{ ++#ifdef CONFIG_GRKERNSEC_SOCKET_CLIENT ++ if (grsec_enable_socket_client && in_group_p(grsec_socket_client_gid) && ++ sck && (sck->sa_family != AF_UNIX) && ++ (sck->sa_family != AF_LOCAL)) { ++ gr_log_noargs(GR_DONT_AUDIT, GR_CONNECT_MSG); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} ++ ++kernel_cap_t ++gr_cap_rtnetlink(struct sock *sock) ++{ ++#ifdef CONFIG_GRKERNSEC ++ if (!gr_acl_is_enabled()) ++ return current_cap(); ++ else if (sock->sk_protocol == NETLINK_ISCSI && ++ cap_raised(current_cap(), CAP_SYS_ADMIN) && ++ gr_is_capable(CAP_SYS_ADMIN)) ++ return current_cap(); ++ else if (sock->sk_protocol == NETLINK_AUDIT && ++ cap_raised(current_cap(), CAP_AUDIT_WRITE) && ++ gr_is_capable(CAP_AUDIT_WRITE) && ++ cap_raised(current_cap(), CAP_AUDIT_CONTROL) && ++ gr_is_capable(CAP_AUDIT_CONTROL)) ++ return 
current_cap(); ++ else if (cap_raised(current_cap(), CAP_NET_ADMIN) && ++ ((sock->sk_protocol == NETLINK_ROUTE) ? ++ gr_is_capable_nolog(CAP_NET_ADMIN) : ++ gr_is_capable(CAP_NET_ADMIN))) ++ return current_cap(); ++ else ++ return __cap_empty_set; ++#else ++ return current_cap(); ++#endif ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_sysctl.c linux-2.6.33.1/grsecurity/grsec_sysctl.c +--- linux-2.6.33.1/grsecurity/grsec_sysctl.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_sysctl.c 2010-03-20 17:08:11.436987044 -0400 +@@ -0,0 +1,404 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/sysctl.h> ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++int ++gr_handle_sysctl_mod(const char *dirname, const char *name, const int op) ++{ ++#ifdef CONFIG_GRKERNSEC_SYSCTL ++ if (!strcmp(dirname, "grsecurity") && grsec_lock && (op & MAY_WRITE)) { ++ gr_log_str(GR_DONT_AUDIT, GR_SYSCTL_MSG, name); ++ return -EACCES; ++ } ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_GRKERNSEC_ROFS ++static int __maybe_unused one = 1; ++#endif ++ ++#if defined(CONFIG_GRKERNSEC_SYSCTL) || defined(CONFIG_GRKERNSEC_ROFS) ++ctl_table grsecurity_table[] = { ++#ifdef CONFIG_GRKERNSEC_SYSCTL ++#ifdef CONFIG_GRKERNSEC_LINK ++ { ++ .procname = "linking_restrictions", ++ .data = &grsec_enable_link, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_FIFO ++ { ++ .procname = "fifo_restrictions", ++ .data = &grsec_enable_fifo, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_EXECVE ++ { ++ .procname = "execve_limiting", ++ .data = &grsec_enable_execve, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ { ++ .procname = "ip_blackhole", ++ .data = &grsec_enable_blackhole, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .procname = "lastack_retries", ++ .data = &grsec_lastack_retries, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_EXECLOG ++ { ++ .procname = "exec_logging", ++ .data = &grsec_enable_execlog, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_SIGNAL ++ { ++ .procname = "signal_logging", ++ .data = &grsec_enable_signal, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_FORKFAIL ++ { ++ .procname = "forkfail_logging", ++ .data = &grsec_enable_forkfail, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_TIME ++ { ++ .procname = "timechange_logging", ++ .data = &grsec_enable_time, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_SHMAT ++ { ++ .procname = "chroot_deny_shmat", ++ .data = &grsec_enable_chroot_shmat, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX ++ { ++ .procname = "chroot_deny_unix", ++ .data = &grsec_enable_chroot_unix, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_MOUNT ++ { ++ .procname = "chroot_deny_mount", ++ .data = &grsec_enable_chroot_mount, ++ .maxlen = sizeof(int), ++ 
.mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_FCHDIR ++ { ++ .procname = "chroot_deny_fchdir", ++ .data = &grsec_enable_chroot_fchdir, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_DOUBLE ++ { ++ .procname = "chroot_deny_chroot", ++ .data = &grsec_enable_chroot_double, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_PIVOT ++ { ++ .procname = "chroot_deny_pivot", ++ .data = &grsec_enable_chroot_pivot, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_CHDIR ++ { ++ .procname = "chroot_enforce_chdir", ++ .data = &grsec_enable_chroot_chdir, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_CHMOD ++ { ++ .procname = "chroot_deny_chmod", ++ .data = &grsec_enable_chroot_chmod, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_MKNOD ++ { ++ .procname = "chroot_deny_mknod", ++ .data = &grsec_enable_chroot_mknod, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_NICE ++ { ++ .procname = "chroot_restrict_nice", ++ .data = &grsec_enable_chroot_nice, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_EXECLOG ++ { ++ .procname = "chroot_execlog", ++ .data = &grsec_enable_chroot_execlog, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_CAPS ++ { ++ .procname = "chroot_caps", ++ .data = &grsec_enable_chroot_caps, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_SYSCTL ++ { ++ .procname = "chroot_deny_sysctl", ++ .data = &grsec_enable_chroot_sysctl, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_TPE ++ { ++ .procname = "tpe", ++ .data = &grsec_enable_tpe, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .procname = "tpe_gid", ++ .data = &grsec_tpe_gid, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_TPE_ALL ++ { ++ .procname = "tpe_restrict_all", ++ .data = &grsec_enable_tpe_all, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_SOCKET_ALL ++ { ++ .procname = "socket_all", ++ .data = &grsec_enable_socket_all, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .procname = "socket_all_gid", ++ .data = &grsec_socket_all_gid, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_SOCKET_CLIENT ++ { ++ .procname = "socket_client", ++ .data = &grsec_enable_socket_client, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .procname = "socket_client_gid", ++ .data = &grsec_socket_client_gid, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_SOCKET_SERVER ++ { ++ .procname = "socket_server", ++ .data = 
&grsec_enable_socket_server, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .procname = "socket_server_gid", ++ .data = &grsec_socket_server_gid, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_GROUP ++ { ++ .procname = "audit_group", ++ .data = &grsec_enable_group, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .procname = "audit_gid", ++ .data = &grsec_audit_gid, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_CHDIR ++ { ++ .procname = "audit_chdir", ++ .data = &grsec_enable_chdir, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_MOUNT ++ { ++ .procname = "audit_mount", ++ .data = &grsec_enable_mount, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_TEXTREL ++ { ++ .procname = "audit_textrel", ++ .data = &grsec_enable_audit_textrel, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_DMESG ++ { ++ .procname = "dmesg", ++ .data = &grsec_enable_dmesg, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_CHROOT_FINDTASK ++ { ++ .procname = "chroot_findtask", ++ .data = &grsec_enable_chroot_findtask, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_RESLOG ++ { ++ .procname = "resource_logging", ++ .data = &grsec_resource_logging, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_AUDIT_PTRACE ++ { ++ .procname = "audit_ptrace", ++ .data = &grsec_enable_audit_ptrace, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_HARDEN_PTRACE ++ { ++ .procname = "harden_ptrace", ++ .data = &grsec_enable_harden_ptrace, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++ { ++ .procname = "grsec_lock", ++ .data = &grsec_lock, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_GRKERNSEC_ROFS ++ { ++ .procname = "romount_protect", ++ .data = &grsec_enable_rofs, ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one, ++ }, ++#endif ++ { } ++}; ++#endif +diff -urNp linux-2.6.33.1/grsecurity/grsec_textrel.c linux-2.6.33.1/grsecurity/grsec_textrel.c +--- linux-2.6.33.1/grsecurity/grsec_textrel.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_textrel.c 2010-03-20 16:58:41.900965882 -0400 +@@ -0,0 +1,16 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/file.h> ++#include <linux/grinternal.h> ++#include <linux/grsecurity.h> ++ ++void ++gr_log_textrel(struct vm_area_struct * vma) ++{ ++#ifdef CONFIG_GRKERNSEC_AUDIT_TEXTREL ++ if (grsec_enable_audit_textrel) ++ gr_log_textrel_ulong_ulong(GR_DO_AUDIT, GR_TEXTREL_AUDIT_MSG, vma->vm_file, vma->vm_start, vma->vm_pgoff); ++#endif ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_time.c linux-2.6.33.1/grsecurity/grsec_time.c +--- linux-2.6.33.1/grsecurity/grsec_time.c 1969-12-31 19:00:00.000000000 -0500 
++++ linux-2.6.33.1/grsecurity/grsec_time.c 2010-03-20 16:58:41.900965882 -0400 +@@ -0,0 +1,13 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/grinternal.h> ++ ++void ++gr_log_timechange(void) ++{ ++#ifdef CONFIG_GRKERNSEC_TIME ++ if (grsec_enable_time) ++ gr_log_noargs(GR_DONT_AUDIT_GOOD, GR_TIME_MSG); ++#endif ++ return; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsec_tpe.c linux-2.6.33.1/grsecurity/grsec_tpe.c +--- linux-2.6.33.1/grsecurity/grsec_tpe.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsec_tpe.c 2010-03-20 16:58:41.900965882 -0400 +@@ -0,0 +1,38 @@ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/file.h> ++#include <linux/fs.h> ++#include <linux/grinternal.h> ++ ++extern int gr_acl_tpe_check(void); ++ ++int ++gr_tpe_allow(const struct file *file) ++{ ++#ifdef CONFIG_GRKERNSEC ++ struct inode *inode = file->f_path.dentry->d_parent->d_inode; ++ const struct cred *cred = current_cred(); ++ ++ if (cred->uid && ((grsec_enable_tpe && ++#ifdef CONFIG_GRKERNSEC_TPE_INVERT ++ !in_group_p(grsec_tpe_gid) ++#else ++ in_group_p(grsec_tpe_gid) ++#endif ++ ) || gr_acl_tpe_check()) && ++ (inode->i_uid || (!inode->i_uid && ((inode->i_mode & S_IWGRP) || ++ (inode->i_mode & S_IWOTH))))) { ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_EXEC_TPE_MSG, file->f_path.dentry, file->f_path.mnt); ++ return 0; ++ } ++#ifdef CONFIG_GRKERNSEC_TPE_ALL ++ if (cred->uid && grsec_enable_tpe && grsec_enable_tpe_all && ++ ((inode->i_uid && (inode->i_uid != cred->uid)) || ++ (inode->i_mode & S_IWGRP) || (inode->i_mode & S_IWOTH))) { ++ gr_log_fs_generic(GR_DONT_AUDIT, GR_EXEC_TPE_MSG, file->f_path.dentry, file->f_path.mnt); ++ return 0; ++ } ++#endif ++#endif ++ return 1; ++} +diff -urNp linux-2.6.33.1/grsecurity/grsum.c linux-2.6.33.1/grsecurity/grsum.c +--- linux-2.6.33.1/grsecurity/grsum.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/grsum.c 2010-03-20 16:58:41.900965882 -0400 +@@ -0,0 +1,59 @@ ++#include <linux/err.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/scatterlist.h> ++#include <linux/crypto.h> ++#include <linux/gracl.h> ++ ++ ++#if !defined(CONFIG_CRYPTO) || defined(CONFIG_CRYPTO_MODULE) || !defined(CONFIG_CRYPTO_SHA256) || defined(CONFIG_CRYPTO_SHA256_MODULE) ++#error "crypto and sha256 must be built into the kernel" ++#endif ++ ++int ++chkpw(struct gr_arg *entry, unsigned char *salt, unsigned char *sum) ++{ ++ char *p; ++ struct crypto_hash *tfm; ++ struct hash_desc desc; ++ struct scatterlist sg; ++ unsigned char temp_sum[GR_SHA_LEN]; ++ volatile int retval = 0; ++ volatile int dummy = 0; ++ unsigned int i; ++ ++ tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(tfm)) { ++ /* should never happen, since sha256 should be built in */ ++ return 1; ++ } ++ ++ desc.tfm = tfm; ++ desc.flags = 0; ++ ++ crypto_hash_init(&desc); ++ ++ p = salt; ++ sg_set_buf(&sg, p, GR_SALT_LEN); ++ crypto_hash_update(&desc, &sg, sg.length); ++ ++ p = entry->pw; ++ sg_set_buf(&sg, p, strlen(p)); ++ ++ crypto_hash_update(&desc, &sg, sg.length); ++ ++ crypto_hash_final(&desc, temp_sum); ++ ++ memset(entry->pw, 0, GR_PW_LEN); ++ ++ for (i = 0; i < GR_SHA_LEN; i++) ++ if (sum[i] != temp_sum[i]) ++ retval = 1; ++ else ++ dummy = 1; // waste a cycle ++ ++ crypto_free_hash(tfm); ++ ++ return retval; ++} +diff -urNp linux-2.6.33.1/grsecurity/Kconfig linux-2.6.33.1/grsecurity/Kconfig +--- linux-2.6.33.1/grsecurity/Kconfig 1969-12-31 19:00:00.000000000 -0500 ++++ 
linux-2.6.33.1/grsecurity/Kconfig 2010-03-20 17:00:48.140865901 -0400 +@@ -0,0 +1,965 @@ ++# ++# grsecurity configuration ++# ++ ++menu "Grsecurity" ++ ++config GRKERNSEC ++ bool "Grsecurity" ++ select CRYPTO ++ select CRYPTO_SHA256 ++ help ++ If you say Y here, you will be able to configure many features ++ that will enhance the security of your system. It is highly ++ recommended that you say Y here and read through the help ++ for each option so that you fully understand the features and ++ can evaluate their usefulness for your machine. ++ ++choice ++ prompt "Security Level" ++ depends on GRKERNSEC ++ default GRKERNSEC_CUSTOM ++ ++config GRKERNSEC_LOW ++ bool "Low" ++ select GRKERNSEC_LINK ++ select GRKERNSEC_FIFO ++ select GRKERNSEC_EXECVE ++ select GRKERNSEC_RANDNET ++ select GRKERNSEC_DMESG ++ select GRKERNSEC_CHROOT ++ select GRKERNSEC_CHROOT_CHDIR ++ ++ help ++ If you choose this option, several of the grsecurity options will ++ be enabled that will give you greater protection against a number ++ of attacks, while assuring that none of your software will have any ++ conflicts with the additional security measures. If you run a lot ++ of unusual software, or you are having problems with the higher ++ security levels, you should say Y here. With this option, the ++ following features are enabled: ++ ++ - Linking restrictions ++ - FIFO restrictions ++ - Enforcing RLIMIT_NPROC on execve ++ - Restricted dmesg ++ - Enforced chdir("/") on chroot ++ - Runtime module disabling ++ ++config GRKERNSEC_MEDIUM ++ bool "Medium" ++ select PAX ++ select PAX_EI_PAX ++ select PAX_PT_PAX_FLAGS ++ select PAX_HAVE_ACL_FLAGS ++ select GRKERNSEC_PROC_MEMMAP if (PAX_NOEXEC || PAX_ASLR) ++ select GRKERNSEC_CHROOT ++ select GRKERNSEC_CHROOT_SYSCTL ++ select GRKERNSEC_LINK ++ select GRKERNSEC_FIFO ++ select GRKERNSEC_EXECVE ++ select GRKERNSEC_DMESG ++ select GRKERNSEC_RANDNET ++ select GRKERNSEC_FORKFAIL ++ select GRKERNSEC_TIME ++ select GRKERNSEC_SIGNAL ++ select GRKERNSEC_CHROOT ++ select GRKERNSEC_CHROOT_UNIX ++ select GRKERNSEC_CHROOT_MOUNT ++ select GRKERNSEC_CHROOT_PIVOT ++ select GRKERNSEC_CHROOT_DOUBLE ++ select GRKERNSEC_CHROOT_CHDIR ++ select GRKERNSEC_CHROOT_MKNOD ++ select GRKERNSEC_PROC ++ select GRKERNSEC_PROC_USERGROUP ++ select PAX_RANDUSTACK ++ select PAX_ASLR ++ select PAX_RANDMMAP ++ select PAX_REFCOUNT if (X86 || SPARC64) ++ select PAX_USERCOPY if ((X86 || SPARC32 || SPARC64 || PPC32 || PPC64) && (SLAB || SLUB || SLOB)) ++ ++ help ++ If you say Y here, several features in addition to those included ++ in the low additional security level will be enabled. These ++ features provide even more security to your system, though in rare ++ cases they may be incompatible with very old or poorly written ++ software. If you enable this option, make sure that your auth ++ service (identd) is running as gid 1001.
With this option, ++ the following features (in addition to those provided in the ++ low additional security level) will be enabled: ++ ++ - Failed fork logging ++ - Time change logging ++ - Signal logging ++ - Deny mounts in chroot ++ - Deny double chrooting ++ - Deny sysctl writes in chroot ++ - Deny mknod in chroot ++ - Deny access to abstract AF_UNIX sockets out of chroot ++ - Deny pivot_root in chroot ++ - Denied writes of /dev/kmem, /dev/mem, and /dev/port ++ - /proc restrictions with special GID set to 10 (usually wheel) ++ - Address Space Layout Randomization (ASLR) ++ - Prevent exploitation of most refcount overflows ++ - Bounds checking of copying between the kernel and userland ++ ++config GRKERNSEC_HIGH ++ bool "High" ++ select GRKERNSEC_LINK ++ select GRKERNSEC_FIFO ++ select GRKERNSEC_EXECVE ++ select GRKERNSEC_DMESG ++ select GRKERNSEC_FORKFAIL ++ select GRKERNSEC_TIME ++ select GRKERNSEC_SIGNAL ++ select GRKERNSEC_CHROOT ++ select GRKERNSEC_CHROOT_SHMAT ++ select GRKERNSEC_CHROOT_UNIX ++ select GRKERNSEC_CHROOT_MOUNT ++ select GRKERNSEC_CHROOT_FCHDIR ++ select GRKERNSEC_CHROOT_PIVOT ++ select GRKERNSEC_CHROOT_DOUBLE ++ select GRKERNSEC_CHROOT_CHDIR ++ select GRKERNSEC_CHROOT_MKNOD ++ select GRKERNSEC_CHROOT_CAPS ++ select GRKERNSEC_CHROOT_SYSCTL ++ select GRKERNSEC_CHROOT_FINDTASK ++ select GRKERNSEC_PROC ++ select GRKERNSEC_PROC_MEMMAP if (PAX_NOEXEC || PAX_ASLR) ++ select GRKERNSEC_HIDESYM ++ select GRKERNSEC_BRUTE ++ select GRKERNSEC_PROC_USERGROUP ++ select GRKERNSEC_KMEM ++ select GRKERNSEC_RESLOG ++ select GRKERNSEC_RANDNET ++ select GRKERNSEC_PROC_ADD ++ select GRKERNSEC_CHROOT_CHMOD ++ select GRKERNSEC_CHROOT_NICE ++ select GRKERNSEC_AUDIT_MOUNT ++ select GRKERNSEC_MODHARDEN if (MODULES) ++ select GRKERNSEC_HARDEN_PTRACE ++ select GRKERNSEC_VM86 if (X86_32) ++ select PAX ++ select PAX_RANDUSTACK ++ select PAX_ASLR ++ select PAX_RANDMMAP ++ select PAX_NOEXEC ++ select PAX_MPROTECT ++ select PAX_EI_PAX ++ select PAX_PT_PAX_FLAGS ++ select PAX_HAVE_ACL_FLAGS ++ select PAX_KERNEXEC if ((PPC32 || PPC64 || X86) && (!X86_32 || X86_WP_WORKS_OK) && !XEN) ++ select PAX_MEMORY_UDEREF if (X86_32 && !XEN) ++ select PAX_RANDKSTACK if (X86_TSC && !X86_64) ++ select PAX_SEGMEXEC if (X86_32) ++ select PAX_PAGEEXEC ++ select PAX_EMUPLT if (ALPHA || PARISC || SPARC32 || SPARC64) ++ select PAX_EMUTRAMP if (PARISC) ++ select PAX_EMUSIGRT if (PARISC) ++ select PAX_ETEXECRELOCS if (ALPHA || IA64 || PARISC) ++ select PAX_REFCOUNT if (X86 || SPARC64) ++ select PAX_USERCOPY if ((X86 || PPC32 || PPC64 || SPARC32 || SPARC64) && (SLAB || SLUB || SLOB)) ++ help ++ If you say Y here, many of the features of grsecurity will be ++ enabled, which will protect you against many kinds of attacks ++ against your system. The heightened security comes at a cost ++ of an increased chance of incompatibilities with rare software ++ on your machine. Since this security level enables PaX, you should ++ view http://pax.grsecurity.net and read about the PaX ++ project. While you are there, download chpax and run it on ++ binaries that cause problems with PaX. Also remember that ++ since the /proc restrictions are enabled, you must run your ++ identd as gid 1001. 
This security level enables the following ++ features in addition to those listed in the low and medium ++ security levels: ++ ++ - Additional /proc restrictions ++ - Chmod restrictions in chroot ++ - No signals, ptrace, or viewing of processes outside of chroot ++ - Capability restrictions in chroot ++ - Deny fchdir out of chroot ++ - Priority restrictions in chroot ++ - Segmentation-based implementation of PaX ++ - Mprotect restrictions ++ - Removal of addresses from /proc/<pid>/[smaps|maps|stat] ++ - Kernel stack randomization ++ - Mount/unmount/remount logging ++ - Kernel symbol hiding ++ - Prevention of memory exhaustion-based exploits ++ - Hardening of module auto-loading ++ - Ptrace restrictions ++ - Restricted vm86 mode ++ ++config GRKERNSEC_CUSTOM ++ bool "Custom" ++ help ++ If you say Y here, you will be able to configure every grsecurity ++ option, which allows you to enable many more features that aren't ++ covered in the basic security levels. These additional features ++ include TPE, socket restrictions, and the sysctl system for ++ grsecurity. It is advised that you read through the help for ++ each option to determine its usefulness in your situation. ++ ++endchoice ++ ++menu "Address Space Protection" ++depends on GRKERNSEC ++ ++config GRKERNSEC_KMEM ++ bool "Deny writing to /dev/kmem, /dev/mem, and /dev/port" ++ help ++ If you say Y here, /dev/kmem and /dev/mem won't be allowed to ++ be written to via mmap or otherwise to modify the running kernel. ++ /dev/port will also not be allowed to be opened. If you have module ++ support disabled, enabling this will close up four ways that are ++ currently used to insert malicious code into the running kernel. ++ Even with all these features enabled, we still highly recommend that ++ you use the RBAC system, as it is still possible for an attacker to ++ modify the running kernel through privileged I/O granted by ioperm/iopl. ++ If you are not using XFree86, you may be able to stop this additional ++ case by enabling the 'Disable privileged I/O' option. Though nothing ++ legitimately writes to /dev/kmem, XFree86 does need to write to /dev/mem, ++ but only to video memory, which is the only writing we allow in this ++ case. If /dev/kmem or /dev/mem are mmaped without PROT_WRITE, they will ++ not be allowed to mprotect it with PROT_WRITE later. ++ It is highly recommended that you say Y here if you meet all the ++ conditions above. ++ ++config GRKERNSEC_VM86 ++ bool "Restrict VM86 mode" ++ depends on X86_32 ++ ++ help ++ If you say Y here, only processes with CAP_SYS_RAWIO will be able to ++ make use of a special execution mode on 32bit x86 processors called ++ Virtual 8086 (VM86) mode. XFree86 may need vm86 mode for certain ++ video cards and will still work with this option enabled. The purpose ++ of the option is to prevent exploitation of emulation errors in ++ virtualization of vm86 mode like the one discovered in VMWare in 2009. ++ Nearly all users should be able to enable this option. ++ ++config GRKERNSEC_IO ++ bool "Disable privileged I/O" ++ depends on X86 ++ select RTC_CLASS ++ select RTC_INTF_DEV ++ select RTC_DRV_CMOS ++ ++ help ++ If you say Y here, all ioperm and iopl calls will return an error. ++ Ioperm and iopl can be used to modify the running kernel. ++ Unfortunately, some programs need this access to operate properly, ++ the most notable of which are XFree86 and hwclock. 
hwclock can be ++ remedied by having RTC support in the kernel, so real-time ++ clock support is enabled if this option is enabled, to ensure ++ that hwclock operates correctly. XFree86 still will not ++ operate correctly with this option enabled, so DO NOT CHOOSE Y ++ IF YOU USE XFree86. If you use XFree86 and you still want to ++ protect your kernel against modification, use the RBAC system. ++ ++config GRKERNSEC_PROC_MEMMAP ++ bool "Remove addresses from /proc/<pid>/[smaps|maps|stat]" ++ default y if (PAX_NOEXEC || PAX_ASLR) ++ depends on PAX_NOEXEC || PAX_ASLR ++ help ++ If you say Y here, the /proc/<pid>/maps and /proc/<pid>/stat files will ++ give no information about the addresses of its mappings if ++ PaX features that rely on random addresses are enabled on the task. ++ If you use PaX it is greatly recommended that you say Y here as it ++ closes up a hole that makes the full ASLR useless for suid ++ binaries. ++ ++config GRKERNSEC_BRUTE ++ bool "Deter exploit bruteforcing" ++ help ++ If you say Y here, attempts to bruteforce exploits against forking ++ daemons such as apache or sshd will be deterred. When a child of a ++ forking daemon is killed by PaX or crashes due to an illegal ++ instruction, the parent process will be delayed 30 seconds upon every ++ subsequent fork until the administrator is able to assess the ++ situation and restart the daemon. It is recommended that you also ++ enable signal logging in the auditing section so that logs are ++ generated when a process performs an illegal instruction. ++ ++config GRKERNSEC_MODHARDEN ++ bool "Harden module auto-loading" ++ depends on MODULES ++ help ++ If you say Y here, module auto-loading in response to use of some ++ feature implemented by an unloaded module will be restricted to ++ root users. Enabling this option helps defend against attacks ++ by unprivileged users who abuse the auto-loading behavior to ++ cause a vulnerable module to load that is then exploited. ++ ++ If this option prevents a legitimate use of auto-loading for a ++ non-root user, the administrator can execute modprobe manually ++ with the exact name of the module mentioned in the alert log. ++ Alternatively, the administrator can add the module to the list ++ of modules loaded at boot by modifying init scripts. ++ ++ Modification of init scripts will most likely be needed on ++ Ubuntu servers with encrypted home directory support enabled, ++ as the first non-root user logging in will cause the ecb(aes), ++ ecb(aes)-all, cbc(aes), and cbc(aes)-all modules to be loaded. ++ ++config GRKERNSEC_HIDESYM ++ bool "Hide kernel symbols" ++ help ++ If you say Y here, getting information on loaded modules, and ++ displaying all kernel symbols through a syscall will be restricted ++ to users with CAP_SYS_MODULE. For software compatibility reasons, ++ /proc/kallsyms will be restricted to the root user. The RBAC ++ system can hide that entry even from root. Note that this option ++ is only effective provided the following conditions are met: ++ 1) The kernel using grsecurity is not precompiled by some distribution ++ 2) You are using the RBAC system and hiding other files such as your ++ kernel image and System.map. Alternatively, enabling this option ++ causes the permissions on /boot, /lib/modules, and the kernel ++ source directory to change at compile time to prevent ++ reading by non-root users. 
++ If the above conditions are met, this option will aid in providing a ++ useful protection against local kernel exploitation of overflows ++ and arbitrary read/write vulnerabilities. ++ ++endmenu ++menu "Role Based Access Control Options" ++depends on GRKERNSEC ++ ++config GRKERNSEC_NO_RBAC ++ bool "Disable RBAC system" ++ help ++ If you say Y here, the /dev/grsec device will be removed from the kernel, ++ preventing the RBAC system from being enabled. You should only say Y ++ here if you have no intention of using the RBAC system, so as to prevent ++ an attacker with root access from misusing the RBAC system to hide files ++ and processes when loadable module support and /dev/[k]mem have been ++ locked down. ++ ++config GRKERNSEC_ACL_HIDEKERN ++ bool "Hide kernel processes" ++ help ++ If you say Y here, all kernel threads will be hidden from all ++ processes but those whose subject has the "view hidden processes" ++ flag. ++ ++config GRKERNSEC_ACL_MAXTRIES ++ int "Maximum tries before password lockout" ++ default 3 ++ help ++ This option enforces the maximum number of times a user can attempt ++ to authorize themselves with the grsecurity RBAC system before being ++ denied the ability to attempt authorization again for a specified time. ++ The lower the number, the harder it will be to brute-force a password. ++ ++config GRKERNSEC_ACL_TIMEOUT ++ int "Time to wait after max password tries, in seconds" ++ default 30 ++ help ++ This option specifies the time the user must wait after attempting to ++ authorize to the RBAC system with the maximum number of invalid ++ passwords. The higher the number, the harder it will be to brute-force ++ a password. ++ ++endmenu ++menu "Filesystem Protections" ++depends on GRKERNSEC ++ ++config GRKERNSEC_PROC ++ bool "Proc restrictions" ++ help ++ If you say Y here, the permissions of the /proc filesystem ++ will be altered to enhance system security and privacy. You MUST ++ choose either a user only restriction or a user and group restriction. ++ Depending upon the option you choose, you can either restrict users to ++ see only the processes they themselves run, or choose a group that can ++ view all processes and files normally restricted to root if you choose ++ the "restrict to user only" option. NOTE: If you're running identd as ++ a non-root user, you will have to run it as the group you specify here. ++ ++config GRKERNSEC_PROC_USER ++ bool "Restrict /proc to user only" ++ depends on GRKERNSEC_PROC ++ help ++ If you say Y here, non-root users will only be able to view their own ++ processes, will be restricted from viewing network-related information, ++ and will be unable to view kernel symbol and module information. ++ ++config GRKERNSEC_PROC_USERGROUP ++ bool "Allow special group" ++ depends on GRKERNSEC_PROC && !GRKERNSEC_PROC_USER ++ help ++ If you say Y here, you will be able to select a group that will be ++ able to view all processes, network-related information, and ++ kernel and symbol information. This option is useful if you want ++ to run identd as a non-root user. ++ ++config GRKERNSEC_PROC_GID ++ int "GID for special group" ++ depends on GRKERNSEC_PROC_USERGROUP ++ default 1001 ++ ++config GRKERNSEC_PROC_ADD ++ bool "Additional restrictions" ++ depends on GRKERNSEC_PROC_USER || GRKERNSEC_PROC_USERGROUP ++ help ++ If you say Y here, additional restrictions will be placed on ++ /proc that keep normal users from viewing device information and ++ slabinfo information that could be useful for exploits.
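[Note: the two RBAC lockout knobs above, GRKERNSEC_ACL_MAXTRIES and GRKERNSEC_ACL_TIMEOUT, describe a simple policy: count consecutive authorization failures and refuse further attempts for a fixed window once the limit is reached. The following is a minimal userspace sketch of that policy only; the names and standalone form are illustrative and are not the actual gracl.c implementation.]

#include <stdio.h>
#include <time.h>

#define ACL_MAXTRIES 3	/* default of GRKERNSEC_ACL_MAXTRIES */
#define ACL_TIMEOUT 30	/* default of GRKERNSEC_ACL_TIMEOUT, seconds */

static unsigned int failures;
static time_t locked_until;

/* Return 0 while the lockout window is active, 1 otherwise. */
static int attempt_allowed(void)
{
	return time(NULL) >= locked_until;
}

/* Record one authorization attempt; on the ACL_MAXTRIES'th
 * consecutive failure, start the ACL_TIMEOUT wait window. */
static void record_attempt(int success)
{
	if (success) {
		failures = 0;
		return;
	}
	if (++failures >= ACL_MAXTRIES) {
		locked_until = time(NULL) + ACL_TIMEOUT;
		failures = 0;
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 5; i++) {
		if (!attempt_allowed()) {
			puts("locked out, try again later");
			continue;
		}
		record_attempt(0);	/* simulate a bad password */
		puts("authorization failed");
	}
	return 0;
}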
++ ++config GRKERNSEC_LINK ++ bool "Linking restrictions" ++ help ++ If you say Y here, /tmp race exploits will be prevented, since users ++ will no longer be able to follow symlinks owned by other users in ++ world-writable +t directories (i.e. /tmp), unless the owner of the ++ symlink is the owner of the directory. Users will also not be ++ able to hardlink to files they do not own. If the sysctl option is ++ enabled, a sysctl option with name "linking_restrictions" is created. ++ ++config GRKERNSEC_FIFO ++ bool "FIFO restrictions" ++ help ++ If you say Y here, users will not be able to write to FIFOs they don't ++ own in world-writable +t directories (i.e. /tmp), unless the owner of ++ the FIFO is the same as the owner of the directory it's held in. If the sysctl ++ option is enabled, a sysctl option with name "fifo_restrictions" is ++ created. ++ ++config GRKERNSEC_ROFS ++ bool "Runtime read-only mount protection" ++ help ++ If you say Y here, a sysctl option with name "romount_protect" will ++ be created. By setting this option to 1 at runtime, filesystems ++ will be protected in the following ways: ++ * No new writable mounts will be allowed ++ * Existing read-only mounts won't be able to be remounted read/write ++ * Write operations will be denied on all block devices ++ This option acts independently of grsec_lock: once it is set to 1, ++ it cannot be turned off. Therefore, please be mindful of the resulting ++ behavior if this option is enabled in an init script on a read-only ++ filesystem. This feature is mainly intended for secure embedded systems. ++ ++config GRKERNSEC_CHROOT ++ bool "Chroot jail restrictions" ++ help ++ If you say Y here, you will be able to choose several options that will ++ make breaking out of a chrooted jail much more difficult. If you ++ encounter no software incompatibilities with the following options, it ++ is recommended that you enable each one. ++ ++config GRKERNSEC_CHROOT_MOUNT ++ bool "Deny mounts" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to ++ mount or remount filesystems. If the sysctl option is enabled, a ++ sysctl option with name "chroot_deny_mount" is created. ++ ++config GRKERNSEC_CHROOT_DOUBLE ++ bool "Deny double-chroots" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to chroot ++ again outside the chroot. This is a widely used method of breaking ++ out of a chroot jail and should not be allowed. If the sysctl ++ option is enabled, a sysctl option with name ++ "chroot_deny_chroot" is created. ++ ++config GRKERNSEC_CHROOT_PIVOT ++ bool "Deny pivot_root in chroot" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to use ++ a function called pivot_root() that was introduced in Linux 2.3.41. It ++ works similarly to chroot in that it changes the root filesystem. This ++ function could be misused in a chrooted process to attempt to break out ++ of the chroot, and therefore should not be allowed. If the sysctl ++ option is enabled, a sysctl option with name "chroot_deny_pivot" is ++ created. ++ ++config GRKERNSEC_CHROOT_CHDIR ++ bool "Enforce chdir("/") on all chroots" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, the current working directory of all newly-chrooted ++ applications will be set to the root directory of the chroot. ++ The man page on chroot(2) states: ++ Note that this call does not change the current working ++ directory, so that `.'
can be outside the tree rooted at ++ `/'. In particular, the super-user can escape from a ++ `chroot jail' by doing `mkdir foo; chroot foo; cd ..'. ++ ++ It is recommended that you say Y here, since it's not known to break ++ any software. If the sysctl option is enabled, a sysctl option with ++ name "chroot_enforce_chdir" is created. ++ ++config GRKERNSEC_CHROOT_CHMOD ++ bool "Deny (f)chmod +s" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to chmod ++ or fchmod files to make them have suid or sgid bits. This protects ++ against another published method of breaking a chroot. If the sysctl ++ option is enabled, a sysctl option with name "chroot_deny_chmod" is ++ created. ++ ++config GRKERNSEC_CHROOT_FCHDIR ++ bool "Deny fchdir out of chroot" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, a well-known method of breaking chroots by fchdir'ing ++ to a file descriptor of the chrooting process that points to a directory ++ outside the filesystem will be stopped. If the sysctl option ++ is enabled, a sysctl option with name "chroot_deny_fchdir" is created. ++ ++config GRKERNSEC_CHROOT_MKNOD ++ bool "Deny mknod" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be allowed to ++ mknod. The problem with using mknod inside a chroot is that it ++ would allow an attacker to create a device entry that is the same ++ as one on the physical root of your system, which could range from ++ anything from the console device to a device for your harddrive (which ++ they could then use to wipe the drive or steal data). It is recommended ++ that you say Y here, unless you run into software incompatibilities. ++ If the sysctl option is enabled, a sysctl option with name ++ "chroot_deny_mknod" is created. ++ ++config GRKERNSEC_CHROOT_SHMAT ++ bool "Deny shmat() out of chroot" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to attach ++ to shared memory segments that were created outside of the chroot jail. ++ It is recommended that you say Y here. If the sysctl option is enabled, ++ a sysctl option with name "chroot_deny_shmat" is created. ++ ++config GRKERNSEC_CHROOT_UNIX ++ bool "Deny access to abstract AF_UNIX sockets out of chroot" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to ++ connect to abstract (meaning not belonging to a filesystem) Unix ++ domain sockets that were bound outside of a chroot. It is recommended ++ that you say Y here. If the sysctl option is enabled, a sysctl option ++ with name "chroot_deny_unix" is created. ++ ++config GRKERNSEC_CHROOT_FINDTASK ++ bool "Protect outside processes" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to ++ kill, send signals with fcntl, ptrace, capget, getpgid, setpgid, ++ getsid, or view any process outside of the chroot. If the sysctl ++ option is enabled, a sysctl option with name "chroot_findtask" is ++ created. ++ ++config GRKERNSEC_CHROOT_NICE ++ bool "Restrict priority changes" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, processes inside a chroot will not be able to raise ++ the priority of processes in the chroot, or alter the priority of ++ processes outside the chroot. This provides more security than simply ++ removing CAP_SYS_NICE from the process' capability set. 
If the ++ sysctl option is enabled, a sysctl option with name "chroot_restrict_nice" ++ is created. ++ ++config GRKERNSEC_CHROOT_SYSCTL ++ bool "Deny sysctl writes" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, an attacker in a chroot will not be able to ++ write to sysctl entries, either by sysctl(2) or through a /proc ++ interface. It is strongly recommended that you say Y here. If the ++ sysctl option is enabled, a sysctl option with name ++ "chroot_deny_sysctl" is created. ++ ++config GRKERNSEC_CHROOT_CAPS ++ bool "Capability restrictions" ++ depends on GRKERNSEC_CHROOT ++ help ++ If you say Y here, the capabilities on all root processes within a ++ chroot jail will be lowered to stop module insertion, raw i/o, ++ system and net admin tasks, rebooting the system, modifying immutable ++ files, modifying IPC owned by another, and changing the system time. ++ This is left an option because it can break some apps. Disable this ++ if your chrooted apps are having problems performing those kinds of ++ tasks. If the sysctl option is enabled, a sysctl option with ++ name "chroot_caps" is created. ++ ++endmenu ++menu "Kernel Auditing" ++depends on GRKERNSEC ++ ++config GRKERNSEC_AUDIT_GROUP ++ bool "Single group for auditing" ++ help ++ If you say Y here, the exec, chdir, and (un)mount logging features ++ will only operate on a group you specify. This option is recommended ++ if you only want to watch certain users instead of having a large ++ amount of logs from the entire system. If the sysctl option is enabled, ++ a sysctl option with name "audit_group" is created. ++ ++config GRKERNSEC_AUDIT_GID ++ int "GID for auditing" ++ depends on GRKERNSEC_AUDIT_GROUP ++ default 1007 ++ ++config GRKERNSEC_EXECLOG ++ bool "Exec logging" ++ help ++ If you say Y here, all execve() calls will be logged (since the ++ other exec*() calls are frontends to execve(), all execution ++ will be logged). Useful for shell-servers that like to keep track ++ of their users. If the sysctl option is enabled, a sysctl option with ++ name "exec_logging" is created. ++ WARNING: This option when enabled will produce a LOT of logs, especially ++ on an active system. ++ ++config GRKERNSEC_RESLOG ++ bool "Resource logging" ++ help ++ If you say Y here, all attempts to overstep resource limits will ++ be logged with the resource name, the requested size, and the current ++ limit. It is highly recommended that you say Y here. If the sysctl ++ option is enabled, a sysctl option with name "resource_logging" is ++ created. If the RBAC system is enabled, the sysctl value is ignored. ++ ++config GRKERNSEC_CHROOT_EXECLOG ++ bool "Log execs within chroot" ++ help ++ If you say Y here, all executions inside a chroot jail will be logged ++ to syslog. This can cause a large amount of logs if certain ++ applications (eg. djb's daemontools) are installed on the system, and ++ is therefore left as an option. If the sysctl option is enabled, a ++ sysctl option with name "chroot_execlog" is created. ++ ++config GRKERNSEC_AUDIT_PTRACE ++ bool "Ptrace logging" ++ help ++ If you say Y here, all attempts to attach to a process via ptrace ++ will be logged. If the sysctl option is enabled, a sysctl option ++ with name "audit_ptrace" is created. ++ ++config GRKERNSEC_AUDIT_CHDIR ++ bool "Chdir logging" ++ help ++ If you say Y here, all chdir() calls will be logged. If the sysctl ++ option is enabled, a sysctl option with name "audit_chdir" is created. 
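[Note: each auditing toggle in this menu is wired up the same way as the mount hooks in grsec_mount.c earlier in this patch: a small always-compiled function checks its grsec_enable_* flag (the sysctl entry of the same name) before calling one of the gr_log_* helpers. The chdir case presumably has the following shape; GR_CHDIR_AUDIT_MSG and the exact signature are assumed for illustration, not copied from grsec_chdir.c.]

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/grsecurity.h>
#include <linux/grinternal.h>

void
gr_log_chdir(const struct dentry *dentry, const struct vfsmount *mnt)
{
#ifdef CONFIG_GRKERNSEC_AUDIT_CHDIR
	/* "audit_chdir" sysctl toggle; cf. grsec_enable_chdir in the
	 * grsecurity_table above */
	if (grsec_enable_chdir)
		gr_log_fs_generic(GR_DO_AUDIT, GR_CHDIR_AUDIT_MSG, dentry, mnt);
#endif
	return;
}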
++ ++config GRKERNSEC_AUDIT_MOUNT ++ bool "(Un)Mount logging" ++ help ++ If you say Y here, all mounts and unmounts will be logged. If the ++ sysctl option is enabled, a sysctl option with name "audit_mount" is ++ created. ++ ++config GRKERNSEC_SIGNAL ++ bool "Signal logging" ++ help ++ If you say Y here, certain important signals will be logged, such as ++ SIGSEGV, which will as a result inform you when an error in a program ++ occurred, which in some cases could mean a possible exploit attempt. ++ If the sysctl option is enabled, a sysctl option with name ++ "signal_logging" is created. ++ ++config GRKERNSEC_FORKFAIL ++ bool "Fork failure logging" ++ help ++ If you say Y here, all failed fork() attempts will be logged. ++ This could suggest a fork bomb, or someone attempting to overstep ++ their process limit. If the sysctl option is enabled, a sysctl option ++ with name "forkfail_logging" is created. ++ ++config GRKERNSEC_TIME ++ bool "Time change logging" ++ help ++ If you say Y here, any changes of the system clock will be logged. ++ If the sysctl option is enabled, a sysctl option with name ++ "timechange_logging" is created. ++ ++config GRKERNSEC_PROC_IPADDR ++ bool "/proc/<pid>/ipaddr support" ++ help ++ If you say Y here, a new entry will be added to each /proc/<pid> ++ directory that contains the IP address of the person using the task. ++ The IP is carried across local TCP and AF_UNIX stream sockets. ++ This information can be useful for IDS/IPSes to perform remote response ++ to a local attack. The entry is readable by only the owner of the ++ process (and root if he has CAP_DAC_OVERRIDE, which can be removed via ++ the RBAC system), and thus does not create privacy concerns. ++ ++config GRKERNSEC_AUDIT_TEXTREL ++ bool 'ELF text relocations logging (READ HELP)' ++ depends on PAX_MPROTECT ++ help ++ If you say Y here, text relocations will be logged with the filename ++ of the offending library or binary. The purpose of the feature is ++ to help Linux distribution developers get rid of libraries and ++ binaries that need text relocations which hinder the future progress ++ of PaX. Only Linux distribution developers should say Y here, and ++ never on a production machine, as this option creates an information ++ leak that could aid an attacker in defeating the randomization of ++ a single memory region. If the sysctl option is enabled, a sysctl ++ option with name "audit_textrel" is created. ++ ++endmenu ++ ++menu "Executable Protections" ++depends on GRKERNSEC ++ ++config GRKERNSEC_EXECVE ++ bool "Enforce RLIMIT_NPROC on execs" ++ help ++ If you say Y here, users with a resource limit on processes will ++ have the value checked during execve() calls. The current system ++ only checks the system limit during fork() calls. If the sysctl option ++ is enabled, a sysctl option with name "execve_limiting" is created. ++ ++config GRKERNSEC_DMESG ++ bool "Dmesg(8) restriction" ++ help ++ If you say Y here, non-root users will not be able to use dmesg(8) ++ to view up to the last 4kb of messages in the kernel's log buffer. ++ If the sysctl option is enabled, a sysctl option with name "dmesg" is ++ created. ++ ++config GRKERNSEC_HARDEN_PTRACE ++ bool "Deter ptrace-based process snooping" ++ help ++ If you say Y here, TTY sniffers and other malicious monitoring ++ programs implemented through ptrace will be defeated. If you ++ have been using the RBAC system, this option has already been ++ enabled for several years for all users, with the ability to make ++ fine-grained exceptions.
++ ++ This option only affects the ability of non-root users to ptrace ++ processes that are not a descendant of the ptracing process. ++ This means that strace ./binary and gdb ./binary will still work, ++ but attaching to arbitrary processes will not. If the sysctl ++ option is enabled, a sysctl option with name "harden_ptrace" is ++ created. ++ ++config GRKERNSEC_TPE ++ bool "Trusted Path Execution (TPE)" ++ help ++ If you say Y here, you will be able to choose a gid to add to the ++ supplementary groups of users you want to mark as "untrusted." ++ These users will not be able to execute any files that are not in ++ root-owned directories writable only by root. If the sysctl option ++ is enabled, a sysctl option with name "tpe" is created. ++ ++config GRKERNSEC_TPE_ALL ++ bool "Partially restrict non-root users" ++ depends on GRKERNSEC_TPE ++ help ++ If you say Y here, all non-root users other than the ones in the ++ group specified in the main TPE option will only be allowed to ++ execute files in directories they own that are not group or ++ world-writable, or in directories owned by root and writable only by ++ root. If the sysctl option is enabled, a sysctl option with name ++ "tpe_restrict_all" is created. ++ ++config GRKERNSEC_TPE_INVERT ++ bool "Invert GID option" ++ depends on GRKERNSEC_TPE ++ help ++ If you say Y here, the group you specify in the TPE configuration will ++ decide what group TPE restrictions will be *disabled* for. This ++ option is useful if you want TPE restrictions to be applied to most ++ users on the system. ++ ++config GRKERNSEC_TPE_GID ++ int "GID for untrusted users" ++ depends on GRKERNSEC_TPE && !GRKERNSEC_TPE_INVERT ++ default 1005 ++ help ++ If you have selected the "Invert GID option" above, setting this ++ GID determines what group TPE restrictions will be *disabled* for. ++ If you have not selected the "Invert GID option" above, setting this ++ GID determines what group TPE restrictions will be *enabled* for. ++ If the sysctl option is enabled, a sysctl option with name "tpe_gid" ++ is created. ++ ++config GRKERNSEC_TPE_GID ++ int "GID for trusted users" ++ depends on GRKERNSEC_TPE && GRKERNSEC_TPE_INVERT ++ default 1005 ++ help ++ If you have selected the "Invert GID option" above, setting this ++ GID determines what group TPE restrictions will be *disabled* for. ++ If you have not selected the "Invert GID option" above, setting this ++ GID determines what group TPE restrictions will be *enabled* for. ++ If the sysctl option is enabled, a sysctl option with name "tpe_gid" ++ is created. ++ ++endmenu ++menu "Network Protections" ++depends on GRKERNSEC ++ ++config GRKERNSEC_RANDNET ++ bool "Larger entropy pools" ++ help ++ If you say Y here, the entropy pools used for many features of Linux ++ and grsecurity will be doubled in size. Since several grsecurity ++ features use additional randomness, it is recommended that you say Y ++ here. Saying Y here has a similar effect as modifying ++ /proc/sys/kernel/random/poolsize. ++ ++config GRKERNSEC_BLACKHOLE ++ bool "TCP/UDP blackhole and LAST_ACK DoS prevention" ++ help ++ If you say Y here, neither TCP resets nor ICMP ++ destination-unreachable packets will be sent in response to packets ++ sent to ports for which no associated listening process exists. ++ This feature supports both IPV4 and IPV6 and exempts the ++ loopback interface from blackholing. Enabling this feature ++ makes a host more resilient to DoS attacks and reduces network ++ visibility against scanners.
++ ++ The blackhole feature as-implemented is equivalent to the FreeBSD ++ blackhole feature, as it prevents RST responses to all packets, not ++ just SYNs. Under most application behavior this causes no ++ problems, but applications (like haproxy) may not close certain ++ connections in a way that cleanly terminates them on the remote ++ end, leaving the remote host in LAST_ACK state. Because of this ++ side-effect and to prevent intentional LAST_ACK DoSes, this ++ feature also adds automatic mitigation against such attacks. ++ The mitigation drastically reduces the amount of time a socket ++ can spend in LAST_ACK state. If you're using haproxy and not ++ all servers it connects to have this option enabled, consider ++ disabling this feature on the haproxy host. ++ ++ If this option is enabled, two sysctl options with names ++ "ip_blackhole" and "lastack_retries" will be created. ++ While "ip_blackhole" takes the standard zero/non-zero on/off ++ toggle, "lastack_retries" uses the same kinds of values as ++ "tcp_retries1" and "tcp_retries2". The default value of 4 ++ prevents a socket from lasting more than 45 seconds in LAST_ACK ++ state. ++ ++config GRKERNSEC_SOCKET ++ bool "Socket restrictions" ++ help ++ If you say Y here, you will be able to choose from several options. ++ If you assign a GID on your system and add it to the supplementary ++ groups of users you want to restrict socket access to, this patch ++ will perform up to three things, based on the option(s) you choose. ++ ++config GRKERNSEC_SOCKET_ALL ++ bool "Deny any sockets to group" ++ depends on GRKERNSEC_SOCKET ++ help ++ If you say Y here, you will be able to choose a GID whose users will ++ be unable to connect to other hosts from your machine or run server ++ applications from your machine. If the sysctl option is enabled, a ++ sysctl option with name "socket_all" is created. ++ ++config GRKERNSEC_SOCKET_ALL_GID ++ int "GID to deny all sockets for" ++ depends on GRKERNSEC_SOCKET_ALL ++ default 1004 ++ help ++ Here you can choose the GID to disable socket access for. Remember to ++ add the users you want socket access disabled for to the GID ++ specified here. If the sysctl option is enabled, a sysctl option ++ with name "socket_all_gid" is created. ++ ++config GRKERNSEC_SOCKET_CLIENT ++ bool "Deny client sockets to group" ++ depends on GRKERNSEC_SOCKET ++ help ++ If you say Y here, you will be able to choose a GID whose users will ++ be unable to connect to other hosts from your machine, but will be ++ able to run servers. If this option is enabled, all users in the group ++ you specify will have to use passive mode when initiating ftp transfers ++ from the shell on your machine. If the sysctl option is enabled, a ++ sysctl option with name "socket_client" is created. ++ ++config GRKERNSEC_SOCKET_CLIENT_GID ++ int "GID to deny client sockets for" ++ depends on GRKERNSEC_SOCKET_CLIENT ++ default 1003 ++ help ++ Here you can choose the GID to disable client socket access for. ++ Remember to add the users you want client socket access disabled for to ++ the GID specified here. If the sysctl option is enabled, a sysctl ++ option with name "socket_client_gid" is created. ++ ++config GRKERNSEC_SOCKET_SERVER ++ bool "Deny server sockets to group" ++ depends on GRKERNSEC_SOCKET ++ help ++ If you say Y here, you will be able to choose a GID whose users will ++ be unable to run server applications from your machine. If the sysctl ++ option is enabled, a sysctl option with name "socket_server" is created.
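[Note: these socket options are enforced at runtime by the gr_handle_sock_all(), gr_handle_sock_server(), and gr_handle_sock_client() hooks shown in grsec_sock.c earlier in this patch, which return -EACCES for members of the configured group. A small userspace probe can make the effect visible; this is illustrative only, and which call actually fails depends on which options and GIDs are configured on the running kernel.]

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	struct sockaddr_in sin;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

	/* with GRKERNSEC_SOCKET_ALL active for this user's group, the
	 * socket() call itself is the one denied (gr_handle_sock_all) */
	if (fd < 0) {
		printf("socket: %s\n", strerror(errno));
		return 1;
	}

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(8080);

	/* with GRKERNSEC_SOCKET_SERVER active, bind() is denied instead
	 * (gr_handle_sock_server) */
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		printf("bind: %s\n", strerror(errno));
	else
		printf("bind: ok\n");

	close(fd);
	return 0;
}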
++ ++config GRKERNSEC_SOCKET_SERVER_GID ++ int "GID to deny server sockets for" ++ depends on GRKERNSEC_SOCKET_SERVER ++ default 1002 ++ help ++ Here you can choose the GID to disable server socket access for. ++ Remember to add the users you want server socket access disabled for to ++ the GID specified here. If the sysctl option is enabled, a sysctl ++ option with name "socket_server_gid" is created. ++ ++endmenu ++menu "Sysctl support" ++depends on GRKERNSEC && SYSCTL ++ ++config GRKERNSEC_SYSCTL ++ bool "Sysctl support" ++ help ++ If you say Y here, you will be able to change the options that ++ grsecurity runs with at bootup, without having to recompile your ++ kernel. You can echo values to files in /proc/sys/kernel/grsecurity ++ to enable (1) or disable (0) various features. All the sysctl entries ++ are mutable until the "grsec_lock" entry is set to a non-zero value. ++ All features enabled in the kernel configuration are disabled at boot ++ if you do not say Y to the "Turn on features by default" option. ++ All options should be set at startup, and the grsec_lock entry should ++ be set to a non-zero value after all the options are set. ++ *THIS IS EXTREMELY IMPORTANT* ++ ++config GRKERNSEC_SYSCTL_ON ++ bool "Turn on features by default" ++ depends on GRKERNSEC_SYSCTL ++ help ++ If you say Y here, the features enabled in your kernel configuration ++ will also be enabled at boot time, rather than starting out disabled ++ until turned on via sysctl. It is recommended you say Y here unless ++ there is some reason you would want all sysctl-tunable features to ++ be disabled by default. As mentioned elsewhere, it is important ++ to enable the grsec_lock entry once you have finished modifying ++ the sysctl entries. ++ ++endmenu ++menu "Logging Options" ++depends on GRKERNSEC ++ ++config GRKERNSEC_FLOODTIME ++ int "Seconds in between log messages (minimum)" ++ default 10 ++ help ++ This option allows you to enforce a minimum number of seconds between ++ grsecurity log messages. The default should be suitable for most ++ people; however, if you choose to change it, choose a value small enough ++ to allow informative logs to be produced, but large enough to ++ prevent flooding. ++ ++config GRKERNSEC_FLOODBURST ++ int "Number of messages in a burst (maximum)" ++ default 4 ++ help ++ This option allows you to choose the maximum number of messages allowed ++ within the flood time interval you chose in a separate option. The ++ default should be suitable for most people; however, if you find that ++ many of your logs are being interpreted as flooding, you may want to ++ raise this value. 
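The "Sysctl support" help above recommends a specific boot-time sequence: set every grsecurity tunable first, then set grsec_lock to a non-zero value so the entries become immutable. A minimal sketch of that sequence, assuming a kernel built with GRKERNSEC_SYSCTL=y and the sysctl names documented in this menu (an init script would typically do the same with echo):

    #include <stdio.h>

    /* Write a value to a sysctl file; returns 0 on success. */
    static int write_sysctl(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");
        if (!f) {
            perror(path);
            return -1;
        }
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        /* enable one tunable as an example ("tpe" per the TPE help text) */
        write_sysctl("/proc/sys/kernel/grsecurity/tpe", "1");
        /* lock the tree last; after this, no grsecurity entry can change */
        return write_sysctl("/proc/sys/kernel/grsecurity/grsec_lock", "1");
    }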
++ ++endmenu ++ ++endmenu +diff -urNp linux-2.6.33.1/grsecurity/Makefile linux-2.6.33.1/grsecurity/Makefile +--- linux-2.6.33.1/grsecurity/Makefile 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/grsecurity/Makefile 2010-03-20 16:58:41.900965882 -0400 +@@ -0,0 +1,29 @@ ++# grsecurity's ACL system was originally written in 2001 by Michael Dalton ++# during 2001-2009 it was completely redesigned by Brad Spengler ++# into an RBAC system ++# ++# All code in this directory and various hooks inserted throughout the kernel ++# are copyright Brad Spengler - Open Source Security, Inc., and released ++# under the GPL v2 or higher ++ ++obj-y = grsec_chdir.o grsec_chroot.o grsec_exec.o grsec_fifo.o grsec_fork.o \ ++ grsec_mount.o grsec_sig.o grsec_sock.o grsec_sysctl.o \ ++ grsec_time.o grsec_tpe.o grsec_link.o grsec_textrel.o grsec_ptrace.o ++ ++obj-$(CONFIG_GRKERNSEC) += grsec_init.o grsum.o gracl.o gracl_ip.o gracl_segv.o \ ++ gracl_cap.o gracl_alloc.o gracl_shm.o grsec_mem.o gracl_fs.o \ ++ gracl_learn.o grsec_log.o ++obj-$(CONFIG_GRKERNSEC_RESLOG) += gracl_res.o ++ ++ifndef CONFIG_GRKERNSEC ++obj-y += grsec_disabled.o ++endif ++ ++ifdef CONFIG_GRKERNSEC_HIDESYM ++extra-y := grsec_hidesym.o ++$(obj)/grsec_hidesym.o: ++ @-chmod -f 500 /boot ++ @-chmod -f 500 /lib/modules ++ @-chmod -f 700 . ++ @echo ' grsec: protected kernel image paths' ++endif +diff -urNp linux-2.6.33.1/include/acpi/acpi_drivers.h linux-2.6.33.1/include/acpi/acpi_drivers.h +--- linux-2.6.33.1/include/acpi/acpi_drivers.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/acpi/acpi_drivers.h 2010-03-20 16:58:41.900965882 -0400 +@@ -119,8 +119,8 @@ int acpi_processor_set_thermal_limit(acp + Dock Station + -------------------------------------------------------------------------- */ + struct acpi_dock_ops { +- acpi_notify_handler handler; +- acpi_notify_handler uevent; ++ const acpi_notify_handler handler; ++ const acpi_notify_handler uevent; + }; + + #if defined(CONFIG_ACPI_DOCK) || defined(CONFIG_ACPI_DOCK_MODULE) +@@ -128,7 +128,7 @@ extern int is_dock_device(acpi_handle ha + extern int register_dock_notifier(struct notifier_block *nb); + extern void unregister_dock_notifier(struct notifier_block *nb); + extern int register_hotplug_dock_device(acpi_handle handle, +- struct acpi_dock_ops *ops, ++ const struct acpi_dock_ops *ops, + void *context); + extern void unregister_hotplug_dock_device(acpi_handle handle); + #else +@@ -144,7 +144,7 @@ static inline void unregister_dock_notif + { + } + static inline int register_hotplug_dock_device(acpi_handle handle, +- struct acpi_dock_ops *ops, ++ const struct acpi_dock_ops *ops, + void *context) + { + return -ENODEV; +diff -urNp linux-2.6.33.1/include/asm-generic/atomic-long.h linux-2.6.33.1/include/asm-generic/atomic-long.h +--- linux-2.6.33.1/include/asm-generic/atomic-long.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/asm-generic/atomic-long.h 2010-03-20 16:58:41.900965882 -0400 +@@ -22,6 +22,12 @@ + + typedef atomic64_t atomic_long_t; + ++#ifdef CONFIG_PAX_REFCOUNT ++typedef atomic64_unchecked_t atomic_long_unchecked_t; ++#else ++typedef atomic64_t atomic_long_unchecked_t; ++#endif ++ + #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) + + static inline long atomic_long_read(atomic_long_t *l) +@@ -31,6 +37,15 @@ static inline long atomic_long_read(atom + return (long)atomic64_read(v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline long atomic_long_read_unchecked(atomic_long_unchecked_t *l) ++{ ++ atomic64_unchecked_t *v = 
(atomic64_unchecked_t *)l; ++ ++ return (long)atomic64_read_unchecked(v); ++} ++#endif ++ + static inline void atomic_long_set(atomic_long_t *l, long i) + { + atomic64_t *v = (atomic64_t *)l; +@@ -38,6 +53,15 @@ static inline void atomic_long_set(atomi + atomic64_set(v, i); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline void atomic_long_set_unchecked(atomic_long_unchecked_t *l, long i) ++{ ++ atomic64_unchecked_t *v = (atomic64_unchecked_t *)l; ++ ++ atomic64_set_unchecked(v, i); ++} ++#endif ++ + static inline void atomic_long_inc(atomic_long_t *l) + { + atomic64_t *v = (atomic64_t *)l; +@@ -45,6 +69,15 @@ static inline void atomic_long_inc(atomi + atomic64_inc(v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline void atomic_long_inc_unchecked(atomic_long_unchecked_t *l) ++{ ++ atomic64_unchecked_t *v = (atomic64_unchecked_t *)l; ++ ++ atomic64_inc_unchecked(v); ++} ++#endif ++ + static inline void atomic_long_dec(atomic_long_t *l) + { + atomic64_t *v = (atomic64_t *)l; +@@ -59,6 +92,15 @@ static inline void atomic_long_add(long + atomic64_add(i, v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline void atomic_long_add_unchecked(long i, atomic_long_unchecked_t *l) ++{ ++ atomic64_unchecked_t *v = (atomic64_unchecked_t *)l; ++ ++ atomic64_add_unchecked(i, v); ++} ++#endif ++ + static inline void atomic_long_sub(long i, atomic_long_t *l) + { + atomic64_t *v = (atomic64_t *)l; +@@ -115,6 +157,15 @@ static inline long atomic_long_inc_retur + return (long)atomic64_inc_return(v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline long atomic_long_inc_return_unchecked(atomic_long_unchecked_t *l) ++{ ++ atomic64_unchecked_t *v = (atomic64_unchecked_t *)l; ++ ++ return (long)atomic64_inc_return_unchecked(v); ++} ++#endif ++ + static inline long atomic_long_dec_return(atomic_long_t *l) + { + atomic64_t *v = (atomic64_t *)l; +@@ -140,6 +191,12 @@ static inline long atomic_long_add_unles + + typedef atomic_t atomic_long_t; + ++#ifdef CONFIG_PAX_REFCOUNT ++typedef atomic_unchecked_t atomic_long_unchecked_t; ++#else ++typedef atomic_t atomic_long_unchecked_t; ++#endif ++ + #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) + static inline long atomic_long_read(atomic_long_t *l) + { +@@ -148,6 +205,15 @@ static inline long atomic_long_read(atom + return (long)atomic_read(v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline long atomic_long_read_unchecked(atomic_long_unchecked_t *l) ++{ ++ atomic_unchecked_t *v = (atomic_unchecked_t *)l; ++ ++ return (long)atomic_read_unchecked(v); ++} ++#endif ++ + static inline void atomic_long_set(atomic_long_t *l, long i) + { + atomic_t *v = (atomic_t *)l; +@@ -155,6 +221,15 @@ static inline void atomic_long_set(atomi + atomic_set(v, i); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline void atomic_long_set_unchecked(atomic_long_unchecked_t *l, long i) ++{ ++ atomic_unchecked_t *v = (atomic_unchecked_t *)l; ++ ++ atomic_set_unchecked(v, i); ++} ++#endif ++ + static inline void atomic_long_inc(atomic_long_t *l) + { + atomic_t *v = (atomic_t *)l; +@@ -162,6 +237,15 @@ static inline void atomic_long_inc(atomi + atomic_inc(v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline void atomic_long_inc_unchecked(atomic_long_unchecked_t *l) ++{ ++ atomic_unchecked_t *v = (atomic_unchecked_t *)l; ++ ++ atomic_inc_unchecked(v); ++} ++#endif ++ + static inline void atomic_long_dec(atomic_long_t *l) + { + atomic_t *v = (atomic_t *)l; +@@ -176,6 +260,15 @@ static inline void atomic_long_add(long + atomic_add(i, v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline void 
atomic_long_add_unchecked(long i, atomic_long_unchecked_t *l) ++{ ++ atomic_unchecked_t *v = (atomic_unchecked_t *)l; ++ ++ atomic_add_unchecked(i, v); ++} ++#endif ++ + static inline void atomic_long_sub(long i, atomic_long_t *l) + { + atomic_t *v = (atomic_t *)l; +@@ -232,6 +325,15 @@ static inline long atomic_long_inc_retur + return (long)atomic_inc_return(v); + } + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline long atomic_long_inc_return_unchecked(atomic_long_unchecked_t *l) ++{ ++ atomic_unchecked_t *v = (atomic_unchecked_t *)l; ++ ++ return (long)atomic_inc_return_unchecked(v); ++} ++#endif ++ + static inline long atomic_long_dec_return(atomic_long_t *l) + { + atomic_t *v = (atomic_t *)l; +@@ -255,4 +357,33 @@ static inline long atomic_long_add_unles + + #endif /* BITS_PER_LONG == 64 */ + ++#ifdef CONFIG_PAX_REFCOUNT ++static inline void pax_refcount_needs_these_functions(void) ++{ ++ atomic_read_unchecked((atomic_unchecked_t *)NULL); ++ atomic_set_unchecked((atomic_unchecked_t *)NULL, 0); ++ atomic_add_unchecked(0, (atomic_unchecked_t *)NULL); ++ atomic_sub_unchecked(0, (atomic_unchecked_t *)NULL); ++ atomic_inc_unchecked((atomic_unchecked_t *)NULL); ++ ++ atomic_long_read_unchecked((atomic_long_unchecked_t *)NULL); ++ atomic_long_set_unchecked((atomic_long_unchecked_t *)NULL, 0); ++ atomic_long_add_unchecked(0, (atomic_long_unchecked_t *)NULL); ++ atomic_long_inc_unchecked((atomic_long_unchecked_t *)NULL); ++ atomic_long_inc_return_unchecked((atomic_long_unchecked_t *)NULL); ++} ++#else ++#define atomic_read_unchecked(v) atomic_read(v) ++#define atomic_set_unchecked(v, i) atomic_set((v), (i)) ++#define atomic_add_unchecked(i, v) atomic_add((i), (v)) ++#define atomic_sub_unchecked(i, v) atomic_sub((i), (v)) ++#define atomic_inc_unchecked(v) atomic_inc(v) ++ ++#define atomic_long_read_unchecked(v) atomic_long_read(v) ++#define atomic_long_set_unchecked(v, i) atomic_long_set((v), (i)) ++#define atomic_long_add_unchecked(i, v) atomic_long_add((i), (v)) ++#define atomic_long_inc_unchecked(v) atomic_long_inc(v) ++#define atomic_long_inc_return_unchecked(v) atomic_long_inc_return(v) ++#endif ++ + #endif /* _ASM_GENERIC_ATOMIC_LONG_H */ +diff -urNp linux-2.6.33.1/include/asm-generic/dma-mapping-common.h linux-2.6.33.1/include/asm-generic/dma-mapping-common.h +--- linux-2.6.33.1/include/asm-generic/dma-mapping-common.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/asm-generic/dma-mapping-common.h 2010-03-20 16:58:41.904583344 -0400 +@@ -11,7 +11,7 @@ static inline dma_addr_t dma_map_single_ + enum dma_data_direction dir, + struct dma_attrs *attrs) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(ptr, size); +@@ -30,7 +30,7 @@ static inline void dma_unmap_single_attr + enum dma_data_direction dir, + struct dma_attrs *attrs) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) +@@ -42,7 +42,7 @@ static inline int dma_map_sg_attrs(struc + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + int i, ents; + struct scatterlist *s; + +@@ -59,7 +59,7 @@ static inline void dma_unmap_sg_attrs(st + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = 
get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); +@@ -71,7 +71,7 @@ static inline dma_addr_t dma_map_page(st + size_t offset, size_t size, + enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(page_address(page) + offset, size); +@@ -85,7 +85,7 @@ static inline dma_addr_t dma_map_page(st + static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) +@@ -97,7 +97,7 @@ static inline void dma_sync_single_for_c + size_t size, + enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) +@@ -109,7 +109,7 @@ static inline void dma_sync_single_for_d + dma_addr_t addr, size_t size, + enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) +@@ -123,7 +123,7 @@ static inline void dma_sync_single_range + size_t size, + enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_range_for_cpu) { +@@ -140,7 +140,7 @@ static inline void dma_sync_single_range + size_t size, + enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_range_for_device) { +@@ -155,7 +155,7 @@ static inline void + dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_cpu) +@@ -167,7 +167,7 @@ static inline void + dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) + { +- struct dma_map_ops *ops = get_dma_ops(dev); ++ const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_device) +diff -urNp linux-2.6.33.1/include/asm-generic/futex.h linux-2.6.33.1/include/asm-generic/futex.h +--- linux-2.6.33.1/include/asm-generic/futex.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/asm-generic/futex.h 2010-03-20 16:58:41.904583344 -0400 +@@ -6,7 +6,7 @@ + #include <asm/errno.h> + + static inline int +-futex_atomic_op_inuser (int encoded_op, int __user *uaddr) ++futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) + { + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; +@@ -48,7 +48,7 @@ futex_atomic_op_inuser (int encoded_op, + } + + static inline int +-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) ++futex_atomic_cmpxchg_inatomic(u32 __user *uaddr, int oldval, int newval) + { + return -ENOSYS; + } +diff -urNp linux-2.6.33.1/include/asm-generic/int-l64.h linux-2.6.33.1/include/asm-generic/int-l64.h +--- linux-2.6.33.1/include/asm-generic/int-l64.h 2010-03-15 12:09:39.000000000 -0400 ++++ 
linux-2.6.33.1/include/asm-generic/int-l64.h 2010-03-20 16:58:41.904583344 -0400 +@@ -46,6 +46,8 @@ typedef unsigned int u32; + typedef signed long s64; + typedef unsigned long u64; + ++typedef unsigned int intoverflow_t __attribute__ ((mode(TI))); ++ + #define S8_C(x) x + #define U8_C(x) x ## U + #define S16_C(x) x +diff -urNp linux-2.6.33.1/include/asm-generic/int-ll64.h linux-2.6.33.1/include/asm-generic/int-ll64.h +--- linux-2.6.33.1/include/asm-generic/int-ll64.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/asm-generic/int-ll64.h 2010-03-20 16:58:41.904583344 -0400 +@@ -51,6 +51,8 @@ typedef unsigned int u32; + typedef signed long long s64; + typedef unsigned long long u64; + ++typedef unsigned long long intoverflow_t; ++ + #define S8_C(x) x + #define U8_C(x) x ## U + #define S16_C(x) x +diff -urNp linux-2.6.33.1/include/asm-generic/kmap_types.h linux-2.6.33.1/include/asm-generic/kmap_types.h +--- linux-2.6.33.1/include/asm-generic/kmap_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/asm-generic/kmap_types.h 2010-03-20 16:58:41.904583344 -0400 +@@ -28,7 +28,8 @@ KMAP_D(15) KM_UML_USERCOPY, + KMAP_D(16) KM_IRQ_PTE, + KMAP_D(17) KM_NMI, + KMAP_D(18) KM_NMI_PTE, +-KMAP_D(19) KM_TYPE_NR ++KMAP_D(19) KM_CLEARPAGE, ++KMAP_D(20) KM_TYPE_NR + }; + + #undef KMAP_D +diff -urNp linux-2.6.33.1/include/asm-generic/pgtable.h linux-2.6.33.1/include/asm-generic/pgtable.h +--- linux-2.6.33.1/include/asm-generic/pgtable.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/asm-generic/pgtable.h 2010-03-20 16:58:41.904583344 -0400 +@@ -344,6 +344,14 @@ extern void untrack_pfn_vma(struct vm_ar + unsigned long size); + #endif + ++#ifndef __HAVE_ARCH_PAX_OPEN_KERNEL ++static inline unsigned long pax_open_kernel(void) { return 0; } ++#endif ++ ++#ifndef __HAVE_ARCH_PAX_CLOSE_KERNEL ++static inline unsigned long pax_close_kernel(void) { return 0; } ++#endif ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* _ASM_GENERIC_PGTABLE_H */ +diff -urNp linux-2.6.33.1/include/asm-generic/vmlinux.lds.h linux-2.6.33.1/include/asm-generic/vmlinux.lds.h +--- linux-2.6.33.1/include/asm-generic/vmlinux.lds.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/asm-generic/vmlinux.lds.h 2010-03-20 16:58:41.904583344 -0400 +@@ -203,6 +203,7 @@ + .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_rodata) = .; \ + *(.rodata) *(.rodata.*) \ ++ *(.data.read_only) \ + *(__vermagic) /* Kernel version magic */ \ + *(__markers_strings) /* Markers: strings */ \ + *(__tracepoints_strings)/* Tracepoints: strings */ \ +@@ -660,22 +661,24 @@ + * section in the linker script will go there too. @phdr should have + * a leading colon. + * +- * Note that this macros defines __per_cpu_load as an absolute symbol. ++ * Note that this macros defines per_cpu_load as an absolute symbol. + * If there is no need to put the percpu section at a predetermined + * address, use PERCPU(). + */ + #define PERCPU_VADDR(vaddr, phdr) \ +- VMLINUX_SYMBOL(__per_cpu_load) = .; \ +- .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \ ++ per_cpu_load = .; \ ++ .data.percpu vaddr : AT(VMLINUX_SYMBOL(per_cpu_load) \ + - LOAD_OFFSET) { \ ++ VMLINUX_SYMBOL(__per_cpu_load) = . + per_cpu_load; \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ + *(.data.percpu.first) \ +- *(.data.percpu.page_aligned) \ + *(.data.percpu) \ ++ . = ALIGN(PAGE_SIZE); \ ++ *(.data.percpu.page_aligned) \ + *(.data.percpu.shared_aligned) \ + VMLINUX_SYMBOL(__per_cpu_end) = .; \ + } phdr \ +- . 
= VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu); ++ . = VMLINUX_SYMBOL(per_cpu_load) + SIZEOF(.data.percpu); + + /** + * PERCPU - define output section for percpu area, simple version +diff -urNp linux-2.6.33.1/include/drm/drm_pciids.h linux-2.6.33.1/include/drm/drm_pciids.h +--- linux-2.6.33.1/include/drm/drm_pciids.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/drm/drm_pciids.h 2010-03-20 16:58:41.904583344 -0400 +@@ -375,7 +375,7 @@ + {0x1002, 0x9712, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9713, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9714, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define r128_PCI_IDS \ + {0x1002, 0x4c45, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +@@ -415,14 +415,14 @@ + {0x1002, 0x5446, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x1002, 0x544C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x1002, 0x5452, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define mga_PCI_IDS \ + {0x102b, 0x0520, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G200}, \ + {0x102b, 0x0521, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G200}, \ + {0x102b, 0x0525, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G400}, \ + {0x102b, 0x2527, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MGA_CARD_TYPE_G550}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define mach64_PCI_IDS \ + {0x1002, 0x4749, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +@@ -445,7 +445,7 @@ + {0x1002, 0x4c53, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x1002, 0x4c4d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x1002, 0x4c4e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define sisdrv_PCI_IDS \ + {0x1039, 0x0300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +@@ -456,7 +456,7 @@ + {0x1039, 0x7300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x18CA, 0x0040, PCI_ANY_ID, PCI_ANY_ID, 0, 0, SIS_CHIP_315}, \ + {0x18CA, 0x0042, PCI_ANY_ID, PCI_ANY_ID, 0, 0, SIS_CHIP_315}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define tdfx_PCI_IDS \ + {0x121a, 0x0003, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +@@ -465,7 +465,7 @@ + {0x121a, 0x0007, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x121a, 0x0009, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x121a, 0x000b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define viadrv_PCI_IDS \ + {0x1106, 0x3022, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +@@ -477,14 +477,14 @@ + {0x1106, 0x3343, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x1106, 0x3230, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VIA_DX9_0}, \ + {0x1106, 0x3157, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VIA_PRO_GROUP_A}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define i810_PCI_IDS \ + {0x8086, 0x7121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x8086, 0x7123, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x8086, 0x7125, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x8086, 0x1132, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define i830_PCI_IDS \ + {0x8086, 0x3577, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +@@ -492,11 +492,11 @@ + {0x8086, 0x3582, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x8086, 0x2572, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ + {0x8086, 0x358e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define gamma_PCI_IDS \ + {0x3d3d, 0x0008, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define savage_PCI_IDS \ + {0x5333, 0x8a20, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_SAVAGE3D}, \ +@@ 
-522,10 +522,10 @@ + {0x5333, 0x8d02, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_TWISTER}, \ + {0x5333, 0x8d03, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_PROSAVAGEDDR}, \ + {0x5333, 0x8d04, PCI_ANY_ID, PCI_ANY_ID, 0, 0, S3_PROSAVAGEDDR}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define ffb_PCI_IDS \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} + + #define i915_PCI_IDS \ + {0x8086, 0x3577, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ +@@ -558,4 +558,4 @@ + {0x8086, 0x35e8, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ + {0x8086, 0x0042, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ + {0x8086, 0x0046, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA << 8, 0xffff00, 0}, \ +- {0, 0, 0} ++ {0, 0, 0, 0, 0, 0} +diff -urNp linux-2.6.33.1/include/drm/drmP.h linux-2.6.33.1/include/drm/drmP.h +--- linux-2.6.33.1/include/drm/drmP.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/drm/drmP.h 2010-03-20 16:58:41.904583344 -0400 +@@ -806,7 +806,7 @@ struct drm_driver { + void (*vgaarb_irq)(struct drm_device *dev, bool state); + + /* Driver private ops for this object */ +- struct vm_operations_struct *gem_vm_ops; ++ const struct vm_operations_struct *gem_vm_ops; + + int major; + int minor; +@@ -915,7 +915,7 @@ struct drm_device { + + /** \name Usage Counters */ + /*@{ */ +- int open_count; /**< Outstanding files open */ ++ atomic_t open_count; /**< Outstanding files open */ + atomic_t ioctl_count; /**< Outstanding IOCTLs pending */ + atomic_t vma_count; /**< Outstanding vma areas open */ + int buf_use; /**< Buffers in use -- cannot alloc */ +@@ -926,7 +926,7 @@ struct drm_device { + /*@{ */ + unsigned long counters; + enum drm_stat_type types[15]; +- atomic_t counts[15]; ++ atomic_unchecked_t counts[15]; + /*@} */ + + struct list_head filelist; +diff -urNp linux-2.6.33.1/include/linux/a.out.h linux-2.6.33.1/include/linux/a.out.h +--- linux-2.6.33.1/include/linux/a.out.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/a.out.h 2010-03-20 16:58:41.904583344 -0400 +@@ -39,6 +39,14 @@ enum machine_type { + M_MIPS2 = 152 /* MIPS R6000/R4000 binary */ + }; + ++/* Constants for the N_FLAGS field */ ++#define F_PAX_PAGEEXEC 1 /* Paging based non-executable pages */ ++#define F_PAX_EMUTRAMP 2 /* Emulate trampolines */ ++#define F_PAX_MPROTECT 4 /* Restrict mprotect() */ ++#define F_PAX_RANDMMAP 8 /* Randomize mmap() base */ ++/*#define F_PAX_RANDEXEC 16*/ /* Randomize ET_EXEC base */ ++#define F_PAX_SEGMEXEC 32 /* Segmentation based non-executable pages */ ++ + #if !defined (N_MAGIC) + #define N_MAGIC(exec) ((exec).a_info & 0xffff) + #endif +diff -urNp linux-2.6.33.1/include/linux/atmdev.h linux-2.6.33.1/include/linux/atmdev.h +--- linux-2.6.33.1/include/linux/atmdev.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/atmdev.h 2010-03-20 16:58:41.904583344 -0400 +@@ -237,7 +237,7 @@ struct compat_atm_iobuf { + #endif + + struct k_atm_aal_stats { +-#define __HANDLE_ITEM(i) atomic_t i ++#define __HANDLE_ITEM(i) atomic_unchecked_t i + __AAL_STAT_ITEMS + #undef __HANDLE_ITEM + }; +diff -urNp linux-2.6.33.1/include/linux/binfmts.h linux-2.6.33.1/include/linux/binfmts.h +--- linux-2.6.33.1/include/linux/binfmts.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/binfmts.h 2010-03-20 16:58:41.904583344 -0400 +@@ -86,6 +86,7 @@ struct linux_binfmt { + int (*load_binary)(struct linux_binprm *, struct pt_regs * regs); + int (*load_shlib)(struct file *); + int (*core_dump)(struct coredump_params *cprm); ++ void 
(*handle_mprotect)(struct vm_area_struct *vma, unsigned long newflags); + unsigned long min_coredump; /* minimal dump size */ + int hasvdso; + }; +diff -urNp linux-2.6.33.1/include/linux/blkdev.h linux-2.6.33.1/include/linux/blkdev.h +--- linux-2.6.33.1/include/linux/blkdev.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/blkdev.h 2010-03-20 16:58:41.908542968 -0400 +@@ -1287,19 +1287,19 @@ static inline int blk_integrity_rq(struc + #endif /* CONFIG_BLK_DEV_INTEGRITY */ + + struct block_device_operations { +- int (*open) (struct block_device *, fmode_t); +- int (*release) (struct gendisk *, fmode_t); +- int (*locked_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); +- int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); +- int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); +- int (*direct_access) (struct block_device *, sector_t, ++ int (* const open) (struct block_device *, fmode_t); ++ int (* const release) (struct gendisk *, fmode_t); ++ int (* const locked_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); ++ int (* const ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); ++ int (* const compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); ++ int (* const direct_access) (struct block_device *, sector_t, + void **, unsigned long *); +- int (*media_changed) (struct gendisk *); +- unsigned long long (*set_capacity) (struct gendisk *, ++ int (* const media_changed) (struct gendisk *); ++ unsigned long long (* const set_capacity) (struct gendisk *, + unsigned long long); +- int (*revalidate_disk) (struct gendisk *); +- int (*getgeo)(struct block_device *, struct hd_geometry *); +- struct module *owner; ++ int (* const revalidate_disk) (struct gendisk *); ++ int (* const getgeo)(struct block_device *, struct hd_geometry *); ++ struct module * const owner; + }; + + extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, +diff -urNp linux-2.6.33.1/include/linux/cache.h linux-2.6.33.1/include/linux/cache.h +--- linux-2.6.33.1/include/linux/cache.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/cache.h 2010-03-20 16:58:41.908542968 -0400 +@@ -16,6 +16,10 @@ + #define __read_mostly + #endif + ++#ifndef __read_only ++#define __read_only __read_mostly ++#endif ++ + #ifndef ____cacheline_aligned + #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) + #endif +diff -urNp linux-2.6.33.1/include/linux/capability.h linux-2.6.33.1/include/linux/capability.h +--- linux-2.6.33.1/include/linux/capability.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/capability.h 2010-03-20 16:58:41.908542968 -0400 +@@ -561,6 +561,7 @@ extern const kernel_cap_t __cap_init_eff + (security_real_capable_noaudit((t), (cap)) == 0) + + extern int capable(int cap); ++int capable_nolog(int cap); + + /* audit system wants to get cap info from files as well */ + struct dentry; +diff -urNp linux-2.6.33.1/include/linux/compiler-gcc4.h linux-2.6.33.1/include/linux/compiler-gcc4.h +--- linux-2.6.33.1/include/linux/compiler-gcc4.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/compiler-gcc4.h 2010-03-20 16:58:41.908542968 -0400 +@@ -50,6 +50,10 @@ + #define unreachable() __builtin_unreachable() + #endif + ++#define __alloc_size(...) 
__attribute((alloc_size(__VA_ARGS__))) ++#define __bos(ptr, arg) __builtin_object_size((ptr), (arg)) ++#define __bos0(ptr) __bos((ptr), 0) ++#define __bos1(ptr) __bos((ptr), 1) + #endif + + #if __GNUC_MINOR__ > 0 +diff -urNp linux-2.6.33.1/include/linux/compiler.h linux-2.6.33.1/include/linux/compiler.h +--- linux-2.6.33.1/include/linux/compiler.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/compiler.h 2010-03-20 16:58:41.908542968 -0400 +@@ -267,6 +267,22 @@ void ftrace_likely_update(struct ftrace_ + #define __cold + #endif + ++#ifndef __alloc_size ++#define __alloc_size ++#endif ++ ++#ifndef __bos ++#define __bos ++#endif ++ ++#ifndef __bos0 ++#define __bos0 ++#endif ++ ++#ifndef __bos1 ++#define __bos1 ++#endif ++ + /* Simple shorthand for a section definition */ + #ifndef __section + # define __section(S) __attribute__ ((__section__(#S))) +diff -urNp linux-2.6.33.1/include/linux/decompress/mm.h linux-2.6.33.1/include/linux/decompress/mm.h +--- linux-2.6.33.1/include/linux/decompress/mm.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/decompress/mm.h 2010-03-20 16:58:41.908542968 -0400 +@@ -68,7 +68,7 @@ static void free(void *where) + * warnings when not needed (indeed large_malloc / large_free are not + * needed by inflate */ + +-#define malloc(a) kmalloc(a, GFP_KERNEL) ++#define malloc(a) kmalloc((a), GFP_KERNEL) + #define free(a) kfree(a) + + #define large_malloc(a) vmalloc(a) +diff -urNp linux-2.6.33.1/include/linux/dma-mapping.h linux-2.6.33.1/include/linux/dma-mapping.h +--- linux-2.6.33.1/include/linux/dma-mapping.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/dma-mapping.h 2010-03-20 16:58:41.908542968 -0400 +@@ -16,50 +16,50 @@ enum dma_data_direction { + }; + + struct dma_map_ops { +- void* (*alloc_coherent)(struct device *dev, size_t size, ++ void* (* const alloc_coherent)(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp); +- void (*free_coherent)(struct device *dev, size_t size, ++ void (* const free_coherent)(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle); +- dma_addr_t (*map_page)(struct device *dev, struct page *page, ++ dma_addr_t (* const map_page)(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs); +- void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, ++ void (* const unmap_page)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs); +- int (*map_sg)(struct device *dev, struct scatterlist *sg, ++ int (* const map_sg)(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs); +- void (*unmap_sg)(struct device *dev, ++ void (* const unmap_sg)(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir, + struct dma_attrs *attrs); +- void (*sync_single_for_cpu)(struct device *dev, ++ void (* const sync_single_for_cpu)(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir); +- void (*sync_single_for_device)(struct device *dev, ++ void (* const sync_single_for_device)(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir); +- void (*sync_single_range_for_cpu)(struct device *dev, ++ void (* const sync_single_range_for_cpu)(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, + enum dma_data_direction dir); +- void 
(*sync_single_range_for_device)(struct device *dev, ++ void (* const sync_single_range_for_device)(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, + enum dma_data_direction dir); +- void (*sync_sg_for_cpu)(struct device *dev, ++ void (* const sync_sg_for_cpu)(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir); +- void (*sync_sg_for_device)(struct device *dev, ++ void (* const sync_sg_for_device)(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir); +- int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); +- int (*dma_supported)(struct device *dev, u64 mask); +- int (*set_dma_mask)(struct device *dev, u64 mask); +- int is_phys; ++ int (* const mapping_error)(struct device *dev, dma_addr_t dma_addr); ++ int (* const dma_supported)(struct device *dev, u64 mask); ++ int (* set_dma_mask)(struct device *dev, u64 mask); ++ const int is_phys; + }; + + #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) +diff -urNp linux-2.6.33.1/include/linux/elf.h linux-2.6.33.1/include/linux/elf.h +--- linux-2.6.33.1/include/linux/elf.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/elf.h 2010-03-20 16:58:41.908542968 -0400 +@@ -49,6 +49,17 @@ typedef __s64 Elf64_Sxword; + #define PT_GNU_EH_FRAME 0x6474e550 + + #define PT_GNU_STACK (PT_LOOS + 0x474e551) ++#define PT_GNU_RELRO (PT_LOOS + 0x474e552) ++ ++#define PT_PAX_FLAGS (PT_LOOS + 0x5041580) ++ ++/* Constants for the e_flags field */ ++#define EF_PAX_PAGEEXEC 1 /* Paging based non-executable pages */ ++#define EF_PAX_EMUTRAMP 2 /* Emulate trampolines */ ++#define EF_PAX_MPROTECT 4 /* Restrict mprotect() */ ++#define EF_PAX_RANDMMAP 8 /* Randomize mmap() base */ ++/*#define EF_PAX_RANDEXEC 16*/ /* Randomize ET_EXEC base */ ++#define EF_PAX_SEGMEXEC 32 /* Segmentation based non-executable pages */ + + /* These constants define the different elf file types */ + #define ET_NONE 0 +@@ -84,6 +95,8 @@ typedef __s64 Elf64_Sxword; + #define DT_DEBUG 21 + #define DT_TEXTREL 22 + #define DT_JMPREL 23 ++#define DT_FLAGS 30 ++ #define DF_TEXTREL 0x00000004 + #define DT_ENCODING 32 + #define OLD_DT_LOOS 0x60000000 + #define DT_LOOS 0x6000000d +@@ -230,6 +243,19 @@ typedef struct elf64_hdr { + #define PF_W 0x2 + #define PF_X 0x1 + ++#define PF_PAGEEXEC (1U << 4) /* Enable PAGEEXEC */ ++#define PF_NOPAGEEXEC (1U << 5) /* Disable PAGEEXEC */ ++#define PF_SEGMEXEC (1U << 6) /* Enable SEGMEXEC */ ++#define PF_NOSEGMEXEC (1U << 7) /* Disable SEGMEXEC */ ++#define PF_MPROTECT (1U << 8) /* Enable MPROTECT */ ++#define PF_NOMPROTECT (1U << 9) /* Disable MPROTECT */ ++/*#define PF_RANDEXEC (1U << 10)*/ /* Enable RANDEXEC */ ++/*#define PF_NORANDEXEC (1U << 11)*/ /* Disable RANDEXEC */ ++#define PF_EMUTRAMP (1U << 12) /* Enable EMUTRAMP */ ++#define PF_NOEMUTRAMP (1U << 13) /* Disable EMUTRAMP */ ++#define PF_RANDMMAP (1U << 14) /* Enable RANDMMAP */ ++#define PF_NORANDMMAP (1U << 15) /* Disable RANDMMAP */ ++ + typedef struct elf32_phdr{ + Elf32_Word p_type; + Elf32_Off p_offset; +@@ -322,6 +348,8 @@ typedef struct elf64_shdr { + #define EI_OSABI 7 + #define EI_PAD 8 + ++#define EI_PAX 14 ++ + #define ELFMAG0 0x7f /* EI_MAG */ + #define ELFMAG1 'E' + #define ELFMAG2 'L' +@@ -386,6 +414,7 @@ extern Elf32_Dyn _DYNAMIC []; + #define elf_phdr elf32_phdr + #define elf_note elf32_note + #define elf_addr_t Elf32_Off ++#define elf_dyn Elf32_Dyn + + #else + +@@ -394,6 +423,7 @@ extern Elf64_Dyn _DYNAMIC []; + #define elf_phdr elf64_phdr + 
#define elf_note elf64_note + #define elf_addr_t Elf64_Off ++#define elf_dyn Elf64_Dyn + + #endif + +diff -urNp linux-2.6.33.1/include/linux/fs.h linux-2.6.33.1/include/linux/fs.h +--- linux-2.6.33.1/include/linux/fs.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/fs.h 2010-03-20 16:59:37.096727099 -0400 +@@ -90,6 +90,11 @@ struct inodes_stat_t { + /* Expect random access pattern */ + #define FMODE_RANDOM ((__force fmode_t)4096) + ++/* Hack for grsec so as not to require read permission simply to execute ++ * a binary ++ */ ++#define FMODE_GREXEC ((__force fmode_t)8192) ++ + /* + * The below are the various read and write types that we support. Some of + * them include behavioral modifiers that send information down to the +@@ -570,41 +575,41 @@ typedef int (*read_actor_t)(read_descrip + unsigned long, unsigned long); + + struct address_space_operations { +- int (*writepage)(struct page *page, struct writeback_control *wbc); +- int (*readpage)(struct file *, struct page *); +- void (*sync_page)(struct page *); ++ int (* const writepage)(struct page *page, struct writeback_control *wbc); ++ int (* const readpage)(struct file *, struct page *); ++ void (* const sync_page)(struct page *); + + /* Write back some dirty pages from this mapping. */ +- int (*writepages)(struct address_space *, struct writeback_control *); ++ int (* const writepages)(struct address_space *, struct writeback_control *); + + /* Set a page dirty. Return true if this dirtied it */ +- int (*set_page_dirty)(struct page *page); ++ int (* const set_page_dirty)(struct page *page); + +- int (*readpages)(struct file *filp, struct address_space *mapping, ++ int (* const readpages)(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + +- int (*write_begin)(struct file *, struct address_space *mapping, ++ int (* const write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +- int (*write_end)(struct file *, struct address_space *mapping, ++ int (* const write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + + /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ +- sector_t (*bmap)(struct address_space *, sector_t); +- void (*invalidatepage) (struct page *, unsigned long); +- int (*releasepage) (struct page *, gfp_t); +- ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, ++ sector_t (* const bmap)(struct address_space *, sector_t); ++ void (* const invalidatepage) (struct page *, unsigned long); ++ int (* const releasepage) (struct page *, gfp_t); ++ ssize_t (* const direct_IO)(int, struct kiocb *, const struct iovec *iov, + loff_t offset, unsigned long nr_segs); +- int (*get_xip_mem)(struct address_space *, pgoff_t, int, ++ int (* const get_xip_mem)(struct address_space *, pgoff_t, int, + void **, unsigned long *); + /* migrate the contents of a page to the specified target */ +- int (*migratepage) (struct address_space *, ++ int (* const migratepage) (struct address_space *, + struct page *, struct page *); +- int (*launder_page) (struct page *); +- int (*is_partially_uptodate) (struct page *, read_descriptor_t *, ++ int (* const launder_page) (struct page *); ++ int (* const is_partially_uptodate) (struct page *, read_descriptor_t *, + unsigned long); +- int (*error_remove_page)(struct address_space *, struct page *); ++ int (* const error_remove_page)(struct address_space *, struct page *); + }; + + /* +@@ -1032,19 +1037,19 @@ static inline int file_check_writeable(s + typedef struct files_struct *fl_owner_t; + + struct file_lock_operations { +- void (*fl_copy_lock)(struct file_lock *, struct file_lock *); +- void (*fl_release_private)(struct file_lock *); ++ void (* const fl_copy_lock)(struct file_lock *, struct file_lock *); ++ void (* const fl_release_private)(struct file_lock *); + }; + + struct lock_manager_operations { +- int (*fl_compare_owner)(struct file_lock *, struct file_lock *); +- void (*fl_notify)(struct file_lock *); /* unblock callback */ +- int (*fl_grant)(struct file_lock *, struct file_lock *, int); +- void (*fl_copy_lock)(struct file_lock *, struct file_lock *); +- void (*fl_release_private)(struct file_lock *); +- void (*fl_break)(struct file_lock *); +- int (*fl_mylease)(struct file_lock *, struct file_lock *); +- int (*fl_change)(struct file_lock **, int); ++ int (* const fl_compare_owner)(struct file_lock *, struct file_lock *); ++ void (* const fl_notify)(struct file_lock *); /* unblock callback */ ++ int (* const fl_grant)(struct file_lock *, struct file_lock *, int); ++ void (* const fl_copy_lock)(struct file_lock *, struct file_lock *); ++ void (* const fl_release_private)(struct file_lock *); ++ void (* const fl_break)(struct file_lock *); ++ int (* const fl_mylease)(struct file_lock *, struct file_lock *); ++ int (* const fl_change)(struct file_lock **, int); + }; + + struct lock_manager { +@@ -1437,7 +1442,7 @@ struct fiemap_extent_info { + unsigned int fi_flags; /* Flags as passed from user */ + unsigned int fi_extents_mapped; /* Number of mapped extents */ + unsigned int fi_extents_max; /* Size of fiemap_extent array */ +- struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent ++ struct fiemap_extent __user *fi_extents_start; /* Start of fiemap_extent + * array */ + }; + int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, +@@ -1554,30 +1559,30 @@ extern ssize_t vfs_writev(struct file *, + unsigned long, loff_t *); + + struct super_operations { +- struct inode *(*alloc_inode)(struct super_block *sb); +- void (*destroy_inode)(struct inode *); ++ struct inode *(* const alloc_inode)(struct super_block *sb); ++ void (* const destroy_inode)(struct 
inode *); + +- void (*dirty_inode) (struct inode *); +- int (*write_inode) (struct inode *, int); +- void (*drop_inode) (struct inode *); +- void (*delete_inode) (struct inode *); +- void (*put_super) (struct super_block *); +- void (*write_super) (struct super_block *); +- int (*sync_fs)(struct super_block *sb, int wait); +- int (*freeze_fs) (struct super_block *); +- int (*unfreeze_fs) (struct super_block *); +- int (*statfs) (struct dentry *, struct kstatfs *); +- int (*remount_fs) (struct super_block *, int *, char *); +- void (*clear_inode) (struct inode *); +- void (*umount_begin) (struct super_block *); ++ void (* const dirty_inode) (struct inode *); ++ int (* const write_inode) (struct inode *, int); ++ void (* const drop_inode) (struct inode *); ++ void (* const delete_inode) (struct inode *); ++ void (* const put_super) (struct super_block *); ++ void (* const write_super) (struct super_block *); ++ int (* const sync_fs)(struct super_block *sb, int wait); ++ int (* const freeze_fs) (struct super_block *); ++ int (* const unfreeze_fs) (struct super_block *); ++ int (* const statfs) (struct dentry *, struct kstatfs *); ++ int (* const remount_fs) (struct super_block *, int *, char *); ++ void (* const clear_inode) (struct inode *); ++ void (* const umount_begin) (struct super_block *); + +- int (*show_options)(struct seq_file *, struct vfsmount *); +- int (*show_stats)(struct seq_file *, struct vfsmount *); ++ int (* const show_options)(struct seq_file *, struct vfsmount *); ++ int (* const show_stats)(struct seq_file *, struct vfsmount *); + #ifdef CONFIG_QUOTA +- ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); +- ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); ++ ssize_t (* const quota_read)(struct super_block *, int, char *, size_t, loff_t); ++ ssize_t (* const quota_write)(struct super_block *, int, const char *, size_t, loff_t); + #endif +- int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); ++ int (* const bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); + }; + + /* +diff -urNp linux-2.6.33.1/include/linux/fs_struct.h linux-2.6.33.1/include/linux/fs_struct.h +--- linux-2.6.33.1/include/linux/fs_struct.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/fs_struct.h 2010-03-20 16:58:41.908542968 -0400 +@@ -4,7 +4,7 @@ + #include <linux/path.h> + + struct fs_struct { +- int users; ++ atomic_t users; + rwlock_t lock; + int umask; + int in_exec; +diff -urNp linux-2.6.33.1/include/linux/genhd.h linux-2.6.33.1/include/linux/genhd.h +--- linux-2.6.33.1/include/linux/genhd.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/genhd.h 2010-03-20 16:58:41.912517036 -0400 +@@ -162,7 +162,7 @@ struct gendisk { + + struct timer_rand_state *random; + +- atomic_t sync_io; /* RAID */ ++ atomic_unchecked_t sync_io; /* RAID */ + struct work_struct async_notify; + #ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *integrity; +diff -urNp linux-2.6.33.1/include/linux/gracl.h linux-2.6.33.1/include/linux/gracl.h +--- linux-2.6.33.1/include/linux/gracl.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/include/linux/gracl.h 2010-03-20 16:58:41.912517036 -0400 +@@ -0,0 +1,309 @@ ++#ifndef GR_ACL_H ++#define GR_ACL_H ++ ++#include <linux/grdefs.h> ++#include <linux/resource.h> ++#include <linux/capability.h> ++#include <linux/dcache.h> ++#include <asm/resource.h> ++ ++/* Major status information */ ++ ++#define GR_VERSION "grsecurity 2.1.14" ++#define 
GRSECURITY_VERSION 0x2114 ++ ++enum { ++ GR_SHUTDOWN = 0, ++ GR_ENABLE = 1, ++ GR_SPROLE = 2, ++ GR_RELOAD = 3, ++ GR_SEGVMOD = 4, ++ GR_STATUS = 5, ++ GR_UNSPROLE = 6, ++ GR_PASSSET = 7, ++ GR_SPROLEPAM = 8, ++}; ++ ++/* Password setup definitions ++ * kernel/grhash.c */ ++enum { ++ GR_PW_LEN = 128, ++ GR_SALT_LEN = 16, ++ GR_SHA_LEN = 32, ++}; ++ ++enum { ++ GR_SPROLE_LEN = 64, ++}; ++ ++#define GR_NLIMITS 32 ++ ++/* Begin Data Structures */ ++ ++struct sprole_pw { ++ unsigned char *rolename; ++ unsigned char salt[GR_SALT_LEN]; ++ unsigned char sum[GR_SHA_LEN]; /* 256-bit SHA hash of the password */ ++}; ++ ++struct name_entry { ++ __u32 key; ++ ino_t inode; ++ dev_t device; ++ char *name; ++ __u16 len; ++ __u8 deleted; ++ struct name_entry *prev; ++ struct name_entry *next; ++}; ++ ++struct inodev_entry { ++ struct name_entry *nentry; ++ struct inodev_entry *prev; ++ struct inodev_entry *next; ++}; ++ ++struct acl_role_db { ++ struct acl_role_label **r_hash; ++ __u32 r_size; ++}; ++ ++struct inodev_db { ++ struct inodev_entry **i_hash; ++ __u32 i_size; ++}; ++ ++struct name_db { ++ struct name_entry **n_hash; ++ __u32 n_size; ++}; ++ ++struct crash_uid { ++ uid_t uid; ++ unsigned long expires; ++}; ++ ++struct gr_hash_struct { ++ void **table; ++ void **nametable; ++ void *first; ++ __u32 table_size; ++ __u32 used_size; ++ int type; ++}; ++ ++/* Userspace Grsecurity ACL data structures */ ++ ++struct acl_subject_label { ++ char *filename; ++ ino_t inode; ++ dev_t device; ++ __u32 mode; ++ kernel_cap_t cap_mask; ++ kernel_cap_t cap_lower; ++ ++ struct rlimit res[GR_NLIMITS]; ++ __u32 resmask; ++ ++ __u8 user_trans_type; ++ __u8 group_trans_type; ++ uid_t *user_transitions; ++ gid_t *group_transitions; ++ __u16 user_trans_num; ++ __u16 group_trans_num; ++ ++ __u32 ip_proto[8]; ++ __u32 ip_type; ++ struct acl_ip_label **ips; ++ __u32 ip_num; ++ __u32 inaddr_any_override; ++ ++ __u32 crashes; ++ unsigned long expires; ++ ++ struct acl_subject_label *parent_subject; ++ struct gr_hash_struct *hash; ++ struct acl_subject_label *prev; ++ struct acl_subject_label *next; ++ ++ struct acl_object_label **obj_hash; ++ __u32 obj_hash_size; ++ __u16 pax_flags; ++}; ++ ++struct role_allowed_ip { ++ __u32 addr; ++ __u32 netmask; ++ ++ struct role_allowed_ip *prev; ++ struct role_allowed_ip *next; ++}; ++ ++struct role_transition { ++ char *rolename; ++ ++ struct role_transition *prev; ++ struct role_transition *next; ++}; ++ ++struct acl_role_label { ++ char *rolename; ++ uid_t uidgid; ++ __u16 roletype; ++ ++ __u16 auth_attempts; ++ unsigned long expires; ++ ++ struct acl_subject_label *root_label; ++ struct gr_hash_struct *hash; ++ ++ struct acl_role_label *prev; ++ struct acl_role_label *next; ++ ++ struct role_transition *transitions; ++ struct role_allowed_ip *allowed_ips; ++ uid_t *domain_children; ++ __u16 domain_child_num; ++ ++ struct acl_subject_label **subj_hash; ++ __u32 subj_hash_size; ++}; ++ ++struct user_acl_role_db { ++ struct acl_role_label **r_table; ++ __u32 num_pointers; /* Number of allocations to track */ ++ __u32 num_roles; /* Number of roles */ ++ __u32 num_domain_children; /* Number of domain children */ ++ __u32 num_subjects; /* Number of subjects */ ++ __u32 num_objects; /* Number of objects */ ++}; ++ ++struct acl_object_label { ++ char *filename; ++ ino_t inode; ++ dev_t device; ++ __u32 mode; ++ ++ struct acl_subject_label *nested; ++ struct acl_object_label *globbed; ++ ++ /* next two structures not used */ ++ ++ struct acl_object_label *prev; ++ struct acl_object_label 
*next; ++}; ++ ++struct acl_ip_label { ++ char *iface; ++ __u32 addr; ++ __u32 netmask; ++ __u16 low, high; ++ __u8 mode; ++ __u32 type; ++ __u32 proto[8]; ++ ++ /* next two structures not used */ ++ ++ struct acl_ip_label *prev; ++ struct acl_ip_label *next; ++}; ++ ++struct gr_arg { ++ struct user_acl_role_db role_db; ++ unsigned char pw[GR_PW_LEN]; ++ unsigned char salt[GR_SALT_LEN]; ++ unsigned char sum[GR_SHA_LEN]; ++ unsigned char sp_role[GR_SPROLE_LEN]; ++ struct sprole_pw *sprole_pws; ++ dev_t segv_device; ++ ino_t segv_inode; ++ uid_t segv_uid; ++ __u16 num_sprole_pws; ++ __u16 mode; ++}; ++ ++struct gr_arg_wrapper { ++ struct gr_arg *arg; ++ __u32 version; ++ __u32 size; ++}; ++ ++struct subject_map { ++ struct acl_subject_label *user; ++ struct acl_subject_label *kernel; ++ struct subject_map *prev; ++ struct subject_map *next; ++}; ++ ++struct acl_subj_map_db { ++ struct subject_map **s_hash; ++ __u32 s_size; ++}; ++ ++/* End Data Structures Section */ ++ ++/* Hash functions generated by empirical testing by Brad Spengler ++ Makes good use of the low bits of the inode. Generally 0-1 times ++ in loop for successful match. 0-3 for unsuccessful match. ++ Shift/add algorithm with modulus of table size and an XOR*/ ++ ++static __inline__ unsigned int ++rhash(const uid_t uid, const __u16 type, const unsigned int sz) ++{ ++ return ((((uid + type) << (16 + type)) ^ uid) % sz); ++} ++ ++ static __inline__ unsigned int ++shash(const struct acl_subject_label *userp, const unsigned int sz) ++{ ++ return ((const unsigned long)userp % sz); ++} ++ ++static __inline__ unsigned int ++fhash(const ino_t ino, const dev_t dev, const unsigned int sz) ++{ ++ return (((ino + dev) ^ ((ino << 13) + (ino << 23) + (dev << 9))) % sz); ++} ++ ++static __inline__ unsigned int ++nhash(const char *name, const __u16 len, const unsigned int sz) ++{ ++ return full_name_hash((const unsigned char *)name, len) % sz; ++} ++ ++#define FOR_EACH_ROLE_START(role) \ ++ role = role_list; \ ++ while (role) { ++ ++#define FOR_EACH_ROLE_END(role) \ ++ role = role->prev; \ ++ } ++ ++#define FOR_EACH_SUBJECT_START(role,subj,iter) \ ++ subj = NULL; \ ++ iter = 0; \ ++ while (iter < role->subj_hash_size) { \ ++ if (subj == NULL) \ ++ subj = role->subj_hash[iter]; \ ++ if (subj == NULL) { \ ++ iter++; \ ++ continue; \ ++ } ++ ++#define FOR_EACH_SUBJECT_END(subj,iter) \ ++ subj = subj->next; \ ++ if (subj == NULL) \ ++ iter++; \ ++ } ++ ++ ++#define FOR_EACH_NESTED_SUBJECT_START(role,subj) \ ++ subj = role->hash->first; \ ++ while (subj != NULL) { ++ ++#define FOR_EACH_NESTED_SUBJECT_END(subj) \ ++ subj = subj->next; \ ++ } ++ ++#endif ++ +diff -urNp linux-2.6.33.1/include/linux/gralloc.h linux-2.6.33.1/include/linux/gralloc.h +--- linux-2.6.33.1/include/linux/gralloc.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/include/linux/gralloc.h 2010-03-20 16:58:41.912517036 -0400 +@@ -0,0 +1,9 @@ ++#ifndef __GRALLOC_H ++#define __GRALLOC_H ++ ++void acl_free_all(void); ++int acl_alloc_stack_init(unsigned long size); ++void *acl_alloc(unsigned long len); ++void *acl_alloc_num(unsigned long num, unsigned long len); ++ ++#endif +diff -urNp linux-2.6.33.1/include/linux/grdefs.h linux-2.6.33.1/include/linux/grdefs.h +--- linux-2.6.33.1/include/linux/grdefs.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/include/linux/grdefs.h 2010-03-20 16:58:41.912517036 -0400 +@@ -0,0 +1,136 @@ ++#ifndef GRDEFS_H ++#define GRDEFS_H ++ ++/* Begin grsecurity status declarations */ ++ ++enum { ++ GR_READY = 0x01, ++ GR_STATUS_INIT = 0x00 // 
disabled state ++}; ++ ++/* Begin ACL declarations */ ++ ++/* Role flags */ ++ ++enum { ++ GR_ROLE_USER = 0x0001, ++ GR_ROLE_GROUP = 0x0002, ++ GR_ROLE_DEFAULT = 0x0004, ++ GR_ROLE_SPECIAL = 0x0008, ++ GR_ROLE_AUTH = 0x0010, ++ GR_ROLE_NOPW = 0x0020, ++ GR_ROLE_GOD = 0x0040, ++ GR_ROLE_LEARN = 0x0080, ++ GR_ROLE_TPE = 0x0100, ++ GR_ROLE_DOMAIN = 0x0200, ++ GR_ROLE_PAM = 0x0400 ++}; ++ ++/* ACL Subject and Object mode flags */ ++enum { ++ GR_DELETED = 0x80000000 ++}; ++ ++/* ACL Object-only mode flags */ ++enum { ++ GR_READ = 0x00000001, ++ GR_APPEND = 0x00000002, ++ GR_WRITE = 0x00000004, ++ GR_EXEC = 0x00000008, ++ GR_FIND = 0x00000010, ++ GR_INHERIT = 0x00000020, ++ GR_SETID = 0x00000040, ++ GR_CREATE = 0x00000080, ++ GR_DELETE = 0x00000100, ++ GR_LINK = 0x00000200, ++ GR_AUDIT_READ = 0x00000400, ++ GR_AUDIT_APPEND = 0x00000800, ++ GR_AUDIT_WRITE = 0x00001000, ++ GR_AUDIT_EXEC = 0x00002000, ++ GR_AUDIT_FIND = 0x00004000, ++ GR_AUDIT_INHERIT= 0x00008000, ++ GR_AUDIT_SETID = 0x00010000, ++ GR_AUDIT_CREATE = 0x00020000, ++ GR_AUDIT_DELETE = 0x00040000, ++ GR_AUDIT_LINK = 0x00080000, ++ GR_PTRACERD = 0x00100000, ++ GR_NOPTRACE = 0x00200000, ++ GR_SUPPRESS = 0x00400000, ++ GR_NOLEARN = 0x00800000 ++}; ++ ++#define GR_AUDITS (GR_AUDIT_READ | GR_AUDIT_WRITE | GR_AUDIT_APPEND | GR_AUDIT_EXEC | \ ++ GR_AUDIT_FIND | GR_AUDIT_INHERIT | GR_AUDIT_SETID | \ ++ GR_AUDIT_CREATE | GR_AUDIT_DELETE | GR_AUDIT_LINK) ++ ++/* ACL subject-only mode flags */ ++enum { ++ GR_KILL = 0x00000001, ++ GR_VIEW = 0x00000002, ++ GR_PROTECTED = 0x00000004, ++ GR_LEARN = 0x00000008, ++ GR_OVERRIDE = 0x00000010, ++ /* just a placeholder, this mode is only used in userspace */ ++ GR_DUMMY = 0x00000020, ++ GR_PROTSHM = 0x00000040, ++ GR_KILLPROC = 0x00000080, ++ GR_KILLIPPROC = 0x00000100, ++ /* just a placeholder, this mode is only used in userspace */ ++ GR_NOTROJAN = 0x00000200, ++ GR_PROTPROCFD = 0x00000400, ++ GR_PROCACCT = 0x00000800, ++ GR_RELAXPTRACE = 0x00001000, ++ GR_NESTED = 0x00002000, ++ GR_INHERITLEARN = 0x00004000, ++ GR_PROCFIND = 0x00008000, ++ GR_POVERRIDE = 0x00010000, ++ GR_KERNELAUTH = 0x00020000, ++}; ++ ++enum { ++ GR_PAX_ENABLE_SEGMEXEC = 0x0001, ++ GR_PAX_ENABLE_PAGEEXEC = 0x0002, ++ GR_PAX_ENABLE_MPROTECT = 0x0004, ++ GR_PAX_ENABLE_RANDMMAP = 0x0008, ++ GR_PAX_ENABLE_EMUTRAMP = 0x0010, ++ GR_PAX_DISABLE_SEGMEXEC = 0x0100, ++ GR_PAX_DISABLE_PAGEEXEC = 0x0200, ++ GR_PAX_DISABLE_MPROTECT = 0x0400, ++ GR_PAX_DISABLE_RANDMMAP = 0x0800, ++ GR_PAX_DISABLE_EMUTRAMP = 0x1000, ++}; ++ ++enum { ++ GR_ID_USER = 0x01, ++ GR_ID_GROUP = 0x02, ++}; ++ ++enum { ++ GR_ID_ALLOW = 0x01, ++ GR_ID_DENY = 0x02, ++}; ++ ++#define GR_CRASH_RES 31 ++#define GR_UIDTABLE_MAX 500 ++ ++/* begin resource learning section */ ++enum { ++ GR_RLIM_CPU_BUMP = 60, ++ GR_RLIM_FSIZE_BUMP = 50000, ++ GR_RLIM_DATA_BUMP = 10000, ++ GR_RLIM_STACK_BUMP = 1000, ++ GR_RLIM_CORE_BUMP = 10000, ++ GR_RLIM_RSS_BUMP = 500000, ++ GR_RLIM_NPROC_BUMP = 1, ++ GR_RLIM_NOFILE_BUMP = 5, ++ GR_RLIM_MEMLOCK_BUMP = 50000, ++ GR_RLIM_AS_BUMP = 500000, ++ GR_RLIM_LOCKS_BUMP = 2, ++ GR_RLIM_SIGPENDING_BUMP = 5, ++ GR_RLIM_MSGQUEUE_BUMP = 10000, ++ GR_RLIM_NICE_BUMP = 1, ++ GR_RLIM_RTPRIO_BUMP = 1, ++ GR_RLIM_RTTIME_BUMP = 1000000 ++}; ++ ++#endif +diff -urNp linux-2.6.33.1/include/linux/grinternal.h linux-2.6.33.1/include/linux/grinternal.h +--- linux-2.6.33.1/include/linux/grinternal.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/include/linux/grinternal.h 2010-03-20 17:34:52.485855710 -0400 +@@ -0,0 +1,215 @@ ++#ifndef __GRINTERNAL_H ++#define 
__GRINTERNAL_H ++ ++#ifdef CONFIG_GRKERNSEC ++ ++#include <linux/fs.h> ++#include <linux/mnt_namespace.h> ++#include <linux/nsproxy.h> ++#include <linux/gracl.h> ++#include <linux/grdefs.h> ++#include <linux/grmsg.h> ++ ++void gr_add_learn_entry(const char *fmt, ...) ++ __attribute__ ((format (printf, 1, 2))); ++__u32 gr_search_file(const struct dentry *dentry, const __u32 mode, ++ const struct vfsmount *mnt); ++__u32 gr_check_create(const struct dentry *new_dentry, ++ const struct dentry *parent, ++ const struct vfsmount *mnt, const __u32 mode); ++int gr_check_protected_task(const struct task_struct *task); ++__u32 to_gr_audit(const __u32 reqmode); ++int gr_set_acls(const int type); ++ ++int gr_acl_is_enabled(void); ++char gr_roletype_to_char(void); ++ ++void gr_handle_alertkill(struct task_struct *task); ++char *gr_to_filename(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++char *gr_to_filename1(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++char *gr_to_filename2(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++char *gr_to_filename3(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++ ++extern int grsec_enable_harden_ptrace; ++extern int grsec_enable_link; ++extern int grsec_enable_fifo; ++extern int grsec_enable_execve; ++extern int grsec_enable_shm; ++extern int grsec_enable_execlog; ++extern int grsec_enable_signal; ++extern int grsec_enable_audit_ptrace; ++extern int grsec_enable_forkfail; ++extern int grsec_enable_time; ++extern int grsec_enable_rofs; ++extern int grsec_enable_chroot_shmat; ++extern int grsec_enable_chroot_findtask; ++extern int grsec_enable_chroot_mount; ++extern int grsec_enable_chroot_double; ++extern int grsec_enable_chroot_pivot; ++extern int grsec_enable_chroot_chdir; ++extern int grsec_enable_chroot_chmod; ++extern int grsec_enable_chroot_mknod; ++extern int grsec_enable_chroot_fchdir; ++extern int grsec_enable_chroot_nice; ++extern int grsec_enable_chroot_execlog; ++extern int grsec_enable_chroot_caps; ++extern int grsec_enable_chroot_sysctl; ++extern int grsec_enable_chroot_unix; ++extern int grsec_enable_tpe; ++extern int grsec_tpe_gid; ++extern int grsec_enable_tpe_all; ++extern int grsec_enable_sidcaps; ++extern int grsec_enable_socket_all; ++extern int grsec_socket_all_gid; ++extern int grsec_enable_socket_client; ++extern int grsec_socket_client_gid; ++extern int grsec_enable_socket_server; ++extern int grsec_socket_server_gid; ++extern int grsec_audit_gid; ++extern int grsec_enable_group; ++extern int grsec_enable_audit_textrel; ++extern int grsec_enable_mount; ++extern int grsec_enable_chdir; ++extern int grsec_resource_logging; ++extern int grsec_enable_blackhole; ++extern int grsec_lastack_retries; ++extern int grsec_lock; ++ ++extern spinlock_t grsec_alert_lock; ++extern unsigned long grsec_alert_wtime; ++extern unsigned long grsec_alert_fyet; ++ ++extern spinlock_t grsec_audit_lock; ++ ++extern rwlock_t grsec_exec_file_lock; ++ ++#define gr_task_fullpath(tsk) (tsk->exec_file ? \ ++ gr_to_filename2(tsk->exec_file->f_path.dentry, \ ++ tsk->exec_file->f_vfsmnt) : "/") ++ ++#define gr_parent_task_fullpath(tsk) (tsk->parent->exec_file ? \ ++ gr_to_filename3(tsk->parent->exec_file->f_path.dentry, \ ++ tsk->parent->exec_file->f_vfsmnt) : "/") ++ ++#define gr_task_fullpath0(tsk) (tsk->exec_file ? \ ++ gr_to_filename(tsk->exec_file->f_path.dentry, \ ++ tsk->exec_file->f_vfsmnt) : "/") ++ ++#define gr_parent_task_fullpath0(tsk) (tsk->parent->exec_file ? 
\ ++ gr_to_filename1(tsk->parent->exec_file->f_path.dentry, \ ++ tsk->parent->exec_file->f_vfsmnt) : "/") ++ ++#define proc_is_chrooted(tsk_a) ((tsk_a->pid > 1) && (tsk_a->fs != NULL) && \ ++ ((init_task.fs->root.dentry != tsk_a->fs->root.dentry) && \ ++ (tsk_a->nsproxy->mnt_ns->root->mnt_root != \ ++ tsk_a->fs->root.dentry))) ++ ++#define have_same_root(tsk_a,tsk_b) ((tsk_a->fs != NULL) && (tsk_b->fs != NULL) && \ ++ (tsk_a->fs->root.dentry == tsk_b->fs->root.dentry)) ++ ++#define DEFAULTSECARGS(task, cred, pcred) gr_task_fullpath(task), task->comm, \ ++ task->pid, cred->uid, \ ++ cred->euid, cred->gid, cred->egid, \ ++ gr_parent_task_fullpath(task), \ ++ task->parent->comm, task->parent->pid, \ ++ pcred->uid, pcred->euid, \ ++ pcred->gid, pcred->egid ++ ++#define GR_CHROOT_CAPS {{ \ ++ CAP_TO_MASK(CAP_LINUX_IMMUTABLE) | CAP_TO_MASK(CAP_NET_ADMIN) | \ ++ CAP_TO_MASK(CAP_SYS_MODULE) | CAP_TO_MASK(CAP_SYS_RAWIO) | \ ++ CAP_TO_MASK(CAP_SYS_PACCT) | CAP_TO_MASK(CAP_SYS_ADMIN) | \ ++ CAP_TO_MASK(CAP_SYS_BOOT) | CAP_TO_MASK(CAP_SYS_TIME) | \ ++ CAP_TO_MASK(CAP_NET_RAW) | CAP_TO_MASK(CAP_SYS_TTY_CONFIG) | \ ++ CAP_TO_MASK(CAP_IPC_OWNER) , 0 }} ++ ++#define security_learn(normal_msg,args...) \ ++({ \ ++ read_lock(&grsec_exec_file_lock); \ ++ gr_add_learn_entry(normal_msg "\n", ## args); \ ++ read_unlock(&grsec_exec_file_lock); \ ++}) ++ ++enum { ++ GR_DO_AUDIT, ++ GR_DONT_AUDIT, ++ GR_DONT_AUDIT_GOOD ++}; ++ ++enum { ++ GR_TTYSNIFF, ++ GR_RBAC, ++ GR_RBAC_STR, ++ GR_STR_RBAC, ++ GR_RBAC_MODE2, ++ GR_RBAC_MODE3, ++ GR_FILENAME, ++ GR_SYSCTL_HIDDEN, ++ GR_NOARGS, ++ GR_ONE_INT, ++ GR_ONE_INT_TWO_STR, ++ GR_ONE_STR, ++ GR_STR_INT, ++ GR_TWO_INT, ++ GR_THREE_INT, ++ GR_FIVE_INT_TWO_STR, ++ GR_TWO_STR, ++ GR_THREE_STR, ++ GR_FOUR_STR, ++ GR_STR_FILENAME, ++ GR_FILENAME_STR, ++ GR_FILENAME_TWO_INT, ++ GR_FILENAME_TWO_INT_STR, ++ GR_TEXTREL, ++ GR_PTRACE, ++ GR_RESOURCE, ++ GR_CAP, ++ GR_SIG, ++ GR_SIG2, ++ GR_CRASH1, ++ GR_CRASH2, ++ GR_PSACCT ++}; ++ ++#define gr_log_hidden_sysctl(audit, msg, str) gr_log_varargs(audit, msg, GR_SYSCTL_HIDDEN, str) ++#define gr_log_ttysniff(audit, msg, task) gr_log_varargs(audit, msg, GR_TTYSNIFF, task) ++#define gr_log_fs_rbac_generic(audit, msg, dentry, mnt) gr_log_varargs(audit, msg, GR_RBAC, dentry, mnt) ++#define gr_log_fs_rbac_str(audit, msg, dentry, mnt, str) gr_log_varargs(audit, msg, GR_RBAC_STR, dentry, mnt, str) ++#define gr_log_fs_str_rbac(audit, msg, str, dentry, mnt) gr_log_varargs(audit, msg, GR_STR_RBAC, str, dentry, mnt) ++#define gr_log_fs_rbac_mode2(audit, msg, dentry, mnt, str1, str2) gr_log_varargs(audit, msg, GR_RBAC_MODE2, dentry, mnt, str1, str2) ++#define gr_log_fs_rbac_mode3(audit, msg, dentry, mnt, str1, str2, str3) gr_log_varargs(audit, msg, GR_RBAC_MODE3, dentry, mnt, str1, str2, str3) ++#define gr_log_fs_generic(audit, msg, dentry, mnt) gr_log_varargs(audit, msg, GR_FILENAME, dentry, mnt) ++#define gr_log_noargs(audit, msg) gr_log_varargs(audit, msg, GR_NOARGS) ++#define gr_log_int(audit, msg, num) gr_log_varargs(audit, msg, GR_ONE_INT, num) ++#define gr_log_int_str2(audit, msg, num, str1, str2) gr_log_varargs(audit, msg, GR_ONE_INT_TWO_STR, num, str1, str2) ++#define gr_log_str(audit, msg, str) gr_log_varargs(audit, msg, GR_ONE_STR, str) ++#define gr_log_str_int(audit, msg, str, num) gr_log_varargs(audit, msg, GR_STR_INT, str, num) ++#define gr_log_int_int(audit, msg, num1, num2) gr_log_varargs(audit, msg, GR_TWO_INT, num1, num2) ++#define gr_log_int3(audit, msg, num1, num2, num3) gr_log_varargs(audit, msg, GR_THREE_INT, num1, num2, num3) 
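(Aside, not part of the patch itself: the gr_log_* macros above and below all funnel into the single varargs sink gr_log_varargs(), passing one of the GR_* enum values as a tag that tells the sink how many and which arguments to pull off the va_list. A minimal self-contained sketch of that tagged-varargs dispatch pattern follows — log_tagged, TAG_INT and TAG_TWO_STR are hypothetical names chosen for illustration, not identifiers from the patch:

#include <stdarg.h>
#include <stdio.h>

/* Tag values play the role of GR_ONE_INT, GR_TWO_STR, ... */
enum log_argtypes { TAG_NOARGS, TAG_INT, TAG_TWO_STR };

/* One varargs sink; the tag decides how the va_list is consumed,
   mirroring how gr_log_varargs() switches on its argtypes value. */
static void log_tagged(const char *msg, enum log_argtypes tag, ...)
{
	va_list ap;

	va_start(ap, tag);
	switch (tag) {
	case TAG_NOARGS:
		printf("%s\n", msg);
		break;
	case TAG_INT:
		printf("%s %d\n", msg, va_arg(ap, int));
		break;
	case TAG_TWO_STR: {
		const char *a = va_arg(ap, const char *);
		const char *b = va_arg(ap, const char *);
		printf("%s %s -> %s\n", msg, a, b);
		break;
	}
	}
	va_end(ap);
}

/* Thin wrappers fix the tag per call site, the same way gr_log_int()
   and gr_log_str_str() wrap gr_log_varargs(). */
#define log_int(msg, n)		log_tagged(msg, TAG_INT, n)
#define log_str_str(msg, a, b)	log_tagged(msg, TAG_TWO_STR, a, b)

int main(void)
{
	log_int("failed fork with errno", 11);
	log_str_str("rename of", "/old/path", "/new/path");
	return 0;
}

End of aside; the quoted patch continues below.)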
++#define gr_log_int5_str2(audit, msg, num1, num2, str1, str2) gr_log_varargs(audit, msg, GR_FIVE_INT_TWO_STR, num1, num2, str1, str2) ++#define gr_log_str_str(audit, msg, str1, str2) gr_log_varargs(audit, msg, GR_TWO_STR, str1, str2) ++#define gr_log_str3(audit, msg, str1, str2, str3) gr_log_varargs(audit, msg, GR_THREE_STR, str1, str2, str3) ++#define gr_log_str4(audit, msg, str1, str2, str3, str4) gr_log_varargs(audit, msg, GR_FOUR_STR, str1, str2, str3, str4) ++#define gr_log_str_fs(audit, msg, str, dentry, mnt) gr_log_varargs(audit, msg, GR_STR_FILENAME, str, dentry, mnt) ++#define gr_log_fs_str(audit, msg, dentry, mnt, str) gr_log_varargs(audit, msg, GR_FILENAME_STR, dentry, mnt, str) ++#define gr_log_fs_int2(audit, msg, dentry, mnt, num1, num2) gr_log_varargs(audit, msg, GR_FILENAME_TWO_INT, dentry, mnt, num1, num2) ++#define gr_log_fs_int2_str(audit, msg, dentry, mnt, num1, num2, str) gr_log_varargs(audit, msg, GR_FILENAME_TWO_INT_STR, dentry, mnt, num1, num2, str) ++#define gr_log_textrel_ulong_ulong(audit, msg, file, ulong1, ulong2) gr_log_varargs(audit, msg, GR_TEXTREL, file, ulong1, ulong2) ++#define gr_log_ptrace(audit, msg, task) gr_log_varargs(audit, msg, GR_PTRACE, task) ++#define gr_log_res_ulong2_str(audit, msg, task, ulong1, str, ulong2) gr_log_varargs(audit, msg, GR_RESOURCE, task, ulong1, str, ulong2) ++#define gr_log_cap(audit, msg, task, str) gr_log_varargs(audit, msg, GR_CAP, task, str) ++#define gr_log_sig_addr(audit, msg, str, addr) gr_log_varargs(audit, msg, GR_SIG, str, addr) ++#define gr_log_sig_task(audit, msg, task, num) gr_log_varargs(audit, msg, GR_SIG2, task, num) ++#define gr_log_crash1(audit, msg, task, ulong) gr_log_varargs(audit, msg, GR_CRASH1, task, ulong) ++#define gr_log_crash2(audit, msg, task, ulong1) gr_log_varargs(audit, msg, GR_CRASH2, task, ulong1) ++#define gr_log_procacct(audit, msg, task, num1, num2, num3, num4, num5, num6, num7, num8, num9) gr_log_varargs(audit, msg, GR_PSACCT, task, num1, num2, num3, num4, num5, num6, num7, num8, num9) ++ ++void gr_log_varargs(int audit, const char *msg, int argtypes, ...); ++ ++#endif ++ ++#endif +diff -urNp linux-2.6.33.1/include/linux/grmsg.h linux-2.6.33.1/include/linux/grmsg.h +--- linux-2.6.33.1/include/linux/grmsg.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/include/linux/grmsg.h 2010-03-20 17:00:48.140865901 -0400 +@@ -0,0 +1,107 @@ ++#define DEFAULTSECMSG "%.256s[%.16s:%d] uid/euid:%u/%u gid/egid:%u/%u, parent %.256s[%.16s:%d] uid/euid:%u/%u gid/egid:%u/%u" ++#define GR_ACL_PROCACCT_MSG "%.256s[%.16s:%d] IP:%pI4 TTY:%.64s uid/euid:%u/%u gid/egid:%u/%u run time:[%ud %uh %um %us] cpu time:[%ud %uh %um %us] %s with exit code %ld, parent %.256s[%.16s:%d] IP:%pI4 TTY:%.64s uid/euid:%u/%u gid/egid:%u/%u" ++#define GR_PTRACE_ACL_MSG "denied ptrace of %.950s(%.16s:%d) by " ++#define GR_STOPMOD_MSG "denied modification of module state by " ++#define GR_ROFS_BLOCKWRITE_MSG "denied write to block device %.950s by " ++#define GR_ROFS_MOUNT_MSG "denied writable mount of %.950s by " ++#define GR_IOPERM_MSG "denied use of ioperm() by " ++#define GR_IOPL_MSG "denied use of iopl() by " ++#define GR_SHMAT_ACL_MSG "denied attach of shared memory of UID %u, PID %d, ID %u by " ++#define GR_UNIX_CHROOT_MSG "denied connect() to abstract AF_UNIX socket outside of chroot by " ++#define GR_SHMAT_CHROOT_MSG "denied attach of shared memory outside of chroot by " ++#define GR_KMEM_MSG "denied write of /dev/kmem by " ++#define GR_PORT_OPEN_MSG "denied open of /dev/port by " ++#define GR_MEM_WRITE_MSG "denied 
write of /dev/mem by " ++#define GR_MEM_MMAP_MSG "denied mmap write of /dev/[k]mem by " ++#define GR_SYMLINK_MSG "not following symlink %.950s owned by %d.%d by " ++#define GR_LEARN_AUDIT_MSG "%s\t%u\t%u\t%u\t%.4095s\t%.4095s\t%lu\t%lu\t%.4095s\t%lu\t%pI4" ++#define GR_ID_LEARN_MSG "%s\t%u\t%u\t%u\t%.4095s\t%.4095s\t%c\t%d\t%d\t%d\t%pI4" ++#define GR_HIDDEN_ACL_MSG "%s access to hidden file %.950s by " ++#define GR_OPEN_ACL_MSG "%s open of %.950s for%s%s by " ++#define GR_CREATE_ACL_MSG "%s create of %.950s for%s%s by " ++#define GR_FIFO_MSG "denied writing FIFO %.950s of %d.%d by " ++#define GR_MKNOD_CHROOT_MSG "denied mknod of %.950s from chroot by " ++#define GR_MKNOD_ACL_MSG "%s mknod of %.950s by " ++#define GR_UNIXCONNECT_ACL_MSG "%s connect() to the unix domain socket %.950s by " ++#define GR_TTYSNIFF_ACL_MSG "terminal being sniffed by IP:%pI4 %.480s[%.16s:%d], parent %.480s[%.16s:%d] against " ++#define GR_MKDIR_ACL_MSG "%s mkdir of %.950s by " ++#define GR_RMDIR_ACL_MSG "%s rmdir of %.950s by " ++#define GR_UNLINK_ACL_MSG "%s unlink of %.950s by " ++#define GR_SYMLINK_ACL_MSG "%s symlink from %.480s to %.480s by " ++#define GR_HARDLINK_MSG "denied hardlink of %.930s (owned by %d.%d) to %.30s for " ++#define GR_LINK_ACL_MSG "%s link of %.480s to %.480s by " ++#define GR_INHERIT_ACL_MSG "successful inherit of %.480s's ACL for %.480s by " ++#define GR_RENAME_ACL_MSG "%s rename of %.480s to %.480s by " ++#define GR_UNSAFESHARE_EXEC_ACL_MSG "denied exec with cloned fs of %.950s by " ++#define GR_PTRACE_EXEC_ACL_MSG "denied ptrace of %.950s by " ++#define GR_NPROC_MSG "denied overstep of process limit by " ++#define GR_EXEC_ACL_MSG "%s execution of %.950s by " ++#define GR_EXEC_TPE_MSG "denied untrusted exec of %.950s by " ++#define GR_SEGVSTART_ACL_MSG "possible exploit bruteforcing on " DEFAULTSECMSG " banning uid %u from login for %lu seconds" ++#define GR_SEGVNOSUID_ACL_MSG "possible exploit bruteforcing on " DEFAULTSECMSG " banning execution for %lu seconds" ++#define GR_MOUNT_CHROOT_MSG "denied mount of %.256s as %.930s from chroot by " ++#define GR_PIVOT_CHROOT_MSG "denied pivot_root from chroot by " ++#define GR_TRUNCATE_ACL_MSG "%s truncate of %.950s by " ++#define GR_ATIME_ACL_MSG "%s access time change of %.950s by " ++#define GR_ACCESS_ACL_MSG "%s access of %.950s for%s%s%s by " ++#define GR_CHROOT_CHROOT_MSG "denied double chroot to %.950s by " ++#define GR_FCHMOD_ACL_MSG "%s fchmod of %.950s by " ++#define GR_CHMOD_CHROOT_MSG "denied chmod +s of %.950s by " ++#define GR_CHMOD_ACL_MSG "%s chmod of %.950s by " ++#define GR_CHROOT_FCHDIR_MSG "denied fchdir outside of chroot to %.950s by " ++#define GR_CHOWN_ACL_MSG "%s chown of %.950s by " ++#define GR_WRITLIB_ACL_MSG "denied load of writable library %.950s by " ++#define GR_INITF_ACL_MSG "init_variables() failed %s by " ++#define GR_DISABLED_ACL_MSG "Error loading %s, trying to run kernel with acls disabled. 
To disable acls at startup use <kernel image name> gracl=off from your boot loader" ++#define GR_DEV_ACL_MSG "/dev/grsec: %d bytes sent %d required, being fed garbaged by " ++#define GR_SHUTS_ACL_MSG "shutdown auth success for " ++#define GR_SHUTF_ACL_MSG "shutdown auth failure for " ++#define GR_SHUTI_ACL_MSG "ignoring shutdown for disabled RBAC system for " ++#define GR_SEGVMODS_ACL_MSG "segvmod auth success for " ++#define GR_SEGVMODF_ACL_MSG "segvmod auth failure for " ++#define GR_SEGVMODI_ACL_MSG "ignoring segvmod for disabled RBAC system for " ++#define GR_ENABLE_ACL_MSG "%s RBAC system loaded by " ++#define GR_ENABLEF_ACL_MSG "unable to load %s for " ++#define GR_RELOADI_ACL_MSG "ignoring reload request for disabled RBAC system" ++#define GR_RELOAD_ACL_MSG "%s RBAC system reloaded by " ++#define GR_RELOADF_ACL_MSG "failed reload of %s for " ++#define GR_SPROLEI_ACL_MSG "ignoring change to special role for disabled RBAC system for " ++#define GR_SPROLES_ACL_MSG "successful change to special role %s (id %d) by " ++#define GR_SPROLEL_ACL_MSG "special role %s (id %d) exited by " ++#define GR_SPROLEF_ACL_MSG "special role %s failure for " ++#define GR_UNSPROLEI_ACL_MSG "ignoring unauth of special role for disabled RBAC system for " ++#define GR_UNSPROLES_ACL_MSG "successful unauth of special role %s (id %d) by " ++#define GR_INVMODE_ACL_MSG "invalid mode %d by " ++#define GR_PRIORITY_CHROOT_MSG "denied priority change of process (%.16s:%d) by " ++#define GR_FAILFORK_MSG "failed fork with errno %d by " ++#define GR_NICE_CHROOT_MSG "denied priority change by " ++#define GR_UNISIGLOG_MSG "%.32s occurred at %p in " ++#define GR_DUALSIGLOG_MSG "signal %d sent to " DEFAULTSECMSG " by " ++#define GR_SIG_ACL_MSG "denied send of signal %d to protected task " DEFAULTSECMSG " by " ++#define GR_SYSCTL_MSG "denied modification of grsecurity sysctl value : %.32s by " ++#define GR_SYSCTL_ACL_MSG "%s sysctl of %.950s for%s%s by " ++#define GR_TIME_MSG "time set by " ++#define GR_DEFACL_MSG "fatal: unable to find subject for (%.16s:%d), loaded by " ++#define GR_MMAP_ACL_MSG "%s executable mmap of %.950s by " ++#define GR_MPROTECT_ACL_MSG "%s executable mprotect of %.950s by " ++#define GR_SOCK_MSG "denied socket(%.16s,%.16s,%.16s) by " ++#define GR_SOCK2_MSG "denied socket(%d,%.16s,%.16s) by " ++#define GR_BIND_MSG "denied bind() by " ++#define GR_CONNECT_MSG "denied connect() by " ++#define GR_BIND_ACL_MSG "denied bind() to %pI4 port %u sock type %.16s protocol %.16s by " ++#define GR_CONNECT_ACL_MSG "denied connect() to %pI4 port %u sock type %.16s protocol %.16s by " ++#define GR_IP_LEARN_MSG "%s\t%u\t%u\t%u\t%.4095s\t%.4095s\t%pI4\t%u\t%u\t%u\t%u\t%pI4" ++#define GR_EXEC_CHROOT_MSG "exec of %.980s within chroot by process " ++#define GR_CAP_ACL_MSG "use of %s denied for " ++#define GR_USRCHANGE_ACL_MSG "change to uid %u denied for " ++#define GR_GRPCHANGE_ACL_MSG "change to gid %u denied for " ++#define GR_REMOUNT_AUDIT_MSG "remount of %.256s by " ++#define GR_UNMOUNT_AUDIT_MSG "unmount of %.256s by " ++#define GR_MOUNT_AUDIT_MSG "mount of %.256s to %.256s by " ++#define GR_CHDIR_AUDIT_MSG "chdir to %.980s by " ++#define GR_EXEC_AUDIT_MSG "exec of %.930s (%.128s) by " ++#define GR_RESOURCE_MSG "denied resource overstep by requesting %lu for %.16s against limit %lu for " ++#define GR_TEXTREL_AUDIT_MSG "text relocation in %s, VMA:0x%08lx 0x%08lx by " ++#define GR_NONROOT_MODLOAD_MSG "denied kernel module auto-load of %.64s by " ++#define GR_VM86_MSG "denied use of vm86 by " ++#define 
GR_PTRACE_AUDIT_MSG "process %.950s(%.16s:%d) attached to via ptrace by " +diff -urNp linux-2.6.33.1/include/linux/grsecurity.h linux-2.6.33.1/include/linux/grsecurity.h +--- linux-2.6.33.1/include/linux/grsecurity.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/include/linux/grsecurity.h 2010-03-20 17:00:48.140865901 -0400 +@@ -0,0 +1,199 @@ ++#ifndef GR_SECURITY_H ++#define GR_SECURITY_H ++#include <linux/fs.h> ++#include <linux/fs_struct.h> ++#include <linux/binfmts.h> ++#include <linux/gracl.h> ++ ++/* notify of brain-dead configs */ ++#if defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_PAGEEXEC) && !defined(CONFIG_PAX_SEGMEXEC) && !defined(CONFIG_PAX_KERNEXEC) ++#error "CONFIG_PAX_NOEXEC enabled, but PAGEEXEC, SEGMEXEC, and KERNEXEC are disabled." ++#endif ++#if defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_EI_PAX) && !defined(CONFIG_PAX_PT_PAX_FLAGS) ++#error "CONFIG_PAX_NOEXEC enabled, but neither CONFIG_PAX_EI_PAX nor CONFIG_PAX_PT_PAX_FLAGS are enabled." ++#endif ++#if defined(CONFIG_PAX_ASLR) && (defined(CONFIG_PAX_RANDMMAP) || defined(CONFIG_PAX_RANDUSTACK)) && !defined(CONFIG_PAX_EI_PAX) && !defined(CONFIG_PAX_PT_PAX_FLAGS) ++#error "CONFIG_PAX_ASLR enabled, but neither CONFIG_PAX_EI_PAX nor CONFIG_PAX_PT_PAX_FLAGS are enabled." ++#endif ++#if defined(CONFIG_PAX_ASLR) && !defined(CONFIG_PAX_RANDKSTACK) && !defined(CONFIG_PAX_RANDUSTACK) && !defined(CONFIG_PAX_RANDMMAP) ++#error "CONFIG_PAX_ASLR enabled, but RANDKSTACK, RANDUSTACK, and RANDMMAP are disabled." ++#endif ++#if defined(CONFIG_PAX) && !defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_ASLR) ++#error "CONFIG_PAX enabled, but no PaX options are enabled." ++#endif ++ ++void gr_handle_brute_attach(struct task_struct *p); ++void gr_handle_brute_check(void); ++ ++char gr_roletype_to_char(void); ++ ++int gr_check_user_change(int real, int effective, int fs); ++int gr_check_group_change(int real, int effective, int fs); ++ ++void gr_del_task_from_ip_table(struct task_struct *p); ++ ++int gr_pid_is_chrooted(struct task_struct *p); ++int gr_handle_chroot_nice(void); ++int gr_handle_chroot_sysctl(const int op); ++int gr_handle_chroot_setpriority(struct task_struct *p, ++ const int niceval); ++int gr_chroot_fchdir(struct dentry *u_dentry, struct vfsmount *u_mnt); ++int gr_handle_chroot_chroot(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++int gr_handle_chroot_caps(struct path *path); ++void gr_handle_chroot_chdir(struct path *path); ++int gr_handle_chroot_chmod(const struct dentry *dentry, ++ const struct vfsmount *mnt, const int mode); ++int gr_handle_chroot_mknod(const struct dentry *dentry, ++ const struct vfsmount *mnt, const int mode); ++int gr_handle_chroot_mount(const struct dentry *dentry, ++ const struct vfsmount *mnt, ++ const char *dev_name); ++int gr_handle_chroot_pivot(void); ++int gr_handle_chroot_unix(const pid_t pid); ++ ++int gr_handle_rawio(const struct inode *inode); ++int gr_handle_nproc(void); ++ ++void gr_handle_ioperm(void); ++void gr_handle_iopl(void); ++ ++int gr_tpe_allow(const struct file *file); ++ ++int gr_random_pid(void); ++ ++void gr_log_forkfail(const int retval); ++void gr_log_timechange(void); ++void gr_log_signal(const int sig, const void *addr, const struct task_struct *t); ++void gr_log_chdir(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++void gr_log_chroot_exec(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++void gr_handle_exec_args(struct linux_binprm *bprm, char **argv); ++void gr_log_remount(const char *devname, const int 
retval); ++void gr_log_unmount(const char *devname, const int retval); ++void gr_log_mount(const char *from, const char *to, const int retval); ++void gr_log_textrel(struct vm_area_struct *vma); ++ ++int gr_handle_follow_link(const struct inode *parent, ++ const struct inode *inode, ++ const struct dentry *dentry, ++ const struct vfsmount *mnt); ++int gr_handle_fifo(const struct dentry *dentry, ++ const struct vfsmount *mnt, ++ const struct dentry *dir, const int flag, ++ const int acc_mode); ++int gr_handle_hardlink(const struct dentry *dentry, ++ const struct vfsmount *mnt, ++ struct inode *inode, ++ const int mode, const char *to); ++ ++int gr_is_capable(const int cap); ++int gr_is_capable_nolog(const int cap); ++void gr_learn_resource(const struct task_struct *task, const int limit, ++ const unsigned long wanted, const int gt); ++void gr_copy_label(struct task_struct *tsk); ++void gr_handle_crash(struct task_struct *task, const int sig); ++int gr_handle_signal(const struct task_struct *p, const int sig); ++int gr_check_crash_uid(const uid_t uid); ++int gr_check_protected_task(const struct task_struct *task); ++int gr_acl_handle_mmap(const struct file *file, ++ const unsigned long prot); ++int gr_acl_handle_mprotect(const struct file *file, ++ const unsigned long prot); ++int gr_check_hidden_task(const struct task_struct *tsk); ++__u32 gr_acl_handle_truncate(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++__u32 gr_acl_handle_utime(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++__u32 gr_acl_handle_access(const struct dentry *dentry, ++ const struct vfsmount *mnt, const int fmode); ++__u32 gr_acl_handle_fchmod(const struct dentry *dentry, ++ const struct vfsmount *mnt, mode_t mode); ++__u32 gr_acl_handle_chmod(const struct dentry *dentry, ++ const struct vfsmount *mnt, mode_t mode); ++__u32 gr_acl_handle_chown(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++int gr_handle_ptrace(struct task_struct *task, const long request); ++int gr_handle_proc_ptrace(struct task_struct *task); ++__u32 gr_acl_handle_execve(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++int gr_check_crash_exec(const struct file *filp); ++int gr_acl_is_enabled(void); ++void gr_set_kernel_label(struct task_struct *task); ++void gr_set_role_label(struct task_struct *task, const uid_t uid, ++ const gid_t gid); ++int gr_set_proc_label(const struct dentry *dentry, ++ const struct vfsmount *mnt, ++ const int unsafe_share); ++__u32 gr_acl_handle_hidden_file(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++__u32 gr_acl_handle_open(const struct dentry *dentry, ++ const struct vfsmount *mnt, const int fmode); ++__u32 gr_acl_handle_creat(const struct dentry *dentry, ++ const struct dentry *p_dentry, ++ const struct vfsmount *p_mnt, const int fmode, ++ const int imode); ++void gr_handle_create(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++__u32 gr_acl_handle_mknod(const struct dentry *new_dentry, ++ const struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt, ++ const int mode); ++__u32 gr_acl_handle_mkdir(const struct dentry *new_dentry, ++ const struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt); ++__u32 gr_acl_handle_rmdir(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++void gr_handle_delete(const ino_t ino, const dev_t dev); ++__u32 gr_acl_handle_unlink(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++__u32 gr_acl_handle_symlink(const struct dentry *new_dentry, ++ const struct 
dentry *parent_dentry, ++ const struct vfsmount *parent_mnt, ++ const char *from); ++__u32 gr_acl_handle_link(const struct dentry *new_dentry, ++ const struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt, ++ const struct dentry *old_dentry, ++ const struct vfsmount *old_mnt, const char *to); ++int gr_acl_handle_rename(struct dentry *new_dentry, ++ struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt, ++ struct dentry *old_dentry, ++ struct inode *old_parent_inode, ++ struct vfsmount *old_mnt, const char *newname); ++void gr_handle_rename(struct inode *old_dir, struct inode *new_dir, ++ struct dentry *old_dentry, ++ struct dentry *new_dentry, ++ struct vfsmount *mnt, const __u8 replace); ++__u32 gr_check_link(const struct dentry *new_dentry, ++ const struct dentry *parent_dentry, ++ const struct vfsmount *parent_mnt, ++ const struct dentry *old_dentry, ++ const struct vfsmount *old_mnt); ++int gr_acl_handle_filldir(const struct file *file, const char *name, ++ const unsigned int namelen, const ino_t ino); ++ ++__u32 gr_acl_handle_unix(const struct dentry *dentry, ++ const struct vfsmount *mnt); ++void gr_acl_handle_exit(void); ++void gr_acl_handle_psacct(struct task_struct *task, const long code); ++int gr_acl_handle_procpidmem(const struct task_struct *task); ++int gr_handle_rofs_mount(struct dentry *dentry, struct vfsmount *mnt, int mnt_flags); ++int gr_handle_rofs_blockwrite(struct dentry *dentry, struct vfsmount *mnt, int acc_mode); ++void gr_audit_ptrace(struct task_struct *task); ++ ++#ifdef CONFIG_GRKERNSEC ++void gr_log_nonroot_mod_load(const char *modname); ++void gr_handle_vm86(void); ++void gr_handle_mem_write(void); ++void gr_handle_kmem_write(void); ++void gr_handle_open_port(void); ++int gr_handle_mem_mmap(const unsigned long offset, ++ struct vm_area_struct *vma); ++ ++extern int grsec_enable_dmesg; ++#endif ++ ++#endif +diff -urNp linux-2.6.33.1/include/linux/grsock.h linux-2.6.33.1/include/linux/grsock.h +--- linux-2.6.33.1/include/linux/grsock.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/include/linux/grsock.h 2010-03-20 16:58:41.912517036 -0400 +@@ -0,0 +1,19 @@ ++#ifndef __GRSOCK_H ++#define __GRSOCK_H ++ ++extern void gr_attach_curr_ip(const struct sock *sk); ++extern int gr_handle_sock_all(const int family, const int type, ++ const int protocol); ++extern int gr_handle_sock_server(const struct sockaddr *sck); ++extern int gr_handle_sock_server_other(const struct socket *sck); ++extern int gr_handle_sock_client(const struct sockaddr *sck); ++extern int gr_search_connect(struct socket * sock, ++ struct sockaddr_in * addr); ++extern int gr_search_bind(struct socket * sock, ++ struct sockaddr_in * addr); ++extern int gr_search_listen(struct socket * sock); ++extern int gr_search_accept(struct socket * sock); ++extern int gr_search_socket(const int domain, const int type, ++ const int protocol); ++ ++#endif +diff -urNp linux-2.6.33.1/include/linux/hdpu_features.h linux-2.6.33.1/include/linux/hdpu_features.h +--- linux-2.6.33.1/include/linux/hdpu_features.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/hdpu_features.h 2010-03-20 16:58:41.912517036 -0400 +@@ -3,7 +3,7 @@ + struct cpustate_t { + spinlock_t lock; + int excl; +- int open_count; ++ atomic_t open_count; + unsigned char cached_val; + int inited; + unsigned long *set_addr; +diff -urNp linux-2.6.33.1/include/linux/highmem.h linux-2.6.33.1/include/linux/highmem.h +--- linux-2.6.33.1/include/linux/highmem.h 2010-03-15 12:09:39.000000000 -0400 ++++ 
linux-2.6.33.1/include/linux/highmem.h 2010-03-20 16:58:41.912517036 -0400 +@@ -137,6 +137,18 @@ static inline void clear_highpage(struct + kunmap_atomic(kaddr, KM_USER0); + } + ++static inline void sanitize_highpage(struct page *page) ++{ ++ void *kaddr; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ kaddr = kmap_atomic(page, KM_CLEARPAGE); ++ clear_page(kaddr); ++ kunmap_atomic(kaddr, KM_CLEARPAGE); ++ local_irq_restore(flags); ++} ++ + static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +diff -urNp linux-2.6.33.1/include/linux/init_task.h linux-2.6.33.1/include/linux/init_task.h +--- linux-2.6.33.1/include/linux/init_task.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/init_task.h 2010-03-20 16:58:41.912517036 -0400 +@@ -111,6 +111,13 @@ extern struct cred init_cred; + # define INIT_PERF_EVENTS(tsk) + #endif + ++#ifdef CONFIG_GRKERNSEC ++# define INIT_GR_FS_LOCK \ ++ .gr_fs_lock = __RW_LOCK_UNLOCKED(gr_fs_lock), ++#else ++# define INIT_GR_FS_LOCK ++#endif ++ + /* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) +@@ -180,6 +187,7 @@ extern struct cred init_cred; + INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ ++ INIT_GR_FS_LOCK \ + } + + +diff -urNp linux-2.6.33.1/include/linux/interrupt.h linux-2.6.33.1/include/linux/interrupt.h +--- linux-2.6.33.1/include/linux/interrupt.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/interrupt.h 2010-03-20 16:58:41.912517036 -0400 +@@ -357,7 +357,7 @@ enum + /* map softirq index to softirq name. update 'softirq_to_name' in + * kernel/softirq.c when adding a new softirq. + */ +-extern char *softirq_to_name[NR_SOFTIRQS]; ++extern const char * const softirq_to_name[NR_SOFTIRQS]; + + /* softirq mask and active fields moved to irq_cpustat_t in + * asm/hardirq.h to get better cache usage. KAO +@@ -365,12 +365,12 @@ extern char *softirq_to_name[NR_SOFTIRQS + + struct softirq_action + { +- void (*action)(struct softirq_action *); ++ void (*action)(void); + }; + + asmlinkage void do_softirq(void); + asmlinkage void __do_softirq(void); +-extern void open_softirq(int nr, void (*action)(struct softirq_action *)); ++extern void open_softirq(int nr, void (*action)(void)); + extern void softirq_init(void); + #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) + extern void raise_softirq_irqoff(unsigned int nr); +diff -urNp linux-2.6.33.1/include/linux/jbd2.h linux-2.6.33.1/include/linux/jbd2.h +--- linux-2.6.33.1/include/linux/jbd2.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/jbd2.h 2010-03-20 16:58:41.912517036 -0400 +@@ -66,7 +66,7 @@ extern u8 jbd2_journal_enable_debug; + } \ + } while (0) + #else +-#define jbd_debug(f, a...) /**/ ++#define jbd_debug(f, a...) do {} while (0) + #endif + + static inline void *jbd2_alloc(size_t size, gfp_t flags) +diff -urNp linux-2.6.33.1/include/linux/jbd.h linux-2.6.33.1/include/linux/jbd.h +--- linux-2.6.33.1/include/linux/jbd.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/jbd.h 2010-03-20 16:58:41.916544293 -0400 +@@ -66,7 +66,7 @@ extern u8 journal_enable_debug; + } \ + } while (0) + #else +-#define jbd_debug(f, a...) /**/ ++#define jbd_debug(f, a...) 
do {} while (0) + #endif + + static inline void *jbd_alloc(size_t size, gfp_t flags) +diff -urNp linux-2.6.33.1/include/linux/kallsyms.h linux-2.6.33.1/include/linux/kallsyms.h +--- linux-2.6.33.1/include/linux/kallsyms.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/kallsyms.h 2010-03-20 16:58:41.916544293 -0400 +@@ -15,7 +15,8 @@ + + struct module; + +-#ifdef CONFIG_KALLSYMS ++#ifndef __INCLUDED_BY_HIDESYM ++#if defined(CONFIG_KALLSYMS) && !defined(CONFIG_GRKERNSEC_HIDESYM) + /* Lookup the address for a symbol. Returns 0 if not found. */ + unsigned long kallsyms_lookup_name(const char *name); + +@@ -92,6 +93,9 @@ static inline int lookup_symbol_attrs(un + /* Stupid that this does nothing, but I didn't create this mess. */ + #define __print_symbol(fmt, addr) + #endif /*CONFIG_KALLSYMS*/ ++#else /* when included by kallsyms.c, with HIDESYM enabled */ ++extern void __print_symbol(const char *fmt, unsigned long address); ++#endif + + /* This macro allows us to keep printk typechecking */ + static void __check_printsym_format(const char *fmt, ...) +diff -urNp linux-2.6.33.1/include/linux/kgdb.h linux-2.6.33.1/include/linux/kgdb.h +--- linux-2.6.33.1/include/linux/kgdb.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/kgdb.h 2010-03-20 16:58:41.916544293 -0400 +@@ -250,20 +250,20 @@ struct kgdb_arch { + */ + struct kgdb_io { + const char *name; +- int (*read_char) (void); +- void (*write_char) (u8); +- void (*flush) (void); +- int (*init) (void); +- void (*pre_exception) (void); +- void (*post_exception) (void); ++ int (* const read_char) (void); ++ void (* const write_char) (u8); ++ void (* const flush) (void); ++ int (* const init) (void); ++ void (* const pre_exception) (void); ++ void (* const post_exception) (void); + }; + +-extern struct kgdb_arch arch_kgdb_ops; ++extern const struct kgdb_arch arch_kgdb_ops; + + extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); + +-extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); +-extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); ++extern int kgdb_register_io_module(const struct kgdb_io *local_kgdb_io_ops); ++extern void kgdb_unregister_io_module(const struct kgdb_io *local_kgdb_io_ops); + + extern int kgdb_hex2long(char **ptr, unsigned long *long_val); + extern int kgdb_mem2hex(char *mem, char *buf, int count); +diff -urNp linux-2.6.33.1/include/linux/kobject.h linux-2.6.33.1/include/linux/kobject.h +--- linux-2.6.33.1/include/linux/kobject.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/kobject.h 2010-03-20 16:58:41.916544293 -0400 +@@ -106,7 +106,7 @@ extern char *kobject_get_path(struct kob + + struct kobj_type { + void (*release)(struct kobject *kobj); +- struct sysfs_ops *sysfs_ops; ++ const struct sysfs_ops *sysfs_ops; + struct attribute **default_attrs; + }; + +@@ -118,9 +118,9 @@ struct kobj_uevent_env { + }; + + struct kset_uevent_ops { +- int (*filter)(struct kset *kset, struct kobject *kobj); +- const char *(*name)(struct kset *kset, struct kobject *kobj); +- int (*uevent)(struct kset *kset, struct kobject *kobj, ++ int (* const filter)(struct kset *kset, struct kobject *kobj); ++ const char *(* const name)(struct kset *kset, struct kobject *kobj); ++ int (* const uevent)(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env); + }; + +@@ -132,7 +132,7 @@ struct kobj_attribute { + const char *buf, size_t count); + }; + +-extern struct sysfs_ops kobj_sysfs_ops; ++extern const struct 
sysfs_ops kobj_sysfs_ops; + + /** + * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem. +@@ -155,14 +155,14 @@ struct kset { + struct list_head list; + spinlock_t list_lock; + struct kobject kobj; +- struct kset_uevent_ops *uevent_ops; ++ const struct kset_uevent_ops *uevent_ops; + }; + + extern void kset_init(struct kset *kset); + extern int __must_check kset_register(struct kset *kset); + extern void kset_unregister(struct kset *kset); + extern struct kset * __must_check kset_create_and_add(const char *name, +- struct kset_uevent_ops *u, ++ const struct kset_uevent_ops *u, + struct kobject *parent_kobj); + + static inline struct kset *to_kset(struct kobject *kobj) +diff -urNp linux-2.6.33.1/include/linux/kvm_host.h linux-2.6.33.1/include/linux/kvm_host.h +--- linux-2.6.33.1/include/linux/kvm_host.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/kvm_host.h 2010-03-20 16:58:41.916544293 -0400 +@@ -225,7 +225,7 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vc + void vcpu_load(struct kvm_vcpu *vcpu); + void vcpu_put(struct kvm_vcpu *vcpu); + +-int kvm_init(void *opaque, unsigned int vcpu_size, ++int kvm_init(const void *opaque, unsigned int vcpu_size, + struct module *module); + void kvm_exit(void); + +@@ -332,7 +332,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug( + struct kvm_guest_debug *dbg); + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); + +-int kvm_arch_init(void *opaque); ++int kvm_arch_init(const void *opaque); + void kvm_arch_exit(void); + + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); +diff -urNp linux-2.6.33.1/include/linux/libata.h linux-2.6.33.1/include/linux/libata.h +--- linux-2.6.33.1/include/linux/libata.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/libata.h 2010-03-20 16:58:41.916544293 -0400 +@@ -64,11 +64,11 @@ + #ifdef ATA_VERBOSE_DEBUG + #define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) + #else +-#define VPRINTK(fmt, args...) ++#define VPRINTK(fmt, args...) do {} while (0) + #endif /* ATA_VERBOSE_DEBUG */ + #else +-#define DPRINTK(fmt, args...) +-#define VPRINTK(fmt, args...) ++#define DPRINTK(fmt, args...) do {} while (0) ++#define VPRINTK(fmt, args...) do {} while (0) + #endif /* ATA_DEBUG */ + + #define BPRINTK(fmt, args...) if (ap->flags & ATA_FLAG_DEBUGMSG) printk(KERN_ERR "%s: " fmt, __func__, ## args) +@@ -524,11 +524,11 @@ struct ata_ioports { + + struct ata_host { + spinlock_t lock; +- struct device *dev; ++ struct device *dev; + void __iomem * const *iomap; + unsigned int n_ports; + void *private_data; +- struct ata_port_operations *ops; ++ const struct ata_port_operations *ops; + unsigned long flags; + #ifdef CONFIG_ATA_ACPI + acpi_handle acpi_handle; +@@ -710,7 +710,7 @@ struct ata_link { + + struct ata_port { + struct Scsi_Host *scsi_host; /* our co-allocated scsi host */ +- struct ata_port_operations *ops; ++ const struct ata_port_operations *ops; + spinlock_t *lock; + /* Flags owned by the EH context. 
Only EH should touch these once the + port is active */ +@@ -892,7 +892,7 @@ struct ata_port_info { + unsigned long pio_mask; + unsigned long mwdma_mask; + unsigned long udma_mask; +- struct ata_port_operations *port_ops; ++ const struct ata_port_operations *port_ops; + void *private_data; + }; + +@@ -916,7 +916,7 @@ extern const unsigned long sata_deb_timi + extern const unsigned long sata_deb_timing_hotplug[]; + extern const unsigned long sata_deb_timing_long[]; + +-extern struct ata_port_operations ata_dummy_port_ops; ++extern const struct ata_port_operations ata_dummy_port_ops; + extern const struct ata_port_info ata_dummy_port_info; + + static inline const unsigned long * +@@ -962,7 +962,7 @@ extern int ata_host_activate(struct ata_ + struct scsi_host_template *sht); + extern void ata_host_detach(struct ata_host *host); + extern void ata_host_init(struct ata_host *, struct device *, +- unsigned long, struct ata_port_operations *); ++ unsigned long, const struct ata_port_operations *); + extern int ata_scsi_detect(struct scsi_host_template *sht); + extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg); + extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)); +diff -urNp linux-2.6.33.1/include/linux/lockd/bind.h linux-2.6.33.1/include/linux/lockd/bind.h +--- linux-2.6.33.1/include/linux/lockd/bind.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/lockd/bind.h 2010-03-20 16:58:41.916544293 -0400 +@@ -23,13 +23,13 @@ struct svc_rqst; + * This is the set of functions for lockd->nfsd communication + */ + struct nlmsvc_binding { +- __be32 (*fopen)(struct svc_rqst *, ++ __be32 (* const fopen)(struct svc_rqst *, + struct nfs_fh *, + struct file **); +- void (*fclose)(struct file *); ++ void (* const fclose)(struct file *); + }; + +-extern struct nlmsvc_binding * nlmsvc_ops; ++extern const struct nlmsvc_binding * nlmsvc_ops; + + /* + * Similar to nfs_client_initdata, but without the NFS-specific +diff -urNp linux-2.6.33.1/include/linux/mm.h linux-2.6.33.1/include/linux/mm.h +--- linux-2.6.33.1/include/linux/mm.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/mm.h 2010-03-20 16:58:41.916544293 -0400 +@@ -106,6 +106,10 @@ extern unsigned int kobjsize(const void + #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ + #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ + ++#ifdef CONFIG_PAX_PAGEEXEC ++#define VM_PAGEEXEC 0x80000000 /* vma->vm_page_prot needs special handling */ ++#endif ++ + #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ + #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS + #endif +@@ -895,6 +899,8 @@ struct shrinker { + extern void register_shrinker(struct shrinker *); + extern void unregister_shrinker(struct shrinker *); + ++pgprot_t vm_get_page_prot(unsigned long vm_flags); ++ + int vma_wants_writenotify(struct vm_area_struct *vma); + + extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); +@@ -1171,6 +1177,7 @@ out: + } + + extern int do_munmap(struct mm_struct *, unsigned long, size_t); ++extern int __do_munmap(struct mm_struct *, unsigned long, size_t); + + extern unsigned long do_brk(unsigned long, unsigned long); + +@@ -1225,6 +1232,10 @@ extern struct vm_area_struct * find_vma( + extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev); + ++extern struct vm_area_struct *pax_find_mirror_vma(struct vm_area_struct 
*vma); ++extern void pax_mirror_vma(struct vm_area_struct *vma_m, struct vm_area_struct *vma); ++extern void pax_mirror_file_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m, spinlock_t *ptl); ++ + /* Look up the first VMA which intersects the interval start_addr..end_addr-1, + NULL if none. Assume start_addr < end_addr. */ + static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +@@ -1241,7 +1252,6 @@ static inline unsigned long vma_pages(st + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + } + +-pgprot_t vm_get_page_prot(unsigned long vm_flags); + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); +@@ -1344,8 +1354,14 @@ extern int unpoison_memory(unsigned long + extern int sysctl_memory_failure_early_kill; + extern int sysctl_memory_failure_recovery; + extern void shake_page(struct page *p, int access); +-extern atomic_long_t mce_bad_pages; ++extern atomic_long_unchecked_t mce_bad_pages; + extern int soft_offline_page(struct page *page, int flags); + ++#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT ++extern void track_exec_limit(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long prot); ++#else ++static inline void track_exec_limit(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long prot) {} ++#endif ++ + #endif /* __KERNEL__ */ + #endif /* _LINUX_MM_H */ +diff -urNp linux-2.6.33.1/include/linux/mm_types.h linux-2.6.33.1/include/linux/mm_types.h +--- linux-2.6.33.1/include/linux/mm_types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/mm_types.h 2010-03-20 16:58:41.916544293 -0400 +@@ -188,6 +188,8 @@ struct vm_area_struct { + #ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif ++ ++ struct vm_area_struct *vm_mirror;/* PaX: mirror vma or NULL */ + }; + + struct core_thread { +@@ -291,6 +293,24 @@ struct mm_struct { + #ifdef CONFIG_MMU_NOTIFIER + struct mmu_notifier_mm *mmu_notifier_mm; + #endif ++ ++#if defined(CONFIG_PAX_EI_PAX) || defined(CONFIG_PAX_PT_PAX_FLAGS) || defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) ++ unsigned long pax_flags; ++#endif ++ ++#ifdef CONFIG_PAX_DLRESOLVE ++ unsigned long call_dl_resolve; ++#endif ++ ++#if defined(CONFIG_PPC32) && defined(CONFIG_PAX_EMUSIGRT) ++ unsigned long call_syscall; ++#endif ++ ++#ifdef CONFIG_PAX_ASLR ++ unsigned long delta_mmap; /* randomized offset */ ++ unsigned long delta_stack; /* randomized offset */ ++#endif ++ + }; + + /* Future-safe accessor for struct mm_struct's cpu_vm_mask. 
*/ +diff -urNp linux-2.6.33.1/include/linux/mmu_notifier.h linux-2.6.33.1/include/linux/mmu_notifier.h +--- linux-2.6.33.1/include/linux/mmu_notifier.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/mmu_notifier.h 2010-03-20 16:58:41.916544293 -0400 +@@ -235,12 +235,12 @@ static inline void mmu_notifier_mm_destr + */ + #define ptep_clear_flush_notify(__vma, __address, __ptep) \ + ({ \ +- pte_t __pte; \ ++ pte_t ___pte; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ +- __pte = ptep_clear_flush(___vma, ___address, __ptep); \ ++ ___pte = ptep_clear_flush(___vma, ___address, __ptep); \ + mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ +- __pte; \ ++ ___pte; \ + }) + + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ +diff -urNp linux-2.6.33.1/include/linux/mod_devicetable.h linux-2.6.33.1/include/linux/mod_devicetable.h +--- linux-2.6.33.1/include/linux/mod_devicetable.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/mod_devicetable.h 2010-03-20 16:58:41.916544293 -0400 +@@ -12,7 +12,7 @@ + typedef unsigned long kernel_ulong_t; + #endif + +-#define PCI_ANY_ID (~0) ++#define PCI_ANY_ID ((__u16)~0) + + struct pci_device_id { + __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ +@@ -131,7 +131,7 @@ struct usb_device_id { + #define USB_DEVICE_ID_MATCH_INT_SUBCLASS 0x0100 + #define USB_DEVICE_ID_MATCH_INT_PROTOCOL 0x0200 + +-#define HID_ANY_ID (~0) ++#define HID_ANY_ID (~0U) + + struct hid_device_id { + __u16 bus; +diff -urNp linux-2.6.33.1/include/linux/module.h linux-2.6.33.1/include/linux/module.h +--- linux-2.6.33.1/include/linux/module.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/module.h 2010-03-20 16:58:41.916544293 -0400 +@@ -289,16 +289,16 @@ struct module + int (*init)(void); + + /* If this is non-NULL, vfree after init() returns */ +- void *module_init; ++ void *module_init_rx, *module_init_rw; + + /* Here is the actual code + data, vfree'd on unload. */ +- void *module_core; ++ void *module_core_rx, *module_core_rw; + + /* Here are the sizes of the init and core sections */ +- unsigned int init_size, core_size; ++ unsigned int init_size_rw, core_size_rw; + + /* The size of the executable code in each section. 
*/ +- unsigned int init_text_size, core_text_size; ++ unsigned int init_size_rx, core_size_rx; + + /* Arch-specific module values */ + struct mod_arch_specific arch; +@@ -395,16 +395,46 @@ struct module *__module_address(unsigned + bool is_module_address(unsigned long addr); + bool is_module_text_address(unsigned long addr); + ++static inline int within_module_range(unsigned long addr, void *start, unsigned long size) ++{ ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ if (ktla_ktva(addr) >= (unsigned long)start && ++ ktla_ktva(addr) < (unsigned long)start + size) ++ return 1; ++#endif ++ ++ return ((void *)addr >= start && (void *)addr < start + size); ++} ++ ++static inline int within_module_core_rx(unsigned long addr, struct module *mod) ++{ ++ return within_module_range(addr, mod->module_core_rx, mod->core_size_rx); ++} ++ ++static inline int within_module_core_rw(unsigned long addr, struct module *mod) ++{ ++ return within_module_range(addr, mod->module_core_rw, mod->core_size_rw); ++} ++ ++static inline int within_module_init_rx(unsigned long addr, struct module *mod) ++{ ++ return within_module_range(addr, mod->module_init_rx, mod->init_size_rx); ++} ++ ++static inline int within_module_init_rw(unsigned long addr, struct module *mod) ++{ ++ return within_module_range(addr, mod->module_init_rw, mod->init_size_rw); ++} ++ + static inline int within_module_core(unsigned long addr, struct module *mod) + { +- return (unsigned long)mod->module_core <= addr && +- addr < (unsigned long)mod->module_core + mod->core_size; ++ return within_module_core_rx(addr, mod) || within_module_core_rw(addr, mod); + } + + static inline int within_module_init(unsigned long addr, struct module *mod) + { +- return (unsigned long)mod->module_init <= addr && +- addr < (unsigned long)mod->module_init + mod->init_size; ++ return within_module_init_rx(addr, mod) || within_module_init_rw(addr, mod); + } + + /* Search for module by name: must hold module_mutex. */ +diff -urNp linux-2.6.33.1/include/linux/moduleloader.h linux-2.6.33.1/include/linux/moduleloader.h +--- linux-2.6.33.1/include/linux/moduleloader.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/moduleloader.h 2010-03-20 16:58:41.916544293 -0400 +@@ -20,9 +20,21 @@ unsigned int arch_mod_section_prepend(st + sections. Returns NULL on failure. */ + void *module_alloc(unsigned long size); + ++#ifdef CONFIG_PAX_KERNEXEC ++void *module_alloc_exec(unsigned long size); ++#else ++#define module_alloc_exec(x) module_alloc(x) ++#endif ++ + /* Free memory returned from module_alloc. */ + void module_free(struct module *mod, void *module_region); + ++#ifdef CONFIG_PAX_KERNEXEC ++void module_free_exec(struct module *mod, void *module_region); ++#else ++#define module_free_exec(x, y) module_free((x), (y)) ++#endif ++ + /* Apply the given relocation to the (simplified) ELF. Return -error + or 0. 
*/ + int apply_relocate(Elf_Shdr *sechdrs, +diff -urNp linux-2.6.33.1/include/linux/namei.h linux-2.6.33.1/include/linux/namei.h +--- linux-2.6.33.1/include/linux/namei.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/namei.h 2010-03-20 16:58:41.916544293 -0400 +@@ -22,7 +22,7 @@ struct nameidata { + unsigned int flags; + int last_type; + unsigned depth; +- char *saved_names[MAX_NESTED_LINKS + 1]; ++ const char *saved_names[MAX_NESTED_LINKS + 1]; + + /* Intent data */ + union { +@@ -81,12 +81,12 @@ extern int follow_up(struct path *); + extern struct dentry *lock_rename(struct dentry *, struct dentry *); + extern void unlock_rename(struct dentry *, struct dentry *); + +-static inline void nd_set_link(struct nameidata *nd, char *path) ++static inline void nd_set_link(struct nameidata *nd, const char *path) + { + nd->saved_names[nd->depth] = path; + } + +-static inline char *nd_get_link(struct nameidata *nd) ++static inline const char *nd_get_link(const struct nameidata *nd) + { + return nd->saved_names[nd->depth]; + } +diff -urNp linux-2.6.33.1/include/linux/nodemask.h linux-2.6.33.1/include/linux/nodemask.h +--- linux-2.6.33.1/include/linux/nodemask.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/nodemask.h 2010-03-20 16:58:41.916544293 -0400 +@@ -469,11 +469,11 @@ static inline int num_node_state(enum no + + #define any_online_node(mask) \ + ({ \ +- int node; \ +- for_each_node_mask(node, (mask)) \ +- if (node_online(node)) \ ++ int __node; \ ++ for_each_node_mask(__node, (mask)) \ ++ if (node_online(__node)) \ + break; \ +- node; \ ++ __node; \ + }) + + #define num_online_nodes() num_node_state(N_ONLINE) +diff -urNp linux-2.6.33.1/include/linux/oprofile.h linux-2.6.33.1/include/linux/oprofile.h +--- linux-2.6.33.1/include/linux/oprofile.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/oprofile.h 2010-03-20 16:58:41.920592896 -0400 +@@ -129,9 +129,9 @@ int oprofilefs_create_ulong(struct super + int oprofilefs_create_ro_ulong(struct super_block * sb, struct dentry * root, + char const * name, ulong * val); + +-/** Create a file for read-only access to an atomic_t. */ ++/** Create a file for read-only access to an atomic_unchecked_t. */ + int oprofilefs_create_ro_atomic(struct super_block * sb, struct dentry * root, +- char const * name, atomic_t * val); ++ char const * name, atomic_unchecked_t * val); + + /** create a directory */ + struct dentry * oprofilefs_mkdir(struct super_block * sb, struct dentry * root, +diff -urNp linux-2.6.33.1/include/linux/pipe_fs_i.h linux-2.6.33.1/include/linux/pipe_fs_i.h +--- linux-2.6.33.1/include/linux/pipe_fs_i.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/pipe_fs_i.h 2010-03-20 16:58:41.920592896 -0400 +@@ -46,9 +46,9 @@ struct pipe_inode_info { + wait_queue_head_t wait; + unsigned int nrbufs, curbuf; + struct page *tmp_page; +- unsigned int readers; +- unsigned int writers; +- unsigned int waiting_writers; ++ atomic_t readers; ++ atomic_t writers; ++ atomic_t waiting_writers; + unsigned int r_counter; + unsigned int w_counter; + struct fasync_struct *fasync_readers; +diff -urNp linux-2.6.33.1/include/linux/poison.h linux-2.6.33.1/include/linux/poison.h +--- linux-2.6.33.1/include/linux/poison.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/poison.h 2010-03-20 16:58:41.920592896 -0400 +@@ -19,8 +19,8 @@ + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. 
+ */ +-#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) +-#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) ++#define LIST_POISON1 ((void *) (long)0xFFFFFF01) ++#define LIST_POISON2 ((void *) (long)0xFFFFFF02) + + /********** include/linux/timer.h **********/ + /* +diff -urNp linux-2.6.33.1/include/linux/proc_fs.h linux-2.6.33.1/include/linux/proc_fs.h +--- linux-2.6.33.1/include/linux/proc_fs.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/proc_fs.h 2010-03-20 16:58:41.920592896 -0400 +@@ -155,6 +155,19 @@ static inline struct proc_dir_entry *pro + return proc_create_data(name, mode, parent, proc_fops, NULL); + } + ++static inline struct proc_dir_entry *proc_create_grsec(const char *name, mode_t mode, ++ struct proc_dir_entry *parent, const struct file_operations *proc_fops) ++{ ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ return proc_create_data(name, S_IRUSR, parent, proc_fops, NULL); ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ return proc_create_data(name, S_IRUSR | S_IRGRP, parent, proc_fops, NULL); ++#else ++ return proc_create_data(name, mode, parent, proc_fops, NULL); ++#endif ++} ++ ++ + static inline struct proc_dir_entry *create_proc_read_entry(const char *name, + mode_t mode, struct proc_dir_entry *base, + read_proc_t *read_proc, void * data) +diff -urNp linux-2.6.33.1/include/linux/random.h linux-2.6.33.1/include/linux/random.h +--- linux-2.6.33.1/include/linux/random.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/random.h 2010-03-20 16:58:41.920592896 -0400 +@@ -74,6 +74,11 @@ unsigned long randomize_range(unsigned l + u32 random32(void); + void srandom32(u32 seed); + ++static inline unsigned long pax_get_random_long(void) ++{ ++ return random32() + (sizeof(long) > 4 ? 
(unsigned long)random32() << 32 : 0); ++} ++ + #endif /* __KERNEL___ */ + + #endif /* _LINUX_RANDOM_H */ +diff -urNp linux-2.6.33.1/include/linux/reiserfs_fs.h linux-2.6.33.1/include/linux/reiserfs_fs.h +--- linux-2.6.33.1/include/linux/reiserfs_fs.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/reiserfs_fs.h 2010-03-20 16:58:41.920592896 -0400 +@@ -1404,7 +1404,7 @@ static inline loff_t max_reiserfs_offset + #define REISERFS_USER_MEM 1 /* reiserfs user memory mode */ + + #define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) +-#define get_generation(s) atomic_read (&fs_generation(s)) ++#define get_generation(s) atomic_read_unchecked (&fs_generation(s)) + #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) + #define __fs_changed(gen,s) (gen != get_generation (s)) + #define fs_changed(gen,s) \ +@@ -1616,24 +1616,24 @@ static inline struct super_block *sb_fro + */ + + struct item_operations { +- int (*bytes_number) (struct item_head * ih, int block_size); +- void (*decrement_key) (struct cpu_key *); +- int (*is_left_mergeable) (struct reiserfs_key * ih, ++ int (* const bytes_number) (struct item_head * ih, int block_size); ++ void (* const decrement_key) (struct cpu_key *); ++ int (* const is_left_mergeable) (struct reiserfs_key * ih, + unsigned long bsize); +- void (*print_item) (struct item_head *, char *item); +- void (*check_item) (struct item_head *, char *item); ++ void (* const print_item) (struct item_head *, char *item); ++ void (* const check_item) (struct item_head *, char *item); + +- int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi, ++ int (* const create_vi) (struct virtual_node * vn, struct virtual_item * vi, + int is_affected, int insert_size); +- int (*check_left) (struct virtual_item * vi, int free, ++ int (* const check_left) (struct virtual_item * vi, int free, + int start_skip, int end_skip); +- int (*check_right) (struct virtual_item * vi, int free); +- int (*part_size) (struct virtual_item * vi, int from, int to); +- int (*unit_num) (struct virtual_item * vi); +- void (*print_vi) (struct virtual_item * vi); ++ int (* const check_right) (struct virtual_item * vi, int free); ++ int (* const part_size) (struct virtual_item * vi, int from, int to); ++ int (* const unit_num) (struct virtual_item * vi); ++ void (* const print_vi) (struct virtual_item * vi); + }; + +-extern struct item_operations *item_ops[TYPE_ANY + 1]; ++extern const struct item_operations * const item_ops[TYPE_ANY + 1]; + + #define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize) + #define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize) +diff -urNp linux-2.6.33.1/include/linux/reiserfs_fs_sb.h linux-2.6.33.1/include/linux/reiserfs_fs_sb.h +--- linux-2.6.33.1/include/linux/reiserfs_fs_sb.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/reiserfs_fs_sb.h 2010-03-20 16:58:41.920592896 -0400 +@@ -386,7 +386,7 @@ struct reiserfs_sb_info { + /* Comment? -Hans */ + wait_queue_head_t s_wait; + /* To be obsoleted soon by per buffer seals.. -Hans */ +- atomic_t s_generation_counter; // increased by one every time the ++ atomic_unchecked_t s_generation_counter; // increased by one every time the + // tree gets re-balanced + unsigned long s_properties; /* File system properties. 
Currently holds + on-disk FS format */ +diff -urNp linux-2.6.33.1/include/linux/sched.h linux-2.6.33.1/include/linux/sched.h +--- linux-2.6.33.1/include/linux/sched.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/sched.h 2010-03-20 16:58:41.920592896 -0400 +@@ -101,6 +101,7 @@ struct bio; + struct fs_struct; + struct bts_context; + struct perf_event_context; ++struct linux_binprm; + + /* + * List of flags we want to share for kernel threads, +@@ -678,6 +679,15 @@ struct signal_struct { + struct tty_audit_buf *tty_audit_buf; + #endif + ++#ifdef CONFIG_GRKERNSEC ++ u32 curr_ip; ++ u32 gr_saddr; ++ u32 gr_daddr; ++ u16 gr_sport; ++ u16 gr_dport; ++ u8 used_accept:1; ++#endif ++ + int oom_adj; /* OOM kill score adjustment (bit shift) */ + }; + +@@ -1234,7 +1244,7 @@ struct rcu_node; + + struct task_struct { + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ +- void *stack; ++ struct thread_info *stack; + atomic_t usage; + unsigned int flags; /* per process flags, defined below */ + unsigned int ptrace; +@@ -1346,8 +1356,8 @@ struct task_struct { + struct list_head thread_group; + + struct completion *vfork_done; /* for vfork() */ +- int __user *set_child_tid; /* CLONE_CHILD_SETTID */ +- int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ ++ pid_t __user *set_child_tid; /* CLONE_CHILD_SETTID */ ++ pid_t __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + + cputime_t utime, stime, utimescaled, stimescaled; + cputime_t gtime; +@@ -1363,16 +1373,6 @@ struct task_struct { + struct task_cputime cputime_expires; + struct list_head cpu_timers[3]; + +-/* process credentials */ +- const struct cred *real_cred; /* objective and real subjective task +- * credentials (COW) */ +- const struct cred *cred; /* effective (overridable) subjective task +- * credentials (COW) */ +- struct mutex cred_guard_mutex; /* guard against foreign influences on +- * credential calculations +- * (notably. ptrace) */ +- struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */ +- + char comm[TASK_COMM_LEN]; /* executable name excluding path + - access with [gs]et_task_comm (which lock + it with task_lock()) +@@ -1456,6 +1456,15 @@ struct task_struct { + int softirqs_enabled; + int softirq_context; + #endif ++ ++/* process credentials */ ++ const struct cred *real_cred; /* objective and real subjective task ++ * credentials (COW) */ ++ struct mutex cred_guard_mutex; /* guard against foreign influences on ++ * credential calculations ++ * (notably. 
ptrace) */ ++ struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */ ++ + #ifdef CONFIG_LOCKDEP + # define MAX_LOCK_DEPTH 48UL + u64 curr_chain_key; +@@ -1476,6 +1485,9 @@ struct task_struct { + + struct backing_dev_info *backing_dev_info; + ++ const struct cred *cred; /* effective (overridable) subjective task ++ * credentials (COW) */ ++ + struct io_context *io_context; + + unsigned long ptrace_message; +@@ -1539,6 +1551,19 @@ struct task_struct { + unsigned long default_timer_slack_ns; + + struct list_head *scm_work_list; ++ ++#ifdef CONFIG_GRKERNSEC ++ /* grsecurity */ ++ rwlock_t gr_fs_lock; ++ struct acl_subject_label *acl; ++ struct acl_role_label *role; ++ struct file *exec_file; ++ u16 acl_role_id; ++ u8 acl_sp_role; ++ u8 is_writable; ++ u8 brute; ++#endif ++ + #ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; +@@ -1571,6 +1596,52 @@ struct task_struct { + #endif + }; + ++#define MF_PAX_PAGEEXEC 0x01000000 /* Paging based non-executable pages */ ++#define MF_PAX_EMUTRAMP 0x02000000 /* Emulate trampolines */ ++#define MF_PAX_MPROTECT 0x04000000 /* Restrict mprotect() */ ++#define MF_PAX_RANDMMAP 0x08000000 /* Randomize mmap() base */ ++/*#define MF_PAX_RANDEXEC 0x10000000*/ /* Randomize ET_EXEC base */ ++#define MF_PAX_SEGMEXEC 0x20000000 /* Segmentation based non-executable pages */ ++ ++#ifdef CONFIG_PAX_SOFTMODE ++extern unsigned int pax_softmode; ++#endif ++ ++extern int pax_check_flags(unsigned long *); ++ ++/* if tsk != current then task_lock must be held on it */ ++#if defined(CONFIG_PAX_NOEXEC) || defined(CONFIG_PAX_ASLR) ++static inline unsigned long pax_get_flags(struct task_struct *tsk) ++{ ++ if (likely(tsk->mm)) ++ return tsk->mm->pax_flags; ++ else ++ return 0UL; ++} ++ ++/* if tsk != current then task_lock must be held on it */ ++static inline long pax_set_flags(struct task_struct *tsk, unsigned long flags) ++{ ++ if (likely(tsk->mm)) { ++ tsk->mm->pax_flags = flags; ++ return 0; ++ } ++ return -EINVAL; ++} ++#endif ++ ++#ifdef CONFIG_PAX_HAVE_ACL_FLAGS ++extern void pax_set_initial_flags(struct linux_binprm *bprm); ++#elif defined(CONFIG_PAX_HOOK_ACL_FLAGS) ++extern void (*pax_set_initial_flags_func)(struct linux_binprm *bprm); ++#endif ++ ++void pax_report_fault(struct pt_regs *regs, void *pc, void *sp); ++void pax_report_insns(void *pc, void *sp); ++void pax_report_refcount_overflow(struct pt_regs *regs); ++void pax_report_leak_to_user(const void *ptr, unsigned long len); ++void pax_report_overflow_from_user(const void *ptr, unsigned long len); ++ + /* Future-safe accessor for struct task_struct's cpus_allowed. 
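
The MF_PAX_* bits above occupy the top byte of a per-mm flags word, and
pax_get_flags()/pax_set_flags() reach them only through the task's mm,
returning 0UL or -EINVAL for kernel threads that have none. A rough
userspace analog of that accessor pattern (mm_struct and task_struct here
are trimmed stand-ins, not the kernel definitions):

#include <stdio.h>

#define MF_PAX_PAGEEXEC 0x01000000UL    /* paging based non-exec pages */
#define MF_PAX_MPROTECT 0x04000000UL    /* restrict mprotect() */
#define MF_PAX_RANDMMAP 0x08000000UL    /* randomize mmap() base */

struct mm_struct   { unsigned long pax_flags; };
struct task_struct { struct mm_struct *mm; };

static unsigned long pax_get_flags(struct task_struct *tsk)
{
        return tsk->mm ? tsk->mm->pax_flags : 0UL;
}

static int pax_set_flags(struct task_struct *tsk, unsigned long flags)
{
        if (!tsk->mm)
                return -1;              /* -EINVAL in the hunk */
        tsk->mm->pax_flags = flags;
        return 0;
}

int main(void)
{
        struct mm_struct mm = { 0 };
        struct task_struct t = { &mm };

        pax_set_flags(&t, MF_PAX_PAGEEXEC | MF_PAX_RANDMMAP);
        printf("mprotect restricted: %s\n",
               pax_get_flags(&t) & MF_PAX_MPROTECT ? "yes" : "no");
        return 0;
}
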
*/ + #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) + +@@ -2172,7 +2243,7 @@ extern void __cleanup_sighand(struct sig + extern void exit_itimers(struct signal_struct *); + extern void flush_itimer_signals(void); + +-extern NORET_TYPE void do_group_exit(int); ++extern NORET_TYPE void do_group_exit(int) ATTRIB_NORET; + + extern void daemonize(const char *, ...); + extern int allow_signal(int); +@@ -2274,6 +2345,33 @@ static inline void task_unlock(struct ta + spin_unlock(&p->alloc_lock); + } + ++/* grsec: protects only ->fs as task_lock is overkill and we can't ++ be using a spin_lock in interrupt context ++*/ ++#ifdef CONFIG_GRKERNSEC ++#define gr_fs_write_lock_irqsave(x, y) \ ++ write_lock_irqsave(&x->gr_fs_lock, y) ++#define gr_fs_write_unlock_irqrestore(x, y) \ ++ write_unlock_irqrestore(&x->gr_fs_lock, y) ++#else ++#define gr_fs_write_lock_irqsave(x, y) ++#define gr_fs_write_unlock_irqrestore(x, y) ++#endif ++ ++static inline void gr_fs_read_lock(struct task_struct *p) ++{ ++#ifdef CONFIG_GRKERNSEC ++ read_lock(&p->gr_fs_lock); ++#endif ++} ++ ++static inline void gr_fs_read_unlock(struct task_struct *p) ++{ ++#ifdef CONFIG_GRKERNSEC ++ read_unlock(&p->gr_fs_lock); ++#endif ++} ++ + extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, + unsigned long *flags); + +@@ -2285,8 +2383,8 @@ static inline void unlock_task_sighand(s + + #ifndef __HAVE_THREAD_FUNCTIONS + +-#define task_thread_info(task) ((struct thread_info *)(task)->stack) +-#define task_stack_page(task) ((task)->stack) ++#define task_thread_info(task) ((task)->stack) ++#define task_stack_page(task) ((void *)(task)->stack) + + static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) + { +@@ -2301,13 +2399,31 @@ static inline unsigned long *end_of_stac + + #endif + +-static inline int object_is_on_stack(void *obj) ++static inline int object_starts_on_stack(void *obj) + { +- void *stack = task_stack_page(current); ++ const void *stack = task_stack_page(current); + + return (obj >= stack) && (obj < (stack + THREAD_SIZE)); + } + ++/* 0: not at all, 1: fully, -1: partially (implies an error) */ ++static inline int object_is_on_stack(const void *obj, unsigned long len) ++{ ++ const void *stack = task_stack_page(current); ++ const void *stackend = stack + THREAD_SIZE; ++ ++ if (obj + len < obj) ++ return -1; ++ ++ if (stack <= obj && obj + len <= stackend) ++ return 1; ++ ++ if (obj + len <= stack || stackend <= obj) ++ return 0; ++ ++ return -1; ++} ++ + extern void thread_info_cache_init(void); + + #ifdef CONFIG_DEBUG_STACK_USAGE +diff -urNp linux-2.6.33.1/include/linux/screen_info.h linux-2.6.33.1/include/linux/screen_info.h +--- linux-2.6.33.1/include/linux/screen_info.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/screen_info.h 2010-03-20 16:58:41.920592896 -0400 +@@ -43,7 +43,8 @@ struct screen_info { + __u16 pages; /* 0x32 */ + __u16 vesa_attributes; /* 0x34 */ + __u32 capabilities; /* 0x36 */ +- __u8 _reserved[6]; /* 0x3a */ ++ __u16 vesapm_size; /* 0x3a */ ++ __u8 _reserved[4]; /* 0x3c */ + } __attribute__((packed)); + + #define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ +diff -urNp linux-2.6.33.1/include/linux/security.h linux-2.6.33.1/include/linux/security.h +--- linux-2.6.33.1/include/linux/security.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/security.h 2010-03-20 16:58:41.924637706 -0400 +@@ -34,6 +34,7 @@ + #include <linux/key.h> + #include <linux/xfrm.h> + #include <linux/gfp.h> ++#include 
<linux/grsecurity.h> + #include <net/flow.h> + + /* Maximum number of letters for an LSM name string */ +diff -urNp linux-2.6.33.1/include/linux/shm.h linux-2.6.33.1/include/linux/shm.h +--- linux-2.6.33.1/include/linux/shm.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/shm.h 2010-03-20 16:58:41.924637706 -0400 +@@ -95,6 +95,10 @@ struct shmid_kernel /* private to the ke + pid_t shm_cprid; + pid_t shm_lprid; + struct user_struct *mlock_user; ++#ifdef CONFIG_GRKERNSEC ++ time_t shm_createtime; ++ pid_t shm_lapid; ++#endif + }; + + /* shm_mode upper byte flags */ +diff -urNp linux-2.6.33.1/include/linux/slab.h linux-2.6.33.1/include/linux/slab.h +--- linux-2.6.33.1/include/linux/slab.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/slab.h 2010-03-20 16:58:41.924637706 -0400 +@@ -11,6 +11,7 @@ + + #include <linux/gfp.h> + #include <linux/types.h> ++#include <linux/err.h> + + /* + * Flags to pass to kmem_cache_create(). +@@ -82,10 +83,13 @@ + * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can. + * Both make kfree a no-op. + */ +-#define ZERO_SIZE_PTR ((void *)16) ++#define ZERO_SIZE_PTR \ ++({ \ ++ BUILD_BUG_ON(!(MAX_ERRNO & ~PAGE_MASK));\ ++ (void *)(-MAX_ERRNO-1L); \ ++}) + +-#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ +- (unsigned long)ZERO_SIZE_PTR) ++#define ZERO_OR_NULL_PTR(x) (!(x) || (x) == ZERO_SIZE_PTR) + + /* + * struct kmem_cache related prototypes +@@ -138,6 +142,7 @@ void * __must_check krealloc(const void + void kfree(const void *); + void kzfree(const void *); + size_t ksize(const void *); ++void check_object_size(const void *ptr, unsigned long n, bool to); + + /* + * Allocator specific definitions. These are mainly used to establish optimized +@@ -328,4 +333,37 @@ static inline void *kzalloc_node(size_t + + void __init kmem_cache_init_late(void); + ++#define kmalloc(x, y) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "kmalloc size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = kmalloc((size_t)___x, (y)); \ ++ ___retval; \ ++}) ++ ++#define kmalloc_node(x, y, z) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "kmalloc_node size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = kmalloc_node((size_t)___x, (y), (z));\ ++ ___retval; \ ++}) ++ ++#define kzalloc(x, y) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "kzalloc size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = kzalloc((size_t)___x, (y)); \ ++ ___retval; \ ++}) ++ + #endif /* _LINUX_SLAB_H */ +diff -urNp linux-2.6.33.1/include/linux/slub_def.h linux-2.6.33.1/include/linux/slub_def.h +--- linux-2.6.33.1/include/linux/slub_def.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/slub_def.h 2010-03-20 16:58:41.924637706 -0400 +@@ -86,7 +86,7 @@ struct kmem_cache { + struct kmem_cache_order_objects max; + struct kmem_cache_order_objects min; + gfp_t allocflags; /* gfp flags to use on each alloc */ +- int refcount; /* Refcount for slab cache destroy */ ++ atomic_t refcount; /* Refcount for slab cache destroy */ + void (*ctor)(void *); + int inuse; /* Offset to metadata */ + int align; /* Alignment */ +diff -urNp linux-2.6.33.1/include/linux/sonet.h linux-2.6.33.1/include/linux/sonet.h +--- linux-2.6.33.1/include/linux/sonet.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/sonet.h 
2010-03-20 16:58:41.924637706 -0400 +@@ -61,7 +61,7 @@ struct sonet_stats { + #include <asm/atomic.h> + + struct k_sonet_stats { +-#define __HANDLE_ITEM(i) atomic_t i ++#define __HANDLE_ITEM(i) atomic_unchecked_t i + __SONET_ITEMS + #undef __HANDLE_ITEM + }; +diff -urNp linux-2.6.33.1/include/linux/suspend.h linux-2.6.33.1/include/linux/suspend.h +--- linux-2.6.33.1/include/linux/suspend.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/suspend.h 2010-03-20 16:58:41.924637706 -0400 +@@ -104,15 +104,15 @@ typedef int __bitwise suspend_state_t; + * which require special recovery actions in that situation. + */ + struct platform_suspend_ops { +- int (*valid)(suspend_state_t state); +- int (*begin)(suspend_state_t state); +- int (*prepare)(void); +- int (*prepare_late)(void); +- int (*enter)(suspend_state_t state); +- void (*wake)(void); +- void (*finish)(void); +- void (*end)(void); +- void (*recover)(void); ++ int (* const valid)(suspend_state_t state); ++ int (* const begin)(suspend_state_t state); ++ int (* const prepare)(void); ++ int (* const prepare_late)(void); ++ int (* const enter)(suspend_state_t state); ++ void (* const wake)(void); ++ void (* const finish)(void); ++ void (* const end)(void); ++ void (* const recover)(void); + }; + + #ifdef CONFIG_SUSPEND +@@ -120,7 +120,7 @@ struct platform_suspend_ops { + * suspend_set_ops - set platform dependent suspend operations + * @ops: The new suspend operations to set. + */ +-extern void suspend_set_ops(struct platform_suspend_ops *ops); ++extern void suspend_set_ops(const struct platform_suspend_ops *ops); + extern int suspend_valid_only_mem(suspend_state_t state); + + /** +@@ -145,7 +145,7 @@ extern int pm_suspend(suspend_state_t st + #else /* !CONFIG_SUSPEND */ + #define suspend_valid_only_mem NULL + +-static inline void suspend_set_ops(struct platform_suspend_ops *ops) {} ++static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} + static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } + #endif /* !CONFIG_SUSPEND */ + +@@ -215,16 +215,16 @@ extern void mark_free_pages(struct zone + * platforms which require special recovery actions in that situation. 
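
The kmalloc/kzalloc wrappers in the slab.h hunk above (and the vmalloc
family further down) use a GCC statement expression plus a wider
intoverflow_t to reject sizes that would wrap before they reach the
allocator; since a macro never expands itself recursively, the inner call
still lands on the real function. A userspace sketch of the pattern around
malloc, with a hypothetical 32-bit size cap so the check is visible on any
host:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef long long intoverflow_t;        /* wider than the demo size type */
#define DEMO_SIZE_MAX ((intoverflow_t)UINT32_MAX)

#define checked_malloc(x)                                       \
({                                                              \
        void *___retval;                                        \
        intoverflow_t ___x = (intoverflow_t)(x);                \
        if (___x < 0 || ___x > DEMO_SIZE_MAX) {                 \
                fprintf(stderr, "malloc size overflow\n");      \
                ___retval = NULL;                               \
        } else                                                  \
                ___retval = malloc((size_t)___x);               \
        ___retval;                                              \
})

int main(void)
{
        void *ok  = checked_malloc(5 * 1024);
        void *bad = checked_malloc((intoverflow_t)1 << 36); /* rejected */

        printf("ok=%p bad=%p\n", ok, bad);
        free(ok);
        return 0;
}
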
+ */ + struct platform_hibernation_ops { +- int (*begin)(void); +- void (*end)(void); +- int (*pre_snapshot)(void); +- void (*finish)(void); +- int (*prepare)(void); +- int (*enter)(void); +- void (*leave)(void); +- int (*pre_restore)(void); +- void (*restore_cleanup)(void); +- void (*recover)(void); ++ int (* const begin)(void); ++ void (* const end)(void); ++ int (* const pre_snapshot)(void); ++ void (* const finish)(void); ++ int (* const prepare)(void); ++ int (* const enter)(void); ++ void (* const leave)(void); ++ int (* const pre_restore)(void); ++ void (* const restore_cleanup)(void); ++ void (* const recover)(void); + }; + + #ifdef CONFIG_HIBERNATION +@@ -243,7 +243,7 @@ extern void swsusp_set_page_free(struct + extern void swsusp_unset_page_free(struct page *); + extern unsigned long get_safe_page(gfp_t gfp_mask); + +-extern void hibernation_set_ops(struct platform_hibernation_ops *ops); ++extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); + extern int hibernate(void); + extern bool system_entering_hibernation(void); + #else /* CONFIG_HIBERNATION */ +@@ -251,7 +251,7 @@ static inline int swsusp_page_is_forbidd + static inline void swsusp_set_page_free(struct page *p) {} + static inline void swsusp_unset_page_free(struct page *p) {} + +-static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} ++static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} + static inline int hibernate(void) { return -ENOSYS; } + static inline bool system_entering_hibernation(void) { return false; } + #endif /* CONFIG_HIBERNATION */ +diff -urNp linux-2.6.33.1/include/linux/sysctl.h linux-2.6.33.1/include/linux/sysctl.h +--- linux-2.6.33.1/include/linux/sysctl.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/sysctl.h 2010-03-20 16:58:41.924637706 -0400 +@@ -155,7 +155,11 @@ enum + KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ + }; + +- ++#ifdef CONFIG_PAX_SOFTMODE ++enum { ++ PAX_SOFTMODE=1 /* PaX: disable/enable soft mode */ ++}; ++#endif + + /* CTL_VM names: */ + enum +diff -urNp linux-2.6.33.1/include/linux/sysfs.h linux-2.6.33.1/include/linux/sysfs.h +--- linux-2.6.33.1/include/linux/sysfs.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/sysfs.h 2010-03-20 16:58:41.924637706 -0400 +@@ -75,8 +75,8 @@ struct bin_attribute { + }; + + struct sysfs_ops { +- ssize_t (*show)(struct kobject *, struct attribute *,char *); +- ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t); ++ ssize_t (* const show)(struct kobject *, struct attribute *,char *); ++ ssize_t (* const store)(struct kobject *,struct attribute *,const char *, size_t); + }; + + struct sysfs_dirent; +diff -urNp linux-2.6.33.1/include/linux/thread_info.h linux-2.6.33.1/include/linux/thread_info.h +--- linux-2.6.33.1/include/linux/thread_info.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/thread_info.h 2010-03-20 16:58:41.924637706 -0400 +@@ -23,7 +23,7 @@ struct restart_block { + }; + /* For futex_wait and futex_wait_requeue_pi */ + struct { +- u32 *uaddr; ++ u32 __user *uaddr; + u32 val; + u32 flags; + u32 bitset; +diff -urNp linux-2.6.33.1/include/linux/tty.h linux-2.6.33.1/include/linux/tty.h +--- linux-2.6.33.1/include/linux/tty.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/tty.h 2010-03-20 16:58:41.924637706 -0400 +@@ -13,6 +13,7 @@ + #include <linux/tty_driver.h> + #include <linux/tty_ldisc.h> + #include <linux/mutex.h> 
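
The suspend, hibernation and sysfs hunks above (and neigh_ops plus
snd_ac97_build_ops later on) all apply the same hardening: each function
pointer in an ops structure becomes (* const ...), so a fully populated
table can be placed in read-only memory and a runtime write to a callback
slot is a compile error or a fault instead of a hijacked hook. Minimal
illustration of the difference:

#include <stdio.h>

struct ops {
        int  (* const enter)(int state); /* the pointer itself is immutable */
        void (* const wake)(void);
};

static int  demo_enter(int state) { printf("enter %d\n", state); return 0; }
static void demo_wake(void)       { printf("wake\n"); }

static const struct ops demo_ops = {   /* whole table can live in .rodata */
        .enter = demo_enter,
        .wake  = demo_wake,
};

int main(void)
{
        demo_ops.enter(3);
        demo_ops.wake();
        /* demo_ops.enter = NULL;   <- rejected by the compiler */
        return 0;
}
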
++#include <linux/poll.h> + + #include <asm/system.h> + +@@ -440,7 +441,6 @@ extern int tty_perform_flush(struct tty_ + extern dev_t tty_devnum(struct tty_struct *tty); + extern void proc_clear_tty(struct task_struct *p); + extern struct tty_struct *get_current_tty(void); +-extern void tty_default_fops(struct file_operations *fops); + extern struct tty_struct *alloc_tty_struct(void); + extern void free_tty_struct(struct tty_struct *tty); + extern void initialize_tty_struct(struct tty_struct *tty, +@@ -501,6 +501,18 @@ extern void tty_ldisc_begin(void); + /* This last one is just for the tty layer internals and shouldn't be used elsewhere */ + extern void tty_ldisc_enable(struct tty_struct *tty); + ++/* tty_io.c */ ++extern ssize_t tty_read(struct file *, char __user *, size_t, loff_t *); ++extern ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *); ++extern unsigned int tty_poll(struct file *, poll_table *); ++#ifdef CONFIG_COMPAT ++extern long tty_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg); ++#else ++#define tty_compat_ioctl NULL ++#endif ++extern int tty_release(struct inode *, struct file *); ++extern int tty_fasync(int fd, struct file *filp, int on); + + /* n_tty.c */ + extern struct tty_ldisc_ops tty_ldisc_N_TTY; +diff -urNp linux-2.6.33.1/include/linux/tty_ldisc.h linux-2.6.33.1/include/linux/tty_ldisc.h +--- linux-2.6.33.1/include/linux/tty_ldisc.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/tty_ldisc.h 2010-03-20 16:58:41.924637706 -0400 +@@ -139,7 +139,7 @@ struct tty_ldisc_ops { + + struct module *owner; + +- int refcount; ++ atomic_t refcount; + }; + + struct tty_ldisc { +diff -urNp linux-2.6.33.1/include/linux/types.h linux-2.6.33.1/include/linux/types.h +--- linux-2.6.33.1/include/linux/types.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/types.h 2010-03-20 16:58:41.924637706 -0400 +@@ -191,10 +191,26 @@ typedef struct { + volatile int counter; + } atomic_t; + ++#ifdef CONFIG_PAX_REFCOUNT ++typedef struct { ++ volatile int counter; ++} atomic_unchecked_t; ++#else ++typedef atomic_t atomic_unchecked_t; ++#endif ++ + #ifdef CONFIG_64BIT + typedef struct { + volatile long counter; + } atomic64_t; ++ ++#ifdef CONFIG_PAX_REFCOUNT ++typedef struct { ++ volatile long counter; ++} atomic64_unchecked_t; ++#else ++typedef atomic64_t atomic64_unchecked_t; ++#endif + #endif + + struct ustat { +diff -urNp linux-2.6.33.1/include/linux/uaccess.h linux-2.6.33.1/include/linux/uaccess.h +--- linux-2.6.33.1/include/linux/uaccess.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/uaccess.h 2010-03-20 16:58:41.924637706 -0400 +@@ -76,11 +76,11 @@ static inline unsigned long __copy_from_ + long ret; \ + mm_segment_t old_fs = get_fs(); \ + \ +- set_fs(KERNEL_DS); \ + pagefault_disable(); \ ++ set_fs(KERNEL_DS); \ + ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ +- pagefault_enable(); \ + set_fs(old_fs); \ ++ pagefault_enable(); \ + ret; \ + }) + +@@ -93,8 +93,8 @@ static inline unsigned long __copy_from_ + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. 
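
The types.h hunk above carries the core of PAX_REFCOUNT: atomic_t becomes
the overflow-checked type, while atomic_unchecked_t is the explicit opt-out
for counters where wrapping is harmless (statistics, generation numbers
such as reiserfs' s_generation_counter earlier in this patch). With the
option off, the opt-out collapses back into a plain typedef. A sketch of
just the type-level split, without any real overflow instrumentation:

#include <stdio.h>

#define CONFIG_PAX_REFCOUNT 1           /* pretend .config choice */

typedef struct { volatile int counter; } atomic_t;

#ifdef CONFIG_PAX_REFCOUNT
/* distinct type: checked and unchecked counters cannot be mixed up */
typedef struct { volatile int counter; } atomic_unchecked_t;
#else
typedef atomic_t atomic_unchecked_t;
#endif

static int atomic_read_unchecked(const atomic_unchecked_t *v)
{
        return v->counter;
}

int main(void)
{
        atomic_unchecked_t gen = { 0 };

        gen.counter++;                  /* wrapping here would be benign */
        printf("generation %d\n", atomic_read_unchecked(&gen));
        return 0;
}
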
+ */ +-extern long probe_kernel_read(void *dst, void *src, size_t size); +-extern long __probe_kernel_read(void *dst, void *src, size_t size); ++extern long probe_kernel_read(void *dst, const void *src, size_t size); ++extern long __probe_kernel_read(void *dst, const void *src, size_t size); + + /* + * probe_kernel_write(): safely attempt to write to a location +@@ -105,7 +105,7 @@ extern long __probe_kernel_read(void *ds + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ +-extern long notrace probe_kernel_write(void *dst, void *src, size_t size); +-extern long notrace __probe_kernel_write(void *dst, void *src, size_t size); ++extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); ++extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); + + #endif /* __LINUX_UACCESS_H__ */ +diff -urNp linux-2.6.33.1/include/linux/vmalloc.h linux-2.6.33.1/include/linux/vmalloc.h +--- linux-2.6.33.1/include/linux/vmalloc.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/linux/vmalloc.h 2010-03-20 16:58:41.928524770 -0400 +@@ -13,6 +13,11 @@ struct vm_area_struct; /* vma defining + #define VM_MAP 0x00000004 /* vmap()ed pages */ + #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ + #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ ++ ++#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++#define VM_KERNEXEC 0x00000020 /* allocate from executable kernel memory range */ ++#endif ++ + /* bits [20..32] reserved for arch specific ioremap internals */ + + /* +@@ -121,4 +126,81 @@ struct vm_struct **pcpu_get_vm_areas(con + + void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); + ++#define vmalloc(x) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "vmalloc size overflow\n")) \ ++ ___retval = NULL; \ ++ else \ ++ ___retval = vmalloc((unsigned long)___x); \ ++ ___retval; \ ++}) ++ ++#define __vmalloc(x, y, z) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "__vmalloc size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = __vmalloc((unsigned long)___x, (y), (z));\ ++ ___retval; \ ++}) ++ ++#define vmalloc_user(x) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "vmalloc_user size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = vmalloc_user((unsigned long)___x); \ ++ ___retval; \ ++}) ++ ++#define vmalloc_exec(x) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "vmalloc_exec size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = vmalloc_exec((unsigned long)___x); \ ++ ___retval; \ ++}) ++ ++#define vmalloc_node(x, y) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "vmalloc_node size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = vmalloc_node((unsigned long)___x, (y));\ ++ ___retval; \ ++}) ++ ++#define vmalloc_32(x) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if (WARN(___x > ULONG_MAX, "vmalloc_32 size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = vmalloc_32((unsigned long)___x); \ ++ ___retval; \ ++}) ++ ++#define vmalloc_32_user(x) \ ++({ \ ++ void *___retval; \ ++ intoverflow_t ___x = (intoverflow_t)x; \ ++ if 
(WARN(___x > ULONG_MAX, "vmalloc_32_user size overflow\n"))\ ++ ___retval = NULL; \ ++ else \ ++ ___retval = vmalloc_32_user((unsigned long)___x);\ ++ ___retval; \ ++}) ++ + #endif /* _LINUX_VMALLOC_H */ +diff -urNp linux-2.6.33.1/include/net/irda/ircomm_tty.h linux-2.6.33.1/include/net/irda/ircomm_tty.h +--- linux-2.6.33.1/include/net/irda/ircomm_tty.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/net/irda/ircomm_tty.h 2010-03-20 16:58:41.928524770 -0400 +@@ -105,8 +105,8 @@ struct ircomm_tty_cb { + unsigned short close_delay; + unsigned short closing_wait; /* time to wait before closing */ + +- int open_count; +- int blocked_open; /* # of blocked opens */ ++ atomic_t open_count; ++ atomic_t blocked_open; /* # of blocked opens */ + + /* Protect concurent access to : + * o self->open_count +diff -urNp linux-2.6.33.1/include/net/neighbour.h linux-2.6.33.1/include/net/neighbour.h +--- linux-2.6.33.1/include/net/neighbour.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/net/neighbour.h 2010-03-20 16:58:41.928524770 -0400 +@@ -116,12 +116,12 @@ struct neighbour { + + struct neigh_ops { + int family; +- void (*solicit)(struct neighbour *, struct sk_buff*); +- void (*error_report)(struct neighbour *, struct sk_buff*); +- int (*output)(struct sk_buff*); +- int (*connected_output)(struct sk_buff*); +- int (*hh_output)(struct sk_buff*); +- int (*queue_xmit)(struct sk_buff*); ++ void (* const solicit)(struct neighbour *, struct sk_buff*); ++ void (* const error_report)(struct neighbour *, struct sk_buff*); ++ int (* const output)(struct sk_buff*); ++ int (* const connected_output)(struct sk_buff*); ++ int (* const hh_output)(struct sk_buff*); ++ int (* const queue_xmit)(struct sk_buff*); + }; + + struct pneigh_entry { +diff -urNp linux-2.6.33.1/include/net/sctp/sctp.h linux-2.6.33.1/include/net/sctp/sctp.h +--- linux-2.6.33.1/include/net/sctp/sctp.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/net/sctp/sctp.h 2010-03-20 16:58:41.928524770 -0400 +@@ -304,8 +304,8 @@ extern int sctp_debug_flag; + + #else /* SCTP_DEBUG */ + +-#define SCTP_DEBUG_PRINTK(whatever...) +-#define SCTP_DEBUG_PRINTK_IPADDR(whatever...) ++#define SCTP_DEBUG_PRINTK(whatever...) do {} while (0) ++#define SCTP_DEBUG_PRINTK_IPADDR(whatever...) 
do {} while (0) + #define SCTP_ENABLE_DEBUG + #define SCTP_DISABLE_DEBUG + #define SCTP_ASSERT(expr, str, func) +diff -urNp linux-2.6.33.1/include/net/tcp.h linux-2.6.33.1/include/net/tcp.h +--- linux-2.6.33.1/include/net/tcp.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/net/tcp.h 2010-03-20 16:58:41.928524770 -0400 +@@ -1392,6 +1392,7 @@ enum tcp_seq_states { + struct tcp_seq_afinfo { + char *name; + sa_family_t family; ++ /* cannot be const */ + struct file_operations seq_fops; + struct seq_operations seq_ops; + }; +diff -urNp linux-2.6.33.1/include/net/udp.h linux-2.6.33.1/include/net/udp.h +--- linux-2.6.33.1/include/net/udp.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/net/udp.h 2010-03-20 16:58:41.928524770 -0400 +@@ -221,6 +221,7 @@ struct udp_seq_afinfo { + char *name; + sa_family_t family; + struct udp_table *udp_table; ++ /* cannot be const */ + struct file_operations seq_fops; + struct seq_operations seq_ops; + }; +diff -urNp linux-2.6.33.1/include/sound/ac97_codec.h linux-2.6.33.1/include/sound/ac97_codec.h +--- linux-2.6.33.1/include/sound/ac97_codec.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/sound/ac97_codec.h 2010-03-20 16:58:41.928524770 -0400 +@@ -419,15 +419,15 @@ + struct snd_ac97; + + struct snd_ac97_build_ops { +- int (*build_3d) (struct snd_ac97 *ac97); +- int (*build_specific) (struct snd_ac97 *ac97); +- int (*build_spdif) (struct snd_ac97 *ac97); +- int (*build_post_spdif) (struct snd_ac97 *ac97); ++ int (* const build_3d) (struct snd_ac97 *ac97); ++ int (* const build_specific) (struct snd_ac97 *ac97); ++ int (* const build_spdif) (struct snd_ac97 *ac97); ++ int (* const build_post_spdif) (struct snd_ac97 *ac97); + #ifdef CONFIG_PM +- void (*suspend) (struct snd_ac97 *ac97); +- void (*resume) (struct snd_ac97 *ac97); ++ void (* const suspend) (struct snd_ac97 *ac97); ++ void (* const resume) (struct snd_ac97 *ac97); + #endif +- void (*update_jacks) (struct snd_ac97 *ac97); /* for jack-sharing */ ++ void (* const update_jacks) (struct snd_ac97 *ac97); /* for jack-sharing */ + }; + + struct snd_ac97_bus_ops { +@@ -477,7 +477,7 @@ struct snd_ac97_template { + + struct snd_ac97 { + /* -- lowlevel (hardware) driver specific -- */ +- struct snd_ac97_build_ops * build_ops; ++ const struct snd_ac97_build_ops * build_ops; + void *private_data; + void (*private_free) (struct snd_ac97 *ac97); + /* --- */ +diff -urNp linux-2.6.33.1/include/trace/events/irq.h linux-2.6.33.1/include/trace/events/irq.h +--- linux-2.6.33.1/include/trace/events/irq.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/trace/events/irq.h 2010-03-20 16:58:41.928524770 -0400 +@@ -34,7 +34,7 @@ + */ + TRACE_EVENT(irq_handler_entry, + +- TP_PROTO(int irq, struct irqaction *action), ++ TP_PROTO(int irq, const struct irqaction *action), + + TP_ARGS(irq, action), + +@@ -64,7 +64,7 @@ TRACE_EVENT(irq_handler_entry, + */ + TRACE_EVENT(irq_handler_exit, + +- TP_PROTO(int irq, struct irqaction *action, int ret), ++ TP_PROTO(int irq, const struct irqaction *action, int ret), + + TP_ARGS(irq, action, ret), + +@@ -84,7 +84,7 @@ TRACE_EVENT(irq_handler_exit, + + DECLARE_EVENT_CLASS(softirq, + +- TP_PROTO(struct softirq_action *h, struct softirq_action *vec), ++ TP_PROTO(const struct softirq_action *h, const struct softirq_action *vec), + + TP_ARGS(h, vec), + +@@ -113,7 +113,7 @@ DECLARE_EVENT_CLASS(softirq, + */ + DEFINE_EVENT(softirq, softirq_entry, + +- TP_PROTO(struct softirq_action *h, struct softirq_action *vec), ++ TP_PROTO(const 
struct softirq_action *h, const struct softirq_action *vec), + + TP_ARGS(h, vec) + ); +@@ -131,7 +131,7 @@ DEFINE_EVENT(softirq, softirq_entry, + */ + DEFINE_EVENT(softirq, softirq_exit, + +- TP_PROTO(struct softirq_action *h, struct softirq_action *vec), ++ TP_PROTO(const struct softirq_action *h, const struct softirq_action *vec), + + TP_ARGS(h, vec) + ); +diff -urNp linux-2.6.33.1/include/video/uvesafb.h linux-2.6.33.1/include/video/uvesafb.h +--- linux-2.6.33.1/include/video/uvesafb.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/include/video/uvesafb.h 2010-03-20 16:58:41.928524770 -0400 +@@ -177,6 +177,7 @@ struct uvesafb_par { + u8 ypan; /* 0 - nothing, 1 - ypan, 2 - ywrap */ + u8 pmi_setpal; /* PMI for palette changes */ + u16 *pmi_base; /* protected mode interface location */ ++ u8 *pmi_code; /* protected mode code location */ + void *pmi_start; + void *pmi_pal; + u8 *vbe_state_orig; /* +diff -urNp linux-2.6.33.1/init/do_mounts.c linux-2.6.33.1/init/do_mounts.c +--- linux-2.6.33.1/init/do_mounts.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/do_mounts.c 2010-03-20 16:58:41.936928392 -0400 +@@ -216,11 +216,11 @@ static void __init get_fs_names(char *pa + + static int __init do_mount_root(char *name, char *fs, int flags, void *data) + { +- int err = sys_mount(name, "/root", fs, flags, data); ++ int err = sys_mount((__force char __user *)name, (__force char __user *)"/root", (__force char __user *)fs, flags, (__force void __user *)data); + if (err) + return err; + +- sys_chdir("/root"); ++ sys_chdir((__force char __user *)"/root"); + ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev; + printk("VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", + current->fs->pwd.mnt->mnt_sb->s_type->name, +@@ -311,18 +311,18 @@ void __init change_floppy(char *fmt, ... 
+ va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); +- fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0); ++ fd = sys_open((char __user *)"/dev/root", O_RDWR | O_NDELAY, 0); + if (fd >= 0) { + sys_ioctl(fd, FDEJECT, 0); + sys_close(fd); + } + printk(KERN_NOTICE "VFS: Insert %s and press ENTER\n", buf); +- fd = sys_open("/dev/console", O_RDWR, 0); ++ fd = sys_open((char __user *)"/dev/console", O_RDWR, 0); + if (fd >= 0) { + sys_ioctl(fd, TCGETS, (long)&termios); + termios.c_lflag &= ~ICANON; + sys_ioctl(fd, TCSETSF, (long)&termios); +- sys_read(fd, &c, 1); ++ sys_read(fd, (char __user *)&c, 1); + termios.c_lflag |= ICANON; + sys_ioctl(fd, TCSETSF, (long)&termios); + sys_close(fd); +@@ -416,6 +416,6 @@ void __init prepare_namespace(void) + mount_root(); + out: + devtmpfs_mount("dev"); +- sys_mount(".", "/", NULL, MS_MOVE, NULL); +- sys_chroot("."); ++ sys_mount((__force char __user *)".", (__force char __user *)"/", NULL, MS_MOVE, NULL); ++ sys_chroot((__force char __user *)"."); + } +diff -urNp linux-2.6.33.1/init/do_mounts.h linux-2.6.33.1/init/do_mounts.h +--- linux-2.6.33.1/init/do_mounts.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/do_mounts.h 2010-03-20 16:58:41.973158089 -0400 +@@ -15,15 +15,15 @@ extern int root_mountflags; + + static inline int create_dev(char *name, dev_t dev) + { +- sys_unlink(name); +- return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); ++ sys_unlink((__force char __user *)name); ++ return sys_mknod((__force char __user *)name, S_IFBLK|0600, new_encode_dev(dev)); + } + + #if BITS_PER_LONG == 32 + static inline u32 bstat(char *name) + { + struct stat64 stat; +- if (sys_stat64(name, &stat) != 0) ++ if (sys_stat64((__force char __user *)name, (__force struct stat64 __user *)&stat) != 0) + return 0; + if (!S_ISBLK(stat.st_mode)) + return 0; +diff -urNp linux-2.6.33.1/init/do_mounts_initrd.c linux-2.6.33.1/init/do_mounts_initrd.c +--- linux-2.6.33.1/init/do_mounts_initrd.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/do_mounts_initrd.c 2010-03-20 16:58:41.973158089 -0400 +@@ -32,7 +32,7 @@ static int __init do_linuxrc(void * shel + sys_close(old_fd);sys_close(root_fd); + sys_close(0);sys_close(1);sys_close(2); + sys_setsid(); +- (void) sys_open("/dev/console",O_RDWR,0); ++ (void) sys_open((__force const char __user *)"/dev/console",O_RDWR,0); + (void) sys_dup(0); + (void) sys_dup(0); + return kernel_execve(shell, argv, envp_init); +@@ -47,13 +47,13 @@ static void __init handle_initrd(void) + create_dev("/dev/root.old", Root_RAM0); + /* mount initrd on rootfs' /root */ + mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); +- sys_mkdir("/old", 0700); +- root_fd = sys_open("/", 0, 0); +- old_fd = sys_open("/old", 0, 0); ++ sys_mkdir((__force const char __user *)"/old", 0700); ++ root_fd = sys_open((__force const char __user *)"/", 0, 0); ++ old_fd = sys_open((__force const char __user *)"/old", 0, 0); + /* move initrd over / and chdir/chroot in initrd root */ +- sys_chdir("/root"); +- sys_mount(".", "/", NULL, MS_MOVE, NULL); +- sys_chroot("."); ++ sys_chdir((__force const char __user *)"/root"); ++ sys_mount((__force char __user *)".", (__force char __user *)"/", NULL, MS_MOVE, NULL); ++ sys_chroot((__force const char __user *)"."); + + /* + * In case that a resume from disk is carried out by linuxrc or one of +@@ -70,15 +70,15 @@ static void __init handle_initrd(void) + + /* move initrd to rootfs' /old */ + sys_fchdir(old_fd); +- sys_mount("/", ".", NULL, MS_MOVE, NULL); ++ sys_mount((__force char 
__user *)"/", (__force char __user *)".", NULL, MS_MOVE, NULL); + /* switch root and cwd back to / of rootfs */ + sys_fchdir(root_fd); +- sys_chroot("."); ++ sys_chroot((__force const char __user *)"."); + sys_close(old_fd); + sys_close(root_fd); + + if (new_decode_dev(real_root_dev) == Root_RAM0) { +- sys_chdir("/old"); ++ sys_chdir((__force const char __user *)"/old"); + return; + } + +@@ -86,17 +86,17 @@ static void __init handle_initrd(void) + mount_root(); + + printk(KERN_NOTICE "Trying to move old root to /initrd ... "); +- error = sys_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); ++ error = sys_mount((__force char __user *)"/old", (__force char __user *)"/root/initrd", NULL, MS_MOVE, NULL); + if (!error) + printk("okay\n"); + else { +- int fd = sys_open("/dev/root.old", O_RDWR, 0); ++ int fd = sys_open((__force const char __user *)"/dev/root.old", O_RDWR, 0); + if (error == -ENOENT) + printk("/initrd does not exist. Ignored.\n"); + else + printk("failed\n"); + printk(KERN_NOTICE "Unmounting old root\n"); +- sys_umount("/old", MNT_DETACH); ++ sys_umount((__force char __user *)"/old", MNT_DETACH); + printk(KERN_NOTICE "Trying to free ramdisk memory ... "); + if (fd < 0) { + error = fd; +@@ -119,11 +119,11 @@ int __init initrd_load(void) + * mounted in the normal path. + */ + if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { +- sys_unlink("/initrd.image"); ++ sys_unlink((__force const char __user *)"/initrd.image"); + handle_initrd(); + return 1; + } + } +- sys_unlink("/initrd.image"); ++ sys_unlink((__force const char __user *)"/initrd.image"); + return 0; + } +diff -urNp linux-2.6.33.1/init/do_mounts_md.c linux-2.6.33.1/init/do_mounts_md.c +--- linux-2.6.33.1/init/do_mounts_md.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/do_mounts_md.c 2010-03-20 16:58:41.973158089 -0400 +@@ -170,7 +170,7 @@ static void __init md_setup_drive(void) + partitioned ? 
"_d" : "", minor, + md_setup_args[ent].device_names); + +- fd = sys_open(name, 0, 0); ++ fd = sys_open((__force char __user *)name, 0, 0); + if (fd < 0) { + printk(KERN_ERR "md: open failed - cannot start " + "array %s\n", name); +@@ -233,7 +233,7 @@ static void __init md_setup_drive(void) + * array without it + */ + sys_close(fd); +- fd = sys_open(name, 0, 0); ++ fd = sys_open((__force char __user *)name, 0, 0); + sys_ioctl(fd, BLKRRPART, 0); + } + sys_close(fd); +@@ -283,7 +283,7 @@ static void __init autodetect_raid(void) + + wait_for_device_probe(); + +- fd = sys_open("/dev/md0", 0, 0); ++ fd = sys_open((__force char __user *)"/dev/md0", 0, 0); + if (fd >= 0) { + sys_ioctl(fd, RAID_AUTORUN, raid_autopart); + sys_close(fd); +diff -urNp linux-2.6.33.1/init/initramfs.c linux-2.6.33.1/init/initramfs.c +--- linux-2.6.33.1/init/initramfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/initramfs.c 2010-03-20 16:58:41.973158089 -0400 +@@ -74,7 +74,7 @@ static void __init free_hash(void) + } + } + +-static long __init do_utime(char __user *filename, time_t mtime) ++static long __init do_utime(__force char __user *filename, time_t mtime) + { + struct timespec t[2]; + +@@ -109,7 +109,7 @@ static void __init dir_utime(void) + struct dir_entry *de, *tmp; + list_for_each_entry_safe(de, tmp, &dir_list, list) { + list_del(&de->list); +- do_utime(de->name, de->mtime); ++ do_utime((__force char __user *)de->name, de->mtime); + kfree(de->name); + kfree(de); + } +@@ -271,7 +271,7 @@ static int __init maybe_link(void) + if (nlink >= 2) { + char *old = find_link(major, minor, ino, mode, collected); + if (old) +- return (sys_link(old, collected) < 0) ? -1 : 1; ++ return (sys_link((__force char __user *)old, (__force char __user *)collected) < 0) ? -1 : 1; + } + return 0; + } +@@ -280,11 +280,11 @@ static void __init clean_path(char *path + { + struct stat st; + +- if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) { ++ if (!sys_newlstat((__force char __user *)path, (__force struct stat __user *)&st) && (st.st_mode^mode) & S_IFMT) { + if (S_ISDIR(st.st_mode)) +- sys_rmdir(path); ++ sys_rmdir((__force char __user *)path); + else +- sys_unlink(path); ++ sys_unlink((__force char __user *)path); + } + } + +@@ -305,7 +305,7 @@ static int __init do_name(void) + int openflags = O_WRONLY|O_CREAT; + if (ml != 1) + openflags |= O_TRUNC; +- wfd = sys_open(collected, openflags, mode); ++ wfd = sys_open((__force char __user *)collected, openflags, mode); + + if (wfd >= 0) { + sys_fchown(wfd, uid, gid); +@@ -317,17 +317,17 @@ static int __init do_name(void) + } + } + } else if (S_ISDIR(mode)) { +- sys_mkdir(collected, mode); +- sys_chown(collected, uid, gid); +- sys_chmod(collected, mode); ++ sys_mkdir((__force char __user *)collected, mode); ++ sys_chown((__force char __user *)collected, uid, gid); ++ sys_chmod((__force char __user *)collected, mode); + dir_add(collected, mtime); + } else if (S_ISBLK(mode) || S_ISCHR(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + if (maybe_link() == 0) { +- sys_mknod(collected, mode, rdev); +- sys_chown(collected, uid, gid); +- sys_chmod(collected, mode); +- do_utime(collected, mtime); ++ sys_mknod((__force char __user *)collected, mode, rdev); ++ sys_chown((__force char __user *)collected, uid, gid); ++ sys_chmod((__force char __user *)collected, mode); ++ do_utime((__force char __user *)collected, mtime); + } + } + return 0; +@@ -336,15 +336,15 @@ static int __init do_name(void) + static int __init do_copy(void) + { + if (count >= body_len) { +- sys_write(wfd, 
victim, body_len); ++ sys_write(wfd, (__force char __user *)victim, body_len); + sys_close(wfd); +- do_utime(vcollected, mtime); ++ do_utime((__force char __user *)vcollected, mtime); + kfree(vcollected); + eat(body_len); + state = SkipIt; + return 0; + } else { +- sys_write(wfd, victim, count); ++ sys_write(wfd, (__force char __user *)victim, count); + body_len -= count; + eat(count); + return 1; +@@ -355,9 +355,9 @@ static int __init do_symlink(void) + { + collected[N_ALIGN(name_len) + body_len] = '\0'; + clean_path(collected, 0); +- sys_symlink(collected + N_ALIGN(name_len), collected); +- sys_lchown(collected, uid, gid); +- do_utime(collected, mtime); ++ sys_symlink((__force char __user *)collected + N_ALIGN(name_len), (__force char __user *)collected); ++ sys_lchown((__force char __user *)collected, uid, gid); ++ do_utime((__force char __user *)collected, mtime); + state = SkipIt; + next_state = Reset; + return 0; +diff -urNp linux-2.6.33.1/init/Kconfig linux-2.6.33.1/init/Kconfig +--- linux-2.6.33.1/init/Kconfig 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/Kconfig 2010-03-20 16:58:41.973158089 -0400 +@@ -1046,7 +1046,7 @@ config SLUB_DEBUG + + config COMPAT_BRK + bool "Disable heap randomization" +- default y ++ default n + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). +@@ -1158,9 +1158,9 @@ config HAVE_GENERIC_DMA_COHERENT + + config SLABINFO + bool +- depends on PROC_FS ++ depends on PROC_FS && !GRKERNSEC_PROC_ADD + depends on SLAB || SLUB_DEBUG +- default y ++ default n + + config RT_MUTEXES + boolean +diff -urNp linux-2.6.33.1/init/main.c linux-2.6.33.1/init/main.c +--- linux-2.6.33.1/init/main.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/main.c 2010-03-20 16:58:41.976562026 -0400 +@@ -97,6 +97,7 @@ static inline void mark_rodata_ro(void) + #ifdef CONFIG_TC + extern void tc_init(void); + #endif ++extern void grsecurity_init(void); + + enum system_states system_state __read_mostly; + EXPORT_SYMBOL(system_state); +@@ -183,6 +184,35 @@ static int __init set_reset_devices(char + + __setup("reset_devices", set_reset_devices); + ++#if defined(CONFIG_PAX_MEMORY_UDEREF) && defined(CONFIG_X86_32) ++static int __init setup_pax_nouderef(char *str) ++{ ++ unsigned int cpu; ++ ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ get_cpu_gdt_table(cpu)[GDT_ENTRY_KERNEL_DS].type = 3; ++ get_cpu_gdt_table(cpu)[GDT_ENTRY_KERNEL_DS].limit = 0xf; ++ } ++ asm("mov %0, %%ds" : : "r" (__KERNEL_DS) : "memory"); ++ asm("mov %0, %%es" : : "r" (__KERNEL_DS) : "memory"); ++ asm("mov %0, %%ss" : : "r" (__KERNEL_DS) : "memory"); ++ ++ return 0; ++} ++early_param("pax_nouderef", setup_pax_nouderef); ++#endif ++ ++#ifdef CONFIG_PAX_SOFTMODE ++unsigned int pax_softmode; ++ ++static int __init setup_pax_softmode(char *str) ++{ ++ get_option(&str, &pax_softmode); ++ return 1; ++} ++__setup("pax_softmode=", setup_pax_softmode); ++#endif ++ + static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; + char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; + static const char *panic_later, *panic_param; +@@ -697,52 +727,53 @@ int initcall_debug; + core_param(initcall_debug, initcall_debug, bool, 0644); + + static char msgbuf[64]; +-static struct boot_trace_call call; +-static struct boot_trace_ret ret; ++static struct boot_trace_call trace_call; ++static struct boot_trace_ret trace_ret; + + int do_one_initcall(initcall_t fn) + { + int count = preempt_count(); + ktime_t calltime, delta, 
rettime; ++ const char *msg1 = "", *msg2 = ""; + + if (initcall_debug) { +- call.caller = task_pid_nr(current); +- printk("calling %pF @ %i\n", fn, call.caller); ++ trace_call.caller = task_pid_nr(current); ++ printk("calling %pF @ %i\n", fn, trace_call.caller); + calltime = ktime_get(); +- trace_boot_call(&call, fn); ++ trace_boot_call(&trace_call, fn); + enable_boot_trace(); + } + +- ret.result = fn(); ++ trace_ret.result = fn(); + + if (initcall_debug) { + disable_boot_trace(); + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); +- ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10; +- trace_boot_ret(&ret, fn); ++ trace_ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10; ++ trace_boot_ret(&trace_ret, fn); + printk("initcall %pF returned %d after %Ld usecs\n", fn, +- ret.result, ret.duration); ++ trace_ret.result, trace_ret.duration); + } + + msgbuf[0] = 0; + +- if (ret.result && ret.result != -ENODEV && initcall_debug) +- sprintf(msgbuf, "error code %d ", ret.result); ++ if (trace_ret.result && trace_ret.result != -ENODEV && initcall_debug) ++ sprintf(msgbuf, "error code %d ", trace_ret.result); + + if (preempt_count() != count) { +- strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); ++ msg1 = " preemption imbalance"; + preempt_count() = count; + } + if (irqs_disabled()) { +- strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); ++ msg2 = " disabled interrupts"; + local_irq_enable(); + } +- if (msgbuf[0]) { +- printk("initcall %pF returned with %s\n", fn, msgbuf); ++ if (msgbuf[0] || *msg1 || *msg2) { ++ printk("initcall %pF returned with %s%s%s\n", fn, msgbuf, msg1, msg2); + } + +- return ret.result; ++ return trace_ret.result; + } + + +@@ -881,11 +912,13 @@ static int __init kernel_init(void * unu + if (!ramdisk_execute_command) + ramdisk_execute_command = "/init"; + +- if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { ++ if (sys_access((__force const char __user *) ramdisk_execute_command, 0) != 0) { + ramdisk_execute_command = NULL; + prepare_namespace(); + } + ++ grsecurity_init(); ++ + /* + * Ok, we have completed the initial bootup, and + * we're essentially up and running. 
Get rid of the +diff -urNp linux-2.6.33.1/init/noinitramfs.c linux-2.6.33.1/init/noinitramfs.c +--- linux-2.6.33.1/init/noinitramfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/init/noinitramfs.c 2010-03-20 16:58:41.976562026 -0400 +@@ -29,7 +29,7 @@ static int __init default_rootfs(void) + { + int err; + +- err = sys_mkdir("/dev", 0755); ++ err = sys_mkdir((const char __user *)"/dev", 0755); + if (err < 0) + goto out; + +@@ -39,7 +39,7 @@ static int __init default_rootfs(void) + if (err < 0) + goto out; + +- err = sys_mkdir("/root", 0700); ++ err = sys_mkdir((const char __user *)"/root", 0700); + if (err < 0) + goto out; + +diff -urNp linux-2.6.33.1/ipc/mqueue.c linux-2.6.33.1/ipc/mqueue.c +--- linux-2.6.33.1/ipc/mqueue.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/ipc/mqueue.c 2010-03-20 16:58:41.976562026 -0400 +@@ -149,6 +149,7 @@ static struct inode *mqueue_get_inode(st + mq_bytes = (mq_msg_tblsz + + (info->attr.mq_maxmsg * info->attr.mq_msgsize)); + ++ gr_learn_resource(current, RLIMIT_MSGQUEUE, u->mq_bytes + mq_bytes, 1); + spin_lock(&mq_lock); + if (u->mq_bytes + mq_bytes < u->mq_bytes || + u->mq_bytes + mq_bytes > +diff -urNp linux-2.6.33.1/ipc/shm.c linux-2.6.33.1/ipc/shm.c +--- linux-2.6.33.1/ipc/shm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/ipc/shm.c 2010-03-20 16:58:41.997340945 -0400 +@@ -69,6 +69,14 @@ static void shm_destroy (struct ipc_name + static int sysvipc_shm_proc_show(struct seq_file *s, void *it); + #endif + ++#ifdef CONFIG_GRKERNSEC ++extern int gr_handle_shmat(const pid_t shm_cprid, const pid_t shm_lapid, ++ const time_t shm_createtime, const uid_t cuid, ++ const int shmid); ++extern int gr_chroot_shmat(const pid_t shm_cprid, const pid_t shm_lapid, ++ const time_t shm_createtime); ++#endif ++ + void shm_init_ns(struct ipc_namespace *ns) + { + ns->shm_ctlmax = SHMMAX; +@@ -398,6 +406,14 @@ static int newseg(struct ipc_namespace * + shp->shm_lprid = 0; + shp->shm_atim = shp->shm_dtim = 0; + shp->shm_ctim = get_seconds(); ++#ifdef CONFIG_GRKERNSEC ++ { ++ struct timespec timeval; ++ do_posix_clock_monotonic_gettime(&timeval); ++ ++ shp->shm_createtime = timeval.tv_sec; ++ } ++#endif + shp->shm_segsz = size; + shp->shm_nattch = 0; + shp->shm_file = file; +@@ -881,9 +897,21 @@ long do_shmat(int shmid, char __user *sh + if (err) + goto out_unlock; + ++#ifdef CONFIG_GRKERNSEC ++ if (!gr_handle_shmat(shp->shm_cprid, shp->shm_lapid, shp->shm_createtime, ++ shp->shm_perm.cuid, shmid) || ++ !gr_chroot_shmat(shp->shm_cprid, shp->shm_lapid, shp->shm_createtime)) { ++ err = -EACCES; ++ goto out_unlock; ++ } ++#endif ++ + path = shp->shm_file->f_path; + path_get(&path); + shp->shm_nattch++; ++#ifdef CONFIG_GRKERNSEC ++ shp->shm_lapid = current->pid; ++#endif + size = i_size_read(path.dentry->d_inode); + shm_unlock(shp); + +diff -urNp linux-2.6.33.1/kernel/acct.c linux-2.6.33.1/kernel/acct.c +--- linux-2.6.33.1/kernel/acct.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/acct.c 2010-03-20 16:58:42.008931878 -0400 +@@ -579,7 +579,7 @@ static void do_acct_process(struct bsd_a + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; +- file->f_op->write(file, (char *)&ac, ++ file->f_op->write(file, (__force char __user *)&ac, + sizeof(acct_t), &file->f_pos); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; + set_fs(fs); +diff -urNp linux-2.6.33.1/kernel/capability.c linux-2.6.33.1/kernel/capability.c +--- linux-2.6.33.1/kernel/capability.c 2010-03-15 
12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/capability.c 2010-03-20 16:58:42.008931878 -0400 +@@ -206,6 +206,9 @@ SYSCALL_DEFINE2(capget, cap_user_header_ + * before modification is attempted and the application + * fails. + */ ++ if (tocopy > ARRAY_SIZE(kdata)) ++ return -EFAULT; ++ + if (copy_to_user(dataptr, kdata, tocopy + * sizeof(struct __user_cap_data_struct))) { + return -EFAULT; +@@ -307,10 +310,21 @@ int capable(int cap) + BUG(); + } + +- if (security_capable(cap) == 0) { ++ if (security_capable(cap) == 0 && gr_is_capable(cap)) { ++ current->flags |= PF_SUPERPRIV; ++ return 1; ++ } ++ return 0; ++} ++ ++int capable_nolog(int cap) ++{ ++ if (security_capable(cap) == 0 && gr_is_capable_nolog(cap)) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; + } ++ + EXPORT_SYMBOL(capable); ++EXPORT_SYMBOL(capable_nolog); +diff -urNp linux-2.6.33.1/kernel/configs.c linux-2.6.33.1/kernel/configs.c +--- linux-2.6.33.1/kernel/configs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/configs.c 2010-03-20 16:58:42.020953047 -0400 +@@ -73,8 +73,19 @@ static int __init ikconfig_init(void) + struct proc_dir_entry *entry; + + /* create the current config file */ ++#if defined(CONFIG_GRKERNSEC_PROC_ADD) || defined(CONFIG_GRKERNSEC_HIDESYM) ++#if defined(CONFIG_GRKERNSEC_PROC_USER) || defined(CONFIG_GRKERNSEC_HIDESYM) ++ entry = proc_create("config.gz", S_IFREG | S_IRUSR, NULL, ++ &ikconfig_file_ops); ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ entry = proc_create("config.gz", S_IFREG | S_IRUSR | S_IRGRP, NULL, ++ &ikconfig_file_ops); ++#endif ++#else + entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, + &ikconfig_file_ops); ++#endif ++ + if (!entry) + return -ENOMEM; + +diff -urNp linux-2.6.33.1/kernel/cpu.c linux-2.6.33.1/kernel/cpu.c +--- linux-2.6.33.1/kernel/cpu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/cpu.c 2010-03-20 16:58:42.028929112 -0400 +@@ -19,7 +19,7 @@ + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ + static DEFINE_MUTEX(cpu_add_remove_lock); + +-static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); ++static RAW_NOTIFIER_HEAD(cpu_chain); + + /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 
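
The configs.c hunk above selects the /proc/config.gz mode at build time:
owner-only under GRKERNSEC_PROC_USER or HIDESYM, owner plus group under
GRKERNSEC_PROC_USERGROUP, and the stock world-readable mode otherwise, the
same three-way split as proc_create_grsec() in the proc_fs.h hunk near the
top. The selection reduced to a compilable sketch (the CONFIG_ define is a
pretend .config choice):

#include <stdio.h>
#include <sys/stat.h>

#define CONFIG_GRKERNSEC_PROC_USERGROUP 1

#ifndef S_IRUGO
#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)   /* kernel spelling */
#endif

static mode_t grsec_proc_mode(mode_t requested)
{
#if defined(CONFIG_GRKERNSEC_PROC_USER)
        return S_IRUSR;                 /* root only */
#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP)
        return S_IRUSR | S_IRGRP;       /* root plus one trusted group */
#else
        return requested;               /* stock behaviour */
#endif
}

int main(void)
{
        printf("config.gz mode: %04o\n", (unsigned)grsec_proc_mode(S_IRUGO));
        return 0;
}
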
+ * Should always be manipulated under cpu_add_remove_lock +diff -urNp linux-2.6.33.1/kernel/cred.c linux-2.6.33.1/kernel/cred.c +--- linux-2.6.33.1/kernel/cred.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/cred.c 2010-03-20 16:58:42.028929112 -0400 +@@ -520,6 +520,8 @@ int commit_creds(struct cred *new) + + get_cred(new); /* we will require a ref for the subj creds too */ + ++ gr_set_role_label(task, new->uid, new->gid); ++ + /* dumpability changes */ + if (old->euid != new->euid || + old->egid != new->egid || +diff -urNp linux-2.6.33.1/kernel/exit.c linux-2.6.33.1/kernel/exit.c +--- linux-2.6.33.1/kernel/exit.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/exit.c 2010-03-20 16:58:42.032663961 -0400 +@@ -57,6 +57,10 @@ + #include <asm/mmu_context.h> + #include "cred-internals.h" + ++#ifdef CONFIG_GRKERNSEC ++extern rwlock_t grsec_exec_file_lock; ++#endif ++ + static void exit_mm(struct task_struct * tsk); + + static void __unhash_process(struct task_struct *p) +@@ -168,6 +172,8 @@ void release_task(struct task_struct * p + struct task_struct *leader; + int zap_leader; + repeat: ++ gr_del_task_from_ip_table(p); ++ + tracehook_prepare_release_task(p); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ +@@ -335,11 +341,22 @@ static void reparent_to_kthreadd(void) + { + write_lock_irq(&tasklist_lock); + ++#ifdef CONFIG_GRKERNSEC ++ write_lock(&grsec_exec_file_lock); ++ if (current->exec_file) { ++ fput(current->exec_file); ++ current->exec_file = NULL; ++ } ++ write_unlock(&grsec_exec_file_lock); ++#endif ++ + ptrace_unlink(current); + /* Reparent to init */ + current->real_parent = current->parent = kthreadd_task; + list_move_tail(¤t->sibling, ¤t->real_parent->children); + ++ gr_set_kernel_label(current); ++ + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + +@@ -391,7 +408,7 @@ int allow_signal(int sig) + * know it'll be handled, so that they don't get converted to + * SIGKILL or just silently dropped. + */ +- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; ++ current->sighand->action[(sig)-1].sa.sa_handler = (__force void __user *)2; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +@@ -427,6 +444,17 @@ void daemonize(const char *name, ...) + vsnprintf(current->comm, sizeof(current->comm), name, args); + va_end(args); + ++#ifdef CONFIG_GRKERNSEC ++ write_lock(&grsec_exec_file_lock); ++ if (current->exec_file) { ++ fput(current->exec_file); ++ current->exec_file = NULL; ++ } ++ write_unlock(&grsec_exec_file_lock); ++#endif ++ ++ gr_set_kernel_label(current); ++ + /* + * If we were started as result of loading a module, close all of the + * user space pages. 
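
Both reparent_to_kthreadd() and daemonize() above now release the
grsecurity-cached exec_file reference under grsec_exec_file_lock before the
task is relabeled as a kernel thread. The put-under-write-lock pattern,
sketched with POSIX rwlocks and fclose() standing in for fput():

#include <stdio.h>
#include <pthread.h>

static pthread_rwlock_t grsec_exec_file_lock = PTHREAD_RWLOCK_INITIALIZER;
static FILE *exec_file;                 /* stand-in for struct file * */

static void drop_exec_file(void)
{
        pthread_rwlock_wrlock(&grsec_exec_file_lock);
        if (exec_file) {
                fclose(exec_file);      /* fput() in the kernel hunk */
                exec_file = NULL;       /* readers now see no label source */
        }
        pthread_rwlock_unlock(&grsec_exec_file_lock);
}

int main(void)
{
        exec_file = fopen("/proc/self/exe", "rb");
        drop_exec_file();
        drop_exec_file();               /* idempotent thanks to the NULL check */
        return 0;
}
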
We don't need them, and if we didn't close them +@@ -961,6 +989,9 @@ NORET_TYPE void do_exit(long code) + tsk->exit_code = code; + taskstats_exit(tsk, group_dead); + ++ gr_acl_handle_psacct(tsk, code); ++ gr_acl_handle_exit(); ++ + exit_mm(tsk); + + if (group_dead) +@@ -1180,7 +1211,7 @@ static int wait_task_zombie(struct wait_ + + if (unlikely(wo->wo_flags & WNOWAIT)) { + int exit_code = p->exit_code; +- int why, status; ++ int why; + + get_task_struct(p); + read_unlock(&tasklist_lock); +diff -urNp linux-2.6.33.1/kernel/fork.c linux-2.6.33.1/kernel/fork.c +--- linux-2.6.33.1/kernel/fork.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/fork.c 2010-03-20 16:58:42.032663961 -0400 +@@ -255,7 +255,7 @@ static struct task_struct *dup_task_stru + *stackend = STACK_END_MAGIC; /* for overflow detection */ + + #ifdef CONFIG_CC_STACKPROTECTOR +- tsk->stack_canary = get_random_int(); ++ tsk->stack_canary = pax_get_random_long(); + #endif + + /* One for us, one for whoever does the "release_task()" (usually parent) */ +@@ -295,8 +295,8 @@ static int dup_mmap(struct mm_struct *mm + mm->locked_vm = 0; + mm->mmap = NULL; + mm->mmap_cache = NULL; +- mm->free_area_cache = oldmm->mmap_base; +- mm->cached_hole_size = ~0UL; ++ mm->free_area_cache = oldmm->free_area_cache; ++ mm->cached_hole_size = oldmm->cached_hole_size; + mm->map_count = 0; + cpumask_clear(mm_cpumask(mm)); + mm->mm_rb = RB_ROOT; +@@ -336,6 +336,7 @@ static int dup_mmap(struct mm_struct *mm + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_mm = mm; + tmp->vm_next = NULL; ++ tmp->vm_mirror = NULL; + anon_vma_link(tmp); + file = tmp->vm_file; + if (file) { +@@ -383,6 +384,31 @@ static int dup_mmap(struct mm_struct *mm + if (retval) + goto out; + } ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (oldmm->pax_flags & MF_PAX_SEGMEXEC) { ++ struct vm_area_struct *mpnt_m; ++ ++ for (mpnt = oldmm->mmap, mpnt_m = mm->mmap; mpnt; mpnt = mpnt->vm_next, mpnt_m = mpnt_m->vm_next) { ++ BUG_ON(!mpnt_m || mpnt_m->vm_mirror || mpnt->vm_mm != oldmm || mpnt_m->vm_mm != mm); ++ ++ if (!mpnt->vm_mirror) ++ continue; ++ ++ if (mpnt->vm_end <= SEGMEXEC_TASK_SIZE) { ++ BUG_ON(mpnt->vm_mirror->vm_mirror != mpnt); ++ mpnt->vm_mirror = mpnt_m; ++ } else { ++ BUG_ON(mpnt->vm_mirror->vm_mirror == mpnt || mpnt->vm_mirror->vm_mirror->vm_mm != mm); ++ mpnt_m->vm_mirror = mpnt->vm_mirror->vm_mirror; ++ mpnt_m->vm_mirror->vm_mirror = mpnt_m; ++ mpnt->vm_mirror->vm_mirror = mpnt; ++ } ++ } ++ BUG_ON(mpnt_m); ++ } ++#endif ++ + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +@@ -733,7 +759,7 @@ static int copy_fs(unsigned long clone_f + write_unlock(&fs->lock); + return -EAGAIN; + } +- fs->users++; ++ atomic_inc(&fs->users); + write_unlock(&fs->lock); + return 0; + } +@@ -1032,10 +1058,13 @@ static struct task_struct *copy_process( + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); + #endif + retval = -EAGAIN; ++ ++ gr_learn_resource(p, RLIMIT_NPROC, atomic_read(&p->real_cred->user->processes), 0); ++ + if (atomic_read(&p->real_cred->user->processes) >= + p->signal->rlim[RLIMIT_NPROC].rlim_cur) { +- if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && +- p->real_cred->user != INIT_USER) ++ if (p->real_cred->user != INIT_USER && ++ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) + goto bad_fork_free; + } + +@@ -1064,6 +1093,10 @@ static struct task_struct *copy_process( + p->vfork_done = NULL; + spin_lock_init(&p->alloc_lock); + ++#ifdef CONFIG_GRKERNSEC ++ rwlock_init(&p->gr_fs_lock); ++#endif ++ + init_sigpending(&p->pending); + + p->utime 
= cputime_zero; +@@ -1190,6 +1223,8 @@ static struct task_struct *copy_process( + goto bad_fork_free_pid; + } + ++ gr_copy_label(p); ++ + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? +@@ -1342,6 +1377,8 @@ bad_fork_cleanup_count: + bad_fork_free: + free_task(p); + fork_out: ++ gr_log_forkfail(retval); ++ + return ERR_PTR(retval); + } + +@@ -1435,6 +1472,8 @@ long do_fork(unsigned long clone_flags, + if (clone_flags & CLONE_PARENT_SETTID) + put_user(nr, parent_tidptr); + ++ gr_handle_brute_check(); ++ + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); +@@ -1567,7 +1606,7 @@ static int unshare_fs(unsigned long unsh + return 0; + + /* don't need lock here; in the worst case we'll do useless copy */ +- if (fs->users == 1) ++ if (atomic_read(&fs->users) == 1) + return 0; + + *new_fsp = copy_fs_struct(fs); +@@ -1687,14 +1726,18 @@ SYSCALL_DEFINE1(unshare, unsigned long, + task_lock(current); + + if (new_fs) { ++ unsigned long flags; ++ ++ gr_fs_write_lock_irqsave(current, flags); + fs = current->fs; + write_lock(&fs->lock); + current->fs = new_fs; +- if (--fs->users) ++ if (atomic_dec_return(&fs->users)) + new_fs = NULL; + else + new_fs = fs; + write_unlock(&fs->lock); ++ gr_fs_write_unlock_irqrestore(current, flags); + } + + if (new_mm) { +diff -urNp linux-2.6.33.1/kernel/futex.c linux-2.6.33.1/kernel/futex.c +--- linux-2.6.33.1/kernel/futex.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/futex.c 2010-03-20 16:58:42.048537218 -0400 +@@ -54,6 +54,7 @@ + #include <linux/mount.h> + #include <linux/pagemap.h> + #include <linux/syscalls.h> ++#include <linux/ptrace.h> + #include <linux/signal.h> + #include <linux/module.h> + #include <linux/magic.h> +@@ -221,6 +222,11 @@ get_futex_key(u32 __user *uaddr, int fsh + struct page *page; + int err; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && address >= SEGMEXEC_TASK_SIZE) ++ return -EFAULT; ++#endif ++ + /* + * The futex address must be "naturally" aligned. 
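The copy_fs()/unshare hunks above turn fs_struct's users counter from a plain int guarded by fs->lock into an atomic_t, so references can be taken and dropped without holding the write lock. A minimal userspace sketch of the same pattern using C11 atomics (the fs_demo type and helper names are illustrative, not from the patch):

    #include <stdatomic.h>
    #include <stdio.h>

    struct fs_demo {
        atomic_int users;   /* was: plain int guarded by a rwlock */
    };

    static void fs_get(struct fs_demo *fs)
    {
        atomic_fetch_add(&fs->users, 1);    /* atomic_inc(&fs->users) */
    }

    /* Mirrors "if (atomic_dec_return(&fs->users))" in the unshare hunk:
     * returns the count remaining after the drop. */
    static int fs_put(struct fs_demo *fs)
    {
        return atomic_fetch_sub(&fs->users, 1) - 1;
    }

    int main(void)
    {
        struct fs_demo fs;
        atomic_init(&fs.users, 1);
        fs_get(&fs);
        printf("after put: %d\n", fs_put(&fs));  /* 1: still shared */
        printf("after put: %d\n", fs_put(&fs));  /* 0: last reference gone */
        return 0;
    }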
+ */ +@@ -1852,7 +1858,7 @@ retry: + + restart = &current_thread_info()->restart_block; + restart->fn = futex_wait_restart; +- restart->futex.uaddr = (u32 *)uaddr; ++ restart->futex.uaddr = uaddr; + restart->futex.val = val; + restart->futex.time = abs_time->tv64; + restart->futex.bitset = bitset; +@@ -2385,7 +2391,10 @@ SYSCALL_DEFINE3(get_robust_list, int, pi + { + struct robust_list_head __user *head; + unsigned long ret; +- const struct cred *cred = current_cred(), *pcred; ++#ifndef CONFIG_GRKERNSEC_PROC_MEMMAP ++ const struct cred *cred = current_cred(); ++ const struct cred *pcred; ++#endif + + if (!futex_cmpxchg_enabled) + return -ENOSYS; +@@ -2401,11 +2410,16 @@ SYSCALL_DEFINE3(get_robust_list, int, pi + if (!p) + goto err_unlock; + ret = -EPERM; ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ if (!ptrace_may_access(p, PTRACE_MODE_READ)) ++ goto err_unlock; ++#else + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && + !capable(CAP_SYS_PTRACE)) + goto err_unlock; ++#endif + head = p->robust_list; + rcu_read_unlock(); + } +@@ -2467,7 +2481,7 @@ retry: + */ + static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user * __user *head, +- int *pi) ++ unsigned int *pi) + { + unsigned long uentry; + +diff -urNp linux-2.6.33.1/kernel/futex_compat.c linux-2.6.33.1/kernel/futex_compat.c +--- linux-2.6.33.1/kernel/futex_compat.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/futex_compat.c 2010-03-20 16:58:42.048537218 -0400 +@@ -10,6 +10,7 @@ + #include <linux/compat.h> + #include <linux/nsproxy.h> + #include <linux/futex.h> ++#include <linux/ptrace.h> + + #include <asm/uaccess.h> + +@@ -135,7 +136,10 @@ compat_sys_get_robust_list(int pid, comp + { + struct compat_robust_list_head __user *head; + unsigned long ret; +- const struct cred *cred = current_cred(), *pcred; ++ const struct cred *cred = current_cred(); ++#ifndef CONFIG_GRKERNSEC_PROC_MEMMAP ++ const struct cred *pcred; ++#endif + + if (!futex_cmpxchg_enabled) + return -ENOSYS; +@@ -151,11 +155,16 @@ compat_sys_get_robust_list(int pid, comp + if (!p) + goto err_unlock; + ret = -EPERM; ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ if (!ptrace_may_access(p, PTRACE_MODE_READ)) ++ goto err_unlock; ++#else + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && + !capable(CAP_SYS_PTRACE)) + goto err_unlock; ++#endif + head = p->compat_robust_list; + read_unlock(&tasklist_lock); + } +diff -urNp linux-2.6.33.1/kernel/gcov/base.c linux-2.6.33.1/kernel/gcov/base.c +--- linux-2.6.33.1/kernel/gcov/base.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/gcov/base.c 2010-03-20 16:58:42.048537218 -0400 +@@ -102,11 +102,6 @@ void gcov_enable_events(void) + } + + #ifdef CONFIG_MODULES +-static inline int within(void *addr, void *start, unsigned long size) +-{ +- return ((addr >= start) && (addr < start + size)); +-} +- + /* Update list and generate events when modules are unloaded. */ + static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, + void *data) +@@ -121,7 +116,7 @@ static int gcov_module_notifier(struct n + prev = NULL; + /* Remove entries located in module from linked list. 
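The gcov hunk above drops the file-local within() helper in favour of the shared within_module_core_rw() check that the module.c changes further below introduce. For reference, a self-contained sketch of the open-coded range test being replaced:

    #include <stdio.h>

    /* Same shape as the removed helper: is addr inside [start, start+size)? */
    static int within(const void *addr, const void *start, unsigned long size)
    {
        const char *a = addr, *s = start;
        return a >= s && a < s + size;
    }

    int main(void)
    {
        char region[64];
        printf("%d\n", within(&region[10], region, sizeof(region))); /* 1: inside */
        printf("%d\n", within(region + 64, region, sizeof(region))); /* 0: one past end */
        return 0;
    }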
*/ + for (info = gcov_info_head; info; info = info->next) { +- if (within(info, mod->module_core, mod->core_size)) { ++ if (within_module_core_rw((unsigned long)info, mod)) { + if (prev) + prev->next = info->next; + else +diff -urNp linux-2.6.33.1/kernel/hrtimer.c linux-2.6.33.1/kernel/hrtimer.c +--- linux-2.6.33.1/kernel/hrtimer.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/hrtimer.c 2010-03-20 16:58:42.048537218 -0400 +@@ -1398,7 +1398,7 @@ void hrtimer_peek_ahead_timers(void) + local_irq_restore(flags); + } + +-static void run_hrtimer_softirq(struct softirq_action *h) ++static void run_hrtimer_softirq(void) + { + hrtimer_peek_ahead_timers(); + } +diff -urNp linux-2.6.33.1/kernel/kallsyms.c linux-2.6.33.1/kernel/kallsyms.c +--- linux-2.6.33.1/kernel/kallsyms.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/kallsyms.c 2010-03-20 16:58:42.064920323 -0400 +@@ -11,6 +11,9 @@ + * Changed the compression method from stem compression to "table lookup" + * compression (see scripts/kallsyms.c for a more complete description) + */ ++#ifdef CONFIG_GRKERNSEC_HIDESYM ++#define __INCLUDED_BY_HIDESYM 1 ++#endif + #include <linux/kallsyms.h> + #include <linux/module.h> + #include <linux/init.h> +@@ -51,6 +54,9 @@ extern const unsigned long kallsyms_mark + + static inline int is_kernel_inittext(unsigned long addr) + { ++ if (system_state != SYSTEM_BOOTING) ++ return 0; ++ + if (addr >= (unsigned long)_sinittext + && addr <= (unsigned long)_einittext) + return 1; +@@ -67,6 +73,24 @@ static inline int is_kernel_text(unsigne + + static inline int is_kernel(unsigned long addr) + { ++ if (is_kernel_inittext(addr)) ++ return 1; ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if ((unsigned long)MODULES_EXEC_VADDR <= ktla_ktva(addr) && ktla_ktva(addr) <= (unsigned long)MODULES_EXEC_END) ++ return 0; ++ ++ if (is_kernel_text(addr)) ++ return 1; ++ ++ if (ktla_ktva((unsigned long)_stext) <= addr && addr < ktla_ktva((unsigned long)_etext)) ++ return 1; ++ ++ if ((addr >= (unsigned long)_sdata && addr <= (unsigned long)_end)) ++ return 1; ++ return in_gate_area_no_task(addr); ++#endif ++ + if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) + return 1; + return in_gate_area_no_task(addr); +@@ -414,7 +438,6 @@ static unsigned long get_ksymbol_core(st + + static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) + { +- iter->name[0] = '\0'; + iter->nameoff = get_symbol_offset(new_pos); + iter->pos = new_pos; + } +@@ -462,6 +485,11 @@ static int s_show(struct seq_file *m, vo + { + struct kallsym_iter *iter = m->private; + ++#ifdef CONFIG_GRKERNSEC_HIDESYM ++ if (current_uid()) ++ return 0; ++#endif ++ + /* Some debugging symbols have no name. Ignore them. 
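The hrtimer hunk above is one instance of a signature change applied throughout this patch (see also rcutree.c, sched.c, softirq.c and timer.c below): softirq handlers lose their unused struct softirq_action * argument and become void (*)(void). A small runnable sketch of the narrowed dispatch table (the demo names are illustrative):

    #include <stdio.h>

    struct softirq_action_demo {
        void (*action)(void);  /* was: void (*action)(struct softirq_action *) */
    };

    static void run_timers(void)   { puts("timers"); }
    static void run_tasklets(void) { puts("tasklets"); }

    static const struct softirq_action_demo vec[] = {
        { run_timers }, { run_tasklets },
    };

    int main(void)
    {
        for (unsigned i = 0; i < sizeof(vec) / sizeof(vec[0]); i++)
            vec[i].action();   /* was: h->action(h) */
        return 0;
    }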
*/ + if (!iter->name[0]) + return 0; +@@ -502,7 +530,7 @@ static int kallsyms_open(struct inode *i + struct kallsym_iter *iter; + int ret; + +- iter = kmalloc(sizeof(*iter), GFP_KERNEL); ++ iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + reset_iter(iter, 0); +diff -urNp linux-2.6.33.1/kernel/kgdb.c linux-2.6.33.1/kernel/kgdb.c +--- linux-2.6.33.1/kernel/kgdb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/kgdb.c 2010-03-20 16:58:42.068659558 -0400 +@@ -86,7 +86,7 @@ static int kgdb_io_module_registered; + /* Guard for recursive entry */ + static int exception_level; + +-static struct kgdb_io *kgdb_io_ops; ++static const struct kgdb_io *kgdb_io_ops; + static DEFINE_SPINLOCK(kgdb_registration_lock); + + /* kgdb console driver is loaded */ +@@ -1664,7 +1664,7 @@ static void kgdb_initial_breakpoint(void + * + * Register it with the KGDB core. + */ +-int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) ++int kgdb_register_io_module(const struct kgdb_io *new_kgdb_io_ops) + { + int err; + +@@ -1709,7 +1709,7 @@ EXPORT_SYMBOL_GPL(kgdb_register_io_modul + * + * Unregister it with the KGDB core. + */ +-void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) ++void kgdb_unregister_io_module(const struct kgdb_io *old_kgdb_io_ops) + { + BUG_ON(kgdb_connected); + +diff -urNp linux-2.6.33.1/kernel/kmod.c linux-2.6.33.1/kernel/kmod.c +--- linux-2.6.33.1/kernel/kmod.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/kmod.c 2010-03-20 16:58:42.068659558 -0400 +@@ -90,6 +90,18 @@ int __request_module(bool wait, const ch + if (ret) + return ret; + ++#ifdef CONFIG_GRKERNSEC_MODHARDEN ++ /* we could do a tighter check here, but some distros ++ are taking it upon themselves to remove CAP_SYS_MODULE ++ from even root-running apps which cause modules to be ++ auto-loaded ++ */ ++ if (current_uid()) { ++ gr_log_nonroot_mod_load(module_name); ++ return -EPERM; ++ } ++#endif ++ + /* If modprobe needs a service that is in a module, we get a recursive + * loop. Limit the number of running kmod threads to max_threads/2 or + * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method +diff -urNp linux-2.6.33.1/kernel/kprobes.c linux-2.6.33.1/kernel/kprobes.c +--- linux-2.6.33.1/kernel/kprobes.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/kprobes.c 2010-03-20 16:58:42.068659558 -0400 +@@ -186,7 +186,7 @@ static kprobe_opcode_t __kprobes *__get_ + * kernel image and loaded module images reside. This is required + * so x86_64 can correctly handle the %rip-relative fixups. + */ +- kip->insns = module_alloc(PAGE_SIZE); ++ kip->insns = module_alloc_exec(PAGE_SIZE); + if (!kip->insns) { + kfree(kip); + return NULL; +@@ -223,7 +223,7 @@ static int __kprobes collect_one_slot(st + */ + if (!list_is_singular(&kprobe_insn_pages)) { + list_del(&kip->list); +- module_free(NULL, kip->insns); ++ module_free_exec(NULL, kip->insns); + kfree(kip); + } + return 1; +diff -urNp linux-2.6.33.1/kernel/lockdep.c linux-2.6.33.1/kernel/lockdep.c +--- linux-2.6.33.1/kernel/lockdep.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/lockdep.c 2010-03-20 16:58:42.072946288 -0400 +@@ -586,6 +586,10 @@ static int static_obj(void *obj) + int i; + #endif + ++#ifdef CONFIG_PAX_KERNEXEC ++ start = ktla_ktva(start); ++#endif ++ + /* + * static variable? 
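The kmod.c hunk above (CONFIG_GRKERNSEC_MODHARDEN) refuses automatic module loading when the requesting task is not root, since, as its comment notes, some distributions strip CAP_SYS_MODULE even from root-run programs. A userspace stand-in for the gate, with getuid() playing the role of current_uid() (gr_log_nonroot_mod_load is kernel-side only):

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    static int request_module_demo(const char *name)
    {
        if (getuid() != 0) {                    /* current_uid() in the hunk */
            fprintf(stderr, "denied autoload of %s for uid %d\n",
                    name, (int)getuid());
            return -EPERM;
        }
        printf("would modprobe %s\n", name);
        return 0;
    }

    int main(void)
    {
        return request_module_demo("dummy") ? 1 : 0;
    }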
+ */ +@@ -601,8 +605,7 @@ static int static_obj(void *obj) + */ + for_each_possible_cpu(i) { + start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); +- end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM +- + per_cpu_offset(i); ++ end = start + PERCPU_ENOUGH_ROOM; + + if ((addr >= start) && (addr < end)) + return 1; +@@ -719,6 +722,7 @@ register_lock_class(struct lockdep_map * + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); ++ printk("lock:%pS key:%pS.\n", lock, lock->key); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); +diff -urNp linux-2.6.33.1/kernel/module.c linux-2.6.33.1/kernel/module.c +--- linux-2.6.33.1/kernel/module.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/module.c 2010-03-20 16:58:42.072946288 -0400 +@@ -89,7 +89,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq + static BLOCKING_NOTIFIER_HEAD(module_notify_list); + + /* Bounds of module allocation, for speeding __module_address */ +-static unsigned long module_addr_min = -1UL, module_addr_max = 0; ++static unsigned long module_addr_min_rw = -1UL, module_addr_max_rw = 0; ++static unsigned long module_addr_min_rx = -1UL, module_addr_max_rx = 0; + + int register_module_notifier(struct notifier_block * nb) + { +@@ -245,7 +246,7 @@ bool each_symbol(bool (*fn)(const struct + return true; + + list_for_each_entry_rcu(mod, &modules, list) { +- struct symsearch arr[] = { ++ struct symsearch modarr[] = { + { mod->syms, mod->syms + mod->num_syms, mod->crcs, + NOT_GPL_ONLY, false }, + { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, +@@ -267,7 +268,7 @@ bool each_symbol(bool (*fn)(const struct + #endif + }; + +- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) ++ if (each_symbol_in_section(modarr, ARRAY_SIZE(modarr), mod, fn, data)) + return true; + } + return false; +@@ -375,7 +376,7 @@ static void *percpu_modalloc(unsigned lo + { + void *ptr; + +- if (align > PAGE_SIZE) { ++ if (align-1 >= PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; +@@ -1393,7 +1394,8 @@ static void free_module(struct module *m + destroy_params(mod->kp, mod->num_kp); + + /* This may be NULL, but that's OK */ +- module_free(mod, mod->module_init); ++ module_free(mod, mod->module_init_rw); ++ module_free_exec(mod, mod->module_init_rx); + kfree(mod->args); + if (mod->percpu) + percpu_modfree(mod->percpu); +@@ -1402,10 +1404,12 @@ static void free_module(struct module *m + percpu_modfree(mod->refptr); + #endif + /* Free lock-classes: */ +- lockdep_free_key_range(mod->module_core, mod->core_size); ++ lockdep_free_key_range(mod->module_core_rx, mod->core_size_rx); ++ lockdep_free_key_range(mod->module_core_rw, mod->core_size_rw); + + /* Finally, free the core (containing the module structure) */ +- module_free(mod, mod->module_core); ++ module_free_exec(mod, mod->module_core_rx); ++ module_free(mod, mod->module_core_rw); + + #ifdef CONFIG_MPU + update_protections(current->mm); +@@ -1499,7 +1503,9 @@ static int simplify_symbols(Elf_Shdr *se + strtab + sym[i].st_name, mod); + /* Ok if resolved. 
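Note the percpu_modalloc change above: (align > PAGE_SIZE) becomes (align-1 >= PAGE_SIZE). Because align is unsigned, align == 0 wraps to ULONG_MAX and is now rejected along with oversized values, while align == PAGE_SIZE stays legal. A check of the boundary cases, with PAGE_SIZE assumed to be 4096:

    #include <stdio.h>

    #define PAGE_SIZE_DEMO 4096UL

    static int align_ok(unsigned long align)
    {
        return !(align - 1 >= PAGE_SIZE_DEMO);  /* the new test */
    }

    int main(void)
    {
        printf("align 0:    %d\n", align_ok(0));                  /* 0: 0-1 wraps, rejected */
        printf("align 4096: %d\n", align_ok(PAGE_SIZE_DEMO));     /* 1: still allowed */
        printf("align 8192: %d\n", align_ok(2 * PAGE_SIZE_DEMO)); /* 0: rejected as before */
        return 0;
    }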
*/ + if (ksym) { ++ pax_open_kernel(); + sym[i].st_value = ksym->value; ++ pax_close_kernel(); + break; + } + +@@ -1518,7 +1524,9 @@ static int simplify_symbols(Elf_Shdr *se + secbase = (unsigned long)mod->percpu; + else + secbase = sechdrs[sym[i].st_shndx].sh_addr; ++ pax_open_kernel(); + sym[i].st_value += secbase; ++ pax_close_kernel(); + break; + } + } +@@ -1579,11 +1587,12 @@ static void layout_sections(struct modul + || s->sh_entsize != ~0UL + || strstarts(secstrings + s->sh_name, ".init")) + continue; +- s->sh_entsize = get_offset(mod, &mod->core_size, s, i); ++ if ((s->sh_flags & SHF_WRITE) || !(s->sh_flags & SHF_ALLOC)) ++ s->sh_entsize = get_offset(mod, &mod->core_size_rw, s, i); ++ else ++ s->sh_entsize = get_offset(mod, &mod->core_size_rx, s, i); + DEBUGP("\t%s\n", secstrings + s->sh_name); + } +- if (m == 0) +- mod->core_text_size = mod->core_size; + } + + DEBUGP("Init section allocation order:\n"); +@@ -1596,12 +1605,13 @@ static void layout_sections(struct modul + || s->sh_entsize != ~0UL + || !strstarts(secstrings + s->sh_name, ".init")) + continue; +- s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) +- | INIT_OFFSET_MASK); ++ if ((s->sh_flags & SHF_WRITE) || !(s->sh_flags & SHF_ALLOC)) ++ s->sh_entsize = get_offset(mod, &mod->init_size_rw, s, i); ++ else ++ s->sh_entsize = get_offset(mod, &mod->init_size_rx, s, i); ++ s->sh_entsize |= INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + s->sh_name); + } +- if (m == 0) +- mod->init_text_size = mod->init_size; + } + } + +@@ -1705,9 +1715,8 @@ static int is_exported(const char *name, + + /* As per nm */ + static char elf_type(const Elf_Sym *sym, +- Elf_Shdr *sechdrs, +- const char *secstrings, +- struct module *mod) ++ const Elf_Shdr *sechdrs, ++ const char *secstrings) + { + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) +@@ -1782,7 +1791,7 @@ static unsigned long layout_symtab(struc + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; +- symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, ++ symsect->sh_entsize = get_offset(mod, &mod->init_size_rx, symsect, + symindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + symsect->sh_name); + +@@ -1799,19 +1808,19 @@ static unsigned long layout_symtab(struc + } + + /* Append room for core symbols at end of core part. */ +- symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); +- mod->core_size = symoffs + ndst * sizeof(Elf_Sym); ++ symoffs = ALIGN(mod->core_size_rx, symsect->sh_addralign ?: 1); ++ mod->core_size_rx = symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; +- strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, ++ strsect->sh_entsize = get_offset(mod, &mod->init_size_rx, strsect, + strindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ +- *pstroffs = mod->core_size; ++ *pstroffs = mod->core_size_rx; + __set_bit(0, strmap); +- mod->core_size += bitmap_weight(strmap, strsect->sh_size); ++ mod->core_size_rx += bitmap_weight(strmap, strsect->sh_size); + + return symoffs; + } +@@ -1835,12 +1844,14 @@ static void add_kallsyms(struct module * + mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + mod->strtab = (void *)sechdrs[strindex].sh_addr; + ++ pax_open_kernel(); ++ + /* Set types up while we still have access to sections. 
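The layout_sections hunks above split each module image in two: writable or non-allocated sections land in the RW image, everything else in the RX image. A sketch of the routing predicate, using the real ELF flag values (SHF_WRITE is 0x1, SHF_ALLOC is 0x2); the _DEMO names are stand-ins:

    #include <stdio.h>

    #define SHF_WRITE_DEMO 0x1UL
    #define SHF_ALLOC_DEMO 0x2UL

    static const char *image_for(unsigned long sh_flags)
    {
        if ((sh_flags & SHF_WRITE_DEMO) || !(sh_flags & SHF_ALLOC_DEMO))
            return "core_rw";   /* data, bss, non-alloc sections */
        return "core_rx";       /* text, rodata */
    }

    int main(void)
    {
        printf(".text    -> %s\n", image_for(SHF_ALLOC_DEMO));
        printf(".data    -> %s\n", image_for(SHF_ALLOC_DEMO | SHF_WRITE_DEMO));
        printf(".comment -> %s\n", image_for(0));
        return 0;
    }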
*/ + for (i = 0; i < mod->num_symtab; i++) + mod->symtab[i].st_info +- = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); ++ = elf_type(&mod->symtab[i], sechdrs, secstrings); + +- mod->core_symtab = dst = mod->module_core + symoffs; ++ mod->core_symtab = dst = mod->module_core_rx + symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { +@@ -1852,10 +1863,12 @@ static void add_kallsyms(struct module * + } + mod->core_num_syms = ndst; + +- mod->core_strtab = s = mod->module_core + stroffs; ++ mod->core_strtab = s = mod->module_core_rx + stroffs; + for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) + if (test_bit(i, strmap)) + *++s = mod->strtab[i]; ++ ++ pax_close_kernel(); + } + #else + static inline unsigned long layout_symtab(struct module *mod, +@@ -1892,16 +1905,30 @@ static void dynamic_debug_setup(struct _ + #endif + } + +-static void *module_alloc_update_bounds(unsigned long size) ++static void *module_alloc_update_bounds_rw(unsigned long size) + { + void *ret = module_alloc(size); + + if (ret) { + /* Update module bounds. */ +- if ((unsigned long)ret < module_addr_min) +- module_addr_min = (unsigned long)ret; +- if ((unsigned long)ret + size > module_addr_max) +- module_addr_max = (unsigned long)ret + size; ++ if ((unsigned long)ret < module_addr_min_rw) ++ module_addr_min_rw = (unsigned long)ret; ++ if ((unsigned long)ret + size > module_addr_max_rw) ++ module_addr_max_rw = (unsigned long)ret + size; ++ } ++ return ret; ++} ++ ++static void *module_alloc_update_bounds_rx(unsigned long size) ++{ ++ void *ret = module_alloc_exec(size); ++ ++ if (ret) { ++ /* Update module bounds. */ ++ if ((unsigned long)ret < module_addr_min_rx) ++ module_addr_min_rx = (unsigned long)ret; ++ if ((unsigned long)ret + size > module_addr_max_rx) ++ module_addr_max_rx = (unsigned long)ret + size; + } + return ret; + } +@@ -2108,7 +2135,7 @@ static noinline struct module *load_modu + secstrings, &stroffs, strmap); + + /* Do the allocs. */ +- ptr = module_alloc_update_bounds(mod->core_size); ++ ptr = module_alloc_update_bounds_rw(mod->core_size_rw); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a +@@ -2119,23 +2146,47 @@ static noinline struct module *load_modu + err = -ENOMEM; + goto free_percpu; + } +- memset(ptr, 0, mod->core_size); +- mod->module_core = ptr; ++ memset(ptr, 0, mod->core_size_rw); ++ mod->module_core_rw = ptr; + +- ptr = module_alloc_update_bounds(mod->init_size); ++ ptr = module_alloc_update_bounds_rw(mod->init_size_rw); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. 
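module_alloc_update_bounds() above is duplicated into _rw and _rx variants so that module_addr_min/max are tracked per region. The bookkeeping is plain min/max maintenance over allocation results; a self-contained sketch with malloc standing in for module_alloc (names are illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static uintptr_t addr_min = UINTPTR_MAX, addr_max = 0;

    static void *alloc_update_bounds(size_t size)
    {
        void *p = malloc(size);             /* stands in for module_alloc() */
        if (p) {
            uintptr_t a = (uintptr_t)p;
            if (a < addr_min)
                addr_min = a;
            if (a + size > addr_max)
                addr_max = a + size;
        }
        return p;
    }

    int main(void)
    {
        void *a = alloc_update_bounds(128);
        void *b = alloc_update_bounds(256);
        printf("range: [%#lx, %#lx)\n",
               (unsigned long)addr_min, (unsigned long)addr_max);
        free(a); free(b);
        return 0;
    }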
+ */ +- kmemleak_ignore(ptr); +- if (!ptr && mod->init_size) { ++ kmemleak_not_leak(ptr); ++ if (!ptr && mod->init_size_rw) { ++ err = -ENOMEM; ++ goto free_core_rw; ++ } ++ memset(ptr, 0, mod->init_size_rw); ++ mod->module_init_rw = ptr; ++ ++ ptr = module_alloc_update_bounds_rx(mod->core_size_rx); ++ kmemleak_not_leak(ptr); ++ if (!ptr) { + err = -ENOMEM; +- goto free_core; ++ goto free_init_rw; + } +- memset(ptr, 0, mod->init_size); +- mod->module_init = ptr; ++ ++ pax_open_kernel(); ++ memset(ptr, 0, mod->core_size_rx); ++ pax_close_kernel(); ++ mod->module_core_rx = ptr; ++ ++ ptr = module_alloc_update_bounds_rx(mod->init_size_rx); ++ kmemleak_not_leak(ptr); ++ if (!ptr && mod->init_size_rx) { ++ err = -ENOMEM; ++ goto free_core_rx; ++ } ++ ++ pax_open_kernel(); ++ memset(ptr, 0, mod->init_size_rx); ++ pax_close_kernel(); ++ mod->module_init_rx = ptr; + + /* Transfer each section which specifies SHF_ALLOC */ + DEBUGP("final section addresses:\n"); +@@ -2145,17 +2196,41 @@ static noinline struct module *load_modu + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + +- if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) +- dest = mod->module_init +- + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); +- else +- dest = mod->module_core + sechdrs[i].sh_entsize; ++ if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) { ++ if ((sechdrs[i].sh_flags & SHF_WRITE) || !(sechdrs[i].sh_flags & SHF_ALLOC)) ++ dest = mod->module_init_rw ++ + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); ++ else ++ dest = mod->module_init_rx ++ + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); ++ } else { ++ if ((sechdrs[i].sh_flags & SHF_WRITE) || !(sechdrs[i].sh_flags & SHF_ALLOC)) ++ dest = mod->module_core_rw + sechdrs[i].sh_entsize; ++ else ++ dest = mod->module_core_rx + sechdrs[i].sh_entsize; ++ } ++ ++ if (sechdrs[i].sh_type != SHT_NOBITS) { ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ if (!(sechdrs[i].sh_flags & SHF_WRITE) && (sechdrs[i].sh_flags & SHF_ALLOC)) { ++ pax_open_kernel(); ++ memcpy(dest, (void *)sechdrs[i].sh_addr, sechdrs[i].sh_size); ++ pax_close_kernel(); ++ } else ++#endif + +- if (sechdrs[i].sh_type != SHT_NOBITS) +- memcpy(dest, (void *)sechdrs[i].sh_addr, +- sechdrs[i].sh_size); ++ memcpy(dest, (void *)sechdrs[i].sh_addr, sechdrs[i].sh_size); ++ } + /* Update sh_addr to point to copy in image. */ +- sechdrs[i].sh_addr = (unsigned long)dest; ++ ++#ifdef CONFIG_PAX_KERNEXEC ++ if (sechdrs[i].sh_flags & SHF_EXECINSTR) ++ sechdrs[i].sh_addr = ktva_ktla((unsigned long)dest); ++ else ++#endif ++ ++ sechdrs[i].sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); + } + /* Module has been moved. */ +@@ -2167,7 +2242,7 @@ static noinline struct module *load_modu + mod->name); + if (!mod->refptr) { + err = -ENOMEM; +- goto free_init; ++ goto free_init_rx; + } + #endif + /* Now we've moved module, initialize linked lists, etc. */ +@@ -2282,8 +2357,8 @@ static noinline struct module *load_modu + + /* Now do relocations. */ + for (i = 1; i < hdr->e_shnum; i++) { +- const char *strtab = (char *)sechdrs[strindex].sh_addr; + unsigned int info = sechdrs[i].sh_info; ++ strtab = (char *)sechdrs[strindex].sh_addr; + + /* Not a valid relocation section? */ + if (info >= hdr->e_shnum) +@@ -2344,12 +2419,12 @@ static noinline struct module *load_modu + * Do it before processing of module parameters, so the module + * can provide parameter accessor functions of its own. 
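The section-copy hunks above wrap every write into the RX image in pax_open_kernel()/pax_close_kernel(), a temporary write-enable window. A rough userspace analogy on Linux, with mprotect() flipping a read-only page writable just long enough for the copy (an analogy only: the kernel primitive toggles CR0/PTE write protection, not VMAs):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long pg = sysconf(_SC_PAGESIZE);
        char *dest = mmap(NULL, pg, PROT_READ,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (dest == MAP_FAILED)
            return 1;

        mprotect(dest, pg, PROT_READ | PROT_WRITE); /* "pax_open_kernel()" */
        memcpy(dest, "\xc3", 1);                    /* copy section payload */
        mprotect(dest, pg, PROT_READ);              /* "pax_close_kernel()" */

        printf("byte: %#x\n", dest[0] & 0xff);
        munmap(dest, pg);
        return 0;
    }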
+ */ +- if (mod->module_init) +- flush_icache_range((unsigned long)mod->module_init, +- (unsigned long)mod->module_init +- + mod->init_size); +- flush_icache_range((unsigned long)mod->module_core, +- (unsigned long)mod->module_core + mod->core_size); ++ if (mod->module_init_rx) ++ flush_icache_range((unsigned long)mod->module_init_rx, ++ (unsigned long)mod->module_init_rx ++ + mod->init_size_rx); ++ flush_icache_range((unsigned long)mod->module_core_rx, ++ (unsigned long)mod->module_core_rx + mod->core_size_rx); + + set_fs(old_fs); + +@@ -2397,12 +2472,16 @@ static noinline struct module *load_modu + free_unload: + module_unload_free(mod); + #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) ++ free_init_rx: + percpu_modfree(mod->refptr); +- free_init: + #endif +- module_free(mod, mod->module_init); +- free_core: +- module_free(mod, mod->module_core); ++ module_free_exec(mod, mod->module_init_rx); ++ free_core_rx: ++ module_free_exec(mod, mod->module_core_rx); ++ free_init_rw: ++ module_free(mod, mod->module_init_rw); ++ free_core_rw: ++ module_free(mod, mod->module_core_rw); + /* mod will be freed with core. Don't access it beyond this line! */ + free_percpu: + if (percpu) +@@ -2504,10 +2583,12 @@ SYSCALL_DEFINE3(init_module, void __user + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; + #endif +- module_free(mod, mod->module_init); +- mod->module_init = NULL; +- mod->init_size = 0; +- mod->init_text_size = 0; ++ module_free(mod, mod->module_init_rw); ++ module_free_exec(mod, mod->module_init_rx); ++ mod->module_init_rw = NULL; ++ mod->module_init_rx = NULL; ++ mod->init_size_rw = 0; ++ mod->init_size_rx = 0; + mutex_unlock(&module_mutex); + + return 0; +@@ -2538,10 +2619,16 @@ static const char *get_ksymbol(struct mo + unsigned long nextval; + + /* At worse, next value is at end of module */ +- if (within_module_init(addr, mod)) +- nextval = (unsigned long)mod->module_init+mod->init_text_size; ++ if (within_module_init_rx(addr, mod)) ++ nextval = (unsigned long)mod->module_init_rx+mod->init_size_rx; ++ else if (within_module_init_rw(addr, mod)) ++ nextval = (unsigned long)mod->module_init_rw+mod->init_size_rw; ++ else if (within_module_core_rx(addr, mod)) ++ nextval = (unsigned long)mod->module_core_rx+mod->core_size_rx; ++ else if (within_module_core_rw(addr, mod)) ++ nextval = (unsigned long)mod->module_core_rw+mod->core_size_rw; + else +- nextval = (unsigned long)mod->module_core+mod->core_text_size; ++ return NULL; + + /* Scan for closest preceeding symbol, and next symbol. (ELF + starts real symbols at 1). */ +@@ -2787,7 +2874,7 @@ static int m_show(struct seq_file *m, vo + char buf[8]; + + seq_printf(m, "%s %u", +- mod->name, mod->init_size + mod->core_size); ++ mod->name, mod->init_size_rx + mod->init_size_rw + mod->core_size_rx + mod->core_size_rw); + print_unload_info(m, mod); + + /* Informative for users. */ +@@ -2796,7 +2883,7 @@ static int m_show(struct seq_file *m, vo + mod->state == MODULE_STATE_COMING ? "Loading": + "Live"); + /* Used by oprofile and other similar tools. 
*/ +- seq_printf(m, " 0x%p", mod->module_core); ++ seq_printf(m, " 0x%p 0x%p", mod->module_core_rx, mod->module_core_rw); + + /* Taints info */ + if (mod->taints) +@@ -2832,7 +2919,17 @@ static const struct file_operations proc + + static int __init proc_modules_init(void) + { ++#ifndef CONFIG_GRKERNSEC_HIDESYM ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ proc_create("modules", S_IRUSR, NULL, &proc_modules_operations); ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ proc_create("modules", S_IRUSR | S_IRGRP, NULL, &proc_modules_operations); ++#else + proc_create("modules", 0, NULL, &proc_modules_operations); ++#endif ++#else ++ proc_create("modules", S_IRUSR, NULL, &proc_modules_operations); ++#endif + return 0; + } + module_init(proc_modules_init); +@@ -2891,12 +2988,12 @@ struct module *__module_address(unsigned + { + struct module *mod; + +- if (addr < module_addr_min || addr > module_addr_max) ++ if ((addr < module_addr_min_rx || addr > module_addr_max_rx) && ++ (addr < module_addr_min_rw || addr > module_addr_max_rw)) + return NULL; + + list_for_each_entry_rcu(mod, &modules, list) +- if (within_module_core(addr, mod) +- || within_module_init(addr, mod)) ++ if (within_module_init(addr, mod) || within_module_core(addr, mod)) + return mod; + return NULL; + } +@@ -2930,11 +3027,20 @@ bool is_module_text_address(unsigned lon + */ + struct module *__module_text_address(unsigned long addr) + { +- struct module *mod = __module_address(addr); ++ struct module *mod; ++ ++#ifdef CONFIG_X86_32 ++ addr = ktla_ktva(addr); ++#endif ++ ++ if (addr < module_addr_min_rx || addr > module_addr_max_rx) ++ return NULL; ++ ++ mod = __module_address(addr); ++ + if (mod) { + /* Make sure it's within the text section. */ +- if (!within(addr, mod->module_init, mod->init_text_size) +- && !within(addr, mod->module_core, mod->core_text_size)) ++ if (!within_module_init_rx(addr, mod) && !within_module_core_rx(addr, mod)) + mod = NULL; + } + return mod; +diff -urNp linux-2.6.33.1/kernel/panic.c linux-2.6.33.1/kernel/panic.c +--- linux-2.6.33.1/kernel/panic.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/panic.c 2010-03-20 16:58:42.076969908 -0400 +@@ -396,7 +396,8 @@ EXPORT_SYMBOL(warn_slowpath_null); + */ + void __stack_chk_fail(void) + { +- panic("stack-protector: Kernel stack is corrupted in: %p\n", ++ dump_stack(); ++ panic("stack-protector: Kernel stack is corrupted in: %pS\n", + __builtin_return_address(0)); + } + EXPORT_SYMBOL(__stack_chk_fail); +diff -urNp linux-2.6.33.1/kernel/params.c linux-2.6.33.1/kernel/params.c +--- linux-2.6.33.1/kernel/params.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/params.c 2010-03-20 16:58:42.088938657 -0400 +@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct + return ret; + } + +-static struct sysfs_ops module_sysfs_ops = { ++static const struct sysfs_ops module_sysfs_ops = { + .show = module_attr_show, + .store = module_attr_store, + }; +@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *ks + return 0; + } + +-static struct kset_uevent_ops module_uevent_ops = { ++static const struct kset_uevent_ops module_uevent_ops = { + .filter = uevent_filter, + }; + +diff -urNp linux-2.6.33.1/kernel/pid.c linux-2.6.33.1/kernel/pid.c +--- linux-2.6.33.1/kernel/pid.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/pid.c 2010-03-20 16:58:42.092540045 -0400 +@@ -33,6 +33,7 @@ + #include <linux/rculist.h> + #include <linux/bootmem.h> + #include <linux/hash.h> ++#include <linux/security.h> + #include <linux/pid_namespace.h> + 
#include <linux/init_task.h> + #include <linux/syscalls.h> +@@ -45,7 +46,7 @@ struct pid init_struct_pid = INIT_STRUCT + + int pid_max = PID_MAX_DEFAULT; + +-#define RESERVED_PIDS 300 ++#define RESERVED_PIDS 500 + + int pid_max_min = RESERVED_PIDS + 1; + int pid_max_max = PID_MAX_LIMIT; +@@ -380,7 +381,14 @@ EXPORT_SYMBOL(pid_task); + */ + struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) + { +- return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); ++ struct task_struct *task; ++ ++ task = pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); ++ ++ if (gr_pid_is_chrooted(task)) ++ return NULL; ++ ++ return task; + } + + struct task_struct *find_task_by_vpid(pid_t vnr) +diff -urNp linux-2.6.33.1/kernel/posix-cpu-timers.c linux-2.6.33.1/kernel/posix-cpu-timers.c +--- linux-2.6.33.1/kernel/posix-cpu-timers.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/posix-cpu-timers.c 2010-03-20 16:58:42.092540045 -0400 +@@ -6,6 +6,7 @@ + #include <linux/posix-timers.h> + #include <linux/errno.h> + #include <linux/math64.h> ++#include <linux/security.h> + #include <asm/uaccess.h> + #include <linux/kernel_stat.h> + #include <trace/events/timer.h> +@@ -1043,6 +1044,7 @@ static void check_thread_timers(struct t + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } ++ gr_learn_resource(tsk, RLIMIT_RTTIME, tsk->rt.timeout, 1); + if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. +@@ -1205,6 +1207,7 @@ static void check_process_timers(struct + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } ++ gr_learn_resource(tsk, RLIMIT_CPU, psecs, 0); + if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { + /* + * At the soft limit, send a SIGXCPU every second. 
+diff -urNp linux-2.6.33.1/kernel/power/hibernate.c linux-2.6.33.1/kernel/power/hibernate.c +--- linux-2.6.33.1/kernel/power/hibernate.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/power/hibernate.c 2010-03-20 16:58:42.092540045 -0400 +@@ -49,14 +49,14 @@ enum { + + static int hibernation_mode = HIBERNATION_SHUTDOWN; + +-static struct platform_hibernation_ops *hibernation_ops; ++static const struct platform_hibernation_ops *hibernation_ops; + + /** + * hibernation_set_ops - set the global hibernate operations + * @ops: the hibernation operations to use in subsequent hibernation transitions + */ + +-void hibernation_set_ops(struct platform_hibernation_ops *ops) ++void hibernation_set_ops(const struct platform_hibernation_ops *ops) + { + if (ops && !(ops->begin && ops->end && ops->pre_snapshot + && ops->prepare && ops->finish && ops->enter && ops->pre_restore +diff -urNp linux-2.6.33.1/kernel/power/poweroff.c linux-2.6.33.1/kernel/power/poweroff.c +--- linux-2.6.33.1/kernel/power/poweroff.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/power/poweroff.c 2010-03-20 16:58:42.096640751 -0400 +@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_powerof + .enable_mask = SYSRQ_ENABLE_BOOT, + }; + +-static int pm_sysrq_init(void) ++static int __init pm_sysrq_init(void) + { + register_sysrq_key('o', &sysrq_poweroff_op); + return 0; +diff -urNp linux-2.6.33.1/kernel/power/process.c linux-2.6.33.1/kernel/power/process.c +--- linux-2.6.33.1/kernel/power/process.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/power/process.c 2010-03-20 16:58:42.096640751 -0400 +@@ -38,12 +38,15 @@ static int try_to_freeze_tasks(bool sig_ + struct timeval start, end; + u64 elapsed_csecs64; + unsigned int elapsed_csecs; ++ bool timedout = false; + + do_gettimeofday(&start); + + end_time = jiffies + TIMEOUT; + while (true) { + todo = 0; ++ if (time_after(jiffies, end_time)) ++ timedout = true; + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (frozen(p) || !freezeable(p)) +@@ -58,12 +61,16 @@ static int try_to_freeze_tasks(bool sig_ + * It is "frozen enough". If the task does wake + * up, it will immediately call try_to_freeze. + */ +- if (!task_is_stopped_or_traced(p) && +- !freezer_should_skip(p)) ++ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) { + todo++; ++ if (timedout) { ++ printk(KERN_ERR "Task refusing to freeze:\n"); ++ sched_show_task(p); ++ } ++ } + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +- if (!todo || time_after(jiffies, end_time)) ++ if (!todo || timedout) + break; + + /* +diff -urNp linux-2.6.33.1/kernel/power/suspend.c linux-2.6.33.1/kernel/power/suspend.c +--- linux-2.6.33.1/kernel/power/suspend.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/power/suspend.c 2010-03-20 16:58:42.096640751 -0400 +@@ -23,13 +23,13 @@ const char *const pm_states[PM_SUSPEND_M + [PM_SUSPEND_MEM] = "mem", + }; + +-static struct platform_suspend_ops *suspend_ops; ++static const struct platform_suspend_ops *suspend_ops; + + /** + * suspend_set_ops - Set the global suspend method table. + * @ops: Pointer to ops structure. 
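In the try_to_freeze_tasks hunk above, the timeout is latched into a timedout flag at the top of the loop, so the task list is walked one final time with diagnostics enabled: every task still refusing to freeze is reported via sched_show_task before the function gives up, rather than timing out silently. The control flow, reduced to a runnable sketch with hard-coded stand-ins:

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        int stubborn[] = { 7, 42 };     /* stand-ins for unfreezable tasks */
        time_t end = time(NULL);        /* deadline already reached */
        int timedout = 0, todo;

        while (1) {
            todo = 0;
            if (time(NULL) >= end)
                timedout = 1;           /* latch, then keep walking */
            for (int i = 0; i < 2; i++) {
                todo++;
                if (timedout)           /* report instead of bailing out */
                    printf("task %d refusing to freeze\n", stubborn[i]);
            }
            if (!todo || timedout)
                break;
        }
        return 0;
    }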
+ */ +-void suspend_set_ops(struct platform_suspend_ops *ops) ++void suspend_set_ops(const struct platform_suspend_ops *ops) + { + mutex_lock(&pm_mutex); + suspend_ops = ops; +diff -urNp linux-2.6.33.1/kernel/printk.c linux-2.6.33.1/kernel/printk.c +--- linux-2.6.33.1/kernel/printk.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/printk.c 2010-03-20 16:58:42.096640751 -0400 +@@ -280,6 +280,11 @@ int do_syslog(int type, char __user *buf + char c; + int error = 0; + ++#ifdef CONFIG_GRKERNSEC_DMESG ++ if (grsec_enable_dmesg && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++#endif ++ + error = security_syslog(type); + if (error) + return error; +diff -urNp linux-2.6.33.1/kernel/ptrace.c linux-2.6.33.1/kernel/ptrace.c +--- linux-2.6.33.1/kernel/ptrace.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/ptrace.c 2010-03-20 16:58:42.104944311 -0400 +@@ -141,7 +141,7 @@ int __ptrace_may_access(struct task_stru + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && +- !capable(CAP_SYS_PTRACE)) { ++ !capable_nolog(CAP_SYS_PTRACE)) { + rcu_read_unlock(); + return -EPERM; + } +@@ -149,7 +149,7 @@ int __ptrace_may_access(struct task_stru + smp_rmb(); + if (task->mm) + dumpable = get_dumpable(task->mm); +- if (!dumpable && !capable(CAP_SYS_PTRACE)) ++ if (!dumpable && !capable_nolog(CAP_SYS_PTRACE)) + return -EPERM; + + return security_ptrace_access_check(task, mode); +@@ -199,7 +199,7 @@ int ptrace_attach(struct task_struct *ta + goto unlock_tasklist; + + task->ptrace = PT_PTRACED; +- if (capable(CAP_SYS_PTRACE)) ++ if (capable_nolog(CAP_SYS_PTRACE)) + task->ptrace |= PT_PTRACE_CAP; + + __ptrace_link(task, current); +@@ -362,7 +362,7 @@ int ptrace_readdata(struct task_struct * + break; + return -EIO; + } +- if (copy_to_user(dst, buf, retval)) ++ if (retval > sizeof(buf) || copy_to_user(dst, buf, retval)) + return -EFAULT; + copied += retval; + src += retval; +@@ -532,18 +532,18 @@ int ptrace_request(struct task_struct *c + ret = ptrace_setoptions(child, data); + break; + case PTRACE_GETEVENTMSG: +- ret = put_user(child->ptrace_message, (unsigned long __user *) data); ++ ret = put_user(child->ptrace_message, (__force unsigned long __user *) data); + break; + + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) +- ret = copy_siginfo_to_user((siginfo_t __user *) data, ++ ret = copy_siginfo_to_user((__force siginfo_t __user *) data, + &siginfo); + break; + + case PTRACE_SETSIGINFO: +- if (copy_from_user(&siginfo, (siginfo_t __user *) data, ++ if (copy_from_user(&siginfo, (__force siginfo_t __user *) data, + sizeof siginfo)) + ret = -EFAULT; + else +@@ -621,14 +621,21 @@ SYSCALL_DEFINE4(ptrace, long, request, l + goto out; + } + ++ if (gr_handle_ptrace(child, request)) { ++ ret = -EPERM; ++ goto out_put_task_struct; ++ } ++ + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + /* + * Some architectures need to do book-keeping after + * a ptrace attach. 
+ */ +- if (!ret) ++ if (!ret) { + arch_ptrace_attach(child); ++ gr_audit_ptrace(child); ++ } + goto out_put_task_struct; + } + +@@ -653,7 +660,7 @@ int generic_ptrace_peekdata(struct task_ + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + if (copied != sizeof(tmp)) + return -EIO; +- return put_user(tmp, (unsigned long __user *)data); ++ return put_user(tmp, (__force unsigned long __user *)data); + } + + int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) +diff -urNp linux-2.6.33.1/kernel/rcutree.c linux-2.6.33.1/kernel/rcutree.c +--- linux-2.6.33.1/kernel/rcutree.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/rcutree.c 2010-03-20 16:58:42.108722969 -0400 +@@ -1315,7 +1315,7 @@ __rcu_process_callbacks(struct rcu_state + /* + * Do softirq processing for the current CPU. + */ +-static void rcu_process_callbacks(struct softirq_action *unused) ++static void rcu_process_callbacks(void) + { + /* + * Memory references from any prior RCU read-side critical sections +diff -urNp linux-2.6.33.1/kernel/relay.c linux-2.6.33.1/kernel/relay.c +--- linux-2.6.33.1/kernel/relay.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/relay.c 2010-03-20 16:58:42.108722969 -0400 +@@ -1292,7 +1292,7 @@ static int subbuf_splice_actor(struct fi + return 0; + + ret = *nonpad_ret = splice_to_pipe(pipe, &spd); +- if (ret < 0 || ret < total_len) ++ if ((int)ret < 0 || ret < total_len) + return ret; + + if (read_start + ret == nonpad_end) +diff -urNp linux-2.6.33.1/kernel/resource.c linux-2.6.33.1/kernel/resource.c +--- linux-2.6.33.1/kernel/resource.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/resource.c 2010-03-20 16:58:42.108722969 -0400 +@@ -132,8 +132,18 @@ static const struct file_operations proc + + static int __init ioresources_init(void) + { ++#ifdef CONFIG_GRKERNSEC_PROC_ADD ++#ifdef CONFIG_GRKERNSEC_PROC_USER ++ proc_create("ioports", S_IRUSR, NULL, &proc_ioports_operations); ++ proc_create("iomem", S_IRUSR, NULL, &proc_iomem_operations); ++#elif defined(CONFIG_GRKERNSEC_PROC_USERGROUP) ++ proc_create("ioports", S_IRUSR | S_IRGRP, NULL, &proc_ioports_operations); ++ proc_create("iomem", S_IRUSR | S_IRGRP, NULL, &proc_iomem_operations); ++#endif ++#else + proc_create("ioports", 0, NULL, &proc_ioports_operations); + proc_create("iomem", 0, NULL, &proc_iomem_operations); ++#endif + return 0; + } + __initcall(ioresources_init); +diff -urNp linux-2.6.33.1/kernel/sched.c linux-2.6.33.1/kernel/sched.c +--- linux-2.6.33.1/kernel/sched.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/sched.c 2010-03-20 16:58:42.116976245 -0400 +@@ -4855,7 +4855,7 @@ out: + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. 
+ */ +-static void run_rebalance_domains(struct softirq_action *h) ++static void run_rebalance_domains(void) + { + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); +@@ -6158,6 +6158,8 @@ int can_nice(const struct task_struct *p + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; + ++ gr_learn_resource(p, RLIMIT_NICE, nice_rlim, 1); ++ + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + capable(CAP_SYS_NICE)); + } +@@ -6191,7 +6193,8 @@ SYSCALL_DEFINE1(nice, int, increment) + if (nice > 19) + nice = 19; + +- if (increment < 0 && !can_nice(current, nice)) ++ if (increment < 0 && (!can_nice(current, nice) || ++ gr_handle_chroot_nice())) + return -EPERM; + + retval = security_task_setnice(current, nice); +@@ -6333,6 +6336,8 @@ recheck: + if (rt_policy(policy)) { + unsigned long rlim_rtprio; + ++ gr_learn_resource(p, RLIMIT_RTPRIO, param->sched_priority, 1); ++ + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; +diff -urNp linux-2.6.33.1/kernel/signal.c linux-2.6.33.1/kernel/signal.c +--- linux-2.6.33.1/kernel/signal.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/signal.c 2010-03-20 16:58:42.116976245 -0400 +@@ -226,6 +226,9 @@ __sigqueue_alloc(int sig, struct task_st + atomic_inc(&user->sigpending); + rcu_read_unlock(); + ++ if (!override_rlimit) ++ gr_learn_resource(t, RLIMIT_SIGPENDING, atomic_read(&user->sigpending), 1); ++ + if (override_rlimit || + atomic_read(&user->sigpending) <= + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { +@@ -659,6 +662,9 @@ static int check_kill_permission(int sig + } + } + ++ if (gr_handle_signal(t, sig)) ++ return -EPERM; ++ + return security_task_kill(t, info, sig, 0); + } + +@@ -1006,7 +1012,7 @@ __group_send_sig_info(int sig, struct si + return send_signal(sig, info, p, 1); + } + +-static int ++int + specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) + { + return send_signal(sig, info, t, 0); +@@ -1060,6 +1066,9 @@ force_sig_info(int sig, struct siginfo * + ret = specific_send_sig_info(sig, info, t); + spin_unlock_irqrestore(&t->sighand->siglock, flags); + ++ gr_log_signal(sig, !is_si_special(info) ? info->si_addr : NULL, t); ++ gr_handle_crash(t, sig); ++ + return ret; + } + +@@ -1113,8 +1122,11 @@ int group_send_sig_info(int sig, struct + { + int ret = check_kill_permission(sig, info, p); + +- if (!ret && sig) ++ if (!ret && sig) { + ret = do_send_sig_info(sig, info, p, true); ++ if (!ret) ++ gr_log_signal(sig, !is_si_special(info) ? 
info->si_addr : NULL, p); ++ } + + return ret; + } +diff -urNp linux-2.6.33.1/kernel/smp.c linux-2.6.33.1/kernel/smp.c +--- linux-2.6.33.1/kernel/smp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/smp.c 2010-03-20 16:58:42.120938497 -0400 +@@ -498,22 +498,22 @@ int smp_call_function(void (*func)(void + } + EXPORT_SYMBOL(smp_call_function); + +-void ipi_call_lock(void) ++void ipi_call_lock(void) __acquires(call_function.lock) + { + raw_spin_lock(&call_function.lock); + } + +-void ipi_call_unlock(void) ++void ipi_call_unlock(void) __releases(call_function.lock) + { + raw_spin_unlock(&call_function.lock); + } + +-void ipi_call_lock_irq(void) ++void ipi_call_lock_irq(void) __acquires(call_function.lock) + { + raw_spin_lock_irq(&call_function.lock); + } + +-void ipi_call_unlock_irq(void) ++void ipi_call_unlock_irq(void) __releases(call_function.lock) + { + raw_spin_unlock_irq(&call_function.lock); + } +diff -urNp linux-2.6.33.1/kernel/softirq.c linux-2.6.33.1/kernel/softirq.c +--- linux-2.6.33.1/kernel/softirq.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/softirq.c 2010-03-20 16:58:42.120938497 -0400 +@@ -56,7 +56,7 @@ static struct softirq_action softirq_vec + + static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +-char *softirq_to_name[NR_SOFTIRQS] = { ++const char * const softirq_to_name[NR_SOFTIRQS] = { + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "TASKLET", "SCHED", "HRTIMER", "RCU" + }; +@@ -190,7 +190,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); + + asmlinkage void __do_softirq(void) + { +- struct softirq_action *h; ++ const struct softirq_action *h; + __u32 pending; + int max_restart = MAX_SOFTIRQ_RESTART; + int cpu; +@@ -216,7 +216,7 @@ restart: + kstat_incr_softirqs_this_cpu(h - softirq_vec); + + trace_softirq_entry(h, softirq_vec); +- h->action(h); ++ h->action(); + trace_softirq_exit(h, softirq_vec); + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered softirq %td %s %p" +@@ -340,7 +340,7 @@ void raise_softirq(unsigned int nr) + local_irq_restore(flags); + } + +-void open_softirq(int nr, void (*action)(struct softirq_action *)) ++void open_softirq(int nr, void (*action)(void)) + { + softirq_vec[nr].action = action; + } +@@ -396,7 +396,7 @@ void __tasklet_hi_schedule_first(struct + + EXPORT_SYMBOL(__tasklet_hi_schedule_first); + +-static void tasklet_action(struct softirq_action *a) ++static void tasklet_action(void) + { + struct tasklet_struct *list; + +@@ -431,7 +431,7 @@ static void tasklet_action(struct softir + } + } + +-static void tasklet_hi_action(struct softirq_action *a) ++static void tasklet_hi_action(void) + { + struct tasklet_struct *list; + +diff -urNp linux-2.6.33.1/kernel/sys.c linux-2.6.33.1/kernel/sys.c +--- linux-2.6.33.1/kernel/sys.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/sys.c 2010-03-20 16:58:42.128666983 -0400 +@@ -132,6 +132,12 @@ static int set_one_prio(struct task_stru + error = -EACCES; + goto out; + } ++ ++ if (gr_handle_chroot_setpriority(p, niceval)) { ++ error = -EACCES; ++ goto out; ++ } ++ + no_nice = security_task_setnice(p, niceval); + if (no_nice) { + error = no_nice; +@@ -513,6 +519,9 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, g + goto error; + } + ++ if (gr_check_group_change(new->gid, new->egid, -1)) ++ goto error; ++ + if (rgid != (gid_t) -1 || + (egid != (gid_t) -1 && egid != old->gid)) + new->sgid = new->egid; +@@ -546,6 +555,10 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) + goto error; + + retval = -EPERM; ++ ++ if 
(gr_check_group_change(gid, gid, gid)) ++ goto error; ++ + if (capable(CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = gid; + else if (gid == old->gid || gid == old->sgid) +@@ -636,6 +649,9 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, u + goto error; + } + ++ if (gr_check_user_change(new->uid, new->euid, -1)) ++ goto error; ++ + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) +@@ -684,6 +700,12 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) + goto error; + + retval = -EPERM; ++ ++ if (gr_check_crash_uid(uid)) ++ goto error; ++ if (gr_check_user_change(uid, uid, uid)) ++ goto error; ++ + if (capable(CAP_SETUID)) { + new->suid = new->uid = uid; + if (uid != old->uid) { +@@ -741,6 +763,9 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, + goto error; + } + ++ if (gr_check_user_change(ruid, euid, -1)) ++ goto error; ++ + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (ruid != old->uid) { +@@ -809,6 +834,9 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, + goto error; + } + ++ if (gr_check_group_change(rgid, egid, -1)) ++ goto error; ++ + if (rgid != (gid_t) -1) + new->gid = rgid; + if (egid != (gid_t) -1) +@@ -858,6 +886,9 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) + goto error; + ++ if (gr_check_user_change(-1, -1, uid)) ++ goto error; ++ + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || + capable(CAP_SETUID)) { +@@ -898,6 +929,9 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || + capable(CAP_SETGID)) { ++ if (gr_check_group_change(-1, -1, gid)) ++ goto error; ++ + if (gid != old_fsgid) { + new->fsgid = gid; + goto change_okay; +@@ -1460,7 +1494,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsi + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: +- if (arg2 < 0 || arg2 > 1) { ++ if (arg2 > 1) { + error = -EINVAL; + break; + } +diff -urNp linux-2.6.33.1/kernel/sysctl.c linux-2.6.33.1/kernel/sysctl.c +--- linux-2.6.33.1/kernel/sysctl.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/sysctl.c 2010-03-20 16:58:42.186495404 -0400 +@@ -62,6 +62,13 @@ + + + #if defined(CONFIG_SYSCTL) ++#include <linux/grsecurity.h> ++#include <linux/grinternal.h> ++ ++extern __u32 gr_handle_sysctl(const ctl_table *table, const int op); ++extern int gr_handle_sysctl_mod(const char *dirname, const char *name, ++ const int op); ++extern int gr_handle_chroot_sysctl(const int op); + + /* External variables not in a header file. 
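The prctl hunk above removes "arg2 < 0" from the PR_SET_DUMPABLE check: arg2 is an unsigned long in SYSCALL_DEFINE5(prctl, ...), so that comparison was always false and (arg2 > 1) alone covers every invalid value. A quick demonstration:

    #include <stdio.h>

    static int check_dumpable_arg(unsigned long arg2)
    {
        return (arg2 > 1) ? -1 : 0;     /* -EINVAL in the kernel */
    }

    int main(void)
    {
        printf("%d\n", check_dumpable_arg(0));                 /* 0: valid */
        printf("%d\n", check_dumpable_arg(1));                 /* 0: valid */
        printf("%d\n", check_dumpable_arg((unsigned long)-1)); /* -1: was "negative" */
        return 0;
    }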
*/ + extern int C_A_D; +@@ -169,6 +176,7 @@ static int proc_do_cad_pid(struct ctl_ta + static int proc_taint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #endif ++extern ctl_table grsecurity_table[]; + + static struct ctl_table root_table[]; + static struct ctl_table_root sysctl_table_root; +@@ -201,6 +209,20 @@ extern struct ctl_table epoll_table[]; + int sysctl_legacy_va_layout; + #endif + ++#ifdef CONFIG_PAX_SOFTMODE ++static ctl_table pax_table[] = { ++ { ++ .procname = "softmode", ++ .data = &pax_softmode, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0600, ++ .proc_handler = &proc_dointvec, ++ }, ++ ++ { } ++}; ++#endif ++ + extern int prove_locking; + extern int lock_stat; + +@@ -251,6 +273,22 @@ static int max_sched_shares_ratelimit = + #endif + + static struct ctl_table kern_table[] = { ++#if defined(CONFIG_GRKERNSEC_SYSCTL) || defined(CONFIG_GRKERNSEC_ROFS) ++ { ++ .procname = "grsecurity", ++ .mode = 0500, ++ .child = grsecurity_table, ++ }, ++#endif ++ ++#ifdef CONFIG_PAX_SOFTMODE ++ { ++ .procname = "pax", ++ .mode = 0500, ++ .child = pax_table, ++ }, ++#endif ++ + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -1629,6 +1667,16 @@ int sysctl_perm(struct ctl_table_root *r + int error; + int mode; + ++ if (table->parent != NULL && table->parent->procname != NULL && ++ table->procname != NULL && ++ gr_handle_sysctl_mod(table->parent->procname, table->procname, op)) ++ return -EACCES; ++ if (gr_handle_chroot_sysctl(op)) ++ return -EACCES; ++ error = gr_handle_sysctl(table, op); ++ if (error) ++ return error; ++ + error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) + return error; +@@ -2137,6 +2185,8 @@ static int __do_proc_dointvec(void *tbl_ + len = strlen(buf); + if (len > left) + len = left; ++ if (len > sizeof(buf)) ++ len = sizeof(buf); + if(copy_to_user(s, buf, len)) + return -EFAULT; + left -= len; +@@ -2362,6 +2412,8 @@ static int __do_proc_doulongvec_minmax(v + len = strlen(buf); + if (len > left) + len = left; ++ if (len > sizeof(buf)) ++ len = sizeof(buf); + if(copy_to_user(s, buf, len)) + return -EFAULT; + left -= len; +diff -urNp linux-2.6.33.1/kernel/taskstats.c linux-2.6.33.1/kernel/taskstats.c +--- linux-2.6.33.1/kernel/taskstats.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/taskstats.c 2010-03-20 16:58:42.186495404 -0400 +@@ -26,9 +26,12 @@ + #include <linux/cgroup.h> + #include <linux/fs.h> + #include <linux/file.h> ++#include <linux/grsecurity.h> + #include <net/genetlink.h> + #include <asm/atomic.h> + ++extern int gr_is_taskstats_denied(int pid); ++ + /* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute +@@ -433,6 +436,9 @@ static int taskstats_user_cmd(struct sk_ + size_t size; + cpumask_var_t mask; + ++ if (gr_is_taskstats_denied(current->pid)) ++ return -EACCES; ++ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + +diff -urNp linux-2.6.33.1/kernel/time/tick-broadcast.c linux-2.6.33.1/kernel/time/tick-broadcast.c +--- linux-2.6.33.1/kernel/time/tick-broadcast.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/time/tick-broadcast.c 2010-03-20 16:58:42.200972419 -0400 +@@ -116,7 +116,7 @@ int tick_device_uses_broadcast(struct cl + * then clear the broadcast bit. 
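The two sysctl.c hunks above add a defensive clamp before copy_to_user(): len comes from strlen() of a local formatting buffer and is already capped at the caller-supplied "left", so capping it at sizeof(buf) as well guarantees the copy can never run past the buffer even if the earlier logic changes. The pattern in isolation:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char buf[16] = "12345";
        size_t left = 1000;             /* bytes the caller still wants */
        size_t len = strlen(buf);

        if (len > left)
            len = left;
        if (len > sizeof(buf))          /* the added clamp */
            len = sizeof(buf);

        printf("copying %zu bytes\n", len);
        return 0;
    }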
+ */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { +- int cpu = smp_processor_id(); ++ cpu = smp_processor_id(); + + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + tick_broadcast_clear_oneshot(cpu); +diff -urNp linux-2.6.33.1/kernel/time.c linux-2.6.33.1/kernel/time.c +--- linux-2.6.33.1/kernel/time.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/time.c 2010-03-20 16:58:42.200972419 -0400 +@@ -94,6 +94,9 @@ SYSCALL_DEFINE1(stime, time_t __user *, + return err; + + do_settimeofday(&tv); ++ ++ gr_log_timechange(); ++ + return 0; + } + +@@ -202,6 +205,8 @@ SYSCALL_DEFINE2(settimeofday, struct tim + return -EFAULT; + } + ++ gr_log_timechange(); ++ + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + } + +@@ -240,7 +245,7 @@ EXPORT_SYMBOL(current_fs_time); + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +-unsigned int inline jiffies_to_msecs(const unsigned long j) ++inline unsigned int jiffies_to_msecs(const unsigned long j) + { + #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +@@ -256,7 +261,7 @@ unsigned int inline jiffies_to_msecs(con + } + EXPORT_SYMBOL(jiffies_to_msecs); + +-unsigned int inline jiffies_to_usecs(const unsigned long j) ++inline unsigned int jiffies_to_usecs(const unsigned long j) + { + #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +diff -urNp linux-2.6.33.1/kernel/timer.c linux-2.6.33.1/kernel/timer.c +--- linux-2.6.33.1/kernel/timer.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/timer.c 2010-03-20 16:58:42.212949807 -0400 +@@ -1206,7 +1206,7 @@ void update_process_times(int user_tick) + /* + * This function runs timers and the timer-tq in bottom half context. + */ +-static void run_timer_softirq(struct softirq_action *h) ++static void run_timer_softirq(void) + { + struct tvec_base *base = __get_cpu_var(tvec_bases); + +diff -urNp linux-2.6.33.1/kernel/trace/ftrace.c linux-2.6.33.1/kernel/trace/ftrace.c +--- linux-2.6.33.1/kernel/trace/ftrace.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/trace/ftrace.c 2010-03-20 16:58:42.212949807 -0400 +@@ -1102,13 +1102,18 @@ ftrace_code_disable(struct module *mod, + + ip = rec->ip; + ++ ret = ftrace_arch_code_modify_prepare(); ++ FTRACE_WARN_ON(ret); ++ if (ret) ++ return 0; ++ + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); ++ FTRACE_WARN_ON(ftrace_arch_code_modify_post_process()); + if (ret) { + ftrace_bug(ret, ip); + rec->flags |= FTRACE_FL_FAILED; +- return 0; + } +- return 1; ++ return ret ? 
0 : 1; + } + + /* +diff -urNp linux-2.6.33.1/kernel/trace/Kconfig linux-2.6.33.1/kernel/trace/Kconfig +--- linux-2.6.33.1/kernel/trace/Kconfig 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/trace/Kconfig 2010-03-20 16:58:42.212949807 -0400 +@@ -124,6 +124,7 @@ if FTRACE + config FUNCTION_TRACER + bool "Kernel Function Tracer" + depends on HAVE_FUNCTION_TRACER ++ depends on !PAX_KERNEXEC + select FRAME_POINTER + select KALLSYMS + select GENERIC_TRACER +@@ -362,6 +363,7 @@ config PROFILE_KSYM_TRACER + config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FUNCTION_TRACER ++ depends on !PAX_KERNEXEC + select FUNCTION_TRACER + select STACKTRACE + select KALLSYMS +diff -urNp linux-2.6.33.1/kernel/trace/trace.c linux-2.6.33.1/kernel/trace/trace.c +--- linux-2.6.33.1/kernel/trace/trace.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/trace/trace.c 2010-03-20 16:58:42.217937104 -0400 +@@ -3820,10 +3820,9 @@ static const struct file_operations trac + }; + #endif + +-static struct dentry *d_tracer; +- + struct dentry *tracing_init_dentry(void) + { ++ static struct dentry *d_tracer; + static int once; + + if (d_tracer) +@@ -3843,10 +3842,9 @@ struct dentry *tracing_init_dentry(void) + return d_tracer; + } + +-static struct dentry *d_percpu; +- + struct dentry *tracing_dentry_percpu(void) + { ++ static struct dentry *d_percpu; + static int once; + struct dentry *d_tracer; + +diff -urNp linux-2.6.33.1/kernel/trace/trace_output.c linux-2.6.33.1/kernel/trace/trace_output.c +--- linux-2.6.33.1/kernel/trace/trace_output.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/trace/trace_output.c 2010-03-20 16:58:42.220764190 -0400 +@@ -280,7 +280,7 @@ int trace_seq_path(struct trace_seq *s, + + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { +- p = mangle_path(s->buffer + s->len, p, "\n"); ++ p = mangle_path(s->buffer + s->len, p, "\n\\"); + if (p) { + s->len = p - s->buffer; + return 1; +diff -urNp linux-2.6.33.1/kernel/trace/trace_stack.c linux-2.6.33.1/kernel/trace/trace_stack.c +--- linux-2.6.33.1/kernel/trace/trace_stack.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/kernel/trace/trace_stack.c 2010-03-20 16:58:42.220764190 -0400 +@@ -50,7 +50,7 @@ static inline void check_stack(void) + return; + + /* we do not handle interrupt stacks yet */ +- if (!object_is_on_stack(&this_size)) ++ if (!object_starts_on_stack(&this_size)) + return; + + local_irq_save(flags); +diff -urNp linux-2.6.33.1/lib/bug.c linux-2.6.33.1/lib/bug.c +--- linux-2.6.33.1/lib/bug.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/bug.c 2010-03-20 16:58:42.225948146 -0400 +@@ -135,6 +135,8 @@ enum bug_trap_type report_bug(unsigned l + return BUG_TRAP_TYPE_NONE; + + bug = find_bug(bugaddr); ++ if (!bug) ++ return BUG_TRAP_TYPE_NONE; + + printk(KERN_EMERG "------------[ cut here ]------------\n"); + +diff -urNp linux-2.6.33.1/lib/debugobjects.c linux-2.6.33.1/lib/debugobjects.c +--- linux-2.6.33.1/lib/debugobjects.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/debugobjects.c 2010-03-20 16:58:42.225948146 -0400 +@@ -277,7 +277,7 @@ static void debug_object_is_on_stack(voi + if (limit > 4) + return; + +- is_on_stack = object_is_on_stack(addr); ++ is_on_stack = object_starts_on_stack(addr); + if (is_on_stack == onstack) + return; + +diff -urNp linux-2.6.33.1/lib/dma-debug.c linux-2.6.33.1/lib/dma-debug.c +--- linux-2.6.33.1/lib/dma-debug.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/dma-debug.c 2010-03-20 
16:58:42.240954646 -0400 +@@ -861,7 +861,7 @@ out: + + static void check_for_stack(struct device *dev, void *addr) + { +- if (object_is_on_stack(addr)) ++ if (object_starts_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from" + "stack [addr=%p]\n", addr); + } +diff -urNp linux-2.6.33.1/lib/inflate.c linux-2.6.33.1/lib/inflate.c +--- linux-2.6.33.1/lib/inflate.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/inflate.c 2010-03-20 16:58:42.240954646 -0400 +@@ -266,7 +266,7 @@ static void free(void *where) + malloc_ptr = free_mem_ptr; + } + #else +-#define malloc(a) kmalloc(a, GFP_KERNEL) ++#define malloc(a) kmalloc((a), GFP_KERNEL) + #define free(a) kfree(a) + #endif + +diff -urNp linux-2.6.33.1/lib/Kconfig.debug linux-2.6.33.1/lib/Kconfig.debug +--- linux-2.6.33.1/lib/Kconfig.debug 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/Kconfig.debug 2010-03-20 16:58:42.244515655 -0400 +@@ -914,7 +914,7 @@ config LATENCYTOP + select STACKTRACE + select SCHEDSTATS + select SCHED_DEBUG +- depends on HAVE_LATENCYTOP_SUPPORT ++ depends on HAVE_LATENCYTOP_SUPPORT && !GRKERNSEC_HIDESYM + help + Enable this option if you want to use the LatencyTOP tool + to find out which userspace is blocking on what kernel operations. +diff -urNp linux-2.6.33.1/lib/kobject.c linux-2.6.33.1/lib/kobject.c +--- linux-2.6.33.1/lib/kobject.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/kobject.c 2010-03-20 16:58:42.244515655 -0400 +@@ -700,7 +700,7 @@ static ssize_t kobj_attr_store(struct ko + return ret; + } + +-struct sysfs_ops kobj_sysfs_ops = { ++const struct sysfs_ops kobj_sysfs_ops = { + .show = kobj_attr_show, + .store = kobj_attr_store, + }; +@@ -789,7 +789,7 @@ static struct kobj_type kset_ktype = { + * If the kset was not able to be created, NULL will be returned. + */ + static struct kset *kset_create(const char *name, +- struct kset_uevent_ops *uevent_ops, ++ const struct kset_uevent_ops *uevent_ops, + struct kobject *parent_kobj) + { + struct kset *kset; +@@ -832,7 +832,7 @@ static struct kset *kset_create(const ch + * If the kset was not able to be created, NULL will be returned. 
+ */ + struct kset *kset_create_and_add(const char *name, +- struct kset_uevent_ops *uevent_ops, ++ const struct kset_uevent_ops *uevent_ops, + struct kobject *parent_kobj) + { + struct kset *kset; +diff -urNp linux-2.6.33.1/lib/kobject_uevent.c linux-2.6.33.1/lib/kobject_uevent.c +--- linux-2.6.33.1/lib/kobject_uevent.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/kobject_uevent.c 2010-03-20 16:58:42.244515655 -0400 +@@ -95,7 +95,7 @@ int kobject_uevent_env(struct kobject *k + const char *subsystem; + struct kobject *top_kobj; + struct kset *kset; +- struct kset_uevent_ops *uevent_ops; ++ const struct kset_uevent_ops *uevent_ops; + u64 seq; + int i = 0; + int retval = 0; +diff -urNp linux-2.6.33.1/lib/parser.c linux-2.6.33.1/lib/parser.c +--- linux-2.6.33.1/lib/parser.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/parser.c 2010-03-20 16:58:42.244515655 -0400 +@@ -129,7 +129,7 @@ static int match_number(substring_t *s, + char *buf; + int ret; + +- buf = kmalloc(s->to - s->from + 1, GFP_KERNEL); ++ buf = kmalloc((s->to - s->from) + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, s->from, s->to - s->from); +diff -urNp linux-2.6.33.1/lib/radix-tree.c linux-2.6.33.1/lib/radix-tree.c +--- linux-2.6.33.1/lib/radix-tree.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/radix-tree.c 2010-03-20 16:58:42.253002352 -0400 +@@ -81,7 +81,7 @@ struct radix_tree_preload { + int nr; + struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; + }; +-static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; ++static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads); + + static inline gfp_t root_gfp_mask(struct radix_tree_root *root) + { +diff -urNp linux-2.6.33.1/lib/random32.c linux-2.6.33.1/lib/random32.c +--- linux-2.6.33.1/lib/random32.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/lib/random32.c 2010-03-20 16:58:42.256945498 -0400 +@@ -61,7 +61,7 @@ static u32 __random32(struct rnd_state * + */ + static inline u32 __seed(u32 x, u32 m) + { +- return (x < m) ? x + m : x; ++ return (x <= m) ? x + m + 1 : x; + } + + /** +diff -urNp linux-2.6.33.1/localversion-grsec linux-2.6.33.1/localversion-grsec +--- linux-2.6.33.1/localversion-grsec 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.33.1/localversion-grsec 2010-03-20 16:58:42.256945498 -0400 +@@ -0,0 +1 @@ ++-grsec +diff -urNp linux-2.6.33.1/Makefile linux-2.6.33.1/Makefile +--- linux-2.6.33.1/Makefile 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/Makefile 2010-03-20 16:58:42.256945498 -0400 +@@ -227,8 +227,8 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" + + HOSTCC = gcc + HOSTCXX = g++ +-HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer +-HOSTCXXFLAGS = -O2 ++HOSTCFLAGS = -Wall -W -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-delete-null-pointer-checks ++HOSTCXXFLAGS = -O2 -fno-delete-null-pointer-checks + + # Decide whether to build built-in, modular, or both. + # Normally, just do built-in. 
+@@ -650,7 +650,7 @@ export mod_strip_cmd + + + ifeq ($(KBUILD_EXTMOD),) +-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ ++core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ grsecurity/ + + vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ + $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ +diff -urNp linux-2.6.33.1/mm/filemap.c linux-2.6.33.1/mm/filemap.c +--- linux-2.6.33.1/mm/filemap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/filemap.c 2010-03-20 16:58:42.256945498 -0400 +@@ -1601,7 +1601,7 @@ int generic_file_mmap(struct file * file + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) +- return -ENOEXEC; ++ return -ENODEV; + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; +@@ -1997,6 +1997,7 @@ inline int generic_write_checks(struct f + *pos = i_size_read(inode); + + if (limit != RLIM_INFINITY) { ++ gr_learn_resource(current, RLIMIT_FSIZE,*pos, 0); + if (*pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; +diff -urNp linux-2.6.33.1/mm/fremap.c linux-2.6.33.1/mm/fremap.c +--- linux-2.6.33.1/mm/fremap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/fremap.c 2010-03-20 16:58:42.256945498 -0400 +@@ -153,6 +153,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsign + retry: + vma = find_vma(mm, start); + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma && (mm->pax_flags & MF_PAX_SEGMEXEC) && (vma->vm_flags & VM_MAYEXEC)) ++ goto out; ++#endif ++ + /* + * Make sure the vma is shared, that it supports prefaulting, + * and that the remapped range is valid and fully within +diff -urNp linux-2.6.33.1/mm/highmem.c linux-2.6.33.1/mm/highmem.c +--- linux-2.6.33.1/mm/highmem.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/highmem.c 2010-03-20 16:58:42.256945498 -0400 +@@ -116,9 +116,10 @@ static void flush_all_zero_pkmaps(void) + * So no dangers, even with speculative execution. 
+ */ + page = pte_page(pkmap_page_table[i]); ++ pax_open_kernel(); + pte_clear(&init_mm, (unsigned long)page_address(page), + &pkmap_page_table[i]); +- ++ pax_close_kernel(); + set_page_address(page, NULL); + need_flush = 1; + } +@@ -177,9 +178,11 @@ start: + } + } + vaddr = PKMAP_ADDR(last_pkmap_nr); ++ ++ pax_open_kernel(); + set_pte_at(&init_mm, vaddr, + &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); +- ++ pax_close_kernel(); + pkmap_count[last_pkmap_nr] = 1; + set_page_address(page, (void *)vaddr); + +diff -urNp linux-2.6.33.1/mm/hugetlb.c linux-2.6.33.1/mm/hugetlb.c +--- linux-2.6.33.1/mm/hugetlb.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/hugetlb.c 2010-03-20 16:58:42.256945498 -0400 +@@ -2267,6 +2267,26 @@ static int unmap_ref_private(struct mm_s + return 1; + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++static void pax_mirror_huge_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ struct vm_area_struct *vma_m; ++ unsigned long address_m; ++ pte_t *ptep_m; ++ ++ vma_m = pax_find_mirror_vma(vma); ++ if (!vma_m) ++ return; ++ ++ BUG_ON(address >= SEGMEXEC_TASK_SIZE); ++ address_m = address + SEGMEXEC_TASK_SIZE; ++ ptep_m = huge_pte_offset(mm, address_m & HPAGE_MASK); ++ get_page(page_m); ++ set_huge_pte_at(mm, address_m, ptep_m, make_huge_pte(vma_m, page_m, 0)); ++} ++#endif ++ + static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte, + struct page *pagecache_page) +@@ -2347,6 +2367,11 @@ retry_avoidcopy: + huge_ptep_clear_flush(vma, address, ptep); + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ pax_mirror_huge_pte(vma, address, new_page); ++#endif ++ + /* Make the old page be freed below */ + new_page = old_page; + } +@@ -2476,6 +2501,10 @@ retry: + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + ++#ifdef CONFIG_PAX_SEGMEXEC ++ pax_mirror_huge_pte(vma, address, page); ++#endif ++ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); +@@ -2504,6 +2533,28 @@ int hugetlb_fault(struct mm_struct *mm, + static DEFINE_MUTEX(hugetlb_instantiation_mutex); + struct hstate *h = hstate_vma(vma); + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m; ++ ++ vma_m = pax_find_mirror_vma(vma); ++ if (vma_m) { ++ unsigned long address_m; ++ ++ if (vma->vm_start > vma_m->vm_start) { ++ address_m = address; ++ address -= SEGMEXEC_TASK_SIZE; ++ vma = vma_m; ++ h = hstate_vma(vma); ++ } else ++ address_m = address + SEGMEXEC_TASK_SIZE; ++ ++ if (!huge_pte_alloc(mm, address_m, huge_page_size(h))) ++ return VM_FAULT_OOM; ++ address_m &= HPAGE_MASK; ++ unmap_hugepage_range(vma, address_m, address_m + HPAGE_SIZE, NULL); ++ } ++#endif ++ + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; +diff -urNp linux-2.6.33.1/mm/Kconfig linux-2.6.33.1/mm/Kconfig +--- linux-2.6.33.1/mm/Kconfig 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/Kconfig 2010-03-20 16:58:42.256945498 -0400 +@@ -222,7 +222,7 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU +- default 4096 ++ default 65536 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. 
Keeping a user from writing to low pages +diff -urNp linux-2.6.33.1/mm/maccess.c linux-2.6.33.1/mm/maccess.c +--- linux-2.6.33.1/mm/maccess.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/maccess.c 2010-03-20 16:58:42.256945498 -0400 +@@ -15,10 +15,10 @@ + * happens, handle that and return -EFAULT. + */ + +-long __weak probe_kernel_read(void *dst, void *src, size_t size) ++long __weak probe_kernel_read(void *dst, const void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +-long __probe_kernel_read(void *dst, void *src, size_t size) ++long __probe_kernel_read(void *dst, const void *src, size_t size) + { + long ret; + mm_segment_t old_fs = get_fs(); +@@ -43,10 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ +-long __weak probe_kernel_write(void *dst, void *src, size_t size) ++long __weak probe_kernel_write(void *dst, const void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +-long __probe_kernel_write(void *dst, void *src, size_t size) ++long __probe_kernel_write(void *dst, const void *src, size_t size) + { + long ret; + mm_segment_t old_fs = get_fs(); +diff -urNp linux-2.6.33.1/mm/madvise.c linux-2.6.33.1/mm/madvise.c +--- linux-2.6.33.1/mm/madvise.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/madvise.c 2010-03-20 16:58:42.256945498 -0400 +@@ -45,6 +45,10 @@ static long madvise_behavior(struct vm_a + pgoff_t pgoff; + unsigned long new_flags = vma->vm_flags; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m; ++#endif ++ + switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; +@@ -104,6 +108,13 @@ success: + /* + * vm_flags is protected by the mmap_sem held in write mode. 
+ */ ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ vma_m = pax_find_mirror_vma(vma); ++ if (vma_m) ++ vma_m->vm_flags = new_flags & ~(VM_WRITE | VM_MAYWRITE | VM_ACCOUNT); ++#endif ++ + vma->vm_flags = new_flags; + + out: +@@ -162,6 +173,11 @@ static long madvise_dontneed(struct vm_a + struct vm_area_struct ** prev, + unsigned long start, unsigned long end) + { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m; ++#endif ++ + *prev = vma; + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; +@@ -174,6 +190,21 @@ static long madvise_dontneed(struct vm_a + zap_page_range(vma, start, end - start, &details); + } else + zap_page_range(vma, start, end - start, NULL); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ vma_m = pax_find_mirror_vma(vma); ++ if (vma_m) { ++ if (unlikely(vma->vm_flags & VM_NONLINEAR)) { ++ struct zap_details details = { ++ .nonlinear_vma = vma_m, ++ .last_index = ULONG_MAX, ++ }; ++ zap_page_range(vma, start + SEGMEXEC_TASK_SIZE, end - start, &details); ++ } else ++ zap_page_range(vma, start + SEGMEXEC_TASK_SIZE, end - start, NULL); ++ } ++#endif ++ + return 0; + } + +@@ -366,6 +397,16 @@ SYSCALL_DEFINE3(madvise, unsigned long, + if (end < start) + goto out; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { ++ if (end > SEGMEXEC_TASK_SIZE) ++ goto out; ++ } else ++#endif ++ ++ if (end > TASK_SIZE) ++ goto out; ++ + error = 0; + if (end == start) + goto out; +diff -urNp linux-2.6.33.1/mm/memory.c linux-2.6.33.1/mm/memory.c +--- linux-2.6.33.1/mm/memory.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/memory.c 2010-03-20 16:58:42.260726041 -0400 +@@ -48,6 +48,7 @@ + #include <linux/ksm.h> + #include <linux/rmap.h> + #include <linux/module.h> ++#include <linux/security.h> + #include <linux/delayacct.h> + #include <linux/init.h> + #include <linux/writeback.h> +@@ -1266,10 +1267,10 @@ int __get_user_pages(struct task_struct + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + i = 0; + +- do { ++ while (nr_pages) { + struct vm_area_struct *vma; + +- vma = find_extend_vma(mm, start); ++ vma = find_vma(mm, start); + if (!vma && in_gate_area(tsk, start)) { + unsigned long pg = start & PAGE_MASK; + struct vm_area_struct *gate_vma = get_gate_vma(tsk); +@@ -1311,7 +1312,7 @@ int __get_user_pages(struct task_struct + continue; + } + +- if (!vma || ++ if (!vma || start < vma->vm_start || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) + return i ? 
: -EFAULT; +@@ -1386,7 +1387,7 @@ int __get_user_pages(struct task_struct + start += PAGE_SIZE; + nr_pages--; + } while (nr_pages && start < vma->vm_end); +- } while (nr_pages); ++ } + return i; + } + +@@ -1982,6 +1983,186 @@ static inline void cow_user_page(struct + copy_user_highpage(dst, src, va, vma); + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++static void pax_unmap_mirror_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ spinlock_t *ptl; ++ pte_t *pte, entry; ++ ++ pte = pte_offset_map_lock(mm, pmd, address, &ptl); ++ entry = *pte; ++ if (!pte_present(entry)) { ++ if (!pte_none(entry)) { ++ BUG_ON(pte_file(entry)); ++ free_swap_and_cache(pte_to_swp_entry(entry)); ++ pte_clear_not_present_full(mm, address, pte, 0); ++ } ++ } else { ++ struct page *page; ++ ++ flush_cache_page(vma, address, pte_pfn(entry)); ++ entry = ptep_clear_flush(vma, address, pte); ++ BUG_ON(pte_dirty(entry)); ++ page = vm_normal_page(vma, address, entry); ++ if (page) { ++ update_hiwater_rss(mm); ++ if (PageAnon(page)) ++ dec_mm_counter(mm, anon_rss); ++ else ++ dec_mm_counter(mm, file_rss); ++ page_remove_rmap(page); ++ page_cache_release(page); ++ } ++ } ++ pte_unmap_unlock(pte, ptl); ++} ++ ++/* PaX: if vma is mirrored, synchronize the mirror's PTE ++ * ++ * the ptl of the lower mapped page is held on entry and is not released on exit ++ * or inside to ensure atomic changes to the PTE states (swapout, mremap, munmap, etc) ++ */ ++static void pax_mirror_anon_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m, spinlock_t *ptl) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long address_m; ++ spinlock_t *ptl_m; ++ struct vm_area_struct *vma_m; ++ pmd_t *pmd_m; ++ pte_t *pte_m, entry_m; ++ ++ BUG_ON(!page_m || !PageAnon(page_m)); ++ ++ vma_m = pax_find_mirror_vma(vma); ++ if (!vma_m) ++ return; ++ ++ BUG_ON(!PageLocked(page_m)); ++ BUG_ON(address >= SEGMEXEC_TASK_SIZE); ++ address_m = address + SEGMEXEC_TASK_SIZE; ++ pmd_m = pmd_offset(pud_offset(pgd_offset(mm, address_m), address_m), address_m); ++ pte_m = pte_offset_map_nested(pmd_m, address_m); ++ ptl_m = pte_lockptr(mm, pmd_m); ++ if (ptl != ptl_m) { ++ spin_lock_nested(ptl_m, SINGLE_DEPTH_NESTING); ++ if (!pte_none(*pte_m)) ++ goto out; ++ } ++ ++ entry_m = pfn_pte(page_to_pfn(page_m), vma_m->vm_page_prot); ++ page_cache_get(page_m); ++ page_add_anon_rmap(page_m, vma_m, address_m); ++ inc_mm_counter(mm, anon_rss); ++ set_pte_at(mm, address_m, pte_m, entry_m); ++ update_mmu_cache(vma_m, address_m, entry_m); ++out: ++ if (ptl != ptl_m) ++ spin_unlock(ptl_m); ++ pte_unmap_nested(pte_m); ++ unlock_page(page_m); ++} ++ ++void pax_mirror_file_pte(struct vm_area_struct *vma, unsigned long address, struct page *page_m, spinlock_t *ptl) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long address_m; ++ spinlock_t *ptl_m; ++ struct vm_area_struct *vma_m; ++ pmd_t *pmd_m; ++ pte_t *pte_m, entry_m; ++ ++ BUG_ON(!page_m || PageAnon(page_m)); ++ ++ vma_m = pax_find_mirror_vma(vma); ++ if (!vma_m) ++ return; ++ ++ BUG_ON(address >= SEGMEXEC_TASK_SIZE); ++ address_m = address + SEGMEXEC_TASK_SIZE; ++ pmd_m = pmd_offset(pud_offset(pgd_offset(mm, address_m), address_m), address_m); ++ pte_m = pte_offset_map_nested(pmd_m, address_m); ++ ptl_m = pte_lockptr(mm, pmd_m); ++ if (ptl != ptl_m) { ++ spin_lock_nested(ptl_m, SINGLE_DEPTH_NESTING); ++ if (!pte_none(*pte_m)) ++ goto out; ++ } ++ ++ entry_m = pfn_pte(page_to_pfn(page_m), vma_m->vm_page_prot); ++ page_cache_get(page_m); ++ 
page_add_file_rmap(page_m); ++ inc_mm_counter(mm, file_rss); ++ set_pte_at(mm, address_m, pte_m, entry_m); ++ update_mmu_cache(vma_m, address_m, entry_m); ++out: ++ if (ptl != ptl_m) ++ spin_unlock(ptl_m); ++ pte_unmap_nested(pte_m); ++} ++ ++static void pax_mirror_pfn_pte(struct vm_area_struct *vma, unsigned long address, unsigned long pfn_m, spinlock_t *ptl) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long address_m; ++ spinlock_t *ptl_m; ++ struct vm_area_struct *vma_m; ++ pmd_t *pmd_m; ++ pte_t *pte_m, entry_m; ++ ++ vma_m = pax_find_mirror_vma(vma); ++ if (!vma_m) ++ return; ++ ++ BUG_ON(address >= SEGMEXEC_TASK_SIZE); ++ address_m = address + SEGMEXEC_TASK_SIZE; ++ pmd_m = pmd_offset(pud_offset(pgd_offset(mm, address_m), address_m), address_m); ++ pte_m = pte_offset_map_nested(pmd_m, address_m); ++ ptl_m = pte_lockptr(mm, pmd_m); ++ if (ptl != ptl_m) { ++ spin_lock_nested(ptl_m, SINGLE_DEPTH_NESTING); ++ if (!pte_none(*pte_m)) ++ goto out; ++ } ++ ++ entry_m = pfn_pte(pfn_m, vma_m->vm_page_prot); ++ set_pte_at(mm, address_m, pte_m, entry_m); ++out: ++ if (ptl != ptl_m) ++ spin_unlock(ptl_m); ++ pte_unmap_nested(pte_m); ++} ++ ++static void pax_mirror_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, spinlock_t *ptl) ++{ ++ struct page *page_m; ++ pte_t entry; ++ ++ if (!(vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC)) ++ goto out; ++ ++ entry = *pte; ++ page_m = vm_normal_page(vma, address, entry); ++ if (!page_m) ++ pax_mirror_pfn_pte(vma, address, pte_pfn(entry), ptl); ++ else if (PageAnon(page_m)) { ++ if (pax_find_mirror_vma(vma)) { ++ pte_unmap_unlock(pte, ptl); ++ lock_page(page_m); ++ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); ++ if (pte_same(entry, *pte)) ++ pax_mirror_anon_pte(vma, address, page_m, ptl); ++ else ++ unlock_page(page_m); ++ } ++ } else ++ pax_mirror_file_pte(vma, address, page_m, ptl); ++ ++out: ++ pte_unmap_unlock(pte, ptl); ++} ++#endif ++ + /* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address +@@ -2161,6 +2342,12 @@ gotten: + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (pax_find_mirror_vma(vma)) ++ BUG_ON(!trylock_page(new_page)); ++#endif ++ + if (old_page) { + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); +@@ -2212,6 +2399,10 @@ gotten: + page_remove_rmap(old_page); + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++ pax_mirror_anon_pte(vma, address, new_page, ptl); ++#endif ++ + /* Free the old page.. 
*/ + new_page = old_page; + ret |= VM_FAULT_WRITE; +@@ -2619,6 +2810,11 @@ static int do_swap_page(struct mm_struct + swap_free(entry); + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + try_to_free_swap(page); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((flags & FAULT_FLAG_WRITE) || !pax_find_mirror_vma(vma)) ++#endif ++ + unlock_page(page); + + if (flags & FAULT_FLAG_WRITE) { +@@ -2630,6 +2826,11 @@ static int do_swap_page(struct mm_struct + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ pax_mirror_anon_pte(vma, address, page, ptl); ++#endif ++ + unlock: + pte_unmap_unlock(page_table, ptl); + out: +@@ -2653,7 +2854,7 @@ static int do_anonymous_page(struct mm_s + unsigned long address, pte_t *page_table, pmd_t *pmd, + unsigned int flags) + { +- struct page *page; ++ struct page *page = NULL; + spinlock_t *ptl; + pte_t entry; + +@@ -2688,6 +2889,11 @@ static int do_anonymous_page(struct mm_s + if (!pte_none(*page_table)) + goto release; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (pax_find_mirror_vma(vma)) ++ BUG_ON(!trylock_page(page)); ++#endif ++ + inc_mm_counter(mm, anon_rss); + page_add_new_anon_rmap(page, vma, address); + setpte: +@@ -2695,6 +2901,12 @@ setpte: + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, entry); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (page) ++ pax_mirror_anon_pte(vma, address, page, ptl); ++#endif ++ + unlock: + pte_unmap_unlock(page_table, ptl); + return 0; +@@ -2837,6 +3049,12 @@ static int __do_fault(struct mm_struct * + */ + /* Only go through if we didn't race with anybody else... */ + if (likely(pte_same(*page_table, orig_pte))) { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (anon && pax_find_mirror_vma(vma)) ++ BUG_ON(!trylock_page(page)); ++#endif ++ + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) +@@ -2856,6 +3074,14 @@ static int __do_fault(struct mm_struct * + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (anon) ++ pax_mirror_anon_pte(vma, address, page, ptl); ++ else ++ pax_mirror_file_pte(vma, address, page, ptl); ++#endif ++ + } else { + if (charged) + mem_cgroup_uncharge_page(page); +@@ -3003,6 +3229,12 @@ static inline int handle_pte_fault(struc + if (flags & FAULT_FLAG_WRITE) + flush_tlb_page(vma, address); + } ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ pax_mirror_pte(vma, address, pte, pmd, ptl); ++ return 0; ++#endif ++ + unlock: + pte_unmap_unlock(pte, ptl); + return 0; +@@ -3019,6 +3251,10 @@ int handle_mm_fault(struct mm_struct *mm + pmd_t *pmd; + pte_t *pte; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m; ++#endif ++ + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); +@@ -3026,6 +3262,34 @@ int handle_mm_fault(struct mm_struct *mm + if (unlikely(is_vm_hugetlb_page(vma))) + return hugetlb_fault(mm, vma, address, flags); + ++#ifdef CONFIG_PAX_SEGMEXEC ++ vma_m = pax_find_mirror_vma(vma); ++ if (vma_m) { ++ unsigned long address_m; ++ pgd_t *pgd_m; ++ pud_t *pud_m; ++ pmd_t *pmd_m; ++ ++ if (vma->vm_start > vma_m->vm_start) { ++ address_m = address; ++ address -= SEGMEXEC_TASK_SIZE; ++ vma = vma_m; ++ } else ++ address_m = address + SEGMEXEC_TASK_SIZE; ++ ++ pgd_m = pgd_offset(mm, address_m); ++ pud_m = pud_alloc(mm, pgd_m, address_m); ++ if (!pud_m) ++ return VM_FAULT_OOM; ++ pmd_m = pmd_alloc(mm, pud_m, address_m); ++ if (!pmd_m) 
++ return VM_FAULT_OOM; ++ if (!pmd_present(*pmd_m) && __pte_alloc(mm, pmd_m, address_m)) ++ return VM_FAULT_OOM; ++ pax_unmap_mirror_pte(vma_m, address_m, pmd_m); ++ } ++#endif ++ + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (!pud) +@@ -3123,7 +3387,7 @@ static int __init gate_vma_init(void) + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; +- gate_vma.vm_page_prot = __P101; ++ gate_vma.vm_page_prot = vm_get_page_prot(gate_vma.vm_flags); + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later +diff -urNp linux-2.6.33.1/mm/memory-failure.c linux-2.6.33.1/mm/memory-failure.c +--- linux-2.6.33.1/mm/memory-failure.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/memory-failure.c 2010-03-20 16:58:42.260726041 -0400 +@@ -50,7 +50,7 @@ int sysctl_memory_failure_early_kill __r + + int sysctl_memory_failure_recovery __read_mostly = 1; + +-atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); ++atomic_long_unchecked_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); + + #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + +@@ -935,7 +935,7 @@ int __memory_failure(unsigned long pfn, + return 0; + } + +- atomic_long_add(1, &mce_bad_pages); ++ atomic_long_add_unchecked(1, &mce_bad_pages); + + /* + * We need/can do nothing about count=0 pages. +diff -urNp linux-2.6.33.1/mm/mempolicy.c linux-2.6.33.1/mm/mempolicy.c +--- linux-2.6.33.1/mm/mempolicy.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/mempolicy.c 2010-03-20 16:58:42.260726041 -0400 +@@ -569,6 +569,10 @@ static int mbind_range(struct vm_area_st + struct vm_area_struct *next; + int err; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m; ++#endif ++ + err = 0; + for (; vma && vma->vm_start < end; vma = next) { + next = vma->vm_next; +@@ -580,6 +584,16 @@ static int mbind_range(struct vm_area_st + err = policy_vma(vma, new); + if (err) + break; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ vma_m = pax_find_mirror_vma(vma); ++ if (vma_m) { ++ err = policy_vma(vma_m, new); ++ if (err) ++ break; ++ } ++#endif ++ + } + return err; + } +@@ -1000,6 +1014,17 @@ static long do_mbind(unsigned long start + + if (end < start) + return -EINVAL; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (mm->pax_flags & MF_PAX_SEGMEXEC) { ++ if (end > SEGMEXEC_TASK_SIZE) ++ return -EINVAL; ++ } else ++#endif ++ ++ if (end > TASK_SIZE) ++ return -EINVAL; ++ + if (end == start) + return 0; + +@@ -1205,6 +1230,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi + if (!mm) + return -EINVAL; + ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ if (mm != current->mm && ++ (mm->pax_flags & MF_PAX_RANDMMAP || mm->pax_flags & MF_PAX_SEGMEXEC)) { ++ err = -EPERM; ++ goto out; ++ } ++#endif ++ + /* + * Check if this process has the right to modify the specified + * process. 
The right exists if the process has administrative +@@ -1214,8 +1247,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi + rcu_read_lock(); + tcred = __task_cred(task); + if (cred->euid != tcred->suid && cred->euid != tcred->uid && +- cred->uid != tcred->suid && cred->uid != tcred->uid && +- !capable(CAP_SYS_NICE)) { ++ cred->uid != tcred->suid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + err = -EPERM; + goto out; +@@ -2431,7 +2463,7 @@ int show_numa_map(struct seq_file *m, vo + + if (file) { + seq_printf(m, " file="); +- seq_path(m, &file->f_path, "\n\t= "); ++ seq_path(m, &file->f_path, "\n\t\\= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && +diff -urNp linux-2.6.33.1/mm/migrate.c linux-2.6.33.1/mm/migrate.c +--- linux-2.6.33.1/mm/migrate.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/migrate.c 2010-03-20 16:58:42.260726041 -0400 +@@ -1059,6 +1059,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, + if (!mm) + return -EINVAL; + ++#ifdef CONFIG_GRKERNSEC_PROC_MEMMAP ++ if (mm != current->mm && ++ (mm->pax_flags & MF_PAX_RANDMMAP || mm->pax_flags & MF_PAX_SEGMEXEC)) { ++ err = -EPERM; ++ goto out; ++ } ++#endif ++ + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative +@@ -1068,8 +1076,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, + rcu_read_lock(); + tcred = __task_cred(task); + if (cred->euid != tcred->suid && cred->euid != tcred->uid && +- cred->uid != tcred->suid && cred->uid != tcred->uid && +- !capable(CAP_SYS_NICE)) { ++ cred->uid != tcred->suid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + err = -EPERM; + goto out; +diff -urNp linux-2.6.33.1/mm/mlock.c linux-2.6.33.1/mm/mlock.c +--- linux-2.6.33.1/mm/mlock.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/mlock.c 2010-03-20 16:58:42.264564412 -0400 +@@ -13,6 +13,7 @@ + #include <linux/pagemap.h> + #include <linux/mempolicy.h> + #include <linux/syscalls.h> ++#include <linux/security.h> + #include <linux/sched.h> + #include <linux/module.h> + #include <linux/rmap.h> +@@ -432,6 +433,17 @@ static int do_mlock(unsigned long start, + return -EINVAL; + if (end == start) + return 0; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { ++ if (end > SEGMEXEC_TASK_SIZE) ++ return -EINVAL; ++ } else ++#endif ++ ++ if (end > TASK_SIZE) ++ return -EINVAL; ++ + vma = find_vma_prev(current->mm, start, &prev); + if (!vma || vma->vm_start > start) + return -ENOMEM; +@@ -491,6 +503,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, st + lock_limit >>= PAGE_SHIFT; + + /* check against resource limits */ ++ gr_learn_resource(current, RLIMIT_MEMLOCK, (current->mm->locked_vm << PAGE_SHIFT) + len, 1); + if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) + error = do_mlock(start, len, 1); + up_write(&current->mm->mmap_sem); +@@ -512,10 +525,10 @@ SYSCALL_DEFINE2(munlock, unsigned long, + static int do_mlockall(int flags) + { + struct vm_area_struct * vma, * prev = NULL; +- unsigned int def_flags = 0; ++ unsigned int def_flags = current->mm->def_flags & ~VM_LOCKED; + + if (flags & MCL_FUTURE) +- def_flags = VM_LOCKED; ++ def_flags |= VM_LOCKED; + current->mm->def_flags = def_flags; + if (flags == MCL_FUTURE) + goto out; +@@ -523,6 +536,12 @@ static int do_mlockall(int flags) + for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + unsigned int newflags; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((current->mm->pax_flags & 
MF_PAX_SEGMEXEC) && (vma->vm_start >= SEGMEXEC_TASK_SIZE)) ++ break; ++#endif ++ ++ BUG_ON(vma->vm_end > TASK_SIZE); + newflags = vma->vm_flags | VM_LOCKED; + if (!(flags & MCL_CURRENT)) + newflags &= ~VM_LOCKED; +@@ -554,6 +573,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) + lock_limit >>= PAGE_SHIFT; + + ret = -ENOMEM; ++ gr_learn_resource(current, RLIMIT_MEMLOCK, current->mm->total_vm, 1); + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || + capable(CAP_IPC_LOCK)) + ret = do_mlockall(flags); +diff -urNp linux-2.6.33.1/mm/mmap.c linux-2.6.33.1/mm/mmap.c +--- linux-2.6.33.1/mm/mmap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/mmap.c 2010-03-20 16:58:42.264564412 -0400 +@@ -44,6 +44,16 @@ + #define arch_rebalance_pgtables(addr, len) (addr) + #endif + ++static inline void verify_mm_writelocked(struct mm_struct *mm) ++{ ++#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAX) ++ if (unlikely(down_read_trylock(&mm->mmap_sem))) { ++ up_read(&mm->mmap_sem); ++ BUG(); ++ } ++#endif ++} ++ + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); +@@ -69,16 +79,25 @@ static void unmap_region(struct mm_struc + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +-pgprot_t protection_map[16] = { ++pgprot_t protection_map[16] __read_only = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 + }; + + pgprot_t vm_get_page_prot(unsigned long vm_flags) + { +- return __pgprot(pgprot_val(protection_map[vm_flags & ++ pgprot_t prot = __pgprot(pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | + pgprot_val(arch_vm_get_page_prot(vm_flags))); ++ ++#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_X86_32) ++ if (!(__supported_pte_mask & _PAGE_NX) && ++ (vm_flags & (VM_PAGEEXEC | VM_EXEC)) == VM_PAGEEXEC && ++ (vm_flags & (VM_READ | VM_WRITE))) ++ prot = __pgprot(pte_val(pte_exprotect(__pte(pgprot_val(prot))))); ++#endif ++ ++ return prot; + } + EXPORT_SYMBOL(vm_get_page_prot); + +@@ -230,6 +249,7 @@ static struct vm_area_struct *remove_vma + struct vm_area_struct *next = vma->vm_next; + + might_sleep(); ++ BUG_ON(vma->vm_mirror); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { +@@ -266,6 +286,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) + * not page aligned -Ram Gupta + */ + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; ++ gr_learn_resource(current, RLIMIT_DATA, (brk - mm->start_brk) + (mm->end_data - mm->start_data), 1); + if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; +@@ -693,6 +714,12 @@ static int + can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_start == SEGMEXEC_TASK_SIZE) ++ return 0; ++#endif ++ + if (is_mergeable_vma(vma, file, vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + if (vma->vm_pgoff == vm_pgoff) +@@ -712,6 +739,12 @@ static int + can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_end == SEGMEXEC_TASK_SIZE) ++ return 0; ++#endif ++ + if (is_mergeable_vma(vma, file, 
vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + pgoff_t vm_pglen; +@@ -754,12 +787,19 @@ can_vma_merge_after(struct vm_area_struc + struct vm_area_struct *vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, +- struct anon_vma *anon_vma, struct file *file, ++ struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy) + { + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + struct vm_area_struct *area, *next; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ unsigned long addr_m = addr + SEGMEXEC_TASK_SIZE, end_m = end + SEGMEXEC_TASK_SIZE; ++ struct vm_area_struct *area_m = NULL, *next_m = NULL, *prev_m = NULL; ++ ++ BUG_ON((mm->pax_flags & MF_PAX_SEGMEXEC) && SEGMEXEC_TASK_SIZE < end); ++#endif ++ + /* + * We later require that vma->vm_flags == vm_flags, + * so this tests vma->vm_flags & VM_SPECIAL, too. +@@ -775,6 +815,15 @@ struct vm_area_struct *vma_merge(struct + if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = next->vm_next; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (prev) ++ prev_m = pax_find_mirror_vma(prev); ++ if (area) ++ area_m = pax_find_mirror_vma(area); ++ if (next) ++ next_m = pax_find_mirror_vma(next); ++#endif ++ + /* + * Can it merge with the predecessor? + */ +@@ -794,9 +843,24 @@ struct vm_area_struct *vma_merge(struct + /* cases 1, 6 */ + vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL); +- } else /* cases 2, 5, 7 */ ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (prev_m) ++ vma_adjust(prev_m, prev_m->vm_start, ++ next_m->vm_end, prev_m->vm_pgoff, NULL); ++#endif ++ ++ } else { /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (prev_m) ++ vma_adjust(prev_m, prev_m->vm_start, ++ end_m, prev_m->vm_pgoff, NULL); ++#endif ++ ++ } + return prev; + } + +@@ -807,12 +871,27 @@ struct vm_area_struct *vma_merge(struct + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen)) { +- if (prev && addr < prev->vm_end) /* case 4 */ ++ if (prev && addr < prev->vm_end) { /* case 4 */ + vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL); +- else /* cases 3, 8 */ ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (prev_m) ++ vma_adjust(prev_m, prev_m->vm_start, ++ addr_m, prev_m->vm_pgoff, NULL); ++#endif ++ ++ } else { /* cases 3, 8 */ + vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (area_m) ++ vma_adjust(area_m, addr_m, next_m->vm_end, ++ next_m->vm_pgoff - pglen, NULL); ++#endif ++ ++ } + return area; + } + +@@ -887,14 +966,11 @@ none: + void vm_stat_account(struct mm_struct *mm, unsigned long flags, + struct file *file, long pages) + { +- const unsigned long stack_flags +- = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); +- + if (file) { + mm->shared_vm += pages; + if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) + mm->exec_vm += pages; +- } else if (flags & stack_flags) ++ } else if (flags & (VM_GROWSUP|VM_GROWSDOWN)) + mm->stack_vm += pages; + if (flags & (VM_RESERVED|VM_IO)) + mm->reserved_vm += pages; +@@ -921,7 +997,7 @@ unsigned long do_mmap_pgoff(struct file + * (the exception is when the underlying filesystem is noexec + * mounted, in which case we dont add PROT_EXEC.) 
+ */ +- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) ++ if ((prot & (PROT_READ | PROT_WRITE)) && (current->personality & READ_IMPLIES_EXEC)) + if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) + prot |= PROT_EXEC; + +@@ -947,7 +1023,7 @@ unsigned long do_mmap_pgoff(struct file + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ +- addr = get_unmapped_area(file, addr, len, pgoff, flags); ++ addr = get_unmapped_area(file, addr, len, pgoff, flags | ((prot & PROT_EXEC) ? MAP_EXECUTABLE : 0)); + if (addr & ~PAGE_MASK) + return addr; + +@@ -958,6 +1034,26 @@ unsigned long do_mmap_pgoff(struct file + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + ++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) ++ if (mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (mm->pax_flags & MF_PAX_MPROTECT) { ++ if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC) ++ vm_flags &= ~(VM_EXEC | VM_MAYEXEC); ++ else ++ vm_flags &= ~(VM_WRITE | VM_MAYWRITE); ++ } ++#endif ++ ++ } ++#endif ++ ++#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_X86_32) ++ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && file) ++ vm_flags &= ~VM_PAGEEXEC; ++#endif ++ + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; +@@ -969,6 +1065,7 @@ unsigned long do_mmap_pgoff(struct file + locked += mm->locked_vm; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; ++ gr_learn_resource(current, RLIMIT_MEMLOCK, locked << PAGE_SHIFT, 1); + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } +@@ -1039,6 +1136,9 @@ unsigned long do_mmap_pgoff(struct file + if (error) + return error; + ++ if (!gr_acl_handle_mmap(file, prot)) ++ return -EACCES; ++ + return mmap_region(file, addr, len, flags, vm_flags, pgoff); + } + EXPORT_SYMBOL(do_mmap_pgoff); +@@ -1091,10 +1191,10 @@ out: + */ + int vma_wants_writenotify(struct vm_area_struct *vma) + { +- unsigned int vm_flags = vma->vm_flags; ++ unsigned long vm_flags = vma->vm_flags; + + /* If it was private or non-writable, the write bit is already clear */ +- if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) ++ if ((vm_flags & (VM_WRITE|VM_SHARED)) != (VM_WRITE|VM_SHARED)) + return 0; + + /* The backer wishes to know when pages are first written to? */ +@@ -1143,14 +1243,24 @@ unsigned long mmap_region(struct file *f + unsigned long charged = 0; + struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m = NULL; ++#endif ++ ++ /* ++ * mm->mmap_sem is required to protect against another thread ++ * changing the mappings in case we sleep. ++ */ ++ verify_mm_writelocked(mm); ++ + /* Clear old maps */ + error = -ENOMEM; +-munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; +- goto munmap_back; ++ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); ++ BUG_ON(vma && vma->vm_start < addr + len); + } + + /* Check against address space limit. 
*/ +@@ -1199,6 +1309,16 @@ munmap_back: + goto unacct_error; + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (vm_flags & VM_EXEC)) { ++ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ if (!vma_m) { ++ error = -ENOMEM; ++ goto free_vma; ++ } ++ } ++#endif ++ + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; +@@ -1221,6 +1341,19 @@ munmap_back: + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m && (vm_flags & VM_EXECUTABLE)) ++ added_exe_file_vma(mm); ++#endif ++ ++#if defined(CONFIG_PAX_PAGEEXEC) && defined(CONFIG_X86_32) ++ if ((mm->pax_flags & MF_PAX_PAGEEXEC) && !(vma->vm_flags & VM_SPECIAL)) { ++ vma->vm_flags |= VM_PAGEEXEC; ++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); ++ } ++#endif ++ + if (vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + +@@ -1256,6 +1389,11 @@ munmap_back: + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m) ++ pax_mirror_vma(vma_m, vma); ++#endif ++ + /* Once vma denies write, undo our temporary denial count */ + if (correct_wcount) + atomic_inc(&inode->i_writecount); +@@ -1264,6 +1402,7 @@ out: + + mm->total_vm += len >> PAGE_SHIFT; + vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); ++ track_exec_limit(mm, addr, addr + len, vm_flags); + if (vm_flags & VM_LOCKED) { + /* + * makes pages present; downgrades, drops, reacquires mmap_sem +@@ -1286,6 +1425,12 @@ unmap_and_free_vma: + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + charged = 0; + free_vma: ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m) ++ kmem_cache_free(vm_area_cachep, vma_m); ++#endif ++ + kmem_cache_free(vm_area_cachep, vma); + unacct_error: + if (charged) +@@ -1319,6 +1464,10 @@ arch_get_unmapped_area(struct file *filp + if (flags & MAP_FIXED) + return addr; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); +@@ -1327,10 +1476,10 @@ arch_get_unmapped_area(struct file *filp + return addr; + } + if (len > mm->cached_hole_size) { +- start_addr = addr = mm->free_area_cache; ++ start_addr = addr = mm->free_area_cache; + } else { +- start_addr = addr = TASK_UNMAPPED_BASE; +- mm->cached_hole_size = 0; ++ start_addr = addr = mm->mmap_base; ++ mm->cached_hole_size = 0; + } + + full_search: +@@ -1341,9 +1490,8 @@ full_search: + * Start a new search - just in case we missed + * some holes. + */ +- if (start_addr != TASK_UNMAPPED_BASE) { +- addr = TASK_UNMAPPED_BASE; +- start_addr = addr; ++ if (start_addr != mm->mmap_base) { ++ start_addr = addr = mm->mmap_base; + mm->cached_hole_size = 0; + goto full_search; + } +@@ -1365,10 +1513,16 @@ full_search: + + void arch_unmap_area(struct mm_struct *mm, unsigned long addr) + { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && SEGMEXEC_TASK_SIZE <= addr) ++ return; ++#endif ++ + /* + * Is this a new hole at the lowest possible address? 
+ */ +- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { ++ if (addr >= mm->mmap_base && addr < mm->free_area_cache) { + mm->free_area_cache = addr; + mm->cached_hole_size = ~0UL; + } +@@ -1386,7 +1540,7 @@ arch_get_unmapped_area_topdown(struct fi + { + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; +- unsigned long addr = addr0; ++ unsigned long base = mm->mmap_base, addr = addr0; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) +@@ -1395,6 +1549,10 @@ arch_get_unmapped_area_topdown(struct fi + if (flags & MAP_FIXED) + return addr; + ++#ifdef CONFIG_PAX_RANDMMAP ++ if (!(mm->pax_flags & MF_PAX_RANDMMAP)) ++#endif ++ + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); +@@ -1452,13 +1610,21 @@ bottomup: + * can happen with large stack limits and large mmap() + * allocations. + */ ++ mm->mmap_base = TASK_UNMAPPED_BASE; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ ++ mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; +- mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ +- mm->free_area_cache = mm->mmap_base; ++ mm->mmap_base = base; ++ mm->free_area_cache = base; + mm->cached_hole_size = ~0UL; + + return addr; +@@ -1467,6 +1633,12 @@ bottomup: + + void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) + { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && SEGMEXEC_TASK_SIZE <= addr) ++ return; ++#endif ++ + /* + * Is this a new hole at the highest possible address? + */ +@@ -1474,8 +1646,10 @@ void arch_unmap_area_topdown(struct mm_s + mm->free_area_cache = addr; + + /* dont allow allocations above current base */ +- if (mm->free_area_cache > mm->mmap_base) ++ if (mm->free_area_cache > mm->mmap_base) { + mm->free_area_cache = mm->mmap_base; ++ mm->cached_hole_size = ~0UL; ++ } + } + + unsigned long +@@ -1583,6 +1757,27 @@ out: + return prev ? prev->vm_next : vma; + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++struct vm_area_struct *pax_find_mirror_vma(struct vm_area_struct *vma) ++{ ++ struct vm_area_struct *vma_m; ++ ++ BUG_ON(!vma || vma->vm_start >= vma->vm_end); ++ if (!(vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) || !(vma->vm_flags & VM_EXEC)) { ++ BUG_ON(vma->vm_mirror); ++ return NULL; ++ } ++ BUG_ON(vma->vm_start < SEGMEXEC_TASK_SIZE && SEGMEXEC_TASK_SIZE < vma->vm_end); ++ vma_m = vma->vm_mirror; ++ BUG_ON(!vma_m || vma_m->vm_mirror != vma); ++ BUG_ON(vma->vm_file != vma_m->vm_file); ++ BUG_ON(vma->vm_end - vma->vm_start != vma_m->vm_end - vma_m->vm_start); ++ BUG_ON(vma->vm_pgoff != vma_m->vm_pgoff || vma->anon_vma != vma_m->anon_vma); ++ BUG_ON((vma->vm_flags ^ vma_m->vm_flags) & ~(VM_WRITE | VM_MAYWRITE | VM_ACCOUNT | VM_LOCKED)); ++ return vma_m; ++} ++#endif ++ + /* + * Verify that the stack growth is acceptable and + * update accounting. 
This is shared with both the +@@ -1599,6 +1794,7 @@ static int acct_stack_growth(struct vm_a + return -ENOMEM; + + /* Stack limit test */ ++ gr_learn_resource(current, RLIMIT_STACK, size, 1); + if (size > rlim[RLIMIT_STACK].rlim_cur) + return -ENOMEM; + +@@ -1608,6 +1804,7 @@ static int acct_stack_growth(struct vm_a + unsigned long limit; + locked = mm->locked_vm + grow; + limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; ++ gr_learn_resource(current, RLIMIT_MEMLOCK, locked << PAGE_SHIFT, 1); + if (locked > limit && !capable(CAP_IPC_LOCK)) + return -ENOMEM; + } +@@ -1643,35 +1840,40 @@ static + #endif + int expand_upwards(struct vm_area_struct *vma, unsigned long address) + { +- int error; ++ int error, locknext; + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + ++ /* Also guard against wrapping around to address 0. */ ++ if (address < PAGE_ALIGN(address+1)) ++ address = PAGE_ALIGN(address+1); ++ else ++ return -ENOMEM; ++ + /* + * We must make sure the anon_vma is allocated + * so that the anon_vma locking is not a noop. + */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; ++ locknext = vma->vm_next && (vma->vm_next->vm_flags & VM_GROWSDOWN); ++ if (locknext && unlikely(anon_vma_prepare(vma->vm_next))) ++ return -ENOMEM; + anon_vma_lock(vma); ++ if (locknext) ++ anon_vma_lock(vma->vm_next); + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the +- * anon_vma lock to serialize against concurrent expand_stacks. +- * Also guard against wrapping around to address 0. ++ * anon_vma locks to serialize against concurrent expand_stacks ++ * and expand_upwards. + */ +- if (address < PAGE_ALIGN(address+4)) +- address = PAGE_ALIGN(address+4); +- else { +- anon_vma_unlock(vma); +- return -ENOMEM; +- } + error = 0; + + /* Somebody else might have raced and expanded it already */ +- if (address > vma->vm_end) { ++ if (address > vma->vm_end && (!locknext || vma->vm_next->vm_start >= address)) { + unsigned long size, grow; + + size = address - vma->vm_start; +@@ -1681,6 +1883,8 @@ int expand_upwards(struct vm_area_struct + if (!error) + vma->vm_end = address; + } ++ if (locknext) ++ anon_vma_unlock(vma->vm_next); + anon_vma_unlock(vma); + return error; + } +@@ -1692,7 +1896,8 @@ int expand_upwards(struct vm_area_struct + static int expand_downwards(struct vm_area_struct *vma, + unsigned long address) + { +- int error; ++ int error, lockprev = 0; ++ struct vm_area_struct *prev = NULL; + + /* + * We must make sure the anon_vma is allocated +@@ -1706,6 +1911,15 @@ static int expand_downwards(struct vm_ar + if (error) + return error; + ++#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) ++ find_vma_prev(vma->vm_mm, address, &prev); ++ lockprev = prev && (prev->vm_flags & VM_GROWSUP); ++#endif ++ if (lockprev && unlikely(anon_vma_prepare(prev))) ++ return -ENOMEM; ++ if (lockprev) ++ anon_vma_lock(prev); ++ + anon_vma_lock(vma); + + /* +@@ -1715,9 +1929,15 @@ static int expand_downwards(struct vm_ar + */ + + /* Somebody else might have raced and expanded it already */ +- if (address < vma->vm_start) { ++ if (address < vma->vm_start && (!lockprev || prev->vm_end <= address)) { + unsigned long size, grow; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m; ++ ++ vma_m = pax_find_mirror_vma(vma); ++#endif ++ + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + +@@ -1725,9 +1945,20 @@ static int expand_downwards(struct vm_ar + if (!error) { + vma->vm_start = 
address; + vma->vm_pgoff -= grow; ++ track_exec_limit(vma->vm_mm, vma->vm_start, vma->vm_end, vma->vm_flags); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m) { ++ vma_m->vm_start -= grow << PAGE_SHIFT; ++ vma_m->vm_pgoff -= grow; ++ } ++#endif ++ + } + } + anon_vma_unlock(vma); ++ if (lockprev) ++ anon_vma_unlock(prev); + return error; + } + +@@ -1803,6 +2034,13 @@ static void remove_vma_list(struct mm_st + do { + long nrpages = vma_pages(vma); + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (vma->vm_start >= SEGMEXEC_TASK_SIZE)) { ++ vma = remove_vma(vma); ++ continue; ++ } ++#endif ++ + mm->total_vm -= nrpages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vma = remove_vma(vma); +@@ -1847,6 +2085,16 @@ detach_vmas_to_be_unmapped(struct mm_str + + insertion_point = (prev ? &prev->vm_next : &mm->mmap); + do { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma->vm_mirror) { ++ BUG_ON(!vma->vm_mirror->vm_mirror || vma->vm_mirror->vm_mirror != vma); ++ vma->vm_mirror->vm_mirror = NULL; ++ vma->vm_mirror->vm_flags &= ~VM_EXEC; ++ vma->vm_mirror = NULL; ++ } ++#endif ++ + rb_erase(&vma->vm_rb, &mm->mm_rb); + mm->map_count--; + tail_vma = vma; +@@ -1872,14 +2120,33 @@ static int __split_vma(struct mm_struct + struct mempolicy *pol; + struct vm_area_struct *new; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m, *new_m = NULL; ++ unsigned long addr_m = addr + SEGMEXEC_TASK_SIZE; ++#endif ++ + if (is_vm_hugetlb_page(vma) && (addr & + ~(huge_page_mask(hstate_vma(vma))))) + return -EINVAL; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ vma_m = pax_find_mirror_vma(vma); ++#endif ++ + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) + return -ENOMEM; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m) { ++ new_m = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); ++ if (!new_m) { ++ kmem_cache_free(vm_area_cachep, new); ++ return -ENOMEM; ++ } ++ } ++#endif ++ + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + +@@ -1890,8 +2157,29 @@ static int __split_vma(struct mm_struct + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m) { ++ *new_m = *vma_m; ++ new_m->vm_mirror = new; ++ new->vm_mirror = new_m; ++ ++ if (new_below) ++ new_m->vm_end = addr_m; ++ else { ++ new_m->vm_start = addr_m; ++ new_m->vm_pgoff += ((addr_m - vma_m->vm_start) >> PAGE_SHIFT); ++ } ++ } ++#endif ++ + pol = mpol_dup(vma_policy(vma)); + if (IS_ERR(pol)) { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (new_m) ++ kmem_cache_free(vm_area_cachep, new_m); ++#endif ++ + kmem_cache_free(vm_area_cachep, new); + return PTR_ERR(pol); + } +@@ -1912,6 +2200,28 @@ static int __split_vma(struct mm_struct + else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m) { ++ mpol_get(pol); ++ vma_set_policy(new_m, pol); ++ ++ if (new_m->vm_file) { ++ get_file(new_m->vm_file); ++ if (vma_m->vm_flags & VM_EXECUTABLE) ++ added_exe_file_vma(mm); ++ } ++ ++ if (new_m->vm_ops && new_m->vm_ops->open) ++ new_m->vm_ops->open(new_m); ++ ++ if (new_below) ++ vma_adjust(vma_m, addr_m, vma_m->vm_end, vma_m->vm_pgoff + ++ ((addr_m - new_m->vm_start) >> PAGE_SHIFT), new_m); ++ else ++ vma_adjust(vma_m, vma_m->vm_start, addr_m, vma_m->vm_pgoff, new_m); ++ } ++#endif ++ + return 0; + } + +@@ -1922,6 +2232,15 @@ static int __split_vma(struct mm_struct + int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) + { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (mm->pax_flags 
& MF_PAX_SEGMEXEC) { ++ BUG_ON(vma->vm_end > SEGMEXEC_TASK_SIZE); ++ if (mm->map_count >= sysctl_max_map_count-1) ++ return -ENOMEM; ++ } else ++#endif ++ + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + +@@ -1933,11 +2252,30 @@ int split_vma(struct mm_struct *mm, stru + * work. This now handles partial unmappings. + * Jeremy Fitzhardinge jeremy@goop.org + */ ++#ifdef CONFIG_PAX_SEGMEXEC + int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) + { ++ int ret = __do_munmap(mm, start, len); ++ if (ret || !(mm->pax_flags & MF_PAX_SEGMEXEC)) ++ return ret; ++ ++ return __do_munmap(mm, start + SEGMEXEC_TASK_SIZE, len); ++} ++ ++int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len) ++#else ++int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) ++#endif ++{ + unsigned long end; + struct vm_area_struct *vma, *prev, *last; + ++ /* ++ * mm->mmap_sem is required to protect against another thread ++ * changing the mappings in case we sleep. ++ */ ++ verify_mm_writelocked(mm); ++ + if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + +@@ -2011,6 +2349,8 @@ int do_munmap(struct mm_struct *mm, unsi + /* Fix up all other VM information */ + remove_vma_list(mm, vma); + ++ track_exec_limit(mm, start, end, 0UL); ++ + return 0; + } + +@@ -2023,22 +2363,18 @@ SYSCALL_DEFINE2(munmap, unsigned long, a + + profile_munmap(addr); + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && ++ (len > SEGMEXEC_TASK_SIZE || addr > SEGMEXEC_TASK_SIZE-len)) ++ return -EINVAL; ++#endif ++ + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; + } + +-static inline void verify_mm_writelocked(struct mm_struct *mm) +-{ +-#ifdef CONFIG_DEBUG_VM +- if (unlikely(down_read_trylock(&mm->mmap_sem))) { +- WARN_ON(1); +- up_read(&mm->mmap_sem); +- } +-#endif +-} +- + /* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some +@@ -2052,6 +2388,11 @@ unsigned long do_brk(unsigned long addr, + struct rb_node ** rb_link, * rb_parent; + pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; ++ unsigned long charged; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m = NULL; ++#endif + + len = PAGE_ALIGN(len); + if (!len) +@@ -2063,16 +2404,30 @@ unsigned long do_brk(unsigned long addr, + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + ++#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) ++ if (mm->pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) { ++ flags &= ~VM_EXEC; ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (mm->pax_flags & MF_PAX_MPROTECT) ++ flags &= ~VM_MAYEXEC; ++#endif ++ ++ } ++#endif ++ + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (error & ~PAGE_MASK) + return error; + ++ charged = len >> PAGE_SHIFT; ++ + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked, lock_limit; +- locked = len >> PAGE_SHIFT; ++ locked = charged; + locked += mm->locked_vm; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; +@@ -2089,22 +2444,22 @@ unsigned long do_brk(unsigned long addr, + /* + * Clear old maps. 
this also does some error checking for us + */ +- munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; +- goto munmap_back; ++ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); ++ BUG_ON(vma && vma->vm_start < addr + len); + } + + /* Check against address space limits *after* clearing old maps... */ +- if (!may_expand_vm(mm, len >> PAGE_SHIFT)) ++ if (!may_expand_vm(mm, charged)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + +- if (security_vm_enough_memory(len >> PAGE_SHIFT)) ++ if (security_vm_enough_memory(charged)) + return -ENOMEM; + + /* Can we just expand an old private anonymous mapping? */ +@@ -2118,10 +2473,21 @@ unsigned long do_brk(unsigned long addr, + */ + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) { +- vm_unacct_memory(len >> PAGE_SHIFT); ++ vm_unacct_memory(charged); + return -ENOMEM; + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (flags & VM_EXEC)) { ++ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ if (!vma_m) { ++ kmem_cache_free(vm_area_cachep, vma); ++ vm_unacct_memory(charged); ++ return -ENOMEM; ++ } ++ } ++#endif ++ + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; +@@ -2130,11 +2496,12 @@ unsigned long do_brk(unsigned long addr, + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); + out: +- mm->total_vm += len >> PAGE_SHIFT; ++ mm->total_vm += charged; + if (flags & VM_LOCKED) { + if (!mlock_vma_pages_range(vma, addr, addr + len)) +- mm->locked_vm += (len >> PAGE_SHIFT); ++ mm->locked_vm += charged; + } ++ track_exec_limit(mm, addr, addr + len, flags); + return addr; + } + +@@ -2181,8 +2548,10 @@ void exit_mmap(struct mm_struct *mm) + * Walk the list again, actually closing and freeing it, + * with preemption enabled, without holding any MM locks. + */ +- while (vma) ++ while (vma) { ++ vma->vm_mirror = NULL; + vma = remove_vma(vma); ++ } + + BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + } +@@ -2196,6 +2565,10 @@ int insert_vm_struct(struct mm_struct * + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m = NULL; ++#endif ++ + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index +@@ -2218,7 +2591,22 @@ int insert_vm_struct(struct mm_struct * + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, vma_pages(vma))) + return -ENOMEM; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (vma->vm_flags & VM_EXEC)) { ++ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ if (!vma_m) ++ return -ENOMEM; ++ } ++#endif ++ + vma_link(mm, vma, prev, rb_link, rb_parent); ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (vma_m) ++ pax_mirror_vma(vma_m, vma); ++#endif ++ + return 0; + } + +@@ -2236,6 +2624,8 @@ struct vm_area_struct *copy_vma(struct v + struct rb_node **rb_link, *rb_parent; + struct mempolicy *pol; + ++ BUG_ON(vma->vm_mirror); ++ + /* + * If anonymous vma has not yet been faulted, update new pgoff + * to match new location, to increase its chance of merging. 
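The vm_mirror bookkeeping in the mm/mmap.c hunks above implements SEGMEXEC's core idea: every executable mapping in the lower half of the address space gets a non-writable twin at the same offset plus SEGMEXEC_TASK_SIZE, and the two must be created, split, unmapped and torn down in lockstep. A minimal userspace sketch of just the pairing step (all types, constants and names here are illustrative, not the kernel's; the real code also drops VM_MAYWRITE, VM_ACCOUNT and VM_LOCKED from the mirror, as pax_mirror_vma() below shows):

    /* Toy model of SEGMEXEC vma mirroring. Illustrative only. */
    #include <stdio.h>
    #include <stdlib.h>

    #define SEGMEXEC_TASK_SIZE 0x60000000UL /* made-up split point */
    #define VM_EXEC  0x1UL
    #define VM_WRITE 0x2UL

    struct vma {
        unsigned long start, end, flags;
        struct vma *mirror;              /* models vm_mirror */
    };

    /* Mirror an executable mapping into the upper segment, read-only. */
    static struct vma *mirror_vma(struct vma *v)
    {
        struct vma *m = malloc(sizeof(*m));
        if (!m)
            return NULL;
        *m = *v;
        m->start += SEGMEXEC_TASK_SIZE;
        m->end   += SEGMEXEC_TASK_SIZE;
        m->flags &= ~VM_WRITE;           /* mirror is never writable */
        m->mirror = v;
        v->mirror = m;
        return m;
    }

    int main(void)
    {
        struct vma code = { 0x08048000UL, 0x08050000UL, VM_EXEC, NULL };
        struct vma *m = mirror_vma(&code);
        if (!m)
            return 1;
        printf("orig   %#lx-%#lx flags %#lx\n", code.start, code.end, code.flags);
        printf("mirror %#lx-%#lx flags %#lx\n", m->start, m->end, m->flags);
        free(m);
        return 0;
    }

The surrounding hunks are the price of that invariant: split_vma() must split both halves, do_munmap() must unmap both, and exit_mmap() clears vm_mirror before freeing.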
+@@ -2279,6 +2669,35 @@ struct vm_area_struct *copy_vma(struct v + return new_vma; + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++void pax_mirror_vma(struct vm_area_struct *vma_m, struct vm_area_struct *vma) ++{ ++ struct vm_area_struct *prev_m; ++ struct rb_node **rb_link_m, *rb_parent_m; ++ struct mempolicy *pol_m; ++ ++ BUG_ON(!(vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) || !(vma->vm_flags & VM_EXEC)); ++ BUG_ON(vma->vm_mirror || vma_m->vm_mirror); ++ BUG_ON(!mpol_equal(vma_policy(vma), vma_policy(vma_m))); ++ *vma_m = *vma; ++ pol_m = vma_policy(vma_m); ++ mpol_get(pol_m); ++ vma_set_policy(vma_m, pol_m); ++ vma_m->vm_start += SEGMEXEC_TASK_SIZE; ++ vma_m->vm_end += SEGMEXEC_TASK_SIZE; ++ vma_m->vm_flags &= ~(VM_WRITE | VM_MAYWRITE | VM_ACCOUNT | VM_LOCKED); ++ vma_m->vm_page_prot = vm_get_page_prot(vma_m->vm_flags); ++ if (vma_m->vm_file) ++ get_file(vma_m->vm_file); ++ if (vma_m->vm_ops && vma_m->vm_ops->open) ++ vma_m->vm_ops->open(vma_m); ++ find_vma_prepare(vma->vm_mm, vma_m->vm_start, &prev_m, &rb_link_m, &rb_parent_m); ++ vma_link(vma->vm_mm, vma_m, prev_m, rb_link_m, rb_parent_m); ++ vma_m->vm_mirror = vma; ++ vma->vm_mirror = vma_m; ++} ++#endif ++ + /* + * Return true if the calling process may expand its vm space by the passed + * number of pages +@@ -2289,7 +2708,7 @@ int may_expand_vm(struct mm_struct *mm, + unsigned long lim; + + lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; +- ++ gr_learn_resource(current, RLIMIT_AS, (cur + npages) << PAGE_SHIFT, 1); + if (cur + npages > lim) + return 0; + return 1; +@@ -2358,6 +2777,15 @@ int install_special_mapping(struct mm_st + vma->vm_start = addr; + vma->vm_end = addr + len; + ++#ifdef CONFIG_PAX_MPROTECT ++ if (mm->pax_flags & MF_PAX_MPROTECT) { ++ if ((vm_flags & (VM_WRITE | VM_EXEC)) != VM_EXEC) ++ vm_flags &= ~(VM_EXEC | VM_MAYEXEC); ++ else ++ vm_flags &= ~(VM_WRITE | VM_MAYWRITE); ++ } ++#endif ++ + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + +diff -urNp linux-2.6.33.1/mm/mprotect.c linux-2.6.33.1/mm/mprotect.c +--- linux-2.6.33.1/mm/mprotect.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/mprotect.c 2010-03-20 16:58:42.264564412 -0400 +@@ -24,10 +24,16 @@ + #include <linux/mmu_notifier.h> + #include <linux/migrate.h> + #include <linux/perf_event.h> ++ ++#ifdef CONFIG_PAX_MPROTECT ++#include <linux/elf.h> ++#endif ++ + #include <asm/uaccess.h> + #include <asm/pgtable.h> + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> ++#include <asm/mmu_context.h> + + #ifndef pgprot_modify + static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +@@ -132,6 +138,48 @@ static void change_protection(struct vm_ + flush_tlb_range(vma, start, end); + } + ++#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT ++/* called while holding the mmap semaphor for writing except stack expansion */ ++void track_exec_limit(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long prot) ++{ ++ unsigned long oldlimit, newlimit = 0UL; ++ ++ if (!(mm->pax_flags & MF_PAX_PAGEEXEC) || (__supported_pte_mask & _PAGE_NX)) ++ return; ++ ++ spin_lock(&mm->page_table_lock); ++ oldlimit = mm->context.user_cs_limit; ++ if ((prot & VM_EXEC) && oldlimit < end) ++ /* USER_CS limit moved up */ ++ newlimit = end; ++ else if (!(prot & VM_EXEC) && start < oldlimit && oldlimit <= end) ++ /* USER_CS limit moved down */ ++ newlimit = start; ++ ++ if (newlimit) { ++ mm->context.user_cs_limit = newlimit; ++ ++#ifdef CONFIG_SMP ++ wmb(); ++ 
cpus_clear(mm->context.cpu_user_cs_mask); ++ cpu_set(smp_processor_id(), mm->context.cpu_user_cs_mask); ++#endif ++ ++ set_user_cs(mm->context.user_cs_base, mm->context.user_cs_limit, smp_processor_id()); ++ } ++ spin_unlock(&mm->page_table_lock); ++ if (newlimit == end) { ++ struct vm_area_struct *vma = find_vma(mm, oldlimit); ++ ++ for (; vma && vma->vm_start < end; vma = vma->vm_next) ++ if (is_vm_hugetlb_page(vma)) ++ hugetlb_change_protection(vma, vma->vm_start, vma->vm_end, vma->vm_page_prot); ++ else ++ change_protection(vma, vma->vm_start, vma->vm_end, vma->vm_page_prot, vma_wants_writenotify(vma)); ++ } ++} ++#endif ++ + int + mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) +@@ -144,6 +192,14 @@ mprotect_fixup(struct vm_area_struct *vm + int error; + int dirty_accountable = 0; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m = NULL; ++ unsigned long start_m, end_m; ++ ++ start_m = start + SEGMEXEC_TASK_SIZE; ++ end_m = end + SEGMEXEC_TASK_SIZE; ++#endif ++ + if (newflags == oldflags) { + *pprev = vma; + return 0; +@@ -165,6 +221,38 @@ mprotect_fixup(struct vm_area_struct *vm + } + } + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if ((mm->pax_flags & MF_PAX_SEGMEXEC) && ((oldflags ^ newflags) & VM_EXEC)) { ++ if (start != vma->vm_start) { ++ error = split_vma(mm, vma, start, 1); ++ if (error) ++ goto fail; ++ BUG_ON(!*pprev || (*pprev)->vm_next == vma); ++ *pprev = (*pprev)->vm_next; ++ } ++ ++ if (end != vma->vm_end) { ++ error = split_vma(mm, vma, end, 0); ++ if (error) ++ goto fail; ++ } ++ ++ if (pax_find_mirror_vma(vma)) { ++ error = __do_munmap(mm, start_m, end_m - start_m); ++ if (error) ++ goto fail; ++ } else { ++ vma_m = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ if (!vma_m) { ++ error = -ENOMEM; ++ goto fail; ++ } ++ vma->vm_flags = newflags; ++ pax_mirror_vma(vma_m, vma); ++ } ++ } ++#endif ++ + /* + * First try to merge with previous and/or next vma. + */ +@@ -196,8 +284,14 @@ success: + * held in write mode. 
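track_exec_limit() above emulates an NX bit on CPUs whose page tables lack one (no _PAGE_NX support) by tracking the highest executable user address and programming it as the USER_CS segment limit. A standalone model of just the limit-update rule, assuming a single tracked boundary per address space:

    /* Userspace model of the exec-limit update above: the limit rises when
     * an executable range ends above it, and falls when exec rights are
     * removed across it. Illustrative only. */
    #include <stdio.h>

    static unsigned long user_cs_limit;

    static void track_exec_limit(unsigned long start, unsigned long end, int exec)
    {
        unsigned long newlimit = 0;

        if (exec && user_cs_limit < end)
            newlimit = end;              /* USER_CS limit moved up */
        else if (!exec && start < user_cs_limit && user_cs_limit <= end)
            newlimit = start;            /* USER_CS limit moved down */

        if (newlimit)
            user_cs_limit = newlimit;
    }

    int main(void)
    {
        track_exec_limit(0x1000, 0x5000, 1);
        printf("after exec map:   %#lx\n", user_cs_limit); /* 0x5000 */
        track_exec_limit(0x4000, 0x6000, 0);
        printf("after exec strip: %#lx\n", user_cs_limit); /* 0x4000 */
        return 0;
    }

Because a segment limit can express only one boundary, lowering it is not enough on its own: the loop at the end of the kernel function re-applies page protections to any ranges left above the old limit, which is why the function takes the page_table_lock and broadcasts the new limit to other CPUs.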
+ */ + vma->vm_flags = newflags; ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (mm->binfmt && mm->binfmt->handle_mprotect) ++ mm->binfmt->handle_mprotect(vma, newflags); ++#endif ++ + vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, +- vm_get_page_prot(newflags)); ++ vm_get_page_prot(vma->vm_flags)); + + if (vma_wants_writenotify(vma)) { + vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED); +@@ -238,6 +332,17 @@ SYSCALL_DEFINE3(mprotect, unsigned long, + end = start + len; + if (end <= start) + return -ENOMEM; ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (current->mm->pax_flags & MF_PAX_SEGMEXEC) { ++ if (end > SEGMEXEC_TASK_SIZE) ++ return -EINVAL; ++ } else ++#endif ++ ++ if (end > TASK_SIZE) ++ return -EINVAL; ++ + if (!arch_validate_prot(prot)) + return -EINVAL; + +@@ -245,7 +350,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, + /* + * Does the application expect PROT_READ to imply PROT_EXEC: + */ +- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) ++ if ((prot & (PROT_READ | PROT_WRITE)) && (current->personality & READ_IMPLIES_EXEC)) + prot |= PROT_EXEC; + + vm_flags = calc_vm_prot_bits(prot); +@@ -277,6 +382,16 @@ SYSCALL_DEFINE3(mprotect, unsigned long, + if (start > vma->vm_start) + prev = vma; + ++ if (!gr_acl_handle_mprotect(vma->vm_file, prot)) { ++ error = -EACCES; ++ goto out; ++ } ++ ++#ifdef CONFIG_PAX_MPROTECT ++ if (current->mm->binfmt && current->mm->binfmt->handle_mprotect) ++ current->mm->binfmt->handle_mprotect(vma, vm_flags); ++#endif ++ + for (nstart = start ; ; ) { + unsigned long newflags; + +@@ -301,6 +416,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, + if (error) + goto out; + perf_event_mmap(vma); ++ ++ track_exec_limit(current->mm, nstart, tmp, vm_flags); ++ + nstart = tmp; + + if (nstart < prev->vm_end) +diff -urNp linux-2.6.33.1/mm/mremap.c linux-2.6.33.1/mm/mremap.c +--- linux-2.6.33.1/mm/mremap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/mremap.c 2010-03-20 16:58:42.264564412 -0400 +@@ -114,6 +114,12 @@ static void move_ptes(struct vm_area_str + continue; + pte = ptep_clear_flush(vma, old_addr, old_pte); + pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); ++ ++#ifdef CONFIG_ARCH_TRACK_EXEC_LIMIT ++ if (!(__supported_pte_mask & _PAGE_NX) && (new_vma->vm_flags & (VM_PAGEEXEC | VM_EXEC)) == VM_PAGEEXEC) ++ pte = pte_exprotect(pte); ++#endif ++ + set_pte_at(mm, new_addr, new_pte, pte); + } + +@@ -273,6 +279,11 @@ static struct vm_area_struct *vma_to_res + if (is_vm_hugetlb_page(vma)) + goto Einval; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (pax_find_mirror_vma(vma)) ++ goto Einval; ++#endif ++ + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto Efault; +@@ -322,20 +333,23 @@ static unsigned long mremap_to(unsigned + unsigned long ret = -EINVAL; + unsigned long charged = 0; + unsigned long map_flags; ++ unsigned long pax_task_size = TASK_SIZE; + + if (new_addr & ~PAGE_MASK) + goto out; + +- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (mm->pax_flags & MF_PAX_SEGMEXEC) ++ pax_task_size = SEGMEXEC_TASK_SIZE; ++#endif ++ ++ if (new_len > TASK_SIZE || new_addr > pax_task_size - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. 
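One behavioural change in the sys_mprotect hunk above is easy to miss: with the READ_IMPLIES_EXEC personality, the stock kernel upgrades only readable requests to executable, while the patched test also upgrades writable ones. A self-contained illustration (constants come from <sys/mman.h> and <sys/personality.h>; the kernel applies the same idea to VM_* flags):

    /* Sketch of the patched READ_IMPLIES_EXEC test above. */
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/personality.h>    /* READ_IMPLIES_EXEC */

    static int effective_prot(int prot, unsigned long persona)
    {
        /* patched form: PROT_WRITE now also triggers the implied exec */
        if ((prot & (PROT_READ | PROT_WRITE)) && (persona & READ_IMPLIES_EXEC))
            prot |= PROT_EXEC;
        return prot;
    }

    int main(void)
    {
        printf("read-only  -> %#x\n", effective_prot(PROT_READ,  READ_IMPLIES_EXEC));
        printf("write-only -> %#x\n", effective_prot(PROT_WRITE, READ_IMPLIES_EXEC));
        printf("PROT_NONE  -> %#x\n", effective_prot(PROT_NONE,  READ_IMPLIES_EXEC));
        return 0;
    }

Under that legacy personality a write-only region would otherwise end up less permissive than the application expects; PROT_NONE requests are still left untouched.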
+ */ +- if ((new_addr <= addr) && (new_addr+new_len) > addr) +- goto out; +- +- if ((addr <= new_addr) && (addr+old_len) > new_addr) ++ if (addr + old_len > new_addr && new_addr + new_len > addr) + goto out; + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); +@@ -407,6 +421,7 @@ unsigned long do_mremap(unsigned long ad + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; ++ unsigned long pax_task_size = TASK_SIZE; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; +@@ -425,6 +440,15 @@ unsigned long do_mremap(unsigned long ad + if (!new_len) + goto out; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ if (mm->pax_flags & MF_PAX_SEGMEXEC) ++ pax_task_size = SEGMEXEC_TASK_SIZE; ++#endif ++ ++ if (new_len > pax_task_size || addr > pax_task_size-new_len || ++ old_len > pax_task_size || addr > pax_task_size-old_len) ++ goto out; ++ + if (flags & MREMAP_FIXED) { + if (flags & MREMAP_MAYMOVE) + ret = mremap_to(addr, old_len, new_addr, new_len); +@@ -471,6 +495,7 @@ unsigned long do_mremap(unsigned long ad + addr + new_len); + } + ret = addr; ++ track_exec_limit(vma->vm_mm, vma->vm_start, addr + new_len, vma->vm_flags); + goto out; + } + } +@@ -497,7 +522,13 @@ unsigned long do_mremap(unsigned long ad + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; ++ ++ map_flags = vma->vm_flags; + ret = move_vma(vma, addr, old_len, new_len, new_addr); ++ if (!(ret & ~PAGE_MASK)) { ++ track_exec_limit(current->mm, addr, addr + old_len, 0UL); ++ track_exec_limit(current->mm, new_addr, new_addr + new_len, map_flags); ++ } + } + out: + if (ret & ~PAGE_MASK) +diff -urNp linux-2.6.33.1/mm/nommu.c linux-2.6.33.1/mm/nommu.c +--- linux-2.6.33.1/mm/nommu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/nommu.c 2010-03-20 16:58:42.268525965 -0400 +@@ -759,15 +759,6 @@ struct vm_area_struct *find_vma(struct m + EXPORT_SYMBOL(find_vma); + + /* +- * find a VMA +- * - we don't extend stack VMAs under NOMMU conditions +- */ +-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +-{ +- return find_vma(mm, addr); +-} +- +-/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +diff -urNp linux-2.6.33.1/mm/page_alloc.c linux-2.6.33.1/mm/page_alloc.c +--- linux-2.6.33.1/mm/page_alloc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/page_alloc.c 2010-03-20 16:58:42.272540377 -0400 +@@ -583,6 +583,10 @@ static void __free_pages_ok(struct page + int bad = 0; + int wasMlocked = __TestClearPageMlocked(page); + ++#ifdef CONFIG_PAX_MEMORY_SANITIZE ++ unsigned long index = 1UL << order; ++#endif ++ + kmemcheck_free_shadow(page, order); + + for (i = 0 ; i < (1 << order) ; ++i) +@@ -595,6 +599,12 @@ static void __free_pages_ok(struct page + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } ++ ++#ifdef CONFIG_PAX_MEMORY_SANITIZE ++ for (; index; --index) ++ sanitize_highpage(page + index - 1); ++#endif ++ + arch_free_page(page, order); + kernel_map_pages(page, 1 << order, 0); + +@@ -698,8 +708,10 @@ static int prep_new_page(struct page *pa + arch_alloc_page(page, order); + kernel_map_pages(page, 1 << order, 1); + ++#ifndef CONFIG_PAX_MEMORY_SANITIZE + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); ++#endif + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); +@@ -1093,6 +1105,11 @@ static void free_hot_cold_page(struct pa + debug_check_no_locks_freed(page_address(page), PAGE_SIZE); + 
debug_check_no_obj_freed(page_address(page), PAGE_SIZE); + } ++ ++#ifdef CONFIG_PAX_MEMORY_SANITIZE ++ sanitize_highpage(page); ++#endif ++ + arch_free_page(page, 0); + kernel_map_pages(page, 1, 0); + +diff -urNp linux-2.6.33.1/mm/percpu.c linux-2.6.33.1/mm/percpu.c +--- linux-2.6.33.1/mm/percpu.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/percpu.c 2010-03-20 16:58:42.272540377 -0400 +@@ -114,7 +114,7 @@ static unsigned int pcpu_first_unit_cpu + static unsigned int pcpu_last_unit_cpu __read_mostly; + + /* the address of the first chunk which starts with the kernel static area */ +-void *pcpu_base_addr __read_mostly; ++void *pcpu_base_addr __read_only; + EXPORT_SYMBOL_GPL(pcpu_base_addr); + + static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +diff -urNp linux-2.6.33.1/mm/rmap.c linux-2.6.33.1/mm/rmap.c +--- linux-2.6.33.1/mm/rmap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/rmap.c 2010-03-20 16:58:42.272540377 -0400 +@@ -109,6 +109,10 @@ int anon_vma_prepare(struct vm_area_stru + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *allocated; + ++#ifdef CONFIG_PAX_SEGMEXEC ++ struct vm_area_struct *vma_m; ++#endif ++ + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { +@@ -122,6 +126,15 @@ int anon_vma_prepare(struct vm_area_stru + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { ++ ++#ifdef CONFIG_PAX_SEGMEXEC ++ vma_m = pax_find_mirror_vma(vma); ++ if (vma_m) { ++ vma_m->anon_vma = anon_vma; ++ __anon_vma_link(vma_m); ++ } ++#endif ++ + vma->anon_vma = anon_vma; + list_add_tail(&vma->anon_vma_node, &anon_vma->head); + allocated = NULL; +diff -urNp linux-2.6.33.1/mm/shmem.c linux-2.6.33.1/mm/shmem.c +--- linux-2.6.33.1/mm/shmem.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/shmem.c 2010-03-20 16:58:42.272540377 -0400 +@@ -30,7 +30,7 @@ + #include <linux/module.h> + #include <linux/swap.h> + +-static struct vfsmount *shm_mnt; ++struct vfsmount *shm_mnt; + + #ifdef CONFIG_SHMEM + /* +diff -urNp linux-2.6.33.1/mm/slab.c linux-2.6.33.1/mm/slab.c +--- linux-2.6.33.1/mm/slab.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/slab.c 2010-03-20 16:58:42.276578221 -0400 +@@ -308,7 +308,7 @@ struct kmem_list3 { + * Need this for bootstrapping a per node allocator. 
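The PAX_MEMORY_SANITIZE hunks above scrub pages at free time, which is why prep_new_page() may skip prep_zero_page(): a page coming off the free lists is already zero. A toy allocator showing the same policy (names are made up; the kernel operates on struct page and highmem mappings, not flat buffers):

    /* Toy model of sanitize-on-free: blocks are wiped when released,
     * so a zeroed allocation costs nothing at alloc time. */
    #include <stdio.h>
    #include <string.h>

    #define BLKSZ 64

    static unsigned char pool[4][BLKSZ];   /* zero at program start */
    static int in_use[4];

    static void *blk_alloc(int want_zero)
    {
        (void)want_zero;    /* nothing to do: free already scrubbed it */
        for (int i = 0; i < 4; i++)
            if (!in_use[i]) {
                in_use[i] = 1;
                return pool[i];
            }
        return NULL;
    }

    static void blk_free(void *p)
    {
        memset(p, 0, BLKSZ);               /* old contents never linger */
        for (int i = 0; i < 4; i++)
            if (p == pool[i])
                in_use[i] = 0;
    }

    int main(void)
    {
        char *p = blk_alloc(0);
        strcpy(p, "secret");
        blk_free(p);
        p = blk_alloc(1);
        printf("first byte after realloc: %d\n", p[0]);   /* 0: scrubbed */
        return 0;
    }

The trade-off is paying for the clear on every free instead of only on __GFP_ZERO allocations, in exchange for freed data never surviving in physical memory.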
+ */ + #define NUM_INIT_LISTS (3 * MAX_NUMNODES) +-struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; ++struct kmem_list3 initkmem_list3[NUM_INIT_LISTS]; + #define CACHE_CACHE 0 + #define SIZE_AC MAX_NUMNODES + #define SIZE_L3 (2 * MAX_NUMNODES) +@@ -558,7 +558,7 @@ static inline void *index_to_obj(struct + * reciprocal_divide(offset, cache->reciprocal_buffer_size) + */ + static inline unsigned int obj_to_index(const struct kmem_cache *cache, +- const struct slab *slab, void *obj) ++ const struct slab *slab, const void *obj) + { + u32 offset = (obj - slab->s_mem); + return reciprocal_divide(offset, cache->reciprocal_buffer_size); +@@ -584,14 +584,14 @@ struct cache_names { + static struct cache_names __initdata cache_names[] = { + #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, + #include <linux/kmalloc_sizes.h> +- {NULL,} ++ {NULL, NULL} + #undef CACHE + }; + + static struct arraycache_init initarray_cache __initdata = +- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; ++ { {0, BOOT_CPUCACHE_ENTRIES, 1, 0}, {NULL} }; + static struct arraycache_init initarray_generic = +- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; ++ { {0, BOOT_CPUCACHE_ENTRIES, 1, 0}, {NULL} }; + + /* internal cache of cache description objs */ + static struct kmem_cache cache_cache = { +@@ -4104,7 +4104,7 @@ out: + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + } + +-#ifdef CONFIG_SLABINFO ++#if defined(CONFIG_SLABINFO) && !defined(CONFIG_GRKERNSEC_PROC_ADD) + + static void print_slabinfo_header(struct seq_file *m) + { +@@ -4502,6 +4502,51 @@ static int __init slab_proc_init(void) + module_init(slab_proc_init); + #endif + ++void check_object_size(const void *ptr, unsigned long n, bool to) ++{ ++ ++#ifdef CONFIG_PAX_USERCOPY ++ struct kmem_cache *cachep; ++ struct slab *slabp; ++ struct page *page; ++ unsigned int objnr; ++ unsigned long offset; ++ ++ if (!n) ++ return; ++ ++ if (ZERO_OR_NULL_PTR(ptr)) ++ goto report; ++ ++ if (!virt_addr_valid(ptr)) ++ return; ++ ++ page = virt_to_head_page(ptr); ++ ++ if (!PageSlab(page)) { ++ if (object_is_on_stack(ptr, n) == -1) ++ goto report; ++ return; ++ } ++ ++ cachep = page_get_cache(page); ++ slabp = page_get_slab(page); ++ objnr = obj_to_index(cachep, slabp, ptr); ++ BUG_ON(objnr >= cachep->num); ++ offset = ptr - index_to_obj(cachep, slabp, objnr) - obj_offset(cachep); ++ if (offset <= obj_size(cachep) && n <= obj_size(cachep) - offset) ++ return; ++ ++report: ++ if (to) ++ pax_report_leak_to_user(ptr, n); ++ else ++ pax_report_overflow_from_user(ptr, n); ++#endif ++ ++} ++EXPORT_SYMBOL(check_object_size); ++ + /** + * ksize - get the actual amount of memory allocated for a given object + * @objp: Pointer to the object +diff -urNp linux-2.6.33.1/mm/slob.c linux-2.6.33.1/mm/slob.c +--- linux-2.6.33.1/mm/slob.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/slob.c 2010-03-20 16:58:42.276578221 -0400 +@@ -29,7 +29,7 @@ + * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls + * alloc_pages() directly, allocating compound pages so the page order + * does not have to be separately tracked, and also stores the exact +- * allocation size in page->private so that it can be used to accurately ++ * allocation size in slob_page->size so that it can be used to accurately + * provide ksize(). These objects are detected in kfree() because slob_page() + * is false for them. 
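The check_object_size() added to mm/slab.c above is the PAX_USERCOPY guard: a copy to or from user space spanning [ptr, ptr+n) is allowed only if it stays inside the slab object containing ptr. A simplified model of the containment test, with the object passed in explicitly rather than located via virt_to_head_page() and slab metadata:

    /* Userspace model of the usercopy bounds check above. */
    #include <stdio.h>
    #include <stdlib.h>

    struct object { unsigned char *base; size_t size; };

    static int copy_allowed(const struct object *obj, const void *ptr, size_t n)
    {
        const unsigned char *p = ptr;
        size_t offset;

        if (p < obj->base || p >= obj->base + obj->size)
            return 0;                        /* pointer not in this object */
        offset = p - obj->base;
        return n <= obj->size - offset;      /* mirrors the kernel's test */
    }

    int main(void)
    {
        struct object obj = { malloc(32), 32 };
        if (!obj.base)
            return 1;
        printf("full object: %d\n", copy_allowed(&obj, obj.base, 32));      /* 1 */
        printf("overflow:    %d\n", copy_allowed(&obj, obj.base + 16, 32)); /* 0 */
        free(obj.base);
        return 0;
    }

A failed check does not silently truncate; the kernel function reports it via pax_report_leak_to_user() or pax_report_overflow_from_user() depending on copy direction, as the report label above shows.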
+ * +@@ -58,6 +58,7 @@ + */ + + #include <linux/kernel.h> ++#include <linux/sched.h> + #include <linux/slab.h> + #include <linux/mm.h> + #include <linux/swap.h> /* struct reclaim_state */ +@@ -100,7 +101,8 @@ struct slob_page { + unsigned long flags; /* mandatory */ + atomic_t _count; /* mandatory */ + slobidx_t units; /* free units left in page */ +- unsigned long pad[2]; ++ unsigned long pad[1]; ++ unsigned long size; /* size when >=PAGE_SIZE */ + slob_t *free; /* first free slob_t in page */ + struct list_head list; /* linked list of free pages */ + }; +@@ -133,7 +135,7 @@ static LIST_HEAD(free_slob_large); + */ + static inline int is_slob_page(struct slob_page *sp) + { +- return PageSlab((struct page *)sp); ++ return PageSlab((struct page *)sp) && !sp->size; + } + + static inline void set_slob_page(struct slob_page *sp) +@@ -148,7 +150,7 @@ static inline void clear_slob_page(struc + + static inline struct slob_page *slob_page(const void *addr) + { +- return (struct slob_page *)virt_to_page(addr); ++ return (struct slob_page *)virt_to_head_page(addr); + } + + /* +@@ -208,7 +210,7 @@ static void set_slob(slob_t *s, slobidx_ + /* + * Return the size of a slob block. + */ +-static slobidx_t slob_units(slob_t *s) ++static slobidx_t slob_units(const slob_t *s) + { + if (s->units > 0) + return s->units; +@@ -218,7 +220,7 @@ static slobidx_t slob_units(slob_t *s) + /* + * Return the next free slob block pointer after this one. + */ +-static slob_t *slob_next(slob_t *s) ++static slob_t *slob_next(const slob_t *s) + { + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t next; +@@ -233,7 +235,7 @@ static slob_t *slob_next(slob_t *s) + /* + * Returns true if s is the last free block in its page. + */ +-static int slob_last(slob_t *s) ++static int slob_last(const slob_t *s) + { + return !((unsigned long)slob_next(s) & ~PAGE_MASK); + } +@@ -252,6 +254,7 @@ static void *slob_new_pages(gfp_t gfp, i + if (!page) + return NULL; + ++ set_slob_page(page); + return page_address(page); + } + +@@ -368,11 +371,11 @@ static void *slob_alloc(size_t size, gfp + if (!b) + return NULL; + sp = slob_page(b); +- set_slob_page(sp); + + spin_lock_irqsave(&slob_lock, flags); + sp->units = SLOB_UNITS(PAGE_SIZE); + sp->free = b; ++ sp->size = 0; + INIT_LIST_HEAD(&sp->list); + set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); + set_slob_page_free(sp, slob_list); +@@ -475,10 +478,9 @@ out: + #define ARCH_SLAB_MINALIGN __alignof__(unsigned long) + #endif + +-void *__kmalloc_node(size_t size, gfp_t gfp, int node) ++static void *__kmalloc_node_align(size_t size, gfp_t gfp, int node, int align) + { +- unsigned int *m; +- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); ++ slob_t *m; + void *ret; + + lockdep_trace_alloc(gfp); +@@ -491,7 +493,10 @@ void *__kmalloc_node(size_t size, gfp_t + + if (!m) + return NULL; +- *m = size; ++ BUILD_BUG_ON(ARCH_KMALLOC_MINALIGN < 2 * SLOB_UNIT); ++ BUILD_BUG_ON(ARCH_SLAB_MINALIGN < 2 * SLOB_UNIT); ++ m[0].units = size; ++ m[1].units = align; + ret = (void *)m + align; + + trace_kmalloc_node(_RET_IP_, ret, +@@ -501,9 +506,9 @@ void *__kmalloc_node(size_t size, gfp_t + + ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); + if (ret) { +- struct page *page; +- page = virt_to_page(ret); +- page->private = size; ++ struct slob_page *sp; ++ sp = slob_page(ret); ++ sp->size = size; + } + + trace_kmalloc_node(_RET_IP_, ret, +@@ -513,6 +518,13 @@ void *__kmalloc_node(size_t size, gfp_t + kmemleak_alloc(ret, size, 1, gfp); + return ret; + } ++ ++void 
*__kmalloc_node(size_t size, gfp_t gfp, int node) ++{ ++ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); ++ ++ return __kmalloc_node_align(size, gfp, node, align); ++} + EXPORT_SYMBOL(__kmalloc_node); + + void kfree(const void *block) +@@ -528,13 +540,84 @@ void kfree(const void *block) + sp = slob_page(block); + if (is_slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); +- unsigned int *m = (unsigned int *)(block - align); +- slob_free(m, *m + align); +- } else ++ slob_t *m = (slob_t *)(block - align); ++ slob_free(m, m[0].units + align); ++ } else { ++ clear_slob_page(sp); ++ free_slob_page(sp); ++ sp->size = 0; + put_page(&sp->page); ++ } + } + EXPORT_SYMBOL(kfree); + ++void check_object_size(const void *ptr, unsigned long n, bool to) ++{ ++ ++#ifdef CONFIG_PAX_USERCOPY ++ struct slob_page *sp; ++ const slob_t *free; ++ const void *base; ++ ++ if (!n) ++ return; ++ ++ if (ZERO_OR_NULL_PTR(ptr)) ++ goto report; ++ ++ if (!virt_addr_valid(ptr)) ++ return; ++ ++ sp = slob_page(ptr); ++ if (!PageSlab((struct page*)sp)) { ++ if (object_is_on_stack(ptr, n) == -1) ++ goto report; ++ return; ++ } ++ ++ if (sp->size) { ++ base = page_address(&sp->page); ++ if (base <= ptr && n <= sp->size - (ptr - base)) ++ return; ++ goto report; ++ } ++ ++ /* some tricky double walking to find the chunk */ ++ base = (void *)((unsigned long)ptr & PAGE_MASK); ++ free = sp->free; ++ ++ while (!slob_last(free) && (void *)free <= ptr) { ++ base = free + slob_units(free); ++ free = slob_next(free); ++ } ++ ++ while (base < (void *)free) { ++ slobidx_t m = ((slob_t *)base)[0].units, align = ((slob_t *)base)[1].units; ++ int size = SLOB_UNIT * SLOB_UNITS(m + align); ++ int offset; ++ ++ if (ptr < base + align) ++ goto report; ++ ++ offset = ptr - base - align; ++ if (offset < m) { ++ if (n <= m - offset) ++ return; ++ goto report; ++ } ++ base += size; ++ } ++ ++report: ++ if (to) ++ pax_report_leak_to_user(ptr, n); ++ else ++ pax_report_overflow_from_user(ptr, n); ++#endif ++ ++} ++EXPORT_SYMBOL(check_object_size); ++ + /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ + size_t ksize(const void *block) + { +@@ -547,10 +630,10 @@ size_t ksize(const void *block) + sp = slob_page(block); + if (is_slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); +- unsigned int *m = (unsigned int *)(block - align); +- return SLOB_UNITS(*m) * SLOB_UNIT; ++ slob_t *m = (slob_t *)(block - align); ++ return SLOB_UNITS(m[0].units) * SLOB_UNIT; + } else +- return sp->page.private; ++ return sp->size; + } + EXPORT_SYMBOL(ksize); + +@@ -605,17 +688,25 @@ void *kmem_cache_alloc_node(struct kmem_ + { + void *b; + ++#ifdef CONFIG_PAX_USERCOPY ++ b = __kmalloc_node_align(c->size, flags, node, c->align); ++#else + if (c->size < PAGE_SIZE) { + b = slob_alloc(c->size, flags, c->align, node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { ++ struct slob_page *sp; ++ + b = slob_new_pages(flags, get_order(c->size), node); ++ sp = slob_page(b); ++ sp->size = c->size; + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } ++#endif + + if (c->ctor) + c->ctor(b); +@@ -627,10 +718,16 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); + + static void __kmem_cache_free(void *b, int size) + { +- if (size < PAGE_SIZE) ++ struct slob_page *sp = slob_page(b); ++ ++ if (is_slob_page(sp)) + slob_free(b, size); +- else ++ else { ++ clear_slob_page(sp); ++ free_slob_page(sp); ++ 
sp->size = 0; + slob_free_pages(b, get_order(size)); ++ } + } + + static void kmem_rcu_free(struct rcu_head *head) +@@ -643,15 +740,24 @@ static void kmem_rcu_free(struct rcu_hea + + void kmem_cache_free(struct kmem_cache *c, void *b) + { ++ int size = c->size; ++ ++#ifdef CONFIG_PAX_USERCOPY ++ if (size + c->align < PAGE_SIZE) { ++ size += c->align; ++ b -= c->align; ++ } ++#endif ++ + kmemleak_free_recursive(b, c->flags); + if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + struct slob_rcu *slob_rcu; +- slob_rcu = b + (c->size - sizeof(struct slob_rcu)); ++ slob_rcu = b + (size - sizeof(struct slob_rcu)); + INIT_RCU_HEAD(&slob_rcu->head); +- slob_rcu->size = c->size; ++ slob_rcu->size = size; + call_rcu(&slob_rcu->head, kmem_rcu_free); + } else { +- __kmem_cache_free(b, c->size); ++ __kmem_cache_free(b, size); + } + + trace_kmem_cache_free(_RET_IP_, b); +diff -urNp linux-2.6.33.1/mm/slub.c linux-2.6.33.1/mm/slub.c +--- linux-2.6.33.1/mm/slub.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/slub.c 2010-03-20 16:58:42.280534479 -0400 +@@ -1893,6 +1893,8 @@ void kmem_cache_free(struct kmem_cache * + + page = virt_to_head_page(x); + ++ BUG_ON(!PageSlab(page)); ++ + slab_free(s, page, x, _RET_IP_); + + trace_kmem_cache_free(_RET_IP_, x); +@@ -1937,7 +1939,7 @@ static int slub_min_objects; + * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) + */ +-static int slub_nomerge; ++static int slub_nomerge = 1; + + /* + * Calculate the order of allocation given an slab object size. +@@ -2493,7 +2495,7 @@ static int kmem_cache_open(struct kmem_c + * list to avoid pounding the page allocator excessively. + */ + set_min_partial(s, ilog2(s->size)); +- s->refcount = 1; ++ atomic_set(&s->refcount, 1); + #ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; + #endif +@@ -2630,8 +2632,7 @@ static inline int kmem_cache_close(struc + void kmem_cache_destroy(struct kmem_cache *s) + { + down_write(&slub_lock); +- s->refcount--; +- if (!s->refcount) { ++ if (atomic_dec_and_test(&s->refcount)) { + list_del(&s->list); + up_write(&slub_lock); + if (kmem_cache_close(s)) { +@@ -2915,6 +2916,46 @@ void *__kmalloc_node(size_t size, gfp_t + EXPORT_SYMBOL(__kmalloc_node); + #endif + ++void check_object_size(const void *ptr, unsigned long n, bool to) ++{ ++ ++#ifdef CONFIG_PAX_USERCOPY ++ struct page *page; ++ struct kmem_cache *s; ++ unsigned long offset; ++ ++ if (!n) ++ return; ++ ++ if (ZERO_OR_NULL_PTR(ptr)) ++ goto report; ++ ++ if (!virt_addr_valid(ptr)) ++ return; ++ ++ page = get_object_page(ptr); ++ ++ if (!page) { ++ if (object_is_on_stack(ptr, n) == -1) ++ goto report; ++ return; ++ } ++ ++ s = page->slab; ++ offset = (ptr - page_address(page)) % s->size; ++ if (offset <= s->objsize && n <= s->objsize - offset) ++ return; ++ ++report: ++ if (to) ++ pax_report_leak_to_user(ptr, n); ++ else ++ pax_report_overflow_from_user(ptr, n); ++#endif ++ ++} ++EXPORT_SYMBOL(check_object_size); ++ + size_t ksize(const void *object) + { + struct page *page; +@@ -3186,7 +3227,7 @@ void __init kmem_cache_init(void) + */ + create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", + sizeof(struct kmem_cache_node), GFP_NOWAIT); +- kmalloc_caches[0].refcount = -1; ++ atomic_set(&kmalloc_caches[0].refcount, -1); + caches++; + + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); +@@ -3293,7 +3334,7 @@ static int slab_unmergeable(struct kmem_ + /* + * We may have set a slab to be unmergeable during bootstrap. 
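The slub conversion above turns s->refcount from a plain int into an atomic_t so that the decrement and the zero test in kmem_cache_destroy() happen as one indivisible step; two concurrent droppers can then never both observe zero, nor both miss it. A minimal C11 model of the same pattern (atomic_fetch_sub stands in for the kernel's atomic_dec_and_test):

    /* Model of the refcount conversion above using C11 atomics. */
    #include <stdatomic.h>
    #include <stdio.h>

    struct cache { atomic_int refcount; };

    /* Returns 1 exactly once: for the caller whose decrement hit zero. */
    static int cache_put(struct cache *c)
    {
        return atomic_fetch_sub(&c->refcount, 1) == 1;
    }

    int main(void)
    {
        struct cache c;
        atomic_init(&c.refcount, 2);
        printf("first put frees:  %d\n", cache_put(&c));  /* 0 */
        printf("second put frees: %d\n", cache_put(&c));  /* 1 */
        return 0;
    }

The same conversion ripples through kmem_cache_create(), sysfs_slab_alias() and aliases_show() above, each swapping a bare ++/-- or read for its atomic counterpart.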
+ */ +- if (s->refcount < 0) ++ if (atomic_read(&s->refcount) < 0) + return 1; + + return 0; +@@ -3353,7 +3394,7 @@ struct kmem_cache *kmem_cache_create(con + if (s) { + int cpu; + +- s->refcount++; ++ atomic_inc(&s->refcount); + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. +@@ -3372,7 +3413,7 @@ struct kmem_cache *kmem_cache_create(con + + if (sysfs_slab_alias(s, name)) { + down_write(&slub_lock); +- s->refcount--; ++ atomic_dec(&s->refcount); + up_write(&slub_lock); + goto err; + } +@@ -4101,7 +4142,7 @@ SLAB_ATTR_RO(ctor); + + static ssize_t aliases_show(struct kmem_cache *s, char *buf) + { +- return sprintf(buf, "%d\n", s->refcount - 1); ++ return sprintf(buf, "%d\n", atomic_read(&s->refcount) - 1); + } + SLAB_ATTR_RO(aliases); + +@@ -4519,7 +4560,7 @@ static void kmem_cache_release(struct ko + kfree(s); + } + +-static struct sysfs_ops slab_sysfs_ops = { ++static const struct sysfs_ops slab_sysfs_ops = { + .show = slab_attr_show, + .store = slab_attr_store, + }; +@@ -4538,7 +4579,7 @@ static int uevent_filter(struct kset *ks + return 0; + } + +-static struct kset_uevent_ops slab_uevent_ops = { ++static const struct kset_uevent_ops slab_uevent_ops = { + .filter = uevent_filter, + }; + +@@ -4712,7 +4753,7 @@ __initcall(slab_sysfs_init); + /* + * The /proc/slabinfo ABI + */ +-#ifdef CONFIG_SLABINFO ++#if defined(CONFIG_SLABINFO) && !defined(CONFIG_GRKERNSEC_PROC_ADD) + static void print_slabinfo_header(struct seq_file *m) + { + seq_puts(m, "slabinfo - version: 2.1\n"); +diff -urNp linux-2.6.33.1/mm/util.c linux-2.6.33.1/mm/util.c +--- linux-2.6.33.1/mm/util.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/util.c 2010-03-20 16:58:42.280534479 -0400 +@@ -224,6 +224,12 @@ EXPORT_SYMBOL(strndup_user); + void arch_pick_mmap_layout(struct mm_struct *mm) + { + mm->mmap_base = TASK_UNMAPPED_BASE; ++ ++#ifdef CONFIG_PAX_RANDMMAP ++ if (mm->pax_flags & MF_PAX_RANDMMAP) ++ mm->mmap_base += mm->delta_mmap; ++#endif ++ + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } +diff -urNp linux-2.6.33.1/mm/vmalloc.c linux-2.6.33.1/mm/vmalloc.c +--- linux-2.6.33.1/mm/vmalloc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/mm/vmalloc.c 2010-03-20 16:58:42.280534479 -0400 +@@ -40,8 +40,19 @@ static void vunmap_pte_range(pmd_t *pmd, + + pte = pte_offset_kernel(pmd, addr); + do { +- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); +- WARN_ON(!pte_none(ptent) && !pte_present(ptent)); ++ ++#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if ((unsigned long)MODULES_EXEC_VADDR <= addr && addr < (unsigned long)MODULES_EXEC_END) { ++ BUG_ON(!pte_exec(*pte)); ++ set_pte_at(&init_mm, addr, pte, pfn_pte(__pa(addr) >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); ++ continue; ++ } ++#endif ++ ++ { ++ pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); ++ WARN_ON(!pte_none(ptent) && !pte_present(ptent)); ++ } + } while (pte++, addr += PAGE_SIZE, addr != end); + } + +@@ -92,6 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, un + unsigned long end, pgprot_t prot, struct page **pages, int *nr) + { + pte_t *pte; ++ int ret = -ENOMEM; + + /* + * nr is a running index into the array which helps higher level +@@ -101,17 +113,30 @@ static int vmap_pte_range(pmd_t *pmd, un + pte = pte_alloc_kernel(pmd, addr); + if (!pte) + return -ENOMEM; ++ ++ pax_open_kernel(); + do { + struct page *page = pages[*nr]; + +- if (WARN_ON(!pte_none(*pte))) +- return -EBUSY; +- if (WARN_ON(!page)) +- return -ENOMEM; ++#if 
defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if (pgprot_val(prot) & _PAGE_NX) ++#endif ++ ++ if (WARN_ON(!pte_none(*pte))) { ++ ret = -EBUSY; ++ goto out; ++ } ++ if (WARN_ON(!page)) { ++ ret = -ENOMEM; ++ goto out; ++ } + set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); + (*nr)++; + } while (pte++, addr += PAGE_SIZE, addr != end); +- return 0; ++ ret = 0; ++out: ++ pax_close_kernel(); ++ return ret; + } + + static int vmap_pmd_range(pud_t *pud, unsigned long addr, +@@ -192,11 +217,20 @@ int is_vmalloc_or_module_addr(const void + * and fall back on vmalloc() if that fails. Others + * just put it in the vmalloc space. + */ +-#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) ++#ifdef CONFIG_MODULES ++#ifdef MODULES_VADDR + unsigned long addr = (unsigned long)x; + if (addr >= MODULES_VADDR && addr < MODULES_END) + return 1; + #endif ++ ++#if defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if (x >= (const void *)MODULES_EXEC_VADDR && x < (const void *)MODULES_EXEC_END) ++ return 1; ++#endif ++ ++#endif ++ + return is_vmalloc_addr(x); + } + +@@ -292,13 +326,13 @@ static void __insert_vmap_area(struct vm + struct rb_node *tmp; + + while (*p) { +- struct vmap_area *tmp; ++ struct vmap_area *varea; + + parent = *p; +- tmp = rb_entry(parent, struct vmap_area, rb_node); +- if (va->va_start < tmp->va_end) ++ varea = rb_entry(parent, struct vmap_area, rb_node); ++ if (va->va_start < varea->va_end) + p = &(*p)->rb_left; +- else if (va->va_end > tmp->va_start) ++ else if (va->va_end > varea->va_start) + p = &(*p)->rb_right; + else + BUG(); +@@ -1224,6 +1258,16 @@ static struct vm_struct *__get_vm_area_n + struct vm_struct *area; + + BUG_ON(in_interrupt()); ++ ++#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if (flags & VM_KERNEXEC) { ++ if (start != VMALLOC_START || end != VMALLOC_END) ++ return NULL; ++ start = (unsigned long)&MODULES_EXEC_VADDR; ++ end = (unsigned long)&MODULES_EXEC_END; ++ } ++#endif ++ + if (flags & VM_IOREMAP) { + int bit = fls(size); + +@@ -1449,6 +1493,11 @@ void *vmap(struct page **pages, unsigned + if (count > totalram_pages) + return NULL; + ++#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if (!(pgprot_val(prot) & _PAGE_NX)) ++ flags |= VM_KERNEXEC; ++#endif ++ + area = get_vm_area_caller((count << PAGE_SHIFT), flags, + __builtin_return_address(0)); + if (!area) +@@ -1558,6 +1607,13 @@ static void *__vmalloc_node(unsigned lon + if (!size || (size >> PAGE_SHIFT) > totalram_pages) + return NULL; + ++#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC) ++ if (!(pgprot_val(prot) & _PAGE_NX)) ++ area = __get_vm_area_node(size, align, VM_ALLOC | VM_KERNEXEC, VMALLOC_START, VMALLOC_END, ++ node, gfp_mask, caller); ++ else ++#endif ++ + area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, + VMALLOC_END, node, gfp_mask, caller); + +@@ -1576,6 +1632,7 @@ static void *__vmalloc_node(unsigned lon + return addr; + } + ++#undef __vmalloc + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) + { + return __vmalloc_node(size, 1, gfp_mask, prot, -1, +@@ -1592,6 +1649,7 @@ EXPORT_SYMBOL(__vmalloc); + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. 
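The repeated #undef lines added before each vmalloc variant above suggest that elsewhere in the patch these names are shadowed by same-named wrapper macros, so each real definition must unshadow its own name first; the wrapping header is outside this excerpt, so the following is only a guess at the mechanism, sketched with invented names:

    /* Sketch of the #undef-before-definition pattern, all names made up. */
    #include <stdio.h>

    void *my_alloc(unsigned long size);                      /* real function  */
    void *my_alloc_traced(unsigned long size, const char *site);

    #define my_alloc(size) my_alloc_traced((size), __func__) /* header wrapper */

    int main(void)
    {
        void *p = my_alloc(64);  /* expands to my_alloc_traced(64, "main") */
        return p ? 0 : 1;
    }

    void *my_alloc_traced(unsigned long size, const char *site)
    {
        printf("alloc %lu from %s\n", size, site);
    #undef my_alloc   /* as in the hunks above: unshadow before using the name */
        return my_alloc(size);               /* now the real function */
    }

    void *my_alloc(unsigned long size)       /* macro gone: defines cleanly */
    {
        static char pool[1024];
        return size <= sizeof(pool) ? (void *)pool : NULL;
    }

Without the #undef, the function-like macro would rewrite the definition's own name and the file would not compile, which is why every variant in the hunk gets its own #undef immediately before its body.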
+ */ ++#undef vmalloc + void *vmalloc(unsigned long size) + { + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, +@@ -1606,6 +1664,7 @@ EXPORT_SYMBOL(vmalloc); + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + */ ++#undef vmalloc_user + void *vmalloc_user(unsigned long size) + { + struct vm_struct *area; +@@ -1633,6 +1692,7 @@ EXPORT_SYMBOL(vmalloc_user); + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ ++#undef vmalloc_node + void *vmalloc_node(unsigned long size, int node) + { + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, +@@ -1655,10 +1715,10 @@ EXPORT_SYMBOL(vmalloc_node); + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +- ++#undef vmalloc_exec + void *vmalloc_exec(unsigned long size) + { +- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, ++ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); + } + +@@ -1677,6 +1737,7 @@ void *vmalloc_exec(unsigned long size) + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. + */ ++#undef vmalloc_32 + void *vmalloc_32(unsigned long size) + { + return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, +@@ -1691,6 +1752,7 @@ EXPORT_SYMBOL(vmalloc_32); + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + */ ++#undef vmalloc_32_user + void *vmalloc_32_user(unsigned long size) + { + struct vm_struct *area; +diff -urNp linux-2.6.33.1/net/atm/atm_misc.c linux-2.6.33.1/net/atm/atm_misc.c +--- linux-2.6.33.1/net/atm/atm_misc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/atm/atm_misc.c 2010-03-20 16:58:42.280534479 -0400 +@@ -19,7 +19,7 @@ int atm_charge(struct atm_vcc *vcc,int t + if (atomic_read(&sk_atm(vcc)->sk_rmem_alloc) <= sk_atm(vcc)->sk_rcvbuf) + return 1; + atm_return(vcc,truesize); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + return 0; + } + +@@ -41,7 +41,7 @@ struct sk_buff *atm_alloc_charge(struct + } + } + atm_return(vcc,guess); +- atomic_inc(&vcc->stats->rx_drop); ++ atomic_inc_unchecked(&vcc->stats->rx_drop); + return NULL; + } + +@@ -88,7 +88,7 @@ int atm_pcr_goal(const struct atm_trafpr + + void sonet_copy_stats(struct k_sonet_stats *from,struct sonet_stats *to) + { +-#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) ++#define __HANDLE_ITEM(i) to->i = atomic_read_unchecked(&from->i) + __SONET_ITEMS + #undef __HANDLE_ITEM + } +@@ -96,7 +96,7 @@ void sonet_copy_stats(struct k_sonet_sta + + void sonet_subtract_stats(struct k_sonet_stats *from,struct sonet_stats *to) + { +-#define __HANDLE_ITEM(i) atomic_sub(to->i,&from->i) ++#define __HANDLE_ITEM(i) atomic_sub_unchecked(to->i,&from->i) + __SONET_ITEMS + #undef __HANDLE_ITEM + } +diff -urNp linux-2.6.33.1/net/atm/proc.c linux-2.6.33.1/net/atm/proc.c +--- linux-2.6.33.1/net/atm/proc.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/atm/proc.c 2010-03-20 16:58:42.280534479 -0400 +@@ -43,9 +43,9 @@ static void add_stats(struct seq_file *s + const struct k_atm_aal_stats *stats) + { + seq_printf(seq, "%s ( %d %d %d %d %d )", aal, +- atomic_read(&stats->tx),atomic_read(&stats->tx_err), +- atomic_read(&stats->rx),atomic_read(&stats->rx_err), +- 
atomic_read(&stats->rx_drop)); ++ atomic_read_unchecked(&stats->tx),atomic_read_unchecked(&stats->tx_err), ++ atomic_read_unchecked(&stats->rx),atomic_read_unchecked(&stats->rx_err), ++ atomic_read_unchecked(&stats->rx_drop)); + } + + static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev) +diff -urNp linux-2.6.33.1/net/atm/resources.c linux-2.6.33.1/net/atm/resources.c +--- linux-2.6.33.1/net/atm/resources.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/atm/resources.c 2010-03-20 16:58:42.280534479 -0400 +@@ -161,7 +161,7 @@ void atm_dev_deregister(struct atm_dev * + static void copy_aal_stats(struct k_atm_aal_stats *from, + struct atm_aal_stats *to) + { +-#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) ++#define __HANDLE_ITEM(i) to->i = atomic_read_unchecked(&from->i) + __AAL_STAT_ITEMS + #undef __HANDLE_ITEM + } +@@ -170,7 +170,7 @@ static void copy_aal_stats(struct k_atm_ + static void subtract_aal_stats(struct k_atm_aal_stats *from, + struct atm_aal_stats *to) + { +-#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i) ++#define __HANDLE_ITEM(i) atomic_sub_unchecked(to->i, &from->i) + __AAL_STAT_ITEMS + #undef __HANDLE_ITEM + } +diff -urNp linux-2.6.33.1/net/bridge/br_private.h linux-2.6.33.1/net/bridge/br_private.h +--- linux-2.6.33.1/net/bridge/br_private.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/bridge/br_private.h 2010-03-20 16:58:42.280534479 -0400 +@@ -254,7 +254,7 @@ extern void br_ifinfo_notify(int event, + + #ifdef CONFIG_SYSFS + /* br_sysfs_if.c */ +-extern struct sysfs_ops brport_sysfs_ops; ++extern const struct sysfs_ops brport_sysfs_ops; + extern int br_sysfs_addif(struct net_bridge_port *p); + + /* br_sysfs_br.c */ +diff -urNp linux-2.6.33.1/net/bridge/br_stp_if.c linux-2.6.33.1/net/bridge/br_stp_if.c +--- linux-2.6.33.1/net/bridge/br_stp_if.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/bridge/br_stp_if.c 2010-03-20 16:58:42.280534479 -0400 +@@ -146,7 +146,7 @@ static void br_stp_stop(struct net_bridg + char *envp[] = { NULL }; + + if (br->stp_enabled == BR_USER_STP) { +- r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); ++ r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + printk(KERN_INFO "%s: userspace STP stopped, return code %d\n", + br->dev->name, r); + +diff -urNp linux-2.6.33.1/net/bridge/br_sysfs_if.c linux-2.6.33.1/net/bridge/br_sysfs_if.c +--- linux-2.6.33.1/net/bridge/br_sysfs_if.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/bridge/br_sysfs_if.c 2010-03-20 16:58:42.280534479 -0400 +@@ -220,7 +220,7 @@ static ssize_t brport_store(struct kobje + return ret; + } + +-struct sysfs_ops brport_sysfs_ops = { ++const struct sysfs_ops brport_sysfs_ops = { + .show = brport_show, + .store = brport_store, + }; +diff -urNp linux-2.6.33.1/net/bridge/netfilter/ebtables.c linux-2.6.33.1/net/bridge/netfilter/ebtables.c +--- linux-2.6.33.1/net/bridge/netfilter/ebtables.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/bridge/netfilter/ebtables.c 2010-03-20 16:58:42.284535154 -0400 +@@ -1456,7 +1456,7 @@ static int do_ebt_get_ctl(struct sock *s + tmp.valid_hooks = t->table->valid_hooks; + } + mutex_unlock(&ebt_mutex); +- if (copy_to_user(user, &tmp, *len) != 0){ ++ if (*len > sizeof(tmp) || copy_to_user(user, &tmp, *len) != 0){ + BUGPRINT("c2u Didn't work\n"); + ret = -EFAULT; + break; +diff -urNp linux-2.6.33.1/net/core/dev.c linux-2.6.33.1/net/core/dev.c +--- linux-2.6.33.1/net/core/dev.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/core/dev.c 
2010-03-20 16:58:42.284535154 -0400 +@@ -2183,7 +2183,7 @@ int netif_rx_ni(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx_ni); + +-static void net_tx_action(struct softirq_action *h) ++static void net_tx_action(void) + { + struct softnet_data *sd = &__get_cpu_var(softnet_data); + +@@ -2939,7 +2939,7 @@ void netif_napi_del(struct napi_struct * + EXPORT_SYMBOL(netif_napi_del); + + +-static void net_rx_action(struct softirq_action *h) ++static void net_rx_action(void) + { + struct list_head *list = &__get_cpu_var(softnet_data).poll_list; + unsigned long time_limit = jiffies + 2; +diff -urNp linux-2.6.33.1/net/core/flow.c linux-2.6.33.1/net/core/flow.c +--- linux-2.6.33.1/net/core/flow.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/core/flow.c 2010-03-20 16:58:42.284535154 -0400 +@@ -39,7 +39,7 @@ atomic_t flow_cache_genid = ATOMIC_INIT( + + static u32 flow_hash_shift; + #define flow_hash_size (1 << flow_hash_shift) +-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; ++static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables); + + #define flow_table(cpu) (per_cpu(flow_tables, cpu)) + +@@ -52,7 +52,7 @@ struct flow_percpu_info { + u32 hash_rnd; + int count; + }; +-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; ++static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info); + + #define flow_hash_rnd_recalc(cpu) \ + (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) +@@ -69,7 +69,7 @@ struct flow_flush_info { + atomic_t cpuleft; + struct completion completion; + }; +-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; ++static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets); + + #define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) + +diff -urNp linux-2.6.33.1/net/core/sock.c linux-2.6.33.1/net/core/sock.c +--- linux-2.6.33.1/net/core/sock.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/core/sock.c 2010-03-20 16:58:42.284535154 -0400 +@@ -896,7 +896,7 @@ int sock_getsockopt(struct socket *sock, + return -ENOTCONN; + if (lv < len) + return -EINVAL; +- if (copy_to_user(optval, address, len)) ++ if (len > sizeof(address) || copy_to_user(optval, address, len)) + return -EFAULT; + goto lenout; + } +@@ -929,7 +929,7 @@ int sock_getsockopt(struct socket *sock, + + if (len > lv) + len = lv; +- if (copy_to_user(optval, &v, len)) ++ if (len > sizeof(v) || copy_to_user(optval, &v, len)) + return -EFAULT; + lenout: + if (put_user(len, optlen)) +diff -urNp linux-2.6.33.1/net/dccp/ccids/ccid3.c linux-2.6.33.1/net/dccp/ccids/ccid3.c +--- linux-2.6.33.1/net/dccp/ccids/ccid3.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/dccp/ccids/ccid3.c 2010-03-20 16:58:42.284535154 -0400 +@@ -41,7 +41,7 @@ + static int ccid3_debug; + #define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) + #else +-#define ccid3_pr_debug(format, a...) ++#define ccid3_pr_debug(format, a...) do {} while (0) + #endif + + /* +diff -urNp linux-2.6.33.1/net/dccp/dccp.h linux-2.6.33.1/net/dccp/dccp.h +--- linux-2.6.33.1/net/dccp/dccp.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/dccp/dccp.h 2010-03-20 16:58:42.284535154 -0400 +@@ -44,9 +44,9 @@ extern int dccp_debug; + #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) + #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) + #else +-#define dccp_pr_debug(format, a...) +-#define dccp_pr_debug_cat(format, a...) +-#define dccp_debug(format, a...) 
++#define dccp_pr_debug(format, a...) do {} while (0) ++#define dccp_pr_debug_cat(format, a...) do {} while (0) ++#define dccp_debug(format, a...) do {} while (0) + #endif + + extern struct inet_hashinfo dccp_hashinfo; +diff -urNp linux-2.6.33.1/net/decnet/sysctl_net_decnet.c linux-2.6.33.1/net/decnet/sysctl_net_decnet.c +--- linux-2.6.33.1/net/decnet/sysctl_net_decnet.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/decnet/sysctl_net_decnet.c 2010-03-20 16:58:42.284535154 -0400 +@@ -173,7 +173,7 @@ static int dn_node_address_handler(ctl_t + + if (len > *lenp) len = *lenp; + +- if (copy_to_user(buffer, addr, len)) ++ if (len > sizeof(addr) || copy_to_user(buffer, addr, len)) + return -EFAULT; + + *lenp = len; +@@ -236,7 +236,7 @@ static int dn_def_dev_handler(ctl_table + + if (len > *lenp) len = *lenp; + +- if (copy_to_user(buffer, devname, len)) ++ if (len > sizeof(devname) || copy_to_user(buffer, devname, len)) + return -EFAULT; + + *lenp = len; +diff -urNp linux-2.6.33.1/net/ipv4/inet_hashtables.c linux-2.6.33.1/net/ipv4/inet_hashtables.c +--- linux-2.6.33.1/net/ipv4/inet_hashtables.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv4/inet_hashtables.c 2010-03-20 16:58:42.311008973 -0400 +@@ -18,11 +18,14 @@ + #include <linux/sched.h> + #include <linux/slab.h> + #include <linux/wait.h> ++#include <linux/security.h> + + #include <net/inet_connection_sock.h> + #include <net/inet_hashtables.h> + #include <net/ip.h> + ++extern void gr_update_task_in_ip_table(struct task_struct *task, const struct inet_sock *inet); ++ + /* + * Allocate and initialize a new local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. +@@ -506,6 +509,8 @@ ok: + twrefcnt += inet_twsk_bind_unhash(tw, hinfo); + spin_unlock(&head->lock); + ++ gr_update_task_in_ip_table(current, inet_sk(sk)); ++ + if (tw) { + inet_twsk_deschedule(tw, death_row); + while (twrefcnt) { +diff -urNp linux-2.6.33.1/net/ipv4/netfilter/nf_nat_snmp_basic.c linux-2.6.33.1/net/ipv4/netfilter/nf_nat_snmp_basic.c +--- linux-2.6.33.1/net/ipv4/netfilter/nf_nat_snmp_basic.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv4/netfilter/nf_nat_snmp_basic.c 2010-03-20 16:58:42.311008973 -0400 +@@ -397,7 +397,7 @@ static unsigned char asn1_octets_decode( + + *len = 0; + +- *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); ++ *octets = kmalloc((eoc - ctx->pointer), GFP_ATOMIC); + if (*octets == NULL) { + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); +diff -urNp linux-2.6.33.1/net/ipv4/tcp_ipv4.c linux-2.6.33.1/net/ipv4/tcp_ipv4.c +--- linux-2.6.33.1/net/ipv4/tcp_ipv4.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv4/tcp_ipv4.c 2010-03-20 17:00:48.140865901 -0400 +@@ -84,6 +84,9 @@ + int sysctl_tcp_tw_reuse __read_mostly; + int sysctl_tcp_low_latency __read_mostly; + ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++extern int grsec_enable_blackhole; ++#endif + + #ifdef CONFIG_TCP_MD5SIG + static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, +@@ -1646,12 +1649,20 @@ int tcp_v4_rcv(struct sk_buff *skb) + TCP_SKB_CB(skb)->sacked = 0; + + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); +- if (!sk) ++ if (!sk) { ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ ret = 1; ++#endif + goto no_tcp_socket; ++ } + + process: +- if (sk->sk_state == TCP_TIME_WAIT) ++ if (sk->sk_state == TCP_TIME_WAIT) { ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ ret = 2; ++#endif + goto do_time_wait; ++ } + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto 
discard_and_relse; +@@ -1693,6 +1704,10 @@ no_tcp_socket: + bad_packet: + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + } else { ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ if (!grsec_enable_blackhole || (ret == 1 && ++ (skb->dev->flags & IFF_LOOPBACK))) ++#endif + tcp_v4_send_reset(NULL, skb); + } + +diff -urNp linux-2.6.33.1/net/ipv4/tcp_minisocks.c linux-2.6.33.1/net/ipv4/tcp_minisocks.c +--- linux-2.6.33.1/net/ipv4/tcp_minisocks.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv4/tcp_minisocks.c 2010-03-20 17:06:01.445852790 -0400 +@@ -26,6 +26,10 @@ + #include <net/inet_common.h> + #include <net/xfrm.h> + ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++extern int grsec_enable_blackhole; ++#endif ++ + int sysctl_tcp_syncookies __read_mostly = 1; + EXPORT_SYMBOL(sysctl_tcp_syncookies); + +@@ -698,8 +702,11 @@ listen_overflow: + + embryonic_reset: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); +- if (!(flg & TCP_FLAG_RST)) ++ ++#ifndef CONFIG_GRKERNSEC_BLACKHOLE ++ if (!grsec_enable_blackhole || !(flg & TCP_FLAG_RST)) + req->rsk_ops->send_reset(sk, skb); ++#endif + + inet_csk_reqsk_queue_drop(sk, req, prev); + return NULL; +diff -urNp linux-2.6.33.1/net/ipv4/tcp_probe.c linux-2.6.33.1/net/ipv4/tcp_probe.c +--- linux-2.6.33.1/net/ipv4/tcp_probe.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv4/tcp_probe.c 2010-03-20 16:58:42.312547376 -0400 +@@ -201,7 +201,7 @@ static ssize_t tcpprobe_read(struct file + if (cnt + width >= len) + break; + +- if (copy_to_user(buf + cnt, tbuf, width)) ++ if (width > sizeof(tbuf) || copy_to_user(buf + cnt, tbuf, width)) + return -EFAULT; + cnt += width; + } +diff -urNp linux-2.6.33.1/net/ipv4/tcp_timer.c linux-2.6.33.1/net/ipv4/tcp_timer.c +--- linux-2.6.33.1/net/ipv4/tcp_timer.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv4/tcp_timer.c 2010-03-20 17:00:48.145360815 -0400 +@@ -21,6 +21,10 @@ + #include <linux/module.h> + #include <net/tcp.h> + ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++extern int grsec_lastack_retries; ++#endif ++ + int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; + int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; + int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME; +@@ -193,6 +197,13 @@ static int tcp_write_timeout(struct sock + } + } + ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ if ((sk->sk_state == TCP_LAST_ACK) && ++ (grsec_lastack_retries > 0) && ++ (grsec_lastack_retries < retry_until)) ++ retry_until = grsec_lastack_retries; ++#endif ++ + if (retransmits_timed_out(sk, retry_until)) { + /* Has it gone just too far? 
*/ + tcp_write_err(sk); +diff -urNp linux-2.6.33.1/net/ipv4/udp.c linux-2.6.33.1/net/ipv4/udp.c +--- linux-2.6.33.1/net/ipv4/udp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv4/udp.c 2010-03-20 17:06:32.628937453 -0400 +@@ -86,6 +86,7 @@ + #include <linux/types.h> + #include <linux/fcntl.h> + #include <linux/module.h> ++#include <linux/security.h> + #include <linux/socket.h> + #include <linux/sockios.h> + #include <linux/igmp.h> +@@ -106,6 +107,10 @@ + #include <net/xfrm.h> + #include "udp_impl.h" + ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++extern int grsec_enable_blackhole; ++#endif ++ + struct udp_table udp_table __read_mostly; + EXPORT_SYMBOL(udp_table); + +@@ -562,6 +567,9 @@ found: + return s; + } + ++extern int gr_search_udp_recvmsg(struct sock *sk, const struct sk_buff *skb); ++extern int gr_search_udp_sendmsg(struct sock *sk, struct sockaddr_in *addr); ++ + /* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should +@@ -830,9 +838,18 @@ int udp_sendmsg(struct kiocb *iocb, stru + dport = usin->sin_port; + if (dport == 0) + return -EINVAL; ++ ++ err = gr_search_udp_sendmsg(sk, usin); ++ if (err) ++ return err; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; ++ ++ err = gr_search_udp_sendmsg(sk, NULL); ++ if (err) ++ return err; ++ + daddr = inet->inet_daddr; + dport = inet->inet_dport; + /* Open fast path for connected socket. +@@ -1137,6 +1154,10 @@ try_again: + if (!skb) + goto out; + ++ err = gr_search_udp_recvmsg(sk, skb); ++ if (err) ++ goto out_free; ++ + ulen = skb->len - sizeof(struct udphdr); + copied = len; + if (copied > ulen) +@@ -1568,6 +1589,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, + goto csum_error; + + UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ if (!grsec_enable_blackhole || (skb->dev->flags & IFF_LOOPBACK)) ++#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* +diff -urNp linux-2.6.33.1/net/ipv6/exthdrs.c linux-2.6.33.1/net/ipv6/exthdrs.c +--- linux-2.6.33.1/net/ipv6/exthdrs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv6/exthdrs.c 2010-03-20 16:58:42.312547376 -0400 +@@ -635,7 +635,7 @@ static struct tlvtype_proc tlvprochopopt + .type = IPV6_TLV_JUMBO, + .func = ipv6_hop_jumbo, + }, +- { -1, } ++ { -1, NULL } + }; + + int ipv6_parse_hopopts(struct sk_buff *skb) +diff -urNp linux-2.6.33.1/net/ipv6/raw.c linux-2.6.33.1/net/ipv6/raw.c +--- linux-2.6.33.1/net/ipv6/raw.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv6/raw.c 2010-03-20 16:58:42.312547376 -0400 +@@ -597,7 +597,7 @@ out: + return err; + } + +-static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, ++static int rawv6_send_hdrinc(struct sock *sk, void *from, unsigned int length, + struct flowi *fl, struct rt6_info *rt, + unsigned int flags) + { +diff -urNp linux-2.6.33.1/net/ipv6/tcp_ipv6.c linux-2.6.33.1/net/ipv6/tcp_ipv6.c +--- linux-2.6.33.1/net/ipv6/tcp_ipv6.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv6/tcp_ipv6.c 2010-03-20 16:58:42.316527391 -0400 +@@ -1625,6 +1625,9 @@ static int tcp_v6_do_rcv(struct sock *sk + return 0; + + reset: ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ if (!skb->dev || (skb->dev->flags & IFF_LOOPBACK)) ++#endif + tcp_v6_send_reset(sk, skb); + discard: + if (opt_skb) +@@ -1747,6 +1750,9 @@ no_tcp_socket: + bad_packet: + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + } else { ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ if 
(skb->dev->flags & IFF_LOOPBACK) ++#endif + tcp_v6_send_reset(NULL, skb); + } + +diff -urNp linux-2.6.33.1/net/ipv6/udp.c linux-2.6.33.1/net/ipv6/udp.c +--- linux-2.6.33.1/net/ipv6/udp.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/ipv6/udp.c 2010-03-20 16:58:42.316527391 -0400 +@@ -745,6 +745,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, + UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, + proto == IPPROTO_UDPLITE); + ++#ifdef CONFIG_GRKERNSEC_BLACKHOLE ++ if (skb->dev->flags & IFF_LOOPBACK) ++#endif + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); + + kfree_skb(skb); +diff -urNp linux-2.6.33.1/net/irda/ircomm/ircomm_tty.c linux-2.6.33.1/net/irda/ircomm/ircomm_tty.c +--- linux-2.6.33.1/net/irda/ircomm/ircomm_tty.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/irda/ircomm/ircomm_tty.c 2010-03-20 16:58:42.316527391 -0400 +@@ -280,16 +280,16 @@ static int ircomm_tty_block_til_ready(st + add_wait_queue(&self->open_wait, &wait); + + IRDA_DEBUG(2, "%s(%d):block_til_ready before block on %s open_count=%d\n", +- __FILE__,__LINE__, tty->driver->name, self->open_count ); ++ __FILE__,__LINE__, tty->driver->name, atomic_read(&self->open_count) ); + + /* As far as I can see, we protect open_count - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + if (!tty_hung_up_p(filp)) { + extra_count = 1; +- self->open_count--; ++ atomic_dec(&self->open_count); + } + spin_unlock_irqrestore(&self->spinlock, flags); +- self->blocked_open++; ++ atomic_inc(&self->blocked_open); + + while (1) { + if (tty->termios->c_cflag & CBAUD) { +@@ -329,7 +329,7 @@ static int ircomm_tty_block_til_ready(st + } + + IRDA_DEBUG(1, "%s(%d):block_til_ready blocking on %s open_count=%d\n", +- __FILE__,__LINE__, tty->driver->name, self->open_count ); ++ __FILE__,__LINE__, tty->driver->name, atomic_read(&self->open_count) ); + + schedule(); + } +@@ -340,13 +340,13 @@ static int ircomm_tty_block_til_ready(st + if (extra_count) { + /* ++ is not atomic, so this should be protected - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); +- self->open_count++; ++ atomic_inc(&self->open_count); + spin_unlock_irqrestore(&self->spinlock, flags); + } +- self->blocked_open--; ++ atomic_dec(&self->blocked_open); + + IRDA_DEBUG(1, "%s(%d):block_til_ready after blocking on %s open_count=%d\n", +- __FILE__,__LINE__, tty->driver->name, self->open_count); ++ __FILE__,__LINE__, tty->driver->name, atomic_read(&self->open_count)); + + if (!retval) + self->flags |= ASYNC_NORMAL_ACTIVE; +@@ -415,14 +415,14 @@ static int ircomm_tty_open(struct tty_st + } + /* ++ is not atomic, so this should be protected - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); +- self->open_count++; ++ atomic_inc(&self->open_count); + + tty->driver_data = self; + self->tty = tty; + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(1, "%s(), %s%d, count = %d\n", __func__ , tty->driver->name, +- self->line, self->open_count); ++ self->line, atomic_read(&self->open_count)); + + /* Not really used by us, but lets do it anyway */ + self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 1 : 0; +@@ -511,7 +511,7 @@ static void ircomm_tty_close(struct tty_ + return; + } + +- if ((tty->count == 1) && (self->open_count != 1)) { ++ if ((tty->count == 1) && (atomic_read(&self->open_count) != 1)) { + /* + * Uh, oh. tty->count is 1, which means that the tty + * structure will be freed. 
state->count should always +@@ -521,16 +521,16 @@ static void ircomm_tty_close(struct tty_ + */ + IRDA_DEBUG(0, "%s(), bad serial port count; " + "tty->count is 1, state->count is %d\n", __func__ , +- self->open_count); +- self->open_count = 1; ++ atomic_read(&self->open_count)); ++ atomic_set(&self->open_count, 1); + } + +- if (--self->open_count < 0) { ++ if (atomic_dec_return(&self->open_count) < 0) { + IRDA_ERROR("%s(), bad serial port count for ttys%d: %d\n", +- __func__, self->line, self->open_count); +- self->open_count = 0; ++ __func__, self->line, atomic_read(&self->open_count)); ++ atomic_set(&self->open_count, 0); + } +- if (self->open_count) { ++ if (atomic_read(&self->open_count)) { + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(0, "%s(), open count > 0\n", __func__ ); +@@ -562,7 +562,7 @@ static void ircomm_tty_close(struct tty_ + tty->closing = 0; + self->tty = NULL; + +- if (self->blocked_open) { ++ if (atomic_read(&self->blocked_open)) { + if (self->close_delay) + schedule_timeout_interruptible(self->close_delay); + wake_up_interruptible(&self->open_wait); +@@ -1017,7 +1017,7 @@ static void ircomm_tty_hangup(struct tty + spin_lock_irqsave(&self->spinlock, flags); + self->flags &= ~ASYNC_NORMAL_ACTIVE; + self->tty = NULL; +- self->open_count = 0; ++ atomic_set(&self->open_count, 0); + spin_unlock_irqrestore(&self->spinlock, flags); + + wake_up_interruptible(&self->open_wait); +@@ -1369,7 +1369,7 @@ static void ircomm_tty_line_info(struct + seq_putc(m, '\n'); + + seq_printf(m, "Role: %s\n", self->client ? "client" : "server"); +- seq_printf(m, "Open count: %d\n", self->open_count); ++ seq_printf(m, "Open count: %d\n", atomic_read(&self->open_count)); + seq_printf(m, "Max data size: %d\n", self->max_data_size); + seq_printf(m, "Max header size: %d\n", self->max_header_size); + +diff -urNp linux-2.6.33.1/net/mac80211/ieee80211_i.h linux-2.6.33.1/net/mac80211/ieee80211_i.h +--- linux-2.6.33.1/net/mac80211/ieee80211_i.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/mac80211/ieee80211_i.h 2010-03-20 16:58:42.316527391 -0400 +@@ -574,7 +574,7 @@ struct ieee80211_local { + /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ + spinlock_t queue_stop_reason_lock; + +- int open_count; ++ atomic_t open_count; + int monitors, cooked_mntrs; + /* number of interfaces with corresponding FIF_ flags */ + int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll; +diff -urNp linux-2.6.33.1/net/mac80211/iface.c linux-2.6.33.1/net/mac80211/iface.c +--- linux-2.6.33.1/net/mac80211/iface.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/mac80211/iface.c 2010-03-20 16:58:42.316527391 -0400 +@@ -166,7 +166,7 @@ static int ieee80211_open(struct net_dev + break; + } + +- if (local->open_count == 0) { ++ if (atomic_read(&local->open_count) == 0) { + res = drv_start(local); + if (res) + goto err_del_bss; +@@ -198,7 +198,7 @@ static int ieee80211_open(struct net_dev + * Validate the MAC address for this device. 
+ */ + if (!is_valid_ether_addr(dev->dev_addr)) { +- if (!local->open_count) ++ if (!atomic_read(&local->open_count)) + drv_stop(local); + return -EADDRNOTAVAIL; + } +@@ -294,7 +294,7 @@ static int ieee80211_open(struct net_dev + + hw_reconf_flags |= __ieee80211_recalc_idle(local); + +- local->open_count++; ++ atomic_inc(&local->open_count); + if (hw_reconf_flags) { + ieee80211_hw_config(local, hw_reconf_flags); + /* +@@ -322,7 +322,7 @@ static int ieee80211_open(struct net_dev + err_del_interface: + drv_remove_interface(local, &conf); + err_stop: +- if (!local->open_count) ++ if (!atomic_read(&local->open_count)) + drv_stop(local); + err_del_bss: + sdata->bss = NULL; +@@ -422,7 +422,7 @@ static int ieee80211_stop(struct net_dev + WARN_ON(!list_empty(&sdata->u.ap.vlans)); + } + +- local->open_count--; ++ atomic_dec(&local->open_count); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: +@@ -528,7 +528,7 @@ static int ieee80211_stop(struct net_dev + + ieee80211_recalc_ps(local, -1); + +- if (local->open_count == 0) { ++ if (atomic_read(&local->open_count) == 0) { + ieee80211_clear_tx_pending(local); + ieee80211_stop_device(local); + +diff -urNp linux-2.6.33.1/net/mac80211/main.c linux-2.6.33.1/net/mac80211/main.c +--- linux-2.6.33.1/net/mac80211/main.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/mac80211/main.c 2010-03-20 16:58:42.316527391 -0400 +@@ -129,7 +129,7 @@ int ieee80211_hw_config(struct ieee80211 + local->hw.conf.power_level = power; + } + +- if (changed && local->open_count) { ++ if (changed && atomic_read(&local->open_count)) { + ret = drv_config(local, changed); + /* + * Goal: +diff -urNp linux-2.6.33.1/net/mac80211/pm.c linux-2.6.33.1/net/mac80211/pm.c +--- linux-2.6.33.1/net/mac80211/pm.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/mac80211/pm.c 2010-03-20 16:58:42.316527391 -0400 +@@ -107,7 +107,7 @@ int __ieee80211_suspend(struct ieee80211 + } + + /* stop hardware - this must stop RX */ +- if (local->open_count) ++ if (atomic_read(&local->open_count)) + ieee80211_stop_device(local); + + local->suspended = true; +diff -urNp linux-2.6.33.1/net/mac80211/rate.c linux-2.6.33.1/net/mac80211/rate.c +--- linux-2.6.33.1/net/mac80211/rate.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/mac80211/rate.c 2010-03-20 16:58:42.316527391 -0400 +@@ -288,7 +288,7 @@ int ieee80211_init_rate_ctrl_alg(struct + + ASSERT_RTNL(); + +- if (local->open_count) ++ if (atomic_read(&local->open_count)) + return -EBUSY; + + if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { +diff -urNp linux-2.6.33.1/net/mac80211/rc80211_pid_debugfs.c linux-2.6.33.1/net/mac80211/rc80211_pid_debugfs.c +--- linux-2.6.33.1/net/mac80211/rc80211_pid_debugfs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/mac80211/rc80211_pid_debugfs.c 2010-03-20 16:58:42.323982192 -0400 +@@ -191,7 +191,7 @@ static ssize_t rate_control_pid_events_r + + spin_unlock_irqrestore(&events->lock, status); + +- if (copy_to_user(buf, pb, p)) ++ if (p > sizeof(pb) || copy_to_user(buf, pb, p)) + return -EFAULT; + + return p; +diff -urNp linux-2.6.33.1/net/mac80211/util.c linux-2.6.33.1/net/mac80211/util.c +--- linux-2.6.33.1/net/mac80211/util.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/mac80211/util.c 2010-03-20 16:58:42.329957671 -0400 +@@ -1050,14 +1050,14 @@ int ieee80211_reconfig(struct ieee80211_ + local->resuming = true; + + /* restart hardware */ +- if (local->open_count) { ++ if (atomic_read(&local->open_count)) { + /* + * Upon resume hardware can 
sometimes be goofy due to + * various platform / driver / bus issues, so restarting + * the device may at times not work immediately. Propagate + * the error. + */ +- res = drv_start(local); ++ res = drv_start(local); + if (res) { + WARN(local->suspended, "Harware became unavailable " + "upon resume. This is could be a software issue" +diff -urNp linux-2.6.33.1/net/packet/af_packet.c linux-2.6.33.1/net/packet/af_packet.c +--- linux-2.6.33.1/net/packet/af_packet.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/packet/af_packet.c 2010-03-20 16:58:42.329957671 -0400 +@@ -1886,7 +1886,7 @@ static int packet_getsockopt(struct sock + case PACKET_HDRLEN: + if (len > sizeof(int)) + len = sizeof(int); +- if (copy_from_user(&val, optval, len)) ++ if (len > sizeof(val) || copy_from_user(&val, optval, len)) + return -EFAULT; + switch (val) { + case TPACKET_V1: +@@ -1919,7 +1919,7 @@ static int packet_getsockopt(struct sock + + if (put_user(len, optlen)) + return -EFAULT; +- if (copy_to_user(optval, data, len)) ++ if (len > sizeof(st) || copy_to_user(optval, data, len)) + return -EFAULT; + return 0; + } +diff -urNp linux-2.6.33.1/net/sctp/socket.c linux-2.6.33.1/net/sctp/socket.c +--- linux-2.6.33.1/net/sctp/socket.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/sctp/socket.c 2010-03-20 16:58:42.332512321 -0400 +@@ -1482,7 +1482,7 @@ SCTP_STATIC int sctp_sendmsg(struct kioc + struct sctp_sndrcvinfo *sinfo; + struct sctp_initmsg *sinit; + sctp_assoc_t associd = 0; +- sctp_cmsgs_t cmsgs = { NULL }; ++ sctp_cmsgs_t cmsgs = { NULL, NULL }; + int err; + sctp_scope_t scope; + long timeo; +@@ -4386,7 +4386,7 @@ static int sctp_getsockopt_peer_addrs(st + addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len; + if (space_left < addrlen) + return -ENOMEM; +- if (copy_to_user(to, &temp, addrlen)) ++ if (addrlen > sizeof(temp) || copy_to_user(to, &temp, addrlen)) + return -EFAULT; + to += addrlen; + cnt++; +@@ -5478,7 +5478,6 @@ pp_found: + */ + int reuse = sk->sk_reuse; + struct sock *sk2; +- struct hlist_node *node; + + SCTP_DEBUG_PRINTK("sctp_get_port() found a possible match\n"); + if (pp->fastreuse && sk->sk_reuse && +diff -urNp linux-2.6.33.1/net/socket.c linux-2.6.33.1/net/socket.c +--- linux-2.6.33.1/net/socket.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/socket.c 2010-03-20 16:58:42.332512321 -0400 +@@ -87,6 +87,7 @@ + #include <linux/wireless.h> + #include <linux/nsproxy.h> + #include <linux/magic.h> ++#include <linux/in.h> + + #include <asm/uaccess.h> + #include <asm/unistd.h> +@@ -103,6 +104,8 @@ + #include <linux/sockios.h> + #include <linux/atalk.h> + ++#include <linux/grsock.h> ++ + static int sock_no_open(struct inode *irrelevant, struct file *dontcare); + static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +@@ -304,7 +307,7 @@ static int sockfs_get_sb(struct file_sys + mnt); + } + +-static struct vfsmount *sock_mnt __read_mostly; ++struct vfsmount *sock_mnt __read_mostly; + + static struct file_system_type sock_fs_type = { + .name = "sockfs", +@@ -1310,6 +1313,16 @@ SYSCALL_DEFINE3(socket, int, family, int + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + ++ if(!gr_search_socket(family, type, protocol)) { ++ retval = -EACCES; ++ goto out; ++ } ++ ++ if (gr_handle_sock_all(family, type, protocol)) { ++ retval = -EACCES; ++ goto out; ++ } ++ + retval = sock_create(family, type, protocol, &sock); + if (retval < 0) + goto out; +@@ 
-1422,6 +1435,14 @@ SYSCALL_DEFINE3(bind, int, fd, struct so + if (sock) { + err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address); + if (err >= 0) { ++ if (gr_handle_sock_server((struct sockaddr *)&address)) { ++ err = -EACCES; ++ goto error; ++ } ++ err = gr_search_bind(sock, (struct sockaddr_in *)&address); ++ if (err) ++ goto error; ++ + err = security_socket_bind(sock, + (struct sockaddr *)&address, + addrlen); +@@ -1430,6 +1451,7 @@ SYSCALL_DEFINE3(bind, int, fd, struct so + (struct sockaddr *) + &address, addrlen); + } ++error: + fput_light(sock->file, fput_needed); + } + return err; +@@ -1453,10 +1475,20 @@ SYSCALL_DEFINE2(listen, int, fd, int, ba + if ((unsigned)backlog > somaxconn) + backlog = somaxconn; + ++ if (gr_handle_sock_server_other(sock)) { ++ err = -EPERM; ++ goto error; ++ } ++ ++ err = gr_search_listen(sock); ++ if (err) ++ goto error; ++ + err = security_socket_listen(sock, backlog); + if (!err) + err = sock->ops->listen(sock, backlog); + ++error: + fput_light(sock->file, fput_needed); + } + return err; +@@ -1499,6 +1531,18 @@ SYSCALL_DEFINE4(accept4, int, fd, struct + newsock->type = sock->type; + newsock->ops = sock->ops; + ++ if (gr_handle_sock_server_other(sock)) { ++ err = -EPERM; ++ sock_release(newsock); ++ goto out_put; ++ } ++ ++ err = gr_search_accept(sock); ++ if (err) { ++ sock_release(newsock); ++ goto out_put; ++ } ++ + /* + * We don't need try_module_get here, as the listening socket (sock) + * has the protocol module (sock->ops->owner) held. +@@ -1537,6 +1581,8 @@ SYSCALL_DEFINE4(accept4, int, fd, struct + fd_install(newfd, newfile); + err = newfd; + ++ gr_attach_curr_ip(newsock->sk); ++ + out_put: + fput_light(sock->file, fput_needed); + out: +@@ -1569,6 +1615,7 @@ SYSCALL_DEFINE3(connect, int, fd, struct + int, addrlen) + { + struct socket *sock; ++ struct sockaddr *sck; + struct sockaddr_storage address; + int err, fput_needed; + +@@ -1579,6 +1626,17 @@ SYSCALL_DEFINE3(connect, int, fd, struct + if (err < 0) + goto out_put; + ++ sck = (struct sockaddr *)&address; ++ ++ if (gr_handle_sock_client(sck)) { ++ err = -EACCES; ++ goto out_put; ++ } ++ ++ err = gr_search_connect(sock, (struct sockaddr_in *)sck); ++ if (err) ++ goto out_put; ++ + err = + security_socket_connect(sock, (struct sockaddr *)&address, addrlen); + if (err) +diff -urNp linux-2.6.33.1/net/sunrpc/xprtrdma/svc_rdma.c linux-2.6.33.1/net/sunrpc/xprtrdma/svc_rdma.c +--- linux-2.6.33.1/net/sunrpc/xprtrdma/svc_rdma.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/sunrpc/xprtrdma/svc_rdma.c 2010-03-20 16:58:42.332512321 -0400 +@@ -105,7 +105,7 @@ static int read_reset_stat(ctl_table *ta + len -= *ppos; + if (len > *lenp) + len = *lenp; +- if (len && copy_to_user(buffer, str_buf, len)) ++ if (len > sizeof(str_buf) || (len && copy_to_user(buffer, str_buf, len))) + return -EFAULT; + *lenp = len; + *ppos += len; +diff -urNp linux-2.6.33.1/net/sysctl_net.c linux-2.6.33.1/net/sysctl_net.c +--- linux-2.6.33.1/net/sysctl_net.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/sysctl_net.c 2010-03-20 16:58:42.332512321 -0400 +@@ -46,7 +46,7 @@ static int net_ctl_permissions(struct ct + struct ctl_table *table) + { + /* Allow network administrator to have same access as root. 
*/ +- if (capable(CAP_NET_ADMIN)) { ++ if (capable_nolog(CAP_NET_ADMIN)) { + int mode = (table->mode >> 6) & 7; + return (mode << 6) | (mode << 3) | mode; + } +diff -urNp linux-2.6.33.1/net/unix/af_unix.c linux-2.6.33.1/net/unix/af_unix.c +--- linux-2.6.33.1/net/unix/af_unix.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/net/unix/af_unix.c 2010-03-20 16:58:42.332512321 -0400 +@@ -735,6 +735,12 @@ static struct sock *unix_find_other(stru + err = -ECONNREFUSED; + if (!S_ISSOCK(inode->i_mode)) + goto put_fail; ++ ++ if (!gr_acl_handle_unix(path.dentry, path.mnt)) { ++ err = -EACCES; ++ goto put_fail; ++ } ++ + u = unix_find_socket_byinode(net, inode); + if (!u) + goto put_fail; +@@ -755,6 +761,13 @@ static struct sock *unix_find_other(stru + if (u) { + struct dentry *dentry; + dentry = unix_sk(u)->dentry; ++ ++ if (!gr_handle_chroot_unix(u->sk_peercred.pid)) { ++ err = -EPERM; ++ sock_put(u); ++ goto fail; ++ } ++ + if (dentry) + touch_atime(unix_sk(u)->mnt, dentry); + } else +@@ -840,11 +853,18 @@ static int unix_bind(struct socket *sock + err = security_path_mknod(&nd.path, dentry, mode, 0); + if (err) + goto out_mknod_drop_write; ++ if (!gr_acl_handle_mknod(dentry, nd.path.dentry, nd.path.mnt, mode)) { ++ err = -EACCES; ++ goto out_mknod_drop_write; ++ } + err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0); + out_mknod_drop_write: + mnt_drop_write(nd.path.mnt); + if (err) + goto out_mknod_dput; ++ ++ gr_handle_create(dentry, nd.path.mnt); ++ + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + dput(nd.path.dentry); + nd.path.dentry = dentry; +@@ -862,6 +882,10 @@ out_mknod_drop_write: + goto out_unlock; + } + ++#ifdef CONFIG_GRKERNSEC_CHROOT_UNIX ++ sk->sk_peercred.pid = current->pid; ++#endif ++ + list = &unix_socket_table[addr->hash]; + } else { + list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; +diff -urNp linux-2.6.33.1/samples/kobject/kset-example.c linux-2.6.33.1/samples/kobject/kset-example.c +--- linux-2.6.33.1/samples/kobject/kset-example.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/samples/kobject/kset-example.c 2010-03-20 16:58:42.344967210 -0400 +@@ -87,7 +87,7 @@ static ssize_t foo_attr_store(struct kob + } + + /* Our custom sysfs_ops that we will associate with our ktype later on */ +-static struct sysfs_ops foo_sysfs_ops = { ++static const struct sysfs_ops foo_sysfs_ops = { + .show = foo_attr_show, + .store = foo_attr_store, + }; +diff -urNp linux-2.6.33.1/scripts/basic/fixdep.c linux-2.6.33.1/scripts/basic/fixdep.c +--- linux-2.6.33.1/scripts/basic/fixdep.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/scripts/basic/fixdep.c 2010-03-20 16:58:42.344967210 -0400 +@@ -222,9 +222,9 @@ static void use_config(char *m, int slen + + static void parse_config_file(char *map, size_t len) + { +- int *end = (int *) (map + len); ++ unsigned int *end = (unsigned int *) (map + len); + /* start at +1, so that p can never be < map */ +- int *m = (int *) map + 1; ++ unsigned int *m = (unsigned int *) map + 1; + char *p, *q; + + for (; m < end; m++) { +@@ -371,7 +371,7 @@ static void print_deps(void) + static void traps(void) + { + static char test[] __attribute__((aligned(sizeof(int)))) = "CONF"; +- int *p = (int *)test; ++ unsigned int *p = (unsigned int *)test; + + if (*p != INT_CONF) { + fprintf(stderr, "fixdep: sizeof(int) != 4 or wrong endianess? 
%#x\n", +diff -urNp linux-2.6.33.1/scripts/kallsyms.c linux-2.6.33.1/scripts/kallsyms.c +--- linux-2.6.33.1/scripts/kallsyms.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/scripts/kallsyms.c 2010-03-20 16:58:42.344967210 -0400 +@@ -43,10 +43,10 @@ struct text_range { + + static unsigned long long _text; + static struct text_range text_ranges[] = { +- { "_stext", "_etext" }, +- { "_sinittext", "_einittext" }, +- { "_stext_l1", "_etext_l1" }, /* Blackfin on-chip L1 inst SRAM */ +- { "_stext_l2", "_etext_l2" }, /* Blackfin on-chip L2 SRAM */ ++ { "_stext", "_etext", 0, 0 }, ++ { "_sinittext", "_einittext", 0, 0 }, ++ { "_stext_l1", "_etext_l1", 0, 0 }, /* Blackfin on-chip L1 inst SRAM */ ++ { "_stext_l2", "_etext_l2", 0, 0 }, /* Blackfin on-chip L2 SRAM */ + }; + #define text_range_text (&text_ranges[0]) + #define text_range_inittext (&text_ranges[1]) +diff -urNp linux-2.6.33.1/scripts/mod/file2alias.c linux-2.6.33.1/scripts/mod/file2alias.c +--- linux-2.6.33.1/scripts/mod/file2alias.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/scripts/mod/file2alias.c 2010-03-20 16:58:42.348544513 -0400 +@@ -72,7 +72,7 @@ static void device_id_check(const char * + unsigned long size, unsigned long id_size, + void *symval) + { +- int i; ++ unsigned int i; + + if (size % id_size || size < id_size) { + if (cross_build != 0) +@@ -102,7 +102,7 @@ static void device_id_check(const char * + /* USB is special because the bcdDevice can be matched against a numeric range */ + /* Looks like "usb:vNpNdNdcNdscNdpNicNiscNipN" */ + static void do_usb_entry(struct usb_device_id *id, +- unsigned int bcdDevice_initial, int bcdDevice_initial_digits, ++ unsigned int bcdDevice_initial, unsigned int bcdDevice_initial_digits, + unsigned char range_lo, unsigned char range_hi, + unsigned char max, struct module *mod) + { +@@ -437,7 +437,7 @@ static void do_pnp_device_entry(void *sy + for (i = 0; i < count; i++) { + const char *id = (char *)devs[i].id; + char acpi_id[sizeof(devs[0].id)]; +- int j; ++ unsigned int j; + + buf_printf(&mod->dev_table_buf, + "MODULE_ALIAS(\"pnp:d%s*\");\n", id); +@@ -467,7 +467,7 @@ static void do_pnp_card_entries(void *sy + + for (j = 0; j < PNP_MAX_DEVICES; j++) { + const char *id = (char *)card->devs[j].id; +- int i2, j2; ++ unsigned int i2, j2; + int dup = 0; + + if (!id[0]) +@@ -493,7 +493,7 @@ static void do_pnp_card_entries(void *sy + /* add an individual alias for every device entry */ + if (!dup) { + char acpi_id[sizeof(card->devs[0].id)]; +- int k; ++ unsigned int k; + + buf_printf(&mod->dev_table_buf, + "MODULE_ALIAS(\"pnp:d%s*\");\n", id); +@@ -768,7 +768,7 @@ static void dmi_ascii_filter(char *d, co + static int do_dmi_entry(const char *filename, struct dmi_system_id *id, + char *alias) + { +- int i, j; ++ unsigned int i, j; + + sprintf(alias, "dmi*"); + +diff -urNp linux-2.6.33.1/scripts/mod/modpost.c linux-2.6.33.1/scripts/mod/modpost.c +--- linux-2.6.33.1/scripts/mod/modpost.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/scripts/mod/modpost.c 2010-03-20 16:58:42.348544513 -0400 +@@ -842,6 +842,7 @@ enum mismatch { + INIT_TO_EXIT, + EXIT_TO_INIT, + EXPORT_TO_INIT_EXIT, ++ DATA_TO_TEXT + }; + + struct sectioncheck { +@@ -927,6 +928,12 @@ const struct sectioncheck sectioncheck[] + .fromsec = { "__ksymtab*", NULL }, + .tosec = { INIT_SECTIONS, EXIT_SECTIONS, NULL }, + .mismatch = EXPORT_TO_INIT_EXIT ++}, ++/* Do not reference code from writable data */ ++{ ++ .fromsec = { DATA_SECTIONS, NULL }, ++ .tosec = { TEXT_SECTIONS, NULL }, ++ .mismatch = DATA_TO_TEXT + } 
+ }; + +@@ -1031,10 +1038,10 @@ static Elf_Sym *find_elf_symbol(struct e + continue; + if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) + continue; +- if (sym->st_value == addr) +- return sym; + /* Find a symbol nearby - addr are maybe negative */ + d = sym->st_value - addr; ++ if (d == 0) ++ return sym; + if (d < 0) + d = addr - sym->st_value; + if (d < distance) { +@@ -1275,6 +1282,14 @@ static void report_sec_mismatch(const ch + "Fix this by removing the %sannotation of %s " + "or drop the export.\n", + tosym, sec2annotation(tosec), sec2annotation(tosec), tosym); ++ case DATA_TO_TEXT: ++/* ++ fprintf(stderr, ++ "The variable %s references\n" ++ "the %s %s%s%s\n", ++ fromsym, to, sec2annotation(tosec), tosym, to_p); ++*/ ++ break; + case NO_MISMATCH: + /* To get warnings on missing members */ + break; +@@ -1600,7 +1615,7 @@ void __attribute__((format(printf, 2, 3) + va_end(ap); + } + +-void buf_write(struct buffer *buf, const char *s, int len) ++void buf_write(struct buffer *buf, const char *s, unsigned int len) + { + if (buf->size - buf->pos < len) { + buf->size += len + SZ; +@@ -1812,7 +1827,7 @@ static void write_if_changed(struct buff + if (fstat(fileno(file), &st) < 0) + goto close_write; + +- if (st.st_size != b->pos) ++ if (st.st_size != (off_t)b->pos) + goto close_write; + + tmp = NOFAIL(malloc(b->pos)); +diff -urNp linux-2.6.33.1/scripts/mod/modpost.h linux-2.6.33.1/scripts/mod/modpost.h +--- linux-2.6.33.1/scripts/mod/modpost.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/scripts/mod/modpost.h 2010-03-20 16:58:42.348544513 -0400 +@@ -92,15 +92,15 @@ void *do_nofail(void *ptr, const char *e + + struct buffer { + char *p; +- int pos; +- int size; ++ unsigned int pos; ++ unsigned int size; + }; + + void __attribute__((format(printf, 2, 3))) + buf_printf(struct buffer *buf, const char *fmt, ...); + + void +-buf_write(struct buffer *buf, const char *s, int len); ++buf_write(struct buffer *buf, const char *s, unsigned int len); + + struct module { + struct module *next; +diff -urNp linux-2.6.33.1/scripts/mod/sumversion.c linux-2.6.33.1/scripts/mod/sumversion.c +--- linux-2.6.33.1/scripts/mod/sumversion.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/scripts/mod/sumversion.c 2010-03-20 16:58:42.348544513 -0400 +@@ -455,7 +455,7 @@ static void write_version(const char *fi + goto out; + } + +- if (write(fd, sum, strlen(sum)+1) != strlen(sum)+1) { ++ if (write(fd, sum, strlen(sum)+1) != (ssize_t)strlen(sum)+1) { + warn("writing sum in %s failed: %s\n", + filename, strerror(errno)); + goto out; +diff -urNp linux-2.6.33.1/scripts/pnmtologo.c linux-2.6.33.1/scripts/pnmtologo.c +--- linux-2.6.33.1/scripts/pnmtologo.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/scripts/pnmtologo.c 2010-03-20 16:58:42.348544513 -0400 +@@ -237,14 +237,14 @@ static void write_header(void) + fprintf(out, " * Linux logo %s\n", logoname); + fputs(" */\n\n", out); + fputs("#include <linux/linux_logo.h>\n\n", out); +- fprintf(out, "static unsigned char %s_data[] __initdata = {\n", ++ fprintf(out, "static unsigned char %s_data[] = {\n", + logoname); + } + + static void write_footer(void) + { + fputs("\n};\n\n", out); +- fprintf(out, "const struct linux_logo %s __initconst = {\n", logoname); ++ fprintf(out, "const struct linux_logo %s = {\n", logoname); + fprintf(out, "\t.type\t\t= %s,\n", logo_types[logo_type]); + fprintf(out, "\t.width\t\t= %d,\n", logo_width); + fprintf(out, "\t.height\t\t= %d,\n", logo_height); +@@ -374,7 +374,7 @@ static void write_logo_clut224(void) + 
fputs("\n};\n\n", out); + + /* write logo clut */ +- fprintf(out, "static unsigned char %s_clut[] __initdata = {\n", ++ fprintf(out, "static unsigned char %s_clut[] = {\n", + logoname); + write_hex_cnt = 0; + for (i = 0; i < logo_clutsize; i++) { +diff -urNp linux-2.6.33.1/security/commoncap.c linux-2.6.33.1/security/commoncap.c +--- linux-2.6.33.1/security/commoncap.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/security/commoncap.c 2010-03-20 16:58:42.348544513 -0400 +@@ -27,7 +27,7 @@ + #include <linux/sched.h> + #include <linux/prctl.h> + #include <linux/securebits.h> +- ++#include <net/sock.h> + /* + * If a non-root user executes a setuid-root binary in + * !secure(SECURE_NOROOT) mode, then we raise capabilities. +@@ -50,9 +50,11 @@ static void warn_setuid_and_fcaps_mixed( + } + } + ++extern kernel_cap_t gr_cap_rtnetlink(struct sock *sk); ++ + int cap_netlink_send(struct sock *sk, struct sk_buff *skb) + { +- NETLINK_CB(skb).eff_cap = current_cap(); ++ NETLINK_CB(skb).eff_cap = gr_cap_rtnetlink(sk); + return 0; + } + +diff -urNp linux-2.6.33.1/security/integrity/ima/ima_api.c linux-2.6.33.1/security/integrity/ima/ima_api.c +--- linux-2.6.33.1/security/integrity/ima/ima_api.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/security/integrity/ima/ima_api.c 2010-03-20 16:58:42.348544513 -0400 +@@ -74,7 +74,7 @@ void ima_add_violation(struct inode *ino + int result; + + /* can overflow, only indicator */ +- atomic_long_inc(&ima_htable.violations); ++ atomic_long_inc_unchecked(&ima_htable.violations); + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { +diff -urNp linux-2.6.33.1/security/integrity/ima/ima_fs.c linux-2.6.33.1/security/integrity/ima/ima_fs.c +--- linux-2.6.33.1/security/integrity/ima/ima_fs.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/security/integrity/ima/ima_fs.c 2010-03-20 16:58:42.348544513 -0400 +@@ -27,12 +27,12 @@ + static int valid_policy = 1; + #define TMPBUFLEN 12 + static ssize_t ima_show_htable_value(char __user *buf, size_t count, +- loff_t *ppos, atomic_long_t *val) ++ loff_t *ppos, atomic_long_unchecked_t *val) + { + char tmpbuf[TMPBUFLEN]; + ssize_t len; + +- len = scnprintf(tmpbuf, TMPBUFLEN, "%li\n", atomic_long_read(val)); ++ len = scnprintf(tmpbuf, TMPBUFLEN, "%li\n", atomic_long_read_unchecked(val)); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, len); + } + +diff -urNp linux-2.6.33.1/security/integrity/ima/ima.h linux-2.6.33.1/security/integrity/ima/ima.h +--- linux-2.6.33.1/security/integrity/ima/ima.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/security/integrity/ima/ima.h 2010-03-20 16:58:42.348544513 -0400 +@@ -83,8 +83,8 @@ void ima_add_violation(struct inode *ino + extern spinlock_t ima_queue_lock; + + struct ima_h_table { +- atomic_long_t len; /* number of stored measurements in the list */ +- atomic_long_t violations; ++ atomic_long_unchecked_t len; /* number of stored measurements in the list */ ++ atomic_long_unchecked_t violations; + struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE]; + }; + extern struct ima_h_table ima_htable; +diff -urNp linux-2.6.33.1/security/integrity/ima/ima_queue.c linux-2.6.33.1/security/integrity/ima/ima_queue.c +--- linux-2.6.33.1/security/integrity/ima/ima_queue.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/security/integrity/ima/ima_queue.c 2010-03-20 16:58:42.348544513 -0400 +@@ -78,7 +78,7 @@ static int ima_add_digest_entry(struct i + INIT_LIST_HEAD(&qe->later); + list_add_tail_rcu(&qe->later, &ima_measurements); + +- 
atomic_long_inc(&ima_htable.len); ++ atomic_long_inc_unchecked(&ima_htable.len); + key = ima_hash_key(entry->digest); + hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); + return 0; +diff -urNp linux-2.6.33.1/security/Kconfig linux-2.6.33.1/security/Kconfig +--- linux-2.6.33.1/security/Kconfig 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/security/Kconfig 2010-03-20 17:07:22.208972401 -0400 +@@ -4,6 +4,488 @@ + + menu "Security options" + ++source grsecurity/Kconfig ++ ++menu "PaX" ++ ++config PAX ++ bool "Enable various PaX features" ++ depends on GRKERNSEC && (ALPHA || ARM || AVR32 || IA64 || MIPS32 || MIPS64 || PARISC || PPC || SPARC || X86) ++ help ++ This allows you to enable various PaX features. PaX adds ++ intrusion prevention mechanisms to the kernel that reduce ++ the risks posed by exploitable memory corruption bugs. ++ ++menu "PaX Control" ++ depends on PAX ++ ++config PAX_SOFTMODE ++ bool 'Support soft mode' ++ select PAX_PT_PAX_FLAGS ++ help ++ Enabling this option will allow you to run PaX in soft mode, that ++ is, PaX features will not be enforced by default, only on executables ++ marked explicitly. You must also enable PT_PAX_FLAGS support as it ++ is the only way to mark executables for soft mode use. ++ ++ Soft mode can be activated by using the "pax_softmode=1" kernel command ++ line option on boot. Furthermore you can control various PaX features ++ at runtime via the entries in /proc/sys/kernel/pax. ++ ++config PAX_EI_PAX ++ bool 'Use legacy ELF header marking' ++ help ++ Enabling this option will allow you to control PaX features on ++ a per executable basis via the 'chpax' utility available at ++ http://pax.grsecurity.net/. The control flags will be read from ++ an otherwise reserved part of the ELF header. This marking has ++ numerous drawbacks (no support for soft-mode, toolchain does not ++ know about the non-standard use of the ELF header) therefore it ++ has been deprecated in favour of PT_PAX_FLAGS support. ++ ++ If you have applications not marked by the PT_PAX_FLAGS ELF ++ program header then you MUST enable this option otherwise they ++ will not get any protection. ++ ++ Note that if you enable PT_PAX_FLAGS marking support as well, ++ the PT_PAX_FLAG marks will override the legacy EI_PAX marks. ++ ++config PAX_PT_PAX_FLAGS ++ bool 'Use ELF program header marking' ++ help ++ Enabling this option will allow you to control PaX features on ++ a per executable basis via the 'paxctl' utility available at ++ http://pax.grsecurity.net/. The control flags will be read from ++ a PaX specific ELF program header (PT_PAX_FLAGS). This marking ++ has the benefits of supporting both soft mode and being fully ++ integrated into the toolchain (the binutils patch is available ++ from http://pax.grsecurity.net). ++ ++ If you have applications not marked by the PT_PAX_FLAGS ELF ++ program header then you MUST enable the EI_PAX marking support ++ otherwise they will not get any protection. ++ ++ Note that if you enable the legacy EI_PAX marking support as well, ++ the EI_PAX marks will be overridden by the PT_PAX_FLAGS marks. ++ ++choice ++ prompt 'MAC system integration' ++ default PAX_HAVE_ACL_FLAGS ++ help ++ Mandatory Access Control systems have the option of controlling ++ PaX flags on a per executable basis, choose the method supported ++ by your particular system. 
++ ++ - "none": if your MAC system does not interact with PaX, ++ - "direct": if your MAC system defines pax_set_initial_flags() itself, ++ - "hook": if your MAC system uses the pax_set_initial_flags_func callback. ++ ++ NOTE: this option is for developers/integrators only. ++ ++ config PAX_NO_ACL_FLAGS ++ bool 'none' ++ ++ config PAX_HAVE_ACL_FLAGS ++ bool 'direct' ++ ++ config PAX_HOOK_ACL_FLAGS ++ bool 'hook' ++endchoice ++ ++endmenu ++ ++menu "Non-executable pages" ++ depends on PAX ++ ++config PAX_NOEXEC ++ bool "Enforce non-executable pages" ++ depends on (PAX_EI_PAX || PAX_PT_PAX_FLAGS || PAX_HAVE_ACL_FLAGS || PAX_HOOK_ACL_FLAGS) && (ALPHA || (ARM && (CPU_V6 || CPU_V7)) || IA64 || MIPS || PARISC || PPC || S390 || SPARC || X86) ++ help ++ By design some architectures do not allow for protecting memory ++ pages against execution or even if they do, Linux does not make ++ use of this feature. In practice this means that if a page is ++ readable (such as the stack or heap) it is also executable. ++ ++ There is a well known exploit technique that makes use of this ++ fact and a common programming mistake where an attacker can ++ introduce code of his choice somewhere in the attacked program's ++ memory (typically the stack or the heap) and then execute it. ++ ++ If the attacked program was running with different (typically ++ higher) privileges than that of the attacker, then he can elevate ++ his own privilege level (e.g. get a root shell, write to files for ++ which he does not have write access to, etc). ++ ++ Enabling this option will let you choose from various features ++ that prevent the injection and execution of 'foreign' code in ++ a program. ++ ++ This will also break programs that rely on the old behaviour and ++ expect that dynamically allocated memory via the malloc() family ++ of functions is executable (which it is not). Notable examples ++ are the XFree86 4.x server, the java runtime and wine. ++ ++config PAX_PAGEEXEC ++ bool "Paging based non-executable pages" ++ depends on PAX_NOEXEC && (!X86_32 || M586 || M586TSC || M586MMX || M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC || MK7 || MK8 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MVIAC3_2 || MVIAC7) ++ select S390_SWITCH_AMODE if S390 ++ select S390_EXEC_PROTECT if S390 ++ help ++ This implementation is based on the paging feature of the CPU. ++ On i386 without hardware non-executable bit support there is a ++ variable but usually low performance impact, however on Intel's ++ P4 core based CPUs it is very high so you should not enable this ++ for kernels meant to be used on such CPUs. ++ ++ On alpha, avr32, ia64, parisc, sparc, sparc64, x86_64 and i386 ++ with hardware non-executable bit support there is no performance ++ impact, on ppc the impact is negligible. ++ ++ Note that several architectures require various emulations due to ++ badly designed userland ABIs, this will cause a performance impact ++ but will disappear as soon as userland is fixed. For example, ppc ++ userland MUST have been built with secure-plt by a recent toolchain. ++ ++config PAX_SEGMEXEC ++ bool "Segmentation based non-executable pages" ++ depends on PAX_NOEXEC && X86_32 ++ help ++ This implementation is based on the segmentation feature of the ++ CPU and has a very small performance impact, however applications ++ will be limited to a 1.5 GB address space instead of the normal ++ 3 GB. 
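As an aside on the non-executable page options above: the following minimal userspace sketch (an editor's illustration, not part of this patch; the file name nxdemo.c is hypothetical and x86 is assumed) shows the class of behaviour that PAGEEXEC/SEGMEXEC enforcement is meant to stop. It copies a single "ret" instruction into a PROT_READ|PROT_WRITE anonymous mapping and calls into it; on a kernel that enforces non-executable pages the indirect call should die with SIGSEGV, while on a permissive kernel the final message is printed.

/* nxdemo.c - hypothetical illustration, not part of this patch.
 * Build and run: cc -o nxdemo nxdemo.c && ./nxdemo
 * Assumes x86/x86_64 (0xc3 is the "ret" opcode). */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	static const unsigned char ret_insn = 0xc3;
	void (*func)(void);
	void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (page == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Place executable bytes in a mapping that was never PROT_EXEC. */
	memcpy(page, &ret_insn, sizeof(ret_insn));
	func = (void (*)(void))page;

	/* With PAGEEXEC/SEGMEXEC (or plain hardware NX) active, this
	 * call faults; without it, the "ret" executes and we return. */
	func();

	puts("data page was executed: no non-executable page enforcement");
	return 0;
}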
++ ++config PAX_EMUTRAMP ++ bool "Emulate trampolines" if (PAX_PAGEEXEC || PAX_SEGMEXEC) && (PARISC || X86) ++ default y if PARISC ++ help ++ There are some programs and libraries that for one reason or ++ another attempt to execute special small code snippets from ++ non-executable memory pages. Most notable examples are the ++ signal handler return code generated by the kernel itself and ++ the GCC trampolines. ++ ++ If you enabled CONFIG_PAX_PAGEEXEC or CONFIG_PAX_SEGMEXEC then ++ such programs will no longer work under your kernel. ++ ++ As a remedy you can say Y here and use the 'chpax' or 'paxctl' ++ utilities to enable trampoline emulation for the affected programs ++ yet still have the protection provided by the non-executable pages. ++ ++ On parisc you MUST enable this option and EMUSIGRT as well, otherwise ++ your system will not even boot. ++ ++ Alternatively you can say N here and use the 'chpax' or 'paxctl' ++ utilities to disable CONFIG_PAX_PAGEEXEC and CONFIG_PAX_SEGMEXEC ++ for the affected files. ++ ++ NOTE: enabling this feature *may* open up a loophole in the ++ protection provided by non-executable pages that an attacker ++ could abuse. Therefore the best solution is to not have any ++ files on your system that would require this option. This can ++ be achieved by not using libc5 (which relies on the kernel ++ signal handler return code) and not using or rewriting programs ++ that make use of the nested function implementation of GCC. ++ Skilled users can just fix GCC itself so that it implements ++ nested function calls in a way that does not interfere with PaX. ++ ++config PAX_EMUSIGRT ++ bool "Automatically emulate sigreturn trampolines" ++ depends on PAX_EMUTRAMP && PARISC ++ default y ++ help ++ Enabling this option will have the kernel automatically detect ++ and emulate signal return trampolines executing on the stack ++ that would otherwise lead to task termination. ++ ++ This solution is intended as a temporary one for users with ++ legacy versions of libc (libc5, glibc 2.0, uClibc before 0.9.17, ++ Modula-3 runtime, etc) or executables linked to such, basically ++ everything that does not specify its own SA_RESTORER function in ++ normal executable memory like glibc 2.1+ does. ++ ++ On parisc you MUST enable this option, otherwise your system will ++ not even boot. ++ ++ NOTE: this feature cannot be disabled on a per executable basis ++ and since it *does* open up a loophole in the protection provided ++ by non-executable pages, the best solution is to not have any ++ files on your system that would require this option. ++ ++config PAX_MPROTECT ++ bool "Restrict mprotect()" ++ depends on (PAX_PAGEEXEC || PAX_SEGMEXEC) ++ help ++ Enabling this option will prevent programs from ++ - changing the executable status of memory pages that were ++ not originally created as executable, ++ - making read-only executable pages writable again, ++ - creating executable pages from anonymous memory. ++ ++ You should say Y here to complete the protection provided by ++ the enforcement of non-executable pages. ++ ++ NOTE: you can use the 'chpax' or 'paxctl' utilities to control ++ this feature on a per file basis. ++ ++config PAX_NOELFRELOCS ++ bool "Disallow ELF text relocations" ++ depends on PAX_MPROTECT && !PAX_ETEXECRELOCS && (IA64 || PPC || X86) ++ help ++ Non-executable pages and mprotect() restrictions are effective ++ in preventing the introduction of new executable code into an ++ attacked task's address space. 
There remain only two venues ++ for this kind of attack: if the attacker can execute already ++ existing code in the attacked task then he can either have it ++ create and mmap() a file containing his code or have it mmap() ++ an already existing ELF library that does not have position ++ independent code in it and use mprotect() on it to make it ++ writable and copy his code there. While protecting against ++ the former approach is beyond PaX, the latter can be prevented ++ by having only PIC ELF libraries on one's system (which do not ++ need to relocate their code). If you are sure this is your case, ++ then enable this option otherwise be careful as you may not even ++ be able to boot or log on your system (for example, some PAM ++ modules are erroneously compiled as non-PIC by default). ++ ++ NOTE: if you are using dynamic ELF executables (as suggested ++ when using ASLR) then you must have made sure that you linked ++ your files using the PIC version of crt1 (the et_dyn.tar.gz package ++ referenced there has already been updated to support this). ++ ++config PAX_ETEXECRELOCS ++ bool "Allow ELF ET_EXEC text relocations" ++ depends on PAX_MPROTECT && (ALPHA || IA64 || PARISC) ++ default y ++ help ++ On some architectures there are incorrectly created applications ++ that require text relocations and would not work without enabling ++ this option. If you are an alpha, ia64 or parisc user, you should ++ enable this option and disable it once you have made sure that ++ none of your applications need it. ++ ++config PAX_EMUPLT ++ bool "Automatically emulate ELF PLT" ++ depends on PAX_MPROTECT && (ALPHA || PARISC || SPARC) ++ default y ++ help ++ Enabling this option will have the kernel automatically detect ++ and emulate the Procedure Linkage Table entries in ELF files. ++ On some architectures such entries are in writable memory, and ++ become non-executable leading to task termination. Therefore ++ it is mandatory that you enable this option on alpha, parisc, ++ sparc and sparc64, otherwise your system would not even boot. ++ ++ NOTE: this feature *does* open up a loophole in the protection ++ provided by the non-executable pages, therefore the proper ++ solution is to modify the toolchain to produce a PLT that does ++ not need to be writable. ++ ++config PAX_DLRESOLVE ++ bool 'Emulate old glibc resolver stub' ++ depends on PAX_EMUPLT && SPARC ++ default n ++ help ++ This option is needed if userland has an old glibc (before 2.4) ++ that puts a 'save' instruction into the runtime generated resolver ++ stub that needs special emulation. ++ ++config PAX_KERNEXEC ++ bool "Enforce non-executable kernel pages" ++ depends on PAX_NOEXEC && (PPC || X86) && (!X86_32 || X86_WP_WORKS_OK) && !XEN ++ help ++ This is the kernel land equivalent of PAGEEXEC and MPROTECT, ++ that is, enabling this option will make it harder to inject ++ and execute 'foreign' code in kernel memory itself. ++ ++config PAX_KERNEXEC_MODULE_TEXT ++ int "Minimum amount of memory reserved for module code" ++ default "4" ++ depends on PAX_KERNEXEC && X86_32 && MODULES ++ help ++ Due to implementation details the kernel must reserve a fixed ++ amount of memory for module code at compile time that cannot be ++ changed at runtime. Here you can specify the minimum amount ++ in MB that will be reserved. Due to the same implementation ++ details this size will always be rounded up to the next 2/4 MB ++ boundary (depends on PAE) so the actually available memory for ++ module code will usually be more than this minimum. 
++ ++ The default 4 MB should be enough for most users but if you have ++ an excessive number of modules (e.g., most distribution configs ++ compile many drivers as modules) or use huge modules such as ++ nvidia's kernel driver, you will need to adjust this amount. ++ A good rule of thumb is to look at your currently loaded kernel ++ modules and add up their sizes. ++ ++endmenu ++ ++menu "Address Space Layout Randomization" ++ depends on PAX ++ ++config PAX_ASLR ++ bool "Address Space Layout Randomization" ++ depends on PAX_EI_PAX || PAX_PT_PAX_FLAGS || PAX_HAVE_ACL_FLAGS || PAX_HOOK_ACL_FLAGS ++ help ++ Many if not most exploit techniques rely on the knowledge of ++ certain addresses in the attacked program. The following options ++ will allow the kernel to apply a certain amount of randomization ++ to specific parts of the program thereby forcing an attacker to ++ guess them in most cases. Any failed guess will most likely crash ++ the attacked program which allows the kernel to detect such attempts ++ and react on them. PaX itself provides no reaction mechanisms, ++ instead it is strongly encouraged that you make use of Nergal's ++ segvguard (ftp://ftp.pl.openwall.com/misc/segvguard/) or grsecurity's ++ (http://www.grsecurity.net/) built-in crash detection features or ++ develop one yourself. ++ ++ By saying Y here you can choose to randomize the following areas: ++ - top of the task's kernel stack ++ - top of the task's userland stack ++ - base address for mmap() requests that do not specify one ++ (this includes all libraries) ++ - base address of the main executable ++ ++ It is strongly recommended to say Y here as address space layout ++ randomization has negligible impact on performance yet it provides ++ a very effective protection. ++ ++ NOTE: you can use the 'chpax' or 'paxctl' utilities to control ++ this feature on a per file basis. ++ ++config PAX_RANDKSTACK ++ bool "Randomize kernel stack base" ++ depends on PAX_ASLR && X86_TSC && X86_32 ++ help ++ By saying Y here the kernel will randomize every task's kernel ++ stack on every system call. This will not only force an attacker ++ to guess it but also prevent him from making use of possible ++ leaked information about it. ++ ++ Since the kernel stack is a rather scarce resource, randomization ++ may cause unexpected stack overflows, therefore you should very ++ carefully test your system. Note that once enabled in the kernel ++ configuration, this feature cannot be disabled on a per file basis. ++ ++config PAX_RANDUSTACK ++ bool "Randomize user stack base" ++ depends on PAX_ASLR ++ help ++ By saying Y here the kernel will randomize every task's userland ++ stack. The randomization is done in two steps where the second ++ one may apply a big amount of shift to the top of the stack and ++ cause problems for programs that want to use lots of memory (more ++ than 2.5 GB if SEGMEXEC is not active, or 1.25 GB when it is). ++ For this reason the second step can be controlled by 'chpax' or ++ 'paxctl' on a per file basis. ++ ++config PAX_RANDMMAP ++ bool "Randomize mmap() base" ++ depends on PAX_ASLR ++ help ++ By saying Y here the kernel will use a randomized base address for ++ mmap() requests that do not specify one themselves. As a result ++ all dynamically loaded libraries will appear at random addresses ++ and therefore be harder to exploit by a technique where an attacker ++ attempts to execute library code for his purposes (e.g. spawn a ++ shell from an exploited program that is running at an elevated ++ privilege level). 
++ ++ Furthermore, if a program is relinked as a dynamic ELF file, its ++ base address will be randomized as well, completing the full ++ randomization of the address space layout. Attacking such programs ++ becomes a guess game. You can find an example of doing this at ++ http://pax.grsecurity.net/et_dyn.tar.gz and practical samples at ++ http://www.grsecurity.net/grsec-gcc-specs.tar.gz . ++ ++ NOTE: you can use the 'chpax' or 'paxctl' utilities to control this ++ feature on a per file basis. ++ ++endmenu ++ ++menu "Miscellaneous hardening features" ++ ++config PAX_MEMORY_SANITIZE ++ bool "Sanitize all freed memory" ++ help ++ By saying Y here the kernel will erase memory pages as soon as they ++ are freed. This in turn reduces the lifetime of data stored in the ++ pages, making it less likely that sensitive information such as ++ passwords, cryptographic secrets, etc stay in memory for too long. ++ ++ This is especially useful for programs whose runtime is short, long ++ lived processes and the kernel itself benefit from this as long as ++ they operate on whole memory pages and ensure timely freeing of pages ++ that may hold sensitive information. ++ ++ The tradeoff is performance impact, on a single CPU system kernel ++ compilation sees a 3% slowdown, other systems and workloads may vary ++ and you are advised to test this feature on your expected workload ++ before deploying it. ++ ++ Note that this feature does not protect data stored in live pages, ++ e.g., process memory swapped to disk may stay there for a long time. ++ ++config PAX_MEMORY_UDEREF ++ bool "Prevent invalid userland pointer dereference" ++ depends on X86_32 && !UML_X86 && !XEN ++ help ++ By saying Y here the kernel will be prevented from dereferencing ++ userland pointers in contexts where the kernel expects only kernel ++ pointers. This is both a useful runtime debugging feature and a ++ security measure that prevents exploiting a class of kernel bugs. ++ ++ The tradeoff is that some virtualization solutions may experience ++ a huge slowdown and therefore you should not enable this feature ++ for kernels meant to run in such environments. Whether a given VM ++ solution is affected or not is best determined by simply trying it ++ out, the performance impact will be obvious right on boot as this ++ mechanism engages from very early on. A good rule of thumb is that ++ VMs running on CPUs without hardware virtualization support (i.e., ++ the majority of IA-32 CPUs) will likely experience the slowdown. ++ ++config PAX_REFCOUNT ++ bool "Prevent various kernel object reference counter overflows" ++ depends on GRKERNSEC && (X86 || SPARC64) ++ help ++ By saying Y here the kernel will detect and prevent overflowing ++ various (but not all) kinds of object reference counters. Such ++ overflows can normally occur due to bugs only and are often, if ++ not always, exploitable. ++ ++ The tradeoff is that data structures protected by an overflowed ++ refcount will never be freed and therefore will leak memory. Note ++ that this leak also happens even without this protection but in ++ that case the overflow can eventually trigger the freeing of the ++ data structure while it is still being used elsewhere, resulting ++ in the exploitable situation that this feature prevents. ++ ++ Since this has a negligible performance impact, you should enable ++ this feature. 
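To make the PAX_REFCOUNT rationale above concrete (and the reason this patch converts so many plain-int open_count fields to atomic_t elsewhere), here is a small sketch in plain C; it is an editor's illustration, not part of the patch, and the file name refwrap.c is hypothetical. Driving a signed counter past INT_MAX wraps it negative, so a later drop-to-zero test releases an object that is still referenced. Strictly speaking, signed overflow is undefined behaviour in C; the wrap shown is the two's-complement result the kernel relies on by building with -fno-strict-overflow.

/* refwrap.c - hypothetical illustration, not part of this patch. */
#include <limits.h>
#include <stdio.h>

struct object {
	int refcount;	/* plain counter: no overflow detection */
};

int main(void)
{
	struct object obj = { .refcount = INT_MAX };

	obj.refcount++;	/* one reference too many: wraps to INT_MIN */
	printf("refcount after overflow: %d\n", obj.refcount);

	if (obj.refcount <= 0)
		puts("a drop-to-zero path would now free the object "
		     "while it is still in use (use-after-free)");

	return 0;
}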
++ ++config PAX_USERCOPY ++ bool "Bounds check heap object copies between kernel and userland" ++ depends on X86 || PPC || SPARC ++ depends on GRKERNSEC && (SLAB || SLUB || SLOB) ++ help ++ By saying Y here the kernel will enforce the size of heap objects ++ when they are copied in either direction between the kernel and ++ userland, even if only a part of the heap object is copied. ++ ++ Specifically, this checking prevents information leaking from the ++ kernel heap during kernel to userland copies (if the kernel heap ++ object is otherwise fully initialized) and prevents kernel heap ++ overflows during userland to kernel copies. ++ ++ Note that the current implementation provides the strictest checks ++ for the SLUB allocator. ++ ++ Since this has a negligible performance impact, you should enable ++ this feature. ++endmenu ++ ++endmenu ++ + config KEYS + bool "Enable access key retention support" + help +@@ -124,7 +606,7 @@ config INTEL_TXT + config LSM_MMAP_MIN_ADDR + int "Low address space for LSM to protect from user allocation" + depends on SECURITY && SECURITY_SELINUX +- default 65536 ++ default 32768 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages +diff -urNp linux-2.6.33.1/security/min_addr.c linux-2.6.33.1/security/min_addr.c +--- linux-2.6.33.1/security/min_addr.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/security/min_addr.c 2010-03-20 16:58:42.348544513 -0400 +@@ -14,6 +14,7 @@ unsigned long dac_mmap_min_addr = CONFIG + */ + static void update_mmap_min_addr(void) + { ++#ifndef SPARC + #ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + mmap_min_addr = dac_mmap_min_addr; +@@ -22,6 +23,7 @@ static void update_mmap_min_addr(void) + #else + mmap_min_addr = dac_mmap_min_addr; + #endif ++#endif + } + + /* +diff -urNp linux-2.6.33.1/sound/aoa/codecs/onyx.c linux-2.6.33.1/sound/aoa/codecs/onyx.c +--- linux-2.6.33.1/sound/aoa/codecs/onyx.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/aoa/codecs/onyx.c 2010-03-20 16:58:42.365465627 -0400 +@@ -53,7 +53,7 @@ struct onyx { + spdif_locked:1, + analog_locked:1, + original_mute:2; +- int open_count; ++ atomic_t open_count; + struct codec_info *codec_info; + + /* mutex serializes concurrent access to the device +@@ -752,7 +752,7 @@ static int onyx_open(struct codec_info_i + struct onyx *onyx = cii->codec_data; + + mutex_lock(&onyx->mutex); +- onyx->open_count++; ++ atomic_inc(&onyx->open_count); + mutex_unlock(&onyx->mutex); + + return 0; +@@ -764,8 +764,7 @@ static int onyx_close(struct codec_info_ + struct onyx *onyx = cii->codec_data; + + mutex_lock(&onyx->mutex); +- onyx->open_count--; +- if (!onyx->open_count) ++ if (atomic_dec_and_test(&onyx->open_count)) + onyx->spdif_locked = onyx->analog_locked = 0; + mutex_unlock(&onyx->mutex); + +diff -urNp linux-2.6.33.1/sound/core/oss/pcm_oss.c linux-2.6.33.1/sound/core/oss/pcm_oss.c +--- linux-2.6.33.1/sound/core/oss/pcm_oss.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/core/oss/pcm_oss.c 2010-03-20 16:58:42.372956563 -0400 +@@ -2949,8 +2949,8 @@ static void snd_pcm_oss_proc_done(struct + } + } + #else /* !CONFIG_SND_VERBOSE_PROCFS */ +-#define snd_pcm_oss_proc_init(pcm) +-#define snd_pcm_oss_proc_done(pcm) ++#define snd_pcm_oss_proc_init(pcm) do {} while (0) ++#define snd_pcm_oss_proc_done(pcm) do {} while (0) + #endif /* CONFIG_SND_VERBOSE_PROCFS */ + + /* +diff -urNp linux-2.6.33.1/sound/core/seq/seq_lock.h 
linux-2.6.33.1/sound/core/seq/seq_lock.h +--- linux-2.6.33.1/sound/core/seq/seq_lock.h 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/core/seq/seq_lock.h 2010-03-20 16:58:42.376941145 -0400 +@@ -23,10 +23,10 @@ void snd_use_lock_sync_helper(snd_use_lo + #else /* SMP || CONFIG_SND_DEBUG */ + + typedef spinlock_t snd_use_lock_t; /* dummy */ +-#define snd_use_lock_init(lockp) /**/ +-#define snd_use_lock_use(lockp) /**/ +-#define snd_use_lock_free(lockp) /**/ +-#define snd_use_lock_sync(lockp) /**/ ++#define snd_use_lock_init(lockp) do {} while (0) ++#define snd_use_lock_use(lockp) do {} while (0) ++#define snd_use_lock_free(lockp) do {} while (0) ++#define snd_use_lock_sync(lockp) do {} while (0) + + #endif /* SMP || CONFIG_SND_DEBUG */ + +diff -urNp linux-2.6.33.1/sound/drivers/mts64.c linux-2.6.33.1/sound/drivers/mts64.c +--- linux-2.6.33.1/sound/drivers/mts64.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/drivers/mts64.c 2010-03-20 16:58:42.380965540 -0400 +@@ -65,7 +65,7 @@ struct mts64 { + struct pardevice *pardev; + int pardev_claimed; + +- int open_count; ++ atomic_t open_count; + int current_midi_output_port; + int current_midi_input_port; + u8 mode[MTS64_NUM_INPUT_PORTS]; +@@ -695,7 +695,7 @@ static int snd_mts64_rawmidi_open(struct + { + struct mts64 *mts = substream->rmidi->private_data; + +- if (mts->open_count == 0) { ++ if (atomic_read(&mts->open_count) == 0) { + /* We don't need a spinlock here, because this is just called + if the device has not been opened before. + So there aren't any IRQs from the device */ +@@ -703,7 +703,7 @@ static int snd_mts64_rawmidi_open(struct + + msleep(50); + } +- ++(mts->open_count); ++ atomic_inc(&mts->open_count); + + return 0; + } +@@ -713,8 +713,7 @@ static int snd_mts64_rawmidi_close(struc + struct mts64 *mts = substream->rmidi->private_data; + unsigned long flags; + +- --(mts->open_count); +- if (mts->open_count == 0) { ++ if (atomic_dec_return(&mts->open_count) == 0) { + /* We need the spinlock_irqsave here because we can still + have IRQs at this point */ + spin_lock_irqsave(&mts->lock, flags); +@@ -723,8 +722,8 @@ static int snd_mts64_rawmidi_close(struc + + msleep(500); + +- } else if (mts->open_count < 0) +- mts->open_count = 0; ++ } else if (atomic_read(&mts->open_count) < 0) ++ atomic_set(&mts->open_count, 0); + + return 0; + } +diff -urNp linux-2.6.33.1/sound/drivers/portman2x4.c linux-2.6.33.1/sound/drivers/portman2x4.c +--- linux-2.6.33.1/sound/drivers/portman2x4.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/drivers/portman2x4.c 2010-03-20 16:58:42.396942474 -0400 +@@ -83,7 +83,7 @@ struct portman { + struct pardevice *pardev; + int pardev_claimed; + +- int open_count; ++ atomic_t open_count; + int mode[PORTMAN_NUM_INPUT_PORTS]; + struct snd_rawmidi_substream *midi_input[PORTMAN_NUM_INPUT_PORTS]; + }; +diff -urNp linux-2.6.33.1/sound/oss/sb_audio.c linux-2.6.33.1/sound/oss/sb_audio.c +--- linux-2.6.33.1/sound/oss/sb_audio.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/oss/sb_audio.c 2010-03-20 16:58:42.404957454 -0400 +@@ -901,7 +901,7 @@ sb16_copy_from_user(int dev, + buf16 = (signed short *)(localbuf + localoffs); + while (c) + { +- locallen = (c >= LBUFCOPYSIZE ? LBUFCOPYSIZE : c); ++ locallen = ((unsigned)c >= LBUFCOPYSIZE ? 
LBUFCOPYSIZE : c); + if (copy_from_user(lbuf8, + userbuf+useroffs + p, + locallen)) +diff -urNp linux-2.6.33.1/sound/pci/ac97/ac97_codec.c linux-2.6.33.1/sound/pci/ac97/ac97_codec.c +--- linux-2.6.33.1/sound/pci/ac97/ac97_codec.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/pci/ac97/ac97_codec.c 2010-03-20 16:58:42.416962707 -0400 +@@ -1962,7 +1962,7 @@ static int snd_ac97_dev_disconnect(struc + } + + /* build_ops to do nothing */ +-static struct snd_ac97_build_ops null_build_ops; ++static const struct snd_ac97_build_ops null_build_ops; + + #ifdef CONFIG_SND_AC97_POWER_SAVE + static void do_update_power(struct work_struct *work) +diff -urNp linux-2.6.33.1/sound/pci/ac97/ac97_patch.c linux-2.6.33.1/sound/pci/ac97/ac97_patch.c +--- linux-2.6.33.1/sound/pci/ac97/ac97_patch.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/pci/ac97/ac97_patch.c 2010-03-20 16:58:42.440672764 -0400 +@@ -371,7 +371,7 @@ static int patch_yamaha_ymf743_build_spd + return 0; + } + +-static struct snd_ac97_build_ops patch_yamaha_ymf743_ops = { ++static const struct snd_ac97_build_ops patch_yamaha_ymf743_ops = { + .build_spdif = patch_yamaha_ymf743_build_spdif, + .build_3d = patch_yamaha_ymf7x3_3d, + }; +@@ -455,7 +455,7 @@ static int patch_yamaha_ymf753_post_spdi + return 0; + } + +-static struct snd_ac97_build_ops patch_yamaha_ymf753_ops = { ++static const struct snd_ac97_build_ops patch_yamaha_ymf753_ops = { + .build_3d = patch_yamaha_ymf7x3_3d, + .build_post_spdif = patch_yamaha_ymf753_post_spdif + }; +@@ -502,7 +502,7 @@ static int patch_wolfson_wm9703_specific + return 0; + } + +-static struct snd_ac97_build_ops patch_wolfson_wm9703_ops = { ++static const struct snd_ac97_build_ops patch_wolfson_wm9703_ops = { + .build_specific = patch_wolfson_wm9703_specific, + }; + +@@ -533,7 +533,7 @@ static int patch_wolfson_wm9704_specific + return 0; + } + +-static struct snd_ac97_build_ops patch_wolfson_wm9704_ops = { ++static const struct snd_ac97_build_ops patch_wolfson_wm9704_ops = { + .build_specific = patch_wolfson_wm9704_specific, + }; + +@@ -555,7 +555,7 @@ static int patch_wolfson_wm9705_specific + return 0; + } + +-static struct snd_ac97_build_ops patch_wolfson_wm9705_ops = { ++static const struct snd_ac97_build_ops patch_wolfson_wm9705_ops = { + .build_specific = patch_wolfson_wm9705_specific, + }; + +@@ -692,7 +692,7 @@ static int patch_wolfson_wm9711_specific + return 0; + } + +-static struct snd_ac97_build_ops patch_wolfson_wm9711_ops = { ++static const struct snd_ac97_build_ops patch_wolfson_wm9711_ops = { + .build_specific = patch_wolfson_wm9711_specific, + }; + +@@ -886,7 +886,7 @@ static void patch_wolfson_wm9713_resume + } + #endif + +-static struct snd_ac97_build_ops patch_wolfson_wm9713_ops = { ++static const struct snd_ac97_build_ops patch_wolfson_wm9713_ops = { + .build_specific = patch_wolfson_wm9713_specific, + .build_3d = patch_wolfson_wm9713_3d, + #ifdef CONFIG_PM +@@ -991,7 +991,7 @@ static int patch_sigmatel_stac97xx_speci + return 0; + } + +-static struct snd_ac97_build_ops patch_sigmatel_stac9700_ops = { ++static const struct snd_ac97_build_ops patch_sigmatel_stac9700_ops = { + .build_3d = patch_sigmatel_stac9700_3d, + .build_specific = patch_sigmatel_stac97xx_specific + }; +@@ -1038,7 +1038,7 @@ static int patch_sigmatel_stac9708_speci + return patch_sigmatel_stac97xx_specific(ac97); + } + +-static struct snd_ac97_build_ops patch_sigmatel_stac9708_ops = { ++static const struct snd_ac97_build_ops patch_sigmatel_stac9708_ops = { + .build_3d = 
patch_sigmatel_stac9708_3d, + .build_specific = patch_sigmatel_stac9708_specific + }; +@@ -1267,7 +1267,7 @@ static int patch_sigmatel_stac9758_speci + return 0; + } + +-static struct snd_ac97_build_ops patch_sigmatel_stac9758_ops = { ++static const struct snd_ac97_build_ops patch_sigmatel_stac9758_ops = { + .build_3d = patch_sigmatel_stac9700_3d, + .build_specific = patch_sigmatel_stac9758_specific + }; +@@ -1342,7 +1342,7 @@ static int patch_cirrus_build_spdif(stru + return 0; + } + +-static struct snd_ac97_build_ops patch_cirrus_ops = { ++static const struct snd_ac97_build_ops patch_cirrus_ops = { + .build_spdif = patch_cirrus_build_spdif + }; + +@@ -1399,7 +1399,7 @@ static int patch_conexant_build_spdif(st + return 0; + } + +-static struct snd_ac97_build_ops patch_conexant_ops = { ++static const struct snd_ac97_build_ops patch_conexant_ops = { + .build_spdif = patch_conexant_build_spdif + }; + +@@ -1501,7 +1501,7 @@ static const struct snd_ac97_res_table a + { AC97_VIDEO, 0x9f1f }, + { AC97_AUX, 0x9f1f }, + { AC97_PCM, 0x9f1f }, +- { } /* terminator */ ++ { 0, 0 } /* terminator */ + }; + + static int patch_ad1819(struct snd_ac97 * ac97) +@@ -1575,7 +1575,7 @@ static void patch_ad1881_chained(struct + } + } + +-static struct snd_ac97_build_ops patch_ad1881_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1881_build_ops = { + #ifdef CONFIG_PM + .resume = ad18xx_resume + #endif +@@ -1662,7 +1662,7 @@ static int patch_ad1885_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_ad1885_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1885_build_ops = { + .build_specific = &patch_ad1885_specific, + #ifdef CONFIG_PM + .resume = ad18xx_resume +@@ -1689,7 +1689,7 @@ static int patch_ad1886_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_ad1886_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1886_build_ops = { + .build_specific = &patch_ad1886_specific, + #ifdef CONFIG_PM + .resume = ad18xx_resume +@@ -1894,7 +1894,7 @@ static int patch_ad1981a_specific(struct + ARRAY_SIZE(snd_ac97_ad1981x_jack_sense)); + } + +-static struct snd_ac97_build_ops patch_ad1981a_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1981a_build_ops = { + .build_post_spdif = patch_ad198x_post_spdif, + .build_specific = patch_ad1981a_specific, + #ifdef CONFIG_PM +@@ -1949,7 +1949,7 @@ static int patch_ad1981b_specific(struct + ARRAY_SIZE(snd_ac97_ad1981x_jack_sense)); + } + +-static struct snd_ac97_build_ops patch_ad1981b_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1981b_build_ops = { + .build_post_spdif = patch_ad198x_post_spdif, + .build_specific = patch_ad1981b_specific, + #ifdef CONFIG_PM +@@ -2088,7 +2088,7 @@ static int patch_ad1888_specific(struct + return patch_build_controls(ac97, snd_ac97_ad1888_controls, ARRAY_SIZE(snd_ac97_ad1888_controls)); + } + +-static struct snd_ac97_build_ops patch_ad1888_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1888_build_ops = { + .build_post_spdif = patch_ad198x_post_spdif, + .build_specific = patch_ad1888_specific, + #ifdef CONFIG_PM +@@ -2137,7 +2137,7 @@ static int patch_ad1980_specific(struct + return patch_build_controls(ac97, &snd_ac97_ad198x_2cmic, 1); + } + +-static struct snd_ac97_build_ops patch_ad1980_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1980_build_ops = { + .build_post_spdif = patch_ad198x_post_spdif, + .build_specific = patch_ad1980_specific, + #ifdef CONFIG_PM +@@ -2252,7 +2252,7 @@ static int 
patch_ad1985_specific(struct + ARRAY_SIZE(snd_ac97_ad1985_controls)); + } + +-static struct snd_ac97_build_ops patch_ad1985_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1985_build_ops = { + .build_post_spdif = patch_ad198x_post_spdif, + .build_specific = patch_ad1985_specific, + #ifdef CONFIG_PM +@@ -2544,7 +2544,7 @@ static int patch_ad1986_specific(struct + ARRAY_SIZE(snd_ac97_ad1985_controls)); + } + +-static struct snd_ac97_build_ops patch_ad1986_build_ops = { ++static const struct snd_ac97_build_ops patch_ad1986_build_ops = { + .build_post_spdif = patch_ad198x_post_spdif, + .build_specific = patch_ad1986_specific, + #ifdef CONFIG_PM +@@ -2649,7 +2649,7 @@ static int patch_alc650_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_alc650_ops = { ++static const struct snd_ac97_build_ops patch_alc650_ops = { + .build_specific = patch_alc650_specific, + .update_jacks = alc650_update_jacks + }; +@@ -2801,7 +2801,7 @@ static int patch_alc655_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_alc655_ops = { ++static const struct snd_ac97_build_ops patch_alc655_ops = { + .build_specific = patch_alc655_specific, + .update_jacks = alc655_update_jacks + }; +@@ -2913,7 +2913,7 @@ static int patch_alc850_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_alc850_ops = { ++static const struct snd_ac97_build_ops patch_alc850_ops = { + .build_specific = patch_alc850_specific, + .update_jacks = alc850_update_jacks + }; +@@ -2975,7 +2975,7 @@ static int patch_cm9738_specific(struct + return patch_build_controls(ac97, snd_ac97_cm9738_controls, ARRAY_SIZE(snd_ac97_cm9738_controls)); + } + +-static struct snd_ac97_build_ops patch_cm9738_ops = { ++static const struct snd_ac97_build_ops patch_cm9738_ops = { + .build_specific = patch_cm9738_specific, + .update_jacks = cm9738_update_jacks + }; +@@ -3066,7 +3066,7 @@ static int patch_cm9739_post_spdif(struc + return patch_build_controls(ac97, snd_ac97_cm9739_controls_spdif, ARRAY_SIZE(snd_ac97_cm9739_controls_spdif)); + } + +-static struct snd_ac97_build_ops patch_cm9739_ops = { ++static const struct snd_ac97_build_ops patch_cm9739_ops = { + .build_specific = patch_cm9739_specific, + .build_post_spdif = patch_cm9739_post_spdif, + .update_jacks = cm9739_update_jacks +@@ -3240,7 +3240,7 @@ static int patch_cm9761_specific(struct + return patch_build_controls(ac97, snd_ac97_cm9761_controls, ARRAY_SIZE(snd_ac97_cm9761_controls)); + } + +-static struct snd_ac97_build_ops patch_cm9761_ops = { ++static const struct snd_ac97_build_ops patch_cm9761_ops = { + .build_specific = patch_cm9761_specific, + .build_post_spdif = patch_cm9761_post_spdif, + .update_jacks = cm9761_update_jacks +@@ -3336,7 +3336,7 @@ static int patch_cm9780_specific(struct + return patch_build_controls(ac97, cm9780_controls, ARRAY_SIZE(cm9780_controls)); + } + +-static struct snd_ac97_build_ops patch_cm9780_ops = { ++static const struct snd_ac97_build_ops patch_cm9780_ops = { + .build_specific = patch_cm9780_specific, + .build_post_spdif = patch_cm9761_post_spdif /* identical with CM9761 */ + }; +@@ -3456,7 +3456,7 @@ static int patch_vt1616_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_vt1616_ops = { ++static const struct snd_ac97_build_ops patch_vt1616_ops = { + .build_specific = patch_vt1616_specific + }; + +@@ -3810,7 +3810,7 @@ static int patch_it2646_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_it2646_ops = { ++static const struct 
snd_ac97_build_ops patch_it2646_ops = { + .build_specific = patch_it2646_specific, + .update_jacks = it2646_update_jacks + }; +@@ -3844,7 +3844,7 @@ static int patch_si3036_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_si3036_ops = { ++static const struct snd_ac97_build_ops patch_si3036_ops = { + .build_specific = patch_si3036_specific, + }; + +@@ -3877,7 +3877,7 @@ static struct snd_ac97_res_table lm4550_ + { AC97_AUX, 0x1f1f }, + { AC97_PCM, 0x1f1f }, + { AC97_REC_GAIN, 0x0f0f }, +- { } /* terminator */ ++ { 0, 0 } /* terminator */ + }; + + static int patch_lm4550(struct snd_ac97 *ac97) +@@ -3911,7 +3911,7 @@ static int patch_ucb1400_specific(struct + return 0; + } + +-static struct snd_ac97_build_ops patch_ucb1400_ops = { ++static const struct snd_ac97_build_ops patch_ucb1400_ops = { + .build_specific = patch_ucb1400_specific, + }; + +diff -urNp linux-2.6.33.1/sound/pci/ens1370.c linux-2.6.33.1/sound/pci/ens1370.c +--- linux-2.6.33.1/sound/pci/ens1370.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/pci/ens1370.c 2010-03-20 16:58:42.452959690 -0400 +@@ -452,7 +452,7 @@ static struct pci_device_id snd_audiopci + { PCI_VDEVICE(ENSONIQ, 0x5880), 0, }, /* ES1373 - CT5880 */ + { PCI_VDEVICE(ECTIVA, 0x8938), 0, }, /* Ectiva EV1938 */ + #endif +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(pci, snd_audiopci_ids); +diff -urNp linux-2.6.33.1/sound/pci/intel8x0.c linux-2.6.33.1/sound/pci/intel8x0.c +--- linux-2.6.33.1/sound/pci/intel8x0.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/pci/intel8x0.c 2010-03-20 16:58:42.464956802 -0400 +@@ -444,7 +444,7 @@ static struct pci_device_id snd_intel8x0 + { PCI_VDEVICE(AMD, 0x746d), DEVICE_INTEL }, /* AMD8111 */ + { PCI_VDEVICE(AMD, 0x7445), DEVICE_INTEL }, /* AMD768 */ + { PCI_VDEVICE(AL, 0x5455), DEVICE_ALI }, /* Ali5455 */ +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(pci, snd_intel8x0_ids); +@@ -2129,7 +2129,7 @@ static struct ac97_quirk ac97_quirks[] _ + .type = AC97_TUNE_HP_ONLY + }, + #endif +- { } /* terminator */ ++ { 0, 0, 0, 0, NULL, 0 } /* terminator */ + }; + + static int __devinit snd_intel8x0_mixer(struct intel8x0 *chip, int ac97_clock, +diff -urNp linux-2.6.33.1/sound/pci/intel8x0m.c linux-2.6.33.1/sound/pci/intel8x0m.c +--- linux-2.6.33.1/sound/pci/intel8x0m.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/sound/pci/intel8x0m.c 2010-03-20 16:58:42.469451186 -0400 +@@ -239,7 +239,7 @@ static struct pci_device_id snd_intel8x0 + { PCI_VDEVICE(AMD, 0x746d), DEVICE_INTEL }, /* AMD8111 */ + { PCI_VDEVICE(AL, 0x5455), DEVICE_ALI }, /* Ali5455 */ + #endif +- { 0, } ++ { 0, 0, 0, 0, 0, 0, 0 } + }; + + MODULE_DEVICE_TABLE(pci, snd_intel8x0m_ids); +@@ -1264,7 +1264,7 @@ static struct shortname_table { + { 0x5455, "ALi M5455" }, + { 0x746d, "AMD AMD8111" }, + #endif +- { 0 }, ++ { 0, NULL }, + }; + + static int __devinit snd_intel8x0m_probe(struct pci_dev *pci, +diff -urNp linux-2.6.33.1/usr/gen_init_cpio.c linux-2.6.33.1/usr/gen_init_cpio.c +--- linux-2.6.33.1/usr/gen_init_cpio.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/usr/gen_init_cpio.c 2010-03-20 16:58:42.469451186 -0400 +@@ -299,7 +299,7 @@ static int cpio_mkfile(const char *name, + int retval; + int rc = -1; + int namesize; +- int i; ++ unsigned int i; + + mode |= S_IFREG; + +@@ -386,9 +386,10 @@ static char *cpio_replace_env(char *new_ + *env_var = *expanded = '\0'; + strncat(env_var, start + 2, end - start - 2); + strncat(expanded, new_location, start - new_location); +- 
strncat(expanded, getenv(env_var), PATH_MAX); +- strncat(expanded, end + 1, PATH_MAX); ++ strncat(expanded, getenv(env_var), PATH_MAX - strlen(expanded)); ++ strncat(expanded, end + 1, PATH_MAX - strlen(expanded)); + strncpy(new_location, expanded, PATH_MAX); ++ new_location[PATH_MAX] = 0; + } else + break; + } +diff -urNp linux-2.6.33.1/virt/kvm/kvm_main.c linux-2.6.33.1/virt/kvm/kvm_main.c +--- linux-2.6.33.1/virt/kvm/kvm_main.c 2010-03-15 12:09:39.000000000 -0400 ++++ linux-2.6.33.1/virt/kvm/kvm_main.c 2010-03-20 16:58:42.489458390 -0400 +@@ -1168,6 +1168,7 @@ static int kvm_vcpu_release(struct inode + return 0; + } + ++/* cannot be const */ + static struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, +@@ -1624,6 +1625,7 @@ static int kvm_vm_mmap(struct file *file + return 0; + } + ++/* cannot be const */ + static struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, +@@ -1714,6 +1716,7 @@ out: + return r; + } + ++/* cannot be const */ + static struct file_operations kvm_chardev_ops = { + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, +@@ -1723,6 +1726,9 @@ static struct miscdevice kvm_dev = { + KVM_MINOR, + "kvm", + &kvm_chardev_ops, ++ {NULL, NULL}, ++ NULL, ++ NULL + }; + + static void hardware_enable(void *junk) +@@ -2050,7 +2056,7 @@ static void kvm_sched_out(struct preempt + kvm_arch_vcpu_put(vcpu); + } + +-int kvm_init(void *opaque, unsigned int vcpu_size, ++int kvm_init(const void *opaque, unsigned int vcpu_size, + struct module *module) + { + int r; diff --git a/pkgs/core/kernel/patches/linux-2.6.31.1-disable-compat_vdso-1.patch.off b/pkgs/core/kernel/patches/linux-2.6.31.1-disable-compat_vdso-1.patch.off deleted file mode 100644 index 3780030..0000000 --- a/pkgs/core/kernel/patches/linux-2.6.31.1-disable-compat_vdso-1.patch.off +++ /dev/null @@ -1,74 +0,0 @@ -From: Gordon Malm gengor@gentoo.org -From: Kerin Millar kerframil@gmail.com - -COMPAT_VDSO is inappropriate for any modern Hardened Gentoo system. It -conflicts with various parts of PaX, crashing the system if enabled -while PaX's NOEXEC or UDEREF features are active. Moreover, it prevents -a number of important PaX options from appearing in the configuration -menu, including all PaX NOEXEC implementations. Unfortunately, the -reason for the disappearance of these PaX configuration options is -often far from obvious to inexperienced users. - -Therefore, we disable the COMPAT_VDSO menu entry entirely. However, -COMPAT_VDSO operation can still be enabled via bootparam and sysctl -interfaces. Consequently, we must also disable the ability to select -COMPAT_VDSO operation at boot or runtime. Here we patch the kernel so -that selecting COMPAT_VDSO operation at boot/runtime has no effect if -conflicting PaX options are enabled, leaving VDSO_ENABLED operation -intact. - -Closes bug: http://bugs.gentoo.org/show_bug.cgi?id=210138 - ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1215,16 +1215,7 @@ config HOTPLUG_CPU - - config COMPAT_VDSO - def_bool n -- prompt "Compat VDSO support" - depends on (X86_32 || IA32_EMULATION) && !PAX_NOEXEC -- help -- Map the 32-bit VDSO to the predictable old-style address too. -- ---help--- -- Say N here if you are running a sufficiently recent glibc -- version (2.3.3 or later), to remove the high-mapped -- VDSO mapping and to exclusively use the randomized VDSO. -- -- If unsure, say Y. 
- - endmenu - ---- a/arch/x86/vdso/vdso32-setup.c -+++ b/arch/x86/vdso/vdso32-setup.c -@@ -333,17 +333,21 @@ int arch_setup_additional_pages(struct l - - map_compat_vdso(compat); - -+#if !defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_MEMORY_UDEREF) - if (compat) - addr = VDSO_HIGH_BASE; - else { -+#endif - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, MAP_EXECUTABLE); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } -+#if !defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_MEMORY_UDEREF) - } - - if (compat_uses_vma || !compat) { -+#endif - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * -@@ -361,7 +365,9 @@ int arch_setup_additional_pages(struct l - - if (ret) - goto up_fail; -+#if !defined(CONFIG_PAX_NOEXEC) && !defined(CONFIG_PAX_MEMORY_UDEREF) - } -+#endif - - current->mm->context.vdso = addr; - current_thread_info()->sysenter_return = diff --git a/pkgs/core/kernel/patches/linux-2.6.31.1-scsi.h-fix-1.patch b/pkgs/core/kernel/patches/linux-2.6.31.1-scsi.h-fix-1.patch deleted file mode 100644 index 7ff5ef0..0000000 --- a/pkgs/core/kernel/patches/linux-2.6.31.1-scsi.h-fix-1.patch +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h -index 084478e..dfcfaab 100644 ---- a/include/scsi/scsi.h -+++ b/include/scsi/scsi.h -@@ -142,10 +142,10 @@ struct scsi_cmnd; - - /* defined in T10 SCSI Primary Commands-2 (SPC2) */ - struct scsi_varlen_cdb_hdr { -- u8 opcode; /* opcode always == VARIABLE_LENGTH_CMD */ -- u8 control; -- u8 misc[5]; -- u8 additional_cdb_length; /* total cdb length - 8 */ -+ __u8 opcode; /* opcode always == VARIABLE_LENGTH_CMD */ -+ __u8 control; -+ __u8 misc[5]; -+ __u8 additional_cdb_length; /* total cdb length - 8 */ - __be16 service_action; - /* service specific data follows */ - }; diff --git a/pkgs/core/kernel/patches/linux-2.6.31.1-scsi.h-fix-1.patch.off b/pkgs/core/kernel/patches/linux-2.6.31.1-scsi.h-fix-1.patch.off new file mode 100644 index 0000000..7ff5ef0 --- /dev/null +++ b/pkgs/core/kernel/patches/linux-2.6.31.1-scsi.h-fix-1.patch.off @@ -0,0 +1,19 @@ +diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h +index 084478e..dfcfaab 100644 +--- a/include/scsi/scsi.h ++++ b/include/scsi/scsi.h +@@ -142,10 +142,10 @@ struct scsi_cmnd; + + /* defined in T10 SCSI Primary Commands-2 (SPC2) */ + struct scsi_varlen_cdb_hdr { +- u8 opcode; /* opcode always == VARIABLE_LENGTH_CMD */ +- u8 control; +- u8 misc[5]; +- u8 additional_cdb_length; /* total cdb length - 8 */ ++ __u8 opcode; /* opcode always == VARIABLE_LENGTH_CMD */ ++ __u8 control; ++ __u8 misc[5]; ++ __u8 additional_cdb_length; /* total cdb length - 8 */ + __be16 service_action; + /* service specific data follows */ + }; diff --git a/pkgs/core/kernel/patches/linux-2.6.33-disable-compat-vdso.patch b/pkgs/core/kernel/patches/linux-2.6.33-disable-compat-vdso.patch new file mode 100644 index 0000000..8716cce --- /dev/null +++ b/pkgs/core/kernel/patches/linux-2.6.33-disable-compat-vdso.patch @@ -0,0 +1,46 @@ +No need to wrap vdso calls as gentoo does not use any version of +glibc <=2.3.3 +--- +From: Gordon Malm gengor@gentoo.org +From: Kerin Millar kerframil@gmail.com +From: Jory A. Pratt anarchy@gentoo.org + +COMPAT_VDSO is inappropriate for any modern Hardened Gentoo system. It +conflicts with various parts of PaX, crashing the system if enabled +while PaX's NOEXEC or UDEREF features are active. Moreover, it prevents +a number of important PaX options from appearing in the configuration +menu, including all PaX NOEXEC implementations. 
Unfortunately, the +reason for the disappearance of these PaX configuration options is +often far from obvious to inexperienced users. + +Therefore, we disable the COMPAT_VDSO menu entry entirely. However, +COMPAT_VDSO operation can still be enabled via bootparam and sysctl +interfaces. Consequently, we must also disable the ability to select +COMPAT_VDSO operation at boot or runtime. Here we patch the kernel so +that selecting COMPAT_VDSO operation at boot/runtime has no effect if +conflicting PaX options are enabled, leaving VDSO_ENABLED operation +intact. + +Closes bug: http://bugs.gentoo.org/show_bug.cgi?id=210138 + +diff -urp a/arch/x86/Kconfig b/arch/x86/Kconfig +--- a/arch/x86/Kconfig 2009-07-31 01:36:57.323857684 +0100 ++++ b/arch/x86/Kconfig 2009-07-31 01:51:39.395749681 +0100 +@@ -1607,17 +1607,8 @@ + + config COMPAT_VDSO + def_bool n +- prompt "Compat VDSO support" + depends on X86_32 || IA32_EMULATION + depends on !PAX_NOEXEC && !PAX_MEMORY_UDEREF +- ---help--- +- Map the 32-bit VDSO to the predictable old-style address too. +- +- Say N here if you are running a sufficiently recent glibc +- version (2.3.3 or later), to remove the high-mapped +- VDSO mapping and to exclusively use the randomized VDSO. +- +- If unsure, say Y. + + config CMDLINE_BOOL + bool "Built-in kernel command line" diff --git a/pkgs/core/kernel/patches/reiser4-for-2.6.31.1.patch b/pkgs/core/kernel/patches/reiser4-for-2.6.31.1.patch deleted file mode 100644 index a4dca5f..0000000 --- a/pkgs/core/kernel/patches/reiser4-for-2.6.31.1.patch +++ /dev/null @@ -1,78369 +0,0 @@ -diff -urN linux-2.6.30.orig/Documentation/Changes linux-2.6.30/Documentation/Changes ---- linux-2.6.30.orig/Documentation/Changes 2009-03-24 00:12:14.000000000 +0100 -+++ linux-2.6.30/Documentation/Changes 2009-06-22 16:08:11.000000000 +0200 -@@ -36,6 +36,7 @@ - o e2fsprogs 1.41.4 # e2fsck -V - o jfsutils 1.1.3 # fsck.jfs -V - o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs -+o reiser4progs 1.0.0 # fsck.reiser4 -V - o xfsprogs 2.6.0 # xfs_db -V - o squashfs-tools 4.0 # mksquashfs -version - o btrfs-progs 0.18 # btrfsck -@@ -147,6 +148,13 @@ - versions of mkreiserfs, resize_reiserfs, debugreiserfs and - reiserfsck. These utils work on both i386 and alpha platforms. - -+Reiser4progs -+------------ -+ -+The reiser4progs package contains utilities for the reiser4 file system. -+Detailed instructions are provided in the README file located at: -+ftp://ftp.namesys.com/pub/reiser4progs/README. 
-+ - Xfsprogs - -------- - -@@ -325,6 +333,10 @@ - ------------- - o http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz - -+Reiser4progs -+------------ -+o ftp://ftp.namesys.com/pub/reiser4progs/ -+ - Xfsprogs - -------- - o ftp://oss.sgi.com/projects/xfs/download/ -diff -urN linux-2.6.30.orig/Documentation/filesystems/reiser4.txt linux-2.6.30/Documentation/filesystems/reiser4.txt ---- linux-2.6.30.orig/Documentation/filesystems/reiser4.txt 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/Documentation/filesystems/reiser4.txt 2009-06-22 16:08:11.000000000 +0200 -@@ -0,0 +1,75 @@ -+Reiser4 filesystem -+================== -+Reiser4 is a file system based on dancing tree algorithms, and is -+described at http://www.namesys.com -+ -+ -+References -+========== -+web page http://namesys.com/v4/v4.html -+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/ -+userland tools ftp://ftp.namesys.com/pub/reiser4progs/ -+install page http://www.namesys.com/install_v4.html -+ -+Compile options -+=============== -+Enable reiser4 debug mode -+ This checks everything imaginable while reiser4 -+ runs -+ -+Mount options -+============= -+tmgr.atom_max_size=N -+ Atoms containing more than N blocks will be forced to commit. -+ N is decimal. -+ Default is nr_free_pagecache_pages() / 2 at mount time. -+ -+tmgr.atom_max_age=N -+ Atoms older than N seconds will be forced to commit. N is decimal. -+ Default is 600. -+ -+tmgr.atom_max_flushers=N -+ Limit of concurrent flushers for one atom. 0 means no limit. -+ Default is 0. -+ -+tree.cbk_cache.nr_slots=N -+ Number of slots in the cbk cache. -+ -+flush.relocate_threshold=N -+ If flush finds more than N adjacent dirty leaf-level blocks it -+ will force them to be relocated. -+ Default is 64. -+ -+flush.relocate_distance=N -+ If flush finds can find a block allocation closer than at most -+ N from the preceder it will relocate to that position. -+ Default is 64. -+ -+flush.scan_maxnodes=N -+ The maximum number of nodes to scan left on a level during -+ flush. -+ Default is 10000. -+ -+optimal_io_size=N -+ Preferred IO size. This value is used to set st_blksize of -+ struct stat. -+ Default is 65536. -+ -+bsdgroups -+ Turn on BSD-style gid assignment. -+ -+32bittimes -+ By default file in reiser4 have 64 bit timestamps. Files -+ created when filesystem is mounted with 32bittimes mount -+ option will get 32 bit timestamps. -+ -+mtflush -+ Turn off concurrent flushing. -+ -+nopseudo -+ Disable pseudo files support. See -+ http://namesys.com/v4/pseudo.html for more about pseudo files. -+ -+dont_load_bitmap -+ Don't load all bitmap blocks at mount time, it is useful for -+ machines with tiny RAM and large disks. 
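Three recurring C patterns in the hunks above deserve a closer look. None of the code below is from the patch; each block is a minimal, self-contained userspace sketch with hypothetical names, shown only to illustrate the idiom being applied.

First, the sound/aoa/codecs/onyx.c, sound/drivers/mts64.c and sound/drivers/portman2x4.c hunks convert a plain "int open_count" to atomic_t and, crucially, fuse the decrement with the zero test (atomic_dec_and_test / atomic_dec_return) so that "last close" is decided in one atomic step. A sketch using C11 atomics as a stand-in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int open_count;

static void device_open(void)
{
	atomic_fetch_add(&open_count, 1);	/* like atomic_inc() */
}

static void device_close(void)
{
	/* like atomic_dec_and_test(): decrement and observe the old value
	 * in one atomic operation, so two racing closers cannot both
	 * conclude that they performed the final close */
	if (atomic_fetch_sub(&open_count, 1) == 1)
		printf("last close: reset device state\n");
}

int main(void)
{
	device_open();
	device_open();
	device_close();
	device_close();
	return 0;
}

Second, the sound/core/oss/pcm_oss.c and sound/core/seq/seq_lock.h hunks replace empty macro stubs with "do {} while (0)" bodies. An empty expansion turns a call site like "if (c) stub(x);" into the null statement "if (c) ;", which GCC flags via -Wempty-body (enabled by -Wextra) and which no longer has the same single-statement shape as the real, non-stubbed macro. The do/while form expands to exactly one statement that requires a trailing semicolon, so call sites compile identically whether the stub or the real implementation is selected. A sketch (the trace_event stub is hypothetical):

#include <stdio.h>

/* hypothetical no-op stub, same shape as the patched ALSA macros */
#define trace_event(msg) do {} while (0)

int main(void)
{
	int verbose = 0;

	if (verbose)
		trace_event("starting");	/* one statement; clean under -Wall -Wextra */
	else
		printf("tracing disabled\n");

	return 0;
}

Third, the usr/gen_init_cpio.c hunk fixes a classic strncat() pitfall: the third argument limits how many bytes are appended, not the total size of the destination, so a fixed bound like PATH_MAX can overflow once the buffer is partially full. The patch therefore shrinks the bound by strlen(expanded), and it adds an explicit terminator after strncpy(), which does not NUL-terminate on truncation. A sketch of the corrected pattern, with hypothetical names and a smaller buffer:

#include <stdio.h>
#include <string.h>

#define BUF_MAX 64

/* dst must point to a buffer of BUF_MAX + 1 bytes */
static void append_bounded(char *dst, const char *src)
{
	size_t used = strlen(dst);

	if (used < BUF_MAX)
		strncat(dst, src, BUF_MAX - used);	/* bound = space actually left */
}

int main(void)
{
	char path[BUF_MAX + 1] = "/usr/share/";

	append_bounded(path, "doc/");
	append_bounded(path, "README");
	printf("%s\n", path);
	return 0;
}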
-diff -urN linux-2.6.30.orig/fs/fs-writeback.c linux-2.6.30/fs/fs-writeback.c ---- linux-2.6.30.orig/fs/fs-writeback.c 2009-06-23 00:20:39.000000000 +0200 -+++ linux-2.6.30/fs/fs-writeback.c 2009-06-22 16:08:13.000000000 +0200 -@@ -593,7 +593,10 @@ - static void sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc) - { -- generic_sync_sb_inodes(sb, wbc); -+ if (sb->s_op->sync_inodes) -+ sb->s_op->sync_inodes(sb, wbc); -+ else -+ generic_sync_sb_inodes(sb, wbc); - } - - /* -diff -urN linux-2.6.30.orig/fs/Kconfig linux-2.6.30/fs/Kconfig ---- linux-2.6.30.orig/fs/Kconfig 2009-06-23 00:20:39.000000000 +0200 -+++ linux-2.6.30/fs/Kconfig 2009-06-22 16:08:13.000000000 +0200 -@@ -27,6 +27,7 @@ - default y if EXT4_FS=y && EXT4_FS_XATTR - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR - -+source "fs/reiser4/Kconfig" - source "fs/reiserfs/Kconfig" - source "fs/jfs/Kconfig" - -diff -urN linux-2.6.30.orig/fs/Makefile linux-2.6.30/fs/Makefile ---- linux-2.6.30.orig/fs/Makefile 2009-06-23 00:20:39.000000000 +0200 -+++ linux-2.6.30/fs/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -65,6 +65,7 @@ - # Do not add any filesystems before this line - obj-$(CONFIG_FSCACHE) += fscache/ - obj-$(CONFIG_REISERFS_FS) += reiserfs/ -+obj-$(CONFIG_REISER4_FS) += reiser4/ - obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 - obj-$(CONFIG_EXT2_FS) += ext2/ - # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 -diff -urN linux-2.6.30.orig/fs/reiser4/as_ops.c linux-2.6.30/fs/reiser4/as_ops.c ---- linux-2.6.30.orig/fs/reiser4/as_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/as_ops.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,337 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Interface to VFS. Reiser4 address_space_operations are defined here. */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/file/file.h" -+#include "plugin/security/perm.h" -+#include "plugin/disk_format/disk_format.h" -+#include "plugin/plugin.h" -+#include "plugin/plugin_set.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+#include "entd.h" -+ -+#include <linux/profile.h> -+#include <linux/types.h> -+#include <linux/mount.h> -+#include <linux/vfs.h> -+#include <linux/mm.h> -+#include <linux/buffer_head.h> -+#include <linux/dcache.h> -+#include <linux/list.h> -+#include <linux/pagemap.h> -+#include <linux/slab.h> -+#include <linux/seq_file.h> -+#include <linux/init.h> -+#include <linux/module.h> -+#include <linux/writeback.h> -+#include <linux/backing-dev.h> -+#include <linux/quotaops.h> -+#include <linux/security.h> -+ -+/* address space operations */ -+ -+/** -+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting -+ * @page: page to be dirtied -+ * -+ * Operation of struct address_space_operations. This implementation is used by -+ * unix and cryptcompress file plugins. -+ * -+ * This is called when reiser4 page gets dirtied outside of reiser4, for -+ * example, when dirty bit is moved from pte to physical page. 
-+ * -+ * Tags page in the mapping's page tree with special tag so that it is possible -+ * to do all the reiser4 specific work wrt dirty pages (jnode creation, -+ * capturing by an atom) later because it can not be done in the contexts where -+ * set_page_dirty is called. -+ */ -+int reiser4_set_page_dirty(struct page *page) -+{ -+ /* this page can be unformatted only */ -+ assert("vs-1734", (page->mapping && -+ page->mapping->host && -+ reiser4_get_super_fake(page->mapping->host->i_sb) != -+ page->mapping->host && -+ reiser4_get_cc_fake(page->mapping->host->i_sb) != -+ page->mapping->host && -+ reiser4_get_bitmap_fake(page->mapping->host->i_sb) != -+ page->mapping->host)); -+ return __set_page_dirty_nobuffers(page); -+} -+ -+/* ->invalidatepage method for reiser4 */ -+ -+/* -+ * this is called for each truncated page from -+ * truncate_inode_pages()->truncate_{complete,partial}_page(). -+ * -+ * At the moment of call, page is under lock, and outstanding io (if any) has -+ * completed. -+ */ -+ -+/** -+ * reiser4_invalidatepage -+ * @page: page to invalidate -+ * @offset: starting offset for partial invalidation -+ * -+ */ -+void reiser4_invalidatepage(struct page *page, unsigned long offset) -+{ -+ int ret = 0; -+ reiser4_context *ctx; -+ struct inode *inode; -+ jnode *node; -+ -+ /* -+ * This is called to truncate file's page. -+ * -+ * Originally, reiser4 implemented truncate in a standard way -+ * (vmtruncate() calls ->invalidatepage() on all truncated pages -+ * first, then file system ->truncate() call-back is invoked). -+ * -+ * This lead to the problem when ->invalidatepage() was called on a -+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT -+ * process. That is, truncate was bypassing transactions. To avoid -+ * this, try_capture_page_to_invalidate() call was added here. -+ * -+ * After many troubles with vmtruncate() based truncate (including -+ * races with flush, tail conversion, etc.) it was re-written in the -+ * top-to-bottom style: items are killed in reiser4_cut_tree_object() -+ * and pages belonging to extent are invalidated in kill_hook_extent(). -+ * So probably now additional call to capture is not needed here. -+ */ -+ -+ assert("nikita-3137", PageLocked(page)); -+ assert("nikita-3138", !PageWriteback(page)); -+ inode = page->mapping->host; -+ -+ /* -+ * ->invalidatepage() should only be called for the unformatted -+ * jnodes. Destruction of all other types of jnodes is performed -+ * separately. But, during some corner cases (like handling errors -+ * during mount) it is simpler to let ->invalidatepage to be called on -+ * them. Check for this, and do nothing. 
-+ */ -+ if (reiser4_get_super_fake(inode->i_sb) == inode) -+ return; -+ if (reiser4_get_cc_fake(inode->i_sb) == inode) -+ return; -+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode) -+ return; -+ assert("vs-1426", PagePrivate(page)); -+ assert("vs-1427", -+ page->mapping == jnode_get_mapping(jnode_by_page(page))); -+ assert("", jprivate(page) != NULL); -+ assert("", ergo(inode_file_plugin(inode) != -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID), -+ offset == 0)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return; -+ -+ node = jprivate(page); -+ spin_lock_jnode(node); -+ if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) | -+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) { -+ /* there is not need to capture */ -+ jref(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ page_clear_jnode(page, node); -+ reiser4_uncapture_jnode(node); -+ unhash_unformatted_jnode(node); -+ jput(node); -+ reiser4_exit_context(ctx); -+ return; -+ } -+ spin_unlock_jnode(node); -+ -+ /* capture page being truncated. */ -+ ret = try_capture_page_to_invalidate(page); -+ if (ret != 0) -+ warning("nikita-3141", "Cannot capture: %i", ret); -+ -+ if (offset == 0) { -+ /* remove jnode from transaction and detach it from page. */ -+ jref(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ /* page cannot be detached from jnode concurrently, because it -+ * is locked */ -+ reiser4_uncapture_page(page); -+ -+ /* this detaches page from jnode, so that jdelete will not try -+ * to lock page which is already locked */ -+ spin_lock_jnode(node); -+ page_clear_jnode(page, node); -+ spin_unlock_jnode(node); -+ unhash_unformatted_jnode(node); -+ -+ jput(node); -+ } -+ -+ reiser4_exit_context(ctx); -+} -+ -+/* help function called from reiser4_releasepage(). It returns true if jnode -+ * can be detached from its page and page released. */ -+int jnode_is_releasable(jnode * node/* node to check */) -+{ -+ assert("nikita-2781", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(node->load)); -+ -+ /* is some thread is currently using jnode page, later cannot be -+ * detached */ -+ if (atomic_read(&node->d_count) != 0) -+ return 0; -+ -+ assert("vs-1214", !jnode_is_loaded(node)); -+ -+ /* -+ * can only release page if real block number is assigned to it. Simple -+ * check for ->atom wouldn't do, because it is possible for node to be -+ * clean, not it atom yet, and still having fake block number. For -+ * example, node just created in jinit_new(). -+ */ -+ if (reiser4_blocknr_is_fake(jnode_get_block(node))) -+ return 0; -+ -+ /* -+ * pages prepared for write can not be released anyway, so avoid -+ * detaching jnode from the page -+ */ -+ if (JF_ISSET(node, JNODE_WRITE_PREPARED)) -+ return 0; -+ -+ /* -+ * dirty jnode cannot be released. It can however be submitted to disk -+ * as part of early flushing, but only after getting flush-prepped. -+ */ -+ if (JF_ISSET(node, JNODE_DIRTY)) -+ return 0; -+ -+ /* overwrite set is only written by log writer. */ -+ if (JF_ISSET(node, JNODE_OVRWR)) -+ return 0; -+ -+ /* jnode is already under writeback */ -+ if (JF_ISSET(node, JNODE_WRITEBACK)) -+ return 0; -+ -+ /* don't flush bitmaps or journal records */ -+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) -+ return 0; -+ -+ return 1; -+} -+ -+/* -+ * ->releasepage method for reiser4 -+ * -+ * This is called by VM scanner when it comes across clean page. 
What we have -+ * to do here is to check whether page can really be released (freed that is) -+ * and if so, detach jnode from it and remove page from the page cache. -+ * -+ * Check for releasability is done by releasable() function. -+ */ -+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG) -+{ -+ jnode *node; -+ -+ assert("nikita-2257", PagePrivate(page)); -+ assert("nikita-2259", PageLocked(page)); -+ assert("nikita-2892", !PageWriteback(page)); -+ assert("nikita-3019", reiser4_schedulable()); -+ -+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It -+ is not clear what to do in this case. A lot of deadlocks seems be -+ possible. */ -+ -+ node = jnode_by_page(page); -+ assert("nikita-2258", node != NULL); -+ assert("reiser4-4", page->mapping != NULL); -+ assert("reiser4-5", page->mapping->host != NULL); -+ -+ if (PageDirty(page)) -+ return 0; -+ -+ /* extra page reference is used by reiser4 to protect -+ * jnode<->page link from this ->releasepage(). */ -+ if (page_count(page) > 3) -+ return 0; -+ -+ /* releasable() needs jnode lock, because it looks at the jnode fields -+ * and we need jload_lock here to avoid races with jload(). */ -+ spin_lock_jnode(node); -+ spin_lock(&(node->load)); -+ if (jnode_is_releasable(node)) { -+ struct address_space *mapping; -+ -+ mapping = page->mapping; -+ jref(node); -+ /* there is no need to synchronize against -+ * jnode_extent_write() here, because pages seen by -+ * jnode_extent_write() are !releasable(). */ -+ page_clear_jnode(page, node); -+ spin_unlock(&(node->load)); -+ spin_unlock_jnode(node); -+ -+ /* we are under memory pressure so release jnode also. */ -+ jput(node); -+ -+ return 1; -+ } else { -+ spin_unlock(&(node->load)); -+ spin_unlock_jnode(node); -+ assert("nikita-3020", reiser4_schedulable()); -+ return 0; -+ } -+} -+ -+int reiser4_readpage(struct file *file, struct page *page) -+{ -+ assert("edward-1533", PageLocked(page)); -+ assert("edward-1534", !PageUptodate(page)); -+ assert("edward-1535", page->mapping && page->mapping->host); -+ -+ return inode_file_plugin(page->mapping->host)->readpage(file, page); -+} -+ -+int reiser4_readpages(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ return inode_file_plugin(mapping->host)->readpages(file, mapping, -+ pages, nr_pages); -+} -+ -+int reiser4_writepages(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ return inode_file_plugin(mapping->host)->writepages(mapping, wbc); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/block_alloc.c linux-2.6.30/fs/reiser4/block_alloc.c ---- linux-2.6.30.orig/fs/reiser4/block_alloc.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/block_alloc.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1142 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+reiser4/README */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "super.h" -+ -+#include <linux/types.h> /* for __u?? */ -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/spinlock.h> -+ -+/* THE REISER4 DISK SPACE RESERVATION SCHEME. 
*/ -+ -+/* We need to be able to reserve enough disk space to ensure that an atomic -+ operation will have enough disk space to flush (see flush.c and -+ http://namesys.com/v4/v4.html) and commit it once it is started. -+ -+ In our design a call for reserving disk space may fail but not an actual -+ block allocation. -+ -+ All free blocks, already allocated blocks, and all kinds of reserved blocks -+ are counted in different per-fs block counters. -+ -+ A reiser4 super block's set of block counters currently is: -+ -+ free -- free blocks, -+ used -- already allocated blocks, -+ -+ grabbed -- initially reserved for performing an fs operation, those blocks -+ are taken from free blocks, then grabbed disk space leaks from grabbed -+ blocks counter to other counters like "fake allocated", "flush -+ reserved", "used", the rest of not used grabbed space is returned to -+ free space at the end of fs operation; -+ -+ fake allocated -- counts all nodes without real disk block numbers assigned, -+ we have separate accounting for formatted and unformatted -+ nodes (for easier debugging); -+ -+ flush reserved -- disk space needed for flushing and committing an atom. -+ Each dirty already allocated block could be written as a -+ part of atom's overwrite set or as a part of atom's -+ relocate set. In both case one additional block is needed, -+ it is used as a wandered block if we do overwrite or as a -+ new location for a relocated block. -+ -+ In addition, blocks in some states are counted on per-thread and per-atom -+ basis. A reiser4 context has a counter of blocks grabbed by this transaction -+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values -+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved" -+ blocks, which are reserved for flush processing and atom commit. */ -+ -+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate -+ number of blocks to grab for most expensive case of balancing when the leaf -+ node we insert new item to gets split and new leaf node is allocated. -+ -+ So, we need to grab blocks for -+ -+ 1) one block for possible dirtying the node we insert an item to. That block -+ would be used for node relocation at flush time or for allocating of a -+ wandered one, it depends what will be a result (what set, relocate or -+ overwrite the node gets assigned to) of the node processing by the flush -+ algorithm. -+ -+ 2) one block for either allocating a new node, or dirtying of right or left -+ clean neighbor, only one case may happen. -+ -+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying -+ of left neighbor, right neighbor, current node, and creation of new node. -+ Have I forgotten something? email me. -+ -+ These grabbed blocks are counted in both reiser4 context "grabbed blocks" -+ counter and in the fs-wide one (both ctx->grabbed_blocks and -+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is -+ decremented by 2. -+ -+ Suppose both two blocks were spent for dirtying of an already allocated clean -+ node (one block went from "grabbed" to "flush reserved") and for new block -+ allocating (one block went from "grabbed" to "fake allocated formatted"). -+ -+ Inserting of a child pointer to the parent node caused parent node to be -+ split, the balancing code takes care about this grabbing necessary space -+ immediately by calling reiser4_grab with BA_RESERVED flag set which means -+ "can use the 5% reserved disk space". 
-+ -+ At this moment insertion completes and grabbed blocks (if they were not used) -+ should be returned to the free space counter. -+ -+ However the atom life-cycle is not completed. The atom had one "flush -+ reserved" block added by our insertion and the new fake allocated node is -+ counted as a "fake allocated formatted" one. The atom has to be fully -+ processed by flush before commit. Suppose that the flush moved the first, -+ already allocated node to the atom's overwrite list, the new fake allocated -+ node, obviously, went into the atom relocate set. The reiser4 flush -+ allocates the new node using one unit from "fake allocated formatted" -+ counter, the log writer uses one from "flush reserved" for wandered block -+ allocation. -+ -+ And, it is not the end. When the wandered block is deallocated after the -+ atom gets fully played (see wander.c for term description), the disk space -+ occupied for it is returned to free blocks. */ -+ -+/* BLOCK NUMBERS */ -+ -+/* Any reiser4 node has a block number assigned to it. We use these numbers for -+ indexing in hash tables, so if a block has not yet been assigned a location -+ on disk we need to give it a temporary fake block number. -+ -+ Current implementation of reiser4 uses 64-bit integers for block numbers. We -+ use highest bit in 64-bit block number to distinguish fake and real block -+ numbers. So, only 63 bits may be used to addressing of real device -+ blocks. That "fake" block numbers space is divided into subspaces of fake -+ block numbers for data blocks and for shadow (working) bitmap blocks. -+ -+ Fake block numbers for data blocks are generated by a cyclic counter, which -+ gets incremented after each real block allocation. We assume that it is -+ impossible to overload this counter during one transaction life. */ -+ -+/* Initialize a blocknr hint. */ -+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint) -+{ -+ memset(hint, 0, sizeof(reiser4_blocknr_hint)); -+} -+ -+/* Release any resources of a blocknr hint. */ -+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG) -+{ -+/* No resources should be freed in current blocknr_hint implementation. */ -+} -+ -+/* see above for explanation of fake block number. */ -+/* Audited by: green(2002.06.11) */ -+int reiser4_blocknr_is_fake(const reiser4_block_nr * da) -+{ -+ /* The reason for not simply returning result of '&' operation is that -+ while return value is (possibly 32bit) int, the reiser4_block_nr is -+ at least 64 bits long, and high bit (which is the only possible -+ non zero bit after the masking) would be stripped off */ -+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0; -+} -+ -+/* Static functions for <reiser4 super block>/<reiser4 context> block counters -+ arithmetic. Mostly, they are isolated to not to code same assertions in -+ several places. */ -+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count) -+{ -+ BUG_ON(ctx->grabbed_blocks < count); -+ assert("zam-527", ctx->grabbed_blocks >= count); -+ ctx->grabbed_blocks -= count; -+} -+ -+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count) -+{ -+ ctx->grabbed_blocks += count; -+} -+ -+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("zam-525", sbinfo->blocks_grabbed >= count); -+ sbinfo->blocks_grabbed -= count; -+} -+ -+/* Decrease the counter of block reserved for flush in super block. 
*/ -+static void -+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count); -+ sbinfo->blocks_flush_reserved -= count; -+} -+ -+static void -+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, -+ reiser4_ba_flags_t flags) -+{ -+ if (flags & BA_FORMATTED) { -+ assert("zam-806", sbinfo->blocks_fake_allocated >= count); -+ sbinfo->blocks_fake_allocated -= count; -+ } else { -+ assert("zam-528", -+ sbinfo->blocks_fake_allocated_unformatted >= count); -+ sbinfo->blocks_fake_allocated_unformatted -= count; -+ } -+} -+ -+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("zam-530", -+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used); -+ sbinfo->blocks_used -= count; -+} -+ -+static void -+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("edward-501", sbinfo->blocks_clustered >= count); -+ sbinfo->blocks_clustered -= count; -+} -+ -+/* Increase the counter of block reserved for flush in atom. */ -+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) -+{ -+ assert("zam-772", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ atom->flush_reserved += count; -+} -+ -+/* Decrease the counter of block reserved for flush in atom. */ -+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) -+{ -+ assert("zam-774", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-2790", atom->flush_reserved >= count); -+ atom->flush_reserved -= count; -+} -+ -+/* super block has 6 counters: free, used, grabbed, fake allocated -+ (formatted and unformatted) and flush reserved. Their sum must be -+ number of blocks on a device. This function checks this */ -+int reiser4_check_block_counters(const struct super_block *super) -+{ -+ __u64 sum; -+ -+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) + -+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) + -+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) + -+ reiser4_clustered_blocks(super); -+ if (reiser4_block_count(super) != sum) { -+ printk("super block counters: " -+ "used %llu, free %llu, " -+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), " -+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n", -+ (unsigned long long)reiser4_data_blocks(super), -+ (unsigned long long)reiser4_free_blocks(super), -+ (unsigned long long)reiser4_grabbed_blocks(super), -+ (unsigned long long)reiser4_fake_allocated(super), -+ (unsigned long long) -+ reiser4_fake_allocated_unformatted(super), -+ (unsigned long long)reiser4_flush_reserved(super), -+ (unsigned long long)reiser4_clustered_blocks(super), -+ (unsigned long long)sum, -+ (unsigned long long)reiser4_block_count(super)); -+ return 0; -+ } -+ return 1; -+} -+ -+/* Adjust "working" free blocks counter for number of blocks we are going to -+ allocate. Record number of grabbed blocks in fs-wide and per-thread -+ counters. This function should be called before bitmap scanning or -+ allocating fake block numbers -+ -+ @super -- pointer to reiser4 super block; -+ @count -- number of blocks we reserve; -+ -+ @return -- 0 if success, -ENOSPC, if all -+ free blocks are preserved or already allocated. 
-+*/ -+ -+static int -+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags) -+{ -+ __u64 free_blocks; -+ int ret = 0, use_reserved = flags & BA_RESERVED; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("vs-1276", ctx == get_current_context()); -+ -+ /* Do not grab anything on ro-mounted fs. */ -+ if (rofs_super(ctx->super)) { -+ ctx->grab_enabled = 0; -+ return 0; -+ } -+ -+ sbinfo = get_super_private(ctx->super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ free_blocks = sbinfo->blocks_free; -+ -+ if ((use_reserved && free_blocks < count) || -+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) { -+ ret = RETERR(-ENOSPC); -+ goto unlock_and_ret; -+ } -+ -+ add_to_ctx_grabbed(ctx, count); -+ -+ sbinfo->blocks_grabbed += count; -+ sbinfo->blocks_free -= count; -+ -+#if REISER4_DEBUG -+ if (ctx->grabbed_initially == 0) -+ ctx->grabbed_initially = count; -+#endif -+ -+ assert("nikita-2986", reiser4_check_block_counters(ctx->super)); -+ -+ /* disable grab space in current context */ -+ ctx->grab_enabled = 0; -+ -+unlock_and_ret: -+ spin_unlock_reiser4_super(sbinfo); -+ -+ return ret; -+} -+ -+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags) -+{ -+ int ret; -+ reiser4_context *ctx; -+ -+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT, -+ lock_stack_isclean(get_current_lock_stack -+ ()))); -+ ctx = get_current_context(); -+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) -+ return 0; -+ -+ ret = reiser4_grab(ctx, count, flags); -+ if (ret == -ENOSPC) { -+ -+ /* Trying to commit the all transactions if BA_CAN_COMMIT flag -+ present */ -+ if (flags & BA_CAN_COMMIT) { -+ txnmgr_force_commit_all(ctx->super, 0); -+ ctx->grab_enabled = 1; -+ ret = reiser4_grab(ctx, count, flags); -+ } -+ } -+ /* -+ * allocation from reserved pool cannot fail. This is severe error. -+ */ -+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0)); -+ return ret; -+} -+ -+/* -+ * SPACE RESERVED FOR UNLINK/TRUNCATE -+ * -+ * Unlink and truncate require space in transaction (to update stat data, at -+ * least). But we don't want rm(1) to fail with "No space on device" error. -+ * -+ * Solution is to reserve 5% of disk space for truncates and -+ * unlinks. Specifically, normal space grabbing requests don't grab space from -+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to -+ * drain it. Per super block delete mutex is used to allow only one -+ * thread at a time to grab from reserved area. -+ * -+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT -+ * flag. -+ * -+ */ -+ -+int reiser4_grab_reserved(struct super_block *super, -+ __u64 count, reiser4_ba_flags_t flags) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ -+ assert("nikita-3175", flags & BA_CAN_COMMIT); -+ -+ /* Check the delete mutex already taken by us, we assume that -+ * reading of machine word is atomic. 
*/ -+ if (sbinfo->delete_mutex_owner == current) { -+ if (reiser4_grab_space -+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) { -+ warning("zam-1003", -+ "nested call of grab_reserved fails count=(%llu)", -+ (unsigned long long)count); -+ reiser4_release_reserved(super); -+ return RETERR(-ENOSPC); -+ } -+ return 0; -+ } -+ -+ if (reiser4_grab_space(count, flags)) { -+ mutex_lock(&sbinfo->delete_mutex); -+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL); -+ sbinfo->delete_mutex_owner = current; -+ -+ if (reiser4_grab_space(count, flags | BA_RESERVED)) { -+ warning("zam-833", -+ "reserved space is not enough (%llu)", -+ (unsigned long long)count); -+ reiser4_release_reserved(super); -+ return RETERR(-ENOSPC); -+ } -+ } -+ return 0; -+} -+ -+void reiser4_release_reserved(struct super_block *super) -+{ -+ reiser4_super_info_data *info; -+ -+ info = get_super_private(super); -+ if (info->delete_mutex_owner == current) { -+ info->delete_mutex_owner = NULL; -+ mutex_unlock(&info->delete_mutex); -+ } -+} -+ -+static reiser4_super_info_data *grabbed2fake_allocated_head(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sub_from_ctx_grabbed(ctx, count); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ /* return sbinfo locked */ -+ return sbinfo; -+} -+ -+/* is called after @count fake block numbers are allocated and pointer to -+ those blocks are inserted into tree. */ -+static void grabbed2fake_allocated_formatted(void) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = grabbed2fake_allocated_head(1); -+ sbinfo->blocks_fake_allocated++; -+ -+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/** -+ * grabbed2fake_allocated_unformatted -+ * @count: -+ * -+ */ -+static void grabbed2fake_allocated_unformatted(int count) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = grabbed2fake_allocated_head(count); -+ sbinfo->blocks_fake_allocated_unformatted += count; -+ -+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2cluster_reserved(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sub_from_ctx_grabbed(ctx, count); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_clustered += count; -+ -+ assert("edward-504", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void cluster_reserved2grabbed(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_cluster_reserved(sbinfo, count); -+ sbinfo->blocks_grabbed += count; -+ -+ assert("edward-505", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+ add_to_ctx_grabbed(ctx, count); -+} -+ -+void cluster_reserved2free(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ cluster_reserved2grabbed(count); -+ grabbed2free(ctx, sbinfo, count); -+} -+ -+static DEFINE_SPINLOCK(fake_lock); -+static reiser4_block_nr fake_gen = 0; -+ -+/** -+ * assign_fake_blocknr -+ * 
@blocknr: -+ * @count: -+ * -+ * Obtain a fake block number for new node which will be used to refer to -+ * this newly allocated node until real allocation is done. -+ */ -+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count) -+{ -+ spin_lock(&fake_lock); -+ *blocknr = fake_gen; -+ fake_gen += count; -+ spin_unlock(&fake_lock); -+ -+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK); -+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/ -+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE; -+ assert("zam-394", zlook(current_tree, blocknr) == NULL); -+} -+ -+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr) -+{ -+ assign_fake_blocknr(blocknr, 1); -+ grabbed2fake_allocated_formatted(); -+ return 0; -+} -+ -+/** -+ * fake_blocknrs_unformatted -+ * @count: number of fake numbers to get -+ * -+ * Allocates @count fake block numbers which will be assigned to jnodes -+ */ -+reiser4_block_nr fake_blocknr_unformatted(int count) -+{ -+ reiser4_block_nr blocknr; -+ -+ assign_fake_blocknr(&blocknr, count); -+ grabbed2fake_allocated_unformatted(count); -+ -+ return blocknr; -+} -+ -+/* adjust sb block counters, if real (on-disk) block allocation immediately -+ follows grabbing of free disk space. */ -+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, -+ __u64 count) -+{ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_used += count; -+ -+ assert("nikita-2679", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* adjust sb block counters when @count unallocated blocks get mapped to disk */ -+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, -+ reiser4_ba_flags_t flags) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_fake_allocated(sbinfo, count, flags); -+ sbinfo->blocks_used += count; -+ -+ assert("nikita-2680", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+static void flush_reserved2used(txn_atom * atom, __u64 count) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-787", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ sbinfo = get_current_super_private(); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_flush_reserved(sbinfo, count); -+ sbinfo->blocks_used += count; -+ -+ assert("zam-789", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* update the per fs blocknr hint default value. */ -+void -+update_blocknr_hint_default(const struct super_block *s, -+ const reiser4_block_nr * block) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("nikita-3342", !reiser4_blocknr_is_fake(block)); -+ -+ spin_lock_reiser4_super(sbinfo); -+ if (*block < sbinfo->block_count) { -+ sbinfo->blocknr_hint_default = *block; -+ } else { -+ warning("zam-676", -+ "block number %llu is too large to be used in a blocknr hint\n", -+ (unsigned long long)*block); -+ dump_stack(); -+ DEBUGON(1); -+ } -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* get current value of the default blocknr hint. 
*/ -+void get_blocknr_hint_default(reiser4_block_nr * result) -+{ -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ -+ spin_lock_reiser4_super(sbinfo); -+ *result = sbinfo->blocknr_hint_default; -+ assert("zam-677", *result < sbinfo->block_count); -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* Allocate "real" disk blocks by calling a proper space allocation plugin -+ * method. Blocks are allocated in one contiguous disk region. The plugin -+ * independent part accounts blocks by subtracting allocated amount from grabbed -+ * or fake block counter and add the same amount to the counter of allocated -+ * blocks. -+ * -+ * @hint -- a reiser4 blocknr hint object which contains further block -+ * allocation hints and parameters (search start, a stage of block -+ * which will be mapped to disk, etc.), -+ * @blk -- an out parameter for the beginning of the allocated region, -+ * @len -- in/out parameter, it should contain the maximum number of allocated -+ * blocks, after block allocation completes, it contains the length of -+ * allocated disk region. -+ * @flags -- see reiser4_ba_flags_t description. -+ * -+ * @return -- 0 if success, error code otherwise. -+ */ -+int -+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk, -+ reiser4_block_nr * len, reiser4_ba_flags_t flags) -+{ -+ __u64 needed = *len; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ int ret; -+ -+ assert("zam-986", hint != NULL); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ /* For write-optimized data we use default search start value, which is -+ * close to last write location. */ -+ if (flags & BA_USE_DEFAULT_SEARCH_START) -+ get_blocknr_hint_default(&hint->blk); -+ -+ /* VITALY: allocator should grab this for internal/tx-lists/similar -+ only. 
*/ -+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)?*/ -+ if (hint->block_stage == BLOCK_NOT_COUNTED) { -+ ret = reiser4_grab_space_force(*len, flags); -+ if (ret != 0) -+ return ret; -+ } -+ -+ ret = -+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super), -+ hint, (int)needed, blk, len); -+ -+ if (!ret) { -+ assert("zam-680", *blk < reiser4_block_count(ctx->super)); -+ assert("zam-681", -+ *blk + *len <= reiser4_block_count(ctx->super)); -+ -+ if (flags & BA_PERMANENT) { -+ /* we assume that current atom exists at this moment */ -+ txn_atom *atom = get_current_atom_locked(); -+ atom->nr_blocks_allocated += *len; -+ spin_unlock_atom(atom); -+ } -+ -+ switch (hint->block_stage) { -+ case BLOCK_NOT_COUNTED: -+ case BLOCK_GRABBED: -+ grabbed2used(ctx, sbinfo, *len); -+ break; -+ case BLOCK_UNALLOCATED: -+ fake_allocated2used(sbinfo, *len, flags); -+ break; -+ case BLOCK_FLUSH_RESERVED: -+ { -+ txn_atom *atom = get_current_atom_locked(); -+ flush_reserved2used(atom, *len); -+ spin_unlock_atom(atom); -+ } -+ break; -+ default: -+ impossible("zam-531", "wrong block stage"); -+ } -+ } else { -+ assert("zam-821", -+ ergo(hint->max_dist == 0 -+ && !hint->backward, ret != -ENOSPC)); -+ if (hint->block_stage == BLOCK_NOT_COUNTED) -+ grabbed2free(ctx, sbinfo, needed); -+ } -+ -+ return ret; -+} -+ -+/* used -> fake_allocated -> grabbed -> free */ -+ -+/* adjust sb block counters when @count unallocated blocks get unmapped from -+ disk */ -+static void -+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, -+ int formatted) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ if (formatted) -+ sbinfo->blocks_fake_allocated += count; -+ else -+ sbinfo->blocks_fake_allocated_unformatted += count; -+ -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2681", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+static void -+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom, -+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG) -+{ -+ assert("nikita-2791", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ add_to_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_flush_reserved += count; -+ /*add_to_sb_flush_reserved(sbinfo, count); */ -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2681", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* disk space, virtually used by fake block numbers is counted as "grabbed" -+ again. 
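
A short caller's sketch for reiser4_alloc_blocks(), following the parameter
description above (hypothetical helper, not part of the patch):

static int sketch_alloc_extent(reiser4_block_nr *start)
{
        reiser4_blocknr_hint hint;
        reiser4_block_nr len = 16;      /* ask for at most 16 blocks */
        int ret;

        reiser4_blocknr_hint_init(&hint);
        /* the blocks being mapped were grabbed by this context earlier */
        hint.block_stage = BLOCK_GRABBED;

        /* search from the per-fs default, which is close to the last
           write location; on success @start/@len describe one contiguous
           region, and @len may come back smaller than requested */
        ret = reiser4_alloc_blocks(&hint, start, &len,
                                   BA_USE_DEFAULT_SEARCH_START);

        reiser4_blocknr_hint_done(&hint);
        return ret;
}
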
*/ -+static void -+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, -+ __u64 count, reiser4_ba_flags_t flags) -+{ -+ add_to_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ assert("nikita-2682", reiser4_check_block_counters(ctx->super)); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); -+ -+ assert("nikita-2683", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ fake_allocated2grabbed(ctx, sbinfo, count, flags); -+ grabbed2free(ctx, sbinfo, count); -+} -+ -+void grabbed2free_mark(__u64 mark) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ assert("nikita-3007", (__s64) mark >= 0); -+ assert("nikita-3006", ctx->grabbed_blocks >= mark); -+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark); -+} -+ -+/** -+ * grabbed2free - adjust grabbed and free block counters -+ * @ctx: context to update grabbed block counter of -+ * @sbinfo: super block to update grabbed and free block counters of -+ * @count: number of blocks to adjust counters by -+ * -+ * Decreases context's and per filesystem's counters of grabbed -+ * blocks. Increases per filesystem's counter of free blocks. -+ */ -+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo, -+ __u64 count) -+{ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_free += count; -+ assert("nikita-2684", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("vs-1095", atom); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ add_to_atom_flush_reserved_nolock(atom, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_flush_reserved += count; -+ sub_from_sb_grabbed(sbinfo, count); -+ -+ assert("vpf-292", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2flush_reserved(__u64 count) -+{ -+ txn_atom *atom = get_current_atom_locked(); -+ -+ grabbed2flush_reserved_nolock(atom, count); -+ -+ spin_unlock_atom(atom); -+} -+ -+void flush_reserved2grabbed(txn_atom * atom, __u64 count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("nikita-2788", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ add_to_ctx_grabbed(ctx, count); -+ -+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_flush_reserved(sbinfo, count); -+ -+ assert("vpf-292", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/** -+ * all_grabbed2free - releases all blocks grabbed in context -+ * -+ * Decreases context's and super block's grabbed block counters by number of -+ * blocks grabbed by current context and increases super block's free block -+ * counter 
correspondingly. -+ */ -+void all_grabbed2free(void) -+{ -+ reiser4_context *ctx = get_current_context(); -+ -+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks); -+} -+ -+/* adjust sb block counters if real (on-disk) blocks do not become unallocated -+ after freeing, @count blocks become "grabbed". */ -+static void -+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, -+ __u64 count) -+{ -+ add_to_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2685", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* this used to be done through used2grabbed and grabbed2free*/ -+static void used2free(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_free += count; -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2685", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+#if REISER4_DEBUG -+ -+/* check "allocated" state of given block range */ -+static void -+reiser4_check_blocks(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, int desired) -+{ -+ sa_check_blocks(start, len, desired); -+} -+ -+/* check "allocated" state of given block */ -+void reiser4_check_block(const reiser4_block_nr * block, int desired) -+{ -+ const reiser4_block_nr one = 1; -+ -+ reiser4_check_blocks(block, &one, desired); -+} -+ -+#endif -+ -+/* Blocks deallocation function may do an actual deallocation through space -+ plugin allocation or store deleted block numbers in atom's delete_set data -+ structure depend on @defer parameter. */ -+ -+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks -+ which will be deleted from WORKING bitmap. 
They might be just unmapped from -+ disk, or freed but disk space is still grabbed by current thread, or these -+ blocks must not be counted in any reiser4 sb block counters, -+ see block_stage_t comment */ -+ -+/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to -+ distinguish blocks allocated for unformatted and formatted nodes */ -+ -+int -+reiser4_dealloc_blocks(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, -+ block_stage_t target_stage, reiser4_ba_flags_t flags) -+{ -+ txn_atom *atom = NULL; -+ int ret; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ if (REISER4_DEBUG) { -+ assert("zam-431", *len != 0); -+ assert("zam-432", *start != 0); -+ assert("zam-558", !reiser4_blocknr_is_fake(start)); -+ -+ spin_lock_reiser4_super(sbinfo); -+ assert("zam-562", *start < sbinfo->block_count); -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ if (flags & BA_DEFER) { -+ blocknr_set_entry *bsep = NULL; -+ -+ /* storing deleted block numbers in a blocknr set -+ datastructure for further actual deletion */ -+ do { -+ atom = get_current_atom_locked(); -+ assert("zam-430", atom != NULL); -+ -+ ret = -+ blocknr_set_add_extent(atom, &atom->delete_set, -+ &bsep, start, len); -+ -+ if (ret == -ENOMEM) -+ return ret; -+ -+ /* This loop might spin at most two times */ -+ } while (ret == -E_REPEAT); -+ -+ assert("zam-477", ret == 0); -+ assert("zam-433", atom != NULL); -+ -+ spin_unlock_atom(atom); -+ -+ } else { -+ assert("zam-425", get_current_super_private() != NULL); -+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super), -+ *start, *len); -+ -+ if (flags & BA_PERMANENT) { -+ /* These blocks were counted as allocated, we have to -+ * revert it back if allocation is discarded. 
*/ -+ txn_atom *atom = get_current_atom_locked(); -+ atom->nr_blocks_allocated -= *len; -+ spin_unlock_atom(atom); -+ } -+ -+ switch (target_stage) { -+ case BLOCK_NOT_COUNTED: -+ assert("vs-960", flags & BA_FORMATTED); -+ /* VITALY: This is what was grabbed for -+ internal/tx-lists/similar only */ -+ used2free(sbinfo, *len); -+ break; -+ -+ case BLOCK_GRABBED: -+ used2grabbed(ctx, sbinfo, *len); -+ break; -+ -+ case BLOCK_UNALLOCATED: -+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED); -+ break; -+ -+ case BLOCK_FLUSH_RESERVED:{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ used2flush_reserved(sbinfo, atom, *len, -+ flags & BA_FORMATTED); -+ spin_unlock_atom(atom); -+ break; -+ } -+ default: -+ impossible("zam-532", "wrong block stage"); -+ } -+ } -+ -+ return 0; -+} -+ -+/* wrappers for block allocator plugin methods */ -+int reiser4_pre_commit_hook(void) -+{ -+ assert("zam-502", get_current_super_private() != NULL); -+ sa_pre_commit_hook(); -+ return 0; -+} -+ -+/* an actor which applies delete set to block allocator data */ -+static int -+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data UNUSED_ARG) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ __u64 len = 1; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT); -+ assert("zam-552", sbinfo != NULL); -+ -+ if (b != NULL) -+ len = *b; -+ -+ if (REISER4_DEBUG) { -+ spin_lock_reiser4_super(sbinfo); -+ -+ assert("zam-554", *a < reiser4_block_count(ctx->super)); -+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len); -+ /* adjust sb block counters */ -+ used2free(sbinfo, len); -+ return 0; -+} -+ -+void reiser4_post_commit_hook(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT); -+ spin_unlock_atom(atom); -+ -+ /* do the block deallocation which was deferred -+ until commit is done */ -+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1); -+ -+ assert("zam-504", get_current_super_private() != NULL); -+ sa_post_commit_hook(); -+} -+ -+void reiser4_post_write_back_hook(void) -+{ -+ assert("zam-504", get_current_super_private() != NULL); -+ -+ sa_post_commit_hook(); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/block_alloc.h linux-2.6.30/fs/reiser4/block_alloc.h ---- linux-2.6.30.orig/fs/reiser4/block_alloc.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/block_alloc.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,177 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined(__FS_REISER4_BLOCK_ALLOC_H__) -+#define __FS_REISER4_BLOCK_ALLOC_H__ -+ -+#include "dformat.h" -+#include "forward.h" -+ -+#include <linux/types.h> /* for __u?? 
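
With the whole deallocation path now visible above, the two modes of
reiser4_dealloc_blocks() can be sketched from the caller's side
(hypothetical helper, not part of the patch):

static int sketch_dealloc(const reiser4_block_nr *start,
                          const reiser4_block_nr *len, int defer)
{
        if (defer)
                /* only record the extent in the atom's delete_set; the
                   space allocator is untouched until
                   reiser4_post_commit_hook() applies the set after
                   commit. @target_stage is ignored in this mode. */
                return reiser4_dealloc_blocks(start, len,
                                              BLOCK_NOT_COUNTED,
                                              BA_DEFER);

        /* immediate: the blocks return to the WORKING bitmap now; with
           BLOCK_GRABBED the freed space stays grabbed by this thread */
        return reiser4_dealloc_blocks(start, len, BLOCK_GRABBED,
                                      BA_FORMATTED);
}
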
*/ -+#include <linux/fs.h> -+ -+/* Mask when is applied to given block number shows is that block number is a -+ fake one */ -+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL -+/* Mask which isolates a type of object this fake block number was assigned -+ to */ -+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL -+ -+/*result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared -+ against these two values to understand is the object unallocated or bitmap -+ shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */ -+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL -+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL -+ -+/* specification how block allocation was counted in sb block counters */ -+typedef enum { -+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */ -+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation -+ of this block */ -+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */ -+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object -+ ( unallocated formatted or unformatted -+ node) */ -+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block -+ number assigned */ -+} block_stage_t; -+ -+/* a hint for block allocator */ -+struct reiser4_blocknr_hint { -+ /* FIXME: I think we want to add a longterm lock on the bitmap block -+ here. This is to prevent jnode_flush() calls from interleaving -+ allocations on the same bitmap, once a hint is established. */ -+ -+ /* search start hint */ -+ reiser4_block_nr blk; -+ /* if not zero, it is a region size we search for free blocks in */ -+ reiser4_block_nr max_dist; -+ /* level for allocation, may be useful have branch-level and higher -+ write-optimized. */ -+ tree_level level; -+ /* block allocator assumes that blocks, which will be mapped to disk, -+ are in this specified block_stage */ -+ block_stage_t block_stage; -+ /* If direction = 1 allocate blocks in backward direction from the end -+ * of disk to the beginning of disk. */ -+ unsigned int backward:1; -+ -+}; -+ -+/* These flags control block allocation/deallocation behavior */ -+enum reiser4_ba_flags { -+ /* do allocatations from reserved (5%) area */ -+ BA_RESERVED = (1 << 0), -+ -+ /* block allocator can do commit trying to recover free space */ -+ BA_CAN_COMMIT = (1 << 1), -+ -+ /* if operation will be applied to formatted block */ -+ BA_FORMATTED = (1 << 2), -+ -+ /* defer actual block freeing until transaction commit */ -+ BA_DEFER = (1 << 3), -+ -+ /* allocate blocks for permanent fs objects (formatted or unformatted), -+ not wandered of log blocks */ -+ BA_PERMANENT = (1 << 4), -+ -+ /* grab space even it was disabled */ -+ BA_FORCE = (1 << 5), -+ -+ /* use default start value for free blocks search. 
*/ -+ BA_USE_DEFAULT_SEARCH_START = (1 << 6) -+}; -+ -+typedef enum reiser4_ba_flags reiser4_ba_flags_t; -+ -+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint); -+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint); -+extern void update_blocknr_hint_default(const struct super_block *, -+ const reiser4_block_nr *); -+extern void get_blocknr_hint_default(reiser4_block_nr *); -+ -+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super); -+ -+int assign_fake_blocknr_formatted(reiser4_block_nr *); -+reiser4_block_nr fake_blocknr_unformatted(int); -+ -+/* free -> grabbed -> fake_allocated -> used */ -+ -+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags); -+void all_grabbed2free(void); -+void grabbed2free(reiser4_context * , reiser4_super_info_data * , __u64 count); -+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags); -+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count); -+void grabbed2flush_reserved(__u64 count); -+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint, -+ reiser4_block_nr * start, -+ reiser4_block_nr * len, reiser4_ba_flags_t flags); -+int reiser4_dealloc_blocks(const reiser4_block_nr *, -+ const reiser4_block_nr *, -+ block_stage_t, reiser4_ba_flags_t flags); -+ -+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint, -+ reiser4_block_nr * start, -+ reiser4_ba_flags_t flags) -+{ -+ reiser4_block_nr one = 1; -+ return reiser4_alloc_blocks(hint, start, &one, flags); -+} -+ -+static inline int reiser4_dealloc_block(const reiser4_block_nr * block, -+ block_stage_t stage, -+ reiser4_ba_flags_t flags) -+{ -+ const reiser4_block_nr one = 1; -+ return reiser4_dealloc_blocks(block, &one, stage, flags); -+} -+ -+#define reiser4_grab_space_force(count, flags) \ -+ reiser4_grab_space(count, flags | BA_FORCE) -+ -+extern void grabbed2free_mark(__u64 mark); -+extern int reiser4_grab_reserved(struct super_block *, -+ __u64, reiser4_ba_flags_t); -+extern void reiser4_release_reserved(struct super_block *super); -+ -+/* grabbed -> fake_allocated */ -+ -+/* fake_allocated -> used */ -+ -+/* used -> fake_allocated -> grabbed -> free */ -+ -+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count); -+ -+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da); -+ -+extern void grabbed2cluster_reserved(int count); -+extern void cluster_reserved2grabbed(int count); -+extern void cluster_reserved2free(int count); -+ -+extern int reiser4_check_block_counters(const struct super_block *); -+ -+#if REISER4_DEBUG -+ -+extern void reiser4_check_block(const reiser4_block_nr *, int); -+ -+#else -+ -+# define reiser4_check_block(beg, val) noop -+ -+#endif -+ -+extern int reiser4_pre_commit_hook(void); -+extern void reiser4_post_commit_hook(void); -+extern void reiser4_post_write_back_hook(void); -+ -+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/blocknrset.c linux-2.6.30/fs/reiser4/blocknrset.c ---- linux-2.6.30.orig/fs/reiser4/blocknrset.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/blocknrset.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,371 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+reiser4/README */ -+ -+/* This file contains code for various block number sets used by the atom to -+ track the deleted set and wandered block mappings. 
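
As an aside on the REISER4_*_STATUS masks declared in block_alloc.h above,
a fake block number's type is tested by comparing its two top bits
(sketch, not part of the patch):

static inline int sketch_is_unallocated(const reiser4_block_nr *blk)
{
        /* top two bits: 11 -> unallocated node, 10 -> bitmap block */
        return (*blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
                REISER4_UNALLOCATED_STATUS_VALUE;
}
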
*/ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "txnmgr.h" -+#include "context.h" -+ -+#include <linux/slab.h> -+ -+/* The proposed data structure for storing unordered block number sets is a -+ list of elements, each of which contains an array of block number or/and -+ array of block number pairs. That element called blocknr_set_entry is used -+ to store block numbers from the beginning and for extents from the end of -+ the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields -+ count numbers of blocks and extents. -+ -+ +------------------- blocknr_set_entry->data ------------------+ -+ |block1|block2| ... <free space> ... |pair3|pair2|pair1| -+ +------------------------------------------------------------+ -+ -+ When current blocknr_set_entry is full, allocate a new one. */ -+ -+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete -+ * set (single blocks and block extents), in that case blocknr pair represent an -+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs -+ * there represent a (real block) -> (wandered block) mapping. */ -+ -+/* Protection: blocknr sets belong to reiser4 atom, and -+ * their modifications are performed with the atom lock held */ -+ -+/* The total size of a blocknr_set_entry. */ -+#define BLOCKNR_SET_ENTRY_SIZE 128 -+ -+/* The number of blocks that can fit the blocknr data area. */ -+#define BLOCKNR_SET_ENTRIES_NUMBER \ -+ ((BLOCKNR_SET_ENTRY_SIZE - \ -+ 2 * sizeof(unsigned) - \ -+ sizeof(struct list_head)) / \ -+ sizeof(reiser4_block_nr)) -+ -+/* An entry of the blocknr_set */ -+struct blocknr_set_entry { -+ unsigned nr_singles; -+ unsigned nr_pairs; -+ struct list_head link; -+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER]; -+}; -+ -+/* A pair of blocks as recorded in the blocknr_set_entry data. */ -+struct blocknr_pair { -+ reiser4_block_nr a; -+ reiser4_block_nr b; -+}; -+ -+/* Return the number of blocknr slots available in a blocknr_set_entry. */ -+/* Audited by: green(2002.06.11) */ -+static unsigned bse_avail(blocknr_set_entry * bse) -+{ -+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs; -+ -+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used); -+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE); -+ -+ return BLOCKNR_SET_ENTRIES_NUMBER - used; -+} -+ -+/* Initialize a blocknr_set_entry. */ -+static void bse_init(blocknr_set_entry *bse) -+{ -+ bse->nr_singles = 0; -+ bse->nr_pairs = 0; -+ INIT_LIST_HEAD(&bse->link); -+} -+ -+/* Allocate and initialize a blocknr_set_entry. */ -+/* Audited by: green(2002.06.11) */ -+static blocknr_set_entry *bse_alloc(void) -+{ -+ blocknr_set_entry *e; -+ -+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry), -+ reiser4_ctx_gfp_mask_get())) == NULL) -+ return NULL; -+ -+ bse_init(e); -+ -+ return e; -+} -+ -+/* Free a blocknr_set_entry. 
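
As a concrete check of the sizing above, on a typical 64-bit build
(sizeof(unsigned) == 4, sizeof(struct list_head) == 16,
sizeof(reiser4_block_nr) == 8):

        BLOCKNR_SET_ENTRIES_NUMBER = (128 - 2*4 - 16) / 8 = 13

so one 128-byte blocknr_set_entry holds up to 13 single block numbers, or
six pairs plus one single; singles are packed from the front of entries[]
and pairs from the back, as bse_get_pair() below shows.
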
*/ -+/* Audited by: green(2002.06.11) */ -+static void bse_free(blocknr_set_entry * bse) -+{ -+ kfree(bse); -+} -+ -+/* Add a block number to a blocknr_set_entry */ -+/* Audited by: green(2002.06.11) */ -+static void -+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block) -+{ -+ assert("jmacd-5099", bse_avail(bse) >= 1); -+ -+ bse->entries[bse->nr_singles++] = *block; -+} -+ -+/* Get a pair of block numbers */ -+/* Audited by: green(2002.06.11) */ -+static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse, -+ unsigned pno) -+{ -+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1)); -+ -+ return (struct blocknr_pair *) (bse->entries + -+ BLOCKNR_SET_ENTRIES_NUMBER - -+ 2 * (pno + 1)); -+} -+ -+/* Add a pair of block numbers to a blocknr_set_entry */ -+/* Audited by: green(2002.06.11) */ -+static void -+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ struct blocknr_pair *pair; -+ -+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL); -+ -+ pair = bse_get_pair(bse, bse->nr_pairs++); -+ -+ pair->a = *a; -+ pair->b = *b; -+} -+ -+/* Add either a block or pair of blocks to the block number set. The first -+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if -+ @b is non-NULL a pair is added. The block number set belongs to atom, and -+ the call is made with the atom lock held. There may not be enough space in -+ the current blocknr_set_entry. If new_bsep points to a non-NULL -+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep -+ will be set to NULL. If new_bsep contains NULL then the atom lock will be -+ released and a new bse will be allocated in new_bsep. E_REPEAT will be -+ returned with the atom unlocked for the operation to be tried again. If -+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not -+ used during the call, it will be freed automatically. */ -+static int blocknr_set_add(txn_atom *atom, struct list_head *bset, -+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a, -+ const reiser4_block_nr *b) -+{ -+ blocknr_set_entry *bse; -+ unsigned entries_needed; -+ -+ assert("jmacd-5101", a != NULL); -+ -+ entries_needed = (b == NULL) ? 1 : 2; -+ if (list_empty(bset) || -+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) { -+ /* See if a bse was previously allocated. */ -+ if (*new_bsep == NULL) { -+ spin_unlock_atom(atom); -+ *new_bsep = bse_alloc(); -+ return (*new_bsep != NULL) ? -E_REPEAT : -+ RETERR(-ENOMEM); -+ } -+ -+ /* Put it on the head of the list. */ -+ list_add(&((*new_bsep)->link), bset); -+ -+ *new_bsep = NULL; -+ } -+ -+ /* Add the single or pair. */ -+ bse = list_entry(bset->next, blocknr_set_entry, link); -+ if (b == NULL) { -+ bse_put_single(bse, a); -+ } else { -+ bse_put_pair(bse, a, b); -+ } -+ -+ /* If new_bsep is non-NULL then there was an allocation race, free this -+ copy. */ -+ if (*new_bsep != NULL) { -+ bse_free(*new_bsep); -+ *new_bsep = NULL; -+ } -+ -+ return 0; -+} -+ -+/* Add an extent to the block set. If the length is 1, it is treated as a -+ single block (e.g., reiser4_set_add_block). */ -+/* Audited by: green(2002.06.11) */ -+/* Auditor note: Entire call chain cannot hold any spinlocks, because -+ kmalloc might schedule. The only exception is atom spinlock, which is -+ properly freed. 
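
From the caller's side, the -E_REPEAT protocol just described looks like
the following sketch (hypothetical helper, not part of the patch;
reiser4_dealloc_blocks() earlier in this diff uses the same loop to fill
the atom's delete_set):

static int sketch_defer_extent(const reiser4_block_nr *start,
                               const reiser4_block_nr *len)
{
        blocknr_set_entry *new_bsep = NULL;
        txn_atom *atom;
        int ret;

        do {
                /* re-take the atom lock on every pass: blocknr_set_add()
                   drops it before kmalloc'ing a fresh entry */
                atom = get_current_atom_locked();
                ret = blocknr_set_add_extent(atom, &atom->delete_set,
                                             &new_bsep, start, len);
                if (ret == -ENOMEM)
                        return ret;     /* atom is already unlocked */
        } while (ret == -E_REPEAT);     /* spins at most twice */

        spin_unlock_atom(atom);
        return 0;
}
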
*/ -+int -+blocknr_set_add_extent(txn_atom * atom, -+ struct list_head *bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * start, -+ const reiser4_block_nr * len) -+{ -+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0); -+ return blocknr_set_add(atom, bset, new_bsep, start, -+ *len == 1 ? NULL : len); -+} -+ -+/* Add a block pair to the block set. It adds exactly a pair, which is checked -+ * by an assertion that both arguments are not null.*/ -+/* Audited by: green(2002.06.11) */ -+/* Auditor note: Entire call chain cannot hold any spinlocks, because -+ kmalloc might schedule. The only exception is atom spinlock, which is -+ properly freed. */ -+int -+blocknr_set_add_pair(txn_atom * atom, -+ struct list_head *bset, -+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ assert("jmacd-5103", a != NULL && b != NULL); -+ return blocknr_set_add(atom, bset, new_bsep, a, b); -+} -+ -+/* Initialize a blocknr_set. */ -+void blocknr_set_init(struct list_head *bset) -+{ -+ INIT_LIST_HEAD(bset); -+} -+ -+/* Release the entries of a blocknr_set. */ -+void blocknr_set_destroy(struct list_head *bset) -+{ -+ blocknr_set_entry *bse; -+ -+ while (!list_empty(bset)) { -+ bse = list_entry(bset->next, blocknr_set_entry, link); -+ list_del_init(&bse->link); -+ bse_free(bse); -+ } -+} -+ -+/* Merge blocknr_set entries out of @from into @into. */ -+/* Audited by: green(2002.06.11) */ -+/* Auditor comments: This merge does not know if merged sets contain -+ blocks pairs (As for wandered sets) or extents, so it cannot really merge -+ overlapping ranges if there is some. So I believe it may lead to -+ some blocks being presented several times in one blocknr_set. To help -+ debugging such problems it might help to check for duplicate entries on -+ actual processing of this set. Testing this kind of stuff right here is -+ also complicated by the fact that these sets are not sorted and going -+ through whole set on each element addition is going to be CPU-heavy task */ -+void blocknr_set_merge(struct list_head *from, struct list_head *into) -+{ -+ blocknr_set_entry *bse_into = NULL; -+ -+ /* If @from is empty, no work to perform. */ -+ if (list_empty(from)) -+ return; -+ /* If @into is not empty, try merging partial-entries. */ -+ if (!list_empty(into)) { -+ -+ /* Neither set is empty, pop the front to members and try to -+ combine them. */ -+ blocknr_set_entry *bse_from; -+ unsigned into_avail; -+ -+ bse_into = list_entry(into->next, blocknr_set_entry, link); -+ list_del_init(&bse_into->link); -+ bse_from = list_entry(from->next, blocknr_set_entry, link); -+ list_del_init(&bse_from->link); -+ -+ /* Combine singles. */ -+ for (into_avail = bse_avail(bse_into); -+ into_avail != 0 && bse_from->nr_singles != 0; -+ into_avail -= 1) { -+ bse_put_single(bse_into, -+ &bse_from->entries[--bse_from-> -+ nr_singles]); -+ } -+ -+ /* Combine pairs. */ -+ for (; into_avail > 1 && bse_from->nr_pairs != 0; -+ into_avail -= 2) { -+ struct blocknr_pair *pair = -+ bse_get_pair(bse_from, --bse_from->nr_pairs); -+ bse_put_pair(bse_into, &pair->a, &pair->b); -+ } -+ -+ /* If bse_from is empty, delete it now. */ -+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) { -+ bse_free(bse_from); -+ } else { -+ /* Otherwise, bse_into is full or nearly full (e.g., -+ it could have one slot avail and bse_from has one -+ pair left). Push it back onto the list. bse_from -+ becomes bse_into, which will be the new partial. 
*/ -+ list_add(&bse_into->link, into); -+ bse_into = bse_from; -+ } -+ } -+ -+ /* Splice lists together. */ -+ list_splice_init(from, into->prev); -+ -+ /* Add the partial entry back to the head of the list. */ -+ if (bse_into != NULL) -+ list_add(&bse_into->link, into); -+} -+ -+/* Iterate over all blocknr set elements. */ -+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset, -+ blocknr_set_actor_f actor, void *data, int delete) -+{ -+ -+ blocknr_set_entry *entry; -+ -+ assert("zam-429", atom != NULL); -+ assert("zam-430", atom_is_protected(atom)); -+ assert("zam-431", bset != 0); -+ assert("zam-432", actor != NULL); -+ -+ entry = list_entry(bset->next, blocknr_set_entry, link); -+ while (bset != &entry->link) { -+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link); -+ unsigned int i; -+ int ret; -+ -+ for (i = 0; i < entry->nr_singles; i++) { -+ ret = actor(atom, &entry->entries[i], NULL, data); -+ -+ /* We can't break a loop if delete flag is set. */ -+ if (ret != 0 && !delete) -+ return ret; -+ } -+ -+ for (i = 0; i < entry->nr_pairs; i++) { -+ struct blocknr_pair *ab; -+ -+ ab = bse_get_pair(entry, i); -+ -+ ret = actor(atom, &ab->a, &ab->b, data); -+ -+ if (ret != 0 && !delete) -+ return ret; -+ } -+ -+ if (delete) { -+ list_del(&entry->link); -+ bse_free(entry); -+ } -+ -+ entry = tmp; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/carry.c linux-2.6.30/fs/reiser4/carry.c ---- linux-2.6.30.orig/fs/reiser4/carry.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/carry.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1398 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+/* Functions to "carry" tree modification(s) upward. */ -+/* Tree is modified one level at a time. As we modify a level we accumulate a -+ set of changes that need to be propagated to the next level. We manage -+ node locking such that any searches that collide with carrying are -+ restarted, from the root if necessary. -+ -+ Insertion of a new item may result in items being moved among nodes and -+ this requires the delimiting key to be updated at the least common parent -+ of the nodes modified to preserve search tree invariants. Also, insertion -+ may require allocation of a new node. A pointer to the new node has to be -+ inserted into some node on the parent level, etc. -+ -+ Tree carrying is meant to be analogous to arithmetic carrying. -+ -+ A carry operation is always associated with some node (&carry_node). -+ -+ Carry process starts with some initial set of operations to be performed -+ and an initial set of already locked nodes. Operations are performed one -+ by one. Performing each single operation has following possible effects: -+ -+ - content of carry node associated with operation is modified -+ - new carry nodes are locked and involved into carry process on this level -+ - new carry operations are posted to the next level -+ -+ After all carry operations on this level are done, process is repeated for -+ the accumulated sequence on carry operations for the next level. This -+ starts by trying to lock (in left to right order) all carry nodes -+ associated with carry operations on the parent level. After this, we decide -+ whether more nodes are required on the left of already locked set. 
If so, -+ all locks taken on the parent level are released, new carry nodes are -+ added, and locking process repeats. -+ -+ It may happen that balancing process fails owing to unrecoverable error on -+ some of upper levels of a tree (possible causes are io error, failure to -+ allocate new node, etc.). In this case we should unmount the filesystem, -+ rebooting if it is the root, and possibly advise the use of fsck. -+ -+ USAGE: -+ -+ int some_tree_operation( znode *node, ... ) -+ { -+ // Allocate on a stack pool of carry objects: operations and nodes. -+ // Most carry processes will only take objects from here, without -+ // dynamic allocation. -+ -+I feel uneasy about this pool. It adds to code complexity, I understand why it -+exists, but.... -Hans -+ -+ carry_pool pool; -+ carry_level lowest_level; -+ carry_op *op; -+ -+ init_carry_pool( &pool ); -+ init_carry_level( &lowest_level, &pool ); -+ -+ // operation may be one of: -+ // COP_INSERT --- insert new item into node -+ // COP_CUT --- remove part of or whole node -+ // COP_PASTE --- increase size of item -+ // COP_DELETE --- delete pointer from parent node -+ // COP_UPDATE --- update delimiting key in least -+ // common ancestor of two -+ -+ op = reiser4_post_carry( &lowest_level, operation, node, 0 ); -+ if( IS_ERR( op ) || ( op == NULL ) ) { -+ handle error -+ } else { -+ // fill in remaining fields in @op, according to carry.h:carry_op -+ result = carry(&lowest_level, NULL); -+ } -+ done_carry_pool(&pool); -+ } -+ -+ When you are implementing node plugin method that participates in carry -+ (shifting, insertion, deletion, etc.), do the following: -+ -+ int foo_node_method(znode * node, ..., carry_level * todo) -+ { -+ carry_op *op; -+ -+ .... -+ -+ // note, that last argument to reiser4_post_carry() is non-null -+ // here, because @op is to be applied to the parent of @node, rather -+ // than to the @node itself as in the previous case. -+ -+ op = node_post_carry(todo, operation, node, 1); -+ // fill in remaining fields in @op, according to carry.h:carry_op -+ -+ .... -+ -+ } -+ -+ BATCHING: -+ -+ One of the main advantages of level-by-level balancing implemented here is -+ ability to batch updates on a parent level and to peform them more -+ efficiently as a result. -+ -+ Description To Be Done (TBD). -+ -+ DIFFICULTIES AND SUBTLE POINTS: -+ -+ 1. complex plumbing is required, because: -+ -+ a. effective allocation through pools is needed -+ -+ b. target of operation is not exactly known when operation is -+ posted. This is worked around through bitfields in &carry_node and -+ logic in lock_carry_node() -+ -+ c. of interaction with locking code: node should be added into sibling -+ list when pointer to it is inserted into its parent, which is some time -+ after node was created. Between these moments, node is somewhat in -+ suspended state and is only registered in the carry lists -+ -+ 2. whole balancing logic is implemented here, in particular, insertion -+ logic is coded in make_space(). -+ -+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion -+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert -+ (insert_paste()) have to be handled. -+ -+ 4. there is non-trivial interdependency between allocation of new nodes -+ and almost everything else. This is mainly due to the (1.c) above. I shall -+ write about this later. 
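
Note that the USAGE fragment above predates the final pool API:
init_carry_pool() now allocates the pool itself, and reiser4_carry()
expects two scratch levels contiguous with @doing. With the signatures
found in this file, a caller looks roughly like this (hypothetical sketch,
not part of the patch):

static int sketch_tree_operation(znode *node)
{
        carry_pool *pool;
        carry_level *lowest_level;
        carry_op *op;
        int result;

        /* one allocation carries the pool plus the three contiguous
           carry_levels that reiser4_carry() expects */
        pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
        if (IS_ERR(pool))
                return PTR_ERR(pool);
        lowest_level = (carry_level *) (pool + 1);
        init_carry_level(lowest_level, pool);

        /* @node must be write-locked here; reiser4_post_carry()
           asserts this */
        op = reiser4_post_carry(lowest_level, COP_INSERT, node, 0);
        if (IS_ERR(op) || op == NULL) {
                /* op == NULL is treated as allocation failure here */
                result = op ? PTR_ERR(op) : RETERR(-ENOMEM);
        } else {
                /* fill in the remaining fields in @op, according to
                   carry.h:carry_op */
                result = reiser4_carry(lowest_level, NULL);
        }
        done_carry_pool(pool);
        return result;
}
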
-+ -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/item/extent.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_mod.h" -+#include "tree_walk.h" -+#include "block_alloc.h" -+#include "pool.h" -+#include "tree.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include <linux/types.h> -+ -+/* level locking/unlocking */ -+static int lock_carry_level(carry_level * level); -+static void unlock_carry_level(carry_level * level, int failure); -+static void done_carry_level(carry_level * level); -+static void unlock_carry_node(carry_level * level, carry_node * node, int fail); -+ -+int lock_carry_node(carry_level * level, carry_node * node); -+int lock_carry_node_tail(carry_node * node); -+ -+/* carry processing proper */ -+static int carry_on_level(carry_level * doing, carry_level * todo); -+ -+static carry_op *add_op(carry_level * level, pool_ordering order, -+ carry_op * reference); -+ -+/* handlers for carry operations. */ -+ -+static void fatal_carry_error(carry_level * doing, int ecode); -+static int add_new_root(carry_level * level, carry_node * node, znode * fake); -+ -+static void print_level(const char *prefix, carry_level * level); -+ -+#if REISER4_DEBUG -+typedef enum { -+ CARRY_TODO, -+ CARRY_DOING -+} carry_queue_state; -+static int carry_level_invariant(carry_level * level, carry_queue_state state); -+#endif -+ -+/* main entry point for tree balancing. -+ -+ Tree carry performs operations from @doing and while doing so accumulates -+ information about operations to be performed on the next level ("carried" -+ to the parent level). Carried operations are performed, causing possibly -+ more operations to be carried upward etc. carry() takes care about -+ locking and pinning znodes while operating on them. -+ -+ For usage, see comment at the top of fs/reiser4/carry.c -+ -+*/ -+int reiser4_carry(carry_level * doing /* set of carry operations to be -+ * performed */ , -+ carry_level * done /* set of nodes, already performed -+ * at the previous level. -+ * NULL in most cases */) -+{ -+ int result = 0; -+ /* queue of new requests */ -+ carry_level *todo; -+ ON_DEBUG(STORE_COUNTERS); -+ -+ assert("nikita-888", doing != NULL); -+ BUG_ON(done != NULL); -+ -+ todo = doing + 1; -+ init_carry_level(todo, doing->pool); -+ -+ /* queue of requests preformed on the previous level */ -+ done = todo + 1; -+ init_carry_level(done, doing->pool); -+ -+ /* iterate until there is nothing more to do */ -+ while (result == 0 && doing->ops_num > 0) { -+ carry_level *tmp; -+ -+ /* at this point @done is locked. */ -+ /* repeat lock/do/unlock while -+ -+ (1) lock_carry_level() fails due to deadlock avoidance, or -+ -+ (2) carry_on_level() decides that more nodes have to -+ be involved. -+ -+ (3) some unexpected error occurred while balancing on the -+ upper levels. In this case all changes are rolled back. 
-+ -+ */ -+ while (1) { -+ result = lock_carry_level(doing); -+ if (result == 0) { -+ /* perform operations from @doing and -+ accumulate new requests in @todo */ -+ result = carry_on_level(doing, todo); -+ if (result == 0) -+ break; -+ else if (result != -E_REPEAT || -+ !doing->restartable) { -+ warning("nikita-1043", -+ "Fatal error during carry: %i", -+ result); -+ print_level("done", done); -+ print_level("doing", doing); -+ print_level("todo", todo); -+ /* do some rough stuff like aborting -+ all pending transcrashes and thus -+ pushing tree back to the consistent -+ state. Alternatvely, just panic. -+ */ -+ fatal_carry_error(doing, result); -+ return result; -+ } -+ } else if (result != -E_REPEAT) { -+ fatal_carry_error(doing, result); -+ return result; -+ } -+ unlock_carry_level(doing, 1); -+ } -+ /* at this point @done can be safely unlocked */ -+ done_carry_level(done); -+ -+ /* cyclically shift queues */ -+ tmp = done; -+ done = doing; -+ doing = todo; -+ todo = tmp; -+ init_carry_level(todo, doing->pool); -+ -+ /* give other threads chance to run */ -+ reiser4_preempt_point(); -+ } -+ done_carry_level(done); -+ -+ /* all counters, but x_refs should remain the same. x_refs can change -+ owing to transaction manager */ -+ ON_DEBUG(CHECK_COUNTERS); -+ return result; -+} -+ -+/* perform carry operations on given level. -+ -+ Optimizations proposed by pooh: -+ -+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as -+ required; -+ -+ (2) unlock node if there are no more operations to be performed upon it and -+ node didn't add any operation to @todo. This can be implemented by -+ attaching to each node two counters: counter of operaions working on this -+ node and counter and operations carried upward from this node. -+ -+*/ -+static int carry_on_level(carry_level * doing /* queue of carry operations to -+ * do on this level */ , -+ carry_level * todo /* queue where new carry -+ * operations to be performed on -+ * the * parent level are -+ * accumulated during @doing -+ * processing. */ ) -+{ -+ int result; -+ int (*f) (carry_op *, carry_level *, carry_level *); -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ assert("nikita-1034", doing != NULL); -+ assert("nikita-1035", todo != NULL); -+ -+ /* @doing->nodes are locked. */ -+ -+ /* This function can be split into two phases: analysis and modification -+ -+ Analysis calculates precisely what items should be moved between -+ nodes. This information is gathered in some structures attached to -+ each carry_node in a @doing queue. Analysis also determines whether -+ new nodes are to be allocated etc. -+ -+ After analysis is completed, actual modification is performed. Here -+ we can take advantage of "batch modification": if there are several -+ operations acting on the same node, modifications can be performed -+ more efficiently when batched together. -+ -+ Above is an optimization left for the future. -+ */ -+ /* Important, but delayed optimization: it's possible to batch -+ operations together and perform them more efficiently as a -+ result. For example, deletion of several neighboring items from a -+ node can be converted to a single ->cut() operation. -+ -+ Before processing queue, it should be scanned and "mergeable" -+ operations merged. 
-+ */ -+ result = 0; -+ for_all_ops(doing, op, tmp_op) { -+ carry_opcode opcode; -+ -+ assert("nikita-1041", op != NULL); -+ opcode = op->op; -+ assert("nikita-1042", op->op < COP_LAST_OP); -+ f = op_dispatch_table[op->op].handler; -+ result = f(op, doing, todo); -+ /* locking can fail with -E_REPEAT. Any different error is fatal -+ and will be handled by fatal_carry_error() sledgehammer. -+ */ -+ if (result != 0) -+ break; -+ } -+ if (result == 0) { -+ carry_plugin_info info; -+ carry_node *scan; -+ carry_node *tmp_scan; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ assert("nikita-3002", -+ carry_level_invariant(doing, CARRY_DOING)); -+ for_all_nodes(doing, scan, tmp_scan) { -+ znode *node; -+ -+ node = reiser4_carry_real(scan); -+ assert("nikita-2547", node != NULL); -+ if (node_is_empty(node)) { -+ result = -+ node_plugin_by_node(node)-> -+ prepare_removal(node, &info); -+ if (result != 0) -+ break; -+ } -+ } -+ } -+ return result; -+} -+ -+/* post carry operation -+ -+ This is main function used by external carry clients: node layout plugins -+ and tree operations to create new carry operation to be performed on some -+ level. -+ -+ New operation will be included in the @level queue. To actually perform it, -+ call carry( level, ... ). This function takes write lock on @node. Carry -+ manages all its locks by itself, don't worry about this. -+ -+ This function adds operation and node at the end of the queue. It is up to -+ caller to guarantee proper ordering of node queue. -+ -+*/ -+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation -+ * is to be posted at */ , -+ carry_opcode op /* opcode of operation */ , -+ znode * node /* node on which this operation -+ * will operate */ , -+ int apply_to_parent_p /* whether operation will -+ * operate directly on @node -+ * or on it parent. 
*/) -+{ -+ carry_op *result; -+ carry_node *child; -+ -+ assert("nikita-1046", level != NULL); -+ assert("nikita-1788", znode_is_write_locked(node)); -+ -+ result = add_op(level, POOLO_LAST, NULL); -+ if (IS_ERR(result)) -+ return result; -+ child = reiser4_add_carry(level, POOLO_LAST, NULL); -+ if (IS_ERR(child)) { -+ reiser4_pool_free(&level->pool->op_pool, &result->header); -+ return (carry_op *) child; -+ } -+ result->node = child; -+ result->op = op; -+ child->parent = apply_to_parent_p; -+ if (ZF_ISSET(node, JNODE_ORPHAN)) -+ child->left_before = 1; -+ child->node = node; -+ return result; -+} -+ -+/* initialize carry queue */ -+void init_carry_level(carry_level * level /* level to initialize */ , -+ carry_pool * pool /* pool @level will allocate objects -+ * from */ ) -+{ -+ assert("nikita-1045", level != NULL); -+ assert("nikita-967", pool != NULL); -+ -+ memset(level, 0, sizeof *level); -+ level->pool = pool; -+ -+ INIT_LIST_HEAD(&level->nodes); -+ INIT_LIST_HEAD(&level->ops); -+} -+ -+/* allocate carry pool and initialize pools within queue */ -+carry_pool *init_carry_pool(int size) -+{ -+ carry_pool *pool; -+ -+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level)); -+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get()); -+ if (pool == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE, -+ (char *)pool->op); -+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node), -+ NODES_LOCKED_POOL_SIZE, (char *)pool->node); -+ return pool; -+} -+ -+/* finish with queue pools */ -+void done_carry_pool(carry_pool * pool/* pool to destroy */) -+{ -+ reiser4_done_pool(&pool->op_pool); -+ reiser4_done_pool(&pool->node_pool); -+ kfree(pool); -+} -+ -+/* add new carry node to the @level. -+ -+ Returns pointer to the new carry node allocated from pool. It's up to -+ callers to maintain proper order in the @level. Assumption is that if carry -+ nodes on one level are already sorted and modifications are peroformed from -+ left to right, carry nodes added on the parent level will be ordered -+ automatically. To control ordering use @order and @reference parameters. 
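
To make the placement parameters concrete (hypothetical fragment, not part
of the patch):

static void sketch_queue_placement(carry_level *level, carry_node *ref)
{
        carry_node *last;
        carry_node *near;

        /* plain append; keeping the overall left-to-right order is up
           to the caller */
        last = reiser4_add_carry(level, POOLO_LAST, NULL);

        /* relative placement: the _skip variant first re-anchors @ref
           among carry nodes resolving to the same real znode before
           inserting */
        near = reiser4_add_carry_skip(level, POOLO_AFTER, ref);

        (void)last;
        (void)near;     /* sketch only */
}
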
-+ -+*/ -+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add -+ * node to */ , -+ pool_ordering order /* where to insert: -+ * at the beginning of -+ * @level, -+ * before @reference, -+ * after @reference, -+ * at the end of @level -+ */ , -+ carry_node * reference/* reference node for -+ * insertion */) -+{ -+ ON_DEBUG(carry_node * orig_ref = reference); -+ -+ if (order == POOLO_BEFORE) { -+ reference = find_left_carry(reference, level); -+ if (reference == NULL) -+ reference = list_entry(level->nodes.next, carry_node, -+ header.level_linkage); -+ else -+ reference = list_entry(reference->header.level_linkage.next, -+ carry_node, header.level_linkage); -+ } else if (order == POOLO_AFTER) { -+ reference = find_right_carry(reference, level); -+ if (reference == NULL) -+ reference = list_entry(level->nodes.prev, carry_node, -+ header.level_linkage); -+ else -+ reference = list_entry(reference->header.level_linkage.prev, -+ carry_node, header.level_linkage); -+ } -+ assert("nikita-2209", -+ ergo(orig_ref != NULL, -+ reiser4_carry_real(reference) == -+ reiser4_carry_real(orig_ref))); -+ return reiser4_add_carry(level, order, reference); -+} -+ -+carry_node *reiser4_add_carry(carry_level * level, /* carry_level to add -+ node to */ -+ pool_ordering order, /* where to insert: -+ * at the beginning of -+ * @level; -+ * before @reference; -+ * after @reference; -+ * at the end of @level -+ */ -+ carry_node * reference /* reference node for -+ * insertion */) -+{ -+ carry_node *result; -+ -+ result = -+ (carry_node *) reiser4_add_obj(&level->pool->node_pool, -+ &level->nodes, -+ order, &reference->header); -+ if (!IS_ERR(result) && (result != NULL)) -+ ++level->nodes_num; -+ return result; -+} -+ -+/** -+ * add new carry operation to the @level. -+ * -+ * Returns pointer to the new carry operations allocated from pool. It's up to -+ * callers to maintain proper order in the @level. To control ordering use -+ * @order and @reference parameters. -+ */ -+static carry_op *add_op(carry_level * level, /* &carry_level to add node to */ -+ pool_ordering order, /* where to insert: -+ * at the beginning of @level; -+ * before @reference; -+ * after @reference; -+ * at the end of @level */ -+ carry_op * reference /* reference node for insertion */) -+{ -+ carry_op *result; -+ -+ result = -+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops, -+ order, &reference->header); -+ if (!IS_ERR(result) && (result != NULL)) -+ ++level->ops_num; -+ return result; -+} -+ -+/** -+ * Return node on the right of which @node was created. -+ * -+ * Each node is created on the right of some existing node (or it is new root, -+ * which is special case not handled here). -+ * -+ * @node is new node created on some level, but not yet inserted into its -+ * parent, it has corresponding bit (JNODE_ORPHAN) set in zstate. 
-+ */ -+static carry_node *find_begetting_brother(carry_node * node,/* node to start -+ search from */ -+ carry_level * kin UNUSED_ARG -+ /* level to scan */) -+{ -+ carry_node *scan; -+ -+ assert("nikita-1614", node != NULL); -+ assert("nikita-1615", kin != NULL); -+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL, -+ ZF_ISSET(reiser4_carry_real(node), -+ JNODE_ORPHAN))); -+ for (scan = node;; -+ scan = list_entry(scan->header.level_linkage.prev, carry_node, -+ header.level_linkage)) { -+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage); -+ if ((scan->node != node->node) && -+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) { -+ assert("nikita-1618", reiser4_carry_real(scan) != NULL); -+ break; -+ } -+ } -+ return scan; -+} -+ -+static cmp_t -+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2) -+{ -+ assert("nikita-2199", n1 != NULL); -+ assert("nikita-2200", n2 != NULL); -+ -+ if (n1 == n2) -+ return EQUAL_TO; -+ while (1) { -+ n1 = carry_node_next(n1); -+ if (carry_node_end(level, n1)) -+ return GREATER_THAN; -+ if (n1 == n2) -+ return LESS_THAN; -+ } -+ impossible("nikita-2201", "End of level reached"); -+} -+ -+carry_node *find_carry_node(carry_level * level, const znode * node) -+{ -+ carry_node *scan; -+ carry_node *tmp_scan; -+ -+ assert("nikita-2202", level != NULL); -+ assert("nikita-2203", node != NULL); -+ -+ for_all_nodes(level, scan, tmp_scan) { -+ if (reiser4_carry_real(scan) == node) -+ return scan; -+ } -+ return NULL; -+} -+ -+znode *reiser4_carry_real(const carry_node * node) -+{ -+ assert("nikita-3061", node != NULL); -+ -+ return node->lock_handle.node; -+} -+ -+carry_node *insert_carry_node(carry_level * doing, carry_level * todo, -+ const znode * node) -+{ -+ carry_node *base; -+ carry_node *scan; -+ carry_node *tmp_scan; -+ carry_node *proj; -+ -+ base = find_carry_node(doing, node); -+ assert("nikita-2204", base != NULL); -+ -+ for_all_nodes(todo, scan, tmp_scan) { -+ proj = find_carry_node(doing, scan->node); -+ assert("nikita-2205", proj != NULL); -+ if (carry_node_cmp(doing, proj, base) != LESS_THAN) -+ break; -+ } -+ return scan; -+} -+ -+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo, -+ znode * node) -+{ -+ carry_node *reference; -+ -+ assert("nikita-2994", doing != NULL); -+ assert("nikita-2995", todo != NULL); -+ assert("nikita-2996", node != NULL); -+ -+ reference = insert_carry_node(doing, todo, node); -+ assert("nikita-2997", reference != NULL); -+ -+ return reiser4_add_carry(todo, POOLO_BEFORE, reference); -+} -+ -+/* like reiser4_post_carry(), but designed to be called from node plugin -+ methods. This function is different from reiser4_post_carry() in that it -+ finds proper place to insert node in the queue. */ -+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters -+ * passed down to node -+ * plugin */ , -+ carry_opcode op /* opcode of operation */ , -+ znode * node /* node on which this -+ * operation will operate */ , -+ int apply_to_parent_p /* whether operation will -+ * operate directly on @node -+ * or on it parent. 
*/ ) -+{ -+ carry_op *result; -+ carry_node *child; -+ -+ assert("nikita-2207", info != NULL); -+ assert("nikita-2208", info->todo != NULL); -+ -+ if (info->doing == NULL) -+ return reiser4_post_carry(info->todo, op, node, -+ apply_to_parent_p); -+ -+ result = add_op(info->todo, POOLO_LAST, NULL); -+ if (IS_ERR(result)) -+ return result; -+ child = add_carry_atplace(info->doing, info->todo, node); -+ if (IS_ERR(child)) { -+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header); -+ return (carry_op *) child; -+ } -+ result->node = child; -+ result->op = op; -+ child->parent = apply_to_parent_p; -+ if (ZF_ISSET(node, JNODE_ORPHAN)) -+ child->left_before = 1; -+ child->node = node; -+ return result; -+} -+ -+/* lock all carry nodes in @level */ -+static int lock_carry_level(carry_level * level/* level to lock */) -+{ -+ int result; -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ assert("nikita-881", level != NULL); -+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO)); -+ -+ /* lock nodes from left to right */ -+ result = 0; -+ for_all_nodes(level, node, tmp_node) { -+ result = lock_carry_node(level, node); -+ if (result != 0) -+ break; -+ } -+ return result; -+} -+ -+/* Synchronize delimiting keys between @node and its left neighbor. -+ -+ To reduce contention on dk key and simplify carry code, we synchronize -+ delimiting keys only when carry ultimately leaves tree level (carrying -+ changes upward) and unlocks nodes at this level. -+ -+ This function first finds left neighbor of @node and then updates left -+ neighbor's right delimiting key to conincide with least key in @node. -+ -+*/ -+ -+ON_DEBUG(extern atomic_t delim_key_version; -+ ) -+ -+static void sync_dkeys(znode * spot/* node to update */) -+{ -+ reiser4_key pivot; -+ reiser4_tree *tree; -+ -+ assert("nikita-1610", spot != NULL); -+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk)); -+ -+ tree = znode_get_tree(spot); -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ -+ assert("nikita-2192", znode_is_loaded(spot)); -+ -+ /* sync left delimiting key of @spot with key in its leftmost item */ -+ if (node_is_empty(spot)) -+ pivot = *znode_get_rd_key(spot); -+ else -+ leftmost_key_in_node(spot, &pivot); -+ -+ znode_set_ld_key(spot, &pivot); -+ -+ /* there can be sequence of empty nodes pending removal on the left of -+ @spot. Scan them and update their left and right delimiting keys to -+ match left delimiting key of @spot. Also, update right delimiting -+ key of first non-empty left neighbor. -+ */ -+ while (1) { -+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED)) -+ break; -+ -+ spot = spot->left; -+ if (spot == NULL) -+ break; -+ -+ znode_set_rd_key(spot, &pivot); -+ /* don't sink into the domain of another balancing */ -+ if (!znode_is_write_locked(spot)) -+ break; -+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE)) -+ znode_set_ld_key(spot, &pivot); -+ else -+ break; -+ } -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* unlock all carry nodes in @level */ -+static void unlock_carry_level(carry_level * level /* level to unlock */ , -+ int failure /* true if unlocking owing to -+ * failure */ ) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ assert("nikita-889", level != NULL); -+ -+ if (!failure) { -+ znode *spot; -+ -+ spot = NULL; -+ /* update delimiting keys */ -+ for_all_nodes(level, node, tmp_node) { -+ if (reiser4_carry_real(node) != spot) { -+ spot = reiser4_carry_real(node); -+ sync_dkeys(spot); -+ } -+ } -+ } -+ -+ /* nodes can be unlocked in arbitrary order. 
In a preemptible -+ environment it's better to unlock in reverse order of locking, -+ though. -+ */ -+ for_all_nodes_back(level, node, tmp_node) { -+ /* all allocated nodes should be already linked to their -+ parents at this moment. */ -+ assert("nikita-1631", -+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node), -+ JNODE_ORPHAN))); -+ ON_DEBUG(check_dkeys(reiser4_carry_real(node))); -+ unlock_carry_node(level, node, failure); -+ } -+ level->new_root = NULL; -+} -+ -+/* finish with @level -+ -+ Unlock nodes and release all allocated resources */ -+static void done_carry_level(carry_level * level/* level to finish */) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ assert("nikita-1076", level != NULL); -+ -+ unlock_carry_level(level, 0); -+ for_all_nodes(level, node, tmp_node) { -+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link)); -+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link)); -+ reiser4_pool_free(&level->pool->node_pool, &node->header); -+ } -+ for_all_ops(level, op, tmp_op) -+ reiser4_pool_free(&level->pool->op_pool, &op->header); -+} -+ -+/* helper function to complete locking of carry node -+ -+ Finish locking of carry node. There are several ways in which a new carry -+ node can be added into a carry level and locked. The normal one is through -+ lock_carry_node(), but also from find_{left|right}_neighbor(). This -+ function factors out the common final part of all locking scenarios. It -+ assumes that @node->lock_handle is the lock handle for the lock just taken -+ and fills ->real_node from this lock handle. -+ -+*/ -+int lock_carry_node_tail(carry_node * node/* node to complete locking of */) -+{ -+ assert("nikita-1052", node != NULL); -+ assert("nikita-1187", reiser4_carry_real(node) != NULL); -+ assert("nikita-1188", !node->unlock); -+ -+ node->unlock = 1; -+ /* Load node content into memory and install node plugin by -+ looking at the node header. -+ -+ Most of the time this call is cheap because the node is -+ already in memory. -+ -+ Corresponding zrelse() is in unlock_carry_node() -+ */ -+ return zload(reiser4_carry_real(node)); -+} -+ -+/* lock carry node -+ -+ "Resolve" node to real znode, lock it and mark as locked. -+ This requires recursive locking of znodes. -+ -+ When an operation is posted to the parent level, the node it will be -+ applied to is not yet known. For example, when shifting data between two -+ nodes, the delimiting key has to be updated in the parent or parents of the -+ nodes involved. But their parents are not yet locked and, moreover, said -+ nodes can be reparented by concurrent balancing. -+ -+ To work around this, the carry operation is applied to a special "carry -+ node" rather than to the znode itself. A carry node consists of some "base" -+ or "reference" znode and flags indicating how to get to the target of the -+ carry operation (->real_node field of carry_node) from the base. -+ -+*/ -+int lock_carry_node(carry_level * level /* level @node is in */ , -+ carry_node * node/* node to lock */) -+{ -+ int result; -+ znode *reference_point; -+ lock_handle lh; -+ lock_handle tmp_lh; -+ reiser4_tree *tree; -+ -+ assert("nikita-887", level != NULL); -+ assert("nikita-882", node != NULL); -+ -+ result = 0; -+ reference_point = node->node; -+ init_lh(&lh); -+ init_lh(&tmp_lh); -+ if (node->left_before) { -+ /* handling of new nodes, allocated on the previous level: -+ -+ some carry ops were probably posted from the new node, but -+ this node neither has parent pointer set, nor is -+ connected. 
This will be done in ->create_hook() for -+ internal item. -+ -+ Nonetheless, the parent of the new node has to be locked. To do -+ this, first go to the "left" in the carry order. This -+ depends on the decision to always allocate new node on the -+ right of existing one. -+ -+ The loop handles the case when multiple nodes, all orphans, -+ were inserted. -+ -+ Strictly speaking, taking tree lock is not necessary here, -+ because all nodes scanned by loop in -+ find_begetting_brother() are write-locked by this thread, -+ and thus, their sibling linkage cannot change. -+ -+ */ -+ tree = znode_get_tree(reference_point); -+ read_lock_tree(tree); -+ reference_point = find_begetting_brother(node, level)->node; -+ read_unlock_tree(tree); -+ assert("nikita-1186", reference_point != NULL); -+ } -+ if (node->parent && (result == 0)) { -+ result = -+ reiser4_get_parent(&tmp_lh, reference_point, -+ ZNODE_WRITE_LOCK); -+ if (result != 0) { -+ ; /* nothing */ -+ } else if (znode_get_level(tmp_lh.node) == 0) { -+ assert("nikita-1347", znode_above_root(tmp_lh.node)); -+ result = add_new_root(level, node, tmp_lh.node); -+ if (result == 0) { -+ reference_point = level->new_root; -+ move_lh(&lh, &node->lock_handle); -+ } -+ } else if ((level->new_root != NULL) -+ && (level->new_root != -+ znode_parent_nolock(reference_point))) { -+ /* parent of node exists, but this level already -+ created a different new root, so */ -+ warning("nikita-1109", -+ /* it should be "radicis", but tradition is -+ tradition. do banshees read latin? */ -+ "hodie natus est radici frater"); -+ result = -EIO; -+ } else { -+ move_lh(&lh, &tmp_lh); -+ reference_point = lh.node; -+ } -+ } -+ if (node->left && (result == 0)) { -+ assert("nikita-1183", node->parent); -+ assert("nikita-883", reference_point != NULL); -+ result = -+ reiser4_get_left_neighbor(&tmp_lh, reference_point, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == 0) { -+ done_lh(&lh); -+ move_lh(&lh, &tmp_lh); -+ reference_point = lh.node; -+ } -+ } -+ if (!node->parent && !node->left && !node->left_before) { -+ result = -+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI); -+ } -+ if (result == 0) { -+ move_lh(&node->lock_handle, &lh); -+ result = lock_carry_node_tail(node); -+ } -+ done_lh(&tmp_lh); -+ done_lh(&lh); -+ return result; -+} -+ -+/* release a lock on &carry_node. -+ -+ Release, if necessary, the lock on @node. This operation is the pair of -+ lock_carry_node() and is idempotent: you can call it more than once on the -+ same node. -+ -+*/ -+static void -+unlock_carry_node(carry_level * level, -+ carry_node * node /* node to be released */ , -+ int failure /* non-0 if node is unlocked due -+ * to some error */ ) -+{ -+ znode *real_node; -+ -+ assert("nikita-884", node != NULL); -+ -+ real_node = reiser4_carry_real(node); -+ /* pair to zload() in lock_carry_node_tail() */ -+ zrelse(real_node); -+ if (node->unlock && (real_node != NULL)) { -+ assert("nikita-899", real_node == node->lock_handle.node); -+ longterm_unlock_znode(&node->lock_handle); -+ } -+ if (failure) { -+ if (node->deallocate && (real_node != NULL)) { -+ /* free node in bitmap -+ -+ Prepare node for removal. Last zput() will finish -+ with it. 
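The lock_carry_node() comment above describes the key indirection: a carry node names its target as a base znode plus "how to get there" bits (parent, left), resolved only at lock time because the real target can be reparented meanwhile. A standalone sketch of that "base plus directions" resolution, illustration only, with invented toy_* types:

/* Illustration only: resolve a reference lazily, the way
 * lock_carry_node() resolves a carry_node to a real znode. */
struct toy_znode {
        struct toy_znode *parent;
        struct toy_znode *left;
};

struct toy_ref {
        struct toy_znode *base;
        unsigned parent:1;      /* target is the parent of base */
        unsigned left:1;        /* ...then the left neighbour of that */
};

static struct toy_znode *toy_resolve(const struct toy_ref *ref)
{
        struct toy_znode *z = ref->base;

        if (ref->parent)
                z = z->parent;  /* may change until the lock is held */
        if (ref->left)
                z = z->left;
        return z;
}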
-+ */ -+ ZF_SET(real_node, JNODE_HEARD_BANSHEE); -+ } -+ if (node->free) { -+ assert("nikita-2177", -+ list_empty_careful(&node->lock_handle.locks_link)); -+ assert("nikita-2112", -+ list_empty_careful(&node->lock_handle.owners_link)); -+ reiser4_pool_free(&level->pool->node_pool, -+ &node->header); -+ } -+ } -+} -+ -+/* fatal_carry_error() - all-catching error handling function -+ -+ It is possible that carry faces an unrecoverable error, like the inability -+ to insert a pointer at the internal level. Our simple solution is to just -+ panic in this situation. More sophisticated things, like an attempt to -+ remount the file-system read-only, can be implemented without much -+ difficulty. -+ -+ It is believed that: -+ -+ 1. instead of panicking, all current transactions can be aborted, rolling -+ the system back to a consistent state. -+ -+Umm, if you simply panic without doing anything more at all, then all current -+transactions are aborted and the system is rolled back to a consistent state, -+by virtue of the design of the transactional mechanism. Well, wait, let's be -+precise. If an internal node is corrupted on disk due to hardware failure, -+then there may be no consistent state that can be rolled back to, so instead -+we should say that it will roll back the transactions, which barring other -+factors means rolling back to a consistent state. -+ -+# Nikita: there is a subtle difference between panic and aborting -+# transactions: machine doesn't reboot. Processes aren't killed. Processes -+# not using reiser4 (not that we care about such processes), or using other -+# reiser4 mounts (about those we do care), will simply continue to run. With -+# some luck, even an application using the aborted file system can survive: -+# it will get some error, like EBADF, from each file descriptor on the failed -+# file system, but applications that do care about tolerance will cope with -+# this (squid will). -+ -+It would be a nice feature though to support rollback without rebooting -+followed by remount, but this can wait for later versions. -+ -+ 2. once isolated transactions are implemented it will be possible to -+ roll back the offending transaction. -+ -+2. adds code complexity of questionable value (it implies that a -+broken tree should be kept in operation), so we must think about it more -+before deciding if it should be done. -Hans -+ -+*/ -+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level -+ * where -+ * unrecoverable -+ * error -+ * occurred */ , -+ int ecode/* error code */) -+{ -+ assert("nikita-1230", doing != NULL); -+ assert("nikita-1231", ecode < 0); -+ -+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode); -+} -+ -+/** -+ * Add new root to the tree -+ * -+ * This function itself only manages changes in carry structures and delegates -+ * all hard work (allocation of znode for new root, changes of parent and -+ * sibling pointers) to the reiser4_add_tree_root(). -+ * -+ * Locking: old tree root is locked by carry at this point. Fake znode is also -+ * locked. -+ */ -+static int add_new_root(carry_level * level,/* carry level in context of which -+ * operation is performed */ -+ carry_node * node, /* carry node for existing root */ -+ znode * fake /* "fake" znode already locked by -+ * us */) -+{ -+ int result; -+ -+ assert("nikita-1104", level != NULL); -+ assert("nikita-1105", node != NULL); -+ -+ assert("nikita-1403", znode_is_write_locked(node->node)); -+ assert("nikita-1404", znode_is_write_locked(fake)); -+ -+ /* trying to create new root. 
*/ -+ /* @node is root and it's already locked by us. This -+ means that nobody else can be trying to add/remove -+ tree root right now. -+ */ -+ if (level->new_root == NULL) -+ level->new_root = reiser4_add_tree_root(node->node, fake); -+ if (!IS_ERR(level->new_root)) { -+ assert("nikita-1210", znode_is_root(level->new_root)); -+ node->deallocate = 1; -+ result = -+ longterm_lock_znode(&node->lock_handle, level->new_root, -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); -+ if (result == 0) -+ zput(level->new_root); -+ } else { -+ result = PTR_ERR(level->new_root); -+ level->new_root = NULL; -+ } -+ return result; -+} -+ -+/* allocate new znode and add the operation that inserts the -+ pointer to it into the parent node into the todo level -+ -+ Allocate new znode, add it into carry queue and post into @todo queue -+ request to add pointer to new node into its parent. -+ -+ This is a carry-related routine that calls reiser4_new_node() to allocate -+ the new node. -+*/ -+carry_node *add_new_znode(znode * brother /* existing left neighbor of new -+ * node */ , -+ carry_node * ref /* carry node after which new -+ * carry node is to be inserted -+ * into queue. This affects -+ * locking. */ , -+ carry_level * doing /* carry queue where new node is -+ * to be added */ , -+ carry_level * todo /* carry queue where COP_INSERT -+ * operation to add pointer to -+ * new node will be added */ ) -+{ -+ carry_node *fresh; -+ znode *new_znode; -+ carry_op *add_pointer; -+ carry_plugin_info info; -+ -+ assert("nikita-1048", brother != NULL); -+ assert("nikita-1049", todo != NULL); -+ -+ /* There are a lot of possible variations here: to what parent -+ new node will be attached and where. For simplicity, always -+ do the following: -+ -+ (1) new node and @brother will have the same parent. -+ -+ (2) new node is added on the right of @brother -+ -+ */ -+ -+ fresh = reiser4_add_carry_skip(doing, -+ ref ? POOLO_AFTER : POOLO_LAST, ref); -+ if (IS_ERR(fresh)) -+ return fresh; -+ -+ fresh->deallocate = 1; -+ fresh->free = 1; -+ -+ new_znode = reiser4_new_node(brother, znode_get_level(brother)); -+ if (IS_ERR(new_znode)) -+ /* @fresh will be deallocated automatically by error -+ handling code in the caller. */ -+ return (carry_node *) new_znode; -+ -+ /* new_znode returned znode with x_count 1. Caller has to decrease -+ it. make_space() does. */ -+ -+ ZF_SET(new_znode, JNODE_ORPHAN); -+ fresh->node = new_znode; -+ -+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) { -+ ref = carry_node_prev(ref); -+ assert("nikita-1606", !carry_node_end(doing, ref)); -+ } -+ -+ info.todo = todo; -+ info.doing = doing; -+ add_pointer = node_post_carry(&info, COP_INSERT, -+ reiser4_carry_real(ref), 1); -+ if (IS_ERR(add_pointer)) { -+ /* no need to deallocate @new_znode here: it will be -+ deallocated during carry error handling. */ -+ return (carry_node *) add_pointer; -+ } -+ -+ add_pointer->u.insert.type = COPT_CHILD; -+ add_pointer->u.insert.child = fresh; -+ add_pointer->u.insert.brother = brother; -+ /* initially the new node spans an empty key range */ -+ write_lock_dk(znode_get_tree(brother)); -+ znode_set_ld_key(new_znode, -+ znode_set_rd_key(new_znode, -+ znode_get_rd_key(brother))); -+ write_unlock_dk(znode_get_tree(brother)); -+ return fresh; -+} -+ -+/* DEBUGGING FUNCTIONS. -+ -+ Probably we should also leave them on even when -+ debugging is turned off, to print dumps at errors. 
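add_new_znode() above shows the two-queue discipline that carry uses throughout: work on the current level lives in @doing, and the follow-up that only the parent level can perform (inserting the pointer to the fresh child) is posted into @todo. A toy sketch of that split, illustration only, with invented toy_* names and a fixed-size queue standing in for the carry pool:

#include <stdio.h>

/* Illustration only: "do here, post the rest upward". */
enum toy_opcode { TOY_INSERT, TOY_DELETE };

struct toy_op { enum toy_opcode op; int target; };

struct toy_level { struct toy_op ops[8]; int n; };

static void toy_post(struct toy_level *todo, enum toy_opcode op, int target)
{
        if (todo->n < 8)
                todo->ops[todo->n++] = (struct toy_op){ op, target };
}

int main(void)
{
        struct toy_level todo = { .n = 0 };

        /* pretend node 7 was just allocated on the current level;
         * its parent must receive a pointer to it one level up */
        toy_post(&todo, TOY_INSERT, 7);
        printf("%d op(s) queued for the parent level\n", todo.n);
        return 0;
}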
-+*/ -+#if REISER4_DEBUG -+static int carry_level_invariant(carry_level * level, carry_queue_state state) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ if (level == NULL) -+ return 0; -+ -+ if (level->track_type != 0 && -+ level->track_type != CARRY_TRACK_NODE && -+ level->track_type != CARRY_TRACK_CHANGE) -+ return 0; -+ -+ /* check that nodes are in ascending order */ -+ for_all_nodes(level, node, tmp_node) { -+ znode *left; -+ znode *right; -+ -+ reiser4_key lkey; -+ reiser4_key rkey; -+ -+ if (node != carry_node_front(level)) { -+ if (state == CARRY_TODO) { -+ right = node->node; -+ left = carry_node_prev(node)->node; -+ } else { -+ right = reiser4_carry_real(node); -+ left = reiser4_carry_real(carry_node_prev(node)); -+ } -+ if (right == NULL || left == NULL) -+ continue; -+ if (node_is_empty(right) || node_is_empty(left)) -+ continue; -+ if (!keyle(leftmost_key_in_node(left, &lkey), -+ leftmost_key_in_node(right, &rkey))) { -+ warning("", "wrong key order"); -+ return 0; -+ } -+ } -+ } -+ return 1; -+} -+#endif -+ -+/* get symbolic name for boolean */ -+static const char *tf(int boolean/* truth value */) -+{ -+ return boolean ? "t" : "f"; -+} -+ -+/* symbolic name for carry operation */ -+static const char *carry_op_name(carry_opcode op/* carry opcode */) -+{ -+ switch (op) { -+ case COP_INSERT: -+ return "COP_INSERT"; -+ case COP_DELETE: -+ return "COP_DELETE"; -+ case COP_CUT: -+ return "COP_CUT"; -+ case COP_PASTE: -+ return "COP_PASTE"; -+ case COP_UPDATE: -+ return "COP_UPDATE"; -+ case COP_EXTENT: -+ return "COP_EXTENT"; -+ case COP_INSERT_FLOW: -+ return "COP_INSERT_FLOW"; -+ default:{ -+ /* not mt safe, but who cares? */ -+ static char buf[20]; -+ -+ sprintf(buf, "unknown op: %x", op); -+ return buf; -+ } -+ } -+} -+ -+/* dump information about carry node */ -+static void print_carry(const char *prefix /* prefix to print */ , -+ carry_node * node/* node to print */) -+{ -+ if (node == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ printk -+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n", -+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock), -+ tf(node->free), tf(node->deallocate)); -+} -+ -+/* dump information about carry operation */ -+static void print_op(const char *prefix /* prefix to print */ , -+ carry_op * op/* operation to print */) -+{ -+ if (op == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op)); -+ print_carry("\tnode", op->node); -+ switch (op->op) { -+ case COP_INSERT: -+ case COP_PASTE: -+ print_coord("\tcoord", -+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0); -+ reiser4_print_key("\tkey", -+ op->u.insert.d ? 
op->u.insert.d->key : NULL); -+ print_carry("\tchild", op->u.insert.child); -+ break; -+ case COP_DELETE: -+ print_carry("\tchild", op->u.delete.child); -+ break; -+ case COP_CUT: -+ if (op->u.cut_or_kill.is_cut) { -+ print_coord("\tfrom", -+ op->u.cut_or_kill.u.kill->params.from, 0); -+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to, -+ 0); -+ } else { -+ print_coord("\tfrom", -+ op->u.cut_or_kill.u.cut->params.from, 0); -+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to, -+ 0); -+ } -+ break; -+ case COP_UPDATE: -+ print_carry("\tleft", op->u.update.left); -+ break; -+ default: -+ /* do nothing */ -+ break; -+ } -+} -+ -+/* dump information about all nodes and operations in a @level */ -+static void print_level(const char *prefix /* prefix to print */ , -+ carry_level * level/* level to print */) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ if (level == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ printk("%s: %p, restartable: %s\n", -+ prefix, level, tf(level->restartable)); -+ -+ for_all_nodes(level, node, tmp_node) -+ print_carry("\tcarry node", node); -+ for_all_ops(level, op, tmp_op) -+ print_op("\tcarry op", op); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/carry.h linux-2.6.30/fs/reiser4/carry.h ---- linux-2.6.30.orig/fs/reiser4/carry.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/carry.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,445 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* Functions and data types to "carry" tree modification(s) upward. -+ See fs/reiser4/carry.c for details. */ -+ -+#if !defined(__FS_REISER4_CARRY_H__) -+#define __FS_REISER4_CARRY_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "pool.h" -+#include "znode.h" -+ -+#include <linux/types.h> -+ -+/* &carry_node - "location" of carry node. -+ -+ "location" of node that is involved or going to be involved into -+ carry process. Node where operation will be carried to on the -+ parent level cannot be recorded explicitly. Operation will be carried -+ usually to the parent of some node (where changes are performed at -+ the current level) or, to the left neighbor of its parent. But while -+ modifications are performed at the current level, parent may -+ change. So, we have to allow some indirection (or, positevly, -+ flexibility) in locating carry nodes. -+ -+*/ -+typedef struct carry_node { -+ /* pool linkage */ -+ struct reiser4_pool_header header; -+ -+ /* base node from which real_node is calculated. See -+ fs/reiser4/carry.c:lock_carry_node(). 
*/ -+ znode *node; -+ -+ /* how to get ->real_node */ -+ /* to get ->real_node obtain parent of ->node */ -+ __u32 parent:1; -+ /* to get ->real_node obtain left neighbor of parent of -+ ->node */ -+ __u32 left:1; -+ __u32 left_before:1; -+ -+ /* locking */ -+ -+ /* this node was locked by carry process and should be -+ unlocked when carry leaves a level */ -+ __u32 unlock:1; -+ -+ /* disk block for this node was allocated by carry process and -+ should be deallocated when carry leaves a level */ -+ __u32 deallocate:1; -+ /* this carry node was allocated by carry process and should be -+ freed when carry leaves a level */ -+ __u32 free:1; -+ -+ /* type of lock we want to take on this node */ -+ lock_handle lock_handle; -+} carry_node; -+ -+/* &carry_opcode - elementary operations that can be carried upward -+ -+ Operations that carry() can handle. This list is supposed to be -+ expanded. -+ -+ Each carry operation (cop) is handled by appropriate function defined -+ in fs/reiser4/carry.c. For example COP_INSERT is handled by -+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn -+ call plugins of nodes affected by operation to modify nodes' content -+ and to gather operations to be performed on the next level. -+ -+*/ -+typedef enum { -+ /* insert new item into node. */ -+ COP_INSERT, -+ /* delete pointer from parent node */ -+ COP_DELETE, -+ /* remove part of or whole node. */ -+ COP_CUT, -+ /* increase size of item. */ -+ COP_PASTE, -+ /* insert extent (that is sequence of unformatted nodes). */ -+ COP_EXTENT, -+ /* update delimiting key in least common ancestor of two -+ nodes. This is performed when items are moved between two -+ nodes. -+ */ -+ COP_UPDATE, -+ /* insert flow */ -+ COP_INSERT_FLOW, -+ COP_LAST_OP, -+} carry_opcode; -+ -+#define CARRY_FLOW_NEW_NODES_LIMIT 20 -+ -+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target -+ item is determined. */ -+typedef enum { -+ /* target item is one containing pointer to the ->child node */ -+ COPT_CHILD, -+ /* target item is given explicitly by @coord */ -+ COPT_ITEM_DATA, -+ /* target item is given by key */ -+ COPT_KEY, -+ /* see insert_paste_common() for more comments on this. */ -+ COPT_PASTE_RESTARTED, -+} cop_insert_pos_type; -+ -+/* flags to cut and delete */ -+typedef enum { -+ /* don't kill node even if it became completely empty as results of -+ * cut. This is needed for eottl handling. See carry_extent() for -+ * details. */ -+ DELETE_RETAIN_EMPTY = (1 << 0) -+} cop_delete_flag; -+ -+/* -+ * carry() implements "lock handle tracking" feature. -+ * -+ * Callers supply carry with node where to perform initial operation and lock -+ * handle on this node. Trying to optimize node utilization carry may actually -+ * move insertion point to different node. Callers expect that lock handle -+ * will rebe transferred to the new node also. -+ * -+ */ -+typedef enum { -+ /* transfer lock handle along with insertion point */ -+ CARRY_TRACK_CHANGE = 1, -+ /* acquire new lock handle to the node where insertion point is. This -+ * is used when carry() client doesn't initially possess lock handle -+ * on the insertion point node, for example, by extent insertion -+ * code. See carry_extent(). 
*/ -+ CARRY_TRACK_NODE = 2 -+} carry_track_type; -+ -+/* data supplied to COP_{INSERT|PASTE} by callers */ -+typedef struct carry_insert_data { -+ /* position where new item is to be inserted */ -+ coord_t *coord; -+ /* new item description */ -+ reiser4_item_data * data; -+ /* key of new item */ -+ const reiser4_key * key; -+} carry_insert_data; -+ -+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the -+ below structure of parameters */ -+struct cut_kill_params { -+ /* coord where cut starts (inclusive) */ -+ coord_t *from; -+ /* coord where cut stops (inclusive, this item/unit will also be -+ * cut) */ -+ coord_t *to; -+ /* starting key. This is necessary when item and unit pos don't -+ * uniquely identify what portion of tree to remove. For example, this -+ * indicates what portion of extent unit will be affected. */ -+ const reiser4_key * from_key; -+ /* exclusive stop key */ -+ const reiser4_key * to_key; -+ /* if this is not NULL, smallest actually removed key is stored -+ * here. */ -+ reiser4_key *smallest_removed; -+ /* kill_node_content() is called for file truncate */ -+ int truncate; -+}; -+ -+struct carry_cut_data { -+ struct cut_kill_params params; -+}; -+ -+struct carry_kill_data { -+ struct cut_kill_params params; -+ /* parameter to be passed to the ->kill_hook() method of item -+ * plugin */ -+ /*void *iplug_params; *//* FIXME: unused currently */ -+ /* if not NULL---inode whose items are being removed. This is needed -+ * for ->kill_hook() of extent item to update VM structures when -+ * removing pages. */ -+ struct inode *inode; -+ /* sibling list maintenance is complicated by existence of eottl. When -+ * eottl whose left and right neighbors are formatted leaves is -+ * removed, one has to connect said leaves in the sibling list. This -+ * cannot be done when extent removal is just started as locking rules -+ * require sibling list update to happen atomically with removal of -+ * extent item. Therefore: 1. pointers to left and right neighbors -+ * have to be passed down to the ->kill_hook() of extent item, and -+ * 2. said neighbors have to be locked. */ -+ lock_handle *left; -+ lock_handle *right; -+ /* flags modifying behavior of kill. Currently, it may have -+ DELETE_RETAIN_EMPTY set. */ -+ unsigned flags; -+ char *buf; -+}; -+ -+/* &carry_tree_op - operation to "carry" upward. -+ -+ Description of an operation we want to "carry" to the upper level of -+ a tree: e.g., when we insert something and there is not enough space -+ we allocate a new node and "carry" the operation of inserting a -+ pointer to the new node to the upper level; on removal of an empty -+ node, we carry up the operation of removing the appropriate entry -+ from its parent. -+ -+ There are two types of carry ops: when adding or deleting a node, the -+ node at the parent level where the appropriate modification has to be -+ performed is known in advance. When shifting items between nodes -+ (split, merge), the delimiting key should be changed in the least -+ common parent of the nodes involved, which is not known in advance. -+ -+ For the operations of the first type we store in &carry_op a pointer -+ to the &carry_node at the parent level. For the operations of the -+ second type we store &carry_node for the parents of the left and -+ right nodes modified and keep track of them upward until they -+ coincide. 
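The carry_op structure that follows keeps the per-opcode parameters in a single union keyed by ->op, so the carry pool can hand out fixed-size objects no matter which operation they describe. A generic C sketch of that layout, illustration only, with invented toy_* names:

/* Illustration only: fixed-size op objects with a union keyed by the
 * opcode, the shape carry_op uses below. */
enum toy_cop { TOY_COP_INSERT, TOY_COP_CUT };

struct toy_carry_op {
        enum toy_cop op;
        union {
                struct { int where, length; } insert;
                struct { int from, to; } cut;
        } u;
};

static int toy_op_span(const struct toy_carry_op *op)
{
        switch (op->op) {
        case TOY_COP_INSERT:
                return op->u.insert.length;
        case TOY_COP_CUT:
                return op->u.cut.to - op->u.cut.from;
        }
        return 0;
}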
-+ -+*/ -+typedef struct carry_op { -+ /* pool linkage */ -+ struct reiser4_pool_header header; -+ carry_opcode op; -+ /* node on which operation is to be performed: -+ -+ for insert, paste: node where new item is to be inserted -+ -+ for delete: node where pointer is to be deleted -+ -+ for cut: node to cut from -+ -+ for update: node where delimiting key is to be modified -+ -+ for modify: parent of modified node -+ -+ */ -+ carry_node *node; -+ union { -+ struct { -+ /* (sub-)type of insertion/paste. Taken from -+ cop_insert_pos_type. */ -+ __u8 type; -+ /* various operation flags. Taken from -+ cop_insert_flag. */ -+ __u8 flags; -+ carry_insert_data *d; -+ carry_node *child; -+ znode *brother; -+ } insert, paste, extent; -+ -+ struct { -+ int is_cut; -+ union { -+ carry_kill_data *kill; -+ carry_cut_data *cut; -+ } u; -+ } cut_or_kill; -+ -+ struct { -+ carry_node *left; -+ } update; -+ struct { -+ /* changed child */ -+ carry_node *child; -+ /* bitmask of changes. See &cop_modify_flag */ -+ __u32 flag; -+ } modify; -+ struct { -+ /* flags to deletion operation. Are taken from -+ cop_delete_flag */ -+ __u32 flags; -+ /* child to delete from parent. If this is -+ NULL, delete op->node. */ -+ carry_node *child; -+ } delete; -+ struct { -+ /* various operation flags. Taken from -+ cop_insert_flag. */ -+ __u32 flags; -+ flow_t *flow; -+ coord_t *insert_point; -+ reiser4_item_data *data; -+ /* flow insertion is limited by number of new blocks -+ added in that operation which do not get any data -+ but part of flow. This limit is set by macro -+ CARRY_FLOW_NEW_NODES_LIMIT. This field stores number -+ of nodes added already during one carry_flow */ -+ int new_nodes; -+ } insert_flow; -+ } u; -+} carry_op; -+ -+/* &carry_op_pool - preallocated pool of carry operations and nodes */ -+typedef struct carry_pool { -+ carry_op op[CARRIES_POOL_SIZE]; -+ struct reiser4_pool op_pool; -+ carry_node node[NODES_LOCKED_POOL_SIZE]; -+ struct reiser4_pool node_pool; -+} carry_pool; -+ -+/* &carry_tree_level - carry process on given level -+ -+ Description of balancing process on the given level. -+ -+ No need for locking here, as carry_tree_level is essentially a -+ per-thread thing (for now). -+ -+*/ -+struct carry_level { -+ /* this level may be restarted */ -+ __u32 restartable:1; -+ /* list of carry nodes on this level, ordered by key order */ -+ struct list_head nodes; -+ struct list_head ops; -+ /* pool where new objects are allocated from */ -+ carry_pool *pool; -+ int ops_num; -+ int nodes_num; -+ /* new root created on this level, if any */ -+ znode *new_root; -+ /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.) -+ when they want ->tracked to automagically wander to the node where -+ insertion point moved after insert or paste. -+ */ -+ carry_track_type track_type; -+ /* lock handle supplied by user that we are tracking. See -+ above. 
*/ -+ lock_handle *tracked; -+}; -+ -+/* information carry passes to plugin methods that may add new operations to -+ the @todo queue */ -+struct carry_plugin_info { -+ carry_level *doing; -+ carry_level *todo; -+}; -+ -+int reiser4_carry(carry_level * doing, carry_level * done); -+ -+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order, -+ carry_node * reference); -+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order, -+ carry_node * reference); -+ -+extern carry_node *insert_carry_node(carry_level * doing, -+ carry_level * todo, const znode * node); -+ -+extern carry_pool *init_carry_pool(int); -+extern void done_carry_pool(carry_pool * pool); -+ -+extern void init_carry_level(carry_level * level, carry_pool * pool); -+ -+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op, -+ znode * node, int apply_to_parent); -+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, -+ znode * node, int apply_to_parent_p); -+ -+carry_node *add_new_znode(znode * brother, carry_node * reference, -+ carry_level * doing, carry_level * todo); -+ -+carry_node *find_carry_node(carry_level * level, const znode * node); -+ -+extern znode *reiser4_carry_real(const carry_node * node); -+ -+/* helper macros to iterate over carry queues */ -+ -+#define carry_node_next(node) \ -+ list_entry((node)->header.level_linkage.next, carry_node, \ -+ header.level_linkage) -+ -+#define carry_node_prev(node) \ -+ list_entry((node)->header.level_linkage.prev, carry_node, \ -+ header.level_linkage) -+ -+#define carry_node_front(level) \ -+ list_entry((level)->nodes.next, carry_node, header.level_linkage) -+ -+#define carry_node_back(level) \ -+ list_entry((level)->nodes.prev, carry_node, header.level_linkage) -+ -+#define carry_node_end(level, node) \ -+ (&(level)->nodes == &(node)->header.level_linkage) -+ -+/* macro to iterate over all operations in a @level */ -+#define for_all_ops(level /* carry level (of type carry_level *) */, \ -+ op /* pointer to carry operation, modified by loop (of \ -+ * type carry_op *) */, \ -+ tmp /* pointer to carry operation (of type carry_op *), \ -+ * used to make iterator stable in the face of \ -+ * deletions from the level */ ) \ -+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \ -+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \ -+ &op->header.level_linkage != &level->ops; \ -+ op = tmp, \ -+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage)) -+ -+#if 0 -+for (op = (carry_op *) pool_level_list_front(&level->ops), \ -+ tmp = (carry_op *) pool_level_list_next(&op->header) ; \ -+ !pool_level_list_end(&level->ops, &op->header) ; \ -+ op = tmp, tmp = (carry_op *) pool_level_list_next(&op->header)) -+#endif -+ -+/* macro to iterate over all nodes in a @level */ -+#define for_all_nodes(level /* carry level (of type carry_level *) */, \ -+ node /* pointer to carry node, modified by loop (of \ -+ * type carry_node *) */, \ -+ tmp /* pointer to carry node (of type carry_node *), \ -+ * used to make iterator stable in the face of \ -+ * deletions from the level */ ) \ -+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \ -+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \ -+ &node->header.level_linkage != &level->nodes; \ -+ node = tmp, \ -+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage)) -+ -+#if 0 -+for (node = 
carry_node_front(level), \ -+ tmp = carry_node_next(node) ; !carry_node_end(level, node) ; \ -+ node = tmp, tmp = carry_node_next(node)) -+#endif -+ -+/* macro to iterate over all nodes in a @level in reverse order -+ -+ This is used because nodes are unlocked in reverse order of locking */ -+#define for_all_nodes_back(level /* carry level (of type carry_level *) */, \ -+ node /* pointer to carry node, modified by loop \ -+ * (of type carry_node *) */, \ -+ tmp /* pointer to carry node (of type carry_node \ -+ * *), used to make iterator stable in the \ -+ * face of deletions from the level */ ) \ -+for (node = carry_node_back(level), \ -+ tmp = carry_node_prev(node) ; !carry_node_end(level, node) ; \ -+ node = tmp, tmp = carry_node_prev(node)) -+ -+/* __FS_REISER4_CARRY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/carry_ops.c linux-2.6.30/fs/reiser4/carry_ops.c ---- linux-2.6.30.orig/fs/reiser4/carry_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/carry_ops.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,2132 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* implementation of carry operations */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "pool.h" -+#include "tree_mod.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "tree.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include <linux/types.h> -+#include <linux/err.h> -+ -+static int carry_shift_data(sideof side, coord_t *insert_coord, znode * node, -+ carry_level * doing, carry_level * todo, -+ unsigned int including_insert_coord_p); -+ -+extern int lock_carry_node(carry_level * level, carry_node * node); -+extern int lock_carry_node_tail(carry_node * node); -+ -+/* find left neighbor of a carry node -+ -+ Look for the left neighbor of the node @op operates on and add it to the -+ @doing queue. See comments in the body. -+ -+*/ -+static carry_node *find_left_neighbor(carry_op * op /* operation to find the -+ * left neighbor for */ , -+ carry_level * doing/* level to scan */) -+{ -+ int result; -+ carry_node *node; -+ carry_node *left; -+ int flags; -+ reiser4_tree *tree; -+ -+ node = op->node; -+ -+ tree = current_tree; -+ read_lock_tree(tree); -+ /* first, check whether left neighbor is already in a @doing queue */ -+ if (reiser4_carry_real(node)->left != NULL) { -+ /* NOTE: there is locking subtlety here. Look into -+ * find_right_neighbor() for more info */ -+ if (find_carry_node(doing, -+ reiser4_carry_real(node)->left) != NULL) { -+ read_unlock_tree(tree); -+ left = node; -+ do { -+ left = list_entry(left->header.level_linkage.prev, -+ carry_node, header.level_linkage); -+ assert("nikita-3408", !carry_node_end(doing, -+ left)); -+ } while (reiser4_carry_real(left) == -+ reiser4_carry_real(node)); -+ return left; -+ } -+ } -+ read_unlock_tree(tree); -+ -+ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node); -+ if (IS_ERR(left)) -+ return left; -+ -+ left->node = node->node; -+ left->free = 1; -+ -+ flags = GN_TRY_LOCK; -+ if (!op->u.insert.flags & COPI_LOAD_LEFT) -+ flags |= GN_NO_ALLOC; -+ -+ /* then, feeling lucky, peek at the left neighbor in the cache. 
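A note on the flag test a few lines up: `!op->u.insert.flags & COPI_LOAD_LEFT` parses as `(!op->u.insert.flags) & COPI_LOAD_LEFT`, because `!` binds tighter than `&`. Unless COPI_LOAD_LEFT happens to be bit 0, that expression is always zero, so GN_NO_ALLOC would never be added; the same pattern recurs in find_right_neighbor() below. The patch is reproduced verbatim above; this is only a reviewer-side demonstration, and 0x4 is an assumed stand-in since COPI_LOAD_LEFT's real value is not visible in this hunk:

#include <stdio.h>

#define COPI_LOAD_LEFT 0x4      /* assumed value, for illustration only */

int main(void)
{
        unsigned flags = 0;     /* flag not set: the test *should* fire */

        /* '!' binds tighter than '&': (!flags) & 0x4 == 1 & 0x4 == 0 */
        printf("as written: %d\n", !flags & COPI_LOAD_LEFT);   /* 0 */
        /* the presumably intended test */
        printf("intended:   %d\n", !(flags & COPI_LOAD_LEFT)); /* 1 */
        return 0;
}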
*/ -+ result = reiser4_get_left_neighbor(&left->lock_handle, -+ reiser4_carry_real(node), -+ ZNODE_WRITE_LOCK, flags); -+ if (result == 0) { -+ /* ok, node found and locked. */ -+ result = lock_carry_node_tail(left); -+ if (result != 0) -+ left = ERR_PTR(result); -+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) { -+ /* node is leftmost node in a tree, or neighbor wasn't in -+ cache, or there is an extent on the left. */ -+ reiser4_pool_free(&doing->pool->node_pool, &left->header); -+ left = NULL; -+ } else if (doing->restartable) { -+ /* if left neighbor is locked, and level is restartable, add -+ new node to @doing and restart. */ -+ assert("nikita-913", node->parent != 0); -+ assert("nikita-914", node->node != NULL); -+ left->left = 1; -+ left->free = 0; -+ left = ERR_PTR(-E_REPEAT); -+ } else { -+ /* left neighbor is locked, level cannot be restarted. Just -+ ignore left neighbor. */ -+ reiser4_pool_free(&doing->pool->node_pool, &left->header); -+ left = NULL; -+ } -+ return left; -+} -+ -+/* find right neighbor of a carry node -+ -+ Look for right neighbor of @node and add it to the @doing queue. See -+ comments in the body. -+ -+*/ -+static carry_node *find_right_neighbor(carry_op * op /* node to find right -+ * neighbor of */ , -+ carry_level * doing/* level to scan */) -+{ -+ int result; -+ carry_node *node; -+ carry_node *right; -+ lock_handle lh; -+ int flags; -+ reiser4_tree *tree; -+ -+ init_lh(&lh); -+ -+ node = op->node; -+ -+ tree = current_tree; -+ read_lock_tree(tree); -+ /* first, check whether right neighbor is already in a @doing queue */ -+ if (reiser4_carry_real(node)->right != NULL) { -+ /* -+ * Tree lock is taken here anyway, because, even if _outcome_ -+ * of (find_carry_node() != NULL) doesn't depends on -+ * concurrent updates to ->right, find_carry_node() cannot -+ * work with second argument NULL. Hence, following comment is -+ * of historic importance only. -+ * -+ * Subtle: -+ * -+ * Q: why don't we need tree lock here, looking for the right -+ * neighbor? -+ * -+ * A: even if value of node->real_node->right were changed -+ * during find_carry_node() execution, outcome of execution -+ * wouldn't change, because (in short) other thread cannot add -+ * elements to the @doing, and if node->real_node->right -+ * already was in @doing, value of node->real_node->right -+ * couldn't change, because node cannot be inserted between -+ * locked neighbors. -+ */ -+ if (find_carry_node(doing, -+ reiser4_carry_real(node)->right) != NULL) { -+ read_unlock_tree(tree); -+ /* -+ * What we are doing here (this is also applicable to -+ * the find_left_neighbor()). -+ * -+ * tree_walk.c code requires that insertion of a -+ * pointer to a child, modification of parent pointer -+ * in the child, and insertion of the child into -+ * sibling list are atomic (see -+ * plugin/item/internal.c:create_hook_internal()). -+ * -+ * carry allocates new node long before pointer to it -+ * is inserted into parent and, actually, long before -+ * parent is even known. Such allocated-but-orphaned -+ * nodes are only trackable through carry level lists. -+ * -+ * Situation that is handled here is following: @node -+ * has valid ->right pointer, but there is -+ * allocated-but-orphaned node in the carry queue that -+ * is logically between @node and @node->right. Here -+ * we are searching for it. 
Critical point is that -+ * this is only possible if @node->right is also in -+ * the carry queue (this is checked above), because -+ * this is the only way new orphaned node could be -+ * inserted between them (before inserting new node, -+ * make_space() first tries to shift to the right, so, -+ * right neighbor will be locked and queued). -+ * -+ */ -+ right = node; -+ do { -+ right = list_entry(right->header.level_linkage.next, -+ carry_node, header.level_linkage); -+ assert("nikita-3408", !carry_node_end(doing, -+ right)); -+ } while (reiser4_carry_real(right) == -+ reiser4_carry_real(node)); -+ return right; -+ } -+ } -+ read_unlock_tree(tree); -+ -+ flags = GN_CAN_USE_UPPER_LEVELS; -+ if (!op->u.insert.flags & COPI_LOAD_RIGHT) -+ flags = GN_NO_ALLOC; -+ -+ /* then, try to lock right neighbor */ -+ init_lh(&lh); -+ result = reiser4_get_right_neighbor(&lh, -+ reiser4_carry_real(node), -+ ZNODE_WRITE_LOCK, flags); -+ if (result == 0) { -+ /* ok, node found and locked. */ -+ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node); -+ if (!IS_ERR(right)) { -+ right->node = lh.node; -+ move_lh(&right->lock_handle, &lh); -+ right->free = 1; -+ result = lock_carry_node_tail(right); -+ if (result != 0) -+ right = ERR_PTR(result); -+ } -+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) { -+ /* node is rightmost node in a tree, or neighbor wasn't in -+ cache, or there is an extent on the right. */ -+ right = NULL; -+ } else -+ right = ERR_PTR(result); -+ done_lh(&lh); -+ return right; -+} -+ -+/* how much free space in a @node is needed for @op -+ -+ How much space in @node is required for completion of @op, where @op is -+ insert or paste operation. -+*/ -+static unsigned int space_needed_for_op(znode * node /* znode data are -+ * inserted or -+ * pasted in */ , -+ carry_op * op /* carry -+ operation */ ) -+{ -+ assert("nikita-919", op != NULL); -+ -+ switch (op->op) { -+ default: -+ impossible("nikita-1701", "Wrong opcode"); -+ case COP_INSERT: -+ return space_needed(node, NULL, op->u.insert.d->data, 1); -+ case COP_PASTE: -+ return space_needed(node, op->u.insert.d->coord, -+ op->u.insert.d->data, 0); -+ } -+} -+ -+/* how much space in @node is required to insert or paste @data at -+ @coord. */ -+unsigned int space_needed(const znode * node /* node data are inserted or -+ * pasted in */ , -+ const coord_t *coord /* coord where data are -+ * inserted or pasted -+ * at */ , -+ const reiser4_item_data * data /* data to insert or -+ * paste */ , -+ int insertion/* non-0 is inserting, 0---paste */) -+{ -+ int result; -+ item_plugin *iplug; -+ -+ assert("nikita-917", node != NULL); -+ assert("nikita-918", node_plugin_by_node(node) != NULL); -+ assert("vs-230", !insertion || (coord == NULL)); -+ -+ result = 0; -+ iplug = data->iplug; -+ if (iplug->b.estimate != NULL) { -+ /* ask item plugin how much space is needed to insert this -+ item */ -+ result += iplug->b.estimate(insertion ? NULL : coord, data); -+ } else { -+ /* reasonable default */ -+ result += data->length; -+ } -+ if (insertion) { -+ node_plugin *nplug; -+ -+ nplug = node->nplug; -+ /* and add node overhead */ -+ if (nplug->item_overhead != NULL) -+ result += nplug->item_overhead(node, NULL); -+ } -+ return result; -+} -+ -+/* find &coord in parent where pointer to new child is to be stored. 
*/ -+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to -+ * insert pointer to new -+ * child */ ) -+{ -+ int result; -+ znode *node; -+ znode *child; -+ -+ assert("nikita-941", op != NULL); -+ assert("nikita-942", op->op == COP_INSERT); -+ -+ node = reiser4_carry_real(op->node); -+ assert("nikita-943", node != NULL); -+ assert("nikita-944", node_plugin_by_node(node) != NULL); -+ -+ child = reiser4_carry_real(op->u.insert.child); -+ result = -+ find_new_child_ptr(node, child, op->u.insert.brother, -+ op->u.insert.d->coord); -+ -+ build_child_ptr_data(child, op->u.insert.d->data); -+ return result; -+} -+ -+/* additional amount of free space in @node required to complete @op */ -+static int free_space_shortage(znode * node /* node to check */ , -+ carry_op * op/* operation being performed */) -+{ -+ assert("nikita-1061", node != NULL); -+ assert("nikita-1062", op != NULL); -+ -+ switch (op->op) { -+ default: -+ impossible("nikita-1702", "Wrong opcode"); -+ case COP_INSERT: -+ case COP_PASTE: -+ return space_needed_for_op(node, op) - znode_free_space(node); -+ case COP_EXTENT: -+ /* when inserting extent shift data around until insertion -+ point is utmost in the node. */ -+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE) -+ return +1; -+ else -+ return -1; -+ } -+} -+ -+/* helper function: update node pointer in operation after insertion -+ point was probably shifted into @target. */ -+static znode *sync_op(carry_op * op, carry_node * target) -+{ -+ znode *insertion_node; -+ -+ /* reget node from coord: shift might move insertion coord to -+ the neighbor */ -+ insertion_node = op->u.insert.d->coord->node; -+ /* if insertion point was actually moved into new node, -+ update carry node pointer in operation. */ -+ if (insertion_node != reiser4_carry_real(op->node)) { -+ op->node = target; -+ assert("nikita-2540", -+ reiser4_carry_real(target) == insertion_node); -+ } -+ assert("nikita-2541", -+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node); -+ return insertion_node; -+} -+ -+/* -+ * complete make_space() call: update tracked lock handle if necessary. See -+ * comments for fs/reiser4/carry.h:carry_track_type -+ */ -+static int -+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node) -+{ -+ int result; -+ carry_track_type tracking; -+ znode *node; -+ -+ tracking = doing->track_type; -+ node = op->u.insert.d->coord->node; -+ -+ if (tracking == CARRY_TRACK_NODE || -+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) { -+ /* inserting or pasting into node different from -+ original. Update lock handle supplied by caller. */ -+ assert("nikita-1417", doing->tracked != NULL); -+ done_lh(doing->tracked); -+ init_lh(doing->tracked); -+ result = longterm_lock_znode(doing->tracked, node, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI); -+ } else -+ result = 0; -+ return result; -+} -+ -+/* This is insertion policy function. It shifts data to the left and right -+ neighbors of insertion coord and allocates new nodes until there is enough -+ free space to complete @op. -+ -+ See comments in the body. -+ -+ Assumes that the node format favors insertions at the right end of the node -+ as node40 does. 
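free_space_shortage() above fixes the sign convention the whole insertion policy runs on: bytes needed minus bytes available, where a positive result means the node is short by that much. A tiny sketch of that convention and its caller pattern, illustration only, with an invented toy_node in place of znode:

/* Illustration only: positive result means the node is short that
 * many bytes, the convention free_space_shortage() uses. */
struct toy_node { unsigned free_space; };

static int toy_shortage(const struct toy_node *node, unsigned needed)
{
        return (int)needed - (int)node->free_space;
}

/* Caller pattern, as in make_space(): shift left, re-check the
 * shortage, shift right, re-check, then allocate fresh nodes while
 * the shortage is still positive. */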
-+ -+ See carry_flow() on detail about flow insertion -+*/ -+static int make_space(carry_op * op /* carry operation, insert or paste */ , -+ carry_level * doing /* current carry queue */ , -+ carry_level * todo/* carry queue on the parent level */) -+{ -+ znode *node; -+ int result; -+ int not_enough_space; -+ int blk_alloc; -+ znode *orig_node; -+ __u32 flags; -+ -+ coord_t *coord; -+ -+ assert("nikita-890", op != NULL); -+ assert("nikita-891", todo != NULL); -+ assert("nikita-892", -+ op->op == COP_INSERT || -+ op->op == COP_PASTE || op->op == COP_EXTENT); -+ assert("nikita-1607", -+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node); -+ -+ flags = op->u.insert.flags; -+ -+ /* NOTE check that new node can only be allocated after checking left -+ * and right neighbors. This is necessary for proper work of -+ * find_{left,right}_neighbor(). */ -+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE, -+ flags & COPI_DONT_SHIFT_LEFT)); -+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE, -+ flags & COPI_DONT_SHIFT_RIGHT)); -+ -+ coord = op->u.insert.d->coord; -+ orig_node = node = coord->node; -+ -+ assert("nikita-908", node != NULL); -+ assert("nikita-909", node_plugin_by_node(node) != NULL); -+ -+ result = 0; -+ /* If there is not enough space in a node, try to shift something to -+ the left neighbor. This is a bit tricky, as locking to the left is -+ low priority. This is handled by restart logic in carry(). -+ */ -+ not_enough_space = free_space_shortage(node, op); -+ if (not_enough_space <= 0) -+ /* it is possible that carry was called when there actually -+ was enough space in the node. For example, when inserting -+ leftmost item so that delimiting keys have to be updated. -+ */ -+ return make_space_tail(op, doing, orig_node); -+ if (!(flags & COPI_DONT_SHIFT_LEFT)) { -+ carry_node *left; -+ /* make note in statistics of an attempt to move -+ something into the left neighbor */ -+ left = find_left_neighbor(op, doing); -+ if (unlikely(IS_ERR(left))) { -+ if (PTR_ERR(left) == -E_REPEAT) -+ return -E_REPEAT; -+ else { -+ /* some error other than restart request -+ occurred. This shouldn't happen. Issue a -+ warning and continue as if left neighbor -+ weren't existing. -+ */ -+ warning("nikita-924", -+ "Error accessing left neighbor: %li", -+ PTR_ERR(left)); -+ } -+ } else if (left != NULL) { -+ -+ /* shift everything possible on the left of and -+ including insertion coord into the left neighbor */ -+ result = carry_shift_data(LEFT_SIDE, coord, -+ reiser4_carry_real(left), -+ doing, todo, -+ flags & COPI_GO_LEFT); -+ -+ /* reget node from coord: shift_left() might move -+ insertion coord to the left neighbor */ -+ node = sync_op(op, left); -+ -+ not_enough_space = free_space_shortage(node, op); -+ /* There is not enough free space in @node, but -+ may be, there is enough free space in -+ @left. Various balancing decisions are valid here. -+ The same for the shifiting to the right. -+ */ -+ } -+ } -+ /* If there still is not enough space, shift to the right */ -+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) { -+ carry_node *right; -+ -+ right = find_right_neighbor(op, doing); -+ if (IS_ERR(right)) { -+ warning("nikita-1065", -+ "Error accessing right neighbor: %li", -+ PTR_ERR(right)); -+ } else if (right != NULL) { -+ /* node containing insertion point, and its right -+ neighbor node are write locked by now. 
-+ -+ shift everything possible on the right of but -+ excluding insertion coord into the right neighbor -+ */ -+ result = carry_shift_data(RIGHT_SIDE, coord, -+ reiser4_carry_real(right), -+ doing, todo, -+ flags & COPI_GO_RIGHT); -+ /* reget node from coord: shift_right() might move -+ insertion coord to the right neighbor */ -+ node = sync_op(op, right); -+ not_enough_space = free_space_shortage(node, op); -+ } -+ } -+ /* If there is still not enough space, allocate new node(s). -+ -+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in -+ the carry operation flags (currently this is needed during flush -+ only). -+ */ -+ for (blk_alloc = 0; -+ not_enough_space > 0 && result == 0 && blk_alloc < 2 && -+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) { -+ carry_node *fresh; /* new node we are allocating */ -+ coord_t coord_shadow; /* remembered insertion point before -+ * shifting data into new node */ -+ carry_node *node_shadow; /* remembered insertion node -+ * before shifting */ -+ unsigned int gointo; /* whether insertion point should move -+ * into newly allocated node */ -+ -+ /* allocate new node on the right of @node. Znode and disk -+ fake block number for new node are allocated. -+ -+ add_new_znode() posts carry operation COP_INSERT with -+ COPT_CHILD option to the parent level to add -+ pointer to newly created node to its parent. -+ -+ Subtle point: if several new nodes are required to complete -+ insertion operation at this level, they will be inserted -+ into their parents in the order of creation, which means -+ that @node will be valid "cookie" at the time of insertion. -+ -+ */ -+ fresh = add_new_znode(node, op->node, doing, todo); -+ if (IS_ERR(fresh)) -+ return PTR_ERR(fresh); -+ -+ /* Try to shift into new node. */ -+ result = lock_carry_node(doing, fresh); -+ zput(reiser4_carry_real(fresh)); -+ if (result != 0) { -+ warning("nikita-947", -+ "Cannot lock new node: %i", result); -+ return result; -+ } -+ -+ /* both nodes are write locked by now. -+ -+ shift everything possible on the right of and -+ including insertion coord into the right neighbor. -+ */ -+ coord_dup(&coord_shadow, op->u.insert.d->coord); -+ node_shadow = op->node; -+ /* move insertion point into newly created node if: -+ -+ . insertion point is rightmost in the source node, or -+ . this is not the first node we are allocating in a row. -+ */ -+ gointo = -+ (blk_alloc > 0) || -+ coord_is_after_rightmost(op->u.insert.d->coord); -+ -+ if (gointo && -+ op->op == COP_PASTE && -+ coord_is_existing_item(op->u.insert.d->coord) && -+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) { -+ /* paste into solid (atomic) item, which can contain -+ only one unit, so we need to shift it right, where -+ insertion point supposed to be */ -+ -+ assert("edward-1444", op->u.insert.d->data->iplug == -+ item_plugin_by_id(STATIC_STAT_DATA_ID)); -+ assert("edward-1445", -+ op->u.insert.d->data->length > -+ node_plugin_by_node(coord->node)->free_space -+ (coord->node)); -+ -+ op->u.insert.d->coord->between = BEFORE_UNIT; -+ } -+ -+ result = carry_shift_data(RIGHT_SIDE, coord, -+ reiser4_carry_real(fresh), -+ doing, todo, gointo); -+ /* if insertion point was actually moved into new node, -+ update carry node pointer in operation. */ -+ node = sync_op(op, fresh); -+ not_enough_space = free_space_shortage(node, op); -+ if ((not_enough_space > 0) && (node != coord_shadow.node)) { -+ /* there is not enough free in new node. 
Shift -+ insertion point back to the @shadow_node so that -+ next new node would be inserted between -+ @shadow_node and @fresh. -+ */ -+ coord_normalize(&coord_shadow); -+ coord_dup(coord, &coord_shadow); -+ node = coord->node; -+ op->node = node_shadow; -+ if (1 || (flags & COPI_STEP_BACK)) { -+ /* still not enough space?! Maybe there is -+ enough space in the source node (i.e., node -+ data are moved from) now. -+ */ -+ not_enough_space = -+ free_space_shortage(node, op); -+ } -+ } -+ } -+ if (not_enough_space > 0) { -+ if (!(flags & COPI_DONT_ALLOCATE)) -+ warning("nikita-948", "Cannot insert new item"); -+ result = -E_NODE_FULL; -+ } -+ assert("nikita-1622", ergo(result == 0, -+ reiser4_carry_real(op->node) == coord->node)); -+ assert("nikita-2616", coord == op->u.insert.d->coord); -+ if (result == 0) -+ result = make_space_tail(op, doing, orig_node); -+ return result; -+} -+ -+/* insert_paste_common() - common part of insert and paste operations -+ -+ This function performs common part of COP_INSERT and COP_PASTE. -+ -+ There are two ways in which insertion/paste can be requested: -+ -+ . by directly supplying reiser4_item_data. In this case, op -> -+ u.insert.type is set to COPT_ITEM_DATA. -+ -+ . by supplying child pointer to which is to inserted into parent. In this -+ case op -> u.insert.type == COPT_CHILD. -+ -+ . by supplying key of new item/unit. This is currently only used during -+ extent insertion -+ -+ This is required, because when new node is allocated we don't know at what -+ position pointer to it is to be stored in the parent. Actually, we don't -+ even know what its parent will be, because parent can be re-balanced -+ concurrently and new node re-parented, and because parent can be full and -+ pointer to the new node will go into some other node. -+ -+ insert_paste_common() resolves pointer to child node into position in the -+ parent by calling find_new_child_coord(), that fills -+ reiser4_item_data. After this, insertion/paste proceeds uniformly. -+ -+ Another complication is with finding free space during pasting. It may -+ happen that while shifting items to the neighbors and newly allocated -+ nodes, insertion coord can no longer be in the item we wanted to paste -+ into. At this point, paste becomes (morphs) into insert. Moreover free -+ space analysis has to be repeated, because amount of space required for -+ insertion is different from that of paste (item header overhead, etc). -+ -+ This function "unifies" different insertion modes (by resolving child -+ pointer or key into insertion coord), and then calls make_space() to free -+ enough space in the node by shifting data to the left and right and by -+ allocating new nodes if necessary. Carry operation knows amount of space -+ required for its completion. After enough free space is obtained, caller of -+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste -+ by calling item plugin method. 
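insert_paste_common(), described above, normalises several "where to insert" descriptions (an explicit coord, a key, a child pointer) into one coordinate before make_space() runs. A sketch of that dispatch shape, illustration only; the toy_* names and the int "coordinates" are invented, and the two helper functions are stand-ins for the node lookup and find_new_child_coord() steps:

/* Illustration only: resolve one of three locator kinds to a coord. */
enum toy_locator { TOY_BY_COORD, TOY_BY_KEY, TOY_BY_CHILD };

struct toy_insert {
        enum toy_locator type;
        int coord;      /* valid for TOY_BY_COORD */
        int key;        /* valid for TOY_BY_KEY */
        int child;      /* valid for TOY_BY_CHILD */
};

static int toy_lookup_key(int key) { return key * 2; }       /* stand-in */
static int toy_find_child_slot(int child) { return child + 1; } /* stand-in */

static int toy_resolve_coord(const struct toy_insert *ins)
{
        switch (ins->type) {
        case TOY_BY_COORD:
                return ins->coord;      /* already resolved */
        case TOY_BY_KEY:
                return toy_lookup_key(ins->key);
        case TOY_BY_CHILD:
                return toy_find_child_slot(ins->child);
        }
        return -1;
}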
-+ -+*/ -+static int insert_paste_common(carry_op * op /* carry operation being -+ * performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo /* next carry level */ , -+ carry_insert_data * cdata /* pointer to -+ * cdata */ , -+ coord_t *coord /* insertion/paste coord */ , -+ reiser4_item_data * data /* data to be -+ * inserted/pasted */ ) -+{ -+ assert("nikita-981", op != NULL); -+ assert("nikita-980", todo != NULL); -+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) -+ || (op->op == COP_EXTENT)); -+ -+ if (op->u.insert.type == COPT_PASTE_RESTARTED) { -+ /* nothing to do. Fall through to make_space(). */ -+ ; -+ } else if (op->u.insert.type == COPT_KEY) { -+ node_search_result intra_node; -+ znode *node; -+ /* Problem with doing batching at the lowest level, is that -+ operations here are given by coords where modification is -+ to be performed, and one modification can invalidate coords -+ of all following operations. -+ -+ So, we are implementing yet another type for operation that -+ will use (the only) "locator" stable across shifting of -+ data between nodes, etc.: key (COPT_KEY). -+ -+ This clause resolves key to the coord in the node. -+ -+ But node can change also. Probably some pieces have to be -+ added to the lock_carry_node(), to lock node by its key. -+ -+ */ -+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain -+ if you need something else. */ -+ op->u.insert.d->coord = coord; -+ node = reiser4_carry_real(op->node); -+ intra_node = node_plugin_by_node(node)->lookup -+ (node, op->u.insert.d->key, FIND_EXACT, -+ op->u.insert.d->coord); -+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) { -+ warning("nikita-1715", "Intra node lookup failure: %i", -+ intra_node); -+ return intra_node; -+ } -+ } else if (op->u.insert.type == COPT_CHILD) { -+ /* if we are asked to insert pointer to the child into -+ internal node, first convert pointer to the child into -+ coord within parent node. -+ */ -+ znode *child; -+ int result; -+ -+ op->u.insert.d = cdata; -+ op->u.insert.d->coord = coord; -+ op->u.insert.d->data = data; -+ op->u.insert.d->coord->node = reiser4_carry_real(op->node); -+ result = find_new_child_coord(op); -+ child = reiser4_carry_real(op->u.insert.child); -+ if (result != NS_NOT_FOUND) { -+ warning("nikita-993", -+ "Cannot find a place for child pointer: %i", -+ result); -+ return result; -+ } -+ /* This only happens when we did multiple insertions at -+ the previous level, trying to insert single item and -+ it so happened, that insertion of pointers to all new -+ nodes before this one already caused parent node to -+ split (may be several times). -+ -+ I am going to come up with better solution. -+ -+ You are not expected to understand this. -+ -- v6root/usr/sys/ken/slp.c -+ -+ Basically, what happens here is the following: carry came -+ to the parent level and is about to insert internal item -+ pointing to the child node that it just inserted in the -+ level below. Position where internal item is to be inserted -+ was found by find_new_child_coord() above, but node of the -+ current carry operation (that is, parent node of child -+ inserted on the previous level), was determined earlier in -+ the lock_carry_level/lock_carry_node. It could so happen -+ that other carry operations already performed on the parent -+ level already split parent node, so that insertion point -+ moved into another node. Handle this by creating new carry -+ node for insertion point if necessary. 
-+ */ -+ if (reiser4_carry_real(op->node) != -+ op->u.insert.d->coord->node) { -+ pool_ordering direction; -+ znode *z1; -+ znode *z2; -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ /* -+ * determine in what direction insertion point -+ * moved. Do this by comparing delimiting keys. -+ */ -+ z1 = op->u.insert.d->coord->node; -+ z2 = reiser4_carry_real(op->node); -+ if (keyle(leftmost_key_in_node(z1, &k1), -+ leftmost_key_in_node(z2, &k2))) -+ /* insertion point moved to the left */ -+ direction = POOLO_BEFORE; -+ else -+ /* insertion point moved to the right */ -+ direction = POOLO_AFTER; -+ -+ op->node = reiser4_add_carry_skip(doing, -+ direction, op->node); -+ if (IS_ERR(op->node)) -+ return PTR_ERR(op->node); -+ op->node->node = op->u.insert.d->coord->node; -+ op->node->free = 1; -+ result = lock_carry_node(doing, op->node); -+ if (result != 0) -+ return result; -+ } -+ -+ /* -+ * set up key of an item being inserted: we are inserting -+ * internal item and its key is (by the very definition of -+ * search tree) is leftmost key in the child node. -+ */ -+ write_lock_dk(znode_get_tree(child)); -+ op->u.insert.d->key = leftmost_key_in_node(child, -+ znode_get_ld_key(child)); -+ write_unlock_dk(znode_get_tree(child)); -+ op->u.insert.d->data->arg = op->u.insert.brother; -+ } else { -+ assert("vs-243", op->u.insert.d->coord != NULL); -+ op->u.insert.d->coord->node = reiser4_carry_real(op->node); -+ } -+ -+ /* find free space. */ -+ return make_space(op, doing, todo); -+} -+ -+/* handle carry COP_INSERT operation. -+ -+ Insert new item into node. New item can be given in one of two ways: -+ -+ - by passing &tree_coord and &reiser4_item_data as part of @op. This is -+ only applicable at the leaf/twig level. -+ -+ - by passing a child node pointer to which is to be inserted by this -+ operation. -+ -+*/ -+static int carry_insert(carry_op * op /* operation to perform */ , -+ carry_level * doing /* queue of operations @op -+ * is part of */ , -+ carry_level * todo /* queue where new operations -+ * are accumulated */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t coord; -+ reiser4_item_data data; -+ carry_plugin_info info; -+ int result; -+ -+ assert("nikita-1036", op != NULL); -+ assert("nikita-1037", todo != NULL); -+ assert("nikita-1038", op->op == COP_INSERT); -+ -+ coord_init_zero(&coord); -+ -+ /* perform common functionality of insert and paste. */ -+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); -+ if (result != 0) -+ return result; -+ -+ node = op->u.insert.d->coord->node; -+ assert("nikita-1039", node != NULL); -+ assert("nikita-1040", node_plugin_by_node(node) != NULL); -+ -+ assert("nikita-949", -+ space_needed_for_op(node, op) <= znode_free_space(node)); -+ -+ /* ask node layout to create new item. */ -+ info.doing = doing; -+ info.todo = todo; -+ result = node_plugin_by_node(node)->create_item -+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, -+ &info); -+ doing->restartable = 0; -+ znode_make_dirty(node); -+ -+ return result; -+} -+ -+/* -+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is -+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree -+ * by slicing into multiple items. 
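The overall shape of COP_INSERT_FLOW is easy to picture before reading the machinery that follows: repeatedly carve off as much of the remaining flow as the current node can take, then move on. A toy sketch under the simplifying assumption of a fixed per-node capacity and no item overhead (both are computed dynamically in the real code below):

#include <stdio.h>

#define TOY_NODE_CAPACITY 4096   /* pretend free space per node */

/* Slice @length bytes into node-sized chunks, the way COP_INSERT_FLOW
   slices a flow into items; purely illustrative, not the kernel
   algorithm. */
static int count_slices(long length)
{
        int slices = 0;

        while (length > 0) {
                long chunk = length < TOY_NODE_CAPACITY
                             ? length : TOY_NODE_CAPACITY;
                length -= chunk;
                slices++;
        }
        return slices;
}

int main(void)
{
        printf("10000 bytes -> %d slices\n", count_slices(10000)); /* 3 */
        return 0;
}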
-+ */ -+ -+#define flow_insert_point(op) ((op)->u.insert_flow.insert_point) -+#define flow_insert_flow(op) ((op)->u.insert_flow.flow) -+#define flow_insert_data(op) ((op)->u.insert_flow.data) -+ -+static size_t item_data_overhead(carry_op * op) -+{ -+ if (flow_insert_data(op)->iplug->b.estimate == NULL) -+ return 0; -+ return (flow_insert_data(op)->iplug->b. -+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) - -+ flow_insert_data(op)->length); -+} -+ -+/* FIXME-VS: this is called several times during one make_flow_for_insertion -+ and it will always return the same result. Some optimization could be made -+ by calculating this value once at the beginning and passing it around. That -+ would reduce some flexibility in future changes -+*/ -+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *); -+static size_t flow_insertion_overhead(carry_op * op) -+{ -+ znode *node; -+ size_t insertion_overhead; -+ -+ node = flow_insert_point(op)->node; -+ insertion_overhead = 0; -+ if (node->nplug->item_overhead && -+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, -+ flow_insert_data(op))) -+ insertion_overhead = -+ node->nplug->item_overhead(node, NULL) + -+ item_data_overhead(op); -+ return insertion_overhead; -+} -+ -+/* how many bytes of flow does fit to the node */ -+static int what_can_fit_into_node(carry_op * op) -+{ -+ size_t free, overhead; -+ -+ overhead = flow_insertion_overhead(op); -+ free = znode_free_space(flow_insert_point(op)->node); -+ if (free <= overhead) -+ return 0; -+ free -= overhead; -+ /* FIXME: flow->length is loff_t only to not get overflowed in case of -+ expandign truncate */ -+ if (free < op->u.insert_flow.flow->length) -+ return free; -+ return (int)op->u.insert_flow.flow->length; -+} -+ -+/* in make_space_for_flow_insertion we need to check either whether whole flow -+ fits into a node or whether minimal fraction of flow fits into a node */ -+static int enough_space_for_whole_flow(carry_op * op) -+{ -+ return (unsigned)what_can_fit_into_node(op) == -+ op->u.insert_flow.flow->length; -+} -+ -+#define MIN_FLOW_FRACTION 1 -+static int enough_space_for_min_flow_fraction(carry_op * op) -+{ -+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op))); -+ -+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION; -+} -+ -+/* this returns 0 if left neighbor was obtained successfully and everything -+ upto insertion point including it were shifted and left neighbor still has -+ some free space to put minimal fraction of flow into it */ -+static int -+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ carry_node *left; -+ znode *orig; -+ -+ left = find_left_neighbor(op, doing); -+ if (unlikely(IS_ERR(left))) { -+ warning("vs-899", -+ "make_space_by_shift_left: " -+ "error accessing left neighbor: %li", PTR_ERR(left)); -+ return 1; -+ } -+ if (left == NULL) -+ /* left neighbor either does not exist or is unformatted -+ node */ -+ return 1; -+ -+ orig = flow_insert_point(op)->node; -+ /* try to shift content of node @orig from its head upto insert point -+ including insertion point into the left neighbor */ -+ carry_shift_data(LEFT_SIDE, flow_insert_point(op), -+ reiser4_carry_real(left), doing, todo, -+ 1/* including insert point */); -+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) { -+ /* insertion point did not move */ -+ return 1; -+ } -+ -+ /* insertion point is set after last item in the node */ -+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op))); 
-+ -+ if (!enough_space_for_min_flow_fraction(op)) { -+ /* insertion point node does not have enough free space to put -+ even minimal portion of flow into it, therefore, move -+ insertion point back to orig node (before first item) */ -+ coord_init_before_first_item(flow_insert_point(op), orig); -+ return 1; -+ } -+ -+ /* part of flow is to be written to the end of node */ -+ op->node = left; -+ return 0; -+} -+ -+/* this returns 0 if right neighbor was obtained successfully and everything to -+ the right of insertion point was shifted to it and node got enough free -+ space to put minimal fraction of flow into it */ -+static int -+make_space_by_shift_right(carry_op * op, carry_level * doing, -+ carry_level * todo) -+{ -+ carry_node *right; -+ -+ right = find_right_neighbor(op, doing); -+ if (unlikely(IS_ERR(right))) { -+ warning("nikita-1065", "shift_right_excluding_insert_point: " -+ "error accessing right neighbor: %li", PTR_ERR(right)); -+ return 1; -+ } -+ if (right) { -+ /* shift everything possible on the right of but excluding -+ insertion coord into the right neighbor */ -+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), -+ reiser4_carry_real(right), doing, todo, -+ 0/* not including insert point */); -+ } else { -+ /* right neighbor either does not exist or is unformatted -+ node */ -+ ; -+ } -+ if (coord_is_after_rightmost(flow_insert_point(op))) { -+ if (enough_space_for_min_flow_fraction(op)) { -+ /* part of flow is to be written to the end of node */ -+ return 0; -+ } -+ } -+ -+ /* new node is to be added if insert point node did not get enough -+ space for whole flow */ -+ return 1; -+} -+ -+/* this returns 0 when insert coord is set at the node end and fraction of flow -+ fits into that node */ -+static int -+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ int result; -+ znode *node; -+ carry_node *new; -+ -+ node = flow_insert_point(op)->node; -+ -+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) -+ return RETERR(-E_NODE_FULL); -+ /* add new node after insert point node */ -+ new = add_new_znode(node, op->node, doing, todo); -+ if (unlikely(IS_ERR(new))) -+ return PTR_ERR(new); -+ result = lock_carry_node(doing, new); -+ zput(reiser4_carry_real(new)); -+ if (unlikely(result)) -+ return result; -+ op->u.insert_flow.new_nodes++; -+ if (!coord_is_after_rightmost(flow_insert_point(op))) { -+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), -+ reiser4_carry_real(new), doing, todo, -+ 0/* not including insert point */); -+ assert("vs-901", -+ coord_is_after_rightmost(flow_insert_point(op))); -+ -+ if (enough_space_for_min_flow_fraction(op)) -+ return 0; -+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) -+ return RETERR(-E_NODE_FULL); -+ -+ /* add one more new node */ -+ new = add_new_znode(node, op->node, doing, todo); -+ if (unlikely(IS_ERR(new))) -+ return PTR_ERR(new); -+ result = lock_carry_node(doing, new); -+ zput(reiser4_carry_real(new)); -+ if (unlikely(result)) -+ return result; -+ op->u.insert_flow.new_nodes++; -+ } -+ -+ /* move insertion point to new node */ -+ coord_init_before_first_item(flow_insert_point(op), -+ reiser4_carry_real(new)); -+ op->node = new; -+ return 0; -+} -+ -+static int -+make_space_for_flow_insertion(carry_op * op, carry_level * doing, -+ carry_level * todo) -+{ -+ __u32 flags = op->u.insert_flow.flags; -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ if (!(flags & COPI_DONT_SHIFT_LEFT) -+ && 
(make_space_by_shift_left(op, doing, todo) == 0)) { -+ /* insert point is shifted to left neighbor of original insert -+ point node and is set after last unit in that node. It has -+ enough space to fit at least minimal fraction of flow. */ -+ return 0; -+ } -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ if (!(flags & COPI_DONT_SHIFT_RIGHT) -+ && (make_space_by_shift_right(op, doing, todo) == 0)) { -+ /* insert point is still set to the same node, but there is -+ nothing to the right of insert point. */ -+ return 0; -+ } -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ return make_space_by_new_nodes(op, doing, todo); -+} -+ -+/* implements COP_INSERT_FLOW operation */ -+static int -+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ int result; -+ flow_t *f; -+ coord_t *insert_point; -+ node_plugin *nplug; -+ carry_plugin_info info; -+ znode *orig_node; -+ lock_handle *orig_lh; -+ -+ f = op->u.insert_flow.flow; -+ result = 0; -+ -+ /* carry system needs this to work */ -+ info.doing = doing; -+ info.todo = todo; -+ -+ orig_node = flow_insert_point(op)->node; -+ orig_lh = doing->tracked; -+ -+ while (f->length) { -+ result = make_space_for_flow_insertion(op, doing, todo); -+ if (result) -+ break; -+ -+ insert_point = flow_insert_point(op); -+ nplug = node_plugin_by_node(insert_point->node); -+ -+ /* compose item data for insertion/pasting */ -+ flow_insert_data(op)->data = f->data; -+ flow_insert_data(op)->length = what_can_fit_into_node(op); -+ -+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) { -+ /* insert point is set to item of file we are writing to -+ and we have to append to it */ -+ assert("vs-903", insert_point->between == AFTER_UNIT); -+ nplug->change_item_size(insert_point, -+ flow_insert_data(op)->length); -+ flow_insert_data(op)->iplug->b.paste(insert_point, -+ flow_insert_data -+ (op), &info); -+ } else { -+ /* new item must be inserted */ -+ pos_in_node_t new_pos; -+ flow_insert_data(op)->length += item_data_overhead(op); -+ -+ /* FIXME-VS: this is because node40_create_item changes -+ insert_point for obscure reasons */ -+ switch (insert_point->between) { -+ case AFTER_ITEM: -+ new_pos = insert_point->item_pos + 1; -+ break; -+ case EMPTY_NODE: -+ new_pos = 0; -+ break; -+ case BEFORE_ITEM: -+ assert("vs-905", insert_point->item_pos == 0); -+ new_pos = 0; -+ break; -+ default: -+ impossible("vs-906", -+ "carry_insert_flow: invalid coord"); -+ new_pos = 0; -+ break; -+ } -+ -+ nplug->create_item(insert_point, &f->key, -+ flow_insert_data(op), &info); -+ coord_set_item_pos(insert_point, new_pos); -+ } -+ coord_init_after_item_end(insert_point); -+ doing->restartable = 0; -+ znode_make_dirty(insert_point->node); -+ -+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length); -+ } -+ -+ if (orig_node != flow_insert_point(op)->node) { -+ /* move lock to new insert point */ -+ done_lh(orig_lh); -+ init_lh(orig_lh); -+ result = -+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node, -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); -+ } -+ -+ return result; -+} -+ -+/* implements COP_DELETE operation -+ -+ Remove pointer to @op -> u.delete.child from it's parent. -+ -+ This function also handles killing of a tree root is last pointer from it -+ was removed. This is complicated by our handling of "twig" level: root on -+ twig level is never killed. 
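The "twig level" rule stated above reduces to a two-condition predicate, visible in carry_delete() below: the root may only die when the tree is taller than the minimal height and the root is down to its last pointer. A compilable sketch with toy fields (not the kernel types):

#include <stdio.h>

#define TOY_MIN_TREE_HEIGHT 2   /* stands in for REISER4_MIN_TREE_HEIGHT */

struct toy_root { int level; int num_items; };

/* sketch of the root-kill test: never kill a root at or below the
   twig level, and only when a single item remains */
static int root_may_die(const struct toy_root *r)
{
        return r->level > TOY_MIN_TREE_HEIGHT && r->num_items == 1;
}

int main(void)
{
        struct toy_root twig_root = { 2, 1 }, tall_root = { 3, 1 };

        printf("twig root: %d, tall root: %d\n",
               root_may_die(&twig_root), root_may_die(&tall_root)); /* 0, 1 */
        return 0;
}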
-+ -+*/ -+static int carry_delete(carry_op * op /* operation to be performed */ , -+ carry_level * doing UNUSED_ARG /* current carry -+ * level */ , -+ carry_level * todo/* next carry level */) -+{ -+ int result; -+ coord_t coord; -+ coord_t coord2; -+ znode *parent; -+ znode *child; -+ carry_plugin_info info; -+ reiser4_tree *tree; -+ -+ /* -+ * This operation is called to delete internal item pointing to the -+ * child node that was removed by carry from the tree on the previous -+ * tree level. -+ */ -+ -+ assert("nikita-893", op != NULL); -+ assert("nikita-894", todo != NULL); -+ assert("nikita-895", op->op == COP_DELETE); -+ -+ coord_init_zero(&coord); -+ coord_init_zero(&coord2); -+ -+ parent = reiser4_carry_real(op->node); -+ child = op->u.delete.child ? -+ reiser4_carry_real(op->u.delete.child) : op->node->node; -+ tree = znode_get_tree(child); -+ read_lock_tree(tree); -+ -+ /* -+ * @parent was determined when carry entered parent level -+ * (lock_carry_level/lock_carry_node). Since then, actual parent of -+ * @child node could change due to other carry operations performed on -+ * the parent level. Check for this. -+ */ -+ -+ if (znode_parent(child) != parent) { -+ /* NOTE-NIKITA add stat counter for this. */ -+ parent = znode_parent(child); -+ assert("nikita-2581", find_carry_node(doing, parent)); -+ } -+ read_unlock_tree(tree); -+ -+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL); -+ -+ /* Twig level horrors: tree should be of height at least 2. So, last -+ pointer from the root at twig level is preserved even if child is -+ empty. This is ugly, but so it was architectured. -+ */ -+ -+ if (znode_is_root(parent) && -+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT && -+ node_num_items(parent) == 1) { -+ /* Delimiting key manipulations. */ -+ write_lock_dk(tree); -+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key())); -+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key())); -+ ZF_SET(child, JNODE_DKSET); -+ write_unlock_dk(tree); -+ -+ /* @child escaped imminent death! */ -+ ZF_CLR(child, JNODE_HEARD_BANSHEE); -+ return 0; -+ } -+ -+ /* convert child pointer to the coord_t */ -+ result = find_child_ptr(parent, child, &coord); -+ if (result != NS_FOUND) { -+ warning("nikita-994", "Cannot find child pointer: %i", result); -+ print_coord_content("coord", &coord); -+ return result; -+ } -+ -+ coord_dup(&coord2, &coord); -+ info.doing = doing; -+ info.todo = todo; -+ { -+ /* -+ * Actually kill internal item: prepare structure with -+ * arguments for ->cut_and_kill() method... -+ */ -+ -+ struct carry_kill_data kdata; -+ kdata.params.from = &coord; -+ kdata.params.to = &coord2; -+ kdata.params.from_key = NULL; -+ kdata.params.to_key = NULL; -+ kdata.params.smallest_removed = NULL; -+ kdata.params.truncate = 1; -+ kdata.flags = op->u.delete.flags; -+ kdata.inode = NULL; -+ kdata.left = NULL; -+ kdata.right = NULL; -+ kdata.buf = NULL; -+ /* ... and call it. */ -+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata, -+ &info); -+ } -+ doing->restartable = 0; -+ -+ /* check whether root should be killed violently */ -+ if (znode_is_root(parent) && -+ /* don't kill roots at and lower than twig level */ -+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT && -+ node_num_items(parent) == 1) -+ result = reiser4_kill_tree_root(coord.node); -+ -+ return result < 0 ? : 0; -+} -+ -+/* implements COP_CUT opration -+ -+ Cuts part or whole content of node. 
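The "return result < 0 ? : 0;" at the end of carry_delete() above (and again in carry_cut() below) relies on GCC's omitted-middle-operand conditional: "a ? : b" means "a ? a : b". Here the condition is the boolean "result < 0", so any negative result collapses to the generic failure value 1 and everything else to 0. A small demonstration (GNU C extension; builds with gcc or clang):

#include <stdio.h>

/* "a ? : b" evaluates a once and yields a itself when it is nonzero,
   b otherwise; applied to "result < 0 ? : 0", a negative result gives
   1 and everything else gives 0 */
static int normalize(int result)
{
        return result < 0 ? : 0;   /* GNU extension */
}

int main(void)
{
        printf("%d %d %d\n", normalize(-5), normalize(0), normalize(7));
        /* prints: 1 0 0 */
        return 0;
}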
-+ -+*/ -+static int carry_cut(carry_op * op /* operation to be performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo/* next carry level */) -+{ -+ int result; -+ carry_plugin_info info; -+ node_plugin *nplug; -+ -+ assert("nikita-896", op != NULL); -+ assert("nikita-897", todo != NULL); -+ assert("nikita-898", op->op == COP_CUT); -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ nplug = node_plugin_by_node(reiser4_carry_real(op->node)); -+ if (op->u.cut_or_kill.is_cut) -+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info); -+ else -+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info); -+ -+ doing->restartable = 0; -+ return result < 0 ? : 0; -+} -+ -+/* helper function for carry_paste(): returns true if @op can be continued as -+ paste */ -+static int -+can_paste(coord_t *icoord, const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ coord_t circa; -+ item_plugin *new_iplug; -+ item_plugin *old_iplug; -+ int result = 0; /* to keep gcc shut */ -+ -+ assert("", icoord->between != AT_UNIT); -+ -+ /* obviously, one cannot paste when node is empty---there is nothing -+ to paste into. */ -+ if (node_is_empty(icoord->node)) -+ return 0; -+ /* if insertion point is at the middle of the item, then paste */ -+ if (!coord_is_between_items(icoord)) -+ return 1; -+ coord_dup(&circa, icoord); -+ circa.between = AT_UNIT; -+ -+ old_iplug = item_plugin_by_coord(&circa); -+ new_iplug = data->iplug; -+ -+ /* check whether we can paste to the item @icoord is "at" when we -+ ignore ->between field */ -+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) -+ result = 1; -+ else if (icoord->between == BEFORE_UNIT -+ || icoord->between == BEFORE_ITEM) { -+ /* otherwise, try to glue to the item at the left, if any */ -+ coord_dup(&circa, icoord); -+ if (coord_set_to_left(&circa)) { -+ result = 0; -+ coord_init_before_item(icoord); -+ } else { -+ old_iplug = item_plugin_by_coord(&circa); -+ result = (old_iplug == new_iplug) -+ && item_can_contain_key(icoord, key, data); -+ if (result) { -+ coord_dup(icoord, &circa); -+ icoord->between = AFTER_UNIT; -+ } -+ } -+ } else if (icoord->between == AFTER_UNIT -+ || icoord->between == AFTER_ITEM) { -+ coord_dup(&circa, icoord); -+ /* otherwise, try to glue to the item at the right, if any */ -+ if (coord_set_to_right(&circa)) { -+ result = 0; -+ coord_init_after_item(icoord); -+ } else { -+ int (*cck) (const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+ -+ old_iplug = item_plugin_by_coord(&circa); -+ -+ cck = old_iplug->b.can_contain_key; -+ if (cck == NULL) -+ /* item doesn't define ->can_contain_key -+ method? So it is not expandable. */ -+ result = 0; -+ else { -+ result = (old_iplug == new_iplug) -+ && cck(&circa /*icoord */ , key, data); -+ if (result) { -+ coord_dup(icoord, &circa); -+ icoord->between = BEFORE_UNIT; -+ } -+ } -+ } -+ } else -+ impossible("nikita-2513", "Nothing works"); -+ if (result) { -+ if (icoord->between == BEFORE_ITEM) { -+ assert("vs-912", icoord->unit_pos == 0); -+ icoord->between = BEFORE_UNIT; -+ } else if (icoord->between == AFTER_ITEM) { -+ coord_init_after_item_end(icoord); -+ } -+ } -+ return result; -+} -+ -+/* implements COP_PASTE operation -+ -+ Paste data into existing item. This is complicated by the fact that after -+ we shifted something to the left or right neighbors trying to free some -+ space, item we were supposed to paste into can be in different node than -+ insertion coord. If so, we are no longer doing paste, but insert. 
See -+ comments in insert_paste_common(). -+ -+*/ -+static int carry_paste(carry_op * op /* operation to be performed */ , -+ carry_level * doing UNUSED_ARG /* current carry -+ * level */ , -+ carry_level * todo/* next carry level */) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t dcoord; -+ reiser4_item_data data; -+ int result; -+ int real_size; -+ item_plugin *iplug; -+ carry_plugin_info info; -+ coord_t *coord; -+ -+ assert("nikita-982", op != NULL); -+ assert("nikita-983", todo != NULL); -+ assert("nikita-984", op->op == COP_PASTE); -+ -+ coord_init_zero(&dcoord); -+ -+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data); -+ if (result != 0) -+ return result; -+ -+ coord = op->u.insert.d->coord; -+ -+ /* handle case when op -> u.insert.coord doesn't point to the item -+ of required type. restart as insert. */ -+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) { -+ op->op = COP_INSERT; -+ op->u.insert.type = COPT_PASTE_RESTARTED; -+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo); -+ -+ return result; -+ } -+ -+ node = coord->node; -+ iplug = item_plugin_by_coord(coord); -+ assert("nikita-992", iplug != NULL); -+ -+ assert("nikita-985", node != NULL); -+ assert("nikita-986", node_plugin_by_node(node) != NULL); -+ -+ assert("nikita-987", -+ space_needed_for_op(node, op) <= znode_free_space(node)); -+ -+ assert("nikita-1286", coord_is_existing_item(coord)); -+ -+ /* -+ * if item is expanded as a result of this operation, we should first -+ * change item size, than call ->b.paste item method. If item is -+ * shrunk, it should be done other way around: first call ->b.paste -+ * method, then reduce item size. -+ */ -+ -+ real_size = space_needed_for_op(node, op); -+ if (real_size > 0) -+ node->nplug->change_item_size(coord, real_size); -+ -+ doing->restartable = 0; -+ info.doing = doing; -+ info.todo = todo; -+ -+ result = iplug->b.paste(coord, op->u.insert.d->data, &info); -+ -+ if (real_size < 0) -+ node->nplug->change_item_size(coord, real_size); -+ -+ /* if we pasted at the beginning of the item, update item's key. */ -+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT) -+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info); -+ -+ znode_make_dirty(node); -+ return result; -+} -+ -+/* handle carry COP_EXTENT operation. */ -+static int carry_extent(carry_op * op /* operation to perform */ , -+ carry_level * doing /* queue of operations @op -+ * is part of */ , -+ carry_level * todo /* queue where new operations -+ * are accumulated */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t coord; -+ reiser4_item_data data; -+ carry_op *delete_dummy; -+ carry_op *insert_extent; -+ int result; -+ carry_plugin_info info; -+ -+ assert("nikita-1751", op != NULL); -+ assert("nikita-1752", todo != NULL); -+ assert("nikita-1753", op->op == COP_EXTENT); -+ -+ /* extent insertion overview: -+ -+ extents live on the TWIG LEVEL, which is level one above the leaf -+ one. This complicates extent insertion logic somewhat: it may -+ happen (and going to happen all the time) that in logical key -+ ordering extent has to be placed between items I1 and I2, located -+ at the leaf level, but I1 and I2 are in the same formatted leaf -+ node N1. To insert extent one has to -+ -+ (1) reach node N1 and shift data between N1, its neighbors and -+ possibly newly allocated nodes until I1 and I2 fall into different -+ nodes. 
Since I1 and I2 are still neighboring items in logical key -+ order, they will be necessary utmost items in their respective -+ nodes. -+ -+ (2) After this new extent item is inserted into node on the twig -+ level. -+ -+ Fortunately this process can reuse almost all code from standard -+ insertion procedure (viz. make_space() and insert_paste_common()), -+ due to the following observation: make_space() only shifts data up -+ to and excluding or including insertion point. It never -+ "over-moves" through insertion point. Thus, one can use -+ make_space() to perform step (1). All required for this is just to -+ instruct free_space_shortage() to keep make_space() shifting data -+ until insertion point is at the node border. -+ -+ */ -+ -+ /* perform common functionality of insert and paste. */ -+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); -+ if (result != 0) -+ return result; -+ -+ node = op->u.extent.d->coord->node; -+ assert("nikita-1754", node != NULL); -+ assert("nikita-1755", node_plugin_by_node(node) != NULL); -+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE); -+ -+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that -+ extent fits between items. */ -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ /* there is another complication due to placement of extents on the -+ twig level: extents are "rigid" in the sense that key-range -+ occupied by extent cannot grow indefinitely to the right as it is -+ for the formatted leaf nodes. Because of this when search finds two -+ adjacent extents on the twig level, it has to "drill" to the leaf -+ level, creating new node. Here we are removing this node. -+ */ -+ if (node_is_empty(node)) { -+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1); -+ if (IS_ERR(delete_dummy)) -+ return PTR_ERR(delete_dummy); -+ delete_dummy->u.delete.child = NULL; -+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY; -+ ZF_SET(node, JNODE_HEARD_BANSHEE); -+ } -+ -+ /* proceed with inserting extent item into parent. We are definitely -+ inserting rather than pasting if we get that far. */ -+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1); -+ if (IS_ERR(insert_extent)) -+ /* @delete_dummy will be automatically destroyed on the level -+ exiting */ -+ return PTR_ERR(insert_extent); -+ /* NOTE-NIKITA insertion by key is simplest option here. Another -+ possibility is to insert on the left or right of already existing -+ item. -+ */ -+ insert_extent->u.insert.type = COPT_KEY; -+ insert_extent->u.insert.d = op->u.extent.d; -+ assert("nikita-1719", op->u.extent.d->key != NULL); -+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord; -+ insert_extent->u.insert.flags = -+ znode_get_tree(node)->carry.new_extent_flags; -+ -+ /* -+ * if carry was asked to track lock handle we should actually track -+ * lock handle on the twig node rather than on the leaf where -+ * operation was started from. Transfer tracked lock handle. -+ */ -+ if (doing->track_type) { -+ assert("nikita-3242", doing->tracked != NULL); -+ assert("nikita-3244", todo->tracked == NULL); -+ todo->tracked = doing->tracked; -+ todo->track_type = CARRY_TRACK_NODE; -+ doing->tracked = NULL; -+ doing->track_type = 0; -+ } -+ -+ return 0; -+} -+ -+/* update key in @parent between pointers to @left and @right. -+ -+ Find coords of @left and @right and update delimiting key between them. -+ This is helper function called by carry_update(). Finds position of -+ internal item involved. Updates item key. 
Updates delimiting keys of child -+ nodes involved. -+*/ -+static int update_delimiting_key(znode * parent /* node key is updated -+ * in */ , -+ znode * left /* child of @parent */ , -+ znode * right /* child of @parent */ , -+ carry_level * doing /* current carry -+ * level */ , -+ carry_level * todo /* parent carry -+ * level */ , -+ const char **error_msg /* place to -+ * store error -+ * message */ ) -+{ -+ coord_t left_pos; -+ coord_t right_pos; -+ int result; -+ reiser4_key ldkey; -+ carry_plugin_info info; -+ -+ assert("nikita-1177", right != NULL); -+ /* find position of the right child in the parent */ -+ result = find_child_ptr(parent, right, &right_pos); -+ if (result != NS_FOUND) { -+ *error_msg = "Cannot find position of right child"; -+ return result; -+ } -+ -+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) { -+ /* find position of the left child in a parent */ -+ result = find_child_ptr(parent, left, &left_pos); -+ if (result != NS_FOUND) { -+ *error_msg = "Cannot find position of left child"; -+ return result; -+ } -+ assert("nikita-1355", left_pos.node != NULL); -+ } else -+ left_pos.node = NULL; -+ -+ /* check that they are separated by exactly one key and are basically -+ sane */ -+ if (REISER4_DEBUG) { -+ if ((left_pos.node != NULL) -+ && !coord_is_existing_unit(&left_pos)) { -+ *error_msg = "Left child is bastard"; -+ return RETERR(-EIO); -+ } -+ if (!coord_is_existing_unit(&right_pos)) { -+ *error_msg = "Right child is bastard"; -+ return RETERR(-EIO); -+ } -+ if (left_pos.node != NULL && -+ !coord_are_neighbors(&left_pos, &right_pos)) { -+ *error_msg = "Children are not direct siblings"; -+ return RETERR(-EIO); -+ } -+ } -+ *error_msg = NULL; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ /* -+ * If child node is not empty, new key of internal item is a key of -+ * leftmost item in the child node. If the child is empty, take its -+ * right delimiting key as a new key of the internal item. Precise key -+ * in the latter case is not important per se, because the child (and -+ * the internal item) are going to be killed shortly anyway, but we -+ * have to preserve correct order of keys in the parent node. -+ */ -+ -+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE)) -+ leftmost_key_in_node(right, &ldkey); -+ else { -+ read_lock_dk(znode_get_tree(parent)); -+ ldkey = *znode_get_rd_key(right); -+ read_unlock_dk(znode_get_tree(parent)); -+ } -+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info); -+ doing->restartable = 0; -+ znode_make_dirty(parent); -+ return 0; -+} -+ -+/* implements COP_UPDATE operation -+ -+ Update delimiting keys. -+ -+*/ -+static int carry_update(carry_op * op /* operation to be performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo/* next carry level */) -+{ -+ int result; -+ carry_node *missing UNUSED_ARG; -+ znode *left; -+ znode *right; -+ carry_node *lchild; -+ carry_node *rchild; -+ const char *error_msg; -+ reiser4_tree *tree; -+ -+ /* -+ * This operation is called to update key of internal item. This is -+ * necessary when carry shifted or cut data on the child -+ * level. Arguments of this operation are: -+ * -+ * @right --- child node. Operation should update key of internal -+ * item pointing to @right. -+ * -+ * @left --- left neighbor of @right. This parameter is optional. 
-+ */ -+ -+ assert("nikita-902", op != NULL); -+ assert("nikita-903", todo != NULL); -+ assert("nikita-904", op->op == COP_UPDATE); -+ -+ lchild = op->u.update.left; -+ rchild = op->node; -+ -+ if (lchild != NULL) { -+ assert("nikita-1001", lchild->parent); -+ assert("nikita-1003", !lchild->left); -+ left = reiser4_carry_real(lchild); -+ } else -+ left = NULL; -+ -+ tree = znode_get_tree(rchild->node); -+ read_lock_tree(tree); -+ right = znode_parent(rchild->node); -+ read_unlock_tree(tree); -+ -+ if (right != NULL) { -+ result = update_delimiting_key(right, -+ lchild ? lchild->node : NULL, -+ rchild->node, -+ doing, todo, &error_msg); -+ } else { -+ error_msg = "Cannot find node to update key in"; -+ result = RETERR(-EIO); -+ } -+ /* operation will be reposted to the next level by the -+ ->update_item_key() method of node plugin, if necessary. */ -+ -+ if (result != 0) { -+ warning("nikita-999", "Error updating delimiting key: %s (%i)", -+ error_msg ? : "", result); -+ } -+ return result; -+} -+ -+/* move items from @node during carry */ -+static int carry_shift_data(sideof side /* in what direction to move data */ , -+ coord_t *insert_coord /* coord where new item -+ * is to be inserted */, -+ znode * node /* node which data are moved from */ , -+ carry_level * doing /* active carry queue */ , -+ carry_level * todo /* carry queue where new -+ * operations are to be put -+ * in */ , -+ unsigned int including_insert_coord_p -+ /* true if @insertion_coord can be moved */ ) -+{ -+ int result; -+ znode *source; -+ carry_plugin_info info; -+ node_plugin *nplug; -+ -+ source = insert_coord->node; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ nplug = node_plugin_by_node(node); -+ result = nplug->shift(insert_coord, node, -+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0, -+ (int)including_insert_coord_p, &info); -+ /* the only error ->shift() method of node plugin can return is -+ -ENOMEM due to carry node/operation allocation. */ -+ assert("nikita-915", result >= 0 || result == -ENOMEM); -+ if (result > 0) { -+ /* -+ * if some number of bytes was actually shifted, mark nodes -+ * dirty, and carry level as non-restartable. -+ */ -+ doing->restartable = 0; -+ znode_make_dirty(source); -+ znode_make_dirty(node); -+ } -+ -+ assert("nikita-2077", coord_check(insert_coord)); -+ return 0; -+} -+ -+typedef carry_node *(*carry_iterator) (carry_node * node); -+static carry_node *find_dir_carry(carry_node * node, carry_level * level, -+ carry_iterator iterator); -+ -+static carry_node *pool_level_list_prev(carry_node *node) -+{ -+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage); -+} -+ -+/* look for the left neighbor of given carry node in a carry queue. -+ -+ This is used by find_left_neighbor(), but I am not sure that this -+ really gives any advantage. More statistics required. -+ -+*/ -+carry_node *find_left_carry(carry_node * node /* node to find left neighbor -+ * of */ , -+ carry_level * level/* level to scan */) -+{ -+ return find_dir_carry(node, level, -+ (carry_iterator) pool_level_list_prev); -+} -+ -+static carry_node *pool_level_list_next(carry_node *node) -+{ -+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); -+} -+ -+/* look for the right neighbor of given carry node in a -+ carry queue. -+ -+ This is used by find_right_neighbor(), but I am not sure that this -+ really gives any advantage. More statistics required. 
-+ -+*/ -+carry_node *find_right_carry(carry_node * node /* node to find right neighbor -+ * of */ , -+ carry_level * level/* level to scan */) -+{ -+ return find_dir_carry(node, level, -+ (carry_iterator) pool_level_list_next); -+} -+ -+/* look for the left or right neighbor of given carry node in a carry -+ queue. -+ -+ Helper function used by find_{left|right}_carry(). -+*/ -+static carry_node *find_dir_carry(carry_node * node /* node to start -+ * scanning from */ , -+ carry_level * level /* level to scan */ , -+ carry_iterator iterator /* operation to -+ * move to the -+ * next node */) -+{ -+ carry_node *neighbor; -+ -+ assert("nikita-1059", node != NULL); -+ assert("nikita-1060", level != NULL); -+ -+ /* scan list of carry nodes on this list dir-ward, skipping all -+ carry nodes referencing the same znode. */ -+ neighbor = node; -+ while (1) { -+ neighbor = iterator(neighbor); -+ if (carry_node_end(level, neighbor)) -+ /* list head is reached */ -+ return NULL; -+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node)) -+ return neighbor; -+ } -+} -+ -+/* -+ * Memory reservation estimation. -+ * -+ * Carry process proceeds through tree levels upwards. Carry assumes that it -+ * takes tree in consistent state (e.g., that search tree invariants hold), -+ * and leaves tree consistent after it finishes. This means that when some -+ * error occurs carry cannot simply return if there are pending carry -+ * operations. Generic solution for this problem is carry-undo either as -+ * transaction manager feature (requiring checkpoints and isolation), or -+ * through some carry specific mechanism. -+ * -+ * Our current approach is to panic if carry hits an error while tree is -+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around -+ * this "memory reservation" mechanism was added. -+ * -+ * Memory reservation is implemented by perthread-pages.diff patch from -+ * core-patches. Its API is defined in <linux/gfp.h> -+ * -+ * int perthread_pages_reserve(int nrpages, gfp_t gfp); -+ * void perthread_pages_release(int nrpages); -+ * int perthread_pages_count(void); -+ * -+ * carry estimates its worst case memory requirements at the entry, reserves -+ * enough memory, and releases unused pages before returning. -+ * -+ * Code below estimates worst case memory requirements for a given carry -+ * queue. This is done by summing worst case memory requirements for each -+ * operation in the queue. -+ * -+ */ -+ -+/* -+ * Memory requirements of many operations depend on the tree -+ * height. For example, item insertion requires new node to be inserted at -+ * each tree level in the worst case. What tree height should be used for -+ * estimation? Current tree height is wrong, because tree height can change -+ * between the time when estimation was done and the time when operation is -+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT) -+ * is also not desirable, because it would lead to the huge over-estimation -+ * all the time. Plausible solution is "capped tree height": if current tree -+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is -+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is -+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely -+ * to be increased even more during short interval of time. -+ */ -+#define TREE_HEIGHT_CAP (5) -+ -+/* return capped tree height for the @tree. See comment above. 
*/ -+static int cap_tree_height(reiser4_tree * tree) -+{ -+ return max_t(int, tree->height, TREE_HEIGHT_CAP); -+} -+ -+/* return capped tree height for the current tree. */ -+static int capped_height(void) -+{ -+ return cap_tree_height(current_tree); -+} -+ -+/* return number of pages required to store given number of bytes */ -+static int bytes_to_pages(int bytes) -+{ -+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+} -+ -+/* how many pages are required to allocate znodes during item insertion. */ -+static int carry_estimate_znodes(void) -+{ -+ /* -+ * Note, that there we have some problem here: there is no way to -+ * reserve pages specifically for the given slab. This means that -+ * these pages can be hijacked for some other end. -+ */ -+ -+ /* in the worst case we need 3 new znode on each tree level */ -+ return bytes_to_pages(capped_height() * sizeof(znode) * 3); -+} -+ -+/* -+ * how many pages are required to load bitmaps. One bitmap per level. -+ */ -+static int carry_estimate_bitmaps(void) -+{ -+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) { -+ int bytes; -+ -+ bytes = capped_height() * (0 + /* bnode should be added, but -+ * it is private to bitmap.c, -+ * skip for now. */ -+ 2 * sizeof(jnode)); -+ /* working and commit jnodes */ -+ return bytes_to_pages(bytes) + 2; /* and their contents */ -+ } else -+ /* bitmaps were pre-loaded during mount */ -+ return 0; -+} -+ -+/* worst case item insertion memory requirements */ -+static int carry_estimate_insert(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + -+ /* new atom */ -+ capped_height() + /* new block on each level */ -+ 1 + /* and possibly extra new block at the leaf level */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case item deletion memory requirements */ -+static int carry_estimate_delete(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + -+ /* new atom */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case tree cut memory requirements */ -+static int carry_estimate_cut(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + -+ /* new atom */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case memory requirements of pasting into item */ -+static int carry_estimate_paste(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + -+ /* new atom */ -+ capped_height() + /* new block on each level */ -+ 1 + /* and possibly extra new block at the leaf level */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case memory requirements of extent insertion */ -+static int carry_estimate_extent(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_insert(op, level) + /* insert extent */ -+ carry_estimate_delete(op, level); /* kill leaf */ -+} -+ -+/* worst case memory requirements of key update */ -+static int carry_estimate_update(carry_op * op, carry_level * level) -+{ -+ return 0; -+} -+ -+/* worst case memory requirements of flow insertion */ -+static int carry_estimate_insert_flow(carry_op * op, carry_level * level) -+{ -+ int newnodes; -+ -+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length), -+ CARRY_FLOW_NEW_NODES_LIMIT); -+ /* -+ * roughly estimate insert_flow as a sequence of insertions. 
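bytes_to_pages() above is the standard round-up division written as add-then-shift: with 4 KiB pages, (bytes + 4095) >> 12. A quick self-contained check (the page size is hard-coded here as an assumption; the kernel code uses PAGE_CACHE_SIZE/PAGE_CACHE_SHIFT):

#include <stdio.h>

#define TOY_PAGE_SHIFT 12
#define TOY_PAGE_SIZE  (1 << TOY_PAGE_SHIFT)   /* assume 4 KiB pages */

/* round-up division via the add-(divisor-1)-then-shift idiom */
static int toy_bytes_to_pages(int bytes)
{
        return (bytes + TOY_PAGE_SIZE - 1) >> TOY_PAGE_SHIFT;
}

int main(void)
{
        /* 0 -> 0, 1 -> 1, 4096 -> 1, 4097 -> 2 */
        printf("%d %d %d %d\n",
               toy_bytes_to_pages(0), toy_bytes_to_pages(1),
               toy_bytes_to_pages(4096), toy_bytes_to_pages(4097));
        return 0;
}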
-+ */ -+ return newnodes * carry_estimate_insert(op, level); -+} -+ -+/* This is dispatch table for carry operations. It can be trivially -+ abstracted into useful plugin: tunable balancing policy is a good -+ thing. */ -+carry_op_handler op_dispatch_table[COP_LAST_OP] = { -+ [COP_INSERT] = { -+ .handler = carry_insert, -+ .estimate = carry_estimate_insert} -+ , -+ [COP_DELETE] = { -+ .handler = carry_delete, -+ .estimate = carry_estimate_delete} -+ , -+ [COP_CUT] = { -+ .handler = carry_cut, -+ .estimate = carry_estimate_cut} -+ , -+ [COP_PASTE] = { -+ .handler = carry_paste, -+ .estimate = carry_estimate_paste} -+ , -+ [COP_EXTENT] = { -+ .handler = carry_extent, -+ .estimate = carry_estimate_extent} -+ , -+ [COP_UPDATE] = { -+ .handler = carry_update, -+ .estimate = carry_estimate_update} -+ , -+ [COP_INSERT_FLOW] = { -+ .handler = carry_insert_flow, -+ .estimate = carry_estimate_insert_flow} -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/carry_ops.h linux-2.6.30/fs/reiser4/carry_ops.h ---- linux-2.6.30.orig/fs/reiser4/carry_ops.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/carry_ops.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,43 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* implementation of carry operations. See carry_ops.c for details. */ -+ -+#if !defined(__CARRY_OPS_H__) -+#define __CARRY_OPS_H__ -+ -+#include "forward.h" -+#include "znode.h" -+#include "carry.h" -+ -+/* carry operation handlers */ -+typedef struct carry_op_handler { -+ /* perform operation */ -+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo); -+ /* estimate memory requirements for @op */ -+ int (*estimate) (carry_op * op, carry_level * level); -+} carry_op_handler; -+ -+/* This is dispatch table for carry operations. It can be trivially -+ abstracted into useful plugin: tunable balancing policy is a good -+ thing. */ -+extern carry_op_handler op_dispatch_table[COP_LAST_OP]; -+ -+unsigned int space_needed(const znode * node, const coord_t *coord, -+ const reiser4_item_data * data, int inserting); -+extern carry_node *find_left_carry(carry_node * node, carry_level * level); -+extern carry_node *find_right_carry(carry_node * node, carry_level * level); -+ -+/* __CARRY_OPS_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/context.c linux-2.6.30/fs/reiser4/context.c ---- linux-2.6.30.orig/fs/reiser4/context.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/context.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,289 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Manipulation of reiser4_context */ -+ -+/* -+ * global context used during system call. Variable of this type is allocated -+ * on the stack at the beginning of the reiser4 part of the system call and -+ * pointer to it is stored in the current->fs_context. This allows us to avoid -+ * passing pointer to current transaction and current lockstack (both in -+ * one-to-one mapping with threads) all over the call chain. 
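The scheme described above, a per-thread slot (current->journal_info) shared with other filesystems and disambiguated by a magic word, can be mimicked in userspace with thread-local storage. A sketch with toy names; only the magic-check idea is taken from the code here:

#include <stdio.h>

#define TOY_MAGIC 0x4b1b5d0bu   /* plays the role of context_magic */

struct toy_ctx { unsigned magic; int depth; };

/* stand-in for current->journal_info */
static _Thread_local struct toy_ctx *current_ctx;

/* analogue of is_in_reiser4_context(): the slot may hold a foreign
   pointer, so the magic word is checked before trusting it */
static int in_toy_context(void)
{
        return current_ctx != NULL && current_ctx->magic == TOY_MAGIC;
}

int main(void)
{
        struct toy_ctx ctx = { TOY_MAGIC, 0 };

        printf("before: %d\n", in_toy_context());  /* 0 */
        current_ctx = &ctx;
        printf("after:  %d\n", in_toy_context());  /* 1 */
        return 0;
}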
-+ * -+ * It's kind of like those global variables the prof used to tell you not to -+ * use in CS1, except thread specific.;-) Nikita, this was a good idea. -+ * -+ * In some situations it is desirable to have ability to enter reiser4_context -+ * more than once for the same thread (nested contexts). For example, there -+ * are some functions that can be called either directly from VFS/VM or from -+ * already active reiser4 context (->writepage, for example). -+ * -+ * In such situations "child" context acts like dummy: all activity is -+ * actually performed in the top level context, and get_current_context() -+ * always returns top level context. -+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly -+ * nested any way. -+ * -+ * Note that there is an important difference between reiser4 uses -+ * ->fs_context and the way other file systems use it. Other file systems -+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_ -+ * (this is why ->fs_context was initially called ->journal_info). This means, -+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry -+ * to the file system, they assume that some transaction is already underway, -+ * and usually bail out, because starting nested transaction would most likely -+ * lead to the deadlock. This gives false positives with reiser4, because we -+ * set ->fs_context before starting transaction. -+ */ -+ -+#include "debug.h" -+#include "super.h" -+#include "context.h" -+ -+#include <linux/writeback.h> /* balance_dirty_pages() */ -+#include <linux/hardirq.h> -+ -+static void _reiser4_init_context(reiser4_context * context, -+ struct super_block *super) -+{ -+ memset(context, 0, sizeof(*context)); -+ -+ context->super = super; -+ context->magic = context_magic; -+ context->outer = current->journal_info; -+ current->journal_info = (void *)context; -+ context->nr_children = 0; -+ context->gfp_mask = GFP_KERNEL; -+ -+ init_lock_stack(&context->stack); -+ -+ reiser4_txn_begin(context); -+ -+ /* initialize head of tap list */ -+ INIT_LIST_HEAD(&context->taps); -+#if REISER4_DEBUG -+ context->task = current; -+#endif -+ grab_space_enable(); -+} -+ -+/* initialize context and bind it to the current thread -+ -+ This function should be called at the beginning of reiser4 part of -+ syscall. 
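Note how re-entry is handled above by reference counting rather than by stacking contexts: a nested init on the same thread just bumps nr_children and returns the existing context, and real teardown happens only when the count drops back to zero. A toy single-threaded model of that life cycle (names hypothetical, error handling elided):

#include <stdio.h>
#include <stdlib.h>

struct toy_ctx { int nr_children; };

static _Thread_local struct toy_ctx *cur;

/* enter: reuse the live context when nested, allocate otherwise */
static struct toy_ctx *toy_enter(void)
{
        if (cur) {                 /* nested call: act as a dummy child */
                cur->nr_children++;
                return cur;
        }
        cur = calloc(1, sizeof(*cur));
        return cur;
}

/* leave: tear down only when the outermost caller exits */
static void toy_leave(struct toy_ctx *ctx)
{
        if (ctx->nr_children == 0) {
                free(ctx);
                cur = NULL;
        } else {
                ctx->nr_children--;
        }
}

int main(void)
{
        struct toy_ctx *a = toy_enter();
        struct toy_ctx *b = toy_enter();    /* same context, refcounted */

        printf("same object: %d\n", a == b);  /* 1 */
        toy_leave(b);
        toy_leave(a);
        return 0;
}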
-+*/ -+reiser4_context * reiser4_init_context(struct super_block *super) -+{ -+ reiser4_context *context; -+ -+ assert("nikita-2662", !in_interrupt() && !in_irq()); -+ assert("nikita-3357", super != NULL); -+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); -+ -+ context = get_current_context_check(); -+ if (context && context->super == super) { -+ context = (reiser4_context *) current->journal_info; -+ context->nr_children++; -+ return context; -+ } -+ -+ context = kmalloc(sizeof(*context), GFP_KERNEL); -+ if (context == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ _reiser4_init_context(context, super); -+ return context; -+} -+ -+/* this is used in scan_mgr which is called with spinlock held and in -+ reiser4_fill_super magic */ -+void init_stack_context(reiser4_context *context, struct super_block *super) -+{ -+ assert("nikita-2662", !in_interrupt() && !in_irq()); -+ assert("nikita-3357", super != NULL); -+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); -+ assert("vs-12", !is_in_reiser4_context()); -+ -+ _reiser4_init_context(context, super); -+ context->on_stack = 1; -+ return; -+} -+ -+/* cast lock stack embedded into reiser4 context up to its container */ -+reiser4_context *get_context_by_lock_stack(lock_stack * owner) -+{ -+ return container_of(owner, reiser4_context, stack); -+} -+ -+/* true if there is already _any_ reiser4 context for the current thread */ -+int is_in_reiser4_context(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = current->journal_info; -+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic; -+} -+ -+/* -+ * call balance dirty pages for the current context. -+ * -+ * File system is expected to call balance_dirty_pages_ratelimited() whenever -+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during -+ * write---this covers vast majority of all dirty traffic), but we cannot do -+ * this immediately when formatted node is dirtied, because long term lock is -+ * usually held at that time. To work around this, dirtying of formatted node -+ * simply increases ->nr_marked_dirty counter in the current reiser4 -+ * context. When we are about to leave this context, -+ * balance_dirty_pages_ratelimited() is called, if necessary. -+ * -+ * This introduces another problem: sometimes we do not want to run -+ * balance_dirty_pages_ratelimited() when leaving a context, for example -+ * because some important lock (like ->i_mutex on the parent directory) is -+ * held. To achieve this, ->nobalance flag can be set in the current context. -+ */ -+static void balance_dirty_pages_at(reiser4_context *context) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(context->super); -+ -+ /* -+ * call balance_dirty_pages_ratelimited() to process formatted nodes -+ * dirtied during this system call. Do that only if we are not in mount -+ * and there were nodes dirtied in this context and we are not in -+ * writepage (to avoid deadlock) and not in pdflush -+ */ -+ if (sbinfo != NULL && sbinfo->fake != NULL && -+ context->nr_marked_dirty != 0 && -+ !(current->flags & PF_MEMALLOC) && -+ !current_is_pdflush()) -+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping); -+} -+ -+/* release resources associated with context. -+ -+ This function should be called at the end of "session" with reiser4, -+ typically just before leaving reiser4 driver back to VFS. -+ -+ This is good place to put some degugging consistency checks, like that -+ thread released all locks and closed transcrash etc. 
-+ -+*/ -+static void reiser4_done_context(reiser4_context * context) -+ /* context being released */ -+{ -+ assert("nikita-860", context != NULL); -+ assert("nikita-859", context->magic == context_magic); -+ assert("vs-646", (reiser4_context *) current->journal_info == context); -+ assert("zam-686", !in_interrupt() && !in_irq()); -+ -+ /* only do anything when leaving top-level reiser4 context. All nested -+ * contexts are just dummies. */ -+ if (context->nr_children == 0) { -+ assert("jmacd-673", context->trans == NULL); -+ assert("jmacd-1002", lock_stack_isclean(&context->stack)); -+ assert("nikita-1936", reiser4_no_counters_are_held()); -+ assert("nikita-2626", list_empty_careful(reiser4_taps_list())); -+ assert("zam-1004", ergo(get_super_private(context->super), -+ get_super_private(context->super)->delete_mutex_owner != -+ current)); -+ -+ /* release all grabbed but as yet unused blocks */ -+ if (context->grabbed_blocks != 0) -+ all_grabbed2free(); -+ -+ /* -+ * synchronize against longterm_unlock_znode(): -+ * wake_up_requestor() wakes up requestors without holding -+ * zlock (otherwise they will immediately bump into that lock -+ * after wake up on another CPU). To work around (rare) -+ * situation where requestor has been woken up asynchronously -+ * and managed to run until completion (and destroy its -+ * context and lock stack) before wake_up_requestor() called -+ * wake_up() on it, wake_up_requestor() synchronize on lock -+ * stack spin lock. It has actually been observed that spin -+ * lock _was_ locked at this point, because -+ * wake_up_requestor() took interrupt. -+ */ -+ spin_lock_stack(&context->stack); -+ spin_unlock_stack(&context->stack); -+ -+ assert("zam-684", context->nr_children == 0); -+ /* restore original ->fs_context value */ -+ current->journal_info = context->outer; -+ if (context->on_stack == 0) -+ kfree(context); -+ } else { -+ context->nr_children--; -+#if REISER4_DEBUG -+ assert("zam-685", context->nr_children >= 0); -+#endif -+ } -+} -+ -+/* -+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close -+ * transaction. Call done_context() to do context related book-keeping. -+ */ -+void reiser4_exit_context(reiser4_context * context) -+{ -+ assert("nikita-3021", reiser4_schedulable()); -+ -+ if (context->nr_children == 0) { -+ if (!context->nobalance) { -+ reiser4_txn_restart(context); -+ balance_dirty_pages_at(context); -+ } -+ -+ /* if filesystem is mounted with -o sync or -o dirsync - commit -+ transaction. FIXME: TXNH_DONT_COMMIT is used to avoid -+ commiting on exit_context when inode semaphore is held and -+ to have ktxnmgrd to do commit instead to get better -+ concurrent filesystem accesses. But, when one mounts with -o -+ sync, he cares more about reliability than about -+ performance. So, for now we have this simple mount -o sync -+ support. 
*/ -+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) { -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked_nocheck(); -+ if (atom) { -+ atom->flags |= ATOM_FORCE_COMMIT; -+ context->trans->flags &= ~TXNH_DONT_COMMIT; -+ spin_unlock_atom(atom); -+ } -+ } -+ reiser4_txn_end(context); -+ } -+ reiser4_done_context(context); -+} -+ -+void reiser4_ctx_gfp_mask_set(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context(); -+ if (ctx->entd == 0 && -+ list_empty(&ctx->stack.locks) && -+ ctx->trans->atom == NULL) -+ ctx->gfp_mask = GFP_KERNEL; -+ else -+ ctx->gfp_mask = GFP_NOFS; -+} -+ -+void reiser4_ctx_gfp_mask_force(gfp_t mask) -+{ -+ reiser4_context *ctx; -+ ctx = get_current_context(); -+ -+ assert("edward-1454", ctx != NULL); -+ -+ ctx->gfp_mask = mask; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/context.h linux-2.6.30/fs/reiser4/context.h ---- linux-2.6.30.orig/fs/reiser4/context.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/context.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,228 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Reiser4 context. See context.c for details. */ -+ -+#if !defined( __REISER4_CONTEXT_H__ ) -+#define __REISER4_CONTEXT_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "tap.h" -+#include "lock.h" -+ -+#include <linux/types.h> /* for __u?? */ -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/spinlock.h> -+#include <linux/sched.h> /* for struct task_struct */ -+ -+/* reiser4 per-thread context */ -+struct reiser4_context { -+ /* magic constant. For identification of reiser4 contexts. */ -+ __u32 magic; -+ -+ /* current lock stack. See lock.[ch]. This is where list of all -+ locks taken by current thread is kept. This is also used in -+ deadlock detection. */ -+ lock_stack stack; -+ -+ /* current transcrash. */ -+ txn_handle *trans; -+ /* transaction handle embedded into reiser4_context. ->trans points -+ * here by default. */ -+ txn_handle trans_in_ctx; -+ -+ /* super block we are working with. To get the current tree -+ use &get_super_private (reiser4_get_current_sb ())->tree. */ -+ struct super_block *super; -+ -+ /* parent fs activation */ -+ struct fs_activation *outer; -+ -+ /* per-thread grabbed (for further allocation) blocks counter */ -+ reiser4_block_nr grabbed_blocks; -+ -+ /* list of taps currently monitored. See tap.c */ -+ struct list_head taps; -+ -+ /* grabbing space is enabled */ -+ unsigned int grab_enabled:1; -+ /* should be set when we are write dirty nodes to disk in jnode_flush or -+ * reiser4_write_logs() */ -+ unsigned int writeout_mode:1; -+ /* true, if current thread is an ent thread */ -+ unsigned int entd:1; -+ /* true, if balance_dirty_pages() should not be run when leaving this -+ * context. 
This is used to avoid lengthly balance_dirty_pages() -+ * operation when holding some important resource, like directory -+ * ->i_mutex */ -+ unsigned int nobalance:1; -+ -+ /* this bit is used on reiser4_done_context to decide whether context is -+ kmalloc-ed and has to be kfree-ed */ -+ unsigned int on_stack:1; -+ -+ /* count non-trivial jnode_set_dirty() calls */ -+ unsigned long nr_marked_dirty; -+ -+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes) -+ * reiser4_writepages for each of dirty inodes. Reiser4_writepages -+ * captures pages. When number of pages captured in one -+ * reiser4_sync_inodes reaches some threshold - some atoms get -+ * flushed */ -+ int nr_captured; -+ int nr_children; /* number of child contexts */ -+#if REISER4_DEBUG -+ /* debugging information about reiser4 locks held by the current -+ * thread */ -+ reiser4_lock_cnt_info locks; -+ struct task_struct *task; /* so we can easily find owner of the stack */ -+ -+ /* -+ * disk space grabbing debugging support -+ */ -+ /* how many disk blocks were grabbed by the first call to -+ * reiser4_grab_space() in this context */ -+ reiser4_block_nr grabbed_initially; -+ -+ /* list of all threads doing flush currently */ -+ struct list_head flushers_link; -+ /* information about last error encountered by reiser4 */ -+ err_site err; -+#endif -+ void *vp; -+ gfp_t gfp_mask; -+}; -+ -+extern reiser4_context *get_context_by_lock_stack(lock_stack *); -+ -+/* Debugging helps. */ -+#if REISER4_DEBUG -+extern void print_contexts(void); -+#endif -+ -+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree)) -+#define current_blocksize reiser4_get_current_sb()->s_blocksize -+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits -+ -+extern reiser4_context *reiser4_init_context(struct super_block *); -+extern void init_stack_context(reiser4_context *, struct super_block *); -+extern void reiser4_exit_context(reiser4_context *); -+ -+/* magic constant we store in reiser4_context allocated at the stack. Used to -+ catch accesses to staled or uninitialized contexts. */ -+#define context_magic ((__u32) 0x4b1b5d0b) -+ -+extern int is_in_reiser4_context(void); -+ -+/* -+ * return reiser4_context for the thread @tsk -+ */ -+static inline reiser4_context *get_context(const struct task_struct *tsk) -+{ -+ assert("vs-1682", -+ ((reiser4_context *) tsk->journal_info)->magic == context_magic); -+ return (reiser4_context *) tsk->journal_info; -+} -+ -+/* -+ * return reiser4 context of the current thread, or NULL if there is none. -+ */ -+static inline reiser4_context *get_current_context_check(void) -+{ -+ if (is_in_reiser4_context()) -+ return get_context(current); -+ else -+ return NULL; -+} -+ -+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */ -+ -+/* return context associated with current thread */ -+static inline reiser4_context *get_current_context(void) -+{ -+ return get_context(current); -+} -+ -+static inline gfp_t reiser4_ctx_gfp_mask_get(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context_check(); -+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask; -+} -+ -+void reiser4_ctx_gfp_mask_set(void); -+void reiser4_ctx_gfp_mask_force (gfp_t mask); -+ -+/* -+ * true if current thread is in the write-out mode. Thread enters write-out -+ * mode during jnode_flush and reiser4_write_logs(). 
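reiser4_ctx_gfp_mask_get() above falls back to GFP_KERNEL when no context exists; its companion reiser4_ctx_gfp_mask_set() (in context.c earlier in this patch) chooses between GFP_KERNEL and GFP_NOFS. Setting aside the entd special case, the decision reduces to a small predicate: reclaim may recurse into the filesystem only while no long-term locks are held and no atom is open. A toy rendering of that rule, with booleans standing in for the real checks:

#include <stdio.h>

enum toy_gfp { TOY_GFP_KERNEL, TOY_GFP_NOFS };

/* simplified sketch of the gfp-mask decision: recursing into the
   filesystem for memory reclaim is only safe when nothing is held */
static enum toy_gfp pick_gfp(int holds_locks, int in_transaction)
{
        return (!holds_locks && !in_transaction) ? TOY_GFP_KERNEL
                                                 : TOY_GFP_NOFS;
}

int main(void)
{
        printf("idle:   %s\n",
               pick_gfp(0, 0) == TOY_GFP_KERNEL ? "GFP_KERNEL" : "GFP_NOFS");
        printf("locked: %s\n",
               pick_gfp(1, 0) == TOY_GFP_KERNEL ? "GFP_KERNEL" : "GFP_NOFS");
        return 0;
}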
-+ */ -+static inline int is_writeout_mode(void) -+{ -+ return get_current_context()->writeout_mode; -+} -+ -+/* -+ * enter write-out mode -+ */ -+static inline void writeout_mode_enable(void) -+{ -+ assert("zam-941", !get_current_context()->writeout_mode); -+ get_current_context()->writeout_mode = 1; -+} -+ -+/* -+ * leave write-out mode -+ */ -+static inline void writeout_mode_disable(void) -+{ -+ assert("zam-942", get_current_context()->writeout_mode); -+ get_current_context()->writeout_mode = 0; -+} -+ -+static inline void grab_space_enable(void) -+{ -+ get_current_context()->grab_enabled = 1; -+} -+ -+static inline void grab_space_disable(void) -+{ -+ get_current_context()->grab_enabled = 0; -+} -+ -+static inline void grab_space_set_enabled(int enabled) -+{ -+ get_current_context()->grab_enabled = enabled; -+} -+ -+static inline int is_grab_enabled(reiser4_context * ctx) -+{ -+ return ctx->grab_enabled; -+} -+ -+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or -+ * flush would be performed when it is closed. This is necessary when handle -+ * has to be closed under some coarse semaphore, like i_mutex of -+ * directory. Commit will be performed by ktxnmgrd. */ -+static inline void context_set_commit_async(reiser4_context * context) -+{ -+ context->nobalance = 1; -+ context->trans->flags |= TXNH_DONT_COMMIT; -+} -+ -+/* __REISER4_CONTEXT_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/coord.c linux-2.6.30/fs/reiser4/coord.c ---- linux-2.6.30.orig/fs/reiser4/coord.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/coord.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,928 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "tree.h" -+#include "plugin/item/item.h" -+#include "znode.h" -+#include "coord.h" -+ -+/* Internal constructor. */ -+static inline void -+coord_init_values(coord_t *coord, const znode * node, pos_in_node_t item_pos, -+ pos_in_node_t unit_pos, between_enum between) -+{ -+ coord->node = (znode *) node; -+ coord_set_item_pos(coord, item_pos); -+ coord->unit_pos = unit_pos; -+ coord->between = between; -+ ON_DEBUG(coord->plug_v = 0); -+ ON_DEBUG(coord->body_v = 0); -+ -+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, -+ node, item_pos, unit_pos, coord_tween_tostring (between)); */ -+} -+ -+/* after shifting of node content, coord previously set properly may become -+ invalid, try to "normalize" it. 
*/ -+void coord_normalize(coord_t *coord) -+{ -+ znode *node; -+ -+ node = coord->node; -+ assert("vs-683", node); -+ -+ coord_clear_iplug(coord); -+ -+ if (node_is_empty(node)) { -+ coord_init_first_unit(coord, node); -+ } else if ((coord->between == AFTER_ITEM) -+ || (coord->between == AFTER_UNIT)) { -+ return; -+ } else if (coord->item_pos == coord_num_items(coord) -+ && coord->between == BEFORE_ITEM) { -+ coord_dec_item_pos(coord); -+ coord->between = AFTER_ITEM; -+ } else if (coord->unit_pos == coord_num_units(coord) -+ && coord->between == BEFORE_UNIT) { -+ coord->unit_pos--; -+ coord->between = AFTER_UNIT; -+ } else if (coord->item_pos == coord_num_items(coord) -+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) { -+ coord_dec_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ } -+} -+ -+/* Copy a coordinate. */ -+void coord_dup(coord_t *coord, const coord_t *old_coord) -+{ -+ assert("jmacd-9800", coord_check(old_coord)); -+ coord_dup_nocheck(coord, old_coord); -+} -+ -+/* Copy a coordinate without check. Useful when old_coord->node is not -+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */ -+void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord) -+{ -+ coord->node = old_coord->node; -+ coord_set_item_pos(coord, old_coord->item_pos); -+ coord->unit_pos = old_coord->unit_pos; -+ coord->between = old_coord->between; -+ coord->iplugid = old_coord->iplugid; -+ ON_DEBUG(coord->plug_v = old_coord->plug_v); -+ ON_DEBUG(coord->body_v = old_coord->body_v); -+} -+ -+/* Initialize an invalid coordinate. */ -+void coord_init_invalid(coord_t *coord, const znode * node) -+{ -+ coord_init_values(coord, node, 0, 0, INVALID_COORD); -+} -+ -+void coord_init_first_unit_nocheck(coord_t *coord, const znode * node) -+{ -+ coord_init_values(coord, node, 0, 0, AT_UNIT); -+} -+ -+/* Initialize a coordinate to point at the first unit of the first item. If the -+ node is empty, it is positioned at the EMPTY_NODE. */ -+void coord_init_first_unit(coord_t *coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT)); -+ -+ assert("jmacd-9801", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to point at the last unit of the last item. If the -+ node is empty, it is positioned at the EMPTY_NODE. */ -+void coord_init_last_unit(coord_t *coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, -+ (is_empty ? 0 : node_num_items(node) - 1), 0, -+ (is_empty ? EMPTY_NODE : AT_UNIT)); -+ if (!is_empty) -+ coord->unit_pos = coord_last_unit_pos(coord); -+ assert("jmacd-9802", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to before the first item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+void coord_init_before_first_item(coord_t *coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, 0, 0, -+ (is_empty ? EMPTY_NODE : BEFORE_UNIT)); -+ -+ assert("jmacd-9803", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to after the last item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+void coord_init_after_last_item(coord_t *coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, -+ (is_empty ? 0 : node_num_items(node) - 1), 0, -+ (is_empty ? 
EMPTY_NODE : AFTER_ITEM)); -+ -+ assert("jmacd-9804", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to after last unit in the item. Coord must be set -+ already to existing item */ -+void coord_init_after_item_end(coord_t *coord) -+{ -+ coord->between = AFTER_UNIT; -+ coord->unit_pos = coord_last_unit_pos(coord); -+} -+ -+/* Initialize a coordinate to before the item. Coord must be set already to -+ existing item */ -+void coord_init_before_item(coord_t *coord) -+{ -+ coord->unit_pos = 0; -+ coord->between = BEFORE_ITEM; -+} -+ -+/* Initialize a coordinate to after the item. Coord must be set already to -+ existing item */ -+void coord_init_after_item(coord_t *coord) -+{ -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+} -+ -+/* Initialize a coordinate by 0s. Used in places where init_coord was used and -+ it was not clear how actually */ -+void coord_init_zero(coord_t *coord) -+{ -+ memset(coord, 0, sizeof(*coord)); -+} -+ -+/* Return the number of units at the present item. -+ Asserts coord_is_existing_item(). */ -+unsigned coord_num_units(const coord_t *coord) -+{ -+ assert("jmacd-9806", coord_is_existing_item(coord)); -+ -+ return item_plugin_by_coord(coord)->b.nr_units(coord); -+} -+ -+/* Returns true if the coord was initializewd by coord_init_invalid (). */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_invalid(const coord_t *coord) -+{ -+ return coord->between == INVALID_COORD; -+} -+ -+/* Returns true if the coordinate is positioned at an existing item, not before -+ or after an item. It may be placed at, before, or after any unit within the -+ item, whether existing or not. */ -+int coord_is_existing_item(const coord_t *coord) -+{ -+ switch (coord->between) { -+ case EMPTY_NODE: -+ case BEFORE_ITEM: -+ case AFTER_ITEM: -+ case INVALID_COORD: -+ return 0; -+ -+ case BEFORE_UNIT: -+ case AT_UNIT: -+ case AFTER_UNIT: -+ return coord->item_pos < coord_num_items(coord); -+ } -+ -+ impossible("jmacd-9900", "unreachable coord: %p", coord); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned at an existing unit, not before -+ or after a unit. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_existing_unit(const coord_t *coord) -+{ -+ switch (coord->between) { -+ case EMPTY_NODE: -+ case BEFORE_UNIT: -+ case AFTER_UNIT: -+ case BEFORE_ITEM: -+ case AFTER_ITEM: -+ case INVALID_COORD: -+ return 0; -+ -+ case AT_UNIT: -+ return (coord->item_pos < coord_num_items(coord) -+ && coord->unit_pos < coord_num_units(coord)); -+ } -+ -+ impossible("jmacd-9902", "unreachable"); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned at the first unit of the first -+ item. Not true for empty nodes nor coordinates positioned before the first -+ item. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_leftmost_unit(const coord_t *coord) -+{ -+ return (coord->between == AT_UNIT && coord->item_pos == 0 -+ && coord->unit_pos == 0); -+} -+ -+#if REISER4_DEBUG -+/* For assertions only, checks for a valid coordinate. 
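The coord initializers above all share one convention: on an empty node every position collapses to EMPTY_NODE rather than pointing at a unit that does not exist. A condensed user-space model of that convention (the toy_* names are hypothetical, a sketch only):

#include <assert.h>

enum toy_between { TOY_AT_UNIT, TOY_EMPTY_NODE };

struct toy_pos {
        int item;
        enum toy_between between;
};

/* cf. coord_init_first_unit(): an empty node yields EMPTY_NODE */
static struct toy_pos toy_init_first(int nr_items)
{
        struct toy_pos p = { 0, nr_items == 0 ? TOY_EMPTY_NODE : TOY_AT_UNIT };
        return p;
}

/* cf. coord_init_last_unit(): clamp to the last existing item */
static struct toy_pos toy_init_last(int nr_items)
{
        struct toy_pos p = {
                nr_items == 0 ? 0 : nr_items - 1,
                nr_items == 0 ? TOY_EMPTY_NODE : TOY_AT_UNIT
        };
        return p;
}

int main(void)
{
        assert(toy_init_first(0).between == TOY_EMPTY_NODE);
        assert(toy_init_last(0).between == TOY_EMPTY_NODE);
        assert(toy_init_first(3).item == 0);
        assert(toy_init_last(3).item == 2);
        return 0;
}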
*/ -+int coord_check(const coord_t *coord) -+{ -+ if (coord->node == NULL) -+ return 0; -+ if (znode_above_root(coord->node)) -+ return 1; -+ -+ switch (coord->between) { -+ default: -+ case INVALID_COORD: -+ return 0; -+ case EMPTY_NODE: -+ if (!node_is_empty(coord->node)) -+ return 0; -+ return coord->item_pos == 0 && coord->unit_pos == 0; -+ -+ case BEFORE_UNIT: -+ case AFTER_UNIT: -+ if (node_is_empty(coord->node) && (coord->item_pos == 0) -+ && (coord->unit_pos == 0)) -+ return 1; -+ case AT_UNIT: -+ break; -+ case AFTER_ITEM: -+ case BEFORE_ITEM: -+ /* before/after item should not set unit_pos. */ -+ if (coord->unit_pos != 0) -+ return 0; -+ break; -+ } -+ -+ if (coord->item_pos >= node_num_items(coord->node)) -+ return 0; -+ -+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when -+ between is set either AFTER_ITEM or BEFORE_ITEM */ -+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM) -+ return 1; -+ -+ if (coord_is_iplug_set(coord) && -+ coord->unit_pos > -+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) -+ return 0; -+ return 1; -+} -+#endif -+ -+/* Adjust coordinate boundaries based on the number of items prior to -+ coord_next/prev. Returns 1 if the new position does not exist. */ -+static int coord_adjust_items(coord_t *coord, unsigned items, int is_next) -+{ -+ /* If the node is invalid, leave it. */ -+ if (coord->between == INVALID_COORD) -+ return 1; -+ -+ /* If the node is empty, set it appropriately. */ -+ if (items == 0) { -+ coord->between = EMPTY_NODE; -+ coord_set_item_pos(coord, 0); -+ coord->unit_pos = 0; -+ return 1; -+ } -+ -+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */ -+ if (coord->between == EMPTY_NODE) { -+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM); -+ coord_set_item_pos(coord, 0); -+ coord->unit_pos = 0; -+ return 0; -+ } -+ -+ /* If the item_pos is out-of-range, set it appropriately. */ -+ if (coord->item_pos >= items) { -+ coord->between = AFTER_ITEM; -+ coord_set_item_pos(coord, items - 1); -+ coord->unit_pos = 0; -+ /* If is_next, return 1 (can't go any further). */ -+ return is_next; -+ } -+ -+ return 0; -+} -+ -+/* Advances the coordinate by one unit to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new -+ position is an existing unit. */ -+int coord_next_unit(coord_t *coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 1) == 1) -+ return 1; -+ -+ switch (coord->between) { -+ case BEFORE_UNIT: -+ /* Now it is positioned at the same unit. */ -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_UNIT: -+ case AT_UNIT: -+ /* If it was at or after a unit and there are more units in this -+ item, advance to the next one. */ -+ if (coord->unit_pos < coord_last_unit_pos(coord)) { -+ coord->unit_pos += 1; -+ coord->between = AT_UNIT; -+ return 0; -+ } -+ -+ /* Otherwise, it is crossing an item boundary and treated as if -+ it was after the current item. */ -+ coord->between = AFTER_ITEM; -+ coord->unit_pos = 0; -+ /* FALLTHROUGH */ -+ -+ case AFTER_ITEM: -+ /* Check for end-of-node. */ -+ if (coord->item_pos == items - 1) -+ return 1; -+ -+ coord_inc_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case BEFORE_ITEM: -+ /* The adjust_items checks ensure that we are valid here. */ -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case INVALID_COORD: -+ case EMPTY_NODE: -+ /* Handled in coord_adjust_items(). 
*/ -+ break; -+ } -+ -+ impossible("jmacd-9902", "unreachable"); -+ return 0; -+} -+ -+/* Advances the coordinate by one item to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new -+ position is an existing item. */ -+int coord_next_item(coord_t *coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 1) == 1) -+ return 1; -+ -+ switch (coord->between) { -+ case AFTER_UNIT: -+ case AT_UNIT: -+ case BEFORE_UNIT: -+ case AFTER_ITEM: -+ /* Check for end-of-node. */ -+ if (coord->item_pos == items - 1) { -+ coord->between = AFTER_ITEM; -+ coord->unit_pos = 0; -+ coord_clear_iplug(coord); -+ return 1; -+ } -+ -+ /* Anywhere in an item, go to the next one. */ -+ coord->between = AT_UNIT; -+ coord_inc_item_pos(coord); -+ coord->unit_pos = 0; -+ return 0; -+ -+ case BEFORE_ITEM: -+ /* The out-of-range check ensures that we are valid here. */ -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ return 0; -+ case INVALID_COORD: -+ case EMPTY_NODE: -+ /* Handled in coord_adjust_items(). */ -+ break; -+ } -+ -+ impossible("jmacd-9903", "unreachable"); -+ return 0; -+} -+ -+/* Advances the coordinate by one unit to the left. If empty, no change. If -+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new -+ position is an existing unit. */ -+int coord_prev_unit(coord_t *coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 0) == 1) -+ return 1; -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ case BEFORE_UNIT: -+ if (coord->unit_pos > 0) { -+ coord->unit_pos -= 1; -+ coord->between = AT_UNIT; -+ return 0; -+ } -+ -+ if (coord->item_pos == 0) { -+ coord->between = BEFORE_ITEM; -+ return 1; -+ } -+ -+ coord_dec_item_pos(coord); -+ coord->unit_pos = coord_last_unit_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_UNIT: -+ /* What if unit_pos is out-of-range? */ -+ assert("jmacd-5442", -+ coord->unit_pos <= coord_last_unit_pos(coord)); -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case BEFORE_ITEM: -+ if (coord->item_pos == 0) -+ return 1; -+ -+ coord_dec_item_pos(coord); -+ /* FALLTHROUGH */ -+ -+ case AFTER_ITEM: -+ coord->between = AT_UNIT; -+ coord->unit_pos = coord_last_unit_pos(coord); -+ return 0; -+ -+ case INVALID_COORD: -+ case EMPTY_NODE: -+ break; -+ } -+ -+ impossible("jmacd-9904", "unreachable"); -+ return 0; -+} -+ -+/* Advances the coordinate by one item to the left. If empty, no change. If -+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new -+ position is an existing item. */ -+int coord_prev_item(coord_t *coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 0) == 1) -+ return 1; -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ case AFTER_UNIT: -+ case BEFORE_UNIT: -+ case BEFORE_ITEM: -+ -+ if (coord->item_pos == 0) { -+ coord->between = BEFORE_ITEM; -+ coord->unit_pos = 0; -+ return 1; -+ } -+ -+ coord_dec_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_ITEM: -+ coord->between = AT_UNIT; -+ coord->unit_pos = 0; -+ return 0; -+ -+ case INVALID_COORD: -+ case EMPTY_NODE: -+ break; -+ } -+ -+ impossible("jmacd-9905", "unreachable"); -+ return 0; -+} -+ -+/* Calls either coord_init_first_unit or coord_init_last_unit depending on -+ sideof argument. 
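All of the unit/item walkers above share one contract: a return value of 0 means the new position is an existing unit or item, non-zero means the walk fell off the node -- which is exactly what makes the for_all_units()/for_all_items() macros defined in coord.h work. A toy user-space model of that contract (the toy_* names are hypothetical, a sketch only):

#include <stdio.h>

enum toy_between { TOY_BEFORE_FIRST, TOY_AT, TOY_AFTER_LAST };

struct toy_node  { int nr_items; int nr_units[8]; };
struct toy_coord { int item, unit; enum toy_between between; };

/* advance one unit to the right; returns 0 iff the new position exists */
static int toy_next_unit(const struct toy_node *n, struct toy_coord *c)
{
        if (n->nr_items == 0 || c->between == TOY_AFTER_LAST)
                return 1;
        if (c->between == TOY_BEFORE_FIRST) {
                c->item = c->unit = 0;
                c->between = TOY_AT;
                return 0;
        }
        if (c->unit + 1 < n->nr_units[c->item]) {       /* next unit */
                c->unit++;
                return 0;
        }
        if (c->item + 1 < n->nr_items) {                /* next item */
                c->item++;
                c->unit = 0;
                return 0;
        }
        c->between = TOY_AFTER_LAST;                    /* fell off */
        return 1;
}

int main(void)
{
        struct toy_node node = { 2, { 3, 1 } };
        struct toy_coord c = { 0, 0, TOY_BEFORE_FIRST };

        /* same shape as the for_all_units() idiom from coord.h */
        while (toy_next_unit(&node, &c) == 0)
                printf("item %d unit %d\n", c.item, c.unit);
        return 0;
}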
*/ -+void coord_init_sideof_unit(coord_t *coord, const znode * node, sideof dir) -+{ -+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ coord_init_first_unit(coord, node); -+ } else { -+ coord_init_last_unit(coord, node); -+ } -+} -+ -+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending -+ on sideof argument. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_after_sideof_unit(coord_t *coord, sideof dir) -+{ -+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ return coord_is_before_leftmost(coord); -+ } else { -+ return coord_is_after_rightmost(coord); -+ } -+} -+ -+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. -+ */ -+/* Audited by: green(2002.06.15) */ -+int coord_sideof_unit(coord_t *coord, sideof dir) -+{ -+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ return coord_prev_unit(coord); -+ } else { -+ return coord_next_unit(coord); -+ } -+} -+ -+#if REISER4_DEBUG -+int coords_equal(const coord_t *c1, const coord_t *c2) -+{ -+ assert("nikita-2840", c1 != NULL); -+ assert("nikita-2841", c2 != NULL); -+ -+ return -+ c1->node == c2->node && -+ c1->item_pos == c2->item_pos && -+ c1->unit_pos == c2->unit_pos && c1->between == c2->between; -+} -+#endif /* REISER4_DEBUG */ -+ -+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if -+ coord_is_after_leftmost return NCOORD_ON_THE_LEFT, otherwise return -+ NCOORD_INSIDE. */ -+/* Audited by: green(2002.06.15) */ -+coord_wrt_node coord_wrt(const coord_t *coord) -+{ -+ if (coord_is_before_leftmost(coord)) -+ return COORD_ON_THE_LEFT; -+ -+ if (coord_is_after_rightmost(coord)) -+ return COORD_ON_THE_RIGHT; -+ -+ return COORD_INSIDE; -+} -+ -+/* Returns true if the coordinate is positioned after the last item or after the -+ last unit of the last item or it is an empty node. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_after_rightmost(const coord_t *coord) -+{ -+ assert("jmacd-7313", coord_check(coord)); -+ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ case BEFORE_UNIT: -+ case BEFORE_ITEM: -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case AFTER_ITEM: -+ return (coord->item_pos == node_num_items(coord->node) - 1); -+ -+ case AFTER_UNIT: -+ return ((coord->item_pos == node_num_items(coord->node) - 1) && -+ coord->unit_pos == coord_last_unit_pos(coord)); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned before the first item or it is -+ an empty node. */ -+int coord_is_before_leftmost(const coord_t *coord) -+{ -+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not -+ necessary to check if coord is set before leftmost -+ assert ("jmacd-7313", coord_check (coord)); */ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ case AFTER_ITEM: -+ case AFTER_UNIT: -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case BEFORE_ITEM: -+ case BEFORE_UNIT: -+ return (coord->item_pos == 0) && (coord->unit_pos == 0); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned after a item, before a item, -+ after the last unit of an item, before the first unit of an item, or at an -+ empty node. 
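coord_are_neighbors() and coord_compare() further down reduce to a lexicographic order on the pair (item_pos, unit_pos). Stated as a standalone comparison (the demo_* names are hypothetical; -1/0/+1 stand in for COORD_CMP_ON_LEFT/_SAME/_ON_RIGHT):

#include <assert.h>

struct demo_pos { int item, unit; };

/* compare two positions within the same node, items first */
static int demo_compare(struct demo_pos a, struct demo_pos b)
{
        if (a.item != b.item)
                return a.item < b.item ? -1 : 1;
        if (a.unit != b.unit)
                return a.unit < b.unit ? -1 : 1;
        return 0;
}

int main(void)
{
        assert(demo_compare((struct demo_pos){1, 9}, (struct demo_pos){2, 0}) < 0);
        assert(demo_compare((struct demo_pos){2, 5}, (struct demo_pos){2, 3}) > 0);
        assert(demo_compare((struct demo_pos){2, 3}, (struct demo_pos){2, 3}) == 0);
        return 0;
}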
*/ -+/* Audited by: green(2002.06.15) */ -+int coord_is_between_items(const coord_t *coord) -+{ -+ assert("jmacd-7313", coord_check(coord)); -+ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ return 0; -+ -+ case AFTER_ITEM: -+ case BEFORE_ITEM: -+ case EMPTY_NODE: -+ return 1; -+ -+ case BEFORE_UNIT: -+ return coord->unit_pos == 0; -+ -+ case AFTER_UNIT: -+ return coord->unit_pos == coord_last_unit_pos(coord); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+#if REISER4_DEBUG -+/* Returns true if the coordinates are positioned at adjacent units, regardless -+ of before-after or item boundaries. */ -+int coord_are_neighbors(coord_t *c1, coord_t *c2) -+{ -+ coord_t *left; -+ coord_t *right; -+ -+ assert("nikita-1241", c1 != NULL); -+ assert("nikita-1242", c2 != NULL); -+ assert("nikita-1243", c1->node == c2->node); -+ assert("nikita-1244", coord_is_existing_unit(c1)); -+ assert("nikita-1245", coord_is_existing_unit(c2)); -+ -+ left = right = NULL; -+ switch (coord_compare(c1, c2)) { -+ case COORD_CMP_ON_LEFT: -+ left = c1; -+ right = c2; -+ break; -+ case COORD_CMP_ON_RIGHT: -+ left = c2; -+ right = c1; -+ break; -+ case COORD_CMP_SAME: -+ return 0; -+ default: -+ wrong_return_value("nikita-1246", "compare_coords()"); -+ } -+ assert("vs-731", left && right); -+ if (left->item_pos == right->item_pos) { -+ return left->unit_pos + 1 == right->unit_pos; -+ } else if (left->item_pos + 1 == right->item_pos) { -+ return (left->unit_pos == coord_last_unit_pos(left)) -+ && (right->unit_pos == 0); -+ } else { -+ return 0; -+ } -+} -+#endif /* REISER4_DEBUG */ -+ -+/* Assuming two coordinates are positioned in the same node, return -+ COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's -+ position relative to c2. */ -+/* Audited by: green(2002.06.15) */ -+coord_cmp coord_compare(coord_t *c1, coord_t *c2) -+{ -+ assert("vs-209", c1->node == c2->node); -+ assert("vs-194", coord_is_existing_unit(c1) -+ && coord_is_existing_unit(c2)); -+ -+ if (c1->item_pos > c2->item_pos) -+ return COORD_CMP_ON_RIGHT; -+ if (c1->item_pos < c2->item_pos) -+ return COORD_CMP_ON_LEFT; -+ if (c1->unit_pos > c2->unit_pos) -+ return COORD_CMP_ON_RIGHT; -+ if (c1->unit_pos < c2->unit_pos) -+ return COORD_CMP_ON_LEFT; -+ return COORD_CMP_SAME; -+} -+ -+/* If the coordinate is between items, shifts it to the right. Returns 0 on -+ success and non-zero if there is no position to the right. */ -+int coord_set_to_right(coord_t *coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 1) == 1) -+ return 1; -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ return 0; -+ -+ case BEFORE_ITEM: -+ case BEFORE_UNIT: -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_UNIT: -+ if (coord->unit_pos < coord_last_unit_pos(coord)) { -+ coord->unit_pos += 1; -+ coord->between = AT_UNIT; -+ return 0; -+ } else { -+ -+ coord->unit_pos = 0; -+ -+ if (coord->item_pos == items - 1) { -+ coord->between = AFTER_ITEM; -+ return 1; -+ } -+ -+ coord_inc_item_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ } -+ -+ case AFTER_ITEM: -+ if (coord->item_pos == items - 1) -+ return 1; -+ -+ coord_inc_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case INVALID_COORD: -+ break; -+ } -+ -+ impossible("jmacd-9920", "unreachable"); -+ return 0; -+} -+ -+/* If the coordinate is between items, shifts it to the left. 
Returns 0 on -+ success and non-zero if there is no position to the left. */ -+int coord_set_to_left(coord_t *coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 0) == 1) -+ return 1; -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ return 0; -+ -+ case AFTER_UNIT: -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_ITEM: -+ coord->between = AT_UNIT; -+ coord->unit_pos = coord_last_unit_pos(coord); -+ return 0; -+ -+ case BEFORE_UNIT: -+ if (coord->unit_pos > 0) { -+ coord->unit_pos -= 1; -+ coord->between = AT_UNIT; -+ return 0; -+ } else { -+ -+ if (coord->item_pos == 0) { -+ coord->between = BEFORE_ITEM; -+ return 1; -+ } -+ -+ coord->unit_pos = coord_last_unit_pos(coord); -+ coord_dec_item_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ } -+ -+ case BEFORE_ITEM: -+ if (coord->item_pos == 0) -+ return 1; -+ -+ coord_dec_item_pos(coord); -+ coord->unit_pos = coord_last_unit_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case INVALID_COORD: -+ break; -+ } -+ -+ impossible("jmacd-9920", "unreachable"); -+ return 0; -+} -+ -+static const char *coord_tween_tostring(between_enum n) -+{ -+ switch (n) { -+ case BEFORE_UNIT: -+ return "before unit"; -+ case BEFORE_ITEM: -+ return "before item"; -+ case AT_UNIT: -+ return "at unit"; -+ case AFTER_UNIT: -+ return "after unit"; -+ case AFTER_ITEM: -+ return "after item"; -+ case EMPTY_NODE: -+ return "empty node"; -+ case INVALID_COORD: -+ return "invalid"; -+ default: -+ { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", n); -+ return buf; -+ } -+ } -+} -+ -+void print_coord(const char *mes, const coord_t *coord, int node) -+{ -+ if (coord == NULL) { -+ printk("%s: null\n", mes); -+ return; -+ } -+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n", -+ mes, coord->item_pos, coord->unit_pos, -+ coord_tween_tostring(coord->between), coord->iplugid); -+} -+ -+int -+item_utmost_child_real_block(const coord_t *coord, sideof side, -+ reiser4_block_nr * blk) -+{ -+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, -+ side, -+ blk); -+} -+ -+int item_utmost_child(const coord_t *coord, sideof side, jnode ** child) -+{ -+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child); -+} -+ -+/* @count bytes of flow @f got written, update correspondingly f->length, -+ f->data and f->key */ -+void move_flow_forward(flow_t *f, unsigned count) -+{ -+ if (f->data) -+ f->data += count; -+ f->length -= count; -+ set_key_offset(&f->key, get_key_offset(&f->key) + count); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/coord.h linux-2.6.30/fs/reiser4/coord.h ---- linux-2.6.30.orig/fs/reiser4/coord.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/coord.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,399 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* Coords */ -+ -+#if !defined(__REISER4_COORD_H__) -+#define __REISER4_COORD_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+ -+/* insertions happen between coords in the tree, so we need some means -+ of specifying the sense of betweenness. */ -+typedef enum { -+ BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. 
*/ -+ AT_UNIT, -+ AFTER_UNIT, -+ BEFORE_ITEM, -+ AFTER_ITEM, -+ INVALID_COORD, -+ EMPTY_NODE, -+} between_enum; -+ -+/* location of coord w.r.t. its node */ -+typedef enum { -+ COORD_ON_THE_LEFT = -1, -+ COORD_ON_THE_RIGHT = +1, -+ COORD_INSIDE = 0 -+} coord_wrt_node; -+ -+typedef enum { -+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1 -+} coord_cmp; -+ -+struct coord { -+ /* node in a tree */ -+ /* 0 */ znode *node; -+ -+ /* position of item within node */ -+ /* 4 */ pos_in_node_t item_pos; -+ /* position of unit within item */ -+ /* 6 */ pos_in_node_t unit_pos; -+ /* optimization: plugin of item is stored in coord_t. Until this was -+ implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid -+ is invalidated (set to 0xff) on each modification of ->item_pos, -+ and all such modifications are funneled through coord_*_item_pos() -+ functions below. -+ */ -+ /* 8 */ char iplugid; -+ /* position of coord w.r.t. to neighboring items and/or units. -+ Values are taken from &between_enum above. -+ */ -+ /* 9 */ char between; -+ /* padding. It will be added by the compiler anyway to conform to the -+ * C language alignment requirements. We keep it here to be on the -+ * safe side and to have a clear picture of the memory layout of this -+ * structure. */ -+ /* 10 */ __u16 pad; -+ /* 12 */ int offset; -+#if REISER4_DEBUG -+ unsigned long plug_v; -+ unsigned long body_v; -+#endif -+}; -+ -+#define INVALID_PLUGID ((char)((1 << 8) - 1)) -+#define INVALID_OFFSET -1 -+ -+static inline void coord_clear_iplug(coord_t *coord) -+{ -+ assert("nikita-2835", coord != NULL); -+ coord->iplugid = INVALID_PLUGID; -+ coord->offset = INVALID_OFFSET; -+} -+ -+static inline int coord_is_iplug_set(const coord_t *coord) -+{ -+ assert("nikita-2836", coord != NULL); -+ return coord->iplugid != INVALID_PLUGID; -+} -+ -+static inline void coord_set_item_pos(coord_t *coord, pos_in_node_t pos) -+{ -+ assert("nikita-2478", coord != NULL); -+ coord->item_pos = pos; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_dec_item_pos(coord_t *coord) -+{ -+ assert("nikita-2480", coord != NULL); -+ --coord->item_pos; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_inc_item_pos(coord_t *coord) -+{ -+ assert("nikita-2481", coord != NULL); -+ ++coord->item_pos; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_add_item_pos(coord_t *coord, int delta) -+{ -+ assert("nikita-2482", coord != NULL); -+ coord->item_pos += delta; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_invalid_item_pos(coord_t *coord) -+{ -+ assert("nikita-2832", coord != NULL); -+ coord->item_pos = (unsigned short)~0; -+ coord_clear_iplug(coord); -+} -+ -+/* Reverse a direction. */ -+static inline sideof sideof_reverse(sideof side) -+{ -+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE; -+} -+ -+/* NOTE: There is a somewhat odd mixture of the following opposed terms: -+ -+ "first" and "last" -+ "next" and "prev" -+ "before" and "after" -+ "leftmost" and "rightmost" -+ -+ But I think the chosen names are decent the way they are. -+*/ -+ -+/* COORD INITIALIZERS */ -+ -+/* Initialize an invalid coordinate. */ -+extern void coord_init_invalid(coord_t *coord, const znode * node); -+ -+extern void coord_init_first_unit_nocheck(coord_t *coord, const znode * node); -+ -+/* Initialize a coordinate to point at the first unit of the first item. If the -+ node is empty, it is positioned at the EMPTY_NODE. 
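The ->iplugid caching introduced above in struct coord is a classic memoization pattern: a derived value is cached in the structure and invalidated by every mutator of the field it depends on, with all writes funneled through coord_set_item_pos() and friends. Stripped to its essence (a user-space sketch with hypothetical names):

#include <assert.h>

#define NO_CACHE (-1)

struct cached_pos {
        int pos;        /* the authoritative field */
        int cached;     /* derived from pos; NO_CACHE when stale */
};

static int expensive_lookup(int pos) { return pos * 2; /* stand-in */ }

/* all writes to ->pos go through here, as with coord_set_item_pos() */
static void set_pos(struct cached_pos *c, int pos)
{
        c->pos = pos;
        c->cached = NO_CACHE;          /* cf. coord_clear_iplug() */
}

static int get_derived(struct cached_pos *c)
{
        if (c->cached == NO_CACHE)
                c->cached = expensive_lookup(c->pos);
        return c->cached;
}

int main(void)
{
        struct cached_pos c;

        set_pos(&c, 3);
        assert(get_derived(&c) == 6);  /* computed once */
        assert(get_derived(&c) == 6);  /* served from the cache */
        set_pos(&c, 5);                /* mutation invalidates */
        assert(get_derived(&c) == 10);
        return 0;
}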
*/ -+extern void coord_init_first_unit(coord_t *coord, const znode * node); -+ -+/* Initialize a coordinate to point at the last unit of the last item. If the -+ node is empty, it is positioned at the EMPTY_NODE. */ -+extern void coord_init_last_unit(coord_t *coord, const znode * node); -+ -+/* Initialize a coordinate to before the first item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+extern void coord_init_before_first_item(coord_t *coord, const znode * node); -+ -+/* Initialize a coordinate to after the last item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+extern void coord_init_after_last_item(coord_t *coord, const znode * node); -+ -+/* Initialize a coordinate to after last unit in the item. Coord must be set -+ already to existing item */ -+void coord_init_after_item_end(coord_t *coord); -+ -+/* Initialize a coordinate to before the item. Coord must be set already to -+ existing item */ -+void coord_init_before_item(coord_t *); -+/* Initialize a coordinate to after the item. Coord must be set already to -+ existing item */ -+void coord_init_after_item(coord_t *); -+ -+/* Calls either coord_init_first_unit or coord_init_last_unit depending on -+ sideof argument. */ -+extern void coord_init_sideof_unit(coord_t *coord, const znode * node, -+ sideof dir); -+ -+/* Initialize a coordinate by 0s. Used in places where init_coord was used and -+ it was not clear how actually -+ FIXME-VS: added by vs (2002, june, 8) */ -+extern void coord_init_zero(coord_t *coord); -+ -+/* COORD METHODS */ -+ -+/* after shifting of node content, coord previously set properly may become -+ invalid, try to "normalize" it. */ -+void coord_normalize(coord_t *coord); -+ -+/* Copy a coordinate. */ -+extern void coord_dup(coord_t *coord, const coord_t *old_coord); -+ -+/* Copy a coordinate without check. */ -+void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord); -+ -+unsigned coord_num_units(const coord_t *coord); -+ -+/* Return the last valid unit number at the present item (i.e., -+ coord_num_units() - 1). */ -+static inline unsigned coord_last_unit_pos(const coord_t *coord) -+{ -+ return coord_num_units(coord) - 1; -+} -+ -+#if REISER4_DEBUG -+/* For assertions only, checks for a valid coordinate. */ -+extern int coord_check(const coord_t *coord); -+ -+extern unsigned long znode_times_locked(const znode * z); -+ -+static inline void coord_update_v(coord_t *coord) -+{ -+ coord->plug_v = coord->body_v = znode_times_locked(coord->node); -+} -+#endif -+ -+extern int coords_equal(const coord_t *c1, const coord_t *c2); -+ -+extern void print_coord(const char *mes, const coord_t *coord, int print_node); -+ -+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if -+ coord_is_after_leftmost return NCOORD_ON_THE_LEFT, otherwise return -+ NCOORD_INSIDE. */ -+extern coord_wrt_node coord_wrt(const coord_t *coord); -+ -+/* Returns true if the coordinates are positioned at adjacent units, regardless -+ of before-after or item boundaries. */ -+extern int coord_are_neighbors(coord_t *c1, coord_t *c2); -+ -+/* Assuming two coordinates are positioned in the same node, return -+ NCOORD_CMP_ON_RIGHT, NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's -+ position relative to c2. */ -+extern coord_cmp coord_compare(coord_t *c1, coord_t *c2); -+ -+/* COORD PREDICATES */ -+ -+/* Returns true if the coord was initializewd by coord_init_invalid (). 
*/ -+extern int coord_is_invalid(const coord_t *coord); -+ -+/* Returns true if the coordinate is positioned at an existing item, not before -+ or after an item. It may be placed at, before, or after any unit within the -+ item, whether existing or not. If this is true you can call methods of the -+ item plugin. */ -+extern int coord_is_existing_item(const coord_t *coord); -+ -+/* Returns true if the coordinate is positioned after a item, before a item, -+ after the last unit of an item, before the first unit of an item, or at an -+ empty node. */ -+extern int coord_is_between_items(const coord_t *coord); -+ -+/* Returns true if the coordinate is positioned at an existing unit, not before -+ or after a unit. */ -+extern int coord_is_existing_unit(const coord_t *coord); -+ -+/* Returns true if the coordinate is positioned at an empty node. */ -+extern int coord_is_empty(const coord_t *coord); -+ -+/* Returns true if the coordinate is positioned at the first unit of the first -+ item. Not true for empty nodes nor coordinates positioned before the first -+ item. */ -+extern int coord_is_leftmost_unit(const coord_t *coord); -+ -+/* Returns true if the coordinate is positioned after the last item or after the -+ last unit of the last item or it is an empty node. */ -+extern int coord_is_after_rightmost(const coord_t *coord); -+ -+/* Returns true if the coordinate is positioned before the first item or it is -+ an empty node. */ -+extern int coord_is_before_leftmost(const coord_t *coord); -+ -+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending -+ on sideof argument. */ -+extern int coord_is_after_sideof_unit(coord_t *coord, sideof dir); -+ -+/* COORD MODIFIERS */ -+ -+/* Advances the coordinate by one unit to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new -+ position is an existing unit. */ -+extern int coord_next_unit(coord_t *coord); -+ -+/* Advances the coordinate by one item to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new -+ position is an existing item. */ -+extern int coord_next_item(coord_t *coord); -+ -+/* Advances the coordinate by one unit to the left. If empty, no change. If -+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new -+ position is an existing unit. */ -+extern int coord_prev_unit(coord_t *coord); -+ -+/* Advances the coordinate by one item to the left. If empty, no change. If -+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new -+ position is an existing item. */ -+extern int coord_prev_item(coord_t *coord); -+ -+/* If the coordinate is between items, shifts it to the right. Returns 0 on -+ success and non-zero if there is no position to the right. */ -+extern int coord_set_to_right(coord_t *coord); -+ -+/* If the coordinate is between items, shifts it to the left. Returns 0 on -+ success and non-zero if there is no position to the left. */ -+extern int coord_set_to_left(coord_t *coord); -+ -+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 -+ on success and non-zero if the unit did not exist. */ -+extern int coord_set_after_unit(coord_t *coord); -+ -+/* Calls either coord_next_unit or coord_prev_unit depending on sideof -+ argument. 
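The sideof helpers declared here collapse left/right function pairs into a single direction-parameterized entry point. The dispatch pattern in isolation (demo_* names hypothetical):

#include <stdio.h>

typedef enum { DEMO_LEFT_SIDE, DEMO_RIGHT_SIDE } demo_sideof;

static int demo_step_left(int pos)  { return pos - 1; }
static int demo_step_right(int pos) { return pos + 1; }

/* cf. coord_sideof_unit(): one entry point, direction as data */
static int demo_step_sideof(int pos, demo_sideof dir)
{
        return dir == DEMO_LEFT_SIDE ? demo_step_left(pos)
                                     : demo_step_right(pos);
}

int main(void)
{
        printf("%d %d\n", demo_step_sideof(5, DEMO_LEFT_SIDE),
               demo_step_sideof(5, DEMO_RIGHT_SIDE));
        return 0;
}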
*/ -+extern int coord_sideof_unit(coord_t *coord, sideof dir); -+ -+/* iterate over all units in @node */ -+#define for_all_units(coord, node) \ -+ for (coord_init_before_first_item((coord), (node)) ; \ -+ coord_next_unit(coord) == 0 ;) -+ -+/* iterate over all items in @node */ -+#define for_all_items(coord, node) \ -+ for (coord_init_before_first_item((coord), (node)) ; \ -+ coord_next_item(coord) == 0 ;) -+ -+/* COORD/ITEM METHODS */ -+ -+extern int item_utmost_child_real_block(const coord_t *coord, sideof side, -+ reiser4_block_nr * blk); -+extern int item_utmost_child(const coord_t *coord, sideof side, -+ jnode ** child); -+ -+/* a flow is a sequence of bytes being written to or read from the tree. The -+ tree will slice the flow into items while storing it into nodes, but all of -+ that is hidden from anything outside the tree. */ -+ -+struct flow { -+ reiser4_key key; /* key of start of flow's sequence of bytes */ -+ loff_t length; /* length of flow's sequence of bytes */ -+ char *data; /* start of flow's sequence of bytes */ -+ int user; /* if 1 data is user space, 0 - kernel space */ -+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */ -+}; -+ -+void move_flow_forward(flow_t *f, unsigned count); -+ -+/* &reiser4_item_data - description of data to be inserted or pasted -+ -+ Q: articulate the reasons for the difference between this and flow. -+ -+ A: Becides flow we insert into tree other things: stat data, directory -+ entry, etc. To insert them into tree one has to provide this structure. If -+ one is going to insert flow - he can use insert_flow, where this structure -+ does not have to be created -+*/ -+struct reiser4_item_data { -+ /* actual data to be inserted. If NULL, ->create_item() will not -+ do xmemcpy itself, leaving this up to the caller. This can -+ save some amount of unnecessary memory copying, for example, -+ during insertion of stat data. -+ -+ */ -+ char *data; -+ /* 1 if 'char * data' contains pointer to user space and 0 if it is -+ kernel space */ -+ int user; -+ /* amount of data we are going to insert or paste */ -+ int length; -+ /* "Arg" is opaque data that is passed down to the -+ ->create_item() method of node layout, which in turn -+ hands it to the ->create_hook() of item being created. This -+ arg is currently used by: -+ -+ . ->create_hook() of internal item -+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()), -+ . ->paste() method of directory item. -+ . ->create_hook() of extent item -+ -+ For internal item, this is left "brother" of new node being -+ inserted and it is used to add new node into sibling list -+ after parent to it was just inserted into parent. -+ -+ While ->arg does look somewhat of unnecessary compication, -+ it actually saves a lot of headache in many places, because -+ all data necessary to insert or paste new data into tree are -+ collected in one place, and this eliminates a lot of extra -+ argument passing and storing everywhere. -+ -+ */ -+ void *arg; -+ /* plugin of item we are inserting */ -+ item_plugin *iplug; -+}; -+ -+/* __REISER4_COORD_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/debug.c linux-2.6.30/fs/reiser4/debug.c ---- linux-2.6.30.orig/fs/reiser4/debug.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/debug.c 2009-06-22 18:11:49.000000000 +0200 -@@ -0,0 +1,308 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Debugging facilities. */ -+ -+/* -+ * This file contains generic debugging functions used by reiser4. Roughly -+ * following: -+ * -+ * panicking: reiser4_do_panic(), reiser4_print_prefix(). -+ * -+ * locking: -+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(), -+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks() -+ * -+ * error code monitoring (see comment before RETERR macro): -+ * reiser4_return_err(), reiser4_report_err(). -+ * -+ * stack back-tracing: fill_backtrace() -+ * -+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(), -+ * reiser4_debugtrap(). -+ * -+ */ -+ -+#include "reiser4.h" -+#include "context.h" -+#include "super.h" -+#include "txnmgr.h" -+#include "znode.h" -+ -+#include <linux/sysfs.h> -+#include <linux/slab.h> -+#include <linux/types.h> -+#include <linux/fs.h> -+#include <linux/spinlock.h> -+#include <linux/kallsyms.h> -+#include <linux/vmalloc.h> -+#include <linux/ctype.h> -+#include <linux/sysctl.h> -+#include <linux/hardirq.h> -+ -+#if 0 -+#if REISER4_DEBUG -+static void reiser4_report_err(void); -+#else -+#define reiser4_report_err() noop -+#endif -+#endif /* 0 */ -+ -+/* -+ * global buffer where message given to reiser4_panic is formatted. -+ */ -+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE]; -+ -+/* -+ * lock protecting consistency of panic_buf under concurrent panics -+ */ -+static DEFINE_SPINLOCK(panic_guard); -+ -+/* Your best friend. Call it on each occasion. This is called by -+ fs/reiser4/debug.h:reiser4_panic(). */ -+void reiser4_do_panic(const char *format/* format string */ , ... /* rest */) -+{ -+ static int in_panic = 0; -+ va_list args; -+ -+ /* -+ * check for recursive panic. -+ */ -+ if (in_panic == 0) { -+ in_panic = 1; -+ -+ spin_lock(&panic_guard); -+ va_start(args, format); -+ vsnprintf(panic_buf, sizeof(panic_buf), format, args); -+ va_end(args); -+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf); -+ spin_unlock(&panic_guard); -+ -+ /* -+ * if kernel debugger is configured---drop in. Early dropping -+ * into kgdb is not always convenient, because panic message -+ * is not yet printed most of the times. But: -+ * -+ * (1) message can be extracted from printk_buf[] -+ * (declared static inside of printk()), and -+ * -+ * (2) sometimes serial/kgdb combo dies while printing -+ * long panic message, so it's more prudent to break into -+ * debugger earlier. 
-+ * -+ */ -+ DEBUGON(1); -+ } -+ /* to make gcc happy about noreturn attribute */ -+ panic("%s", panic_buf); -+} -+ -+#if 0 -+void -+reiser4_print_prefix(const char *level, int reperr, const char *mid, -+ const char *function, const char *file, int lineno) -+{ -+ const char *comm; -+ int pid; -+ -+ if (unlikely(in_interrupt() || in_irq())) { -+ comm = "interrupt"; -+ pid = 0; -+ } else { -+ comm = current->comm; -+ pid = current->pid; -+ } -+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n", -+ level, comm, pid, function, file, lineno, mid); -+ if (reperr) -+ reiser4_report_err(); -+} -+#endif /* 0 */ -+ -+/* Preemption point: this should be called periodically during long running -+ operations (carry, allocate, and squeeze are best examples) */ -+int reiser4_preempt_point(void) -+{ -+ assert("nikita-3008", reiser4_schedulable()); -+ cond_resched(); -+ return signal_pending(current); -+} -+ -+#if REISER4_DEBUG -+/* Debugging aid: return struct where information about locks taken by current -+ thread is accumulated. This can be used to formulate lock ordering -+ constraints and various assertions. -+ -+*/ -+reiser4_lock_cnt_info *reiser4_lock_counters(void) -+{ -+ reiser4_context *ctx = get_current_context(); -+ assert("jmacd-1123", ctx != NULL); -+ return &ctx->locks; -+} -+ -+/* -+ * print human readable information about locks held by the reiser4 context. -+ */ -+static void print_lock_counters(const char *prefix, -+ const reiser4_lock_cnt_info * info) -+{ -+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n" -+ "jload: %i, " -+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, " -+ "ktxnmgrd: %i, fq: %i\n" -+ "inode: %i, " -+ "cbk_cache: %i (r:%i,w%i), " -+ "eflush: %i, " -+ "zlock: %i,\n" -+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n" -+ "d: %i, x: %i, t: %i\n", prefix, -+ info->spin_locked_jnode, -+ info->rw_locked_tree, info->read_locked_tree, -+ info->write_locked_tree, -+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk, -+ info->spin_locked_jload, -+ info->spin_locked_txnh, -+ info->spin_locked_atom, info->spin_locked_stack, -+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd, -+ info->spin_locked_fq, -+ info->spin_locked_inode, -+ info->rw_locked_cbk_cache, -+ info->read_locked_cbk_cache, -+ info->write_locked_cbk_cache, -+ info->spin_locked_super_eflush, -+ info->spin_locked_zlock, -+ info->spin_locked, -+ info->long_term_locked_znode, -+ info->inode_sem_r, info->inode_sem_w, -+ info->d_refs, info->x_refs, info->t_refs); -+} -+ -+/* check that no spinlocks are held */ -+int reiser4_schedulable(void) -+{ -+ if (get_current_context_check() != NULL) { -+ if (!LOCK_CNT_NIL(spin_locked)) { -+ print_lock_counters("in atomic", reiser4_lock_counters()); -+ return 0; -+ } -+ } -+ might_sleep(); -+ return 1; -+} -+/* -+ * return true, iff no locks are held. 
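The counter checks above (reiser4_schedulable(), and reiser4_no_counters_are_held() just below) implement a simple discipline: every lock acquisition bumps a per-thread counter, and "may this thread sleep?" reduces to asking whether all spinlock counters are zero. A user-space sketch (demo_* names are hypothetical; GCC's __thread is assumed for per-thread storage):

#include <assert.h>

static __thread int demo_spin_locked;   /* cf. reiser4_lock_cnt_info */

static void demo_spin_lock(void)
{
        demo_spin_locked++;
}

static void demo_spin_unlock(void)
{
        assert(demo_spin_locked > 0);
        demo_spin_locked--;
}

/* cf. reiser4_schedulable(): sleeping is legal iff no spinlocks held */
static int demo_schedulable(void)
{
        return demo_spin_locked == 0;
}

int main(void)
{
        assert(demo_schedulable());
        demo_spin_lock();
        assert(!demo_schedulable());    /* sleeping here would be a bug */
        demo_spin_unlock();
        assert(demo_schedulable());
        return 0;
}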
-+ */ -+int reiser4_no_counters_are_held(void) -+{ -+ reiser4_lock_cnt_info *counters; -+ -+ counters = reiser4_lock_counters(); -+ return -+ (counters->spin_locked_zlock == 0) && -+ (counters->spin_locked_jnode == 0) && -+ (counters->rw_locked_tree == 0) && -+ (counters->read_locked_tree == 0) && -+ (counters->write_locked_tree == 0) && -+ (counters->rw_locked_dk == 0) && -+ (counters->read_locked_dk == 0) && -+ (counters->write_locked_dk == 0) && -+ (counters->spin_locked_txnh == 0) && -+ (counters->spin_locked_atom == 0) && -+ (counters->spin_locked_stack == 0) && -+ (counters->spin_locked_txnmgr == 0) && -+ (counters->spin_locked_inode == 0) && -+ (counters->spin_locked == 0) && -+ (counters->long_term_locked_znode == 0) && -+ (counters->inode_sem_r == 0) && -+ (counters->inode_sem_w == 0) && (counters->d_refs == 0); -+} -+ -+/* -+ * return true, iff transaction commit can be done under locks held by the -+ * current thread. -+ */ -+int reiser4_commit_check_locks(void) -+{ -+ reiser4_lock_cnt_info *counters; -+ int inode_sem_r; -+ int inode_sem_w; -+ int result; -+ -+ /* -+ * inode's read/write semaphore is the only reiser4 lock that can be -+ * held during commit. -+ */ -+ -+ counters = reiser4_lock_counters(); -+ inode_sem_r = counters->inode_sem_r; -+ inode_sem_w = counters->inode_sem_w; -+ -+ counters->inode_sem_r = counters->inode_sem_w = 0; -+ result = reiser4_no_counters_are_held(); -+ counters->inode_sem_r = inode_sem_r; -+ counters->inode_sem_w = inode_sem_w; -+ return result; -+} -+ -+/* -+ * fill "error site" in the current reiser4 context. See comment before RETERR -+ * macro for more details. -+ */ -+void reiser4_return_err(int code, const char *file, int line) -+{ -+ if (code < 0 && is_in_reiser4_context()) { -+ reiser4_context *ctx = get_current_context(); -+ -+ if (ctx != NULL) { -+ ctx->err.code = code; -+ ctx->err.file = file; -+ ctx->err.line = line; -+ } -+ } -+} -+ -+#if 0 -+/* -+ * report error information recorder by reiser4_return_err(). -+ */ -+static void reiser4_report_err(void) -+{ -+ reiser4_context *ctx = get_current_context_check(); -+ -+ if (ctx != NULL) { -+ if (ctx->err.code != 0) { -+ printk("code: %i at %s:%i\n", -+ ctx->err.code, ctx->err.file, ctx->err.line); -+ } -+ } -+} -+#endif /* 0 */ -+ -+#endif /* REISER4_DEBUG */ -+ -+#if KERNEL_DEBUGGER -+ -+/* -+ * this functions just drops into kernel debugger. It is a convenient place to -+ * put breakpoint in. -+ */ -+void reiser4_debugtrap(void) -+{ -+ /* do nothing. Put break point here. */ -+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE) -+ extern void kgdb_breakpoint(void); -+ kgdb_breakpoint(); -+#endif -+} -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/debug.h linux-2.6.30/fs/reiser4/debug.h ---- linux-2.6.30.orig/fs/reiser4/debug.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/debug.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,351 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* Declarations of debug macros. */ -+ -+#if !defined(__FS_REISER4_DEBUG_H__) -+#define __FS_REISER4_DEBUG_H__ -+ -+#include "forward.h" -+#include "reiser4.h" -+ -+/* generic function to produce formatted output, decorating it with -+ whatever standard prefixes/postfixes we want. "Fun" is a function -+ that will be actually called, can be printk, panic etc. 
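The DCALL() macro defined just below decorates every message with process, function, file and line before handing it to an arbitrary printf-like sink. The same trick in plain user-space C, trimmed to source location only (demo names hypothetical; relies on ## __VA_ARGS__ as the patch itself does):

#include <stdio.h>

#define DEMO_DCALL(fun, label, format, ...)                     \
        fun("demo[%s (%s:%i)][%s]: " format "\n",               \
            __func__, __FILE__, __LINE__, label, ## __VA_ARGS__)

#define demo_warning(label, format, ...) \
        DEMO_DCALL(printf, label, "WARNING: " format, ## __VA_ARGS__)

int main(void)
{
        demo_warning("demo-1", "value out of range: %d", 42);
        return 0;
}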
-+ This is for use by other debugging macros, not by users. */ -+#define DCALL(lev, fun, reperr, label, format, ...) \ -+({ \ -+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \ -+ current->comm, current->pid, __FUNCTION__, \ -+ __FILE__, __LINE__, label, ## __VA_ARGS__); \ -+}) -+ -+/* -+ * cause kernel to crash -+ */ -+#define reiser4_panic(mid, format, ...) \ -+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__) -+ -+/* print message with indication of current process, file, line and -+ function */ -+#define reiser4_log(label, format, ...) \ -+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__) -+ -+/* Assertion checked during compilation. -+ If "cond" is false (0) we get duplicate case label in switch. -+ Use this to check something like famous -+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ; -+ in 3.x journal.c. If cassertion fails you get compiler error, -+ so no "maintainer-id". -+*/ -+#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } }) -+ -+#define noop do {; } while (0) -+ -+#if REISER4_DEBUG -+/* version of info that only actually prints anything when _d_ebugging -+ is on */ -+#define dinfo(format, ...) printk(format , ## __VA_ARGS__) -+/* macro to catch logical errors. Put it into `default' clause of -+ switch() statement. */ -+#define impossible(label, format, ...) \ -+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__) -+/* assert assures that @cond is true. If it is not, reiser4_panic() is -+ called. Use this for checking logical consistency and _never_ call -+ this to check correctness of external data: disk blocks and user-input . */ -+#define assert(label, cond) \ -+({ \ -+ /* call_on_each_assert(); */ \ -+ if (cond) { \ -+ /* put negated check to avoid using !(cond) that would lose \ -+ * warnings for things like assert(a = b); */ \ -+ ; \ -+ } else { \ -+ DEBUGON(1); \ -+ reiser4_panic(label, "assertion failed: %s", #cond); \ -+ } \ -+}) -+ -+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */ -+#define check_me(label, expr) assert(label, (expr)) -+ -+#define ON_DEBUG(exp) exp -+ -+extern int reiser4_schedulable(void); -+extern void call_on_each_assert(void); -+ -+#else -+ -+#define dinfo(format, args...) noop -+#define impossible(label, format, args...) noop -+#define assert(label, cond) noop -+#define check_me(label, expr) ((void) (expr)) -+#define ON_DEBUG(exp) -+#define reiser4_schedulable() might_sleep() -+ -+/* REISER4_DEBUG */ -+#endif -+ -+#if REISER4_DEBUG -+/* per-thread information about lock acquired by this thread. Used by lock -+ * ordering checking in spin_macros.h */ -+typedef struct reiser4_lock_cnt_info { -+ int rw_locked_tree; -+ int read_locked_tree; -+ int write_locked_tree; -+ -+ int rw_locked_dk; -+ int read_locked_dk; -+ int write_locked_dk; -+ -+ int rw_locked_cbk_cache; -+ int read_locked_cbk_cache; -+ int write_locked_cbk_cache; -+ -+ int spin_locked_zlock; -+ int spin_locked_jnode; -+ int spin_locked_jload; -+ int spin_locked_txnh; -+ int spin_locked_atom; -+ int spin_locked_stack; -+ int spin_locked_txnmgr; -+ int spin_locked_ktxnmgrd; -+ int spin_locked_fq; -+ int spin_locked_inode; -+ int spin_locked_super_eflush; -+ int spin_locked; -+ int long_term_locked_znode; -+ -+ int inode_sem_r; -+ int inode_sem_w; -+ -+ int d_refs; -+ int x_refs; -+ int t_refs; -+} reiser4_lock_cnt_info; -+ -+extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void); -+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? 
(a) : (b)) -+ -+/* increment lock-counter @counter, if present */ -+#define LOCK_CNT_INC(counter) \ -+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0) -+ -+/* decrement lock-counter @counter, if present */ -+#define LOCK_CNT_DEC(counter) \ -+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0) -+ -+/* check that lock-counter is zero. This is for use in assertions */ -+#define LOCK_CNT_NIL(counter) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1) -+ -+/* check that lock-counter is greater than zero. This is for use in -+ * assertions */ -+#define LOCK_CNT_GTZ(counter) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1) -+#define LOCK_CNT_LT(counter,n) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1) -+ -+#else /* REISER4_DEBUG */ -+ -+/* no-op versions on the above */ -+ -+typedef struct reiser4_lock_cnt_info { -+} reiser4_lock_cnt_info; -+ -+#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL) -+#define LOCK_CNT_INC(counter) noop -+#define LOCK_CNT_DEC(counter) noop -+#define LOCK_CNT_NIL(counter) (1) -+#define LOCK_CNT_GTZ(counter) (1) -+#define LOCK_CNT_LT(counter, n) (1) -+ -+#endif /* REISER4_DEBUG */ -+ -+#define assert_spin_not_locked(lock) BUG_ON(0) -+#define assert_rw_write_locked(lock) BUG_ON(0) -+#define assert_rw_read_locked(lock) BUG_ON(0) -+#define assert_rw_locked(lock) BUG_ON(0) -+#define assert_rw_not_write_locked(lock) BUG_ON(0) -+#define assert_rw_not_read_locked(lock) BUG_ON(0) -+#define assert_rw_not_locked(lock) BUG_ON(0) -+ -+/* flags controlling debugging behavior. Are set through debug_flags=N mount -+ option. */ -+typedef enum { -+ /* print a lot of information during panic. When this is on all jnodes -+ * are listed. This can be *very* large output. Usually you don't want -+ * this. Especially over serial line. */ -+ REISER4_VERBOSE_PANIC = 0x00000001, -+ /* print a lot of information during umount */ -+ REISER4_VERBOSE_UMOUNT = 0x00000002, -+ /* print gathered statistics on umount */ -+ REISER4_STATS_ON_UMOUNT = 0x00000004, -+ /* check node consistency */ -+ REISER4_CHECK_NODE = 0x00000008 -+} reiser4_debug_flags; -+ -+extern int is_in_reiser4_context(void); -+ -+/* -+ * evaluate expression @e only if with reiser4 context -+ */ -+#define ON_CONTEXT(e) do { \ -+ if (is_in_reiser4_context()) { \ -+ e; \ -+ } } while (0) -+ -+/* -+ * evaluate expression @e only when within reiser4_context and debugging is -+ * on. -+ */ -+#define ON_DEBUG_CONTEXT(e) ON_DEBUG(ON_CONTEXT(e)) -+ -+/* -+ * complain about unexpected function result and crash. Used in "default" -+ * branches of switch statements and alike to assert that invalid results are -+ * not silently ignored. -+ */ -+#define wrong_return_value(label, function) \ -+ impossible(label, "wrong return value from " function) -+ -+/* Issue different types of reiser4 messages to the console */ -+#define warning(label, format, ...) \ -+ DCALL(KERN_WARNING, \ -+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__) -+#define notice(label, format, ...) \ -+ DCALL(KERN_NOTICE, \ -+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__) -+ -+/* mark not yet implemented functionality */ -+#define not_yet(label, format, ...) \ -+ reiser4_panic(label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__) -+ -+extern void reiser4_do_panic(const char *format, ...) 
-+ __attribute__ ((noreturn, format(printf, 1, 2))); -+ -+extern int reiser4_preempt_point(void); -+extern void reiser4_print_stats(void); -+ -+#if REISER4_DEBUG -+extern int reiser4_no_counters_are_held(void); -+extern int reiser4_commit_check_locks(void); -+#else -+#define reiser4_no_counters_are_held() (1) -+#define reiser4_commit_check_locks() (1) -+#endif -+ -+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */ -+#define IS_POW(i) \ -+({ \ -+ typeof(i) __i; \ -+ \ -+ __i = (i); \ -+ !(__i & (__i - 1)); \ -+}) -+ -+#define KERNEL_DEBUGGER (1) -+ -+#if KERNEL_DEBUGGER -+ -+extern void reiser4_debugtrap(void); -+ -+/* -+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If -+ * kgdb is not compiled in, do nothing. -+ */ -+#define DEBUGON(cond) \ -+({ \ -+ if (unlikely(cond)) \ -+ reiser4_debugtrap(); \ -+}) -+#else -+#define DEBUGON(cond) noop -+#endif -+ -+/* -+ * Error code tracing facility. (Idea is borrowed from XFS code.) -+ * -+ * Suppose some strange and/or unexpected code is returned from some function -+ * (for example, write(2) returns -EEXIST). It is possible to place a -+ * breakpoint in the reiser4_write(), but it is too late here. How to find out -+ * in what particular place -EEXIST was generated first? -+ * -+ * In reiser4 all places where actual error codes are produced (that is, -+ * statements of the form -+ * -+ * return -EFOO; // (1), or -+ * -+ * result = -EFOO; // (2) -+ * -+ * are replaced with -+ * -+ * return RETERR(-EFOO); // (1a), and -+ * -+ * result = RETERR(-EFOO); // (2a) respectively -+ * -+ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is -+ * printed in error and warning messages. Moreover, it's possible to put a -+ * conditional breakpoint in reiser4_return_err (low-level function called -+ * by RETERR() to do the actual work) to break into debugger immediately -+ * when particular error happens. -+ * -+ */ -+ -+#if REISER4_DEBUG -+ -+/* -+ * data-type to store information about where error happened ("error site"). -+ */ -+typedef struct err_site { -+ int code; /* error code */ -+ const char *file; /* source file, filled by __FILE__ */ -+ int line; /* source file line, filled by __LINE__ */ -+} err_site; -+ -+extern void reiser4_return_err(int code, const char *file, int line); -+ -+/* -+ * fill &get_current_context()->err_site with error information. -+ */ -+#define RETERR(code) \ -+({ \ -+ typeof(code) __code; \ -+ \ -+ __code = (code); \ -+ reiser4_return_err(__code, __FILE__, __LINE__); \ -+ __code; \ -+}) -+ -+#else -+ -+/* -+ * no-op versions of the above -+ */ -+ -+typedef struct err_site { -+} err_site; -+#define RETERR(code) code -+#endif -+ -+#if REISER4_LARGE_KEY -+/* -+ * conditionally compile arguments only if REISER4_LARGE_KEY is on. -+ */ -+#define ON_LARGE_KEY(...) __VA_ARGS__ -+#else -+#define ON_LARGE_KEY(...) -+#endif -+ -+/* __FS_REISER4_DEBUG_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/dformat.h linux-2.6.30/fs/reiser4/dformat.h ---- linux-2.6.30.orig/fs/reiser4/dformat.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/dformat.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,71 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* Formats of on-disk data and conversion functions. 
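The RETERR() machinery defined above in debug.h is easy to model outside the kernel: every statement that first produces an error code also records its source location, so the origin of a stray -EEXIST can be recovered later. A user-space rendition (demo names are hypothetical; it uses the same GNU statement-expression style as the patch):

#include <stdio.h>

struct demo_err_site { int code; const char *file; int line; };
static struct demo_err_site demo_last_err;

#define DEMO_RETERR(code) ({                            \
        int __code = (code);                            \
        demo_last_err.code = __code;                    \
        demo_last_err.file = __FILE__;                  \
        demo_last_err.line = __LINE__;                  \
        __code;                                         \
})

static int open_thing(int fail)
{
        return fail ? DEMO_RETERR(-17 /* say, -EEXIST */) : 0;
}

int main(void)
{
        if (open_thing(1) < 0)
                printf("error %d first generated at %s:%d\n",
                       demo_last_err.code, demo_last_err.file,
                       demo_last_err.line);
        return 0;
}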
*/ -+ -+/* put all item formats in the files describing the particular items, -+ our model is, everything you need to do to add an item to reiser4, -+ (excepting the changes to the plugin that uses the item which go -+ into the file defining that plugin), you put into one file. */ -+/* Data on disk are stored in little-endian format. -+ To declare fields of on-disk structures, use d8, d16, d32 and d64. -+ d??tocpu() and cputod??() to convert. */ -+ -+#if !defined(__FS_REISER4_DFORMAT_H__) -+#define __FS_REISER4_DFORMAT_H__ -+ -+#include <asm/byteorder.h> -+#include <asm/unaligned.h> -+#include <linux/types.h> -+ -+typedef __u8 d8; -+typedef __le16 d16; -+typedef __le32 d32; -+typedef __le64 d64; -+ -+#define PACKED __attribute__((packed)) -+ -+/* data-type for block number */ -+typedef __u64 reiser4_block_nr; -+ -+/* data-type for block number on disk, disk format */ -+typedef __le64 reiser4_dblock_nr; -+ -+/** -+ * disk_addr_eq - compare disk addresses -+ * @b1: pointer to block number to compare -+ * @b2: pointer to block number to compare -+ * -+ * Returns true if disk addresses are the same -+ */ -+static inline int disk_addr_eq(const reiser4_block_nr * b1, -+ const reiser4_block_nr * b2) -+{ -+ assert("nikita-1033", b1 != NULL); -+ assert("nikita-1266", b2 != NULL); -+ -+ return !memcmp(b1, b2, sizeof *b1); -+} -+ -+/* structure of master reiser4 super block */ -+typedef struct reiser4_master_sb { -+ char magic[16]; /* "ReIsEr4" */ -+ __le16 disk_plugin_id; /* id of disk layout plugin */ -+ __le16 blocksize; -+ char uuid[16]; /* unique id */ -+ char label[16]; /* filesystem label */ -+ __le64 diskmap; /* location of the diskmap. 0 if not present */ -+} reiser4_master_sb; -+ -+/* __FS_REISER4_DFORMAT_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/dscale.c linux-2.6.30/fs/reiser4/dscale.c ---- linux-2.6.30.orig/fs/reiser4/dscale.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/dscale.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,192 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Scalable on-disk integers */ -+ -+/* -+ * Various on-disk structures contain integer-like structures. Stat-data -+ * contain [yes, "data" is plural, check the dictionary] file size, link -+ * count; extent unit contains extent width etc. To accommodate the general -+ * case, enough space is reserved to keep the largest possible value. 64 bits -+ * in all cases above. But in the overwhelming majority of cases numbers -+ * actually stored in these fields will be comparatively small and reserving -+ * 8 bytes is a waste of precious disk bandwidth. -+ * -+ * Scalable integers are one way to solve this problem. The dscale_write() -+ * function stores a __u64 value in the given area consuming from 1 to 9 -+ * bytes, depending on the magnitude of the value supplied. dscale_read() -+ * reads a value previously stored by dscale_write(). -+ * -+ * dscale_write() produces a format not completely unlike UTF: the two -+ * highest bits of the first byte are used to store a "tag". One of 4 -+ * possible tag values is chosen depending on the number being encoded: -+ * -+ * 0 ... 0x3f => 0 [table 1] -+ * 0x40 ... 0x3fff => 1 -+ * 0x4000 ... 0x3fffffff => 2 -+ * 0x40000000 ... 0xffffffffffffffff => 3 -+ * -+ * (see dscale_range() function) -+ * -+ * Values in the range 0x40000000 ...
0xffffffffffffffff require 8 full bytes -+ * to be stored, so in this case there is no place in the first byte to store -+ * the tag. For such values the tag is stored in an extra 9th byte. -+ * -+ * As the _highest_ bits are used for the test (which is natural), scaled -+ * integers are stored in BIG-ENDIAN format, in contrast with the rest of -+ * reiser4 which uses LITTLE-ENDIAN. -+ * -+ */ -+ -+#include "debug.h" -+#include "dscale.h" -+ -+/* return tag of scaled integer stored at @address */ -+static int gettag(const unsigned char *address) -+{ -+ /* tag is stored in two highest bits */ -+ return (*address) >> 6; -+} -+ -+/* clear tag embedded into @value */ -+static void cleartag(__u64 *value, int tag) -+{ -+ /* -+ * W-w-what ?! -+ * -+ * Actually, this is rather simple: @value passed here was read by -+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by -+ * zeroes. The tag is still stored in the highest (arithmetically) -+ * non-zero bits of @value, but the relative position of the tag -+ * within __u64 depends on @tag. -+ * -+ * For example, if @tag is 0, it's stored in the 2 highest bits of the -+ * lowest byte, and its offset (counting from the lowest bit) is -+ * 8 - 2 == 6 bits. -+ * -+ * If tag is 1, it's stored in the two highest bits of the 2nd lowest -+ * byte, and its offset is (2 * 8) - 2 == 14 bits. -+ * -+ * See table 1 above for details. -+ * -+ * All these cases are captured by the formula: -+ */ -+ *value &= ~(3 << (((1 << tag) << 3) - 2)); -+ /* -+ * That is, clear two (3 == 0t11) bits at the offset -+ * -+ * 8 * (2 ^ tag) - 2, -+ * -+ * that is, the two highest bits of the (2 ^ tag)-th byte of @value. -+ */ -+} -+ -+/* return tag for @value. See table 1 above for details. */ -+static int dscale_range(__u64 value) -+{ -+ if (value > 0x3fffffff) -+ return 3; -+ if (value > 0x3fff) -+ return 2; -+ if (value > 0x3f) -+ return 1; -+ return 0; -+} -+ -+/* restore the value stored at @address by dscale_write() and return the -+ * number of bytes consumed */ -+int dscale_read(unsigned char *address, __u64 *value) -+{ -+ int tag; -+ -+ /* read tag */ -+ tag = gettag(address); -+ switch (tag) { -+ case 3: -+ /* In this case the tag is stored in an extra byte: skip this -+ * byte and decode the value stored in the next 8 bytes. */ -+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1))); -+ /* worst case: 8 bytes for the value itself plus one byte for -+ * the tag. */ -+ return 9; -+ case 0: -+ *value = get_unaligned(address); -+ break; -+ case 1: -+ *value = __be16_to_cpu(get_unaligned((__be16 *)address)); -+ break; -+ case 2: -+ *value = __be32_to_cpu(get_unaligned((__be32 *)address)); -+ break; -+ default: -+ return RETERR(-EIO); -+ } -+ /* clear tag embedded into @value */ -+ cleartag(value, tag); -+ /* number of bytes consumed is (2 ^ tag)---see table 1. */ -+ return 1 << tag; -+} -+ -+/* number of bytes consumed */ -+int dscale_bytes_to_read(unsigned char *address) -+{ -+ int tag; -+ -+ tag = gettag(address); -+ switch (tag) { -+ case 0: -+ case 1: -+ case 2: -+ return 1 << tag; -+ case 3: -+ return 9; -+ default: -+ return RETERR(-EIO); -+ } -+} -+ -+/* store @value at @address and return the number of bytes consumed. For -+ * example, 0x50 falls into the tag 1 range of table 1 and is stored as the -+ * two bytes 0x40 0x50 (tag in the top bits, value big-endian). */ -+int dscale_write(unsigned char *address, __u64 value) -+{ -+ int tag; -+ int shift; -+ __be64 v; -+ unsigned char *valarr; -+ -+ tag = dscale_range(value); -+ v = __cpu_to_be64(value); -+ valarr = (unsigned char *)&v; -+ shift = (tag == 3) ?
1 : 0; -+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag); -+ *address |= (tag << 6); -+ return shift + (1 << tag); -+} -+ -+/* number of bytes required to store @value */ -+int dscale_bytes_to_write(__u64 value) -+{ -+ int bytes; -+ -+ bytes = 1 << dscale_range(value); -+ if (bytes == 8) -+ ++bytes; -+ return bytes; -+} -+ -+/* returns true if @value and @other require the same number of bytes to be -+ * stored. Used to detect when a data structure (like stat-data) has to be -+ * expanded or contracted. */ -+int dscale_fit(__u64 value, __u64 other) -+{ -+ return dscale_range(value) == dscale_range(other); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/dscale.h linux-2.6.30/fs/reiser4/dscale.h ---- linux-2.6.30.orig/fs/reiser4/dscale.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/dscale.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,28 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Scalable on-disk integers. See dscale.c for details. */ -+ -+#if !defined(__FS_REISER4_DSCALE_H__) -+#define __FS_REISER4_DSCALE_H__ -+ -+#include "dformat.h" -+ -+extern int dscale_read(unsigned char *address, __u64 *value); -+extern int dscale_write(unsigned char *address, __u64 value); -+extern int dscale_bytes_to_read(unsigned char *address); -+extern int dscale_bytes_to_write(__u64 value); -+extern int dscale_fit(__u64 value, __u64 other); -+ -+/* __FS_REISER4_DSCALE_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/entd.c linux-2.6.30/fs/reiser4/entd.c ---- linux-2.6.30.orig/fs/reiser4/entd.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/entd.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,335 @@ -+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Ent daemon. */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "tree.h" -+#include "entd.h" -+#include "super.h" -+#include "context.h" -+#include "reiser4.h" -+#include "vfs_ops.h" -+#include "page_cache.h" -+#include "inode.h" -+ -+#include <linux/sched.h> /* struct task_struct */ -+#include <linux/suspend.h> -+#include <linux/kernel.h> -+#include <linux/writeback.h> -+#include <linux/time.h> /* INITIAL_JIFFIES */ -+#include <linux/backing-dev.h> /* bdi_write_congested */ -+#include <linux/wait.h> -+#include <linux/kthread.h> -+#include <linux/freezer.h> -+ -+#define DEF_PRIORITY 12 -+#define MAX_ENTD_ITERS 10 -+ -+static void entd_flush(struct super_block *, struct wbq *); -+static int entd(void *arg); -+ -+/* -+ * set the ->comm field of the ent thread to make its state visible to the -+ * user level -+ */ -+#define entd_set_comm(state) \ -+ snprintf(current->comm, sizeof(current->comm), \ -+ "ent:%s%s", super->s_id, (state)) -+ -+/** -+ * reiser4_init_entd - initialize entd context and start kernel daemon -+ * @super: super block to start ent thread for -+ * -+ * Creates the entd context, starts the kernel thread and waits until it -+ * initializes.
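-+ * -+ * Returns 0 on success; if starting the kernel thread fails, the PTR_ERR() -+ * of kthread_run()'s result is returned (see the function body below).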
-+ */ -+int reiser4_init_entd(struct super_block *super) -+{ -+ entd_context *ctx; -+ -+ assert("nikita-3104", super != NULL); -+ -+ ctx = get_entd_context(super); -+ -+ memset(ctx, 0, sizeof *ctx); -+ spin_lock_init(&ctx->guard); -+ init_waitqueue_head(&ctx->wait); -+#if REISER4_DEBUG -+ INIT_LIST_HEAD(&ctx->flushers_list); -+#endif -+ /* lists of writepage requests */ -+ INIT_LIST_HEAD(&ctx->todo_list); -+ INIT_LIST_HEAD(&ctx->done_list); -+ /* start entd */ -+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id); -+ if (IS_ERR(ctx->tsk)) -+ return PTR_ERR(ctx->tsk); -+ return 0; -+} -+ -+static void put_wbq(struct wbq *rq) -+{ -+ iput(rq->mapping->host); -+ complete(&rq->completion); -+} -+ -+/* ent should be locked */ -+static struct wbq *__get_wbq(entd_context * ent) -+{ -+ struct wbq *wbq; -+ -+ if (list_empty(&ent->todo_list)) -+ return NULL; -+ -+ ent->nr_todo_reqs--; -+ wbq = list_entry(ent->todo_list.next, struct wbq, link); -+ list_del_init(&wbq->link); -+ return wbq; -+} -+ -+/* ent thread function */ -+static int entd(void *arg) -+{ -+ struct super_block *super; -+ entd_context *ent; -+ int done = 0; -+ -+ super = arg; -+ /* do_fork() just copies task_struct into the new -+ thread. ->fs_context shouldn't be copied of course. This shouldn't -+ be a problem for the rest of the code though. -+ */ -+ current->journal_info = NULL; -+ -+ ent = get_entd_context(super); -+ -+ while (!done) { -+ try_to_freeze(); -+ -+ spin_lock(&ent->guard); -+ while (ent->nr_todo_reqs != 0) { -+ struct wbq *rq; -+ -+ assert("", list_empty(&ent->done_list)); -+ -+ /* take request from the queue head */ -+ rq = __get_wbq(ent); -+ assert("", rq != NULL); -+ ent->cur_request = rq; -+ spin_unlock(&ent->guard); -+ -+ entd_set_comm("!"); -+ entd_flush(super, rq); -+ -+ put_wbq(rq); -+ -+ /* -+ * wake up all requestors and iput their inodes -+ */ -+ spin_lock(&ent->guard); -+ while (!list_empty(&ent->done_list)) { -+ rq = list_entry(ent->done_list.next, struct wbq, link); -+ list_del_init(&rq->link); -+ ent->nr_done_reqs--; -+ spin_unlock(&ent->guard); -+ assert("", rq->written == 1); -+ put_wbq(rq); -+ spin_lock(&ent->guard); -+ } -+ } -+ spin_unlock(&ent->guard); -+ -+ entd_set_comm("."); -+ -+ { -+ DEFINE_WAIT(__wait); -+ -+ do { -+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ done = 1; -+ break; -+ } -+ if (ent->nr_todo_reqs != 0) -+ break; -+ schedule(); -+ } while (0); -+ finish_wait(&ent->wait, &__wait); -+ } -+ } -+ BUG_ON(ent->nr_todo_reqs != 0); -+ return 0; -+} -+ -+/** -+ * reiser4_done_entd - stop entd kernel thread -+ * @super: super block to stop ent thread for -+ * -+ * It is called on umount. Sends a stop signal to entd and waits until it -+ * handles it.
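-+ * -+ * Note that the kthread_stop() below does not return until the ent thread -+ * has actually exited.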
-+ */ -+void reiser4_done_entd(struct super_block *super) -+{ -+ entd_context *ent; -+ -+ assert("nikita-3103", super != NULL); -+ -+ ent = get_entd_context(super); -+ assert("zam-1055", ent->tsk != NULL); -+ kthread_stop(ent->tsk); -+} -+ -+/* called at the beginning of jnode_flush to register the flusher thread with -+ * the ent daemon */ -+void reiser4_enter_flush(struct super_block *super) -+{ -+ entd_context *ent; -+ -+ assert("zam-1029", super != NULL); -+ ent = get_entd_context(super); -+ -+ assert("zam-1030", ent != NULL); -+ -+ spin_lock(&ent->guard); -+ ent->flushers++; -+#if REISER4_DEBUG -+ list_add(&get_current_context()->flushers_link, &ent->flushers_list); -+#endif -+ spin_unlock(&ent->guard); -+} -+ -+/* called at the end of jnode_flush */ -+void reiser4_leave_flush(struct super_block *super) -+{ -+ entd_context *ent; -+ int wake_up_ent; -+ -+ assert("zam-1027", super != NULL); -+ ent = get_entd_context(super); -+ -+ assert("zam-1028", ent != NULL); -+ -+ spin_lock(&ent->guard); -+ ent->flushers--; -+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0); -+#if REISER4_DEBUG -+ list_del_init(&get_current_context()->flushers_link); -+#endif -+ spin_unlock(&ent->guard); -+ if (wake_up_ent) -+ wake_up_process(ent->tsk); -+} -+ -+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX -+ -+static void entd_flush(struct super_block *super, struct wbq *rq) -+{ -+ reiser4_context ctx; -+ int tmp; -+ -+ init_stack_context(&ctx, super); -+ ctx.entd = 1; -+ ctx.gfp_mask = GFP_NOFS; -+ -+ rq->wbc->range_start = page_offset(rq->page); -+ rq->wbc->range_end = rq->wbc->range_start + -+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT); -+ tmp = rq->wbc->nr_to_write; -+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc); -+ -+ if (rq->wbc->nr_to_write > 0) { -+ rq->wbc->range_start = 0; -+ rq->wbc->range_end = LLONG_MAX; -+ generic_sync_sb_inodes(super, rq->wbc); -+ } -+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST; -+ reiser4_writeout(super, rq->wbc); -+ -+ context_set_commit_async(&ctx); -+ reiser4_exit_context(&ctx); -+} -+ -+/** -+ * write_page_by_ent - ask entd thread to flush this page as part of slum -+ * @page: page to be written -+ * @wbc: writeback control passed to reiser4_writepage -+ * -+ * Creates a request, puts it on the entd list of requests, wakes up entd if -+ * necessary, and waits until entd completes the request. -+ */ -+int write_page_by_ent(struct page *page, struct writeback_control *wbc) -+{ -+ struct super_block *sb; -+ struct inode *inode; -+ entd_context *ent; -+ struct wbq rq; -+ -+ assert("", PageLocked(page)); -+ assert("", page->mapping != NULL); -+ -+ sb = page->mapping->host->i_sb; -+ ent = get_entd_context(sb); -+ assert("", ent && ent->done == 0); -+ -+ /* -+ * we are going to unlock the page and ask the ent thread to write the -+ * page. Re-dirty the page before unlocking so that if the ent thread -+ * fails to write it - it will remain dirty -+ */ -+ set_page_dirty_notag(page); -+ -+ /* -+ * pin inode in memory, unlock page, entd_flush will iput.
We cannot -+ * iput here because we cannot allow delete_inode to be called here -+ */ -+ inode = igrab(page->mapping->host); -+ unlock_page(page); -+ if (inode == NULL) -+ /* inode is getting freed */ -+ return 0; -+ -+ /* init wbq */ -+ INIT_LIST_HEAD(&rq.link); -+ rq.magic = WBQ_MAGIC; -+ rq.wbc = wbc; -+ rq.page = page; -+ rq.mapping = inode->i_mapping; -+ rq.node = NULL; -+ rq.written = 0; -+ init_completion(&rq.completion); -+ -+ /* add request to entd's list of writepage requests */ -+ spin_lock(&ent->guard); -+ ent->nr_todo_reqs++; -+ list_add_tail(&rq.link, &ent->todo_list); -+ if (ent->nr_todo_reqs == 1) -+ wake_up_process(ent->tsk); -+ -+ spin_unlock(&ent->guard); -+ -+ /* wait until entd finishes */ -+ wait_for_completion(&rq.completion); -+ -+ if (rq.written) -+ /* Eventually ENTD has written the page to disk. */ -+ return 0; -+ return 0; -+} -+ -+int wbq_available(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ entd_context *ent = get_entd_context(sb); -+ return ent->nr_todo_reqs; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/entd.h linux-2.6.30/fs/reiser4/entd.h ---- linux-2.6.30.orig/fs/reiser4/entd.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/entd.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,90 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Ent daemon. */ -+ -+#ifndef __ENTD_H__ -+#define __ENTD_H__ -+ -+#include "context.h" -+ -+#include <linux/fs.h> -+#include <linux/completion.h> -+#include <linux/wait.h> -+#include <linux/spinlock.h> -+#include <linux/sched.h> /* for struct task_struct */ -+ -+#define WBQ_MAGIC 0x7876dc76 -+ -+/* write-back request. */ -+struct wbq { -+ int magic; -+ struct list_head link; /* the head of this list is in the entd context */ -+ struct writeback_control *wbc; -+ struct page *page; -+ struct address_space *mapping; -+ struct completion completion; -+ jnode *node; /* set if ent thread captured requested page */ -+ int written; /* set if ent thread wrote requested page */ -+}; -+ -+/* ent-thread context. This is used to synchronize starting/stopping ent -+ * threads. */ -+typedef struct entd_context { -+ /* wait queue that the ent thread waits on for more work. It's -+ * signaled by write_page_by_ent(). */ -+ wait_queue_head_t wait; -+ /* spinlock protecting other fields */ -+ spinlock_t guard; -+ /* ent thread */ -+ struct task_struct *tsk; -+ /* set to indicate that the ent thread should leave. */ -+ int done; -+ /* counter of active flushers */ -+ int flushers; -+ /* -+ * when reiser4_writepage asks entd to write a page - it adds struct -+ * wbq to this list -+ */ -+ struct list_head todo_list; -+ /* number of elements on the above list */ -+ int nr_todo_reqs; -+ -+ struct wbq *cur_request; -+ /* -+ * when entd writes a page it moves the write-back request from -+ * todo_list to done_list. This list is used at the end of the entd -+ * iteration to wake up requestors and iput inodes.
-+ */ -+ struct list_head done_list; -+ /* number of elements on the above list */ -+ int nr_done_reqs; -+ -+#if REISER4_DEBUG -+ /* list of all active flushers */ -+ struct list_head flushers_list; -+#endif -+} entd_context; -+ -+extern int reiser4_init_entd(struct super_block *); -+extern void reiser4_done_entd(struct super_block *); -+ -+extern void reiser4_enter_flush(struct super_block *); -+extern void reiser4_leave_flush(struct super_block *); -+ -+extern int write_page_by_ent(struct page *, struct writeback_control *); -+extern int wbq_available(void); -+extern void ent_writes_page(struct super_block *, struct page *); -+ -+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *); -+/* __ENTD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/eottl.c linux-2.6.30/fs/reiser4/eottl.c ---- linux-2.6.30.orig/fs/reiser4/eottl.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/eottl.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,510 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree_mod.h" -+#include "carry.h" -+#include "tree.h" -+#include "super.h" -+ -+#include <linux/types.h> /* for __u?? */ -+ -+/* -+ * Extents on the twig level (EOTTL) handling. -+ * -+ * EOTTL poses some problems for tree traversal, which are better explained -+ * by example. -+ * -+ * Suppose we have block B1 on the twig level with the following items: -+ * -+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id, -+ * offset) -+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each -+ * 2. internal item I2 with key (10:0:0:0) -+ * -+ * We are trying to insert an item with key (5:0:0:0). Lookup finds node B1, -+ * and then intra-node lookup is done. This lookup finishes on E1, because -+ * the key we are looking for is larger than the key of E1 and smaller than -+ * the key of I2. -+ * -+ * Here the search is stuck. -+ * -+ * After some thought it is clear what is wrong here: extents on the twig -+ * level break a basic property of the *search* tree (on the pretext that -+ * they restore a property of the balanced tree). -+ * -+ * Said property is the following: if in the internal node of the search tree -+ * we have [ ... Key1 Pointer Key2 ... ] then all data that are or will be -+ * keyed in the tree with a Key such that Key1 <= Key < Key2 are accessible -+ * through the Pointer. -+ * -+ * This is not true when the Pointer is an Extent-Pointer, simply because an -+ * extent cannot expand indefinitely to the right to include any item with -+ * -+ * Key1 <= Key <= Key2. -+ * -+ * For example, our E1 extent is only responsible for the data with keys -+ * -+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and -+ * -+ * so, the key range -+ * -+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) ) -+ * -+ * is orphaned: there is no way to get there from the tree root. -+ * -+ * In other words, extent pointers are different from normal child pointers -+ * as far as the search tree is concerned, and this creates such problems. -+ * -+ * A possible solution to this problem is to insert our item into the node -+ * pointed to by I2.
There are some problems though: -+ * -+ * (1) I2 can be in a different node. -+ * (2) E1 can be immediately followed by another extent E2. -+ * -+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting -+ * for locks/coords as necessary. -+ * -+ * (2) is more complex. The solution here is to insert a new empty leaf node -+ * and insert an internal item between E1 and E2 pointing to said leaf node. -+ * This is further complicated by the possibility that E2 is in a different -+ * node, etc. -+ * -+ * Problems: -+ * -+ * (1) if there was an internal item I2 immediately on the right of an extent -+ * E1 and we decided to insert a new item S1 into the node N2 pointed to by -+ * I2, then the key of S1 will be less than the smallest key in N2. Normally, -+ * the search checks that the key we are looking for is in the range of keys -+ * covered by the node the key is being looked up in. To work around this -+ * situation, while preserving a useful consistency check, a new flag -+ * CBK_TRUST_DK was added to the cbk flags bitmask. This flag is -+ * automatically set on entrance to coord_by_key() and is only cleared when -+ * we are about to enter the situation described above. -+ * -+ * (2) If extent E1 is immediately followed by another extent E2 and we are -+ * searching for a key that is between E1 and E2, we only have to insert a -+ * new empty leaf node when coord_by_key was called for insertion, rather -+ * than just for lookup. To distinguish these cases, a new flag -+ * CBK_FOR_INSERT was added to the cbk flags bitmask. This flag is -+ * automatically set by the coord_by_key calls performed by insert_by_key() -+ * and friends. -+ * -+ * (3) Insertion of a new empty leaf node (possibly) requires balancing. In -+ * any case it requires modification of node content, which is only possible -+ * under write lock. It may well happen that we only have a read lock on the -+ * node where the new internal pointer is to be inserted (common case: -+ * lookup of a non-existent stat-data that falls between two extents). If -+ * only a read lock is held, tree traversal is restarted with lock_level -+ * modified so that next time we hit this problem, the write lock will be -+ * held. Once we have the write lock, balancing will be performed. -+ */ -+ -+/** -+ * is_next_item_internal - check whether next item is internal -+ * @coord: coordinate of extent item in twig node -+ * @key: search key -+ * @lh: twig node lock handle -+ * -+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned, -+ * @coord is set to that unit. If that unit is in the right neighbor, @lh is -+ * moved to that node, @coord is set to its first unit. If the next item is -+ * not internal or does not exist then 0 is returned, @coord and @lh are left -+ * unchanged. 2 is returned if a search restart has to be done. -+ */ -+static int -+is_next_item_internal(coord_t *coord, const reiser4_key * key, -+ lock_handle * lh) -+{ -+ coord_t next; -+ lock_handle rn; -+ int result; -+ -+ coord_dup(&next, coord); -+ if (coord_next_unit(&next) == 0) { -+ /* next unit is in this node */ -+ if (item_is_internal(&next)) { -+ coord_dup(coord, &next); -+ return 1; -+ } -+ assert("vs-3", item_is_extent(&next)); -+ return 0; -+ } -+ -+ /* -+ * the next unit either does not exist or is in the right neighbor.
If it is in the -+ * right neighbor, we have to check the right delimiting key because a -+ * concurrent thread could get there first and insert an item with a key -+ * smaller than @key -+ */ -+ read_lock_dk(current_tree); -+ result = keycmp(key, znode_get_rd_key(coord->node)); -+ read_unlock_dk(current_tree); -+ assert("vs-6", result != EQUAL_TO); -+ if (result == GREATER_THAN) -+ return 2; -+ -+ /* lock right neighbor */ -+ init_lh(&rn); -+ result = reiser4_get_right_neighbor(&rn, coord->node, -+ znode_is_wlocked(coord->node) ? -+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == -E_NO_NEIGHBOR) { -+ /* we are on the rightmost edge of the tree */ -+ done_lh(&rn); -+ return 0; -+ } -+ -+ if (result) { -+ assert("vs-4", result < 0); -+ done_lh(&rn); -+ return result; -+ } -+ -+ /* -+ * check whether a concurrent thread managed to insert an item with a -+ * key smaller than @key -+ */ -+ read_lock_dk(current_tree); -+ result = keycmp(key, znode_get_ld_key(rn.node)); -+ read_unlock_dk(current_tree); -+ assert("vs-6", result != EQUAL_TO); -+ if (result == GREATER_THAN) { -+ done_lh(&rn); -+ return 2; -+ } -+ -+ result = zload(rn.node); -+ if (result) { -+ assert("vs-5", result < 0); -+ done_lh(&rn); -+ return result; -+ } -+ -+ coord_init_first_unit(&next, rn.node); -+ if (item_is_internal(&next)) { -+ /* -+ * the next unit is in the right neighbor and it is a unit of -+ * an internal item. Unlock coord->node. Move @lh to the right -+ * neighbor. @coord is set to the first unit of the right -+ * neighbor. -+ */ -+ coord_dup(coord, &next); -+ zrelse(rn.node); -+ done_lh(lh); -+ move_lh(lh, &rn); -+ return 1; -+ } -+ -+ /* -+ * the next unit is a unit of an extent item. Return without changing -+ * @lh and @coord. -+ */ -+ assert("vs-6", item_is_extent(&next)); -+ zrelse(rn.node); -+ done_lh(&rn); -+ return 0; -+} -+ -+/** -+ * rd_key - calculate key of an item next to the given one -+ * @coord: position in a node -+ * @key: storage for result key -+ * -+ * @coord is set between items or after the last item in a node. Calculate the -+ * key of the item to the right of @coord. -+ */ -+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key) -+{ -+ coord_t dup; -+ -+ assert("nikita-2281", coord_is_between_items(coord)); -+ coord_dup(&dup, coord); -+ -+ if (coord_set_to_right(&dup) == 0) -+ /* next item is in this node. Return its key. */ -+ unit_key_by_coord(&dup, key); -+ else { -+ /* -+ * the next item either does not exist or is in the right -+ * neighbor. Return znode's right delimiting key. -+ */ -+ read_lock_dk(current_tree); -+ *key = *znode_get_rd_key(coord->node); -+ read_unlock_dk(current_tree); -+ } -+ return key; -+} -+ -+/** -+ * add_empty_leaf - insert empty leaf between two extents -+ * @insert_coord: position in twig node between two extents -+ * @lh: twig node lock handle -+ * @key: left delimiting key of new node -+ * @rdkey: right delimiting key of new node -+ * -+ * Inserts an empty leaf node between two extent items. It is necessary when -+ * we have to insert an item on the leaf level between two extents (items on -+ * the twig level).
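-+ * -+ * Judging by the body below, returns 0 on success and a negative error code -+ * otherwise (failures of reiser4_new_node(), init_carry_pool(), the carry -+ * operation itself, or locking and loading the new node).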
-+ */ -+static int -+add_empty_leaf(coord_t *insert_coord, lock_handle *lh, -+ const reiser4_key *key, const reiser4_key *rdkey) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *todo; -+ reiser4_item_data *item; -+ carry_insert_data *cdata; -+ carry_op *op; -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key)); -+ tree = znode_get_tree(insert_coord->node); -+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL); -+ if (IS_ERR(node)) -+ return PTR_ERR(node); -+ -+ /* setup delimiting keys for node being inserted */ -+ write_lock_dk(tree); -+ znode_set_ld_key(node, key); -+ znode_set_rd_key(node, rdkey); -+ ON_DEBUG(node->creator = current); -+ ON_DEBUG(node->first_key = *key); -+ write_unlock_dk(tree); -+ -+ ZF_SET(node, JNODE_ORPHAN); -+ -+ /* -+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and -+ * carry_insert_data -+ */ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + -+ sizeof(*item) + sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ item = (reiser4_item_data *) (todo + 3); -+ cdata = (carry_insert_data *) (item + 1); -+ -+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0); -+ if (!IS_ERR(op)) { -+ cdata->coord = insert_coord; -+ cdata->key = key; -+ cdata->data = item; -+ op->u.insert.d = cdata; -+ op->u.insert.type = COPT_ITEM_DATA; -+ build_child_ptr_data(node, item); -+ item->arg = NULL; -+ /* have @insert_coord to be set at inserted item after -+ insertion is done */ -+ todo->track_type = CARRY_TRACK_CHANGE; -+ todo->tracked = lh; -+ -+ result = reiser4_carry(todo, NULL); -+ if (result == 0) { -+ /* -+ * pin node in memory. This is necessary for -+ * znode_make_dirty() below. -+ */ -+ result = zload(node); -+ if (result == 0) { -+ lock_handle local_lh; -+ -+ /* -+ * if we inserted new child into tree we have -+ * to mark it dirty so that flush will be able -+ * to process it. -+ */ -+ init_lh(&local_lh); -+ result = longterm_lock_znode(&local_lh, node, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ znode_make_dirty(node); -+ -+ /* -+ * when internal item pointing to @node -+ * was inserted into twig node -+ * create_hook_internal did not connect -+ * it properly because its right -+ * neighbor was not known. Do it -+ * here -+ */ -+ write_lock_tree(tree); -+ assert("nikita-3312", -+ znode_is_right_connected(node)); -+ assert("nikita-2984", -+ node->right == NULL); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ write_unlock_tree(tree); -+ result = -+ connect_znode(insert_coord, node); -+ ON_DEBUG(if (result == 0) check_dkeys(node);); -+ -+ done_lh(lh); -+ move_lh(lh, &local_lh); -+ assert("vs-1676", node_is_empty(node)); -+ coord_init_first_unit(insert_coord, -+ node); -+ } else { -+ warning("nikita-3136", -+ "Cannot lock child"); -+ } -+ done_lh(&local_lh); -+ zrelse(node); -+ } -+ } -+ } else -+ result = PTR_ERR(op); -+ zput(node); -+ done_carry_pool(pool); -+ return result; -+} -+ -+/** -+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal -+ * @h: search handle -+ * @outcome: flag saying whether search has to restart or is done -+ * -+ * Handles search on twig level. If this function completes search itself then -+ * it returns 1. If search has to go one level down then 0 is returned. If -+ * error happens then LOOKUP_DONE is returned via @outcome and error code is -+ * saved in @h->result. 
-+ */ -+int handle_eottl(cbk_handle *h, int *outcome) -+{ -+ int result; -+ reiser4_key key; -+ coord_t *coord; -+ -+ coord = h->coord; -+ -+ if (h->level != TWIG_LEVEL || -+ (coord_is_existing_item(coord) && item_is_internal(coord))) { -+ /* Continue to traverse tree downward. */ -+ return 0; -+ } -+ -+ /* -+ * make sure that @h->coord is set to twig node and that it is either -+ * set to extent item or after extent item -+ */ -+ assert("vs-356", h->level == TWIG_LEVEL); -+ assert("vs-357", ({ -+ coord_t lcoord; -+ coord_dup(&lcoord, coord); -+ check_me("vs-733", coord_set_to_left(&lcoord) == 0); -+ item_is_extent(&lcoord); -+ } -+ )); -+ -+ if (*outcome == NS_FOUND) { -+ /* we have found desired key on twig level in extent item */ -+ h->result = CBK_COORD_FOUND; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ -+ if (!(h->flags & CBK_FOR_INSERT)) { -+ /* tree traversal is not for insertion. Just return -+ CBK_COORD_NOTFOUND. */ -+ h->result = CBK_COORD_NOTFOUND; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ -+ /* take a look at the item to the right of h -> coord */ -+ result = is_next_item_internal(coord, h->key, h->active_lh); -+ if (unlikely(result < 0)) { -+ h->error = "get_right_neighbor failed"; -+ h->result = result; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ if (result == 0) { -+ /* -+ * item to the right is also an extent one. Allocate a new node -+ * and insert pointer to it after item h -> coord. -+ * -+ * This is a result of extents being located at the twig -+ * level. For explanation, see comment just above -+ * is_next_item_internal(). -+ */ -+ znode *loaded; -+ -+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) { -+ /* -+ * we got node read locked, restart coord_by_key to -+ * have write lock on twig level -+ */ -+ h->lock_level = TWIG_LEVEL; -+ h->lock_mode = ZNODE_WRITE_LOCK; -+ *outcome = LOOKUP_REST; -+ return 1; -+ } -+ -+ loaded = coord->node; -+ result = -+ add_empty_leaf(coord, h->active_lh, h->key, -+ rd_key(coord, &key)); -+ if (result) { -+ h->error = "could not add empty leaf"; -+ h->result = result; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ /* added empty leaf is locked (h->active_lh), its parent node -+ is unlocked, h->coord is set as EMPTY */ -+ assert("vs-13", coord->between == EMPTY_NODE); -+ assert("vs-14", znode_is_write_locked(coord->node)); -+ assert("vs-15", -+ WITH_DATA(coord->node, node_is_empty(coord->node))); -+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node))); -+ assert("vs-17", coord->node == h->active_lh->node); -+ *outcome = LOOKUP_DONE; -+ h->result = CBK_COORD_NOTFOUND; -+ return 1; -+ } else if (result == 1) { -+ /* -+ * this is special case mentioned in the comment on -+ * tree.h:cbk_flags. We have found internal item immediately on -+ * the right of extent, and we are going to insert new item -+ * there. Key of item we are going to insert is smaller than -+ * leftmost key in the node pointed to by said internal item -+ * (otherwise search wouldn't come to the extent in the first -+ * place). -+ * -+ * This is a result of extents being located at the twig -+ * level. For explanation, see comment just above -+ * is_next_item_internal(). 
-+ */ -+ h->flags &= ~CBK_TRUST_DK; -+ } else { -+ assert("vs-8", result == 2); -+ *outcome = LOOKUP_REST; -+ return 1; -+ } -+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord))); -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/estimate.c linux-2.6.30/fs/reiser4/estimate.c ---- linux-2.6.30.orig/fs/reiser4/estimate.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/estimate.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,129 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "tree.h" -+#include "carry.h" -+#include "inode.h" -+#include "plugin/cluster.h" -+#include "plugin/item/ctail.h" -+ -+/* This returns how many nodes might get dirty and added if @children nodes -+ are dirtied -+ -+ The number of internal nodes which will get dirty or get allocated we -+ estimate as roughly 10% of the children (103/1024, see ten_percent below) -+ + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current -+ block on the leaf level, 2 neighbour nodes + the current (or 1 neighbour -+ and 1 new and the current) on the twig level, 2 neighbour nodes on upper -+ levels and 1 for a new root. So 5 for the leaf level, 3 for the twig -+ level, 2 on upper levels + 1 for root. -+ -+ Do not calculate the current node of the lowest level here - this is -+ overhead only. -+ -+ children is almost always 1 here. The exception is flow insertion -+*/ -+static reiser4_block_nr -+max_balance_overhead(reiser4_block_nr children, tree_level tree_height) -+{ -+ reiser4_block_nr ten_percent; -+ -+ ten_percent = ((103 * children) >> 10); -+ -+ /* If we have too many balancings at a time, the tree height can grow -+ by more than 1. Assume that if tree_height is 5, it can grow by 1 -+ only. -+ */ -+ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent)); -+} -+ -+/* this returns the maximal possible number of nodes which can be modified -+ plus the number of new nodes which can be required to perform an insertion -+ of one item into the tree */ -+/* it is only called when tree height changes, or gets initialized */ -+reiser4_block_nr calc_estimate_one_insert(tree_level height) -+{ -+ return 1 + max_balance_overhead(1, height); -+} -+ -+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree) -+{ -+ return tree->estimate_one_insert; -+} -+ -+/* this returns the maximal possible number of nodes which can be modified -+ plus the number of new nodes which can be required to perform an insertion -+ of one unit into an item in the tree */ -+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree) -+{ -+ /* estimate insert into item just like item insertion */ -+ return tree->estimate_one_insert; -+} -+ -+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree) -+{ -+ /* on item removal reiser4 does not try to pack nodes more compactly, -+ so only one node may be dirtied on the leaf level */ -+ return tree->estimate_one_insert; -+} -+ -+/* on the leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes -+ and dirty 3 existing nodes (the insert point and both of its neighbors).
-+ max_balance_overhead() should estimate the number of blocks which may -+ change or get added on the internal levels */ -+reiser4_block_nr estimate_insert_flow(tree_level height) -+{ -+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 + -+ CARRY_FLOW_NEW_NODES_LIMIT, -+ height); -+} -+ -+/* returns the maximal number of nodes which can be occupied by a disk -+ cluster */ -+static reiser4_block_nr estimate_cluster(struct inode *inode, int unprepped) -+{ -+ int per_cluster; -+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode)); -+ return 3 + per_cluster + -+ max_balance_overhead(3 + per_cluster, -+ REISER4_MAX_ZTREE_HEIGHT); -+} -+ -+/* how many nodes might get dirty and added -+ during insertion of a disk cluster */ -+reiser4_block_nr estimate_insert_cluster(struct inode *inode) -+{ -+ return estimate_cluster(inode, 1); /* 24 */ -+} -+ -+/* how many nodes might get dirty and added -+ during update of a (prepped or unprepped) disk cluster */ -+reiser4_block_nr estimate_update_cluster(struct inode *inode) -+{ -+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */ -+} -+ -+/* How many nodes occupied by a disk cluster might get dirty. -+ Note that this estimation is not precise (i.e. a disk cluster -+ can occupy more nodes). -+ Q: Why don't we use a precise estimation? -+ A: 1. Because a precise estimation is fairly bad: 65536 nodes -+ for a 64K logical cluster means 256M of dead space on -+ a partition. -+ 2. It is a very rare case when a disk cluster occupies more -+ nodes than this estimation returns. -+*/ -+reiser4_block_nr estimate_dirty_cluster(struct inode *inode) -+{ -+ return cluster_nrpages(inode) + 4; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/export_ops.c linux-2.6.30/fs/reiser4/export_ops.c ---- linux-2.6.30.orig/fs/reiser4/export_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/export_ops.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,328 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "inode.h" -+#include "plugin/plugin.h" -+ -+/* -+ * Supported file-handle types -+ */ -+typedef enum { -+ FH_WITH_PARENT = 0x10, /* file handle with parent */ -+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */ -+} reiser4_fhtype; -+ -+#define NFSERROR (255) -+ -+/* initialize place-holder for object */ -+static void object_on_wire_init(reiser4_object_on_wire *o) -+{ -+ o->plugin = NULL; -+} -+ -+/* finish with @o */ -+static void object_on_wire_done(reiser4_object_on_wire *o) -+{ -+ if (o->plugin != NULL) -+ o->plugin->wire.done(o); -+} -+ -+/* -+ * read serialized object identity from @addr and store information about the -+ * object in @obj. This is dual to encode_inode(). -+ */ -+static char *decode_inode(struct super_block *s, char *addr, -+ reiser4_object_on_wire * obj) -+{ -+ file_plugin *fplug; -+ -+ /* identifier of object plugin is stored in the first two bytes, -+ * followed by... */ -+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr); -+ if (fplug != NULL) { -+ addr += sizeof(d16); -+ obj->plugin = fplug; -+ assert("nikita-3520", fplug->wire.read != NULL); -+ /* plugin specific encoding of object identity.
*/ -+ addr = fplug->wire.read(addr, obj); -+ } else -+ addr = ERR_PTR(RETERR(-EINVAL)); -+ return addr; -+} -+ -+static struct dentry *reiser4_get_dentry(struct super_block *super, -+ void *data); -+/** -+ * reiser4_decode_fh: decode on-wire object - helper function -+ * for fh_to_dentry, fh_to_parent export operations; -+ * @super: super block; -+ * @addr: on-wire object to be decoded; -+ * -+ * Returns dentry referring to the object being decoded. -+ */ -+static struct dentry *reiser4_decode_fh(struct super_block * super, -+ char * addr) -+{ -+ reiser4_object_on_wire object; -+ -+ object_on_wire_init(&object); -+ -+ addr = decode_inode(super, addr, &object); -+ if (!IS_ERR(addr)) { -+ struct dentry *d; -+ d = reiser4_get_dentry(super, &object); -+ if (d != NULL && !IS_ERR(d)) -+ /* FIXME check for -ENOMEM */ -+ reiser4_get_dentry_fsdata(d)->stateless = 1; -+ addr = (char *)d; -+ } -+ object_on_wire_done(&object); -+ return (void *)addr; -+} -+ -+static struct dentry *reiser4_fh_to_dentry(struct super_block *sb, -+ struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ reiser4_context *ctx; -+ struct dentry *d; -+ -+ assert("edward-1536", -+ fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT); -+ -+ ctx = reiser4_init_context(sb); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ -+ d = reiser4_decode_fh(sb, (char *)fid->raw); -+ -+ reiser4_exit_context(ctx); -+ return d; -+} -+ -+static struct dentry *reiser4_fh_to_parent(struct super_block *sb, -+ struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ char * addr; -+ struct dentry * d; -+ reiser4_context *ctx; -+ file_plugin *fplug; -+ -+ if (fh_type == FH_WITHOUT_PARENT) -+ return NULL; -+ assert("edward-1537", fh_type == FH_WITH_PARENT); -+ -+ ctx = reiser4_init_context(sb); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ addr = (char *)fid->raw; -+ /* extract the 2-byte file plugin id */ -+ fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr); -+ if (fplug == NULL) { -+ d = ERR_PTR(RETERR(-EINVAL)); -+ goto exit; -+ } -+ addr += sizeof(d16); -+ /* skip previously encoded object */ -+ addr = fplug->wire.read(addr, NULL /* skip */); -+ if (IS_ERR(addr)) { -+ d = (struct dentry *)addr; -+ goto exit; -+ } -+ /* extract and decode parent object */ -+ d = reiser4_decode_fh(sb, addr); -+ exit: -+ reiser4_exit_context(ctx); -+ return d; -+} -+ -+/* -+ * Object serialization support. -+ * -+ * To support knfsd the file system provides export_operations that are used -+ * to construct and interpret NFS file handles. As a generalization of this, -+ * reiser4 object plugins have serialization support: it provides methods to -+ * create an on-wire representation of the identity of a reiser4 object, and -+ * to re-create/locate an object given its on-wire identity. -+ * -+ */ -+ -+/* -+ * return the number of bytes that the on-wire representation of @inode's -+ * identity consumes. -+ */ -+static int encode_inode_size(struct inode *inode) -+{ -+ assert("nikita-3514", inode != NULL); -+ assert("nikita-3515", inode_file_plugin(inode) != NULL); -+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL); -+ -+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16); -+} -+ -+/* -+ * store the on-wire representation of @inode's identity at the area -+ * beginning at @start.
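-+ * -+ * Returns a pointer just past the stored data, as advanced by the plugin's -+ * ->wire.write() method (the dual of decode_inode() above).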
-+ */ -+static char *encode_inode(struct inode *inode, char *start) -+{ -+ assert("nikita-3517", inode != NULL); -+ assert("nikita-3518", inode_file_plugin(inode) != NULL); -+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL); -+ -+ /* -+ * first, store the two-byte identifier of the object plugin, then -+ */ -+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)), -+ (d16 *) start); -+ start += sizeof(d16); -+ /* -+ * call the plugin to serialize the object's identity -+ */ -+ return inode_file_plugin(inode)->wire.write(inode, start); -+} -+ -+/* this returns the number of 32-bit numbers encoded in @lenp. 255 is -+ * returned if the file handle cannot be stored */ -+/** -+ * reiser4_encode_fh - encode_fh of export operations -+ * @dentry: -+ * @fh: -+ * @lenp: -+ * @need_parent: -+ * -+ */ -+static int -+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, -+ int need_parent) -+{ -+ struct inode *inode; -+ struct inode *parent; -+ char *addr; -+ int need; -+ int delta; -+ int result; -+ reiser4_context *ctx; -+ -+ /* -+ * knfsd asks us to serialize the object in @dentry, and, optionally, -+ * its parent (if need_parent != 0). -+ * -+ * encode_inode() and encode_inode_size() are used to build the -+ * representation of the object and its parent. All hard work is done -+ * by the object plugins. -+ */ -+ inode = dentry->d_inode; -+ parent = dentry->d_parent->d_inode; -+ -+ addr = (char *)fh; -+ -+ need = encode_inode_size(inode); -+ if (need < 0) -+ return NFSERROR; -+ if (need_parent) { -+ delta = encode_inode_size(parent); -+ if (delta < 0) -+ return NFSERROR; -+ need += delta; -+ } -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ if (need <= sizeof(__u32) * (*lenp)) { -+ addr = encode_inode(inode, addr); -+ if (need_parent) -+ addr = encode_inode(parent, addr); -+ -+ /* store in lenp the number of 32-bit words required for the -+ * file handle. */ -+ *lenp = (need + sizeof(__u32) - 1) >> 2; -+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT; -+ } else -+ /* not enough space in the file handle */ -+ result = NFSERROR; -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * reiser4_get_dentry_parent - get_parent of export operations -+ * @child: -+ * -+ */ -+static struct dentry *reiser4_get_dentry_parent(struct dentry *child) -+{ -+ struct inode *dir; -+ dir_plugin *dplug; -+ struct dentry *result; -+ reiser4_context *ctx; -+ -+ assert("nikita-3527", child != NULL); -+ -+ dir = child->d_inode; -+ assert("nikita-3529", dir != NULL); -+ -+ ctx = reiser4_init_context(dir->i_sb); -+ if (IS_ERR(ctx)) -+ return (void *)ctx; -+ -+ dplug = inode_dir_plugin(dir); -+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL)); -+ -+ if (unlikely(dplug == NULL)) { -+ reiser4_exit_context(ctx); -+ return ERR_PTR(RETERR(-ENOTDIR)); -+ } -+ result = dplug->get_parent(dir); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * reiser4_get_dentry - get_dentry of export operations -+ * @super: -+ * @data: -+ * -+ * -+ */ -+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data) -+{ -+ reiser4_object_on_wire *o; -+ -+ assert("nikita-3522", super != NULL); -+ assert("nikita-3523", data != NULL); -+ /* -+ * this is only supposed to be called by -+ * -+ * reiser4_decode_fh->find_exported_dentry -+ * -+ * so, reiser4_context should be here already.
-+ */ -+ assert("nikita-3526", is_in_reiser4_context()); -+ -+ o = (reiser4_object_on_wire *)data; -+ assert("nikita-3524", o->plugin != NULL); -+ assert("nikita-3525", o->plugin->wire.get != NULL); -+ -+ return o->plugin->wire.get(super, o); -+} -+ -+struct export_operations reiser4_export_operations = { -+ .encode_fh = reiser4_encode_fh, -+ .fh_to_dentry = reiser4_fh_to_dentry, -+ .fh_to_parent = reiser4_fh_to_parent, -+ .get_parent = reiser4_get_dentry_parent, -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/flush.c linux-2.6.30/fs/reiser4/flush.c ---- linux-2.6.30.orig/fs/reiser4/flush.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/flush.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,3703 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/plugin.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "carry.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "super.h" -+#include "entd.h" -+#include "reiser4.h" -+#include "flush.h" -+#include "writeout.h" -+ -+#include <asm/atomic.h> -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/mm.h> /* for struct page */ -+#include <linux/bio.h> /* for struct bio */ -+#include <linux/pagemap.h> -+#include <linux/blkdev.h> -+ -+/* IMPLEMENTATION NOTES */ -+ -+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of -+ assigning a total order to the nodes of the tree in which the parent is -+ placed before its children, which are ordered (recursively) in left-to-right -+ order. When we speak of a "parent-first preceder", it describes the node that -+ "came before in forward parent-first order". When we speak of a "parent-first -+ follower", it describes the node that "comes next in parent-first order" -+ (alternatively the node that "came before in reverse parent-first order"). -+ -+ The following pseudo-code prints the nodes of a tree in forward parent-first -+ order: -+ -+ void parent_first (node) -+ { -+ print_node (node); -+ if (node->level > leaf) { -+ for (i = 0; i < num_children; i += 1) { -+ parent_first (node->child[i]); -+ } -+ } -+ } -+*/ -+ -+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block -+ allocation so that a left-to-right scan of the tree's data (i.e., the leaves -+ in left-to-right order) can be accomplished with sequential reads, which -+ results in reading nodes in their parent-first order. This is a -+ read-optimization aspect of the flush algorithm, and there is also a -+ write-optimization aspect, which is that we wish to make large sequential -+ writes to the disk by allocating or reallocating blocks so that they can be -+ written in sequence. Sometimes the read-optimization and write-optimization -+ goals conflict with each other, as we discuss in more detail below. -+*/ -+ -+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. 
-+ Here are the relevant jnode->state bits and their relevance to flush: -+ -+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be -+ written it must be allocated first. In order to be considered allocated, -+ the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These -+ two bits are exclusive, and all dirtied jnodes eventually have one of these -+ bits set during each transaction. -+ -+ JNODE_CREATED: The node was freshly created in its transaction and has no -+ previous block address, so it is unconditionally assigned to be relocated, -+ although this is mainly for code-convenience. It is not being 'relocated' -+ from anything, but in almost every regard it is treated as part of the -+ relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is -+ set, so the actual relocate can be distinguished from the -+ created-and-allocated set easily: relocate-set members (belonging to the -+ preserve-set) have (JNODE_RELOC) set and created-set members which have no -+ previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set. -+ -+ JNODE_OVRWR: The node belongs to the atom's overwrite set. The flush -+ algorithm made the decision to maintain the pre-existing location for this -+ node and it will be written to the wandered-log. -+ -+ JNODE_RELOC: The flush algorithm made the decision to relocate this block -+ (if it was not created, see note above). A block with JNODE_RELOC set is -+ eligible for early-flushing and may be submitted during flush_empty_queues. -+ When the JNODE_RELOC bit is set on a znode, the parent node's internal item -+ is modified and the znode is rehashed. -+ -+ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm -+ scans the node and calls the plugin->f.squeeze() method for its items. By -+ this technique we update disk clusters of cryptcompress objects. Also, if -+ the leftmost point found by the flush scan has this flag (races with -+ write(), a rare case), the flush algorithm makes the decision to pass it to -+ squalloc() in spite of its flushprepped status, for squeezing, not for -+ repeated allocation. -+ -+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode -+ into its flush queue. This means the jnode is not on any clean or dirty -+ list, instead it is moved to one of the flush queue (see flush_queue.h) -+ object private lists. This prevents multiple concurrent flushes from -+ attempting to start flushing from the same node. -+ -+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up -+ squeeze-and-allocate on a node while its children are actively being -+ squeezed and allocated. This flag was created to avoid submitting a write -+ request for a node while its children are still being allocated and -+ squeezed. Then the flush queue was re-implemented to allow an unlimited -+ number of nodes to be queued. This flag support was commented out in the -+ source code because we decided that there was no reason to submit queued -+ nodes before jnode_flush() finishes. However, current code calls fq_write() -+ during a slum traversal and may submit "busy nodes" to disk. Probably we -+ can re-enable the JNODE_FLUSH_BUSY bit support in future. -+ -+ With these state bits, we describe a test used frequently in the code -+ below, jnode_is_flushprepped() (and the spin-lock-taking -+ jnode_check_flushprepped()).
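-+ -+ In the pseudo-code style used above, the test amounts to the following -+ (only a sketch with invented helper names; the real inline helpers test -+ the JNODE_* state bits listed above, and the "check" variant takes the -+ jnode spin-lock around the test): -+ -+ int jnode_is_flushprepped (node) -+ { -+ return !dirty (node) || reloc (node) || ovrwr (node); -+ } -+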
The test for "flushprepped" returns true if any of the following are true: -+ -+ - The node is not dirty -+ - The node has JNODE_RELOC set -+ - The node has JNODE_OVRWR set -+ -+ If either the node is not dirty or it has already been processed by flush -+ (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If -+ jnode_is_flushprepped() returns false then flush has work to do on that -+ node. -+*/ -+ -+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is -+ never flushprepped twice (unless an explicit call to flush_unprep is made -+ as described in detail below). For example a node is dirtied, allocated, -+ and then early-flushed to disk and set clean. Before the transaction -+ commits, the page is dirtied again and, due to memory pressure, the node -+ is flushed again. The flush algorithm will not relocate the node to a new -+ disk location, it will simply write it to the same, previously relocated -+ position again. -+*/ -+ -+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up -+ algorithm where we start at a leaf node and allocate in parent-first order -+ by iterating to the right. At each step of the iteration, we check for the -+ right neighbor. Before advancing to the right neighbor, we check if the -+ current position and the right neighbor share the same parent. If they do -+ not share the same parent, the parent is allocated before the right -+ neighbor. -+ -+ This process goes recursively up the tree and squeezes nodes level by -+ level as long as the right neighbor and the current position have -+ different parents, then it allocates the -+ right-neighbors-with-different-parents on the way back down. This process -+ is described in more detail in flush_squalloc_changed_ancestor and the -+ recursive function squalloc_one_changed_ancestor. But the purpose here is -+ not so much to discuss the specifics of the bottom-up approach as to -+ contrast the bottom-up and top-down approaches. -+ -+ The top-down algorithm was implemented earlier (April-May 2002). In the -+ top-down approach, we find a starting point by scanning left along each -+ level past dirty nodes, then going up and repeating the process until the -+ left node and the parent node are clean. We then perform a parent-first -+ traversal from the starting point, which makes allocating in parent-first -+ order trivial. After one subtree has been allocated in this manner, we -+ move to the right, try moving upward, then repeat the parent-first -+ traversal. -+ -+ Both approaches have problems that need to be addressed. Both are -+ approximately the same amount of code, but the bottom-up approach has -+ advantages in the order it acquires locks which, at the very least, make -+ it the better approach. At first glance each one makes the other one look -+ simpler, so it is important to remember a few of the problems with each -+ one. -+ -+ Main problem with the top-down approach: When you encounter a clean child -+ during the parent-first traversal, what do you do? You would like to avoid -+ searching through a large tree of nodes just to find a few dirty leaves at -+ the bottom, and there is not an obvious solution. One of the advantages of -+ the top-down approach is that during the parent-first traversal you check -+ every child of a parent to see if it is dirty. In this way, the top-down -+ approach easily handles the main problem of the bottom-up approach: -+ unallocated children.
-+
-+ The unallocated children problem is that before writing a node to disk we
-+ must make sure that all of its children are allocated. Otherwise, writing
-+ the node means extra I/O because the node will have to be written again when
-+ the child is finally allocated.
-+
-+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs,
-+ this should not cause any file system corruption, it only degrades I/O
-+ performance because a node may be written when it is sure to be written at
-+ least one more time in the same transaction when the remaining children are
-+ allocated. What follows is a description of how we will solve the problem.
-+*/
-+
-+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node,
-+ then, proceeding in parent-first order, allocate some of its left-children,
-+ then encounter a clean child in the middle of the parent. We do not allocate
-+ the clean child, but there may remain unallocated (dirty) children to the
-+ right of the clean child. If we were to stop flushing at this moment and
-+ write everything to disk, the parent might still contain unallocated
-+ children.
-+
-+ We could try to allocate all the descendants of every node that we allocate,
-+ but this is not necessary. Doing so could result in allocating the entire
-+ tree: if the root node is allocated then every unallocated node would have to
-+ be allocated before flushing. Actually, we do not have to write a node just
-+ because we allocate it. It is possible to allocate but not write a node
-+ during flush, when it still has unallocated children. However, this approach
-+ is probably not optimal for the following reason.
-+
-+ The flush algorithm is designed to allocate nodes in parent-first order in an
-+ attempt to optimize reads that occur in the same order. Thus we are
-+ read-optimizing for a left-to-right scan through all the leaves in the
-+ system, and we are hoping to write-optimize at the same time because those
-+ nodes will be written together in batch. What happens, however, if we assign
-+ a block number to a node in its read-optimized order but then avoid writing
-+ it because it has unallocated children? In that situation, we lose out on the
-+ write-optimization aspect because a node will have to be written again to
-+ its location on the device, later, which likely means seeking back to that
-+ location.
-+
-+ So there are tradeoffs. We can choose either:
-+
-+ A. Allocate all unallocated children to preserve both write-optimization and
-+ read-optimization, but this is not always desirable because it may mean
-+ having to allocate and flush very many nodes at once.
-+
-+ B. Defer writing nodes with unallocated children, keep their read-optimized
-+ locations, but sacrifice write-optimization because those nodes will be
-+ written again.
-+
-+ C. Defer writing nodes with unallocated children, but do not keep their
-+ read-optimized locations. Instead, choose to write-optimize them later, when
-+ they are written. To facilitate this, we "undo" the read-optimized allocation
-+ that was given to the node so that later it can be write-optimized, thus
-+ "unpreparing" the flush decision. This is a case where we disturb the
-+ FLUSH_PREP_ONCE_PER_TRANSACTION rule described above.
By a call to
-+ flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
-+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate
-+ its block location, and set the JNODE_CREATED bit, effectively setting the
-+ node back to an unallocated state.
-+
-+ We will take the following approach in v4.0: for twig nodes we will always
-+ finish allocating unallocated children (A). For nodes with (level > TWIG)
-+ we will defer writing and choose write-optimization (C).
-+
-+ To summarize, there are several parts to a solution that avoids the problem
-+ with unallocated children:
-+
-+ FIXME-ZAM: So far none of these approaches has been implemented to eliminate
-+ the "UNALLOCATED CHILDREN" problem, because an experiment showed that we see
-+ only 1-2 nodes with unallocated children for thousands of written nodes. The
-+ experiment was as simple as copying and deleting the Linux kernel sources.
-+ However, the problem can arise in more complex tests. We could use
-+ jnode_io_hook to insert a check for unallocated children and see what kind
-+ of problem we actually have.
-+
-+ 1. When flush reaches a stopping point (e.g. a clean node) it should continue
-+ calling squeeze-and-allocate on any remaining unallocated children.
-+ FIXME: Difficulty to implement: should be simple -- amounts to adding a while
-+ loop to jnode_flush, see comments in that function.
-+
-+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes
-+ may still have unallocated children. If the twig level has unallocated
-+ children it is an assertion failure. If a higher-level node has unallocated
-+ children, then it should be explicitly de-allocated by a call to
-+ flush_unprep().
-+ FIXME: Difficulty to implement: should be simple.
-+
-+ 3. (CPU-Optimization) Checking whether a node has unallocated children may
-+ consume more CPU cycles than we would like, and it is possible (but medium
-+ complexity) to optimize this somewhat in the case where large sub-trees are
-+ flushed. The following observation helps: if both the left- and
-+ right-neighbor of a node are processed by the flush algorithm then the node
-+ itself is guaranteed to have all of its children allocated. However, the cost
-+ of this check may not be so expensive after all: it is not needed for leaves
-+ and flush can guarantee this property for twigs. That leaves only (level >
-+ TWIG) nodes that have to be checked, so this optimization only helps if at
-+ least three (level > TWIG) nodes are flushed in one pass, and the savings
-+ will be very small unless there are many more (level > TWIG) nodes. But if
-+ there are many (level > TWIG) nodes then the number of blocks being written
-+ will be very large, so the savings may be insignificant. That said, the idea
-+ is to maintain both the left and right edges of nodes that are processed in
-+ flush. When flush_empty_queue() is called, a relatively simple test will
-+ tell whether the (level > TWIG) node is on the edge. If it is on the edge,
-+ the slow check is necessary, but if it is in the interior then it can be
-+ assumed to have all of its children allocated. FIXME: medium complexity to
-+ implement, but simple to verify given that we must have a slow check anyway.
-+
-+ 4. (Optional) This part is optional, not for v4.0--flush should work
-+ independently of whether this option is used or not. Called RAPID_SCAN, the
-+ idea is to amend the left-scan operation to take unallocated children into
-+ account.
Normally, the left-scan operation goes left as long as adjacent
-+ nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at
-+ which point it stops and begins flushing. But scan-left may stop at a
-+ position where there are unallocated children to the left with the same
-+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops
-+ after FLUSH_RELOCATE_THRESHOLD, which is much smaller than
-+ FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips
-+ all the interior children of a node--if the leftmost child of a twig is
-+ dirty, check its left neighbor (the rightmost child of the twig to the left).
-+ If the left neighbor of the leftmost child is also dirty, then continue the
-+ scan at the left twig and repeat. This option will cause flush to allocate
-+ more twigs in a single pass, but it also has the potential to write many more
-+ nodes than would otherwise be written without the RAPID_SCAN option.
-+ RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD.
-+*/
-+
-+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that
-+ the starting point for flush is a leaf node, but actually the flush code
-+ cares very little about whether or not this is true. It is possible that all
-+ the leaf nodes are flushed and dirty parent nodes still remain, in which case
-+ jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats
-+ the argument node as if it were a leaf, even when it is not. This is a simple
-+ approach, and there may be a more optimal policy but until a problem with
-+ this approach is discovered, simplest is probably best.
-+
-+ NOTE: In this case, the ordering produced by flush is parent-first only if
-+ you ignore the leaves. This is done as a matter of simplicity and there is
-+ only one (shaky) justification. When an atom commits, it flushes all leaf
-+ level nodes first, followed by twigs, and so on. With flushing done in this
-+ order, if flush is eventually called on a non-leaf node it means that
-+ (somehow) we reached a point where all leaves are clean and only internal
-+ nodes need to be flushed. If that is the case, then it means there were no
-+ leaves that were the parent-first preceder/follower of the parent. This is
-+ expected to be a rare case, which is why we do nothing special about it.
-+ However, memory pressure may pass an internal node to flush when there are
-+ still dirty leaf nodes that need to be flushed, which could prove our
-+ original assumptions "inoperative". If this needs to be fixed, then
-+ scan_left/right should have special checks for the non-leaf levels. For
-+ example, instead of passing from a node to the left neighbor, it should pass
-+ from the node to the left neighbor's rightmost descendant (if dirty).
-+
-+*/
-+
-+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB
-+ chunks, dirtying everything and putting it into a transaction. We tell the
-+ allocator to allocate the blocks as far as possible towards one end of the
-+ logical device--the left (starting) end of the device if we are walking from
-+ left to right, the right end of the device if we are walking from right to
-+ left. We then make passes in alternating directions, and as we do this the
-+ device becomes sorted such that tree order and block number order fully
-+ correlate.
-+
-+ Resizing is done by shifting everything either all the way to the left or all
-+ the way to the right, and then reporting the last block.
-+*/
-+
-+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.
-+ This describes the policy from the highest level:
-+
-+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive
-+ nodes on the leaf level during flush-scan (right, left), then we
-+ unconditionally decide to relocate leaf nodes.
-+
-+ Otherwise, there are two contexts in which we make a decision to relocate:
-+
-+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
-+ During the initial stages of flush, after scan-right completes, we want to
-+ ask the question: should we relocate this leaf node and thus dirty the parent
-+ node. Then if the node is a leftmost child its parent is its own parent-first
-+ preceder, thus we repeat the question at the next level up, and so on. In
-+ these cases we are moving in the reverse parent-first direction.
-+
-+ There is another case which is considered the reverse direction, which comes
-+ at the end of a twig in reverse_relocate_end_of_twig(). As we finish
-+ processing a twig we may reach a point where there is a clean twig to the
-+ right with a dirty leftmost child. In this case, we may wish to relocate the
-+ child by testing if it should be relocated relative to its parent.
-+
-+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done
-+ in allocate_znode. What distinguishes the forward parent-first case from the
-+ reverse parent-first case is that the preceder has already been allocated in
-+ the forward case, whereas in the reverse case we don't know what the preceder
-+ is until we finish "going in reverse". That simplifies the forward case
-+ considerably, and there we actually use the block allocator to determine
-+ whether, e.g., a block closer to the preceder is available.
-+*/
-+
-+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is,
-+ once we finish scan-left and find a starting point, if the parent's left
-+ neighbor is dirty then squeeze the parent's left neighbor and the parent.
-+ This may change the flush-starting-node's parent. Repeat until the child's
-+ parent is stable. If the child is a leftmost child, repeat this left-edge
-+ squeezing operation at the next level up. Note that we cannot allocate
-+ extents during this or they will be out of parent-first order. There are
-+ also some difficult coordinate maintenance issues. We can't do a tree search
-+ to find coordinates again (because we hold locks), we have to determine them
-+ from the two nodes being squeezed. Looks difficult, but has potential to
-+ increase space utilization. */
-+
-+/* Flush-scan helper functions. */
-+static void scan_init(flush_scan * scan);
-+static void scan_done(flush_scan * scan);
-+
-+/* Flush-scan algorithm. */
-+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
-+		     unsigned limit);
-+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
-+static int scan_common(flush_scan * scan, flush_scan * other);
-+static int scan_formatted(flush_scan * scan);
-+static int scan_unformatted(flush_scan * scan, flush_scan * other);
-+static int scan_by_coord(flush_scan * scan);
-+
-+/* Initial flush-point ancestor allocation. */
-+static int alloc_pos_and_ancestors(flush_pos_t *pos);
-+static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos);
-+static int set_preceder(const coord_t *coord_in, flush_pos_t *pos);
-+
-+/* Main flush algorithm.
-+   Note on abbreviation: "squeeze and allocate" == "squalloc".
*/
-+static int squalloc(flush_pos_t *pos);
-+
-+/* Flush squeeze implementation. */
-+static int squeeze_right_non_twig(znode * left, znode * right);
-+static int shift_one_internal_unit(znode * left, znode * right);
-+
-+/* Flush reverse parent-first relocation routines. */
-+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
-+					    const reiser4_block_nr * nblk);
-+static int reverse_relocate_test(jnode * node, const coord_t *parent_coord,
-+				 flush_pos_t *pos);
-+static int reverse_relocate_check_dirty_parent(jnode * node,
-+					       const coord_t *parent_coord,
-+					       flush_pos_t *pos);
-+
-+/* Flush allocate write-queueing functions: */
-+static int allocate_znode(znode * node, const coord_t *parent_coord,
-+			  flush_pos_t *pos);
-+static int allocate_znode_update(znode * node, const coord_t *parent_coord,
-+				 flush_pos_t *pos);
-+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
-+
-+/* Flush helper functions: */
-+static int jnode_lock_parent_coord(jnode * node,
-+				   coord_t *coord,
-+				   lock_handle * parent_lh,
-+				   load_count * parent_zh,
-+				   znode_lock_mode mode, int try);
-+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
-+			    znode_lock_mode mode, int check_dirty, int expected);
-+static int znode_same_parents(znode * a, znode * b);
-+
-+static int znode_check_flushprepped(znode * node)
-+{
-+	return jnode_check_flushprepped(ZJNODE(node));
-+}
-+
-+/* Flush position functions */
-+static void pos_init(flush_pos_t *pos);
-+static int pos_valid(flush_pos_t *pos);
-+static void pos_done(flush_pos_t *pos);
-+static int pos_stop(flush_pos_t *pos);
-+
-+/* check that @org is the first jnode of the extent unit, if the extent is
-+ * unallocated, because all jnodes of an unallocated extent are dirty and of
-+ * the same atom. */
-+#define checkchild(scan)						\
-+assert("nikita-3435",							\
-+       ergo(scan->direction == LEFT_SIDE &&				\
-+	    (scan->parent_coord.node->level == TWIG_LEVEL) &&		\
-+	    jnode_is_unformatted(scan->node) &&				\
-+	    extent_is_unallocated(&scan->parent_coord),			\
-+	    extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
-+
-+/* This flush_cnt variable is used to track the number of concurrent flush
-+   operations, useful for debugging. It is initialized in txnmgr.c out of
-+   laziness (because flush has no static initializer function...) */
-+ON_DEBUG(atomic_t flush_cnt;
-+	 )
-+
-+/* check fs backing device for write congestion */
-+static int check_write_congestion(void)
-+{
-+	struct super_block *sb;
-+	struct backing_dev_info *bdi;
-+
-+	sb = reiser4_get_current_sb();
-+	bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
-+	return bdi_write_congested(bdi);
-+}
-+
-+/* conditionally write flush queue */
-+static int write_prepped_nodes(flush_pos_t *pos)
-+{
-+	int ret;
-+
-+	assert("zam-831", pos);
-+	assert("zam-832", pos->fq);
-+
-+	if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
-+		return 0;
-+
-+	if (check_write_congestion())
-+		return 0;
-+
-+	ret = reiser4_write_fq(pos->fq, pos->nr_written,
-+			       WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
-+	return ret;
-+}
-+
-+/* Properly release all flush position
resources, then move the flush position to the new
-+   locked node */
-+static void move_flush_pos(flush_pos_t *pos, lock_handle * new_lock,
-+			   load_count * new_load, const coord_t *new_coord)
-+{
-+	assert("zam-857", new_lock->node == new_load->node);
-+
-+	if (new_coord) {
-+		assert("zam-858", new_coord->node == new_lock->node);
-+		coord_dup(&pos->coord, new_coord);
-+	} else {
-+		coord_init_first_unit(&pos->coord, new_lock->node);
-+	}
-+
-+	if (pos->child) {
-+		jput(pos->child);
-+		pos->child = NULL;
-+	}
-+
-+	move_load_count(&pos->load, new_load);
-+	done_lh(&pos->lock);
-+	move_lh(&pos->lock, new_lock);
-+}
-+
-+/* delete an empty node whose link from the parent still exists. */
-+static int delete_empty_node(znode * node)
-+{
-+	reiser4_key smallest_removed;
-+
-+	assert("zam-1019", node != NULL);
-+	assert("zam-1020", node_is_empty(node));
-+	assert("zam-1023", znode_is_wlocked(node));
-+
-+	return reiser4_delete_node(node, &smallest_removed, NULL, 1);
-+}
-+
-+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
-+static int prepare_flush_pos(flush_pos_t *pos, jnode * org)
-+{
-+	int ret;
-+	load_count load;
-+	lock_handle lock;
-+
-+	init_lh(&lock);
-+	init_load_count(&load);
-+
-+	if (jnode_is_znode(org)) {
-+		ret = longterm_lock_znode(&lock, JZNODE(org),
-+					  ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
-+		if (ret)
-+			return ret;
-+
-+		ret = incr_load_count_znode(&load, JZNODE(org));
-+		if (ret)
-+			return ret;
-+
-+		pos->state =
-+		    (jnode_get_level(org) ==
-+		     LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
-+		move_flush_pos(pos, &lock, &load, NULL);
-+	} else {
-+		coord_t parent_coord;
-+		ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
-+					      &load, ZNODE_WRITE_LOCK, 0);
-+		if (ret)
-+			goto done;
-+		if (!item_is_extent(&parent_coord)) {
-+			/* file was converted to tail, org became HB, we found
-+			   an internal item */
-+			ret = -EAGAIN;
-+			goto done;
-+		}
-+
-+		pos->state = POS_ON_EPOINT;
-+		move_flush_pos(pos, &lock, &load, &parent_coord);
-+		pos->child = jref(org);
-+		if (extent_is_unallocated(&parent_coord)
-+		    && extent_unit_index(&parent_coord) != index_jnode(org)) {
-+			/* @org is not the first child of its parent unit.
-+			   This may happen because the long term lock of its
-+			   parent node was released between scan_left and
-+			   scan_right. For now, work around this by having
-+			   flush repeat */
-+			ret = -EAGAIN;
-+		}
-+	}
-+
-+done:
-+	done_load_count(&load);
-+	done_lh(&lock);
-+	return ret;
-+}
-+
-+/* TODO LIST (no particular order): */
-+/* I have labelled most of the legitimate FIXME comments in this file with
-+   letters to indicate which issue they relate to. There are a few
-+   miscellaneous FIXMEs with specific names mentioned instead that need to be
-+   inspected/resolved. */
-+/* B. There is an issue described in reverse_relocate_test having to do with an
-+   imprecise is_preceder? check on partially-dirty extents. The code that sets
-+   preceder hints and computes the preceder is basically untested. Careful
-+   testing is needed to verify that preceder calculations are done correctly,
-+   since they do not affect correctness and mistakes will therefore not be
-+   caught during regular testing. */
-+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
-+   these are considered expected but unlikely conditions. Flush currently
-+   returns 0 (i.e., success but no progress, i.e., restart) whenever it
-+   receives any of these in jnode_flush().
Many of the calls that may produce one of
-+   these return values (i.e., longterm_lock_znode, reiser4_get_parent,
-+   reiser4_get_neighbor, ...) check some of these values themselves and, for
-+   instance, stop flushing instead of resulting in a restart. If any of these
-+   results are true error conditions then flush will go into a busy-loop, as we
-+   noticed during testing when a corrupt tree caused find_child_ptr to return
-+   ENOENT. This needs careful thought and testing of corner conditions.
-+*/
-+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
-+   created block is assigned a block number then early-flushed to disk. It is
-+   dirtied again and flush is called again. Concurrently, that block is
-+   deleted, and the de-allocation of its block number does not need to be
-+   deferred, since it is not part of the preserve set (i.e., it didn't exist
-+   before the transaction). I think there may be a race condition where flush
-+   writes the dirty, created block after the non-deferred deallocated block
-+   number is re-allocated, making it possible to write deleted data on top of
-+   non-deleted data. It's just a theory, but it needs to be thought out. */
-+/* F. bio_alloc() failure is not handled gracefully. */
-+/* G. Unallocated children. */
-+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered
-+   blocks. */
-+/* I. Rename flush-scan to scan-point (flush-pos to flush-point?) */
-+
-+/* JNODE_FLUSH: MAIN ENTRY POINT */
-+/* This is the main entry point for flushing a jnode and its dirty neighborhood
-+   (the dirty neighborhood is named "slum"). jnode_flush() is called when
-+   reiser4 has to write dirty blocks to disk; this happens when the Linux VM
-+   decides to reduce the number of dirty pages, or as part of a transaction
-+   commit.
-+
-+   Our objective here is to prep and flush the slum the jnode belongs to. We
-+   want to squish the slum together, and allocate the nodes in it as we squish
-+   because allocation of children affects squishing of parents.
-+
-+   The "argument" @node tells flush where to start. From there, flush finds the
-+   left edge of the slum, and calls squalloc (in which nodes are squeezed and
-+   allocated). To find a "better place" to start squalloc, we first perform a
-+   flush_scan.
-+
-+   Flush-scanning may be performed in both left and right directions, but for
-+   different purposes. When scanning to the left, we are searching for a node
-+   that precedes a sequence of parent-first-ordered nodes which we will then
-+   flush in parent-first order. During flush-scanning, we also take the
-+   opportunity to count the number of consecutive leaf nodes. If this number is
-+   past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
-+   reallocate leaf nodes (thus favoring write-optimization).
-+
-+   Since the flush argument node can be anywhere in a sequence of dirty leaves,
-+   there may also be dirty nodes to the right of the argument. If the scan-left
-+   operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
-+   follow it with a right-scan operation to see whether there are, in fact,
-+   enough nodes to meet the relocate threshold. Each right- and left-scan
-+   operation uses a single flush_scan object.
-+
-+   After left-scan and possibly right-scan, we prepare a flush_position object
-+   with the starting flush point or parent coordinate, which was determined
-+   using scan-left.
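-+
-+   (In rough outline -- a sketch of the control flow only; the function body
-+   below is the authoritative version:
-+
-+	scan_left(...);
-+	if (not enough nodes were counted)
-+		scan_right(...);
-+	prepare_flush_pos(...);
-+	alloc_pos_and_ancestors(...);
-+	squalloc(...);
-+	...and finally the flush queue is written out.)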
-+ -+ Next we call the main flush routine, squalloc, which iterates along the leaf -+ level, squeezing and allocating nodes (and placing them into the flush -+ queue). -+ -+ After squalloc returns we take extra steps to ensure that all the children -+ of the final twig node are allocated--this involves repeating squalloc -+ until we finish at a twig with no unallocated children. -+ -+ Finally, we call flush_empty_queue to submit write-requests to disk. If we -+ encounter any above-twig nodes during flush_empty_queue that still have -+ unallocated children, we flush_unprep them. -+ -+ Flush treats several "failure" cases as non-failures, essentially causing -+ them to start over. E_DEADLOCK is one example. -+ FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled -+ properly rather than restarting, but there are a bunch of cases to audit. -+*/ -+ -+static int -+jnode_flush(jnode * node, long nr_to_write, long *nr_written, -+ flush_queue_t *fq, int flags) -+{ -+ long ret = 0; -+ flush_scan *right_scan; -+ flush_scan *left_scan; -+ flush_pos_t *flush_pos; -+ int todo; -+ struct super_block *sb; -+ reiser4_super_info_data *sbinfo; -+ jnode *leftmost_in_slum = NULL; -+ -+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack())); -+ assert("nikita-3022", reiser4_schedulable()); -+ -+ assert("nikita-3185", -+ get_current_super_private()->delete_mutex_owner != current); -+ -+ /* allocate right_scan, left_scan and flush_pos */ -+ right_scan = -+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), -+ reiser4_ctx_gfp_mask_get()); -+ if (right_scan == NULL) -+ return RETERR(-ENOMEM); -+ left_scan = right_scan + 1; -+ flush_pos = (flush_pos_t *) (left_scan + 1); -+ -+ sb = reiser4_get_current_sb(); -+ sbinfo = get_super_private(sb); -+ -+ /* Flush-concurrency debug code */ -+#if REISER4_DEBUG -+ atomic_inc(&flush_cnt); -+#endif -+ -+ reiser4_enter_flush(sb); -+ -+ /* Initialize a flush position. */ -+ pos_init(flush_pos); -+ -+ flush_pos->nr_written = nr_written; -+ flush_pos->fq = fq; -+ flush_pos->flags = flags; -+ flush_pos->nr_to_write = nr_to_write; -+ -+ scan_init(right_scan); -+ scan_init(left_scan); -+ -+ /* First scan left and remember the leftmost scan position. If the -+ leftmost position is unformatted we remember its parent_coord. We -+ scan until counting FLUSH_SCAN_MAXNODES. -+ -+ If starting @node is unformatted, at the beginning of left scan its -+ parent (twig level node, containing extent item) will be long term -+ locked and lock handle will be stored in the -+ @right_scan->parent_lock. This lock is used to start the rightward -+ scan without redoing the tree traversal (necessary to find parent) -+ and, hence, is kept during leftward scan. As a result, we have to -+ use try-lock when taking long term locks during the leftward scan. -+ */ -+ ret = scan_left(left_scan, right_scan, -+ node, sbinfo->flush.scan_maxnodes); -+ if (ret != 0) -+ goto failed; -+ -+ leftmost_in_slum = jref(left_scan->node); -+ scan_done(left_scan); -+ -+ /* Then possibly go right to decide if we will use a policy of -+ relocating leaves. This is only done if we did not scan past (and -+ count) enough nodes during the leftward scan. If we do scan right, -+ we only care to go far enough to establish that at least -+ FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan -+ limit is the difference between left_scan.count and the threshold. 
*/ -+ -+ todo = sbinfo->flush.relocate_threshold - left_scan->count; -+ /* scan right is inherently deadlock prone, because we are -+ * (potentially) holding a lock on the twig node at this moment. -+ * FIXME: this is incorrect comment: lock is not held */ -+ if (todo > 0) { -+ ret = scan_right(right_scan, node, (unsigned)todo); -+ if (ret != 0) -+ goto failed; -+ } -+ -+ /* Only the right-scan count is needed, release any rightward locks -+ right away. */ -+ scan_done(right_scan); -+ -+ /* ... and the answer is: we should relocate leaf nodes if at least -+ FLUSH_RELOCATE_THRESHOLD nodes were found. */ -+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) || -+ (left_scan->count + right_scan->count >= -+ sbinfo->flush.relocate_threshold); -+ -+ /* Funny business here. We set the 'point' in the flush_position at -+ prior to starting squalloc regardless of whether the first point is -+ formatted or unformatted. Without this there would be an invariant, -+ in the rest of the code, that if the flush_position is unformatted -+ then flush_position->point is NULL and -+ flush_position->parent_{lock,coord} is set, and if the flush_position -+ is formatted then flush_position->point is non-NULL and no parent -+ info is set. -+ -+ This seems lazy, but it makes the initial calls to -+ reverse_relocate_test (which ask "is it the pos->point the leftmost -+ child of its parent") much easier because we know the first child -+ already. Nothing is broken by this, but the reasoning is subtle. -+ Holding an extra reference on a jnode during flush can cause us to -+ see nodes with HEARD_BANSHEE during squalloc, because nodes are not -+ removed from sibling lists until they have zero reference count. -+ Flush would never observe a HEARD_BANSHEE node on the left-edge of -+ flush, nodes are only deleted to the right. So if nothing is broken, -+ why fix it? -+ -+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any -+ point and in any moment, because of the concurrent file system -+ activity (for example, truncate). */ -+ -+ /* Check jnode state after flush_scan completed. Having a lock on this -+ node or its parent (in case of unformatted) helps us in case of -+ concurrent flushing. */ -+ if (jnode_check_flushprepped(leftmost_in_slum) -+ && !jnode_convertible(leftmost_in_slum)) { -+ ret = 0; -+ goto failed; -+ } -+ -+ /* Now setup flush_pos using scan_left's endpoint. */ -+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum); -+ if (ret) -+ goto failed; -+ -+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL -+ && node_is_empty(flush_pos->coord.node)) { -+ znode *empty = flush_pos->coord.node; -+ -+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE)); -+ ret = delete_empty_node(empty); -+ goto failed; -+ } -+ -+ if (jnode_check_flushprepped(leftmost_in_slum) -+ && !jnode_convertible(leftmost_in_slum)) { -+ ret = 0; -+ goto failed; -+ } -+ -+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is -+ needed */ -+ ret = alloc_pos_and_ancestors(flush_pos); -+ if (ret) -+ goto failed; -+ -+ /* Do the main rightward-bottom-up squeeze and allocate loop. */ -+ ret = squalloc(flush_pos); -+ pos_stop(flush_pos); -+ if (ret) -+ goto failed; -+ -+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated -+ children. First, the pos_stop() and pos_valid() routines should be -+ modified so that pos_stop() sets a flush_position->stop flag to 1 -+ without releasing the current position immediately--instead release -+ it in pos_done(). 
This is a better implementation than the current
-+	   one anyway.
-+
-+	   It is not clear that all fields of the flush_position should not be
-+	   released, but at the very least the parent_lock, parent_coord, and
-+	   parent_load should remain held because they hold the last twig when
-+	   pos_stop() is called.
-+
-+	   When we reach this point in the code, if the parent_coord is set to
-+	   after the last item then we know that flush reached the end of a
-+	   twig (and according to the new flush queueing design, we will return
-+	   now). If parent_coord is not past the last item, we should check if
-+	   the current twig has any unallocated children to the right (we are
-+	   not concerned with unallocated children to the left--in that case
-+	   the twig itself should not have been allocated). If the twig has
-+	   unallocated children to the right, set the parent_coord to that
-+	   position and then repeat the call to squalloc.
-+
-+	   Testing for unallocated children may be defined in two ways: if any
-+	   internal item has a fake block number, it is unallocated; if any
-+	   extent item is unallocated then all of its children are unallocated.
-+	   But there is a more aggressive approach: if there are any dirty
-+	   children of the twig to the right of the current position, we may
-+	   wish to relocate those nodes now. Checking for potential relocation
-+	   is more expensive as it requires knowing whether there are any dirty
-+	   children that are not unallocated. extent_needs_allocation should be
-+	   used after setting the correct preceder.
-+
-+	   When we reach the end of a twig at this point in the code, if the
-+	   flush can continue (when the queue is ready) it will need some
-+	   information on the future starting point. That should be stored away
-+	   in the flush_handle using a seal, I believe. Holding a jref() on the
-+	   future starting point may break other code that deletes that node.
-+	 */
-+
-+	/* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
-+	   called above the twig level. If the VM calls flush above the twig
-+	   level, do nothing and return (but figure out why this happens). The
-+	   txnmgr should be modified to only flush its leaf-level dirty list.
-+	   This will do all the necessary squeeze and allocate steps but leave
-+	   unallocated branches and possibly unallocated twigs (when the twig's
-+	   leftmost child is not dirty). After flushing the leaf level, the
-+	   remaining unallocated nodes should be given write-optimized
-+	   locations. (Possibly, the remaining unallocated twigs should be
-+	   allocated just before their leftmost child.)
-+	 */
-+
-+	/* Any failure reaches this point. */
-+failed:
-+
-+	switch (ret) {
-+	case -E_REPEAT:
-+	case -EINVAL:
-+	case -E_DEADLOCK:
-+	case -E_NO_NEIGHBOR:
-+	case -ENOENT:
-+		/* FIXME(C): Except for E_DEADLOCK, these should probably be
-+		   handled properly in each case. They already are handled in
-+		   many cases. */
-+		/* Something bad happened, but difficult to avoid... Try again!
-+		 */
-+		ret = 0;
-+	}
-+
-+	if (leftmost_in_slum)
-+		jput(leftmost_in_slum);
-+
-+	pos_done(flush_pos);
-+	scan_done(left_scan);
-+	scan_done(right_scan);
-+	kfree(right_scan);
-+
-+	ON_DEBUG(atomic_dec(&flush_cnt));
-+
-+	reiser4_leave_flush(sb);
-+
-+	return ret;
-+}
-+
-+/* The reiser4 flush subsystem can be put into "rapid flush mode", which means
-+ * that the flusher should submit all prepped nodes immediately, without
-+ * keeping them in flush queues for a long time. The reason for rapid flush
-+ * mode is to free memory as fast as possible.
*/ -+ -+#if REISER4_USE_RAPID_FLUSH -+ -+/** -+ * submit all prepped nodes if rapid flush mode is set, -+ * turn rapid flush mode off. -+ */ -+ -+static int rapid_flush(flush_pos_t *pos) -+{ -+ if (!wbq_available()) -+ return 0; -+ -+ return write_prepped_nodes(pos); -+} -+ -+#else -+ -+#define rapid_flush(pos) (0) -+ -+#endif /* REISER4_USE_RAPID_FLUSH */ -+ -+static jnode *find_flush_start_jnode(jnode *start, txn_atom * atom, -+ flush_queue_t *fq, int *nr_queued, -+ int flags) -+{ -+ jnode * node; -+ -+ if (start != NULL) { -+ spin_lock_jnode(start); -+ if (!jnode_is_flushprepped(start)) { -+ assert("zam-1056", start->atom == atom); -+ node = start; -+ goto enter; -+ } -+ spin_unlock_jnode(start); -+ } -+ /* -+ * In this loop we process all already prepped (RELOC or OVRWR) and -+ * dirtied again nodes. The atom spin lock is not released until all -+ * dirty nodes processed or not prepped node found in the atom dirty -+ * lists. -+ */ -+ while ((node = find_first_dirty_jnode(atom, flags))) { -+ spin_lock_jnode(node); -+enter: -+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR)); -+ -+ if (JF_ISSET(node, JNODE_WRITEBACK)) { -+ /* move node to the end of atom's writeback list */ -+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom)); -+ -+ /* -+ * jnode is not necessarily on dirty list: if it was -+ * dirtied when it was on flush queue - it does not get -+ * moved to dirty list -+ */ -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), -+ WB_LIST, 1)); -+ -+ } else if (jnode_is_znode(node) -+ && znode_above_root(JZNODE(node))) { -+ /* -+ * A special case for znode-above-root. The above-root -+ * (fake) znode is captured and dirtied when the tree -+ * height changes or when the root node is relocated. -+ * This causes atoms to fuse so that changes at the root -+ * are serialized. However, this node is never flushed. -+ * This special case used to be in lock.c to prevent the -+ * above-root node from ever being captured, but now -+ * that it is captured we simply prevent it from -+ * flushing. The log-writer code relies on this to -+ * properly log superblock modifications of the tree -+ * height. -+ */ -+ jnode_make_wander_nolock(node); -+ } else if (JF_ISSET(node, JNODE_RELOC)) { -+ queue_jnode(fq, node); -+ ++(*nr_queued); -+ } else -+ break; -+ -+ spin_unlock_jnode(node); -+ } -+ return node; -+} -+ -+/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are -+ * more nodes to flush, return 0 if atom's dirty lists empty and keep current -+ * atom locked, return other errors as they are. 
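-+ *
-+ * (A usage sketch only, with hypothetical local names: given this contract,
-+ * a caller that wants to flush the whole atom would loop roughly as
-+ *
-+ *	do
-+ *		ret = flush_current_atom(flags, LONG_MAX, &nr_submitted,
-+ *					 &atom, NULL);
-+ *	while (ret == -E_REPEAT);
-+ *
-+ * noting that the real call sites must additionally handle the atom spin
-+ * lock that this function takes and releases, which the sketch omits.)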
*/ -+int -+flush_current_atom(int flags, long nr_to_write, long *nr_submitted, -+ txn_atom ** atom, jnode *start) -+{ -+ reiser4_super_info_data *sinfo = get_current_super_private(); -+ flush_queue_t *fq = NULL; -+ jnode *node; -+ int nr_queued; -+ int ret; -+ -+ assert("zam-889", atom != NULL && *atom != NULL); -+ assert_spin_locked(&((*atom)->alock)); -+ assert("zam-892", get_current_context()->trans->atom == *atom); -+ -+ nr_to_write = LONG_MAX; -+ while (1) { -+ ret = reiser4_fq_by_atom(*atom, &fq); -+ if (ret != -E_REPEAT) -+ break; -+ *atom = get_current_atom_locked(); -+ } -+ if (ret) -+ return ret; -+ -+ assert_spin_locked(&((*atom)->alock)); -+ -+ /* parallel flushers limit */ -+ if (sinfo->tmgr.atom_max_flushers != 0) { -+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) { -+ /* An reiser4_atom_send_event() call is inside -+ reiser4_fq_put_nolock() which is called when flush is -+ finished and nr_flushers is decremented. */ -+ reiser4_atom_wait_event(*atom); -+ *atom = get_current_atom_locked(); -+ } -+ } -+ -+ /* count ourself as a flusher */ -+ (*atom)->nr_flushers++; -+ -+ writeout_mode_enable(); -+ -+ nr_queued = 0; -+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags); -+ -+ if (node == NULL) { -+ if (nr_queued == 0) { -+ (*atom)->nr_flushers--; -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(*atom); -+ /* current atom remains locked */ -+ writeout_mode_disable(); -+ return 0; -+ } -+ spin_unlock_atom(*atom); -+ } else { -+ jref(node); -+ BUG_ON((*atom)->super != node->tree->super); -+ spin_unlock_atom(*atom); -+ spin_unlock_jnode(node); -+ BUG_ON(nr_to_write == 0); -+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags); -+ jput(node); -+ } -+ -+ ret = -+ reiser4_write_fq(fq, nr_submitted, -+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); -+ -+ *atom = get_current_atom_locked(); -+ (*atom)->nr_flushers--; -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(*atom); -+ spin_unlock_atom(*atom); -+ -+ writeout_mode_disable(); -+ -+ if (ret == 0) -+ ret = -E_REPEAT; -+ -+ return ret; -+} -+ -+/* REVERSE PARENT-FIRST RELOCATION POLICIES */ -+ -+/* This implements the is-it-close-enough-to-its-preceder? test for relocation -+ in the reverse parent-first relocate context. Here all we know is the -+ preceder and the block number. Since we are going in reverse, the preceder -+ may still be relocated as well, so we can't ask the block allocator "is there -+ a closer block available to relocate?" here. In the _forward_ parent-first -+ relocate context (not here) we actually call the block allocator to try and -+ find a closer location. */ -+static int -+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, -+ const reiser4_block_nr * nblk) -+{ -+ reiser4_block_nr dist; -+ -+ assert("jmacd-7710", *pblk != 0 && *nblk != 0); -+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk)); -+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk)); -+ -+ /* Distance is the absolute value. */ -+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk); -+ -+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from -+ its preceder block, do not relocate. */ -+ if (dist <= get_current_super_private()->flush.relocate_distance) -+ return 0; -+ -+ return 1; -+} -+ -+/* This function is a predicate that tests for relocation. 
Always called in the -+ reverse-parent-first context, when we are asking whether the current node -+ should be relocated in order to expand the flush by dirtying the parent level -+ (and thus proceeding to flush that level). When traversing in the forward -+ parent-first direction (not here), relocation decisions are handled in two -+ places: allocate_znode() and extent_needs_allocation(). */ -+static int -+reverse_relocate_test(jnode * node, const coord_t *parent_coord, -+ flush_pos_t *pos) -+{ -+ reiser4_block_nr pblk = 0; -+ reiser4_block_nr nblk = 0; -+ -+ assert("jmacd-8989", !jnode_is_root(node)); -+ -+ /* -+ * This function is called only from the -+ * reverse_relocate_check_dirty_parent() and only if the parent -+ * node is clean. This implies that the parent has the real (i.e., not -+ * fake) block number, and, so does the child, because otherwise the -+ * parent would be dirty. -+ */ -+ -+ /* New nodes are treated as if they are being relocated. */ -+ if (JF_ISSET(node, JNODE_CREATED) || -+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) -+ return 1; -+ -+ /* Find the preceder. FIXME(B): When the child is an unformatted, -+ previously existing node, the coord may be leftmost even though the -+ child is not the parent-first preceder of the parent. If the first -+ dirty node appears somewhere in the middle of the first extent unit, -+ this preceder calculation is wrong. -+ Needs more logic in here. */ -+ if (coord_is_leftmost_unit(parent_coord)) { -+ pblk = *znode_get_block(parent_coord->node); -+ } else { -+ pblk = pos->preceder.blk; -+ } -+ check_preceder(pblk); -+ -+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: -+ relocate. */ -+ if (pblk == 0) -+ return 1; -+ -+ nblk = *jnode_get_block(node); -+ -+ if (reiser4_blocknr_is_fake(&nblk)) -+ /* child is unallocated, mark parent dirty */ -+ return 1; -+ -+ return reverse_relocate_if_close_enough(&pblk, &nblk); -+} -+ -+/* This function calls reverse_relocate_test to make a reverse-parent-first -+ relocation decision and then, if yes, it marks the parent dirty. */ -+static int -+reverse_relocate_check_dirty_parent(jnode * node, const coord_t *parent_coord, -+ flush_pos_t *pos) -+{ -+ int ret; -+ -+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) { -+ -+ ret = reverse_relocate_test(node, parent_coord, pos); -+ if (ret < 0) -+ return ret; -+ -+ /* FIXME-ZAM -+ if parent is already relocated - we do not want to grab space, -+ right? */ -+ if (ret == 1) { -+ int grabbed; -+ -+ grabbed = get_current_context()->grabbed_blocks; -+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) != -+ 0) -+ reiser4_panic("umka-1250", -+ "No space left during flush."); -+ -+ assert("jmacd-18923", -+ znode_is_write_locked(parent_coord->node)); -+ znode_make_dirty(parent_coord->node); -+ grabbed2free_mark(grabbed); -+ } -+ } -+ -+ return 0; -+} -+ -+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE -+ FORWARD PARENT-FIRST LOOP BEGINS) */ -+ -+/* Get the leftmost child for given coord. */ -+static int get_leftmost_child_of_unit(const coord_t *coord, jnode ** child) -+{ -+ int ret; -+ -+ ret = item_utmost_child(coord, LEFT_SIDE, child); -+ -+ if (ret) -+ return ret; -+ -+ if (IS_ERR(*child)) -+ return PTR_ERR(*child); -+ -+ return 0; -+} -+ -+/* This step occurs after the left- and right-scans are completed, before -+ starting the forward parent-first traversal. 
Here we attempt to allocate -+ ancestors of the starting flush point, which means continuing in the reverse -+ parent-first direction to the parent, grandparent, and so on (as long as the -+ child is a leftmost child). This routine calls a recursive process, -+ alloc_one_ancestor, which does the real work, except there is special-case -+ handling here for the first ancestor, which may be a twig. At each level -+ (here and alloc_one_ancestor), we check for relocation and then, if the child -+ is a leftmost child, repeat at the next level. On the way back down (the -+ recursion), we allocate the ancestors in parent-first order. */ -+static int alloc_pos_and_ancestors(flush_pos_t *pos) -+{ -+ int ret = 0; -+ lock_handle plock; -+ load_count pload; -+ coord_t pcoord; -+ -+ if (znode_check_flushprepped(pos->lock.node)) -+ return 0; -+ -+ coord_init_invalid(&pcoord, NULL); -+ init_lh(&plock); -+ init_load_count(&pload); -+ -+ if (pos->state == POS_ON_EPOINT) { -+ /* a special case for pos on twig level, where we already have -+ a lock on parent node. */ -+ /* The parent may not be dirty, in which case we should decide -+ whether to relocate the child now. If decision is made to -+ relocate the child, the parent is marked dirty. */ -+ ret = -+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord, -+ pos); -+ if (ret) -+ goto exit; -+ -+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child -+ is leftmost) and the leaf/child, so recursion is not needed. -+ Levels above the twig will be allocated for -+ write-optimization before the transaction commits. */ -+ -+ /* Do the recursive step, allocating zero or more of our -+ * ancestors. */ -+ ret = alloc_one_ancestor(&pos->coord, pos); -+ -+ } else { -+ if (!znode_is_root(pos->lock.node)) { -+ /* all formatted nodes except tree root */ -+ ret = -+ reiser4_get_parent(&plock, pos->lock.node, -+ ZNODE_WRITE_LOCK); -+ if (ret) -+ goto exit; -+ -+ ret = incr_load_count_znode(&pload, plock.node); -+ if (ret) -+ goto exit; -+ -+ ret = -+ find_child_ptr(plock.node, pos->lock.node, &pcoord); -+ if (ret) -+ goto exit; -+ -+ ret = -+ reverse_relocate_check_dirty_parent(ZJNODE -+ (pos->lock. -+ node), &pcoord, -+ pos); -+ if (ret) -+ goto exit; -+ -+ ret = alloc_one_ancestor(&pcoord, pos); -+ if (ret) -+ goto exit; -+ } -+ -+ ret = allocate_znode(pos->lock.node, &pcoord, pos); -+ } -+exit: -+ done_load_count(&pload); -+ done_lh(&plock); -+ return ret; -+} -+ -+/* This is the recursive step described in alloc_pos_and_ancestors, above. -+ Ignoring the call to set_preceder, which is the next function described, this -+ checks if the child is a leftmost child and returns if it is not. If the -+ child is a leftmost child it checks for relocation, possibly dirtying the -+ parent. Then it performs the recursive step. */ -+static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos) -+{ -+ int ret = 0; -+ lock_handle alock; -+ load_count aload; -+ coord_t acoord; -+ -+ /* As we ascend at the left-edge of the region to flush, take this -+ opportunity at the twig level to find our parent-first preceder -+ unless we have already set it. */ -+ if (pos->preceder.blk == 0) { -+ ret = set_preceder(coord, pos); -+ if (ret != 0) -+ return ret; -+ } -+ -+ /* If the ancestor is clean or already allocated, or if the child is not -+ a leftmost child, stop going up, even leaving coord->node not -+ flushprepped. 
*/ -+ if (znode_check_flushprepped(coord->node) -+ || !coord_is_leftmost_unit(coord)) -+ return 0; -+ -+ init_lh(&alock); -+ init_load_count(&aload); -+ coord_init_invalid(&acoord, NULL); -+ -+ /* Only ascend to the next level if it is a leftmost child, but -+ write-lock the parent in case we will relocate the child. */ -+ if (!znode_is_root(coord->node)) { -+ -+ ret = -+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord, -+ &alock, &aload, ZNODE_WRITE_LOCK, -+ 0); -+ if (ret != 0) { -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ goto exit; -+ } -+ -+ ret = -+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node), -+ &acoord, pos); -+ if (ret != 0) -+ goto exit; -+ -+ /* Recursive call. */ -+ if (!znode_check_flushprepped(acoord.node)) { -+ ret = alloc_one_ancestor(&acoord, pos); -+ if (ret) -+ goto exit; -+ } -+ } -+ -+ /* Note: we call allocate with the parent write-locked (except at the -+ root) in case we relocate the child, in which case it will modify the -+ parent during this call. */ -+ ret = allocate_znode(coord->node, &acoord, pos); -+ -+exit: -+ done_load_count(&aload); -+ done_lh(&alock); -+ return ret; -+} -+ -+/* During the reverse parent-first alloc_pos_and_ancestors process described -+ above there is a call to this function at the twig level. During -+ alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse -+ parent-first context)? We repeat this process as long as the child is the -+ leftmost child, eventually reaching an ancestor of the flush point that is -+ not a leftmost child. The preceder of that ancestors, which is not a leftmost -+ child, is actually on the leaf level. The preceder of that block is the -+ left-neighbor of the flush point. The preceder of that block is the rightmost -+ child of the twig on the left. So, when alloc_pos_and_ancestors passes upward -+ through the twig level, it stops momentarily to remember the block of the -+ rightmost child of the twig on the left and sets it to the flush_position's -+ preceder_hint. -+ -+ There is one other place where we may set the flush_position's preceder hint, -+ which is during scan-left. -+*/ -+static int set_preceder(const coord_t *coord_in, flush_pos_t *pos) -+{ -+ int ret; -+ coord_t coord; -+ lock_handle left_lock; -+ load_count left_load; -+ -+ coord_dup(&coord, coord_in); -+ -+ init_lh(&left_lock); -+ init_load_count(&left_load); -+ -+ /* FIXME(B): Same FIXME as in "Find the preceder" in -+ reverse_relocate_test. coord_is_leftmost_unit is not the right test -+ if the unformatted child is in the middle of the first extent unit.*/ -+ if (!coord_is_leftmost_unit(&coord)) { -+ coord_prev_unit(&coord); -+ } else { -+ ret = -+ reiser4_get_left_neighbor(&left_lock, coord.node, -+ ZNODE_READ_LOCK, GN_SAME_ATOM); -+ if (ret) { -+ /* If we fail for any reason it doesn't matter because -+ the preceder is only a hint. We are low-priority at -+ this point, so this must be the case. 
*/ -+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || -+ ret == -ENOENT || ret == -EINVAL -+ || ret == -E_DEADLOCK) -+ ret = 0; -+ goto exit; -+ } -+ -+ ret = incr_load_count_znode(&left_load, left_lock.node); -+ if (ret) -+ goto exit; -+ -+ coord_init_last_unit(&coord, left_lock.node); -+ } -+ -+ ret = -+ item_utmost_child_real_block(&coord, RIGHT_SIDE, -+ &pos->preceder.blk); -+exit: -+ check_preceder(pos->preceder.blk); -+ done_load_count(&left_load); -+ done_lh(&left_lock); -+ return ret; -+} -+ -+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */ -+ -+/* This procedure implements the outer loop of the flush algorithm. To put this -+ in context, here is the general list of steps taken by the flush routine as a -+ whole: -+ -+ 1. Scan-left -+ 2. Scan-right (maybe) -+ 3. Allocate initial flush position and its ancestors -+ 4. <handle extents> -+ 5. <squeeze and next position and its ancestors to-the-right, -+ then update position to-the-right> -+ 6. <repeat from #4 until flush is stopped> -+ -+ This procedure implements the loop in steps 4 through 6 in the above listing. -+ -+ Step 4: if the current flush position is an extent item (position on the twig -+ level), it allocates the extent (allocate_extent_item_in_place) then shifts -+ to the next coordinate. If the next coordinate's leftmost child needs -+ flushprep, we will continue. If the next coordinate is an internal item, we -+ descend back to the leaf level, otherwise we repeat a step #4 (labeled -+ ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the -+ twig level, then we call reverse_relocate_end_of_twig to possibly dirty the -+ next (right) twig, prior to step #5 which moves to the right. -+ -+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up -+ the tree to allocate any ancestors of the next-right flush position that are -+ not also ancestors of the current position. Those ancestors (in top-down -+ order) are the next in parent-first order. We squeeze adjacent nodes on the -+ way up until the right node and current node share the same parent, then -+ allocate on the way back down. Finally, this step sets the flush position to -+ the next-right node. Then repeat steps 4 and 5. -+*/ -+ -+/* SQUEEZE CODE */ -+ -+/* squalloc_right_twig helper function, cut a range of extent items from -+ cut node to->node from the beginning up to coord @to. */ -+static int squalloc_right_twig_cut(coord_t *to, reiser4_key * to_key, -+ znode * left) -+{ -+ coord_t from; -+ reiser4_key from_key; -+ -+ coord_init_first_unit(&from, to->node); -+ item_key_by_coord(&from, &from_key); -+ -+ return cut_node_content(&from, to, &from_key, to_key, NULL); -+} -+ -+/* Copy as much of the leading extents from @right to @left, allocating -+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or -+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an -+ internal item it calls shift_one_internal_unit and may then return -+ SUBTREE_MOVED. 
*/ -+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos) -+{ -+ int ret = SUBTREE_MOVED; -+ coord_t coord; /* used to iterate over items */ -+ reiser4_key stop_key; -+ -+ assert("jmacd-2008", !node_is_empty(right)); -+ coord_init_first_unit(&coord, right); -+ -+ /* FIXME: can be optimized to cut once */ -+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) { -+ ON_DEBUG(void *vp); -+ -+ assert("vs-1468", coord_is_leftmost_unit(&coord)); -+ ON_DEBUG(vp = shift_check_prepare(left, coord.node)); -+ -+ /* stop_key is used to find what was copied and what to cut */ -+ stop_key = *reiser4_min_key(); -+ ret = squalloc_extent(left, &coord, pos, &stop_key); -+ if (ret != SQUEEZE_CONTINUE) { -+ ON_DEBUG(kfree(vp)); -+ break; -+ } -+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key())); -+ -+ /* Helper function to do the cutting. */ -+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1); -+ check_me("vs-1466", -+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0); -+ -+ ON_DEBUG(shift_check(vp, left, coord.node)); -+ } -+ -+ if (node_is_empty(coord.node)) -+ ret = SQUEEZE_SOURCE_EMPTY; -+ -+ if (ret == SQUEEZE_TARGET_FULL) -+ goto out; -+ -+ if (node_is_empty(right)) { -+ /* The whole right node was copied into @left. */ -+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY); -+ goto out; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ -+ if (!item_is_internal(&coord)) { -+ /* we do not want to squeeze anything else to left neighbor -+ because "slum" is over */ -+ ret = SQUEEZE_TARGET_FULL; -+ goto out; -+ } -+ assert("jmacd-433", item_is_internal(&coord)); -+ -+ /* Shift an internal unit. The child must be allocated before shifting -+ any more extents, so we stop here. */ -+ ret = shift_one_internal_unit(left, right); -+ -+out: -+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL -+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY); -+ -+ if (ret == SQUEEZE_TARGET_FULL) { -+ /* We submit prepped nodes here and expect that this @left twig -+ * will not be modified again during this jnode_flush() call. */ -+ int ret1; -+ -+ /* NOTE: seems like io is done under long term locks. */ -+ ret1 = write_prepped_nodes(pos); -+ if (ret1 < 0) -+ return ret1; -+ } -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+static void item_convert_invariant(flush_pos_t *pos) -+{ -+ assert("edward-1225", coord_is_existing_item(&pos->coord)); -+ if (chaining_data_present(pos)) { -+ item_plugin *iplug = item_convert_plug(pos); -+ -+ assert("edward-1000", -+ iplug == item_plugin_by_coord(&pos->coord)); -+ assert("edward-1001", iplug->f.convert != NULL); -+ } else -+ assert("edward-1226", pos->child == NULL); -+} -+#else -+ -+#define item_convert_invariant(pos) noop -+ -+#endif -+ -+/* Scan node items starting from the first one and apply for each -+ item its flush ->convert() method (if any). This method may -+ resize/kill the item so the tree will be changed. 
-+*/ -+static int convert_node(flush_pos_t *pos, znode * node) -+{ -+ int ret = 0; -+ item_plugin *iplug; -+ -+ assert("edward-304", pos != NULL); -+ assert("edward-305", pos->child == NULL); -+ assert("edward-475", znode_convertible(node)); -+ assert("edward-669", znode_is_wlocked(node)); -+ assert("edward-1210", !node_is_empty(node)); -+ -+ if (znode_get_level(node) != LEAF_LEVEL) -+ /* unsupported */ -+ goto exit; -+ -+ coord_init_first_unit(&pos->coord, node); -+ -+ while (1) { -+ ret = 0; -+ coord_set_to_left(&pos->coord); -+ item_convert_invariant(pos); -+ -+ iplug = item_plugin_by_coord(&pos->coord); -+ assert("edward-844", iplug != NULL); -+ -+ if (iplug->f.convert) { -+ ret = iplug->f.convert(pos); -+ if (ret) -+ goto exit; -+ } -+ assert("edward-307", pos->child == NULL); -+ -+ if (coord_next_item(&pos->coord)) { -+ /* node is over */ -+ -+ if (!chaining_data_present(pos)) -+ /* finished this node */ -+ break; -+ if (should_chain_next_node(pos)) { -+ /* go to next node */ -+ move_chaining_data(pos, 0/* to next node */); -+ break; -+ } -+ /* repeat this node */ -+ move_chaining_data(pos, 1/* this node */); -+ continue; -+ } -+ /* Node is not over. -+ Check if there is attached convert data. -+ If so roll one item position back and repeat -+ on this node -+ */ -+ if (chaining_data_present(pos)) { -+ -+ if (iplug != item_plugin_by_coord(&pos->coord)) -+ set_item_convert_count(pos, 0); -+ -+ ret = coord_prev_item(&pos->coord); -+ assert("edward-1003", !ret); -+ -+ move_chaining_data(pos, 1/* this node */); -+ } -+ } -+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE); -+ znode_make_dirty(node); -+exit: -+ assert("edward-1004", !ret); -+ return ret; -+} -+ -+/* Squeeze and allocate the right neighbor. This is called after @left and -+ its current children have been squeezed and allocated already. This -+ procedure's job is to squeeze and items from @right to @left. -+ -+ If at the leaf level, use the shift_everything_left memcpy-optimized -+ version of shifting (squeeze_right_leaf). -+ -+ If at the twig level, extents are allocated as they are shifted from @right -+ to @left (squalloc_right_twig). -+ -+ At any other level, shift one internal item and return to the caller -+ (squalloc_parent_first) so that the shifted-subtree can be processed in -+ parent-first order. -+ -+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is -+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is -+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL -+ is returned. -+*/ -+ -+static int squeeze_right_neighbor(flush_pos_t *pos, znode * left, -+ znode * right) -+{ -+ int ret; -+ -+ /* FIXME it is possible to see empty hasn't-heard-banshee node in a -+ * tree owing to error (for example, ENOSPC) in write */ -+ /* assert("jmacd-9321", !node_is_empty(left)); */ -+ assert("jmacd-9322", !node_is_empty(right)); -+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right)); -+ -+ switch (znode_get_level(left)) { -+ case TWIG_LEVEL: -+ /* Shift with extent allocating until either an internal item -+ is encountered or everything is shifted or no free space -+ left in @left */ -+ ret = squeeze_right_twig(left, right, pos); -+ break; -+ -+ default: -+ /* All other levels can use shift_everything until we implement -+ per-item flush plugins. 
*/ -+ ret = squeeze_right_non_twig(left, right); -+ break; -+ } -+ -+ assert("jmacd-2011", (ret < 0 || -+ ret == SQUEEZE_SOURCE_EMPTY -+ || ret == SQUEEZE_TARGET_FULL -+ || ret == SUBTREE_MOVED)); -+ return ret; -+} -+ -+static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos, -+ znode * right) -+{ -+ int ret; -+ -+ ret = squeeze_right_twig(pos->lock.node, right, pos); -+ if (ret < 0) -+ return ret; -+ if (ret > 0) { -+ coord_init_after_last_item(&pos->coord, pos->lock.node); -+ return ret; -+ } -+ -+ coord_init_last_unit(&pos->coord, pos->lock.node); -+ return 0; -+} -+ -+/* forward declaration */ -+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *); -+ -+/* do a fast check for "same parents" condition before calling -+ * squalloc_upper_levels() */ -+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos, -+ znode * left, -+ znode * right) -+{ -+ if (znode_same_parents(left, right)) -+ return 0; -+ -+ return squalloc_upper_levels(pos, left, right); -+} -+ -+/* Check whether the parent of the given @right node needs to be processed -+ ((re)allocated) prior to processing of the child. If @left and @right do not -+ share a parent, the parent of @right comes after @left but before @right in -+ parent-first order, so we have to (re)allocate it before @right -+ gets (re)allocated. */ -+static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right) -+{ -+ int ret; -+ -+ lock_handle left_parent_lock; -+ lock_handle right_parent_lock; -+ -+ load_count left_parent_load; -+ load_count right_parent_load; -+ -+ init_lh(&left_parent_lock); -+ init_lh(&right_parent_lock); -+ -+ init_load_count(&left_parent_load); -+ init_load_count(&right_parent_load); -+ -+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ /* Check for same parents */ -+ if (left_parent_lock.node == right_parent_lock.node) -+ goto out; -+ -+ if (znode_check_flushprepped(right_parent_lock.node)) { -+ /* Keep parent-first order. In the order, the right parent node -+ stands before the @right node. If it is already allocated, -+ we set the preceder (next block search start point) to its -+ block number; the @right node should be allocated after it. -+ -+ However, the preceder is set only if the right parent is on -+ the twig level. The explanation is the following: new branch -+ nodes are allocated over already allocated children while the -+ tree grows, so it is difficult to keep the tree ordered; we -+ assume that only leaves and twigs are correctly allocated. -+ So, only twigs are used as a preceder for allocating the rest -+ of the slum. */ -+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) { -+ pos->preceder.blk = -+ *znode_get_block(right_parent_lock.node); -+ check_preceder(pos->preceder.blk); -+ } -+ goto out; -+ } -+ -+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = -+ squeeze_right_neighbor(pos, left_parent_lock.node, -+ right_parent_lock.node); -+ /* We stop on error. We also stop if some items/units were shifted -+ * (ret == 0) and thus @right changed its parent. It means we do not -+ * have to process the right_parent node prior to processing of @right.
Positive return -+ * values mean that no shifting happened, because of an "empty -+ * source" or "target full" condition. */ -+ if (ret <= 0) -+ goto out; -+ -+ /* parent(@left) and parent(@right) may themselves have different -+ * parents. We do a recursive call to check for that. */ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node, -+ right_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ /* allocate znode when going down */ -+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos); -+ -+out: -+ done_load_count(&left_parent_load); -+ done_load_count(&right_parent_load); -+ -+ done_lh(&left_parent_lock); -+ done_lh(&right_parent_lock); -+ -+ return ret; -+} -+ -+/* Check the leftmost child's "flushprepped" status; also return true if the -+ * child node was not found in cache. */ -+static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord) -+{ -+ int ret; -+ int prepped; -+ -+ jnode *child; -+ -+ ret = get_leftmost_child_of_unit(coord, &child); -+ -+ if (ret) -+ return ret; -+ -+ if (child) { -+ prepped = jnode_check_flushprepped(child); -+ jput(child); -+ } else { -+ /* We treat a nonexistent child as a node to which slum -+ processing should not continue. A node that is not cached -+ is clean, so it is flushprepped. */ -+ prepped = 1; -+ } -+ -+ return prepped; -+} -+ -+/* (re)allocate a znode, automatically getting its parent node */ -+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos) -+{ -+ int ret; -+ lock_handle parent_lock; -+ load_count parent_load; -+ coord_t pcoord; -+ -+ assert("zam-851", znode_is_write_locked(node)); -+ -+ init_lh(&parent_lock); -+ init_load_count(&parent_load); -+ -+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&parent_load, parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = find_child_ptr(parent_lock.node, node, &pcoord); -+ if (ret) -+ goto out; -+ -+ ret = allocate_znode(node, &pcoord, pos); -+ -+out: -+ done_load_count(&parent_load); -+ done_lh(&parent_lock); -+ return ret; -+} -+ -+/* Process nodes on the leaf level until an unformatted node or the rightmost -+ * node in the slum is reached. */ -+static int handle_pos_on_formatted(flush_pos_t *pos) -+{ -+ int ret; -+ lock_handle right_lock; -+ load_count right_load; -+ -+ init_lh(&right_lock); -+ init_load_count(&right_load); -+ -+ if (should_convert_node(pos, pos->lock.node)) { -+ ret = convert_node(pos, pos->lock.node); -+ if (ret) -+ return ret; -+ } -+ -+ while (1) { -+ int expected; -+ expected = should_convert_next_node(pos); -+ ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE, -+ ZNODE_WRITE_LOCK, !expected, expected); -+ if (ret) { -+ if (expected) -+ warning("edward-1495", -+ "Expected neighbor not found (ret = %d). Fsck?", -+ ret); -+ break; -+ } -+ -+ /* we don't prep(allocate) nodes for flushing twice. This can be -+ * suboptimal, or it can be optimal. For now we choose to live -+ * with the risk that it will be suboptimal because it would be -+ * quite complex to code it to be smarter.
*/ -+ if (znode_check_flushprepped(right_lock.node) -+ && !znode_convertible(right_lock.node)) { -+ assert("edward-1005", !should_convert_next_node(pos)); -+ pos_stop(pos); -+ break; -+ } -+ -+ ret = incr_load_count_znode(&right_load, right_lock.node); -+ if (ret) -+ break; -+ if (should_convert_node(pos, right_lock.node)) { -+ ret = convert_node(pos, right_lock.node); -+ if (ret) -+ break; -+ if (node_is_empty(right_lock.node)) { -+ /* node became empty after converting, repeat */ -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ continue; -+ } -+ } -+ -+ /* squeeze _before_ going upward. */ -+ ret = -+ squeeze_right_neighbor(pos, pos->lock.node, -+ right_lock.node); -+ if (ret < 0) -+ break; -+ -+ if (znode_check_flushprepped(right_lock.node)) { -+ if (should_convert_next_node(pos)) { -+ /* in spite of the flushprepped status of the -+ node, its right slum neighbor should be -+ converted */ -+ assert("edward-953", convert_data(pos)); -+ assert("edward-954", item_convert_data(pos)); -+ -+ if (node_is_empty(right_lock.node)) { -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ } else -+ move_flush_pos(pos, &right_lock, -+ &right_load, NULL); -+ continue; -+ } -+ pos_stop(pos); -+ break; -+ } -+ -+ if (node_is_empty(right_lock.node)) { -+ /* repeat if right node was squeezed completely */ -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ continue; -+ } -+ -+ /* parent(right_lock.node) has to be processed before -+ * (right_lock.node) due to "parent-first" allocation order. */ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node, -+ right_lock.node); -+ if (ret) -+ break; -+ /* (re)allocate _after_ going upward */ -+ ret = lock_parent_and_allocate_znode(right_lock.node, pos); -+ if (ret) -+ break; -+ if (should_terminate_squalloc(pos)) { -+ set_item_convert_count(pos, 0); -+ break; -+ } -+ -+ /* advance the flush position to the right neighbor */ -+ move_flush_pos(pos, &right_lock, &right_load, NULL); -+ -+ ret = rapid_flush(pos); -+ if (ret) -+ break; -+ } -+ check_convert_info(pos); -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ -+ /* This function indicates via pos whether to stop, go to the twig -+ * level, or continue on the current level. */ -+ return ret; -+ -+} -+ -+/* Process nodes on the leaf level until an unformatted node or the rightmost -+ * node in the slum is reached. */ -+static int handle_pos_on_leaf(flush_pos_t *pos) -+{ -+ int ret; -+ -+ assert("zam-845", pos->state == POS_ON_LEAF); -+ -+ ret = handle_pos_on_formatted(pos); -+ -+ if (ret == -E_NO_NEIGHBOR) { -+ /* cannot get right neighbor, go process extents. */ -+ pos->state = POS_TO_TWIG; -+ return 0; -+ } -+ -+ return ret; -+} -+ -+/* Process slum on level > 1 */ -+static int handle_pos_on_internal(flush_pos_t *pos) -+{ -+ assert("zam-850", pos->state == POS_ON_INTERNAL); -+ return handle_pos_on_formatted(pos); -+} -+ -+/* check whether squalloc should stop before processing the given extent */ -+static int squalloc_extent_should_stop(flush_pos_t *pos) -+{ -+ assert("zam-869", item_is_extent(&pos->coord)); -+ -+ /* pos->child is the jnode that handle_pos_on_extent() should start -+ * with, instead of the first child of the first extent unit.
*/ -+ if (pos->child) { -+ int prepped; -+ -+ assert("vs-1383", jnode_is_unformatted(pos->child)); -+ prepped = jnode_check_flushprepped(pos->child); -+ pos->pos_in_unit = -+ jnode_get_index(pos->child) - -+ extent_unit_index(&pos->coord); -+ assert("vs-1470", -+ pos->pos_in_unit < extent_unit_width(&pos->coord)); -+ assert("nikita-3434", -+ ergo(extent_is_unallocated(&pos->coord), -+ pos->pos_in_unit == 0)); -+ jput(pos->child); -+ pos->child = NULL; -+ -+ return prepped; -+ } -+ -+ pos->pos_in_unit = 0; -+ if (extent_is_unallocated(&pos->coord)) -+ return 0; -+ -+ return leftmost_child_of_unit_check_flushprepped(&pos->coord); -+} -+ -+/* Handle the case when the regular reiser4 tree (znodes connected to their -+ * neighbors by sibling pointers) is interrupted on the leaf level by one or -+ * more unformatted nodes. By holding a lock on the twig level and using -+ * extent code routines to process the unformatted nodes, we swim around the -+ * irregular part of the reiser4 tree. */ -+static int handle_pos_on_twig(flush_pos_t *pos) -+{ -+ int ret; -+ -+ assert("zam-844", pos->state == POS_ON_EPOINT); -+ assert("zam-843", item_is_extent(&pos->coord)); -+ -+ /* We decide whether to continue slum processing with the current -+ extent unit: if the leftmost child of the current extent unit is -+ flushprepped (i.e. clean or already processed by flush), we stop -+ squalloc(). There is a fast check for unallocated extents, which we -+ assume contain only nodes that are not flushprepped. */ -+ /* FIXME: Here we implement a simple check; we only look at the -+ leftmost child. */ -+ ret = squalloc_extent_should_stop(pos); -+ if (ret != 0) { -+ pos_stop(pos); -+ return ret; -+ } -+ -+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord) -+ && item_is_extent(&pos->coord)) { -+ ret = reiser4_alloc_extent(pos); -+ if (ret) -+ break; -+ coord_next_unit(&pos->coord); -+ } -+ -+ if (coord_is_after_rightmost(&pos->coord)) { -+ pos->state = POS_END_OF_TWIG; -+ return 0; -+ } -+ if (item_is_internal(&pos->coord)) { -+ pos->state = POS_TO_LEAF; -+ return 0; -+ } -+ -+ assert("zam-860", item_is_extent(&pos->coord)); -+ -+ /* "slum" is over */ -+ pos->state = POS_INVALID; -+ return 0; -+} -+ -+/* When we are about to return the flush position from the twig to the leaf -+ * level, we can process the right twig node or move the position to the leaf. -+ * This processes the right twig if possible and jumps to the leaf level if -+ * not. */ -+static int handle_pos_end_of_twig(flush_pos_t *pos) -+{ -+ int ret; -+ lock_handle right_lock; -+ load_count right_load; -+ coord_t at_right; -+ jnode *child = NULL; -+ -+ assert("zam-848", pos->state == POS_END_OF_TWIG); -+ assert("zam-849", coord_is_after_rightmost(&pos->coord)); -+ -+ init_lh(&right_lock); -+ init_load_count(&right_load); -+ -+ /* We take a lock on the right twig node even if it is not dirty, -+ * because the slum continues or discontinues on the leaf level, not on -+ * the next twig. This lock on the right twig is needed for getting its -+ * leftmost child. */ -+ ret = -+ reiser4_get_right_neighbor(&right_lock, pos->lock.node, -+ ZNODE_WRITE_LOCK, GN_SAME_ATOM); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&right_load, right_lock.node); -+ if (ret) -+ goto out; -+ -+ /* the right twig may not be dirty */ -+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) { -+ /* If the right twig node is dirty, we always attempt to -+ * squeeze its content to the left...
*/ -+became_dirty: -+ ret = -+ squeeze_right_twig_and_advance_coord(pos, right_lock.node); -+ if (ret <= 0) { -+ /* pos->coord is on an internal item: go to the leaf -+ * level, or we have an error, which will be caught in -+ * squalloc() -+ */ -+ pos->state = POS_TO_LEAF; -+ goto out; -+ } -+ -+ /* If the right twig was squeezed completely, we have to -+ * re-lock the right twig; now it is done through the top-level -+ * squalloc routine. */ -+ if (node_is_empty(right_lock.node)) -+ goto out; -+ -+ /* ... and prep it if it is not yet prepped */ -+ if (!znode_check_flushprepped(right_lock.node)) { -+ /* As usual, process parent before ... */ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, -+ pos->lock. -+ node, -+ right_lock. -+ node); -+ if (ret) -+ goto out; -+ -+ /* ... processing the child */ -+ ret = -+ lock_parent_and_allocate_znode(right_lock.node, -+ pos); -+ if (ret) -+ goto out; -+ } -+ } else { -+ coord_init_first_unit(&at_right, right_lock.node); -+ -+ /* check the first child of the next twig: should we continue -+ there? */ -+ ret = get_leftmost_child_of_unit(&at_right, &child); -+ if (ret || child == NULL || jnode_check_flushprepped(child)) { -+ pos_stop(pos); -+ goto out; -+ } -+ -+ /* check clean twig for possible relocation */ -+ if (!znode_check_flushprepped(right_lock.node)) { -+ ret = -+ reverse_relocate_check_dirty_parent(child, -+ &at_right, pos); -+ if (ret) -+ goto out; -+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) -+ goto became_dirty; -+ } -+ } -+ -+ assert("zam-875", znode_check_flushprepped(right_lock.node)); -+ -+ /* Update the preceder with the block number of the just-processed -+ * right twig node. The code above could miss the preceder update -+ * because allocate_znode() might not be called for this node. */ -+ pos->preceder.blk = *znode_get_block(right_lock.node); -+ check_preceder(pos->preceder.blk); -+ -+ coord_init_first_unit(&at_right, right_lock.node); -+ assert("zam-868", coord_is_existing_unit(&at_right)); -+ -+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF; -+ move_flush_pos(pos, &right_lock, &right_load, &at_right); -+ -+out: -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ -+ if (child) -+ jput(child); -+ -+ return ret; -+} -+ -+/* Move pos->lock to the leaf node pointed to by pos->coord and check whether -+ * we should continue there.
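The POS_* handlers in this file are dispatched from the flush_pos_handlers[] table defined a little further below, and each handler picks the successor state. As a self-contained illustration of that control flow (stub functions only, not the patch's code), the toy state machine below hard-codes one common path through the transitions; the real handlers choose among several successors, as described in their comments.

#include <stdio.h>

typedef enum {
        POS_INVALID, POS_ON_LEAF, POS_ON_EPOINT,
        POS_TO_LEAF, POS_TO_TWIG, POS_END_OF_TWIG, POS_ON_INTERNAL
} state_t;

/* One plausible successor per state, read off the handlers in this hunk. */
static state_t on_leaf(void)     { return POS_TO_TWIG; }     /* hit slum edge (-E_NO_NEIGHBOR) */
static state_t to_twig(void)     { return POS_ON_EPOINT; }   /* next unit in parent is an extent */
static state_t on_epoint(void)   { return POS_END_OF_TWIG; } /* ran off the twig's last unit */
static state_t end_of_twig(void) { return POS_TO_LEAF; }     /* right twig starts with an internal item */
static state_t to_leaf(void)     { return POS_INVALID; }     /* child already flushprepped: stop */

int main(void)
{
        state_t s = POS_ON_LEAF;

        while (s != POS_INVALID) {   /* squalloc()'s pos_valid() loop */
                switch (s) {
                case POS_ON_LEAF:     s = on_leaf();     break;
                case POS_TO_TWIG:     s = to_twig();     break;
                case POS_ON_EPOINT:   s = on_epoint();   break;
                case POS_END_OF_TWIG: s = end_of_twig(); break;
                case POS_TO_LEAF:     s = to_leaf();     break;
                default:              s = POS_INVALID;   break;
                }
                printf("-> state %d\n", (int)s);
        }
        return 0;
}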
*/ -+static int handle_pos_to_leaf(flush_pos_t *pos) -+{ -+ int ret; -+ lock_handle child_lock; -+ load_count child_load; -+ jnode *child; -+ -+ assert("zam-846", pos->state == POS_TO_LEAF); -+ assert("zam-847", item_is_internal(&pos->coord)); -+ -+ init_lh(&child_lock); -+ init_load_count(&child_load); -+ -+ ret = get_leftmost_child_of_unit(&pos->coord, &child); -+ if (ret) -+ return ret; -+ if (child == NULL) { -+ pos_stop(pos); -+ return 0; -+ } -+ -+ if (jnode_check_flushprepped(child)) { -+ pos->state = POS_INVALID; -+ goto out; -+ } -+ -+ ret = -+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&child_load, JZNODE(child)); -+ if (ret) -+ goto out; -+ -+ ret = allocate_znode(JZNODE(child), &pos->coord, pos); -+ if (ret) -+ goto out; -+ -+ /* move flush position to leaf level */ -+ pos->state = POS_ON_LEAF; -+ move_flush_pos(pos, &child_lock, &child_load, NULL); -+ -+ if (node_is_empty(JZNODE(child))) { -+ ret = delete_empty_node(JZNODE(child)); -+ pos->state = POS_INVALID; -+ } -+out: -+ done_load_count(&child_load); -+ done_lh(&child_lock); -+ jput(child); -+ -+ return ret; -+} -+ -+/* Move pos and its lock from the leaf to the upper (twig) level. */ -+static int handle_pos_to_twig(flush_pos_t *pos) -+{ -+ int ret; -+ -+ lock_handle parent_lock; -+ load_count parent_load; -+ coord_t pcoord; -+ -+ assert("zam-852", pos->state == POS_TO_TWIG); -+ -+ init_lh(&parent_lock); -+ init_load_count(&parent_load); -+ -+ ret = -+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&parent_load, parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord); -+ if (ret) -+ goto out; -+ -+ assert("zam-870", item_is_internal(&pcoord)); -+ coord_next_item(&pcoord); -+ -+ if (coord_is_after_rightmost(&pcoord)) -+ pos->state = POS_END_OF_TWIG; -+ else if (item_is_extent(&pcoord)) -+ pos->state = POS_ON_EPOINT; -+ else { -+ /* Here we understand that the -E_NO_NEIGHBOR we got in -+ * handle_pos_on_leaf() was just due to reaching the edge of -+ * the slum */ -+ pos_stop(pos); -+ goto out; -+ } -+ -+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord); -+ -+out: -+ done_load_count(&parent_load); -+ done_lh(&parent_lock); -+ -+ return ret; -+} -+ -+typedef int (*pos_state_handle_t) (flush_pos_t *); -+static pos_state_handle_t flush_pos_handlers[] = { -+ /* process formatted nodes on leaf level, keep lock on a leaf node */ -+ [POS_ON_LEAF] = handle_pos_on_leaf, -+ /* process unformatted nodes, keep lock on twig node, pos->coord points -+ * to the extent currently being processed */ -+ [POS_ON_EPOINT] = handle_pos_on_twig, -+ /* move a lock from leaf node to its parent for further processing of -+ unformatted nodes */ -+ [POS_TO_TWIG] = handle_pos_to_twig, -+ /* move a lock from twig to leaf level when processing of unformatted -+ * nodes finishes, pos->coord points to the leaf node we jump to */ -+ [POS_TO_LEAF] = handle_pos_to_leaf, -+ /* after processing the last extent in the twig node, attempt to shift -+ * items from the twig's right neighbor and process them while -+ * shifting */ -+ [POS_END_OF_TWIG] = handle_pos_end_of_twig, -+ /* process formatted nodes on internal level, keep lock on an internal -+ node */ -+ [POS_ON_INTERNAL] = handle_pos_on_internal -+}; -+ -+/* Advance flush position horizontally, prepare for flushing ((re)allocate,
squeeze, encrypt) nodes and their ancestors in "parent-first" order */ -+static int squalloc(flush_pos_t *pos) -+{ -+ int ret = 0; -+ -+ /* maybe needs to be made a case statement with handle_pos_on_leaf as -+ * first case, for greater CPU efficiency? Measure and see.... -Hans */ -+ while (pos_valid(pos)) { -+ ret = flush_pos_handlers[pos->state] (pos); -+ if (ret < 0) -+ break; -+ -+ ret = rapid_flush(pos); -+ if (ret) -+ break; -+ } -+ -+ /* any positive value or -E_NO_NEIGHBOR are legal return codes for -+ handle_pos* routines, -E_NO_NEIGHBOR means that slum edge was -+ reached */ -+ if (ret > 0 || ret == -E_NO_NEIGHBOR) -+ ret = 0; -+ -+ return ret; -+} -+ -+static void update_ldkey(znode * node) -+{ -+ reiser4_key ldkey; -+ -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ if (node_is_empty(node)) -+ return; -+ -+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey)); -+} -+ -+/* this is to be called after calling of shift node's method to shift data from -+ @right to @left. It sets left delimiting keys of @left and @right to keys of -+ first items of @left and @right correspondingly and sets right delimiting key -+ of @left to first key of @right */ -+static void update_znode_dkeys(znode * left, znode * right) -+{ -+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock)); -+ assert("vs-1629", (znode_is_write_locked(left) && -+ znode_is_write_locked(right))); -+ -+ /* we need to update left delimiting of left if it was empty before -+ shift */ -+ update_ldkey(left); -+ update_ldkey(right); -+ if (node_is_empty(right)) -+ znode_set_rd_key(left, znode_get_rd_key(right)); -+ else -+ znode_set_rd_key(left, znode_get_ld_key(right)); -+} -+ -+/* try to shift everything from @right to @left. If everything was shifted - -+ @right is removed from the tree. Result is the number of bytes shifted. */ -+static int -+shift_everything_left(znode * right, znode * left, carry_level * todo) -+{ -+ coord_t from; -+ node_plugin *nplug; -+ carry_plugin_info info; -+ -+ coord_init_after_last_item(&from, right); -+ -+ nplug = node_plugin_by_node(right); -+ info.doing = NULL; -+ info.todo = todo; -+ return nplug->shift(&from, left, SHIFT_LEFT, -+ 1 /* delete @right if it becomes empty */ , -+ 1 -+ /* move coord @from to node @left if everything will -+ be shifted */ -+ , -+ &info); -+} -+ -+/* Shift as much as possible from @right to @left using the memcpy-optimized -+ shift_everything_left. @left and @right are formatted neighboring nodes on -+ leaf level. */ -+static int squeeze_right_non_twig(znode * left, znode * right) -+{ -+ int ret; -+ carry_pool *pool; -+ carry_level *todo; -+ -+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right)); -+ -+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) || -+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY)) -+ return SQUEEZE_TARGET_FULL; -+ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ ret = shift_everything_left(right, left, todo); -+ if (ret > 0) { -+ /* something was shifted */ -+ reiser4_tree *tree; -+ __u64 grabbed; -+ -+ znode_make_dirty(left); -+ znode_make_dirty(right); -+ -+ /* update delimiting keys of nodes which participated in -+ shift. FIXME: it would be better to have this in shift -+ node's operation. But it can not be done there. 
Nobody -+ remembers why, though */ -+ tree = znode_get_tree(left); -+ write_lock_dk(tree); -+ update_znode_dkeys(left, right); -+ write_unlock_dk(tree); -+ -+ /* Carry is called to update delimiting key and, maybe, to -+ remove empty node. */ -+ grabbed = get_current_context()->grabbed_blocks; -+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); -+ assert("nikita-3003", ret == 0); /* reserved space is -+ exhausted. Ask Hans. */ -+ ret = reiser4_carry(todo, NULL/* previous level */); -+ grabbed2free_mark(grabbed); -+ } else { -+ /* Shifting impossible, we return appropriate result code */ -+ ret = -+ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY : -+ SQUEEZE_TARGET_FULL; -+ } -+ -+ done_carry_pool(pool); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+static int sibling_link_is_ok(const znode *left, const znode *right) -+{ -+ int result; -+ -+ read_lock_tree(znode_get_tree(left)); -+ result = (left->right == right && left == right->left); -+ read_unlock_tree(znode_get_tree(left)); -+ return result; -+} -+#endif -+ -+/* Shift first unit of first item if it is an internal one. Return -+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return -+ SUBTREE_MOVED. */ -+static int shift_one_internal_unit(znode * left, znode * right) -+{ -+ int ret; -+ carry_pool *pool; -+ carry_level *todo; -+ coord_t *coord; -+ carry_plugin_info *info; -+ int size, moved; -+ -+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right)); -+ assert("nikita-2435", znode_is_write_locked(left)); -+ assert("nikita-2436", znode_is_write_locked(right)); -+ assert("nikita-2434", sibling_link_is_ok(left, right)); -+ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + -+ sizeof(*coord) + sizeof(*info) -+#if REISER4_DEBUG -+ + sizeof(*coord) + 2 * sizeof(reiser4_key) -+#endif -+ ); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ coord = (coord_t *) (todo + 3); -+ coord_init_first_unit(coord, right); -+ info = (carry_plugin_info *) (coord + 1); -+ -+#if REISER4_DEBUG -+ if (!node_is_empty(left)) { -+ coord_t *last; -+ reiser4_key *right_key; -+ reiser4_key *left_key; -+ -+ last = (coord_t *) (info + 1); -+ right_key = (reiser4_key *) (last + 1); -+ left_key = right_key + 1; -+ coord_init_last_unit(last, left); -+ -+ assert("nikita-2463", -+ keyle(item_key_by_coord(last, left_key), -+ item_key_by_coord(coord, right_key))); -+ } -+#endif -+ -+ assert("jmacd-2007", item_is_internal(coord)); -+ -+ size = item_length_by_coord(coord); -+ info->todo = todo; -+ info->doing = NULL; -+ -+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT, -+ 1 -+ /* delete @right if it becomes -+ empty */ -+ , -+ 0 -+ /* do not move coord @coord to -+ node @left */ -+ , -+ info); -+ -+ /* If shift returns positive, then we shifted the item. */ -+ assert("vs-423", ret <= 0 || size == ret); -+ moved = (ret > 0); -+ -+ if (moved) { -+ /* something was moved */ -+ reiser4_tree *tree; -+ int grabbed; -+ -+ znode_make_dirty(left); -+ znode_make_dirty(right); -+ tree = znode_get_tree(left); -+ write_lock_dk(tree); -+ update_znode_dkeys(left, right); -+ write_unlock_dk(tree); -+ -+ /* reserve space for delimiting keys after shifting */ -+ grabbed = get_current_context()->grabbed_blocks; -+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); -+ assert("nikita-3003", ret == 0); /* reserved space is -+ exhausted. Ask Hans. 
*/ -+ -+ ret = reiser4_carry(todo, NULL/* previous level */); -+ grabbed2free_mark(grabbed); -+ } -+ -+ done_carry_pool(pool); -+ -+ if (ret != 0) { -+ /* Shift or carry operation failed. */ -+ assert("jmacd-7325", ret < 0); -+ return ret; -+ } -+ -+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL; -+} -+ -+/* Make the final relocate/wander decision during forward parent-first squalloc -+ for a znode. For unformatted nodes this is done in -+ plugin/item/extent.c:extent_needs_allocation(). */ -+static int -+allocate_znode_loaded(znode * node, -+ const coord_t *parent_coord, flush_pos_t *pos) -+{ -+ int ret; -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ /* FIXME(D): We have the node write-locked and should have checked for ! -+ allocated() somewhere before reaching this point, but there can be a -+ race, so this assertion is bogus. */ -+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node))); -+ assert("jmacd-7988", znode_is_write_locked(node)); -+ assert("jmacd-7989", coord_is_invalid(parent_coord) -+ || znode_is_write_locked(parent_coord->node)); -+ -+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) || -+ znode_is_root(node) || -+ /* We have enough nodes to relocate no matter what. */ -+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) { -+ /* No need to decide with new nodes, they are treated the same -+ as relocate. If the root node is dirty, relocate. */ -+ if (pos->preceder.blk == 0) { -+ /* preceder is unknown and we have decided to relocate -+ node -- using of default value for search start is -+ better than search from block #0. */ -+ get_blocknr_hint_default(&pos->preceder.blk); -+ check_preceder(pos->preceder.blk); -+ } -+ -+ goto best_reloc; -+ -+ } else if (pos->preceder.blk == 0) { -+ /* If we don't know the preceder, leave it where it is. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ /* Make a decision based on block distance. */ -+ reiser4_block_nr dist; -+ reiser4_block_nr nblk = *znode_get_block(node); -+ -+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk)); -+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk)); -+ assert("jmacd-6174", pos->preceder.blk != 0); -+ -+ if (pos->preceder.blk == nblk - 1) { -+ /* Ideal. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ -+ dist = -+ (nblk < -+ pos->preceder.blk) ? (pos->preceder.blk - -+ nblk) : (nblk - -+ pos->preceder.blk); -+ -+ /* See if we can find a closer block -+ (forward direction only). */ -+ pos->preceder.max_dist = -+ min((reiser4_block_nr) sbinfo->flush. -+ relocate_distance, dist); -+ pos->preceder.level = znode_get_level(node); -+ -+ ret = allocate_znode_update(node, parent_coord, pos); -+ -+ pos->preceder.max_dist = 0; -+ -+ if (ret && (ret != -ENOSPC)) -+ return ret; -+ -+ if (ret == 0) { -+ /* Got a better allocation. */ -+ znode_make_reloc(node, pos->fq); -+ } else if (dist < sbinfo->flush.relocate_distance) { -+ /* The present allocation is good enough. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ /* Otherwise, try to relocate to the best -+ position. */ -+best_reloc: -+ ret = -+ allocate_znode_update(node, parent_coord, -+ pos); -+ if (ret != 0) -+ return ret; -+ -+ /* set JNODE_RELOC bit _after_ node gets -+ allocated */ -+ znode_make_reloc(node, pos->fq); -+ } -+ } -+ } -+ -+ /* This is the new preceder. 
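(To summarize the decision tree above: created, repacked, and root nodes, and leaf nodes of a relocate-heavy slum, are always relocated; a node with an unknown preceder wanders in place; a node already sitting immediately after the preceder wanders; otherwise the node is relocated if a closer block can be allocated, left to wander if it already lies within relocate_distance of the preceder, and relocated to the best available position in the remaining case.)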
*/ -+ pos->preceder.blk = *znode_get_block(node); -+ check_preceder(pos->preceder.blk); -+ pos->alloc_cnt += 1; -+ -+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk)); -+ -+ return 0; -+} -+ -+static int -+allocate_znode(znode * node, const coord_t *parent_coord, flush_pos_t *pos) -+{ -+ /* -+ * perform znode allocation with znode pinned in memory to avoid races -+ * with asynchronous emergency flush (which plays with -+ * JNODE_FLUSH_RESERVED bit). -+ */ -+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos)); -+} -+ -+/* A subroutine of allocate_znode, this is called first to see if there is a -+ close position to relocate to. It may return ENOSPC if there is no close -+ position. If there is no close position it may not relocate. This takes care -+ of updating the parent node with the relocated block address. */ -+static int -+allocate_znode_update(znode * node, const coord_t *parent_coord, -+ flush_pos_t *pos) -+{ -+ int ret; -+ reiser4_block_nr blk; -+ lock_handle uber_lock; -+ int flush_reserved_used = 0; -+ int grabbed; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ init_lh(&uber_lock); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ grabbed = ctx->grabbed_blocks; -+ -+ /* discard e-flush allocation */ -+ ret = zload(node); -+ if (ret) -+ return ret; -+ -+ if (ZF_ISSET(node, JNODE_CREATED)) { -+ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node))); -+ pos->preceder.block_stage = BLOCK_UNALLOCATED; -+ } else { -+ pos->preceder.block_stage = BLOCK_GRABBED; -+ -+ /* The disk space for relocating the @node is already reserved -+ * in "flush reserved" counter if @node is leaf, otherwise we -+ * grab space using BA_RESERVED (means grab space from whole -+ * disk not from only 95%). */ -+ if (znode_get_level(node) == LEAF_LEVEL) { -+ /* -+ * earlier (during do_jnode_make_dirty()) we decided -+ * that @node can possibly go into overwrite set and -+ * reserved block for its wandering location. -+ */ -+ txn_atom *atom = get_current_atom_locked(); -+ assert("nikita-3449", -+ ZF_ISSET(node, JNODE_FLUSH_RESERVED)); -+ flush_reserved2grabbed(atom, (__u64) 1); -+ spin_unlock_atom(atom); -+ /* -+ * we are trying to move node into relocate -+ * set. Allocation of relocated position "uses" -+ * reserved block. -+ */ -+ ZF_CLR(node, JNODE_FLUSH_RESERVED); -+ flush_reserved_used = 1; -+ } else { -+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED); -+ if (ret != 0) -+ goto exit; -+ } -+ } -+ -+ /* We may do not use 5% of reserved disk space here and flush will not -+ pack tightly. */ -+ ret = reiser4_alloc_block(&pos->preceder, &blk, -+ BA_FORMATTED | BA_PERMANENT); -+ if (ret) -+ goto exit; -+ -+ if (!ZF_ISSET(node, JNODE_CREATED) && -+ (ret = -+ reiser4_dealloc_block(znode_get_block(node), 0, -+ BA_DEFER | BA_FORMATTED))) -+ goto exit; -+ -+ if (likely(!znode_is_root(node))) { -+ item_plugin *iplug; -+ -+ iplug = item_plugin_by_coord(parent_coord); -+ assert("nikita-2954", iplug->f.update != NULL); -+ iplug->f.update(parent_coord, &blk); -+ -+ znode_make_dirty(parent_coord->node); -+ -+ } else { -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *uber; -+ -+ /* We take a longterm lock on the fake node in order to change -+ the root block number. This may cause atom fusion. */ -+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, -+ &uber_lock); -+ /* The fake node cannot be deleted, and we must have priority -+ here, and may not be confused with ENOSPC. 
*/ -+ assert("jmacd-74412", -+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC); -+ -+ if (ret) -+ goto exit; -+ -+ uber = uber_lock.node; -+ -+ write_lock_tree(tree); -+ tree->root_block = blk; -+ write_unlock_tree(tree); -+ -+ znode_make_dirty(uber); -+ } -+ -+ ret = znode_rehash(node, &blk); -+exit: -+ if (ret) { -+ /* Get flush reserved block back if something fails, because -+ * callers assume that on error block wasn't relocated and its -+ * flush reserved block wasn't used. */ -+ if (flush_reserved_used) { -+ /* -+ * ok, we failed to move node into relocate -+ * set. Restore status quo. -+ */ -+ grabbed2flush_reserved((__u64) 1); -+ ZF_SET(node, JNODE_FLUSH_RESERVED); -+ } -+ } -+ zrelse(node); -+ done_lh(&uber_lock); -+ grabbed2free_mark(grabbed); -+ return ret; -+} -+ -+/* JNODE INTERFACE */ -+ -+/* Lock a node (if formatted) and then get its parent locked, set the child's -+ coordinate in the parent. If the child is the root node, the above_root -+ znode is returned but the coord is not set. This function may cause atom -+ fusion, but it is only used for read locks (at this point) and therefore -+ fusion only occurs when the parent is already dirty. */ -+/* Hans adds this note: remember to ask how expensive this operation is vs. -+ storing parent pointer in jnodes. */ -+static int -+jnode_lock_parent_coord(jnode * node, -+ coord_t *coord, -+ lock_handle * parent_lh, -+ load_count * parent_zh, -+ znode_lock_mode parent_mode, int try) -+{ -+ int ret; -+ -+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node)); -+ assert("edward-54", jnode_is_unformatted(node) -+ || znode_is_any_locked(JZNODE(node))); -+ -+ if (!jnode_is_znode(node)) { -+ reiser4_key key; -+ tree_level stop_level = TWIG_LEVEL; -+ lookup_bias bias = FIND_EXACT; -+ -+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP)); -+ -+ /* The case when node is not znode, but can have parent coord -+ (unformatted node, node which represents cluster page, -+ etc..). Generate a key for the appropriate entry, search -+ in the tree using coord_by_key, which handles locking for -+ us. */ -+ -+ /* -+ * nothing is locked at this moment, so, nothing prevents -+ * concurrent truncate from removing jnode from inode. To -+ * prevent this spin-lock jnode. jnode can be truncated just -+ * after call to the jnode_build_key(), but this is ok, -+ * because coord_by_key() will just fail to find appropriate -+ * extent. 
-+ */ -+ spin_lock_jnode(node); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ jnode_build_key(node, &key); -+ ret = 0; -+ } else -+ ret = RETERR(-ENOENT); -+ spin_unlock_jnode(node); -+ -+ if (ret != 0) -+ return ret; -+ -+ if (jnode_is_cluster_page(node)) -+ stop_level = LEAF_LEVEL; -+ -+ assert("jmacd-1812", coord != NULL); -+ -+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh, -+ parent_mode, bias, stop_level, stop_level, -+ CBK_UNIQUE, NULL/*ra_info */); -+ switch (ret) { -+ case CBK_COORD_NOTFOUND: -+ assert("edward-1038", -+ ergo(jnode_is_cluster_page(node), -+ JF_ISSET(node, JNODE_HEARD_BANSHEE))); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ warning("nikita-3177", "Parent not found"); -+ return ret; -+ case CBK_COORD_FOUND: -+ if (coord->between != AT_UNIT) { -+ /* FIXME: comment needed */ -+ done_lh(parent_lh); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ warning("nikita-3178", -+ "Found but not happy: %i", -+ coord->between); -+ } -+ return RETERR(-ENOENT); -+ } -+ ret = incr_load_count_znode(parent_zh, parent_lh->node); -+ if (ret != 0) -+ return ret; -+ /* if (jnode_is_cluster_page(node)) { -+ races with write() are possible -+ check_child_cluster (parent_lh->node); -+ } -+ */ -+ break; -+ default: -+ return ret; -+ } -+ -+ } else { -+ int flags; -+ znode *z; -+ -+ z = JZNODE(node); -+ /* Formatted node case: */ -+ assert("jmacd-2061", !znode_is_root(z)); -+ -+ flags = GN_ALLOW_NOT_CONNECTED; -+ if (try) -+ flags |= GN_TRY_LOCK; -+ -+ ret = -+ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags); -+ if (ret != 0) -+ /* -E_REPEAT is ok here, it is handled by the caller. */ -+ return ret; -+ -+ /* Make the child's position "hint" up-to-date. (Unless above -+ root, which caller must check.) */ -+ if (coord != NULL) { -+ -+ ret = incr_load_count_znode(parent_zh, parent_lh->node); -+ if (ret != 0) { -+ warning("jmacd-976812386", -+ "incr_load_count_znode failed: %d", -+ ret); -+ return ret; -+ } -+ -+ ret = find_child_ptr(parent_lh->node, z, coord); -+ if (ret != 0) { -+ warning("jmacd-976812", -+ "find_child_ptr failed: %d", ret); -+ return ret; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+/* Get the (locked) next neighbor of a znode which is dirty and a member of the -+ same atom. If there is no next neighbor, or the neighbor is not in memory, or -+ there is a neighbor but it is not dirty or not in the same atom, -+ -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which -+ are not dirty; if so, @check_dirty should be 0 */ -+static int neighbor_in_slum(znode * node, /* starting point */ -+ lock_handle * lock, /* lock on starting point */ -+ sideof side, /* left or right direction we -+ seek the next node in */ -+ znode_lock_mode mode, /* kind of lock we want */ -+ int check_dirty, /* true if the neighbor should -+ be dirty */ -+ int use_upper_levels /* get neighbor by going through -+ upper levels */) -+{ -+ int ret; -+ int flags; -+ -+ assert("jmacd-6334", znode_is_connected(node)); -+ -+ flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0); -+ if (use_upper_levels) -+ flags |= GN_CAN_USE_UPPER_LEVELS; -+ -+ ret = reiser4_get_neighbor(lock, node, mode, flags); -+ if (ret) { -+ /* May return -ENOENT or -E_NO_NEIGHBOR.
*/ -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ if (ret == -ENOENT) -+ ret = RETERR(-E_NO_NEIGHBOR); -+ return ret; -+ } -+ if (!check_dirty) -+ return 0; -+ /* Check dirty bit of locked znode, no races here */ -+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY)) -+ return 0; -+ -+ done_lh(lock); -+ return RETERR(-E_NO_NEIGHBOR); -+} -+ -+/* Return true if two znodes have the same parent. This is called with both -+ nodes write-locked (for squeezing) so no tree lock is needed. */ -+static int znode_same_parents(znode * a, znode * b) -+{ -+ int result; -+ -+ assert("jmacd-7011", znode_is_write_locked(a)); -+ assert("jmacd-7012", znode_is_write_locked(b)); -+ -+ /* We lock the whole tree for this check.... I really don't like whole -+ * tree locks... -Hans */ -+ read_lock_tree(znode_get_tree(a)); -+ result = (znode_parent(a) == znode_parent(b)); -+ read_unlock_tree(znode_get_tree(a)); -+ return result; -+} -+ -+/* FLUSH SCAN */ -+ -+/* Initialize the flush_scan data structure. */ -+static void scan_init(flush_scan * scan) -+{ -+ memset(scan, 0, sizeof(*scan)); -+ init_lh(&scan->node_lock); -+ init_lh(&scan->parent_lock); -+ init_load_count(&scan->parent_load); -+ init_load_count(&scan->node_load); -+ coord_init_invalid(&scan->parent_coord, NULL); -+} -+ -+/* Release any resources held by the flush scan, e.g. release locks, -+ free memory, etc. */ -+static void scan_done(flush_scan * scan) -+{ -+ done_load_count(&scan->node_load); -+ if (scan->node != NULL) { -+ jput(scan->node); -+ scan->node = NULL; -+ } -+ done_load_count(&scan->parent_load); -+ done_lh(&scan->parent_lock); -+ done_lh(&scan->node_lock); -+} -+ -+/* Returns true if flush scanning is finished. */ -+int reiser4_scan_finished(flush_scan * scan) -+{ -+ return scan->stop || (scan->direction == RIGHT_SIDE && -+ scan->count >= scan->max_count); -+} -+ -+/* Return true if the scan should continue to the @tonode. True if the node -+ meets the same_slum_check condition. If not, deref the "left" node and stop -+ the scan. */ -+int reiser4_scan_goto(flush_scan * scan, jnode * tonode) -+{ -+ int go = same_slum_check(scan->node, tonode, 1, 0); -+ -+ if (!go) { -+ scan->stop = 1; -+ jput(tonode); -+ } -+ -+ return go; -+} -+ -+/* Set the current scan->node, refcount it, increment count by the @add_count -+ (number to count, e.g., skipped unallocated nodes), deref previous current, -+ and copy the current parent coordinate. */ -+int -+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count, -+ const coord_t *parent) -+{ -+ /* Release the old references, take the new reference. */ -+ done_load_count(&scan->node_load); -+ -+ if (scan->node != NULL) -+ jput(scan->node); -+ scan->node = node; -+ scan->count += add_count; -+ -+ /* This next stmt is somewhat inefficient. The reiser4_scan_extent() -+ code could delay this update step until it finishes and update the -+ parent_coord only once. It did that before, but there was a bug and -+ this was the easiest way to make it correct. */ -+ if (parent != NULL) -+ coord_dup(&scan->parent_coord, parent); -+ -+ /* Failure may happen at the incr_load_count call, but the caller can -+ assume the reference is safely taken. */ -+ return incr_load_count_jnode(&scan->node_load, node); -+} -+ -+/* Return true if scanning in the leftward direction. */ -+int reiser4_scanning_left(flush_scan * scan) -+{ -+ return scan->direction == LEFT_SIDE; -+} -+ -+/* Performs leftward scanning starting from either kind of node. Counts the -+ starting node. 
The right-scan object is passed in for the left-scan in order -+ to copy the parent of an unformatted starting position. This way we avoid -+ searching for the unformatted node's parent when scanning in each direction. -+ If we search for the parent once, it is set in both scan objects. The limit -+ parameter tells flush-scan when to stop. -+ -+ Rapid scanning is used only during scan_left, where we are interested in -+ finding the 'leftpoint' where we begin flushing. We are interested in -+ stopping at the left child of a twig that does not have a dirty left -+ neighbor. THIS IS A SPECIAL CASE. The problem is finding a way to flush only -+ those nodes without unallocated children, and it is difficult to solve in the -+ bottom-up flushing algorithm we are currently using. The problem can be -+ solved by scanning left at every level as we go upward, but this would -+ basically bring us back to using a top-down allocation strategy, which we -+ already tried (see BK history from May 2002), and has a different set of -+ problems. The top-down strategy makes avoiding unallocated children easier, -+ but makes it difficult to properly flush dirty children with clean parents -+ that would otherwise stop the top-down flush, only later to dirty the parent -+ once the children are flushed. So we solve the problem in the bottom-up -+ algorithm with a special case for twigs and leaves only. -+ -+ The first step in solving the problem is this rapid leftward scan. After we -+ determine that there are at least enough nodes counted to qualify for -+ FLUSH_RELOCATE_THRESHOLD, we are no longer interested in the exact count; we -+ are only interested in finding the best place to start the flush. -+ -+ We could choose one of two possibilities: -+ -+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left -+ neighbor. This requires checking one leaf per rapid-scan twig. -+ -+ 2. Stop at the leftmost child (of a twig) where there are no dirty children -+ of the twig to the left. This requires checking possibly all of the in-memory -+ children of each twig during the rapid scan. -+ -+ For now we implement the first policy. -+*/ -+static int -+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit) -+{ -+ int ret = 0; -+ -+ scan->max_count = limit; -+ scan->direction = LEFT_SIDE; -+ -+ ret = scan_set_current(scan, jref(node), 1, NULL); -+ if (ret != 0) -+ return ret; -+ -+ ret = scan_common(scan, right); -+ if (ret != 0) -+ return ret; -+ -+ /* Before rapid scanning, we need a lock on scan->node so that we can -+ get its parent, only if formatted. */ -+ if (jnode_is_znode(scan->node)) { -+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); -+ } -+ -+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD) -+ */ -+ return ret; -+} -+ -+/* Performs rightward scanning... Does not count the starting node. The limit -+ parameter is described in scan_left. If the starting node is unformatted then -+ the parent_coord was already set during scan_left. The rapid_after parameter -+ is not used during right-scanning. -+ -+ scan_right is only called if the scan_left operation does not count at least -+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. In that case, the limit -+ parameter is set to the difference between scan-left's count and -+ FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as -+ FLUSH_RELOCATE_THRESHOLD and then stops.
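The budget arithmetic in the comment above can be shown with a short self-contained sketch. The constant values and the two counting functions are made-up stand-ins; only the limit computation and the threshold test mirror the text:

#include <stdio.h>

#define FLUSH_RELOCATE_THRESHOLD 64   /* stand-in value */
#define FLUSH_SCAN_MAXNODES      8192 /* stand-in value */

/* stand-ins for the real scans: pretend this many dirty nodes were found */
static unsigned scan_left_count(unsigned limit)  { return limit < 40 ? limit : 40; }
static unsigned scan_right_count(unsigned limit) { return limit < 30 ? limit : 30; }

int main(void)
{
        unsigned left = scan_left_count(FLUSH_SCAN_MAXNODES);
        unsigned total = left;

        /* Scan right only when the left scan alone fell short of the
         * threshold, and only for the remaining budget. */
        if (left < FLUSH_RELOCATE_THRESHOLD)
                total += scan_right_count(FLUSH_RELOCATE_THRESHOLD - left);

        printf("leaf_relocate = %d\n", total >= FLUSH_RELOCATE_THRESHOLD);
        return 0;
}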
*/ -+static int scan_right(flush_scan * scan, jnode * node, unsigned limit) -+{ -+ int ret; -+ -+ scan->max_count = limit; -+ scan->direction = RIGHT_SIDE; -+ -+ ret = scan_set_current(scan, jref(node), 0, NULL); -+ if (ret != 0) -+ return ret; -+ -+ return scan_common(scan, NULL); -+} -+ -+/* Common code to perform left or right scanning. */ -+static int scan_common(flush_scan * scan, flush_scan * other) -+{ -+ int ret; -+ -+ assert("nikita-2376", scan->node != NULL); -+ assert("edward-54", jnode_is_unformatted(scan->node) -+ || jnode_is_znode(scan->node)); -+ -+ /* Special case for starting at an unformatted node. Optimization: we -+ only want to search for the parent (which requires a tree traversal) -+ once. Obviously, we shouldn't have to search once for the left scan -+ and again for the right scan. For this reason, if we search for the -+ parent during scan-left we then duplicate the coord/lock/load into -+ the scan-right object. */ -+ if (jnode_is_unformatted(scan->node)) { -+ ret = scan_unformatted(scan, other); -+ if (ret != 0) -+ return ret; -+ } -+ /* This loop expects to start at a formatted position and performs -+ chaining of formatted regions */ -+ while (!reiser4_scan_finished(scan)) { -+ -+ ret = scan_formatted(scan); -+ if (ret != 0) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int scan_unformatted(flush_scan * scan, flush_scan * other) -+{ -+ int ret = 0; -+ int try = 0; -+ -+ if (!coord_is_invalid(&scan->parent_coord)) -+ goto scan; -+ -+ /* set the parent coord from the current scan position */ -+ if (!jnode_is_unformatted(scan->node)) { -+ /* formatted position */ -+ -+ lock_handle lock; -+ assert("edward-301", jnode_is_znode(scan->node)); -+ init_lh(&lock); -+ -+ /* -+ * when flush starts from an unformatted node, the first thing -+ * it does is a tree traversal to find the formatted parent of -+ * the starting node. This parent is then kept locked across -+ * the scans to the left and to the right. This means that -+ * during the scan to the left we cannot take a left-ward lock, -+ * because this is dead-lock prone. So, if we are scanning to -+ * the left and there is already a lock held by this thread, -+ * jnode_lock_parent_coord() should use a try-lock. -+ */ -+ try = reiser4_scanning_left(scan) -+ && !lock_stack_isclean(get_current_lock_stack()); -+ /* We need the node locked to get the parent lock. We have to -+ take a write lock since there is at least one call path -+ where this znode is already write-locked by us. */ -+ ret = -+ longterm_lock_znode(&lock, JZNODE(scan->node), -+ ZNODE_WRITE_LOCK, -+ reiser4_scanning_left(scan) ? -+ ZNODE_LOCK_LOPRI : -+ ZNODE_LOCK_HIPRI); -+ if (ret != 0) -+ /* EINVAL or E_DEADLOCK here mean... try again! At this -+ point we've scanned too far and can't back out, just -+ start over. */ -+ return ret; -+ -+ ret = jnode_lock_parent_coord(scan->node, -+ &scan->parent_coord, -+ &scan->parent_lock, -+ &scan->parent_load, -+ ZNODE_WRITE_LOCK, try); -+ -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ done_lh(&lock); -+ if (ret == -E_REPEAT) { -+ scan->stop = 1; -+ return 0; -+ } -+ if (ret) -+ return ret; -+ -+ } else { -+ /* unformatted position */ -+ -+ ret = -+ jnode_lock_parent_coord(scan->node, &scan->parent_coord, -+ &scan->parent_lock, -+ &scan->parent_load, -+ ZNODE_WRITE_LOCK, try); -+ -+ if (IS_CBKERR(ret)) -+ return ret; -+ -+ if (ret == CBK_COORD_NOTFOUND) -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ return ret; -+ -+ /* parent was found */ -+ assert("jmacd-8661", other != NULL); -+ /* Duplicate the reference into the other flush_scan.
*/ -+ coord_dup(&other->parent_coord, &scan->parent_coord); -+ copy_lh(&other->parent_lock, &scan->parent_lock); -+ copy_load_count(&other->parent_load, &scan->parent_load); -+ } -+scan: -+ return scan_by_coord(scan); -+} -+ -+/* Performs left- or rightward scanning starting from a formatted node. Follow -+ left pointers under tree lock as long as: -+ -+ - node->left/right is non-NULL -+ - node->left/right is connected, dirty -+ - node->left/right belongs to the same atom -+ - scan has not reached maximum count -+*/ -+static int scan_formatted(flush_scan * scan) -+{ -+ int ret; -+ znode *neighbor = NULL; -+ -+ assert("jmacd-1401", !reiser4_scan_finished(scan)); -+ -+ do { -+ znode *node = JZNODE(scan->node); -+ -+ /* Node should be connected, but if not stop the scan. */ -+ if (!znode_is_connected(node)) { -+ scan->stop = 1; -+ break; -+ } -+ -+ /* Lock the tree, check-for and reference the next sibling. */ -+ read_lock_tree(znode_get_tree(node)); -+ -+ /* It may be that a node is inserted or removed between a node -+ and its left sibling while the tree lock is released, but the -+ flush-scan count does not need to be precise. Thus, we -+ release the tree lock as soon as we get the neighboring node. -+ */ -+ neighbor = -+ reiser4_scanning_left(scan) ? node->left : node->right; -+ if (neighbor != NULL) -+ zref(neighbor); -+ -+ read_unlock_tree(znode_get_tree(node)); -+ -+ /* If neighbor is NULL at the leaf level, need to check for an -+ unformatted sibling using the parent--break in any case. */ -+ if (neighbor == NULL) -+ break; -+ -+ /* Check the condition for going left, break if it is not met. -+ This also releases (jputs) the neighbor if false. */ -+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) -+ break; -+ -+ /* Advance the flush_scan state to the left, repeat. */ -+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL); -+ if (ret != 0) -+ return ret; -+ -+ } while (!reiser4_scan_finished(scan)); -+ -+ /* If neighbor is NULL then we reached the end of a formatted region, or -+ else the sibling is out of memory, now check for an extent to the -+ left (as long as LEAF_LEVEL). */ -+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL -+ || reiser4_scan_finished(scan)) { -+ scan->stop = 1; -+ return 0; -+ } -+ /* Otherwise, calls scan_by_coord for the right(left)most item of the -+ left(right) neighbor on the parent level, then possibly continue. */ -+ -+ coord_init_invalid(&scan->parent_coord, NULL); -+ return scan_unformatted(scan, NULL); -+} -+ -+/* NOTE-EDWARD: -+ This scans adjacent items of the same type and calls scan flush plugin for -+ each one. Performs left(right)ward scanning starting from a (possibly) -+ unformatted node. If we start from unformatted node, then we continue only if -+ the next neighbor is also unformatted. When called from scan_formatted, we -+ skip first iteration (to make sure that right(left)most item of the -+ left(right) neighbor on the parent level is of the same type and set -+ appropriate coord). */ -+static int scan_by_coord(flush_scan * scan) -+{ -+ int ret = 0; -+ int scan_this_coord; -+ lock_handle next_lock; -+ load_count next_load; -+ coord_t next_coord; -+ jnode *child; -+ item_plugin *iplug; -+ -+ init_lh(&next_lock); -+ init_load_count(&next_load); -+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 
1 : 0); -+ -+ /* set initial item id */ -+ iplug = item_plugin_by_coord(&scan->parent_coord); -+ -+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) { -+ if (scan_this_coord) { -+ /* Here we expect the unit to be scannable. It might not -+ * be, due to a race with extent->tail conversion. */ -+ if (iplug->f.scan == NULL) { -+ scan->stop = 1; -+ ret = -E_REPEAT; -+ /* skip the check at the end. */ -+ goto race; -+ } -+ -+ ret = iplug->f.scan(scan); -+ if (ret != 0) -+ goto exit; -+ -+ if (reiser4_scan_finished(scan)) { -+ checkchild(scan); -+ break; -+ } -+ } else { -+ /* the same race against truncate as above is possible -+ * here, it seems */ -+ -+ /* NOTE-JMACD: In this case, apply the same end-of-node -+ logic but don't scan the first coordinate. */ -+ assert("jmacd-1231", -+ item_is_internal(&scan->parent_coord)); -+ } -+ -+ if (iplug->f.utmost_child == NULL -+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) { -+ /* stop this coord and continue on the parent level */ -+ ret = -+ scan_set_current(scan, -+ ZJNODE(zref -+ (scan->parent_coord.node)), -+ 1, NULL); -+ if (ret != 0) -+ goto exit; -+ break; -+ } -+ -+ /* Either way, the invariant is that scan->parent_coord is set -+ to the parent of scan->node. Now get the next unit. */ -+ coord_dup(&next_coord, &scan->parent_coord); -+ coord_sideof_unit(&next_coord, scan->direction); -+ -+ /* If off-the-end of the twig, try the next twig. */ -+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) { -+ /* We take the write lock because we may start flushing -+ * from this coordinate. */ -+ ret = neighbor_in_slum(next_coord.node, -+ &next_lock, -+ scan->direction, -+ ZNODE_WRITE_LOCK, -+ 1 /* check dirty */, -+ 0 /* don't go through upper -+ levels */); -+ if (ret == -E_NO_NEIGHBOR) { -+ scan->stop = 1; -+ ret = 0; -+ break; -+ } -+ -+ if (ret != 0) -+ goto exit; -+ -+ ret = incr_load_count_znode(&next_load, next_lock.node); -+ if (ret != 0) -+ goto exit; -+ -+ coord_init_sideof_unit(&next_coord, next_lock.node, -+ sideof_reverse(scan->direction)); -+ } -+ -+ iplug = item_plugin_by_coord(&next_coord); -+ -+ /* Get the next child. */ -+ ret = -+ iplug->f.utmost_child(&next_coord, -+ sideof_reverse(scan->direction), -+ &child); -+ if (ret != 0) -+ goto exit; -+ /* If the next child is not in memory, or item_utmost_child -+ failed (most probably due to a race with unlink), stop -+ here. */ -+ if (child == NULL || IS_ERR(child)) { -+ scan->stop = 1; -+ checkchild(scan); -+ break; -+ } -+ -+ assert("nikita-2374", jnode_is_unformatted(child) -+ || jnode_is_znode(child)); -+ -+ /* See if it is dirty, part of the same atom. */ -+ if (!reiser4_scan_goto(scan, child)) { -+ checkchild(scan); -+ break; -+ } -+ -+ /* If so, make this child current. */ -+ ret = scan_set_current(scan, child, 1, &next_coord); -+ if (ret != 0) -+ goto exit; -+ -+ /* Now continue. If the child is formatted, we break out here; -+ the parent lock is released below. */ -+ if (jnode_is_znode(child)) -+ break; -+ -+ /* Otherwise, repeat the above loop with next_coord.
*/ -+ if (next_load.node != NULL) { -+ done_lh(&scan->parent_lock); -+ move_lh(&scan->parent_lock, &next_lock); -+ move_load_count(&scan->parent_load, &next_load); -+ } -+ } -+ -+ assert("jmacd-6233", -+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node)); -+exit: -+ checkchild(scan); -+race: /* skip the above check */ -+ if (jnode_is_znode(scan->node)) { -+ done_lh(&scan->parent_lock); -+ done_load_count(&scan->parent_load); -+ } -+ -+ done_load_count(&next_load); -+ done_lh(&next_lock); -+ return ret; -+} -+ -+/* FLUSH POS HELPERS */ -+ -+/* Initialize the fields of a flush_position. */ -+static void pos_init(flush_pos_t *pos) -+{ -+ memset(pos, 0, sizeof *pos); -+ -+ pos->state = POS_INVALID; -+ coord_init_invalid(&pos->coord, NULL); -+ init_lh(&pos->lock); -+ init_load_count(&pos->load); -+ -+ reiser4_blocknr_hint_init(&pos->preceder); -+} -+ -+/* The flush loop inside squalloc periodically checks pos_valid to determine -+ when "enough flushing" has been performed. This will return true until one -+ of the following conditions is met: -+ -+ 1. the number of flush-queued nodes has reached the kernel-supplied -+ "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the -+ kernel requested. When flushing to commit, this parameter is NULL. -+ -+ 2. pos_stop() is called because squalloc discovers that the "next" node in -+ the flush order is either non-existent, not dirty, or not in the same atom. -+*/ -+ -+static int pos_valid(flush_pos_t *pos) -+{ -+ return pos->state != POS_INVALID; -+} -+ -+/* Release any resources of a flush_position. Called when jnode_flush -+ finishes. */ -+static void pos_done(flush_pos_t *pos) -+{ -+ pos_stop(pos); -+ reiser4_blocknr_hint_done(&pos->preceder); -+ if (convert_data(pos)) -+ free_convert_data(pos); -+} -+ -+/* Reset the point and parent. Called during flush subroutines to terminate the -+ squalloc loop. */ -+static int pos_stop(flush_pos_t *pos) -+{ -+ pos->state = POS_INVALID; -+ done_lh(&pos->lock); -+ done_load_count(&pos->load); -+ coord_init_invalid(&pos->coord, NULL); -+ -+ if (pos->child) { -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ -+ return 0; -+} -+ -+/* Return the flush_position's block allocator hint. */ -+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos) -+{ -+ return &pos->preceder; -+} -+ -+flush_queue_t *reiser4_pos_fq(flush_pos_t *pos) -+{ -+ return pos->fq; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 90 -+ LocalWords: preceder -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/flush.h linux-2.6.30/fs/reiser4/flush.h ---- linux-2.6.30.orig/fs/reiser4/flush.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/flush.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,300 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* DECLARATIONS: */ -+ -+#if !defined(__REISER4_FLUSH_H__) -+#define __REISER4_FLUSH_H__ -+ -+#include "plugin/cluster.h" -+ -+/* The flush_scan data structure maintains the state of an in-progress -+ flush-scan on a single level of the tree. A flush-scan is used for counting -+ the number of adjacent nodes to flush, which is used to determine whether we -+ should relocate, and it is also used to find a starting point for flush. A -+ flush-scan object can scan in both right and left directions via the -+ scan_left() and scan_right() interfaces. The right- and left-variations are -+ similar but perform different functions.
When scanning left we (optionally -+ perform rapid scanning and then) longterm-lock the endpoint node. When -+ scanning right we are simply counting the number of adjacent, dirty nodes. */ -+struct flush_scan { -+ -+ /* The current number of nodes scanned on this level. */ -+ unsigned count; -+ -+ /* There may be a maximum number of nodes for a scan on any single -+ level. When going leftward, max_count is determined by -+ FLUSH_SCAN_MAXNODES (see reiser4.h) */ -+ unsigned max_count; -+ -+ /* Direction: Set to one of the sideof enumeration: -+ { LEFT_SIDE, RIGHT_SIDE }. */ -+ sideof direction; -+ -+ /* Initially @stop is set to false then set true once some condition -+ stops the search (e.g., we found a clean node before reaching -+ max_count or we found a node belonging to another atom). */ -+ int stop; -+ -+ /* The current scan position. If @node is non-NULL then its reference -+ count has been incremented to reflect this reference. */ -+ jnode *node; -+ -+ /* A handle for zload/zrelse of current scan position node. */ -+ load_count node_load; -+ -+ /* During left-scan, if the final position (a.k.a. endpoint node) is -+ formatted the node is locked using this lock handle. The endpoint -+ needs to be locked for transfer to the flush_position object after -+ scanning finishes. */ -+ lock_handle node_lock; -+ -+ /* When the position is unformatted, its parent, coordinate, and parent -+ zload/zrelse handle. */ -+ lock_handle parent_lock; -+ coord_t parent_coord; -+ load_count parent_load; -+ -+ /* The block allocator preceder hint. Sometimes flush_scan determines -+ what the preceder is and if so it sets it here, after which it is -+ copied into the flush_position. Otherwise, the preceder is computed -+ later. */ -+ reiser4_block_nr preceder_blk; -+}; -+ -+struct convert_item_info { -+ dc_item_stat d_cur; /* disk cluster state of the current item */ -+ dc_item_stat d_next; /* disk cluster state of the next slum item */ -+ int cluster_shift; /* disk cluster shift */ -+ flow_t flow; /* disk cluster data */ -+}; -+ -+struct convert_info { -+ int count; /* for squalloc terminating */ -+ item_plugin *iplug; /* current item plugin */ -+ struct convert_item_info *itm; /* current item info */ -+ struct cluster_handle clust; /* transform cluster */ -+}; -+ -+typedef enum flush_position_state { -+ POS_INVALID, /* Invalid or stopped pos, do not continue slum -+ * processing */ -+ POS_ON_LEAF, /* pos points to already prepped, locked -+ * formatted node at leaf level */ -+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field -+ * is used to traverse unformatted nodes */ -+ POS_TO_LEAF, /* pos is being moved to leaf level */ -+ POS_TO_TWIG, /* pos is being moved to twig level */ -+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is -+ * after rightmost unit of the current twig */ -+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal -+ * node */ -+} flushpos_state_t; -+ -+/* An encapsulation of the current flush point and all the parameters that are -+ passed through the entire squeeze-and-allocate stage of the flush routine. -+ A single flush_position object is constructed after left- and right-scanning -+ finishes. 
*/ -+struct flush_position { -+ flushpos_state_t state; -+ -+ coord_t coord; /* coord to traverse unformatted nodes */ -+ lock_handle lock; /* current lock we hold */ -+ load_count load; /* load status for current locked formatted node -+ */ -+ jnode *child; /* for passing a reference to unformatted child -+ * across pos state changes */ -+ -+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */ -+ int leaf_relocate; /* True if enough leaf-level nodes were -+ * found to suggest a relocate policy. */ -+ int alloc_cnt; /* The number of nodes allocated during squeeze -+ and allococate. */ -+ int prep_or_free_cnt; /* The number of nodes prepared for write -+ (allocate) or squeezed and freed. */ -+ flush_queue_t *fq; -+ long *nr_written; /* number of nodes submitted to disk */ -+ int flags; /* a copy of jnode_flush flags argument */ -+ -+ znode *prev_twig; /* previous parent pointer value, used to catch -+ * processing of new twig node */ -+ struct convert_info *sq; /* convert info */ -+ -+ unsigned long pos_in_unit; /* for extents only. Position -+ within an extent unit of first -+ jnode of slum */ -+ long nr_to_write; /* number of unformatted nodes to handle on -+ flush */ -+}; -+ -+static inline int item_convert_count(flush_pos_t *pos) -+{ -+ return pos->sq->count; -+} -+static inline void inc_item_convert_count(flush_pos_t *pos) -+{ -+ pos->sq->count++; -+} -+static inline void set_item_convert_count(flush_pos_t *pos, int count) -+{ -+ pos->sq->count = count; -+} -+static inline item_plugin *item_convert_plug(flush_pos_t *pos) -+{ -+ return pos->sq->iplug; -+} -+ -+static inline struct convert_info *convert_data(flush_pos_t *pos) -+{ -+ return pos->sq; -+} -+ -+static inline struct convert_item_info *item_convert_data(flush_pos_t *pos) -+{ -+ assert("edward-955", convert_data(pos)); -+ return pos->sq->itm; -+} -+ -+static inline struct tfm_cluster *tfm_cluster_sq(flush_pos_t *pos) -+{ -+ return &pos->sq->clust.tc; -+} -+ -+static inline struct tfm_stream *tfm_stream_sq(flush_pos_t *pos, -+ tfm_stream_id id) -+{ -+ assert("edward-854", pos->sq != NULL); -+ return get_tfm_stream(tfm_cluster_sq(pos), id); -+} -+ -+static inline int chaining_data_present(flush_pos_t *pos) -+{ -+ return convert_data(pos) && item_convert_data(pos); -+} -+ -+/* Returns true if next node contains next item of the disk cluster -+ so item convert data should be moved to the right slum neighbor. 
-+*/ -+static inline int should_chain_next_node(flush_pos_t *pos) -+{ -+ int result = 0; -+ -+ assert("edward-1007", chaining_data_present(pos)); -+ -+ switch (item_convert_data(pos)->d_next) { -+ case DC_CHAINED_ITEM: -+ result = 1; -+ break; -+ case DC_AFTER_CLUSTER: -+ break; -+ default: -+ impossible("edward-1009", "bad state of next slum item"); -+ } -+ return result; -+} -+ -+/* update item state in a disk cluster to assign conversion mode */ -+static inline void -+move_chaining_data(flush_pos_t *pos, int this_node/* where is next item */) -+{ -+ -+ assert("edward-1010", chaining_data_present(pos)); -+ -+ if (this_node == 0) { -+ /* next item is on the right neighbor */ -+ assert("edward-1011", -+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || -+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); -+ assert("edward-1012", -+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM); -+ -+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM; -+ item_convert_data(pos)->d_next = DC_INVALID_STATE; -+ } else { -+ /* next item is on the same node */ -+ assert("edward-1013", -+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || -+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); -+ assert("edward-1227", -+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER || -+ item_convert_data(pos)->d_next == DC_INVALID_STATE); -+ -+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER; -+ item_convert_data(pos)->d_next = DC_INVALID_STATE; -+ } -+} -+ -+static inline int should_convert_node(flush_pos_t *pos, znode * node) -+{ -+ return znode_convertible(node); -+} -+ -+/* true if there is attached convert item info */ -+static inline int should_convert_next_node(flush_pos_t *pos) -+{ -+ return convert_data(pos) && item_convert_data(pos); -+} -+ -+#define SQUALLOC_THRESHOLD 256 -+ -+static inline int should_terminate_squalloc(flush_pos_t *pos) -+{ -+ return convert_data(pos) && -+ !item_convert_data(pos) && -+ item_convert_count(pos) >= SQUALLOC_THRESHOLD; -+} -+ -+#if 1 -+#define check_convert_info(pos) \ -+do { \ -+ if (unlikely(should_convert_next_node(pos))) { \ -+ warning("edward-1006", "unprocessed chained data"); \ -+ printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \ -+ item_convert_data(pos)->d_cur, \ -+ item_convert_data(pos)->d_next, \ -+ item_convert_data(pos)->flow.length); \ -+ } \ -+} while (0) -+#else -+#define check_convert_info(pos) -+#endif /* REISER4_DEBUG */ -+ -+void free_convert_data(flush_pos_t *pos); -+/* used in extent.c */ -+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size, -+ const coord_t *parent); -+int reiser4_scan_finished(flush_scan * scan); -+int reiser4_scanning_left(flush_scan * scan); -+int reiser4_scan_goto(flush_scan * scan, jnode * tonode); -+txn_atom *atom_locked_by_fq(flush_queue_t *fq); -+int reiser4_alloc_extent(flush_pos_t *flush_pos); -+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *, -+ reiser4_key *stop_key); -+extern int reiser4_init_fqs(void); -+extern void reiser4_done_fqs(void); -+ -+#if REISER4_DEBUG -+ -+extern void reiser4_check_fq(const txn_atom *atom); -+extern atomic_t flush_cnt; -+ -+#define check_preceder(blk) \ -+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb())); -+extern void check_pos(flush_pos_t *pos); -+#else -+#define check_preceder(b) noop -+#define check_pos(pos) noop -+#endif -+ -+/* __REISER4_FLUSH_H__ */ -+#endif -+ -+/* Make Linus happy. 
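Seen in isolation, move_chaining_data() above is a two-variable state machine over the dc_item_stat values: depending on whether the next disk-cluster item sits on the same node or on the right neighbor, (d_cur, d_next) collapses to either (DC_AFTER_CLUSTER, DC_INVALID_STATE) or (DC_CHAINED_ITEM, DC_INVALID_STATE). A compact model, with the enum reduced to the states used here (the real dc_item_stat definition lives elsewhere in the patch):

    #include <stdio.h>

    /* Illustrative stand-ins for the patch's dc_item_stat values. */
    enum dc_state {
            DC_INVALID_STATE,
            DC_FIRST_ITEM,
            DC_CHAINED_ITEM,
            DC_AFTER_CLUSTER
    };

    struct chain { enum dc_state d_cur, d_next; };

    /* Mirrors move_chaining_data(): this_node == 0 means the next item
     * sits on the right neighbor, otherwise it is on the same node. */
    static void move_chaining_data(struct chain *c, int this_node)
    {
            c->d_cur  = this_node ? DC_AFTER_CLUSTER : DC_CHAINED_ITEM;
            c->d_next = DC_INVALID_STATE;
    }

    int main(void)
    {
            struct chain c = { DC_FIRST_ITEM, DC_CHAINED_ITEM };

            move_chaining_data(&c, 0);  /* next item chained to right neighbor */
            printf("d_cur=%d d_next=%d\n", c.d_cur, c.d_next);
            return 0;
    }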
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 90 -+ LocalWords: preceder -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/flush_queue.c linux-2.6.30/fs/reiser4/flush_queue.c ---- linux-2.6.30.orig/fs/reiser4/flush_queue.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/flush_queue.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,678 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+#include "debug.h" -+#include "super.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "vfs_ops.h" -+#include "writeout.h" -+#include "flush.h" -+ -+#include <linux/bio.h> -+#include <linux/mm.h> -+#include <linux/pagemap.h> -+#include <linux/blkdev.h> -+#include <linux/writeback.h> -+ -+/* A flush queue object is an accumulator for keeping jnodes prepared -+ by the jnode_flush() function for writing to disk. Those "queued" jnodes are -+ kept on the flush queue until memory pressure or atom commit asks -+ flush queues to write some or all from their jnodes. */ -+ -+/* -+ LOCKING: -+ -+ fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped -+ list protected by atom spin lock. fq->prepped list uses the following -+ locking: -+ -+ two ways to protect fq->prepped list for read-only list traversal: -+ -+ 1. atom spin-lock atom. -+ 2. fq is IN_USE, atom->nr_running_queues increased. -+ -+ and one for list modification: -+ -+ 1. atom is spin-locked and one condition is true: fq is IN_USE or -+ atom->nr_running_queues == 0. -+ -+ The deadlock-safe order for flush queues and atoms is: first lock atom, then -+ lock flush queue, then lock jnode. -+*/ -+ -+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE) -+#define fq_ready(fq) (!fq_in_use(fq)) -+ -+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0) -+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0) -+ -+/* get lock on atom from locked flush queue object */ -+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t *fq) -+{ -+ /* This code is similar to jnode_get_atom(), look at it for the -+ * explanation. */ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(fq->guard)); -+ -+ while (1) { -+ atom = fq->atom; -+ if (atom == NULL) -+ break; -+ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ atomic_inc(&atom->refcount); -+ spin_unlock(&(fq->guard)); -+ spin_lock_atom(atom); -+ spin_lock(&(fq->guard)); -+ -+ if (fq->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ spin_unlock(&(fq->guard)); -+ atom_dec_and_unlock(atom); -+ spin_lock(&(fq->guard)); -+ } -+ -+ return atom; -+} -+ -+txn_atom *atom_locked_by_fq(flush_queue_t *fq) -+{ -+ txn_atom *atom; -+ -+ spin_lock(&(fq->guard)); -+ atom = atom_locked_by_fq_nolock(fq); -+ spin_unlock(&(fq->guard)); -+ return atom; -+} -+ -+static void init_fq(flush_queue_t *fq) -+{ -+ memset(fq, 0, sizeof *fq); -+ -+ atomic_set(&fq->nr_submitted, 0); -+ -+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq)); -+ -+ init_waitqueue_head(&fq->wait); -+ spin_lock_init(&fq->guard); -+} -+ -+/* slab for flush queues */ -+static struct kmem_cache *fq_slab; -+ -+/** -+ * reiser4_init_fqs - create flush queue cache -+ * -+ * Initializes slab cache of flush queues. It is part of reiser4 module -+ * initialization. 
-+ */ -+int reiser4_init_fqs(void) -+{ -+ fq_slab = kmem_cache_create("fq", -+ sizeof(flush_queue_t), -+ 0, SLAB_HWCACHE_ALIGN, NULL); -+ if (fq_slab == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_fqs - delete flush queue cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_fqs(void) -+{ -+ destroy_reiser4_cache(&fq_slab); -+} -+ -+/* create new flush queue object */ -+static flush_queue_t *create_fq(gfp_t gfp) -+{ -+ flush_queue_t *fq; -+ -+ fq = kmem_cache_alloc(fq_slab, gfp); -+ if (fq) -+ init_fq(fq); -+ -+ return fq; -+} -+ -+/* adjust atom's and flush queue's counters of queued nodes */ -+static void count_enqueued_node(flush_queue_t *fq) -+{ -+ ON_DEBUG(fq->atom->num_queued++); -+} -+ -+static void count_dequeued_node(flush_queue_t *fq) -+{ -+ assert("zam-993", fq->atom->num_queued > 0); -+ ON_DEBUG(fq->atom->num_queued--); -+} -+ -+/* attach flush queue object to the atom */ -+static void attach_fq(txn_atom *atom, flush_queue_t *fq) -+{ -+ assert_spin_locked(&(atom->alock)); -+ list_add(&fq->alink, &atom->flush_queues); -+ fq->atom = atom; -+ ON_DEBUG(atom->nr_flush_queues++); -+} -+ -+static void detach_fq(flush_queue_t *fq) -+{ -+ assert_spin_locked(&(fq->atom->alock)); -+ -+ spin_lock(&(fq->guard)); -+ list_del_init(&fq->alink); -+ assert("vs-1456", fq->atom->nr_flush_queues > 0); -+ ON_DEBUG(fq->atom->nr_flush_queues--); -+ fq->atom = NULL; -+ spin_unlock(&(fq->guard)); -+} -+ -+/* destroy flush queue object */ -+static void done_fq(flush_queue_t *fq) -+{ -+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq))); -+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0); -+ -+ kmem_cache_free(fq_slab, fq); -+} -+ -+/* */ -+static void mark_jnode_queued(flush_queue_t *fq, jnode * node) -+{ -+ JF_SET(node, JNODE_FLUSH_QUEUED); -+ count_enqueued_node(fq); -+} -+ -+/* Putting jnode into the flush queue. Both atom and jnode should be -+ spin-locked. */ -+void queue_jnode(flush_queue_t *fq, jnode * node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("zam-713", node->atom != NULL); -+ assert_spin_locked(&(node->atom->alock)); -+ assert("zam-716", fq->atom != NULL); -+ assert("zam-717", fq->atom == node->atom); -+ assert("zam-907", fq_in_use(fq)); -+ -+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-826", JF_ISSET(node, JNODE_RELOC)); -+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("vs-1481", NODE_LIST(node) != FQ_LIST); -+ -+ mark_jnode_queued(fq, node); -+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq)); -+ -+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), -+ FQ_LIST, 1)); -+} -+ -+/* repeatable process for waiting io completion on a flush queue object */ -+static int wait_io(flush_queue_t *fq, int *nr_io_errors) -+{ -+ assert("zam-738", fq->atom != NULL); -+ assert_spin_locked(&(fq->atom->alock)); -+ assert("zam-736", fq_in_use(fq)); -+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq))); -+ -+ if (atomic_read(&fq->nr_submitted) != 0) { -+ struct super_block *super; -+ -+ spin_unlock_atom(fq->atom); -+ -+ assert("nikita-3013", reiser4_schedulable()); -+ -+ super = reiser4_get_current_sb(); -+ -+ /* FIXME: this is instead of blk_run_queues() */ -+ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping); -+ -+ if (!(super->s_flags & MS_RDONLY)) -+ wait_event(fq->wait, -+ atomic_read(&fq->nr_submitted) == 0); -+ -+ /* Ask the caller to re-acquire the locks and call this -+ function again. 
Note: this technique is commonly used in -+ the txnmgr code. */ -+ return -E_REPEAT; -+ } -+ -+ *nr_io_errors += atomic_read(&fq->nr_errors); -+ return 0; -+} -+ -+/* wait on I/O completion, re-submit dirty nodes to write */ -+static int finish_fq(flush_queue_t *fq, int *nr_io_errors) -+{ -+ int ret; -+ txn_atom *atom = fq->atom; -+ -+ assert("zam-801", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-762", fq_in_use(fq)); -+ -+ ret = wait_io(fq, nr_io_errors); -+ if (ret) -+ return ret; -+ -+ detach_fq(fq); -+ done_fq(fq); -+ -+ reiser4_atom_send_event(atom); -+ -+ return 0; -+} -+ -+/* wait for all i/o for given atom to be completed, actually do one iteration -+ on that and return -E_REPEAT if there more iterations needed */ -+static int finish_all_fq(txn_atom * atom, int *nr_io_errors) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ if (list_empty_careful(&atom->flush_queues)) -+ return 0; -+ -+ list_for_each_entry(fq, &atom->flush_queues, alink) { -+ if (fq_ready(fq)) { -+ int ret; -+ -+ mark_fq_in_use(fq); -+ assert("vs-1247", fq->owner == NULL); -+ ON_DEBUG(fq->owner = current); -+ ret = finish_fq(fq, nr_io_errors); -+ -+ if (*nr_io_errors) -+ reiser4_handle_error(); -+ -+ if (ret) { -+ reiser4_fq_put(fq); -+ return ret; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ return -E_REPEAT; -+ } -+ } -+ -+ /* All flush queues are in use; atom remains locked */ -+ return -EBUSY; -+} -+ -+/* wait all i/o for current atom */ -+int current_atom_finish_all_fq(void) -+{ -+ txn_atom *atom; -+ int nr_io_errors = 0; -+ int ret = 0; -+ -+ do { -+ while (1) { -+ atom = get_current_atom_locked(); -+ ret = finish_all_fq(atom, &nr_io_errors); -+ if (ret != -EBUSY) -+ break; -+ reiser4_atom_wait_event(atom); -+ } -+ } while (ret == -E_REPEAT); -+ -+ /* we do not need locked atom after this function finishes, SUCCESS or -+ -EBUSY are two return codes when atom remains locked after -+ finish_all_fq */ -+ if (!ret) -+ spin_unlock_atom(atom); -+ -+ assert_spin_not_locked(&(atom->alock)); -+ -+ if (ret) -+ return ret; -+ -+ if (nr_io_errors) -+ return RETERR(-EIO); -+ -+ return 0; -+} -+ -+/* change node->atom field for all jnode from given list */ -+static void -+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom) -+{ -+ jnode *cur; -+ -+ list_for_each_entry(cur, list, capture_link) { -+ spin_lock_jnode(cur); -+ cur->atom = atom; -+ spin_unlock_jnode(cur); -+ } -+} -+ -+/* support for atom fusion operation */ -+void reiser4_fuse_fq(txn_atom *to, txn_atom *from) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(to->alock)); -+ assert_spin_locked(&(from->alock)); -+ -+ list_for_each_entry(fq, &from->flush_queues, alink) { -+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to); -+ spin_lock(&(fq->guard)); -+ fq->atom = to; -+ spin_unlock(&(fq->guard)); -+ } -+ -+ list_splice_init(&from->flush_queues, to->flush_queues.prev); -+ -+#if REISER4_DEBUG -+ to->num_queued += from->num_queued; -+ to->nr_flush_queues += from->nr_flush_queues; -+ from->nr_flush_queues = 0; -+#endif -+} -+ -+#if REISER4_DEBUG -+int atom_fq_parts_are_clean(txn_atom * atom) -+{ -+ assert("zam-915", atom != NULL); -+ return list_empty_careful(&atom->flush_queues); -+} -+#endif -+/* Bio i/o completion routine for reiser4 write operations. 
*/ -+static void -+end_io_handler(struct bio *bio, int err) -+{ -+ int i; -+ int nr_errors = 0; -+ flush_queue_t *fq; -+ -+ assert("zam-958", bio->bi_rw & WRITE); -+ -+ if (err == -EOPNOTSUPP) -+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); -+ -+ /* we expect that bio->private is set to NULL or fq object which is used -+ * for synchronization and error counting. */ -+ fq = bio->bi_private; -+ /* Check all elements of io_vec for correct write completion. */ -+ for (i = 0; i < bio->bi_vcnt; i += 1) { -+ struct page *pg = bio->bi_io_vec[i].bv_page; -+ -+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageError(pg); -+ nr_errors++; -+ } -+ -+ { -+ /* jnode WRITEBACK ("write is in progress bit") is -+ * atomically cleared here. */ -+ jnode *node; -+ -+ assert("zam-736", pg != NULL); -+ assert("zam-736", PagePrivate(pg)); -+ node = jprivate(pg); -+ -+ JF_CLR(node, JNODE_WRITEBACK); -+ } -+ -+ end_page_writeback(pg); -+ page_cache_release(pg); -+ } -+ -+ if (fq) { -+ /* count i/o error in fq object */ -+ atomic_add(nr_errors, &fq->nr_errors); -+ -+ /* If all write requests registered in this "fq" are done we up -+ * the waiter. */ -+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted)) -+ wake_up(&fq->wait); -+ } -+ -+ bio_put(bio); -+} -+ -+/* Count I/O requests which will be submitted by @bio in given flush queues -+ @fq */ -+void add_fq_to_bio(flush_queue_t *fq, struct bio *bio) -+{ -+ bio->bi_private = fq; -+ bio->bi_end_io = end_io_handler; -+ -+ if (fq) -+ atomic_add(bio->bi_vcnt, &fq->nr_submitted); -+} -+ -+/* Move all queued nodes out from @fq->prepped list. */ -+static void release_prepped_list(flush_queue_t *fq) -+{ -+ txn_atom *atom; -+ -+ assert("zam-904", fq_in_use(fq)); -+ atom = atom_locked_by_fq(fq); -+ -+ while (!list_empty(ATOM_FQ_LIST(fq))) { -+ jnode *cur; -+ -+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link); -+ list_del_init(&cur->capture_link); -+ -+ count_dequeued_node(fq); -+ spin_lock_jnode(cur); -+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR)); -+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC)); -+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED)); -+ JF_CLR(cur, JNODE_FLUSH_QUEUED); -+ -+ if (JF_ISSET(cur, JNODE_DIRTY)) { -+ list_add_tail(&cur->capture_link, -+ ATOM_DIRTY_LIST(atom, -+ jnode_get_level(cur))); -+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, -+ DIRTY_LIST, 1)); -+ } else { -+ list_add_tail(&cur->capture_link, -+ ATOM_CLEAN_LIST(atom)); -+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, -+ CLEAN_LIST, 1)); -+ } -+ -+ spin_unlock_jnode(cur); -+ } -+ -+ if (--atom->nr_running_queues == 0) -+ reiser4_atom_send_event(atom); -+ -+ spin_unlock_atom(atom); -+} -+ -+/* Submit write requests for nodes on the already filled flush queue @fq. -+ -+ @fq: flush queue object which contains jnodes we can (and will) write. -+ @return: number of submitted blocks (>=0) if success, otherwise -- an error -+ code (<0). */ -+int reiser4_write_fq(flush_queue_t *fq, long *nr_submitted, int flags) -+{ -+ int ret; -+ txn_atom *atom; -+ -+ while (1) { -+ atom = atom_locked_by_fq(fq); -+ assert("zam-924", atom); -+ /* do not write fq in parallel. */ -+ if (atom->nr_running_queues == 0 -+ || !(flags & WRITEOUT_SINGLE_STREAM)) -+ break; -+ reiser4_atom_wait_event(atom); -+ } -+ -+ atom->nr_running_queues++; -+ spin_unlock_atom(atom); -+ -+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags); -+ release_prepped_list(fq); -+ -+ return ret; -+} -+ -+/* Getting flush queue object for exclusive use by one thread. 
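add_fq_to_bio() and end_io_handler() above form a submit/complete pair around fq->nr_submitted: the submitter adds bio->bi_vcnt, each completion subtracts its count, and whoever drives the counter to zero wakes the waiter. Sketched with C11 atomics and two threads (a spin replaces the wait queue to keep the demo self-contained):

    #include <stdio.h>
    #include <stdatomic.h>
    #include <pthread.h>

    static atomic_int nr_submitted;

    /* Completion side: one "bio" finishing all of its pages. */
    static void *end_io(void *arg)
    {
            int vcnt = *(int *)arg;

            /* like atomic_sub_and_test(): the last completer wakes the waiter */
            if (atomic_fetch_sub(&nr_submitted, vcnt) - vcnt == 0)
                    printf("last completion, waking waiter\n");
            return NULL;
    }

    int main(void)
    {
            pthread_t t[2];
            int vcnt[2] = { 4, 2 };

            /* submit side, like add_fq_to_bio() */
            atomic_fetch_add(&nr_submitted, vcnt[0] + vcnt[1]);

            pthread_create(&t[0], NULL, end_io, &vcnt[0]);
            pthread_create(&t[1], NULL, end_io, &vcnt[1]);

            /* wait_io() analogue: wait until every submitted page completed */
            while (atomic_load(&nr_submitted) != 0)
                    ;   /* the kernel sleeps on fq->wait instead of spinning */

            pthread_join(t[0], NULL);
            pthread_join(t[1], NULL);
            printf("nr_submitted=%d\n", atomic_load(&nr_submitted));
            return 0;
    }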
May require -+ several iterations which is indicated by -E_REPEAT return code. -+ -+ This function does not contain code for obtaining an atom lock because an -+ atom lock is obtained by different ways in different parts of reiser4, -+ usually it is current atom, but we need a possibility for getting fq for the -+ atom of given jnode. */ -+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink); -+ while (&atom->flush_queues != &fq->alink) { -+ spin_lock(&(fq->guard)); -+ -+ if (fq_ready(fq)) { -+ mark_fq_in_use(fq); -+ assert("vs-1246", fq->owner == NULL); -+ ON_DEBUG(fq->owner = current); -+ spin_unlock(&(fq->guard)); -+ -+ if (*new_fq) -+ done_fq(*new_fq); -+ -+ *new_fq = fq; -+ -+ return 0; -+ } -+ -+ spin_unlock(&(fq->guard)); -+ -+ fq = list_entry(fq->alink.next, flush_queue_t, alink); -+ } -+ -+ /* Use previously allocated fq object */ -+ if (*new_fq) { -+ mark_fq_in_use(*new_fq); -+ assert("vs-1248", (*new_fq)->owner == 0); -+ ON_DEBUG((*new_fq)->owner = current); -+ attach_fq(atom, *new_fq); -+ -+ return 0; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ *new_fq = create_fq(gfp); -+ -+ if (*new_fq == NULL) -+ return RETERR(-ENOMEM); -+ -+ return RETERR(-E_REPEAT); -+} -+ -+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t **new_fq) -+{ -+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get()); -+} -+ -+/* A wrapper around reiser4_fq_by_atom for getting a flush queue -+ object for current atom, if success fq->atom remains locked. */ -+flush_queue_t *get_fq_for_current_atom(void) -+{ -+ flush_queue_t *fq = NULL; -+ txn_atom *atom; -+ int ret; -+ -+ do { -+ atom = get_current_atom_locked(); -+ ret = reiser4_fq_by_atom(atom, &fq); -+ } while (ret == -E_REPEAT); -+ -+ if (ret) -+ return ERR_PTR(ret); -+ return fq; -+} -+ -+/* Releasing flush queue object after exclusive use */ -+void reiser4_fq_put_nolock(flush_queue_t *fq) -+{ -+ assert("zam-747", fq->atom != NULL); -+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq))); -+ mark_fq_ready(fq); -+ assert("vs-1245", fq->owner == current); -+ ON_DEBUG(fq->owner = NULL); -+} -+ -+void reiser4_fq_put(flush_queue_t *fq) -+{ -+ txn_atom *atom; -+ -+ spin_lock(&(fq->guard)); -+ atom = atom_locked_by_fq_nolock(fq); -+ -+ assert("zam-746", atom != NULL); -+ -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(atom); -+ -+ spin_unlock(&(fq->guard)); -+ spin_unlock_atom(atom); -+} -+ -+/* A part of atom object initialization related to the embedded flush queue -+ list head */ -+ -+void init_atom_fq_parts(txn_atom *atom) -+{ -+ INIT_LIST_HEAD(&atom->flush_queues); -+} -+ -+#if REISER4_DEBUG -+ -+void reiser4_check_fq(const txn_atom *atom) -+{ -+ /* check number of nodes on all atom's flush queues */ -+ flush_queue_t *fq; -+ int count; -+ struct list_head *pos; -+ -+ count = 0; -+ list_for_each_entry(fq, &atom->flush_queues, alink) { -+ spin_lock(&(fq->guard)); -+ /* calculate number of jnodes on fq' list of prepped jnodes */ -+ list_for_each(pos, ATOM_FQ_LIST(fq)) -+ count++; -+ spin_unlock(&(fq->guard)); -+ } -+ if (count != atom->fq) -+ warning("", "fq counter %d, real %d\n", atom->fq, count); -+ -+} -+ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/forward.h linux-2.6.30/fs/reiser4/forward.h ---- 
linux-2.6.30.orig/fs/reiser4/forward.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/forward.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,256 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* Forward declarations. Thank you Kernighan. */ -+ -+#if !defined(__REISER4_FORWARD_H__) -+#define __REISER4_FORWARD_H__ -+ -+#include <asm/errno.h> -+#include <linux/types.h> -+ -+typedef struct zlock zlock; -+typedef struct lock_stack lock_stack; -+typedef struct lock_handle lock_handle; -+typedef struct znode znode; -+typedef struct flow flow_t; -+typedef struct coord coord_t; -+typedef struct tree_access_pointer tap_t; -+typedef struct reiser4_object_create_data reiser4_object_create_data; -+typedef union reiser4_plugin reiser4_plugin; -+typedef __u16 reiser4_plugin_id; -+typedef __u64 reiser4_plugin_groups; -+typedef struct item_plugin item_plugin; -+typedef struct jnode_plugin jnode_plugin; -+typedef struct reiser4_item_data reiser4_item_data; -+typedef union reiser4_key reiser4_key; -+typedef struct reiser4_tree reiser4_tree; -+typedef struct carry_cut_data carry_cut_data; -+typedef struct carry_kill_data carry_kill_data; -+typedef struct carry_tree_op carry_tree_op; -+typedef struct carry_tree_node carry_tree_node; -+typedef struct carry_plugin_info carry_plugin_info; -+typedef struct reiser4_journal reiser4_journal; -+typedef struct txn_atom txn_atom; -+typedef struct txn_handle txn_handle; -+typedef struct txn_mgr txn_mgr; -+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc; -+typedef struct reiser4_context reiser4_context; -+typedef struct carry_level carry_level; -+typedef struct blocknr_set_entry blocknr_set_entry; -+/* super_block->s_fs_info points to this */ -+typedef struct reiser4_super_info_data reiser4_super_info_data; -+/* next two objects are fields of reiser4_super_info_data */ -+typedef struct reiser4_oid_allocator reiser4_oid_allocator; -+typedef struct reiser4_space_allocator reiser4_space_allocator; -+ -+typedef struct flush_scan flush_scan; -+typedef struct flush_position flush_pos_t; -+ -+typedef unsigned short pos_in_node_t; -+#define MAX_POS_IN_NODE 65535 -+ -+typedef struct jnode jnode; -+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint; -+ -+typedef struct uf_coord uf_coord_t; -+typedef struct hint hint_t; -+ -+typedef struct ktxnmgrd_context ktxnmgrd_context; -+ -+struct inode; -+struct page; -+struct file; -+struct dentry; -+struct super_block; -+ -+/* return values of coord_by_key(). cbk == coord_by_key */ -+typedef enum { -+ CBK_COORD_FOUND = 0, -+ CBK_COORD_NOTFOUND = -ENOENT, -+} lookup_result; -+ -+/* results of lookup with directory file */ -+typedef enum { -+ FILE_NAME_FOUND = 0, -+ FILE_NAME_NOTFOUND = -ENOENT, -+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, -+ IO_ERROR return codes for each search. */ -+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, -+ IO_ERROR return codes for each search. */ -+} file_lookup_result; -+ -+/* behaviors of lookup. If coord we are looking for is actually in a tree, -+ both coincide. */ -+typedef enum { -+ /* search exactly for the coord with key given */ -+ FIND_EXACT, -+ /* search for coord with the maximal key not greater than one -+ given */ -+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */ -+} lookup_bias; -+ -+typedef enum { -+ /* number of leaf level of the tree -+ The fake root has (tree_level=0). */ -+ LEAF_LEVEL = 1, -+ -+ /* number of level one above leaf level of the tree. 
-+ -+ It is supposed that internal tree used by reiser4 to store file -+ system data and meta data will have height 2 initially (when -+ created by mkfs). -+ */ -+ TWIG_LEVEL = 2, -+} tree_level; -+ -+/* The "real" maximum ztree height is the 0-origin size of any per-level -+ array, since the zero'th level is not used. */ -+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL) -+ -+/* enumeration of possible mutual position of item and coord. This enum is -+ return type of ->is_in_item() item plugin method which see. */ -+typedef enum { -+ /* coord is on the left of an item */ -+ IP_ON_THE_LEFT, -+ /* coord is inside item */ -+ IP_INSIDE, -+ /* coord is inside item, but to the right of the rightmost unit of -+ this item */ -+ IP_RIGHT_EDGE, -+ /* coord is on the right of an item */ -+ IP_ON_THE_RIGHT -+} interposition; -+ -+/* type of lock to acquire on znode before returning it to caller */ -+typedef enum { -+ ZNODE_NO_LOCK = 0, -+ ZNODE_READ_LOCK = 1, -+ ZNODE_WRITE_LOCK = 2, -+} znode_lock_mode; -+ -+/* type of lock request */ -+typedef enum { -+ ZNODE_LOCK_LOPRI = 0, -+ ZNODE_LOCK_HIPRI = (1 << 0), -+ -+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to -+ longterm_lock_znode will not sleep waiting for the lock to become -+ available. If the lock is unavailable, reiser4_znode_lock will -+ immediately return the value -E_REPEAT. */ -+ ZNODE_LOCK_NONBLOCK = (1 << 1), -+ /* An option for longterm_lock_znode which prevents atom fusion */ -+ ZNODE_LOCK_DONT_FUSE = (1 << 2) -+} znode_lock_request; -+ -+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op; -+ -+/* used to specify direction of shift. These must be -1 and 1 */ -+typedef enum { -+ SHIFT_LEFT = 1, -+ SHIFT_RIGHT = -1 -+} shift_direction; -+ -+typedef enum { -+ LEFT_SIDE, -+ RIGHT_SIDE -+} sideof; -+ -+#define round_up(value, order) \ -+ ((typeof(value))(((long) (value) + (order) - 1U) & \ -+ ~((order) - 1))) -+ -+/* values returned by squalloc_right_neighbor and its auxiliary functions */ -+typedef enum { -+ /* unit of internal item is moved */ -+ SUBTREE_MOVED = 0, -+ /* nothing else can be squeezed into left neighbor */ -+ SQUEEZE_TARGET_FULL = 1, -+ /* all content of node is squeezed into its left neighbor */ -+ SQUEEZE_SOURCE_EMPTY = 2, -+ /* one more item is copied (this is only returned by -+ allocate_and_copy_extent to squalloc_twig)) */ -+ SQUEEZE_CONTINUE = 3 -+} squeeze_result; -+ -+/* Do not change items ids. If you do - there will be format change */ -+typedef enum { -+ STATIC_STAT_DATA_ID = 0x0, -+ SIMPLE_DIR_ENTRY_ID = 0x1, -+ COMPOUND_DIR_ID = 0x2, -+ NODE_POINTER_ID = 0x3, -+ EXTENT_POINTER_ID = 0x5, -+ FORMATTING_ID = 0x6, -+ CTAIL_ID = 0x7, -+ BLACK_BOX_ID = 0x8, -+ LAST_ITEM_ID = 0x9 -+} item_id; -+ -+/* Flags passed to jnode_flush() to allow it to distinguish default settings -+ based on whether commit() was called or VM memory pressure was applied. */ -+typedef enum { -+ /* submit flush queue to disk at jnode_flush completion */ -+ JNODE_FLUSH_WRITE_BLOCKS = 1, -+ -+ /* flush is called for commit */ -+ JNODE_FLUSH_COMMIT = 2, -+ /* not implemented */ -+ JNODE_FLUSH_MEMORY_FORMATTED = 4, -+ -+ /* not implemented */ -+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8, -+} jnode_flush_flags; -+ -+/* Flags to insert/paste carry operations. Currently they only used in -+ flushing code, but in future, they can be used to optimize for repetitive -+ accesses. 
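The round_up() macro above uses the usual add-then-mask trick, which is only correct when order is a power of two. A quick standalone check (macro copied verbatim from the patch; typeof is the GNU C extension the kernel relies on):

    #include <stdio.h>

    #define round_up(value, order) \
            ((typeof(value))(((long) (value) + (order) - 1U) & \
                             ~((order) - 1)))

    int main(void)
    {
            /* order must be a power of two for the mask trick to hold */
            printf("%ld %ld %ld\n",
                   round_up(5L, 8),     /* -> 8  */
                   round_up(8L, 8),     /* -> 8  */
                   round_up(9L, 8));    /* -> 16 */
            return 0;
    }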
*/ -+typedef enum { -+ /* carry is not allowed to shift data to the left when trying to find -+ free space */ -+ COPI_DONT_SHIFT_LEFT = (1 << 0), -+ /* carry is not allowed to shift data to the right when trying to find -+ free space */ -+ COPI_DONT_SHIFT_RIGHT = (1 << 1), -+ /* carry is not allowed to allocate new node(s) when trying to find -+ free space */ -+ COPI_DONT_ALLOCATE = (1 << 2), -+ /* try to load left neighbor if its not in a cache */ -+ COPI_LOAD_LEFT = (1 << 3), -+ /* try to load right neighbor if its not in a cache */ -+ COPI_LOAD_RIGHT = (1 << 4), -+ /* shift insertion point to the left neighbor */ -+ COPI_GO_LEFT = (1 << 5), -+ /* shift insertion point to the right neighbor */ -+ COPI_GO_RIGHT = (1 << 6), -+ /* try to step back into original node if insertion into new node -+ fails after shifting data there. */ -+ COPI_STEP_BACK = (1 << 7) -+} cop_insert_flag; -+ -+typedef enum { -+ SAFE_UNLINK, /* safe-link for unlink */ -+ SAFE_TRUNCATE /* safe-link for truncate */ -+} reiser4_safe_link_t; -+ -+/* this is to show on which list of atom jnode is */ -+typedef enum { -+ NOT_CAPTURED, -+ DIRTY_LIST, -+ CLEAN_LIST, -+ FQ_LIST, -+ WB_LIST, -+ OVRWR_LIST -+} atom_list; -+ -+/* __REISER4_FORWARD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/fsdata.c linux-2.6.30/fs/reiser4/fsdata.c ---- linux-2.6.30.orig/fs/reiser4/fsdata.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/fsdata.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,804 @@ -+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "fsdata.h" -+#include "inode.h" -+ -+ -+/* cache or dir_cursors */ -+static struct kmem_cache *d_cursor_cache; -+ -+/* list of unused cursors */ -+static LIST_HEAD(cursor_cache); -+ -+/* number of cursors in list of ununsed cursors */ -+static unsigned long d_cursor_unused = 0; -+ -+/* spinlock protecting manipulations with dir_cursor's hash table and lists */ -+DEFINE_SPINLOCK(d_lock); -+ -+static reiser4_file_fsdata *create_fsdata(struct file *file); -+static int file_is_stateless(struct file *file); -+static void free_fsdata(reiser4_file_fsdata *fsdata); -+static void kill_cursor(dir_cursor *); -+ -+/** -+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s -+ * @nr: number of objects to free -+ * @mask: GFP mask -+ * -+ * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested -+ * number. Return number of still freeable cursors. -+ */ -+static int d_cursor_shrink(int nr, gfp_t mask) -+{ -+ if (nr != 0) { -+ dir_cursor *scan; -+ int killed; -+ -+ killed = 0; -+ spin_lock(&d_lock); -+ while (!list_empty(&cursor_cache)) { -+ scan = list_entry(cursor_cache.next, dir_cursor, alist); -+ assert("nikita-3567", scan->ref == 0); -+ kill_cursor(scan); -+ ++killed; -+ --nr; -+ if (nr == 0) -+ break; -+ } -+ spin_unlock(&d_lock); -+ } -+ return d_cursor_unused; -+} -+ -+/* -+ * actually, d_cursors are "priceless", because there is no way to -+ * recover information stored in them. On the other hand, we don't -+ * want to consume all kernel memory by them. As a compromise, just -+ * assign higher "seeks" value to d_cursor cache, so that it will be -+ * shrunk only if system is really tight on memory. 
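d_cursor_shrink() above implements the shrinker contract: free at most the requested number of objects from the LRU list of unused cursors and report how many remain freeable. The same shape in plain userspace C (list and counters invented for the demo):

    #include <stdio.h>
    #include <stdlib.h>

    struct cursor { struct cursor *next; int id; };

    static struct cursor *unused;       /* LRU list of reclaimable cursors */
    static unsigned long nr_unused;

    /* Free up to nr cursors from the unused list; report what is left. */
    static unsigned long shrink(int nr)
    {
            while (nr-- > 0 && unused) {
                    struct cursor *victim = unused;

                    unused = victim->next;
                    printf("killing cursor %d\n", victim->id);
                    free(victim);
                    nr_unused--;
            }
            return nr_unused;
    }

    int main(void)
    {
            for (int i = 0; i < 5; i++) {
                    struct cursor *c = malloc(sizeof *c);

                    c->id = i;
                    c->next = unused;
                    unused = c;
                    nr_unused++;
            }
            printf("still freeable: %lu\n", shrink(3));
            shrink(5);  /* free the rest */
            return 0;
    }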
-+ */ -+static struct shrinker d_cursor_shrinker = { -+ .shrink = d_cursor_shrink, -+ .seeks = DEFAULT_SEEKS << 3, -+}; -+ -+/** -+ * reiser4_init_d_cursor - create d_cursor cache -+ * -+ * Initializes slab cache of d_cursors. It is part of reiser4 module -+ * initialization. -+ */ -+int reiser4_init_d_cursor(void) -+{ -+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0, -+ SLAB_HWCACHE_ALIGN, NULL); -+ if (d_cursor_cache == NULL) -+ return RETERR(-ENOMEM); -+ -+ register_shrinker(&d_cursor_shrinker); -+ return 0; -+} -+ -+/** -+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_d_cursor(void) -+{ -+ unregister_shrinker(&d_cursor_shrinker); -+ -+ destroy_reiser4_cache(&d_cursor_cache); -+} -+ -+#define D_CURSOR_TABLE_SIZE (256) -+ -+static inline unsigned long -+d_cursor_hash(d_cursor_hash_table * table, const struct d_cursor_key *key) -+{ -+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE)); -+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1); -+} -+ -+static inline int d_cursor_eq(const struct d_cursor_key *k1, -+ const struct d_cursor_key *k2) -+{ -+ return k1->cid == k2->cid && k1->oid == k2->oid; -+} -+ -+/* -+ * define functions to manipulate reiser4 super block's hash table of -+ * dir_cursors -+ */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(d_cursor, -+ dir_cursor, -+ struct d_cursor_key, -+ key, hash, d_cursor_hash, d_cursor_eq); -+#undef KFREE -+#undef KMALLOC -+ -+/** -+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources -+ * @super: super block to initialize -+ * -+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part -+ * of mount. -+ */ -+int reiser4_init_super_d_info(struct super_block *super) -+{ -+ struct d_cursor_info *p; -+ -+ p = &get_super_private(super)->d_info; -+ -+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get()); -+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE); -+} -+ -+/** -+ * reiser4_done_super_d_info - release per-super-block d_cursor resources -+ * @super: super block being umounted -+ * -+ * It is called on umount. Kills all directory cursors attached to suoer block. -+ */ -+void reiser4_done_super_d_info(struct super_block *super) -+{ -+ struct d_cursor_info *d_info; -+ dir_cursor *cursor, *next; -+ -+ d_info = &get_super_private(super)->d_info; -+ for_all_in_htable(&d_info->table, d_cursor, cursor, next) -+ kill_cursor(cursor); -+ -+ BUG_ON(d_info->tree.rnode != NULL); -+ d_cursor_hash_done(&d_info->table); -+} -+ -+/** -+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it -+ * @cursor: cursor to free -+ * -+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of -+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from -+ * indices, hash table, list of unused cursors and frees it. -+ */ -+static void kill_cursor(dir_cursor *cursor) -+{ -+ unsigned long index; -+ -+ assert("nikita-3566", cursor->ref == 0); -+ assert("nikita-3572", cursor->fsdata != NULL); -+ -+ index = (unsigned long)cursor->key.oid; -+ list_del_init(&cursor->fsdata->dir.linkage); -+ free_fsdata(cursor->fsdata); -+ cursor->fsdata = NULL; -+ -+ if (list_empty_careful(&cursor->list)) -+ /* this is last cursor for a file. 
Kill radix-tree entry */ -+ radix_tree_delete(&cursor->info->tree, index); -+ else { -+ void **slot; -+ -+ /* -+ * there are other cursors for the same oid. -+ */ -+ -+ /* -+ * if radix tree point to the cursor being removed, re-target -+ * radix tree slot to the next cursor in the (non-empty as was -+ * checked above) element of the circular list of all cursors -+ * for this oid. -+ */ -+ slot = radix_tree_lookup_slot(&cursor->info->tree, index); -+ assert("nikita-3571", *slot != NULL); -+ if (*slot == cursor) -+ *slot = list_entry(cursor->list.next, dir_cursor, list); -+ /* remove cursor from circular list */ -+ list_del_init(&cursor->list); -+ } -+ /* remove cursor from the list of unused cursors */ -+ list_del_init(&cursor->alist); -+ /* remove cursor from the hash table */ -+ d_cursor_hash_remove(&cursor->info->table, cursor); -+ /* and free it */ -+ kmem_cache_free(d_cursor_cache, cursor); -+ --d_cursor_unused; -+} -+ -+/* possible actions that can be performed on all cursors for the given file */ -+enum cursor_action { -+ /* -+ * load all detached state: this is called when stat-data is loaded -+ * from the disk to recover information about all pending readdirs -+ */ -+ CURSOR_LOAD, -+ /* -+ * detach all state from inode, leaving it in the cache. This is called -+ * when inode is removed form the memory by memory pressure -+ */ -+ CURSOR_DISPOSE, -+ /* -+ * detach cursors from the inode, and free them. This is called when -+ * inode is destroyed -+ */ -+ CURSOR_KILL -+}; -+ -+/* -+ * return d_cursor data for the file system @inode is in. -+ */ -+static inline struct d_cursor_info *d_info(struct inode *inode) -+{ -+ return &get_super_private(inode->i_sb)->d_info; -+} -+ -+/* -+ * lookup d_cursor in the per-super-block radix tree. -+ */ -+static inline dir_cursor *lookup(struct d_cursor_info *info, -+ unsigned long index) -+{ -+ return (dir_cursor *) radix_tree_lookup(&info->tree, index); -+} -+ -+/* -+ * attach @cursor to the radix tree. There may be multiple cursors for the -+ * same oid, they are chained into circular list. -+ */ -+static void bind_cursor(dir_cursor * cursor, unsigned long index) -+{ -+ dir_cursor *head; -+ -+ head = lookup(cursor->info, index); -+ if (head == NULL) { -+ /* this is the first cursor for this index */ -+ INIT_LIST_HEAD(&cursor->list); -+ radix_tree_insert(&cursor->info->tree, index, cursor); -+ } else { -+ /* some cursor already exists. Chain ours */ -+ list_add(&cursor->list, &head->list); -+ } -+} -+ -+/* -+ * detach fsdata (if detachable) from file descriptor, and put cursor on the -+ * "unused" list. Called when file descriptor is not longer in active use. -+ */ -+static void clean_fsdata(struct file *file) -+{ -+ dir_cursor *cursor; -+ reiser4_file_fsdata *fsdata; -+ -+ assert("nikita-3570", file_is_stateless(file)); -+ -+ fsdata = (reiser4_file_fsdata *) file->private_data; -+ if (fsdata != NULL) { -+ cursor = fsdata->cursor; -+ if (cursor != NULL) { -+ spin_lock(&d_lock); -+ --cursor->ref; -+ if (cursor->ref == 0) { -+ list_add_tail(&cursor->alist, &cursor_cache); -+ ++d_cursor_unused; -+ } -+ spin_unlock(&d_lock); -+ file->private_data = NULL; -+ } -+ } -+} -+ -+/* -+ * global counter used to generate "client ids". These ids are encoded into -+ * high bits of fpos. 
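The comment above introduces the readdir-over-NFS cookie trick: a per-client id is packed into the high bits of f_pos so a stateless server call can find its detached cursor again, and reiser4_get_dir_fpos() masks it back off. The arithmetic in isolation, with CID_SHIFT and CID_MASK copied from the definitions that follow:

    #include <stdio.h>
    #include <stdint.h>

    #define CID_SHIFT 20
    #define CID_MASK  0xfffffULL

    int main(void)
    {
            uint64_t cid  = 0x2a3;                     /* client id */
            uint64_t pos  = 1234;                      /* position in directory */
            uint64_t fpos = (cid << CID_SHIFT) | pos;  /* lands in file->f_pos */

            printf("cid=%llx pos=%llu\n",
                   (unsigned long long)(fpos >> CID_SHIFT), /* recover cid */
                   (unsigned long long)(fpos & CID_MASK));  /* get_dir_fpos() */
            return 0;
    }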
-+ */ -+static __u32 cid_counter = 0; -+#define CID_SHIFT (20) -+#define CID_MASK (0xfffffull) -+ -+static void free_file_fsdata_nolock(struct file *); -+ -+/** -+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table -+ * @cursor: -+ * @file: -+ * @inode: -+ * -+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to -+ * reiser4 super block's hash table and radix tree. -+ add detachable readdir -+ * state to the @f -+ */ -+static int insert_cursor(dir_cursor *cursor, struct file *file, -+ struct inode *inode) -+{ -+ int result; -+ reiser4_file_fsdata *fsdata; -+ -+ memset(cursor, 0, sizeof *cursor); -+ -+ /* this is either first call to readdir, or rewind. Anyway, create new -+ * cursor. */ -+ fsdata = create_fsdata(NULL); -+ if (fsdata != NULL) { -+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get()); -+ if (result == 0) { -+ struct d_cursor_info *info; -+ oid_t oid; -+ -+ info = d_info(inode); -+ oid = get_inode_oid(inode); -+ /* cid occupies higher 12 bits of f->f_pos. Don't -+ * allow it to become negative: this confuses -+ * nfsd_readdir() */ -+ cursor->key.cid = (++cid_counter) & 0x7ff; -+ cursor->key.oid = oid; -+ cursor->fsdata = fsdata; -+ cursor->info = info; -+ cursor->ref = 1; -+ -+ spin_lock_inode(inode); -+ /* install cursor as @f's private_data, discarding old -+ * one if necessary */ -+#if REISER4_DEBUG -+ if (file->private_data) -+ warning("", "file has fsdata already"); -+#endif -+ clean_fsdata(file); -+ free_file_fsdata_nolock(file); -+ file->private_data = fsdata; -+ fsdata->cursor = cursor; -+ spin_unlock_inode(inode); -+ spin_lock(&d_lock); -+ /* insert cursor into hash table */ -+ d_cursor_hash_insert(&info->table, cursor); -+ /* and chain it into radix-tree */ -+ bind_cursor(cursor, (unsigned long)oid); -+ spin_unlock(&d_lock); -+ radix_tree_preload_end(); -+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT; -+ } -+ } else -+ result = RETERR(-ENOMEM); -+ return result; -+} -+ -+/** -+ * process_cursors - do action on each cursor attached to inode -+ * @inode: -+ * @act: action to do -+ * -+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors -+ * and performs action specified by @act on each of cursors. 
-+ */ -+static void process_cursors(struct inode *inode, enum cursor_action act) -+{ -+ oid_t oid; -+ dir_cursor *start; -+ struct list_head *head; -+ reiser4_context *ctx; -+ struct d_cursor_info *info; -+ -+ /* this can be called by -+ * -+ * kswapd->...->prune_icache->..reiser4_destroy_inode -+ * -+ * without reiser4_context -+ */ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ warning("vs-23", "failed to init context"); -+ return; -+ } -+ -+ assert("nikita-3558", inode != NULL); -+ -+ info = d_info(inode); -+ oid = get_inode_oid(inode); -+ spin_lock_inode(inode); -+ head = get_readdir_list(inode); -+ spin_lock(&d_lock); -+ /* find any cursor for this oid: reference to it is hanging of radix -+ * tree */ -+ start = lookup(info, (unsigned long)oid); -+ if (start != NULL) { -+ dir_cursor *scan; -+ reiser4_file_fsdata *fsdata; -+ -+ /* process circular list of cursors for this oid */ -+ scan = start; -+ do { -+ dir_cursor *next; -+ -+ next = list_entry(scan->list.next, dir_cursor, list); -+ fsdata = scan->fsdata; -+ assert("nikita-3557", fsdata != NULL); -+ if (scan->key.oid == oid) { -+ switch (act) { -+ case CURSOR_DISPOSE: -+ list_del_init(&fsdata->dir.linkage); -+ break; -+ case CURSOR_LOAD: -+ list_add(&fsdata->dir.linkage, head); -+ break; -+ case CURSOR_KILL: -+ kill_cursor(scan); -+ break; -+ } -+ } -+ if (scan == next) -+ /* last cursor was just killed */ -+ break; -+ scan = next; -+ } while (scan != start); -+ } -+ spin_unlock(&d_lock); -+ /* check that we killed 'em all */ -+ assert("nikita-3568", -+ ergo(act == CURSOR_KILL, -+ list_empty_careful(get_readdir_list(inode)))); -+ assert("nikita-3569", -+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL)); -+ spin_unlock_inode(inode); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_dispose_cursors - removes cursors from inode's list -+ * @inode: inode to dispose cursors of -+ * -+ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata -+ * attached to cursor from inode's readdir list. This is called when inode is -+ * removed from the memory by memory pressure. -+ */ -+void reiser4_dispose_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_DISPOSE); -+} -+ -+/** -+ * reiser4_load_cursors - attach cursors to inode -+ * @inode: inode to load cursors to -+ * -+ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata -+ * attached to cursor to inode's readdir list. This is done when inode is -+ * loaded into memory. -+ */ -+void reiser4_load_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_LOAD); -+} -+ -+/** -+ * reiser4_kill_cursors - kill all inode cursors -+ * @inode: inode to kill cursors of -+ * -+ * Frees all cursors for this inode. This is called when inode is destroyed. -+ */ -+void reiser4_kill_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_KILL); -+} -+ -+/** -+ * file_is_stateless - -+ * @file: -+ * -+ * true, if file descriptor @f is created by NFS server by "demand" to serve -+ * one file system operation. This means that there may be "detached state" -+ * for underlying inode. -+ */ -+static int file_is_stateless(struct file *file) -+{ -+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless; -+} -+ -+/** -+ * reiser4_get_dir_fpos - -+ * @dir: -+ * -+ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but -+ * in the case of stateless directory operation (readdir-over-nfs), client id -+ * was encoded in the high bits of cookie and should me masked off. 
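process_cursors() above walks the circular per-oid list of cursors starting from whichever one the radix tree points at; the scan == next test is what lets the walk survive killing the last remaining element. The traversal pattern on its own (three stack nodes stand in for the hashed cursors):

    #include <stdio.h>

    struct cursor { struct cursor *next; int oid; };

    /* Walk a circular singly linked list starting anywhere, visiting
     * each node once; the scan == next guard mirrors process_cursors()'s
     * handling of a list reduced to a single element. */
    static void process(struct cursor *start)
    {
            struct cursor *scan = start;

            do {
                    struct cursor *next = scan->next;

                    printf("visiting cursor oid=%d\n", scan->oid);
                    if (scan == next)
                            break;          /* single-element list */
                    scan = next;
            } while (scan != start);
    }

    int main(void)
    {
            struct cursor a = { .oid = 1 }, b = { .oid = 2 }, c = { .oid = 3 };

            a.next = &b; b.next = &c; c.next = &a;  /* circular chain */
            process(&b);                            /* start mid-list */
            return 0;
    }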
-+ */ -+loff_t reiser4_get_dir_fpos(struct file *dir) -+{ -+ if (file_is_stateless(dir)) -+ return dir->f_pos & CID_MASK; -+ else -+ return dir->f_pos; -+} -+ -+/** -+ * reiser4_attach_fsdata - try to attach fsdata -+ * @file: -+ * @inode: -+ * -+ * Finds or creates cursor for readdir-over-nfs. -+ */ -+int reiser4_attach_fsdata(struct file *file, struct inode *inode) -+{ -+ loff_t pos; -+ int result; -+ dir_cursor *cursor; -+ -+ /* -+ * we are serialized by inode->i_mutex -+ */ -+ if (!file_is_stateless(file)) -+ return 0; -+ -+ pos = file->f_pos; -+ result = 0; -+ if (pos == 0) { -+ /* -+ * first call to readdir (or rewind to the beginning of -+ * directory) -+ */ -+ cursor = kmem_cache_alloc(d_cursor_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (cursor != NULL) -+ result = insert_cursor(cursor, file, inode); -+ else -+ result = RETERR(-ENOMEM); -+ } else { -+ /* try to find existing cursor */ -+ struct d_cursor_key key; -+ -+ key.cid = pos >> CID_SHIFT; -+ key.oid = get_inode_oid(inode); -+ spin_lock(&d_lock); -+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key); -+ if (cursor != NULL) { -+ /* cursor was found */ -+ if (cursor->ref == 0) { -+ /* move it from unused list */ -+ list_del_init(&cursor->alist); -+ --d_cursor_unused; -+ } -+ ++cursor->ref; -+ } -+ spin_unlock(&d_lock); -+ if (cursor != NULL) { -+ spin_lock_inode(inode); -+ assert("nikita-3556", cursor->fsdata->back == NULL); -+ clean_fsdata(file); -+ free_file_fsdata_nolock(file); -+ file->private_data = cursor->fsdata; -+ spin_unlock_inode(inode); -+ } -+ } -+ return result; -+} -+ -+/** -+ * reiser4_detach_fsdata - ??? -+ * @file: -+ * -+ * detach fsdata, if necessary -+ */ -+void reiser4_detach_fsdata(struct file *file) -+{ -+ struct inode *inode; -+ -+ if (!file_is_stateless(file)) -+ return; -+ -+ inode = file->f_dentry->d_inode; -+ spin_lock_inode(inode); -+ clean_fsdata(file); -+ spin_unlock_inode(inode); -+} -+ -+/* slab for reiser4_dentry_fsdata */ -+static struct kmem_cache *dentry_fsdata_cache; -+ -+/** -+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata -+ * -+ * Initializes slab cache of structures attached to denty->d_fsdata. It is -+ * part of reiser4 module initialization. -+ */ -+int reiser4_init_dentry_fsdata(void) -+{ -+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata", -+ sizeof(struct reiser4_dentry_fsdata), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, -+ NULL); -+ if (dentry_fsdata_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_dentry_fsdata(void) -+{ -+ destroy_reiser4_cache(&dentry_fsdata_cache); -+} -+ -+/** -+ * reiser4_get_dentry_fsdata - get fs-specific dentry data -+ * @dentry: queried dentry -+ * -+ * Allocates if necessary and returns per-dentry data that we attach to each -+ * dentry. 
-+ */ -+struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry) -+{ -+ assert("nikita-1365", dentry != NULL); -+ -+ if (dentry->d_fsdata == NULL) { -+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (dentry->d_fsdata == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ memset(dentry->d_fsdata, 0, -+ sizeof(struct reiser4_dentry_fsdata)); -+ } -+ return dentry->d_fsdata; -+} -+ -+/** -+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata -+ * @dentry: dentry to free fsdata of -+ * -+ * Detaches and frees fs-specific dentry data -+ */ -+void reiser4_free_dentry_fsdata(struct dentry *dentry) -+{ -+ if (dentry->d_fsdata != NULL) { -+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata); -+ dentry->d_fsdata = NULL; -+ } -+} -+ -+/* slab for reiser4_file_fsdata */ -+static struct kmem_cache *file_fsdata_cache; -+ -+/** -+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata -+ * -+ * Initializes slab cache of structures attached to file->private_data. It is -+ * part of reiser4 module initialization. -+ */ -+int reiser4_init_file_fsdata(void) -+{ -+ file_fsdata_cache = kmem_cache_create("file_fsdata", -+ sizeof(reiser4_file_fsdata), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL); -+ if (file_fsdata_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_file_fsdata(void) -+{ -+ destroy_reiser4_cache(&file_fsdata_cache); -+} -+ -+/** -+ * create_fsdata - allocate and initialize reiser4_file_fsdata -+ * @file: what to create file_fsdata for, may be NULL -+ * -+ * Allocates and initializes reiser4_file_fsdata structure. -+ */ -+static reiser4_file_fsdata *create_fsdata(struct file *file) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ fsdata = kmem_cache_alloc(file_fsdata_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (fsdata != NULL) { -+ memset(fsdata, 0, sizeof *fsdata); -+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024; -+ fsdata->back = file; -+ INIT_LIST_HEAD(&fsdata->dir.linkage); -+ } -+ return fsdata; -+} -+ -+/** -+ * free_fsdata - free reiser4_file_fsdata -+ * @fsdata: object to free -+ * -+ * Dual to create_fsdata(). Free reiser4_file_fsdata. -+ */ -+static void free_fsdata(reiser4_file_fsdata *fsdata) -+{ -+ BUG_ON(fsdata == NULL); -+ kmem_cache_free(file_fsdata_cache, fsdata); -+} -+ -+/** -+ * reiser4_get_file_fsdata - get fs-specific file data -+ * @file: queried file -+ * -+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches -+ * to @file. 
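reiser4_get_file_fsdata(), whose body follows, uses the common allocate-outside-the-lock, re-check-under-the-lock pattern so two threads racing to set file->private_data stay safe and the loser frees its copy. Reduced to userspace (a mutex stands in for the inode spinlock; names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>

    static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *private_data;          /* file->private_data stand-in */

    static void *get_fsdata(void)
    {
            if (private_data == NULL) {
                    void *fresh = malloc(64);   /* allocate without the lock */

                    pthread_mutex_lock(&inode_lock);
                    if (private_data == NULL) { /* re-check under the lock */
                            private_data = fresh;
                            fresh = NULL;
                    }
                    pthread_mutex_unlock(&inode_lock);

                    free(fresh);  /* lost the race: discard our copy */
            }
            return private_data;
    }

    int main(void)
    {
            printf("fsdata at %p\n", get_fsdata());
            printf("fsdata at %p (same)\n", get_fsdata());
            return 0;
    }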
-+ */ -+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file) -+{ -+ assert("nikita-1603", file != NULL); -+ -+ if (file->private_data == NULL) { -+ reiser4_file_fsdata *fsdata; -+ struct inode *inode; -+ -+ fsdata = create_fsdata(file); -+ if (fsdata == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ inode = file->f_dentry->d_inode; -+ spin_lock_inode(inode); -+ if (file->private_data == NULL) { -+ file->private_data = fsdata; -+ fsdata = NULL; -+ } -+ spin_unlock_inode(inode); -+ if (fsdata != NULL) -+ /* other thread initialized ->fsdata */ -+ kmem_cache_free(file_fsdata_cache, fsdata); -+ } -+ assert("nikita-2665", file->private_data != NULL); -+ return file->private_data; -+} -+ -+/** -+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata -+ * @file: -+ * -+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from -+ * readdir list, frees if it is not linked to d_cursor object. -+ */ -+static void free_file_fsdata_nolock(struct file *file) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ assert("", spin_inode_is_locked(file->f_dentry->d_inode)); -+ fsdata = file->private_data; -+ if (fsdata != NULL) { -+ list_del_init(&fsdata->dir.linkage); -+ if (fsdata->cursor == NULL) -+ free_fsdata(fsdata); -+ } -+ file->private_data = NULL; -+} -+ -+/** -+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata -+ * @file: -+ * -+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work. -+ */ -+void reiser4_free_file_fsdata(struct file *file) -+{ -+ spin_lock_inode(file->f_dentry->d_inode); -+ free_file_fsdata_nolock(file); -+ spin_unlock_inode(file->f_dentry->d_inode); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/fsdata.h linux-2.6.30/fs/reiser4/fsdata.h ---- linux-2.6.30.orig/fs/reiser4/fsdata.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/fsdata.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,205 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#if !defined(__REISER4_FSDATA_H__) -+#define __REISER4_FSDATA_H__ -+ -+#include "debug.h" -+#include "kassign.h" -+#include "seal.h" -+#include "type_safe_hash.h" -+#include "plugin/file/file.h" -+#include "readahead.h" -+ -+/* -+ * comment about reiser4_dentry_fsdata -+ * -+ * -+ */ -+ -+/* -+ * locking: fields of per file descriptor readdir_pos and ->f_pos are -+ * protected by ->i_mutex on inode. Under this lock following invariant -+ * holds: -+ * -+ * file descriptor is "looking" at the entry_no-th directory entry from -+ * the beginning of directory. This entry has key dir_entry_key and is -+ * pos-th entry with duplicate-key sequence. -+ * -+ */ -+ -+/* logical position within directory */ -+struct dir_pos { -+ /* key of directory entry (actually, part of a key sufficient to -+ identify directory entry) */ -+ de_id dir_entry_key; -+ /* ordinal number of directory entry among all entries with the same -+ key. (Starting from 0.) 
*/ -+ unsigned pos; -+}; -+ -+struct readdir_pos { -+ /* f_pos corresponding to this readdir position */ -+ __u64 fpos; -+ /* logical position within directory */ -+ struct dir_pos position; -+ /* logical number of directory entry within -+ directory */ -+ __u64 entry_no; -+}; -+ -+/* -+ * this is used to speed up lookups for directory entry: on initial call to -+ * ->lookup() seal and coord of directory entry (if found, that is) are stored -+ * in struct dentry and reused later to avoid tree traversals. -+ */ -+struct de_location { -+ /* seal covering directory entry */ -+ seal_t entry_seal; -+ /* coord of directory entry */ -+ coord_t entry_coord; -+ /* ordinal number of directory entry among all entries with the same -+ key. (Starting from 0.) */ -+ int pos; -+}; -+ -+/** -+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries -+ * -+ * This is allocated dynamically and released in d_op->d_release() -+ * -+ * Currently it only contains cached location (hint) of directory entry, but -+ * it is expected that other information will be accumulated here. -+ */ -+struct reiser4_dentry_fsdata { -+ /* -+ * here will go fields filled by ->lookup() to speedup next -+ * create/unlink, like blocknr of znode with stat-data, or key of -+ * stat-data. -+ */ -+ struct de_location dec; -+ int stateless; /* created through reiser4_decode_fh, needs -+ * special treatment in readdir. */ -+}; -+ -+extern int reiser4_init_dentry_fsdata(void); -+extern void reiser4_done_dentry_fsdata(void); -+extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *); -+extern void reiser4_free_dentry_fsdata(struct dentry *dentry); -+ -+/** -+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data -+ * -+ * This is allocated dynamically and released in inode->i_fop->release -+ */ -+typedef struct reiser4_file_fsdata { -+ /* -+ * pointer back to the struct file which this reiser4_file_fsdata is -+ * part of -+ */ -+ struct file *back; -+ /* detached cursor for stateless readdir. */ -+ struct dir_cursor *cursor; -+ /* -+ * We need both directory and regular file parts here, because there -+ * are file system objects that are files and directories. -+ */ -+ struct { -+ /* -+ * position in directory. It is updated each time directory is -+ * modified -+ */ -+ struct readdir_pos readdir; -+ /* head of this list is reiser4_inode->lists.readdir_list */ -+ struct list_head linkage; -+ } dir; -+ /* hints to speed up operations with regular files: read and write. */ -+ struct { -+ hint_t hint; -+ } reg; -+ struct reiser4_file_ra_state ra1; -+ -+} reiser4_file_fsdata; -+ -+extern int reiser4_init_file_fsdata(void); -+extern void reiser4_done_file_fsdata(void); -+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *); -+extern void reiser4_free_file_fsdata(struct file *); -+ -+/* -+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are -+ * used to address problem reiser4 has with readdir accesses via NFS. See -+ * plugin/file_ops_readdir.c for more details. 
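A note on the reiser4_get_file_fsdata() hunk above: it is the classic optimistic-allocation idiom. The fsdata is built outside the inode spinlock, published under it, and whichever copy loses the race is freed. A minimal userspace sketch of the same idiom follows; the names are illustrative only (a pthread mutex stands in for spin_lock_inode(), struct file_like for struct file), not reiser4 API:

    #include <pthread.h>
    #include <stdlib.h>

    struct file_like {
            pthread_mutex_t lock;      /* stands in for spin_lock_inode() */
            void *private_data;
    };

    /* Allocate outside the lock, publish under it, free the loser's copy. */
    static void *get_or_create(struct file_like *f, size_t size)
    {
            void *fresh = calloc(1, size);

            if (fresh == NULL)
                    return NULL;
            pthread_mutex_lock(&f->lock);
            if (f->private_data == NULL) {
                    f->private_data = fresh;   /* won the race: hand over */
                    fresh = NULL;
            }
            pthread_mutex_unlock(&f->lock);
            free(fresh);                       /* NULL, or the loser's copy */
            return f->private_data;
    }

Allocating before taking the lock keeps the allocator out of the critical section; the only cost is one throwaway allocation when two openers race, which is exactly what the kmem_cache_free() call in the hunk handles.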
-+ */ -+struct d_cursor_key{ -+ __u16 cid; -+ __u64 oid; -+}; -+ -+/* -+ * define structures d_cursor_hash_table d_cursor_hash_link which are used to -+ * maintain hash table of dir_cursor-s in reiser4's super block -+ */ -+typedef struct dir_cursor dir_cursor; -+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor); -+ -+struct dir_cursor { -+ int ref; -+ reiser4_file_fsdata *fsdata; -+ -+ /* link to reiser4 super block hash table of cursors */ -+ d_cursor_hash_link hash; -+ -+ /* -+ * this is to link cursors to reiser4 super block's radix tree of -+ * cursors if there are more than one cursor of the same objectid -+ */ -+ struct list_head list; -+ struct d_cursor_key key; -+ struct d_cursor_info *info; -+ /* list of unused cursors */ -+ struct list_head alist; -+}; -+ -+extern int reiser4_init_d_cursor(void); -+extern void reiser4_done_d_cursor(void); -+ -+extern int reiser4_init_super_d_info(struct super_block *); -+extern void reiser4_done_super_d_info(struct super_block *); -+ -+extern loff_t reiser4_get_dir_fpos(struct file *); -+extern int reiser4_attach_fsdata(struct file *, struct inode *); -+extern void reiser4_detach_fsdata(struct file *); -+ -+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for -+ more details */ -+void reiser4_dispose_cursors(struct inode *inode); -+void reiser4_load_cursors(struct inode *inode); -+void reiser4_kill_cursors(struct inode *inode); -+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de, -+ int offset, int adj); -+ -+/* -+ * this structure is embedded to reise4_super_info_data. It maintains d_cursors -+ * (detached readdir state). See plugin/file_ops_readdir.c for more details. -+ */ -+struct d_cursor_info { -+ d_cursor_hash_table table; -+ struct radix_tree_root tree; -+}; -+ -+/* spinlock protecting readdir cursors */ -+extern spinlock_t d_lock; -+ -+/* __REISER4_FSDATA_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/init_super.c linux-2.6.30/fs/reiser4/init_super.c ---- linux-2.6.30.orig/fs/reiser4/init_super.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/init_super.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,751 @@ -+/* Copyright by Hans Reiser, 2003 */ -+ -+#include "super.h" -+#include "inode.h" -+#include "plugin/plugin_set.h" -+ -+#include <linux/swap.h> -+ -+/** -+ * init_fs_info - allocate reiser4 specific super block -+ * @super: super block of filesystem -+ * -+ * Allocates and initialize reiser4_super_info_data, attaches it to -+ * super->s_fs_info, initializes structures maintaining d_cursor-s. -+ */ -+int reiser4_init_fs_info(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = kzalloc(sizeof(reiser4_super_info_data), -+ reiser4_ctx_gfp_mask_get()); -+ if (!sbinfo) -+ return RETERR(-ENOMEM); -+ -+ super->s_fs_info = sbinfo; -+ super->s_op = NULL; -+ -+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes)); -+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard)); -+ -+ mutex_init(&sbinfo->delete_mutex); -+ spin_lock_init(&(sbinfo->guard)); -+ -+ /* initialize per-super-block d_cursor resources */ -+ reiser4_init_super_d_info(super); -+ -+ return 0; -+} -+ -+/** -+ * reiser4_done_fs_info - free reiser4 specific super block -+ * @super: super block of filesystem -+ * -+ * Performs some sanity checks, releases structures maintaining d_cursor-s, -+ * frees reiser4_super_info_data. 
-+ */ -+void reiser4_done_fs_info(struct super_block *super) -+{ -+ assert("zam-990", super->s_fs_info != NULL); -+ -+ /* release per-super-block d_cursor resources */ -+ reiser4_done_super_d_info(super); -+ -+ /* make sure that there are not jnodes already */ -+ assert("", list_empty(&get_super_private(super)->all_jnodes)); -+ assert("", get_current_context()->trans->atom == NULL); -+ reiser4_check_block_counters(super); -+ kfree(super->s_fs_info); -+ super->s_fs_info = NULL; -+} -+ -+/* type of option parseable by parse_option() */ -+typedef enum { -+ /* value of option is arbitrary string */ -+ OPT_STRING, -+ -+ /* -+ * option specifies bit in a bitmask. When option is set - bit in -+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush, -+ * dont_load_bitmap, atomic_write. -+ */ -+ OPT_BIT, -+ -+ /* -+ * value of option should conform to sprintf() format. Examples are -+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N -+ */ -+ OPT_FORMAT, -+ -+ /* -+ * option can take one of predefined values. Example is onerror=panic or -+ * onerror=remount-ro -+ */ -+ OPT_ONEOF, -+} opt_type_t; -+ -+#if 0 -+struct opt_bitmask_bit { -+ const char *bit_name; -+ int bit_nr; -+}; -+#endif -+ -+/* description of option parseable by parse_option() */ -+struct opt_desc { -+ /* option name. -+ -+ parsed portion of string has a form "name=value". -+ */ -+ const char *name; -+ /* type of option */ -+ opt_type_t type; -+ union { -+ /* where to store value of string option (type == OPT_STRING) */ -+ char **string; -+ /* description of bits for bit option (type == OPT_BIT) */ -+ struct { -+ int nr; -+ void *addr; -+ } bit; -+ /* description of format and targets for format option (type -+ == OPT_FORMAT) */ -+ struct { -+ const char *format; -+ int nr_args; -+ void *arg1; -+ void *arg2; -+ void *arg3; -+ void *arg4; -+ } f; -+ struct { -+ int *result; -+ const char *list[10]; -+ } oneof; -+ struct { -+ void *addr; -+ int nr_bits; -+ /* struct opt_bitmask_bit *bits; */ -+ } bitmask; -+ } u; -+}; -+ -+/** -+ * parse_option - parse one option -+ * @opt_strin: starting point of parsing -+ * @opt: option description -+ * -+ * foo=bar, -+ * ^ ^ ^ -+ * | | +-- replaced to '\0' -+ * | +-- val_start -+ * +-- opt_string -+ * Figures out option type and handles option correspondingly. -+ */ -+static int parse_option(char *opt_string, struct opt_desc *opt) -+{ -+ char *val_start; -+ int result; -+ const char *err_msg; -+ -+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. 
*/ -+ -+ val_start = strchr(opt_string, '='); -+ if (val_start != NULL) { -+ *val_start = '\0'; -+ ++val_start; -+ } -+ -+ err_msg = NULL; -+ result = 0; -+ switch (opt->type) { -+ case OPT_STRING: -+ if (val_start == NULL) { -+ err_msg = "String arg missing"; -+ result = RETERR(-EINVAL); -+ } else -+ *opt->u.string = val_start; -+ break; -+ case OPT_BIT: -+ if (val_start != NULL) -+ err_msg = "Value ignored"; -+ else -+ set_bit(opt->u.bit.nr, opt->u.bit.addr); -+ break; -+ case OPT_FORMAT: -+ if (val_start == NULL) { -+ err_msg = "Formatted arg missing"; -+ result = RETERR(-EINVAL); -+ break; -+ } -+ if (sscanf(val_start, opt->u.f.format, -+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3, -+ opt->u.f.arg4) != opt->u.f.nr_args) { -+ err_msg = "Wrong conversion"; -+ result = RETERR(-EINVAL); -+ } -+ break; -+ case OPT_ONEOF: -+ { -+ int i = 0; -+ -+ if (val_start == NULL) { -+ err_msg = "Value is missing"; -+ result = RETERR(-EINVAL); -+ break; -+ } -+ err_msg = "Wrong option value"; -+ result = RETERR(-EINVAL); -+ while (opt->u.oneof.list[i]) { -+ if (!strcmp(opt->u.oneof.list[i], val_start)) { -+ result = 0; -+ err_msg = NULL; -+ *opt->u.oneof.result = i; -+ break; -+ } -+ i++; -+ } -+ break; -+ } -+ default: -+ wrong_return_value("nikita-2100", "opt -> type"); -+ break; -+ } -+ if (err_msg != NULL) { -+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"", -+ err_msg, opt->name, val_start ? "=" : "", -+ val_start ? : ""); -+ } -+ return result; -+} -+ -+/** -+ * parse_options - parse reiser4 mount options -+ * @opt_string: starting point -+ * @opts: array of option description -+ * @nr_opts: number of elements in @opts -+ * -+ * Parses comma separated list of reiser4 mount options. -+ */ -+static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts) -+{ -+ int result; -+ -+ result = 0; -+ while ((result == 0) && opt_string && *opt_string) { -+ int j; -+ char *next; -+ -+ next = strchr(opt_string, ','); -+ if (next != NULL) { -+ *next = '\0'; -+ ++next; -+ } -+ for (j = 0; j < nr_opts; ++j) { -+ if (!strncmp(opt_string, opts[j].name, -+ strlen(opts[j].name))) { -+ result = parse_option(opt_string, &opts[j]); -+ break; -+ } -+ } -+ if (j == nr_opts) { -+ warning("nikita-2307", "Unrecognized option: \"%s\"", -+ opt_string); -+ /* traditionally, -EINVAL is returned on wrong mount -+ option */ -+ result = RETERR(-EINVAL); -+ } -+ opt_string = next; -+ } -+ return result; -+} -+ -+#define NUM_OPT(label, fmt, addr) \ -+ { \ -+ .name = (label), \ -+ .type = OPT_FORMAT, \ -+ .u = { \ -+ .f = { \ -+ .format = (fmt), \ -+ .nr_args = 1, \ -+ .arg1 = (addr), \ -+ .arg2 = NULL, \ -+ .arg3 = NULL, \ -+ .arg4 = NULL \ -+ } \ -+ } \ -+ } -+ -+#define SB_FIELD_OPT(field, fmt) NUM_OPT(#field, fmt, &sbinfo->field) -+ -+#define BIT_OPT(label, bitnr) \ -+ { \ -+ .name = label, \ -+ .type = OPT_BIT, \ -+ .u = { \ -+ .bit = { \ -+ .nr = bitnr, \ -+ .addr = &sbinfo->fs_flags \ -+ } \ -+ } \ -+ } -+ -+#define MAX_NR_OPTIONS (30) -+ -+/** -+ * reiser4_init_super_data - initialize reiser4 private super block -+ * @super: super block to initialize -+ * @opt_string: list of reiser4 mount options -+ * -+ * Sets various reiser4 parameters to default values. Parses mount options and -+ * overwrites default settings.
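parse_option()/parse_options() above tokenize the mount string destructively: each ',' and '=' is overwritten with a NUL in place, so option names and values stay inside the original buffer with no copying. A self-contained sketch of that split (hypothetical demo, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char buf[] = "tmgr.atom_max_age=600,dont_load_bitmap";
            char *opt = buf;

            while (opt && *opt) {
                    char *next = strchr(opt, ',');  /* comma walk: parse_options() */
                    char *val;

                    if (next != NULL)
                            *next++ = '\0';
                    val = strchr(opt, '=');         /* name/value split: parse_option() */
                    if (val != NULL)
                            *val++ = '\0';
                    printf("name=%s value=%s\n", opt, val ? val : "(flag)");
                    opt = next;
            }
            return 0;
    }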
-+ */ -+int reiser4_init_super_data(struct super_block *super, char *opt_string) -+{ -+ int result; -+ struct opt_desc *opts, *p; -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ -+ /* initialize super, export, dentry operations */ -+ sbinfo->ops.super = reiser4_super_operations; -+ sbinfo->ops.export = reiser4_export_operations; -+ sbinfo->ops.dentry = reiser4_dentry_operations; -+ super->s_op = &sbinfo->ops.super; -+ super->s_export_op = &sbinfo->ops.export; -+ -+ /* initialize transaction manager parameters to default values */ -+ sbinfo->tmgr.atom_max_size = totalram_pages / 4; -+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ; -+ sbinfo->tmgr.atom_min_size = 256; -+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS; -+ -+ /* initialize cbk cache parameter */ -+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS; -+ -+ /* initialize flush parameters */ -+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD; -+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE; -+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD; -+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES; -+ -+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE; -+ -+ /* preliminary tree initializations */ -+ sbinfo->tree.super = super; -+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS; -+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS; -+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS; -+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS; -+ rwlock_init(&(sbinfo->tree.tree_lock)); -+ spin_lock_init(&(sbinfo->tree.epoch_lock)); -+ -+ /* initialize default readahead params */ -+ sbinfo->ra_params.max = num_physpages / 4; -+ sbinfo->ra_params.flags = 0; -+ -+ /* allocate memory for structure describing reiser4 mount options */ -+ opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS, -+ reiser4_ctx_gfp_mask_get()); -+ if (opts == NULL) -+ return RETERR(-ENOMEM); -+ -+ /* initialize structure describing reiser4 mount options */ -+ p = opts; -+ -+#if REISER4_DEBUG -+# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \ -+ warning("zam-1046", "opt array is overloaded"); break; \ -+ } -+#else -+# define OPT_ARRAY_CHECK noop -+#endif -+ -+#define PUSH_OPT(...) \ -+do { \ -+ struct opt_desc o = __VA_ARGS__; \ -+ OPT_ARRAY_CHECK; \ -+ *p ++ = o; \ -+} while (0) -+ -+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format)) -+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit)) -+ -+ /* -+ * tmgr.atom_max_size=N -+ * Atoms containing more than N blocks will be forced to commit. N is -+ * decimal. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u"); -+ /* -+ * tmgr.atom_max_age=N -+ * Atoms older than N seconds will be forced to commit. N is decimal. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u"); -+ /* -+ * tmgr.atom_min_size=N -+ * In committing an atom to free dirty pages, force the atom less than -+ * N in size to fuse with another one. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u"); -+ /* -+ * tmgr.atom_max_flushers=N -+ * limit of concurrent flushers for one atom. 0 means no limit. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u"); -+ /* -+ * tree.cbk_cache_slots=N -+ * Number of slots in the cbk cache. -+ */ -+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u"); -+ /* -+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty -+ * leaf-level blocks it will force them to be relocated. 
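SB_FIELD_OPT above is worth a second look: the preprocessor's stringize operator turns one token into both the option name and the sbinfo field reference, so the mount-option table cannot drift out of sync with the structure. The trick in miniature, with a hypothetical struct:

    #include <stdio.h>

    struct params {
            unsigned atom_max_age;
    };

    /* One token names both the option string and the field it fills. */
    #define FIELD_OPT(p, field) { #field, &(p)->field }

    struct opt {
            const char *name;
            unsigned *addr;
    };

    int main(void)
    {
            struct params par = { 0 };
            struct opt o = FIELD_OPT(&par, atom_max_age);

            sscanf("600", "%u", o.addr);    /* the OPT_FORMAT path does this */
            printf("%s = %u\n", o.name, par.atom_max_age);
            return 0;
    }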
*/ -+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u"); -+ /* -+ * If flush can find a block allocation closer than at most -+ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that -+ * position. -+ */ -+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u"); -+ /* -+ * If we have written this much or more blocks before encountering busy -+ * jnode in flush list - abort flushing hoping that next time we get -+ * called this jnode will be clean already, and we will save some -+ * seeks. -+ */ -+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u"); -+ /* The maximum number of nodes to scan left on a level during flush. */ -+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u"); -+ /* preferred IO size */ -+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u"); -+ /* carry flags used for insertion of new nodes */ -+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u"); -+ /* carry flags used for insertion of new extents */ -+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u"); -+ /* carry flags used for paste operations */ -+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u"); -+ /* carry flags used for insert operations */ -+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u"); -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+ /* -+ * Alternative master superblock location in case its original -+ * location is not writeable/accessible. This is offset in BYTES. -+ */ -+ PUSH_SB_FIELD_OPT(altsuper, "%lu"); -+#endif -+ -+ /* turn on BSD-style gid assignment */ -+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID); -+ /* turn on 32 bit times */ -+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES); -+ /* -+ * Don't load all bitmap blocks at mount time; this is useful for -+ * machines with tiny RAM and large disks. -+ */ -+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP); -+ /* disable transaction commits during write() */ -+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE); -+ /* disable use of write barriers in the reiser4 log writer.
*/ -+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER); -+ -+ PUSH_OPT( -+ { -+ /* -+ * tree traversal readahead parameters: -+ * -o readahead:MAXNUM:FLAGS -+ * MAXNUM - max number fo nodes to request readahead for: -1UL -+ * will set it to max_sane_readahead() -+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS, -+ * CONTINUE_ON_PRESENT -+ */ -+ .name = "readahead", -+ .type = OPT_FORMAT, -+ .u = { -+ .f = { -+ .format = "%u:%u", -+ .nr_args = 2, -+ .arg1 = &sbinfo->ra_params.max, -+ .arg2 = &sbinfo->ra_params.flags, -+ .arg3 = NULL, -+ .arg4 = NULL -+ } -+ } -+ } -+ ); -+ -+ /* What to do in case of fs error */ -+ PUSH_OPT( -+ { -+ .name = "onerror", -+ .type = OPT_ONEOF, -+ .u = { -+ .oneof = { -+ .result = &sbinfo->onerror, -+ .list = { -+ "panic", "remount-ro", NULL -+ }, -+ } -+ } -+ } -+ ); -+ -+ /* modify default settings to values set by mount options */ -+ result = parse_options(opt_string, opts, p - opts); -+ kfree(opts); -+ if (result != 0) -+ return result; -+ -+ /* correct settings to sanity values */ -+ sbinfo->tmgr.atom_max_age *= HZ; -+ if (sbinfo->tmgr.atom_max_age <= 0) -+ /* overflow */ -+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE; -+ -+ /* round optimal io size up to 512 bytes */ -+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS; -+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS; -+ if (sbinfo->optimal_io_size == 0) { -+ warning("nikita-2497", "optimal_io_size is too small"); -+ return RETERR(-EINVAL); -+ } -+ return result; -+} -+ -+/** -+ * reiser4_init_read_super - read reiser4 master super block -+ * @super: super block to fill -+ * @silent: if 0 - print warnings -+ * -+ * Reads reiser4 master super block either from predefined location or from -+ * location specified by altsuper mount option, initializes disk format plugin. -+ */ -+int reiser4_init_read_super(struct super_block *super, int silent) -+{ -+ struct buffer_head *super_bh; -+ struct reiser4_master_sb *master_sb; -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ unsigned long blocksize; -+ -+ read_super_block: -+#ifdef CONFIG_REISER4_BADBLOCKS -+ if (sbinfo->altsuper) -+ /* -+ * read reiser4 master super block at position specified by -+ * mount option -+ */ -+ super_bh = sb_bread(super, -+ (sector_t)(sbinfo->altsuper / super->s_blocksize)); -+ else -+#endif -+ /* read reiser4 master super block at 16-th 4096 block */ -+ super_bh = sb_bread(super, -+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize)); -+ if (!super_bh) -+ return RETERR(-EIO); -+ -+ master_sb = (struct reiser4_master_sb *)super_bh->b_data; -+ /* check reiser4 magic string */ -+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING, -+ sizeof(REISER4_SUPER_MAGIC_STRING))) { -+ /* reiser4 master super block contains filesystem blocksize */ -+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize)); -+ -+ if (blocksize != PAGE_CACHE_SIZE) { -+ /* -+ * currenly reiser4's blocksize must be equal to -+ * pagesize -+ */ -+ if (!silent) -+ warning("nikita-2609", -+ "%s: wrong block size %ld\n", super->s_id, -+ blocksize); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+ } -+ if (blocksize != super->s_blocksize) { -+ /* -+ * filesystem uses different blocksize. 
Reread master -+ * super block with correct blocksize -+ */ -+ brelse(super_bh); -+ if (!sb_set_blocksize(super, (int)blocksize)) -+ return RETERR(-EINVAL); -+ goto read_super_block; -+ } -+ -+ sbinfo->df_plug = -+ disk_format_plugin_by_id( -+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); -+ if (sbinfo->df_plug == NULL) { -+ if (!silent) -+ warning("nikita-26091", -+ "%s: unknown disk format plugin %d\n", -+ super->s_id, -+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+ } -+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap)); -+ brelse(super_bh); -+ return 0; -+ } -+ -+ /* there is no reiser4 on the device */ -+ if (!silent) -+ warning("nikita-2608", -+ "%s: wrong master super block magic", super->s_id); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+} -+ -+static struct { -+ reiser4_plugin_type type; -+ reiser4_plugin_id id; -+} default_plugins[PSET_LAST] = { -+ [PSET_FILE] = { -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID -+ }, -+ [PSET_DIR] = { -+ .type = REISER4_DIR_PLUGIN_TYPE, -+ .id = HASHED_DIR_PLUGIN_ID -+ }, -+ [PSET_HASH] = { -+ .type = REISER4_HASH_PLUGIN_TYPE, -+ .id = R5_HASH_ID -+ }, -+ [PSET_FIBRATION] = { -+ .type = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_DOT_O -+ }, -+ [PSET_PERM] = { -+ .type = REISER4_PERM_PLUGIN_TYPE, -+ .id = NULL_PERM_ID -+ }, -+ [PSET_FORMATTING] = { -+ .type = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = SMALL_FILE_FORMATTING_ID -+ }, -+ [PSET_SD] = { -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .id = STATIC_STAT_DATA_ID -+ }, -+ [PSET_DIR_ITEM] = { -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .id = COMPOUND_DIR_ID -+ }, -+ [PSET_CIPHER] = { -+ .type = REISER4_CIPHER_PLUGIN_TYPE, -+ .id = NONE_CIPHER_ID -+ }, -+ [PSET_DIGEST] = { -+ .type = REISER4_DIGEST_PLUGIN_TYPE, -+ .id = SHA256_32_DIGEST_ID -+ }, -+ [PSET_COMPRESSION] = { -+ .type = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = LZO1_COMPRESSION_ID -+ }, -+ [PSET_COMPRESSION_MODE] = { -+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = CONVX_COMPRESSION_MODE_ID -+ }, -+ [PSET_CLUSTER] = { -+ .type = REISER4_CLUSTER_PLUGIN_TYPE, -+ .id = CLUSTER_64K_ID -+ }, -+ [PSET_CREATE] = { -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID -+ } -+}; -+ -+/* access to default plugin table */ -+reiser4_plugin *get_default_plugin(pset_member memb) -+{ -+ return plugin_by_id(default_plugins[memb].type, -+ default_plugins[memb].id); -+} -+ -+/** -+ * reiser4_init_root_inode - obtain inode of root directory -+ * @super: super block of filesystem -+ * -+ * Obtains inode of root directory (reading it from disk), initializes plugin -+ * set it was not initialized. 
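reiser4_init_read_super() above probes a fixed byte offset (the 16th 4096-byte block), checks the magic, and rereads once with the blocksize the master super block advertises. Here is the probe reduced to userspace; the magic value "ReIsEr4" is an assumption for illustration, the real constant is REISER4_SUPER_MAGIC_STRING elsewhere in the tree:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    #define MAGIC_OFFSET (16 * 4096)  /* REISER4_MAGIC_OFFSET in the patch */
    #define MAGIC "ReIsEr4"           /* assumed value, see note above */

    int looks_like_reiser4(const char *dev)
    {
            char buf[sizeof(MAGIC)];
            int fd = open(dev, O_RDONLY);
            int ok = 0;

            if (fd < 0)
                    return 0;
            if (pread(fd, buf, sizeof(buf), MAGIC_OFFSET) == (ssize_t)sizeof(buf))
                    /* the kernel side compares sizeof(...) bytes, NUL included */
                    ok = memcmp(buf, MAGIC, sizeof(MAGIC)) == 0;
            close(fd);
            return ok;
    }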
-+ */ -+int reiser4_init_root_inode(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ struct inode *inode; -+ int result = 0; -+ -+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0); -+ if (IS_ERR(inode)) -+ return RETERR(PTR_ERR(inode)); -+ -+ super->s_root = d_alloc_root(inode); -+ if (!super->s_root) { -+ iput(inode); -+ return RETERR(-ENOMEM); -+ } -+ -+ super->s_root->d_op = &sbinfo->ops.dentry; -+ -+ if (!is_inode_loaded(inode)) { -+ pset_member memb; -+ plugin_set *pset; -+ -+ pset = reiser4_inode_data(inode)->pset; -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ -+ if (aset_get(pset, memb) != NULL) -+ continue; -+ -+ result = grab_plugin_pset(inode, NULL, memb); -+ if (result != 0) -+ break; -+ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ } -+ -+ if (result == 0) { -+ if (REISER4_DEBUG) { -+ for (memb = 0; memb < PSET_LAST; ++memb) -+ assert("nikita-3500", -+ aset_get(pset, memb) != NULL); -+ } -+ } else -+ warning("nikita-3448", "Cannot set plugins of root: %i", -+ result); -+ reiser4_iget_complete(inode); -+ -+ /* As the default pset kept in the root dir may has been changed -+ (length is unknown), call update_sd. */ -+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { -+ result = reiser4_grab_space( -+ inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ -+ if (result == 0) -+ result = reiser4_update_sd(inode); -+ -+ all_grabbed2free(); -+ } -+ } -+ -+ super->s_maxbytes = MAX_LFS_FILESIZE; -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/inode.c linux-2.6.30/fs/reiser4/inode.c ---- linux-2.6.30.orig/fs/reiser4/inode.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/inode.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,711 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* Inode specific operations. 
*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "kassign.h" -+#include "coord.h" -+#include "seal.h" -+#include "dscale.h" -+#include "plugin/item/item.h" -+#include "plugin/security/perm.h" -+#include "plugin/plugin.h" -+#include "plugin/object.h" -+#include "znode.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include <linux/fs.h> /* for struct super_block, address_space */ -+ -+/* return reiser4 internal tree which inode belongs to */ -+/* Audited by: green(2002.06.17) */ -+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode/* inode queried*/) -+{ -+ assert("nikita-256", inode != NULL); -+ assert("nikita-257", inode->i_sb != NULL); -+ return reiser4_get_tree(inode->i_sb); -+} -+ -+/* return reiser4-specific inode flags */ -+static inline unsigned long *inode_flags(const struct inode *const inode) -+{ -+ assert("nikita-2842", inode != NULL); -+ return &reiser4_inode_data(inode)->flags; -+} -+ -+/* set reiser4-specific flag @f in @inode */ -+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f) -+{ -+ assert("nikita-2248", inode != NULL); -+ set_bit((int)f, inode_flags(inode)); -+} -+ -+/* clear reiser4-specific flag @f in @inode */ -+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f) -+{ -+ assert("nikita-2250", inode != NULL); -+ clear_bit((int)f, inode_flags(inode)); -+} -+ -+/* true if reiser4-specific flag @f is set in @inode */ -+int reiser4_inode_get_flag(const struct inode *inode, -+ reiser4_file_plugin_flags f) -+{ -+ assert("nikita-2251", inode != NULL); -+ return test_bit((int)f, inode_flags(inode)); -+} -+ -+/* convert oid to inode number */ -+ino_t oid_to_ino(oid_t oid) -+{ -+ return (ino_t) oid; -+} -+ -+/* convert oid to user visible inode number */ -+ino_t oid_to_uino(oid_t oid) -+{ -+ /* reiser4 object is uniquely identified by oid which is 64 bit -+ quantity. Kernel in-memory inode is indexed (in the hash table) by -+ 32 bit i_ino field, but this is not a problem, because there is a -+ way to further distinguish inodes with identical inode numbers -+ (find_actor supplied to iget()). -+ -+ But user space expects unique 32 bit inode number. Obviously this -+ is impossible. Work-around is to somehow hash oid into user visible -+ inode number. -+ */ -+ oid_t max_ino = (ino_t) ~0; -+ -+ if (REISER4_INO_IS_OID || (oid <= max_ino)) -+ return oid; -+ else -+ /* this is remotely similar to algorithm used to find next pid -+ to use for process: after wrap-around start from some -+ offset rather than from 0. Idea is that there are some long -+ living objects with which we don't want to collide. -+ */ -+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1)); -+} -+ -+/* check that "inode" is on reiser4 file-system */ -+int is_reiser4_inode(const struct inode *inode/* inode queried */) -+{ -+ return inode != NULL && is_reiser4_super(inode->i_sb); -+} -+ -+/* Maximal length of a name that can be stored in directory @inode. -+ -+ This is used in check during file creation and lookup. */ -+int reiser4_max_filename_len(const struct inode *inode/* inode queried */) -+{ -+ assert("nikita-287", is_reiser4_inode(inode)); -+ assert("nikita-1710", inode_dir_item_plugin(inode)); -+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len) -+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode); -+ else -+ return 255; -+} -+ -+#if REISER4_USE_COLLISION_LIMIT -+/* Maximal number of hash collisions for this directory. 
*/ -+int max_hash_collisions(const struct inode *dir/* inode queried */) -+{ -+ assert("nikita-1711", dir != NULL); -+ return reiser4_inode_data(dir)->plugin.max_collisions; -+} -+#endif /* REISER4_USE_COLLISION_LIMIT */ -+ -+/* Install file, inode, and address_space operation on @inode, depending on -+ its mode. */ -+int setup_inode_ops(struct inode *inode /* inode to intialize */ , -+ reiser4_object_create_data * data /* parameters to create -+ * object */ ) -+{ -+ reiser4_super_info_data *sinfo; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ fplug = inode_file_plugin(inode); -+ dplug = inode_dir_plugin(inode); -+ -+ sinfo = get_super_private(inode->i_sb); -+ -+ switch (inode->i_mode & S_IFMT) { -+ case S_IFSOCK: -+ case S_IFBLK: -+ case S_IFCHR: -+ case S_IFIFO: -+ { -+ dev_t rdev; /* to keep gcc happy */ -+ -+ assert("vs-46", fplug != NULL); -+ /* ugly hack with rdev */ -+ if (data == NULL) { -+ rdev = inode->i_rdev; -+ inode->i_rdev = 0; -+ } else -+ rdev = data->rdev; -+ inode->i_blocks = 0; -+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID); -+ inode->i_op = file_plugins[fplug->h.id].inode_ops; -+ /* initialize inode->i_fop and inode->i_rdev for block -+ and char devices */ -+ init_special_inode(inode, inode->i_mode, rdev); -+ /* all address space operations are null */ -+ inode->i_mapping->a_ops = -+ file_plugins[fplug->h.id].as_ops; -+ break; -+ } -+ case S_IFLNK: -+ assert("vs-46", fplug != NULL); -+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID); -+ inode->i_op = file_plugins[fplug->h.id].inode_ops; -+ inode->i_fop = NULL; -+ /* all address space operations are null */ -+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops; -+ break; -+ case S_IFDIR: -+ assert("vs-46", dplug != NULL); -+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID || -+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID)); -+ inode->i_op = dir_plugins[dplug->h.id].inode_ops; -+ inode->i_fop = dir_plugins[dplug->h.id].file_ops; -+ inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops; -+ break; -+ case S_IFREG: -+ assert("vs-46", fplug != NULL); -+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID || -+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ inode->i_op = file_plugins[fplug->h.id].inode_ops; -+ inode->i_fop = file_plugins[fplug->h.id].file_ops; -+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops; -+ break; -+ default: -+ warning("nikita-291", "wrong file mode: %o for %llu", -+ inode->i_mode, -+ (unsigned long long)get_inode_oid(inode)); -+ reiser4_make_bad_inode(inode); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/* Initialize inode from disk data. Called with inode locked. -+ Return inode locked. 
*/ -+static int init_inode(struct inode *inode /* inode to intialise */ , -+ coord_t *coord/* coord of stat data */) -+{ -+ int result; -+ item_plugin *iplug; -+ void *body; -+ int length; -+ reiser4_inode *state; -+ -+ assert("nikita-292", coord != NULL); -+ assert("nikita-293", inode != NULL); -+ -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result) -+ return result; -+ iplug = item_plugin_by_coord(coord); -+ body = item_body_by_coord(coord); -+ length = item_length_by_coord(coord); -+ -+ assert("nikita-295", iplug != NULL); -+ assert("nikita-296", body != NULL); -+ assert("nikita-297", length > 0); -+ -+ /* inode is under I_LOCK now */ -+ -+ state = reiser4_inode_data(inode); -+ /* call stat-data plugin method to load sd content into inode */ -+ result = iplug->s.sd.init_inode(inode, body, length); -+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug)); -+ if (result == 0) { -+ result = setup_inode_ops(inode, NULL); -+ if (result == 0 && inode->i_sb->s_root && -+ inode->i_sb->s_root->d_inode) -+ result = finish_pset(inode); -+ } -+ zrelse(coord->node); -+ return result; -+} -+ -+/* read `inode' from the disk. This is what was previously in -+ reiserfs_read_inode2(). -+ -+ Must be called with inode locked. Return inode still locked. -+*/ -+static int read_inode(struct inode *inode /* inode to read from disk */ , -+ const reiser4_key * key /* key of stat data */ , -+ int silent) -+{ -+ int result; -+ lock_handle lh; -+ reiser4_inode *info; -+ coord_t coord; -+ -+ assert("nikita-298", inode != NULL); -+ assert("nikita-1945", !is_inode_loaded(inode)); -+ -+ info = reiser4_inode_data(inode); -+ assert("nikita-300", info->locality_id != 0); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ /* locate stat-data in a tree and return znode locked */ -+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent); -+ assert("nikita-301", !is_inode_loaded(inode)); -+ if (result == 0) { -+ /* use stat-data plugin to load sd into inode. */ -+ result = init_inode(inode, &coord); -+ if (result == 0) { -+ /* initialize stat-data seal */ -+ spin_lock_inode(inode); -+ reiser4_seal_init(&info->sd_seal, &coord, key); -+ info->sd_coord = coord; -+ spin_unlock_inode(inode); -+ -+ /* call file plugin's method to initialize plugin -+ * specific part of inode */ -+ if (inode_file_plugin(inode)->init_inode_data) -+ inode_file_plugin(inode)->init_inode_data(inode, -+ NULL, -+ 0); -+ /* load detached directory cursors for stateless -+ * directory readers (NFS). */ -+ reiser4_load_cursors(inode); -+ -+ /* Check the opened inode for consistency. */ -+ result = -+ get_super_private(inode->i_sb)->df_plug-> -+ check_open(inode); -+ } -+ } -+ /* lookup_sd() doesn't release coord because we want znode -+ stay read-locked while stat-data fields are accessed in -+ init_inode() */ -+ done_lh(&lh); -+ -+ if (result != 0) -+ reiser4_make_bad_inode(inode); -+ return result; -+} -+ -+/* initialise new reiser4 inode being inserted into hash table. */ -+static int init_locked_inode(struct inode *inode /* new inode */ , -+ void *opaque /* key of stat data passed to -+ * the iget5_locked as cookie */) -+{ -+ reiser4_key *key; -+ -+ assert("nikita-1995", inode != NULL); -+ assert("nikita-1996", opaque != NULL); -+ key = opaque; -+ set_inode_oid(inode, get_key_objectid(key)); -+ reiser4_inode_data(inode)->locality_id = get_key_locality(key); -+ return 0; -+} -+ -+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to -+ iget5_locked(). 
-+ -+ This function is called by iget5_locked() to distinguish reiser4 inodes -+ having the same inode numbers. Such inodes can only exist due to some error -+ condition. One of them should be bad. Inodes with identical inode numbers -+ (objectids) are distinguished by their packing locality. -+ -+*/ -+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table -+ * to check */ , -+ void *opaque /* "cookie" passed to -+ * iget5_locked(). This -+ * is stat-data key */) -+{ -+ reiser4_key *key; -+ -+ key = opaque; -+ return -+ /* oid is unique, so first term is enough, actually. */ -+ get_inode_oid(inode) == get_key_objectid(key) && -+ /* -+ * also, locality should be checked, but locality is stored in -+ * the reiser4-specific part of the inode, and actor can be -+ * called against arbitrary inode that happened to be in this -+ * hash chain. Hence we first have to check that this is -+ * reiser4 inode at least. is_reiser4_inode() is probably too -+ * early to call, as inode may have ->i_op not yet -+ * initialised. -+ */ -+ is_reiser4_super(inode->i_sb) && -+ /* -+ * usually objectid is unique, but pseudo files use counter to -+ * generate objectid. All pseudo files are placed into special -+ * (otherwise unused) locality. -+ */ -+ reiser4_inode_data(inode)->locality_id == get_key_locality(key); -+} -+ -+/* hook for kmem_cache_create */ -+void loading_init_once(reiser4_inode * info) -+{ -+ mutex_init(&info->loading); -+} -+ -+/* for reiser4_alloc_inode */ -+void loading_alloc(reiser4_inode * info) -+{ -+ assert("vs-1717", !mutex_is_locked(&info->loading)); -+} -+ -+/* for reiser4_destroy */ -+void loading_destroy(reiser4_inode * info) -+{ -+ assert("vs-1717a", !mutex_is_locked(&info->loading)); -+} -+ -+static void loading_begin(reiser4_inode * info) -+{ -+ mutex_lock(&info->loading); -+} -+ -+static void loading_end(reiser4_inode * info) -+{ -+ mutex_unlock(&info->loading); -+} -+ -+/** -+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary -+ * @super: super block of filesystem -+ * @key: key of inode's stat-data -+ * @silent: -+ * -+ * This is our helper function a la iget(). This is be called by -+ * lookup_common() and reiser4_read_super(). Return inode locked or error -+ * encountered. -+ */ -+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key, -+ int silent) -+{ -+ struct inode *inode; -+ int result; -+ reiser4_inode *info; -+ -+ assert("nikita-302", super != NULL); -+ assert("nikita-303", key != NULL); -+ -+ result = 0; -+ -+ /* call iget(). Our ->read_inode() is dummy, so this will either -+ find inode in cache or return uninitialised inode */ -+ inode = iget5_locked(super, -+ (unsigned long)get_key_objectid(key), -+ reiser4_inode_find_actor, -+ init_locked_inode, (reiser4_key *) key); -+ if (inode == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ if (is_bad_inode(inode)) { -+ warning("nikita-304", "Bad inode found"); -+ reiser4_print_key("key", key); -+ iput(inode); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ info = reiser4_inode_data(inode); -+ -+ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully -+ loaded and initialized inode from just allocated inode. If -+ REISER4_LOADED bit is not set, reiser4_iget() completes loading under -+ info->loading. 
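The REISER4_LOADED/loading handshake described above is a load-once protocol: recheck the flag under the mutex so exactly one thread reads the inode in, while later callers return immediately. A sketch folded into a single function for brevity (in the patch the mutex is released later, by reiser4_iget_complete()); names are illustrative:

    #include <pthread.h>
    #include <stdbool.h>

    struct obj {
            pthread_mutex_t loading;    /* like reiser4_inode->loading */
            bool loaded;                /* like the REISER4_LOADED bit */
    };

    static int get_loaded(struct obj *o, int (*read_in)(struct obj *))
    {
            int err = 0;

            if (!o->loaded) {
                    pthread_mutex_lock(&o->loading);
                    if (!o->loaded) {           /* lost a race? recheck */
                            err = read_in(o);
                            if (err == 0)
                                    o->loaded = true;
                    }
                    pthread_mutex_unlock(&o->loading);
            }
            return err;
    }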
The place in reiser4 which uses not initialized inode -+ is the reiser4 repacker, see repacker-related functions in -+ plugin/item/extent.c */ -+ if (!is_inode_loaded(inode)) { -+ loading_begin(info); -+ if (!is_inode_loaded(inode)) { -+ /* locking: iget5_locked returns locked inode */ -+ assert("nikita-1941", !is_inode_loaded(inode)); -+ assert("nikita-1949", -+ reiser4_inode_find_actor(inode, -+ (reiser4_key *) key)); -+ /* now, inode has objectid as ->i_ino and locality in -+ reiser4-specific part. This is enough for -+ read_inode() to read stat data from the disk */ -+ result = read_inode(inode, key, silent); -+ } else -+ loading_end(info); -+ } -+ -+ if (inode->i_state & I_NEW) -+ unlock_new_inode(inode); -+ -+ if (is_bad_inode(inode)) { -+ assert("vs-1717", result != 0); -+ loading_end(info); -+ iput(inode); -+ inode = ERR_PTR(result); -+ } else if (REISER4_DEBUG) { -+ reiser4_key found_key; -+ -+ assert("vs-1717", result == 0); -+ build_sd_key(inode, &found_key); -+ if (!keyeq(&found_key, key)) { -+ warning("nikita-305", "Wrong key in sd"); -+ reiser4_print_key("sought for", key); -+ reiser4_print_key("found", &found_key); -+ } -+ if (inode->i_nlink == 0) { -+ warning("nikita-3559", "Unlinked inode found: %llu\n", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ } -+ return inode; -+} -+ -+/* reiser4_iget() may return not fully initialized inode, this function should -+ * be called after one completes reiser4 inode initializing. */ -+void reiser4_iget_complete(struct inode *inode) -+{ -+ assert("zam-988", is_reiser4_inode(inode)); -+ -+ if (!is_inode_loaded(inode)) { -+ reiser4_inode_set_flag(inode, REISER4_LOADED); -+ loading_end(reiser4_inode_data(inode)); -+ } -+} -+ -+void reiser4_make_bad_inode(struct inode *inode) -+{ -+ assert("nikita-1934", inode != NULL); -+ -+ /* clear LOADED bit */ -+ reiser4_inode_clr_flag(inode, REISER4_LOADED); -+ make_bad_inode(inode); -+ return; -+} -+ -+file_plugin *inode_file_plugin(const struct inode *inode) -+{ -+ assert("nikita-1997", inode != NULL); -+ return reiser4_inode_data(inode)->pset->file; -+} -+ -+dir_plugin *inode_dir_plugin(const struct inode *inode) -+{ -+ assert("nikita-1998", inode != NULL); -+ return reiser4_inode_data(inode)->pset->dir; -+} -+ -+formatting_plugin *inode_formatting_plugin(const struct inode *inode) -+{ -+ assert("nikita-2000", inode != NULL); -+ return reiser4_inode_data(inode)->pset->formatting; -+} -+ -+hash_plugin *inode_hash_plugin(const struct inode *inode) -+{ -+ assert("nikita-2001", inode != NULL); -+ return reiser4_inode_data(inode)->pset->hash; -+} -+ -+fibration_plugin *inode_fibration_plugin(const struct inode *inode) -+{ -+ assert("nikita-2001", inode != NULL); -+ return reiser4_inode_data(inode)->pset->fibration; -+} -+ -+cipher_plugin *inode_cipher_plugin(const struct inode *inode) -+{ -+ assert("edward-36", inode != NULL); -+ return reiser4_inode_data(inode)->pset->cipher; -+} -+ -+compression_plugin *inode_compression_plugin(const struct inode *inode) -+{ -+ assert("edward-37", inode != NULL); -+ return reiser4_inode_data(inode)->pset->compression; -+} -+ -+compression_mode_plugin *inode_compression_mode_plugin(const struct inode * -+ inode) -+{ -+ assert("edward-1330", inode != NULL); -+ return reiser4_inode_data(inode)->pset->compression_mode; -+} -+ -+cluster_plugin *inode_cluster_plugin(const struct inode *inode) -+{ -+ assert("edward-1328", inode != NULL); -+ return reiser4_inode_data(inode)->pset->cluster; -+} -+ -+file_plugin *inode_create_plugin(const struct inode *inode) -+{ -+ 
assert("edward-1329", inode != NULL); -+ return reiser4_inode_data(inode)->pset->create; -+} -+ -+digest_plugin *inode_digest_plugin(const struct inode *inode) -+{ -+ assert("edward-86", inode != NULL); -+ return reiser4_inode_data(inode)->pset->digest; -+} -+ -+item_plugin *inode_sd_plugin(const struct inode *inode) -+{ -+ assert("vs-534", inode != NULL); -+ return reiser4_inode_data(inode)->pset->sd; -+} -+ -+item_plugin *inode_dir_item_plugin(const struct inode *inode) -+{ -+ assert("vs-534", inode != NULL); -+ return reiser4_inode_data(inode)->pset->dir_item; -+} -+ -+file_plugin *child_create_plugin(const struct inode *inode) -+{ -+ assert("edward-1329", inode != NULL); -+ return reiser4_inode_data(inode)->hset->create; -+} -+ -+void inode_set_extension(struct inode *inode, sd_ext_bits ext) -+{ -+ reiser4_inode *state; -+ -+ assert("nikita-2716", inode != NULL); -+ assert("nikita-2717", ext < LAST_SD_EXTENSION); -+ assert("nikita-3491", spin_inode_is_locked(inode)); -+ -+ state = reiser4_inode_data(inode); -+ state->extmask |= 1 << ext; -+ /* force re-calculation of stat-data length on next call to -+ update_sd(). */ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+} -+ -+void inode_clr_extension(struct inode *inode, sd_ext_bits ext) -+{ -+ reiser4_inode *state; -+ -+ assert("vpf-1926", inode != NULL); -+ assert("vpf-1927", ext < LAST_SD_EXTENSION); -+ assert("vpf-1928", spin_inode_is_locked(inode)); -+ -+ state = reiser4_inode_data(inode); -+ state->extmask &= ~(1 << ext); -+ /* force re-calculation of stat-data length on next call to -+ update_sd(). */ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+} -+ -+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new) -+{ -+ assert("edward-1287", inode != NULL); -+ if (!dscale_fit(old, new)) -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ return; -+} -+ -+void inode_check_scale(struct inode *inode, __u64 old, __u64 new) -+{ -+ assert("nikita-2875", inode != NULL); -+ spin_lock_inode(inode); -+ inode_check_scale_nolock(inode, old, new); -+ spin_unlock_inode(inode); -+} -+ -+/* -+ * initialize ->ordering field of inode. This field defines how file stat-data -+ * and body is ordered within a tree with respect to other objects within the -+ * same parent directory. 
-+ */ -+void -+init_inode_ordering(struct inode *inode, -+ reiser4_object_create_data * crd, int create) -+{ -+ reiser4_key key; -+ -+ if (create) { -+ struct inode *parent; -+ -+ parent = crd->parent; -+ assert("nikita-3224", inode_dir_plugin(parent) != NULL); -+ inode_dir_plugin(parent)->build_entry_key(parent, -+ &crd->dentry->d_name, -+ &key); -+ } else { -+ coord_t *coord; -+ -+ coord = &reiser4_inode_data(inode)->sd_coord; -+ coord_clear_iplug(coord); -+ /* safe to use ->sd_coord, because node is under long term -+ * lock */ -+ WITH_DATA(coord->node, item_key_by_coord(coord, &key)); -+ } -+ -+ set_inode_ordering(inode, get_key_ordering(&key)); -+} -+ -+znode *inode_get_vroot(struct inode *inode) -+{ -+ reiser4_block_nr blk; -+ znode *result; -+ -+ spin_lock_inode(inode); -+ blk = reiser4_inode_data(inode)->vroot; -+ spin_unlock_inode(inode); -+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk)) -+ result = zlook(reiser4_tree_by_inode(inode), &blk); -+ else -+ result = NULL; -+ return result; -+} -+ -+void inode_set_vroot(struct inode *inode, znode *vroot) -+{ -+ spin_lock_inode(inode); -+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot); -+ spin_unlock_inode(inode); -+} -+ -+#if REISER4_DEBUG -+ -+void reiser4_inode_invariant(const struct inode *inode) -+{ -+ assert("nikita-3077", spin_inode_is_locked(inode)); -+} -+ -+int inode_has_no_jnodes(reiser4_inode * r4_inode) -+{ -+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL && -+ r4_inode->nr_jnodes == 0; -+} -+ -+#endif -+ -+/* true if directory is empty (only contains dot and dotdot) */ -+/* FIXME: shouldn't it be dir plugin method? */ -+int is_dir_empty(const struct inode *dir) -+{ -+ assert("nikita-1976", dir != NULL); -+ -+ /* rely on our method to maintain directory i_size being equal to the -+ number of entries. */ -+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/inode.h linux-2.6.30/fs/reiser4/inode.h ---- linux-2.6.30.orig/fs/reiser4/inode.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/inode.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,453 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* Inode functions. */ -+ -+#if !defined(__REISER4_INODE_H__) -+#define __REISER4_INODE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "seal.h" -+#include "plugin/plugin.h" -+#include "plugin/file/cryptcompress.h" -+#include "plugin/file/file.h" -+#include "plugin/dir/dir.h" -+#include "plugin/plugin_set.h" -+#include "plugin/security/perm.h" -+#include "vfs_ops.h" -+#include "jnode.h" -+#include "fsdata.h" -+ -+#include <linux/types.h> /* for __u?? , ino_t */ -+#include <linux/fs.h> /* for struct super_block, struct -+ * rw_semaphore, etc */ -+#include <linux/spinlock.h> -+#include <asm/types.h> -+ -+/* reiser4-specific inode flags. They are "transient" and are not -+ supposed to be stored on disk. Used to trace "state" of -+ inode -+*/ -+typedef enum { -+ /* this is light-weight inode, inheriting some state from its -+ parent */ -+ REISER4_LIGHT_WEIGHT = 0, -+ /* stat data wasn't yet created */ -+ REISER4_NO_SD = 1, -+ /* internal immutable flag. Currently is only used -+ to avoid race condition during file creation. -+ See comment in create_object(). 
*/ -+ REISER4_IMMUTABLE = 2, -+ /* inode was read from storage */ -+ REISER4_LOADED = 3, -+ /* this bit is set for symlinks. inode->i_private points to target -+ name of symlink. */ -+ REISER4_GENERIC_PTR_USED = 4, -+ /* set if size of stat-data item for this inode is known. If this is -+ * set we can avoid recalculating size of stat-data on each update. */ -+ REISER4_SDLEN_KNOWN = 5, -+ /* reiser4_inode->crypt points to the crypto stat */ -+ REISER4_CRYPTO_STAT_LOADED = 6, -+ /* cryptcompress_inode_data points to the secret key */ -+ REISER4_SECRET_KEY_INSTALLED = 7, -+ /* File (possibly) has pages corresponding to the tail items, that -+ * were created by ->readpage. It is set by mmap_unix_file() and -+ * sendfile_unix_file(). This bit is inspected by write_unix_file and -+ * kill-hook of tail items. It is never cleared once set. This bit is -+ * modified and inspected under i_mutex. */ -+ REISER4_HAS_MMAP = 8, -+ REISER4_PART_MIXED = 9, -+ REISER4_PART_IN_CONV = 10, -+ /* This flag indicates that file plugin conversion is in progress */ -+ REISER4_FILE_CONV_IN_PROGRESS = 11 -+} reiser4_file_plugin_flags; -+ -+/* state associated with each inode. -+ reiser4 inode. -+ -+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes -+ be of the same size. File-system allocates inodes by itself through -+ s_op->allocate_inode() method. So, it is possible to adjust size of inode -+ at the time of its creation. -+ -+ Invariants involving parts of this data-type: -+ -+ [inode->eflushed] -+ -+*/ -+ -+typedef struct reiser4_inode reiser4_inode; -+/* return pointer to reiser4-specific part of inode */ -+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode -+ /* inode queried */ ); -+ -+#if BITS_PER_LONG == 64 -+ -+#define REISER4_INO_IS_OID (1) -+typedef struct {; -+} oid_hi_t; -+ -+/* BITS_PER_LONG == 64 */ -+#else -+ -+#define REISER4_INO_IS_OID (0) -+typedef __u32 oid_hi_t; -+ -+/* BITS_PER_LONG == 64 */ -+#endif -+ -+struct reiser4_inode { -+ /* spin lock protecting fields of this structure. */ -+ spinlock_t guard; -+ /* main plugin set that control the file -+ (see comments in plugin/plugin_set.c) */ -+ plugin_set *pset; -+ /* plugin set for inheritance -+ (see comments in plugin/plugin_set.c) */ -+ plugin_set *hset; -+ /* high 32 bits of object id */ -+ oid_hi_t oid_hi; -+ /* seal for stat-data */ -+ seal_t sd_seal; -+ /* locality id for this file */ -+ oid_t locality_id; -+#if REISER4_LARGE_KEY -+ __u64 ordering; -+#endif -+ /* coord of stat-data in sealed node */ -+ coord_t sd_coord; -+ /* bit-mask of stat-data extentions used by this file */ -+ __u64 extmask; -+ /* bitmask of non-default plugins for this inode */ -+ __u16 plugin_mask; -+ /* bitmask of set heir plugins for this inode. */ -+ __u16 heir_mask; -+ union { -+ struct list_head readdir_list; -+ struct list_head not_used; -+ } lists; -+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */ -+ unsigned long flags; -+ union { -+ /* fields specific to unix_file plugin */ -+ struct unix_file_info unix_file_info; -+ /* fields specific to cryptcompress file plugin */ -+ struct cryptcompress_info cryptcompress_info; -+ } file_plugin_data; -+ -+ /* this semaphore is to serialize readers and writers of @pset->file -+ * when file plugin conversion is enabled -+ */ -+ struct rw_semaphore conv_sem; -+ -+ /* tree of jnodes. 
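struct reiser4_inode above never lives on its own: struct reiser4_inode_object, defined just below, embeds it next to the VFS inode in a single allocation, and container_of() converts either pointer into the other. The pattern in userspace terms (illustrative types, classic container_of definition):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct vfs_inode { long i_ino; };

    struct fs_inode {                 /* like struct reiser4_inode_object */
            int private_state;        /* the fs-private part */
            struct vfs_inode vfs;     /* the generic part, embedded */
    };

    static struct fs_inode *fs_inode_data(struct vfs_inode *inode)
    {
            return container_of(inode, struct fs_inode, vfs);
    }

    int main(void)
    {
            struct fs_inode obj = { .private_state = 7, .vfs = { .i_ino = 42 } };

            printf("%d\n", fs_inode_data(&obj.vfs)->private_state);
            return 0;
    }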
Phantom jnodes (ones not attched to any atom) are -+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */ -+ struct radix_tree_root jnodes_tree; -+#if REISER4_DEBUG -+ /* number of unformatted node jnodes of this file in jnode hash table */ -+ unsigned long nr_jnodes; -+#endif -+ -+ /* block number of virtual root for this object. See comment above -+ * fs/reiser4/search.c:handle_vroot() */ -+ reiser4_block_nr vroot; -+ struct mutex loading; -+}; -+ -+void loading_init_once(reiser4_inode *); -+void loading_alloc(reiser4_inode *); -+void loading_destroy(reiser4_inode *); -+ -+struct reiser4_inode_object { -+ /* private part */ -+ reiser4_inode p; -+ /* generic fields not specific to reiser4, but used by VFS */ -+ struct inode vfs_inode; -+}; -+ -+/* return pointer to the reiser4 specific portion of @inode */ -+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode -+ /* inode queried */ ) -+{ -+ assert("nikita-254", inode != NULL); -+ return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p; -+} -+ -+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode * -+ r4_inode /* inode queried */ -+ ) -+{ -+ return &container_of(r4_inode, struct reiser4_inode_object, -+ p)->vfs_inode; -+} -+ -+/* -+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct -+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64 -+ * bits. -+ * -+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part -+ * of inode, otherwise whole oid is stored in i_ino. -+ * -+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference. -+ */ -+ -+#define OID_HI_SHIFT (sizeof(ino_t) * 8) -+ -+#if REISER4_INO_IS_OID -+ -+static inline oid_t get_inode_oid(const struct inode *inode) -+{ -+ return inode->i_ino; -+} -+ -+static inline void set_inode_oid(struct inode *inode, oid_t oid) -+{ -+ inode->i_ino = oid; -+} -+ -+/* REISER4_INO_IS_OID */ -+#else -+ -+static inline oid_t get_inode_oid(const struct inode *inode) -+{ -+ return -+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) | -+ inode->i_ino; -+} -+ -+static inline void set_inode_oid(struct inode *inode, oid_t oid) -+{ -+ assert("nikita-2519", inode != NULL); -+ inode->i_ino = (ino_t) (oid); -+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT; -+ assert("nikita-2521", get_inode_oid(inode) == (oid)); -+} -+ -+/* REISER4_INO_IS_OID */ -+#endif -+ -+static inline oid_t get_inode_locality(const struct inode *inode) -+{ -+ return reiser4_inode_data(inode)->locality_id; -+} -+ -+#if REISER4_LARGE_KEY -+static inline __u64 get_inode_ordering(const struct inode *inode) -+{ -+ return reiser4_inode_data(inode)->ordering; -+} -+ -+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering) -+{ -+ reiser4_inode_data(inode)->ordering = ordering; -+} -+ -+#else -+ -+#define get_inode_ordering(inode) (0) -+#define set_inode_ordering(inode, val) noop -+ -+#endif -+ -+/* return inode in which @uf_info is embedded */ -+static inline struct inode * -+unix_file_info_to_inode(const struct unix_file_info *uf_info) -+{ -+ return &container_of(uf_info, struct reiser4_inode_object, -+ p.file_plugin_data.unix_file_info)->vfs_inode; -+} -+ -+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const)); -+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const)); -+ -+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode); -+ -+#if REISER4_DEBUG -+extern void reiser4_inode_invariant(const struct inode *inode); -+extern int 
inode_has_no_jnodes(reiser4_inode *); -+#else -+#define reiser4_inode_invariant(inode) noop -+#endif -+ -+static inline int spin_inode_is_locked(const struct inode *inode) -+{ -+ assert_spin_locked(&reiser4_inode_data(inode)->guard); -+ return 1; -+} -+ -+/** -+ * spin_lock_inode - lock reiser4_inode' embedded spinlock -+ * @inode: inode to lock -+ * -+ * In debug mode it checks that lower priority locks are not held and -+ * increments reiser4_context's lock counters on which lock ordering checking -+ * is based. -+ */ -+static inline void spin_lock_inode(struct inode *inode) -+{ -+ assert("", LOCK_CNT_NIL(spin_locked)); -+ /* check lock ordering */ -+ assert_spin_not_locked(&d_lock); -+ -+ spin_lock(&reiser4_inode_data(inode)->guard); -+ -+ LOCK_CNT_INC(spin_locked_inode); -+ LOCK_CNT_INC(spin_locked); -+ -+ reiser4_inode_invariant(inode); -+} -+ -+/** -+ * spin_unlock_inode - unlock reiser4_inode' embedded spinlock -+ * @inode: inode to unlock -+ * -+ * In debug mode it checks that spinlock is held and decrements -+ * reiser4_context's lock counters on which lock ordering checking is based. -+ */ -+static inline void spin_unlock_inode(struct inode *inode) -+{ -+ assert_spin_locked(&reiser4_inode_data(inode)->guard); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ reiser4_inode_invariant(inode); -+ -+ LOCK_CNT_DEC(spin_locked_inode); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&reiser4_inode_data(inode)->guard); -+} -+ -+extern znode *inode_get_vroot(struct inode *inode); -+extern void inode_set_vroot(struct inode *inode, znode * vroot); -+ -+extern int reiser4_max_filename_len(const struct inode *inode); -+extern int max_hash_collisions(const struct inode *dir); -+extern void reiser4_unlock_inode(struct inode *inode); -+extern int is_reiser4_inode(const struct inode *inode); -+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *); -+extern struct inode *reiser4_iget(struct super_block *super, -+ const reiser4_key * key, int silent); -+extern void reiser4_iget_complete(struct inode *inode); -+extern void reiser4_inode_set_flag(struct inode *inode, -+ reiser4_file_plugin_flags f); -+extern void reiser4_inode_clr_flag(struct inode *inode, -+ reiser4_file_plugin_flags f); -+extern int reiser4_inode_get_flag(const struct inode *inode, -+ reiser4_file_plugin_flags f); -+ -+/* has inode been initialized? 
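get_inode_oid()/set_inode_oid() above handle the 32-bit ino_t case by splitting the 64-bit oid: the low half goes to i_ino, the high half to oid_hi in the fs-private part, with OID_HI_SHIFT = sizeof(ino_t) * 8. The round trip, reproduced standalone:

    #include <inttypes.h>
    #include <stdio.h>

    #define OID_HI_SHIFT 32   /* sizeof(ino_t) * 8 for a 32-bit ino_t */

    struct split {
            uint32_t i_ino;    /* low half, what the VFS sees */
            uint32_t oid_hi;   /* high half, kept beside the VFS inode */
    };

    static void set_oid(struct split *s, uint64_t oid)
    {
            s->i_ino = (uint32_t)oid;
            s->oid_hi = (uint32_t)(oid >> OID_HI_SHIFT);
    }

    static uint64_t get_oid(const struct split *s)
    {
            return ((uint64_t)s->oid_hi << OID_HI_SHIFT) | s->i_ino;
    }

    int main(void)
    {
            struct split s;

            set_oid(&s, UINT64_C(0x0123456789abcdef));
            printf("%" PRIx64 "\n", get_oid(&s));   /* 123456789abcdef */
            return 0;
    }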
*/ -+static inline int -+is_inode_loaded(const struct inode *inode/* inode queried */) -+{ -+ assert("nikita-1120", inode != NULL); -+ return reiser4_inode_get_flag(inode, REISER4_LOADED); -+} -+ -+extern file_plugin *inode_file_plugin(const struct inode *inode); -+extern dir_plugin *inode_dir_plugin(const struct inode *inode); -+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode); -+extern hash_plugin *inode_hash_plugin(const struct inode *inode); -+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode); -+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode); -+extern digest_plugin *inode_digest_plugin(const struct inode *inode); -+extern compression_plugin *inode_compression_plugin(const struct inode *inode); -+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode -+ *inode); -+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode); -+extern file_plugin *inode_create_plugin(const struct inode *inode); -+extern item_plugin *inode_sd_plugin(const struct inode *inode); -+extern item_plugin *inode_dir_item_plugin(const struct inode *inode); -+extern file_plugin *child_create_plugin(const struct inode *inode); -+ -+extern void reiser4_make_bad_inode(struct inode *inode); -+ -+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext); -+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext); -+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new); -+extern void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new); -+ -+#define INODE_SET_SIZE(i, value) \ -+({ \ -+ struct inode *__i; \ -+ typeof(value) __v; \ -+ \ -+ __i = (i); \ -+ __v = (value); \ -+ inode_check_scale(__i, __i->i_size, __v); \ -+ i_size_write(__i, __v); \ -+}) -+ -+/* -+ * update field @field in inode @i to contain value @value. -+ */ -+#define INODE_SET_FIELD(i, field, value) \ -+({ \ -+ struct inode *__i; \ -+ typeof(value) __v; \ -+ \ -+ __i = (i); \ -+ __v = (value); \ -+ inode_check_scale(__i, __i->field, __v); \ -+ __i->field = __v; \ -+}) -+ -+#define INODE_INC_FIELD(i, field) \ -+({ \ -+ struct inode *__i; \ -+ \ -+ __i = (i); \ -+ inode_check_scale(__i, __i->field, __i->field + 1); \ -+ ++ __i->field; \ -+}) -+ -+#define INODE_DEC_FIELD(i, field) \ -+({ \ -+ struct inode *__i; \ -+ \ -+ __i = (i); \ -+ inode_check_scale(__i, __i->field, __i->field - 1); \ -+ -- __i->field; \ -+}) -+ -+/* See comment before reiser4_readdir_common() for description. */ -+static inline struct list_head *get_readdir_list(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->lists.readdir_list; -+} -+ -+extern void init_inode_ordering(struct inode *inode, -+ reiser4_object_create_data * crd, int create); -+ -+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->jnodes_tree; -+} -+ -+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode -+ *r4_inode) -+{ -+ return &r4_inode->jnodes_tree; -+} -+ -+#if REISER4_DEBUG -+extern void print_inode(const char *prefix, const struct inode *i); -+#endif -+ -+int is_dir_empty(const struct inode *); -+ -+/* __REISER4_INODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/ioctl.h linux-2.6.30/fs/reiser4/ioctl.h ---- linux-2.6.30.orig/fs/reiser4/ioctl.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/ioctl.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,41 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#if !defined(__REISER4_IOCTL_H__) -+#define __REISER4_IOCTL_H__ -+ -+#include <linux/fs.h> -+ -+/* -+ * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into -+ * extents and fix in this state. This is used by applications that rely on -+ * -+ * . files being block aligned, and -+ * -+ * . files never migrating on disk -+ * -+ * for example, boot loaders (LILO) need this. -+ * -+ * This ioctl should be used as -+ * -+ * result = ioctl(fd, REISER4_IOC_UNPACK); -+ * -+ * File behind fd descriptor will be converted to the extents (if necessary), -+ * and its stat-data will be updated so that it will never be converted back -+ * into tails again. -+ */ -+#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long) -+ -+/* __REISER4_IOCTL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/jnode.c linux-2.6.30/fs/reiser4/jnode.c ---- linux-2.6.30.orig/fs/reiser4/jnode.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/jnode.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1923 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Jnode manipulation functions. */ -+/* Jnode is entity used to track blocks with data and meta-data in reiser4. -+ -+ In particular, jnodes are used to track transactional information -+ associated with each block. Each znode contains jnode as ->zjnode field. -+ -+ Jnode stands for either Josh or Journal node. -+*/ -+ -+/* -+ * Taxonomy. -+ * -+ * Jnode represents block containing data or meta-data. There are jnodes -+ * for: -+ * -+ * unformatted blocks (jnodes proper). There are plans, however to -+ * have a handle per extent unit rather than per each unformatted -+ * block, because there are so many of them. -+ * -+ * For bitmaps. Each bitmap is actually represented by two jnodes--one -+ * for working and another for "commit" data, together forming bnode. -+ * -+ * For io-heads. These are used by log writer. -+ * -+ * For formatted nodes (znode). See comment at the top of znode.c for -+ * details specific to the formatted nodes (znodes). -+ * -+ * Node data. -+ * -+ * Jnode provides access to the data of node it represents. Data are -+ * stored in a page. Page is kept in a page cache. This means, that jnodes -+ * are highly interconnected with page cache and VM internals. -+ * -+ * jnode has a pointer to page (->pg) containing its data. Pointer to data -+ * themselves is cached in ->data field to avoid frequent calls to -+ * page_address(). -+ * -+ * jnode and page are attached to each other by jnode_attach_page(). This -+ * function places pointer to jnode in set_page_private(), sets PG_private -+ * flag and increments page counter. -+ * -+ * Opposite operation is performed by page_clear_jnode(). -+ * -+ * jnode->pg is protected by jnode spin lock, and page->private is -+ * protected by page lock. 
See comment at the top of page_cache.c for -+ * more. -+ * -+ * page can be detached from jnode for two reasons: -+ * -+ * . jnode is removed from a tree (file is truncated, of formatted -+ * node is removed by balancing). -+ * -+ * . during memory pressure, VM calls ->releasepage() method -+ * (reiser4_releasepage()) to evict page from memory. -+ * -+ * (there, of course, is also umount, but this is special case we are not -+ * concerned with here). -+ * -+ * To protect jnode page from eviction, one calls jload() function that -+ * "pins" page in memory (loading it if necessary), increments -+ * jnode->d_count, and kmap()s page. Page is unpinned through call to -+ * jrelse(). -+ * -+ * Jnode life cycle. -+ * -+ * jnode is created, placed in hash table, and, optionally, in per-inode -+ * radix tree. Page can be attached to jnode, pinned, released, etc. -+ * -+ * When jnode is captured into atom its reference counter is -+ * increased. While being part of an atom, jnode can be "early -+ * flushed". This means that as part of flush procedure, jnode is placed -+ * into "relocate set", and its page is submitted to the disk. After io -+ * completes, page can be detached, then loaded again, re-dirtied, etc. -+ * -+ * Thread acquired reference to jnode by calling jref() and releases it by -+ * jput(). When last reference is removed, jnode is still retained in -+ * memory (cached) if it has page attached, _unless_ it is scheduled for -+ * destruction (has JNODE_HEARD_BANSHEE bit set). -+ * -+ * Tree read-write lock was used as "existential" lock for jnodes. That is, -+ * jnode->x_count could be changed from 0 to 1 only under tree write lock, -+ * that is, tree lock protected unreferenced jnodes stored in the hash -+ * table, from recycling. -+ * -+ * This resulted in high contention on tree lock, because jref()/jput() is -+ * frequent operation. To ameliorate this problem, RCU is used: when jput() -+ * is just about to release last reference on jnode it sets JNODE_RIP bit -+ * on it, and then proceed with jnode destruction (removing jnode from hash -+ * table, cbk_cache, detaching page, etc.). All places that change jnode -+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and -+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by -+ * jnode_rip_check() function), and pretend that nothing was found in hash -+ * table if bit is set. -+ * -+ * jput defers actual return of jnode into slab cache to some later time -+ * (by call_rcu()), this guarantees that other threads can safely continue -+ * working with JNODE_RIP-ped jnode. 
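
The JNODE_RIP handshake described above can be modelled in portable C11: the lookup side takes a reference and then backs off if the death bit is set, while the release side uses an atomic test-and-set so that exactly one thread wins the right to destroy the node. A simplified single-file sketch (toy names, no real RCU involved):

    #include <stdatomic.h>
    #include <stdio.h>

    #define RIP_BIT 1u

    struct tnode {
        atomic_uint state;                     /* holds the RIP bit */
        atomic_int x_count;                    /* reference counter */
    };

    /* lookup side: take a reference, then back off if the node is dying */
    static struct tnode *ref_unless_rip(struct tnode *n)
    {
        atomic_fetch_add(&n->x_count, 1);
        if (atomic_load(&n->state) & RIP_BIT) {
            atomic_fetch_sub(&n->x_count, 1);
            return NULL;                       /* pretend lookup found nothing */
        }
        return n;
    }

    /* release side: the atomic test-and-set lets exactly one caller win */
    static int put_final(struct tnode *n)
    {
        return !(atomic_fetch_or(&n->state, RIP_BIT) & RIP_BIT);
    }

    int main(void)
    {
        struct tnode n = { 0, 0 };
        printf("before RIP: %p\n", (void *)ref_unless_rip(&n));
        printf("won destruction: %d\n", put_final(&n));
        printf("after RIP: %p\n", (void *)ref_unless_rip(&n));
        return 0;
    }

Once put_final() has set the bit, every later lookup behaves as if the node were already gone, so destruction can proceed without the lookup path resurrecting it.
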
-+ * -+ */ -+ -+#include "reiser4.h" -+#include "debug.h" -+#include "dformat.h" -+#include "jnode.h" -+#include "plugin/plugin_header.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+/*#include "jnode.h"*/ -+#include "znode.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "inode.h" -+#include "page_cache.h" -+ -+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */ -+#include <linux/types.h> -+#include <linux/slab.h> -+#include <linux/pagemap.h> -+#include <linux/swap.h> -+#include <linux/fs.h> /* for struct address_space */ -+#include <linux/writeback.h> /* for inode_lock */ -+ -+static struct kmem_cache *_jnode_slab = NULL; -+ -+static void jnode_set_type(jnode * node, jnode_type type); -+static int jdelete(jnode * node); -+static int jnode_try_drop(jnode * node); -+ -+#if REISER4_DEBUG -+static int jnode_invariant(jnode * node, int tlocked, int jlocked); -+#endif -+ -+/* true if valid page is attached to jnode */ -+static inline int jnode_is_parsed(jnode * node) -+{ -+ return JF_ISSET(node, JNODE_PARSED); -+} -+ -+/* hash table support */ -+ -+/* compare two jnode keys for equality. Used by hash-table macros */ -+static inline int jnode_key_eq(const struct jnode_key *k1, -+ const struct jnode_key *k2) -+{ -+ assert("nikita-2350", k1 != NULL); -+ assert("nikita-2351", k2 != NULL); -+ -+ return (k1->index == k2->index && k1->objectid == k2->objectid); -+} -+ -+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */ -+static inline __u32 jnode_key_hashfn(j_hash_table * table, -+ const struct jnode_key *key) -+{ -+ assert("nikita-2352", key != NULL); -+ assert("nikita-3346", IS_POW(table->_buckets)); -+ -+ /* yes, this is a remarkably simple (if not stupid) hash function. */ -+ return (key->objectid + key->index) & (table->_buckets - 1); -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) reiser4_vmalloc(size) -+#define KFREE(ptr, size) vfree(ptr) -+TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j, -+ jnode_key_hashfn, jnode_key_eq); -+#undef KFREE -+#undef KMALLOC -+ -+/* call this to initialise jnode hash table */ -+int jnodes_tree_init(reiser4_tree * tree/* tree to initialise jnodes for */) -+{ -+ assert("nikita-2359", tree != NULL); -+ return j_hash_init(&tree->jhash_table, 16384); -+} -+ -+/* call this to destroy jnode hash table. This is called during umount. */ -+int jnodes_tree_done(reiser4_tree * tree/* tree to destroy jnodes for */) -+{ -+ j_hash_table *jtable; -+ jnode *node; -+ jnode *next; -+ -+ assert("nikita-2360", tree != NULL); -+ -+ /* -+ * Scan hash table and free all jnodes. -+ */ -+ jtable = &tree->jhash_table; -+ if (jtable->_table) { -+ for_all_in_htable(jtable, j, node, next) { -+ assert("nikita-2361", !atomic_read(&node->x_count)); -+ jdrop(node); -+ } -+ -+ j_hash_done(&tree->jhash_table); -+ } -+ return 0; -+} -+ -+/** -+ * init_jnodes - create jnode cache -+ * -+ * Initializes slab cache jnodes. It is part of reiser4 module initialization. -+ */ -+int init_jnodes(void) -+{ -+ assert("umka-168", _jnode_slab == NULL); -+ -+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL); -+ if (_jnode_slab == NULL) -+ return RETERR(-ENOMEM); -+ -+ return 0; -+} -+ -+/** -+ * done_jnodes - delete jnode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_jnodes(void) -+{ -+ destroy_reiser4_cache(&_jnode_slab); -+} -+ -+/* Initialize a jnode.
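
For illustration, the bucket computation used by jnode_key_hashfn() above, which relies on the bucket count being a power of two so that the mask acts as a modulo (standalone sketch, using the 16384 buckets chosen by jnodes_tree_init()):

    #include <stdint.h>
    #include <stdio.h>

    /* bucket index as in jnode_key_hashfn(): the AND with (buckets - 1)
     * only behaves like a modulo because buckets is a power of two */
    static uint32_t bucket(uint64_t objectid, unsigned long index,
                           uint32_t buckets)
    {
        return (uint32_t)((objectid + index) & (buckets - 1));
    }

    int main(void)
    {
        printf("%u\n", bucket(42, 7, 16384));     /* 49 */
        printf("%u\n", bucket(42, 16385, 16384)); /* wraps around: 43 */
        return 0;
    }
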
*/ -+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type) -+{ -+ assert("umka-175", node != NULL); -+ -+ memset(node, 0, sizeof(jnode)); -+ ON_DEBUG(node->magic = JMAGIC); -+ jnode_set_type(node, type); -+ atomic_set(&node->d_count, 0); -+ atomic_set(&node->x_count, 0); -+ spin_lock_init(&node->guard); -+ spin_lock_init(&node->load); -+ node->atom = NULL; -+ node->tree = tree; -+ INIT_LIST_HEAD(&node->capture_link); -+ -+ ASSIGN_NODE_LIST(node, NOT_CAPTURED); -+ -+ INIT_RCU_HEAD(&node->rcu); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(tree->super); -+ spin_lock_irq(&sbinfo->all_guard); -+ list_add(&node->jnodes, &sbinfo->all_jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+ } -+#endif -+} -+ -+#if REISER4_DEBUG -+/* -+ * Remove jnode from ->all_jnodes list. -+ */ -+static void jnode_done(jnode * node, reiser4_tree * tree) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(tree->super); -+ -+ spin_lock_irq(&sbinfo->all_guard); -+ assert("nikita-2422", !list_empty(&node->jnodes)); -+ list_del_init(&node->jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+} -+#endif -+ -+/* return already existing jnode of page */ -+jnode *jnode_by_page(struct page *pg) -+{ -+ assert("nikita-2066", pg != NULL); -+ assert("nikita-2400", PageLocked(pg)); -+ assert("nikita-2068", PagePrivate(pg)); -+ assert("nikita-2067", jprivate(pg) != NULL); -+ return jprivate(pg); -+} -+ -+/* exported functions to allocate/free jnode objects outside this file */ -+jnode *jalloc(void) -+{ -+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get()); -+ return jal; -+} -+ -+/* return jnode back to the slab allocator */ -+inline void jfree(jnode * node) -+{ -+ assert("zam-449", node != NULL); -+ -+ assert("nikita-2663", (list_empty_careful(&node->capture_link) && -+ NODE_LIST(node) == NOT_CAPTURED)); -+ assert("nikita-3222", list_empty(&node->jnodes)); -+ assert("nikita-3221", jnode_page(node) == NULL); -+ -+ /* not yet phash_jnode_destroy(node); */ -+ -+ kmem_cache_free(_jnode_slab, node); -+} -+ -+/* -+ * This function is supplied as RCU callback. It actually frees jnode when -+ * last reference to it is gone. -+ */ -+static void jnode_free_actor(struct rcu_head *head) -+{ -+ jnode *node; -+ jnode_type jtype; -+ -+ node = container_of(head, jnode, rcu); -+ jtype = jnode_get_type(node); -+ -+ ON_DEBUG(jnode_done(node, jnode_get_tree(node))); -+ -+ switch (jtype) { -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ case JNODE_UNFORMATTED_BLOCK: -+ jfree(node); -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ zfree(JZNODE(node)); -+ break; -+ case JNODE_INODE: -+ default: -+ wrong_return_value("nikita-3197", "Wrong jnode type"); -+ } -+} -+ -+/* -+ * Free a jnode. Post a callback to be executed later through RCU when all -+ * references to @node are released. 
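
The effect of deferring jnode_free_actor() through call_rcu() can be illustrated with a toy queue: destruction is merely queued, and only runs once all pre-existing readers are known to be done. In the sketch below that moment is compressed into an explicit toy_grace_period() call, whereas the kernel infers it from RCU read-side critical sections; all names here are hypothetical:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct toy_rcu_head {
        struct toy_rcu_head *next;
        void (*func)(struct toy_rcu_head *);
    };

    static struct toy_rcu_head *pending;

    /* queue destruction instead of performing it immediately */
    static void toy_call_rcu(struct toy_rcu_head *h,
                             void (*func)(struct toy_rcu_head *))
    {
        h->func = func;
        h->next = pending;
        pending = h;
    }

    /* stand-in for "all pre-existing readers have finished" */
    static void toy_grace_period(void)
    {
        while (pending) {
            struct toy_rcu_head *h = pending;
            pending = h->next;
            h->func(h);
        }
    }

    struct obj {
        int payload;
        struct toy_rcu_head rcu;
    };

    static void obj_free_actor(struct toy_rcu_head *h)
    {
        /* open-coded container_of(): recover the object from its rcu head */
        struct obj *o = (struct obj *)((char *)h - offsetof(struct obj, rcu));
        printf("freeing payload %d\n", o->payload);
        free(o);
    }

    int main(void)
    {
        struct obj *o = malloc(sizeof(*o));
        o->payload = 7;
        toy_call_rcu(&o->rcu, obj_free_actor);  /* o is still safe to read */
        toy_grace_period();                     /* now it is gone */
        return 0;
    }
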
-+ */ -+static inline void jnode_free(jnode * node, jnode_type jtype) -+{ -+ if (jtype != JNODE_INODE) { -+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */ -+ call_rcu(&node->rcu, jnode_free_actor); -+ } else -+ jnode_list_remove(node); -+} -+ -+/* allocate new unformatted jnode */ -+static jnode *jnew_unformatted(void) -+{ -+ jnode *jal; -+ -+ jal = jalloc(); -+ if (jal == NULL) -+ return NULL; -+ -+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK); -+ jal->key.j.mapping = NULL; -+ jal->key.j.index = (unsigned long)-1; -+ jal->key.j.objectid = 0; -+ return jal; -+} -+ -+/* look for jnode with given mapping and offset within hash table */ -+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index) -+{ -+ struct jnode_key jkey; -+ jnode *node; -+ -+ assert("nikita-2353", tree != NULL); -+ -+ jkey.objectid = objectid; -+ jkey.index = index; -+ -+ /* -+ * hash table is _not_ protected by any lock during lookups. All we -+ * have to do is to disable preemption to keep RCU happy. -+ */ -+ -+ rcu_read_lock(); -+ node = j_hash_find(&tree->jhash_table, &jkey); -+ if (node != NULL) { -+ /* protect @node from recycling */ -+ jref(node); -+ assert("nikita-2955", jnode_invariant(node, 0, 0)); -+ node = jnode_rip_check(tree, node); -+ } -+ rcu_read_unlock(); -+ return node; -+} -+ -+/* per inode radix tree of jnodes is protected by tree's read write spin lock */ -+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index) -+{ -+ assert("vs-1694", mapping->host != NULL); -+ -+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index); -+} -+ -+jnode *jfind(struct address_space *mapping, unsigned long index) -+{ -+ reiser4_tree *tree; -+ jnode *node; -+ -+ assert("vs-1694", mapping->host != NULL); -+ tree = reiser4_tree_by_inode(mapping->host); -+ -+ read_lock_tree(tree); -+ node = jfind_nolock(mapping, index); -+ if (node != NULL) -+ jref(node); -+ read_unlock_tree(tree); -+ return node; -+} -+ -+static void inode_attach_jnode(jnode * node) -+{ -+ struct inode *inode; -+ reiser4_inode *info; -+ struct radix_tree_root *rtree; -+ -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ assert("zam-1043", node->key.j.mapping != NULL); -+ inode = node->key.j.mapping->host; -+ info = reiser4_inode_data(inode); -+ rtree = jnode_tree_by_reiser4_inode(info); -+ if (rtree->rnode == NULL) { -+ /* prevent inode from being pruned when it has jnodes attached -+ to it */ -+ spin_lock_irq(&inode->i_data.tree_lock); -+ inode->i_data.nrpages++; -+ spin_unlock_irq(&inode->i_data.tree_lock); -+ } -+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0)); -+ check_me("zam-1045", -+ !radix_tree_insert(rtree, node->key.j.index, node)); -+ ON_DEBUG(info->nr_jnodes++); -+} -+ -+static void inode_detach_jnode(jnode * node) -+{ -+ struct inode *inode; -+ reiser4_inode *info; -+ struct radix_tree_root *rtree; -+ -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ assert("zam-1044", node->key.j.mapping != NULL); -+ inode = node->key.j.mapping->host; -+ info = reiser4_inode_data(inode); -+ rtree = jnode_tree_by_reiser4_inode(info); -+ -+ assert("zam-1051", info->nr_jnodes != 0); -+ assert("zam-1052", rtree->rnode != NULL); -+ ON_DEBUG(info->nr_jnodes--); -+ -+ /* delete jnode from inode's radix tree of jnodes */ -+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index)); -+ if (rtree->rnode == NULL) { -+ /* inode can be pruned now */ -+ spin_lock_irq(&inode->i_data.tree_lock); -+ inode->i_data.nrpages--; -+ 
spin_unlock_irq(&inode->i_data.tree_lock); -+ } -+} -+ -+/* put jnode into hash table (where they can be found by flush who does not know -+ mapping) and to inode's tree of jnodes (where they can be found (hopefully -+ faster) in places where mapping is known). Currently it is used by -+ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is -+ created */ -+static void -+hash_unformatted_jnode(jnode * node, struct address_space *mapping, -+ unsigned long index) -+{ -+ j_hash_table *jtable; -+ -+ assert("vs-1446", jnode_is_unformatted(node)); -+ assert("vs-1442", node->key.j.mapping == 0); -+ assert("vs-1443", node->key.j.objectid == 0); -+ assert("vs-1444", node->key.j.index == (unsigned long)-1); -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ -+ node->key.j.mapping = mapping; -+ node->key.j.objectid = get_inode_oid(mapping->host); -+ node->key.j.index = index; -+ -+ jtable = &jnode_get_tree(node)->jhash_table; -+ -+ /* race with some other thread inserting jnode into the hash table is -+ * impossible, because we keep the page lock. */ -+ /* -+ * following assertion no longer holds because of RCU: it is possible -+ * jnode is in the hash table, but with JNODE_RIP bit set. -+ */ -+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */ -+ j_hash_insert_rcu(jtable, node); -+ inode_attach_jnode(node); -+} -+ -+static void unhash_unformatted_node_nolock(jnode * node) -+{ -+ assert("vs-1683", node->key.j.mapping != NULL); -+ assert("vs-1684", -+ node->key.j.objectid == -+ get_inode_oid(node->key.j.mapping->host)); -+ -+ /* remove jnode from hash-table */ -+ j_hash_remove_rcu(&node->tree->jhash_table, node); -+ inode_detach_jnode(node); -+ node->key.j.mapping = NULL; -+ node->key.j.index = (unsigned long)-1; -+ node->key.j.objectid = 0; -+ -+} -+ -+/* remove jnode from hash table and from inode's tree of jnodes. This is used in -+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes -> -+ reiser4_uncapture_jnode */ -+void unhash_unformatted_jnode(jnode * node) -+{ -+ assert("vs-1445", jnode_is_unformatted(node)); -+ -+ write_lock_tree(node->tree); -+ unhash_unformatted_node_nolock(node); -+ write_unlock_tree(node->tree); -+} -+ -+/* -+ * search hash table for a jnode with given oid and index. If not found, -+ * allocate new jnode, insert it, and also insert into radix tree for the -+ * given inode/mapping. -+ */ -+static jnode *find_get_jnode(reiser4_tree * tree, -+ struct address_space *mapping, -+ oid_t oid, unsigned long index) -+{ -+ jnode *result; -+ jnode *shadow; -+ int preload; -+ -+ result = jnew_unformatted(); -+ -+ if (unlikely(result == NULL)) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get()); -+ if (preload != 0) -+ return ERR_PTR(preload); -+ -+ write_lock_tree(tree); -+ shadow = jfind_nolock(mapping, index); -+ if (likely(shadow == NULL)) { -+ /* add new jnode to hash table and inode's radix tree of -+ * jnodes */ -+ jref(result); -+ hash_unformatted_jnode(result, mapping, index); -+ } else { -+ /* jnode is found in inode's radix tree of jnodes */ -+ jref(shadow); -+ jnode_free(result, JNODE_UNFORMATTED_BLOCK); -+ assert("vs-1498", shadow->key.j.mapping == mapping); -+ result = shadow; -+ } -+ write_unlock_tree(tree); -+ -+ assert("nikita-2955", -+ ergo(result != NULL, jnode_invariant(result, 0, 0))); -+ radix_tree_preload_end(); -+ return result; -+} -+ -+/* jget() (a la zget() but for unformatted nodes). 
Returns (and possibly -+ creates) jnode corresponding to page @pg. jnode is attached to page and -+ inserted into jnode hash-table. */ -+static jnode *do_jget(reiser4_tree * tree, struct page *pg) -+{ -+ /* -+ * There are two ways to create jnode: starting with pre-existing page -+ * and without page. -+ * -+ * When page already exists, jnode is created -+ * (jnode_of_page()->do_jget()) under page lock. This is done in -+ * ->writepage(), or when capturing anonymous page dirtied through -+ * mmap. -+ * -+ * Jnode without page is created by index_extent_jnode(). -+ * -+ */ -+ -+ jnode *result; -+ oid_t oid = get_inode_oid(pg->mapping->host); -+ -+ assert("umka-176", pg != NULL); -+ assert("nikita-2394", PageLocked(pg)); -+ -+ result = jprivate(pg); -+ if (likely(result != NULL)) -+ return jref(result); -+ -+ tree = reiser4_tree_by_page(pg); -+ -+ /* check hash-table first */ -+ result = jfind(pg->mapping, pg->index); -+ if (unlikely(result != NULL)) { -+ spin_lock_jnode(result); -+ jnode_attach_page(result, pg); -+ spin_unlock_jnode(result); -+ result->key.j.mapping = pg->mapping; -+ return result; -+ } -+ -+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */ -+ reiser4_ctx_gfp_mask_force(GFP_NOFS); -+ result = find_get_jnode(tree, pg->mapping, oid, pg->index); -+ if (unlikely(IS_ERR(result))) -+ return result; -+ /* attach jnode to page */ -+ spin_lock_jnode(result); -+ jnode_attach_page(result, pg); -+ spin_unlock_jnode(result); -+ return result; -+} -+ -+/* -+ * return jnode for @pg, creating it if necessary. -+ */ -+jnode *jnode_of_page(struct page *pg) -+{ -+ jnode *result; -+ -+ assert("umka-176", pg != NULL); -+ assert("nikita-2394", PageLocked(pg)); -+ -+ result = do_jget(reiser4_tree_by_page(pg), pg); -+ -+ if (REISER4_DEBUG && !IS_ERR(result)) { -+ assert("nikita-3210", result == jprivate(pg)); -+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg); -+ if (jnode_is_unformatted(jprivate(pg))) { -+ assert("nikita-2364", -+ jprivate(pg)->key.j.index == pg->index); -+ assert("nikita-2367", -+ jprivate(pg)->key.j.mapping == pg->mapping); -+ assert("nikita-2365", -+ jprivate(pg)->key.j.objectid == -+ get_inode_oid(pg->mapping->host)); -+ assert("vs-1200", -+ jprivate(pg)->key.j.objectid == -+ pg->mapping->host->i_ino); -+ assert("nikita-2356", -+ jnode_is_unformatted(jnode_by_page(pg))); -+ } -+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0)); -+ } -+ return result; -+} -+ -+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the -+ * page.*/ -+void jnode_attach_page(jnode * node, struct page *pg) -+{ -+ assert("nikita-2060", node != NULL); -+ assert("nikita-2061", pg != NULL); -+ -+ assert("nikita-2050", jprivate(pg) == 0ul); -+ assert("nikita-2393", !PagePrivate(pg)); -+ assert("vs-1741", node->pg == NULL); -+ -+ assert("nikita-2396", PageLocked(pg)); -+ assert_spin_locked(&(node->guard)); -+ -+ page_cache_get(pg); -+ set_page_private(pg, (unsigned long)node); -+ node->pg = pg; -+ SetPagePrivate(pg); -+} -+ -+/* Dual to jnode_attach_page: break a binding between page and jnode */ -+void page_clear_jnode(struct page *page, jnode * node) -+{ -+ assert("nikita-2424", page != NULL); -+ assert("nikita-2425", PageLocked(page)); -+ assert("nikita-2426", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert("nikita-2428", PagePrivate(page)); -+ -+ assert("nikita-3551", !PageWriteback(page)); -+ -+ JF_CLR(node, JNODE_PARSED); -+ set_page_private(page, 0ul); -+ ClearPagePrivate(page); -+ node->pg = NULL; -+ 
page_cache_release(page); -+} -+ -+#if 0 -+/* it is only used in one place to handle error */ -+void -+page_detach_jnode(struct page *page, struct address_space *mapping, -+ unsigned long index) -+{ -+ assert("nikita-2395", page != NULL); -+ -+ lock_page(page); -+ if ((page->mapping == mapping) && (page->index == index) -+ && PagePrivate(page)) { -+ jnode *node; -+ -+ node = jprivate(page); -+ spin_lock_jnode(node); -+ page_clear_jnode(page, node); -+ spin_unlock_jnode(node); -+ } -+ unlock_page(page); -+} -+#endif /* 0 */ -+ -+/* return @node page locked. -+ -+ Locking ordering requires that one first takes page lock and afterwards -+ spin lock on node attached to this page. Sometimes it is necessary to go in -+ the opposite direction. This is done through standard trylock-and-release -+ loop. -+*/ -+static struct page *jnode_lock_page(jnode * node) -+{ -+ struct page *page; -+ -+ assert("nikita-2052", node != NULL); -+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode)); -+ -+ while (1) { -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ if (page == NULL) -+ break; -+ -+ /* no need to page_cache_get( page ) here, because page cannot -+ be evicted from memory without detaching it from jnode and -+ this requires spin lock on jnode that we already hold. -+ */ -+ if (trylock_page(page)) { -+ /* We won a lock on jnode page, proceed. */ -+ break; -+ } -+ -+ /* Page is locked by someone else. */ -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ wait_on_page_locked(page); -+ /* it is possible that page was detached from jnode and -+ returned to the free pool, or re-assigned while we were -+ waiting on locked bit. This will be rechecked on the next -+ loop iteration. -+ */ -+ page_cache_release(page); -+ -+ /* try again */ -+ } -+ return page; -+} -+ -+/* -+ * is JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify -+ * validness of jnode content. -+ */ -+static inline int jparse(jnode * node) -+{ -+ int result; -+ -+ assert("nikita-2466", node != NULL); -+ -+ spin_lock_jnode(node); -+ if (likely(!jnode_is_parsed(node))) { -+ result = jnode_ops(node)->parse(node); -+ if (likely(result == 0)) -+ JF_SET(node, JNODE_PARSED); -+ } else -+ result = 0; -+ spin_unlock_jnode(node); -+ return result; -+} -+ -+/* Lock a page attached to jnode, create and attach page to jnode if it had no -+ * one. */ -+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags) -+{ -+ struct page *page; -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ -+ if (page == NULL) { -+ spin_unlock_jnode(node); -+ page = find_or_create_page(jnode_get_mapping(node), -+ jnode_get_index(node), gfp_flags); -+ if (page == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ } else { -+ if (trylock_page(page)) { -+ spin_unlock_jnode(node); -+ return page; -+ } -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ lock_page(page); -+ assert("nikita-3134", page->mapping == jnode_get_mapping(node)); -+ } -+ -+ spin_lock_jnode(node); -+ if (!jnode_page(node)) -+ jnode_attach_page(node, page); -+ spin_unlock_jnode(node); -+ -+ page_cache_release(page); -+ assert("zam-894", jnode_page(node) == page); -+ return page; -+} -+ -+/* Start read operation for jnode's page if page is not up-to-date. 
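
The trylock-and-release loop in jnode_lock_page() above is a generic idiom for acquiring two locks against their documented order. A portable pthread sketch of the same idiom (hypothetical lock names, not the kernel primitives):

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    static pthread_mutex_t jnode_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

    /* the ordering rule says page_lock comes first; since we start from
     * jnode_lock we only probe page_lock and back off on contention */
    static void take_both_backwards(void)
    {
        for (;;) {
            pthread_mutex_lock(&jnode_lock);
            if (pthread_mutex_trylock(&page_lock) == 0)
                return;                         /* got both, no deadlock */
            pthread_mutex_unlock(&jnode_lock);  /* back off and retry */
            sched_yield();
        }
    }

    int main(void)
    {
        take_both_backwards();
        printf("acquired both locks\n");
        pthread_mutex_unlock(&page_lock);
        pthread_mutex_unlock(&jnode_lock);
        return 0;
    }
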
*/ -+static int jnode_start_read(jnode * node, struct page *page) -+{ -+ assert("zam-893", PageLocked(page)); -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get()); -+} -+ -+#if REISER4_DEBUG -+static void check_jload(jnode * node, struct page *page) -+{ -+ if (jnode_is_znode(node)) { -+ node40_header *nh; -+ znode *z; -+ -+ z = JZNODE(node); -+ if (znode_is_any_locked(z)) { -+ nh = (node40_header *) kmap(page); -+ /* this only works for node40-only file systems. For -+ * debugging. */ -+ assert("nikita-3253", -+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items))); -+ kunmap(page); -+ } -+ assert("nikita-3565", znode_invariant(z)); -+ } -+} -+#else -+#define check_jload(node, page) noop -+#endif -+ -+/* prefetch jnode to speed up next call to jload. Call this when you are going -+ * to call jload() shortly. This will bring appropriate portion of jnode into -+ * CPU cache. */ -+void jload_prefetch(jnode * node) -+{ -+ prefetchw(&node->x_count); -+} -+ -+/* load jnode's data into memory */ -+int jload_gfp(jnode * node /* node to load */ , -+ gfp_t gfp_flags /* allocation flags */ , -+ int do_kmap/* true if page should be kmapped */) -+{ -+ struct page *page; -+ int result = 0; -+ int parsed; -+ -+ assert("nikita-3010", reiser4_schedulable()); -+ -+ prefetchw(&node->pg); -+ -+ /* taking d-reference implies taking x-reference. */ -+ jref(node); -+ -+ /* -+ * acquiring d-reference to @jnode and check for JNODE_PARSED bit -+ * should be atomic, otherwise there is a race against -+ * reiser4_releasepage(). -+ */ -+ spin_lock(&(node->load)); -+ add_d_ref(node); -+ parsed = jnode_is_parsed(node); -+ spin_unlock(&(node->load)); -+ -+ if (unlikely(!parsed)) { -+ page = jnode_get_page_locked(node, gfp_flags); -+ if (unlikely(IS_ERR(page))) { -+ result = PTR_ERR(page); -+ goto failed; -+ } -+ -+ result = jnode_start_read(node, page); -+ if (unlikely(result != 0)) -+ goto failed; -+ -+ wait_on_page_locked(page); -+ if (unlikely(!PageUptodate(page))) { -+ result = RETERR(-EIO); -+ goto failed; -+ } -+ -+ if (do_kmap) -+ node->data = kmap(page); -+ -+ result = jparse(node); -+ if (unlikely(result != 0)) { -+ if (do_kmap) -+ kunmap(page); -+ goto failed; -+ } -+ check_jload(node, page); -+ } else { -+ page = jnode_page(node); -+ check_jload(node, page); -+ if (do_kmap) -+ node->data = kmap(page); -+ } -+ -+ if (!is_writeout_mode()) -+ /* We do not mark pages active if jload is called as a part of -+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush() -+ * and write_logs() add no value to cached data, there is no -+ * sense to mark pages as active when they go to disk, it just -+ * confuses vm scanning routines because clean page could be -+ * moved out from inactive list as a result of this -+ * mark_page_accessed() call. */ -+ mark_page_accessed(page); -+ -+ return 0; -+ -+failed: -+ jrelse_tail(node); -+ return result; -+ -+} -+ -+/* start asynchronous reading for given jnode's page. */ -+int jstartio(jnode * node) -+{ -+ struct page *page; -+ -+ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(page)) -+ return PTR_ERR(page); -+ -+ return jnode_start_read(node, page); -+} -+ -+/* Initialize a node by calling appropriate plugin instead of reading -+ * node from disk as in jload(). 
*/ -+int jinit_new(jnode * node, gfp_t gfp_flags) -+{ -+ struct page *page; -+ int result; -+ -+ jref(node); -+ add_d_ref(node); -+ -+ page = jnode_get_page_locked(node, gfp_flags); -+ if (IS_ERR(page)) { -+ result = PTR_ERR(page); -+ goto failed; -+ } -+ -+ SetPageUptodate(page); -+ unlock_page(page); -+ -+ node->data = kmap(page); -+ -+ if (!jnode_is_parsed(node)) { -+ jnode_plugin *jplug = jnode_ops(node); -+ spin_lock_jnode(node); -+ result = jplug->init(node); -+ spin_unlock_jnode(node); -+ if (result) { -+ kunmap(page); -+ goto failed; -+ } -+ JF_SET(node, JNODE_PARSED); -+ } -+ -+ return 0; -+ -+failed: -+ jrelse(node); -+ return result; -+} -+ -+/* release a reference to jnode acquired by jload(), decrement ->d_count */ -+void jrelse_tail(jnode * node/* jnode to release references to */) -+{ -+ assert("nikita-489", atomic_read(&node->d_count) > 0); -+ atomic_dec(&node->d_count); -+ /* release reference acquired in jload_gfp() or jinit_new() */ -+ jput(node); -+ if (jnode_is_unformatted(node) || jnode_is_znode(node)) -+ LOCK_CNT_DEC(d_refs); -+} -+ -+/* drop reference to node data. When last reference is dropped, data are -+ unloaded. */ -+void jrelse(jnode * node/* jnode to release references to */) -+{ -+ struct page *page; -+ -+ assert("nikita-487", node != NULL); -+ assert_spin_not_locked(&(node->guard)); -+ -+ page = jnode_page(node); -+ if (likely(page != NULL)) { -+ /* -+ * it is safe not to lock jnode here, because at this point -+ * @node->d_count is greater than zero (if jrelse() is used -+ * correctly, that is). JNODE_PARSED may be not set yet, if, -+ * for example, we got here as a result of error handling path -+ * in jload(). Anyway, page cannot be detached by -+ * reiser4_releasepage(). truncate will invalidate page -+ * regardless, but this should not be a problem. -+ */ -+ kunmap(page); -+ } -+ jrelse_tail(node); -+} -+ -+/* called from jput() to wait for io completion */ -+static void jnode_finish_io(jnode * node) -+{ -+ struct page *page; -+ -+ assert("nikita-2922", node != NULL); -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ if (page != NULL) { -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ wait_on_page_writeback(page); -+ page_cache_release(page); -+ } else -+ spin_unlock_jnode(node); -+} -+ -+/* -+ * This is called by jput() when last reference to jnode is released. This is -+ * separate function, because we want fast path of jput() to be inline and, -+ * therefore, small. -+ */ -+void jput_final(jnode * node) -+{ -+ int r_i_p; -+ -+ /* A fast check for keeping node in cache. We always keep node in cache -+ * if its page is present and node was not marked for deletion */ -+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ rcu_read_unlock(); -+ return; -+ } -+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP); -+ /* -+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In -+ * this case it is safe to access node after unlock. -+ */ -+ rcu_read_unlock(); -+ if (r_i_p) { -+ jnode_finish_io(node); -+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ /* node is removed from the tree. 
*/ -+ jdelete(node); -+ else -+ jnode_try_drop(node); -+ } -+ /* if !r_i_p some other thread is already killing it */ -+} -+ -+int jwait_io(jnode * node, int rw) -+{ -+ struct page *page; -+ int result; -+ -+ assert("zam-447", node != NULL); -+ assert("zam-448", jnode_page(node) != NULL); -+ -+ page = jnode_page(node); -+ -+ result = 0; -+ if (rw == READ) { -+ wait_on_page_locked(page); -+ } else { -+ assert("nikita-2227", rw == WRITE); -+ wait_on_page_writeback(page); -+ } -+ if (PageError(page)) -+ result = RETERR(-EIO); -+ -+ return result; -+} -+ -+/* -+ * jnode types and plugins. -+ * -+ * jnode by itself is a "base type". There are several different jnode -+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code -+ * has to do different things based on jnode type. In the standard reiser4 way -+ * this is done by having a jnode plugin (see fs/reiser4/plugin.h:jnode_plugin). -+ * -+ * Functions below deal with jnode types and define methods of jnode plugin. -+ * -+ */ -+ -+/* set jnode type. This is done during jnode initialization. */ -+static void jnode_set_type(jnode * node, jnode_type type) -+{ -+ static unsigned long type_to_mask[] = { -+ [JNODE_UNFORMATTED_BLOCK] = 1, -+ [JNODE_FORMATTED_BLOCK] = 0, -+ [JNODE_BITMAP] = 2, -+ [JNODE_IO_HEAD] = 6, -+ [JNODE_INODE] = 4 -+ }; -+ -+ assert("zam-647", type < LAST_JNODE_TYPE); -+ assert("nikita-2815", !jnode_is_loaded(node)); -+ assert("nikita-3386", node->state == 0); -+ -+ node->state |= (type_to_mask[type] << JNODE_TYPE_1); -+} -+ -+/* ->init() method of jnode plugin for jnodes that don't require plugin -+ * specific initialization. */ -+static int init_noinit(jnode * node UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* ->parse() method of jnode plugin for jnodes that don't require plugin -+ * specific parsing.
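
jnode_set_type() above stores a small per-type bit pattern inside the shared ->state word. A standalone model of that packing (TYPE_SHIFT and TYPE_BITS are stand-ins for JNODE_TYPE_1 and friends, whose real positions are defined elsewhere in this patch):

    #include <assert.h>
    #include <stdio.h>

    #define TYPE_SHIFT 1   /* stand-in for JNODE_TYPE_1 */
    #define TYPE_BITS  3   /* enough for the masks 0..6 used above */

    static unsigned long set_type(unsigned long state, unsigned mask)
    {
        assert(state == 0);                    /* mirrors nikita-3386 */
        assert(mask < (1u << TYPE_BITS));
        return state | ((unsigned long)mask << TYPE_SHIFT);
    }

    static unsigned get_type_mask(unsigned long state)
    {
        return (state >> TYPE_SHIFT) & ((1u << TYPE_BITS) - 1);
    }

    int main(void)
    {
        unsigned long st = set_type(0, 6);     /* 6 = JNODE_IO_HEAD's mask */
        printf("type mask = %u\n", get_type_mask(st));
        return 0;
    }
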
*/ -+static int parse_noparse(jnode * node UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* ->mapping() method for unformatted jnode */ -+struct address_space *mapping_jnode(const jnode * node) -+{ -+ struct address_space *map; -+ -+ assert("nikita-2713", node != NULL); -+ -+ /* mapping is stored in jnode */ -+ -+ map = node->key.j.mapping; -+ assert("nikita-2714", map != NULL); -+ assert("nikita-2897", is_reiser4_inode(map->host)); -+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid); -+ return map; -+} -+ -+/* ->index() method for unformatted jnodes */ -+unsigned long index_jnode(const jnode * node) -+{ -+ /* index is stored in jnode */ -+ return node->key.j.index; -+} -+ -+/* ->remove() method for unformatted jnodes */ -+static inline void remove_jnode(jnode * node, reiser4_tree * tree) -+{ -+ /* remove jnode from hash table and radix tree */ -+ if (node->key.j.mapping) -+ unhash_unformatted_node_nolock(node); -+} -+ -+/* ->mapping() method for znodes */ -+static struct address_space *mapping_znode(const jnode * node) -+{ -+ /* all znodes belong to fake inode */ -+ return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping; -+} -+ -+/* ->index() method for znodes */ -+static unsigned long index_znode(const jnode * node) -+{ -+ unsigned long addr; -+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode)); -+ -+ /* index of znode is just its address (shifted) */ -+ addr = (unsigned long)node; -+ return (addr - PAGE_OFFSET) >> znode_shift_order; -+} -+ -+/* ->mapping() method for bitmap jnode */ -+static struct address_space *mapping_bitmap(const jnode * node) -+{ -+ /* all bitmap blocks belong to special bitmap inode */ -+ return get_super_private(jnode_get_tree(node)->super)->bitmap-> -+ i_mapping; -+} -+ -+/* ->index() method for jnodes that are indexed by address */ -+static unsigned long index_is_address(const jnode * node) -+{ -+ unsigned long ind; -+ -+ ind = (unsigned long)node; -+ return ind - PAGE_OFFSET; -+} -+ -+/* resolve race with jput */ -+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node) -+{ -+ /* -+ * This is used as part of RCU-based jnode handling. -+ * -+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work -+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is -+ * not protected during this, so concurrent thread may execute -+ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be -+ * freed in jput_final(). To avoid such races, jput_final() sets -+ * JNODE_RIP on jnode (under tree lock). All places that work with -+ * unreferenced jnodes call this function. It checks for JNODE_RIP bit -+ * (first without taking tree lock), and if this bit is set, released -+ * reference acquired by the current thread and returns NULL. -+ * -+ * As a result, if jnode is being concurrently freed, NULL is returned -+ * and caller should pretend that jnode wasn't found in the first -+ * place. -+ * -+ * Otherwise it's safe to release "rcu-read-lock" and continue with -+ * jnode. 
-+ */ -+ if (unlikely(JF_ISSET(node, JNODE_RIP))) { -+ read_lock_tree(tree); -+ if (JF_ISSET(node, JNODE_RIP)) { -+ dec_x_ref(node); -+ node = NULL; -+ } -+ read_unlock_tree(tree); -+ } -+ return node; -+} -+ -+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key) -+{ -+ struct inode *inode; -+ item_plugin *iplug; -+ loff_t off; -+ -+ assert("nikita-3092", node != NULL); -+ assert("nikita-3093", key != NULL); -+ assert("nikita-3094", jnode_is_unformatted(node)); -+ -+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT; -+ inode = mapping_jnode(node)->host; -+ -+ if (node->parent_item_id != 0) -+ iplug = item_plugin_by_id(node->parent_item_id); -+ else -+ iplug = NULL; -+ -+ if (iplug != NULL && iplug->f.key_by_offset) -+ iplug->f.key_by_offset(inode, off, key); -+ else { -+ file_plugin *fplug; -+ -+ fplug = inode_file_plugin(inode); -+ assert("zam-1007", fplug != NULL); -+ assert("zam-1008", fplug->key_by_inode != NULL); -+ -+ fplug->key_by_inode(inode, off, key); -+ } -+ -+ return key; -+} -+ -+/* ->parse() method for formatted nodes */ -+static int parse_znode(jnode * node) -+{ -+ return zparse(JZNODE(node)); -+} -+ -+/* ->delete() method for formatted nodes */ -+static void delete_znode(jnode * node, reiser4_tree * tree) -+{ -+ znode *z; -+ -+ assert_rw_write_locked(&(tree->tree_lock)); -+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ z = JZNODE(node); -+ assert("vs-899", z->c_count == 0); -+ -+ /* delete znode from sibling list. */ -+ sibling_list_remove(z); -+ -+ znode_remove(z, tree); -+} -+ -+/* ->remove() method for formatted nodes */ -+static int remove_znode(jnode * node, reiser4_tree * tree) -+{ -+ znode *z; -+ -+ assert_rw_write_locked(&(tree->tree_lock)); -+ z = JZNODE(node); -+ -+ if (z->c_count == 0) { -+ /* detach znode from sibling list. */ -+ sibling_list_drop(z); -+ /* this is called with tree spin-lock held, so call -+ znode_remove() directly (rather than znode_lock_remove()). */ -+ znode_remove(z, tree); -+ return 0; -+ } -+ return RETERR(-EBUSY); -+} -+ -+/* ->init() method for formatted nodes */ -+static int init_znode(jnode * node) -+{ -+ znode *z; -+ -+ z = JZNODE(node); -+ /* call node plugin to do actual initialization */ -+ return z->nplug->init(z); -+} -+ -+/* ->clone() method for formatted nodes */ -+static jnode *clone_formatted(jnode * node) -+{ -+ znode *clone; -+ -+ assert("vs-1430", jnode_is_znode(node)); -+ clone = zalloc(reiser4_ctx_gfp_mask_get()); -+ if (clone == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ zinit(clone, NULL, current_tree); -+ jnode_set_block(ZJNODE(clone), jnode_get_block(node)); -+ /* ZJNODE(clone)->key.z is not initialized */ -+ clone->level = JZNODE(node)->level; -+ -+ return ZJNODE(clone); -+} -+ -+/* jplug->clone for unformatted nodes */ -+static jnode *clone_unformatted(jnode * node) -+{ -+ jnode *clone; -+ -+ assert("vs-1431", jnode_is_unformatted(node)); -+ clone = jalloc(); -+ if (clone == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK); -+ jnode_set_block(clone, jnode_get_block(node)); -+ -+ return clone; -+ -+} -+ -+/* -+ * Setup jnode plugin methods for various jnode types. 
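
The jnode_plugins[] array defined just below is classic C object orientation: an ops table indexed by the node's type, so callers dispatch through function pointers instead of switch statements. A minimal standalone model of the same pattern (toy types and methods, not the reiser4 plugin machinery):

    #include <stdio.h>

    enum ntype { NT_UNFORMATTED, NT_FORMATTED, NT_LAST };

    struct ops {
        const char *label;
        unsigned long (*index)(unsigned long cookie);
    };

    static unsigned long index_by_key(unsigned long cookie)
    {
        return cookie;                         /* e.g. a file offset */
    }

    static unsigned long index_by_addr(unsigned long cookie)
    {
        return cookie >> 4;                    /* e.g. a shifted address */
    }

    /* behaviour is selected by indexing the table with the type */
    static const struct ops ops_table[NT_LAST] = {
        [NT_UNFORMATTED] = { "unformatted", index_by_key },
        [NT_FORMATTED]   = { "formatted",   index_by_addr },
    };

    int main(void)
    {
        enum ntype t = NT_FORMATTED;
        printf("%s -> %lu\n", ops_table[t].label, ops_table[t].index(0x100));
        return 0;
    }
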
-+ */ -+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = { -+ [JNODE_UNFORMATTED_BLOCK] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_UNFORMATTED_BLOCK, -+ .pops = NULL, -+ .label = "unformatted", -+ .desc = "unformatted node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_jnode, -+ .index = index_jnode, -+ .clone = clone_unformatted -+ }, -+ [JNODE_FORMATTED_BLOCK] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_FORMATTED_BLOCK, -+ .pops = NULL, -+ .label = "formatted", -+ .desc = "formatted tree node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_znode, -+ .parse = parse_znode, -+ .mapping = mapping_znode, -+ .index = index_znode, -+ .clone = clone_formatted -+ }, -+ [JNODE_BITMAP] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_BITMAP, -+ .pops = NULL, -+ .label = "bitmap", -+ .desc = "bitmap node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_bitmap, -+ .index = index_is_address, -+ .clone = NULL -+ }, -+ [JNODE_IO_HEAD] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_IO_HEAD, -+ .pops = NULL, -+ .label = "io head", -+ .desc = "io head", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_bitmap, -+ .index = index_is_address, -+ .clone = NULL -+ }, -+ [JNODE_INODE] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_INODE, -+ .pops = NULL, -+ .label = "inode", -+ .desc = "inode's builtin jnode", -+ .linkage = {NULL, NULL} -+ }, -+ .init = NULL, -+ .parse = NULL, -+ .mapping = NULL, -+ .index = NULL, -+ .clone = NULL -+ } -+}; -+ -+/* -+ * jnode destruction. -+ * -+ * Thread may use a jnode after it acquired a reference to it. References are -+ * counted in ->x_count field. Reference protects jnode from being -+ * recycled. This is different from protecting jnode data (that are stored in -+ * jnode page) from being evicted from memory. Data are protected by jload() -+ * and released by jrelse(). -+ * -+ * If thread already possesses a reference to the jnode it can acquire another -+ * one through jref(). Initial reference is obtained (usually) by locating -+ * jnode in some indexing structure that depends on jnode type: formatted -+ * nodes are kept in global hash table, where they are indexed by block -+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash -+ * table, which is indexed by oid and offset within file, and in per-inode -+ * radix tree. -+ * -+ * Reference to jnode is released by jput(). If last reference is released, -+ * jput_final() is called. This function determines whether jnode has to be -+ * deleted (this happens when corresponding node is removed from the file -+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it -+ * should be just "removed" (deleted from memory). -+ * -+ * Jnode destruction is signally delicate dance because of locking and RCU. -+ */ -+ -+/* -+ * Returns true if jnode cannot be removed right now. This check is called -+ * under tree lock. If it returns true, jnode is irrevocably committed to be -+ * deleted/removed. -+ */ -+static inline int jnode_is_busy(const jnode * node, jnode_type jtype) -+{ -+ /* if other thread managed to acquire a reference to this jnode, don't -+ * free it. 
*/ -+ if (atomic_read(&node->x_count) > 0) -+ return 1; -+ /* also, don't free znode that has children in memory */ -+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0) -+ return 1; -+ return 0; -+} -+ -+/* -+ * this is called as part of removing jnode. Based on jnode type, call -+ * corresponding function that removes jnode from indices and returns it back -+ * to the appropriate slab (through RCU). -+ */ -+static inline void -+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree) -+{ -+ switch (jtype) { -+ case JNODE_UNFORMATTED_BLOCK: -+ remove_jnode(node, tree); -+ break; -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ break; -+ case JNODE_INODE: -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ remove_znode(node, tree); -+ break; -+ default: -+ wrong_return_value("nikita-3196", "Wrong jnode type"); -+ } -+} -+ -+/* -+ * this is called as part of deleting jnode. Based on jnode type, call -+ * corresponding function that removes jnode from indices and returns it back -+ * to the appropriate slab (through RCU). -+ * -+ * This differs from jnode_remove() only for formatted nodes---for them -+ * sibling list handling is different for removal and deletion. -+ */ -+static inline void -+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG) -+{ -+ switch (jtype) { -+ case JNODE_UNFORMATTED_BLOCK: -+ remove_jnode(node, tree); -+ break; -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ delete_znode(node, tree); -+ break; -+ case JNODE_INODE: -+ default: -+ wrong_return_value("nikita-3195", "Wrong jnode type"); -+ } -+} -+ -+#if REISER4_DEBUG -+/* -+ * remove jnode from the debugging list of all jnodes hanging off super-block. -+ */ -+void jnode_list_remove(jnode * node) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(jnode_get_tree(node)->super); -+ -+ spin_lock_irq(&sbinfo->all_guard); -+ assert("nikita-2422", !list_empty(&node->jnodes)); -+ list_del_init(&node->jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+} -+#endif -+ -+/* -+ * this is called by jput_final() to remove jnode when last reference to it is -+ * released. -+ */ -+static int jnode_try_drop(jnode * node) -+{ -+ int result; -+ reiser4_tree *tree; -+ jnode_type jtype; -+ -+ assert("nikita-2491", node != NULL); -+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP)); -+ -+ tree = jnode_get_tree(node); -+ jtype = jnode_get_type(node); -+ -+ spin_lock_jnode(node); -+ write_lock_tree(tree); -+ /* -+ * if jnode has a page---leave it alone. Memory pressure will -+ * eventually kill page and jnode. -+ */ -+ if (jnode_page(node) != NULL) { -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ JF_CLR(node, JNODE_RIP); -+ return RETERR(-EBUSY); -+ } -+ -+ /* re-check ->x_count under tree lock. */ -+ result = jnode_is_busy(node, jtype); -+ if (result == 0) { -+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0); -+ -+ spin_unlock_jnode(node); -+ /* no page and no references---despatch him. */ -+ jnode_remove(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. 
*/ -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ JF_CLR(node, JNODE_RIP); -+ } -+ return result; -+} -+ -+/* jdelete() -- Delete jnode from the tree and file system */ -+static int jdelete(jnode * node/* jnode to finish with */) -+{ -+ struct page *page; -+ int result; -+ reiser4_tree *tree; -+ jnode_type jtype; -+ -+ assert("nikita-467", node != NULL); -+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP)); -+ -+ jtype = jnode_get_type(node); -+ -+ page = jnode_lock_page(node); -+ assert_spin_locked(&(node->guard)); -+ -+ tree = jnode_get_tree(node); -+ -+ write_lock_tree(tree); -+ /* re-check ->x_count under tree lock. */ -+ result = jnode_is_busy(node, jtype); -+ if (likely(!result)) { -+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("jmacd-511", atomic_read(&node->d_count) == 0); -+ -+ /* detach page */ -+ if (page != NULL) { -+ /* -+ * FIXME this is racy against jnode_extent_write(). -+ */ -+ page_clear_jnode(page, node); -+ } -+ spin_unlock_jnode(node); -+ /* goodbye */ -+ jnode_delete(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ /* @node is no longer valid pointer */ -+ if (page != NULL) -+ reiser4_drop_page(page); -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ JF_CLR(node, JNODE_RIP); -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ if (page != NULL) -+ unlock_page(page); -+ } -+ return result; -+} -+ -+/* drop jnode on the floor. -+ -+ Return value: -+ -+ -EBUSY: failed to drop jnode, because there are still references to it -+ -+ 0: successfully dropped jnode -+ -+*/ -+static int jdrop_in_tree(jnode * node, reiser4_tree * tree) -+{ -+ struct page *page; -+ jnode_type jtype; -+ int result; -+ -+ assert("zam-602", node != NULL); -+ assert_rw_not_read_locked(&(tree->tree_lock)); -+ assert_rw_not_write_locked(&(tree->tree_lock)); -+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ jtype = jnode_get_type(node); -+ -+ page = jnode_lock_page(node); -+ assert_spin_locked(&(node->guard)); -+ -+ write_lock_tree(tree); -+ -+ /* re-check ->x_count under tree lock. */ -+ result = jnode_is_busy(node, jtype); -+ if (!result) { -+ assert("nikita-2488", page == jnode_page(node)); -+ assert("nikita-2533", atomic_read(&node->d_count) == 0); -+ if (page != NULL) { -+ assert("nikita-2126", !PageDirty(page)); -+ assert("nikita-2127", PageUptodate(page)); -+ assert("nikita-2181", PageLocked(page)); -+ page_clear_jnode(page, node); -+ } -+ spin_unlock_jnode(node); -+ jnode_remove(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ if (page != NULL) -+ reiser4_drop_page(page); -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ JF_CLR(node, JNODE_RIP); -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ if (page != NULL) -+ unlock_page(page); -+ } -+ return result; -+} -+ -+/* This function frees jnode "if possible". In particular, [dcx]_count has to -+ be 0 (where applicable). */ -+void jdrop(jnode * node) -+{ -+ jdrop_in_tree(node, jnode_get_tree(node)); -+} -+ -+/* IO head jnode implementation; The io heads are simple j-nodes with limited -+ functionality (these j-nodes are not in any hash table) just for reading -+ from and writing to disk. 
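
An io head is used in a take-reference, pin-data, unpin, release sequence: reiser4_alloc_io_head() takes the existence reference up front, jload()/jrelse() pin and unpin the data, and reiser4_drop_io_head() pairs jput() with jdrop(). The pairing of the two counters involved, x_count for existence and d_count for loaded data, obeys the [jnode-refs] invariant x_count >= d_count, and can be modelled in a few lines (toy names, not the kernel API):

    #include <assert.h>
    #include <stdio.h>

    struct counters {
        int x_count;                           /* existence references */
        int d_count;                           /* loaded-data references */
    };

    static void toy_jload(struct counters *c)  /* pin data: takes both */
    {
        c->x_count++;
        c->d_count++;
    }

    static void toy_jrelse(struct counters *c) /* unpin: drops both */
    {
        c->d_count--;
        c->x_count--;
    }

    int main(void)
    {
        struct counters c = { 1, 0 };          /* 1: ref taken at allocation */
        toy_jload(&c);
        assert(c.x_count >= c.d_count);        /* the [jnode-refs] invariant */
        toy_jrelse(&c);
        printf("x=%d d=%d\n", c.x_count, c.d_count); /* back to x=1 d=0 */
        return 0;
    }
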
*/ -+ -+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) -+{ -+ jnode *jal = jalloc(); -+ -+ if (jal != NULL) { -+ jnode_init(jal, current_tree, JNODE_IO_HEAD); -+ jnode_set_block(jal, block); -+ } -+ -+ jref(jal); -+ -+ return jal; -+} -+ -+void reiser4_drop_io_head(jnode * node) -+{ -+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD); -+ -+ jput(node); -+ jdrop(node); -+} -+ -+/* protect keep jnode data from reiser4_releasepage() */ -+void pin_jnode_data(jnode * node) -+{ -+ assert("zam-671", jnode_page(node) != NULL); -+ page_cache_get(jnode_page(node)); -+} -+ -+/* make jnode data free-able again */ -+void unpin_jnode_data(jnode * node) -+{ -+ assert("zam-672", jnode_page(node) != NULL); -+ page_cache_release(jnode_page(node)); -+} -+ -+struct address_space *jnode_get_mapping(const jnode * node) -+{ -+ assert("nikita-3162", node != NULL); -+ return jnode_ops(node)->mapping(node); -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: jnode invariant */ -+int jnode_invariant_f(const jnode * node, char const **msg) -+{ -+#define _ergo(ant, con) \ -+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) -+#define _check(exp) ((*msg) = #exp, (exp)) -+ -+ return _check(node != NULL) && -+ /* [jnode-queued] */ -+ /* only relocated node can be queued, except that when znode -+ * is being deleted, its JNODE_RELOC bit is cleared */ -+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED), -+ JF_ISSET(node, JNODE_RELOC) || -+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) && -+ _check(node->jnodes.prev != NULL) && -+ _check(node->jnodes.next != NULL) && -+ /* [jnode-dirty] invariant */ -+ /* dirty inode is part of atom */ -+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) && -+ /* [jnode-oid] invariant */ -+ /* for unformatted node ->objectid and ->mapping fields are -+ * consistent */ -+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL, -+ node->key.j.objectid == -+ get_inode_oid(node->key.j.mapping->host)) && -+ /* [jnode-atom-valid] invariant */ -+ /* node atom has valid state */ -+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) && -+ /* [jnode-page-binding] invariant */ -+ /* if node points to page, it points back to node */ -+ _ergo(node->pg != NULL, jprivate(node->pg) == node) && -+ /* [jnode-refs] invariant */ -+ /* only referenced jnode can be loaded */ -+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count)); -+ -+} -+ -+static const char *jnode_type_name(jnode_type type) -+{ -+ switch (type) { -+ case JNODE_UNFORMATTED_BLOCK: -+ return "unformatted"; -+ case JNODE_FORMATTED_BLOCK: -+ return "formatted"; -+ case JNODE_BITMAP: -+ return "bitmap"; -+ case JNODE_IO_HEAD: -+ return "io head"; -+ case JNODE_INODE: -+ return "inode"; -+ case LAST_JNODE_TYPE: -+ return "last"; -+ default:{ -+ static char unknown[30]; -+ -+ sprintf(unknown, "unknown %i", type); -+ return unknown; -+ } -+ } -+} -+ -+#define jnode_state_name(node, flag) \ -+ (JF_ISSET((node), (flag)) ? 
((#flag "|")+6) : "") -+ -+/* debugging aid: output human readable information about @node */ -+static void info_jnode(const char *prefix /* prefix to print */ , -+ const jnode * node/* node to print */) -+{ -+ assert("umka-068", prefix != NULL); -+ -+ if (node == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ -+ printk -+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i," -+ " block: %s, d_count: %d, x_count: %d, " -+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node, -+ node->state, -+ jnode_state_name(node, JNODE_PARSED), -+ jnode_state_name(node, JNODE_HEARD_BANSHEE), -+ jnode_state_name(node, JNODE_LEFT_CONNECTED), -+ jnode_state_name(node, JNODE_RIGHT_CONNECTED), -+ jnode_state_name(node, JNODE_ORPHAN), -+ jnode_state_name(node, JNODE_CREATED), -+ jnode_state_name(node, JNODE_RELOC), -+ jnode_state_name(node, JNODE_OVRWR), -+ jnode_state_name(node, JNODE_DIRTY), -+ jnode_state_name(node, JNODE_IS_DYING), -+ jnode_state_name(node, JNODE_RIP), -+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE), -+ jnode_state_name(node, JNODE_WRITEBACK), -+ jnode_state_name(node, JNODE_NEW), -+ jnode_state_name(node, JNODE_DKSET), -+ jnode_state_name(node, JNODE_REPACK), -+ jnode_state_name(node, JNODE_CLUSTER_PAGE), -+ jnode_get_level(node), sprint_address(jnode_get_block(node)), -+ atomic_read(&node->d_count), atomic_read(&node->x_count), -+ jnode_page(node), node->atom, 0, 0, -+ jnode_type_name(jnode_get_type(node))); -+ if (jnode_is_unformatted(node)) { -+ printk("inode: %llu, index: %lu, ", -+ node->key.j.objectid, node->key.j.index); -+ } -+} -+ -+/* debugging aid: check znode invariant and panic if it doesn't hold */ -+static int jnode_invariant(jnode * node, int tlocked, int jlocked) -+{ -+ char const *failed_msg; -+ int result; -+ reiser4_tree *tree; -+ -+ tree = jnode_get_tree(node); -+ -+ assert("umka-063312", node != NULL); -+ assert("umka-064321", tree != NULL); -+ -+ if (!jlocked && !tlocked) -+ spin_lock_jnode((jnode *) node); -+ if (!tlocked) -+ read_lock_tree(jnode_get_tree(node)); -+ result = jnode_invariant_f(node, &failed_msg); -+ if (!result) { -+ info_jnode("corrupted node", node); -+ warning("jmacd-555", "Condition %s failed", failed_msg); -+ } -+ if (!tlocked) -+ read_unlock_tree(jnode_get_tree(node)); -+ if (!jlocked && !tlocked) -+ spin_unlock_jnode((jnode *) node); -+ return result; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/jnode.h linux-2.6.30/fs/reiser4/jnode.h ---- linux-2.6.30.orig/fs/reiser4/jnode.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/jnode.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,704 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of jnode. See jnode.c for details. 
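One remark on the jnode_invariant_f() helper shown above: its _check()/_ergo() macros use the comma operator to record each clause's source text in *msg before evaluating it, so when the &&-chain short-circuits, *msg names exactly the clause that failed. A self-contained illustration of the idiom with a made-up invariant (not the patch's macros, which additionally capture the antecedent/consequent pair):

    #include <stdio.h>

    #define _check(msg, exp) ((*(msg)) = #exp, (exp))

    struct counter { int held; int total; };

    static int counter_invariant(const struct counter *c, const char **msg)
    {
        /* && evaluates left to right and stops at the first false
         * clause, leaving *msg set to that clause's text */
        return _check(msg, c != NULL) &&
               _check(msg, c->held >= 0) &&
               _check(msg, c->held <= c->total);
    }

    int main(void)
    {
        struct counter c = { .held = 5, .total = 3 };
        const char *msg = NULL;

        if (!counter_invariant(&c, &msg))
            printf("invariant failed: %s\n", msg);
        /* prints: invariant failed: c->held <= c->total */
        return 0;
    }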
*/ -+ -+#ifndef __JNODE_H__ -+#define __JNODE_H__ -+ -+#include "forward.h" -+#include "type_safe_hash.h" -+#include "txnmgr.h" -+#include "key.h" -+#include "debug.h" -+#include "dformat.h" -+#include "page_cache.h" -+#include "context.h" -+ -+#include "plugin/plugin.h" -+ -+#include <linux/fs.h> -+#include <linux/mm.h> -+#include <linux/spinlock.h> -+#include <asm/atomic.h> -+#include <linux/bitops.h> -+#include <linux/list.h> -+#include <linux/rcupdate.h> -+ -+/* declare hash table of jnodes (jnodes proper, that is, unformatted -+ nodes) */ -+TYPE_SAFE_HASH_DECLARE(j, jnode); -+ -+/* declare hash table of znodes */ -+TYPE_SAFE_HASH_DECLARE(z, znode); -+ -+struct jnode_key { -+ __u64 objectid; -+ unsigned long index; -+ struct address_space *mapping; -+}; -+ -+/* -+ Jnode is the "base class" of other nodes in reiser4. It is also happens to -+ be exactly the node we use for unformatted tree nodes. -+ -+ Jnode provides following basic functionality: -+ -+ . reference counting and indexing. -+ -+ . integration with page cache. Jnode has ->pg reference to which page can -+ be attached. -+ -+ . interface to transaction manager. It is jnode that is kept in transaction -+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this -+ means, there should be special type of jnode for inode.) -+ -+ Locking: -+ -+ Spin lock: the following fields are protected by the per-jnode spin lock: -+ -+ ->state -+ ->atom -+ ->capture_link -+ -+ Following fields are protected by the global tree lock: -+ -+ ->link -+ ->key.z (content of ->key.z is only changed in znode_rehash()) -+ ->key.j -+ -+ Atomic counters -+ -+ ->x_count -+ ->d_count -+ -+ ->pg, and ->data are protected by spin lock for unused jnode and are -+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable() -+ is false). -+ -+ ->tree is immutable after creation -+ -+ Unclear -+ -+ ->blocknr: should be under jnode spin-lock, but current interface is based -+ on passing of block address. -+ -+ If you ever need to spin lock two nodes at once, do this in "natural" -+ memory order: lock znode with lower address first. (See lock_two_nodes().) -+ -+ Invariants involving this data-type: -+ -+ [jnode-dirty] -+ [jnode-refs] -+ [jnode-oid] -+ [jnode-queued] -+ [jnode-atom-valid] -+ [jnode-page-binding] -+*/ -+ -+struct jnode { -+#if REISER4_DEBUG -+#define JMAGIC 0x52654973 /* "ReIs" */ -+ int magic; -+#endif -+ /* FIRST CACHE LINE (16 bytes): data used by jload */ -+ -+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */ -+ /* 0 */ unsigned long state; -+ -+ /* lock, protecting jnode's fields. */ -+ /* 4 */ spinlock_t load; -+ -+ /* counter of references to jnode itself. Increased on jref(). -+ Decreased on jput(). -+ */ -+ /* 8 */ atomic_t x_count; -+ -+ /* counter of references to jnode's data. Pin data page(s) in -+ memory while this is greater than 0. Increased on jload(). -+ Decreased on jrelse(). -+ */ -+ /* 12 */ atomic_t d_count; -+ -+ /* SECOND CACHE LINE: data used by hash table lookups */ -+ -+ /* 16 */ union { -+ /* znodes are hashed by block number */ -+ reiser4_block_nr z; -+ /* unformatted nodes are hashed by mapping plus offset */ -+ struct jnode_key j; -+ } key; -+ -+ /* THIRD CACHE LINE */ -+ -+ /* 32 */ union { -+ /* pointers to maintain hash-table */ -+ z_hash_link z; -+ j_hash_link j; -+ } link; -+ -+ /* pointer to jnode page. */ -+ /* 36 */ struct page *pg; -+ /* pointer to node itself. 
This is page_address(node->pg) when page is -+ attached to the jnode -+ */ -+ /* 40 */ void *data; -+ -+ /* 44 */ reiser4_tree *tree; -+ -+ /* FOURTH CACHE LINE: atom related fields */ -+ -+ /* 48 */ spinlock_t guard; -+ -+ /* atom the block is in, if any */ -+ /* 52 */ txn_atom *atom; -+ -+ /* capture list */ -+ /* 56 */ struct list_head capture_link; -+ -+ /* FIFTH CACHE LINE */ -+ -+ /* 64 */ struct rcu_head rcu; -+ /* crosses cache line */ -+ -+ /* SIXTH CACHE LINE */ -+ -+ /* the real blocknr (where io is going to/from) */ -+ /* 80 */ reiser4_block_nr blocknr; -+ /* Parent item type, unformatted and CRC need it for -+ * offset => key conversion. */ -+ /* NOTE: this parent_item_id looks like jnode type. */ -+ /* 88 */ reiser4_plugin_id parent_item_id; -+ /* 92 */ -+#if REISER4_DEBUG -+ /* list of all jnodes for debugging purposes. */ -+ struct list_head jnodes; -+ /* how many times this jnode was written in one transaction */ -+ int written; -+ /* this indicates which atom's list the jnode is on */ -+ atom_list list; -+#endif -+} __attribute__ ((aligned(16))); -+ -+/* -+ * jnode types. Enumeration of existing jnode types. -+ */ -+typedef enum { -+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */ -+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */ -+ JNODE_BITMAP, /* bitmap */ -+ JNODE_IO_HEAD, /* jnode representing a block in the -+ * wandering log */ -+ JNODE_INODE, /* jnode embedded into inode */ -+ LAST_JNODE_TYPE -+} jnode_type; -+ -+/* jnode states */ -+typedef enum { -+ /* jnode's page is loaded and data checked */ -+ JNODE_PARSED = 0, -+ /* node was deleted, not all locks on it were released. This -+ node is empty and is going to be removed from the tree -+ shortly. */ -+ JNODE_HEARD_BANSHEE = 1, -+ /* left sibling pointer is valid */ -+ JNODE_LEFT_CONNECTED = 2, -+ /* right sibling pointer is valid */ -+ JNODE_RIGHT_CONNECTED = 3, -+ -+ /* znode was just created and doesn't yet have a pointer from -+ its parent */ -+ JNODE_ORPHAN = 4, -+ -+ /* this node was created by its transaction and has not been assigned -+ a block address. */ -+ JNODE_CREATED = 5, -+ -+ /* this node is currently relocated */ -+ JNODE_RELOC = 6, -+ /* this node is currently wandered */ -+ JNODE_OVRWR = 7, -+ -+ /* this znode has been modified */ -+ JNODE_DIRTY = 8, -+ -+ /* znode lock is being invalidated */ -+ JNODE_IS_DYING = 9, -+ -+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */ -+ -+ /* jnode is queued for flushing. */ -+ JNODE_FLUSH_QUEUED = 12, -+ -+ /* In the following bits jnode type is encoded. */ -+ JNODE_TYPE_1 = 13, -+ JNODE_TYPE_2 = 14, -+ JNODE_TYPE_3 = 15, -+ -+ /* jnode is being destroyed */ -+ JNODE_RIP = 16, -+ -+ /* znode was not captured during locking (it might so be because -+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */ -+ JNODE_MISSED_IN_CAPTURE = 17, -+ -+ /* write is in progress */ -+ JNODE_WRITEBACK = 18, -+ -+ /* FIXME: now it is used by crypto-compress plugin only */ -+ JNODE_NEW = 19, -+ -+ /* delimiting keys are already set for this znode. 
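The JNODE_TYPE_1..JNODE_TYPE_3 values in the enum here are not independent flags: together they reserve a three-bit field inside the same state word that carries the boolean bits, and jnode_get_type() (defined later in this header) decodes that field through a small lookup table. A simplified stand-alone sketch of packing a type into such a field (sequential values here for brevity; the patch itself maps specific bit patterns through mask_to_type[], and sets the bits only once, at jnode_init() time):

    #include <assert.h>

    enum toy_type { TOY_FORMATTED, TOY_UNFORMATTED, TOY_BITMAP, TOY_LAST };

    #define TYPE_SHIFT 13                    /* like JNODE_TYPE_1 == 13 */
    #define TYPE_MASK  (7ul << TYPE_SHIFT)   /* three consecutive bits */

    static void set_type(unsigned long *state, enum toy_type t)
    {
        assert(t < TOY_LAST);
        *state = (*state & ~TYPE_MASK) | ((unsigned long)t << TYPE_SHIFT);
    }

    static enum toy_type get_type(unsigned long state)
    {
        return (enum toy_type)((state & TYPE_MASK) >> TYPE_SHIFT);
    }

Keeping the type in the state word costs nothing on lookup-heavy paths: one load serves both the flag tests and the type dispatch.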
*/ -+ JNODE_DKSET = 20, -+ -+ /* when this bit is set page and jnode can not be disconnected */ -+ JNODE_WRITE_PREPARED = 21, -+ -+ JNODE_CLUSTER_PAGE = 22, -+ /* Jnode is marked for repacking, that means the reiser4 flush and the -+ * block allocator should process this node special way */ -+ JNODE_REPACK = 23, -+ /* node should be converted by flush in squalloc phase */ -+ JNODE_CONVERTIBLE = 24, -+ /* -+ * When jnode is dirtied for the first time in given transaction, -+ * do_jnode_make_dirty() checks whether this jnode can possible became -+ * member of overwrite set. If so, this bit is set, and one block is -+ * reserved in the ->flush_reserved space of atom. -+ * -+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when -+ * -+ * (1) flush decides that we want this block to go into relocate -+ * set after all. -+ * -+ * (2) wandering log is allocated (by log writer) -+ * -+ * (3) extent is allocated -+ * -+ */ -+ JNODE_FLUSH_RESERVED = 29 -+} reiser4_jnode_state; -+ -+/* Macros for accessing the jnode state. */ -+ -+static inline void JF_CLR(jnode * j, int f) -+{ -+ assert("unknown-1", j->magic == JMAGIC); -+ clear_bit(f, &j->state); -+} -+static inline int JF_ISSET(const jnode * j, int f) -+{ -+ assert("unknown-2", j->magic == JMAGIC); -+ return test_bit(f, &((jnode *) j)->state); -+} -+static inline void JF_SET(jnode * j, int f) -+{ -+ assert("unknown-3", j->magic == JMAGIC); -+ set_bit(f, &j->state); -+} -+ -+static inline int JF_TEST_AND_SET(jnode * j, int f) -+{ -+ assert("unknown-4", j->magic == JMAGIC); -+ return test_and_set_bit(f, &j->state); -+} -+ -+static inline void spin_lock_jnode(jnode *node) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_LT(spin_locked_jnode, 2))); -+ -+ spin_lock(&(node->guard)); -+ -+ LOCK_CNT_INC(spin_locked_jnode); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_jnode(jnode *node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_jnode); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(node->guard)); -+} -+ -+static inline int jnode_is_in_deleteset(const jnode * node) -+{ -+ return JF_ISSET(node, JNODE_RELOC); -+} -+ -+extern int init_jnodes(void); -+extern void done_jnodes(void); -+ -+/* Jnode routines */ -+extern jnode *jalloc(void); -+extern void jfree(jnode * node) NONNULL; -+extern jnode *jclone(jnode *); -+extern jnode *jlookup(reiser4_tree * tree, -+ oid_t objectid, unsigned long ind) NONNULL; -+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL; -+extern jnode *jnode_by_page(struct page *pg) NONNULL; -+extern jnode *jnode_of_page(struct page *pg) NONNULL; -+void jnode_attach_page(jnode * node, struct page *pg); -+ -+void unhash_unformatted_jnode(jnode *); -+extern jnode *page_next_jnode(jnode * node) NONNULL; -+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL; -+extern void jnode_make_dirty(jnode * node) NONNULL; -+extern void jnode_make_clean(jnode * node) NONNULL; -+extern void jnode_make_wander_nolock(jnode * node) NONNULL; -+extern void jnode_make_wander(jnode *) NONNULL; -+extern void znode_make_reloc(znode * , flush_queue_t *) NONNULL; -+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL; -+extern struct 
address_space *jnode_get_mapping(const jnode * node) NONNULL; -+ -+/** -+ * jnode_get_block -+ * @node: jnode to query -+ * -+ */ -+static inline const reiser4_block_nr *jnode_get_block(const jnode *node) -+{ -+ assert("nikita-528", node != NULL); -+ -+ return &node->blocknr; -+} -+ -+/** -+ * jnode_set_block -+ * @node: jnode to update -+ * @blocknr: new block nr -+ */ -+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr) -+{ -+ assert("nikita-2020", node != NULL); -+ assert("umka-055", blocknr != NULL); -+ node->blocknr = *blocknr; -+} -+ -+ -+/* block number for IO. Usually this is the same as jnode_get_block(), unless -+ * jnode was emergency flushed---then block number chosen by eflush is -+ * used. */ -+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node) -+{ -+ assert("nikita-2768", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ -+ return jnode_get_block(node); -+} -+ -+/* Jnode flush interface. */ -+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos); -+extern flush_queue_t *reiser4_pos_fq(flush_pos_t *pos); -+ -+/* FIXME-VS: these are used in plugin/item/extent.c */ -+ -+/* does extent_get_block have to be called */ -+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED) -+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED) -+ -+/* the node should be converted during flush squalloc phase */ -+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE) -+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE) -+ -+/* Macros to convert from jnode to znode, znode to jnode. These are macros -+ because C doesn't allow overloading of const prototypes. */ -+#define ZJNODE(x) (&(x)->zjnode) -+#define JZNODE(x) \ -+({ \ -+ typeof(x) __tmp_x; \ -+ \ -+ __tmp_x = (x); \ -+ assert("jmacd-1300", jnode_is_znode(__tmp_x)); \ -+ (znode*) __tmp_x; \ -+}) -+ -+extern int jnodes_tree_init(reiser4_tree * tree); -+extern int jnodes_tree_done(reiser4_tree * tree); -+ -+#if REISER4_DEBUG -+ -+extern int znode_is_any_locked(const znode * node); -+extern void jnode_list_remove(jnode * node); -+ -+#else -+ -+#define jnode_list_remove(node) noop -+ -+#endif -+ -+int znode_is_root(const znode * node) NONNULL; -+ -+/* bump reference counter on @node */ -+static inline void add_x_ref(jnode * node/* node to increase x_count of */) -+{ -+ assert("nikita-1911", node != NULL); -+ -+ atomic_inc(&node->x_count); -+ LOCK_CNT_INC(x_refs); -+} -+ -+static inline void dec_x_ref(jnode * node) -+{ -+ assert("nikita-3215", node != NULL); -+ assert("nikita-3216", atomic_read(&node->x_count) > 0); -+ -+ atomic_dec(&node->x_count); -+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs)); -+ LOCK_CNT_DEC(x_refs); -+} -+ -+/* jref() - increase counter of references to jnode/znode (x_count) */ -+static inline jnode *jref(jnode * node) -+{ -+ assert("jmacd-508", (node != NULL) && !IS_ERR(node)); -+ add_x_ref(node); -+ return node; -+} -+ -+/* get the page of jnode */ -+static inline struct page *jnode_page(const jnode * node) -+{ -+ return node->pg; -+} -+ -+/* return pointer to jnode data */ -+static inline char *jdata(const jnode * node) -+{ -+ assert("nikita-1415", node != NULL); -+ assert("nikita-3198", jnode_page(node) != NULL); -+ return node->data; -+} -+ -+static inline int jnode_is_loaded(const jnode * node) -+{ -+ assert("zam-506", node != NULL); -+ return atomic_read(&node->d_count) > 0; -+} -+ -+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL; -+ -+static inline void jnode_set_reloc(jnode * node) -+{ -+ 
assert("nikita-2431", node != NULL); -+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR)); -+ JF_SET(node, JNODE_RELOC); -+} -+ -+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */ -+ -+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL; -+ -+static inline int jload(jnode *node) -+{ -+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1); -+} -+ -+extern int jinit_new(jnode *, gfp_t) NONNULL; -+extern int jstartio(jnode *) NONNULL; -+ -+extern void jdrop(jnode *) NONNULL; -+extern int jwait_io(jnode *, int rw) NONNULL; -+ -+void jload_prefetch(jnode *); -+ -+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL; -+extern void reiser4_drop_io_head(jnode * node) NONNULL; -+ -+static inline reiser4_tree *jnode_get_tree(const jnode * node) -+{ -+ assert("nikita-2691", node != NULL); -+ return node->tree; -+} -+ -+extern void pin_jnode_data(jnode *); -+extern void unpin_jnode_data(jnode *); -+ -+static inline jnode_type jnode_get_type(const jnode * node) -+{ -+ static const unsigned long state_mask = -+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3); -+ -+ static jnode_type mask_to_type[] = { -+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */ -+ -+ /* 000 */ -+ [0] = JNODE_FORMATTED_BLOCK, -+ /* 001 */ -+ [1] = JNODE_UNFORMATTED_BLOCK, -+ /* 010 */ -+ [2] = JNODE_BITMAP, -+ /* 011 */ -+ [3] = LAST_JNODE_TYPE, /*invalid */ -+ /* 100 */ -+ [4] = JNODE_INODE, -+ /* 101 */ -+ [5] = LAST_JNODE_TYPE, -+ /* 110 */ -+ [6] = JNODE_IO_HEAD, -+ /* 111 */ -+ [7] = LAST_JNODE_TYPE, /* invalid */ -+ }; -+ -+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1]; -+} -+ -+/* returns true if node is a znode */ -+static inline int jnode_is_znode(const jnode * node) -+{ -+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK; -+} -+ -+static inline int jnode_is_flushprepped(jnode * node) -+{ -+ assert("jmacd-78212", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) || -+ JF_ISSET(node, JNODE_OVRWR); -+} -+ -+/* Return true if @node has already been processed by the squeeze and allocate -+ process. This implies the block address has been finalized for the -+ duration of this atom (or it is clean and will remain in place). If this -+ returns true you may use the block number as a hint. */ -+static inline int jnode_check_flushprepped(jnode * node) -+{ -+ int result; -+ -+ /* It must be clean or relocated or wandered. New allocations are set -+ * to relocate. 
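jput() above is the interesting half of the reference counting: the last reference is dropped inside an RCU read-side section, so if atomic_dec_and_test() makes this thread the releaser, jput_final() can unhash and free the jnode while concurrent lock-free lookups still inside rcu_read_lock() stay safe. A user-space approximation of the dec-and-test part only (hypothetical names; the RCU deferral itself is only noted in a comment):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct ref_obj {
        atomic_int x_count;
        /* ... payload ... */
    };

    static void obj_final(struct ref_obj *o)
    {
        /* the kernel code would defer the actual freeing via RCU so
         * that readers traversing the hash table never touch freed
         * memory; a plain free() stands in for that here */
        free(o);
    }

    static void obj_put(struct ref_obj *o)
    {
        /* fetch_sub returns the previous value: exactly one thread
         * sees the 1 -> 0 transition and becomes the releaser */
        if (atomic_fetch_sub_explicit(&o->x_count, 1,
                                      memory_order_acq_rel) == 1)
            obj_final(o);
    }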
*/ -+ spin_lock_jnode(node); -+ result = jnode_is_flushprepped(node); -+ spin_unlock_jnode(node); -+ return result; -+} -+ -+/* returns true if node is unformatted */ -+static inline int jnode_is_unformatted(const jnode * node) -+{ -+ assert("jmacd-0123", node != NULL); -+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK; -+} -+ -+/* returns true if node represents a cluster cache page */ -+static inline int jnode_is_cluster_page(const jnode * node) -+{ -+ assert("edward-50", node != NULL); -+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE)); -+} -+ -+/* returns true is node is builtin inode's jnode */ -+static inline int jnode_is_inode(const jnode * node) -+{ -+ assert("vs-1240", node != NULL); -+ return jnode_get_type(node) == JNODE_INODE; -+} -+ -+static inline jnode_plugin *jnode_ops_of(const jnode_type type) -+{ -+ assert("nikita-2367", type < LAST_JNODE_TYPE); -+ return jnode_plugin_by_id((reiser4_plugin_id) type); -+} -+ -+static inline jnode_plugin *jnode_ops(const jnode * node) -+{ -+ assert("nikita-2366", node != NULL); -+ -+ return jnode_ops_of(jnode_get_type(node)); -+} -+ -+/* Get the index of a block. */ -+static inline unsigned long jnode_get_index(jnode * node) -+{ -+ return jnode_ops(node)->index(node); -+} -+ -+/* return true if "node" is the root */ -+static inline int jnode_is_root(const jnode * node) -+{ -+ return jnode_is_znode(node) && znode_is_root(JZNODE(node)); -+} -+ -+extern struct address_space *mapping_jnode(const jnode * node); -+extern unsigned long index_jnode(const jnode * node); -+ -+static inline void jput(jnode * node); -+extern void jput_final(jnode * node); -+ -+/* bump data counter on @node */ -+static inline void add_d_ref(jnode * node/* node to increase d_count of */) -+{ -+ assert("nikita-1962", node != NULL); -+ -+ atomic_inc(&node->d_count); -+ if (jnode_is_unformatted(node) || jnode_is_znode(node)) -+ LOCK_CNT_INC(d_refs); -+} -+ -+/* jput() - decrement x_count reference counter on znode. -+ -+ Count may drop to 0, jnode stays in cache until memory pressure causes the -+ eviction of its page. The c_count variable also ensures that children are -+ pressured out of memory before the parent. The jnode remains hashed as -+ long as the VM allows its page to stay in memory. -+*/ -+static inline void jput(jnode * node) -+{ -+ assert("jmacd-509", node != NULL); -+ assert("jmacd-510", atomic_read(&node->x_count) > 0); -+ assert("zam-926", reiser4_schedulable()); -+ LOCK_CNT_DEC(x_refs); -+ -+ rcu_read_lock(); -+ /* -+ * we don't need any kind of lock here--jput_final() uses RCU. -+ */ -+ if (unlikely(atomic_dec_and_test(&node->x_count))) -+ jput_final(node); -+ else -+ rcu_read_unlock(); -+ assert("nikita-3473", reiser4_schedulable()); -+} -+ -+extern void jrelse(jnode * node); -+extern void jrelse_tail(jnode * node); -+ -+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node); -+ -+/* resolve race with jput */ -+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node) -+{ -+ if (unlikely(JF_ISSET(node, JNODE_RIP))) -+ node = jnode_rip_sync(tree, node); -+ return node; -+} -+ -+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key); -+ -+#if REISER4_DEBUG -+extern int jnode_invariant_f(const jnode *node, char const **msg); -+#endif -+ -+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE]; -+ -+/* __JNODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/kassign.c linux-2.6.30/fs/reiser4/kassign.c ---- linux-2.6.30.orig/fs/reiser4/kassign.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/kassign.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,677 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Key assignment policy implementation */ -+ -+/* -+ * In reiser4 every piece of file system data and meta-data has a key. Keys -+ * are used to store information in and retrieve it from reiser4 internal -+ * tree. In addition to this, keys define _ordering_ of all file system -+ * information: things having close keys are placed into the same or -+ * neighboring (in the tree order) nodes of the tree. As our block allocator -+ * tries to respect tree order (see flush.c), keys also define order in which -+ * things are laid out on the disk, and hence, affect performance directly. -+ * -+ * Obviously, assignment of keys to data and meta-data should be consistent -+ * across whole file system. Algorithm that calculates a key for a given piece -+ * of data or meta-data is referred to as "key assignment". -+ * -+ * Key assignment is too expensive to be implemented as a plugin (that is, -+ * with an ability to support different key assignment schemas in the same -+ * compiled kernel image). As a compromise, all key-assignment functions and -+ * data-structures are collected in this single file, so that modifications to -+ * key assignment algorithm can be localized. Additional changes may be -+ * required in key.[ch]. -+ * -+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one -+ * may guess, there is "Plan B" too. -+ * -+ */ -+ -+/* -+ * Additional complication with key assignment implementation is a requirement -+ * to support different key length. -+ */ -+ -+/* -+ * KEY ASSIGNMENT: PLAN A, LONG KEYS. -+ * -+ * DIRECTORY ITEMS -+ * -+ * | 60 | 4 | 7 |1| 56 | 64 | 64 | -+ * +--------------+---+---+-+-------------+------------------+-----------------+ -+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash | -+ * +--------------+---+---+-+-------------+------------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * dirid objectid of directory this item is for -+ * -+ * F fibration, see fs/reiser4/plugin/fibration.[ch] -+ * -+ * H 1 if last 8 bytes of the key contain hash, -+ * 0 if last 8 bytes of the key contain prefix-3 -+ * -+ * prefix-1 first 7 characters of file name. -+ * Padded by zeroes if name is not long enough. -+ * -+ * prefix-2 next 8 characters of the file name. -+ * -+ * prefix-3 next 8 characters of the file name. -+ * -+ * hash hash of the rest of file name (i.e., portion of file -+ * name not included into prefix-1 and prefix-2). -+ * -+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded -+ * in the key. Such file names are called "short". They are distinguished by H -+ * bit set 0 in the key. -+ * -+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7 -+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the -+ * key. Last 8 bytes of the key are occupied by hash of the remaining -+ * characters of the name. 
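To make the long-key scheme above concrete: the 9-character name "changelog" is "short", so it is encoded completely, prefix-1 = "changel", prefix-2 = "og" padded with zero bytes, prefix-3 = 0, and the H bit stays 0; a 30-character name keeps only its first 15 characters literally and stores a hash of the rest in the last element, with H set to 1. The sketch below reimplements the byte packing that pack_string() performs further down in this file, just to show the resulting values (stand-alone, uint64_t instead of the kernel's __u64):

    #include <stdio.h>
    #include <stdint.h>

    /* pack up to (8 - start_idx) leading characters of @name into a
     * 64-bit value, first character in the highest used byte */
    static uint64_t pack(const char *name, int start_idx)
    {
        uint64_t str = 0;
        unsigned i;

        for (i = 0; i < sizeof(str) - start_idx && name[i]; ++i) {
            str <<= 8;
            str |= (unsigned char)name[i];
        }
        str <<= (sizeof(str) - i - start_idx) * 8;
        return str;
    }

    int main(void)
    {
        /* 9 characters: fully encoded in the key, no hash needed */
        printf("prefix-1: %016llx\n",
               (unsigned long long)pack("changelog", 1));     /* "changel" */
        printf("prefix-2: %016llx\n",
               (unsigned long long)pack("changelog" + 7, 0)); /* "og" + zeroes */
        return 0;
    }

start_idx == 1 leaves the highest byte of the first element free, which is where the F (fibration) and H bits live.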
-+ * -+ * This key assignment reaches following important goals: -+ * -+ * (1) directory entries are sorted in approximately lexicographical -+ * order. -+ * -+ * (2) collisions (when multiple directory items have the same key), while -+ * principally unavoidable in a tree with fixed length keys, are rare. -+ * -+ * STAT DATA -+ * -+ * | 60 | 4 | 64 | 4 | 60 | 64 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | locality id | 1 | ordering | 0 | objectid | 0 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * ordering copy of second 8-byte portion of the key of directory -+ * entry for the first name of this object. Ordering has a form -+ * { -+ * fibration :7; -+ * h :1; -+ * prefix1 :56; -+ * } -+ * see description of key for directory entry above. -+ * -+ * objectid object id for this object -+ * -+ * This key assignment policy is designed to keep stat-data in the same order -+ * as corresponding directory items, thus speeding up readdir/stat types of -+ * workload. -+ * -+ * FILE BODY -+ * -+ * | 60 | 4 | 64 | 4 | 60 | 64 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | locality id | 4 | ordering | 0 | objectid | offset | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * ordering the same as in the key of stat-data for this object -+ * -+ * objectid object id for this object -+ * -+ * offset logical offset from the beginning of this file. -+ * Measured in bytes. -+ * -+ * -+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS. -+ * -+ * DIRECTORY ITEMS -+ * -+ * | 60 | 4 | 7 |1| 56 | 64 | -+ * +--------------+---+---+-+-------------+-----------------+ -+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash | -+ * +--------------+---+---+-+-------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * dirid objectid of directory this item is for -+ * -+ * F fibration, see fs/reiser4/plugin/fibration.[ch] -+ * -+ * H 1 if last 8 bytes of the key contain hash, -+ * 0 if last 8 bytes of the key contain prefix-2 -+ * -+ * prefix-1 first 7 characters of file name. -+ * Padded by zeroes if name is not long enough. -+ * -+ * prefix-2 next 8 characters of the file name. -+ * -+ * hash hash of the rest of file name (i.e., portion of file -+ * name not included into prefix-1). -+ * -+ * File names shorter than 15 (== 7 + 8) characters are completely encoded in -+ * the key. Such file names are called "short". They are distinguished by H -+ * bit set in the key. -+ * -+ * Other file names are "long". For long name, H bit is 0, and first 7 -+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the -+ * key are occupied by hash of the remaining characters of the name. 
-+ * -+ * STAT DATA -+ * -+ * | 60 | 4 | 4 | 60 | 64 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | locality id | 1 | 0 | objectid | 0 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * objectid object id for this object -+ * -+ * FILE BODY -+ * -+ * | 60 | 4 | 4 | 60 | 64 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | locality id | 4 | 0 | objectid | offset | -+ * +--------------+---+---+--------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * objectid object id for this object -+ * -+ * offset logical offset from the beginning of this file. -+ * Measured in bytes. -+ * -+ * -+ */ -+ -+#include "debug.h" -+#include "key.h" -+#include "kassign.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "dscale.h" -+ -+#include <linux/types.h> /* for __u?? */ -+#include <linux/fs.h> /* for struct super_block, etc */ -+ -+/* bitmask for H bit (see comment at the beginning of this file */ -+static const __u64 longname_mark = 0x0100000000000000ull; -+/* bitmask for F and H portions of the key. */ -+static const __u64 fibration_mask = 0xff00000000000000ull; -+ -+/* return true if name is not completely encoded in @key */ -+int is_longname_key(const reiser4_key * key) -+{ -+ __u64 highpart; -+ -+ assert("nikita-2863", key != NULL); -+ if (get_key_type(key) != KEY_FILE_NAME_MINOR) -+ reiser4_print_key("oops", key); -+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR); -+ -+ if (REISER4_LARGE_KEY) -+ highpart = get_key_ordering(key); -+ else -+ highpart = get_key_objectid(key); -+ -+ return (highpart & longname_mark) ? 1 : 0; -+} -+ -+/* return true if @name is too long to be completely encoded in the key */ -+int is_longname(const char *name UNUSED_ARG, int len) -+{ -+ if (REISER4_LARGE_KEY) -+ return len > 23; -+ else -+ return len > 15; -+} -+ -+/* code ascii string into __u64. -+ -+ Put characters of @name into result (@str) one after another starting -+ from @start_idx-th highest (arithmetically) byte. This produces -+ endian-safe encoding. memcpy(2) will not do. -+ -+*/ -+static __u64 pack_string(const char *name /* string to encode */ , -+ int start_idx /* highest byte in result from -+ * which to start encoding */ ) -+{ -+ unsigned i; -+ __u64 str; -+ -+ str = 0; -+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) { -+ str <<= 8; -+ str |= (unsigned char)name[i]; -+ } -+ str <<= (sizeof str - i - start_idx) << 3; -+ return str; -+} -+ -+/* opposite to pack_string(). 
Takes value produced by pack_string(), restores -+ * string encoded in it and stores result in @buf */ -+char *reiser4_unpack_string(__u64 value, char *buf) -+{ -+ do { -+ *buf = value >> (64 - 8); -+ if (*buf) -+ ++buf; -+ value <<= 8; -+ } while (value != 0); -+ *buf = 0; -+ return buf; -+} -+ -+/* obtain name encoded in @key and store it in @buf */ -+char *extract_name_from_key(const reiser4_key * key, char *buf) -+{ -+ char *c; -+ -+ assert("nikita-2868", !is_longname_key(key)); -+ -+ c = buf; -+ if (REISER4_LARGE_KEY) { -+ c = reiser4_unpack_string(get_key_ordering(key) & -+ ~fibration_mask, c); -+ c = reiser4_unpack_string(get_key_fulloid(key), c); -+ } else -+ c = reiser4_unpack_string(get_key_fulloid(key) & -+ ~fibration_mask, c); -+ reiser4_unpack_string(get_key_offset(key), c); -+ return buf; -+} -+ -+/** -+ * complete_entry_key - calculate entry key by name -+ * @dir: directory where entry is (or will be) in -+ * @name: name to calculate key of -+ * @len: lenth of name -+ * @result: place to store result in -+ * -+ * Sets fields of entry key @result which depend on file name. -+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering, -+ * objectid and offset. Otherwise, objectid and offset are set. -+ */ -+void complete_entry_key(const struct inode *dir, const char *name, -+ int len, reiser4_key *result) -+{ -+#if REISER4_LARGE_KEY -+ __u64 ordering; -+ __u64 objectid; -+ __u64 offset; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1142", result != NULL); -+ assert("nikita-2867", strlen(name) == len); -+ -+ /* -+ * key allocation algorithm for directory entries in case of large -+ * keys: -+ * -+ * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7 -+ * characters into ordering field of key, next 8 charactes (if any) -+ * into objectid field of key and next 8 ones (of any) into offset -+ * field of key -+ * -+ * If file name is longer than 23 characters, put first 7 characters -+ * into key's ordering, next 8 to objectid and hash of remaining -+ * characters into offset field. -+ * -+ * To distinguish above cases, in latter set up unused high bit in -+ * ordering field. 
-+ */ -+ -+ /* [0-6] characters to ordering */ -+ ordering = pack_string(name, 1); -+ if (len > 7) { -+ /* [7-14] characters to objectid */ -+ objectid = pack_string(name + 7, 0); -+ if (len > 15) { -+ if (len <= 23) { -+ /* [15-23] characters to offset */ -+ offset = pack_string(name + 15, 0); -+ } else { -+ /* note in a key the fact that offset contains -+ * hash */ -+ ordering |= longname_mark; -+ -+ /* offset is the hash of the file name's tail */ -+ offset = inode_hash_plugin(dir)->hash(name + 15, -+ len - 15); -+ } -+ } else { -+ offset = 0ull; -+ } -+ } else { -+ objectid = 0ull; -+ offset = 0ull; -+ } -+ -+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); -+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len); -+ -+ set_key_ordering(result, ordering); -+ set_key_fulloid(result, objectid); -+ set_key_offset(result, offset); -+ return; -+ -+#else -+ __u64 objectid; -+ __u64 offset; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1142", result != NULL); -+ assert("nikita-2867", strlen(name) == len); -+ -+ /* -+ * key allocation algorithm for directory entries in case of not large -+ * keys: -+ * -+ * If name is not longer than 7 + 8 = 15 characters, put first 7 -+ * characters into objectid field of key, next 8 charactes (if any) -+ * into offset field of key -+ * -+ * If file name is longer than 15 characters, put first 7 characters -+ * into key's objectid, and hash of remaining characters into offset -+ * field. -+ * -+ * To distinguish above cases, in latter set up unused high bit in -+ * objectid field. -+ */ -+ -+ /* [0-6] characters to objectid */ -+ objectid = pack_string(name, 1); -+ if (len > 7) { -+ if (len <= 15) { -+ /* [7-14] characters to offset */ -+ offset = pack_string(name + 7, 0); -+ } else { -+ /* note in a key the fact that offset contains hash. */ -+ objectid |= longname_mark; -+ -+ /* offset is the hash of the file name. */ -+ offset = inode_hash_plugin(dir)->hash(name + 7, -+ len - 7); -+ } -+ } else -+ offset = 0ull; -+ -+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); -+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len); -+ -+ set_key_fulloid(result, objectid); -+ set_key_offset(result, offset); -+ return; -+#endif /* ! REISER4_LARGE_KEY */ -+} -+ -+/* true, if @key is the key of "." */ -+int is_dot_key(const reiser4_key * key/* key to check */) -+{ -+ assert("nikita-1717", key != NULL); -+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR); -+ return -+ (get_key_ordering(key) == 0ull) && -+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull); -+} -+ -+/* build key for stat-data. -+ -+ return key of stat-data of this object. This should became sd plugin -+ method in the future. For now, let it be here. -+ -+*/ -+reiser4_key *build_sd_key(const struct inode *target /* inode of an object */ , -+ reiser4_key * result /* resulting key of @target -+ stat-data */ ) -+{ -+ assert("nikita-261", result != NULL); -+ -+ reiser4_key_init(result); -+ set_key_locality(result, reiser4_inode_data(target)->locality_id); -+ set_key_ordering(result, get_inode_ordering(target)); -+ set_key_objectid(result, get_inode_oid(target)); -+ set_key_type(result, KEY_SD_MINOR); -+ set_key_offset(result, (__u64) 0); -+ return result; -+} -+ -+/* encode part of key into &obj_key_id -+ -+ This encodes into @id part of @key sufficient to restore @key later, -+ given that latter is key of object (key of stat-data). 
-+ -+ See &obj_key_id -+*/ -+int build_obj_key_id(const reiser4_key * key /* key to encode */ , -+ obj_key_id * id/* id where key is encoded in */) -+{ -+ assert("nikita-1151", key != NULL); -+ assert("nikita-1152", id != NULL); -+ -+ memcpy(id, key, sizeof *id); -+ return 0; -+} -+ -+/* encode reference to @obj in @id. -+ -+ This is like build_obj_key_id() above, but takes inode as parameter. */ -+int build_inode_key_id(const struct inode *obj /* object to build key of */ , -+ obj_key_id * id/* result */) -+{ -+ reiser4_key sdkey; -+ -+ assert("nikita-1166", obj != NULL); -+ assert("nikita-1167", id != NULL); -+ -+ build_sd_key(obj, &sdkey); -+ build_obj_key_id(&sdkey, id); -+ return 0; -+} -+ -+/* decode @id back into @key -+ -+ Restore key of object stat-data from @id. This is dual to -+ build_obj_key_id() above. -+*/ -+int extract_key_from_id(const obj_key_id * id /* object key id to extract key -+ * from */ , -+ reiser4_key * key/* result */) -+{ -+ assert("nikita-1153", id != NULL); -+ assert("nikita-1154", key != NULL); -+ -+ reiser4_key_init(key); -+ memcpy(key, id, sizeof *id); -+ return 0; -+} -+ -+/* extract objectid of directory from key of directory entry within said -+ directory. -+ */ -+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of -+ * directory -+ * entry */ ) -+{ -+ assert("nikita-1314", de_key != NULL); -+ return get_key_locality(de_key); -+} -+ -+/* encode into @id key of directory entry. -+ -+ Encode into @id information sufficient to later distinguish directory -+ entries within the same directory. This is not whole key, because all -+ directory entries within directory item share locality which is equal -+ to objectid of their directory. -+ -+*/ -+int build_de_id(const struct inode *dir /* inode of directory */ , -+ const struct qstr *name /* name to be given to @obj by -+ * directory entry being -+ * constructed */ , -+ de_id * id/* short key of directory entry */) -+{ -+ reiser4_key key; -+ -+ assert("nikita-1290", dir != NULL); -+ assert("nikita-1292", id != NULL); -+ -+ /* NOTE-NIKITA this is suboptimal. */ -+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key); -+ return build_de_id_by_key(&key, id); -+} -+ -+/* encode into @id key of directory entry. -+ -+ Encode into @id information sufficient to later distinguish directory -+ entries within the same directory. This is not whole key, because all -+ directory entries within directory item share locality which is equal -+ to objectid of their directory. -+ -+*/ -+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory -+ * entry */ , -+ de_id * id/* short key of directory entry */) -+{ -+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id); -+ return 0; -+} -+ -+/* restore from @id key of directory entry. -+ -+ Function dual to build_de_id(): given @id and locality, build full -+ key of directory entry within directory item. 
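The de_id encoding above is plain truncation: all entries of one directory share the first key element (the locality equals the directory's object id), so build_de_id_by_key() memcpy()s only the remaining elements, and extract_key_from_de_id(), defined next, splices the locality back in. A stand-alone sketch with a toy three-element key (hypothetical types, not the patch's reiser4_key):

    #include <stdint.h>
    #include <string.h>

    struct toy_key   { uint64_t el[3]; };  /* el[0] is the locality */
    struct toy_de_id { uint64_t el[2]; };  /* key minus the shared locality */

    static void key_to_de_id(const struct toy_key *key, struct toy_de_id *id)
    {
        /* drop el[0]: it is the same for every entry in the directory */
        memcpy(id->el, key->el + 1, sizeof(id->el));
    }

    static void de_id_to_key(uint64_t locality, const struct toy_de_id *id,
                             struct toy_key *key)
    {
        key->el[0] = locality;              /* splice the locality back */
        memcpy(key->el + 1, id->el, sizeof(id->el));
    }

This is also why de_id_cmp() further down can compare two de_ids by rebuilding full keys with a zero locality: the locality is equal on both sides by construction, so any placeholder works.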
-+ -+*/ -+int extract_key_from_de_id(const oid_t locality /* locality of directory -+ * entry */ , -+ const de_id * id /* directory entry id */ , -+ reiser4_key * key/* result */) -+{ -+ /* no need to initialise key here: all fields are overwritten */ -+ memcpy(((__u64 *) key) + 1, id, sizeof *id); -+ set_key_locality(key, locality); -+ set_key_type(key, KEY_FILE_NAME_MINOR); -+ return 0; -+} -+ -+/* compare two &de_id's */ -+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ , -+ const de_id * id2/* second &de_id to compare */) -+{ -+ /* NOTE-NIKITA ugly implementation */ -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ extract_key_from_de_id((oid_t) 0, id1, &k1); -+ extract_key_from_de_id((oid_t) 0, id2, &k2); -+ return keycmp(&k1, &k2); -+} -+ -+/* compare &de_id with key */ -+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ , -+ const reiser4_key * key/* key to compare */) -+{ -+ cmp_t result; -+ reiser4_key *k1; -+ -+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]); -+ result = KEY_DIFF_EL(k1, key, 1); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, key, 2); -+ if (REISER4_LARGE_KEY && result == EQUAL_TO) -+ result = KEY_DIFF_EL(k1, key, 3); -+ } -+ return result; -+} -+ -+/* -+ * return number of bytes necessary to encode @inode identity. -+ */ -+int inode_onwire_size(const struct inode *inode) -+{ -+ int result; -+ -+ result = dscale_bytes_to_write(get_inode_oid(inode)); -+ result += dscale_bytes_to_write(get_inode_locality(inode)); -+ -+ /* -+ * ordering is large (it usually has highest bits set), so it makes -+ * little sense to dscale it. -+ */ -+ if (REISER4_LARGE_KEY) -+ result += sizeof(get_inode_ordering(inode)); -+ return result; -+} -+ -+/* -+ * encode @inode identity at @start -+ */ -+char *build_inode_onwire(const struct inode *inode, char *start) -+{ -+ start += dscale_write(start, get_inode_locality(inode)); -+ start += dscale_write(start, get_inode_oid(inode)); -+ -+ if (REISER4_LARGE_KEY) { -+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start); -+ start += sizeof(get_inode_ordering(inode)); -+ } -+ return start; -+} -+ -+/* -+ * extract key that was previously encoded by build_inode_onwire() at @addr -+ */ -+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id) -+{ -+ __u64 val; -+ -+ addr += dscale_read(addr, &val); -+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR; -+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality); -+ addr += dscale_read(addr, &val); -+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid); -+#if REISER4_LARGE_KEY -+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering); -+ addr += sizeof key_id->ordering; -+#endif -+ return addr; -+} -+ -+/* -+ * skip a key that was previously encoded by build_inode_onwire() at @addr -+ * FIXME: handle IO errors. -+ */ -+char * locate_obj_key_id_onwire(char * addr) -+{ -+ /* locality */ -+ addr += dscale_bytes_to_read(addr); -+ /* objectid */ -+ addr += dscale_bytes_to_read(addr); -+#if REISER4_LARGE_KEY -+ addr += sizeof ((obj_key_id *)0)->ordering; -+#endif -+ return addr; -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/kassign.h linux-2.6.30/fs/reiser4/kassign.h ---- linux-2.6.30.orig/fs/reiser4/kassign.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/kassign.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,111 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Key assignment policy interface. See kassign.c for details. */ -+ -+#if !defined(__KASSIGN_H__) -+#define __KASSIGN_H__ -+ -+#include "forward.h" -+#include "key.h" -+#include "dformat.h" -+ -+#include <linux/types.h> /* for __u?? */ -+#include <linux/fs.h> /* for struct super_block, etc */ -+#include <linux/dcache.h> /* for struct qstr */ -+ -+/* key assignment functions */ -+ -+/* Information from which key of file stat-data can be uniquely -+ restored. This depends on key assignment policy for -+ stat-data. Currently it's enough to store object id and locality id -+ (60+60==120) bits, because minor packing locality and offset of -+ stat-data key are always known constants: KEY_SD_MINOR and 0 -+ respectively. For simplicity 4 bits are wasted in each id, and just -+ two 64 bit integers are stored. -+ -+ This field has to be byte-aligned, because we don't want to waste -+ space in directory entries. There is another side of a coin of -+ course: we waste CPU and bus bandwidth in stead, by copying data back -+ and forth. -+ -+ Next optimization: &obj_key_id is mainly used to address stat data from -+ directory entries. Under the assumption that majority of files only have -+ only name (one hard link) from *the* parent directory it seems reasonable -+ to only store objectid of stat data and take its locality from key of -+ directory item. -+ -+ This requires some flag to be added to the &obj_key_id to distinguish -+ between these two cases. Remaining bits in flag byte are then asking to be -+ used to store file type. -+ -+ This optimization requires changes in directory item handling code. -+ -+*/ -+typedef struct obj_key_id { -+ d8 locality[sizeof(__u64)]; -+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)]; -+ ) -+ d8 objectid[sizeof(__u64)]; -+} -+obj_key_id; -+ -+/* Information sufficient to uniquely identify directory entry within -+ compressed directory item. -+ -+ For alignment issues see &obj_key_id above. 
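A practical consequence of the byte-aligned layout discussed above: an obj_key_id embedded in a directory item can start at any byte offset, so its __u64-sized fields must not be dereferenced directly on architectures that trap on unaligned loads. The "copying data back and forth" the comment mentions is, portably, a memcpy, which is also what get_unaligned()/put_unaligned() amount to on such architectures. A minimal sketch (endianness conversion omitted):

    #include <stdint.h>
    #include <string.h>

    /* load a 64-bit field from a possibly unaligned byte array */
    static uint64_t load_u64(const unsigned char *p)
    {
        uint64_t v;

        memcpy(&v, p, sizeof(v));  /* legal at any alignment */
        return v;                  /* add le64-to-cpu swabbing on
                                    * big-endian hosts */
    }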
-+*/ -+typedef struct de_id { -+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];) -+ d8 objectid[sizeof(__u64)]; -+ d8 offset[sizeof(__u64)]; -+} -+de_id; -+ -+extern int inode_onwire_size(const struct inode *obj); -+extern char *build_inode_onwire(const struct inode *obj, char *area); -+extern char *locate_obj_key_id_onwire(char *area); -+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id); -+ -+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id); -+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key); -+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id); -+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key); -+extern int build_de_id(const struct inode *dir, const struct qstr *name, -+ de_id * id); -+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id); -+extern int extract_key_from_de_id(const oid_t locality, const de_id * id, -+ reiser4_key * key); -+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2); -+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key); -+ -+extern int build_readdir_key_common(struct file *dir, reiser4_key * result); -+extern void build_entry_key_common(const struct inode *dir, -+ const struct qstr *name, -+ reiser4_key * result); -+extern void build_entry_key_stable_entry(const struct inode *dir, -+ const struct qstr *name, -+ reiser4_key * result); -+extern int is_dot_key(const reiser4_key * key); -+extern reiser4_key *build_sd_key(const struct inode *target, -+ reiser4_key * result); -+ -+extern int is_longname_key(const reiser4_key * key); -+extern int is_longname(const char *name, int len); -+extern char *extract_name_from_key(const reiser4_key * key, char *buf); -+extern char *reiser4_unpack_string(__u64 value, char *buf); -+extern void complete_entry_key(const struct inode *dir, const char *name, -+ int len, reiser4_key *result); -+ -+/* __KASSIGN_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/Kconfig linux-2.6.30/fs/reiser4/Kconfig ---- linux-2.6.30.orig/fs/reiser4/Kconfig 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/Kconfig 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,34 @@ -+config REISER4_FS -+ tristate "Reiser4 (EXPERIMENTAL)" -+ depends on EXPERIMENTAL -+ select ZLIB_INFLATE -+ select ZLIB_DEFLATE -+ select LZO_COMPRESS -+ select LZO_DECOMPRESS -+ select CRYPTO -+ help -+ Reiser4 is a filesystem that performs all filesystem operations -+ as atomic transactions, which means that it either performs a -+ write, or it does not, and in the event of a crash it does not -+ partially perform it or corrupt it. -+ -+ It stores files in dancing trees, which are like balanced trees but -+ faster. It packs small files together so that they share blocks -+ without wasting space. This means you can use it to store really -+ small files. It also means that it saves you disk space. It avoids -+ hassling you with anachronisms like having a maximum number of -+ inodes, and wasting space if you use less than that number. -+ -+ Reiser4 is a distinct filesystem type from reiserfs (V3). -+ It's therefore not possible to use reiserfs file systems -+ with reiser4. 
-+ -+ To learn more about reiser4, go to http://www.namesys.com -+ -+config REISER4_DEBUG -+ bool "Enable reiser4 debug mode" -+ depends on REISER4_FS -+ help -+ Don't use this unless you are debugging reiser4. -+ -+ If unsure, say N. -diff -urN linux-2.6.30.orig/fs/reiser4/key.c linux-2.6.30/fs/reiser4/key.c ---- linux-2.6.30.orig/fs/reiser4/key.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/key.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,138 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Key manipulations. */ -+ -+#include "debug.h" -+#include "key.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include <linux/types.h> /* for __u?? */ -+ -+/* Minimal possible key: all components are zero. It is presumed that this is -+ independent of key scheme. */ -+static const reiser4_key MINIMAL_KEY = { -+ .el = { -+ 0ull, -+ ON_LARGE_KEY(0ull,) -+ 0ull, -+ 0ull -+ } -+}; -+ -+/* Maximal possible key: all components are ~0. It is presumed that this is -+ independent of key scheme. */ -+static const reiser4_key MAXIMAL_KEY = { -+ .el = { -+ __constant_cpu_to_le64(~0ull), -+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),) -+ __constant_cpu_to_le64(~0ull), -+ __constant_cpu_to_le64(~0ull) -+ } -+}; -+ -+/* Initialize key. */ -+void reiser4_key_init(reiser4_key * key/* key to init */) -+{ -+ assert("nikita-1169", key != NULL); -+ memset(key, 0, sizeof *key); -+} -+ -+/* minimal possible key in the tree. Return pointer to the static storage. */ -+const reiser4_key * reiser4_min_key(void) -+{ -+ return &MINIMAL_KEY; -+} -+ -+/* maximum possible key in the tree. Return pointer to the static storage. */ -+const reiser4_key * reiser4_max_key(void) -+{ -+ return &MAXIMAL_KEY; -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: print symbolic name of key type */ -+static const char *type_name(unsigned int key_type/* key type */) -+{ -+ switch (key_type) { -+ case KEY_FILE_NAME_MINOR: -+ return "file name"; -+ case KEY_SD_MINOR: -+ return "stat data"; -+ case KEY_ATTR_NAME_MINOR: -+ return "attr name"; -+ case KEY_ATTR_BODY_MINOR: -+ return "attr body"; -+ case KEY_BODY_MINOR: -+ return "file body"; -+ default: -+ return "unknown"; -+ } -+} -+ -+/* debugging aid: print human readable information about key */ -+void reiser4_print_key(const char *prefix /* prefix to print */ , -+ const reiser4_key * key/* key to print */) -+{ -+ /* turn bold on */ -+ /* printf ("\033[1m"); */ -+ if (key == NULL) -+ printk("%s: null key\n", prefix); -+ else { -+ if (REISER4_LARGE_KEY) -+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix, -+ get_key_locality(key), -+ get_key_type(key), -+ get_key_ordering(key), -+ get_key_band(key), -+ get_key_objectid(key), get_key_offset(key)); -+ else -+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix, -+ get_key_locality(key), -+ get_key_type(key), -+ get_key_band(key), -+ get_key_objectid(key), get_key_offset(key)); -+ /* -+ * if this is a key of directory entry, try to decode part of -+ * a name stored in the key, and output it. -+ */ -+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) { -+ char buf[DE_NAME_BUF_LEN]; -+ char *c; -+ -+ c = buf; -+ c = reiser4_unpack_string(get_key_ordering(key), c); -+ reiser4_unpack_string(get_key_fulloid(key), c); -+ printk("[%s", buf); -+ if (is_longname_key(key)) -+ /* -+ * only part of the name is stored in the key. -+ */ -+ printk("...]\n"); -+ else { -+ /* -+ * whole name is stored in the key. 
-+ */ -+ reiser4_unpack_string(get_key_offset(key), buf); -+ printk("%s]\n", buf); -+ } -+ } else { -+ printk("[%s]\n", type_name(get_key_type(key))); -+ } -+ } -+ /* turn bold off */ -+ /* printf ("\033[m\017"); */ -+} -+ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/key.h linux-2.6.30/fs/reiser4/key.h ---- linux-2.6.30.orig/fs/reiser4/key.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/key.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,392 @@ -+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declarations of key-related data-structures and operations on keys. */ -+ -+#if !defined(__REISER4_KEY_H__) -+#define __REISER4_KEY_H__ -+ -+#include "dformat.h" -+#include "forward.h" -+#include "debug.h" -+ -+#include <linux/types.h> /* for __u?? */ -+ -+/* Operations on keys in reiser4 tree */ -+ -+/* No access to any of these fields shall be done except via a -+ wrapping macro/function, and that wrapping macro/function shall -+ convert to little endian order. Compare keys will consider cpu byte order. */ -+ -+/* A storage layer implementation difference between a regular unix file body -+ and its attributes is in the typedef below which causes all of the attributes -+ of a file to be near in key to all of the other attributes for all of the -+ files within that directory, and not near to the file itself. It is -+ interesting to consider whether this is the wrong approach, and whether there -+ should be no difference at all. For current usage patterns this choice is -+ probably the right one. */ -+ -+/* possible values for minor packing locality (4 bits required) */ -+typedef enum { -+ /* file name */ -+ KEY_FILE_NAME_MINOR = 0, -+ /* stat-data */ -+ KEY_SD_MINOR = 1, -+ /* file attribute name */ -+ KEY_ATTR_NAME_MINOR = 2, -+ /* file attribute value */ -+ KEY_ATTR_BODY_MINOR = 3, -+ /* file body (tail or extent) */ -+ KEY_BODY_MINOR = 4, -+} key_minor_locality; -+ -+/* Everything stored in the tree has a unique key, which means that the tree is -+ (logically) fully ordered by key. Physical order is determined by dynamic -+ heuristics that attempt to reflect key order when allocating available space, -+ and by the repacker. It is stylistically better to put aggregation -+ information into the key. Thus, if you want to segregate extents from tails, -+ it is better to give them distinct minor packing localities rather than -+ changing block_alloc.c to check the node type when deciding where to allocate -+ the node. -+ -+ The need to randomly displace new directories and large files disturbs this -+ symmetry unfortunately. However, it should be noted that this is a need that -+ is not clearly established given the existence of a repacker. Also, in our -+ current implementation tails have a different minor packing locality from -+ extents, and no files have both extents and tails, so maybe symmetry can be -+ had without performance cost after all. Symmetry is what we ship for now.... -+*/ -+ -+/* Arbitrary major packing localities can be assigned to objects using -+ the reiser4(filenameA/..packing<=some_number) system call. -+ -+ In reiser4, the creat() syscall creates a directory -+ -+ whose default flow (that which is referred to if the directory is -+ read as a file) is the traditional unix file body. 
-+ -+ whose directory plugin is the 'filedir' -+ -+ whose major packing locality is that of the parent of the object created. -+ -+ The static_stat item is a particular commonly used directory -+ compression (the one for normal unix files). -+ -+ The filedir plugin checks to see if the static_stat item exists. -+ There is a unique key for static_stat. If yes, then it uses the -+ static_stat item for all of the values that it contains. The -+ static_stat item contains a flag for each stat it contains which -+ indicates whether one should look outside the static_stat item for its -+ contents. -+*/ -+ -+/* offset of fields in reiser4_key. Value of each element of this enum -+ is index within key (thought as array of __u64's) where this field -+ is. */ -+typedef enum { -+ /* major "locale", aka dirid. Sits in 1st element */ -+ KEY_LOCALITY_INDEX = 0, -+ /* minor "locale", aka item type. Sits in 1st element */ -+ KEY_TYPE_INDEX = 0, -+ ON_LARGE_KEY(KEY_ORDERING_INDEX,) -+ /* "object band". Sits in 2nd element */ -+ KEY_BAND_INDEX, -+ /* objectid. Sits in 2nd element */ -+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX, -+ /* full objectid. Sits in 2nd element */ -+ KEY_FULLOID_INDEX = KEY_BAND_INDEX, -+ /* Offset. Sits in 3rd element */ -+ KEY_OFFSET_INDEX, -+ /* Name hash. Sits in 3rd element */ -+ KEY_HASH_INDEX = KEY_OFFSET_INDEX, -+ KEY_CACHELINE_END = KEY_OFFSET_INDEX, -+ KEY_LAST_INDEX -+} reiser4_key_field_index; -+ -+/* key in reiser4 internal "balanced" tree. It is just array of three -+ 64bit integers in disk byte order (little-endian by default). This -+ array is actually indexed by reiser4_key_field. Each __u64 within -+ this array is called "element". Logical key component encoded within -+ elements are called "fields". -+ -+ We declare this as union with second component dummy to suppress -+ inconvenient array<->pointer casts implied in C. */ -+union reiser4_key { -+ __le64 el[KEY_LAST_INDEX]; -+ int pad; -+}; -+ -+/* bitmasks showing where within reiser4_key particular key is stored. 
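Each key field in this header is described by a triple of element index, mask and shift; the DEFINE_KEY_FIELD() macro further down turns such a triple into a getter/setter pair. Hand-expanding the locality case shows what the macro generates, reduced here to a plain uint64_t element array without the little-endian conversion that get_key_el()/set_key_el() add (the toy_ names are ours, not the patch's):

    #include <stdint.h>

    #define TOY_LOCALITY_MASK  0xfffffffffffffff0ull /* upper 60 bits of el[0] */
    #define TOY_LOCALITY_SHIFT 4

    static uint64_t toy_get_key_locality(const uint64_t *el)
    {
        return (el[0] & TOY_LOCALITY_MASK) >> TOY_LOCALITY_SHIFT;
    }

    static void toy_set_key_locality(uint64_t *el, uint64_t loc)
    {
        /* clear the field, then or in the new value; as in the patch,
         * the caller must pass a value that fits in 60 bits */
        el[0] = (el[0] & ~TOY_LOCALITY_MASK) | (loc << TOY_LOCALITY_SHIFT);
    }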
*/ -+/* major locality occupies higher 60 bits of the first element */ -+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull -+ -+/* minor locality occupies lower 4 bits of the first element */ -+#define KEY_TYPE_MASK 0xfull -+ -+/* controversial band occupies higher 4 bits of the 2nd element */ -+#define KEY_BAND_MASK 0xf000000000000000ull -+ -+/* objectid occupies lower 60 bits of the 2nd element */ -+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull -+ -+/* full 64bit objectid*/ -+#define KEY_FULLOID_MASK 0xffffffffffffffffull -+ -+/* offset is just 3rd L.M.Nt itself */ -+#define KEY_OFFSET_MASK 0xffffffffffffffffull -+ -+/* ordering is whole second element */ -+#define KEY_ORDERING_MASK 0xffffffffffffffffull -+ -+/* how many bits key element should be shifted to left to get particular field -+ */ -+typedef enum { -+ KEY_LOCALITY_SHIFT = 4, -+ KEY_TYPE_SHIFT = 0, -+ KEY_BAND_SHIFT = 60, -+ KEY_OBJECTID_SHIFT = 0, -+ KEY_FULLOID_SHIFT = 0, -+ KEY_OFFSET_SHIFT = 0, -+ KEY_ORDERING_SHIFT = 0, -+} reiser4_key_field_shift; -+ -+static inline __u64 -+get_key_el(const reiser4_key * key, reiser4_key_field_index off) -+{ -+ assert("nikita-753", key != NULL); -+ assert("nikita-754", off < KEY_LAST_INDEX); -+ return le64_to_cpu(get_unaligned(&key->el[off])); -+} -+ -+static inline void -+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value) -+{ -+ assert("nikita-755", key != NULL); -+ assert("nikita-756", off < KEY_LAST_INDEX); -+ put_unaligned(cpu_to_le64(value), &key->el[off]); -+} -+ -+/* macro to define getter and setter functions for field F with type T */ -+#define DEFINE_KEY_FIELD(L, U, T) \ -+static inline T get_key_ ## L(const reiser4_key *key) \ -+{ \ -+ assert("nikita-750", key != NULL); \ -+ return (T) (get_key_el(key, KEY_ ## U ## _INDEX) & \ -+ KEY_ ## U ## _MASK) >> KEY_ ## U ## _SHIFT; \ -+} \ -+ \ -+static inline void set_key_ ## L(reiser4_key * key, T loc) \ -+{ \ -+ __u64 el; \ -+ \ -+ assert("nikita-752", key != NULL); \ -+ \ -+ el = get_key_el(key, KEY_ ## U ## _INDEX); \ -+ /* clear field bits in the key */ \ -+ el &= ~KEY_ ## U ## _MASK; \ -+ /* actually it should be \ -+ \ -+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \ -+ \ -+ but we trust user to never pass values that wouldn't fit \ -+ into field. Clearing extra bits is one operation, but this \ -+ function is time-critical. \ -+ But check this in assertion. 
*/ \ -+ assert("nikita-759", ((loc << KEY_ ## U ## _SHIFT) & \ -+ ~KEY_ ## U ## _MASK) == 0); \ -+ el |= (loc << KEY_ ## U ## _SHIFT); \ -+ set_key_el(key, KEY_ ## U ## _INDEX, el); \ -+} -+ -+typedef __u64 oid_t; -+ -+/* define get_key_locality(), set_key_locality() */ -+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t); -+/* define get_key_type(), set_key_type() */ -+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality); -+/* define get_key_band(), set_key_band() */ -+DEFINE_KEY_FIELD(band, BAND, __u64); -+/* define get_key_objectid(), set_key_objectid() */ -+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t); -+/* define get_key_fulloid(), set_key_fulloid() */ -+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t); -+/* define get_key_offset(), set_key_offset() */ -+DEFINE_KEY_FIELD(offset, OFFSET, __u64); -+#if (REISER4_LARGE_KEY) -+/* define get_key_ordering(), set_key_ordering() */ -+DEFINE_KEY_FIELD(ordering, ORDERING, __u64); -+#else -+static inline __u64 get_key_ordering(const reiser4_key * key) -+{ -+ return 0; -+} -+ -+static inline void set_key_ordering(reiser4_key * key, __u64 val) -+{ -+} -+#endif -+ -+/* key comparison result */ -+typedef enum { LESS_THAN = -1, /* if first key is less than second */ -+ EQUAL_TO = 0, /* if keys are equal */ -+ GREATER_THAN = +1 /* if first key is greater than second */ -+} cmp_t; -+ -+void reiser4_key_init(reiser4_key * key); -+ -+/* minimal possible key in the tree. Return pointer to the static storage. */ -+extern const reiser4_key *reiser4_min_key(void); -+extern const reiser4_key *reiser4_max_key(void); -+ -+/* helper macro for keycmp() */ -+#define KEY_DIFF(k1, k2, field) \ -+({ \ -+ typeof(get_key_ ## field(k1)) f1; \ -+ typeof(get_key_ ## field(k2)) f2; \ -+ \ -+ f1 = get_key_ ## field(k1); \ -+ f2 = get_key_ ## field(k2); \ -+ \ -+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \ -+}) -+ -+/* helper macro for keycmp() */ -+#define KEY_DIFF_EL(k1, k2, off) \ -+({ \ -+ __u64 e1; \ -+ __u64 e2; \ -+ \ -+ e1 = get_key_el(k1, off); \ -+ e2 = get_key_el(k2, off); \ -+ \ -+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \ -+}) -+ -+/* compare `k1' and `k2'. This function is a heart of "key allocation -+ policy". All you need to implement new policy is to add yet another -+ clause here. */ -+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2/* second key to compare */) -+{ -+ cmp_t result; -+ -+ /* -+ * This function is the heart of reiser4 tree-routines. Key comparison -+ * is among most heavily used operations in the file system. -+ */ -+ -+ assert("nikita-439", k1 != NULL); -+ assert("nikita-440", k2 != NULL); -+ -+ /* there is no actual branch here: condition is compile time constant -+ * and constant folding and propagation ensures that only one branch -+ * is actually compiled in. */ -+ -+ if (REISER4_PLANA_KEY_ALLOCATION) { -+ /* if physical order of fields in a key is identical -+ with logical order, we can implement key comparison -+ as three 64bit comparisons. */ -+ /* logical order of fields in plan-a: -+ locality->type->objectid->offset. 
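The comparison code below leans on that layout: once elements are read with le64_to_cpu(), higher-order fields sit in higher-order bits, so a single unsigned 64-bit compare orders keys by locality first and type second. A user-space sketch of the KEY_DIFF_EL idiom (names illustrative, not from the patch):

    #include <stdint.h>
    #include <stdio.h>

    typedef enum { LESS_THAN = -1, EQUAL_TO = 0, GREATER_THAN = +1 } cmp_t;

    /* the KEY_DIFF_EL three-way comparison as a plain function */
    static cmp_t diff_el(uint64_t e1, uint64_t e2)
    {
        return (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN);
    }

    int main(void)
    {
        uint64_t a = (5ull << 4) | 3; /* locality 5, type 3 */
        uint64_t b = (6ull << 4) | 0; /* locality 6, type 0 */
        /* locality dominates: a orders before b although a's type is larger */
        printf("%d\n", diff_el(a, b)); /* prints: -1 */
        return 0;
    }
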
*/ -+ /* compare locality and type at once */ -+ result = KEY_DIFF_EL(k1, k2, 0); -+ if (result == EQUAL_TO) { -+ /* compare objectid (and band if it's there) */ -+ result = KEY_DIFF_EL(k1, k2, 1); -+ /* compare offset */ -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, k2, 2); -+ if (REISER4_LARGE_KEY && result == EQUAL_TO) -+ result = KEY_DIFF_EL(k1, k2, 3); -+ } -+ } -+ } else if (REISER4_3_5_KEY_ALLOCATION) { -+ result = KEY_DIFF(k1, k2, locality); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF(k1, k2, objectid); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF(k1, k2, type); -+ if (result == EQUAL_TO) -+ result = KEY_DIFF(k1, k2, offset); -+ } -+ } -+ } else -+ impossible("nikita-441", "Unknown key allocation scheme!"); -+ return result; -+} -+ -+/* true if @k1 equals @k2 */ -+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2/* second key to compare */) -+{ -+ assert("nikita-1879", k1 != NULL); -+ assert("nikita-1880", k2 != NULL); -+ return !memcmp(k1, k2, sizeof *k1); -+} -+ -+/* true if @k1 is less than @k2 */ -+static inline int keylt(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2/* second key to compare */) -+{ -+ assert("nikita-1952", k1 != NULL); -+ assert("nikita-1953", k2 != NULL); -+ return keycmp(k1, k2) == LESS_THAN; -+} -+ -+/* true if @k1 is less than or equal to @k2 */ -+static inline int keyle(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2/* second key to compare */) -+{ -+ assert("nikita-1954", k1 != NULL); -+ assert("nikita-1955", k2 != NULL); -+ return keycmp(k1, k2) != GREATER_THAN; -+} -+ -+/* true if @k1 is greater than @k2 */ -+static inline int keygt(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2/* second key to compare */) -+{ -+ assert("nikita-1959", k1 != NULL); -+ assert("nikita-1960", k2 != NULL); -+ return keycmp(k1, k2) == GREATER_THAN; -+} -+ -+/* true if @k1 is greater than or equal to @k2 */ -+static inline int keyge(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2/* second key to compare */) -+{ -+ assert("nikita-1956", k1 != NULL); -+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched -+ * November 3: Laika */ -+ return keycmp(k1, k2) != LESS_THAN; -+} -+ -+static inline void prefetchkey(reiser4_key * key) -+{ -+ prefetch(key); -+ prefetch(&key->el[KEY_CACHELINE_END]); -+} -+ -+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) = -+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */ -+/* size of a buffer suitable to hold human readable key representation */ -+#define KEY_BUF_LEN (80) -+ -+#if REISER4_DEBUG -+extern void reiser4_print_key(const char *prefix, const reiser4_key * key); -+#else -+#define reiser4_print_key(p, k) noop -+#endif -+ -+/* __FS_REISERFS_KEY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/ktxnmgrd.c linux-2.6.30/fs/reiser4/ktxnmgrd.c ---- linux-2.6.30.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/ktxnmgrd.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,215 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Transaction manager daemon. */ -+ -+/* -+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is -+ * needed/important for the following reasons: -+ * -+ * 1. 
in reiser4 atom is not committed immediately when last transaction
-+ * handle closes, unless atom is either too old or too large (see
-+ * atom_should_commit()). This is done to avoid committing too frequently,
-+ * because:
-+ *
-+ * 2. sometimes we don't want to commit atom when closing last transaction
-+ * handle even if it is old and fat enough. For example, because we are at
-+ * this point under directory semaphore, and committing would stall all
-+ * accesses to this directory.
-+ *
-+ * ktxnmgrd bides its time sleeping on a condition variable. When it awakes,
-+ * either due to a (tunable) timeout or because it was explicitly woken up
-+ * by a call to ktxnmgrd_kick(), it scans the list of all atoms and commits
-+ * the eligible ones.
-+ *
-+ */
-+
-+#include "debug.h"
-+#include "txnmgr.h"
-+#include "tree.h"
-+#include "ktxnmgrd.h"
-+#include "super.h"
-+#include "reiser4.h"
-+
-+#include <linux/sched.h> /* for struct task_struct */
-+#include <linux/wait.h>
-+#include <linux/suspend.h>
-+#include <linux/kernel.h>
-+#include <linux/writeback.h>
-+#include <linux/kthread.h>
-+#include <linux/freezer.h>
-+
-+static int scan_mgr(struct super_block *);
-+
-+/*
-+ * change current->comm so that ps, top, and friends will see changed
-+ * state. This serves no useful purpose whatsoever, but also costs nothing.
-+ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
-+ */
-+#define set_comm(state) \
-+ snprintf(current->comm, sizeof(current->comm), \
-+ "%s:%s:%s", __FUNCTION__, (super)->s_id, (state))
-+
-+/**
-+ * ktxnmgrd - kernel txnmgr daemon
-+ * @arg: pointer to super block
-+ *
-+ * The background transaction manager daemon, started as a kernel thread
-+ * during reiser4 initialization.
-+ */
-+static int ktxnmgrd(void *arg)
-+{
-+ struct super_block *super;
-+ ktxnmgrd_context *ctx;
-+ txn_mgr *mgr;
-+ int done = 0;
-+
-+ super = arg;
-+ mgr = &get_super_private(super)->tmgr;
-+
-+ /*
-+ * do_fork() just copies task_struct into the new thread. ->fs_context
-+ * shouldn't be copied of course. This shouldn't be a problem for the
-+ * rest of the code though.
-+ */
-+ current->journal_info = NULL;
-+ ctx = mgr->daemon;
-+ while (1) {
-+ try_to_freeze();
-+ set_comm("wait");
-+ {
-+ DEFINE_WAIT(__wait);
-+
-+ prepare_to_wait(&ctx->wait, &__wait,
-+ TASK_INTERRUPTIBLE);
-+ if (kthread_should_stop())
-+ done = 1;
-+ else
-+ schedule_timeout(ctx->timeout);
-+ finish_wait(&ctx->wait, &__wait);
-+ }
-+ if (done)
-+ break;
-+ set_comm("run");
-+ spin_lock(&ctx->guard);
-+ /*
-+ * wait timed out or ktxnmgrd was woken up by explicit request
-+ * to commit something. Scan list of atoms in txnmgr and look
-+ * for too old atoms.
-+ */
-+ do {
-+ ctx->rescan = 0;
-+ spin_unlock(&ctx->guard);
-+ scan_mgr(super);
-+ spin_lock(&ctx->guard);
-+ if (ctx->rescan) {
-+ /*
-+ * the list could be modified while ctx
-+ * spinlock was released, we have to repeat
-+ * scanning from the beginning
-+ */
-+ break;
-+ }
-+ } while (ctx->rescan);
-+ spin_unlock(&ctx->guard);
-+ }
-+ return 0;
-+}
-+
-+#undef set_comm
-+
-+/**
-+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
-+ * @super: pointer to super block
-+ *
-+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
-+ * manager. Starts kernel txnmgr daemon. This is called on mount.
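For illustration, the daemon above follows the standard kthread shape: sleep with a timeout, wake early on an explicit kick, terminate when kthread_should_stop() turns true. A minimal, self-contained module sketch of the same pattern (all demo_* names are illustrative, not part of the patch):

    #include <linux/module.h>
    #include <linux/kthread.h>
    #include <linux/wait.h>
    #include <linux/jiffies.h>
    #include <linux/err.h>

    static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
    static struct task_struct *demo_task;
    static int demo_kick;

    static int demo_daemon(void *arg)
    {
        while (!kthread_should_stop()) {
            /* sleep until kicked, until the timeout elapses,
               or until kthread_stop() is called */
            wait_event_interruptible_timeout(demo_wait,
                    demo_kick || kthread_should_stop(), 5 * HZ);
            demo_kick = 0;
            /* periodic scan_mgr()-style work would go here */
        }
        return 0;
    }

    static int __init demo_init(void)
    {
        demo_task = kthread_run(demo_daemon, NULL, "demo_daemon");
        return IS_ERR(demo_task) ? PTR_ERR(demo_task) : 0;
    }

    static void __exit demo_exit(void)
    {
        kthread_stop(demo_task); /* wakes the thread, waits for exit */
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");

A ktxnmgrd_kick() analogue would just set demo_kick and call wake_up(&demo_wait); a real daemon would protect the flag, as ktxnmgrd does with ctx->guard.
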
-+ */ -+int reiser4_init_ktxnmgrd(struct super_block *super) -+{ -+ txn_mgr *mgr; -+ ktxnmgrd_context *ctx; -+ -+ mgr = &get_super_private(super)->tmgr; -+ -+ assert("zam-1014", mgr->daemon == NULL); -+ -+ ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get()); -+ if (!ctx) -+ return RETERR(-ENOMEM); -+ -+ assert("nikita-2442", ctx != NULL); -+ -+ init_waitqueue_head(&ctx->wait); -+ -+ /*kcond_init(&ctx->startup);*/ -+ spin_lock_init(&ctx->guard); -+ ctx->timeout = REISER4_TXNMGR_TIMEOUT; -+ ctx->rescan = 1; -+ mgr->daemon = ctx; -+ -+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd"); -+ if (IS_ERR(ctx->tsk)) { -+ int ret = PTR_ERR(ctx->tsk); -+ mgr->daemon = NULL; -+ kfree(ctx); -+ return RETERR(ret); -+ } -+ return 0; -+} -+ -+void ktxnmgrd_kick(txn_mgr *mgr) -+{ -+ assert("nikita-3234", mgr != NULL); -+ assert("nikita-3235", mgr->daemon != NULL); -+ wake_up(&mgr->daemon->wait); -+} -+ -+int is_current_ktxnmgrd(void) -+{ -+ return (get_current_super_private()->tmgr.daemon->tsk == current); -+} -+ -+/** -+ * scan_mgr - commit atoms which are to be committed -+ * @super: super block to commit atoms of -+ * -+ * Commits old atoms. -+ */ -+static int scan_mgr(struct super_block *super) -+{ -+ int ret; -+ reiser4_context ctx; -+ -+ init_stack_context(&ctx, super); -+ -+ ret = commit_some_atoms(&get_super_private(super)->tmgr); -+ -+ reiser4_exit_context(&ctx); -+ return ret; -+} -+ -+/** -+ * reiser4_done_ktxnmgrd - stop kernel thread and frees ktxnmgrd context -+ * @mgr: -+ * -+ * This is called on umount. Stops ktxnmgrd and free t -+ */ -+void reiser4_done_ktxnmgrd(struct super_block *super) -+{ -+ txn_mgr *mgr; -+ -+ mgr = &get_super_private(super)->tmgr; -+ assert("zam-1012", mgr->daemon != NULL); -+ -+ kthread_stop(mgr->daemon->tsk); -+ kfree(mgr->daemon); -+ mgr->daemon = NULL; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/ktxnmgrd.h linux-2.6.30/fs/reiser4/ktxnmgrd.h ---- linux-2.6.30.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/ktxnmgrd.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,52 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Transaction manager daemon. See ktxnmgrd.c for comments. */ -+ -+#ifndef __KTXNMGRD_H__ -+#define __KTXNMGRD_H__ -+ -+#include "txnmgr.h" -+ -+#include <linux/fs.h> -+#include <linux/wait.h> -+#include <linux/completion.h> -+#include <linux/spinlock.h> -+#include <asm/atomic.h> -+#include <linux/sched.h> /* for struct task_struct */ -+ -+/* in this structure all data necessary to start up, shut down and communicate -+ * with ktxnmgrd are kept. */ -+struct ktxnmgrd_context { -+ /* wait queue head on which ktxnmgrd sleeps */ -+ wait_queue_head_t wait; -+ /* spin lock protecting all fields of this structure */ -+ spinlock_t guard; -+ /* timeout of sleeping on ->wait */ -+ signed long timeout; -+ /* kernel thread running ktxnmgrd */ -+ struct task_struct *tsk; -+ /* list of all file systems served by this ktxnmgrd */ -+ struct list_head queue; -+ /* should ktxnmgrd repeat scanning of atoms? 
*/ -+ unsigned int rescan:1; -+}; -+ -+extern int reiser4_init_ktxnmgrd(struct super_block *); -+extern void reiser4_done_ktxnmgrd(struct super_block *); -+ -+extern void ktxnmgrd_kick(txn_mgr * mgr); -+extern int is_current_ktxnmgrd(void); -+ -+/* __KTXNMGRD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/lock.c linux-2.6.30/fs/reiser4/lock.c ---- linux-2.6.30.orig/fs/reiser4/lock.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/lock.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1237 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single -+ order. V4 balances the tree from the bottom up, and searches the tree from -+ the top down, and that is really the way we want it, so tradition won't work -+ for us. -+ -+ Instead we have two lock orderings, a high priority lock ordering, and a low -+ priority lock ordering. Each node in the tree has a lock in its znode. -+ -+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process -+ has a set (maybe empty) of already locked nodes ("process locked set"). Each -+ process may have a pending lock request to a node locked by another process. -+ Note: we lock and unlock, but do not transfer locks: it is possible -+ transferring locks instead would save some bus locking.... -+ -+ Deadlock occurs when we have a loop constructed from process locked sets and -+ lock request vectors. -+ -+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in -+ memory is extended with "znodes" with which we connect nodes with their left -+ and right neighbors using sibling pointers stored in the znodes. When we -+ perform balancing operations we often go from left to right and from right to -+ left. -+ -+ +-P1-+ +-P3-+ -+ |+--+| V1 |+--+| -+ ||N1|| -------> ||N3|| -+ |+--+| |+--+| -+ +----+ +----+ -+ ^ | -+ |V2 |V3 -+ | v -+ +---------P2---------+ -+ |+--+ +--+| -+ ||N2| -------- |N4|| -+ |+--+ +--+| -+ +--------------------+ -+ -+ We solve this by ensuring that only low priority processes lock in top to -+ bottom order and from right to left, and high priority processes lock from -+ bottom to top and left to right. -+ -+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and -+ kill those damn busy loops. -+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom -+ stage) cannot be ordered that way. There are no rules what nodes can belong -+ to the atom and what nodes cannot. We cannot define what is right or left -+ direction, what is top or bottom. We can take immediate parent or side -+ neighbor of one node, but nobody guarantees that, say, left neighbor node is -+ not a far right neighbor for other nodes from the same atom. It breaks -+ deadlock avoidance rules and hi-low priority locking cannot be applied for -+ atom locks. -+ -+ How does it help to avoid deadlocks ? -+ -+ Suppose we have a deadlock with n processes. Processes from one priority -+ class never deadlock because they take locks in one consistent -+ order. -+ -+ So, any possible deadlock loop must have low priority as well as high -+ priority processes. There are no other lock priority levels except low and -+ high. 
We know that any deadlock loop contains at least one node locked by a -+ low priority process and requested by a high priority process. If this -+ situation is caught and resolved it is sufficient to avoid deadlocks. -+ -+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION. -+ -+ The deadlock prevention algorithm is based on comparing -+ priorities of node owners (processes which keep znode locked) and -+ requesters (processes which want to acquire a lock on znode). We -+ implement a scheme where low-priority owners yield locks to -+ high-priority requesters. We created a signal passing system that -+ is used to ask low-priority processes to yield one or more locked -+ znodes. -+ -+ The condition when a znode needs to change its owners is described by the -+ following formula: -+ -+ ############################################# -+ # # -+ # (number of high-priority requesters) > 0 # -+ # AND # -+ # (numbers of high-priority owners) == 0 # -+ # # -+ ############################################# -+ -+ Note that a low-priority process delays node releasing if another -+ high-priority process owns this node. So, slightly more strictly speaking, -+ to have a deadlock capable cycle you must have a loop in which a high -+ priority process is waiting on a low priority process to yield a node, which -+ is slightly different from saying a high priority process is waiting on a -+ node owned by a low priority process. -+ -+ It is enough to avoid deadlocks if we prevent any low-priority process from -+ falling asleep if its locked set contains a node which satisfies the -+ deadlock condition. -+ -+ That condition is implicitly or explicitly checked in all places where new -+ high-priority requests may be added or removed from node request queue or -+ high-priority process takes or releases a lock on node. The main -+ goal of these checks is to never lose the moment when node becomes "has -+ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners -+ at that time. -+ -+ The information about received signals is stored in the per-process -+ structure (lock stack) and analyzed before a low-priority process goes to -+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes -+ sleeping process up and forces him to re-check lock status and received -+ signal info. If "must-yield-this-lock" signals were received the locking -+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code. -+ -+ V4 LOCKING DRAWBACKS -+ -+ If we have already balanced on one level, and we are propagating our changes -+ upward to a higher level, it could be very messy to surrender all locks on -+ the lower level because we put so much computational work into it, and -+ reverting them to their state before they were locked might be very complex. -+ We also don't want to acquire all locks before performing balancing because -+ that would either be almost as much work as the balancing, or it would be -+ too conservative and lock too much. We want balancing to be done only at -+ high priority. Yet, we might want to go to the left one node and use some -+ of its empty space... So we make one attempt at getting the node to the left -+ using try_lock, and if it fails we do without it, because we didn't really -+ need it, it was only a nice to have. -+ -+ LOCK STRUCTURES DESCRIPTION -+ -+ The following data structures are used in the reiser4 locking -+ implementation: -+ -+ All fields related to long-term locking are stored in znode->lock. -+ -+ The lock stack is a per thread object. 
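For illustration, the boxed yield condition above reduces to a two-counter predicate per znode; check_deadlock_condition() later in this file computes exactly this from the zlock counters. A stand-alone sketch (struct and names here are illustrative stand-ins):

    #include <stdio.h>

    struct zlock_counts {
        unsigned nr_hipri_requests; /* queued high-priority requesters */
        unsigned nr_hipri_owners;   /* current high-priority owners */
    };

    /* low-priority owners must yield when high-priority waiters exist
       and no high-priority owner is present */
    static int must_yield(const struct zlock_counts *z)
    {
        return z->nr_hipri_requests > 0 && z->nr_hipri_owners == 0;
    }

    int main(void)
    {
        struct zlock_counts waiting = { 1, 0 }, covered = { 1, 1 };
        printf("%d %d\n", must_yield(&waiting), must_yield(&covered));
        return 0; /* prints: 1 0 */
    }
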
It owns all znodes locked by the -+ thread. One znode may be locked by several threads in case of read lock or -+ one znode may be write locked by one thread several times. The special link -+ objects (lock handles) support n<->m relation between znodes and lock -+ owners. -+ -+ <Thread 1> <Thread 2> -+ -+ +---------+ +---------+ -+ | LS1 | | LS2 | -+ +---------+ +---------+ -+ ^ ^ -+ |---------------+ +----------+ -+ v v v v -+ +---------+ +---------+ +---------+ +---------+ -+ | LH1 | | LH2 | | LH3 | | LH4 | -+ +---------+ +---------+ +---------+ +---------+ -+ ^ ^ ^ ^ -+ | +------------+ | -+ v v v -+ +---------+ +---------+ +---------+ -+ | Z1 | | Z2 | | Z3 | -+ +---------+ +---------+ +---------+ -+ -+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The -+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and -+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode -+ Z1 is locked by only one thread, znode has only one lock handle LH1 on its -+ list, similar situation is for Z3 which is locked by the thread 2 only. Z2 -+ is locked (for read) twice by different threads and two lock handles are on -+ its list. Each lock handle represents a single relation of a locking of a -+ znode by a thread. Locking of a znode is an establishing of a locking -+ relation between the lock stack and the znode by adding of a new lock handle -+ to a list of lock handles, the lock stack. The lock stack links all lock -+ handles for all znodes locked by the lock stack. The znode list groups all -+ lock handles for all locks stacks which locked the znode. -+ -+ Yet another relation may exist between znode and lock owners. If lock -+ procedure cannot immediately take lock on an object it adds the lock owner -+ on special `requestors' list belongs to znode. That list represents a -+ queue of pending lock requests. Because one lock owner may request only -+ only one lock object at a time, it is a 1->n relation between lock objects -+ and a lock owner implemented as it is described above. Full information -+ (priority, pointers to lock and link objects) about each lock request is -+ stored in lock owner structure in `request' field. -+ -+ SHORT_TERM LOCKING -+ -+ This is a list of primitive operations over lock stacks / lock handles / -+ znodes and locking descriptions for them. -+ -+ 1. locking / unlocking which is done by two list insertion/deletion, one -+ to/from znode's list of lock handles, another one is to/from lock stack's -+ list of lock handles. The first insertion is protected by -+ znode->lock.guard spinlock. The list owned by the lock stack can be -+ modified only by thread who owns the lock stack and nobody else can -+ modify/read it. There is nothing to be protected by a spinlock or -+ something else. -+ -+ 2. adding/removing a lock request to/from znode requesters list. The rule is -+ that znode->lock.guard spinlock should be taken for this. -+ -+ 3. we can traverse list of lock handles and use references to lock stacks who -+ locked given znode if znode->lock.guard spinlock is taken. -+ -+ 4. If a lock stack is associated with a znode as a lock requestor or lock -+ owner its existence is guaranteed by znode->lock.guard spinlock. Some its -+ (lock stack's) fields should be protected from being accessed in parallel -+ by two or more threads. Please look at lock_stack structure definition -+ for the info how those fields are protected. */ -+ -+/* Znode lock and capturing intertwining. 
*/ -+/* In current implementation we capture formatted nodes before locking -+ them. Take a look on longterm lock znode, reiser4_try_capture() request -+ precedes locking requests. The longterm_lock_znode function unconditionally -+ captures znode before even checking of locking conditions. -+ -+ Another variant is to capture znode after locking it. It was not tested, but -+ at least one deadlock condition is supposed to be there. One thread has -+ locked a znode (Node-1) and calls reiser4_try_capture() for it. -+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state. -+ Second thread is a flushing thread, its current atom is the atom Node-1 -+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1 -+ is locked by the first thread. The described situation is a deadlock. */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "znode.h" -+#include "jnode.h" -+#include "tree.h" -+#include "plugin/node/node.h" -+#include "super.h" -+ -+#include <linux/spinlock.h> -+ -+#if REISER4_DEBUG -+static int request_is_deadlock_safe(znode * , znode_lock_mode, -+ znode_lock_request); -+#endif -+ -+/* Returns a lock owner associated with current thread */ -+lock_stack *get_current_lock_stack(void) -+{ -+ return &get_current_context()->stack; -+} -+ -+/* Wakes up all low priority owners informing them about possible deadlock */ -+static void wake_up_all_lopri_owners(znode * node) -+{ -+ lock_handle *handle; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ list_for_each_entry(handle, &node->lock.owners, owners_link) { -+ assert("nikita-1832", handle->node == node); -+ /* count this signal in owner->nr_signaled */ -+ if (!handle->signaled) { -+ handle->signaled = 1; -+ atomic_inc(&handle->owner->nr_signaled); -+ /* Wake up a single process */ -+ reiser4_wake_up(handle->owner); -+ } -+ } -+} -+ -+/* Adds a lock to a lock owner, which means creating a link to the lock and -+ putting the link into the two lists all links are on (the doubly linked list -+ that forms the lock_stack, and the doubly linked list of links attached -+ to a lock. 
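In other words, one lock handle is simultaneously an element of two independent doubly linked lists. A user-space sketch of that shape (the tiny list helpers stand in for the kernel's <linux/list.h>; names illustrative):

    #include <stdio.h>

    /* minimal stand-in for the kernel's struct list_head */
    struct list { struct list *next, *prev; };

    static void list_init(struct list *h) { h->next = h->prev = h; }

    static void list_add_tail(struct list *n, struct list *h)
    {
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
    }

    /* one handle, two memberships: the owner's ->locks list and the
       znode's ->owners list, the n<->m shape from the diagram above */
    struct handle {
        struct list locks_link;  /* chained off a lock_stack */
        struct list owners_link; /* chained off a znode */
    };

    int main(void)
    {
        struct list stack_locks, znode_owners;
        struct handle h;

        list_init(&stack_locks);
        list_init(&znode_owners);
        list_add_tail(&h.locks_link, &stack_locks);
        list_add_tail(&h.owners_link, &znode_owners);
        printf("%d\n", stack_locks.next == &h.locks_link &&
                       znode_owners.next == &h.owners_link);
        return 0; /* prints: 1 */
    }
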
-+*/ -+static inline void -+link_object(lock_handle * handle, lock_stack * owner, znode * node) -+{ -+ assert("jmacd-810", handle->owner == NULL); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ handle->owner = owner; -+ handle->node = node; -+ -+ assert("reiser4-4", -+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0)); -+ -+ /* add lock handle to the end of lock_stack's list of locks */ -+ list_add_tail(&handle->locks_link, &owner->locks); -+ ON_DEBUG(owner->nr_locks++); -+ reiser4_ctx_gfp_mask_set(); -+ -+ /* add lock handle to the head of znode's list of owners */ -+ list_add(&handle->owners_link, &node->lock.owners); -+ handle->signaled = 0; -+} -+ -+/* Breaks a relation between a lock and its owner */ -+static inline void unlink_object(lock_handle * handle) -+{ -+ assert("zam-354", handle->owner != NULL); -+ assert("nikita-1608", handle->node != NULL); -+ assert_spin_locked(&(handle->node->lock.guard)); -+ assert("nikita-1829", handle->owner == get_current_lock_stack()); -+ assert("reiser4-5", handle->owner->nr_locks > 0); -+ -+ /* remove lock handle from lock_stack's list of locks */ -+ list_del(&handle->locks_link); -+ ON_DEBUG(handle->owner->nr_locks--); -+ reiser4_ctx_gfp_mask_set(); -+ assert("reiser4-6", -+ ergo(list_empty_careful(&handle->owner->locks), -+ handle->owner->nr_locks == 0)); -+ /* remove lock handle from znode's list of owners */ -+ list_del(&handle->owners_link); -+ /* indicates that lock handle is free now */ -+ handle->node = NULL; -+#if REISER4_DEBUG -+ INIT_LIST_HEAD(&handle->locks_link); -+ INIT_LIST_HEAD(&handle->owners_link); -+ handle->owner = NULL; -+#endif -+} -+ -+/* Actually locks an object knowing that we are able to do this */ -+static void lock_object(lock_stack * owner) -+{ -+ struct lock_request *request; -+ znode *node; -+ -+ request = &owner->request; -+ node = request->node; -+ assert_spin_locked(&(node->lock.guard)); -+ if (request->mode == ZNODE_READ_LOCK) { -+ node->lock.nr_readers++; -+ } else { -+ /* check that we don't switched from read to write lock */ -+ assert("nikita-1840", node->lock.nr_readers <= 0); -+ /* We allow recursive locking; a node can be locked several -+ times for write by same process */ -+ node->lock.nr_readers--; -+ } -+ -+ link_object(request->handle, owner, node); -+ -+ if (owner->curpri) -+ node->lock.nr_hipri_owners++; -+} -+ -+/* Check for recursive write locking */ -+static int recursive(lock_stack * owner) -+{ -+ int ret; -+ znode *node; -+ lock_handle *lh; -+ -+ node = owner->request.node; -+ -+ /* Owners list is not empty for a locked node */ -+ assert("zam-314", !list_empty_careful(&node->lock.owners)); -+ assert("nikita-1841", owner == get_current_lock_stack()); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link); -+ ret = (lh->owner == owner); -+ -+ /* Recursive read locking should be done usual way */ -+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK); -+ /* mixing of read/write locks is not allowed */ -+ assert("zam-341", !ret || znode_is_wlocked(node)); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+/* Returns true if the lock is held by the calling thread. 
*/ -+int znode_is_any_locked(const znode * node) -+{ -+ lock_handle *handle; -+ lock_stack *stack; -+ int ret; -+ -+ if (!znode_is_locked(node)) -+ return 0; -+ -+ stack = get_current_lock_stack(); -+ -+ spin_lock_stack(stack); -+ -+ ret = 0; -+ -+ list_for_each_entry(handle, &stack->locks, locks_link) { -+ if (handle->node == node) { -+ ret = 1; -+ break; -+ } -+ } -+ -+ spin_unlock_stack(stack); -+ -+ return ret; -+} -+ -+#endif -+ -+/* Returns true if a write lock is held by the calling thread. */ -+int znode_is_write_locked(const znode * node) -+{ -+ lock_stack *stack; -+ lock_handle *handle; -+ -+ assert("jmacd-8765", node != NULL); -+ -+ if (!znode_is_wlocked(node)) -+ return 0; -+ -+ stack = get_current_lock_stack(); -+ -+ /* -+ * When znode is write locked, all owner handles point to the same lock -+ * stack. Get pointer to lock stack from the first lock handle from -+ * znode's owner list -+ */ -+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link); -+ -+ return (handle->owner == stack); -+} -+ -+/* This "deadlock" condition is the essential part of reiser4 locking -+ implementation. This condition is checked explicitly by calling -+ check_deadlock_condition() or implicitly in all places where znode lock -+ state (set of owners and request queue) is changed. Locking code is -+ designed to use this condition to trigger procedure of passing object from -+ low priority owner(s) to high priority one(s). -+ -+ The procedure results in passing an event (setting lock_handle->signaled -+ flag) and counting this event in nr_signaled field of owner's lock stack -+ object and wakeup owner's process. -+*/ -+static inline int check_deadlock_condition(znode * node) -+{ -+ assert_spin_locked(&(node->lock.guard)); -+ return node->lock.nr_hipri_requests > 0 -+ && node->lock.nr_hipri_owners == 0; -+} -+ -+static int check_livelock_condition(znode * node, znode_lock_mode mode) -+{ -+ zlock * lock = &node->lock; -+ -+ return mode == ZNODE_READ_LOCK && -+ lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0; -+} -+ -+/* checks lock/request compatibility */ -+static int can_lock_object(lock_stack * owner) -+{ -+ znode *node = owner->request.node; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ /* See if the node is disconnected. */ -+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING))) -+ return RETERR(-EINVAL); -+ -+ /* Do not ever try to take a lock if we are going in low priority -+ direction and a node have a high priority request without high -+ priority owners. */ -+ if (unlikely(!owner->curpri && check_deadlock_condition(node))) -+ return RETERR(-E_REPEAT); -+ if (unlikely(owner->curpri && -+ check_livelock_condition(node, owner->request.mode))) -+ return RETERR(-E_REPEAT); -+ if (unlikely(!is_lock_compatible(node, owner->request.mode))) -+ return RETERR(-E_REPEAT); -+ return 0; -+} -+ -+/* Setting of a high priority to the process. It clears "signaled" flags -+ because znode locked by high-priority process can't satisfy our "deadlock -+ condition". */ -+static void set_high_priority(lock_stack * owner) -+{ -+ assert("nikita-1846", owner == get_current_lock_stack()); -+ /* Do nothing if current priority is already high */ -+ if (!owner->curpri) { -+ /* We don't need locking for owner->locks list, because, this -+ * function is only called with the lock stack of the current -+ * thread, and no other thread can play with owner->locks list -+ * and/or change ->node pointers of lock handles in this list. -+ * -+ * (Interrupts also are not involved.) 
-+ */ -+ lock_handle *item = list_entry(owner->locks.next, lock_handle, -+ locks_link); -+ while (&owner->locks != &item->locks_link) { -+ znode *node = item->node; -+ -+ spin_lock_zlock(&node->lock); -+ -+ node->lock.nr_hipri_owners++; -+ -+ /* we can safely set signaled to zero, because -+ previous statement (nr_hipri_owners ++) guarantees -+ that signaled will be never set again. */ -+ item->signaled = 0; -+ spin_unlock_zlock(&node->lock); -+ -+ item = list_entry(item->locks_link.next, lock_handle, -+ locks_link); -+ } -+ owner->curpri = 1; -+ atomic_set(&owner->nr_signaled, 0); -+ } -+} -+ -+/* Sets a low priority to the process. */ -+static void set_low_priority(lock_stack * owner) -+{ -+ assert("nikita-3075", owner == get_current_lock_stack()); -+ /* Do nothing if current priority is already low */ -+ if (owner->curpri) { -+ /* scan all locks (lock handles) held by @owner, which is -+ actually current thread, and check whether we are reaching -+ deadlock possibility anywhere. -+ */ -+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, -+ locks_link); -+ while (&owner->locks != &handle->locks_link) { -+ znode *node = handle->node; -+ spin_lock_zlock(&node->lock); -+ /* this thread just was hipri owner of @node, so -+ nr_hipri_owners has to be greater than zero. */ -+ assert("nikita-1835", node->lock.nr_hipri_owners > 0); -+ node->lock.nr_hipri_owners--; -+ /* If we have deadlock condition, adjust a nr_signaled -+ field. It is enough to set "signaled" flag only for -+ current process, other low-pri owners will be -+ signaled and waken up after current process unlocks -+ this object and any high-priority requestor takes -+ control. */ -+ if (check_deadlock_condition(node) -+ && !handle->signaled) { -+ handle->signaled = 1; -+ atomic_inc(&owner->nr_signaled); -+ } -+ spin_unlock_zlock(&node->lock); -+ handle = list_entry(handle->locks_link.next, -+ lock_handle, locks_link); -+ } -+ owner->curpri = 0; -+ } -+} -+ -+static void remove_lock_request(lock_stack * requestor) -+{ -+ zlock * lock = &requestor->request.node->lock; -+ -+ if (requestor->curpri) { -+ assert("nikita-1838", lock->nr_hipri_requests > 0); -+ lock->nr_hipri_requests--; -+ if (requestor->request.mode == ZNODE_WRITE_LOCK) -+ lock->nr_hipri_write_requests--; -+ } -+ list_del(&requestor->requestors_link); -+} -+ -+static void invalidate_all_lock_requests(znode * node) -+{ -+ lock_stack *requestor, *tmp; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, -+ requestors_link) { -+ remove_lock_request(requestor); -+ requestor->request.ret_code = -EINVAL; -+ reiser4_wake_up(requestor); -+ requestor->request.mode = ZNODE_NO_LOCK; -+ } -+} -+ -+static void dispatch_lock_requests(znode * node) -+{ -+ lock_stack *requestor, *tmp; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, -+ requestors_link) { -+ if (znode_is_write_locked(node)) -+ break; -+ if (!can_lock_object(requestor)) { -+ lock_object(requestor); -+ remove_lock_request(requestor); -+ requestor->request.ret_code = 0; -+ reiser4_wake_up(requestor); -+ requestor->request.mode = ZNODE_NO_LOCK; -+ } -+ } -+} -+ -+/* release long-term lock, acquired by longterm_lock_znode() */ -+void longterm_unlock_znode(lock_handle * handle) -+{ -+ znode *node = handle->node; -+ lock_stack *oldowner = handle->owner; -+ int hipri; -+ int readers; -+ int rdelta; -+ int youdie; -+ -+ /* -+ * this is time-critical and highly optimized code. 
Modify carefully. -+ */ -+ -+ assert("jmacd-1021", handle != NULL); -+ assert("jmacd-1022", handle->owner != NULL); -+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode)); -+ -+ assert("zam-130", oldowner == get_current_lock_stack()); -+ -+ LOCK_CNT_DEC(long_term_locked_znode); -+ -+ /* -+ * to minimize amount of operations performed under lock, pre-compute -+ * all variables used within critical section. This makes code -+ * obscure. -+ */ -+ -+ /* was this lock of hi or lo priority */ -+ hipri = oldowner->curpri ? 1 : 0; -+ /* number of readers */ -+ readers = node->lock.nr_readers; -+ /* +1 if write lock, -1 if read lock */ -+ rdelta = (readers > 0) ? -1 : +1; -+ /* true if node is to die and write lock is released */ -+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0); -+ -+ spin_lock_zlock(&node->lock); -+ -+ assert("zam-101", znode_is_locked(node)); -+ -+ /* Adjust a number of high priority owners of this lock */ -+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri); -+ node->lock.nr_hipri_owners -= hipri; -+ -+ /* Handle znode deallocation on last write-lock release. */ -+ if (znode_is_wlocked_once(node)) { -+ if (youdie) { -+ forget_znode(handle); -+ assert("nikita-2191", znode_invariant(node)); -+ zput(node); -+ return; -+ } -+ } -+ -+ if (handle->signaled) -+ atomic_dec(&oldowner->nr_signaled); -+ -+ /* Unlocking means owner<->object link deletion */ -+ unlink_object(handle); -+ -+ /* This is enough to be sure whether an object is completely -+ unlocked. */ -+ node->lock.nr_readers += rdelta; -+ -+ /* If the node is locked it must have an owners list. Likewise, if -+ the node is unlocked it must have an empty owners list. */ -+ assert("zam-319", equi(znode_is_locked(node), -+ !list_empty_careful(&node->lock.owners))); -+ -+#if REISER4_DEBUG -+ if (!znode_is_locked(node)) -+ ++node->times_locked; -+#endif -+ -+ /* If there are pending lock requests we wake up a requestor */ -+ if (!znode_is_wlocked(node)) -+ dispatch_lock_requests(node); -+ if (check_deadlock_condition(node)) -+ wake_up_all_lopri_owners(node); -+ spin_unlock_zlock(&node->lock); -+ -+ /* minus one reference from handle->node */ -+ assert("nikita-2190", znode_invariant(node)); -+ ON_DEBUG(check_lock_data()); -+ ON_DEBUG(check_lock_node_data(node)); -+ zput(node); -+} -+ -+/* final portion of longterm-lock */ -+static int -+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode) -+{ -+ znode *node = owner->request.node; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ /* If we broke with (ok == 0) it means we can_lock, now do it. */ -+ if (ok == 0) { -+ lock_object(owner); -+ owner->request.mode = 0; -+ /* count a reference from lockhandle->node -+ -+ znode was already referenced at the entry to this function, -+ hence taking spin-lock here is not necessary (see comment -+ in the zref()). -+ */ -+ zref(node); -+ -+ LOCK_CNT_INC(long_term_locked_znode); -+ } -+ spin_unlock_zlock(&node->lock); -+ ON_DEBUG(check_lock_data()); -+ ON_DEBUG(check_lock_node_data(node)); -+ return ok; -+} -+ -+/* -+ * version of longterm_znode_lock() optimized for the most common case: read -+ * lock without any special flags. This is the kind of lock that any tree -+ * traversal takes on the root node of the tree, which is very frequent. 
-+ */ -+static int longterm_lock_tryfast(lock_stack * owner) -+{ -+ int result; -+ znode *node; -+ zlock *lock; -+ -+ node = owner->request.node; -+ lock = &node->lock; -+ -+ assert("nikita-3340", reiser4_schedulable()); -+ assert("nikita-3341", request_is_deadlock_safe(node, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_LOPRI)); -+ spin_lock_zlock(lock); -+ result = can_lock_object(owner); -+ spin_unlock_zlock(lock); -+ -+ if (likely(result != -EINVAL)) { -+ spin_lock_znode(node); -+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0); -+ spin_unlock_znode(node); -+ spin_lock_zlock(lock); -+ if (unlikely(result != 0)) { -+ owner->request.mode = 0; -+ } else { -+ result = can_lock_object(owner); -+ if (unlikely(result == -E_REPEAT)) { -+ /* fall back to longterm_lock_znode() */ -+ spin_unlock_zlock(lock); -+ return 1; -+ } -+ } -+ return lock_tail(owner, result, ZNODE_READ_LOCK); -+ } else -+ return 1; -+} -+ -+/* locks given lock object */ -+int longterm_lock_znode( -+ /* local link object (allocated by lock owner -+ * thread, usually on its own stack) */ -+ lock_handle * handle, -+ /* znode we want to lock. */ -+ znode * node, -+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */ -+ znode_lock_mode mode, -+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes -+ description. */ -+ znode_lock_request request) { -+ int ret; -+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0; -+ int non_blocking = 0; -+ int has_atom; -+ txn_capture cap_flags; -+ zlock *lock; -+ txn_handle *txnh; -+ tree_level level; -+ -+ /* Get current process context */ -+ lock_stack *owner = get_current_lock_stack(); -+ -+ /* Check that the lock handle is initialized and isn't already being -+ * used. */ -+ assert("jmacd-808", handle->owner == NULL); -+ assert("nikita-3026", reiser4_schedulable()); -+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request)); -+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0); -+ /* long term locks are not allowed in the VM contexts (->writepage(), -+ * prune_{d,i}cache()). -+ * -+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode -+ * bug caused by d_splice_alias() only working for directories. -+ */ -+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0)); -+ assert("zam-1055", mode != ZNODE_NO_LOCK); -+ -+ cap_flags = 0; -+ if (request & ZNODE_LOCK_NONBLOCK) { -+ cap_flags |= TXN_CAPTURE_NONBLOCKING; -+ non_blocking = 1; -+ } -+ -+ if (request & ZNODE_LOCK_DONT_FUSE) -+ cap_flags |= TXN_CAPTURE_DONT_FUSE; -+ -+ /* If we are changing our process priority we must adjust a number -+ of high priority owners for each znode that we already lock */ -+ if (hipri) { -+ set_high_priority(owner); -+ } else { -+ set_low_priority(owner); -+ } -+ -+ level = znode_get_level(node); -+ -+ /* Fill request structure with our values. */ -+ owner->request.mode = mode; -+ owner->request.handle = handle; -+ owner->request.node = node; -+ -+ txnh = get_current_context()->trans; -+ lock = &node->lock; -+ -+ if (mode == ZNODE_READ_LOCK && request == 0) { -+ ret = longterm_lock_tryfast(owner); -+ if (ret <= 0) -+ return ret; -+ } -+ -+ has_atom = (txnh->atom != NULL); -+ -+ /* Synchronize on node's zlock guard lock. */ -+ spin_lock_zlock(lock); -+ -+ if (znode_is_locked(node) && -+ mode == ZNODE_WRITE_LOCK && recursive(owner)) -+ return lock_tail(owner, 0, mode); -+ -+ for (;;) { -+ /* Check the lock's availability: if it is unavaiable we get -+ E_REPEAT, 0 indicates "can_lock", otherwise the node is -+ invalid. 
*/ -+ ret = can_lock_object(owner); -+ -+ if (unlikely(ret == -EINVAL)) { -+ /* @node is dying. Leave it alone. */ -+ break; -+ } -+ -+ if (unlikely(ret == -E_REPEAT && non_blocking)) { -+ /* either locking of @node by the current thread will -+ * lead to the deadlock, or lock modes are -+ * incompatible. */ -+ break; -+ } -+ -+ assert("nikita-1844", (ret == 0) -+ || ((ret == -E_REPEAT) && !non_blocking)); -+ /* If we can get the lock... Try to capture first before -+ taking the lock. */ -+ -+ /* first handle commonest case where node and txnh are already -+ * in the same atom. */ -+ /* safe to do without taking locks, because: -+ * -+ * 1. read of aligned word is atomic with respect to writes to -+ * this word -+ * -+ * 2. false negatives are handled in reiser4_try_capture(). -+ * -+ * 3. false positives are impossible. -+ * -+ * PROOF: left as an exercise to the curious reader. -+ * -+ * Just kidding. Here is one: -+ * -+ * At the time T0 txnh->atom is stored in txnh_atom. -+ * -+ * At the time T1 node->atom is stored in node_atom. -+ * -+ * At the time T2 we observe that -+ * -+ * txnh_atom != NULL && node_atom == txnh_atom. -+ * -+ * Imagine that at this moment we acquire node and txnh spin -+ * lock in this order. Suppose that under spin lock we have -+ * -+ * node->atom != txnh->atom, (S1) -+ * -+ * at the time T3. -+ * -+ * txnh->atom != NULL still, because txnh is open by the -+ * current thread. -+ * -+ * Suppose node->atom == NULL, that is, node was un-captured -+ * between T1, and T3. But un-capturing of formatted node is -+ * always preceded by the call to reiser4_invalidate_lock(), -+ * which marks znode as JNODE_IS_DYING under zlock spin -+ * lock. Contradiction, because can_lock_object() above checks -+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3. -+ * -+ * Suppose that node->atom != node_atom, that is, atom, node -+ * belongs to was fused into another atom: node_atom was fused -+ * into node->atom. Atom of txnh was equal to node_atom at T2, -+ * which means that under spin lock, txnh->atom == node->atom, -+ * because txnh->atom can only follow fusion -+ * chain. Contradicts S1. -+ * -+ * The same for hypothesis txnh->atom != txnh_atom. Hence, -+ * node->atom == node_atom == txnh_atom == txnh->atom. Again -+ * contradicts S1. Hence S1 is false. QED. -+ * -+ */ -+ -+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) { -+ ; -+ } else { -+ /* -+ * unlock zlock spin lock here. It is possible for -+ * longterm_unlock_znode() to sneak in here, but there -+ * is no harm: reiser4_invalidate_lock() will mark znode -+ * as JNODE_IS_DYING and this will be noted by -+ * can_lock_object() below. -+ */ -+ spin_unlock_zlock(lock); -+ spin_lock_znode(node); -+ ret = reiser4_try_capture(ZJNODE(node), mode, -+ cap_flags); -+ spin_unlock_znode(node); -+ spin_lock_zlock(lock); -+ if (unlikely(ret != 0)) { -+ /* In the failure case, the txnmgr releases -+ the znode's lock (or in some cases, it was -+ released a while ago). There's no need to -+ reacquire it so we should return here, -+ avoid releasing the lock. */ -+ owner->request.mode = 0; -+ break; -+ } -+ -+ /* Check the lock's availability again -- this is -+ because under some circumstances the capture code -+ has to release and reacquire the znode spinlock. */ -+ ret = can_lock_object(owner); -+ } -+ -+ /* This time, a return of (ret == 0) means we can lock, so we -+ should break out of the loop. */ -+ if (likely(ret != -E_REPEAT || non_blocking)) -+ break; -+ -+ /* Lock is unavailable, we have to wait. 
*/ -+ ret = reiser4_prepare_to_sleep(owner); -+ if (unlikely(ret != 0)) -+ break; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ if (hipri) { -+ /* If we are going in high priority direction then -+ increase high priority requests counter for the -+ node */ -+ lock->nr_hipri_requests++; -+ if (mode == ZNODE_WRITE_LOCK) -+ lock->nr_hipri_write_requests++; -+ /* If there are no high priority owners for a node, -+ then immediately wake up low priority owners, so -+ they can detect possible deadlock */ -+ if (lock->nr_hipri_owners == 0) -+ wake_up_all_lopri_owners(node); -+ } -+ list_add_tail(&owner->requestors_link, &lock->requestors); -+ -+ /* Ok, here we have prepared a lock request, so unlock -+ a znode ... */ -+ spin_unlock_zlock(lock); -+ /* ... and sleep */ -+ reiser4_go_to_sleep(owner); -+ if (owner->request.mode == ZNODE_NO_LOCK) -+ goto request_is_done; -+ spin_lock_zlock(lock); -+ if (owner->request.mode == ZNODE_NO_LOCK) { -+ spin_unlock_zlock(lock); -+request_is_done: -+ if (owner->request.ret_code == 0) { -+ LOCK_CNT_INC(long_term_locked_znode); -+ zref(node); -+ } -+ return owner->request.ret_code; -+ } -+ remove_lock_request(owner); -+ } -+ -+ return lock_tail(owner, ret, mode); -+} -+ -+/* lock object invalidation means changing of lock object state to `INVALID' -+ and waiting for all other processes to cancel theirs lock requests. */ -+void reiser4_invalidate_lock(lock_handle * handle /* path to lock -+ * owner and lock -+ * object is being -+ * invalidated. */ ) -+{ -+ znode *node = handle->node; -+ lock_stack *owner = handle->owner; -+ -+ assert("zam-325", owner == get_current_lock_stack()); -+ assert("zam-103", znode_is_write_locked(node)); -+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED)); -+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED)); -+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("nikita-3097", znode_is_wlocked_once(node)); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ if (handle->signaled) -+ atomic_dec(&owner->nr_signaled); -+ -+ ZF_SET(node, JNODE_IS_DYING); -+ unlink_object(handle); -+ node->lock.nr_readers = 0; -+ -+ invalidate_all_lock_requests(node); -+ spin_unlock_zlock(&node->lock); -+} -+ -+/* Initializes lock_stack. */ -+void init_lock_stack(lock_stack * owner /* pointer to -+ * allocated -+ * structure. */ ) -+{ -+ INIT_LIST_HEAD(&owner->locks); -+ INIT_LIST_HEAD(&owner->requestors_link); -+ spin_lock_init(&owner->sguard); -+ owner->curpri = 1; -+ init_waitqueue_head(&owner->wait); -+} -+ -+/* Initializes lock object. */ -+void reiser4_init_lock(zlock * lock /* pointer on allocated -+ * uninitialized lock object -+ * structure. */ ) -+{ -+ memset(lock, 0, sizeof(zlock)); -+ spin_lock_init(&lock->guard); -+ INIT_LIST_HEAD(&lock->requestors); -+ INIT_LIST_HEAD(&lock->owners); -+} -+ -+/* Transfer a lock handle (presumably so that variables can be moved between -+ stack and heap locations). */ -+static void -+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old) -+{ -+ znode *node = old->node; -+ lock_stack *owner = old->owner; -+ int signaled; -+ -+ /* locks_list, modified by link_object() is not protected by -+ anything. This is valid because only current thread ever modifies -+ locks_list of its lock_stack. 
-+ */ -+ assert("nikita-1827", owner == get_current_lock_stack()); -+ assert("nikita-1831", new->owner == NULL); -+ -+ spin_lock_zlock(&node->lock); -+ -+ signaled = old->signaled; -+ if (unlink_old) { -+ unlink_object(old); -+ } else { -+ if (node->lock.nr_readers > 0) { -+ node->lock.nr_readers += 1; -+ } else { -+ node->lock.nr_readers -= 1; -+ } -+ if (signaled) -+ atomic_inc(&owner->nr_signaled); -+ if (owner->curpri) -+ node->lock.nr_hipri_owners += 1; -+ LOCK_CNT_INC(long_term_locked_znode); -+ -+ zref(node); -+ } -+ link_object(new, owner, node); -+ new->signaled = signaled; -+ -+ spin_unlock_zlock(&node->lock); -+} -+ -+void move_lh(lock_handle * new, lock_handle * old) -+{ -+ move_lh_internal(new, old, /*unlink_old */ 1); -+} -+ -+void copy_lh(lock_handle * new, lock_handle * old) -+{ -+ move_lh_internal(new, old, /*unlink_old */ 0); -+} -+ -+/* after getting -E_DEADLOCK we unlock znodes until this function returns false -+ */ -+int reiser4_check_deadlock(void) -+{ -+ lock_stack *owner = get_current_lock_stack(); -+ return atomic_read(&owner->nr_signaled) != 0; -+} -+ -+/* Before going to sleep we re-check "release lock" requests which might come -+ from threads with hi-pri lock priorities. */ -+int reiser4_prepare_to_sleep(lock_stack * owner) -+{ -+ assert("nikita-1847", owner == get_current_lock_stack()); -+ -+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are -+ * counted in nr_signaled */ -+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) { -+ assert("zam-959", !owner->curpri); -+ return RETERR(-E_DEADLOCK); -+ } -+ return 0; -+} -+ -+/* Wakes up a single thread */ -+void __reiser4_wake_up(lock_stack * owner) -+{ -+ atomic_set(&owner->wakeup, 1); -+ wake_up(&owner->wait); -+} -+ -+/* Puts a thread to sleep */ -+void reiser4_go_to_sleep(lock_stack * owner) -+{ -+ /* Well, we might sleep here, so holding of any spinlocks is no-no */ -+ assert("nikita-3027", reiser4_schedulable()); -+ -+ wait_event(owner->wait, atomic_read(&owner->wakeup)); -+ atomic_set(&owner->wakeup, 0); -+} -+ -+int lock_stack_isclean(lock_stack * owner) -+{ -+ if (list_empty_careful(&owner->locks)) { -+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+ -+/* -+ * debugging functions -+ */ -+ -+static void list_check(struct list_head *head) -+{ -+ struct list_head *pos; -+ -+ list_for_each(pos, head) -+ assert("", (pos->prev != NULL && pos->next != NULL && -+ pos->prev->next == pos && pos->next->prev == pos)); -+} -+ -+/* check consistency of locking data-structures hanging of the @stack */ -+static void check_lock_stack(lock_stack * stack) -+{ -+ spin_lock_stack(stack); -+ /* check that stack->locks is not corrupted */ -+ list_check(&stack->locks); -+ spin_unlock_stack(stack); -+} -+ -+/* check consistency of locking data structures */ -+void check_lock_data(void) -+{ -+ check_lock_stack(&get_current_context()->stack); -+} -+ -+/* check consistency of locking data structures for @node */ -+void check_lock_node_data(znode * node) -+{ -+ spin_lock_zlock(&node->lock); -+ list_check(&node->lock.owners); -+ list_check(&node->lock.requestors); -+ spin_unlock_zlock(&node->lock); -+} -+ -+/* check that given lock request is dead lock safe. This check is, of course, -+ * not exhaustive. 
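reiser4_go_to_sleep() and __reiser4_wake_up() above pair the wait queue with a wakeup flag, so a wakeup issued before the sleeper actually blocks is not lost. The same discipline in user space, as a pthread sketch (names illustrative; link with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t c = PTHREAD_COND_INITIALIZER;
    static int wakeup; /* like owner->wakeup */

    static void *sleeper(void *arg)
    {
        pthread_mutex_lock(&m);
        while (!wakeup)             /* like wait_event(owner->wait, ...) */
            pthread_cond_wait(&c, &m);
        wakeup = 0;
        pthread_mutex_unlock(&m);
        puts("woken");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, sleeper, NULL);
        pthread_mutex_lock(&m);
        wakeup = 1;                 /* like atomic_set(&owner->wakeup, 1) */
        pthread_cond_signal(&c);    /* like wake_up(&owner->wait) */
        pthread_mutex_unlock(&m);
        pthread_join(t, NULL);
        return 0;
    }

Because the flag is set before signalling, a sleeper that arrives late sees it and never blocks, which is the same reason the reiser4 pair works with a bare wait queue.
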
*/ -+static int -+request_is_deadlock_safe(znode * node, znode_lock_mode mode, -+ znode_lock_request request) -+{ -+ lock_stack *owner; -+ -+ owner = get_current_lock_stack(); -+ /* -+ * check that hipri lock request is not issued when there are locked -+ * nodes at the higher levels. -+ */ -+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) && -+ znode_get_level(node) != 0) { -+ lock_handle *item; -+ -+ list_for_each_entry(item, &owner->locks, locks_link) { -+ znode *other; -+ -+ other = item->node; -+ -+ if (znode_get_level(other) == 0) -+ continue; -+ if (znode_get_level(other) > znode_get_level(node)) -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+#endif -+ -+/* return pointer to static storage with name of lock_mode. For -+ debugging */ -+const char *lock_mode_name(znode_lock_mode lock/* lock mode to get name of */) -+{ -+ if (lock == ZNODE_READ_LOCK) -+ return "read"; -+ else if (lock == ZNODE_WRITE_LOCK) -+ return "write"; -+ else { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", lock); -+ return buf; -+ } -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 79 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/lock.h linux-2.6.30/fs/reiser4/lock.h ---- linux-2.6.30.orig/fs/reiser4/lock.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/lock.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,250 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Long term locking data structures. See lock.c for details. */ -+ -+#ifndef __LOCK_H__ -+#define __LOCK_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/node/node.h" -+#include "txnmgr.h" -+#include "readahead.h" -+ -+#include <linux/types.h> -+#include <linux/spinlock.h> -+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */ -+#include <asm/atomic.h> -+#include <linux/wait.h> -+ -+/* Per-znode lock object */ -+struct zlock { -+ spinlock_t guard; -+ /* The number of readers if positive; the number of recursively taken -+ write locks if negative. Protected by zlock spin lock. 
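For illustration, that single-counter sign convention can be exercised in isolation; this sketch keeps the encoding but ignores the owner check that recursive() in lock.c performs (names illustrative):

    #include <stdio.h>

    /* zlock's nr_readers convention: > 0 counts readers,
       < 0 counts recursive write locks held by one thread */
    static int nr_readers;

    static int rlock(void)
    {
        if (nr_readers < 0)
            return -1;    /* write-locked: readers must wait */
        nr_readers++;
        return 0;
    }

    static int wlock(void)
    {
        if (nr_readers > 0)
            return -1;    /* readers present: writer must wait */
        nr_readers--;     /* 0 -> -1 first lock, -1 -> -2 recursion */
        return 0;
    }

    int main(void)
    {
        int w1 = wlock(); /* 0: first write lock */
        int w2 = wlock(); /* 0: recursive write lock */
        int r1 = rlock(); /* -1: refused while write-locked */
        printf("%d %d %d depth=%d\n", w1, w2, r1, -nr_readers);
        return 0; /* prints: 0 0 -1 depth=2 */
    }
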
*/ -+ int nr_readers; -+ /* A number of processes (lock_stacks) that have this object -+ locked with high priority */ -+ unsigned nr_hipri_owners; -+ /* A number of attempts to lock znode in high priority direction */ -+ unsigned nr_hipri_requests; -+ /* A linked list of lock_handle objects that contains pointers -+ for all lock_stacks which have this lock object locked */ -+ unsigned nr_hipri_write_requests; -+ struct list_head owners; -+ /* A linked list of lock_stacks that wait for this lock */ -+ struct list_head requestors; -+}; -+ -+static inline void spin_lock_zlock(zlock *lock) -+{ -+ /* check that zlock is not locked */ -+ assert("", LOCK_CNT_NIL(spin_locked_zlock)); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ spin_lock(&lock->guard); -+ -+ LOCK_CNT_INC(spin_locked_zlock); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_zlock(zlock *lock) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_zlock); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&lock->guard); -+} -+ -+#define lock_is_locked(lock) ((lock)->nr_readers != 0) -+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0) -+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0) -+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1) -+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0) -+#define lock_mode_compatible(lock, mode) \ -+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \ -+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock))) -+ -+/* Since we have R/W znode locks we need additional bidirectional `link' -+ objects to implement n<->m relationship between lock owners and lock -+ objects. We call them `lock handles'. -+ -+ Locking: see lock.c/"SHORT-TERM LOCKING" -+*/ -+struct lock_handle { -+ /* This flag indicates that a signal to yield a lock was passed to -+ lock owner and counted in owner->nr_signalled -+ -+ Locking: this is accessed under spin lock on ->node. -+ */ -+ int signaled; -+ /* A link to owner of a lock */ -+ lock_stack *owner; -+ /* A link to znode locked */ -+ znode *node; -+ /* A list of all locks for a process */ -+ struct list_head locks_link; -+ /* A list of all owners for a znode */ -+ struct list_head owners_link; -+}; -+ -+struct lock_request { -+ /* A pointer to uninitialized link object */ -+ lock_handle *handle; -+ /* A pointer to the object we want to lock */ -+ znode *node; -+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */ -+ znode_lock_mode mode; -+ /* how dispatch_lock_requests() returns lock request result code */ -+ int ret_code; -+}; -+ -+/* A lock stack structure for accumulating locks owned by a process */ -+struct lock_stack { -+ /* A guard lock protecting a lock stack */ -+ spinlock_t sguard; -+ /* number of znodes which were requested by high priority processes */ -+ atomic_t nr_signaled; -+ /* Current priority of a process -+ -+ This is only accessed by the current thread and thus requires no -+ locking. -+ */ -+ int curpri; -+ /* A list of all locks owned by this process. Elements can be added to -+ * this list only by the current thread. ->node pointers in this list -+ * can be only changed by the current thread. */ -+ struct list_head locks; -+ /* When lock_stack waits for the lock, it puts itself on double-linked -+ requestors list of that lock */ -+ struct list_head requestors_link; -+ /* Current lock request info. 
-+ -+ This is only accessed by the current thread and thus requires no -+ locking. -+ */ -+ struct lock_request request; -+ /* the following two fields are the lock stack's -+ * synchronization object to use with the standard linux/wait.h -+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for -+ * usage details. */ -+ wait_queue_head_t wait; -+ atomic_t wakeup; -+#if REISER4_DEBUG -+ int nr_locks; /* number of lock handles in the above list */ -+#endif -+}; -+ -+/* -+ User-visible znode locking functions -+*/ -+ -+extern int longterm_lock_znode(lock_handle * handle, -+ znode * node, -+ znode_lock_mode mode, -+ znode_lock_request request); -+ -+extern void longterm_unlock_znode(lock_handle * handle); -+ -+extern int reiser4_check_deadlock(void); -+ -+extern lock_stack *get_current_lock_stack(void); -+ -+extern void init_lock_stack(lock_stack * owner); -+extern void reiser4_init_lock(zlock * lock); -+ -+static inline void init_lh(lock_handle *lh) -+{ -+#if REISER4_DEBUG -+ memset(lh, 0, sizeof *lh); -+ INIT_LIST_HEAD(&lh->locks_link); -+ INIT_LIST_HEAD(&lh->owners_link); -+#else -+ lh->node = NULL; -+#endif -+} -+ -+static inline void done_lh(lock_handle *lh) -+{ -+ assert("zam-342", lh != NULL); -+ if (lh->node != NULL) -+ longterm_unlock_znode(lh); -+} -+ -+extern void move_lh(lock_handle * new, lock_handle * old); -+extern void copy_lh(lock_handle * new, lock_handle * old); -+ -+extern int reiser4_prepare_to_sleep(lock_stack * owner); -+extern void reiser4_go_to_sleep(lock_stack * owner); -+extern void __reiser4_wake_up(lock_stack * owner); -+ -+extern int lock_stack_isclean(lock_stack * owner); -+ -+/* zlock object state check macros: only used in assertions. Both forms imply -+ that the lock is held by the current thread. */ -+extern int znode_is_write_locked(const znode *); -+extern void reiser4_invalidate_lock(lock_handle *); -+ -+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */ -+#define spin_ordering_pred_stack(stack) \ -+ (LOCK_CNT_NIL(spin_locked_stack) && \ -+ LOCK_CNT_NIL(spin_locked_txnmgr) && \ -+ LOCK_CNT_NIL(spin_locked_inode) && \ -+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \ -+ LOCK_CNT_NIL(spin_locked_super_eflush)) -+ -+static inline void spin_lock_stack(lock_stack *stack) -+{ -+ assert("", spin_ordering_pred_stack(stack)); -+ spin_lock(&(stack->sguard)); -+ LOCK_CNT_INC(spin_locked_stack); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_stack(lock_stack *stack) -+{ -+ assert_spin_locked(&(stack->sguard)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ LOCK_CNT_DEC(spin_locked_stack); -+ LOCK_CNT_DEC(spin_locked); -+ spin_unlock(&(stack->sguard)); -+} -+ -+static inline void reiser4_wake_up(lock_stack * owner) -+{ -+ spin_lock_stack(owner); -+ __reiser4_wake_up(owner); -+ spin_unlock_stack(owner); -+} -+ -+const char *lock_mode_name(znode_lock_mode lock); -+ -+#if REISER4_DEBUG -+extern void check_lock_data(void); -+extern void check_lock_node_data(znode * node); -+#else -+#define check_lock_data() noop -+#define check_lock_node_data() noop -+#endif -+ -+/* __LOCK_H__ */ -+#endif -+ -+/* Make Linus happy. 
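[Taken together, the declarations above suggest the canonical calling pattern for long-term locks: a stack-allocated lock_handle is initialized with init_lh(), handed to longterm_lock_znode(), and torn down with done_lh(), which unlocks only while ->node is still set. A hedged sketch under that reading; with_locked_znode() is hypothetical, and ZNODE_LOCK_LOPRI is assumed to be the low-priority counterpart of the ZNODE_LOCK_HIPRI flag seen earlier in this hunk:

    static int with_locked_znode(znode *node)
    {
            lock_handle lh;
            int ret;

            init_lh(&lh);
            ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
                                      ZNODE_LOCK_LOPRI);
            if (ret == 0) {
                    /* ... inspect the read-locked node here ... */
                    done_lh(&lh);   /* ends up in longterm_unlock_znode() */
            }
            return ret;
    }
]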
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/Makefile linux-2.6.30/fs/reiser4/Makefile ---- linux-2.6.30.orig/fs/reiser4/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,98 @@ -+# -+# reiser4/Makefile -+# -+ -+obj-$(CONFIG_REISER4_FS) += reiser4.o -+ -+reiser4-y := \ -+ debug.o \ -+ jnode.o \ -+ znode.o \ -+ key.o \ -+ pool.o \ -+ tree_mod.o \ -+ estimate.o \ -+ carry.o \ -+ carry_ops.o \ -+ lock.o \ -+ tree.o \ -+ context.o \ -+ tap.o \ -+ coord.o \ -+ block_alloc.o \ -+ txnmgr.o \ -+ kassign.o \ -+ flush.o \ -+ wander.o \ -+ eottl.o \ -+ search.o \ -+ page_cache.o \ -+ seal.o \ -+ dscale.o \ -+ flush_queue.o \ -+ ktxnmgrd.o \ -+ blocknrset.o \ -+ super.o \ -+ super_ops.o \ -+ fsdata.o \ -+ export_ops.o \ -+ oid.o \ -+ tree_walk.o \ -+ inode.o \ -+ vfs_ops.o \ -+ as_ops.o \ -+ entd.o\ -+ readahead.o \ -+ status_flags.o \ -+ init_super.o \ -+ safe_link.o \ -+ \ -+ plugin/plugin.o \ -+ plugin/plugin_set.o \ -+ plugin/node/node.o \ -+ plugin/object.o \ -+ plugin/cluster.o \ -+ plugin/inode_ops.o \ -+ plugin/inode_ops_rename.o \ -+ plugin/file_ops.o \ -+ plugin/file_ops_readdir.o \ -+ plugin/file_plugin_common.o \ -+ plugin/file/file.o \ -+ plugin/file/tail_conversion.o \ -+ plugin/file/file_conversion.o \ -+ plugin/file/symlink.o \ -+ plugin/file/cryptcompress.o \ -+ plugin/dir_plugin_common.o \ -+ plugin/dir/hashed_dir.o \ -+ plugin/dir/seekable_dir.o \ -+ plugin/node/node40.o \ -+ \ -+ plugin/crypto/cipher.o \ -+ plugin/crypto/digest.o \ -+ \ -+ plugin/compress/compress.o \ -+ plugin/compress/compress_mode.o \ -+ \ -+ plugin/item/static_stat.o \ -+ plugin/item/sde.o \ -+ plugin/item/cde.o \ -+ plugin/item/blackbox.o \ -+ plugin/item/internal.o \ -+ plugin/item/tail.o \ -+ plugin/item/ctail.o \ -+ plugin/item/extent.o \ -+ plugin/item/extent_item_ops.o \ -+ plugin/item/extent_file_ops.o \ -+ plugin/item/extent_flush_ops.o \ -+ \ -+ plugin/hash.o \ -+ plugin/fibration.o \ -+ plugin/tail_policy.o \ -+ plugin/item/item.o \ -+ \ -+ plugin/security/perm.o \ -+ plugin/space/bitmap.o \ -+ \ -+ plugin/disk_format/disk_format40.o \ -+ plugin/disk_format/disk_format.o -+ -diff -urN linux-2.6.30.orig/fs/reiser4/oid.c linux-2.6.30/fs/reiser4/oid.c ---- linux-2.6.30.orig/fs/reiser4/oid.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/oid.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,141 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "super.h" -+#include "txnmgr.h" -+ -+/* we used to have oid allocation plugin. It was removed because it -+ was recognized as providing unneeded level of abstraction. If one -+ ever will find it useful - look at yet_unneeded_abstractions/oid -+*/ -+ -+/* -+ * initialize in-memory data for oid allocator at @super. @nr_files and @next -+ * are provided by disk format plugin that reads them from the disk during -+ * mount. -+ */ -+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ sbinfo->next_to_use = next; -+ sbinfo->oids_in_use = nr_files; -+ return 0; -+} -+ -+/* -+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator -+ * runs out of oids. 
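[Note the in-band exhaustion signal: oid_allocate() below returns the sentinel ABSOLUTE_MAX_OID rather than a negative error code. A sketch of the check a caller therefore has to make; new_object_oid() and the -EOVERFLOW code are assumptions for the example, not from the patch:

    static int new_object_oid(struct super_block *super, oid_t *result)
    {
            oid_t oid = oid_allocate(super);

            if (oid == ABSOLUTE_MAX_OID)
                    return RETERR(-EOVERFLOW);  /* error code chosen for the example */
            *result = oid;
            return 0;
    }
]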
-+ */ -+oid_t oid_allocate(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t oid; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) { -+ oid = sbinfo->next_to_use++; -+ sbinfo->oids_in_use++; -+ } else -+ oid = ABSOLUTE_MAX_OID; -+ spin_unlock_reiser4_super(sbinfo); -+ return oid; -+} -+ -+/* -+ * Tell oid allocator that @oid is now free. -+ */ -+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ sbinfo->oids_in_use--; -+ spin_unlock_reiser4_super(sbinfo); -+ return 0; -+} -+ -+/* -+ * return next @oid that would be allocated (i.e., returned by oid_allocate()) -+ * without actually allocating it. This is used by disk format plugin to save -+ * oid allocator state on the disk. -+ */ -+oid_t oid_next(const struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t oid; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ oid = sbinfo->next_to_use; -+ spin_unlock_reiser4_super(sbinfo); -+ return oid; -+} -+ -+/* -+ * returns number of currently used oids. This is used by statfs(2) to report -+ * number of "inodes" and by disk format plugin to save oid allocator state on -+ * the disk. -+ */ -+long oids_used(const struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t used; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ used = sbinfo->oids_in_use; -+ spin_unlock_reiser4_super(sbinfo); -+ if (used < (__u64) ((long)~0) >> 1) -+ return (long)used; -+ else -+ return (long)-1; -+} -+ -+/* -+ * Count oid as allocated in atom. This is done after call to oid_allocate() -+ * at the point when we are irrevocably committed to creation of the new file -+ * (i.e., when oid allocation cannot be any longer rolled back due to some -+ * error). -+ */ -+void oid_count_allocated(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ atom->nr_objects_created++; -+ spin_unlock_atom(atom); -+} -+ -+/* -+ * Count oid as free in atom. This is done after call to oid_release() at the -+ * point when we are irrevocably committed to the deletion of the file (i.e., -+ * when oid release cannot be any longer rolled back due to some error). -+ */ -+void oid_count_released(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ atom->nr_objects_deleted++; -+ spin_unlock_atom(atom); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/page_cache.c linux-2.6.30/fs/reiser4/page_cache.c ---- linux-2.6.30.orig/fs/reiser4/page_cache.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/page_cache.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,693 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Memory pressure hooks. Fake inodes handling. */ -+ -+/* GLOSSARY -+ -+ . Formatted and unformatted nodes. -+ Elements of reiser4 balanced tree to store data and metadata. -+ Unformatted nodes are pointed to by extent pointers. Such nodes -+ are used to store data of large objects. Unlike unformatted nodes, -+ formatted ones have associated format described by node4X plugin. -+ -+ . 
Jnode (or journal node) -+ The in-memory header which is used to track formatted and unformatted -+ nodes, bitmap nodes, etc. In particular, jnodes are used to track -+ transactional information associated with each block(see reiser4/jnode.c -+ for details). -+ -+ . Znode -+ The in-memory header which is used to track formatted nodes. Contains -+ embedded jnode (see reiser4/znode.c for details). -+*/ -+ -+/* We store all file system meta data (and data, of course) in the page cache. -+ -+ What does this mean? In stead of using bread/brelse we create special -+ "fake" inode (one per super block) and store content of formatted nodes -+ into pages bound to this inode in the page cache. In newer kernels bread() -+ already uses inode attached to block device (bd_inode). Advantage of having -+ our own fake inode is that we can install appropriate methods in its -+ address_space operations. Such methods are called by VM on memory pressure -+ (or during background page flushing) and we can use them to react -+ appropriately. -+ -+ In initial version we only support one block per page. Support for multiple -+ blocks per page is complicated by relocation. -+ -+ To each page, used by reiser4, jnode is attached. jnode is analogous to -+ buffer head. Difference is that jnode is bound to the page permanently: -+ jnode cannot be removed from memory until its backing page is. -+ -+ jnode contain pointer to page (->pg field) and page contain pointer to -+ jnode in ->private field. Pointer from jnode to page is protected to by -+ jnode's spinlock and pointer from page to jnode is protected by page lock -+ (PG_locked bit). Lock ordering is: first take page lock, then jnode spin -+ lock. To go into reverse direction use jnode_lock_page() function that uses -+ standard try-lock-and-release device. -+ -+ Properties: -+ -+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page -+ reference counter is increased. -+ -+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page -+ reference counter is decreased. -+ -+ 3. on jload() reference counter on jnode page is increased, page is -+ kmapped and `referenced'. -+ -+ 4. on jrelse() inverse operations are performed. -+ -+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods. -+ -+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting -+ historically.] -+ -+ [In the following discussion, `lock' invariably means long term lock on -+ znode.] (What about page locks?) -+ -+ There is some special class of deadlock possibilities related to memory -+ pressure. Locks acquired by other reiser4 threads are accounted for in -+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is -+ invoked additional hidden arc is added to the locking graph: thread that -+ tries to allocate memory waits for ->vm_writeback() to finish. If this -+ thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock -+ prevention is useless. -+ -+ Another related problem is possibility for ->vm_writeback() to run out of -+ memory itself. This is not a problem for ext2 and friends, because their -+ ->vm_writeback() don't allocate much memory, but reiser4 flush is -+ definitely able to allocate huge amounts of memory. -+ -+ It seems that there is no reliable way to cope with the problems above. 
In -+ stead it was decided that ->vm_writeback() (as invoked in the kswapd -+ context) wouldn't perform any flushing itself, but rather should just wake -+ up some auxiliary thread dedicated for this purpose (or, the same thread -+ that does periodic commit of old atoms (ktxnmgrd.c)). -+ -+ Details: -+ -+ 1. Page is called `reclaimable' against particular reiser4 mount F if this -+ page can be ultimately released by try_to_free_pages() under presumptions -+ that: -+ -+ a. ->vm_writeback() for F is no-op, and -+ -+ b. none of the threads accessing F are making any progress, and -+ -+ c. other reiser4 mounts obey the same memory reservation protocol as F -+ (described below). -+ -+ For example, clean un-pinned page, or page occupied by ext2 data are -+ reclaimable against any reiser4 mount. -+ -+ When there is more than one reiser4 mount in a system, condition (c) makes -+ reclaim-ability not easily verifiable beyond trivial cases mentioned above. -+ -+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE -+ -+ Fake inode is used to bound formatted nodes and each node is indexed within -+ fake inode by its block number. If block size of smaller than page size, it -+ may so happen that block mapped to the page with formatted node is occupied -+ by unformatted node or is unallocated. This lead to some complications, -+ because flushing whole page can lead to an incorrect overwrite of -+ unformatted node that is moreover, can be cached in some other place as -+ part of the file body. To avoid this, buffers for unformatted nodes are -+ never marked dirty. Also pages in the fake are never marked dirty. This -+ rules out usage of ->writepage() as memory pressure hook. In stead -+ ->releasepage() is used. -+ -+ Josh is concerned that page->buffer is going to die. This should not pose -+ significant problem though, because we need to add some data structures to -+ the page anyway (jnode) and all necessary book keeping can be put there. -+ -+*/ -+ -+/* Life cycle of pages/nodes. -+ -+ jnode contains reference to page and page contains reference back to -+ jnode. This reference is counted in page ->count. Thus, page bound to jnode -+ cannot be released back into free pool. -+ -+ 1. Formatted nodes. -+ -+ 1. formatted node is represented by znode. When new znode is created its -+ ->pg pointer is NULL initially. -+ -+ 2. when node content is loaded into znode (by call to zload()) for the -+ first time following happens (in call to ->read_node() or -+ ->allocate_node()): -+ -+ 1. new page is added to the page cache. -+ -+ 2. this page is attached to znode and its ->count is increased. -+ -+ 3. page is kmapped. -+ -+ 3. if more calls to zload() follow (without corresponding zrelses), page -+ counter is left intact and in its stead ->d_count is increased in znode. -+ -+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero -+ ->release_node() is called and page is kunmapped as result. -+ -+ 5. at some moment node can be captured by a transaction. Its ->x_count -+ is then increased by transaction manager. -+ -+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE -+ bit set) following will happen (also see comment at the top of znode.c): -+ -+ 1. when last lock is released, node will be uncaptured from -+ transaction. This released reference that transaction manager acquired -+ at the step 5. -+ -+ 2. when last reference is released, zput() detects that node is -+ actually deleted and calls ->delete_node() -+ operation. 
page_cache_delete_node() implementation detaches jnode from -+ page and releases page. -+ -+ 7. otherwise (node wasn't removed from the tree), last reference to -+ znode will be released after transaction manager committed transaction -+ node was in. This implies squallocing of this node (see -+ flush.c). Nothing special happens at this point. Znode is still in the -+ hash table and page is still attached to it. -+ -+ 8. znode is actually removed from the memory because of the memory -+ pressure, or during umount (znodes_tree_done()). Anyway, znode is -+ removed by the call to zdrop(). At this moment, page is detached from -+ znode and removed from the inode address space. -+ -+*/ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "entd.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+ -+#include <linux/types.h> -+#include <linux/fs.h> -+#include <linux/mm.h> /* for struct page */ -+#include <linux/swap.h> /* for struct page */ -+#include <linux/pagemap.h> -+#include <linux/bio.h> -+#include <linux/writeback.h> -+#include <linux/blkdev.h> -+ -+static struct bio *page_bio(struct page *, jnode * , int rw, gfp_t gfp); -+ -+static struct address_space_operations formatted_fake_as_ops; -+ -+static const oid_t fake_ino = 0x1; -+static const oid_t bitmap_ino = 0x2; -+static const oid_t cc_ino = 0x3; -+ -+static void -+init_fake_inode(struct super_block *super, struct inode *fake, -+ struct inode **pfake) -+{ -+ assert("nikita-2168", fake->i_state & I_NEW); -+ fake->i_mapping->a_ops = &formatted_fake_as_ops; -+ *pfake = fake; -+ /* NOTE-NIKITA something else? */ -+ unlock_new_inode(fake); -+} -+ -+/** -+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps -+ * @super: super block to init fake inode for -+ * -+ * Initializes fake inode to which formatted nodes are bound in the page cache -+ * and inode for bitmaps. -+ */ -+int reiser4_init_formatted_fake(struct super_block *super) -+{ -+ struct inode *fake; -+ struct inode *bitmap; -+ struct inode *cc; -+ reiser4_super_info_data *sinfo; -+ -+ assert("nikita-1703", super != NULL); -+ -+ sinfo = get_super_private_nocheck(super); -+ fake = iget_locked(super, oid_to_ino(fake_ino)); -+ -+ if (fake != NULL) { -+ init_fake_inode(super, fake, &sinfo->fake); -+ -+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino)); -+ if (bitmap != NULL) { -+ init_fake_inode(super, bitmap, &sinfo->bitmap); -+ -+ cc = iget_locked(super, oid_to_ino(cc_ino)); -+ if (cc != NULL) { -+ init_fake_inode(super, cc, &sinfo->cc); -+ return 0; -+ } else { -+ iput(sinfo->fake); -+ iput(sinfo->bitmap); -+ sinfo->fake = NULL; -+ sinfo->bitmap = NULL; -+ } -+ } else { -+ iput(sinfo->fake); -+ sinfo->fake = NULL; -+ } -+ } -+ return RETERR(-ENOMEM); -+} -+ -+/** -+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps -+ * @super: super block to init fake inode for -+ * -+ * Releases inodes which were used as address spaces of bitmap and formatted -+ * nodes. 
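[As a hedged sketch of how this init/done pair is presumably meant to bracket a mount; the fill-super call site below is hypothetical, and only the two reiser4_* functions come from the patch:

    static int example_fill_super(struct super_block *super)
    {
            int ret;

            ret = reiser4_init_formatted_fake(super);   /* fake + bitmap + cc inodes */
            if (ret)
                    return ret;                         /* RETERR(-ENOMEM) path */
            /* ... rest of mount; the matching teardown on umount is ... */
            /* reiser4_done_formatted_fake(super); */
            return 0;
    }
]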
-+ */ -+void reiser4_done_formatted_fake(struct super_block *super) -+{ -+ reiser4_super_info_data *sinfo; -+ -+ sinfo = get_super_private_nocheck(super); -+ -+ if (sinfo->fake != NULL) { -+ iput(sinfo->fake); -+ sinfo->fake = NULL; -+ } -+ -+ if (sinfo->bitmap != NULL) { -+ iput(sinfo->bitmap); -+ sinfo->bitmap = NULL; -+ } -+ -+ if (sinfo->cc != NULL) { -+ iput(sinfo->cc); -+ sinfo->cc = NULL; -+ } -+ return; -+} -+ -+void reiser4_wait_page_writeback(struct page *page) -+{ -+ assert("zam-783", PageLocked(page)); -+ -+ do { -+ unlock_page(page); -+ wait_on_page_writeback(page); -+ lock_page(page); -+ } while (PageWriteback(page)); -+} -+ -+/* return tree @page is in */ -+reiser4_tree *reiser4_tree_by_page(const struct page *page/* page to query */) -+{ -+ assert("nikita-2461", page != NULL); -+ return &get_super_private(page->mapping->host->i_sb)->tree; -+} -+ -+/* completion handler for single page bio-based read. -+ -+ mpage_end_io_read() would also do. But it's static. -+ -+*/ -+static void -+end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG) -+{ -+ struct page *page; -+ -+ page = bio->bi_io_vec[0].bv_page; -+ -+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageUptodate(page); -+ } else { -+ ClearPageUptodate(page); -+ SetPageError(page); -+ } -+ unlock_page(page); -+ bio_put(bio); -+} -+ -+/* completion handler for single page bio-based write. -+ -+ mpage_end_io_write() would also do. But it's static. -+ -+*/ -+static void -+end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG) -+{ -+ struct page *page; -+ -+ page = bio->bi_io_vec[0].bv_page; -+ -+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -+ SetPageError(page); -+ end_page_writeback(page); -+ bio_put(bio); -+} -+ -+/* ->readpage() method for formatted nodes */ -+static int formatted_readpage(struct file *f UNUSED_ARG, -+ struct page *page/* page to read */) -+{ -+ assert("nikita-2412", PagePrivate(page) && jprivate(page)); -+ return reiser4_page_io(page, jprivate(page), READ, -+ reiser4_ctx_gfp_mask_get()); -+} -+ -+/** -+ * reiser4_page_io - submit single-page bio request -+ * @page: page to perform io for -+ * @node: jnode of page -+ * @rw: read or write -+ * @gfp: gfp mask for bio allocation -+ * -+ * Submits single page read or write. -+ */ -+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp) -+{ -+ struct bio *bio; -+ int result; -+ -+ assert("nikita-2094", page != NULL); -+ assert("nikita-2226", PageLocked(page)); -+ assert("nikita-2634", node != NULL); -+ assert("nikita-2893", rw == READ || rw == WRITE); -+ -+ if (rw) { -+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) { -+ unlock_page(page); -+ return 0; -+ } -+ } -+ -+ bio = page_bio(page, node, rw, gfp); -+ if (!IS_ERR(bio)) { -+ if (rw == WRITE) { -+ set_page_writeback(page); -+ unlock_page(page); -+ } -+ reiser4_submit_bio(rw, bio); -+ result = 0; -+ } else { -+ unlock_page(page); -+ result = PTR_ERR(bio); -+ } -+ -+ return result; -+} -+ -+/* helper function to construct bio for page */ -+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp) -+{ -+ struct bio *bio; -+ assert("nikita-2092", page != NULL); -+ assert("nikita-2633", node != NULL); -+ -+ /* Simple implementation in the assumption that blocksize == pagesize. -+ -+ We only have to submit one block, but submit_bh() will allocate bio -+ anyway, so lets use all the bells-and-whistles of bio code. 
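[One detail of page_bio() below worth spelling out is the sector conversion: bi_sector counts 512-byte units, so the filesystem block number is scaled by blocksize/512, i.e. by (blksz >> 9). A compile-and-run illustration in plain C, using the one-block-per-4K-page assumption stated in the surrounding comments:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long blocknr = 100;
            int blksz = 4096;                   /* == PAGE_CACHE_SIZE here */
            unsigned long long sector = blocknr * (blksz >> 9);

            printf("block %llu -> sector %llu\n", blocknr, sector);  /* 800 */
            return 0;
    }
]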
-+ */ -+ -+ bio = bio_alloc(gfp, 1); -+ if (bio != NULL) { -+ int blksz; -+ struct super_block *super; -+ reiser4_block_nr blocknr; -+ -+ super = page->mapping->host->i_sb; -+ assert("nikita-2029", super != NULL); -+ blksz = super->s_blocksize; -+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE); -+ -+ spin_lock_jnode(node); -+ blocknr = *jnode_get_io_block(node); -+ spin_unlock_jnode(node); -+ -+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0); -+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr)); -+ -+ bio->bi_bdev = super->s_bdev; -+ /* fill bio->bi_sector before calling bio_add_page(), because -+ * q->merge_bvec_fn may want to inspect it (see -+ * drivers/md/linear.c:linear_mergeable_bvec() for example. */ -+ bio->bi_sector = blocknr * (blksz >> 9); -+ -+ if (!bio_add_page(bio, page, blksz, 0)) { -+ warning("nikita-3452", -+ "Single page bio cannot be constructed"); -+ return ERR_PTR(RETERR(-EINVAL)); -+ } -+ -+ /* bio -> bi_idx is filled by bio_init() */ -+ bio->bi_end_io = (rw == READ) ? -+ end_bio_single_page_read : end_bio_single_page_write; -+ -+ return bio; -+ } else -+ return ERR_PTR(RETERR(-ENOMEM)); -+} -+ -+#if 0 -+static int can_hit_entd(reiser4_context *ctx, struct super_block *s) -+{ -+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic) -+ return 1; -+ if (ctx->super != s) -+ return 1; -+ if (get_super_private(s)->entd.tsk == current) -+ return 0; -+ if (!lock_stack_isclean(&ctx->stack)) -+ return 0; -+ if (ctx->trans->atom != NULL) -+ return 0; -+ return 1; -+} -+#endif -+ -+/** -+ * reiser4_writepage - writepage of struct address_space_operations -+ * @page: page to write -+ * @wbc: -+ * -+ * -+ */ -+/* Common memory pressure notification. */ -+int reiser4_writepage(struct page *page, -+ struct writeback_control *wbc) -+{ -+ struct super_block *s; -+ reiser4_context *ctx; -+ -+ assert("vs-828", PageLocked(page)); -+ -+ s = page->mapping->host->i_sb; -+ ctx = get_current_context_check(); -+ -+ /* assert("", can_hit_entd(ctx, s)); */ -+ return write_page_by_ent(page, wbc); -+} -+ -+/* ->set_page_dirty() method of formatted address_space */ -+static int formatted_set_page_dirty(struct page *page) -+{ -+ assert("nikita-2173", page != NULL); -+ BUG(); -+ return __set_page_dirty_nobuffers(page); -+} -+ -+/* writepages method of address space operations in reiser4 is used to involve -+ into transactions pages which are dirtied via mmap. Only regular files can -+ have such pages. Fake inode is used to access formatted nodes via page -+ cache. As formatted nodes can never be mmaped, fake inode's writepages has -+ nothing to do */ -+static int -+writepages_fake(struct address_space *mapping, struct writeback_control *wbc) -+{ -+ return 0; -+} -+ -+/* address space operations for the fake inode */ -+static struct address_space_operations formatted_fake_as_ops = { -+ /* Perform a writeback of a single page as a memory-freeing -+ * operation. */ -+ .writepage = reiser4_writepage, -+ /* this is called to read formatted node */ -+ .readpage = formatted_readpage, -+ /* ->sync_page() method of fake inode address space operations. Called -+ from wait_on_page() and lock_page(). -+ -+ This is most annoyingly misnomered method. Actually it is called -+ from wait_on_page_bit() and lock_page() and its purpose is to -+ actually start io by jabbing device drivers. -+ */ -+ .sync_page = block_sync_page, -+ /* Write back some dirty pages from this mapping. Called from sync. 
-+ called during sync (pdflush) */ -+ .writepages = writepages_fake, -+ /* Set a page dirty */ -+ .set_page_dirty = formatted_set_page_dirty, -+ /* used for read-ahead. Not applicable */ -+ .readpages = NULL, -+ .write_begin = NULL, -+ .write_end = NULL, -+ .bmap = NULL, -+ /* called just before page is being detached from inode mapping and -+ removed from memory. Called on truncate, cut/squeeze, and -+ umount. */ -+ .invalidatepage = reiser4_invalidatepage, -+ /* this is called by shrink_cache() so that file system can try to -+ release objects (jnodes, buffers, journal heads) attached to page -+ and, may be made page itself free-able. -+ */ -+ .releasepage = reiser4_releasepage, -+ .direct_IO = NULL -+}; -+ -+/* called just before page is released (no longer used by reiser4). Callers: -+ jdelete() and extent2tail(). */ -+void reiser4_drop_page(struct page *page) -+{ -+ assert("nikita-2181", PageLocked(page)); -+ clear_page_dirty_for_io(page); -+ ClearPageUptodate(page); -+#if defined(PG_skipped) -+ ClearPageSkipped(page); -+#endif -+ unlock_page(page); -+} -+ -+#define JNODE_GANG_SIZE (16) -+ -+/* find all jnodes from range specified and invalidate them */ -+static int -+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count) -+{ -+ reiser4_inode *info; -+ int truncated_jnodes; -+ reiser4_tree *tree; -+ unsigned long index; -+ unsigned long end; -+ -+ if (inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) -+ /* -+ * No need to get rid of jnodes here: if the single jnode of -+ * page cluster did not have page, then it was found and killed -+ * before in -+ * truncate_complete_page_cluster()->jput()->jput_final(), -+ * otherwise it will be dropped by reiser4_invalidatepage() -+ */ -+ return 0; -+ truncated_jnodes = 0; -+ -+ info = reiser4_inode_data(inode); -+ tree = reiser4_tree_by_inode(inode); -+ -+ index = from; -+ end = from + count; -+ -+ while (1) { -+ jnode *gang[JNODE_GANG_SIZE]; -+ int taken; -+ int i; -+ jnode *node; -+ -+ assert("nikita-3466", index <= end); -+ -+ read_lock_tree(tree); -+ taken = -+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info), -+ (void **)gang, index, -+ JNODE_GANG_SIZE); -+ for (i = 0; i < taken; ++i) { -+ node = gang[i]; -+ if (index_jnode(node) < end) -+ jref(node); -+ else -+ gang[i] = NULL; -+ } -+ read_unlock_tree(tree); -+ -+ for (i = 0; i < taken; ++i) { -+ node = gang[i]; -+ if (node != NULL) { -+ index = max(index, index_jnode(node)); -+ spin_lock_jnode(node); -+ assert("edward-1457", node->pg == NULL); -+ /* this is always called after -+ truncate_inode_pages_range(). Therefore, here -+ jnode can not have page. New pages can not be -+ created because truncate_jnodes_range goes -+ under exclusive access on file obtained, -+ where as new page creation requires -+ non-exclusive access obtained */ -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ reiser4_uncapture_jnode(node); -+ unhash_unformatted_jnode(node); -+ truncated_jnodes++; -+ jput(node); -+ } else -+ break; -+ } -+ if (i != taken || taken == 0) -+ break; -+ } -+ return truncated_jnodes; -+} -+ -+/* Truncating files in reiser4: problems and solutions. -+ -+ VFS calls fs's truncate after it has called truncate_inode_pages() -+ to get rid of pages corresponding to part of file being truncated. -+ In reiser4 it may cause existence of unallocated extents which do -+ not have jnodes. Flush code does not expect that. Solution of this -+ problem is straightforward. 
As vfs's truncate is implemented using -+ setattr operation, it seems reasonable to have ->setattr() that -+ will cut file body. However, flush code also does not expect dirty -+ pages without parent items, so it is impossible to cut all items, -+ then truncate all pages in two steps. We resolve this problem by -+ cutting items one-by-one. Each such fine-grained step performed -+ under longterm znode lock calls at the end ->kill_hook() method of -+ a killed item to remove its binded pages and jnodes. -+ -+ The following function is a common part of mentioned kill hooks. -+ Also, this is called before tail-to-extent conversion (to not manage -+ few copies of the data). -+*/ -+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from, -+ unsigned long count, int even_cows) -+{ -+ loff_t from_bytes, count_bytes; -+ -+ if (count == 0) -+ return; -+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT; -+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT; -+ -+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows); -+ truncate_inode_pages_range(mapping, from_bytes, -+ from_bytes + count_bytes - 1); -+ truncate_jnodes_range(mapping->host, from, count); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/page_cache.h linux-2.6.30/fs/reiser4/page_cache.h ---- linux-2.6.30.orig/fs/reiser4/page_cache.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/page_cache.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,66 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */ -+ -+#if !defined(__REISER4_PAGE_CACHE_H__) -+#define __REISER4_PAGE_CACHE_H__ -+ -+#include "forward.h" -+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */ -+ -+#include <linux/fs.h> /* for struct super_block, address_space */ -+#include <linux/mm.h> /* for struct page */ -+#include <linux/pagemap.h> /* for lock_page() */ -+#include <linux/vmalloc.h> /* for __vmalloc() */ -+ -+extern int reiser4_init_formatted_fake(struct super_block *); -+extern void reiser4_done_formatted_fake(struct super_block *); -+ -+extern reiser4_tree *reiser4_tree_by_page(const struct page *); -+ -+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio)) -+ -+extern void reiser4_wait_page_writeback(struct page *); -+static inline void lock_and_wait_page_writeback(struct page *page) -+{ -+ lock_page(page); -+ if (unlikely(PageWriteback(page))) -+ reiser4_wait_page_writeback(page); -+} -+ -+#define jprivate(page) ((jnode *)page_private(page)) -+ -+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t); -+extern void reiser4_drop_page(struct page *); -+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from, -+ unsigned long count, int even_cows); -+extern void capture_reiser4_inodes(struct super_block *, -+ struct writeback_control *); -+static inline void *reiser4_vmalloc(unsigned long size) -+{ -+ return __vmalloc(size, -+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM, -+ PAGE_KERNEL); -+} -+ -+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY -+ -+#if REISER4_DEBUG -+extern void print_page(const char *prefix, struct page *page); -+#else -+#define print_page(prf, p) noop -+#endif -+ -+/* __REISER4_PAGE_CACHE_H__ */ -+#endif -+ -+/* Make Linus happy. 
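[Stepping back to reiser4_invalidate_pages() above, the ordering of its three steps is the point: user mappings are unmapped first, page-cache pages are truncated second, and only then are leftover jnodes killed, since truncate_jnodes_range() asserts that no page is attached any more. A condensed restatement; invalidate_order_sketch() is not a patch function, and even_cows is pinned to 1 here for brevity:

    static void invalidate_order_sketch(struct address_space *mapping,
                                        pgoff_t from, unsigned long count)
    {
            loff_t start = (loff_t)from << PAGE_CACHE_SHIFT;
            loff_t len = (loff_t)count << PAGE_CACHE_SHIFT;

            unmap_mapping_range(mapping, start, len, 1);                  /* 1 */
            truncate_inode_pages_range(mapping, start, start + len - 1);  /* 2 */
            truncate_jnodes_range(mapping->host, from, count);            /* 3 */
    }
]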
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/cluster.c linux-2.6.30/fs/reiser4/plugin/cluster.c ---- linux-2.6.30.orig/fs/reiser4/plugin/cluster.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/cluster.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,72 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Contains reiser4 cluster plugins (see -+ http://www.namesys.com/cryptcompress_design.html -+ "Concepts of clustering" for details). */ -+ -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../inode.h" -+ -+static int change_cluster(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ assert("edward-1324", inode != NULL); -+ assert("edward-1325", plugin != NULL); -+ assert("edward-1326", is_reiser4_inode(inode)); -+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE); -+ -+ /* Can't change the cluster plugin for already existent regular files */ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ /* If matches, nothing to change. */ -+ if (inode_hash_plugin(inode) != NULL && -+ inode_hash_plugin(inode)->h.id == plugin->h.id) -+ return 0; -+ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_CLUSTER, plugin); -+} -+ -+static reiser4_plugin_ops cluster_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = &change_cluster -+}; -+ -+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \ -+ [CLUSTER_ ## ID ## _ID] = { \ -+ .h = { \ -+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \ -+ .id = CLUSTER_ ## ID ## _ID, \ -+ .pops = &cluster_plugin_ops, \ -+ .label = LABEL, \ -+ .desc = DESC, \ -+ .linkage = {NULL, NULL} \ -+ }, \ -+ .shift = SHIFT \ -+ } -+ -+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = { -+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"), -+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"), -+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"), -+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"), -+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal") -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/cluster.h linux-2.6.30/fs/reiser4/plugin/cluster.h ---- linux-2.6.30.orig/fs/reiser4/plugin/cluster.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/cluster.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,410 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This file contains size/offset translators, modulators -+ and other helper functions. 
*/ -+ -+#if !defined(__FS_REISER4_CLUSTER_H__) -+#define __FS_REISER4_CLUSTER_H__ -+ -+#include "../inode.h" -+ -+static inline int inode_cluster_shift(struct inode *inode) -+{ -+ assert("edward-92", inode != NULL); -+ assert("edward-93", reiser4_inode_data(inode) != NULL); -+ -+ return inode_cluster_plugin(inode)->shift; -+} -+ -+static inline unsigned cluster_nrpages_shift(struct inode *inode) -+{ -+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT; -+} -+ -+/* cluster size in page units */ -+static inline unsigned cluster_nrpages(struct inode *inode) -+{ -+ return 1U << cluster_nrpages_shift(inode); -+} -+ -+static inline size_t inode_cluster_size(struct inode *inode) -+{ -+ assert("edward-96", inode != NULL); -+ -+ return 1U << inode_cluster_shift(inode); -+} -+ -+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode) -+{ -+ return idx >> cluster_nrpages_shift(inode); -+} -+ -+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode) -+{ -+ return idx << cluster_nrpages_shift(inode); -+} -+ -+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode) -+{ -+ return clust_to_pg(pg_to_clust(idx, inode), inode); -+} -+ -+static inline pgoff_t off_to_pg(loff_t off) -+{ -+ return (off >> PAGE_CACHE_SHIFT); -+} -+ -+static inline loff_t pg_to_off(pgoff_t idx) -+{ -+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT); -+} -+ -+static inline cloff_t off_to_clust(loff_t off, struct inode *inode) -+{ -+ return off >> inode_cluster_shift(inode); -+} -+ -+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode) -+{ -+ return (loff_t) idx << inode_cluster_shift(inode); -+} -+ -+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode) -+{ -+ return clust_to_off(off_to_clust(off, inode), inode); -+} -+ -+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode) -+{ -+ return clust_to_pg(off_to_clust(off, inode), inode); -+} -+ -+static inline unsigned off_to_pgoff(loff_t off) -+{ -+ return off & (PAGE_CACHE_SIZE - 1); -+} -+ -+static inline unsigned off_to_cloff(loff_t off, struct inode *inode) -+{ -+ return off & ((loff_t) (inode_cluster_size(inode)) - 1); -+} -+ -+static inline pgoff_t offset_in_clust(struct page *page) -+{ -+ assert("edward-1488", page != NULL); -+ assert("edward-1489", page->mapping != NULL); -+ -+ return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1); -+} -+ -+static inline int first_page_in_cluster(struct page *page) -+{ -+ return offset_in_clust(page) == 0; -+} -+ -+static inline int last_page_in_cluster(struct page *page) -+{ -+ return offset_in_clust(page) == -+ cluster_nrpages(page->mapping->host) - 1; -+} -+ -+static inline unsigned -+pg_to_off_to_cloff(unsigned long idx, struct inode *inode) -+{ -+ return off_to_cloff(pg_to_off(idx), inode); -+} -+ -+/*********************** Size translators **************************/ -+ -+/* Translate linear size. -+ * New units are (1 << @blk_shift) times larger, then old ones. 
-+ * In other words, calculate number of logical blocks, occupied -+ * by @count elements -+ */ -+static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits) -+{ -+ return (count + (1UL << blkbits) - 1) >> blkbits; -+} -+ -+/* size in pages */ -+static inline pgoff_t size_in_pages(loff_t size) -+{ -+ return size_in_blocks(size, PAGE_CACHE_SHIFT); -+} -+ -+/* size in logical clusters */ -+static inline cloff_t size_in_lc(loff_t size, struct inode *inode) -+{ -+ return size_in_blocks(size, inode_cluster_shift(inode)); -+} -+ -+/* size in pages to the size in page clusters */ -+static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode) -+{ -+ return size_in_blocks(size, cluster_nrpages_shift(inode)); -+} -+ -+/*********************** Size modulators ***************************/ -+ -+/* -+ Modulate linear size by nominated block size and offset. -+ -+ The "finite" function (which is zero almost everywhere). -+ How much is a height of the figure at a position @pos, -+ when trying to construct rectangle of height (1 << @blkbits), -+ and square @size. -+ -+ ****** -+ ******* -+ ******* -+ ******* -+ ----------> pos -+*/ -+static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits) -+{ -+ unsigned end = size >> blkbits; -+ if (pos < end) -+ return 1U << blkbits; -+ if (unlikely(pos > end)) -+ return 0; -+ return size & ~(~0ull << blkbits); -+} -+ -+/* the same as above, but block size is page size */ -+static inline unsigned __mbp(loff_t size, pgoff_t pos) -+{ -+ return __mbb(size, pos, PAGE_CACHE_SHIFT); -+} -+ -+/* number of file's bytes in the nominated logical cluster */ -+static inline unsigned lbytes(cloff_t index, struct inode *inode) -+{ -+ return __mbb(i_size_read(inode), index, inode_cluster_shift(inode)); -+} -+ -+/* number of file's bytes in the nominated page */ -+static inline unsigned pbytes(pgoff_t index, struct inode *inode) -+{ -+ return __mbp(i_size_read(inode), index); -+} -+ -+/** -+ * number of pages occuped by @win->count bytes starting from -+ * @win->off at logical cluster defined by @win. This is exactly -+ * a number of pages to be modified and dirtied in any cluster operation. 
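[The translators and modulators above are easiest to sanity-check with concrete numbers. Assuming the 16K cluster plugin (shift 14) and 4K pages (PAGE_CACHE_SHIFT 12), this standalone plain-C check reproduces the arithmetic, with size_in_blocks copied from the helper above:

    #include <stdio.h>

    static unsigned long size_in_blocks(long long count, unsigned blkbits)
    {
            return (count + (1UL << blkbits) - 1) >> blkbits;
    }

    int main(void)
    {
            unsigned cluster_shift = 14, page_shift = 12;

            printf("pages per cluster: %u\n",
                   1u << (cluster_shift - page_shift));                  /* 4 */
            printf("offset 40000 -> cluster %lld\n",
                   40000LL >> cluster_shift);                            /* 2 */
            printf("size 40000 -> %lu clusters\n",
                   size_in_blocks(40000, cluster_shift));                /* 3 */
            return 0;
    }
]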
-+ */ -+static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win) -+{ -+ return ((win->off + win->count + -+ (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) - -+ off_to_pg(win->off); -+} -+ -+/* return true, if logical cluster is not occupied by the file */ -+static inline int new_logical_cluster(struct cluster_handle *clust, -+ struct inode *inode) -+{ -+ return clust_to_off(clust->index, inode) >= i_size_read(inode); -+} -+ -+/* return true, if pages @p1 and @p2 are of the same page cluster */ -+static inline int same_page_cluster(struct page *p1, struct page *p2) -+{ -+ assert("edward-1490", p1 != NULL); -+ assert("edward-1491", p2 != NULL); -+ assert("edward-1492", p1->mapping != NULL); -+ assert("edward-1493", p2->mapping != NULL); -+ -+ return (pg_to_clust(page_index(p1), p1->mapping->host) == -+ pg_to_clust(page_index(p2), p2->mapping->host)); -+} -+ -+static inline int cluster_is_complete(struct cluster_handle *clust, -+ struct inode *inode) -+{ -+ return clust->tc.lsize == inode_cluster_size(inode); -+} -+ -+static inline void reiser4_slide_init(struct reiser4_slide *win) -+{ -+ assert("edward-1084", win != NULL); -+ memset(win, 0, sizeof *win); -+} -+ -+static inline tfm_action -+cluster_get_tfm_act(struct tfm_cluster *tc) -+{ -+ assert("edward-1356", tc != NULL); -+ return tc->act; -+} -+ -+static inline void -+cluster_set_tfm_act(struct tfm_cluster *tc, tfm_action act) -+{ -+ assert("edward-1356", tc != NULL); -+ tc->act = act; -+} -+ -+static inline void cluster_init_act(struct cluster_handle *clust, -+ tfm_action act, -+ struct reiser4_slide *window) -+{ -+ assert("edward-84", clust != NULL); -+ memset(clust, 0, sizeof *clust); -+ cluster_set_tfm_act(&clust->tc, act); -+ clust->dstat = INVAL_DISK_CLUSTER; -+ clust->win = window; -+} -+ -+static inline void cluster_init_read(struct cluster_handle *clust, -+ struct reiser4_slide *window) -+{ -+ cluster_init_act(clust, TFMA_READ, window); -+} -+ -+static inline void cluster_init_write(struct cluster_handle *clust, -+ struct reiser4_slide *window) -+{ -+ cluster_init_act(clust, TFMA_WRITE, window); -+} -+ -+/* true if @p1 and @p2 are items of the same disk cluster */ -+static inline int same_disk_cluster(const coord_t *p1, const coord_t *p2) -+{ -+ /* drop this if you have other items to aggregate */ -+ assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID); -+ -+ return item_plugin_by_coord(p1)->b.mergeable(p1, p2); -+} -+ -+static inline int dclust_get_extension_dsize(hint_t *hint) -+{ -+ return hint->ext_coord.extension.ctail.dsize; -+} -+ -+static inline void dclust_set_extension_dsize(hint_t *hint, int dsize) -+{ -+ hint->ext_coord.extension.ctail.dsize = dsize; -+} -+ -+static inline int dclust_get_extension_shift(hint_t *hint) -+{ -+ return hint->ext_coord.extension.ctail.shift; -+} -+ -+static inline int dclust_get_extension_ncount(hint_t *hint) -+{ -+ return hint->ext_coord.extension.ctail.ncount; -+} -+ -+static inline void dclust_inc_extension_ncount(hint_t *hint) -+{ -+ hint->ext_coord.extension.ctail.ncount++; -+} -+ -+static inline void dclust_init_extension(hint_t *hint) -+{ -+ memset(&hint->ext_coord.extension.ctail, 0, -+ sizeof(hint->ext_coord.extension.ctail)); -+} -+ -+static inline int hint_is_unprepped_dclust(hint_t *hint) -+{ -+ assert("edward-1451", hint_is_valid(hint)); -+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT; -+} -+ -+static inline void coord_set_between_clusters(coord_t *coord) -+{ -+#if REISER4_DEBUG -+ int result; -+ result = zload(coord->node); -+ 
assert("edward-1296", !result); -+#endif -+ if (!coord_is_between_items(coord)) { -+ coord->between = AFTER_ITEM; -+ coord->unit_pos = 0; -+ } -+#if REISER4_DEBUG -+ zrelse(coord->node); -+#endif -+} -+ -+int reiser4_inflate_cluster(struct cluster_handle *, struct inode *); -+int find_disk_cluster(struct cluster_handle *, struct inode *, int read, -+ znode_lock_mode mode); -+int checkout_logical_cluster(struct cluster_handle *, jnode * , struct inode *); -+int reiser4_deflate_cluster(struct cluster_handle *, struct inode *); -+void truncate_complete_page_cluster(struct inode *inode, cloff_t start, -+ int even_cows); -+void invalidate_hint_cluster(struct cluster_handle *clust); -+int get_disk_cluster_locked(struct cluster_handle *clust, struct inode *inode, -+ znode_lock_mode lock_mode); -+void reset_cluster_params(struct cluster_handle *clust); -+int set_cluster_by_page(struct cluster_handle *clust, struct page *page, -+ int count); -+int prepare_page_cluster(struct inode *inode, struct cluster_handle *clust, -+ rw_op rw); -+void __put_page_cluster(int from, int count, struct page **pages, -+ struct inode *inode); -+void put_page_cluster(struct cluster_handle *clust, -+ struct inode *inode, rw_op rw); -+void put_cluster_handle(struct cluster_handle *clust); -+int grab_tfm_stream(struct inode *inode, struct tfm_cluster *tc, -+ tfm_stream_id id); -+int tfm_cluster_is_uptodate(struct tfm_cluster *tc); -+void tfm_cluster_set_uptodate(struct tfm_cluster *tc); -+void tfm_cluster_clr_uptodate(struct tfm_cluster *tc); -+ -+/* move cluster handle to the target position -+ specified by the page of index @pgidx */ -+static inline void move_cluster_forward(struct cluster_handle *clust, -+ struct inode *inode, -+ pgoff_t pgidx) -+{ -+ assert("edward-1297", clust != NULL); -+ assert("edward-1298", inode != NULL); -+ -+ reset_cluster_params(clust); -+ if (clust->index_valid && -+ /* Hole in the indices. Hint became invalid and can not be -+ used by find_cluster_item() even if seal/node versions -+ will coincide */ -+ pg_to_clust(pgidx, inode) != clust->index + 1) { -+ reiser4_unset_hint(clust->hint); -+ invalidate_hint_cluster(clust); -+ } -+ clust->index = pg_to_clust(pgidx, inode); -+ clust->index_valid = 1; -+} -+ -+static inline int alloc_clust_pages(struct cluster_handle *clust, -+ struct inode *inode) -+{ -+ assert("edward-791", clust != NULL); -+ assert("edward-792", inode != NULL); -+ clust->pages = -+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode), -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages) -+ return -ENOMEM; -+ return 0; -+} -+ -+static inline void free_clust_pages(struct cluster_handle *clust) -+{ -+ kfree(clust->pages); -+} -+ -+#endif /* __FS_REISER4_CLUSTER_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.30/fs/reiser4/plugin/compress/compress.c ---- linux-2.6.30.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/compress/compress.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,355 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* reiser4 compression transform plugins */ -+ -+#include "../../debug.h" -+#include "../../inode.h" -+#include "../plugin.h" -+ -+#include <linux/lzo.h> -+#include <linux/zlib.h> -+#include <linux/types.h> -+#include <linux/hardirq.h> -+ -+static int change_compression(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ assert("edward-1316", inode != NULL); -+ assert("edward-1317", plugin != NULL); -+ assert("edward-1318", is_reiser4_inode(inode)); -+ assert("edward-1319", -+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE); -+ -+ /* cannot change compression plugin of already existing regular object */ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ /* If matches, nothing to change. */ -+ if (inode_hash_plugin(inode) != NULL && -+ inode_hash_plugin(inode)->h.id == plugin->h.id) -+ return 0; -+ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_COMPRESSION, plugin); -+} -+ -+static reiser4_plugin_ops compression_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = &change_compression -+}; -+ -+/******************************************************************************/ -+/* gzip1 compression */ -+/******************************************************************************/ -+ -+#define GZIP1_DEF_LEVEL Z_BEST_SPEED -+#define GZIP1_DEF_WINBITS 15 -+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL -+ -+static int gzip1_init(void) -+{ -+ return 0; -+} -+ -+static int gzip1_overrun(unsigned src_len UNUSED_ARG) -+{ -+ return 0; -+} -+ -+static coa_t gzip1_alloc(tfm_action act) -+{ -+ coa_t coa = NULL; -+ int ret = 0; -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ coa = reiser4_vmalloc(zlib_deflate_workspacesize()); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ break; -+ case TFMA_READ: /* decompress */ -+ coa = reiser4_vmalloc(zlib_inflate_workspacesize()); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ break; -+ default: -+ impossible("edward-767", -+ "trying to alloc workspace for unknown tfm action"); -+ } -+ if (ret) { -+ warning("edward-768", -+ "alloc workspace for gzip1 (tfm action = %d) failed\n", -+ act); -+ return ERR_PTR(ret); -+ } -+ return coa; -+} -+ -+static void gzip1_free(coa_t coa, tfm_action act) -+{ -+ assert("edward-769", coa != NULL); -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ vfree(coa); -+ break; -+ case TFMA_READ: /* decompress */ -+ vfree(coa); -+ break; -+ default: -+ impossible("edward-770", "unknown tfm action"); -+ } -+ return; -+} -+ -+static int gzip1_min_size_deflate(void) -+{ -+ return 64; -+} -+ -+static void -+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+ int ret = 0; -+ struct z_stream_s stream; -+ -+ assert("edward-842", coa != NULL); -+ assert("edward-875", src_len != 0); -+ -+ stream.workspace = coa; -+ ret = zlib_deflateInit2(&stream, 
GZIP1_DEF_LEVEL, Z_DEFLATED, -+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL, -+ Z_DEFAULT_STRATEGY); -+ if (ret != Z_OK) { -+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret); -+ goto rollback; -+ } -+ ret = zlib_deflateReset(&stream); -+ if (ret != Z_OK) { -+ warning("edward-772", "zlib_deflateReset returned %d\n", ret); -+ goto rollback; -+ } -+ stream.next_in = src_first; -+ stream.avail_in = src_len; -+ stream.next_out = dst_first; -+ stream.avail_out = *dst_len; -+ -+ ret = zlib_deflate(&stream, Z_FINISH); -+ if (ret != Z_STREAM_END) { -+ if (ret != Z_OK) -+ warning("edward-773", -+ "zlib_deflate returned %d\n", ret); -+ goto rollback; -+ } -+ *dst_len = stream.total_out; -+ return; -+ rollback: -+ *dst_len = src_len; -+ return; -+} -+ -+static void -+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+ int ret = 0; -+ struct z_stream_s stream; -+ -+ assert("edward-843", coa != NULL); -+ assert("edward-876", src_len != 0); -+ -+ stream.workspace = coa; -+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS); -+ if (ret != Z_OK) { -+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret); -+ return; -+ } -+ ret = zlib_inflateReset(&stream); -+ if (ret != Z_OK) { -+ warning("edward-775", "zlib_inflateReset returned %d\n", ret); -+ return; -+ } -+ -+ stream.next_in = src_first; -+ stream.avail_in = src_len; -+ stream.next_out = dst_first; -+ stream.avail_out = *dst_len; -+ -+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH); -+ /* -+ * Work around a bug in zlib, which sometimes wants to taste an extra -+ * byte when being used in the (undocumented) raw deflate mode. -+ * (From USAGI). -+ */ -+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) { -+ u8 zerostuff = 0; -+ stream.next_in = &zerostuff; -+ stream.avail_in = 1; -+ ret = zlib_inflate(&stream, Z_FINISH); -+ } -+ if (ret != Z_STREAM_END) { -+ warning("edward-776", "zlib_inflate returned %d\n", ret); -+ return; -+ } -+ *dst_len = stream.total_out; -+ return; -+} -+ -+/******************************************************************************/ -+/* lzo1 compression */ -+/******************************************************************************/ -+ -+static int lzo1_init(void) -+{ -+ return 0; -+} -+ -+static int lzo1_overrun(unsigned in_len) -+{ -+ return in_len / 64 + 16 + 3; -+} -+ -+static coa_t lzo1_alloc(tfm_action act) -+{ -+ int ret = 0; -+ coa_t coa = NULL; -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ case TFMA_READ: /* decompress */ -+ break; -+ default: -+ impossible("edward-877", -+ "trying to alloc workspace for unknown tfm action"); -+ } -+ if (ret) { -+ warning("edward-878", -+ "alloc workspace for lzo1 (tfm action = %d) failed\n", -+ act); -+ return ERR_PTR(ret); -+ } -+ return coa; -+} -+ -+static void lzo1_free(coa_t coa, tfm_action act) -+{ -+ assert("edward-879", coa != NULL); -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ vfree(coa); -+ break; -+ case TFMA_READ: /* decompress */ -+ impossible("edward-1304", -+ "trying to free non-allocated workspace"); -+ default: -+ impossible("edward-880", "unknown tfm action"); -+ } -+ return; -+} -+ -+static int lzo1_min_size_deflate(void) -+{ -+ return 256; -+} -+ -+static void -+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+ int result; -+ -+ assert("edward-846", coa != NULL); -+ assert("edward-847", src_len != 0); 
-+ -+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa); -+ if (unlikely(result != LZO_E_OK)) { -+ warning("edward-849", "lzo1x_1_compress failed\n"); -+ goto out; -+ } -+ if (*dst_len >= src_len) { -+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n"); -+ goto out; -+ } -+ return; -+ out: -+ *dst_len = src_len; -+ return; -+} -+ -+static void -+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+ int result; -+ -+ assert("edward-851", coa == NULL); -+ assert("edward-852", src_len != 0); -+ -+ result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len); -+ if (result != LZO_E_OK) -+ warning("edward-853", "lzo1x_1_decompress failed\n"); -+ return; -+} -+ -+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = { -+ [LZO1_COMPRESSION_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = LZO1_COMPRESSION_ID, -+ .pops = &compression_plugin_ops, -+ .label = "lzo1", -+ .desc = "lzo1 compression transform", -+ .linkage = {NULL, NULL} -+ }, -+ .init = lzo1_init, -+ .overrun = lzo1_overrun, -+ .alloc = lzo1_alloc, -+ .free = lzo1_free, -+ .min_size_deflate = lzo1_min_size_deflate, -+ .checksum = reiser4_adler32, -+ .compress = lzo1_compress, -+ .decompress = lzo1_decompress -+ }, -+ [GZIP1_COMPRESSION_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = GZIP1_COMPRESSION_ID, -+ .pops = &compression_plugin_ops, -+ .label = "gzip1", -+ .desc = "gzip1 compression transform", -+ .linkage = {NULL, NULL} -+ }, -+ .init = gzip1_init, -+ .overrun = gzip1_overrun, -+ .alloc = gzip1_alloc, -+ .free = gzip1_free, -+ .min_size_deflate = gzip1_min_size_deflate, -+ .checksum = reiser4_adler32, -+ .compress = gzip1_compress, -+ .decompress = gzip1_decompress -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.30/fs/reiser4/plugin/compress/compress.h ---- linux-2.6.30.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/compress/compress.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,43 @@ -+#if !defined( __FS_REISER4_COMPRESS_H__ ) -+#define __FS_REISER4_COMPRESS_H__ -+ -+#include <linux/types.h> -+#include <linux/string.h> -+ -+/* transform direction */ -+typedef enum { -+ TFMA_READ, /* decrypt, decompress */ -+ TFMA_WRITE, /* encrypt, compress */ -+ TFMA_LAST -+} tfm_action; -+ -+/* supported compression algorithms */ -+typedef enum { -+ LZO1_COMPRESSION_ID, -+ GZIP1_COMPRESSION_ID, -+ LAST_COMPRESSION_ID, -+} reiser4_compression_id; -+ -+/* the same as pgoff, but units are page clusters */ -+typedef unsigned long cloff_t; -+ -+/* working data of a (de)compression algorithm */ -+typedef void *coa_t; -+ -+/* table for all supported (de)compression algorithms */ -+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST]; -+ -+__u32 reiser4_adler32(char *data, __u32 len); -+ -+#endif /* __FS_REISER4_COMPRESS_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.30/fs/reiser4/plugin/compress/compress_mode.c ---- linux-2.6.30.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/compress/compress_mode.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,162 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* This file contains Reiser4 compression mode plugins. -+ -+ A compression mode plugin is a set of handlers called by the compressor -+ at flush time; they represent heuristics, including the ones -+ which serve to avoid compression of incompressible data, see -+ http://www.namesys.com/cryptcompress_design.html for more details. -+*/ -+#include "../../inode.h" -+#include "../plugin.h" -+ -+static int should_deflate_none(struct inode * inode, cloff_t index) -+{ -+ return 0; -+} -+ -+static int should_deflate_common(struct inode * inode, cloff_t index) -+{ -+ return compression_is_on(cryptcompress_inode_data(inode)); -+} -+ -+static int discard_hook_ultim(struct inode *inode, cloff_t index) -+{ -+ turn_off_compression(cryptcompress_inode_data(inode)); -+ return 0; -+} -+ -+static int discard_hook_lattd(struct inode *inode, cloff_t index) -+{ -+ struct cryptcompress_info * info = cryptcompress_inode_data(inode); -+ -+ assert("edward-1462", -+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR && -+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR); -+ -+ turn_off_compression(info); -+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR) -+ set_lattice_factor(info, get_lattice_factor(info) << 1); -+ return 0; -+} -+ -+static int accept_hook_lattd(struct inode *inode, cloff_t index) -+{ -+ turn_on_compression(cryptcompress_inode_data(inode)); -+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR); -+ return 0; -+} -+ -+/* Check on dynamic lattice, the adaptive compression mode which -+ defines the following behavior: -+ -+ Compression is on: try to compress everything and turn -+ it off, whenever a cluster is incompressible. -+ -+ Compression is off: try to compress clusters of indexes -+ k * FACTOR (k = 0, 1, 2, ...) and turn it on, if any of -+ them is compressible. If incompressible, then increase FACTOR */ -+ -+/* check if @index belongs to the one-dimensional lattice -+ of sparse factor @factor */ -+static int is_on_lattice(cloff_t index, int factor) -+{ -+ return (factor ?
index % factor == 0: index == 0); -+} -+ -+static int should_deflate_lattd(struct inode * inode, cloff_t index) -+{ -+ return should_deflate_common(inode, index) || -+ is_on_lattice(index, -+ get_lattice_factor -+ (cryptcompress_inode_data(inode))); -+} -+ -+/* compression mode_plugins */ -+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = { -+ [NONE_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = NONE_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "none", -+ .desc = "Compress nothing", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_none, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ }, -+ /* Check-on-dynamic-lattice adaptive compression mode */ -+ [LATTD_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = LATTD_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "lattd", -+ .desc = "Check on dynamic lattice", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_lattd, -+ .accept_hook = accept_hook_lattd, -+ .discard_hook = discard_hook_lattd -+ }, -+ /* Check-ultimately compression mode: -+ Turn off compression forever as soon as we meet -+ incompressible data */ -+ [ULTIM_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = ULTIM_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "ultim", -+ .desc = "Check ultimately", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_common, -+ .accept_hook = NULL, -+ .discard_hook = discard_hook_ultim -+ }, -+ /* Force-to-compress-everything compression mode */ -+ [FORCE_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = FORCE_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "force", -+ .desc = "Force to compress everything", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = NULL, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ }, -+ /* Convert-to-extent compression mode. -+ In this mode items will be converted to extents and management -+ will be passed to (classic) unix file plugin as soon as ->write() -+ detects that the first complete logical cluster (of index #0) is -+ incompressible. 
*/ -+ [CONVX_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = CONVX_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "conv", -+ .desc = "Convert to extent", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_common, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.30/fs/reiser4/plugin/compress/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/compress/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += compress_plugins.o -+ -+compress_plugins-objs := \ -+ compress.o \ -+ compress_mode.o -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.30/fs/reiser4/plugin/crypto/cipher.c ---- linux-2.6.30.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/crypto/cipher.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,37 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, -+ licensing governed by reiser4/README */ -+/* Reiser4 cipher transform plugins */ -+ -+#include "../../debug.h" -+#include "../plugin.h" -+ -+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = { -+ [NONE_CIPHER_ID] = { -+ .h = { -+ .type_id = REISER4_CIPHER_PLUGIN_TYPE, -+ .id = NONE_CIPHER_ID, -+ .pops = NULL, -+ .label = "none", -+ .desc = "no cipher transform", -+ .linkage = {NULL, NULL} -+ }, -+ .alloc = NULL, -+ .free = NULL, -+ .scale = NULL, -+ .align_stream = NULL, -+ .setkey = NULL, -+ .encrypt = NULL, -+ .decrypt = NULL -+ } -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.30/fs/reiser4/plugin/crypto/cipher.h ---- linux-2.6.30.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/crypto/cipher.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,55 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* This file contains definitions for the objects operated -+ by the reiser4 key manager, which is something like a keyring -+ wrapped by an appropriate reiser4 plugin */ -+ -+#if !defined( __FS_REISER4_CRYPT_H__ ) -+#define __FS_REISER4_CRYPT_H__ -+ -+#include <linux/crypto.h> -+ -+/* key info imported from user space */ -+struct reiser4_crypto_data { -+ int keysize; /* uninstantiated key size */ -+ __u8 * key; /* uninstantiated key */ -+ int keyid_size; /* size of passphrase */ -+ __u8 * keyid; /* passphrase */ -+}; -+ -+/* This object contains all needed infrastructure to implement -+ cipher transform. This is operated (allocating, inheriting, -+ validating, binding to host inode, etc.) by the reiser4 key manager. -+ -+ This info can be allocated in two cases: -+ 1. importing a key from user space. -+ 2. reading inode from disk */ -+struct reiser4_crypto_info { -+ struct inode * host; -+ struct crypto_hash * digest; -+ struct crypto_blkcipher * cipher; -+#if 0 -+ cipher_key_plugin * kplug; /* key manager */ -+#endif -+ __u8 * keyid; /* key fingerprint, created by digest plugin, -+ using uninstantiated key and passphrase.
-+ supposed to be stored in disk stat-data */ -+ int inst; /* this indicates if the cipher key is -+ instantiated (case 1 above) */ -+ int keysize; /* uninstantiated key size (bytes), supposed -+ to be stored in disk stat-data */ -+ int keyload_count; /* number of the objects which have this -+ crypto-stat attached */ -+}; -+ -+#endif /* __FS_REISER4_CRYPT_H__ */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.30/fs/reiser4/plugin/crypto/digest.c ---- linux-2.6.30.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/crypto/digest.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,58 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */ -+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */ -+#include "../../debug.h" -+#include "../plugin_header.h" -+#include "../plugin.h" -+#include "../file/cryptcompress.h" -+ -+#include <linux/types.h> -+ -+extern digest_plugin digest_plugins[LAST_DIGEST_ID]; -+ -+static struct crypto_hash * alloc_sha256 (void) -+{ -+#if REISER4_SHA256 -+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC); -+#else -+ warning("edward-1418", "sha256 unsupported"); -+ return ERR_PTR(-EINVAL); -+#endif -+} -+ -+static void free_sha256 (struct crypto_hash * tfm) -+{ -+#if REISER4_SHA256 -+ crypto_free_hash(tfm); -+#endif -+ return; -+} -+ -+/* digest plugins */ -+digest_plugin digest_plugins[LAST_DIGEST_ID] = { -+ [SHA256_32_DIGEST_ID] = { -+ .h = { -+ .type_id = REISER4_DIGEST_PLUGIN_TYPE, -+ .id = SHA256_32_DIGEST_ID, -+ .pops = NULL, -+ .label = "sha256_32", -+ .desc = "sha256_32 digest transform", -+ .linkage = {NULL, NULL} -+ }, -+ .fipsize = sizeof(__u32), -+ .alloc = alloc_sha256, -+ .free = free_sha256 -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.30/fs/reiser4/plugin/dir/dir.h ---- linux-2.6.30.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/dir/dir.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,36 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* this file contains declarations of methods implementing directory plugins */ -+ -+#if !defined( __REISER4_DIR_H__ ) -+#define __REISER4_DIR_H__ -+ -+/*#include "../../key.h" -+ -+#include <linux/fs.h>*/ -+ -+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */ -+ -+/* "hashed" directory methods of dir plugin */ -+void build_entry_key_hashed(const struct inode *, const struct qstr *, -+ reiser4_key *); -+ -+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */ -+ -+/* "seekable" directory methods of dir plugin */ -+void build_entry_key_seekable(const struct inode *, const struct qstr *, -+ reiser4_key *); -+ -+/* __REISER4_DIR_H__ */ -+#endif -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/dir/hashed_dir.c
linux-2.6.30/fs/reiser4/plugin/dir/hashed_dir.c ---- linux-2.6.30.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/dir/hashed_dir.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,81 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file -+ names to the files. */ -+ -+/* -+ * Hashed directory logically consists of persistent directory -+ * entries. Directory entry is a pair of a file name and a key of stat-data of -+ * a file that has this name in the given directory. -+ * -+ * Directory entries are stored in the tree in the form of directory -+ * items. Directory item should implement dir_entry_ops portion of item plugin -+ * interface (see plugin/item/item.h). Hashed directory interacts with -+ * directory item plugin exclusively through dir_entry_ops operations. -+ * -+ * Currently there are two implementations of directory items: "simple -+ * directory item" (plugin/item/sde.[ch]), and "compound directory item" -+ * (plugin/item/cde.[ch]) with the latter being the default. -+ * -+ * There is, however some delicate way through which directory code interferes -+ * with item plugin: key assignment policy. A key for a directory item is -+ * chosen by directory code, and as described in kassign.c, this key contains -+ * a portion of file name. Directory item uses this knowledge to avoid storing -+ * this portion of file name twice: in the key and in the directory item body. -+ * -+ */ -+ -+#include "../../inode.h" -+ -+void complete_entry_key(const struct inode *, const char *name, -+ int len, reiser4_key * result); -+ -+/* this is implementation of build_entry_key method of dir -+ plugin for HASHED_DIR_PLUGIN_ID -+ */ -+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is -+ * (or will be) in.*/ -+ const struct qstr *qname, /* name of file referenced -+ * by this entry */ -+ reiser4_key * result /* resulting key of directory -+ * entry */ ) -+{ -+ const char *name; -+ int len; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1140", qname != NULL); -+ assert("nikita-1141", qname->name != NULL); -+ assert("nikita-1142", result != NULL); -+ -+ name = qname->name; -+ len = qname->len; -+ -+ assert("nikita-2867", strlen(name) == len); -+ -+ reiser4_key_init(result); -+ /* locality of directory entry's key is objectid of parent -+ directory */ -+ set_key_locality(result, get_inode_oid(dir)); -+ /* minor packing locality is constant */ -+ set_key_type(result, KEY_FILE_NAME_MINOR); -+ /* dot is special case---we always want it to be first entry in -+ a directory. Actually, we just want to have smallest -+ directory entry. 
-+ */ -+ if (len == 1 && name[0] == '.') -+ return; -+ -+ /* initialize part of entry key which depends on file name */ -+ complete_entry_key(dir, name, len, result); -+} -+ -+/* Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.30/fs/reiser4/plugin/dir/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/dir/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += dir_plugins.o -+ -+dir_plugins-objs := \ -+ hashed_dir.o \ -+ seekable_dir.o -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.30/fs/reiser4/plugin/dir/seekable_dir.c ---- linux-2.6.30.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/dir/seekable_dir.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,46 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../../inode.h" -+ -+/* this is implementation of build_entry_key method of dir -+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID -+ This is for directories where we want repeatable and restartable readdir() -+ even in case 32bit user level struct dirent (readdir(3)). -+*/ -+void -+build_entry_key_seekable(const struct inode *dir, const struct qstr *name, -+ reiser4_key * result) -+{ -+ oid_t objectid; -+ -+ assert("nikita-2283", dir != NULL); -+ assert("nikita-2284", name != NULL); -+ assert("nikita-2285", name->name != NULL); -+ assert("nikita-2286", result != NULL); -+ -+ reiser4_key_init(result); -+ /* locality of directory entry's key is objectid of parent -+ directory */ -+ set_key_locality(result, get_inode_oid(dir)); -+ /* minor packing locality is constant */ -+ set_key_type(result, KEY_FILE_NAME_MINOR); -+ /* dot is special case---we always want it to be first entry in -+ a directory. Actually, we just want to have smallest -+ directory entry. -+ */ -+ if ((name->len == 1) && (name->name[0] == '.')) -+ return; -+ -+ /* objectid of key is 31 lowest bits of hash. */ -+ objectid = -+ inode_hash_plugin(dir)->hash(name->name, -+ (int)name->len) & 0x7fffffff; -+ -+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK)); -+ set_key_objectid(result, objectid); -+ -+ /* offset is always 0. 
*/ -+ set_key_offset(result, (__u64) 0); -+ return; -+} -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.30/fs/reiser4/plugin/dir_plugin_common.c ---- linux-2.6.30.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/dir_plugin_common.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,865 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for most of methods of -+ directory plugin -+*/ -+ -+#include "../inode.h" -+ -+int reiser4_find_entry(struct inode *dir, struct dentry *name, -+ lock_handle * , znode_lock_mode, reiser4_dir_entry_desc *); -+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, -+ reiser4_key * key); -+void check_light_weight(struct inode *inode, struct inode *parent); -+ -+/* this is common implementation of get_parent method of dir plugin -+ this is used by the NFS kernel server to "climb" up the directory tree -+ to check permissions -+ */ -+struct dentry *get_parent_common(struct inode *child) -+{ -+ struct super_block *s; -+ struct inode *parent; -+ struct dentry dotdot; -+ struct dentry *dentry; -+ reiser4_key key; -+ int result; -+ -+ /* -+ * lookup dotdot entry. -+ */ -+ -+ s = child->i_sb; -+ memset(&dotdot, 0, sizeof(dotdot)); -+ dotdot.d_name.name = ".."; -+ dotdot.d_name.len = 2; -+ dotdot.d_op = &get_super_private(s)->ops.dentry; -+ -+ result = reiser4_lookup_name(child, &dotdot, &key); -+ if (result != 0) -+ return ERR_PTR(result); -+ -+ parent = reiser4_iget(s, &key, 1); -+ if (!IS_ERR(parent)) { -+ /* -+ * FIXME-NIKITA dubious: attributes are inherited from @child -+ * to @parent. But: -+ * -+ * (*) this is the only thing we can do -+ * -+ * (*) attributes of light-weight object are inherited -+ * from a parent through which object was looked up first, -+ * so it is ambiguous anyway. -+ * -+ */ -+ check_light_weight(parent, child); -+ reiser4_iget_complete(parent); -+ dentry = d_obtain_alias(parent); -+ if (!IS_ERR(dentry)) -+ dentry->d_op = &get_super_private(s)->ops.dentry; -+ } else if (PTR_ERR(parent) == -ENOENT) -+ dentry = ERR_PTR(RETERR(-ESTALE)); -+ else -+ dentry = (void *)parent; -+ return dentry; -+} -+ -+/* this is common implementation of is_name_acceptable method of dir -+ plugin -+ */ -+int is_name_acceptable_common(const struct inode *inode, /* directory to check*/ -+ const char *name UNUSED_ARG, /* name to check */ -+ int len/* @name's length */) -+{ -+ assert("nikita-733", inode != NULL); -+ assert("nikita-734", name != NULL); -+ assert("nikita-735", len > 0); -+ -+ return len <= reiser4_max_filename_len(inode); -+} -+ -+/* there is no common implementation of build_entry_key method of dir -+ plugin.
See plugin/dir/hashed_dir.c:build_entry_key_hashed() or -+ plugin/dir/seekable.c:build_entry_key_seekable() for example -+*/ -+ -+/* this is common implementation of build_readdir_key method of dir -+ plugin -+ see reiser4_readdir_common for more details -+*/ -+int build_readdir_key_common(struct file *dir /* directory being read */ , -+ reiser4_key * result/* where to store key */) -+{ -+ reiser4_file_fsdata *fdata; -+ struct inode *inode; -+ -+ assert("nikita-1361", dir != NULL); -+ assert("nikita-1362", result != NULL); -+ assert("nikita-1363", dir->f_dentry != NULL); -+ inode = dir->f_dentry->d_inode; -+ assert("nikita-1373", inode != NULL); -+ -+ fdata = reiser4_get_file_fsdata(dir); -+ if (IS_ERR(fdata)) -+ return PTR_ERR(fdata); -+ assert("nikita-1364", fdata != NULL); -+ return extract_key_from_de_id(get_inode_oid(inode), -+ &fdata->dir.readdir.position. -+ dir_entry_key, result); -+ -+} -+ -+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset, -+ int adj); -+ -+/* this is common implementation of add_entry method of dir plugin -+*/ -+int reiser4_add_entry_common(struct inode *object, /* directory to add new name -+ * in */ -+ struct dentry *where, /* new name */ -+ reiser4_object_create_data * data, /* parameters of -+ * new object */ -+ reiser4_dir_entry_desc * entry /* parameters of -+ * new directory -+ * entry */) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ struct reiser4_dentry_fsdata *fsdata; -+ reiser4_block_nr reserve; -+ -+ assert("nikita-1114", object != NULL); -+ assert("nikita-1250", where != NULL); -+ -+ fsdata = reiser4_get_dentry_fsdata(where); -+ if (unlikely(IS_ERR(fsdata))) -+ return PTR_ERR(fsdata); -+ -+ reserve = inode_dir_plugin(object)->estimate.add_entry(object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ init_lh(&lh); -+ coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(coord); -+ -+ /* check for this entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK, -+ entry); -+ if (likely(result == -ENOENT)) { -+ /* add new entry. Just pass control to the directory -+ item plugin. */ -+ assert("nikita-1709", inode_dir_item_plugin(object)); -+ assert("nikita-2230", coord->node == lh.node); -+ reiser4_seal_done(&fsdata->dec.entry_seal); -+ result = -+ inode_dir_item_plugin(object)->s.dir.add_entry(object, -+ coord, &lh, -+ where, -+ entry); -+ if (result == 0) { -+ reiser4_adjust_dir_file(object, where, -+ fsdata->dec.pos + 1, +1); -+ INODE_INC_FIELD(object, i_size); -+ } -+ } else if (result == 0) { -+ assert("nikita-2232", coord->node == lh.node); -+ result = RETERR(-EEXIST); -+ } -+ done_lh(&lh); -+ -+ return result; -+} -+ -+/** -+ * rem_entry - remove entry from directory item -+ * @dir: -+ * @dentry: -+ * @entry: -+ * @coord: -+ * @lh: -+ * -+ * Checks that coordinate @coord is set properly and calls item plugin -+ * method to cut entry. 
-+ */ -+static int -+rem_entry(struct inode *dir, struct dentry *dentry, -+ reiser4_dir_entry_desc * entry, coord_t *coord, lock_handle * lh) -+{ -+ item_plugin *iplug; -+ struct inode *child; -+ -+ iplug = inode_dir_item_plugin(dir); -+ child = dentry->d_inode; -+ assert("nikita-3399", child != NULL); -+ -+ /* check that we are really destroying an entry for @child */ -+ if (REISER4_DEBUG) { -+ int result; -+ reiser4_key key; -+ -+ result = iplug->s.dir.extract_key(coord, &key); -+ if (result != 0) -+ return result; -+ if (get_key_objectid(&key) != get_inode_oid(child)) { -+ warning("nikita-3397", -+ "rem_entry: %#llx != %#llx\n", -+ get_key_objectid(&key), -+ (unsigned long long)get_inode_oid(child)); -+ return RETERR(-EIO); -+ } -+ } -+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry); -+} -+ -+/** -+ * reiser4_rem_entry_common - remove entry from a directory -+ * @dir: directory to remove entry from -+ * @where: name that is being removed -+ * @entry: description of entry being removed -+ * -+ * This is common implementation of rem_entry method of dir plugin. -+ */ -+int reiser4_rem_entry_common(struct inode *dir, -+ struct dentry *dentry, -+ reiser4_dir_entry_desc * entry) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ struct reiser4_dentry_fsdata *fsdata; -+ __u64 tograb; -+ -+ assert("nikita-1124", dir != NULL); -+ assert("nikita-1125", dentry != NULL); -+ -+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir); -+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED); -+ if (result != 0) -+ return RETERR(-ENOSPC); -+ -+ init_lh(&lh); -+ -+ /* check for this entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry); -+ fsdata = reiser4_get_dentry_fsdata(dentry); -+ if (IS_ERR(fsdata)) { -+ done_lh(&lh); -+ return PTR_ERR(fsdata); -+ } -+ -+ coord = &fsdata->dec.entry_coord; -+ -+ assert("nikita-3404", -+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) || -+ dir->i_size <= 1); -+ -+ coord_clear_iplug(coord); -+ if (result == 0) { -+ /* remove entry. Just pass control to the directory item -+ plugin. */ -+ assert("vs-542", inode_dir_item_plugin(dir)); -+ reiser4_seal_done(&fsdata->dec.entry_seal); -+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1); -+ result = -+ WITH_COORD(coord, -+ rem_entry(dir, dentry, entry, coord, &lh)); -+ if (result == 0) { -+ if (dir->i_size >= 1) -+ INODE_DEC_FIELD(dir, i_size); -+ else { -+ warning("nikita-2509", "Dir %llu is runt", -+ (unsigned long long) -+ get_inode_oid(dir)); -+ result = RETERR(-EIO); -+ } -+ -+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 || -+ dentry->d_inode->i_size != 2 || -+ inode_dir_plugin(dentry->d_inode) == NULL); -+ } -+ } -+ done_lh(&lh); -+ -+ return result; -+} -+ -+static reiser4_block_nr estimate_init(struct inode *parent, -+ struct inode *object); -+static int create_dot_dotdot(struct inode *object, struct inode *parent); -+ -+/* this is common implementation of init method of dir plugin -+ create "." and ".." 
entries -+*/ -+int reiser4_dir_init_common(struct inode *object, /* new directory */ -+ struct inode *parent, /* parent directory */ -+ reiser4_object_create_data * data /* info passed -+ * to us, this -+ * is filled by -+ * reiser4() -+ * syscall in -+ * particular */) -+{ -+ reiser4_block_nr reserve; -+ -+ assert("nikita-680", object != NULL); -+ assert("nikita-681", S_ISDIR(object->i_mode)); -+ assert("nikita-682", parent != NULL); -+ assert("nikita-684", data != NULL); -+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID); -+ assert("nikita-687", object->i_mode & S_IFDIR); -+ -+ reserve = estimate_init(parent, object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ return create_dot_dotdot(object, parent); -+} -+ -+/* this is common implementation of done method of dir plugin -+ remove "." entry -+*/ -+int reiser4_dir_done_common(struct inode *object/* object being deleted */) -+{ -+ int result; -+ reiser4_block_nr reserve; -+ struct dentry goodby_dots; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-1449", object != NULL); -+ -+ if (reiser4_inode_get_flag(object, REISER4_NO_SD)) -+ return 0; -+ -+ /* of course, this can be rewritten to sweep everything in one -+ reiser4_cut_tree(). */ -+ memset(&entry, 0, sizeof entry); -+ -+ /* FIXME: this done method is called from reiser4_delete_dir_common -+ * which reserved space already */ -+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED)) -+ return RETERR(-ENOSPC); -+ -+ memset(&goodby_dots, 0, sizeof goodby_dots); -+ entry.obj = goodby_dots.d_inode = object; -+ goodby_dots.d_name.name = "."; -+ goodby_dots.d_name.len = 1; -+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); -+ reiser4_free_dentry_fsdata(&goodby_dots); -+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT)) -+ warning("nikita-2252", "Cannot remove dot of %lli: %i", -+ (unsigned long long)get_inode_oid(object), result); -+ return 0; -+} -+ -+/* this is common implementation of attach method of dir plugin -+*/ -+int reiser4_attach_common(struct inode *child UNUSED_ARG, -+ struct inode *parent UNUSED_ARG) -+{ -+ assert("nikita-2647", child != NULL); -+ assert("nikita-2648", parent != NULL); -+ -+ return 0; -+} -+ -+/* this is common implementation of detach method of dir plugin -+ remove "..", decrease nlink on parent -+*/ -+int reiser4_detach_common(struct inode *object, struct inode *parent) -+{ -+ int result; -+ struct dentry goodby_dots; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-2885", object != NULL); -+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ memset(&entry, 0, sizeof entry); -+ -+ /* NOTE-NIKITA this only works if @parent is -the- parent of -+ @object, viz. object whose key is stored in dotdot -+ entry. Wouldn't work with hard-links on directories. */ -+ memset(&goodby_dots, 0, sizeof goodby_dots); -+ entry.obj = goodby_dots.d_inode = parent; -+ goodby_dots.d_name.name = ".."; -+ goodby_dots.d_name.len = 2; -+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); -+ reiser4_free_dentry_fsdata(&goodby_dots); -+ if (result == 0) { -+ /* the dot should be the only entry remaining at this time... */ -+ assert("nikita-3400", -+ object->i_size == 1 && object->i_nlink <= 2); -+#if 0 -+ /* and, together with the only name directory can have, they -+ * provides for the last 2 remaining references. 
If we get -+ * here as part of error handling during mkdir, @object -+ * possibly has no name yet, so its nlink == 1. If we get here -+ * from rename (targeting empty directory), it has no name -+ * already, so its nlink == 1. */ -+ assert("nikita-3401", -+ object->i_nlink == 2 || object->i_nlink == 1); -+#endif -+ -+ /* decrement nlink of directory removed ".." pointed -+ to */ -+ reiser4_del_nlink(parent, NULL, 0); -+ } -+ return result; -+} -+ -+/* this is common implementation of estimate.add_entry method of -+ dir plugin -+ estimation of adding entry which supposes that entry is inserting a -+ unit into item -+*/ -+reiser4_block_nr estimate_add_entry_common(const struct inode *inode) -+{ -+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.rem_entry method of dir -+ plugin -+*/ -+reiser4_block_nr estimate_rem_entry_common(const struct inode *inode) -+{ -+ return estimate_one_item_removal(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.unlink method of dir -+ plugin -+*/ -+reiser4_block_nr -+dir_estimate_unlink_common(const struct inode *parent, -+ const struct inode *object) -+{ -+ reiser4_block_nr res; -+ -+ /* hashed_rem_entry(object) */ -+ res = inode_dir_plugin(object)->estimate.rem_entry(object); -+ /* del_nlink(parent) */ -+ res += 2 * inode_file_plugin(parent)->estimate.update(parent); -+ -+ return res; -+} -+ -+/* -+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent() -+ * methods: if @inode is a light-weight file, setup its credentials -+ * that are not stored in the stat-data in this case -+ */ -+void check_light_weight(struct inode *inode, struct inode *parent) -+{ -+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) { -+ inode->i_uid = parent->i_uid; -+ inode->i_gid = parent->i_gid; -+ /* clear light-weight flag. If inode would be read by any -+ other name, [ug]id wouldn't change. */ -+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT); -+ } -+} -+ -+/* looks for name specified in @dentry in directory @parent and if name is -+ found - key of object found entry points to is stored in @entry->key */ -+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup -+ * for name in */ -+ struct dentry *dentry, /* name to look for */ -+ reiser4_key * key/* place to store key */) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ const char *name; -+ int len; -+ reiser4_dir_entry_desc entry; -+ struct reiser4_dentry_fsdata *fsdata; -+ -+ assert("nikita-1247", parent != NULL); -+ assert("nikita-1248", dentry != NULL); -+ assert("nikita-1123", dentry->d_name.name != NULL); -+ assert("vs-1486", -+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry); -+ -+ name = dentry->d_name.name; -+ len = dentry->d_name.len; -+ -+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len)) -+ /* some arbitrary error code to return */ -+ return RETERR(-ENAMETOOLONG); -+ -+ fsdata = reiser4_get_dentry_fsdata(dentry); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(coord); -+ init_lh(&lh); -+ -+ /* find entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, -+ &entry); -+ if (result == 0) { -+ /* entry was found, extract object key from it. */ -+ result = -+ WITH_COORD(coord, -+ item_plugin_by_coord(coord)->s.dir. 
-+ extract_key(coord, key)); -+ } -+ done_lh(&lh); -+ return result; -+ -+} -+ -+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */ -+static reiser4_block_nr -+estimate_init(struct inode *parent, struct inode *object) -+{ -+ reiser4_block_nr res = 0; -+ -+ assert("vpf-321", parent != NULL); -+ assert("vpf-322", object != NULL); -+ -+ /* hashed_add_entry(object) */ -+ res += inode_dir_plugin(object)->estimate.add_entry(object); -+ /* reiser4_add_nlink(object) */ -+ res += inode_file_plugin(object)->estimate.update(object); -+ /* hashed_add_entry(object) */ -+ res += inode_dir_plugin(object)->estimate.add_entry(object); -+ /* reiser4_add_nlink(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ -+ return res; -+} -+ -+/* helper function for reiser4_dir_init_common(). Create "." and ".." */ -+static int create_dot_dotdot(struct inode *object/* object to create dot and -+ * dotdot for */ , -+ struct inode *parent/* parent of @object */) -+{ -+ int result; -+ struct dentry dots_entry; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-688", object != NULL); -+ assert("nikita-689", S_ISDIR(object->i_mode)); -+ assert("nikita-691", parent != NULL); -+ -+ /* We store dot and dotdot as normal directory entries. This is -+ not necessary, because almost all information stored in them -+ is already in the stat-data of directory, the only thing -+ missing is objectid of grand-parent directory that can -+ easily be added there as extension. -+ -+ But it is done the way it is done, because not storing dot -+ and dotdot will lead to the following complications: -+ -+ . special case handling in ->lookup(). -+ . addition of another extension to the sd. -+ . dependency on key allocation policy for stat data. -+ -+ */ -+ -+ memset(&entry, 0, sizeof entry); -+ memset(&dots_entry, 0, sizeof dots_entry); -+ entry.obj = dots_entry.d_inode = object; -+ dots_entry.d_name.name = "."; -+ dots_entry.d_name.len = 1; -+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry); -+ reiser4_free_dentry_fsdata(&dots_entry); -+ -+ if (result == 0) { -+ result = reiser4_add_nlink(object, object, 0); -+ if (result == 0) { -+ entry.obj = dots_entry.d_inode = parent; -+ dots_entry.d_name.name = ".."; -+ dots_entry.d_name.len = 2; -+ result = reiser4_add_entry_common(object, -+ &dots_entry, NULL, &entry); -+ reiser4_free_dentry_fsdata(&dots_entry); -+ /* if creation of ".." failed, iput() will delete -+ object with ".". */ -+ if (result == 0) { -+ result = reiser4_add_nlink(parent, object, 0); -+ if (result != 0) -+ /* -+ * if we failed to bump i_nlink, try -+ * to remove ".." -+ */ -+ reiser4_detach_common(object, parent); -+ } -+ } -+ } -+ -+ if (result != 0) { -+ /* -+ * in the case of error, at least update stat-data so that -+ * ->i_nlink updates are not lingering. -+ */ -+ reiser4_update_sd(object); -+ reiser4_update_sd(parent); -+ } -+ -+ return result; -+} -+ -+/* -+ * return 0 iff @coord contains a directory entry for the file with the name -+ * @name.
*/ -+static int -+check_item(const struct inode *dir, const coord_t *coord, const char *name) -+{ -+ item_plugin *iplug; -+ char buf[DE_NAME_BUF_LEN]; -+ -+ iplug = item_plugin_by_coord(coord); -+ if (iplug == NULL) { -+ warning("nikita-1135", "Cannot get item plugin"); -+ print_coord("coord", coord, 1); -+ return RETERR(-EIO); -+ } else if (item_id_by_coord(coord) != -+ item_id_by_plugin(inode_dir_item_plugin(dir))) { -+ /* item id of current item does not match the id of items a -+ directory is built of */ -+ warning("nikita-1136", "Wrong item plugin"); -+ print_coord("coord", coord, 1); -+ return RETERR(-EIO); -+ } -+ assert("nikita-1137", iplug->s.dir.extract_name); -+ -+ /* Compare name stored in this entry with name we are looking for. -+ -+ NOTE-NIKITA Here should go code for support of something like -+ unicode, code tables, etc. -+ */ -+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf)); -+} -+ -+static int -+check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name) -+{ -+ return WITH_COORD(coord, check_item(dir, coord, name->name)); -+} -+ -+/* -+ * argument package used by entry_actor to scan entries with identical keys. -+ */ -+struct entry_actor_args { -+ /* name we are looking for */ -+ const char *name; -+ /* key of directory entry. entry_actor() scans through sequence of -+ * items/units having the same key */ -+ reiser4_key *key; -+ /* how many entries with duplicate keys were scanned so far. */ -+ int non_uniq; -+#if REISER4_USE_COLLISION_LIMIT -+ /* scan limit */ -+ int max_non_uniq; -+#endif -+ /* return parameter: set to true, if ->name wasn't found */ -+ int not_found; -+ /* what type of lock to take when moving to the next node during -+ * scan */ -+ znode_lock_mode mode; -+ -+ /* last coord that was visited during scan */ -+ coord_t last_coord; -+ /* last node locked during scan */ -+ lock_handle last_lh; -+ /* inode of directory */ -+ const struct inode *inode; -+}; -+ -+/* Function called by reiser4_find_entry() to look for given name -+ in the directory. */ -+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ , -+ coord_t *coord /* current coord */ , -+ lock_handle * lh /* current lock handle */ , -+ void *entry_actor_arg/* argument to scan */) -+{ -+ reiser4_key unit_key; -+ struct entry_actor_args *args; -+ -+ assert("nikita-1131", tree != NULL); -+ assert("nikita-1132", coord != NULL); -+ assert("nikita-1133", entry_actor_arg != NULL); -+ -+ args = entry_actor_arg; -+ ++args->non_uniq; -+#if REISER4_USE_COLLISION_LIMIT -+ if (args->non_uniq > args->max_non_uniq) { -+ args->not_found = 1; -+ /* hash collision overflow. */ -+ return RETERR(-EBUSY); -+ } -+#endif -+ -+ /* -+ * did we just reach the end of the sequence of items/units with -+ * identical keys? -+ */ -+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) { -+ assert("nikita-1791", -+ keylt(args->key, unit_key_by_coord(coord, &unit_key))); -+ args->not_found = 1; -+ args->last_coord.between = AFTER_UNIT; -+ return 0; -+ } -+ -+ coord_dup(&args->last_coord, coord); -+ /* -+ * did the scan just move to the next node?
-+ */ -+ if (args->last_lh.node != lh->node) { -+ int lock_result; -+ -+ /* -+ * if so, lock new node with the mode requested by the caller -+ */ -+ done_lh(&args->last_lh); -+ assert("nikita-1896", znode_is_any_locked(lh->node)); -+ lock_result = longterm_lock_znode(&args->last_lh, lh->node, -+ args->mode, ZNODE_LOCK_HIPRI); -+ if (lock_result != 0) -+ return lock_result; -+ } -+ return check_item(args->inode, coord, args->name); -+} -+ -+/* Look for given @name within directory @dir. -+ -+ This is called during lookup, creation and removal of directory -+ entries and on reiser4_rename_common -+ -+ First calculate key that directory entry for @name would have. Search -+ for this key in the tree. If such key is found, scan all items with -+ the same key, checking name in each directory entry along the way. -+*/ -+int reiser4_find_entry(struct inode *dir, /* directory to scan */ -+ struct dentry *de, /* name to search for */ -+ lock_handle * lh, /* resulting lock handle */ -+ znode_lock_mode mode, /* required lock mode */ -+ reiser4_dir_entry_desc * entry /* parameters of found -+ directory entry */) -+{ -+ const struct qstr *name; -+ seal_t *seal; -+ coord_t *coord; -+ int result; -+ __u32 flags; -+ struct de_location *dec; -+ struct reiser4_dentry_fsdata *fsdata; -+ -+ assert("nikita-1130", lh != NULL); -+ assert("nikita-1128", dir != NULL); -+ -+ name = &de->d_name; -+ assert("nikita-1129", name != NULL); -+ -+ /* dentry private data don't require lock, because dentry -+ manipulations are protected by i_mutex on parent. -+ -+ This is not so for inodes, because there is no -the- parent in -+ inode case. -+ */ -+ fsdata = reiser4_get_dentry_fsdata(de); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ dec = &fsdata->dec; -+ -+ coord = &dec->entry_coord; -+ coord_clear_iplug(coord); -+ seal = &dec->entry_seal; -+ /* compose key of directory entry for @name */ -+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key); -+ -+ if (reiser4_seal_is_set(seal)) { -+ /* check seal */ -+ result = reiser4_seal_validate(seal, coord, &entry->key, -+ lh, mode, ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ /* key was found. Check that it is really item we are -+ looking for. */ -+ result = check_entry(dir, coord, name); -+ if (result == 0) -+ return 0; -+ } -+ } -+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; -+ /* -+ * find place in the tree where directory item should be located. -+ */ -+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode, -+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, -+ flags, NULL/*ra_info */); -+ if (result == CBK_COORD_FOUND) { -+ struct entry_actor_args arg; -+ -+ /* fast path: no hash collisions */ -+ result = check_entry(dir, coord, name); -+ if (result == 0) { -+ reiser4_seal_init(seal, coord, &entry->key); -+ dec->pos = 0; -+ } else if (result > 0) { -+ /* Iterate through all units with the same keys. */ -+ arg.name = name->name; -+ arg.key = &entry->key; -+ arg.not_found = 0; -+ arg.non_uniq = 0; -+#if REISER4_USE_COLLISION_LIMIT -+ arg.max_non_uniq = max_hash_collisions(dir); -+ assert("nikita-2851", arg.max_non_uniq > 1); -+#endif -+ arg.mode = mode; -+ arg.inode = dir; -+ coord_init_zero(&arg.last_coord); -+ init_lh(&arg.last_lh); -+ -+ result = reiser4_iterate_tree -+ (reiser4_tree_by_inode(dir), -+ coord, lh, -+ entry_actor, &arg, mode, 1); -+ /* if end of the tree or extent was reached during -+ scanning. 
*/ -+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) { -+ /* step back */ -+ done_lh(lh); -+ -+ result = zload(arg.last_coord.node); -+ if (result == 0) { -+ coord_clear_iplug(&arg.last_coord); -+ coord_dup(coord, &arg.last_coord); -+ move_lh(lh, &arg.last_lh); -+ result = RETERR(-ENOENT); -+ zrelse(arg.last_coord.node); -+ --arg.non_uniq; -+ } -+ } -+ -+ done_lh(&arg.last_lh); -+ if (result == 0) -+ reiser4_seal_init(seal, coord, &entry->key); -+ -+ if (result == 0 || result == -ENOENT) { -+ assert("nikita-2580", arg.non_uniq > 0); -+ dec->pos = arg.non_uniq - 1; -+ } -+ } -+ } else -+ dec->pos = -1; -+ return result; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format40.c ---- linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format40.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,655 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../key.h" -+#include "../node/node.h" -+#include "../space/space_allocator.h" -+#include "disk_format40.h" -+#include "../plugin.h" -+#include "../../txnmgr.h" -+#include "../../jnode.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../wander.h" -+#include "../../inode.h" -+#include "../../ktxnmgrd.h" -+#include "../../status_flags.h" -+ -+#include <linux/types.h> /* for __u?? */ -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/buffer_head.h> -+ -+/* reiser 4.0 default disk layout */ -+ -+/* Amount of free blocks needed to perform release_format40 when fs gets -+ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header -+ & tx record. 
*/ -+#define RELEASE_RESERVED 4 -+ -+/* The greatest supported format40 version number */ -+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION -+ -+/* This flag indicates that backup should be updated -+ (the update is performed by fsck) */ -+#define FORMAT40_UPDATE_BACKUP (1 << 31) -+ -+/* functions to access fields of format40_disk_super_block */ -+static __u64 get_format40_block_count(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->block_count)); -+} -+ -+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->free_blocks)); -+} -+ -+static __u64 get_format40_root_block(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->root_block)); -+} -+ -+static __u16 get_format40_tree_height(const format40_disk_super_block * sb) -+{ -+ return le16_to_cpu(get_unaligned(&sb->tree_height)); -+} -+ -+static __u64 get_format40_file_count(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->file_count)); -+} -+ -+static __u64 get_format40_oid(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->oid)); -+} -+ -+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb) -+{ -+ return le32_to_cpu(get_unaligned(&sb->mkfs_id)); -+} -+ -+static __u64 get_format40_flags(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->flags)); -+} -+ -+static __u32 get_format40_version(const format40_disk_super_block * sb) -+{ -+ return le32_to_cpu(get_unaligned(&sb->version)) & -+ ~FORMAT40_UPDATE_BACKUP; -+} -+ -+static int update_backup_version(const format40_disk_super_block * sb) -+{ -+ return (le32_to_cpu(get_unaligned(&sb->version)) & -+ FORMAT40_UPDATE_BACKUP); -+} -+ -+static int update_disk_version(const format40_disk_super_block * sb) -+{ -+ return (get_format40_version(sb) < FORMAT40_VERSION); -+} -+ -+static int incomplete_compatibility(const format40_disk_super_block * sb) -+{ -+ return (get_format40_version(sb) > FORMAT40_VERSION); -+} -+ -+static format40_super_info *get_sb_info(struct super_block *super) -+{ -+ return &get_super_private(super)->u.format40; -+} -+ -+static int consult_diskmap(struct super_block *s) -+{ -+ format40_super_info *info; -+ journal_location *jloc; -+ -+ info = get_sb_info(s); -+ jloc = &get_super_private(s)->jloc; -+ /* Default format-specific locations, if there is nothing in -+ * diskmap */ -+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR; -+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR; -+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize; -+#ifdef CONFIG_REISER4_BADBLOCKS -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF, -+ &jloc->footer); -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH, -+ &jloc->header); -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER, -+ &info->loc.super); -+#endif -+ return 0; -+} -+ -+/* find any valid super block of disk_format40 (even if the first -+ super block is destroyed), will change block numbers of actual journal header/footer (jf/jh) -+ if needed */ -+static struct buffer_head *find_a_disk_format40_super_block(struct super_block -+ *s) -+{ -+ struct buffer_head *super_bh; -+ format40_disk_super_block *disk_sb; -+ format40_super_info *info; -+ -+ assert("umka-487", s != NULL); -+ -+ info = get_sb_info(s); -+ -+ super_bh = sb_bread(s, info->loc.super); -+ if (super_bh == NULL) -+ return ERR_PTR(RETERR(-EIO)); -+ -+ disk_sb = (format40_disk_super_block 
*) super_bh->b_data; -+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) { -+ brelse(super_bh); -+ return ERR_PTR(RETERR(-EINVAL)); -+ } -+ -+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count))); -+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) - -+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); -+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); -+ -+ return super_bh; -+} -+ -+/* find the most recent version of super block. This is called after journal is -+ replayed */ -+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG) -+{ -+ /* Here the most recent superblock copy has to be read. However, as -+ journal replay isn't complete, we are using -+ find_a_disk_format40_super_block() function. */ -+ return find_a_disk_format40_super_block(s); -+} -+ -+static int get_super_jnode(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *sb_jnode; -+ int ret; -+ -+ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super); -+ -+ ret = jload(sb_jnode); -+ -+ if (ret) { -+ reiser4_drop_io_head(sb_jnode); -+ return ret; -+ } -+ -+ pin_jnode_data(sb_jnode); -+ jrelse(sb_jnode); -+ -+ sbinfo->u.format40.sb_jnode = sb_jnode; -+ -+ return 0; -+} -+ -+static void done_super_jnode(struct super_block *s) -+{ -+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode; -+ -+ if (sb_jnode) { -+ unpin_jnode_data(sb_jnode); -+ reiser4_drop_io_head(sb_jnode); -+ } -+} -+ -+typedef enum format40_init_stage { -+ NONE_DONE = 0, -+ CONSULT_DISKMAP, -+ FIND_A_SUPER, -+ INIT_JOURNAL_INFO, -+ INIT_STATUS, -+ JOURNAL_REPLAY, -+ READ_SUPER, -+ KEY_CHECK, -+ INIT_OID, -+ INIT_TREE, -+ JOURNAL_RECOVER, -+ INIT_SA, -+ INIT_JNODE, -+ ALL_DONE -+} format40_init_stage; -+ -+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh) -+{ -+ format40_disk_super_block *sb_copy; -+ -+ sb_copy = kmalloc(sizeof(format40_disk_super_block), -+ reiser4_ctx_gfp_mask_get()); -+ if (sb_copy == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data), -+ sizeof(format40_disk_super_block)); -+ return sb_copy; -+} -+ -+static int check_key_format(const format40_disk_super_block *sb_copy) -+{ -+ if (!equi(REISER4_LARGE_KEY, -+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) { -+ warning("nikita-3228", "Key format mismatch. " -+ "Only %s keys are supported.", -+ REISER4_LARGE_KEY ? 
"large" : "small"); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/** -+ * try_init_format40 -+ * @super: -+ * @stage: -+ * -+ */ -+static int try_init_format40(struct super_block *super, -+ format40_init_stage *stage) -+{ -+ int result; -+ struct buffer_head *super_bh; -+ reiser4_super_info_data *sbinfo; -+ format40_disk_super_block *sb_copy; -+ tree_level height; -+ reiser4_block_nr root_block; -+ node_plugin *nplug; -+ -+ assert("vs-475", super != NULL); -+ assert("vs-474", get_super_private(super)); -+ -+ *stage = NONE_DONE; -+ -+ result = consult_diskmap(super); -+ if (result) -+ return result; -+ *stage = CONSULT_DISKMAP; -+ -+ super_bh = find_a_disk_format40_super_block(super); -+ if (IS_ERR(super_bh)) -+ return PTR_ERR(super_bh); -+ brelse(super_bh); -+ *stage = FIND_A_SUPER; -+ -+ /* ok, we are sure that filesystem format is a format40 format */ -+ -+ /* map jnodes for journal control blocks (header, footer) to disk */ -+ result = reiser4_init_journal_info(super); -+ if (result) -+ return result; -+ *stage = INIT_JOURNAL_INFO; -+ -+ /* ok, we are sure that filesystem format is a format40 format */ -+ /* Now check it's state */ -+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR); -+ if (result != 0 && result != -EINVAL) -+ /* -EINVAL means there is no magic, so probably just old -+ * fs. */ -+ return result; -+ *stage = INIT_STATUS; -+ -+ result = reiser4_status_query(NULL, NULL); -+ if (result == REISER4_STATUS_MOUNT_WARN) -+ notice("vpf-1363", "Warning: mounting %s with errors.", -+ super->s_id); -+ if (result == REISER4_STATUS_MOUNT_RO) -+ notice("vpf-1364", "Warning: mounting %s with fatal errors," -+ " forcing read-only mount.", super->s_id); -+ result = reiser4_journal_replay(super); -+ if (result) -+ return result; -+ *stage = JOURNAL_REPLAY; -+ -+ super_bh = read_super_block(super); -+ if (IS_ERR(super_bh)) -+ return PTR_ERR(super_bh); -+ *stage = READ_SUPER; -+ -+ /* allocate and make a copy of format40_disk_super_block */ -+ sb_copy = copy_sb(super_bh); -+ brelse(super_bh); -+ -+ if (IS_ERR(sb_copy)) -+ return PTR_ERR(sb_copy); -+ printk("reiser4: %s: found disk format 4.0.%u.\n", -+ super->s_id, -+ get_format40_version(sb_copy)); -+ if (incomplete_compatibility(sb_copy)) -+ printk("reiser4: Warning: The last completely supported " -+ "version of disk format40 is %u. 
Some objects of " -+ "the semantic tree can be unaccessible.\n", -+ FORMAT40_VERSION); -+ /* make sure that key format of kernel and filesystem match */ -+ result = check_key_format(sb_copy); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = KEY_CHECK; -+ -+ result = oid_init_allocator(super, get_format40_file_count(sb_copy), -+ get_format40_oid(sb_copy)); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = INIT_OID; -+ -+ /* get things necessary to init reiser4_tree */ -+ root_block = get_format40_root_block(sb_copy); -+ height = get_format40_tree_height(sb_copy); -+ nplug = node_plugin_by_id(NODE40_ID); -+ -+ /* initialize reiser4_super_info_data */ -+ sbinfo = get_super_private(super); -+ assert("", sbinfo->tree.super == super); -+ /* init reiser4_tree for the filesystem */ -+ result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = INIT_TREE; -+ -+ /* -+ * initialize reiser4_super_info_data with data from format40 super -+ * block -+ */ -+ sbinfo->default_uid = 0; -+ sbinfo->default_gid = 0; -+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy); -+ /* number of blocks in filesystem and reserved space */ -+ reiser4_set_block_count(super, get_format40_block_count(sb_copy)); -+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy); -+ sbinfo->version = get_format40_version(sb_copy); -+ kfree(sb_copy); -+ -+ if (update_backup_version(sb_copy)) -+ printk("reiser4: Warning: metadata backup is not updated. " -+ "Please run 'fsck.reiser4 --fix' on %s.\n", -+ super->s_id); -+ -+ sbinfo->fsuid = 0; -+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories -+ * are not supported */ -+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in -+ * layout 40 are -+ * of one -+ * plugin */ -+ /* sbinfo->tmgr is initialized already */ -+ -+ /* recover sb data which were logged separately from sb block */ -+ -+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls -+ * oid_init_allocator() and reiser4_set_free_blocks() with new -+ * data. What's the reason to call them above? */ -+ result = reiser4_journal_recover_sb_data(super); -+ if (result != 0) -+ return result; -+ *stage = JOURNAL_RECOVER; -+ -+ /* -+ * Set number of used blocks. The number of used blocks is not stored -+ * neither in on-disk super block nor in the journal footer blocks. At -+ * this moment actual values of total blocks and free block counters -+ * are set in the reiser4 super block (in-memory structure) and we can -+ * calculate number of used blocks from them. 
-+ */ -+ reiser4_set_data_blocks(super, -+ reiser4_block_count(super) - -+ reiser4_free_blocks(super)); -+ -+#if REISER4_DEBUG -+ sbinfo->min_blocks_used = 16 /* reserved area */ + -+ 2 /* super blocks */ + -+ 2 /* journal footer and header */ ; -+#endif -+ -+ /* init disk space allocator */ -+ result = sa_init_allocator(reiser4_get_space_allocator(super), -+ super, NULL); -+ if (result) -+ return result; -+ *stage = INIT_SA; -+ -+ result = get_super_jnode(super); -+ if (result == 0) -+ *stage = ALL_DONE; -+ return result; -+} -+ -+/* plugin->u.format.get_ready */ -+int init_format_format40(struct super_block *s, void *data UNUSED_ARG) -+{ -+ int result; -+ format40_init_stage stage; -+ -+ result = try_init_format40(s, &stage); -+ switch (stage) { -+ case ALL_DONE: -+ assert("nikita-3458", result == 0); -+ break; -+ case INIT_JNODE: -+ done_super_jnode(s); -+ case INIT_SA: -+ sa_destroy_allocator(reiser4_get_space_allocator(s), s); -+ case JOURNAL_RECOVER: -+ case INIT_TREE: -+ reiser4_done_tree(&get_super_private(s)->tree); -+ case INIT_OID: -+ case KEY_CHECK: -+ case READ_SUPER: -+ case JOURNAL_REPLAY: -+ case INIT_STATUS: -+ reiser4_status_finish(); -+ case INIT_JOURNAL_INFO: -+ reiser4_done_journal_info(s); -+ case FIND_A_SUPER: -+ case CONSULT_DISKMAP: -+ case NONE_DONE: -+ break; -+ default: -+ impossible("nikita-3457", "init stage: %i", stage); -+ } -+ -+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED) -+ return RETERR(-ENOSPC); -+ -+ return result; -+} -+ -+static void pack_format40_super(const struct super_block *s, char *data) -+{ -+ format40_disk_super_block *super_data = -+ (format40_disk_super_block *) data; -+ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("zam-591", data != NULL); -+ -+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)), -+ &super_data->free_blocks); -+ -+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block), -+ &super_data->root_block); -+ -+ put_unaligned(cpu_to_le64(oid_next(s)), -+ &super_data->oid); -+ -+ put_unaligned(cpu_to_le64(oids_used(s)), -+ &super_data->file_count); -+ -+ put_unaligned(cpu_to_le16(sbinfo->tree.height), -+ &super_data->tree_height); -+ -+ if (update_disk_version(super_data)) { -+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP; -+ -+ put_unaligned(cpu_to_le32(version), &super_data->version); -+ } -+} -+ -+/* plugin->u.format.log_super -+ return a jnode which should be added to transaction when the super block -+ gets logged */ -+jnode *log_super_format40(struct super_block *s) -+{ -+ jnode *sb_jnode; -+ -+ sb_jnode = get_super_private(s)->u.format40.sb_jnode; -+ -+ jload(sb_jnode); -+ -+ pack_format40_super(s, jdata(sb_jnode)); -+ -+ jrelse(sb_jnode); -+ -+ return sb_jnode; -+} -+ -+/* plugin->u.format.release */ -+int release_format40(struct super_block *s) -+{ -+ int ret; -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(s); -+ assert("zam-579", sbinfo != NULL); -+ -+ if (!rofs_super(s)) { -+ ret = reiser4_capture_super_block(s); -+ if (ret != 0) -+ warning("vs-898", -+ "reiser4_capture_super_block failed: %d", -+ ret); -+ -+ ret = txnmgr_force_commit_all(s, 1); -+ if (ret != 0) -+ warning("jmacd-74438", "txn_force failed: %d", ret); -+ -+ all_grabbed2free(); -+ } -+ -+ sa_destroy_allocator(&sbinfo->space_allocator, s); -+ reiser4_done_journal_info(s); -+ done_super_jnode(s); -+ -+ rcu_barrier(); -+ reiser4_done_tree(&sbinfo->tree); -+ /* call finish_rcu(), because some znode were "released" in -+ * reiser4_done_tree(). 
*/ -+ rcu_barrier(); -+ -+ return 0; -+} -+ -+#define FORMAT40_ROOT_LOCALITY 41 -+#define FORMAT40_ROOT_OBJECTID 42 -+ -+/* plugin->u.format.root_dir_key */ -+const reiser4_key *root_dir_key_format40(const struct super_block *super -+ UNUSED_ARG) -+{ -+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = { -+ .el = { -+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR), -+#if REISER4_LARGE_KEY -+ ON_LARGE_KEY(0ull,) -+#endif -+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID), -+ 0ull -+ } -+ }; -+ -+ return &FORMAT40_ROOT_DIR_KEY; -+} -+ -+/* plugin->u.format.check_open. -+ Check the opened object for validity. For now it checks only for a valid -+ oid and locality; this can be improved later, and its work may depend on -+ the mount options. */ -+int check_open_format40(const struct inode *object) -+{ -+ oid_t max, oid; -+ -+ max = oid_next(object->i_sb) - 1; -+ -+ /* Check the oid. */ -+ oid = get_inode_oid(object); -+ if (oid > max) { -+ warning("vpf-1360", "The object with the oid %llu " -+ "greater than the max used oid %llu found.", -+ (unsigned long long)oid, (unsigned long long)max); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* Check the locality. */ -+ oid = reiser4_inode_data(object)->locality_id; -+ if (oid > max) { -+ warning("vpf-1361", "The object with the locality %llu " -+ "greater than the max used oid %llu found.", -+ (unsigned long long)oid, (unsigned long long)max); -+ -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.format.version_update. -+ Perform all version update operations to bring the on-disk -+ format40_disk_super_block.version up to FORMAT40_VERSION. -+ */ -+int version_update_format40(struct super_block *super) { -+ txn_handle * trans; -+ lock_handle lh; -+ txn_atom *atom; -+ int ret; -+ -+ /* Nothing to do on an RO mount, or if the on-disk version is not older. */ -+ if (super->s_flags & MS_RDONLY) -+ return 0; -+ -+ if (get_super_private(super)->version >= FORMAT40_VERSION) -+ return 0; -+ -+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata " -+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' " -+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id); -+ -+ /* Mark the uber znode dirty to call log_super on write_logs. */ -+ init_lh(&lh); -+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI, &lh); -+ if (ret != 0) -+ return ret; -+ -+ znode_make_dirty(lh.node); -+ done_lh(&lh); -+ -+ /* Update the backup blocks. */ -+ -+ /* Force write_logs immediately. */ -+ trans = get_current_context()->trans; -+ atom = get_current_atom_locked(); -+ assert("vpf-1906", atom != NULL); -+ -+ spin_lock_txnh(trans); -+ return force_commit_atom(trans); -+}
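try_init_format40() and init_format_format40() above use a staged-initialization idiom: the init routine records how far it got in a stage variable, and the caller's switch falls through from the last completed stage to undo the work in reverse order. A minimal stand-alone sketch of the same idiom (hypothetical names, not from the patch):

#include <stdio.h>

typedef enum { NONE_DONE, A_DONE, ALL_DONE } init_stage;

static int init_a(void) { return 0; }		/* stand-in for a real init step */
static int init_b(void) { return -1; }		/* stand-in that fails here */
static void done_a(void) { puts("undoing step a"); }

static int try_init(init_stage *stage)
{
	*stage = NONE_DONE;
	if (init_a() != 0)
		return -1;
	*stage = A_DONE;
	if (init_b() != 0)
		return -1;
	*stage = ALL_DONE;
	return 0;
}

int main(void)
{
	init_stage stage;
	int ret = try_init(&stage);

	switch (stage) {
	case ALL_DONE:
		break;		/* success: keep everything */
	case A_DONE:
		done_a();	/* undo step a */
		/* fall through */
	case NONE_DONE:
		break;
	}
	return ret ? 1 : 0;
}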
-+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format40.h ---- linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format40.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,109 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* this file contains: -+ - definition of the ondisk super block of the standard disk layout for -+ reiser 4.0 (layout 40) -+ - definition of the layout 40 specific portion of the in-core super block -+ - declarations of functions implementing methods of the layout plugin -+ for layout 40 -+ - declarations of functions used to get/set fields in the layout 40 super block -+*/ -+ -+#ifndef __DISK_FORMAT40_H__ -+#define __DISK_FORMAT40_H__ -+ -+/* magic for default reiser4 layout */ -+#define FORMAT40_MAGIC "ReIsEr40FoRmAt" -+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE) -+ -+#include "../../dformat.h" -+ -+#include <linux/fs.h> /* for struct super_block */ -+ -+typedef enum { -+ FORMAT40_LARGE_KEYS -+} format40_flags; -+ -+/* ondisk super block for format 40. It is 512 bytes long */ -+typedef struct format40_disk_super_block { -+ /* 0 */ d64 block_count; -+ /* number of blocks in the filesystem */ -+ /* 8 */ d64 free_blocks; -+ /* number of free blocks */ -+ /* 16 */ d64 root_block; -+ /* filesystem tree root block */ -+ /* 24 */ d64 oid; -+ /* smallest free objectid */ -+ /* 32 */ d64 file_count; -+ /* number of files in the filesystem */ -+ /* 40 */ d64 flushes; -+ /* number of times the super block was -+ flushed. Needed if format 40 -+ will ever have more than one super block */ -+ /* 48 */ d32 mkfs_id; -+ /* unique identifier of fs */ -+ /* 52 */ char magic[16]; -+ /* magic string ReIsEr40FoRmAt */ -+ /* 68 */ d16 tree_height; -+ /* height of filesystem tree */ -+ /* 70 */ d16 formatting_policy; -+ /* not used anymore */ -+ /* 72 */ d64 flags; -+ /* 80 */ d32 version; -+ /* on-disk format version number, -+ initially assigned by mkfs as the greatest format40 -+ version number supported by reiser4progs and updated -+ at mount time in accordance with the greatest format40 -+ version number supported by the kernel. -+ It is used by fsck to catch possible corruption and -+ for various compatibility issues */ -+ /* 84 */ char not_used[428]; -+} format40_disk_super_block; -+ -+/* format 40 specific part of reiser4_super_info_data */ -+typedef struct format40_super_info { -+/* format40_disk_super_block actual_sb; */ -+ jnode *sb_jnode; -+ struct { -+ reiser4_block_nr super; -+ } loc; -+} format40_super_info; -+ -+/* Defines for journal header and footer respectively. */ -+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3) -+ -+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4) -+ -+#define FORMAT40_STATUS_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5) -+ -+/* Diskmap declarations */ -+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID)) -+#define FORMAT40_SUPER 1 -+#define FORMAT40_JH 2 -+#define FORMAT40_JF 3 -+ -+/* declarations of functions implementing methods of the layout plugin for -+ format 40. 
The functions themselves are in disk_format40.c */ -+extern int init_format_format40(struct super_block *, void *data); -+extern const reiser4_key *root_dir_key_format40(const struct super_block *); -+extern int release_format40(struct super_block *s); -+extern jnode *log_super_format40(struct super_block *s); -+extern int check_open_format40(const struct inode *object); -+extern int version_update_format40(struct super_block *super); -+ -+/* __DISK_FORMAT40_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format.c ---- linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,38 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../plugin_header.h" -+#include "disk_format40.h" -+#include "disk_format.h" -+#include "../plugin.h" -+ -+/* initialization of disk layout plugins */ -+disk_format_plugin format_plugins[LAST_FORMAT_ID] = { -+ [FORMAT40_ID] = { -+ .h = { -+ .type_id = REISER4_FORMAT_PLUGIN_TYPE, -+ .id = FORMAT40_ID, -+ .pops = NULL, -+ .label = "reiser40", -+ .desc = "standard disk layout for reiser40", -+ .linkage = {NULL, NULL} -+ }, -+ .init_format = init_format_format40, -+ .root_dir_key = root_dir_key_format40, -+ .release = release_format40, -+ .log_super = log_super_format40, -+ .check_open = check_open_format40, -+ .version_update = version_update_format40 -+ } -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format.h ---- linux-2.6.30.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/disk_format/disk_format.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,27 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* identifiers for disk layouts; they are also used as indexes in the array of -+ disk plugins */ -+ -+#if !defined( __REISER4_DISK_FORMAT_H__ ) -+#define __REISER4_DISK_FORMAT_H__ -+ -+typedef enum { -+ /* standard reiser4 disk layout plugin id */ -+ FORMAT40_ID, -+ LAST_FORMAT_ID -+} disk_format_id; -+ -+/* __REISER4_DISK_FORMAT_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.30/fs/reiser4/plugin/disk_format/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/disk_format/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += df_plugins.o -+ -+df_plugins-objs := \ -+ disk_format40.o \ -+ disk_format.o -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/fibration.c linux-2.6.30/fs/reiser4/plugin/fibration.c ---- linux-2.6.30.orig/fs/reiser4/plugin/fibration.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/fibration.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,175 @@ -+/* Copyright 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Directory fibrations */ -+ -+/* -+ * Suppose we have a directory tree with the sources of some project. During -+ * compilation .o files are created within this tree. This makes access -+ * to the original source files less efficient, because the source files are -+ * now "diluted" by object files: the default directory plugin uses a prefix -+ * of the file name as a part of the key for the directory entry (and this -+ * part is also inherited by the key of the file body). This means that -+ * foo.o will be located close to foo.c and foo.h in the tree. -+ * -+ * To avoid this effect, the directory plugin fills the highest 7 (originally -+ * unused) bits of the second component of the directory entry key -+ * with a bit-pattern that depends on the file name (see -+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called -+ * the "fibre". The fibre of the file name key is inherited by the key of the -+ * stat data and the keys of the file body (in the case of REISER4_LARGE_KEY). -+ * -+ * The fibre for a given file is chosen by the per-directory fibration -+ * plugin. Names within a given fibre are ordered lexicographically. -+ */ -+ -+#include "../debug.h" -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../super.h" -+#include "../inode.h" -+ -+#include <linux/types.h> -+ -+static const int fibre_shift = 57; -+ -+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift) -+ -+/* -+ * Trivial fibration: all files of a directory are just ordered -+ * lexicographically. -+ */ -+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len) -+{ -+ return FIBRE_NO(0); -+} -+ -+/* -+ * dot-o fibration: place .o files after all others. -+ */ -+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len) -+{ -+ /* special treatment for .*.o */ -+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.') -+ return FIBRE_NO(1); -+ else -+ return FIBRE_NO(0); -+} -+ -+/* -+ * ext.1 fibration: subdivide the directory into 128 fibres, one for each -+ * 7-bit extension character (file "foo.h" goes into fibre "h"), plus a -+ * default fibre for the rest. -+ */ -+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len) -+{ -+ if (len > 2 && name[len - 2] == '.') -+ return FIBRE_NO(name[len - 1]); -+ else -+ return FIBRE_NO(0); -+} -+ -+/* -+ * ext.3 fibration: try to separate files with different 3-character -+ * extensions from each other. 
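 * For example (illustration, not part of the patch): "movie.avi" ends in the
 * 4-character suffix ".avi", so fibre_ext_3() below places it in fibre
 * FIBRE_NO('a' + 'v' + 'i'), while fibre_dot_o() above maps "foo.o" to
 * FIBRE_NO(1) == (1ULL << 57); a name with no matching suffix falls back to
 * the default fibre FIBRE_NO(0), and names inside each fibre still sort
 * lexicographically.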
-+ */ -+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len) -+{ -+ if (len > 4 && name[len - 4] == '.') -+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]); -+ else -+ return FIBRE_NO(0); -+} -+ -+static int change_fibration(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ int result; -+ -+ assert("nikita-3503", inode != NULL); -+ assert("nikita-3504", plugin != NULL); -+ -+ assert("nikita-3505", is_reiser4_inode(inode)); -+ assert("nikita-3506", inode_dir_plugin(inode) != NULL); -+ assert("nikita-3507", -+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE); -+ -+ result = 0; -+ if (inode_fibration_plugin(inode) == NULL || -+ inode_fibration_plugin(inode)->h.id != plugin->h.id) { -+ if (is_dir_empty(inode) == 0) -+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_FIBRATION, plugin); -+ else -+ result = RETERR(-ENOTEMPTY); -+ -+ } -+ return result; -+} -+ -+static reiser4_plugin_ops fibration_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_fibration -+}; -+ -+/* fibration plugins */ -+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = { -+ [FIBRATION_LEXICOGRAPHIC] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_LEXICOGRAPHIC, -+ .pops = &fibration_plugin_ops, -+ .label = "lexicographic", -+ .desc = "no fibration", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_trivial -+ }, -+ [FIBRATION_DOT_O] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_DOT_O, -+ .pops = &fibration_plugin_ops, -+ .label = "dot-o", -+ .desc = "fibrate .o files separately", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_dot_o -+ }, -+ [FIBRATION_EXT_1] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_EXT_1, -+ .pops = &fibration_plugin_ops, -+ .label = "ext-1", -+ .desc = "fibrate file by single character extension", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_ext_1 -+ }, -+ [FIBRATION_EXT_3] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_EXT_3, -+ .pops = &fibration_plugin_ops, -+ .label = "ext-3", -+ .desc = "fibrate file by three character extension", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_ext_3 -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/fibration.h linux-2.6.30/fs/reiser4/plugin/fibration.h ---- linux-2.6.30.orig/fs/reiser4/plugin/fibration.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/fibration.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,37 @@ -+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Fibration plugin used by hashed directory plugin to segment content -+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */ -+ -+#if !defined(__FS_REISER4_PLUGIN_FIBRATION_H__) -+#define __FS_REISER4_PLUGIN_FIBRATION_H__ -+ -+#include "plugin_header.h" -+ -+typedef struct fibration_plugin { -+ /* generic fields */ -+ plugin_header h; -+ -+ __u64(*fibre) (const struct inode *dir, const char *name, int len); -+} fibration_plugin; -+ -+typedef enum { -+ FIBRATION_LEXICOGRAPHIC, -+ FIBRATION_DOT_O, -+ FIBRATION_EXT_1, -+ FIBRATION_EXT_3, -+ LAST_FIBRATION_ID -+} reiser4_fibration_id; -+ -+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.30/fs/reiser4/plugin/file/cryptcompress.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/cryptcompress.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,3807 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+/* -+ * Written by Edward Shishkin. -+ * -+ * Implementations of inode/file/address_space operations -+ * specific for cryptcompress file plugin which manages -+ * regular files built of compressed and(or) encrypted bodies. -+ * See http://dev.namesys.com/CryptcompressPlugin for details. -+ */ -+ -+#include "../../inode.h" -+#include "../cluster.h" -+#include "../object.h" -+#include "../../tree_walk.h" -+#include "cryptcompress.h" -+ -+#include <linux/pagevec.h> -+#include <asm/uaccess.h> -+#include <linux/swap.h> -+#include <linux/writeback.h> -+#include <linux/random.h> -+#include <linux/scatterlist.h> -+ -+/* -+ Managing primary and secondary caches by Reiser4 -+ cryptcompress file plugin. Synchronization scheme. -+ -+ -+ +------------------+ -+ +------------------->| tfm stream | -+ | | (compressed data)| -+ flush | +------------------+ -+ +-----------------+ | -+ |(->)longterm lock| V -+--+ writepages() | | +-***-+ reiser4 +---+ -+ | | +--+ | *** | storage tree | | -+ | | | +-***-+ (primary cache)| | -+u | write() (secondary| cache) V / | \ | | -+s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d | -+e | | | |page cluster | | | **disk cluster** | | i | -+r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s | -+ | read() ^ ^ | | k | -+ | | (->)longterm lock| | page_io()| | -+ | | +------+ | | -+--+ readpages() | | +---+ -+ | V -+ | +------------------+ -+ +--------------------| tfm stream | -+ | (plain text) | -+ +------------------+ -+*/ -+ -+/* get cryptcompress specific portion of inode */ -+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info; -+} -+ -+/* plugin->u.file.init_inode_data */ -+void init_inode_data_cryptcompress(struct inode *inode, -+ reiser4_object_create_data * crd, -+ int create) -+{ -+ struct cryptcompress_info *data; -+ -+ data = cryptcompress_inode_data(inode); -+ assert("edward-685", data != NULL); -+ -+ memset(data, 0, sizeof(*data)); -+ -+ mutex_init(&data->checkin_mutex); -+ data->trunc_index = ULONG_MAX; -+ turn_on_compression(data); -+ set_lattice_factor(data, MIN_LATTICE_FACTOR); -+ init_inode_ordering(inode, crd, create); -+} -+ -+/* The following is a part of reiser4 cipher key manager -+ which is called when opening/creating a cryptcompress file */ -+ -+/* get/set cipher key info */ -+struct reiser4_crypto_info * inode_crypto_info (struct inode * inode) -+{ -+ assert("edward-90", inode != NULL); -+ assert("edward-91", reiser4_inode_data(inode) != NULL); -+ return cryptcompress_inode_data(inode)->crypt; -+} -+ -+static void set_inode_crypto_info (struct inode * inode, -+ struct reiser4_crypto_info * info) -+{ -+ cryptcompress_inode_data(inode)->crypt = info; -+} -+ -+/* allocate a cipher key info */ -+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode) -+{ -+ struct reiser4_crypto_info *info; -+ int fipsize; -+ -+ info = 
kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get()); -+ if (!info) -+ return ERR_PTR(-ENOMEM); -+ -+ fipsize = inode_digest_plugin(inode)->fipsize; -+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get()); -+ if (!info->keyid) { -+ kfree(info); -+ return ERR_PTR(-ENOMEM); -+ } -+ info->host = inode; -+ return info; -+} -+ -+#if 0 -+/* allocate/free low-level info for cipher and digest -+ transforms */ -+static int alloc_crypto_tfms(struct reiser4_crypto_info * info) -+{ -+ struct crypto_blkcipher * ctfm = NULL; -+ struct crypto_hash * dtfm = NULL; -+ cipher_plugin * cplug = inode_cipher_plugin(info->host); -+ digest_plugin * dplug = inode_digest_plugin(info->host); -+ -+ if (cplug->alloc) { -+ ctfm = cplug->alloc(); -+ if (IS_ERR(ctfm)) { -+ warning("edward-1364", -+ "Can not allocate info for %s\n", -+ cplug->h.desc); -+ return RETERR(PTR_ERR(ctfm)); -+ } -+ } -+ info_set_cipher(info, ctfm); -+ if (dplug->alloc) { -+ dtfm = dplug->alloc(); -+ if (IS_ERR(dtfm)) { -+ warning("edward-1365", -+ "Can not allocate info for %s\n", -+ dplug->h.desc); -+ goto unhappy_with_digest; -+ } -+ } -+ info_set_digest(info, dtfm); -+ return 0; -+ unhappy_with_digest: -+ if (cplug->free) { -+ cplug->free(ctfm); -+ info_set_cipher(info, NULL); -+ } -+ return RETERR(PTR_ERR(dtfm)); -+} -+#endif -+ -+static void -+free_crypto_tfms(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1366", info != NULL); -+ if (!info_get_cipher(info)) { -+ assert("edward-1601", !info_get_digest(info)); -+ return; -+ } -+ inode_cipher_plugin(info->host)->free(info_get_cipher(info)); -+ info_set_cipher(info, NULL); -+ inode_digest_plugin(info->host)->free(info_get_digest(info)); -+ info_set_digest(info, NULL); -+ return; -+} -+ -+#if 0 -+/* create a key fingerprint for disk stat-data */ -+static int create_keyid (struct reiser4_crypto_info * info, -+ struct reiser4_crypto_data * data) -+{ -+ int ret = -ENOMEM; -+ size_t blk, pad; -+ __u8 * dmem; -+ __u8 * cmem; -+ struct hash_desc ddesc; -+ struct blkcipher_desc cdesc; -+ struct scatterlist sg; -+ -+ assert("edward-1367", info != NULL); -+ assert("edward-1368", info->keyid != NULL); -+ -+ ddesc.tfm = info_get_digest(info); -+ ddesc.flags = 0; -+ cdesc.tfm = info_get_cipher(info); -+ cdesc.flags = 0; -+ -+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm), -+ reiser4_ctx_gfp_mask_get()); -+ if (!dmem) -+ goto exit1; -+ -+ blk = crypto_blkcipher_blocksize(cdesc.tfm); -+ -+ pad = data->keyid_size % blk; -+ pad = (pad ? 
blk - pad : 0); -+ -+ cmem = kmalloc((size_t)data->keyid_size + pad, -+ reiser4_ctx_gfp_mask_get()); -+ if (!cmem) -+ goto exit2; -+ memcpy(cmem, data->keyid, data->keyid_size); -+ memset(cmem + data->keyid_size, 0, pad); -+ -+ sg_init_one(&sg, cmem, data->keyid_size + pad); -+ -+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg, -+ data->keyid_size + pad); -+ if (ret) { -+ warning("edward-1369", -+ "encryption failed flags=%x\n", cdesc.flags); -+ goto exit3; -+ } -+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem); -+ if (ret) { -+ warning("edward-1602", -+ "digest failed flags=%x\n", ddesc.flags); -+ goto exit3; -+ } -+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize); -+ exit3: -+ kfree(cmem); -+ exit2: -+ kfree(dmem); -+ exit1: -+ return ret; -+} -+#endif -+ -+static void destroy_keyid(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1370", info != NULL); -+ assert("edward-1371", info->keyid != NULL); -+ kfree(info->keyid); -+ return; -+} -+ -+static void __free_crypto_info (struct inode * inode) -+{ -+ struct reiser4_crypto_info * info = inode_crypto_info(inode); -+ assert("edward-1372", info != NULL); -+ -+ free_crypto_tfms(info); -+ destroy_keyid(info); -+ kfree(info); -+} -+ -+#if 0 -+static void instantiate_crypto_info(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1373", info != NULL); -+ assert("edward-1374", info->inst == 0); -+ info->inst = 1; -+} -+#endif -+ -+static void uninstantiate_crypto_info(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1375", info != NULL); -+ info->inst = 0; -+} -+ -+#if 0 -+static int is_crypto_info_instantiated(struct reiser4_crypto_info * info) -+{ -+ return info->inst; -+} -+ -+static int inode_has_cipher_key(struct inode * inode) -+{ -+ assert("edward-1376", inode != NULL); -+ return inode_crypto_info(inode) && -+ is_crypto_info_instantiated(inode_crypto_info(inode)); -+} -+#endif -+ -+static void free_crypto_info (struct inode * inode) -+{ -+ uninstantiate_crypto_info(inode_crypto_info(inode)); -+ __free_crypto_info(inode); -+} -+ -+static int need_cipher(struct inode * inode) -+{ -+ return inode_cipher_plugin(inode) != -+ cipher_plugin_by_id(NONE_CIPHER_ID); -+} -+ -+/* Parse @data which contains a (uninstantiated) cipher key imported -+ from user space, create a low-level cipher info and attach it to -+ the @object. 
If success, then info contains an instantiated key */ -+#if 0 -+struct reiser4_crypto_info * create_crypto_info(struct inode * object, -+ struct reiser4_crypto_data * data) -+{ -+ int ret; -+ struct reiser4_crypto_info * info; -+ -+ assert("edward-1377", data != NULL); -+ assert("edward-1378", need_cipher(object)); -+ -+ if (inode_file_plugin(object) != -+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID)) -+ return ERR_PTR(-EINVAL); -+ -+ info = reiser4_alloc_crypto_info(object); -+ if (IS_ERR(info)) -+ return info; -+ ret = alloc_crypto_tfms(info); -+ if (ret) -+ goto err; -+ /* instantiating a key */ -+ ret = crypto_blkcipher_setkey(info_get_cipher(info), -+ data->key, -+ data->keysize); -+ if (ret) { -+ warning("edward-1379", -+ "setkey failed flags=%x", -+ crypto_blkcipher_get_flags(info_get_cipher(info))); -+ goto err; -+ } -+ info->keysize = data->keysize; -+ ret = create_keyid(info, data); -+ if (ret) -+ goto err; -+ instantiate_crypto_info(info); -+ return info; -+ err: -+ __free_crypto_info(object); -+ return ERR_PTR(ret); -+} -+#endif -+ -+/* increment/decrement a load counter when -+ attaching/detaching the crypto-stat to any object */ -+static void load_crypto_info(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1380", info != NULL); -+ inc_keyload_count(info); -+} -+ -+static void unload_crypto_info(struct inode * inode) -+{ -+ struct reiser4_crypto_info * info = inode_crypto_info(inode); -+ assert("edward-1381", info->keyload_count > 0); -+ -+ dec_keyload_count(inode_crypto_info(inode)); -+ if (info->keyload_count == 0) -+ /* final release */ -+ free_crypto_info(inode); -+} -+ -+/* attach/detach an existing crypto-stat */ -+void reiser4_attach_crypto_info(struct inode * inode, -+ struct reiser4_crypto_info * info) -+{ -+ assert("edward-1382", inode != NULL); -+ assert("edward-1383", info != NULL); -+ assert("edward-1384", inode_crypto_info(inode) == NULL); -+ -+ set_inode_crypto_info(inode, info); -+ load_crypto_info(info); -+} -+ -+/* returns true, if crypto stat can be attached to the @host */ -+#if REISER4_DEBUG -+static int host_allows_crypto_info(struct inode * host) -+{ -+ int ret; -+ file_plugin * fplug = inode_file_plugin(host); -+ -+ switch (fplug->h.id) { -+ case CRYPTCOMPRESS_FILE_PLUGIN_ID: -+ ret = 1; -+ break; -+ default: -+ ret = 0; -+ } -+ return ret; -+} -+#endif /* REISER4_DEBUG */ -+ -+static void reiser4_detach_crypto_info(struct inode * inode) -+{ -+ assert("edward-1385", inode != NULL); -+ assert("edward-1386", host_allows_crypto_info(inode)); -+ -+ if (inode_crypto_info(inode)) -+ unload_crypto_info(inode); -+ set_inode_crypto_info(inode, NULL); -+} -+ -+#if 0 -+ -+/* compare fingerprints of @child and @parent */ -+static int keyid_eq(struct reiser4_crypto_info * child, -+ struct reiser4_crypto_info * parent) -+{ -+ return !memcmp(child->keyid, -+ parent->keyid, -+ info_digest_plugin(parent)->fipsize); -+} -+ -+/* check if a crypto-stat (which is bound to @parent) can be inherited */ -+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent) -+{ -+ if (!need_cipher(child)) -+ return 0; -+ /* the child is created */ -+ if (!inode_crypto_info(child)) -+ return 1; -+ /* the child is looked up */ -+ if (!inode_crypto_info(parent)) -+ return 0; -+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) && -+ inode_digest_plugin(child) == inode_digest_plugin(parent) && -+ inode_crypto_info(child)->keysize == -+ inode_crypto_info(parent)->keysize && -+ keyid_eq(inode_crypto_info(child), inode_crypto_info(parent))); -+} 
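The load counter used by load_crypto_info()/unload_crypto_info() above follows a plain attach/detach refcounting pattern: every attach takes a reference and the last detach frees the key info. A minimal stand-alone sketch of that pattern (hypothetical names, not from the patch):

#include <assert.h>
#include <stdlib.h>

struct keyinfo {
	int keyload_count;	/* number of objects currently using the key */
};

static void attach_key(struct keyinfo *info)
{
	info->keyload_count++;	/* cf. load_crypto_info() */
}

static void detach_key(struct keyinfo *info)
{
	assert(info->keyload_count > 0);
	if (--info->keyload_count == 0)
		free(info);	/* final release, cf. unload_crypto_info() */
}

int main(void)
{
	struct keyinfo *info = calloc(1, sizeof(*info));

	if (!info)
		return 1;
	attach_key(info);	/* first user */
	attach_key(info);	/* second user */
	detach_key(info);
	detach_key(info);	/* last detach frees the info */
	return 0;
}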
-+#endif -+ -+/* helper functions for the ->create() method of the cryptcompress plugin */ -+static int inode_set_crypto(struct inode * object) -+{ -+ reiser4_inode * info; -+ if (!inode_crypto_info(object)) { -+ if (need_cipher(object)) -+ return RETERR(-EINVAL); -+ /* the file is not to be encrypted */ -+ return 0; -+ } -+ info = reiser4_inode_data(object); -+ info->extmask |= (1 << CRYPTO_STAT); -+ return 0; -+} -+ -+static int inode_init_compression(struct inode * object) -+{ -+ int result = 0; -+ assert("edward-1461", object != NULL); -+ if (inode_compression_plugin(object)->init) -+ result = inode_compression_plugin(object)->init(); -+ return result; -+} -+ -+static int inode_check_cluster(struct inode * object) -+{ -+ assert("edward-696", object != NULL); -+ -+ if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) { -+ warning("edward-1320", "Cannot support '%s' " -+ "logical clusters (less than page size)", -+ inode_cluster_plugin(object)->h.label); -+ return RETERR(-EINVAL); -+ } -+ if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))) { -+ warning("edward-1463", "Cannot support '%s' " -+ "logical clusters (too big for transform)", -+ inode_cluster_plugin(object)->h.label); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/* plugin->destroy_inode() */ -+void destroy_inode_cryptcompress(struct inode * inode) -+{ -+ assert("edward-1464", INODE_PGCOUNT(inode) == 0); -+ reiser4_detach_crypto_info(inode); -+ return; -+} -+ -+/* plugin->create_object(): -+. install plugins -+. attach crypto info if specified -+. attach compression info if specified -+. attach cluster info -+*/ -+int create_object_cryptcompress(struct inode *object, struct inode *parent, -+ reiser4_object_create_data * data) -+{ -+ int result; -+ reiser4_inode *info; -+ -+ assert("edward-23", object != NULL); -+ assert("edward-24", parent != NULL); -+ assert("edward-30", data != NULL); -+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID); -+ -+ info = reiser4_inode_data(object); -+ -+ assert("edward-29", info != NULL); -+ -+ /* set file bit */ -+ info->plugin_mask |= (1 << PSET_FILE); -+ -+ /* set crypto */ -+ result = inode_set_crypto(object); -+ if (result) -+ goto error; -+ /* set compression */ -+ result = inode_init_compression(object); -+ if (result) -+ goto error; -+ /* set cluster */ -+ result = inode_check_cluster(object); -+ if (result) -+ goto error; -+ -+ /* save everything in the disk stat-data */ -+ result = write_sd_by_inode_common(object); -+ if (!result) -+ return 0; -+ error: -+ reiser4_detach_crypto_info(object); -+ return result; -+} -+ -+/* plugin->open() */ -+int open_cryptcompress(struct inode * inode, struct file * file) -+{ -+ return 0; -+} -+ -+/* returns the blocksize attribute of the cipher algorithm */ -+static unsigned int -+cipher_blocksize(struct inode * inode) -+{ -+ assert("edward-758", need_cipher(inode)); -+ assert("edward-1400", inode_crypto_info(inode) != NULL); -+ return crypto_blkcipher_blocksize -+ (info_get_cipher(inode_crypto_info(inode))); -+} -+ -+/* returns the offset translated by the scale factor of the crypto-algorithm */ -+static loff_t inode_scaled_offset (struct inode * inode, -+ const loff_t src_off /* input offset */) -+{ -+ assert("edward-97", inode != NULL); -+ -+ if (!need_cipher(inode) || -+ src_off == get_key_offset(reiser4_min_key()) || -+ src_off == get_key_offset(reiser4_max_key())) -+ return src_off; -+ -+ return inode_cipher_plugin(inode)->scale(inode, -+ 
cipher_blocksize(inode), -+ src_off); -+} -+ -+/* returns disk cluster size */ -+size_t inode_scaled_cluster_size(struct inode * inode) -+{ -+ assert("edward-110", inode != NULL); -+ -+ return inode_scaled_offset(inode, inode_cluster_size(inode)); -+} -+ -+/* set number of cluster pages */ -+static void set_cluster_nrpages(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ struct reiser4_slide * win; -+ -+ assert("edward-180", clust != NULL); -+ assert("edward-1040", inode != NULL); -+ -+ clust->old_nrpages = size_in_pages(lbytes(clust->index, inode)); -+ win = clust->win; -+ if (!win) { -+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode)); -+ return; -+ } -+ assert("edward-1176", clust->op != LC_INVAL); -+ assert("edward-1064", win->off + win->count + win->delta != 0); -+ -+ if (win->stat == HOLE_WINDOW && -+ win->off == 0 && win->count == inode_cluster_size(inode)) { -+ /* special case: writing a "fake" logical cluster */ -+ clust->nr_pages = 0; -+ return; -+ } -+ clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta, -+ lbytes(clust->index, inode))); -+ return; -+} -+ -+/* plugin->key_by_inode() -+ build key of a disk cluster */ -+int key_by_inode_cryptcompress(struct inode *inode, loff_t off, -+ reiser4_key * key) -+{ -+ assert("edward-64", inode != 0); -+ -+ if (likely(off != get_key_offset(reiser4_max_key()))) -+ off = off_to_clust_to_off(off, inode); -+ if (inode_crypto_info(inode)) -+ off = inode_scaled_offset(inode, off); -+ -+ key_by_inode_and_offset_common(inode, 0, key); -+ set_key_offset(key, (__u64)off); -+ return 0; -+} -+ -+/* plugin->flow_by_inode() */ -+/* flow is used to read/write disk clusters */ -+int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf, -+ int user, /* 1: @buf is of user space, -+ 0: kernel space */ -+ loff_t size, /* @buf size */ -+ loff_t off, /* offset to start io from */ -+ rw_op op, /* READ or WRITE */ -+ flow_t * f /* resulting flow */) -+{ -+ assert("edward-436", f != NULL); -+ assert("edward-149", inode != NULL); -+ assert("edward-150", inode_file_plugin(inode) != NULL); -+ assert("edward-1465", user == 0); /* we use flow to read/write -+ disk clusters located in -+ kernel space */ -+ f->length = size; -+ memcpy(&f->data, &buf, sizeof(buf)); -+ f->user = user; -+ f->op = op; -+ -+ return key_by_inode_cryptcompress(inode, off, &f->key); -+} -+ -+static int -+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key, -+ znode_lock_mode lock_mode) -+{ -+ coord_t *coord; -+ -+ assert("edward-704", hint != NULL); -+ assert("edward-1089", !hint_is_valid(hint)); -+ assert("edward-706", hint->lh.owner == NULL); -+ -+ coord = &hint->ext_coord.coord; -+ -+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) -+ /* hint either not set or set by different operation */ -+ return RETERR(-E_REPEAT); -+ -+ if (get_key_offset(key) != hint->offset) -+ /* hint is set for different key */ -+ return RETERR(-E_REPEAT); -+ -+ assert("edward-707", reiser4_schedulable()); -+ -+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, -+ key, &hint->lh, lock_mode, -+ ZNODE_LOCK_LOPRI); -+} -+ -+/* reserve disk space when writing a logical cluster */ -+static int reserve4cluster(struct inode *inode, struct cluster_handle *clust) -+{ -+ int result = 0; -+ -+ assert("edward-965", reiser4_schedulable()); -+ assert("edward-439", inode != NULL); -+ assert("edward-440", clust != NULL); -+ assert("edward-441", clust->pages != NULL); -+ -+ if (clust->nr_pages == 0) { -+ assert("edward-1152", 
clust->win != NULL); -+ assert("edward-1153", clust->win->stat == HOLE_WINDOW); -+ /* don't reserve disk space for a fake logical cluster */ -+ return 0; -+ } -+ assert("edward-442", jprivate(clust->pages[0]) != NULL); -+ -+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) + -+ estimate_update_cluster(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ return result; -+ clust->reserved = 1; -+ grabbed2cluster_reserved(estimate_insert_cluster(inode) + -+ estimate_update_cluster(inode)); -+#if REISER4_DEBUG -+ clust->reserved_prepped = estimate_update_cluster(inode); -+ clust->reserved_unprepped = estimate_insert_cluster(inode); -+#endif -+ /* there can be space grabbed by txnmgr_force_commit_all */ -+ return 0; -+} -+ -+/* free reserved disk space if writing a logical cluster fails */ -+static void free_reserved4cluster(struct inode *inode, -+ struct cluster_handle *ch, int count) -+{ -+ assert("edward-967", ch->reserved == 1); -+ -+ cluster_reserved2free(count); -+ ch->reserved = 0; -+} -+ -+/* The core search procedure of the cryptcompress plugin. -+ If the returned value is not cbk_errored, then the current znode is locked */ -+static int find_cluster_item(hint_t * hint, -+ const reiser4_key * key, /* key of the item we are -+ looking for */ -+ znode_lock_mode lock_mode /* which lock */ , -+ ra_info_t * ra_info, lookup_bias bias, __u32 flags) -+{ -+ int result; -+ reiser4_key ikey; -+ int went_right = 0; -+ coord_t *coord = &hint->ext_coord.coord; -+ coord_t orig = *coord; -+ -+ assert("edward-152", hint != NULL); -+ -+ if (!hint_is_valid(hint)) { -+ result = cryptcompress_hint_validate(hint, key, lock_mode); -+ if (result == -E_REPEAT) -+ goto traverse_tree; -+ else if (result) { -+ assert("edward-1216", 0); -+ return result; -+ } -+ hint_set_valid(hint); -+ } -+ assert("edward-709", znode_is_any_locked(coord->node)); -+ -+ /* An in-place lookup is going on here; it means we just need to -+ check whether the next item at @coord matches the @key hint */ -+ -+ if (equal_to_rdk(coord->node, key)) { -+ result = goto_right_neighbor(coord, &hint->lh); -+ if (result == -E_NO_NEIGHBOR) { -+ assert("edward-1217", 0); -+ return RETERR(-EIO); -+ } -+ if (result) -+ return result; -+ assert("edward-1218", equal_to_ldk(coord->node, key)); -+ went_right = 1; -+ } else { -+ coord->item_pos++; -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ } -+ result = zload(coord->node); -+ if (result) -+ return result; -+ assert("edward-1219", !node_is_empty(coord->node)); -+ -+ if (!coord_is_existing_item(coord)) { -+ zrelse(coord->node); -+ goto not_found; -+ } -+ item_key_by_coord(coord, &ikey); -+ zrelse(coord->node); -+ if (!keyeq(key, &ikey)) -+ goto not_found; -+ /* Ok, the item is found, update the node counts */ -+ if (went_right) -+ dclust_inc_extension_ncount(hint); -+ return CBK_COORD_FOUND; -+ -+ not_found: -+ assert("edward-1220", coord->item_pos > 0); -+ //coord->item_pos--; -+ /* roll back */ -+ *coord = orig; -+ ON_DEBUG(coord_update_v(coord)); -+ return CBK_COORD_NOTFOUND; -+ -+ traverse_tree: -+ assert("edward-713", hint->lh.owner == NULL); -+ assert("edward-714", reiser4_schedulable()); -+ -+ reiser4_unset_hint(hint); -+ dclust_init_extension(hint); -+ coord_init_zero(coord); -+ result = coord_by_key(current_tree, key, coord, &hint->lh, -+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL, -+ CBK_UNIQUE | flags, ra_info); -+ if (cbk_errored(result)) -+ return result; -+ if (result == CBK_COORD_FOUND) -+ dclust_inc_extension_ncount(hint); -+ hint_set_valid(hint); -+ return result; -+} -+ -+/* This function is called 
by deflate[inflate] manager when -+ creating a transformed/plain stream to check if we should -+ create/cut some overhead. If this returns true, then @oh -+ contains the size of this overhead. -+ */ -+static int need_cut_or_align(struct inode * inode, -+ struct cluster_handle * ch, rw_op rw, int * oh) -+{ -+ struct tfm_cluster * tc = &ch->tc; -+ switch (rw) { -+ case WRITE_OP: /* estimate align */ -+ *oh = tc->len % cipher_blocksize(inode); -+ if (*oh != 0) -+ return 1; -+ break; -+ case READ_OP: /* estimate cut */ -+ *oh = *(tfm_output_data(ch) + tc->len - 1); -+ break; -+ default: -+ impossible("edward-1401", "bad option"); -+ } -+ return (tc->len != tc->lsize); -+} -+ -+/* create/cut an overhead of transformed/plain stream */ -+static void align_or_cut_overhead(struct inode * inode, -+ struct cluster_handle * ch, rw_op rw) -+{ -+ unsigned int oh; -+ cipher_plugin * cplug = inode_cipher_plugin(inode); -+ -+ assert("edward-1402", need_cipher(inode)); -+ -+ if (!need_cut_or_align(inode, ch, rw, &oh)) -+ return; -+ switch (rw) { -+ case WRITE_OP: /* do align */ -+ ch->tc.len += -+ cplug->align_stream(tfm_input_data(ch) + -+ ch->tc.len, ch->tc.len, -+ cipher_blocksize(inode)); -+ *(tfm_input_data(ch) + ch->tc.len - 1) = -+ cipher_blocksize(inode) - oh; -+ break; -+ case READ_OP: /* do cut */ -+ assert("edward-1403", oh <= cipher_blocksize(inode)); -+ ch->tc.len -= oh; -+ break; -+ default: -+ impossible("edward-1404", "bad option"); -+ } -+ return; -+} -+ -+static unsigned max_cipher_overhead(struct inode * inode) -+{ -+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream) -+ return 0; -+ return cipher_blocksize(inode); -+} -+ -+static int deflate_overhead(struct inode *inode) -+{ -+ return (inode_compression_plugin(inode)-> -+ checksum ? DC_CHECKSUM_SIZE : 0); -+} -+ -+static unsigned deflate_overrun(struct inode * inode, int ilen) -+{ -+ return coa_overrun(inode_compression_plugin(inode), ilen); -+} -+ -+/* Estimating compressibility of a logical cluster by various -+ policies represented by compression mode plugin. -+ If this returns false, then compressor won't be called for -+ the cluster of index @index. -+*/ -+static int should_compress(struct tfm_cluster * tc, cloff_t index, -+ struct inode *inode) -+{ -+ compression_plugin *cplug = inode_compression_plugin(inode); -+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode); -+ -+ assert("edward-1321", tc->len != 0); -+ assert("edward-1322", cplug != NULL); -+ assert("edward-1323", mplug != NULL); -+ -+ return /* estimate by size */ -+ (cplug->min_size_deflate ? -+ tc->len >= cplug->min_size_deflate() : -+ 1) && -+ /* estimate by compression mode plugin */ -+ (mplug->should_deflate ? -+ mplug->should_deflate(inode, index) : -+ 1); -+} -+ -+/* Evaluating results of compression transform. -+ Returns true, if we need to accept this results */ -+static int save_compressed(int size_before, int size_after, struct inode *inode) -+{ -+ return (size_after + deflate_overhead(inode) + -+ max_cipher_overhead(inode) < size_before); -+} -+ -+/* Guess result of the evaluation above */ -+static int need_inflate(struct cluster_handle * ch, struct inode * inode, -+ int encrypted /* is cluster encrypted */ ) -+{ -+ struct tfm_cluster * tc = &ch->tc; -+ -+ assert("edward-142", tc != 0); -+ assert("edward-143", inode != NULL); -+ -+ return tc->len < -+ (encrypted ? 
-+ inode_scaled_offset(inode, tc->lsize) : -+ tc->lsize); -+} -+ -+/* If results of compression were accepted, then we add -+ a checksum to catch possible disk cluster corruption. -+ The following is a format of the data stored in disk clusters: -+ -+ data This is (transformed) logical cluster. -+ cipher_overhead This is created by ->align() method -+ of cipher plugin. May be absent. -+ checksum (4) This is created by ->checksum method -+ of compression plugin to check -+ integrity. May be absent. -+ -+ Crypto overhead format: -+ -+ data -+ control_byte (1) contains aligned overhead size: -+ 1 <= overhead <= cipher_blksize -+*/ -+/* Append a checksum at the end of a transformed stream */ -+static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc) -+{ -+ __u32 checksum; -+ -+ assert("edward-1309", tc != NULL); -+ assert("edward-1310", tc->len > 0); -+ assert("edward-1311", cplug->checksum != NULL); -+ -+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len); -+ put_unaligned(cpu_to_le32(checksum), -+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len)); -+ tc->len += (int)DC_CHECKSUM_SIZE; -+} -+ -+/* Check a disk cluster checksum. -+ Returns 0 if checksum is correct, otherwise returns 1 */ -+static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc) -+{ -+ assert("edward-1312", tc != NULL); -+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE); -+ assert("edward-1314", cplug->checksum != NULL); -+ -+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM), -+ tc->len - (int)DC_CHECKSUM_SIZE) != -+ le32_to_cpu(get_unaligned((d32 *) -+ (tfm_stream_data(tc, INPUT_STREAM) -+ + tc->len - (int)DC_CHECKSUM_SIZE)))) { -+ warning("edward-156", -+ "Bad disk cluster checksum %d, (should be %d) Fsck?\n", -+ (int)le32_to_cpu -+ (get_unaligned((d32 *) -+ (tfm_stream_data(tc, INPUT_STREAM) + -+ tc->len - (int)DC_CHECKSUM_SIZE))), -+ (int)cplug->checksum -+ (tfm_stream_data(tc, INPUT_STREAM), -+ tc->len - (int)DC_CHECKSUM_SIZE)); -+ return 1; -+ } -+ tc->len -= (int)DC_CHECKSUM_SIZE; -+ return 0; -+} -+ -+/* get input/output stream for some transform action */ -+int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc, -+ tfm_stream_id id) -+{ -+ size_t size = inode_scaled_cluster_size(inode); -+ -+ assert("edward-901", tc != NULL); -+ assert("edward-1027", inode_compression_plugin(inode) != NULL); -+ -+ if (cluster_get_tfm_act(tc) == TFMA_WRITE) -+ size += deflate_overrun(inode, inode_cluster_size(inode)); -+ -+ if (!get_tfm_stream(tc, id) && id == INPUT_STREAM) -+ alternate_streams(tc); -+ if (!get_tfm_stream(tc, id)) -+ return alloc_tfm_stream(tc, size, id); -+ -+ assert("edward-902", tfm_stream_is_set(tc, id)); -+ -+ if (tfm_stream_size(tc, id) < size) -+ return realloc_tfm_stream(tc, size, id); -+ return 0; -+} -+ -+/* Common deflate manager */ -+int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode) -+{ -+ int result = 0; -+ int compressed = 0; -+ int encrypted = 0; -+ struct tfm_cluster * tc = &clust->tc; -+ compression_plugin * coplug; -+ -+ assert("edward-401", inode != NULL); -+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM)); -+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE); -+ assert("edward-498", !tfm_cluster_is_uptodate(tc)); -+ -+ coplug = inode_compression_plugin(inode); -+ if (should_compress(tc, clust->index, inode)) { -+ /* try to compress, discard bad results */ -+ __u32 dst_len; -+ compression_mode_plugin * mplug = -+ inode_compression_mode_plugin(inode); -+ 
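/*
 * Worked example of the accept/discard test in save_compressed() above
 * (illustration, not part of the patch): for a 4096-byte logical cluster
 * with a 4-byte checksum and no cipher overhead, a deflate result of
 * 4000 bytes gives 4000 + 4 < 4096, so the compressed stream is kept;
 * a result of 4093 bytes gives 4093 + 4 >= 4096, so the plain text is
 * stored instead.
 */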
assert("edward-602", coplug != NULL); -+ assert("edward-1423", coplug->compress != NULL); -+ -+ result = grab_coa(tc, coplug); -+ if (result) { -+ warning("edward-1424", -+ "alloc_coa failed with ret=%d, skipped compression", -+ result); -+ goto cipher; -+ } -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) { -+ warning("edward-1425", -+ "alloc stream failed with ret=%d, skipped compression", -+ result); -+ goto cipher; -+ } -+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); -+ coplug->compress(get_coa(tc, coplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ /* make sure we didn't overwrite extra bytes */ -+ assert("edward-603", -+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); -+ -+ /* evaluate results of compression transform */ -+ if (save_compressed(tc->len, dst_len, inode)) { -+ /* good result, accept */ -+ tc->len = dst_len; -+ if (mplug->accept_hook != NULL) { -+ result = mplug->accept_hook(inode, clust->index); -+ if (result) -+ warning("edward-1426", -+ "accept_hook failed with ret=%d", -+ result); -+ } -+ compressed = 1; -+ } -+ else { -+ /* bad result, discard */ -+#if 0 -+ if (cluster_is_complete(clust, inode)) -+ warning("edward-1496", -+ "incompressible cluster %lu (inode %llu)", -+ clust->index, -+ (unsigned long long)get_inode_oid(inode)); -+#endif -+ if (mplug->discard_hook != NULL && -+ cluster_is_complete(clust, inode)) { -+ result = mplug->discard_hook(inode, -+ clust->index); -+ if (result) -+ warning("edward-1427", -+ "discard_hook failed with ret=%d", -+ result); -+ } -+ } -+ } -+ cipher: -+ if (need_cipher(inode)) { -+ cipher_plugin * ciplug; -+ struct blkcipher_desc desc; -+ struct scatterlist src; -+ struct scatterlist dst; -+ -+ ciplug = inode_cipher_plugin(inode); -+ desc.tfm = info_get_cipher(inode_crypto_info(inode)); -+ desc.flags = 0; -+ if (compressed) -+ alternate_streams(tc); -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ -+ align_or_cut_overhead(inode, clust, WRITE_OP); -+ sg_init_one(&src, tfm_input_data(clust), tc->len); -+ sg_init_one(&dst, tfm_output_data(clust), tc->len); -+ -+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len); -+ if (result) { -+ warning("edward-1405", -+ "encryption failed flags=%x\n", desc.flags); -+ return result; -+ } -+ encrypted = 1; -+ } -+ if (compressed && coplug->checksum != NULL) -+ dc_set_checksum(coplug, tc); -+ if (!compressed && !encrypted) -+ alternate_streams(tc); -+ return result; -+} -+ -+/* Common inflate manager. 
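 *
 * It reverses the deflate pipeline above: first verify the optional
 * checksum, then decrypt if a cipher is attached, and finally decompress
 * whenever need_inflate() reports a transformed stream.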
*/ -+int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode) -+{ -+ int result = 0; -+ int transformed = 0; -+ struct tfm_cluster * tc = &clust->tc; -+ compression_plugin * coplug; -+ -+ assert("edward-905", inode != NULL); -+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER); -+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM)); -+ assert("edward-1349", tc->act == TFMA_READ); -+ assert("edward-907", !tfm_cluster_is_uptodate(tc)); -+ -+ /* Handle a checksum (if any) */ -+ coplug = inode_compression_plugin(inode); -+ if (need_inflate(clust, inode, need_cipher(inode)) && -+ coplug->checksum != NULL) { -+ result = dc_check_checksum(coplug, tc); -+ if (unlikely(result)) { -+ warning("edward-1460", -+ "Inode %llu: disk cluster %lu looks corrupted", -+ (unsigned long long)get_inode_oid(inode), -+ clust->index); -+ return RETERR(-EIO); -+ } -+ } -+ if (need_cipher(inode)) { -+ cipher_plugin * ciplug; -+ struct blkcipher_desc desc; -+ struct scatterlist src; -+ struct scatterlist dst; -+ -+ ciplug = inode_cipher_plugin(inode); -+ desc.tfm = info_get_cipher(inode_crypto_info(inode)); -+ desc.flags = 0; -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ assert("edward-909", tfm_cluster_is_set(tc)); -+ -+ sg_init_one(&src, tfm_input_data(clust), tc->len); -+ sg_init_one(&dst, tfm_output_data(clust), tc->len); -+ -+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len); -+ if (result) { -+ warning("edward-1600", "decrypt failed flags=%x\n", -+ desc.flags); -+ return result; -+ } -+ align_or_cut_overhead(inode, clust, READ_OP); -+ transformed = 1; -+ } -+ if (need_inflate(clust, inode, 0)) { -+ unsigned dst_len = inode_cluster_size(inode); -+ if(transformed) -+ alternate_streams(tc); -+ -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ assert("edward-1305", coplug->decompress != NULL); -+ assert("edward-910", tfm_cluster_is_set(tc)); -+ -+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ /* check length */ -+ tc->len = dst_len; -+ assert("edward-157", dst_len == tc->lsize); -+ transformed = 1; -+ } -+ if (!transformed) -+ alternate_streams(tc); -+ return result; -+} -+ -+/* This is implementation of readpage method of struct -+ address_space_operations for cryptcompress plugin. 
*/ -+int readpage_cryptcompress(struct file *file, struct page *page) -+{ -+ reiser4_context *ctx; -+ struct cluster_handle clust; -+ item_plugin *iplug; -+ int result; -+ -+ assert("edward-88", PageLocked(page)); -+ assert("vs-976", !PageUptodate(page)); -+ assert("edward-89", page->mapping && page->mapping->host); -+ -+ ctx = reiser4_init_context(page->mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ unlock_page(page); -+ return PTR_ERR(ctx); -+ } -+ assert("edward-113", -+ ergo(file != NULL, -+ page->mapping == file->f_dentry->d_inode->i_mapping)); -+ -+ if (PageUptodate(page)) { -+ warning("edward-1338", "page is already uptodate\n"); -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ cluster_init_read(&clust, NULL); -+ clust.file = file; -+ iplug = item_plugin_by_id(CTAIL_ID); -+ if (!iplug->s.file.readpage) { -+ unlock_page(page); -+ put_cluster_handle(&clust); -+ reiser4_exit_context(ctx); -+ return -EINVAL; -+ } -+ result = iplug->s.file.readpage(&clust, page); -+ -+ put_cluster_handle(&clust); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* number of pages to check in */ -+static int get_new_nrpages(struct cluster_handle * clust) -+{ -+ switch (clust->op) { -+ case LC_APPOV: -+ return clust->nr_pages; -+ case LC_TRUNC: -+ assert("edward-1179", clust->win != NULL); -+ return size_in_pages(clust->win->off + clust->win->count); -+ default: -+ impossible("edward-1180", "bad page cluster option"); -+ return 0; -+ } -+} -+ -+static void set_cluster_pages_dirty(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ int i; -+ struct page *pg; -+ int nrpages = get_new_nrpages(clust); -+ -+ for (i = 0; i < nrpages; i++) { -+ -+ pg = clust->pages[i]; -+ assert("edward-968", pg != NULL); -+ lock_page(pg); -+ assert("edward-1065", PageUptodate(pg)); -+ set_page_dirty_notag(pg); -+ unlock_page(pg); -+ mark_page_accessed(pg); -+ } -+} -+ -+/* Grab a page cluster for read/write operations. -+ Attach a jnode for write operations (when preparing for modifications, which -+ are supposed to be committed). -+ -+ We allocate only one jnode per page cluster; this jnode is binded to the -+ first page of this cluster, so we have an extra-reference that will be put -+ as soon as jnode is evicted from memory), other references will be cleaned -+ up in flush time (assume that check in page cluster was successful). 
-+*/ -+int grab_page_cluster(struct inode * inode, -+ struct cluster_handle * clust, rw_op rw) -+{ -+ int i; -+ int result = 0; -+ jnode *node = NULL; -+ -+ assert("edward-182", clust != NULL); -+ assert("edward-183", clust->pages != NULL); -+ assert("edward-1466", clust->node == NULL); -+ assert("edward-1428", inode != NULL); -+ assert("edward-1429", inode->i_mapping != NULL); -+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode)); -+ -+ if (clust->nr_pages == 0) -+ return 0; -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ -+ assert("edward-1044", clust->pages[i] == NULL); -+ -+ clust->pages[i] = -+ find_or_create_page(inode->i_mapping, -+ clust_to_pg(clust->index, inode) + i, -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages[i]) { -+ result = RETERR(-ENOMEM); -+ break; -+ } -+ if (i == 0 && rw == WRITE_OP) { -+ node = jnode_of_page(clust->pages[i]); -+ if (IS_ERR(node)) { -+ result = PTR_ERR(node); -+ unlock_page(clust->pages[i]); -+ break; -+ } -+ JF_SET(node, JNODE_CLUSTER_PAGE); -+ assert("edward-920", jprivate(clust->pages[0])); -+ } -+ INODE_PGCOUNT_INC(inode); -+ unlock_page(clust->pages[i]); -+ } -+ if (unlikely(result)) { -+ while (i) { -+ put_cluster_page(clust->pages[--i]); -+ INODE_PGCOUNT_DEC(inode); -+ } -+ if (node && !IS_ERR(node)) -+ jput(node); -+ return result; -+ } -+ clust->node = node; -+ return 0; -+} -+ -+static void truncate_page_cluster_range(struct inode * inode, -+ struct page ** pages, -+ cloff_t index, -+ int from, int count, -+ int even_cows) -+{ -+ assert("edward-1467", count > 0); -+ reiser4_invalidate_pages(inode->i_mapping, -+ clust_to_pg(index, inode) + from, -+ count, even_cows); -+} -+ -+/* Put @count pages starting from @from offset */ -+void __put_page_cluster(int from, int count, -+ struct page ** pages, struct inode * inode) -+{ -+ int i; -+ assert("edward-1468", pages != NULL); -+ assert("edward-1469", inode != NULL); -+ assert("edward-1470", from >= 0 && count >= 0); -+ -+ for (i = 0; i < count; i++) { -+ assert("edward-1471", pages[from + i] != NULL); -+ assert("edward-1472", -+ pages[from + i]->index == pages[from]->index + i); -+ -+ put_cluster_page(pages[from + i]); -+ INODE_PGCOUNT_DEC(inode); -+ } -+} -+ -+/* -+ * This is dual to grab_page_cluster, -+ * however if @rw == WRITE_OP, then we call this function -+ * only if something is failed before checkin page cluster. 
-+ */ -+void put_page_cluster(struct cluster_handle * clust, -+ struct inode * inode, rw_op rw) -+{ -+ assert("edward-445", clust != NULL); -+ assert("edward-922", clust->pages != NULL); -+ assert("edward-446", -+ ergo(clust->nr_pages != 0, clust->pages[0] != NULL)); -+ -+ __put_page_cluster(0, clust->nr_pages, clust->pages, inode); -+ if (rw == WRITE_OP) { -+ if (unlikely(clust->node)) { -+ assert("edward-447", -+ clust->node == jprivate(clust->pages[0])); -+ jput(clust->node); -+ clust->node = NULL; -+ } -+ } -+} -+ -+#if REISER4_DEBUG -+int cryptcompress_inode_ok(struct inode *inode) -+{ -+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE))) -+ return 0; -+ if (!cluster_shift_ok(inode_cluster_shift(inode))) -+ return 0; -+ return 1; -+} -+ -+static int window_ok(struct reiser4_slide * win, struct inode *inode) -+{ -+ assert("edward-1115", win != NULL); -+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW)); -+ -+ return (win->off != inode_cluster_size(inode)) && -+ (win->off + win->count + win->delta <= inode_cluster_size(inode)); -+} -+ -+static int cluster_ok(struct cluster_handle * clust, struct inode *inode) -+{ -+ assert("edward-279", clust != NULL); -+ -+ if (!clust->pages) -+ return 0; -+ return (clust->win ? window_ok(clust->win, inode) : 1); -+} -+#if 0 -+static int pages_truncate_ok(struct inode *inode, pgoff_t start) -+{ -+ int found; -+ struct page * page; -+ -+ found = find_get_pages(inode->i_mapping, start, 1, &page); -+ if (found) -+ put_cluster_page(page); -+ return !found; -+} -+#else -+#define pages_truncate_ok(inode, start) 1 -+#endif -+ -+static int jnode_truncate_ok(struct inode *inode, cloff_t index) -+{ -+ jnode *node; -+ node = jlookup(current_tree, get_inode_oid(inode), -+ clust_to_pg(index, inode)); -+ if (likely(!node)) -+ return 1; -+ jput(node); -+ return 0; -+} -+ -+static int find_fake_appended(struct inode *inode, cloff_t * index); -+ -+static int body_truncate_ok(struct inode *inode, cloff_t aidx) -+{ -+ int result; -+ cloff_t raidx; -+ -+ result = find_fake_appended(inode, &raidx); -+ return !result && (aidx == raidx); -+} -+#endif -+ -+/* guess next window stat */ -+static inline window_stat next_window_stat(struct reiser4_slide * win) -+{ -+ assert("edward-1130", win != NULL); -+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ? 
-+ HOLE_WINDOW : DATA_WINDOW); -+} -+ -+/* guess and set next cluster index and window params */ -+static void move_update_window(struct inode * inode, -+ struct cluster_handle * clust, -+ loff_t file_off, loff_t to_file) -+{ -+ struct reiser4_slide * win; -+ -+ assert("edward-185", clust != NULL); -+ assert("edward-438", clust->pages != NULL); -+ assert("edward-281", cluster_ok(clust, inode)); -+ -+ win = clust->win; -+ if (!win) -+ return; -+ -+ switch (win->stat) { -+ case DATA_WINDOW: -+ /* increment */ -+ clust->index++; -+ win->stat = DATA_WINDOW; -+ win->off = 0; -+ win->count = min((loff_t)inode_cluster_size(inode), to_file); -+ break; -+ case HOLE_WINDOW: -+ switch (next_window_stat(win)) { -+ case HOLE_WINDOW: -+ /* skip */ -+ clust->index = off_to_clust(file_off, inode); -+ win->stat = HOLE_WINDOW; -+ win->off = 0; -+ win->count = off_to_cloff(file_off, inode); -+ win->delta = min((loff_t)(inode_cluster_size(inode) - -+ win->count), to_file); -+ break; -+ case DATA_WINDOW: -+ /* stay */ -+ win->stat = DATA_WINDOW; -+ /* off+count+delta=inv */ -+ win->off = win->off + win->count; -+ win->count = win->delta; -+ win->delta = 0; -+ break; -+ default: -+ impossible("edward-282", "wrong next window state"); -+ } -+ break; -+ default: -+ impossible("edward-283", "wrong current window state"); -+ } -+ assert("edward-1068", cluster_ok(clust, inode)); -+} -+ -+static int update_sd_cryptcompress(struct inode *inode) -+{ -+ int result = 0; -+ -+ assert("edward-978", reiser4_schedulable()); -+ -+ result = reiser4_grab_space_force(/* one for stat data update */ -+ estimate_update_common(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ return result; -+ inode->i_ctime = inode->i_mtime = CURRENT_TIME; -+ result = reiser4_update_sd(inode); -+ -+ return result; -+} -+ -+static void uncapture_cluster_jnode(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+static void put_found_pages(struct page **pages, int nr) -+{ -+ int i; -+ for (i = 0; i < nr; i++) { -+ assert("edward-1045", pages[i] != NULL); -+ put_cluster_page(pages[i]); -+ } -+} -+ -+/* Lifecycle of a logical cluster in the system. -+ * -+ * -+ * A logical cluster of a cryptcompress file is represented in the system by -+ * . a page cluster (in memory, primary cache, contains plain text); -+ * . a disk cluster (in memory, secondary cache, contains transformed text). -+ * The primary cache reduces the number of transform operations (compression, -+ * encryption), i.e. it implements a transform-caching strategy. -+ * The secondary cache reduces the number of I/O operations, i.e. it implements -+ * the usual write-caching strategy. A page cluster is a set of pages, i.e. the -+ * mapping of a logical cluster to the primary cache. A disk cluster is a set -+ * of items of the same type defined by some reiser4 item plugin id. -+ * -+ * 1. Performing modifications -+ * -+ * Every modification of a cryptcompress file is considered as a set of -+ * operations performed on the file's logical clusters. Every such "atomic" -+ * modification truncates, appends and/or overwrites some bytes of a -+ * logical cluster in the primary cache, with subsequent synchronization -+ * with the secondary cache (at flush time). Disk clusters, -+ * which live in the secondary cache, are supposed to be synchronized with -+ * disk.
The mechanism of synchronization of primary and secondary caches -+ * includes the so-called checkin/checkout technique described below. -+ * -+ * 2. Submitting modifications -+ * -+ * Each page cluster has an associated jnode (a special in-memory header to -+ * keep track of transactions in reiser4), which is attached to its first -+ * page when grabbing the page cluster for modifications (see grab_page_cluster). -+ * Submitting modifications (see checkin_logical_cluster) proceeds per logical -+ * cluster and includes: -+ * . checkin_cluster_size; -+ * . checkin_page_cluster. -+ * checkin_cluster_size() resolves to a file size update, which completely -+ * defines the new size of the logical cluster (the number of the file's bytes -+ * in that logical cluster). -+ * checkin_page_cluster() captures the jnode of a page cluster and sets the -+ * jnode's dirty flag (if needed) to indicate that modifications were -+ * successfully checked in. -+ * -+ * 3. Checking out modifications -+ * -+ * This proceeds per logical cluster at flush time (see checkout_logical_cluster). -+ * This is the time of synchronizing primary and secondary caches. -+ * checkout_logical_cluster() includes: -+ * . checkout_page_cluster (retrieving checked in pages); -+ * . uncapturing the jnode (including clearing the dirty flag and unlocking). -+ * -+ * 4. Committing modifications -+ * -+ * This completes the synchronization of primary and secondary caches. When -+ * checking out a page cluster (the phase above), pages are locked/flushed/unlocked -+ * one by one in ascending order of their indexes to a contiguous stream, which -+ * is then transformed (compressed, encrypted), chopped up into items -+ * and committed to disk as a disk cluster. -+ * -+ * 5. Managing page references -+ * -+ * Every checked in page has a special additional "control" reference, -+ * which is dropped at checkout. We need this to avoid unexpected eviction of -+ * pages from memory before checkout. Control references are managed so that -+ * they do not accumulate with every checkin: -+ * -+ * 0 -+ * checkin -> 1 -+ * 0 -> checkout -+ * checkin -> 1 -+ * checkin -> 1 -+ * checkin -> 1 -+ * 0 -> checkout -+ * ... -+ * -+ * Every page cluster has its own unique "cluster lock". Update/drop of these -+ * references is serialized via this lock. The number of checked in cluster -+ * pages is calculated from i_size under the cluster lock. The file size is -+ * updated at every checkin action, also under the cluster lock (except the -+ * cases of appending/truncating fake logical clusters). -+ * -+ * Proof of correctness: -+ * -+ * Since we update the file size under the cluster lock, in the case of a -+ * non-fake logical cluster with its lock held we do have the expected number -+ * of checked in pages. On the other hand, appending/truncating fake logical -+ * clusters doesn't change the number of checked in pages of any cluster. -+ * -+ * NOTE-EDWARD: As the cluster lock we use the guard (spinlock_t) of its jnode. -+ * Currently, I don't see any reason to create a special lock for those -+ * needs. -+ */ -+ -+static inline void lock_cluster(jnode * node) -+{ -+ spin_lock_jnode(node); -+} -+ -+static inline void unlock_cluster(jnode * node) -+{ -+ spin_unlock_jnode(node); -+} -+ -+static inline void unlock_cluster_uncapture(jnode * node) -+{ -+ uncapture_cluster_jnode(node); -+} -+ -+/* Set the new file size by window. The cluster lock is required.
*/ -+static void checkin_file_size(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ loff_t new_size; -+ struct reiser4_slide * win; -+ -+ assert("edward-1181", clust != NULL); -+ assert("edward-1182", inode != NULL); -+ assert("edward-1473", clust->pages != NULL); -+ assert("edward-1474", clust->pages[0] != NULL); -+ assert("edward-1475", jprivate(clust->pages[0]) != NULL); -+ assert_spin_locked(&(jprivate(clust->pages[0])->guard)); -+ -+ -+ win = clust->win; -+ assert("edward-1183", win != NULL); -+ -+ new_size = clust_to_off(clust->index, inode) + win->off; -+ -+ switch (clust->op) { -+ case LC_APPOV: -+ if (new_size + win->count <= i_size_read(inode)) -+ /* overwrite only */ -+ return; -+ new_size += win->count; -+ break; -+ case LC_TRUNC: -+ break; -+ default: -+ impossible("edward-1184", "bad page cluster option"); -+ break; -+ } -+ inode_check_scale_nolock(inode, i_size_read(inode), new_size); -+ i_size_write(inode, new_size); -+ return; -+} -+ -+static inline void checkin_cluster_size(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ if (clust->win) -+ checkin_file_size(clust, inode); -+} -+ -+static int checkin_page_cluster(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ int result; -+ jnode * node; -+ int old_nrpages = clust->old_nrpages; -+ int new_nrpages = get_new_nrpages(clust); -+ -+ node = clust->node; -+ -+ assert("edward-221", node != NULL); -+ assert("edward-971", clust->reserved == 1); -+ assert("edward-1263", -+ clust->reserved_prepped == estimate_update_cluster(inode)); -+ assert("edward-1264", clust->reserved_unprepped == 0); -+ -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* -+ * page cluster was checked in, but not yet -+ * checked out, so release related resources -+ */ -+ free_reserved4cluster(inode, clust, -+ estimate_update_cluster(inode)); -+ __put_page_cluster(0, clust->old_nrpages, -+ clust->pages, inode); -+ } else { -+ result = capture_cluster_jnode(node); -+ if (unlikely(result)) { -+ unlock_cluster(node); -+ return result; -+ } -+ jnode_make_dirty_locked(node); -+ clust->reserved = 0; -+ } -+ unlock_cluster(node); -+ -+ if (new_nrpages < old_nrpages) { -+ /* truncate >= 1 complete pages */ -+ __put_page_cluster(new_nrpages, -+ old_nrpages - new_nrpages, -+ clust->pages, inode); -+ truncate_page_cluster_range(inode, -+ clust->pages, clust->index, -+ new_nrpages, -+ old_nrpages - new_nrpages, -+ 0); -+ } -+#if REISER4_DEBUG -+ clust->reserved_prepped -= estimate_update_cluster(inode); -+#endif -+ return 0; -+} -+ -+/* Submit modifications of a logical cluster */ -+static int checkin_logical_cluster(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ int result = 0; -+ jnode * node; -+ -+ node = clust->node; -+ -+ assert("edward-1035", node != NULL); -+ assert("edward-1029", clust != NULL); -+ assert("edward-1030", clust->reserved == 1); -+ assert("edward-1031", clust->nr_pages != 0); -+ assert("edward-1032", clust->pages != NULL); -+ assert("edward-1033", clust->pages[0] != NULL); -+ assert("edward-1446", jnode_is_cluster_page(node)); -+ assert("edward-1476", node == jprivate(clust->pages[0])); -+ -+ lock_cluster(node); -+ checkin_cluster_size(clust, inode); -+ /* this will unlock cluster */ -+ result = checkin_page_cluster(clust, inode); -+ jput(node); -+ clust->node = NULL; -+ return result; -+} -+ -+/* -+ * Retrieve size of logical cluster that was checked in at -+ * the latest modifying session (cluster lock is required) -+ */ -+static inline void checkout_cluster_size(struct cluster_handle * clust, -+ 
struct inode * inode) -+{ -+ struct tfm_cluster *tc = &clust->tc; -+ -+ tc->len = lbytes(clust->index, inode); -+ assert("edward-1478", tc->len != 0); -+} -+ -+/* -+ * Retrieve a page cluster with the latest submitted modifications -+ * and flush its pages to previously allocated contiguous stream. -+ */ -+static void checkout_page_cluster(struct cluster_handle * clust, -+ jnode * node, struct inode * inode) -+{ -+ int i; -+ int found; -+ int to_put; -+ struct tfm_cluster *tc = &clust->tc; -+ -+ /* find and put checked in pages: cluster is locked, -+ * so we must get expected number (to_put) of pages -+ */ -+ to_put = size_in_pages(lbytes(clust->index, inode)); -+ found = find_get_pages(inode->i_mapping, -+ clust_to_pg(clust->index, inode), -+ to_put, clust->pages); -+ BUG_ON(found != to_put); -+ -+ __put_page_cluster(0, to_put, clust->pages, inode); -+ unlock_cluster_uncapture(node); -+ -+ /* Flush found pages. -+ * -+ * Note, that we don't disable modifications while flushing, -+ * moreover, some found pages can be truncated, as we have -+ * released cluster lock. -+ */ -+ for (i = 0; i < found; i++) { -+ int in_page; -+ char * data; -+ assert("edward-1479", -+ clust->pages[i]->index == clust->pages[0]->index + i); -+ -+ lock_page(clust->pages[i]); -+ if (!PageUptodate(clust->pages[i])) { -+ /* page was truncated */ -+ assert("edward-1480", -+ i_size_read(inode) <= page_offset(clust->pages[i])); -+ assert("edward-1481", -+ clust->pages[i]->mapping != inode->i_mapping); -+ unlock_page(clust->pages[i]); -+ break; -+ } -+ /* Update the number of bytes in the logical cluster, -+ * as it could be partially truncated. Note, that only -+ * partial truncate is possible (complete truncate can -+ * not go here, as it is performed via ->kill_hook() -+ * called by cut_file_items(), and the last one must -+ * wait for znode locked with parent coord). 
-+ */ -+ checkout_cluster_size(clust, inode); -+ -+ /* this can be zero, as new file size is -+ checked in before truncating pages */ -+ in_page = __mbp(tc->len, i); -+ -+ data = kmap(clust->pages[i]); -+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), -+ data, in_page); -+ kunmap(clust->pages[i]); -+ -+ if (PageDirty(clust->pages[i])) -+ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE); -+ -+ unlock_page(clust->pages[i]); -+ -+ if (in_page < PAGE_CACHE_SIZE) -+ /* end of the file */ -+ break; -+ } -+ put_found_pages(clust->pages, found); /* find_get_pages */ -+ tc->lsize = tc->len; -+ return; -+} -+ -+/* Check out modifications of a logical cluster */ -+int checkout_logical_cluster(struct cluster_handle * clust, -+ jnode * node, struct inode *inode) -+{ -+ int result; -+ struct tfm_cluster *tc = &clust->tc; -+ -+ assert("edward-980", node != NULL); -+ assert("edward-236", inode != NULL); -+ assert("edward-237", clust != NULL); -+ assert("edward-240", !clust->win); -+ assert("edward-241", reiser4_schedulable()); -+ assert("edward-718", cryptcompress_inode_ok(inode)); -+ -+ result = grab_tfm_stream(inode, tc, INPUT_STREAM); -+ if (result) { -+ warning("edward-1430", "alloc stream failed with ret=%d", -+ result); -+ return RETERR(-E_REPEAT); -+ } -+ lock_cluster(node); -+ -+ if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) { -+ /* race with another flush */ -+ warning("edward-982", -+ "checking out logical cluster %lu of inode %llu: " -+ "jnode is not dirty", clust->index, -+ (unsigned long long)get_inode_oid(inode)); -+ unlock_cluster(node); -+ return RETERR(-E_REPEAT); -+ } -+ cluster_reserved2grabbed(estimate_update_cluster(inode)); -+ -+ /* this will unlock cluster */ -+ checkout_page_cluster(clust, node, inode); -+ return 0; -+} -+ -+/* set hint for the cluster of the index @index */ -+static void set_hint_cluster(struct inode *inode, hint_t * hint, -+ cloff_t index, znode_lock_mode mode) -+{ -+ reiser4_key key; -+ assert("edward-722", cryptcompress_inode_ok(inode)); -+ assert("edward-723", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ -+ inode_file_plugin(inode)->key_by_inode(inode, -+ clust_to_off(index, inode), -+ &key); -+ -+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key); -+ hint->offset = get_key_offset(&key); -+ hint->mode = mode; -+} -+ -+void invalidate_hint_cluster(struct cluster_handle * clust) -+{ -+ assert("edward-1291", clust != NULL); -+ assert("edward-1292", clust->hint != NULL); -+ -+ done_lh(&clust->hint->lh); -+ hint_clr_valid(clust->hint); -+} -+ -+static void put_hint_cluster(struct cluster_handle * clust, -+ struct inode *inode, znode_lock_mode mode) -+{ -+ assert("edward-1286", clust != NULL); -+ assert("edward-1287", clust->hint != NULL); -+ -+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode); -+ invalidate_hint_cluster(clust); -+} -+ -+static int balance_dirty_page_cluster(struct cluster_handle * clust, -+ struct inode *inode, loff_t off, -+ loff_t to_file, -+ int nr_dirtied) -+{ -+ int result; -+ struct cryptcompress_info * info; -+ -+ assert("edward-724", inode != NULL); -+ assert("edward-725", cryptcompress_inode_ok(inode)); -+ assert("edward-1547", -+ nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode)); -+ -+ /* set next window params */ -+ move_update_window(inode, clust, off, to_file); -+ -+ result = update_sd_cryptcompress(inode); -+ if (result) -+ return result; -+ assert("edward-726", clust->hint->lh.owner == NULL); -+ info = cryptcompress_inode_data(inode); -+ -+ 
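/*
-+	 * Presumably the checkin mutex is dropped across the balancing
-+	 * call below because balance_dirty_pages() may trigger writeback,
-+	 * and writepages_cryptcompress() takes the same mutex (see the
-+	 * mutex_trylock() for the entd case there); holding it here
-+	 * could deadlock against the flushing thread.
-+	 */
-+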
mutex_unlock(&info->checkin_mutex); -+ reiser4_txn_restart_current(); -+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, nr_dirtied); -+ mutex_lock(&info->checkin_mutex); -+ return 0; -+} -+ -+/* write zeroes to the page cluster, process it, and maybe try to capture -+ its pages */ -+static int write_hole(struct inode *inode, struct cluster_handle * clust, -+ loff_t file_off, loff_t to_file) -+{ -+ int result = 0; -+ unsigned cl_off, cl_count = 0; -+ unsigned to_pg, pg_off; -+ struct reiser4_slide * win; -+ -+ assert("edward-190", clust != NULL); -+ assert("edward-1069", clust->win != NULL); -+ assert("edward-191", inode != NULL); -+ assert("edward-727", cryptcompress_inode_ok(inode)); -+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER); -+ assert("edward-1154", -+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1)); -+ -+ win = clust->win; -+ -+ assert("edward-1070", win != NULL); -+ assert("edward-201", win->stat == HOLE_WINDOW); -+ assert("edward-192", cluster_ok(clust, inode)); -+ -+ if (win->off == 0 && win->count == inode_cluster_size(inode)) { -+ /* This part of the hole will be represented by a "fake" -+ * logical cluster, i.e. one which doesn't have an appropriate -+ * disk cluster until someone modifies this logical cluster -+ * and makes it dirty. -+ * So just move the window forward here. -+ */ -+ move_update_window(inode, clust, file_off, to_file); -+ return 0; -+ } -+ cl_count = win->count; /* number of zeroes to write */ -+ cl_off = win->off; -+ pg_off = off_to_pgoff(win->off); -+ -+ while (cl_count) { -+ struct page *page; -+ page = clust->pages[off_to_pg(cl_off)]; -+ -+ assert("edward-284", page != NULL); -+ -+ to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count); -+ lock_page(page); -+ zero_user(page, pg_off, to_pg); -+ SetPageUptodate(page); -+ set_page_dirty_notag(page); -+ mark_page_accessed(page); -+ unlock_page(page); -+ -+ cl_off += to_pg; -+ cl_count -= to_pg; -+ pg_off = 0; -+ } -+ if (!win->delta) { -+ /* only zeroes in this window, try to capture -+ */ -+ result = checkin_logical_cluster(clust, inode); -+ if (result) -+ return result; -+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK); -+ result = balance_dirty_page_cluster(clust, -+ inode, file_off, to_file, -+ win_count_to_nrpages(win)); -+ } else -+ move_update_window(inode, clust, file_off, to_file); -+ return result; -+} -+ -+/* -+ The main disk search procedure for the cryptcompress plugin, which -+ . scans all items of the disk cluster with the lock mode @mode -+ . maybe reads each one (if @read) -+ .
maybe makes its znode dirty (if write lock mode was specified) -+ -+ NOTE-EDWARD: Callers should handle the case when disk cluster -+ is incomplete (-EIO) -+*/ -+int find_disk_cluster(struct cluster_handle * clust, -+ struct inode *inode, int read, znode_lock_mode mode) -+{ -+ flow_t f; -+ hint_t *hint; -+ int result = 0; -+ int was_grabbed; -+ ra_info_t ra_info; -+ file_plugin *fplug; -+ item_plugin *iplug; -+ struct tfm_cluster *tc; -+ struct cryptcompress_info * info; -+ -+ assert("edward-138", clust != NULL); -+ assert("edward-728", clust->hint != NULL); -+ assert("edward-226", reiser4_schedulable()); -+ assert("edward-137", inode != NULL); -+ assert("edward-729", cryptcompress_inode_ok(inode)); -+ -+ hint = clust->hint; -+ fplug = inode_file_plugin(inode); -+ was_grabbed = get_current_context()->grabbed_blocks; -+ info = cryptcompress_inode_data(inode); -+ tc = &clust->tc; -+ -+ assert("edward-462", !tfm_cluster_is_uptodate(tc)); -+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM))); -+ -+ dclust_init_extension(hint); -+ -+ /* set key of the first disk cluster item */ -+ fplug->flow_by_inode(inode, -+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL), -+ 0 /* kernel space */ , -+ inode_scaled_cluster_size(inode), -+ clust_to_off(clust->index, inode), READ_OP, &f); -+ if (mode == ZNODE_WRITE_LOCK) { -+ /* reserve for flush to make dirty all the leaf nodes -+ which contain disk cluster */ -+ result = -+ reiser4_grab_space_force(estimate_dirty_cluster(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ goto out; -+ } -+ -+ ra_info.key_to_stop = f.key; -+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key())); -+ -+ while (f.length) { -+ result = find_cluster_item(hint, &f.key, mode, -+ NULL, FIND_EXACT, -+ (mode == ZNODE_WRITE_LOCK ? -+ CBK_FOR_INSERT : 0)); -+ switch (result) { -+ case CBK_COORD_NOTFOUND: -+ result = 0; -+ if (inode_scaled_offset -+ (inode, clust_to_off(clust->index, inode)) == -+ get_key_offset(&f.key)) { -+ /* first item not found, this is treated -+ as disk cluster is absent */ -+ clust->dstat = FAKE_DISK_CLUSTER; -+ goto out; -+ } -+ /* we are outside the cluster, stop search here */ -+ assert("edward-146", -+ f.length != inode_scaled_cluster_size(inode)); -+ goto ok; -+ case CBK_COORD_FOUND: -+ assert("edward-148", -+ hint->ext_coord.coord.between == AT_UNIT); -+ assert("edward-460", -+ hint->ext_coord.coord.unit_pos == 0); -+ -+ coord_clear_iplug(&hint->ext_coord.coord); -+ result = zload_ra(hint->ext_coord.coord.node, &ra_info); -+ if (unlikely(result)) -+ goto out; -+ iplug = item_plugin_by_coord(&hint->ext_coord.coord); -+ assert("edward-147", -+ item_id_by_coord(&hint->ext_coord.coord) == -+ CTAIL_ID); -+ -+ result = iplug->s.file.read(NULL, &f, hint); -+ if (result) { -+ zrelse(hint->ext_coord.coord.node); -+ goto out; -+ } -+ if (mode == ZNODE_WRITE_LOCK) { -+ /* Don't make dirty more nodes then it was -+ estimated (see comments before -+ estimate_dirty_cluster). Missed nodes will be -+ read up in flush time if they are evicted from -+ memory */ -+ if (dclust_get_extension_ncount(hint) <= -+ estimate_dirty_cluster(inode)) -+ znode_make_dirty(hint->ext_coord.coord.node); -+ -+ znode_set_convertible(hint->ext_coord.coord. 
-+ node); -+ } -+ zrelse(hint->ext_coord.coord.node); -+ break; -+ default: -+ goto out; -+ } -+ } -+ ok: -+ /* at least one item was found */ -+ /* NOTE-EDWARD: Callers should handle the case -+ when disk cluster is incomplete (-EIO) */ -+ tc->len = inode_scaled_cluster_size(inode) - f.length; -+ tc->lsize = lbytes(clust->index, inode); -+ assert("edward-1196", tc->len > 0); -+ assert("edward-1406", tc->lsize > 0); -+ -+ if (hint_is_unprepped_dclust(clust->hint)) { -+ clust->dstat = UNPR_DISK_CLUSTER; -+ } else if (clust->index == info->trunc_index) { -+ clust->dstat = TRNC_DISK_CLUSTER; -+ } else { -+ clust->dstat = PREP_DISK_CLUSTER; -+ dclust_set_extension_dsize(clust->hint, tc->len); -+ } -+ out: -+ assert("edward-1339", -+ get_current_context()->grabbed_blocks >= was_grabbed); -+ grabbed2free(get_current_context(), -+ get_current_super_private(), -+ get_current_context()->grabbed_blocks - was_grabbed); -+ return result; -+} -+ -+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode, -+ znode_lock_mode lock_mode) -+{ -+ reiser4_key key; -+ ra_info_t ra_info; -+ -+ assert("edward-730", reiser4_schedulable()); -+ assert("edward-731", clust != NULL); -+ assert("edward-732", inode != NULL); -+ -+ if (hint_is_valid(clust->hint)) { -+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER); -+ assert("edward-1294", -+ znode_is_write_locked(clust->hint->lh.node)); -+ /* already have a valid locked position */ -+ return (clust->dstat == -+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND : -+ CBK_COORD_FOUND); -+ } -+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode), -+ &key); -+ ra_info.key_to_stop = key; -+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key())); -+ -+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT, -+ CBK_FOR_INSERT); -+} -+ -+/* Read needed cluster pages before modifying. -+ If success, @clust->hint contains locked position in the tree. -+ Also: -+ . find and set disk cluster state -+ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER. -+*/ -+static int read_some_cluster_pages(struct inode * inode, -+ struct cluster_handle * clust) -+{ -+ int i; -+ int result = 0; -+ item_plugin *iplug; -+ struct reiser4_slide * win = clust->win; -+ znode_lock_mode mode = ZNODE_WRITE_LOCK; -+ -+ iplug = item_plugin_by_id(CTAIL_ID); -+ -+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+#if REISER4_DEBUG -+ if (clust->nr_pages == 0) { -+ /* start write hole from fake disk cluster */ -+ assert("edward-1117", win != NULL); -+ assert("edward-1118", win->stat == HOLE_WINDOW); -+ assert("edward-1119", new_logical_cluster(clust, inode)); -+ } -+#endif -+ if (new_logical_cluster(clust, inode)) { -+ /* -+ new page cluster is about to be written, nothing to read, -+ */ -+ assert("edward-734", reiser4_schedulable()); -+ assert("edward-735", clust->hint->lh.owner == NULL); -+ -+ if (clust->nr_pages) { -+ int off; -+ struct page * pg; -+ assert("edward-1419", clust->pages != NULL); -+ pg = clust->pages[clust->nr_pages - 1]; -+ assert("edward-1420", pg != NULL); -+ off = off_to_pgoff(win->off+win->count+win->delta); -+ if (off) { -+ lock_page(pg); -+ zero_user_segment(pg, off, PAGE_CACHE_SIZE); -+ unlock_page(pg); -+ } -+ } -+ clust->dstat = FAKE_DISK_CLUSTER; -+ return 0; -+ } -+ /* -+ Here we should search for disk cluster to figure out its real state. 
-+ Also there is one more important reason to do disk search: we need -+ to make the disk cluster _dirty_ if it exists -+ */ -+ -+ /* if a window is specified, read only the pages -+ that will be modified partially */ -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *pg = clust->pages[i]; -+ -+ lock_page(pg); -+ if (PageUptodate(pg)) { -+ unlock_page(pg); -+ continue; -+ } -+ unlock_page(pg); -+ -+ if (win && -+ i >= size_in_pages(win->off) && -+ i < off_to_pg(win->off + win->count + win->delta)) -+ /* page will be completely overwritten */ -+ continue; -+ -+ if (win && (i == clust->nr_pages - 1) && -+ /* the last page is -+ partially modified, -+ not uptodate .. */ -+ (size_in_pages(i_size_read(inode)) <= pg->index)) { -+ /* .. and appended, -+ so set zeroes to the rest */ -+ int offset; -+ lock_page(pg); -+ assert("edward-1260", -+ size_in_pages(win->off + win->count + -+ win->delta) - 1 == i); -+ -+ offset = -+ off_to_pgoff(win->off + win->count + win->delta); -+ zero_user_segment(pg, offset, PAGE_CACHE_SIZE); -+ unlock_page(pg); -+ /* still not uptodate */ -+ break; -+ } -+ lock_page(pg); -+ result = do_readpage_ctail(inode, clust, pg, mode); -+ -+ assert("edward-1526", ergo(!result, PageUptodate(pg))); -+ unlock_page(pg); -+ if (result) { -+ warning("edward-219", "do_readpage_ctail failed"); -+ goto out; -+ } -+ } -+ if (!tfm_cluster_is_uptodate(&clust->tc)) { -+ /* disk cluster unclaimed, but we need to make its znodes dirty -+ * so that flush will update/convert its content -+ */ -+ result = find_disk_cluster(clust, inode, -+ 0 /* do not read items */, -+ mode); -+ } -+ out: -+ tfm_cluster_clr_uptodate(&clust->tc); -+ return result; -+} -+ -+static int should_create_unprepped_cluster(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ assert("edward-737", clust != NULL); -+ -+ switch (clust->dstat) { -+ case PREP_DISK_CLUSTER: -+ case UNPR_DISK_CLUSTER: -+ return 0; -+ case FAKE_DISK_CLUSTER: -+ if (clust->win && -+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) { -+ assert("edward-1172", -+ new_logical_cluster(clust, inode)); -+ return 0; -+ } -+ return 1; -+ default: -+ impossible("edward-1173", "bad disk cluster state"); -+ return 0; -+ } -+} -+ -+static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ int result; -+ -+ assert("edward-1123", reiser4_schedulable()); -+ assert("edward-737", clust != NULL); -+ assert("edward-738", inode != NULL); -+ assert("edward-739", cryptcompress_inode_ok(inode)); -+ assert("edward-1053", clust->hint != NULL); -+ -+ if (!should_create_unprepped_cluster(clust, inode)) { -+ if (clust->reserved) { -+ cluster_reserved2free(estimate_insert_cluster(inode)); -+#if REISER4_DEBUG -+ assert("edward-1267", -+ clust->reserved_unprepped == -+ estimate_insert_cluster(inode)); -+ clust->reserved_unprepped -= -+ estimate_insert_cluster(inode); -+#endif -+ } -+ return 0; -+ } -+ assert("edward-1268", clust->reserved); -+ cluster_reserved2grabbed(estimate_insert_cluster(inode)); -+#if REISER4_DEBUG -+ assert("edward-1441", -+ clust->reserved_unprepped == estimate_insert_cluster(inode)); -+ clust->reserved_unprepped -= estimate_insert_cluster(inode); -+#endif -+ result = ctail_insert_unprepped_cluster(clust, inode); -+ if (result) -+ return result; -+ -+ inode_add_bytes(inode, inode_cluster_size(inode)); -+ -+ assert("edward-743", cryptcompress_inode_ok(inode)); -+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node)); -+ -+ clust->dstat = UNPR_DISK_CLUSTER; -+ return 0; -+}
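-+
-+/* Summary of the disk cluster states (dstat) that a logical cluster
-+ * passes through while being modified (see the helpers above):
-+ *
-+ *	FAKE_DISK_CLUSTER - no disk cluster exists yet (a new logical
-+ *			cluster, or a hole);
-+ *	UNPR_DISK_CLUSTER - an "unprepped" disk cluster has been inserted
-+ *			by cryptcompress_make_unprepped_cluster();
-+ *	PREP_DISK_CLUSTER - a complete disk cluster exists; its nodes are
-+ *			made dirty by find_disk_cluster();
-+ *	TRNC_DISK_CLUSTER - the cluster hit a concurrent truncate (see
-+ *			trunc_index in find_disk_cluster()).
-+ */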
-+ -+/* . Grab page cluster for read, write, setattr, etc. operations; -+ * . Truncate its complete pages, if needed; -+ */ -+int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust, -+ rw_op rw) -+{ -+ assert("edward-177", inode != NULL); -+ assert("edward-741", cryptcompress_inode_ok(inode)); -+ assert("edward-740", clust->pages != NULL); -+ -+ set_cluster_nrpages(clust, inode); -+ reset_cluster_pgset(clust, cluster_nrpages(inode)); -+ return grab_page_cluster(inode, clust, rw); -+} -+ -+/* Truncate complete page cluster of index @index. -+ * This is called by ->kill_hook() method of item -+ * plugin when deleting a disk cluster of such index. -+ */ -+void truncate_complete_page_cluster(struct inode *inode, cloff_t index, -+ int even_cows) -+{ -+ int found; -+ int nr_pages; -+ jnode *node; -+ struct page *pages[MAX_CLUSTER_NRPAGES]; -+ -+ node = jlookup(current_tree, get_inode_oid(inode), -+ clust_to_pg(index, inode)); -+ nr_pages = size_in_pages(lbytes(index, inode)); -+ assert("edward-1483", nr_pages != 0); -+ if (!node) -+ goto truncate; -+ found = find_get_pages(inode->i_mapping, -+ clust_to_pg(index, inode), -+ cluster_nrpages(inode), pages); -+ if (!found) { -+ assert("edward-1484", jnode_truncate_ok(inode, index)); -+ return; -+ } -+ lock_cluster(node); -+ -+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) -+ && index == 0) -+ /* converting to unix_file is in progress */ -+ JF_CLR(node, JNODE_CLUSTER_PAGE); -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* -+ * @nr_pages were checked in, but not yet checked out - -+ * we need to release them. (also there can be pages -+ * attached to page cache by read(), etc. - don't take -+ * them into account). -+ */ -+ assert("edward-1198", found >= nr_pages); -+ -+ /* free disk space grabbed for disk cluster converting */ -+ cluster_reserved2grabbed(estimate_update_cluster(inode)); -+ grabbed2free(get_current_context(), -+ get_current_super_private(), -+ estimate_update_cluster(inode)); -+ __put_page_cluster(0, nr_pages, pages, inode); -+ -+ /* This will clear dirty bit, uncapture and unlock jnode */ -+ unlock_cluster_uncapture(node); -+ } else -+ unlock_cluster(node); -+ jput(node); /* jlookup */ -+ put_found_pages(pages, found); /* find_get_pages */ -+ truncate: -+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) && -+ index == 0) -+ return; -+ truncate_page_cluster_range(inode, pages, index, 0, -+ cluster_nrpages(inode), -+ even_cows); -+ assert("edward-1201", -+ ergo(!reiser4_inode_get_flag(inode, -+ REISER4_FILE_CONV_IN_PROGRESS), -+ jnode_truncate_ok(inode, index))); -+ return; -+} -+ -+/* -+ * Set cluster handle @clust of a logical cluster before -+ * modifications which are supposed to be committed. -+ * -+ * . grab cluster pages; -+ * . reserve disk space; -+ * . maybe read pages from disk and set the disk cluster dirty; -+ * . maybe write hole and check in (partially zeroed) logical cluster; -+ * . create 'unprepped' disk cluster for new or fake logical one. 
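-+ *
-+ * On failure, the space reserved for the cluster is freed and the
-+ * page cluster is put back (see the err1/err2 exit paths below).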
-+ */ -+static int prepare_logical_cluster(struct inode *inode, -+ loff_t file_off, /* write position -+ in the file */ -+ loff_t to_file, /* bytes of users data -+ to write to the file */ -+ struct cluster_handle * clust, -+ logical_cluster_op op) -+{ -+ int result = 0; -+ struct reiser4_slide * win = clust->win; -+ -+ reset_cluster_params(clust); -+ cluster_set_tfm_act(&clust->tc, TFMA_READ); -+#if REISER4_DEBUG -+ clust->ctx = get_current_context(); -+#endif -+ assert("edward-1190", op != LC_INVAL); -+ -+ clust->op = op; -+ -+ result = prepare_page_cluster(inode, clust, WRITE_OP); -+ if (result) -+ return result; -+ assert("edward-1447", -+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0]))); -+ assert("edward-1448", -+ ergo(clust->nr_pages != 0, -+ jnode_is_cluster_page(jprivate(clust->pages[0])))); -+ -+ result = reserve4cluster(inode, clust); -+ if (result) -+ goto err1; -+ result = read_some_cluster_pages(inode, clust); -+ if (result) { -+ free_reserved4cluster(inode, -+ clust, -+ estimate_update_cluster(inode) + -+ estimate_insert_cluster(inode)); -+ goto err1; -+ } -+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER); -+ -+ result = cryptcompress_make_unprepped_cluster(clust, inode); -+ if (result) -+ goto err2; -+ if (win && win->stat == HOLE_WINDOW) { -+ result = write_hole(inode, clust, file_off, to_file); -+ if (result) -+ goto err2; -+ } -+ return 0; -+ err2: -+ free_reserved4cluster(inode, clust, -+ estimate_update_cluster(inode)); -+ err1: -+ put_page_cluster(clust, inode, WRITE_OP); -+ assert("edward-1125", result == -ENOSPC); -+ return result; -+} -+ -+/* set window by two offsets */ -+static void set_window(struct cluster_handle * clust, -+ struct reiser4_slide * win, struct inode *inode, -+ loff_t o1, loff_t o2) -+{ -+ assert("edward-295", clust != NULL); -+ assert("edward-296", inode != NULL); -+ assert("edward-1071", win != NULL); -+ assert("edward-297", o1 <= o2); -+ -+ clust->index = off_to_clust(o1, inode); -+ -+ win->off = off_to_cloff(o1, inode); -+ win->count = min((loff_t)(inode_cluster_size(inode) - win->off), -+ o2 - o1); -+ win->delta = 0; -+ -+ clust->win = win; -+} -+ -+static int set_cluster_by_window(struct inode *inode, -+ struct cluster_handle * clust, -+ struct reiser4_slide * win, size_t length, -+ loff_t file_off) -+{ -+ int result; -+ -+ assert("edward-197", clust != NULL); -+ assert("edward-1072", win != NULL); -+ assert("edward-198", inode != NULL); -+ -+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode)); -+ if (result) -+ return result; -+ -+ if (file_off > i_size_read(inode)) { -+ /* Uhmm, hole in cryptcompress file... */ -+ loff_t hole_size; -+ hole_size = file_off - inode->i_size; -+ -+ set_window(clust, win, inode, inode->i_size, file_off); -+ win->stat = HOLE_WINDOW; -+ if (win->off + hole_size < inode_cluster_size(inode)) -+ /* there is also user's data to append to the hole */ -+ win->delta = min(inode_cluster_size(inode) - -+ (win->off + win->count), length); -+ return 0; -+ } -+ set_window(clust, win, inode, file_off, file_off + length); -+ win->stat = DATA_WINDOW; -+ return 0; -+} -+ -+int set_cluster_by_page(struct cluster_handle * clust, struct page * page, -+ int count) -+{ -+ int result = 0; -+ int (*setting_actor)(struct cluster_handle * clust, int count); -+ -+ assert("edward-1358", clust != NULL); -+ assert("edward-1359", page != NULL); -+ assert("edward-1360", page->mapping != NULL); -+ assert("edward-1361", page->mapping->host != NULL); -+ -+ setting_actor = -+ (clust->pages ? 
reset_cluster_pgset : alloc_cluster_pgset); -+ result = setting_actor(clust, count); -+ clust->index = pg_to_clust(page->index, page->mapping->host); -+ return result; -+} -+ -+/* reset all the params that not get updated */ -+void reset_cluster_params(struct cluster_handle * clust) -+{ -+ assert("edward-197", clust != NULL); -+ -+ clust->dstat = INVAL_DISK_CLUSTER; -+ clust->tc.uptodate = 0; -+ clust->tc.len = 0; -+} -+ -+/* the heart of write_cryptcompress */ -+static loff_t do_write_cryptcompress(struct file *file, struct inode *inode, -+ const char __user *buf, size_t to_write, -+ loff_t pos, struct psched_context *cont) -+{ -+ int i; -+ hint_t *hint; -+ int result = 0; -+ size_t count; -+ struct reiser4_slide win; -+ struct cluster_handle clust; -+ struct cryptcompress_info * info; -+ -+ assert("edward-154", buf != NULL); -+ assert("edward-161", reiser4_schedulable()); -+ assert("edward-748", cryptcompress_inode_ok(inode)); -+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE); -+ assert("edward-1274", get_current_context()->grabbed_blocks == 0); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ return result; -+ } -+ count = to_write; -+ -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ info = cryptcompress_inode_data(inode); -+ -+ mutex_lock(&info->checkin_mutex); -+ -+ result = set_cluster_by_window(inode, &clust, &win, to_write, pos); -+ if (result) -+ goto out; -+ -+ if (next_window_stat(&win) == HOLE_WINDOW) { -+ /* write hole in this iteration -+ separated from the loop below */ -+ result = write_pschedule_hook(file, inode, -+ pos, -+ &clust, -+ cont); -+ if (result) -+ goto out; -+ result = prepare_logical_cluster(inode, pos, count, &clust, -+ LC_APPOV); -+ if (result) -+ goto out; -+ } -+ do { -+ const char __user * src; -+ unsigned page_off, to_page; -+ -+ assert("edward-750", reiser4_schedulable()); -+ -+ result = write_pschedule_hook(file, inode, -+ pos + to_write - count, -+ &clust, -+ cont); -+ if (result) -+ goto out; -+ if (cont->state == PSCHED_ASSIGNED_NEW) -+ /* done_lh was called in write_pschedule_hook */ -+ goto out_no_longterm_lock; -+ -+ result = prepare_logical_cluster(inode, pos, count, &clust, -+ LC_APPOV); -+ if (result) -+ goto out; -+ -+ assert("edward-751", cryptcompress_inode_ok(inode)); -+ assert("edward-204", win.stat == DATA_WINDOW); -+ assert("edward-1288", hint_is_valid(clust.hint)); -+ assert("edward-752", -+ znode_is_write_locked(hint->ext_coord.coord.node)); -+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK); -+ -+ /* set write position in page */ -+ page_off = off_to_pgoff(win.off); -+ -+ /* copy user's data to cluster pages */ -+ for (i = off_to_pg(win.off), src = buf; -+ i < size_in_pages(win.off + win.count); -+ i++, src += to_page) { -+ to_page = __mbp(win.off + win.count, i) - page_off; -+ assert("edward-1039", -+ page_off + to_page <= PAGE_CACHE_SIZE); -+ assert("edward-287", clust.pages[i] != NULL); -+ -+ fault_in_pages_readable(src, to_page); -+ -+ lock_page(clust.pages[i]); -+ result = -+ __copy_from_user((char *)kmap(clust.pages[i]) + -+ page_off, src, to_page); -+ kunmap(clust.pages[i]); -+ if (unlikely(result)) { -+ unlock_page(clust.pages[i]); -+ result = -EFAULT; -+ goto err2; -+ } -+ SetPageUptodate(clust.pages[i]); -+ set_page_dirty_notag(clust.pages[i]); -+ flush_dcache_page(clust.pages[i]); -+ mark_page_accessed(clust.pages[i]); -+ 
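/* the page now carries the user's data and is
-+			 * uptodate and dirty; drop its lock before
-+			 * copying into the next page of the window */
-+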
unlock_page(clust.pages[i]); -+ page_off = 0; -+ } -+ assert("edward-753", cryptcompress_inode_ok(inode)); -+ -+ result = checkin_logical_cluster(&clust, inode); -+ if (result) -+ goto err2; -+ -+ buf += win.count; -+ count -= win.count; -+ -+ result = balance_dirty_page_cluster(&clust, inode, 0, count, -+ win_count_to_nrpages(&win)); -+ if (result) -+ goto err1; -+ assert("edward-755", hint->lh.owner == NULL); -+ reset_cluster_params(&clust); -+ continue; -+ err2: -+ put_page_cluster(&clust, inode, WRITE_OP); -+ err1: -+ if (clust.reserved) -+ free_reserved4cluster(inode, -+ &clust, -+ estimate_update_cluster(inode)); -+ break; -+ } while (count); -+ out: -+ done_lh(&hint->lh); -+ save_file_hint(file, hint); -+ out_no_longterm_lock: -+ mutex_unlock(&info->checkin_mutex); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ assert("edward-195", -+ ergo((to_write == count), -+ (result < 0 || cont->state == PSCHED_ASSIGNED_NEW))); -+ return (to_write - count) ? (to_write - count) : result; -+} -+ -+/** -+ * plugin->write() -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @off: position in file to write to -+ */ -+ssize_t write_cryptcompress(struct file *file, const char __user *buf, -+ size_t count, loff_t *off, -+ struct psched_context *cont) -+{ -+ ssize_t result; -+ struct inode *inode; -+ reiser4_context *ctx; -+ loff_t pos = *off; -+ struct cryptcompress_info *info; -+ -+ assert("edward-1449", cont->state == PSCHED_INVAL_STATE); -+ -+ inode = file->f_dentry->d_inode; -+ assert("edward-196", cryptcompress_inode_ok(inode)); -+ -+ info = cryptcompress_inode_data(inode); -+ ctx = get_current_context(); -+ -+ result = generic_write_checks(file, &pos, &count, 0); -+ if (unlikely(result != 0)) { -+ context_set_commit_async(ctx); -+ return result; -+ } -+ if (unlikely(count == 0)) -+ return 0; -+ result = file_remove_suid(file); -+ if (unlikely(result != 0)) { -+ context_set_commit_async(ctx); -+ return result; -+ } -+ /* remove_suid might create a transaction */ -+ reiser4_txn_restart(ctx); -+ -+ result = do_write_cryptcompress(file, inode, buf, count, pos, cont); -+ -+ if (unlikely(result < 0)) { -+ context_set_commit_async(ctx); -+ return result; -+ } -+ /* update position in a file */ -+ *off = pos + result; -+ return result; -+} -+ -+/* plugin->readpages */ -+int readpages_cryptcompress(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ reiser4_context * ctx; -+ int ret; -+ -+ ctx = reiser4_init_context(mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ ret = PTR_ERR(ctx); -+ goto err; -+ } -+ /* cryptcompress file can be built of ctail items only */ -+ ret = readpages_ctail(file, mapping, pages); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ if (ret) { -+err: -+ put_pages_list(pages); -+ } -+ return ret; -+} -+ -+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode) -+{ -+ /* reserve one block to update stat data item */ -+ assert("edward-1193", -+ inode_file_plugin(inode)->estimate.update == -+ estimate_update_common); -+ return estimate_update_common(inode); -+} -+ -+/** -+ * plugin->read -+ * @file: file to read from -+ * @buf: address of user-space buffer -+ * @size: number of bytes to read -+ * @off: position in file to read from -+ */ -+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size, -+ loff_t * off) -+{ -+ ssize_t result; -+ struct inode *inode; -+ reiser4_context *ctx; -+ struct cryptcompress_info
*info; -+ reiser4_block_nr needed; -+ -+ inode = file->f_dentry->d_inode; -+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ info = cryptcompress_inode_data(inode); -+ needed = cryptcompress_estimate_read(inode); -+ -+ result = reiser4_grab_space(needed, BA_CAN_COMMIT); -+ if (result != 0) { -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ result = do_sync_read(file, buf, size, off); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return result; -+} -+ -+/* Look for a disk cluster and keep lookup result in @found. -+ * If @index > 0, then find disk cluster of the index (@index - 1); -+ * If @index == 0, then find the rightmost disk cluster. -+ * Keep incremented index of the found disk cluster in @found. -+ * @found == 0 means that disk cluster was not found (in the last -+ * case (@index == 0) it means that file doesn't have disk clusters). -+ */ -+static int lookup_disk_cluster(struct inode *inode, cloff_t * found, -+ cloff_t index) -+{ -+ int result; -+ reiser4_key key; -+ loff_t offset; -+ hint_t *hint; -+ lock_handle *lh; -+ lookup_bias bias; -+ coord_t *coord; -+ item_plugin *iplug; -+ -+ assert("edward-1131", inode != NULL); -+ assert("edward-95", cryptcompress_inode_ok(inode)); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN); -+ offset = -+ (index ? clust_to_off(index, inode) - -+ 1 : get_key_offset(reiser4_max_key())); -+ -+ key_by_inode_cryptcompress(inode, offset, &key); -+ -+ /* find the last item of this object */ -+ result = -+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */, -+ bias, 0); -+ if (cbk_errored(result)) { -+ done_lh(lh); -+ kfree(hint); -+ return result; -+ } -+ if (result == CBK_COORD_NOTFOUND) { -+ /* no real disk clusters */ -+ done_lh(lh); -+ kfree(hint); -+ *found = 0; -+ return 0; -+ } -+ /* disk cluster is found */ -+ coord = &hint->ext_coord.coord; -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (unlikely(result)) { -+ done_lh(lh); -+ kfree(hint); -+ return result; -+ } -+ iplug = item_plugin_by_coord(coord); -+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID)); -+ assert("edward-1202", ctail_ok(coord)); -+ -+ item_key_by_coord(coord, &key); -+ *found = off_to_clust(get_key_offset(&key), inode) + 1; -+ -+ assert("edward-1132", ergo(index, index == *found)); -+ -+ zrelse(coord->node); -+ done_lh(lh); -+ kfree(hint); -+ return 0; -+} -+ -+static int find_fake_appended(struct inode *inode, cloff_t * index) -+{ -+ return lookup_disk_cluster(inode, index, -+ 0 /* find last real one */ ); -+} -+ -+/* Set left coord when unit is not found after node_lookup() -+ This takes into account that there can be holes in a sequence -+ of disk clusters */ -+ -+static void adjust_left_coord(coord_t * left_coord) -+{ -+ switch (left_coord->between) { -+ case AFTER_UNIT: -+ left_coord->between = AFTER_ITEM; -+ case AFTER_ITEM: -+ case BEFORE_UNIT: -+ break; -+ default: -+ impossible("edward-1204", "bad left coord to cut"); -+ } -+ return; -+} -+ -+#define CRC_CUT_TREE_MIN_ITERATIONS 64 -+ -+/* plugin->cut_tree_worker */ -+int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ struct inode *object, int truncate, -+ int *progress) 
-+{ -+ lock_handle next_node_lock; -+ coord_t left_coord; -+ int result; -+ -+ assert("edward-1158", tap->coord->node != NULL); -+ assert("edward-1159", znode_is_write_locked(tap->coord->node)); -+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL); -+ -+ *progress = 0; -+ init_lh(&next_node_lock); -+ -+ while (1) { -+ znode *node; /* node from which items are cut */ -+ node_plugin *nplug; /* node plugin for @node */ -+ -+ node = tap->coord->node; -+ -+ /* Move next_node_lock to the next node on the left. */ -+ result = -+ reiser4_get_left_neighbor(&next_node_lock, node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result != 0 && result != -E_NO_NEIGHBOR) -+ break; -+ /* FIXME-EDWARD: Check can we delete the node as a whole. */ -+ result = reiser4_tap_load(tap); -+ if (result) -+ return result; -+ -+ /* Prepare the second (right) point for cut_node() */ -+ if (*progress) -+ coord_init_last_unit(tap->coord, node); -+ -+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL) -+ /* set rightmost unit for the items without lookup method */ -+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord); -+ -+ nplug = node->nplug; -+ -+ assert("edward-1161", nplug); -+ assert("edward-1162", nplug->lookup); -+ -+ /* left_coord is leftmost unit cut from @node */ -+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord); -+ -+ if (IS_CBKERR(result)) -+ break; -+ -+ if (result == CBK_COORD_NOTFOUND) -+ adjust_left_coord(&left_coord); -+ -+ /* adjust coordinates so that they are set to existing units */ -+ if (coord_set_to_right(&left_coord) -+ || coord_set_to_left(tap->coord)) { -+ result = 0; -+ break; -+ } -+ -+ if (coord_compare(&left_coord, tap->coord) == -+ COORD_CMP_ON_RIGHT) { -+ /* keys from @from_key to @to_key are not in the tree */ -+ result = 0; -+ break; -+ } -+ -+ /* cut data from one node */ -+ *smallest_removed = *reiser4_min_key(); -+ result = kill_node_content(&left_coord, -+ tap->coord, -+ from_key, -+ to_key, -+ smallest_removed, -+ next_node_lock.node, -+ object, truncate); -+ reiser4_tap_relse(tap); -+ -+ if (result) -+ break; -+ -+ ++(*progress); -+ -+ /* Check whether all items with keys >= from_key were removed -+ * from the tree. */ -+ if (keyle(smallest_removed, from_key)) -+ /* result = 0; */ -+ break; -+ -+ if (next_node_lock.node == NULL) -+ break; -+ -+ result = reiser4_tap_move(tap, &next_node_lock); -+ done_lh(&next_node_lock); -+ if (result) -+ break; -+ -+ /* Break long cut_tree operation (deletion of a large file) if -+ * atom requires commit. 
*/ -+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS -+ && current_atom_should_commit()) { -+ result = -E_REPEAT; -+ break; -+ } -+ } -+ done_lh(&next_node_lock); -+ return result; -+} -+ -+/* Append or expand hole in two steps: -+ * 1) set zeroes to the rightmost page of the rightmost non-fake -+ * logical cluster; -+ * 2) expand hole via fake logical clusters (just increase i_size) -+ */ -+static int cryptcompress_append_hole(struct inode *inode /* with old size */, -+ loff_t new_size) -+{ -+ int result = 0; -+ hint_t *hint; -+ lock_handle *lh; -+ loff_t hole_size; -+ int nr_zeroes; -+ struct reiser4_slide win; -+ struct cluster_handle clust; -+ -+ assert("edward-1133", inode->i_size < new_size); -+ assert("edward-1134", reiser4_schedulable()); -+ assert("edward-1135", cryptcompress_inode_ok(inode)); -+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE); -+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ if (off_to_cloff(inode->i_size, inode) == 0) -+ goto append_fake; -+ hole_size = new_size - inode->i_size; -+ nr_zeroes = -+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode); -+ if (hole_size < nr_zeroes) -+ nr_zeroes = hole_size; -+ set_window(&clust, &win, inode, inode->i_size, -+ inode->i_size + nr_zeroes); -+ win.stat = HOLE_WINDOW; -+ -+ assert("edward-1137", -+ clust.index == off_to_clust(inode->i_size, inode)); -+ -+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV); -+ -+ assert("edward-1271", !result || result == -ENOSPC); -+ if (result) -+ goto out; -+ assert("edward-1139", -+ clust.dstat == PREP_DISK_CLUSTER || -+ clust.dstat == UNPR_DISK_CLUSTER); -+ -+ assert("edward-1431", hole_size >= nr_zeroes); -+ if (hole_size == nr_zeroes) -+ /* nothing to append anymore */ -+ goto out; -+ append_fake: -+ INODE_SET_SIZE(inode, new_size); -+ out: -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+static int update_cryptcompress_size(struct inode *inode, loff_t new_size, -+ int update_sd) -+{ -+ return (new_size & ((loff_t) (inode_cluster_size(inode)) - 1) -+ ? 0 : reiser4_update_file_size(inode, new_size, update_sd)); -+} -+ -+/* Prune cryptcompress file in two steps: -+ * 1) cut all nominated logical clusters except the leftmost one which -+ * is to be partially truncated. Note, that there can be "holes" -+ * represented by fake logical clusters. -+ * 2) set zeroes and capture leftmost partially truncated logical -+ * cluster, if it is not fake; otherwise prune fake logical cluster -+ * (just decrease i_size). 
-+ */ -+static int prune_cryptcompress(struct inode *inode, loff_t new_size, -+ int update_sd, cloff_t aidx) -+{ -+ int result = 0; -+ unsigned nr_zeroes; -+ loff_t to_prune; -+ loff_t old_size; -+ cloff_t ridx; -+ -+ hint_t *hint; -+ lock_handle *lh; -+ struct reiser4_slide win; -+ struct cluster_handle clust; -+ -+ assert("edward-1140", inode->i_size >= new_size); -+ assert("edward-1141", reiser4_schedulable()); -+ assert("edward-1142", cryptcompress_inode_ok(inode)); -+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE); -+ -+ old_size = inode->i_size; -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ -+ /* calculate index of the rightmost logical cluster -+ that will be completely truncated */ -+ ridx = size_in_lc(new_size, inode); -+ -+ /* truncate all disk clusters starting from @ridx */ -+ assert("edward-1174", ridx <= aidx); -+ old_size = inode->i_size; -+ if (ridx != aidx) { -+ struct cryptcompress_info * info; -+ info = cryptcompress_inode_data(inode); -+ result = cut_file_items(inode, -+ clust_to_off(ridx, inode), -+ update_sd, -+ clust_to_off(aidx, inode), -+ update_cryptcompress_size); -+ info->trunc_index = ULONG_MAX; -+ if (result) -+ goto out; -+ } -+ /* -+ * there can be pages of fake logical clusters, truncate them -+ */ -+ truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode)); -+ assert("edward-1524", -+ pages_truncate_ok(inode, clust_to_pg(ridx, inode))); -+ /* -+ * now perform partial truncate of last logical cluster -+ */ -+ if (!off_to_cloff(new_size, inode)) { -+ /* no partial truncate is needed */ -+ assert("edward-1145", inode->i_size == new_size); -+ goto truncate_fake; -+ } -+ assert("edward-1146", new_size < inode->i_size); -+ -+ to_prune = inode->i_size - new_size; -+ -+ /* check if the last logical cluster is fake */ -+ result = lookup_disk_cluster(inode, &aidx, ridx); -+ if (result) -+ goto out; -+ if (!aidx) -+ /* yup, this is fake one */ -+ goto truncate_fake; -+ -+ assert("edward-1148", aidx == ridx); -+ -+ /* do partial truncate of the last page cluster, -+ and try to capture this one */ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ nr_zeroes = (off_to_pgoff(new_size) ? -+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0); -+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes); -+ win.stat = HOLE_WINDOW; -+ -+ assert("edward-1149", clust.index == ridx - 1); -+ -+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC); -+ if (result) -+ goto out; -+ assert("edward-1151", -+ clust.dstat == PREP_DISK_CLUSTER || -+ clust.dstat == UNPR_DISK_CLUSTER); -+ -+ assert("edward-1191", inode->i_size == new_size); -+ assert("edward-1206", body_truncate_ok(inode, ridx)); -+ truncate_fake: -+ /* drop all the pages that don't have jnodes (i.e. 
pages -+ which cannot be truncated by cut_file_items() because -+ of holes represented by fake disk clusters) including -+ the pages of the partially truncated cluster which was -+ released by prepare_logical_cluster() */ -+ INODE_SET_SIZE(inode, new_size); -+ truncate_inode_pages(inode->i_mapping, new_size); -+ out: -+ assert("edward-1334", !result || result == -ENOSPC); -+ assert("edward-1497", -+ pages_truncate_ok(inode, size_in_pages(new_size))); -+ -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+/* Prepare a cryptcompress file for truncate: -+ * prune or append rightmost fake logical clusters (if any) -+ */ -+static int start_truncate_fake(struct inode *inode, cloff_t aidx, -+ loff_t new_size, int update_sd) -+{ -+ int result = 0; -+ int bytes; -+ -+ if (new_size > inode->i_size) { -+ /* append */ -+ if (inode->i_size < clust_to_off(aidx, inode)) -+ /* no fake bytes */ -+ return 0; -+ bytes = new_size - inode->i_size; -+ INODE_SET_SIZE(inode, inode->i_size + bytes); -+ } else { -+ /* prune */ -+ if (inode->i_size <= clust_to_off(aidx, inode)) -+ /* no fake bytes */ -+ return 0; -+ bytes = inode->i_size - -+ max(new_size, clust_to_off(aidx, inode)); -+ if (!bytes) -+ return 0; -+ INODE_SET_SIZE(inode, inode->i_size - bytes); -+ /* In the case of a fake prune we need to drop the page cluster. -+ There are only 2 cases for a partially truncated page: -+ 1. If it is dirty, then it is anonymous -+ (was dirtied via mmap), and will be captured -+ later via ->capture(). -+ 2. If it is clean, then it is filled with zeroes. -+ In both cases we don't need to make it dirty and -+ capture here. -+ */ -+ truncate_inode_pages(inode->i_mapping, inode->i_size); -+ } -+ if (update_sd) -+ result = update_sd_cryptcompress(inode); -+ return result; -+} -+ -+/** -+ * This is called in setattr_cryptcompress when it is used to truncate, -+ * and in delete_object_cryptcompress -+ */ -+static int cryptcompress_truncate(struct inode *inode, /* old size */ -+ loff_t new_size, /* new size */ -+ int update_sd) -+{ -+ int result; -+ cloff_t aidx; -+ -+ result = find_fake_appended(inode, &aidx); -+ if (result) -+ return result; -+ assert("edward-1208", -+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode))); -+ -+ result = start_truncate_fake(inode, aidx, new_size, update_sd); -+ if (result) -+ return result; -+ if (inode->i_size == new_size) -+ /* nothing to truncate anymore */ -+ return 0; -+ result = (inode->i_size < new_size ? -+ cryptcompress_append_hole(inode, new_size) : -+ prune_cryptcompress(inode, new_size, update_sd, aidx)); -+ if (!result && update_sd) -+ result = update_sd_cryptcompress(inode); -+ return result; -+} -+ -+/** -+ * Capture a page cluster. -+ * @clust must be set up by the caller. -+ */ -+static int capture_page_cluster(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ int result; -+ -+ assert("edward-1073", clust != NULL); -+ assert("edward-1074", inode != NULL); -+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER); -+ -+ result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV); -+ if (result) -+ return result; -+ -+ set_cluster_pages_dirty(clust, inode); -+ result = checkin_logical_cluster(clust, inode); -+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK); -+ if (unlikely(result)) -+ put_page_cluster(clust, inode, WRITE_OP); -+ return result; -+} -+ -+/* Starting from @index find tagged pages of the same page cluster. -+ * Clear the tag for each of them. Return number of found pages.
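-+ * Each returned page carries an extra reference taken via
-+ * page_cache_get(); callers drop those references with
-+ * put_found_pages().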
-+ */ -+static int find_anon_page_cluster(struct address_space * mapping, -+ pgoff_t * index, struct page ** pages) -+{ -+ int i = 0; -+ int found; -+ spin_lock_irq(&mapping->tree_lock); -+ do { -+ /* looking for one page */ -+ found = radix_tree_gang_lookup_tag(&mapping->page_tree, -+ (void **)&pages[i], -+ *index, 1, -+ PAGECACHE_TAG_REISER4_MOVED); -+ if (!found) -+ break; -+ if (!same_page_cluster(pages[0], pages[i])) -+ break; -+ -+ /* found */ -+ page_cache_get(pages[i]); -+ *index = pages[i]->index + 1; -+ -+ radix_tree_tag_clear(&mapping->page_tree, -+ pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ if (last_page_in_cluster(pages[i++])) -+ break; -+ } while (1); -+ spin_unlock_irq(&mapping->tree_lock); -+ return i; -+} -+ -+#define MAX_PAGES_TO_CAPTURE (1024) -+ -+/* Capture anonymous page clusters */ -+static int capture_anon_pages(struct address_space * mapping, pgoff_t * index, -+ int to_capture) -+{ -+ int count = 0; -+ int found = 0; -+ int result = 0; -+ hint_t *hint; -+ lock_handle *lh; -+ struct inode * inode; -+ struct cluster_handle clust; -+ struct page * pages[MAX_CLUSTER_NRPAGES]; -+ -+ assert("edward-1127", mapping != NULL); -+ assert("edward-1128", mapping->host != NULL); -+ assert("edward-1440", mapping->host->i_mapping == mapping); -+ -+ inode = mapping->host; -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ cluster_init_read(&clust, NULL); -+ clust.hint = hint; -+ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ -+ while (to_capture > 0) { -+ found = find_anon_page_cluster(mapping, index, pages); -+ if (!found) { -+ *index = (pgoff_t) - 1; -+ break; -+ } -+ move_cluster_forward(&clust, inode, pages[0]->index); -+ result = capture_page_cluster(&clust, inode); -+ -+ put_found_pages(pages, found); /* find_anon_page_cluster */ -+ if (result) -+ break; -+ to_capture -= clust.nr_pages; -+ count += clust.nr_pages; -+ } -+ if (result) { -+ warning("edward-1077", -+ "Capture failed (inode %llu, result=%i, captured=%d)\n", -+ (unsigned long long)get_inode_oid(inode), result, count); -+ } else { -+ assert("edward-1078", ergo(found > 0, count > 0)); -+ if (to_capture <= 0) -+ /* there may be more pages left */ -+ __mark_inode_dirty(inode, I_DIRTY_PAGES); -+ result = count; -+ } -+ out: -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+/* Returns true if inode's mapping has dirty pages -+ which do not belong to any atom */ -+static int cryptcompress_inode_has_anon_pages(struct inode *inode) -+{ -+ int result; -+ spin_lock_irq(&inode->i_mapping->tree_lock); -+ result = radix_tree_tagged(&inode->i_mapping->page_tree, -+ PAGECACHE_TAG_REISER4_MOVED); -+ spin_unlock_irq(&inode->i_mapping->tree_lock); -+ return result; -+} -+ -+/* plugin->writepages */ -+int writepages_cryptcompress(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ int result = 0; -+ long to_capture; -+ pgoff_t nrpages; -+ pgoff_t index = 0; -+ struct inode *inode; -+ struct cryptcompress_info *info; -+ -+ inode = mapping->host; -+ if (!cryptcompress_inode_has_anon_pages(inode)) -+ goto end; -+ info = cryptcompress_inode_data(inode); -+ nrpages = size_in_pages(i_size_read(inode)); -+ -+ if (wbc->sync_mode != WB_SYNC_ALL) -+ to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE); -+ else -+ to_capture = MAX_PAGES_TO_CAPTURE; -+ do { -+ reiser4_context *ctx; -+ -+ ctx =
reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ result = PTR_ERR(ctx); -+ break; -+ } -+ /* avoid recursive calls to ->sync_inodes */ -+ ctx->nobalance = 1; -+ -+ assert("edward-1079", -+ lock_stack_isclean(get_current_lock_stack())); -+ -+ reiser4_txn_restart_current(); -+ -+ if (get_current_context()->entd) { -+ if (mutex_trylock(&info->checkin_mutex) == 0) { -+ /* the mutex might be occupied by -+ entd caller */ -+ result = RETERR(-EBUSY); -+ reiser4_exit_context(ctx); -+ break; -+ } -+ } else -+ mutex_lock(&info->checkin_mutex); -+ -+ result = capture_anon_pages(inode->i_mapping, &index, -+ to_capture); -+ mutex_unlock(&info->checkin_mutex); -+ -+ if (result < 0) { -+ reiser4_exit_context(ctx); -+ break; -+ } -+ wbc->nr_to_write -= result; -+ if (wbc->sync_mode != WB_SYNC_ALL) { -+ reiser4_exit_context(ctx); -+ break; -+ } -+ result = txnmgr_force_commit_all(inode->i_sb, 0); -+ reiser4_exit_context(ctx); -+ } while (result >= 0 && index < nrpages); -+ -+ end: -+ if (is_in_reiser4_context()) { -+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { -+ /* there are already pages to flush, flush them out, -+ do not delay until end of reiser4_sync_inodes */ -+ reiser4_writeout(inode->i_sb, wbc); -+ get_current_context()->nr_captured = 0; -+ } -+ } -+ return result; -+} -+ -+/* plugin->ioctl */ -+int ioctl_cryptcompress(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg) -+{ -+ return RETERR(-ENOSYS); -+} -+ -+/* plugin->mmap */ -+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma) -+{ -+ int result; -+ struct inode *inode; -+ reiser4_context *ctx; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ /* -+ * generic_file_mmap will do update_atime. Grab space for stat data -+ * update. 
-+ */ -+ result = reiser4_grab_space_force -+ (inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ if (result) { -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ result = generic_file_mmap(file, vma); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* plugin->delete_object */ -+int delete_object_cryptcompress(struct inode *inode) -+{ -+ int result; -+ struct cryptcompress_info * info; -+ -+ assert("edward-429", inode->i_nlink == 0); -+ -+ reiser4_txn_restart_current(); -+ info = cryptcompress_inode_data(inode); -+ -+ mutex_lock(&info->checkin_mutex); -+ result = cryptcompress_truncate(inode, 0, 0); -+ mutex_unlock(&info->checkin_mutex); -+ -+ if (result) { -+ warning("edward-430", -+ "cannot truncate cryptcompress file %lli: %i", -+ (unsigned long long)get_inode_oid(inode), -+ result); -+ } -+ truncate_inode_pages(inode->i_mapping, 0); -+ assert("edward-1487", pages_truncate_ok(inode, 0)); -+ /* and remove stat data */ -+ return reiser4_delete_object_common(inode); -+} -+ -+/* -+ * plugin->setattr -+ * This implements actual truncate (see comments in reiser4/page_cache.c) -+ */ -+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr) -+{ -+ int result; -+ struct inode *inode; -+ struct cryptcompress_info * info; -+ -+ inode = dentry->d_inode; -+ info = cryptcompress_inode_data(inode); -+ -+ if (attr->ia_valid & ATTR_SIZE) { -+ if (i_size_read(inode) != attr->ia_size) { -+ reiser4_context *ctx; -+ loff_t old_size; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ result = setattr_pschedule_hook(inode); -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ old_size = i_size_read(inode); -+ inode_check_scale(inode, old_size, attr->ia_size); -+ -+ mutex_lock(&info->checkin_mutex); -+ result = cryptcompress_truncate(inode, -+ attr->ia_size, -+ 1/* update sd */); -+ mutex_unlock(&info->checkin_mutex); -+ if (result) { -+ warning("edward-1192", -+ "truncate_cryptcompress failed: oid %lli, " -+ "old size %lld, new size %lld, retval %d", -+ (unsigned long long) -+ get_inode_oid(inode), old_size, -+ attr->ia_size, result); -+ } -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ } else -+ result = 0; -+ } else -+ result = reiser4_setattr_common(dentry, attr); -+ return result; -+} -+ -+/* plugin->release */ -+int release_cryptcompress(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx = reiser4_init_context(inode->i_sb); -+ -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ reiser4_free_file_fsdata(file); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/* plugin->prepare_write */ -+int write_begin_cryptcompress(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ return do_prepare_write(file, page, from, to); -+} -+ -+/* plugin->commit_write */ -+int write_end_cryptcompress(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ int ret; -+ hint_t *hint; -+ lock_handle *lh; -+ struct inode * inode; -+ struct cluster_handle clust; -+ -+ unlock_page(page); -+ -+ inode = page->mapping->host; -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ cluster_init_read(&clust, NULL); -+ clust.hint = hint; -+ -+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (ret) -+ goto out; -+ clust.index = pg_to_clust(page->index, inode); -+ ret = 
capture_page_cluster(&clust, inode); -+ if (ret) -+ warning("edward-1557", -+ "Capture failed (inode %llu, result=%i)", -+ (unsigned long long)get_inode_oid(inode), ret); -+ out: -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return ret; -+} -+ -+/* plugin->bmap */ -+sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock) -+{ -+ return -EINVAL; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.30/fs/reiser4/plugin/file/cryptcompress.h ---- linux-2.6.30.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/cryptcompress.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,616 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* See http://www.namesys.com/cryptcompress_design.html */ -+ -+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ ) -+#define __FS_REISER4_CRYPTCOMPRESS_H__ -+ -+#include "../../page_cache.h" -+#include "../compress/compress.h" -+#include "../crypto/cipher.h" -+ -+#include <linux/pagemap.h> -+ -+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT -+#define MAX_CLUSTER_SHIFT 16 -+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT) -+#define DC_CHECKSUM_SIZE 4 -+ -+#define MIN_LATTICE_FACTOR 1 -+#define MAX_LATTICE_FACTOR 32 -+ -+/* this mask contains all non-standard plugins that might -+ be present in reiser4-specific part of inode managed by -+ cryptcompress file plugin */ -+#define cryptcompress_mask \ -+ ((1 << PSET_FILE) | \ -+ (1 << PSET_CLUSTER) | \ -+ (1 << PSET_CIPHER) | \ -+ (1 << PSET_DIGEST) | \ -+ (1 << PSET_COMPRESSION) | \ -+ (1 << PSET_COMPRESSION_MODE)) -+ -+#if REISER4_DEBUG -+static inline int cluster_shift_ok(int shift) -+{ -+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT); -+} -+#endif -+ -+#if REISER4_DEBUG -+#define INODE_PGCOUNT(inode) \ -+({ \ -+ assert("edward-1530", inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \ -+ atomic_read(&cryptcompress_inode_data(inode)->pgcount); \ -+ }) -+#define INODE_PGCOUNT_INC(inode) \ -+do { \ -+ assert("edward-1531", inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \ -+ atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \ -+} while (0) -+#define INODE_PGCOUNT_DEC(inode) \ -+do { \ -+ if (inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \ -+ atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \ -+} while (0) -+#else -+#define INODE_PGCOUNT(inode) (0) -+#define INODE_PGCOUNT_INC(inode) -+#define INODE_PGCOUNT_DEC(inode) -+#endif /* REISER4_DEBUG */ -+ -+struct tfm_stream { -+ __u8 *data; -+ size_t size; -+}; -+ -+typedef enum { -+ INPUT_STREAM, -+ OUTPUT_STREAM, -+ LAST_STREAM -+} tfm_stream_id; -+ -+typedef struct tfm_stream * tfm_unit[LAST_STREAM]; -+ -+static inline __u8 *ts_data(struct tfm_stream * stm) -+{ -+ assert("edward-928", stm != NULL); -+ return stm->data; -+} -+ -+static inline size_t ts_size(struct tfm_stream * stm) -+{ -+ assert("edward-929", stm != NULL); -+ return stm->size; -+} -+ -+static inline void set_ts_size(struct tfm_stream * stm, size_t size) -+{ -+ assert("edward-930", stm != NULL); -+ -+ stm->size = size; -+} -+ -+static inline int alloc_ts(struct tfm_stream ** stm) -+{ -+ assert("edward-931", stm); -+ 
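/* note: the caller passes the address of a stream pointer that must still be NULL (checked next); alloc_ts() only allocates the zeroed descriptor, and the payload buffer is attached separately via alloc_ts_data() */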
assert("edward-932", *stm == NULL); -+ -+ *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get()); -+ if (!*stm) -+ return -ENOMEM; -+ return 0; -+} -+ -+static inline void free_ts(struct tfm_stream * stm) -+{ -+ assert("edward-933", !ts_data(stm)); -+ assert("edward-934", !ts_size(stm)); -+ -+ kfree(stm); -+} -+ -+static inline int alloc_ts_data(struct tfm_stream * stm, size_t size) -+{ -+ assert("edward-935", !ts_data(stm)); -+ assert("edward-936", !ts_size(stm)); -+ assert("edward-937", size != 0); -+ -+ stm->data = reiser4_vmalloc(size); -+ if (!stm->data) -+ return -ENOMEM; -+ set_ts_size(stm, size); -+ return 0; -+} -+ -+static inline void free_ts_data(struct tfm_stream * stm) -+{ -+ assert("edward-938", equi(ts_data(stm), ts_size(stm))); -+ -+ if (ts_data(stm)) -+ vfree(ts_data(stm)); -+ memset(stm, 0, sizeof *stm); -+} -+ -+/* Write modes for item conversion in flush convert phase */ -+typedef enum { -+ CRC_APPEND_ITEM = 1, -+ CRC_OVERWRITE_ITEM = 2, -+ CRC_CUT_ITEM = 3 -+} cryptcompress_write_mode_t; -+ -+typedef enum { -+ LC_INVAL = 0, /* invalid value */ -+ LC_APPOV = 1, /* append and/or overwrite */ -+ LC_TRUNC = 2 /* truncate */ -+} logical_cluster_op; -+ -+/* Transform cluster. -+ * Intermediate state between page cluster and disk cluster -+ * Is used for data transform (compression/encryption) -+ */ -+struct tfm_cluster { -+ coa_set coa; /* compression algorithms info */ -+ tfm_unit tun; /* plain and transformed streams */ -+ tfm_action act; -+ int uptodate; -+ int lsize; /* number of bytes in logical cluster */ -+ int len; /* length of the transform stream */ -+}; -+ -+static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id, -+ tfm_action act) -+{ -+ return tc->coa[id][act]; -+} -+ -+static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id, -+ tfm_action act, coa_t coa) -+{ -+ tc->coa[id][act] = coa; -+} -+ -+static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug) -+{ -+ coa_t coa; -+ -+ coa = cplug->alloc(tc->act); -+ if (IS_ERR(coa)) -+ return PTR_ERR(coa); -+ set_coa(tc, cplug->h.id, tc->act, coa); -+ return 0; -+} -+ -+static inline int -+grab_coa(struct tfm_cluster * tc, compression_plugin * cplug) -+{ -+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ? 
-+ alloc_coa(tc, cplug) : 0); -+} -+ -+static inline void free_coa_set(struct tfm_cluster * tc) -+{ -+ tfm_action j; -+ reiser4_compression_id i; -+ compression_plugin *cplug; -+ -+ assert("edward-810", tc != NULL); -+ -+ for (j = 0; j < TFMA_LAST; j++) -+ for (i = 0; i < LAST_COMPRESSION_ID; i++) { -+ if (!get_coa(tc, i, j)) -+ continue; -+ cplug = compression_plugin_by_id(i); -+ assert("edward-812", cplug->free != NULL); -+ cplug->free(get_coa(tc, i, j), j); -+ set_coa(tc, i, j, 0); -+ } -+ return; -+} -+ -+static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc, -+ tfm_stream_id id) -+{ -+ return tc->tun[id]; -+} -+ -+static inline void set_tfm_stream(struct tfm_cluster * tc, -+ tfm_stream_id id, struct tfm_stream * ts) -+{ -+ tc->tun[id] = ts; -+} -+ -+static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ return ts_data(get_tfm_stream(tc, id)); -+} -+ -+static inline void set_tfm_stream_data(struct tfm_cluster * tc, -+ tfm_stream_id id, __u8 * data) -+{ -+ get_tfm_stream(tc, id)->data = data; -+} -+ -+static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ return ts_size(get_tfm_stream(tc, id)); -+} -+ -+static inline void -+set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size) -+{ -+ get_tfm_stream(tc, id)->size = size; -+} -+ -+static inline int -+alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id) -+{ -+ assert("edward-939", tc != NULL); -+ assert("edward-940", !get_tfm_stream(tc, id)); -+ -+ tc->tun[id] = kzalloc(sizeof(struct tfm_stream), -+ reiser4_ctx_gfp_mask_get()); -+ if (!tc->tun[id]) -+ return -ENOMEM; -+ return alloc_ts_data(get_tfm_stream(tc, id), size); -+} -+ -+static inline int -+realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id) -+{ -+ assert("edward-941", tfm_stream_size(tc, id) < size); -+ free_ts_data(get_tfm_stream(tc, id)); -+ return alloc_ts_data(get_tfm_stream(tc, id), size); -+} -+ -+static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ free_ts_data(get_tfm_stream(tc, id)); -+ free_ts(get_tfm_stream(tc, id)); -+ set_tfm_stream(tc, id, 0); -+} -+ -+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen) -+{ -+ return (cplug->overrun != NULL ? 
cplug->overrun(ilen) : 0); -+} -+ -+static inline void free_tfm_unit(struct tfm_cluster * tc) -+{ -+ tfm_stream_id id; -+ for (id = 0; id < LAST_STREAM; id++) { -+ if (!get_tfm_stream(tc, id)) -+ continue; -+ free_tfm_stream(tc, id); -+ } -+} -+ -+static inline void put_tfm_cluster(struct tfm_cluster * tc) -+{ -+ assert("edward-942", tc != NULL); -+ free_coa_set(tc); -+ free_tfm_unit(tc); -+} -+ -+static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc) -+{ -+ assert("edward-943", tc != NULL); -+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1); -+ return (tc->uptodate == 1); -+} -+ -+static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc) -+{ -+ assert("edward-945", tc != NULL); -+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1); -+ tc->uptodate = 1; -+ return; -+} -+ -+static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc) -+{ -+ assert("edward-947", tc != NULL); -+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1); -+ tc->uptodate = 0; -+ return; -+} -+ -+static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ return (get_tfm_stream(tc, id) && -+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id)); -+} -+ -+static inline int tfm_cluster_is_set(struct tfm_cluster * tc) -+{ -+ int i; -+ for (i = 0; i < LAST_STREAM; i++) -+ if (!tfm_stream_is_set(tc, i)) -+ return 0; -+ return 1; -+} -+ -+static inline void alternate_streams(struct tfm_cluster * tc) -+{ -+ struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM); -+ -+ set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM)); -+ set_tfm_stream(tc, OUTPUT_STREAM, tmp); -+} -+ -+/* Set of states to indicate a kind of data -+ * that will be written to the window */ -+typedef enum { -+ DATA_WINDOW, /* user's data */ -+ HOLE_WINDOW /* zeroes (such kind of data can be written -+ * if we start to write from offset > i_size) */ -+} window_stat; -+ -+/* Window (of logical cluster size) discretely sliding along a file. -+ * Is used to locate hole region in a logical cluster to be properly -+ * represented on disk. -+ * We split a write to cryptcompress file into writes to its logical -+ * clusters. Before writing to a logical cluster we set a window, i.e. -+ * calculate values of the following fields: -+ */ -+struct reiser4_slide { -+ unsigned off; /* offset to write from */ -+ unsigned count; /* number of bytes to write */ -+ unsigned delta; /* number of bytes to append to the hole */ -+ window_stat stat; /* what kind of data will be written starting -+ from @off */ -+}; -+ -+/* Possible states of a disk cluster */ -+typedef enum { -+ INVAL_DISK_CLUSTER, /* unknown state */ -+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush -+ * at least 1 time */ -+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be -+ * converted by flush */ -+ FAKE_DISK_CLUSTER, /* disk cluster exists neither in memory -+ * nor on disk */ -+ TRNC_DISK_CLUSTER /* disk cluster is partially truncated */ -+} disk_cluster_stat; -+ -+/* The following structure represents various stages of the same logical -+ * cluster of index @index: -+ * . fixed slide -+ * . page cluster (stage in primary cache) -+ * . transform cluster (transition stage) -+ * . disk cluster (stage in secondary cache) -+ * This structure is used in transition and synchronizing operations, e.g. -+ * transform cluster is a transition state when synchronizing page cluster -+ * and disk cluster. -+ * FIXME: Encapsulate page cluster, disk cluster.
-+ */ -+struct cluster_handle { -+ cloff_t index; /* offset in a file (unit is a cluster size) */ -+ int index_valid; /* for validating the index above, if needed */ -+ struct file *file; /* host file */ -+ -+ /* logical cluster */ -+ struct reiser4_slide *win; /* sliding window to locate holes */ -+ logical_cluster_op op; /* logical cluster operation (truncate or -+ append/overwrite) */ -+ /* transform cluster */ -+ struct tfm_cluster tc; /* contains all needed info to synchronize -+ page cluster and disk cluster */ -+ /* page cluster */ -+ int nr_pages; /* number of pages of current checkin action */ -+ int old_nrpages; /* number of pages of last checkin action */ -+ struct page **pages; /* attached pages */ -+ jnode * node; /* jnode for capture */ -+ -+ /* disk cluster */ -+ hint_t *hint; /* current position in the tree */ -+ disk_cluster_stat dstat; /* state of the current disk cluster */ -+ int reserved; /* is space for disk cluster reserved */ -+#if REISER4_DEBUG -+ reiser4_context *ctx; -+ int reserved_prepped; -+ int reserved_unprepped; -+#endif -+ -+}; -+ -+static inline __u8 * tfm_input_data (struct cluster_handle * clust) -+{ -+ return tfm_stream_data(&clust->tc, INPUT_STREAM); -+} -+ -+static inline __u8 * tfm_output_data (struct cluster_handle * clust) -+{ -+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM); -+} -+ -+static inline int reset_cluster_pgset(struct cluster_handle * clust, -+ int nrpages) -+{ -+ assert("edward-1057", clust->pages != NULL); -+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages); -+ return 0; -+} -+ -+static inline int alloc_cluster_pgset(struct cluster_handle * clust, -+ int nrpages) -+{ -+ assert("edward-949", clust != NULL); -+ assert("edward-1362", clust->pages == NULL); -+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES); -+ -+ clust->pages = kzalloc(sizeof(*clust->pages) * nrpages, -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+static inline void move_cluster_pgset(struct cluster_handle *clust, -+ struct page ***pages, int * nr_pages) -+{ -+ assert("edward-1545", clust != NULL && clust->pages != NULL); -+ assert("edward-1546", pages != NULL && *pages == NULL); -+ *pages = clust->pages; -+ *nr_pages = clust->nr_pages; -+ clust->pages = NULL; -+} -+ -+static inline void free_cluster_pgset(struct cluster_handle * clust) -+{ -+ assert("edward-951", clust->pages != NULL); -+ kfree(clust->pages); -+ clust->pages = NULL; -+} -+ -+static inline void put_cluster_handle(struct cluster_handle * clust) -+{ -+ assert("edward-435", clust != NULL); -+ -+ put_tfm_cluster(&clust->tc); -+ if (clust->pages) -+ free_cluster_pgset(clust); -+ memset(clust, 0, sizeof *clust); -+} -+ -+static inline void inc_keyload_count(struct reiser4_crypto_info * data) -+{ -+ assert("edward-1410", data != NULL); -+ data->keyload_count++; -+} -+ -+static inline void dec_keyload_count(struct reiser4_crypto_info * data) -+{ -+ assert("edward-1411", data != NULL); -+ assert("edward-1412", data->keyload_count > 0); -+ data->keyload_count--; -+} -+ -+static inline int capture_cluster_jnode(jnode * node) -+{ -+ return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+} -+ -+/* cryptcompress specific part of reiser4_inode */ -+struct cryptcompress_info { -+ struct mutex checkin_mutex; /* This is to serialize -+ * checkin_logical_cluster operations */ -+ cloff_t trunc_index; /* Index of the leftmost truncated disk -+ * cluster (to resolve races with read) */ -+ struct reiser4_crypto_info *crypt; -+ /* -+ *
the following 2 fields are controlled by compression mode plugin -+ */ -+ int compress_toggle; /* Current status of compressibility */ -+ int lattice_factor; /* Factor of dynamic lattice. FIXME: Have -+ * a compression_toggle to keep the factor -+ */ -+#if REISER4_DEBUG -+ atomic_t pgcount; /* number of grabbed pages */ -+#endif -+}; -+ -+static inline void set_compression_toggle (struct cryptcompress_info * info, int val) -+{ -+ info->compress_toggle = val; -+} -+ -+static inline int get_compression_toggle (struct cryptcompress_info * info) -+{ -+ return info->compress_toggle; -+} -+ -+static inline int compression_is_on(struct cryptcompress_info * info) -+{ -+ return get_compression_toggle(info) == 1; -+} -+ -+static inline void turn_on_compression(struct cryptcompress_info * info) -+{ -+ set_compression_toggle(info, 1); -+} -+ -+static inline void turn_off_compression(struct cryptcompress_info * info) -+{ -+ set_compression_toggle(info, 0); -+} -+ -+static inline void set_lattice_factor(struct cryptcompress_info * info, int val) -+{ -+ info->lattice_factor = val; -+} -+ -+static inline int get_lattice_factor(struct cryptcompress_info * info) -+{ -+ return info->lattice_factor; -+} -+ -+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *); -+int equal_to_rdk(znode *, const reiser4_key *); -+int goto_right_neighbor(coord_t *, lock_handle *); -+int cryptcompress_inode_ok(struct inode *inode); -+int coord_is_unprepped_ctail(const coord_t * coord); -+extern int do_readpage_ctail(struct inode *, struct cluster_handle *, -+ struct page * page, znode_lock_mode mode); -+extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust, -+ struct inode * inode); -+extern int readpages_cryptcompress(struct file*, struct address_space*, -+ struct list_head*, unsigned); -+int bind_cryptcompress(struct inode *child, struct inode *parent); -+void destroy_inode_cryptcompress(struct inode * inode); -+int grab_page_cluster(struct inode *inode, struct cluster_handle * clust, -+ rw_op rw); -+int write_pschedule_hook(struct file *file, struct inode * inode, -+ loff_t pos, struct cluster_handle * clust, -+ struct psched_context * cont); -+int setattr_pschedule_hook(struct inode * inode); -+struct reiser4_crypto_info * inode_crypto_info(struct inode * inode); -+void inherit_crypto_info_common(struct inode * parent, struct inode * object, -+ int (*can_inherit)(struct inode * child, -+ struct inode * parent)); -+void reiser4_attach_crypto_info(struct inode * inode, -+ struct reiser4_crypto_info * info); -+void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new); -+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode); -+ -+static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info) -+{ -+ return info->cipher; -+} -+ -+static inline void info_set_cipher(struct reiser4_crypto_info * info, -+ struct crypto_blkcipher * tfm) -+{ -+ info->cipher = tfm; -+} -+ -+static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info) -+{ -+ return info->digest; -+} -+ -+static inline void info_set_digest(struct reiser4_crypto_info * info, -+ struct crypto_hash * tfm) -+{ -+ info->digest = tfm; -+} -+ -+static inline void put_cluster_page(struct page * page) -+{ -+ page_cache_release(page); -+} -+ -+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/file.c linux-2.6.30/fs/reiser4/plugin/file/file.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file/file.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/file.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,2687 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * this file contains implementations of inode/file/address_space/file plugin -+ * operations specific for "unix file plugin" (plugin id is -+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only -+ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have -+ * no items but stat data) -+ */ -+ -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../tree_walk.h" -+#include "../../carry.h" -+#include "../../page_cache.h" -+#include "../../ioctl.h" -+#include "../object.h" -+#include "../cluster.h" -+#include "../../safe_link.h" -+ -+#include <linux/writeback.h> -+#include <linux/pagevec.h> -+#include <linux/syscalls.h> -+ -+ -+static int unpack(struct file *file, struct inode *inode, int forever); -+static void drop_access(struct unix_file_info *); -+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key, -+ znode_lock_mode lock_mode); -+ -+/* Get exclusive access and make sure that file is not partially -+ * converted (It may happen that another process is doing tail -+ * conversion. If so, wait until it completes) -+ */ -+static inline void get_exclusive_access_careful(struct unix_file_info * uf_info, -+ struct inode *inode) -+{ -+ do { -+ get_exclusive_access(uf_info); -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) -+ break; -+ drop_exclusive_access(uf_info); -+ schedule(); -+ } while (1); -+} -+ -+/* get unix file plugin specific portion of inode */ -+struct unix_file_info *unix_file_inode_data(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info; -+} -+ -+/** -+ * equal_to_rdk - compare key and znode's right delimiting key -+ * @node: node whose right delimiting key to compare with @key -+ * @key: key to compare with @node's right delimiting key -+ * -+ * Returns true if @key is equal to right delimiting key of @node. -+ */ -+int equal_to_rdk(znode *node, const reiser4_key *key) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyeq(key, znode_get_rd_key(node)); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+/** -+ * equal_to_ldk - compare key and znode's left delimiting key -+ * @node: node whose left delimiting key to compare with @key -+ * @key: key to compare with @node's left delimiting key -+ * -+ * Returns true if @key is equal to left delimiting key of @node. -+ */ -+int equal_to_ldk(znode *node, const reiser4_key *key) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyeq(key, znode_get_ld_key(node)); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+/** -+ * check_coord - check whether coord corresponds to key -+ * @coord: coord to check -+ * @key: key @coord has to correspond to -+ * -+ * Returns true if @coord is set as if it was set as result of lookup with @key -+ * in coord->node. 
-+ */ -+static int check_coord(const coord_t *coord, const reiser4_key *key) -+{ -+ coord_t twin; -+ -+ node_plugin_by_node(coord->node)->lookup(coord->node, key, -+ FIND_MAX_NOT_MORE_THAN, &twin); -+ return coords_equal(coord, &twin); -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/** -+ * init_uf_coord - initialize extended coord -+ * @uf_coord: -+ * @lh: -+ * -+ * -+ */ -+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh) -+{ -+ coord_init_zero(&uf_coord->coord); -+ coord_clear_iplug(&uf_coord->coord); -+ uf_coord->lh = lh; -+ init_lh(lh); -+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension)); -+ uf_coord->valid = 0; -+} -+ -+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) -+{ -+ assert("vs-1333", uf_coord->valid == 0); -+ -+ if (coord_is_between_items(&uf_coord->coord)) -+ return; -+ -+ assert("vs-1348", -+ item_plugin_by_coord(&uf_coord->coord)->s.file. -+ init_coord_extension); -+ -+ item_body_by_coord(&uf_coord->coord); -+ item_plugin_by_coord(&uf_coord->coord)->s.file. -+ init_coord_extension(uf_coord, offset); -+} -+ -+/** -+ * goto_right_neighbor - lock right neighbor, drop current node lock -+ * @coord: -+ * @lh: -+ * -+ * Obtain lock on right neighbor and drop lock on current node. -+ */ -+int goto_right_neighbor(coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ lock_handle lh_right; -+ -+ assert("vs-1100", znode_is_locked(coord->node)); -+ -+ init_lh(&lh_right); -+ result = reiser4_get_right_neighbor(&lh_right, coord->node, -+ znode_is_wlocked(coord->node) ? -+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result) { -+ done_lh(&lh_right); -+ return result; -+ } -+ -+ /* -+ * we hold two longterm locks on neighboring nodes. Unlock left of -+ * them -+ */ -+ done_lh(lh); -+ -+ coord_init_first_unit_nocheck(coord, lh_right.node); -+ move_lh(lh, &lh_right); -+ -+ return 0; -+ -+} -+ -+/** -+ * set_file_state -+ * @uf_info: -+ * @cbk_result: -+ * @level: -+ * -+ * This is to be used by find_file_item and in find_file_state to -+ * determine real state of file -+ */ -+static void set_file_state(struct unix_file_info *uf_info, int cbk_result, -+ tree_level level) -+{ -+ if (cbk_errored(cbk_result)) -+ /* error happened in find_file_item */ -+ return; -+ -+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL); -+ -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ if (cbk_result == CBK_COORD_NOTFOUND) -+ uf_info->container = UF_CONTAINER_EMPTY; -+ else if (level == LEAF_LEVEL) -+ uf_info->container = UF_CONTAINER_TAILS; -+ else -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ } else { -+ /* -+ * file state is known, check whether it is set correctly if -+ * file is not being tail converted -+ */ -+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info), -+ REISER4_PART_IN_CONV)) { -+ assert("vs-1162", -+ ergo(level == LEAF_LEVEL && -+ cbk_result == CBK_COORD_FOUND, -+ uf_info->container == UF_CONTAINER_TAILS)); -+ assert("vs-1165", -+ ergo(level == TWIG_LEVEL && -+ cbk_result == CBK_COORD_FOUND, -+ uf_info->container == UF_CONTAINER_EXTENTS)); -+ } -+ } -+} -+ -+int find_file_item_nohint(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key, znode_lock_mode lock_mode, -+ struct inode *inode) -+{ -+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode, -+ FIND_MAX_NOT_MORE_THAN, -+ TWIG_LEVEL, LEAF_LEVEL, -+ (lock_mode == ZNODE_READ_LOCK) ? 
CBK_UNIQUE : -+ (CBK_UNIQUE | CBK_FOR_INSERT), -+ NULL /* ra_info */ ); -+} -+ -+/** -+ * find_file_item - look for file item in the tree -+ * @hint: provides coordinate, lock handle, seal -+ * @key: key for search -+ * @mode: mode of lock to put on returned node -+ * @ra_info: -+ * @inode: -+ * -+ * This finds position in the tree corresponding to @key. It first tries to use -+ * @hint's seal if it is set. -+ */ -+int find_file_item(hint_t *hint, const reiser4_key *key, -+ znode_lock_mode lock_mode, -+ struct inode *inode) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle *lh; -+ -+ assert("nikita-3030", reiser4_schedulable()); -+ assert("vs-1707", hint != NULL); -+ assert("vs-47", inode != NULL); -+ -+ coord = &hint->ext_coord.coord; -+ lh = hint->ext_coord.lh; -+ init_lh(lh); -+ -+ result = hint_validate(hint, key, 1 /* check key */, lock_mode); -+ if (!result) { -+ if (coord->between == AFTER_UNIT && -+ equal_to_rdk(coord->node, key)) { -+ result = goto_right_neighbor(coord, lh); -+ if (result == -E_NO_NEIGHBOR) -+ return RETERR(-EIO); -+ if (result) -+ return result; -+ assert("vs-1152", equal_to_ldk(coord->node, key)); -+ /* -+ * we moved to different node. Invalidate coord -+ * extension, zload is necessary to init it again -+ */ -+ hint->ext_coord.valid = 0; -+ } -+ -+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND, -+ znode_get_level(coord->node)); -+ -+ return CBK_COORD_FOUND; -+ } -+ -+ coord_init_zero(coord); -+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode); -+ set_file_state(unix_file_inode_data(inode), result, -+ znode_get_level(coord->node)); -+ -+ /* FIXME: we might already have coord extension initialized */ -+ hint->ext_coord.valid = 0; -+ return result; -+} -+ -+/* plugin->u.file.write_flow = NULL -+ plugin->u.file.read_flow = NULL */ -+ -+void hint_init_zero(hint_t * hint) -+{ -+ memset(hint, 0, sizeof(*hint)); -+ init_lh(&hint->lh); -+ hint->ext_coord.lh = &hint->lh; -+} -+ -+static int find_file_state(struct inode *inode, struct unix_file_info *uf_info) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ -+ assert("vs-1628", ea_obtained(uf_info)); -+ -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ key_by_inode_and_offset_common(inode, 0, &key); -+ init_lh(&lh); -+ result = find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, inode); -+ set_file_state(uf_info, result, znode_get_level(coord.node)); -+ done_lh(&lh); -+ if (!cbk_errored(result)) -+ result = 0; -+ } else -+ result = 0; -+ assert("vs-1074", -+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN)); -+ reiser4_txn_restart_current(); -+ return result; -+} -+ -+/** -+ * Estimate and reserve space needed to truncate page -+ * which gets partially truncated: one block for page -+ * itself, stat-data update (estimate_one_insert_into_item) -+ * and one item insertion (estimate_one_insert_into_item) -+ * which may happen if page corresponds to hole extent and -+ * unallocated one will have to be created -+ */ -+static int reserve_partial_page(reiser4_tree * tree) -+{ -+ grab_space_enable(); -+ return reiser4_grab_reserved(reiser4_get_current_sb(), -+ 1 + -+ 2 * estimate_one_insert_into_item(tree), -+ BA_CAN_COMMIT); -+} -+ -+/* estimate and reserve space needed to cut one item and update one stat data */ -+static int reserve_cut_iteration(reiser4_tree * tree) -+{ -+ __u64 estimate = estimate_one_item_removal(tree) -+ + estimate_one_insert_into_item(tree); -+ -+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
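/* the clean-lock-stack assertion above is presumably needed because reiser4_grab_reserved() may block on the superblock's delete mutex; compare the "up(sbinfo->delete_mutex)" remarks next to reiser4_release_reserved() in cut_file_items() below */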
-+ -+ grab_space_enable(); -+ /* We need to double our estimate now that we can delete more than one -+ node. */ -+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2, -+ BA_CAN_COMMIT); -+} -+ -+int reiser4_update_file_size(struct inode *inode, loff_t new_size, -+ int update_sd) -+{ -+ int result = 0; -+ -+ INODE_SET_SIZE(inode, new_size); -+ if (update_sd) { -+ inode->i_ctime = inode->i_mtime = CURRENT_TIME; -+ result = reiser4_update_sd(inode); -+ } -+ return result; -+} -+ -+/** -+ * Cut file items one by one starting from the last one until -+ * new file size (inode->i_size) is reached. Reserve space -+ * and update file stat data on every single cut from the tree -+ */ -+int cut_file_items(struct inode *inode, loff_t new_size, -+ int update_sd, loff_t cur_size, -+ int (*update_actor) (struct inode *, loff_t, int)) -+{ -+ reiser4_key from_key, to_key; -+ reiser4_key smallest_removed; -+ file_plugin *fplug = inode_file_plugin(inode); -+ int result; -+ int progress = 0; -+ -+ assert("vs-1248", -+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) || -+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ -+ fplug->key_by_inode(inode, new_size, &from_key); -+ to_key = from_key; -+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ ); -+ /* this loop normally runs just once */ -+ while (1) { -+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode)); -+ if (result) -+ break; -+ -+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key, -+ &smallest_removed, inode, 1, -+ &progress); -+ if (result == -E_REPEAT) { -+ /** -+ * -E_REPEAT is a signal to interrupt a long -+ * file truncation process -+ */ -+ if (progress) { -+ result = update_actor(inode, -+ get_key_offset(&smallest_removed), -+ update_sd); -+ if (result) -+ break; -+ } -+ /* the below does up(sbinfo->delete_mutex). -+ * Do not get fooled */ -+ reiser4_release_reserved(inode->i_sb); -+ /** -+ * reiser4_cut_tree_object() was interrupted probably -+ * because current atom requires commit, we have to -+ * release transaction handle to allow atom commit. -+ */ -+ reiser4_txn_restart_current(); -+ continue; -+ } -+ if (result -+ && !(result == CBK_COORD_NOTFOUND && new_size == 0 -+ && inode->i_size == 0)) -+ break; -+ -+ set_key_offset(&smallest_removed, new_size); -+ /* Final sd update after the file gets its correct size */ -+ result = update_actor(inode, get_key_offset(&smallest_removed), -+ update_sd); -+ break; -+ } -+ -+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */ -+ reiser4_release_reserved(inode->i_sb); -+ -+ return result; -+} -+ -+int find_or_create_extent(struct page *page); -+ -+/* part of truncate_file_body: it is called when truncate is used to make file -+ shorter */ -+static int shorten_file(struct inode *inode, loff_t new_size) -+{ -+ int result; -+ struct page *page; -+ int padd_from; -+ unsigned long index; -+ struct unix_file_info *uf_info; -+ -+ /* -+ * all items of ordinary reiser4 file are grouped together. That is why -+ * we can use reiser4_cut_tree.
Plan B files (for instance) can not be -+ * truncated that simply -+ */ -+ result = cut_file_items(inode, new_size, 1 /*update_sd */ , -+ get_key_offset(reiser4_max_key()), -+ reiser4_update_file_size); -+ if (result) -+ return result; -+ -+ uf_info = unix_file_inode_data(inode); -+ assert("vs-1105", new_size == inode->i_size); -+ if (new_size == 0) { -+ uf_info->container = UF_CONTAINER_EMPTY; -+ return 0; -+ } -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ if (uf_info->container == UF_CONTAINER_TAILS) -+ /* -+ * No need to worry about zeroing last page after new file -+ * end -+ */ -+ return 0; -+ -+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1); -+ if (!padd_from) -+ /* file is truncated to page boundary */ -+ return 0; -+ -+ result = reserve_partial_page(reiser4_tree_by_inode(inode)); -+ if (result) { -+ reiser4_release_reserved(inode->i_sb); -+ return result; -+ } -+ -+ /* last page is partially truncated - zero its content */ -+ index = (inode->i_size >> PAGE_CACHE_SHIFT); -+ page = read_mapping_page(inode->i_mapping, index, NULL); -+ if (IS_ERR(page)) { -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ if (likely(PTR_ERR(page) == -EINVAL)) { -+ /* looks like file is built of tail items */ -+ return 0; -+ } -+ return PTR_ERR(page); -+ } -+ wait_on_page_locked(page); -+ if (!PageUptodate(page)) { -+ page_cache_release(page); -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * if page corresponds to hole extent unit - unallocated one will be -+ * created here. This is not necessary -+ */ -+ result = find_or_create_extent(page); -+ -+ /* -+ * FIXME: cut_file_items has already updated inode. Probably it would -+ * be better to update it here when file is really truncated -+ */ -+ if (result) { -+ page_cache_release(page); -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ return result; -+ } -+ -+ lock_page(page); -+ assert("vs-1066", PageLocked(page)); -+ zero_user_segment(page, padd_from, PAGE_CACHE_SIZE); -+ unlock_page(page); -+ page_cache_release(page); -+ /* the below does up(sbinfo->delete_mutex). Do not get confused */ -+ reiser4_release_reserved(inode->i_sb); -+ return 0; -+} -+ -+/** -+ * should_have_notail -+ * @uf_info: -+ * @new_size: -+ * -+ * Calls formatting plugin to see whether file of size @new_size has to be -+ * stored in unformatted nodes or in tail items. 0 is returned for the latter case. -+ */ -+static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size) -+{ -+ if (!uf_info->tplug) -+ return 1; -+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info), -+ new_size); -+ -+} -+ -+/** -+ * truncate_file_body - change length of file -+ * @inode: inode of file -+ * @new_size: new file length -+ * -+ * Adjusts items file @inode is built of to match @new_size. It may either cut -+ * items or add them to represent a hole at the end of file. The caller has to -+ * obtain exclusive access to the file.
-+ */ -+static int truncate_file_body(struct inode *inode, struct iattr *attr) -+{ -+ int result; -+ loff_t new_size = attr->ia_size; -+ -+ if (inode->i_size < new_size) { -+ /* expanding truncate */ -+ struct unix_file_info *uf_info = unix_file_inode_data(inode); -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ -+ if (should_have_notail(uf_info, new_size)) { -+ /* -+ * file of size @new_size has to be built of -+ * extents. If it is built of tails - convert to -+ * extents -+ */ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if file is being converted by another process -+ * - wait until it completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ return result; -+ } -+ } -+ result = reiser4_write_extent(NULL, inode, NULL, -+ 0, &new_size); -+ if (result) -+ return result; -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ } else { -+ if (uf_info->container == UF_CONTAINER_EXTENTS) { -+ result = reiser4_write_extent(NULL, inode, NULL, -+ 0, &new_size); -+ if (result) -+ return result; -+ } else { -+ result = reiser4_write_tail(NULL, inode, NULL, -+ 0, &new_size); -+ if (result) -+ return result; -+ uf_info->container = UF_CONTAINER_TAILS; -+ } -+ } -+ BUG_ON(result > 0); -+ result = reiser4_update_file_size(inode, new_size, 1); -+ BUG_ON(result != 0); -+ } else -+ result = shorten_file(inode, new_size); -+ return result; -+} -+ -+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */ -+ -+/** -+ * load_file_hint - copy hint from struct file to local variable -+ * @file: file to get hint from -+ * @hint: structure to fill -+ * -+ * Reiser4 specific portion of struct file may contain information (hint) -+ * stored on exiting from previous read or write. That information includes -+ * seal of znode and coord within that znode where previous read or write -+ * stopped. This function copies that information to @hint if it was stored or -+ * initializes @hint by 0s otherwise. -+ */ -+int load_file_hint(struct file *file, hint_t *hint) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ if (file) { -+ fsdata = reiser4_get_file_fsdata(file); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ spin_lock_inode(file->f_dentry->d_inode); -+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) { -+ *hint = fsdata->reg.hint; -+ init_lh(&hint->lh); -+ hint->ext_coord.lh = &hint->lh; -+ spin_unlock_inode(file->f_dentry->d_inode); -+ /* -+ * force re-validation of the coord on the first -+ * iteration of the read/write loop. -+ */ -+ hint->ext_coord.valid = 0; -+ assert("nikita-19892", coords_equal(&hint->seal.coord1, -+ &hint->ext_coord. -+ coord)); -+ return 0; -+ } -+ memset(&fsdata->reg.hint, 0, sizeof(hint_t)); -+ spin_unlock_inode(file->f_dentry->d_inode); -+ } -+ hint_init_zero(hint); -+ return 0; -+} -+ -+/** -+ * save_file_hint - copy hint to reiser4 private struct file's part -+ * @file: file to save hint in -+ * @hint: hint to save -+ * -+ * This copies @hint to reiser4 private part of struct file. It can help -+ * speed up future accesses to the file.
-+ */ -+void save_file_hint(struct file *file, const hint_t *hint) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ assert("edward-1337", hint != NULL); -+ -+ if (!file || !reiser4_seal_is_set(&hint->seal)) -+ return; -+ fsdata = reiser4_get_file_fsdata(file); -+ assert("vs-965", !IS_ERR(fsdata)); -+ assert("nikita-19891", -+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord)); -+ assert("vs-30", hint->lh.owner == NULL); -+ spin_lock_inode(file->f_dentry->d_inode); -+ fsdata->reg.hint = *hint; -+ spin_unlock_inode(file->f_dentry->d_inode); -+ return; -+} -+ -+void reiser4_unset_hint(hint_t * hint) -+{ -+ assert("vs-1315", hint); -+ hint->ext_coord.valid = 0; -+ reiser4_seal_done(&hint->seal); -+ done_lh(&hint->lh); -+} -+ -+/* coord must be set properly. So, that reiser4_set_hint -+ has nothing to do */ -+void reiser4_set_hint(hint_t * hint, const reiser4_key * key, -+ znode_lock_mode mode) -+{ -+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord); -+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key))); -+ -+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key); -+ hint->offset = get_key_offset(key); -+ hint->mode = mode; -+ done_lh(&hint->lh); -+} -+ -+int hint_is_set(const hint_t * hint) -+{ -+ return reiser4_seal_is_set(&hint->seal); -+} -+ -+#if REISER4_DEBUG -+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) -+{ -+ return (get_key_locality(k1) == get_key_locality(k2) && -+ get_key_type(k1) == get_key_type(k2) && -+ get_key_band(k1) == get_key_band(k2) && -+ get_key_ordering(k1) == get_key_ordering(k2) && -+ get_key_objectid(k1) == get_key_objectid(k2)); -+} -+#endif -+ -+static int -+hint_validate(hint_t * hint, const reiser4_key * key, int check_key, -+ znode_lock_mode lock_mode) -+{ -+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) -+ /* hint either not set or set by different operation */ -+ return RETERR(-E_REPEAT); -+ -+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key)); -+ -+ if (check_key && get_key_offset(key) != hint->offset) -+ /* hint is set for different key */ -+ return RETERR(-E_REPEAT); -+ -+ assert("vs-31", hint->ext_coord.lh == &hint->lh); -+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key, -+ hint->ext_coord.lh, lock_mode, -+ ZNODE_LOCK_LOPRI); -+} -+ -+/** -+ * Look for place at twig level for extent corresponding to page, -+ * call extent's writepage method to create unallocated extent if -+ * it does not exist yet, initialize jnode, capture page -+ */ -+int find_or_create_extent(struct page *page) -+{ -+ int result; -+ struct inode *inode; -+ int plugged_hole; -+ -+ jnode *node; -+ -+ assert("vs-1065", page->mapping && page->mapping->host); -+ inode = page->mapping->host; -+ -+ lock_page(page); -+ node = jnode_of_page(page); -+ if (IS_ERR(node)) { -+ unlock_page(page); -+ return PTR_ERR(node); -+ } -+ JF_SET(node, JNODE_WRITE_PREPARED); -+ unlock_page(page); -+ -+ if (node->blocknr == 0) { -+ plugged_hole = 0; -+ result = reiser4_update_extent(inode, node, page_offset(page), -+ &plugged_hole); -+ if (result) { -+ JF_CLR(node, JNODE_WRITE_PREPARED); -+ jput(node); -+ warning("edward-1549", -+ "reiser4_update_extent failed: %d", result); -+ return result; -+ } -+ if (plugged_hole) -+ reiser4_update_sd(inode); -+ } else { -+ spin_lock_jnode(node); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ -+ BUG_ON(node->atom == NULL); -+ JF_CLR(node, JNODE_WRITE_PREPARED); 
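/* drop the reference acquired by jnode_of_page() above; the jnode stays pinned by the atom it was captured into (see the BUG_ON above) */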
-+ jput(node); -+ -+ if (get_current_context()->entd) { -+ entd_context *ent = get_entd_context(node->tree->super); -+ -+ if (ent->cur_request->page == page) -+ ent->cur_request->node = node; -+ } -+ return 0; -+} -+ -+/** -+ * has_anonymous_pages - check whether inode has pages dirtied via mmap -+ * @inode: inode to check -+ * -+ * Returns true if inode's mapping has dirty pages which do not belong to any -+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page -+ * tree or were eflushed and can be found via jnodes tagged -+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes. -+ */ -+static int has_anonymous_pages(struct inode *inode) -+{ -+ int result; -+ -+ spin_lock_irq(&inode->i_mapping->tree_lock); -+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED); -+ spin_unlock_irq(&inode->i_mapping->tree_lock); -+ return result; -+} -+ -+/** -+ * capture_page_and_create_extent - -+ * @page: page to be captured -+ * -+ * Grabs space for extent creation and stat data update and calls function to -+ * do actual work. -+ */ -+static int capture_page_and_create_extent(struct page *page) -+{ -+ int result; -+ struct inode *inode; -+ -+ assert("vs-1084", page->mapping && page->mapping->host); -+ inode = page->mapping->host; -+ assert("vs-1139", -+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS); -+ /* page belongs to file */ -+ assert("vs-1393", -+ inode->i_size > page_offset(page)); -+ -+ /* page capture may require extent creation (if it does not exist yet) -+ and stat data's update (number of blocks changes on extent -+ creation) */ -+ grab_space_enable(); -+ result = reiser4_grab_space(2 * estimate_one_insert_into_item -+ (reiser4_tree_by_inode(inode)), -+ BA_CAN_COMMIT); -+ if (likely(!result)) -+ result = find_or_create_extent(page); -+ -+ if (result != 0) -+ SetPageError(page); -+ return result; -+} -+ -+/* plugin->write_end() */ -+int write_end_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ unlock_page(page); -+ return capture_page_and_create_extent(page); -+} -+ -+/* -+ * Support for "anonymous" pages and jnodes. -+ * -+ * When file is write-accessed through mmap pages can be dirtied from the user -+ * level. In this case kernel is not notified until one of following happens: -+ * -+ * (1) msync() -+ * -+ * (2) truncate() (either explicit or through unlink) -+ * -+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before -+ * starting write-back. -+ * -+ * As a result of (3) ->writepage may be called on a dirty page without -+ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads -+ * (iozone) generate huge number of anonymous pages. -+ * -+ * reiser4_sync_sb() method tries to insert anonymous pages into -+ * tree. This is done by capture_anonymous_*() functions below. -+ */ -+ -+/** -+ * capture_anonymous_page - involve page into transaction -+ * @pg: page to deal with -+ * -+ * Takes care that @page has corresponding metadata in the tree, creates jnode -+ * for @page and captures it. On success 1 is returned. -+ */ -+static int capture_anonymous_page(struct page *page) -+{ -+ int result; -+ -+ if (PageWriteback(page)) -+ /* FIXME: do nothing? 
*/ -+ return 0; -+ -+ result = capture_page_and_create_extent(page); -+ if (result == 0) { -+ result = 1; -+ } else -+ warning("nikita-3329", -+ "Cannot capture anon page: %i", result); -+ -+ return result; -+} -+ -+/** -+ * capture_anonymous_pages - find and capture pages dirtied via mmap -+ * @mapping: address space where to look for pages -+ * @index: start index -+ * @to_capture: maximum number of pages to capture -+ * -+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page, -+ * captures (involves into atom) them, returns number of captured pages, -+ * updates @index to next page after the last captured one. -+ */ -+static int -+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index, -+ unsigned int to_capture) -+{ -+ int result; -+ struct pagevec pvec; -+ unsigned int i, count; -+ int nr; -+ -+ pagevec_init(&pvec, 0); -+ count = min(pagevec_space(&pvec), to_capture); -+ nr = 0; -+ -+ /* find pages tagged MOVED */ -+ spin_lock_irq(&mapping->tree_lock); -+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree, -+ (void **)pvec.pages, *index, count, -+ PAGECACHE_TAG_REISER4_MOVED); -+ if (pagevec_count(&pvec) == 0) { -+ /* -+ * there are no pages tagged MOVED in mapping->page_tree -+ * starting from *index -+ */ -+ spin_unlock_irq(&mapping->tree_lock); -+ *index = (pgoff_t)-1; -+ return 0; -+ } -+ -+ /* clear MOVED tag for all found pages */ -+ for (i = 0; i < pagevec_count(&pvec); i++) { -+ page_cache_get(pvec.pages[i]); -+ radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ } -+ spin_unlock_irq(&mapping->tree_lock); -+ -+ -+ *index = pvec.pages[i - 1]->index + 1; -+ -+ for (i = 0; i < pagevec_count(&pvec); i++) { -+ result = capture_anonymous_page(pvec.pages[i]); -+ if (result == 1) -+ nr++; -+ else { -+ if (result < 0) { -+ warning("vs-1454", -+ "failed to capture page: " -+ "result=%d, captured=%d)\n", -+ result, i); -+ -+ /* -+ * set MOVED tag on all pages which were left -+ * uncaptured -+ */ -+ spin_lock_irq(&mapping->tree_lock); -+ for (; i < pagevec_count(&pvec); i ++) { -+ radix_tree_tag_set(&mapping->page_tree, -+ pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ } -+ spin_unlock_irq(&mapping->tree_lock); -+ -+ pagevec_release(&pvec); -+ return result; -+ } else { -+ /* -+ * result == 0. capture_anonymous_page returns -+ * 0 for Writeback-ed page. Set MOVED tag on -+ * that page -+ */ -+ spin_lock_irq(&mapping->tree_lock); -+ radix_tree_tag_set(&mapping->page_tree, -+ pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ spin_unlock_irq(&mapping->tree_lock); -+ if (i == 0) -+ *index = pvec.pages[0]->index; -+ else -+ *index = pvec.pages[i - 1]->index + 1; -+ } -+ } -+ } -+ pagevec_release(&pvec); -+ return nr; -+} -+ -+/** -+ * capture_anonymous_jnodes - find and capture anonymous jnodes -+ * @mapping: address space where to look for jnodes -+ * @from: start index -+ * @to: end index -+ * @to_capture: maximum number of jnodes to capture -+ * -+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in -+ * the range of indexes @from-@to and captures them, returns number of captured -+ * jnodes, updates @from to next jnode after the last captured one. -+ */ -+static int -+capture_anonymous_jnodes(struct address_space *mapping, -+ pgoff_t *from, pgoff_t to, int to_capture) -+{ -+ *from = to; -+ return 0; -+} -+ -+/* -+ * Commit atom of the jnode of a page.
-+ */ -+static int sync_page(struct page *page) -+{ -+ int result; -+ do { -+ jnode *node; -+ txn_atom *atom; -+ -+ lock_page(page); -+ node = jprivate(page); -+ if (node != NULL) { -+ spin_lock_jnode(node); -+ atom = jnode_get_atom(node); -+ spin_unlock_jnode(node); -+ } else -+ atom = NULL; -+ unlock_page(page); -+ result = reiser4_sync_atom(atom); -+ } while (result == -E_REPEAT); -+ /* -+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to -+ * handle the case where more pages get added to the atom while we are -+ * syncing it? -+ */ -+ assert("nikita-3485", ergo(result == 0, -+ get_current_context()->trans->atom == NULL)); -+ return result; -+} -+ -+/* -+ * Commit atoms of pages on @pages list. -+ * call sync_page for each page from mapping's page tree -+ */ -+static int sync_page_list(struct inode *inode) -+{ -+ int result; -+ struct address_space *mapping; -+ unsigned long from; /* start index for radix_tree_gang_lookup */ -+ unsigned int found; /* return value for radix_tree_gang_lookup */ -+ -+ mapping = inode->i_mapping; -+ from = 0; -+ result = 0; -+ spin_lock_irq(&mapping->tree_lock); -+ while (result == 0) { -+ struct page *page; -+ -+ found = -+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page, -+ from, 1); -+ assert("edward-1550", found < 2); -+ if (found == 0) -+ break; -+ /** -+ * the page may not leave the radix tree because it is -+ * protected from truncation by inode->i_mutex, which is -+ * locked by sys_fsync -+ */ -+ page_cache_get(page); -+ spin_unlock_irq(&mapping->tree_lock); -+ -+ from = page->index + 1; -+ -+ result = sync_page(page); -+ -+ page_cache_release(page); -+ spin_lock_irq(&mapping->tree_lock); -+ } -+ -+ spin_unlock_irq(&mapping->tree_lock); -+ return result; -+} -+ -+static int commit_file_atoms(struct inode *inode) -+{ -+ int result; -+ struct unix_file_info *uf_info; -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access(uf_info); -+ /* -+ * find what items the file is made from -+ */ -+ result = find_file_state(inode, uf_info); -+ drop_exclusive_access(uf_info); -+ if (result != 0) -+ return result; -+ -+ /* -+ * file state cannot change because we are under ->i_mutex -+ */ -+ switch (uf_info->container) { -+ case UF_CONTAINER_EXTENTS: -+ /* find_file_state might open or join an atom */ -+ reiser4_txn_restart_current(); -+ result = -+ /* -+ * when we are called by -+ * filemap_fdatawrite-> -+ * do_writepages()-> -+ * reiser4_writepages() -+ * -+ * inode->i_mapping->dirty_pages are spliced into -+ * ->io_pages, leaving ->dirty_pages dirty. -+ * -+ * When we are called from -+ * reiser4_fsync()->sync_unix_file(), we have to -+ * commit atoms of all pages on the ->dirty_list. -+ * -+ * So for simplicity we just commit ->io_pages and -+ * ->dirty_pages. -+ */ -+ sync_page_list(inode); -+ break; -+ case UF_CONTAINER_TAILS: -+ /* -+ * NOTE-NIKITA probably we can be smarter for tails. For now -+ * just commit all existing atoms. -+ */ -+ result = txnmgr_force_commit_all(inode->i_sb, 0); -+ break; -+ case UF_CONTAINER_EMPTY: -+ result = 0; -+ break; -+ case UF_CONTAINER_UNKNOWN: -+ default: -+ result = -EIO; -+ break; -+ } -+ -+ /* -+ * commit current transaction: there can be captured nodes from -+ * find_file_state() and finish_conversion(). -+ */ -+ reiser4_txn_restart_current(); -+ return result; -+} -+ -+/** -+ * writepages_unix_file - writepages of struct address_space_operations -+ * @mapping: -+ * @wbc: -+ * -+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are -+ * pages which are dirtied via mmap.
Anonymous jnodes are ones which were -+ * created by reiser4_writepage. -+ */ -+int writepages_unix_file(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ int result; -+ struct unix_file_info *uf_info; -+ pgoff_t pindex, jindex, nr_pages; -+ long to_capture; -+ struct inode *inode; -+ -+ inode = mapping->host; -+ if (!has_anonymous_pages(inode)) { -+ result = 0; -+ goto end; -+ } -+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT; -+ result = 0; -+ nr_pages = size_in_pages(i_size_read(inode)); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ do { -+ reiser4_context *ctx; -+ -+ if (wbc->sync_mode != WB_SYNC_ALL) -+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST); -+ else -+ to_capture = CAPTURE_APAGE_BURST; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ result = PTR_ERR(ctx); -+ break; -+ } -+ /* avoid recursive calls to ->sync_inodes */ -+ ctx->nobalance = 1; -+ assert("zam-760", lock_stack_isclean(get_current_lock_stack())); -+ assert("edward-1551", LOCK_CNT_NIL(inode_sem_w)); -+ assert("edward-1552", LOCK_CNT_NIL(inode_sem_r)); -+ -+ reiser4_txn_restart_current(); -+ -+ /* we have to get nonexclusive access to the file */ -+ if (get_current_context()->entd) { -+ /* -+ * use the nonblocking version of nonexclusive_access -+ * to avoid a deadlock which might look like the -+ * following: process P1 holds NEA on file F1 and -+ * called entd to reclaim some memory. Entd works for -+ * P1 and is going to capture pages of file F2. To do -+ * that entd has to get NEA to F2. F2 is held by -+ * process P2 which also called entd. But entd is -+ * serving P1 at the moment and P2 has to wait. Process -+ * P3 is trying to get EA to file F2. The existence of -+ * a pending EA request to file F2 makes it impossible -+ * for entd to get NEA to file F2. None of these -+ * processes can continue. Using the nonblocking -+ * version of getting NEA is supposed to -+ * avoid this deadlock.
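/*
 * The lock-ordering cycle described above is broken by refusing to sleep:
 * a flusher that is already busy takes the lock with a trylock and backs
 * off with -EBUSY instead of completing the wait cycle. A stand-alone
 * user-space sketch of the same idea, with pthread_rwlock_t standing in
 * for reiser4's exclusive/nonexclusive file access (illustrative names,
 * not reiser4 API):
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t file_access = PTHREAD_RWLOCK_INITIALIZER;

static int flush_file_nonblocking(void)
{
	/* never block here: the lock holder may itself be waiting on us */
	if (pthread_rwlock_tryrdlock(&file_access) != 0)
		return -EBUSY;		/* caller retries later */
	/* ... capture and flush pages under nonexclusive access ... */
	pthread_rwlock_unlock(&file_access);
	return 0;
}

int main(void)
{
	pthread_rwlock_wrlock(&file_access);	/* simulate a pending writer */
	printf("flush -> %d (expected %d)\n", flush_file_nonblocking(), -EBUSY);
	pthread_rwlock_unlock(&file_access);
	return 0;
}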
-+ */ -+ if (try_to_get_nonexclusive_access(uf_info) == 0) { -+ result = RETERR(-EBUSY); -+ reiser4_exit_context(ctx); -+ break; -+ } -+ } else -+ get_nonexclusive_access(uf_info); -+ -+ while (to_capture > 0) { -+ pgoff_t start; -+ -+ assert("vs-1727", jindex <= pindex); -+ if (pindex == jindex) { -+ start = pindex; -+ result = -+ capture_anonymous_pages(inode->i_mapping, -+ &pindex, -+ to_capture); -+ if (result <= 0) -+ break; -+ to_capture -= result; -+ wbc->nr_to_write -= result; -+ if (start + result == pindex) { -+ jindex = pindex; -+ continue; -+ } -+ if (to_capture <= 0) -+ break; -+ } -+ /* deal with anonymous jnodes between jindex and pindex */ -+ result = -+ capture_anonymous_jnodes(inode->i_mapping, &jindex, -+ pindex, to_capture); -+ if (result < 0) -+ break; -+ to_capture -= result; -+ get_current_context()->nr_captured += result; -+ -+ if (jindex == (pgoff_t) - 1) { -+ assert("vs-1728", pindex == (pgoff_t) - 1); -+ break; -+ } -+ } -+ if (to_capture <= 0) -+ /* there may be left more pages */ -+ __mark_inode_dirty(inode, I_DIRTY_PAGES); -+ -+ drop_nonexclusive_access(uf_info); -+ if (result < 0) { -+ /* error happened */ -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ if (wbc->sync_mode != WB_SYNC_ALL) { -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ result = commit_file_atoms(inode); -+ reiser4_exit_context(ctx); -+ if (pindex >= nr_pages && jindex == pindex) -+ break; -+ } while (1); -+ -+ end: -+ if (is_in_reiser4_context()) { -+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { -+ /* -+ * there are already pages to flush, flush them out, do -+ * not delay until end of reiser4_sync_inodes -+ */ -+ reiser4_writeout(inode->i_sb, wbc); -+ get_current_context()->nr_captured = 0; -+ } -+ } -+ return result; -+} -+ -+/** -+ * readpage_unix_file_nolock - readpage of struct address_space_operations -+ * @file: -+ * @page: -+ * -+ * Compose a key and search for item containing information about @page -+ * data. If item is found - its readpage method is called. -+ */ -+int readpage_unix_file(struct file *file, struct page *page) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ reiser4_key key; -+ item_plugin *iplug; -+ hint_t *hint; -+ lock_handle *lh; -+ coord_t *coord; -+ -+ assert("vs-1062", PageLocked(page)); -+ assert("vs-976", !PageUptodate(page)); -+ assert("vs-1061", page->mapping && page->mapping->host); -+ -+ if (page->mapping->host->i_size <= page_offset(page)) { -+ /* page is out of file */ -+ zero_user(page, 0, PAGE_CACHE_SIZE); -+ SetPageUptodate(page); -+ unlock_page(page); -+ return 0; -+ } -+ -+ inode = page->mapping->host; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ unlock_page(page); -+ return PTR_ERR(ctx); -+ } -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOMEM); -+ } -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ lh = &hint->lh; -+ -+ /* get key of first byte of the page */ -+ key_by_inode_and_offset_common(inode, page_offset(page), &key); -+ -+ /* look for file metadata corresponding to first byte of page */ -+ page_cache_get(page); -+ unlock_page(page); -+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode); -+ lock_page(page); -+ page_cache_release(page); -+ -+ if (page->mapping == NULL) { -+ /* -+ * readpage allows truncate to run concurrently. 
Page was -+ * truncated while it was not locked -+ */ -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return -EINVAL; -+ } -+ -+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) { -+ if (result == CBK_COORD_FOUND && -+ hint->ext_coord.coord.between != AT_UNIT) -+ /* file is truncated */ -+ result = -EINVAL; -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ /* -+ * the item corresponding to the page is found. It cannot be removed -+ * because the znode lock is held -+ */ -+ if (PageUptodate(page)) { -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ -+ coord = &hint->ext_coord.coord; -+ result = zload(coord->node); -+ if (result) { -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ validate_extended_coord(&hint->ext_coord, page_offset(page)); -+ -+ if (!coord_is_existing_unit(coord)) { -+ /* this indicates corruption */ -+ warning("vs-280", -+ "Looking for page %lu of file %llu (size %lli). " -+ "No file items found (%d). File is corrupted?\n", -+ page->index, (unsigned long long)get_inode_oid(inode), -+ inode->i_size, result); -+ zrelse(coord->node); -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * get the plugin of the found item, or use the extent plugin if there -+ * is none -+ */ -+ iplug = item_plugin_by_coord(coord); -+ if (iplug->s.file.readpage) -+ result = iplug->s.file.readpage(coord, page); -+ else -+ result = RETERR(-EINVAL); -+ -+ if (!result) { -+ set_key_offset(&key, -+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT); -+ /* FIXME should call reiser4_set_hint() */ -+ reiser4_unset_hint(hint); -+ } else { -+ unlock_page(page); -+ reiser4_unset_hint(hint); -+ } -+ assert("vs-979", -+ ergo(result == 0, (PageLocked(page) || PageUptodate(page)))); -+ assert("vs-9791", ergo(result != 0, !PageLocked(page))); -+ -+ zrelse(coord->node); -+ done_lh(lh); -+ -+ save_file_hint(file, hint); -+ kfree(hint); -+ -+ /* -+ * FIXME: explain why it is needed. HINT: page allocation in write -+ * cannot be done when atom is not NULL because reiser4_writepage -+ * cannot kick entd and has to eflush -+ */ -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+struct uf_readpages_context { -+ lock_handle lh; -+ coord_t coord; -+}; -+ -+/* A callback function for readpages_unix_file/read_cache_pages. -+ * If the file is built of tails, then return error (-ENOENT). -+ * -+ * @data -- a pointer to a uf_readpages_context object, -+ * to save the twig lock and the coord between -+ * read_cache_page iterations. -+ * @page -- page to start reading from. -+ */ -+static int uf_readpages_filler(void * data, struct page * page) -+{ -+ struct uf_readpages_context *rc = data; -+ jnode * node; -+ int ret = 0; -+ reiser4_extent *ext; -+ __u64 ext_index; -+ int cbk_done = 0; -+ struct address_space * mapping = page->mapping; -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ page_cache_get(page); -+ -+ if (rc->lh.node == 0) { -+ /* no twig lock - have to do tree search.
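/*
 * uf_readpages_filler() keeps the twig-level lock and coord from the
 * previous page and repeats the expensive tree search only when the
 * cached extent no longer covers the page index. The same caching shape,
 * reduced to a stand-alone user-space sketch over an array of extents
 * (the types and lookup() are invented for illustration):
 */
#include <stdio.h>

struct extent { unsigned long start, width; };

/* a tiny "tree": three extents covering pages 0..11 */
static const struct extent tree[] = { {0, 4}, {4, 2}, {6, 6} };

/* expensive lookup, analogous to descending to the twig level */
static const struct extent *lookup(unsigned long index)
{
	unsigned i;

	for (i = 0; i < sizeof(tree) / sizeof(tree[0]); i++)
		if (index >= tree[i].start &&
		    index < tree[i].start + tree[i].width)
			return &tree[i];
	return NULL;
}

int main(void)
{
	const struct extent *cached = NULL;	/* plays the role of rc->coord */
	unsigned long index;
	int searches = 0;

	for (index = 0; index < 12; index++) {
		if (cached == NULL ||
		    index < cached->start ||
		    index >= cached->start + cached->width) {
			cached = lookup(index);	/* re-search only on a miss */
			searches++;
		}
		/* ... read page @index from extent @cached ... */
	}
	printf("12 pages, %d tree searches\n", searches);	/* 3, not 12 */
	return 0;
}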
*/ -+ reiser4_key key; -+ repeat: -+ unlock_page(page); -+ key_by_inode_and_offset_common( -+ mapping->host, page_offset(page), &key); -+ ret = coord_by_key( -+ &get_super_private(mapping->host->i_sb)->tree, -+ &key, &rc->coord, &rc->lh, -+ ZNODE_READ_LOCK, FIND_EXACT, -+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL); -+ if (unlikely(ret)) -+ goto exit; -+ lock_page(page); -+ if (PageUptodate(page)) -+ goto unlock; -+ cbk_done = 1; -+ } -+ ret = zload(rc->coord.node); -+ if (unlikely(ret)) -+ goto unlock; -+ if (!coord_is_existing_item(&rc->coord) || -+ !item_is_extent(&rc->coord)) { -+ zrelse(rc->coord.node); -+ ret = RETERR(-EIO); -+ goto unlock; -+ } -+ ext = extent_by_coord(&rc->coord); -+ ext_index = extent_unit_index(&rc->coord); -+ if (page->index < ext_index || -+ page->index >= ext_index + extent_get_width(ext)) { -+ /* the page index doesn't belong to the extent unit -+ which the coord points to - release the lock and -+ repeat with tree search. */ -+ zrelse(rc->coord.node); -+ done_lh(&rc->lh); -+ /* we can be here after a CBK call only in case of -+ corruption of the tree or the tree lookup algorithm bug. */ -+ if (unlikely(cbk_done)) { -+ ret = RETERR(-EIO); -+ goto unlock; -+ } -+ goto repeat; -+ } -+ node = jnode_of_page(page); -+ if (unlikely(IS_ERR(node))) { -+ zrelse(rc->coord.node); -+ ret = PTR_ERR(node); -+ goto unlock; -+ } -+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page); -+ jput(node); -+ zrelse(rc->coord.node); -+ if (likely(!ret)) -+ goto exit; -+ unlock: -+ unlock_page(page); -+ exit: -+ page_cache_release(page); -+ return ret; -+} -+ -+/** -+ * readpages_unix_file - called by the readahead code, starts reading for each -+ * page of given list of pages -+ */ -+int readpages_unix_file( -+ struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ reiser4_context *ctx; -+ struct uf_readpages_context rc; -+ int ret; -+ -+ ctx = reiser4_init_context(mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ put_pages_list(pages); -+ return PTR_ERR(ctx); -+ } -+ init_lh(&rc.lh); -+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc); -+ done_lh(&rc.lh); -+ context_set_commit_async(ctx); -+ /* close the transaction to protect further page allocation from deadlocks */ -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return ret; -+} -+ -+static reiser4_block_nr unix_file_estimate_read(struct inode *inode, -+ loff_t count UNUSED_ARG) -+{ -+ /* We should reserve one block, because of updating of the stat data -+ item */ -+ assert("vs-1249", -+ inode_file_plugin(inode)->estimate.update == -+ estimate_update_common); -+ return estimate_update_common(inode); -+} -+ -+/* this is called with nonexclusive access obtained, file's container can not change */ -+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */ -+ char __user *buf, /* address of user-space buffer */ -+ size_t count, /* number of bytes to read */ -+ loff_t *off) -+{ -+ int result; -+ struct inode *inode; -+ flow_t flow; -+ int (*read_f) (struct file *, flow_t *, hint_t *); -+ coord_t *coord; -+ znode *loaded; -+ -+ inode = file->f_dentry->d_inode; -+ -+ /* build flow */ -+ assert("vs-1250", -+ inode_file_plugin(inode)->flow_by_inode == -+ flow_by_inode_unix_file); -+ result = -+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count, -+ *off, READ_OP, &flow); -+ if (unlikely(result)) -+ return result; -+ -+ /* get seal and coord sealed with it from reiser4 private data -+ of struct file. 
The coord will tell us where our last read -+ of this file finished, and the seal will help to determine -+ if that location is still valid. -+ */ -+ coord = &hint->ext_coord.coord; -+ while (flow.length && result == 0) { -+ result = -+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode); -+ if (cbk_errored(result)) -+ /* error happened */ -+ break; -+ -+ if (coord->between != AT_UNIT) { -+ /* there were no items corresponding to given offset */ -+ done_lh(hint->ext_coord.lh); -+ break; -+ } -+ -+ loaded = coord->node; -+ result = zload(loaded); -+ if (unlikely(result)) { -+ done_lh(hint->ext_coord.lh); -+ break; -+ } -+ -+ if (hint->ext_coord.valid == 0) -+ validate_extended_coord(&hint->ext_coord, -+ get_key_offset(&flow.key)); -+ -+ assert("vs-4", hint->ext_coord.valid == 1); -+ assert("vs-33", hint->ext_coord.lh == &hint->lh); -+ /* call item's read method */ -+ read_f = item_plugin_by_coord(coord)->s.file.read; -+ result = read_f(file, &flow, hint); -+ zrelse(loaded); -+ done_lh(hint->ext_coord.lh); -+ } -+ -+ return (count - flow.length) ? (count - flow.length) : result; -+} -+ -+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*); -+ -+/** -+ * read_unix_file - read of struct file_operations -+ * @file: file to read from -+ * @buf: address of user-space buffer -+ * @read_amount: number of bytes to read -+ * @off: position in file to read from -+ * -+ * This is implementation of vfs's read method of struct file_operations for -+ * unix file plugin. -+ */ -+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount, -+ loff_t *off) -+{ -+ reiser4_context *ctx; -+ ssize_t result; -+ struct inode *inode; -+ struct unix_file_info *uf_info; -+ -+ if (unlikely(read_amount == 0)) -+ return 0; -+ -+ assert("umka-072", file != NULL); -+ assert("umka-074", off != NULL); -+ inode = file->f_dentry->d_inode; -+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ uf_info = unix_file_inode_data(inode); -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ get_exclusive_access(uf_info); -+ result = find_file_state(inode, uf_info); -+ if (unlikely(result != 0)) -+ goto out; -+ } else -+ get_nonexclusive_access(uf_info); -+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount), -+ BA_CAN_COMMIT); -+ if (unlikely(result != 0)) -+ goto out; -+ if (uf_info->container == UF_CONTAINER_EXTENTS){ -+ result = do_sync_read(file, buf, read_amount, off); -+ } else if (uf_info->container == UF_CONTAINER_TAILS || -+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) || -+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ result = read_unix_file_container_tails(file, buf, read_amount, off); -+ } else { -+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY); -+ result = 0; -+ } -+out: -+ drop_access(uf_info); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static ssize_t read_unix_file_container_tails( -+ struct file *file, char __user *buf, size_t read_amount, loff_t *off) -+{ -+ int result; -+ struct inode *inode; -+ hint_t *hint; -+ struct unix_file_info *uf_info; -+ size_t count, read, left; -+ loff_t size; -+ -+ assert("umka-072", file != NULL); -+ assert("umka-074", off != NULL); -+ inode = file->f_dentry->d_inode; -+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == 
NULL) -+ return RETERR(-ENOMEM); -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ return result; -+ } -+ -+ left = read_amount; -+ count = 0; -+ uf_info = unix_file_inode_data(inode); -+ while (left > 0) { -+ reiser4_txn_restart_current(); -+ size = i_size_read(inode); -+ if (*off >= size) -+ /* position to read from is past the end of file */ -+ break; -+ if (*off + left > size) -+ left = size - *off; -+ /* fault in user page */ -+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left); -+ if (result) -+ return RETERR(-EFAULT); -+ -+ read = read_file(hint, file, buf, -+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left, -+ off); -+ if (read < 0) { -+ result = read; -+ break; -+ } -+ left -= read; -+ buf += read; -+ -+ /* update position in a file */ -+ *off += read; -+ /* total number of read bytes */ -+ count += read; -+ } -+ done_lh(&hint->lh); -+ save_file_hint(file, hint); -+ kfree(hint); -+ if (count) -+ file_accessed(file); -+ /* return number of read bytes or error code if nothing is read */ -+ return count ? count : result; -+} -+ -+/* This function takes care of @file's pages. First of all it checks whether -+ the filesystem is read-only and if so gets out. Otherwise, it throws out -+ all pages of the file if it was mapped for read, is going to be mapped -+ for write, and consists of tails. This is done in order to avoid keeping -+ multiple copies of the data (one in the page cache and a second one in -+ the tails themselves) when mapping files that consist of tails. -+ -+ Here tail2extent conversion is also performed if it is allowed and the -+ file is going to be written to or mapped for write. This function may be -+ called from write_unix_file() or mmap_unix_file(). */ -+static int check_pages_unix_file(struct file *file, struct inode *inode) -+{ -+ reiser4_invalidate_pages(inode->i_mapping, 0, -+ (inode->i_size + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT, 0); -+ return unpack(file, inode, 0 /* not forever */ ); -+} -+ -+/** -+ * mmap_unix_file - mmap of struct file_operations -+ * @file: file to mmap -+ * @vma: -+ * -+ * This is the implementation of vfs's mmap method of struct file_operations -+ * for the unix file plugin. It converts the file to extents if necessary. -+ * Sets reiser4_inode's flag - REISER4_HAS_MMAP. -+ */ -+int mmap_unix_file(struct file *file, struct vm_area_struct *vma) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ struct unix_file_info *uf_info; -+ reiser4_block_nr needed; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ -+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) { -+ /* -+ * we need the file built of extent items. If it is still built -+ * of tail items we have to convert it.
Find what items the file -+ * is built of -+ */ -+ result = find_file_state(inode, uf_info); -+ if (result != 0) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS || -+ uf_info->container == UF_CONTAINER_EXTENTS || -+ uf_info->container == UF_CONTAINER_EMPTY)); -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * invalidate all pages and convert file from tails to -+ * extents -+ */ -+ result = check_pages_unix_file(file, inode); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ } -+ -+ /* -+ * generic_file_mmap will do update_atime. Grab space for stat data -+ * update. -+ */ -+ needed = inode_file_plugin(inode)->estimate.update(inode); -+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = generic_file_mmap(file, vma); -+ if (result == 0) { -+ /* mark file as having mapping. */ -+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP); -+ } -+ -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * find_first_item -+ * @inode: -+ * -+ * Finds the file item which is responsible for the first byte of the file. -+ */ -+static int find_first_item(struct inode *inode) -+{ -+ coord_t coord; -+ lock_handle lh; -+ reiser4_key key; -+ int result; -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key); -+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, -+ inode); -+ if (result == CBK_COORD_FOUND) { -+ if (coord.between == AT_UNIT) { -+ result = zload(coord.node); -+ if (result == 0) { -+ result = item_id_by_coord(&coord); -+ zrelse(coord.node); -+ if (result != EXTENT_POINTER_ID && -+ result != FORMATTING_ID) -+ result = RETERR(-EIO); -+ } -+ } else -+ result = RETERR(-EIO); -+ } -+ done_lh(&lh); -+ return result; -+} -+ -+/** -+ * open_unix_file -+ * @inode: -+ * @file: -+ * -+ * If the filesystem is not read-only, complete an uncompleted tail -+ * conversion if there was one -+ */ -+int open_unix_file(struct inode *inode, struct file *file) -+{ -+ int result; -+ reiser4_context *ctx; -+ struct unix_file_info *uf_info; -+ -+ if (IS_RDONLY(inode)) -+ return 0; -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) -+ return 0; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * another process completed the conversion -+ */ -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ -+ /* -+ * the file was left in a semi-converted state after an unclean -+ * shutdown, or another thread is doing the conversion and dropped -+ * exclusive access while doing balance_dirty_pages. Complete the -+ * conversion -+ */ -+ result = find_first_item(inode); -+ if (result == EXTENT_POINTER_ID) -+ /* -+ * first item is an extent, therefore there was an incomplete -+ * tail2extent conversion. Complete it -+ */ -+ result = tail2extent(unix_file_inode_data(inode)); -+ else if (result == FORMATTING_ID) -+ /* -+ * first item is a formatting item, therefore there was an -+ * incomplete extent2tail conversion.
Complete it -+ */ -+ result = extent2tail(file, unix_file_inode_data(inode)); -+ else -+ result = -EIO; -+ -+ assert("vs-1712", -+ ergo(result == 0, -+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) && -+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)))); -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+#define NEITHER_OBTAINED 0 -+#define EA_OBTAINED 1 -+#define NEA_OBTAINED 2 -+ -+static void drop_access(struct unix_file_info *uf_info) -+{ -+ if (uf_info->exclusive_use) -+ drop_exclusive_access(uf_info); -+ else -+ drop_nonexclusive_access(uf_info); -+} -+ -+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \ -+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) -+ -+/** -+ * write_unix_file - private ->write() method of unix_file plugin. -+ * -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * @cont: unused argument, as we don't perform plugin conversion when being -+ * managed by unix_file plugin. -+ */ -+ssize_t write_unix_file(struct file *file, const char __user *buf, -+ size_t count, loff_t *pos, struct psched_context *cont) -+{ -+ int result; -+ reiser4_context *ctx; -+ struct inode *inode; -+ struct unix_file_info *uf_info; -+ ssize_t written; -+ int try_free_space; -+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY; -+ size_t left; -+ ssize_t (*write_op)(struct file *, struct inode *, -+ const char __user *, size_t, -+ loff_t *pos); -+ int ea; -+ loff_t new_size; -+ -+ ctx = get_current_context(); -+ inode = file->f_dentry->d_inode; -+ -+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))); -+ -+ /* check amount of bytes to write and writing position */ -+ result = generic_write_checks(file, pos, &count, 0); -+ if (result) { -+ context_set_commit_async(ctx); -+ return result; -+ } -+ -+ result = file_remove_suid(file); -+ if (result) { -+ context_set_commit_async(ctx); -+ return result; -+ } -+ /* remove_suid might create a transaction */ -+ reiser4_txn_restart(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ current->backing_dev_info = inode->i_mapping->backing_dev_info; -+ written = 0; -+ try_free_space = 0; -+ left = count; -+ ea = NEITHER_OBTAINED; -+ -+ new_size = i_size_read(inode); -+ if (*pos + count > new_size) -+ new_size = *pos + count; -+ -+ while (left) { -+ if (left < to_write) -+ to_write = left; -+ -+ if (uf_info->container == UF_CONTAINER_EMPTY) { -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ if (uf_info->container != UF_CONTAINER_EMPTY) { -+ /* file is made not empty by another process */ -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ continue; -+ } -+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ /* -+ * get exclusive access directly just to not have to -+ * re-obtain it if file will appear empty -+ */ -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ result = find_file_state(inode, uf_info); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ break; -+ } -+ } else { -+ get_nonexclusive_access(uf_info); -+ ea = NEA_OBTAINED; -+ } -+ -+ /* either EA or NEA is obtained. 
Choose item write method */ -+ if (uf_info->container == UF_CONTAINER_EXTENTS) { -+ /* file is built of extent items */ -+ write_op = reiser4_write_extent; -+ } else if (uf_info->container == UF_CONTAINER_EMPTY) { -+ /* file is empty */ -+ if (should_have_notail(uf_info, new_size)) -+ write_op = reiser4_write_extent; -+ else -+ write_op = reiser4_write_tail; -+ } else { -+ /* file is built of tail items */ -+ if (should_have_notail(uf_info, new_size)) { -+ if (ea == NEA_OBTAINED) { -+ drop_nonexclusive_access(uf_info); -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if the file is being converted by another -+ * process - wait until it completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ context_set_commit_async(ctx); -+ break; -+ } -+ } -+ } -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ continue; -+ } -+ write_op = reiser4_write_tail; -+ } -+ -+ written = write_op(file, inode, buf, to_write, pos); -+ if (written == -ENOSPC && try_free_space) { -+ drop_access(uf_info); -+ txnmgr_force_commit_all(inode->i_sb, 0); -+ try_free_space = 0; -+ continue; -+ } -+ if (written < 0) { -+ drop_access(uf_info); -+ result = written; -+ break; -+ } -+ /* something is written. */ -+ if (uf_info->container == UF_CONTAINER_EMPTY) { -+ assert("edward-1553", ea == EA_OBTAINED); -+ uf_info->container = -+ (write_op == reiser4_write_extent) ? -+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS; -+ } else { -+ assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS, -+ write_op == reiser4_write_extent)); -+ assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS, -+ write_op == reiser4_write_tail)); -+ } -+ if (*pos + written > inode->i_size) -+ INODE_SET_FIELD(inode, i_size, *pos + written); -+ file_update_time(file); -+ result = reiser4_update_sd(inode); -+ if (result) { -+ current->backing_dev_info = NULL; -+ drop_access(uf_info); -+ context_set_commit_async(ctx); -+ break; -+ } -+ drop_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ reiser4_txn_restart(ctx); -+ current->journal_info = NULL; -+ /* -+ * tell the VM how many pages were dirtied. Maybe the number of -+ * pages which were already dirty should not be counted -+ */ -+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, -+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE); -+ current->journal_info = ctx; -+ -+ left -= written; -+ buf += written; -+ *pos += written; -+ } -+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { -+ reiser4_txn_restart_current(); -+ grab_space_enable(); -+ result = reiser4_sync_file_common(file, file->f_dentry, -+ 0 /* data and stat data */); -+ if (result) -+ warning("reiser4-7", "failed to sync file %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ -+ current->backing_dev_info = NULL; -+ -+ /* -+ * return the number of written bytes, or an error code if nothing -+ * was written. Note that this does not work correctly in the case -+ * when sync_unix_file returns an error -+ */ -+ return (count - left) ?
(count - left) : result; -+} -+ -+/** -+ * release_unix_file - release of struct file_operations -+ * @inode: inode of released file -+ * @file: file to release -+ * -+ * Implementation of the release method of struct file_operations for the -+ * unix file plugin. If the last reference to the inode is released - convert -+ * all extent items into tail items if necessary. Frees reiser4 specific file -+ * data. -+ */ -+int release_unix_file(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx; -+ struct unix_file_info *uf_info; -+ int result; -+ int in_reiser4; -+ -+ in_reiser4 = is_in_reiser4_context(); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ result = 0; -+ if (in_reiser4 == 0) { -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ if (atomic_read(&file->f_dentry->d_count) == 1 && -+ uf_info->container == UF_CONTAINER_EXTENTS && -+ !should_have_notail(uf_info, inode->i_size) && -+ !rofs_inode(inode)) { -+ result = extent2tail(file, uf_info); -+ if (result != 0) { -+ context_set_commit_async(ctx); -+ warning("nikita-3233", -+ "Failed (%d) to convert in %s (%llu)", -+ result, __FUNCTION__, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ } -+ } -+ drop_exclusive_access(uf_info); -+ } else { -+ /* -+ we are within a reiser4 context already. How is that -+ possible? Simple: -+ -+ (gdb) bt -+ #0 get_exclusive_access () -+ #2 0xc01e56d3 in release_unix_file () -+ #3 0xc01c3643 in reiser4_release () -+ #4 0xc014cae0 in __fput () -+ #5 0xc013ffc3 in remove_vm_struct () -+ #6 0xc0141786 in exit_mmap () -+ #7 0xc0118480 in mmput () -+ #8 0xc0133205 in oom_kill () -+ #9 0xc01332d1 in out_of_memory () -+ #10 0xc013bc1d in try_to_free_pages () -+ #11 0xc013427b in __alloc_pages () -+ #12 0xc013f058 in do_anonymous_page () -+ #13 0xc013f19d in do_no_page () -+ #14 0xc013f60e in handle_mm_fault () -+ #15 0xc01131e5 in do_page_fault () -+ #16 0xc0104935 in error_code () -+ #17 0xc025c0c6 in __copy_to_user_ll () -+ #18 0xc01d496f in reiser4_read_tail () -+ #19 0xc01e4def in read_unix_file () -+ #20 0xc01c3504 in reiser4_read () -+ #21 0xc014bd4f in vfs_read () -+ #22 0xc014bf66 in sys_read () -+ */ -+ warning("vs-44", "out of memory?"); -+ } -+ -+ reiser4_free_file_fsdata(file); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static void set_file_notail(struct inode *inode) -+{ -+ reiser4_inode *state; -+ formatting_plugin *tplug; -+ -+ state = reiser4_inode_data(inode); -+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID); -+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug); -+} -+ -+/* if file is built of tails - convert it to extents */ -+static int unpack(struct file *filp, struct inode *inode, int forever) -+{ -+ int result = 0; -+ struct unix_file_info *uf_info; -+ -+ uf_info = unix_file_inode_data(inode); -+ assert("vs-1628", ea_obtained(uf_info)); -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN); -+ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if the file is being converted by another process - wait -+ * until it completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ return result; -+ } -+ } -+ if
(forever) { -+ /* save new formatting plugin in stat data */ -+ __u64 tograb; -+ -+ set_file_notail(inode); -+ -+ grab_space_enable(); -+ tograb = inode_file_plugin(inode)->estimate.update(inode); -+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT); -+ result = reiser4_update_sd(inode); -+ } -+ -+ return result; -+} -+ -+/* implementation of vfs's ioctl method of struct file_operations for the -+ unix file plugin -+*/ -+int -+ioctl_unix_file(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg UNUSED_ARG) -+{ -+ reiser4_context *ctx; -+ int result; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ switch (cmd) { -+ case REISER4_IOC_UNPACK: -+ get_exclusive_access(unix_file_inode_data(inode)); -+ result = unpack(filp, inode, 1 /* forever */ ); -+ drop_exclusive_access(unix_file_inode_data(inode)); -+ break; -+ -+ default: -+ result = RETERR(-ENOSYS); -+ break; -+ } -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* implementation of vfs's bmap method of struct address_space_operations for -+ the unix file plugin -+*/ -+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock) -+{ -+ reiser4_context *ctx; -+ sector_t result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ struct inode *inode; -+ item_plugin *iplug; -+ sector_t block; -+ -+ inode = mapping->host; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ key_by_inode_and_offset_common(inode, -+ (loff_t) lblock * current_blocksize, -+ &key); -+ -+ init_lh(&lh); -+ result = -+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode); -+ if (cbk_errored(result)) { -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = zload(coord.node); -+ if (result) { -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (iplug->s.file.get_block) { -+ result = iplug->s.file.get_block(&coord, lblock, &block); -+ if (result == 0) -+ result = block; -+ } else -+ result = RETERR(-EINVAL); -+ -+ zrelse(coord.node); -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * flow_by_inode_unix_file - initialize structure flow -+ * @inode: inode of file for which read or write is about to be performed -+ * @buf: buffer to perform read to or write from -+ * @user: flag showing whether @buf is user space or kernel space -+ * @size: size of buffer @buf -+ * @off: start offset for read or write -+ * @op: READ or WRITE -+ * @flow: -+ * -+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
-+ */ -+int flow_by_inode_unix_file(struct inode *inode, -+ const char __user *buf, int user, -+ loff_t size, loff_t off, -+ rw_op op, flow_t *flow) -+{ -+ assert("nikita-1100", inode != NULL); -+ -+ flow->length = size; -+ memcpy(&flow->data, &buf, sizeof(buf)); -+ flow->user = user; -+ flow->op = op; -+ assert("nikita-1931", inode_file_plugin(inode) != NULL); -+ assert("nikita-1932", -+ inode_file_plugin(inode)->key_by_inode == -+ key_by_inode_and_offset_common); -+ /* calculate key of write position and insert it into flow->key */ -+ return key_by_inode_and_offset_common(inode, off, &flow->key); -+} -+ -+/* plugin->u.file.set_plug_in_sd = NULL -+ plugin->u.file.set_plug_in_inode = NULL -+ plugin->u.file.create_blank_sd = NULL */ -+/* plugin->u.file.delete */ -+/* -+ plugin->u.file.add_link = reiser4_add_link_common -+ plugin->u.file.rem_link = NULL */ -+ -+/* plugin->u.file.owns_item -+ this is common_file_owns_item with assertion */ -+/* Audited by: green(2002.06.15) */ -+int -+owns_item_unix_file(const struct inode *inode /* object to check against */ , -+ const coord_t * coord /* coord to check */ ) -+{ -+ int result; -+ -+ result = owns_item_common(inode, coord); -+ if (!result) -+ return 0; -+ if (!plugin_of_group(item_plugin_by_coord(coord), -+ UNIX_FILE_METADATA_ITEM_TYPE)) -+ return 0; -+ assert("vs-547", -+ item_id_by_coord(coord) == EXTENT_POINTER_ID || -+ item_id_by_coord(coord) == FORMATTING_ID); -+ return 1; -+} -+ -+static int setattr_truncate(struct inode *inode, struct iattr *attr) -+{ -+ int result; -+ int s_result; -+ loff_t old_size; -+ reiser4_tree *tree; -+ -+ inode_check_scale(inode, inode->i_size, attr->ia_size); -+ -+ old_size = inode->i_size; -+ tree = reiser4_tree_by_inode(inode); -+ -+ result = safe_link_grab(tree, BA_CAN_COMMIT); -+ if (result == 0) -+ result = safe_link_add(inode, SAFE_TRUNCATE); -+ if (result == 0) -+ result = truncate_file_body(inode, attr); -+ if (result) -+ warning("vs-1588", "truncate_file failed: oid %lli, " -+ "old size %lld, new size %lld, retval %d", -+ (unsigned long long)get_inode_oid(inode), -+ old_size, attr->ia_size, result); -+ -+ s_result = safe_link_grab(tree, BA_CAN_COMMIT); -+ if (s_result == 0) -+ s_result = -+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE); -+ if (s_result != 0) { -+ warning("nikita-3417", "Cannot kill safelink %lli: %i", -+ (unsigned long long)get_inode_oid(inode), s_result); -+ } -+ safe_link_release(tree); -+ return result; -+} -+ -+/* plugin->u.file.setattr method */ -+/* This calls inode_setattr and if truncate is in effect it also takes -+ exclusive inode access to avoid races */ -+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */ -+ struct iattr *attr /* change description */ ) -+{ -+ int result; -+ -+ if (attr->ia_valid & ATTR_SIZE) { -+ reiser4_context *ctx; -+ struct unix_file_info *uf_info; -+ -+ /* truncate does reservation itself and requires exclusive -+ access obtained */ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(dentry->d_inode); -+ get_exclusive_access_careful(uf_info, dentry->d_inode); -+ result = setattr_truncate(dentry->d_inode, attr); -+ drop_exclusive_access(uf_info); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ } else -+ result = reiser4_setattr_common(dentry, attr); -+ -+ return result; -+} -+ -+/* plugin->u.file.init_inode_data */ -+void -+init_inode_data_unix_file(struct inode *inode, -+ reiser4_object_create_data * crd, int 
create) -+{ -+ struct unix_file_info *data; -+ -+ data = unix_file_inode_data(inode); -+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN; -+ init_rwsem(&data->latch); -+ data->tplug = inode_formatting_plugin(inode); -+ data->exclusive_use = 0; -+ -+#if REISER4_DEBUG -+ data->ea_owner = NULL; -+ atomic_set(&data->nr_neas, 0); -+#endif -+ init_inode_ordering(inode, crd, create); -+} -+ -+/** -+ * delete_object_unix_file - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * Truncates file to length 0, removes stat data and safe link. -+ */ -+int delete_object_unix_file(struct inode *inode) -+{ -+ struct unix_file_info *uf_info; -+ int result; -+ -+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) -+ return 0; -+ -+ /* truncate file body first */ -+ uf_info = unix_file_inode_data(inode); -+ get_exclusive_access(uf_info); -+ result = shorten_file(inode, 0 /* size */ ); -+ drop_exclusive_access(uf_info); -+ -+ if (result) -+ warning("edward-1556", -+ "failed to truncate file (%llu) on removal: %d", -+ get_inode_oid(inode), result); -+ -+ /* remove stat data and safe link */ -+ return reiser4_delete_object_common(inode); -+} -+ -+/* plugin->write_begin() */ -+int write_begin_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ int ret; -+ struct unix_file_info *info; -+ -+ info = unix_file_inode_data(file->f_dentry->d_inode); -+ get_exclusive_access(info); -+ ret = find_file_state(file->f_dentry->d_inode, info); -+ if (likely(ret == 0)) { -+ if (info->container == UF_CONTAINER_TAILS) -+ ret = -EINVAL; -+ else -+ ret = do_prepare_write(file, page, from, to); -+ } -+ drop_exclusive_access(info); -+ return ret; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.30/fs/reiser4/plugin/file/file_conversion.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/file_conversion.c 2009-06-22 16:44:21.000000000 +0200 -@@ -0,0 +1,779 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, -+ licensing governed by reiser4/README */ -+ -+/** -+ * This file contains plugin schedule hooks and plugin conversion methods. -+ * -+ * The plugin schedule hook makes a decision (at the plugin schedule point) -+ * about the most reasonable plugins for managing a regular file. Usually such -+ * decisions are made by some O(1) heuristic. -+ * -+ * By default we assign a unix_file plugin id when writing an incompressible -+ * file managed by the cryptcompress plugin id. The currently used heuristic -+ * for estimating compressibility is very simple: if the first complete -+ * logical cluster (64K by default) of a file is incompressible, then we -+ * decide that the whole file is incompressible (*). -+ * -+ * To enable a conversion we install a special "magic" compression mode plugin -+ * (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for details) -+ * at file creation time (**). -+ * -+ * Note that we don't perform back conversion (unix_file->cryptcompress) -+ * because of compatibility reasons (see http://dev.namesys.com/Version4.X.Y -+ * for details).
-+ * -+ * The conversion is accompanied by rebuilding the disk structures of a file, -+ * so it is important to protect them from being accessed by other plugins, -+ * which don't expect them to be in such an inconsistent state. To protect -+ * this we serialize readers and writers of a file's conversion set (FCS). -+ * -+ * We define the FCS as the file plugin installed in the inode's pset plus the -+ * file's data and metadata that this file plugin manipulates (items, etc). -+ * Note that the FCS is defined per file. -+ * An FCS reader is defined as a set of instructions of the following type: -+ * {inode_file_plugin(inode)->method()} (i.e. retrieving a file plugin id -+ * conjoined with all the method's instructions should be atomic). -+ * An FCS writer is a set of instructions that perform file plugin conversion -+ * (convert items, update the pset, etc). -+ * Example: -+ * reiser4_write_careful() supplied to VFS as a ->write() file operation is -+ * composed of the following (optional) instructions: -+ * 1 2 3 -+ * *********************** ####### --------------------------------------------> -+ * -+ * 1) "****" are instructions performed on behalf of the cryptcompress file plugin; -+ * 2) "####" is an FCS writer (performing a conversion cryptcompress->unix_file); -+ * 3) "----" are instructions performed on behalf of the unix_file plugin; -+ * Here (1) and (3) are FCS readers. -+ * -+ * In this example FCS readers and writers are already serialized (by design), -+ * however there can be readers and writers executing at the same time in -+ * different contexts, so we need a common mechanism of serialization. -+ * -+ * Currently serialization of FCS readers and writers is performed via acquiring -+ * a special per-inode rw-semaphore (conv_sem). And yes, {down, up}_read is for -+ * FCS readers, and {down, up}_write is for FCS writers, see the macros below -+ * for passive/active protection. -+ * -+ * --- -+ * (*) This heuristic can be changed to a better one (benchmarking is needed). -+ * (**) Such a technique allows keeping the enable/disable state on disk. -+ */ -+ -+#include "../../inode.h" -+#include "../cluster.h" -+#include "file.h" -+ -+#define conversion_enabled(inode) \ -+ (inode_compression_mode_plugin(inode) == \ -+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID)) -+ -+/** -+ * Located sections (readers and writers of @pset) are not permanently -+ * critical: a cryptcompress file can be converted only if the conversion -+ * is enabled (see the macro above). Also we don't perform back -+ * conversion. The following helper macro is a sanity check to decide -+ * if we need the protection (locks always mean additional overhead). -+ */ -+#define should_protect(inode) \ -+ (inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \ -+ conversion_enabled(inode)) -+/** -+ * To avoid confusion with read/write file operations, we'll speak about -+ * "passive" protection for FCS readers and "active" protection for FCS -+ * writers. All methods with active or passive protection have the suffix -+ * "careful". -+ */ -+/** -+ * Macros for passive protection. -+ * -+ * Construct invariant operation to be supplied to VFS. -+ * The macro accepts the following lexemes: -+ * @type - type of the value represented by the compound statement; -+ * @method - name of an operation to be supplied to VFS (the reiser4 file -+ * plugin should also contain a method with that name).
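/*
 * In user-space terms the FCS scheme described above is a plain rwlock
 * discipline: every entry point that dereferences the file plugin runs
 * under the read side, and the one-shot conversion runs under the write
 * side. A compressed stand-alone sketch (pthread_rwlock_t standing in for
 * conv_sem; the plugin ids and methods are invented for illustration):
 */
#include <pthread.h>
#include <stdio.h>

enum plug { CRYPTCOMPRESS, UNIX_FILE };	/* two "file plugins" */

static enum plug installed = CRYPTCOMPRESS;
static pthread_rwlock_t conv_sem = PTHREAD_RWLOCK_INITIALIZER;

/* FCS reader: fetch the plugin id and run its method atomically */
static void fcs_read_op(void)
{
	pthread_rwlock_rdlock(&conv_sem);
	printf("method of plugin %d\n", installed);	/* plugin->method() */
	pthread_rwlock_unlock(&conv_sem);
}

/* FCS writer: convert items and update the pset under exclusion */
static void fcs_convert(void)
{
	pthread_rwlock_wrlock(&conv_sem);
	/* ... rebuild disk items; readers see old or new, never half ... */
	installed = UNIX_FILE;
	pthread_rwlock_unlock(&conv_sem);
}

int main(void)
{
	fcs_read_op();
	fcs_convert();
	fcs_read_op();
	return 0;
}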
-+ */ -+#define PROT_PASSIVE(type, method, args) \ -+({ \ -+ type _result; \ -+ struct rw_semaphore * guard = \ -+ &reiser4_inode_data(inode)->conv_sem; \ -+ \ -+ if (should_protect(inode)) { \ -+ down_read(guard); \ -+ if (!should_protect(inode)) \ -+ up_read(guard); \ -+ } \ -+ _result = inode_file_plugin(inode)->method args; \ -+ if (should_protect(inode)) \ -+ up_read(guard); \ -+ _result; \ -+}) -+ -+#define PROT_PASSIVE_VOID(method, args) \ -+({ \ -+ struct rw_semaphore * guard = \ -+ &reiser4_inode_data(inode)->conv_sem; \ -+ \ -+ if (should_protect(inode)) { \ -+ down_read(guard); \ -+ if (!should_protect(inode)) \ -+ up_read(guard); \ -+ } \ -+ inode_file_plugin(inode)->method args; \ -+ \ -+ if (should_protect(inode)) \ -+ up_read(guard); \ -+}) -+ -+/* Pass management to the unix-file plugin with "notail" policy */ -+static int __cryptcompress2unixfile(struct file *file, struct inode * inode) -+{ -+ int result; -+ reiser4_inode *info; -+ struct unix_file_info * uf; -+ info = reiser4_inode_data(inode); -+ -+ result = aset_set_unsafe(&info->pset, -+ PSET_FILE, -+ (reiser4_plugin *) -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)); -+ if (result) -+ return result; -+ result = aset_set_unsafe(&info->pset, -+ PSET_FORMATTING, -+ (reiser4_plugin *) -+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID)); -+ if (result) -+ return result; -+ /* get rid of non-standard plugins */ -+ info->plugin_mask &= ~cryptcompress_mask; -+ /* get rid of plugin stat-data extension */ -+ info->extmask &= ~(1 << PLUGIN_STAT); -+ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ -+ /* FIXME use init_inode_data_unix_file() instead, -+ but avoid init_inode_ordering() */ -+ /* Init unix-file specific part of inode */ -+ uf = unix_file_inode_data(inode); -+ uf->container = UF_CONTAINER_UNKNOWN; -+ init_rwsem(&uf->latch); -+ uf->tplug = inode_formatting_plugin(inode); -+ uf->exclusive_use = 0; -+#if REISER4_DEBUG -+ uf->ea_owner = NULL; -+ atomic_set(&uf->nr_neas, 0); -+#endif -+ /** -+ * we were careful to keep file_ops, inode_ops and as_ops -+ * invariant across plugin conversion, so there is -+ * no need to update the ones already installed in the -+ * vfs's residence. -+ */ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+static int disabled_conversion_inode_ok(struct inode * inode) -+{ -+ __u64 extmask = reiser4_inode_data(inode)->extmask; -+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask; -+ -+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) && -+ (extmask & (1 << UNIX_STAT)) && -+ (extmask & (1 << LARGE_TIMES_STAT)) && -+ (extmask & (1 << PLUGIN_STAT)) && -+ (plugin_mask & (1 << PSET_COMPRESSION_MODE))); -+} -+#endif -+ -+/** -+ * Disable future attempts to schedule/convert file plugin. -+ * This function is called by plugin schedule hooks. -+ * -+ * To disable conversion we assign any compression mode plugin id -+ * different from CONVX_COMPRESSION_MODE_ID.
-+ */ -+static int disable_conversion(struct inode * inode) -+{ -+ int result; -+ result = -+ force_plugin_pset(inode, -+ PSET_COMPRESSION_MODE, -+ (reiser4_plugin *)compression_mode_plugin_by_id -+ (LATTD_COMPRESSION_MODE_ID)); -+ assert("edward-1500", -+ ergo(!result, disabled_conversion_inode_ok(inode))); -+ return result; -+} -+ -+/** -+ * Check if we really have achieved plugin scheduling point -+ */ -+static int check_psched_point(struct inode * inode, -+ loff_t pos /* position in the -+ file to write from */, -+ struct cluster_handle * clust, -+ struct psched_context * cont) -+{ -+ assert("edward-1505", conversion_enabled(inode)); -+ /* -+ * if file size is more than cluster size, then compressible -+ * status must be figured out (i.e. compression was disabled, -+ * or file plugin was converted to unix_file) -+ */ -+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode)); -+ -+ if (pos > inode->i_size) -+ /* first logical cluster will contain a (partial) hole */ -+ return disable_conversion(inode); -+ if (pos < inode_cluster_size(inode)) -+ /* writing to the first logical cluster */ -+ return 0; -+ /* -+ * here we have: -+ * cluster_size <= pos <= i_size <= cluster_size, -+ * and, hence, pos == i_size == cluster_size -+ */ -+ assert("edward-1498", -+ pos == inode->i_size && -+ pos == inode_cluster_size(inode)); -+ assert("edward-1539", cont != NULL); -+ assert("edward-1540", cont->state == PSCHED_INVAL_STATE); -+ -+ cont->state = PSCHED_SCHED_POINT; -+ return 0; -+} -+ -+static void start_check_compressibility(struct inode * inode, -+ struct cluster_handle * clust, -+ hint_t * hint) -+{ -+ assert("edward-1507", clust->index == 1); -+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc)); -+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ); -+ -+ hint_init_zero(hint); -+ clust->hint = hint; -+ clust->index--; -+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode)); -+ -+ /* first logical cluster (of index #0) must be complete */ -+ assert("edward-1510", lbytes(clust->index, inode) == -+ inode_cluster_size(inode)); -+} -+ -+static void finish_check_compressibility(struct inode * inode, -+ struct cluster_handle * clust, -+ hint_t * hint) -+{ -+ reiser4_unset_hint(clust->hint); -+ clust->hint = hint; -+ clust->index++; -+} -+ -+#if REISER4_DEBUG -+static int prepped_dclust_ok(hint_t * hint) -+{ -+ reiser4_key key; -+ coord_t * coord = &hint->ext_coord.coord; -+ -+ item_key_by_coord(coord, &key); -+ return (item_id_by_coord(coord) == CTAIL_ID && -+ !coord_is_unprepped_ctail(coord) && -+ (get_key_offset(&key) + nr_units_ctail(coord) == -+ dclust_get_extension_dsize(hint))); -+} -+#endif -+ -+#define fifty_percent(size) (size >> 1) -+/* evaluation of data compressibility */ -+#define data_is_compressible(osize, isize) \ -+ (osize < fifty_percent(isize)) -+ -+/** -+ * A simple O(1)-heuristic for compressibility. -+ * This is called no more than once during a file's life. -+ * Read first logical cluster (of index #0) and estimate its compressibility. -+ * Save estimation result in @cont.
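/*
 * The heuristic is easy to reproduce outside the kernel: compress the
 * first logical cluster once and compare the output with half of the
 * input size, as the data_is_compressible() macro above does. A
 * stand-alone user-space sketch using zlib's compress() in place of the
 * inode's compression plugin (assumes zlib is available; link with -lz):
 */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define CLUSTER_SIZE (64 * 1024)	/* default logical cluster size */

static int is_compressible(uLong osize, uLong isize)
{
	return osize < isize / 2;	/* same test as data_is_compressible */
}

int main(void)
{
	static unsigned char in[CLUSTER_SIZE];
	static unsigned char out[CLUSTER_SIZE + CLUSTER_SIZE / 2];
	uLongf out_len = sizeof(out);

	memset(in, 'x', sizeof(in));	/* highly compressible sample data */
	if (compress(out, &out_len, in, sizeof(in)) != Z_OK)
		return 1;
	printf("%lu -> %lu bytes: %s\n",
	       (unsigned long)sizeof(in), (unsigned long)out_len,
	       is_compressible(out_len, sizeof(in)) ?
	       "keep cryptcompress" : "convert to unix_file");
	return 0;
}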
-+ */ -+static int read_check_compressibility(struct inode * inode, -+ struct cluster_handle * clust, -+ struct psched_context * cont) -+{ -+ int i; -+ int result; -+ __u32 dst_len; -+ hint_t tmp_hint; -+ hint_t * cur_hint = clust->hint; -+ assert("edward-1541", cont->state == PSCHED_SCHED_POINT); -+ -+ start_check_compressibility(inode, clust, &tmp_hint); -+ -+ reset_cluster_pgset(clust, cluster_nrpages(inode)); -+ result = grab_page_cluster(inode, clust, READ_OP); -+ if (result) -+ return result; -+ /* Read page cluster here */ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *page = clust->pages[i]; -+ lock_page(page); -+ result = do_readpage_ctail(inode, clust, page, -+ ZNODE_READ_LOCK); -+ unlock_page(page); -+ if (result) -+ goto error; -+ } -+ tfm_cluster_clr_uptodate(&clust->tc); -+ -+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE); -+ -+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) { -+ /* length of compressed data is known, no need to compress */ -+ assert("edward-1511", -+ znode_is_any_locked(tmp_hint.lh.node)); -+ assert("edward-1512", -+ WITH_DATA(tmp_hint.ext_coord.coord.node, -+ prepped_dclust_ok(&tmp_hint))); -+ dst_len = dclust_get_extension_dsize(&tmp_hint); -+ } -+ else { -+ struct tfm_cluster * tc = &clust->tc; -+ compression_plugin * cplug = inode_compression_plugin(inode); -+ result = grab_tfm_stream(inode, tc, INPUT_STREAM); -+ if (result) -+ goto error; -+ for (i = 0; i < clust->nr_pages; i++) { -+ char *data; -+ lock_page(clust->pages[i]); -+ BUG_ON(!PageUptodate(clust->pages[i])); -+ data = kmap(clust->pages[i]); -+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), -+ data, PAGE_CACHE_SIZE); -+ kunmap(clust->pages[i]); -+ unlock_page(clust->pages[i]); -+ } -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ goto error; -+ result = grab_coa(tc, cplug); -+ if (result) -+ goto error; -+ tc->len = tc->lsize = lbytes(clust->index, inode); -+ assert("edward-1513", tc->len == inode_cluster_size(inode)); -+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); -+ cplug->compress(get_coa(tc, cplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ assert("edward-1514", -+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); -+ } -+ finish_check_compressibility(inode, clust, cur_hint); -+ cont->state = -+ (data_is_compressible(dst_len, inode_cluster_size(inode)) ? -+ PSCHED_REMAINS_OLD : -+ PSCHED_ASSIGNED_NEW); -+ return 0; -+ error: -+ put_page_cluster(clust, inode, READ_OP); -+ return result; -+} -+ -+/* Cut disk cluster of index @idx */ -+static int cut_disk_cluster(struct inode * inode, cloff_t idx) -+{ -+ reiser4_key from, to; -+ assert("edward-1515", inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from); -+ to = from; -+ set_key_offset(&to, -+ get_key_offset(&from) + inode_cluster_size(inode) - 1); -+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), -+ &from, &to, inode, 0); -+} -+ -+static int reserve_cryptcompress2unixfile(struct inode *inode) -+{ -+ reiser4_block_nr unformatted_nodes; -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ -+ /* number of unformatted nodes which will be created */ -+ unformatted_nodes = cluster_nrpages(inode); /* N */ -+ -+ /* -+ * space required for one iteration of ctail->extent conversion: -+ * -+ * 1. kill ctail items -+ * -+ * 2. insert N unformatted nodes -+ * -+ * 3.
insert N (worst-case single-block -+ * extents) extent units. -+ * -+ * 4. drilling to the leaf level by coord_by_key() -+ * -+ * 5. possible update of stat-data -+ * -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (2 * tree->height + -+ unformatted_nodes + -+ unformatted_nodes * estimate_one_insert_into_item(tree) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+} -+ -+/** -+ * Convert cryptcompress file plugin to unix_file plugin. -+ */ -+static int cryptcompress2unixfile(struct file * file, struct inode * inode, -+ struct psched_context * cont) -+{ -+ int i; -+ int result = 0; -+ struct cryptcompress_info *cr_info; -+ struct unix_file_info *uf_info; -+ assert("edward-1516", cont->pages[0]->index == 0); -+ -+ /* release all cryptcompress-specific resources */ -+ cr_info = cryptcompress_inode_data(inode); -+ result = reserve_cryptcompress2unixfile(inode); -+ if (result) -+ goto out; -+ /* tell kill_hook to not truncate pages */ -+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS); -+ result = cut_disk_cluster(inode, 0); -+ if (result) -+ goto out; -+ /* captured jnode of cluster and associated resources (pages, -+ reserved disk space) were released by ->kill_hook() method -+ of the item plugin */ -+ -+ result = __cryptcompress2unixfile(file, inode); -+ if (result) -+ goto out; -+ /* At this point file is managed by unix file plugin */ -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ assert("edward-1518", -+ ergo(jprivate(cont->pages[0]), -+ !jnode_is_cluster_page(jprivate(cont->pages[0])))); -+ for(i = 0; i < cont->nr_pages; i++) { -+ assert("edward-1519", cont->pages[i]); -+ assert("edward-1520", PageUptodate(cont->pages[i])); -+ -+ result = find_or_create_extent(cont->pages[i]); -+ if (result) -+ break; -+ } -+ if (unlikely(result)) -+ goto out; -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ result = reiser4_update_sd(inode); -+ out: -+ all_grabbed2free(); -+ return result; -+} -+ -+#define convert_file_plugin cryptcompress2unixfile -+ -+/** -+ * This is called by ->write() method of a cryptcompress file plugin. -+ * Make a decision about the most reasonable file plugin id to manage -+ * the file. -+ */ -+int write_pschedule_hook(struct file * file, struct inode * inode, -+ loff_t pos, struct cluster_handle * clust, -+ struct psched_context * cont) -+{ -+ int result; -+ if (!conversion_enabled(inode)) -+ return 0; -+ result = check_psched_point(inode, pos, clust, cont); -+ if (result || cont->state != PSCHED_SCHED_POINT) -+ return result; -+ result = read_check_compressibility(inode, clust, cont); -+ if (result) -+ return result; -+ if (cont->state == PSCHED_REMAINS_OLD) { -+ put_page_cluster(clust, inode, READ_OP); -+ return disable_conversion(inode); -+ } -+ assert("edward-1543", cont->state == PSCHED_ASSIGNED_NEW); -+ /* -+ * page cluster is grabbed and uptodate. It will be -+ * released with a pgset after plugin conversion is -+ * finished, see put_psched_context(). -+ */ -+ reiser4_unset_hint(clust->hint); -+ move_cluster_pgset(clust, &cont->pages, &cont->nr_pages); -+ return 0; -+} -+ -+/** -+ * This is called by ->setattr() method of cryptcompress file plugin. 
-+ */ -+int setattr_pschedule_hook(struct inode * inode) -+{ -+ if (conversion_enabled(inode)) -+ return disable_conversion(inode); -+ return 0; -+} -+ -+static inline void init_psched_context(struct psched_context * cont) -+{ -+ memset(cont, 0, sizeof(*cont)); -+} -+ -+static inline void done_psched_context(struct psched_context * cont, -+ struct inode * inode) -+{ -+ if (cont->pages) { -+ __put_page_cluster(0, cont->nr_pages, cont->pages, inode); -+ kfree(cont->pages); -+ } -+} -+/** -+ * Here are wrappers with "protection", aka Reiser4 "careful" methods. -+ * They are used by vfs (as methods of file_ops, inode_ops or as_ops), -+ * which is not aware of plugin conversion performed by Reiser4. -+ */ -+ -+/* -+ * Wrappers with active protection for: -+ * -+ * ->write(); -+ */ -+ -+/* -+ * ->write() file operation supplied to VFS. -+ * Write a file in 3 steps (some of them can be optional). -+ */ -+ssize_t reiser4_write_careful(struct file *file, const char __user *buf, -+ size_t count, loff_t *off) -+{ -+ int result; -+ reiser4_context *ctx; -+ ssize_t written_old = 0; /* bytes written with initial plugin */ -+ ssize_t written_new = 0; /* bytes written with new plugin */ -+ struct psched_context cont; -+ struct inode * inode = file->f_dentry->d_inode; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ init_psched_context(&cont); -+ mutex_lock(&inode->i_mutex); -+ /** -+ * First step. -+ * Start write with initial file plugin. -+ * Keep a plugin schedule status at @cont (if any). -+ */ -+ written_old = inode_file_plugin(inode)->write(file, -+ buf, -+ count, -+ off, -+ &cont); -+ if (cont.state != PSCHED_ASSIGNED_NEW || written_old < 0) -+ goto exit; -+ /** -+ * Second step. -+ * New file plugin has been scheduled. -+ * Perform conversion to the new plugin. -+ */ -+ down_read(&reiser4_inode_data(inode)->conv_sem); -+ result = convert_file_plugin(file, inode, &cont); -+ up_read(&reiser4_inode_data(inode)->conv_sem); -+ if (result) { -+ warning("edward-1544", -+ "Inode %llu: file plugin conversion failed (%d)", -+ (unsigned long long)get_inode_oid(inode), -+ result); -+ context_set_commit_async(ctx); -+ goto exit; -+ } -+ reiser4_txn_restart(ctx); -+ /** -+ * Third step: -+ * Finish write with the new file plugin. -+ */ -+ assert("edward-1536", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)); -+ -+ written_new = inode_file_plugin(inode)->write(file, -+ buf + written_old, -+ count - written_old, -+ off, -+ NULL); -+ exit: -+ mutex_unlock(&inode->i_mutex); -+ done_psched_context(&cont, inode); -+ reiser4_exit_context(ctx); -+ -+ return written_old + (written_new < 0 ? 0 : written_new); -+} -+ -+/* Wrappers with passive protection for: -+ * -+ * ->open(); -+ * ->read(); -+ * ->ioctl(); -+ * ->mmap(); -+ * ->release(); -+ * ->bmap(). 
-+ */ -+ -+int reiser4_open_careful(struct inode *inode, struct file *file) -+{ -+ return PROT_PASSIVE(int, open, (inode, file)); -+} -+ -+ssize_t reiser4_read_careful(struct file * file, char __user * buf, -+ size_t size, loff_t * off) -+{ -+ struct inode * inode = file->f_dentry->d_inode; -+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off)); -+} -+ -+int reiser4_ioctl_careful(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg) -+{ -+ return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg)); -+} -+ -+int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ return PROT_PASSIVE(int, mmap, (file, vma)); -+} -+ -+int reiser4_release_careful(struct inode *inode, struct file *file) -+{ -+ return PROT_PASSIVE(int, release, (inode, file)); -+} -+ -+sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock) -+{ -+ struct inode *inode = mapping->host; -+ return PROT_PASSIVE(sector_t, bmap, (mapping, lblock)); -+} -+ -+/** -+ * NOTE: The following two methods are -+ * used only for loopback functionality. -+ * reiser4_write_end() can not cope with -+ * short writes for now. -+ */ -+int reiser4_write_begin_careful(struct file *file, -+ struct address_space *mapping, -+ loff_t pos, -+ unsigned len, -+ unsigned flags, -+ struct page **pagep, -+ void **fsdata) -+{ -+ int ret = 0; -+ unsigned start, end; -+ struct page *page; -+ pgoff_t index; -+ reiser4_context *ctx; -+ struct inode * inode = file->f_dentry->d_inode; -+ -+ index = pos >> PAGE_CACHE_SHIFT; -+ start = pos & (PAGE_CACHE_SIZE - 1); -+ end = start + len; -+ -+ page = grab_cache_page_write_begin(mapping, index, -+ flags & AOP_FLAG_NOFS); -+ *pagep = page; -+ if (!page) -+ return -ENOMEM; -+ -+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) { -+ ret = PTR_ERR(ctx); -+ goto out; -+ } -+ ret = PROT_PASSIVE(int, write_begin, (file, page, start, end)); -+ -+ /* don't commit transaction under inode semaphore */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ out: -+ if (unlikely(ret)) { -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return ret; -+} -+ -+int reiser4_write_end_careful(struct file *file, -+ struct address_space *mapping, -+ loff_t pos, -+ unsigned len, -+ unsigned copied, -+ struct page *page, -+ void *fsdata) -+{ -+ int ret; -+ reiser4_context *ctx; -+ unsigned start, end; -+ struct inode *inode = page->mapping->host; -+ -+ assert("umka-3101", file != NULL); -+ assert("umka-3102", page != NULL); -+ assert("umka-3093", PageLocked(page)); -+ -+ start = pos & (PAGE_CACHE_SIZE - 1); -+ end = start + len; -+ -+ flush_dcache_page(page); -+ SetPageUptodate(page); -+ -+ ctx = reiser4_init_context(page->mapping->host->i_sb); -+ if (IS_ERR(ctx)){ -+ unlock_page(page); -+ ret = PTR_ERR(ctx); -+ goto out; -+ } -+ ret = PROT_PASSIVE(int, write_end, (file, page, start, end)); -+ -+ /* don't commit transaction under inode semaphore */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ out: -+ page_cache_release(page); -+ if (!ret) -+ ret = copied; -+ return ret; -+} -+ -+/* -+ * Wrappers without protection for: -+ * -+ * ->setattr() -+ */ -+int reiser4_setattr(struct dentry *dentry, struct iattr *attr) -+{ -+ return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff 
-urN linux-2.6.30.orig/fs/reiser4/plugin/file/file.h linux-2.6.30/fs/reiser4/plugin/file/file.h ---- linux-2.6.30.orig/fs/reiser4/plugin/file/file.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/file.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,336 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* this file contains declarations of methods implementing -+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID -+ and SYMLINK_FILE_PLUGIN_ID) */ -+ -+#if !defined( __REISER4_FILE_H__ ) -+#define __REISER4_FILE_H__ -+ -+/* possible states when scheduling a new file plugin */ -+typedef enum { -+ PSCHED_INVAL_STATE, /* invalid state */ -+ PSCHED_SCHED_POINT, /* scheduling point has been achieved */ -+ PSCHED_REMAINS_OLD, /* made a decision to be managed by old plugin */ -+ PSCHED_ASSIGNED_NEW /* new plugin has been scheduled */ -+} psched_state; -+ -+struct psched_context { -+ int nr_pages; -+ struct page **pages; -+ psched_state state; -+}; -+ -+/** -+ * Declarations of common/careful/generic methods. -+ * Suppose ->foo() is a vfs method (of f_ops, i_ops, or a_ops); -+ * Then common reiser4 method for foo looks like reiser4_foo_common; -+ * careful method looks like reiser4_foo_careful; -+ * generic method looks like reiser4_foo. -+ * -+ * Common method is a simple instruction set eligible for more -+ * than one plugin id. -+ * -+ * Generic method looks at the plugin installed in inode's -+ * plugin set and calls its appropriate method. -+ * -+ * Careful method looks like generic method with protected pset -+ * (see plugin/file/file_conversion.c for details). -+ */ -+ -+/* inode operations */ -+int reiser4_setattr(struct dentry *, struct iattr *); -+ -+/* file operations */ -+ssize_t reiser4_read_careful(struct file *, char __user *buf, -+ size_t count, loff_t *off); -+ssize_t reiser4_write_careful(struct file *, const char __user *buf, -+ size_t count, loff_t * off); -+int reiser4_ioctl_careful(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); -+int reiser4_mmap_careful(struct file *, struct vm_area_struct *); -+int reiser4_open_careful(struct inode *inode, struct file *file); -+int reiser4_release_careful(struct inode *, struct file *); -+int reiser4_sync_file_common(struct file *, struct dentry *, int datasync); -+ -+/* address space operations */ -+int reiser4_readpage(struct file *, struct page *); -+int reiser4_readpages(struct file*, struct address_space*, struct list_head*, -+ unsigned); -+int reiser4_writepages(struct address_space *, struct writeback_control *); -+int reiser4_write_begin_careful(struct file *file, -+ struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned flags, -+ struct page **pagep, void **fsdata); -+int reiser4_write_end_careful(struct file *file, -+ struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned copied, -+ struct page *page, void *fsdata); -+sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock); -+ -+/* -+ * Private methods of unix-file plugin -+ * (UNIX_FILE_PLUGIN_ID) -+ */ -+ -+/* private inode operations */ -+int setattr_unix_file(struct dentry *, struct iattr *); -+ -+/* private file operations */ -+ -+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount, -+ loff_t *off); -+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount, -+ loff_t * off, struct psched_context * cont); -+int ioctl_unix_file(struct inode *, struct file 
*, unsigned int cmd, -+ unsigned long arg); -+int mmap_unix_file(struct file *, struct vm_area_struct *); -+int open_unix_file(struct inode *, struct file *); -+int release_unix_file(struct inode *, struct file *); -+ -+/* private address space operations */ -+int readpage_unix_file(struct file *, struct page *); -+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, -+ unsigned); -+int writepages_unix_file(struct address_space *, struct writeback_control *); -+int write_begin_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+int write_end_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+sector_t bmap_unix_file(struct address_space *, sector_t lblock); -+ -+/* other private methods */ -+int delete_object_unix_file(struct inode *); -+int flow_by_inode_unix_file(struct inode *, const char __user *buf, -+ int user, loff_t, loff_t, rw_op, flow_t *); -+int owns_item_unix_file(const struct inode *, const coord_t *); -+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *, -+ int create); -+ -+/* -+ * Private methods of cryptcompress file plugin -+ * (CRYPTCOMPRESS_FILE_PLUGIN_ID) -+ */ -+ -+/* private inode operations */ -+int setattr_cryptcompress(struct dentry *, struct iattr *); -+ -+/* private file operations */ -+ssize_t read_cryptcompress(struct file *, char __user *buf, -+ size_t count, loff_t *off); -+ssize_t write_cryptcompress(struct file *, const char __user *buf, -+ size_t count, loff_t * off, -+ struct psched_context *cont); -+int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd, -+ unsigned long arg); -+int mmap_cryptcompress(struct file *, struct vm_area_struct *); -+int open_cryptcompress(struct inode *, struct file *); -+int release_cryptcompress(struct inode *, struct file *); -+ -+/* private address space operations */ -+int readpage_cryptcompress(struct file *, struct page *); -+int readpages_cryptcompress(struct file*, struct address_space*, -+ struct list_head*, unsigned); -+int writepages_cryptcompress(struct address_space *, -+ struct writeback_control *); -+int write_begin_cryptcompress(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+int write_end_cryptcompress(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+sector_t bmap_cryptcompress(struct address_space *, sector_t lblock); -+ -+/* other private methods */ -+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf, -+ int user, loff_t, loff_t, rw_op, flow_t *); -+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *); -+int create_object_cryptcompress(struct inode *, struct inode *, -+ reiser4_object_create_data *); -+int delete_object_cryptcompress(struct inode *); -+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *, -+ int create); -+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ struct inode *object, int truncate, -+ int *progress); -+void destroy_inode_cryptcompress(struct inode *); -+ -+/* -+ * Private methods of symlink file plugin -+ * (SYMLINK_FILE_PLUGIN_ID) -+ */ -+int reiser4_create_symlink(struct inode *symlink, struct inode *dir, -+ reiser4_object_create_data *); -+void destroy_inode_symlink(struct inode *); -+ -+/* -+ * all the write into unix file is performed by item write method. 
Write method -+ * of unix file plugin only decides which item plugin (extent or tail) and in -+ * which mode (one from the enum below) to call -+ */ -+typedef enum { -+ FIRST_ITEM = 1, -+ APPEND_ITEM = 2, -+ OVERWRITE_ITEM = 3 -+} write_mode_t; -+ -+/* unix file may be in one of the following states */ -+typedef enum { -+ UF_CONTAINER_UNKNOWN = 0, -+ UF_CONTAINER_TAILS = 1, -+ UF_CONTAINER_EXTENTS = 2, -+ UF_CONTAINER_EMPTY = 3 -+} file_container_t; -+ -+struct formatting_plugin; -+struct inode; -+ -+/* unix file plugin specific part of reiser4 inode */ -+struct unix_file_info { -+ /* -+ * this read-write lock protects file containerization change. Accesses -+ * which do not change file containerization (see file_container_t) -+ * (read, readpage, writepage, write (until tail conversion is -+ * involved)) take read-lock. Accesses which modify file -+ * containerization (truncate, conversion from tail to extent and back) -+ * take write-lock. -+ */ -+ struct rw_semaphore latch; -+ /* this enum specifies which items are used to build the file */ -+ file_container_t container; -+ /* -+ * plugin which controls when file is to be converted to extents and -+ * back to tail -+ */ -+ struct formatting_plugin *tplug; -+ /* if this is set, file is in exclusive use */ -+ int exclusive_use; -+#if REISER4_DEBUG -+ /* pointer to task struct of thread owning exclusive access to file */ -+ void *ea_owner; -+ atomic_t nr_neas; -+ void *last_reader; -+#endif -+}; -+ -+struct unix_file_info *unix_file_inode_data(const struct inode *inode); -+void get_exclusive_access(struct unix_file_info *); -+void drop_exclusive_access(struct unix_file_info *); -+void get_nonexclusive_access(struct unix_file_info *); -+void drop_nonexclusive_access(struct unix_file_info *); -+int try_to_get_nonexclusive_access(struct unix_file_info *); -+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode, -+ struct inode *); -+int find_file_item_nohint(coord_t *, lock_handle *, -+ const reiser4_key *, znode_lock_mode, -+ struct inode *); -+ -+int load_file_hint(struct file *, hint_t *); -+void save_file_hint(struct file *, const hint_t *); -+ -+#include "../item/extent.h" -+#include "../item/tail.h" -+#include "../item/ctail.h" -+ -+struct uf_coord { -+ coord_t coord; -+ lock_handle *lh; -+ int valid; -+ union { -+ struct extent_coord_extension extent; -+ struct tail_coord_extension tail; -+ struct ctail_coord_extension ctail; -+ } extension; -+}; -+ -+#include "../../forward.h" -+#include "../../seal.h" -+#include "../../lock.h" -+ -+/* -+ * This structure is used to speed up file operations (reads and writes). A -+ * hint is a suggestion about where a key resolved to last time. A seal -+ * indicates whether a node has been modified since a hint was last recorded. -+ * You check the seal, and if the seal is still valid, you can use the hint -+ * without traversing the tree again. 
-+ */ -+struct hint { -+ seal_t seal; /* a seal over last file item accessed */ -+ uf_coord_t ext_coord; -+ loff_t offset; -+ znode_lock_mode mode; -+ lock_handle lh; -+}; -+ -+static inline int hint_is_valid(hint_t * hint) -+{ -+ return hint->ext_coord.valid; -+} -+ -+static inline void hint_set_valid(hint_t * hint) -+{ -+ hint->ext_coord.valid = 1; -+} -+ -+static inline void hint_clr_valid(hint_t * hint) -+{ -+ hint->ext_coord.valid = 0; -+} -+ -+int load_file_hint(struct file *, hint_t *); -+void save_file_hint(struct file *, const hint_t *); -+void hint_init_zero(hint_t *); -+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode); -+int hint_is_set(const hint_t *); -+void reiser4_unset_hint(hint_t *); -+ -+int reiser4_update_file_size(struct inode *, loff_t, int update_sd); -+int cut_file_items(struct inode *, loff_t new_size, -+ int update_sd, loff_t cur_size, -+ int (*update_actor) (struct inode *, loff_t, int)); -+#if REISER4_DEBUG -+ -+/* return 1 if exclusive access is obtained, 0 otherwise */ -+static inline int ea_obtained(struct unix_file_info * uf_info) -+{ -+ int ret; -+ -+ ret = down_read_trylock(&uf_info->latch); -+ if (ret) -+ up_read(&uf_info->latch); -+ return !ret; -+} -+ -+#endif -+ -+#define WRITE_GRANULARITY 32 -+ -+int tail2extent(struct unix_file_info *); -+int extent2tail(struct file *, struct unix_file_info *); -+ -+int goto_right_neighbor(coord_t *, lock_handle *); -+int find_or_create_extent(struct page *); -+int equal_to_ldk(znode *, const reiser4_key *); -+ -+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh); -+ -+static inline int cbk_errored(int cbk_result) -+{ -+ return (cbk_result != CBK_COORD_NOTFOUND -+ && cbk_result != CBK_COORD_FOUND); -+} -+ -+/* __REISER4_FILE_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/Makefile linux-2.6.30/fs/reiser4/plugin/file/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,7 @@ -+obj-$(CONFIG_REISER4_FS) += file_plugins.o -+ -+file_plugins-objs := \ -+ file.o \ -+ tail_conversion.o \ -+ symlink.o \ -+ cryptcompress.o -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.30/fs/reiser4/plugin/file/symfile.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/symfile.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,87 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Symfiles are a generalization of Unix symlinks. -+ -+ A symfile when read behaves as though you took its contents and -+ substituted them into the reiser4 naming system as the right hand side -+ of an assignment, and then read that which you had assigned to it. -+ -+ A key issue for symfiles is how to implement writes through to -+ subfiles. In general, one must have some method of determining what -+ of that which is written to the symfile is written to what subfile. -+ This can be done by use of custom plugin methods written by users, or -+ by using a few general methods we provide for those willing to endure -+ the insertion of delimiters into what is read. 
-+ -+ Writing to symfiles without delimiters to denote what is written to -+ what subfile is not supported by any plugins we provide in this -+ release. Our most sophisticated support for writes is that embodied -+ by the invert plugin (see invert.c). -+ -+ A read only version of the /etc/passwd file might be -+ constructed as a symfile whose contents are as follows: -+ -+ /etc/passwd/userlines/* -+ -+ or -+ -+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root -+ -+ or -+ -+ /etc/passwd/userlines/(demidov+edward+reiser+root) -+ -+ A symfile with contents -+ -+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB -+ -+ will return when read -+ -+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB -+ -+ and write of what has been read will not be possible to implement as -+ an identity operation because there are no delimiters denoting the -+ boundaries of what is to be written to what subfile. -+ -+ Note that one could make this a read/write symfile if one specified -+ delimiters, and the write method understood those delimiters delimited -+ what was written to subfiles. -+ -+ So, specifying the symfile in a manner that allows writes: -+ -+ /etc/passwd/userlines/demidov+"( -+ )+/etc/passwd/userlines/edward+"( -+ )+/etc/passwd/userlines/reiser+"( -+ )+/etc/passwd/userlines/root+"( -+ ) -+ -+ or -+ -+ /etc/passwd/userlines/(demidov+"( -+ )+edward+"( -+ )+reiser+"( -+ )+root+"( -+ )) -+ -+ and the file demidov might be specified as: -+ -+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell -+ -+ or -+ -+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell) -+ -+ Notice that if the file demidov has a carriage return in it, the -+ parsing fails, but then if you put carriage returns in the wrong place -+ in a normal /etc/passwd file it breaks things also. -+ -+ Note that it is forbidden to have no text between two interpolations -+ if one wants to be able to define what parts of a write go to what -+ subfiles referenced in an interpolation. -+ -+ If one wants to be able to add new lines by writing to the file, one -+ must either write a custom plugin for /etc/passwd that knows how to -+ name an added line, or one must use an invert, or one must use a more -+ sophisticated symfile syntax that we are not planning to write for -+ version 4.0. 
-+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.30/fs/reiser4/plugin/file/symlink.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/symlink.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,95 @@ -+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../inode.h" -+ -+#include <linux/types.h> -+#include <linux/fs.h> -+ -+/* file plugin methods specific for symlink files -+ (SYMLINK_FILE_PLUGIN_ID) */ -+ -+/* this is implementation of create_object method of file plugin for -+ SYMLINK_FILE_PLUGIN_ID -+ */ -+ -+/** -+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID -+ * @symlink: inode of symlink object -+ * @dir: inode of parent directory -+ * @info: parameters of new object -+ * -+ * Inserts stat data with symlink extension into the tree. -+ */ -+int reiser4_create_symlink(struct inode *symlink, -+ struct inode *dir UNUSED_ARG, -+ reiser4_object_create_data *data /* info passed to us -+ * this is filled by -+ * reiser4() syscall -+ * in particular */) -+{ -+ int result; -+ -+ assert("nikita-680", symlink != NULL); -+ assert("nikita-681", S_ISLNK(symlink->i_mode)); -+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD)); -+ assert("nikita-682", dir != NULL); -+ assert("nikita-684", data != NULL); -+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID); -+ -+ /* -+ * stat data of symlink has symlink extension in which we store -+ * symlink content, that is, path symlink is pointing to. -+ */ -+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT); -+ -+ assert("vs-838", symlink->i_private == NULL); -+ symlink->i_private = (void *)data->name; -+ -+ assert("vs-843", symlink->i_size == 0); -+ INODE_SET_FIELD(symlink, i_size, strlen(data->name)); -+ -+ /* insert stat data appended with data->name */ -+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink); -+ if (result) { -+ /* FIXME-VS: Make sure that symlink->i_private is not attached -+ to kmalloced data */ -+ INODE_SET_FIELD(symlink, i_size, 0); -+ } else { -+ assert("vs-849", symlink->i_private -+ && reiser4_inode_get_flag(symlink, -+ REISER4_GENERIC_PTR_USED)); -+ assert("vs-850", -+ !memcmp((char *)symlink->i_private, data->name, -+ (size_t) symlink->i_size + 1)); -+ } -+ return result; -+} -+ -+/* this is implementation of destroy_inode method of file plugin for -+ SYMLINK_FILE_PLUGIN_ID -+ */ -+void destroy_inode_symlink(struct inode *inode) -+{ -+ assert("edward-799", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID)); -+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode)); -+ assert("edward-801", reiser4_inode_get_flag(inode, -+ REISER4_GENERIC_PTR_USED)); -+ assert("vs-839", S_ISLNK(inode->i_mode)); -+ -+ kfree(inode->i_private); -+ inode->i_private = NULL; -+ reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.30/fs/reiser4/plugin/file/tail_conversion.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file/tail_conversion.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,737 @@ -+/* Copyright 2001, 2002, 2003 by Hans 
Reiser, licensing governed by reiser4/README */ -+ -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../page_cache.h" -+#include "../../carry.h" -+#include "../../safe_link.h" -+#include "../../vfs_ops.h" -+ -+#include <linux/writeback.h> -+ -+/* this file contains: -+ tail2extent and extent2tail */ -+ -+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */ -+void get_exclusive_access(struct unix_file_info * uf_info) -+{ -+ assert("nikita-3028", reiser4_schedulable()); -+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w)); -+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r)); -+ /* -+ * "deadlock avoidance": sometimes we commit a transaction under -+ * rw-semaphore on a file. Such commit can deadlock with another -+ * thread that captured some block (hence preventing atom from being -+ * committed) and waits on rw-semaphore. -+ */ -+ reiser4_txn_restart_current(); -+ LOCK_CNT_INC(inode_sem_w); -+ down_write(&uf_info->latch); -+ uf_info->exclusive_use = 1; -+ assert("vs-1713", uf_info->ea_owner == NULL); -+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0); -+ ON_DEBUG(uf_info->ea_owner = current); -+} -+ -+void drop_exclusive_access(struct unix_file_info * uf_info) -+{ -+ assert("vs-1714", uf_info->ea_owner == current); -+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0); -+ ON_DEBUG(uf_info->ea_owner = NULL); -+ uf_info->exclusive_use = 0; -+ up_write(&uf_info->latch); -+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r)); -+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w)); -+ LOCK_CNT_DEC(inode_sem_w); -+ reiser4_txn_restart_current(); -+} -+ -+/** -+ * nea_grabbed - do something when file semaphore is down_read-ed -+ * @uf_info: -+ * -+ * This is called when nonexclusive access is obtained on file. All it does is -+ * for debugging purposes. -+ */ -+static void nea_grabbed(struct unix_file_info *uf_info) -+{ -+#if REISER4_DEBUG -+ LOCK_CNT_INC(inode_sem_r); -+ assert("vs-1716", uf_info->ea_owner == NULL); -+ atomic_inc(&uf_info->nr_neas); -+ uf_info->last_reader = current; -+#endif -+} -+ -+/** -+ * get_nonexclusive_access - get nonexclusive access to a file -+ * @uf_info: unix file specific part of inode to obtain access to -+ * -+ * Nonexclusive access is obtained on a file before read, write, readpage. -+ */ -+void get_nonexclusive_access(struct unix_file_info *uf_info) -+{ -+ assert("nikita-3029", reiser4_schedulable()); -+ assert("nikita-3361", get_current_context()->trans->atom == NULL); -+ -+ down_read(&uf_info->latch); -+ nea_grabbed(uf_info); -+} -+ -+/** -+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file -+ * @uf_info: unix file specific part of inode to obtain access to -+ * -+ * Non-blocking version of nonexclusive access obtaining. -+ */ -+int try_to_get_nonexclusive_access(struct unix_file_info *uf_info) -+{ -+ int result; -+ -+ result = down_read_trylock(&uf_info->latch); -+ if (result) -+ nea_grabbed(uf_info); -+ return result; -+} -+ -+void drop_nonexclusive_access(struct unix_file_info * uf_info) -+{ -+ assert("vs-1718", uf_info->ea_owner == NULL); -+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0); -+ ON_DEBUG(atomic_dec(&uf_info->nr_neas)); -+ -+ up_read(&uf_info->latch); -+ -+ LOCK_CNT_DEC(inode_sem_r); -+ reiser4_txn_restart_current(); -+} -+ -+/* part of tail2extent. 
Cut all items covering @count bytes starting from -+ @offset */ -+/* Audited by: green(2002.06.15) */ -+static int cut_formatting_items(struct inode *inode, loff_t offset, int count) -+{ -+ reiser4_key from, to; -+ -+ /* AUDIT: How about putting an assertion here, what would check -+ all provided range is covered by tail items only? */ -+ /* key of first byte in the range to be cut */ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from); -+ -+ /* key of last byte in that range */ -+ to = from; -+ set_key_offset(&to, (__u64) (offset + count - 1)); -+ -+ /* cut everything between those keys */ -+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to, -+ inode, 0); -+} -+ -+static void release_all_pages(struct page **pages, unsigned nr_pages) -+{ -+ unsigned i; -+ -+ for (i = 0; i < nr_pages; i++) { -+ if (pages[i] == NULL) { -+#if REISER4_DEBUG -+ unsigned j; -+ for (j = i + 1; j < nr_pages; j++) -+ assert("vs-1620", pages[j] == NULL); -+#endif -+ break; -+ } -+ page_cache_release(pages[i]); -+ pages[i] = NULL; -+ } -+} -+ -+/* part of tail2extent. replace tail items with an extent item. Content of tail -+ items (@count bytes) being cut is already copied into -+ pages. extent_writepage method is called to create extents corresponding to -+ those pages */ -+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count) -+{ -+ int result; -+ unsigned i; -+ STORE_COUNTERS; -+ -+ if (nr_pages == 0) -+ return 0; -+ -+ assert("vs-596", pages[0]); -+ -+ /* cut copied items */ -+ result = cut_formatting_items(inode, page_offset(pages[0]), count); -+ if (result) -+ return result; -+ -+ CHECK_COUNTERS; -+ -+ /* put into tree replacement for just removed items: extent item, namely */ -+ for (i = 0; i < nr_pages; i++) { -+ result = add_to_page_cache_lru(pages[i], inode->i_mapping, -+ pages[i]->index, -+ mapping_gfp_mask(inode-> -+ i_mapping)); -+ if (result) -+ break; -+ unlock_page(pages[i]); -+ result = find_or_create_extent(pages[i]); -+ if (result) -+ break; -+ SetPageUptodate(pages[i]); -+ } -+ return result; -+} -+ -+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail -+ * items */ -+ -+static int reserve_tail2extent_iteration(struct inode *inode) -+{ -+ reiser4_block_nr unformatted_nodes; -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ -+ /* number of unformatted nodes which will be created */ -+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM; -+ -+ /* -+ * space required for one iteration of tail->extent conversion: -+ * -+ * 1. kill N tail items -+ * -+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes -+ * -+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block -+ * extents) extent units. -+ * -+ * 4. drilling to the leaf level by coord_by_key() -+ * -+ * 5. 
possible update of stat-data -+ * -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (2 * tree->height + -+ TAIL2EXTENT_PAGE_NUM + -+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT); -+} -+ -+/* clear stat data's flag indicating that the file is being converted */ -+static int complete_conversion(struct inode *inode) -+{ -+ int result; -+ -+ grab_space_enable(); -+ result = -+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ if (result == 0) { -+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED); -+ result = reiser4_update_sd(inode); -+ } -+ if (result) -+ warning("vs-1696", "Failed to clear converting bit of %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ return 0; -+} -+ -+/** -+ * find_start -+ * @inode: -+ * @id: -+ * @offset: -+ * -+ * this is used by tail2extent and extent2tail to detect where previous -+ * uncompleted conversion stopped -+ */ -+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset) -+{ -+ int result; -+ lock_handle lh; -+ coord_t coord; -+ struct unix_file_info *ufo; -+ int found; -+ reiser4_key key; -+ -+ ufo = unix_file_inode_data(inode); -+ init_lh(&lh); -+ result = 0; -+ found = 0; -+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key); -+ do { -+ init_lh(&lh); -+ result = find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, inode); -+ -+ if (result == CBK_COORD_FOUND) { -+ if (coord.between == AT_UNIT) { -+ /*coord_clear_iplug(&coord); */ -+ result = zload(coord.node); -+ if (result == 0) { -+ if (item_id_by_coord(&coord) == id) -+ found = 1; -+ else -+ item_plugin_by_coord(&coord)->s. -+ file.append_key(&coord, -+ &key); -+ zrelse(coord.node); -+ } -+ } else -+ result = RETERR(-ENOENT); -+ } -+ done_lh(&lh); -+ } while (result == 0 && !found); -+ *offset = get_key_offset(&key); -+ return result; -+} -+ -+/** -+ * tail2extent -+ * @uf_info: -+ * -+ * -+ */ -+int tail2extent(struct unix_file_info *uf_info) -+{ -+ int result; -+ reiser4_key key; /* key of next byte to be moved to page */ -+ char *p_data; /* data of page */ -+ unsigned page_off = 0, /* offset within the page where to copy data */ -+ count; /* number of bytes of item which can be -+ * copied to page */ -+ struct page *pages[TAIL2EXTENT_PAGE_NUM]; -+ struct page *page; -+ int done; /* set to 1 when all file is read */ -+ char *item; -+ int i; -+ struct inode *inode; -+ int first_iteration; -+ int bytes; -+ __u64 offset; -+ -+ assert("nikita-3362", ea_obtained(uf_info)); -+ inode = unix_file_info_to_inode(uf_info); -+ assert("nikita-3412", !IS_RDONLY(inode)); -+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS); -+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)); -+ -+ offset = 0; -+ first_iteration = 1; -+ result = 0; -+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * file is marked on disk as there was a conversion which did -+ * not complete due to either crash or some error. 
Find which -+ * offset tail conversion stopped at -+ */ -+ result = find_start(inode, FORMATTING_ID, &offset); -+ if (result == -ENOENT) { -+ /* no tail items found, everything is converted */ -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ complete_conversion(inode); -+ return 0; -+ } else if (result != 0) -+ /* some other error */ -+ return result; -+ first_iteration = 0; -+ } -+ -+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV); -+ -+ /* get key of first byte of a file */ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key); -+ -+ done = 0; -+ while (done == 0) { -+ memset(pages, 0, sizeof(pages)); -+ result = reserve_tail2extent_iteration(inode); -+ if (result != 0) { -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ goto out; -+ } -+ if (first_iteration) { -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ reiser4_update_sd(inode); -+ first_iteration = 0; -+ } -+ bytes = 0; -+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) { -+ assert("vs-598", -+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0); -+ page = alloc_page(reiser4_ctx_gfp_mask_get()); -+ if (!page) { -+ result = RETERR(-ENOMEM); -+ goto error; -+ } -+ -+ page->index = -+ (unsigned long)(get_key_offset(&key) >> -+ PAGE_CACHE_SHIFT); -+ /* -+ * usually when one is going to longterm lock znode (as -+ * find_file_item does, for instance) he must not hold -+ * locked pages. However, there is an exception for -+ * case tail2extent. Pages appearing here are not -+ * reachable to everyone else, they are clean, they do -+ * not have jnodes attached so keeping them locked does -+ * not risk deadlock appearance -+ */ -+ assert("vs-983", !PagePrivate(page)); -+ reiser4_invalidate_pages(inode->i_mapping, page->index, -+ 1, 0); -+ -+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) { -+ coord_t coord; -+ lock_handle lh; -+ -+ /* get next item */ -+ /* FIXME: we might want to readahead here */ -+ init_lh(&lh); -+ result = -+ find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, -+ inode); -+ if (result != CBK_COORD_FOUND) { -+ /* -+ * error happened or no items of file -+ * were found -+ */ -+ done_lh(&lh); -+ page_cache_release(page); -+ goto error; -+ } -+ -+ if (coord.between == AFTER_UNIT) { -+ /* -+ * end of file is reached. 
Pad page -+ * with zeros -+ */ -+ done_lh(&lh); -+ done = 1; -+ p_data = kmap_atomic(page, KM_USER0); -+ memset(p_data + page_off, 0, -+ PAGE_CACHE_SIZE - page_off); -+ kunmap_atomic(p_data, KM_USER0); -+ break; -+ } -+ -+ result = zload(coord.node); -+ if (result) { -+ page_cache_release(page); -+ done_lh(&lh); -+ goto error; -+ } -+ assert("vs-856", coord.between == AT_UNIT); -+ item = ((char *)item_body_by_coord(&coord)) + -+ coord.unit_pos; -+ -+ /* how many bytes to copy */ -+ count = -+ item_length_by_coord(&coord) - -+ coord.unit_pos; -+ /* limit length of copy to end of page */ -+ if (count > PAGE_CACHE_SIZE - page_off) -+ count = PAGE_CACHE_SIZE - page_off; -+ -+ /* -+ * copy item (as much as will fit starting from -+ * the beginning of the item) into the page -+ */ -+ p_data = kmap_atomic(page, KM_USER0); -+ memcpy(p_data + page_off, item, count); -+ kunmap_atomic(p_data, KM_USER0); -+ -+ page_off += count; -+ bytes += count; -+ set_key_offset(&key, -+ get_key_offset(&key) + count); -+ -+ zrelse(coord.node); -+ done_lh(&lh); -+ } /* end of loop which fills one page by content of -+ * formatting items */ -+ -+ if (page_off) { -+ /* something was copied into page */ -+ pages[i] = page; -+ } else { -+ page_cache_release(page); -+ assert("vs-1648", done == 1); -+ break; -+ } -+ } /* end of loop through pages of one conversion iteration */ -+ -+ if (i > 0) { -+ result = replace(inode, pages, i, bytes); -+ release_all_pages(pages, sizeof_array(pages)); -+ if (result) -+ goto error; -+ /* -+ * We have to drop exclusive access to avoid deadlock -+ * which may happen because capture_unix_file, called -+ * by reiser4_writepages, requires to get non-exclusive -+ * access to a file. It is safe to drop EA in the middle -+ * of tail2extent conversion because write_unix_file, -+ * setattr_unix_file(truncate), mmap_unix_file, -+ * release_unix_file(extent2tail) check that conversion -+ * is not in progress (see comments before -+ * get_exclusive_access_careful()). -+ * Other processes that acquire non-exclusive access -+ * (read_unix_file, reiser4_writepages, etc) should work -+ * on partially converted files. -+ */ -+ drop_exclusive_access(uf_info); -+ /* throttle the conversion */ -+ reiser4_throttle_write(inode); -+ get_exclusive_access(uf_info); -+ -+ /* -+ * nobody is allowed to complete conversion but a -+ * process which started it -+ */ -+ assert("", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ } -+ } -+ if (result == 0) { -+ /* file is converted to extent items */ -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ assert("vs-1697", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ complete_conversion(inode); -+ } else { -+ /* -+ * conversion is not complete. Inode was already marked as -+ * REISER4_PART_MIXED and stat-data were updated at the first -+ * iteration of the loop above. 
-+ */ -+ error: -+ release_all_pages(pages, sizeof_array(pages)); -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ warning("edward-1548", "Partial conversion of %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ } -+ -+ out: -+ /* this flag should be cleared, otherwise get_exclusive_access_careful() -+ will fall into an infinite loop */ -+ assert("edward-1549", !reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)); -+ return result; -+} -+ -+static int reserve_extent2tail_iteration(struct inode *inode) -+{ -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ /* -+ * reserve blocks for (in this order): -+ * -+ * 1. removal of extent item -+ * -+ * 2. insertion of tail by insert_flow() -+ * -+ * 3. drilling to the leaf level by coord_by_key() -+ * -+ * 4. possible update of stat-data -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (estimate_one_item_removal(tree) + -+ estimate_insert_flow(tree->height) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT); -+} -+ -+/* for every page of file: read page, cut part of extent pointing to this page, -+ put data of page into tree by tail item */ -+int extent2tail(struct file * file, struct unix_file_info *uf_info) -+{ -+ int result; -+ struct inode *inode; -+ struct page *page; -+ unsigned long num_pages, i; -+ unsigned long start_page; -+ reiser4_key from; -+ reiser4_key to; -+ unsigned count; -+ __u64 offset; -+ -+ assert("nikita-3362", ea_obtained(uf_info)); -+ inode = unix_file_info_to_inode(uf_info); -+ assert("nikita-3412", !IS_RDONLY(inode)); -+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS); -+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)); -+ -+ offset = 0; -+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * file is marked on disk as there was a conversion which did -+ * not complete due to either crash or some error. Find which -+ * offset tail conversion stopped at -+ */ -+ result = find_start(inode, EXTENT_POINTER_ID, &offset); -+ if (result == -ENOENT) { -+ /* no extent found, everything is converted */ -+ uf_info->container = UF_CONTAINER_TAILS; -+ complete_conversion(inode); -+ return 0; -+ } else if (result != 0) -+ /* some other error */ -+ return result; -+ } -+ -+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV); -+ -+ /* number of pages in the file */ -+ num_pages = -+ (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+ start_page = offset >> PAGE_CACHE_SHIFT; -+ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from); -+ to = from; -+ -+ result = 0; -+ for (i = 0; i < num_pages; i++) { -+ __u64 start_byte; -+ -+ result = reserve_extent2tail_iteration(inode); -+ if (result != 0) -+ break; -+ if (i == 0 && offset == 0) { -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ reiser4_update_sd(inode); -+ } -+ -+ page = read_mapping_page(inode->i_mapping, -+ (unsigned)(i + start_page), NULL); -+ if (IS_ERR(page)) { -+ result = PTR_ERR(page); -+ break; -+ } -+ -+ wait_on_page_locked(page); -+ -+ if (!PageUptodate(page)) { -+ page_cache_release(page); -+ result = RETERR(-EIO); -+ break; -+ } -+ -+ /* cut part of file we have read */ -+ start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT); -+ set_key_offset(&from, start_byte); -+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1); -+ /* -+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom -+ * commits during over-long truncates. 
But -+ * extent->tail conversion should be performed in one -+ * transaction. -+ */ -+ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, -+ &to, inode, 0); -+ -+ if (result) { -+ page_cache_release(page); -+ break; -+ } -+ -+ /* put page data into tree via tail_write */ -+ count = PAGE_CACHE_SIZE; -+ if ((i == (num_pages - 1)) && -+ (inode->i_size & ~PAGE_CACHE_MASK)) -+ /* last page can be incomplete */ -+ count = (inode->i_size & ~PAGE_CACHE_MASK); -+ while (count) { -+ loff_t pos = start_byte; -+ -+ assert("edward-1537", -+ file != NULL && file->f_dentry != NULL); -+ assert("edward-1538", -+ file->f_dentry->d_inode == inode); -+ -+ result = reiser4_write_tail(file, inode, -+ (char __user *)kmap(page), -+ count, &pos); -+ reiser4_free_file_fsdata(file); -+ if (result <= 0) { -+ warning("", "reiser4_write_tail failed"); -+ page_cache_release(page); -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ return result; -+ } -+ count -= result; -+ } -+ -+ /* release page */ -+ lock_page(page); -+ /* page is already detached from jnode and mapping. */ -+ assert("vs-1086", page->mapping == NULL); -+ assert("nikita-2690", -+ (!PagePrivate(page) && jprivate(page) == 0)); -+ /* waiting for writeback completion with page lock held is -+ * perfectly valid. */ -+ wait_on_page_writeback(page); -+ reiser4_drop_page(page); -+ /* release reference taken by read_cache_page() above */ -+ page_cache_release(page); -+ -+ drop_exclusive_access(uf_info); -+ /* throttle the conversion */ -+ reiser4_throttle_write(inode); -+ get_exclusive_access(uf_info); -+ /* -+ * nobody is allowed to complete conversion but a process which -+ * started it -+ */ -+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED)); -+ } -+ -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ -+ if (i == num_pages) { -+ /* file is converted to formatted items */ -+ assert("vs-1698", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ assert("vs-1260", -+ inode_has_no_jnodes(reiser4_inode_data(inode))); -+ -+ uf_info->container = UF_CONTAINER_TAILS; -+ complete_conversion(inode); -+ return 0; -+ } -+ /* -+ * conversion is not complete. Inode was already marked as -+ * REISER4_PART_MIXED and stat-data were updated at the first -+ * iteration of the loop above. 
-+ */ -+ warning("nikita-2282", -+ "Partial conversion of %llu: %lu of %lu: %i", -+ (unsigned long long)get_inode_oid(inode), i, -+ num_pages, result); -+ -+ /* this flag should be cleared, otherwise get_exclusive_access_careful() -+ will fall into infinite loop */ -+ assert("edward-1550", !reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)); -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file_ops.c linux-2.6.30/fs/reiser4/plugin/file_ops.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file_ops.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,162 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for some of methods of -+ struct file_operations and of struct address_space_operations -+*/ -+ -+#include "../inode.h" -+#include "object.h" -+ -+/* file operations */ -+ -+/* implementation of vfs's llseek method of struct file_operations for -+ typical directory can be found in readdir_common.c -+*/ -+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin); -+ -+/* implementation of vfs's readdir method of struct file_operations for -+ typical directory can be found in readdir_common.c -+*/ -+int reiser4_readdir_common(struct file *, void *dirent, filldir_t); -+ -+/** -+ * reiser4_release_dir_common - release of struct file_operations -+ * @inode: inode of released file -+ * @file: file to release -+ * -+ * Implementation of release method of struct file_operations for typical -+ * directory. All it does is freeing of reiser4 specific file data. -+*/ -+int reiser4_release_dir_common(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ reiser4_free_file_fsdata(file); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/* this is common implementation of vfs's fsync method of struct -+ file_operations -+*/ -+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync) -+{ -+ reiser4_context *ctx; -+ int result; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* -+ * common sync method for regular files. -+ * -+ * We are trying to be smart here. Instead of committing all atoms (original -+ * solution), we scan dirty pages of this file and commit all atoms they are -+ * part of. -+ * -+ * Situation is complicated by anonymous pages: i.e., extent-less pages -+ * dirtied through mmap. Fortunately sys_fsync() first calls -+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert -+ * all missing extents and capture anonymous pages. 
-+ */ -+int reiser4_sync_file_common(struct file *file, -+ struct dentry *dentry, int datasync) -+{ -+ reiser4_context *ctx; -+ txn_atom *atom; -+ reiser4_block_nr reserve; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ reserve = estimate_update_common(dentry->d_inode); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOSPC); -+ } -+ write_sd_by_inode_common(dentry->d_inode); -+ -+ atom = get_current_atom_locked(); -+ spin_lock_txnh(ctx->trans); -+ force_commit_atom(ctx->trans); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+ -+/* address space operations */ -+ -+ -+/* this is helper for plugin->write_begin() */ -+int do_prepare_write(struct file *file, struct page *page, unsigned from, -+ unsigned to) -+{ -+ int result; -+ file_plugin *fplug; -+ struct inode *inode; -+ -+ assert("umka-3099", file != NULL); -+ assert("umka-3100", page != NULL); -+ assert("umka-3095", PageLocked(page)); -+ -+ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page)) -+ return 0; -+ -+ inode = page->mapping->host; -+ fplug = inode_file_plugin(inode); -+ -+ if (page->mapping->a_ops->readpage == NULL) -+ return RETERR(-EINVAL); -+ -+ result = page->mapping->a_ops->readpage(file, page); -+ if (result != 0) { -+ SetPageError(page); -+ ClearPageUptodate(page); -+ /* All reiser4 readpage() implementations should return the -+ * page locked in case of error. */ -+ assert("nikita-3472", PageLocked(page)); -+ } else { -+ /* -+ * ->readpage() either: -+ * -+ * 1. starts IO against @page. @page is locked for IO in -+ * this case. -+ * -+ * 2. doesn't start IO. @page is unlocked. -+ * -+ * In either case, page should be locked. -+ */ -+ lock_page(page); -+ /* -+ * IO (if any) is completed at this point. Check for IO -+ * errors. -+ */ -+ if (!PageUptodate(page)) -+ result = RETERR(-EIO); -+ } -+ assert("umka-3098", PageLocked(page)); -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.30/fs/reiser4/plugin/file_ops_readdir.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file_ops_readdir.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,658 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../inode.h" -+ -+/* return true, iff @coord points to the valid directory item that is part of -+ * @inode directory. */ -+static int is_valid_dir_coord(struct inode *inode, coord_t *coord) -+{ -+ return plugin_of_group(item_plugin_by_coord(coord), -+ DIR_ENTRY_ITEM_TYPE) && -+ inode_file_plugin(inode)->owns_item(inode, coord); -+} -+ -+/* compare two logical positions within the same directory */ -+static cmp_t dir_pos_cmp(const struct dir_pos *p1, const struct dir_pos *p2) -+{ -+ cmp_t result; -+ -+ assert("nikita-2534", p1 != NULL); -+ assert("nikita-2535", p2 != NULL); -+ -+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key); -+ if (result == EQUAL_TO) { -+ int diff; -+ -+ diff = p1->pos - p2->pos; -+ result = -+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO); -+ } -+ return result; -+} -+ -+/* see comment before reiser4_readdir_common() for overview of why "adjustment" -+ * is necessary. 
*/ -+static void -+adjust_dir_pos(struct file *dir, struct readdir_pos *readdir_spot, -+ const struct dir_pos *mod_point, int adj) -+{ -+ struct dir_pos *pos; -+ -+ /* -+ * new directory entry was added (adj == +1) or removed (adj == -1) at -+ * the @mod_point. Directory file descriptor @dir is doing readdir and -+ * is currently positioned at @readdir_spot. Latter has to be updated -+ * to maintain stable readdir. -+ */ -+ /* directory is positioned to the beginning. */ -+ if (readdir_spot->entry_no == 0) -+ return; -+ -+ pos = &readdir_spot->position; -+ switch (dir_pos_cmp(mod_point, pos)) { -+ case LESS_THAN: -+ /* @mod_pos is _before_ @readdir_spot, that is, entry was -+ * added/removed on the left (in key order) of current -+ * position. */ -+ /* logical number of directory entry readdir is "looking" at -+ * changes */ -+ readdir_spot->entry_no += adj; -+ assert("nikita-2577", -+ ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0)); -+ if (de_id_cmp(&pos->dir_entry_key, -+ &mod_point->dir_entry_key) == EQUAL_TO) { -+ assert("nikita-2575", mod_point->pos < pos->pos); -+ /* -+ * if entry added/removed has the same key as current -+ * for readdir, update counter of duplicate keys in -+ * @readdir_spot. -+ */ -+ pos->pos += adj; -+ } -+ break; -+ case GREATER_THAN: -+ /* directory is modified after @pos: nothing to do. */ -+ break; -+ case EQUAL_TO: -+ /* cannot insert an entry readdir is looking at, because it -+ already exists. */ -+ assert("nikita-2576", adj < 0); -+ /* directory entry to which @pos points to is being -+ removed. -+ -+ NOTE-NIKITA: Right thing to do is to update @pos to point -+ to the next entry. This is complex (we are under spin-lock -+ for one thing). Just rewind it to the beginning. Next -+ readdir will have to scan the beginning of -+ directory. Proper solution is to use semaphore in -+ spin lock's stead and use rewind_right() here. -+ -+ NOTE-NIKITA: now, semaphore is used, so... -+ */ -+ memset(readdir_spot, 0, sizeof *readdir_spot); -+ } -+} -+ -+/* scan all file-descriptors for this directory and adjust their -+ positions respectively. Should be used by implementations of -+ add_entry and rem_entry of dir plugin */ -+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de, -+ int offset, int adj) -+{ -+ reiser4_file_fsdata *scan; -+ struct dir_pos mod_point; -+ -+ assert("nikita-2536", dir != NULL); -+ assert("nikita-2538", de != NULL); -+ assert("nikita-2539", adj != 0); -+ -+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key); -+ mod_point.pos = offset; -+ -+ spin_lock_inode(dir); -+ -+ /* -+ * new entry was added/removed in directory @dir. Scan all file -+ * descriptors for @dir that are currently involved into @readdir and -+ * update them. -+ */ -+ -+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage) -+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj); -+ -+ spin_unlock_inode(dir); -+} -+ -+/* -+ * traverse tree to start/continue readdir from the readdir position @pos. 
-+ */ -+static int dir_go_to(struct file *dir, struct readdir_pos *pos, tap_t *tap) -+{ -+ reiser4_key key; -+ int result; -+ struct inode *inode; -+ -+ assert("nikita-2554", pos != NULL); -+ -+ inode = dir->f_dentry->d_inode; -+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key); -+ if (result != 0) -+ return result; -+ result = reiser4_object_lookup(inode, -+ &key, -+ tap->coord, -+ tap->lh, -+ tap->mode, -+ FIND_EXACT, -+ LEAF_LEVEL, LEAF_LEVEL, -+ 0, &tap->ra_info); -+ if (result == CBK_COORD_FOUND) -+ result = rewind_right(tap, (int)pos->position.pos); -+ else { -+ tap->coord->node = NULL; -+ done_lh(tap->lh); -+ result = RETERR(-EIO); -+ } -+ return result; -+} -+ -+/* -+ * handling of non-unique keys: calculate at what ordinal position within -+ * sequence of directory items with identical keys @pos is. -+ */ -+static int set_pos(struct inode *inode, struct readdir_pos *pos, tap_t *tap) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ tap_t scan; -+ de_id *did; -+ reiser4_key de_key; -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK); -+ reiser4_tap_copy(&scan, tap); -+ reiser4_tap_load(&scan); -+ pos->position.pos = 0; -+ -+ did = &pos->position.dir_entry_key; -+ -+ if (is_valid_dir_coord(inode, scan.coord)) { -+ -+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did); -+ -+ while (1) { -+ -+ result = go_prev_unit(&scan); -+ if (result != 0) -+ break; -+ -+ if (!is_valid_dir_coord(inode, scan.coord)) { -+ result = -EINVAL; -+ break; -+ } -+ -+ /* get key of directory entry */ -+ unit_key_by_coord(scan.coord, &de_key); -+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) { -+ /* duplicate-sequence is over */ -+ break; -+ } -+ pos->position.pos++; -+ } -+ } else -+ result = RETERR(-ENOENT); -+ reiser4_tap_relse(&scan); -+ reiser4_tap_done(&scan); -+ return result; -+} -+ -+/* -+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly. -+ */ -+static int dir_rewind(struct file *dir, struct readdir_pos *pos, tap_t *tap) -+{ -+ __u64 destination; -+ __s64 shift; -+ int result; -+ struct inode *inode; -+ loff_t dirpos; -+ -+ assert("nikita-2553", dir != NULL); -+ assert("nikita-2548", pos != NULL); -+ assert("nikita-2551", tap->coord != NULL); -+ assert("nikita-2552", tap->lh != NULL); -+ -+ dirpos = reiser4_get_dir_fpos(dir); -+ shift = dirpos - pos->fpos; -+ /* this is logical directory entry within @dir which we are rewinding -+ * to */ -+ destination = pos->entry_no + shift; -+ -+ inode = dir->f_dentry->d_inode; -+ if (dirpos < 0) -+ return RETERR(-EINVAL); -+ else if (destination == 0ll || dirpos == 0) { -+ /* rewind to the beginning of directory */ -+ memset(pos, 0, sizeof *pos); -+ return dir_go_to(dir, pos, tap); -+ } else if (destination >= inode->i_size) -+ return RETERR(-ENOENT); -+ -+ if (shift < 0) { -+ /* I am afraid of negative numbers */ -+ shift = -shift; -+ /* rewinding to the left */ -+ if (shift <= (int)pos->position.pos) { -+ /* destination is within sequence of entries with -+ duplicate keys. */ -+ result = dir_go_to(dir, pos, tap); -+ } else { -+ shift -= pos->position.pos; -+ while (1) { -+ /* repetitions: deadlock is possible when -+ going to the left. 
*/ -+ result = dir_go_to(dir, pos, tap); -+ if (result == 0) { -+ result = rewind_left(tap, shift); -+ if (result == -E_DEADLOCK) { -+ reiser4_tap_done(tap); -+ continue; -+ } -+ } -+ break; -+ } -+ } -+ } else { -+ /* rewinding to the right */ -+ result = dir_go_to(dir, pos, tap); -+ if (result == 0) -+ result = rewind_right(tap, shift); -+ } -+ if (result == 0) { -+ result = set_pos(inode, pos, tap); -+ if (result == 0) { -+ /* update pos->position.pos */ -+ pos->entry_no = destination; -+ pos->fpos = dirpos; -+ } -+ } -+ return result; -+} -+ -+/* -+ * Function that is called by common_readdir() on each directory entry while -+ * doing readdir. ->filldir callback may block, so we had to release long term -+ * lock while calling it. To avoid repeating tree traversal, seal is used. If -+ * seal is broken, we return -E_REPEAT. Node is unlocked in this case. -+ * -+ * Whether node is unlocked in case of any other error is undefined. It is -+ * guaranteed to be still locked if success (0) is returned. -+ * -+ * When ->filldir() wants no more, feed_entry() returns 1, and node is -+ * unlocked. -+ */ -+static int -+feed_entry(struct file *f, struct readdir_pos *pos, tap_t *tap, -+ filldir_t filldir, void *dirent) -+{ -+ item_plugin *iplug; -+ char *name; -+ reiser4_key sd_key; -+ int result; -+ char buf[DE_NAME_BUF_LEN]; -+ char name_buf[32]; -+ char *local_name; -+ unsigned file_type; -+ seal_t seal; -+ coord_t *coord; -+ reiser4_key entry_key; -+ -+ coord = tap->coord; -+ iplug = item_plugin_by_coord(coord); -+ -+ /* pointer to name within the node */ -+ name = iplug->s.dir.extract_name(coord, buf); -+ assert("nikita-1371", name != NULL); -+ -+ /* key of object the entry points to */ -+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0) -+ return RETERR(-EIO); -+ -+ /* we must release longterm znode lock before calling filldir to avoid -+ deadlock which may happen if filldir causes page fault. So, copy -+ name to intermediate buffer */ -+ if (strlen(name) + 1 > sizeof(name_buf)) { -+ local_name = kmalloc(strlen(name) + 1, -+ reiser4_ctx_gfp_mask_get()); -+ if (local_name == NULL) -+ return RETERR(-ENOMEM); -+ } else -+ local_name = name_buf; -+ -+ strcpy(local_name, name); -+ file_type = iplug->s.dir.extract_file_type(coord); -+ -+ unit_key_by_coord(coord, &entry_key); -+ reiser4_seal_init(&seal, coord, &entry_key); -+ -+ longterm_unlock_znode(tap->lh); -+ -+ /* -+ * send information about directory entry to the ->filldir() filler -+ * supplied to us by caller (VFS). -+ * -+ * ->filldir is entitled to do weird things. For example, ->filldir -+ * supplied by knfsd re-enters file system. Make sure no locks are -+ * held. -+ */ -+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack())); -+ -+ reiser4_txn_restart_current(); -+ result = filldir(dirent, name, (int)strlen(name), -+ /* offset of this entry */ -+ f->f_pos, -+ /* inode number of object bounden by this entry */ -+ oid_to_uino(get_key_objectid(&sd_key)), file_type); -+ if (local_name != name_buf) -+ kfree(local_name); -+ if (result < 0) -+ /* ->filldir() is satisfied. 
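[feed_entry() above is an instance of a more general pattern: drop a long-term lock before calling a callback that may block, then use a cheap seal to decide whether the cached tree position is still valid. A self-contained sketch of the pattern follows; the types and helpers are stand-ins, not the real reiser4 seal API.]

#include <errno.h>

/* stand-ins: a node whose version is bumped on every modification,
 * and a seal snapshotting that version */
struct node { unsigned version; };
struct seal { unsigned version; };

static void seal_init(struct seal *s, const struct node *n)
{
        s->version = n->version;
}

static int seal_valid(const struct seal *s, const struct node *n)
{
        return s->version == n->version;
}

/*
 * Returns 1 when the consumer wants no more, 0 when the position is
 * still good, -EAGAIN (standing in for -E_REPEAT) when the caller
 * must re-traverse from the key.
 */
static int feed_one(struct node *n, int (*cb)(void *), void *arg)
{
        struct seal s;

        seal_init(&s, n);
        /* the long-term lock is dropped here; cb() may block or fault */
        if (cb(arg) < 0)
                return 1;
        return seal_valid(&s, n) ? 0 : -EAGAIN;
}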
(no space in buffer, IOW) */ -+ result = 1; -+ else -+ result = reiser4_seal_validate(&seal, coord, &entry_key, -+ tap->lh, tap->mode, -+ ZNODE_LOCK_HIPRI); -+ return result; -+} -+ -+static void move_entry(struct readdir_pos *pos, coord_t *coord) -+{ -+ reiser4_key de_key; -+ de_id *did; -+ -+ /* update @pos */ -+ ++pos->entry_no; -+ did = &pos->position.dir_entry_key; -+ -+ /* get key of directory entry */ -+ unit_key_by_coord(coord, &de_key); -+ -+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO) -+ /* we are within sequence of directory entries -+ with duplicate keys. */ -+ ++pos->position.pos; -+ else { -+ pos->position.pos = 0; -+ build_de_id_by_key(&de_key, did); -+ } -+ ++pos->fpos; -+} -+ -+/* -+ * STATELESS READDIR -+ * -+ * readdir support in reiser4 relies on ability to update readdir_pos embedded -+ * into reiser4_file_fsdata on each directory modification (name insertion and -+ * removal), see reiser4_readdir_common() function below. This obviously doesn't -+ * work when reiser4 is accessed over NFS, because NFS doesn't keep any state -+ * across client READDIR requests for the same directory. -+ * -+ * To address this we maintain a "pool" of detached reiser4_file_fsdata -+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to -+ * find detached reiser4_file_fsdata corresponding to previous readdir -+ * request. In other words, additional state is maintained on the -+ * server. (This is somewhat contrary to the design goals of NFS protocol.) -+ * -+ * To efficiently detect when our ->readdir() method is called by NFS server, -+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by -+ * file_is_stateless() function). -+ * -+ * To find out d_cursor in the pool, we encode client id (cid) in the highest -+ * bits of NFS readdir cookie: when first readdir request comes to the given -+ * directory from the given client, cookie is set to 0. This situation is -+ * detected, global cid_counter is incremented, and stored in highest bits of -+ * all direntry offsets returned to the client, including last one. As the -+ * only valid readdir cookie is one obtained as direntry->offset, we are -+ * guaranteed that next readdir request (continuing current one) will have -+ * current cid in the highest bits of starting readdir cookie. All d_cursors -+ * are hashed into per-super-block hash table by (oid, cid) key. -+ * -+ * In addition d_cursors are placed into per-super-block radix tree where they -+ * are keyed by oid alone. This is necessary to efficiently remove them during -+ * rmdir. -+ * -+ * At last, currently unused d_cursors are linked into special list. This list -+ * is used d_cursor_shrink to reclaim d_cursors on memory pressure. -+ * -+ */ -+ -+/* -+ * prepare for readdir. -+ */ -+static int dir_readdir_init(struct file *f, tap_t *tap, -+ struct readdir_pos **pos) -+{ -+ struct inode *inode; -+ reiser4_file_fsdata *fsdata; -+ int result; -+ -+ assert("nikita-1359", f != NULL); -+ inode = f->f_dentry->d_inode; -+ assert("nikita-1360", inode != NULL); -+ -+ if (!S_ISDIR(inode->i_mode)) -+ return RETERR(-ENOTDIR); -+ -+ /* try to find detached readdir state */ -+ result = reiser4_attach_fsdata(f, inode); -+ if (result != 0) -+ return result; -+ -+ fsdata = reiser4_get_file_fsdata(f); -+ assert("nikita-2571", fsdata != NULL); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ /* add file descriptor to the readdir list hanging of directory -+ * inode. 
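[The cookie scheme described in the STATELESS READDIR comment is plain bit-packing. The sketch below assumes a 16/48 split between client id and offset purely for illustration; the real constants live in reiser4's d_cursor code and may differ.]

#include <stdint.h>

#define CID_BITS        16                              /* assumed split */
#define OFF_BITS        (64 - CID_BITS)
#define OFF_MASK        ((UINT64_C(1) << OFF_BITS) - 1)

/* every d_off handed to the client carries the client id in the high
 * bits, so the continuation request finds the same detached cursor */
static uint64_t cookie_pack(uint64_t cid, uint64_t off)
{
        return (cid << OFF_BITS) | (off & OFF_MASK);
}

static uint64_t cookie_cid(uint64_t cookie)
{
        return cookie >> OFF_BITS;
}

static uint64_t cookie_off(uint64_t cookie)
{
        return cookie & OFF_MASK;
}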
This list is used to scan "readdirs-in-progress" while -+ * inserting or removing names in the directory. */ -+ spin_lock_inode(inode); -+ if (list_empty_careful(&fsdata->dir.linkage)) -+ list_add(&fsdata->dir.linkage, get_readdir_list(inode)); -+ *pos = &fsdata->dir.readdir; -+ spin_unlock_inode(inode); -+ -+ /* move @tap to the current position */ -+ return dir_rewind(f, *pos, tap); -+} -+ -+/* this is implementation of vfs's llseek method of struct file_operations for -+ typical directory -+ See comment before reiser4_readdir_common() for explanation. -+*/ -+loff_t reiser4_llseek_dir_common(struct file *file, loff_t off, int origin) -+{ -+ reiser4_context *ctx; -+ loff_t result; -+ struct inode *inode; -+ -+ inode = file->f_dentry->d_inode; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ mutex_lock(&inode->i_mutex); -+ -+ /* update ->f_pos */ -+ result = default_llseek(file, off, origin); -+ if (result >= 0) { -+ int ff; -+ coord_t coord; -+ lock_handle lh; -+ tap_t tap; -+ struct readdir_pos *pos; -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); -+ -+ ff = dir_readdir_init(file, &tap, &pos); -+ reiser4_detach_fsdata(file); -+ if (ff != 0) -+ result = (loff_t) ff; -+ reiser4_tap_done(&tap); -+ } -+ reiser4_detach_fsdata(file); -+ mutex_unlock(&inode->i_mutex); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* this is common implementation of vfs's readdir method of struct -+ file_operations -+ -+ readdir problems: -+ -+ readdir(2)/getdents(2) interface is based on implicit assumption that -+ readdir can be restarted from any particular point by supplying file system -+ with off_t-full of data. That is, file system fills ->d_off field in struct -+ dirent and later user passes ->d_off to the seekdir(3), which is, actually, -+ implemented by glibc as lseek(2) on directory. -+ -+ Reiser4 cannot restart readdir from 64 bits of data, because two last -+ components of the key of directory entry are unknown, which given 128 bits: -+ locality and type fields in the key of directory entry are always known, to -+ start readdir() from given point objectid and offset fields have to be -+ filled. -+ -+ Traditional UNIX API for scanning through directory -+ (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the -+ assumption that directory is structured very much like regular file, in -+ particular, it is implied that each name within given directory (directory -+ entry) can be uniquely identified by scalar offset and that such offset is -+ stable across the life-time of the name is identifies. -+ -+ This is manifestly not so for reiser4. In reiser4 the only stable unique -+ identifies for the directory entry is its key that doesn't fit into -+ seekdir/telldir API. -+ -+ solution: -+ -+ Within each file descriptor participating in readdir-ing of directory -+ plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of -+ the "current" directory entry that file descriptor looks at. It contains a -+ key of directory entry (plus some additional info to deal with non-unique -+ keys that we wouldn't dwell onto here) and a logical position of this -+ directory entry starting from the beginning of the directory, that is -+ ordinal number of this entry in the readdir order. -+ -+ Obviously this logical position is not stable in the face of directory -+ modifications. 
To work around this, on each addition or removal of directory -+ entry all file descriptors for directory inode are scanned and their -+ readdir_pos are updated accordingly (adjust_dir_pos()). -+*/ -+int reiser4_readdir_common(struct file *f /* directory file being read */, -+ void *dirent /* opaque data passed to us by VFS */, -+ filldir_t filld /* filler function passed to us -+ * by VFS */) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ coord_t coord; -+ lock_handle lh; -+ tap_t tap; -+ struct readdir_pos *pos; -+ -+ assert("nikita-1359", f != NULL); -+ inode = f->f_dentry->d_inode; -+ assert("nikita-1360", inode != NULL); -+ -+ if (!S_ISDIR(inode->i_mode)) -+ return RETERR(-ENOTDIR); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); -+ -+ reiser4_readdir_readahead_init(inode, &tap); -+ -+repeat: -+ result = dir_readdir_init(f, &tap, &pos); -+ if (result == 0) { -+ result = reiser4_tap_load(&tap); -+ /* scan entries one by one feeding them to @filld */ -+ while (result == 0) { -+ coord_t *coord; -+ -+ coord = tap.coord; -+ assert("nikita-2572", coord_is_existing_unit(coord)); -+ assert("nikita-3227", is_valid_dir_coord(inode, coord)); -+ -+ result = feed_entry(f, pos, &tap, filld, dirent); -+ if (result > 0) { -+ break; -+ } else if (result == 0) { -+ ++f->f_pos; -+ result = go_next_unit(&tap); -+ if (result == -E_NO_NEIGHBOR || -+ result == -ENOENT) { -+ result = 0; -+ break; -+ } else if (result == 0) { -+ if (is_valid_dir_coord(inode, coord)) -+ move_entry(pos, coord); -+ else -+ break; -+ } -+ } else if (result == -E_REPEAT) { -+ /* feed_entry() had to restart. */ -+ ++f->f_pos; -+ reiser4_tap_relse(&tap); -+ goto repeat; -+ } else -+ warning("vs-1617", -+ "reiser4_readdir_common: unexpected error %d", -+ result); -+ } -+ reiser4_tap_relse(&tap); -+ -+ if (result >= 0) -+ f->f_version = inode->i_version; -+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) -+ result = 0; -+ reiser4_tap_done(&tap); -+ reiser4_detach_fsdata(f); -+ -+ /* try to update directory's atime */ -+ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT) != 0) -+ warning("", "failed to update atime on readdir: %llu", -+ get_inode_oid(inode)); -+ else -+ file_accessed(f); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return (result <= 0) ? 
result : 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.30/fs/reiser4/plugin/file_plugin_common.c ---- linux-2.6.30.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/file_plugin_common.c 2009-06-22 17:27:31.000000000 +0200 -@@ -0,0 +1,1008 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for most of methods of -+ file plugin -+*/ -+ -+#include "../inode.h" -+#include "object.h" -+#include "../safe_link.h" -+ -+#include <linux/quotaops.h> -+ -+static int insert_new_sd(struct inode *inode); -+static int update_sd(struct inode *inode); -+ -+/* this is common implementation of write_sd_by_inode method of file plugin -+ either insert stat data or update it -+ */ -+int write_sd_by_inode_common(struct inode *inode/* object to save */) -+{ -+ int result; -+ -+ assert("nikita-730", inode != NULL); -+ -+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) -+ /* object doesn't have stat-data yet */ -+ result = insert_new_sd(inode); -+ else -+ result = update_sd(inode); -+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM) -+ /* Don't issue warnings about "name is too long" */ -+ warning("nikita-2221", "Failed to save sd for %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ return result; -+} -+ -+/* this is common implementation of key_by_inode method of file plugin -+ */ -+int -+key_by_inode_and_offset_common(struct inode *inode, loff_t off, -+ reiser4_key * key) -+{ -+ reiser4_key_init(key); -+ set_key_locality(key, reiser4_inode_data(inode)->locality_id); -+ set_key_ordering(key, get_inode_ordering(inode)); -+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */ -+ set_key_type(key, KEY_BODY_MINOR); -+ set_key_offset(key, (__u64) off); -+ return 0; -+} -+ -+/* this is common implementation of set_plug_in_inode method of file plugin -+ */ -+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ , -+ struct inode *parent /* parent object */ , -+ reiser4_object_create_data * data /* creational -+ * data */ ) -+{ -+ __u64 mask; -+ -+ object->i_mode = data->mode; -+ /* this should be plugin decision */ -+ object->i_uid = current->cred->fsuid; -+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME; -+ -+ /* support for BSD style group-id assignment. 
See mount's manual page -+ description of bsdgroups ext2 mount options for more details */ -+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID)) -+ object->i_gid = parent->i_gid; -+ else if (parent->i_mode & S_ISGID) { -+ /* parent directory has sguid bit */ -+ object->i_gid = parent->i_gid; -+ if (S_ISDIR(object->i_mode)) -+ /* sguid is inherited by sub-directories */ -+ object->i_mode |= S_ISGID; -+ } else -+ object->i_gid = current->cred->fsgid; -+ -+ /* this object doesn't have stat-data yet */ -+ reiser4_inode_set_flag(object, REISER4_NO_SD); -+#if 0 -+ /* this is now called after all inode plugins are initialized: -+ do_create_vfs_child after adjust_to_parent */ -+ /* setup inode and file-operations for this inode */ -+ setup_inode_ops(object, data); -+#endif -+ object->i_nlink = 0; -+ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL); -+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT); -+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES)) -+ mask |= (1 << LARGE_TIMES_STAT); -+ -+ reiser4_inode_data(object)->extmask = mask; -+ return 0; -+} -+ -+/* this is common implementation of adjust_to_parent method of file plugin for -+ regular files -+ */ -+int adjust_to_parent_common(struct inode *object /* new object */ , -+ struct inode *parent /* parent directory */ , -+ struct inode *root/* root directory */) -+{ -+ assert("nikita-2165", object != NULL); -+ if (parent == NULL) -+ parent = root; -+ assert("nikita-2069", parent != NULL); -+ -+ /* -+ * inherit missing plugins from parent -+ */ -+ -+ grab_plugin_pset(object, parent, PSET_FILE); -+ grab_plugin_pset(object, parent, PSET_SD); -+ grab_plugin_pset(object, parent, PSET_FORMATTING); -+ grab_plugin_pset(object, parent, PSET_PERM); -+ return 0; -+} -+ -+/* this is common implementation of adjust_to_parent method of file plugin for -+ typical directories -+ */ -+int adjust_to_parent_common_dir(struct inode *object /* new object */ , -+ struct inode *parent /* parent directory */ , -+ struct inode *root/* root directory */) -+{ -+ int result = 0; -+ pset_member memb; -+ -+ assert("nikita-2166", object != NULL); -+ if (parent == NULL) -+ parent = root; -+ assert("nikita-2167", parent != NULL); -+ -+ /* -+ * inherit missing plugins from parent -+ */ -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ result = grab_plugin_pset(object, parent, memb); -+ if (result != 0) -+ break; -+ } -+ return result; -+} -+ -+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ , -+ struct inode *parent /* parent directory */, -+ struct inode *root/* root directory */) -+{ -+ int result; -+ result = adjust_to_parent_common(object, parent, root); -+ if (result) -+ return result; -+ assert("edward-1416", parent != NULL); -+ -+ grab_plugin_pset(object, parent, PSET_CLUSTER); -+ grab_plugin_pset(object, parent, PSET_CIPHER); -+ grab_plugin_pset(object, parent, PSET_DIGEST); -+ grab_plugin_pset(object, parent, PSET_COMPRESSION); -+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE); -+ -+ return 0; -+} -+ -+/* this is common implementation of create_object method of file plugin -+ */ -+int reiser4_create_object_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data * data) -+{ -+ reiser4_block_nr reserve; -+ assert("nikita-744", object != NULL); -+ assert("nikita-745", parent != NULL); -+ assert("nikita-747", data != NULL); -+ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ reserve = estimate_create_common(object); -+ if (reiser4_grab_space(reserve, 
BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ return write_sd_by_inode_common(object); -+} -+ -+static int common_object_delete_no_reserve(struct inode *inode); -+ -+/** -+ * reiser4_delete_object_common - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * This is common implementation of delete_object method of file_plugin. It -+ * applies to object its deletion consists of removing two items - stat data -+ * and safe-link. -+ */ -+int reiser4_delete_object_common(struct inode *inode) -+{ -+ int result; -+ -+ assert("nikita-1477", inode != NULL); -+ /* FIXME: if file body deletion failed (i/o error, for instance), -+ inode->i_size can be != 0 here */ -+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode)); -+ assert("nikita-3421", inode->i_nlink == 0); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) { -+ reiser4_block_nr reserve; -+ -+ /* grab space which is needed to remove 2 items from the tree: -+ stat data and safe-link */ -+ reserve = 2 * -+ estimate_one_item_removal(reiser4_tree_by_inode(inode)); -+ if (reiser4_grab_space_force(reserve, -+ BA_RESERVED | BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ result = common_object_delete_no_reserve(inode); -+ } else -+ result = 0; -+ return result; -+} -+ -+/** -+ * reiser4_delete_dir_common - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * This is common implementation of delete_object method of file_plugin for -+ * typical directory. It calls done method of dir_plugin to remove "." and -+ * removes stat data and safe-link. -+ */ -+int reiser4_delete_dir_common(struct inode *inode) -+{ -+ int result; -+ dir_plugin *dplug; -+ -+ assert("", (get_current_context() && -+ get_current_context()->trans->atom == NULL)); -+ -+ dplug = inode_dir_plugin(inode); -+ assert("vs-1101", dplug && dplug->done); -+ -+ /* kill cursors which might be attached to inode */ -+ reiser4_kill_cursors(inode); -+ -+ /* grab space enough for removing two items */ -+ if (reiser4_grab_space -+ (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)), -+ BA_RESERVED | BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ result = dplug->done(inode); -+ if (!result) -+ result = common_object_delete_no_reserve(inode); -+ return result; -+} -+ -+/* this is common implementation of add_link method of file plugin -+ */ -+int reiser4_add_link_common(struct inode *object, struct inode *parent) -+{ -+ /* -+ * increment ->i_nlink and update ->i_ctime -+ */ -+ -+ INODE_INC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of rem_link method of file plugin -+ */ -+int reiser4_rem_link_common(struct inode *object, struct inode *parent) -+{ -+ assert("nikita-2021", object != NULL); -+ assert("nikita-2163", object->i_nlink > 0); -+ -+ /* -+ * decrement ->i_nlink and update ->i_ctime -+ */ -+ -+ INODE_DEC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of rem_link method of file plugin for typical -+ directory -+*/ -+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG) -+{ -+ assert("nikita-20211", object != NULL); -+ assert("nikita-21631", object->i_nlink > 0); -+ -+ /* -+ * decrement ->i_nlink and update ->i_ctime -+ */ -+ INODE_DEC_FIELD(object, i_nlink); -+ if (object->i_nlink == 1) -+ INODE_DEC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of owns_item method of file plugin -+ compare objectids of 
keys in inode and coord */ -+int owns_item_common(const struct inode *inode, /* object to check -+ * against */ -+ const coord_t *coord/* coord to check */) -+{ -+ reiser4_key item_key; -+ reiser4_key file_key; -+ -+ assert("nikita-760", inode != NULL); -+ assert("nikita-761", coord != NULL); -+ -+ return coord_is_existing_item(coord) && -+ (get_key_objectid(build_sd_key(inode, &file_key)) == -+ get_key_objectid(item_key_by_coord(coord, &item_key))); -+} -+ -+/* this is common implementation of owns_item method of file plugin -+ for typical directory -+*/ -+int owns_item_common_dir(const struct inode *inode,/* object to check against */ -+ const coord_t *coord/* coord of item to check */) -+{ -+ reiser4_key item_key; -+ -+ assert("nikita-1335", inode != NULL); -+ assert("nikita-1334", coord != NULL); -+ -+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE)) -+ return get_key_locality(item_key_by_coord(coord, &item_key)) == -+ get_inode_oid(inode); -+ else -+ return owns_item_common(inode, coord); -+} -+ -+/* this is common implementation of can_add_link method of file plugin -+ checks whether yet another hard links to this object can be added -+*/ -+int can_add_link_common(const struct inode *object/* object to check */) -+{ -+ assert("nikita-732", object != NULL); -+ -+ /* inode->i_nlink is unsigned int, so just check for integer -+ overflow */ -+ return object->i_nlink + 1 != 0; -+} -+ -+/* this is common implementation of can_rem_link method of file plugin for -+ typical directory -+*/ -+int can_rem_link_common_dir(const struct inode *inode) -+{ -+ /* is_dir_empty() returns 0 is dir is empty */ -+ return !is_dir_empty(inode); -+} -+ -+/* this is common implementation of detach method of file plugin for typical -+ directory -+*/ -+int reiser4_detach_common_dir(struct inode *child, struct inode *parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(child); -+ assert("nikita-2883", dplug != NULL); -+ assert("nikita-2884", dplug->detach != NULL); -+ return dplug->detach(child, parent); -+} -+ -+/* this is common implementation of bind method of file plugin for typical -+ directory -+*/ -+int reiser4_bind_common_dir(struct inode *child, struct inode *parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(child); -+ assert("nikita-2646", dplug != NULL); -+ return dplug->attach(child, parent); -+} -+ -+static int process_truncate(struct inode *, __u64 size); -+ -+/* this is common implementation of safelink method of file plugin -+ */ -+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value) -+{ -+ int result; -+ -+ assert("vs-1705", get_current_context()->trans->atom == NULL); -+ if (link == SAFE_UNLINK) -+ /* nothing to do. 
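[can_add_link_common() above relies on well-defined unsigned wrap-around: for an unsigned counter, n + 1 == 0 exactly at the maximum value. A two-line demonstration:]

#include <limits.h>
#include <stdio.h>

static int can_add_link(unsigned int nlink)
{
        return nlink + 1 != 0;          /* false only when nlink == UINT_MAX */
}

int main(void)
{
        printf("%d %d\n", can_add_link(0), can_add_link(UINT_MAX));
        /* prints "1 0": one more hard link would wrap the counter */
        return 0;
}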
iput() in the caller (process_safelink) will -+ * finish with file */ -+ result = 0; -+ else if (link == SAFE_TRUNCATE) -+ result = process_truncate(object, value); -+ else { -+ warning("nikita-3438", "Unrecognized safe-link type: %i", link); -+ result = RETERR(-EIO); -+ } -+ return result; -+} -+ -+/* this is common implementation of estimate.create method of file plugin -+ can be used when object creation involves insertion of one item (usually stat -+ data) into tree -+*/ -+reiser4_block_nr estimate_create_common(const struct inode *object) -+{ -+ return estimate_one_insert_item(reiser4_tree_by_inode(object)); -+} -+ -+/* this is common implementation of estimate.create method of file plugin for -+ typical directory -+ can be used when directory creation involves insertion of two items (usually -+ stat data and item containing "." and "..") into tree -+*/ -+reiser4_block_nr estimate_create_common_dir(const struct inode *object) -+{ -+ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object)); -+} -+ -+/* this is common implementation of estimate.update method of file plugin -+ can be used when stat data update does not do more than inserting a unit -+ into a stat data item which is probably true for most cases -+*/ -+reiser4_block_nr estimate_update_common(const struct inode *inode) -+{ -+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.unlink method of file plugin -+ */ -+reiser4_block_nr -+estimate_unlink_common(const struct inode *object UNUSED_ARG, -+ const struct inode *parent UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* this is common implementation of estimate.unlink method of file plugin for -+ typical directory -+*/ -+reiser4_block_nr -+estimate_unlink_common_dir(const struct inode *object, -+ const struct inode *parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(object); -+ assert("nikita-2888", dplug != NULL); -+ assert("nikita-2887", dplug->estimate.unlink != NULL); -+ return dplug->estimate.unlink(object, parent); -+} -+ -+char *wire_write_common(struct inode *inode, char *start) -+{ -+ return build_inode_onwire(inode, start); -+} -+ -+char *wire_read_common(char *addr, reiser4_object_on_wire * obj) -+{ -+ if (!obj) -+ return locate_obj_key_id_onwire(addr); -+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id); -+} -+ -+struct dentry *wire_get_common(struct super_block *sb, -+ reiser4_object_on_wire * obj) -+{ -+ struct inode *inode; -+ struct dentry *dentry; -+ reiser4_key key; -+ -+ extract_key_from_id(&obj->u.std.key_id, &key); -+ inode = reiser4_iget(sb, &key, 1); -+ if (!IS_ERR(inode)) { -+ reiser4_iget_complete(inode); -+ dentry = d_obtain_alias(inode); -+ if (!IS_ERR(dentry)) -+ dentry->d_op = &get_super_private(sb)->ops.dentry; -+ } else if (PTR_ERR(inode) == -ENOENT) -+ /* -+ * inode wasn't found at the key encoded in the file -+ * handle. Hence, file handle is stale. 
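[The estimate_* methods above all feed the same discipline visible throughout this file: compute a worst-case block count, grab it, and only then modify the tree. A schematic of how the two-item case (stat data plus safe-link) composes, with placeholder helpers:]

#include <errno.h>

static unsigned one_item_removal_cost(void)
{
        return 1;       /* placeholder for the per-tree estimate */
}

/* deletion removes two items, so the grab is twice the estimate */
static int delete_object(int (*grab)(unsigned), int (*do_delete)(void))
{
        unsigned reserve = 2 * one_item_removal_cost();

        if (grab(reserve))
                return -ENOSPC; /* fail before touching the tree */
        return do_delete();
}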
-+ */ -+ dentry = ERR_PTR(RETERR(-ESTALE)); -+ else -+ dentry = (void *)inode; -+ return dentry; -+} -+ -+int wire_size_common(struct inode *inode) -+{ -+ return inode_onwire_size(inode); -+} -+ -+void wire_done_common(reiser4_object_on_wire * obj) -+{ -+ /* nothing to do */ -+} -+ -+/* helper function to print errors */ -+static void key_warning(const reiser4_key * key /* key to print */ , -+ const struct inode *inode, -+ int code/* error code to print */) -+{ -+ assert("nikita-716", key != NULL); -+ -+ if (code != -ENOMEM) { -+ warning("nikita-717", "Error for inode %llu (%i)", -+ (unsigned long long)get_key_objectid(key), code); -+ reiser4_print_key("for key", key); -+ } -+} -+ -+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */ -+#if REISER4_DEBUG -+static void -+check_inode_seal(const struct inode *inode, -+ const coord_t *coord, const reiser4_key * key) -+{ -+ reiser4_key unit_key; -+ -+ unit_key_by_coord(coord, &unit_key); -+ assert("nikita-2752", -+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key))); -+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key)); -+} -+ -+static void check_sd_coord(coord_t *coord, const reiser4_key * key) -+{ -+ reiser4_key ukey; -+ -+ coord_clear_iplug(coord); -+ if (zload(coord->node)) -+ return; -+ -+ if (!coord_is_existing_unit(coord) || -+ !item_plugin_by_coord(coord) || -+ !keyeq(unit_key_by_coord(coord, &ukey), key) || -+ (znode_get_level(coord->node) != LEAF_LEVEL) || -+ !item_is_statdata(coord)) { -+ warning("nikita-1901", "Conspicuous seal"); -+ reiser4_print_key("key", key); -+ print_coord("coord", coord, 1); -+ impossible("nikita-2877", "no way"); -+ } -+ zrelse(coord->node); -+} -+ -+#else -+#define check_inode_seal(inode, coord, key) noop -+#define check_sd_coord(coord, key) noop -+#endif -+ -+/* insert new stat-data into tree. Called with inode state -+ locked. Return inode state locked. */ -+static int insert_new_sd(struct inode *inode/* inode to create sd for */) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ reiser4_item_data data; -+ char *area; -+ reiser4_inode *ref; -+ lock_handle lh; -+ oid_t oid; -+ -+ assert("nikita-723", inode != NULL); -+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ref = reiser4_inode_data(inode); -+ spin_lock_inode(inode); -+ -+ if (ref->plugin_mask != 0) -+ /* inode has non-standard plugins */ -+ inode_set_extension(inode, PLUGIN_STAT); -+ /* -+ * prepare specification of new item to be inserted -+ */ -+ -+ data.iplug = inode_sd_plugin(inode); -+ data.length = data.iplug->s.sd.save_len(inode); -+ spin_unlock_inode(inode); -+ -+ data.data = NULL; -+ data.user = 0; -+/* could be optimized for case where there is only one node format in -+ * use in the filesystem, probably there are lots of such -+ * places we could optimize for only one node layout.... -Hans */ -+ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()) { -+ /* This is silly check, but we don't know actual node where -+ insertion will go into. */ -+ return RETERR(-ENAMETOOLONG); -+ } -+ oid = oid_allocate(inode->i_sb); -+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be -+ * encapsulated into oid_allocate? 
*/ -+ if (oid == ABSOLUTE_MAX_OID) -+ return RETERR(-EOVERFLOW); -+ -+ set_inode_oid(inode, oid); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ -+ result = insert_by_key(reiser4_tree_by_inode(inode), -+ build_sd_key(inode, &key), &data, &coord, &lh, -+ /* stat data lives on a leaf level */ -+ LEAF_LEVEL, CBK_UNIQUE); -+ -+ /* we don't want to re-check that somebody didn't insert -+ stat-data while we were doing io, because if it did, -+ insert_by_key() returned error. */ -+ /* but what _is_ possible is that plugin for inode's stat-data, -+ list of non-standard plugins or their state would change -+ during io, so that stat-data wouldn't fit into sd. To avoid -+ this race we keep inode_state lock. This lock has to be -+ taken each time you access inode in a way that would cause -+ changes in sd size: changing plugins etc. -+ */ -+ -+ if (result == IBK_INSERT_OK) { -+ coord_clear_iplug(&coord); -+ result = zload(coord.node); -+ if (result == 0) { -+ /* have we really inserted stat data? */ -+ assert("nikita-725", item_is_statdata(&coord)); -+ -+ /* inode was just created. It is inserted into hash -+ table, but no directory entry was yet inserted into -+ parent. So, inode is inaccessible through -+ ->lookup(). All places that directly grab inode -+ from hash-table (like old knfsd), should check -+ IMMUTABLE flag that is set by common_create_child. -+ */ -+ assert("nikita-3240", data.iplug != NULL); -+ assert("nikita-3241", data.iplug->s.sd.save != NULL); -+ area = item_body_by_coord(&coord); -+ result = data.iplug->s.sd.save(inode, &area); -+ znode_make_dirty(coord.node); -+ if (result == 0) { -+ /* object has stat-data now */ -+ reiser4_inode_clr_flag(inode, REISER4_NO_SD); -+ reiser4_inode_set_flag(inode, -+ REISER4_SDLEN_KNOWN); -+ /* initialise stat-data seal */ -+ reiser4_seal_init(&ref->sd_seal, &coord, &key); -+ ref->sd_coord = coord; -+ check_inode_seal(inode, &coord, &key); -+ } else if (result != -ENOMEM) -+ /* -+ * convert any other error code to -EIO to -+ * avoid confusing user level with unexpected -+ * errors. -+ */ -+ result = RETERR(-EIO); -+ zrelse(coord.node); -+ } -+ } -+ done_lh(&lh); -+ -+ if (result != 0) -+ key_warning(&key, inode, result); -+ else -+ oid_count_allocated(); -+ -+ return result; -+} -+ -+/* find sd of inode in a tree, deal with errors */ -+int lookup_sd(struct inode *inode /* inode to look sd for */ , -+ znode_lock_mode lock_mode /* lock mode */ , -+ coord_t *coord /* resulting coord */ , -+ lock_handle * lh /* resulting lock handle */ , -+ const reiser4_key * key /* resulting key */ , -+ int silent) -+{ -+ int result; -+ __u32 flags; -+ -+ assert("nikita-1692", inode != NULL); -+ assert("nikita-1693", coord != NULL); -+ assert("nikita-1694", key != NULL); -+ -+ /* look for the object's stat data in a tree. -+ This returns in "node" pointer to a locked znode and in "pos" -+ position of an item found in node. Both are only valid if -+ coord_found is returned. */ -+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; -+ flags |= CBK_UNIQUE; -+ /* -+ * traverse tree to find stat data. We cannot use vroot here, because -+ * it only covers _body_ of the file, and stat data don't belong -+ * there. 
-+ */ -+ result = coord_by_key(reiser4_tree_by_inode(inode), -+ key, -+ coord, -+ lh, -+ lock_mode, -+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL); -+ if (REISER4_DEBUG && result == 0) -+ check_sd_coord(coord, key); -+ -+ if (result != 0 && !silent) -+ key_warning(key, inode, result); -+ return result; -+} -+ -+static int -+locate_inode_sd(struct inode *inode, -+ reiser4_key * key, coord_t *coord, lock_handle * lh) -+{ -+ reiser4_inode *state; -+ seal_t seal; -+ int result; -+ -+ assert("nikita-3483", inode != NULL); -+ -+ state = reiser4_inode_data(inode); -+ spin_lock_inode(inode); -+ *coord = state->sd_coord; -+ coord_clear_iplug(coord); -+ seal = state->sd_seal; -+ spin_unlock_inode(inode); -+ -+ build_sd_key(inode, key); -+ if (reiser4_seal_is_set(&seal)) { -+ /* first, try to use seal */ -+ result = reiser4_seal_validate(&seal, -+ coord, -+ key, -+ lh, ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) -+ check_sd_coord(coord, key); -+ } else -+ result = -E_REPEAT; -+ -+ if (result != 0) { -+ coord_init_zero(coord); -+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0); -+ } -+ return result; -+} -+ -+#if REISER4_DEBUG -+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) -+{ -+ return (get_key_locality(k1) == get_key_locality(k2) && -+ get_key_type(k1) == get_key_type(k2) && -+ get_key_band(k1) == get_key_band(k2) && -+ get_key_ordering(k1) == get_key_ordering(k2) && -+ get_key_objectid(k1) == get_key_objectid(k2)); -+} -+ -+#include "../tree_walk.h" -+ -+/* make some checks before and after stat-data resize operation */ -+static int check_sd_resize(struct inode *inode, coord_t *coord, -+ int length, int progress/* 1 means after resize */) -+{ -+ int ret = 0; -+ lock_handle left_lock; -+ coord_t left_coord; -+ reiser4_key left_key; -+ reiser4_key key; -+ -+ if (inode_file_plugin(inode) != -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) -+ return 0; -+ if (!length) -+ return 0; -+ if (coord->item_pos != 0) -+ return 0; -+ -+ init_lh(&left_lock); -+ ret = reiser4_get_left_neighbor(&left_lock, -+ coord->node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || -+ ret == -ENOENT || ret == -EINVAL -+ || ret == -E_DEADLOCK) { -+ ret = 0; -+ goto exit; -+ } -+ ret = zload(left_lock.node); -+ if (ret) -+ goto exit; -+ coord_init_last_unit(&left_coord, left_lock.node); -+ item_key_by_coord(&left_coord, &left_key); -+ item_key_by_coord(coord, &key); -+ -+ if (all_but_offset_key_eq(&key, &left_key)) -+ /* corruption occured */ -+ ret = 1; -+ zrelse(left_lock.node); -+ exit: -+ done_lh(&left_lock); -+ return ret; -+} -+#endif -+ -+/* update stat-data at @coord */ -+static int -+update_sd_at(struct inode *inode, coord_t *coord, reiser4_key * key, -+ lock_handle * lh) -+{ -+ int result; -+ reiser4_item_data data; -+ char *area; -+ reiser4_inode *state; -+ znode *loaded; -+ -+ state = reiser4_inode_data(inode); -+ -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result != 0) -+ return result; -+ loaded = coord->node; -+ -+ spin_lock_inode(inode); -+ assert("nikita-728", inode_sd_plugin(inode) != NULL); -+ data.iplug = inode_sd_plugin(inode); -+ -+ /* if inode has non-standard plugins, add appropriate stat data -+ * extension */ -+ if (state->extmask & (1 << PLUGIN_STAT)) { -+ if (state->plugin_mask == 0) -+ inode_clr_extension(inode, PLUGIN_STAT); -+ } else if (state->plugin_mask != 0) -+ inode_set_extension(inode, PLUGIN_STAT); -+ -+ if (state->extmask & (1 << 
HEIR_STAT)) { -+ if (state->heir_mask == 0) -+ inode_clr_extension(inode, HEIR_STAT); -+ } else if (state->heir_mask != 0) -+ inode_set_extension(inode, HEIR_STAT); -+ -+ /* data.length is how much space to add to (or remove -+ from if negative) sd */ -+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { -+ /* recalculate stat-data length */ -+ data.length = -+ data.iplug->s.sd.save_len(inode) - -+ item_length_by_coord(coord); -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ } else -+ data.length = 0; -+ spin_unlock_inode(inode); -+ -+ /* if on-disk stat data is of different length than required -+ for this inode, resize it */ -+ -+ if (data.length != 0) { -+ data.data = NULL; -+ data.user = 0; -+ -+ assert("edward-1441", -+ !check_sd_resize(inode, coord, -+ data.length, 0/* before resize */)); -+ -+ /* insertion code requires that insertion point (coord) was -+ * between units. */ -+ coord->between = AFTER_UNIT; -+ result = reiser4_resize_item(coord, &data, key, lh, -+ COPI_DONT_SHIFT_LEFT); -+ if (result != 0) { -+ key_warning(key, inode, result); -+ zrelse(loaded); -+ return result; -+ } -+ if (loaded != coord->node) { -+ /* reiser4_resize_item moved coord to another node. -+ Zload it */ -+ zrelse(loaded); -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result != 0) -+ return result; -+ loaded = coord->node; -+ } -+ assert("edward-1442", -+ !check_sd_resize(inode, coord, -+ data.length, 1/* after resize */)); -+ } -+ area = item_body_by_coord(coord); -+ spin_lock_inode(inode); -+ result = data.iplug->s.sd.save(inode, &area); -+ znode_make_dirty(coord->node); -+ -+ /* re-initialise stat-data seal */ -+ -+ /* -+ * coord.between was possibly skewed from AT_UNIT when stat-data size -+ * was changed and new extensions were pasted into item. -+ */ -+ coord->between = AT_UNIT; -+ reiser4_seal_init(&state->sd_seal, coord, key); -+ state->sd_coord = *coord; -+ spin_unlock_inode(inode); -+ check_inode_seal(inode, coord, key); -+ zrelse(loaded); -+ return result; -+} -+ -+/* Update existing stat-data in a tree. Called with inode state locked. Return -+ inode state locked. */ -+static int update_sd(struct inode *inode/* inode to update sd for */) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ -+ assert("nikita-726", inode != NULL); -+ -+ /* no stat-data, nothing to update?! */ -+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ init_lh(&lh); -+ -+ result = locate_inode_sd(inode, &key, &coord, &lh); -+ if (result == 0) -+ result = update_sd_at(inode, &coord, &key, &lh); -+ done_lh(&lh); -+ -+ return result; -+} -+ -+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common. -+ Remove object stat data. 
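[locate_inode_sd() above applies the seal idea to stat data: try the cached coord under its seal first, and fall back to a full coord_by_key() traversal only when the seal is unset or broken. Schematically, with invented helpers:]

#include <errno.h>

struct sd_cache { int seal_set; };

static int locate_sd(struct sd_cache *c,
                     int (*validate_seal)(void), int (*full_lookup)(void))
{
        int result = -EAGAIN;           /* ~ -E_REPEAT: no usable seal */

        if (c->seal_set)
                result = validate_seal();
        if (result != 0)
                result = full_lookup(); /* slow path: tree traversal */
        return result;
}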
Space for that must be reserved by caller before -+*/ -+static int -+common_object_delete_no_reserve(struct inode *inode/* object to remove */) -+{ -+ int result; -+ -+ assert("nikita-1477", inode != NULL); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) { -+ reiser4_key sd_key; -+ -+ vfs_dq_free_inode(inode); -+ vfs_dq_drop(inode); -+ -+ build_sd_key(inode, &sd_key); -+ result = -+ reiser4_cut_tree(reiser4_tree_by_inode(inode), -+ &sd_key, &sd_key, NULL, 0); -+ if (result == 0) { -+ reiser4_inode_set_flag(inode, REISER4_NO_SD); -+ result = oid_release(inode->i_sb, get_inode_oid(inode)); -+ if (result == 0) { -+ oid_count_released(); -+ -+ result = safe_link_del(reiser4_tree_by_inode(inode), -+ get_inode_oid(inode), -+ SAFE_UNLINK); -+ } -+ } -+ } else -+ result = 0; -+ return result; -+} -+ -+/* helper for safelink_common */ -+static int process_truncate(struct inode *inode, __u64 size) -+{ -+ int result; -+ struct iattr attr; -+ file_plugin *fplug; -+ reiser4_context *ctx; -+ struct dentry dentry; -+ -+ assert("vs-21", is_in_reiser4_context()); -+ ctx = reiser4_init_context(inode->i_sb); -+ assert("vs-22", !IS_ERR(ctx)); -+ -+ attr.ia_size = size; -+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME; -+ fplug = inode_file_plugin(inode); -+ -+ mutex_lock(&inode->i_mutex); -+ assert("vs-1704", get_current_context()->trans->atom == NULL); -+ dentry.d_inode = inode; -+ result = inode->i_op->setattr(&dentry, &attr); -+ mutex_unlock(&inode->i_mutex); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return result; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/hash.c linux-2.6.30/fs/reiser4/plugin/hash.c ---- linux-2.6.30.orig/fs/reiser4/plugin/hash.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/hash.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,352 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Hash functions */ -+ -+#include "../debug.h" -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../super.h" -+#include "../inode.h" -+ -+#include <linux/types.h> -+ -+/* old rupasov (yura) hash */ -+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ , -+ int len/* @name's length */) -+{ -+ int i; -+ int j; -+ int pow; -+ __u64 a; -+ __u64 c; -+ -+ assert("nikita-672", name != NULL); -+ assert("nikita-673", len >= 0); -+ -+ for (pow = 1, i = 1; i < len; ++i) -+ pow = pow * 10; -+ -+ if (len == 1) -+ a = name[0] - 48; -+ else -+ a = (name[0] - 48) * pow; -+ -+ for (i = 1; i < len; ++i) { -+ c = name[i] - 48; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ for (; i < 40; ++i) { -+ c = '0' - 48; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ -+ for (; i < 256; ++i) { -+ c = i; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ -+ a = a << 7; -+ return a; -+} -+ -+/* r5 hash */ -+static __u64 hash_r5(const unsigned char *name /* name to hash */ , -+ int len UNUSED_ARG/* @name's length */) -+{ -+ __u64 a = 0; -+ -+ assert("nikita-674", name != NULL); -+ assert("nikita-675", len >= 0); -+ -+ while (*name) { -+ a += *name << 4; -+ a += *name >> 4; -+ a *= 11; -+ name++; -+ } -+ return a; -+} -+ -+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function -+ H0 = Key -+ Hi = E 
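[For reference, the r5 hash above is easy to run in user space; this copy mirrors the kernel function line for line. Like the kernel version it ignores the length argument and stops at the terminating NUL.]

#include <stdio.h>
#include <stdint.h>

static uint64_t r5(const unsigned char *name)
{
        uint64_t a = 0;

        while (*name) {
                a += *name << 4;
                a += *name >> 4;
                a *= 11;
                name++;
        }
        return a;
}

int main(void)
{
        printf("%016llx\n",
               (unsigned long long)r5((const unsigned char *)"lost+found"));
        return 0;
}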
Mi(Hi-1) + Hi-1 -+ -+ (see Applied Cryptography, 2nd edition, p448). -+ -+ Jeremy Fitzhardinge jeremy@zip.com.au 1998 -+ -+ Jeremy has agreed to the contents of reiserfs/README. -Hans -+ -+ This code was blindly upgraded to __u64 by s/__u32/__u64/g. -+*/ -+static __u64 hash_tea(const unsigned char *name /* name to hash */ , -+ int len/* @name's length */) -+{ -+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u }; -+ -+ __u64 h0 = k[0], h1 = k[1]; -+ __u64 a, b, c, d; -+ __u64 pad; -+ int i; -+ -+ assert("nikita-676", name != NULL); -+ assert("nikita-677", len >= 0); -+ -+#define DELTA 0x9E3779B9u -+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ -+#define PARTROUNDS 6 /* 6 gets complete mixing */ -+ -+/* a, b, c, d - data; h0, h1 - accumulated hash */ -+#define TEACORE(rounds) \ -+ do { \ -+ __u64 sum = 0; \ -+ int n = rounds; \ -+ __u64 b0, b1; \ -+ \ -+ b0 = h0; \ -+ b1 = h1; \ -+ \ -+ do { \ -+ sum += DELTA; \ -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ -+ } while (--n); \ -+ \ -+ h0 += b0; \ -+ h1 += b1; \ -+ } while (0) -+ -+ pad = (__u64) len | ((__u64) len << 8); -+ pad |= pad << 16; -+ -+ while (len >= 16) { -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << -+ 16 | (__u64) name[11] << 24; -+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14] -+ << 16 | (__u64) name[15] << 24; -+ -+ TEACORE(PARTROUNDS); -+ -+ len -= 16; -+ name += 16; -+ } -+ -+ if (len >= 12) { -+ /* assert(len < 16); */ -+ if (len >= 16) -+ *(int *)0 = 0; -+ -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << -+ 16 | (__u64) name[11] << 24; -+ -+ d = pad; -+ for (i = 12; i < len; i++) { -+ d <<= 8; -+ d |= name[i]; -+ } -+ } else if (len >= 8) { -+ /* assert(len < 12); */ -+ if (len >= 12) -+ *(int *)0 = 0; -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ -+ c = d = pad; -+ for (i = 8; i < len; i++) { -+ c <<= 8; -+ c |= name[i]; -+ } -+ } else if (len >= 4) { -+ /* assert(len < 8); */ -+ if (len >= 8) -+ *(int *)0 = 0; -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ -+ b = c = d = pad; -+ for (i = 4; i < len; i++) { -+ b <<= 8; -+ b |= name[i]; -+ } -+ } else { -+ /* assert(len < 4); */ -+ if (len >= 4) -+ *(int *)0 = 0; -+ a = b = c = d = pad; -+ for (i = 0; i < len; i++) { -+ a <<= 8; -+ a |= name[i]; -+ } -+ } -+ -+ TEACORE(FULLROUNDS); -+ -+/* return 0;*/ -+ return h0 ^ h1; -+ -+} -+ -+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash. -+ -+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details. -+ -+ Excerpts: -+ -+ FNV hashes are designed to be fast while maintaining a low collision -+ rate. -+ -+ [This version also seems to preserve lexicographical order locally.] -+ -+ FNV hash algorithms and source code have been released into the public -+ domain. 
-+ -+*/ -+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ , -+ int len UNUSED_ARG/* @name's length */) -+{ -+ unsigned long long a = 0xcbf29ce484222325ull; -+ const unsigned long long fnv_64_prime = 0x100000001b3ull; -+ -+ assert("nikita-678", name != NULL); -+ assert("nikita-679", len >= 0); -+ -+ /* FNV-1 hash each octet in the buffer */ -+ for (; *name; ++name) { -+ /* multiply by the 32 bit FNV magic prime mod 2^64 */ -+ a *= fnv_64_prime; -+ /* xor the bottom with the current octet */ -+ a ^= (unsigned long long)(*name); -+ } -+ /* return our new hash value */ -+ return a; -+} -+ -+/* degenerate hash function used to simplify testing of non-unique key -+ handling */ -+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ , -+ int len UNUSED_ARG/* @name's length */) -+{ -+ return 0xc0c0c0c010101010ull; -+} -+ -+static int change_hash(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ int result; -+ -+ assert("nikita-3503", inode != NULL); -+ assert("nikita-3504", plugin != NULL); -+ -+ assert("nikita-3505", is_reiser4_inode(inode)); -+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE); -+ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ result = 0; -+ if (inode_hash_plugin(inode) == NULL || -+ inode_hash_plugin(inode)->h.id != plugin->h.id) { -+ if (is_dir_empty(inode) == 0) -+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_HASH, plugin); -+ else -+ result = RETERR(-ENOTEMPTY); -+ -+ } -+ return result; -+} -+ -+static reiser4_plugin_ops hash_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_hash -+}; -+ -+/* hash plugins */ -+hash_plugin hash_plugins[LAST_HASH_ID] = { -+ [RUPASOV_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = RUPASOV_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "rupasov", -+ .desc = "Original Yura's hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_rupasov -+ }, -+ [R5_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = R5_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "r5", -+ .desc = "r5 hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_r5 -+ }, -+ [TEA_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = TEA_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "tea", -+ .desc = "tea hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_tea -+ }, -+ [FNV1_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = FNV1_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "fnv1", -+ .desc = "fnv1 hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_fnv1 -+ }, -+ [DEGENERATE_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = DEGENERATE_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "degenerate hash", -+ .desc = "Degenerate hash: only for testing", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_deg -+ } -+}; -+ -+/* Make Linus happy. 
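[hash_fnv1() above uses the standard 64-bit FNV-1 parameters: offset basis 0xcbf29ce484222325 and prime 0x100000001b3. A runnable copy; note the multiply-then-xor order is what makes it FNV-1 rather than FNV-1a.]

#include <stdio.h>
#include <stdint.h>

static uint64_t fnv1(const unsigned char *name)
{
        uint64_t a = UINT64_C(0xcbf29ce484222325);
        const uint64_t prime = UINT64_C(0x100000001b3);

        for (; *name; ++name) {
                a *= prime;     /* multiply first ... */
                a ^= *name;     /* ... then xor: FNV-1 (FNV-1a swaps these) */
        }
        return a;
}

int main(void)
{
        printf("%016llx\n",
               (unsigned long long)fnv1((const unsigned char *)"reiser4"));
        return 0;
}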
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.30/fs/reiser4/plugin/inode_ops.c ---- linux-2.6.30.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/inode_ops.c 2009-06-22 17:27:31.000000000 +0200 -@@ -0,0 +1,906 @@ -+/* -+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README -+ */ -+ -+/* -+ * this file contains typical implementations for most of methods of struct -+ * inode_operations -+ */ -+ -+#include "../inode.h" -+#include "../safe_link.h" -+ -+#include <linux/quotaops.h> -+#include <linux/namei.h> -+ -+static int create_vfs_object(struct inode *parent, struct dentry *dentry, -+ reiser4_object_create_data *data); -+ -+/** -+ * reiser4_create_common - create of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of new object to create -+ * @mode: the permissions to use -+ * @nameidata: -+ * -+ * This is common implementation of vfs's create method of struct -+ * inode_operations. -+ * Creates regular file using file plugin from parent directory plugin set. -+ */ -+int reiser4_create_common(struct inode *parent, struct dentry *dentry, -+ int mode, struct nameidata *nameidata) -+{ -+ reiser4_object_create_data data; -+ file_plugin *fplug; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = S_IFREG | mode; -+ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent); -+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) { -+ warning("vpf-1900", "'%s' is not a regular file plugin.", -+ fplug->h.label); -+ return RETERR(-EIO); -+ } -+ data.id = fplug->h.id; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *); -+void check_light_weight(struct inode *inode, struct inode *parent); -+ -+/** -+ * reiser4_lookup_common - lookup of inode operations -+ * @parent: inode of directory to lookup into -+ * @dentry: name to look for -+ * @nameidata: -+ * -+ * This is common implementation of vfs's lookup method of struct -+ * inode_operations. -+ */ -+struct dentry *reiser4_lookup_common(struct inode *parent, -+ struct dentry *dentry, -+ struct nameidata *nameidata) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct dentry *new; -+ struct inode *inode; -+ reiser4_dir_entry_desc entry; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ -+ /* set up operations on dentry. */ -+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry; -+ -+ result = reiser4_lookup_name(parent, dentry, &entry.key); -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ if (result == -ENOENT) { -+ /* object not found */ -+ if (!IS_DEADDIR(parent)) -+ d_add(dentry, NULL); -+ return NULL; -+ } -+ return ERR_PTR(result); -+ } -+ -+ inode = reiser4_iget(parent->i_sb, &entry.key, 0); -+ if (IS_ERR(inode)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return ERR_PTR(PTR_ERR(inode)); -+ } -+ -+ /* success */ -+ check_light_weight(inode, parent); -+ new = d_splice_alias(inode, dentry); -+ reiser4_iget_complete(inode); -+ -+ /* prevent balance_dirty_pages() from being called: we don't want to -+ * do this under directory i_mutex. 
*/ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return new; -+} -+ -+static reiser4_block_nr common_estimate_link(struct inode *parent, -+ struct inode *object); -+int reiser4_update_dir(struct inode *); -+ -+/** -+ * reiser4_link_common - link of inode operations -+ * @existing: dentry of object which is to get new name -+ * @parent: directory where new name is to be created -+ * @newname: new name -+ * -+ * This is common implementation of vfs's link method of struct -+ * inode_operations. -+ */ -+int reiser4_link_common(struct dentry *existing, struct inode *parent, -+ struct dentry *newname) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *object; -+ dir_plugin *parent_dplug; -+ reiser4_dir_entry_desc entry; -+ reiser4_object_create_data data; -+ reiser4_block_nr reserve; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-1431", existing != NULL); -+ assert("nikita-1432", parent != NULL); -+ assert("nikita-1433", newname != NULL); -+ -+ object = existing->d_inode; -+ assert("nikita-1434", object != NULL); -+ -+ /* check for race with create_object() */ -+ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-E_REPEAT); -+ } -+ -+ parent_dplug = inode_dir_plugin(parent); -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = object; -+ -+ data.mode = object->i_mode; -+ data.id = inode_file_plugin(object)->h.id; -+ -+ reserve = common_estimate_link(parent, existing->d_inode); -+ if ((__s64) reserve < 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return reserve; -+ } -+ -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOSPC); -+ } -+ -+ /* -+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It -+ * means that link(2) can race against unlink(2) or rename(2), and -+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered. -+ * -+ * For such inode we have to undo special processing done in -+ * reiser4_unlink() viz. creation of safe-link. -+ */ -+ if (unlikely(object->i_nlink == 0)) { -+ result = safe_link_del(reiser4_tree_by_inode(object), -+ get_inode_oid(object), SAFE_UNLINK); -+ if (result != 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ -+ /* increment nlink of @existing and update its stat data */ -+ result = reiser4_add_nlink(object, parent, 1); -+ if (result == 0) { -+ /* add entry to the parent */ -+ result = -+ parent_dplug->add_entry(parent, newname, &data, &entry); -+ if (result != 0) { -+ /* failed to add entry to the parent, decrement nlink -+ of @existing */ -+ reiser4_del_nlink(object, parent, 1); -+ /* -+ * now, if that failed, we have a file with too big -+ * nlink---space leak, much better than directory -+ * entry pointing to nowhere -+ */ -+ } -+ } -+ if (result == 0) { -+ atomic_inc(&object->i_count); -+ /* -+ * Upon successful completion, link() shall mark for update -+ * the st_ctime field of the file. Also, the st_ctime and -+ * st_mtime fields of the directory that contains the new -+ * entry shall be marked for update. 
--SUS -+ */ -+ result = reiser4_update_dir(parent); -+ } -+ if (result == 0) -+ d_instantiate(newname, existing->d_inode); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim); -+ -+/** -+ * reiser4_unlink_common - unlink of inode operations -+ * @parent: inode of directory to remove name from -+ * @victim: name to be removed -+ * -+ * This is common implementation of vfs's unlink method of struct -+ * inode_operations. -+ */ -+int reiser4_unlink_common(struct inode *parent, struct dentry *victim) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *object; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ object = victim->d_inode; -+ fplug = inode_file_plugin(object); -+ assert("nikita-2882", fplug->detach != NULL); -+ -+ result = unlink_check_and_grab(parent, victim); -+ if (result != 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = fplug->detach(object, parent); -+ if (result == 0) { -+ dir_plugin *parent_dplug; -+ reiser4_dir_entry_desc entry; -+ -+ parent_dplug = inode_dir_plugin(parent); -+ memset(&entry, 0, sizeof entry); -+ -+ /* first, delete directory entry */ -+ result = parent_dplug->rem_entry(parent, victim, &entry); -+ if (result == 0) { -+ /* -+ * if name was removed successfully, we _have_ to -+ * return 0 from this function, because upper level -+ * caller (vfs_{rmdir,unlink}) expect this. -+ * -+ * now that directory entry is removed, update -+ * stat-data -+ */ -+ reiser4_del_nlink(object, parent, 1); -+ /* -+ * Upon successful completion, unlink() shall mark for -+ * update the st_ctime and st_mtime fields of the -+ * parent directory. Also, if the file's link count is -+ * not 0, the st_ctime field of the file shall be -+ * marked for update. --SUS -+ */ -+ reiser4_update_dir(parent); -+ /* add safe-link for this file */ -+ if (object->i_nlink == 0) -+ safe_link_add(object, SAFE_UNLINK); -+ } -+ } -+ -+ if (unlikely(result != 0)) { -+ if (result != -ENOMEM) -+ warning("nikita-3398", "Cannot unlink %llu (%i)", -+ (unsigned long long)get_inode_oid(object), -+ result); -+ /* if operation failed commit pending inode modifications to -+ * the stat-data */ -+ reiser4_update_sd(object); -+ reiser4_update_sd(parent); -+ } -+ -+ reiser4_release_reserved(object->i_sb); -+ -+ /* @object's i_ctime was updated by ->rem_link() method(). */ -+ -+ /* @victim can be already removed from the disk by this time. Inode is -+ then marked so that iput() wouldn't try to remove stat data. But -+ inode itself is still there. -+ */ -+ -+ /* -+ * we cannot release directory semaphore here, because name has -+ * already been deleted, but dentry (@victim) still exists. Prevent -+ * balance_dirty_pages() from being called on exiting this context: we -+ * don't want to do this under directory i_mutex. -+ */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * reiser4_symlink_common - symlink of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @linkname: string symlink is to contain -+ * -+ * This is common implementation of vfs's symlink method of struct -+ * inode_operations. -+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID. 
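reiser4_link_common() and reiser4_unlink_common() above share one compensation discipline: do the cheap, reversible step (the nlink change) before the hard-to-undo step (inserting or removing the directory entry), and undo the former if the latter fails. As the comment in the link path notes, if even the compensation fails the damage is an over-counted nlink, i.e. a space leak, never a directory entry pointing to nowhere. A self-contained toy sketch of that ordering; the types and helpers are stand-ins, not the reiser4 API:

#include <stdio.h>
#include <string.h>

struct toy_inode { int nlink; };
struct toy_dir { int full; char last_name[64]; };

/* Simulated directory-entry insertion; fails when the "tree" is full. */
static int dir_add_entry(struct toy_dir *dir, const char *name)
{
	if (dir->full)
		return -1;
	strncpy(dir->last_name, name, sizeof(dir->last_name) - 1);
	return 0;
}

/* Same ordering as reiser4_link_common(): reversible step first,
 * hard-to-undo step second, compensation on failure. */
static int toy_link(struct toy_inode *child, struct toy_dir *parent,
		    const char *name)
{
	int err;

	child->nlink++;                    /* cheap and reversible */
	err = dir_add_entry(parent, name); /* the risky step */
	if (err)
		child->nlink--;            /* undo the bump */
	return err;
}

int main(void)
{
	struct toy_inode child = { .nlink = 1 };
	struct toy_dir parent = { .full = 1 }; /* force the failure path */

	printf("link: %d, nlink afterwards: %d\n",
	       toy_link(&child, &parent, "x"), child.nlink);
	return 0;
}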
-+ */ -+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, -+ const char *linkname) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.name = linkname; -+ data.id = SYMLINK_FILE_PLUGIN_ID; -+ data.mode = S_IFLNK | S_IRWXUGO; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/** -+ * reiser4_mkdir_common - mkdir of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @mode: the permissions to use -+ * -+ * This is common implementation of vfs's mkdir method of struct -+ * inode_operations. -+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID. -+ */ -+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = S_IFDIR | mode; -+ data.id = DIRECTORY_FILE_PLUGIN_ID; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/** -+ * reiser4_mknod_common - mknod of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @mode: the permissions to use and file type -+ * @rdev: minor and major of new device file -+ * -+ * This is common implementation of vfs's mknod method of struct -+ * inode_operations. -+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID. -+ */ -+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, -+ int mode, dev_t rdev) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = mode; -+ data.rdev = rdev; -+ data.id = SPECIAL_FILE_PLUGIN_ID; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/* -+ * implementation of vfs's rename method of struct inode_operations for typical -+ * directory is in inode_ops_rename.c -+ */ -+ -+/** -+ * reiser4_follow_link_common - follow_link of inode operations -+ * @dentry: dentry of symlink -+ * @data: -+ * -+ * This is common implementation of vfs's followlink method of struct -+ * inode_operations. -+ * Assumes that inode's i_private points to the content of symbolic link. -+ */ -+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd) -+{ -+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode)); -+ -+ if (!dentry->d_inode->i_private -+ || !reiser4_inode_get_flag(dentry->d_inode, -+ REISER4_GENERIC_PTR_USED)) -+ return ERR_PTR(RETERR(-EINVAL)); -+ nd_set_link(nd, dentry->d_inode->i_private); -+ return NULL; -+} -+ -+/** -+ * reiser4_permission_common - permission of inode operations -+ * @inode: inode to check permissions for -+ * @mask: mode bits to check permissions for -+ * @nameidata: -+ * -+ * Uses generic function to check for rwx permissions. -+ */ -+int reiser4_permission_common(struct inode *inode, int mask) -+{ -+ return generic_permission(inode, mask, NULL); -+} -+ -+static int setattr_reserve(reiser4_tree *); -+ -+/* this is common implementation of vfs's setattr method of struct -+ inode_operations -+*/ -+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr) -+{ -+ reiser4_context *ctx; -+ struct inode *inode; -+ int result; -+ -+ inode = dentry->d_inode; -+ result = inode_change_ok(inode, attr); -+ if (result) -+ return result; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE)); -+ -+ /* -+ * grab disk space and call standard inode_setattr(). 
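The SUS passage quoted in reiser4_unlink_common() above, implemented by the reiser4_update_dir() call there, is observable from userspace: removing a name must advance the parent directory's mtime/ctime. A small demonstration, with an assumed scratch filename in the current directory:

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat before, after;
	FILE *f = fopen("victim.tmp", "w");

	if (f == NULL || fclose(f) != 0 || stat(".", &before) != 0)
		return 1;
	sleep(1); /* ensure a visible timestamp step at 1s granularity */
	if (unlink("victim.tmp") != 0 || stat(".", &after) != 0)
		return 1;
	printf("parent mtime advanced: %s\n",
	       after.st_mtime > before.st_mtime ? "yes" : "no");
	return 0;
}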
-+ */ -+ result = setattr_reserve(reiser4_tree_by_inode(inode)); -+ if (!result) { -+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) -+ || (attr->ia_valid & ATTR_GID -+ && attr->ia_gid != inode->i_gid)) { -+ result = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ result = inode_setattr(inode, attr); -+ if (!result) -+ reiser4_update_sd(inode); -+ } -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* this is common implementation of vfs's getattr method of struct -+ inode_operations -+*/ -+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG, -+ struct dentry *dentry, struct kstat *stat) -+{ -+ struct inode *obj; -+ -+ assert("nikita-2298", dentry != NULL); -+ assert("nikita-2299", stat != NULL); -+ assert("nikita-2300", dentry->d_inode != NULL); -+ -+ obj = dentry->d_inode; -+ -+ stat->dev = obj->i_sb->s_dev; -+ stat->ino = oid_to_uino(get_inode_oid(obj)); -+ stat->mode = obj->i_mode; -+ /* don't confuse userland with huge nlink. This is not entirely -+ * correct, because nlink_t is not necessary 16 bit signed. */ -+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff); -+ stat->uid = obj->i_uid; -+ stat->gid = obj->i_gid; -+ stat->rdev = obj->i_rdev; -+ stat->atime = obj->i_atime; -+ stat->mtime = obj->i_mtime; -+ stat->ctime = obj->i_ctime; -+ stat->size = obj->i_size; -+ stat->blocks = -+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS; -+ /* "preferred" blocksize for efficient file system I/O */ -+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size; -+ -+ return 0; -+} -+ -+/* Estimate the maximum amount of nodes which might be allocated or changed on -+ typical new object creation. Typical creation consists of calling create -+ method of file plugin, adding directory entry to parent and update parent -+ directory's stat data. -+*/ -+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, -+ /* parent object */ -+ struct inode *object -+ /* object */) -+{ -+ assert("vpf-309", parent != NULL); -+ assert("vpf-307", object != NULL); -+ -+ return -+ /* object creation estimation */ -+ inode_file_plugin(object)->estimate.create(object) + -+ /* stat data of parent directory estimation */ -+ inode_file_plugin(parent)->estimate.update(parent) + -+ /* adding entry estimation */ -+ inode_dir_plugin(parent)->estimate.add_entry(parent) + -+ /* to undo in the case of failure */ -+ inode_dir_plugin(parent)->estimate.rem_entry(parent); -+} -+ -+/* Create child in directory. -+ -+ . get object's plugin -+ . get fresh inode -+ . initialize inode -+ . add object's stat-data -+ . initialize object's directory -+ . add entry to the parent -+ . 
instantiate dentry -+ -+*/ -+static int do_create_vfs_child(reiser4_object_create_data * data,/* parameters -+ of new -+ object */ -+ struct inode **retobj) -+{ -+ int result; -+ -+ struct dentry *dentry; /* parent object */ -+ struct inode *parent; /* new name */ -+ -+ dir_plugin *par_dir; /* directory plugin on the parent */ -+ dir_plugin *obj_dir; /* directory plugin on the new object */ -+ file_plugin *obj_plug; /* object plugin on the new object */ -+ struct inode *object; /* new object */ -+ reiser4_block_nr reserve; -+ -+ reiser4_dir_entry_desc entry; /* new directory entry */ -+ -+ assert("nikita-1420", data != NULL); -+ parent = data->parent; -+ dentry = data->dentry; -+ -+ assert("nikita-1418", parent != NULL); -+ assert("nikita-1419", dentry != NULL); -+ -+ /* check, that name is acceptable for parent */ -+ par_dir = inode_dir_plugin(parent); -+ if (par_dir->is_name_acceptable && -+ !par_dir->is_name_acceptable(parent, -+ dentry->d_name.name, -+ (int)dentry->d_name.len)) -+ return RETERR(-ENAMETOOLONG); -+ -+ result = 0; -+ obj_plug = file_plugin_by_id((int)data->id); -+ if (obj_plug == NULL) { -+ warning("nikita-430", "Cannot find plugin %i", data->id); -+ return RETERR(-ENOENT); -+ } -+ object = new_inode(parent->i_sb); -+ if (object == NULL) -+ return RETERR(-ENOMEM); -+ /* we'll update i_nlink below */ -+ object->i_nlink = 0; -+ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0, -+ * to simplify error handling: if some error occurs before i_ino is -+ * initialized with oid, i_ino should already be set to some -+ * distinguished value. */ -+ object->i_ino = 0; -+ -+ /* So that on error iput will be called. */ -+ *retobj = object; -+ -+ if (vfs_dq_alloc_inode(object)) { -+ vfs_dq_drop(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EDQUOT); -+ } -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = object; -+ -+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE, -+ file_plugin_to_plugin(obj_plug)); -+ result = obj_plug->set_plug_in_inode(object, parent, data); -+ if (result) { -+ warning("nikita-431", "Cannot install plugin %i on %llx", -+ data->id, (unsigned long long)get_inode_oid(object)); -+ vfs_dq_free_inode(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ /* reget plugin after installation */ -+ obj_plug = inode_file_plugin(object); -+ -+ if (obj_plug->create_object == NULL) { -+ vfs_dq_free_inode(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EPERM); -+ } -+ -+ /* if any of hash, tail, sd or permission plugins for newly created -+ object are not set yet set them here inheriting them from parent -+ directory -+ */ -+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL); -+ result = obj_plug->adjust_to_parent(object, -+ parent, -+ object->i_sb->s_root->d_inode); -+ if (result == 0) -+ result = finish_pset(object); -+ if (result != 0) { -+ warning("nikita-432", "Cannot inherit from %llx to %llx", -+ (unsigned long long)get_inode_oid(parent), -+ (unsigned long long)get_inode_oid(object)); -+ vfs_dq_free_inode(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ /* setup inode and file-operations for this inode */ -+ setup_inode_ops(object, data); -+ -+ /* call file plugin's method to initialize plugin specific part of -+ * inode */ -+ if (obj_plug->init_inode_data) -+ obj_plug->init_inode_data(object, data, 1/*create */); -+ -+ /* obtain directory plugin (if any) for new object. 
*/ -+ obj_dir = inode_dir_plugin(object); -+ if (obj_dir != NULL && obj_dir->init == NULL) { -+ vfs_dq_free_inode(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EPERM); -+ } -+ -+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent); -+ -+ reserve = estimate_create_vfs_object(parent, object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ vfs_dq_free_inode(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-ENOSPC); -+ } -+ -+ /* mark inode `immutable'. We disable changes to the file being -+ created until valid directory entry for it is inserted. Otherwise, -+ if file were expanded and insertion of directory entry fails, we -+ have to remove file, but we only alloted enough space in -+ transaction to remove _empty_ file. 3.x code used to remove stat -+ data in different transaction thus possibly leaking disk space on -+ crash. This all only matters if it's possible to access file -+ without name, for example, by inode number -+ */ -+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE); -+ -+ /* create empty object, this includes allocation of new objectid. For -+ directories this implies creation of dot and dotdot */ -+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ /* mark inode as `loaded'. From this point onward -+ reiser4_delete_inode() will try to remove its stat-data. */ -+ reiser4_inode_set_flag(object, REISER4_LOADED); -+ -+ result = obj_plug->create_object(object, parent, data); -+ if (result != 0) { -+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE); -+ if (result != -ENAMETOOLONG && result != -ENOMEM) -+ warning("nikita-2219", -+ "Failed to create sd for %llu", -+ (unsigned long long)get_inode_oid(object)); -+ vfs_dq_free_inode(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ if (obj_dir != NULL) -+ result = obj_dir->init(object, parent, data); -+ if (result == 0) { -+ assert("nikita-434", !reiser4_inode_get_flag(object, -+ REISER4_NO_SD)); -+ /* insert inode into VFS hash table */ -+ insert_inode_hash(object); -+ /* create entry */ -+ result = par_dir->add_entry(parent, dentry, data, &entry); -+ if (result == 0) { -+ result = reiser4_add_nlink(object, parent, 0); -+ /* If O_CREAT is set and the file did not previously -+ exist, upon successful completion, open() shall -+ mark for update the st_atime, st_ctime, and -+ st_mtime fields of the file and the st_ctime and -+ st_mtime fields of the parent directory. --SUS -+ */ -+ /* @object times are already updated by -+ reiser4_add_nlink() */ -+ if (result == 0) -+ reiser4_update_dir(parent); -+ if (result != 0) -+ /* cleanup failure to add nlink */ -+ par_dir->rem_entry(parent, dentry, &entry); -+ } -+ if (result != 0) -+ /* cleanup failure to add entry */ -+ obj_plug->detach(object, parent); -+ } else if (result != -ENOMEM) -+ warning("nikita-2219", "Failed to initialize dir for %llu: %i", -+ (unsigned long long)get_inode_oid(object), result); -+ -+ /* -+ * update stat-data, committing all pending modifications to the inode -+ * fields. 
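The reserve grabbed in do_create_vfs_child() above comes from estimate_create_vfs_object(), which pessimistically sums four costs, including the cost of removing the entry again on the failure path; nothing modifies the tree until the whole sum has been grabbed. A sketch of the shape of that computation, with made-up block counts:

#include <stdio.h>

int main(void)
{
	/* Illustrative per-operation worst cases; only the shape of the
	 * sum mirrors estimate_create_vfs_object(). */
	unsigned long create_object = 5; /* fplug->estimate.create(object) */
	unsigned long update_parent = 1; /* fplug->estimate.update(parent) */
	unsigned long add_entry = 3;     /* dplug->estimate.add_entry(parent) */
	unsigned long rem_entry = 3;     /* undo path if creation fails */
	unsigned long reserve;

	/* grab everything up front; no tree change happens before this */
	reserve = create_object + update_parent + add_entry + rem_entry;
	printf("grab %lu blocks before touching the tree\n", reserve);
	return 0;
}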
-+ */ -+ reiser4_update_sd(object); -+ if (result != 0) { -+ vfs_dq_free_inode(object); -+ object->i_flags |= S_NOQUOTA; -+ /* if everything was ok (result == 0), parent stat-data is -+ * already updated above (update_parent_dir()) */ -+ reiser4_update_sd(parent); -+ /* failure to create entry, remove object */ -+ obj_plug->delete_object(object); -+ } -+ -+ /* file has name now, clear immutable flag */ -+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE); -+ -+ /* on error, iput() will call ->delete_inode(). We should keep track -+ of the existence of stat-data for this inode and avoid attempt to -+ remove it in reiser4_delete_inode(). This is accomplished through -+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags -+ */ -+ return result; -+} -+ -+/* this is helper for common implementations of reiser4_mkdir, reiser4_create, -+ reiser4_mknod and reiser4_symlink -+*/ -+static int -+create_vfs_object(struct inode *parent, -+ struct dentry *dentry, reiser4_object_create_data * data) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *child; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ context_set_commit_async(ctx); -+ -+ data->parent = parent; -+ data->dentry = dentry; -+ child = NULL; -+ result = do_create_vfs_child(data, &child); -+ if (unlikely(result != 0)) { -+ if (child != NULL) { -+ reiser4_make_bad_inode(child); -+ iput(child); -+ } -+ } else -+ d_instantiate(dentry, child); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * helper for link_common. Estimate disk space necessary to add a link -+ * from @parent to @object -+ */ -+static reiser4_block_nr common_estimate_link(struct inode *parent /* parent -+ * directory -+ */, -+ struct inode *object /* object to -+ * which new -+ * link is -+ * being -+ * created */) -+{ -+ reiser4_block_nr res = 0; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("vpf-317", object != NULL); -+ assert("vpf-318", parent != NULL); -+ -+ fplug = inode_file_plugin(object); -+ dplug = inode_dir_plugin(parent); -+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice -+ * instead of multiplying by 2? */ -+ /* reiser4_add_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* add_entry(parent) */ -+ res += dplug->estimate.add_entry(parent); -+ /* reiser4_del_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* update_dir(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ /* safe-link */ -+ res += estimate_one_item_removal(reiser4_tree_by_inode(object)); -+ -+ return res; -+} -+ -+/* Estimate disk space necessary to remove a link between @parent and -+ @object. 
-+*/ -+static reiser4_block_nr estimate_unlink(struct inode *parent /* parent -+ * directory */, -+ struct inode *object /* object to which -+ * new link is -+ * being created -+ */) -+{ -+ reiser4_block_nr res = 0; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("vpf-317", object != NULL); -+ assert("vpf-318", parent != NULL); -+ -+ fplug = inode_file_plugin(object); -+ dplug = inode_dir_plugin(parent); -+ -+ /* rem_entry(parent) */ -+ res += dplug->estimate.rem_entry(parent); -+ /* reiser4_del_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* update_dir(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ /* fplug->unlink */ -+ res += fplug->estimate.unlink(object, parent); -+ /* safe-link */ -+ res += estimate_one_insert_item(reiser4_tree_by_inode(object)); -+ -+ return res; -+} -+ -+/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */ -+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim) -+{ -+ file_plugin *fplug; -+ struct inode *child; -+ int result; -+ -+ result = 0; -+ child = victim->d_inode; -+ fplug = inode_file_plugin(child); -+ -+ /* check for race with create_object() */ -+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE)) -+ return RETERR(-E_REPEAT); -+ /* object being deleted should have stat data */ -+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD)); -+ -+ /* ask object plugin */ -+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child)) -+ return RETERR(-ENOTEMPTY); -+ -+ result = (int)estimate_unlink(parent, child); -+ if (result < 0) -+ return result; -+ -+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT); -+} -+ -+/* helper for reiser4_setattr_common */ -+static int setattr_reserve(reiser4_tree * tree) -+{ -+ assert("vs-1096", is_grab_enabled(get_current_context())); -+ return reiser4_grab_space(estimate_one_insert_into_item(tree), -+ BA_CAN_COMMIT); -+} -+ -+/* helper function. Standards require that for many file-system operations -+ on success ctime and mtime of parent directory is to be updated. */ -+int reiser4_update_dir(struct inode *dir) -+{ -+ assert("nikita-2525", dir != NULL); -+ -+ dir->i_ctime = dir->i_mtime = CURRENT_TIME; -+ return reiser4_update_sd(dir); -+} -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.30/fs/reiser4/plugin/inode_ops_rename.c ---- linux-2.6.30.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/inode_ops_rename.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,925 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../inode.h" -+#include "../safe_link.h" -+ -+static const char *possible_leak = "Possible disk space leak."; -+ -+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode. 
-+ -+ Helper function called from hashed_rename() */ -+static int replace_name(struct inode *to_inode, /* inode where @from_coord is -+ * to be re-targeted at */ -+ struct inode *from_dir, /* directory where @from_coord -+ * lives */ -+ struct inode *from_inode, /* inode @from_coord -+ * originally point to */ -+ coord_t *from_coord, /* where directory entry is in -+ * the tree */ -+ lock_handle * from_lh/* lock handle on @from_coord */) -+{ -+ item_plugin *from_item; -+ int result; -+ znode *node; -+ -+ coord_clear_iplug(from_coord); -+ node = from_coord->node; -+ result = zload(node); -+ if (result != 0) -+ return result; -+ from_item = item_plugin_by_coord(from_coord); -+ if (plugin_of_group(item_plugin_by_coord(from_coord), -+ DIR_ENTRY_ITEM_TYPE)) { -+ reiser4_key to_key; -+ -+ build_sd_key(to_inode, &to_key); -+ -+ /* everything is found and prepared to change directory entry -+ at @from_coord to point to @to_inode. -+ -+ @to_inode is just about to get new name, so bump its link -+ counter. -+ -+ */ -+ result = reiser4_add_nlink(to_inode, from_dir, 0); -+ if (result != 0) { -+ /* Don't issue warning: this may be plain -EMLINK */ -+ zrelse(node); -+ return result; -+ } -+ -+ result = -+ from_item->s.dir.update_key(from_coord, &to_key, from_lh); -+ if (result != 0) { -+ reiser4_del_nlink(to_inode, from_dir, 0); -+ zrelse(node); -+ return result; -+ } -+ -+ /* @from_inode just lost its name, he-he. -+ -+ If @from_inode was directory, it contained dotdot pointing -+ to @from_dir. @from_dir i_nlink will be decreased when -+ iput() will be called on @from_inode. -+ -+ If file-system is not ADG (hard-links are -+ supported on directories), iput(from_inode) will not remove -+ @from_inode, and thus above is incorrect, but hard-links on -+ directories are problematic in many other respects. -+ */ -+ result = reiser4_del_nlink(from_inode, from_dir, 0); -+ if (result != 0) { -+ warning("nikita-2330", -+ "Cannot remove link from source: %i. %s", -+ result, possible_leak); -+ } -+ /* Has to return success, because entry is already -+ * modified. */ -+ result = 0; -+ -+ /* NOTE-NIKITA consider calling plugin method in stead of -+ accessing inode fields directly. */ -+ from_dir->i_mtime = CURRENT_TIME; -+ } else { -+ warning("nikita-2326", "Unexpected item type"); -+ result = RETERR(-EIO); -+ } -+ zrelse(node); -+ return result; -+} -+ -+/* add new entry pointing to @inode into @dir at @coord, locked by @lh -+ -+ Helper function used by hashed_rename(). */ -+static int add_name(struct inode *inode, /* inode where @coord is to be -+ * re-targeted at */ -+ struct inode *dir, /* directory where @coord lives */ -+ struct dentry *name, /* new name */ -+ coord_t *coord, /* where directory entry is in the tree -+ */ -+ lock_handle * lh, /* lock handle on @coord */ -+ int is_dir/* true, if @inode is directory */) -+{ -+ int result; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-2333", lh->node == coord->node); -+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode)); -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = inode; -+ /* build key of directory entry description */ -+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key); -+ -+ /* ext2 does this in different order: first inserts new entry, -+ then increases directory nlink. We don't want do this, -+ because reiser4_add_nlink() calls ->add_link() plugin -+ method that can fail for whatever reason, leaving as with -+ cleanup problems. 
-+ */ -+ /* @inode is getting new name */ -+ reiser4_add_nlink(inode, dir, 0); -+ /* create @new_name in @new_dir pointing to -+ @old_inode */ -+ result = WITH_COORD(coord, -+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir, -+ coord, -+ lh, -+ name, -+ &entry)); -+ if (result != 0) { -+ int result2; -+ result2 = reiser4_del_nlink(inode, dir, 0); -+ if (result2 != 0) { -+ warning("nikita-2327", -+ "Cannot drop link on %lli %i. %s", -+ (unsigned long long)get_inode_oid(inode), -+ result2, possible_leak); -+ } -+ } else -+ INODE_INC_FIELD(dir, i_size); -+ return result; -+} -+ -+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory -+ * where @old is -+ * located */ -+ struct dentry *old_name,/* old name */ -+ struct inode *new_dir, /* directory -+ * where @new is -+ * located */ -+ struct dentry *new_name /* new name */) -+{ -+ reiser4_block_nr res1, res2; -+ dir_plugin * p_parent_old, *p_parent_new; -+ file_plugin * p_child_old, *p_child_new; -+ -+ assert("vpf-311", old_dir != NULL); -+ assert("vpf-312", new_dir != NULL); -+ assert("vpf-313", old_name != NULL); -+ assert("vpf-314", new_name != NULL); -+ -+ p_parent_old = inode_dir_plugin(old_dir); -+ p_parent_new = inode_dir_plugin(new_dir); -+ p_child_old = inode_file_plugin(old_name->d_inode); -+ if (new_name->d_inode) -+ p_child_new = inode_file_plugin(new_name->d_inode); -+ else -+ p_child_new = NULL; -+ -+ /* find_entry - can insert one leaf. */ -+ res1 = res2 = 1; -+ -+ /* replace_name */ -+ { -+ /* reiser4_add_nlink(p_child_old) and -+ * reiser4_del_nlink(p_child_old) */ -+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode); -+ /* update key */ -+ res1 += 1; -+ /* reiser4_del_nlink(p_child_new) */ -+ if (p_child_new) -+ res1 += p_child_new->estimate.update(new_name->d_inode); -+ } -+ -+ /* else add_name */ -+ { -+ /* reiser4_add_nlink(p_parent_new) and -+ * reiser4_del_nlink(p_parent_new) */ -+ res2 += -+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* reiser4_add_nlink(p_parent_old) */ -+ res2 += p_child_old->estimate.update(old_name->d_inode); -+ /* add_entry(p_parent_new) */ -+ res2 += p_parent_new->estimate.add_entry(new_dir); -+ /* reiser4_del_nlink(p_parent_old) */ -+ res2 += p_child_old->estimate.update(old_name->d_inode); -+ } -+ -+ res1 = res1 < res2 ? 
res2 : res1; -+ -+ /* reiser4_write_sd(p_parent_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ -+ /* reiser4_write_sd(p_child_new) */ -+ if (p_child_new) -+ res1 += p_child_new->estimate.update(new_name->d_inode); -+ -+ /* hashed_rem_entry(p_parent_old) */ -+ res1 += p_parent_old->estimate.rem_entry(old_dir); -+ -+ /* reiser4_del_nlink(p_child_old) */ -+ res1 += p_child_old->estimate.update(old_name->d_inode); -+ -+ /* replace_name */ -+ { -+ /* reiser4_add_nlink(p_parent_dir_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* update_key */ -+ res1 += 1; -+ /* reiser4_del_nlink(p_parent_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* reiser4_del_nlink(p_parent_old) */ -+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); -+ } -+ -+ /* reiser4_write_sd(p_parent_old) */ -+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); -+ -+ /* reiser4_write_sd(p_child_old) */ -+ res1 += p_child_old->estimate.update(old_name->d_inode); -+ -+ return res1; -+} -+ -+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory -+ * where @old -+ * is located -+ */ -+ struct dentry *old_name,/* old name -+ */ -+ struct inode *new_dir, /* directory -+ * where @new -+ * is located -+ */ -+ struct dentry *new_name /* new name -+ */) -+{ -+ reiser4_block_nr reserve; -+ -+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name); -+ -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ return 0; -+} -+ -+/* check whether @old_inode and @new_inode can be moved within file system -+ * tree. This singles out attempts to rename pseudo-files, for example. */ -+static int can_rename(struct inode *old_dir, struct inode *old_inode, -+ struct inode *new_dir, struct inode *new_inode) -+{ -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("nikita-3370", old_inode != NULL); -+ -+ dplug = inode_dir_plugin(new_dir); -+ fplug = inode_file_plugin(old_inode); -+ -+ if (dplug == NULL) -+ return RETERR(-ENOTDIR); -+ else if (new_dir->i_op->create == NULL) -+ return RETERR(-EPERM); -+ else if (!fplug->can_add_link(old_inode)) -+ return RETERR(-EMLINK); -+ else if (new_inode != NULL) { -+ fplug = inode_file_plugin(new_inode); -+ if (fplug->can_rem_link != NULL && -+ !fplug->can_rem_link(new_inode)) -+ return RETERR(-EBUSY); -+ } -+ return 0; -+} -+ -+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle * , -+ znode_lock_mode, reiser4_dir_entry_desc *); -+int reiser4_update_dir(struct inode *); -+ -+/* this is common implementation of vfs's rename method of struct -+ inode_operations -+ See comments in the body. -+ -+ It is arguable that this function can be made generic so, that it -+ will be applicable to any kind of directory plugin that deals with -+ directories composed out of directory entries. The only obstacle -+ here is that we don't have any data-type to represent directory -+ entry. This should be re-considered when more than one different -+ directory plugin will be implemented. 
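Since the lookup leads to exactly one of replace_name() or add_name(), estimate_rename() above prices the two branches separately, keeps the maximum, and then adds the costs every rename pays (stat-data updates, removal of the old name, the possible dotdot update). Schematically, with illustrative numbers:

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a < b ? b : a;
}

int main(void)
{
	/* Illustrative block counts for the two mutually exclusive
	 * branches and for the work shared by every rename. */
	unsigned long replace_branch = 5; /* res1: key update, nlinks */
	unsigned long add_branch = 7;     /* res2: insert new entry */
	unsigned long shared_tail = 9;    /* sd updates, remove old name,
	                                   * possible dotdot update */

	printf("reserve = %lu\n",
	       max_ul(replace_branch, add_branch) + shared_tail);
	return 0;
}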
-+*/ -+int reiser4_rename_common(struct inode *old_dir /* directory where @old -+ * is located */ , -+ struct dentry *old_name /* old name */ , -+ struct inode *new_dir /* directory where @new -+ * is located */ , -+ struct dentry *new_name/* new name */) -+{ -+ /* From `The Open Group Base Specifications Issue 6' -+ -+ If either the old or new argument names a symbolic link, rename() -+ shall operate on the symbolic link itself, and shall not resolve -+ the last component of the argument. If the old argument and the new -+ argument resolve to the same existing file, rename() shall return -+ successfully and perform no other action. -+ -+ [this is done by VFS: vfs_rename()] -+ -+ If the old argument points to the pathname of a file that is not a -+ directory, the new argument shall not point to the pathname of a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the link named by the new argument exists, it shall -+ be removed and old renamed to new. In this case, a link named new -+ shall remain visible to other processes throughout the renaming -+ operation and refer either to the file referred to by new or old -+ before the operation began. -+ -+ [we should assure this] -+ -+ Write access permission is required for -+ both the directory containing old and the directory containing new. -+ -+ [checked by VFS: vfs_rename->may_delete(), may_create()] -+ -+ If the old argument points to the pathname of a directory, the new -+ argument shall not point to the pathname of a file that is not a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the directory named by the new argument exists, it -+ shall be removed and old renamed to new. In this case, a link named -+ new shall exist throughout the renaming operation and shall refer -+ either to the directory referred to by new or old before the -+ operation began. -+ -+ [we should assure this] -+ -+ If new names an existing directory, it shall be -+ required to be an empty directory. -+ -+ [we should check this] -+ -+ If the old argument points to a pathname of a symbolic link, the -+ symbolic link shall be renamed. If the new argument points to a -+ pathname of a symbolic link, the symbolic link shall be removed. -+ -+ The new pathname shall not contain a path prefix that names -+ old. Write access permission is required for the directory -+ containing old and the directory containing new. If the old -+ argument points to the pathname of a directory, write access -+ permission may be required for the directory named by old, and, if -+ it exists, the directory named by new. -+ -+ [checked by VFS: vfs_rename(), vfs_rename_dir()] -+ -+ If the link named by the new argument exists and the file's link -+ count becomes 0 when it is removed and no process has the file -+ open, the space occupied by the file shall be freed and the file -+ shall no longer be accessible. If one or more processes have the -+ file open when the last link is removed, the link shall be removed -+ before rename() returns, but the removal of the file contents shall -+ be postponed until all references to the file are closed. -+ -+ [iput() handles this, but we can do this manually, a la -+ reiser4_unlink()] -+ -+ Upon successful completion, rename() shall mark for update the -+ st_ctime and st_mtime fields of the parent directory of each file. 
-+ -+ [N/A] -+ -+ */ -+ reiser4_context *ctx; -+ int result; -+ int is_dir; /* is @old_name directory */ -+ -+ struct inode *old_inode; -+ struct inode *new_inode; -+ coord_t *new_coord; -+ -+ struct reiser4_dentry_fsdata *new_fsdata; -+ dir_plugin *dplug; -+ file_plugin *fplug; -+ -+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry; -+ lock_handle * new_lh, *dotdot_lh; -+ struct dentry *dotdot_name; -+ struct reiser4_dentry_fsdata *dataonstack; -+ -+ ctx = reiser4_init_context(old_dir->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) + -+ sizeof(*dotdot_name) + sizeof(*dataonstack), -+ reiser4_ctx_gfp_mask_get()); -+ if (!old_entry) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOMEM); -+ } -+ -+ new_entry = old_entry + 1; -+ dotdot_entry = old_entry + 2; -+ new_lh = (lock_handle *)(old_entry + 3); -+ dotdot_lh = new_lh + 1; -+ dotdot_name = (struct dentry *)(new_lh + 2); -+ dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1); -+ -+ assert("nikita-2318", old_dir != NULL); -+ assert("nikita-2319", new_dir != NULL); -+ assert("nikita-2320", old_name != NULL); -+ assert("nikita-2321", new_name != NULL); -+ -+ old_inode = old_name->d_inode; -+ new_inode = new_name->d_inode; -+ -+ dplug = inode_dir_plugin(old_dir); -+ fplug = NULL; -+ -+ new_fsdata = reiser4_get_dentry_fsdata(new_name); -+ if (IS_ERR(new_fsdata)) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return PTR_ERR(new_fsdata); -+ } -+ -+ new_coord = &new_fsdata->dec.entry_coord; -+ coord_clear_iplug(new_coord); -+ -+ is_dir = S_ISDIR(old_inode->i_mode); -+ -+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); -+ -+ /* if target is existing directory and it's not empty---return error. -+ -+ This check is done specifically, because is_dir_empty() requires -+ tree traversal and have to be done before locks are taken. -+ */ -+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOTEMPTY); -+ } -+ -+ result = can_rename(old_dir, old_inode, new_dir, new_inode); -+ if (result != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = hashed_rename_estimate_and_grab(old_dir, old_name, -+ new_dir, new_name); -+ if (result != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ init_lh(new_lh); -+ -+ /* find entry for @new_name */ -+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK, -+ new_entry); -+ -+ if (IS_CBKERR(result)) { -+ done_lh(new_lh); -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ reiser4_seal_done(&new_fsdata->dec.entry_seal); -+ -+ /* add or replace name for @old_inode as @new_name */ -+ if (new_inode != NULL) { -+ /* target (@new_name) exists. */ -+ /* Not clear what to do with objects that are -+ both directories and files at the same time. */ -+ if (result == CBK_COORD_FOUND) { -+ result = replace_name(old_inode, -+ new_dir, -+ new_inode, new_coord, new_lh); -+ if (result == 0) -+ fplug = inode_file_plugin(new_inode); -+ } else if (result == CBK_COORD_NOTFOUND) { -+ /* VFS told us that @new_name is bound to existing -+ inode, but we failed to find directory entry. 
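The "[we should assure this]" items in the SUS excerpt above boil down to: when the target name already exists it is replaced, and some name remains visible to other processes at every point during the operation. From userspace the replacing behaviour looks like this (scratch filenames are illustrative):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f;

	if ((f = fopen("old.tmp", "w")) == NULL || fclose(f) != 0)
		return 1;
	if ((f = fopen("new.tmp", "w")) == NULL || fclose(f) != 0)
		return 1;

	/* "new.tmp" exists: it is removed and old renamed to new; the
	 * name stays visible to other processes throughout. */
	if (rename("old.tmp", "new.tmp") != 0) {
		perror("rename");
		return 1;
	}
	printf("new.tmp present: %s, old.tmp gone: %s\n",
	       access("new.tmp", F_OK) == 0 ? "yes" : "no",
	       access("old.tmp", F_OK) != 0 ? "yes" : "no");
	return unlink("new.tmp") != 0;
}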
*/ -+ warning("nikita-2324", "Target not found"); -+ result = RETERR(-ENOENT); -+ } -+ } else { -+ /* target (@new_name) doesn't exists. */ -+ if (result == CBK_COORD_NOTFOUND) -+ result = add_name(old_inode, -+ new_dir, -+ new_name, new_coord, new_lh, is_dir); -+ else if (result == CBK_COORD_FOUND) { -+ /* VFS told us that @new_name is "negative" dentry, -+ but we found directory entry. */ -+ warning("nikita-2331", "Target found unexpectedly"); -+ result = RETERR(-EIO); -+ } -+ } -+ -+ assert("nikita-3462", ergo(result == 0, -+ old_inode->i_nlink >= 2 + !!is_dir)); -+ -+ /* We are done with all modifications to the @new_dir, release lock on -+ node. */ -+ done_lh(new_lh); -+ -+ if (fplug != NULL) { -+ /* detach @new_inode from name-space */ -+ result = fplug->detach(new_inode, new_dir); -+ if (result != 0) -+ warning("nikita-2330", "Cannot detach %lli: %i. %s", -+ (unsigned long long)get_inode_oid(new_inode), -+ result, possible_leak); -+ } -+ -+ if (new_inode != NULL) -+ reiser4_update_sd(new_inode); -+ -+ if (result == 0) { -+ old_entry->obj = old_inode; -+ -+ dplug->build_entry_key(old_dir, -+ &old_name->d_name, &old_entry->key); -+ -+ /* At this stage new name was introduced for -+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink -+ counters were updated. -+ -+ We want to remove @old_name now. If @old_inode wasn't -+ directory this is simple. -+ */ -+ result = dplug->rem_entry(old_dir, old_name, old_entry); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2335", -+ "Cannot remove old name: %i", result); -+ } else { -+ result = reiser4_del_nlink(old_inode, old_dir, 0); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2337", -+ "Cannot drop link on old: %i", result); -+ } -+ } -+ -+ if (result == 0 && is_dir) { -+ /* @old_inode is directory. We also have to update -+ dotdot entry. */ -+ coord_t *dotdot_coord; -+ -+ memset(dataonstack, 0, sizeof dataonstack); -+ memset(dotdot_entry, 0, sizeof dotdot_entry); -+ dotdot_entry->obj = old_dir; -+ memset(dotdot_name, 0, sizeof dotdot_name); -+ dotdot_name->d_name.name = ".."; -+ dotdot_name->d_name.len = 2; -+ /* -+ * allocate ->d_fsdata on the stack to avoid using -+ * reiser4_get_dentry_fsdata(). Locking is not needed, -+ * because dentry is private to the current thread. 
-+ */ -+ dotdot_name->d_fsdata = dataonstack; -+ init_lh(dotdot_lh); -+ -+ dotdot_coord = &dataonstack->dec.entry_coord; -+ coord_clear_iplug(dotdot_coord); -+ -+ result = reiser4_find_entry(old_inode, dotdot_name, -+ dotdot_lh, ZNODE_WRITE_LOCK, -+ dotdot_entry); -+ if (result == 0) { -+ /* replace_name() decreases i_nlink on -+ * @old_dir */ -+ result = replace_name(new_dir, -+ old_inode, -+ old_dir, -+ dotdot_coord, dotdot_lh); -+ } else -+ result = RETERR(-EIO); -+ done_lh(dotdot_lh); -+ } -+ } -+ reiser4_update_dir(new_dir); -+ reiser4_update_dir(old_dir); -+ reiser4_update_sd(old_inode); -+ if (result == 0) { -+ file_plugin *fplug; -+ -+ if (new_inode != NULL) { -+ /* add safe-link for target file (in case we removed -+ * last reference to the poor fellow */ -+ fplug = inode_file_plugin(new_inode); -+ if (new_inode->i_nlink == 0) -+ result = safe_link_add(new_inode, SAFE_UNLINK); -+ } -+ } -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+#if 0 -+int reiser4_rename_common(struct inode *old_dir /* directory where @old -+ * is located */ , -+ struct dentry *old_name /* old name */ , -+ struct inode *new_dir /* directory where @new -+ * is located */ , -+ struct dentry *new_name/* new name */) -+{ -+ /* From `The Open Group Base Specifications Issue 6' -+ -+ If either the old or new argument names a symbolic link, rename() -+ shall operate on the symbolic link itself, and shall not resolve -+ the last component of the argument. If the old argument and the new -+ argument resolve to the same existing file, rename() shall return -+ successfully and perform no other action. -+ -+ [this is done by VFS: vfs_rename()] -+ -+ If the old argument points to the pathname of a file that is not a -+ directory, the new argument shall not point to the pathname of a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the link named by the new argument exists, it shall -+ be removed and old renamed to new. In this case, a link named new -+ shall remain visible to other processes throughout the renaming -+ operation and refer either to the file referred to by new or old -+ before the operation began. -+ -+ [we should assure this] -+ -+ Write access permission is required for -+ both the directory containing old and the directory containing new. -+ -+ [checked by VFS: vfs_rename->may_delete(), may_create()] -+ -+ If the old argument points to the pathname of a directory, the new -+ argument shall not point to the pathname of a file that is not a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the directory named by the new argument exists, it -+ shall be removed and old renamed to new. In this case, a link named -+ new shall exist throughout the renaming operation and shall refer -+ either to the directory referred to by new or old before the -+ operation began. -+ -+ [we should assure this] -+ -+ If new names an existing directory, it shall be -+ required to be an empty directory. -+ -+ [we should check this] -+ -+ If the old argument points to a pathname of a symbolic link, the -+ symbolic link shall be renamed. If the new argument points to a -+ pathname of a symbolic link, the symbolic link shall be removed. -+ -+ The new pathname shall not contain a path prefix that names -+ old. Write access permission is required for the directory -+ containing old and the directory containing new. 
If the old -+ argument points to the pathname of a directory, write access -+ permission may be required for the directory named by old, and, if -+ it exists, the directory named by new. -+ -+ [checked by VFS: vfs_rename(), vfs_rename_dir()] -+ -+ If the link named by the new argument exists and the file's link -+ count becomes 0 when it is removed and no process has the file -+ open, the space occupied by the file shall be freed and the file -+ shall no longer be accessible. If one or more processes have the -+ file open when the last link is removed, the link shall be removed -+ before rename() returns, but the removal of the file contents shall -+ be postponed until all references to the file are closed. -+ -+ [iput() handles this, but we can do this manually, a la -+ reiser4_unlink()] -+ -+ Upon successful completion, rename() shall mark for update the -+ st_ctime and st_mtime fields of the parent directory of each file. -+ -+ [N/A] -+ -+ */ -+ reiser4_context *ctx; -+ int result; -+ int is_dir; /* is @old_name directory */ -+ struct inode *old_inode; -+ struct inode *new_inode; -+ reiser4_dir_entry_desc old_entry; -+ reiser4_dir_entry_desc new_entry; -+ coord_t *new_coord; -+ struct reiser4_dentry_fsdata *new_fsdata; -+ lock_handle new_lh; -+ dir_plugin *dplug; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(old_dir->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-2318", old_dir != NULL); -+ assert("nikita-2319", new_dir != NULL); -+ assert("nikita-2320", old_name != NULL); -+ assert("nikita-2321", new_name != NULL); -+ -+ old_inode = old_name->d_inode; -+ new_inode = new_name->d_inode; -+ -+ dplug = inode_dir_plugin(old_dir); -+ fplug = NULL; -+ -+ new_fsdata = reiser4_get_dentry_fsdata(new_name); -+ if (IS_ERR(new_fsdata)) { -+ result = PTR_ERR(new_fsdata); -+ goto exit; -+ } -+ -+ new_coord = &new_fsdata->dec.entry_coord; -+ coord_clear_iplug(new_coord); -+ -+ is_dir = S_ISDIR(old_inode->i_mode); -+ -+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); -+ -+ /* if target is existing directory and it's not empty---return error. -+ -+ This check is done specifically, because is_dir_empty() requires -+ tree traversal and have to be done before locks are taken. -+ */ -+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) -+ return RETERR(-ENOTEMPTY); -+ -+ result = can_rename(old_dir, old_inode, new_dir, new_inode); -+ if (result != 0) -+ goto exit; -+ -+ result = hashed_rename_estimate_and_grab(old_dir, old_name, -+ new_dir, new_name); -+ if (result != 0) -+ goto exit; -+ -+ init_lh(&new_lh); -+ -+ /* find entry for @new_name */ -+ result = reiser4_find_entry(new_dir, new_name, &new_lh, -+ ZNODE_WRITE_LOCK, &new_entry); -+ -+ if (IS_CBKERR(result)) { -+ done_lh(&new_lh); -+ goto exit; -+ } -+ -+ reiser4_seal_done(&new_fsdata->dec.entry_seal); -+ -+ /* add or replace name for @old_inode as @new_name */ -+ if (new_inode != NULL) { -+ /* target (@new_name) exists. */ -+ /* Not clear what to do with objects that are -+ both directories and files at the same time. */ -+ if (result == CBK_COORD_FOUND) { -+ result = replace_name(old_inode, -+ new_dir, -+ new_inode, new_coord, &new_lh); -+ if (result == 0) -+ fplug = inode_file_plugin(new_inode); -+ } else if (result == CBK_COORD_NOTFOUND) { -+ /* VFS told us that @new_name is bound to existing -+ inode, but we failed to find directory entry. */ -+ warning("nikita-2324", "Target not found"); -+ result = RETERR(-ENOENT); -+ } -+ } else { -+ /* target (@new_name) doesn't exists. 
*/ -+ if (result == CBK_COORD_NOTFOUND) -+ result = add_name(old_inode, -+ new_dir, -+ new_name, new_coord, &new_lh, is_dir); -+ else if (result == CBK_COORD_FOUND) { -+ /* VFS told us that @new_name is "negative" dentry, -+ but we found directory entry. */ -+ warning("nikita-2331", "Target found unexpectedly"); -+ result = RETERR(-EIO); -+ } -+ } -+ -+ assert("nikita-3462", ergo(result == 0, -+ old_inode->i_nlink >= 2 + !!is_dir)); -+ -+ /* We are done with all modifications to the @new_dir, release lock on -+ node. */ -+ done_lh(&new_lh); -+ -+ if (fplug != NULL) { -+ /* detach @new_inode from name-space */ -+ result = fplug->detach(new_inode, new_dir); -+ if (result != 0) -+ warning("nikita-2330", "Cannot detach %lli: %i. %s", -+ (unsigned long long)get_inode_oid(new_inode), -+ result, possible_leak); -+ } -+ -+ if (new_inode != NULL) -+ reiser4_update_sd(new_inode); -+ -+ if (result == 0) { -+ memset(&old_entry, 0, sizeof old_entry); -+ old_entry.obj = old_inode; -+ -+ dplug->build_entry_key(old_dir, -+ &old_name->d_name, &old_entry.key); -+ -+ /* At this stage new name was introduced for -+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink -+ counters were updated. -+ -+ We want to remove @old_name now. If @old_inode wasn't -+ directory this is simple. -+ */ -+ result = dplug->rem_entry(old_dir, old_name, &old_entry); -+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */ -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2335", -+ "Cannot remove old name: %i", result); -+ } else { -+ result = reiser4_del_nlink(old_inode, old_dir, 0); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2337", -+ "Cannot drop link on old: %i", result); -+ } -+ } -+ -+ if (result == 0 && is_dir) { -+ /* @old_inode is directory. We also have to update -+ dotdot entry. */ -+ coord_t *dotdot_coord; -+ lock_handle dotdot_lh; -+ struct dentry dotdot_name; -+ reiser4_dir_entry_desc dotdot_entry; -+ struct reiser4_dentry_fsdata dataonstack; -+ struct reiser4_dentry_fsdata *fsdata; -+ -+ memset(&dataonstack, 0, sizeof dataonstack); -+ memset(&dotdot_entry, 0, sizeof dotdot_entry); -+ dotdot_entry.obj = old_dir; -+ memset(&dotdot_name, 0, sizeof dotdot_name); -+ dotdot_name.d_name.name = ".."; -+ dotdot_name.d_name.len = 2; -+ /* -+ * allocate ->d_fsdata on the stack to avoid using -+ * reiser4_get_dentry_fsdata(). Locking is not needed, -+ * because dentry is private to the current thread. 
-+ */ -+ dotdot_name.d_fsdata = &dataonstack; -+ init_lh(&dotdot_lh); -+ -+ fsdata = &dataonstack; -+ dotdot_coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(dotdot_coord); -+ -+ result = reiser4_find_entry(old_inode, -+ &dotdot_name, -+ &dotdot_lh, -+ ZNODE_WRITE_LOCK, -+ &dotdot_entry); -+ if (result == 0) { -+ /* replace_name() decreases i_nlink on -+ * @old_dir */ -+ result = replace_name(new_dir, -+ old_inode, -+ old_dir, -+ dotdot_coord, &dotdot_lh); -+ } else -+ result = RETERR(-EIO); -+ done_lh(&dotdot_lh); -+ } -+ } -+ reiser4_update_dir(new_dir); -+ reiser4_update_dir(old_dir); -+ reiser4_update_sd(old_inode); -+ if (result == 0) { -+ file_plugin *fplug; -+ -+ if (new_inode != NULL) { -+ /* add safe-link for target file (in case we removed -+ * last reference to the poor fellow */ -+ fplug = inode_file_plugin(new_inode); -+ if (new_inode->i_nlink == 0) -+ result = safe_link_add(new_inode, SAFE_UNLINK); -+ } -+ } -+exit: -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+#endif -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/acl.h linux-2.6.30/fs/reiser4/plugin/item/acl.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/acl.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,66 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) -+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+#include <linux/fs.h> -+#include <linux/dcache.h> /* for struct dentry */ -+ -+typedef struct directory_entry_format { -+ /* key of object stat-data. It's not necessary to store whole -+ key here, because it's always key of stat-data, so minor -+ packing locality and offset can be omitted here. But this -+ relies on particular key allocation scheme for stat-data, so, -+ for extensibility sake, whole key can be stored here. -+ -+ We store key as array of bytes, because we don't want 8-byte -+ alignment of dir entries. -+ */ -+ obj_key_id id; -+ /* file name. Null terminated string. */ -+ d8 name[0]; -+} directory_entry_format; -+ -+void print_de(const char *prefix, coord_t * coord); -+int extract_key_de(const coord_t * coord, reiser4_key * key); -+int update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_de(const coord_t * coord, char *buf); -+unsigned extract_file_type_de(const coord_t * coord); -+int add_entry_de(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_de(const struct inode *dir); -+ -+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); -+ -+char *extract_dent_name(const coord_t * coord, -+ directory_entry_format * dent, char *buf); -+ -+#if REISER4_LARGE_KEY -+#define DE_NAME_BUF_LEN (24) -+#else -+#define DE_NAME_BUF_LEN (16) -+#endif -+ -+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.30/fs/reiser4/plugin/item/blackbox.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/blackbox.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,142 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Black box item implementation */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../coord.h" -+#include "../../tree.h" -+#include "../../lock.h" -+ -+#include "blackbox.h" -+#include "item.h" -+#include "../plugin.h" -+ -+int -+store_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length) -+{ -+ int result; -+ reiser4_item_data idata; -+ coord_t coord; -+ lock_handle lh; -+ -+ memset(&idata, 0, sizeof idata); -+ -+ idata.data = data; -+ idata.user = 0; -+ idata.length = length; -+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID); -+ -+ init_lh(&lh); -+ result = insert_by_key(tree, key, -+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE); -+ -+ assert("nikita-3413", -+ ergo(result == 0, -+ WITH_COORD(&coord, -+ item_length_by_coord(&coord) == length))); -+ -+ done_lh(&lh); -+ return result; -+} -+ -+int -+load_black_box(reiser4_tree * tree, -+ reiser4_key * key, void *data, int length, int exact) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = coord_by_key(tree, key, -+ &coord, &lh, ZNODE_READ_LOCK, -+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN, -+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); -+ -+ if (result == 0) { -+ int ilen; -+ -+ result = zload(coord.node); -+ if (result == 0) { -+ ilen = item_length_by_coord(&coord); -+ if (ilen <= length) { -+ memcpy(data, item_body_by_coord(&coord), ilen); -+ unit_key_by_coord(&coord, key); -+ } else if (exact) { -+ /* -+ * item is larger than buffer provided by the -+ * user. Only issue a warning if @exact is -+ * set. If @exact is false, we are iterating -+ * over all safe-links and here we are reaching -+ * the end of the iteration. -+ */ -+ warning("nikita-3415", -+ "Wrong black box length: %i > %i", -+ ilen, length); -+ result = RETERR(-EIO); -+ } -+ zrelse(coord.node); -+ } -+ } -+ -+ done_lh(&lh); -+ return result; -+ -+} -+ -+int -+update_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = coord_by_key(tree, key, -+ &coord, &lh, ZNODE_READ_LOCK, -+ FIND_EXACT, -+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); -+ if (result == 0) { -+ int ilen; -+ -+ result = zload(coord.node); -+ if (result == 0) { -+ ilen = item_length_by_coord(&coord); -+ if (length <= ilen) { -+ memcpy(item_body_by_coord(&coord), data, -+ length); -+ } else { -+ warning("nikita-3437", -+ "Wrong black box length: %i < %i", -+ ilen, length); -+ result = RETERR(-EIO); -+ } -+ zrelse(coord.node); -+ } -+ } -+ -+ done_lh(&lh); -+ return result; -+ -+} -+ -+int kill_black_box(reiser4_tree * tree, const reiser4_key * key) -+{ -+ return reiser4_cut_tree(tree, key, key, NULL, 1); -+} -+ -+/* Make Linus happy. 
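Taken together, store_black_box(), load_black_box(), update_black_box() and kill_black_box() above form a small fixed-size record store keyed by a reiser4_key: a load with @exact set treats an over-sized on-disk item as -EIO instead of truncating it, and an update must not grow the item. A hedged in-kernel usage sketch against exactly these signatures; the record layout is illustrative, and the caller is assumed to supply @tree and @key:

/* Persist a small fixed-size record under @key, read it back,
 * overwrite it in place, then delete it. */
static int blackbox_roundtrip(reiser4_tree * tree, reiser4_key * key)
{
	struct { __u64 oid; __u32 flags; } rec = { .oid = 42, .flags = 1 };
	int result;

	result = store_black_box(tree, key, &rec, sizeof rec);
	if (result != 0)
		return result;

	/* @exact set: the key must match, and an over-sized on-disk
	 * item is reported as an error rather than truncated */
	result = load_black_box(tree, key, &rec, sizeof rec, 1);
	if (result != 0)
		return result;

	/* in-place overwrite; the item must not grow */
	rec.flags = 2;
	result = update_black_box(tree, key, &rec, sizeof rec);
	if (result != 0)
		return result;

	return kill_black_box(tree, key);
}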
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.30/fs/reiser4/plugin/item/blackbox.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/blackbox.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,33 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* "Black box" entry containing fixed-width user-supplied data */ -+ -+#if !defined( __FS_REISER4_BLACK_BOX_H__ ) -+#define __FS_REISER4_BLACK_BOX_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+extern int store_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length); -+extern int load_black_box(reiser4_tree * tree, -+ reiser4_key * key, void *data, int length, int exact); -+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key); -+extern int update_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length); -+ -+/* __FS_REISER4_BLACK_BOX_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/cde.c linux-2.6.30/fs/reiser4/plugin/item/cde.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/cde.c 2009-06-22 17:27:31.000000000 +0200 -@@ -0,0 +1,1008 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry implementation */ -+ -+/* DESCRIPTION: -+ -+ This is the "compound" directory item plugin implementation. This directory -+ item type is compound (as opposed to the "simple directory item" in -+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory -+ entries. -+ -+ The reason behind this decision is disk space efficiency: all directory -+ entries inside the same directory have an identical fragment in their -+ keys. This, of course, depends on the key assignment policy. In our default key -+ assignment policy, all directory entries have the same locality, which is -+ equal to the object id of their directory. -+ -+ Composing a directory item out of several directory entries for the same -+ directory allows us to store said key fragment only once. That is, this is -+ some ad hoc form of key compression (stem compression) that is implemented -+ here, because general key compression is not supposed to be implemented in -+ v4.0. -+ -+ Another decision that was made regarding all directory item plugins is -+ that they will store entry keys unaligned. This is for the sake of disk -+ space efficiency again. -+ -+ It should be noted that storing keys unaligned increases CPU consumption, -+ at least on some architectures. -+ -+ Internal on-disk structure of the compound directory item is the following: -+ -+ HEADER cde_item_format. Here number of entries is stored. -+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and -+ ENTRY_HEADER_1 offset of entry body are stored. -+ ENTRY_HEADER_2 (basically two last parts of key) -+ ... -+ ENTRY_HEADER_N -+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and -+ ENTRY_BODY_1 NUL-terminated name are stored.
-+ ENTRY_BODY_2 (part of stat-data key in the -+ sense that since all SDs have -+ zero offset, this offset is not -+ stored on disk). -+ ... -+ ENTRY_BODY_N -+ -+ When it comes to balancing, each directory entry in a compound directory -+ item is a unit, that is, something that can be cut from one item and pasted -+ into another item of the same type. Handling of unit cut and paste is the major -+ reason for the complexity of the code below. -+ -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "sde.h" -+#include "cde.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+ -+#include <linux/fs.h> /* for struct inode */ -+#include <linux/dcache.h> /* for struct dentry */ -+#include <linux/quotaops.h> -+ -+#if 0 -+#define CHECKME(coord) \ -+({ \ -+ const char *message; \ -+ coord_t dup; \ -+ \ -+ coord_dup_nocheck(&dup, (coord)); \ -+ dup.unit_pos = 0; \ -+ assert("nikita-2871", cde_check(&dup, &message) == 0); \ -+}) -+#else -+#define CHECKME(coord) noop -+#endif -+ -+/* return body of compound directory item at @coord */ -+static inline cde_item_format *formatted_at(const coord_t * coord) -+{ -+ assert("nikita-1282", coord != NULL); -+ return item_body_by_coord(coord); -+} -+ -+/* return entry header at @coord */ -+static inline cde_unit_header *header_at(const coord_t * -+ coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ assert("nikita-1283", coord != NULL); -+ return &formatted_at(coord)->entry[idx]; -+} -+ -+/* return number of units in compound directory item at @coord */ -+static int units(const coord_t * coord /* coord of item */ ) -+{ -+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries)); -+} -+ -+/* return offset of the body of @idx-th entry in @coord */ -+static unsigned int offset_of(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ if (idx < units(coord)) -+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset)); -+ else if (idx == units(coord)) -+ return item_length_by_coord(coord); -+ else -+ impossible("nikita-1308", "Wrong idx"); -+ return 0; -+} -+ -+/* set offset of the body of @idx-th entry in @coord */ -+static void set_offset(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ , -+ unsigned int offset /* new offset */ ) -+{ -+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset); -+} -+ -+static void adj_offset(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ , -+ int delta /* offset change */ ) -+{ -+ d16 *doffset; -+ __u16 offset; -+ -+ doffset = &header_at(coord, idx)->offset; -+ offset = le16_to_cpu(get_unaligned(doffset)); -+ offset += delta; -+ put_unaligned(cpu_to_le16((__u16) offset), doffset); -+} -+ -+/* return pointer to @offset-th byte from the beginning of @coord */ -+static char *address(const coord_t * coord /* coord of item */ , -+ int offset) -+{ -+ return ((char *)item_body_by_coord(coord)) + offset; -+} -+ -+/* return pointer to the body of @idx-th entry in @coord */ -+static directory_entry_format *entry_at(const coord_t * coord /* coord of -+ * item */ , -+ int idx /* index of unit */ ) -+{ -+ return (directory_entry_format *) address(coord, -+ (int)offset_of(coord, idx)); -+} -+ -+/* return number of unit referenced by @coord */ -+static int
idx_of(const coord_t * coord /* coord of item */ ) -+{ -+ assert("nikita-1285", coord != NULL); -+ return coord->unit_pos; -+} -+ -+/* find position where entry with @entry_key would be inserted into @coord */ -+static int find(const coord_t * coord /* coord of item */ , -+ const reiser4_key * entry_key /* key to look for */ , -+ cmp_t * last /* result of last comparison */ ) -+{ -+ int entries; -+ -+ int left; -+ int right; -+ -+ cde_unit_header *header; -+ -+ assert("nikita-1295", coord != NULL); -+ assert("nikita-1296", entry_key != NULL); -+ assert("nikita-1297", last != NULL); -+ -+ entries = units(coord); -+ left = 0; -+ right = entries - 1; -+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { -+ int median; -+ -+ median = (left + right) >> 1; -+ -+ header = header_at(coord, median); -+ *last = de_id_key_cmp(&header->hash, entry_key); -+ switch (*last) { -+ case LESS_THAN: -+ left = median; -+ break; -+ case GREATER_THAN: -+ right = median; -+ break; -+ case EQUAL_TO:{ -+ do { -+ median--; -+ header--; -+ } while (median >= 0 && -+ de_id_key_cmp(&header->hash, -+ entry_key) == EQUAL_TO); -+ return median + 1; -+ } -+ } -+ } -+ header = header_at(coord, left); -+ for (; left < entries; ++left, ++header) { -+ prefetch(header + 1); -+ *last = de_id_key_cmp(&header->hash, entry_key); -+ if (*last != LESS_THAN) -+ break; -+ } -+ if (left < entries) -+ return left; -+ else -+ return RETERR(-ENOENT); -+ -+} -+ -+/* expand @coord as to accommodate for insertion of @no new entries starting -+ from @pos, with total bodies size @size. */ -+static int expand_item(const coord_t * coord /* coord of item */ , -+ int pos /* unit position */ , int no /* number of new -+ * units*/ , -+ int size /* total size of new units' data */ , -+ unsigned int data_size /* free space already reserved -+ * in the item for insertion */ ) -+{ -+ int entries; -+ cde_unit_header *header; -+ char *dent; -+ int i; -+ -+ assert("nikita-1310", coord != NULL); -+ assert("nikita-1311", pos >= 0); -+ assert("nikita-1312", no > 0); -+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format)); -+ assert("nikita-1343", -+ item_length_by_coord(coord) >= -+ (int)(size + data_size + no * sizeof *header)); -+ -+ entries = units(coord); -+ -+ if (pos == entries) -+ dent = address(coord, size); -+ else -+ dent = (char *)entry_at(coord, pos); -+ /* place where new header will be in */ -+ header = header_at(coord, pos); -+ /* free space for new entry headers */ -+ memmove(header + no, header, -+ (unsigned)(address(coord, size) - (char *)header)); -+ /* if adding to the end initialise first new header */ -+ if (pos == entries) { -+ set_offset(coord, pos, (unsigned)size); -+ } -+ -+ /* adjust entry pointer and size */ -+ dent = dent + no * sizeof *header; -+ size += no * sizeof *header; -+ /* free space for new entries */ -+ memmove(dent + data_size, dent, -+ (unsigned)(address(coord, size) - dent)); -+ -+ /* increase counter */ -+ entries += no; -+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries); -+ -+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header ) -+ bytes. */ -+ for (i = 0; i <= pos; ++i) -+ adj_offset(coord, i, no * sizeof *header); -+ /* [ pos + no ... 
+\infty ) entries were shifted by ( no * -+ sizeof *header + data_size ) bytes */ -+ for (i = pos + no; i < entries; ++i) -+ adj_offset(coord, i, no * sizeof *header + data_size); -+ return 0; -+} -+ -+/* insert new @entry into item */ -+static int expand(const coord_t * coord /* coord of item */ , -+ struct cde_entry * entry /* entry to insert */ , -+ int len /* length of @entry data */ , -+ int *pos /* position to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters for new -+ * entry */ ) -+{ -+ cmp_t cmp_res; -+ int datasize; -+ -+ *pos = find(coord, &dir_entry->key, &cmp_res); -+ if (*pos < 0) -+ *pos = units(coord); -+ -+ datasize = sizeof(directory_entry_format); -+ if (is_longname(entry->name->name, entry->name->len)) -+ datasize += entry->name->len + 1; -+ -+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len, -+ datasize); -+ return 0; -+} -+ -+/* paste body of @entry into item */ -+static int paste_entry(const coord_t * coord /* coord of item */ , -+ struct cde_entry * entry /* new entry */ , -+ int pos /* position to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters for -+ * new entry */ ) -+{ -+ cde_unit_header *header; -+ directory_entry_format *dent; -+ const char *name; -+ int len; -+ -+ header = header_at(coord, pos); -+ dent = entry_at(coord, pos); -+ -+ build_de_id_by_key(&dir_entry->key, &header->hash); -+ build_inode_key_id(entry->obj, &dent->id); -+ /* AUDIT unsafe strcpy() operation! It should be replaced with -+ much less CPU hungry -+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len ); -+ -+ Also a more major thing is that there should be a way to figure out -+ amount of space in dent -> name and be able to check that we are -+ not going to overwrite more than we supposed to */ -+ name = entry->name->name; -+ len = entry->name->len; -+ if (is_longname(name, len)) { -+ strcpy((unsigned char *)dent->name, name); -+ put_unaligned(0, &dent->name[len]); -+ } -+ return 0; -+} -+ -+/* estimate how much space is necessary in item to insert/paste set of entries -+ described in @data. */ -+int estimate_cde(const coord_t * coord /* coord of item */ , -+ const reiser4_item_data * data /* parameters for new item */ ) -+{ -+ struct cde_entry_data *e; -+ int result; -+ int i; -+ -+ e = (struct cde_entry_data *) data->data; -+ -+ assert("nikita-1288", e != NULL); -+ assert("nikita-1289", e->num_of_entries >= 0); -+ -+ if (coord == NULL) -+ /* insert */ -+ result = sizeof(cde_item_format); -+ else -+ /* paste */ -+ result = 0; -+ -+ result += e->num_of_entries * -+ (sizeof(cde_unit_header) + sizeof(directory_entry_format)); -+ for (i = 0; i < e->num_of_entries; ++i) { -+ const char *name; -+ int len; -+ -+ name = e->entry[i].name->name; -+ len = e->entry[i].name->len; -+ assert("nikita-2054", strlen(name) == len); -+ if (is_longname(name, len)) -+ result += len + 1; -+ } -+ ((reiser4_item_data *) data)->length = result; -+ return result; -+} -+ -+/* ->nr_units() method for this item plugin. */ -+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ ) -+{ -+ return units(coord); -+} -+ -+/* ->unit_key() method for this item plugin. 
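As an aside: estimate_cde() above is pure size arithmetic. A brand-new item pays once for the item header; every entry then costs one unit header plus one entry body, plus the name bytes and a terminating NUL whenever the name is too long to be encoded in the key. The same computation as a standalone sketch; the sizes and the long-name cutoff are illustrative placeholders, not the real structure widths:

    #include <stddef.h>
    #include <string.h>

    /* Stand-ins for sizeof(cde_item_format), sizeof(cde_unit_header)
       and sizeof(directory_entry_format); the cutoff stands in for
       whatever is_longname() tests. */
    #define DEMO_ITEM_HDR    2u
    #define DEMO_UNIT_HDR   18u
    #define DEMO_ENTRY_HDR  16u
    #define DEMO_LONGNAME   15u

    static size_t demo_estimate_cde(int creating_new_item,
                                    const char *const *names, int n)
    {
            size_t total = creating_new_item ? DEMO_ITEM_HDR : 0;
            int i;

            for (i = 0; i < n; i++) {
                    size_t len = strlen(names[i]);

                    /* every entry: one unit header plus one entry body */
                    total += DEMO_UNIT_HDR + DEMO_ENTRY_HDR;
                    /* long names do not fit into the key and are stored
                       NUL-terminated after the body header */
                    if (len > DEMO_LONGNAME)
                            total += len + 1;
            }
            return total;
    }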
*/ -+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ assert("nikita-1452", coord != NULL); -+ assert("nikita-1345", idx_of(coord) < units(coord)); -+ assert("nikita-1346", key != NULL); -+ -+ item_key_by_coord(coord, key); -+ extract_key_from_de_id(extract_dir_id_from_key(key), -+ &header_at(coord, idx_of(coord))->hash, key); -+ return key; -+} -+ -+/* mergeable_cde(): implementation of ->mergeable() item method. -+ -+ Two directory items are mergeable iff they are from the same -+ directory. That simple. -+ -+*/ -+int mergeable_cde(const coord_t * p1 /* coord of first item */ , -+ const coord_t * p2 /* coord of second item */ ) -+{ -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ assert("nikita-1339", p1 != NULL); -+ assert("nikita-1340", p2 != NULL); -+ -+ return -+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) && -+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) == -+ extract_dir_id_from_key(item_key_by_coord(p2, &k2))); -+ -+} -+ -+/* ->max_key_inside() method for this item plugin. */ -+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * result /* resulting key */ ) -+{ -+ assert("nikita-1342", coord != NULL); -+ -+ item_key_by_coord(coord, result); -+ set_key_ordering(result, get_key_ordering(reiser4_max_key())); -+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key())); -+ set_key_offset(result, get_key_offset(reiser4_max_key())); -+ return result; -+} -+ -+/* @data contains data which are to be put into tree */ -+int can_contain_key_cde(const coord_t * coord /* coord of item */ , -+ const reiser4_key * key /* key to check */ , -+ const reiser4_item_data * data /* parameters of new -+ * item/unit being -+ * created */ ) -+{ -+ reiser4_key item_key; -+ -+ /* FIXME-VS: do not rely on anything but iplug field of @data. 
Only -+ data->iplug is initialized */ -+ assert("vs-457", data && data->iplug); -+/* assert( "vs-553", data -> user == 0 );*/ -+ item_key_by_coord(coord, &item_key); -+ -+ return (item_plugin_by_coord(coord) == data->iplug) && -+ (extract_dir_id_from_key(&item_key) == -+ extract_dir_id_from_key(key)); -+} -+ -+#if REISER4_DEBUG -+/* cde_check ->check() method for compressed directory items -+ -+ used for debugging, every item should have here the most complete -+ possible check of the consistency of the item that the inventor can -+ construct -+*/ -+int reiser4_check_cde(const coord_t * coord /* coord of item to check */, -+ const char **error /* where to store error message */) -+{ -+ int i; -+ int result; -+ char *item_start; -+ char *item_end; -+ reiser4_key key; -+ -+ coord_t c; -+ -+ assert("nikita-1357", coord != NULL); -+ assert("nikita-1358", error != NULL); -+ -+ if (!ergo(coord->item_pos != 0, -+ is_dot_key(item_key_by_coord(coord, &key)))) { -+ *error = "CDE doesn't start with dot"; -+ return -1; -+ } -+ item_start = item_body_by_coord(coord); -+ item_end = item_start + item_length_by_coord(coord); -+ -+ coord_dup(&c, coord); -+ result = 0; -+ for (i = 0; i < units(coord); ++i) { -+ directory_entry_format *entry; -+ -+ if ((char *)(header_at(coord, i) + 1) > -+ item_end - units(coord) * sizeof *entry) { -+ *error = "CDE header is out of bounds"; -+ result = -1; -+ break; -+ } -+ entry = entry_at(coord, i); -+ if ((char *)entry < item_start + sizeof(cde_item_format)) { -+ *error = "CDE header is too low"; -+ result = -1; -+ break; -+ } -+ if ((char *)(entry + 1) > item_end) { -+ *error = "CDE header is too high"; -+ result = -1; -+ break; -+ } -+ } -+ -+ return result; -+} -+#endif -+ -+/* ->init() method for this item plugin. */ -+int init_cde(coord_t * coord /* coord of item */ , -+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */ -+ UNUSED_ARG) -+{ -+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries); -+ return 0; -+} -+ -+/* ->lookup() method for this item plugin. */ -+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ , -+ lookup_bias bias /* search bias */ , -+ coord_t * coord /* coord of item to lookup in */ ) -+{ -+ cmp_t last_comp; -+ int pos; -+ -+ reiser4_key utmost_key; -+ -+ assert("nikita-1293", coord != NULL); -+ assert("nikita-1294", key != NULL); -+ -+ CHECKME(coord); -+ -+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) { -+ coord->unit_pos = 0; -+ coord->between = BEFORE_UNIT; -+ return CBK_COORD_NOTFOUND; -+ } -+ pos = find(coord, key, &last_comp); -+ if (pos >= 0) { -+ coord->unit_pos = (int)pos; -+ switch (last_comp) { -+ case EQUAL_TO: -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ case GREATER_THAN: -+ coord->between = BEFORE_UNIT; -+ return RETERR(-ENOENT); -+ case LESS_THAN: -+ default: -+ impossible("nikita-1298", "Broken find"); -+ return RETERR(-EIO); -+ } -+ } else { -+ coord->unit_pos = units(coord) - 1; -+ coord->between = AFTER_UNIT; -+ return (bias == -+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND : -+ CBK_COORD_NOTFOUND; -+ } -+} -+ -+/* ->paste() method for this item plugin. 
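As an aside: lookup_cde() below delegates the real work to the find() helper shown earlier, a binary search that narrows the window only down to REISER4_SEQ_SEARCH_BREAK units, finishes with a linear scan (the real code prefetches the next header), and rewinds over equal keys so the leftmost duplicate wins. The same hybrid search over a plain sorted int array, as a sketch with an illustrative break constant:

    #define DEMO_SEQ_SEARCH_BREAK 4  /* illustrative; the kernel tunes its own */

    /* Return the index of the first element >= key, or n if none. */
    static int demo_hybrid_find(const int *a, int n, int key)
    {
            int left = 0;
            int right = n - 1;

            while (right - left >= DEMO_SEQ_SEARCH_BREAK) {
                    int mid = (left + right) >> 1;

                    if (a[mid] < key)
                            left = mid;
                    else if (a[mid] > key)
                            right = mid;
                    else {
                            /* rewind over duplicates: leftmost match wins */
                            while (mid > 0 && a[mid - 1] == key)
                                    mid--;
                            return mid;
                    }
            }
            /* window is small: finish sequentially */
            for (; left < n; ++left)
                    if (a[left] >= key)
                            break;
            return left;
    }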
*/ -+int paste_cde(coord_t * coord /* coord of item */ , -+ reiser4_item_data * data /* parameters of new unit being -+ * inserted */ , -+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ ) -+{ -+ struct cde_entry_data *e; -+ int result; -+ int i; -+ -+ CHECKME(coord); -+ e = (struct cde_entry_data *) data->data; -+ -+ result = 0; -+ for (i = 0; i < e->num_of_entries; ++i) { -+ int pos; -+ int phantom_size; -+ -+ phantom_size = data->length; -+ if (units(coord) == 0) -+ phantom_size -= sizeof(cde_item_format); -+ -+ result = -+ expand(coord, e->entry + i, phantom_size, &pos, data->arg); -+ if (result != 0) -+ break; -+ result = paste_entry(coord, e->entry + i, pos, data->arg); -+ if (result != 0) -+ break; -+ } -+ CHECKME(coord); -+ return result; -+} -+ -+/* amount of space occupied by all entries starting from @idx both headers and -+ bodies. */ -+static unsigned int part_size(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ assert("nikita-1299", coord != NULL); -+ assert("nikita-1300", idx < (int)units(coord)); -+ -+ return sizeof(cde_item_format) + -+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord, -+ idx + 1) - -+ offset_of(coord, 0); -+} -+ -+/* how many but not more than @want units of @source can be merged with -+ item in @target node. If pend == append - we try to append last item -+ of @target by first units of @source. If pend == prepend - we try to -+ "prepend" first item in @target by last units of @source. @target -+ node has @free_space bytes of free space. Total size of those units -+ are returned via @size */ -+int can_shift_cde(unsigned free_space /* free space in item */ , -+ coord_t * coord /* coord of source item */ , -+ znode * target /* target node */ , -+ shift_direction pend /* shift direction */ , -+ unsigned *size /* resulting number of shifted bytes */ , -+ unsigned want /* maximal number of bytes to shift */ ) -+{ -+ int shift; -+ -+ CHECKME(coord); -+ if (want == 0) { -+ *size = 0; -+ return 0; -+ } -+ -+ /* pend == SHIFT_LEFT <==> shifting to the left */ -+ if (pend == SHIFT_LEFT) { -+ for (shift = min((int)want - 1, units(coord)); shift >= 0; -+ --shift) { -+ *size = part_size(coord, shift); -+ if (target != NULL) -+ *size -= sizeof(cde_item_format); -+ if (*size <= free_space) -+ break; -+ } -+ shift = shift + 1; -+ } else { -+ int total_size; -+ -+ assert("nikita-1301", pend == SHIFT_RIGHT); -+ -+ total_size = item_length_by_coord(coord); -+ for (shift = units(coord) - want - 1; shift < units(coord) - 1; -+ ++shift) { -+ *size = total_size - part_size(coord, shift); -+ if (target == NULL) -+ *size += sizeof(cde_item_format); -+ if (*size <= free_space) -+ break; -+ } -+ shift = units(coord) - shift - 1; -+ } -+ if (shift == 0) -+ *size = 0; -+ CHECKME(coord); -+ return shift; -+} -+ -+/* ->copy_units() method for this item plugin. 
*/ -+void copy_units_cde(coord_t * target /* coord of target item */ , -+ coord_t * source /* coord of source item */ , -+ unsigned from /* starting unit */ , -+ unsigned count /* how many units to copy */ , -+ shift_direction where_is_free_space /* shift direction */ , -+ unsigned free_space /* free space in item */ ) -+{ -+ char *header_from; -+ char *header_to; -+ -+ char *entry_from; -+ char *entry_to; -+ -+ int pos_in_target; -+ int data_size; -+ int data_delta; -+ int i; -+ -+ assert("nikita-1303", target != NULL); -+ assert("nikita-1304", source != NULL); -+ assert("nikita-1305", (int)from < units(source)); -+ assert("nikita-1307", (int)(from + count) <= units(source)); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ assert("nikita-1453", from == 0); -+ pos_in_target = units(target); -+ } else { -+ assert("nikita-1309", (int)(from + count) == units(source)); -+ pos_in_target = 0; -+ memmove(item_body_by_coord(target), -+ (char *)item_body_by_coord(target) + free_space, -+ item_length_by_coord(target) - free_space); -+ } -+ -+ CHECKME(target); -+ CHECKME(source); -+ -+ /* expand @target */ -+ data_size = -+ offset_of(source, (int)(from + count)) - offset_of(source, -+ (int)from); -+ -+ if (units(target) == 0) -+ free_space -= sizeof(cde_item_format); -+ -+ expand_item(target, pos_in_target, (int)count, -+ (int)(item_length_by_coord(target) - free_space), -+ (unsigned)data_size); -+ -+ /* copy first @count units of @source into @target */ -+ data_delta = -+ offset_of(target, pos_in_target) - offset_of(source, (int)from); -+ -+ /* copy entries */ -+ entry_from = (char *)entry_at(source, (int)from); -+ entry_to = (char *)entry_at(source, (int)(from + count)); -+ memmove(entry_at(target, pos_in_target), entry_from, -+ (unsigned)(entry_to - entry_from)); -+ -+ /* copy headers */ -+ header_from = (char *)header_at(source, (int)from); -+ header_to = (char *)header_at(source, (int)(from + count)); -+ memmove(header_at(target, pos_in_target), header_from, -+ (unsigned)(header_to - header_from)); -+ -+ /* update offsets */ -+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i) -+ adj_offset(target, i, data_delta); -+ CHECKME(target); -+ CHECKME(source); -+} -+ -+/* ->cut_units() method for this item plugin. 
*/ -+int cut_units_cde(coord_t * coord /* coord of item */ , -+ pos_in_node_t from /* start unit pos */ , -+ pos_in_node_t to /* stop unit pos */ , -+ struct carry_cut_data *cdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ char *header_from; -+ char *header_to; -+ -+ char *entry_from; -+ char *entry_to; -+ -+ int size; -+ int entry_delta; -+ int header_delta; -+ int i; -+ -+ unsigned count; -+ -+ CHECKME(coord); -+ -+ count = to - from + 1; -+ -+ assert("nikita-1454", coord != NULL); -+ assert("nikita-1455", (int)(from + count) <= units(coord)); -+ -+ if (smallest_removed) -+ unit_key_by_coord(coord, smallest_removed); -+ -+ if (new_first) { -+ coord_t next; -+ -+ /* not everything is cut from item head */ -+ assert("vs-1527", from == 0); -+ assert("vs-1528", to < units(coord) - 1); -+ -+ coord_dup(&next, coord); -+ next.unit_pos++; -+ unit_key_by_coord(&next, new_first); -+ } -+ -+ size = item_length_by_coord(coord); -+ if (count == (unsigned)units(coord)) { -+ return size; -+ } -+ -+ header_from = (char *)header_at(coord, (int)from); -+ header_to = (char *)header_at(coord, (int)(from + count)); -+ -+ entry_from = (char *)entry_at(coord, (int)from); -+ entry_to = (char *)entry_at(coord, (int)(from + count)); -+ -+ /* move headers */ -+ memmove(header_from, header_to, -+ (unsigned)(address(coord, size) - header_to)); -+ -+ header_delta = header_to - header_from; -+ -+ entry_from -= header_delta; -+ entry_to -= header_delta; -+ size -= header_delta; -+ -+ /* copy entries */ -+ memmove(entry_from, entry_to, -+ (unsigned)(address(coord, size) - entry_to)); -+ -+ entry_delta = entry_to - entry_from; -+ size -= entry_delta; -+ -+ /* update offsets */ -+ -+ for (i = 0; i < (int)from; ++i) -+ adj_offset(coord, i, -header_delta); -+ -+ for (i = from; i < units(coord) - (int)count; ++i) -+ adj_offset(coord, i, -header_delta - entry_delta); -+ -+ put_unaligned(cpu_to_le16((__u16) units(coord) - count), -+ &formatted_at(coord)->num_of_entries); -+ -+ if (from == 0) { -+ /* entries from head was removed - move remaining to right */ -+ memmove((char *)item_body_by_coord(coord) + -+ header_delta + entry_delta, item_body_by_coord(coord), -+ (unsigned)size); -+ if (REISER4_DEBUG) -+ memset(item_body_by_coord(coord), 0, -+ (unsigned)header_delta + entry_delta); -+ } else { -+ /* freed space is already at the end of item */ -+ if (REISER4_DEBUG) -+ memset((char *)item_body_by_coord(coord) + size, 0, -+ (unsigned)header_delta + entry_delta); -+ } -+ -+ return header_delta + entry_delta; -+} -+ -+int kill_units_cde(coord_t * coord /* coord of item */ , -+ pos_in_node_t from /* start unit pos */ , -+ pos_in_node_t to /* stop unit pos */ , -+ struct carry_kill_data *kdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first); -+} -+ -+/* ->s.dir.extract_key() method for this item plugin. 
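As an aside: cut_units_cde() below is essentially two memmove()s plus offset fixups: close the gap in the header table, close the gap in the body area, then re-base the offsets of the surviving units. A simplified sketch of that bookkeeping follows. Unlike the real item, headers and bodies here live in separate arrays, so only the body delta shows up in the offset adjustment; in the real layout the removed header bytes shift the bodies as well. All sizes are illustrative.

    #include <string.h>

    struct demo_unit { unsigned short offset; };

    struct demo_item {
            int nunits;
            struct demo_unit hdr[16];
            unsigned char bytes[256];  /* packed bodies, ascending offsets */
            int length;                /* bytes used in bytes[] */
    };

    static void demo_cut_units(struct demo_item *it, int from, int to)
    {
            int count = to - from + 1;
            int body_lo = it->hdr[from].offset;
            int body_hi = (to + 1 < it->nunits) ?
                    it->hdr[to + 1].offset : it->length;
            int body_delta = body_hi - body_lo;
            int i;

            /* close the gap in the body area */
            memmove(it->bytes + body_lo, it->bytes + body_hi,
                    it->length - body_hi);
            /* close the gap in the header table */
            memmove(&it->hdr[from], &it->hdr[to + 1],
                    (it->nunits - to - 1) * sizeof(struct demo_unit));

            it->nunits -= count;
            it->length -= body_delta;

            /* survivors past the cut point moved down by body_delta */
            for (i = from; i < it->nunits; i++)
                    it->hdr[i].offset -= body_delta;
    }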
*/ -+int extract_key_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1155", coord != NULL); -+ assert("nikita-1156", key != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ return extract_key_from_id(&dent->id, key); -+} -+ -+int -+update_key_cde(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh UNUSED_ARG) -+{ -+ directory_entry_format *dent; -+ obj_key_id obj_id; -+ int result; -+ -+ assert("nikita-2344", coord != NULL); -+ assert("nikita-2345", key != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ result = build_obj_key_id(key, &obj_id); -+ if (result == 0) { -+ dent->id = obj_id; -+ znode_make_dirty(coord->node); -+ } -+ return 0; -+} -+ -+/* ->s.dir.extract_name() method for this item plugin. */ -+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1157", coord != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ return extract_dent_name(coord, dent, buf); -+} -+ -+static int cde_bytes(int pasting, const reiser4_item_data * data) -+{ -+ int result; -+ -+ result = data->length; -+ if (!pasting) -+ result -= sizeof(cde_item_format); -+ return result; -+} -+ -+/* ->s.dir.add_entry() method for this item plugin */ -+int add_entry_cde(struct inode *dir /* directory object */ , -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh /* lock handle for insertion */ , -+ const struct dentry *name /* name to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters of new -+ * directory entry */ ) -+{ -+ reiser4_item_data data; -+ struct cde_entry entry; -+ struct cde_entry_data edata; -+ int result; -+ -+ assert("nikita-1656", coord->node == lh->node); -+ assert("nikita-1657", znode_is_write_locked(coord->node)); -+ -+ edata.num_of_entries = 1; -+ edata.entry = &entry; -+ -+ entry.dir = dir; -+ entry.obj = dir_entry->obj; -+ entry.name = &name->d_name; -+ -+ data.data = (char *)&edata; -+ data.user = 0; /* &edata is not user space */ -+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID); -+ data.arg = dir_entry; -+ assert("nikita-1302", data.iplug != NULL); -+ -+ result = is_dot_key(&dir_entry->key); -+ data.length = estimate_cde(result ? coord : NULL, &data); -+ -+ /* NOTE-NIKITA quota plugin? 
*/ -+ if (vfs_dq_alloc_space_nodirty(dir, cde_bytes(result, &data))) -+ return RETERR(-EDQUOT); -+ -+ if (result) -+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0); -+ else -+ result = reiser4_resize_item(coord, &data, &dir_entry->key, -+ lh, 0); -+ return result; -+} -+ -+/* ->s.dir.rem_entry() */ -+int rem_entry_cde(struct inode *dir /* directory of item */ , -+ const struct qstr *name, coord_t * coord /* coord of item */ , -+ lock_handle * lh UNUSED_ARG /* lock handle for -+ * removal */ , -+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of -+ * directory entry -+ * being removed */ ) -+{ -+ coord_t shadow; -+ int result; -+ int length; -+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]); -+ -+ assert("nikita-2870", strlen(name->name) == name->len); -+ assert("nikita-2869", -+ !strcmp(name->name, extract_name_cde(coord, buf))); -+ -+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header); -+ if (is_longname(name->name, name->len)) -+ length += name->len + 1; -+ -+ if (inode_get_bytes(dir) < length) { -+ warning("nikita-2628", "Dir is broke: %llu: %llu", -+ (unsigned long long)get_inode_oid(dir), -+ inode_get_bytes(dir)); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* cut_node() is supposed to take pointers to _different_ -+ coords, because it will modify them without respect to -+ possible aliasing. To work around this, create temporary copy -+ of @coord. -+ */ -+ coord_dup(&shadow, coord); -+ result = -+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); -+ if (result == 0) { -+ /* NOTE-NIKITA quota plugin? */ -+ vfs_dq_free_space_nodirty(dir, length); -+ } -+ return result; -+} -+ -+/* ->s.dir.max_name_len() method for this item plugin */ -+int max_name_len_cde(const struct inode *dir /* directory */ ) -+{ -+ return -+ reiser4_tree_by_inode(dir)->nplug->max_item_size() - -+ sizeof(directory_entry_format) - sizeof(cde_item_format) - -+ sizeof(cde_unit_header) - 2; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/cde.h linux-2.6.30/fs/reiser4/plugin/item/cde.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/cde.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,87 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Compound directory item. See cde.c for description. 
*/ -+ -+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ ) -+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ -+ -+#include "../../forward.h" -+#include "../../kassign.h" -+#include "../../dformat.h" -+ -+#include <linux/fs.h> /* for struct inode */ -+#include <linux/dcache.h> /* for struct dentry, etc */ -+ -+typedef struct cde_unit_header { -+ de_id hash; -+ d16 offset; -+} cde_unit_header; -+ -+typedef struct cde_item_format { -+ d16 num_of_entries; -+ cde_unit_header entry[0]; -+} cde_item_format; -+ -+struct cde_entry { -+ const struct inode *dir; -+ const struct inode *obj; -+ const struct qstr *name; -+}; -+ -+struct cde_entry_data { -+ int num_of_entries; -+ struct cde_entry *entry; -+}; -+ -+/* plugin->item.b.* */ -+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result); -+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_cde(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_cde(const coord_t * coord); -+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key); -+int estimate_cde(const coord_t * coord, const reiser4_item_data * data); -+void print_cde(const char *prefix, coord_t * coord); -+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data); -+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias, -+ coord_t * coord); -+int paste_cde(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG); -+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target, -+ shift_direction pend, unsigned *size, unsigned want); -+void copy_units_cde(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+void print_cde(const char *prefix, coord_t * coord); -+int reiser4_check_cde(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.dir.* */ -+int extract_key_cde(const coord_t * coord, reiser4_key * key); -+int update_key_cde(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_cde(const coord_t * coord, char *buf); -+int add_entry_cde(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_cde(const struct inode *dir); -+ -+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */ -+#endif -+ -+/* Make Linus happy. 
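As an aside: cde_item_format and cde_unit_header above are the two on-disk structures behind all of the cde code, a counted table of unit headers followed by the packed entry bodies, each header recording the item-relative offset of its body. How a body is located, mirroring offset_of()/entry_at(); the field widths are illustrative, and the real code reads the on-disk fields with get_unaligned() because nothing in the item is aligned:

    #include <stdint.h>

    /* Illustrative stand-ins for cde_item_format / cde_unit_header. */
    struct demo_item_hdr { uint16_t num_of_entries; };
    struct demo_unit_hdr { uint8_t hash[8]; uint16_t offset; };

    static const unsigned char *demo_entry_body(const unsigned char *item,
                                                int idx)
    {
            const struct demo_item_hdr *ih =
                    (const struct demo_item_hdr *)item;
            /* the unit header table starts right after the item header */
            const struct demo_unit_hdr *uh =
                    (const struct demo_unit_hdr *)(item + sizeof(*ih));

            if (idx >= ih->num_of_entries)
                    return 0;                 /* out of range */
            return item + uh[idx].offset;     /* offset is item-relative */
    }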
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.30/fs/reiser4/plugin/item/ctail.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/ctail.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1613 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* ctails (aka "clustered tails") are items for cryptcompress objects */ -+ -+/* DESCRIPTION: -+ -+Each cryptcompress object is stored on disk as a set of clusters sliced -+into ctails. -+ -+Internal on-disk structure: -+ -+ HEADER (1) Here stored disk cluster shift -+ BODY -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../object.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../context.h" -+#include "../../page_cache.h" -+#include "../cluster.h" -+#include "../../flush.h" -+#include "../../tree_walk.h" -+ -+#include <linux/pagevec.h> -+#include <linux/swap.h> -+#include <linux/fs.h> -+ -+/* return body of ctail item at @coord */ -+static ctail_item_format *ctail_formatted_at(const coord_t * coord) -+{ -+ assert("edward-60", coord != NULL); -+ return item_body_by_coord(coord); -+} -+ -+static int cluster_shift_by_coord(const coord_t * coord) -+{ -+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift); -+} -+ -+static inline void dclust_set_extension_shift(hint_t * hint) -+{ -+ assert("edward-1270", -+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID); -+ hint->ext_coord.extension.ctail.shift = -+ cluster_shift_by_coord(&hint->ext_coord.coord); -+} -+ -+static loff_t off_by_coord(const coord_t * coord) -+{ -+ reiser4_key key; -+ return get_key_offset(item_key_by_coord(coord, &key)); -+} -+ -+int coord_is_unprepped_ctail(const coord_t * coord) -+{ -+ assert("edward-1233", coord != NULL); -+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1235", -+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT, -+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS)); -+ -+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT; -+} -+ -+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode) -+{ -+ int shift; -+ -+ if (inode != NULL) { -+ shift = inode_cluster_shift(inode); -+ assert("edward-1236", -+ ergo(!coord_is_unprepped_ctail(coord), -+ shift == cluster_shift_by_coord(coord))); -+ } else { -+ assert("edward-1237", !coord_is_unprepped_ctail(coord)); -+ shift = cluster_shift_by_coord(coord); -+ } -+ return off_by_coord(coord) >> shift; -+} -+ -+static int disk_cluster_size(const coord_t * coord) -+{ -+ assert("edward-1156", -+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); -+ /* calculation of disk cluster size -+ is meaninless if ctail is unprepped */ -+ assert("edward-1238", !coord_is_unprepped_ctail(coord)); -+ -+ return 1 << cluster_shift_by_coord(coord); -+} -+ -+/* true if the key is of first disk cluster item */ -+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord) -+{ -+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID); -+ -+ 
return coord_is_unprepped_ctail(coord) || -+ ((get_key_offset(key) & -+ ((loff_t) disk_cluster_size(coord) - 1)) == 0); -+} -+ -+static char *first_unit(coord_t * coord) -+{ -+ /* FIXME: warning: pointer of type `void *' used in arithmetic */ -+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format); -+} -+ -+/* plugin->u.item.b.max_key_inside : -+ tail_max_key_inside */ -+ -+/* plugin->u.item.b.can_contain_key */ -+int -+can_contain_key_ctail(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key)) -+ return 0; -+ if (get_key_offset(&item_key) + nr_units_ctail(coord) != -+ get_key_offset(key)) -+ return 0; -+ if (is_disk_cluster_key(key, coord)) -+ return 0; -+ return 1; -+} -+ -+/* plugin->u.item.b.mergeable */ -+int mergeable_ctail(const coord_t * p1, const coord_t * p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID); -+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1), -+ UNIX_FILE_METADATA_ITEM_TYPE)); -+ -+ if (item_id_by_coord(p2) != CTAIL_ID) { -+ /* second item is of another type */ -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+ get_key_objectid(&key1) != get_key_objectid(&key2) || -+ get_key_type(&key1) != get_key_type(&key2)) { -+ /* items of different objects */ -+ return 0; -+ } -+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2)) -+ /* not adjacent items */ -+ return 0; -+ if (is_disk_cluster_key(&key2, p2)) -+ return 0; -+ return 1; -+} -+ -+/* plugin->u.item.b.nr_units */ -+pos_in_node_t nr_units_ctail(const coord_t * coord) -+{ -+ return (item_length_by_coord(coord) - -+ sizeof(ctail_formatted_at(coord)->cluster_shift)); -+} -+ -+/* plugin->u.item.b.estimate: -+ estimate how much space is needed to insert/paste @data->length bytes -+ into ctail at @coord */ -+int estimate_ctail(const coord_t * coord /* coord of item */ , -+ const reiser4_item_data * -+ data /* parameters for new item */ ) -+{ -+ if (coord == NULL) -+ /* insert */ -+ return (sizeof(ctail_item_format) + data->length); -+ else -+ /* paste */ -+ return data->length; -+} -+ -+/* ->init() method for this item plugin. 
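As an aside: everything above leans on the fact that a disk cluster spans a power-of-two number of bytes. is_disk_cluster_key() recognises the first item of a cluster by masking the key offset, and clust_by_coord() turns an offset into a cluster index by shifting. That arithmetic in isolation, as a sketch:

    #include <stdint.h>

    /* A disk cluster covers (1 << shift) bytes. */
    static inline uint64_t demo_cluster_index(uint64_t off, int shift)
    {
            return off >> shift;             /* clust_by_coord() analogue */
    }

    static inline int demo_is_first_in_cluster(uint64_t off, int shift)
    {
            /* is_disk_cluster_key() analogue: only cluster-aligned
               offsets mark the first item of a disk cluster */
            return (off & ((UINT64_C(1) << shift) - 1)) == 0;
    }

    /* With shift == 16 (64 KiB clusters): offset 0x30000 starts
       cluster 3; offset 0x30200 is inside cluster 3 but not first. */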
*/ -+int init_ctail(coord_t * to /* coord of item */ , -+ coord_t * from /* old_item */ , -+ reiser4_item_data * data /* structure used for insertion */ ) -+{ -+ int cluster_shift; /* cpu value to convert */ -+ -+ if (data) { -+ assert("edward-463", data->length > sizeof(ctail_item_format)); -+ cluster_shift = *((int *)(data->arg)); -+ data->length -= sizeof(ctail_item_format); -+ } else { -+ assert("edward-464", from != NULL); -+ assert("edward-855", ctail_ok(from)); -+ cluster_shift = (int)(cluster_shift_by_coord(from)); -+ } -+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift); -+ assert("edward-856", ctail_ok(to)); -+ return 0; -+} -+ -+/* plugin->u.item.b.lookup: -+ NULL: We are looking for item keys only */ -+ -+#if REISER4_DEBUG -+int ctail_ok(const coord_t * coord) -+{ -+ return coord_is_unprepped_ctail(coord) || -+ cluster_shift_ok(cluster_shift_by_coord(coord)); -+} -+ -+/* plugin->u.item.b.check */ -+int check_ctail(const coord_t * coord, const char **error) -+{ -+ if (!ctail_ok(coord)) { -+ if (error) -+ *error = "bad cluster shift in ctail"; -+ return 1; -+ } -+ return 0; -+} -+#endif -+ -+/* plugin->u.item.b.paste */ -+int -+paste_ctail(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ unsigned old_nr_units; -+ -+ assert("edward-268", data->data != NULL); -+ /* copy only from kernel space */ -+ assert("edward-66", data->user == 0); -+ -+ old_nr_units = -+ item_length_by_coord(coord) - sizeof(ctail_item_format) - -+ data->length; -+ -+ /* ctail items never get pasted in the middle */ -+ -+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) { -+ -+ /* paste at the beginning when create new item */ -+ assert("edward-450", -+ item_length_by_coord(coord) == -+ data->length + sizeof(ctail_item_format)); -+ assert("edward-451", old_nr_units == 0); -+ } else if (coord->unit_pos == old_nr_units - 1 -+ && coord->between == AFTER_UNIT) { -+ -+ /* paste at the end */ -+ coord->unit_pos++; -+ } else -+ impossible("edward-453", "bad paste position"); -+ -+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length); -+ -+ assert("edward-857", ctail_ok(coord)); -+ -+ return 0; -+} -+ -+/* plugin->u.item.b.fast_paste */ -+ -+/* plugin->u.item.b.can_shift -+ number of units is returned via return value, number of bytes via @size. 
For -+ ctail items they coincide */ -+int -+can_shift_ctail(unsigned free_space, coord_t * source, -+ znode * target, shift_direction direction UNUSED_ARG, -+ unsigned *size /* number of bytes */ , unsigned want) -+{ -+ /* make sure that that we do not want to shift more than we have */ -+ assert("edward-68", want > 0 && want <= nr_units_ctail(source)); -+ -+ *size = min(want, free_space); -+ -+ if (!target) { -+ /* new item will be created */ -+ if (*size <= sizeof(ctail_item_format)) { -+ *size = 0; -+ return 0; -+ } -+ return *size - sizeof(ctail_item_format); -+ } -+ return *size; -+} -+ -+/* plugin->u.item.b.copy_units -+ cooperates with ->can_shift() */ -+void -+copy_units_ctail(coord_t * target, coord_t * source, -+ unsigned from, unsigned count /* units */ , -+ shift_direction where_is_free_space, -+ unsigned free_space /* bytes */ ) -+{ -+ /* make sure that item @target is expanded already */ -+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count); -+ assert("edward-70", free_space == count || free_space == count + 1); -+ -+ assert("edward-858", ctail_ok(source)); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ /* append item @target with @count first bytes of @source: -+ this restriction came from ordinary tails */ -+ assert("edward-71", from == 0); -+ assert("edward-860", ctail_ok(target)); -+ -+ memcpy(first_unit(target) + nr_units_ctail(target) - count, -+ first_unit(source), count); -+ } else { -+ /* target item is moved to right already */ -+ reiser4_key key; -+ -+ assert("edward-72", nr_units_ctail(source) == from + count); -+ -+ if (free_space == count) { -+ init_ctail(target, source, NULL); -+ } else { -+ /* new item has been created */ -+ assert("edward-862", ctail_ok(target)); -+ } -+ memcpy(first_unit(target), first_unit(source) + from, count); -+ -+ assert("edward-863", ctail_ok(target)); -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ item_key_by_coord(source, &key); -+ set_key_offset(&key, get_key_offset(&key) + from); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+} -+ -+/* plugin->u.item.b.create_hook */ -+int create_hook_ctail(const coord_t * coord, void *arg) -+{ -+ assert("edward-864", znode_is_loaded(coord->node)); -+ -+ znode_set_convertible(coord->node); -+ return 0; -+} -+ -+/* plugin->u.item.b.kill_hook */ -+int kill_hook_ctail(const coord_t * coord, pos_in_node_t from, -+ pos_in_node_t count, carry_kill_data * kdata) -+{ -+ struct inode *inode; -+ -+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-291", znode_is_write_locked(coord->node)); -+ -+ inode = kdata->inode; -+ if (inode) { -+ reiser4_key key; -+ struct cryptcompress_info * info; -+ cloff_t index; -+ -+ item_key_by_coord(coord, &key); -+ info = cryptcompress_inode_data(inode); -+ index = off_to_clust(get_key_offset(&key), inode); -+ -+ if (from == 0) { -+ info->trunc_index = index; -+ if (is_disk_cluster_key(&key, coord)) { -+ /* -+ * first item of disk cluster is to be killed -+ */ -+ truncate_complete_page_cluster( -+ inode, index, kdata->params.truncate); -+ inode_sub_bytes(inode, -+ inode_cluster_size(inode)); -+ } -+ } -+ } -+ return 0; -+} -+ -+/* for shift_hook_ctail(), -+ return true if the first disk cluster item has dirty child -+*/ -+static int ctail_convertible(const coord_t * coord) -+{ -+ int result; -+ reiser4_key key; -+ jnode *child = NULL; -+ -+ assert("edward-477", coord != NULL); -+ assert("edward-478", 
item_id_by_coord(coord) == CTAIL_ID); -+ -+ if (coord_is_unprepped_ctail(coord)) -+ /* unprepped ctail should be converted */ -+ return 1; -+ -+ item_key_by_coord(coord, &key); -+ child = jlookup(current_tree, -+ get_key_objectid(&key), -+ off_to_pg(off_by_coord(coord))); -+ if (!child) -+ return 0; -+ result = JF_ISSET(child, JNODE_DIRTY); -+ jput(child); -+ return result; -+} -+ -+/* FIXME-EDWARD */ -+/* plugin->u.item.b.shift_hook */ -+int shift_hook_ctail(const coord_t * item /* coord of item */ , -+ unsigned from UNUSED_ARG /* start unit */ , -+ unsigned count UNUSED_ARG /* stop unit */ , -+ znode * old_node /* old parent */ ) -+{ -+ assert("edward-479", item != NULL); -+ assert("edward-480", item->node != old_node); -+ -+ if (!znode_convertible(old_node) || znode_convertible(item->node)) -+ return 0; -+ if (ctail_convertible(item)) -+ znode_set_convertible(item->node); -+ return 0; -+} -+ -+static int -+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ int cut, void *p, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ pos_in_node_t count; /* number of units to cut */ -+ char *item; -+ -+ count = to - from + 1; -+ item = item_body_by_coord(coord); -+ -+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord))); -+ -+ if (smallest_removed) { -+ /* store smallest key removed */ -+ item_key_by_coord(coord, smallest_removed); -+ set_key_offset(smallest_removed, -+ get_key_offset(smallest_removed) + from); -+ } -+ -+ if (new_first) { -+ assert("vs-1531", from == 0); -+ -+ item_key_by_coord(coord, new_first); -+ set_key_offset(new_first, -+ get_key_offset(new_first) + from + count); -+ } -+ -+ if (!cut) -+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p); -+ -+ if (from == 0) { -+ if (count != nr_units_ctail(coord)) { -+ /* part of item is removed, so move free space at the beginning -+ of the item and update item key */ -+ reiser4_key key; -+ memcpy(item + to + 1, item, sizeof(ctail_item_format)); -+ item_key_by_coord(coord, &key); -+ set_key_offset(&key, get_key_offset(&key) + count); -+ node_plugin_by_node(coord->node)->update_item_key(coord, -+ &key, -+ NULL); -+ } else { -+ /* cut_units should not be called to cut evrything */ -+ assert("vs-1532", ergo(cut, 0)); -+ /* whole item is cut, so more then amount of space occupied -+ by units got freed */ -+ count += sizeof(ctail_item_format); -+ } -+ if (REISER4_DEBUG) -+ memset(item, 0, count); -+ } else if (REISER4_DEBUG) -+ memset(item + sizeof(ctail_item_format) + from, 0, count); -+ return count; -+} -+ -+/* plugin->u.item.b.cut_units */ -+int -+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, -+ carry_cut_data * cdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ return cut_or_kill_ctail_units(item, from, to, 1, NULL, -+ smallest_removed, new_first); -+} -+ -+/* plugin->u.item.b.kill_units */ -+int -+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ return cut_or_kill_ctail_units(item, from, to, 0, kdata, -+ smallest_removed, new_first); -+} -+ -+/* plugin->u.item.s.file.read */ -+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint) -+{ -+ uf_coord_t *uf_coord; -+ coord_t *coord; -+ -+ uf_coord = &hint->ext_coord; -+ coord = &uf_coord->coord; -+ assert("edward-127", f->user == 0); -+ assert("edward-129", coord && coord->node); -+ assert("edward-130", coord_is_existing_unit(coord)); 
-+ assert("edward-132", znode_is_loaded(coord->node)); -+ -+ /* start read only from the beginning of ctail */ -+ assert("edward-133", coord->unit_pos == 0); -+ /* read only whole ctails */ -+ assert("edward-135", nr_units_ctail(coord) <= f->length); -+ -+ assert("edward-136", reiser4_schedulable()); -+ assert("edward-886", ctail_ok(coord)); -+ -+ if (f->data) -+ memcpy(f->data, (char *)first_unit(coord), -+ (size_t) nr_units_ctail(coord)); -+ -+ dclust_set_extension_shift(hint); -+ mark_page_accessed(znode_page(coord->node)); -+ move_flow_forward(f, nr_units_ctail(coord)); -+ -+ return 0; -+} -+ -+/** -+ * Prepare transform stream with plain text for page -+ * @page taking into account synchronization issues. -+ */ -+static int ctail_read_disk_cluster(struct cluster_handle * clust, -+ struct inode * inode, struct page * page, -+ znode_lock_mode mode) -+{ -+ int result; -+ -+ assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK); -+ assert("edward-671", clust->hint != NULL); -+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER); -+ assert("edward-672", cryptcompress_inode_ok(inode)); -+ assert("edward-1527", PageLocked(page)); -+ -+ unlock_page(page); -+ -+ /* set input stream */ -+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM); -+ if (result) { -+ lock_page(page); -+ return result; -+ } -+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode); -+ lock_page(page); -+ if (result) -+ return result; -+ /* -+ * at this point we have locked position in the tree -+ */ -+ assert("edward-1528", znode_is_any_locked(clust->hint->lh.node)); -+ -+ if (page->mapping != inode->i_mapping) { -+ /* page was truncated */ -+ reiser4_unset_hint(clust->hint); -+ reset_cluster_params(clust); -+ return AOP_TRUNCATED_PAGE; -+ } -+ if (PageUptodate(page)) { -+ /* disk cluster can be obsolete, don't use it! */ -+ reiser4_unset_hint(clust->hint); -+ reset_cluster_params(clust); -+ return 0; -+ } -+ if (clust->dstat == FAKE_DISK_CLUSTER || -+ clust->dstat == UNPR_DISK_CLUSTER || -+ clust->dstat == TRNC_DISK_CLUSTER) { -+ /* -+ * this information about disk cluster will be valid -+ * as long as we keep the position in the tree locked -+ */ -+ tfm_cluster_set_uptodate(&clust->tc); -+ return 0; -+ } -+ /* now prepare output stream.. */ -+ result = grab_coa(&clust->tc, inode_compression_plugin(inode)); -+ if (result) -+ return result; -+ /* ..and fill this with plain text */ -+ result = reiser4_inflate_cluster(clust, inode); -+ if (result) -+ return result; -+ /* -+ * The stream is ready! It won't be obsolete as -+ * long as we keep last disk cluster item locked. -+ */ -+ tfm_cluster_set_uptodate(&clust->tc); -+ return 0; -+} -+ -+/* -+ * fill one page with plain text. 
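As an aside: do_readpage_ctail() below keeps converting between page indices and logical clusters via pg_to_clust() and pg_to_off_to_cloff(). With 4 KiB pages that mapping is plain shift-and-mask arithmetic, sketched here with illustrative helper names:

    #include <stdint.h>

    #define DEMO_PAGE_SHIFT 12  /* 4 KiB pages */

    /* pg_to_clust() analogue: which logical cluster a page belongs to */
    static inline uint64_t demo_pg_to_clust(uint64_t pgidx, int cluster_shift)
    {
            return pgidx >> (cluster_shift - DEMO_PAGE_SHIFT);
    }

    /* pg_to_off_to_cloff() analogue: byte offset of the page in its cluster */
    static inline uint32_t demo_pg_off_in_clust(uint64_t pgidx,
                                                int cluster_shift)
    {
            uint64_t off = pgidx << DEMO_PAGE_SHIFT;  /* offset in the file */

            return (uint32_t)(off & ((UINT64_C(1) << cluster_shift) - 1));
    }

    /* With cluster_shift == 16 there are 16 pages per cluster: page 18
       is the third page of cluster 1, at byte offset 0x2000 inside the
       plain-text stream the cluster decompresses into. */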
-+ */ -+int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust, -+ struct page *page, znode_lock_mode mode) -+{ -+ int ret; -+ unsigned cloff; -+ char *data; -+ size_t to_page; -+ struct tfm_cluster * tc = &clust->tc; -+ -+ assert("edward-212", PageLocked(page)); -+ -+ if (unlikely(page->mapping != inode->i_mapping)) -+ return AOP_TRUNCATED_PAGE; -+ if (PageUptodate(page)) -+ goto exit; -+ to_page = pbytes(page_index(page), inode); -+ if (to_page == 0) { -+ zero_user(page, 0, PAGE_CACHE_SIZE); -+ SetPageUptodate(page); -+ goto exit; -+ } -+ if (!tfm_cluster_is_uptodate(&clust->tc)) { -+ clust->index = pg_to_clust(page->index, inode); -+ -+ /* this will unlock/lock the page */ -+ ret = ctail_read_disk_cluster(clust, inode, page, mode); -+ -+ assert("edward-212", PageLocked(page)); -+ if (ret) -+ return ret; -+ -+ /* refresh bytes */ -+ to_page = pbytes(page_index(page), inode); -+ if (to_page == 0) { -+ zero_user(page, 0, PAGE_CACHE_SIZE); -+ SetPageUptodate(page); -+ goto exit; -+ } -+ } -+ if (PageUptodate(page)) -+ /* somebody else fill it already */ -+ goto exit; -+ -+ assert("edward-119", tfm_cluster_is_uptodate(tc)); -+ assert("edward-1529", znode_is_any_locked(clust->hint->lh.node)); -+ -+ switch (clust->dstat) { -+ case UNPR_DISK_CLUSTER: -+ BUG_ON(1); -+ case TRNC_DISK_CLUSTER: -+ /* -+ * Race with truncate! -+ * We resolve it in favour of the last one (the only way, -+ * as in this case plain text is unrecoverable) -+ */ -+ case FAKE_DISK_CLUSTER: -+ /* fill the page by zeroes */ -+ zero_user(page, 0, PAGE_CACHE_SIZE); -+ SetPageUptodate(page); -+ break; -+ case PREP_DISK_CLUSTER: -+ /* fill page by transformed stream with plain text */ -+ assert("edward-1058", !PageUptodate(page)); -+ assert("edward-120", tc->len <= inode_cluster_size(inode)); -+ -+ /* page index in this logical cluster */ -+ cloff = pg_to_off_to_cloff(page->index, inode); -+ -+ data = kmap(page); -+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page); -+ memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page); -+ flush_dcache_page(page); -+ kunmap(page); -+ SetPageUptodate(page); -+ break; -+ default: -+ impossible("edward-1169", "bad disk cluster state"); -+ } -+ exit: -+ return 0; -+} -+ -+/* plugin->u.item.s.file.readpage */ -+int readpage_ctail(void *vp, struct page *page) -+{ -+ int result; -+ hint_t * hint; -+ struct cluster_handle * clust = vp; -+ -+ assert("edward-114", clust != NULL); -+ assert("edward-115", PageLocked(page)); -+ assert("edward-116", !PageUptodate(page)); -+ assert("edward-118", page->mapping && page->mapping->host); -+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ unlock_page(page); -+ return RETERR(-ENOMEM); -+ } -+ clust->hint = hint; -+ result = load_file_hint(clust->file, hint); -+ if (result) { -+ kfree(hint); -+ unlock_page(page); -+ return result; -+ } -+ assert("vs-25", hint->ext_coord.lh == &hint->lh); -+ -+ result = do_readpage_ctail(page->mapping->host, clust, page, -+ ZNODE_READ_LOCK); -+ assert("edward-213", PageLocked(page)); -+ assert("edward-1163", ergo(!result, PageUptodate(page))); -+ -+ unlock_page(page); -+ done_lh(&hint->lh); -+ hint->ext_coord.valid = 0; -+ save_file_hint(clust->file, hint); -+ kfree(hint); -+ tfm_cluster_clr_uptodate(&clust->tc); -+ -+ return result; -+} -+ -+/* Helper function for ->readpages() */ -+static int ctail_read_page_cluster(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ int i; -+ 
int result; -+ assert("edward-779", clust != NULL); -+ assert("edward-1059", clust->win == NULL); -+ assert("edward-780", inode != NULL); -+ -+ result = prepare_page_cluster(inode, clust, READ_OP); -+ if (result) -+ return result; -+ -+ assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *page = clust->pages[i]; -+ lock_page(page); -+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK); -+ unlock_page(page); -+ if (result) -+ break; -+ } -+ tfm_cluster_clr_uptodate(&clust->tc); -+ put_page_cluster(clust, inode, READ_OP); -+ return result; -+} -+ -+/* filler for read_cache_pages() */ -+static int ctail_readpages_filler(void * data, struct page * page) -+{ -+ int ret = 0; -+ struct cluster_handle * clust = data; -+ struct inode * inode = clust->file->f_dentry->d_inode; -+ -+ assert("edward-1525", page->mapping == inode->i_mapping); -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ if (pbytes(page_index(page), inode) == 0) { -+ zero_user(page, 0, PAGE_CACHE_SIZE); -+ SetPageUptodate(page); -+ unlock_page(page); -+ return 0; -+ } -+ move_cluster_forward(clust, inode, page->index); -+ unlock_page(page); -+ /* -+ * read the whole page cluster -+ */ -+ ret = ctail_read_page_cluster(clust, inode); -+ -+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc)); -+ return ret; -+} -+ -+/* -+ * We populate a bit more then upper readahead suggests: -+ * with each nominated page we read the whole page cluster -+ * this page belongs to. -+ */ -+int readpages_ctail(struct file *file, struct address_space *mapping, -+ struct list_head *pages) -+{ -+ int ret = 0; -+ hint_t *hint; -+ struct cluster_handle clust; -+ struct inode *inode = mapping->host; -+ -+ assert("edward-1521", inode == file->f_dentry->d_inode); -+ -+ cluster_init_read(&clust, NULL); -+ clust.file = file; -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ warning("vs-28", "failed to allocate hint"); -+ ret = RETERR(-ENOMEM); -+ goto exit1; -+ } -+ clust.hint = hint; -+ ret = load_file_hint(clust.file, hint); -+ if (ret) { -+ warning("edward-1522", "failed to load hint"); -+ goto exit2; -+ } -+ assert("vs-26", hint->ext_coord.lh == &hint->lh); -+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (ret) { -+ warning("edward-1523", "failed to alloc pgset"); -+ goto exit3; -+ } -+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust); -+ -+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc)); -+ exit3: -+ done_lh(&hint->lh); -+ save_file_hint(file, hint); -+ hint->ext_coord.valid = 0; -+ exit2: -+ kfree(hint); -+ exit1: -+ put_cluster_handle(&clust); -+ return ret; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of the first item of the next disk cluster -+*/ -+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key) -+{ -+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord))); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1) -+ << cluster_shift_by_coord(coord)); -+ return key; -+} -+ -+static int insert_unprepped_ctail(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ int result; -+ char buf[UCTAIL_NR_UNITS]; -+ reiser4_item_data data; -+ reiser4_key key; -+ int shift = (int)UCTAIL_SHIFT; -+ -+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS); -+ result = key_by_inode_cryptcompress(inode, -+ 
clust_to_off(clust->index, inode), -+ &key); -+ if (result) -+ return result; -+ data.user = 0; -+ data.iplug = item_plugin_by_id(CTAIL_ID); -+ data.arg = &shift; -+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS; -+ data.data = buf; -+ -+ result = insert_by_coord(&clust->hint->ext_coord.coord, -+ &data, &key, clust->hint->ext_coord.lh, 0); -+ return result; -+} -+ -+static int -+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f, -+ int cluster_shift) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ reiser4_item_data *data; -+ carry_op *op; -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ data = (reiser4_item_data *) (lowest_level + 3); -+ -+ assert("edward-466", coord->between == AFTER_ITEM -+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM -+ || coord->between == EMPTY_NODE -+ || coord->between == BEFORE_UNIT); -+ -+ if (coord->between == AFTER_UNIT) { -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ } -+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node, -+ 0 /* operate directly on coord -> node */); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? PTR_ERR(op) : -EIO); -+ } -+ data->user = 0; -+ data->iplug = item_plugin_by_id(CTAIL_ID); -+ data->arg = &cluster_shift; -+ -+ data->length = 0; -+ data->data = NULL; -+ -+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT; -+ op->u.insert_flow.insert_point = coord; -+ op->u.insert_flow.flow = f; -+ op->u.insert_flow.data = data; -+ op->u.insert_flow.new_nodes = 0; -+ -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */ -+static int insert_cryptcompress_flow_in_place(coord_t * coord, -+ lock_handle * lh, flow_t * f, -+ int cluster_shift) -+{ -+ int ret; -+ coord_t pos; -+ lock_handle lock; -+ -+ assert("edward-484", -+ coord->between == AT_UNIT || coord->between == AFTER_ITEM); -+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID); -+ -+ coord_dup(&pos, coord); -+ pos.unit_pos = 0; -+ pos.between = AFTER_ITEM; -+ -+ init_lh(&lock); -+ copy_lh(&lock, lh); -+ -+ ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift); -+ done_lh(&lock); -+ assert("edward-1347", znode_is_write_locked(lh->node)); -+ assert("edward-1228", !ret); -+ return ret; -+} -+ -+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */ -+static int overwrite_ctail(coord_t * coord, flow_t * f) -+{ -+ unsigned count; -+ -+ assert("edward-269", f->user == 0); -+ assert("edward-270", f->data != NULL); -+ assert("edward-271", f->length > 0); -+ assert("edward-272", coord_is_existing_unit(coord)); -+ assert("edward-273", coord->unit_pos == 0); -+ assert("edward-274", znode_is_write_locked(coord->node)); -+ assert("edward-275", reiser4_schedulable()); -+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1243", ctail_ok(coord)); -+ -+ count = nr_units_ctail(coord); -+ -+ if (count > f->length) -+ count = f->length; -+ memcpy(first_unit(coord), f->data, count); -+ move_flow_forward(f, count); -+ coord->unit_pos += count; -+ return 0; -+} -+ -+/* Implementation of CRC_CUT_ITEM mode of ctail 
conversion: -+ cut ctail (part or whole) starting from next unit position */ -+static int cut_ctail(coord_t * coord) -+{ -+ coord_t stop; -+ -+ assert("edward-435", coord->between == AT_UNIT && -+ coord->item_pos < coord_num_items(coord) && -+ coord->unit_pos <= coord_num_units(coord)); -+ -+ if (coord->unit_pos == coord_num_units(coord)) -+ /* nothing to cut */ -+ return 0; -+ coord_dup(&stop, coord); -+ stop.unit_pos = coord_last_unit_pos(coord); -+ -+ return cut_node_content(coord, &stop, NULL, NULL, NULL); -+} -+ -+int ctail_insert_unprepped_cluster(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ int result; -+ assert("edward-1244", inode != NULL); -+ assert("edward-1245", clust->hint != NULL); -+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER); -+ assert("edward-1247", clust->reserved == 1); -+ -+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK); -+ if (cbk_errored(result)) -+ return result; -+ assert("edward-1249", result == CBK_COORD_NOTFOUND); -+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node)); -+ -+ assert("edward-1295", -+ clust->hint->ext_coord.lh->node == -+ clust->hint->ext_coord.coord.node); -+ -+ coord_set_between_clusters(&clust->hint->ext_coord.coord); -+ -+ result = insert_unprepped_ctail(clust, inode); -+ all_grabbed2free(); -+ -+ assert("edward-1251", !result); -+ assert("edward-1252", cryptcompress_inode_ok(inode)); -+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node)); -+ assert("edward-1254", -+ reiser4_clustered_blocks(reiser4_get_current_sb())); -+ assert("edward-1255", -+ znode_convertible(clust->hint->ext_coord.coord.node)); -+ -+ return result; -+} -+ -+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode) -+{ -+ int result = 0; -+ struct convert_item_info * info; -+ -+ assert("edward-468", pos != NULL); -+ assert("edward-469", pos->sq != NULL); -+ assert("edward-845", item_convert_data(pos) != NULL); -+ -+ info = item_convert_data(pos); -+ assert("edward-679", info->flow.data != NULL); -+ -+ switch (mode) { -+ case CRC_APPEND_ITEM: -+ assert("edward-1229", info->flow.length != 0); -+ assert("edward-1256", -+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord))); -+ result = -+ insert_cryptcompress_flow_in_place(&pos->coord, -+ &pos->lock, -+ &info->flow, -+ info->cluster_shift); -+ break; -+ case CRC_OVERWRITE_ITEM: -+ assert("edward-1230", info->flow.length != 0); -+ overwrite_ctail(&pos->coord, &info->flow); -+ if (info->flow.length != 0) -+ break; -+ case CRC_CUT_ITEM: -+ assert("edward-1231", info->flow.length == 0); -+ result = cut_ctail(&pos->coord); -+ break; -+ default: -+ result = RETERR(-EIO); -+ impossible("edward-244", "bad convert mode"); -+ } -+ return result; -+} -+ -+/* plugin->u.item.f.scan */ -+int scan_ctail(flush_scan * scan) -+{ -+ int result = 0; -+ struct page *page; -+ struct inode *inode; -+ jnode *node = scan->node; -+ -+ assert("edward-227", scan->node != NULL); -+ assert("edward-228", jnode_is_cluster_page(scan->node)); -+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node)); -+ -+ page = jnode_page(node); -+ inode = page->mapping->host; -+ -+ if (!reiser4_scanning_left(scan)) -+ return result; -+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY)) -+ znode_make_dirty(scan->parent_lock.node); -+ -+ if (!znode_convertible(scan->parent_lock.node)) { -+ if (JF_ISSET(scan->node, JNODE_DIRTY)) -+ znode_set_convertible(scan->parent_lock.node); -+ else { -+ warning("edward-681", -+ "cluster page is already processed"); -+ 
return -EAGAIN; -+ } -+ } -+ return result; -+} -+ -+/* If true, this function attaches children */ -+static int should_attach_convert_idata(flush_pos_t * pos) -+{ -+ int result; -+ assert("edward-431", pos != NULL); -+ assert("edward-432", pos->child == NULL); -+ assert("edward-619", znode_is_write_locked(pos->coord.node)); -+ assert("edward-470", -+ item_plugin_by_coord(&pos->coord) == -+ item_plugin_by_id(CTAIL_ID)); -+ -+ /* check for leftmost child */ -+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child); -+ -+ if (!pos->child) -+ return 0; -+ spin_lock_jnode(pos->child); -+ result = (JF_ISSET(pos->child, JNODE_DIRTY) && -+ pos->child->atom == ZJNODE(pos->coord.node)->atom); -+ spin_unlock_jnode(pos->child); -+ if (!result && pos->child) { -+ /* existing child isn't to attach, clear up this one */ -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ return result; -+} -+ -+/** -+ * Collect all needed information about the object here, -+ * as in-memory inode can be evicted from memory before -+ * disk update completion. -+ */ -+static int init_convert_data_ctail(struct convert_item_info * idata, -+ struct inode *inode) -+{ -+ assert("edward-813", idata != NULL); -+ assert("edward-814", inode != NULL); -+ -+ idata->cluster_shift = inode_cluster_shift(inode); -+ idata->d_cur = DC_FIRST_ITEM; -+ idata->d_next = DC_INVALID_STATE; -+ -+ return 0; -+} -+ -+static int alloc_item_convert_data(struct convert_info * sq) -+{ -+ assert("edward-816", sq != NULL); -+ assert("edward-817", sq->itm == NULL); -+ -+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get()); -+ if (sq->itm == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+static void free_item_convert_data(struct convert_info * sq) -+{ -+ assert("edward-818", sq != NULL); -+ assert("edward-819", sq->itm != NULL); -+ assert("edward-820", sq->iplug != NULL); -+ -+ kfree(sq->itm); -+ sq->itm = NULL; -+ return; -+} -+ -+static int alloc_convert_data(flush_pos_t * pos) -+{ -+ assert("edward-821", pos != NULL); -+ assert("edward-822", pos->sq == NULL); -+ -+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get()); -+ if (!pos->sq) -+ return RETERR(-ENOMEM); -+ memset(pos->sq, 0, sizeof(*pos->sq)); -+ cluster_init_write(&pos->sq->clust, NULL); -+ return 0; -+} -+ -+void free_convert_data(flush_pos_t * pos) -+{ -+ struct convert_info *sq; -+ -+ assert("edward-823", pos != NULL); -+ assert("edward-824", pos->sq != NULL); -+ -+ sq = pos->sq; -+ if (sq->itm) -+ free_item_convert_data(sq); -+ put_cluster_handle(&sq->clust); -+ kfree(pos->sq); -+ pos->sq = NULL; -+ return; -+} -+ -+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode) -+{ -+ struct convert_info *sq; -+ -+ assert("edward-825", pos != NULL); -+ assert("edward-826", pos->sq != NULL); -+ assert("edward-827", item_convert_data(pos) != NULL); -+ assert("edward-828", inode != NULL); -+ -+ sq = pos->sq; -+ -+ memset(sq->itm, 0, sizeof(*sq->itm)); -+ -+ /* iplug->init_convert_data() */ -+ return init_convert_data_ctail(sq->itm, inode); -+} -+ -+/* create and attach disk cluster info used by 'convert' phase of the flush -+ squalloc() */ -+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode) -+{ -+ int ret = 0; -+ struct convert_item_info *info; -+ struct cluster_handle *clust; -+ file_plugin *fplug = inode_file_plugin(inode); -+ compression_plugin *cplug = inode_compression_plugin(inode); -+ -+ assert("edward-248", pos != NULL); -+ assert("edward-249", pos->child != NULL); -+ assert("edward-251", inode != NULL); -+ 
assert("edward-682", cryptcompress_inode_ok(inode)); -+ assert("edward-252", -+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ assert("edward-473", -+ item_plugin_by_coord(&pos->coord) == -+ item_plugin_by_id(CTAIL_ID)); -+ -+ if (!pos->sq) { -+ ret = alloc_convert_data(pos); -+ if (ret) -+ return ret; -+ } -+ clust = &pos->sq->clust; -+ ret = grab_coa(&clust->tc, cplug); -+ if (ret) -+ goto err; -+ ret = set_cluster_by_page(clust, -+ jnode_page(pos->child), -+ MAX_CLUSTER_NRPAGES); -+ if (ret) -+ goto err; -+ -+ assert("edward-829", pos->sq != NULL); -+ assert("edward-250", item_convert_data(pos) == NULL); -+ -+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID); -+ -+ ret = alloc_item_convert_data(pos->sq); -+ if (ret) -+ goto err; -+ ret = init_item_convert_data(pos, inode); -+ if (ret) -+ goto err; -+ info = item_convert_data(pos); -+ -+ ret = checkout_logical_cluster(clust, pos->child, inode); -+ if (ret) -+ goto err; -+ -+ reiser4_deflate_cluster(clust, inode); -+ inc_item_convert_count(pos); -+ -+ /* prepare flow for insertion */ -+ fplug->flow_by_inode(inode, -+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM), -+ 0 /* kernel space */ , -+ clust->tc.len, -+ clust_to_off(clust->index, inode), -+ WRITE_OP, &info->flow); -+ jput(pos->child); -+ return 0; -+ err: -+ jput(pos->child); -+ free_convert_data(pos); -+ return ret; -+} -+ -+/* clear up disk cluster info */ -+static void detach_convert_idata(struct convert_info * sq) -+{ -+ struct convert_item_info *info; -+ -+ assert("edward-253", sq != NULL); -+ assert("edward-840", sq->itm != NULL); -+ -+ info = sq->itm; -+ assert("edward-1212", info->flow.length == 0); -+ -+ free_item_convert_data(sq); -+ return; -+} -+ -+/* plugin->u.item.f.utmost_child */ -+ -+/* This function sets leftmost child for a first cluster item, -+ if the child exists, and NULL in other cases. -+ NOTE-EDWARD: Do not call this for RIGHT_SIDE */ -+ -+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child) -+{ -+ reiser4_key key; -+ -+ item_key_by_coord(coord, &key); -+ -+ assert("edward-257", coord != NULL); -+ assert("edward-258", child != NULL); -+ assert("edward-259", side == LEFT_SIDE); -+ assert("edward-260", -+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); -+ -+ if (!is_disk_cluster_key(&key, coord)) -+ *child = NULL; -+ else -+ *child = jlookup(current_tree, -+ get_key_objectid(item_key_by_coord -+ (coord, &key)), -+ off_to_pg(get_key_offset(&key))); -+ return 0; -+} -+ -+/* Returns true if @p2 is the next item to @p1 -+ in the _same_ disk cluster. -+ Disk cluster is a set of items. If ->clustered() != NULL, -+ with each item the whole disk cluster should be read/modified -+*/ -+ -+/* Go rightward and check for next disk cluster item, set -+ * d_next to DC_CHAINED_ITEM, if the last one exists. -+ * If the current position is last item, go to right neighbor. -+ * Skip empty nodes. Note, that right neighbors may be not in -+ * the slum because of races. If so, make it dirty and -+ * convertible. 
-+ */ -+static int next_item_dc_stat(flush_pos_t * pos) -+{ -+ int ret = 0; -+ int stop = 0; -+ znode *cur; -+ coord_t coord; -+ lock_handle lh; -+ lock_handle right_lock; -+ -+ assert("edward-1232", !node_is_empty(pos->coord.node)); -+ assert("edward-1014", -+ pos->coord.item_pos < coord_num_items(&pos->coord)); -+ assert("edward-1015", chaining_data_present(pos)); -+ assert("edward-1017", -+ item_convert_data(pos)->d_next == DC_INVALID_STATE); -+ -+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER; -+ -+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER) -+ return ret; -+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1) -+ return ret; -+ -+ /* Check next slum item. -+ * Note, that it can not be killed by concurrent truncate, -+ * as the last one will want the lock held by us. -+ */ -+ init_lh(&right_lock); -+ cur = pos->coord.node; -+ -+ while (!stop) { -+ init_lh(&lh); -+ ret = reiser4_get_right_neighbor(&lh, -+ cur, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (ret) -+ break; -+ ret = zload(lh.node); -+ if (ret) { -+ done_lh(&lh); -+ break; -+ } -+ coord_init_before_first_item(&coord, lh.node); -+ -+ if (node_is_empty(lh.node)) { -+ znode_make_dirty(lh.node); -+ znode_set_convertible(lh.node); -+ stop = 0; -+ } else if (same_disk_cluster(&pos->coord, &coord)) { -+ -+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM; -+ -+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) { -+ /* -+ warning("edward-1024", -+ "next slum item mergeable, " -+ "but znode %p isn't dirty\n", -+ lh.node); -+ */ -+ znode_make_dirty(lh.node); -+ } -+ if (!znode_convertible(lh.node)) { -+ /* -+ warning("edward-1272", -+ "next slum item mergeable, " -+ "but znode %p isn't convertible\n", -+ lh.node); -+ */ -+ znode_set_convertible(lh.node); -+ } -+ stop = 1; -+ } else -+ stop = 1; -+ zrelse(lh.node); -+ done_lh(&right_lock); -+ copy_lh(&right_lock, &lh); -+ done_lh(&lh); -+ cur = right_lock.node; -+ } -+ done_lh(&right_lock); -+ -+ if (ret == -E_NO_NEIGHBOR) -+ ret = 0; -+ return ret; -+} -+ -+static int -+assign_convert_mode(struct convert_item_info * idata, -+ cryptcompress_write_mode_t * mode) -+{ -+ int result = 0; -+ -+ assert("edward-1025", idata != NULL); -+ -+ if (idata->flow.length) { -+ /* append or overwrite */ -+ switch (idata->d_cur) { -+ case DC_FIRST_ITEM: -+ case DC_CHAINED_ITEM: -+ *mode = CRC_OVERWRITE_ITEM; -+ break; -+ case DC_AFTER_CLUSTER: -+ *mode = CRC_APPEND_ITEM; -+ break; -+ default: -+ impossible("edward-1018", "wrong current item state"); -+ } -+ } else { -+ /* cut or invalidate */ -+ switch (idata->d_cur) { -+ case DC_FIRST_ITEM: -+ case DC_CHAINED_ITEM: -+ *mode = CRC_CUT_ITEM; -+ break; -+ case DC_AFTER_CLUSTER: -+ result = 1; -+ break; -+ default: -+ impossible("edward-1019", "wrong current item state"); -+ } -+ } -+ return result; -+} -+ -+/* plugin->u.item.f.convert */ -+/* write ctail in guessed mode */ -+int convert_ctail(flush_pos_t * pos) -+{ -+ int result; -+ int nr_items; -+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM; -+ -+ assert("edward-1020", pos != NULL); -+ assert("edward-1213", coord_num_items(&pos->coord) != 0); -+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID); -+ assert("edward-1258", ctail_ok(&pos->coord)); -+ assert("edward-261", pos->coord.node != NULL); -+ -+ nr_items = coord_num_items(&pos->coord); -+ if (!chaining_data_present(pos)) { -+ if (should_attach_convert_idata(pos)) { -+ /* attach convert item info */ -+ struct inode *inode; -+ -+ assert("edward-264", pos->child != NULL); -+ assert("edward-265", 
jnode_page(pos->child) != NULL); -+ assert("edward-266", -+ jnode_page(pos->child)->mapping != NULL); -+ -+ inode = jnode_page(pos->child)->mapping->host; -+ -+ assert("edward-267", inode != NULL); -+ -+ /* attach item convert info by child and put the last one */ -+ result = attach_convert_idata(pos, inode); -+ pos->child = NULL; -+ if (result == -E_REPEAT) { -+ /* jnode became clean, or there are no dirty -+ pages (nothing to update in disk cluster) */ -+ warning("edward-1021", -+ "convert_ctail: nothing to attach"); -+ return 0; -+ } -+ if (result != 0) -+ return result; -+ } else -+ /* unconvertible */ -+ return 0; -+ } else { -+ /* use old convert info */ -+ -+ struct convert_item_info *idata; -+ -+ idata = item_convert_data(pos); -+ -+ result = assign_convert_mode(idata, &mode); -+ if (result) { -+ /* disk cluster is over, -+ nothing to update anymore */ -+ detach_convert_idata(pos->sq); -+ return 0; -+ } -+ } -+ -+ assert("edward-433", chaining_data_present(pos)); -+ assert("edward-1022", -+ pos->coord.item_pos < coord_num_items(&pos->coord)); -+ -+ /* check if next item is of current disk cluster */ -+ result = next_item_dc_stat(pos); -+ if (result) { -+ detach_convert_idata(pos->sq); -+ return result; -+ } -+ result = do_convert_ctail(pos, mode); -+ if (result) { -+ detach_convert_idata(pos->sq); -+ return result; -+ } -+ switch (mode) { -+ case CRC_CUT_ITEM: -+ assert("edward-1214", item_convert_data(pos)->flow.length == 0); -+ assert("edward-1215", -+ coord_num_items(&pos->coord) == nr_items || -+ coord_num_items(&pos->coord) == nr_items - 1); -+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM) -+ break; -+ if (coord_num_items(&pos->coord) != nr_items) { -+ /* the item was killed, no more chained items */ -+ detach_convert_idata(pos->sq); -+ if (!node_is_empty(pos->coord.node)) -+ /* make sure the next item will be scanned */ -+ coord_init_before_item(&pos->coord); -+ break; -+ } -+ case CRC_APPEND_ITEM: -+ assert("edward-434", item_convert_data(pos)->flow.length == 0); -+ detach_convert_idata(pos->sq); -+ break; -+ case CRC_OVERWRITE_ITEM: -+ if (coord_is_unprepped_ctail(&pos->coord)) { -+ /* convert unprepped ctail to prepped one */ -+ assert("edward-1259", -+ cluster_shift_ok(item_convert_data(pos)-> -+ cluster_shift)); -+ put_unaligned((d8)item_convert_data(pos)->cluster_shift, -+ &ctail_formatted_at(&pos->coord)-> -+ cluster_shift); -+ } -+ break; -+ } -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.30/fs/reiser4/plugin/item/ctail.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/ctail.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,102 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Ctail items are fragments (or bodies) of special type to provide -+ optimal storage of encrypted and(or) compressed files.
*/ -+ -+ -+#if !defined( __FS_REISER4_CTAIL_H__ ) -+#define __FS_REISER4_CTAIL_H__ -+ -+/* Disk format of ctail item */ -+typedef struct ctail_item_format { -+ /* packed shift; -+ if its value is different from UCTAIL_SHIFT (see below), then -+ size of disk cluster is calculated as (1 << cluster_shift) */ -+ d8 cluster_shift; -+ /* ctail body */ -+ d8 body[0]; -+} __attribute__ ((packed)) ctail_item_format; -+ -+/* "Unprepped" disk cluster is represented by a single ctail item -+ with the following "magic" attributes: */ -+/* "magic" cluster_shift */ -+#define UCTAIL_SHIFT 0xff -+/* How many units unprepped ctail item has */ -+#define UCTAIL_NR_UNITS 1 -+ -+/* The following is a set of various item states in a disk cluster. -+ Disk cluster is a set of items whose keys belong to the interval -+ [dc_key , dc_key + disk_cluster_size - 1] */ -+typedef enum { -+ DC_INVALID_STATE = 0, -+ DC_FIRST_ITEM = 1, -+ DC_CHAINED_ITEM = 2, -+ DC_AFTER_CLUSTER = 3 -+} dc_item_stat; -+ -+/* ctail-specific extension. -+ In particular this describes parameters of disk cluster an item belongs to */ -+struct ctail_coord_extension { -+ int shift; /* this contains cluster_shift extracted from -+ ctail_item_format (above), or UCTAIL_SHIFT -+ (the last one is the "magic" of unprepped disk clusters)*/ -+ int dsize; /* size of a prepped disk cluster */ -+ int ncount; /* count of nodes occupied by a disk cluster */ -+}; -+ -+struct cut_list; -+ -+/* plugin->item.b.* */ -+int can_contain_key_ctail(const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+int mergeable_ctail(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_ctail(const coord_t * coord); -+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data); -+void print_ctail(const char *prefix, coord_t * coord); -+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *); -+ -+int paste_ctail(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG); -+int init_ctail(coord_t *, coord_t *, reiser4_item_data *); -+int can_shift_ctail(unsigned free_space, coord_t * coord, -+ znode * target, shift_direction pend, unsigned *size, -+ unsigned want); -+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int ctail_ok(const coord_t * coord); -+int check_ctail(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.* */ -+int read_ctail(struct file *, flow_t *, hint_t *); -+int readpage_ctail(void *, struct page *); -+int readpages_ctail(struct file *, struct address_space *, struct list_head *); -+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *); -+int create_hook_ctail(const coord_t * coord, void *arg); -+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t, -+ carry_kill_data *); -+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *); -+ -+/* plugin->u.item.f */ -+int utmost_child_ctail(const coord_t *, sideof, jnode **); -+int scan_ctail(flush_scan *); -+int convert_ctail(flush_pos_t *); -+size_t inode_scaled_cluster_size(struct inode *); -+ -+#endif /* __FS_REISER4_CTAIL_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/extent.c linux-2.6.30/fs/reiser4/plugin/item/extent.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/extent.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,197 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../key.h" -+#include "../../super.h" -+#include "../../carry.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../../flush.h" -+#include "../object.h" -+ -+/* prepare structure reiser4_item_data. It is used to put one extent unit into tree */ -+/* Audited by: green(2002.06.13) */ -+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, -+ int nr_extents) -+{ -+ data->data = ext_unit; -+ /* data->data is kernel space */ -+ data->user = 0; -+ data->length = sizeof(reiser4_extent) * nr_extents; -+ data->arg = NULL; -+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID); -+ return data; -+} -+ -+/* how many bytes are addressed by @nr first extents of the extent item */ -+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr) -+{ -+ pos_in_node_t i; -+ reiser4_block_nr blocks; -+ reiser4_extent *ext; -+ -+ ext = item_body_by_coord(coord); -+ assert("vs-263", nr <= nr_units_extent(coord)); -+ -+ blocks = 0; -+ for (i = 0; i < nr; i++, ext++) { -+ blocks += extent_get_width(ext); -+ } -+ -+ return blocks * current_blocksize; -+} -+ -+extent_state state_of_extent(reiser4_extent * ext) -+{ -+ switch ((int)extent_get_start(ext)) { -+ case 0: -+ return HOLE_EXTENT; -+ case 1: -+ return UNALLOCATED_EXTENT; -+ default: -+ break; -+ } -+ return ALLOCATED_EXTENT; -+} -+ -+int extent_is_unallocated(const coord_t * item) -+{ -+ assert("jmacd-5133", item_is_extent(item)); -+ -+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT; -+} -+ -+/* set extent's start and width */ -+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start, -+ reiser4_block_nr width) -+{ -+ extent_set_start(ext, start); -+ extent_set_width(ext, width); -+} -+ -+/** -+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it -+ * @un_extent: coordinate of extent to be overwritten -+ * @lh: need better comment -+ * @key: need better comment -+ * @exts_to_add: data prepared for insertion into tree -+ * @replace: need better comment -+ * @flags: need better comment -+ * @return_insert_position: need better comment -+ * -+ * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. If -+ * @return_inserted_position is 1 - @un_extent and @lh are returned set to -+ * first of newly inserted units, if it is 0 - @un_extent and @lh are returned -+ * set to extent which was overwritten. 
-+ */ -+int reiser4_replace_extent(struct replace_handle *h, -+ int return_inserted_position) -+{ -+ int result; -+ znode *orig_znode; -+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */ -+ -+ assert("vs-990", coord_is_existing_unit(h->coord)); -+ assert("vs-1375", znode_is_write_locked(h->coord->node)); -+ assert("vs-1426", extent_get_width(&h->overwrite) != 0); -+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0); -+ assert("vs-1427", ergo(h->nr_new_extents == 2, -+ extent_get_width(&h->new_extents[1]) != 0)); -+ -+ /* compose structure for paste */ -+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents); -+ -+ coord_dup(&h->coord_after, h->coord); -+ init_lh(&h->lh_after); -+ copy_lh(&h->lh_after, h->lh); -+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK); -+ reiser4_tap_monitor(&h->watch); -+ -+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord)); -+ orig_znode = h->coord->node; -+ -+#if REISER4_DEBUG -+ /* make sure that key is set properly */ -+ unit_key_by_coord(h->coord, &h->tmp); -+ set_key_offset(&h->tmp, -+ get_key_offset(&h->tmp) + -+ extent_get_width(&h->overwrite) * current_blocksize); -+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key)); -+#endif -+ -+ /* set insert point after unit to be replaced */ -+ h->coord->between = AFTER_UNIT; -+ -+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL, -+ &h->paste_key, &h->item, h->flags); -+ if (!result) { -+ /* now we have to replace the unit after which new units were -+ inserted. Its position is tracked by @watch */ -+ reiser4_extent *ext; -+ znode *node; -+ -+ node = h->coord_after.node; -+ if (node != orig_znode) { -+ coord_clear_iplug(&h->coord_after); -+ result = zload(node); -+ } -+ -+ if (likely(!result)) { -+ ext = extent_by_coord(&h->coord_after); -+ -+ assert("vs-987", znode_is_loaded(node)); -+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext))); -+ -+ /* overwrite extent unit */ -+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent)); -+ znode_make_dirty(node); -+ -+ if (node != orig_znode) -+ zrelse(node); -+ -+ if (return_inserted_position == 0) { -+ /* coord and lh are to be set to overwritten -+ extent */ -+ assert("vs-1662", -+ WITH_DATA(node, !memcmp(&h->overwrite, -+ extent_by_coord( -+ &h->coord_after), -+ sizeof(reiser4_extent)))); -+ -+ *h->coord = h->coord_after; -+ done_lh(h->lh); -+ copy_lh(h->lh, &h->lh_after); -+ } else { -+ /* h->coord and h->lh are to be set to first of -+ inserted units */ -+ assert("vs-1663", -+ WITH_DATA(h->coord->node, -+ !memcmp(&h->new_extents[0], -+ extent_by_coord(h->coord), -+ sizeof(reiser4_extent)))); -+ assert("vs-1664", h->lh->node == h->coord->node); -+ } -+ } -+ } -+ reiser4_tap_done(&h->watch); -+ -+ return result; -+} -+ -+lock_handle *znode_lh(znode *node) -+{ -+ assert("vs-1371", znode_is_write_locked(node)); -+ assert("vs-1372", znode_is_wlocked_once(node)); -+ return list_entry(node->lock.owners.next, lock_handle, owners_link); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.30/fs/reiser4/plugin/item/extent_file_ops.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/extent_file_ops.c 2009-06-22 17:27:31.000000000 +0200 -@@ -0,0 +1,1453 @@ -+/* COPYRIGHT 2001, 2002, 
2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../object.h" -+ -+#include <linux/quotaops.h> -+#include <linux/swap.h> -+ -+static inline reiser4_extent *ext_by_offset(const znode *node, int offset) -+{ -+ reiser4_extent *ext; -+ -+ ext = (reiser4_extent *) (zdata(node) + offset); -+ return ext; -+} -+ -+/** -+ * check_uf_coord - verify coord extension -+ * @uf_coord: -+ * @key: -+ * -+ * Makes sure that all fields of @uf_coord are set properly. If @key is -+ * specified - check whether @uf_coord is set correspondingly. -+ */ -+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key) -+{ -+#if REISER4_DEBUG -+ const coord_t *coord; -+ const struct extent_coord_extension *ext_coord; -+ reiser4_extent *ext; -+ -+ coord = &uf_coord->coord; -+ ext_coord = &uf_coord->extension.extent; -+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset); -+ -+ assert("", -+ WITH_DATA(coord->node, -+ (uf_coord->valid == 1 && -+ coord_is_iplug_set(coord) && -+ item_is_extent(coord) && -+ ext_coord->nr_units == nr_units_extent(coord) && -+ ext == extent_by_coord(coord) && -+ ext_coord->width == extent_get_width(ext) && -+ coord->unit_pos < ext_coord->nr_units && -+ ext_coord->pos_in_unit < ext_coord->width && -+ memcmp(ext, &ext_coord->extent, -+ sizeof(reiser4_extent)) == 0))); -+ if (key) { -+ reiser4_key coord_key; -+ -+ unit_key_by_coord(&uf_coord->coord, &coord_key); -+ set_key_offset(&coord_key, -+ get_key_offset(&coord_key) + -+ (uf_coord->extension.extent. -+ pos_in_unit << PAGE_CACHE_SHIFT)); -+ assert("", keyeq(key, &coord_key)); -+ } -+#endif -+} -+ -+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord) -+{ -+ check_uf_coord(uf_coord, NULL); -+ -+ return ext_by_offset(uf_coord->coord.node, -+ uf_coord->extension.extent.ext_offset); -+} -+ -+#if REISER4_DEBUG -+ -+/** -+ * offset_is_in_unit -+ * -+ * -+ * -+ */ -+/* return 1 if offset @off is inside of extent unit pointed to by @coord. 
Set -+ pos_in_unit inside of unit correspondingly */ -+static int offset_is_in_unit(const coord_t *coord, loff_t off) -+{ -+ reiser4_key unit_key; -+ __u64 unit_off; -+ reiser4_extent *ext; -+ -+ ext = extent_by_coord(coord); -+ -+ unit_key_extent(coord, &unit_key); -+ unit_off = get_key_offset(&unit_key); -+ if (off < unit_off) -+ return 0; -+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext)))) -+ return 0; -+ return 1; -+} -+ -+static int -+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key item_key; -+ -+ assert("vs-771", coord_is_existing_unit(coord)); -+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key))); -+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key))); -+ -+ return offset_is_in_unit(coord, get_key_offset(key)); -+} -+ -+#endif -+ -+/** -+ * can_append - -+ * @key: -+ * @coord: -+ * -+ * Returns 1 if @key is equal to an append key of item @coord is set to -+ */ -+static int can_append(const reiser4_key *key, const coord_t *coord) -+{ -+ reiser4_key append_key; -+ -+ return keyeq(key, append_key_extent(coord, &append_key)); -+} -+ -+/** -+ * append_hole -+ * @coord: -+ * @lh: -+ * @key: -+ * -+ */ -+static int append_hole(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key) -+{ -+ reiser4_key append_key; -+ reiser4_block_nr hole_width; -+ reiser4_extent *ext, new_ext; -+ reiser4_item_data idata; -+ -+ /* last item of file may have to be appended with hole */ -+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL); -+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID); -+ -+ /* key of first byte which is not addressed by this extent */ -+ append_key_extent(coord, &append_key); -+ -+ assert("", keyle(&append_key, key)); -+ -+ /* -+ * extent item has to be appended with hole. Calculate length of that -+ * hole -+ */ -+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) + -+ current_blocksize - 1) >> current_blocksize_bits); -+ assert("vs-954", hole_width > 0); -+ -+ /* set coord after last unit */ -+ coord_init_after_item_end(coord); -+ -+ /* get last extent in the item */ -+ ext = extent_by_coord(coord); -+ if (state_of_extent(ext) == HOLE_EXTENT) { -+ /* -+ * last extent of a file is hole extent. Widen that extent by -+ * @hole_width blocks. 
Note that we do not worry about -+ * overflowing - extent width is 64 bits -+ */ -+ reiser4_set_extent(ext, HOLE_EXTENT_START, -+ extent_get_width(ext) + hole_width); -+ znode_make_dirty(coord->node); -+ return 0; -+ } -+ -+ /* append last item of the file with hole extent unit */ -+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT || -+ state_of_extent(ext) == UNALLOCATED_EXTENT)); -+ -+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); -+ init_new_extent(&idata, &new_ext, 1); -+ return insert_into_item(coord, lh, &append_key, &idata, 0); -+} -+ -+/** -+ * check_jnodes -+ * @twig: longterm locked twig node -+ * @key: -+ * -+ */ -+static void check_jnodes(znode *twig, const reiser4_key *key, int count) -+{ -+#if REISER4_DEBUG -+ coord_t c; -+ reiser4_key node_key, jnode_key; -+ -+ jnode_key = *key; -+ -+ assert("", twig != NULL); -+ assert("", znode_get_level(twig) == TWIG_LEVEL); -+ assert("", znode_is_write_locked(twig)); -+ -+ zload(twig); -+ /* get the smallest key in twig node */ -+ coord_init_first_unit(&c, twig); -+ unit_key_by_coord(&c, &node_key); -+ assert("", keyle(&node_key, &jnode_key)); -+ -+ coord_init_last_unit(&c, twig); -+ unit_key_by_coord(&c, &node_key); -+ if (item_plugin_by_coord(&c)->s.file.append_key) -+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key); -+ set_key_offset(&jnode_key, -+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1); -+ assert("", keylt(&jnode_key, &node_key)); -+ zrelse(twig); -+#endif -+} -+ -+/** -+ * append_last_extent - append last file item -+ * @uf_coord: coord to start insertion from -+ * @jnodes: array of jnodes -+ * @count: number of jnodes in the array -+ * -+ * There is already at least one extent item of file @inode in the tree. Append -+ * the last of them with unallocated extent unit of width @count. Assign -+ * fake block numbers to jnodes corresponding to the inserted extent. -+ */ -+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count) -+{ -+ int result; -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ reiser4_extent *ext; -+ reiser4_block_nr block; -+ jnode *node; -+ int i; -+ -+ coord = &uf_coord->coord; -+ ext_coord = &uf_coord->extension.extent; -+ ext = ext_by_ext_coord(uf_coord); -+ -+ /* check correctness of position in the item */ -+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord)); -+ assert("vs-1311", coord->between == AFTER_UNIT); -+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1); -+ -+ if (!can_append(key, coord)) { -+ /* hole extent has to be inserted */ -+ result = append_hole(coord, uf_coord->lh, key); -+ uf_coord->valid = 0; -+ return result; -+ } -+ -+ if (count == 0) -+ return 0; -+ -+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE); -+ -+ result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host, -+ count); -+ BUG_ON(result != 0); -+ -+ switch (state_of_extent(ext)) { -+ case UNALLOCATED_EXTENT: -+ /* -+ * last extent unit of the file is unallocated one. 
Increase -+ * its width by @count -+ */ -+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, -+ extent_get_width(ext) + count); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ ext_coord->width += count; -+ ON_DEBUG(extent_set_width -+ (&uf_coord->extension.extent.extent, -+ ext_coord->width)); -+ break; -+ -+ case HOLE_EXTENT: -+ case ALLOCATED_EXTENT: -+ /* -+ * last extent unit of the file is either hole or allocated -+ * one. Append one unallocated extent of width @count -+ */ -+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); -+ init_new_extent(&idata, &new_ext, 1); -+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0); -+ uf_coord->valid = 0; -+ if (result) -+ return result; -+ break; -+ -+ default: -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * make sure that we hold long term locked twig node containing all -+ * jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, key, count); -+ -+ /* -+ * assign fake block numbers to all jnodes. FIXME: make sure whether -+ * twig node containing inserted extent item is locked -+ */ -+ block = fake_blocknr_unformatted(count); -+ for (i = 0; i < count; i ++, block ++) { -+ node = jnodes[i]; -+ spin_lock_jnode(node); -+ JF_SET(node, JNODE_CREATED); -+ jnode_set_block(node, &block); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ return count; -+} -+ -+/** -+ * insert_first_hole - insert hole extent into tree -+ * @coord: -+ * @lh: -+ * @key: -+ * -+ * -+ */ -+static int insert_first_hole(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key) -+{ -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ reiser4_key item_key; -+ reiser4_block_nr hole_width; -+ -+ /* @coord must be set for inserting of new item */ -+ assert("vs-711", coord_is_between_items(coord)); -+ -+ item_key = *key; -+ set_key_offset(&item_key, 0ull); -+ -+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >> -+ current_blocksize_bits); -+ assert("vs-710", hole_width > 0); -+ -+ /* compose body of hole extent and insert item into tree */ -+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); -+ init_new_extent(&idata, &new_ext, 1); -+ return insert_extent_by_coord(coord, &idata, &item_key, lh); -+} -+ -+ -+/** -+ * insert_first_extent - insert first file item -+ * @inode: inode of file -+ * @uf_coord: coord to start insertion from -+ * @jnodes: array of jnodes -+ * @count: number of jnodes in the array -+ * @inode: -+ * -+ * There are no items of file @inode in the tree yet. Insert unallocated extent -+ * of width @count into tree or hole extent if writing not to the -+ * beginning. Assign fake block numbers to jnodes corresponding to the inserted -+ * unallocated extent. Returns number of jnodes or error code.
-+ */ -+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count, -+ struct inode *inode) -+{ -+ int result; -+ int i; -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ reiser4_block_nr block; -+ struct unix_file_info *uf_info; -+ jnode *node; -+ -+ /* first extent insertion starts at leaf level */ -+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL); -+ assert("vs-711", coord_is_between_items(&uf_coord->coord)); -+ -+ if (get_key_offset(key) != 0) { -+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key); -+ uf_coord->valid = 0; -+ uf_info = unix_file_inode_data(inode); -+ -+ /* -+ * first item insertion is only possible when writing to empty -+ * file or performing tail conversion -+ */ -+ assert("", (uf_info->container == UF_CONTAINER_EMPTY || -+ (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) && -+ reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)))); -+ /* if file was empty - update its state */ -+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ return result; -+ } -+ -+ if (count == 0) -+ return 0; -+ -+ result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host, -+ count); -+ BUG_ON(result != 0); -+ -+ /* -+ * prepare for tree modification: compose body of item and item data -+ * structure needed for insertion -+ */ -+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); -+ init_new_extent(&idata, &new_ext, 1); -+ -+ /* insert extent item into the tree */ -+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key, -+ uf_coord->lh); -+ if (result) -+ return result; -+ -+ /* -+ * make sure that we hold long term locked twig node containing all -+ * jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, key, count); -+ /* -+ * assign fake block numbers to all jnodes, capture and mark them dirty -+ */ -+ block = fake_blocknr_unformatted(count); -+ for (i = 0; i < count; i ++, block ++) { -+ node = jnodes[i]; -+ spin_lock_jnode(node); -+ JF_SET(node, JNODE_CREATED); -+ jnode_set_block(node, &block); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ -+ /* -+ * invalidate coordinate, research must be performed to continue -+ * because write will continue on twig level -+ */ -+ uf_coord->valid = 0; -+ return count; -+} -+ -+/** -+ * plug_hole - replace hole extent with unallocated and holes -+ * @uf_coord: -+ * @key: -+ * @node: -+ * @h: structure containing coordinate, lock handle, key, etc -+ * -+ * Creates an unallocated extent of width 1 within a hole. In worst case two -+ * additional extents can be created. 
-+ */ -+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how) -+{ -+ struct replace_handle rh; -+ reiser4_extent *ext; -+ reiser4_block_nr width, pos_in_unit; -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ int return_inserted_position; -+ -+ check_uf_coord(uf_coord, key); -+ -+ rh.coord = coord_by_uf_coord(uf_coord); -+ rh.lh = uf_coord->lh; -+ rh.flags = 0; -+ -+ coord = coord_by_uf_coord(uf_coord); -+ ext_coord = ext_coord_by_uf_coord(uf_coord); -+ ext = ext_by_ext_coord(uf_coord); -+ -+ width = ext_coord->width; -+ pos_in_unit = ext_coord->pos_in_unit; -+ -+ *how = 0; -+ if (width == 1) { -+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1); -+ znode_make_dirty(coord->node); -+ /* update uf_coord */ -+ ON_DEBUG(ext_coord->extent = *ext); -+ *how = 1; -+ return 0; -+ } else if (pos_in_unit == 0) { -+ /* we deal with first element of extent */ -+ if (coord->unit_pos) { -+ /* there is an extent to the left */ -+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) { -+ /* -+ * left neighboring unit is an unallocated -+ * extent. Increase its width and decrease -+ * width of hole -+ */ -+ extent_set_width(ext - 1, -+ extent_get_width(ext - 1) + 1); -+ extent_set_width(ext, width - 1); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ coord->unit_pos--; -+ ext_coord->width = extent_get_width(ext - 1); -+ ext_coord->pos_in_unit = ext_coord->width - 1; -+ ext_coord->ext_offset -= sizeof(reiser4_extent); -+ ON_DEBUG(ext_coord->extent = -+ *extent_by_coord(coord)); -+ *how = 2; -+ return 0; -+ } -+ } -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1); -+ /* extent to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START, -+ width - 1); -+ rh.nr_new_extents = 1; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to unit which was replaced */ -+ return_inserted_position = 0; -+ *how = 3; -+ } else if (pos_in_unit == width - 1) { -+ /* we deal with last element of extent */ -+ if (coord->unit_pos < nr_units_extent(coord) - 1) { -+ /* there is an extent unit to the right */ -+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) { -+ /* -+ * right neighboring unit is an unallocated -+ * extent. 
Increase its width and decrease -+ * width of hole -+ */ -+ extent_set_width(ext + 1, -+ extent_get_width(ext + 1) + 1); -+ extent_set_width(ext, width - 1); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ coord->unit_pos++; -+ ext_coord->width = extent_get_width(ext + 1); -+ ext_coord->pos_in_unit = 0; -+ ext_coord->ext_offset += sizeof(reiser4_extent); -+ ON_DEBUG(ext_coord->extent = -+ *extent_by_coord(coord)); -+ *how = 4; -+ return 0; -+ } -+ } -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1); -+ /* extent to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, -+ 1); -+ rh.nr_new_extents = 1; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to unit which was inserted */ -+ return_inserted_position = 1; -+ *how = 5; -+ } else { -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, -+ pos_in_unit); -+ /* extents to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, -+ 1); -+ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START, -+ width - pos_in_unit - 1); -+ rh.nr_new_extents = 2; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to first of units which were inserted */ -+ return_inserted_position = 1; -+ *how = 6; -+ } -+ unit_key_by_coord(coord, &rh.paste_key); -+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) + -+ extent_get_width(&rh.overwrite) * current_blocksize); -+ -+ uf_coord->valid = 0; -+ return reiser4_replace_extent(&rh, return_inserted_position); -+} -+ -+/** -+ * overwrite_one_block - -+ * @uf_coord: -+ * @key: -+ * @node: -+ * -+ * If @node corresponds to hole extent - create unallocated extent for it and -+ * assign fake block number. If @node corresponds to allocated extent - assign -+ * block number of jnode -+ */ -+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode *node, int *hole_plugged) -+{ -+ int result; -+ struct extent_coord_extension *ext_coord; -+ reiser4_extent *ext; -+ reiser4_block_nr block; -+ int how; -+ -+ assert("vs-1312", uf_coord->coord.between == AT_UNIT); -+ -+ result = 0; -+ ext_coord = ext_coord_by_uf_coord(uf_coord); -+ ext = ext_by_ext_coord(uf_coord); -+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT); -+ -+ switch (state_of_extent(ext)) { -+ case ALLOCATED_EXTENT: -+ block = extent_get_start(ext) + ext_coord->pos_in_unit; -+ break; -+ -+ case HOLE_EXTENT: -+ result = vfs_dq_alloc_block_nodirty(mapping_jnode(node)->host, -+ 1); -+ BUG_ON(result != 0); -+ result = plug_hole(uf_coord, key, &how); -+ if (result) -+ return result; -+ block = fake_blocknr_unformatted(1); -+ if (hole_plugged) -+ *hole_plugged = 1; -+ JF_SET(node, JNODE_CREATED); -+ break; -+ -+ default: -+ return RETERR(-EIO); -+ } -+ -+ jnode_set_block(node, &block); -+ return 0; -+} -+ -+/** -+ * move_coord - move coordinate forward -+ * @uf_coord: -+ * -+ * Move coordinate one data block pointer forward. Return 1 if coord is set to -+ * the last one already or is invalid. -+ */ -+static int move_coord(uf_coord_t *uf_coord) -+{ -+ struct extent_coord_extension *ext_coord; -+ -+ if (uf_coord->valid == 0) -+ return 1; -+ ext_coord = &uf_coord->extension.extent; -+ ext_coord->pos_in_unit ++; -+ if (ext_coord->pos_in_unit < ext_coord->width) -+ /* coordinate moved within the unit */ -+ return 0; -+ -+ /* end of unit is reached. 
Try to move to next unit */ -+ ext_coord->pos_in_unit = 0; -+ uf_coord->coord.unit_pos ++; -+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) { -+ /* coordinate moved to next unit */ -+ ext_coord->ext_offset += sizeof(reiser4_extent); -+ ext_coord->width = -+ extent_get_width(ext_by_offset -+ (uf_coord->coord.node, -+ ext_coord->ext_offset)); -+ ON_DEBUG(ext_coord->extent = -+ *ext_by_offset(uf_coord->coord.node, -+ ext_coord->ext_offset)); -+ return 0; -+ } -+ /* end of item is reached */ -+ uf_coord->valid = 0; -+ return 1; -+} -+ -+/** -+ * overwrite_extent - -+ * @inode: -+ * -+ * Returns number of handled jnodes. -+ */ -+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count, int *plugged_hole) -+{ -+ int result; -+ reiser4_key k; -+ int i; -+ jnode *node; -+ -+ k = *key; -+ for (i = 0; i < count; i ++) { -+ node = jnodes[i]; -+ if (*jnode_get_block(node) == 0) { -+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole); -+ if (result) -+ return result; -+ } -+ /* -+ * make sure that we hold long term locked twig node containing -+ * all jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, &k, 1); -+ /* -+ * assign fake block numbers to all jnodes, capture and mark -+ * them dirty -+ */ -+ spin_lock_jnode(node); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ -+ if (uf_coord->valid == 0) -+ return i + 1; -+ -+ check_uf_coord(uf_coord, &k); -+ -+ if (move_coord(uf_coord)) { -+ /* -+ * failed to move to the next node pointer. Either end -+ * of file or end of twig node is reached. In the latter -+ * case we might go to the right neighbor. -+ */ -+ uf_coord->valid = 0; -+ return i + 1; -+ } -+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE); -+ } -+ -+ return count; -+} -+ -+/** -+ * reiser4_update_extent -+ * @file: -+ * @jnodes: -+ * @count: -+ * @off: -+ * -+ */ -+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos, -+ int *plugged_hole) -+{ -+ int result; -+ znode *loaded; -+ uf_coord_t uf_coord; -+ coord_t *coord; -+ lock_handle lh; -+ reiser4_key key; -+ -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ -+ key_by_inode_and_offset_common(inode, pos, &key); -+ -+ init_uf_coord(&uf_coord, &lh); -+ coord = &uf_coord.coord; -+ result = find_file_item_nohint(coord, &lh, &key, -+ ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) { -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+ } -+ -+ result = zload(coord->node); -+ BUG_ON(result != 0); -+ loaded = coord->node; -+ -+ if (coord->between == AFTER_UNIT) { -+ /* -+ * append existing extent item with unallocated extent of width -+ * nr_jnodes -+ */ -+ init_coord_extension_extent(&uf_coord, -+ get_key_offset(&key)); -+ result = append_last_extent(&uf_coord, &key, -+ &node, 1); -+ } else if (coord->between == AT_UNIT) { -+ /* -+ * overwrite -+ * not optimal yet. Will be optimized if new write will show -+ * performance win. -+ */ -+ init_coord_extension_extent(&uf_coord, -+ get_key_offset(&key)); -+ result = overwrite_extent(&uf_coord, &key, -+ &node, 1, plugged_hole); -+ } else { -+ /* -+ * there are no items of this file in the tree yet.
Create -+ * first item of the file inserting one unallocated extent of -+ * width nr_jnodes -+ */ -+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode); -+ } -+ assert("", result == 1 || result < 0); -+ zrelse(loaded); -+ done_lh(&lh); -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return (result == 1) ? 0 : result; -+} -+ -+/** -+ * update_extents -+ * @file: -+ * @jnodes: -+ * @count: -+ * @off: -+ * -+ */ -+static int update_extents(struct file *file, struct inode *inode, -+ jnode **jnodes, int count, loff_t pos) -+{ -+ struct hint hint; -+ reiser4_key key; -+ int result; -+ znode *loaded; -+ -+ result = load_file_hint(file, &hint); -+ BUG_ON(result != 0); -+ -+ if (count != 0) -+ /* -+ * count == 0 is special case: expanding truncate -+ */ -+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT; -+ key_by_inode_and_offset_common(inode, pos, &key); -+ -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ -+ do { -+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) { -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+ } -+ -+ result = zload(hint.ext_coord.coord.node); -+ BUG_ON(result != 0); -+ loaded = hint.ext_coord.coord.node; -+ -+ if (hint.ext_coord.coord.between == AFTER_UNIT) { -+ /* -+ * append existing extent item with unallocated extent -+ * of width nr_jnodes -+ */ -+ if (hint.ext_coord.valid == 0) -+ /* NOTE: get statistics on this */ -+ init_coord_extension_extent(&hint.ext_coord, -+ get_key_offset(&key)); -+ result = append_last_extent(&hint.ext_coord, &key, -+ jnodes, count); -+ } else if (hint.ext_coord.coord.between == AT_UNIT) { -+ /* -+ * overwrite -+ * not optimal yet. Will be optimized if new write will -+ * show performance win. -+ */ -+ if (hint.ext_coord.valid == 0) -+ /* NOTE: get statistics on this */ -+ init_coord_extension_extent(&hint.ext_coord, -+ get_key_offset(&key)); -+ result = overwrite_extent(&hint.ext_coord, &key, -+ jnodes, count, NULL); -+ } else { -+ /* -+ * there are no items of this file in the tree -+ * yet. Create first item of the file inserting one -+ * unallocated extent of * width nr_jnodes -+ */ -+ result = insert_first_extent(&hint.ext_coord, &key, -+ jnodes, count, inode); -+ } -+ zrelse(loaded); -+ if (result < 0) { -+ done_lh(hint.ext_coord.lh); -+ break; -+ } -+ -+ jnodes += result; -+ count -= result; -+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE); -+ -+ /* seal and unlock znode */ -+ if (hint.ext_coord.valid) -+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK); -+ else -+ reiser4_unset_hint(&hint); -+ -+ } while (count > 0); -+ -+ save_file_hint(file, &hint); -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+} -+ -+/** -+ * write_extent_reserve_space - reserve space for extent write operation -+ * @inode: -+ * -+ * Estimates and reserves space which may be required for writing -+ * WRITE_GRANULARITY pages of file. -+ */ -+static int write_extent_reserve_space(struct inode *inode) -+{ -+ __u64 count; -+ reiser4_tree *tree; -+ -+ /* -+ * to write WRITE_GRANULARITY pages to a file by extents we have to -+ * reserve disk space for: -+ -+ * 1. find_file_item may have to insert empty node to the tree (empty -+ * leaf node between two extent items). This requires 1 block and -+ * number of blocks which are necessary to perform insertion of an -+ * internal item into twig level. -+ -+ * 2. 
for each of written pages there might be needed 1 block and -+ * number of blocks which might be necessary to perform insertion of or -+ * paste to an extent item. -+ -+ * 3. stat data update -+ */ -+ tree = reiser4_tree_by_inode(inode); -+ count = estimate_one_insert_item(tree) + -+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) + -+ estimate_one_insert_item(tree); -+ grab_space_enable(); -+ return reiser4_grab_space(count, 0 /* flags */); -+} -+ -+/* -+ * filemap_copy_from_user no longer exists in generic code, because it -+ * is deadlocky (copying from user while holding the page lock is bad). -+ * As a temporary fix for reiser4, just define it here. -+ */ -+static inline size_t -+filemap_copy_from_user(struct page *page, unsigned long offset, -+ const char __user *buf, unsigned bytes) -+{ -+ char *kaddr; -+ int left; -+ -+ kaddr = kmap_atomic(page, KM_USER0); -+ left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); -+ kunmap_atomic(kaddr, KM_USER0); -+ -+ if (left != 0) { -+ /* Do it the slow way */ -+ kaddr = kmap(page); -+ left = __copy_from_user_nocache(kaddr + offset, buf, bytes); -+ kunmap(page); -+ } -+ return bytes - left; -+} -+ -+/** -+ * reiser4_write_extent - write method of extent item plugin -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * -+ */ -+ssize_t reiser4_write_extent(struct file *file, struct inode * inode, -+ const char __user *buf, size_t count, loff_t *pos) -+{ -+ int have_to_update_extent; -+ int nr_pages, nr_dirty; -+ struct page *page; -+ jnode *jnodes[WRITE_GRANULARITY + 1]; -+ unsigned long index; -+ unsigned long end; -+ int i; -+ int to_page, page_off; -+ size_t left, written; -+ int result = 0; -+ -+ if (write_extent_reserve_space(inode)) -+ return RETERR(-ENOSPC); -+ -+ if (count == 0) { -+ /* truncate case */ -+ update_extents(file, inode, jnodes, 0, *pos); -+ return 0; -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ left = count; -+ index = *pos >> PAGE_CACHE_SHIFT; -+ /* calculate number of pages which are to be written */ -+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT); -+ nr_pages = end - index + 1; -+ nr_dirty = 0; -+ assert("", nr_pages <= WRITE_GRANULARITY + 1); -+ -+ /* get pages and jnodes */ -+ for (i = 0; i < nr_pages; i ++) { -+ page = find_or_create_page(inode->i_mapping, index + i, -+ reiser4_ctx_gfp_mask_get()); -+ if (page == NULL) { -+ nr_pages = i; -+ result = RETERR(-ENOMEM); -+ goto out; -+ } -+ -+ jnodes[i] = jnode_of_page(page); -+ if (IS_ERR(jnodes[i])) { -+ unlock_page(page); -+ page_cache_release(page); -+ nr_pages = i; -+ result = RETERR(-ENOMEM); -+ goto out; -+ } -+ /* prevent jnode and page from disconnecting */ -+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED); -+ unlock_page(page); -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ have_to_update_extent = 0; -+ -+ page_off = (*pos & (PAGE_CACHE_SIZE - 1)); -+ for (i = 0; i < nr_pages; i ++) { -+ to_page = PAGE_CACHE_SIZE - page_off; -+ if (to_page > left) -+ to_page = left; -+ page = jnode_page(jnodes[i]); -+ if (page_offset(page) < inode->i_size && -+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) { -+ /* -+ * the above is not optimal for partial write to last -+ * page of file when file size is not at boundary of -+ * page -+ */ -+ lock_page(page); -+ if (!PageUptodate(page)) { -+ result = readpage_unix_file(NULL, page); -+ BUG_ON(result != 0); -+ /* wait for read completion */ -+ lock_page(page); -+ 
BUG_ON(!PageUptodate(page)); -+ } else -+ result = 0; -+ unlock_page(page); -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ fault_in_pages_readable(buf, to_page); -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ lock_page(page); -+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) -+ zero_user_segments(page, 0, page_off, -+ page_off + to_page, -+ PAGE_CACHE_SIZE); -+ -+ written = filemap_copy_from_user(page, page_off, buf, to_page); -+ if (unlikely(written != to_page)) { -+ unlock_page(page); -+ result = RETERR(-EFAULT); -+ break; -+ } -+ -+ flush_dcache_page(page); -+ set_page_dirty_notag(page); -+ unlock_page(page); -+ nr_dirty++; -+ -+ mark_page_accessed(page); -+ SetPageUptodate(page); -+ -+ if (jnodes[i]->blocknr == 0) -+ have_to_update_extent ++; -+ -+ page_off = 0; -+ buf += to_page; -+ left -= to_page; -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ } -+ -+ if (have_to_update_extent) { -+ update_extents(file, inode, jnodes, nr_dirty, *pos); -+ } else { -+ for (i = 0; i < nr_dirty; i ++) { -+ int ret; -+ spin_lock_jnode(jnodes[i]); -+ ret = reiser4_try_capture(jnodes[i], -+ ZNODE_WRITE_LOCK, 0); -+ BUG_ON(ret != 0); -+ jnode_make_dirty_locked(jnodes[i]); -+ spin_unlock_jnode(jnodes[i]); -+ } -+ } -+out: -+ for (i = 0; i < nr_pages; i ++) { -+ page_cache_release(jnode_page(jnodes[i])); -+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED); -+ jput(jnodes[i]); -+ } -+ -+ /* the only errors handled so far is ENOMEM and -+ EFAULT on copy_from_user */ -+ -+ return (count - left) ? (count - left) : result; -+} -+ -+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos, -+ struct page *page) -+{ -+ jnode *j; -+ struct address_space *mapping; -+ unsigned long index; -+ oid_t oid; -+ reiser4_block_nr block; -+ -+ mapping = page->mapping; -+ oid = get_inode_oid(mapping->host); -+ index = page->index; -+ -+ switch (state_of_extent(ext)) { -+ case HOLE_EXTENT: -+ /* -+ * it is possible to have hole page with jnode, if page was -+ * eflushed previously. -+ */ -+ j = jfind(mapping, index); -+ if (j == NULL) { -+ zero_user(page, 0, PAGE_CACHE_SIZE); -+ SetPageUptodate(page); -+ unlock_page(page); -+ return 0; -+ } -+ spin_lock_jnode(j); -+ if (!jnode_page(j)) { -+ jnode_attach_page(j, page); -+ } else { -+ BUG_ON(jnode_page(j) != page); -+ assert("vs-1504", jnode_page(j) == page); -+ } -+ block = *jnode_get_io_block(j); -+ spin_unlock_jnode(j); -+ if (block == 0) { -+ zero_user(page, 0, PAGE_CACHE_SIZE); -+ SetPageUptodate(page); -+ unlock_page(page); -+ jput(j); -+ return 0; -+ } -+ break; -+ -+ case ALLOCATED_EXTENT: -+ j = jnode_of_page(page); -+ if (IS_ERR(j)) -+ return PTR_ERR(j); -+ if (*jnode_get_block(j) == 0) { -+ reiser4_block_nr blocknr; -+ -+ blocknr = extent_get_start(ext) + pos; -+ jnode_set_block(j, &blocknr); -+ } else -+ assert("vs-1403", -+ j->blocknr == extent_get_start(ext) + pos); -+ break; -+ -+ case UNALLOCATED_EXTENT: -+ j = jfind(mapping, index); -+ assert("nikita-2688", j); -+ assert("vs-1426", jnode_page(j) == NULL); -+ -+ spin_lock_jnode(j); -+ jnode_attach_page(j, page); -+ spin_unlock_jnode(j); -+ break; -+ -+ default: -+ warning("vs-957", "wrong extent\n"); -+ return RETERR(-EIO); -+ } -+ -+ BUG_ON(j == 0); -+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get()); -+ jput(j); -+ return 0; -+} -+ -+/* Implements plugin->u.item.s.file.read operation for extent items. 
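The page-span arithmetic in reiser4_write_extent above (index, end, nr_pages, page_off, to_page) is the part most worth checking at the edges. A minimal user-space sketch of the same bookkeeping, assuming 4096-byte pages; all names are illustrative and not part of the patch:

/* Per-page bookkeeping of a write at (pos, count), mirroring the
 * setup and loop in reiser4_write_extent. Assumes 4096-byte pages. */
#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static void write_span(unsigned long long pos, unsigned long count)
{
    unsigned long index = pos >> PAGE_SHIFT;             /* first page */
    unsigned long end = (pos + count - 1) >> PAGE_SHIFT; /* last page */
    unsigned long nr_pages = end - index + 1;
    unsigned long page_off = pos & (PAGE_SIZE - 1);
    unsigned long left = count;
    unsigned long i;

    for (i = 0; i < nr_pages; i++) {
        unsigned long to_page = PAGE_SIZE - page_off;
        if (to_page > left)
            to_page = left;
        printf("page %lu: copy %lu bytes at offset %lu\n",
               index + i, to_page, page_off);
        left -= to_page;
        page_off = 0; /* only the first page starts mid-page */
    }
    assert(left == 0);
}

int main(void)
{
    write_span(4000, 200);  /* straddles a page boundary */
    write_span(8192, 4096); /* exactly one aligned page */
    return 0;
}

Only the first page can start at a non-zero offset, which is why page_off is reset to zero inside the loop, exactly as in the kernel function.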
*/ -+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint) -+{ -+ int result; -+ struct page *page; -+ unsigned long cur_page, next_page; -+ unsigned long page_off, count; -+ struct address_space *mapping; -+ loff_t file_off; -+ uf_coord_t *uf_coord; -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ unsigned long nr_pages; -+ char *kaddr; -+ -+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE); -+ assert("vs-572", flow->user == 1); -+ assert("vs-1351", flow->length > 0); -+ -+ uf_coord = &hint->ext_coord; -+ -+ check_uf_coord(uf_coord, NULL); -+ assert("vs-33", uf_coord->lh == &hint->lh); -+ -+ coord = &uf_coord->coord; -+ assert("vs-1119", znode_is_rlocked(coord->node)); -+ assert("vs-1120", znode_is_loaded(coord->node)); -+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key)); -+ -+ mapping = file->f_dentry->d_inode->i_mapping; -+ ext_coord = &uf_coord->extension.extent; -+ -+ /* offset in a file to start read from */ -+ file_off = get_key_offset(&flow->key); -+ /* offset within the page to start read from */ -+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1)); -+ /* bytes which can be read from the page which contains file_off */ -+ count = PAGE_CACHE_SIZE - page_off; -+ -+ /* index of page containing offset read is to start from */ -+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT); -+ next_page = cur_page; -+ /* number of pages flow spans over */ -+ nr_pages = -+ ((file_off + flow->length + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT) - cur_page; -+ -+ /* we start having twig node read locked. However, we do not want to -+ keep that lock all the time readahead works. So, set a sel and -+ release twig node. */ -+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK); -+ /* &hint->lh is done-ed */ -+ -+ do { -+ reiser4_txn_restart_current(); -+ page = read_mapping_page(mapping, cur_page, file); -+ if (IS_ERR(page)) -+ return PTR_ERR(page); -+ lock_page(page); -+ if (!PageUptodate(page)) { -+ unlock_page(page); -+ page_cache_release(page); -+ warning("jmacd-97178", "extent_read: page is not up to date"); -+ return RETERR(-EIO); -+ } -+ mark_page_accessed(page); -+ unlock_page(page); -+ -+ /* If users can be writing to this page using arbitrary virtual -+ addresses, take care about potential aliasing before reading -+ the page on the kernel side. 
-+ */ -+ if (mapping_writably_mapped(mapping)) -+ flush_dcache_page(page); -+ -+ assert("nikita-3034", reiser4_schedulable()); -+ -+ /* number of bytes which are to be read from the page */ -+ if (count > flow->length) -+ count = flow->length; -+ -+ result = fault_in_pages_writeable(flow->data, count); -+ if (result) { -+ page_cache_release(page); -+ return RETERR(-EFAULT); -+ } -+ -+ kaddr = kmap_atomic(page, KM_USER0); -+ result = __copy_to_user_inatomic(flow->data, -+ kaddr + page_off, count); -+ kunmap_atomic(kaddr, KM_USER0); -+ if (result != 0) { -+ kaddr = kmap(page); -+ result = __copy_to_user(flow->data, kaddr + page_off, count); -+ kunmap(page); -+ if (unlikely(result)) -+ return RETERR(-EFAULT); -+ } -+ -+ page_cache_release(page); -+ -+ /* increase key (flow->key), update user area pointer (flow->data) */ -+ move_flow_forward(flow, count); -+ -+ page_off = 0; -+ cur_page ++; -+ count = PAGE_CACHE_SIZE; -+ nr_pages--; -+ } while (flow->length); -+ -+ return 0; -+} -+ -+/* -+ plugin->s.file.readpage -+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage -+ or -+ filemap_fault->reiser4_readpage->readpage_unix_file->->readpage_extent -+ -+ At the beginning: coord->node is read locked, zloaded, page is -+ locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index) -+*/ -+int reiser4_readpage_extent(void *vp, struct page *page) -+{ -+ uf_coord_t *uf_coord = vp; -+ ON_DEBUG(coord_t * coord = &uf_coord->coord); -+ ON_DEBUG(reiser4_key key); -+ -+ assert("vs-1040", PageLocked(page)); -+ assert("vs-1050", !PageUptodate(page)); -+ assert("vs-1039", page->mapping && page->mapping->host); -+ -+ assert("vs-1044", znode_is_loaded(coord->node)); -+ assert("vs-758", item_is_extent(coord)); -+ assert("vs-1046", coord_is_existing_unit(coord)); -+ assert("vs-1045", znode_is_rlocked(coord->node)); -+ assert("vs-1047", -+ page->mapping->host->i_ino == -+ get_key_objectid(item_key_by_coord(coord, &key))); -+ check_uf_coord(uf_coord, NULL); -+ -+ return reiser4_do_readpage_extent( -+ ext_by_ext_coord(uf_coord), -+ uf_coord->extension.extent.pos_in_unit, page); -+} -+ -+/** -+ * get_block_address_extent -+ * @coord: -+ * @block: -+ * @result: -+ * -+ * -+ */ -+int get_block_address_extent(const coord_t *coord, sector_t block, -+ sector_t *result) -+{ -+ reiser4_extent *ext; -+ -+ if (!coord_is_existing_unit(coord)) -+ return RETERR(-EINVAL); -+ -+ ext = extent_by_coord(coord); -+ -+ if (state_of_extent(ext) != ALLOCATED_EXTENT) -+ /* FIXME: bad things may happen if it is unallocated extent */ -+ *result = 0; -+ else { -+ reiser4_key key; -+ -+ unit_key_by_coord(coord, &key); -+ assert("vs-1645", -+ block >= get_key_offset(&key) >> current_blocksize_bits); -+ assert("vs-1646", -+ block < -+ (get_key_offset(&key) >> current_blocksize_bits) + -+ extent_get_width(ext)); -+ *result = -+ extent_get_start(ext) + (block - -+ (get_key_offset(&key) >> -+ current_blocksize_bits)); -+ } -+ return 0; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of first byte which is the next to last byte by addressed by this extent -+*/ -+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ get_key_offset(key) + reiser4_extent_size(coord, -+ nr_units_extent -+ (coord))); -+ -+ assert("vs-610", get_key_offset(key) -+ && (get_key_offset(key) & (current_blocksize - 1)) == 0); -+ return key; -+} -+ -+/* 
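get_block_address_extent above reduces to arithmetic once the unit key is known: the file-relative block is offset by the key's block offset and added to the extent's start. A sketch of that mapping, with hypothetical user-space types standing in for reiser4's coord and key machinery:

/* Map a file block to a disk block through an allocated extent whose
 * unit key covers file block key_blk (cf. asserts vs-1645/vs-1646). */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

struct extent {
    uint64_t start; /* first disk block */
    uint64_t width; /* number of blocks */
};

static uint64_t map_block(const struct extent *ext, uint64_t key_blk,
                          uint64_t blk)
{
    assert(blk >= key_blk);             /* block is inside the unit */
    assert(blk < key_blk + ext->width);
    return ext->start + (blk - key_blk);
}

int main(void)
{
    struct extent ext = { .start = 1000, .width = 8 };

    /* unit covers file blocks 16..23, so file block 19 -> disk 1003 */
    printf("%llu\n", (unsigned long long)map_block(&ext, 16, 19));
    return 0;
}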
plugin->u.item.s.file.init_coord_extension */ -+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped) -+{ -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ reiser4_key key; -+ loff_t offset; -+ -+ assert("vs-1295", uf_coord->valid == 0); -+ -+ coord = &uf_coord->coord; -+ assert("vs-1288", coord_is_iplug_set(coord)); -+ assert("vs-1327", znode_is_loaded(coord->node)); -+ -+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT) -+ return; -+ -+ ext_coord = &uf_coord->extension.extent; -+ ext_coord->nr_units = nr_units_extent(coord); -+ ext_coord->ext_offset = -+ (char *)extent_by_coord(coord) - zdata(coord->node); -+ ext_coord->width = extent_get_width(extent_by_coord(coord)); -+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord)); -+ uf_coord->valid = 1; -+ -+ /* pos_in_unit is the only uninitialized field in extended coord */ -+ if (coord->between == AFTER_UNIT) { -+ assert("vs-1330", -+ coord->unit_pos == nr_units_extent(coord) - 1); -+ -+ ext_coord->pos_in_unit = ext_coord->width - 1; -+ } else { -+ /* AT_UNIT */ -+ unit_key_by_coord(coord, &key); -+ offset = get_key_offset(&key); -+ -+ assert("vs-1328", offset <= lookuped); -+ assert("vs-1329", -+ lookuped < -+ offset + ext_coord->width * current_blocksize); -+ ext_coord->pos_in_unit = -+ ((lookuped - offset) >> current_blocksize_bits); -+ } -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.30/fs/reiser4/plugin/item/extent_flush_ops.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/extent_flush_ops.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1028 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../tree.h" -+#include "../../jnode.h" -+#include "../../super.h" -+#include "../../flush.h" -+#include "../../carry.h" -+#include "../object.h" -+ -+#include <linux/pagemap.h> -+ -+static reiser4_block_nr extent_unit_start(const coord_t * item); -+ -+/* Return either first or last extent (depending on @side) of the item -+ @coord is set to. Set @pos_in_unit either to first or to last block -+ of extent. */ -+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side, -+ reiser4_block_nr * pos_in_unit) -+{ -+ reiser4_extent *ext; -+ -+ if (side == LEFT_SIDE) { -+ /* get first extent of item */ -+ ext = extent_item(coord); -+ *pos_in_unit = 0; -+ } else { -+ /* get last extent of item and last position within it */ -+ assert("vs-363", side == RIGHT_SIDE); -+ ext = extent_item(coord) + coord_last_unit_pos(coord); -+ *pos_in_unit = extent_get_width(ext) - 1; -+ } -+ -+ return ext; -+} -+ -+/* item_plugin->f.utmost_child */ -+/* Return the child. Coord is set to extent item. 
Find jnode corresponding -+ either to first or to last unformatted node pointed by the item */ -+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp) -+{ -+ reiser4_extent *ext; -+ reiser4_block_nr pos_in_unit; -+ -+ ext = extent_utmost_ext(coord, side, &pos_in_unit); -+ -+ switch (state_of_extent(ext)) { -+ case HOLE_EXTENT: -+ *childp = NULL; -+ return 0; -+ case ALLOCATED_EXTENT: -+ case UNALLOCATED_EXTENT: -+ break; -+ default: -+ /* this should never happen */ -+ assert("vs-1417", 0); -+ } -+ -+ { -+ reiser4_key key; -+ reiser4_tree *tree; -+ unsigned long index; -+ -+ if (side == LEFT_SIDE) { -+ /* get key of first byte addressed by the extent */ -+ item_key_by_coord(coord, &key); -+ } else { -+ /* get key of byte which next after last byte addressed by the extent */ -+ append_key_extent(coord, &key); -+ } -+ -+ assert("vs-544", -+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul); -+ /* index of first or last (depending on @side) page addressed -+ by the extent */ -+ index = -+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT); -+ if (side == RIGHT_SIDE) -+ index--; -+ -+ tree = coord->node->zjnode.tree; -+ *childp = jlookup(tree, get_key_objectid(&key), index); -+ } -+ -+ return 0; -+} -+ -+/* item_plugin->f.utmost_child_real_block */ -+/* Return the child's block, if allocated. */ -+int -+utmost_child_real_block_extent(const coord_t * coord, sideof side, -+ reiser4_block_nr * block) -+{ -+ reiser4_extent *ext; -+ -+ ext = extent_by_coord(coord); -+ -+ switch (state_of_extent(ext)) { -+ case ALLOCATED_EXTENT: -+ *block = extent_get_start(ext); -+ if (side == RIGHT_SIDE) -+ *block += extent_get_width(ext) - 1; -+ break; -+ case HOLE_EXTENT: -+ case UNALLOCATED_EXTENT: -+ *block = 0; -+ break; -+ default: -+ /* this should never happen */ -+ assert("vs-1418", 0); -+ } -+ -+ return 0; -+} -+ -+/* item_plugin->f.scan */ -+/* Performs leftward scanning starting from an unformatted node and its parent coordinate. -+ This scan continues, advancing the parent coordinate, until either it encounters a -+ formatted child or it finishes scanning this node. -+ -+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm -+ not sure this is last property (same atom) is enforced, but it should be the case since -+ one atom must write the parent and the others must read the parent, thus fusing?). In -+ any case, the code below asserts this case for unallocated extents. Unallocated -+ extents are thus optimized because we can skip to the endpoint when scanning. -+ -+ It returns control to reiser4_scan_extent, handles these terminating conditions, -+ e.g., by loading the next twig. -+*/ -+int reiser4_scan_extent(flush_scan * scan) -+{ -+ coord_t coord; -+ jnode *neighbor; -+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist; -+ reiser4_block_nr unit_start; -+ __u64 oid; -+ reiser4_key key; -+ int ret = 0, allocated, incr; -+ reiser4_tree *tree; -+ -+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) { -+ scan->stop = 1; -+ return 0; /* Race with truncate, this node is already -+ * truncated. */ -+ } -+ -+ coord_dup(&coord, &scan->parent_coord); -+ -+ assert("jmacd-1404", !reiser4_scan_finished(scan)); -+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL); -+ assert("jmacd-1406", jnode_is_unformatted(scan->node)); -+ -+ /* The scan_index variable corresponds to the current page index of the -+ unformatted block scan position. 
*/ -+ scan_index = index_jnode(scan->node); -+ -+ assert("jmacd-7889", item_is_extent(&coord)); -+ -+ repeat: -+ /* objectid of file */ -+ oid = get_key_objectid(item_key_by_coord(&coord, &key)); -+ -+ allocated = !extent_is_unallocated(&coord); -+ /* Get the values of this extent unit: */ -+ unit_index = extent_unit_index(&coord); -+ unit_width = extent_unit_width(&coord); -+ unit_start = extent_unit_start(&coord); -+ -+ assert("jmacd-7187", unit_width > 0); -+ assert("jmacd-7188", scan_index >= unit_index); -+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1); -+ -+ /* Depending on the scan direction, we set different maximum values for scan_index -+ (scan_max) and the number of nodes that would be passed if the scan goes the -+ entire way (scan_dist). Incr is an integer reflecting the incremental -+ direction of scan_index. */ -+ if (reiser4_scanning_left(scan)) { -+ scan_max = unit_index; -+ scan_dist = scan_index - unit_index; -+ incr = -1; -+ } else { -+ scan_max = unit_index + unit_width - 1; -+ scan_dist = scan_max - unit_index; -+ incr = +1; -+ } -+ -+ tree = coord.node->zjnode.tree; -+ -+ /* If the extent is allocated we have to check each of its blocks. If the extent -+ is unallocated we can skip to the scan_max. */ -+ if (allocated) { -+ do { -+ neighbor = jlookup(tree, oid, scan_index); -+ if (neighbor == NULL) -+ goto stop_same_parent; -+ -+ if (scan->node != neighbor -+ && !reiser4_scan_goto(scan, neighbor)) { -+ /* @neighbor was jput() by reiser4_scan_goto */ -+ goto stop_same_parent; -+ } -+ -+ ret = scan_set_current(scan, neighbor, 1, &coord); -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ /* reference to @neighbor is stored in @scan, no need -+ to jput(). */ -+ scan_index += incr; -+ -+ } while (incr + scan_max != scan_index); -+ -+ } else { -+ /* Optimized case for unallocated extents, skip to the end. */ -+ neighbor = jlookup(tree, oid, scan_max /*index */ ); -+ if (neighbor == NULL) { -+ /* Race with truncate */ -+ scan->stop = 1; -+ ret = 0; -+ goto exit; -+ } -+ -+ assert("zam-1043", -+ reiser4_blocknr_is_fake(jnode_get_block(neighbor))); -+ -+ ret = scan_set_current(scan, neighbor, scan_dist, &coord); -+ if (ret != 0) { -+ goto exit; -+ } -+ } -+ -+ if (coord_sideof_unit(&coord, scan->direction) == 0 -+ && item_is_extent(&coord)) { -+ /* Continue as long as there are more extent units. */ -+ -+ scan_index = -+ extent_unit_index(&coord) + -+ (reiser4_scanning_left(scan) ? -+ extent_unit_width(&coord) - 1 : 0); -+ goto repeat; -+ } -+ -+ if (0) { -+ stop_same_parent: -+ -+ /* If we are scanning left and we stop in the middle of an allocated -+ extent, we know the preceder immediately.. */ -+ /* middle of extent is (scan_index - unit_index) != 0. */ -+ if (reiser4_scanning_left(scan) && -+ (scan_index - unit_index) != 0) { -+ /* FIXME(B): Someone should step-through and verify that this preceder -+ calculation is indeed correct. */ -+ /* @unit_start is starting block (number) of extent -+ unit. Flush stopped at the @scan_index block from -+ the beginning of the file, which is (scan_index - -+ unit_index) block within extent. -+ */ -+ if (unit_start) { -+ /* skip preceder update when we are at hole */ -+ scan->preceder_blk = -+ unit_start + scan_index - unit_index; -+ check_preceder(scan->preceder_blk); -+ } -+ } -+ -+ /* In this case, we leave coord set to the parent of scan->node. */ -+ scan->stop = 1; -+ -+ } else { -+ /* In this case, we are still scanning, coord is set to the next item which is -+ either off-the-end of the node or not an extent. 
*/ -+ assert("jmacd-8912", scan->stop == 0); -+ assert("jmacd-7812", -+ (coord_is_after_sideof_unit(&coord, scan->direction) -+ || !item_is_extent(&coord))); -+ } -+ -+ ret = 0; -+ exit: -+ return ret; -+} -+ -+/* ask block allocator for some blocks */ -+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder, -+ reiser4_block_nr wanted_count, -+ reiser4_block_nr *first_allocated, -+ reiser4_block_nr *allocated, -+ block_stage_t block_stage) -+{ -+ *allocated = wanted_count; -+ preceder->max_dist = 0; /* scan whole disk, if needed */ -+ -+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */ -+ preceder->block_stage = block_stage; -+ -+ /* FIXME: we do not handle errors here now */ -+ check_me("vs-420", -+ reiser4_alloc_blocks(preceder, first_allocated, allocated, -+ BA_PERMANENT) == 0); -+ /* update flush_pos's preceder to last allocated block number */ -+ preceder->blk = *first_allocated + *allocated - 1; -+} -+ -+/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent -+ will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have -+ to add new nodes into tree. Space for that is taken from inviolable reserve (5%). */ -+static reiser4_block_nr reserve_replace(void) -+{ -+ reiser4_block_nr grabbed, needed; -+ -+ grabbed = get_current_context()->grabbed_blocks; -+ needed = estimate_one_insert_into_item(current_tree); -+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED)); -+ return grabbed; -+} -+ -+static void free_replace_reserved(reiser4_block_nr grabbed) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context(); -+ grabbed2free(ctx, get_super_private(ctx->super), -+ ctx->grabbed_blocks - grabbed); -+} -+ -+/* Block offset of first block addressed by unit */ -+__u64 extent_unit_index(const coord_t * item) -+{ -+ reiser4_key key; -+ -+ assert("vs-648", coord_is_existing_unit(item)); -+ unit_key_by_coord(item, &key); -+ return get_key_offset(&key) >> current_blocksize_bits; -+} -+ -+/* AUDIT shouldn't return value be of reiser4_block_nr type? -+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? 
*/ -+__u64 extent_unit_width(const coord_t * item) -+{ -+ assert("vs-649", coord_is_existing_unit(item)); -+ return width_by_coord(item); -+} -+ -+/* Starting block location of this unit */ -+static reiser4_block_nr extent_unit_start(const coord_t * item) -+{ -+ return extent_get_start(extent_by_coord(item)); -+} -+ -+/** -+ * split_allocated_extent - -+ * @coord: -+ * @pos_in_unit: -+ * -+ * replace allocated extent with two allocated extents -+ */ -+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit) -+{ -+ int result; -+ struct replace_handle *h; -+ reiser4_extent *ext; -+ reiser4_block_nr grabbed; -+ -+ ext = extent_by_coord(coord); -+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT); -+ assert("vs-1411", extent_get_width(ext) > pos_in_unit); -+ -+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); -+ if (h == NULL) -+ return RETERR(-ENOMEM); -+ h->coord = coord; -+ h->lh = znode_lh(coord->node); -+ h->pkey = &h->key; -+ unit_key_by_coord(coord, h->pkey); -+ set_key_offset(h->pkey, -+ (get_key_offset(h->pkey) + -+ pos_in_unit * current_blocksize)); -+ reiser4_set_extent(&h->overwrite, extent_get_start(ext), -+ pos_in_unit); -+ reiser4_set_extent(&h->new_extents[0], -+ extent_get_start(ext) + pos_in_unit, -+ extent_get_width(ext) - pos_in_unit); -+ h->nr_new_extents = 1; -+ h->flags = COPI_DONT_SHIFT_LEFT; -+ h->paste_key = h->key; -+ -+ /* reserve space for extent unit paste, @grabbed is reserved before */ -+ grabbed = reserve_replace(); -+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten -+ extent */); -+ /* restore reserved */ -+ free_replace_reserved(grabbed); -+ kfree(h); -+ return result; -+} -+ -+/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is -+ one). Return 1 if it succeeded, 0 - otherwise */ -+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext, -+ reiser4_extent *replace) -+{ -+ assert("vs-1415", extent_by_coord(coord) == ext); -+ -+ if (coord->unit_pos == 0 -+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT) -+ /* @ext either does not exist or is not allocated extent */ -+ return 0; -+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) != -+ extent_get_start(replace)) -+ return 0; -+ -+ /* we can glue, widen previous unit */ -+ extent_set_width(ext - 1, -+ extent_get_width(ext - 1) + extent_get_width(replace)); -+ -+ if (extent_get_width(ext) != extent_get_width(replace)) { -+ /* make current extent narrower */ -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ extent_get_width(replace)); -+ extent_set_width(ext, -+ extent_get_width(ext) - -+ extent_get_width(replace)); -+ } else { -+ /* current extent completely glued with its left neighbor, remove it */ -+ coord_t from, to; -+ -+ coord_dup(&from, coord); -+ from.unit_pos = nr_units_extent(coord) - 1; -+ coord_dup(&to, &from); -+ -+ /* currently cut from extent can cut either from the beginning or from the end. 
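split_allocated_extent above performs fixed arithmetic on one unit: {start, width} cut at pos becomes {start, pos} followed by {start + pos, width - pos}. A self-contained sketch of just that step:

/* Split an allocated extent at pos_in_unit (cf. assert "vs-1411"). */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

struct extent { uint64_t start, width; };

static void split_extent(const struct extent *ext, uint64_t pos,
                         struct extent *head, struct extent *tail)
{
    assert(pos > 0 && pos < ext->width);
    head->start = ext->start;
    head->width = pos;
    tail->start = ext->start + pos;
    tail->width = ext->width - pos;
}

int main(void)
{
    struct extent e = { 1000, 10 }, head, tail;

    split_extent(&e, 3, &head, &tail);
    printf("head={%llu,%llu} tail={%llu,%llu}\n",
           (unsigned long long)head.start, (unsigned long long)head.width,
           (unsigned long long)tail.start, (unsigned long long)tail.width);
    return 0;
}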
Move place which got -+ freed after unit removal to end of item */ -+ memmove(ext, ext + 1, -+ (from.unit_pos - -+ coord->unit_pos) * sizeof(reiser4_extent)); -+ /* wipe part of item which is going to be cut, so that node_check will not be confused */ -+ cut_node_content(&from, &to, NULL, NULL, NULL); -+ } -+ znode_make_dirty(coord->node); -+ /* move coord back */ -+ coord->unit_pos--; -+ return 1; -+} -+ -+/** -+ * conv_extent - replace extent with 2 ones -+ * @coord: coordinate of extent to be replaced -+ * @replace: extent to overwrite the one @coord is set to -+ * -+ * Overwrites extent @coord is set to and paste one extent unit after -+ * overwritten one if @replace is shorter than initial extent -+ */ -+static int conv_extent(coord_t *coord, reiser4_extent *replace) -+{ -+ int result; -+ struct replace_handle *h; -+ reiser4_extent *ext; -+ reiser4_block_nr start, width, new_width; -+ reiser4_block_nr grabbed; -+ extent_state state; -+ -+ ext = extent_by_coord(coord); -+ state = state_of_extent(ext); -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ new_width = extent_get_width(replace); -+ -+ assert("vs-1458", (state == UNALLOCATED_EXTENT || -+ state == ALLOCATED_EXTENT)); -+ assert("vs-1459", width >= new_width); -+ -+ if (try_to_merge_with_left(coord, ext, replace)) { -+ /* merged @replace with left neighbor. Current unit is either -+ removed or narrowed */ -+ return 0; -+ } -+ -+ if (width == new_width) { -+ /* replace current extent with @replace */ -+ *ext = *replace; -+ znode_make_dirty(coord->node); -+ return 0; -+ } -+ -+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); -+ if (h == NULL) -+ return RETERR(-ENOMEM); -+ h->coord = coord; -+ h->lh = znode_lh(coord->node); -+ h->pkey = &h->key; -+ unit_key_by_coord(coord, h->pkey); -+ set_key_offset(h->pkey, -+ (get_key_offset(h->pkey) + new_width * current_blocksize)); -+ h->overwrite = *replace; -+ -+ /* replace @ext with @replace and padding extent */ -+ reiser4_set_extent(&h->new_extents[0], -+ (state == ALLOCATED_EXTENT) ? -+ (start + new_width) : -+ UNALLOCATED_EXTENT_START, -+ width - new_width); -+ h->nr_new_extents = 1; -+ h->flags = COPI_DONT_SHIFT_LEFT; -+ h->paste_key = h->key; -+ -+ /* reserve space for extent unit paste, @grabbed is reserved before */ -+ grabbed = reserve_replace(); -+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten -+ extent */); -+ -+ /* restore reserved */ -+ free_replace_reserved(grabbed); -+ kfree(h); -+ return result; -+} -+ -+/** -+ * assign_real_blocknrs -+ * @flush_pos: -+ * @oid: objectid of file jnodes to assign block number to belongs to -+ * @index: first jnode on the range -+ * @count: number of jnodes to assign block numbers to -+ * @first: start of allocated block range -+ * -+ * Assigns block numbers to each of @count jnodes. Index of first jnode is -+ * @index. Jnodes get lookuped with jlookup. 
-+ */ -+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid, -+ unsigned long index, reiser4_block_nr count, -+ reiser4_block_nr first) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ txn_atom *atom; -+ int nr; -+ -+ atom = atom_locked_by_fq(flush_pos->fq); -+ assert("vs-1468", atom); -+ BUG_ON(atom == NULL); -+ -+ nr = 0; -+ tree = current_tree; -+ for (i = 0; i < count; ++i, ++index) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index); -+ assert("", node != NULL); -+ BUG_ON(node == NULL); -+ -+ spin_lock_jnode(node); -+ assert("", !jnode_is_flushprepped(node)); -+ assert("vs-1475", node->atom == atom); -+ assert("vs-1476", atomic_read(&node->x_count) > 0); -+ -+ JF_CLR(node, JNODE_FLUSH_RESERVED); -+ jnode_set_block(node, &first); -+ unformatted_make_reloc(node, flush_pos->fq); -+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), -+ FQ_LIST, 0)); -+ spin_unlock_jnode(node); -+ first++; -+ -+ atomic_dec(&node->x_count); -+ nr ++; -+ } -+ -+ spin_unlock_atom(atom); -+ return; -+} -+ -+/** -+ * make_node_ovrwr - assign node to overwrite set -+ * @jnodes: overwrite set list head -+ * @node: jnode to belong to overwrite set -+ * -+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes -+ * which is an accumulator for nodes before they get to overwrite set list of -+ * atom. -+ */ -+static void make_node_ovrwr(struct list_head *jnodes, jnode *node) -+{ -+ spin_lock_jnode(node); -+ -+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); -+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); -+ -+ JF_SET(node, JNODE_OVRWR); -+ list_move_tail(&node->capture_link, jnodes); -+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0)); -+ -+ spin_unlock_jnode(node); -+} -+ -+/** -+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set -+ * @flush_pos: flush position -+ * @oid: objectid of file jnodes belong to -+ * @index: starting index -+ * @width: extent width -+ * -+ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's -+ * overwrite set. Starting from the one with index @index. If end of slum is -+ * detected (node is not found or flushprepped) - stop iterating and set flush -+ * position's state to POS_INVALID. 
-+ */ -+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid, -+ unsigned long index, reiser4_block_nr width) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ jnode *node; -+ txn_atom *atom; -+ LIST_HEAD(jnodes); -+ -+ tree = current_tree; -+ -+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); -+ assert("vs-1478", atom); -+ -+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) { -+ node = jlookup(tree, oid, index); -+ if (!node) { -+ flush_pos->state = POS_INVALID; -+ break; -+ } -+ if (jnode_check_flushprepped(node)) { -+ flush_pos->state = POS_INVALID; -+ atomic_dec(&node->x_count); -+ break; -+ } -+ if (node->atom != atom) { -+ flush_pos->state = POS_INVALID; -+ atomic_dec(&node->x_count); -+ break; -+ } -+ make_node_ovrwr(&jnodes, node); -+ atomic_dec(&node->x_count); -+ } -+ -+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev); -+ spin_unlock_atom(atom); -+} -+ -+/** -+ * allocated_extent_slum_size -+ * @flush_pos: -+ * @oid: -+ * @index: -+ * @count: -+ * -+ * -+ */ -+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid, -+ unsigned long index, unsigned long count) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ txn_atom *atom; -+ int nr; -+ -+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); -+ assert("vs-1468", atom); -+ -+ nr = 0; -+ tree = current_tree; -+ for (i = 0; i < count; ++i, ++index) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index); -+ if (!node) -+ break; -+ -+ if (jnode_check_flushprepped(node)) { -+ atomic_dec(&node->x_count); -+ break; -+ } -+ -+ if (node->atom != atom) { -+ /* -+ * this is possible on overwrite: extent_write may -+ * capture several unformatted nodes without capturing -+ * any formatted nodes. -+ */ -+ atomic_dec(&node->x_count); -+ break; -+ } -+ -+ assert("vs-1476", atomic_read(&node->x_count) > 1); -+ atomic_dec(&node->x_count); -+ nr ++; -+ } -+ -+ spin_unlock_atom(atom); -+ return nr; -+} -+ -+/** -+ * alloc_extent -+ * @flush_pos: -+ * -+ * -+ * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord -+ * is set to. It is to prepare for flushing sequence of not flushprepped nodes -+ * (slum). It supposes that slum starts at flush_pos->pos_in_unit position -+ * within the extent. 
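allocated_extent_slum_size above walks forward until a node is missing, already flush-prepped, or captured by a different atom; the count of nodes before the first mismatch is the slum. A user-space model with hypothetical node records:

/* Count the leading run of present, un-prepped, same-atom nodes. */
#include <stdio.h>

struct node {
    int present;      /* jlookup() found it */
    int flushprepped; /* already relocated/overwritten */
    int atom;         /* capturing atom id */
};

static int slum_size(const struct node *nodes, int count, int atom)
{
    int nr = 0, i;

    for (i = 0; i < count; i++) {
        if (!nodes[i].present || nodes[i].flushprepped ||
            nodes[i].atom != atom)
            break;
        nr++;
    }
    return nr;
}

int main(void)
{
    struct node n[] = {
        { 1, 0, 7 }, { 1, 0, 7 }, { 1, 1, 7 }, { 1, 0, 7 },
    };

    printf("slum=%d\n", slum_size(n, 4, 7)); /* 2 */
    return 0;
}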
Slum gets to relocate set if flush_pos->leaf_relocate is -+ * set to 1 and to overwrite set otherwise -+ */ -+int reiser4_alloc_extent(flush_pos_t *flush_pos) -+{ -+ coord_t *coord; -+ reiser4_extent *ext; -+ reiser4_extent replace_ext; -+ oid_t oid; -+ reiser4_block_nr protected; -+ reiser4_block_nr start; -+ __u64 index; -+ __u64 width; -+ extent_state state; -+ int result; -+ reiser4_block_nr first_allocated; -+ __u64 allocated; -+ reiser4_key key; -+ block_stage_t block_stage; -+ -+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT); -+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord) -+ && item_is_extent(&flush_pos->coord)); -+ -+ coord = &flush_pos->coord; -+ -+ ext = extent_by_coord(coord); -+ state = state_of_extent(ext); -+ if (state == HOLE_EXTENT) { -+ flush_pos->state = POS_INVALID; -+ return 0; -+ } -+ -+ item_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ index = extent_unit_index(coord) + flush_pos->pos_in_unit; -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ -+ assert("vs-1457", width > flush_pos->pos_in_unit); -+ -+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) { -+ /* relocate */ -+ if (flush_pos->pos_in_unit) { -+ /* split extent unit into two */ -+ result = -+ split_allocated_extent(coord, -+ flush_pos->pos_in_unit); -+ flush_pos->pos_in_unit = 0; -+ return result; -+ } -+ -+ /* limit number of nodes to allocate */ -+ if (flush_pos->nr_to_write < width) -+ width = flush_pos->nr_to_write; -+ -+ if (state == ALLOCATED_EXTENT) { -+ /* -+ * all protected nodes are not flushprepped, therefore -+ * they are counted as flush_reserved -+ */ -+ block_stage = BLOCK_FLUSH_RESERVED; -+ protected = allocated_extent_slum_size(flush_pos, oid, -+ index, width); -+ if (protected == 0) { -+ flush_pos->state = POS_INVALID; -+ flush_pos->pos_in_unit = 0; -+ return 0; -+ } -+ } else { -+ block_stage = BLOCK_UNALLOCATED; -+ protected = width; -+ } -+ -+ /* -+ * look at previous unit if possible. If it is allocated, make -+ * preceder more precise -+ */ -+ if (coord->unit_pos && -+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) -+ reiser4_pos_hint(flush_pos)->blk = -+ extent_get_start(ext - 1) + -+ extent_get_width(ext - 1); -+ -+ /* allocate new block numbers for protected nodes */ -+ extent_allocate_blocks(reiser4_pos_hint(flush_pos), -+ protected, -+ &first_allocated, &allocated, -+ block_stage); -+ -+ if (state == ALLOCATED_EXTENT) -+ /* -+ * on relocating - free nodes which are going to be -+ * relocated -+ */ -+ reiser4_dealloc_blocks(&start, &allocated, -+ BLOCK_ALLOCATED, BA_DEFER); -+ -+ /* assign new block numbers to protected nodes */ -+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated); -+ -+ /* prepare extent which will replace current one */ -+ reiser4_set_extent(&replace_ext, first_allocated, allocated); -+ -+ /* adjust extent item */ -+ result = conv_extent(coord, &replace_ext); -+ if (result != 0 && result != -ENOMEM) { -+ warning("vs-1461", -+ "Failed to allocate extent. 
Should not happen\n"); -+ return result; -+ } -+ -+ /* -+ * break flush: we prepared for flushing as many blocks as we -+ * were asked for -+ */ -+ if (flush_pos->nr_to_write == allocated) -+ flush_pos->state = POS_INVALID; -+ } else { -+ /* overwrite */ -+ mark_jnodes_overwrite(flush_pos, oid, index, width); -+ } -+ flush_pos->pos_in_unit = 0; -+ return 0; -+} -+ -+/* if @key is glueable to the item @coord is set to */ -+static int must_insert(const coord_t *coord, const reiser4_key *key) -+{ -+ reiser4_key last; -+ -+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID -+ && keyeq(append_key_extent(coord, &last), key)) -+ return 0; -+ return 1; -+} -+ -+/* copy extent @copy to the end of @node. It may have to either insert new item after the last one, or append last item, -+ or modify last unit of last item to have greater width */ -+static int put_unit_to_end(znode *node, const reiser4_key *key, -+ reiser4_extent *copy_ext) -+{ -+ int result; -+ coord_t coord; -+ cop_insert_flag flags; -+ reiser4_extent *last_ext; -+ reiser4_item_data data; -+ -+ /* set coord after last unit in an item */ -+ coord_init_last_unit(&coord, node); -+ coord.between = AFTER_UNIT; -+ -+ flags = -+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE; -+ if (must_insert(&coord, key)) { -+ result = -+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1), -+ key, NULL /*lh */ , flags); -+ -+ } else { -+ /* try to glue with last unit */ -+ last_ext = extent_by_coord(&coord); -+ if (state_of_extent(last_ext) && -+ extent_get_start(last_ext) + extent_get_width(last_ext) == -+ extent_get_start(copy_ext)) { -+ /* widen last unit of node */ -+ extent_set_width(last_ext, -+ extent_get_width(last_ext) + -+ extent_get_width(copy_ext)); -+ znode_make_dirty(node); -+ return 0; -+ } -+ -+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */ -+ result = -+ insert_into_item(&coord, NULL /*lh */ , key, -+ init_new_extent(&data, copy_ext, 1), -+ flags); -+ } -+ -+ assert("vs-438", result == 0 || result == -E_NODE_FULL); -+ return result; -+} -+ -+/* @coord is set to extent unit */ -+squeeze_result squalloc_extent(znode *left, const coord_t *coord, -+ flush_pos_t *flush_pos, -+ reiser4_key *stop_key) -+{ -+ reiser4_extent *ext; -+ __u64 index; -+ __u64 width; -+ reiser4_block_nr start; -+ extent_state state; -+ oid_t oid; -+ reiser4_block_nr first_allocated; -+ __u64 allocated; -+ __u64 protected; -+ reiser4_extent copy_extent; -+ reiser4_key key; -+ int result; -+ block_stage_t block_stage; -+ -+ assert("vs-1457", flush_pos->pos_in_unit == 0); -+ assert("vs-1467", coord_is_leftmost_unit(coord)); -+ assert("vs-1467", item_is_extent(coord)); -+ -+ ext = extent_by_coord(coord); -+ index = extent_unit_index(coord); -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ state = state_of_extent(ext); -+ unit_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ -+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) || -+ (state == UNALLOCATED_EXTENT)) { -+ /* relocate */ -+ if (state == ALLOCATED_EXTENT) { -+ /* all protected nodes are not flushprepped, therefore -+ * they are counted as flush_reserved */ -+ block_stage = BLOCK_FLUSH_RESERVED; -+ protected = allocated_extent_slum_size(flush_pos, oid, -+ index, width); -+ if (protected == 0) { -+ flush_pos->state = POS_INVALID; -+ flush_pos->pos_in_unit = 0; -+ return 0; -+ } -+ } else { -+ block_stage = BLOCK_UNALLOCATED; -+ protected = width; -+ } -+ -+ /* -+ * look at previous unit if possible. 
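The branch structure of reiser4_alloc_extent above is a small decision table: holes invalidate the flush position, an unallocated extent or an explicit leaf-relocate request goes to the relocate path, and everything else joins the overwrite set. A sketch with illustrative enums in place of the kernel's extent_state and flush position:

/* Relocate/overwrite decision, as in reiser4_alloc_extent. */
#include <stdio.h>

enum state { HOLE, UNALLOCATED, ALLOCATED };
enum action { INVALIDATE, RELOCATE, OVERWRITE };

static enum action choose(enum state state, int leaf_relocate)
{
    if (state == HOLE)
        return INVALIDATE;
    if (leaf_relocate || state == UNALLOCATED)
        return RELOCATE;
    return OVERWRITE;
}

int main(void)
{
    printf("%d %d %d\n",
           choose(ALLOCATED, 0),    /* OVERWRITE */
           choose(ALLOCATED, 1),    /* RELOCATE */
           choose(UNALLOCATED, 0)); /* RELOCATE */
    return 0;
}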
If it is allocated, make -+ * preceder more precise -+ */ -+ if (coord->unit_pos && -+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) -+ reiser4_pos_hint(flush_pos)->blk = -+ extent_get_start(ext - 1) + -+ extent_get_width(ext - 1); -+ -+ /* allocate new block numbers for protected nodes */ -+ extent_allocate_blocks(reiser4_pos_hint(flush_pos), -+ protected, -+ &first_allocated, &allocated, -+ block_stage); -+ -+ /* prepare extent which will be copied to left */ -+ reiser4_set_extent(©_extent, first_allocated, allocated); -+ -+ result = put_unit_to_end(left, &key, ©_extent); -+ if (result == -E_NODE_FULL) { -+ int target_block_stage; -+ -+ /* free blocks which were just allocated */ -+ target_block_stage = -+ (state == -+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED : -+ BLOCK_UNALLOCATED; -+ reiser4_dealloc_blocks(&first_allocated, &allocated, -+ target_block_stage, -+ BA_PERMANENT); -+ -+ /* rewind the preceder. */ -+ flush_pos->preceder.blk = first_allocated; -+ check_preceder(flush_pos->preceder.blk); -+ -+ return SQUEEZE_TARGET_FULL; -+ } -+ -+ if (state == ALLOCATED_EXTENT) { -+ /* free nodes which were relocated */ -+ reiser4_dealloc_blocks(&start, &allocated, -+ BLOCK_ALLOCATED, BA_DEFER); -+ } -+ -+ /* assign new block numbers to protected nodes */ -+ assign_real_blocknrs(flush_pos, oid, index, allocated, -+ first_allocated); -+ -+ set_key_offset(&key, -+ get_key_offset(&key) + -+ (allocated << current_blocksize_bits)); -+ } else { -+ /* -+ * overwrite: try to copy unit as it is to left neighbor and -+ * make all first not flushprepped nodes overwrite nodes -+ */ -+ reiser4_set_extent(©_extent, start, width); -+ result = put_unit_to_end(left, &key, ©_extent); -+ if (result == -E_NODE_FULL) -+ return SQUEEZE_TARGET_FULL; -+ -+ if (state != HOLE_EXTENT) -+ mark_jnodes_overwrite(flush_pos, oid, index, width); -+ set_key_offset(&key, -+ get_key_offset(&key) + -+ (width << current_blocksize_bits)); -+ } -+ *stop_key = key; -+ return SQUEEZE_CONTINUE; -+} -+ -+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key) -+{ -+ return key_by_inode_and_offset_common(inode, off, key); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/extent.h linux-2.6.30/fs/reiser4/plugin/item/extent.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/extent.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,231 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#ifndef __REISER4_EXTENT_H__ -+#define __REISER4_EXTENT_H__ -+ -+/* on disk extent */ -+typedef struct { -+ reiser4_dblock_nr start; -+ reiser4_dblock_nr width; -+} reiser4_extent; -+ -+struct extent_stat { -+ int unallocated_units; -+ int unallocated_blocks; -+ int allocated_units; -+ int allocated_blocks; -+ int hole_units; -+ int hole_blocks; -+}; -+ -+/* extents in an extent item can be either holes, or unallocated or allocated -+ extents */ -+typedef enum { -+ HOLE_EXTENT, -+ UNALLOCATED_EXTENT, -+ ALLOCATED_EXTENT -+} extent_state; -+ -+#define HOLE_EXTENT_START 0 -+#define UNALLOCATED_EXTENT_START 1 -+#define UNALLOCATED_EXTENT_START2 2 -+ -+struct extent_coord_extension { -+ reiser4_block_nr pos_in_unit; -+ reiser4_block_nr width; /* width of current unit */ -+ pos_in_node_t nr_units; /* number of units */ -+ int 
ext_offset; /* offset from the beginning of zdata() */ -+ unsigned long expected_page; -+#if REISER4_DEBUG -+ reiser4_extent extent; -+#endif -+}; -+ -+/* macros to set/get fields of on-disk extent */ -+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext) -+{ -+ return le64_to_cpu(ext->start); -+} -+ -+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext) -+{ -+ return le64_to_cpu(ext->width); -+} -+ -+extern __u64 reiser4_current_block_count(void); -+ -+static inline void -+extent_set_start(reiser4_extent * ext, reiser4_block_nr start) -+{ -+ cassert(sizeof(ext->start) == 8); -+ assert("nikita-2510", -+ ergo(start > 1, start < reiser4_current_block_count())); -+ put_unaligned(cpu_to_le64(start), &ext->start); -+} -+ -+static inline void -+extent_set_width(reiser4_extent * ext, reiser4_block_nr width) -+{ -+ cassert(sizeof(ext->width) == 8); -+ assert("", width > 0); -+ put_unaligned(cpu_to_le64(width), &ext->width); -+ assert("nikita-2511", -+ ergo(extent_get_start(ext) > 1, -+ extent_get_start(ext) + width <= -+ reiser4_current_block_count())); -+} -+ -+#define extent_item(coord) \ -+({ \ -+ assert("nikita-3143", item_is_extent(coord)); \ -+ ((reiser4_extent *)item_body_by_coord (coord)); \ -+}) -+ -+#define extent_by_coord(coord) \ -+({ \ -+ assert("nikita-3144", item_is_extent(coord)); \ -+ (extent_item (coord) + (coord)->unit_pos); \ -+}) -+ -+#define width_by_coord(coord) \ -+({ \ -+ assert("nikita-3145", item_is_extent(coord)); \ -+ extent_get_width (extent_by_coord(coord)); \ -+}) -+ -+struct carry_cut_data; -+struct carry_kill_data; -+ -+/* plugin->u.item.b.* */ -+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *); -+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_extent(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_extent(const coord_t *); -+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *); -+void init_coord_extent(coord_t *); -+int init_extent(coord_t *, reiser4_item_data *); -+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *); -+int can_shift_extent(unsigned free_space, -+ coord_t * source, znode * target, shift_direction, -+ unsigned *size, unsigned want); -+void copy_units_extent(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *); -+int create_hook_extent(const coord_t * coord, void *arg); -+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *); -+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *); -+void print_extent(const char *, coord_t *); -+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child); -+int utmost_child_real_block_extent(const coord_t * coord, sideof side, -+ reiser4_block_nr * block); -+void item_stat_extent(const coord_t * coord, void *vp); -+int reiser4_check_extent(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.file.* */ -+ssize_t reiser4_write_extent(struct file *, struct inode * 
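The accessors above treat the on-disk extent as two little-endian 64-bit fields. A portable user-space sketch of the same encoding (the kernel uses le64_to_cpu and put_unaligned instead of open-coded byte loops):

/* Encode/decode the 16-byte on-disk extent: start | width, both LE64. */
#include <stdio.h>
#include <stdint.h>

struct disk_extent { unsigned char raw[16]; };

static uint64_t get_le64(const unsigned char *p)
{
    uint64_t v = 0;
    int i;

    for (i = 7; i >= 0; i--)
        v = (v << 8) | p[i];
    return v;
}

static void put_le64(unsigned char *p, uint64_t v)
{
    int i;

    for (i = 0; i < 8; i++, v >>= 8)
        p[i] = v & 0xff;
}

int main(void)
{
    struct disk_extent ext;

    put_le64(ext.raw, 1000);   /* extent_set_start */
    put_le64(ext.raw + 8, 16); /* extent_set_width */
    printf("start=%llu width=%llu\n",
           (unsigned long long)get_le64(ext.raw),
           (unsigned long long)get_le64(ext.raw + 8));
    return 0;
}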
inode, -+ const char __user *, size_t, loff_t *); -+int reiser4_read_extent(struct file *, flow_t *, hint_t *); -+int reiser4_readpage_extent(void *, struct page *); -+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*); -+reiser4_key *append_key_extent(const coord_t *, reiser4_key *); -+void init_coord_extension_extent(uf_coord_t *, loff_t offset); -+int get_block_address_extent(const coord_t *, sector_t block, -+ sector_t * result); -+ -+/* these are used in flush.c -+ FIXME-VS: should they be somewhere in item_plugin? */ -+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos); -+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos, -+ reiser4_key * stop_key); -+ -+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */ -+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */ -+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */ -+ -+/* plugin->u.item.f. */ -+int reiser4_scan_extent(flush_scan * scan); -+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *); -+ -+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, -+ int nr_extents); -+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr); -+extent_state state_of_extent(reiser4_extent * ext); -+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start, -+ reiser4_block_nr width); -+int reiser4_update_extent(struct inode *, jnode *, loff_t pos, -+ int *plugged_hole); -+ -+#include "../../coord.h" -+#include "../../lock.h" -+#include "../../tap.h" -+ -+struct replace_handle { -+ /* these are to be set before calling reiser4_replace_extent */ -+ coord_t *coord; -+ lock_handle *lh; -+ reiser4_key key; -+ reiser4_key *pkey; -+ reiser4_extent overwrite; -+ reiser4_extent new_extents[2]; -+ int nr_new_extents; -+ unsigned flags; -+ -+ /* these are used by reiser4_replace_extent */ -+ reiser4_item_data item; -+ coord_t coord_after; -+ lock_handle lh_after; -+ tap_t watch; -+ reiser4_key paste_key; -+#if REISER4_DEBUG -+ reiser4_extent orig_ext; -+ reiser4_key tmp; -+#endif -+}; -+ -+/* this structure is kmalloced before calling make_extent to avoid excessive -+ stack consumption on plug_hole->reiser4_replace_extent */ -+struct make_extent_handle { -+ uf_coord_t *uf_coord; -+ reiser4_block_nr blocknr; -+ int created; -+ struct inode *inode; -+ union { -+ struct { -+ } append; -+ struct replace_handle replace; -+ } u; -+}; -+ -+int reiser4_replace_extent(struct replace_handle *, -+ int return_inserted_position); -+lock_handle *znode_lh(znode *); -+ -+/* the reiser4 repacker support */ -+struct repacker_cursor; -+extern int process_extent_backward_for_repacking(tap_t *, -+ struct repacker_cursor *); -+extern int mark_extent_for_repacking(tap_t *, int); -+ -+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord)) -+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent)) -+ -+/* __REISER4_EXTENT_H__ */ -+#endif -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.30/fs/reiser4/plugin/item/extent_item_ops.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/extent_item_ops.c 2009-06-22 
17:27:31.000000000 +0200 -@@ -0,0 +1,889 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../tree_walk.h" /* check_sibling_list() */ -+#include "../../page_cache.h" -+#include "../../carry.h" -+ -+#include <linux/quotaops.h> -+ -+/* item_plugin->b.max_key_inside */ -+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(reiser4_max_key())); -+ return key; -+} -+ -+/* item_plugin->b.can_contain_key -+ this checks whether @key of @data is matching to position set by @coord */ -+int -+can_contain_key_extent(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key) || -+ get_key_ordering(key) != get_key_ordering(&item_key)) -+ return 0; -+ -+ return 1; -+} -+ -+/* item_plugin->b.mergeable -+ first item is of extent type */ -+/* Audited by: green(2002.06.13) */ -+int mergeable_extent(const coord_t * p1, const coord_t * p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID); -+ /* FIXME-VS: Which is it? Assert or return 0 */ -+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) { -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+ get_key_objectid(&key1) != get_key_objectid(&key2) || -+ get_key_ordering(&key1) != get_key_ordering(&key2) || -+ get_key_type(&key1) != get_key_type(&key2)) -+ return 0; -+ if (get_key_offset(&key1) + -+ reiser4_extent_size(p1, nr_units_extent(p1)) != -+ get_key_offset(&key2)) -+ return 0; -+ return 1; -+} -+ -+/* item_plugin->b.nr_units */ -+pos_in_node_t nr_units_extent(const coord_t * coord) -+{ -+ /* length of extent item has to be multiple of extent size */ -+ assert("vs-1424", -+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0); -+ return item_length_by_coord(coord) / sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.lookup */ -+lookup_result -+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG, -+ coord_t * coord) -+{ /* znode and item_pos are -+ set to an extent item to -+ look through */ -+ reiser4_key item_key; -+ reiser4_block_nr lookuped, offset; -+ unsigned i, nr_units; -+ reiser4_extent *ext; -+ unsigned blocksize; -+ unsigned char blocksize_bits; -+ -+ item_key_by_coord(coord, &item_key); -+ offset = get_key_offset(&item_key); -+ -+ /* key we are looking for must be greater than key of item @coord */ -+ assert("vs-414", keygt(key, &item_key)); -+ -+ assert("umka-99945", -+ !keygt(key, max_key_inside_extent(coord, &item_key))); -+ -+ ext = extent_item(coord); -+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset)); -+ -+ blocksize = current_blocksize; -+ blocksize_bits = current_blocksize_bits; -+ -+ /* offset we are looking for */ -+ lookuped = get_key_offset(key); -+ -+ nr_units = nr_units_extent(coord); -+ /* go through all extents until the one which address given offset */ -+ for (i = 0; i < nr_units; i++, ext++) { -+ offset += (extent_get_width(ext) << blocksize_bits); -+ if (offset > lookuped) { -+ /* desired byte is somewhere in this extent */ -+ coord->unit_pos = i; -+ 
coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ } -+ } -+ -+ /* set coord after last unit */ -+ coord->unit_pos = nr_units - 1; -+ coord->between = AFTER_UNIT; -+ return CBK_COORD_FOUND; -+} -+ -+/* item_plugin->b.paste -+ item @coord is set to has been appended with @data->length of free -+ space. data->data contains data to be pasted into the item in position -+ @coord->in_item.unit_pos. It must fit into that free space. -+ @coord must be set between units. -+*/ -+int -+paste_extent(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ unsigned old_nr_units; -+ reiser4_extent *ext; -+ int item_length; -+ -+ ext = extent_item(coord); -+ item_length = item_length_by_coord(coord); -+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent); -+ -+ /* this is also used to copy extent into newly created item, so -+ old_nr_units could be 0 */ -+ assert("vs-260", item_length >= data->length); -+ -+ /* make sure that coord is set properly */ -+ assert("vs-35", -+ ((!coord_is_existing_unit(coord)) -+ || (!old_nr_units && !coord->unit_pos))); -+ -+ /* first unit to be moved */ -+ switch (coord->between) { -+ case AFTER_UNIT: -+ coord->unit_pos++; -+ case BEFORE_UNIT: -+ coord->between = AT_UNIT; -+ break; -+ case AT_UNIT: -+ assert("vs-331", !old_nr_units && !coord->unit_pos); -+ break; -+ default: -+ impossible("vs-330", "coord is set improperly"); -+ } -+ -+ /* prepare space for new units */ -+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent), -+ ext + coord->unit_pos, -+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent)); -+ -+ /* copy new data from kernel space */ -+ assert("vs-556", data->user == 0); -+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length); -+ -+ /* after paste @coord is set to first of pasted units */ -+ assert("vs-332", coord_is_existing_unit(coord)); -+ assert("vs-333", -+ !memcmp(data->data, extent_by_coord(coord), -+ (unsigned)data->length)); -+ return 0; -+} -+ -+/* item_plugin->b.can_shift */ -+int -+can_shift_extent(unsigned free_space, coord_t * source, -+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG, -+ unsigned *size, unsigned want) -+{ -+ *size = item_length_by_coord(source); -+ if (*size > free_space) -+ /* never split a unit of extent item */ -+ *size = free_space - free_space % sizeof(reiser4_extent); -+ -+ /* we can shift *size bytes, calculate how many do we want to shift */ -+ if (*size > want * sizeof(reiser4_extent)) -+ *size = want * sizeof(reiser4_extent); -+ -+ if (*size % sizeof(reiser4_extent) != 0) -+ impossible("vs-119", "Wrong extent size: %i %zd", *size, -+ sizeof(reiser4_extent)); -+ return *size / sizeof(reiser4_extent); -+ -+} -+ -+/* item_plugin->b.copy_units */ -+void -+copy_units_extent(coord_t * target, coord_t * source, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, unsigned free_space) -+{ -+ char *from_ext, *to_ext; -+ -+ assert("vs-217", free_space == count * sizeof(reiser4_extent)); -+ -+ from_ext = item_body_by_coord(source); -+ to_ext = item_body_by_coord(target); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ assert("vs-215", from == 0); -+ -+ /* At this moment, item length was already updated in the item -+ header by shifting code, hence nr_units_extent() will -+ return "new" number of units---one we obtain after copying -+ units. 
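lookup_extent above walks the units accumulating byte offsets until the running total passes the target; that unit contains the byte, otherwise the coord lands after the last unit. The same walk in a self-contained sketch, assuming 4096-byte blocks and illustrative types:

/* Find the unit containing byte `lookuped`, as in lookup_extent. */
#include <stdio.h>
#include <stdint.h>

struct extent { uint64_t start, width; };

#define BLK_BITS 12 /* assumed 4096-byte blocks */

/* returns unit index; *after = 1 means AFTER_UNIT (past last unit) */
static unsigned lookup_unit(const struct extent *ext, unsigned nr_units,
                            uint64_t item_off, uint64_t lookuped, int *after)
{
    uint64_t offset = item_off;
    unsigned i;

    for (i = 0; i < nr_units; i++) {
        offset += ext[i].width << BLK_BITS;
        if (offset > lookuped) {
            *after = 0; /* AT_UNIT */
            return i;
        }
    }
    *after = 1;
    return nr_units - 1;
}

int main(void)
{
    struct extent units[] = { { 0, 2 }, { 100, 3 } }; /* 2 + 3 blocks */
    int after;
    unsigned u = lookup_unit(units, 2, 0, 3 * 4096, &after);

    printf("unit=%u after=%d\n", u, after); /* unit=1 after=0 */
    return 0;
}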
-+ */ -+ to_ext += -+ (nr_units_extent(target) - count) * sizeof(reiser4_extent); -+ } else { -+ reiser4_key key; -+ coord_t coord; -+ -+ assert("vs-216", -+ from + count == coord_last_unit_pos(source) + 1); -+ -+ from_ext += item_length_by_coord(source) - free_space; -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ coord = *source; -+ coord.unit_pos = from; -+ unit_key_extent(&coord, &key); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+ -+ memcpy(to_ext, from_ext, free_space); -+} -+ -+/* item_plugin->b.create_hook -+ @arg is znode of leaf node for which we need to update right delimiting key */ -+int create_hook_extent(const coord_t * coord, void *arg) -+{ -+ coord_t *child_coord; -+ znode *node; -+ reiser4_key key; -+ reiser4_tree *tree; -+ -+ if (!arg) -+ return 0; -+ -+ child_coord = arg; -+ tree = znode_get_tree(coord->node); -+ -+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL); -+ -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ /* find a node on the left level for which right delimiting key has to -+ be updated */ -+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) { -+ assert("vs-411", znode_is_left_connected(child_coord->node)); -+ node = child_coord->node->left; -+ } else { -+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT); -+ node = child_coord->node; -+ assert("nikita-3314", node != NULL); -+ } -+ -+ if (node != NULL) { -+ znode_set_rd_key(node, item_key_by_coord(coord, &key)); -+ -+ assert("nikita-3282", check_sibling_list(node)); -+ /* break sibling links */ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) { -+ ON_DEBUG(node->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ node->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ node->right->left = NULL; -+ node->right = NULL; -+ } -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ return 0; -+} -+ -+#define ITEM_TAIL_KILLED 0 -+#define ITEM_HEAD_KILLED 1 -+#define ITEM_KILLED 2 -+ -+/* item_plugin->b.kill_hook -+ this is called when @count units starting from @from-th one are going to be removed -+ */ -+int -+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *kdata) -+{ -+ reiser4_extent *ext; -+ reiser4_block_nr start, length; -+ const reiser4_key *pfrom_key, *pto_key; -+ struct inode *inode; -+ reiser4_tree *tree; -+ pgoff_t from_off, to_off, offset, skip; -+ int retval; -+ -+ /* these are located in memory kmalloc-ed by kill_node_content */ -+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key; -+ coord_t *dup, *next; -+ -+ assert("zam-811", znode_is_write_locked(coord->node)); -+ assert("nikita-3315", kdata != NULL); -+ assert("vs-34", kdata->buf != NULL); -+ -+ /* map structures to kdata->buf */ -+ min_item_key = (reiser4_key *) (kdata->buf); -+ max_item_key = min_item_key + 1; -+ from_key = max_item_key + 1; -+ to_key = from_key + 1; -+ key = to_key + 1; -+ dup = (coord_t *) (key + 1); -+ next = dup + 1; -+ -+ item_key_by_coord(coord, min_item_key); -+ max_item_key_by_coord(coord, max_item_key); -+ -+ if (kdata->params.from_key) { -+ pfrom_key = kdata->params.from_key; -+ pto_key = kdata->params.to_key; -+ } else { -+ assert("vs-1549", from == coord->unit_pos); -+ unit_key_by_coord(coord, from_key); -+ pfrom_key = from_key; -+ -+ coord_dup(dup, coord); -+ dup->unit_pos = from + count - 1; -+ max_unit_key_by_coord(dup, to_key); -+ 
pto_key = to_key; -+ } -+ -+ if (!keylt(pto_key, max_item_key)) { -+ if (!keygt(pfrom_key, min_item_key)) { -+ znode *left, *right; -+ -+ /* item is to be removed completely */ -+ assert("nikita-3316", kdata->left != NULL -+ && kdata->right != NULL); -+ -+ left = kdata->left->node; -+ right = kdata->right->node; -+ -+ tree = current_tree; -+ /* we have to do two things: -+ * -+ * 1. link left and right formatted neighbors of -+ * extent being removed, and -+ * -+ * 2. update their delimiting keys. -+ * -+ * atomicity of these operations is protected by -+ * taking dk-lock and tree-lock. -+ */ -+ /* if neighbors of item being removed are znodes - -+ * link them */ -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ link_left_and_right(left, right); -+ if (left) { -+ /* update right delimiting key of left -+ * neighbor of extent item */ -+ /*coord_t next; -+ reiser4_key key; */ -+ -+ coord_dup(next, coord); -+ -+ if (coord_next_item(next)) -+ *key = *znode_get_rd_key(coord->node); -+ else -+ item_key_by_coord(next, key); -+ znode_set_rd_key(left, key); -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ -+ from_off = -+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT; -+ to_off = -+ (get_key_offset(max_item_key) + -+ 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_KILLED; -+ } else { -+ /* tail of item is to be removed */ -+ from_off = -+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT; -+ to_off = -+ (get_key_offset(max_item_key) + -+ 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_TAIL_KILLED; -+ } -+ } else { -+ /* head of item is to be removed */ -+ assert("vs-1571", keyeq(pfrom_key, min_item_key)); -+ assert("vs-1572", -+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == -+ 0); -+ assert("vs-1573", -+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - -+ 1)) == 0); -+ -+ if (kdata->left->node) { -+ /* update right delimiting key of left neighbor of extent item */ -+ /*reiser4_key key; */ -+ -+ *key = *pto_key; -+ set_key_offset(key, get_key_offset(pto_key) + 1); -+ -+ write_lock_dk(current_tree); -+ znode_set_rd_key(kdata->left->node, key); -+ write_unlock_dk(current_tree); -+ } -+ -+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT; -+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_HEAD_KILLED; -+ } -+ -+ inode = kdata->inode; -+ assert("vs-1545", inode != NULL); -+ if (inode != NULL) -+ /* take care of pages and jnodes corresponding to part of item being killed */ -+ reiser4_invalidate_pages(inode->i_mapping, from_off, -+ to_off - from_off, -+ kdata->params.truncate); -+ -+ ext = extent_item(coord) + from; -+ offset = -+ (get_key_offset(min_item_key) + -+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT; -+ -+ assert("vs-1551", from_off >= offset); -+ assert("vs-1552", from_off - offset <= extent_get_width(ext)); -+ skip = from_off - offset; -+ offset = from_off; -+ -+ while (offset < to_off) { -+ length = extent_get_width(ext) - skip; -+ if (state_of_extent(ext) == HOLE_EXTENT) { -+ skip = 0; -+ offset += length; -+ ext++; -+ continue; -+ } -+ -+ if (offset + length > to_off) { -+ length = to_off - offset; -+ } -+ -+ vfs_dq_free_block_nodirty(inode, length); -+ -+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { -+ /* some jnodes corresponding to this unallocated extent */ -+ fake_allocated2free(length, 0 /* unformatted */ ); -+ -+ skip = 0; -+ offset += length; -+ ext++; -+ continue; -+ } -+ -+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT); -+ -+ if (length != 0) { -+ start = extent_get_start(ext) 
+ skip; -+ -+ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed -+ immediately */ -+ reiser4_dealloc_blocks(&start, &length, -+ 0 /* not used */ , -+ BA_DEFER -+ /* unformatted with defer */ ); -+ } -+ skip = 0; -+ offset += length; -+ ext++; -+ } -+ return retval; -+} -+ -+/* item_plugin->b.kill_units */ -+int -+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ reiser4_extent *ext; -+ reiser4_key item_key; -+ pos_in_node_t count; -+ reiser4_key from_key, to_key; -+ const reiser4_key *pfrom_key, *pto_key; -+ loff_t off; -+ int result; -+ -+ assert("vs-1541", -+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL) -+ || (kdata->params.from_key != NULL -+ && kdata->params.to_key != NULL))); -+ -+ if (kdata->params.from_key) { -+ pfrom_key = kdata->params.from_key; -+ pto_key = kdata->params.to_key; -+ } else { -+ coord_t dup; -+ -+ /* calculate key range of kill */ -+ assert("vs-1549", from == coord->unit_pos); -+ unit_key_by_coord(coord, &from_key); -+ pfrom_key = &from_key; -+ -+ coord_dup(&dup, coord); -+ dup.unit_pos = to; -+ max_unit_key_by_coord(&dup, &to_key); -+ pto_key = &to_key; -+ } -+ -+ item_key_by_coord(coord, &item_key); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_key max_item_key; -+ -+ max_item_key_by_coord(coord, &max_item_key); -+ -+ if (new_first) { -+ /* head of item is to be cut */ -+ assert("vs-1542", keyeq(pfrom_key, &item_key)); -+ assert("vs-1538", keylt(pto_key, &max_item_key)); -+ } else { -+ /* tail of item is to be cut */ -+ assert("vs-1540", keygt(pfrom_key, &item_key)); -+ assert("vs-1543", !keylt(pto_key, &max_item_key)); -+ } -+ } -+#endif -+ -+ if (smallest_removed) -+ *smallest_removed = *pfrom_key; -+ -+ if (new_first) { -+ /* item head is cut. Item key will change. This new key is calculated here */ -+ assert("vs-1556", -+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == -+ (PAGE_CACHE_SIZE - 1)); -+ *new_first = *pto_key; -+ set_key_offset(new_first, get_key_offset(new_first) + 1); -+ } -+ -+ count = to - from + 1; -+ result = kill_hook_extent(coord, from, count, kdata); -+ if (result == ITEM_TAIL_KILLED) { -+ assert("vs-1553", -+ get_key_offset(pfrom_key) >= -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ off = -+ get_key_offset(pfrom_key) - -+ (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ if (off) { -+ /* unit @from is to be cut partially. 
Its width decreases */ -+ ext = extent_item(coord) + from; -+ extent_set_width(ext, -+ (off + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT); -+ count--; -+ } -+ } else { -+ __u64 max_to_offset; -+ __u64 rest; -+ -+ assert("vs-1575", result == ITEM_HEAD_KILLED); -+ assert("", from == 0); -+ assert("", -+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - -+ 1)) == 0); -+ assert("", -+ get_key_offset(pto_key) + 1 > -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to)); -+ max_to_offset = -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1; -+ assert("", get_key_offset(pto_key) <= max_to_offset); -+ -+ rest = -+ (max_to_offset - -+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT; -+ if (rest) { -+ /* unit @to is to be cut partially */ -+ ext = extent_item(coord) + to; -+ -+ assert("", extent_get_width(ext) > rest); -+ -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ (extent_get_width(ext) - -+ rest)); -+ -+ extent_set_width(ext, rest); -+ count--; -+ } -+ } -+ return count * sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.cut_units -+ this is too similar to kill_units_extent */ -+int -+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *cdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ reiser4_extent *ext; -+ reiser4_key item_key; -+ pos_in_node_t count; -+ reiser4_key from_key, to_key; -+ const reiser4_key *pfrom_key, *pto_key; -+ loff_t off; -+ -+ assert("vs-1541", -+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL) -+ || (cdata->params.from_key != NULL -+ && cdata->params.to_key != NULL))); -+ -+ if (cdata->params.from_key) { -+ pfrom_key = cdata->params.from_key; -+ pto_key = cdata->params.to_key; -+ } else { -+ coord_t dup; -+ -+ /* calculate key range of kill */ -+ coord_dup(&dup, coord); -+ dup.unit_pos = from; -+ unit_key_by_coord(&dup, &from_key); -+ -+ dup.unit_pos = to; -+ max_unit_key_by_coord(&dup, &to_key); -+ -+ pfrom_key = &from_key; -+ pto_key = &to_key; -+ } -+ -+ assert("vs-1555", -+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0); -+ assert("vs-1556", -+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == -+ (PAGE_CACHE_SIZE - 1)); -+ -+ item_key_by_coord(coord, &item_key); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_key max_item_key; -+ -+ assert("vs-1584", -+ get_key_locality(pfrom_key) == -+ get_key_locality(&item_key)); -+ assert("vs-1585", -+ get_key_type(pfrom_key) == get_key_type(&item_key)); -+ assert("vs-1586", -+ get_key_objectid(pfrom_key) == -+ get_key_objectid(&item_key)); -+ assert("vs-1587", -+ get_key_ordering(pfrom_key) == -+ get_key_ordering(&item_key)); -+ -+ max_item_key_by_coord(coord, &max_item_key); -+ -+ if (new_first != NULL) { -+ /* head of item is to be cut */ -+ assert("vs-1542", keyeq(pfrom_key, &item_key)); -+ assert("vs-1538", keylt(pto_key, &max_item_key)); -+ } else { -+ /* tail of item is to be cut */ -+ assert("vs-1540", keygt(pfrom_key, &item_key)); -+ assert("vs-1543", keyeq(pto_key, &max_item_key)); -+ } -+ } -+#endif -+ -+ if (smallest_removed) -+ *smallest_removed = *pfrom_key; -+ -+ if (new_first) { -+ /* item head is cut. Item key will change. 
This new key is calculated here */ -+ *new_first = *pto_key; -+ set_key_offset(new_first, get_key_offset(new_first) + 1); -+ } -+ -+ count = to - from + 1; -+ -+ assert("vs-1553", -+ get_key_offset(pfrom_key) >= -+ get_key_offset(&item_key) + reiser4_extent_size(coord, from)); -+ off = -+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ if (off) { -+ /* tail of unit @from is to be cut partially. Its width decreases */ -+ assert("vs-1582", new_first == NULL); -+ ext = extent_item(coord) + from; -+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT); -+ count--; -+ } -+ -+ assert("vs-1554", -+ get_key_offset(pto_key) <= -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1); -+ off = -+ (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1) - -+ get_key_offset(pto_key); -+ if (off) { -+ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased -+ and width decreased. */ -+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0); -+ ext = extent_item(coord) + to; -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ (extent_get_width(ext) - -+ (off >> PAGE_CACHE_SHIFT))); -+ -+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT)); -+ count--; -+ } -+ return count * sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.unit_key */ -+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-300", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ (get_key_offset(key) + -+ reiser4_extent_size(coord, coord->unit_pos))); -+ -+ return key; -+} -+ -+/* item_plugin->b.max_unit_key */ -+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-300", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ (get_key_offset(key) + -+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1)); -+ return key; -+} -+ -+/* item_plugin->b.estimate -+ item_plugin->b.item_data_by_flow */ -+ -+#if REISER4_DEBUG -+ -+/* item_plugin->b.check -+ used for debugging, every item should have here the most complete -+ possible check of the consistency of the item that the inventor can -+ construct -+*/ -+int reiser4_check_extent(const coord_t * coord /* coord of item to check */, -+ const char **error /* where to store error message */) -+{ -+ reiser4_extent *ext, *first; -+ unsigned i, j; -+ reiser4_block_nr start, width, blk_cnt; -+ unsigned num_units; -+ reiser4_tree *tree; -+ oid_t oid; -+ reiser4_key key; -+ coord_t scan; -+ -+ assert("vs-933", REISER4_DEBUG); -+ -+ if (znode_get_level(coord->node) != TWIG_LEVEL) { -+ *error = "Extent on the wrong level"; -+ return -1; -+ } -+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) { -+ *error = "Wrong item size"; -+ return -1; -+ } -+ ext = first = extent_item(coord); -+ blk_cnt = reiser4_block_count(reiser4_get_current_sb()); -+ num_units = coord_num_units(coord); -+ tree = znode_get_tree(coord->node); -+ item_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ coord_dup(&scan, coord); -+ -+ for (i = 0; i < num_units; ++i, ++ext) { -+ __u64 index; -+ -+ scan.unit_pos = i; -+ index = extent_unit_index(&scan); -+ -+#if 0 -+ /* check that all jnodes are present for the unallocated -+ * extent */ -+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { -+ for (j = 0; j < extent_get_width(ext); j++) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, 
index + j); -+ if (node == NULL) { -+ print_coord("scan", &scan, 0); -+ *error = "Jnode missing"; -+ return -1; -+ } -+ jput(node); -+ } -+ } -+#endif -+ -+ start = extent_get_start(ext); -+ if (start < 2) -+ continue; -+ /* extent is an allocated one */ -+ width = extent_get_width(ext); -+ if (start >= blk_cnt) { -+ *error = "Start too large"; -+ return -1; -+ } -+ if (start + width > blk_cnt) { -+ *error = "End too large"; -+ return -1; -+ } -+ /* make sure that this extent does not overlap with other -+ allocated extents */ -+ for (j = 0; j < i; j++) { -+ if (state_of_extent(first + j) != ALLOCATED_EXTENT) -+ continue; -+ if (! -+ ((extent_get_start(ext) >= -+ extent_get_start(first + j) + -+ extent_get_width(first + j)) -+ || (extent_get_start(ext) + -+ extent_get_width(ext) <= -+ extent_get_start(first + j)))) { -+ *error = "Extent overlaps with others"; -+ return -1; -+ } -+ } -+ -+ } -+ -+ return 0; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/internal.c linux-2.6.30/fs/reiser4/plugin/item/internal.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/internal.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,404 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Implementation of internal-item plugin methods. */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "internal.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../../jnode.h" -+#include "../../znode.h" -+#include "../../tree_walk.h" -+#include "../../tree_mod.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../block_alloc.h" -+ -+/* see internal.h for explanation */ -+ -+/* plugin->u.item.b.mergeable */ -+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ , -+ const coord_t * p2 UNUSED_ARG /* second item */ ) -+{ -+ /* internal items are not mergeable */ -+ return 0; -+} -+ -+/* ->lookup() method for internal items */ -+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ , -+ lookup_bias bias UNUSED_ARG /* lookup bias */ , -+ coord_t * coord /* coord of item */ ) -+{ -+ reiser4_key ukey; -+ -+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) { -+ default: -+ impossible("", "keycmp()?!"); -+ case LESS_THAN: -+ /* FIXME-VS: AFTER_ITEM used to be here. 
But with new coord -+ item plugin can not be taken using coord set this way */ -+ assert("vs-681", coord->unit_pos == 0); -+ coord->between = AFTER_UNIT; -+ case EQUAL_TO: -+ return CBK_COORD_FOUND; -+ case GREATER_THAN: -+ return CBK_COORD_NOTFOUND; -+ } -+} -+ -+/* return body of internal item at @coord */ -+static internal_item_layout *internal_at(const coord_t * coord /* coord of -+ * item */ ) -+{ -+ assert("nikita-607", coord != NULL); -+ assert("nikita-1650", -+ item_plugin_by_coord(coord) == -+ item_plugin_by_id(NODE_POINTER_ID)); -+ return (internal_item_layout *) item_body_by_coord(coord); -+} -+ -+void reiser4_update_internal(const coord_t * coord, -+ const reiser4_block_nr * blocknr) -+{ -+ internal_item_layout *item = internal_at(coord); -+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr)); -+ -+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer); -+} -+ -+/* return child block number stored in the internal item at @coord */ -+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ ) -+{ -+ assert("nikita-608", coord != NULL); -+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer)); -+} -+ -+/* get znode pointed to by internal @item */ -+static znode *znode_at(const coord_t * item /* coord of item */ , -+ znode * parent /* parent node */ ) -+{ -+ return child_znode(item, parent, 1, 0); -+} -+ -+/* store pointer from internal item into "block". Implementation of -+ ->down_link() method */ -+void down_link_internal(const coord_t * coord /* coord of item */ , -+ const reiser4_key * key UNUSED_ARG /* key to get -+ * pointer for */ , -+ reiser4_block_nr * block /* resulting block number */ ) -+{ -+ ON_DEBUG(reiser4_key item_key); -+ -+ assert("nikita-609", coord != NULL); -+ assert("nikita-611", block != NULL); -+ assert("nikita-612", (key == NULL) || -+ /* twig horrors */ -+ (znode_get_level(coord->node) == TWIG_LEVEL) -+ || keyle(item_key_by_coord(coord, &item_key), key)); -+ -+ *block = pointer_at(coord); -+ assert("nikita-2960", reiser4_blocknr_is_sane(block)); -+} -+ -+/* Get the child's block number, or 0 if the block is unallocated. */ -+int -+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG, -+ reiser4_block_nr * block) -+{ -+ assert("jmacd-2059", coord != NULL); -+ -+ *block = pointer_at(coord); -+ assert("nikita-2961", reiser4_blocknr_is_sane(block)); -+ -+ if (reiser4_blocknr_is_fake(block)) { -+ *block = 0; -+ } -+ -+ return 0; -+} -+ -+/* Return the child. 
*/ -+int -+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG, -+ jnode ** childp) -+{ -+ reiser4_block_nr block = pointer_at(coord); -+ znode *child; -+ -+ assert("jmacd-2059", childp != NULL); -+ assert("nikita-2962", reiser4_blocknr_is_sane(&block)); -+ -+ child = zlook(znode_get_tree(coord->node), &block); -+ -+ if (IS_ERR(child)) { -+ return PTR_ERR(child); -+ } -+ -+ *childp = ZJNODE(child); -+ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+ -+static void check_link(znode * left, znode * right) -+{ -+ znode *scan; -+ -+ for (scan = left; scan != right; scan = scan->right) { -+ if (ZF_ISSET(scan, JNODE_RIP)) -+ break; -+ if (znode_is_right_connected(scan) && scan->right != NULL) { -+ if (ZF_ISSET(scan->right, JNODE_RIP)) -+ break; -+ assert("nikita-3285", -+ znode_is_left_connected(scan->right)); -+ assert("nikita-3265", -+ ergo(scan != left, -+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE))); -+ assert("nikita-3284", scan->right->left == scan); -+ } else -+ break; -+ } -+} -+ -+int check__internal(const coord_t * coord, const char **error) -+{ -+ reiser4_block_nr blk; -+ znode *child; -+ coord_t cpy; -+ -+ blk = pointer_at(coord); -+ if (!reiser4_blocknr_is_sane(&blk)) { -+ *error = "Invalid pointer"; -+ return -1; -+ } -+ coord_dup(&cpy, coord); -+ child = znode_at(&cpy, cpy.node); -+ if (child != NULL) { -+ znode *left_child; -+ znode *right_child; -+ -+ left_child = right_child = NULL; -+ -+ assert("nikita-3256", znode_invariant(child)); -+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) { -+ left_child = znode_at(&cpy, cpy.node); -+ if (left_child != NULL) { -+ read_lock_tree(znode_get_tree(child)); -+ check_link(left_child, child); -+ read_unlock_tree(znode_get_tree(child)); -+ zput(left_child); -+ } -+ } -+ coord_dup(&cpy, coord); -+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) { -+ right_child = znode_at(&cpy, cpy.node); -+ if (right_child != NULL) { -+ read_lock_tree(znode_get_tree(child)); -+ check_link(child, right_child); -+ read_unlock_tree(znode_get_tree(child)); -+ zput(right_child); -+ } -+ } -+ zput(child); -+ } -+ return 0; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* return true only if this item really points to "block" */ -+/* Audited by: green(2002.06.14) */ -+int has_pointer_to_internal(const coord_t * coord /* coord of item */ , -+ const reiser4_block_nr * block /* block number to -+ * check */ ) -+{ -+ assert("nikita-613", coord != NULL); -+ assert("nikita-614", block != NULL); -+ -+ return pointer_at(coord) == *block; -+} -+ -+/* hook called by ->create_item() method of node plugin after new internal -+ item was just created. -+ -+ This is point where pointer to new node is inserted into tree. Initialize -+ parent pointer in child znode, insert child into sibling list and slum. -+ -+*/ -+int create_hook_internal(const coord_t * item /* coord of item */ , -+ void *arg /* child's left neighbor, if any */ ) -+{ -+ znode *child; -+ __u64 child_ptr; -+ -+ assert("nikita-1252", item != NULL); -+ assert("nikita-1253", item->node != NULL); -+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL); -+ assert("nikita-1450", item->unit_pos == 0); -+ -+ /* -+ * preparing to item insertion build_child_ptr_data sets pointer to -+ * data to be inserted to jnode's blocknr which is in cpu byte -+ * order. Node's create_item simply copied those data. As result we -+ * have child pointer in cpu's byte order. Convert content of internal -+ * item to little endian byte order. 
child_ptr = get_unaligned((__u64 *)item_body_by_coord(item)); -+ reiser4_update_internal(item, &child_ptr); -+ -+ child = znode_at(item, item->node); -+ if (child != NULL && !IS_ERR(child)) { -+ znode *left; -+ int result = 0; -+ reiser4_tree *tree; -+ -+ left = arg; -+ tree = znode_get_tree(item->node); -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ assert("nikita-1400", (child->in_parent.node == NULL) -+ || (znode_above_root(child->in_parent.node))); -+ ++item->node->c_count; -+ coord_to_parent_coord(item, &child->in_parent); -+ sibling_list_insert_nolock(child, left); -+ -+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN)); -+ ZF_CLR(child, JNODE_ORPHAN); -+ -+ if ((left != NULL) && !keyeq(znode_get_rd_key(left), -+ znode_get_rd_key(child))) { -+ znode_set_rd_key(child, znode_get_rd_key(left)); -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ zput(child); -+ return result; -+ } else { -+ if (child == NULL) -+ child = ERR_PTR(-EIO); -+ return PTR_ERR(child); -+ } -+} -+ -+/* hook called by ->cut_and_kill() method of node plugin just before internal -+ item is removed. -+ -+ This is point where empty node is removed from the tree. Clear parent -+ pointer in child, and mark node for pending deletion. -+ -+ Node will be actually deleted later and in several stages: -+ -+ . when last lock on this node will be released, node will be removed from -+ the sibling list and its lock will be invalidated -+ -+ . when last reference to this node will be dropped, bitmap will be updated -+ and node will be actually removed from the memory. -+ -+*/ -+int kill_hook_internal(const coord_t * item /* coord of item */ , -+ pos_in_node_t from UNUSED_ARG /* start unit */ , -+ pos_in_node_t count UNUSED_ARG /* stop unit */ , -+ struct carry_kill_data *p UNUSED_ARG) -+{ -+ znode *child; -+ int result = 0; -+ -+ assert("nikita-1222", item != NULL); -+ assert("nikita-1224", from == 0); -+ assert("nikita-1225", count == 1); -+ -+ child = znode_at(item, item->node); -+ if (child == NULL) -+ return 0; -+ if (IS_ERR(child)) -+ return PTR_ERR(child); -+ result = zload(child); -+ if (result) { -+ zput(child); -+ return result; -+ } -+ if (node_is_empty(child)) { -+ reiser4_tree *tree; -+ -+ assert("nikita-1397", znode_is_write_locked(child)); -+ assert("nikita-1398", child->c_count == 0); -+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE)); -+ -+ tree = znode_get_tree(item->node); -+ write_lock_tree(tree); -+ init_parent_coord(&child->in_parent, NULL); -+ --item->node->c_count; -+ write_unlock_tree(tree); -+ } else { -+ warning("nikita-1223", -+ "Cowardly refuse to remove link to non-empty node"); -+ result = RETERR(-EIO); -+ } -+ zrelse(child); -+ zput(child); -+ return result; -+} -+ -+/* hook called by ->shift() node plugin method when internal item was just -+ moved from one node to another. 
-+ -+ Update parent pointer in child and c_counts in old and new parent -+ -+*/ -+int shift_hook_internal(const coord_t * item /* coord of item */ , -+ unsigned from UNUSED_ARG /* start unit */ , -+ unsigned count UNUSED_ARG /* stop unit */ , -+ znode * old_node /* old parent */ ) -+{ -+ znode *child; -+ znode *new_node; -+ reiser4_tree *tree; -+ -+ assert("nikita-1276", item != NULL); -+ assert("nikita-1277", from == 0); -+ assert("nikita-1278", count == 1); -+ assert("nikita-1451", item->unit_pos == 0); -+ -+ new_node = item->node; -+ assert("nikita-2132", new_node != old_node); -+ tree = znode_get_tree(item->node); -+ child = child_znode(item, old_node, 1, 0); -+ if (child == NULL) -+ return 0; -+ if (!IS_ERR(child)) { -+ write_lock_tree(tree); -+ ++new_node->c_count; -+ assert("nikita-1395", znode_parent(child) == old_node); -+ assert("nikita-1396", old_node->c_count > 0); -+ coord_to_parent_coord(item, &child->in_parent); -+ assert("nikita-1781", znode_parent(child) == new_node); -+ assert("nikita-1782", -+ check_tree_pointer(item, child) == NS_FOUND); -+ --old_node->c_count; -+ write_unlock_tree(tree); -+ zput(child); -+ return 0; -+ } else -+ return PTR_ERR(child); -+} -+ -+/* plugin->u.item.b.max_key_inside - not defined */ -+ -+/* plugin->u.item.b.nr_units - item.c:single_unit */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/internal.h linux-2.6.30/fs/reiser4/plugin/item/internal.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/internal.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,57 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Internal item contains down-link to the child of the internal/twig -+ node in a tree. It is internal items that are actually used during -+ tree traversal. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ ) -+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+ -+/* on-disk layout of internal item */ -+typedef struct internal_item_layout { -+ /* 0 */ reiser4_dblock_nr pointer; -+ /* 4 */ -+} internal_item_layout; -+ -+struct cut_list; -+ -+int mergeable_internal(const coord_t * p1, const coord_t * p2); -+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias, -+ coord_t * coord); -+/* store pointer from internal item into "block". 
Implementation of -+ ->down_link() method */ -+extern void down_link_internal(const coord_t * coord, const reiser4_key * key, -+ reiser4_block_nr * block); -+extern int has_pointer_to_internal(const coord_t * coord, -+ const reiser4_block_nr * block); -+extern int create_hook_internal(const coord_t * item, void *arg); -+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *); -+extern int shift_hook_internal(const coord_t * item, unsigned from, -+ unsigned count, znode * old_node); -+extern void reiser4_print_internal(const char *prefix, coord_t * coord); -+ -+extern int utmost_child_internal(const coord_t * coord, sideof side, -+ jnode ** child); -+int utmost_child_real_block_internal(const coord_t * coord, sideof side, -+ reiser4_block_nr * block); -+ -+extern void reiser4_update_internal(const coord_t * coord, -+ const reiser4_block_nr * blocknr); -+/* FIXME: reiserfs has check_internal */ -+extern int check__internal(const coord_t * coord, const char **error); -+ -+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/item.c linux-2.6.30/fs/reiser4/plugin/item/item.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/item.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/item.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,719 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* definition of item plugins. */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "sde.h" -+#include "internal.h" -+#include "item.h" -+#include "static_stat.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../tree.h" -+#include "../../context.h" -+#include "ctail.h" -+ -+/* return pointer to item body */ -+void item_body_by_coord_hard(coord_t * coord /* coord to query */ ) -+{ -+ assert("nikita-324", coord != NULL); -+ assert("nikita-325", coord->node != NULL); -+ assert("nikita-326", znode_is_loaded(coord->node)); -+ assert("nikita-3200", coord->offset == INVALID_OFFSET); -+ -+ coord->offset = -+ node_plugin_by_node(coord->node)->item_by_coord(coord) - -+ zdata(coord->node); -+ ON_DEBUG(coord->body_v = coord->node->times_locked); -+} -+ -+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ ) -+{ -+ return zdata(coord->node) + coord->offset; -+} -+ -+#if REISER4_DEBUG -+ -+int item_body_is_valid(const coord_t * coord) -+{ -+ return -+ coord->offset == -+ node_plugin_by_node(coord->node)->item_by_coord(coord) - -+ zdata(coord->node); -+} -+ -+#endif -+ -+/* return length of item at @coord */ -+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ ) -+{ -+ int len; -+ -+ assert("nikita-327", coord != NULL); -+ assert("nikita-328", coord->node != NULL); -+ assert("nikita-329", znode_is_loaded(coord->node)); -+ -+ len = node_plugin_by_node(coord->node)->length_by_coord(coord); -+ return len; -+} -+ -+void obtain_item_plugin(const coord_t * coord) -+{ -+ assert("nikita-330", coord != NULL); -+ assert("nikita-331", coord->node != NULL); -+ assert("nikita-332", znode_is_loaded(coord->node)); -+ -+ coord_set_iplug((coord_t *) coord, -+ node_plugin_by_node(coord->node)-> -+ plugin_by_coord(coord)); -+ 
assert("nikita-2479", -+ coord_iplug(coord) == -+ node_plugin_by_node(coord->node)->plugin_by_coord(coord)); -+} -+ -+/* return id of item */ -+/* Audited by: green(2002.06.15) */ -+item_id item_id_by_coord(const coord_t * coord /* coord to query */ ) -+{ -+ assert("vs-539", coord != NULL); -+ assert("vs-538", coord->node != NULL); -+ assert("vs-537", znode_is_loaded(coord->node)); -+ assert("vs-536", item_plugin_by_coord(coord) != NULL); -+ assert("vs-540", -+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID); -+ -+ return item_id_by_plugin(item_plugin_by_coord(coord)); -+} -+ -+/* return key of item at @coord */ -+/* Audited by: green(2002.06.15) */ -+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-338", coord != NULL); -+ assert("nikita-339", coord->node != NULL); -+ assert("nikita-340", znode_is_loaded(coord->node)); -+ -+ return node_plugin_by_node(coord->node)->key_at(coord, key); -+} -+ -+/* this returns max key in the item */ -+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ coord_t last; -+ -+ assert("nikita-338", coord != NULL); -+ assert("nikita-339", coord->node != NULL); -+ assert("nikita-340", znode_is_loaded(coord->node)); -+ -+ /* make coord pointing to last item's unit */ -+ coord_dup(&last, coord); -+ last.unit_pos = coord_num_units(&last) - 1; -+ assert("vs-1560", coord_is_existing_unit(&last)); -+ -+ max_unit_key_by_coord(&last, key); -+ return key; -+} -+ -+/* return key of unit at @coord */ -+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-772", coord != NULL); -+ assert("nikita-774", coord->node != NULL); -+ assert("nikita-775", znode_is_loaded(coord->node)); -+ -+ if (item_plugin_by_coord(coord)->b.unit_key != NULL) -+ return item_plugin_by_coord(coord)->b.unit_key(coord, key); -+ else -+ return item_key_by_coord(coord, key); -+} -+ -+/* return the biggest key contained the unit @coord */ -+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-772", coord != NULL); -+ assert("nikita-774", coord->node != NULL); -+ assert("nikita-775", znode_is_loaded(coord->node)); -+ -+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL) -+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key); -+ else -+ return unit_key_by_coord(coord, key); -+} -+ -+/* ->max_key_inside() method for items consisting of exactly one key (like -+ stat-data) */ -+static reiser4_key *max_key_inside_single_key(const coord_t * -+ coord /* coord of item */ , -+ reiser4_key * -+ result /* resulting key */ ) -+{ -+ assert("nikita-604", coord != NULL); -+ -+ /* coord -> key is starting key of this item and it has to be already -+ filled in */ -+ return unit_key_by_coord(coord, result); -+} -+ -+/* ->nr_units() method for items consisting of exactly one unit always */ -+pos_in_node_t -+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ ) -+{ -+ return 1; -+} -+ -+static int -+paste_no_paste(coord_t * coord UNUSED_ARG, -+ reiser4_item_data * data UNUSED_ARG, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* default ->fast_paste() method */ -+static int -+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ ) -+{ -+ return 1; -+} -+ -+int item_can_contain_key(const coord_t * item /* coord of item */ , 
-+ const reiser4_key * key /* key to check */ , -+ const reiser4_item_data * data /* parameters of item -+ * being created */ ) -+{ -+ item_plugin *iplug; -+ reiser4_key min_key_in_item; -+ reiser4_key max_key_in_item; -+ -+ assert("nikita-1658", item != NULL); -+ assert("nikita-1659", key != NULL); -+ -+ iplug = item_plugin_by_coord(item); -+ if (iplug->b.can_contain_key != NULL) -+ return iplug->b.can_contain_key(item, key, data); -+ else { -+ assert("nikita-1681", iplug->b.max_key_inside != NULL); -+ item_key_by_coord(item, &min_key_in_item); -+ iplug->b.max_key_inside(item, &max_key_in_item); -+ -+ /* can contain key if -+ min_key_in_item <= key && -+ key <= max_key_in_item -+ */ -+ return keyle(&min_key_in_item, key) -+ && keyle(key, &max_key_in_item); -+ } -+} -+ -+/* mergeable method for non mergeable items */ -+static int -+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */ -+int are_items_mergeable(const coord_t * i1 /* coord of first item */ , -+ const coord_t * i2 /* coord of second item */ ) -+{ -+ item_plugin *iplug; -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ assert("nikita-1336", i1 != NULL); -+ assert("nikita-1337", i2 != NULL); -+ -+ iplug = item_plugin_by_coord(i1); -+ assert("nikita-1338", iplug != NULL); -+ -+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in -+ shifting code when nodes are in "suspended" state. */ -+ assert("nikita-1663", -+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2))); -+ -+ if (iplug->b.mergeable != NULL) { -+ return iplug->b.mergeable(i1, i2); -+ } else if (iplug->b.max_key_inside != NULL) { -+ iplug->b.max_key_inside(i1, &k1); -+ item_key_by_coord(i2, &k2); -+ -+ /* mergeable if ->max_key_inside() >= key of i2; */ -+ return keyge(iplug->b.max_key_inside(i1, &k1), -+ item_key_by_coord(i2, &k2)); -+ } else { -+ item_key_by_coord(i1, &k1); -+ item_key_by_coord(i2, &k2); -+ -+ return -+ (get_key_locality(&k1) == get_key_locality(&k2)) && -+ (get_key_objectid(&k1) == get_key_objectid(&k2)) -+ && (iplug == item_plugin_by_coord(i2)); -+ } -+} -+ -+int item_is_extent(const coord_t * item) -+{ -+ assert("vs-482", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == EXTENT_POINTER_ID; -+} -+ -+int item_is_tail(const coord_t * item) -+{ -+ assert("vs-482", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == FORMATTING_ID; -+} -+ -+#if REISER4_DEBUG -+ -+int item_is_statdata(const coord_t * item) -+{ -+ assert("vs-516", coord_is_existing_item(item)); -+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE); -+} -+ -+int item_is_ctail(const coord_t * item) -+{ -+ assert("edward-xx", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == CTAIL_ID; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+static int change_item(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change constituent item (sd, or dir_item) */ -+ return RETERR(-EINVAL); -+} -+ -+static reiser4_plugin_ops item_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_item -+}; -+ -+item_plugin item_plugins[LAST_ITEM_ID] = { -+ [STATIC_STAT_DATA_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = STATIC_STAT_DATA_ID, -+ .groups = (1 << STAT_DATA_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "sd", -+ .desc = "stat-data", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = 
max_key_inside_single_key, -+ .can_contain_key = NULL, -+ .mergeable = not_mergeable, -+ .nr_units = nr_units_single_unit, -+ .lookup = NULL, -+ .init = NULL, -+ .paste = paste_no_paste, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .sd = { -+ .init_inode = init_inode_static_sd, -+ .save_len = save_len_static_sd, -+ .save = save_static_sd -+ } -+ } -+ }, -+ [SIMPLE_DIR_ENTRY_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = SIMPLE_DIR_ENTRY_ID, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "de", -+ .desc = "directory entry", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_single_key, -+ .can_contain_key = NULL, -+ .mergeable = NULL, -+ .nr_units = nr_units_single_unit, -+ .lookup = NULL, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .dir = { -+ .extract_key = extract_key_de, -+ .update_key = update_key_de, -+ .extract_name = extract_name_de, -+ .extract_file_type = extract_file_type_de, -+ .add_entry = add_entry_de, -+ .rem_entry = rem_entry_de, -+ .max_name_len = max_name_len_de -+ } -+ } -+ }, -+ [COMPOUND_DIR_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = COMPOUND_DIR_ID, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "cde", -+ .desc = "compressed directory entry", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_cde, -+ .can_contain_key = can_contain_key_cde, -+ .mergeable = mergeable_cde, -+ .nr_units = nr_units_cde, -+ .lookup = lookup_cde, -+ .init = init_cde, -+ .paste = paste_cde, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_cde, -+ .copy_units = copy_units_cde, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = cut_units_cde, -+ .kill_units = kill_units_cde, -+ .unit_key = unit_key_cde, -+ .max_unit_key = unit_key_cde, -+ .estimate = estimate_cde, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = reiser4_check_cde -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .dir = { -+ .extract_key = extract_key_cde, -+ .update_key = update_key_cde, -+ .extract_name = extract_name_cde, -+ .extract_file_type = extract_file_type_de, -+ .add_entry = add_entry_cde, -+ .rem_entry = rem_entry_cde, -+ .max_name_len = max_name_len_cde -+ } -+ } -+ }, -+ [NODE_POINTER_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = NODE_POINTER_ID, -+ .groups = (1 << INTERNAL_ITEM_TYPE), -+ .pops = NULL, -+ .label = "internal", -+ .desc = "internal item", -+ .linkage = {NULL, NULL} -+ 
}, -+ .b = { -+ .max_key_inside = NULL, -+ .can_contain_key = NULL, -+ .mergeable = mergeable_internal, -+ .nr_units = nr_units_single_unit, -+ .lookup = lookup_internal, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = create_hook_internal, -+ .kill_hook = kill_hook_internal, -+ .shift_hook = shift_hook_internal, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = check__internal -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_internal, -+ .utmost_child_real_block = -+ utmost_child_real_block_internal, -+ .update = reiser4_update_internal, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .internal = { -+ .down_link = down_link_internal, -+ .has_pointer_to = has_pointer_to_internal -+ } -+ } -+ }, -+ [EXTENT_POINTER_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = EXTENT_POINTER_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "extent", -+ .desc = "extent item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_extent, -+ .can_contain_key = can_contain_key_extent, -+ .mergeable = mergeable_extent, -+ .nr_units = nr_units_extent, -+ .lookup = lookup_extent, -+ .init = NULL, -+ .paste = paste_extent, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_extent, -+ .create_hook = create_hook_extent, -+ .copy_units = copy_units_extent, -+ .kill_hook = kill_hook_extent, -+ .shift_hook = NULL, -+ .cut_units = cut_units_extent, -+ .kill_units = kill_units_extent, -+ .unit_key = unit_key_extent, -+ .max_unit_key = max_unit_key_extent, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = reiser4_check_extent -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_extent, -+ .utmost_child_real_block = -+ utmost_child_real_block_extent, -+ .update = NULL, -+ .scan = reiser4_scan_extent, -+ .convert = NULL, -+ .key_by_offset = key_by_offset_extent -+ }, -+ .s = { -+ .file = { -+ .write = reiser4_write_extent, -+ .read = reiser4_read_extent, -+ .readpage = reiser4_readpage_extent, -+ .get_block = get_block_address_extent, -+ .append_key = append_key_extent, -+ .init_coord_extension = -+ init_coord_extension_extent -+ } -+ } -+ }, -+ [FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = FORMATTING_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "body", -+ .desc = "body (or tail?) 
item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_tail, -+ .can_contain_key = can_contain_key_tail, -+ .mergeable = mergeable_tail, -+ .nr_units = nr_units_tail, -+ .lookup = lookup_tail, -+ .init = NULL, -+ .paste = paste_tail, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_tail, -+ .create_hook = NULL, -+ .copy_units = copy_units_tail, -+ .kill_hook = kill_hook_tail, -+ .shift_hook = NULL, -+ .cut_units = cut_units_tail, -+ .kill_units = kill_units_tail, -+ .unit_key = unit_key_tail, -+ .max_unit_key = unit_key_tail, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .file = { -+ .write = reiser4_write_tail, -+ .read = reiser4_read_tail, -+ .readpage = readpage_tail, -+ .get_block = get_block_address_tail, -+ .append_key = append_key_tail, -+ .init_coord_extension = -+ init_coord_extension_tail -+ } -+ } -+ }, -+ [CTAIL_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = CTAIL_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "ctail", -+ .desc = "cryptcompress tail item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_tail, -+ .can_contain_key = can_contain_key_ctail, -+ .mergeable = mergeable_ctail, -+ .nr_units = nr_units_ctail, -+ .lookup = NULL, -+ .init = init_ctail, -+ .paste = paste_ctail, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_ctail, -+ .create_hook = create_hook_ctail, -+ .copy_units = copy_units_ctail, -+ .kill_hook = kill_hook_ctail, -+ .shift_hook = shift_hook_ctail, -+ .cut_units = cut_units_ctail, -+ .kill_units = kill_units_ctail, -+ .unit_key = unit_key_tail, -+ .max_unit_key = unit_key_tail, -+ .estimate = estimate_ctail, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = check_ctail -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_ctail, -+ /* FIXME-EDWARD: write this */ -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = scan_ctail, -+ .convert = convert_ctail -+ }, -+ .s = { -+ .file = { -+ .write = NULL, -+ .read = read_ctail, -+ .readpage = readpage_ctail, -+ .get_block = get_block_address_tail, -+ .append_key = append_key_ctail, -+ .init_coord_extension = -+ init_coord_extension_tail -+ } -+ } -+ }, -+ [BLACK_BOX_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = BLACK_BOX_ID, -+ .groups = (1 << OTHER_ITEM_TYPE), -+ .pops = NULL, -+ .label = "blackbox", -+ .desc = "black box item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = NULL, -+ .can_contain_key = NULL, -+ .mergeable = not_mergeable, -+ .nr_units = nr_units_single_unit, -+ /* to need for ->lookup method */ -+ .lookup = NULL, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ } -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/item.h linux-2.6.30/fs/reiser4/plugin/item/item.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/item.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/item.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,398 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* first read balance.c comments before reading this */ -+ -+/* An item_plugin implements all of the operations required for -+ balancing that are item specific. */ -+ -+/* an item plugin also implements other operations that are specific to that -+ item. These go into the item specific operations portion of the item -+ handler, and all of the item specific portions of the item handler are put -+ into a union. */ -+ -+#if !defined( __REISER4_ITEM_H__ ) -+#define __REISER4_ITEM_H__ -+ -+#include "../../forward.h" -+#include "../plugin_header.h" -+#include "../../dformat.h" -+#include "../../seal.h" -+#include "../../plugin/file/file.h" -+ -+#include <linux/fs.h> /* for struct file, struct inode */ -+#include <linux/mm.h> /* for struct page */ -+#include <linux/dcache.h> /* for struct dentry */ -+ -+typedef enum { -+ STAT_DATA_ITEM_TYPE, -+ DIR_ENTRY_ITEM_TYPE, -+ INTERNAL_ITEM_TYPE, -+ UNIX_FILE_METADATA_ITEM_TYPE, -+ OTHER_ITEM_TYPE -+} item_type_id; -+ -+/* this is the part of each item plugin that all items are expected to -+ support or at least explicitly fail to support by setting the -+ pointer to null. */ -+struct balance_ops { -+ /* operations called by balancing -+ -+ It is interesting to consider that some of these item -+ operations could be given sources or targets that are not -+ really items in nodes. This could be ok/useful. -+ -+ */ -+ /* maximal key that can _possibly_ be occupied by this item -+ -+ When inserting, and node ->lookup() method (called by -+ coord_by_key()) reaches an item after binary search, -+ the ->max_key_inside() item plugin method is used to determine -+ whether new item should pasted into existing item -+ (new_key<=max_key_inside()) or new item has to be created -+ (new_key>max_key_inside()). -+ -+ For items that occupy exactly one key (like stat-data) -+ this method should return this key. For items that can -+ grow indefinitely (extent, directory item) this should -+ return reiser4_max_key(). -+ -+ For example extent with the key -+ -+ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, -+ -+ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and -+ */ -+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *); -+ -+ /* true if item @coord can merge data at @key. */ -+ int (*can_contain_key) (const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+ /* mergeable() - check items for mergeability -+ -+ Optional method. Returns true if two items can be merged. -+ -+ */ -+ int (*mergeable) (const coord_t *, const coord_t *); -+ -+ /* number of atomic things in an item. -+ NOTE FOR CONTRIBUTORS: use a generic method -+ nr_units_single_unit() for solid (atomic) items, as -+ tree operations use it as a criterion of solidness -+ (see is_solid_item macro) */ -+ pos_in_node_t(*nr_units) (const coord_t *); -+ -+ /* search within item for a unit within the item, and return a -+ pointer to it. 
This can be used to calculate how many -+ bytes to shrink an item if you use pointer arithmetic and -+ compare to the start of the item body if the item's data -+ are continuous in the node, if the item's data are not -+ continuous in the node, all sorts of other things are maybe -+ going to break as well. */ -+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *); -+ /* method called by node_plugin->create_item() to initialise new -+ item */ -+ int (*init) (coord_t * target, coord_t * from, -+ reiser4_item_data * data); -+ /* method called (e.g., by reiser4_resize_item()) to place new data -+ into item when it grows */ -+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *); -+ /* return true if paste into @coord is allowed to skip -+ carry. That is, if such paste would not require any changes -+ at the parent level -+ */ -+ int (*fast_paste) (const coord_t *); -+ /* how many but not more than @want units of @source can be -+ shifted into @target node. If pend == append - we try to -+ append last item of @target by first units of @source. If -+ pend == prepend - we try to "prepend" first item in @target -+ by last units of @source. @target node has @free_space -+ bytes of free space. Total size of those units are returned -+ via @size. -+ -+ @target is not NULL if shifting to the mergeable item and -+ NULL if new item will be created during shifting. -+ */ -+ int (*can_shift) (unsigned free_space, coord_t *, -+ znode *, shift_direction, unsigned *size, -+ unsigned want); -+ -+ /* starting off @from-th unit of item @source append or -+ prepend @count units to @target. @target has been already -+ expanded by @free_space bytes. That must be exactly what is -+ needed for those items in @target. If @where_is_free_space -+ == SHIFT_LEFT - free space is at the end of @target item, -+ otherwise - it is in the beginning of it. */ -+ void (*copy_units) (coord_t *, coord_t *, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, -+ unsigned free_space); -+ -+ int (*create_hook) (const coord_t *, void *); -+ /* do whatever is necessary to do when @count units starting -+ from @from-th one are removed from the tree */ -+ /* FIXME-VS: this used to be here for, in particular, -+ extents and items of internal type to free blocks they point -+ to at the same time with removing items from a -+ tree. Problems start, however, when dealloc_block fails due -+ to some reason. Item gets removed, but blocks it pointed to -+ are not freed. It is not clear how to fix this for items of -+ internal type because a need to remove internal item may -+ appear in the middle of balancing, and there is no way to -+ undo changes made. OTOH, if space allocator involves -+ balancing to perform dealloc_block - this will probably -+ break balancing due to deadlock issues -+ */ -+ int (*kill_hook) (const coord_t *, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *); -+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count, -+ znode * _node); -+ -+ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key -+ including boundaries. When units are cut from item beginning - move space which gets freed to head of -+ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of -+ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in -+ @smallest_removed if it is not 0. 
Save new first item key in @new_first_key if it is not 0 -+ */ -+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, -+ reiser4_key * smallest_removed, -+ reiser4_key * new_first_key); -+ -+ /* like cut_units, except that these units are removed from the -+ tree, not only from a node */ -+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, -+ reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+ -+ /* if @key_of_coord == 1 - returned key of coord, otherwise - -+ key of unit is returned. If @coord is not set to certain -+ unit - ERR_PTR(-ENOENT) is returned */ -+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *); -+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *); -+ /* estimate how much space is needed for paste @data into item at -+ @coord. if @coord==0 - estimate insertion, otherwise - estimate -+ pasting -+ */ -+ int (*estimate) (const coord_t *, const reiser4_item_data *); -+ -+ /* converts flow @f to item data. @coord == 0 on insert */ -+ int (*item_data_by_flow) (const coord_t *, const flow_t *, -+ reiser4_item_data *); -+ -+ /*void (*show) (struct seq_file *, coord_t *); */ -+ -+#if REISER4_DEBUG -+ /* used for debugging, every item should have here the most -+ complete possible check of the consistency of the item that -+ the inventor can construct */ -+ int (*check) (const coord_t *, const char **error); -+#endif -+ -+}; -+ -+struct flush_ops { -+ /* return the right or left child of @coord, only if it is in memory */ -+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child); -+ -+ /* return whether the right or left child of @coord has a non-fake -+ block number. */ -+ int (*utmost_child_real_block) (const coord_t *, sideof side, -+ reiser4_block_nr *); -+ /* relocate child at @coord to the @block */ -+ void (*update) (const coord_t *, const reiser4_block_nr *); -+ /* count unformatted nodes per item for leave relocation policy, etc.. */ -+ int (*scan) (flush_scan * scan); -+ /* convert item by flush */ -+ int (*convert) (flush_pos_t * pos); -+ /* backward mapping from jnode offset to a key. */ -+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *); -+}; -+ -+/* operations specific to the directory item */ -+struct dir_entry_iops { -+ /* extract stat-data key from directory entry at @coord and place it -+ into @key. */ -+ int (*extract_key) (const coord_t *, reiser4_key * key); -+ /* update object key in item. 
*/ -+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *); -+ /* extract name from directory entry at @coord and return it */ -+ char *(*extract_name) (const coord_t *, char *buf); -+ /* extract file type (DT_* stuff) from directory entry at @coord and -+ return it */ -+ unsigned (*extract_file_type) (const coord_t *); -+ int (*add_entry) (struct inode * dir, -+ coord_t *, lock_handle *, -+ const struct dentry * name, -+ reiser4_dir_entry_desc * entry); -+ int (*rem_entry) (struct inode * dir, const struct qstr * name, -+ coord_t *, lock_handle *, -+ reiser4_dir_entry_desc * entry); -+ int (*max_name_len) (const struct inode * dir); -+}; -+ -+/* operations specific to items regular (unix) file metadata are built of */ -+struct file_iops{ -+ int (*write) (struct file *, struct inode *, -+ const char __user *, size_t, loff_t *pos); -+ int (*read) (struct file *, flow_t *, hint_t *); -+ int (*readpage) (void *, struct page *); -+ int (*get_block) (const coord_t *, sector_t, sector_t *); -+ /* -+ * key of first byte which is not addressed by the item @coord is set -+ * to. -+ * For example, for extent item with the key -+ * -+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, -+ * -+ * ->append_key is -+ * -+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size) -+ */ -+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *); -+ -+ void (*init_coord_extension) (uf_coord_t *, loff_t); -+}; -+ -+/* operations specific to items of stat data type */ -+struct sd_iops { -+ int (*init_inode) (struct inode * inode, char *sd, int len); -+ int (*save_len) (struct inode * inode); -+ int (*save) (struct inode * inode, char **area); -+}; -+ -+/* operations specific to internal item */ -+struct internal_iops{ -+ /* all tree traversal want to know from internal item is where -+ to go next. */ -+ void (*down_link) (const coord_t * coord, -+ const reiser4_key * key, reiser4_block_nr * block); -+ /* check that given internal item contains given pointer. 
*/ -+ int (*has_pointer_to) (const coord_t * coord, -+ const reiser4_block_nr * block); -+}; -+ -+struct item_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* methods common for all item types */ -+ struct balance_ops b; /* balance operations */ -+ struct flush_ops f; /* flush operates with items via this methods */ -+ -+ /* methods specific to particular type of item */ -+ union { -+ struct dir_entry_iops dir; -+ struct file_iops file; -+ struct sd_iops sd; -+ struct internal_iops internal; -+ } s; -+}; -+ -+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit) -+ -+static inline item_id item_id_by_plugin(item_plugin * plugin) -+{ -+ return plugin->h.id; -+} -+ -+static inline char get_iplugid(item_plugin * iplug) -+{ -+ assert("nikita-2838", iplug != NULL); -+ assert("nikita-2839", iplug->h.id < 0xff); -+ return (char)item_id_by_plugin(iplug); -+} -+ -+extern unsigned long znode_times_locked(const znode * z); -+ -+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug) -+{ -+ assert("nikita-2837", coord != NULL); -+ assert("nikita-2838", iplug != NULL); -+ coord->iplugid = get_iplugid(iplug); -+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node)); -+} -+ -+static inline item_plugin *coord_iplug(const coord_t * coord) -+{ -+ assert("nikita-2833", coord != NULL); -+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID); -+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node)); -+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE, -+ coord->iplugid); -+} -+ -+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key, -+ const reiser4_item_data *); -+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2); -+extern int item_is_extent(const coord_t *); -+extern int item_is_tail(const coord_t *); -+extern int item_is_statdata(const coord_t * item); -+extern int item_is_ctail(const coord_t *); -+ -+extern pos_in_node_t item_length_by_coord(const coord_t * coord); -+extern pos_in_node_t nr_units_single_unit(const coord_t * coord); -+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ ); -+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key); -+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *); -+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key); -+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord, -+ reiser4_key * key); -+extern void obtain_item_plugin(const coord_t * coord); -+ -+#if defined(REISER4_DEBUG) -+extern int znode_is_loaded(const znode * node); -+#endif -+ -+/* return plugin of item at @coord */ -+static inline item_plugin *item_plugin_by_coord(const coord_t * -+ coord /* coord to query */ ) -+{ -+ assert("nikita-330", coord != NULL); -+ assert("nikita-331", coord->node != NULL); -+ assert("nikita-332", znode_is_loaded(coord->node)); -+ -+ if (unlikely(!coord_is_iplug_set(coord))) -+ obtain_item_plugin(coord); -+ return coord_iplug(coord); -+} -+ -+/* this returns true if item is of internal type */ -+static inline int item_is_internal(const coord_t * item) -+{ -+ assert("vs-483", coord_is_existing_item(item)); -+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE); -+} -+ -+extern void item_body_by_coord_hard(coord_t * coord); -+extern void *item_body_by_coord_easy(const coord_t * coord); -+#if REISER4_DEBUG -+extern int item_body_is_valid(const coord_t * coord); -+#endif -+ -+/* return pointer to item body */ -+static inline void 
*item_body_by_coord(const coord_t * -+ coord /* coord to query */ ) -+{ -+ assert("nikita-324", coord != NULL); -+ assert("nikita-325", coord->node != NULL); -+ assert("nikita-326", znode_is_loaded(coord->node)); -+ -+ if (coord->offset == INVALID_OFFSET) -+ item_body_by_coord_hard((coord_t *) coord); -+ assert("nikita-3201", item_body_is_valid(coord)); -+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node)); -+ return item_body_by_coord_easy(coord); -+} -+ -+/* __REISER4_ITEM_H__ */ -+#endif -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/Makefile linux-2.6.30/fs/reiser4/plugin/item/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,18 @@ -+obj-$(CONFIG_REISER4_FS) += item_plugins.o -+ -+item_plugins-objs := \ -+ item.o \ -+ static_stat.o \ -+ sde.o \ -+ cde.o \ -+ blackbox.o \ -+ internal.o \ -+ tail.o \ -+ ctail.o \ -+ extent.o \ -+ extent_item_ops.o \ -+ extent_file_ops.o \ -+ extent_flush_ops.o -+ -+ -+ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/sde.c linux-2.6.30/fs/reiser4/plugin/item/sde.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/sde.c 2009-06-22 17:27:31.000000000 +0200 -@@ -0,0 +1,190 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry implementation */ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../coord.h" -+#include "sde.h" -+#include "item.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+ -+#include <linux/fs.h> /* for struct inode */ -+#include <linux/dcache.h> /* for struct dentry */ -+#include <linux/quotaops.h> -+ -+/* ->extract_key() method of simple directory item plugin. 
*/ -+int extract_key_de(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1458", coord != NULL); -+ assert("nikita-1459", key != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent); -+ return extract_key_from_id(&dent->id, key); -+} -+ -+int -+update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh UNUSED_ARG) -+{ -+ directory_entry_format *dent; -+ obj_key_id obj_id; -+ int result; -+ -+ assert("nikita-2342", coord != NULL); -+ assert("nikita-2343", key != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ result = build_obj_key_id(key, &obj_id); -+ if (result == 0) { -+ dent->id = obj_id; -+ znode_make_dirty(coord->node); -+ } -+ return 0; -+} -+ -+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent, -+ char *buf) -+{ -+ reiser4_key key; -+ -+ unit_key_by_coord(coord, &key); -+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR) -+ reiser4_print_address("oops", znode_get_block(coord->node)); -+ if (!is_longname_key(&key)) { -+ if (is_dot_key(&key)) -+ return (char *)"."; -+ else -+ return extract_name_from_key(&key, buf); -+ } else -+ return (char *)dent->name; -+} -+ -+/* ->extract_name() method of simple directory item plugin. */ -+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1460", coord != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ return extract_dent_name(coord, dent, buf); -+} -+ -+/* ->extract_file_type() method of simple directory item plugin. */ -+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of -+ * item */ ) -+{ -+ assert("nikita-1764", coord != NULL); -+ /* we don't store file type in the directory entry yet. 
-+ -+ But see comments at kassign.h:obj_key_id -+ */ -+ return DT_UNKNOWN; -+} -+ -+int add_entry_de(struct inode *dir /* directory of item */ , -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh /* insertion lock handle */ , -+ const struct dentry *de /* name to add */ , -+ reiser4_dir_entry_desc * entry /* parameters of new directory -+ * entry */ ) -+{ -+ reiser4_item_data data; -+ directory_entry_format *dent; -+ int result; -+ const char *name; -+ int len; -+ int longname; -+ -+ name = de->d_name.name; -+ len = de->d_name.len; -+ assert("nikita-1163", strlen(name) == len); -+ -+ longname = is_longname(name, len); -+ -+ data.length = sizeof *dent; -+ if (longname) -+ data.length += len + 1; -+ data.data = NULL; -+ data.user = 0; -+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID); -+ -+ /* NOTE-NIKITA quota plugin */ -+ if (vfs_dq_alloc_space_nodirty(dir, data.length)) -+ return -EDQUOT; -+ -+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ ); -+ if (result != 0) -+ return result; -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ build_inode_key_id(entry->obj, &dent->id); -+ if (longname) { -+ memcpy(dent->name, name, len); -+ put_unaligned(0, &dent->name[len]); -+ } -+ return 0; -+} -+ -+int rem_entry_de(struct inode *dir /* directory of item */ , -+ const struct qstr *name UNUSED_ARG, -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh UNUSED_ARG /* lock handle for -+ * removal */ , -+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of -+ * directory entry -+ * being removed */ ) -+{ -+ coord_t shadow; -+ int result; -+ int length; -+ -+ length = item_length_by_coord(coord); -+ if (inode_get_bytes(dir) < length) { -+ warning("nikita-2627", "Dir is broke: %llu: %llu", -+ (unsigned long long)get_inode_oid(dir), -+ inode_get_bytes(dir)); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* cut_node() is supposed to take pointers to _different_ -+ coords, because it will modify them without respect to -+ possible aliasing. To work around this, create temporary copy -+ of @coord. -+ */ -+ coord_dup(&shadow, coord); -+ result = -+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); -+ if (result == 0) { -+ /* NOTE-NIKITA quota plugin */ -+ vfs_dq_free_space_nodirty(dir, length); -+ } -+ return result; -+} -+ -+int max_name_len_de(const struct inode *dir) -+{ -+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() - -+ sizeof(directory_entry_format) - 2; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/sde.h linux-2.6.30/fs/reiser4/plugin/item/sde.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/sde.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,66 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) -+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+#include <linux/fs.h> -+#include <linux/dcache.h> /* for struct dentry */ -+ -+typedef struct directory_entry_format { -+ /* key of object stat-data. 
It's not necessary to store whole -+ key here, because it's always key of stat-data, so minor -+ packing locality and offset can be omitted here. But this -+ relies on particular key allocation scheme for stat-data, so, -+ for extensibility sake, whole key can be stored here. -+ -+ We store key as array of bytes, because we don't want 8-byte -+ alignment of dir entries. -+ */ -+ obj_key_id id; -+ /* file name. Null terminated string. */ -+ d8 name[0]; -+} directory_entry_format; -+ -+void print_de(const char *prefix, coord_t * coord); -+int extract_key_de(const coord_t * coord, reiser4_key * key); -+int update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_de(const coord_t * coord, char *buf); -+unsigned extract_file_type_de(const coord_t * coord); -+int add_entry_de(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_de(const struct inode *dir); -+ -+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); -+ -+char *extract_dent_name(const coord_t * coord, -+ directory_entry_format * dent, char *buf); -+ -+#if REISER4_LARGE_KEY -+#define DE_NAME_BUF_LEN (24) -+#else -+#define DE_NAME_BUF_LEN (16) -+#endif -+ -+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.30/fs/reiser4/plugin/item/static_stat.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/static_stat.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1107 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* stat data manipulation. */ -+ -+#include "../../forward.h" -+#include "../../super.h" -+#include "../../vfs_ops.h" -+#include "../../inode.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../object.h" -+#include "../plugin.h" -+#include "../plugin_header.h" -+#include "static_stat.h" -+#include "item.h" -+ -+#include <linux/types.h> -+#include <linux/fs.h> -+ -+/* see static_stat.h for explanation */ -+ -+/* helper function used while we are dumping/loading inode/plugin state -+ to/from the stat-data. */ -+ -+static void move_on(int *length /* space remaining in stat-data */ , -+ char **area /* current coord in stat data */ , -+ int size_of /* how many bytes to move forward */ ) -+{ -+ assert("nikita-615", length != NULL); -+ assert("nikita-616", area != NULL); -+ -+ *length -= size_of; -+ *area += size_of; -+ -+ assert("nikita-617", *length >= 0); -+} -+ -+/* helper function used while loading inode/plugin state from stat-data. -+ Complain if there is less space in stat-data than was expected. -+ Can only happen on disk corruption. */ -+static int not_enough_space(struct inode *inode /* object being processed */ , -+ const char *where /* error message */ ) -+{ -+ assert("nikita-618", inode != NULL); -+ -+ warning("nikita-619", "Not enough space in %llu while loading %s", -+ (unsigned long long)get_inode_oid(inode), where); -+ -+ return RETERR(-EINVAL); -+} -+ -+/* helper function used while loading inode/plugin state from -+ stat-data. 
Call it if invalid plugin id was found. */ -+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ , -+ struct inode *inode /* object being processed */ ) -+{ -+ warning("nikita-620", "Unknown plugin %i in %llu", -+ id, (unsigned long long)get_inode_oid(inode)); -+ -+ return RETERR(-EINVAL); -+} -+ -+/* this is installed as ->init_inode() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). -+ Copies data from on-disk stat-data format into inode. -+ Handles stat-data extensions. */ -+/* was sd_load */ -+int init_inode_static_sd(struct inode *inode /* object being processed */ , -+ char *sd /* stat-data body */ , -+ int len /* length of stat-data */ ) -+{ -+ int result; -+ int bit; -+ int chunk; -+ __u16 mask; -+ __u64 bigmask; -+ reiser4_stat_data_base *sd_base; -+ reiser4_inode *state; -+ -+ assert("nikita-625", inode != NULL); -+ assert("nikita-626", sd != NULL); -+ -+ result = 0; -+ sd_base = (reiser4_stat_data_base *) sd; -+ state = reiser4_inode_data(inode); -+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask)); -+ bigmask = mask; -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ -+ move_on(&len, &sd, sizeof *sd_base); -+ for (bit = 0, chunk = 0; -+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION; -+ ++bit, mask >>= 1) { -+ if (((bit + 1) % 16) != 0) { -+ /* handle extension */ -+ sd_ext_plugin *sdplug; -+ -+ if (bit >= LAST_SD_EXTENSION) { -+ warning("vpf-1904", -+ "No such extension %i in inode %llu", -+ bit, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ -+ sdplug = sd_ext_plugin_by_id(bit); -+ if (sdplug == NULL) { -+ warning("nikita-627", -+ "No such extension %i in inode %llu", -+ bit, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ if (mask & 1) { -+ assert("nikita-628", sdplug->present); -+ /* alignment is not supported in node layout -+ plugin yet. -+ result = align( inode, &len, &sd, -+ sdplug -> alignment ); -+ if( result != 0 ) -+ return result; */ -+ result = sdplug->present(inode, &sd, &len); -+ } else if (sdplug->absent != NULL) -+ result = sdplug->absent(inode); -+ if (result) -+ break; -+ /* else, we are looking at the last bit in 16-bit -+ portion of bitmask */ -+ } else if (mask & 1) { -+ /* next portion of bitmask */ -+ if (len < (int)sizeof(d16)) { -+ warning("nikita-629", -+ "No space for bitmap in inode %llu", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ mask = le16_to_cpu(get_unaligned((d16 *)sd)); -+ bigmask <<= 16; -+ bigmask |= mask; -+ move_on(&len, &sd, sizeof(d16)); -+ ++chunk; -+ if (chunk == 3) { -+ if (!(mask & 0x8000)) { -+ /* clear last bit */ -+ mask &= ~0x8000; -+ continue; -+ } -+ /* too much */ -+ warning("nikita-630", -+ "Too many extensions in %llu", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ } else -+ /* bitmask exhausted */ -+ break; -+ } -+ state->extmask = bigmask; -+ /* common initialisations */ -+ if (len - (bit / 16 * sizeof(d16)) > 0) { -+ /* alignment in save_len_static_sd() is taken into account -+ -edward */ -+ warning("nikita-631", "unused space in inode %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ -+ return result; -+} -+ -+/* estimates size of stat-data required to store inode. -+ Installed as ->save_len() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). 
*/ -+/* was sd_len */ -+int save_len_static_sd(struct inode *inode /* object being processed */ ) -+{ -+ unsigned int result; -+ __u64 mask; -+ int bit; -+ -+ assert("nikita-632", inode != NULL); -+ -+ result = sizeof(reiser4_stat_data_base); -+ mask = reiser4_inode_data(inode)->extmask; -+ for (bit = 0; mask != 0; ++bit, mask >>= 1) { -+ if (mask & 1) { -+ sd_ext_plugin *sdplug; -+ -+ sdplug = sd_ext_plugin_by_id(bit); -+ assert("nikita-633", sdplug != NULL); -+ /* no aligment support -+ result += -+ round_up( result, sdplug -> alignment ) - result; */ -+ result += sdplug->save_len(inode); -+ } -+ } -+ result += bit / 16 * sizeof(d16); -+ return result; -+} -+ -+/* saves inode into stat-data. -+ Installed as ->save() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */ -+/* was sd_save */ -+int save_static_sd(struct inode *inode /* object being processed */ , -+ char **area /* where to save stat-data */ ) -+{ -+ int result; -+ __u64 emask; -+ int bit; -+ unsigned int len; -+ reiser4_stat_data_base *sd_base; -+ -+ assert("nikita-634", inode != NULL); -+ assert("nikita-635", area != NULL); -+ -+ result = 0; -+ emask = reiser4_inode_data(inode)->extmask; -+ sd_base = (reiser4_stat_data_base *) * area; -+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask); -+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/ -+ -+ *area += sizeof *sd_base; -+ len = 0xffffffffu; -+ for (bit = 0; emask != 0; ++bit, emask >>= 1) { -+ if (emask & 1) { -+ if ((bit + 1) % 16 != 0) { -+ sd_ext_plugin *sdplug; -+ sdplug = sd_ext_plugin_by_id(bit); -+ assert("nikita-636", sdplug != NULL); -+ /* no alignment support yet -+ align( inode, &len, area, -+ sdplug -> alignment ); */ -+ result = sdplug->save(inode, area); -+ if (result) -+ break; -+ } else { -+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), -+ (d16 *)(*area)); -+ /*cputod16((unsigned)(emask & 0xffff), -+ (d16 *) * area);*/ -+ *area += sizeof(d16); -+ } -+ } -+ } -+ return result; -+} -+ -+/* stat-data extension handling functions. */ -+ -+static int present_lw_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) { -+ reiser4_light_weight_stat *sd_lw; -+ -+ sd_lw = (reiser4_light_weight_stat *) * area; -+ -+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode)); -+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink)); -+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size)); -+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) { -+ inode->i_mode &= ~S_IFIFO; -+ warning("", "partially converted file is encountered"); -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ } -+ move_on(len, area, sizeof *sd_lw); -+ return 0; -+ } else -+ return not_enough_space(inode, "lw sd"); -+} -+ -+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_light_weight_stat); -+} -+ -+static int save_lw_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_light_weight_stat *sd; -+ mode_t delta; -+ -+ assert("nikita-2705", inode != NULL); -+ assert("nikita-2706", area != NULL); -+ assert("nikita-2707", *area != NULL); -+ -+ sd = (reiser4_light_weight_stat *) * area; -+ -+ delta = (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) ? 
S_IFIFO : 0); -+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode); -+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink); -+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int present_unix_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ assert("nikita-637", inode != NULL); -+ assert("nikita-638", area != NULL); -+ assert("nikita-639", *area != NULL); -+ assert("nikita-640", len != NULL); -+ assert("nikita-641", *len > 0); -+ -+ if (*len >= (int)sizeof(reiser4_unix_stat)) { -+ reiser4_unix_stat *sd; -+ -+ sd = (reiser4_unix_stat *) * area; -+ -+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid)); -+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid)); -+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime)); -+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime)); -+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime)); -+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) -+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev)); -+ else -+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes))); -+ move_on(len, area, sizeof *sd); -+ return 0; -+ } else -+ return not_enough_space(inode, "unix sd"); -+} -+ -+static int absent_unix_sd(struct inode *inode /* object being processed */ ) -+{ -+ inode->i_uid = get_super_private(inode->i_sb)->default_uid; -+ inode->i_gid = get_super_private(inode->i_sb)->default_gid; -+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -+ inode_set_bytes(inode, inode->i_size); -+ /* mark inode as lightweight, so that caller (lookup_common) will -+ complete initialisation by copying [ug]id from a parent. 
*/ -+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT); -+ return 0; -+} -+ -+/* Audited by: green(2002.06.14) */ -+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_unix_stat); -+} -+ -+static int save_unix_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_unix_stat *sd; -+ -+ assert("nikita-642", inode != NULL); -+ assert("nikita-643", area != NULL); -+ assert("nikita-644", *area != NULL); -+ -+ sd = (reiser4_unix_stat *) * area; -+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid); -+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid); -+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime); -+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) -+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev); -+ else -+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int -+present_large_times_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ if (*len >= (int)sizeof(reiser4_large_times_stat)) { -+ reiser4_large_times_stat *sd_lt; -+ -+ sd_lt = (reiser4_large_times_stat *) * area; -+ -+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime)); -+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime)); -+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime)); -+ -+ move_on(len, area, sizeof *sd_lt); -+ return 0; -+ } else -+ return not_enough_space(inode, "large times sd"); -+} -+ -+static int -+save_len_large_times_sd(struct inode *inode UNUSED_ARG -+ /* object being processed */ ) -+{ -+ return sizeof(reiser4_large_times_stat); -+} -+ -+static int -+save_large_times_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_large_times_stat *sd; -+ -+ assert("nikita-2817", inode != NULL); -+ assert("nikita-2818", area != NULL); -+ assert("nikita-2819", *area != NULL); -+ -+ sd = (reiser4_large_times_stat *) * area; -+ -+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime); -+ -+ *area += sizeof *sd; -+ return 0; -+} -+ -+/* symlink stat data extension */ -+ -+/* allocate memory for symlink target and attach it to inode->i_private */ -+static int -+symlink_target_to_inode(struct inode *inode, const char *target, int len) -+{ -+ assert("vs-845", inode->i_private == NULL); -+ assert("vs-846", !reiser4_inode_get_flag(inode, -+ REISER4_GENERIC_PTR_USED)); -+ /* FIXME-VS: this is prone to deadlock. Not more than other similar -+ places, though */ -+ inode->i_private = kmalloc((size_t) len + 1, -+ reiser4_ctx_gfp_mask_get()); -+ if (!inode->i_private) -+ return RETERR(-ENOMEM); -+ -+ memcpy((char *)(inode->i_private), target, (size_t) len); -+ ((char *)(inode->i_private))[len] = 0; -+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED); -+ return 0; -+} -+ -+/* this is called on read_inode. 
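symlink_target_to_inode() above keeps a private, NUL-terminated copy of the link target, which is why it allocates len + 1 bytes. The same convention as a standalone userspace sketch (hypothetical helper, not part of the patch; malloc stands in for kmalloc):

#include <stdlib.h>
#include <string.h>

/* Sketch only: duplicate a length-delimited name into a fresh
 * NUL-terminated buffer, mirroring the len + 1 allocation in
 * symlink_target_to_inode(). Returns NULL on allocation failure. */
static char *dup_target(const char *target, size_t len)
{
        char *copy = malloc(len + 1);

        if (copy == NULL)
                return NULL;
        memcpy(copy, target, len);
        copy[len] = '\0';
        return copy;
}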
There is nothing to do actually, but some -+ sanity checks */ -+static int present_symlink_sd(struct inode *inode, char **area, int *len) -+{ -+ int result; -+ int length; -+ reiser4_symlink_stat *sd; -+ -+ length = (int)inode->i_size; -+ /* -+ * *len is number of bytes in stat data item from *area to the end of -+ * item. It must be not less than size of symlink + 1 for ending 0 -+ */ -+ if (length > *len) -+ return not_enough_space(inode, "symlink"); -+ -+ if (*(*area + length) != 0) { -+ warning("vs-840", "Symlink is not zero terminated"); -+ return RETERR(-EIO); -+ } -+ -+ sd = (reiser4_symlink_stat *) * area; -+ result = symlink_target_to_inode(inode, sd->body, length); -+ -+ move_on(len, area, length + 1); -+ return result; -+} -+ -+static int save_len_symlink_sd(struct inode *inode) -+{ -+ return inode->i_size + 1; -+} -+ -+/* this is called on create and update stat data. Do nothing on update but -+ update @area */ -+static int save_symlink_sd(struct inode *inode, char **area) -+{ -+ int result; -+ int length; -+ reiser4_symlink_stat *sd; -+ -+ length = (int)inode->i_size; -+ /* inode->i_size must be set already */ -+ assert("vs-841", length); -+ -+ result = 0; -+ sd = (reiser4_symlink_stat *) * area; -+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) { -+ const char *target; -+ -+ target = (const char *)(inode->i_private); -+ inode->i_private = NULL; -+ -+ result = symlink_target_to_inode(inode, target, length); -+ -+ /* copy symlink to stat data */ -+ memcpy(sd->body, target, (size_t) length); -+ (*area)[length] = 0; -+ } else { -+ /* there is nothing to do in update but move area */ -+ assert("vs-844", -+ !memcmp(inode->i_private, sd->body, -+ (size_t) length + 1)); -+ } -+ -+ *area += (length + 1); -+ return result; -+} -+ -+static int present_flags_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ assert("nikita-645", inode != NULL); -+ assert("nikita-646", area != NULL); -+ assert("nikita-647", *area != NULL); -+ assert("nikita-648", len != NULL); -+ assert("nikita-649", *len > 0); -+ -+ if (*len >= (int)sizeof(reiser4_flags_stat)) { -+ reiser4_flags_stat *sd; -+ -+ sd = (reiser4_flags_stat *) * area; -+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags)); -+ move_on(len, area, sizeof *sd); -+ return 0; -+ } else -+ return not_enough_space(inode, "generation and attrs"); -+} -+ -+/* Audited by: green(2002.06.14) */ -+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_flags_stat); -+} -+ -+static int save_flags_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_flags_stat *sd; -+ -+ assert("nikita-650", inode != NULL); -+ assert("nikita-651", area != NULL); -+ assert("nikita-652", *area != NULL); -+ -+ sd = (reiser4_flags_stat *) * area; -+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int absent_plugin_sd(struct inode *inode); -+static int present_plugin_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */, -+ int is_pset /* 1 if plugin set, 0 if heir set. 
*/) -+{ -+ reiser4_plugin_stat *sd; -+ reiser4_plugin *plugin; -+ reiser4_inode *info; -+ int i; -+ __u16 mask; -+ int result; -+ int num_of_plugins; -+ -+ assert("nikita-653", inode != NULL); -+ assert("nikita-654", area != NULL); -+ assert("nikita-655", *area != NULL); -+ assert("nikita-656", len != NULL); -+ assert("nikita-657", *len > 0); -+ -+ if (*len < (int)sizeof(reiser4_plugin_stat)) -+ return not_enough_space(inode, "plugin"); -+ -+ sd = (reiser4_plugin_stat *) * area; -+ info = reiser4_inode_data(inode); -+ -+ mask = 0; -+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no)); -+ move_on(len, area, sizeof *sd); -+ result = 0; -+ for (i = 0; i < num_of_plugins; ++i) { -+ reiser4_plugin_slot *slot; -+ reiser4_plugin_type type; -+ pset_member memb; -+ -+ slot = (reiser4_plugin_slot *) * area; -+ if (*len < (int)sizeof *slot) -+ return not_enough_space(inode, "additional plugin"); -+ -+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb)); -+ type = aset_member_to_type_unsafe(memb); -+ -+ if (type == REISER4_PLUGIN_TYPES) { -+ warning("nikita-3502", -+ "wrong %s member (%i) for %llu", is_pset ? -+ "pset" : "hset", memb, -+ (unsigned long long)get_inode_oid(inode)); -+ return RETERR(-EINVAL); -+ } -+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode), -+ type, &slot->id); -+ if (plugin == NULL) -+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode); -+ -+ /* plugin is loaded into inode, mark this into inode's -+ bitmask of loaded non-standard plugins */ -+ if (!(mask & (1 << memb))) { -+ mask |= (1 << memb); -+ } else { -+ warning("nikita-658", "duplicate plugin for %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ return RETERR(-EINVAL); -+ } -+ move_on(len, area, sizeof *slot); -+ /* load plugin data, if any */ -+ if (plugin->h.pops != NULL && plugin->h.pops->load) -+ result = plugin->h.pops->load(inode, plugin, area, len); -+ else -+ result = aset_set_unsafe(is_pset ? &info->pset : -+ &info->hset, memb, plugin); -+ if (result) -+ return result; -+ } -+ if (is_pset) { -+ /* if object plugin wasn't loaded from stat-data, guess it by -+ mode bits */ -+ plugin = file_plugin_to_plugin(inode_file_plugin(inode)); -+ if (plugin == NULL) -+ result = absent_plugin_sd(inode); -+ info->plugin_mask = mask; -+ } else -+ info->heir_mask = mask; -+ -+ return result; -+} -+ -+static int present_pset_sd(struct inode *inode, char **area, int *len) { -+ return present_plugin_sd(inode, area, len, 1 /* pset */); -+} -+ -+/* Determine object plugin for @inode based on i_mode. -+ -+ Many objects in reiser4 file system are controlled by standard object -+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on. -+ -+ For such files we don't explicitly store plugin id in object stat -+ data. Rather required plugin is guessed from mode bits, where file "type" -+ is encoded (see stat(2)). 
-+*/ -+static int -+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ ) -+{ -+ int fplug_id; -+ int dplug_id; -+ reiser4_inode *info; -+ -+ assert("nikita-736", inode != NULL); -+ -+ dplug_id = fplug_id = -1; -+ -+ switch (inode->i_mode & S_IFMT) { -+ case S_IFSOCK: -+ case S_IFBLK: -+ case S_IFCHR: -+ case S_IFIFO: -+ fplug_id = SPECIAL_FILE_PLUGIN_ID; -+ break; -+ case S_IFLNK: -+ fplug_id = SYMLINK_FILE_PLUGIN_ID; -+ break; -+ case S_IFDIR: -+ fplug_id = DIRECTORY_FILE_PLUGIN_ID; -+ dplug_id = HASHED_DIR_PLUGIN_ID; -+ break; -+ default: -+ warning("nikita-737", "wrong file mode: %o", inode->i_mode); -+ return RETERR(-EIO); -+ case S_IFREG: -+ fplug_id = UNIX_FILE_PLUGIN_ID; -+ break; -+ } -+ info = reiser4_inode_data(inode); -+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ? -+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL); -+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ? -+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL); -+ return 0; -+} -+ -+/* Audited by: green(2002.06.14) */ -+static int absent_plugin_sd(struct inode *inode /* object being processed */ ) -+{ -+ int result; -+ -+ assert("nikita-659", inode != NULL); -+ -+ result = guess_plugin_by_mode(inode); -+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file", -+ but setup_inode_ops() will call make_bad_inode(). -+ Another, more logical but bit more complex solution is to add -+ "bad-file plugin". */ -+ /* FIXME-VS: activate was called here */ -+ return result; -+} -+ -+/* helper function for plugin_sd_save_len(): calculate how much space -+ required to save state of given plugin */ -+/* Audited by: green(2002.06.14) */ -+static int len_for(reiser4_plugin * plugin /* plugin to save */ , -+ struct inode *inode /* object being processed */ , -+ pset_member memb, -+ int len, int is_pset) -+{ -+ reiser4_inode *info; -+ assert("nikita-661", inode != NULL); -+ -+ if (plugin == NULL) -+ return len; -+ -+ info = reiser4_inode_data(inode); -+ if (is_pset ? -+ info->plugin_mask & (1 << memb) : -+ info->heir_mask & (1 << memb)) { -+ len += sizeof(reiser4_plugin_slot); -+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) { -+ /* non-standard plugin, call method */ -+ /* commented as it is incompatible with alignment -+ * policy in save_plug() -edward */ -+ /* len = round_up(len, plugin->h.pops->alignment); */ -+ len += plugin->h.pops->save_len(inode, plugin); -+ } -+ } -+ return len; -+} -+ -+/* calculate how much space is required to save state of all plugins, -+ associated with inode */ -+static int save_len_plugin_sd(struct inode *inode /* object being processed */, -+ int is_pset) -+{ -+ int len; -+ int last; -+ reiser4_inode *state; -+ pset_member memb; -+ -+ assert("nikita-663", inode != NULL); -+ -+ state = reiser4_inode_data(inode); -+ -+ /* common case: no non-standard plugins */ -+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) -+ return 0; -+ len = sizeof(reiser4_plugin_stat); -+ last = PSET_LAST; -+ -+ for (memb = 0; memb < last; ++memb) { -+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb), -+ inode, memb, len, is_pset); -+ } -+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat)); -+ return len; -+} -+ -+static int save_len_pset_sd(struct inode *inode) { -+ return save_len_plugin_sd(inode, 1 /* pset */); -+} -+ -+/* helper function for plugin_sd_save(): save plugin, associated with -+ inode. 
*/ -+static int save_plug(reiser4_plugin * plugin /* plugin to save */ , -+ struct inode *inode /* object being processed */ , -+ int memb /* what element of pset is saved */ , -+ char **area /* position in stat-data */ , -+ int *count /* incremented if plugin were actually saved. */, -+ int is_pset /* 1 for plugin set, 0 for heir set */) -+{ -+ reiser4_plugin_slot *slot; -+ int fake_len; -+ int result; -+ -+ assert("nikita-665", inode != NULL); -+ assert("nikita-666", area != NULL); -+ assert("nikita-667", *area != NULL); -+ -+ if (plugin == NULL) -+ return 0; -+ -+ if (is_pset ? -+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) : -+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb))) -+ return 0; -+ slot = (reiser4_plugin_slot *) * area; -+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb); -+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id); -+ fake_len = (int)0xffff; -+ move_on(&fake_len, area, sizeof *slot); -+ ++*count; -+ result = 0; -+ if (plugin->h.pops != NULL) { -+ if (plugin->h.pops->save != NULL) -+ result = plugin->h.pops->save(inode, plugin, area); -+ } -+ return result; -+} -+ -+/* save state of all non-standard plugins associated with inode */ -+static int save_plugin_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */, -+ int is_pset /* 1 for pset, 0 for hset */) -+{ -+ int fake_len; -+ int result = 0; -+ int num_of_plugins; -+ reiser4_plugin_stat *sd; -+ reiser4_inode *state; -+ pset_member memb; -+ -+ assert("nikita-669", inode != NULL); -+ assert("nikita-670", area != NULL); -+ assert("nikita-671", *area != NULL); -+ -+ state = reiser4_inode_data(inode); -+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) -+ return 0; -+ sd = (reiser4_plugin_stat *) * area; -+ fake_len = (int)0xffff; -+ move_on(&fake_len, area, sizeof *sd); -+ -+ num_of_plugins = 0; -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ result = save_plug(aset_get(is_pset ? state->pset : state->hset, -+ memb), -+ inode, memb, area, &num_of_plugins, is_pset); -+ if (result != 0) -+ break; -+ } -+ -+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no); -+ return result; -+} -+ -+static int save_pset_sd(struct inode *inode, char **area) { -+ return save_plugin_sd(inode, area, 1 /* pset */); -+} -+ -+static int present_hset_sd(struct inode *inode, char **area, int *len) { -+ return present_plugin_sd(inode, area, len, 0 /* hset */); -+} -+ -+static int save_len_hset_sd(struct inode *inode) { -+ return save_len_plugin_sd(inode, 0 /* pset */); -+} -+ -+static int save_hset_sd(struct inode *inode, char **area) { -+ return save_plugin_sd(inode, area, 0 /* hset */); -+} -+ -+/* helper function for crypto_sd_present(), crypto_sd_save. 
-+ Extract crypto info from stat-data and attach it to inode */ -+static int extract_crypto_info (struct inode * inode, -+ reiser4_crypto_stat * sd) -+{ -+ struct reiser4_crypto_info * info; -+ assert("edward-11", !inode_crypto_info(inode)); -+ assert("edward-1413", -+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)); -+ /* create and attach a crypto-stat without secret key loaded */ -+ info = reiser4_alloc_crypto_info(inode); -+ if (IS_ERR(info)) -+ return PTR_ERR(info); -+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize)); -+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize); -+ reiser4_attach_crypto_info(inode, info); -+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); -+ return 0; -+} -+ -+/* crypto stat-data extension */ -+ -+static int present_crypto_sd(struct inode *inode, char **area, int *len) -+{ -+ int result; -+ reiser4_crypto_stat *sd; -+ digest_plugin *dplug = inode_digest_plugin(inode); -+ -+ assert("edward-06", dplug != NULL); -+ assert("edward-684", dplug->fipsize); -+ assert("edward-07", area != NULL); -+ assert("edward-08", *area != NULL); -+ assert("edward-09", len != NULL); -+ assert("edward-10", *len > 0); -+ -+ if (*len < (int)sizeof(reiser4_crypto_stat)) { -+ return not_enough_space(inode, "crypto-sd"); -+ } -+ /* *len is number of bytes in stat data item from *area to the end of -+ item. It must be not less than size of this extension */ -+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len); -+ -+ sd = (reiser4_crypto_stat *) * area; -+ result = extract_crypto_info(inode, sd); -+ move_on(len, area, sizeof(*sd) + dplug->fipsize); -+ -+ return result; -+} -+ -+static int save_len_crypto_sd(struct inode *inode) -+{ -+ return sizeof(reiser4_crypto_stat) + -+ inode_digest_plugin(inode)->fipsize; -+} -+ -+static int save_crypto_sd(struct inode *inode, char **area) -+{ -+ int result = 0; -+ reiser4_crypto_stat *sd; -+ struct reiser4_crypto_info * info = inode_crypto_info(inode); -+ digest_plugin *dplug = inode_digest_plugin(inode); -+ -+ assert("edward-12", dplug != NULL); -+ assert("edward-13", area != NULL); -+ assert("edward-14", *area != NULL); -+ assert("edward-15", info != NULL); -+ assert("edward-1414", info->keyid != NULL); -+ assert("edward-1415", info->keysize != 0); -+ assert("edward-76", reiser4_inode_data(inode) != NULL); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) { -+ /* file is just created */ -+ sd = (reiser4_crypto_stat *) *area; -+ /* copy everything but private key to the disk stat-data */ -+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize); -+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize); -+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); -+ } -+ *area += (sizeof(*sd) + dplug->fipsize); -+ return result; -+} -+ -+static int eio(struct inode *inode, char **area, int *len) -+{ -+ return RETERR(-EIO); -+} -+ -+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { -+ [LIGHT_WEIGHT_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = LIGHT_WEIGHT_STAT, -+ .pops = NULL, -+ .label = "light-weight sd", -+ .desc = "sd for light-weight files", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_lw_sd, -+ .absent = NULL, -+ .save_len = save_len_lw_sd, -+ .save = save_lw_sd, -+ .alignment = 8 -+ }, -+ [UNIX_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = UNIX_STAT, -+ .pops = NULL, -+ .label = "unix-sd", -+ .desc = "unix stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_unix_sd, -+ .absent = 
absent_unix_sd, -+ .save_len = save_len_unix_sd, -+ .save = save_unix_sd, -+ .alignment = 8 -+ }, -+ [LARGE_TIMES_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = LARGE_TIMES_STAT, -+ .pops = NULL, -+ .label = "64time-sd", -+ .desc = "nanosecond resolution for times", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_large_times_sd, -+ .absent = NULL, -+ .save_len = save_len_large_times_sd, -+ .save = save_large_times_sd, -+ .alignment = 8 -+ }, -+ [SYMLINK_STAT] = { -+ /* stat data of symlink has this extension */ -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = SYMLINK_STAT, -+ .pops = NULL, -+ .label = "symlink-sd", -+ .desc = -+ "stat data is appended with symlink name", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_symlink_sd, -+ .absent = NULL, -+ .save_len = save_len_symlink_sd, -+ .save = save_symlink_sd, -+ .alignment = 8 -+ }, -+ [PLUGIN_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = PLUGIN_STAT, -+ .pops = NULL, -+ .label = "plugin-sd", -+ .desc = "plugin stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_pset_sd, -+ .absent = absent_plugin_sd, -+ .save_len = save_len_pset_sd, -+ .save = save_pset_sd, -+ .alignment = 8 -+ }, -+ [HEIR_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = HEIR_STAT, -+ .pops = NULL, -+ .label = "heir-plugin-sd", -+ .desc = "heir plugin stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_hset_sd, -+ .absent = NULL, -+ .save_len = save_len_hset_sd, -+ .save = save_hset_sd, -+ .alignment = 8 -+ }, -+ [FLAGS_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = FLAGS_STAT, -+ .pops = NULL, -+ .label = "flags-sd", -+ .desc = "inode bit flags", -+ .linkage = {NULL, NULL} -+ }, -+ .present = present_flags_sd, -+ .absent = NULL, -+ .save_len = save_len_flags_sd, -+ .save = save_flags_sd, -+ .alignment = 8 -+ }, -+ [CAPABILITIES_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = CAPABILITIES_STAT, -+ .pops = NULL, -+ .label = "capabilities-sd", -+ .desc = "capabilities", -+ .linkage = {NULL, NULL} -+ }, -+ .present = eio, -+ .absent = NULL, -+ .save_len = save_len_flags_sd, -+ .save = save_flags_sd, -+ .alignment = 8 -+ }, -+ [CRYPTO_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = CRYPTO_STAT, -+ .pops = NULL, -+ .label = "crypto-sd", -+ .desc = "secret key size and id", -+ .linkage = {NULL, NULL} -+ }, -+ .present = present_crypto_sd, -+ .absent = NULL, -+ .save_len = save_len_crypto_sd, -+ .save = save_crypto_sd, -+ .alignment = 8 -+ } -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.30/fs/reiser4/plugin/item/static_stat.h ---- linux-2.6.30.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/static_stat.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,224 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This describes the static_stat item, used to hold all information needed by the stat() syscall. -+ -+In the case where each file has not less than the fields needed by the -+stat() syscall, it is more compact to store those fields in this -+struct. -+ -+If this item does not exist, then all stats are dynamically resolved. 
-+At the moment, we either resolve all stats dynamically or all of them -+statically. If you think this is not fully optimal, and the rest of -+reiser4 is working, then fix it...:-) -+ -+*/ -+ -+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ) -+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+ -+#include <linux/fs.h> /* for struct inode */ -+ -+/* Stat data layout: goals and implementation. -+ -+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to -+ them, including not having semantic metadata attached to them. -+ -+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you -+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically -+ sized structure because the statically sized structure knows without recording it what the names and lengths of the -+ attributes are. -+ -+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file -+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix -+ file in their use of file attributes. -+ -+ Yet this compromise deserves to be compromised a little. -+ -+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension -+ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum). -+ -+ If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited -+ from parent directory (as uid, gid) or initialised to some sane values. -+ -+ To capitalize on existing code infrastructure, extensions are -+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE. -+ Each stat-data extension plugin implements four methods: -+ -+ ->present() called by sd_load() when this extension is found in stat-data -+ ->absent() called by sd_load() when this extension is not found in stat-data -+ ->save_len() called by sd_len() to calculate total length of stat-data -+ ->save() called by sd_save() to store extension data into stat-data -+ -+ Implementation is in fs/reiser4/plugin/item/static_stat.c -+*/ -+ -+/* stat-data extension. Please order this by presumed frequency of use */ -+typedef enum { -+ /* support for light-weight files */ -+ LIGHT_WEIGHT_STAT, -+ /* data required to implement unix stat(2) call. Layout is in -+ reiser4_unix_stat. If this is not present, file is light-weight */ -+ UNIX_STAT, -+ /* this contains additional set of 32bit [anc]time fields to implement -+ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage -+ if this extension is governed by 32bittimes mount option. */ -+ LARGE_TIMES_STAT, -+ /* stat data has link name included */ -+ SYMLINK_STAT, -+ /* on-disk slots of non-standard plugins for main plugin table -+ (@reiser4_inode->pset), that is, plugins that cannot be deduced -+ from file mode bits), for example, aggregation, interpolation etc. */ -+ PLUGIN_STAT, -+ /* this extension contains persistent inode flags. These flags are -+ single bits: immutable, append, only, etc. Layout is in -+ reiser4_flags_stat. */ -+ FLAGS_STAT, -+ /* this extension contains capabilities sets, associated with this -+ file. 
Layout is in reiser4_capabilities_stat */ -+ CAPABILITIES_STAT, -+ /* this extension contains size and public id of the secret key. -+ Layout is in reiser4_crypto_stat */ -+ CRYPTO_STAT, -+ /* on-disk slots of non-default plugins for inheritance, which -+ are extracted to special plugin table (@reiser4_inode->hset). -+ By default, children of the object will inherit plugins from -+ its main plugin table (pset). */ -+ HEIR_STAT, -+ LAST_SD_EXTENSION, -+ /* -+ * init_inode_static_sd() iterates over extension mask until all -+ * non-zero bits are processed. This means, that neither ->present(), -+ * nor ->absent() methods will be called for stat-data extensions that -+ * go after last present extension. But some basic extensions, we want -+ * either ->absent() or ->present() method to be called, because these -+ * extensions set up something in inode even when they are not -+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all -+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either -+ * ->present(), or ->absent() method will be called, independently of -+ * what other extensions are present. -+ */ -+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT -+} sd_ext_bits; -+ -+/* minimal stat-data. This allows to support light-weight files. */ -+typedef struct reiser4_stat_data_base { -+ /* 0 */ __le16 extmask; -+ /* 2 */ -+} PACKED reiser4_stat_data_base; -+ -+typedef struct reiser4_light_weight_stat { -+ /* 0 */ __le16 mode; -+ /* 2 */ __le32 nlink; -+ /* 6 */ __le64 size; -+ /* size in bytes */ -+ /* 14 */ -+} PACKED reiser4_light_weight_stat; -+ -+typedef struct reiser4_unix_stat { -+ /* owner id */ -+ /* 0 */ __le32 uid; -+ /* group id */ -+ /* 4 */ __le32 gid; -+ /* access time */ -+ /* 8 */ __le32 atime; -+ /* modification time */ -+ /* 12 */ __le32 mtime; -+ /* change time */ -+ /* 16 */ __le32 ctime; -+ union { -+ /* minor:major for device files */ -+ /* 20 */ __le64 rdev; -+ /* bytes used by file */ -+ /* 20 */ __le64 bytes; -+ } u; -+ /* 28 */ -+} PACKED reiser4_unix_stat; -+ -+/* symlink stored as part of inode */ -+typedef struct reiser4_symlink_stat { -+ char body[0]; -+} PACKED reiser4_symlink_stat; -+ -+typedef struct reiser4_plugin_slot { -+ /* 0 */ __le16 pset_memb; -+ /* 2 */ __le16 id; -+ /* 4 *//* here plugin stores its persistent state */ -+} PACKED reiser4_plugin_slot; -+ -+/* stat-data extension for files with non-standard plugin. */ -+typedef struct reiser4_plugin_stat { -+ /* number of additional plugins, associated with this object */ -+ /* 0 */ __le16 plugins_no; -+ /* 2 */ reiser4_plugin_slot slot[0]; -+ /* 2 */ -+} PACKED reiser4_plugin_stat; -+ -+/* stat-data extension for inode flags. Currently it is just fixed-width 32 -+ * bit mask. If need arise, this can be replaced with variable width -+ * bitmask. 
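The byte offsets annotated in these PACKED structures can be checked mechanically. A userspace sketch of such a check for the unix-stat layout (a hypothetical mirror type, not part of the patch, with plain fixed-width integers standing in for the on-disk __le* types):

#include <stdint.h>

/* Sketch only: a packed mirror of reiser4_unix_stat, used to verify
 * the offset comments above at compile time. */
struct unix_stat_mirror {
        uint32_t uid;           /* 0 */
        uint32_t gid;           /* 4 */
        uint32_t atime;         /* 8 */
        uint32_t mtime;         /* 12 */
        uint32_t ctime;         /* 16 */
        union {
                uint64_t rdev;  /* 20 */
                uint64_t bytes; /* 20 */
        } u;
} __attribute__((packed));

_Static_assert(sizeof(struct unix_stat_mirror) == 28,
               "matches the /* 28 */ end-of-struct comment");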
*/ -+typedef struct reiser4_flags_stat { -+ /* 0 */ __le32 flags; -+ /* 4 */ -+} PACKED reiser4_flags_stat; -+ -+typedef struct reiser4_capabilities_stat { -+ /* 0 */ __le32 effective; -+ /* 8 */ __le32 permitted; -+ /* 16 */ -+} PACKED reiser4_capabilities_stat; -+ -+typedef struct reiser4_cluster_stat { -+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */ -+ /* 0 */ d8 cluster_shift; -+ /* 1 */ -+} PACKED reiser4_cluster_stat; -+ -+typedef struct reiser4_crypto_stat { -+ /* secret key size, bits */ -+ /* 0 */ d16 keysize; -+ /* secret key id */ -+ /* 2 */ d8 keyid[0]; -+ /* 2 */ -+} PACKED reiser4_crypto_stat; -+ -+typedef struct reiser4_large_times_stat { -+ /* access time */ -+ /* 0 */ d32 atime; -+ /* modification time */ -+ /* 4 */ d32 mtime; -+ /* change time */ -+ /* 8 */ d32 ctime; -+ /* 12 */ -+} PACKED reiser4_large_times_stat; -+ -+/* this structure is filled by sd_item_stat */ -+typedef struct sd_stat { -+ int dirs; -+ int files; -+ int others; -+} sd_stat; -+ -+/* plugin->item.common.* */ -+extern void print_sd(const char *prefix, coord_t * coord); -+extern void item_stat_static_sd(const coord_t * coord, void *vp); -+ -+/* plugin->item.s.sd.* */ -+extern int init_inode_static_sd(struct inode *inode, char *sd, int len); -+extern int save_len_static_sd(struct inode *inode); -+extern int save_static_sd(struct inode *inode, char **area); -+ -+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/tail.c linux-2.6.30/fs/reiser4/plugin/item/tail.c ---- linux-2.6.30.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/item/tail.c 2009-06-22 17:36:20.000000000 +0200 -@@ -0,0 +1,807 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../../carry.h" -+#include "../../vfs_ops.h" -+ -+#include <linux/quotaops.h> -+#include <asm/uaccess.h> -+#include <linux/swap.h> -+#include <linux/writeback.h> -+ -+/* plugin->u.item.b.max_key_inside */ -+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(reiser4_max_key())); -+ return key; -+} -+ -+/* plugin->u.item.b.can_contain_key */ -+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key, -+ const reiser4_item_data *data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key)) -+ return 0; -+ -+ return 1; -+} -+ -+/* plugin->u.item.b.mergeable -+ first item is of tail type */ -+/* Audited by: green(2002.06.14) */ -+int mergeable_tail(const coord_t *p1, const coord_t *p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1), -+ UNIX_FILE_METADATA_ITEM_TYPE)); -+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID); -+ -+ if (item_id_by_coord(p2) != FORMATTING_ID) { -+ /* second item is of another type */ -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) 
|| -+ get_key_objectid(&key1) != get_key_objectid(&key2) -+ || get_key_type(&key1) != get_key_type(&key2)) { -+ /* items of different objects */ -+ return 0; -+ } -+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) { -+ /* not adjacent items */ -+ return 0; -+ } -+ return 1; -+} -+ -+/* plugin->u.item.b.print -+ plugin->u.item.b.check */ -+ -+/* plugin->u.item.b.nr_units */ -+pos_in_node_t nr_units_tail(const coord_t * coord) -+{ -+ return item_length_by_coord(coord); -+} -+ -+/* plugin->u.item.b.lookup */ -+lookup_result -+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord) -+{ -+ reiser4_key item_key; -+ __u64 lookuped, offset; -+ unsigned nr_units; -+ -+ item_key_by_coord(coord, &item_key); -+ offset = get_key_offset(item_key_by_coord(coord, &item_key)); -+ nr_units = nr_units_tail(coord); -+ -+ /* key we are looking for must be greater than key of item @coord */ -+ assert("vs-416", keygt(key, &item_key)); -+ -+ /* offset we are looking for */ -+ lookuped = get_key_offset(key); -+ -+ if (lookuped >= offset && lookuped < offset + nr_units) { -+ /* byte we are looking for is in this item */ -+ coord->unit_pos = lookuped - offset; -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ } -+ -+ /* set coord after last unit */ -+ coord->unit_pos = nr_units - 1; -+ coord->between = AFTER_UNIT; -+ return bias == -+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND; -+} -+ -+/* plugin->u.item.b.paste */ -+int -+paste_tail(coord_t *coord, reiser4_item_data *data, -+ carry_plugin_info *info UNUSED_ARG) -+{ -+ unsigned old_item_length; -+ char *item; -+ -+ /* length the item had before resizing has been performed */ -+ old_item_length = item_length_by_coord(coord) - data->length; -+ -+ /* tail items never get pasted in the middle */ -+ assert("vs-363", -+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) || -+ (coord->unit_pos == old_item_length - 1 && -+ coord->between == AFTER_UNIT) || -+ (coord->unit_pos == 0 && old_item_length == 0 -+ && coord->between == AT_UNIT)); -+ -+ item = item_body_by_coord(coord); -+ if (coord->unit_pos == 0) -+ /* make space for pasted data when pasting at the beginning of -+ the item */ -+ memmove(item + data->length, item, old_item_length); -+ -+ if (coord->between == AFTER_UNIT) -+ coord->unit_pos++; -+ -+ if (data->data) { -+ assert("vs-554", data->user == 0 || data->user == 1); -+ if (data->user) { -+ assert("nikita-3035", reiser4_schedulable()); -+ /* copy from user space */ -+ if (__copy_from_user(item + coord->unit_pos, -+ (const char __user *)data->data, -+ (unsigned)data->length)) -+ return RETERR(-EFAULT); -+ } else -+ /* copy from kernel space */ -+ memcpy(item + coord->unit_pos, data->data, -+ (unsigned)data->length); -+ } else { -+ memset(item + coord->unit_pos, 0, (unsigned)data->length); -+ } -+ return 0; -+} -+ -+/* plugin->u.item.b.fast_paste */ -+ -+/* plugin->u.item.b.can_shift -+ number of units is returned via return value, number of bytes via @size. 
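Once the keys are unpacked, mergeable_tail() above reduces to a simple adjacency rule: two tail items of the same object merge exactly when the first ends at the byte where the second begins. As a standalone sketch under that reading (hypothetical descriptor type, not part of the patch; the locality and key-type checks are omitted for brevity):

#include <stdbool.h>
#include <stdint.h>

/* Sketch only: the adjacency test behind mergeable_tail(), with the
 * key fields pulled out into a plain descriptor. For tail items the
 * unit count equals the item length in bytes. */
struct tail_desc {
        uint64_t objectid;      /* which file the item belongs to */
        uint64_t offset;        /* file offset of the first byte */
        uint64_t bytes;         /* item length (= nr_units) */
};

static bool tails_mergeable(const struct tail_desc *a,
                            const struct tail_desc *b)
{
        return a->objectid == b->objectid &&
               a->offset + a->bytes == b->offset;
}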
For -+ tail items they coincide */ -+int -+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG, -+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG, -+ unsigned *size, unsigned want) -+{ -+ /* make sure that that we do not want to shift more than we have */ -+ assert("vs-364", want > 0 -+ && want <= (unsigned)item_length_by_coord(source)); -+ -+ *size = min(want, free_space); -+ return *size; -+} -+ -+/* plugin->u.item.b.copy_units */ -+void -+copy_units_tail(coord_t * target, coord_t * source, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, -+ unsigned free_space UNUSED_ARG) -+{ -+ /* make sure that item @target is expanded already */ -+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count); -+ assert("vs-370", free_space >= count); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ /* append item @target with @count first bytes of @source */ -+ assert("vs-365", from == 0); -+ -+ memcpy((char *)item_body_by_coord(target) + -+ item_length_by_coord(target) - count, -+ (char *)item_body_by_coord(source), count); -+ } else { -+ /* target item is moved to right already */ -+ reiser4_key key; -+ -+ assert("vs-367", -+ (unsigned)item_length_by_coord(source) == from + count); -+ -+ memcpy((char *)item_body_by_coord(target), -+ (char *)item_body_by_coord(source) + from, count); -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ item_key_by_coord(source, &key); -+ set_key_offset(&key, get_key_offset(&key) + from); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+} -+ -+/* plugin->u.item.b.create_hook */ -+ -+/* item_plugin->b.kill_hook -+ this is called when @count units starting from @from-th one are going to be removed -+ */ -+int -+kill_hook_tail(const coord_t * coord, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *kdata) -+{ -+ reiser4_key key; -+ loff_t start, end; -+ -+ assert("vs-1577", kdata); -+ assert("vs-1579", kdata->inode); -+ -+ item_key_by_coord(coord, &key); -+ start = get_key_offset(&key) + from; -+ end = start + count; -+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate); -+ return 0; -+} -+ -+/* plugin->u.item.b.shift_hook */ -+ -+/* helper for kill_units_tail and cut_units_tail */ -+static int -+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ pos_in_node_t count; -+ -+ /* this method is only called to remove part of item */ -+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord)); -+ /* tails items are never cut from the middle of an item */ -+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord))); -+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord))); -+ -+ count = to - from + 1; -+ -+ if (smallest_removed) { -+ /* store smallest key removed */ -+ item_key_by_coord(coord, smallest_removed); -+ set_key_offset(smallest_removed, -+ get_key_offset(smallest_removed) + from); -+ } -+ if (new_first) { -+ /* head of item is cut */ -+ assert("vs-1529", from == 0); -+ -+ item_key_by_coord(coord, new_first); -+ set_key_offset(new_first, -+ get_key_offset(new_first) + from + count); -+ } -+ -+ if (REISER4_DEBUG) -+ memset((char *)item_body_by_coord(coord) + from, 0, count); -+ return count; -+} -+ -+/* plugin->u.item.b.cut_units */ -+int -+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *cdata UNUSED_ARG, -+ 
reiser4_key * smallest_removed, reiser4_key * new_first)
-+{
-+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
-+}
-+
-+/* plugin->u.item.b.kill_units */
-+int
-+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
-+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
-+ reiser4_key * new_first)
-+{
-+ kill_hook_tail(coord, from, to - from + 1, kdata);
-+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
-+}
-+
-+/* plugin->u.item.b.unit_key */
-+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
-+{
-+ assert("vs-375", coord_is_existing_unit(coord));
-+
-+ item_key_by_coord(coord, key);
-+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
-+
-+ return key;
-+}
-+
-+/* plugin->u.item.b.estimate
-+ plugin->u.item.b.item_data_by_flow */
-+
-+/* tail readpage function. It is called from readpage_tail(). */
-+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
-+{
-+ tap_t tap;
-+ int result;
-+ coord_t coord;
-+ lock_handle lh;
-+ int count, mapped;
-+ struct inode *inode;
-+ char *pagedata;
-+
-+ /* save the passed coord so that it is not moved by tap. */
-+ init_lh(&lh);
-+ copy_lh(&lh, uf_coord->lh);
-+ inode = page->mapping->host;
-+ coord_dup(&coord, &uf_coord->coord);
-+
-+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
-+
-+ if ((result = reiser4_tap_load(&tap)))
-+ goto out_tap_done;
-+
-+ /* lookup until page is filled up. */
-+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
-+ /* number of bytes to be copied to page */
-+ count = item_length_by_coord(&coord) - coord.unit_pos;
-+ if (count > PAGE_CACHE_SIZE - mapped)
-+ count = PAGE_CACHE_SIZE - mapped;
-+
-+ /* attach @page to address space and get data address */
-+ pagedata = kmap_atomic(page, KM_USER0);
-+
-+ /* copy tail item to page */
-+ memcpy(pagedata + mapped,
-+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
-+ count);
-+ mapped += count;
-+
-+ flush_dcache_page(page);
-+
-+ /* detach page from address space */
-+ kunmap_atomic(pagedata, KM_USER0);
-+
-+ /* Getting next tail item. */
-+ if (mapped < PAGE_CACHE_SIZE) {
-+ /*
-+ * unlock page in order to avoid keeping it locked
-+ * during tree lookup, which takes long-term locks
-+ */
-+ unlock_page(page);
-+
-+ /* getting right neighbour. */
-+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
-+
-+ /* lock page back */
-+ lock_page(page);
-+ if (PageUptodate(page)) {
-+ /*
-+ * another thread read the page, we have
-+ * nothing to do
-+ */
-+ result = 0;
-+ goto out_unlock_page;
-+ }
-+
-+ if (result) {
-+ if (result == -E_NO_NEIGHBOR) {
-+ /*
-+ * right neighbor is not a formatted
-+ * node
-+ */
-+ result = 0;
-+ goto done;
-+ } else {
-+ goto out_tap_relse;
-+ }
-+ } else {
-+ if (!inode_file_plugin(inode)->
-+ owns_item(inode, &coord)) {
-+ /* item of another file is found */
-+ result = 0;
-+ goto done;
-+ }
-+ }
-+ }
-+ }
-+
-+ done:
-+ if (mapped != PAGE_CACHE_SIZE)
-+ zero_user_segment(page, mapped, PAGE_CACHE_SIZE);
-+ SetPageUptodate(page);
-+ out_unlock_page:
-+ unlock_page(page);
-+ out_tap_relse:
-+ reiser4_tap_relse(&tap);
-+ out_tap_done:
-+ reiser4_tap_done(&tap);
-+ return result;
-+}
-+
-+/*
-+ plugin->s.file.readpage
-+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
-+ or
-+ filemap_fault->reiser4_readpage->readpage_unix_file->readpage_tail
-+
-+ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
-+ item.
*/ -+int readpage_tail(void *vp, struct page *page) -+{ -+ uf_coord_t *uf_coord = vp; -+ ON_DEBUG(coord_t * coord = &uf_coord->coord); -+ ON_DEBUG(reiser4_key key); -+ -+ assert("umka-2515", PageLocked(page)); -+ assert("umka-2516", !PageUptodate(page)); -+ assert("umka-2517", !jprivate(page) && !PagePrivate(page)); -+ assert("umka-2518", page->mapping && page->mapping->host); -+ -+ assert("umka-2519", znode_is_loaded(coord->node)); -+ assert("umka-2520", item_is_tail(coord)); -+ assert("umka-2521", coord_is_existing_unit(coord)); -+ assert("umka-2522", znode_is_rlocked(coord->node)); -+ assert("umka-2523", -+ page->mapping->host->i_ino == -+ get_key_objectid(item_key_by_coord(coord, &key))); -+ -+ return do_readpage_tail(uf_coord, page); -+} -+ -+/** -+ * overwrite_tail -+ * @flow: -+ * @coord: -+ * -+ * Overwrites tail item or its part by user data. Returns number of bytes -+ * written or error code. -+ */ -+static int overwrite_tail(flow_t *flow, coord_t *coord) -+{ -+ unsigned count; -+ -+ assert("vs-570", flow->user == 1); -+ assert("vs-946", flow->data); -+ assert("vs-947", coord_is_existing_unit(coord)); -+ assert("vs-948", znode_is_write_locked(coord->node)); -+ assert("nikita-3036", reiser4_schedulable()); -+ -+ count = item_length_by_coord(coord) - coord->unit_pos; -+ if (count > flow->length) -+ count = flow->length; -+ -+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos, -+ (const char __user *)flow->data, count)) -+ return RETERR(-EFAULT); -+ -+ znode_make_dirty(coord->node); -+ return count; -+} -+ -+/** -+ * insert_first_tail -+ * @inode: -+ * @flow: -+ * @coord: -+ * @lh: -+ * -+ * Returns number of bytes written or error code. -+ */ -+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow, -+ coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ loff_t to_write; -+ struct unix_file_info *uf_info; -+ -+ if (get_key_offset(&flow->key) != 0) { -+ /* -+ * file is empty and we have to write not to the beginning of -+ * file. Create a hole at the beginning of file. On success -+ * insert_flow returns 0 as number of written bytes which is -+ * what we have to return on padding a file with holes -+ */ -+ flow->data = NULL; -+ flow->length = get_key_offset(&flow->key); -+ set_key_offset(&flow->key, 0); -+ /* -+ * holes in files built of tails are stored just like if there -+ * were real data which are all zeros. Therefore we have to -+ * allocate quota here as well -+ */ -+ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ vfs_dq_free_space_nodirty(inode, flow->length); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ /* -+ * first item insertion is only possible when writing to empty -+ * file or performing tail conversion -+ */ -+ assert("", (uf_info->container == UF_CONTAINER_EMPTY || -+ (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) && -+ reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)))); -+ /* if file was empty - update its state */ -+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) -+ uf_info->container = UF_CONTAINER_TAILS; -+ return result; -+ } -+ -+ /* check quota before appending data */ -+ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ -+ to_write = flow->length; -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ vfs_dq_free_space_nodirty(inode, flow->length); -+ return (to_write - flow->length) ? 
(to_write - flow->length) : result; -+} -+ -+/** -+ * append_tail -+ * @inode: -+ * @flow: -+ * @coord: -+ * @lh: -+ * -+ * Returns number of bytes written or error code. -+ */ -+static ssize_t append_tail(struct inode *inode, -+ flow_t *flow, coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ reiser4_key append_key; -+ loff_t to_write; -+ -+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) { -+ flow->data = NULL; -+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key); -+ set_key_offset(&flow->key, get_key_offset(&append_key)); -+ /* -+ * holes in files built of tails are stored just like if there -+ * were real data which are all zeros. Therefore we have to -+ * allocate quota here as well -+ */ -+ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ vfs_dq_free_space_nodirty(inode, flow->length); -+ return result; -+ } -+ -+ /* check quota before appending data */ -+ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ -+ to_write = flow->length; -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ vfs_dq_free_space_nodirty(inode, flow->length); -+ return (to_write - flow->length) ? (to_write - flow->length) : result; -+} -+ -+/** -+ * write_tail_reserve_space - reserve space for tail write operation -+ * @inode: -+ * -+ * Estimates and reserves space which may be required for writing one flow to a -+ * file -+ */ -+static int write_extent_reserve_space(struct inode *inode) -+{ -+ __u64 count; -+ reiser4_tree *tree; -+ -+ /* -+ * to write one flow to a file by tails we have to reserve disk space for: -+ -+ * 1. find_file_item may have to insert empty node to the tree (empty -+ * leaf node between two extent items). This requires 1 block and -+ * number of blocks which are necessary to perform insertion of an -+ * internal item into twig level. -+ * -+ * 2. flow insertion -+ * -+ * 3. stat data update -+ */ -+ tree = reiser4_tree_by_inode(inode); -+ count = estimate_one_insert_item(tree) + -+ estimate_insert_flow(tree->height) + -+ estimate_one_insert_item(tree); -+ grab_space_enable(); -+ return reiser4_grab_space(count, 0 /* flags */); -+} -+ -+#define PAGE_PER_FLOW 4 -+ -+static loff_t faultin_user_pages(const char __user *buf, size_t count) -+{ -+ loff_t faulted; -+ int to_fault; -+ -+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE) -+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE; -+ faulted = 0; -+ while (count > 0) { -+ to_fault = PAGE_CACHE_SIZE; -+ if (count < to_fault) -+ to_fault = count; -+ fault_in_pages_readable(buf + faulted, to_fault); -+ count -= to_fault; -+ faulted += to_fault; -+ } -+ return faulted; -+} -+ -+/** -+ * reiser4_write_tail - write method of tail item plugin -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * -+ * Returns number of written bytes or error code. 
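The quota handling in insert_first_tail() and append_tail() above follows a single pattern: reserve quota for the whole flow up front, attempt the insertion, then release the reservation for whatever part of the flow was left unwritten. A minimal user-space sketch of that pattern follows; quota_alloc(), quota_free() and insert_flow() are invented stand-ins for the vfs_dq_*_nodirty() and reiser4_insert_flow() calls, not real APIs:

#include <stdio.h>

static long quota_used;

/* stand-in for vfs_dq_alloc_space_nodirty(); the real call can fail with -EDQUOT */
static int quota_alloc(long n) { quota_used += n; return 0; }

/* stand-in for vfs_dq_free_space_nodirty() */
static void quota_free(long n) { quota_used -= n; }

/* stand-in for reiser4_insert_flow(); pretend only half of the flow fits */
static int insert_flow(long *length) { *length /= 2; return 0; }

int main(void)
{
        long length = 100, to_write = length;
        int result;

        if (quota_alloc(length))
                return 1;               /* RETERR(-EDQUOT) in the patch */
        result = insert_flow(&length);
        if (length)
                quota_free(length);     /* give back the unwritten remainder */
        printf("written=%ld quota_used=%ld result=%d\n",
               to_write - length, quota_used, result);
        return 0;
}

The same accounting intentionally covers holes: as the comments above note, holes in tail-built files are charged as if they were real zero-filled data.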
-+ */ -+ssize_t reiser4_write_tail(struct file *file, struct inode * inode, -+ const char __user *buf, size_t count, loff_t *pos) -+{ -+ struct hint hint; -+ int result; -+ flow_t flow; -+ coord_t *coord; -+ lock_handle *lh; -+ znode *loaded; -+ -+ assert("edward-1548", inode != NULL); -+ -+ if (write_extent_reserve_space(inode)) -+ return RETERR(-ENOSPC); -+ -+ result = load_file_hint(file, &hint); -+ BUG_ON(result != 0); -+ -+ flow.length = faultin_user_pages(buf, count); -+ flow.user = 1; -+ memcpy(&flow.data, &buf, sizeof(buf)); -+ flow.op = WRITE_OP; -+ key_by_inode_and_offset_common(inode, *pos, &flow.key); -+ -+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) -+ return result; -+ -+ coord = &hint.ext_coord.coord; -+ lh = hint.ext_coord.lh; -+ -+ result = zload(coord->node); -+ BUG_ON(result != 0); -+ loaded = coord->node; -+ -+ if (coord->between == AFTER_UNIT) { -+ /* append with data or hole */ -+ result = append_tail(inode, &flow, coord, lh); -+ } else if (coord->between == AT_UNIT) { -+ /* overwrite */ -+ result = overwrite_tail(&flow, coord); -+ } else { -+ /* no items of this file yet. insert data or hole */ -+ result = insert_first_tail(inode, &flow, coord, lh); -+ } -+ zrelse(loaded); -+ if (result < 0) { -+ done_lh(lh); -+ return result; -+ } -+ -+ /* seal and unlock znode */ -+ hint.ext_coord.valid = 0; -+ if (hint.ext_coord.valid) -+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK); -+ else -+ reiser4_unset_hint(&hint); -+ -+ save_file_hint(file, &hint); -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+static int -+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key item_key; -+ -+ assert("vs-1356", coord_is_existing_unit(coord)); -+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key))); -+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key))); -+ return get_key_offset(key) == -+ get_key_offset(&item_key) + coord->unit_pos; -+ -+} -+ -+#endif -+ -+/* plugin->u.item.s.file.read */ -+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint) -+{ -+ unsigned count; -+ int item_length; -+ coord_t *coord; -+ uf_coord_t *uf_coord; -+ -+ uf_coord = &hint->ext_coord; -+ coord = &uf_coord->coord; -+ -+ assert("vs-571", f->user == 1); -+ assert("vs-571", f->data); -+ assert("vs-967", coord && coord->node); -+ assert("vs-1117", znode_is_rlocked(coord->node)); -+ assert("vs-1118", znode_is_loaded(coord->node)); -+ -+ assert("nikita-3037", reiser4_schedulable()); -+ assert("vs-1357", coord_matches_key_tail(coord, &f->key)); -+ -+ /* calculate number of bytes to read off the item */ -+ item_length = item_length_by_coord(coord); -+ count = item_length_by_coord(coord) - coord->unit_pos; -+ if (count > f->length) -+ count = f->length; -+ -+ /* user page has to be brought in so that major page fault does not -+ * occur here when longtem lock is held */ -+ if (__copy_to_user((char __user *)f->data, -+ ((char *)item_body_by_coord(coord) + coord->unit_pos), -+ count)) -+ return RETERR(-EFAULT); -+ -+ /* probably mark_page_accessed() should only be called if -+ * coord->unit_pos is zero. 
*/
-+ mark_page_accessed(znode_page(coord->node));
-+ move_flow_forward(f, count);
-+
-+ coord->unit_pos += count;
-+ if (item_length == coord->unit_pos) {
-+ coord->unit_pos--;
-+ coord->between = AFTER_UNIT;
-+ }
-+ reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
-+ return 0;
-+}
-+
-+/*
-+ plugin->u.item.s.file.append_key
-+ key of the first byte which is next after the last byte addressed by this item
-+*/
-+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
-+{
-+ item_key_by_coord(coord, key);
-+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
-+ return key;
-+}
-+
-+/* plugin->u.item.s.file.init_coord_extension */
-+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
-+{
-+ uf_coord->valid = 1;
-+}
-+
-+/*
-+ plugin->u.item.s.file.get_block
-+*/
-+int
-+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
-+{
-+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
-+
-+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
-+ /* if node hasn't obtained its block number yet, return 0.
-+ * Let's avoid upsetting users with some cosmic numbers beyond
-+ * the device capacity. */
-+ *block = 0;
-+ else
-+ *block = *znode_get_block(coord->node);
-+ return 0;
-+}
-+
-+/*
-+ * Local variables:
-+ * c-indentation-style: "K&R"
-+ * mode-name: "LC"
-+ * c-basic-offset: 8
-+ * tab-width: 8
-+ * fill-column: 79
-+ * scroll-step: 1
-+ * End:
-+ */
-diff -urN linux-2.6.30.orig/fs/reiser4/plugin/item/tail.h linux-2.6.30/fs/reiser4/plugin/item/tail.h
---- linux-2.6.30.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.30/fs/reiser4/plugin/item/tail.h 2009-06-22 16:08:13.000000000 +0200
-@@ -0,0 +1,58 @@
-+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
-+
-+#if !defined( __REISER4_TAIL_H__ )
-+#define __REISER4_TAIL_H__
-+
-+struct tail_coord_extension {
-+ int not_used;
-+};
-+
-+struct cut_list;
-+
-+/* plugin->u.item.b.* */
-+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
-+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
-+ const reiser4_item_data *);
-+int mergeable_tail(const coord_t * p1, const coord_t * p2);
-+pos_in_node_t nr_units_tail(const coord_t *);
-+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
-+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
-+int can_shift_tail(unsigned free_space, coord_t * source,
-+ znode * target, shift_direction, unsigned *size,
-+ unsigned want);
-+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
-+ unsigned count, shift_direction, unsigned free_space);
-+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
-+ struct carry_kill_data *);
-+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
-+ struct carry_cut_data *, reiser4_key * smallest_removed,
-+ reiser4_key * new_first);
-+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
-+ struct carry_kill_data *, reiser4_key * smallest_removed,
-+ reiser4_key * new_first);
-+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
-+
-+/* plugin->u.item.s.* */
-+ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
-+ const char __user *buf, size_t count, loff_t *pos);
-+int reiser4_read_tail(struct file *, flow_t *, hint_t *);
-+int readpage_tail(void *vp, struct page *page);
-+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
-+void
init_coord_extension_tail(uf_coord_t *, loff_t offset); -+int get_block_address_tail(const coord_t *, sector_t, sector_t *); -+int item_balance_dirty_pages(struct address_space *, const flow_t *, -+ hint_t *, int back_to_dirty, int set_hint); -+ -+/* __REISER4_TAIL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/Makefile linux-2.6.30/fs/reiser4/plugin/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,26 @@ -+obj-$(CONFIG_REISER4_FS) += plugins.o -+ -+plugins-objs := \ -+ plugin.o \ -+ plugin_set.o \ -+ object.o \ -+ inode_ops.o \ -+ inode_ops_rename.o \ -+ file_ops.o \ -+ file_ops_readdir.o \ -+ file_plugin_common.o \ -+ dir_plugin_common.o \ -+ digest.o \ -+ hash.o \ -+ fibration.o \ -+ tail_policy.o \ -+ regular.o -+ -+obj-$(CONFIG_REISER4_FS) += item/ -+obj-$(CONFIG_REISER4_FS) += file/ -+obj-$(CONFIG_REISER4_FS) += dir/ -+obj-$(CONFIG_REISER4_FS) += node/ -+obj-$(CONFIG_REISER4_FS) += compress/ -+obj-$(CONFIG_REISER4_FS) += space/ -+obj-$(CONFIG_REISER4_FS) += disk_format/ -+obj-$(CONFIG_REISER4_FS) += security/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/node/Makefile linux-2.6.30/fs/reiser4/plugin/node/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/node/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += node_plugins.o -+ -+node_plugins-objs := \ -+ node.o \ -+ node40.o -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/node/node40.c linux-2.6.30/fs/reiser4/plugin/node/node40.c ---- linux-2.6.30.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/node/node40.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,2924 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "../item/item.h" -+#include "node.h" -+#include "node40.h" -+#include "../plugin.h" -+#include "../../jnode.h" -+#include "../../znode.h" -+#include "../../pool.h" -+#include "../../carry.h" -+#include "../../tap.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../reiser4.h" -+ -+#include <asm/uaccess.h> -+#include <linux/types.h> -+#include <linux/prefetch.h> -+ -+/* leaf 40 format: -+ -+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ] -+ plugin_id (16) key -+ free_space (16) pluginid (16) -+ free_space_start (16) offset (16) -+ level (8) -+ num_items (16) -+ magic (32) -+ flush_time (32) -+*/ -+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". 
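The leaf-format comment above is easier to picture with a toy model: item bodies grow upward from the node header, item headers grow downward from the end of the node, and free space is whatever remains in between. A standalone sketch, with NODE_SIZE, NH_SIZE and IH_SIZE as invented placeholders for the real structure sizes:

#include <stdio.h>

#define NODE_SIZE 4096
#define NH_SIZE   32    /* placeholder for sizeof(node40_header) */
#define IH_SIZE   24    /* placeholder for sizeof(item_header40) */

/* offset of item header pos, counted back from the node end, like node40_ih_at() */
static long ih_at(unsigned pos)
{
        return NODE_SIZE - (long)(pos + 1) * IH_SIZE;
}

int main(void)
{
        unsigned nr_items = 3;
        long free_space_start = NH_SIZE + 1000; /* end of the last item body */
        long free_space = ih_at(nr_items - 1) - free_space_start;

        printf("last header at %ld, free space %ld bytes\n",
               ih_at(nr_items - 1), free_space);
        return 0;
}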
*/ -+/* magic number that is stored in ->magic field of node header */ -+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */ -+ -+static int prepare_for_update(znode * left, znode * right, -+ carry_plugin_info * info); -+ -+/* header of node of reiser40 format is at the beginning of node */ -+static inline node40_header *node40_node_header(const znode * node /* node to -+ * query */ ) -+{ -+ assert("nikita-567", node != NULL); -+ assert("nikita-568", znode_page(node) != NULL); -+ assert("nikita-569", zdata(node) != NULL); -+ return (node40_header *) zdata(node); -+} -+ -+/* functions to get/set fields of node40_header */ -+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic)) -+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space)) -+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start)) -+#define nh40_get_level(nh) get_unaligned(&(nh)->level) -+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items)) -+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id)) -+ -+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic) -+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space) -+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start) -+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level) -+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items) -+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id) -+ -+/* plugin field of node header should be read/set by -+ plugin_by_disk_id/save_disk_plugin */ -+ -+/* array of item headers is at the end of node */ -+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos) -+{ -+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1; -+} -+ -+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1 -+ */ -+static inline item_header40 *node40_ih_at_coord(const coord_t * coord) -+{ -+ return (item_header40 *) (zdata(coord->node) + -+ znode_size(coord->node)) - (coord->item_pos) - -+ 1; -+} -+ -+/* functions to get/set fields of item_header40 */ -+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset)) -+ -+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset) -+ -+/* plugin field of item header should be read/set by -+ plugin_by_disk_id/save_disk_plugin */ -+ -+/* plugin methods */ -+ -+/* plugin->u.node.item_overhead -+ look for description of this method in plugin/node/node.h */ -+size_t -+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG) -+{ -+ return sizeof(item_header40); -+} -+ -+/* plugin->u.node.free_space -+ look for description of this method in plugin/node/node.h */ -+size_t free_space_node40(znode * node) -+{ -+ assert("nikita-577", node != NULL); -+ assert("nikita-578", znode_is_loaded(node)); -+ assert("nikita-579", zdata(node) != NULL); -+ -+ return nh40_get_free_space(node40_node_header(node)); -+} -+ -+/* private inline version of node40_num_of_items() for use in this file. This -+ is necessary, because address of node40_num_of_items() is taken and it is -+ never inlined as a result. 
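The nh40_*() and ih40_*() accessors above always combine get_unaligned()/put_unaligned() with le16_to_cpu()-style byte-order conversion, because the on-disk fields are little-endian and are not guaranteed to be naturally aligned. A portable user-space sketch of that access pattern; get_le16() is a hypothetical helper, not a kernel function:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t get_le16(const void *p)
{
        uint8_t b[2];

        memcpy(b, p, 2);        /* memcpy is safe for unaligned addresses */
        return (uint16_t)(b[0] | (b[1] << 8));
}

int main(void)
{
        unsigned char raw[3] = { 0x00, 0x34, 0x12 };    /* field at an odd offset */

        printf("0x%04x\n", get_le16(raw + 1));          /* prints 0x1234 */
        return 0;
}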
*/ -+static inline short node40_num_of_items_internal(const znode * node) -+{ -+ return nh40_get_num_items(node40_node_header(node)); -+} -+ -+#if REISER4_DEBUG -+static inline void check_num_items(const znode * node) -+{ -+ assert("nikita-2749", -+ node40_num_of_items_internal(node) == node->nr_items); -+ assert("nikita-2746", znode_is_write_locked(node)); -+} -+#else -+#define check_num_items(node) noop -+#endif -+ -+/* plugin->u.node.num_of_items -+ look for description of this method in plugin/node/node.h */ -+int num_of_items_node40(const znode * node) -+{ -+ return node40_num_of_items_internal(node); -+} -+ -+static void -+node40_set_num_items(znode * node, node40_header * nh, unsigned value) -+{ -+ assert("nikita-2751", node != NULL); -+ assert("nikita-2750", nh == node40_node_header(node)); -+ -+ check_num_items(node); -+ nh40_set_num_items(nh, value); -+ node->nr_items = value; -+ check_num_items(node); -+} -+ -+/* plugin->u.node.item_by_coord -+ look for description of this method in plugin/node/node.h */ -+char *item_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ char *p; -+ -+ /* @coord is set to existing item */ -+ assert("nikita-596", coord != NULL); -+ assert("vs-255", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ p = zdata(coord->node) + ih40_get_offset(ih); -+ return p; -+} -+ -+/* plugin->u.node.length_by_coord -+ look for description of this method in plugin/node/node.h */ -+int length_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ int result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-256", coord != NULL); -+ assert("vs-257", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ if ((int)coord->item_pos == -+ node40_num_of_items_internal(coord->node) - 1) -+ result = -+ nh40_get_free_space_start(node40_node_header(coord->node)) - -+ ih40_get_offset(ih); -+ else -+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); -+ -+ return result; -+} -+ -+static pos_in_node_t -+node40_item_length(const znode * node, pos_in_node_t item_pos) -+{ -+ item_header40 *ih; -+ pos_in_node_t result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-256", node != NULL); -+ assert("vs-257", node40_num_of_items_internal(node) > item_pos); -+ -+ ih = node40_ih_at(node, item_pos); -+ if (item_pos == node40_num_of_items_internal(node) - 1) -+ result = -+ nh40_get_free_space_start(node40_node_header(node)) - -+ ih40_get_offset(ih); -+ else -+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); -+ -+ return result; -+} -+ -+/* plugin->u.node.plugin_by_coord -+ look for description of this method in plugin/node/node.h */ -+item_plugin *plugin_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ item_plugin *result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-258", coord != NULL); -+ assert("vs-259", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ /* pass NULL in stead of current tree. This is time critical call. 
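Note that node40 stores no per-item length: length_by_coord_node40() and node40_item_length() above derive it as the distance from an item's offset to the next item's offset, falling back to free_space_start for the last item. A small standalone sketch of that calculation, with invented offsets:

#include <stdio.h>

static int item_length(const unsigned *offsets, unsigned nr_items,
                       unsigned free_space_start, unsigned pos)
{
        if (pos == nr_items - 1)
                return free_space_start - offsets[pos];
        return offsets[pos + 1] - offsets[pos];
}

int main(void)
{
        /* body offsets of three items from the start of the node */
        const unsigned offsets[] = { 32, 100, 250 };
        unsigned free_space_start = 400;
        unsigned i;

        for (i = 0; i < 3; i++)
                printf("item %u: %d bytes\n", i,
                       item_length(offsets, 3, free_space_start, i));
        return 0;
}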
*/ -+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id); -+ return result; -+} -+ -+/* plugin->u.node.key_at -+ look for description of this method in plugin/node/node.h */ -+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key) -+{ -+ item_header40 *ih; -+ -+ assert("nikita-1765", coord_is_existing_item(coord)); -+ -+ /* @coord is set to existing item */ -+ ih = node40_ih_at_coord(coord); -+ memcpy(key, &ih->key, sizeof(reiser4_key)); -+ return key; -+} -+ -+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */ -+ -+#define NODE_INCSTAT(n, counter) \ -+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter) -+ -+#define NODE_ADDSTAT(n, counter, val) \ -+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val) -+ -+/* plugin->u.node.lookup -+ look for description of this method in plugin/node/node.h */ -+node_search_result lookup_node40(znode * node /* node to query */ , -+ const reiser4_key * key /* key to look for */ , -+ lookup_bias bias /* search bias */ , -+ coord_t * coord /* resulting coord */ ) -+{ -+ int left; -+ int right; -+ int found; -+ int items; -+ -+ item_header40 *lefth; -+ item_header40 *righth; -+ -+ item_plugin *iplug; -+ item_header40 *bstop; -+ item_header40 *ih; -+ cmp_t order; -+ -+ assert("nikita-583", node != NULL); -+ assert("nikita-584", key != NULL); -+ assert("nikita-585", coord != NULL); -+ assert("nikita-2693", znode_is_any_locked(node)); -+ cassert(REISER4_SEQ_SEARCH_BREAK > 2); -+ -+ items = node_num_items(node); -+ -+ if (unlikely(items == 0)) { -+ coord_init_first_unit(coord, node); -+ return NS_NOT_FOUND; -+ } -+ -+ /* binary search for item that can contain given key */ -+ left = 0; -+ right = items - 1; -+ coord->node = node; -+ coord_clear_iplug(coord); -+ found = 0; -+ -+ lefth = node40_ih_at(node, left); -+ righth = node40_ih_at(node, right); -+ -+ /* It is known that for small arrays sequential search is on average -+ more efficient than binary. This is because sequential search is -+ coded as tight loop that can be better optimized by compilers and -+ for small array size gain from this optimization makes sequential -+ search the winner. Another, maybe more important, reason for this, -+ is that sequential array is more CPU cache friendly, whereas binary -+ search effectively destroys CPU caching. -+ -+ Critical here is the notion of "smallness". Reasonable value of -+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in -+ fs/reiser4/ulevel/ulevel.c:test_search(). -+ -+ Don't try to further optimize sequential search by scanning from -+ right to left in attempt to use more efficient loop termination -+ condition (comparison with 0). This doesn't work. -+ -+ */ -+ -+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { -+ int median; -+ item_header40 *medianh; -+ -+ median = (left + right) / 2; -+ medianh = node40_ih_at(node, median); -+ -+ assert("nikita-1084", median >= 0); -+ assert("nikita-1085", median < items); -+ switch (keycmp(key, &medianh->key)) { -+ case LESS_THAN: -+ right = median; -+ righth = medianh; -+ break; -+ default: -+ wrong_return_value("nikita-586", "keycmp"); -+ case GREATER_THAN: -+ left = median; -+ lefth = medianh; -+ break; -+ case EQUAL_TO: -+ do { -+ --median; -+ /* headers are ordered from right to left */ -+ ++medianh; -+ } while (median >= 0 && keyeq(key, &medianh->key)); -+ right = left = median + 1; -+ ih = lefth = righth = medianh - 1; -+ found = 1; -+ break; -+ } -+ } -+ /* sequential scan. 
Item headers, and, therefore, keys are stored at -+ the rightmost part of a node from right to left. We are trying to -+ access memory from left to right, and hence, scan in _descending_ -+ order of item numbers. -+ */ -+ if (!found) { -+ for (left = right, ih = righth; left >= 0; ++ih, --left) { -+ cmp_t comparison; -+ -+ prefetchkey(&(ih + 1)->key); -+ comparison = keycmp(&ih->key, key); -+ if (comparison == GREATER_THAN) -+ continue; -+ if (comparison == EQUAL_TO) { -+ found = 1; -+ do { -+ --left; -+ ++ih; -+ } while (left >= 0 && keyeq(&ih->key, key)); -+ ++left; -+ --ih; -+ } else { -+ assert("nikita-1256", comparison == LESS_THAN); -+ } -+ break; -+ } -+ if (unlikely(left < 0)) -+ left = 0; -+ } -+ -+ assert("nikita-3212", right >= left); -+ assert("nikita-3214", -+ equi(found, keyeq(&node40_ih_at(node, left)->key, key))); -+ -+ coord_set_item_pos(coord, left); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ -+ /* key < leftmost key in a mode or node is corrupted and keys -+ are not sorted */ -+ bstop = node40_ih_at(node, (unsigned)left); -+ order = keycmp(&bstop->key, key); -+ if (unlikely(order == GREATER_THAN)) { -+ if (unlikely(left != 0)) { -+ /* screw up */ -+ warning("nikita-587", "Key less than %i key in a node", -+ left); -+ reiser4_print_key("key", key); -+ reiser4_print_key("min", &bstop->key); -+ print_coord_content("coord", coord); -+ return RETERR(-EIO); -+ } else { -+ coord->between = BEFORE_UNIT; -+ return NS_NOT_FOUND; -+ } -+ } -+ /* left <= key, ok */ -+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id); -+ -+ if (unlikely(iplug == NULL)) { -+ warning("nikita-588", "Unknown plugin %i", -+ le16_to_cpu(get_unaligned(&bstop->plugin_id))); -+ reiser4_print_key("key", key); -+ print_coord_content("coord", coord); -+ return RETERR(-EIO); -+ } -+ -+ coord_set_iplug(coord, iplug); -+ -+ /* if exact key from item header was found by binary search, no -+ further checks are necessary. */ -+ if (found) { -+ assert("nikita-1259", order == EQUAL_TO); -+ return NS_FOUND; -+ } -+ if (iplug->b.max_key_inside != NULL) { -+ reiser4_key max_item_key; -+ -+ /* key > max_item_key --- outside of an item */ -+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) { -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ /* FIXME-VS: key we are looking for does not fit into -+ found item. Return NS_NOT_FOUND then. Without that -+ the following case does not work: there is extent of -+ file 10000, 10001. File 10000, 10002 has been just -+ created. When writing to position 0 in that file - -+ traverse_tree will stop here on twig level. When we -+ want it to go down to leaf level -+ */ -+ return NS_NOT_FOUND; -+ } -+ } -+ -+ if (iplug->b.lookup != NULL) { -+ return iplug->b.lookup(key, bias, coord); -+ } else { -+ assert("nikita-1260", order == LESS_THAN); -+ coord->between = AFTER_UNIT; -+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND; -+ } -+} -+ -+#undef NODE_ADDSTAT -+#undef NODE_INCSTAT -+ -+/* plugin->u.node.estimate -+ look for description of this method in plugin/node/node.h */ -+size_t estimate_node40(znode * node) -+{ -+ size_t result; -+ -+ assert("nikita-597", node != NULL); -+ -+ result = free_space_node40(node) - sizeof(item_header40); -+ -+ return (result > 0) ? 
result : 0; -+} -+ -+/* plugin->u.node.check -+ look for description of this method in plugin/node/node.h */ -+int check_node40(const znode * node /* node to check */ , -+ __u32 flags /* check flags */ , -+ const char **error /* where to store error message */ ) -+{ -+ int nr_items; -+ int i; -+ reiser4_key prev; -+ unsigned old_offset; -+ tree_level level; -+ coord_t coord; -+ int result; -+ -+ assert("nikita-580", node != NULL); -+ assert("nikita-581", error != NULL); -+ assert("nikita-2948", znode_is_loaded(node)); -+ -+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ return 0; -+ -+ assert("nikita-582", zdata(node) != NULL); -+ -+ nr_items = node40_num_of_items_internal(node); -+ if (nr_items < 0) { -+ *error = "Negative number of items"; -+ return -1; -+ } -+ -+ if (flags & REISER4_NODE_DKEYS) -+ prev = *znode_get_ld_key((znode *) node); -+ else -+ prev = *reiser4_min_key(); -+ -+ old_offset = 0; -+ coord_init_zero(&coord); -+ coord.node = (znode *) node; -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ level = znode_get_level(node); -+ for (i = 0; i < nr_items; i++) { -+ item_header40 *ih; -+ reiser4_key unit_key; -+ unsigned j; -+ -+ ih = node40_ih_at(node, (unsigned)i); -+ coord_set_item_pos(&coord, i); -+ if ((ih40_get_offset(ih) >= -+ znode_size(node) - nr_items * sizeof(item_header40)) || -+ (ih40_get_offset(ih) < sizeof(node40_header))) { -+ *error = "Offset is out of bounds"; -+ return -1; -+ } -+ if (ih40_get_offset(ih) <= old_offset) { -+ *error = "Offsets are in wrong order"; -+ return -1; -+ } -+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) { -+ *error = "Wrong offset of first item"; -+ return -1; -+ } -+ old_offset = ih40_get_offset(ih); -+ -+ if (keygt(&prev, &ih->key)) { -+ *error = "Keys are in wrong order"; -+ return -1; -+ } -+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) { -+ *error = "Wrong key of first unit"; -+ return -1; -+ } -+ prev = ih->key; -+ for (j = 0; j < coord_num_units(&coord); ++j) { -+ coord.unit_pos = j; -+ unit_key_by_coord(&coord, &unit_key); -+ if (keygt(&prev, &unit_key)) { -+ *error = "Unit keys are in wrong order"; -+ return -1; -+ } -+ prev = unit_key; -+ } -+ coord.unit_pos = 0; -+ if (level != TWIG_LEVEL && item_is_extent(&coord)) { -+ *error = "extent on the wrong level"; -+ return -1; -+ } -+ if (level == LEAF_LEVEL && item_is_internal(&coord)) { -+ *error = "internal item on the wrong level"; -+ return -1; -+ } -+ if (level != LEAF_LEVEL && -+ !item_is_internal(&coord) && !item_is_extent(&coord)) { -+ *error = "wrong item on the internal level"; -+ return -1; -+ } -+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) { -+ *error = "non-internal item on the internal level"; -+ return -1; -+ } -+#if REISER4_DEBUG -+ if (item_plugin_by_coord(&coord)->b.check -+ && item_plugin_by_coord(&coord)->b.check(&coord, error)) -+ return -1; -+#endif -+ if (i) { -+ coord_t prev_coord; -+ /* two neighboring items can not be mergeable */ -+ coord_dup(&prev_coord, &coord); -+ coord_prev_item(&prev_coord); -+ if (are_items_mergeable(&prev_coord, &coord)) { -+ *error = "mergeable items in one node"; -+ return -1; -+ } -+ -+ } -+ } -+ -+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) { -+ coord_t coord; -+ item_plugin *iplug; -+ -+ coord_init_last_unit(&coord, node); -+ iplug = item_plugin_by_coord(&coord); -+ if ((item_is_extent(&coord) || item_is_tail(&coord)) && -+ iplug->s.file.append_key != NULL) { -+ reiser4_key mkey; -+ -+ iplug->s.file.append_key(&coord, &mkey); -+ set_key_offset(&mkey, 
get_key_offset(&mkey) - 1); -+ read_lock_dk(current_tree); -+ result = keygt(&mkey, znode_get_rd_key((znode *) node)); -+ read_unlock_dk(current_tree); -+ if (result) { -+ *error = "key of rightmost item is too large"; -+ return -1; -+ } -+ } -+ } -+ if (flags & REISER4_NODE_DKEYS) { -+ read_lock_tree(current_tree); -+ read_lock_dk(current_tree); -+ -+ flags |= REISER4_NODE_TREE_STABLE; -+ -+ if (keygt(&prev, znode_get_rd_key((znode *) node))) { -+ if (flags & REISER4_NODE_TREE_STABLE) { -+ *error = "Last key is greater than rdkey"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ } -+ if (keygt -+ (znode_get_ld_key((znode *) node), -+ znode_get_rd_key((znode *) node))) { -+ *error = "ldkey is greater than rdkey"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && -+ (node->left != NULL) && -+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) && -+ ergo(flags & REISER4_NODE_TREE_STABLE, -+ !keyeq(znode_get_rd_key(node->left), -+ znode_get_ld_key((znode *) node))) -+ && ergo(!(flags & REISER4_NODE_TREE_STABLE), -+ keygt(znode_get_rd_key(node->left), -+ znode_get_ld_key((znode *) node)))) { -+ *error = "left rdkey or ldkey is wrong"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ (node->right != NULL) && -+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) && -+ ergo(flags & REISER4_NODE_TREE_STABLE, -+ !keyeq(znode_get_rd_key((znode *) node), -+ znode_get_ld_key(node->right))) -+ && ergo(!(flags & REISER4_NODE_TREE_STABLE), -+ keygt(znode_get_rd_key((znode *) node), -+ znode_get_ld_key(node->right)))) { -+ *error = "rdkey or right ldkey is wrong"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.node.parse -+ look for description of this method in plugin/node/node.h */ -+int parse_node40(znode * node /* node to parse */ ) -+{ -+ node40_header *header; -+ int result; -+ d8 level; -+ -+ header = node40_node_header((znode *) node); -+ result = -EIO; -+ level = nh40_get_level(header); -+ if (unlikely(((__u8) znode_get_level(node)) != level)) -+ warning("nikita-494", "Wrong level found in node: %i != %i", -+ znode_get_level(node), level); -+ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC)) -+ warning("nikita-495", -+ "Wrong magic in tree node: want %x, got %x", -+ REISER4_NODE_MAGIC, nh40_get_magic(header)); -+ else { -+ node->nr_items = node40_num_of_items_internal(node); -+ result = 0; -+ } -+ return RETERR(result); -+} -+ -+/* plugin->u.node.init -+ look for description of this method in plugin/node/node.h */ -+int init_node40(znode * node /* node to initialise */ ) -+{ -+ node40_header *header; -+ -+ assert("nikita-570", node != NULL); -+ assert("nikita-572", zdata(node) != NULL); -+ -+ header = node40_node_header(node); -+ memset(header, 0, sizeof(node40_header)); -+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header)); -+ nh40_set_free_space_start(header, sizeof(node40_header)); -+ /* sane hypothesis: 0 in CPU format is 0 in disk format */ -+ /* items: 0 */ -+ save_plugin_id(node_plugin_to_plugin(node->nplug), -+ &header->common_header.plugin_id); -+ nh40_set_level(header, znode_get_level(node)); -+ nh40_set_magic(header, REISER4_NODE_MAGIC); -+ node->nr_items = 0; -+ nh40_set_mkfs_id(header, 
reiser4_mkfs_id(reiser4_get_current_sb())); -+ -+ /* flags: 0 */ -+ return 0; -+} -+ -+#ifdef GUESS_EXISTS -+int guess_node40(const znode * node /* node to guess plugin of */ ) -+{ -+ node40_header *nethack; -+ -+ assert("nikita-1058", node != NULL); -+ nethack = node40_node_header(node); -+ return -+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) && -+ (plugin_by_disk_id(znode_get_tree(node), -+ REISER4_NODE_PLUGIN_TYPE, -+ &nethack->common_header.plugin_id)->h.id == -+ NODE40_ID); -+} -+#endif -+ -+/* plugin->u.node.chage_item_size -+ look for description of this method in plugin/node/node.h */ -+void change_item_size_node40(coord_t * coord, int by) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ char *item_data; -+ int item_length; -+ unsigned i; -+ -+ /* make sure that @item is coord of existing item */ -+ assert("vs-210", coord_is_existing_item(coord)); -+ -+ nh = node40_node_header(coord->node); -+ -+ item_data = item_by_coord_node40(coord); -+ item_length = length_by_coord_node40(coord); -+ -+ /* move item bodies */ -+ ih = node40_ih_at_coord(coord); -+ memmove(item_data + item_length + by, item_data + item_length, -+ nh40_get_free_space_start(node40_node_header(coord->node)) - -+ (ih40_get_offset(ih) + item_length)); -+ -+ /* update offsets of moved items */ -+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) { -+ ih = node40_ih_at(coord->node, i); -+ ih40_set_offset(ih, ih40_get_offset(ih) + by); -+ } -+ -+ /* update node header */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by); -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by); -+} -+ -+static int should_notify_parent(const znode * node) -+{ -+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */ -+ return !disk_addr_eq(znode_get_block(node), -+ &znode_get_tree(node)->root_block); -+} -+ -+/* plugin->u.node.create_item -+ look for description of this method in plugin/node/node.h */ -+int -+create_item_node40(coord_t *target, const reiser4_key *key, -+ reiser4_item_data *data, carry_plugin_info *info) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ unsigned offset; -+ unsigned i; -+ -+ nh = node40_node_header(target->node); -+ -+ assert("vs-212", coord_is_between_items(target)); -+ /* node must have enough free space */ -+ assert("vs-254", -+ free_space_node40(target->node) >= -+ data->length + sizeof(item_header40)); -+ assert("vs-1410", data->length >= 0); -+ -+ if (coord_set_to_right(target)) -+ /* there are not items to the right of @target, so, new item -+ will be inserted after last one */ -+ coord_set_item_pos(target, nh40_get_num_items(nh)); -+ -+ if (target->item_pos < nh40_get_num_items(nh)) { -+ /* there are items to be moved to prepare space for new -+ item */ -+ ih = node40_ih_at_coord(target); -+ /* new item will start at this offset */ -+ offset = ih40_get_offset(ih); -+ -+ memmove(zdata(target->node) + offset + data->length, -+ zdata(target->node) + offset, -+ nh40_get_free_space_start(nh) - offset); -+ /* update headers of moved items */ -+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) { -+ ih = node40_ih_at(target->node, i); -+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length); -+ } -+ -+ /* @ih is set to item header of the last item, move item headers */ -+ memmove(ih - 1, ih, -+ sizeof(item_header40) * (nh40_get_num_items(nh) - -+ target->item_pos)); -+ } else { -+ /* new item will start at this offset */ -+ offset = nh40_get_free_space_start(nh); -+ } -+ -+ /* make item header for the new item */ -+ ih = 
node40_ih_at_coord(target); -+ memcpy(&ih->key, key, sizeof(reiser4_key)); -+ ih40_set_offset(ih, offset); -+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id); -+ -+ /* update node header */ -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - data->length - -+ sizeof(item_header40)); -+ nh40_set_free_space_start(nh, -+ nh40_get_free_space_start(nh) + data->length); -+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1); -+ -+ /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */ -+ target->unit_pos = 0; -+ target->between = AT_UNIT; -+ coord_clear_iplug(target); -+ -+ /* initialize item */ -+ if (data->iplug->b.init != NULL) { -+ data->iplug->b.init(target, NULL, data); -+ } -+ /* copy item body */ -+ if (data->iplug->b.paste != NULL) { -+ data->iplug->b.paste(target, data, info); -+ } else if (data->data != NULL) { -+ if (data->user) { -+ /* AUDIT: Are we really should not check that pointer -+ from userspace was valid and data bytes were -+ available? How will we return -EFAULT of some kind -+ without this check? */ -+ assert("nikita-3038", reiser4_schedulable()); -+ /* copy data from user space */ -+ __copy_from_user(zdata(target->node) + offset, -+ (const char __user *)data->data, -+ (unsigned)data->length); -+ } else -+ /* copy from kernel space */ -+ memcpy(zdata(target->node) + offset, data->data, -+ (unsigned)data->length); -+ } -+ -+ if (target->item_pos == 0) { -+ /* left delimiting key has to be updated */ -+ prepare_for_update(NULL, target->node, info); -+ } -+ -+ if (item_plugin_by_coord(target)->b.create_hook != NULL) { -+ item_plugin_by_coord(target)->b.create_hook(target, data->arg); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.node.update_item_key -+ look for description of this method in plugin/node/node.h */ -+void -+update_item_key_node40(coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info) -+{ -+ item_header40 *ih; -+ -+ ih = node40_ih_at_coord(target); -+ memcpy(&ih->key, key, sizeof(reiser4_key)); -+ -+ if (target->item_pos == 0) { -+ prepare_for_update(NULL, target->node, info); -+ } -+} -+ -+/* this bits encode cut mode */ -+#define CMODE_TAIL 1 -+#define CMODE_WHOLE 2 -+#define CMODE_HEAD 4 -+ -+struct cut40_info { -+ int mode; -+ pos_in_node_t tail_removed; /* position of item which gets tail removed */ -+ pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */ -+ pos_in_node_t removed_count; /* number of items removed completely */ -+ pos_in_node_t head_removed; /* position of item which gets head removed */ -+ -+ pos_in_node_t freed_space_start; -+ pos_in_node_t freed_space_end; -+ pos_in_node_t first_moved; -+ pos_in_node_t head_removed_location; -+}; -+ -+static void init_cinfo(struct cut40_info *cinfo) -+{ -+ cinfo->mode = 0; -+ cinfo->tail_removed = MAX_POS_IN_NODE; -+ cinfo->first_removed = MAX_POS_IN_NODE; -+ cinfo->removed_count = MAX_POS_IN_NODE; -+ cinfo->head_removed = MAX_POS_IN_NODE; -+ cinfo->freed_space_start = MAX_POS_IN_NODE; -+ cinfo->freed_space_end = MAX_POS_IN_NODE; -+ cinfo->first_moved = MAX_POS_IN_NODE; -+ cinfo->head_removed_location = MAX_POS_IN_NODE; -+} -+ -+/* complete cut_node40/kill_node40 content by removing the gap created by */ -+static void compact(znode * node, struct cut40_info *cinfo) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ pos_in_node_t freed; -+ pos_in_node_t pos, nr_items; -+ -+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE && -+ cinfo->freed_space_end != MAX_POS_IN_NODE 
&& -+ cinfo->first_moved != MAX_POS_IN_NODE)); -+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start); -+ -+ nh = node40_node_header(node); -+ nr_items = nh40_get_num_items(nh); -+ -+ /* remove gap made up by removal */ -+ memmove(zdata(node) + cinfo->freed_space_start, -+ zdata(node) + cinfo->freed_space_end, -+ nh40_get_free_space_start(nh) - cinfo->freed_space_end); -+ -+ /* update item headers of moved items - change their locations */ -+ pos = cinfo->first_moved; -+ ih = node40_ih_at(node, pos); -+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) { -+ assert("vs-1580", pos == cinfo->head_removed); -+ ih40_set_offset(ih, cinfo->head_removed_location); -+ pos++; -+ ih--; -+ } -+ -+ freed = cinfo->freed_space_end - cinfo->freed_space_start; -+ for (; pos < nr_items; pos++, ih--) { -+ assert("vs-1581", ih == node40_ih_at(node, pos)); -+ ih40_set_offset(ih, ih40_get_offset(ih) - freed); -+ } -+ -+ /* free space start moved to right */ -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed); -+ -+ if (cinfo->removed_count != MAX_POS_IN_NODE) { -+ /* number of items changed. Remove item headers of those items */ -+ ih = node40_ih_at(node, nr_items - 1); -+ memmove(ih + cinfo->removed_count, ih, -+ sizeof(item_header40) * (nr_items - -+ cinfo->removed_count - -+ cinfo->first_removed)); -+ freed += sizeof(item_header40) * cinfo->removed_count; -+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count); -+ } -+ -+ /* total amount of free space increased */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed); -+} -+ -+int shrink_item_node40(coord_t * coord, int delta) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ pos_in_node_t pos; -+ pos_in_node_t nr_items; -+ char *end; -+ znode *node; -+ int off; -+ -+ assert("nikita-3487", coord != NULL); -+ assert("nikita-3488", delta >= 0); -+ -+ node = coord->node; -+ nh = node40_node_header(node); -+ nr_items = nh40_get_num_items(nh); -+ -+ ih = node40_ih_at_coord(coord); -+ assert("nikita-3489", delta <= length_by_coord_node40(coord)); -+ off = ih40_get_offset(ih) + length_by_coord_node40(coord); -+ end = zdata(node) + off; -+ -+ /* remove gap made up by removal */ -+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off); -+ -+ /* update item headers of moved items - change their locations */ -+ pos = coord->item_pos + 1; -+ ih = node40_ih_at(node, pos); -+ for (; pos < nr_items; pos++, ih--) { -+ assert("nikita-3490", ih == node40_ih_at(node, pos)); -+ ih40_set_offset(ih, ih40_get_offset(ih) - delta); -+ } -+ -+ /* free space start moved to left */ -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta); -+ /* total amount of free space increased */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta); -+ /* -+ * This method does _not_ changes number of items. Hence, it cannot -+ * make node empty. Also it doesn't remove items at all, which means -+ * that no keys have to be updated either. -+ */ -+ return 0; -+} -+ -+/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types -+ of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the -+ rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item -+ getting head cut. 
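parse_cut() below reduces any removal range to at most one partial cut at an item's tail, a run of items removed completely, and at most one partial cut at an item's head, encoded in the CMODE_* bits defined earlier. A standalone sketch of that classification; classify() and its arguments are illustrative only:

#include <stdio.h>

#define CMODE_TAIL  1
#define CMODE_WHOLE 2
#define CMODE_HEAD  4

static int classify(int first_partial, int whole_count, int last_partial)
{
        int mode = 0;

        if (first_partial)
                mode |= CMODE_TAIL;     /* first item loses its tail */
        if (whole_count)
                mode |= CMODE_WHOLE;    /* middle items removed completely */
        if (last_partial)
                mode |= CMODE_HEAD;     /* last item loses its head */
        return mode;
}

int main(void)
{
        /* a range that clips two items partially and removes two whole ones */
        printf("mode=%d\n", classify(1, 2, 1)); /* 1|2|4 = 7 */
        return 0;
}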
Function returns 0 in this case */ -+static int -+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params) -+{ -+ reiser4_key left_key, right_key; -+ reiser4_key min_from_key, max_to_key; -+ const reiser4_key *from_key, *to_key; -+ -+ init_cinfo(cinfo); -+ -+ /* calculate minimal key stored in first item of items to be cut (params->from) */ -+ item_key_by_coord(params->from, &min_from_key); -+ /* and max key stored in last item of items to be cut (params->to) */ -+ max_item_key_by_coord(params->to, &max_to_key); -+ -+ /* if cut key range is not defined in input parameters - define it using cut coord range */ -+ if (params->from_key == NULL) { -+ assert("vs-1513", params->to_key == NULL); -+ unit_key_by_coord(params->from, &left_key); -+ from_key = &left_key; -+ max_unit_key_by_coord(params->to, &right_key); -+ to_key = &right_key; -+ } else { -+ from_key = params->from_key; -+ to_key = params->to_key; -+ } -+ -+ if (params->from->item_pos == params->to->item_pos) { -+ if (keylt(&min_from_key, from_key) -+ && keylt(to_key, &max_to_key)) -+ return 1; -+ -+ if (keygt(from_key, &min_from_key)) { -+ /* tail of item is to be cut cut */ -+ cinfo->tail_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_TAIL; -+ } else if (keylt(to_key, &max_to_key)) { -+ /* head of item is to be cut */ -+ cinfo->head_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_HEAD; -+ } else { -+ /* item is removed completely */ -+ cinfo->first_removed = params->from->item_pos; -+ cinfo->removed_count = 1; -+ cinfo->mode |= CMODE_WHOLE; -+ } -+ } else { -+ cinfo->first_removed = params->from->item_pos + 1; -+ cinfo->removed_count = -+ params->to->item_pos - params->from->item_pos - 1; -+ -+ if (keygt(from_key, &min_from_key)) { -+ /* first item is not cut completely */ -+ cinfo->tail_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_TAIL; -+ } else { -+ cinfo->first_removed--; -+ cinfo->removed_count++; -+ } -+ if (keylt(to_key, &max_to_key)) { -+ /* last item is not cut completely */ -+ cinfo->head_removed = params->to->item_pos; -+ cinfo->mode |= CMODE_HEAD; -+ } else { -+ cinfo->removed_count++; -+ } -+ if (cinfo->removed_count) -+ cinfo->mode |= CMODE_WHOLE; -+ } -+ -+ return 0; -+} -+ -+static void -+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count, -+ carry_kill_data * kdata) -+{ -+ coord_t coord; -+ item_plugin *iplug; -+ pos_in_node_t pos; -+ -+ coord.node = node; -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ for (pos = 0; pos < count; pos++) { -+ coord_set_item_pos(&coord, from + pos); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ iplug = item_plugin_by_coord(&coord); -+ if (iplug->b.kill_hook) { -+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord), -+ kdata); -+ } -+ } -+} -+ -+/* this is used to kill item partially */ -+static pos_in_node_t -+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, -+ reiser4_key * smallest_removed, reiser4_key * new_first_key) -+{ -+ struct carry_kill_data *kdata; -+ item_plugin *iplug; -+ -+ kdata = data; -+ iplug = item_plugin_by_coord(coord); -+ -+ assert("vs-1524", iplug->b.kill_units); -+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed, -+ new_first_key); -+} -+ -+/* call item plugin to cut tail of file */ -+static pos_in_node_t -+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) -+{ -+ struct carry_kill_data *kdata; -+ pos_in_node_t to; -+ -+ kdata = data; -+ to = coord_last_unit_pos(coord); -+ return kill_units(coord, 
coord->unit_pos, to, kdata, smallest_removed, -+ NULL); -+} -+ -+/* call item plugin to cut head of item */ -+static pos_in_node_t -+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed, -+ reiser4_key * new_first_key) -+{ -+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed, -+ new_first_key); -+} -+ -+/* this is used to cut item partially */ -+static pos_in_node_t -+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, -+ reiser4_key * smallest_removed, reiser4_key * new_first_key) -+{ -+ carry_cut_data *cdata; -+ item_plugin *iplug; -+ -+ cdata = data; -+ iplug = item_plugin_by_coord(coord); -+ assert("vs-302", iplug->b.cut_units); -+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed, -+ new_first_key); -+} -+ -+/* call item plugin to cut tail of file */ -+static pos_in_node_t -+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) -+{ -+ carry_cut_data *cdata; -+ pos_in_node_t to; -+ -+ cdata = data; -+ to = coord_last_unit_pos(cdata->params.from); -+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL); -+} -+ -+/* call item plugin to cut head of item */ -+static pos_in_node_t -+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed, -+ reiser4_key * new_first_key) -+{ -+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed, -+ new_first_key); -+} -+ -+/* this returns 1 if the key of the first item changed, 0 if it did not */ -+static int -+prepare_for_compact(struct cut40_info *cinfo, -+ const struct cut_kill_params *params, int is_cut, -+ void *data, carry_plugin_info * info) -+{ -+ znode *node; -+ item_header40 *ih; -+ pos_in_node_t freed; -+ pos_in_node_t item_pos; -+ coord_t coord; -+ reiser4_key new_first_key; -+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t, -+ void *, reiser4_key *, reiser4_key *); -+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *); -+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *, -+ reiser4_key *); -+ int retval; -+ -+ retval = 0; -+ -+ node = params->from->node; -+ -+ assert("vs-184", node == params->to->node); -+ assert("vs-312", !node_is_empty(node)); -+ assert("vs-297", -+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT); -+ -+ if (is_cut) { -+ kill_units_f = cut_units; -+ kill_tail_f = cut_tail; -+ kill_head_f = cut_head; -+ } else { -+ kill_units_f = kill_units; -+ kill_tail_f = kill_tail; -+ kill_head_f = kill_head; -+ } -+ -+ if (parse_cut(cinfo, params) == 1) { -+ /* cut from the middle of item */ -+ freed = -+ kill_units_f(params->from, params->from->unit_pos, -+ params->to->unit_pos, data, -+ params->smallest_removed, NULL); -+ -+ item_pos = params->from->item_pos; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - freed; -+ cinfo->freed_space_end = cinfo->freed_space_start + freed; -+ cinfo->first_moved = item_pos + 1; -+ } else { -+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE || -+ cinfo->first_removed != MAX_POS_IN_NODE || -+ cinfo->head_removed != MAX_POS_IN_NODE)); -+ -+ switch (cinfo->mode) { -+ case CMODE_TAIL: -+ /* one item gets cut partially from its end */ -+ assert("vs-1562", -+ cinfo->tail_removed == params->from->item_pos); -+ -+ freed = -+ kill_tail_f(params->from, data, -+ params->smallest_removed); -+ -+ item_pos = cinfo->tail_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + 
node40_item_length(node, -+ item_pos) - -+ freed; -+ cinfo->freed_space_end = -+ cinfo->freed_space_start + freed; -+ cinfo->first_moved = cinfo->tail_removed + 1; -+ break; -+ -+ case CMODE_WHOLE: -+ /* one or more items get removed completely */ -+ assert("vs-1563", -+ cinfo->first_removed == params->from->item_pos); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ -+ /* call kill hook for all items removed completely */ -+ if (is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos = cinfo->first_removed; -+ ih = node40_ih_at(node, item_pos); -+ -+ if (params->smallest_removed) -+ memcpy(params->smallest_removed, &ih->key, -+ sizeof(reiser4_key)); -+ -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ -+ item_pos += (cinfo->removed_count - 1); -+ ih -= (cinfo->removed_count - 1); -+ cinfo->freed_space_end = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos); -+ cinfo->first_moved = item_pos + 1; -+ if (cinfo->first_removed == 0) -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_HEAD: -+ /* one item gets cut partially from its head */ -+ assert("vs-1565", -+ cinfo->head_removed == params->from->item_pos); -+ -+ freed = -+ kill_head_f(params->to, data, -+ params->smallest_removed, -+ &new_first_key); -+ -+ item_pos = cinfo->head_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ cinfo->freed_space_end = ih40_get_offset(ih) + freed; -+ cinfo->first_moved = cinfo->head_removed + 1; -+ -+ /* item head is removed, therefore, item key changed */ -+ coord.node = node; -+ coord_set_item_pos(&coord, item_pos); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ update_item_key_node40(&coord, &new_first_key, NULL); -+ if (item_pos == 0) -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_TAIL | CMODE_WHOLE: -+ /* one item gets cut from its end and one or more items get removed completely */ -+ assert("vs-1566", -+ cinfo->tail_removed == params->from->item_pos); -+ assert("vs-1567", -+ cinfo->first_removed == cinfo->tail_removed + 1); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ -+ freed = -+ kill_tail_f(params->from, data, -+ params->smallest_removed); -+ -+ item_pos = cinfo->tail_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - -+ freed; -+ -+ /* call kill hook for all items removed completely */ -+ if (is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos += cinfo->removed_count; -+ ih -= cinfo->removed_count; -+ cinfo->freed_space_end = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos); -+ cinfo->first_moved = item_pos + 1; -+ break; -+ -+ case CMODE_WHOLE | CMODE_HEAD: -+ /* one or more items get removed completely and one item gets cut partially from its head */ -+ assert("vs-1568", -+ cinfo->first_removed == params->from->item_pos); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ assert("vs-1569", -+ cinfo->head_removed == -+ cinfo->first_removed + cinfo->removed_count); -+ -+ /* call kill hook for all items removed completely */ -+ if (is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos = cinfo->first_removed; -+ ih = node40_ih_at(node, 
item_pos); -+ -+ if (params->smallest_removed) -+ memcpy(params->smallest_removed, &ih->key, -+ sizeof(reiser4_key)); -+ -+ freed = -+ kill_head_f(params->to, data, NULL, &new_first_key); -+ -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ -+ ih = node40_ih_at(node, cinfo->head_removed); -+ /* this is the most complex case. The item whose head was removed and the items which are to be moved -+ intact change their locations differently. */ -+ cinfo->freed_space_end = ih40_get_offset(ih) + freed; -+ cinfo->first_moved = cinfo->head_removed; -+ cinfo->head_removed_location = cinfo->freed_space_start; -+ -+ /* item head is removed, therefore, item key changed */ -+ coord.node = node; -+ coord_set_item_pos(&coord, cinfo->head_removed); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ update_item_key_node40(&coord, &new_first_key, NULL); -+ -+ assert("vs-1579", cinfo->first_removed == 0); -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_TAIL | CMODE_HEAD: -+ /* one item gets cut from its end and its neighbor gets cut from its head */ -+ impossible("vs-1576", "this can not happen currently"); -+ break; -+ -+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD: -+ impossible("vs-1577", "this can not happen currently"); -+ break; -+ default: -+ impossible("vs-1578", "unexpected cut mode"); -+ break; -+ } -+ } -+ return retval; -+} -+ -+/* plugin->u.node.kill -+ return value is number of items removed completely */ -+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info) -+{ -+ znode *node; -+ struct cut40_info cinfo; -+ int first_key_changed; -+ -+ node = kdata->params.from->node; -+ -+ first_key_changed = -+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata, -+ info); -+ compact(node, &cinfo); -+ -+ if (info) { -+ /* it is not called by node40_shift, so we have to take care -+ of changes on upper levels */ -+ if (node_is_empty(node) -+ && !(kdata->flags & DELETE_RETAIN_EMPTY)) -+ /* all contents of node are deleted */ -+ prepare_removal_node40(node, info); -+ else if (first_key_changed) { -+ prepare_for_update(NULL, node, info); -+ } -+ } -+ -+ coord_clear_iplug(kdata->params.from); -+ coord_clear_iplug(kdata->params.to); -+ -+ znode_make_dirty(node); -+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count; -+} -+ -+/* plugin->u.node.cut -+ return value is number of items removed completely */ -+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info) -+{ -+ znode *node; -+ struct cut40_info cinfo; -+ int first_key_changed; -+ -+ node = cdata->params.from->node; -+ -+ first_key_changed = -+ prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata, -+ info); -+ compact(node, &cinfo); -+ -+ if (info) { -+ /* it is not called by node40_shift, so we have to take care -+ of changes on upper levels */ -+ if (node_is_empty(node)) -+ /* all contents of node are deleted */ -+ prepare_removal_node40(node, info); -+ else if (first_key_changed) { -+ prepare_for_update(NULL, node, info); -+ } -+ } -+ -+ coord_clear_iplug(cdata->params.from); -+ coord_clear_iplug(cdata->params.to); -+ -+ znode_make_dirty(node); -+ return cinfo.removed_count == MAX_POS_IN_NODE ? 
0 : cinfo.removed_count; -+} -+ -+/* this structure is used by shift method of node40 plugin */ -+struct shift_params { -+ shift_direction pend; /* when @pend == append - we are shifting to -+ left, when @pend == prepend - to right */ -+ coord_t wish_stop; /* when shifting to left this is last unit we -+ want shifted, when shifting to right - this -+ is set to unit we want to start shifting -+ from */ -+ znode *target; -+ int everything; /* it is set to 1 if everything we have to shift is -+ shifted, 0 - otherwise */ -+ -+ /* FIXME-VS: get rid of read_stop */ -+ -+ /* these are set by estimate_shift */ -+ coord_t real_stop; /* this will be set to last unit which will be -+ really shifted */ -+ -+ /* coordinate in source node before operation of unit which becomes -+ first after shift to left of last after shift to right */ -+ union { -+ coord_t future_first; -+ coord_t future_last; -+ } u; -+ -+ unsigned merging_units; /* number of units of first item which have to -+ be merged with last item of target node */ -+ unsigned merging_bytes; /* number of bytes in those units */ -+ -+ unsigned entire; /* items shifted in their entirety */ -+ unsigned entire_bytes; /* number of bytes in those items */ -+ -+ unsigned part_units; /* number of units of partially copied item */ -+ unsigned part_bytes; /* number of bytes in those units */ -+ -+ unsigned shift_bytes; /* total number of bytes in items shifted (item -+ headers not included) */ -+ -+}; -+ -+static int item_creation_overhead(coord_t *item) -+{ -+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL); -+} -+ -+/* how many units are there in @source starting from source->unit_pos -+ but not further than @stop_coord */ -+static int -+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend) -+{ -+ if (pend == SHIFT_LEFT) { -+ assert("vs-181", source->unit_pos == 0); -+ } else { -+ assert("vs-182", -+ source->unit_pos == coord_last_unit_pos(source)); -+ } -+ -+ if (source->item_pos != stop_coord->item_pos) { -+ /* @source and @stop_coord are different items */ -+ return coord_last_unit_pos(source) + 1; -+ } -+ -+ if (pend == SHIFT_LEFT) { -+ return stop_coord->unit_pos + 1; -+ } else { -+ return source->unit_pos - stop_coord->unit_pos + 1; -+ } -+} -+ -+/* this calculates what can be copied from @shift->wish_stop.node to -+ @shift->target */ -+static void -+estimate_shift(struct shift_params *shift, const reiser4_context * ctx) -+{ -+ unsigned target_free_space, size; -+ pos_in_node_t stop_item; /* item which estimating should not consider */ -+ unsigned want; /* number of units of item we want shifted */ -+ coord_t source; /* item being estimated */ -+ item_plugin *iplug; -+ -+ /* shifting to left/right starts from first/last units of -+ @shift->wish_stop.node */ -+ if (shift->pend == SHIFT_LEFT) { -+ coord_init_first_unit(&source, shift->wish_stop.node); -+ } else { -+ coord_init_last_unit(&source, shift->wish_stop.node); -+ } -+ shift->real_stop = source; -+ -+ /* free space in target node and number of items in source */ -+ target_free_space = znode_free_space(shift->target); -+ -+ shift->everything = 0; -+ if (!node_is_empty(shift->target)) { -+ /* target node is not empty, check for boundary items -+ mergeability */ -+ coord_t to; -+ -+ /* item we try to merge @source with */ -+ if (shift->pend == SHIFT_LEFT) { -+ coord_init_last_unit(&to, shift->target); -+ } else { -+ coord_init_first_unit(&to, shift->target); -+ } -+ -+ if ((shift->pend == SHIFT_LEFT) ? 
are_items_mergeable(&to, -+ &source) : -+ are_items_mergeable(&source, &to)) { -+ /* how many units of @source do we want to merge to -+ item @to */ -+ want = -+ wanted_units(&source, &shift->wish_stop, -+ shift->pend); -+ -+ /* how many units of @source we can merge to item -+ @to */ -+ iplug = item_plugin_by_coord(&source); -+ if (iplug->b.can_shift != NULL) -+ shift->merging_units = -+ iplug->b.can_shift(target_free_space, -+ &source, shift->target, -+ shift->pend, &size, -+ want); -+ else { -+ shift->merging_units = 0; -+ size = 0; -+ } -+ shift->merging_bytes = size; -+ shift->shift_bytes += size; -+ /* update stop coord to be set to last unit of @source -+ we can merge to @target */ -+ if (shift->merging_units) -+ /* at least one unit can be shifted */ -+ shift->real_stop.unit_pos = -+ (shift->merging_units - source.unit_pos - -+ 1) * shift->pend; -+ else { -+ /* nothing can be shifted */ -+ if (shift->pend == SHIFT_LEFT) -+ coord_init_before_first_item(&shift-> -+ real_stop, -+ source. -+ node); -+ else -+ coord_init_after_last_item(&shift-> -+ real_stop, -+ source.node); -+ } -+ assert("nikita-2081", shift->real_stop.unit_pos + 1); -+ -+ if (shift->merging_units != want) { -+ /* we could not copy as many as we want, so, -+ there is no reason for estimating any -+ longer */ -+ return; -+ } -+ -+ target_free_space -= size; -+ coord_add_item_pos(&source, shift->pend); -+ } -+ } -+ -+ /* number of item nothing of which we want to shift */ -+ stop_item = shift->wish_stop.item_pos + shift->pend; -+ -+ /* calculate how many items can be copied into given free -+ space as whole */ -+ for (; source.item_pos != stop_item; -+ coord_add_item_pos(&source, shift->pend)) { -+ if (shift->pend == SHIFT_RIGHT) -+ source.unit_pos = coord_last_unit_pos(&source); -+ -+ /* how many units of @source do we want to copy */ -+ want = wanted_units(&source, &shift->wish_stop, shift->pend); -+ -+ if (want == coord_last_unit_pos(&source) + 1) { -+ /* we want this item to be copied entirely */ -+ size = -+ item_length_by_coord(&source) + -+ item_creation_overhead(&source); -+ if (size <= target_free_space) { -+ /* item fits into target node as whole */ -+ target_free_space -= size; -+ shift->shift_bytes += -+ size - item_creation_overhead(&source); -+ shift->entire_bytes += -+ size - item_creation_overhead(&source); -+ shift->entire++; -+ -+ /* update shift->real_stop coord to be set to -+ last unit of @source we can merge to -+ @target */ -+ shift->real_stop = source; -+ if (shift->pend == SHIFT_LEFT) -+ shift->real_stop.unit_pos = -+ coord_last_unit_pos(&shift-> -+ real_stop); -+ else -+ shift->real_stop.unit_pos = 0; -+ continue; -+ } -+ } -+ -+ /* we reach here only for an item which does not fit into -+ target node in its entirety. This item may be either -+ partially shifted, or not shifted at all. We will have to -+ create new item in target node, so decrease amout of free -+ space by an item creation overhead. 
We can reach here also -+ if stop coord is in this item */ -+ if (target_free_space >= -+ (unsigned)item_creation_overhead(&source)) { -+ target_free_space -= item_creation_overhead(&source); -+ iplug = item_plugin_by_coord(&source); -+ if (iplug->b.can_shift) { -+ shift->part_units = iplug->b.can_shift(target_free_space, -+ &source, -+ NULL, /* target */ -+ shift->pend, -+ &size, -+ want); -+ } else { -+ target_free_space = 0; -+ shift->part_units = 0; -+ size = 0; -+ } -+ } else { -+ target_free_space = 0; -+ shift->part_units = 0; -+ size = 0; -+ } -+ shift->part_bytes = size; -+ shift->shift_bytes += size; -+ -+ /* set @shift->real_stop to last unit of @source we can merge -+ to @shift->target */ -+ if (shift->part_units) { -+ shift->real_stop = source; -+ shift->real_stop.unit_pos = -+ (shift->part_units - source.unit_pos - -+ 1) * shift->pend; -+ assert("nikita-2082", shift->real_stop.unit_pos + 1); -+ } -+ -+ if (want != shift->part_units) -+ /* not everything wanted were shifted */ -+ return; -+ break; -+ } -+ -+ shift->everything = 1; -+} -+ -+static void -+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count, -+ shift_direction dir, unsigned free_space) -+{ -+ item_plugin *iplug; -+ -+ assert("nikita-1463", target != NULL); -+ assert("nikita-1464", source != NULL); -+ assert("nikita-1465", from + count <= coord_num_units(source)); -+ -+ iplug = item_plugin_by_coord(source); -+ assert("nikita-1468", iplug == item_plugin_by_coord(target)); -+ iplug->b.copy_units(target, source, from, count, dir, free_space); -+ -+ if (dir == SHIFT_RIGHT) { -+ /* FIXME-VS: this looks not necessary. update_item_key was -+ called already by copy_units method */ -+ reiser4_key split_key; -+ -+ assert("nikita-1469", target->unit_pos == 0); -+ -+ unit_key_by_coord(target, &split_key); -+ node_plugin_by_coord(target)->update_item_key(target, -+ &split_key, NULL); -+ } -+} -+ -+/* copy part of @shift->real_stop.node starting either from its beginning or -+ from its end and ending at @shift->real_stop to either the end or the -+ beginning of @shift->target */ -+static void copy(struct shift_params *shift) -+{ -+ node40_header *nh; -+ coord_t from; -+ coord_t to; -+ item_header40 *from_ih, *to_ih; -+ int free_space_start; -+ int new_items; -+ unsigned old_items; -+ int old_offset; -+ unsigned i; -+ -+ nh = node40_node_header(shift->target); -+ free_space_start = nh40_get_free_space_start(nh); -+ old_items = nh40_get_num_items(nh); -+ new_items = shift->entire + (shift->part_units ? 1 : 0); -+ assert("vs-185", -+ shift->shift_bytes == -+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes); -+ -+ from = shift->wish_stop; -+ -+ coord_init_first_unit(&to, shift->target); -+ -+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty, -+ hence to.between is set to EMPTY_NODE above. Looks like we want it -+ to be AT_UNIT. -+ -+ Oh, wonders of ->betweeness... 
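At its core, estimate_shift() above applies a greedy fitting rule: an item moved whole costs its body plus item_creation_overhead(), while the final, partially shifted item pays the creation overhead first and then takes whatever room is left. A runnable toy of just that rule (unit granularity and boundary-item merging deliberately ignored; all names and numbers hypothetical):

#include <stdio.h>

int main(void)
{
        unsigned free_space = 100, overhead = 4;
        unsigned item[] = { 30, 40, 50 };       /* item body sizes */
        unsigned i, shifted = 0;

        for (i = 0; i < sizeof(item) / sizeof(item[0]); i++) {
                if (item[i] + overhead <= free_space) {
                        /* whole item fits, like shift->entire++ above */
                        free_space -= item[i] + overhead;
                        shifted += item[i];
                        continue;
                }
                /* last item: pay the overhead, copy what still fits */
                if (free_space > overhead)
                        shifted += free_space - overhead;
                break;
        }
        printf("%u bytes can be shifted\n", shifted);   /* 88 here */
        return 0;
}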
-+ -+ */ -+ to.between = AT_UNIT; -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* copying to left */ -+ -+ coord_set_item_pos(&from, 0); -+ from_ih = node40_ih_at(from.node, 0); -+ -+ coord_set_item_pos(&to, -+ node40_num_of_items_internal(to.node) - 1); -+ if (shift->merging_units) { -+ /* expand last item, so that plugin methods will see -+ correct data */ -+ free_space_start += shift->merging_bytes; -+ nh40_set_free_space_start(nh, -+ (unsigned)free_space_start); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ shift->merging_bytes); -+ -+ /* appending last item of @target */ -+ copy_units(&to, &from, 0, /* starting from 0-th unit */ -+ shift->merging_units, SHIFT_LEFT, -+ shift->merging_bytes); -+ coord_inc_item_pos(&from); -+ from_ih--; -+ coord_inc_item_pos(&to); -+ } -+ -+ to_ih = node40_ih_at(shift->target, old_items); -+ if (shift->entire) { -+ /* copy @entire items entirely */ -+ -+ /* copy item headers */ -+ memcpy(to_ih - shift->entire + 1, -+ from_ih - shift->entire + 1, -+ shift->entire * sizeof(item_header40)); -+ /* update item header offset */ -+ old_offset = ih40_get_offset(from_ih); -+ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */ -+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(from_ih) - -+ old_offset + free_space_start); -+ -+ /* copy item bodies */ -+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */ -+ shift->entire_bytes); -+ -+ coord_add_item_pos(&from, (int)shift->entire); -+ coord_add_item_pos(&to, (int)shift->entire); -+ } -+ -+ nh40_set_free_space_start(nh, -+ free_space_start + -+ shift->shift_bytes - -+ shift->merging_bytes); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ (shift->shift_bytes - shift->merging_bytes + -+ sizeof(item_header40) * new_items)); -+ -+ /* update node header */ -+ node40_set_num_items(shift->target, nh, old_items + new_items); -+ assert("vs-170", -+ nh40_get_free_space(nh) < znode_size(shift->target)); -+ -+ if (shift->part_units) { -+ /* copy heading part (@part units) of @source item as -+ a new item into @target->node */ -+ -+ /* copy item header of partially copied item */ -+ coord_set_item_pos(&to, -+ node40_num_of_items_internal(to.node) -+ - 1); -+ memcpy(to_ih, from_ih, sizeof(item_header40)); -+ ih40_set_offset(to_ih, -+ nh40_get_free_space_start(nh) - -+ shift->part_bytes); -+ if (item_plugin_by_coord(&to)->b.init) -+ item_plugin_by_coord(&to)->b.init(&to, &from, -+ NULL); -+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT, -+ shift->part_bytes); -+ } -+ -+ } else { -+ /* copying to right */ -+ -+ coord_set_item_pos(&from, -+ node40_num_of_items_internal(from.node) - 1); -+ from_ih = node40_ih_at_coord(&from); -+ -+ coord_set_item_pos(&to, 0); -+ -+ /* prepare space for new items */ -+ memmove(zdata(to.node) + sizeof(node40_header) + -+ shift->shift_bytes, -+ zdata(to.node) + sizeof(node40_header), -+ free_space_start - sizeof(node40_header)); -+ /* update item headers of moved items */ -+ to_ih = node40_ih_at(to.node, 0); -+ /* first item gets @merging_bytes longer. 
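The SHIFT_RIGHT branch above makes room with one memmove() before the incoming bodies are copied in at the front of the target. The same prepend pattern in isolation (a hypothetical stand-alone example, not patch code):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char target[16] = "wxyz";               /* existing item bodies */
        const char incoming[] = "uv";           /* bytes arriving from the left */
        size_t used = strlen(target) + 1, in = strlen(incoming);

        memmove(target + in, target, used);     /* existing bodies slide right */
        memcpy(target, incoming, in);           /* the hole is filled at the front */
        printf("%s\n", target);                 /* prints uvwxyz */
        return 0;
}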
free space appears -+ at its beginning */ -+ if (!node_is_empty(to.node)) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(to_ih) + -+ shift->shift_bytes - -+ shift->merging_bytes); -+ -+ for (i = 1; i < old_items; i++) -+ ih40_set_offset(to_ih - i, -+ ih40_get_offset(to_ih - i) + -+ shift->shift_bytes); -+ -+ /* move item headers to make space for new items */ -+ memmove(to_ih - old_items + 1 - new_items, -+ to_ih - old_items + 1, -+ sizeof(item_header40) * old_items); -+ to_ih -= (new_items - 1); -+ -+ nh40_set_free_space_start(nh, -+ free_space_start + -+ shift->shift_bytes); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ (shift->shift_bytes + -+ sizeof(item_header40) * new_items)); -+ -+ /* update node header */ -+ node40_set_num_items(shift->target, nh, old_items + new_items); -+ assert("vs-170", -+ nh40_get_free_space(nh) < znode_size(shift->target)); -+ -+ if (shift->merging_units) { -+ coord_add_item_pos(&to, new_items); -+ to.unit_pos = 0; -+ to.between = AT_UNIT; -+ /* prepend first item of @to */ -+ copy_units(&to, &from, -+ coord_last_unit_pos(&from) - -+ shift->merging_units + 1, -+ shift->merging_units, SHIFT_RIGHT, -+ shift->merging_bytes); -+ coord_dec_item_pos(&from); -+ from_ih++; -+ } -+ -+ if (shift->entire) { -+ /* copy @entire items entirely */ -+ -+ /* copy item headers */ -+ memcpy(to_ih, from_ih, -+ shift->entire * sizeof(item_header40)); -+ -+ /* update item header offset */ -+ old_offset = -+ ih40_get_offset(from_ih + shift->entire - 1); -+ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */ -+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(from_ih) - -+ old_offset + -+ sizeof(node40_header) + -+ shift->part_bytes); -+ /* copy item bodies */ -+ coord_add_item_pos(&from, -(int)(shift->entire - 1)); -+ memcpy(zdata(to.node) + sizeof(node40_header) + -+ shift->part_bytes, item_by_coord_node40(&from), -+ shift->entire_bytes); -+ coord_dec_item_pos(&from); -+ } -+ -+ if (shift->part_units) { -+ coord_set_item_pos(&to, 0); -+ to.unit_pos = 0; -+ to.between = AT_UNIT; -+ /* copy heading part (@part units) of @source item as -+ a new item into @target->node */ -+ -+ /* copy item header of partially copied item */ -+ memcpy(to_ih, from_ih, sizeof(item_header40)); -+ ih40_set_offset(to_ih, sizeof(node40_header)); -+ if (item_plugin_by_coord(&to)->b.init) -+ item_plugin_by_coord(&to)->b.init(&to, &from, -+ NULL); -+ copy_units(&to, &from, -+ coord_last_unit_pos(&from) - -+ shift->part_units + 1, shift->part_units, -+ SHIFT_RIGHT, shift->part_bytes); -+ } -+ } -+} -+ -+/* remove everything either before or after @fact_stop. 
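delete_copied() below supplies the second half of a shift: copy() first duplicates the chosen bytes into the target, then the duplicated range is cut out of the source via cut_node40(). The two-phase idea as a toy on plain byte strings (hypothetical names, items reduced to characters):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char left[16] = "AB", right[16] = "cdef";
        size_t move = 2;        /* what the estimate said would fit */

        memcpy(left + strlen(left), right, move);               /* the copy() phase */
        memmove(right, right + move, strlen(right) - move + 1); /* the delete_copied() phase */
        printf("%s | %s\n", left, right);                       /* prints ABcd | ef */
        return 0;
}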
Number of items -+ removed completely is returned */ -+static int delete_copied(struct shift_params *shift) -+{ -+ coord_t from; -+ coord_t to; -+ struct carry_cut_data cdata; -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* we were shifting to left, remove everything from the -+ beginning of @shift->wish_stop->node upto -+ @shift->wish_stop */ -+ coord_init_first_unit(&from, shift->real_stop.node); -+ to = shift->real_stop; -+ -+ /* store old coordinate of unit which will be first after -+ shift to left */ -+ shift->u.future_first = to; -+ coord_next_unit(&shift->u.future_first); -+ } else { -+ /* we were shifting to right, remove everything from -+ @shift->stop_coord upto to end of -+ @shift->stop_coord->node */ -+ from = shift->real_stop; -+ coord_init_last_unit(&to, from.node); -+ -+ /* store old coordinate of unit which will be last after -+ shift to right */ -+ shift->u.future_last = from; -+ coord_prev_unit(&shift->u.future_last); -+ } -+ -+ cdata.params.from = &from; -+ cdata.params.to = &to; -+ cdata.params.from_key = NULL; -+ cdata.params.to_key = NULL; -+ cdata.params.smallest_removed = NULL; -+ return cut_node40(&cdata, NULL); -+} -+ -+/* something was moved between @left and @right. Add carry operation to @info -+ list to have carry to update delimiting key between them */ -+static int -+prepare_for_update(znode * left, znode * right, carry_plugin_info * info) -+{ -+ carry_op *op; -+ carry_node *cn; -+ -+ if (info == NULL) -+ /* nowhere to send operation to. */ -+ return 0; -+ -+ if (!should_notify_parent(right)) -+ return 0; -+ -+ op = node_post_carry(info, COP_UPDATE, right, 1); -+ if (IS_ERR(op) || op == NULL) -+ return op ? PTR_ERR(op) : -EIO; -+ -+ if (left != NULL) { -+ carry_node *reference; -+ -+ if (info->doing) -+ reference = insert_carry_node(info->doing, -+ info->todo, left); -+ else -+ reference = op->node; -+ assert("nikita-2992", reference != NULL); -+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference); -+ if (IS_ERR(cn)) -+ return PTR_ERR(cn); -+ cn->parent = 1; -+ cn->node = left; -+ if (ZF_ISSET(left, JNODE_ORPHAN)) -+ cn->left_before = 1; -+ op->u.update.left = cn; -+ } else -+ op->u.update.left = NULL; -+ return 0; -+} -+ -+/* plugin->u.node.prepare_removal -+ to delete a pointer to @empty from the tree add corresponding carry -+ operation (delete) to @info list */ -+int prepare_removal_node40(znode * empty, carry_plugin_info * info) -+{ -+ carry_op *op; -+ reiser4_tree *tree; -+ -+ if (!should_notify_parent(empty)) -+ return 0; -+ /* already on a road to Styx */ -+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE)) -+ return 0; -+ op = node_post_carry(info, COP_DELETE, empty, 1); -+ if (IS_ERR(op) || op == NULL) -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ -+ op->u.delete.child = NULL; -+ op->u.delete.flags = 0; -+ -+ /* fare thee well */ -+ tree = znode_get_tree(empty); -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ znode_set_ld_key(empty, znode_get_rd_key(empty)); -+ if (znode_is_left_connected(empty) && empty->left) -+ znode_set_rd_key(empty->left, znode_get_rd_key(empty)); -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+ -+ ZF_SET(empty, JNODE_HEARD_BANSHEE); -+ return 0; -+} -+ -+/* something were shifted from @insert_coord->node to @shift->target, update -+ @insert_coord correspondingly */ -+static void -+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed, -+ int including_insert_coord) -+{ -+ /* item plugin was invalidated by shifting */ -+ coord_clear_iplug(insert_coord); -+ -+ if (node_is_empty(shift->wish_stop.node)) { -+ assert("vs-242", shift->everything); -+ if (including_insert_coord) { -+ if (shift->pend == SHIFT_RIGHT) { -+ /* set @insert_coord before first unit of -+ @shift->target node */ -+ coord_init_before_first_item(insert_coord, -+ shift->target); -+ } else { -+ /* set @insert_coord after last in target node */ -+ coord_init_after_last_item(insert_coord, -+ shift->target); -+ } -+ } else { -+ /* set @insert_coord inside of empty node. There is -+ only one possible coord within an empty -+ node. init_first_unit will set that coord */ -+ coord_init_first_unit(insert_coord, -+ shift->wish_stop.node); -+ } -+ return; -+ } -+ -+ if (shift->pend == SHIFT_RIGHT) { -+ /* there was shifting to right */ -+ if (shift->everything) { -+ /* everything wanted was shifted */ -+ if (including_insert_coord) { -+ /* @insert_coord is set before first unit of -+ @to node */ -+ coord_init_before_first_item(insert_coord, -+ shift->target); -+ insert_coord->between = BEFORE_UNIT; -+ } else { -+ /* @insert_coord is set after last unit of -+ @insert->node */ -+ coord_init_last_unit(insert_coord, -+ shift->wish_stop.node); -+ insert_coord->between = AFTER_UNIT; -+ } -+ } -+ return; -+ } -+ -+ /* there was shifting to left */ -+ if (shift->everything) { -+ /* everything wanted was shifted */ -+ if (including_insert_coord) { -+ /* @insert_coord is set after last unit in @to node */ -+ coord_init_after_last_item(insert_coord, shift->target); -+ } else { -+ /* @insert_coord is set before first unit in the same -+ node */ -+ coord_init_before_first_item(insert_coord, -+ shift->wish_stop.node); -+ } -+ return; -+ } -+ -+ /* FIXME-VS: the code below is complicated because with between == -+ AFTER_ITEM unit_pos is set to 0 */ -+ -+ if (!removed) { -+ /* no items were shifted entirely */ -+ assert("vs-195", shift->merging_units == 0 -+ || shift->part_units == 0); -+ -+ if (shift->real_stop.item_pos == insert_coord->item_pos) { -+ if (shift->merging_units) { -+ if (insert_coord->between == AFTER_UNIT) { -+ assert("nikita-1441", -+ insert_coord->unit_pos >= -+ shift->merging_units); -+ insert_coord->unit_pos -= -+ shift->merging_units; -+ } else if (insert_coord->between == BEFORE_UNIT) { -+ assert("nikita-2090", -+ insert_coord->unit_pos > -+ shift->merging_units); -+ insert_coord->unit_pos -= -+ shift->merging_units; -+ } -+ -+ assert("nikita-2083", -+ insert_coord->unit_pos + 1); -+ } else { -+ if (insert_coord->between == AFTER_UNIT) { -+ assert("nikita-1442", -+ insert_coord->unit_pos >= -+ shift->part_units); -+ insert_coord->unit_pos -= -+ shift->part_units; -+ } else if (insert_coord->between == BEFORE_UNIT) { -+ assert("nikita-2089", -+ insert_coord->unit_pos > -+ shift->part_units); -+ 
insert_coord->unit_pos -= -+ shift->part_units; -+ } -+ -+ assert("nikita-2084", -+ insert_coord->unit_pos + 1); -+ } -+ } -+ return; -+ } -+ -+ /* we shifted to left and there was no enough space for everything */ -+ switch (insert_coord->between) { -+ case AFTER_UNIT: -+ case BEFORE_UNIT: -+ if (shift->real_stop.item_pos == insert_coord->item_pos) -+ insert_coord->unit_pos -= shift->part_units; -+ case AFTER_ITEM: -+ coord_add_item_pos(insert_coord, -removed); -+ break; -+ default: -+ impossible("nikita-2087", "not ready"); -+ } -+ assert("nikita-2085", insert_coord->unit_pos + 1); -+} -+ -+static int call_shift_hooks(struct shift_params *shift) -+{ -+ unsigned i, shifted; -+ coord_t coord; -+ item_plugin *iplug; -+ -+ assert("vs-275", !node_is_empty(shift->target)); -+ -+ /* number of items shift touches */ -+ shifted = -+ shift->entire + (shift->merging_units ? 1 : 0) + -+ (shift->part_units ? 1 : 0); -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* moved items are at the end */ -+ coord_init_last_unit(&coord, shift->target); -+ coord.unit_pos = 0; -+ -+ assert("vs-279", shift->pend == 1); -+ for (i = 0; i < shifted; i++) { -+ unsigned from, count; -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (i == 0 && shift->part_units) { -+ assert("vs-277", -+ coord_num_units(&coord) == -+ shift->part_units); -+ count = shift->part_units; -+ from = 0; -+ } else if (i == shifted - 1 && shift->merging_units) { -+ count = shift->merging_units; -+ from = coord_num_units(&coord) - count; -+ } else { -+ count = coord_num_units(&coord); -+ from = 0; -+ } -+ -+ if (iplug->b.shift_hook) { -+ iplug->b.shift_hook(&coord, from, count, -+ shift->wish_stop.node); -+ } -+ coord_add_item_pos(&coord, -shift->pend); -+ } -+ } else { -+ /* moved items are at the beginning */ -+ coord_init_first_unit(&coord, shift->target); -+ -+ assert("vs-278", shift->pend == -1); -+ for (i = 0; i < shifted; i++) { -+ unsigned from, count; -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (i == 0 && shift->part_units) { -+ assert("vs-277", -+ coord_num_units(&coord) == -+ shift->part_units); -+ count = coord_num_units(&coord); -+ from = 0; -+ } else if (i == shifted - 1 && shift->merging_units) { -+ count = shift->merging_units; -+ from = 0; -+ } else { -+ count = coord_num_units(&coord); -+ from = 0; -+ } -+ -+ if (iplug->b.shift_hook) { -+ iplug->b.shift_hook(&coord, from, count, -+ shift->wish_stop.node); -+ } -+ coord_add_item_pos(&coord, -shift->pend); -+ } -+ } -+ -+ return 0; -+} -+ -+/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */ -+static int -+unit_moved_left(const struct shift_params *shift, const coord_t * old) -+{ -+ assert("vs-944", shift->real_stop.node == old->node); -+ -+ if (shift->real_stop.item_pos < old->item_pos) -+ return 0; -+ if (shift->real_stop.item_pos == old->item_pos) { -+ if (shift->real_stop.unit_pos < old->unit_pos) -+ return 0; -+ } -+ return 1; -+} -+ -+/* shift to right is completed. Return 1 if unit @old was moved to right -+ neighbor */ -+static int -+unit_moved_right(const struct shift_params *shift, const coord_t * old) -+{ -+ assert("vs-944", shift->real_stop.node == old->node); -+ -+ if (shift->real_stop.item_pos > old->item_pos) -+ return 0; -+ if (shift->real_stop.item_pos == old->item_pos) { -+ if (shift->real_stop.unit_pos > old->unit_pos) -+ return 0; -+ } -+ return 1; -+} -+ -+/* coord @old was set in node from which shift was performed. What was shifted -+ is stored in @shift. 
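For a left shift, adjust_coord2() below boils down to the remapping sketched here; the sketch assumes whole-item moves only and ignores unit merging, and the names are hypothetical:

struct toy_coord {
        int item_pos;
        int in_target;  /* 1 if the unit now lives in the left neighbor */
};

/* "moved" items were shifted off the front of the source;
   "target_items" counts items in the left node after the shift */
static struct toy_coord remap_left(int item_pos, int moved, int target_items)
{
        struct toy_coord c;

        if (item_pos < moved) {         /* the unit went to the left neighbor */
                c.item_pos = target_items - moved + item_pos;
                c.in_target = 1;
        } else {                        /* the unit stayed; positions close up */
                c.item_pos = item_pos - moved;
                c.in_target = 0;
        }
        return c;
}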
Update @old correspondingly to performed shift */ -+static coord_t *adjust_coord2(const struct shift_params *shift, -+ const coord_t * old, coord_t * new) -+{ -+ coord_clear_iplug(new); -+ new->between = old->between; -+ -+ coord_clear_iplug(new); -+ if (old->node == shift->target) { -+ if (shift->pend == SHIFT_LEFT) { -+ /* coord which is set inside of left neighbor does not -+ change during shift to left */ -+ coord_dup(new, old); -+ return new; -+ } -+ new->node = old->node; -+ coord_set_item_pos(new, -+ old->item_pos + shift->entire + -+ (shift->part_units ? 1 : 0)); -+ new->unit_pos = old->unit_pos; -+ if (old->item_pos == 0 && shift->merging_units) -+ new->unit_pos += shift->merging_units; -+ return new; -+ } -+ -+ assert("vs-977", old->node == shift->wish_stop.node); -+ if (shift->pend == SHIFT_LEFT) { -+ if (unit_moved_left(shift, old)) { -+ /* unit @old moved to left neighbor. Calculate its -+ coordinate there */ -+ new->node = shift->target; -+ coord_set_item_pos(new, -+ node_num_items(shift->target) - -+ shift->entire - -+ (shift->part_units ? 1 : 0) + -+ old->item_pos); -+ -+ new->unit_pos = old->unit_pos; -+ if (shift->merging_units) { -+ coord_dec_item_pos(new); -+ if (old->item_pos == 0) { -+ /* unit_pos only changes if item got -+ merged */ -+ new->unit_pos = -+ coord_num_units(new) - -+ (shift->merging_units - -+ old->unit_pos); -+ } -+ } -+ } else { -+ /* unit @old did not move to left neighbor. -+ -+ Use _nocheck, because @old is outside of its node. -+ */ -+ coord_dup_nocheck(new, old); -+ coord_add_item_pos(new, -+ -shift->u.future_first.item_pos); -+ if (new->item_pos == 0) -+ new->unit_pos -= shift->u.future_first.unit_pos; -+ } -+ } else { -+ if (unit_moved_right(shift, old)) { -+ /* unit @old moved to right neighbor */ -+ new->node = shift->target; -+ coord_set_item_pos(new, -+ old->item_pos - -+ shift->real_stop.item_pos); -+ if (new->item_pos == 0) { -+ /* unit @old might change unit pos */ -+ coord_set_item_pos(new, -+ old->unit_pos - -+ shift->real_stop.unit_pos); -+ } -+ } else { -+ /* unit @old did not move to right neighbor, therefore -+ it did not change */ -+ coord_dup(new, old); -+ } -+ } -+ coord_set_iplug(new, item_plugin_by_coord(new)); -+ return new; -+} -+ -+/* this is called when shift is completed (something of source node is copied -+ to target and deleted in source) to update all taps set in current -+ context */ -+static void update_taps(const struct shift_params *shift) -+{ -+ tap_t *tap; -+ coord_t new; -+ -+ for_all_taps(tap) { -+ /* update only taps set to nodes participating in shift */ -+ if (tap->coord->node == shift->wish_stop.node -+ || tap->coord->node == shift->target) -+ tap_to_coord(tap, -+ adjust_coord2(shift, tap->coord, &new)); -+ } -+} -+ -+#if REISER4_DEBUG -+ -+struct shift_check { -+ reiser4_key key; -+ __u16 plugin_id; -+ union { -+ __u64 bytes; -+ __u64 entries; -+ void *unused; -+ } u; -+}; -+ -+void *shift_check_prepare(const znode * left, const znode * right) -+{ -+ pos_in_node_t i, nr_items; -+ int mergeable; -+ struct shift_check *data; -+ item_header40 *ih; -+ -+ if (node_is_empty(left) || node_is_empty(right)) -+ mergeable = 0; -+ else { -+ coord_t l, r; -+ -+ coord_init_last_unit(&l, left); -+ coord_init_first_unit(&r, right); -+ mergeable = are_items_mergeable(&l, &r); -+ } -+ nr_items = -+ node40_num_of_items_internal(left) + -+ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); -+ data = -+ kmalloc(sizeof(struct shift_check) * nr_items, -+ reiser4_ctx_gfp_mask_get()); -+ if (data != NULL) { -+ coord_t coord; -+ pos_in_node_t item_pos; -+ -+ coord_init_first_unit(&coord, left); -+ i = 0; -+ -+ for (item_pos = 0; -+ item_pos < node40_num_of_items_internal(left); -+ item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ data[i].key = ih->key; -+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i].u.bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i].u.bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i].u.entries = coord_num_units(&coord); -+ break; -+ default: -+ data[i].u.unused = NULL; -+ break; -+ } -+ i++; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ -+ if (mergeable) { -+ assert("vs-1609", i != 0); -+ -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1589", -+ data[i - 1].plugin_id == -+ le16_to_cpu(get_unaligned(&ih->plugin_id))); -+ switch (data[i - 1].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i - 1].u.bytes += coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i - 1].u.bytes += -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i - 1].u.entries += -+ coord_num_units(&coord); -+ break; -+ default: -+ impossible("vs-1605", "wrong mergeable item"); -+ break; -+ } -+ item_pos = 1; -+ } else -+ item_pos = 0; -+ for (; item_pos < node40_num_of_items_internal(right); -+ item_pos++) { -+ -+ assert("vs-1604", i < nr_items); -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ data[i].key = ih->key; -+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i].u.bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i].u.bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i].u.entries = coord_num_units(&coord); -+ break; -+ default: -+ data[i].u.unused = NULL; -+ break; -+ } -+ i++; -+ } -+ assert("vs-1606", i == nr_items); -+ } -+ return data; -+} -+ -+void shift_check(void *vp, const znode * left, const znode * right) -+{ -+ pos_in_node_t i, nr_items; -+ coord_t coord; -+ __u64 last_bytes; -+ int mergeable; -+ item_header40 *ih; -+ pos_in_node_t item_pos; -+ struct shift_check *data; -+ -+ data = (struct shift_check *)vp; -+ -+ if (data == NULL) -+ return; -+ -+ if (node_is_empty(left) || node_is_empty(right)) -+ mergeable = 0; -+ else { -+ coord_t l, r; -+ -+ coord_init_last_unit(&l, left); -+ coord_init_first_unit(&r, right); -+ mergeable = are_items_mergeable(&l, &r); -+ } -+ -+ nr_items = -+ node40_num_of_items_internal(left) + -+ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); -+ -+ i = 0; -+ last_bytes = 0; -+ -+ coord_init_first_unit(&coord, left); -+ -+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left); -+ item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1611", i == item_pos); -+ assert("vs-1590", keyeq(&ih->key, &data[i].key)); -+ assert("vs-1591", -+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); -+ if ((i < (node40_num_of_items_internal(left) - 1)) -+ || !mergeable) { -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1592", -+ data[i].u.bytes == -+ coord_num_units(&coord)); -+ break; -+ case EXTENT_POINTER_ID: -+ assert("vs-1593", -+ data[i].u.bytes == -+ reiser4_extent_size(&coord, -+ coord_num_units -+ (&coord))); -+ break; -+ case COMPOUND_DIR_ID: -+ assert("vs-1594", -+ data[i].u.entries == -+ coord_num_units(&coord)); -+ break; -+ default: -+ break; -+ } -+ } -+ if (item_pos == (node40_num_of_items_internal(left) - 1) -+ && mergeable) { -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ last_bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ last_bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ last_bytes = coord_num_units(&coord); -+ break; -+ default: -+ impossible("vs-1595", "wrong mergeable item"); -+ break; -+ } -+ } -+ i++; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ if (mergeable) { -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1589", -+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id))); -+ assert("vs-1608", last_bytes != 0); -+ switch (data[i - 1].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1596", -+ data[i - 1].u.bytes == -+ last_bytes + coord_num_units(&coord)); -+ break; -+ -+ case EXTENT_POINTER_ID: -+ assert("vs-1597", -+ data[i - 1].u.bytes == -+ last_bytes + reiser4_extent_size(&coord, -+ coord_num_units -+ (&coord))); -+ break; -+ -+ case COMPOUND_DIR_ID: -+ assert("vs-1598", -+ data[i - 1].u.bytes == -+ last_bytes + coord_num_units(&coord)); -+ break; -+ default: -+ impossible("vs-1599", "wrong mergeable item"); -+ break; -+ } -+ item_pos = 1; -+ } else -+ item_pos = 0; -+ -+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1612", keyeq(&ih->key, &data[i].key)); -+ assert("vs-1613", -+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1600", -+ data[i].u.bytes == coord_num_units(&coord)); -+ break; -+ case EXTENT_POINTER_ID: -+ assert("vs-1601", -+ data[i].u.bytes == -+ reiser4_extent_size(&coord, -+ coord_num_units -+ (&coord))); -+ break; -+ case COMPOUND_DIR_ID: -+ assert("vs-1602", -+ data[i].u.entries == coord_num_units(&coord)); -+ break; -+ default: -+ break; -+ } -+ i++; -+ } -+ -+ assert("vs-1603", i == nr_items); -+ kfree(data); -+} -+ -+#endif -+ -+/* plugin->u.node.shift -+ look for description of this method in plugin/node/node.h */ -+int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be -+ deleted from the tree if this is set to 1 */ -+ int including_stop_coord, carry_plugin_info * info) -+{ -+ struct shift_params shift; -+ int result; -+ znode *left, *right; -+ znode *source; -+ int target_empty; -+ -+ assert("nikita-2161", 
coord_check(from)); -+ -+ memset(&shift, 0, sizeof(shift)); -+ shift.pend = pend; -+ shift.wish_stop = *from; -+ shift.target = to; -+ -+ assert("nikita-1473", znode_is_write_locked(from->node)); -+ assert("nikita-1474", znode_is_write_locked(to)); -+ -+ source = from->node; -+ -+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want -+ shifted */ -+ if (pend == SHIFT_LEFT) { -+ result = coord_set_to_left(&shift.wish_stop); -+ left = to; -+ right = from->node; -+ } else { -+ result = coord_set_to_right(&shift.wish_stop); -+ left = from->node; -+ right = to; -+ } -+ -+ if (result) { -+ /* move insertion coord even if there is nothing to move */ -+ if (including_stop_coord) { -+ /* move insertion coord (@from) */ -+ if (pend == SHIFT_LEFT) { -+ /* after last item in target node */ -+ coord_init_after_last_item(from, to); -+ } else { -+ /* before first item in target node */ -+ coord_init_before_first_item(from, to); -+ } -+ } -+ -+ if (delete_child && node_is_empty(shift.wish_stop.node)) -+ result = -+ prepare_removal_node40(shift.wish_stop.node, info); -+ else -+ result = 0; -+ /* there is nothing to shift */ -+ assert("nikita-2078", coord_check(from)); -+ return result; -+ } -+ -+ target_empty = node_is_empty(to); -+ -+ /* when first node plugin with item body compression is implemented, -+ this must be changed to call node specific plugin */ -+ -+ /* shift->stop_coord is updated to last unit which really will be -+ shifted */ -+ estimate_shift(&shift, get_current_context()); -+ if (!shift.shift_bytes) { -+ /* we could not shift anything */ -+ assert("nikita-2079", coord_check(from)); -+ return 0; -+ } -+ -+ copy(&shift); -+ -+ /* result value of this is important. It is used by adjust_coord below */ -+ result = delete_copied(&shift); -+ -+ assert("vs-1610", result >= 0); -+ assert("vs-1471", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ /* item which has been moved from one node to another might want to do -+ something on that event. This can be done by item's shift_hook -+ method, which will be now called for every moved items */ -+ call_shift_hooks(&shift); -+ -+ assert("vs-1472", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ update_taps(&shift); -+ -+ assert("vs-1473", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ /* adjust @from pointer in accordance with @including_stop_coord flag -+ and amount of data which was really shifted */ -+ adjust_coord(from, &shift, result, including_stop_coord); -+ -+ if (target_empty) -+ /* -+ * items were shifted into empty node. Update delimiting key. -+ */ -+ result = prepare_for_update(NULL, left, info); -+ -+ /* add update operation to @info, which is the list of operations to -+ be performed on a higher level */ -+ result = prepare_for_update(left, right, info); -+ if (!result && node_is_empty(source) && delete_child) { -+ /* all contents of @from->node is moved to @to and @from->node -+ has to be removed from the tree, so, on higher level we -+ will be removing the pointer to node @from->node */ -+ result = prepare_removal_node40(source, info); -+ } -+ assert("nikita-2080", coord_check(from)); -+ return result ? 
result : (int)shift.shift_bytes; -+} -+ -+/* plugin->u.node.fast_insert() -+ look for description of this method in plugin/node/node.h */ -+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.fast_paste() -+ look for description of this method in plugin/node/node.h */ -+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.fast_cut() -+ look for description of this method in plugin/node/node.h */ -+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.modify - not defined */ -+ -+/* plugin->u.node.max_item_size */ -+int max_item_size_node40(void) -+{ -+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) - -+ sizeof(item_header40); -+} -+ -+/* plugin->u.node.set_item_plugin */ -+int set_item_plugin_node40(coord_t *coord, item_id id) -+{ -+ item_header40 *ih; -+ -+ ih = node40_ih_at_coord(coord); -+ put_unaligned(cpu_to_le16(id), &ih->plugin_id); -+ coord->iplugid = id; -+ return 0; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/node/node40.h linux-2.6.30/fs/reiser4/plugin/node/node40.h ---- linux-2.6.30.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/node/node40.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,125 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined( __REISER4_NODE40_H__ ) -+#define __REISER4_NODE40_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "node.h" -+ -+#include <linux/types.h> -+ -+/* format of node header for 40 node layouts. Keep bloat out of this struct. */ -+typedef struct node40_header { -+ /* identifier of node plugin. Must be located at the very beginning -+ of a node. */ -+ common_node_header common_header; /* this is 16 bits */ -+ /* number of items. Should be first element in the node header, -+ because we haven't yet finally decided whether it shouldn't go into -+ common_header. -+ */ -+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one -+ * node format at compile time, and it is this one, accesses do not function dereference when -+ * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */ -+ d16 nr_items; -+ /* free space in node measured in bytes */ -+ d16 free_space; -+ /* offset to start of free space in node */ -+ d16 free_space_start; -+ /* for reiser4_fsck. When information about what is a free -+ block is corrupted, and we try to recover everything even -+ if marked as freed, then old versions of data may -+ duplicate newer versions, and this field allows us to -+ restore the newer version. Also useful for when users -+ who don't have the new trashcan installed on their linux distro -+ delete the wrong files and send us desperate emails -+ offering $25 for them back. */ -+ -+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */ -+ d32 magic; -+ /* flushstamp is made of mk_id and write_counter. mk_id is an -+ id generated randomly at mkreiserfs time. So we can just -+ skip all nodes with different mk_id. write_counter is d64 -+ incrementing counter of writes on disk. 
It is used for -+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */ -+ -+ d32 mkfs_id; -+ d64 flush_id; -+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?) -+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */ -+ d16 flags; -+ -+ /* 1 is leaf level, 2 is twig level, root is the numerically -+ largest level */ -+ d8 level; -+ -+ d8 pad; -+} PACKED node40_header; -+ -+/* item headers are not standard across all node layouts, pass -+ pos_in_node to functions instead */ -+typedef struct item_header40 { -+ /* key of item */ -+ /* 0 */ reiser4_key key; -+ /* offset from start of a node measured in 8-byte chunks */ -+ /* 24 */ d16 offset; -+ /* 26 */ d16 flags; -+ /* 28 */ d16 plugin_id; -+} PACKED item_header40; -+ -+size_t item_overhead_node40(const znode * node, flow_t * aflow); -+size_t free_space_node40(znode * node); -+node_search_result lookup_node40(znode * node, const reiser4_key * key, -+ lookup_bias bias, coord_t * coord); -+int num_of_items_node40(const znode * node); -+char *item_by_coord_node40(const coord_t * coord); -+int length_by_coord_node40(const coord_t * coord); -+item_plugin *plugin_by_coord_node40(const coord_t * coord); -+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key); -+size_t estimate_node40(znode * node); -+int check_node40(const znode * node, __u32 flags, const char **error); -+int parse_node40(znode * node); -+int init_node40(znode * node); -+#ifdef GUESS_EXISTS -+int guess_node40(const znode * node); -+#endif -+void change_item_size_node40(coord_t * coord, int by); -+int create_item_node40(coord_t * target, const reiser4_key * key, -+ reiser4_item_data * data, carry_plugin_info * info); -+void update_item_key_node40(coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info); -+int kill_node40(struct carry_kill_data *, carry_plugin_info *); -+int cut_node40(struct carry_cut_data *, carry_plugin_info *); -+int shift_node40(coord_t * from, znode * to, shift_direction pend, -+ /* if @from->node becomes -+ empty - it will be deleted from -+ the tree if this is set to 1 -+ */ -+ int delete_child, int including_stop_coord, -+ carry_plugin_info * info); -+ -+int fast_insert_node40(const coord_t * coord); -+int fast_paste_node40(const coord_t * coord); -+int fast_cut_node40(const coord_t * coord); -+int max_item_size_node40(void); -+int prepare_removal_node40(znode * empty, carry_plugin_info * info); -+int set_item_plugin_node40(coord_t * coord, item_id id); -+int shrink_item_node40(coord_t * coord, int delta); -+ -+#if REISER4_DEBUG -+void *shift_check_prepare(const znode *left, const znode *right); -+void shift_check(void *vp, const znode *left, const znode *right); -+#endif -+ -+/* __REISER4_NODE40_H__ */ -+#endif -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/node/node.c linux-2.6.30/fs/reiser4/plugin/node/node.c ---- linux-2.6.30.orig/fs/reiser4/plugin/node/node.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/node/node.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,131 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Node plugin interface. -+ -+ Description: The tree provides the abstraction of flows, which it -+ internally fragments into items which it stores in nodes. 
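The two packed structs defined in node40.h above fix the per-node and per-item overhead that max_item_size_node40() subtracts from the block size. A hedged size check, assuming the d8/d16/d32/d64 types are plain fixed-width integers and taking a 4096-byte block as an example:

#include <stdio.h>
#include <stdint.h>

struct toy_node40_header {              /* hypothetical stand-in for node40_header */
        uint16_t common_header, nr_items, free_space, free_space_start;
        uint32_t magic, mkfs_id;
        uint64_t flush_id;
        uint16_t flags;
        uint8_t level, pad;
} __attribute__((packed));

struct toy_item_header40 {              /* hypothetical stand-in for item_header40 */
        unsigned char key[24];          /* the key occupies bytes 0..23 */
        uint16_t offset, flags, plugin_id;
} __attribute__((packed));

int main(void)
{
        /* prints 28, 30 and 4038 with the assumptions above */
        printf("node header %zu, item header %zu, max item %zu\n",
               sizeof(struct toy_node40_header),
               sizeof(struct toy_item_header40),
               4096 - sizeof(struct toy_node40_header)
                    - sizeof(struct toy_item_header40));
        return 0;
}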
-+ -+ A key_atom is a piece of data bound to a single key. -+ -+ For reasonable space efficiency to be achieved it is often -+ necessary to store key_atoms in the nodes in the form of items, where -+ an item is a sequence of key_atoms of the same or similar type. It is -+ more space-efficient, because the item can implement (very) -+ efficient compression of key_atom's bodies using internal knowledge -+ about their semantics, and it can often avoid having a key for each -+ key_atom. Each type of item has specific operations implemented by its -+ item handler (see balance.c). -+ -+ Rationale: the rest of the code (specifically balancing routines) -+ accesses leaf level nodes through this interface. This way we can -+ implement various block layouts and even combine various layouts -+ within the same tree. Balancing/allocating algorithms should not -+ care about peculiarities of splitting/merging specific item types, -+ but rather should leave that to the item's item handler. -+ -+ Items, including those that provide the abstraction of flows, have -+ the property that if you move them in part or in whole to another -+ node, the balancing code invokes their is_left_mergeable() -+ item_operation to determine if they are mergeable with their new -+ neighbor in the node you have moved them to. For some items the -+ is_left_mergeable() function always returns null. -+ -+ When moving the bodies of items from one node to another: -+ -+ if a partial item is shifted to another node the balancing code invokes -+ an item handler method to handle the item splitting. -+ -+ if the balancing code needs to merge with an item in the node it -+ is shifting to, it will invoke an item handler method to handle -+ the item merging. -+ -+ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy() -+ adjusting the item headers after the move is done using the node handler. -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "../item/item.h" -+#include "node.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../reiser4.h" -+ -+/** -+ * leftmost_key_in_node - get the smallest key in node -+ * @node: -+ * @key: store result here -+ * -+ * Stores the leftmost key of @node in @key. 
-+ */ -+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key) -+{ -+ assert("nikita-1634", node != NULL); -+ assert("nikita-1635", key != NULL); -+ -+ if (!node_is_empty(node)) { -+ coord_t first_item; -+ -+ coord_init_first_unit(&first_item, (znode *) node); -+ item_key_by_coord(&first_item, key); -+ } else -+ *key = *reiser4_max_key(); -+ return key; -+} -+ -+node_plugin node_plugins[LAST_NODE_ID] = { -+ [NODE40_ID] = { -+ .h = { -+ .type_id = REISER4_NODE_PLUGIN_TYPE, -+ .id = NODE40_ID, -+ .pops = NULL, -+ .label = "unified", -+ .desc = "unified node layout", -+ .linkage = {NULL, NULL} -+ }, -+ .item_overhead = item_overhead_node40, -+ .free_space = free_space_node40, -+ .lookup = lookup_node40, -+ .num_of_items = num_of_items_node40, -+ .item_by_coord = item_by_coord_node40, -+ .length_by_coord = length_by_coord_node40, -+ .plugin_by_coord = plugin_by_coord_node40, -+ .key_at = key_at_node40, -+ .estimate = estimate_node40, -+ .check = check_node40, -+ .parse = parse_node40, -+ .init = init_node40, -+#ifdef GUESS_EXISTS -+ .guess = guess_node40, -+#endif -+ .change_item_size = change_item_size_node40, -+ .create_item = create_item_node40, -+ .update_item_key = update_item_key_node40, -+ .cut_and_kill = kill_node40, -+ .cut = cut_node40, -+ .shift = shift_node40, -+ .shrink_item = shrink_item_node40, -+ .fast_insert = fast_insert_node40, -+ .fast_paste = fast_paste_node40, -+ .fast_cut = fast_cut_node40, -+ .max_item_size = max_item_size_node40, -+ .prepare_removal = prepare_removal_node40, -+ .set_item_plugin = set_item_plugin_node40 -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/node/node.h linux-2.6.30/fs/reiser4/plugin/node/node.h ---- linux-2.6.30.orig/fs/reiser4/plugin/node/node.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/node/node.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,272 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* We need a definition of the default node layout here. */ -+ -+/* Generally speaking, it is best to have free space in the middle of the -+ node so that two sets of things can grow towards it, and to have the -+ item bodies on the left so that the last one of them grows into free -+ space. We optimize for the case where we append new items to the end -+ of the node, or grow the last item, because it hurts nothing to so -+ optimize and it is a common special case to do massive insertions in -+ increasing key order (and one of cases more likely to have a real user -+ notice the delay time for). -+ -+ formatted leaf default layout: (leaf1) -+ -+ |node header:item bodies:free space:key + pluginid + item offset| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys, and item offsets plus pluginids for the items -+ corresponding to them are in increasing key order, and are fixed -+ length. Item offsets are relative to start of node (16 bits creating -+ a node size limit of 64k, 12 bits might be a better choice....). Item -+ bodies are in decreasing key order. Item bodies have a variable size. -+ There is a one to one to one mapping of keys to item offsets to item -+ bodies. Item offsets consist of pointers to the zeroth byte of the -+ item body. 
Item length equals the start of the next item minus the -+ start of this item, except the zeroth item whose length equals the end -+ of the node minus the start of that item (plus a byte). In other -+ words, the item length is not recorded anywhere, and it does not need -+ to be since it is computable. -+ -+ Leaf variable length items and keys layout : (lvar) -+ -+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys and item offsets for the items corresponding to them are -+ in increasing key order, and keys are variable length. Item offsets -+ are relative to start of node (16 bits). Item bodies are in -+ decreasing key order. Item bodies have a variable size. There is a -+ one to one to one mapping of keys to item offsets to item bodies. -+ Item offsets consist of pointers to the zeroth byte of the item body. -+ Item length equals the start of the next item's key minus the start of -+ this item, except the zeroth item whose length equals the end of the -+ node minus the start of that item (plus a byte). -+ -+ leaf compressed keys layout: (lcomp) -+ -+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys and item offsets for the items corresponding to them are -+ in increasing key order, and keys are variable length. The "key -+ inherit" field indicates how much of the key prefix is identical to -+ the previous key (stem compression as described in "Managing -+ Gigabytes" is used). key_inherit is a one byte integer. The -+ intra-node searches performed through this layout are linear searches, -+ and this is theorized to not hurt performance much due to the high -+ cost of processor stalls on modern CPUs, and the small number of keys -+ in a single node. Item offsets are relative to start of node (16 -+ bits). Item bodies are in decreasing key order. Item bodies have a -+ variable size. There is a one to one to one mapping of keys to item -+ offsets to item bodies. Item offsets consist of pointers to the -+ zeroth byte of the item body. Item length equals the start of the -+ next item minus the start of this item, except the zeroth item whose -+ length equals the end of the node minus the start of that item (plus a -+ byte). In other words, item length and key length is not recorded -+ anywhere, and it does not need to be since it is computable. -+ -+ internal node default layout: (idef1) -+ -+ just like ldef1 except that item bodies are either blocknrs of -+ children or extents, and moving them may require updating parent -+ pointers in the nodes that they point to. -+*/ -+ -+/* There is an inherent 3-way tradeoff between optimizing and -+ exchanging disks between different architectures and code -+ complexity. This is optimal and simple and inexchangeable. -+ Someone else can do the code for exchanging disks and make it -+ complex. It would not be that hard. Using other than the PAGE_SIZE -+ might be suboptimal. 
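All of the layouts above lean on the same trick: item lengths are never stored, they are recomputed from neighbouring offsets. A minimal sketch of that subtraction, assuming a plain sorted start[] array and an explicit region end instead of the real item headers (the zeroth-item special case and the exact boundary differ per layout, so only the shared idea is shown):

    #include <stdio.h>

    /* Computed-length rule from the layout descriptions above: the
     * length of an item is its neighbour's start minus its own start;
     * the item at the boundary is delimited by the end of the region
     * that holds the bodies. */
    static unsigned toy_item_length(const unsigned start[], int nr,
                                    unsigned region_end, int i)
    {
            return (i + 1 < nr ? start[i + 1] : region_end) - start[i];
    }

    int main(void)
    {
            unsigned start[] = { 40, 120, 300 };    /* three fake bodies */

            for (int i = 0; i < 3; i++)
                    printf("item %d: %u bytes\n", i,
                           toy_item_length(start, 3, 4096, i));
            return 0;
    }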
-+*/ -+ -+#if !defined( __REISER4_NODE_H__ ) -+#define __REISER4_NODE_H__ -+ -+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE -+ -+#include "../../dformat.h" -+#include "../plugin_header.h" -+ -+#include <linux/types.h> -+ -+typedef enum { -+ NS_FOUND = 0, -+ NS_NOT_FOUND = -ENOENT -+} node_search_result; -+ -+/* Maximal possible space overhead for creation of new item in a node */ -+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 ) -+ -+typedef enum { -+ REISER4_NODE_DKEYS = (1 << 0), -+ REISER4_NODE_TREE_STABLE = (1 << 1) -+} reiser4_node_check_flag; -+ -+/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */ -+struct cut_list { -+ coord_t *from; -+ coord_t *to; -+ const reiser4_key *from_key; -+ const reiser4_key *to_key; -+ reiser4_key *smallest_removed; -+ carry_plugin_info *info; -+ __u32 flags; -+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */ -+ lock_handle *left; -+ lock_handle *right; -+}; -+ -+struct carry_cut_data; -+struct carry_kill_data; -+ -+/* The responsibility of the node plugin is to store and give access -+ to the sequence of items within the node. */ -+typedef struct node_plugin { -+ /* generic plugin fields */ -+ plugin_header h; -+ -+ /* calculates the amount of space that will be required to store an -+ item which is in addition to the space consumed by the item body. -+ (the space consumed by the item body can be gotten by calling -+ item->estimate) */ -+ size_t(*item_overhead) (const znode * node, flow_t * f); -+ -+ /* returns free space by looking into node (i.e., without using -+ znode->free_space). */ -+ size_t(*free_space) (znode * node); -+ /* search within the node for the one item which might -+ contain the key, invoking item->search_within to search within -+ that item to see if it is in there */ -+ node_search_result(*lookup) (znode * node, const reiser4_key * key, -+ lookup_bias bias, coord_t * coord); -+ /* number of items in node */ -+ int (*num_of_items) (const znode * node); -+ -+ /* store information about item in @coord in @data */ -+ /* break into several node ops, don't add any more uses of this before doing so */ -+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */ -+ char *(*item_by_coord) (const coord_t * coord); -+ int (*length_by_coord) (const coord_t * coord); -+ item_plugin *(*plugin_by_coord) (const coord_t * coord); -+ -+ /* store item key in @key */ -+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key); -+ /* conservatively estimate what size of unit can fit -+ into the node. This estimation should be performed without -+ actually looking into the node's content (free space is saved in -+ znode). */ -+ size_t(*estimate) (znode * node); -+ -+ /* performs every consistency check the node plugin author could -+ imagine. Optional. */ -+ int (*check) (const znode * node, __u32 flags, const char **error); -+ -+ /* Called when node is read into memory and node plugin is -+ already detected. This should read some data into znode (like free -+ space counter) and, optionally, check data consistency. -+ */ -+ int (*parse) (znode * node); -+ /* This method is called on a new node to initialise plugin specific -+ data (header, etc.) */ -+ int (*init) (znode * node); -+ /* Check whether @node content conforms to this plugin format. -+ Probably only useful after support for old V3.x formats is added. -+ Uncomment after 4.0 only. 
-+ */ -+ /* int ( *guess )( const znode *node ); */ -+#if REISER4_DEBUG -+ void (*print) (const char *prefix, const znode * node, __u32 flags); -+#endif -+ /* change size of @item by @by bytes. @item->node has enough free -+ space. When @by > 0 - free space is appended to end of item. When -+ @by < 0 - item is truncated - it is assumed that last @by bytes of -+ the item are freed already */ -+ void (*change_item_size) (coord_t * item, int by); -+ -+ /* create new item @length bytes long in coord @target */ -+ int (*create_item) (coord_t * target, const reiser4_key * key, -+ reiser4_item_data * data, carry_plugin_info * info); -+ -+ /* update key of item. */ -+ void (*update_item_key) (coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info); -+ -+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *); -+ int (*cut) (struct carry_cut_data *, carry_plugin_info *); -+ -+ /* -+ * shrink item pointed to by @coord by @delta bytes. -+ */ -+ int (*shrink_item) (coord_t * coord, int delta); -+ -+ /* copy as much as possible but not more than up to @stop from -+ @stop->node to @target. If (pend == append) then data from beginning of -+ @stop->node are copied to the end of @target. If (pend == prepend) then -+ data from the end of @stop->node are copied to the beginning of -+ @target. Copied data are removed from @stop->node. Information -+ about what to do on upper level is stored in @todo */ -+ int (*shift) (coord_t * stop, znode * target, shift_direction pend, -+ int delete_node, int including_insert_coord, -+ carry_plugin_info * info); -+ /* return true if this node allows skipping carry() in some situations -+ (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format -+ emulation doesn't. -+ -+ This will speed up insertions that don't require updates to the -+ parent, by bypassing initialisation of carry() structures. It's -+ believed that the majority of insertions will fit there. -+ -+ */ -+ int (*fast_insert) (const coord_t * coord); -+ int (*fast_paste) (const coord_t * coord); -+ int (*fast_cut) (const coord_t * coord); -+ /* this limits the max size of item which can be inserted into a node and -+ the number of bytes an item in a node may be appended with */ -+ int (*max_item_size) (void); -+ int (*prepare_removal) (znode * empty, carry_plugin_info * info); -+ /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular -+ * files */ -+ int (*set_item_plugin) (coord_t * coord, item_id); -+} node_plugin; -+ -+typedef enum { -+ /* standard unified node layout used for both leaf and internal -+ nodes */ -+ NODE40_ID, -+ LAST_NODE_ID -+} reiser4_node_id; -+ -+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key); -+#if REISER4_DEBUG -+extern void print_node_content(const char *prefix, const znode * node, -+ __u32 flags); -+#endif -+ -+extern void indent_znode(const znode * node); -+ -+typedef struct common_node_header { -+ /* -+ * identifier of node plugin. Must be located at the very beginning of -+ * a node. 
-+ */ -+ __le16 plugin_id; -+} common_node_header; -+ -+/* __REISER4_NODE_H__ */ -+#endif -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/object.c linux-2.6.30/fs/reiser4/plugin/object.c ---- linux-2.6.30.orig/fs/reiser4/plugin/object.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/object.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,531 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * Examples of object plugins: file, directory, symlink, special file. -+ * -+ * Plugins associated with inode: -+ * -+ * Plugin of inode is plugin referenced by plugin-id field of on-disk -+ * stat-data. How we store this plugin in in-core inode is not -+ * important. Currently pointers are used, another variant is to store offsets -+ * and do array lookup on each access. -+ * -+ * Now, each inode has one selected plugin: object plugin that -+ * determines what type of file this object is: directory, regular etc. -+ * -+ * This main plugin can use other plugins that are thus subordinated to -+ * it. Directory instance of object plugin uses hash; regular file -+ * instance uses tail policy plugin. -+ * -+ * Object plugin is either taken from id in stat-data or guessed from -+ * i_mode bits. Once it is established we ask it to install its -+ * subordinate plugins, by looking again in stat-data or inheriting them -+ * from parent. -+ * -+ * How new inode is initialized during ->read_inode(): -+ * 1 read stat-data and initialize inode fields: i_size, i_mode, -+ * i_generation, capabilities etc. -+ * 2 read plugin id from stat data or try to guess plugin id -+ * from inode->i_mode bits if plugin id is missing. -+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields. -+ * -+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What -+ * if stat data does contain i_size, etc., due to it being an unusual plugin? -+ * -+ * 4 Call ->activate() method of object's plugin. Plugin is either read -+ * from stat-data or guessed from mode bits -+ * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized -+ * plugins from parent. -+ * -+ * Easy induction proves that after the last step all plugins of the inode will be -+ * initialized. -+ * -+ * When creating new object: -+ * 1 obtain object plugin id (see next period) -+ * NIKITA-FIXME-HANS: period? -+ * 2 ->install() this plugin -+ * 3 ->inherit() the rest from the parent -+ * -+ * We need some examples of creating an object with default and non-default -+ * plugin ids. Nikita, please create them. 
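For step 2 of the ->read_inode() sequence above, the guess amounts to a switch over the i_mode file-type bits. A hypothetical user-space rendering (the TOY_* names loosely mirror the reiser4_file_id enum defined later in this patch; none of this is the kernel code):

    #include <stdio.h>
    #include <sys/stat.h>

    enum toy_file_id {
            TOY_UNIX_FILE_ID,       /* regular file */
            TOY_DIRECTORY_ID,
            TOY_SYMLINK_ID,
            TOY_SPECIAL_ID          /* fifo, device or socket */
    };

    /* Step 2 above: no plugin id in stat-data, so fall back to i_mode. */
    static enum toy_file_id guess_plugin_id(mode_t mode)
    {
            if (S_ISREG(mode))
                    return TOY_UNIX_FILE_ID;
            if (S_ISDIR(mode))
                    return TOY_DIRECTORY_ID;
            if (S_ISLNK(mode))
                    return TOY_SYMLINK_ID;
            return TOY_SPECIAL_ID;
    }

    int main(void)
    {
            printf("plugin id for a directory: %d\n",
                   guess_plugin_id(S_IFDIR | 0755));
            return 0;
    }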
-+ */ -+ -+#include "../inode.h" -+ -+static int _bugop(void) -+{ -+ BUG_ON(1); -+ return 0; -+} -+ -+#define bugop ((void *)_bugop) -+ -+static int _dummyop(void) -+{ -+ return 0; -+} -+ -+#define dummyop ((void *)_dummyop) -+ -+static int change_file(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change object plugin of already existing object */ -+ if (memb == PSET_FILE) -+ return RETERR(-EINVAL); -+ -+ /* Change PSET_CREATE */ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin); -+} -+ -+static reiser4_plugin_ops file_plugin_ops = { -+ .change = change_file -+}; -+ -+static struct inode_operations null_i_ops = {.create = NULL}; -+static struct file_operations null_f_ops = {.owner = NULL}; -+static struct address_space_operations null_a_ops = {.writepage = NULL}; -+ -+/* VFS methods for regular files */ -+static struct inode_operations regular_file_i_ops = { -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr, -+ .getattr = reiser4_getattr_common -+}; -+static struct file_operations regular_file_f_ops = { -+ .llseek = generic_file_llseek, -+ .read = reiser4_read_careful, -+ .write = reiser4_write_careful, -+ .aio_read = generic_file_aio_read, -+ .ioctl = reiser4_ioctl_careful, -+ .mmap = reiser4_mmap_careful, -+ .open = reiser4_open_careful, -+ .release = reiser4_release_careful, -+ .fsync = reiser4_sync_file_common, -+ .splice_read = generic_file_splice_read, -+ .splice_write = generic_file_splice_write -+}; -+static struct address_space_operations regular_file_a_ops = { -+ .writepage = reiser4_writepage, -+ .readpage = reiser4_readpage, -+ .sync_page = block_sync_page, -+ .writepages = reiser4_writepages, -+ .set_page_dirty = reiser4_set_page_dirty, -+ .readpages = reiser4_readpages, -+ .write_begin = reiser4_write_begin_careful, -+ .write_end = reiser4_write_end_careful, -+ .bmap = reiser4_bmap_careful, -+ .invalidatepage = reiser4_invalidatepage, -+ .releasepage = reiser4_releasepage -+}; -+ -+/* VFS methods for symlink files */ -+static struct inode_operations symlink_file_i_ops = { -+ .readlink = generic_readlink, -+ .follow_link = reiser4_follow_link_common, -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+}; -+ -+/* VFS methods for special files */ -+static struct inode_operations special_file_i_ops = { -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+}; -+ -+/* VFS methods for directories */ -+static struct inode_operations directory_i_ops = { -+ .create = reiser4_create_common, -+ .lookup = reiser4_lookup_common, -+ .link = reiser4_link_common, -+ .unlink = reiser4_unlink_common, -+ .symlink = reiser4_symlink_common, -+ .mkdir = reiser4_mkdir_common, -+ .rmdir = reiser4_unlink_common, -+ .mknod = reiser4_mknod_common, -+ .rename = reiser4_rename_common, -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+}; -+static struct file_operations directory_f_ops = { -+ .llseek = reiser4_llseek_dir_common, -+ .read = generic_read_dir, -+ .readdir = reiser4_readdir_common, -+ .release = reiser4_release_dir_common, -+ .fsync = reiser4_sync_common -+}; -+static struct address_space_operations directory_a_ops = { -+ .writepage = bugop, -+ .sync_page = bugop, -+ .writepages = dummyop, -+ .set_page_dirty = bugop, -+ .readpages = bugop, -+ .write_begin = bugop, -+ .write_end = bugop, -+ .bmap = bugop, 
-+ .invalidatepage = bugop, -+ .releasepage = bugop -+}; -+ -+/* -+ * Definitions of object plugins. -+ */ -+ -+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = { -+ [UNIX_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_REGULAR_FILE), -+ .pops = &file_plugin_ops, -+ .label = "reg", -+ .desc = "regular file", -+ .linkage = {NULL, NULL}, -+ }, -+ /* -+ * invariant vfs ops -+ */ -+ .inode_ops = ®ular_file_i_ops, -+ .file_ops = ®ular_file_f_ops, -+ .as_ops = ®ular_file_a_ops, -+ /* -+ * private i_ops -+ */ -+ .setattr = setattr_unix_file, -+ .open = open_unix_file, -+ .read = read_unix_file, -+ .write = write_unix_file, -+ .ioctl = ioctl_unix_file, -+ .mmap = mmap_unix_file, -+ .release = release_unix_file, -+ /* -+ * private f_ops -+ */ -+ .readpage = readpage_unix_file, -+ .readpages = readpages_unix_file, -+ .writepages = writepages_unix_file, -+ .write_begin = write_begin_unix_file, -+ .write_end = write_end_unix_file, -+ /* -+ * private a_ops -+ */ -+ .bmap = bmap_unix_file, -+ /* -+ * other private methods -+ */ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = flow_by_inode_unix_file, -+ .key_by_inode = key_by_inode_and_offset_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_object_common, -+ .delete_object = delete_object_unix_file, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_unix_file, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_data_unix_file, -+ .cut_tree_worker = cut_tree_worker_common, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [DIRECTORY_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = DIRECTORY_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_DIRECTORY_FILE), -+ .pops = &file_plugin_ops, -+ .label = "dir", -+ .desc = "directory", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &null_i_ops, -+ .file_ops = &null_f_ops, -+ .as_ops = &null_a_ops, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = bugop, -+ .key_by_inode = bugop, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common_dir, -+ .create_object = reiser4_create_object_common, -+ .delete_object = reiser4_delete_dir_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = rem_link_common_dir, -+ .owns_item = owns_item_common_dir, -+ .can_add_link = can_add_link_common, -+ .can_rem_link = can_rem_link_common_dir, -+ .detach = reiser4_detach_common_dir, -+ .bind = reiser4_bind_common_dir, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common_dir, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common_dir -+ }, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ }, -+ [SYMLINK_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = SYMLINK_FILE_PLUGIN_ID, -+ 
.groups = (1 << REISER4_SYMLINK_FILE), -+ .pops = &file_plugin_ops, -+ .label = "symlink", -+ .desc = "symbolic link", -+ .linkage = {NULL,NULL} -+ }, -+ .inode_ops = &symlink_file_i_ops, -+ /* inode->i_fop of symlink is initialized -+ by NULL in setup_inode_ops */ -+ .file_ops = &null_f_ops, -+ .as_ops = &null_a_ops, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_symlink, -+ .delete_object = reiser4_delete_object_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ .destroy_inode = destroy_inode_symlink, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [SPECIAL_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = SPECIAL_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_SPECIAL_FILE), -+ .pops = &file_plugin_ops, -+ .label = "special", -+ .desc = -+ "special: fifo, device or socket", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &special_file_i_ops, -+ /* file_ops of special files (sockets, block, char, fifo) are -+ initialized by init_special_inode. */ -+ .file_ops = &null_f_ops, -+ .as_ops = &null_a_ops, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_object_common, -+ .delete_object = reiser4_delete_object_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_REGULAR_FILE), -+ .pops = &file_plugin_ops, -+ .label = "cryptcompress", -+ .desc = "cryptcompress file", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = ®ular_file_i_ops, -+ .file_ops = ®ular_file_f_ops, -+ .as_ops = ®ular_file_a_ops, -+ -+ .setattr = setattr_cryptcompress, -+ .open = open_cryptcompress, -+ .read = read_cryptcompress, -+ .write = write_cryptcompress, -+ .ioctl = ioctl_cryptcompress, -+ .mmap = mmap_cryptcompress, -+ .release = release_cryptcompress, -+ -+ .readpage = readpage_cryptcompress, -+ .readpages = readpages_cryptcompress, -+ .writepages = writepages_cryptcompress, -+ .write_begin = write_begin_cryptcompress, -+ .write_end = write_end_cryptcompress, -+ -+ .bmap = bmap_cryptcompress, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = flow_by_inode_cryptcompress, -+ .key_by_inode = 
key_by_inode_cryptcompress, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_cryptcompress, -+ .create_object = create_object_cryptcompress, -+ .delete_object = delete_object_cryptcompress, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_data_cryptcompress, -+ .cut_tree_worker = cut_tree_worker_cryptcompress, -+ .destroy_inode = destroy_inode_cryptcompress, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ } -+}; -+ -+static int change_dir(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change dir plugin of already existing object */ -+ return RETERR(-EINVAL); -+} -+ -+static reiser4_plugin_ops dir_plugin_ops = { -+ .change = change_dir -+}; -+ -+/* -+ * definition of directory plugins -+ */ -+ -+dir_plugin dir_plugins[LAST_DIR_ID] = { -+ /* standard hashed directory plugin */ -+ [HASHED_DIR_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .id = HASHED_DIR_PLUGIN_ID, -+ .pops = &dir_plugin_ops, -+ .label = "dir", -+ .desc = "hashed directory", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &directory_i_ops, -+ .file_ops = &directory_f_ops, -+ .as_ops = &directory_a_ops, -+ -+ .get_parent = get_parent_common, -+ .is_name_acceptable = is_name_acceptable_common, -+ .build_entry_key = build_entry_key_hashed, -+ .build_readdir_key = build_readdir_key_common, -+ .add_entry = reiser4_add_entry_common, -+ .rem_entry = reiser4_rem_entry_common, -+ .init = reiser4_dir_init_common, -+ .done = reiser4_dir_done_common, -+ .attach = reiser4_attach_common, -+ .detach = reiser4_detach_common, -+ .estimate = { -+ .add_entry = estimate_add_entry_common, -+ .rem_entry = estimate_rem_entry_common, -+ .unlink = dir_estimate_unlink_common -+ } -+ }, -+ /* hashed directory for which seekdir/telldir are guaranteed to -+ * work. Brain-damage. */ -+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID, -+ .pops = &dir_plugin_ops, -+ .label = "dir32", -+ .desc = "directory hashed with 31 bit hash", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &directory_i_ops, -+ .file_ops = &directory_f_ops, -+ .as_ops = &directory_a_ops, -+ -+ .get_parent = get_parent_common, -+ .is_name_acceptable = is_name_acceptable_common, -+ .build_entry_key = build_entry_key_seekable, -+ .build_readdir_key = build_readdir_key_common, -+ .add_entry = reiser4_add_entry_common, -+ .rem_entry = reiser4_rem_entry_common, -+ .init = reiser4_dir_init_common, -+ .done = reiser4_dir_done_common, -+ .attach = reiser4_attach_common, -+ .detach = reiser4_detach_common, -+ .estimate = { -+ .add_entry = estimate_add_entry_common, -+ .rem_entry = estimate_rem_entry_common, -+ .unlink = dir_estimate_unlink_common -+ } -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/object.h linux-2.6.30/fs/reiser4/plugin/object.h ---- linux-2.6.30.orig/fs/reiser4/plugin/object.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/object.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,117 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of object plugin functions. */ -+ -+#if !defined(__FS_REISER4_PLUGIN_OBJECT_H__) -+#define __FS_REISER4_PLUGIN_OBJECT_H__ -+ -+#include "../type_safe_hash.h" -+ -+/* common implementations of inode operations */ -+int reiser4_create_common(struct inode *parent, struct dentry *dentry, -+ int mode, struct nameidata *); -+struct dentry *reiser4_lookup_common(struct inode *parent, -+ struct dentry *dentry, -+ struct nameidata *nameidata); -+int reiser4_link_common(struct dentry *existing, struct inode *parent, -+ struct dentry *newname); -+int reiser4_unlink_common(struct inode *parent, struct dentry *victim); -+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode); -+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, -+ const char *linkname); -+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, -+ int mode, dev_t rdev); -+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name, -+ struct inode *new_dir, struct dentry *new_name); -+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data); -+int reiser4_permission_common(struct inode *, int mask); -+int reiser4_setattr_common(struct dentry *, struct iattr *); -+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *, -+ struct kstat *); -+ -+/* common implementations of file operations */ -+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin); -+int reiser4_readdir_common(struct file *, void *dirent, filldir_t); -+int reiser4_release_dir_common(struct inode *, struct file *); -+int reiser4_sync_common(struct file *, struct dentry *, int datasync); -+ -+ -+/* file plugin operations: common implementations */ -+int write_sd_by_inode_common(struct inode *); -+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *); -+int set_plug_in_inode_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int adjust_to_parent_common(struct inode *object, struct inode *parent, -+ struct inode *root); -+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent, -+ struct inode *root); -+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent, -+ struct inode *root); -+int reiser4_create_object_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int reiser4_delete_object_common(struct inode *); -+int reiser4_delete_dir_common(struct inode *); -+int reiser4_add_link_common(struct inode *object, struct inode *parent); -+int reiser4_rem_link_common(struct inode *object, struct inode *parent); -+int rem_link_common_dir(struct inode *object, struct inode *parent); -+int owns_item_common(const struct inode *, const coord_t *); -+int owns_item_common_dir(const struct inode *, const coord_t *); -+int can_add_link_common(const struct inode *); -+int can_rem_link_common_dir(const struct inode *); -+int reiser4_detach_common_dir(struct inode *child, struct inode *parent); -+int reiser4_bind_common_dir(struct 
inode *child, struct inode *parent); -+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value); -+reiser4_block_nr estimate_create_common(const struct inode *); -+reiser4_block_nr estimate_create_common_dir(const struct inode *); -+reiser4_block_nr estimate_update_common(const struct inode *); -+reiser4_block_nr estimate_unlink_common(const struct inode *, -+ const struct inode *); -+reiser4_block_nr estimate_unlink_common_dir(const struct inode *, -+ const struct inode *); -+char *wire_write_common(struct inode *, char *start); -+char *wire_read_common(char *addr, reiser4_object_on_wire *); -+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *); -+int wire_size_common(struct inode *); -+void wire_done_common(reiser4_object_on_wire *); -+ -+/* dir plugin operations: common implementations */ -+struct dentry *get_parent_common(struct inode *child); -+int is_name_acceptable_common(const struct inode *, const char *name, int len); -+void build_entry_key_common(const struct inode *, -+ const struct qstr *qname, reiser4_key *); -+int build_readdir_key_common(struct file *dir, reiser4_key *); -+int reiser4_add_entry_common(struct inode *object, struct dentry *where, -+ reiser4_object_create_data * , reiser4_dir_entry_desc *); -+int reiser4_rem_entry_common(struct inode *object, struct dentry *where, -+ reiser4_dir_entry_desc *); -+int reiser4_dir_init_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int reiser4_dir_done_common(struct inode *); -+int reiser4_attach_common(struct inode *child, struct inode *parent); -+int reiser4_detach_common(struct inode *object, struct inode *parent); -+reiser4_block_nr estimate_add_entry_common(const struct inode *); -+reiser4_block_nr estimate_rem_entry_common(const struct inode *); -+reiser4_block_nr dir_estimate_unlink_common(const struct inode *, -+ const struct inode *); -+ -+/* these are essential parts of common implementations, they are to make -+ customized implementations easier */ -+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to); -+ -+/* merely useful functions */ -+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle * , -+ const reiser4_key * , int silent); -+ -+/* __FS_REISER4_PLUGIN_OBJECT_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/plugin.c linux-2.6.30/fs/reiser4/plugin/plugin.c ---- linux-2.6.30.orig/fs/reiser4/plugin/plugin.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/plugin.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,560 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Basic plugin infrastructure, lookup etc. */ -+ -+/* PLUGINS: -+ -+ Plugins are internal Reiser4 "modules" or "objects" used to increase -+ extensibility and allow external users to easily adapt reiser4 to -+ their needs. -+ -+ Plugins are classified into several disjoint "types". Plugins -+ belonging to the particular plugin type are termed "instances" of -+ this type. Existing types are listed by enum reiser4_plugin_type -+ (see plugin/plugin_header.h) -+ -+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency -+ -+ Object (file) plugin determines how given file-system object serves -+ standard VFS requests for read, write, seek, mmap etc. 
Instances of -+ file plugins are: regular file, directory, symlink. Another example -+ of a file plugin is the audit plugin, which optionally records accesses to -+ underlying object and forwards requests to it. -+ -+ Hash plugins compute hashes used by reiser4 to store and locate -+ files within directories. Instances of hash plugin type are: r5, -+ tea, rupasov. -+ -+ Tail plugins (or, more precisely, tail policy plugins) determine -+ when last part of the file should be stored in a formatted item. -+ -+ Scope and lookup: -+ -+ Each plugin type and each plugin has a label such that the pair ( type_label, plugin_label ) is unique. This -+ pair is a globally persistent and user-visible plugin -+ identifier. Internally the kernel maintains plugins and plugin types in -+ arrays using an index into those arrays as plugin and plugin type -+ identifiers. The file-system, in turn, also maintains a persistent -+ "dictionary" which maps plugin labels to the numerical -+ identifiers which are stored in file-system objects. That is, we -+ store the offset into the plugin array for that plugin type as the -+ plugin id in the stat data of the filesystem object. -+ -+ Internal kernel plugin type identifier (index in plugins[] array) is -+ of type reiser4_plugin_type. Set of available plugin types is -+ currently static, but dynamic loading doesn't seem to pose -+ insurmountable problems. -+ -+ Within each type plugins are addressed by the identifiers of type -+ reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]). -+ Such identifiers are only required to be unique within one type, -+ not globally. -+ -+ Thus, a plugin in memory is uniquely identified by the pair (type_id, -+ id). -+ -+ Usage: -+ -+ There exists only one instance of each plugin, but this -+ single instance can be associated with many entities (file-system -+ objects, items, nodes, transactions, file-descriptors etc.). The entity -+ to which a plugin of a given type is attached is termed (due to the lack of -+ imagination) the "subject" of this plugin type and, by abuse of -+ terminology, subject of particular instance of this type to which -+ it's attached currently. For example, inode is subject of object -+ plugin type. Inode representing directory is subject of directory -+ plugin, hash plugin type and some particular instance of hash plugin -+ type. An inode representing a regular file is subject of "regular file" -+ plugin, tail-policy plugin type etc. -+ -+ With each subject the plugin possibly stores some state. For example, -+ the state of a directory plugin (instance of object plugin type) is a pointer -+ to a hash plugin (if directories always use hashing that is). -+ -+ Interface: -+ -+ In addition to a scalar identifier, each plugin type and plugin -+ proper has a "label": short string and a "description"---longer -+ descriptive string. Labels and descriptions of plugin types are -+ hard-coded into plugins[] array, declared and defined in -+ plugin.c. Label and description of plugin are stored in .label and -+ .desc fields of reiser4_plugin_header respectively. It's possible to -+ locate plugin by the pair of labels. -+ -+ Features (not implemented): -+ -+ . user-level plugin manipulations: -+ + reiser4("filename/..file_plugin<='audit'"); -+ + write(open("filename/..file_plugin"), "audit", 8); -+ -+ . user level utilities lsplug and chplug to manipulate plugins. -+ Utilities are not of primary priority. Possibly they will not be -+ working on v4.0 -+ -+ NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount -+ option, do you agree? 
I don't think that specifying it at mount time, -+ and then changing it with each mount, is a good model for usage. -+ -+ . mount option "plug" to set-up plugins of root-directory. -+ "plug=foo:bar" will set "bar" as default plugin of type "foo". -+ -+ Limitations: -+ -+ . each plugin type has to provide at least one builtin -+ plugin. This is a technical limitation and it can be lifted in the -+ future. -+ -+ TODO: -+ -+ New plugin types/plugins: -+ Things we should be able to separately choose to inherit: -+ -+ security plugins -+ -+ stat data -+ -+ file bodies -+ -+ file plugins -+ -+ dir plugins -+ -+ . perm:acl -+ -+ . audi---audit plugin intercepting and possibly logging all -+ accesses to object. Requires putting stub functions in file_operations -+ instead of generic_file_*. -+ -+NIKITA-FIXME-HANS: why make overflows a plugin? -+ . over---handle hash overflows -+ -+ . sqnt---handle different access patterns and instruments read-ahead -+ -+NIKITA-FIXME-HANS: describe the line below in more detail. -+ -+ . hier---handle inheritance of plugins along file-system hierarchy -+ -+ Different kinds of inheritance: on creation vs. on access. -+ Compatible/incompatible plugins. -+ Inheritance for multi-linked files. -+ Layered plugins. -+ Notion of plugin context is abandoned. -+ -+Each file is associated -+ with one plugin and dependent plugins (hash, etc.) are stored as -+ main plugin state. Now, if we have plugins used for regular files -+ but not for directories, how would such plugins be inherited? -+ . always store them with directories also -+ -+NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing -+the line below which is also useful. -+ -+ . use inheritance hierarchy, independent of file-system namespace -+*/ -+ -+#include "../debug.h" -+#include "../dformat.h" -+#include "plugin_header.h" -+#include "item/static_stat.h" -+#include "node/node.h" -+#include "security/perm.h" -+#include "space/space_allocator.h" -+#include "disk_format/disk_format.h" -+#include "plugin.h" -+#include "../reiser4.h" -+#include "../jnode.h" -+#include "../inode.h" -+ -+#include <linux/fs.h> /* for struct super_block */ -+ -+/* -+ * init_plugins - initialize plugin sub-system. -+ * Just call this once on reiser4 startup. -+ * -+ * Initializes plugin sub-system. It is part of reiser4 module -+ * initialization. For each plugin of each type the init method is called and each -+ * plugin is put into the list of plugins. -+ */ -+int init_plugins(void) -+{ -+ reiser4_plugin_type type_id; -+ -+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) { -+ struct reiser4_plugin_type_data *ptype; -+ int i; -+ -+ ptype = &plugins[type_id]; -+ assert("nikita-3508", ptype->label != NULL); -+ assert("nikita-3509", ptype->type_id == type_id); -+ -+ INIT_LIST_HEAD(&ptype->plugins_list); -+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term -+ * builtin. 
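The registration loop that resumes just below is the heart of init_plugins(): walk every type, walk that type's builtin array, take the array index as the plugin id, and chain the plugin into the type's list. A compressed user-space model of the same pass, with toy types and hypothetical names throughout:

    #include <stdio.h>

    struct toy_plugin {
            const char *label;      /* NULL marks an uninitialized slot */
            int id;
            struct toy_plugin *next;
    };

    struct toy_type {
            struct toy_plugin *builtin;
            int builtin_num;
            struct toy_plugin *list;    /* stands in for plugins_list */
    };

    static void toy_init_plugins(struct toy_type *types, int ntypes)
    {
            for (int t = 0; t < ntypes; t++) {
                    /* iterate backwards and prepend, so the list ends up
                     * in index order, as list_add_tail() produces */
                    for (int i = types[t].builtin_num - 1; i >= 0; i--) {
                            struct toy_plugin *p = &types[t].builtin[i];

                            if (p->label == NULL)
                                    continue;
                            p->id = i;          /* id == array index */
                            p->next = types[t].list;
                            types[t].list = p;
                    }
            }
    }

    int main(void)
    {
            struct toy_plugin hashes[] = { {"r5"}, {"tea"}, {"rupasov"} };
            struct toy_type types[] = { { hashes, 3, NULL } };

            toy_init_plugins(types, 1);
            for (struct toy_plugin *p = types[0].list; p != NULL; p = p->next)
                    printf("hash plugin %d: %s\n", p->id, p->label);
            return 0;
    }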
*/ -+ for (i = 0; i < ptype->builtin_num; ++i) { -+ reiser4_plugin *plugin; -+ -+ plugin = plugin_at(ptype, i); -+ -+ if (plugin->h.label == NULL) -+ /* uninitialized slot encountered */ -+ continue; -+ assert("nikita-3445", plugin->h.type_id == type_id); -+ plugin->h.id = i; -+ if (plugin->h.pops != NULL && -+ plugin->h.pops->init != NULL) { -+ int result; -+ -+ result = plugin->h.pops->init(plugin); -+ if (result != 0) -+ return result; -+ } -+ INIT_LIST_HEAD(&plugin->h.linkage); -+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list); -+ } -+ } -+ return 0; -+} -+ -+/* true if plugin type id is valid */ -+int is_plugin_type_valid(reiser4_plugin_type type) -+{ -+ /* "type" is unsigned, so no comparison with 0 is -+ necessary */ -+ return (type < REISER4_PLUGIN_TYPES); -+} -+ -+/* true if plugin id is valid */ -+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id) -+{ -+ assert("nikita-1653", is_plugin_type_valid(type)); -+ return id < plugins[type].builtin_num; -+} -+ -+/* return plugin by its @type and @id. -+ -+ Both arguments are checked for validness: this is supposed to be called -+ from user-level. -+ -+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in -+user space, and passed to the filesystem by use of method files? Your -+comment really confused me on the first reading.... -+ -+*/ -+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type -+ * unchecked */, -+ reiser4_plugin_id id /* plugin id, -+ * unchecked */) -+{ -+ if (is_plugin_type_valid(type)) { -+ if (is_plugin_id_valid(type, id)) -+ return plugin_at(&plugins[type], id); -+ else -+ /* id out of bounds */ -+ warning("nikita-2913", -+ "Invalid plugin id: [%i:%i]", type, id); -+ } else -+ /* type_id out of bounds */ -+ warning("nikita-2914", "Invalid type_id: %i", type); -+ return NULL; -+} -+ -+/** -+ * save_plugin_id - store plugin id in disk format -+ * @plugin: plugin to convert -+ * @area: where to store result -+ * -+ * Puts id of @plugin in little endian format to address @area. -+ */ -+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ , -+ d16 * area/* where to store result */) -+{ -+ assert("nikita-1261", plugin != NULL); -+ assert("nikita-1262", area != NULL); -+ -+ put_unaligned(cpu_to_le16(plugin->h.id), area); -+ return 0; -+} -+ -+/* list of all plugins of given type */ -+struct list_head *get_plugin_list(reiser4_plugin_type type) -+{ -+ assert("nikita-1056", is_plugin_type_valid(type)); -+ return &plugins[type].plugins_list; -+} -+ -+static void update_pset_mask(reiser4_inode * info, pset_member memb) -+{ -+ struct dentry *rootdir; -+ reiser4_inode *root; -+ -+ assert("edward-1443", memb != PSET_FILE); -+ -+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root; -+ if (rootdir != NULL) { -+ root = reiser4_inode_data(rootdir->d_inode); -+ /* -+ * if inode is different from the default one, or we are -+ * changing plugin of root directory, update plugin_mask -+ */ -+ if (aset_get(info->pset, memb) != -+ aset_get(root->pset, memb) || -+ info == root) -+ info->plugin_mask |= (1 << memb); -+ else -+ info->plugin_mask &= ~(1 << memb); -+ } -+} -+ -+/* Get specified plugin set member from parent, -+ or from fs-defaults (if no parent is given) and -+ install the result to pset of @self */ -+int grab_plugin_pset(struct inode *self, -+ struct inode *ancestor, -+ pset_member memb) -+{ -+ reiser4_plugin *plug; -+ reiser4_inode *info; -+ int result = 0; -+ -+ /* Do not grab if initialised already. 
*/ -+ info = reiser4_inode_data(self); -+ if (aset_get(info->pset, memb) != NULL) -+ return 0; -+ if (ancestor) { -+ reiser4_inode *parent; -+ -+ parent = reiser4_inode_data(ancestor); -+ plug = aset_get(parent->hset, memb) ? : -+ aset_get(parent->pset, memb); -+ } else -+ plug = get_default_plugin(memb); -+ -+ result = set_plugin(&info->pset, memb, plug); -+ if (result == 0) { -+ if (!ancestor || self->i_sb->s_root->d_inode != self) -+ update_pset_mask(info, memb); -+ } -+ return result; -+} -+ -+/* Take missing pset members from root inode */ -+int finish_pset(struct inode *inode) -+{ -+ reiser4_plugin *plug; -+ reiser4_inode *root; -+ reiser4_inode *info; -+ pset_member memb; -+ int result = 0; -+ -+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode); -+ info = reiser4_inode_data(inode); -+ -+ assert("edward-1455", root != NULL); -+ assert("edward-1456", info != NULL); -+ -+ /* file and directory plugins are already initialized. */ -+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) { -+ -+ /* Do not grab if initialised already. */ -+ if (aset_get(info->pset, memb) != NULL) -+ continue; -+ -+ plug = aset_get(root->pset, memb); -+ result = set_plugin(&info->pset, memb, plug); -+ if (result != 0) -+ break; -+ } -+ if (result != 0) { -+ warning("nikita-3447", -+ "Cannot set up plugins for %lli", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ } -+ return result; -+} -+ -+int force_plugin_pset(struct inode *self, pset_member memb, -+ reiser4_plugin * plug) -+{ -+ reiser4_inode *info; -+ int result = 0; -+ -+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) { -+ /* Changing pset in the root object. */ -+ return RETERR(-EINVAL); -+ } -+ -+ info = reiser4_inode_data(self); -+ if (plug->h.pops != NULL && plug->h.pops->change != NULL) -+ result = plug->h.pops->change(self, plug, memb); -+ else -+ result = aset_set_unsafe(&info->pset, memb, plug); -+ if (result == 0) { -+ __u16 oldmask = info->plugin_mask; -+ -+ update_pset_mask(info, memb); -+ if (oldmask != info->plugin_mask) -+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN); -+ } -+ return result; -+} -+ -+struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = { -+ /* C90 initializers */ -+ [REISER4_FILE_PLUGIN_TYPE] = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .label = "file", -+ .desc = "Object plugins", -+ .builtin_num = sizeof_array(file_plugins), -+ .builtin = file_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(file_plugin) -+ }, -+ [REISER4_DIR_PLUGIN_TYPE] = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .label = "dir", -+ .desc = "Directory plugins", -+ .builtin_num = sizeof_array(dir_plugins), -+ .builtin = dir_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(dir_plugin) -+ }, -+ [REISER4_HASH_PLUGIN_TYPE] = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .label = "hash", -+ .desc = "Directory hashes", -+ .builtin_num = sizeof_array(hash_plugins), -+ .builtin = hash_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(hash_plugin) -+ }, -+ [REISER4_FIBRATION_PLUGIN_TYPE] = { -+ .type_id = -+ REISER4_FIBRATION_PLUGIN_TYPE, -+ .label = "fibration", -+ .desc = "Directory fibrations", -+ .builtin_num = sizeof_array(fibration_plugins), -+ .builtin = fibration_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(fibration_plugin) -+ }, -+ [REISER4_CIPHER_PLUGIN_TYPE] = { -+ .type_id = REISER4_CIPHER_PLUGIN_TYPE, -+ .label = "cipher", -+ .desc = "Cipher plugins", -+ .builtin_num = sizeof_array(cipher_plugins), -+ .builtin = cipher_plugins, -+ .plugins_list = {NULL, 
NULL}, -+ .size = sizeof(cipher_plugin) -+ }, -+ [REISER4_DIGEST_PLUGIN_TYPE] = { -+ .type_id = REISER4_DIGEST_PLUGIN_TYPE, -+ .label = "digest", -+ .desc = "Digest plugins", -+ .builtin_num = sizeof_array(digest_plugins), -+ .builtin = digest_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(digest_plugin) -+ }, -+ [REISER4_COMPRESSION_PLUGIN_TYPE] = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .label = "compression", -+ .desc = "Compression plugins", -+ .builtin_num = sizeof_array(compression_plugins), -+ .builtin = compression_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(compression_plugin) -+ }, -+ [REISER4_FORMATTING_PLUGIN_TYPE] = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .label = "formatting", -+ .desc = "Tail inlining policies", -+ .builtin_num = sizeof_array(formatting_plugins), -+ .builtin = formatting_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(formatting_plugin) -+ }, -+ [REISER4_PERM_PLUGIN_TYPE] = { -+ .type_id = REISER4_PERM_PLUGIN_TYPE, -+ .label = "perm", -+ .desc = "Permission checks", -+ .builtin_num = sizeof_array(perm_plugins), -+ .builtin = perm_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(perm_plugin) -+ }, -+ [REISER4_ITEM_PLUGIN_TYPE] = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .label = "item", -+ .desc = "Item handlers", -+ .builtin_num = sizeof_array(item_plugins), -+ .builtin = item_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(item_plugin) -+ }, -+ [REISER4_NODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_NODE_PLUGIN_TYPE, -+ .label = "node", -+ .desc = "node layout handlers", -+ .builtin_num = sizeof_array(node_plugins), -+ .builtin = node_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(node_plugin) -+ }, -+ [REISER4_SD_EXT_PLUGIN_TYPE] = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .label = "sd_ext", -+ .desc = "Parts of stat-data", -+ .builtin_num = sizeof_array(sd_ext_plugins), -+ .builtin = sd_ext_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(sd_ext_plugin) -+ }, -+ [REISER4_FORMAT_PLUGIN_TYPE] = { -+ .type_id = REISER4_FORMAT_PLUGIN_TYPE, -+ .label = "disk_layout", -+ .desc = "defines filesystem on disk layout", -+ .builtin_num = sizeof_array(format_plugins), -+ .builtin = format_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(disk_format_plugin) -+ }, -+ [REISER4_JNODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .label = "jnode", -+ .desc = "defines kind of jnode", -+ .builtin_num = sizeof_array(jnode_plugins), -+ .builtin = jnode_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(jnode_plugin) -+ }, -+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .label = "compression_mode", -+ .desc = "Defines compression mode", -+ .builtin_num = sizeof_array(compression_mode_plugins), -+ .builtin = compression_mode_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(compression_mode_plugin) -+ }, -+ [REISER4_CLUSTER_PLUGIN_TYPE] = { -+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, -+ .label = "cluster", -+ .desc = "Defines cluster size", -+ .builtin_num = sizeof_array(cluster_plugins), -+ .builtin = cluster_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(cluster_plugin) -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/plugin.h linux-2.6.30/fs/reiser4/plugin/plugin.h ---- 
linux-2.6.30.orig/fs/reiser4/plugin/plugin.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/plugin.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,942 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Basic plugin data-types. -+ see fs/reiser4/plugin/plugin.c for details */ -+ -+#if !defined(__FS_REISER4_PLUGIN_TYPES_H__) -+#define __FS_REISER4_PLUGIN_TYPES_H__ -+ -+#include "../forward.h" -+#include "../debug.h" -+#include "../dformat.h" -+#include "../key.h" -+#include "compress/compress.h" -+#include "crypto/cipher.h" -+#include "plugin_header.h" -+#include "item/static_stat.h" -+#include "item/internal.h" -+#include "item/sde.h" -+#include "item/cde.h" -+#include "item/item.h" -+#include "node/node.h" -+#include "node/node40.h" -+#include "security/perm.h" -+#include "fibration.h" -+ -+#include "space/bitmap.h" -+#include "space/space_allocator.h" -+ -+#include "disk_format/disk_format40.h" -+#include "disk_format/disk_format.h" -+ -+#include <linux/fs.h> /* for struct super_block, address_space */ -+#include <linux/mm.h> /* for struct page */ -+#include <linux/buffer_head.h> /* for struct buffer_head */ -+#include <linux/dcache.h> /* for struct dentry */ -+#include <linux/types.h> -+#include <linux/crypto.h> -+ -+typedef struct reiser4_object_on_wire reiser4_object_on_wire; -+ -+/* -+ * File plugin. Defines the set of methods that file plugins implement, some -+ * of which are optional. -+ * -+ * A file plugin offers to the caller an interface for IO ( writing to and/or -+ * reading from) to what the caller sees as one sequence of bytes. An IO to it -+ * may affect more than one physical sequence of bytes, or no physical sequence -+ * of bytes, it may affect sequences of bytes offered by other file plugins to -+ * the semantic layer, and the file plugin may invoke other plugins and -+ * delegate work to them, but its interface is structured for offering the -+ * caller the ability to read and/or write what the caller sees as being a -+ * single sequence of bytes. -+ * -+ * The file plugin must present a sequence of bytes to the caller, but it does -+ * not necessarily have to store a sequence of bytes, it does not necessarily -+ * have to support efficient tree traversal to any offset in the sequence of -+ * bytes (tail and extent items, whose keys contain offsets, do however provide -+ * efficient non-sequential lookup of any offset in the sequence of bytes). -+ * -+ * Directory plugins provide methods for selecting file plugins by resolving a -+ * name for them. -+ * -+ * The functionality other filesystems call an attribute, and rigidly tie -+ * together, we decompose into orthogonal selectable features of files. Using -+ * the terminology we will define next, an attribute is a perhaps constrained, -+ * perhaps static length, file whose parent has a uni-count-intra-link to it, -+ * which might be grandparent-major-packed, and whose parent has a deletion -+ * method that deletes it. -+ * -+ * File plugins can implement constraints. -+ * -+ * Files can be of variable length (e.g. regular unix files), or of static -+ * length (e.g. static sized attributes). -+ * -+ * An object may have many sequences of bytes, and many file plugins, but, it -+ * has exactly one objectid. It is usually desirable that an object has a -+ * deletion method which deletes every item with that objectid. Items cannot -+ * in general be found by just their objectids. 
This means that an object must -+ * have either a method built into its deletion plugin method for knowing what -+ * items need to be deleted, or links stored with the object that provide the -+ * plugin with a method for finding those items. Deleting a file within an -+ * object may or may not have the effect of deleting the entire object, -+ * depending on the file plugin's deletion method. -+ * -+ * LINK TAXONOMY: -+ * -+ * Many objects have a reference count, and when the reference count reaches 0 -+ * the object's deletion method is invoked. Some links embody a reference -+ * count increase ("countlinks"), and others do not ("nocountlinks"). -+ * -+ * Some links are bi-directional links ("bilinks"), and some are -+ * uni-directional("unilinks"). -+ * -+ * Some links are between parts of the same object ("intralinks"), and some are -+ * between different objects ("interlinks"). -+ * -+ * PACKING TAXONOMY: -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their object's objectid (e.g. unix directory items in plan A), and these are -+ * called "self-major-packed". -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their semantic parent object's objectid (e.g. unix file bodies in plan A), -+ * and these are called "parent-major-packed". -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their semantic grandparent, and these are called "grandparent-major-packed". -+ * Now carefully notice that we run into trouble with key length if we have to -+ * store a 8 byte major+minor grandparent based packing locality, an 8 byte -+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in -+ * a 24 byte key. One of these fields must be sacrificed if an item is to be -+ * grandparent-major-packed, and which to sacrifice is left to the item author -+ * choosing to make the item grandparent-major-packed. You cannot make tail -+ * items and extent items grandparent-major-packed, though you could make them -+ * self-major-packed (usually they are parent-major-packed). -+ * -+ * In the case of ACLs (which are composed of fixed length ACEs which consist -+ * of {subject-type, subject, and permission bitmask} triples), it makes sense -+ * to not have an offset field in the ACE item key, and to allow duplicate keys -+ * for ACEs. Thus, the set of ACES for a given file is found by looking for a -+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in -+ * a directory together), the minor packing locality of ACE, the objectid of -+ * the file, and 0. -+ * -+ * IO involves moving data from one location to another, which means that two -+ * locations must be specified, source and destination. -+ * -+ * This source and destination can be in the filesystem, or they can be a -+ * pointer in the user process address space plus a byte count. -+ * -+ * If both source and destination are in the filesystem, then at least one of -+ * them must be representable as a pure stream of bytes (which we call a flow, -+ * and define as a struct containing a key, a data pointer, and a length). -+ * This may mean converting one of them into a flow. We provide a generic -+ * cast_into_flow() method, which will work for any plugin supporting -+ * read_flow(), though it is inefficiently implemented in that it temporarily -+ * stores the flow in a buffer (Question: what to do with huge flows that -+ * cannot fit into memory? Answer: we must not convert them all at once. 
) -+ * -+ * Performing a write requires resolving the write request into a flow defining -+ * the source, and a method that performs the write, and a key that defines -+ * where in the tree the write is to go. -+ * -+ * Performing a read requires resolving the read request into a flow defining -+ * the target, and a method that performs the read, and a key that defines -+ * where in the tree the read is to come from. -+ * -+ * There will exist file plugins which have no pluginid stored on the disk for -+ * them, and which are only invoked by other plugins. -+ */ -+ -+/* This should be incremented with each new contributed -+ pair (plugin type, plugin id). -+ NOTE: Make sure there is a release of reiser4progs -+ with the corresponding version number */ -+#define PLUGIN_LIBRARY_VERSION 0 -+ -+ /* enumeration of fields within plugin_set */ -+typedef enum { -+ PSET_FILE, -+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first -+ * elements: inode.c:read_inode() depends on -+ * this. */ -+ PSET_PERM, -+ PSET_FORMATTING, -+ PSET_HASH, -+ PSET_FIBRATION, -+ PSET_SD, -+ PSET_DIR_ITEM, -+ PSET_CIPHER, -+ PSET_DIGEST, -+ PSET_COMPRESSION, -+ PSET_COMPRESSION_MODE, -+ PSET_CLUSTER, -+ PSET_CREATE, -+ PSET_LAST -+} pset_member; -+ -+/* builtin file-plugins */ -+typedef enum { -+ /* regular file */ -+ UNIX_FILE_PLUGIN_ID, -+ /* directory */ -+ DIRECTORY_FILE_PLUGIN_ID, -+ /* symlink */ -+ SYMLINK_FILE_PLUGIN_ID, -+ /* for objects completely handled by the VFS: fifos, devices, -+ sockets */ -+ SPECIAL_FILE_PLUGIN_ID, -+ /* regular cryptcompress file */ -+ CRYPTCOMPRESS_FILE_PLUGIN_ID, -+ /* number of file plugins. Used as size of arrays to hold -+ file plugins. */ -+ LAST_FILE_PLUGIN_ID -+} reiser4_file_id; -+ -+typedef struct file_plugin { -+ -+ /* generic fields */ -+ plugin_header h; -+ -+ /* VFS methods. -+ * Must be invariant with respect to plugin conversion. -+ * It can be achieved by using "common" methods, which -+ * are the same for all plugins that take participation in -+ * conversion, or by using "generic" or "careful" methods, -+ * which provide automatic redirection to proper private -+ * plugin methods ("careful" are the same as "generic", -+ * but with protection of pset and other disk structures -+ * from being rebuilt during conversion. -+ */ -+ struct inode_operations * inode_ops; -+ struct file_operations * file_ops; -+ struct address_space_operations * as_ops; -+ /** -+ * Private methods. These are optional. If used they will allow you -+ * to minimize the amount of code needed to implement a deviation -+ * from some other method that also uses them. -+ */ -+ /* -+ * private inode_ops -+ */ -+ int (*setattr)(struct dentry *, struct iattr *); -+ /* -+ * private file_ops -+ */ -+ /* do whatever is necessary to do when object is opened */ -+ int (*open) (struct inode *inode, struct file *file); -+ ssize_t (*read) (struct file *, char __user *buf, size_t read_amount, -+ loff_t *off); -+ /* write as much as possible bytes from nominated @write_amount -+ * before plugin scheduling is occurred. 
Save scheduling state -+ * in @cont */ -+ ssize_t (*write) (struct file *, const char __user *buf, -+ size_t write_amount, loff_t * off, -+ struct psched_context * cont); -+ int (*ioctl) (struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); -+ int (*mmap) (struct file *, struct vm_area_struct *); -+ int (*release) (struct inode *, struct file *); -+ /* -+ * private a_ops -+ */ -+ int (*readpage) (struct file *file, struct page *page); -+ int (*readpages)(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages); -+ int (*writepages)(struct address_space *mapping, -+ struct writeback_control *wbc); -+ int (*write_begin)(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+ int (*write_end)(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+ sector_t (*bmap) (struct address_space * mapping, sector_t lblock); -+ /* other private methods */ -+ /* save inode cached stat-data onto disk. It was called -+ reiserfs_update_sd() in 3.x */ -+ int (*write_sd_by_inode) (struct inode *); -+ /* -+ * Construct flow into @flow according to user-supplied data. -+ * -+ * This is used by read/write methods to construct a flow to -+ * write/read. ->flow_by_inode() is plugin method, rather than single -+ * global implementation, because key in a flow used by plugin may -+ * depend on data in a @buf. -+ * -+ * NIKITA-FIXME-HANS: please create statistics on what functions are -+ * dereferenced how often for the mongo benchmark. You can supervise -+ * Elena doing this for you if that helps. Email me the list of the -+ * top 10, with their counts, and an estimate of the total number of -+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent -+ * processing (non-idle processing). If the total percent is, say, -+ * less than 1%, it will make our coding discussions much easier, and -+ * keep me from questioning whether functions like the below are too -+ * frequently called to be dereferenced. If the total percent is more -+ * than 1%, perhaps private methods should be listed in a "required" -+ * comment at the top of each plugin (with stern language about how if -+ * the comment is missing it will not be accepted by the maintainer), -+ * and implemented using macros not dereferenced functions. How about -+ * replacing this whole private methods part of the struct with a -+ * thorough documentation of what the standard helper functions are for -+ * use in constructing plugins? I think users have been asking for -+ * that, though not in so many words. -+ */ -+ int (*flow_by_inode) (struct inode *, const char __user *buf, -+ int user, loff_t size, -+ loff_t off, rw_op op, flow_t *); -+ /* -+ * Return the key used to retrieve an offset of a file. It is used by -+ * default implementation of ->flow_by_inode() method -+ * (common_build_flow()) and, among other things, to get to the extent -+ * from jnode of unformatted node. -+ */ -+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *); -+ -+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you -+ * think.... */ -+ /* -+ * set the plugin for a file. Called during file creation in creat() -+ * but not reiser4() unless an inode already exists for the file. -+ */ -+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent, -+ reiser4_object_create_data *); -+ -+ /* NIKITA-FIXME-HANS: comment and name seem to say different things, -+ * are you setting up the object itself also or just adjusting the -+ * parent?.... 
*/ -+ /* set up plugins for new @object created in @parent. @root is root -+ directory. */ -+ int (*adjust_to_parent) (struct inode *object, struct inode *parent, -+ struct inode *root); -+ /* -+ * this does whatever is necessary to do when object is created. For -+ * instance, for unix files stat data is inserted. It is supposed to be -+ * called by create of struct inode_operations. -+ */ -+ int (*create_object) (struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+ /* -+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on -+ * success. Deletion of an object usually includes removal of items -+ * building file body (for directories this is removal of "." and "..") -+ * and removal of stat-data item. -+ */ -+ int (*delete_object) (struct inode *); -+ -+ /* add link from @parent to @object */ -+ int (*add_link) (struct inode *object, struct inode *parent); -+ -+ /* remove link from @parent to @object */ -+ int (*rem_link) (struct inode *object, struct inode *parent); -+ -+ /* -+ * return true if item addressed by @coord belongs to @inode. This is -+ * used by read/write to properly slice flow into items in presence of -+ * multiple key assignment policies, because items of a file are not -+ * necessarily contiguous in a key space, for example, in a plan-b. -+ */ -+ int (*owns_item) (const struct inode *, const coord_t *); -+ -+ /* checks whether yet another hard links to this object can be -+ added */ -+ int (*can_add_link) (const struct inode *); -+ -+ /* checks whether hard links to this object can be removed */ -+ int (*can_rem_link) (const struct inode *); -+ -+ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls -+ detach of directory plugin to remove ".." */ -+ int (*detach) (struct inode *child, struct inode *parent); -+ -+ /* called when @child was just looked up in the @parent. It is not -+ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of -+ directory plugin */ -+ int (*bind) (struct inode *child, struct inode *parent); -+ -+ /* process safe-link during mount */ -+ int (*safelink) (struct inode *object, reiser4_safe_link_t link, -+ __u64 value); -+ -+ /* The couple of estimate methods for all file operations */ -+ struct { -+ reiser4_block_nr(*create) (const struct inode *); -+ reiser4_block_nr(*update) (const struct inode *); -+ reiser4_block_nr(*unlink) (const struct inode *, -+ const struct inode *); -+ } estimate; -+ -+ /* -+ * reiser4 specific part of inode has a union of structures which are -+ * specific to a plugin. This method is called when inode is read -+ * (read_inode) and when file is created (common_create_child) so that -+ * file plugin could initialize its inode data -+ */ -+ void (*init_inode_data) (struct inode *, reiser4_object_create_data * , -+ int); -+ -+ /* -+ * This method performs progressive deletion of items and whole nodes -+ * from right to left. -+ * -+ * @tap: the point deletion process begins from, -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that -+ * long cut_tree operation was interrupted for allowing atom commit . 
-+ */ -+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, struct inode *, -+ int, int *); -+ -+ /* called from ->destroy_inode() */ -+ void (*destroy_inode) (struct inode *); -+ -+ /* -+ * methods to serialize object identify. This is used, for example, by -+ * reiser4_{en,de}code_fh(). -+ */ -+ struct { -+ /* store object's identity at @area */ -+ char *(*write) (struct inode *inode, char *area); -+ /* parse object from wire to the @obj */ -+ char *(*read) (char *area, reiser4_object_on_wire * obj); -+ /* given object identity in @obj, find or create its dentry */ -+ struct dentry *(*get) (struct super_block *s, -+ reiser4_object_on_wire * obj); -+ /* how many bytes ->wire.write() consumes */ -+ int (*size) (struct inode *inode); -+ /* finish with object identify */ -+ void (*done) (reiser4_object_on_wire * obj); -+ } wire; -+} file_plugin; -+ -+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; -+ -+struct reiser4_object_on_wire { -+ file_plugin *plugin; -+ union { -+ struct { -+ obj_key_id key_id; -+ } std; -+ void *generic; -+ } u; -+}; -+ -+/* builtin dir-plugins */ -+typedef enum { -+ HASHED_DIR_PLUGIN_ID, -+ SEEKABLE_HASHED_DIR_PLUGIN_ID, -+ LAST_DIR_ID -+} reiser4_dir_id; -+ -+typedef struct dir_plugin { -+ /* generic fields */ -+ plugin_header h; -+ -+ struct inode_operations * inode_ops; -+ struct file_operations * file_ops; -+ struct address_space_operations * as_ops; -+ -+ /* -+ * private methods: These are optional. If used they will allow you to -+ * minimize the amount of code needed to implement a deviation from -+ * some other method that uses them. You could logically argue that -+ * they should be a separate type of plugin. -+ */ -+ -+ struct dentry *(*get_parent) (struct inode *childdir); -+ -+ /* -+ * check whether "name" is acceptable name to be inserted into this -+ * object. Optionally implemented by directory-like objects. Can check -+ * for maximal length, reserved symbols etc -+ */ -+ int (*is_name_acceptable) (const struct inode *inode, const char *name, -+ int len); -+ -+ void (*build_entry_key) (const struct inode *dir /* directory where -+ * entry is (or will -+ * be) in.*/ , -+ const struct qstr *name /* name of file -+ * referenced by this -+ * entry */ , -+ reiser4_key * result /* resulting key of -+ * directory entry */ ); -+ int (*build_readdir_key) (struct file *dir, reiser4_key * result); -+ int (*add_entry) (struct inode *object, struct dentry *where, -+ reiser4_object_create_data * data, -+ reiser4_dir_entry_desc * entry); -+ int (*rem_entry) (struct inode *object, struct dentry *where, -+ reiser4_dir_entry_desc * entry); -+ -+ /* -+ * initialize directory structure for newly created object. For normal -+ * unix directories, insert dot and dotdot. 
-+ */ -+ int (*init) (struct inode *object, struct inode *parent, -+ reiser4_object_create_data * data); -+ -+ /* destroy directory */ -+ int (*done) (struct inode *child); -+ -+ /* called when @subdir was just looked up in the @dir */ -+ int (*attach) (struct inode *subdir, struct inode *dir); -+ int (*detach) (struct inode *subdir, struct inode *dir); -+ -+ struct { -+ reiser4_block_nr(*add_entry) (const struct inode *); -+ reiser4_block_nr(*rem_entry) (const struct inode *); -+ reiser4_block_nr(*unlink) (const struct inode *, -+ const struct inode *); -+ } estimate; -+} dir_plugin; -+ -+extern dir_plugin dir_plugins[LAST_DIR_ID]; -+ -+typedef struct formatting_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* returns non-zero iff file's tail has to be stored -+ in a direct item. */ -+ int (*have_tail) (const struct inode *inode, loff_t size); -+} formatting_plugin; -+ -+typedef struct hash_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* computes hash of the given name */ -+ __u64(*hash) (const unsigned char *name, int len); -+} hash_plugin; -+ -+typedef struct cipher_plugin { -+ /* generic fields */ -+ plugin_header h; -+ struct crypto_blkcipher * (*alloc) (void); -+ void (*free) (struct crypto_blkcipher *tfm); -+ /* Offset translator. For each offset this returns (k * offset), where -+ k (k >= 1) is an expansion factor of the cipher algorithm. -+ For all symmetric algorithms k == 1. For asymmetric algorithms (which -+ inflate data) offset translation guarantees that all disk cluster's -+ units will have keys smaller then next cluster's one. -+ */ -+ loff_t(*scale) (struct inode *inode, size_t blocksize, loff_t src); -+ /* Cipher algorithms can accept data only by chunks of cipher block -+ size. This method is to align any flow up to cipher block size when -+ we pass it to cipher algorithm. To align means to append padding of -+ special format specific to the cipher algorithm */ -+ int (*align_stream) (__u8 *tail, int clust_size, int blocksize); -+ /* low-level key manager (check, install, etc..) */ -+ int (*setkey) (struct crypto_tfm *tfm, const __u8 *key, -+ unsigned int keylen); -+ /* main text processing procedures */ -+ void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src); -+ void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src); -+} cipher_plugin; -+ -+typedef struct digest_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* fingerprint size in bytes */ -+ int fipsize; -+ struct crypto_hash * (*alloc) (void); -+ void (*free) (struct crypto_hash *tfm); -+} digest_plugin; -+ -+typedef struct compression_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init) (void); -+ /* the maximum number of bytes the size of the "compressed" data can -+ * exceed the uncompressed data. 
*/ -+ int (*overrun) (unsigned src_len); -+ coa_t(*alloc) (tfm_action act); -+ void (*free) (coa_t coa, tfm_action act); -+ /* minimal size of the flow we still try to compress */ -+ int (*min_size_deflate) (void); -+ __u32(*checksum) (char *data, __u32 length); -+ /* main transform procedures */ -+ void (*compress) (coa_t coa, __u8 *src_first, unsigned src_len, -+ __u8 *dst_first, unsigned *dst_len); -+ void (*decompress) (coa_t coa, __u8 *src_first, unsigned src_len, -+ __u8 *dst_first, unsigned *dst_len); -+} compression_plugin; -+ -+typedef struct compression_mode_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* this is called when estimating compressibility -+ of a logical cluster by its content */ -+ int (*should_deflate) (struct inode *inode, cloff_t index); -+ /* this is called when results of compression should be saved */ -+ int (*accept_hook) (struct inode *inode, cloff_t index); -+ /* this is called when results of compression should be discarded */ -+ int (*discard_hook) (struct inode *inode, cloff_t index); -+} compression_mode_plugin; -+ -+typedef struct cluster_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int shift; -+} cluster_plugin; -+ -+typedef struct sd_ext_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*present) (struct inode *inode, char **area, int *len); -+ int (*absent) (struct inode *inode); -+ int (*save_len) (struct inode *inode); -+ int (*save) (struct inode *inode, char **area); -+ /* alignment requirement for this stat-data part */ -+ int alignment; -+} sd_ext_plugin; -+ -+/* this plugin contains methods to allocate objectid for newly created files, -+ to deallocate objectid when file gets removed, to report number of used and -+ free objectids */ -+typedef struct oid_allocator_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files, -+ __u64 oids); -+ /* used to report statfs->f_files */ -+ __u64(*oids_used) (reiser4_oid_allocator * map); -+ /* get next oid to use */ -+ __u64(*next_oid) (reiser4_oid_allocator * map); -+ /* used to report statfs->f_ffree */ -+ __u64(*oids_free) (reiser4_oid_allocator * map); -+ /* allocate new objectid */ -+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *); -+ /* release objectid */ -+ int (*release_oid) (reiser4_oid_allocator * map, oid_t); -+ /* how many pages to reserve in transaction for allocation of new -+ objectid */ -+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map); -+ /* how many pages to reserve in transaction for freeing of an -+ objectid */ -+ int (*oid_reserve_release) (reiser4_oid_allocator * map); -+ void (*print_info) (const char *, reiser4_oid_allocator *); -+} oid_allocator_plugin; -+ -+/* disk layout plugin: this specifies super block, journal, bitmap (if there -+ are any) locations, etc */ -+typedef struct disk_format_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* replay journal, initialize super_info_data, etc */ -+ int (*init_format) (struct super_block *, void *data); -+ -+ /* key of root directory stat data */ -+ const reiser4_key * (*root_dir_key) (const struct super_block *); -+ -+ int (*release) (struct super_block *); -+ jnode * (*log_super) (struct super_block *); -+ int (*check_open) (const struct inode *object); -+ int (*version_update) (struct super_block *); -+} disk_format_plugin; -+ -+struct jnode_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init) (jnode * node); -+ int (*parse) (jnode * node); -+ struct address_space *(*mapping) (const 
jnode * node); -+ unsigned long (*index) (const jnode * node); -+ jnode * (*clone) (jnode * node); -+}; -+ -+/* plugin instance. */ -+/* */ -+/* This is "wrapper" union for all types of plugins. Most of the code uses */ -+/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */ -+/* operates with pointers to reiser4_plugin. This union is only used in */ -+/* some generic code in plugin/plugin.c that operates on all */ -+/* plugins. Technically speaking purpose of this union is to add type */ -+/* safety to said generic code: each plugin type (file_plugin, for */ -+/* example), contains plugin_header as its first memeber. This first member */ -+/* is located at the same place in memory as .h member of */ -+/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */ -+/* looks in the .h which is header of plugin type located in union. This */ -+/* allows to avoid type-casts. */ -+union reiser4_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* file plugin */ -+ file_plugin file; -+ /* directory plugin */ -+ dir_plugin dir; -+ /* hash plugin, used by directory plugin */ -+ hash_plugin hash; -+ /* fibration plugin used by directory plugin */ -+ fibration_plugin fibration; -+ /* cipher transform plugin, used by file plugin */ -+ cipher_plugin cipher; -+ /* digest transform plugin, used by file plugin */ -+ digest_plugin digest; -+ /* compression transform plugin, used by file plugin */ -+ compression_plugin compression; -+ /* tail plugin, used by file plugin */ -+ formatting_plugin formatting; -+ /* permission plugin */ -+ perm_plugin perm; -+ /* node plugin */ -+ node_plugin node; -+ /* item plugin */ -+ item_plugin item; -+ /* stat-data extension plugin */ -+ sd_ext_plugin sd_ext; -+ /* disk layout plugin */ -+ disk_format_plugin format; -+ /* object id allocator plugin */ -+ oid_allocator_plugin oid_allocator; -+ /* plugin for different jnode types */ -+ jnode_plugin jnode; -+ /* compression mode plugin, used by object plugin */ -+ compression_mode_plugin compression_mode; -+ /* cluster plugin, used by object plugin */ -+ cluster_plugin clust; -+ /* place-holder for new plugin types that can be registered -+ dynamically, and used by other dynamically loaded plugins. */ -+ void *generic; -+}; -+ -+struct reiser4_plugin_ops { -+ /* called when plugin is initialized */ -+ int (*init) (reiser4_plugin * plugin); -+ /* called when plugin is unloaded */ -+ int (*done) (reiser4_plugin * plugin); -+ /* load given plugin from disk */ -+ int (*load) (struct inode *inode, -+ reiser4_plugin * plugin, char **area, int *len); -+ /* how many space is required to store this plugin's state -+ in stat-data */ -+ int (*save_len) (struct inode *inode, reiser4_plugin * plugin); -+ /* save persistent plugin-data to disk */ -+ int (*save) (struct inode *inode, reiser4_plugin * plugin, -+ char **area); -+ /* alignment requirement for on-disk state of this plugin -+ in number of bytes */ -+ int alignment; -+ /* install itself into given inode. This can return error -+ (e.g., you cannot change hash of non-empty directory). */ -+ int (*change) (struct inode *inode, reiser4_plugin * plugin, -+ pset_member memb); -+ /* install itself into given inode. This can return error -+ (e.g., you cannot change hash of non-empty directory). 
*/ -+ int (*inherit) (struct inode *inode, struct inode *parent, -+ reiser4_plugin * plugin); -+}; -+ -+/* functions implemented in fs/reiser4/plugin/plugin.c */ -+ -+/* stores plugin reference in reiser4-specific part of inode */ -+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id); -+extern int init_plugins(void); -+ -+/* builtin plugins */ -+ -+/* builtin hash-plugins */ -+ -+typedef enum { -+ RUPASOV_HASH_ID, -+ R5_HASH_ID, -+ TEA_HASH_ID, -+ FNV1_HASH_ID, -+ DEGENERATE_HASH_ID, -+ LAST_HASH_ID -+} reiser4_hash_id; -+ -+/* builtin cipher plugins */ -+ -+typedef enum { -+ NONE_CIPHER_ID, -+ LAST_CIPHER_ID -+} reiser4_cipher_id; -+ -+/* builtin digest plugins */ -+ -+typedef enum { -+ SHA256_32_DIGEST_ID, -+ LAST_DIGEST_ID -+} reiser4_digest_id; -+ -+/* builtin compression mode plugins */ -+typedef enum { -+ NONE_COMPRESSION_MODE_ID, -+ LATTD_COMPRESSION_MODE_ID, -+ ULTIM_COMPRESSION_MODE_ID, -+ FORCE_COMPRESSION_MODE_ID, -+ CONVX_COMPRESSION_MODE_ID, -+ LAST_COMPRESSION_MODE_ID -+} reiser4_compression_mode_id; -+ -+/* builtin cluster plugins */ -+typedef enum { -+ CLUSTER_64K_ID, -+ CLUSTER_32K_ID, -+ CLUSTER_16K_ID, -+ CLUSTER_8K_ID, -+ CLUSTER_4K_ID, -+ LAST_CLUSTER_ID -+} reiser4_cluster_id; -+ -+/* builtin tail-plugins */ -+ -+typedef enum { -+ NEVER_TAILS_FORMATTING_ID, -+ ALWAYS_TAILS_FORMATTING_ID, -+ SMALL_FILE_FORMATTING_ID, -+ LAST_TAIL_FORMATTING_ID -+} reiser4_formatting_id; -+ -+/* data type used to pack parameters that we pass to vfs object creation -+ function create_object() */ -+struct reiser4_object_create_data { -+ /* plugin to control created object */ -+ reiser4_file_id id; -+ /* mode of regular file, directory or special file */ -+/* what happens if some other sort of perm plugin is in use? */ -+ int mode; -+ /* rdev of special file */ -+ dev_t rdev; -+ /* symlink target */ -+ const char *name; -+ /* add here something for non-standard objects you invent, like -+ query for interpolation file etc. */ -+ -+ struct reiser4_crypto_info *crypto; -+ -+ struct inode *parent; -+ struct dentry *dentry; -+}; -+ -+/* description of directory entry being created/destroyed/sought for -+ -+ It is passed down to the directory plugin and farther to the -+ directory item plugin methods. Creation of new directory is done in -+ several stages: first we search for an entry with the same name, then -+ create new one. reiser4_dir_entry_desc is used to store some information -+ collected at some stage of this process and required later: key of -+ item that we want to insert/delete and pointer to an object that will -+ be bound by the new directory entry. Probably some more fields will -+ be added there. -+ -+*/ -+struct reiser4_dir_entry_desc { -+ /* key of directory entry */ -+ reiser4_key key; -+ /* object bound by this entry. */ -+ struct inode *obj; -+}; -+ -+#define MAX_PLUGIN_TYPE_LABEL_LEN 32 -+#define MAX_PLUGIN_PLUG_LABEL_LEN 32 -+ -+#define PLUGIN_BY_ID(TYPE, ID, FIELD) \ -+static inline TYPE *TYPE ## _by_id(reiser4_plugin_id id) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_id(ID, id); \ -+ return plugin ? &plugin->FIELD : NULL; \ -+} \ -+static inline TYPE *TYPE ## _by_disk_id(reiser4_tree * tree, d16 *id) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_disk_id(tree, ID, id); \ -+ return plugin ? &plugin->FIELD : NULL; \ -+} \ -+static inline TYPE *TYPE ## _by_unsafe_id(reiser4_plugin_id id) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_unsafe_id(ID, id); \ -+ return plugin ? 
&plugin->FIELD : NULL; \ -+} \ -+static inline reiser4_plugin* TYPE ## _to_plugin(TYPE* plugin) \ -+{ \ -+ return (reiser4_plugin *) plugin; \ -+} \ -+static inline reiser4_plugin_id TYPE ## _id(TYPE* plugin) \ -+{ \ -+ return TYPE ## _to_plugin(plugin)->h.id; \ -+} \ -+typedef struct { int foo; } TYPE ## _plugin_dummy -+ -+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item); -+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file); -+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir); -+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node); -+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext); -+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm); -+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash); -+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration); -+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher); -+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest); -+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression); -+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting); -+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format); -+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode); -+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ compression_mode); -+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust); -+ -+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area); -+ -+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id); -+ -+#define for_all_plugins(ptype, plugin) \ -+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \ -+ get_plugin_list(ptype) != &plugin->h.linkage; \ -+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage)) -+ -+ -+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, -+ pset_member memb); -+extern int force_plugin_pset(struct inode *self, pset_member memb, -+ reiser4_plugin *plug); -+extern int finish_pset(struct inode *inode); -+ -+/* defined in fs/reiser4/plugin/object.c */ -+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; -+/* defined in fs/reiser4/plugin/object.c */ -+extern dir_plugin dir_plugins[LAST_DIR_ID]; -+/* defined in fs/reiser4/plugin/item/static_stat.c */ -+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION]; -+/* defined in fs/reiser4/plugin/hash.c */ -+extern hash_plugin hash_plugins[LAST_HASH_ID]; -+/* defined in fs/reiser4/plugin/fibration.c */ -+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID]; -+/* defined in fs/reiser4/plugin/crypt.c */ -+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID]; -+/* defined in fs/reiser4/plugin/digest.c */ -+extern digest_plugin digest_plugins[LAST_DIGEST_ID]; -+/* defined in fs/reiser4/plugin/compress/compress.c */ -+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID]; -+/* defined in fs/reiser4/plugin/compress/compression_mode.c */ -+extern compression_mode_plugin -+compression_mode_plugins[LAST_COMPRESSION_MODE_ID]; -+/* defined in fs/reiser4/plugin/cluster.c */ -+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID]; -+/* defined in fs/reiser4/plugin/tail.c */ -+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID]; -+/* defined in fs/reiser4/plugin/security/security.c */ -+extern perm_plugin perm_plugins[LAST_PERM_ID]; -+/* defined in fs/reiser4/plugin/item/item.c */ -+extern item_plugin item_plugins[LAST_ITEM_ID]; -+/* defined in fs/reiser4/plugin/node/node.c */ 
-+extern node_plugin node_plugins[LAST_NODE_ID]; -+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */ -+extern disk_format_plugin format_plugins[LAST_FORMAT_ID]; -+ -+/* __FS_REISER4_PLUGIN_TYPES_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.30/fs/reiser4/plugin/plugin_header.h ---- linux-2.6.30.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/plugin_header.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,157 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* plugin header. Data structures required by all plugin types. */ -+ -+#if !defined(__PLUGIN_HEADER_H__) -+#define __PLUGIN_HEADER_H__ -+ -+/* plugin data-types and constants */ -+ -+#include "../debug.h" -+#include "../dformat.h" -+ -+/* Every plugin type can be considered as a class of virtual objects -+ {(type, i) | i = 0, 1, ...}, which has one the following categories -+ of virtualization: -+ A - no virtualization; -+ F - per-file virtualization; -+ S - per-superblock virtualization; -+ FIXME-EDWARD: Define every such category */ -+ -+/* Supported plugin types: (id, (virtualization category), short description) */ -+typedef enum { -+ REISER4_FILE_PLUGIN_TYPE, /* (F) service VFS enry-points */ -+ REISER4_DIR_PLUGIN_TYPE, /* (F) service VFS enry-points */ -+ REISER4_ITEM_PLUGIN_TYPE, /* (F) manage items */ -+ REISER4_NODE_PLUGIN_TYPE, /* (S) manage formatted nodes */ -+ REISER4_HASH_PLUGIN_TYPE, /* (F) compute hash */ -+ REISER4_FIBRATION_PLUGIN_TYPE, /* (F) directory fibrations */ -+ REISER4_FORMATTING_PLUGIN_TYPE, /* (F) tail-packing policy */ -+ REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */ -+ REISER4_SD_EXT_PLUGIN_TYPE, /* (A) stat-data extensions */ -+ REISER4_FORMAT_PLUGIN_TYPE, /* (S) specify disk format */ -+ REISER4_JNODE_PLUGIN_TYPE, /* (A) in-memory node headers */ -+ REISER4_CIPHER_PLUGIN_TYPE, /* (F) cipher transform algs */ -+ REISER4_DIGEST_PLUGIN_TYPE, /* (F) digest transform algs */ -+ REISER4_COMPRESSION_PLUGIN_TYPE, /* (F) compression tfm algs */ -+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */ -+ REISER4_CLUSTER_PLUGIN_TYPE, /* (F) size of logical cluster */ -+ REISER4_PLUGIN_TYPES -+} reiser4_plugin_type; -+ -+/* Supported plugin groups */ -+typedef enum { -+ REISER4_DIRECTORY_FILE, -+ REISER4_REGULAR_FILE, -+ REISER4_SYMLINK_FILE, -+ REISER4_SPECIAL_FILE, -+} file_plugin_group; -+ -+struct reiser4_plugin_ops; -+/* generic plugin operations, supported by each -+ plugin type. */ -+typedef struct reiser4_plugin_ops reiser4_plugin_ops; -+ -+/* the common part of all plugin instances. */ -+typedef struct plugin_header { -+ /* plugin type */ -+ reiser4_plugin_type type_id; -+ /* id of this plugin */ -+ reiser4_plugin_id id; -+ /* bitmask of groups the plugin belongs to. */ -+ reiser4_plugin_groups groups; -+ /* plugin operations */ -+ reiser4_plugin_ops *pops; -+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and -+ * defined. */ -+ /* short label of this plugin */ -+ const char *label; -+ /* descriptive string.. 
*/ -+ const char *desc; -+ /* list linkage */ -+ struct list_head linkage; -+} plugin_header; -+ -+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group)) -+ -+/* PRIVATE INTERFACES */ -+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in -+ * plugin_header? */ -+/* plugin type representation. */ -+struct reiser4_plugin_type_data { -+ /* internal plugin type identifier. Should coincide with -+ index of this item in plugins[] array. */ -+ reiser4_plugin_type type_id; -+ /* short symbolic label of this plugin type. Should be no longer -+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */ -+ const char *label; -+ /* plugin type description longer than .label */ -+ const char *desc; -+ -+/* NIKITA-FIXME-HANS: define built-in */ -+ /* number of built-in plugin instances of this type */ -+ int builtin_num; -+ /* array of built-in plugins */ -+ void *builtin; -+ struct list_head plugins_list; -+ size_t size; -+}; -+ -+extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES]; -+ -+int is_plugin_type_valid(reiser4_plugin_type type); -+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id); -+ -+static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data *ptype, -+ int i) -+{ -+ char *builtin; -+ -+ builtin = ptype->builtin; -+ return (reiser4_plugin *) (builtin + i * ptype->size); -+} -+ -+/* return plugin by its @type_id and @id */ -+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type, -+ reiser4_plugin_id id) -+{ -+ assert("nikita-1651", is_plugin_type_valid(type)); -+ assert("nikita-1652", is_plugin_id_valid(type, id)); -+ return plugin_at(&plugins[type], id); -+} -+ -+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id, -+ reiser4_plugin_id id); -+ -+/** -+ * plugin_by_disk_id - get reiser4_plugin -+ * @type_id: plugin type id -+ * @did: plugin id in disk format -+ * -+ * Returns reiser4_plugin by plugin type id an dplugin_id. -+ */ -+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG, -+ reiser4_plugin_type type_id, -+ __le16 *plugin_id) -+{ -+ /* -+ * what we should do properly is to maintain within each file-system a -+ * dictionary that maps on-disk plugin ids to "universal" ids. This -+ * dictionary will be resolved on mount time, so that this function -+ * will perform just one additional array lookup. -+ */ -+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id)); -+} -+ -+/* __PLUGIN_HEADER_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.30/fs/reiser4/plugin/plugin_set.c ---- linux-2.6.30.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/plugin_set.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,380 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* This file contains Reiser4 plugin set operations */ -+ -+/* plugin sets -+ * -+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin, -+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.) -+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This -+ * set of plugins (so called pset) is described by structure plugin_set (see -+ * plugin/plugin_set.h), which contains pointers to all required plugins. 
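
What this comment goes on to describe is hash-consing applied to plugin tuples: a pset is immutable and interned in a global hash table, so inodes with identical members share a single instance, equality checks reduce to pointer comparison, and — since psets are never recycled — readers need no reference counting. A much-reduced userspace model of the interning step (hypothetical names; two members stand in for the real fifteen or so):

#include <stdio.h>
#include <stdlib.h>

struct pset {
        const void *file;               /* stand-in for the file plugin slot */
        const void *hash;               /* stand-in for the hash plugin slot */
        struct pset *next;              /* bucket chaining */
};

#define TABLE_SIZE 32                   /* power of two, as in the patch */
static struct pset *table[TABLE_SIZE];

static unsigned long pset_hashval(const void *file, const void *hash)
{
        /* same trick as calculate_hash(): sum pointer bits, mask to table */
        return (((unsigned long)file >> 2) +
                ((unsigned long)hash >> 2)) & (TABLE_SIZE - 1);
}

/* return the shared instance for (file, hash), creating it at most once */
static struct pset *pset_intern(const void *file, const void *hash)
{
        unsigned long h = pset_hashval(file, hash);
        struct pset *p;

        for (p = table[h]; p != NULL; p = p->next)
                if (p->file == file && p->hash == hash)
                        return p;       /* twin already interned: share it */

        p = malloc(sizeof(*p));
        if (p == NULL)
                return NULL;
        p->file = file;                 /* never freed, never recycled */
        p->hash = hash;
        p->next = table[h];
        table[h] = p;
        return p;
}

int main(void)
{
        int f, g;                       /* dummy "plugins" */
        struct pset *a = pset_intern(&f, &g);
        struct pset *b = pset_intern(&f, &g);

        printf("shared: %s\n", a == b ? "yes" : "no");
        return 0;
}

The patch wraps this same find-or-insert step in RCU plus a small array of per-bucket spinlocks (see plugin_set_field() below), so concurrent lookups stay lock-free. The comment continues:
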
-+ * -+ * Children can inherit some pset members from their parent, however sometimes -+ * it is useful to specify members different from parent ones. Since object's -+ * pset can not be easily changed without fatal consequences, we use for this -+ * purpose another special plugin table (so called hset, or heir set) described -+ * by the same structure. -+ * -+ * Inode only stores a pointers to pset and hset. Different inodes with the -+ * same set of pset (hset) members point to the same pset (hset). This is -+ * archived by storing psets and hsets in global hash table. Races are avoided -+ * by simple (and efficient so far) solution of never recycling psets, even -+ * when last inode pointing to it is destroyed. -+ */ -+ -+#include "../debug.h" -+#include "../super.h" -+#include "plugin_set.h" -+ -+#include <linux/slab.h> -+#include <linux/stddef.h> -+ -+/* slab for plugin sets */ -+static struct kmem_cache *plugin_set_slab; -+ -+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = { -+ [0 ... 7] = SPIN_LOCK_UNLOCKED -+}; -+ -+/* hash table support */ -+ -+#define PS_TABLE_SIZE (32) -+ -+static inline plugin_set *cast_to(const unsigned long *a) -+{ -+ return container_of(a, plugin_set, hashval); -+} -+ -+static inline int pseq(const unsigned long *a1, const unsigned long *a2) -+{ -+ plugin_set *set1; -+ plugin_set *set2; -+ -+ /* make sure fields are not missed in the code below */ -+ cassert(sizeof *set1 == -+ sizeof set1->hashval + -+ sizeof set1->link + -+ sizeof set1->file + -+ sizeof set1->dir + -+ sizeof set1->perm + -+ sizeof set1->formatting + -+ sizeof set1->hash + -+ sizeof set1->fibration + -+ sizeof set1->sd + -+ sizeof set1->dir_item + -+ sizeof set1->cipher + -+ sizeof set1->digest + -+ sizeof set1->compression + -+ sizeof set1->compression_mode + -+ sizeof set1->cluster + -+ sizeof set1->create); -+ -+ set1 = cast_to(a1); -+ set2 = cast_to(a2); -+ return -+ set1->hashval == set2->hashval && -+ set1->file == set2->file && -+ set1->dir == set2->dir && -+ set1->perm == set2->perm && -+ set1->formatting == set2->formatting && -+ set1->hash == set2->hash && -+ set1->fibration == set2->fibration && -+ set1->sd == set2->sd && -+ set1->dir_item == set2->dir_item && -+ set1->cipher == set2->cipher && -+ set1->digest == set2->digest && -+ set1->compression == set2->compression && -+ set1->compression_mode == set2->compression_mode && -+ set1->cluster == set2->cluster && -+ set1->create == set2->create; -+} -+ -+#define HASH_FIELD(hash, set, field) \ -+({ \ -+ (hash) += (unsigned long)(set)->field >> 2; \ -+}) -+ -+static inline unsigned long calculate_hash(const plugin_set * set) -+{ -+ unsigned long result; -+ -+ result = 0; -+ HASH_FIELD(result, set, file); -+ HASH_FIELD(result, set, dir); -+ HASH_FIELD(result, set, perm); -+ HASH_FIELD(result, set, formatting); -+ HASH_FIELD(result, set, hash); -+ HASH_FIELD(result, set, fibration); -+ HASH_FIELD(result, set, sd); -+ HASH_FIELD(result, set, dir_item); -+ HASH_FIELD(result, set, cipher); -+ HASH_FIELD(result, set, digest); -+ HASH_FIELD(result, set, compression); -+ HASH_FIELD(result, set, compression_mode); -+ HASH_FIELD(result, set, cluster); -+ HASH_FIELD(result, set, create); -+ return result & (PS_TABLE_SIZE - 1); -+} -+ -+static inline unsigned long -+pshash(ps_hash_table * table, const unsigned long *a) -+{ -+ return *a; -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, 
unsigned long, hashval, link, pshash, -+ pseq); -+#undef KFREE -+#undef KMALLOC -+ -+static ps_hash_table ps_table; -+static plugin_set empty_set = { -+ .hashval = 0, -+ .file = NULL, -+ .dir = NULL, -+ .perm = NULL, -+ .formatting = NULL, -+ .hash = NULL, -+ .fibration = NULL, -+ .sd = NULL, -+ .dir_item = NULL, -+ .cipher = NULL, -+ .digest = NULL, -+ .compression = NULL, -+ .compression_mode = NULL, -+ .cluster = NULL, -+ .create = NULL, -+ .link = {NULL} -+}; -+ -+plugin_set *plugin_set_get_empty(void) -+{ -+ return &empty_set; -+} -+ -+void plugin_set_put(plugin_set * set) -+{ -+} -+ -+static inline unsigned long *pset_field(plugin_set * set, int offset) -+{ -+ return (unsigned long *)(((char *)set) + offset); -+} -+ -+static int plugin_set_field(plugin_set ** set, const unsigned long val, -+ const int offset) -+{ -+ unsigned long *spot; -+ spinlock_t *lock; -+ plugin_set replica; -+ plugin_set *twin; -+ plugin_set *psal; -+ plugin_set *orig; -+ -+ assert("nikita-2902", set != NULL); -+ assert("nikita-2904", *set != NULL); -+ -+ spot = pset_field(*set, offset); -+ if (unlikely(*spot == val)) -+ return 0; -+ -+ replica = *(orig = *set); -+ *pset_field(&replica, offset) = val; -+ replica.hashval = calculate_hash(&replica); -+ rcu_read_lock(); -+ twin = ps_hash_find(&ps_table, &replica.hashval); -+ if (unlikely(twin == NULL)) { -+ rcu_read_unlock(); -+ psal = kmem_cache_alloc(plugin_set_slab, -+ reiser4_ctx_gfp_mask_get()); -+ if (psal == NULL) -+ return RETERR(-ENOMEM); -+ *psal = replica; -+ lock = &plugin_set_lock[replica.hashval & 7]; -+ spin_lock(lock); -+ twin = ps_hash_find(&ps_table, &replica.hashval); -+ if (likely(twin == NULL)) { -+ *set = psal; -+ ps_hash_insert_rcu(&ps_table, psal); -+ } else { -+ *set = twin; -+ kmem_cache_free(plugin_set_slab, psal); -+ } -+ spin_unlock(lock); -+ } else { -+ rcu_read_unlock(); -+ *set = twin; -+ } -+ return 0; -+} -+ -+static struct { -+ int offset; -+ reiser4_plugin_groups groups; -+ reiser4_plugin_type type; -+} pset_descr[PSET_LAST] = { -+ [PSET_FILE] = { -+ .offset = offsetof(plugin_set, file), -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_DIR] = { -+ .offset = offsetof(plugin_set, dir), -+ .type = REISER4_DIR_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_PERM] = { -+ .offset = offsetof(plugin_set, perm), -+ .type = REISER4_PERM_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_FORMATTING] = { -+ .offset = offsetof(plugin_set, formatting), -+ .type = REISER4_FORMATTING_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_HASH] = { -+ .offset = offsetof(plugin_set, hash), -+ .type = REISER4_HASH_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_FIBRATION] = { -+ .offset = offsetof(plugin_set, fibration), -+ .type = REISER4_FIBRATION_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_SD] = { -+ .offset = offsetof(plugin_set, sd), -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .groups = (1 << STAT_DATA_ITEM_TYPE) -+ }, -+ [PSET_DIR_ITEM] = { -+ .offset = offsetof(plugin_set, dir_item), -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE) -+ }, -+ [PSET_CIPHER] = { -+ .offset = offsetof(plugin_set, cipher), -+ .type = REISER4_CIPHER_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_DIGEST] = { -+ .offset = offsetof(plugin_set, digest), -+ .type = REISER4_DIGEST_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_COMPRESSION] = { -+ .offset = offsetof(plugin_set, compression), -+ .type = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_COMPRESSION_MODE] = { -+ .offset = offsetof(plugin_set, compression_mode), -+ .type = 
REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_CLUSTER] = { -+ .offset = offsetof(plugin_set, cluster), -+ .type = REISER4_CLUSTER_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_CREATE] = { -+ .offset = offsetof(plugin_set, create), -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .groups = (1 << REISER4_REGULAR_FILE) -+ } -+}; -+ -+#define DEFINE_PSET_OPS(PREFIX) \ -+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \ -+{ \ -+ if (memb > PSET_LAST) \ -+ return REISER4_PLUGIN_TYPES; \ -+ return pset_descr[memb].type; \ -+} \ -+ \ -+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \ -+ reiser4_plugin * plugin) \ -+{ \ -+ assert("nikita-3492", set != NULL); \ -+ assert("nikita-3493", *set != NULL); \ -+ assert("nikita-3494", plugin != NULL); \ -+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \ -+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \ -+ \ -+ if (pset_descr[memb].groups) \ -+ if (!(pset_descr[memb].groups & plugin->h.groups)) \ -+ return -EINVAL; \ -+ \ -+ return plugin_set_field(set, \ -+ (unsigned long)plugin, pset_descr[memb].offset); \ -+} \ -+ \ -+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \ -+{ \ -+ assert("nikita-3497", set != NULL); \ -+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \ -+ \ -+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \ -+} -+ -+DEFINE_PSET_OPS(aset); -+ -+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) -+{ -+ return plugin_set_field(set, -+ (unsigned long)plugin, pset_descr[memb].offset); -+} -+ -+/** -+ * init_plugin_set - create plugin set cache and hash table -+ * -+ * Initializes slab cache of plugin_set-s and their hash table. It is part of -+ * reiser4 module initialization. -+ */ -+int init_plugin_set(void) -+{ -+ int result; -+ -+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE); -+ if (result == 0) { -+ plugin_set_slab = kmem_cache_create("plugin_set", -+ sizeof(plugin_set), 0, -+ SLAB_HWCACHE_ALIGN, -+ NULL); -+ if (plugin_set_slab == NULL) -+ result = RETERR(-ENOMEM); -+ } -+ return result; -+} -+ -+/** -+ * done_plugin_set - delete plugin_set cache and plugin_set hash table -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_plugin_set(void) -+{ -+ plugin_set *cur, *next; -+ -+ for_all_in_htable(&ps_table, ps, cur, next) { -+ ps_hash_remove(&ps_table, cur); -+ kmem_cache_free(plugin_set_slab, cur); -+ } -+ destroy_reiser4_cache(&plugin_set_slab); -+ ps_hash_done(&ps_table); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.30/fs/reiser4/plugin/plugin_set.h ---- linux-2.6.30.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/plugin_set.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,78 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Reiser4 plugin set definition. 
-+ See fs/reiser4/plugin/plugin_set.c for details */ -+ -+#if !defined(__PLUGIN_SET_H__) -+#define __PLUGIN_SET_H__ -+ -+#include "../type_safe_hash.h" -+#include "plugin.h" -+ -+#include <linux/rcupdate.h> -+ -+struct plugin_set; -+typedef struct plugin_set plugin_set; -+ -+TYPE_SAFE_HASH_DECLARE(ps, plugin_set); -+ -+struct plugin_set { -+ unsigned long hashval; -+ /* plugin of file */ -+ file_plugin *file; -+ /* plugin of dir */ -+ dir_plugin *dir; -+ /* perm plugin for this file */ -+ perm_plugin *perm; -+ /* tail policy plugin. Only meaningful for regular files */ -+ formatting_plugin *formatting; -+ /* hash plugin. Only meaningful for directories. */ -+ hash_plugin *hash; -+ /* fibration plugin. Only meaningful for directories. */ -+ fibration_plugin *fibration; -+ /* plugin of stat-data */ -+ item_plugin *sd; -+ /* plugin of items a directory is built of */ -+ item_plugin *dir_item; -+ /* cipher plugin */ -+ cipher_plugin *cipher; -+ /* digest plugin */ -+ digest_plugin *digest; -+ /* compression plugin */ -+ compression_plugin *compression; -+ /* compression mode plugin */ -+ compression_mode_plugin *compression_mode; -+ /* cluster plugin */ -+ cluster_plugin *cluster; -+ /* this specifies file plugin of regular children. -+ only meaningful for directories */ -+ file_plugin *create; -+ ps_hash_link link; -+}; -+ -+extern plugin_set *plugin_set_get_empty(void); -+extern void plugin_set_put(plugin_set * set); -+ -+extern int init_plugin_set(void); -+extern void done_plugin_set(void); -+ -+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb); -+extern int set_plugin(plugin_set ** set, pset_member memb, -+ reiser4_plugin * plugin); -+extern int aset_set_unsafe(plugin_set ** set, pset_member memb, -+ reiser4_plugin * plugin); -+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb); -+ -+/* __PLUGIN_SET_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/security/Makefile linux-2.6.30/fs/reiser4/plugin/security/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/security/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,4 @@ -+obj-$(CONFIG_REISER4_FS) += security_plugins.o -+ -+security_plugins-objs := \ -+ perm.o -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/security/perm.c linux-2.6.30/fs/reiser4/plugin/security/perm.c ---- linux-2.6.30.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/security/perm.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,33 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* -+ * This file contains implementation of permission plugins. 
-+ * See the comments in perm.h -+ */ -+ -+#include "../plugin.h" -+#include "../plugin_header.h" -+#include "../../debug.h" -+ -+perm_plugin perm_plugins[LAST_PERM_ID] = { -+ [NULL_PERM_ID] = { -+ .h = { -+ .type_id = REISER4_PERM_PLUGIN_TYPE, -+ .id = NULL_PERM_ID, -+ .pops = NULL, -+ .label = "null", -+ .desc = "stub permission plugin", -+ .linkage = {NULL, NULL} -+ } -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/security/perm.h linux-2.6.30/fs/reiser4/plugin/security/perm.h ---- linux-2.6.30.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/security/perm.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,38 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Perm (short for "permissions") plugins common stuff. */ -+ -+#if !defined( __REISER4_PERM_H__ ) -+#define __REISER4_PERM_H__ -+ -+#include "../../forward.h" -+#include "../plugin_header.h" -+ -+#include <linux/types.h> -+ -+/* Definition of permission plugin */ -+/* NIKITA-FIXME-HANS: define what this is targeted for. -+ It does not seem to be intended for use with sys_reiser4. Explain. */ -+ -+/* NOTE-EDWARD: This seems to be intended for deprecated sys_reiser4. -+ Consider it like a temporary "seam" and reserved pset member. -+ If you have something usefull to add, then rename this plugin and add here */ -+typedef struct perm_plugin { -+ /* generic plugin fields */ -+ plugin_header h; -+} perm_plugin; -+ -+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id; -+ -+/* __REISER4_PERM_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.30/fs/reiser4/plugin/space/bitmap.c ---- linux-2.6.30.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/space/bitmap.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1585 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../txnmgr.h" -+#include "../../jnode.h" -+#include "../../block_alloc.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../plugin.h" -+#include "space_allocator.h" -+#include "bitmap.h" -+ -+#include <linux/types.h> -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/mutex.h> -+#include <asm/div64.h> -+ -+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap -+ * blocks -+ -+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap -+ blocks loading/unloading which is different from v3.x where all bitmap -+ blocks are loaded at mount time. -+ -+ To implement bitmap blocks unloading we need to count bitmap block usage -+ and detect currently unused blocks allowing them to be unloaded. It is not -+ a simple task since we allow several threads to modify one bitmap block -+ simultaneously. -+ -+ Briefly speaking, the following schema is proposed: we count in special -+ variable associated with each bitmap block. That is for counting of block -+ alloc/dealloc operations on that bitmap block. 
With a deferred block -+ deallocation feature of reiser4 all those operation will be represented in -+ atom dirty/deleted lists as jnodes for freshly allocated or deleted -+ nodes. -+ -+ So, we increment usage counter for each new node allocated or deleted, and -+ decrement it at atom commit one time for each node from the dirty/deleted -+ atom's list. Of course, freshly allocated node deletion and node reusing -+ from atom deleted (if we do so) list should decrement bitmap usage counter -+ also. -+ -+ This schema seems to be working but that reference counting is -+ not easy to debug. I think we should agree with Hans and do not implement -+ it in v4.0. Current code implements "on-demand" bitmap blocks loading only. -+ -+ For simplicity all bitmap nodes (both commit and working bitmap blocks) are -+ loaded into memory on fs mount time or each bitmap nodes are loaded at the -+ first access to it, the "dont_load_bitmap" mount option controls whether -+ bimtap nodes should be loaded at mount time. Dynamic unloading of bitmap -+ nodes currently is not supported. */ -+ -+#define CHECKSUM_SIZE 4 -+ -+#define BYTES_PER_LONG (sizeof(long)) -+ -+#if BITS_PER_LONG == 64 -+# define LONG_INT_SHIFT (6) -+#else -+# define LONG_INT_SHIFT (5) -+#endif -+ -+#define LONG_INT_MASK (BITS_PER_LONG - 1UL) -+ -+typedef unsigned long ulong_t; -+ -+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE) -+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3) -+ -+/* Block allocation/deallocation are done through special bitmap objects which -+ are allocated in an array at fs mount. */ -+struct bitmap_node { -+ struct mutex mutex; /* long term lock object */ -+ -+ jnode *wjnode; /* j-nodes for WORKING ... */ -+ jnode *cjnode; /* ... and COMMIT bitmap blocks */ -+ -+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */ -+ -+ atomic_t loaded; /* a flag which shows that bnode is loaded -+ * already */ -+}; -+ -+static inline char *bnode_working_data(struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->wjnode); -+ assert("zam-429", data != NULL); -+ -+ return data + CHECKSUM_SIZE; -+} -+ -+static inline char *bnode_commit_data(const struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("zam-430", data != NULL); -+ -+ return data + CHECKSUM_SIZE; -+} -+ -+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("vpf-261", data != NULL); -+ -+ return le32_to_cpu(get_unaligned((d32 *)data)); -+} -+ -+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("vpf-261", data != NULL); -+ -+ put_unaligned(cpu_to_le32(crc), (d32 *)data); -+} -+ -+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having -+ * written the code, does this added abstraction still have */ -+/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the -+ * reiser4_space_allocator structure) */ -+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */ -+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union -+ * someday?". What they about? If there is a reason to have a union, it should -+ * be a union, if not, it should not be a union. "..might be someday" means no -+ * reason. 
*/
-+struct bitmap_allocator_data {
-+ /* an array for bitmap blocks direct access */
-+ struct bitmap_node *bitmap;
-+};
-+
-+#define get_barray(super) \
-+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
-+
-+#define get_bnode(super, i) (get_barray(super) + i)
-+
-+/* allocate and initialize jnode with JNODE_BITMAP type */
-+static jnode *bnew(void)
-+{
-+ jnode *jal = jalloc();
-+
-+ if (jal)
-+ jnode_init(jal, current_tree, JNODE_BITMAP);
-+
-+ return jal;
-+}
-+
-+/* this file contains:
-+ - bitmap based implementation of space allocation plugin
-+ - all the helper functions like set bit, find_first_zero_bit, etc */
-+
-+/* Audited by: green(2002.06.12) */
-+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
-+{
-+ ulong_t mask = 1UL << start_bit;
-+ int i = start_bit;
-+
-+ while ((word & mask) != 0) {
-+ mask <<= 1;
-+ if (++i >= BITS_PER_LONG)
-+ break;
-+ }
-+
-+ return i;
-+}
-+
-+#include <linux/bitops.h>
-+
-+#if BITS_PER_LONG == 64
-+
-+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
-+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
-+
-+static inline void reiser4_set_bit(int nr, void *addr)
-+{
-+ ext2_set_bit(nr + OFF(addr), BASE(addr));
-+}
-+
-+static inline void reiser4_clear_bit(int nr, void *addr)
-+{
-+ ext2_clear_bit(nr + OFF(addr), BASE(addr));
-+}
-+
-+static inline int reiser4_test_bit(int nr, void *addr)
-+{
-+ return ext2_test_bit(nr + OFF(addr), BASE(addr));
-+}
-+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
-+ int offset)
-+{
-+ int off = OFF(addr);
-+
-+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
-+ offset + off) - off;
-+}
-+
-+#else
-+
-+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
-+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
-+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
-+
-+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
-+ext2_find_next_zero_bit(addr, maxoffset, offset)
-+#endif
-+
-+/* Search for a set bit in the bit array [@start_offset, @max_offset); offsets
-+ * are counted from @addr. Return the offset of the first set bit if one is
-+ * found, @max_offset otherwise. */
-+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
-+ bmap_off_t start_offset)
-+{
-+ ulong_t *base = addr;
-+ /* start_offset is in bits; convert it to a word index within the bitmap. */
-+ int word_nr = start_offset >> LONG_INT_SHIFT;
-+ /* bit number within the word. */
-+ int bit_nr = start_offset & LONG_INT_MASK;
-+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
-+
-+ assert("zam-387", max_offset != 0);
-+
-+ /* Unaligned @start_offset case. */
-+ if (bit_nr != 0) {
-+ bmap_nr_t nr;
-+
-+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
-+
-+ if (nr < BITS_PER_LONG)
-+ return (word_nr << LONG_INT_SHIFT) + nr;
-+
-+ ++word_nr;
-+ }
-+
-+ /* Fast scan through aligned words. 
*/ -+ while (word_nr <= max_word_nr) { -+ if (base[word_nr] != 0) { -+ return (word_nr << LONG_INT_SHIFT) -+ + find_next_zero_bit_in_word(~(base[word_nr]), 0); -+ } -+ -+ ++word_nr; -+ } -+ -+ return max_offset; -+} -+ -+#if BITS_PER_LONG == 64 -+ -+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, -+ bmap_off_t start_offset) -+{ -+ bmap_off_t off = OFF(addr); -+ -+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off, -+ start_offset + off) - off; -+} -+ -+#else -+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \ -+ __reiser4_find_next_set_bit(addr, max_offset, start_offset) -+#endif -+ -+/* search for the first set bit in single word. */ -+static int find_last_set_bit_in_word(ulong_t word, int start_bit) -+{ -+ ulong_t bit_mask; -+ int nr = start_bit; -+ -+ assert("zam-965", start_bit < BITS_PER_LONG); -+ assert("zam-966", start_bit >= 0); -+ -+ bit_mask = (1UL << nr); -+ -+ while (bit_mask != 0) { -+ if (bit_mask & word) -+ return nr; -+ bit_mask >>= 1; -+ nr--; -+ } -+ return BITS_PER_LONG; -+} -+ -+/* Search bitmap for a set bit in backward direction from the end to the -+ * beginning of given region -+ * -+ * @result: result offset of the last set bit -+ * @addr: base memory address, -+ * @low_off: low end of the search region, edge bit included into the region, -+ * @high_off: high end of the search region, edge bit included into the region, -+ * -+ * @return: 0 - set bit was found, -1 otherwise. -+ */ -+static int -+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, -+ bmap_off_t high_off) -+{ -+ ulong_t *base = addr; -+ int last_word; -+ int first_word; -+ int last_bit; -+ int nr; -+ -+ assert("zam-962", high_off >= low_off); -+ -+ last_word = high_off >> LONG_INT_SHIFT; -+ last_bit = high_off & LONG_INT_MASK; -+ first_word = low_off >> LONG_INT_SHIFT; -+ -+ if (last_bit < BITS_PER_LONG) { -+ nr = find_last_set_bit_in_word(base[last_word], last_bit); -+ if (nr < BITS_PER_LONG) { -+ *result = (last_word << LONG_INT_SHIFT) + nr; -+ return 0; -+ } -+ --last_word; -+ } -+ while (last_word >= first_word) { -+ if (base[last_word] != 0x0) { -+ last_bit = -+ find_last_set_bit_in_word(base[last_word], -+ BITS_PER_LONG - 1); -+ assert("zam-972", last_bit < BITS_PER_LONG); -+ *result = (last_word << LONG_INT_SHIFT) + last_bit; -+ return 0; -+ } -+ --last_word; -+ } -+ -+ return -1; /* set bit not found */ -+} -+ -+/* Search bitmap for a clear bit in backward direction from the end to the -+ * beginning of given region */ -+static int -+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, -+ bmap_off_t high_off) -+{ -+ ulong_t *base = addr; -+ int last_word; -+ int first_word; -+ int last_bit; -+ int nr; -+ -+ last_word = high_off >> LONG_INT_SHIFT; -+ last_bit = high_off & LONG_INT_MASK; -+ first_word = low_off >> LONG_INT_SHIFT; -+ -+ if (last_bit < BITS_PER_LONG) { -+ nr = find_last_set_bit_in_word(~base[last_word], last_bit); -+ if (nr < BITS_PER_LONG) { -+ *result = (last_word << LONG_INT_SHIFT) + nr; -+ return 0; -+ } -+ --last_word; -+ } -+ while (last_word >= first_word) { -+ if (base[last_word] != (ulong_t) (-1)) { -+ *result = (last_word << LONG_INT_SHIFT) + -+ find_last_set_bit_in_word(~base[last_word], -+ BITS_PER_LONG - 1); -+ return 0; -+ } -+ --last_word; -+ } -+ -+ return -1; /* zero bit not found */ -+} -+ -+/* Audited by: green(2002.06.12) */ -+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end) -+{ -+ int first_byte; -+ int last_byte; -+ 
-+ unsigned char first_byte_mask = 0xFF; -+ unsigned char last_byte_mask = 0xFF; -+ -+ assert("zam-410", start < end); -+ -+ first_byte = start >> 3; -+ last_byte = (end - 1) >> 3; -+ -+ if (last_byte > first_byte + 1) -+ memset(addr + first_byte + 1, 0, -+ (size_t) (last_byte - first_byte - 1)); -+ -+ first_byte_mask >>= 8 - (start & 0x7); -+ last_byte_mask <<= ((end - 1) & 0x7) + 1; -+ -+ if (first_byte == last_byte) { -+ addr[first_byte] &= (first_byte_mask | last_byte_mask); -+ } else { -+ addr[first_byte] &= first_byte_mask; -+ addr[last_byte] &= last_byte_mask; -+ } -+} -+ -+/* Audited by: green(2002.06.12) */ -+/* ZAM-FIXME-HANS: comment this */ -+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end) -+{ -+ int first_byte; -+ int last_byte; -+ -+ unsigned char first_byte_mask = 0xFF; -+ unsigned char last_byte_mask = 0xFF; -+ -+ assert("zam-386", start < end); -+ -+ first_byte = start >> 3; -+ last_byte = (end - 1) >> 3; -+ -+ if (last_byte > first_byte + 1) -+ memset(addr + first_byte + 1, 0xFF, -+ (size_t) (last_byte - first_byte - 1)); -+ -+ first_byte_mask <<= start & 0x7; -+ last_byte_mask >>= 7 - ((end - 1) & 0x7); -+ -+ if (first_byte == last_byte) { -+ addr[first_byte] |= (first_byte_mask & last_byte_mask); -+ } else { -+ addr[first_byte] |= first_byte_mask; -+ addr[last_byte] |= last_byte_mask; -+ } -+} -+ -+#define ADLER_BASE 65521 -+#define ADLER_NMAX 5552 -+ -+/* Calculates the adler32 checksum for the data pointed by `data` of the -+ length `len`. This function was originally taken from zlib, version 1.1.3, -+ July 9th, 1998. -+ -+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler -+ -+ This software is provided 'as-is', without any express or implied -+ warranty. In no event will the authors be held liable for any damages -+ arising from the use of this software. -+ -+ Permission is granted to anyone to use this software for any purpose, -+ including commercial applications, and to alter it and redistribute it -+ freely, subject to the following restrictions: -+ -+ 1. The origin of this software must not be misrepresented; you must not -+ claim that you wrote the original software. If you use this software -+ in a product, an acknowledgment in the product documentation would be -+ appreciated but is not required. -+ 2. Altered source versions must be plainly marked as such, and must not be -+ misrepresented as being the original software. -+ 3. This notice may not be removed or altered from any source distribution. -+ -+ Jean-loup Gailly Mark Adler -+ jloup@gzip.org madler@alumni.caltech.edu -+ -+ The above comment applies only to the reiser4_adler32 function. -+*/ -+ -+__u32 reiser4_adler32(char *data, __u32 len) -+{ -+ unsigned char *t = data; -+ __u32 s1 = 1; -+ __u32 s2 = 0; -+ int k; -+ -+ while (len > 0) { -+ k = len < ADLER_NMAX ? 
len : ADLER_NMAX;
-+ len -= k;
-+
-+ while (k--) {
-+ s1 += *t++;
-+ s2 += s1;
-+ }
-+
-+ s1 %= ADLER_BASE;
-+ s2 %= ADLER_BASE;
-+ }
-+ return (s2 << 16) | s1;
-+}
-+
-+#define sb_by_bnode(bnode) \
-+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
-+
-+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
-+{
-+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
-+}
-+
-+static int
-+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
-+{
-+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
-+ bmap_nr_t bmap;
-+
-+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
-+
-+ warning("vpf-263",
-+ "Checksum for the bitmap block %llu is incorrect",
-+ bmap);
-+
-+ return RETERR(-EIO);
-+ }
-+
-+ return 0;
-+}
-+
-+#define REISER4_CHECK_BMAP_CRC (0)
-+
-+#if REISER4_CHECK_BMAP_CRC
-+static int bnode_check_crc(const struct bitmap_node *bnode)
-+{
-+ return bnode_check_adler32(bnode,
-+ bmap_size(sb_by_bnode(bnode)->s_blocksize));
-+}
-+
-+/* REISER4_CHECK_BMAP_CRC */
-+#else
-+
-+#define bnode_check_crc(bnode) (0)
-+
-+/* REISER4_CHECK_BMAP_CRC */
-+#endif
-+
-+/* Recalculates the adler32 checksum after a change to a single byte.
-+ adler - previous adler checksum
-+ old_data, data - old and new byte values
-+ tail - (chunk_length - offset), where chunk_length is the length the
-+ checksum was calculated over and offset is the offset of the changed
-+ byte within that chunk.
-+ With delta = data - old_data, the update is s1' = (s1 + delta) mod BASE
-+ and s2' = (s2 + tail * delta) mod BASE, since the changed byte
-+ contributes to s2 once for itself and once for each byte after it.
-+ This function can be used for checksum calculation optimisation.
-+*/
-+
-+static __u32
-+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
-+ __u32 tail)
-+{
-+ __u32 delta = data - old_data + 2 * ADLER_BASE;
-+ __u32 s1 = adler & 0xffff;
-+ __u32 s2 = (adler >> 16) & 0xffff;
-+
-+ s1 = (delta + s1) % ADLER_BASE;
-+ s2 = (delta * tail + s2) % ADLER_BASE;
-+
-+ return (s2 << 16) | s1;
-+}
-+
-+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
-+
-+/**
-+ * get_nr_bmap - calculate number of bitmap blocks
-+ * @super: super block with initialized blocksize and block count
-+ *
-+ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
-+ * to maintain free disk space. It assumes that each bitmap addresses the same
-+ * number of blocks, which is given by the bmap_bit_count macro defined above.
-+ * The number of blocks in the filesystem has to be initialized in the reiser4
-+ * private data of the super block already, so that it can be obtained via
-+ * reiser4_block_count(). Unfortunately, the number of blocks addressed by a
-+ * bitmap is not a power of 2, because 4 bytes are used for the checksum.
-+ * Therefore, we have to use special functions to divide and modulo 64-bit
-+ * filesystem block counters.
-+ *
-+ * Example: suppose the filesystem has 32768 blocks and the blocksize is 4096.
-+ * Each bitmap block addresses (4096 - 4) * 8 = 32736 blocks. The number of
-+ * bitmaps needed to address all 32768 blocks is calculated as
-+ * (32768 - 1) / 32736 + 1 = 2. 
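-+ *
-+ * Continuing that example: block number 40000 falls into bitmap 1 at bit
-+ * offset 40000 - 32736 = 7264, which is exactly what parse_blocknr()
-+ * below computes with do_div().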
-+ */ -+static bmap_nr_t get_nr_bmap(const struct super_block *super) -+{ -+ u64 quotient; -+ -+ assert("zam-393", reiser4_block_count(super) != 0); -+ -+ quotient = reiser4_block_count(super) - 1; -+ do_div(quotient, bmap_bit_count(super->s_blocksize)); -+ return quotient + 1; -+} -+ -+/** -+ * parse_blocknr - calculate bitmap number and offset in it by block number -+ * @block: pointer to block number to calculate location in bitmap of -+ * @bmap: pointer where to store bitmap block number -+ * @offset: pointer where to store offset within bitmap block -+ * -+ * Calculates location of bit which is responsible for allocation/freeing of -+ * block @*block. That location is represented by bitmap block number and offset -+ * within that bitmap block. -+ */ -+static void -+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap, -+ bmap_off_t *offset) -+{ -+ struct super_block *super = get_current_context()->super; -+ u64 quotient = *block; -+ -+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize)); -+ *bmap = quotient; -+ -+ assert("zam-433", *bmap < get_nr_bmap(super)); -+ assert("", *offset < bmap_bit_count(super->s_blocksize)); -+} -+ -+#if REISER4_DEBUG -+/* Audited by: green(2002.06.12) */ -+static void -+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ assert("zam-436", sb != NULL); -+ -+ assert("zam-455", start != NULL); -+ assert("zam-437", *start != 0); -+ assert("zam-541", !reiser4_blocknr_is_fake(start)); -+ assert("zam-441", *start < reiser4_block_count(sb)); -+ -+ if (len != NULL) { -+ assert("zam-438", *len != 0); -+ assert("zam-442", *start + *len <= reiser4_block_count(sb)); -+ } -+} -+ -+static void check_bnode_loaded(const struct bitmap_node *bnode) -+{ -+ assert("zam-485", bnode != NULL); -+ assert("zam-483", jnode_page(bnode->wjnode) != NULL); -+ assert("zam-484", jnode_page(bnode->cjnode) != NULL); -+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode)); -+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode)); -+} -+ -+#else -+ -+# define check_block_range(start, len) do { /* nothing */} while(0) -+# define check_bnode_loaded(bnode) do { /* nothing */} while(0) -+ -+#endif -+ -+/* modify bnode->first_zero_bit (if we free bits before); bnode should be -+ spin-locked */ -+static inline void -+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset) -+{ -+ if (offset < bnode->first_zero_bit) -+ bnode->first_zero_bit = offset; -+} -+ -+/* return a physical disk address for logical bitmap number @bmap */ -+/* FIXME-VS: this is somehow related to disk layout? */ -+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference -+ * per block allocation so that performance is not affected. Probably this -+ * whole file should be considered part of the disk layout plugin, and other -+ * disk layouts can use other defines and efficiency will not be significantly -+ * affected. */ -+ -+#define REISER4_FIRST_BITMAP_BLOCK \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2) -+ -+/* Audited by: green(2002.06.12) */ -+static void -+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap, -+ reiser4_block_nr * bnr) -+{ -+ -+ assert("zam-390", bmap < get_nr_bmap(super)); -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff)) -+ /* Check if the diskmap have this already, first. 
*/
-+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
-+ return; /* Found it in diskmap */
-+#endif
-+ /* FIXME_ZAM: before discussing of disk layouts and disk format
-+ plugins I implement a bitmap location scheme which is close to the
-+ scheme used in reiser 3.6 */
-+ if (bmap == 0) {
-+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
-+ } else {
-+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
-+ }
-+}
-+
-+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
-+/* Audited by: green(2002.06.12) */
-+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
-+{
-+ *bnr =
-+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
-+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
-+}
-+
-+/* bnode structure initialization */
-+static void
-+init_bnode(struct bitmap_node *bnode,
-+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
-+{
-+ memset(bnode, 0, sizeof(struct bitmap_node));
-+
-+ mutex_init(&bnode->mutex);
-+ atomic_set(&bnode->loaded, 0);
-+}
-+
-+static void release(jnode * node)
-+{
-+ jrelse(node);
-+ JF_SET(node, JNODE_HEARD_BANSHEE);
-+ jput(node);
-+}
-+
-+/* This function is for internal bitmap.c use because it assumes that the
-+ jnode is under full control of this thread */
-+static void done_bnode(struct bitmap_node *bnode)
-+{
-+ if (bnode) {
-+ atomic_set(&bnode->loaded, 0);
-+ if (bnode->wjnode != NULL)
-+ release(bnode->wjnode);
-+ if (bnode->cjnode != NULL)
-+ release(bnode->cjnode);
-+ bnode->wjnode = bnode->cjnode = NULL;
-+ }
-+}
-+
-+/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
-+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
-+ jnode **wjnode_ret)
-+{
-+ struct super_block *super;
-+ jnode *cjnode;
-+ jnode *wjnode;
-+ bmap_nr_t bmap;
-+ int ret;
-+
-+ super = reiser4_get_current_sb();
-+
-+ *wjnode_ret = wjnode = bnew();
-+ if (wjnode == NULL) {
-+ *cjnode_ret = NULL;
-+ return RETERR(-ENOMEM);
-+ }
-+
-+ *cjnode_ret = cjnode = bnew();
-+ if (cjnode == NULL)
-+ return RETERR(-ENOMEM);
-+
-+ bmap = bnode - get_bnode(super, 0);
-+
-+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
-+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
-+
-+ jref(cjnode);
-+ jref(wjnode);
-+
-+ /* load commit bitmap */
-+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
-+
-+ if (ret)
-+ goto error;
-+
-+ /* allocate memory for the working bitmap block. Note that for
-+ * bitmaps jinit_new() doesn't actually modify node content,
-+ * so parallel calls to this are ok. */
-+ ret = jinit_new(wjnode, GFP_NOFS);
-+
-+ if (ret != 0) {
-+ jrelse(cjnode);
-+ goto error;
-+ }
-+
-+ return 0;
-+
-+ error:
-+ jput(cjnode);
-+ jput(wjnode);
-+ *wjnode_ret = *cjnode_ret = NULL;
-+ return ret;
-+
-+}
-+
-+/* Check the bnode data on read. */
-+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
-+{
-+ void *data;
-+ int ret;
-+
-+ /* Check CRC */
-+ ret = bnode_check_adler32(bnode, blksize);
-+
-+ if (ret) {
-+ return ret;
-+ }
-+
-+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
-+
-+ /* Check the very first bit -- it must be busy. 
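-+ (For bmap > 0 the bitmap block itself occupies the first block of the
-+ zone it describes -- see get_bitmap_blocknr() above -- and bit 0 of
-+ bitmap 0 covers the reserved area at the start of the filesystem, so
-+ in a consistent filesystem this bit is always set.)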
*/ -+ if (!reiser4_test_bit(0, data)) { -+ warning("vpf-1362", "The allocator block %llu is not marked " -+ "as used.", (unsigned long long)bnode->cjnode->blocknr); -+ -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+/* load bitmap blocks "on-demand" */ -+static int load_and_lock_bnode(struct bitmap_node *bnode) -+{ -+ int ret; -+ -+ jnode *cjnode; -+ jnode *wjnode; -+ -+ assert("nikita-3040", reiser4_schedulable()); -+ -+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not -+ * need to be atomic, right? Just leave a comment that if bitmaps were -+ * unloadable, this would need to be atomic. */ -+ if (atomic_read(&bnode->loaded)) { -+ /* bitmap is already loaded, nothing to do */ -+ check_bnode_loaded(bnode); -+ mutex_lock(&bnode->mutex); -+ assert("nikita-2827", atomic_read(&bnode->loaded)); -+ return 0; -+ } -+ -+ ret = prepare_bnode(bnode, &cjnode, &wjnode); -+ if (ret == 0) { -+ mutex_lock(&bnode->mutex); -+ -+ if (!atomic_read(&bnode->loaded)) { -+ assert("nikita-2822", cjnode != NULL); -+ assert("nikita-2823", wjnode != NULL); -+ assert("nikita-2824", jnode_is_loaded(cjnode)); -+ assert("nikita-2825", jnode_is_loaded(wjnode)); -+ -+ bnode->wjnode = wjnode; -+ bnode->cjnode = cjnode; -+ -+ ret = check_struct_bnode(bnode, current_blocksize); -+ if (!ret) { -+ cjnode = wjnode = NULL; -+ atomic_set(&bnode->loaded, 1); -+ /* working bitmap is initialized by on-disk -+ * commit bitmap. This should be performed -+ * under mutex. */ -+ memcpy(bnode_working_data(bnode), -+ bnode_commit_data(bnode), -+ bmap_size(current_blocksize)); -+ } else -+ mutex_unlock(&bnode->mutex); -+ } else -+ /* race: someone already loaded bitmap while we were -+ * busy initializing data. */ -+ check_bnode_loaded(bnode); -+ } -+ -+ if (wjnode != NULL) { -+ release(wjnode); -+ bnode->wjnode = NULL; -+ } -+ if (cjnode != NULL) { -+ release(cjnode); -+ bnode->cjnode = NULL; -+ } -+ -+ return ret; -+} -+ -+static void release_and_unlock_bnode(struct bitmap_node *bnode) -+{ -+ check_bnode_loaded(bnode); -+ mutex_unlock(&bnode->mutex); -+} -+ -+/* This function does all block allocation work but only for one bitmap -+ block.*/ -+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap -+ block responsibility zone boundaries. This had no sense in v3.6 but may -+ have it in v4.x */ -+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? 
*/
-+static int
-+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
-+ bmap_off_t max_offset, int min_len, int max_len)
-+{
-+ struct super_block *super = get_current_context()->super;
-+ struct bitmap_node *bnode = get_bnode(super, bmap);
-+
-+ char *data;
-+
-+ bmap_off_t search_end;
-+ bmap_off_t start;
-+ bmap_off_t end;
-+
-+ int set_first_zero_bit = 0;
-+
-+ int ret;
-+
-+ assert("zam-364", min_len > 0);
-+ assert("zam-365", max_len >= min_len);
-+ assert("zam-366", *offset <= max_offset);
-+
-+ ret = load_and_lock_bnode(bnode);
-+
-+ if (ret)
-+ return ret;
-+
-+ data = bnode_working_data(bnode);
-+
-+ start = *offset;
-+
-+ if (bnode->first_zero_bit >= start) {
-+ start = bnode->first_zero_bit;
-+ set_first_zero_bit = 1;
-+ }
-+
-+ while (start + min_len < max_offset) {
-+
-+ start =
-+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
-+ if (set_first_zero_bit) {
-+ bnode->first_zero_bit = start;
-+ set_first_zero_bit = 0;
-+ }
-+ if (start >= max_offset)
-+ break;
-+
-+ search_end = LIMIT(start + max_len, max_offset);
-+ end =
-+ reiser4_find_next_set_bit((long *)data, search_end, start);
-+ if (end >= start + min_len) {
-+ /* we can't trust the find_next_set_bit result if no
-+ set bit was found; the result may be bigger than
-+ max_offset */
-+ if (end > search_end)
-+ end = search_end;
-+
-+ ret = end - start;
-+ *offset = start;
-+
-+ reiser4_set_bits(data, start, end);
-+
-+ /* FIXME: we may advance first_zero_bit if [start,
-+ end] region overlaps the first_zero_bit point */
-+
-+ break;
-+ }
-+
-+ start = end + 1;
-+ }
-+
-+ release_and_unlock_bnode(bnode);
-+
-+ return ret;
-+}
-+
-+static int
-+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
-+ bmap_off_t end_offset, int min_len, int max_len)
-+{
-+ struct super_block *super = get_current_context()->super;
-+ struct bitmap_node *bnode = get_bnode(super, bmap);
-+ char *data;
-+ bmap_off_t start;
-+ int ret;
-+
-+ assert("zam-958", min_len > 0);
-+ assert("zam-959", max_len >= min_len);
-+ assert("zam-960", *start_offset >= end_offset);
-+
-+ ret = load_and_lock_bnode(bnode);
-+ if (ret)
-+ return ret;
-+
-+ data = bnode_working_data(bnode);
-+ start = *start_offset;
-+
-+ while (1) {
-+ bmap_off_t end, search_end;
-+
-+ /* Find the beginning of the zero filled region */
-+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
-+ break;
-+ /* Are there more than `min_len' bits from `start' to
-+ * `end_offset'? */
-+ if (start < end_offset + min_len - 1)
-+ break;
-+
-+ /* Do not search to `end_offset' if we need to find less than
-+ * `max_len' zero bits. */
-+ if (end_offset + max_len - 1 < start)
-+ search_end = start - max_len + 1;
-+ else
-+ search_end = end_offset;
-+
-+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
-+ end = search_end;
-+ else
-+ end++;
-+
-+ if (end + min_len <= start + 1) {
-+ if (end < search_end)
-+ end = search_end;
-+ ret = start - end + 1;
-+ *start_offset = end; /* `end' is lowest offset */
-+ assert("zam-987",
-+ reiser4_find_next_set_bit(data, start + 1,
-+ end) >= start + 1);
-+ reiser4_set_bits(data, end, start + 1);
-+ break;
-+ }
-+
-+ if (end <= end_offset)
-+ /* left search boundary reached. 
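-+ (the zero-filled run we found already touches the low end
-+ of the search region, so there is nothing further to scan)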
*/ -+ break; -+ start = end - 1; -+ } -+ -+ release_and_unlock_bnode(bnode); -+ return ret; -+} -+ -+/* allocate contiguous range of blocks in bitmap */ -+static int bitmap_alloc_forward(reiser4_block_nr * start, -+ const reiser4_block_nr * end, int min_len, -+ int max_len) -+{ -+ bmap_nr_t bmap, end_bmap; -+ bmap_off_t offset, end_offset; -+ int len; -+ -+ reiser4_block_nr tmp; -+ -+ struct super_block *super = get_current_context()->super; -+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); -+ -+ parse_blocknr(start, &bmap, &offset); -+ -+ tmp = *end - 1; -+ parse_blocknr(&tmp, &end_bmap, &end_offset); -+ ++end_offset; -+ -+ assert("zam-358", end_bmap >= bmap); -+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset)); -+ -+ for (; bmap < end_bmap; bmap++, offset = 0) { -+ len = -+ search_one_bitmap_forward(bmap, &offset, max_offset, -+ min_len, max_len); -+ if (len != 0) -+ goto out; -+ } -+ -+ len = -+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len, -+ max_len); -+ out: -+ *start = bmap * max_offset + offset; -+ return len; -+} -+ -+/* allocate contiguous range of blocks in bitmap (from @start to @end in -+ * backward direction) */ -+static int bitmap_alloc_backward(reiser4_block_nr * start, -+ const reiser4_block_nr * end, int min_len, -+ int max_len) -+{ -+ bmap_nr_t bmap, end_bmap; -+ bmap_off_t offset, end_offset; -+ int len; -+ struct super_block *super = get_current_context()->super; -+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); -+ -+ parse_blocknr(start, &bmap, &offset); -+ parse_blocknr(end, &end_bmap, &end_offset); -+ -+ assert("zam-961", end_bmap <= bmap); -+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset)); -+ -+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) { -+ len = -+ search_one_bitmap_backward(bmap, &offset, 0, min_len, -+ max_len); -+ if (len != 0) -+ goto out; -+ } -+ -+ len = -+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len, -+ max_len); -+ out: -+ *start = bmap * max_offset + offset; -+ return len; -+} -+ -+/* plugin->u.space_allocator.alloc_blocks() */ -+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed, -+ reiser4_block_nr *start, reiser4_block_nr *len) -+{ -+ struct super_block *super = get_current_context()->super; -+ int actual_len; -+ -+ reiser4_block_nr search_start; -+ reiser4_block_nr search_end; -+ -+ assert("zam-398", super != NULL); -+ assert("zam-412", hint != NULL); -+ assert("zam-397", hint->blk <= reiser4_block_count(super)); -+ -+ if (hint->max_dist == 0) -+ search_end = reiser4_block_count(super); -+ else -+ search_end = -+ LIMIT(hint->blk + hint->max_dist, -+ reiser4_block_count(super)); -+ -+ /* We use @hint -> blk as a search start and search from it to the end -+ of the disk or in given region if @hint -> max_dist is not zero */ -+ search_start = hint->blk; -+ -+ actual_len = -+ bitmap_alloc_forward(&search_start, &search_end, 1, needed); -+ -+ /* There is only one bitmap search if max_dist was specified or first -+ pass was from the beginning of the bitmap. We also do one pass for -+ scanning bitmap in backward direction. 
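-+ In other words, the retry from block 0 below happens only when the
-+ forward pass found nothing, no max_dist was given, and the search did
-+ not already start at block 0.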
*/
-+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
-+ /* next step is a scanning from 0 to search_start */
-+ search_end = search_start;
-+ search_start = 0;
-+ actual_len =
-+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
-+ }
-+ if (actual_len == 0)
-+ return RETERR(-ENOSPC);
-+ if (actual_len < 0)
-+ return RETERR(actual_len);
-+ *len = actual_len;
-+ *start = search_start;
-+ return 0;
-+}
-+
-+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
-+ reiser4_block_nr * start,
-+ reiser4_block_nr * len)
-+{
-+ reiser4_block_nr search_start;
-+ reiser4_block_nr search_end;
-+ int actual_len;
-+
-+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
-+
-+ assert("zam-969", super != NULL);
-+ assert("zam-970", hint != NULL);
-+ assert("zam-971", hint->blk <= reiser4_block_count(super));
-+
-+ search_start = hint->blk;
-+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
-+ search_end = 0;
-+ else
-+ search_end = search_start - hint->max_dist;
-+
-+ actual_len =
-+ bitmap_alloc_backward(&search_start, &search_end, 1, needed);
-+ if (actual_len == 0)
-+ return RETERR(-ENOSPC);
-+ if (actual_len < 0)
-+ return RETERR(actual_len);
-+ *len = actual_len;
-+ *start = search_start;
-+ return 0;
-+}
-+
-+/* plugin->u.space_allocator.alloc_blocks() */
-+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
-+ reiser4_blocknr_hint * hint, int needed,
-+ reiser4_block_nr * start, reiser4_block_nr * len)
-+{
-+ if (hint->backward)
-+ return alloc_blocks_backward(hint, needed, start, len);
-+ return alloc_blocks_forward(hint, needed, start, len);
-+}
-+
-+/* plugin->u.space_allocator.dealloc_blocks(). */
-+/* It just frees blocks in the WORKING BITMAP. Usually deletion of formatted
-+ and unformatted nodes is deferred until transaction commit. However,
-+ deallocation of temporary objects like wandered blocks and transaction
-+ commit records requires immediate node deletion from the WORKING BITMAP.*/
-+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
-+ reiser4_block_nr start, reiser4_block_nr len)
-+{
-+ struct super_block *super = reiser4_get_current_sb();
-+
-+ bmap_nr_t bmap;
-+ bmap_off_t offset;
-+
-+ struct bitmap_node *bnode;
-+ int ret;
-+
-+ assert("zam-468", len != 0);
-+ check_block_range(&start, &len);
-+
-+ parse_blocknr(&start, &bmap, &offset);
-+
-+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
-+
-+ bnode = get_bnode(super, bmap);
-+
-+ assert("zam-470", bnode != NULL);
-+
-+ ret = load_and_lock_bnode(bnode);
-+ assert("zam-481", ret == 0);
-+
-+ reiser4_clear_bits(bnode_working_data(bnode), offset,
-+ (bmap_off_t) (offset + len));
-+
-+ adjust_first_zero_bit(bnode, offset);
-+
-+ release_and_unlock_bnode(bnode);
-+}
-+
-+/* plugin->u.space_allocator.check_blocks(). 
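-+ This is a debug-only sanity check: it asserts that every block in
-+ [start, start + len) is marked used in the WORKING BITMAP when @desired
-+ is non-zero, and marked free when it is zero.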
*/
-+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
-+ const reiser4_block_nr * len, int desired)
-+{
-+#if REISER4_DEBUG
-+ struct super_block *super = reiser4_get_current_sb();
-+
-+ bmap_nr_t bmap;
-+ bmap_off_t start_offset;
-+ bmap_off_t end_offset;
-+
-+ struct bitmap_node *bnode;
-+ int ret;
-+
-+ assert("zam-622", len != NULL);
-+ check_block_range(start, len);
-+ parse_blocknr(start, &bmap, &start_offset);
-+
-+ end_offset = start_offset + *len;
-+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
-+
-+ bnode = get_bnode(super, bmap);
-+
-+ assert("nikita-2215", bnode != NULL);
-+
-+ ret = load_and_lock_bnode(bnode);
-+ assert("zam-626", ret == 0);
-+
-+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
-+
-+ if (desired) {
-+ assert("zam-623",
-+ reiser4_find_next_zero_bit(bnode_working_data(bnode),
-+ end_offset, start_offset)
-+ >= end_offset);
-+ } else {
-+ assert("zam-624",
-+ reiser4_find_next_set_bit(bnode_working_data(bnode),
-+ end_offset, start_offset)
-+ >= end_offset);
-+ }
-+
-+ release_and_unlock_bnode(bnode);
-+#endif
-+}
-+
-+/* conditional insertion of @node into atom's overwrite set if it was not there */
-+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
-+{
-+ assert("zam-546", atom != NULL);
-+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
-+ assert("zam-548", node != NULL);
-+
-+ spin_lock_atom(atom);
-+ spin_lock_jnode(node);
-+
-+ if (node->atom == NULL) {
-+ JF_SET(node, JNODE_OVRWR);
-+ insert_into_atom_ovrwr_list(atom, node);
-+ } else {
-+ assert("zam-549", node->atom == atom);
-+ }
-+
-+ spin_unlock_jnode(node);
-+ spin_unlock_atom(atom);
-+}
-+
-+/* an actor which applies the delete set to COMMIT bitmap pages and links
-+ modified pages into a singly-linked list */
-+static int
-+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
-+ const reiser4_block_nr * len, void *data)
-+{
-+
-+ bmap_nr_t bmap;
-+ bmap_off_t offset;
-+ int ret;
-+
-+ long long *blocks_freed_p = data;
-+
-+ struct bitmap_node *bnode;
-+
-+ struct super_block *sb = reiser4_get_current_sb();
-+
-+ check_block_range(start, len);
-+
-+ parse_blocknr(start, &bmap, &offset);
-+
-+ /* FIXME-ZAM: we assume that all block ranges are allocated by this
-+ bitmap-based allocator and each block range can't go over a zone of
-+ responsibility of one bitmap block; the same assumption is used in
-+ other journal hooks in bitmap code. */
-+ bnode = get_bnode(sb, bmap);
-+ assert("zam-448", bnode != NULL);
-+
-+ /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
-+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
-+ ret = load_and_lock_bnode(bnode);
-+ if (ret)
-+ return ret;
-+
-+ /* put bnode into atom's overwrite set */
-+ cond_add_to_overwrite_set(atom, bnode->cjnode);
-+
-+ data = bnode_commit_data(bnode);
-+
-+ ret = bnode_check_crc(bnode);
-+ if (ret != 0)
-+ return ret;
-+
-+ if (len != NULL) {
-+ /* FIXME-ZAM: a check that all bits are set should be there */
-+ assert("zam-443",
-+ offset + *len <= bmap_bit_count(sb->s_blocksize));
-+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
-+
-+ (*blocks_freed_p) += *len;
-+ } else {
-+ reiser4_clear_bit(offset, data);
-+ (*blocks_freed_p)++;
-+ }
-+
-+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
-+
-+ release_and_unlock_bnode(bnode);
-+
-+ return 0;
-+}
-+
-+/* plugin->u.space_allocator.pre_commit_hook(). 
*/
-+/* It just applies transaction changes to the fs-wide COMMIT BITMAP, hoping
-+ the rest is done by the transaction manager (allocate wandered locations
-+ for COMMIT BITMAP blocks, copy COMMIT BITMAP blocks data). */
-+/* Only one instance of this function can be running at any given time,
-+ because only one transaction can be committed at a time; therefore it is
-+ safe to access some global variables without any locking */
-+
-+int reiser4_pre_commit_hook_bitmap(void)
-+{
-+ struct super_block *super = reiser4_get_current_sb();
-+ txn_atom *atom;
-+
-+ long long blocks_freed = 0;
-+
-+ atom = get_current_atom_locked();
-+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
-+ spin_unlock_atom(atom);
-+
-+ { /* scan atom's captured list and find all freshly allocated nodes,
-+ * mark the corresponding bits in the COMMIT BITMAP as used */
-+ struct list_head *head = ATOM_CLEAN_LIST(atom);
-+ jnode *node = list_entry(head->next, jnode, capture_link);
-+
-+ while (head != &node->capture_link) {
-+ /* we detect freshly allocated jnodes */
-+ if (JF_ISSET(node, JNODE_RELOC)) {
-+ int ret;
-+ bmap_nr_t bmap;
-+
-+ bmap_off_t offset;
-+ bmap_off_t index;
-+ struct bitmap_node *bn;
-+ __u32 size = bmap_size(super->s_blocksize);
-+ __u32 crc;
-+ char byte;
-+
-+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
-+ assert("zam-460",
-+ !reiser4_blocknr_is_fake(&node->blocknr));
-+
-+ parse_blocknr(&node->blocknr, &bmap, &offset);
-+ bn = get_bnode(super, bmap);
-+
-+ index = offset >> 3;
-+ assert("vpf-276", index < size);
-+
-+ ret = bnode_check_crc(bn);
-+ if (ret != 0)
-+ return ret;
-+
-+ check_bnode_loaded(bn);
-+ load_and_lock_bnode(bn);
-+
-+ byte = *(bnode_commit_data(bn) + index);
-+ reiser4_set_bit(offset, bnode_commit_data(bn));
-+
-+ crc = adler32_recalc(bnode_commit_crc(bn), byte,
-+ *(bnode_commit_data(bn) +
-+ index),
-+ size - index);
-+ bnode_set_commit_crc(bn, crc);
-+
-+ release_and_unlock_bnode(bn);
-+
-+ ret = bnode_check_crc(bn);
-+ if (ret != 0)
-+ return ret;
-+
-+ /* correct working of this depends on how a new
-+ j-node is inserted into the clean list, because
-+ we are scanning the same list now. It is OK if
-+ insertion is done at the list front */
-+ cond_add_to_overwrite_set(atom, bn->cjnode);
-+ }
-+
-+ node = list_entry(node->capture_link.next, jnode, capture_link);
-+ }
-+ }
-+
-+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
-+ &blocks_freed, 0);
-+
-+ blocks_freed -= atom->nr_blocks_allocated;
-+
-+ {
-+ reiser4_super_info_data *sbinfo;
-+
-+ sbinfo = get_super_private(super);
-+
-+ spin_lock_reiser4_super(sbinfo);
-+ sbinfo->blocks_free_committed += blocks_freed;
-+ spin_unlock_reiser4_super(sbinfo);
-+ }
-+
-+ return 0;
-+}
-+
-+/* plugin->u.space_allocator.init_allocator
-+ constructor of reiser4_space_allocator object. 
It is called on fs mount */ -+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator, -+ struct super_block *super, void *arg) -+{ -+ struct bitmap_allocator_data *data = NULL; -+ bmap_nr_t bitmap_blocks_nr; -+ bmap_nr_t i; -+ -+ assert("nikita-3039", reiser4_schedulable()); -+ -+ /* getting memory for bitmap allocator private data holder */ -+ data = -+ kmalloc(sizeof(struct bitmap_allocator_data), -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (data == NULL) -+ return RETERR(-ENOMEM); -+ -+ /* allocation and initialization for the array of bnodes */ -+ bitmap_blocks_nr = get_nr_bmap(super); -+ -+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps -+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17, -+ may I never meet someone who still uses the ia32 architecture when -+ storage devices of that size enter the market, and wants to use ia32 -+ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and, -+ probably, another dynamic data structure should replace a static -+ array of bnodes. */ -+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */ -+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr); -+ if (data->bitmap == NULL) { -+ kfree(data); -+ return RETERR(-ENOMEM); -+ } -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) -+ init_bnode(data->bitmap + i, super, i); -+ -+ allocator->u.generic = data; -+ -+#if REISER4_DEBUG -+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr; -+#endif -+ -+ /* Load all bitmap blocks at mount time. */ -+ if (!test_bit -+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) { -+ __u64 start_time, elapsed_time; -+ struct bitmap_node *bnode; -+ int ret; -+ -+ if (REISER4_DEBUG) -+ printk(KERN_INFO "loading reiser4 bitmap..."); -+ start_time = jiffies; -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) { -+ bnode = data->bitmap + i; -+ ret = load_and_lock_bnode(bnode); -+ if (ret) { -+ reiser4_destroy_allocator_bitmap(allocator, -+ super); -+ return ret; -+ } -+ release_and_unlock_bnode(bnode); -+ } -+ -+ elapsed_time = jiffies - start_time; -+ if (REISER4_DEBUG) -+ printk("...done (%llu jiffies)\n", -+ (unsigned long long)elapsed_time); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.space_allocator.destroy_allocator -+ destructor. 
It is called on fs unmount */
-+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
-+ struct super_block *super)
-+{
-+ bmap_nr_t bitmap_blocks_nr;
-+ bmap_nr_t i;
-+
-+ struct bitmap_allocator_data *data = allocator->u.generic;
-+
-+ assert("zam-414", data != NULL);
-+ assert("zam-376", data->bitmap != NULL);
-+
-+ bitmap_blocks_nr = get_nr_bmap(super);
-+
-+ for (i = 0; i < bitmap_blocks_nr; i++) {
-+ struct bitmap_node *bnode = data->bitmap + i;
-+
-+ mutex_lock(&bnode->mutex);
-+
-+#if REISER4_DEBUG
-+ if (atomic_read(&bnode->loaded)) {
-+ jnode *wj = bnode->wjnode;
-+ jnode *cj = bnode->cjnode;
-+
-+ assert("zam-480", jnode_page(cj) != NULL);
-+ assert("zam-633", jnode_page(wj) != NULL);
-+
-+ assert("zam-634",
-+ memcmp(jdata(wj), jdata(cj),
-+ bmap_size(super->s_blocksize)) == 0);
-+
-+ }
-+#endif
-+ done_bnode(bnode);
-+ mutex_unlock(&bnode->mutex);
-+ }
-+
-+ vfree(data->bitmap);
-+ kfree(data);
-+
-+ allocator->u.generic = NULL;
-+
-+ return 0;
-+}
-+
-+/*
-+ * Local variables:
-+ * c-indentation-style: "K&R"
-+ * mode-name: "LC"
-+ * c-basic-offset: 8
-+ * tab-width: 8
-+ * fill-column: 79
-+ * scroll-step: 1
-+ * End:
-+ */
-diff -urN linux-2.6.30.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.30/fs/reiser4/plugin/space/bitmap.h
---- linux-2.6.30.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.30/fs/reiser4/plugin/space/bitmap.h 2009-06-22 16:08:13.000000000 +0200
-@@ -0,0 +1,47 @@
-+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
-+
-+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
-+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
-+
-+#include "../../dformat.h"
-+#include "../../block_alloc.h"
-+
-+#include <linux/types.h> /* for __u?? */
-+#include <linux/fs.h> /* for struct super_block */
-+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
-+/* declarations of functions implementing methods of space allocator plugin for
-+ bitmap based allocator. The functions themselves are in bitmap.c */
-+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
-+ struct super_block *, void *);
-+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
-+ struct super_block *);
-+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
-+ reiser4_blocknr_hint *, int needed,
-+ reiser4_block_nr * start,
-+ reiser4_block_nr * len);
-+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
-+ const reiser4_block_nr *, int);
-+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
-+ reiser4_block_nr,
-+ reiser4_block_nr);
-+extern int reiser4_pre_commit_hook_bitmap(void);
-+
-+#define reiser4_post_commit_hook_bitmap() do{}while(0)
-+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
-+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
-+
-+typedef __u64 bmap_nr_t;
-+typedef __u32 bmap_off_t;
-+
-+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
-+
-+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/space/Makefile linux-2.6.30/fs/reiser4/plugin/space/Makefile ---- linux-2.6.30.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/space/Makefile 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,4 @@ -+obj-$(CONFIG_REISER4_FS) += space_plugins.o -+ -+space_plugins-objs := \ -+ bitmap.o -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.30/fs/reiser4/plugin/space/space_allocator.h ---- linux-2.6.30.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/space/space_allocator.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,80 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#ifndef __SPACE_ALLOCATOR_H__ -+#define __SPACE_ALLOCATOR_H__ -+ -+#include "../../forward.h" -+#include "bitmap.h" -+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now, -+ * but... */ -+#define DEF_SPACE_ALLOCATOR(allocator) \ -+ \ -+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \ -+{ \ -+ return reiser4_init_allocator_##allocator (al, s, opaque); \ -+} \ -+ \ -+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \ -+{ \ -+ reiser4_destroy_allocator_##allocator (al, s); \ -+} \ -+ \ -+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \ -+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \ -+{ \ -+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \ -+} \ -+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \ -+{ \ -+ reiser4_dealloc_blocks_##allocator (al, start, len); \ -+} \ -+ \ -+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \ -+{ \ -+ reiser4_check_blocks_##allocator (start, end, desired); \ -+} \ -+ \ -+static inline void sa_pre_commit_hook (void) \ -+{ \ -+ reiser4_pre_commit_hook_##allocator (); \ -+} \ -+ \ -+static inline void sa_post_commit_hook (void) \ -+{ \ -+ reiser4_post_commit_hook_##allocator (); \ -+} \ -+ \ -+static inline void sa_post_write_back_hook (void) \ -+{ \ -+ reiser4_post_write_back_hook_##allocator(); \ -+} \ -+ \ -+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \ -+{ \ -+ reiser4_print_info_##allocator (prefix, al); \ -+} -+ -+DEF_SPACE_ALLOCATOR(bitmap) -+ -+/* this object is part of reiser4 private in-core super block */ -+struct reiser4_space_allocator { -+ union { -+ /* space allocators might use this pointer to reference their -+ * data. */ -+ void *generic; -+ } u; -+}; -+ -+/* __SPACE_ALLOCATOR_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.30/fs/reiser4/plugin/tail_policy.c ---- linux-2.6.30.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/plugin/tail_policy.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,113 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Formatting policy plugins */ -+ -+/* -+ * Formatting policy plugin is used by object plugin (of regular file) to -+ * convert file between two representations. -+ * -+ * Currently following policies are implemented: -+ * never store file in formatted nodes -+ * always store file in formatted nodes -+ * store file in formatted nodes if file is smaller than 4 blocks (default) -+ */ -+ -+#include "../tree.h" -+#include "../inode.h" -+#include "../super.h" -+#include "object.h" -+#include "plugin.h" -+#include "node/node.h" -+#include "plugin_header.h" -+ -+#include <linux/pagemap.h> -+#include <linux/fs.h> /* For struct inode */ -+ -+/** -+ * have_formatting_never - -+ * @inode: -+ * @size: -+ * -+ * -+ */ -+/* Never store file's tail as direct item */ -+/* Audited by: green(2002.06.12) */ -+static int have_formatting_never(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size UNUSED_ARG/* new object size */) -+{ -+ return 0; -+} -+ -+/* Always store file's tail as direct item */ -+/* Audited by: green(2002.06.12) */ -+static int -+have_formatting_always(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size UNUSED_ARG/* new object size */) -+{ -+ return 1; -+} -+ -+/* This function makes test if we should store file denoted @inode as tails only -+ or as extents only. 
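-+ For example, with a 4096-byte block size this default policy returns 1
-+ (store the body in tail items) for files of 16384 bytes or less, and 0
-+ (use extents) for anything larger.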
*/ -+static int -+have_formatting_default(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size/* new object size */) -+{ -+ assert("umka-1253", inode != NULL); -+ -+ if (size > inode->i_sb->s_blocksize * 4) -+ return 0; -+ -+ return 1; -+} -+ -+/* tail plugins */ -+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = { -+ [NEVER_TAILS_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = NEVER_TAILS_FORMATTING_ID, -+ .pops = NULL, -+ .label = "never", -+ .desc = "Never store file's tail", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_never -+ }, -+ [ALWAYS_TAILS_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = ALWAYS_TAILS_FORMATTING_ID, -+ .pops = NULL, -+ .label = "always", -+ .desc = "Always store file's tail", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_always -+ }, -+ [SMALL_FILE_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = SMALL_FILE_FORMATTING_ID, -+ .pops = NULL, -+ .label = "4blocks", -+ .desc = "store files shorter than 4 blocks in tail items", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_default -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/pool.c linux-2.6.30/fs/reiser4/pool.c ---- linux-2.6.30.orig/fs/reiser4/pool.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/pool.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,231 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Fast pool allocation. -+ -+ There are situations when some sub-system normally asks memory allocator -+ for only few objects, but under some circumstances could require much -+ more. Typical and actually motivating example is tree balancing. It needs -+ to keep track of nodes that were involved into it, and it is well-known -+ that in reasonable packed balanced tree most (92.938121%) percent of all -+ balancings end up after working with only few nodes (3.141592 on -+ average). But in rare cases balancing can involve much more nodes -+ (3*tree_height+1 in extremal situation). -+ -+ On the one hand, we don't want to resort to dynamic allocation (slab, -+ malloc(), etc.) to allocate data structures required to keep track of -+ nodes during balancing. On the other hand, we cannot statically allocate -+ required amount of space on the stack, because first: it is useless wastage -+ of precious resource, and second: this amount is unknown in advance (tree -+ height can change). -+ -+ Pools, implemented in this file are solution for this problem: -+ -+ - some configurable amount of objects is statically preallocated on the -+ stack -+ -+ - if this preallocated pool is exhausted and more objects is requested -+ they are allocated dynamically. -+ -+ Pools encapsulate distinction between statically and dynamically allocated -+ objects. Both allocation and recycling look exactly the same. -+ -+ To keep track of dynamically allocated objects, pool adds its own linkage -+ to each object. -+ -+ NOTE-NIKITA This linkage also contains some balancing-specific data. This -+ is not perfect. On the other hand, balancing is currently the only client -+ of pool code. 
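-+
-+ To illustrate the interface (a sketch only -- the struct and names below
-+ are made up, not part of this code): a client embeds the pool header at
-+ offset 0 of its object type and hands the pool a preallocated area:
-+
-+	struct my_obj {
-+		struct reiser4_pool_header header;	(must be first)
-+		int payload;
-+	};
-+	char area[10 * sizeof(struct my_obj)];
-+	struct reiser4_pool pool;
-+	LIST_HEAD(level);
-+
-+	reiser4_init_pool(&pool, sizeof(struct my_obj), 10, area);
-+	obj = (struct my_obj *)reiser4_add_obj(&pool, &level, POOLO_LAST, NULL);
-+	...
-+	reiser4_pool_free(&pool, &obj->header);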
-+ -+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation -+ functions in the style of tslist/tshash, i.e., make them unreadable, but -+ type-safe. -+ -+*/ -+ -+#include "debug.h" -+#include "pool.h" -+#include "super.h" -+ -+#include <linux/types.h> -+#include <linux/err.h> -+ -+/* initialize new pool object @h */ -+static void reiser4_init_pool_obj(struct reiser4_pool_header *h) -+{ -+ INIT_LIST_HEAD(&h->usage_linkage); -+ INIT_LIST_HEAD(&h->level_linkage); -+ INIT_LIST_HEAD(&h->extra_linkage); -+} -+ -+/* initialize new pool */ -+void reiser4_init_pool(struct reiser4_pool *pool /* pool to initialize */ , -+ size_t obj_size /* size of objects in @pool */ , -+ int num_of_objs /* number of preallocated objects */ , -+ char *data/* area for preallocated objects */) -+{ -+ struct reiser4_pool_header *h; -+ int i; -+ -+ assert("nikita-955", pool != NULL); -+ assert("nikita-1044", obj_size > 0); -+ assert("nikita-956", num_of_objs >= 0); -+ assert("nikita-957", data != NULL); -+ -+ memset(pool, 0, sizeof *pool); -+ pool->obj_size = obj_size; -+ pool->data = data; -+ INIT_LIST_HEAD(&pool->free); -+ INIT_LIST_HEAD(&pool->used); -+ INIT_LIST_HEAD(&pool->extra); -+ memset(data, 0, obj_size * num_of_objs); -+ for (i = 0; i < num_of_objs; ++i) { -+ h = (struct reiser4_pool_header *) (data + i * obj_size); -+ reiser4_init_pool_obj(h); -+ /* add pool header to the end of pool's free list */ -+ list_add_tail(&h->usage_linkage, &pool->free); -+ } -+} -+ -+/* release pool resources -+ -+ Release all resources acquired by this pool, specifically, dynamically -+ allocated objects. -+ -+*/ -+void reiser4_done_pool(struct reiser4_pool *pool UNUSED_ARG) -+{ -+} -+ -+/* allocate carry object from @pool -+ -+ First, try to get preallocated object. If this fails, resort to dynamic -+ allocation. -+ -+*/ -+static void *reiser4_pool_alloc(struct reiser4_pool *pool) -+{ -+ struct reiser4_pool_header *result; -+ -+ assert("nikita-959", pool != NULL); -+ -+ if (!list_empty(&pool->free)) { -+ struct list_head *linkage; -+ -+ linkage = pool->free.next; -+ list_del(linkage); -+ INIT_LIST_HEAD(linkage); -+ result = list_entry(linkage, struct reiser4_pool_header, -+ usage_linkage); -+ BUG_ON(!list_empty(&result->level_linkage) || -+ !list_empty(&result->extra_linkage)); -+ } else { -+ /* pool is empty. Extra allocations don't deserve dedicated -+ slab to be served from, as they are expected to be rare. */ -+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get()); -+ if (result != 0) { -+ reiser4_init_pool_obj(result); -+ list_add(&result->extra_linkage, &pool->extra); -+ } else -+ return ERR_PTR(RETERR(-ENOMEM)); -+ BUG_ON(!list_empty(&result->usage_linkage) || -+ !list_empty(&result->level_linkage)); -+ } -+ ++pool->objs; -+ list_add(&result->usage_linkage, &pool->used); -+ memset(result + 1, 0, pool->obj_size - sizeof *result); -+ return result; -+} -+ -+/* return object back to the pool */ -+void reiser4_pool_free(struct reiser4_pool *pool, -+ struct reiser4_pool_header *h) -+{ -+ assert("nikita-961", h != NULL); -+ assert("nikita-962", pool != NULL); -+ -+ --pool->objs; -+ assert("nikita-963", pool->objs >= 0); -+ -+ list_del_init(&h->usage_linkage); -+ list_del_init(&h->level_linkage); -+ -+ if (list_empty(&h->extra_linkage)) -+ /* -+ * pool header is not an extra one. 
Push it onto free list
-+ * using usage_linkage
-+ */
-+ list_add(&h->usage_linkage, &pool->free);
-+ else {
-+ /* remove pool header from pool's extra list and kfree it */
-+ list_del(&h->extra_linkage);
-+ kfree(h);
-+ }
-+}
-+
-+/* add new object to the carry level list
-+
-+ Carry level is FIFO most of the time, but not always. Complications arise
-+ when make_space() function tries to go to the left neighbor and thus adds
-+ carry node before existing nodes, and also, when updating delimiting keys
-+ after moving data between two nodes, we want the left node to be locked
-+ before the right node.
-+
-+ The latter case is confusing at first glance. The problem is that the
-+ COP_UPDATE operation that updates delimiting keys is sometimes called with
-+ two nodes (when data are moved between two nodes) and sometimes with only
-+ one node (when the leftmost item is deleted in a node). In any case the
-+ operation is supplied with at least the node whose left delimiting key is
-+ to be updated (that is, the "right" node).
-+
-+ @pool - from which to allocate new object;
-+ @list - where to add object;
-+ @reference - after (or before) which existing object to add
-+*/
-+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
-+ struct list_head *list,
-+ pool_ordering order,
-+ struct reiser4_pool_header *reference)
-+{
-+ struct reiser4_pool_header *result;
-+
-+ assert("nikita-972", pool != NULL);
-+
-+ result = reiser4_pool_alloc(pool);
-+ if (IS_ERR(result))
-+ return result;
-+
-+ assert("nikita-973", result != NULL);
-+
-+ switch (order) {
-+ case POOLO_BEFORE:
-+ __list_add(&result->level_linkage,
-+ reference->level_linkage.prev,
-+ &reference->level_linkage);
-+ break;
-+ case POOLO_AFTER:
-+ __list_add(&result->level_linkage,
-+ &reference->level_linkage,
-+ reference->level_linkage.next);
-+ break;
-+ case POOLO_LAST:
-+ list_add_tail(&result->level_linkage, list);
-+ break;
-+ case POOLO_FIRST:
-+ list_add(&result->level_linkage, list);
-+ break;
-+ default:
-+ wrong_return_value("nikita-927", "order");
-+ }
-+ return result;
-+}
-+
-+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/pool.h linux-2.6.30/fs/reiser4/pool.h ---- linux-2.6.30.orig/fs/reiser4/pool.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/pool.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,57 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Fast pool allocation */ -+ -+#ifndef __REISER4_POOL_H__ -+#define __REISER4_POOL_H__ -+ -+#include <linux/types.h> -+ -+struct reiser4_pool { -+ size_t obj_size; -+ int objs; -+ char *data; -+ struct list_head free; -+ struct list_head used; -+ struct list_head extra; -+}; -+ -+struct reiser4_pool_header { -+ /* object is either on free or "used" lists */ -+ struct list_head usage_linkage; -+ struct list_head level_linkage; -+ struct list_head extra_linkage; -+}; -+ -+typedef enum { -+ POOLO_BEFORE, -+ POOLO_AFTER, -+ POOLO_LAST, -+ POOLO_FIRST -+} pool_ordering; -+ -+/* pool manipulation functions */ -+ -+extern void reiser4_init_pool(struct reiser4_pool *pool, size_t obj_size, -+ int num_of_objs, char *data); -+extern void reiser4_done_pool(struct reiser4_pool *pool); -+extern void reiser4_pool_free(struct reiser4_pool *pool, -+ struct reiser4_pool_header *h); -+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool, -+ struct list_head *list, -+ pool_ordering order, -+ struct reiser4_pool_header *reference); -+ -+/* __REISER4_POOL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/readahead.c linux-2.6.30/fs/reiser4/readahead.c ---- linux-2.6.30.orig/fs/reiser4/readahead.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/readahead.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,140 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "forward.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "inode.h" -+#include "key.h" -+#include "znode.h" -+ -+#include <linux/swap.h> /* for totalram_pages */ -+ -+void reiser4_init_ra_info(ra_info_t *rai) -+{ -+ rai->key_to_stop = *reiser4_min_key(); -+} -+ -+/* global formatted node readahead parameter. It can be set by mount option -+ * -o readahead:NUM:1 */ -+static inline int ra_adjacent_only(int flags) -+{ -+ return flags & RA_ADJACENT_ONLY; -+} -+ -+/* this is used by formatted_readahead to decide whether read for right neighbor -+ * of node is to be issued. 
It returns 1 if right neighbor's first key is less -+ * or equal to readahead's stop key */ -+static int should_readahead_neighbor(znode * node, ra_info_t *info) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyle(znode_get_rd_key(node), &info->key_to_stop); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+#define LOW_MEM_PERCENTAGE (5) -+ -+static int low_on_memory(void) -+{ -+ unsigned int freepages; -+ -+ freepages = nr_free_pages(); -+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100); -+} -+ -+/* start read for @node and for a few of its right neighbors */ -+void formatted_readahead(znode * node, ra_info_t *info) -+{ -+ struct formatted_ra_params *ra_params; -+ znode *cur; -+ int i; -+ int grn_flags; -+ lock_handle next_lh; -+ -+ /* do nothing if node block number has not been assigned to node (which -+ * means it is still in cache). */ -+ if (reiser4_blocknr_is_fake(znode_get_block(node))) -+ return; -+ -+ ra_params = get_current_super_ra_params(); -+ -+ if (znode_page(node) == NULL) -+ jstartio(ZJNODE(node)); -+ -+ if (znode_get_level(node) != LEAF_LEVEL) -+ return; -+ -+ /* don't waste memory for read-ahead when low on memory */ -+ if (low_on_memory()) -+ return; -+ -+ /* We can have locked nodes on upper tree levels, in this situation lock -+ priorities do not help to resolve deadlocks, we have to use TRY_LOCK -+ here. */ -+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK); -+ -+ i = 0; -+ cur = zref(node); -+ init_lh(&next_lh); -+ while (i < ra_params->max) { -+ const reiser4_block_nr * nextblk; -+ -+ if (!should_readahead_neighbor(cur, info)) -+ break; -+ -+ if (reiser4_get_right_neighbor -+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags)) -+ break; -+ -+ nextblk = znode_get_block(next_lh.node); -+ if (reiser4_blocknr_is_fake(nextblk) || -+ (ra_adjacent_only(ra_params->flags) -+ && *nextblk != *znode_get_block(cur) + 1)) -+ break; -+ -+ zput(cur); -+ cur = zref(next_lh.node); -+ done_lh(&next_lh); -+ if (znode_page(cur) == NULL) -+ jstartio(ZJNODE(cur)); -+ else -+ /* Do not scan read-ahead window if pages already -+ * allocated (and i/o already started). */ -+ break; -+ -+ i++; -+ } -+ zput(cur); -+ done_lh(&next_lh); -+} -+ -+void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap) -+{ -+ reiser4_key *stop_key; -+ -+ assert("nikita-3542", dir != NULL); -+ assert("nikita-3543", tap != NULL); -+ -+ stop_key = &tap->ra_info.key_to_stop; -+ /* initialize readdir readahead information: include into readahead -+ * stat data of all files of the directory */ -+ set_key_locality(stop_key, get_inode_oid(dir)); -+ set_key_type(stop_key, KEY_SD_MINOR); -+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key())); -+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key())); -+ set_key_offset(stop_key, get_key_offset(reiser4_max_key())); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/readahead.h linux-2.6.30/fs/reiser4/readahead.h ---- linux-2.6.30.orig/fs/reiser4/readahead.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/readahead.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,52 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#ifndef __READAHEAD_H__ -+#define __READAHEAD_H__ -+ -+#include "key.h" -+ -+typedef enum { -+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. 
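The readahead loop above keeps issuing reads for right neighbors until the stop key is passed, the per-mount window (ra_params->max) is exhausted, or, with the adjacent-only flag set, the next node is not physically contiguous. A minimal userspace sketch of just that decision, with hypothetical names that only mirror formatted_ra_params:

#include <stdio.h>

#define RA_ADJACENT_ONLY 1

struct ra_params {
    unsigned long max; /* issue at most this many nodes */
    int flags;
};

/* decide whether one more right neighbor should be read ahead */
static int ra_should_continue(const struct ra_params *p, unsigned long issued,
                              unsigned long long cur_blk,
                              unsigned long long next_blk,
                              int next_key_le_stop)
{
    if (issued >= p->max)
        return 0; /* window exhausted */
    if (!next_key_le_stop)
        return 0; /* right neighbor starts past the stop key */
    if ((p->flags & RA_ADJACENT_ONLY) && next_blk != cur_blk + 1)
        return 0; /* not physically contiguous on disk */
    return 1;
}

int main(void)
{
    struct ra_params p = { .max = 4, .flags = RA_ADJACENT_ONLY };
    unsigned long long blk[] = { 100, 101, 102, 200 }; /* 200 breaks the run */
    unsigned long i = 0;

    while (i + 1 < 4 && ra_should_continue(&p, i, blk[i], blk[i + 1], 1)) {
        printf("readahead block %llu\n", blk[i + 1]);
        i++;
    }
    return 0;
}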
-+ Default is NO (not only adjacent) */ -+} ra_global_flags; -+ -+/* reiser4 super block has a field of this type. -+ It controls readahead during tree traversals */ -+struct formatted_ra_params { -+ unsigned long max; /* request not more than this amount of nodes. -+ Default is totalram_pages / 4 */ -+ int flags; -+}; -+ -+typedef struct { -+ reiser4_key key_to_stop; -+} ra_info_t; -+ -+void formatted_readahead(znode * , ra_info_t *); -+void reiser4_init_ra_info(ra_info_t *rai); -+ -+struct reiser4_file_ra_state { -+ loff_t start; /* Current window */ -+ loff_t size; -+ loff_t next_size; /* Next window size */ -+ loff_t ahead_start; /* Ahead window */ -+ loff_t ahead_size; -+ loff_t max_window_size; /* Maximum readahead window */ -+ loff_t slow_start; /* enlarging r/a size algorithm. */ -+}; -+ -+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap); -+ -+/* __READAHEAD_H__ */ -+#endif -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/README linux-2.6.30/fs/reiser4/README ---- linux-2.6.30.orig/fs/reiser4/README 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/README 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,128 @@ -+[LICENSING] -+ -+Reiser4 is hereby licensed under the GNU General -+Public License version 2. -+ -+Source code files that contain the phrase "licensing governed by -+reiser4/README" are "governed files" throughout this file. Governed -+files are licensed under the GPL. The portions of them owned by Hans -+Reiser, or authorized to be licensed by him, have been in the past, -+and likely will be in the future, licensed to other parties under -+other licenses. If you add your code to governed files, and don't -+want it to be owned by Hans Reiser, put your copyright label on that -+code so the poor blight and his customers can keep things straight. -+All portions of governed files not labeled otherwise are owned by Hans -+Reiser, and by adding your code to it, widely distributing it to -+others or sending us a patch, and leaving the sentence in stating that -+licensing is governed by the statement in this file, you accept this. -+It will be a kindness if you identify whether Hans Reiser is allowed -+to license code labeled as owned by you on your behalf other than -+under the GPL, because he wants to know if it is okay to do so and put -+a check in the mail to you (for non-trivial improvements) when he -+makes his next sale. He makes no guarantees as to the amount if any, -+though he feels motivated to motivate contributors, and you can surely -+discuss this with him before or after contributing. You have the -+right to decline to allow him to license your code contribution other -+than under the GPL. -+ -+Further licensing options are available for commercial and/or other -+interests directly from Hans Reiser: reiser@namesys.com. If you interpret -+the GPL as not allowing those additional licensing options, you read -+it wrongly, and Richard Stallman agrees with me, when carefully read -+you can see that those restrictions on additional terms do not apply -+to the owner of the copyright, and my interpretation of this shall -+govern for this license. -+ -+[END LICENSING] -+ -+Reiser4 is a file system based on dancing tree algorithms, and is -+described at http://www.namesys.com -+ -+mkfs.reiser4 and other utilities are on our webpage or wherever your -+Linux provider put them. 
You really want to be running the latest -+version off the website if you use fsck. -+ -+Yes, if you update your reiser4 kernel module you do have to -+recompile your kernel, most of the time. The errors you get will be -+quite cryptic if your forget to do so. -+ -+Hideous Commercial Pitch: Spread your development costs across other OS -+vendors. Select from the best in the world, not the best in your -+building, by buying from third party OS component suppliers. Leverage -+the software component development power of the internet. Be the most -+aggressive in taking advantage of the commercial possibilities of -+decentralized internet development, and add value through your branded -+integration that you sell as an operating system. Let your competitors -+be the ones to compete against the entire internet by themselves. Be -+hip, get with the new economic trend, before your competitors do. Send -+email to reiser@namesys.com -+ -+Hans Reiser was the primary architect of Reiser4, but a whole team -+chipped their ideas in. He invested everything he had into Namesys -+for 5.5 dark years of no money before Reiser3 finally started to work well -+enough to bring in money. He owns the copyright. -+ -+DARPA was the primary sponsor of Reiser4. DARPA does not endorse -+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal -+opinion, unique in its willingness to invest into things more -+theoretical than the VC community can readily understand, and more -+longterm than allows them to be sure that they will be the ones to -+extract the economic benefits from. DARPA also integrated us into a -+security community that transformed our security worldview. -+ -+Vladimir Saveliev is our lead programmer, with us from the beginning, -+and he worked long hours writing the cleanest code. This is why he is -+now the lead programmer after years of commitment to our work. He -+always made the effort to be the best he could be, and to make his -+code the best that it could be. What resulted was quite remarkable. I -+don't think that money can ever motivate someone to work the way he -+did, he is one of the most selfless men I know. -+ -+Alexander Lyamin was our sysadmin, and helped to educate us in -+security issues. Moscow State University and IMT were very generous -+in the internet access they provided us, and in lots of other little -+ways that a generous institution can be. -+ -+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the -+locking code, the block allocator, and finished the flushing code. -+His code is always crystal clean and well structured. -+ -+Nikita Danilov wrote the core of the balancing code, the core of the -+plugins code, and the directory code. He worked a steady pace of long -+hours that produced a whole lot of well abstracted code. He is our -+senior computer scientist. -+ -+Vladimir Demidov wrote the parser. Writing an in kernel parser is -+something very few persons have the skills for, and it is thanks to -+him that we can say that the parser is really not so big compared to -+various bits of our other code, and making a parser work in the kernel -+was not so complicated as everyone would imagine mainly because it was -+him doing it... -+ -+Joshua McDonald wrote the transaction manager, and the flush code. -+The flush code unexpectedly turned out be extremely hairy for reasons -+you can read about on our web page, and he did a great job on an -+extremely difficult task. -+ -+Nina Reiser handled our accounting, government relations, and much -+more. 
-+
-+Ramon Reiser developed our website.
-+
-+Beverly Palmer drew our graphics.
-+
-+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
-+and worked with Umka on developing libreiser4 and userspace plugins.
-+
-+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
-+userspace tools (reiser4progs).
-+
-+Oleg Drokin (aka Green) is the release manager who fixes everything.
-+It is so nice to have someone like that on the team. He (plus Chris
-+and Jeff) make it possible for the entire rest of the Namesys team to
-+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
-+is just amazing to watch his talent for spotting bugs in action.
-+
-+Edward Shishkin wrote the cryptcompress file plugin (which manages files
-+built of encrypted and/or compressed bodies) and other plugins related
-+to transparent encryption and compression support.
-diff -urN linux-2.6.30.orig/fs/reiser4/reiser4.h linux-2.6.30/fs/reiser4/reiser4.h
---- linux-2.6.30.orig/fs/reiser4/reiser4.h 1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.30/fs/reiser4/reiser4.h 2009-06-22 16:08:13.000000000 +0200
-@@ -0,0 +1,259 @@
-+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
-+ * reiser4/README */
-+
-+/* definitions of common constants used by reiser4 */
-+
-+#if !defined( __REISER4_H__ )
-+#define __REISER4_H__
-+
-+#include <asm/param.h> /* for HZ */
-+#include <linux/errno.h>
-+#include <linux/types.h>
-+#include <linux/fs.h>
-+#include <linux/hardirq.h>
-+#include <linux/sched.h>
-+
-+/*
-+ * reiser4 compilation options.
-+ */
-+
-+#if defined(CONFIG_REISER4_DEBUG)
-+/* turn on assertion checks */
-+#define REISER4_DEBUG (1)
-+#else
-+#define REISER4_DEBUG (0)
-+#endif
-+
-+#define REISER4_SHA256 (0)
-+
-+/*
-+ * Turn on large keys mode. In this mode (which is the default), a reiser4 key
-+ * has 4 8-byte components. In the old "small key" mode, it's 3 8-byte
-+ * components. The additional component, referred to as "ordering", is used to
-+ * order the items of which a given object is composed. As such, ordering is
-+ * placed between locality and objectid. For a directory item, ordering contains
-+ * the initial prefix of the file name this item is for. This sorts all directory
-+ * items within a given directory lexicographically (but see
-+ * fibration.[ch]). For file body and stat-data, ordering contains the initial
-+ * prefix of the name the file was initially created with. In the common case
-+ * (files with a single name) this allows ordering file bodies and stat-datas in
-+ * the same order as their respective directory entries, thus speeding up
-+ * readdir.
-+ *
-+ * Note that the kernel can only mount a file system with the same key size as
-+ * the one it is compiled for, so flipping this option may render your data
-+ * inaccessible.
-+ */
-+#define REISER4_LARGE_KEY (1)
-+/*#define REISER4_LARGE_KEY (0)*/
-+
-+/*#define GUESS_EXISTS 1*/
-+
-+/*
-+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
-+ * option
-+ */
-+
-+extern const char *REISER4_SUPER_MAGIC_STRING;
-+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
-+ * beginning of device */
-+
-+/* here go tunable parameters that are not worth special entry in kernel
-+ configuration */
-+
-+/* default number of slots in coord-by-key caches */
-+#define CBK_CACHE_SLOTS (16)
-+/* how many elementary tree operations to carry on the next level */
-+#define CARRIES_POOL_SIZE (5)
-+/* size of pool of preallocated nodes for carry process.
*/
-+#define NODES_LOCKED_POOL_SIZE (5)
-+
-+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
-+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
-+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
-+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
-+
-+/* we are supporting reservation of disk space on uid basis */
-+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
-+/* we are supporting reservation of disk space for groups */
-+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
-+/* we are supporting reservation of disk space for root */
-+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
-+/* we use rapid flush mode, see flush.c for comments. */
-+#define REISER4_USE_RAPID_FLUSH (1)
-+
-+/*
-+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
-+ */
-+#define REISER4_USE_ENTD (1)
-+
-+/* key allocation is Plan-A */
-+#define REISER4_PLANA_KEY_ALLOCATION (1)
-+/* key allocation follows good old 3.x scheme */
-+#define REISER4_3_5_KEY_ALLOCATION (0)
-+
-+/* size of hash-table for znodes */
-+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
-+
-+/* number of buckets in lnode hash-table */
-+#define LNODE_HTABLE_BUCKETS (1024)
-+
-+/* some ridiculously high maximal limit on height of znode tree. This
-+ is used in declaration of various per level arrays and
-+ to allocate statistics gathering array for per-level stats. */
-+#define REISER4_MAX_ZTREE_HEIGHT (8)
-+
-+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
-+
-+/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements, then
-+ sequential search is on average faster than binary. This is because
-+ of better optimization and because sequential search is more CPU
-+ cache friendly. This number (25) was found by experiments on dual AMD
-+ Athlon(tm), 1400MHz.
-+
-+ NOTE: testing in kernel has shown that binary search is more effective than
-+ implied by results of the user level benchmarking. Probably because in the
-+ node keys are separated by other data. So the value was adjusted after a few
-+ tests. More thorough tuning is needed.
-+*/
-+#define REISER4_SEQ_SEARCH_BREAK (3)
-+
-+/* don't allow tree to be lower than this */
-+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
-+
-+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
-+ * available memory. */
-+/* Default value of maximal atom size. Can be overwritten by
-+ tmgr.atom_max_size mount option. By default infinity. */
-+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
-+
-+/* Default value of maximal atom age (in jiffies). After reaching this age
-+ atom will be forced to commit, either synchronously or asynchronously. Can
-+ be overwritten by tmgr.atom_max_age mount option. */
-+#define REISER4_ATOM_MAX_AGE (600 * HZ)
-+
-+/* sleeping period for ktxnmgrd */
-+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
-+
-+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
-+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
-+
-+/* start complaining after that many restarts in coord_by_key().
-+
-+ This either means incredibly heavy contention for this part of a tree, or
-+ some corruption or bug.
-+*/
-+#define REISER4_CBK_ITERATIONS_LIMIT (100)
-+
-+/* return -EIO after that many iterations in coord_by_key().
-+
-+ I have witnessed more than 800 iterations (in 30 thread test) before cbk
-+ finished. --nikita
-+*/
-+#define REISER4_MAX_CBK_ITERATIONS 500000
-+
-+/* put a per-inode limit on maximal number of directory entries with identical
-+ keys in hashed directory.
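The REISER4_SEQ_SEARCH_BREAK comment above describes a classic crossover: below a handful of elements, a linear scan beats binary search because it is branch-predictor and cache friendly. A minimal userspace sketch of such a hybrid lookup (the cutoff value here is illustrative, not the tuned kernel constant):

#include <stdio.h>
#include <stddef.h>

#define SEQ_SEARCH_BREAK 3 /* below this, scan linearly */

/* return index of key in sorted a[0..n-1], or -1 if absent */
static int lookup(const int *a, size_t n, int key)
{
    size_t lo = 0, hi = n; /* half-open interval [lo, hi) */

    if (n < SEQ_SEARCH_BREAK) {
        /* tiny array: sequential scan is cheaper than branching */
        for (size_t i = 0; i < n; i++)
            if (a[i] == key)
                return (int)i;
        return -1;
    }
    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;

        if (a[mid] < key)
            lo = mid + 1;
        else
            hi = mid;
    }
    return (lo < n && a[lo] == key) ? (int)lo : -1;
}

int main(void)
{
    int keys[] = { 3, 9, 14, 27, 31 };

    printf("%d %d\n", lookup(keys, 5, 14), lookup(keys, 5, 15)); /* 2 -1 */
    return 0;
}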
-+
-+ Disable this until inheritance interfaces stabilize: we need some way to
-+ set per directory limit.
-+*/
-+#define REISER4_USE_COLLISION_LIMIT (0)
-+
-+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level
-+ blocks it will force them to be relocated. */
-+#define FLUSH_RELOCATE_THRESHOLD 64
-+/* If flush can find a block allocation closer than at most
-+ FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that position.
-+ */
-+#define FLUSH_RELOCATE_DISTANCE 64
-+
-+/* If we have written this many or more blocks before encountering a busy jnode
-+ in flush list - abort flushing, hoping that next time we get called
-+ this jnode will be clean already, and we will save some seeks. */
-+#define FLUSH_WRITTEN_THRESHOLD 50
-+
-+/* The maximum number of nodes to scan left on a level during flush. */
-+#define FLUSH_SCAN_MAXNODES 10000
-+
-+/* per-atom limit of flushers */
-+#define ATOM_MAX_FLUSHERS (1)
-+
-+/* default tracing buffer size */
-+#define REISER4_TRACE_BUF_SIZE (1 << 15)
-+
-+/* what size units of IO we would like cp, etc., to use, in writing to
-+ reiser4. In bytes.
-+
-+ Can be overwritten by optimal_io_size mount option.
-+*/
-+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
-+
-+/* see comments in inode.c:oid_to_uino() */
-+#define REISER4_UINO_SHIFT (1 << 30)
-+
-+/* Mark function argument as unused to avoid compiler warnings. */
-+#define UNUSED_ARG __attribute__((unused))
-+
-+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
-+#define NONNULL __attribute__((nonnull))
-+#else
-+#define NONNULL
-+#endif
-+
-+/* master super block offset in bytes.*/
-+#define REISER4_MASTER_OFFSET 65536
-+
-+/* size of VFS block */
-+#define VFS_BLKSIZE 512
-+/* number of bits in size of VFS block (512==2^9) */
-+#define VFS_BLKSIZE_BITS 9
-+
-+#define REISER4_I reiser4_inode_data
-+
-+/* implication */
-+#define ergo(antecedent, consequent) (!(antecedent) || (consequent))
-+/* logical equivalence */
-+#define equi(p1, p2) (ergo((p1), (p2)) && ergo((p2), (p1)))
-+
-+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
-+
-+#define NOT_YET (0)
-+
-+/** Reiser4 specific error codes **/
-+
-+#define REISER4_ERROR_CODE_BASE 10000
-+
-+/* Neighbor is not available (side neighbor or parent) */
-+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
-+
-+/* Node was not found in cache */
-+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
-+
-+/* node does not have enough free space for completion of balancing operation */
-+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
-+
-+/* repeat operation */
-+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
-+
-+/* deadlock happens */
-+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
-+
-+/* operation cannot be performed, because it would block and non-blocking mode
-+ * was requested. */
-+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
-+
-+/* wait some event (depends on context), then repeat */
-+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
-+
-+#endif /* __REISER4_H__ */
-+
-+/* Make Linus happy.
-+ Local variables:
-+ c-indentation-style: "K&R"
-+ mode-name: "LC"
-+ c-basic-offset: 8
-+ tab-width: 8
-+ fill-column: 120
-+ End:
-+*/
-diff -urN linux-2.6.30.orig/fs/reiser4/safe_link.c linux-2.6.30/fs/reiser4/safe_link.c
---- linux-2.6.30.orig/fs/reiser4/safe_link.c 1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.30/fs/reiser4/safe_link.c 2009-06-22 16:08:13.000000000 +0200
-@@ -0,0 +1,354 @@
-+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
-+ * reiser4/README */
-+
-+/* Safe-links.
*/
-+
-+/*
-+ * Safe-links are used to maintain file system consistency during operations
-+ * that spawn multiple transactions. For example:
-+ *
-+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is, files
-+ * without user-visible names in the file system, but still opened by some
-+ * active process. What happens here is that unlink proper (i.e., removal
-+ * of the last file name) and file deletion (truncate of file body to zero
-+ * and deletion of stat-data, which happens when the last file descriptor is
-+ * closed), may belong to different transactions T1 and T2. If a crash
-+ * happens after T1 commit, but before T2 commit, the on-disk file system has
-+ * a file without a name, that is, a disk space leak.
-+ *
-+ * 2. Truncate. Truncate of a large file may spawn multiple transactions. If
-+ * the system crashes while truncate is in progress, the file is left partially
-+ * truncated, which violates the "atomicity guarantees" of reiser4, viz. that
-+ * every system call is atomic.
-+ *
-+ * Safe-links address both above cases. Basically, a safe-link is a way to post
-+ * some operation to be executed during commit of some other transaction than
-+ * the current one. (Another way to look at the safe-link is to interpret it as
-+ * logical logging.)
-+ *
-+ * Specifically, at the beginning of unlink a safe-link is inserted in the
-+ * tree. This safe-link is normally removed by the file deletion code (during
-+ * transaction T2 in the above terms). Truncate also inserts a safe-link that is
-+ * normally removed when the truncate operation is finished.
-+ *
-+ * This means that in the case of "clean umount" there are no safe-links in
-+ * the tree. If safe-links are observed during mount, it means that (a) the
-+ * system was terminated abnormally, and (b) the safe-links correspond to the
-+ * "pending" (i.e., not finished) operations that were in progress during system
-+ * termination. Each safe-link records enough information to complete the
-+ * corresponding operation, and mount simply "replays" them (hence the
-+ * analogy with logical logging).
-+ *
-+ * Safe-links are implemented as blackbox items (see
-+ * plugin/item/blackbox.[ch]).
-+ *
-+ * For reference: ext3 also has a similar mechanism, called "an orphan
-+ * list" there.
-+ */
-+
-+#include "safe_link.h"
-+#include "debug.h"
-+#include "inode.h"
-+
-+#include "plugin/item/blackbox.h"
-+
-+#include <linux/fs.h>
-+
-+/*
-+ * On-disk format of safe-link.
-+ */
-+typedef struct safelink {
-+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
-+ * for */
-+ d64 size; /* size to which file should be truncated */
-+} safelink_t;
-+
-+/*
-+ * locality where safe-link items are stored. Next to the objectid of root
-+ * directory.
-+ */
-+static oid_t safe_link_locality(reiser4_tree * tree)
-+{
-+ return get_key_objectid(get_super_private(tree->super)->df_plug->
-+ root_dir_key(tree->super)) + 1;
-+}
-+
-+/*
-+ Construct a key for the safe-link. Key has the following format:
-+
-+| 60 | 4 | 64 | 4 | 60 | 64 |
-++---------------+---+------------------+---+---------------+------------------+
-+| locality | 0 | 0 | 0 | objectid | link type |
-++---------------+---+------------------+---+---------------+------------------+
-+| | | | |
-+| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
-+
-+ This is in large keys format. In the small keys format the second 8-byte chunk
-+ is left out. Locality is a constant returned by safe_link_locality(). objectid
-+ is the oid of the file on which the operation protected by this safe-link is
-+ performed.
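The crash-recovery flow described above (record the pending operation before starting it, clear the record when its final transaction commits, replay any survivors at mount) is the same pattern as ext3's orphan list. A minimal userspace sketch of that lifecycle, with hypothetical names that only mirror the reiser4 structures:

#include <stdio.h>

enum safe_op { OP_UNLINK, OP_TRUNCATE };

/* stand-in for the on-disk safelink record */
struct safelink_rec {
    unsigned long long oid;  /* object the pending operation refers to */
    enum safe_op op;
    unsigned long long size; /* target length, truncate only */
    int present;             /* record still in the "tree" */
};

/* "mount": finish every operation that was pending at crash time */
static void replay(struct safelink_rec *tab, int n)
{
    for (int i = 0; i < n; i++) {
        if (!tab[i].present)
            continue;
        if (tab[i].op == OP_TRUNCATE)
            printf("replay: truncate oid %llu to %llu bytes\n",
                   tab[i].oid, tab[i].size);
        else
            printf("replay: finish deletion of oid %llu\n", tab[i].oid);
        tab[i].present = 0; /* analogous to safe_link_del() on success */
    }
}

int main(void)
{
    struct safelink_rec tab[] = {
        { 42, OP_TRUNCATE, 4096, 1 }, /* crashed mid-truncate */
        { 43, OP_UNLINK, 0, 1 },      /* open-but-unlinked file */
    };

    replay(tab, 2);
    return 0;
}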
link-type is used to distinguish safe-links for different
-+ operations.
-+
-+ */
-+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
-+ reiser4_safe_link_t link, reiser4_key * key)
-+{
-+ reiser4_key_init(key);
-+ set_key_locality(key, safe_link_locality(tree));
-+ set_key_objectid(key, oid);
-+ set_key_offset(key, link);
-+ return key;
-+}
-+
-+/*
-+ * how much disk space is necessary to insert and remove (in the
-+ * error-handling path) safe-link.
-+ */
-+static __u64 safe_link_tograb(reiser4_tree * tree)
-+{
-+ return
-+ /* insert safe link */
-+ estimate_one_insert_item(tree) +
-+ /* remove safe link */
-+ estimate_one_item_removal(tree) +
-+ /* drill to the leaf level during insertion */
-+ 1 + estimate_one_insert_item(tree) +
-+ /*
-+ * possible update of existing safe-link. Actually, if
-+ * safe-link existed already (we failed to remove it), then no
-+ * insertion is necessary, so this term is already "covered",
-+ * but for simplicity let's leave it.
-+ */
-+ 1;
-+}
-+
-+/*
-+ * grab enough disk space to insert and remove (in the error-handling path)
-+ * safe-link.
-+ */
-+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
-+{
-+ int result;
-+
-+ grab_space_enable();
-+ /* The sbinfo->delete_mutex can be taken here.
-+ * safe_link_release() should be called before leaving reiser4
-+ * context. */
-+ result =
-+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
-+ grab_space_enable();
-+ return result;
-+}
-+
-+/*
-+ * release unused disk space reserved by safe_link_grab().
-+ */
-+void safe_link_release(reiser4_tree * tree)
-+{
-+ reiser4_release_reserved(tree->super);
-+}
-+
-+/*
-+ * insert into tree safe-link for operation @link on inode @inode.
-+ */
-+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
-+{
-+ reiser4_key key;
-+ safelink_t sl;
-+ int length;
-+ int result;
-+ reiser4_tree *tree;
-+
-+ build_sd_key(inode, &sl.sdkey);
-+ length = sizeof sl.sdkey;
-+
-+ if (link == SAFE_TRUNCATE) {
-+ /*
-+ * for truncate we have to store the final file length also,
-+ * so expand the item.
-+ */
-+ length += sizeof(sl.size);
-+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
-+ }
-+ tree = reiser4_tree_by_inode(inode);
-+ build_link_key(tree, get_inode_oid(inode), link, &key);
-+
-+ result = store_black_box(tree, &key, &sl, length);
-+ if (result == -EEXIST)
-+ result = update_black_box(tree, &key, &sl, length);
-+ return result;
-+}
-+
-+/*
-+ * remove safe-link corresponding to the operation @link on inode @inode from
-+ * the tree.
-+ */
-+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
-+{
-+ reiser4_key key;
-+
-+ return kill_black_box(tree, build_link_key(tree, oid, link, &key));
-+}
-+
-+/*
-+ * in-memory structure to keep information extracted from safe-link. This is
-+ * used to iterate over all safe-links.
-+ */
-+struct safe_link_context {
-+ reiser4_tree *tree; /* internal tree */
-+ reiser4_key key; /* safe-link key */
-+ reiser4_key sdkey; /* key of object stat-data */
-+ reiser4_safe_link_t link; /* safe-link type */
-+ oid_t oid; /* object oid */
-+ __u64 size; /* final size for truncate */
-+};
-+
-+/*
-+ * start iterating over all safe-links.
*/
-+static void safe_link_iter_begin(reiser4_tree * tree,
-+ struct safe_link_context *ctx)
-+{
-+ ctx->tree = tree;
-+ reiser4_key_init(&ctx->key);
-+ set_key_locality(&ctx->key, safe_link_locality(tree));
-+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
-+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
-+}
-+
-+/*
-+ * return next safe-link.
-+ */
-+static int safe_link_iter_next(struct safe_link_context *ctx)
-+{
-+ int result;
-+ safelink_t sl;
-+
-+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
-+ if (result == 0) {
-+ ctx->oid = get_key_objectid(&ctx->key);
-+ ctx->link = get_key_offset(&ctx->key);
-+ ctx->sdkey = sl.sdkey;
-+ if (ctx->link == SAFE_TRUNCATE)
-+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
-+ }
-+ return result;
-+}
-+
-+/*
-+ * check whether there are any more safe-links left in the tree.
-+ */
-+static int safe_link_iter_finished(struct safe_link_context *ctx)
-+{
-+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
-+}
-+
-+/*
-+ * finish safe-link iteration.
-+ */
-+static void safe_link_iter_end(struct safe_link_context *ctx)
-+{
-+ /* nothing special */
-+}
-+
-+/*
-+ * process single safe-link.
-+ */
-+static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
-+ reiser4_key * sdkey, oid_t oid, __u64 size)
-+{
-+ struct inode *inode;
-+ int result;
-+
-+ /*
-+ * obtain object inode by reiser4_iget(), then call object plugin
-+ * ->safelink() method to do actual work, then delete safe-link on
-+ * success.
-+ */
-+ inode = reiser4_iget(super, sdkey, 1);
-+ if (!IS_ERR(inode)) {
-+ file_plugin *fplug;
-+
-+ fplug = inode_file_plugin(inode);
-+ assert("nikita-3428", fplug != NULL);
-+ assert("", oid == get_inode_oid(inode));
-+ if (fplug->safelink != NULL) {
-+ /* reiser4_txn_restart_current is not necessary because
-+ * mounting is single-threaded. However, without it
-+ * deadlock detection code will complain (see
-+ * nikita-3361). */
-+ reiser4_txn_restart_current();
-+ result = fplug->safelink(inode, link, size);
-+ } else {
-+ warning("nikita-3430",
-+ "Cannot handle safelink for %lli",
-+ (unsigned long long)oid);
-+ reiser4_print_key("key", sdkey);
-+ result = 0;
-+ }
-+ if (result != 0) {
-+ warning("nikita-3431",
-+ "Error processing safelink for %lli: %i",
-+ (unsigned long long)oid, result);
-+ }
-+ reiser4_iget_complete(inode);
-+ iput(inode);
-+ if (result == 0) {
-+ result = safe_link_grab(reiser4_get_tree(super),
-+ BA_CAN_COMMIT);
-+ if (result == 0)
-+ result =
-+ safe_link_del(reiser4_get_tree(super), oid,
-+ link);
-+ safe_link_release(reiser4_get_tree(super));
-+ /*
-+ * restart transaction: if there was a large number of
-+ * safe-links, their processing may fail to fit into a
-+ * single transaction.
-+ */
-+ if (result == 0)
-+ reiser4_txn_restart_current();
-+ }
-+ } else
-+ result = PTR_ERR(inode);
-+ return result;
-+}
-+
-+/*
-+ * iterate over all safe-links in the file system, processing them one by one.
*/
-+int process_safelinks(struct super_block *super)
-+{
-+ struct safe_link_context ctx;
-+ int result;
-+
-+ if (rofs_super(super))
-+ /* do nothing on the read-only file system */
-+ return 0;
-+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
-+ result = 0;
-+ do {
-+ result = safe_link_iter_next(&ctx);
-+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
-+ result = 0;
-+ break;
-+ }
-+ if (result == 0)
-+ result = process_safelink(super, ctx.link,
-+ &ctx.sdkey, ctx.oid,
-+ ctx.size);
-+ } while (result == 0);
-+ safe_link_iter_end(&ctx);
-+ return result;
-+}
-+
-+/* Make Linus happy.
-+ Local variables:
-+ c-indentation-style: "K&R"
-+ mode-name: "LC"
-+ c-basic-offset: 8
-+ tab-width: 8
-+ fill-column: 120
-+ scroll-step: 1
-+ End:
-+*/
-diff -urN linux-2.6.30.orig/fs/reiser4/safe_link.h linux-2.6.30/fs/reiser4/safe_link.h
---- linux-2.6.30.orig/fs/reiser4/safe_link.h 1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.30/fs/reiser4/safe_link.h 2009-06-22 16:08:13.000000000 +0200
-@@ -0,0 +1,29 @@
-+/* Copyright 2003 by Hans Reiser, licensing governed by
-+ * reiser4/README */
-+
-+/* Safe-links. See safe_link.c for details. */
-+
-+#if !defined(__FS_SAFE_LINK_H__)
-+#define __FS_SAFE_LINK_H__
-+
-+#include "tree.h"
-+
-+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
-+void safe_link_release(reiser4_tree * tree);
-+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
-+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
-+
-+int process_safelinks(struct super_block *super);
-+
-+/* __FS_SAFE_LINK_H__ */
-+#endif
-+
-+/* Make Linus happy.
-+ Local variables:
-+ c-indentation-style: "K&R"
-+ mode-name: "LC"
-+ c-basic-offset: 8
-+ tab-width: 8
-+ fill-column: 120
-+ End:
-+*/
-diff -urN linux-2.6.30.orig/fs/reiser4/seal.c linux-2.6.30/fs/reiser4/seal.c
---- linux-2.6.30.orig/fs/reiser4/seal.c 1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.30/fs/reiser4/seal.c 2009-06-22 16:08:13.000000000 +0200
-@@ -0,0 +1,218 @@
-+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
-+/* Seals implementation. */
-+/* Seals are "weak" tree pointers. They are analogous to tree coords in
-+ allowing one to bypass tree traversal. But normal usage of coords implies that
-+ the node pointed to by a coord is locked, whereas seals don't keep a lock (or
-+ even a reference) to the znode. Instead, each znode contains a version number,
-+ increased on each znode modification. This version number is copied into a
-+ seal when the seal is created. Later, one can "validate" a seal by calling
-+ reiser4_seal_validate(). If the znode is in cache and its version number is
-+ still the same, the seal is "pristine" and the coord associated with it can be
-+ re-used immediately.
-+
-+ If, on the other hand, the znode is out of cache, or it is obviously a
-+ different one from the znode the seal was initially attached to (for example,
-+ it is on a different level, or is being removed from the tree), the seal is
-+ irreparably invalid ("burned") and tree traversal has to be repeated.
-+
-+ Otherwise, there is some hope that while the znode was modified (and the seal
-+ was "broken" as a result), the key attached to the seal is still in the node.
-+ This is checked by first comparing this key with the delimiting keys of the
-+ node and, if the key is ok, doing an intra-node lookup.
-+
-+ Znode version is maintained in the following way:
-+
-+ there is a reiser4_tree.znode_epoch counter.
Whenever a new znode is created,
-+ znode_epoch is incremented and its new value is stored in the ->version field
-+ of the new znode. Whenever a znode is dirtied (which means it was probably
-+ modified), znode_epoch is also incremented and its new value is stored in
-+ znode->version. This is done because just incrementing znode->version
-+ on each update is not enough: it may so happen that a znode gets deleted, a
-+ new znode is allocated for the same disk block and gets the same version
-+ counter, tricking the seal code into a false positive.
-+*/
-+
-+#include "forward.h"
-+#include "debug.h"
-+#include "key.h"
-+#include "coord.h"
-+#include "seal.h"
-+#include "plugin/item/item.h"
-+#include "plugin/node/node.h"
-+#include "jnode.h"
-+#include "znode.h"
-+#include "super.h"
-+
-+static znode *seal_node(const seal_t *seal);
-+static int seal_matches(const seal_t *seal, znode * node);
-+
-+/* initialise seal. This can be called several times on the same seal. @coord
-+ and @key can be NULL. */
-+void reiser4_seal_init(seal_t *seal /* seal to initialise */ ,
-+ const coord_t *coord /* coord @seal will be
-+ * attached to */ ,
-+ const reiser4_key * key UNUSED_ARG /* key @seal will be
-+ * attached to */ )
-+{
-+ assert("nikita-1886", seal != NULL);
-+ memset(seal, 0, sizeof *seal);
-+ if (coord != NULL) {
-+ znode *node;
-+
-+ node = coord->node;
-+ assert("nikita-1987", node != NULL);
-+ spin_lock_znode(node);
-+ seal->version = node->version;
-+ assert("nikita-1988", seal->version != 0);
-+ seal->block = *znode_get_block(node);
-+#if REISER4_DEBUG
-+ seal->coord1 = *coord;
-+ if (key != NULL)
-+ seal->key = *key;
-+#endif
-+ spin_unlock_znode(node);
-+ }
-+}
-+
-+/* finish with seal */
-+void reiser4_seal_done(seal_t *seal/* seal to clear */)
-+{
-+ assert("nikita-1887", seal != NULL);
-+ seal->version = 0;
-+}
-+
-+/* true if seal was initialised */
-+int reiser4_seal_is_set(const seal_t *seal/* seal to query */)
-+{
-+ assert("nikita-1890", seal != NULL);
-+ return seal->version != 0;
-+}
-+
-+#if REISER4_DEBUG
-+/* helper function for reiser4_seal_validate(). It checks that the item at
-+ * @coord has the expected key. This is to detect cases where a node was
-+ * modified but wasn't marked dirty. */
-+static inline int check_seal_match(const coord_t *coord /* coord to check */ ,
-+ const reiser4_key * k/* expected key */)
-+{
-+ reiser4_key ukey;
-+
-+ return (coord->between != AT_UNIT) ||
-+ /* FIXME-VS: we only can compare keys for items whose units
-+ represent exactly one key */
-+ ((coord_is_existing_unit(coord))
-+ && (item_is_extent(coord)
-+ || keyeq(k, unit_key_by_coord(coord, &ukey))))
-+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
-+ && keyge(k, unit_key_by_coord(coord, &ukey)));
-+}
-+#endif
-+
-+/* this is used by reiser4_seal_validate. It accepts the return value of
-+ * longterm_lock_znode and returns 1 if it can be interpreted as seal
-+ * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
-+ * reiser4_seal_validate returns -E_REPEAT and the caller will call tree search.
-+ * We cannot do this in longterm_lock_znode(), because sometimes we want to
-+ * distinguish between -EINVAL and -E_REPEAT. */
-+static int should_repeat(int return_code)
-+{
-+ return return_code == -EINVAL;
-+}
-+
-+/* (re-)validate seal.
-+
-+ Checks whether the seal is pristine, and tries to revalidate it if possible.
-+
-+ If the seal was burned, or broken irreparably, return -E_REPEAT.
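The version scheme above is a generation-counter ("weak reference") pattern; the tree-wide epoch, rather than a per-znode counter, is what defeats the ABA case where a recycled znode would otherwise present a stale-but-matching version. A minimal userspace sketch of the idea, with hypothetical names:

#include <stdio.h>

static unsigned long long epoch = 1; /* cf. reiser4_tree.znode_epoch */

struct obj {
    unsigned long long version; /* epoch value at last change */
};

struct weak_seal {
    const struct obj *target;
    unsigned long long version; /* epoch value captured at creation */
};

/* every creation or modification draws a fresh, never-reused value */
static void obj_dirty(struct obj *o)
{
    o->version = ++epoch;
}

static void seal_make(struct weak_seal *s, const struct obj *o)
{
    s->target = o;
    s->version = o->version;
}

/* 1: pristine, the cached position may be reused; 0: must re-traverse */
static int seal_check(const struct weak_seal *s)
{
    return s->version == s->target->version;
}

int main(void)
{
    struct obj node;
    struct weak_seal s;

    obj_dirty(&node); /* "create": node gets a fresh epoch value */
    seal_make(&s, &node);
    printf("pristine: %d\n", seal_check(&s));       /* 1 */
    obj_dirty(&node); /* node modified: seal is broken */
    printf("after dirty: %d\n", seal_check(&s));    /* 0 */
    return 0;
}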
-+
-+ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key we
-+ are looking for is in the range of keys covered by the sealed node, but the
-+ item wasn't found by the node ->lookup() method. An alternative is to return
-+ -ENOENT in this case, but this would complicate the callers' logic.
-+
-+*/
-+int reiser4_seal_validate(seal_t *seal /* seal to validate */,
-+ coord_t *coord /* coord to validate against */,
-+ const reiser4_key * key /* key to validate against */,
-+ lock_handle * lh /* resulting lock handle */,
-+ znode_lock_mode mode /* lock node */,
-+ znode_lock_request request/* locking priority */)
-+{
-+ znode *node;
-+ int result;
-+
-+ assert("nikita-1889", seal != NULL);
-+ assert("nikita-1881", reiser4_seal_is_set(seal));
-+ assert("nikita-1882", key != NULL);
-+ assert("nikita-1883", coord != NULL);
-+ assert("nikita-1884", lh != NULL);
-+ assert("nikita-1885", keyeq(&seal->key, key));
-+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
-+
-+ /* obtain znode by block number */
-+ node = seal_node(seal);
-+ if (node != NULL) {
-+ /* znode was in cache, lock it */
-+ result = longterm_lock_znode(lh, node, mode, request);
-+ zput(node);
-+ if (result == 0) {
-+ if (seal_matches(seal, node)) {
-+ /* if seal version and znode version
-+ coincide */
-+ ON_DEBUG(coord_update_v(coord));
-+ assert("nikita-1990",
-+ node == seal->coord1.node);
-+ assert("nikita-1898",
-+ WITH_DATA_RET(coord->node, 1,
-+ check_seal_match(coord,
-+ key)));
-+ } else
-+ result = RETERR(-E_REPEAT);
-+ }
-+ if (result != 0) {
-+ if (should_repeat(result))
-+ result = RETERR(-E_REPEAT);
-+ /* unlock node on failure */
-+ done_lh(lh);
-+ }
-+ } else {
-+ /* znode wasn't in cache */
-+ result = RETERR(-E_REPEAT);
-+ }
-+ return result;
-+}
-+
-+/* helper functions */
-+
-+/* obtain reference to znode seal points to, if in cache */
-+static znode *seal_node(const seal_t *seal/* seal to query */)
-+{
-+ assert("nikita-1891", seal != NULL);
-+ return zlook(current_tree, &seal->block);
-+}
-+
-+/* true if @seal version and @node version coincide */
-+static int seal_matches(const seal_t *seal /* seal to check */ ,
-+ znode * node/* node to check */)
-+{
-+ int result;
-+
-+ assert("nikita-1991", seal != NULL);
-+ assert("nikita-1993", node != NULL);
-+
-+ spin_lock_znode(node);
-+ result = (seal->version == node->version);
-+ spin_unlock_znode(node);
-+ return result;
-+}
-+
-+/* Make Linus happy.
-+ Local variables:
-+ c-indentation-style: "K&R"
-+ mode-name: "LC"
-+ c-basic-offset: 8
-+ tab-width: 8
-+ fill-column: 120
-+ scroll-step: 1
-+ End:
-+*/
-diff -urN linux-2.6.30.orig/fs/reiser4/seal.h linux-2.6.30/fs/reiser4/seal.h
---- linux-2.6.30.orig/fs/reiser4/seal.h 1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.30/fs/reiser4/seal.h 2009-06-22 16:08:13.000000000 +0200
-@@ -0,0 +1,49 @@
-+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
-+
-+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
-+
-+#ifndef __SEAL_H__
-+#define __SEAL_H__
-+
-+#include "forward.h"
-+#include "debug.h"
-+#include "dformat.h"
-+#include "key.h"
-+#include "coord.h"
-+
-+/* for __u?? types */
-+/*#include <linux/types.h>*/
-+
-+/* seal. See comment at the top of seal.c */
-+typedef struct seal_s {
-+ /* version of znode recorded at the time of seal creation */
-+ __u64 version;
-+ /* block number of znode attached to this seal */
-+ reiser4_block_nr block;
-+#if REISER4_DEBUG
-+ /* coord this seal is attached to. For debugging.
*/ -+ coord_t coord1; -+ /* key this seal is attached to. For debugging. */ -+ reiser4_key key; -+#endif -+} seal_t; -+ -+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *); -+extern void reiser4_seal_done(seal_t *); -+extern int reiser4_seal_is_set(const seal_t *); -+extern int reiser4_seal_validate(seal_t *, coord_t *, -+ const reiser4_key *, lock_handle * , -+ znode_lock_mode mode, znode_lock_request request); -+ -+/* __SEAL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/search.c linux-2.6.30/fs/reiser4/search.c ---- linux-2.6.30.orig/fs/reiser4/search.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/search.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1612 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "reiser4.h" -+#include "super.h" -+#include "inode.h" -+ -+#include <linux/slab.h> -+ -+static const char *bias_name(lookup_bias bias); -+ -+/* tree searching algorithm, intranode searching algorithms are in -+ plugin/node/ */ -+ -+/* tree lookup cache -+ * -+ * The coord by key cache consists of small list of recently accessed nodes -+ * maintained according to the LRU discipline. Before doing real top-to-down -+ * tree traversal this cache is scanned for nodes that can contain key -+ * requested. -+ * -+ * The efficiency of coord cache depends heavily on locality of reference for -+ * tree accesses. Our user level simulations show reasonably good hit ratios -+ * for coord cache under most loads so far. 
-+ */ -+ -+/* Initialise coord cache slot */ -+static void cbk_cache_init_slot(cbk_cache_slot *slot) -+{ -+ assert("nikita-345", slot != NULL); -+ -+ INIT_LIST_HEAD(&slot->lru); -+ slot->node = NULL; -+} -+ -+/* Initialize coord cache */ -+int cbk_cache_init(cbk_cache * cache/* cache to init */) -+{ -+ int i; -+ -+ assert("nikita-346", cache != NULL); -+ -+ cache->slot = -+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, -+ reiser4_ctx_gfp_mask_get()); -+ if (cache->slot == NULL) -+ return RETERR(-ENOMEM); -+ -+ INIT_LIST_HEAD(&cache->lru); -+ for (i = 0; i < cache->nr_slots; ++i) { -+ cbk_cache_init_slot(cache->slot + i); -+ list_add_tail(&((cache->slot + i)->lru), &cache->lru); -+ } -+ rwlock_init(&cache->guard); -+ return 0; -+} -+ -+/* free cbk cache data */ -+void cbk_cache_done(cbk_cache * cache/* cache to release */) -+{ -+ assert("nikita-2493", cache != NULL); -+ if (cache->slot != NULL) { -+ kfree(cache->slot); -+ cache->slot = NULL; -+ } -+} -+ -+/* macro to iterate over all cbk cache slots */ -+#define for_all_slots(cache, slot) \ -+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \ -+ &(cache)->lru != &(slot)->lru; \ -+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru)) -+ -+#if REISER4_DEBUG -+/* this function assures that [cbk-cache-invariant] invariant holds */ -+static int cbk_cache_invariant(const cbk_cache * cache) -+{ -+ cbk_cache_slot *slot; -+ int result; -+ int unused; -+ -+ if (cache->nr_slots == 0) -+ return 1; -+ -+ assert("nikita-2469", cache != NULL); -+ unused = 0; -+ result = 1; -+ read_lock(&((cbk_cache *)cache)->guard); -+ for_all_slots(cache, slot) { -+ /* in LRU first go all `used' slots followed by `unused' */ -+ if (unused && (slot->node != NULL)) -+ result = 0; -+ if (slot->node == NULL) -+ unused = 1; -+ else { -+ cbk_cache_slot *scan; -+ -+ /* all cached nodes are different */ -+ scan = slot; -+ while (result) { -+ scan = list_entry(scan->lru.next, -+ cbk_cache_slot, lru); -+ if (&cache->lru == &scan->lru) -+ break; -+ if (slot->node == scan->node) -+ result = 0; -+ } -+ } -+ if (!result) -+ break; -+ } -+ read_unlock(&((cbk_cache *)cache)->guard); -+ return result; -+} -+ -+#endif -+ -+/* Remove references, if any, to @node from coord cache */ -+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ , -+ reiser4_tree * tree/* tree to remove node from */) -+{ -+ cbk_cache_slot *slot; -+ cbk_cache *cache; -+ int i; -+ -+ assert("nikita-350", node != NULL); -+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree)); -+ -+ cache = &tree->cbk_cache; -+ assert("nikita-2470", cbk_cache_invariant(cache)); -+ -+ write_lock(&(cache->guard)); -+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { -+ if (slot->node == node) { -+ list_move_tail(&slot->lru, &cache->lru); -+ slot->node = NULL; -+ break; -+ } -+ } -+ write_unlock(&(cache->guard)); -+ assert("nikita-2471", cbk_cache_invariant(cache)); -+} -+ -+/* add to the cbk-cache in the "tree" information about "node". This -+ can actually be update of existing slot in a cache. 
*/ -+static void cbk_cache_add(const znode * node/* node to add to the cache */) -+{ -+ cbk_cache *cache; -+ -+ cbk_cache_slot *slot; -+ int i; -+ -+ assert("nikita-352", node != NULL); -+ -+ cache = &znode_get_tree(node)->cbk_cache; -+ assert("nikita-2472", cbk_cache_invariant(cache)); -+ -+ if (cache->nr_slots == 0) -+ return; -+ -+ write_lock(&(cache->guard)); -+ /* find slot to update/add */ -+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { -+ /* oops, this node is already in a cache */ -+ if (slot->node == node) -+ break; -+ } -+ /* if all slots are used, reuse least recently used one */ -+ if (i == cache->nr_slots) { -+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru); -+ slot->node = (znode *) node; -+ } -+ list_move(&slot->lru, &cache->lru); -+ write_unlock(&(cache->guard)); -+ assert("nikita-2473", cbk_cache_invariant(cache)); -+} -+ -+static int setup_delimiting_keys(cbk_handle * h); -+static lookup_result coord_by_handle(cbk_handle * handle); -+static lookup_result traverse_tree(cbk_handle * h); -+static int cbk_cache_search(cbk_handle * h); -+ -+static level_lookup_result cbk_level_lookup(cbk_handle * h); -+static level_lookup_result cbk_node_lookup(cbk_handle * h); -+ -+/* helper functions */ -+ -+static void update_stale_dk(reiser4_tree * tree, znode * node); -+ -+/* release parent node during traversal */ -+static void put_parent(cbk_handle * h); -+/* check consistency of fields */ -+static int sanity_check(cbk_handle * h); -+/* release resources in handle */ -+static void hput(cbk_handle * h); -+ -+static level_lookup_result search_to_left(cbk_handle * h); -+ -+/* pack numerous (numberous I should say) arguments of coord_by_key() into -+ * cbk_handle */ -+static cbk_handle *cbk_pack(cbk_handle * handle, -+ reiser4_tree * tree, -+ const reiser4_key * key, -+ coord_t *coord, -+ lock_handle * active_lh, -+ lock_handle * parent_lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level stop_level, -+ __u32 flags, ra_info_t *info) -+{ -+ memset(handle, 0, sizeof *handle); -+ -+ handle->tree = tree; -+ handle->key = key; -+ handle->lock_mode = lock_mode; -+ handle->bias = bias; -+ handle->lock_level = lock_level; -+ handle->stop_level = stop_level; -+ handle->coord = coord; -+ /* set flags. See comment in tree.h:cbk_flags */ -+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK; -+ -+ handle->active_lh = active_lh; -+ handle->parent_lh = parent_lh; -+ handle->ra_info = info; -+ return handle; -+} -+ -+/* main tree lookup procedure -+ -+ Check coord cache. If key we are looking for is not found there, call cbk() -+ to do real tree traversal. -+ -+ As we have extents on the twig level, @lock_level and @stop_level can -+ be different from LEAF_LEVEL and each other. -+ -+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode -+ long term locks) while calling this. -+*/ -+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search -+ * in. Usually this tree is -+ * part of file-system -+ * super-block */ , -+ const reiser4_key * key /* key to look for */ , -+ coord_t *coord /* where to store found -+ * position in a tree. Fields -+ * in "coord" are only valid if -+ * coord_by_key() returned -+ * "CBK_COORD_FOUND" */ , -+ lock_handle * lh, /* resulting lock handle */ -+ znode_lock_mode lock_mode /* type of lookup we -+ * want on node. 
Pass -+ * ZNODE_READ_LOCK here -+ * if you only want to -+ * read item found and -+ * ZNODE_WRITE_LOCK if -+ * you want to modify -+ * it */ , -+ lookup_bias bias /* what to return if coord -+ * with exactly the @key is -+ * not in the tree */ , -+ tree_level lock_level/* tree level where to start -+ * taking @lock type of -+ * locks */ , -+ tree_level stop_level/* tree level to stop. Pass -+ * LEAF_LEVEL or TWIG_LEVEL -+ * here Item being looked -+ * for has to be between -+ * @lock_level and -+ * @stop_level, inclusive */ , -+ __u32 flags /* search flags */ , -+ ra_info_t * -+ info -+ /* information about desired tree traversal -+ * readahead */ -+ ) -+{ -+ cbk_handle handle; -+ lock_handle parent_lh; -+ lookup_result result; -+ -+ init_lh(lh); -+ init_lh(&parent_lh); -+ -+ assert("nikita-3023", reiser4_schedulable()); -+ -+ assert("nikita-353", tree != NULL); -+ assert("nikita-354", key != NULL); -+ assert("nikita-355", coord != NULL); -+ assert("nikita-356", (bias == FIND_EXACT) -+ || (bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-357", stop_level >= LEAF_LEVEL); -+ /* no locks can be held during tree traversal */ -+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); -+ -+ cbk_pack(&handle, -+ tree, -+ key, -+ coord, -+ lh, -+ &parent_lh, -+ lock_mode, bias, lock_level, stop_level, flags, info); -+ -+ result = coord_by_handle(&handle); -+ assert("nikita-3247", -+ ergo(!IS_CBKERR(result), coord->node == lh->node)); -+ return result; -+} -+ -+/* like coord_by_key(), but starts traversal from vroot of @object rather than -+ * from tree root. */ -+lookup_result reiser4_object_lookup(struct inode *object, -+ const reiser4_key * key, -+ coord_t *coord, -+ lock_handle * lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level stop_level, __u32 flags, -+ ra_info_t *info) -+{ -+ cbk_handle handle; -+ lock_handle parent_lh; -+ lookup_result result; -+ -+ init_lh(lh); -+ init_lh(&parent_lh); -+ -+ assert("nikita-3023", reiser4_schedulable()); -+ -+ assert("nikita-354", key != NULL); -+ assert("nikita-355", coord != NULL); -+ assert("nikita-356", (bias == FIND_EXACT) -+ || (bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-357", stop_level >= LEAF_LEVEL); -+ /* no locks can be held during tree search by key */ -+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); -+ -+ cbk_pack(&handle, -+ object != NULL ? reiser4_tree_by_inode(object) : current_tree, -+ key, -+ coord, -+ lh, -+ &parent_lh, -+ lock_mode, bias, lock_level, stop_level, flags, info); -+ handle.object = object; -+ -+ result = coord_by_handle(&handle); -+ assert("nikita-3247", -+ ergo(!IS_CBKERR(result), coord->node == lh->node)); -+ return result; -+} -+ -+/* lookup by cbk_handle. Common part of coord_by_key() and -+ reiser4_object_lookup(). */ -+static lookup_result coord_by_handle(cbk_handle * handle) -+{ -+ /* -+ * first check cbk_cache (which is look-aside cache for our tree) and -+ * of this fails, start traversal. -+ */ -+ /* first check whether "key" is in cache of recent lookups. */ -+ if (cbk_cache_search(handle) == 0) -+ return handle->result; -+ else -+ return traverse_tree(handle); -+} -+ -+/* Execute actor for each item (or unit, depending on @through_units_p), -+ starting from @coord, right-ward, until either: -+ -+ - end of the tree is reached -+ - unformatted node is met -+ - error occurred -+ - @actor returns 0 or less -+ -+ Error code, or last actor return value is returned. 
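The actor contract spelled out above (keep walking while the callback returns a positive value; stop and propagate on zero or an error) is a common kernel iteration idiom. A minimal userspace sketch of the same contract, with hypothetical names:

#include <stdio.h>

/* actor returns >0 to continue, 0 to stop quietly, <0 on error */
typedef int (*actor_t)(int item, void *arg);

/* returns the error code or the last actor return value */
static int iterate(const int *items, int n, actor_t actor, void *arg)
{
    int result = 0;

    for (int i = 0; i < n; i++) {
        result = actor(items[i], arg);
        if (result <= 0)
            break;
    }
    return result;
}

static int print_until_limit(int item, void *arg)
{
    int limit = *(int *)arg;

    if (item > limit)
        return 0; /* stop: past the range of interest */
    printf("%d\n", item);
    return 1;     /* continue with the next item */
}

int main(void)
{
    int items[] = { 1, 2, 3, 7, 8 };
    int limit = 3;

    return iterate(items, 5, print_until_limit, &limit) < 0;
}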
-+
-+ This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move through
-+ a sequence of entries with identical keys and the like.
-+*/
-+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
-+ coord_t *coord /* coord to start from */ ,
-+ lock_handle * lh /* lock handle to start with and to
-+ * update along the way */ ,
-+ tree_iterate_actor_t actor /* function to call on each
-+ * item/unit */ ,
-+ void *arg /* argument to pass to @actor */ ,
-+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
-+ int through_units_p /* call @actor on each item or on
-+ * each unit */ )
-+{
-+ int result;
-+
-+ assert("nikita-1143", tree != NULL);
-+ assert("nikita-1145", coord != NULL);
-+ assert("nikita-1146", lh != NULL);
-+ assert("nikita-1147", actor != NULL);
-+
-+ result = zload(coord->node);
-+ coord_clear_iplug(coord);
-+ if (result != 0)
-+ return result;
-+ if (!coord_is_existing_unit(coord)) {
-+ zrelse(coord->node);
-+ return -ENOENT;
-+ }
-+ while ((result = actor(tree, coord, lh, arg)) > 0) {
-+ /* move further */
-+ if ((through_units_p && coord_next_unit(coord)) ||
-+ (!through_units_p && coord_next_item(coord))) {
-+ do {
-+ lock_handle couple;
-+
-+ /* move to the next node */
-+ init_lh(&couple);
-+ result =
-+ reiser4_get_right_neighbor(&couple,
-+ coord->node,
-+ (int)mode,
-+ GN_CAN_USE_UPPER_LEVELS);
-+ zrelse(coord->node);
-+ if (result == 0) {
-+
-+ result = zload(couple.node);
-+ if (result != 0) {
-+ done_lh(&couple);
-+ return result;
-+ }
-+
-+ coord_init_first_unit(coord,
-+ couple.node);
-+ done_lh(lh);
-+ move_lh(lh, &couple);
-+ } else
-+ return result;
-+ } while (node_is_empty(coord->node));
-+ }
-+
-+ assert("nikita-1149", coord_is_existing_unit(coord));
-+ }
-+ zrelse(coord->node);
-+ return result;
-+}
-+
-+/* return locked uber znode for @tree */
-+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
-+ znode_lock_request pri, lock_handle * lh)
-+{
-+ int result;
-+
-+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
-+ return result;
-+}
-+
-+/* true if @key is strictly within @node
-+
-+ we are looking for a possibly non-unique key, and the item is at the edge of
-+ @node. Maybe it is in the neighbor.
-+*/
-+static int znode_contains_key_strict(znode * node /* node to check key
-+ * against */ ,
-+ const reiser4_key *
-+ key /* key to check */ ,
-+ int isunique)
-+{
-+ int answer;
-+
-+ assert("nikita-1760", node != NULL);
-+ assert("nikita-1722", key != NULL);
-+
-+ if (keyge(key, &node->rd_key))
-+ return 0;
-+
-+ answer = keycmp(&node->ld_key, key);
-+
-+ if (isunique)
-+ return answer != GREATER_THAN;
-+ else
-+ return answer == LESS_THAN;
-+}
-+
-+/*
-+ * Virtual Root (vroot) code.
-+ *
-+ * For a given file system object (e.g., a regular file or directory) let's
-+ * define its "virtual root" as the lowest node in the tree (that is, the one
-+ * furthest from the tree root) such that all body items of said object are
-+ * located in a tree rooted at this node.
-+ *
-+ * Once the vroot of an object is found, all tree lookups for items within the
-+ * body of this object ("object lookups") can be started from its vroot rather
-+ * than from the real root. This has the following advantages:
-+ *
-+ * 1. the number of nodes traversed during lookup (and, hence, the number of
-+ * key comparisons made) decreases, and
-+ *
-+ * 2. contention on the tree root is decreased. The latter was actually the
-+ * motivating reason behind vroot, because the spin lock of the root node,
-+ * which is taken when acquiring a long-term lock on the root node, is the
-+ * hottest lock in reiser4.
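The lookup shape this enables can be sketched in a few lines of userspace C (hypothetical names; locking and the real descent are elided): try the cached subtree root first, and on any failure fall back to the real root, which re-learns the vroot as a side effect.

#include <stdio.h>

struct node {
    int level;
};

struct obj {
    struct node *vroot; /* cached subtree root; may be NULL or stale */
};

/* stand-in for a real top-down descent */
static int lookup_from(struct node *root, int key)
{
    (void)key;
    if (root == NULL)
        return -1;
    printf("descending from level %d\n", root->level);
    return 0;
}

static int object_lookup(struct obj *o, struct node *real_root, int key)
{
    /* fast path: shorter descent, no contention on the real root */
    if (o->vroot != NULL && lookup_from(o->vroot, key) == 0)
        return 0;
    /* stale or missing vroot: restart from the real tree root
     * (cf. handle_vroot(), which refreshes the cache on the way down) */
    o->vroot = NULL;
    return lookup_from(real_root, key);
}

int main(void)
{
    struct node root = { .level = 7 }, sub = { .level = 3 };
    struct obj file = { .vroot = &sub };

    object_lookup(&file, &root, 42); /* uses the cached vroot */
    file.vroot = NULL;               /* simulate a lost/stale entry */
    object_lookup(&file, &root, 42); /* falls back to the real root */
    return 0;
}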
-+ * -+ * How to find vroot. -+ * -+ * When vroot of object F is not yet determined, all object lookups start -+ * from the root of the tree. At each tree level during traversal we have -+ * a node N such that a key we are looking for (which is the key inside -+ * object's body) is located within N. In function handle_vroot() called -+ * from cbk_level_lookup() we check whether N is possible vroot for -+ * F. Check is trivial---if neither leftmost nor rightmost item of N -+ * belongs to F (and we already have helpful ->owns_item() method of -+ * object plugin for this), then N is possible vroot of F. This, of -+ * course, relies on the assumption that each object occupies contiguous -+ * range of keys in the tree. -+ * -+ * Thus, traversing tree downward and checking each node as we go, we can -+ * find lowest such node, which, by definition, is vroot. -+ * -+ * How to track vroot. -+ * -+ * Nohow. If actual vroot changes, next object lookup will just restart -+ * from the actual tree root, refreshing object's vroot along the way. -+ * -+ */ -+ -+/* -+ * Check whether @node is possible vroot of @object. -+ */ -+static void handle_vroot(struct inode *object, znode * node) -+{ -+ file_plugin *fplug; -+ coord_t coord; -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-3353", fplug != NULL); -+ assert("nikita-3354", fplug->owns_item != NULL); -+ -+ if (unlikely(node_is_empty(node))) -+ return; -+ -+ coord_init_first_unit(&coord, node); -+ /* -+ * if leftmost item of @node belongs to @object, we cannot be sure -+ * that @node is vroot of @object, because, some items of @object are -+ * probably in the sub-tree rooted at the left neighbor of @node. -+ */ -+ if (fplug->owns_item(object, &coord)) -+ return; -+ coord_init_last_unit(&coord, node); -+ /* mutatis mutandis for the rightmost item */ -+ if (fplug->owns_item(object, &coord)) -+ return; -+ /* otherwise, @node is possible vroot of @object */ -+ inode_set_vroot(object, node); -+} -+ -+/* -+ * helper function used by traverse tree to start tree traversal not from the -+ * tree root, but from @h->object's vroot, if possible. -+ */ -+static int prepare_object_lookup(cbk_handle * h) -+{ -+ znode *vroot; -+ int result; -+ -+ vroot = inode_get_vroot(h->object); -+ if (vroot == NULL) { -+ /* -+ * object doesn't have known vroot, start from real tree root. -+ */ -+ return LOOKUP_CONT; -+ } -+ -+ h->level = znode_get_level(vroot); -+ /* take a long-term lock on vroot */ -+ h->result = longterm_lock_znode(h->active_lh, vroot, -+ cbk_lock_mode(h->level, h), -+ ZNODE_LOCK_LOPRI); -+ result = LOOKUP_REST; -+ if (h->result == 0) { -+ int isunique; -+ int inside; -+ -+ isunique = h->flags & CBK_UNIQUE; -+ /* check that key is inside vroot */ -+ read_lock_dk(h->tree); -+ inside = (znode_contains_key_strict(vroot, h->key, isunique) && -+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE)); -+ read_unlock_dk(h->tree); -+ if (inside) { -+ h->result = zload(vroot); -+ if (h->result == 0) { -+ /* search for key in vroot. 
*/ -+ result = cbk_node_lookup(h); -+ zrelse(vroot); /*h->active_lh->node); */ -+ if (h->active_lh->node != vroot) { -+ result = LOOKUP_REST; -+ } else if (result == LOOKUP_CONT) { -+ move_lh(h->parent_lh, h->active_lh); -+ h->flags &= ~CBK_DKSET; -+ } -+ } -+ } -+ } -+ -+ zput(vroot); -+ -+ if (IS_CBKERR(h->result) || result == LOOKUP_REST) -+ hput(h); -+ return result; -+} -+ -+/* main function that handles common parts of tree traversal: starting -+ (fake znode handling), restarts, error handling, completion */ -+static lookup_result traverse_tree(cbk_handle * h/* search handle */) -+{ -+ int done; -+ int iterations; -+ int vroot_used; -+ -+ assert("nikita-365", h != NULL); -+ assert("nikita-366", h->tree != NULL); -+ assert("nikita-367", h->key != NULL); -+ assert("nikita-368", h->coord != NULL); -+ assert("nikita-369", (h->bias == FIND_EXACT) -+ || (h->bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-370", h->stop_level >= LEAF_LEVEL); -+ assert("nikita-2949", !(h->flags & CBK_DKSET)); -+ assert("zam-355", lock_stack_isclean(get_current_lock_stack())); -+ -+ done = 0; -+ iterations = 0; -+ vroot_used = 0; -+ -+ /* loop for restarts */ -+restart: -+ -+ assert("nikita-3024", reiser4_schedulable()); -+ -+ h->result = CBK_COORD_FOUND; -+ /* connect_znode() needs it */ -+ h->ld_key = *reiser4_min_key(); -+ h->rd_key = *reiser4_max_key(); -+ h->flags |= CBK_DKSET; -+ h->error = NULL; -+ -+ if (!vroot_used && h->object != NULL) { -+ vroot_used = 1; -+ done = prepare_object_lookup(h); -+ if (done == LOOKUP_REST) -+ goto restart; -+ else if (done == LOOKUP_DONE) -+ return h->result; -+ } -+ if (h->parent_lh->node == NULL) { -+ done = -+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI, -+ h->parent_lh); -+ -+ assert("nikita-1637", done != -E_DEADLOCK); -+ -+ h->block = h->tree->root_block; -+ h->level = h->tree->height; -+ h->coord->node = h->parent_lh->node; -+ -+ if (done != 0) -+ return done; -+ } -+ -+ /* loop descending a tree */ -+ while (!done) { -+ -+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) && -+ IS_POW(iterations))) { -+ warning("nikita-1481", "Too many iterations: %i", -+ iterations); -+ reiser4_print_key("key", h->key); -+ ++iterations; -+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) { -+ h->error = -+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring."; -+ h->result = RETERR(-EIO); -+ break; -+ } -+ switch (cbk_level_lookup(h)) { -+ case LOOKUP_CONT: -+ move_lh(h->parent_lh, h->active_lh); -+ continue; -+ default: -+ wrong_return_value("nikita-372", "cbk_level"); -+ case LOOKUP_DONE: -+ done = 1; -+ break; -+ case LOOKUP_REST: -+ hput(h); -+ /* deadlock avoidance is normal case. */ -+ if (h->result != -E_DEADLOCK) -+ ++iterations; -+ reiser4_preempt_point(); -+ goto restart; -+ } -+ } -+ /* that's all. The rest is error handling */ -+ if (unlikely(h->error != NULL)) { -+ warning("nikita-373", "%s: level: %i, " -+ "lock_level: %i, stop_level: %i " -+ "lock_mode: %s, bias: %s", -+ h->error, h->level, h->lock_level, h->stop_level, -+ lock_mode_name(h->lock_mode), bias_name(h->bias)); -+ reiser4_print_address("block", &h->block); -+ reiser4_print_key("key", h->key); -+ print_coord_content("coord", h->coord); -+ } -+ /* `unlikely' error case */ -+ if (unlikely(IS_CBKERR(h->result))) { -+ /* failure. 
do cleanup */ -+ hput(h); -+ } else { -+ assert("nikita-1605", WITH_DATA_RET -+ (h->coord->node, 1, -+ ergo((h->result == CBK_COORD_FOUND) && -+ (h->bias == FIND_EXACT) && -+ (!node_is_empty(h->coord->node)), -+ coord_is_existing_item(h->coord)))); -+ } -+ return h->result; -+} -+ -+/* find delimiting keys of child -+ -+ Determine left and right delimiting keys for child pointed to by -+ @parent_coord. -+ -+*/ -+static void find_child_delimiting_keys(znode * parent /* parent znode, passed -+ * locked */ , -+ const coord_t *parent_coord -+ /* coord where pointer -+ * to child is stored -+ */ , -+ reiser4_key * ld /* where to store left -+ * delimiting key */ , -+ reiser4_key * rd /* where to store right -+ * delimiting key */ ) -+{ -+ coord_t neighbor; -+ -+ assert("nikita-1484", parent != NULL); -+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock)); -+ -+ coord_dup(&neighbor, parent_coord); -+ -+ if (neighbor.between == AT_UNIT) -+ /* imitate item ->lookup() behavior. */ -+ neighbor.between = AFTER_UNIT; -+ -+ if (coord_set_to_left(&neighbor) == 0) -+ unit_key_by_coord(&neighbor, ld); -+ else { -+ assert("nikita-14851", 0); -+ *ld = *znode_get_ld_key(parent); -+ } -+ -+ coord_dup(&neighbor, parent_coord); -+ if (neighbor.between == AT_UNIT) -+ neighbor.between = AFTER_UNIT; -+ if (coord_set_to_right(&neighbor) == 0) -+ unit_key_by_coord(&neighbor, rd); -+ else -+ *rd = *znode_get_rd_key(parent); -+} -+ -+/* -+ * setup delimiting keys for a child -+ * -+ * @parent parent node -+ * -+ * @coord location in @parent where pointer to @child is -+ * -+ * @child child node -+ */ -+int -+set_child_delimiting_keys(znode * parent, const coord_t *coord, znode * child) -+{ -+ reiser4_tree *tree; -+ -+ assert("nikita-2952", -+ znode_get_level(parent) == znode_get_level(coord->node)); -+ -+ /* fast check without taking dk lock. This is safe, because -+ * JNODE_DKSET is never cleared once set. */ -+ if (!ZF_ISSET(child, JNODE_DKSET)) { -+ tree = znode_get_tree(parent); -+ write_lock_dk(tree); -+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) { -+ find_child_delimiting_keys(parent, coord, -+ &child->ld_key, -+ &child->rd_key); -+ ON_DEBUG(child->ld_key_version = -+ atomic_inc_return(&delim_key_version); -+ child->rd_key_version = -+ atomic_inc_return(&delim_key_version);); -+ ZF_SET(child, JNODE_DKSET); -+ } -+ write_unlock_dk(tree); -+ return 1; -+ } -+ return 0; -+} -+ -+/* Perform tree lookup at one level. This is called from cbk_traverse() -+ function that drives lookup through tree and calls cbk_node_lookup() to -+ perform lookup within one node. -+ -+ See comments in a code. -+*/ -+static level_lookup_result cbk_level_lookup(cbk_handle * h/* search handle */) -+{ -+ int ret; -+ int setdk; -+ int ldkeyset = 0; -+ reiser4_key ldkey; -+ reiser4_key key; -+ znode *active; -+ -+ assert("nikita-3025", reiser4_schedulable()); -+ -+ /* acquire reference to @active node */ -+ active = -+ zget(h->tree, &h->block, h->parent_lh->node, h->level, -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (IS_ERR(active)) { -+ h->result = PTR_ERR(active); -+ return LOOKUP_DONE; -+ } -+ -+ /* lock @active */ -+ h->result = longterm_lock_znode(h->active_lh, -+ active, -+ cbk_lock_mode(h->level, h), -+ ZNODE_LOCK_LOPRI); -+ /* longterm_lock_znode() acquires additional reference to znode (which -+ will be later released by longterm_unlock_znode()). Release -+ reference acquired by zget(). 
-+ */ -+ zput(active); -+ if (unlikely(h->result != 0)) -+ goto fail_or_restart; -+ -+ setdk = 0; -+ /* if @active is accessed for the first time, setup delimiting keys on -+ it. Delimiting keys are taken from the parent node. See -+ setup_delimiting_keys() for details. -+ */ -+ if (h->flags & CBK_DKSET) { -+ setdk = setup_delimiting_keys(h); -+ h->flags &= ~CBK_DKSET; -+ } else { -+ znode *parent; -+ -+ parent = h->parent_lh->node; -+ h->result = zload(parent); -+ if (unlikely(h->result != 0)) -+ goto fail_or_restart; -+ -+ if (!ZF_ISSET(active, JNODE_DKSET)) -+ setdk = set_child_delimiting_keys(parent, -+ h->coord, active); -+ else { -+ read_lock_dk(h->tree); -+ find_child_delimiting_keys(parent, h->coord, &ldkey, -+ &key); -+ read_unlock_dk(h->tree); -+ ldkeyset = 1; -+ } -+ zrelse(parent); -+ } -+ -+ /* this is ugly kludge. Reminder: this is necessary, because -+ ->lookup() method returns coord with ->between field probably set -+ to something different from AT_UNIT. -+ */ -+ h->coord->between = AT_UNIT; -+ -+ if (znode_just_created(active) && (h->coord->node != NULL)) { -+ write_lock_tree(h->tree); -+ /* if we are going to load znode right now, setup -+ ->in_parent: coord where pointer to this node is stored in -+ parent. -+ */ -+ coord_to_parent_coord(h->coord, &active->in_parent); -+ write_unlock_tree(h->tree); -+ } -+ -+ /* check connectedness without holding tree lock---false negatives -+ * will be re-checked by connect_znode(), and false positives are -+ * impossible---@active cannot suddenly turn into unconnected -+ * state. */ -+ if (!znode_is_connected(active)) { -+ h->result = connect_znode(h->coord, active); -+ if (unlikely(h->result != 0)) { -+ put_parent(h); -+ goto fail_or_restart; -+ } -+ } -+ -+ jload_prefetch(ZJNODE(active)); -+ -+ if (setdk) -+ update_stale_dk(h->tree, active); -+ -+ /* put_parent() cannot be called earlier, because connect_znode() -+ assumes parent node is referenced; */ -+ put_parent(h); -+ -+ if ((!znode_contains_key_lock(active, h->key) && -+ (h->flags & CBK_TRUST_DK)) -+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) { -+ /* 1. key was moved out of this node while this thread was -+ waiting for the lock. Restart. More elaborate solution is -+ to determine where key moved (to the left, or to the right) -+ and try to follow it through sibling pointers. -+ -+ 2. or, node itself is going to be removed from the -+ tree. Release lock and restart. -+ */ -+ h->result = -E_REPEAT; -+ } -+ if (h->result == -E_REPEAT) -+ return LOOKUP_REST; -+ -+ h->result = zload_ra(active, h->ra_info); -+ if (h->result) -+ return LOOKUP_DONE; -+ -+ /* sanity checks */ -+ if (sanity_check(h)) { -+ zrelse(active); -+ return LOOKUP_DONE; -+ } -+ -+ /* check that key of leftmost item in the @active is the same as in -+ * its parent */ -+ if (ldkeyset && !node_is_empty(active) && -+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) { -+ warning("vs-3533", "Keys are inconsistent. 
Fsck?"); -+ reiser4_print_key("inparent", &ldkey); -+ reiser4_print_key("inchild", &key); -+ h->result = RETERR(-EIO); -+ zrelse(active); -+ return LOOKUP_DONE; -+ } -+ -+ if (h->object != NULL) -+ handle_vroot(h->object, active); -+ -+ ret = cbk_node_lookup(h); -+ -+ /* h->active_lh->node might change, but active is yet to be zrelsed */ -+ zrelse(active); -+ -+ return ret; -+ -+fail_or_restart: -+ if (h->result == -E_DEADLOCK) -+ return LOOKUP_REST; -+ return LOOKUP_DONE; -+} -+ -+#if REISER4_DEBUG -+/* check left and right delimiting keys of a znode */ -+void check_dkeys(znode * node) -+{ -+ znode *left; -+ znode *right; -+ -+ read_lock_tree(current_tree); -+ read_lock_dk(current_tree); -+ -+ assert("vs-1710", znode_is_any_locked(node)); -+ assert("vs-1197", -+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node))); -+ -+ left = node->left; -+ right = node->right; -+ -+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) -+ && left != NULL && ZF_ISSET(left, JNODE_DKSET)) -+ /* check left neighbor. Note that left neighbor is not locked, -+ so it might get wrong delimiting keys therefore */ -+ assert("vs-1198", -+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node)) -+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE))); -+ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) -+ && right != NULL && ZF_ISSET(right, JNODE_DKSET)) -+ /* check right neighbor. Note that right neighbor is not -+ locked, so it might get wrong delimiting keys therefore */ -+ assert("vs-1199", -+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right)) -+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE))); -+ -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+} -+#endif -+ -+/* true if @key is left delimiting key of @node */ -+static int key_is_ld(znode * node, const reiser4_key * key) -+{ -+ int ld; -+ -+ assert("nikita-1716", node != NULL); -+ assert("nikita-1758", key != NULL); -+ -+ read_lock_dk(znode_get_tree(node)); -+ assert("nikita-1759", znode_contains_key(node, key)); -+ ld = keyeq(znode_get_ld_key(node), key); -+ read_unlock_dk(znode_get_tree(node)); -+ return ld; -+} -+ -+/* Process one node during tree traversal. -+ -+ This is called by cbk_level_lookup(). 
*/ -+static level_lookup_result cbk_node_lookup(cbk_handle * h/* search handle */) -+{ -+ /* node plugin of @active */ -+ node_plugin *nplug; -+ /* item plugin of item that was found */ -+ item_plugin *iplug; -+ /* search bias */ -+ lookup_bias node_bias; -+ /* node we are operating upon */ -+ znode *active; -+ /* tree we are searching in */ -+ reiser4_tree *tree; -+ /* result */ -+ int result; -+ -+ assert("nikita-379", h != NULL); -+ -+ active = h->active_lh->node; -+ tree = h->tree; -+ -+ nplug = active->nplug; -+ assert("nikita-380", nplug != NULL); -+ -+ ON_DEBUG(check_dkeys(active)); -+ -+ /* return item from "active" node with maximal key not greater than -+ "key" */ -+ node_bias = h->bias; -+ result = nplug->lookup(active, h->key, node_bias, h->coord); -+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) { -+ /* error occurred */ -+ h->result = result; -+ return LOOKUP_DONE; -+ } -+ if (h->level == h->stop_level) { -+ /* welcome to the stop level */ -+ assert("nikita-381", h->coord->node == active); -+ if (result == NS_FOUND) { -+ /* success of tree lookup */ -+ if (!(h->flags & CBK_UNIQUE) -+ && key_is_ld(active, h->key)) -+ return search_to_left(h); -+ else -+ h->result = CBK_COORD_FOUND; -+ } else { -+ h->result = CBK_COORD_NOTFOUND; -+ } -+ if (!(h->flags & CBK_IN_CACHE)) -+ cbk_cache_add(active); -+ return LOOKUP_DONE; -+ } -+ -+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) { -+ h->error = "not found on internal node"; -+ h->result = result; -+ return LOOKUP_DONE; -+ } -+ -+ assert("vs-361", h->level > h->stop_level); -+ -+ if (handle_eottl(h, &result)) { -+ assert("vs-1674", (result == LOOKUP_DONE || -+ result == LOOKUP_REST)); -+ return result; -+ } -+ -+ /* go down to next level */ -+ check_me("vs-12", zload(h->coord->node) == 0); -+ assert("nikita-2116", item_is_internal(h->coord)); -+ iplug = item_plugin_by_coord(h->coord); -+ iplug->s.internal.down_link(h->coord, h->key, &h->block); -+ zrelse(h->coord->node); -+ --h->level; -+ return LOOKUP_CONT; /* continue */ -+} -+ -+/* scan cbk_cache slots looking for a match for @h */ -+static int cbk_cache_scan_slots(cbk_handle * h/* cbk handle */) -+{ -+ level_lookup_result llr; -+ znode *node; -+ reiser4_tree *tree; -+ cbk_cache_slot *slot; -+ cbk_cache *cache; -+ tree_level level; -+ int isunique; -+ const reiser4_key *key; -+ int result; -+ -+ assert("nikita-1317", h != NULL); -+ assert("nikita-1315", h->tree != NULL); -+ assert("nikita-1316", h->key != NULL); -+ -+ tree = h->tree; -+ cache = &tree->cbk_cache; -+ if (cache->nr_slots == 0) -+ /* size of cbk cache was set to 0 by mount time option. */ -+ return RETERR(-ENOENT); -+ -+ assert("nikita-2474", cbk_cache_invariant(cache)); -+ node = NULL; /* to keep gcc happy */ -+ level = h->level; -+ key = h->key; -+ isunique = h->flags & CBK_UNIQUE; -+ result = RETERR(-ENOENT); -+ -+ /* -+ * this is time-critical function and dragons had, hence, been settled -+ * here. -+ * -+ * Loop below scans cbk cache slots trying to find matching node with -+ * suitable range of delimiting keys and located at the h->level. -+ * -+ * Scan is done under cbk cache spin lock that protects slot->node -+ * pointers. If suitable node is found we want to pin it in -+ * memory. But slot->node can point to the node with x_count 0 -+ * (unreferenced). Such node can be recycled at any moment, or can -+ * already be in the process of being recycled (within jput()). -+ * -+ * As we found node in the cbk cache, it means that jput() hasn't yet -+ * called cbk_cache_invalidate(). 
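The pin-then-recheck pattern that this comment goes on to spell out (take a reference without the tree lock, then test the RIP bit, so that a node already being recycled by jput() is rejected) can be isolated into a few lines. This is a userspace sketch under stated assumptions: C11 atomics stand in for jnode reference counting, and the names (struct cslot, try_pin()) are hypothetical, not the reiser4 API.

    #include <stdatomic.h>
    #include <stddef.h>

    struct cslot {
        atomic_int refcount;   /* x_count analogue                     */
        atomic_bool rip;       /* set once teardown begins (JNODE_RIP) */
    };

    /* Pin @n for use, or fail so the caller falls back to a full tree
     * lookup. Taking the reference *first* guarantees that if the RIP
     * test passes, the recycler cannot free the node under us. */
    static int try_pin(struct cslot *n)
    {
        if (n == NULL)
            return 0;
        atomic_fetch_add(&n->refcount, 1);      /* zref() analogue */
        if (atomic_load(&n->rip)) {
            atomic_fetch_sub(&n->refcount, 1);  /* zput() analogue */
            return 0;                           /* lost the race   */
        }
        return 1;
    }

The kernel version additionally holds rcu_read_lock() across this window, so the node's memory itself stays valid while the flag is tested.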
-+ * -+ * We acquire reference to the node without holding tree lock, and -+ * later, check node's RIP bit. This avoids races with jput(). -+ */ -+ -+ rcu_read_lock(); -+ read_lock(&((cbk_cache *)cache)->guard); -+ -+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru); -+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru); -+ BUG_ON(&slot->lru != &cache->lru);/*????*/ -+ while (1) { -+ -+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru); -+ -+ if (&cache->lru != &slot->lru) -+ node = slot->node; -+ else -+ node = NULL; -+ -+ if (unlikely(node == NULL)) -+ break; -+ -+ /* -+ * this is (hopefully) the only place in the code where we are -+ * working with delimiting keys without holding dk lock. This -+ * is fine here, because this is only "guess" anyway---keys -+ * are rechecked under dk lock below. -+ */ -+ if (znode_get_level(node) == level && -+ /* reiser4_min_key < key < reiser4_max_key */ -+ znode_contains_key_strict(node, key, isunique)) { -+ zref(node); -+ result = 0; -+ spin_lock_prefetch(&tree->tree_lock); -+ break; -+ } -+ } -+ read_unlock(&((cbk_cache *)cache)->guard); -+ -+ assert("nikita-2475", cbk_cache_invariant(cache)); -+ -+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP))) -+ result = -ENOENT; -+ -+ rcu_read_unlock(); -+ -+ if (result != 0) { -+ h->result = CBK_COORD_NOTFOUND; -+ return RETERR(-ENOENT); -+ } -+ -+ result = -+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h), -+ ZNODE_LOCK_LOPRI); -+ zput(node); -+ if (result != 0) -+ return result; -+ result = zload(node); -+ if (result != 0) -+ return result; -+ -+ /* recheck keys */ -+ read_lock_dk(tree); -+ result = (znode_contains_key_strict(node, key, isunique) && -+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ read_unlock_dk(tree); -+ if (result) { -+ /* do lookup inside node */ -+ llr = cbk_node_lookup(h); -+ /* if cbk_node_lookup() wandered to another node (due to eottl -+ or non-unique keys), adjust @node */ -+ /*node = h->active_lh->node; */ -+ -+ if (llr != LOOKUP_DONE) { -+ /* restart or continue on the next level */ -+ result = RETERR(-ENOENT); -+ } else if (IS_CBKERR(h->result)) -+ /* io or oom */ -+ result = RETERR(-ENOENT); -+ else { -+ /* good. Either item found or definitely not found. */ -+ result = 0; -+ -+ write_lock(&(cache->guard)); -+ if (slot->node == h->active_lh->node) { -+ /* if this node is still in cbk cache---move -+ its slot to the head of the LRU list. */ -+ list_move(&slot->lru, &cache->lru); -+ } -+ write_unlock(&(cache->guard)); -+ } -+ } else { -+ /* race. While this thread was waiting for the lock, node was -+ rebalanced and item we are looking for, shifted out of it -+ (if it ever was here). -+ -+ Continuing scanning is almost hopeless: node key range was -+ moved to, is almost certainly at the beginning of the LRU -+ list at this time, because it's hot, but restarting -+ scanning from the very beginning is complex. Just return, -+ so that cbk() will be performed. This is not that -+ important, because such races should be rare. Are they? -+ */ -+ result = RETERR(-ENOENT); /* -ERAUGHT */ -+ } -+ zrelse(node); -+ assert("nikita-2476", cbk_cache_invariant(cache)); -+ return result; -+} -+ -+/* look for item with given key in the coord cache -+ -+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache) -+ which is a small LRU list of znodes accessed lately. For each znode in -+ znode in this list, it checks whether key we are looking for fits into key -+ range covered by this node. 
If so, and in addition, node lies at allowed -+ level (this is to handle extents on a twig level), node is locked, and -+ lookup inside it is performed. -+ -+ we need a measurement of the cost of this cache search compared to the cost -+ of coord_by_key. -+ -+*/ -+static int cbk_cache_search(cbk_handle * h/* cbk handle */) -+{ -+ int result = 0; -+ tree_level level; -+ -+ /* add CBK_IN_CACHE to the handle flags. This means that -+ * cbk_node_lookup() assumes that cbk_cache is scanned and would add -+ * found node to the cache. */ -+ h->flags |= CBK_IN_CACHE; -+ for (level = h->stop_level; level <= h->lock_level; ++level) { -+ h->level = level; -+ result = cbk_cache_scan_slots(h); -+ if (result != 0) { -+ done_lh(h->active_lh); -+ done_lh(h->parent_lh); -+ } else { -+ assert("nikita-1319", !IS_CBKERR(h->result)); -+ break; -+ } -+ } -+ h->flags &= ~CBK_IN_CACHE; -+ return result; -+} -+ -+/* type of lock we want to obtain during tree traversal. On stop level -+ we want type of lock user asked for, on upper levels: read lock. */ -+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h) -+{ -+ assert("nikita-382", h != NULL); -+ -+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK; -+} -+ -+/* update outdated delimiting keys */ -+static void stale_dk(reiser4_tree * tree, znode * node) -+{ -+ znode *right; -+ -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ right = node->right; -+ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ right && ZF_ISSET(right, JNODE_DKSET) && -+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right))) -+ znode_set_rd_key(node, znode_get_ld_key(right)); -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* check for possibly outdated delimiting keys, and update them if -+ * necessary. */ -+static void update_stale_dk(reiser4_tree * tree, znode * node) -+{ -+ znode *right; -+ reiser4_key rd; -+ -+ read_lock_tree(tree); -+ read_lock_dk(tree); -+ rd = *znode_get_rd_key(node); -+ right = node->right; -+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ right && ZF_ISSET(right, JNODE_DKSET) && -+ !keyeq(&rd, znode_get_ld_key(right)))) { -+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET)); -+ read_unlock_dk(tree); -+ read_unlock_tree(tree); -+ stale_dk(tree, node); -+ return; -+ } -+ read_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* -+ * handle searches a the non-unique key. -+ * -+ * Suppose that we are looking for an item with possibly non-unique key 100. -+ * -+ * Root node contains two pointers: one to a node with left delimiting key 0, -+ * and another to a node with left delimiting key 100. Item we interested in -+ * may well happen in the sub-tree rooted at the first pointer. -+ * -+ * To handle this search_to_left() is called when search reaches stop -+ * level. This function checks it is _possible_ that item we are looking for -+ * is in the left neighbor (this can be done by comparing delimiting keys) and -+ * if so, tries to lock left neighbor (this is low priority lock, so it can -+ * deadlock, tree traversal is just restarted if it did) and then checks -+ * whether left neighbor actually contains items with our key. -+ * -+ * Note that this is done on the stop level only. It is possible to try such -+ * left-check on each level, but as duplicate keys are supposed to be rare -+ * (very unlikely that more than one node is completely filled with items with -+ * duplicate keys), it sis cheaper to scan to the left on the stop level once. 
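A concrete instance of the scenario above: duplicates of key 100 straddling a node boundary. The sketch below is a toy model, not kernel code (plain sorted arrays replace znodes; find_leftmost_dup() is a hypothetical name). The real implementation, search_to_left(), follows.

    #include <stddef.h>

    struct tleaf {
        const int *keys;            /* sorted, possibly with duplicates */
        size_t nr;
        const struct tleaf *left;   /* left sibling, or NULL            */
    };

    /* index of first unit with key >= @key, or @l->nr if none */
    static size_t lower_bound(const struct tleaf *l, int key)
    {
        size_t i;

        for (i = 0; i < l->nr && l->keys[i] < key; i++)
            ;
        return i;
    }

    /* Pick the leaf holding the first duplicate of @key: only when the
     * match is the leftmost unit can earlier duplicates hide in the
     * left sibling, so only then is the neighbor probed, and only once,
     * on this level. */
    static const struct tleaf *find_leftmost_dup(const struct tleaf *l,
                                                 int key)
    {
        size_t i = lower_bound(l, key);

        if (i == 0 && l->left != NULL) {
            size_t j = lower_bound(l->left, key);

            if (j < l->left->nr && l->left->keys[j] == key)
                return l->left;   /* duplicates start in the neighbor */
        }
        return l;
    }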
-+ * -+ */ -+static level_lookup_result search_to_left(cbk_handle * h/* search handle */) -+{ -+ level_lookup_result result; -+ coord_t *coord; -+ znode *node; -+ znode *neighbor; -+ -+ lock_handle lh; -+ -+ assert("nikita-1761", h != NULL); -+ assert("nikita-1762", h->level == h->stop_level); -+ -+ init_lh(&lh); -+ coord = h->coord; -+ node = h->active_lh->node; -+ assert("nikita-1763", coord_is_leftmost_unit(coord)); -+ -+ h->result = -+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ neighbor = NULL; -+ switch (h->result) { -+ case -E_DEADLOCK: -+ result = LOOKUP_REST; -+ break; -+ case 0:{ -+ node_plugin *nplug; -+ coord_t crd; -+ lookup_bias bias; -+ -+ neighbor = lh.node; -+ h->result = zload(neighbor); -+ if (h->result != 0) { -+ result = LOOKUP_DONE; -+ break; -+ } -+ -+ nplug = neighbor->nplug; -+ -+ coord_init_zero(&crd); -+ bias = h->bias; -+ h->bias = FIND_EXACT; -+ h->result = -+ nplug->lookup(neighbor, h->key, h->bias, &crd); -+ h->bias = bias; -+ -+ if (h->result == NS_NOT_FOUND) { -+ case -E_NO_NEIGHBOR: -+ h->result = CBK_COORD_FOUND; -+ if (!(h->flags & CBK_IN_CACHE)) -+ cbk_cache_add(node); -+ default: /* some other error */ -+ result = LOOKUP_DONE; -+ } else if (h->result == NS_FOUND) { -+ read_lock_dk(znode_get_tree(neighbor)); -+ h->rd_key = *znode_get_ld_key(node); -+ leftmost_key_in_node(neighbor, &h->ld_key); -+ read_unlock_dk(znode_get_tree(neighbor)); -+ h->flags |= CBK_DKSET; -+ -+ h->block = *znode_get_block(neighbor); -+ /* clear coord->node so that cbk_level_lookup() -+ wouldn't overwrite parent hint in neighbor. -+ -+ Parent hint was set up by -+ reiser4_get_left_neighbor() -+ */ -+ /* FIXME: why do we have to spinlock here? */ -+ write_lock_tree(znode_get_tree(neighbor)); -+ h->coord->node = NULL; -+ write_unlock_tree(znode_get_tree(neighbor)); -+ result = LOOKUP_CONT; -+ } else { -+ result = LOOKUP_DONE; -+ } -+ if (neighbor != NULL) -+ zrelse(neighbor); -+ } -+ } -+ done_lh(&lh); -+ return result; -+} -+ -+/* debugging aid: return symbolic name of search bias */ -+static const char *bias_name(lookup_bias bias/* bias to get name of */) -+{ -+ if (bias == FIND_EXACT) -+ return "exact"; -+ else if (bias == FIND_MAX_NOT_MORE_THAN) -+ return "left-slant"; -+/* else if( bias == RIGHT_SLANT_BIAS ) */ -+/* return "right-bias"; */ -+ else { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", bias); -+ return buf; -+ } -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: print human readable information about @p */ -+void print_coord_content(const char *prefix /* prefix to print */ , -+ coord_t *p/* coord to print */) -+{ -+ reiser4_key key; -+ -+ if (p == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ if ((p->node != NULL) && znode_is_loaded(p->node) -+ && coord_is_existing_item(p)) -+ printk("%s: data: %p, length: %i\n", prefix, -+ item_body_by_coord(p), item_length_by_coord(p)); -+ if (znode_is_loaded(p->node)) { -+ item_key_by_coord(p, &key); -+ reiser4_print_key(prefix, &key); -+ } -+} -+ -+/* debugging aid: print human readable information about @block */ -+void reiser4_print_address(const char *prefix /* prefix to print */ , -+ const reiser4_block_nr * block/* block number to print */) -+{ -+ printk("%s: %s\n", prefix, sprint_address(block)); -+} -+#endif -+ -+/* return string containing human readable representation of @block */ -+char *sprint_address(const reiser4_block_nr * -+ block/* block number to print */) -+{ -+ static char address[30]; -+ -+ if (block == NULL) -+ sprintf(address, "null"); -+ else 
if (reiser4_blocknr_is_fake(block)) -+ sprintf(address, "%llx", (unsigned long long)(*block)); -+ else -+ sprintf(address, "%llu", (unsigned long long)(*block)); -+ return address; -+} -+ -+/* release parent node during traversal */ -+static void put_parent(cbk_handle * h/* search handle */) -+{ -+ assert("nikita-383", h != NULL); -+ if (h->parent_lh->node != NULL) -+ longterm_unlock_znode(h->parent_lh); -+} -+ -+/* helper function used by coord_by_key(): release reference to parent znode -+ stored in handle before processing its child. */ -+static void hput(cbk_handle * h/* search handle */) -+{ -+ assert("nikita-385", h != NULL); -+ done_lh(h->parent_lh); -+ done_lh(h->active_lh); -+} -+ -+/* Helper function used by cbk(): update delimiting keys of child node (stored -+ in h->active_lh->node) using key taken from parent on the parent level. */ -+static int setup_delimiting_keys(cbk_handle * h/* search handle */) -+{ -+ znode *active; -+ reiser4_tree *tree; -+ -+ assert("nikita-1088", h != NULL); -+ -+ active = h->active_lh->node; -+ -+ /* fast check without taking dk lock. This is safe, because -+ * JNODE_DKSET is never cleared once set. */ -+ if (!ZF_ISSET(active, JNODE_DKSET)) { -+ tree = znode_get_tree(active); -+ write_lock_dk(tree); -+ if (!ZF_ISSET(active, JNODE_DKSET)) { -+ znode_set_ld_key(active, &h->ld_key); -+ znode_set_rd_key(active, &h->rd_key); -+ ZF_SET(active, JNODE_DKSET); -+ } -+ write_unlock_dk(tree); -+ return 1; -+ } -+ return 0; -+} -+ -+/* true if @block makes sense for the @tree. Used to detect corrupted node -+ * pointers */ -+static int -+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ , -+ reiser4_tree * tree/* tree to check against */) -+{ -+ assert("nikita-757", block != NULL); -+ assert("nikita-758", tree != NULL); -+ -+ /* check to see if it exceeds the size of the device. */ -+ return reiser4_blocknr_is_sane_for(tree->super, block); -+} -+ -+/* check consistency of fields */ -+static int sanity_check(cbk_handle * h/* search handle */) -+{ -+ assert("nikita-384", h != NULL); -+ -+ if (h->level < h->stop_level) { -+ h->error = "Buried under leaves"; -+ h->result = RETERR(-EIO); -+ return LOOKUP_DONE; -+ } else if (!block_nr_is_correct(&h->block, h->tree)) { -+ h->error = "bad block number"; -+ h->result = RETERR(-EIO); -+ return LOOKUP_DONE; -+ } else -+ return 0; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/status_flags.c linux-2.6.30/fs/reiser4/status_flags.c ---- linux-2.6.30.orig/fs/reiser4/status_flags.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/status_flags.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,174 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Functions that deal with reiser4 status block, query status and update it, -+ * if needed */ -+ -+#include <linux/bio.h> -+#include <linux/highmem.h> -+#include <linux/fs.h> -+#include <linux/blkdev.h> -+#include "debug.h" -+#include "dformat.h" -+#include "status_flags.h" -+#include "super.h" -+ -+/* This is our end I/O handler that marks page uptodate if IO was successful. -+ It also unconditionally unlocks the page, so we can see that io was done. -+ We do not free bio, because we hope to reuse that. 
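The handshake this handler enables (lock the page, fire the bio, treat the completion-time unlock as the "I/O finished" signal) can be modelled with ordinary thread primitives. A minimal userspace sketch, assuming C11 threads; endio() and read_status_block() are hypothetical stand-ins, and a condition variable plays the role of the page lock bit that wait_on_page_locked() sleeps on.

    #include <threads.h>

    static mtx_t lock;
    static cnd_t done;
    static int io_done, ok;

    static void sync_init(void)
    {
        mtx_init(&lock, mtx_plain);
        cnd_init(&done);
    }

    /* completion side: record the result and wake the waiter, the way
     * the real handler sets/clears PageUptodate and unlocks the page */
    static void endio(int success)
    {
        mtx_lock(&lock);
        ok = success;
        io_done = 1;
        cnd_signal(&done);
        mtx_unlock(&lock);
    }

    /* submission side: fire the request, then block until endio() runs */
    static int read_status_block(void (*submit)(void))
    {
        io_done = 0;
        submit();                    /* submit_bio(READ, bio) analogue */
        mtx_lock(&lock);
        while (!io_done)
            cnd_wait(&done, &lock);  /* wait_on_page_locked() analogue */
        mtx_unlock(&lock);
        return ok ? 0 : -1;          /* caller maps failure to -EIO */
    }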
*/ -+static void reiser4_status_endio(struct bio *bio, int err) -+{ -+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageUptodate(bio->bi_io_vec->bv_page); -+ } else { -+ ClearPageUptodate(bio->bi_io_vec->bv_page); -+ SetPageError(bio->bi_io_vec->bv_page); -+ } -+ unlock_page(bio->bi_io_vec->bv_page); -+} -+ -+/* Initialise status code. This is expected to be called from the disk format -+ code. block paremeter is where status block lives. */ -+int reiser4_status_init(reiser4_block_nr block) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ struct bio *bio; -+ struct page *page; -+ -+ get_super_private(sb)->status_page = NULL; -+ get_super_private(sb)->status_bio = NULL; -+ -+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0); -+ if (!page) -+ return -ENOMEM; -+ -+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1); -+ if (bio != NULL) { -+ bio->bi_sector = block * (sb->s_blocksize >> 9); -+ bio->bi_bdev = sb->s_bdev; -+ bio->bi_io_vec[0].bv_page = page; -+ bio->bi_io_vec[0].bv_len = sb->s_blocksize; -+ bio->bi_io_vec[0].bv_offset = 0; -+ bio->bi_vcnt = 1; -+ bio->bi_size = sb->s_blocksize; -+ bio->bi_end_io = reiser4_status_endio; -+ } else { -+ __free_pages(page, 0); -+ return -ENOMEM; -+ } -+ lock_page(page); -+ submit_bio(READ, bio); -+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping); -+ wait_on_page_locked(page); -+ if (!PageUptodate(page)) { -+ warning("green-2007", -+ "I/O error while tried to read status page\n"); -+ return -EIO; -+ } -+ -+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0); -+ if (memcmp -+ (statuspage->magic, REISER4_STATUS_MAGIC, -+ sizeof(REISER4_STATUS_MAGIC))) { -+ /* Magic does not match. */ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ warning("green-2008", "Wrong magic in status block\n"); -+ __free_pages(page, 0); -+ bio_put(bio); -+ return -EINVAL; -+ } -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ -+ get_super_private(sb)->status_page = page; -+ get_super_private(sb)->status_bio = bio; -+ return 0; -+} -+ -+/* Query the status of fs. Returns if the FS can be safely mounted. -+ Also if "status" and "extended" parameters are given, it will fill -+ actual parts of status from disk there. */ -+int reiser4_status_query(u64 *status, u64 *extended) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ int retval; -+ -+ if (!get_super_private(sb)->status_page) -+ /* No status page? */ -+ return REISER4_STATUS_MOUNT_UNKNOWN; -+ statuspage = (struct reiser4_status *) -+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); -+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { -+ /* FIXME: this cast is a hack for 32 bit arches to work. */ -+ case REISER4_STATUS_OK: -+ retval = REISER4_STATUS_MOUNT_OK; -+ break; -+ case REISER4_STATUS_CORRUPTED: -+ retval = REISER4_STATUS_MOUNT_WARN; -+ break; -+ case REISER4_STATUS_DAMAGED: -+ case REISER4_STATUS_DESTROYED: -+ case REISER4_STATUS_IOERROR: -+ retval = REISER4_STATUS_MOUNT_RO; -+ break; -+ default: -+ retval = REISER4_STATUS_MOUNT_UNKNOWN; -+ break; -+ } -+ -+ if (status) -+ *status = le64_to_cpu(get_unaligned(&statuspage->status)); -+ if (extended) -+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status)); -+ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ return retval; -+} -+ -+/* This function should be called when something bad happens (e.g. from -+ reiser4_panic). 
It fills the status structure and tries to push it to disk.*/ -+int reiser4_status_write(__u64 status, __u64 extended_status, char *message) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ struct bio *bio = get_super_private(sb)->status_bio; -+ -+ if (!get_super_private(sb)->status_page) -+ /* No status page? */ -+ return -1; -+ statuspage = (struct reiser4_status *) -+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); -+ -+ put_unaligned(cpu_to_le64(status), &statuspage->status); -+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status); -+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN); -+ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ bio->bi_bdev = sb->s_bdev; -+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page; -+ bio->bi_io_vec[0].bv_len = sb->s_blocksize; -+ bio->bi_io_vec[0].bv_offset = 0; -+ bio->bi_vcnt = 1; -+ bio->bi_size = sb->s_blocksize; -+ bio->bi_end_io = reiser4_status_endio; -+ lock_page(get_super_private(sb)->status_page); /* Safe as nobody should -+ * touch our page. */ -+ /* We can block now, but we have no other choice anyway */ -+ submit_bio(WRITE, bio); -+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping); -+ return 0; /* We do not wait for io to finish. */ -+} -+ -+/* Frees the page with status and bio structure. Should be called by disk format -+ * at umount time */ -+int reiser4_status_finish(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ __free_pages(get_super_private(sb)->status_page, 0); -+ get_super_private(sb)->status_page = NULL; -+ bio_put(get_super_private(sb)->status_bio); -+ get_super_private(sb)->status_bio = NULL; -+ return 0; -+} -diff -urN linux-2.6.30.orig/fs/reiser4/status_flags.h linux-2.6.30/fs/reiser4/status_flags.h ---- linux-2.6.30.orig/fs/reiser4/status_flags.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/status_flags.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,47 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Here we declare structures and flags that store reiser4 status on disk. -+ The status that helps us to find out if the filesystem is valid or if it -+ contains some critical, or not so critical errors */ -+ -+#if !defined(__REISER4_STATUS_FLAGS_H__) -+#define __REISER4_STATUS_FLAGS_H__ -+ -+#include "dformat.h" -+/* These are major status flags */ -+#define REISER4_STATUS_OK 0 -+#define REISER4_STATUS_CORRUPTED 0x1 -+#define REISER4_STATUS_DAMAGED 0x2 -+#define REISER4_STATUS_DESTROYED 0x4 -+#define REISER4_STATUS_IOERROR 0x8 -+ -+/* Return values for reiser4_status_query() */ -+#define REISER4_STATUS_MOUNT_OK 0 -+#define REISER4_STATUS_MOUNT_WARN 1 -+#define REISER4_STATUS_MOUNT_RO 2 -+#define REISER4_STATUS_MOUNT_UNKNOWN -1 -+ -+#define REISER4_TEXTERROR_LEN 256 -+ -+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl" -+/* We probably need to keep its size under sector size which is 512 bytes */ -+struct reiser4_status { -+ char magic[16]; -+ d64 status; /* Current FS state */ -+ d64 extended_status; /* Any additional info that might have sense in -+ * addition to "status". E.g. 
last sector where -+ * io error happened if status is -+ * "io error encountered" */ -+ d64 stacktrace[10]; /* Last ten functional calls made (addresses) */ -+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if -+ * appropriate, otherwise filled -+ * with zeroes */ -+}; -+ -+int reiser4_status_init(reiser4_block_nr block); -+int reiser4_status_query(u64 *status, u64 *extended); -+int reiser4_status_write(u64 status, u64 extended_status, char *message); -+int reiser4_status_finish(void); -+ -+#endif -diff -urN linux-2.6.30.orig/fs/reiser4/super.c linux-2.6.30/fs/reiser4/super.c ---- linux-2.6.30.orig/fs/reiser4/super.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/super.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,306 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Super-block manipulations. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "plugin/security/perm.h" -+#include "plugin/space/space_allocator.h" -+#include "plugin/plugin.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include <linux/types.h> /* for __u?? */ -+#include <linux/fs.h> /* for struct super_block */ -+ -+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid); -+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid); -+static __u64 reserved_for_root(const struct super_block *super); -+ -+/* Return reiser4-specific part of super block */ -+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super) -+{ -+ return (reiser4_super_info_data *) super->s_fs_info; -+} -+ -+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() -+ */ -+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG) -+{ -+ assert("nikita-448", super != NULL); -+ assert("nikita-449", is_reiser4_super(super)); -+ return (long)REISER4_SUPER_MAGIC; -+} -+ -+/* functions to read/modify fields of reiser4_super_info_data */ -+ -+/* get number of blocks in file system */ -+__u64 reiser4_block_count(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("vs-494", super != NULL); -+ assert("vs-495", is_reiser4_super(super)); -+ return get_super_private(super)->block_count; -+} -+ -+#if REISER4_DEBUG -+/* -+ * number of blocks in the current file system -+ */ -+__u64 reiser4_current_block_count(void) -+{ -+ return get_current_super_private()->block_count; -+} -+#endif /* REISER4_DEBUG */ -+ -+/* set number of block in filesystem */ -+void reiser4_set_block_count(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-501", super != NULL); -+ assert("vs-502", is_reiser4_super(super)); -+ get_super_private(super)->block_count = nr; -+ /* -+ * The proper calculation of the reserved space counter (%5 of device -+ * block counter) we need a 64 bit division which is missing in Linux -+ * on i386 platform. Because we do not need a precise calculation here -+ * we can replace a div64 operation by this combination of -+ * multiplication and shift: 51. / (2^10) == .0498 . -+ * FIXME: this is a bug. It comes up only for very small filesystems -+ * which probably are never used. Nevertheless, it is a bug. Number of -+ * reserved blocks must be not less than maximal number of blocks which -+ * get grabbed with BA_RESERVED. 
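A quick worked check of the multiply-and-shift used just below: 51 / 2^10 = 51/1024 = 0.0498..., so (nr * 51) >> 10 approximates 5% from below, off by roughly 0.4% of the exact value, while needing no 64-bit division. A self-contained userspace check (reserved_approx() is a hypothetical name):

    #include <stdio.h>
    #include <inttypes.h>

    /* same estimate as the kernel code below: (nr * 51) >> 10, ~4.98% */
    static uint64_t reserved_approx(uint64_t nr_blocks)
    {
        return (nr_blocks * 51) >> 10;
    }

    int main(void)
    {
        uint64_t nr = 1000000;   /* 1M blocks, e.g. a ~4 GiB fs at 4 KiB */

        printf("approx  : %" PRIu64 "\n", reserved_approx(nr)); /* 49804 */
        printf("exact 5%%: %" PRIu64 "\n", nr / 20);            /* 50000 */
        return 0;
    }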
-+ */ -+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10); -+} -+ -+/* amount of blocks used (allocated for data) in file system */ -+__u64 reiser4_data_blocks(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-452", super != NULL); -+ assert("nikita-453", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_used; -+} -+ -+/* set number of block used in filesystem */ -+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-503", super != NULL); -+ assert("vs-504", is_reiser4_super(super)); -+ get_super_private(super)->blocks_used = nr; -+} -+ -+/* amount of free blocks in file system */ -+__u64 reiser4_free_blocks(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-454", super != NULL); -+ assert("nikita-455", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_free; -+} -+ -+/* set number of blocks free in filesystem */ -+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-505", super != NULL); -+ assert("vs-506", is_reiser4_super(super)); -+ get_super_private(super)->blocks_free = nr; -+} -+ -+/* get mkfs unique identifier */ -+__u32 reiser4_mkfs_id(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("vpf-221", super != NULL); -+ assert("vpf-222", is_reiser4_super(super)); -+ return get_super_private(super)->mkfs_id; -+} -+ -+/* amount of free blocks in file system */ -+__u64 reiser4_free_committed_blocks(const struct super_block *super) -+{ -+ assert("vs-497", super != NULL); -+ assert("vs-498", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_free_committed; -+} -+ -+/* amount of blocks in the file system reserved for @uid and @gid */ -+long reiser4_reserved_blocks(const struct super_block *super /* super block -+ queried */ , -+ uid_t uid /* user id */ , -+ gid_t gid/* group id */) -+{ -+ long reserved; -+ -+ assert("nikita-456", super != NULL); -+ assert("nikita-457", is_reiser4_super(super)); -+ -+ reserved = 0; -+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION) -+ reserved += reserved_for_gid(super, gid); -+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION) -+ reserved += reserved_for_uid(super, uid); -+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0)) -+ reserved += reserved_for_root(super); -+ return reserved; -+} -+ -+/* get/set value of/to grabbed blocks counter */ -+__u64 reiser4_grabbed_blocks(const struct super_block * super) -+{ -+ assert("zam-512", super != NULL); -+ assert("zam-513", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_grabbed; -+} -+ -+__u64 reiser4_flush_reserved(const struct super_block *super) -+{ -+ assert("vpf-285", super != NULL); -+ assert("vpf-286", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_flush_reserved; -+} -+ -+/* get/set value of/to counter of fake allocated formatted blocks */ -+__u64 reiser4_fake_allocated(const struct super_block *super) -+{ -+ assert("zam-516", super != NULL); -+ assert("zam-517", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_fake_allocated; -+} -+ -+/* get/set value of/to counter of fake allocated unformatted blocks */ -+__u64 reiser4_fake_allocated_unformatted(const struct super_block *super) -+{ -+ assert("zam-516", super != NULL); -+ assert("zam-517", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_fake_allocated_unformatted; -+} -+ -+/* get/set value of/to counter of clustered blocks */ -+__u64 
reiser4_clustered_blocks(const struct super_block *super) -+{ -+ assert("edward-601", super != NULL); -+ assert("edward-602", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_clustered; -+} -+ -+/* space allocator used by this file system */ -+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block -+ *super) -+{ -+ assert("nikita-1965", super != NULL); -+ assert("nikita-1966", is_reiser4_super(super)); -+ return &get_super_private(super)->space_allocator; -+} -+ -+/* return fake inode used to bind formatted nodes in the page cache */ -+struct inode *reiser4_get_super_fake(const struct super_block *super) -+{ -+ assert("nikita-1757", super != NULL); -+ return get_super_private(super)->fake; -+} -+ -+/* return fake inode used to bind copied on capture nodes in the page cache */ -+struct inode *reiser4_get_cc_fake(const struct super_block *super) -+{ -+ assert("nikita-1757", super != NULL); -+ return get_super_private(super)->cc; -+} -+ -+/* return fake inode used to bind bitmaps and journlal heads */ -+struct inode *reiser4_get_bitmap_fake(const struct super_block *super) -+{ -+ assert("nikita-17571", super != NULL); -+ return get_super_private(super)->bitmap; -+} -+ -+/* tree used by this file system */ -+reiser4_tree *reiser4_get_tree(const struct super_block *super) -+{ -+ assert("nikita-460", super != NULL); -+ assert("nikita-461", is_reiser4_super(super)); -+ return &get_super_private(super)->tree; -+} -+ -+/* Check that @super is (looks like) reiser4 super block. This is mainly for -+ use in assertions. */ -+int is_reiser4_super(const struct super_block *super) -+{ -+ return -+ super != NULL && -+ get_super_private(super) != NULL && -+ super->s_op == &(get_super_private(super)->ops.super); -+} -+ -+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f) -+{ -+ return test_bit((int)f, &get_super_private(super)->fs_flags); -+} -+ -+/* amount of blocks reserved for given group in file system */ -+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG, -+ gid_t gid UNUSED_ARG/* group id */) -+{ -+ return 0; -+} -+ -+/* amount of blocks reserved for given user in file system */ -+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG, -+ uid_t uid UNUSED_ARG/* user id */) -+{ -+ return 0; -+} -+ -+/* amount of blocks reserved for super user in file system */ -+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* -+ * true if block number @blk makes sense for the file system at @super. -+ */ -+int -+reiser4_blocknr_is_sane_for(const struct super_block *super, -+ const reiser4_block_nr * blk) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ assert("nikita-2957", super != NULL); -+ assert("nikita-2958", blk != NULL); -+ -+ if (reiser4_blocknr_is_fake(blk)) -+ return 1; -+ -+ sbinfo = get_super_private(super); -+ return *blk < sbinfo->block_count; -+} -+ -+#if REISER4_DEBUG -+/* -+ * true, if block number @blk makes sense for the current file system -+ */ -+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk) -+{ -+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk); -+} -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/super.h linux-2.6.30/fs/reiser4/super.h ---- linux-2.6.30.orig/fs/reiser4/super.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/super.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,466 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Super-block functions. See super.c for details. */ -+ -+#if !defined(__REISER4_SUPER_H__) -+#define __REISER4_SUPER_H__ -+ -+#include <linux/exportfs.h> -+ -+#include "tree.h" -+#include "entd.h" -+#include "wander.h" -+#include "fsdata.h" -+#include "plugin/object.h" -+#include "plugin/space/space_allocator.h" -+ -+/* -+ * Flush algorithms parameters. -+ */ -+struct flush_params { -+ unsigned relocate_threshold; -+ unsigned relocate_distance; -+ unsigned written_threshold; -+ unsigned scan_maxnodes; -+}; -+ -+typedef enum { -+ /* -+ * True if this file system doesn't support hard-links (multiple names) -+ * for directories: this is default UNIX behavior. -+ * -+ * If hard-links on directoires are not allowed, file system is Acyclic -+ * Directed Graph (modulo dot, and dotdot, of course). -+ * -+ * This is used by reiser4_link(). -+ */ -+ REISER4_ADG = 0, -+ /* -+ * set if all nodes in internal tree have the same node layout plugin. -+ * If so, znode_guess_plugin() will return tree->node_plugin in stead -+ * of guessing plugin by plugin id stored in the node. -+ */ -+ REISER4_ONE_NODE_PLUGIN = 1, -+ /* if set, bsd gid assignment is supported. */ -+ REISER4_BSD_GID = 2, -+ /* [mac]_time are 32 bit in inode */ -+ REISER4_32_BIT_TIMES = 3, -+ /* load all bitmap blocks at mount time */ -+ REISER4_DONT_LOAD_BITMAP = 5, -+ /* enforce atomicity during write(2) */ -+ REISER4_ATOMIC_WRITE = 6, -+ /* don't use write barriers in the log writer code. */ -+ REISER4_NO_WRITE_BARRIER = 7 -+} reiser4_fs_flag; -+ -+/* -+ * VFS related operation vectors. 
-+ */ -+struct object_ops { -+ struct super_operations super; -+ struct dentry_operations dentry; -+ struct export_operations export; -+}; -+ -+/* reiser4-specific part of super block -+ -+ Locking -+ -+ Fields immutable after mount: -+ -+ ->oid* -+ ->space* -+ ->default_[ug]id -+ ->mkfs_id -+ ->trace_flags -+ ->debug_flags -+ ->fs_flags -+ ->df_plug -+ ->optimal_io_size -+ ->plug -+ ->flush -+ ->u (bad name) -+ ->txnmgr -+ ->ra_params -+ ->fsuid -+ ->journal_header -+ ->journal_footer -+ -+ Fields protected by ->lnode_guard -+ -+ ->lnode_htable -+ -+ Fields protected by per-super block spin lock -+ -+ ->block_count -+ ->blocks_used -+ ->blocks_free -+ ->blocks_free_committed -+ ->blocks_grabbed -+ ->blocks_fake_allocated_unformatted -+ ->blocks_fake_allocated -+ ->blocks_flush_reserved -+ ->eflushed -+ ->blocknr_hint_default -+ -+ After journal replaying during mount, -+ -+ ->last_committed_tx -+ -+ is protected by ->tmgr.commit_mutex -+ -+ Invariants involving this data-type: -+ -+ [sb-block-counts] -+ [sb-grabbed] -+ [sb-fake-allocated] -+*/ -+struct reiser4_super_info_data { -+ /* -+ * guard spinlock which protects reiser4 super block fields (currently -+ * blocks_free, blocks_free_committed) -+ */ -+ spinlock_t guard; -+ -+ /* next oid that will be returned by oid_allocate() */ -+ oid_t next_to_use; -+ /* total number of used oids */ -+ oid_t oids_in_use; -+ -+ /* space manager plugin */ -+ reiser4_space_allocator space_allocator; -+ -+ /* reiser4 internal tree */ -+ reiser4_tree tree; -+ -+ /* -+ * default user id used for light-weight files without their own -+ * stat-data. -+ */ -+ uid_t default_uid; -+ -+ /* -+ * default group id used for light-weight files without their own -+ * stat-data. -+ */ -+ gid_t default_gid; -+ -+ /* mkfs identifier generated at mkfs time. */ -+ __u32 mkfs_id; -+ /* amount of blocks in a file system */ -+ __u64 block_count; -+ -+ /* inviolable reserve */ -+ __u64 blocks_reserved; -+ -+ /* amount of blocks used by file system data and meta-data. */ -+ __u64 blocks_used; -+ -+ /* -+ * amount of free blocks. This is "working" free blocks counter. It is -+ * like "working" bitmap, please see block_alloc.c for description. -+ */ -+ __u64 blocks_free; -+ -+ /* -+ * free block count for fs committed state. This is "commit" version of -+ * free block counter. -+ */ -+ __u64 blocks_free_committed; -+ -+ /* -+ * number of blocks reserved for further allocation, for all -+ * threads. -+ */ -+ __u64 blocks_grabbed; -+ -+ /* number of fake allocated unformatted blocks in tree. */ -+ __u64 blocks_fake_allocated_unformatted; -+ -+ /* number of fake allocated formatted blocks in tree. */ -+ __u64 blocks_fake_allocated; -+ -+ /* number of blocks reserved for flush operations. */ -+ __u64 blocks_flush_reserved; -+ -+ /* number of blocks reserved for cluster operations. */ -+ __u64 blocks_clustered; -+ -+ /* unique file-system identifier */ -+ __u32 fsuid; -+ -+ /* On-disk format version. If does not equal to the disk_format -+ plugin version, some format updates (e.g. enlarging plugin -+ set, etc) may have place on mount. */ -+ int version; -+ -+ /* file-system wide flags. 
See reiser4_fs_flag enum */ -+ unsigned long fs_flags; -+ -+ /* transaction manager */ -+ txn_mgr tmgr; -+ -+ /* ent thread */ -+ entd_context entd; -+ -+ /* fake inode used to bind formatted nodes */ -+ struct inode *fake; -+ /* inode used to bind bitmaps (and journal heads) */ -+ struct inode *bitmap; -+ /* inode used to bind copied on capture nodes */ -+ struct inode *cc; -+ -+ /* disk layout plugin */ -+ disk_format_plugin *df_plug; -+ -+ /* disk layout specific part of reiser4 super info data */ -+ union { -+ format40_super_info format40; -+ } u; -+ -+ /* value we return in st_blksize on stat(2) */ -+ unsigned long optimal_io_size; -+ -+ /* parameters for the flush algorithm */ -+ struct flush_params flush; -+ -+ /* pointers to jnodes for journal header and footer */ -+ jnode *journal_header; -+ jnode *journal_footer; -+ -+ journal_location jloc; -+ -+ /* head block number of last committed transaction */ -+ __u64 last_committed_tx; -+ -+ /* -+ * we remember last written location for using as a hint for new block -+ * allocation -+ */ -+ __u64 blocknr_hint_default; -+ -+ /* committed number of files (oid allocator state variable ) */ -+ __u64 nr_files_committed; -+ -+ struct formatted_ra_params ra_params; -+ -+ /* -+ * A mutex for serializing cut tree operation if out-of-free-space: -+ * the only one cut_tree thread is allowed to grab space from reserved -+ * area (it is 5% of disk space) -+ */ -+ struct mutex delete_mutex; -+ /* task owning ->delete_mutex */ -+ struct task_struct *delete_mutex_owner; -+ -+ /* Diskmap's blocknumber */ -+ __u64 diskmap_block; -+ -+ /* What to do in case of error */ -+ int onerror; -+ -+ /* operations for objects on this file system */ -+ struct object_ops ops; -+ -+ /* -+ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for -+ * more details -+ */ -+ struct d_cursor_info d_info; -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+ /* Alternative master superblock offset (in bytes) */ -+ unsigned long altsuper; -+#endif -+ struct repacker *repacker; -+ struct page *status_page; -+ struct bio *status_bio; -+ -+#if REISER4_DEBUG -+ /* -+ * minimum used blocks value (includes super blocks, bitmap blocks and -+ * other fs reserved areas), depends on fs format and fs size. -+ */ -+ __u64 min_blocks_used; -+ -+ /* -+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.) -+ * are kept on a list anchored at sbinfo->all_jnodes. This list is -+ * protected by sbinfo->all_guard spin lock. This lock should be taken -+ * with _irq modifier, because it is also modified from interrupt -+ * contexts (by RCU). -+ */ -+ spinlock_t all_guard; -+ /* list of all jnodes */ -+ struct list_head all_jnodes; -+#endif -+ struct dentry *debugfs_root; -+}; -+ -+extern reiser4_super_info_data *get_super_private_nocheck(const struct -+ super_block * super); -+ -+/* Return reiser4-specific part of super block */ -+static inline reiser4_super_info_data *get_super_private(const struct -+ super_block * super) -+{ -+ assert("nikita-447", super != NULL); -+ -+ return (reiser4_super_info_data *) super->s_fs_info; -+} -+ -+/* get ent context for the @super */ -+static inline entd_context *get_entd_context(struct super_block *super) -+{ -+ return &get_super_private(super)->entd; -+} -+ -+/* "Current" super-block: main super block used during current system -+ call. Reference to this super block is stored in reiser4_context. 
*/ -+static inline struct super_block *reiser4_get_current_sb(void) -+{ -+ return get_current_context()->super; -+} -+ -+/* Reiser4-specific part of "current" super-block: main super block used -+ during current system call. Reference to this super block is stored in -+ reiser4_context. */ -+static inline reiser4_super_info_data *get_current_super_private(void) -+{ -+ return get_super_private(reiser4_get_current_sb()); -+} -+ -+static inline struct formatted_ra_params *get_current_super_ra_params(void) -+{ -+ return &(get_current_super_private()->ra_params); -+} -+ -+/* -+ * true, if file system on @super is read-only -+ */ -+static inline int rofs_super(struct super_block *super) -+{ -+ return super->s_flags & MS_RDONLY; -+} -+ -+/* -+ * true, if @tree represents read-only file system -+ */ -+static inline int rofs_tree(reiser4_tree * tree) -+{ -+ return rofs_super(tree->super); -+} -+ -+/* -+ * true, if file system where @inode lives on, is read-only -+ */ -+static inline int rofs_inode(struct inode *inode) -+{ -+ return rofs_super(inode->i_sb); -+} -+ -+/* -+ * true, if file system where @node lives on, is read-only -+ */ -+static inline int rofs_jnode(jnode * node) -+{ -+ return rofs_tree(jnode_get_tree(node)); -+} -+ -+extern __u64 reiser4_current_block_count(void); -+ -+extern void build_object_ops(struct super_block *super, struct object_ops *ops); -+ -+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */ -+ -+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo) -+{ -+ spin_lock(&(sbinfo->guard)); -+} -+ -+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo) -+{ -+ assert_spin_locked(&(sbinfo->guard)); -+ spin_unlock(&(sbinfo->guard)); -+} -+ -+extern __u64 reiser4_flush_reserved(const struct super_block *); -+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f); -+extern long reiser4_statfs_type(const struct super_block *super); -+extern __u64 reiser4_block_count(const struct super_block *super); -+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr); -+extern __u64 reiser4_data_blocks(const struct super_block *super); -+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr); -+extern __u64 reiser4_free_blocks(const struct super_block *super); -+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr); -+extern __u32 reiser4_mkfs_id(const struct super_block *super); -+ -+extern __u64 reiser4_free_committed_blocks(const struct super_block *super); -+ -+extern __u64 reiser4_grabbed_blocks(const struct super_block *); -+extern __u64 reiser4_fake_allocated(const struct super_block *); -+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *); -+extern __u64 reiser4_clustered_blocks(const struct super_block *); -+ -+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid, -+ gid_t gid); -+ -+extern reiser4_space_allocator * -+reiser4_get_space_allocator(const struct super_block *super); -+extern reiser4_oid_allocator * -+reiser4_get_oid_allocator(const struct super_block *super); -+extern struct inode *reiser4_get_super_fake(const struct super_block *super); -+extern struct inode *reiser4_get_cc_fake(const struct super_block *super); -+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super); -+extern reiser4_tree *reiser4_get_tree(const struct super_block *super); -+extern int is_reiser4_super(const struct super_block *super); -+ -+extern int 
reiser4_blocknr_is_sane(const reiser4_block_nr * blk); -+extern int reiser4_blocknr_is_sane_for(const struct super_block *super, -+ const reiser4_block_nr * blk); -+extern int reiser4_fill_super(struct super_block *s, void *data, int silent); -+extern int reiser4_done_super(struct super_block *s); -+ -+/* steps of fill super */ -+extern int reiser4_init_fs_info(struct super_block *); -+extern void reiser4_done_fs_info(struct super_block *); -+extern int reiser4_init_super_data(struct super_block *, char *opt_string); -+extern int reiser4_init_read_super(struct super_block *, int silent); -+extern int reiser4_init_root_inode(struct super_block *); -+extern reiser4_plugin *get_default_plugin(pset_member memb); -+ -+/* Maximal possible object id. */ -+#define ABSOLUTE_MAX_OID ((oid_t)~0) -+ -+#define OIDS_RESERVED (1 << 16) -+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next); -+oid_t oid_allocate(struct super_block *); -+int oid_release(struct super_block *, oid_t); -+oid_t oid_next(const struct super_block *); -+void oid_count_allocated(void); -+void oid_count_released(void); -+long oids_used(const struct super_block *); -+ -+#if REISER4_DEBUG -+void print_fs_info(const char *prefix, const struct super_block *); -+#endif -+ -+extern void destroy_reiser4_cache(struct kmem_cache **); -+ -+extern struct super_operations reiser4_super_operations; -+extern struct export_operations reiser4_export_operations; -+extern struct dentry_operations reiser4_dentry_operations; -+ -+/* __REISER4_SUPER_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/super_ops.c linux-2.6.30/fs/reiser4/super_ops.c ---- linux-2.6.30.orig/fs/reiser4/super_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/super_ops.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,725 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "flush.h" -+#include "safe_link.h" -+ -+#include <linux/vfs.h> -+#include <linux/writeback.h> -+#include <linux/mount.h> -+#include <linux/seq_file.h> -+#include <linux/debugfs.h> -+ -+/* slab cache for inodes */ -+static struct kmem_cache *inode_cache; -+ -+static struct dentry *reiser4_debugfs_root = NULL; -+ -+/** -+ * init_once - constructor for reiser4 inodes -+ * @cache: cache @obj belongs to -+ * @obj: inode to be initialized -+ * -+ * Initialization function to be called when a new object is allocated by the -+ * reiser4 inode cache. It is set on inode cache creation. -+ */ -+static void init_once(void *obj) -+{ -+ struct reiser4_inode_object *info; -+ -+ info = obj; -+ -+ /* initialize vfs inode */ -+ inode_init_once(&info->vfs_inode); -+ -+ /* -+ * initialize reiser4 specific part of inode. -+ * NOTE-NIKITA add here initializations for locks, list heads, -+ * etc. that will be added to our private inode part. -+ */ -+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode)); -+ init_rwsem(&info->p.conv_sem); -+ /* init semaphore which is used during inode loading */ -+ loading_init_once(&info->p); -+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p), -+ GFP_ATOMIC); -+#if REISER4_DEBUG -+ info->p.nr_jnodes = 0; -+#endif -+} -+ -+/** -+ * init_inodes - create inode cache -+ * -+ * Initializes slab cache of inodes.
It is part of reiser4 module initialization -+ */ -+static int init_inodes(void) -+{ -+ inode_cache = kmem_cache_create("reiser4_inode", -+ sizeof(struct reiser4_inode_object), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, init_once); -+ if (inode_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * done_inodes - delete inode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+static void done_inodes(void) -+{ -+ destroy_reiser4_cache(&inode_cache); -+} -+ -+/** -+ * reiser4_alloc_inode - alloc_inode of super operations -+ * @super: super block new inode is allocated for -+ * -+ * Allocates new inode, initializes reiser4 specific part of it. -+ */ -+static struct inode *reiser4_alloc_inode(struct super_block *super) -+{ -+ struct reiser4_inode_object *obj; -+ -+ assert("nikita-1696", super != NULL); -+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get()); -+ if (obj != NULL) { -+ reiser4_inode *info; -+ -+ info = &obj->p; -+ -+ info->pset = plugin_set_get_empty(); -+ info->hset = plugin_set_get_empty(); -+ info->extmask = 0; -+ info->locality_id = 0ull; -+ info->plugin_mask = 0; -+ info->heir_mask = 0; -+#if !REISER4_INO_IS_OID -+ info->oid_hi = 0; -+#endif -+ reiser4_seal_init(&info->sd_seal, NULL, NULL); -+ coord_init_invalid(&info->sd_coord, NULL); -+ info->flags = 0; -+ spin_lock_init(&info->guard); -+ /* this deals with info's loading semaphore */ -+ loading_alloc(info); -+ info->vroot = UBER_TREE_ADDR; -+ return &obj->vfs_inode; -+ } else -+ return NULL; -+} -+ -+/** -+ * reiser4_destroy_inode - destroy_inode of super operations -+ * @inode: inode being destroyed -+ * -+ * Puts reiser4 specific portion of inode, frees memory occupied by inode. -+ */ -+static void reiser4_destroy_inode(struct inode *inode) -+{ -+ reiser4_inode *info; -+ -+ info = reiser4_inode_data(inode); -+ -+ assert("vs-1220", inode_has_no_jnodes(info)); -+ -+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) { -+ file_plugin *fplug = inode_file_plugin(inode); -+ if (fplug->destroy_inode != NULL) -+ fplug->destroy_inode(inode); -+ } -+ reiser4_dispose_cursors(inode); -+ if (info->pset) -+ plugin_set_put(info->pset); -+ if (info->hset) -+ plugin_set_put(info->hset); -+ -+ /* -+ * cannot add similar assertion about ->i_list as prune_icache return -+ * inode into slab with dangling ->list.{next,prev}. This is safe, -+ * because they are re-initialized in the new_inode(). -+ */ -+ assert("nikita-2895", list_empty(&inode->i_dentry)); -+ assert("nikita-2896", hlist_unhashed(&inode->i_hash)); -+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode))); -+ -+ /* this deals with info's loading semaphore */ -+ loading_destroy(info); -+ -+ kmem_cache_free(inode_cache, -+ container_of(info, struct reiser4_inode_object, p)); -+} -+ -+/** -+ * reiser4_dirty_inode - dirty_inode of super operations -+ * @inode: inode being dirtied -+ * -+ * Updates stat data. 
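reiser4_alloc_inode()/reiser4_destroy_inode() above use the standard trick of embedding the VFS inode inside a larger filesystem-private object, so both parts come from one slab allocation. A stripped-down sketch of that pattern (hypothetical myfs_* names; myfs_inode_cache would be created with kmem_cache_create() at module init):

    struct myfs_inode_object {
            int private_state;       /* fs-specific part */
            struct inode vfs_inode;  /* part handed out to the VFS */
    };

    static struct kmem_cache *myfs_inode_cache;

    static struct inode *myfs_alloc_inode(struct super_block *sb)
    {
            struct myfs_inode_object *obj;

            obj = kmem_cache_alloc(myfs_inode_cache, GFP_KERNEL);
            if (obj == NULL)
                    return NULL;
            obj->private_state = 0;
            return &obj->vfs_inode;
    }

    static void myfs_destroy_inode(struct inode *inode)
    {
            /* recover the enclosing object from the embedded inode */
            kmem_cache_free(myfs_inode_cache,
                            container_of(inode, struct myfs_inode_object,
                                         vfs_inode));
    }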
-+ */ -+static void reiser4_dirty_inode(struct inode *inode) -+{ -+ int result; -+ -+ if (!is_in_reiser4_context()) -+ return; -+ assert("", !IS_RDONLY(inode)); -+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <= -+ get_current_context()->grabbed_blocks)); -+ -+ result = reiser4_update_sd(inode); -+ if (result) -+ warning("", "failed to dirty inode for %llu: %d", -+ get_inode_oid(inode), result); -+} -+ -+/** -+ * reiser4_delete_inode - delete_inode of super operations -+ * @inode: inode to delete -+ * -+ * Calls file plugin's delete_object method to delete object items from -+ * filesystem tree and calls clear_inode. -+ */ -+static void reiser4_delete_inode(struct inode *inode) -+{ -+ reiser4_context *ctx; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ warning("vs-15", "failed to init context"); -+ return; -+ } -+ -+ if (is_inode_loaded(inode)) { -+ fplug = inode_file_plugin(inode); -+ if (fplug != NULL && fplug->delete_object != NULL) -+ fplug->delete_object(inode); -+ } -+ -+ truncate_inode_pages(&inode->i_data, 0); -+ inode->i_blocks = 0; -+ clear_inode(inode); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_put_super - put_super of super operations -+ * @super: super block to free -+ * -+ * Stops daemons, releases resources; umounts, in short. -+ */ -+static void reiser4_put_super(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ reiser4_context *ctx; -+ -+ sbinfo = get_super_private(super); -+ assert("vs-1699", sbinfo); -+ -+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count); -+ debugfs_remove(sbinfo->tmgr.debugfs_id_count); -+ debugfs_remove(sbinfo->debugfs_root); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-17", "failed to init context"); -+ return; -+ } -+ -+ /* have disk format plugin free its resources */ -+ if (get_super_private(super)->df_plug->release) -+ get_super_private(super)->df_plug->release(super); -+ -+ reiser4_done_formatted_fake(super); -+ -+ /* stop daemons: ktxnmgr and entd */ -+ reiser4_done_entd(super); -+ reiser4_done_ktxnmgrd(super); -+ reiser4_done_txnmgr(&sbinfo->tmgr); -+ -+ reiser4_done_fs_info(super); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_write_super - write_super of super operations -+ * @super: super block to write -+ * -+ * Captures znode associated with super block, commits all transactions. -+ */ -+static void reiser4_write_super(struct super_block *super) -+{ -+ int ret; -+ reiser4_context *ctx; -+ -+ assert("vs-1700", !rofs_super(super)); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-16", "failed to init context"); -+ return; -+ } -+ -+ ret = reiser4_capture_super_block(super); -+ if (ret != 0) -+ warning("vs-1701", -+ "reiser4_capture_super_block failed in write_super: %d", -+ ret); -+ ret = txnmgr_force_commit_all(super, 0); -+ if (ret != 0) -+ warning("jmacd-77113", -+ "txn_force failed in write_super: %d", ret); -+ -+ super->s_dirt = 0; -+ -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_statfs - statfs of super operations -+ * @super: super block of file system being queried -+ * @statfs: buffer to fill with statistics -+ * -+ * Returns information about filesystem.
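To make the reservation arithmetic of the function below concrete, a worked example with assumed round numbers (a 1,000,000-block filesystem, 5% reserved, 10,000 blocks reserved for root, nothing pending deletion):

    total    = 1,000,000   reserved = 50,000   forroot = 10,000
    free     = 200,000 (working counter) + 0 (deleted) = 200,000

    f_blocks = total - reserved  = 950,000
    f_bfree  = free - reserved   = 150,000  (clamped to 0 if free <= reserved)
    f_bavail = f_bfree - forroot = 140,000  (clamped to 0 if f_bfree <= forroot)

The two clamps mirror the if/else pairs in the code; they only come into play when the filesystem is nearly full.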
-+ */ -+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs) -+{ -+ sector_t total; -+ sector_t reserved; -+ sector_t free; -+ sector_t forroot; -+ sector_t deleted; -+ reiser4_context *ctx; -+ struct super_block *super = dentry->d_sb; -+ -+ assert("nikita-408", super != NULL); -+ assert("nikita-409", statfs != NULL); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ statfs->f_type = reiser4_statfs_type(super); -+ statfs->f_bsize = super->s_blocksize; -+ -+ /* -+ * 5% of total block space is reserved. This is needed for flush and -+ * for truncates (so that we are able to perform truncate/unlink even -+ * on the otherwise completely full file system). If this reservation -+ * is hidden from statfs(2), users will mistakenly guess that they -+ * have enough free space to complete some operation, which is -+ * frustrating. -+ * -+ * Another possible solution is to subtract ->blocks_reserved from -+ * ->f_bfree, but changing available space seems less intrusive than -+ * letting the user see 5% of disk space used directly after -+ * mkfs. -+ */ -+ total = reiser4_block_count(super); -+ reserved = get_super_private(super)->blocks_reserved; -+ deleted = txnmgr_count_deleted_blocks(); -+ free = reiser4_free_blocks(super) + deleted; -+ forroot = reiser4_reserved_blocks(super, 0, 0); -+ -+ /* -+ * These counters may be in inconsistent state because we take the -+ * values without keeping any global spinlock. Here we do a sanity -+ * check that free block counter does not exceed the number of all -+ * blocks. -+ */ -+ if (free > total) -+ free = total; -+ statfs->f_blocks = total - reserved; -+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */ -+ if (free > reserved) -+ free -= reserved; -+ else -+ free = 0; -+ statfs->f_bfree = free; -+ -+ if (free > forroot) -+ free -= forroot; -+ else -+ free = 0; -+ statfs->f_bavail = free; -+ -+ statfs->f_files = 0; -+ statfs->f_ffree = 0; -+ -+ /* maximal acceptable name length depends on directory plugin. */ -+ assert("nikita-3351", super->s_root->d_inode != NULL); -+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/** -+ * reiser4_clear_inode - clear_inode of super operations -+ * @inode: inode about to be destroyed -+ * -+ * Does sanity checks: inode being destroyed should have all jnodes detached. -+ */ -+static void reiser4_clear_inode(struct inode *inode) -+{ -+#if REISER4_DEBUG -+ reiser4_inode *r4_inode; -+ -+ r4_inode = reiser4_inode_data(inode); -+ if (!inode_has_no_jnodes(r4_inode)) -+ warning("vs-1732", "reiser4 inode has %ld jnodes\n", -+ r4_inode->nr_jnodes); -+#endif -+} -+ -+/** -+ * reiser4_sync_inodes - sync_inodes of super operations -+ * @super: -+ * @wbc: -+ * -+ * This method is called by background and non-background writeback. Reiser4's -+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for -+ * each of the dirty inodes. reiser4_writepages handles pages dirtied via shared -+ * mapping - dirty pages get into atoms. Writeout is called to flush some -+ * atoms.
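The two-phase shape of reiser4_sync_inodes() below, in isolation (the helper names are hypothetical; struct writeback_control and its nr_to_write field are the real kernel type and field):

    static void sync_all(struct super_block *sb, struct writeback_control *wbc)
    {
            long budget = wbc->nr_to_write;

            /* phase 1: hand dirty inodes to ->writepages() so dirty
               pages are absorbed into transactions (atoms) */
            absorb_dirty_pages(sb, wbc);

            /* phase 1 consumed the budget; restore it for phase 2 */
            wbc->nr_to_write = budget;

            /* phase 2: flush whole transactions to disk */
            flush_transactions(sb, wbc);
    }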
-+ */ -+static void reiser4_sync_inodes(struct super_block *super, -+ struct writeback_control *wbc) -+{ -+ reiser4_context *ctx; -+ long to_write; -+ -+ if (wbc->for_kupdate) -+ /* reiser4 has its own means of periodical write-out */ -+ return; -+ -+ to_write = wbc->nr_to_write; -+ assert("vs-49", wbc->older_than_this == NULL); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-13", "failed to init context"); -+ return; -+ } -+ -+ /* -+ * call reiser4_writepages for each of dirty inodes to turn dirty pages -+ * into transactions if they were not yet. -+ */ -+ generic_sync_sb_inodes(super, wbc); -+ -+ /* flush goes here */ -+ wbc->nr_to_write = to_write; -+ reiser4_writeout(super, wbc); -+ -+ /* avoid recursive calls to ->sync_inodes */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_show_options - show_options of super operations -+ * @m: file where to write information -+ * @mnt: mount structure -+ * -+ * Makes reiser4 mount options visible in /proc/mounts. -+ */ -+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt) -+{ -+ struct super_block *super; -+ reiser4_super_info_data *sbinfo; -+ -+ super = mnt->mnt_sb; -+ sbinfo = get_super_private(super); -+ -+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size); -+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age); -+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size); -+ seq_printf(m, ",atom_max_flushers=0x%x", -+ sbinfo->tmgr.atom_max_flushers); -+ seq_printf(m, ",cbk_cache_slots=0x%x", -+ sbinfo->tree.cbk_cache.nr_slots); -+ -+ return 0; -+} -+ -+struct super_operations reiser4_super_operations = { -+ .alloc_inode = reiser4_alloc_inode, -+ .destroy_inode = reiser4_destroy_inode, -+ .dirty_inode = reiser4_dirty_inode, -+ .delete_inode = reiser4_delete_inode, -+ .put_super = reiser4_put_super, -+ .write_super = reiser4_write_super, -+ .statfs = reiser4_statfs, -+ .clear_inode = reiser4_clear_inode, -+ .sync_inodes = reiser4_sync_inodes, -+ .show_options = reiser4_show_options -+}; -+ -+/** -+ * fill_super - initialize super block on mount -+ * @super: super block to fill -+ * @data: reiser4 specific mount option -+ * @silent: -+ * -+ * This is to be called by reiser4_get_sb. Mounts filesystem. 
-+ */ -+static int fill_super(struct super_block *super, void *data, int silent) -+{ -+ reiser4_context ctx; -+ int result; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-989", super != NULL); -+ -+ super->s_op = NULL; -+ init_stack_context(&ctx, super); -+ -+ /* allocate reiser4 specific super block */ -+ if ((result = reiser4_init_fs_info(super)) != 0) -+ goto failed_init_sinfo; -+ -+ sbinfo = get_super_private(super); -+ /* initialize various reiser4 parameters, parse mount options */ -+ if ((result = reiser4_init_super_data(super, data)) != 0) -+ goto failed_init_super_data; -+ -+ /* read reiser4 master super block, initialize disk format plugin */ -+ if ((result = reiser4_init_read_super(super, silent)) != 0) -+ goto failed_init_read_super; -+ -+ /* initialize transaction manager */ -+ reiser4_init_txnmgr(&sbinfo->tmgr); -+ -+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */ -+ if ((result = reiser4_init_ktxnmgrd(super)) != 0) -+ goto failed_init_ktxnmgrd; -+ -+ /* initialize entd context and start kernel thread entd */ -+ if ((result = reiser4_init_entd(super)) != 0) -+ goto failed_init_entd; -+ -+ /* initialize address spaces for formatted nodes and bitmaps */ -+ if ((result = reiser4_init_formatted_fake(super)) != 0) -+ goto failed_init_formatted_fake; -+ -+ /* initialize disk format plugin */ -+ if ((result = get_super_private(super)->df_plug->init_format(super, -+ data)) != 0) -+ goto failed_init_disk_format; -+ -+ /* -+ * There are some 'committed' versions of reiser4 super block counters, -+ * which correspond to reiser4 on-disk state. These counters are -+ * initialized here -+ */ -+ sbinfo->blocks_free_committed = sbinfo->blocks_free; -+ sbinfo->nr_files_committed = oids_used(super); -+ -+ /* get inode of root directory */ -+ if ((result = reiser4_init_root_inode(super)) != 0) -+ goto failed_init_root_inode; -+ -+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0) -+ goto failed_update_format_version; -+ -+ process_safelinks(super); -+ reiser4_exit_context(&ctx); -+ -+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id, -+ reiser4_debugfs_root); -+ if (sbinfo->debugfs_root) { -+ sbinfo->tmgr.debugfs_atom_count = -+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR, -+ sbinfo->debugfs_root, -+ &sbinfo->tmgr.atom_count); -+ sbinfo->tmgr.debugfs_id_count = -+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR, -+ sbinfo->debugfs_root, -+ &sbinfo->tmgr.id_count); -+ } -+ return 0; -+ -+ failed_update_format_version: -+ failed_init_root_inode: -+ if (sbinfo->df_plug->release) -+ sbinfo->df_plug->release(super); -+ failed_init_disk_format: -+ reiser4_done_formatted_fake(super); -+ failed_init_formatted_fake: -+ reiser4_done_entd(super); -+ failed_init_entd: -+ reiser4_done_ktxnmgrd(super); -+ failed_init_ktxnmgrd: -+ reiser4_done_txnmgr(&sbinfo->tmgr); -+ failed_init_read_super: -+ failed_init_super_data: -+ reiser4_done_fs_info(super); -+ failed_init_sinfo: -+ reiser4_exit_context(&ctx); -+ return result; -+} -+ -+/** -+ * reiser4_get_sb - get_sb of file_system_type operations -+ * @fs_type: -+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc -+ * @dev_name: block device file name -+ * @data: specific mount options -+ * -+ * Reiser4 mount entry. 
-+ */ -+static int reiser4_get_sb(struct file_system_type *fs_type, int flags, -+ const char *dev_name, void *data, struct vfsmount *mnt) -+{ -+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); -+} -+ -+/* structure describing the reiser4 filesystem implementation */ -+static struct file_system_type reiser4_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "reiser4", -+ .fs_flags = FS_REQUIRES_DEV, -+ .get_sb = reiser4_get_sb, -+ .kill_sb = kill_block_super, -+ .next = NULL -+}; -+ -+void destroy_reiser4_cache(struct kmem_cache **cachep) -+{ -+ BUG_ON(*cachep == NULL); -+ kmem_cache_destroy(*cachep); -+ *cachep = NULL; -+} -+ -+/** -+ * init_reiser4 - reiser4 initialization entry point -+ * -+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called -+ * on kernel initialization or during reiser4 module load. -+ */ -+static int __init init_reiser4(void) -+{ -+ int result; -+ -+ printk(KERN_INFO -+ "Loading Reiser4. " -+ "See www.namesys.com for a description of Reiser4.\n"); -+ -+ /* initialize slab cache of inodes */ -+ if ((result = init_inodes()) != 0) -+ goto failed_inode_cache; -+ -+ /* initialize cache of znodes */ -+ if ((result = init_znodes()) != 0) -+ goto failed_init_znodes; -+ -+ /* initialize all plugins */ -+ if ((result = init_plugins()) != 0) -+ goto failed_init_plugins; -+ -+ /* initialize cache of plugin_set-s and plugin_set's hash table */ -+ if ((result = init_plugin_set()) != 0) -+ goto failed_init_plugin_set; -+ -+ /* initialize caches of txn_atom-s and txn_handle-s */ -+ if ((result = init_txnmgr_static()) != 0) -+ goto failed_init_txnmgr_static; -+ -+ /* initialize cache of jnodes */ -+ if ((result = init_jnodes()) != 0) -+ goto failed_init_jnodes; -+ -+ /* initialize cache of flush queues */ -+ if ((result = reiser4_init_fqs()) != 0) -+ goto failed_init_fqs; -+ -+ /* initialize cache of structures attached to dentry->d_fsdata */ -+ if ((result = reiser4_init_dentry_fsdata()) != 0) -+ goto failed_init_dentry_fsdata; -+ -+ /* initialize cache of structures attached to file->private_data */ -+ if ((result = reiser4_init_file_fsdata()) != 0) -+ goto failed_init_file_fsdata; -+ -+ /* -+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for -+ * more details -+ */ -+ if ((result = reiser4_init_d_cursor()) != 0) -+ goto failed_init_d_cursor; -+ -+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) { -+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL); -+ return 0; -+ } -+ -+ reiser4_done_d_cursor(); -+ failed_init_d_cursor: -+ reiser4_done_file_fsdata(); -+ failed_init_file_fsdata: -+ reiser4_done_dentry_fsdata(); -+ failed_init_dentry_fsdata: -+ reiser4_done_fqs(); -+ failed_init_fqs: -+ done_jnodes(); -+ failed_init_jnodes: -+ done_txnmgr_static(); -+ failed_init_txnmgr_static: -+ done_plugin_set(); -+ failed_init_plugin_set: -+ failed_init_plugins: -+ done_znodes(); -+ failed_init_znodes: -+ done_inodes(); -+ failed_inode_cache: -+ return result; -+} -+ -+/** -+ * done_reiser4 - reiser4 exit entry point -+ * -+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown -+ * or at module unload. 
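init_reiser4() above is a textbook instance of staged initialization: every init step gets a matching cleanup label, and the labels run in reverse order on failure. The bare pattern, as a sketch with hypothetical myfs_* names around the real register_filesystem() call:

    static int __init myfs_init(void)
    {
            int ret;

            ret = myfs_init_caches();                  /* step 1 */
            if (ret != 0)
                    goto fail_caches;

            ret = register_filesystem(&myfs_fs_type);  /* step 2 */
            if (ret != 0)
                    goto fail_register;

            return 0;

    fail_register:
            myfs_done_caches();                        /* undo step 1 */
    fail_caches:
            return ret;
    }

Each added step costs one call and one label, keeping teardown symmetric with setup by construction.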
-+ */ -+static void __exit done_reiser4(void) -+{ -+ int result; -+ -+ debugfs_remove(reiser4_debugfs_root); -+ result = unregister_filesystem(&reiser4_fs_type); -+ BUG_ON(result != 0); -+ reiser4_done_d_cursor(); -+ reiser4_done_file_fsdata(); -+ reiser4_done_dentry_fsdata(); -+ reiser4_done_fqs(); -+ done_jnodes(); -+ done_txnmgr_static(); -+ done_plugin_set(); -+ done_znodes(); -+ destroy_reiser4_cache(&inode_cache); -+} -+ -+module_init(init_reiser4); -+module_exit(done_reiser4); -+ -+MODULE_DESCRIPTION("Reiser4 filesystem"); -+MODULE_AUTHOR("Hans Reiser Reiser@Namesys.COM"); -+ -+MODULE_LICENSE("GPL"); -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/tap.c linux-2.6.30/fs/reiser4/tap.c ---- linux-2.6.30.orig/fs/reiser4/tap.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tap.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,376 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ Tree Access Pointer (tap). -+ -+ tap is data structure combining coord and lock handle (mostly). It is -+ useful when one has to scan tree nodes (for example, in readdir, or flush), -+ for tap functions allow to move tap in either direction transparently -+ crossing unit/item/node borders. -+ -+ Tap doesn't provide automatic synchronization of its fields as it is -+ supposed to be per-thread object. -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "coord.h" -+#include "tree.h" -+#include "context.h" -+#include "tap.h" -+#include "znode.h" -+#include "tree_walk.h" -+ -+#if REISER4_DEBUG -+static int tap_invariant(const tap_t *tap); -+static void tap_check(const tap_t *tap); -+#else -+#define tap_check(tap) noop -+#endif -+ -+/** load node tap is pointing to, if not loaded already */ -+int reiser4_tap_load(tap_t *tap) -+{ -+ tap_check(tap); -+ if (tap->loaded == 0) { -+ int result; -+ -+ result = zload_ra(tap->coord->node, &tap->ra_info); -+ if (result != 0) -+ return result; -+ coord_clear_iplug(tap->coord); -+ } -+ ++tap->loaded; -+ tap_check(tap); -+ return 0; -+} -+ -+/** release node tap is pointing to. Dual to tap_load() */ -+void reiser4_tap_relse(tap_t *tap) -+{ -+ tap_check(tap); -+ if (tap->loaded > 0) { -+ --tap->loaded; -+ if (tap->loaded == 0) -+ zrelse(tap->coord->node); -+ } -+ tap_check(tap); -+} -+ -+/** -+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with -+ * @mode -+ */ -+void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh, -+ znode_lock_mode mode) -+{ -+ tap->coord = coord; -+ tap->lh = lh; -+ tap->mode = mode; -+ tap->loaded = 0; -+ INIT_LIST_HEAD(&tap->linkage); -+ reiser4_init_ra_info(&tap->ra_info); -+} -+ -+/** add @tap to the per-thread list of all taps */ -+void reiser4_tap_monitor(tap_t *tap) -+{ -+ assert("nikita-2623", tap != NULL); -+ tap_check(tap); -+ list_add(&tap->linkage, reiser4_taps_list()); -+ tap_check(tap); -+} -+ -+/* duplicate @src into @dst. Copy lock handle. @dst is not initially -+ * loaded. 
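reiser4_tap_load()/reiser4_tap_relse() above keep a small per-tap load count: the node is actually pinned only on the 0 -> 1 transition and unpinned on 1 -> 0, so nested load/release pairs are cheap. The idiom in isolation (a sketch; struct node and the pin()/unpin() primitives are hypothetical):

    struct node;

    struct handle {
            int loaded;          /* load() calls minus release() calls */
            struct node *node;
    };

    static int handle_load(struct handle *h)
    {
            if (h->loaded == 0) {
                    int ret = pin(h->node);  /* first user pins the node */
                    if (ret != 0)
                            return ret;
            }
            ++h->loaded;
            return 0;
    }

    static void handle_release(struct handle *h)
    {
            if (h->loaded > 0 && --h->loaded == 0)
                    unpin(h->node);          /* last user unpins */
    }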
*/ -+void reiser4_tap_copy(tap_t *dst, tap_t *src) -+{ -+ assert("nikita-3193", src != NULL); -+ assert("nikita-3194", dst != NULL); -+ -+ *dst->coord = *src->coord; -+ if (src->lh->node) -+ copy_lh(dst->lh, src->lh); -+ dst->mode = src->mode; -+ dst->loaded = 0; -+ INIT_LIST_HEAD(&dst->linkage); -+ dst->ra_info = src->ra_info; -+} -+ -+/** finish with @tap */ -+void reiser4_tap_done(tap_t *tap) -+{ -+ assert("nikita-2565", tap != NULL); -+ tap_check(tap); -+ if (tap->loaded > 0) -+ zrelse(tap->coord->node); -+ done_lh(tap->lh); -+ tap->loaded = 0; -+ list_del_init(&tap->linkage); -+ tap->coord->node = NULL; -+} -+ -+/** -+ * move @tap to the new node, locked with @target. Load @target, if @tap was -+ * already loaded. -+ */ -+int reiser4_tap_move(tap_t *tap, lock_handle * target) -+{ -+ int result = 0; -+ -+ assert("nikita-2567", tap != NULL); -+ assert("nikita-2568", target != NULL); -+ assert("nikita-2570", target->node != NULL); -+ assert("nikita-2569", tap->coord->node == tap->lh->node); -+ -+ tap_check(tap); -+ if (tap->loaded > 0) -+ result = zload_ra(target->node, &tap->ra_info); -+ -+ if (result == 0) { -+ if (tap->loaded > 0) -+ zrelse(tap->coord->node); -+ done_lh(tap->lh); -+ copy_lh(tap->lh, target); -+ tap->coord->node = target->node; -+ coord_clear_iplug(tap->coord); -+ } -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to @target. Acquire lock on @target, if @tap was already -+ * loaded. -+ */ -+static int tap_to(tap_t *tap, znode * target) -+{ -+ int result; -+ -+ assert("nikita-2624", tap != NULL); -+ assert("nikita-2625", target != NULL); -+ -+ tap_check(tap); -+ result = 0; -+ if (tap->coord->node != target) { -+ lock_handle here; -+ -+ init_lh(&here); -+ result = longterm_lock_znode(&here, target, -+ tap->mode, ZNODE_LOCK_HIPRI); -+ if (result == 0) { -+ result = reiser4_tap_move(tap, &here); -+ done_lh(&here); -+ } -+ } -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to given @target, loading and locking @target->node if -+ * necessary -+ */ -+int tap_to_coord(tap_t *tap, coord_t *target) -+{ -+ int result; -+ -+ tap_check(tap); -+ result = tap_to(tap, target->node); -+ if (result == 0) -+ coord_dup(tap->coord, target); -+ tap_check(tap); -+ return result; -+} -+ -+/** return list of all taps */ -+struct list_head *reiser4_taps_list(void) -+{ -+ return &get_current_context()->taps; -+} -+ -+/** helper function for go_{next,prev}_{item,unit,node}() */ -+int go_dir_el(tap_t *tap, sideof dir, int units_p) -+{ -+ coord_t dup; -+ coord_t *coord; -+ int result; -+ -+ int (*coord_dir) (coord_t *); -+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int); -+ void (*coord_init) (coord_t *, const znode *); -+ ON_DEBUG(int (*coord_check) (const coord_t *)); -+ -+ assert("nikita-2556", tap != NULL); -+ assert("nikita-2557", tap->coord != NULL); -+ assert("nikita-2558", tap->lh != NULL); -+ assert("nikita-2559", tap->coord->node != NULL); -+ -+ tap_check(tap); -+ if (dir == LEFT_SIDE) { -+ coord_dir = units_p ? coord_prev_unit : coord_prev_item; -+ get_dir_neighbor = reiser4_get_left_neighbor; -+ coord_init = coord_init_last_unit; -+ } else { -+ coord_dir = units_p ? coord_next_unit : coord_next_item; -+ get_dir_neighbor = reiser4_get_right_neighbor; -+ coord_init = coord_init_first_unit; -+ } -+ ON_DEBUG(coord_check = -+ units_p ? 
coord_is_existing_unit : coord_is_existing_item); -+ assert("nikita-2560", coord_check(tap->coord)); -+ -+ coord = tap->coord; -+ coord_dup(&dup, coord); -+ if (coord_dir(&dup) != 0) { -+ do { -+ /* move to the left neighboring node */ -+ lock_handle dup; -+ -+ init_lh(&dup); -+ result = -+ get_dir_neighbor(&dup, coord->node, (int)tap->mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == 0) { -+ result = reiser4_tap_move(tap, &dup); -+ if (result == 0) -+ coord_init(tap->coord, dup.node); -+ done_lh(&dup); -+ } -+ /* skip empty nodes */ -+ } while ((result == 0) && node_is_empty(coord->node)); -+ } else { -+ result = 0; -+ coord_dup(coord, &dup); -+ } -+ assert("nikita-2564", ergo(!result, coord_check(tap->coord))); -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to the next unit, transparently crossing item and node -+ * boundaries -+ */ -+int go_next_unit(tap_t *tap) -+{ -+ return go_dir_el(tap, RIGHT_SIDE, 1); -+} -+ -+/** -+ * move @tap to the previous unit, transparently crossing item and node -+ * boundaries -+ */ -+int go_prev_unit(tap_t *tap) -+{ -+ return go_dir_el(tap, LEFT_SIDE, 1); -+} -+ -+/** -+ * @shift times apply @actor to the @tap. This is used to move @tap by -+ * @shift units (or items, or nodes) in either direction. -+ */ -+static int rewind_to(tap_t *tap, go_actor_t actor, int shift) -+{ -+ int result; -+ -+ assert("nikita-2555", shift >= 0); -+ assert("nikita-2562", tap->coord->node == tap->lh->node); -+ -+ tap_check(tap); -+ result = reiser4_tap_load(tap); -+ if (result != 0) -+ return result; -+ -+ for (; shift > 0; --shift) { -+ result = actor(tap); -+ assert("nikita-2563", tap->coord->node == tap->lh->node); -+ if (result != 0) -+ break; -+ } -+ reiser4_tap_relse(tap); -+ tap_check(tap); -+ return result; -+} -+ -+/** move @tap @shift units rightward */ -+int rewind_right(tap_t *tap, int shift) -+{ -+ return rewind_to(tap, go_next_unit, shift); -+} -+ -+/** move @tap @shift units leftward */ -+int rewind_left(tap_t *tap, int shift) -+{ -+ return rewind_to(tap, go_prev_unit, shift); -+} -+ -+#if REISER4_DEBUG -+/** debugging function: print @tap content in human readable form */ -+static void print_tap(const char *prefix, const tap_t *tap) -+{ -+ if (tap == NULL) { -+ printk("%s: null tap\n", prefix); -+ return; -+ } -+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix, -+ tap->loaded, (&tap->linkage == tap->linkage.next && -+ &tap->linkage == tap->linkage.prev), -+ tap->lh->node, -+ lock_mode_name(tap->mode)); -+ print_coord("\tcoord", tap->coord, 0); -+} -+ -+/** check [tap-sane] invariant */ -+static int tap_invariant(const tap_t *tap) -+{ -+ /* [tap-sane] invariant */ -+ -+ if (tap == NULL) -+ return 1; -+ /* tap->mode is one of -+ * -+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and -+ */ -+ if (tap->mode != ZNODE_NO_LOCK && -+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK) -+ return 2; -+ /* tap->coord != NULL, and */ -+ if (tap->coord == NULL) -+ return 3; -+ /* tap->lh != NULL, and */ -+ if (tap->lh == NULL) -+ return 4; -+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */ -+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node))) -+ return 5; -+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */ -+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node) -+ return 6; -+ return 0; -+} -+ -+/** debugging function: check internal @tap consistency */ -+static void tap_check(const tap_t *tap) -+{ -+ int result; -+ -+ result = tap_invariant(tap); -+ if (result != 
0) { -+ print_tap("broken", tap); -+ reiser4_panic("nikita-2831", "tap broken: %i\n", result); -+ } -+} -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/tap.h linux-2.6.30/fs/reiser4/tap.h ---- linux-2.6.30.orig/fs/reiser4/tap.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tap.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,70 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Tree Access Pointers. See tap.c for more details. */ -+ -+#if !defined(__REISER4_TAP_H__) -+#define __REISER4_TAP_H__ -+ -+#include "forward.h" -+#include "readahead.h" -+ -+/** -+ tree_access_pointer aka tap. Data structure combining coord_t and lock -+ handle. -+ Invariants involving this data-type, see doc/lock-ordering for details: -+ -+ [tap-sane] -+ */ -+struct tree_access_pointer { -+ /* coord tap is at */ -+ coord_t *coord; -+ /* lock handle on ->coord->node */ -+ lock_handle *lh; -+ /* mode of lock acquired by this tap */ -+ znode_lock_mode mode; -+ /* incremented by reiser4_tap_load(). -+ Decremented by reiser4_tap_relse(). */ -+ int loaded; -+ /* list of taps */ -+ struct list_head linkage; -+ /* read-ahead hint */ -+ ra_info_t ra_info; -+}; -+ -+typedef int (*go_actor_t) (tap_t *tap); -+ -+extern int reiser4_tap_load(tap_t *tap); -+extern void reiser4_tap_relse(tap_t *tap); -+extern void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh, -+ znode_lock_mode mode); -+extern void reiser4_tap_monitor(tap_t *tap); -+extern void reiser4_tap_copy(tap_t *dst, tap_t *src); -+extern void reiser4_tap_done(tap_t *tap); -+extern int reiser4_tap_move(tap_t *tap, lock_handle * target); -+extern int tap_to_coord(tap_t *tap, coord_t *target); -+ -+extern int go_dir_el(tap_t *tap, sideof dir, int units_p); -+extern int go_next_unit(tap_t *tap); -+extern int go_prev_unit(tap_t *tap); -+extern int rewind_right(tap_t *tap, int shift); -+extern int rewind_left(tap_t *tap, int shift); -+ -+extern struct list_head *reiser4_taps_list(void); -+ -+#define for_all_taps(tap) \ -+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \ -+ reiser4_taps_list() != &tap->linkage; \ -+ tap = list_entry(tap->linkage.next, tap_t, linkage)) -+ -+/* __REISER4_TAP_H__ */ -+#endif -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/tree.c linux-2.6.30/fs/reiser4/tree.c ---- linux-2.6.30.orig/fs/reiser4/tree.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tree.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1878 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * KEYS IN A TREE. -+ * -+ * The tree consists of nodes located on the disk. Node in the tree is either -+ * formatted or unformatted. Formatted node is one that has structure -+ * understood by the tree balancing and traversal code. Formatted nodes are -+ * further classified into leaf and internal nodes. Latter distinctions is -+ * (almost) of only historical importance: general structure of leaves and -+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data -+ * that are part of bodies of ordinary files and attributes. 
-+ * -+ * Each node in the tree spans some interval in the key space. Key ranges for -+ * all nodes in the tree are disjoint. Actually, this only holds in some weak -+ * sense, because of the non-unique keys: intersection of key ranges for -+ * different nodes is either empty, or consists of exactly one key. -+ * -+ * Formatted node consists of a sequence of items. Each item spans some -+ * interval in key space. Key ranges for all items in a tree are disjoint, -+ * modulo non-unique keys again. Items within nodes are ordered in the key -+ * order of the smallest key in an item. -+ * -+ * Particular type of item can be further split into units. A unit is a piece -+ * of an item that can be cut from the item and moved into another item of the -+ * same type. Units are used by balancing code to repack data during balancing. -+ * -+ * Unit can be further split into smaller entities (for example, extent unit -+ * represents several pages, and it is natural for extent code to operate on -+ * particular pages and even bytes within one unit), but this is of no -+ * relevance to the generic balancing and lookup code. -+ * -+ * Although an item is said to "span" a range or interval of keys, it is not -+ * necessary that the item contains a piece of data addressable by each and -+ * every key in this range. For example, compound directory item, consisting of -+ * units corresponding to directory entries and keyed by hashes of file names, -+ * looks more like having a "discrete spectrum": only some disjoint keys inside -+ * the range occupied by this item really address data. -+ * -+ * Nonetheless, each item always has a well-defined least (minimal) key, which -+ * is recorded in the item header, stored in the node this item is in. Also, an -+ * item plugin can optionally define method ->max_key_inside() returning the -+ * maximal key that can _possibly_ be located within this item. This method is -+ * used (mainly) to determine when a given piece of data should be merged into -+ * an existing item, instead of creating a new one. Because of this, even -+ * though ->max_key_inside() can be larger than any key actually located in the -+ * item, intervals -+ * -+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ] -+ * -+ * are still disjoint for all items within the _same_ node. -+ * -+ * In memory, a node is represented by a znode. It plays several roles: -+ * -+ * . something locks are taken on -+ * -+ * . something tracked by transaction manager (this is going to change) -+ * -+ * . something used to access node data -+ * -+ * . something used to maintain tree structure in memory: sibling and -+ * parental linkage. -+ * -+ * . something used to organize nodes into "slums" -+ * -+ * For more on znodes, see znode.[ch] -+ * -+ * DELIMITING KEYS -+ * -+ * To simplify balancing, allow some flexibility in locking and speed up -+ * important coord cache optimization, we keep delimiting keys of nodes in -+ * memory. Depending on disk format (implemented by appropriate node plugin) -+ * node on disk can record both left and right delimiting key, only one of -+ * them, or none. Still, our balancing and tree traversal code keep both -+ * delimiting keys for a node that is in memory stored in the znode. When a -+ * node is first brought into memory during tree traversal, its left -+ * delimiting key is taken from its parent, and its right delimiting key is -+ * either the next key in its parent, or is the right delimiting key of the -+ * parent if the node is the rightmost child of the parent.
-+ * -+ * Physical consistency of delimiting key is protected by special dk -+ * read-write lock. That is, delimiting keys can only be inspected or -+ * modified under this lock. But dk lock is only sufficient for fast -+ * "pessimistic" check, because to simplify code and to decrease lock -+ * contention, balancing (carry) only updates delimiting keys right before -+ * unlocking all locked nodes on the given tree level. For example, -+ * coord-by-key cache scans LRU list of recently accessed znodes. For each -+ * node it first does fast check under dk spin lock. If key looked for is -+ * not between delimiting keys for this node, next node is inspected and so -+ * on. If key is inside of the key range, long term lock is taken on node -+ * and key range is rechecked. -+ * -+ * COORDINATES -+ * -+ * To find something in the tree, you supply a key, and the key is resolved -+ * by coord_by_key() into a coord (coordinate) that is valid as long as the -+ * node the coord points to remains locked. As mentioned above trees -+ * consist of nodes that consist of items that consist of units. A unit is -+ * the smallest and indivisible piece of tree as far as balancing and tree -+ * search are concerned. Each node, item, and unit can be addressed by -+ * giving its level in the tree and the key occupied by this entity. A node -+ * knows what the key ranges are of the items within it, and how to find its -+ * items and invoke their item handlers, but it does not know how to access -+ * individual units within its items except through the item handlers. -+ * coord is a structure containing a pointer to the node, the ordinal number -+ * of the item within this node (a sort of item offset), and the ordinal -+ * number of the unit within this item. -+ * -+ * TREE LOOKUP -+ * -+ * There are two types of access to the tree: lookup and modification. -+ * -+ * Lookup is a search for the key in the tree. Search can look for either -+ * exactly the key given to it, or for the largest key that is not greater -+ * than the key given to it. This distinction is determined by "bias" -+ * parameter of search routine (coord_by_key()). coord_by_key() either -+ * returns error (key is not in the tree, or some kind of external error -+ * occurred), or successfully resolves key into coord. -+ * -+ * This resolution is done by traversing tree top-to-bottom from root level -+ * to the desired level. On levels above twig level (level one above the -+ * leaf level) nodes consist exclusively of internal items. Internal item is -+ * nothing more than pointer to the tree node on the child level. On twig -+ * level nodes consist of internal items intermixed with extent -+ * items. Internal items form normal search tree structure used by traversal -+ * to descent through the tree. -+ * -+ * TREE LOOKUP OPTIMIZATIONS -+ * -+ * Tree lookup described above is expensive even if all nodes traversed are -+ * already in the memory: for each node binary search within it has to be -+ * performed and binary searches are CPU consuming and tend to destroy CPU -+ * caches. -+ * -+ * Several optimizations are used to work around this: -+ * -+ * . cbk_cache (look-aside cache for tree traversals, see search.c for -+ * details) -+ * -+ * . seals (see seal.[ch]) -+ * -+ * . 
vroot (see search.c) -+ * -+ * General search-by-key is layered thusly: -+ * -+ * [check seal, if any] --ok--> done -+ * | -+ * failed -+ * | -+ * V -+ * [vroot defined] --no--> node = tree_root -+ * | | -+ * yes | -+ * | | -+ * V | -+ * node = vroot | -+ * | | -+ * | | -+ * | | -+ * V V -+ * [check cbk_cache for key] --ok--> done -+ * | -+ * failed -+ * | -+ * V -+ * [start tree traversal from node] -+ * -+ */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/static_stat.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "tap.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "page_cache.h" -+#include "super.h" -+#include "reiser4.h" -+#include "inode.h" -+ -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/spinlock.h> -+ -+/* Disk address (block number) never ever used for any real tree node. This is -+ used as block number of "uber" znode. -+ -+ Invalid block addresses are 0 by tradition. -+ -+*/ -+const reiser4_block_nr UBER_TREE_ADDR = 0ull; -+ -+#define CUT_TREE_MIN_ITERATIONS 64 -+ -+static int find_child_by_addr(znode * parent, znode * child, coord_t *result); -+ -+/* return node plugin of coord->node */ -+node_plugin *node_plugin_by_coord(const coord_t *coord) -+{ -+ assert("vs-1", coord != NULL); -+ assert("vs-2", coord->node != NULL); -+ -+ return coord->node->nplug; -+} -+ -+/* insert item into tree. Fields of @coord are updated so that they can be -+ * used by consequent insert operation. */ -+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item -+ * into */ , -+ const reiser4_key * key /* key of new item */ , -+ reiser4_item_data * data /* parameters for item -+ * creation */ , -+ coord_t *coord /* resulting insertion coord */ , -+ lock_handle * lh /* resulting lock -+ * handle */ , -+ tree_level stop_level /* level where to insert */ , -+ __u32 flags/* insertion flags */) -+{ -+ int result; -+ -+ assert("nikita-358", tree != NULL); -+ assert("nikita-360", coord != NULL); -+ -+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK, -+ FIND_EXACT, stop_level, stop_level, -+ flags | CBK_FOR_INSERT, NULL/*ra_info */); -+ switch (result) { -+ default: -+ break; -+ case CBK_COORD_FOUND: -+ result = IBK_ALREADY_EXISTS; -+ break; -+ case CBK_COORD_NOTFOUND: -+ assert("nikita-2017", coord->node != NULL); -+ result = insert_by_coord(coord, data, key, lh, 0/*flags */); -+ break; -+ } -+ return result; -+} -+ -+/* insert item by calling carry. 
Helper function called if short-cut -+ insertion failed */ -+static insert_result insert_with_carry_by_coord(coord_t *coord, -+ /* coord where to insert */ -+ lock_handle * lh, -+ /* lock handle of insertion node */ -+ reiser4_item_data * data, -+ /* parameters of new item */ -+ const reiser4_key * key, -+ /* key of new item */ -+ carry_opcode cop, -+ /* carry operation to perform */ -+ cop_insert_flag flags -+ /* carry flags */ ) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_insert_data *cdata; -+ carry_op *op; -+ -+ assert("umka-314", coord != NULL); -+ -+ /* allocate carry_pool and 3 carry_level-s */ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? PTR_ERR(op) : -EIO); -+ } -+ cdata = (carry_insert_data *) (lowest_level + 3); -+ cdata->coord = coord; -+ cdata->data = data; -+ cdata->key = key; -+ op->u.insert.d = cdata; -+ if (flags == 0) -+ flags = znode_get_tree(coord->node)->carry.insert_flags; -+ op->u.insert.flags = flags; -+ op->u.insert.type = COPT_ITEM_DATA; -+ op->u.insert.child = NULL; -+ if (lh != NULL) { -+ assert("nikita-3245", lh->node == coord->node); -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ } -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* form carry queue to perform paste of @data with @key at @coord, and launch -+ its execution by calling carry(). -+ -+ Instruct carry to update @lh it after balancing insertion coord moves into -+ different block. -+ -+*/ -+static int paste_with_carry(coord_t *coord, /* coord of paste */ -+ lock_handle * lh, /* lock handle of node -+ * where item is -+ * pasted */ -+ reiser4_item_data * data, /* parameters of new -+ * item */ -+ const reiser4_key * key, /* key of new item */ -+ unsigned flags/* paste flags */) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_insert_data *cdata; -+ carry_op *op; -+ -+ assert("umka-315", coord != NULL); -+ assert("umka-316", key != NULL); -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? PTR_ERR(op) : -EIO); -+ } -+ cdata = (carry_insert_data *) (lowest_level + 3); -+ cdata->coord = coord; -+ cdata->data = data; -+ cdata->key = key; -+ op->u.paste.d = cdata; -+ if (flags == 0) -+ flags = znode_get_tree(coord->node)->carry.paste_flags; -+ op->u.paste.flags = flags; -+ op->u.paste.type = COPT_ITEM_DATA; -+ if (lh != NULL) { -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ } -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* insert item at the given coord. -+ -+ First try to skip carry by directly calling ->create_item() method of node -+ plugin. 
If this is impossible (there is not enough free space in the node, -+ or leftmost item in the node is created), call insert_with_carry_by_coord() -+ that will do full carry(). -+ -+*/ -+insert_result insert_by_coord(coord_t *coord /* coord where to -+ * insert. coord->node has -+ * to be write locked by -+ * caller */ , -+ reiser4_item_data * data /* data to be -+ * inserted */ , -+ const reiser4_key * key /* key of new item */ , -+ lock_handle * lh /* lock handle of write -+ * lock on node */ , -+ __u32 flags/* insertion flags */) -+{ -+ unsigned item_size; -+ int result; -+ znode *node; -+ -+ assert("vs-247", coord != NULL); -+ assert("vs-248", data != NULL); -+ assert("vs-249", data->length >= 0); -+ assert("nikita-1191", znode_is_write_locked(coord->node)); -+ -+ node = coord->node; -+ coord_clear_iplug(coord); -+ result = zload(node); -+ if (result != 0) -+ return result; -+ -+ item_size = space_needed(node, NULL, data, 1); -+ if (item_size > znode_free_space(node) && -+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) -+ && (flags & COPI_DONT_ALLOCATE)) { -+ /* we are forced to use free space of coord->node and new item -+ does not fit into it. -+ -+ Currently we get here only when we allocate and copy units -+ of extent item from a node to its left neighbor during -+ "squalloc"-ing. If @node (this is left neighbor) does not -+ have enough free space - we do not want to attempt any -+ shifting and allocations because we are in squeezing and -+ everything to the left of @node is tightly packed. -+ */ -+ result = -E_NODE_FULL; -+ } else if ((item_size <= znode_free_space(node)) && -+ !coord_is_before_leftmost(coord) && -+ (node_plugin_by_node(node)->fast_insert != NULL) -+ && node_plugin_by_node(node)->fast_insert(coord)) { -+ /* shortcut insertion without carry() overhead. -+ -+ Only possible if: -+ -+ - there is enough free space -+ -+ - insertion is not into the leftmost position in a node -+ (otherwise it would require updating of delimiting key in a -+ parent) -+ -+ - node plugin agrees with this -+ -+ */ -+ result = -+ node_plugin_by_node(node)->create_item(coord, key, data, -+ NULL); -+ znode_make_dirty(node); -+ } else { -+ /* otherwise do full-fledged carry(). */ -+ result = -+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT, -+ flags); -+ } -+ zrelse(node); -+ return result; -+} -+ -+/* @coord is set to leaf level and @data is to be inserted to twig level */ -+insert_result -+insert_extent_by_coord(coord_t *coord, /* coord where to insert. -+ * coord->node has to be write -+ * locked by caller */ -+ reiser4_item_data *data,/* data to be inserted */ -+ const reiser4_key *key, /* key of new item */ -+ lock_handle *lh /* lock handle of write lock -+ on node */) -+{ -+ assert("vs-405", coord != NULL); -+ assert("vs-406", data != NULL); -+ assert("vs-407", data->length > 0); -+ assert("vs-408", znode_is_write_locked(coord->node)); -+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL); -+ -+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT, -+ 0 /*flags */ ); -+} -+ -+/* Insert into the item at the given coord. -+ -+ First try to skip carry by directly calling ->paste() method of item -+ plugin. If this is impossible (there is not enough free space in the node, -+ or we are pasting into leftmost position in the node), call -+ paste_with_carry() that will do full carry(). 
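Both insertion paths here share one shape: try a cheap in-place edit when free space and position permit, and only fall back to the full carry() rebalancing machinery when they do not. Schematically (a sketch; all names are hypothetical stand-ins):

    static int insert(struct node *node, struct item *item)
    {
            if (item_size(item) <= free_space(node) &&
                !insert_at_leftmost(node) &&        /* no parent key update */
                plugin_allows_fast_insert(node)) {
                    create_item_in_place(node, item);   /* no rebalancing */
                    mark_dirty(node);
                    return 0;
            }
            /* slow path: may shift into neighbors or allocate new nodes */
            return insert_with_rebalance(node, item);
    }

The leftmost-position test matters because inserting a new smallest key would require updating the delimiting key in the parent, which only the full path knows how to do.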
-+ -+*/ -+/* paste_into_item */ -+int insert_into_item(coord_t * coord /* coord of pasting */ , -+ lock_handle * lh /* lock handle on node involved */ , -+ const reiser4_key * key /* key of unit being pasted */ , -+ reiser4_item_data * data /* parameters for new unit */ , -+ unsigned flags /* insert/paste flags */ ) -+{ -+ int result; -+ int size_change; -+ node_plugin *nplug; -+ item_plugin *iplug; -+ -+ assert("umka-317", coord != NULL); -+ assert("umka-318", key != NULL); -+ -+ iplug = item_plugin_by_coord(coord); -+ nplug = node_plugin_by_coord(coord); -+ -+ assert("nikita-1480", iplug == data->iplug); -+ -+ size_change = space_needed(coord->node, coord, data, 0); -+ if (size_change > (int)znode_free_space(coord->node) && -+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) -+ && (flags & COPI_DONT_ALLOCATE)) { -+ /* we are forced to use free space of coord->node and new data -+ does not fit into it. */ -+ return -E_NODE_FULL; -+ } -+ -+ /* shortcut paste without carry() overhead. -+ -+ Only possible if: -+ -+ - there is enough free space -+ -+ - paste is not into the leftmost unit in a node (otherwise -+ it would require updating of delimiting key in a parent) -+ -+ - node plugin agrees with this -+ -+ - item plugin agrees with us -+ */ -+ if (size_change <= (int)znode_free_space(coord->node) && -+ (coord->item_pos != 0 || -+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) && -+ coord->unit_pos != 0 && nplug->fast_paste != NULL && -+ nplug->fast_paste(coord) && -+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) { -+ if (size_change > 0) -+ nplug->change_item_size(coord, size_change); -+ /* NOTE-NIKITA: huh? where @key is used? */ -+ result = iplug->b.paste(coord, data, NULL); -+ if (size_change < 0) -+ nplug->change_item_size(coord, size_change); -+ znode_make_dirty(coord->node); -+ } else -+ /* otherwise do full-fledged carry(). */ -+ result = paste_with_carry(coord, lh, data, key, flags); -+ return result; -+} -+ -+/* this either appends or truncates item @coord */ -+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ , -+ reiser4_item_data * data /* parameters of resize */ , -+ reiser4_key * key /* key of new unit */ , -+ lock_handle * lh /* lock handle of node -+ * being modified */ , -+ cop_insert_flag flags /* carry flags */ ) -+{ -+ int result; -+ znode *node; -+ -+ assert("nikita-362", coord != NULL); -+ assert("nikita-363", data != NULL); -+ assert("vs-245", data->length != 0); -+ -+ node = coord->node; -+ coord_clear_iplug(coord); -+ result = zload(node); -+ if (result != 0) -+ return result; -+ -+ if (data->length < 0) -+ result = node_plugin_by_coord(coord)->shrink_item(coord, -+ -data->length); -+ else -+ result = insert_into_item(coord, lh, key, data, flags); -+ -+ zrelse(node); -+ return result; -+} -+ -+/* insert flow @f */ -+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ reiser4_item_data *data; -+ carry_op *op; -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node, -+ 0 /* operate directly on coord -> node */ ); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ } -+ -+ /* these are permanent during insert_flow */ -+ data = (reiser4_item_data *) (lowest_level + 3); -+ data->user = 1; -+ data->iplug = item_plugin_by_id(FORMATTING_ID); -+ data->arg = NULL; -+ /* data.length and data.data will be set before calling paste or -+ insert */ -+ data->length = 0; -+ data->data = NULL; -+ -+ op->u.insert_flow.flags = 0; -+ op->u.insert_flow.insert_point = coord; -+ op->u.insert_flow.flow = f; -+ op->u.insert_flow.data = data; -+ op->u.insert_flow.new_nodes = 0; -+ -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* Given a coord in parent node, obtain a znode for the corresponding child */ -+znode *child_znode(const coord_t * parent_coord /* coord of pointer to -+ * child */ , -+ znode * parent /* parent of child */ , -+ int incore_p /* if !0 only return child if already in -+ * memory */ , -+ int setup_dkeys_p /* if !0 update delimiting keys of -+ * child */ ) -+{ -+ znode *child; -+ -+ assert("nikita-1374", parent_coord != NULL); -+ assert("nikita-1482", parent != NULL); -+#if REISER4_DEBUG -+ if (setup_dkeys_p) -+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock)); -+#endif -+ assert("nikita-2947", znode_is_any_locked(parent)); -+ -+ if (znode_get_level(parent) <= LEAF_LEVEL) { -+ /* trying to get child of leaf node */ -+ warning("nikita-1217", "Child of maize?"); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ if (item_is_internal(parent_coord)) { -+ reiser4_block_nr addr; -+ item_plugin *iplug; -+ reiser4_tree *tree; -+ -+ iplug = item_plugin_by_coord(parent_coord); -+ assert("vs-512", iplug->s.internal.down_link); -+ iplug->s.internal.down_link(parent_coord, NULL, &addr); -+ -+ tree = znode_get_tree(parent); -+ if (incore_p) -+ child = zlook(tree, &addr); -+ else -+ child = -+ zget(tree, &addr, parent, -+ znode_get_level(parent) - 1, -+ reiser4_ctx_gfp_mask_get()); -+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p) -+ set_child_delimiting_keys(parent, parent_coord, child); -+ } else { -+ warning("nikita-1483", "Internal item expected"); -+ child = ERR_PTR(RETERR(-EIO)); -+ } -+ return child; -+} -+ -+/* remove znode from transaction */ -+static void uncapture_znode(znode * node) -+{ -+ struct page *page; -+ -+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) { -+ int ret; -+ -+ /* An already allocated block goes right to the atom's delete set. */ -+ ret = -+ reiser4_dealloc_block(znode_get_block(node), 0, -+ BA_DEFER | BA_FORMATTED); -+ if (ret) -+ warning("zam-942", -+ "can't add a block (%llu) number to atom's delete set\n", -+ (unsigned long long)(*znode_get_block(node))); -+ -+ spin_lock_znode(node); -+ /* Here we return flush reserved block which was reserved at the -+ * moment when this allocated node was marked dirty and still -+ * not used by flush in node relocation procedure. */ -+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) { -+ txn_atom *atom; -+ -+ atom = jnode_get_atom(ZJNODE(node)); -+ assert("zam-939", atom != NULL); -+ spin_unlock_znode(node); -+ flush_reserved2grabbed(atom, (__u64) 1); -+ spin_unlock_atom(atom); -+ } else -+ spin_unlock_znode(node); -+ } else { -+ /* znode has assigned block which is counted as "fake -+ allocated". Return it back to "free blocks" */ -+ fake_allocated2free((__u64) 1, BA_FORMATTED); -+ } -+ -+ /* -+ * uncapture page from transaction.
There is a possibility of a race -+ * with ->releasepage(): reiser4_releasepage() detaches page from this -+ * jnode and we have nothing to uncapture. To avoid this, get -+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page() -+ * will deal with released page itself. -+ */ -+ spin_lock_znode(node); -+ page = znode_page(node); -+ if (likely(page != NULL)) { -+ /* -+ * reiser4_uncapture_page() can only be called when we are sure -+ * that znode is pinned in memory, which we are, because -+ * forget_znode() is only called from longterm_unlock_znode(). -+ */ -+ page_cache_get(page); -+ spin_unlock_znode(node); -+ lock_page(page); -+ reiser4_uncapture_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } else { -+ txn_atom *atom; -+ -+ /* handle "flush queued" znodes */ -+ while (1) { -+ atom = jnode_get_atom(ZJNODE(node)); -+ assert("zam-943", atom != NULL); -+ -+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED) -+ || !atom->nr_running_queues) -+ break; -+ -+ spin_unlock_znode(node); -+ reiser4_atom_wait_event(atom); -+ spin_lock_znode(node); -+ } -+ -+ reiser4_uncapture_block(ZJNODE(node)); -+ spin_unlock_atom(atom); -+ zput(node); -+ } -+} -+ -+/* This is called from longterm_unlock_znode() when last lock is released from -+ the node that has been removed from the tree. At this point node is removed -+ from sibling list and its lock is invalidated. */ -+void forget_znode(lock_handle * handle) -+{ -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("umka-319", handle != NULL); -+ -+ node = handle->node; -+ tree = znode_get_tree(node); -+ -+ assert("vs-164", znode_is_write_locked(node)); -+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert_rw_locked(&(node->lock.guard)); -+ -+ /* We assume that this node was detached from its parent before -+ * unlocking, it gives no way to reach this node from parent through a -+ * down link. The node should have no children and, thereby, can't be -+ * reached from them by their parent pointers. The only way to obtain a -+ * reference to the node is to use sibling pointers from its left and -+ * right neighbors. In the next several lines we remove the node from -+ * the sibling list. */ -+ -+ write_lock_tree(tree); -+ sibling_list_remove(node); -+ znode_remove(node, tree); -+ write_unlock_tree(tree); -+ -+ /* Here we set JNODE_DYING and cancel all pending lock requests. It -+ * forces all lock requestor threads to repeat iterations of getting -+ * lock on a child, neighbor or parent node. But, those threads can't -+ * come to this node again, because this node is no longer a child, -+ * neighbor or parent of any other node. This order of znode -+ * invalidation does not allow other threads to waste cpu time in a busy -+ * loop, trying to lock dying object. The exception is in the flush -+ * code when we take node directly from atom's capture list.*/ -+ reiser4_invalidate_lock(handle); -+ uncapture_znode(node); -+} -+ -+/* Check that internal item at @pointer really contains pointer to @child.
*/ -+int check_tree_pointer(const coord_t * pointer /* would-be pointer to -+ * @child */ , -+ const znode * child /* child znode */ ) -+{ -+ assert("nikita-1016", pointer != NULL); -+ assert("nikita-1017", child != NULL); -+ assert("nikita-1018", pointer->node != NULL); -+ -+ assert("nikita-1325", znode_is_any_locked(pointer->node)); -+ -+ assert("nikita-2985", -+ znode_get_level(pointer->node) == znode_get_level(child) + 1); -+ -+ coord_clear_iplug((coord_t *) pointer); -+ -+ if (coord_is_existing_unit(pointer)) { -+ item_plugin *iplug; -+ reiser4_block_nr addr; -+ -+ if (item_is_internal(pointer)) { -+ iplug = item_plugin_by_coord(pointer); -+ assert("vs-513", iplug->s.internal.down_link); -+ iplug->s.internal.down_link(pointer, NULL, &addr); -+ /* check that cached value is correct */ -+ if (disk_addr_eq(&addr, znode_get_block(child))) { -+ return NS_FOUND; -+ } -+ } -+ } -+ /* warning ("jmacd-1002", "tree pointer incorrect"); */ -+ return NS_NOT_FOUND; -+} -+ -+/* find coord of pointer to new @child in @parent. -+ -+ Find the &coord_t in the @parent where pointer to a given @child will -+ be in. -+ -+*/ -+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ , -+ znode * -+ child UNUSED_ARG /* child znode, passed locked */ , -+ znode * left /* left brother of new node */ , -+ coord_t * result /* where result is stored in */ ) -+{ -+ int ret; -+ -+ assert("nikita-1486", parent != NULL); -+ assert("nikita-1487", child != NULL); -+ assert("nikita-1488", result != NULL); -+ -+ ret = find_child_ptr(parent, left, result); -+ if (ret != NS_FOUND) { -+ warning("nikita-1489", "Cannot find brother position: %i", ret); -+ return RETERR(-EIO); -+ } else { -+ result->between = AFTER_UNIT; -+ return RETERR(NS_NOT_FOUND); -+ } -+} -+ -+/* find coord of pointer to @child in @parent. -+ -+ Find the &coord_t in the @parent where pointer to a given @child is in. -+ -+*/ -+int find_child_ptr(znode * parent /* parent znode, passed locked */ , -+ znode * child /* child znode, passed locked */ , -+ coord_t * result /* where result is stored in */ ) -+{ -+ int lookup_res; -+ node_plugin *nplug; -+ /* left delimiting key of a child */ -+ reiser4_key ld; -+ reiser4_tree *tree; -+ -+ assert("nikita-934", parent != NULL); -+ assert("nikita-935", child != NULL); -+ assert("nikita-936", result != NULL); -+ assert("zam-356", znode_is_loaded(parent)); -+ -+ coord_init_zero(result); -+ result->node = parent; -+ -+ nplug = parent->nplug; -+ assert("nikita-939", nplug != NULL); -+ -+ tree = znode_get_tree(parent); -+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is -+ * not aliased to ->in_parent of some znode. Otherwise, -+ * parent_coord_to_coord() below would modify data protected by tree -+ * lock. */ -+ read_lock_tree(tree); -+ /* fast path. Try to use cached value. Lock tree to keep -+ node->pos_in_parent and pos->*_blocknr consistent. */ -+ if (child->in_parent.item_pos + 1 != 0) { -+ parent_coord_to_coord(&child->in_parent, result); -+ if (check_tree_pointer(result, child) == NS_FOUND) { -+ read_unlock_tree(tree); -+ return NS_FOUND; -+ } -+ -+ child->in_parent.item_pos = (unsigned short)~0; -+ } -+ read_unlock_tree(tree); -+ -+ /* if above failed, find some key from @child. We are looking for the -+ least key in a child. */ -+ read_lock_dk(tree); -+ ld = *znode_get_ld_key(child); -+ read_unlock_dk(tree); -+ /* -+ * now, lookup parent with key just found.
Note that left delimiting -+ * key doesn't identify node uniquely, because (in extremely rare -+ * case) two nodes can have equal left delimiting keys, if one of them -+ * is completely filled with directory entries that all happened to be -+ * hash collisions. But, we check block number in check_tree_pointer() -+ * and, so, are safe. -+ */ -+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result); -+ /* update cached pos_in_node */ -+ if (lookup_res == NS_FOUND) { -+ write_lock_tree(tree); -+ coord_to_parent_coord(result, &child->in_parent); -+ write_unlock_tree(tree); -+ lookup_res = check_tree_pointer(result, child); -+ } -+ if (lookup_res == NS_NOT_FOUND) -+ lookup_res = find_child_by_addr(parent, child, result); -+ return lookup_res; -+} -+ -+/* find coord of pointer to @child in @parent by scanning -+ -+ Find the &coord_t in the @parent where pointer to a given @child -+ is in by scanning all internal items in @parent and comparing block -+ numbers in them with that of @child. -+ -+*/ -+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ , -+ znode * child /* child znode, passed locked */ , -+ coord_t * result /* where result is stored in */ ) -+{ -+ int ret; -+ -+ assert("nikita-1320", parent != NULL); -+ assert("nikita-1321", child != NULL); -+ assert("nikita-1322", result != NULL); -+ -+ ret = NS_NOT_FOUND; -+ -+ for_all_units(result, parent) { -+ if (check_tree_pointer(result, child) == NS_FOUND) { -+ write_lock_tree(znode_get_tree(parent)); -+ coord_to_parent_coord(result, &child->in_parent); -+ write_unlock_tree(znode_get_tree(parent)); -+ ret = NS_FOUND; -+ break; -+ } -+ } -+ return ret; -+} -+ -+/* true, if @addr is "unallocated block number", which is just address, with -+ highest bit set. */ -+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to -+ * check */ ) -+{ -+ assert("nikita-1766", addr != NULL); -+ cassert(sizeof(reiser4_block_nr) == 8); -+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) == -+ REISER4_UNALLOCATED_STATUS_VALUE; -+} -+ -+/* returns true if removing bytes in the given key range [from_key, to_key] -+ causes removal of the whole item @from */ -+static int -+item_removed_completely(coord_t * from, const reiser4_key * from_key, -+ const reiser4_key * to_key) -+{ -+ item_plugin *iplug; -+ reiser4_key key_in_item; -+ -+ assert("umka-325", from != NULL); -+ assert("", item_is_extent(from)); -+ -+ /* check first key just in case */ -+ item_key_by_coord(from, &key_in_item); -+ if (keygt(from_key, &key_in_item)) -+ return 0; -+ -+ /* check last key */ -+ iplug = item_plugin_by_coord(from); -+ assert("vs-611", iplug && iplug->s.file.append_key); -+ -+ iplug->s.file.append_key(from, &key_in_item); -+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1); -+ -+ if (keylt(to_key, &key_in_item)) -+ /* last byte is not removed */ -+ return 0; -+ return 1; -+} -+ -+/* helper function for prepare_twig_kill(): @left and @right are formatted -+ * neighbors of extent item being completely removed.
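The range test in item_removed_completely() above reduces to two key comparisons. A minimal sketch, assuming plain integer offsets in place of reiser4 keys and an inclusive last byte (the original derives it from the item plugin's append_key minus one):

    /* An item spanning bytes [item_first, item_last] is wholly removed
     * by the cut range [from, to] iff from <= item_first and
     * item_last <= to. Hypothetical stand-in for the key-based test. */
    #include <assert.h>

    typedef unsigned long long u64;

    static int covers_whole_item(u64 from, u64 to,
                                 u64 item_first, u64 item_last)
    {
        if (from > item_first)
            return 0;    /* head of the item survives */
        if (to < item_last)
            return 0;    /* last byte is not removed */
        return 1;
    }

    int main(void)
    {
        /* item occupies offsets 4096..8191 */
        assert(covers_whole_item(0, 8191, 4096, 8191));     /* fully gone */
        assert(!covers_whole_item(4097, 8191, 4096, 8191)); /* head kept */
        assert(!covers_whole_item(0, 8190, 4096, 8191));    /* tail kept */
        return 0;
    }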
Load and lock neighbors -+ * and store lock handles into @cdata for later use by kill_hook_extent() */ -+static int -+prepare_children(znode * left, znode * right, carry_kill_data * kdata) -+{ -+ int result; -+ int left_loaded; -+ int right_loaded; -+ -+ result = 0; -+ left_loaded = right_loaded = 0; -+ -+ if (left != NULL) { -+ result = zload(left); -+ if (result == 0) { -+ left_loaded = 1; -+ result = longterm_lock_znode(kdata->left, left, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_LOPRI); -+ } -+ } -+ if (result == 0 && right != NULL) { -+ result = zload(right); -+ if (result == 0) { -+ right_loaded = 1; -+ result = longterm_lock_znode(kdata->right, right, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_HIPRI | -+ ZNODE_LOCK_NONBLOCK); -+ } -+ } -+ if (result != 0) { -+ done_lh(kdata->left); -+ done_lh(kdata->right); -+ if (left_loaded != 0) -+ zrelse(left); -+ if (right_loaded != 0) -+ zrelse(right); -+ } -+ return result; -+} -+ -+static void done_children(carry_kill_data * kdata) -+{ -+ if (kdata->left != NULL && kdata->left->node != NULL) { -+ zrelse(kdata->left->node); -+ done_lh(kdata->left); -+ } -+ if (kdata->right != NULL && kdata->right->node != NULL) { -+ zrelse(kdata->right->node); -+ done_lh(kdata->right); -+ } -+} -+ -+/* part of cut_node. It is called when cut_node is called to remove or cut part -+ of extent item. When head of that item is removed - we have to update right -+ delimiting of left neighbor of extent. When item is removed completely - we -+ have to set sibling link between left and right neighbor of removed -+ extent. This may return -E_DEADLOCK because of trying to get left neighbor -+ locked. So, caller should repeat an attempt -+*/ -+/* Audited by: umka (2002.06.16) */ -+static int -+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor) -+{ -+ int result; -+ reiser4_key key; -+ lock_handle left_lh; -+ lock_handle right_lh; -+ coord_t left_coord; -+ coord_t *from; -+ znode *left_child; -+ znode *right_child; -+ reiser4_tree *tree; -+ int left_zloaded_here, right_zloaded_here; -+ -+ from = kdata->params.from; -+ assert("umka-326", from != NULL); -+ assert("umka-327", kdata->params.to != NULL); -+ -+ /* for one extent item only yet */ -+ assert("vs-591", item_is_extent(from)); -+ assert("vs-592", from->item_pos == kdata->params.to->item_pos); -+ -+ if ((kdata->params.from_key -+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key))) -+ || from->unit_pos != 0) { -+ /* head of item @from is not removed, there is nothing to -+ worry about */ -+ return 0; -+ } -+ -+ result = 0; -+ left_zloaded_here = 0; -+ right_zloaded_here = 0; -+ -+ left_child = right_child = NULL; -+ -+ coord_dup(&left_coord, from); -+ init_lh(&left_lh); -+ init_lh(&right_lh); -+ if (coord_prev_unit(&left_coord)) { -+ /* @from is leftmost item in its node */ -+ if (!locked_left_neighbor) { -+ result = -+ reiser4_get_left_neighbor(&left_lh, from->node, -+ ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ switch (result) { -+ case 0: -+ break; -+ case -E_NO_NEIGHBOR: -+ /* there is no formatted node to the left of -+ from->node */ -+ warning("vs-605", -+ "extent item has smallest key in " -+ "the tree and it is about to be removed"); -+ return 0; -+ case -E_DEADLOCK: -+ /* need to restart */ -+ default: -+ return result; -+ } -+ -+ /* we have acquired left neighbor of from->node */ -+ result = zload(left_lh.node); -+ if (result) -+ goto done; -+ -+ locked_left_neighbor = left_lh.node; -+ } else { -+ /* squalloc_right_twig_cut should have supplied locked -+ * left neighbor */ -+ 
assert("vs-834", -+ znode_is_write_locked(locked_left_neighbor)); -+ result = zload(locked_left_neighbor); -+ if (result) -+ return result; -+ } -+ -+ left_zloaded_here = 1; -+ coord_init_last_unit(&left_coord, locked_left_neighbor); -+ } -+ -+ if (!item_is_internal(&left_coord)) { -+ /* what else but extent can be on twig level */ -+ assert("vs-606", item_is_extent(&left_coord)); -+ -+ /* there is no left formatted child */ -+ if (left_zloaded_here) -+ zrelse(locked_left_neighbor); -+ done_lh(&left_lh); -+ return 0; -+ } -+ -+ tree = znode_get_tree(left_coord.node); -+ left_child = child_znode(&left_coord, left_coord.node, 1, 0); -+ -+ if (IS_ERR(left_child)) { -+ result = PTR_ERR(left_child); -+ goto done; -+ } -+ -+ /* left child is acquired, calculate new right delimiting key for it -+ and get right child if it is necessary */ -+ if (item_removed_completely -+ (from, kdata->params.from_key, kdata->params.to_key)) { -+ /* try to get right child of removed item */ -+ coord_t right_coord; -+ -+ assert("vs-607", -+ kdata->params.to->unit_pos == -+ coord_last_unit_pos(kdata->params.to)); -+ coord_dup(&right_coord, kdata->params.to); -+ if (coord_next_unit(&right_coord)) { -+ /* @to is rightmost unit in the node */ -+ result = -+ reiser4_get_right_neighbor(&right_lh, from->node, -+ ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ switch (result) { -+ case 0: -+ result = zload(right_lh.node); -+ if (result) -+ goto done; -+ -+ right_zloaded_here = 1; -+ coord_init_first_unit(&right_coord, -+ right_lh.node); -+ item_key_by_coord(&right_coord, &key); -+ break; -+ -+ case -E_NO_NEIGHBOR: -+ /* there is no formatted node to the right of -+ from->node */ -+ read_lock_dk(tree); -+ key = *znode_get_rd_key(from->node); -+ read_unlock_dk(tree); -+ right_coord.node = NULL; -+ result = 0; -+ break; -+ default: -+ /* real error */ -+ goto done; -+ } -+ } else { -+ /* there is an item to the right of @from - take its key */ -+ item_key_by_coord(&right_coord, &key); -+ } -+ -+ /* try to get right child of @from */ -+ if (right_coord.node && /* there is right neighbor of @from */ -+ item_is_internal(&right_coord)) { /* it is internal item */ -+ right_child = child_znode(&right_coord, -+ right_coord.node, 1, 0); -+ -+ if (IS_ERR(right_child)) { -+ result = PTR_ERR(right_child); -+ goto done; -+ } -+ -+ } -+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and -+ update of right delimiting key of left_child */ -+ result = prepare_children(left_child, right_child, kdata); -+ } else { -+ /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */ -+ result = prepare_children(left_child, NULL, kdata); -+ } -+ -+ done: -+ if (right_child) -+ zput(right_child); -+ if (right_zloaded_here) -+ zrelse(right_lh.node); -+ done_lh(&right_lh); -+ -+ if (left_child) -+ zput(left_child); -+ if (left_zloaded_here) -+ zrelse(locked_left_neighbor); -+ done_lh(&left_lh); -+ return result; -+} -+ -+/* this is used to remove part of node content between coordinates @from and @to. 
Units to which @from and @to are set -+ are to be cut completely */ -+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */ -+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */ -+ const reiser4_key * to_key, /* last key to be removed */ -+ reiser4_key * -+ smallest_removed /* smallest key actually removed */ ) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_cut_data *cut_data; -+ carry_op *op; -+ -+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT); -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cut_data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); -+ assert("vs-1509", op != 0); -+ if (IS_ERR(op)) { -+ done_carry_pool(pool); -+ return PTR_ERR(op); -+ } -+ -+ cut_data = (carry_cut_data *) (lowest_level + 3); -+ cut_data->params.from = from; -+ cut_data->params.to = to; -+ cut_data->params.from_key = from_key; -+ cut_data->params.to_key = to_key; -+ cut_data->params.smallest_removed = smallest_removed; -+ -+ op->u.cut_or_kill.is_cut = 1; -+ op->u.cut_or_kill.u.cut = cut_data; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* cut part of the node -+ -+ Cut part or whole content of node. -+ -+ cut data between @from and @to of @from->node and call carry() to make -+ corresponding changes in the tree. @from->node may become empty. If so - -+ pointer to it will be removed. Neighboring nodes are not changed. Smallest -+ removed key is stored in @smallest_removed -+ -+*/ -+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */ -+ coord_t * to, /* coord of the last unit/item that will be eliminated */ -+ const reiser4_key * from_key, /* first key to be removed */ -+ const reiser4_key * to_key, /* last key to be removed */ -+ reiser4_key * smallest_removed, /* smallest key actually removed */ -+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor -+ * locked (in squalloc_right_twig_cut, namely) */ -+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. 
This is necessary to -+ invalidate pages together with item pointing to them */ -+ int truncate) -+{ /* this call is made for file truncate */ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_kill_data *kdata; -+ lock_handle *left_child; -+ lock_handle *right_child; -+ carry_op *op; -+ -+ assert("umka-328", from != NULL); -+ assert("vs-316", !node_is_empty(from->node)); -+ assert("nikita-1812", coord_is_existing_unit(from) -+ && coord_is_existing_unit(to)); -+ -+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(carry_kill_data) + -+ 2 * sizeof(lock_handle) + -+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ kdata = (carry_kill_data *) (lowest_level + 3); -+ left_child = (lock_handle *) (kdata + 1); -+ right_child = left_child + 1; -+ -+ init_lh(left_child); -+ init_lh(right_child); -+ -+ kdata->params.from = from; -+ kdata->params.to = to; -+ kdata->params.from_key = from_key; -+ kdata->params.to_key = to_key; -+ kdata->params.smallest_removed = smallest_removed; -+ kdata->params.truncate = truncate; -+ kdata->flags = 0; -+ kdata->inode = inode; -+ kdata->left = left_child; -+ kdata->right = right_child; -+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */ -+ kdata->buf = (char *)(right_child + 1); -+ -+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) { -+ /* left child of extent item may have to get updated right -+ delimiting key and to get linked with right child of extent -+ @from if it will be removed completely */ -+ result = prepare_twig_kill(kdata, locked_left_neighbor); -+ if (result) { -+ done_children(kdata); -+ done_carry_pool(pool); -+ return result; -+ } -+ } -+ -+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_children(kdata); -+ done_carry_pool(pool); -+ return RETERR(op ? PTR_ERR(op) : -EIO); -+ } -+ -+ op->u.cut_or_kill.is_cut = 0; -+ op->u.cut_or_kill.u.kill = kdata; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ -+ done_children(kdata); -+ done_carry_pool(pool); -+ return result; -+} -+ -+void -+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate) -+{ -+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) { -+ pgoff_t start_pg, end_pg; -+ -+ start_pg = start >> PAGE_CACHE_SHIFT; -+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT; -+ -+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) { -+ /* -+ * kill up to the page boundary. -+ */ -+ assert("vs-123456", start_pg == end_pg); -+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1, -+ truncate); -+ } else if (start_pg != end_pg) { -+ /* -+ * page boundary is within killed portion of node. -+ */ -+ assert("vs-654321", end_pg - start_pg == 1); -+ reiser4_invalidate_pages(inode->i_mapping, end_pg, -+ end_pg - start_pg, 1); -+ } -+ } -+ inode_sub_bytes(inode, end - start); -+} -+ -+/** -+ * Delete whole @node from the reiser4 tree without loading it. -+ * -+ * @left: locked left neighbor, -+ * @node: node to be deleted, -+ * @smallest_removed: leftmost key of deleted node, -+ * @object: inode pointer, if we truncate a file body. -+ * @truncate: true if called for file truncate. -+ * -+ * @return: 0 if success, error code otherwise.
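The single init_carry_pool() allocation in kill_node_content() above is carved into sub-objects by pointer arithmetic, so one call releases everything. The following is an illustrative user-space stand-in for that idiom (toy struct names, a fixed 64-byte scratch area in place of the 5 keys and 2 coords), not the reiser4 API:

    /* One allocation sized for several objects; sub-objects are carved
     * out in the same order kill_node_content() uses, and a single
     * free() (cf. done_carry_pool()) releases them all. */
    #include <stdio.h>
    #include <stdlib.h>

    struct pool  { int dummy; };
    struct level { int dummy; };
    struct kdata { char *buf; };

    int main(void)
    {
        size_t size = sizeof(struct pool) + 3 * sizeof(struct level) +
                      sizeof(struct kdata) + 64 /* scratch area */;
        struct pool *pool = malloc(size);
        if (!pool)
            return 1;
        struct level *lowest = (struct level *)(pool + 1); /* 3 levels */
        struct kdata *kd = (struct kdata *)(lowest + 3);
        kd->buf = (char *)(kd + 1);  /* scratch space right behind */
        printf("pool=%p lowest=%p kd=%p buf=%p\n",
               (void *)pool, (void *)lowest, (void *)kd, (void *)kd->buf);
        free(pool);  /* one free releases everything */
        return 0;
    }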
-+ -+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it -+ * contains the right value of the smallest removed key from the previous -+ * cut_worker() iteration. This is needed for proper accounting of -+ * "i_blocks" and "i_bytes" fields of the @object. -+ */ -+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed, -+ struct inode *object, int truncate) -+{ -+ lock_handle parent_lock; -+ coord_t cut_from; -+ coord_t cut_to; -+ reiser4_tree *tree; -+ int ret; -+ -+ assert("zam-937", node != NULL); -+ assert("zam-933", znode_is_write_locked(node)); -+ assert("zam-999", smallest_removed != NULL); -+ -+ init_lh(&parent_lock); -+ -+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK); -+ if (ret) -+ return ret; -+ -+ assert("zam-934", !znode_above_root(parent_lock.node)); -+ -+ ret = zload(parent_lock.node); -+ if (ret) -+ goto failed_nozrelse; -+ -+ ret = find_child_ptr(parent_lock.node, node, &cut_from); -+ if (ret) -+ goto failed; -+ -+ /* decrement child counter and set parent pointer to NULL before -+ deleting the list from parent node because of checks in -+ internal_kill_item_hook (we can delete the last item from the parent -+ node, the parent node is going to be deleted and its c_count should -+ be zero). */ -+ -+ tree = znode_get_tree(node); -+ write_lock_tree(tree); -+ init_parent_coord(&node->in_parent, NULL); -+ --parent_lock.node->c_count; -+ write_unlock_tree(tree); -+ -+ assert("zam-989", item_is_internal(&cut_from)); -+ -+ /* @node should be deleted after unlocking. */ -+ ZF_SET(node, JNODE_HEARD_BANSHEE); -+ -+ /* remove a pointer from the parent node to the node being deleted. */ -+ coord_dup(&cut_to, &cut_from); -+ /* FIXME: shouldn't this be kill_node_content */ -+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL); -+ if (ret) -+ /* FIXME(Zam): Should we re-connect the node to its parent if -+ * cut_node fails? */ -+ goto failed; -+ -+ { -+ reiser4_tree *tree = current_tree; -+ __u64 start_offset = 0, end_offset = 0; -+ -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ if (object) { -+ /* We use @smallest_removed and the left delimiting key of -+ * the current node for @object->i_blocks, i_bytes -+ * calculation. We assume that the items after the -+ * *@smallest_removed key have been deleted from the -+ * file body. */ -+ start_offset = get_key_offset(znode_get_ld_key(node)); -+ end_offset = get_key_offset(smallest_removed); -+ } -+ -+ assert("zam-1021", znode_is_connected(node)); -+ if (node->left) -+ znode_set_rd_key(node->left, znode_get_rd_key(node)); -+ -+ *smallest_removed = *znode_get_ld_key(node); -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+ -+ if (object) { -+ /* we used to perform actions which are to be performed on items on their removal from tree in -+ special item method - kill_hook. Here for optimization reasons we avoid reading node -+ containing item we remove and can not call item's kill hook. Instead we call function which -+ does exactly the same things as tail kill hook in assumption that node we avoid reading -+ contains only one item and that item is a tail one.
*/ -+ fake_kill_hook_tail(object, start_offset, end_offset, -+ truncate); -+ } -+ } -+ failed: -+ zrelse(parent_lock.node); -+ failed_nozrelse: -+ done_lh(&parent_lock); -+ -+ return ret; -+} -+ -+static int can_delete(const reiser4_key *key, znode *node) -+{ -+ int result; -+ -+ read_lock_dk(current_tree); -+ result = keyle(key, znode_get_ld_key(node)); -+ read_unlock_dk(current_tree); -+ return result; -+} -+ -+/** -+ * This subroutine is not optimal, but its implementation seems to -+ * be easier. -+ * -+ * @tap: the point deletion process begins from, -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * @truncate: true if called for file truncate. -+ * @progress: return true if progress in file item deletion was made; -+ * @smallest_removed value is valid in that case. -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long -+ * reiser4_cut_tree operation was interrupted for allowing atom commit. -+ */ -+int -+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, struct inode *object, -+ int truncate, int *progress) -+{ -+ lock_handle next_node_lock; -+ coord_t left_coord; -+ int result; -+ -+ assert("zam-931", tap->coord->node != NULL); -+ assert("zam-932", znode_is_write_locked(tap->coord->node)); -+ -+ *progress = 0; -+ init_lh(&next_node_lock); -+ -+ while (1) { -+ znode *node; /* node from which items are cut */ -+ node_plugin *nplug; /* node plugin for @node */ -+ -+ node = tap->coord->node; -+ -+ /* Move next_node_lock to the next node on the left. */ -+ result = -+ reiser4_get_left_neighbor(&next_node_lock, node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result != 0 && result != -E_NO_NEIGHBOR) -+ break; -+ /* Check whether we can delete the node as a whole. */ -+ if (*progress && znode_get_level(node) == LEAF_LEVEL && -+ can_delete(from_key, node)) { -+ result = reiser4_delete_node(node, smallest_removed, -+ object, truncate); -+ } else { -+ result = reiser4_tap_load(tap); -+ if (result) -+ return result; -+ -+ /* Prepare the second (right) point for cut_node() */ -+ if (*progress) -+ coord_init_last_unit(tap->coord, node); -+ -+ else if (item_plugin_by_coord(tap->coord)->b.lookup == -+ NULL) -+ /* set rightmost unit for the items without lookup method */ -+ tap->coord->unit_pos = -+ coord_last_unit_pos(tap->coord); -+ -+ nplug = node->nplug; -+ -+ assert("vs-686", nplug); -+ assert("vs-687", nplug->lookup); -+ -+ /* left_coord is leftmost unit cut from @node */ -+ result = nplug->lookup(node, from_key, -+ FIND_MAX_NOT_MORE_THAN, -+ &left_coord); -+ -+ if (IS_CBKERR(result)) -+ break; -+ -+ /* adjust coordinates so that they are set to existing units */ -+ if (coord_set_to_right(&left_coord) -+ || coord_set_to_left(tap->coord)) { -+ result = 0; -+ break; -+ } -+ -+ if (coord_compare(&left_coord, tap->coord) == -+ COORD_CMP_ON_RIGHT) { -+ /* keys from @from_key to @to_key are not in the tree */ -+ result = 0; -+ break; -+ } -+ -+ if (left_coord.item_pos != tap->coord->item_pos) { -+ /* do not allow cutting more than one item. It is added to solve problem of truncating -+ partially converted files. If file is partially converted there may exist a twig node -+ containing both internal item or items pointing to leaf nodes with formatting items -+ and extent item.
We do not want to kill internal items being at twig node here -+ because cut_tree_worker assumes killing them from the leaf level */ -+ coord_dup(&left_coord, tap->coord); -+ assert("vs-1652", -+ coord_is_existing_unit(&left_coord)); -+ left_coord.unit_pos = 0; -+ } -+ -+ /* cut data from one node */ -+ /* *smallest_removed = *reiser4_min_key(); */ -+ result = -+ kill_node_content(&left_coord, tap->coord, from_key, -+ to_key, smallest_removed, -+ next_node_lock.node, object, -+ truncate); -+ reiser4_tap_relse(tap); -+ } -+ if (result) -+ break; -+ -+ ++(*progress); -+ -+ /* Check whether all items with keys >= from_key were removed -+ * from the tree. */ -+ if (keyle(smallest_removed, from_key)) -+ /* result = 0; */ -+ break; -+ -+ if (next_node_lock.node == NULL) -+ break; -+ -+ result = reiser4_tap_move(tap, &next_node_lock); -+ done_lh(&next_node_lock); -+ if (result) -+ break; -+ -+ /* Break long reiser4_cut_tree operation (deletion of a large -+ file) if atom requires commit. */ -+ if (*progress > CUT_TREE_MIN_ITERATIONS -+ && current_atom_should_commit()) { -+ result = -E_REPEAT; -+ break; -+ } -+ } -+ done_lh(&next_node_lock); -+ /* assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); */ -+ return result; -+} -+ -+/* there is a fundamental problem with optimizing deletes: VFS does it -+ one file at a time. Another problem is that if an item can be -+ anything, then deleting items must be done one at a time. It just -+ seems clean to write this to specify a from and a to key, and cut -+ everything between them though. */ -+ -+/* use this function with care if deleting more than what is part of a single file. */ -+/* do not use this when cutting a single item, it is suboptimal for that */ -+ -+/* You are encouraged to write plugin-specific versions of this. It -+ cannot be optimal for all plugins because it works item at a time, -+ and some plugins could sometimes work node at a time. Regular files -+ however are not optimizable to work node at a time because of -+ extents needing to free the blocks they point to. -+ -+ Optimizations compared to v3 code: -+ -+ It does not balance (that task is left to memory pressure code). -+ -+ Nodes are deleted only if empty. -+ -+ Uses extents. -+ -+ Performs read-ahead of formatted nodes whose contents are part of -+ the deletion. -+*/ -+ -+/** -+ * Delete everything from the reiser4 tree between two keys: @from_key and -+ * @to_key. -+ * -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * @object: owner of the items being cut. -+ * @truncate: true if called for file truncate. -+ * @progress: return true if progress in file item deletion was made; -+ * @smallest_removed value is valid in that case. -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree -+ * operation was interrupted for allowing atom commit.
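The -E_REPEAT protocol described above amounts to breaking one long deletion into bounded passes, with the caller retrying until the worker reports completion. A minimal sketch of that control flow, assuming plain counters in place of taps, locks and atoms, and a hard-wired commit check:

    /* Toy model of the cut-tree retry loop: the worker cuts a bounded
     * number of nodes per pass, returning -E_REPEAT when the (here
     * always willing) atom should commit; the caller simply retries. */
    #include <stdio.h>

    #define E_REPEAT 1
    #define CUT_TREE_MIN_ITERATIONS 64

    /* stand-in for current_atom_should_commit() */
    static int atom_should_commit(void) { return 1; }

    static int cut_worker(int *remaining, int *progress)
    {
        *progress = 0;
        while (*remaining > 0) {
            --(*remaining);
            ++(*progress);      /* one node's items were cut */
            if (*progress > CUT_TREE_MIN_ITERATIONS &&
                atom_should_commit())
                return -E_REPEAT;   /* let the atom commit */
        }
        return 0;
    }

    int main(void)
    {
        int remaining = 1000, progress, result, passes = 0;
        do {    /* reiser4_cut_tree()-style retry loop */
            result = cut_worker(&remaining, &progress);
            ++passes;
        } while (result == -E_REPEAT);
        printf("deleted in %d passes\n", passes);
        return 0;
    }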
-+ */ -+ -+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed_p, -+ struct inode *object, int truncate, int *progress) -+{ -+ lock_handle lock; -+ int result; -+ tap_t tap; -+ coord_t right_coord; -+ reiser4_key smallest_removed; -+ int (*cut_tree_worker) (tap_t *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+ STORE_COUNTERS; -+ -+ assert("umka-329", tree != NULL); -+ assert("umka-330", from_key != NULL); -+ assert("umka-331", to_key != NULL); -+ assert("zam-936", keyle(from_key, to_key)); -+ -+ if (smallest_removed_p == NULL) -+ smallest_removed_p = &smallest_removed; -+ -+ init_lh(&lock); -+ -+ do { -+ /* Find rightmost item to cut away from the tree. */ -+ result = reiser4_object_lookup(object, to_key, &right_coord, -+ &lock, ZNODE_WRITE_LOCK, -+ FIND_MAX_NOT_MORE_THAN, -+ TWIG_LEVEL, LEAF_LEVEL, -+ CBK_UNIQUE, NULL /*ra_info */); -+ if (result != CBK_COORD_FOUND) -+ break; -+ if (object == NULL -+ || inode_file_plugin(object)->cut_tree_worker == NULL) -+ cut_tree_worker = cut_tree_worker_common; -+ else -+ cut_tree_worker = -+ inode_file_plugin(object)->cut_tree_worker; -+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK); -+ result = -+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p, -+ object, truncate, progress); -+ reiser4_tap_done(&tap); -+ -+ reiser4_preempt_point(); -+ -+ } while (0); -+ -+ done_lh(&lock); -+ -+ if (result) { -+ switch (result) { -+ case -E_NO_NEIGHBOR: -+ result = 0; -+ break; -+ case -E_DEADLOCK: -+ result = -E_REPEAT; -+ case -E_REPEAT: -+ case -ENOMEM: -+ case -ENOENT: -+ break; -+ default: -+ warning("nikita-2861", "failure: %i", result); -+ } -+ } -+ -+ CHECK_COUNTERS; -+ return result; -+} -+ -+/* repeat reiser4_cut_tree_object until everything is deleted. -+ * unlike cut_file_items, it does not end current transaction if -E_REPEAT -+ * is returned by cut_tree_object. 
*/ -+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, -+ const reiser4_key * to, struct inode *inode, int truncate) -+{ -+ int result; -+ int progress; -+ -+ do { -+ result = reiser4_cut_tree_object(tree, from, to, NULL, -+ inode, truncate, &progress); -+ } while (result == -E_REPEAT); -+ -+ return result; -+} -+ -+/* finishing reiser4 initialization */ -+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being -+ * initialized */ , -+ const reiser4_block_nr * root_block /* address of a root block -+ * on a disk */ , -+ tree_level height /* height of a tree */ , -+ node_plugin * nplug /* default node plugin */ ) -+{ -+ int result; -+ -+ assert("nikita-306", tree != NULL); -+ assert("nikita-307", root_block != NULL); -+ assert("nikita-308", height > 0); -+ assert("nikita-309", nplug != NULL); -+ assert("zam-587", tree->super != NULL); -+ -+ tree->root_block = *root_block; -+ tree->height = height; -+ tree->estimate_one_insert = calc_estimate_one_insert(height); -+ tree->nplug = nplug; -+ -+ tree->znode_epoch = 1ull; -+ -+ cbk_cache_init(&tree->cbk_cache); -+ -+ result = znodes_tree_init(tree); -+ if (result == 0) -+ result = jnodes_tree_init(tree); -+ if (result == 0) { -+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, -+ reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(tree->uber)) { -+ result = PTR_ERR(tree->uber); -+ tree->uber = NULL; -+ } -+ } -+ return result; -+} -+ -+/* release resources associated with @tree */ -+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ ) -+{ -+ if (tree == NULL) -+ return; -+ -+ if (tree->uber != NULL) { -+ zput(tree->uber); -+ tree->uber = NULL; -+ } -+ znodes_tree_done(tree); -+ jnodes_tree_done(tree); -+ cbk_cache_done(&tree->cbk_cache); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/tree.h linux-2.6.30/fs/reiser4/tree.h ---- linux-2.6.30.orig/fs/reiser4/tree.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tree.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,577 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Tree operations. See fs/reiser4/tree.c for comments */ -+ -+#if !defined( __REISER4_TREE_H__ ) -+#define __REISER4_TREE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "znode.h" -+#include "tap.h" -+ -+#include <linux/types.h> /* for __u?? */ -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/spinlock.h> -+#include <linux/sched.h> /* for struct task_struct */ -+ -+/* fictive block number never actually used */ -+extern const reiser4_block_nr UBER_TREE_ADDR; -+ -+/* &cbk_cache_slot - entry in a coord cache. -+ -+ This is entry in a coord_by_key (cbk) cache, represented by -+ &cbk_cache. -+ -+*/ -+typedef struct cbk_cache_slot { -+ /* cached node */ -+ znode *node; -+ /* linkage to the next cbk cache slot in a LRU order */ -+ struct list_head lru; -+} cbk_cache_slot; -+ -+/* &cbk_cache - coord cache. This is part of reiser4_tree. -+ -+ cbk_cache is supposed to speed up tree lookups by caching results of recent -+ successful lookups (we don't cache negative results as dentry cache -+ does). Cache consists of relatively small number of entries kept in a LRU -+ order. 
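To make the LRU behaviour just described concrete, here is a toy user-space analogue of the cbk_cache: a few slots scanned linearly, promoted to the front on a hit, and refilled on a miss by evicting the least recently used slot. The array-based list and integer keys are illustrative only; the real cache stores znode pointers, derives the key range from the node, and is guarded by a rwlock.

    /* Toy coord-by-key cache: scan slots before a "real" traversal,
     * keep them in most-recently-used-first order. */
    #include <stdio.h>
    #include <string.h>

    #define NR_SLOTS 4

    struct slot { int used; long min_key, max_key, node; };
    static struct slot lru[NR_SLOTS];   /* lru[0] = most recent */

    static void promote(int i)
    {
        struct slot hit = lru[i];
        memmove(&lru[1], &lru[0], i * sizeof(struct slot));
        lru[0] = hit;
    }

    /* return cached node covering @key, or -1 to force a traversal */
    static long cbk_lookup(long key)
    {
        for (int i = 0; i < NR_SLOTS; i++) {
            if (lru[i].used && lru[i].min_key <= key &&
                key <= lru[i].max_key) {
                long node = lru[i].node;
                promote(i);
                return node;
            }
        }
        return -1;
    }

    /* after a successful traversal, remember the node, evicting the LRU */
    static void cbk_insert(long node, long min_key, long max_key)
    {
        memmove(&lru[1], &lru[0], (NR_SLOTS - 1) * sizeof(struct slot));
        lru[0] = (struct slot){ 1, min_key, max_key, node };
    }

    int main(void)
    {
        cbk_insert(7, 100, 199);
        cbk_insert(9, 200, 299);
        printf("%ld\n", cbk_lookup(150));   /* hit: 7 */
        printf("%ld\n", cbk_lookup(500));   /* miss: -1 */
        return 0;
    }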
Each entry (&cbk_cache_slot) contains a pointer to znode, from -+ which we can obtain a range of keys that are covered by this znode. Before -+ embarking into real tree traversal we scan cbk_cache slot by slot and for -+ each slot check whether key we are looking for is between minimal and -+ maximal keys for node pointed to by this slot. If no match is found, real -+ tree traversal is performed and if result is successful, appropriate entry -+ is inserted into cache, possibly pulling least recently used entry out of -+ it. -+ -+ Tree spin lock is used to protect coord cache. If contention for this -+ lock proves to be too high, finer-grained locking can be added. -+ -+ Invariants involving parts of this data-type: -+ -+ [cbk-cache-invariant] -+*/ -+typedef struct cbk_cache { -+ /* serializator */ -+ rwlock_t guard; -+ int nr_slots; -+ /* head of LRU list of cache slots */ -+ struct list_head lru; -+ /* actual array of slots */ -+ cbk_cache_slot *slot; -+} cbk_cache; -+ -+/* level_lookup_result - possible outcome of looking up key at some level. -+ This is used by coord_by_key when traversing tree downward. */ -+typedef enum { -+ /* continue to the next level */ -+ LOOKUP_CONT, -+ /* done. Either required item was found, or we can prove it -+ doesn't exist, or some error occurred. */ -+ LOOKUP_DONE, -+ /* restart traversal from the root. Infamous "repetition". */ -+ LOOKUP_REST -+} level_lookup_result; -+ -+/* This is representation of internal reiser4 tree where all file-system -+ data and meta-data are stored. This structure is passed to all tree -+ manipulation functions. It's different from the super block because: -+ we don't want to limit ourselves to strictly one-to-one mapping -+ between super blocks and trees, and, because they are logically -+ different: there are things in a super block that have no relation to -+ the tree (bitmaps, journalling area, mount options, etc.) and there -+ are things in a tree that bear no relation to the super block, like -+ tree of znodes. -+ -+ At this time, there is only one tree -+ per filesystem, and this struct is part of the super block. We only -+ call the super block the super block for historical reasons (most -+ other filesystems call the per filesystem metadata the super block). -+*/ -+ -+struct reiser4_tree { -+ /* block_nr == 0 is fake znode. Write lock it, while changing -+ tree height. */ -+ /* disk address of root node of a tree */ -+ reiser4_block_nr root_block; -+ -+ /* level of the root node. If this is 1, tree consists of root -+ node only */ -+ tree_level height; -+ -+ /* -+ * this is cached here to avoid calling plugins through function -+ * dereference all the time. -+ */ -+ __u64 estimate_one_insert; -+ -+ /* cache of recent tree lookup results */ -+ cbk_cache cbk_cache; -+ -+ /* hash table to look up znodes by block number. */ -+ z_hash_table zhash_table; -+ z_hash_table zfake_table; -+ /* hash table to look up jnodes by inode and offset. */ -+ j_hash_table jhash_table; -+ -+ /* lock protecting: -+ - parent pointers, -+ - sibling pointers, -+ - znode hash table -+ - coord cache -+ */ -+ /* NOTE: The "giant" tree lock can be replaced by more spin locks, -+ hoping they will be less contended. We can use one spin lock per one -+ znode hash bucket. With adding of some code complexity, sibling -+ pointers can be protected by both znode spin locks. However, as it may -+ be more SMP scalable, we should test this locking change on n-way (n > -+ 4) SMP machines.
Current 4-way machine tests do not show that the tree -+ lock is contended or that it is a bottleneck (2003.07.25). */ -+ -+ rwlock_t tree_lock; -+ -+ /* lock protecting delimiting keys */ -+ rwlock_t dk_lock; -+ -+ /* spin lock protecting znode_epoch */ -+ spinlock_t epoch_lock; -+ /* version stamp used to mark znode updates. See seal.[ch] for more -+ * information. */ -+ __u64 znode_epoch; -+ -+ znode *uber; -+ node_plugin *nplug; -+ struct super_block *super; -+ struct { -+ /* carry flags used for insertion of new nodes */ -+ __u32 new_node_flags; -+ /* carry flags used for insertion of new extents */ -+ __u32 new_extent_flags; -+ /* carry flags used for paste operations */ -+ __u32 paste_flags; -+ /* carry flags used for insert operations */ -+ __u32 insert_flags; -+ } carry; -+}; -+ -+extern int reiser4_init_tree(reiser4_tree * tree, -+ const reiser4_block_nr * root_block, -+ tree_level height, node_plugin * default_plugin); -+extern void reiser4_done_tree(reiser4_tree * tree); -+ -+/* cbk flags: options for coord_by_key() */ -+typedef enum { -+ /* coord_by_key() is called for insertion. This is necessary because -+ of extents being located at the twig level. For explanation, see -+ comment just above is_next_item_internal(). -+ */ -+ CBK_FOR_INSERT = (1 << 0), -+ /* coord_by_key() is called with key that is known to be unique */ -+ CBK_UNIQUE = (1 << 1), -+ /* coord_by_key() can trust delimiting keys. This option is not user -+ accessible. coord_by_key() will set it automatically. It will be -+ only cleared by special-case in extents-on-the-twig-level handling -+ where it is necessary to insert item with a key smaller than -+ leftmost key in a node. This is necessary because of extents being -+ located at the twig level. For explanation, see comment just above -+ is_next_item_internal(). -+ */ -+ CBK_TRUST_DK = (1 << 2), -+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */ -+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */ -+ CBK_DKSET = (1 << 5), -+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */ -+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */ -+ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of long-term -+ * lock */ -+} cbk_flags; -+ -+/* insertion outcome.
IBK = insert by key */ -+typedef enum { -+ IBK_INSERT_OK = 0, -+ IBK_ALREADY_EXISTS = -EEXIST, -+ IBK_IO_ERROR = -EIO, -+ IBK_NO_SPACE = -E_NODE_FULL, -+ IBK_OOM = -ENOMEM -+} insert_result; -+ -+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND) -+ -+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord, -+ lock_handle * lh, void *arg); -+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord, -+ lock_handle * lh, -+ tree_iterate_actor_t actor, void *arg, -+ znode_lock_mode mode, int through_units_p); -+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, -+ znode_lock_request pri, lock_handle * lh); -+ -+/* return node plugin of @node */ -+static inline node_plugin *node_plugin_by_node(const znode * -+ node /* node to query */ ) -+{ -+ assert("vs-213", node != NULL); -+ assert("vs-214", znode_is_loaded(node)); -+ -+ return node->nplug; -+} -+ -+/* number of items in @node */ -+static inline pos_in_node_t node_num_items(const znode * node) -+{ -+ assert("nikita-2754", znode_is_loaded(node)); -+ assert("nikita-2468", -+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items); -+ -+ return node->nr_items; -+} -+ -+/* Return the number of items at the present node. Asserts coord->node != -+ NULL. */ -+static inline unsigned coord_num_items(const coord_t * coord) -+{ -+ assert("jmacd-9805", coord->node != NULL); -+ -+ return node_num_items(coord->node); -+} -+ -+/* true if @node is empty */ -+static inline int node_is_empty(const znode * node) -+{ -+ return node_num_items(node) == 0; -+} -+ -+typedef enum { -+ SHIFTED_SOMETHING = 0, -+ SHIFT_NO_SPACE = -E_NODE_FULL, -+ SHIFT_IO_ERROR = -EIO, -+ SHIFT_OOM = -ENOMEM, -+} shift_result; -+ -+extern node_plugin *node_plugin_by_coord(const coord_t * coord); -+extern int is_coord_in_node(const coord_t * coord); -+extern int key_in_node(const reiser4_key *, const coord_t *); -+extern void coord_item_move_to(coord_t * coord, int items); -+extern void coord_unit_move_to(coord_t * coord, int units); -+ -+/* there are two types of repetitive accesses (ra): intra-syscall -+ (local) and inter-syscall (global). Local ra is used when -+ during single syscall we add/delete several items and units in the -+ same place in a tree. Note that plan-A fragments local ra by -+ separating stat-data and file body in key-space. Global ra is -+ used when user does repetitive modifications in the same place in a -+ tree. -+ -+ Our ra implementation serves following purposes: -+ 1 it affects balancing decisions so that next operation in a row -+ can be performed faster; -+ 2 it affects lower-level read-ahead in page-cache; -+ 3 it allows to avoid unnecessary lookups by maintaining some state -+ across several operations (this is only for local ra); -+ 4 it leaves room for lazy-micro-balancing: when we start a sequence of -+ operations they are performed without actually doing any intra-node -+ shifts, until we finish sequence or scope of sequence leaves -+ current node, only then we really pack node (local ra only). -+*/ -+ -+/* another thing that can be useful is to keep per-tree and/or -+ per-process cache of recent lookups. This cache can be organised as a -+ list of block numbers of formatted nodes sorted by starting key in -+ this node. Balancings should invalidate appropriate parts of this -+ cache. 
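The per-process cache suggested above could be as simple as a sorted array searched for the greatest starting key not above the target. A hypothetical sketch, assuming plain longs stand in for keys and block numbers:

    /* Binary search for the last cache entry whose node starts at or
     * before @key; that node is the best candidate to probe first.
     * Entries must stay sorted by start_key, and balancing would have
     * to invalidate affected entries. */
    #include <stdio.h>

    struct entry { long start_key; long block; };

    /* return index of last entry with start_key <= key, or -1 */
    static int lookup_le(const struct entry *e, int n, long key)
    {
        int lo = 0, hi = n - 1, found = -1;
        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;
            if (e[mid].start_key <= key) {
                found = mid;
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        return found;
    }

    int main(void)
    {
        struct entry cache[] = { {0, 10}, {100, 11}, {500, 12} };
        int i = lookup_le(cache, 3, 250);
        if (i >= 0)
            printf("try block %ld first\n", cache[i].block); /* 11 */
        return 0;
    }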
-+*/ -+ -+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key, -+ coord_t * coord, lock_handle * handle, -+ znode_lock_mode lock, lookup_bias bias, -+ tree_level lock_level, tree_level stop_level, -+ __u32 flags, ra_info_t *); -+ -+lookup_result reiser4_object_lookup(struct inode *object, -+ const reiser4_key * key, -+ coord_t * coord, -+ lock_handle * lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level stop_level, -+ __u32 flags, ra_info_t * info); -+ -+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key, -+ reiser4_item_data * data, coord_t * coord, -+ lock_handle * lh, -+ tree_level stop_level, __u32 flags); -+insert_result insert_by_coord(coord_t * coord, -+ reiser4_item_data * data, const reiser4_key * key, -+ lock_handle * lh, __u32); -+insert_result insert_extent_by_coord(coord_t * coord, -+ reiser4_item_data * data, -+ const reiser4_key * key, lock_handle * lh); -+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed); -+int kill_node_content(coord_t * from, coord_t * to, -+ const reiser4_key * from_key, const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ znode * locked_left_neighbor, struct inode *inode, -+ int truncate); -+ -+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data, -+ reiser4_key * key, lock_handle * lh, cop_insert_flag); -+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key, -+ reiser4_item_data * data, unsigned); -+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f); -+int find_new_child_ptr(znode * parent, znode * child, znode * left, -+ coord_t * result); -+ -+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord); -+int shift_left_of_and_including_insert_coord(coord_t * insert_coord); -+ -+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int); -+ -+extern int cut_tree_worker_common(tap_t *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, -+ const reiser4_key * to, struct inode *, int); -+ -+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int); -+extern int check_tree_pointer(const coord_t * pointer, const znode * child); -+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG, -+ znode * left, coord_t * result); -+extern int find_child_ptr(znode * parent, znode * child, coord_t * result); -+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent, -+ znode * child); -+extern znode *child_znode(const coord_t * in_parent, znode * parent, -+ int incore_p, int setup_dkeys_p); -+ -+extern int cbk_cache_init(cbk_cache * cache); -+extern void cbk_cache_done(cbk_cache * cache); -+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree); -+ -+extern char *sprint_address(const reiser4_block_nr * block); -+ -+#if REISER4_DEBUG -+extern void print_coord_content(const char *prefix, coord_t * p); -+extern void reiser4_print_address(const char *prefix, -+ const reiser4_block_nr * block); -+extern void print_tree_rec(const char *prefix, reiser4_tree * tree, -+ __u32 flags); -+extern void check_dkeys(znode *node); -+#else -+#define print_coord_content(p, c) noop 
-+#define reiser4_print_address(p, b) noop -+#endif -+ -+extern void forget_znode(lock_handle * handle); -+extern int deallocate_znode(znode * node); -+ -+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr); -+ -+/* struct used internally to pack all numerous arguments of tree lookup. -+ Used to avoid passing a lot of arguments to helper functions. */ -+typedef struct cbk_handle { -+ /* tree we are in */ -+ reiser4_tree *tree; -+ /* key we are going after */ -+ const reiser4_key *key; -+ /* coord we will store result in */ -+ coord_t *coord; -+ /* type of lock to take on target node */ -+ znode_lock_mode lock_mode; -+ /* lookup bias. See comments at the declaration of lookup_bias */ -+ lookup_bias bias; -+ /* lock level: level starting from which tree traversal starts taking -+ * write locks. */ -+ tree_level lock_level; -+ /* level where search will stop. Either item will be found between -+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be -+ returned. -+ */ -+ tree_level stop_level; -+ /* level we are currently at */ -+ tree_level level; -+ /* block number of @active node. Tree traversal operates on two -+ nodes: active and parent. */ -+ reiser4_block_nr block; -+ /* put here error message to be printed by caller */ -+ const char *error; -+ /* result passed back to caller */ -+ lookup_result result; -+ /* lock handles for active and parent */ -+ lock_handle *parent_lh; -+ lock_handle *active_lh; -+ reiser4_key ld_key; -+ reiser4_key rd_key; -+ /* flags, passed to the cbk routine. Bits of this bitmask are defined -+ in tree.h:cbk_flags enum. */ -+ __u32 flags; -+ ra_info_t *ra_info; -+ struct inode *object; -+} cbk_handle; -+ -+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h); -+ -+/* eottl.c */ -+extern int handle_eottl(cbk_handle *h, int *outcome); -+ -+int lookup_multikey(cbk_handle * handle, int nr_keys); -+int lookup_couple(reiser4_tree * tree, -+ const reiser4_key * key1, const reiser4_key * key2, -+ coord_t * coord1, coord_t * coord2, -+ lock_handle * lh1, lock_handle * lh2, -+ znode_lock_mode lock_mode, lookup_bias bias, -+ tree_level lock_level, tree_level stop_level, __u32 flags, -+ int *result1, int *result2); -+ -+static inline void read_lock_tree(reiser4_tree *tree) -+{ -+ /* check that tree is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(read_locked_tree) && -+ LOCK_CNT_NIL(write_locked_tree))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_stack))); -+ -+ read_lock(&(tree->tree_lock)); -+ -+ LOCK_CNT_INC(read_locked_tree); -+ LOCK_CNT_INC(rw_locked_tree); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void read_unlock_tree(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(read_locked_tree); -+ LOCK_CNT_DEC(rw_locked_tree); -+ LOCK_CNT_DEC(spin_locked); -+ -+ read_unlock(&(tree->tree_lock)); -+} -+ -+static inline void write_lock_tree(reiser4_tree *tree) -+{ -+ /* check that tree is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(read_locked_tree) && -+ LOCK_CNT_NIL(write_locked_tree))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_stack))); -+ -+ 
write_lock(&(tree->tree_lock)); -+ -+ LOCK_CNT_INC(write_locked_tree); -+ LOCK_CNT_INC(rw_locked_tree); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void write_unlock_tree(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(write_locked_tree); -+ LOCK_CNT_DEC(rw_locked_tree); -+ LOCK_CNT_DEC(spin_locked); -+ -+ write_unlock(&(tree->tree_lock)); -+} -+ -+static inline void read_lock_dk(reiser4_tree *tree) -+{ -+ /* check that dk is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(read_locked_dk) && -+ LOCK_CNT_NIL(write_locked_dk))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ read_lock(&((tree)->dk_lock)); -+ -+ LOCK_CNT_INC(read_locked_dk); -+ LOCK_CNT_INC(rw_locked_dk); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void read_unlock_dk(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(read_locked_dk); -+ LOCK_CNT_DEC(rw_locked_dk); -+ LOCK_CNT_DEC(spin_locked); -+ -+ read_unlock(&(tree->dk_lock)); -+} -+ -+static inline void write_lock_dk(reiser4_tree *tree) -+{ -+ /* check that dk is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(read_locked_dk) && -+ LOCK_CNT_NIL(write_locked_dk))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ write_lock(&((tree)->dk_lock)); -+ -+ LOCK_CNT_INC(write_locked_dk); -+ LOCK_CNT_INC(rw_locked_dk); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void write_unlock_dk(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(write_locked_dk); -+ LOCK_CNT_DEC(rw_locked_dk); -+ LOCK_CNT_DEC(spin_locked); -+ -+ write_unlock(&(tree->dk_lock)); -+} -+ -+/* estimate api. Implementation is in estimate.c */ -+reiser4_block_nr estimate_one_insert_item(reiser4_tree *); -+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *); -+reiser4_block_nr estimate_insert_flow(tree_level); -+reiser4_block_nr estimate_one_item_removal(reiser4_tree *); -+reiser4_block_nr calc_estimate_one_insert(tree_level); -+reiser4_block_nr estimate_dirty_cluster(struct inode *); -+reiser4_block_nr estimate_insert_cluster(struct inode *); -+reiser4_block_nr estimate_update_cluster(struct inode *); -+ -+/* __REISER4_TREE_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/tree_mod.c linux-2.6.30/fs/reiser4/tree_mod.c ---- linux-2.6.30.orig/fs/reiser4/tree_mod.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tree_mod.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,386 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * Functions to add/delete new nodes to/from the tree. -+ * -+ * Functions from this file are used by carry (see carry*) to handle: -+ * -+ * . insertion of new formatted node into tree -+ * -+ * . 
addition of new tree root, increasing tree height -+ * -+ * . removing tree root, decreasing tree height -+ * -+ */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/plugin.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_mod.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "super.h" -+ -+#include <linux/err.h> -+ -+static int add_child_ptr(znode * parent, znode * child); -+/* warning only issued if error is not -E_REPEAT */ -+#define ewarning( error, ... ) \ -+ if( ( error ) != -E_REPEAT ) \ -+ warning( __VA_ARGS__ ) -+ -+/* allocate new node on the @level and immediately on the right of @brother. */ -+znode * reiser4_new_node(znode * brother /* existing left neighbor -+ * of new node */, -+ tree_level level /* tree level at which new node is to -+ * be allocated */) -+{ -+ znode *result; -+ int retcode; -+ reiser4_block_nr blocknr; -+ -+ assert("nikita-930", brother != NULL); -+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT); -+ -+ retcode = assign_fake_blocknr_formatted(&blocknr); -+ if (retcode == 0) { -+ result = -+ zget(znode_get_tree(brother), &blocknr, NULL, level, -+ reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(result)) { -+ ewarning(PTR_ERR(result), "nikita-929", -+ "Cannot allocate znode for carry: %li", -+ PTR_ERR(result)); -+ return result; -+ } -+ /* cheap test, can be executed even when debugging is off */ -+ if (!znode_just_created(result)) { -+ warning("nikita-2213", -+ "Allocated already existing block: %llu", -+ (unsigned long long)blocknr); -+ zput(result); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ assert("nikita-931", result != NULL); -+ result->nplug = znode_get_tree(brother)->nplug; -+ assert("nikita-933", result->nplug != NULL); -+ -+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get()); -+ if (retcode == 0) { -+ ZF_SET(result, JNODE_CREATED); -+ zrelse(result); -+ } else { -+ zput(result); -+ result = ERR_PTR(retcode); -+ } -+ } else { -+ /* failure to allocate new node during balancing. -+ This should never happen. Ever. Returning -E_REPEAT -+ is not viable solution, because "out of disk space" -+ is not transient error that will go away by itself. -+ */ -+ ewarning(retcode, "nikita-928", -+ "Cannot allocate block for carry: %i", retcode); -+ result = ERR_PTR(retcode); -+ } -+ assert("nikita-1071", result != NULL); -+ return result; -+} -+ -+/* allocate new root and add it to the tree -+ -+ This helper function is called by add_new_root(). -+ -+*/ -+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ , -+ znode * fake /* "fake" znode */ ) -+{ -+ reiser4_tree *tree = znode_get_tree(old_root); -+ znode *new_root = NULL; /* to shut gcc up */ -+ int result; -+ -+ assert("nikita-1069", old_root != NULL); -+ assert("umka-262", fake != NULL); -+ assert("umka-263", tree != NULL); -+ -+ /* "fake" znode---one always hanging just above current root. This -+ node is locked when new root is created or existing root is -+ deleted. Downward tree traversal takes lock on it before taking -+ lock on a root node. This avoids race conditions with root -+ manipulations. 
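A quick orientation to the locking helpers declared in tree.h above: read_lock_tree()/read_unlock_tree() (and the dk variants) wrap a plain rwlock in LOCK_CNT_* ordering checks. A minimal sketch of the expected calling pattern; the caller peek_root_block() is hypothetical and only for illustration:

    /* Hypothetical reader: sample the current root block number. The
     * tree lock must not already be held, and no lower-priority locks
     * (txnh, dk, stack) may be held when it is taken. */
    static reiser4_block_nr peek_root_block(reiser4_tree *tree)
    {
            reiser4_block_nr root;

            read_lock_tree(tree);     /* checks ordering, takes the rwlock */
            root = tree->root_block;  /* changes only under the write lock */
            read_unlock_tree(tree);
            return root;
    }

The dk helpers follow the same pairing discipline, one priority level lower.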
-+ -+ */ -+ assert("nikita-1348", znode_above_root(fake)); -+ assert("nikita-1211", znode_is_root(old_root)); -+ -+ result = 0; -+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) { -+ warning("nikita-1344", "Tree is too tall: %i", tree->height); -+ /* ext2 returns -ENOSPC when it runs out of free inodes with a -+ following comment (fs/ext2/ialloc.c:441): Is it really -+ ENOSPC? -+ -+ -EXFULL? -EINVAL? -+ */ -+ result = RETERR(-ENOSPC); -+ } else { -+ /* Allocate block for new root. It's not that -+ important where it will be allocated, as root is -+ almost always in memory. Moreover, allocate on -+ flush can be going here. -+ */ -+ assert("nikita-1448", znode_is_root(old_root)); -+ new_root = reiser4_new_node(fake, tree->height + 1); -+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) { -+ lock_handle rlh; -+ -+ init_lh(&rlh); -+ result = -+ longterm_lock_znode(&rlh, new_root, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ parent_coord_t *in_parent; -+ -+ znode_make_dirty(fake); -+ -+ /* new root is a child of "fake" node */ -+ write_lock_tree(tree); -+ -+ ++tree->height; -+ -+ /* recalculate max balance overhead */ -+ tree->estimate_one_insert = -+ estimate_one_insert_item(tree); -+ -+ tree->root_block = *znode_get_block(new_root); -+ in_parent = &new_root->in_parent; -+ init_parent_coord(in_parent, fake); -+ /* manually insert new root into sibling -+ * list. With this all nodes involved into -+ * balancing are connected after balancing is -+ * done---useful invariant to check. */ -+ sibling_list_insert_nolock(new_root, NULL); -+ write_unlock_tree(tree); -+ -+ /* insert into new root pointer to the -+ @old_root. */ -+ assert("nikita-1110", -+ WITH_DATA(new_root, -+ node_is_empty(new_root))); -+ write_lock_dk(tree); -+ znode_set_ld_key(new_root, reiser4_min_key()); -+ znode_set_rd_key(new_root, reiser4_max_key()); -+ write_unlock_dk(tree); -+ if (REISER4_DEBUG) { -+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED); -+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED); -+ ZF_SET(old_root, JNODE_ORPHAN); -+ } -+ result = add_child_ptr(new_root, old_root); -+ done_lh(&rlh); -+ } -+ zrelse(new_root); -+ } -+ } -+ if (result != 0) -+ new_root = ERR_PTR(result); -+ return new_root; -+} -+ -+/* build &reiser4_item_data for inserting child pointer -+ -+ Build &reiser4_item_data that can be later used to insert pointer to @child -+ in its parent. -+ -+*/ -+void build_child_ptr_data(znode * child /* node pointer to which will be -+ * inserted */ , -+ reiser4_item_data * data /* where to store result */ ) -+{ -+ assert("nikita-1116", child != NULL); -+ assert("nikita-1117", data != NULL); -+ -+ /* -+ * NOTE: use address of child's blocknr as address of data to be -+ * inserted. As result of this data gets into on-disk structure in cpu -+ * byte order. internal's create_hook converts it to little endian byte -+ * order. -+ */ -+ data->data = (char *)znode_get_block(child); -+ /* data -> data is kernel space */ -+ data->user = 0; -+ data->length = sizeof(reiser4_block_nr); -+ /* FIXME-VS: hardcoded internal item? */ -+ -+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */ -+ data->iplug = item_plugin_by_id(NODE_POINTER_ID); -+} -+ -+/* add pointer to @child into empty @parent. -+ -+ This is used when pointer to old root is inserted into new root which is -+ empty. 
-+*/ -+static int add_child_ptr(znode * parent, znode * child) -+{ -+ coord_t coord; -+ reiser4_item_data data; -+ int result; -+ reiser4_key key; -+ -+ assert("nikita-1111", parent != NULL); -+ assert("nikita-1112", child != NULL); -+ assert("nikita-1115", -+ znode_get_level(parent) == znode_get_level(child) + 1); -+ -+ result = zload(parent); -+ if (result != 0) -+ return result; -+ assert("nikita-1113", node_is_empty(parent)); -+ coord_init_first_unit(&coord, parent); -+ -+ build_child_ptr_data(child, &data); -+ data.arg = NULL; -+ -+ read_lock_dk(znode_get_tree(parent)); -+ key = *znode_get_ld_key(child); -+ read_unlock_dk(znode_get_tree(parent)); -+ -+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data, -+ NULL); -+ znode_make_dirty(parent); -+ zrelse(parent); -+ return result; -+} -+ -+/* actually remove tree root */ -+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is -+ * being removed */, -+ znode * old_root /* root node that is being -+ * removed */ , -+ znode * new_root /* new root---sole child of -+ * @old_root */, -+ const reiser4_block_nr * new_root_blk /* disk address of -+ * @new_root */) -+{ -+ znode *uber; -+ int result; -+ lock_handle handle_for_uber; -+ -+ assert("umka-265", tree != NULL); -+ assert("nikita-1198", new_root != NULL); -+ assert("nikita-1199", -+ znode_get_level(new_root) + 1 == znode_get_level(old_root)); -+ -+ assert("nikita-1201", znode_is_write_locked(old_root)); -+ -+ assert("nikita-1203", -+ disk_addr_eq(new_root_blk, znode_get_block(new_root))); -+ -+ init_lh(&handle_for_uber); -+ /* obtain and lock "fake" znode protecting changes in tree height. */ -+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, -+ &handle_for_uber); -+ if (result == 0) { -+ uber = handle_for_uber.node; -+ -+ znode_make_dirty(uber); -+ -+ /* don't take long term lock a @new_root. Take spinlock. */ -+ -+ write_lock_tree(tree); -+ -+ tree->root_block = *new_root_blk; -+ --tree->height; -+ -+ /* recalculate max balance overhead */ -+ tree->estimate_one_insert = estimate_one_insert_item(tree); -+ -+ assert("nikita-1202", -+ tree->height == znode_get_level(new_root)); -+ -+ /* new root is child on "fake" node */ -+ init_parent_coord(&new_root->in_parent, uber); -+ ++uber->c_count; -+ -+ /* sibling_list_insert_nolock(new_root, NULL); */ -+ write_unlock_tree(tree); -+ -+ /* reinitialise old root. */ -+ result = node_plugin_by_node(old_root)->init(old_root); -+ znode_make_dirty(old_root); -+ if (result == 0) { -+ assert("nikita-1279", node_is_empty(old_root)); -+ ZF_SET(old_root, JNODE_HEARD_BANSHEE); -+ old_root->c_count = 0; -+ } -+ } -+ done_lh(&handle_for_uber); -+ -+ return result; -+} -+ -+/* remove tree root -+ -+ This function removes tree root, decreasing tree height by one. Tree root -+ and its only child (that is going to become new tree root) are write locked -+ at the entry. -+ -+ To remove tree root we need to take lock on special "fake" znode that -+ protects changes of tree height. See comments in reiser4_add_tree_root() for -+ more on this. -+ -+ Also parent pointers have to be updated in -+ old and new root. To simplify code, function is split into two parts: outer -+ reiser4_kill_tree_root() collects all necessary arguments and calls -+ reiser4_kill_root() to do the actual job. 
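A side note on the invariant these two paths maintain: reiser4_add_tree_root() does ++tree->height after installing the new root, and reiser4_kill_root() above does --tree->height, so the recorded height always matches the level of the current root (the assertion "nikita-1202" checks exactly this). A hypothetical debug helper expressing the invariant, for illustration only:

    /* Hypothetical check, not in the patch: the root's level must
     * equal the recorded tree height. */
    static int root_level_matches_height(reiser4_tree *tree, znode *root)
    {
            return znode_is_root(root) &&
                   znode_get_level(root) == tree->height;
    }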
-+ -+*/ -+int reiser4_kill_tree_root(znode * old_root /* tree root that we are -+ removing*/) -+{ -+ int result; -+ coord_t down_link; -+ znode *new_root; -+ reiser4_tree *tree; -+ -+ assert("umka-266", current_tree != NULL); -+ assert("nikita-1194", old_root != NULL); -+ assert("nikita-1196", znode_is_root(old_root)); -+ assert("nikita-1200", node_num_items(old_root) == 1); -+ assert("nikita-1401", znode_is_write_locked(old_root)); -+ -+ coord_init_first_unit(&down_link, old_root); -+ -+ tree = znode_get_tree(old_root); -+ new_root = child_znode(&down_link, old_root, 0, 1); -+ if (!IS_ERR(new_root)) { -+ result = -+ reiser4_kill_root(tree, old_root, new_root, -+ znode_get_block(new_root)); -+ zput(new_root); -+ } else -+ result = PTR_ERR(new_root); -+ -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/tree_mod.h linux-2.6.30/fs/reiser4/tree_mod.h ---- linux-2.6.30.orig/fs/reiser4/tree_mod.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tree_mod.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,29 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for -+ * comments. */ -+ -+#if !defined( __REISER4_TREE_MOD_H__ ) -+#define __REISER4_TREE_MOD_H__ -+ -+#include "forward.h" -+ -+znode *reiser4_new_node(znode * brother, tree_level level); -+znode *reiser4_add_tree_root(znode * old_root, znode * fake); -+int reiser4_kill_tree_root(znode * old_root); -+void build_child_ptr_data(znode * child, reiser4_item_data * data); -+ -+/* __REISER4_TREE_MOD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/tree_walk.c linux-2.6.30/fs/reiser4/tree_walk.c ---- linux-2.6.30.orig/fs/reiser4/tree_walk.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tree_walk.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,927 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Routines and macros to: -+ -+ get_left_neighbor() -+ -+ get_right_neighbor() -+ -+ get_parent() -+ -+ get_first_child() -+ -+ get_last_child() -+ -+ various routines to walk the whole tree and do things to it like -+ repack it, or move it to tertiary storage. Please make them as -+ generic as is reasonable. -+ -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "super.h" -+ -+/* These macros are used internally in tree_walk.c in attempt to make -+ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor, -+ lock_left_neighbor */ -+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off))) -+#define FIELD_OFFSET(name) offsetof(znode, name) -+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node) -+#define LEFT_PTR_OFFSET FIELD_OFFSET(left) -+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right) -+ -+/* This is the generic procedure to get and lock `generic' neighbor (left or -+ right neighbor or parent). 
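The offset parameterization used by lock_neighbor() is worth a second look; here is the same trick in a self-contained sketch with simplified types (illustration only, not patch code):

    #include <stddef.h>

    struct node {
            struct node *left;
            struct node *right;
            struct node *parent;
    };

    /* Same idea as GET_NODE_BY_PTR_OFFSET: one walker serves ->left,
     * ->right and ->parent, selected by a byte offset into the struct. */
    static struct node *neighbor_by_offset(struct node *n, size_t off)
    {
            return *(struct node **)((char *)n + off);
    }

    /* neighbor_by_offset(n, offsetof(struct node, left)) == n->left */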
It implements common algorithm for all cases of -+ getting lock on neighbor node, only znode structure field is different in -+ each case. This is parameterized by ptr_offset argument, which is byte -+ offset for the pointer to the desired neighbor within the current node's -+ znode structure. This function should be called with the tree lock held */ -+static int lock_neighbor( -+ /* resulting lock handle */ -+ lock_handle * result, -+ /* znode to lock */ -+ znode * node, -+ /* pointer to neighbor (or parent) znode field offset, in bytes from -+ the base address of znode structure */ -+ int ptr_offset, -+ /* lock mode for longterm_lock_znode call */ -+ znode_lock_mode mode, -+ /* lock request for longterm_lock_znode call */ -+ znode_lock_request req, -+ /* GN_* flags */ -+ int flags, int rlocked) -+{ -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *neighbor; -+ int ret; -+ -+ assert("umka-236", node != NULL); -+ assert("umka-237", tree != NULL); -+ assert_rw_locked(&(tree->tree_lock)); -+ -+ if (flags & GN_TRY_LOCK) -+ req |= ZNODE_LOCK_NONBLOCK; -+ if (flags & GN_SAME_ATOM) -+ req |= ZNODE_LOCK_DONT_FUSE; -+ -+ /* get neighbor's address by using of sibling link, quit while loop -+ (and return) if link is not available. */ -+ while (1) { -+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset); -+ -+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if -+ * node pointed by it is not connected. -+ * -+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected" -+ * check and allows passing reference to not connected znode to -+ * subsequent longterm_lock_znode() call. This kills possible -+ * busy loop if we are trying to get longterm lock on locked but -+ * not yet connected parent node. */ -+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED) -+ || znode_is_connected(neighbor))) { -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ -+ /* protect it from deletion. */ -+ zref(neighbor); -+ -+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); -+ -+ ret = longterm_lock_znode(result, neighbor, mode, req); -+ -+ /* The lock handle obtains its own reference, release the one from above. */ -+ zput(neighbor); -+ -+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); -+ -+ /* restart if node we got reference to is being -+ invalidated. we should not get reference to this node -+ again. */ -+ if (ret == -EINVAL) -+ continue; -+ if (ret) -+ return ret; -+ -+ /* check if neighbor link still points to just locked znode; -+ the link could have been changed while the process slept. */ -+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset)) -+ return 0; -+ -+ /* znode was locked by mistake; unlock it and restart locking -+ process from beginning. */ -+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); -+ longterm_unlock_znode(result); -+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); -+ } -+} -+ -+/* get parent node with longterm lock, accepts GN* flags. 
*/ -+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ , -+ znode * node /* child node */ , -+ znode_lock_mode mode -+ /* type of lock: read or write */ , -+ int flags /* GN_* flags */ ) -+{ -+ int result; -+ -+ read_lock_tree(znode_get_tree(node)); -+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode, -+ ZNODE_LOCK_HIPRI, flags, 1); -+ read_unlock_tree(znode_get_tree(node)); -+ return result; -+} -+ -+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT -+ bit in @flags parameter */ -+/* Audited by: umka (2002.06.14) */ -+static inline int -+lock_side_neighbor(lock_handle * result, -+ znode * node, znode_lock_mode mode, int flags, int rlocked) -+{ -+ int ret; -+ int ptr_offset; -+ znode_lock_request req; -+ -+ if (flags & GN_GO_LEFT) { -+ ptr_offset = LEFT_PTR_OFFSET; -+ req = ZNODE_LOCK_LOPRI; -+ } else { -+ ptr_offset = RIGHT_PTR_OFFSET; -+ req = ZNODE_LOCK_HIPRI; -+ } -+ -+ ret = -+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked); -+ -+ if (ret == -E_NO_NEIGHBOR) /* if we walk left or right -E_NO_NEIGHBOR does not -+ * guarantee that neighbor is absent in the -+ * tree; in this case we return -ENOENT -- -+ * means neighbor at least not found in -+ * cache */ -+ return RETERR(-ENOENT); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+ -+int check_sibling_list(znode * node) -+{ -+ znode *scan; -+ znode *next; -+ -+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree)); -+ -+ if (node == NULL) -+ return 1; -+ -+ if (ZF_ISSET(node, JNODE_RIP)) -+ return 1; -+ -+ assert("nikita-3270", node != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock)); -+ -+ for (scan = node; znode_is_left_connected(scan); scan = next) { -+ next = scan->left; -+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { -+ assert("nikita-3271", znode_is_right_connected(next)); -+ assert("nikita-3272", next->right == scan); -+ } else -+ break; -+ } -+ for (scan = node; znode_is_right_connected(scan); scan = next) { -+ next = scan->right; -+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { -+ assert("nikita-3273", znode_is_left_connected(next)); -+ assert("nikita-3274", next->left == scan); -+ } else -+ break; -+ } -+ return 1; -+} -+ -+#endif -+ -+/* Znode sibling pointers maintenence. */ -+ -+/* Znode sibling pointers are established between any neighbored nodes which are -+ in cache. There are two znode state bits (JNODE_LEFT_CONNECTED, -+ JNODE_RIGHT_CONNECTED), if left or right sibling pointer contains actual -+ value (even NULL), corresponded JNODE_*_CONNECTED bit is set. -+ -+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing) -+ take care about searching (hash table lookup may be required) of znode -+ neighbors, establishing sibling pointers between them and setting -+ JNODE_*_CONNECTED state bits. */ -+ -+/* adjusting of sibling pointers and `connected' states for two -+ neighbors; works if one neighbor is NULL (was not found). 
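In pointer terms the effect of link_left_and_right(), which follows, is simply (sketched, not patch code; the JNODE_HEARD_BANSHEE re-linking special case is omitted):

    /*
     * link_left_and_right(left, right):
     *
     *   left->right = right;   ZF_SET(left,  JNODE_RIGHT_CONNECTED);
     *   right->left = left;    ZF_SET(right, JNODE_LEFT_CONNECTED);
     *
     * Either argument may be NULL; only the other side is then updated,
     * and "connected" records that no cached neighbor exists on that side.
     */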
*/ -+ -+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */ -+void link_left_and_right(znode * left, znode * right) -+{ -+ assert("nikita-3275", check_sibling_list(left)); -+ assert("nikita-3275", check_sibling_list(right)); -+ -+ if (left != NULL) { -+ if (left->right == NULL) { -+ left->right = right; -+ ZF_SET(left, JNODE_RIGHT_CONNECTED); -+ -+ ON_DEBUG(left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ -+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE) -+ && left->right != right) { -+ -+ ON_DEBUG(left->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ left->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ left->right->left = NULL; -+ left->right = right; -+ ZF_SET(left, JNODE_RIGHT_CONNECTED); -+ } else -+ /* -+ * there is a race condition in renew_sibling_link() -+ * and assertions below check that it is only one -+ * there. Thread T1 calls renew_sibling_link() without -+ * GN_NO_ALLOC flag. zlook() doesn't find neighbor -+ * node, but before T1 gets to the -+ * link_left_and_right(), another thread T2 creates -+ * neighbor node and connects it. check for -+ * left->right == NULL above protects T1 from -+ * overwriting correct left->right pointer installed -+ * by T2. -+ */ -+ assert("nikita-3302", -+ right == NULL || left->right == right); -+ } -+ if (right != NULL) { -+ if (right->left == NULL) { -+ right->left = left; -+ ZF_SET(right, JNODE_LEFT_CONNECTED); -+ -+ ON_DEBUG(right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ -+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE) -+ && right->left != left) { -+ -+ ON_DEBUG(right->left->right_version = -+ atomic_inc_return(&delim_key_version); -+ right->left_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ right->left->right = NULL; -+ right->left = left; -+ ZF_SET(right, JNODE_LEFT_CONNECTED); -+ -+ } else -+ assert("nikita-3303", -+ left == NULL || right->left == left); -+ } -+ assert("nikita-3275", check_sibling_list(left)); -+ assert("nikita-3275", check_sibling_list(right)); -+} -+ -+/* Audited by: umka (2002.06.14) */ -+static void link_znodes(znode * first, znode * second, int to_left) -+{ -+ if (to_left) -+ link_left_and_right(second, first); -+ else -+ link_left_and_right(first, second); -+} -+ -+/* getting of next (to left or to right, depend on gn_to_left bit in flags) -+ coord's unit position in horizontal direction, even across node -+ boundary. Should be called under tree lock, it protects nonexistence of -+ sibling link on parent level, if lock_side_neighbor() fails with -+ -ENOENT. */ -+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags) -+{ -+ int ret; -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("umka-243", coord != NULL); -+ assert("umka-244", handle != NULL); -+ assert("zam-1069", handle->node == NULL); -+ -+ ret = -+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) : -+ coord_next_unit(coord); -+ if (!ret) -+ return 0; -+ -+ ret = -+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0); -+ if (ret) -+ return ret; -+ -+ node = handle->node; -+ tree = znode_get_tree(node); -+ write_unlock_tree(tree); -+ -+ coord_init_zero(coord); -+ -+ /* We avoid synchronous read here if it is specified by flag. 
*/ -+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) { -+ ret = jstartio(ZJNODE(handle->node)); -+ if (!ret) -+ ret = -E_REPEAT; -+ goto error_locked; -+ } -+ -+ /* corresponded zrelse() should be called by the clients of -+ far_next_coord(), in place when this node gets unlocked. */ -+ ret = zload(handle->node); -+ if (ret) -+ goto error_locked; -+ -+ if (flags & GN_GO_LEFT) -+ coord_init_last_unit(coord, node); -+ else -+ coord_init_first_unit(coord, node); -+ -+ if (0) { -+ error_locked: -+ longterm_unlock_znode(handle); -+ } -+ write_lock_tree(tree); -+ return ret; -+} -+ -+/* Very significant function which performs a step in horizontal direction -+ when sibling pointer is not available. Actually, it is only function which -+ does it. -+ Note: this function does not restore locking status at exit, -+ caller should does care about proper unlocking and zrelsing */ -+static int -+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child, -+ tree_level level, int flags, int *nr_locked) -+{ -+ int ret; -+ int to_left = flags & GN_GO_LEFT; -+ reiser4_block_nr da; -+ /* parent of the neighbor node; we set it to parent until not sharing -+ of one parent between child and neighbor node is detected */ -+ znode *side_parent = coord->node; -+ reiser4_tree *tree = znode_get_tree(child); -+ znode *neighbor = NULL; -+ -+ assert("umka-245", coord != NULL); -+ assert("umka-246", handle != NULL); -+ assert("umka-247", child != NULL); -+ assert("umka-303", tree != NULL); -+ -+ init_lh(handle); -+ write_lock_tree(tree); -+ ret = far_next_coord(coord, handle, flags); -+ -+ if (ret) { -+ if (ret != -ENOENT) { -+ write_unlock_tree(tree); -+ return ret; -+ } -+ } else { -+ item_plugin *iplug; -+ -+ if (handle->node != NULL) { -+ (*nr_locked)++; -+ side_parent = handle->node; -+ } -+ -+ /* does coord object points to internal item? We do not -+ support sibling pointers between znode for formatted and -+ unformatted nodes and return -E_NO_NEIGHBOR in that case. */ -+ iplug = item_plugin_by_coord(coord); -+ if (!item_is_internal(coord)) { -+ link_znodes(child, NULL, to_left); -+ write_unlock_tree(tree); -+ /* we know there can't be formatted neighbor */ -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ write_unlock_tree(tree); -+ -+ iplug->s.internal.down_link(coord, NULL, &da); -+ -+ if (flags & GN_NO_ALLOC) { -+ neighbor = zlook(tree, &da); -+ } else { -+ neighbor = -+ zget(tree, &da, side_parent, level, -+ reiser4_ctx_gfp_mask_get()); -+ } -+ -+ if (IS_ERR(neighbor)) { -+ ret = PTR_ERR(neighbor); -+ return ret; -+ } -+ -+ if (neighbor) -+ /* update delimiting keys */ -+ set_child_delimiting_keys(coord->node, coord, neighbor); -+ -+ write_lock_tree(tree); -+ } -+ -+ if (likely(neighbor == NULL || -+ (znode_get_level(child) == znode_get_level(neighbor) -+ && child != neighbor))) -+ link_znodes(child, neighbor, to_left); -+ else { -+ warning("nikita-3532", -+ "Sibling nodes on the different levels: %i != %i\n", -+ znode_get_level(child), znode_get_level(neighbor)); -+ ret = RETERR(-EIO); -+ } -+ -+ write_unlock_tree(tree); -+ -+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */ -+ if (neighbor != NULL && (flags & GN_NO_ALLOC)) -+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */ -+ zput(neighbor); -+ -+ return ret; -+} -+ -+/* This function is for establishing of one side relation. 
*/ -+/* Audited by: umka (2002.06.14) */ -+static int connect_one_side(coord_t * coord, znode * node, int flags) -+{ -+ coord_t local; -+ lock_handle handle; -+ int nr_locked; -+ int ret; -+ -+ assert("umka-248", coord != NULL); -+ assert("umka-249", node != NULL); -+ -+ coord_dup_nocheck(&local, coord); -+ -+ init_lh(&handle); -+ -+ ret = -+ renew_sibling_link(&local, &handle, node, znode_get_level(node), -+ flags | GN_NO_ALLOC, &nr_locked); -+ -+ if (handle.node != NULL) { -+ /* complementary operations for zload() and lock() in far_next_coord() */ -+ zrelse(handle.node); -+ longterm_unlock_znode(&handle); -+ } -+ -+ /* we catch error codes which are not interesting for us because we -+ run renew_sibling_link() only for znode connection. */ -+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR) -+ return 0; -+ -+ return ret; -+} -+ -+/* if @child is not in `connected' state, performs hash searches for left and -+ right neighbor nodes and establishes horizontal sibling links */ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+int connect_znode(coord_t * parent_coord, znode * child) -+{ -+ reiser4_tree *tree = znode_get_tree(child); -+ int ret = 0; -+ -+ assert("zam-330", parent_coord != NULL); -+ assert("zam-331", child != NULL); -+ assert("zam-332", parent_coord->node != NULL); -+ assert("umka-305", tree != NULL); -+ -+ /* it is trivial to `connect' root znode because it can't have -+ neighbors */ -+ if (znode_above_root(parent_coord->node)) { -+ child->left = NULL; -+ child->right = NULL; -+ ZF_SET(child, JNODE_LEFT_CONNECTED); -+ ZF_SET(child, JNODE_RIGHT_CONNECTED); -+ -+ ON_DEBUG(child->left_version = -+ atomic_inc_return(&delim_key_version); -+ child->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ return 0; -+ } -+ -+ /* load parent node */ -+ coord_clear_iplug(parent_coord); -+ ret = zload(parent_coord->node); -+ -+ if (ret != 0) -+ return ret; -+ -+ /* protect `connected' state check by tree_lock */ -+ read_lock_tree(tree); -+ -+ if (!znode_is_right_connected(child)) { -+ read_unlock_tree(tree); -+ /* connect right (default is right) */ -+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC); -+ if (ret) -+ goto zrelse_and_ret; -+ -+ read_lock_tree(tree); -+ } -+ -+ ret = znode_is_left_connected(child); -+ -+ read_unlock_tree(tree); -+ -+ if (!ret) { -+ ret = -+ connect_one_side(parent_coord, child, -+ GN_NO_ALLOC | GN_GO_LEFT); -+ } else -+ ret = 0; -+ -+ zrelse_and_ret: -+ zrelse(parent_coord->node); -+ -+ return ret; -+} -+ -+/* this function is like renew_sibling_link() but allocates neighbor node if -+ it doesn't exist and `connects' it. It may require making two steps in -+ horizontal direction, first one for neighbor node finding/allocation, -+ second one is for finding neighbor of neighbor to connect freshly allocated -+ znode. */ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+static int -+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags) -+{ -+ coord_t local; -+ lock_handle empty[2]; -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *neighbor = NULL; -+ int nr_locked = 0; -+ int ret; -+ -+ assert("umka-250", coord != NULL); -+ assert("umka-251", node != NULL); -+ assert("umka-307", tree != NULL); -+ assert("umka-308", level <= tree->height); -+ -+ /* umka (2002.06.14) -+ Here probably should be a check for given "level" validness. 
-+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT); -+ */ -+ -+ coord_dup(&local, coord); -+ -+ ret = -+ renew_sibling_link(&local, &empty[0], node, level, -+ flags & ~GN_NO_ALLOC, &nr_locked); -+ if (ret) -+ goto out; -+ -+ /* tree lock is not needed here because we keep parent node(s) locked -+ and reference to neighbor znode incremented */ -+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right; -+ -+ read_lock_tree(tree); -+ ret = znode_is_connected(neighbor); -+ read_unlock_tree(tree); -+ if (ret) { -+ ret = 0; -+ goto out; -+ } -+ -+ ret = -+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level, -+ flags | GN_NO_ALLOC, &nr_locked); -+ /* second renew_sibling_link() call is used for znode connection only, -+ so we can live with these errors */ -+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret) -+ ret = 0; -+ -+ out: -+ -+ for (--nr_locked; nr_locked >= 0; --nr_locked) { -+ zrelse(empty[nr_locked].node); -+ longterm_unlock_znode(&empty[nr_locked]); -+ } -+ -+ if (neighbor != NULL) -+ /* decrement znode reference counter without actually -+ releasing it. */ -+ atomic_dec(&ZJNODE(neighbor)->x_count); -+ -+ return ret; -+} -+ -+/* -+ reiser4_get_neighbor() -- lock node's neighbor. -+ -+ reiser4_get_neighbor() locks node's neighbor (left or right one, depends on -+ given parameter) using sibling link to it. If sibling link is not available -+ (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one -+ level up for information about neighbor's disk address. We lock node's -+ parent, if it is common parent for both 'node' and its neighbor, neighbor's -+ disk address is in next (to left or to right) down link from link that points -+ to original node. If not, we need to lock parent's neighbor, read its content -+ and take first(last) downlink with neighbor's disk address. That locking -+ could be done by using sibling link and lock_neighbor() function, if sibling -+ link exists. In another case we have to go level up again until we find -+ common parent or valid sibling link. Then go down -+ allocating/connecting/locking/reading nodes until neighbor of first one is -+ locked. -+ -+ @neighbor: result lock handle, -+ @node: a node which we lock neighbor of, -+ @lock_mode: lock mode {LM_READ, LM_WRITE}, -+ @flags: logical OR of {GN_*} (see description above) subset. -+ -+ @return: 0 if success, negative value if lock was impossible due to an error -+ or lack of neighbor node. 
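A minimal sketch of the calling convention just described; the caller lock_right_of() is hypothetical and error handling is elided:

    /* Hypothetical caller: read-lock the right neighbor of @node,
     * allowing ascent through parents when the sibling pointer is
     * not cached. */
    static int lock_right_of(znode *node, lock_handle *lh)
    {
            int ret;

            init_lh(lh);
            ret = reiser4_get_neighbor(lh, node, ZNODE_READ_LOCK,
                                       GN_CAN_USE_UPPER_LEVELS);
            if (ret != 0)
                    return ret;  /* e.g. -ENOENT or -E_NO_NEIGHBOR */
            /* ... use lh->node under the long-term lock ... */
            done_lh(lh);
            return 0;
    }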
-+*/ -+ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+int -+reiser4_get_neighbor(lock_handle * neighbor, znode * node, -+ znode_lock_mode lock_mode, int flags) -+{ -+ reiser4_tree *tree = znode_get_tree(node); -+ lock_handle path[REAL_MAX_ZTREE_HEIGHT]; -+ -+ coord_t coord; -+ -+ tree_level base_level; -+ tree_level h = 0; -+ int ret; -+ -+ assert("umka-252", tree != NULL); -+ assert("umka-253", neighbor != NULL); -+ assert("umka-254", node != NULL); -+ -+ base_level = znode_get_level(node); -+ -+ assert("umka-310", base_level <= tree->height); -+ -+ coord_init_zero(&coord); -+ -+ again: -+ /* first, we try to use simple lock_neighbor() which requires sibling -+ link existence */ -+ read_lock_tree(tree); -+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1); -+ read_unlock_tree(tree); -+ if (!ret) { -+ /* load znode content if it was specified */ -+ if (flags & GN_LOAD_NEIGHBOR) { -+ ret = zload(node); -+ if (ret) -+ longterm_unlock_znode(neighbor); -+ } -+ return ret; -+ } -+ -+ /* only -ENOENT means we may look upward and try to connect -+ @node with its neighbor (if @flags allow us to do it) */ -+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS)) -+ return ret; -+ -+ /* before establishing of sibling link we lock parent node; it is -+ required by renew_neighbor() to work. */ -+ init_lh(&path[0]); -+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK); -+ if (ret) -+ return ret; -+ if (znode_above_root(path[0].node)) { -+ longterm_unlock_znode(&path[0]); -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ -+ while (1) { -+ znode *child = (h == 0) ? node : path[h - 1].node; -+ znode *parent = path[h].node; -+ -+ ret = zload(parent); -+ if (ret) -+ break; -+ -+ ret = find_child_ptr(parent, child, &coord); -+ -+ if (ret) { -+ zrelse(parent); -+ break; -+ } -+ -+ /* try to establish missing sibling link */ -+ ret = renew_neighbor(&coord, child, h + base_level, flags); -+ -+ zrelse(parent); -+ -+ switch (ret) { -+ case 0: -+ /* unlocking of parent znode prevents simple -+ deadlock situation */ -+ done_lh(&path[h]); -+ -+ /* depend on tree level we stay on we repeat first -+ locking attempt ... */ -+ if (h == 0) -+ goto again; -+ -+ /* ... or repeat establishing of sibling link at -+ one level below. */ -+ --h; -+ break; -+ -+ case -ENOENT: -+ /* sibling link is not available -- we go -+ upward. */ -+ init_lh(&path[h + 1]); -+ ret = -+ reiser4_get_parent(&path[h + 1], parent, -+ ZNODE_READ_LOCK); -+ if (ret) -+ goto fail; -+ ++h; -+ if (znode_above_root(path[h].node)) { -+ ret = RETERR(-E_NO_NEIGHBOR); -+ goto fail; -+ } -+ break; -+ -+ case -E_DEADLOCK: -+ /* there was lock request from hi-pri locker. if -+ it is possible we unlock last parent node and -+ re-lock it again. */ -+ for (; reiser4_check_deadlock(); h--) { -+ done_lh(&path[h]); -+ if (h == 0) -+ goto fail; -+ } -+ -+ break; -+ -+ default: /* other errors. 
*/ -+ goto fail; -+ } -+ } -+ fail: -+ ON_DEBUG(check_lock_node_data(node)); -+ ON_DEBUG(check_lock_data()); -+ -+ /* unlock path */ -+ do { -+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto -+ fail; path[0] is already done_lh-ed, therefore -+ longterm_unlock_znode(&path[h]); is not applicable */ -+ done_lh(&path[h]); -+ --h; -+ } while (h + 1 != 0); -+ -+ return ret; -+} -+ -+/* remove node from sibling list */ -+/* Audited by: umka (2002.06.14) */ -+void sibling_list_remove(znode * node) -+{ -+ reiser4_tree *tree; -+ -+ tree = znode_get_tree(node); -+ assert("umka-255", node != NULL); -+ assert_rw_write_locked(&(tree->tree_lock)); -+ assert("nikita-3275", check_sibling_list(node)); -+ -+ write_lock_dk(tree); -+ if (znode_is_right_connected(node) && node->right != NULL && -+ znode_is_left_connected(node) && node->left != NULL) { -+ assert("zam-32245", -+ keyeq(znode_get_rd_key(node), -+ znode_get_ld_key(node->right))); -+ znode_set_rd_key(node->left, znode_get_ld_key(node->right)); -+ } -+ write_unlock_dk(tree); -+ -+ if (znode_is_right_connected(node) && node->right != NULL) { -+ assert("zam-322", znode_is_left_connected(node->right)); -+ node->right->left = node->left; -+ ON_DEBUG(node->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ if (znode_is_left_connected(node) && node->left != NULL) { -+ assert("zam-323", znode_is_right_connected(node->left)); -+ node->left->right = node->right; -+ ON_DEBUG(node->left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ -+ ZF_CLR(node, JNODE_LEFT_CONNECTED); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ ON_DEBUG(node->left = node->right = NULL; -+ node->left_version = atomic_inc_return(&delim_key_version); -+ node->right_version = atomic_inc_return(&delim_key_version);); -+ assert("nikita-3276", check_sibling_list(node)); -+} -+ -+/* disconnect node from sibling list */ -+void sibling_list_drop(znode * node) -+{ -+ znode *right; -+ znode *left; -+ -+ assert("nikita-2464", node != NULL); -+ assert("nikita-3277", check_sibling_list(node)); -+ -+ right = node->right; -+ if (right != NULL) { -+ assert("nikita-2465", znode_is_left_connected(right)); -+ right->left = NULL; -+ ON_DEBUG(right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ left = node->left; -+ if (left != NULL) { -+ assert("zam-323", znode_is_right_connected(left)); -+ left->right = NULL; -+ ON_DEBUG(left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ ZF_CLR(node, JNODE_LEFT_CONNECTED); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ ON_DEBUG(node->left = node->right = NULL; -+ node->left_version = atomic_inc_return(&delim_key_version); -+ node->right_version = atomic_inc_return(&delim_key_version);); -+} -+ -+/* Insert new node into sibling list. Regular balancing inserts new node -+ after (at right side) existing and locked node (@before), except one case -+ of adding new tree root node. @before should be NULL in that case. 
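The pointer motion of the insert that follows, sketched (illustration only):

    /*
     * sibling_list_insert_nolock(new, before):
     *
     *   before <-> old_right   becomes   before <-> new <-> old_right
     *
     * i.e. new->left = before, new->right = before->right, and the
     * back-pointers of both neighbors are updated. With @before == NULL
     * (the new-tree-root case) both pointers stay NULL, but @new is
     * still marked JNODE_LEFT_CONNECTED/JNODE_RIGHT_CONNECTED.
     */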
*/ -+void sibling_list_insert_nolock(znode * new, znode * before) -+{ -+ assert("zam-334", new != NULL); -+ assert("nikita-3298", !znode_is_left_connected(new)); -+ assert("nikita-3299", !znode_is_right_connected(new)); -+ assert("nikita-3300", new->left == NULL); -+ assert("nikita-3301", new->right == NULL); -+ assert("nikita-3278", check_sibling_list(new)); -+ assert("nikita-3279", check_sibling_list(before)); -+ -+ if (before != NULL) { -+ assert("zam-333", znode_is_connected(before)); -+ new->right = before->right; -+ new->left = before; -+ ON_DEBUG(new->right_version = -+ atomic_inc_return(&delim_key_version); -+ new->left_version = -+ atomic_inc_return(&delim_key_version);); -+ if (before->right != NULL) { -+ before->right->left = new; -+ ON_DEBUG(before->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ before->right = new; -+ ON_DEBUG(before->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } else { -+ new->right = NULL; -+ new->left = NULL; -+ ON_DEBUG(new->right_version = -+ atomic_inc_return(&delim_key_version); -+ new->left_version = -+ atomic_inc_return(&delim_key_version);); -+ } -+ ZF_SET(new, JNODE_LEFT_CONNECTED); -+ ZF_SET(new, JNODE_RIGHT_CONNECTED); -+ assert("nikita-3280", check_sibling_list(new)); -+ assert("nikita-3281", check_sibling_list(before)); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/tree_walk.h linux-2.6.30/fs/reiser4/tree_walk.h ---- linux-2.6.30.orig/fs/reiser4/tree_walk.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/tree_walk.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,125 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* definitions of reiser4 tree walk functions */ -+ -+#ifndef __FS_REISER4_TREE_WALK_H__ -+#define __FS_REISER4_TREE_WALK_H__ -+ -+#include "debug.h" -+#include "forward.h" -+ -+/* establishes horizontal links between cached znodes */ -+int connect_znode(coord_t * coord, znode * node); -+ -+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor()) -+ have the following common arguments: -+ -+ return codes: -+ -+ @return : 0 - OK, -+ -+ZAM-FIXME-HANS: wrong return code name. Change them all. -+ -ENOENT - neighbor is not in cache, what is detected by sibling -+ link absence. -+ -+ -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be -+ found (because we are left-/right- most node of the -+ tree, for example). Also, this return code is for -+ reiser4_get_parent() when we see no parent link -- it -+ means that our node is root node. -+ -+ -E_DEADLOCK - deadlock detected (request from high-priority process -+ received), other error codes are conformed to -+ /usr/include/asm/errno.h . -+*/ -+ -+int -+reiser4_get_parent_flags(lock_handle * result, znode * node, -+ znode_lock_mode mode, int flags); -+ -+/* bits definition for reiser4_get_neighbor function `flags' arg. 
*/ -+typedef enum { -+ /* If sibling pointer is NULL, this flag allows get_neighbor() to try to -+ * find not allocated not connected neigbor by going though upper -+ * levels */ -+ GN_CAN_USE_UPPER_LEVELS = 0x1, -+ /* locking left neighbor instead of right one */ -+ GN_GO_LEFT = 0x2, -+ /* automatically load neighbor node content */ -+ GN_LOAD_NEIGHBOR = 0x4, -+ /* return -E_REPEAT if can't lock */ -+ GN_TRY_LOCK = 0x8, -+ /* used internally in tree_walk.c, causes renew_sibling to not -+ allocate neighbor znode, but only search for it in znode cache */ -+ GN_NO_ALLOC = 0x10, -+ /* do not go across atom boundaries */ -+ GN_SAME_ATOM = 0x20, -+ /* allow to lock not connected nodes */ -+ GN_ALLOW_NOT_CONNECTED = 0x40, -+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */ -+ GN_ASYNC = 0x80 -+} znode_get_neigbor_flags; -+ -+/* A commonly used wrapper for reiser4_get_parent_flags(). */ -+static inline int reiser4_get_parent(lock_handle * result, znode * node, -+ znode_lock_mode mode) -+{ -+ return reiser4_get_parent_flags(result, node, mode, -+ GN_ALLOW_NOT_CONNECTED); -+} -+ -+int reiser4_get_neighbor(lock_handle * neighbor, znode * node, -+ znode_lock_mode lock_mode, int flags); -+ -+/* there are wrappers for most common usages of reiser4_get_neighbor() */ -+static inline int -+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode, -+ int flags) -+{ -+ return reiser4_get_neighbor(result, node, lock_mode, -+ flags | GN_GO_LEFT); -+} -+ -+static inline int -+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode, -+ int flags) -+{ -+ ON_DEBUG(check_lock_node_data(node)); -+ ON_DEBUG(check_lock_data()); -+ return reiser4_get_neighbor(result, node, lock_mode, -+ flags & (~GN_GO_LEFT)); -+} -+ -+extern void sibling_list_remove(znode * node); -+extern void sibling_list_drop(znode * node); -+extern void sibling_list_insert_nolock(znode * new, znode * before); -+extern void link_left_and_right(znode * left, znode * right); -+ -+/* Functions called by tree_walk() when tree_walk() ... */ -+struct tree_walk_actor { -+ /* ... meets a formatted node, */ -+ int (*process_znode) (tap_t *, void *); -+ /* ... meets an extent, */ -+ int (*process_extent) (tap_t *, void *); -+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by -+ * node or extent processing functions. */ -+ int (*before) (void *); -+}; -+ -+#if REISER4_DEBUG -+int check_sibling_list(znode * node); -+#else -+#define check_sibling_list(n) (1) -+#endif -+ -+#endif /* __FS_REISER4_TREE_WALK_H__ */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/txnmgr.c linux-2.6.30/fs/reiser4/txnmgr.c ---- linux-2.6.30.orig/fs/reiser4/txnmgr.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/txnmgr.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,3164 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Joshua MacDonald wrote the first draft of this code. */ -+ -+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a -+filesystem scales only as well as its worst locking design. You need to -+substantially restructure this code. Josh was not as experienced a programmer -+as you. 
Particularly review how the locking style differs from what you did -+for znodes using hi-lo priority locking, and present to me an opinion on -+whether the differences are well founded. */ -+ -+/* I cannot help but disagree with the sentiment above. Locking of the -+ * transaction manager is _not_ badly designed, and, at the very least, is not -+ * the scaling bottleneck. The scaling bottleneck is _exactly_ hi-lo priority -+ * locking on znodes, especially on the root node of the tree. --nikita, -+ * 2003.10.13 */ -+ -+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The -+ txnmgr processes capture_block requests and manages the relationship between jnodes and -+ atoms through the various stages of a transcrash, and it also oversees the fusion and -+ capture-on-copy processes. The main difficulty with this task is maintaining a -+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the -+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle -+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you -+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies -+ that any time you check the atom-pointer of a jnode or handle and then try to lock that -+ atom, you must use trylock() and possibly reverse the order. -+ -+ This code implements the design documented at: -+ -+ http://namesys.com/txn-doc.html -+ -+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the -+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this -+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12 -+year old --- define all technical terms used. -+ -+*/ -+ -+/* Thoughts on the external transaction interface: -+ -+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which -+ creates state that lasts for the duration of a system call and is called at the start -+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(), -+ occupying the scope of a single system call. We wish to give certain applications an -+ interface to begin and close (commit) transactions. Since our implementation of -+ transactions does not yet support isolation, allowing an application to open a -+ transaction implies trusting it to later close the transaction. Part of the -+ transaction interface will be aimed at enabling that trust, but the interface for -+ actually using transactions is fairly narrow. -+ -+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate -+ this identifier into a string that a shell-script could use, allowing you to start a -+ transaction by issuing a command. Once open, the transcrash should be set in the task -+ structure, and there should be options (I suppose) to allow it to be carried across -+ fork/exec. A transcrash has several options: -+ -+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only -+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to -+ capture on reads as well, it should set READ_FUSING. -+ -+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must -+ eventually close (or else the machine must crash). 
If the application dies an -+ unexpected death with an open transcrash, for example, or if it hangs for a long -+ duration, one solution (to avoid crashing the machine) is to simply close it anyway. -+ This is a dangerous option, but it is one way to solve the problem until isolated -+ transcrashes are available for untrusted applications. -+ -+ It seems to be what databases do, though it is unclear how one avoids a DoS attack -+ creating a vulnerability based on resource starvation. Guaranteeing that some -+ minimum amount of computational resources is made available would seem more correct -+ than guaranteeing some amount of time. When we again have someone to code the work, -+ this issue should be considered carefully. -Hans -+ -+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how -+ many dirty blocks it expects. The reserve_blocks interface should be called at a point -+ where it is safe for the application to fail, because the system may not be able to -+ grant the allocation and the application must be able to back out. For this reason, -+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but -+ the application may also wish to extend the allocation after beginning its transcrash. -+ -+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making -+ modifications that require transaction protection. When isolated transactions are -+ supported, the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a -+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling -+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is -+ why, for safety, the application should call RESERVE_BLOCKS before making any changes). -+ -+ For actually implementing these out-of-system-call-scoped transcrashes, the -+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open -+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a -+ "struct kmem_cache *_txnh_slab" created for that purpose in this file. -+*/ -+ -+/* Extending the other system call interfaces for future transaction features: -+ -+ Specialized applications may benefit from passing flags to the ordinary system call -+ interface such as read(), write(), or stat(). For example, the application specifies -+ WRITE_FUSING by default but wishes to add that a certain read() command should be -+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data -+ read, or the file-data read? These issues are straightforward, but there are a lot of -+ them and adding the necessary flags-passing code will be tedious. -+ -+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW) -+ flag, which specifies that although it is a read operation being requested, a -+ write-lock should be taken. The reason is that read-locks are shared while write-locks -+ are exclusive, so taking a read-lock when a later write is known in advance will often -+ lead to deadlock. If a reader knows it will write later, it should issue read -+ requests with the RMW flag set. -+*/ -+ -+/* -+ The znode/atom deadlock avoidance. -+ -+ FIXME(Zam): writing of this comment is in progress. -+ -+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's -+ long-term locking, which makes the reiser4 locking scheme more complex. It had -+ deadlocks until we implemented deadlock avoidance algorithms. 
Those deadlocks -+ looked like the following: one stopped thread waits for a long-term lock on a -+ znode, while the thread that owns that lock waits until fusion with another -+ atom is allowed. -+ -+ The source of the deadlocks is an optimization of not capturing index nodes -+ for read. Let's prove it. Suppose we have a dumb node capturing scheme which -+ unconditionally captures each block before locking it. -+ -+ That scheme has no deadlocks. Let's begin with a thread whose stage is -+ ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait for -+ a capture because its stage allows fusion with any atom except those that are -+ currently being committed. A process of atom commit can't deadlock because -+ the atom commit procedure does not acquire locks and does not fuse with other -+ atoms. Reiser4 does capturing right before going to sleep inside the -+ longterm_lock_znode() function, which means the znode we want to lock is -+ already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we -+ continue the analysis we see that no process in the sequence can be waiting -+ for atom fusion. Thereby there are no deadlocks of the described kind. -+ -+ The capturing optimization makes the deadlocks possible. A thread can wait on a -+ lock whose owner did not capture that node. The lock owner's current atom -+ is not fused with the first atom and does not enter the ASTAGE_CAPTURE_WAIT -+ state. A deadlock is possible when that atom meets another one which is in -+ ASTAGE_CAPTURE_WAIT already. -+ -+ The deadlock avoidance scheme includes two algorithms: -+ -+ The first algorithm is used when a thread captures a node which is locked but -+ not captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at the -+ moment we skip their capturing. If such a node (marked MISSED_IN_CAPTURE) is -+ being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the -+ routine which forces all lock owners to join the current atom is executed. -+ -+ The second algorithm does not allow skipping the capture of already captured -+ nodes. -+ -+ Both algorithms together prevent waiting on a longterm lock without atom fusion -+ with the atoms of all lock owners, which is the key ingredient of atom/znode -+ locking deadlocks. -+*/ -+ -+/* -+ * Transactions and mmap(2). -+ * -+ * 1. Transactions are not supported for accesses through mmap(2), because -+ * this would effectively amount to user-level transactions whose duration -+ * is beyond control of the kernel. -+ * -+ * 2. That said, we still want to preserve some decency with regard to -+ * mmap(2). During a normal write(2) call, the following sequence of events -+ * happens: -+ * -+ * 1. page is created; -+ * -+ * 2. jnode is created, dirtied and captured into current atom. -+ * -+ * 3. extent is inserted and modified. -+ * -+ * Steps (2) and (3) take place under a long term lock on the twig node. -+ * -+ * When a file is accessed through mmap(2), the page is always created during -+ * the page fault. -+ * After this (in reiser4_readpage()->reiser4_readpage_extent()): -+ * -+ * 1. if access is made to a non-hole page, a new jnode is created (if -+ * necessary); -+ * -+ * 2. if access is made to a hole page, a jnode is not created (XXX -+ * not clear why). -+ * -+ * Also, even if a page is created by a write page fault it is not marked -+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races -+ * with page write-out. 
-+ * -+ * Dirty bit installed by hardware is only transferred to the struct page -+ * later, when page is unmapped (in zap_pte_range(), or -+ * try_to_unmap_one()). -+ * -+ * So, with mmap(2) we have to handle following irksome situations: -+ * -+ * 1. there exists modified page (clean or dirty) without jnode -+ * -+ * 2. there exists modified page (clean or dirty) with clean jnode -+ * -+ * 3. clean page which is a part of atom can be transparently modified -+ * at any moment through mapping without becoming dirty. -+ * -+ * (1) and (2) can lead to the out-of-memory situation: ->writepage() -+ * doesn't know what to do with such pages and ->sync_sb()/->writepages() -+ * don't see them, because these methods operate on atoms. -+ * -+ * (3) can lead to the loss of data: suppose we have dirty page with dirty -+ * captured jnode captured by some atom. As part of early flush (for -+ * example) page was written out. Dirty bit was cleared on both page and -+ * jnode. After this page is modified through mapping, but kernel doesn't -+ * notice and just discards page and jnode as part of commit. (XXX -+ * actually it doesn't, because to reclaim page ->releasepage() has to be -+ * called and before this dirty bit will be transferred to the struct -+ * page). -+ * -+ */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "wander.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "page_cache.h" -+#include "reiser4.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "flush.h" -+ -+#include <asm/atomic.h> -+#include <linux/types.h> -+#include <linux/fs.h> -+#include <linux/mm.h> -+#include <linux/slab.h> -+#include <linux/pagemap.h> -+#include <linux/writeback.h> -+#include <linux/swap.h> /* for totalram_pages */ -+ -+static void atom_free(txn_atom * atom); -+ -+static int commit_txnh(txn_handle * txnh); -+ -+static void wakeup_atom_waitfor_list(txn_atom * atom); -+static void wakeup_atom_waiting_list(txn_atom * atom); -+ -+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh); -+ -+static void capture_assign_block_nolock(txn_atom * atom, jnode * node); -+ -+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node); -+ -+static int capture_init_fusion(jnode * node, txn_handle * txnh, -+ txn_capture mode); -+ -+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture); -+ -+static void capture_fuse_into(txn_atom * small, txn_atom * large); -+ -+void reiser4_invalidate_list(struct list_head *); -+ -+/* GENERIC STRUCTURES */ -+ -+typedef struct _txn_wait_links txn_wait_links; -+ -+struct _txn_wait_links { -+ lock_stack *_lock_stack; -+ struct list_head _fwaitfor_link; -+ struct list_head _fwaiting_link; -+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); -+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); -+}; -+ -+/* FIXME: In theory, we should be using the slab cache init & destructor -+ methods instead of, e.g., jnode_init, etc. */ -+static struct kmem_cache *_atom_slab = NULL; -+/* this is for user-visible, cross system-call transactions. */ -+static struct kmem_cache *_txnh_slab = NULL; -+ -+/** -+ * init_txnmgr_static - create transaction manager slab caches -+ * -+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module -+ * initialization. 
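One detail worth noticing in the function that follows: if the second cache cannot be created, the first one is destroyed again before returning, so a failed init leaves no state behind. The same create/teardown pairing in a generic, self-contained sketch (the "example" cache is made up for illustration):

    #include <linux/slab.h>

    struct example { int x; };
    static struct kmem_cache *example_slab;

    static int example_init(void)
    {
            example_slab = kmem_cache_create("example",
                                             sizeof(struct example),
                                             0, SLAB_HWCACHE_ALIGN, NULL);
            return example_slab == NULL ? -ENOMEM : 0;
    }

    static void example_done(void)
    {
            kmem_cache_destroy(example_slab);
            example_slab = NULL;
    }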
-+ */ -+int init_txnmgr_static(void) -+{ -+ assert("jmacd-600", _atom_slab == NULL); -+ assert("jmacd-601", _txnh_slab == NULL); -+ -+ ON_DEBUG(atomic_set(&flush_cnt, 0)); -+ -+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL); -+ if (_atom_slab == NULL) -+ return RETERR(-ENOMEM); -+ -+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0, -+ SLAB_HWCACHE_ALIGN, NULL); -+ if (_txnh_slab == NULL) { -+ kmem_cache_destroy(_atom_slab); -+ _atom_slab = NULL; -+ return RETERR(-ENOMEM); -+ } -+ -+ return 0; -+} -+ -+/** -+ * done_txnmgr_static - delete txn_atom and txn_handle caches -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_txnmgr_static(void) -+{ -+ destroy_reiser4_cache(&_atom_slab); -+ destroy_reiser4_cache(&_txnh_slab); -+} -+ -+/** -+ * reiser4_init_txnmgr - initialize a new transaction manager -+ * @mgr: pointer to transaction manager embedded in reiser4 super block -+ * -+ * This is called on mount. Makes necessary initializations. -+ */ -+void reiser4_init_txnmgr(txn_mgr *mgr) -+{ -+ assert("umka-169", mgr != NULL); -+ -+ mgr->atom_count = 0; -+ mgr->id_count = 1; -+ INIT_LIST_HEAD(&mgr->atoms_list); -+ spin_lock_init(&mgr->tmgr_lock); -+ mutex_init(&mgr->commit_mutex); -+} -+ -+/** -+ * reiser4_done_txnmgr - stop transaction manager -+ * @mgr: pointer to transaction manager embedded in reiser4 super block -+ * -+ * This is called on umount. Does sanity checks. -+ */ -+void reiser4_done_txnmgr(txn_mgr *mgr) -+{ -+ assert("umka-170", mgr != NULL); -+ assert("umka-1701", list_empty_careful(&mgr->atoms_list)); -+ assert("umka-1702", mgr->atom_count == 0); -+} -+ -+/* Initialize a transaction handle. */ -+/* Audited by: umka (2002.06.13) */ -+static void txnh_init(txn_handle * txnh, txn_mode mode) -+{ -+ assert("umka-171", txnh != NULL); -+ -+ txnh->mode = mode; -+ txnh->atom = NULL; -+ reiser4_ctx_gfp_mask_set(); -+ txnh->flags = 0; -+ spin_lock_init(&txnh->hlock); -+ INIT_LIST_HEAD(&txnh->txnh_link); -+} -+ -+#if REISER4_DEBUG -+/* Check if a transaction handle is clean. */ -+static int txnh_isclean(txn_handle * txnh) -+{ -+ assert("umka-172", txnh != NULL); -+ return txnh->atom == NULL && -+ LOCK_CNT_NIL(spin_locked_txnh); -+} -+#endif -+ -+/* Initialize an atom. */ -+static void atom_init(txn_atom * atom) -+{ -+ int level; -+ -+ assert("umka-173", atom != NULL); -+ -+ memset(atom, 0, sizeof(txn_atom)); -+ -+ atom->stage = ASTAGE_FREE; -+ atom->start_time = jiffies; -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) -+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level)); -+ -+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom)); -+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom)); -+ INIT_LIST_HEAD(ATOM_WB_LIST(atom)); -+ INIT_LIST_HEAD(&atom->inodes); -+ spin_lock_init(&(atom->alock)); -+ /* list of transaction handles */ -+ INIT_LIST_HEAD(&atom->txnh_list); -+ /* link to transaction manager's list of atoms */ -+ INIT_LIST_HEAD(&atom->atom_link); -+ INIT_LIST_HEAD(&atom->fwaitfor_list); -+ INIT_LIST_HEAD(&atom->fwaiting_list); -+ blocknr_set_init(&atom->delete_set); -+ blocknr_set_init(&atom->wandered_map); -+ -+ init_atom_fq_parts(atom); -+} -+ -+#if REISER4_DEBUG -+/* Check if an atom is clean.
*/ -+static int atom_isclean(txn_atom * atom) -+{ -+ int level; -+ -+ assert("umka-174", atom != NULL); -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) { -+ return 0; -+ } -+ } -+ -+ return atom->stage == ASTAGE_FREE && -+ atom->txnh_count == 0 && -+ atom->capture_count == 0 && -+ atomic_read(&atom->refcount) == 0 && -+ (&atom->atom_link == atom->atom_link.next && -+ &atom->atom_link == atom->atom_link.prev) && -+ list_empty_careful(&atom->txnh_list) && -+ list_empty_careful(ATOM_CLEAN_LIST(atom)) && -+ list_empty_careful(ATOM_OVRWR_LIST(atom)) && -+ list_empty_careful(ATOM_WB_LIST(atom)) && -+ list_empty_careful(&atom->fwaitfor_list) && -+ list_empty_careful(&atom->fwaiting_list) && -+ atom_fq_parts_are_clean(atom); -+} -+#endif -+ -+/* Begin a transaction in this context. Currently this uses the -+ reiser4_context's trans_in_ctx, which means that transaction handles are -+ stack-allocated. Eventually this will be extended to allow transaction -+ handles to span several contexts. */ -+/* Audited by: umka (2002.06.13) */ -+void reiser4_txn_begin(reiser4_context * context) -+{ -+ assert("jmacd-544", context->trans == NULL); -+ -+ context->trans = &context->trans_in_ctx; -+ -+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING -+ transcrash. The default should be TXN_WRITE_FUSING. Also, the _trans -+ variable is stack allocated right now, but we would like to allow for -+ dynamically allocated transcrashes that span multiple system calls. -+ */ -+ txnh_init(context->trans, TXN_WRITE_FUSING); -+} -+ -+/* Finish a transaction handle context. */ -+int reiser4_txn_end(reiser4_context * context) -+{ -+ long ret = 0; -+ txn_handle *txnh; -+ -+ assert("umka-283", context != NULL); -+ assert("nikita-3012", reiser4_schedulable()); -+ assert("vs-24", context == get_current_context()); -+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack())); -+ -+ txnh = context->trans; -+ if (txnh != NULL) { -+ if (txnh->atom != NULL) -+ ret = commit_txnh(txnh); -+ assert("jmacd-633", txnh_isclean(txnh)); -+ context->trans = NULL; -+ } -+ return ret; -+} -+ -+void reiser4_txn_restart(reiser4_context * context) -+{ -+ reiser4_txn_end(context); -+ reiser4_preempt_point(); -+ reiser4_txn_begin(context); -+} -+ -+void reiser4_txn_restart_current(void) -+{ -+ reiser4_txn_restart(get_current_context()); -+} -+ -+/* TXN_ATOM */ -+ -+/* Get the atom belonging to a txnh; the txnh is not locked on entry. Return -+ with the txnh locked; the atom, if not NULL, is locked as well. This -+ performs the necessary spin_trylock to break the lock-ordering cycle. May -+ return NULL. */ -+static txn_atom *txnh_get_atom(txn_handle * txnh) -+{ -+ txn_atom *atom; -+ -+ assert("umka-180", txnh != NULL); -+ assert_spin_not_locked(&(txnh->hlock)); -+ -+ while (1) { -+ spin_lock_txnh(txnh); -+ atom = txnh->atom; -+ -+ if (atom == NULL) -+ break; -+ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ atomic_inc(&atom->refcount); -+ -+ spin_unlock_txnh(txnh); -+ spin_lock_atom(atom); -+ spin_lock_txnh(txnh); -+ -+ if (txnh->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(atom); -+ } -+ -+ return atom; -+} -+ -+/* Get the current atom and spin-lock it if the current atom is present.
May return NULL. */ -+txn_atom *get_current_atom_locked_nocheck(void) -+{ -+ reiser4_context *cx; -+ txn_atom *atom; -+ txn_handle *txnh; -+ -+ cx = get_current_context(); -+ assert("zam-437", cx != NULL); -+ -+ txnh = cx->trans; -+ assert("zam-435", txnh != NULL); -+ -+ atom = txnh_get_atom(txnh); -+ -+ spin_unlock_txnh(txnh); -+ return atom; -+} -+ -+/* Get the atom belonging to a jnode, which is initially locked. Return with -+ both jnode and atom locked. This performs the necessary spin_trylock to -+ break the lock-ordering cycle. Assumes the jnode is already locked, and -+ returns NULL if the atom is not set. */ -+txn_atom *jnode_get_atom(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("umka-181", node != NULL); -+ -+ while (1) { -+ assert_spin_locked(&(node->guard)); -+ -+ atom = node->atom; -+ /* node is not in any atom */ -+ if (atom == NULL) -+ break; -+ -+ /* If atom is not locked, grab the lock and return */ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ /* At least one jnode belongs to this atom; that guarantees -+ * atom->refcount > 0, so we can safely increment the -+ * refcount. */ -+ atomic_inc(&atom->refcount); -+ spin_unlock_jnode(node); -+ -+ /* re-acquire spin locks in the right order */ -+ spin_lock_atom(atom); -+ spin_lock_jnode(node); -+ -+ /* check if node still points to the same atom. */ -+ if (node->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ /* releasing the atom lock and reference requires not holding -+ * locks on jnodes. */ -+ spin_unlock_jnode(node); -+ -+ /* We are not sure that this atom has any references besides -+ * our own, so we must call the proper function, which may free -+ * the atom if the last reference is released. */ -+ atom_dec_and_unlock(atom); -+ -+ /* lock the jnode again to get a valid node->atom pointer -+ * value. */ -+ spin_lock_jnode(node); -+ } -+ -+ return atom; -+} -+ -+/* Returns true if @node is dirty and part of the same atom as one of its -+ neighbors. Used by the flush code to indicate whether the next node (in some -+ direction) is suitable for flushing. */ -+int -+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value) -+{ -+ int compat; -+ txn_atom *atom; -+ -+ assert("umka-182", node != NULL); -+ assert("umka-183", check != NULL); -+ -+ /* Not sure what this function is supposed to do if supplied with -+ @check that is neither formatted nor unformatted (bitmap or so). */ -+ assert("nikita-2373", jnode_is_znode(check) -+ || jnode_is_unformatted(check)); -+ -+ /* Need a lock on CHECK to get its atom and to check various state bits. -+ Don't need a lock on NODE once we get the atom lock. */ -+ /* It is not enough to lock two nodes and check (node->atom == -+ check->atom), because the atom could be locked and being fused at -+ that moment; jnodes of an atom in that state (being fused) can point -+ to different objects, but the atom is the same. */ -+ spin_lock_jnode(check); -+ -+ atom = jnode_get_atom(check); -+ -+ if (atom == NULL) { -+ compat = 0; -+ } else { -+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY)); -+ -+ if (compat && jnode_is_znode(check)) { -+ compat &= znode_is_connected(JZNODE(check)); -+ } -+ -+ if (compat && alloc_check) { -+ compat &= (alloc_value == jnode_is_flushprepped(check)); -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+ spin_unlock_jnode(check); -+ -+ return compat; -+} -+ -+/* Decrement the atom's reference count and, if it falls to zero, free it.
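-+ -+ Usage sketch (illustrative only, not a new interface): the caller must hold -+ the atom spinlock and own one reference; the lock is always dropped, whether -+ or not the atom is freed: -+ -+ spin_lock_atom(atom); -+ ... use atom ... -+ atom_dec_and_unlock(atom); /* unlocks, may free the atom */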
*/ -+void atom_dec_and_unlock(txn_atom * atom) -+{ -+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ -+ assert("umka-186", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-1039", atomic_read(&atom->refcount) > 0); -+ -+ if (atomic_dec_and_test(&atom->refcount)) { -+ /* take the txnmgr lock and atom lock in proper order. */ -+ if (!spin_trylock_txnmgr(mgr)) { -+ /* This atom should exist after we re-acquire its -+ * spinlock, so we increment its reference counter. */ -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ spin_lock_txnmgr(mgr); -+ spin_lock_atom(atom); -+ -+ if (!atomic_dec_and_test(&atom->refcount)) { -+ spin_unlock_atom(atom); -+ spin_unlock_txnmgr(mgr); -+ return; -+ } -+ } -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ atom_free(atom); -+ spin_unlock_txnmgr(mgr); -+ } else -+ spin_unlock_atom(atom); -+} -+ -+/* Create a new atom and connect it to the given transaction handle. This adds -+ the atom to the transaction manager's list and sets its reference count to -+ 1, an artificial reference which is kept until it commits. We play strange -+ games to avoid allocation under jnode & txnh spinlocks. */ -+ -+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh) -+{ -+ txn_atom *atom; -+ txn_mgr *mgr; -+ -+ if (REISER4_DEBUG && rofs_tree(current_tree)) { -+ warning("nikita-3366", "Creating atom on rofs"); -+ dump_stack(); -+ } -+ -+ if (*atom_alloc == NULL) { -+ (*atom_alloc) = kmem_cache_alloc(_atom_slab, -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (*atom_alloc == NULL) -+ return RETERR(-ENOMEM); -+ } -+ -+ /* and, also, the txnmgr spin lock should be taken before jnode and -+ txnh locks. */ -+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ spin_lock_txnmgr(mgr); -+ spin_lock_txnh(txnh); -+ -+ /* Check whether a new atom is still needed */ -+ if (txnh->atom != NULL) { -+ /* NOTE-NIKITA probably it is rather better to free -+ * atom_alloc here than thread it up to reiser4_try_capture() */ -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_txnmgr(mgr); -+ -+ return -E_REPEAT; -+ } -+ -+ atom = *atom_alloc; -+ *atom_alloc = NULL; -+ -+ atom_init(atom); -+ -+ assert("jmacd-17", atom_isclean(atom)); -+ -+ /* -+ * lock ordering is broken here. It is ok, as long as @atom is new -+ * and inaccessible to others. We can't use spin_lock_atom or -+ * spin_lock(&atom->alock) because they care about locking -+ * dependencies. spin_trylock_atom doesn't. -+ */ -+ check_me("", spin_trylock_atom(atom)); -+ -+ /* add the atom to the end of the transaction manager's list of atoms */ -+ list_add_tail(&atom->atom_link, &mgr->atoms_list); -+ atom->atom_id = mgr->id_count++; -+ mgr->atom_count += 1; -+ -+ /* Release the txnmgr lock */ -+ spin_unlock_txnmgr(mgr); -+ -+ /* One reference until it commits. */ -+ atomic_inc(&atom->refcount); -+ atom->stage = ASTAGE_CAPTURE_FUSE; -+ atom->super = reiser4_get_current_sb(); -+ capture_assign_txnh_nolock(atom, txnh); -+ -+ spin_unlock_atom(atom); -+ spin_unlock_txnh(txnh); -+ -+ return -E_REPEAT; -+} -+ -+/* Return true if an atom is currently "open". */ -+static int atom_isopen(const txn_atom * atom) -+{ -+ assert("umka-185", atom != NULL); -+ -+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT; -+} -+ -+/* Return the number of pointers to this atom that must be updated during -+ fusion. This approximates the amount of work to be done. Fusion chooses the -+ atom with fewer pointers to fuse into the atom with more pointers.
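-+ -+ For example (the numbers are illustrative only): if atom A has 3 handles and -+ 100 captured nodes, atom_pointer_count(A) == 103, while an atom B with 1 -+ handle and 10 captured nodes has atom_pointer_count(B) == 11; fusing B into A -+ therefore updates only 11 pointers instead of 103.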
*/ -+static int atom_pointer_count(const txn_atom * atom) -+{ -+ assert("umka-187", atom != NULL); -+ -+ /* This is a measure of the amount of work needed to fuse this atom -+ * into another. */ -+ return atom->txnh_count + atom->capture_count; -+} -+ -+/* Called holding the atom lock, this removes the atom from the transaction -+ manager list and frees it. */ -+static void atom_free(txn_atom * atom) -+{ -+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ -+ assert("umka-188", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* Remove from the txn_mgr's atom list */ -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ mgr->atom_count -= 1; -+ list_del_init(&atom->atom_link); -+ -+ /* Clean the atom */ -+ assert("jmacd-16", -+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE)); -+ atom->stage = ASTAGE_FREE; -+ -+ blocknr_set_destroy(&atom->delete_set); -+ blocknr_set_destroy(&atom->wandered_map); -+ -+ assert("jmacd-16", atom_isclean(atom)); -+ -+ spin_unlock_atom(atom); -+ -+ kmem_cache_free(_atom_slab, atom); -+} -+ -+static int atom_is_dotard(const txn_atom * atom) -+{ -+ return time_after(jiffies, atom->start_time + -+ get_current_super_private()->tmgr.atom_max_age); -+} -+ -+static int atom_can_be_committed(txn_atom * atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-885", atom->txnh_count > atom->nr_waiters); -+ return atom->txnh_count == atom->nr_waiters + 1; -+} -+ -+/* Return true if an atom should commit now. This is determined by aging, atom -+ size or atom flags. */ -+static int atom_should_commit(const txn_atom * atom) -+{ -+ assert("umka-189", atom != NULL); -+ return -+ (atom->flags & ATOM_FORCE_COMMIT) || -+ ((unsigned)atom_pointer_count(atom) > -+ get_current_super_private()->tmgr.atom_max_size) -+ || atom_is_dotard(atom); -+} -+ -+/* return 1 if the current atom exists and requires commit. */ -+int current_atom_should_commit(void) -+{ -+ txn_atom *atom; -+ int result = 0; -+ -+ atom = get_current_atom_locked_nocheck(); -+ if (atom) { -+ result = atom_should_commit(atom); -+ spin_unlock_atom(atom); -+ } -+ return result; -+} -+ -+static int atom_should_commit_asap(const txn_atom * atom) -+{ -+ unsigned int captured; -+ unsigned int pinnedpages; -+ -+ assert("nikita-3309", atom != NULL); -+ -+ captured = (unsigned)atom->capture_count; -+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode); -+ -+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100); -+} -+ -+static jnode *find_first_dirty_in_list(struct list_head *head, int flags) -+{ -+ jnode *first_dirty; -+ -+ list_for_each_entry(first_dirty, head, capture_link) { -+ if (!(flags & JNODE_FLUSH_COMMIT)) { -+ /* -+ * skip jnodes which "heard banshee" or have active -+ * I/O -+ */ -+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) || -+ JF_ISSET(first_dirty, JNODE_WRITEBACK)) -+ continue; -+ } -+ return first_dirty; -+ } -+ return NULL; -+} -+ -+/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL -+ if the atom has no dirty nodes on its lists */ -+jnode *find_first_dirty_jnode(txn_atom * atom, int flags) -+{ -+ jnode *first_dirty; -+ tree_level level; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ /* The flush starts from LEAF_LEVEL (=1).
*/ -+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level))) -+ continue; -+ -+ first_dirty = -+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level), -+ flags); -+ if (first_dirty) -+ return first_dirty; -+ } -+ -+ /* znode-above-root is on the list #0. */ -+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags); -+} -+ -+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq) -+{ -+ jnode *cur; -+ -+ assert("zam-905", atom_is_protected(atom)); -+ -+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link); -+ while (ATOM_WB_LIST(atom) != &cur->capture_link) { -+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); -+ -+ spin_lock_jnode(cur); -+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) { -+ if (JF_ISSET(cur, JNODE_DIRTY)) { -+ queue_jnode(fq, cur); -+ } else { -+ /* move the jnode to the atom's clean list */ -+ list_move_tail(&cur->capture_link, -+ ATOM_CLEAN_LIST(atom)); -+ } -+ } -+ spin_unlock_jnode(cur); -+ -+ cur = next; -+ } -+} -+ -+/* Scan the current atom->writeback_nodes list, and re-submit dirty and -+ * !writeback jnodes to disk. */ -+static int submit_wb_list(void) -+{ -+ int ret; -+ flush_queue_t *fq; -+ -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ -+ dispatch_wb_list(fq->atom, fq); -+ spin_unlock_atom(fq->atom); -+ -+ ret = reiser4_write_fq(fq, NULL, 1); -+ reiser4_fq_put(fq); -+ -+ return ret; -+} -+ -+/* Wait for completion of all writes, re-submitting the atom's writeback list -+ if needed. */ -+static int current_atom_complete_writes(void) -+{ -+ int ret; -+ -+ /* Each jnode on that list was modified and dirtied while it already -+ * had an i/o request running. After i/o completion we have to -+ * resubmit them to disk. */ -+ ret = submit_wb_list(); -+ if (ret < 0) -+ return ret; -+ -+ /* Wait for all i/o to complete */ -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ -+ /* Scan the wb list again; all i/o should be completed, and we -+ * re-submit dirty nodes to disk */ -+ ret = submit_wb_list(); -+ if (ret < 0) -+ return ret; -+ -+ /* Wait for all nodes we just submitted */ -+ return current_atom_finish_all_fq(); -+} -+ -+#if REISER4_DEBUG -+ -+static void reiser4_info_atom(const char *prefix, const txn_atom * atom) -+{ -+ if (atom == NULL) { -+ printk("%s: no atom\n", prefix); -+ return; -+ } -+ -+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i" -+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix, -+ atomic_read(&atom->refcount), atom->atom_id, atom->flags, -+ atom->txnh_count, atom->capture_count, atom->stage, -+ atom->start_time, atom->flushed); -+} -+ -+#else /* REISER4_DEBUG */ -+ -+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {} -+ -+#endif /* REISER4_DEBUG */ -+ -+#define TOOMANYFLUSHES (1 << 13) -+ -+/* Called with the atom locked and no open "active" transaction handles except -+ ours, this function calls flush_current_atom() until all dirty nodes are -+ processed. Then it initiates commit processing. -+ -+ Called by the single remaining open "active" txnh, which is closing. Other -+ open txnhs belong to processes which wait for atom commit in the -+ commit_txnh() routine. They are counted as "waiters" in atom->nr_waiters. -+ Therefore as long as we hold the atom lock none of the jnodes can be -+ captured and/or locked. -+ -+ The return value is an error code if the commit fails.
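-+ -+ In outline, the flow implemented below is (a condensed sketch of this very -+ function, not a separate API): -+ -+ do { -+ ret = flush_current_atom(..., atom); -+ } while (ret == -E_REPEAT); /* flush until no dirty nodes remain */ -+ reiser4_atom_set_stage(atom, ASTAGE_PRE_COMMIT); -+ current_atom_complete_writes(); /* wait for and resubmit i/o */ -+ reiser4_write_logs(...); /* wander the atom to disk */ -+ reiser4_atom_set_stage(atom, ASTAGE_DONE);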
-+*/ -+static int commit_current_atom(long *nr_submitted, txn_atom ** atom) -+{ -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ long ret = 0; -+ /* how many times jnode_flush() was called as part of the attempt to -+ * commit this atom. */ -+ int flushiters; -+ -+ assert("zam-888", atom != NULL && *atom != NULL); -+ assert_spin_locked(&((*atom)->alock)); -+ assert("zam-887", get_current_context()->trans->atom == *atom); -+ assert("jmacd-151", atom_isopen(*atom)); -+ -+ assert("nikita-3184", -+ get_current_super_private()->delete_mutex_owner != current); -+ -+ for (flushiters = 0;; ++flushiters) { -+ ret = -+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS | -+ JNODE_FLUSH_COMMIT, -+ LONG_MAX /* nr_to_write */ , -+ nr_submitted, atom, NULL); -+ if (ret != -E_REPEAT) -+ break; -+ -+ /* if the atom's dirty list contains one znode which is -+ HEARD_BANSHEE and is locked, we have to allow the lock owner -+ to continue and uncapture that znode */ -+ reiser4_preempt_point(); -+ -+ *atom = get_current_atom_locked(); -+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) { -+ warning("nikita-3176", -+ "Flushing like mad: %i", flushiters); -+ reiser4_info_atom("atom", *atom); -+ DEBUGON(flushiters > (1 << 20)); -+ } -+ } -+ -+ if (ret) -+ return ret; -+ -+ assert_spin_locked(&((*atom)->alock)); -+ -+ if (!atom_can_be_committed(*atom)) { -+ spin_unlock_atom(*atom); -+ return RETERR(-E_REPEAT); -+ } -+ -+ if ((*atom)->capture_count == 0) -+ goto done; -+ -+ /* Up to this point we have been flushing, and after flush is called we -+ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT -+ at this point; the commit should be successful. */ -+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT); -+ ON_DEBUG(((*atom)->committer = current)); -+ spin_unlock_atom(*atom); -+ -+ ret = current_atom_complete_writes(); -+ if (ret) -+ return ret; -+ -+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom))); -+ -+ /* isolate the critical code path which should be executed by only one -+ * thread, using the tmgr commit mutex */ -+ mutex_lock(&sbinfo->tmgr.commit_mutex); -+ -+ ret = reiser4_write_logs(nr_submitted); -+ if (ret < 0) -+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret); -+ -+ /* The atom->ovrwr_nodes list is processed under the commit mutex -+ because of bitmap nodes, which are captured in a special way in -+ reiser4_pre_commit_hook_bitmap(); that way does not include -+ capture_fuse_wait() as the capturing of other nodes does -- the -+ commit mutex is used for transaction isolation instead. */ -+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom)); -+ mutex_unlock(&sbinfo->tmgr.commit_mutex); -+ -+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom)); -+ reiser4_invalidate_list(ATOM_WB_LIST(*atom)); -+ assert("zam-927", list_empty(&(*atom)->inodes)); -+ -+ spin_lock_atom(*atom); -+ done: -+ reiser4_atom_set_stage(*atom, ASTAGE_DONE); -+ ON_DEBUG((*atom)->committer = NULL); -+ -+ /* The atom's state changes, so wake up everybody waiting for this -+ event. */ -+ wakeup_atom_waiting_list(*atom); -+ -+ /* Decrement the "until commit" reference; at least one txnh (the -+ caller) is still open.
*/ -+ atomic_dec(&(*atom)->refcount); -+ -+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0); -+ assert("jmacd-1062", (*atom)->capture_count == 0); -+ BUG_ON((*atom)->capture_count != 0); -+ assert_spin_locked(&((*atom)->alock)); -+ -+ return ret; -+} -+ -+/* TXN_TXNH */ -+ -+/** -+ * force_commit_atom - commit current atom and wait for commit completion -+ * @txnh: -+ * -+ * Commits the current atom and waits for commit completion; the current atom -+ * and @txnh have to be spinlocked before the call, and this function unlocks -+ * them on exit. -+ */ -+int force_commit_atom(txn_handle *txnh) -+{ -+ txn_atom *atom; -+ -+ assert("zam-837", txnh != NULL); -+ assert_spin_locked(&(txnh->hlock)); -+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack())); -+ -+ atom = txnh->atom; -+ -+ assert("zam-834", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* -+ * Set flags for the atom and txnh: forcing atom commit and waiting for -+ * commit completion -+ */ -+ txnh->flags |= TXNH_WAIT_COMMIT; -+ atom->flags |= ATOM_FORCE_COMMIT; -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atom); -+ -+ /* commit is here */ -+ reiser4_txn_restart_current(); -+ return 0; -+} -+ -+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls -+ * whether we commit all atoms, including new ones created after this function -+ * is called. */ -+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms) -+{ -+ int ret; -+ txn_atom *atom; -+ txn_mgr *mgr; -+ txn_handle *txnh; -+ unsigned long start_time = jiffies; -+ reiser4_context *ctx = get_current_context(); -+ -+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack())); -+ assert("nikita-3058", reiser4_commit_check_locks()); -+ -+ reiser4_txn_restart_current(); -+ -+ mgr = &get_super_private(super)->tmgr; -+ -+ txnh = ctx->trans; -+ -+ again: -+ -+ spin_lock_txnmgr(mgr); -+ -+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ -+ /* Commit any atom which can be committed. If @commit_all_atoms -+ * is not set we commit only atoms which were created before -+ * this call was started. */ -+ if (commit_all_atoms -+ || time_before_eq(atom->start_time, start_time)) { -+ if (atom->stage <= ASTAGE_POST_COMMIT) { -+ spin_unlock_txnmgr(mgr); -+ -+ if (atom->stage < ASTAGE_PRE_COMMIT) { -+ spin_lock_txnh(txnh); -+ /* Add force-context txnh */ -+ capture_assign_txnh_nolock(atom, txnh); -+ ret = force_commit_atom(txnh); -+ if (ret) -+ return ret; -+ } else -+ /* wait for atom commit */ -+ reiser4_atom_wait_event(atom); -+ -+ goto again; -+ } -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+#if REISER4_DEBUG -+ if (commit_all_atoms) { -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ spin_lock_reiser4_super(sbinfo); -+ assert("zam-813", -+ sbinfo->blocks_fake_allocated_unformatted == 0); -+ assert("zam-812", sbinfo->blocks_fake_allocated == 0); -+ spin_unlock_reiser4_super(sbinfo); -+ } -+#endif -+ -+ spin_unlock_txnmgr(mgr); -+ -+ return 0; -+} -+ -+/* check whether commit_some_atoms() can commit @atom. Locking is up to the -+ * caller */ -+static int atom_is_committable(txn_atom * atom) -+{ -+ return -+ atom->stage < ASTAGE_PRE_COMMIT && -+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom); -+} -+ -+/* called periodically from ktxnmgrd to commit old atoms.
Releases the ktxnmgrd spin -+ * lock at exit */ -+int commit_some_atoms(txn_mgr * mgr) -+{ -+ int ret = 0; -+ txn_atom *atom; -+ txn_handle *txnh; -+ reiser4_context *ctx; -+ struct list_head *pos, *tmp; -+ -+ ctx = get_current_context(); -+ assert("nikita-2444", ctx != NULL); -+ -+ txnh = ctx->trans; -+ spin_lock_txnmgr(mgr); -+ -+ /* -+ * this is to avoid a gcc complaint that atom might be used -+ * uninitialized -+ */ -+ atom = NULL; -+ -+ /* look for an atom to commit */ -+ list_for_each_safe(pos, tmp, &mgr->atoms_list) { -+ atom = list_entry(pos, txn_atom, atom_link); -+ /* -+ * first test, without taking the atom spin lock, whether it is -+ * eligible for committing at all -+ */ -+ if (atom_is_committable(atom)) { -+ /* now, take the spin lock and re-check */ -+ spin_lock_atom(atom); -+ if (atom_is_committable(atom)) -+ break; -+ spin_unlock_atom(atom); -+ } -+ } -+ -+ ret = (&mgr->atoms_list == pos); -+ spin_unlock_txnmgr(mgr); -+ -+ if (ret) { -+ /* nothing found */ -+ spin_unlock(&mgr->daemon->guard); -+ return 0; -+ } -+ -+ spin_lock_txnh(txnh); -+ -+ BUG_ON(atom == NULL); -+ /* Set the atom to force committing */ -+ atom->flags |= ATOM_FORCE_COMMIT; -+ -+ /* Add force-context txnh */ -+ capture_assign_txnh_nolock(atom, txnh); -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atom); -+ -+ /* we are about to release the daemon spin lock; notify the daemon that -+ it has to rescan atoms */ -+ mgr->daemon->rescan = 1; -+ spin_unlock(&mgr->daemon->guard); -+ reiser4_txn_restart_current(); -+ return 0; -+} -+ -+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom) -+{ -+ int atom_stage; -+ txn_atom *atom_2; -+ int repeat; -+ -+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT); -+ -+ atom_stage = atom->stage; -+ repeat = 0; -+ -+ if (!spin_trylock_txnmgr(tmgr)) { -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ spin_lock_txnmgr(tmgr); -+ spin_lock_atom(atom); -+ repeat = 1; -+ if (atom->stage != atom_stage) { -+ spin_unlock_txnmgr(tmgr); -+ atom_dec_and_unlock(atom); -+ return -E_REPEAT; -+ } -+ atomic_dec(&atom->refcount); -+ } -+ -+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) { -+ if (atom == atom_2) -+ continue; -+ /* -+ * if the trylock does not succeed we just do not fuse with -+ * that atom. -+ */ -+ if (spin_trylock_atom(atom_2)) { -+ if (atom_2->stage < ASTAGE_PRE_COMMIT) { -+ spin_unlock_txnmgr(tmgr); -+ capture_fuse_into(atom_2, atom); -+ /* all locks are lost; we can only repeat here */ -+ return -E_REPEAT; -+ } -+ spin_unlock_atom(atom_2); -+ } -+ } -+ atom->flags |= ATOM_CANCEL_FUSION; -+ spin_unlock_txnmgr(tmgr); -+ if (repeat) { -+ spin_unlock_atom(atom); -+ return -E_REPEAT; -+ } -+ return 0; -+} -+ -+/* Calls jnode_flush() for the current atom if it exists; if not, just takes -+ another atom and calls jnode_flush() for it. If the current transaction -+ handle already has an assigned atom (the current atom) we have to close the -+ current transaction prior to switching to another atom, or do something with -+ the current atom. This code tries to flush the current atom. -+ -+ flush_some_atom() is called as part of the memory cleaning process. It is -+ invoked from balance_dirty_pages(), pdflushd, and entd. -+ -+ If we cannot flush any nodes, the atom is committed, because this frees -+ memory. -+ -+ If the atom is too large or too old, it is committed as well.
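-+ -+ An illustrative call (a sketch only; the real callers are the writeback -+ paths named above, and the values here are hypothetical): -+ -+ long nr = 0; -+ int ret; -+ struct writeback_control wbc = { .nr_to_write = 32, }; -+ -+ ret = flush_some_atom(NULL, &nr, &wbc, JNODE_FLUSH_WRITE_BLOCKS); -+ /* on return, nr holds the number of nodes submitted for write */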
-+*/ -+int -+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc, -+ int flags) -+{ -+ reiser4_context *ctx = get_current_context(); -+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr; -+ txn_handle *txnh = ctx->trans; -+ txn_atom *atom; -+ int ret; -+ -+ BUG_ON(wbc->nr_to_write == 0); -+ BUG_ON(*nr_submitted != 0); -+ assert("zam-1042", txnh != NULL); -+ repeat: -+ if (txnh->atom == NULL) { -+ /* the current atom is not available, take the first from the -+ txnmgr */ -+ spin_lock_txnmgr(tmgr); -+ -+ /* traverse the list of all atoms */ -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ /* lock the atom before checking its state */ -+ spin_lock_atom(atom); -+ -+ /* -+ * we need an atom which is not being committed and -+ * which has no flushers (jnode_flush() adds one flusher -+ * at the beginning and subtracts one at the end). -+ */ -+ if (atom->stage < ASTAGE_PRE_COMMIT && -+ atom->nr_flushers == 0) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ spin_unlock_txnh(txnh); -+ -+ goto found; -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+ /* -+ * Write throttling is the case when no atom can be -+ * flushed or committed. -+ */ -+ if (!current_is_pdflush() && !wbc->nonblocking) { -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ /* Repeat the check from above. */ -+ if (atom->stage < ASTAGE_PRE_COMMIT -+ && atom->nr_flushers == 0) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ spin_unlock_txnh(txnh); -+ -+ goto found; -+ } -+ if (atom->stage <= ASTAGE_POST_COMMIT) { -+ spin_unlock_txnmgr(tmgr); -+ /* -+ * we just wait until the atom's -+ * flusher makes progress in flushing -+ * or committing the atom -+ */ -+ reiser4_atom_wait_event(atom); -+ goto repeat; -+ } -+ spin_unlock_atom(atom); -+ } -+ } -+ spin_unlock_txnmgr(tmgr); -+ return 0; -+ found: -+ spin_unlock_txnmgr(tmgr); -+ } else -+ atom = get_current_atom_locked(); -+ -+ BUG_ON(atom->super != ctx->super); -+ assert("vs-35", atom->super == ctx->super); -+ if (start) { -+ spin_lock_jnode(start); -+ ret = (atom == start->atom) ? 1 : 0; -+ spin_unlock_jnode(start); -+ if (ret == 0) -+ start = NULL; -+ } -+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start); -+ if (ret == 0) { -+ /* flush_current_atom() returns 0 only if it submitted nothing -+ for write */ -+ BUG_ON(*nr_submitted != 0); -+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) { -+ if (atom->capture_count < tmgr->atom_min_size && -+ !(atom->flags & ATOM_CANCEL_FUSION)) { -+ ret = txn_try_to_fuse_small_atom(tmgr, atom); -+ if (ret == -E_REPEAT) { -+ reiser4_preempt_point(); -+ goto repeat; -+ } -+ } -+ /* if early flushing could not make more nodes clean, -+ * or the atom is too old/large, -+ * we force the current atom to commit */ -+ /* wait for commit completion, but only if this -+ * wouldn't stall pdflushd and the ent thread.
*/ -+ if (!wbc->nonblocking && !ctx->entd) -+ txnh->flags |= TXNH_WAIT_COMMIT; -+ atom->flags |= ATOM_FORCE_COMMIT; -+ } -+ spin_unlock_atom(atom); -+ } else if (ret == -E_REPEAT) { -+ if (*nr_submitted == 0) { -+ /* let others who hamper flushing (by holding long-term -+ locks, for instance) free the way for the flush */ -+ reiser4_preempt_point(); -+ goto repeat; -+ } -+ ret = 0; -+ } -+/* -+ if (*nr_submitted > wbc->nr_to_write) -+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted); -+*/ -+ reiser4_txn_restart(ctx); -+ -+ return ret; -+} -+ -+/* Remove processed nodes from the atom's clean list (thereby removing them -+ from the transaction). */ -+void reiser4_invalidate_list(struct list_head *head) -+{ -+ while (!list_empty(head)) { -+ jnode *node; -+ -+ node = list_entry(head->next, jnode, capture_link); -+ spin_lock_jnode(node); -+ reiser4_uncapture_block(node); -+ jput(node); -+ } -+} -+ -+static void init_wlinks(txn_wait_links * wlinks) -+{ -+ wlinks->_lock_stack = get_current_lock_stack(); -+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link); -+ INIT_LIST_HEAD(&wlinks->_fwaiting_link); -+ wlinks->waitfor_cb = NULL; -+ wlinks->waiting_cb = NULL; -+} -+ -+/* Add the current thread to the atom's waitfor list and wait for somebody to -+ wake us up; */ -+void reiser4_atom_wait_event(txn_atom * atom) -+{ -+ txn_wait_links _wlinks; -+ -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-3156", -+ lock_stack_isclean(get_current_lock_stack()) || -+ atom->nr_running_queues > 0); -+ -+ init_wlinks(&_wlinks); -+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list); -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ -+ reiser4_prepare_to_sleep(_wlinks._lock_stack); -+ reiser4_go_to_sleep(_wlinks._lock_stack); -+ -+ spin_lock_atom(atom); -+ list_del(&_wlinks._fwaitfor_link); -+ atom_dec_and_unlock(atom); -+} -+ -+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage) -+{ -+ assert("nikita-3535", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-3536", stage <= ASTAGE_INVALID); -+ /* Excelsior! */ -+ assert("nikita-3537", stage >= atom->stage); -+ if (atom->stage != stage) { -+ atom->stage = stage; -+ reiser4_atom_send_event(atom); -+ } -+} -+ -+/* wake up all threads which wait for an event */ -+void reiser4_atom_send_event(txn_atom * atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ wakeup_atom_waitfor_list(atom); -+} -+ -+/* Informs the txn manager code that the owner of this txn_handle should wait -+ for atom commit completion (for example, because it does fsync(2)) */ -+static int should_wait_commit(txn_handle * h) -+{ -+ return h->flags & TXNH_WAIT_COMMIT; -+} -+ -+typedef struct commit_data { -+ txn_atom *atom; -+ txn_handle *txnh; -+ long nr_written; -+ /* as an optimization we start committing the atom by first trying to -+ * flush it a few times without switching it into ASTAGE_CAPTURE_WAIT. -+ * This reduces stalls due to other threads waiting for the atom in the -+ * ASTAGE_CAPTURE_WAIT stage. ->preflush is a counter of these -+ * preliminary flushes. */ -+ int preflush; -+ /* have we waited on the atom. */ -+ int wait; -+ int failed; -+ int wake_ktxnmgrd_up; -+} commit_data; -+ -+/* -+ * Called from commit_txnh() repeatedly, until either an error happens, or the -+ * atom commits successfully. -+ */ -+static int try_commit_txnh(commit_data * cd) -+{ -+ int result; -+ -+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack())); -+ -+ /* Get the atom and txnh locked.
*/ -+ cd->atom = txnh_get_atom(cd->txnh); -+ assert("jmacd-309", cd->atom != NULL); -+ spin_unlock_txnh(cd->txnh); -+ -+ if (cd->wait) { -+ cd->atom->nr_waiters--; -+ cd->wait = 0; -+ } -+ -+ if (cd->atom->stage == ASTAGE_DONE) -+ return 0; -+ -+ if (cd->failed) -+ return 0; -+ -+ if (atom_should_commit(cd->atom)) { -+ /* if the atom is _very_ large schedule it for commit as soon -+ * as possible. */ -+ if (atom_should_commit_asap(cd->atom)) { -+ /* -+ * When the atom is in the PRE_COMMIT or a later stage, -+ * the following invariant (encoded in -+ * atom_can_be_committed()) holds: there is exactly one -+ * non-waiter transaction handle opened on this atom. -+ * When a thread wants to wait until the atom commits -+ * (for example sync()) it waits on the atom event -+ * after increasing atom->nr_waiters (see below in this -+ * function). It cannot be guaranteed that the atom has -+ * already committed when the event is received, so the -+ * loop has to be restarted. But if the atom switched -+ * into the PRE_COMMIT stage and became too large, we -+ * cannot change its state back to CAPTURE_WAIT (the -+ * atom stage can only increase monotonically), hence -+ * this check. -+ */ -+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT) -+ reiser4_atom_set_stage(cd->atom, -+ ASTAGE_CAPTURE_WAIT); -+ cd->atom->flags |= ATOM_FORCE_COMMIT; -+ } -+ if (cd->txnh->flags & TXNH_DONT_COMMIT) { -+ /* -+ * this thread (transaction handle, that is) doesn't -+ * want to commit the atom. Notify waiters that the -+ * handle is closed. This can happen, for example, when -+ * we are under a VFS directory lock and don't want to -+ * commit the atom right now to avoid stalling other -+ * threads working in the same directory. -+ */ -+ -+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to -+ * commit this atom: no atom waiters and only one -+ * (our) open transaction handle. */ -+ cd->wake_ktxnmgrd_up = -+ cd->atom->txnh_count == 1 && -+ cd->atom->nr_waiters == 0; -+ reiser4_atom_send_event(cd->atom); -+ result = 0; -+ } else if (!atom_can_be_committed(cd->atom)) { -+ if (should_wait_commit(cd->txnh)) { -+ /* sync(): wait for commit */ -+ cd->atom->nr_waiters++; -+ cd->wait = 1; -+ reiser4_atom_wait_event(cd->atom); -+ result = RETERR(-E_REPEAT); -+ } else { -+ result = 0; -+ } -+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) { -+ /* -+ * optimization: flush the atom without switching it -+ * into ASTAGE_CAPTURE_WAIT. -+ * -+ * But don't do this for ktxnmgrd, because ktxnmgrd -+ * should never block on atom fusion. -+ */ -+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS, -+ LONG_MAX, &cd->nr_written, -+ &cd->atom, NULL); -+ if (result == 0) { -+ spin_unlock_atom(cd->atom); -+ cd->preflush = 0; -+ result = RETERR(-E_REPEAT); -+ } else /* The atom wasn't flushed -+ * completely. Rinse. Repeat. */ -+ --cd->preflush; -+ } else { -+ /* We change the atom state to ASTAGE_CAPTURE_WAIT to -+ prevent atom fusion and count ourselves as an active -+ flusher */ -+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT); -+ cd->atom->flags |= ATOM_FORCE_COMMIT; -+ -+ result = -+ commit_current_atom(&cd->nr_written, &cd->atom); -+ if (result != 0 && result != -E_REPEAT) -+ cd->failed = 1; -+ } -+ } else -+ result = 0; -+ -+#if REISER4_DEBUG -+ if (result == 0) -+ assert_spin_locked(&(cd->atom->alock)); -+#endif -+ -+ /* a perfectly valid assertion, except that when the atom/txnh is not -+ * locked fusion can take place, and cd->atom points nowhere. */ -+ /* -+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom))); -+ */ -+ return result; -+} -+ -+/* Called to commit a transaction handle.
This decrements the atom's number of open -+ handles and, if it is the last handle to commit and the atom should commit, -+ initiates atom commit. If the commit does not fail, returns the number of -+ written blocks. */ -+static int commit_txnh(txn_handle * txnh) -+{ -+ commit_data cd; -+ assert("umka-192", txnh != NULL); -+ -+ memset(&cd, 0, sizeof cd); -+ cd.txnh = txnh; -+ cd.preflush = 10; -+ -+ /* calls try_commit_txnh() until either the atom commits, or an error -+ * happens */ -+ while (try_commit_txnh(&cd) != 0) -+ reiser4_preempt_point(); -+ -+ spin_lock_txnh(txnh); -+ -+ cd.atom->txnh_count -= 1; -+ txnh->atom = NULL; -+ /* remove the transaction handle from the atom's list of transaction -+ handles */ -+ list_del_init(&txnh->txnh_link); -+ -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(cd.atom); -+ /* if the current thread doesn't want to do a commit (TXNH_DONT_COMMIT -+ * is set, probably because it takes time), we have that work done -+ * asynchronously by the ktxnmgrd daemon. */ -+ if (cd.wake_ktxnmgrd_up) -+ ktxnmgrd_kick(&get_current_super_private()->tmgr); -+ -+ return 0; -+} -+ -+/* TRY_CAPTURE */ -+ -+/* This routine attempts a single block-capture request. It may return -+ -E_REPEAT if some condition indicates that the request should be retried, -+ and it may block if the txn_capture mode does not include the -+ TXN_CAPTURE_NONBLOCKING request flag. -+ -+ This routine encodes the basic logic of block capturing described by: -+ -+ http://namesys.com/v4/v4.html -+ -+ Our goal here is to ensure that any two blocks that contain dependent -+ modifications commit at the same time. This function enforces this -+ discipline by initiating fusion whenever a transaction handle belonging to -+ one atom requests to read or write a block belonging to another atom -+ (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC). -+ -+ In addition, this routine handles the initial assignment of atoms to blocks -+ and transaction handles. These are the possible outcomes of this function: -+ -+ 1. The block and handle are already part of the same atom: return immediate -+ success -+ -+ 2. The block is assigned but the handle is not: call capture_assign_txnh to -+ assign the handle to the block's atom. -+ -+ 3. The handle is assigned but the block is not: call capture_assign_block to -+ assign the block to the handle's atom. -+ -+ 4. Both handle and block are assigned, but to different atoms: call -+ capture_init_fusion to fuse the atoms. -+ -+ 5. Neither block nor handle are assigned: create a new atom and assign them -+ both. -+ -+ 6. A read request for a non-captured block: return immediate success. -+ -+ This function acquires and releases the handle's spinlock. This function is -+ called under the jnode lock, and if the return value is 0, it returns with -+ the jnode lock still held. If the return is -E_REPEAT or some other error -+ condition, the jnode lock is released. The external interface -+ (reiser4_try_capture) manages re-acquiring the jnode lock in the failure -+ case. -+*/ -+static int try_capture_block( -+ txn_handle * txnh, jnode * node, txn_capture mode, -+ txn_atom ** atom_alloc) -+{ -+ txn_atom *block_atom; -+ txn_atom *txnh_atom; -+ -+ /* Should not call capture for READ_NONCOM requests; these are handled -+ in reiser4_try_capture. */ -+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM); -+ -+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree == -+ * node->tree somewhere. */ -+ assert("umka-194", txnh != NULL); -+ assert("umka-195", node != NULL); -+ -+ /* The jnode is already locked!
Being called from reiser4_try_capture(). */ -+ assert_spin_locked(&(node->guard)); -+ block_atom = node->atom; -+ -+ /* Get the txnh spinlock; this allows us to compare txn_atom pointers -+ but it doesn't let us touch the atoms themselves. */ -+ spin_lock_txnh(txnh); -+ txnh_atom = txnh->atom; -+ /* The process of capturing continues into one of four branches, -+ depending on which of the two atoms (the block atom (node->atom) and -+ the current atom (txnh->atom)) exist. */ -+ if (txnh_atom == NULL) { -+ if (block_atom == NULL) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ /* assign an empty atom to the txnh and repeat */ -+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh); -+ } else { -+ atomic_inc(&block_atom->refcount); -+ /* the node spin-lock isn't needed anymore */ -+ spin_unlock_jnode(node); -+ if (!spin_trylock_atom(block_atom)) { -+ spin_unlock_txnh(txnh); -+ spin_lock_atom(block_atom); -+ spin_lock_txnh(txnh); -+ } -+ /* re-check the state after getting the txnh and the -+ * node's atom spin-locked */ -+ if (node->atom != block_atom || txnh->atom != NULL) { -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ atomic_dec(&block_atom->refcount); -+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT || -+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && -+ block_atom->txnh_count != 0)) -+ return capture_fuse_wait(txnh, block_atom, NULL, mode); -+ capture_assign_txnh_nolock(block_atom, txnh); -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ } else { -+ /* It is time to perform a deadlock prevention check over the -+ node we want to capture. It is possible this node was locked -+ for read without being captured. The optimization which -+ allows this helps us keep atoms independent as long as -+ possible, but it may cause lock/fuse deadlock problems. -+ -+ A number of similar deadlock situations with locked but not -+ captured nodes were found. In each situation there are two -+ or more threads: one of them does flushing while another one -+ does routine balancing or tree lookup. The flushing thread -+ (F) sleeps in a long-term locking request for node (N), while -+ another thread (A) sleeps trying to capture some node already -+ belonging to the atom of F, whose state prevents immediate -+ fusion. -+ -+ Deadlocks of this kind cannot happen if node N is properly -+ captured by thread A. Thread F fuses atoms before locking, -+ therefore the current atom of thread F and the current atom -+ of thread A become the same atom, and thread A may proceed. -+ This does not work if node N was not captured, because then -+ the atom fusion does not happen. -+ -+ The following scheme solves the deadlock: If -+ longterm_lock_znode locks and does not capture a znode, that -+ znode is marked as MISSED_IN_CAPTURE. A node marked this way -+ is processed by the code below, which restores the missed -+ capture and fuses the current atoms of all the node's lock -+ owners by calling the fuse_not_fused_lock_owners() function.
*/ -+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) { -+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE); -+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ fuse_not_fused_lock_owners(txnh, JZNODE(node)); -+ return RETERR(-E_REPEAT); -+ } -+ } -+ if (block_atom == NULL) { -+ atomic_inc(&txnh_atom->refcount); -+ spin_unlock_txnh(txnh); -+ if (!spin_trylock_atom(txnh_atom)) { -+ spin_unlock_jnode(node); -+ spin_lock_atom(txnh_atom); -+ spin_lock_jnode(node); -+ } -+ if (txnh->atom != txnh_atom || node->atom != NULL -+ || JF_ISSET(node, JNODE_IS_DYING)) { -+ spin_unlock_jnode(node); -+ atom_dec_and_unlock(txnh_atom); -+ return RETERR(-E_REPEAT); -+ } -+ atomic_dec(&txnh_atom->refcount); -+ capture_assign_block_nolock(txnh_atom, node); -+ spin_unlock_atom(txnh_atom); -+ } else { -+ if (txnh_atom != block_atom) { -+ if (mode & TXN_CAPTURE_DONT_FUSE) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ /* we are in a "no-fusion" mode and @node is -+ * already part of a transaction. */ -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ return capture_init_fusion(node, txnh, mode); -+ } -+ spin_unlock_txnh(txnh); -+ } -+ } -+ return 0; -+} -+ -+static txn_capture -+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags) -+{ -+ txn_capture cap_mode; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */ -+ -+ if (lock_mode == ZNODE_WRITE_LOCK) { -+ cap_mode = TXN_CAPTURE_WRITE; -+ } else if (node->atom != NULL) { -+ cap_mode = TXN_CAPTURE_WRITE; -+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */ -+ jnode_get_level(node) == LEAF_LEVEL) { -+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */ -+ /* We only need a READ_FUSING capture at the leaf level. This -+ is because the internal levels of the tree (twigs included) -+ are redundant from the point of view of the user who asked -+ for a read-fusing transcrash. The user only wants to -+ read-fuse atoms due to reading uncommitted data that another -+ user has written. It is the file system that reads/writes -+ the internal tree levels; the user only reads/writes leaves. */ -+ cap_mode = TXN_CAPTURE_READ_ATOMIC; -+ } else { -+ /* In this case (read lock at a non-leaf) there's no reason to -+ * capture. */ -+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */ -+ return 0; -+ } -+ -+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE)); -+ assert("nikita-3186", cap_mode != 0); -+ return cap_mode; -+} -+ -+/* This is an external interface to try_capture_block(); it calls -+ try_capture_block() repeatedly as long as -E_REPEAT is returned. -+ -+ @node: node to capture, -+ @lock_mode: read or write lock is used in capture mode calculation, -+ @flags: see the txn_capture flags enumeration. -+ -+ @return: 0 - node was successfully captured; -E_REPEAT - capture request -+ cannot be processed immediately, as was requested in flags; -+ < 0 - other errors.
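-+ -+ Typical call pattern (an illustrative sketch; see -+ try_capture_page_to_invalidate() below for a real caller): -+ -+ spin_lock_jnode(node); -+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ /* the jnode spinlock is held again here, whatever ret is */ -+ spin_unlock_jnode(node);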
-+*/ -+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode, -+ txn_capture flags) -+{ -+ txn_atom *atom_alloc = NULL; -+ txn_capture cap_mode; -+ txn_handle *txnh = get_current_context()->trans; -+ int ret; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ repeat: -+ if (JF_ISSET(node, JNODE_IS_DYING)) -+ return RETERR(-EINVAL); -+ if (node->atom != NULL && txnh->atom == node->atom) -+ return 0; -+ cap_mode = build_capture_mode(node, lock_mode, flags); -+ if (cap_mode == 0 || -+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) { -+ /* Mark this node as "MISSED". It helps in further deadlock -+ * analysis */ -+ if (jnode_is_znode(node)) -+ JF_SET(node, JNODE_MISSED_IN_CAPTURE); -+ return 0; -+ } -+ /* Repeat try_capture as long as -E_REPEAT is returned. */ -+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc); -+ /* Regardless of non_blocking: -+ -+ If ret == 0 then the jnode is still locked. -+ If ret != 0 then the jnode is unlocked. -+ */ -+#if REISER4_DEBUG -+ if (ret == 0) -+ assert_spin_locked(&(node->guard)); -+ else -+ assert_spin_not_locked(&(node->guard)); -+#endif -+ assert_spin_not_locked(&(txnh->guard)); -+ -+ if (ret == -E_REPEAT) { -+ /* E_REPEAT implies all locks were released; therefore we need -+ to take the jnode's lock again. */ -+ spin_lock_jnode(node); -+ -+ /* Although this may appear to be a busy loop, it is not. -+ There are several conditions that cause E_REPEAT to be -+ returned by the call to try_capture_block, all cases -+ indicating some kind of state change that means you should -+ retry the request and will get a different result. In some -+ cases this could be avoided with some extra code, but -+ generally it is done because the necessary locks were -+ released as a result of the operation and repeating is the -+ simplest thing to do (less bug potential). The cases are: -+ atom fusion returns E_REPEAT after it completes (jnode and -+ txnh were unlocked); race conditions in assign_block, -+ assign_txnh, and init_fusion return E_REPEAT (trylock -+ failure); after going to sleep in capture_fuse_wait -+ (the request was blocked but may now succeed). I'm not quite -+ sure how capture_copy works yet, but it may also return -+ E_REPEAT. When the request is legitimately blocked, the -+ requestor goes to sleep in fuse_wait, so this is not a busy -+ loop. */ -+ /* NOTE-NIKITA: still don't understand: -+ -+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT -+ -+ looks like a busy loop? -+ */ -+ goto repeat; -+ } -+ -+ /* free the extra atom object that was possibly allocated by -+ try_capture_block(). -+ -+ Do this before acquiring the jnode spin lock to -+ minimize the time spent under the lock. --nikita */ -+ if (atom_alloc != NULL) { -+ kmem_cache_free(_atom_slab, atom_alloc); -+ } -+ -+ if (ret != 0) { -+ if (ret == -E_BLOCK) { -+ assert("nikita-3360", -+ cap_mode & TXN_CAPTURE_NONBLOCKING); -+ ret = -E_REPEAT; -+ } -+ -+ /* Failure means the jnode is not locked. FIXME_LATER_JMACD May -+ want to fix the above code to avoid releasing the lock and -+ re-acquiring it, but there are cases where failure occurs -+ when the lock is not held, and those cases would need to be -+ modified to re-take the lock. */ -+ spin_lock_jnode(node); -+ } -+ -+ /* The jnode is still locked.
*/ -+ assert_spin_locked(&(node->guard)); -+ return ret; -+} -+ -+static void release_two_atoms(txn_atom *one, txn_atom *two) -+{ -+ spin_unlock_atom(one); -+ atom_dec_and_unlock(two); -+ spin_lock_atom(one); -+ atom_dec_and_unlock(one); -+} -+ -+/* This function sets up a call to try_capture_block and repeats as long as -+ -E_REPEAT is returned by that routine. The txn_capture request mode is -+ computed here depending on the transaction handle's type and the lock -+ request. This is called from the depths of the lock manager with the jnode -+ lock held, and it always returns with the jnode lock held. -+*/ -+ -+/* fuse all 'active' atoms of the lock owners of the given node. */ -+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node) -+{ -+ lock_handle *lh; -+ int repeat; -+ txn_atom *atomh, *atomf; -+ reiser4_context *me = get_current_context(); -+ reiser4_context *ctx = NULL; -+ -+ assert_spin_not_locked(&(ZJNODE(node)->guard)); -+ assert_spin_not_locked(&(txnh->hlock)); -+ -+ repeat: -+ repeat = 0; -+ atomh = txnh_get_atom(txnh); -+ spin_unlock_txnh(txnh); -+ assert("zam-692", atomh != NULL); -+ -+ spin_lock_zlock(&node->lock); -+ /* inspect the list of lock owners */ -+ list_for_each_entry(lh, &node->lock.owners, owners_link) { -+ ctx = get_context_by_lock_stack(lh->owner); -+ if (ctx == me) -+ continue; -+ /* below we use two assumptions to avoid additional spin-locks -+ for checking the condition: -+ -+ 1) if the lock stack holds a lock, the transaction must be -+ open, i.e. ctx->trans != NULL; -+ -+ 2) reading the well-aligned ctx->trans->atom is atomic; if it -+ equals the address of the spin-locked atomh, we take it that -+ the atoms are the same and nothing has to be captured. */ -+ if (atomh != ctx->trans->atom) { -+ reiser4_wake_up(lh->owner); -+ repeat = 1; -+ break; -+ } -+ } -+ if (repeat) { -+ if (!spin_trylock_txnh(ctx->trans)) { -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+ goto repeat; -+ } -+ atomf = ctx->trans->atom; -+ if (atomf == NULL) { -+ capture_assign_txnh_nolock(atomh, ctx->trans); -+ /* release the zlock _after_ assigning the atom to the -+ * transaction handle; otherwise the lock owner thread -+ * may unlock all znodes and exit the kernel context, -+ * and here we would access an invalid transaction -+ * handle. */ -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+ spin_unlock_txnh(ctx->trans); -+ goto repeat; -+ } -+ assert("zam-1059", atomf != atomh); -+ spin_unlock_zlock(&node->lock); -+ atomic_inc(&atomh->refcount); -+ atomic_inc(&atomf->refcount); -+ spin_unlock_txnh(ctx->trans); -+ if (atomf > atomh) { -+ spin_lock_atom_nested(atomf); -+ } else { -+ spin_unlock_atom(atomh); -+ spin_lock_atom(atomf); -+ spin_lock_atom_nested(atomh); -+ } -+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) { -+ release_two_atoms(atomf, atomh); -+ goto repeat; -+ } -+ atomic_dec(&atomh->refcount); -+ atomic_dec(&atomf->refcount); -+ capture_fuse_into(atomf, atomh); -+ goto repeat; -+ } -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+} -+ -+/* This is the interface for capturing unformatted nodes via their struct page -+ reference.
Currently it is only used in reiser4_invalidatepage */ -+int try_capture_page_to_invalidate(struct page *pg) -+{ -+ int ret; -+ jnode *node; -+ -+ assert("umka-292", pg != NULL); -+ assert("nikita-2597", PageLocked(pg)); -+ -+ if (IS_ERR(node = jnode_of_page(pg))) { -+ return PTR_ERR(node); -+ } -+ -+ spin_lock_jnode(node); -+ unlock_page(pg); -+ -+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ spin_unlock_jnode(node); -+ jput(node); -+ lock_page(pg); -+ return ret; -+} -+ -+/* This informs the transaction manager when a node is deleted. Add the block -+ to the atom's delete set and uncapture the block. -+ -+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for -+explanations. Find all the functions that use it, and unless there is some very -+good reason to use it (I have not noticed one so far and I doubt it exists, -+but maybe somewhere somehow....), move the loop to inside the function. -+ -+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you -+lock and unlock the jnode fewer times? -+ */ -+void reiser4_uncapture_page(struct page *pg) -+{ -+ jnode *node; -+ txn_atom *atom; -+ -+ assert("umka-199", pg != NULL); -+ assert("nikita-3155", PageLocked(pg)); -+ -+ clear_page_dirty_for_io(pg); -+ -+ reiser4_wait_page_writeback(pg); -+ -+ node = jprivate(pg); -+ BUG_ON(node == NULL); -+ -+ spin_lock_jnode(node); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ -+ /* We can remove a jnode from the transaction even if it is on the -+ * flush queue prepped list; we only need to be sure that the flush -+ * queue is not being written by reiser4_write_fq(). reiser4_write_fq() -+ * does not use the atom spin lock for protection of the prepped nodes -+ * list; instead, write_fq() increments the atom's nr_running_queues -+ * counter for the time when the prepped list is not protected by the -+ * spin lock. Here we check this counter if we want to remove the jnode -+ * from the flush queue and, if the counter is not zero, wait for all -+ * reiser4_write_fq() calls for this atom to complete. This is not a -+ * significant overhead. */ -+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) { -+ spin_unlock_jnode(node); -+ /* -+ * at this moment we want to wait for the "atom event", viz. -+ * wait until @node can be removed from the flush queue. But -+ * reiser4_atom_wait_event() cannot be called with the page -+ * locked, because it deadlocks with jnode_extent_write(). -+ * Unlock the page after making sure (through page_cache_get()) -+ * that it cannot be released from memory. -+ */ -+ page_cache_get(pg); -+ unlock_page(pg); -+ reiser4_atom_wait_event(atom); -+ lock_page(pg); -+ /* -+ * the page may have been detached by -+ * ->writepage()->releasepage().
-+ */ -+ reiser4_wait_page_writeback(pg); -+ spin_lock_jnode(node); -+ page_cache_release(pg); -+ atom = jnode_get_atom(node); -+/* VS-FIXME-HANS: improve the commenting in this function */ -+ if (atom == NULL) { -+ spin_unlock_jnode(node); -+ return; -+ } -+ } -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to -+ * inode's tree of jnodes */ -+void reiser4_uncapture_jnode(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(node->guard)); -+ assert("", node->pg == 0); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer, -+ increases atom refcount and txnh_count, adds to txnh_list. */ -+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh) -+{ -+ assert("umka-200", atom != NULL); -+ assert("umka-201", txnh != NULL); -+ -+ assert_spin_locked(&(txnh->hlock)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-824", txnh->atom == NULL); -+ assert("nikita-3540", atom_isopen(atom)); -+ BUG_ON(txnh->atom != NULL); -+ -+ atomic_inc(&atom->refcount); -+ txnh->atom = atom; -+ reiser4_ctx_gfp_mask_set(); -+ list_add_tail(&txnh->txnh_link, &atom->txnh_list); -+ atom->txnh_count += 1; -+} -+ -+/* No-locking version of assign_block. Sets the block's atom pointer, references the -+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */ -+static void capture_assign_block_nolock(txn_atom *atom, jnode *node) -+{ -+ assert("umka-202", atom != NULL); -+ assert("umka-203", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-323", node->atom == NULL); -+ BUG_ON(!list_empty_careful(&node->capture_link)); -+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY)); -+ -+ /* Pointer from jnode to atom is not counted in atom->refcount. */ -+ node->atom = atom; -+ -+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom)); -+ atom->capture_count += 1; -+ /* reference to jnode is acquired by atom. */ -+ jref(node); -+ -+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1)); -+ -+ LOCK_CNT_INC(t_refs); -+} -+ -+/* common code for dirtying both unformatted jnodes and formatted znodes. */ -+static void do_jnode_make_dirty(jnode * node, txn_atom * atom) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY)); -+ -+ JF_SET(node, JNODE_DIRTY); -+ -+ get_current_context()->nr_marked_dirty++; -+ -+ /* We grab2flush_reserve one additional block only if node was -+ not CREATED and jnode_flush did not sort it into neither -+ relocate set nor overwrite one. If node is in overwrite or -+ relocate set we assume that atom's flush reserved counter was -+ already adjusted. 
*/ -+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC) -+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node) -+ && !jnode_is_cluster_page(node)) { -+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr)); -+ assert("vs-1506", *jnode_get_block(node) != 0); -+ grabbed2flush_reserved_nolock(atom, (__u64) 1); -+ JF_SET(node, JNODE_FLUSH_RESERVED); -+ } -+ -+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) { -+ /* If the atom is not set yet, it will be added to the appropriate list in -+ capture_assign_block_nolock. */ -+ /* Sometimes a node is set dirty before being captured -- the case for new -+ jnodes. In that case the jnode will be added to the appropriate list -+ in capture_assign_block_nolock. Another reason not to re-link jnode is -+ that jnode is on a flush queue (see flush.c for details) */ -+ -+ int level = jnode_get_level(node); -+ -+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT); -+ assert("nikita-2607", 0 <= level); -+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT); -+ -+ /* move node to atom's dirty list */ -+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level)); -+ ON_DEBUG(count_jnode -+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1)); -+ } -+} -+ -+/* Set the dirty status for this (spin locked) jnode. */ -+void jnode_make_dirty_locked(jnode * node) -+{ -+ assert("umka-204", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ -+ if (REISER4_DEBUG && rofs_jnode(node)) { -+ warning("nikita-3365", "Dirtying jnode on rofs"); -+ dump_stack(); -+ } -+ -+ /* Fast check for already dirty node */ -+ if (!JF_ISSET(node, JNODE_DIRTY)) { -+ txn_atom *atom; -+ -+ atom = jnode_get_atom(node); -+ assert("vs-1094", atom); -+ /* Check jnode dirty status again because node spin lock might -+ * be released inside jnode_get_atom(). */ -+ if (likely(!JF_ISSET(node, JNODE_DIRTY))) -+ do_jnode_make_dirty(node, atom); -+ spin_unlock_atom(atom); -+ } -+} -+ -+/* Set the dirty status for this znode. */ -+void znode_make_dirty(znode * z) -+{ -+ jnode *node; -+ struct page *page; -+ -+ assert("umka-204", z != NULL); -+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z)); -+ assert("nikita-3560", znode_is_write_locked(z)); -+ -+ node = ZJNODE(z); -+ /* znode is longterm locked, we can check dirty bit without spinlock */ -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* znode is dirty already. All we have to do is to change znode version */ -+ z->version = znode_build_version(jnode_get_tree(node)); -+ return; -+ } -+ -+ spin_lock_jnode(node); -+ jnode_make_dirty_locked(node); -+ page = jnode_page(node); -+ if (page != NULL) { -+ /* this is useful assertion (allows one to check that no -+ * modifications are lost due to update of in-flight page), -+ * but it requires locking on page to check PG_writeback -+ * bit. */ -+ /* assert("nikita-3292", -+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */ -+ page_cache_get(page); -+ -+ /* jnode lock is not needed for the rest of -+ * znode_set_dirty(). */ -+ spin_unlock_jnode(node); -+ /* reiser4 file write code calls set_page_dirty for -+ * unformatted nodes, for formatted nodes we do it here. 
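The double test of JNODE_DIRTY in jnode_make_dirty_locked() above deserves emphasis: jnode_get_atom() may drop and re-take the jnode spinlock while acquiring the atom lock, so any flag read before that call is stale and must be repeated. A minimal sketch of the idiom; the wrapper name is hypothetical, and it assumes the node is already captured, which the real callers guarantee.

        static void mark_dirty_checked(jnode * node)
        {
                spin_lock_jnode(node);
                if (!JF_ISSET(node, JNODE_DIRTY)) {     /* cheap pre-test */
                        /* may release and re-acquire node->guard */
                        txn_atom *atom = jnode_get_atom(node);

                        assert("", atom != NULL);       /* node is captured */
                        if (likely(!JF_ISSET(node, JNODE_DIRTY)))
                                do_jnode_make_dirty(node, atom);
                        spin_unlock_atom(atom);
                }
                spin_unlock_jnode(node);
        }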
*/ -+ set_page_dirty_notag(page); -+ page_cache_release(page); -+ /* bump version counter in znode */ -+ z->version = znode_build_version(jnode_get_tree(node)); -+ } else { -+ assert("zam-596", znode_above_root(JZNODE(node))); -+ spin_unlock_jnode(node); -+ } -+ -+ assert("nikita-1900", znode_is_write_locked(z)); -+ assert("jmacd-9777", node->atom != NULL); -+} -+ -+int reiser4_sync_atom(txn_atom * atom) -+{ -+ int result; -+ txn_handle *txnh; -+ -+ txnh = get_current_context()->trans; -+ -+ result = 0; -+ if (atom != NULL) { -+ if (atom->stage < ASTAGE_PRE_COMMIT) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ result = force_commit_atom(txnh); -+ } else if (atom->stage < ASTAGE_POST_COMMIT) { -+ /* wait atom commit */ -+ reiser4_atom_wait_event(atom); -+ /* try once more */ -+ result = RETERR(-E_REPEAT); -+ } else -+ spin_unlock_atom(atom); -+ } -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+/* move jnode form one list to another -+ call this after atom->capture_count is updated */ -+void -+count_jnode(txn_atom * atom, jnode * node, atom_list old_list, -+ atom_list new_list, int check_lists) -+{ -+ struct list_head *pos; -+ -+ assert("zam-1018", atom_is_protected(atom)); -+ assert_spin_locked(&(node->guard)); -+ assert("", NODE_LIST(node) == old_list); -+ -+ switch (NODE_LIST(node)) { -+ case NOT_CAPTURED: -+ break; -+ case DIRTY_LIST: -+ assert("", atom->dirty > 0); -+ atom->dirty--; -+ break; -+ case CLEAN_LIST: -+ assert("", atom->clean > 0); -+ atom->clean--; -+ break; -+ case FQ_LIST: -+ assert("", atom->fq > 0); -+ atom->fq--; -+ break; -+ case WB_LIST: -+ assert("", atom->wb > 0); -+ atom->wb--; -+ break; -+ case OVRWR_LIST: -+ assert("", atom->ovrwr > 0); -+ atom->ovrwr--; -+ break; -+ default: -+ impossible("", ""); -+ } -+ -+ switch (new_list) { -+ case NOT_CAPTURED: -+ break; -+ case DIRTY_LIST: -+ atom->dirty++; -+ break; -+ case CLEAN_LIST: -+ atom->clean++; -+ break; -+ case FQ_LIST: -+ atom->fq++; -+ break; -+ case WB_LIST: -+ atom->wb++; -+ break; -+ case OVRWR_LIST: -+ atom->ovrwr++; -+ break; -+ default: -+ impossible("", ""); -+ } -+ ASSIGN_NODE_LIST(node, new_list); -+ if (0 && check_lists) { -+ int count; -+ tree_level level; -+ -+ count = 0; -+ -+ /* flush queue list */ -+ /* reiser4_check_fq(atom); */ -+ -+ /* dirty list */ -+ count = 0; -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level)) -+ count++; -+ } -+ if (count != atom->dirty) -+ warning("", "dirty counter %d, real %d\n", atom->dirty, -+ count); -+ -+ /* clean list */ -+ count = 0; -+ list_for_each(pos, ATOM_CLEAN_LIST(atom)) -+ count++; -+ if (count != atom->clean) -+ warning("", "clean counter %d, real %d\n", atom->clean, -+ count); -+ -+ /* wb list */ -+ count = 0; -+ list_for_each(pos, ATOM_WB_LIST(atom)) -+ count++; -+ if (count != atom->wb) -+ warning("", "wb counter %d, real %d\n", atom->wb, -+ count); -+ -+ /* overwrite list */ -+ count = 0; -+ list_for_each(pos, ATOM_OVRWR_LIST(atom)) -+ count++; -+ -+ if (count != atom->ovrwr) -+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr, -+ count); -+ } -+ assert("vs-1624", atom->num_queued == atom->fq); -+ if (atom->capture_count != -+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) { -+ printk -+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n", -+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr, -+ atom->wb, atom->fq); -+ assert("vs-1622", -+ atom->capture_count == -+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + -+ 
atom->fq); -+ } -+} -+ -+#endif -+ -+/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode -+ * lock should be taken before calling this function. */ -+void jnode_make_wander_nolock(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("nikita-2431", node != NULL); -+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC)); -+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); -+ -+ atom = node->atom; -+ -+ assert("zam-895", atom != NULL); -+ assert("zam-894", atom_is_protected(atom)); -+ -+ JF_SET(node, JNODE_OVRWR); -+ /* move node to atom's overwrite list */ -+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom)); -+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1)); -+} -+ -+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside -+ * this function. */ -+void jnode_make_wander(jnode * node) -+{ -+ txn_atom *atom; -+ -+ spin_lock_jnode(node); -+ atom = jnode_get_atom(node); -+ assert("zam-913", atom != NULL); -+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC)); -+ -+ jnode_make_wander_nolock(node); -+ spin_unlock_atom(atom); -+ spin_unlock_jnode(node); -+} -+ -+/* this just sets RELOC bit */ -+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); -+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); -+ jnode_set_reloc(node); -+} -+ -+/* Make znode RELOC and put it on flush queue */ -+void znode_make_reloc(znode * z, flush_queue_t * fq) -+{ -+ jnode *node; -+ txn_atom *atom; -+ -+ node = ZJNODE(z); -+ spin_lock_jnode(node); -+ -+ atom = jnode_get_atom(node); -+ assert("zam-919", atom != NULL); -+ -+ jnode_make_reloc_nolock(fq, node); -+ queue_jnode(fq, node); -+ -+ spin_unlock_atom(atom); -+ spin_unlock_jnode(node); -+ -+} -+ -+/* Make unformatted node RELOC and put it on flush queue */ -+void unformatted_make_reloc(jnode *node, flush_queue_t *fq) -+{ -+ assert("vs-1479", jnode_is_unformatted(node)); -+ -+ jnode_make_reloc_nolock(fq, node); -+ queue_jnode(fq, node); -+} -+ -+int reiser4_capture_super_block(struct super_block *s) -+{ -+ int result; -+ znode *uber; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = get_uber_znode(reiser4_get_tree(s), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh); -+ if (result) -+ return result; -+ -+ uber = lh.node; -+ /* Grabbing one block for superblock */ -+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED); -+ if (result != 0) -+ return result; -+ -+ znode_make_dirty(uber); -+ -+ done_lh(&lh); -+ return 0; -+} -+ -+/* Wakeup every handle on the atom's WAITFOR list */ -+static void wakeup_atom_waitfor_list(txn_atom * atom) -+{ -+ txn_wait_links *wlinks; -+ -+ assert("umka-210", atom != NULL); -+ -+ /* atom is locked */ -+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) { -+ if (wlinks->waitfor_cb == NULL || -+ wlinks->waitfor_cb(atom, wlinks)) -+ /* Wake up. 
*/ -+ reiser4_wake_up(wlinks->_lock_stack); -+ } -+} -+ -+/* Wakeup every handle on the atom's WAITING list */ -+static void wakeup_atom_waiting_list(txn_atom * atom) -+{ -+ txn_wait_links *wlinks; -+ -+ assert("umka-211", atom != NULL); -+ -+ /* atom is locked */ -+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) { -+ if (wlinks->waiting_cb == NULL || -+ wlinks->waiting_cb(atom, wlinks)) -+ /* Wake up. */ -+ reiser4_wake_up(wlinks->_lock_stack); -+ } -+} -+ -+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */ -+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks) -+{ -+ assert("nikita-3330", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing -+ * last transaction handle. */ -+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1; -+} -+ -+/* The general purpose of this function is to wait on the first of two possible events. -+ The situation is that a handle (and its atom atomh) is blocked trying to capture a -+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The -+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with -+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it -+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will -+ proceed and fuse the two atoms in the CAPTURE_WAIT state. -+ -+ In other words, if either atomh or atomf change state, the handle will be awakened, -+ thus there are two lists per atom: WAITING and WAITFOR. -+ -+ This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to -+ close but it is not assigned to an atom of its own. -+ -+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK, -+ BOTH_ATOM_LOCKS. Result: all four locks are released. -+*/ -+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf, -+ txn_atom * atomh, txn_capture mode) -+{ -+ int ret; -+ txn_wait_links wlinks; -+ -+ assert("umka-213", txnh != NULL); -+ assert("umka-214", atomf != NULL); -+ -+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atomf); -+ -+ if (atomh) { -+ spin_unlock_atom(atomh); -+ } -+ -+ return RETERR(-E_BLOCK); -+ } -+ -+ /* Initialize the waiting list links. */ -+ init_wlinks(&wlinks); -+ -+ /* Add txnh to atomf's waitfor list, unlock atomf. */ -+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list); -+ wlinks.waitfor_cb = wait_for_fusion; -+ atomic_inc(&atomf->refcount); -+ spin_unlock_atom(atomf); -+ -+ if (atomh) { -+ /* Add txnh to atomh's waiting list, unlock atomh. */ -+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list); -+ atomic_inc(&atomh->refcount); -+ spin_unlock_atom(atomh); -+ } -+ -+ /* Go to sleep. */ -+ spin_unlock_txnh(txnh); -+ -+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack); -+ if (ret == 0) { -+ reiser4_go_to_sleep(wlinks._lock_stack); -+ ret = RETERR(-E_REPEAT); -+ } -+ -+ /* Remove from the waitfor list. */ -+ spin_lock_atom(atomf); -+ -+ list_del(&wlinks._fwaitfor_link); -+ atom_dec_and_unlock(atomf); -+ -+ if (atomh) { -+ /* Remove from the waiting list. 
*/ -+ spin_lock_atom(atomh); -+ list_del(&wlinks._fwaiting_link); -+ atom_dec_and_unlock(atomh); -+ } -+ return ret; -+} -+ -+static void lock_two_atoms(txn_atom * one, txn_atom * two) -+{ -+ assert("zam-1067", one != two); -+ -+ /* lock the atom with lesser address first */ -+ if (one < two) { -+ spin_lock_atom(one); -+ spin_lock_atom_nested(two); -+ } else { -+ spin_lock_atom(two); -+ spin_lock_atom_nested(one); -+ } -+} -+ -+/* Perform the necessary work to prepare for fusing two atoms, which involves -+ * acquiring two atom locks in the proper order. If one of the node's atom is -+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's -+ * atom is not then the handle's request is put to sleep. If the node's atom -+ * is committing, then the node can be copy-on-captured. Otherwise, pick the -+ * atom with fewer pointers to be fused into the atom with more pointer and -+ * call capture_fuse_into. -+ */ -+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode) -+{ -+ txn_atom * txnh_atom = txnh->atom; -+ txn_atom * block_atom = node->atom; -+ -+ atomic_inc(&txnh_atom->refcount); -+ atomic_inc(&block_atom->refcount); -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ -+ lock_two_atoms(txnh_atom, block_atom); -+ -+ if (txnh->atom != txnh_atom || node->atom != block_atom ) { -+ release_two_atoms(txnh_atom, block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ -+ atomic_dec(&txnh_atom->refcount); -+ atomic_dec(&block_atom->refcount); -+ -+ assert ("zam-1066", atom_isopen(txnh_atom)); -+ -+ if (txnh_atom->stage >= block_atom->stage || -+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) { -+ capture_fuse_into(txnh_atom, block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ spin_lock_txnh(txnh); -+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode); -+} -+ -+/* This function splices together two jnode lists (small and large) and sets all jnodes in -+ the small list to point to the large atom. Returns the length of the list. */ -+static int -+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head, -+ struct list_head *small_head) -+{ -+ int count = 0; -+ jnode *node; -+ -+ assert("umka-218", large != NULL); -+ assert("umka-219", large_head != NULL); -+ assert("umka-220", small_head != NULL); -+ /* small atom should be locked also. */ -+ assert_spin_locked(&(large->alock)); -+ -+ /* For every jnode on small's capture list... */ -+ list_for_each_entry(node, small_head, capture_link) { -+ count += 1; -+ -+ /* With the jnode lock held, update atom pointer. */ -+ spin_lock_jnode(node); -+ node->atom = large; -+ spin_unlock_jnode(node); -+ } -+ -+ /* Splice the lists. */ -+ list_splice_init(small_head, large_head->prev); -+ -+ return count; -+} -+ -+/* This function splices together two txnh lists (small and large) and sets all txn handles in -+ the small list to point to the large atom. Returns the length of the list. */ -+static int -+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head, -+ struct list_head *small_head) -+{ -+ int count = 0; -+ txn_handle *txnh; -+ -+ assert("umka-221", large != NULL); -+ assert("umka-222", large_head != NULL); -+ assert("umka-223", small_head != NULL); -+ -+ /* Adjust every txnh to the new atom. */ -+ list_for_each_entry(txnh, small_head, txnh_link) { -+ count += 1; -+ -+ /* With the txnh lock held, update atom pointer. */ -+ spin_lock_txnh(txnh); -+ txnh->atom = large; -+ spin_unlock_txnh(txnh); -+ } -+ -+ /* Splice the txn_handle list. 
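lock_two_atoms() above is an instance of the classic deadlock-avoidance idiom: impose one global acquisition order (here, ascending address) so that two threads fusing the same pair of atoms can never each hold one lock while waiting for the other. The same idiom in generic form, as a sketch with a made-up name:

        static void lock_pair_ordered(spinlock_t *a, spinlock_t *b)
        {
                /* every caller locks the lower address first */
                if (a < b) {
                        spin_lock(a);
                        spin_lock_nested(b, SINGLE_DEPTH_NESTING);
                } else {
                        spin_lock(b);
                        spin_lock_nested(a, SINGLE_DEPTH_NESTING);
                }
        }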
*/ -+ list_splice_init(small_head, large_head->prev); -+ -+ return count; -+} -+ -+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are -+ added to LARGE and their ->atom pointers are all updated. The associated counts are -+ updated as well, and any waiting handles belonging to either are awakened. Finally the -+ smaller atom's refcount is decremented. -+*/ -+static void capture_fuse_into(txn_atom * small, txn_atom * large) -+{ -+ int level; -+ unsigned zcount = 0; -+ unsigned tcount = 0; -+ -+ assert("umka-224", small != NULL); -+ assert("umka-225", large != NULL); -+ -+ assert_spin_locked(&(large->alock)); -+ assert_spin_locked(&(small->alock)); -+ -+ assert("jmacd-201", atom_isopen(small)); -+ assert("jmacd-202", atom_isopen(large)); -+ -+ /* Splice and update the per-level dirty jnode lists */ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ zcount += -+ capture_fuse_jnode_lists(large, -+ ATOM_DIRTY_LIST(large, level), -+ ATOM_DIRTY_LIST(small, level)); -+ } -+ -+ /* Splice and update the clean, overwrite, writeback and inode jnode -+ lists as well as the txnh list */ -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large), -+ ATOM_CLEAN_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large), -+ ATOM_OVRWR_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large), -+ ATOM_WB_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes); -+ tcount += -+ capture_fuse_txnh_lists(large, &large->txnh_list, -+ &small->txnh_list); -+ -+ /* Check our accounting. */ -+ assert("jmacd-1063", -+ zcount + small->num_queued == small->capture_count); -+ assert("jmacd-1065", tcount == small->txnh_count); -+ -+ /* sum the numbers of waiting threads */ -+ large->nr_waiters += small->nr_waiters; -+ small->nr_waiters = 0; -+ -+ /* splice flush queues */ -+ reiser4_fuse_fq(large, small); -+ -+ /* update the jnode counters on every one of the atom's lists */ -+ ON_DEBUG(large->dirty += small->dirty; -+ small->dirty = 0; -+ large->clean += small->clean; -+ small->clean = 0; -+ large->ovrwr += small->ovrwr; -+ small->ovrwr = 0; -+ large->wb += small->wb; -+ small->wb = 0; -+ large->fq += small->fq; -+ small->fq = 0;); -+ -+ /* count flushers in the resulting atom */ -+ large->nr_flushers += small->nr_flushers; -+ small->nr_flushers = 0; -+ -+ /* update counts of flushed nodes */ -+ large->flushed += small->flushed; -+ small->flushed = 0; -+ -+ /* Transfer list counts to large. */ -+ large->txnh_count += small->txnh_count; -+ large->capture_count += small->capture_count; -+ -+ /* Add all txnh references to large. */ -+ atomic_add(small->txnh_count, &large->refcount); -+ atomic_sub(small->txnh_count, &small->refcount); -+ -+ /* Reset small counts */ -+ small->txnh_count = 0; -+ small->capture_count = 0; -+ -+ /* Assign the oldest start_time, merge flags. */ -+ large->start_time = min(large->start_time, small->start_time); -+ large->flags |= small->flags; -+ -+ /* Merge blocknr sets.
*/ -+ blocknr_set_merge(&small->delete_set, &large->delete_set); -+ blocknr_set_merge(&small->wandered_map, &large->wandered_map); -+ -+ /* Merge allocated/deleted file counts */ -+ large->nr_objects_deleted += small->nr_objects_deleted; -+ large->nr_objects_created += small->nr_objects_created; -+ -+ small->nr_objects_deleted = 0; -+ small->nr_objects_created = 0; -+ -+ /* Merge allocated blocks counts */ -+ large->nr_blocks_allocated += small->nr_blocks_allocated; -+ -+ large->nr_running_queues += small->nr_running_queues; -+ small->nr_running_queues = 0; -+ -+ /* Merge blocks reserved for overwrite set. */ -+ large->flush_reserved += small->flush_reserved; -+ small->flush_reserved = 0; -+ -+ if (large->stage < small->stage) { -+ /* Large only needs to notify if it has changed state. */ -+ reiser4_atom_set_stage(large, small->stage); -+ wakeup_atom_waiting_list(large); -+ } -+ -+ reiser4_atom_set_stage(small, ASTAGE_INVALID); -+ -+ /* Notify any waiters--small needs to unload its wait lists. Waiters -+ actually remove themselves from the list before returning from the -+ fuse_wait function. */ -+ wakeup_atom_waiting_list(small); -+ -+ /* Unlock atoms */ -+ spin_unlock_atom(large); -+ atom_dec_and_unlock(small); -+} -+ -+/* TXNMGR STUFF */ -+ -+/* Release a block from the atom, reversing the effects of being captured; -+ the atom's reference to the jnode is not released here because spin-locks -+ are still held. Currently this is only called when the atom commits. -+ -+ NOTE: this function does not release a (journal) reference to the jnode -+ due to locking optimizations; you should call jput() somewhere after -+ calling reiser4_uncapture_block(). */ -+void reiser4_uncapture_block(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("umka-226", node != NULL); -+ atom = node->atom; -+ assert("umka-228", atom != NULL); -+ -+ assert("jmacd-1021", node->atom == atom); -+ assert_spin_locked(&(node->guard)); -+ assert("jmacd-1023", atom_is_protected(atom)); -+ -+ JF_CLR(node, JNODE_DIRTY); -+ JF_CLR(node, JNODE_RELOC); -+ JF_CLR(node, JNODE_OVRWR); -+ JF_CLR(node, JNODE_CREATED); -+ JF_CLR(node, JNODE_WRITEBACK); -+ JF_CLR(node, JNODE_REPACK); -+ -+ list_del_init(&node->capture_link); -+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { -+ assert("zam-925", atom_isopen(atom)); -+ assert("vs-1623", NODE_LIST(node) == FQ_LIST); -+ ON_DEBUG(atom->num_queued--); -+ JF_CLR(node, JNODE_FLUSH_QUEUED); -+ } -+ atom->capture_count -= 1; -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1)); -+ node->atom = NULL; -+ -+ spin_unlock_jnode(node); -+ LOCK_CNT_DEC(t_refs); -+} -+ -+/* Unconditional insert of a jnode into the atom's overwrite list. Currently used in -+ bitmap-based allocator code for adding modified bitmap blocks to the -+ transaction.
@atom and @node are spin locked */ -+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node) -+{ -+ assert("zam-538", atom_is_protected(atom)); -+ assert_spin_locked(&(node->guard)); -+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-543", node->atom == NULL); -+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node)); -+ -+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom)); -+ jref(node); -+ node->atom = atom; -+ atom->capture_count++; -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1)); -+} -+ -+static int count_deleted_blocks_actor(txn_atom * atom, -+ const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data) -+{ -+ reiser4_block_nr *counter = data; -+ -+ assert("zam-995", data != NULL); -+ assert("zam-996", a != NULL); -+ if (b == NULL) -+ *counter += 1; -+ else -+ *counter += *b; -+ return 0; -+} -+ -+reiser4_block_nr txnmgr_count_deleted_blocks(void) -+{ -+ reiser4_block_nr result; -+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ txn_atom *atom; -+ -+ result = 0; -+ -+ spin_lock_txnmgr(tmgr); -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ if (atom_isopen(atom)) -+ blocknr_set_iterator( -+ atom, &atom->delete_set, -+ count_deleted_blocks_actor, &result, 0); -+ spin_unlock_atom(atom); -+ } -+ spin_unlock_txnmgr(tmgr); -+ -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.30.orig/fs/reiser4/txnmgr.h linux-2.6.30/fs/reiser4/txnmgr.h ---- linux-2.6.30.orig/fs/reiser4/txnmgr.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/txnmgr.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,701 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* data-types and function declarations for transaction manager. See txnmgr.c -+ * for details. */ -+ -+#ifndef __REISER4_TXNMGR_H__ -+#define __REISER4_TXNMGR_H__ -+ -+#include "forward.h" -+#include "dformat.h" -+ -+#include <linux/fs.h> -+#include <linux/mm.h> -+#include <linux/types.h> -+#include <linux/spinlock.h> -+#include <asm/atomic.h> -+#include <linux/wait.h> -+ -+/* TYPE DECLARATIONS */ -+ -+/* This enumeration describes the possible types of a capture request (reiser4_try_capture). -+ A capture request dynamically assigns a block to the calling thread's transaction -+ handle. */ -+typedef enum { -+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's -+ atom should fuse in order to ensure that the block commits atomically with the -+ caller. */ -+ TXN_CAPTURE_READ_ATOMIC = (1 << 0), -+ -+ /* A READ_NONCOM request indicates that a block will be read and that the caller is -+ willing to read a non-committed block without causing atoms to fuse. */ -+ TXN_CAPTURE_READ_NONCOM = (1 << 1), -+ -+ /* A READ_MODIFY request indicates that a block will be read but that the caller -+ wishes for the block to be captured as it will be written. This capture request -+ mode is not currently used, but eventually it will be useful for preventing -+ deadlock in read-modify-write cycles. */ -+ TXN_CAPTURE_READ_MODIFY = (1 << 2), -+ -+ /* A WRITE capture request indicates that a block will be modified and that atoms -+ should fuse to make the commit atomic. 
*/ -+ TXN_CAPTURE_WRITE = (1 << 3), -+ -+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the -+ exclusive type designation from extra bits that may be supplied -- see -+ below. */ -+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC | -+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY | -+ TXN_CAPTURE_WRITE), -+ -+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that -+ indicate modification will occur. */ -+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE), -+ -+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would -+ prefer not to sleep waiting for an aging atom to commit. */ -+ TXN_CAPTURE_NONBLOCKING = (1 << 4), -+ -+ /* An option to reiser4_try_capture to prevent atom fusion; only simple -+ capturing is allowed */ -+ TXN_CAPTURE_DONT_FUSE = (1 << 5) -+ -+ /* This macro selects only the exclusive capture request types, stripping out any -+ options that were supplied (i.e., NONBLOCKING). */ -+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES) -+} txn_capture; -+ -+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the only -+ difference is in the handling of read requests. A WRITE_FUSING transaction handle -+ defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING -+ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */ -+typedef enum { -+ TXN_WRITE_FUSING = (1 << 0), -+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */ -+} txn_mode; -+ -+/* Every atom has a stage, which is one of these exclusive values: */ -+typedef enum { -+ /* Initially an atom is free. */ -+ ASTAGE_FREE = 0, -+ -+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture -+ blocks and fuse with other atoms. */ -+ ASTAGE_CAPTURE_FUSE = 1, -+ -+ /* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */ -+ -+ /* When an atom reaches a certain age it must do all it can to commit. An atom in -+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from -+ atoms in the CAPTURE_FUSE stage. */ -+ ASTAGE_CAPTURE_WAIT = 2, -+ -+ /* Waiting for I/O before commit. Copy-on-capture (see -+ http://namesys.com/v4/v4.html). */ -+ ASTAGE_PRE_COMMIT = 3, -+ -+ /* Post-commit overwrite I/O. Steal-on-capture. */ -+ ASTAGE_POST_COMMIT = 4, -+ -+ /* Atom which waits for the removal of the last reference to it before -+ * it can be deleted from memory */ -+ ASTAGE_DONE = 5, -+ -+ /* invalid atom. */ -+ ASTAGE_INVALID = 6, -+ -+} txn_stage; -+ -+/* Certain flags may be set in the txn_atom->flags field. */ -+typedef enum { -+ /* Indicates that the atom should commit as soon as possible. */ -+ ATOM_FORCE_COMMIT = (1 << 0), -+ /* to avoid an endless loop, mark an atom (which was considered too -+ * small) after a failed attempt to fuse it. */ -+ ATOM_CANCEL_FUSION = (1 << 1) -+} txn_flags; -+ -+/* Flags for controlling commit_txnh */ -+typedef enum { -+ /* Wait for the atom commit to complete in commit_txnh */ -+ TXNH_WAIT_COMMIT = 0x2, -+ /* Don't commit the atom when this handle is closed */ -+ TXNH_DONT_COMMIT = 0x4 -+} txn_handle_flags_t; -+ -+/* TYPE DEFINITIONS */ -+ -+/* A note on lock ordering: the handle and jnode spinlocks protect reading of their ->atom -+ fields, so typically an operation on the atom through either of these objects must (1) -+ lock the object, (2) read the atom pointer, (3) lock the atom.
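The note continues below with the rationale and the original pseudo-code; rendered as compilable C with the primitives defined later in this header, the refcount-assisted acquisition it arrives at looks roughly like this. The function name is hypothetical; jnode_get_atom() in txnmgr.c is the real implementation of the idea.

        static txn_atom *get_atom_stable(jnode * node)
        {
                txn_atom *atom;

                spin_lock_jnode(node);
                for (;;) {
                        atom = node->atom;
                        if (atom == NULL)
                                break;  /* not captured; jnode stays locked */
                        if (spin_trylock_atom(atom))
                                break;  /* fast path: both locks held */

                        /* pin the atom so it cannot be freed, then take
                         * the locks in the legal order: atom, then jnode */
                        atomic_inc(&atom->refcount);
                        spin_unlock_jnode(node);
                        spin_lock_atom(atom);
                        spin_lock_jnode(node);

                        if (node->atom == atom) {
                                /* pre-condition re-checked, still true */
                                atomic_dec(&atom->refcount);
                                break;
                        }
                        /* lost a race with fusion: retry with the new atom */
                        spin_unlock_jnode(node);
                        atom_dec_and_unlock(atom);
                        spin_lock_jnode(node);
                }
                return atom;    /* locked, or NULL with only the jnode locked */
        }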
-+ -+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates -+ through the list of handles and pages held by the smaller of the two atoms. For each -+ handle and page referencing the smaller atom, the fusing process must: (1) lock the -+ object, and (2) update the atom pointer. -+ -+ You can see that there is a conflict of lock ordering here, so the more-complex -+ procedure should have priority, i.e., the fusing process has priority so that it is -+ guaranteed to make progress and to avoid restarts. -+ -+ This decision, however, means additional complexity for aquiring the atom lock in the -+ first place. -+ -+ The general original procedure followed in the code was: -+ -+ TXN_OBJECT *obj = ...; -+ TXN_ATOM *atom; -+ -+ spin_lock (& obj->_lock); -+ -+ atom = obj->_atom; -+ -+ if (! spin_trylock_atom (atom)) -+ { -+ spin_unlock (& obj->_lock); -+ RESTART OPERATION, THERE WAS A RACE; -+ } -+ -+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED -+ -+ It has however been found that this wastes CPU a lot in a manner that is -+ hard to profile. So, proper refcounting was added to atoms, and new -+ standard locking sequence is like following: -+ -+ TXN_OBJECT *obj = ...; -+ TXN_ATOM *atom; -+ -+ spin_lock (& obj->_lock); -+ -+ atom = obj->_atom; -+ -+ if (! spin_trylock_atom (atom)) -+ { -+ atomic_inc (& atom->refcount); -+ spin_unlock (& obj->_lock); -+ spin_lock (&atom->_lock); -+ atomic_dec (& atom->refcount); -+ // HERE atom is locked -+ spin_unlock (&atom->_lock); -+ RESTART OPERATION, THERE WAS A RACE; -+ } -+ -+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED -+ -+ (core of this is implemented in trylock_throttle() function) -+ -+ See the jnode_get_atom() function for a common case. -+ -+ As an additional (and important) optimization allowing to avoid restarts, -+ it is possible to re-check required pre-conditions at the HERE point in -+ code above and proceed without restarting if they are still satisfied. -+*/ -+ -+/* An atomic transaction: this is the underlying system representation -+ of a transaction, not the one seen by clients. -+ -+ Invariants involving this data-type: -+ -+ [sb-fake-allocated] -+*/ -+struct txn_atom { -+ /* The spinlock protecting the atom, held during fusion and various other state -+ changes. */ -+ spinlock_t alock; -+ -+ /* The atom's reference counter, increasing (in case of a duplication -+ of an existing reference or when we are sure that some other -+ reference exists) may be done without taking spinlock, decrementing -+ of the ref. counter requires a spinlock to be held. -+ -+ Each transaction handle counts in ->refcount. All jnodes count as -+ one reference acquired in atom_begin_andlock(), released in -+ commit_current_atom(). -+ */ -+ atomic_t refcount; -+ -+ /* The atom_id identifies the atom in persistent records such as the log. */ -+ __u32 atom_id; -+ -+ /* Flags holding any of the txn_flags enumerated values (e.g., -+ ATOM_FORCE_COMMIT). */ -+ __u32 flags; -+ -+ /* Number of open handles. */ -+ __u32 txnh_count; -+ -+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the -+ dirty_nodes[level] and clean_nodes lists. */ -+ __u32 capture_count; -+ -+#if REISER4_DEBUG -+ int clean; -+ int dirty; -+ int ovrwr; -+ int wb; -+ int fq; -+#endif -+ -+ __u32 flushed; -+ -+ /* Current transaction stage. */ -+ txn_stage stage; -+ -+ /* Start time. */ -+ unsigned long start_time; -+ -+ /* The atom's delete set. It collects block numbers of the nodes -+ which were deleted during the transaction. 
*/ -+ struct list_head delete_set; -+ -+ /* The atom's wandered_block mapping. */ -+ struct list_head wandered_map; -+ -+ /* The transaction's list of dirty captured nodes--per level. Index -+ by (level). dirty_nodes[0] is for znode-above-root */ -+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1]; -+ -+ /* The transaction's list of clean captured nodes. */ -+ struct list_head clean_nodes; -+ -+ /* The atom's overwrite set */ -+ struct list_head ovrwr_nodes; -+ -+ /* nodes which are being written to disk */ -+ struct list_head writeback_nodes; -+ -+ /* list of inodes */ -+ struct list_head inodes; -+ -+ /* List of handles associated with this atom. */ -+ struct list_head txnh_list; -+ -+ /* Transaction list link: list of atoms in the transaction manager. */ -+ struct list_head atom_link; -+ -+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */ -+ struct list_head fwaitfor_list; -+ -+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */ -+ struct list_head fwaiting_list; -+ -+ /* Numbers of objects which were deleted/created in this transaction -+ thereby numbers of objects IDs which were released/deallocated. */ -+ int nr_objects_deleted; -+ int nr_objects_created; -+ /* number of blocks allocated during the transaction */ -+ __u64 nr_blocks_allocated; -+ /* All atom's flush queue objects are on this list */ -+ struct list_head flush_queues; -+#if REISER4_DEBUG -+ /* number of flush queues for this atom. */ -+ int nr_flush_queues; -+ /* Number of jnodes which were removed from atom's lists and put -+ on flush_queue */ -+ int num_queued; -+#endif -+ /* number of threads who wait for this atom to complete commit */ -+ int nr_waiters; -+ /* number of threads which do jnode_flush() over this atom */ -+ int nr_flushers; -+ /* number of flush queues which are IN_USE and jnodes from fq->prepped -+ are submitted to disk by the reiser4_write_fq() routine. */ -+ int nr_running_queues; -+ /* A counter of grabbed unformatted nodes, see a description of the -+ * reiser4 space reservation scheme at block_alloc.c */ -+ reiser4_block_nr flush_reserved; -+#if REISER4_DEBUG -+ void *committer; -+#endif -+ struct super_block *super; -+}; -+ -+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level]) -+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes) -+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes) -+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes) -+#define ATOM_FQ_LIST(fq) (&(fq)->prepped) -+ -+#define NODE_LIST(node) (node)->list -+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list) -+ON_DEBUG(void -+ count_jnode(txn_atom *, jnode *, atom_list old_list, -+ atom_list new_list, int check_lists)); -+ -+/* A transaction handle: the client obtains and commits this handle which is assigned by -+ the system to a txn_atom. */ -+struct txn_handle { -+ /* Spinlock protecting ->atom pointer */ -+ spinlock_t hlock; -+ -+ /* Flags for controlling commit_txnh() behavior */ -+ /* from txn_handle_flags_t */ -+ txn_handle_flags_t flags; -+ -+ /* Whether it is READ_FUSING or WRITE_FUSING. */ -+ txn_mode mode; -+ -+ /* If assigned, the atom it is part of. */ -+ txn_atom *atom; -+ -+ /* Transaction list link. Head is in txn_atom. */ -+ struct list_head txnh_link; -+}; -+ -+/* The transaction manager: one is contained in the reiser4_super_info_data */ -+struct txn_mgr { -+ /* A spinlock protecting the atom list, id_count, flush_control */ -+ spinlock_t tmgr_lock; -+ -+ /* List of atoms. 
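Because dirty nodes are kept per tree level (with dirty_nodes[0] reserved for the znode-above-root), code that needs every dirty jnode of an atom walks all levels through the ATOM_DIRTY_LIST() macro defined above. A debug-style sketch with a hypothetical name, mirroring the walk that count_jnode() performs in txnmgr.c:

        static int count_dirty_jnodes(txn_atom * atom)
        {
                struct list_head *pos;
                tree_level level;
                int count = 0;

                assert_spin_locked(&(atom->alock));
                for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level++)
                        list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
                                count++;
                return count;
        }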
*/ -+ struct list_head atoms_list; -+ -+ /* Number of atoms. */ -+ int atom_count; -+ -+ /* A counter used to assign atom->atom_id values. */ -+ __u32 id_count; -+ -+ /* a mutex object for commit serialization */ -+ struct mutex commit_mutex; -+ -+ /* a list of all txnmrgs served by particular daemon. */ -+ struct list_head linkage; -+ -+ /* description of daemon for this txnmgr */ -+ ktxnmgrd_context *daemon; -+ -+ /* parameters. Adjustable through mount options. */ -+ unsigned int atom_max_size; -+ unsigned int atom_max_age; -+ unsigned int atom_min_size; -+ /* max number of concurrent flushers for one atom, 0 - unlimited. */ -+ unsigned int atom_max_flushers; -+ struct dentry *debugfs_atom_count; -+ struct dentry *debugfs_id_count; -+}; -+ -+/* FUNCTION DECLARATIONS */ -+ -+/* These are the externally (within Reiser4) visible transaction functions, therefore they -+ are prefixed with "txn_". For comments, see txnmgr.c. */ -+ -+extern int init_txnmgr_static(void); -+extern void done_txnmgr_static(void); -+ -+extern void reiser4_init_txnmgr(txn_mgr *); -+extern void reiser4_done_txnmgr(txn_mgr *); -+ -+extern int reiser4_txn_reserve(int reserved); -+ -+extern void reiser4_txn_begin(reiser4_context * context); -+extern int reiser4_txn_end(reiser4_context * context); -+ -+extern void reiser4_txn_restart(reiser4_context * context); -+extern void reiser4_txn_restart_current(void); -+ -+extern int txnmgr_force_commit_all(struct super_block *, int); -+extern int current_atom_should_commit(void); -+ -+extern jnode *find_first_dirty_jnode(txn_atom *, int); -+ -+extern int commit_some_atoms(txn_mgr *); -+extern int force_commit_atom(txn_handle *); -+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *); -+ -+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int); -+ -+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage); -+ -+extern int same_slum_check(jnode * base, jnode * check, int alloc_check, -+ int alloc_value); -+extern void atom_dec_and_unlock(txn_atom * atom); -+ -+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags); -+extern int try_capture_page_to_invalidate(struct page *pg); -+ -+extern void reiser4_uncapture_page(struct page *pg); -+extern void reiser4_uncapture_block(jnode *); -+extern void reiser4_uncapture_jnode(jnode *); -+ -+extern int reiser4_capture_inode(struct inode *); -+extern int reiser4_uncapture_inode(struct inode *); -+ -+extern txn_atom *get_current_atom_locked_nocheck(void); -+ -+#if REISER4_DEBUG -+ -+/** -+ * atom_is_protected - make sure that nobody but us can do anything with atom -+ * @atom: atom to be checked -+ * -+ * This is used to assert that atom either entered commit stages or is spin -+ * locked. -+ */ -+static inline int atom_is_protected(txn_atom *atom) -+{ -+ if (atom->stage >= ASTAGE_PRE_COMMIT) -+ return 1; -+ assert_spin_locked(&(atom->alock)); -+ return 1; -+} -+ -+#endif -+ -+/* Get the current atom and spinlock it if current atom present. 
May not return NULL */ -+static inline txn_atom *get_current_atom_locked(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked_nocheck(); -+ assert("zam-761", atom != NULL); -+ -+ return atom; -+} -+ -+extern txn_atom *jnode_get_atom(jnode *); -+ -+extern void reiser4_atom_wait_event(txn_atom *); -+extern void reiser4_atom_send_event(txn_atom *); -+ -+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node); -+extern int reiser4_capture_super_block(struct super_block *s); -+int capture_bulk(jnode **, int count); -+ -+/* See the comment on the function blocknrset.c:blocknr_set_add for the -+ calling convention of these three routines. */ -+extern void blocknr_set_init(struct list_head * bset); -+extern void blocknr_set_destroy(struct list_head * bset); -+extern void blocknr_set_merge(struct list_head * from, struct list_head * into); -+extern int blocknr_set_add_extent(txn_atom * atom, -+ struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * start, -+ const reiser4_block_nr * len); -+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * a, -+ const reiser4_block_nr * b); -+ -+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *, -+ const reiser4_block_nr *, void *); -+ -+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset, -+ blocknr_set_actor_f actor, void *data, -+ int delete); -+ -+/* flush code takes care about how to fuse flush queues */ -+extern void flush_init_atom(txn_atom * atom); -+extern void flush_fuse_queues(txn_atom * large, txn_atom * small); -+ -+static inline void spin_lock_atom(txn_atom *atom) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_atom) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(atom->alock)); -+ -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_lock_atom_nested(txn_atom *atom) -+{ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING); -+ -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_atom(txn_atom *atom) -+{ -+ if (spin_trylock(&(atom->alock))) { -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_atom(txn_atom *atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_atom); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(atom->alock)); -+} -+ -+static inline void spin_lock_txnh(txn_handle *txnh) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(txnh->hlock)); -+ -+ LOCK_CNT_INC(spin_locked_txnh); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_txnh(txn_handle *txnh) -+{ -+ if (spin_trylock(&(txnh->hlock))) { -+ LOCK_CNT_INC(spin_locked_txnh); -+ LOCK_CNT_INC(spin_locked); -+ return 
1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_txnh(txn_handle *txnh) -+{ -+ assert_spin_locked(&(txnh->hlock)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_txnh); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(txnh->hlock)); -+} -+ -+#define spin_ordering_pred_txnmgr(tmgr) \ -+ ( LOCK_CNT_NIL(spin_locked_atom) && \ -+ LOCK_CNT_NIL(spin_locked_txnh) && \ -+ LOCK_CNT_NIL(spin_locked_jnode) && \ -+ LOCK_CNT_NIL(rw_locked_zlock) && \ -+ LOCK_CNT_NIL(rw_locked_dk) && \ -+ LOCK_CNT_NIL(rw_locked_tree) ) -+ -+static inline void spin_lock_txnmgr(txn_mgr *mgr) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_atom) && -+ LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(mgr->tmgr_lock)); -+ -+ LOCK_CNT_INC(spin_locked_txnmgr); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_txnmgr(txn_mgr *mgr) -+{ -+ if (spin_trylock(&(mgr->tmgr_lock))) { -+ LOCK_CNT_INC(spin_locked_txnmgr); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_txnmgr(txn_mgr *mgr) -+{ -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_txnmgr); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(mgr->tmgr_lock)); -+} -+ -+typedef enum { -+ FQ_IN_USE = 0x1 -+} flush_queue_state_t; -+ -+typedef struct flush_queue flush_queue_t; -+ -+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue -+ is filled by the jnode_flush() routine, and written to disk under memory -+ pressure or at atom commit time. */ -+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued -+ field and fq->prepped list can be modified if atom is spin-locked and fq -+ object is "in-use" state. For read-only traversal of the fq->prepped list -+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or -+ only have atom spin-locked. */ -+struct flush_queue { -+ /* linkage element is the first in this structure to make debugging -+ easier. See field in atom struct for description of list. */ -+ struct list_head alink; -+ /* A spinlock to protect changes of fq state and fq->atom pointer */ -+ spinlock_t guard; -+ /* flush_queue state: [in_use | ready] */ -+ flush_queue_state_t state; -+ /* A list which contains queued nodes, queued nodes are removed from any -+ * atom's list and put on this ->prepped one. */ -+ struct list_head prepped; -+ /* number of submitted i/o requests */ -+ atomic_t nr_submitted; -+ /* number of i/o errors */ -+ atomic_t nr_errors; -+ /* An atom this flush queue is attached to */ -+ txn_atom *atom; -+ /* A wait queue head to wait on i/o completion */ -+ wait_queue_head_t wait; -+#if REISER4_DEBUG -+ /* A thread which took this fq in exclusive use, NULL if fq is free, -+ * used for debugging. 
*/ -+ struct task_struct *owner; -+#endif -+}; -+ -+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **); -+extern void reiser4_fq_put_nolock(flush_queue_t *); -+extern void reiser4_fq_put(flush_queue_t *); -+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from); -+extern void queue_jnode(flush_queue_t *, jnode *); -+ -+extern int reiser4_write_fq(flush_queue_t *, long *, int); -+extern int current_atom_finish_all_fq(void); -+extern void init_atom_fq_parts(txn_atom *); -+ -+extern reiser4_block_nr txnmgr_count_deleted_blocks(void); -+ -+extern void znode_make_dirty(znode * node); -+extern void jnode_make_dirty_locked(jnode * node); -+ -+extern int reiser4_sync_atom(txn_atom * atom); -+ -+#if REISER4_DEBUG -+extern int atom_fq_parts_are_clean(txn_atom *); -+#endif -+ -+extern void add_fq_to_bio(flush_queue_t *, struct bio *); -+extern flush_queue_t *get_fq_for_current_atom(void); -+ -+void reiser4_invalidate_list(struct list_head * head); -+ -+# endif /* __REISER4_TXNMGR_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/type_safe_hash.h linux-2.6.30/fs/reiser4/type_safe_hash.h ---- linux-2.6.30.orig/fs/reiser4/type_safe_hash.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/type_safe_hash.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,320 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* A hash table class that uses hash chains (singly-linked) and is -+ parametrized to provide type safety. */ -+ -+#ifndef __REISER4_TYPE_SAFE_HASH_H__ -+#define __REISER4_TYPE_SAFE_HASH_H__ -+ -+#include "debug.h" -+ -+#include <asm/errno.h> -+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects -+ based on the object type. You need to declare the item type before -+ this definition, define it after this definition. */ -+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \ -+ \ -+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \ -+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \ -+ \ -+struct PREFIX##_hash_table_ \ -+{ \ -+ ITEM_TYPE **_table; \ -+ __u32 _buckets; \ -+}; \ -+ \ -+struct PREFIX##_hash_link_ \ -+{ \ -+ ITEM_TYPE *_next; \ -+} -+ -+/* Step 2: Define the object type of the hash: give it field of type -+ PREFIX_hash_link. */ -+ -+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using -+ the type and field name used in step 3. The arguments are: -+ -+ ITEM_TYPE The item type being hashed -+ KEY_TYPE The type of key being hashed -+ KEY_NAME The name of the key field within the item -+ LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link) -+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key) -+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys) -+ -+ It implements these functions: -+ -+ prefix_hash_init Initialize the table given its size. -+ prefix_hash_insert Insert an item -+ prefix_hash_insert_index Insert an item w/ precomputed hash_index -+ prefix_hash_find Find an item by key -+ prefix_hash_find_index Find an item w/ precomputed hash_index -+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found -+ prefix_hash_remove_index Remove an item w/ precomputed hash_index -+ -+ If you'd like something to be done differently, feel free to ask me -+ for modifications. 
Additional features that could be added but -+ have not been: -+ -+ prefix_hash_remove_key Find and remove an item by key -+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index -+ -+ The hash_function currently receives only the key as an argument, -+ meaning it must somehow know the number of buckets. If this is a -+ problem let me know. -+ -+ This hash table uses a single-linked hash chain. This means -+ insertion is fast but deletion requires searching the chain. -+ -+ There is also the doubly-linked hash chain approach, under which -+ deletion requires no search but the code is longer and it takes two -+ pointers per item. -+ -+ The circularly-linked approach has the shortest code but requires -+ two pointers per bucket, doubling the size of the bucket array (in -+ addition to two pointers per item). -+*/ -+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \ -+ \ -+static __inline__ void \ -+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \ -+ __u32 hash UNUSED_ARG) \ -+{ \ -+ assert("nikita-2780", hash < table->_buckets); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_init (PREFIX##_hash_table *hash, \ -+ __u32 buckets) \ -+{ \ -+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \ -+ hash->_buckets = buckets; \ -+ if (hash->_table == NULL) \ -+ { \ -+ return RETERR(-ENOMEM); \ -+ } \ -+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \ -+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \ -+ return 0; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_done (PREFIX##_hash_table *hash) \ -+{ \ -+ if (REISER4_DEBUG && hash->_table != NULL) { \ -+ __u32 i; \ -+ for (i = 0 ; i < hash->_buckets ; ++ i) \ -+ assert("nikita-2905", hash->_table[i] == NULL); \ -+ } \ -+ if (hash->_table != NULL) \ -+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \ -+ hash->_table = NULL; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \ -+{ \ -+ prefetch(item->LINK_NAME._next); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \ -+ __u32 index) \ -+{ \ -+ prefetch(hash->_table[index]); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ ITEM_TYPE *item; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ for (item = hash->_table[hash_index]; \ -+ item != NULL; \ -+ item = item->LINK_NAME._next) \ -+ { \ -+ prefetch(item->LINK_NAME._next); \ -+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \ -+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \ -+ { \ -+ return item; \ -+ } \ -+ } \ -+ \ -+ return NULL; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ ITEM_TYPE ** item = &hash->_table[hash_index]; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ while (*item != NULL) { \ -+ prefetch(&(*item)->LINK_NAME._next); \ -+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \ -+ ITEM_TYPE *found; \ -+ \ -+ found = *item; \ -+ *item = found->LINK_NAME._next; \ -+ found->LINK_NAME._next = hash->_table[hash_index]; \ -+ hash->_table[hash_index] = found; \ -+ return found; \ -+ } \ -+ item = &(*item)->LINK_NAME._next; \ -+ } \ -+ return NULL; \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove_index 
(PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ while (*hash_item_p != NULL) { \ -+ prefetch(&(*hash_item_p)->LINK_NAME._next); \ -+ if (*hash_item_p == del_item) { \ -+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \ -+ return 1; \ -+ } \ -+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \ -+ } \ -+ return 0; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ -+ hash->_table[hash_index] = ins_item; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ -+ smp_wmb(); \ -+ hash->_table[hash_index] = ins_item; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find (PREFIX##_hash_table *hash, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ return PREFIX##_hash_remove_index (hash, \ -+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ return PREFIX##_hash_remove (hash, del_item); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ return PREFIX##_hash_insert_index (hash, \ -+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \ -+ ins_item); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE * \ -+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \ -+{ \ -+ ITEM_TYPE *first; \ -+ \ -+ for (first = NULL; ind < hash->_buckets; ++ ind) { \ -+ first = hash->_table[ind]; \ -+ if (first != NULL) \ -+ break; \ -+ } \ -+ return first; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE * \ -+PREFIX##_hash_next (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *item) \ -+{ \ -+ ITEM_TYPE *next; \ -+ \ -+ if (item == NULL) \ -+ return NULL; \ -+ next = item->LINK_NAME._next; \ -+ if (next == NULL) \ -+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \ -+ return next; \ -+} \ -+ \ -+typedef struct {} PREFIX##_hash_dummy -+ -+#define for_all_ht_buckets(table, head) \ -+for ((head) = &(table) -> _table[ 0 ] ; \ -+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head)) -+ -+#define for_all_in_bucket(bucket, item, next, field) \ -+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \ -+ (item) != NULL ; \ -+ (item) = (next), (next) = (item) ? 
(item) -> field._next : NULL ) -+ -+#define for_all_in_htable(table, prefix, item, next) \ -+for ((item) = prefix ## _hash_first ((table), 0), \ -+ (next) = prefix ## _hash_next ((table), (item)) ; \ -+ (item) != NULL ; \ -+ (item) = (next), \ -+ (next) = prefix ## _hash_next ((table), (item))) -+ -+/* __REISER4_TYPE_SAFE_HASH_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/vfs_ops.c linux-2.6.30/fs/reiser4/vfs_ops.c ---- linux-2.6.30.orig/fs/reiser4/vfs_ops.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/vfs_ops.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,259 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined -+ here. */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/file/file.h" -+#include "plugin/security/perm.h" -+#include "plugin/disk_format/disk_format.h" -+#include "plugin/plugin.h" -+#include "plugin/plugin_set.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+#include "entd.h" -+#include "status_flags.h" -+#include "flush.h" -+#include "dscale.h" -+ -+#include <linux/profile.h> -+#include <linux/types.h> -+#include <linux/mount.h> -+#include <linux/vfs.h> -+#include <linux/mm.h> -+#include <linux/buffer_head.h> -+#include <linux/dcache.h> -+#include <linux/list.h> -+#include <linux/pagemap.h> -+#include <linux/slab.h> -+#include <linux/seq_file.h> -+#include <linux/init.h> -+#include <linux/module.h> -+#include <linux/writeback.h> -+#include <linux/blkdev.h> -+#include <linux/quotaops.h> -+#include <linux/security.h> -+#include <linux/reboot.h> -+#include <linux/rcupdate.h> -+ -+/* update inode stat-data by calling plugin */ -+int reiser4_update_sd(struct inode *object) -+{ -+ file_plugin *fplug; -+ -+ assert("nikita-2338", object != NULL); -+ /* check for read-only file system. */ -+ if (IS_RDONLY(object)) -+ return 0; -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-2339", fplug != NULL); -+ return fplug->write_sd_by_inode(object); -+} -+ -+/* helper function: increase inode nlink count and call plugin method to save -+ updated stat-data. 
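
An aside on the TYPE_SAFE_HASH_DEFINE template above: the trade-off its header comment describes -- O(1) insertion at the head of a single-linked chain versus deletion that must search the chain -- is easier to see without the macro machinery. Below is a minimal standalone userspace sketch of the same intrusive single-linked chaining technique; all names are illustrative, and none of this is reiser4 API.

#include <stdio.h>

/* toy item with an embedded (intrusive) chain link, like LINK_NAME above */
struct item {
    unsigned key;           /* plays the role of KEY_NAME */
    struct item *next;      /* single-linked hash chain */
};

#define NBUCKETS 64u

static struct item *table[NBUCKETS];

static unsigned hash_fn(unsigned key) { return key % NBUCKETS; }

/* O(1): push at the chain head, like PREFIX##_hash_insert_index() */
static void hash_insert(struct item *it)
{
    unsigned b = hash_fn(it->key);
    it->next = table[b];
    table[b] = it;
}

/* deletion must walk the chain -- the cost of single linkage */
static int hash_remove(struct item *it)
{
    struct item **p = &table[hash_fn(it->key)];

    for (; *p != NULL; p = &(*p)->next) {
        if (*p == it) {
            *p = it->next;  /* unlink */
            return 1;
        }
    }
    return 0;
}

static struct item *hash_find(unsigned key)
{
    struct item *it = table[hash_fn(key)];

    while (it != NULL && it->key != key)
        it = it->next;
    return it;
}

int main(void)
{
    struct item a = { .key = 7 }, b = { .key = 71 };

    hash_insert(&a);
    hash_insert(&b);        /* 7 and 71 collide when NBUCKETS == 64 */
    printf("found %u\n", hash_find(71)->key);
    hash_remove(&a);
    printf("7 is %s\n", hash_find(7) ? "still there" : "gone");
    return 0;
}

The pointer-to-pointer walk in hash_remove() is the same trick PREFIX##_hash_remove_index() uses above: the head and interior positions share one unlink path, so the first element needs no special case.
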
-+ -+ Used by link/create and during creation of dot and dotdot in mkdir -+*/ -+int reiser4_add_nlink(struct inode *object /* object to which link is added */ , -+ struct inode *parent /* parent where new entry will be */ -+ , -+ int write_sd_p /* true if stat-data has to be -+ * updated */ ) -+{ -+ file_plugin *fplug; -+ int result; -+ -+ assert("nikita-1351", object != NULL); -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-1445", fplug != NULL); -+ -+ /* ask plugin whether it can add yet another link to this -+ object */ -+ if (!fplug->can_add_link(object)) -+ return RETERR(-EMLINK); -+ -+ assert("nikita-2211", fplug->add_link != NULL); -+ /* call plugin to do actual addition of link */ -+ result = fplug->add_link(object, parent); -+ -+ /* optionally update stat data */ -+ if (result == 0 && write_sd_p) -+ result = fplug->write_sd_by_inode(object); -+ return result; -+} -+ -+/* helper function: decrease inode nlink count and call plugin method to save -+ updated stat-data. -+ -+ Used by unlink/create -+*/ -+int reiser4_del_nlink(struct inode *object /* object from which link is -+ * removed */ , -+ struct inode *parent /* parent where entry was */ , -+ int write_sd_p /* true is stat-data has to be -+ * updated */ ) -+{ -+ file_plugin *fplug; -+ int result; -+ -+ assert("nikita-1349", object != NULL); -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-1350", fplug != NULL); -+ assert("nikita-1446", object->i_nlink > 0); -+ assert("nikita-2210", fplug->rem_link != NULL); -+ -+ /* call plugin to do actual deletion of link */ -+ result = fplug->rem_link(object, parent); -+ -+ /* optionally update stat data */ -+ if (result == 0 && write_sd_p) -+ result = fplug->write_sd_by_inode(object); -+ return result; -+} -+ -+/* Release reiser4 dentry. This is d_op->d_release() method. */ -+static void reiser4_d_release(struct dentry *dentry /* dentry released */ ) -+{ -+ reiser4_free_dentry_fsdata(dentry); -+} -+ -+/* -+ * Called by reiser4_sync_inodes(), during speculative write-back (through -+ * pdflush, or balance_dirty_pages()). -+ */ -+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc) -+{ -+ long written = 0; -+ int repeats = 0; -+ int result; -+ struct address_space *mapping; -+ -+ /* -+ * Performs early flushing, trying to free some memory. If there is -+ * nothing to flush, commits some atoms. -+ */ -+ -+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or -+ sys_fsync(). 
*/
-+ if (wbc->sync_mode != WB_SYNC_NONE) {
-+ txnmgr_force_commit_all(sb, 0);
-+ return;
-+ }
-+
-+ BUG_ON(reiser4_get_super_fake(sb) == NULL);
-+ mapping = reiser4_get_super_fake(sb)->i_mapping;
-+ do {
-+ long nr_submitted = 0;
-+ jnode *node = NULL;
-+
-+ /* do not put more requests to overload write queue */
-+ if (wbc->nonblocking &&
-+ bdi_write_congested(mapping->backing_dev_info)) {
-+ blk_run_address_space(mapping);
-+ wbc->encountered_congestion = 1;
-+ break;
-+ }
-+ repeats++;
-+ BUG_ON(wbc->nr_to_write <= 0);
-+
-+ if (get_current_context()->entd) {
-+ entd_context *ent = get_entd_context(sb);
-+
-+ if (ent->cur_request->node)
-+ /*
-+ * this is ent thread and it managed to capture
-+ * requested page itself - start flush from
-+ * that page
-+ */
-+ node = jref(ent->cur_request->node);
-+ }
-+
-+ result = flush_some_atom(node, &nr_submitted, wbc,
-+ JNODE_FLUSH_WRITE_BLOCKS);
-+ if (result != 0)
-+ warning("nikita-31001", "Flush failed: %i", result);
-+ if (node)
-+ jput(node);
-+ if (!nr_submitted)
-+ break;
-+
-+ wbc->nr_to_write -= nr_submitted;
-+ written += nr_submitted;
-+ } while (wbc->nr_to_write > 0);
-+}
-+
-+void reiser4_throttle_write(struct inode *inode)
-+{
-+ reiser4_txn_restart_current();
-+ balance_dirty_pages_ratelimited(inode->i_mapping);
-+}
-+
-+const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
-+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
-+ * beginning of device */
-+
-+/*
-+ * Reiser4 initialization/shutdown.
-+ *
-+ * Code below performs global reiser4 initialization that is done either as
-+ * part of kernel initialization (when reiser4 is statically built-in), or
-+ * during reiser4 module load (when compiled as module).
-+ */
-+
-+void reiser4_handle_error(void)
-+{
-+ struct super_block *sb = reiser4_get_current_sb();
-+
-+ if (!sb)
-+ return;
-+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
-+ "Filesystem error occurred");
-+ switch (get_super_private(sb)->onerror) {
-+ case 0:
-+ reiser4_panic("foobar-42", "Filesystem error occurred\n");
-+ case 1:
-+ default:
-+ if (sb->s_flags & MS_RDONLY)
-+ return;
-+ sb->s_flags |= MS_RDONLY;
-+ break;
-+ }
-+}
-+
-+struct dentry_operations reiser4_dentry_operations = {
-+ .d_revalidate = NULL,
-+ .d_hash = NULL,
-+ .d_compare = NULL,
-+ .d_delete = NULL,
-+ .d_release = reiser4_d_release,
-+ .d_iput = NULL,
-+};
-+
-+/* Make Linus happy.
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/vfs_ops.h linux-2.6.30/fs/reiser4/vfs_ops.h ---- linux-2.6.30.orig/fs/reiser4/vfs_ops.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/vfs_ops.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,53 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* vfs_ops.c's exported symbols */ -+ -+#if !defined( __FS_REISER4_VFS_OPS_H__ ) -+#define __FS_REISER4_VFS_OPS_H__ -+ -+#include "forward.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/file/file.h" -+#include "super.h" -+#include "readahead.h" -+ -+#include <linux/types.h> /* for loff_t */ -+#include <linux/fs.h> /* for struct address_space */ -+#include <linux/dcache.h> /* for struct dentry */ -+#include <linux/mm.h> -+#include <linux/backing-dev.h> -+ -+/* address space operations */ -+int reiser4_writepage(struct page *, struct writeback_control *); -+int reiser4_set_page_dirty(struct page *); -+void reiser4_invalidatepage(struct page *, unsigned long offset); -+int reiser4_releasepage(struct page *, gfp_t); -+ -+extern int reiser4_update_sd(struct inode *); -+extern int reiser4_add_nlink(struct inode *, struct inode *, int); -+extern int reiser4_del_nlink(struct inode *, struct inode *, int); -+ -+extern int reiser4_start_up_io(struct page *page); -+extern void reiser4_throttle_write(struct inode *); -+extern int jnode_is_releasable(jnode *); -+ -+#define CAPTURE_APAGE_BURST (1024l) -+void reiser4_writeout(struct super_block *, struct writeback_control *); -+ -+extern void reiser4_handle_error(void); -+ -+/* __FS_REISER4_VFS_OPS_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/wander.c linux-2.6.30/fs/reiser4/wander.c ---- linux-2.6.30.orig/fs/reiser4/wander.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/wander.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1798 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Reiser4 Wandering Log */ -+ -+/* You should read http://www.namesys.com/txn-doc.html -+ -+ That describes how filesystem operations are performed as atomic -+ transactions, and how we try to arrange it so that we can write most of the -+ data only once while performing the operation atomically. -+ -+ For the purposes of this code, it is enough for it to understand that it -+ has been told a given block should be written either once, or twice (if -+ twice then once to the wandered location and once to the real location). -+ -+ This code guarantees that those blocks that are defined to be part of an -+ atom either all take effect or none of them take effect. -+ -+ The "relocate set" of nodes are submitted to write by the jnode_flush() -+ routine, and the "overwrite set" is submitted by reiser4_write_log(). -+ This is because with the overwrite set we seek to optimize writes, and -+ with the relocate set we seek to cause disk order to correlate with the -+ "parent first order" (preorder). 
-+ -+ reiser4_write_log() allocates and writes wandered blocks and maintains -+ additional on-disk structures of the atom as wander records (each wander -+ record occupies one block) for storing of the "wandered map" (a table which -+ contains a relation between wandered and real block numbers) and other -+ information which might be needed at transaction recovery time. -+ -+ The wander records are unidirectionally linked into a circle: each wander -+ record contains a block number of the next wander record, the last wander -+ record points to the first one. -+ -+ One wander record (named "tx head" in this file) has a format which is -+ different from the other wander records. The "tx head" has a reference to the -+ "tx head" block of the previously committed atom. Also, "tx head" contains -+ fs information (the free blocks counter, and the oid allocator state) which -+ is logged in a special way . -+ -+ There are two journal control blocks, named journal header and journal -+ footer which have fixed on-disk locations. The journal header has a -+ reference to the "tx head" block of the last committed atom. The journal -+ footer points to the "tx head" of the last flushed atom. The atom is -+ "played" when all blocks from its overwrite set are written to disk the -+ second time (i.e. written to their real locations). -+ -+ NOTE: People who know reiserfs internals and its journal structure might be -+ confused with these terms journal footer and journal header. There is a table -+ with terms of similar semantics in reiserfs (reiser3) and reiser4: -+ -+ REISER3 TERM | REISER4 TERM | DESCRIPTION -+ --------------------+-----------------------+---------------------------- -+ commit record | journal header | atomic write of this record -+ | | ends transaction commit -+ --------------------+-----------------------+---------------------------- -+ journal header | journal footer | atomic write of this record -+ | | ends post-commit writes. -+ | | After successful -+ | | writing of this journal -+ | | blocks (in reiser3) or -+ | | wandered blocks/records are -+ | | free for re-use. -+ --------------------+-----------------------+---------------------------- -+ -+ The atom commit process is the following: -+ -+ 1. The overwrite set is taken from atom's clean list, and its size is -+ counted. -+ -+ 2. The number of necessary wander records (including tx head) is calculated, -+ and the wander record blocks are allocated. -+ -+ 3. Allocate wandered blocks and populate wander records by wandered map. -+ -+ 4. submit write requests for wander records and wandered blocks. -+ -+ 5. wait until submitted write requests complete. -+ -+ 6. update journal header: change the pointer to the block number of just -+ written tx head, submit an i/o for modified journal header block and wait -+ for i/o completion. -+ -+ NOTE: The special logging for bitmap blocks and some reiser4 super block -+ fields makes processes of atom commit, flush and recovering a bit more -+ complex (see comments in the source code for details). -+ -+ The atom playing process is the following: -+ -+ 1. Write atom's overwrite set in-place. -+ -+ 2. Wait on i/o. -+ -+ 3. Update journal footer: change the pointer to block number of tx head -+ block of the atom we currently flushing, submit an i/o, wait on i/o -+ completion. -+ -+ 4. Free disk space which was used for wandered blocks and wander records. 
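
The essential correctness property behind the two sequences above is write ordering: a tx head may become reachable from the journal header only after the wandered data and wander records are stable, and the journal footer may advance only after the overwrite set is stable in place. Here is a toy userspace sketch of that ordering, with a flat file standing in for the block device and fsync() standing in for i/o completion and barriers; this is illustrative only, not reiser4 code.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define BLK 4096

/* write "len" bytes at the start of block "blocknr" */
static void write_at(int fd, uint64_t blocknr, const void *data, size_t len)
{
    (void)pwrite(fd, data, len, (off_t)(blocknr * BLK));
}

/* commit: wandered data and records first, then the header pointer */
static void commit_tx_demo(int fd, const char data[BLK], uint64_t wandered,
                           const char txhead[BLK], uint64_t txhead_blk,
                           uint64_t header_blk)
{
    write_at(fd, wandered, data, BLK);          /* steps 3-4 */
    write_at(fd, txhead_blk, txhead, BLK);
    fsync(fd);                                  /* step 5: wait for i/o */
    write_at(fd, header_blk, &txhead_blk, sizeof(txhead_blk)); /* step 6 */
    fsync(fd);
}

/* play: overwrite set in place, then the footer pointer */
static void play_tx_demo(int fd, const char data[BLK], uint64_t real,
                         uint64_t txhead_blk, uint64_t footer_blk)
{
    write_at(fd, real, data, BLK);              /* step 1 */
    fsync(fd);                                  /* step 2 */
    write_at(fd, footer_blk, &txhead_blk, sizeof(txhead_blk)); /* step 3 */
    fsync(fd);
    /* step 4: the wandered blocks may now be reused */
}

int main(void)
{
    char data[BLK] = "payload", txhead[BLK] = "tx head";
    int fd = open("journal-demo.img", O_RDWR | O_CREAT, 0600);

    if (fd < 0)
        return 1;
    commit_tx_demo(fd, data, 10, txhead, 2, 0);
    play_tx_demo(fd, data, 20, 2, 1);
    return close(fd);
}

A crash before the header update leaves the old transaction chain intact; a crash after it is exactly the case the replay code later in this file recovers from.
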
-+
-+ After the freeing of wandered blocks and wander records, the journal
-+ footer points to an on-disk structure which might be overwritten soon.
-+ Neither the log writer nor the journal recovery procedure uses that pointer
-+ for accessing the data. When the journal recovery procedure finds the oldest
-+ transaction, it compares the journal footer pointer value with the "prev_tx"
-+ pointer value in the tx head; if the values are equal, the oldest unflushed
-+ transaction has been found.
-+
-+ NOTE on disk space leakage: the information about which blocks and how many
-+ blocks are allocated for wandered blocks and wander records is not written to
-+ the disk because of the special logging for bitmaps and some super block
-+ counters. After a system crash reiser4 does not remember those
-+ allocations, so there is no disk space leakage of that kind.
-+*/
-+
-+/* Special logging of reiser4 super block fields. */
-+
-+/* There are some reiser4 super block fields (the free block count and the OID
-+ allocator state (number of files and next free OID)) which are logged
-+ separately from the super block to avoid unnecessary atom fusion.
-+
-+ So, the reiser4 super block need not be captured by a transaction which
-+ allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
-+ the reiser4 on-disk super block is not touched when such a transaction is
-+ committed and flushed. Those "counters logged specially" are logged in "tx
-+ head" blocks and in the journal footer block.
-+
-+ A step-by-step description of special logging:
-+
-+ 0. The per-atom information about deleted or created files and allocated or
-+ freed blocks is collected during the transaction. The atom's
-+ ->nr_objects_created and ->nr_objects_deleted are for object
-+ deletion/creation tracking; the numbers of allocated and freed blocks are
-+ calculated using the atom's delete set and the atom's capture list -- all new
-+ and relocated nodes should be on the atom's clean list and should have the
-+ JNODE_RELOC bit set.
-+
-+ 1. The "logged specially" reiser4 super block fields have their "committed"
-+ versions in the reiser4 in-memory super block. They get modified only at
-+ atom commit time. The atom's commit thread has exclusive access to those
-+ "committed" fields because the log writer implementation supports only one
-+ atom commit at a time (there is a per-fs "commit" mutex). At
-+ that time the "committed" counters are modified using the per-atom information
-+ collected during the transaction. These counters are stored on disk as
-+ part of the tx head block when the atom is committed.
-+
-+ 2. When the atom is flushed, the value of the free block counter and the OID
-+ allocator state get written to the journal footer block. A special journal
-+ procedure (journal_recover_sb_data()) takes those values from the journal
-+ footer and updates the reiser4 in-memory super block.
-+
-+ NOTE: That means the free block count and the OID allocator state are logged
-+ separately from the reiser4 super block regardless of the fact that the
-+ reiser4 super block has fields to store both the free block counter and the
-+ OID allocator.
-+
-+ Writing the whole super block at commit time would require knowing true values
-+ of all its fields without the changes made by not yet committed transactions.
-+ That would be possible by keeping a "committed" version of the super block,
-+ just as the reiser4 bitmap blocks have "committed" and "working" versions.
However, -+ another scheme was implemented which stores special logged values in the -+ unused free space inside transaction head block. In my opinion it has an -+ advantage of not writing whole super block when only part of it was -+ modified. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "reiser4.h" -+#include "super.h" -+#include "vfs_ops.h" -+#include "writeout.h" -+#include "inode.h" -+#include "entd.h" -+ -+#include <linux/types.h> -+#include <linux/fs.h> /* for struct super_block */ -+#include <linux/mm.h> /* for struct page */ -+#include <linux/pagemap.h> -+#include <linux/bio.h> /* for struct bio */ -+#include <linux/blkdev.h> -+ -+static int write_jnodes_to_disk_extent( -+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int); -+ -+/* The commit_handle is a container for objects needed at atom commit time */ -+struct commit_handle { -+ /* A pointer to atom's list of OVRWR nodes */ -+ struct list_head *overwrite_set; -+ /* atom's overwrite set size */ -+ int overwrite_set_size; -+ /* jnodes for wander record blocks */ -+ struct list_head tx_list; -+ /* number of wander records */ -+ __u32 tx_size; -+ /* 'committed' sb counters are saved here until atom is completely -+ flushed */ -+ __u64 free_blocks; -+ __u64 nr_files; -+ __u64 next_oid; -+ /* A pointer to the atom which is being committed */ -+ txn_atom *atom; -+ /* A pointer to current super block */ -+ struct super_block *super; -+ /* The counter of modified bitmaps */ -+ reiser4_block_nr nr_bitmap; -+}; -+ -+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom) -+{ -+ memset(ch, 0, sizeof(struct commit_handle)); -+ INIT_LIST_HEAD(&ch->tx_list); -+ -+ ch->atom = atom; -+ ch->super = reiser4_get_current_sb(); -+} -+ -+static void done_commit_handle(struct commit_handle *ch) -+{ -+ assert("zam-690", list_empty(&ch->tx_list)); -+} -+ -+static inline int reiser4_use_write_barrier(struct super_block * s) -+{ -+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER); -+} -+ -+static void disable_write_barrier(struct super_block * s) -+{ -+ notice("zam-1055", "%s does not support write barriers," -+ " using synchronous write instead.", s->s_id); -+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags); -+} -+ -+/* fill journal header block data */ -+static void format_journal_header(struct commit_handle *ch) -+{ -+ struct reiser4_super_info_data *sbinfo; -+ struct journal_header *header; -+ jnode *txhead; -+ -+ sbinfo = get_super_private(ch->super); -+ assert("zam-479", sbinfo != NULL); -+ assert("zam-480", sbinfo->journal_header != NULL); -+ -+ txhead = list_entry(ch->tx_list.next, jnode, capture_link); -+ -+ jload(sbinfo->journal_header); -+ -+ header = (struct journal_header *)jdata(sbinfo->journal_header); -+ assert("zam-484", header != NULL); -+ -+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)), -+ &header->last_committed_tx); -+ -+ jrelse(sbinfo->journal_header); -+} -+ -+/* fill journal footer block data */ -+static void format_journal_footer(struct commit_handle *ch) -+{ -+ struct reiser4_super_info_data *sbinfo; -+ struct journal_footer *footer; -+ jnode *tx_head; -+ -+ sbinfo = get_super_private(ch->super); -+ -+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link); -+ -+ assert("zam-493", sbinfo != NULL); -+ assert("zam-494", sbinfo->journal_header != NULL); -+ -+ check_me("zam-691", jload(sbinfo->journal_footer) == 
0);
-+
-+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
-+ assert("zam-495", footer != NULL);
-+
-+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
-+ &footer->last_flushed_tx);
-+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
-+
-+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
-+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
-+
-+ jrelse(sbinfo->journal_footer);
-+}
-+
-+/* wander record capacity depends on current block size */
-+static int wander_record_capacity(const struct super_block *super)
-+{
-+ return (super->s_blocksize -
-+ sizeof(struct wander_record_header)) /
-+ sizeof(struct wander_entry);
-+}
-+
-+/* Fill the first wander record (tx head) in accordance with the supplied data */
-+static void format_tx_head(struct commit_handle *ch)
-+{
-+ jnode *tx_head;
-+ jnode *next;
-+ struct tx_header *header;
-+
-+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
-+ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
-+
-+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
-+ if (&ch->tx_list == &next->capture_link)
-+ next = tx_head;
-+
-+ header = (struct tx_header *)jdata(tx_head);
-+
-+ assert("zam-460", header != NULL);
-+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
-+
-+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
-+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
-+
-+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
-+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
-+ &header->prev_tx);
-+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
-+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
-+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
-+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
-+}
-+
-+/* prepare ordinary wander record block (fill all service fields) */
-+static void
-+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
-+{
-+ struct wander_record_header *LRH;
-+ jnode *next;
-+
-+ assert("zam-464", node != NULL);
-+
-+ LRH = (struct wander_record_header *)jdata(node);
-+ next = list_entry(node->capture_link.next, jnode, capture_link);
-+
-+ if (&ch->tx_list == &next->capture_link)
-+ next = list_entry(ch->tx_list.next, jnode, capture_link);
-+
-+ assert("zam-465", LRH != NULL);
-+ assert("zam-463",
-+ ch->super->s_blocksize > sizeof(struct wander_record_header));
-+
-+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
-+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
-+
-+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
-+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
-+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
-+}
-+
-+/* add one wandered map entry to formatted wander record */
-+static void
-+store_entry(jnode * node, int index, const reiser4_block_nr * a,
-+ const reiser4_block_nr * b)
-+{
-+ char *data;
-+ struct wander_entry *pairs;
-+
-+ data = jdata(node);
-+ assert("zam-451", data != NULL);
-+
-+ pairs =
-+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
-+
-+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
-+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
-+}
-+
-+/* currently, wander records contain only the wandered map, so their number
-+ depends on the overwrite set size */
-+static void get_tx_size(struct commit_handle *ch)
-+{
-+ assert("zam-440",
ch->overwrite_set_size != 0); -+ assert("zam-695", ch->tx_size == 0); -+ -+ /* count all ordinary wander records -+ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one -+ for tx head block */ -+ ch->tx_size = -+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) + -+ 2; -+} -+ -+/* A special structure for using in store_wmap_actor() for saving its state -+ between calls */ -+struct store_wmap_params { -+ jnode *cur; /* jnode of current wander record to fill */ -+ int idx; /* free element index in wander record */ -+ int capacity; /* capacity */ -+ -+#if REISER4_DEBUG -+ struct list_head *tx_list; -+#endif -+}; -+ -+/* an actor for use in blocknr_set_iterator routine which populates the list -+ of pre-formatted wander records by wandered map info */ -+static int -+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data) -+{ -+ struct store_wmap_params *params = data; -+ -+ if (params->idx >= params->capacity) { -+ /* a new wander record should be taken from the tx_list */ -+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link); -+ assert("zam-454", -+ params->tx_list != ¶ms->cur->capture_link); -+ -+ params->idx = 0; -+ } -+ -+ store_entry(params->cur, params->idx, a, b); -+ params->idx++; -+ -+ return 0; -+} -+ -+/* This function is called after Relocate set gets written to disk, Overwrite -+ set is written to wandered locations and all wander records are written -+ also. Updated journal header blocks contains a pointer (block number) to -+ first wander record of the just written transaction */ -+static int update_journal_header(struct commit_handle *ch, int use_barrier) -+{ -+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super); -+ jnode *jh = sbinfo->journal_header; -+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link); -+ int ret; -+ -+ format_journal_header(ch); -+ -+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL, -+ use_barrier ? WRITEOUT_BARRIER : 0); -+ if (ret) -+ return ret; -+ -+ /* blk_run_address_space(sbinfo->fake->i_mapping); -+ * blk_run_queues(); */ -+ -+ ret = jwait_io(jh, WRITE); -+ -+ if (ret) -+ return ret; -+ -+ sbinfo->last_committed_tx = *jnode_get_block(head); -+ -+ return 0; -+} -+ -+/* This function is called after write-back is finished. We update journal -+ footer block and free blocks which were occupied by wandered blocks and -+ transaction wander records */ -+static int update_journal_footer(struct commit_handle *ch, int use_barrier) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(ch->super); -+ -+ jnode *jf = sbinfo->journal_footer; -+ -+ int ret; -+ -+ format_journal_footer(ch); -+ -+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL, -+ use_barrier ? 
WRITEOUT_BARRIER : 0); -+ if (ret) -+ return ret; -+ -+ /* blk_run_address_space(sbinfo->fake->i_mapping); -+ * blk_run_queue(); */ -+ -+ ret = jwait_io(jf, WRITE); -+ if (ret) -+ return ret; -+ -+ return 0; -+} -+ -+/* free block numbers of wander records of already written in place transaction */ -+static void dealloc_tx_list(struct commit_handle *ch) -+{ -+ while (!list_empty(&ch->tx_list)) { -+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link); -+ list_del(&cur->capture_link); -+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link)); -+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED, -+ BA_FORMATTED); -+ -+ unpin_jnode_data(cur); -+ reiser4_drop_io_head(cur); -+ } -+} -+ -+/* An actor for use in block_nr_iterator() routine which frees wandered blocks -+ from atom's overwrite set. */ -+static int -+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG, -+ const reiser4_block_nr * a UNUSED_ARG, -+ const reiser4_block_nr * b, void *data UNUSED_ARG) -+{ -+ -+ assert("zam-499", b != NULL); -+ assert("zam-500", *b != 0); -+ assert("zam-501", !reiser4_blocknr_is_fake(b)); -+ -+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED); -+ return 0; -+} -+ -+/* free wandered block locations of already written in place transaction */ -+static void dealloc_wmap(struct commit_handle *ch) -+{ -+ assert("zam-696", ch->atom != NULL); -+ -+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map, -+ dealloc_wmap_actor, NULL, 1); -+} -+ -+/* helper function for alloc wandered blocks, which refill set of block -+ numbers needed for wandered blocks */ -+static int -+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len) -+{ -+ reiser4_blocknr_hint hint; -+ int ret; -+ -+ reiser4_block_nr wide_len = count; -+ -+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks -+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed -+ reserved allocation area so as to get the best qualities of fixed -+ journals? */ -+ reiser4_blocknr_hint_init(&hint); -+ hint.block_stage = BLOCK_GRABBED; -+ -+ ret = reiser4_alloc_blocks(&hint, start, &wide_len, -+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START); -+ *len = (int)wide_len; -+ -+ return ret; -+} -+ -+/* -+ * roll back changes made before issuing BIO in the case of IO error. -+ */ -+static void undo_bio(struct bio *bio) -+{ -+ int i; -+ -+ for (i = 0; i < bio->bi_vcnt; ++i) { -+ struct page *pg; -+ jnode *node; -+ -+ pg = bio->bi_io_vec[i].bv_page; -+ end_page_writeback(pg); -+ node = jprivate(pg); -+ spin_lock_jnode(node); -+ JF_CLR(node, JNODE_WRITEBACK); -+ JF_SET(node, JNODE_DIRTY); -+ spin_unlock_jnode(node); -+ } -+ bio_put(bio); -+} -+ -+/* put overwrite set back to atom's clean list */ -+static void put_overwrite_set(struct commit_handle *ch) -+{ -+ jnode *cur; -+ -+ list_for_each_entry(cur, ch->overwrite_set, capture_link) -+ jrelse_tail(cur); -+} -+ -+/* Count overwrite set size, grab disk space for wandered blocks allocation. -+ Since we have a separate list for atom's overwrite set we just scan the list, -+ count bitmap and other not leaf nodes which wandered blocks allocation we -+ have to grab space for. 
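
To make the arithmetic in wander_record_capacity() and get_tx_size() above concrete: a wander entry holds two little-endian __u64s (the original and wandered block numbers), i.e. 16 bytes. Assuming 4096-byte blocks and, purely for illustration, a 16-byte wander record header, one record holds (4096 - 16) / 16 = 255 entries, so an overwrite set of 1000 blocks needs (1000 - 1) / 255 + 2 = 5 wander record blocks including the tx head. The same sums as a small checkable program; the size constants are assumptions for illustration, not the on-disk format.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* assumed sizes, for illustration only */
#define BLOCKSIZE     4096u
#define RECORD_HEADER 16u                       /* magic/total/serial/next */
#define WANDER_ENTRY  (2 * sizeof(uint64_t))    /* original + wandered */

static unsigned wander_record_capacity(void)
{
    return (BLOCKSIZE - RECORD_HEADER) / WANDER_ENTRY;
}

/* mirrors get_tx_size(): ordinary records for the map, plus one tx head */
static unsigned tx_size(unsigned overwrite_set_size)
{
    assert(overwrite_set_size != 0);
    return (overwrite_set_size - 1) / wander_record_capacity() + 2;
}

int main(void)
{
    printf("capacity      = %u entries/record\n", wander_record_capacity());
    printf("tx_size(1)    = %u\n", tx_size(1));    /* 2: head + 1 record */
    printf("tx_size(255)  = %u\n", tx_size(255));  /* still 2 */
    printf("tx_size(256)  = %u\n", tx_size(256));  /* 3 */
    printf("tx_size(1000) = %u\n", tx_size(1000)); /* 5 */
    return 0;
}
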
*/ -+static int get_overwrite_set(struct commit_handle *ch) -+{ -+ int ret; -+ jnode *cur; -+ __u64 nr_not_leaves = 0; -+#if REISER4_DEBUG -+ __u64 nr_formatted_leaves = 0; -+ __u64 nr_unformatted_leaves = 0; -+#endif -+ -+ assert("zam-697", ch->overwrite_set_size == 0); -+ -+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom); -+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); -+ -+ while (ch->overwrite_set != &cur->capture_link) { -+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); -+ -+ /* Count bitmap locks for getting correct statistics what number -+ * of blocks were cleared by the transaction commit. */ -+ if (jnode_get_type(cur) == JNODE_BITMAP) -+ ch->nr_bitmap++; -+ -+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) -+ || jnode_get_type(cur) == JNODE_BITMAP); -+ -+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) { -+ /* we replace fake znode by another (real) -+ znode which is suggested by disk_layout -+ plugin */ -+ -+ /* FIXME: it looks like fake znode should be -+ replaced by jnode supplied by -+ disk_layout. */ -+ -+ struct super_block *s = reiser4_get_current_sb(); -+ reiser4_super_info_data *sbinfo = -+ get_current_super_private(); -+ -+ if (sbinfo->df_plug->log_super) { -+ jnode *sj = sbinfo->df_plug->log_super(s); -+ -+ assert("zam-593", sj != NULL); -+ -+ if (IS_ERR(sj)) -+ return PTR_ERR(sj); -+ -+ spin_lock_jnode(sj); -+ JF_SET(sj, JNODE_OVRWR); -+ insert_into_atom_ovrwr_list(ch->atom, sj); -+ spin_unlock_jnode(sj); -+ -+ /* jload it as the rest of overwrite set */ -+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0); -+ -+ ch->overwrite_set_size++; -+ } -+ spin_lock_jnode(cur); -+ reiser4_uncapture_block(cur); -+ jput(cur); -+ -+ } else { -+ int ret; -+ ch->overwrite_set_size++; -+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0); -+ if (ret) -+ reiser4_panic("zam-783", -+ "cannot load e-flushed jnode back (ret = %d)\n", -+ ret); -+ } -+ -+ /* Count not leaves here because we have to grab disk space -+ * for wandered blocks. They were not counted as "flush -+ * reserved". Counting should be done _after_ nodes are pinned -+ * into memory by jload(). */ -+ if (!jnode_is_leaf(cur)) -+ nr_not_leaves++; -+ else { -+#if REISER4_DEBUG -+ /* at this point @cur either has JNODE_FLUSH_RESERVED -+ * or is eflushed. Locking is not strong enough to -+ * write an assertion checking for this. */ -+ if (jnode_is_znode(cur)) -+ nr_formatted_leaves++; -+ else -+ nr_unformatted_leaves++; -+#endif -+ JF_CLR(cur, JNODE_FLUSH_RESERVED); -+ } -+ -+ cur = next; -+ } -+ -+ /* Grab space for writing (wandered blocks) of not leaves found in -+ * overwrite set. */ -+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED); -+ if (ret) -+ return ret; -+ -+ /* Disk space for allocation of wandered blocks of leaf nodes already -+ * reserved as "flush reserved", move it to grabbed space counter. */ -+ spin_lock_atom(ch->atom); -+ assert("zam-940", -+ nr_formatted_leaves + nr_unformatted_leaves <= -+ ch->atom->flush_reserved); -+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved); -+ spin_unlock_atom(ch->atom); -+ -+ return ch->overwrite_set_size; -+} -+ -+/** -+ * write_jnodes_to_disk_extent - submit write request -+ * @head: -+ * @first: first jnode of the list -+ * @nr: number of jnodes on the list -+ * @block_p: -+ * @fq: -+ * @flags: used to decide whether page is to get PG_reclaim flag -+ * -+ * Submits a write request for @nr jnodes beginning from the @first, other -+ * jnodes are after the @first on the double-linked "capture" list. 
All jnodes
-+ * will be written to the disk region of @nr blocks starting with the @block_p
-+ * block number. If @fq is not NULL it means that waiting for i/o completion will be
-+ * done more efficiently by using flush_queue_t objects.
-+ * This function is the one which writes the list of jnodes in batch mode. It does
-+ * all the low-level things such as bio construction and page state manipulation.
-+ *
-+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
-+ * aggregated in this function instead of being left to the layers below
-+ *
-+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
-+ * Why is that layer needed? Why can BIOs not be constructed here?
-+ */
-+static int write_jnodes_to_disk_extent(
-+ jnode *first, int nr, const reiser4_block_nr *block_p,
-+ flush_queue_t *fq, int flags)
-+{
-+ struct super_block *super = reiser4_get_current_sb();
-+ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
-+ int max_blocks;
-+ jnode *cur = first;
-+ reiser4_block_nr block;
-+
-+ assert("zam-571", first != NULL);
-+ assert("zam-572", block_p != NULL);
-+ assert("zam-570", nr > 0);
-+
-+ block = *block_p;
-+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
-+
-+ while (nr > 0) {
-+ struct bio *bio;
-+ int nr_blocks = min(nr, max_blocks);
-+ int i;
-+ int nr_used;
-+
-+ bio = bio_alloc(GFP_NOIO, nr_blocks);
-+ if (!bio)
-+ return RETERR(-ENOMEM);
-+
-+ bio->bi_bdev = super->s_bdev;
-+ bio->bi_sector = block * (super->s_blocksize >> 9);
-+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
-+ struct page *pg;
-+
-+ pg = jnode_page(cur);
-+ assert("zam-573", pg != NULL);
-+
-+ page_cache_get(pg);
-+
-+ lock_and_wait_page_writeback(pg);
-+
-+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
-+ /*
-+ * underlying device is saturated. Stop adding
-+ * pages to the bio.
-+ */
-+ unlock_page(pg);
-+ page_cache_release(pg);
-+ break;
-+ }
-+
-+ spin_lock_jnode(cur);
-+ assert("nikita-3166",
-+ pg->mapping == jnode_get_mapping(cur));
-+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
-+#if REISER4_DEBUG
-+ spin_lock(&cur->load);
-+ assert("nikita-3165", !jnode_is_releasable(cur));
-+ spin_unlock(&cur->load);
-+#endif
-+ JF_SET(cur, JNODE_WRITEBACK);
-+ JF_CLR(cur, JNODE_DIRTY);
-+ ON_DEBUG(cur->written++);
-+ spin_unlock_jnode(cur);
-+
-+ ClearPageError(pg);
-+ set_page_writeback(pg);
-+
-+ if (get_current_context()->entd) {
-+ /* this is ent thread */
-+ entd_context *ent = get_entd_context(super);
-+ struct wbq *rq, *next;
-+
-+ spin_lock(&ent->guard);
-+
-+ if (pg == ent->cur_request->page) {
-+ /*
-+ * entd is called for this page. This
-+ * request is not in the todo list
-+ */
-+ ent->cur_request->written = 1;
-+ } else {
-+ /*
-+ * if we have written a page for which writepage
-+ * was called - move the request to another list.
-+ */ -+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) { -+ assert("", rq->magic == WBQ_MAGIC); -+ if (pg == rq->page) { -+ /* -+ * remove request from -+ * entd's queue, but do -+ * not wake up a thread -+ * which put this -+ * request -+ */ -+ list_del_init(&rq->link); -+ ent->nr_todo_reqs --; -+ list_add_tail(&rq->link, &ent->done_list); -+ ent->nr_done_reqs ++; -+ rq->written = 1; -+ break; -+ } -+ } -+ } -+ spin_unlock(&ent->guard); -+ } -+ -+ clear_page_dirty_for_io(pg); -+ -+ unlock_page(pg); -+ -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ nr_used++; -+ } -+ if (nr_used > 0) { -+ assert("nikita-3453", -+ bio->bi_size == super->s_blocksize * nr_used); -+ assert("nikita-3454", bio->bi_vcnt == nr_used); -+ -+ /* Check if we are allowed to write at all */ -+ if (super->s_flags & MS_RDONLY) -+ undo_bio(bio); -+ else { -+ int not_supported; -+ -+ add_fq_to_bio(fq, bio); -+ bio_get(bio); -+ reiser4_submit_bio(write_op, bio); -+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP); -+ bio_put(bio); -+ if (not_supported) -+ return -EOPNOTSUPP; -+ } -+ -+ block += nr_used - 1; -+ update_blocknr_hint_default(super, &block); -+ block += 1; -+ } else { -+ bio_put(bio); -+ } -+ nr -= nr_used; -+ } -+ -+ return 0; -+} -+ -+/* This is a procedure which recovers a contiguous sequences of disk block -+ numbers in the given list of j-nodes and submits write requests on this -+ per-sequence basis */ -+int -+write_jnode_list(struct list_head *head, flush_queue_t *fq, -+ long *nr_submitted, int flags) -+{ -+ int ret; -+ jnode *beg = list_entry(head->next, jnode, capture_link); -+ -+ while (head != &beg->capture_link) { -+ int nr = 1; -+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link); -+ -+ while (head != &cur->capture_link) { -+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr) -+ break; -+ ++nr; -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ -+ ret = write_jnodes_to_disk_extent( -+ beg, nr, jnode_get_block(beg), fq, flags); -+ if (ret) -+ return ret; -+ -+ if (nr_submitted) -+ *nr_submitted += nr; -+ -+ beg = cur; -+ } -+ -+ return 0; -+} -+ -+/* add given wandered mapping to atom's wandered map */ -+static int -+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p) -+{ -+ int ret; -+ blocknr_set_entry *new_bsep = NULL; -+ reiser4_block_nr block; -+ -+ txn_atom *atom; -+ -+ assert("zam-568", block_p != NULL); -+ block = *block_p; -+ assert("zam-569", len > 0); -+ -+ while ((len--) > 0) { -+ do { -+ atom = get_current_atom_locked(); -+ assert("zam-536", -+ !reiser4_blocknr_is_fake(jnode_get_block(cur))); -+ ret = -+ blocknr_set_add_pair(atom, &atom->wandered_map, -+ &new_bsep, -+ jnode_get_block(cur), &block); -+ } while (ret == -E_REPEAT); -+ -+ if (ret) { -+ /* deallocate blocks which were not added to wandered -+ map */ -+ reiser4_block_nr wide_len = len; -+ -+ reiser4_dealloc_blocks(&block, &wide_len, -+ BLOCK_NOT_COUNTED, -+ BA_FORMATTED -+ /* formatted, without defer */ ); -+ -+ return ret; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ ++block; -+ } -+ -+ return 0; -+} -+ -+/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately -+ submit IO for allocated blocks. We assume that current atom is in a stage -+ when any atom fusion is impossible and atom is unlocked and it is safe. 
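
The run-detection idea inside write_jnode_list() above is worth isolating: walk the list and, while the next node's block number is exactly the previous one plus 1, extend the current extent; then submit one request per extent. The same loop over a plain array of block numbers, as a standalone illustration:

#include <stdint.h>
#include <stdio.h>

/* submit one i/o for "len" consecutive blocks starting at "start";
 * here we just print what a real implementation would queue */
static void submit_extent(uint64_t start, int len)
{
    printf("write extent: block %llu, %d block(s)\n",
           (unsigned long long)start, len);
}

/* split a block-number sequence into maximal contiguous runs */
static void write_block_list(const uint64_t *blocks, int n)
{
    int i = 0;

    while (i < n) {
        int len = 1;

        while (i + len < n && blocks[i + len] == blocks[i] + len)
            len++;
        submit_extent(blocks[i], len);
        i += len;
    }
}

int main(void)
{
    uint64_t blocks[] = { 100, 101, 102, 200, 201, 500 };

    write_block_list(blocks, 6);  /* 3 extents: 100x3, 200x2, 500x1 */
    return 0;
}
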
*/ -+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq) -+{ -+ reiser4_block_nr block; -+ -+ int rest; -+ int len; -+ int ret; -+ -+ jnode *cur; -+ -+ assert("zam-534", ch->overwrite_set_size > 0); -+ -+ rest = ch->overwrite_set_size; -+ -+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); -+ while (ch->overwrite_set != &cur->capture_link) { -+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR)); -+ -+ ret = get_more_wandered_blocks(rest, &block, &len); -+ if (ret) -+ return ret; -+ -+ rest -= len; -+ -+ ret = add_region_to_wmap(cur, len, &block); -+ if (ret) -+ return ret; -+ -+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0); -+ if (ret) -+ return ret; -+ -+ while ((len--) > 0) { -+ assert("zam-604", -+ ch->overwrite_set != &cur->capture_link); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ return 0; -+} -+ -+/* allocate given number of nodes over the journal area and link them into a -+ list, return pointer to the first jnode in the list */ -+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq) -+{ -+ reiser4_blocknr_hint hint; -+ reiser4_block_nr allocated = 0; -+ reiser4_block_nr first, len; -+ jnode *cur; -+ jnode *txhead; -+ int ret; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-698", ch->tx_size > 0); -+ assert("zam-699", list_empty_careful(&ch->tx_list)); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ while (allocated < (unsigned)ch->tx_size) { -+ len = (ch->tx_size - allocated); -+ -+ reiser4_blocknr_hint_init(&hint); -+ -+ hint.block_stage = BLOCK_GRABBED; -+ -+ /* FIXME: there should be some block allocation policy for -+ nodes which contain wander records */ -+ -+ /* We assume that disk space for wandered record blocks can be -+ * taken from reserved area. 
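(The allocation below passes BA_RESERVED, meaning the request is satisfied from that reserved area rather than failing on a nearly full filesystem; and because each reiser4_alloc_blocks() call may return a shorter extent than requested, the enclosing while loop keeps asking until "allocated" reaches ch->tx_size.)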
*/ -+ ret = reiser4_alloc_blocks(&hint, &first, &len, -+ BA_FORMATTED | BA_RESERVED | -+ BA_USE_DEFAULT_SEARCH_START); -+ reiser4_blocknr_hint_done(&hint); -+ -+ if (ret) -+ return ret; -+ -+ allocated += len; -+ -+ /* create jnodes for all wander records */ -+ while (len--) { -+ cur = reiser4_alloc_io_head(&first); -+ -+ if (cur == NULL) { -+ ret = RETERR(-ENOMEM); -+ goto free_not_assigned; -+ } -+ -+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get()); -+ -+ if (ret != 0) { -+ jfree(cur); -+ goto free_not_assigned; -+ } -+ -+ pin_jnode_data(cur); -+ -+ list_add_tail(&cur->capture_link, &ch->tx_list); -+ -+ first++; -+ } -+ } -+ -+ { /* format a on-disk linked list of wander records */ -+ int serial = 1; -+ -+ txhead = list_entry(ch->tx_list.next, jnode, capture_link); -+ format_tx_head(ch); -+ -+ cur = list_entry(txhead->capture_link.next, jnode, capture_link); -+ while (&ch->tx_list != &cur->capture_link) { -+ format_wander_record(ch, cur, serial++); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ { /* Fill wander records with Wandered Set */ -+ struct store_wmap_params params; -+ txn_atom *atom; -+ -+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link); -+ -+ params.idx = 0; -+ params.capacity = -+ wander_record_capacity(reiser4_get_current_sb()); -+ -+ atom = get_current_atom_locked(); -+ blocknr_set_iterator(atom, &atom->wandered_map, -+ &store_wmap_actor, ¶ms, 0); -+ spin_unlock_atom(atom); -+ } -+ -+ { /* relse all jnodes from tx_list */ -+ cur = list_entry(ch->tx_list.next, jnode, capture_link); -+ while (&ch->tx_list != &cur->capture_link) { -+ jrelse(cur); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0); -+ -+ return ret; -+ -+ free_not_assigned: -+ /* We deallocate blocks not yet assigned to jnodes on tx_list. The -+ caller takes care about invalidating of tx list */ -+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED); -+ -+ return ret; -+} -+ -+static int commit_tx(struct commit_handle *ch) -+{ -+ flush_queue_t *fq; -+ int barrier; -+ int ret; -+ -+ /* Grab more space for wandered records. 
*/ -+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED); -+ if (ret) -+ return ret; -+ -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ -+ spin_unlock_atom(fq->atom); -+ do { -+ ret = alloc_wandered_blocks(ch, fq); -+ if (ret) -+ break; -+ ret = alloc_tx(ch, fq); -+ if (ret) -+ break; -+ } while (0); -+ -+ reiser4_fq_put(fq); -+ if (ret) -+ return ret; -+ repeat_wo_barrier: -+ barrier = reiser4_use_write_barrier(ch->super); -+ if (!barrier) { -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ } -+ ret = update_journal_header(ch, barrier); -+ if (barrier) { -+ if (ret) { -+ if (ret == -EOPNOTSUPP) { -+ disable_write_barrier(ch->super); -+ goto repeat_wo_barrier; -+ } -+ return ret; -+ } -+ ret = current_atom_finish_all_fq(); -+ } -+ return ret; -+} -+ -+static int write_tx_back(struct commit_handle * ch) -+{ -+ flush_queue_t *fq; -+ int ret; -+ int barrier; -+ -+ reiser4_post_commit_hook(); -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ spin_unlock_atom(fq->atom); -+ ret = write_jnode_list( -+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM); -+ reiser4_fq_put(fq); -+ if (ret) -+ return ret; -+ repeat_wo_barrier: -+ barrier = reiser4_use_write_barrier(ch->super); -+ if (!barrier) { -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ } -+ ret = update_journal_footer(ch, barrier); -+ if (barrier) { -+ if (ret) { -+ if (ret == -EOPNOTSUPP) { -+ disable_write_barrier(ch->super); -+ goto repeat_wo_barrier; -+ } -+ return ret; -+ } -+ ret = current_atom_finish_all_fq(); -+ } -+ if (ret) -+ return ret; -+ reiser4_post_write_back_hook(); -+ return 0; -+} -+ -+/* We assume that at this moment all captured blocks are marked as RELOC or -+ WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set -+ are submitted to write. -+*/ -+ -+int reiser4_write_logs(long *nr_submitted) -+{ -+ txn_atom *atom; -+ struct super_block *super = reiser4_get_current_sb(); -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ struct commit_handle ch; -+ int ret; -+ -+ writeout_mode_enable(); -+ -+ /* block allocator may add j-nodes to the clean_list */ -+ ret = reiser4_pre_commit_hook(); -+ if (ret) -+ return ret; -+ -+ /* No locks are required if we take atom which stage >= -+ * ASTAGE_PRE_COMMIT */ -+ atom = get_current_context()->trans->atom; -+ assert("zam-965", atom != NULL); -+ -+ /* relocate set is on the atom->clean_nodes list after -+ * current_atom_complete_writes() finishes. It can be safely -+ * uncaptured after commit_mutex is locked, because any atom that -+ * captures these nodes is guaranteed to commit after current one. -+ * -+ * This can only be done after reiser4_pre_commit_hook(), because it is where -+ * early flushed jnodes with CREATED bit are transferred to the -+ * overwrite list. */ -+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom)); -+ spin_lock_atom(atom); -+ /* There might be waiters for the relocate nodes which we have -+ * released, wake them up. 
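
Both commit_tx() and write_tx_back() above share one fallback shape: attempt the journal-control write with a write barrier, and if the device answers -EOPNOTSUPP, disable barriers for this filesystem and redo the update the slow way, draining all queued i/o before the control-block write. That control flow in isolation, as a standalone sketch with stubbed i/o (not reiser4 code):

#include <errno.h>
#include <stdio.h>

static int barriers_enabled = 1;

/* stand-ins for the real writeout paths */
static int write_control_block(int use_barrier)
{
    if (use_barrier)
        return -EOPNOTSUPP;   /* pretend the device lacks barriers */
    return 0;
}

static int wait_all_io(void) { return 0; }

static int update_with_fallback(void)
{
    int ret;

again:
    if (!barriers_enabled) {
        /* no barrier: enforce ordering by draining everything first */
        ret = wait_all_io();
        if (ret)
            return ret;
    }
    ret = write_control_block(barriers_enabled);
    if (barriers_enabled) {
        if (ret == -EOPNOTSUPP) {
            fprintf(stderr, "barriers unsupported, disabling\n");
            barriers_enabled = 0;
            goto again;
        }
        if (ret)
            return ret;
        ret = wait_all_io();
    }
    return ret;
}

int main(void)
{
    return update_with_fallback() ? 1 : 0;
}
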
*/ -+ reiser4_atom_send_event(atom); -+ spin_unlock_atom(atom); -+ -+ if (REISER4_DEBUG) { -+ int level; -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level) -+ assert("nikita-3352", -+ list_empty_careful(ATOM_DIRTY_LIST(atom, level))); -+ } -+ -+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created; -+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted; -+ -+ init_commit_handle(&ch, atom); -+ -+ ch.free_blocks = sbinfo->blocks_free_committed; -+ ch.nr_files = sbinfo->nr_files_committed; -+ /* ZAM-FIXME-HANS: email me what the contention level is for the super -+ * lock. */ -+ ch.next_oid = oid_next(super); -+ -+ /* count overwrite set and place it in a separate list */ -+ ret = get_overwrite_set(&ch); -+ -+ if (ret <= 0) { -+ /* It is possible that overwrite set is empty here, it means -+ all captured nodes are clean */ -+ goto up_and_ret; -+ } -+ -+ /* Inform the caller about what number of dirty pages will be -+ * submitted to disk. */ -+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap; -+ -+ /* count all records needed for storing of the wandered set */ -+ get_tx_size(&ch); -+ -+ ret = commit_tx(&ch); -+ if (ret) -+ goto up_and_ret; -+ -+ spin_lock_atom(atom); -+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT); -+ spin_unlock_atom(atom); -+ -+ ret = write_tx_back(&ch); -+ reiser4_post_write_back_hook(); -+ -+ up_and_ret: -+ if (ret) { -+ /* there could be fq attached to current atom; the only way to -+ remove them is: */ -+ current_atom_finish_all_fq(); -+ } -+ -+ /* free blocks of flushed transaction */ -+ dealloc_tx_list(&ch); -+ dealloc_wmap(&ch); -+ -+ put_overwrite_set(&ch); -+ -+ done_commit_handle(&ch); -+ -+ writeout_mode_disable(); -+ -+ return ret; -+} -+ -+/* consistency checks for journal data/control blocks: header, footer, log -+ records, transactions head blocks. All functions return zero on success. */ -+ -+static int check_journal_header(const jnode * node UNUSED_ARG) -+{ -+ /* FIXME: journal header has no magic field yet. */ -+ return 0; -+} -+ -+/* wait for write completion for all jnodes from given list */ -+static int wait_on_jnode_list(struct list_head *head) -+{ -+ jnode *scan; -+ int ret = 0; -+ -+ list_for_each_entry(scan, head, capture_link) { -+ struct page *pg = jnode_page(scan); -+ -+ if (pg) { -+ if (PageWriteback(pg)) -+ wait_on_page_writeback(pg); -+ -+ if (PageError(pg)) -+ ret++; -+ } -+ } -+ -+ return ret; -+} -+ -+static int check_journal_footer(const jnode * node UNUSED_ARG) -+{ -+ /* FIXME: journal footer has no magic field yet. 
*/ -+ return 0; -+} -+ -+static int check_tx_head(const jnode * node) -+{ -+ struct tx_header *header = (struct tx_header *)jdata(node); -+ -+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) { -+ warning("zam-627", "tx head at block %s corrupted\n", -+ sprint_address(jnode_get_block(node))); -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+static int check_wander_record(const jnode * node) -+{ -+ struct wander_record_header *RH = -+ (struct wander_record_header *)jdata(node); -+ -+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) != -+ 0) { -+ warning("zam-628", "wander record at block %s corrupted\n", -+ sprint_address(jnode_get_block(node))); -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+/* fill commit_handler structure by everything what is needed for update_journal_footer */ -+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head) -+{ -+ struct tx_header *TXH; -+ int ret; -+ -+ ret = jload(tx_head); -+ if (ret) -+ return ret; -+ -+ TXH = (struct tx_header *)jdata(tx_head); -+ -+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks)); -+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files)); -+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid)); -+ -+ jrelse(tx_head); -+ -+ list_add(&tx_head->capture_link, &ch->tx_list); -+ -+ return 0; -+} -+ -+/* replay one transaction: restore and write overwrite set in place */ -+static int replay_transaction(const struct super_block *s, -+ jnode * tx_head, -+ const reiser4_block_nr * log_rec_block_p, -+ const reiser4_block_nr * end_block, -+ unsigned int nr_wander_records) -+{ -+ reiser4_block_nr log_rec_block = *log_rec_block_p; -+ struct commit_handle ch; -+ LIST_HEAD(overwrite_set); -+ jnode *log; -+ int ret; -+ -+ init_commit_handle(&ch, NULL); -+ ch.overwrite_set = &overwrite_set; -+ -+ restore_commit_handle(&ch, tx_head); -+ -+ while (log_rec_block != *end_block) { -+ struct wander_record_header *header; -+ struct wander_entry *entry; -+ -+ int i; -+ -+ if (nr_wander_records == 0) { -+ warning("zam-631", -+ "number of wander records in the linked list" -+ " greater than number stored in tx head.\n"); -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ -+ log = reiser4_alloc_io_head(&log_rec_block); -+ if (log == NULL) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(log); -+ if (ret < 0) { -+ reiser4_drop_io_head(log); -+ return ret; -+ } -+ -+ ret = check_wander_record(log); -+ if (ret) { -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ return ret; -+ } -+ -+ header = (struct wander_record_header *)jdata(log); -+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block)); -+ -+ entry = (struct wander_entry *)(header + 1); -+ -+ /* restore overwrite set from wander record content */ -+ for (i = 0; i < wander_record_capacity(s); i++) { -+ reiser4_block_nr block; -+ jnode *node; -+ -+ block = le64_to_cpu(get_unaligned(&entry->wandered)); -+ if (block == 0) -+ break; -+ -+ node = reiser4_alloc_io_head(&block); -+ if (node == NULL) { -+ ret = RETERR(-ENOMEM); -+ /* -+ * FIXME-VS:??? -+ */ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ goto free_ow_set; -+ } -+ -+ ret = jload(node); -+ -+ if (ret < 0) { -+ reiser4_drop_io_head(node); -+ /* -+ * FIXME-VS:??? 
-+ */ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ goto free_ow_set; -+ } -+ -+ block = le64_to_cpu(get_unaligned(&entry->original)); -+ -+ assert("zam-603", block != 0); -+ -+ jnode_set_block(node, &block); -+ -+ list_add_tail(&node->capture_link, ch.overwrite_set); -+ -+ ++entry; -+ } -+ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ -+ --nr_wander_records; -+ } -+ -+ if (nr_wander_records != 0) { -+ warning("zam-632", "number of wander records in the linked list" -+ " less than number stored in tx head.\n"); -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ -+ { /* write wandered set in place */ -+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0); -+ ret = wait_on_jnode_list(ch.overwrite_set); -+ -+ if (ret) { -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ } -+ -+ ret = update_journal_footer(&ch, 0); -+ -+ free_ow_set: -+ -+ while (!list_empty(ch.overwrite_set)) { -+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link); -+ list_del_init(&cur->capture_link); -+ jrelse(cur); -+ reiser4_drop_io_head(cur); -+ } -+ -+ list_del_init(&tx_head->capture_link); -+ -+ done_commit_handle(&ch); -+ -+ return ret; -+} -+ -+/* find oldest committed and not played transaction and play it. The transaction -+ * was committed and journal header block was updated but the blocks from the -+ * process of writing the atom's overwrite set in-place and updating of journal -+ * footer block were not completed. This function completes the process by -+ * recovering the atom's overwrite set from their wandered locations and writes -+ * them in-place and updating the journal footer. */ -+static int replay_oldest_transaction(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *jf = sbinfo->journal_footer; -+ unsigned int total; -+ struct journal_footer *F; -+ struct tx_header *T; -+ -+ reiser4_block_nr prev_tx; -+ reiser4_block_nr last_flushed_tx; -+ reiser4_block_nr log_rec_block = 0; -+ -+ jnode *tx_head; -+ -+ int ret; -+ -+ if ((ret = jload(jf)) < 0) -+ return ret; -+ -+ F = (struct journal_footer *)jdata(jf); -+ -+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx)); -+ -+ jrelse(jf); -+ -+ if (sbinfo->last_committed_tx == last_flushed_tx) { -+ /* all transactions are replayed */ -+ return 0; -+ } -+ -+ prev_tx = sbinfo->last_committed_tx; -+ -+ /* searching for oldest not flushed transaction */ -+ while (1) { -+ tx_head = reiser4_alloc_io_head(&prev_tx); -+ if (!tx_head) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(tx_head); -+ if (ret < 0) { -+ reiser4_drop_io_head(tx_head); -+ return ret; -+ } -+ -+ ret = check_tx_head(tx_head); -+ if (ret) { -+ jrelse(tx_head); -+ reiser4_drop_io_head(tx_head); -+ return ret; -+ } -+ -+ T = (struct tx_header *)jdata(tx_head); -+ -+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx)); -+ -+ if (prev_tx == last_flushed_tx) -+ break; -+ -+ jrelse(tx_head); -+ reiser4_drop_io_head(tx_head); -+ } -+ -+ total = le32_to_cpu(get_unaligned(&T->total)); -+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block)); -+ -+ pin_jnode_data(tx_head); -+ jrelse(tx_head); -+ -+ ret = -+ replay_transaction(s, tx_head, &log_rec_block, -+ jnode_get_block(tx_head), total - 1); -+ -+ unpin_jnode_data(tx_head); -+ reiser4_drop_io_head(tx_head); -+ -+ if (ret) -+ return ret; -+ return -E_REPEAT; -+} -+ -+/* The reiser4 journal current implementation was optimized to not to capture -+ super block if certain super blocks fields are modified. Currently, the set -+ is (<free block count>, <OID allocator>). 
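
The search loop in replay_oldest_transaction() above can be summarized as: start from the tx head named by the journal header (last_committed_tx) and follow each head's prev_tx pointer until it equals the footer's last_flushed_tx; the tx head reached at that point is the oldest unreplayed transaction. Over an in-memory stand-in for the disk (toy structures, not the on-disk layout):

#include <stdint.h>
#include <stdio.h>

struct tx {
    uint64_t blocknr;  /* where this tx head lives */
    uint64_t prev_tx;  /* tx head of the previously committed atom */
};

/* toy "disk": tx heads at blocks 30 -> 20 -> 10 (oldest) -> 5 (flushed) */
static const struct tx txs[] = {
    { 30, 20 }, { 20, 10 }, { 10, 5 },
};

static const struct tx *read_tx_head(uint64_t blocknr)
{
    for (unsigned i = 0; i < sizeof(txs) / sizeof(txs[0]); i++)
        if (txs[i].blocknr == blocknr)
            return &txs[i];
    return NULL;
}

/* follow prev_tx back until we hit the last flushed transaction */
static uint64_t oldest_unflushed(uint64_t last_committed,
                                 uint64_t last_flushed)
{
    uint64_t cur = last_committed;

    for (;;) {
        const struct tx *t = read_tx_head(cur);

        if (t == NULL || t->prev_tx == last_flushed)
            return cur;
        cur = t->prev_tx;
    }
}

int main(void)
{
    /* header says 30 committed, footer says 5 flushed -> replay 10 */
    printf("oldest unflushed tx head: %llu\n",
           (unsigned long long)oldest_unflushed(30, 5));
    return 0;
}
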
These fields are logged by -+ special way which includes storing them in each transaction head block at -+ atom commit time and writing that information to journal footer block at -+ atom flush time. For getting info from journal footer block to the -+ in-memory super block there is a special function -+ reiser4_journal_recover_sb_data() which should be called after disk format -+ plugin re-reads super block after journal replaying. -+*/ -+ -+/* get the information from journal footer in-memory super block */ -+int reiser4_journal_recover_sb_data(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ struct journal_footer *jf; -+ int ret; -+ -+ assert("zam-673", sbinfo->journal_footer != NULL); -+ -+ ret = jload(sbinfo->journal_footer); -+ if (ret != 0) -+ return ret; -+ -+ ret = check_journal_footer(sbinfo->journal_footer); -+ if (ret != 0) -+ goto out; -+ -+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer); -+ -+ /* was there at least one flushed transaction? */ -+ if (jf->last_flushed_tx) { -+ -+ /* restore free block counter logged in this transaction */ -+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks))); -+ -+ /* restore oid allocator state */ -+ oid_init_allocator(s, -+ le64_to_cpu(get_unaligned(&jf->nr_files)), -+ le64_to_cpu(get_unaligned(&jf->next_oid))); -+ } -+ out: -+ jrelse(sbinfo->journal_footer); -+ return ret; -+} -+ -+/* reiser4 replay journal procedure */ -+int reiser4_journal_replay(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *jh, *jf; -+ struct journal_header *header; -+ int nr_tx_replayed = 0; -+ int ret; -+ -+ assert("zam-582", sbinfo != NULL); -+ -+ jh = sbinfo->journal_header; -+ jf = sbinfo->journal_footer; -+ -+ if (!jh || !jf) { -+ /* it is possible that disk layout does not support journal -+ structures, we just warn about this */ -+ warning("zam-583", -+ "journal control blocks were not loaded by disk layout plugin. " -+ "journal replaying is not possible.\n"); -+ return 0; -+ } -+ -+ /* Take free block count from journal footer block. 
The free block -+ counter value corresponds the last flushed transaction state */ -+ ret = jload(jf); -+ if (ret < 0) -+ return ret; -+ -+ ret = check_journal_footer(jf); -+ if (ret) { -+ jrelse(jf); -+ return ret; -+ } -+ -+ jrelse(jf); -+ -+ /* store last committed transaction info in reiser4 in-memory super -+ block */ -+ ret = jload(jh); -+ if (ret < 0) -+ return ret; -+ -+ ret = check_journal_header(jh); -+ if (ret) { -+ jrelse(jh); -+ return ret; -+ } -+ -+ header = (struct journal_header *)jdata(jh); -+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx)); -+ -+ jrelse(jh); -+ -+ /* replay committed transactions */ -+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT) -+ nr_tx_replayed++; -+ -+ return ret; -+} -+ -+/* load journal control block (either journal header or journal footer block) */ -+static int -+load_journal_control_block(jnode ** node, const reiser4_block_nr * block) -+{ -+ int ret; -+ -+ *node = reiser4_alloc_io_head(block); -+ if (!(*node)) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(*node); -+ -+ if (ret) { -+ reiser4_drop_io_head(*node); -+ *node = NULL; -+ return ret; -+ } -+ -+ pin_jnode_data(*node); -+ jrelse(*node); -+ -+ return 0; -+} -+ -+/* unload journal header or footer and free jnode */ -+static void unload_journal_control_block(jnode ** node) -+{ -+ if (*node) { -+ unpin_jnode_data(*node); -+ reiser4_drop_io_head(*node); -+ *node = NULL; -+ } -+} -+ -+/* release journal control blocks */ -+void reiser4_done_journal_info(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("zam-476", sbinfo != NULL); -+ -+ unload_journal_control_block(&sbinfo->journal_header); -+ unload_journal_control_block(&sbinfo->journal_footer); -+ rcu_barrier(); -+} -+ -+/* load journal control blocks */ -+int reiser4_init_journal_info(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ journal_location *loc; -+ int ret; -+ -+ loc = &sbinfo->jloc; -+ -+ assert("zam-651", loc != NULL); -+ assert("zam-652", loc->header != 0); -+ assert("zam-653", loc->footer != 0); -+ -+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header); -+ -+ if (ret) -+ return ret; -+ -+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer); -+ -+ if (ret) { -+ unload_journal_control_block(&sbinfo->journal_header); -+ } -+ -+ return ret; -+} -+ -+/* Make Linus happy. 
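Taken together, the routines above imply a fixed recovery order at mount time: pin the journal control blocks, replay committed transactions, let the disk format plugin re-read the super block, then pull the logged counters out of the journal footer. A hedged sketch of that ordering follows; reread_super() is a hypothetical placeholder for the disk format plugin's re-read, not a function from this patch.

	static int mount_time_recovery(struct super_block *s)
	{
		int ret;

		ret = reiser4_init_journal_info(s); /* pin header/footer */
		if (ret)
			return ret;

		ret = reiser4_journal_replay(s); /* overwrite sets in place */
		if (ret)
			goto out;

		ret = reread_super(s); /* hypothetical re-read of the sb */
		if (ret)
			goto out;

		/* restore free block counter and OID allocator state */
		ret = reiser4_journal_recover_sb_data(s);
	out:
		if (ret)
			reiser4_done_journal_info(s);
		return ret;
	}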
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/wander.h linux-2.6.30/fs/reiser4/wander.h ---- linux-2.6.30.orig/fs/reiser4/wander.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/wander.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,135 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_WANDER_H__) -+#define __FS_REISER4_WANDER_H__ -+ -+#include "dformat.h" -+ -+#include <linux/fs.h> /* for struct super_block */ -+ -+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */ -+ -+#define TX_HEADER_MAGIC "TxMagic4" -+#define WANDER_RECORD_MAGIC "LogMagc4" -+ -+#define TX_HEADER_MAGIC_SIZE (8) -+#define WANDER_RECORD_MAGIC_SIZE (8) -+ -+/* journal header block format */ -+struct journal_header { -+ /* last written transaction head location */ -+ d64 last_committed_tx; -+}; -+ -+typedef struct journal_location { -+ reiser4_block_nr footer; -+ reiser4_block_nr header; -+} journal_location; -+ -+/* The wander.c head comment describes usage and semantic of all these structures */ -+/* journal footer block format */ -+struct journal_footer { -+ /* last flushed transaction location. */ -+ /* This block number is no more valid after the transaction it points -+ to gets flushed, this number is used only at journal replaying time -+ for detection of the end of on-disk list of committed transactions -+ which were not flushed completely */ -+ d64 last_flushed_tx; -+ -+ /* free block counter is written in journal footer at transaction -+ flushing , not in super block because free blocks counter is logged -+ by another way than super block fields (root pointer, for -+ example). */ -+ d64 free_blocks; -+ -+ /* number of used OIDs and maximal used OID are logged separately from -+ super block */ -+ d64 nr_files; -+ d64 next_oid; -+}; -+ -+/* Each wander record (except the first one) has unified format with wander -+ record header followed by an array of log entries */ -+struct wander_record_header { -+ /* when there is no predefined location for wander records, this magic -+ string should help reiser4fsck. */ -+ char magic[WANDER_RECORD_MAGIC_SIZE]; -+ -+ /* transaction id */ -+ d64 id; -+ -+ /* total number of wander records in current transaction */ -+ d32 total; -+ -+ /* this block number in transaction */ -+ d32 serial; -+ -+ /* number of previous block in commit */ -+ d64 next_block; -+}; -+ -+/* The first wander record (transaction head) of written transaction has the -+ special format */ -+struct tx_header { -+ /* magic string makes first block in transaction different from other -+ logged blocks, it should help fsck. 
*/ -+ char magic[TX_HEADER_MAGIC_SIZE]; -+ -+ /* transaction id */ -+ d64 id; -+ -+ /* total number of records (including this first tx head) in the -+ transaction */ -+ d32 total; -+ -+ /* align next field to 8-byte boundary; this field always is zero */ -+ d32 padding; -+ -+ /* block number of previous transaction head */ -+ d64 prev_tx; -+ -+ /* next wander record location */ -+ d64 next_block; -+ -+ /* committed versions of free blocks counter */ -+ d64 free_blocks; -+ -+ /* number of used OIDs (nr_files) and maximal used OID are logged -+ separately from super block */ -+ d64 nr_files; -+ d64 next_oid; -+}; -+ -+/* A transaction gets written to disk as a set of wander records (each wander -+ record size is fs block) */ -+ -+/* As it was told above a wander The rest of wander record is filled by these log entries, unused space filled -+ by zeroes */ -+struct wander_entry { -+ d64 original; /* block original location */ -+ d64 wandered; /* block wandered location */ -+}; -+ -+/* REISER4 JOURNAL WRITER FUNCTIONS */ -+ -+extern int reiser4_write_logs(long *); -+extern int reiser4_journal_replay(struct super_block *); -+extern int reiser4_journal_recover_sb_data(struct super_block *); -+ -+extern int reiser4_init_journal_info(struct super_block *); -+extern void reiser4_done_journal_info(struct super_block *); -+ -+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int); -+ -+#endif /* __FS_REISER4_WANDER_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/writeout.h linux-2.6.30/fs/reiser4/writeout.h ---- linux-2.6.30.orig/fs/reiser4/writeout.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/writeout.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,21 @@ -+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_WRITEOUT_H__) -+ -+#define WRITEOUT_SINGLE_STREAM (0x1) -+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2) -+#define WRITEOUT_BARRIER (0x4) -+ -+extern int reiser4_get_writeout_flags(void); -+ -+#endif /* __FS_REISER4_WRITEOUT_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/znode.c linux-2.6.30/fs/reiser4/znode.c ---- linux-2.6.30.orig/fs/reiser4/znode.c 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/znode.c 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,1029 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Znode manipulation functions. */ -+/* Znode is the in-memory header for a tree node. It is stored -+ separately from the node itself so that it does not get written to -+ disk. In this respect znode is like buffer head or page head. We -+ also use znodes for additional reiser4 specific purposes: -+ -+ . they are organized into tree structure which is a part of whole -+ reiser4 tree. -+ . they are used to implement node grained locking -+ . they are used to keep additional state associated with a -+ node -+ . they contain links to lists used by the transaction manager -+ -+ Znode is attached to some variable "block number" which is instance of -+ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without -+ appropriate node being actually loaded in memory. 
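A znode's lifetime is managed by the reference counters this comment goes on to describe: zget()/zput() govern existence, zload()/zrelse() govern the data. A hedged usage sketch, assuming the tree, block number, parent and level are already known, and simplifying the gfp flag to GFP_KERNEL:

	static int peek_at_node(reiser4_tree *tree, const reiser4_block_nr *blk,
				znode *parent, tree_level level)
	{
		znode *node;
		int ret;

		node = zget(tree, blk, parent, level, GFP_KERNEL); /* x_count++ */
		if (IS_ERR(node))
			return PTR_ERR(node);

		ret = zload(node); /* d_count++, may block on IO */
		if (ret == 0) {
			/* zdata(node) is only valid between zload and zrelse */
			zrelse(node);
		}
		zput(node); /* drop the existence reference */
		return ret;
	}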
Existence of znode itself -+ is regulated by reference count (->x_count) in it. Each time thread -+ acquires reference to znode through call to zget(), ->x_count is -+ incremented and decremented on call to zput(). Data (content of node) are -+ brought in memory through call to zload(), which also increments ->d_count -+ reference counter. zload can block waiting on IO. Call to zrelse() -+ decreases this counter. Also, ->c_count keeps track of number of child -+ znodes and prevents parent znode from being recycled until all of its -+ children are. ->c_count is decremented whenever child goes out of existence -+ (being actually recycled in zdestroy()) which can be some time after last -+ reference to this child dies if we support some form of LRU cache for -+ znodes. -+ -+*/ -+/* EVERY ZNODE'S STORY -+ -+ 1. His infancy. -+ -+ Once upon a time, the znode was born deep inside of zget() by call to -+ zalloc(). At the return from zget() znode had: -+ -+ . reference counter (x_count) of 1 -+ . assigned block number, marked as used in bitmap -+ . pointer to parent znode. Root znode parent pointer points -+ to its father: "fake" znode. This, in turn, has NULL parent pointer. -+ . hash table linkage -+ . no data loaded from disk -+ . no node plugin -+ . no sibling linkage -+ -+ 2. His childhood -+ -+ Each node is either brought into memory as a result of tree traversal, or -+ created afresh, creation of the root being a special case of the latter. In -+ either case it's inserted into sibling list. This will typically require -+ some ancillary tree traversing, but ultimately both sibling pointers will -+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in -+ zjnode.state. -+ -+ 3. His youth. -+ -+ If znode is bound to already existing node in a tree, its content is read -+ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set -+ in zjnode.state and zdata() function starts to return non null for this -+ znode. zload() further calls zparse() that determines which node layout -+ this node is rendered in, and sets ->nplug on success. -+ -+ If znode is for new node just created, memory for it is allocated and -+ zinit_new() function is called to initialise data, according to selected -+ node layout. -+ -+ 4. His maturity. -+ -+ After this point, znode lingers in memory for some time. Threads can -+ acquire references to znode either by blocknr through call to zget(), or by -+ following a pointer to unallocated znode from internal item. Each time -+ reference to znode is obtained, x_count is increased. Thread can read/write -+ lock znode. Znode data can be loaded through calls to zload(), d_count will -+ be increased appropriately. If all references to znode are released -+ (x_count drops to 0), znode is not recycled immediately. Rather, it is -+ still cached in the hash table in the hope that it will be accessed -+ shortly. -+ -+ There are two ways in which znode existence can be terminated: -+ -+ . sudden death: node bound to this znode is removed from the tree -+ . overpopulation: znode is purged out of memory due to memory pressure -+ -+ 5. His death. -+ -+ Death is complex process. -+ -+ When we irrevocably commit ourselves to decision to remove node from the -+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding -+ znode. This is done either in ->kill_hook() of internal item or in -+ reiser4_kill_root() function when tree root is removed. -+ -+ At this moment znode still has: -+ -+ . locks held on it, necessary write ones -+ . 
references to it -+ . disk block assigned to it -+ . data loaded from the disk -+ . pending requests for lock -+ -+ But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node -+ deletion. Node deletion includes two phases. First all ways to get -+ references to that znode (sibling and parent links and hash lookup using -+ block number stored in parent node) should be deleted -- it is done through -+ sibling_list_remove(), also we assume that nobody uses down link from -+ parent node due to its nonexistence or proper parent node locking and -+ nobody uses parent pointers from children due to absence of them. Second we -+ invalidate all pending lock requests which still are on znode's lock -+ request queue, this is done by reiser4_invalidate_lock(). Another -+ JNODE_IS_DYING znode status bit is used to invalidate pending lock requests. -+ Once it set all requesters are forced to return -EINVAL from -+ longterm_lock_znode(). Future locking attempts are not possible because all -+ ways to get references to that znode are removed already. Last, node is -+ uncaptured from transaction. -+ -+ When last reference to the dying znode is just about to be released, -+ block number for this lock is released and znode is removed from the -+ hash table. -+ -+ Now znode can be recycled. -+ -+ [it's possible to free bitmap block and remove znode from the hash -+ table when last lock is released. This will result in having -+ referenced but completely orphaned znode] -+ -+ 6. Limbo -+ -+ As have been mentioned above znodes with reference counter 0 are -+ still cached in a hash table. Once memory pressure increases they are -+ purged out of there [this requires something like LRU list for -+ efficient implementation. LRU list would also greatly simplify -+ implementation of coord cache that would in this case morph to just -+ scanning some initial segment of LRU list]. Data loaded into -+ unreferenced znode are flushed back to the durable storage if -+ necessary and memory is freed. Znodes themselves can be recycled at -+ this point too. -+ -+*/ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/plugin_header.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include <linux/pagemap.h> -+#include <linux/spinlock.h> -+#include <linux/slab.h> -+#include <linux/err.h> -+ -+static z_hash_table *get_htable(reiser4_tree *, -+ const reiser4_block_nr * const blocknr); -+static z_hash_table *znode_get_htable(const znode *); -+static void zdrop(znode *); -+ -+/* hash table support */ -+ -+/* compare two block numbers for equality. Used by hash-table macros */ -+static inline int -+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2) -+{ -+ assert("nikita-534", b1 != NULL); -+ assert("nikita-535", b2 != NULL); -+ -+ return *b1 == *b2; -+} -+ -+/* Hash znode by block number. 
Used by hash-table macros */ -+/* Audited by: umka (2002.06.11) */ -+static inline __u32 -+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b) -+{ -+ assert("nikita-536", b != NULL); -+ -+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1); -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z, -+ blknrhashfn, blknreq); -+#undef KFREE -+#undef KMALLOC -+ -+/* slab for znodes */ -+static struct kmem_cache *znode_cache; -+ -+int znode_shift_order; -+ -+/** -+ * init_znodes - create znode cache -+ * -+ * Initializes slab cache of znodes. It is part of reiser4 module initialization. -+ */ -+int init_znodes(void) -+{ -+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL); -+ if (znode_cache == NULL) -+ return RETERR(-ENOMEM); -+ -+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode); -+ ++znode_shift_order); -+ --znode_shift_order; -+ return 0; -+} -+ -+/** -+ * done_znodes - delete znode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_znodes(void) -+{ -+ destroy_reiser4_cache(&znode_cache); -+} -+ -+/* call this to initialise tree of znodes */ -+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ ) -+{ -+ int result; -+ assert("umka-050", tree != NULL); -+ -+ rwlock_init(&tree->dk_lock); -+ -+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE); -+ if (result != 0) -+ return result; -+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE); -+ return result; -+} -+ -+/* free this znode */ -+void zfree(znode * node /* znode to free */ ) -+{ -+ assert("nikita-465", node != NULL); -+ assert("nikita-2120", znode_page(node) == NULL); -+ assert("nikita-2301", list_empty_careful(&node->lock.owners)); -+ assert("nikita-2302", list_empty_careful(&node->lock.requestors)); -+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) && -+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED)); -+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes)); -+ assert("nikita-3293", !znode_is_right_connected(node)); -+ assert("nikita-3294", !znode_is_left_connected(node)); -+ assert("nikita-3295", node->left == NULL); -+ assert("nikita-3296", node->right == NULL); -+ -+ /* not yet phash_jnode_destroy(ZJNODE(node)); */ -+ -+ kmem_cache_free(znode_cache, node); -+} -+ -+/* call this to free tree of znodes */ -+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ ) -+{ -+ znode *node; -+ znode *next; -+ z_hash_table *ztable; -+ -+ /* scan znode hash-tables and kill all znodes, then free hash tables -+ * themselves. 
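blknrhashfn() above works because REISER4_ZNODE_HASH_TABLE_SIZE is a power of two, so masking with (size - 1) is a cheap modulo. The self-contained model below shows such a comparator/hash pair driving a chained table; the macro-generated code in the patch is more involved, this only illustrates bucket selection and lookup.

	#include <stdio.h>
	#include <stdint.h>

	#define TABLE_SIZE 64 /* must remain a power of two */

	typedef uint64_t blocknr_t;

	static unsigned hashfn(blocknr_t b) { return b & (TABLE_SIZE - 1); }
	static int eq(blocknr_t a, blocknr_t b) { return a == b; }

	struct hnode { blocknr_t key; struct hnode *next; };
	static struct hnode *table[TABLE_SIZE];

	static struct hnode *lookup(blocknr_t key)
	{
		struct hnode *n;

		for (n = table[hashfn(key)]; n != NULL; n = n->next)
			if (eq(n->key, key))
				return n;
		return NULL;
	}

	int main(void)
	{
		static struct hnode a = { .key = 12345 };

		table[hashfn(a.key)] = &a;
		printf("found: %d\n", lookup(12345) != NULL);
		return 0;
	}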
*/ -+ -+ assert("nikita-795", tree != NULL); -+ -+ ztable = &tree->zhash_table; -+ -+ if (ztable->_table != NULL) { -+ for_all_in_htable(ztable, z, node, next) { -+ node->c_count = 0; -+ node->in_parent.node = NULL; -+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); -+ zdrop(node); -+ } -+ -+ z_hash_done(&tree->zhash_table); -+ } -+ -+ ztable = &tree->zfake_table; -+ -+ if (ztable->_table != NULL) { -+ for_all_in_htable(ztable, z, node, next) { -+ node->c_count = 0; -+ node->in_parent.node = NULL; -+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); -+ zdrop(node); -+ } -+ -+ z_hash_done(&tree->zfake_table); -+ } -+} -+ -+/* ZNODE STRUCTURES */ -+ -+/* allocate fresh znode */ -+znode *zalloc(gfp_t gfp_flag /* allocation flag */ ) -+{ -+ znode *node; -+ -+ node = kmem_cache_alloc(znode_cache, gfp_flag); -+ return node; -+} -+ -+/* Initialize fields of znode -+ @node: znode to initialize; -+ @parent: parent znode; -+ @tree: tree we are in. */ -+void zinit(znode * node, const znode * parent, reiser4_tree * tree) -+{ -+ assert("nikita-466", node != NULL); -+ assert("umka-268", current_tree != NULL); -+ -+ memset(node, 0, sizeof *node); -+ -+ assert("umka-051", tree != NULL); -+ -+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK); -+ reiser4_init_lock(&node->lock); -+ init_parent_coord(&node->in_parent, parent); -+} -+ -+/* -+ * remove znode from indices. This is called jput() when last reference on -+ * znode is released. -+ */ -+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree) -+{ -+ assert("nikita-2108", node != NULL); -+ assert("nikita-470", node->c_count == 0); -+ assert_rw_write_locked(&(tree->tree_lock)); -+ -+ /* remove reference to this znode from cbk cache */ -+ cbk_cache_invalidate(node, tree); -+ -+ /* update c_count of parent */ -+ if (znode_parent(node) != NULL) { -+ assert("nikita-472", znode_parent(node)->c_count > 0); -+ /* father, onto your hands I forward my spirit... */ -+ znode_parent(node)->c_count--; -+ node->in_parent.node = NULL; -+ } else { -+ /* orphaned znode?! Root? */ -+ } -+ -+ /* remove znode from hash-table */ -+ z_hash_remove_rcu(znode_get_htable(node), node); -+} -+ -+/* zdrop() -- Remove znode from the tree. -+ -+ This is called when znode is removed from the memory. */ -+static void zdrop(znode * node /* znode to finish with */ ) -+{ -+ jdrop(ZJNODE(node)); -+} -+ -+/* -+ * put znode into right place in the hash table. This is called by relocate -+ * code. -+ */ -+int znode_rehash(znode * node /* node to rehash */ , -+ const reiser4_block_nr * new_block_nr /* new block number */ ) -+{ -+ z_hash_table *oldtable; -+ z_hash_table *newtable; -+ reiser4_tree *tree; -+ -+ assert("nikita-2018", node != NULL); -+ -+ tree = znode_get_tree(node); -+ oldtable = znode_get_htable(node); -+ newtable = get_htable(tree, new_block_nr); -+ -+ write_lock_tree(tree); -+ /* remove znode from hash-table */ -+ z_hash_remove_rcu(oldtable, node); -+ -+ /* assertion no longer valid due to RCU */ -+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */ -+ -+ /* update blocknr */ -+ znode_set_block(node, new_block_nr); -+ node->zjnode.key.z = *new_block_nr; -+ -+ /* insert it into hash */ -+ z_hash_insert_rcu(newtable, node); -+ write_unlock_tree(tree); -+ return 0; -+} -+ -+/* ZNODE LOOKUP, GET, PUT */ -+ -+/* zlook() - get znode with given block_nr in a hash table or return NULL -+ -+ If result is non-NULL then the znode's x_count is incremented. Internal version -+ accepts pre-computed hash index. 
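znode_rehash() above moves a node between buckets under the tree write lock, while lookups run under rcu_read_lock() only; the _rcu insert and remove variants are what keep a concurrent reader from seeing a torn bucket. A generic, hedged restatement of that pattern with invented names (the real code additionally fences stale hits with JNODE_RIP and znode_rip_check()):

	#include <linux/rculist.h>
	#include <linux/types.h>

	struct hnode {
		u64 key;
		struct list_head link;
	};

	/* writer side: caller holds the table's write lock */
	static void rehash(struct hnode *n, struct list_head *new_bucket,
			   u64 new_key)
	{
		list_del_rcu(&n->link); /* old-bucket readers may still see n */
		n->key = new_key;
		list_add_rcu(&n->link, new_bucket);
	}

	/* reader side: only rcu_read_lock() is held */
	static struct hnode *find(struct list_head *bucket, u64 key)
	{
		struct hnode *n;

		list_for_each_entry_rcu(n, bucket, link)
			if (n->key == key)
				return n;
		return NULL;
	}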
The hash table is accessed under caller's -+ tree->hash_lock. -+*/ -+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr) -+{ -+ znode *result; -+ __u32 hash; -+ z_hash_table *htable; -+ -+ assert("jmacd-506", tree != NULL); -+ assert("jmacd-507", blocknr != NULL); -+ -+ htable = get_htable(tree, blocknr); -+ hash = blknrhashfn(htable, blocknr); -+ -+ rcu_read_lock(); -+ result = z_hash_find_index(htable, hash, blocknr); -+ -+ if (result != NULL) { -+ add_x_ref(ZJNODE(result)); -+ result = znode_rip_check(tree, result); -+ } -+ rcu_read_unlock(); -+ -+ return result; -+} -+ -+/* return hash table where znode with block @blocknr is (or should be) -+ * stored */ -+static z_hash_table *get_htable(reiser4_tree * tree, -+ const reiser4_block_nr * const blocknr) -+{ -+ z_hash_table *table; -+ if (is_disk_addr_unallocated(blocknr)) -+ table = &tree->zfake_table; -+ else -+ table = &tree->zhash_table; -+ return table; -+} -+ -+/* return hash table where znode @node is (or should be) stored */ -+static z_hash_table *znode_get_htable(const znode * node) -+{ -+ return get_htable(znode_get_tree(node), znode_get_block(node)); -+} -+ -+/* zget() - get znode from hash table, allocating it if necessary. -+ -+ First a call to zlook, locating a x-referenced znode if one -+ exists. If znode is not found, allocate new one and return. Result -+ is returned with x_count reference increased. -+ -+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK -+ LOCK ORDERING: NONE -+*/ -+znode *zget(reiser4_tree * tree, -+ const reiser4_block_nr * const blocknr, -+ znode * parent, tree_level level, gfp_t gfp_flag) -+{ -+ znode *result; -+ __u32 hashi; -+ -+ z_hash_table *zth; -+ -+ assert("jmacd-512", tree != NULL); -+ assert("jmacd-513", blocknr != NULL); -+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT); -+ -+ zth = get_htable(tree, blocknr); -+ hashi = blknrhashfn(zth, blocknr); -+ -+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not -+ implemented. */ -+ -+ z_hash_prefetch_bucket(zth, hashi); -+ -+ rcu_read_lock(); -+ /* Find a matching BLOCKNR in the hash table. If the znode is found, -+ we obtain an reference (x_count) but the znode remains unlocked. -+ Have to worry about race conditions later. */ -+ result = z_hash_find_index(zth, hashi, blocknr); -+ /* According to the current design, the hash table lock protects new -+ znode references. */ -+ if (result != NULL) { -+ add_x_ref(ZJNODE(result)); -+ /* NOTE-NIKITA it should be so, but special case during -+ creation of new root makes such assertion highly -+ complicated. 
*/ -+ assert("nikita-2131", 1 || znode_parent(result) == parent || -+ (ZF_ISSET(result, JNODE_ORPHAN) -+ && (znode_parent(result) == NULL))); -+ result = znode_rip_check(tree, result); -+ } -+ -+ rcu_read_unlock(); -+ -+ if (!result) { -+ znode *shadow; -+ -+ result = zalloc(gfp_flag); -+ if (!result) { -+ return ERR_PTR(RETERR(-ENOMEM)); -+ } -+ -+ zinit(result, parent, tree); -+ ZJNODE(result)->blocknr = *blocknr; -+ ZJNODE(result)->key.z = *blocknr; -+ result->level = level; -+ -+ write_lock_tree(tree); -+ -+ shadow = z_hash_find_index(zth, hashi, blocknr); -+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) { -+ jnode_list_remove(ZJNODE(result)); -+ zfree(result); -+ result = shadow; -+ } else { -+ result->version = znode_build_version(tree); -+ z_hash_insert_index_rcu(zth, hashi, result); -+ -+ if (parent != NULL) -+ ++parent->c_count; -+ } -+ -+ add_x_ref(ZJNODE(result)); -+ -+ write_unlock_tree(tree); -+ } -+#if REISER4_DEBUG -+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0) -+ reiser4_check_block(blocknr, 1); -+#endif -+ /* Check for invalid tree level, return -EIO */ -+ if (unlikely(znode_get_level(result) != level)) { -+ warning("jmacd-504", -+ "Wrong level for cached block %llu: %i expecting %i", -+ (unsigned long long)(*blocknr), znode_get_level(result), -+ level); -+ zput(result); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ assert("nikita-1227", znode_invariant(result)); -+ -+ return result; -+} -+ -+/* ZNODE PLUGINS/DATA */ -+ -+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is -+ stored at the fixed offset from the beginning of the node. */ -+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess -+ * plugin of */ ) -+{ -+ reiser4_tree *tree; -+ -+ assert("nikita-1053", node != NULL); -+ assert("nikita-1055", zdata(node) != NULL); -+ -+ tree = znode_get_tree(node); -+ assert("umka-053", tree != NULL); -+ -+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) { -+ return tree->nplug; -+ } else { -+ return node_plugin_by_disk_id -+ (tree, &((common_node_header *) zdata(node))->plugin_id); -+#ifdef GUESS_EXISTS -+ reiser4_plugin *plugin; -+ -+ /* NOTE-NIKITA add locking here when dynamic plugins will be -+ * implemented */ -+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) { -+ if ((plugin->u.node.guess != NULL) -+ && plugin->u.node.guess(node)) -+ return plugin; -+ } -+ warning("nikita-1057", "Cannot guess node plugin"); -+ print_znode("node", node); -+ return NULL; -+#endif -+ } -+} -+ -+/* parse node header and install ->node_plugin */ -+int zparse(znode * node /* znode to parse */ ) -+{ -+ int result; -+ -+ assert("nikita-1233", node != NULL); -+ assert("nikita-2370", zdata(node) != NULL); -+ -+ if (node->nplug == NULL) { -+ node_plugin *nplug; -+ -+ nplug = znode_guess_plugin(node); -+ if (likely(nplug != NULL)) { -+ result = nplug->parse(node); -+ if (likely(result == 0)) -+ node->nplug = nplug; -+ } else { -+ result = RETERR(-EIO); -+ } -+ } else -+ result = 0; -+ return result; -+} -+ -+/* zload with readahead */ -+int zload_ra(znode * node /* znode to load */ , ra_info_t * info) -+{ -+ int result; -+ -+ assert("nikita-484", node != NULL); -+ assert("nikita-1377", znode_invariant(node)); -+ assert("jmacd-7771", !znode_above_root(node)); -+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0); -+ assert("nikita-3016", reiser4_schedulable()); -+ -+ if (info) -+ formatted_readahead(node, info); -+ -+ result = jload(ZJNODE(node)); -+ assert("nikita-1378", znode_invariant(node)); -+ 
return result; -+} -+ -+/* load content of node into memory */ -+int zload(znode * node) -+{ -+ return zload_ra(node, NULL); -+} -+ -+/* call node plugin to initialise newly allocated node. */ -+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags) -+{ -+ return jinit_new(ZJNODE(node), gfp_flags); -+} -+ -+/* drop reference to node data. When last reference is dropped, data are -+ unloaded. */ -+void zrelse(znode * node /* znode to release references to */ ) -+{ -+ assert("nikita-1381", znode_invariant(node)); -+ -+ jrelse(ZJNODE(node)); -+} -+ -+/* returns free space in node */ -+unsigned znode_free_space(znode * node /* znode to query */ ) -+{ -+ assert("nikita-852", node != NULL); -+ return node_plugin_by_node(node)->free_space(node); -+} -+ -+/* left delimiting key of znode */ -+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ ) -+{ -+ assert("nikita-958", node != NULL); -+ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-30671", node->rd_key_version != 0); -+ return &node->rd_key; -+} -+ -+/* right delimiting key of znode */ -+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ ) -+{ -+ assert("nikita-974", node != NULL); -+ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-30681", node->ld_key_version != 0); -+ return &node->ld_key; -+} -+ -+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0); -+ ) -+ -+/* update right-delimiting key of @node */ -+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key) -+{ -+ assert("nikita-2937", node != NULL); -+ assert("nikita-2939", key != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-2944", -+ znode_is_any_locked(node) || -+ znode_get_level(node) != LEAF_LEVEL || -+ keyge(key, &node->rd_key) || -+ keyeq(&node->rd_key, reiser4_min_key()) || -+ ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ node->rd_key = *key; -+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version)); -+ return &node->rd_key; -+} -+ -+/* update left-delimiting key of @node */ -+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key) -+{ -+ assert("nikita-2940", node != NULL); -+ assert("nikita-2941", key != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-2943", -+ znode_is_any_locked(node) || keyeq(&node->ld_key, -+ reiser4_min_key())); -+ -+ node->ld_key = *key; -+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version)); -+ return &node->ld_key; -+} -+ -+/* true if @key is inside key range for @node */ -+int znode_contains_key(znode * node /* znode to look in */ , -+ const reiser4_key * key /* key to look for */ ) -+{ -+ assert("nikita-1237", node != NULL); -+ assert("nikita-1238", key != NULL); -+ -+ /* left_delimiting_key <= key <= right_delimiting_key */ -+ return keyle(znode_get_ld_key(node), key) -+ && keyle(key, znode_get_rd_key(node)); -+} -+ -+/* same as znode_contains_key(), but lock dk lock */ -+int znode_contains_key_lock(znode * node /* znode to look in */ , -+ const reiser4_key * key /* key to look for */ ) -+{ -+ int result; -+ -+ assert("umka-056", node != NULL); -+ assert("umka-057", key != NULL); -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = znode_contains_key(node, key); -+ read_unlock_dk(znode_get_tree(node)); -+ return 
result; -+} -+ -+/* get parent pointer, assuming tree is not locked */ -+znode *znode_parent_nolock(const znode * node /* child znode */ ) -+{ -+ assert("nikita-1444", node != NULL); -+ return node->in_parent.node; -+} -+ -+/* get parent pointer of znode */ -+znode *znode_parent(const znode * node /* child znode */ ) -+{ -+ assert("nikita-1226", node != NULL); -+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree)); -+ return znode_parent_nolock(node); -+} -+ -+/* detect uber znode used to protect in-superblock tree root pointer */ -+int znode_above_root(const znode * node /* znode to query */ ) -+{ -+ assert("umka-059", node != NULL); -+ -+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR); -+} -+ -+/* check that @node is root---that its block number is recorder in the tree as -+ that of root node */ -+#if REISER4_DEBUG -+static int znode_is_true_root(const znode * node /* znode to query */ ) -+{ -+ assert("umka-060", node != NULL); -+ assert("umka-061", current_tree != NULL); -+ -+ return disk_addr_eq(znode_get_block(node), -+ &znode_get_tree(node)->root_block); -+} -+#endif -+ -+/* check that @node is root */ -+int znode_is_root(const znode * node /* znode to query */ ) -+{ -+ assert("nikita-1206", node != NULL); -+ -+ return znode_get_level(node) == znode_get_tree(node)->height; -+} -+ -+/* Returns true is @node was just created by zget() and wasn't ever loaded -+ into memory. */ -+/* NIKITA-HANS: yes */ -+int znode_just_created(const znode * node) -+{ -+ assert("nikita-2188", node != NULL); -+ return (znode_page(node) == NULL); -+} -+ -+/* obtain updated ->znode_epoch. See seal.c for description. */ -+__u64 znode_build_version(reiser4_tree * tree) -+{ -+ __u64 result; -+ -+ spin_lock(&tree->epoch_lock); -+ result = ++tree->znode_epoch; -+ spin_unlock(&tree->epoch_lock); -+ return result; -+} -+ -+void init_load_count(load_count * dh) -+{ -+ assert("nikita-2105", dh != NULL); -+ memset(dh, 0, sizeof *dh); -+} -+ -+void done_load_count(load_count * dh) -+{ -+ assert("nikita-2106", dh != NULL); -+ if (dh->node != NULL) { -+ for (; dh->d_ref > 0; --dh->d_ref) -+ zrelse(dh->node); -+ dh->node = NULL; -+ } -+} -+ -+static int incr_load_count(load_count * dh) -+{ -+ int result; -+ -+ assert("nikita-2110", dh != NULL); -+ assert("nikita-2111", dh->node != NULL); -+ -+ result = zload(dh->node); -+ if (result == 0) -+ ++dh->d_ref; -+ return result; -+} -+ -+int incr_load_count_znode(load_count * dh, znode * node) -+{ -+ assert("nikita-2107", dh != NULL); -+ assert("nikita-2158", node != NULL); -+ assert("nikita-2109", -+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0))); -+ -+ dh->node = node; -+ return incr_load_count(dh); -+} -+ -+int incr_load_count_jnode(load_count * dh, jnode * node) -+{ -+ if (jnode_is_znode(node)) { -+ return incr_load_count_znode(dh, JZNODE(node)); -+ } -+ return 0; -+} -+ -+void copy_load_count(load_count * new, load_count * old) -+{ -+ int ret = 0; -+ done_load_count(new); -+ new->node = old->node; -+ new->d_ref = 0; -+ -+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) { -+ } -+ -+ assert("jmacd-87589", ret == 0); -+} -+ -+void move_load_count(load_count * new, load_count * old) -+{ -+ done_load_count(new); -+ new->node = old->node; -+ new->d_ref = old->d_ref; -+ old->node = NULL; -+ old->d_ref = 0; -+} -+ -+/* convert parent pointer into coord */ -+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord) -+{ -+ assert("nikita-3204", pcoord != NULL); -+ assert("nikita-3205", coord != NULL); -+ -+ 
coord_init_first_unit_nocheck(coord, pcoord->node); -+ coord_set_item_pos(coord, pcoord->item_pos); -+ coord->between = AT_UNIT; -+} -+ -+/* pack coord into parent_coord_t */ -+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord) -+{ -+ assert("nikita-3206", pcoord != NULL); -+ assert("nikita-3207", coord != NULL); -+ -+ pcoord->node = coord->node; -+ pcoord->item_pos = coord->item_pos; -+} -+ -+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode, -+ look for comments there) */ -+void init_parent_coord(parent_coord_t * pcoord, const znode * node) -+{ -+ pcoord->node = (znode *) node; -+ pcoord->item_pos = (unsigned short)~0; -+} -+ -+#if REISER4_DEBUG -+ -+/* debugging aid: znode invariant */ -+static int znode_invariant_f(const znode * node /* znode to check */ , -+ char const **msg /* where to store error -+ * message, if any */ ) -+{ -+#define _ergo(ant, con) \ -+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) -+ -+#define _equi(e1, e2) \ -+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2))) -+ -+#define _check(exp) ((*msg) = #exp, (exp)) -+ -+ return jnode_invariant_f(ZJNODE(node), msg) && -+ /* [znode-fake] invariant */ -+ /* fake znode doesn't have a parent, and */ -+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) && -+ /* there is another way to express this very check, and */ -+ _ergo(znode_above_root(node), znode_parent(node) == NULL) && -+ /* it has special block number, and */ -+ _ergo(znode_get_level(node) == 0, -+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && -+ /* it is the only znode with such block number, and */ -+ _ergo(!znode_above_root(node) && znode_is_loaded(node), -+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && -+ /* it is parent of the tree root node */ -+ _ergo(znode_is_true_root(node), -+ znode_above_root(znode_parent(node))) && -+ /* [znode-level] invariant */ -+ /* level of parent znode is one larger than that of child, -+ except for the fake znode, and */ -+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)), -+ znode_get_level(znode_parent(node)) == -+ znode_get_level(node) + 1) && -+ /* left neighbor is at the same level, and */ -+ _ergo(znode_is_left_connected(node) && node->left != NULL, -+ znode_get_level(node) == znode_get_level(node->left)) && -+ /* right neighbor is at the same level */ -+ _ergo(znode_is_right_connected(node) && node->right != NULL, -+ znode_get_level(node) == znode_get_level(node->right)) && -+ /* [znode-connected] invariant */ -+ _ergo(node->left != NULL, znode_is_left_connected(node)) && -+ _ergo(node->right != NULL, znode_is_right_connected(node)) && -+ _ergo(!znode_is_root(node) && node->left != NULL, -+ znode_is_right_connected(node->left) && -+ node->left->right == node) && -+ _ergo(!znode_is_root(node) && node->right != NULL, -+ znode_is_left_connected(node->right) && -+ node->right->left == node) && -+ /* [znode-c_count] invariant */ -+ /* for any znode, c_count of its parent is greater than 0 */ -+ _ergo(znode_parent(node) != NULL && -+ !znode_above_root(znode_parent(node)), -+ znode_parent(node)->c_count > 0) && -+ /* leaves don't have children */ -+ _ergo(znode_get_level(node) == LEAF_LEVEL, -+ node->c_count == 0) && -+ _check(node->zjnode.jnodes.prev != NULL) && -+ _check(node->zjnode.jnodes.next != NULL) && -+ /* orphan doesn't have a parent */ -+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) && -+ /* [znode-modify] invariant */ -+ /* if znode is not write-locked, its checksum remains -+ * 
invariant */ -+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we -+ * cannot check this. */ -+ /* [znode-refs] invariant */ -+ /* only referenced znode can be long-term locked */ -+ _ergo(znode_is_locked(node), -+ atomic_read(&ZJNODE(node)->x_count) != 0); -+} -+ -+/* debugging aid: check znode invariant and panic if it doesn't hold */ -+int znode_invariant(znode * node /* znode to check */ ) -+{ -+ char const *failed_msg; -+ int result; -+ -+ assert("umka-063", node != NULL); -+ assert("umka-064", current_tree != NULL); -+ -+ spin_lock_znode(node); -+ read_lock_tree(znode_get_tree(node)); -+ result = znode_invariant_f(node, &failed_msg); -+ if (!result) { -+ /* print_znode("corrupted node", node); */ -+ warning("jmacd-555", "Condition %s failed", failed_msg); -+ } -+ read_unlock_tree(znode_get_tree(node)); -+ spin_unlock_znode(node); -+ return result; -+} -+ -+/* return non-0 iff data are loaded into znode */ -+int znode_is_loaded(const znode * node /* znode to query */ ) -+{ -+ assert("nikita-497", node != NULL); -+ return jnode_is_loaded(ZJNODE(node)); -+} -+ -+unsigned long znode_times_locked(const znode * z) -+{ -+ return z->times_locked; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/fs/reiser4/znode.h linux-2.6.30/fs/reiser4/znode.h ---- linux-2.6.30.orig/fs/reiser4/znode.h 1970-01-01 01:00:00.000000000 +0100 -+++ linux-2.6.30/fs/reiser4/znode.h 2009-06-22 16:08:13.000000000 +0200 -@@ -0,0 +1,433 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of znode (Zam's node). See znode.c for more details. */ -+ -+#ifndef __ZNODE_H__ -+#define __ZNODE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "lock.h" -+#include "readahead.h" -+ -+#include <linux/types.h> -+#include <linux/spinlock.h> -+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */ -+#include <asm/atomic.h> -+ -+/* znode tracks its position within parent (internal item in a parent node, -+ * that contains znode's block number). */ -+typedef struct parent_coord { -+ znode *node; -+ pos_in_node_t item_pos; -+} parent_coord_t; -+ -+/* &znode - node in a reiser4 tree. -+ -+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce -+ cacheline pressure. -+ -+ Locking: -+ -+ Long term: data in a disk node attached to this znode are protected -+ by long term, deadlock aware lock ->lock; -+ -+ Spin lock: the following fields are protected by the spin lock: -+ -+ ->lock -+ -+ Following fields are protected by the global tree lock: -+ -+ ->left -+ ->right -+ ->in_parent -+ ->c_count -+ -+ Following fields are protected by the global delimiting key lock (dk_lock): -+ -+ ->ld_key (to update ->ld_key long-term lock on the node is also required) -+ ->rd_key -+ -+ Following fields are protected by the long term lock: -+ -+ ->nr_items -+ -+ ->node_plugin is never changed once set. This means that after code made -+ itself sure that field is valid it can be accessed without any additional -+ locking. -+ -+ ->level is immutable. 
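The dk_lock rules above are easiest to see in a caller: both delimiting keys must be sampled under the same read-held dk lock or the range check can tear. This is a hedged restatement of the pattern znode_contains_key_lock() uses in znode.c:

	static int key_in_node(znode *node, const reiser4_key *key)
	{
		int hit;

		read_lock_dk(znode_get_tree(node));
		/* left_delimiting_key <= key <= right_delimiting_key */
		hit = keyle(znode_get_ld_key(node), key) &&
		      keyle(key, znode_get_rd_key(node));
		read_unlock_dk(znode_get_tree(node));
		return hit;
	}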
-+ -+ Invariants involving this data-type: -+ -+ [znode-fake] -+ [znode-level] -+ [znode-connected] -+ [znode-c_count] -+ [znode-refs] -+ [jnode-refs] -+ [jnode-queued] -+ [znode-modify] -+ -+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks. -+ Suggestions for how to do that are desired.*/ -+struct znode { -+ /* Embedded jnode. */ -+ jnode zjnode; -+ -+ /* contains three subfields, node, pos_in_node, and pos_in_unit. -+ -+ pos_in_node and pos_in_unit are only hints that are cached to -+ speed up lookups during balancing. They are not required to be up to -+ date. Synched in find_child_ptr(). -+ -+ This value allows us to avoid expensive binary searches. -+ -+ in_parent->node points to the parent of this node, and is NOT a -+ hint. -+ */ -+ parent_coord_t in_parent; -+ -+ /* -+ * sibling list pointers -+ */ -+ -+ /* left-neighbor */ -+ znode *left; -+ /* right-neighbor */ -+ znode *right; -+ -+ /* long term lock on node content. This lock supports deadlock -+ detection. See lock.c -+ */ -+ zlock lock; -+ -+ /* You cannot remove from memory a node that has children in -+ memory. This is because we rely on the fact that parent of given -+ node can always be reached without blocking for io. When reading a -+ node into memory you must increase the c_count of its parent, when -+ removing it from memory you must decrease the c_count. This makes -+ the code simpler, and the cases where it is suboptimal are truly -+ obscure. -+ */ -+ int c_count; -+ -+ /* plugin of node attached to this znode. NULL if znode is not -+ loaded. */ -+ node_plugin *nplug; -+ -+ /* version of znode data. This is increased on each modification. This -+ * is necessary to implement seals (see seal.[ch]) efficiently. */ -+ __u64 version; -+ -+ /* left delimiting key. Necessary to efficiently perform -+ balancing with node-level locking. Kept in memory only. */ -+ reiser4_key ld_key; -+ /* right delimiting key. */ -+ reiser4_key rd_key; -+ -+ /* znode's tree level */ -+ __u16 level; -+ /* number of items in this node. This field is modified by node -+ * plugin. */ -+ __u16 nr_items; -+ -+#if REISER4_DEBUG -+ void *creator; -+ reiser4_key first_key; -+ unsigned long times_locked; -+ int left_version; /* when node->left was updated */ -+ int right_version; /* when node->right was updated */ -+ int ld_key_version; /* when node->ld_key was updated */ -+ int rd_key_version; /* when node->rd_key was updated */ -+#endif -+ -+} __attribute__ ((aligned(16))); -+ -+ON_DEBUG(extern atomic_t delim_key_version; -+ ) -+ -+/* In general I think these macros should not be exposed. */ -+#define znode_is_locked(node) (lock_is_locked(&node->lock)) -+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock)) -+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock)) -+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock)) -+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock)) -+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode)) -+/* Macros for accessing the znode state. 
*/ -+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f)) -+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f)) -+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f)) -+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block, -+ znode * parent, tree_level level, gfp_t gfp_flag); -+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block); -+extern int zload(znode * node); -+extern int zload_ra(znode * node, ra_info_t * info); -+extern int zinit_new(znode * node, gfp_t gfp_flags); -+extern void zrelse(znode * node); -+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block); -+ -+/* size of data in znode */ -+static inline unsigned -+znode_size(const znode * node UNUSED_ARG /* znode to query */ ) -+{ -+ assert("nikita-1416", node != NULL); -+ return PAGE_CACHE_SIZE; -+} -+ -+extern void parent_coord_to_coord(const parent_coord_t * pcoord, -+ coord_t * coord); -+extern void coord_to_parent_coord(const coord_t * coord, -+ parent_coord_t * pcoord); -+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node); -+ -+extern unsigned znode_free_space(znode * node); -+ -+extern reiser4_key *znode_get_rd_key(znode * node); -+extern reiser4_key *znode_get_ld_key(znode * node); -+ -+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key); -+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key); -+ -+/* `connected' state checks */ -+static inline int znode_is_right_connected(const znode * node) -+{ -+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED); -+} -+ -+static inline int znode_is_left_connected(const znode * node) -+{ -+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED); -+} -+ -+static inline int znode_is_connected(const znode * node) -+{ -+ return znode_is_right_connected(node) && znode_is_left_connected(node); -+} -+ -+extern int znode_shift_order; -+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr); -+extern void znode_remove(znode *, reiser4_tree *); -+extern znode *znode_parent(const znode * node); -+extern znode *znode_parent_nolock(const znode * node); -+extern int znode_above_root(const znode * node); -+extern int init_znodes(void); -+extern void done_znodes(void); -+extern int znodes_tree_init(reiser4_tree * ztree); -+extern void znodes_tree_done(reiser4_tree * ztree); -+extern int znode_contains_key(znode * node, const reiser4_key * key); -+extern int znode_contains_key_lock(znode * node, const reiser4_key * key); -+extern unsigned znode_save_free_space(znode * node); -+extern unsigned znode_recover_free_space(znode * node); -+extern znode *zalloc(gfp_t gfp_flag); -+extern void zinit(znode *, const znode * parent, reiser4_tree *); -+extern int zparse(znode * node); -+ -+extern int znode_just_created(const znode * node); -+ -+extern void zfree(znode * node); -+ -+#if REISER4_DEBUG -+extern void print_znode(const char *prefix, const znode * node); -+#else -+#define print_znode( p, n ) noop -+#endif -+ -+/* Make it look like various znode functions exist instead of treating znodes as -+ jnodes in znode-specific code. 
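The accessor macros that follow work because struct znode embeds its jnode as the first member, so converting between the two is address preserving. A sketch of what the two conversions amount to under that assumption; the actual definitions elsewhere in the patch likely add debugging assertions:

	/* znode -> embedded jnode: take the address of the first field */
	#define ZJNODE(x) (&(x)->zjnode)

	/* jnode -> enclosing znode: same address, cast back */
	#define JZNODE(x) ((znode *)(x))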
*/ -+#define znode_page(x) jnode_page ( ZJNODE(x) ) -+#define zdata(x) jdata ( ZJNODE(x) ) -+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) ) -+#define znode_created(x) jnode_created ( ZJNODE(x) ) -+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) ) -+#define znode_convertible(x) jnode_convertible (ZJNODE(x)) -+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x)) -+ -+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) ) -+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) ) -+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) ) -+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) ) -+ -+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) ) -+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) ) -+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) ) -+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) ) -+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) ) -+ -+#if REISER4_DEBUG -+extern int znode_x_count_is_protected(const znode * node); -+extern int znode_invariant(znode * node); -+#endif -+ -+/* acquire reference to @node */ -+static inline znode *zref(znode * node) -+{ -+ /* change of x_count from 0 to 1 is protected by tree spin-lock */ -+ return JZNODE(jref(ZJNODE(node))); -+} -+ -+/* release reference to @node */ -+static inline void zput(znode * node) -+{ -+ assert("nikita-3564", znode_invariant(node)); -+ jput(ZJNODE(node)); -+} -+ -+/* get the level field for a znode */ -+static inline tree_level znode_get_level(const znode * node) -+{ -+ return node->level; -+} -+ -+/* get the level field for a jnode */ -+static inline tree_level jnode_get_level(const jnode * node) -+{ -+ if (jnode_is_znode(node)) -+ return znode_get_level(JZNODE(node)); -+ else -+ /* unformatted nodes are all at the LEAF_LEVEL and for -+ "semi-formatted" nodes like bitmaps, level doesn't matter. */ -+ return LEAF_LEVEL; -+} -+ -+/* true if jnode is on leaf level */ -+static inline int jnode_is_leaf(const jnode * node) -+{ -+ if (jnode_is_znode(node)) -+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL); -+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK) -+ return 1; -+ return 0; -+} -+ -+/* return znode's tree */ -+static inline reiser4_tree *znode_get_tree(const znode * node) -+{ -+ assert("nikita-2692", node != NULL); -+ return jnode_get_tree(ZJNODE(node)); -+} -+ -+/* resolve race with zput */ -+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node) -+{ -+ jnode *j; -+ -+ j = jnode_rip_sync(tree, ZJNODE(node)); -+ if (likely(j != NULL)) -+ node = JZNODE(j); -+ else -+ node = NULL; -+ return node; -+} -+ -+#if defined(REISER4_DEBUG) -+int znode_is_loaded(const znode * node /* znode to query */ ); -+#endif -+ -+extern __u64 znode_build_version(reiser4_tree * tree); -+ -+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We -+ must load the data for a node in many places. We could do this by simply calling -+ zload() everywhere, the difficulty arises when we must release the loaded data by -+ calling zrelse. In a function with many possible error/return paths, it requires extra -+ work to figure out which exit paths must call zrelse and those which do not. The data -+ handle automatically calls zrelse for every zload that it is responsible for. In that -+ sense, it acts much like a lock_handle. 
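Because done_load_count() releases however many zload() references the handle accumulated, a function with several exit paths needs exactly one cleanup call. A hedged sketch using the helpers declared below:

	static int inspect_node(znode *node)
	{
		load_count lc;
		int ret;

		init_load_count(&lc);
		ret = incr_load_count_znode(&lc, node); /* zload() inside */
		if (ret)
			return ret; /* nothing is held on failure */

		if (znode_free_space(node) == 0) {
			ret = RETERR(-ENOSPC); /* early exit, zrelse owed */
			goto out;
		}
		/* ... more work against the loaded data ... */
	out:
		done_load_count(&lc); /* pays back every zload() taken */
		return ret;
	}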
-+*/ -+typedef struct load_count { -+ znode *node; -+ int d_ref; -+} load_count; -+ -+extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */ -+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */ -+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */ -+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as -+ * incr_load_count_znode, otherwise do nothing (unformatted nodes -+ * don't require zload/zrelse treatment). */ -+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */ -+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */ -+ -+/* Variable initializers for load_count. */ -+#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 } -+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 } -+/* A convenience macro for use in assertions or debug-only code, where loaded -+ data is only required to perform the debugging check. This macro -+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */ -+#define WITH_DATA( node, exp ) \ -+({ \ -+ long __with_dh_result; \ -+ znode *__with_dh_node; \ -+ \ -+ __with_dh_node = ( node ); \ -+ __with_dh_result = zload( __with_dh_node ); \ -+ if( __with_dh_result == 0 ) { \ -+ __with_dh_result = ( long )( exp ); \ -+ zrelse( __with_dh_node ); \ -+ } \ -+ __with_dh_result; \ -+}) -+ -+/* Same as above, but accepts a return value in case zload fails. */ -+#define WITH_DATA_RET( node, ret, exp ) \ -+({ \ -+ int __with_dh_result; \ -+ znode *__with_dh_node; \ -+ \ -+ __with_dh_node = ( node ); \ -+ __with_dh_result = zload( __with_dh_node ); \ -+ if( __with_dh_result == 0 ) { \ -+ __with_dh_result = ( int )( exp ); \ -+ zrelse( __with_dh_node ); \ -+ } else \ -+ __with_dh_result = ( ret ); \ -+ __with_dh_result; \ -+}) -+ -+#define WITH_COORD(coord, exp) \ -+({ \ -+ coord_t *__coord; \ -+ \ -+ __coord = (coord); \ -+ coord_clear_iplug(__coord); \ -+ WITH_DATA(__coord->node, exp); \ -+}) -+ -+#if REISER4_DEBUG -+#define STORE_COUNTERS \ -+ reiser4_lock_cnt_info __entry_counters = \ -+ *reiser4_lock_counters() -+#define CHECK_COUNTERS \ -+ON_DEBUG_CONTEXT( \ -+({ \ -+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \ -+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \ -+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \ -+ assert("nikita-2159", \ -+ !memcmp(&__entry_counters, reiser4_lock_counters(), \ -+ sizeof __entry_counters)); \ -+}) ) -+ -+#else -+#define STORE_COUNTERS -+#define CHECK_COUNTERS noop -+#endif -+ -+/* __ZNODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.30.orig/include/linux/fs.h linux-2.6.30/include/linux/fs.h ---- linux-2.6.30.orig/include/linux/fs.h 2009-06-23 00:20:41.000000000 +0200 -+++ linux-2.6.30/include/linux/fs.h 2009-06-22 16:08:13.000000000 +0200 -@@ -1571,6 +1571,8 @@ - void (*clear_inode) (struct inode *); - void (*umount_begin) (struct super_block *); - -+ void (*sync_inodes) (struct super_block *sb, -+ struct writeback_control *wbc); - int (*show_options)(struct seq_file *, struct vfsmount *); - int (*show_stats)(struct seq_file *, struct vfsmount *); - #ifdef CONFIG_QUOTA -@@ -2067,6 +2069,7 @@ - extern void generic_sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc); - extern int write_inode_now(struct inode *, int); -+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *); - extern int filemap_fdatawrite(struct address_space *); - extern int filemap_flush(struct address_space *); - extern int filemap_fdatawait(struct address_space *); -diff -urN linux-2.6.30.orig/include/linux/mm.h linux-2.6.30/include/linux/mm.h ---- linux-2.6.30.orig/include/linux/mm.h 2009-06-23 00:20:41.000000000 +0200 -+++ linux-2.6.30/include/linux/mm.h 2009-06-22 16:17:44.000000000 +0200 -@@ -838,6 +838,7 @@ - void account_page_dirtied(struct page *page, struct address_space *mapping); - int set_page_dirty(struct page *page); - int set_page_dirty_lock(struct page *page); -+int set_page_dirty_notag(struct page *page); - int clear_page_dirty_for_io(struct page *page); - - extern unsigned long move_page_tables(struct vm_area_struct *vma, -diff -urN linux-2.6.30.orig/mm/filemap.c linux-2.6.30/mm/filemap.c ---- linux-2.6.30.orig/mm/filemap.c 2009-06-23 00:20:41.000000000 +0200 -+++ linux-2.6.30/mm/filemap.c 2009-06-22 16:51:45.000000000 +0200 -@@ -134,6 +134,7 @@ - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - } - } -+EXPORT_SYMBOL(__remove_from_page_cache); - - void remove_from_page_cache(struct page *page) - { -@@ -146,6 +147,7 @@ - spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); - } -+EXPORT_SYMBOL(remove_from_page_cache); - - static int sync_page(void *word) - { -@@ -1009,6 +1011,7 @@ - - ra->ra_pages /= 4; - } -+EXPORT_SYMBOL(find_get_pages); - - /** - * do_generic_file_read - generic file read routine -diff -urN linux-2.6.30.orig/mm/page-writeback.c linux-2.6.30/mm/page-writeback.c ---- linux-2.6.30.orig/mm/page-writeback.c 2009-06-23 00:20:41.000000000 +0200 -+++ linux-2.6.30/mm/page-writeback.c 2009-06-22 16:53:49.000000000 +0200 -@@ -1258,6 +1258,32 @@ - EXPORT_SYMBOL(__set_page_dirty_nobuffers); - - /* -+ * set_page_dirty_notag() -- similar to __set_page_dirty_nobuffers() -+ * except it doesn't tag the page dirty in the page-cache radix tree. -+ * This means that the address space using this cannot use the regular -+ * filemap ->writepages() helpers and must provide its own means of -+ * tracking and finding non-tagged dirty pages. -+ * -+ * NOTE: furthermore, this version also doesn't handle truncate races. 
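Because the radix tree dirty tag is never set, tag-based scans such as the generic writepages helpers will not find these pages, so the owner must keep its own record of what is dirty. The sketch below shows one hypothetical shape such tracking could take; the struct, lock and list are invented here, while the real patch tracks dirtiness through jnodes captured into atoms.

	/* hypothetical private dirty tracking, not part of the patch */
	struct tracked_page {
		struct page *page;
		struct list_head link; /* on the owner's dirty list */
	};

	struct dirty_list {
		spinlock_t lock;
		struct list_head pages;
	};

	static void mark_dirty_tracked(struct dirty_list *dl,
				       struct tracked_page *tp)
	{
		/* returns 1 only on the clean-to-dirty transition */
		if (set_page_dirty_notag(tp->page)) {
			spin_lock(&dl->lock);
			list_add_tail(&tp->link, &dl->pages);
			spin_unlock(&dl->lock);
		}
	}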
-+ */ -+int set_page_dirty_notag(struct page *page) -+{ -+ struct address_space *mapping = page->mapping; -+ -+ if (!TestSetPageDirty(page)) { -+ unsigned long flags; -+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); -+ local_irq_save(flags); -+ account_page_dirtied(page, mapping); -+ local_irq_restore(flags); -+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -+ return 1; -+ } -+ return 0; -+} -+EXPORT_SYMBOL(set_page_dirty_notag); -+ -+/* - * When a writepage implementation decides that it doesn't want to write this - * page for some reason, it should redirty the locked page via - * redirty_page_for_writepage() and it should then unlock the page and return 0 diff --git a/pkgs/core/kernel/patches/reiser4-for-2.6.33.patch.off b/pkgs/core/kernel/patches/reiser4-for-2.6.33.patch.off new file mode 100644 index 0000000..6e007ca --- /dev/null +++ b/pkgs/core/kernel/patches/reiser4-for-2.6.33.patch.off @@ -0,0 +1,78650 @@ +diff -urN linux-2.6.33.orig/Documentation/Changes linux-2.6.33/Documentation/Changes +--- linux-2.6.33.orig/Documentation/Changes 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/Documentation/Changes 2010-03-04 19:33:22.000000000 +0100 +@@ -36,6 +36,7 @@ + o e2fsprogs 1.41.4 # e2fsck -V + o jfsutils 1.1.3 # fsck.jfs -V + o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs ++o reiser4progs 1.0.0 # fsck.reiser4 -V + o xfsprogs 2.6.0 # xfs_db -V + o squashfs-tools 4.0 # mksquashfs -version + o btrfs-progs 0.18 # btrfsck +@@ -157,6 +158,13 @@ + versions of mkreiserfs, resize_reiserfs, debugreiserfs and + reiserfsck. These utils work on both i386 and alpha platforms. + ++Reiser4progs ++------------ ++ ++The reiser4progs package contains utilities for the reiser4 file system. ++Detailed instructions are provided in the README file located at: ++ftp://ftp.namesys.com/pub/reiser4progs/README. ++ + Xfsprogs + -------- + +@@ -345,6 +353,10 @@ + ------------- + o http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz + ++Reiser4progs ++------------ ++o ftp://ftp.namesys.com/pub/reiser4progs/ ++ + Xfsprogs + -------- + o ftp://oss.sgi.com/projects/xfs/download/ +diff -urN linux-2.6.33.orig/Documentation/filesystems/reiser4.txt linux-2.6.33/Documentation/filesystems/reiser4.txt +--- linux-2.6.33.orig/Documentation/filesystems/reiser4.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/Documentation/filesystems/reiser4.txt 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,75 @@ ++Reiser4 filesystem ++================== ++Reiser4 is a file system based on dancing tree algorithms, and is ++described at http://www.namesys.com ++ ++ ++References ++========== ++web page http://namesys.com/v4/v4.html ++source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/ ++userland tools ftp://ftp.namesys.com/pub/reiser4progs/ ++install page http://www.namesys.com/install_v4.html ++ ++Compile options ++=============== ++Enable reiser4 debug mode ++ This checks everything imaginable while reiser4 ++ runs ++ ++Mount options ++============= ++tmgr.atom_max_size=N ++ Atoms containing more than N blocks will be forced to commit. ++ N is decimal. ++ Default is nr_free_pagecache_pages() / 2 at mount time. ++ ++tmgr.atom_max_age=N ++ Atoms older than N seconds will be forced to commit. N is decimal. ++ Default is 600. ++ ++tmgr.atom_max_flushers=N ++ Limit of concurrent flushers for one atom. 0 means no limit. ++ Default is 0. ++ ++tree.cbk_cache.nr_slots=N ++ Number of slots in the cbk cache. 
++ ++flush.relocate_threshold=N ++ If flush finds more than N adjacent dirty leaf-level blocks it ++ will force them to be relocated. ++ Default is 64. ++ ++flush.relocate_distance=N ++ If flush finds can find a block allocation closer than at most ++ N from the preceder it will relocate to that position. ++ Default is 64. ++ ++flush.scan_maxnodes=N ++ The maximum number of nodes to scan left on a level during ++ flush. ++ Default is 10000. ++ ++optimal_io_size=N ++ Preferred IO size. This value is used to set st_blksize of ++ struct stat. ++ Default is 65536. ++ ++bsdgroups ++ Turn on BSD-style gid assignment. ++ ++32bittimes ++ By default file in reiser4 have 64 bit timestamps. Files ++ created when filesystem is mounted with 32bittimes mount ++ option will get 32 bit timestamps. ++ ++mtflush ++ Turn off concurrent flushing. ++ ++nopseudo ++ Disable pseudo files support. See ++ http://namesys.com/v4/pseudo.html for more about pseudo files. ++ ++dont_load_bitmap ++ Don't load all bitmap blocks at mount time, it is useful for ++ machines with tiny RAM and large disks. +diff -urN linux-2.6.33.orig/fs/fs-writeback.c linux-2.6.33/fs/fs-writeback.c +--- linux-2.6.33.orig/fs/fs-writeback.c 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/fs/fs-writeback.c 2010-03-04 20:21:39.000000000 +0100 +@@ -549,108 +549,85 @@ + return ret; + } + +-static void unpin_sb_for_writeback(struct super_block **psb) ++static void unpin_sb_for_writeback(struct super_block *sb) + { +- struct super_block *sb = *psb; +- +- if (sb) { +- up_read(&sb->s_umount); +- put_super(sb); +- *psb = NULL; +- } ++ up_read(&sb->s_umount); ++ put_super(sb); + } + ++enum sb_pin_state { ++ SB_PINNED, ++ SB_NOT_PINNED, ++ SB_PIN_FAILED ++}; ++ + /* + * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. +- * +- * Returns 0 if the super was successfully pinned (or pinning wasn't needed), +- * 1 if we failed. + */ +-static int pin_sb_for_writeback(struct writeback_control *wbc, +- struct inode *inode, struct super_block **psb) ++static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, ++ struct super_block *sb) + { +- struct super_block *sb = inode->i_sb; +- +- /* +- * If this sb is already pinned, nothing more to do. If not and +- * *psb is non-NULL, unpin the old one first +- */ +- if (sb == *psb) +- return 0; +- else if (*psb) +- unpin_sb_for_writeback(psb); +- + /* + * Caller must already hold the ref for this + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + WARN_ON(!rwsem_is_locked(&sb->s_umount)); +- return 0; ++ return SB_NOT_PINNED; + } +- + spin_lock(&sb_lock); + sb->s_count++; + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) { + spin_unlock(&sb_lock); +- goto pinned; ++ return SB_PINNED; + } + /* + * umounted, drop rwsem again and fall through to failure + */ + up_read(&sb->s_umount); + } +- + sb->s_count--; + spin_unlock(&sb_lock); +- return 1; +-pinned: +- *psb = sb; +- return 0; ++ return SB_PIN_FAILED; + } + +-static void writeback_inodes_wb(struct bdi_writeback *wb, ++/* ++ * Write a portion of b_io inodes which belong to @sb. ++ * If @wbc->sb != NULL, then find and write all such ++ * inodes. Otherwise write only ones which go sequentially ++ * in reverse order. ++ * Return 1, if the caller writeback routine should be ++ * interrupted. Otherwise return 0. 
++ */ ++int generic_writeback_sb_inodes(struct super_block *sb, ++ struct bdi_writeback *wb, + struct writeback_control *wbc) + { +- struct super_block *sb = wbc->sb, *pin_sb = NULL; +- const unsigned long start = jiffies; /* livelock avoidance */ +- +- spin_lock(&inode_lock); +- +- if (!wbc->for_kupdate || list_empty(&wb->b_io)) +- queue_io(wb, wbc->older_than_this); +- + while (!list_empty(&wb->b_io)) { +- struct inode *inode = list_entry(wb->b_io.prev, +- struct inode, i_list); + long pages_skipped; +- +- /* +- * super block given and doesn't match, skip this inode +- */ +- if (sb && sb != inode->i_sb) { ++ struct inode *inode = list_entry(wb->b_io.prev, ++ struct inode, i_list); ++ if (wbc->sb && sb != inode->i_sb) { ++ /* super block given and doesn't ++ match, skip this inode */ + redirty_tail(inode); + continue; + } +- ++ if (sb != inode->i_sb) ++ /* finish with this superblock */ ++ return 0; + if (inode->i_state & (I_NEW | I_WILL_FREE)) { + requeue_io(inode); + continue; + } +- + /* + * Was this inode dirtied after sync_sb_inodes was called? + * This keeps sync from extra jobs and livelock. + */ +- if (inode_dirtied_after(inode, start)) +- break; +- +- if (pin_sb_for_writeback(wbc, inode, &pin_sb)) { +- requeue_io(inode); +- continue; +- } ++ if (inode_dirtied_after(inode, wbc->wb_start)) ++ return 1; + + BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); + __iget(inode); +@@ -669,14 +646,78 @@ + spin_lock(&inode_lock); + if (wbc->nr_to_write <= 0) { + wbc->more_io = 1; +- break; ++ return 1; + } + if (!list_empty(&wb->b_more_io)) + wbc->more_io = 1; + } ++ /* b_io is empty */ ++ return 1; ++} ++EXPORT_SYMBOL(generic_writeback_sb_inodes); ++ ++/* ++ * This function is for file systems which have their ++ * own means of periodical write-out of old data. ++ * NOTE: inode_lock should be hold. ++ * ++ * Skip a portion of b_io inodes which belong to @sb ++ * and go sequentially in reverse order. 
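++ *
++ * An illustrative sketch (not part of this patch): a filesystem whose
++ * periodic write-out is handled entirely by its own daemon could
++ * implement the ->writeback_inodes() super operation as
++ *
++ *	static int foofs_writeback_inodes(struct super_block *sb,
++ *					  struct bdi_writeback *wb,
++ *					  struct writeback_control *wbc)
++ *	{
++ *		writeback_skip_sb_inodes(sb, wb);
++ *		return 0;	/* do not interrupt the caller's loop */
++ *	}
++ *
++ * where "foofs" is a hypothetical filesystem name; the signature and the
++ * return convention follow generic_writeback_sb_inodes() above.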
++ */ ++void writeback_skip_sb_inodes(struct super_block *sb, ++ struct bdi_writeback *wb) ++{ ++ while (1) { ++ struct inode *inode; ++ ++ if (list_empty(&wb->b_io)) ++ break; ++ inode = list_entry(wb->b_io.prev, struct inode, i_list); ++ if (sb != inode->i_sb) ++ break; ++ redirty_tail(inode); ++ } ++} ++EXPORT_SYMBOL(writeback_skip_sb_inodes); + +- unpin_sb_for_writeback(&pin_sb); ++static void writeback_inodes_wb(struct bdi_writeback *wb, ++ struct writeback_control *wbc) ++{ ++ int ret = 0; + ++ wbc->wb_start = jiffies; /* livelock avoidance */ ++ spin_lock(&inode_lock); ++ if (!wbc->for_kupdate || list_empty(&wb->b_io)) ++ queue_io(wb, wbc->older_than_this); ++ ++ while (!list_empty(&wb->b_io)) { ++ struct inode *inode = list_entry(wb->b_io.prev, ++ struct inode, i_list); ++ struct super_block *sb = inode->i_sb; ++ enum sb_pin_state state; ++ ++ if (wbc->sb && sb != wbc->sb) { ++ /* super block given and doesn't ++ match, skip this inode */ ++ redirty_tail(inode); ++ continue; ++ } ++ state = pin_sb_for_writeback(wbc, sb); ++ ++ if (state == SB_PIN_FAILED) { ++ requeue_io(inode); ++ continue; ++ } ++ if (sb->s_op->writeback_inodes) ++ ret = sb->s_op->writeback_inodes(sb, wb, wbc); ++ else ++ ret = generic_writeback_sb_inodes(sb, wb, wbc); ++ ++ if (state == SB_PINNED) ++ unpin_sb_for_writeback(sb); ++ if (ret) ++ break; ++ } + spin_unlock(&inode_lock); + /* Leave any unwritten inodes on b_io */ + } +@@ -687,6 +728,7 @@ + + writeback_inodes_wb(&bdi->wb, wbc); + } ++EXPORT_SYMBOL(writeback_inodes_wbc); + + /* + * The maximum number of pages to writeout in a single bdi flush/kupdate +@@ -1272,3 +1314,12 @@ + return ret; + } + EXPORT_SYMBOL(sync_inode); ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/inode.c linux-2.6.33/fs/inode.c +--- linux-2.6.33.orig/fs/inode.c 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/fs/inode.c 2010-03-04 19:33:22.000000000 +0100 +@@ -85,6 +85,7 @@ + * the i_state of an inode while it is in use.. 
+ */ + DEFINE_SPINLOCK(inode_lock); ++EXPORT_SYMBOL_GPL(inode_lock); + + /* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages +diff -urN linux-2.6.33.orig/fs/Kconfig linux-2.6.33/fs/Kconfig +--- linux-2.6.33.orig/fs/Kconfig 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/fs/Kconfig 2010-03-04 19:33:22.000000000 +0100 +@@ -27,6 +27,7 @@ + default y if EXT4_FS=y && EXT4_FS_XATTR + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + ++source "fs/reiser4/Kconfig" + source "fs/reiserfs/Kconfig" + source "fs/jfs/Kconfig" + +diff -urN linux-2.6.33.orig/fs/Makefile linux-2.6.33/fs/Makefile +--- linux-2.6.33.orig/fs/Makefile 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/fs/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -65,6 +65,7 @@ + # Do not add any filesystems before this line + obj-$(CONFIG_FSCACHE) += fscache/ + obj-$(CONFIG_REISERFS_FS) += reiserfs/ ++obj-$(CONFIG_REISER4_FS) += reiser4/ + obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 + obj-$(CONFIG_EXT2_FS) += ext2/ + # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 +diff -urN linux-2.6.33.orig/fs/reiser4/as_ops.c linux-2.6.33/fs/reiser4/as_ops.c +--- linux-2.6.33.orig/fs/reiser4/as_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/as_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,337 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Interface to VFS. Reiser4 address_space_operations are defined here. */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/file/file.h" ++#include "plugin/security/perm.h" ++#include "plugin/disk_format/disk_format.h" ++#include "plugin/plugin.h" ++#include "plugin/plugin_set.h" ++#include "plugin/object.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "page_cache.h" ++#include "ktxnmgrd.h" ++#include "super.h" ++#include "reiser4.h" ++#include "entd.h" ++ ++#include <linux/profile.h> ++#include <linux/types.h> ++#include <linux/mount.h> ++#include <linux/vfs.h> ++#include <linux/mm.h> ++#include <linux/buffer_head.h> ++#include <linux/dcache.h> ++#include <linux/list.h> ++#include <linux/pagemap.h> ++#include <linux/slab.h> ++#include <linux/seq_file.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/writeback.h> ++#include <linux/backing-dev.h> ++#include <linux/quotaops.h> ++#include <linux/security.h> ++ ++/* address space operations */ ++ ++/** ++ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting ++ * @page: page to be dirtied ++ * ++ * Operation of struct address_space_operations. This implementation is used by ++ * unix and cryptcompress file plugins. ++ * ++ * This is called when reiser4 page gets dirtied outside of reiser4, for ++ * example, when dirty bit is moved from pte to physical page. ++ * ++ * Tags page in the mapping's page tree with special tag so that it is possible ++ * to do all the reiser4 specific work wrt dirty pages (jnode creation, ++ * capturing by an atom) later because it can not be done in the contexts where ++ * set_page_dirty is called. 
++ */
++int reiser4_set_page_dirty(struct page *page)
++{
++	/* this page can be unformatted only */
++	assert("vs-1734", (page->mapping &&
++			   page->mapping->host &&
++			   reiser4_get_super_fake(page->mapping->host->i_sb) !=
++			   page->mapping->host &&
++			   reiser4_get_cc_fake(page->mapping->host->i_sb) !=
++			   page->mapping->host &&
++			   reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
++			   page->mapping->host));
++	return __set_page_dirty_nobuffers(page);
++}
++
++/* ->invalidatepage method for reiser4 */
++
++/*
++ * this is called for each truncated page from
++ * truncate_inode_pages()->truncate_{complete,partial}_page().
++ *
++ * At the moment of call, page is under lock, and outstanding io (if any) has
++ * completed.
++ */
++
++/**
++ * reiser4_invalidatepage
++ * @page: page to invalidate
++ * @offset: starting offset for partial invalidation
++ *
++ */
++void reiser4_invalidatepage(struct page *page, unsigned long offset)
++{
++	int ret = 0;
++	reiser4_context *ctx;
++	struct inode *inode;
++	jnode *node;
++
++	/*
++	 * This is called to truncate file's page.
++	 *
++	 * Originally, reiser4 implemented truncate in a standard way
++	 * (vmtruncate() calls ->invalidatepage() on all truncated pages
++	 * first, then file system ->truncate() call-back is invoked).
++	 *
++	 * This led to the problem when ->invalidatepage() was called on a
++	 * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
++	 * process. That is, truncate was bypassing transactions. To avoid
++	 * this, try_capture_page_to_invalidate() call was added here.
++	 *
++	 * After many troubles with vmtruncate() based truncate (including
++	 * races with flush, tail conversion, etc.) it was re-written in the
++	 * top-to-bottom style: items are killed in reiser4_cut_tree_object()
++	 * and pages belonging to extent are invalidated in kill_hook_extent().
++	 * So probably now additional call to capture is not needed here.
++	 */
++
++	assert("nikita-3137", PageLocked(page));
++	assert("nikita-3138", !PageWriteback(page));
++	inode = page->mapping->host;
++
++	/*
++	 * ->invalidatepage() should only be called for the unformatted
++	 * jnodes. Destruction of all other types of jnodes is performed
++	 * separately. But, during some corner cases (like handling errors
++	 * during mount) it is simpler to let ->invalidatepage to be called on
++	 * them. Check for this, and do nothing.
++	 */
++	if (reiser4_get_super_fake(inode->i_sb) == inode)
++		return;
++	if (reiser4_get_cc_fake(inode->i_sb) == inode)
++		return;
++	if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
++		return;
++	assert("vs-1426", PagePrivate(page));
++	assert("vs-1427",
++	       page->mapping == jnode_get_mapping(jnode_by_page(page)));
++	assert("", jprivate(page) != NULL);
++	assert("", ergo(inode_file_plugin(inode) !=
++			file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
++			offset == 0));
++
++	ctx = reiser4_init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return;
++
++	node = jprivate(page);
++	spin_lock_jnode(node);
++	if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
++			     (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
++		/* there is no need to capture */
++		jref(node);
++		JF_SET(node, JNODE_HEARD_BANSHEE);
++		page_clear_jnode(page, node);
++		reiser4_uncapture_jnode(node);
++		unhash_unformatted_jnode(node);
++		jput(node);
++		reiser4_exit_context(ctx);
++		return;
++	}
++	spin_unlock_jnode(node);
++
++	/* capture page being truncated. */
++	ret = try_capture_page_to_invalidate(page);
++	if (ret != 0)
++		warning("nikita-3141", "Cannot capture: %i", ret);
++
++	if (offset == 0) {
++		/* remove jnode from transaction and detach it from page. */
++		jref(node);
++		JF_SET(node, JNODE_HEARD_BANSHEE);
++		/* page cannot be detached from jnode concurrently, because it
++		 * is locked */
++		reiser4_uncapture_page(page);
++
++		/* this detaches page from jnode, so that jdelete will not try
++		 * to lock page which is already locked */
++		spin_lock_jnode(node);
++		page_clear_jnode(page, node);
++		spin_unlock_jnode(node);
++		unhash_unformatted_jnode(node);
++
++		jput(node);
++	}
++
++	reiser4_exit_context(ctx);
++}
++
++/* helper function called from reiser4_releasepage(). It returns true if jnode
++ * can be detached from its page and the page released. */
++int jnode_is_releasable(jnode * node/* node to check */)
++{
++	assert("nikita-2781", node != NULL);
++	assert_spin_locked(&(node->guard));
++	assert_spin_locked(&(node->load));
++
++	/* if some thread is currently using the jnode page, the latter cannot
++	 * be detached */
++	if (atomic_read(&node->d_count) != 0)
++		return 0;
++
++	assert("vs-1214", !jnode_is_loaded(node));
++
++	/*
++	 * can only release page if real block number is assigned to it. Simple
++	 * check for ->atom wouldn't do, because it is possible for node to be
++	 * clean, not in an atom yet, and still having a fake block number. For
++	 * example, node just created in jinit_new().
++	 */
++	if (reiser4_blocknr_is_fake(jnode_get_block(node)))
++		return 0;
++
++	/*
++	 * pages prepared for write cannot be released anyway, so avoid
++	 * detaching jnode from the page
++	 */
++	if (JF_ISSET(node, JNODE_WRITE_PREPARED))
++		return 0;
++
++	/*
++	 * dirty jnode cannot be released. It can however be submitted to disk
++	 * as part of early flushing, but only after getting flush-prepped.
++	 */
++	if (JF_ISSET(node, JNODE_DIRTY))
++		return 0;
++
++	/* overwrite set is only written by log writer. */
++	if (JF_ISSET(node, JNODE_OVRWR))
++		return 0;
++
++	/* jnode is already under writeback */
++	if (JF_ISSET(node, JNODE_WRITEBACK))
++		return 0;
++
++	/* don't flush bitmaps or journal records */
++	if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
++		return 0;
++
++	return 1;
++}
++
++/*
++ * ->releasepage method for reiser4
++ *
++ * This is called by the VM scanner when it comes across a clean page. What we
++ * have to do here is to check whether the page can really be released (freed
++ * that is) and if so, detach the jnode from it and remove the page from the
++ * page cache.
++ *
++ * The check for releasability is done by jnode_is_releasable().
++ */
++int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
++{
++	jnode *node;
++
++	assert("nikita-2257", PagePrivate(page));
++	assert("nikita-2259", PageLocked(page));
++	assert("nikita-2892", !PageWriteback(page));
++	assert("nikita-3019", reiser4_schedulable());
++
++	/* NOTE-NIKITA: this can be called in the context of reiser4 call. It
++	   is not clear what to do in this case. A lot of deadlocks seem to be
++	   possible. */
++
++	node = jnode_by_page(page);
++	assert("nikita-2258", node != NULL);
++	assert("reiser4-4", page->mapping != NULL);
++	assert("reiser4-5", page->mapping->host != NULL);
++
++	if (PageDirty(page))
++		return 0;
++
++	/* extra page reference is used by reiser4 to protect
++	 * jnode<->page link from this ->releasepage(). 
*/ ++ if (page_count(page) > 3) ++ return 0; ++ ++ /* releasable() needs jnode lock, because it looks at the jnode fields ++ * and we need jload_lock here to avoid races with jload(). */ ++ spin_lock_jnode(node); ++ spin_lock(&(node->load)); ++ if (jnode_is_releasable(node)) { ++ struct address_space *mapping; ++ ++ mapping = page->mapping; ++ jref(node); ++ /* there is no need to synchronize against ++ * jnode_extent_write() here, because pages seen by ++ * jnode_extent_write() are !releasable(). */ ++ page_clear_jnode(page, node); ++ spin_unlock(&(node->load)); ++ spin_unlock_jnode(node); ++ ++ /* we are under memory pressure so release jnode also. */ ++ jput(node); ++ ++ return 1; ++ } else { ++ spin_unlock(&(node->load)); ++ spin_unlock_jnode(node); ++ assert("nikita-3020", reiser4_schedulable()); ++ return 0; ++ } ++} ++ ++int reiser4_readpage(struct file *file, struct page *page) ++{ ++ assert("edward-1533", PageLocked(page)); ++ assert("edward-1534", !PageUptodate(page)); ++ assert("edward-1535", page->mapping && page->mapping->host); ++ ++ return inode_file_plugin(page->mapping->host)->readpage(file, page); ++} ++ ++int reiser4_readpages(struct file *file, struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ return inode_file_plugin(mapping->host)->readpages(file, mapping, ++ pages, nr_pages); ++} ++ ++int reiser4_writepages(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ return inode_file_plugin(mapping->host)->writepages(mapping, wbc); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/block_alloc.c linux-2.6.33/fs/reiser4/block_alloc.c +--- linux-2.6.33.orig/fs/reiser4/block_alloc.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/block_alloc.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1142 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++reiser4/README */ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "super.h" ++ ++#include <linux/types.h> /* for __u?? */ ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/spinlock.h> ++ ++/* THE REISER4 DISK SPACE RESERVATION SCHEME. */ ++ ++/* We need to be able to reserve enough disk space to ensure that an atomic ++ operation will have enough disk space to flush (see flush.c and ++ http://namesys.com/v4/v4.html) and commit it once it is started. ++ ++ In our design a call for reserving disk space may fail but not an actual ++ block allocation. ++ ++ All free blocks, already allocated blocks, and all kinds of reserved blocks ++ are counted in different per-fs block counters. 
++
++ A reiser4 super block's set of block counters currently is:
++
++ free -- free blocks,
++ used -- already allocated blocks,
++
++ grabbed -- initially reserved for performing an fs operation, those blocks
++	 are taken from free blocks, then grabbed disk space leaks from grabbed
++	 blocks counter to other counters like "fake allocated", "flush
++	 reserved", "used", the rest of not used grabbed space is returned to
++	 free space at the end of fs operation;
++
++ fake allocated -- counts all nodes without real disk block numbers assigned,
++		    we have separate accounting for formatted and unformatted
++		    nodes (for easier debugging);
++
++ flush reserved -- disk space needed for flushing and committing an atom.
++		    Each dirty already allocated block could be written as a
++		    part of atom's overwrite set or as a part of atom's
++		    relocate set. In both cases one additional block is needed,
++		    it is used as a wandered block if we do overwrite or as a
++		    new location for a relocated block.
++
++ In addition, blocks in some states are counted on a per-thread and per-atom
++ basis. A reiser4 context has a counter of blocks grabbed by this transaction
++ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
++ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
++ blocks, which are reserved for flush processing and atom commit. */
++
++/* AN EXAMPLE: suppose we insert a new item into the reiser4 tree. We estimate
++ the number of blocks to grab for the most expensive case of balancing, when
++ the leaf node we insert the new item to gets split and a new leaf node is
++ allocated.
++
++ So, we need to grab blocks for
++
++ 1) one block for possibly dirtying the node we insert an item to. That block
++ would be used for node relocation at flush time or for allocating of a
++ wandered one, depending on the result (which set, relocate or overwrite,
++ the node gets assigned to) of the node's processing by the flush
++ algorithm.
++
++ 2) one block for either allocating a new node, or dirtying the right or left
++ clean neighbor; only one case may happen.
++
++ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying
++ of left neighbor, right neighbor, current node, and creation of new node.
++ Have I forgotten something? email me.
++
++ These grabbed blocks are counted in both the reiser4 context "grabbed blocks"
++ counter and in the fs-wide one (both ctx->grabbed_blocks and
++ sbinfo->blocks_grabbed get incremented by 2), and the sb's free blocks
++ counter is decremented by 2.
++
++ Suppose both blocks were spent for dirtying of an already allocated clean
++ node (one block went from "grabbed" to "flush reserved") and for new block
++ allocation (one block went from "grabbed" to "fake allocated formatted").
++
++ Inserting a child pointer into the parent node caused the parent node to be
++ split; the balancing code takes care of this by grabbing the necessary space
++ immediately, calling reiser4_grab with the BA_RESERVED flag set, which means
++ "can use the 5% reserved disk space".
++
++ At this moment insertion completes and grabbed blocks (if they were not used)
++ should be returned to the free space counter.
++
++ However the atom life-cycle is not completed. The atom had one "flush
++ reserved" block added by our insertion and the new fake allocated node is
++ counted as a "fake allocated formatted" one. The atom has to be fully
++ processed by flush before commit.
++ Suppose that the flush moved the first,
++ already allocated node to the atom's overwrite list, while the new fake
++ allocated node, obviously, went into the atom's relocate set. The reiser4
++ flush allocates the new node using one unit from the "fake allocated
++ formatted" counter, and the log writer uses one from "flush reserved" for
++ wandered block allocation.
++
++ And it is not the end. When the wandered block is deallocated after the
++ atom gets fully played (see wander.c for term description), the disk space
++ occupied by it is returned to free blocks. */
++
++/* BLOCK NUMBERS */
++
++/* Any reiser4 node has a block number assigned to it. We use these numbers for
++ indexing in hash tables, so if a block has not yet been assigned a location
++ on disk we need to give it a temporary fake block number.
++
++ The current implementation of reiser4 uses 64-bit integers for block numbers.
++ We use the highest bit of the 64-bit block number to distinguish fake and
++ real block numbers, so only 63 bits may be used for addressing real device
++ blocks. The "fake" block number space is divided into subspaces of fake
++ block numbers for data blocks and for shadow (working) bitmap blocks.
++
++ Fake block numbers for data blocks are generated by a cyclic counter, which
++ gets incremented after each real block allocation. We assume that it is
++ impossible to overload this counter during one transaction life. */
++
++/* Initialize a blocknr hint. */
++void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
++{
++	memset(hint, 0, sizeof(reiser4_blocknr_hint));
++}
++
++/* Release any resources of a blocknr hint. */
++void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
++{
++/* No resources should be freed in current blocknr_hint implementation. */
++}
++
++/* see above for explanation of fake block number. */
++/* Audited by: green(2002.06.11) */
++int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
++{
++	/* The reason for not simply returning the result of the '&' operation
++	   is that while the return value is a (possibly 32-bit) int, the
++	   reiser4_block_nr is at least 64 bits long, and the high bit (which
++	   is the only possible non-zero bit after the masking) would be
++	   stripped off. */
++	return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
++}
++
++/* Static functions for <reiser4 super block>/<reiser4 context> block counters
++   arithmetic. Mostly, they are isolated to avoid coding the same assertions
++   in several places. */
++static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
++{
++	BUG_ON(ctx->grabbed_blocks < count);
++	assert("zam-527", ctx->grabbed_blocks >= count);
++	ctx->grabbed_blocks -= count;
++}
++
++static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
++{
++	ctx->grabbed_blocks += count;
++}
++
++static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
++{
++	assert("zam-525", sbinfo->blocks_grabbed >= count);
++	sbinfo->blocks_grabbed -= count;
++}
++
++/* Decrease the counter of blocks reserved for flush in the super block. 
*/ ++static void ++sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ assert("vpf-291", sbinfo->blocks_flush_reserved >= count); ++ sbinfo->blocks_flush_reserved -= count; ++} ++ ++static void ++sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, ++ reiser4_ba_flags_t flags) ++{ ++ if (flags & BA_FORMATTED) { ++ assert("zam-806", sbinfo->blocks_fake_allocated >= count); ++ sbinfo->blocks_fake_allocated -= count; ++ } else { ++ assert("zam-528", ++ sbinfo->blocks_fake_allocated_unformatted >= count); ++ sbinfo->blocks_fake_allocated_unformatted -= count; ++ } ++} ++ ++static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ assert("zam-530", ++ sbinfo->blocks_used >= count + sbinfo->min_blocks_used); ++ sbinfo->blocks_used -= count; ++} ++ ++static void ++sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ assert("edward-501", sbinfo->blocks_clustered >= count); ++ sbinfo->blocks_clustered -= count; ++} ++ ++/* Increase the counter of block reserved for flush in atom. */ ++static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) ++{ ++ assert("zam-772", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ atom->flush_reserved += count; ++} ++ ++/* Decrease the counter of block reserved for flush in atom. */ ++static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) ++{ ++ assert("zam-774", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-2790", atom->flush_reserved >= count); ++ atom->flush_reserved -= count; ++} ++ ++/* super block has 6 counters: free, used, grabbed, fake allocated ++ (formatted and unformatted) and flush reserved. Their sum must be ++ number of blocks on a device. This function checks this */ ++int reiser4_check_block_counters(const struct super_block *super) ++{ ++ __u64 sum; ++ ++ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) + ++ reiser4_data_blocks(super) + reiser4_fake_allocated(super) + ++ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) + ++ reiser4_clustered_blocks(super); ++ if (reiser4_block_count(super) != sum) { ++ printk("super block counters: " ++ "used %llu, free %llu, " ++ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), " ++ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n", ++ (unsigned long long)reiser4_data_blocks(super), ++ (unsigned long long)reiser4_free_blocks(super), ++ (unsigned long long)reiser4_grabbed_blocks(super), ++ (unsigned long long)reiser4_fake_allocated(super), ++ (unsigned long long) ++ reiser4_fake_allocated_unformatted(super), ++ (unsigned long long)reiser4_flush_reserved(super), ++ (unsigned long long)reiser4_clustered_blocks(super), ++ (unsigned long long)sum, ++ (unsigned long long)reiser4_block_count(super)); ++ return 0; ++ } ++ return 1; ++} ++ ++/* Adjust "working" free blocks counter for number of blocks we are going to ++ allocate. Record number of grabbed blocks in fs-wide and per-thread ++ counters. This function should be called before bitmap scanning or ++ allocating fake block numbers ++ ++ @super -- pointer to reiser4 super block; ++ @count -- number of blocks we reserve; ++ ++ @return -- 0 if success, -ENOSPC, if all ++ free blocks are preserved or already allocated. 
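++
++ An illustrative sketch (not part of this patch): callers normally go
++ through the reiser4_grab_space() wrapper defined below, e.g.
++
++	if (reiser4_grab_space(estimated_blocks, BA_CAN_COMMIT))
++		return RETERR(-ENOSPC);
++
++ where estimated_blocks is a caller-supplied upper bound on the number
++ of blocks the operation may dirty or allocate.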
++*/
++
++static int
++reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
++{
++	__u64 free_blocks;
++	int ret = 0, use_reserved = flags & BA_RESERVED;
++	reiser4_super_info_data *sbinfo;
++
++	assert("vs-1276", ctx == get_current_context());
++
++	/* Do not grab anything on ro-mounted fs. */
++	if (rofs_super(ctx->super)) {
++		ctx->grab_enabled = 0;
++		return 0;
++	}
++
++	sbinfo = get_super_private(ctx->super);
++
++	spin_lock_reiser4_super(sbinfo);
++
++	free_blocks = sbinfo->blocks_free;
++
++	if ((use_reserved && free_blocks < count) ||
++	    (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
++		ret = RETERR(-ENOSPC);
++		goto unlock_and_ret;
++	}
++
++	add_to_ctx_grabbed(ctx, count);
++
++	sbinfo->blocks_grabbed += count;
++	sbinfo->blocks_free -= count;
++
++#if REISER4_DEBUG
++	if (ctx->grabbed_initially == 0)
++		ctx->grabbed_initially = count;
++#endif
++
++	assert("nikita-2986", reiser4_check_block_counters(ctx->super));
++
++	/* disable grab space in current context */
++	ctx->grab_enabled = 0;
++
++unlock_and_ret:
++	spin_unlock_reiser4_super(sbinfo);
++
++	return ret;
++}
++
++int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
++{
++	int ret;
++	reiser4_context *ctx;
++
++	assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
++				   lock_stack_isclean(get_current_lock_stack
++						      ())));
++	ctx = get_current_context();
++	if (!(flags & BA_FORCE) && !is_grab_enabled(ctx))
++		return 0;
++
++	ret = reiser4_grab(ctx, count, flags);
++	if (ret == -ENOSPC) {
++
++		/* Try to commit all transactions if the BA_CAN_COMMIT flag is
++		   present */
++		if (flags & BA_CAN_COMMIT) {
++			txnmgr_force_commit_all(ctx->super, 0);
++			ctx->grab_enabled = 1;
++			ret = reiser4_grab(ctx, count, flags);
++		}
++	}
++	/*
++	 * allocation from the reserved pool cannot fail. This is a severe
++	 * error.
++	 */
++	assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
++	return ret;
++}
++
++/*
++ * SPACE RESERVED FOR UNLINK/TRUNCATE
++ *
++ * Unlink and truncate require space in transaction (to update stat data, at
++ * least). But we don't want rm(1) to fail with "No space left on device"
++ * error.
++ *
++ * Solution is to reserve 5% of disk space for truncates and
++ * unlinks. Specifically, normal space grabbing requests don't grab space from
++ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
++ * drain it. A per-super-block delete mutex is used to allow only one
++ * thread at a time to grab from reserved area.
++ *
++ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
++ * flag.
++ *
++ */
++
++int reiser4_grab_reserved(struct super_block *super,
++			  __u64 count, reiser4_ba_flags_t flags)
++{
++	reiser4_super_info_data *sbinfo = get_super_private(super);
++
++	assert("nikita-3175", flags & BA_CAN_COMMIT);
++
++	/* Check whether the delete mutex is already taken by us; we assume
++	 * that reading a machine word is atomic. 
*/ ++ if (sbinfo->delete_mutex_owner == current) { ++ if (reiser4_grab_space ++ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) { ++ warning("zam-1003", ++ "nested call of grab_reserved fails count=(%llu)", ++ (unsigned long long)count); ++ reiser4_release_reserved(super); ++ return RETERR(-ENOSPC); ++ } ++ return 0; ++ } ++ ++ if (reiser4_grab_space(count, flags)) { ++ mutex_lock(&sbinfo->delete_mutex); ++ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL); ++ sbinfo->delete_mutex_owner = current; ++ ++ if (reiser4_grab_space(count, flags | BA_RESERVED)) { ++ warning("zam-833", ++ "reserved space is not enough (%llu)", ++ (unsigned long long)count); ++ reiser4_release_reserved(super); ++ return RETERR(-ENOSPC); ++ } ++ } ++ return 0; ++} ++ ++void reiser4_release_reserved(struct super_block *super) ++{ ++ reiser4_super_info_data *info; ++ ++ info = get_super_private(super); ++ if (info->delete_mutex_owner == current) { ++ info->delete_mutex_owner = NULL; ++ mutex_unlock(&info->delete_mutex); ++ } ++} ++ ++static reiser4_super_info_data *grabbed2fake_allocated_head(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sub_from_ctx_grabbed(ctx, count); ++ ++ sbinfo = get_super_private(ctx->super); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ /* return sbinfo locked */ ++ return sbinfo; ++} ++ ++/* is called after @count fake block numbers are allocated and pointer to ++ those blocks are inserted into tree. */ ++static void grabbed2fake_allocated_formatted(void) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = grabbed2fake_allocated_head(1); ++ sbinfo->blocks_fake_allocated++; ++ ++ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/** ++ * grabbed2fake_allocated_unformatted ++ * @count: ++ * ++ */ ++static void grabbed2fake_allocated_unformatted(int count) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = grabbed2fake_allocated_head(count); ++ sbinfo->blocks_fake_allocated_unformatted += count; ++ ++ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void grabbed2cluster_reserved(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sub_from_ctx_grabbed(ctx, count); ++ ++ sbinfo = get_super_private(ctx->super); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ sbinfo->blocks_clustered += count; ++ ++ assert("edward-504", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void cluster_reserved2grabbed(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ ++ sbinfo = get_super_private(ctx->super); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_cluster_reserved(sbinfo, count); ++ sbinfo->blocks_grabbed += count; ++ ++ assert("edward-505", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++ add_to_ctx_grabbed(ctx, count); ++} ++ ++void cluster_reserved2free(int count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ cluster_reserved2grabbed(count); ++ grabbed2free(ctx, sbinfo, count); ++} ++ ++static DEFINE_SPINLOCK(fake_lock); ++static reiser4_block_nr fake_gen = 0; ++ ++/** ++ * assign_fake_blocknr ++ * 
@blocknr: ++ * @count: ++ * ++ * Obtain a fake block number for new node which will be used to refer to ++ * this newly allocated node until real allocation is done. ++ */ ++static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count) ++{ ++ spin_lock(&fake_lock); ++ *blocknr = fake_gen; ++ fake_gen += count; ++ spin_unlock(&fake_lock); ++ ++ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK); ++ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/ ++ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE; ++ assert("zam-394", zlook(current_tree, blocknr) == NULL); ++} ++ ++int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr) ++{ ++ assign_fake_blocknr(blocknr, 1); ++ grabbed2fake_allocated_formatted(); ++ return 0; ++} ++ ++/** ++ * fake_blocknrs_unformatted ++ * @count: number of fake numbers to get ++ * ++ * Allocates @count fake block numbers which will be assigned to jnodes ++ */ ++reiser4_block_nr fake_blocknr_unformatted(int count) ++{ ++ reiser4_block_nr blocknr; ++ ++ assign_fake_blocknr(&blocknr, count); ++ grabbed2fake_allocated_unformatted(count); ++ ++ return blocknr; ++} ++ ++/* adjust sb block counters, if real (on-disk) block allocation immediately ++ follows grabbing of free disk space. */ ++static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, ++ __u64 count) ++{ ++ sub_from_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ sbinfo->blocks_used += count; ++ ++ assert("nikita-2679", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* adjust sb block counters when @count unallocated blocks get mapped to disk */ ++static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, ++ reiser4_ba_flags_t flags) ++{ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_fake_allocated(sbinfo, count, flags); ++ sbinfo->blocks_used += count; ++ ++ assert("nikita-2680", ++ reiser4_check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++static void flush_reserved2used(txn_atom * atom, __u64 count) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ assert("zam-787", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); ++ ++ sbinfo = get_current_super_private(); ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_flush_reserved(sbinfo, count); ++ sbinfo->blocks_used += count; ++ ++ assert("zam-789", ++ reiser4_check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* update the per fs blocknr hint default value. */ ++void ++update_blocknr_hint_default(const struct super_block *s, ++ const reiser4_block_nr * block) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ ++ assert("nikita-3342", !reiser4_blocknr_is_fake(block)); ++ ++ spin_lock_reiser4_super(sbinfo); ++ if (*block < sbinfo->block_count) { ++ sbinfo->blocknr_hint_default = *block; ++ } else { ++ warning("zam-676", ++ "block number %llu is too large to be used in a blocknr hint\n", ++ (unsigned long long)*block); ++ dump_stack(); ++ DEBUGON(1); ++ } ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* get current value of the default blocknr hint. 
*/ ++void get_blocknr_hint_default(reiser4_block_nr * result) ++{ ++ reiser4_super_info_data *sbinfo = get_current_super_private(); ++ ++ spin_lock_reiser4_super(sbinfo); ++ *result = sbinfo->blocknr_hint_default; ++ assert("zam-677", *result < sbinfo->block_count); ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* Allocate "real" disk blocks by calling a proper space allocation plugin ++ * method. Blocks are allocated in one contiguous disk region. The plugin ++ * independent part accounts blocks by subtracting allocated amount from grabbed ++ * or fake block counter and add the same amount to the counter of allocated ++ * blocks. ++ * ++ * @hint -- a reiser4 blocknr hint object which contains further block ++ * allocation hints and parameters (search start, a stage of block ++ * which will be mapped to disk, etc.), ++ * @blk -- an out parameter for the beginning of the allocated region, ++ * @len -- in/out parameter, it should contain the maximum number of allocated ++ * blocks, after block allocation completes, it contains the length of ++ * allocated disk region. ++ * @flags -- see reiser4_ba_flags_t description. ++ * ++ * @return -- 0 if success, error code otherwise. ++ */ ++int ++reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk, ++ reiser4_block_nr * len, reiser4_ba_flags_t flags) ++{ ++ __u64 needed = *len; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ int ret; ++ ++ assert("zam-986", hint != NULL); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ /* For write-optimized data we use default search start value, which is ++ * close to last write location. */ ++ if (flags & BA_USE_DEFAULT_SEARCH_START) ++ get_blocknr_hint_default(&hint->blk); ++ ++ /* VITALY: allocator should grab this for internal/tx-lists/similar ++ only. 
*/ ++/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)?*/ ++ if (hint->block_stage == BLOCK_NOT_COUNTED) { ++ ret = reiser4_grab_space_force(*len, flags); ++ if (ret != 0) ++ return ret; ++ } ++ ++ ret = ++ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super), ++ hint, (int)needed, blk, len); ++ ++ if (!ret) { ++ assert("zam-680", *blk < reiser4_block_count(ctx->super)); ++ assert("zam-681", ++ *blk + *len <= reiser4_block_count(ctx->super)); ++ ++ if (flags & BA_PERMANENT) { ++ /* we assume that current atom exists at this moment */ ++ txn_atom *atom = get_current_atom_locked(); ++ atom->nr_blocks_allocated += *len; ++ spin_unlock_atom(atom); ++ } ++ ++ switch (hint->block_stage) { ++ case BLOCK_NOT_COUNTED: ++ case BLOCK_GRABBED: ++ grabbed2used(ctx, sbinfo, *len); ++ break; ++ case BLOCK_UNALLOCATED: ++ fake_allocated2used(sbinfo, *len, flags); ++ break; ++ case BLOCK_FLUSH_RESERVED: ++ { ++ txn_atom *atom = get_current_atom_locked(); ++ flush_reserved2used(atom, *len); ++ spin_unlock_atom(atom); ++ } ++ break; ++ default: ++ impossible("zam-531", "wrong block stage"); ++ } ++ } else { ++ assert("zam-821", ++ ergo(hint->max_dist == 0 ++ && !hint->backward, ret != -ENOSPC)); ++ if (hint->block_stage == BLOCK_NOT_COUNTED) ++ grabbed2free(ctx, sbinfo, needed); ++ } ++ ++ return ret; ++} ++ ++/* used -> fake_allocated -> grabbed -> free */ ++ ++/* adjust sb block counters when @count unallocated blocks get unmapped from ++ disk */ ++static void ++used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, ++ int formatted) ++{ ++ spin_lock_reiser4_super(sbinfo); ++ ++ if (formatted) ++ sbinfo->blocks_fake_allocated += count; ++ else ++ sbinfo->blocks_fake_allocated_unformatted += count; ++ ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2681", ++ reiser4_check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++static void ++used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom, ++ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG) ++{ ++ assert("nikita-2791", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ add_to_atom_flush_reserved_nolock(atom, (__u32) count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_flush_reserved += count; ++ /*add_to_sb_flush_reserved(sbinfo, count); */ ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2681", ++ reiser4_check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* disk space, virtually used by fake block numbers is counted as "grabbed" ++ again. 
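++ (This is one step of the reverse path sketched above: used ->
++ fake_allocated -> grabbed -> free. fake_allocated2free() below simply
++ composes this function with grabbed2free().)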
*/ ++static void ++fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, ++ __u64 count, reiser4_ba_flags_t flags) ++{ ++ add_to_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ assert("nikita-2682", reiser4_check_block_counters(ctx->super)); ++ ++ sbinfo->blocks_grabbed += count; ++ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); ++ ++ assert("nikita-2683", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ fake_allocated2grabbed(ctx, sbinfo, count, flags); ++ grabbed2free(ctx, sbinfo, count); ++} ++ ++void grabbed2free_mark(__u64 mark) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ assert("nikita-3007", (__s64) mark >= 0); ++ assert("nikita-3006", ctx->grabbed_blocks >= mark); ++ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark); ++} ++ ++/** ++ * grabbed2free - adjust grabbed and free block counters ++ * @ctx: context to update grabbed block counter of ++ * @sbinfo: super block to update grabbed and free block counters of ++ * @count: number of blocks to adjust counters by ++ * ++ * Decreases context's and per filesystem's counters of grabbed ++ * blocks. Increases per filesystem's counter of free blocks. ++ */ ++void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo, ++ __u64 count) ++{ ++ sub_from_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sub_from_sb_grabbed(sbinfo, count); ++ sbinfo->blocks_free += count; ++ assert("nikita-2684", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("vs-1095", atom); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ sub_from_ctx_grabbed(ctx, count); ++ ++ add_to_atom_flush_reserved_nolock(atom, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_flush_reserved += count; ++ sub_from_sb_grabbed(sbinfo, count); ++ ++ assert("vpf-292", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++void grabbed2flush_reserved(__u64 count) ++{ ++ txn_atom *atom = get_current_atom_locked(); ++ ++ grabbed2flush_reserved_nolock(atom, count); ++ ++ spin_unlock_atom(atom); ++} ++ ++void flush_reserved2grabbed(txn_atom * atom, __u64 count) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("nikita-2788", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ add_to_ctx_grabbed(ctx, count); ++ ++ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_grabbed += count; ++ sub_from_sb_flush_reserved(sbinfo, count); ++ ++ assert("vpf-292", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/** ++ * all_grabbed2free - releases all blocks grabbed in context ++ * ++ * Decreases context's and super block's grabbed block counters by number of ++ * blocks grabbed by current context and increases super block's free block ++ * counter 
correspondingly. ++ */ ++void all_grabbed2free(void) ++{ ++ reiser4_context *ctx = get_current_context(); ++ ++ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks); ++} ++ ++/* adjust sb block counters if real (on-disk) blocks do not become unallocated ++ after freeing, @count blocks become "grabbed". */ ++static void ++used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, ++ __u64 count) ++{ ++ add_to_ctx_grabbed(ctx, count); ++ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_grabbed += count; ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2685", reiser4_check_block_counters(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++/* this used to be done through used2grabbed and grabbed2free*/ ++static void used2free(reiser4_super_info_data * sbinfo, __u64 count) ++{ ++ spin_lock_reiser4_super(sbinfo); ++ ++ sbinfo->blocks_free += count; ++ sub_from_sb_used(sbinfo, count); ++ ++ assert("nikita-2685", ++ reiser4_check_block_counters(reiser4_get_current_sb())); ++ ++ spin_unlock_reiser4_super(sbinfo); ++} ++ ++#if REISER4_DEBUG ++ ++/* check "allocated" state of given block range */ ++static void ++reiser4_check_blocks(const reiser4_block_nr * start, ++ const reiser4_block_nr * len, int desired) ++{ ++ sa_check_blocks(start, len, desired); ++} ++ ++/* check "allocated" state of given block */ ++void reiser4_check_block(const reiser4_block_nr * block, int desired) ++{ ++ const reiser4_block_nr one = 1; ++ ++ reiser4_check_blocks(block, &one, desired); ++} ++ ++#endif ++ ++/* Blocks deallocation function may do an actual deallocation through space ++ plugin allocation or store deleted block numbers in atom's delete_set data ++ structure depend on @defer parameter. */ ++ ++/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks ++ which will be deleted from WORKING bitmap. 
They might be just unmapped from ++ disk, or freed but disk space is still grabbed by current thread, or these ++ blocks must not be counted in any reiser4 sb block counters, ++ see block_stage_t comment */ ++ ++/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to ++ distinguish blocks allocated for unformatted and formatted nodes */ ++ ++int ++reiser4_dealloc_blocks(const reiser4_block_nr * start, ++ const reiser4_block_nr * len, ++ block_stage_t target_stage, reiser4_ba_flags_t flags) ++{ ++ txn_atom *atom = NULL; ++ int ret; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ if (REISER4_DEBUG) { ++ assert("zam-431", *len != 0); ++ assert("zam-432", *start != 0); ++ assert("zam-558", !reiser4_blocknr_is_fake(start)); ++ ++ spin_lock_reiser4_super(sbinfo); ++ assert("zam-562", *start < sbinfo->block_count); ++ spin_unlock_reiser4_super(sbinfo); ++ } ++ ++ if (flags & BA_DEFER) { ++ blocknr_set_entry *bsep = NULL; ++ ++ /* storing deleted block numbers in a blocknr set ++ datastructure for further actual deletion */ ++ do { ++ atom = get_current_atom_locked(); ++ assert("zam-430", atom != NULL); ++ ++ ret = ++ blocknr_set_add_extent(atom, &atom->delete_set, ++ &bsep, start, len); ++ ++ if (ret == -ENOMEM) ++ return ret; ++ ++ /* This loop might spin at most two times */ ++ } while (ret == -E_REPEAT); ++ ++ assert("zam-477", ret == 0); ++ assert("zam-433", atom != NULL); ++ ++ spin_unlock_atom(atom); ++ ++ } else { ++ assert("zam-425", get_current_super_private() != NULL); ++ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super), ++ *start, *len); ++ ++ if (flags & BA_PERMANENT) { ++ /* These blocks were counted as allocated, we have to ++ * revert it back if allocation is discarded. 
*/ ++ txn_atom *atom = get_current_atom_locked(); ++ atom->nr_blocks_allocated -= *len; ++ spin_unlock_atom(atom); ++ } ++ ++ switch (target_stage) { ++ case BLOCK_NOT_COUNTED: ++ assert("vs-960", flags & BA_FORMATTED); ++ /* VITALY: This is what was grabbed for ++ internal/tx-lists/similar only */ ++ used2free(sbinfo, *len); ++ break; ++ ++ case BLOCK_GRABBED: ++ used2grabbed(ctx, sbinfo, *len); ++ break; ++ ++ case BLOCK_UNALLOCATED: ++ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED); ++ break; ++ ++ case BLOCK_FLUSH_RESERVED:{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ used2flush_reserved(sbinfo, atom, *len, ++ flags & BA_FORMATTED); ++ spin_unlock_atom(atom); ++ break; ++ } ++ default: ++ impossible("zam-532", "wrong block stage"); ++ } ++ } ++ ++ return 0; ++} ++ ++/* wrappers for block allocator plugin methods */ ++int reiser4_pre_commit_hook(void) ++{ ++ assert("zam-502", get_current_super_private() != NULL); ++ sa_pre_commit_hook(); ++ return 0; ++} ++ ++/* an actor which applies delete set to block allocator data */ ++static int ++apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, ++ const reiser4_block_nr * b, void *data UNUSED_ARG) ++{ ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ __u64 len = 1; ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT); ++ assert("zam-552", sbinfo != NULL); ++ ++ if (b != NULL) ++ len = *b; ++ ++ if (REISER4_DEBUG) { ++ spin_lock_reiser4_super(sbinfo); ++ ++ assert("zam-554", *a < reiser4_block_count(ctx->super)); ++ assert("zam-555", *a + len <= reiser4_block_count(ctx->super)); ++ ++ spin_unlock_reiser4_super(sbinfo); ++ } ++ ++ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len); ++ /* adjust sb block counters */ ++ used2free(sbinfo, len); ++ return 0; ++} ++ ++void reiser4_post_commit_hook(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT); ++ spin_unlock_atom(atom); ++ ++ /* do the block deallocation which was deferred ++ until commit is done */ ++ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1); ++ ++ assert("zam-504", get_current_super_private() != NULL); ++ sa_post_commit_hook(); ++} ++ ++void reiser4_post_write_back_hook(void) ++{ ++ assert("zam-504", get_current_super_private() != NULL); ++ ++ sa_post_commit_hook(); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/block_alloc.h linux-2.6.33/fs/reiser4/block_alloc.h +--- linux-2.6.33.orig/fs/reiser4/block_alloc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/block_alloc.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,177 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined(__FS_REISER4_BLOCK_ALLOC_H__) ++#define __FS_REISER4_BLOCK_ALLOC_H__ ++ ++#include "dformat.h" ++#include "forward.h" ++ ++#include <linux/types.h> /* for __u?? 
*/
++#include <linux/fs.h>
++
++/* Mask which, when applied to a given block number, shows whether that
++ block number is a fake one */
++#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
++/* Mask which isolates the type of object this fake block number was assigned
++ to */
++#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
++
++/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
++ against these two values to tell whether the object is unallocated or a
++ bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
++#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
++#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
++
++/* specification of how a block allocation was counted in sb block counters */
++typedef enum {
++ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
++ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
++ of this block */
++ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
++ BLOCK_UNALLOCATED = 3, /* block is used for an existing in-memory object
++ (unallocated formatted or unformatted
++ node) */
++ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
++ number assigned */
++} block_stage_t;
++
++/* a hint for the block allocator */
++struct reiser4_blocknr_hint {
++ /* FIXME: I think we want to add a longterm lock on the bitmap block
++ here. This is to prevent jnode_flush() calls from interleaving
++ allocations on the same bitmap, once a hint is established. */
++
++ /* search start hint */
++ reiser4_block_nr blk;
++ /* if not zero, it is a region size we search for free blocks in */
++ reiser4_block_nr max_dist;
++ /* level for allocation; it may be useful to have branch level and
++ higher write-optimized. */
++ tree_level level;
++ /* block allocator assumes that blocks, which will be mapped to disk,
++ are in this specified block_stage */
++ block_stage_t block_stage;
++ /* If direction = 1, allocate blocks in backward direction from the end
++ * of disk to the beginning of disk. */
++ unsigned int backward:1;
++
++};
++
++/* These flags control block allocation/deallocation behavior */
++enum reiser4_ba_flags {
++ /* do allocations from the reserved (5%) area */
++ BA_RESERVED = (1 << 0),
++
++ /* block allocator can do a commit, trying to recover free space */
++ BA_CAN_COMMIT = (1 << 1),
++
++ /* operation will be applied to a formatted block */
++ BA_FORMATTED = (1 << 2),
++
++ /* defer actual block freeing until transaction commit */
++ BA_DEFER = (1 << 3),
++
++ /* allocate blocks for permanent fs objects (formatted or unformatted),
++ not wandered or log blocks */
++ BA_PERMANENT = (1 << 4),
++
++ /* grab space even if it was disabled */
++ BA_FORCE = (1 << 5),
++
++ /* use default start value for free blocks search.
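
The masks above all live in the two top bits of a 64-bit block number. A small
user-space illustration (plain C, not kernel code; macro names shortened) shows
the intended decoding:

    #include <stdint.h>
    #include <stdio.h>

    #define FAKE_BIT_MASK 0x8000000000000000ULL /* REISER4_FAKE_BLOCKNR_BIT_MASK */
    #define STATUS_MASK   0xC000000000000000ULL /* REISER4_BLOCKNR_STATUS_BIT_MASK */
    #define UNALLOC_VALUE 0xC000000000000000ULL /* REISER4_UNALLOCATED_STATUS_VALUE */

    static const char *classify(uint64_t blk)
    {
            if (!(blk & FAKE_BIT_MASK))
                    return "real on-disk block";
            return ((blk & STATUS_MASK) == UNALLOC_VALUE)
                ? "fake: unallocated in-memory node"
                : "fake: working bitmap block";
    }

    int main(void)
    {
            printf("%s\n", classify(0x0000000000001000ULL));
            printf("%s\n", classify(0xC000000000000001ULL));
            printf("%s\n", classify(0x8000000000000001ULL));
            return 0;
    }
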
*/ ++ BA_USE_DEFAULT_SEARCH_START = (1 << 6) ++}; ++ ++typedef enum reiser4_ba_flags reiser4_ba_flags_t; ++ ++extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint); ++extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint); ++extern void update_blocknr_hint_default(const struct super_block *, ++ const reiser4_block_nr *); ++extern void get_blocknr_hint_default(reiser4_block_nr *); ++ ++extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super); ++ ++int assign_fake_blocknr_formatted(reiser4_block_nr *); ++reiser4_block_nr fake_blocknr_unformatted(int); ++ ++/* free -> grabbed -> fake_allocated -> used */ ++ ++int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags); ++void all_grabbed2free(void); ++void grabbed2free(reiser4_context * , reiser4_super_info_data * , __u64 count); ++void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags); ++void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count); ++void grabbed2flush_reserved(__u64 count); ++int reiser4_alloc_blocks(reiser4_blocknr_hint * hint, ++ reiser4_block_nr * start, ++ reiser4_block_nr * len, reiser4_ba_flags_t flags); ++int reiser4_dealloc_blocks(const reiser4_block_nr *, ++ const reiser4_block_nr *, ++ block_stage_t, reiser4_ba_flags_t flags); ++ ++static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint, ++ reiser4_block_nr * start, ++ reiser4_ba_flags_t flags) ++{ ++ reiser4_block_nr one = 1; ++ return reiser4_alloc_blocks(hint, start, &one, flags); ++} ++ ++static inline int reiser4_dealloc_block(const reiser4_block_nr * block, ++ block_stage_t stage, ++ reiser4_ba_flags_t flags) ++{ ++ const reiser4_block_nr one = 1; ++ return reiser4_dealloc_blocks(block, &one, stage, flags); ++} ++ ++#define reiser4_grab_space_force(count, flags) \ ++ reiser4_grab_space(count, flags | BA_FORCE) ++ ++extern void grabbed2free_mark(__u64 mark); ++extern int reiser4_grab_reserved(struct super_block *, ++ __u64, reiser4_ba_flags_t); ++extern void reiser4_release_reserved(struct super_block *super); ++ ++/* grabbed -> fake_allocated */ ++ ++/* fake_allocated -> used */ ++ ++/* used -> fake_allocated -> grabbed -> free */ ++ ++extern void flush_reserved2grabbed(txn_atom * atom, __u64 count); ++ ++extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da); ++ ++extern void grabbed2cluster_reserved(int count); ++extern void cluster_reserved2grabbed(int count); ++extern void cluster_reserved2free(int count); ++ ++extern int reiser4_check_block_counters(const struct super_block *); ++ ++#if REISER4_DEBUG ++ ++extern void reiser4_check_block(const reiser4_block_nr *, int); ++ ++#else ++ ++# define reiser4_check_block(beg, val) noop ++ ++#endif ++ ++extern int reiser4_pre_commit_hook(void); ++extern void reiser4_post_commit_hook(void); ++extern void reiser4_post_write_back_hook(void); ++ ++#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/blocknrset.c linux-2.6.33/fs/reiser4/blocknrset.c +--- linux-2.6.33.orig/fs/reiser4/blocknrset.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/blocknrset.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,371 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++reiser4/README */ ++ ++/* This file contains code for various block number sets used by the atom to ++ track the deleted set and wandered block mappings. 
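
The block_alloc.h comment above names the counter lifecycle
free -> grabbed -> fake_allocated -> used. A hedged sketch of a caller driving
the first and last steps (hypothetical function; the stage/flag choices follow
the comments in that header, and error handling is trimmed):

    static int grab_and_alloc(reiser4_blocknr_hint * hint,
                              reiser4_block_nr * start, reiser4_block_nr * len)
    {
            int ret;

            /* free -> grabbed: reserve the space up front */
            ret = reiser4_grab_space(*len, 0 /* no special flags */);
            if (ret != 0)
                    return ret;

            /* grabbed -> used: the hint carries the stage so the
             * allocator knows which counter to charge */
            hint->block_stage = BLOCK_GRABBED;
            return reiser4_alloc_blocks(hint, start, len, BA_FORMATTED);
    }
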
*/ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "txnmgr.h" ++#include "context.h" ++ ++#include <linux/slab.h> ++ ++/* The proposed data structure for storing unordered block number sets is a ++ list of elements, each of which contains an array of block number or/and ++ array of block number pairs. That element called blocknr_set_entry is used ++ to store block numbers from the beginning and for extents from the end of ++ the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields ++ count numbers of blocks and extents. ++ ++ +------------------- blocknr_set_entry->data ------------------+ ++ |block1|block2| ... <free space> ... |pair3|pair2|pair1| ++ +------------------------------------------------------------+ ++ ++ When current blocknr_set_entry is full, allocate a new one. */ ++ ++/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete ++ * set (single blocks and block extents), in that case blocknr pair represent an ++ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs ++ * there represent a (real block) -> (wandered block) mapping. */ ++ ++/* Protection: blocknr sets belong to reiser4 atom, and ++ * their modifications are performed with the atom lock held */ ++ ++/* The total size of a blocknr_set_entry. */ ++#define BLOCKNR_SET_ENTRY_SIZE 128 ++ ++/* The number of blocks that can fit the blocknr data area. */ ++#define BLOCKNR_SET_ENTRIES_NUMBER \ ++ ((BLOCKNR_SET_ENTRY_SIZE - \ ++ 2 * sizeof(unsigned) - \ ++ sizeof(struct list_head)) / \ ++ sizeof(reiser4_block_nr)) ++ ++/* An entry of the blocknr_set */ ++struct blocknr_set_entry { ++ unsigned nr_singles; ++ unsigned nr_pairs; ++ struct list_head link; ++ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER]; ++}; ++ ++/* A pair of blocks as recorded in the blocknr_set_entry data. */ ++struct blocknr_pair { ++ reiser4_block_nr a; ++ reiser4_block_nr b; ++}; ++ ++/* Return the number of blocknr slots available in a blocknr_set_entry. */ ++/* Audited by: green(2002.06.11) */ ++static unsigned bse_avail(blocknr_set_entry * bse) ++{ ++ unsigned used = bse->nr_singles + 2 * bse->nr_pairs; ++ ++ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used); ++ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE); ++ ++ return BLOCKNR_SET_ENTRIES_NUMBER - used; ++} ++ ++/* Initialize a blocknr_set_entry. */ ++static void bse_init(blocknr_set_entry *bse) ++{ ++ bse->nr_singles = 0; ++ bse->nr_pairs = 0; ++ INIT_LIST_HEAD(&bse->link); ++} ++ ++/* Allocate and initialize a blocknr_set_entry. */ ++/* Audited by: green(2002.06.11) */ ++static blocknr_set_entry *bse_alloc(void) ++{ ++ blocknr_set_entry *e; ++ ++ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry), ++ reiser4_ctx_gfp_mask_get())) == NULL) ++ return NULL; ++ ++ bse_init(e); ++ ++ return e; ++} ++ ++/* Free a blocknr_set_entry. 
*/ ++/* Audited by: green(2002.06.11) */ ++static void bse_free(blocknr_set_entry * bse) ++{ ++ kfree(bse); ++} ++ ++/* Add a block number to a blocknr_set_entry */ ++/* Audited by: green(2002.06.11) */ ++static void ++bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block) ++{ ++ assert("jmacd-5099", bse_avail(bse) >= 1); ++ ++ bse->entries[bse->nr_singles++] = *block; ++} ++ ++/* Get a pair of block numbers */ ++/* Audited by: green(2002.06.11) */ ++static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse, ++ unsigned pno) ++{ ++ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1)); ++ ++ return (struct blocknr_pair *) (bse->entries + ++ BLOCKNR_SET_ENTRIES_NUMBER - ++ 2 * (pno + 1)); ++} ++ ++/* Add a pair of block numbers to a blocknr_set_entry */ ++/* Audited by: green(2002.06.11) */ ++static void ++bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, ++ const reiser4_block_nr * b) ++{ ++ struct blocknr_pair *pair; ++ ++ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL); ++ ++ pair = bse_get_pair(bse, bse->nr_pairs++); ++ ++ pair->a = *a; ++ pair->b = *b; ++} ++ ++/* Add either a block or pair of blocks to the block number set. The first ++ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if ++ @b is non-NULL a pair is added. The block number set belongs to atom, and ++ the call is made with the atom lock held. There may not be enough space in ++ the current blocknr_set_entry. If new_bsep points to a non-NULL ++ blocknr_set_entry then it will be added to the blocknr_set and new_bsep ++ will be set to NULL. If new_bsep contains NULL then the atom lock will be ++ released and a new bse will be allocated in new_bsep. E_REPEAT will be ++ returned with the atom unlocked for the operation to be tried again. If ++ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not ++ used during the call, it will be freed automatically. */ ++static int blocknr_set_add(txn_atom *atom, struct list_head *bset, ++ blocknr_set_entry **new_bsep, const reiser4_block_nr *a, ++ const reiser4_block_nr *b) ++{ ++ blocknr_set_entry *bse; ++ unsigned entries_needed; ++ ++ assert("jmacd-5101", a != NULL); ++ ++ entries_needed = (b == NULL) ? 1 : 2; ++ if (list_empty(bset) || ++ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) { ++ /* See if a bse was previously allocated. */ ++ if (*new_bsep == NULL) { ++ spin_unlock_atom(atom); ++ *new_bsep = bse_alloc(); ++ return (*new_bsep != NULL) ? -E_REPEAT : ++ RETERR(-ENOMEM); ++ } ++ ++ /* Put it on the head of the list. */ ++ list_add(&((*new_bsep)->link), bset); ++ ++ *new_bsep = NULL; ++ } ++ ++ /* Add the single or pair. */ ++ bse = list_entry(bset->next, blocknr_set_entry, link); ++ if (b == NULL) { ++ bse_put_single(bse, a); ++ } else { ++ bse_put_pair(bse, a, b); ++ } ++ ++ /* If new_bsep is non-NULL then there was an allocation race, free this ++ copy. */ ++ if (*new_bsep != NULL) { ++ bse_free(*new_bsep); ++ *new_bsep = NULL; ++ } ++ ++ return 0; ++} ++ ++/* Add an extent to the block set. If the length is 1, it is treated as a ++ single block (e.g., reiser4_set_add_block). */ ++/* Audited by: green(2002.06.11) */ ++/* Auditor note: Entire call chain cannot hold any spinlocks, because ++ kmalloc might schedule. The only exception is atom spinlock, which is ++ properly freed. 
*/ ++int ++blocknr_set_add_extent(txn_atom * atom, ++ struct list_head *bset, ++ blocknr_set_entry ** new_bsep, ++ const reiser4_block_nr * start, ++ const reiser4_block_nr * len) ++{ ++ assert("jmacd-5102", start != NULL && len != NULL && *len > 0); ++ return blocknr_set_add(atom, bset, new_bsep, start, ++ *len == 1 ? NULL : len); ++} ++ ++/* Add a block pair to the block set. It adds exactly a pair, which is checked ++ * by an assertion that both arguments are not null.*/ ++/* Audited by: green(2002.06.11) */ ++/* Auditor note: Entire call chain cannot hold any spinlocks, because ++ kmalloc might schedule. The only exception is atom spinlock, which is ++ properly freed. */ ++int ++blocknr_set_add_pair(txn_atom * atom, ++ struct list_head *bset, ++ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, ++ const reiser4_block_nr * b) ++{ ++ assert("jmacd-5103", a != NULL && b != NULL); ++ return blocknr_set_add(atom, bset, new_bsep, a, b); ++} ++ ++/* Initialize a blocknr_set. */ ++void blocknr_set_init(struct list_head *bset) ++{ ++ INIT_LIST_HEAD(bset); ++} ++ ++/* Release the entries of a blocknr_set. */ ++void blocknr_set_destroy(struct list_head *bset) ++{ ++ blocknr_set_entry *bse; ++ ++ while (!list_empty(bset)) { ++ bse = list_entry(bset->next, blocknr_set_entry, link); ++ list_del_init(&bse->link); ++ bse_free(bse); ++ } ++} ++ ++/* Merge blocknr_set entries out of @from into @into. */ ++/* Audited by: green(2002.06.11) */ ++/* Auditor comments: This merge does not know if merged sets contain ++ blocks pairs (As for wandered sets) or extents, so it cannot really merge ++ overlapping ranges if there is some. So I believe it may lead to ++ some blocks being presented several times in one blocknr_set. To help ++ debugging such problems it might help to check for duplicate entries on ++ actual processing of this set. Testing this kind of stuff right here is ++ also complicated by the fact that these sets are not sorted and going ++ through whole set on each element addition is going to be CPU-heavy task */ ++void blocknr_set_merge(struct list_head *from, struct list_head *into) ++{ ++ blocknr_set_entry *bse_into = NULL; ++ ++ /* If @from is empty, no work to perform. */ ++ if (list_empty(from)) ++ return; ++ /* If @into is not empty, try merging partial-entries. */ ++ if (!list_empty(into)) { ++ ++ /* Neither set is empty, pop the front to members and try to ++ combine them. */ ++ blocknr_set_entry *bse_from; ++ unsigned into_avail; ++ ++ bse_into = list_entry(into->next, blocknr_set_entry, link); ++ list_del_init(&bse_into->link); ++ bse_from = list_entry(from->next, blocknr_set_entry, link); ++ list_del_init(&bse_from->link); ++ ++ /* Combine singles. */ ++ for (into_avail = bse_avail(bse_into); ++ into_avail != 0 && bse_from->nr_singles != 0; ++ into_avail -= 1) { ++ bse_put_single(bse_into, ++ &bse_from->entries[--bse_from-> ++ nr_singles]); ++ } ++ ++ /* Combine pairs. */ ++ for (; into_avail > 1 && bse_from->nr_pairs != 0; ++ into_avail -= 2) { ++ struct blocknr_pair *pair = ++ bse_get_pair(bse_from, --bse_from->nr_pairs); ++ bse_put_pair(bse_into, &pair->a, &pair->b); ++ } ++ ++ /* If bse_from is empty, delete it now. */ ++ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) { ++ bse_free(bse_from); ++ } else { ++ /* Otherwise, bse_into is full or nearly full (e.g., ++ it could have one slot avail and bse_from has one ++ pair left). Push it back onto the list. bse_from ++ becomes bse_into, which will be the new partial. 
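
This merge is presumably driven by atom fusion, where one atom's sets are
folded into another's; a minimal, hypothetical call site (both atoms assumed
protected by the caller, per the locking comment at the top of this file):

    static void fuse_delete_sets(txn_atom * dying, txn_atom * survivor)
    {
            /* fold the dying atom's delete set into the survivor's */
            blocknr_set_merge(&dying->delete_set, &survivor->delete_set);
    }
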
*/ ++ list_add(&bse_into->link, into); ++ bse_into = bse_from; ++ } ++ } ++ ++ /* Splice lists together. */ ++ list_splice_init(from, into->prev); ++ ++ /* Add the partial entry back to the head of the list. */ ++ if (bse_into != NULL) ++ list_add(&bse_into->link, into); ++} ++ ++/* Iterate over all blocknr set elements. */ ++int blocknr_set_iterator(txn_atom *atom, struct list_head *bset, ++ blocknr_set_actor_f actor, void *data, int delete) ++{ ++ ++ blocknr_set_entry *entry; ++ ++ assert("zam-429", atom != NULL); ++ assert("zam-430", atom_is_protected(atom)); ++ assert("zam-431", bset != 0); ++ assert("zam-432", actor != NULL); ++ ++ entry = list_entry(bset->next, blocknr_set_entry, link); ++ while (bset != &entry->link) { ++ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link); ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < entry->nr_singles; i++) { ++ ret = actor(atom, &entry->entries[i], NULL, data); ++ ++ /* We can't break a loop if delete flag is set. */ ++ if (ret != 0 && !delete) ++ return ret; ++ } ++ ++ for (i = 0; i < entry->nr_pairs; i++) { ++ struct blocknr_pair *ab; ++ ++ ab = bse_get_pair(entry, i); ++ ++ ret = actor(atom, &ab->a, &ab->b, data); ++ ++ if (ret != 0 && !delete) ++ return ret; ++ } ++ ++ if (delete) { ++ list_del(&entry->link); ++ bse_free(entry); ++ } ++ ++ entry = tmp; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/carry.c linux-2.6.33/fs/reiser4/carry.c +--- linux-2.6.33.orig/fs/reiser4/carry.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/carry.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1398 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++/* Functions to "carry" tree modification(s) upward. */ ++/* Tree is modified one level at a time. As we modify a level we accumulate a ++ set of changes that need to be propagated to the next level. We manage ++ node locking such that any searches that collide with carrying are ++ restarted, from the root if necessary. ++ ++ Insertion of a new item may result in items being moved among nodes and ++ this requires the delimiting key to be updated at the least common parent ++ of the nodes modified to preserve search tree invariants. Also, insertion ++ may require allocation of a new node. A pointer to the new node has to be ++ inserted into some node on the parent level, etc. ++ ++ Tree carrying is meant to be analogous to arithmetic carrying. ++ ++ A carry operation is always associated with some node (&carry_node). ++ ++ Carry process starts with some initial set of operations to be performed ++ and an initial set of already locked nodes. Operations are performed one ++ by one. Performing each single operation has following possible effects: ++ ++ - content of carry node associated with operation is modified ++ - new carry nodes are locked and involved into carry process on this level ++ - new carry operations are posted to the next level ++ ++ After all carry operations on this level are done, process is repeated for ++ the accumulated sequence on carry operations for the next level. This ++ starts by trying to lock (in left to right order) all carry nodes ++ associated with carry operations on the parent level. After this, we decide ++ whether more nodes are required on the left of already locked set. 
If so, ++ all locks taken on the parent level are released, new carry nodes are ++ added, and locking process repeats. ++ ++ It may happen that balancing process fails owing to unrecoverable error on ++ some of upper levels of a tree (possible causes are io error, failure to ++ allocate new node, etc.). In this case we should unmount the filesystem, ++ rebooting if it is the root, and possibly advise the use of fsck. ++ ++ USAGE: ++ ++ int some_tree_operation( znode *node, ... ) ++ { ++ // Allocate on a stack pool of carry objects: operations and nodes. ++ // Most carry processes will only take objects from here, without ++ // dynamic allocation. ++ ++I feel uneasy about this pool. It adds to code complexity, I understand why it ++exists, but.... -Hans ++ ++ carry_pool pool; ++ carry_level lowest_level; ++ carry_op *op; ++ ++ init_carry_pool( &pool ); ++ init_carry_level( &lowest_level, &pool ); ++ ++ // operation may be one of: ++ // COP_INSERT --- insert new item into node ++ // COP_CUT --- remove part of or whole node ++ // COP_PASTE --- increase size of item ++ // COP_DELETE --- delete pointer from parent node ++ // COP_UPDATE --- update delimiting key in least ++ // common ancestor of two ++ ++ op = reiser4_post_carry( &lowest_level, operation, node, 0 ); ++ if( IS_ERR( op ) || ( op == NULL ) ) { ++ handle error ++ } else { ++ // fill in remaining fields in @op, according to carry.h:carry_op ++ result = carry(&lowest_level, NULL); ++ } ++ done_carry_pool(&pool); ++ } ++ ++ When you are implementing node plugin method that participates in carry ++ (shifting, insertion, deletion, etc.), do the following: ++ ++ int foo_node_method(znode * node, ..., carry_level * todo) ++ { ++ carry_op *op; ++ ++ .... ++ ++ // note, that last argument to reiser4_post_carry() is non-null ++ // here, because @op is to be applied to the parent of @node, rather ++ // than to the @node itself as in the previous case. ++ ++ op = node_post_carry(todo, operation, node, 1); ++ // fill in remaining fields in @op, according to carry.h:carry_op ++ ++ .... ++ ++ } ++ ++ BATCHING: ++ ++ One of the main advantages of level-by-level balancing implemented here is ++ ability to batch updates on a parent level and to peform them more ++ efficiently as a result. ++ ++ Description To Be Done (TBD). ++ ++ DIFFICULTIES AND SUBTLE POINTS: ++ ++ 1. complex plumbing is required, because: ++ ++ a. effective allocation through pools is needed ++ ++ b. target of operation is not exactly known when operation is ++ posted. This is worked around through bitfields in &carry_node and ++ logic in lock_carry_node() ++ ++ c. of interaction with locking code: node should be added into sibling ++ list when pointer to it is inserted into its parent, which is some time ++ after node was created. Between these moments, node is somewhat in ++ suspended state and is only registered in the carry lists ++ ++ 2. whole balancing logic is implemented here, in particular, insertion ++ logic is coded in make_space(). ++ ++ 3. special cases like insertion (reiser4_add_tree_root()) or deletion ++ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert ++ (insert_paste()) have to be handled. ++ ++ 4. there is non-trivial interdependency between allocation of new nodes ++ and almost everything else. This is mainly due to the (1.c) above. I shall ++ write about this later. 
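
The USAGE sketch above predates the pool API as it appears in this file:
init_carry_pool() now takes a size and returns the pool, and reiser4_carry()
expects two further carry_levels to follow @doing in the same allocation. An
updated sketch under those assumptions (the error codes and the COP_INSERT
choice are illustrative only):

    static int example_tree_operation(znode * node /* write-locked by caller */)
    {
            carry_pool *pool;
            carry_level *lowest_level;
            carry_op *op;
            int result;

            pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
            if (IS_ERR(pool))
                    return PTR_ERR(pool);
            /* the three levels (doing/todo/done) live right after the pool */
            lowest_level = (carry_level *) (pool + 1);
            init_carry_level(lowest_level, pool);

            op = reiser4_post_carry(lowest_level, COP_INSERT, node, 0);
            if (IS_ERR(op) || op == NULL) {
                    done_carry_pool(pool);
                    return op == NULL ? RETERR(-ENOMEM) : PTR_ERR(op);
            }
            /* ... fill in remaining fields of @op, see carry.h:carry_op ... */
            result = reiser4_carry(lowest_level, NULL);
            done_carry_pool(pool);
            return result;
    }
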
++ ++*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/item/extent.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "znode.h" ++#include "tree_mod.h" ++#include "tree_walk.h" ++#include "block_alloc.h" ++#include "pool.h" ++#include "tree.h" ++#include "carry.h" ++#include "carry_ops.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/types.h> ++ ++/* level locking/unlocking */ ++static int lock_carry_level(carry_level * level); ++static void unlock_carry_level(carry_level * level, int failure); ++static void done_carry_level(carry_level * level); ++static void unlock_carry_node(carry_level * level, carry_node * node, int fail); ++ ++int lock_carry_node(carry_level * level, carry_node * node); ++int lock_carry_node_tail(carry_node * node); ++ ++/* carry processing proper */ ++static int carry_on_level(carry_level * doing, carry_level * todo); ++ ++static carry_op *add_op(carry_level * level, pool_ordering order, ++ carry_op * reference); ++ ++/* handlers for carry operations. */ ++ ++static void fatal_carry_error(carry_level * doing, int ecode); ++static int add_new_root(carry_level * level, carry_node * node, znode * fake); ++ ++static void print_level(const char *prefix, carry_level * level); ++ ++#if REISER4_DEBUG ++typedef enum { ++ CARRY_TODO, ++ CARRY_DOING ++} carry_queue_state; ++static int carry_level_invariant(carry_level * level, carry_queue_state state); ++#endif ++ ++/* main entry point for tree balancing. ++ ++ Tree carry performs operations from @doing and while doing so accumulates ++ information about operations to be performed on the next level ("carried" ++ to the parent level). Carried operations are performed, causing possibly ++ more operations to be carried upward etc. carry() takes care about ++ locking and pinning znodes while operating on them. ++ ++ For usage, see comment at the top of fs/reiser4/carry.c ++ ++*/ ++int reiser4_carry(carry_level * doing /* set of carry operations to be ++ * performed */ , ++ carry_level * done /* set of nodes, already performed ++ * at the previous level. ++ * NULL in most cases */) ++{ ++ int result = 0; ++ /* queue of new requests */ ++ carry_level *todo; ++ ON_DEBUG(STORE_COUNTERS); ++ ++ assert("nikita-888", doing != NULL); ++ BUG_ON(done != NULL); ++ ++ todo = doing + 1; ++ init_carry_level(todo, doing->pool); ++ ++ /* queue of requests preformed on the previous level */ ++ done = todo + 1; ++ init_carry_level(done, doing->pool); ++ ++ /* iterate until there is nothing more to do */ ++ while (result == 0 && doing->ops_num > 0) { ++ carry_level *tmp; ++ ++ /* at this point @done is locked. */ ++ /* repeat lock/do/unlock while ++ ++ (1) lock_carry_level() fails due to deadlock avoidance, or ++ ++ (2) carry_on_level() decides that more nodes have to ++ be involved. ++ ++ (3) some unexpected error occurred while balancing on the ++ upper levels. In this case all changes are rolled back. 
++ ++ */ ++ while (1) { ++ result = lock_carry_level(doing); ++ if (result == 0) { ++ /* perform operations from @doing and ++ accumulate new requests in @todo */ ++ result = carry_on_level(doing, todo); ++ if (result == 0) ++ break; ++ else if (result != -E_REPEAT || ++ !doing->restartable) { ++ warning("nikita-1043", ++ "Fatal error during carry: %i", ++ result); ++ print_level("done", done); ++ print_level("doing", doing); ++ print_level("todo", todo); ++ /* do some rough stuff like aborting ++ all pending transcrashes and thus ++ pushing tree back to the consistent ++ state. Alternatvely, just panic. ++ */ ++ fatal_carry_error(doing, result); ++ return result; ++ } ++ } else if (result != -E_REPEAT) { ++ fatal_carry_error(doing, result); ++ return result; ++ } ++ unlock_carry_level(doing, 1); ++ } ++ /* at this point @done can be safely unlocked */ ++ done_carry_level(done); ++ ++ /* cyclically shift queues */ ++ tmp = done; ++ done = doing; ++ doing = todo; ++ todo = tmp; ++ init_carry_level(todo, doing->pool); ++ ++ /* give other threads chance to run */ ++ reiser4_preempt_point(); ++ } ++ done_carry_level(done); ++ ++ /* all counters, but x_refs should remain the same. x_refs can change ++ owing to transaction manager */ ++ ON_DEBUG(CHECK_COUNTERS); ++ return result; ++} ++ ++/* perform carry operations on given level. ++ ++ Optimizations proposed by pooh: ++ ++ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as ++ required; ++ ++ (2) unlock node if there are no more operations to be performed upon it and ++ node didn't add any operation to @todo. This can be implemented by ++ attaching to each node two counters: counter of operaions working on this ++ node and counter and operations carried upward from this node. ++ ++*/ ++static int carry_on_level(carry_level * doing /* queue of carry operations to ++ * do on this level */ , ++ carry_level * todo /* queue where new carry ++ * operations to be performed on ++ * the * parent level are ++ * accumulated during @doing ++ * processing. */ ) ++{ ++ int result; ++ int (*f) (carry_op *, carry_level *, carry_level *); ++ carry_op *op; ++ carry_op *tmp_op; ++ ++ assert("nikita-1034", doing != NULL); ++ assert("nikita-1035", todo != NULL); ++ ++ /* @doing->nodes are locked. */ ++ ++ /* This function can be split into two phases: analysis and modification ++ ++ Analysis calculates precisely what items should be moved between ++ nodes. This information is gathered in some structures attached to ++ each carry_node in a @doing queue. Analysis also determines whether ++ new nodes are to be allocated etc. ++ ++ After analysis is completed, actual modification is performed. Here ++ we can take advantage of "batch modification": if there are several ++ operations acting on the same node, modifications can be performed ++ more efficiently when batched together. ++ ++ Above is an optimization left for the future. ++ */ ++ /* Important, but delayed optimization: it's possible to batch ++ operations together and perform them more efficiently as a ++ result. For example, deletion of several neighboring items from a ++ node can be converted to a single ->cut() operation. ++ ++ Before processing queue, it should be scanned and "mergeable" ++ operations merged. 
++ */ ++ result = 0; ++ for_all_ops(doing, op, tmp_op) { ++ carry_opcode opcode; ++ ++ assert("nikita-1041", op != NULL); ++ opcode = op->op; ++ assert("nikita-1042", op->op < COP_LAST_OP); ++ f = op_dispatch_table[op->op].handler; ++ result = f(op, doing, todo); ++ /* locking can fail with -E_REPEAT. Any different error is fatal ++ and will be handled by fatal_carry_error() sledgehammer. ++ */ ++ if (result != 0) ++ break; ++ } ++ if (result == 0) { ++ carry_plugin_info info; ++ carry_node *scan; ++ carry_node *tmp_scan; ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ assert("nikita-3002", ++ carry_level_invariant(doing, CARRY_DOING)); ++ for_all_nodes(doing, scan, tmp_scan) { ++ znode *node; ++ ++ node = reiser4_carry_real(scan); ++ assert("nikita-2547", node != NULL); ++ if (node_is_empty(node)) { ++ result = ++ node_plugin_by_node(node)-> ++ prepare_removal(node, &info); ++ if (result != 0) ++ break; ++ } ++ } ++ } ++ return result; ++} ++ ++/* post carry operation ++ ++ This is main function used by external carry clients: node layout plugins ++ and tree operations to create new carry operation to be performed on some ++ level. ++ ++ New operation will be included in the @level queue. To actually perform it, ++ call carry( level, ... ). This function takes write lock on @node. Carry ++ manages all its locks by itself, don't worry about this. ++ ++ This function adds operation and node at the end of the queue. It is up to ++ caller to guarantee proper ordering of node queue. ++ ++*/ ++carry_op * reiser4_post_carry(carry_level * level /* queue where new operation ++ * is to be posted at */ , ++ carry_opcode op /* opcode of operation */ , ++ znode * node /* node on which this operation ++ * will operate */ , ++ int apply_to_parent_p /* whether operation will ++ * operate directly on @node ++ * or on it parent. 
*/) ++{ ++ carry_op *result; ++ carry_node *child; ++ ++ assert("nikita-1046", level != NULL); ++ assert("nikita-1788", znode_is_write_locked(node)); ++ ++ result = add_op(level, POOLO_LAST, NULL); ++ if (IS_ERR(result)) ++ return result; ++ child = reiser4_add_carry(level, POOLO_LAST, NULL); ++ if (IS_ERR(child)) { ++ reiser4_pool_free(&level->pool->op_pool, &result->header); ++ return (carry_op *) child; ++ } ++ result->node = child; ++ result->op = op; ++ child->parent = apply_to_parent_p; ++ if (ZF_ISSET(node, JNODE_ORPHAN)) ++ child->left_before = 1; ++ child->node = node; ++ return result; ++} ++ ++/* initialize carry queue */ ++void init_carry_level(carry_level * level /* level to initialize */ , ++ carry_pool * pool /* pool @level will allocate objects ++ * from */ ) ++{ ++ assert("nikita-1045", level != NULL); ++ assert("nikita-967", pool != NULL); ++ ++ memset(level, 0, sizeof *level); ++ level->pool = pool; ++ ++ INIT_LIST_HEAD(&level->nodes); ++ INIT_LIST_HEAD(&level->ops); ++} ++ ++/* allocate carry pool and initialize pools within queue */ ++carry_pool *init_carry_pool(int size) ++{ ++ carry_pool *pool; ++ ++ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level)); ++ pool = kmalloc(size, reiser4_ctx_gfp_mask_get()); ++ if (pool == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE, ++ (char *)pool->op); ++ reiser4_init_pool(&pool->node_pool, sizeof(carry_node), ++ NODES_LOCKED_POOL_SIZE, (char *)pool->node); ++ return pool; ++} ++ ++/* finish with queue pools */ ++void done_carry_pool(carry_pool * pool/* pool to destroy */) ++{ ++ reiser4_done_pool(&pool->op_pool); ++ reiser4_done_pool(&pool->node_pool); ++ kfree(pool); ++} ++ ++/* add new carry node to the @level. ++ ++ Returns pointer to the new carry node allocated from pool. It's up to ++ callers to maintain proper order in the @level. Assumption is that if carry ++ nodes on one level are already sorted and modifications are peroformed from ++ left to right, carry nodes added on the parent level will be ordered ++ automatically. To control ordering use @order and @reference parameters. 
++ ++*/ ++carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add ++ * node to */ , ++ pool_ordering order /* where to insert: ++ * at the beginning of ++ * @level, ++ * before @reference, ++ * after @reference, ++ * at the end of @level ++ */ , ++ carry_node * reference/* reference node for ++ * insertion */) ++{ ++ ON_DEBUG(carry_node * orig_ref = reference); ++ ++ if (order == POOLO_BEFORE) { ++ reference = find_left_carry(reference, level); ++ if (reference == NULL) ++ reference = list_entry(level->nodes.next, carry_node, ++ header.level_linkage); ++ else ++ reference = list_entry(reference->header.level_linkage.next, ++ carry_node, header.level_linkage); ++ } else if (order == POOLO_AFTER) { ++ reference = find_right_carry(reference, level); ++ if (reference == NULL) ++ reference = list_entry(level->nodes.prev, carry_node, ++ header.level_linkage); ++ else ++ reference = list_entry(reference->header.level_linkage.prev, ++ carry_node, header.level_linkage); ++ } ++ assert("nikita-2209", ++ ergo(orig_ref != NULL, ++ reiser4_carry_real(reference) == ++ reiser4_carry_real(orig_ref))); ++ return reiser4_add_carry(level, order, reference); ++} ++ ++carry_node *reiser4_add_carry(carry_level * level, /* carry_level to add ++ node to */ ++ pool_ordering order, /* where to insert: ++ * at the beginning of ++ * @level; ++ * before @reference; ++ * after @reference; ++ * at the end of @level ++ */ ++ carry_node * reference /* reference node for ++ * insertion */) ++{ ++ carry_node *result; ++ ++ result = ++ (carry_node *) reiser4_add_obj(&level->pool->node_pool, ++ &level->nodes, ++ order, &reference->header); ++ if (!IS_ERR(result) && (result != NULL)) ++ ++level->nodes_num; ++ return result; ++} ++ ++/** ++ * add new carry operation to the @level. ++ * ++ * Returns pointer to the new carry operations allocated from pool. It's up to ++ * callers to maintain proper order in the @level. To control ordering use ++ * @order and @reference parameters. ++ */ ++static carry_op *add_op(carry_level * level, /* &carry_level to add node to */ ++ pool_ordering order, /* where to insert: ++ * at the beginning of @level; ++ * before @reference; ++ * after @reference; ++ * at the end of @level */ ++ carry_op * reference /* reference node for insertion */) ++{ ++ carry_op *result; ++ ++ result = ++ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops, ++ order, &reference->header); ++ if (!IS_ERR(result) && (result != NULL)) ++ ++level->ops_num; ++ return result; ++} ++ ++/** ++ * Return node on the right of which @node was created. ++ * ++ * Each node is created on the right of some existing node (or it is new root, ++ * which is special case not handled here). ++ * ++ * @node is new node created on some level, but not yet inserted into its ++ * parent, it has corresponding bit (JNODE_ORPHAN) set in zstate. 
++ */ ++static carry_node *find_begetting_brother(carry_node * node,/* node to start ++ search from */ ++ carry_level * kin UNUSED_ARG ++ /* level to scan */) ++{ ++ carry_node *scan; ++ ++ assert("nikita-1614", node != NULL); ++ assert("nikita-1615", kin != NULL); ++ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree)); ++ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL, ++ ZF_ISSET(reiser4_carry_real(node), ++ JNODE_ORPHAN))); ++ for (scan = node;; ++ scan = list_entry(scan->header.level_linkage.prev, carry_node, ++ header.level_linkage)) { ++ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage); ++ if ((scan->node != node->node) && ++ !ZF_ISSET(scan->node, JNODE_ORPHAN)) { ++ assert("nikita-1618", reiser4_carry_real(scan) != NULL); ++ break; ++ } ++ } ++ return scan; ++} ++ ++static cmp_t ++carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2) ++{ ++ assert("nikita-2199", n1 != NULL); ++ assert("nikita-2200", n2 != NULL); ++ ++ if (n1 == n2) ++ return EQUAL_TO; ++ while (1) { ++ n1 = carry_node_next(n1); ++ if (carry_node_end(level, n1)) ++ return GREATER_THAN; ++ if (n1 == n2) ++ return LESS_THAN; ++ } ++ impossible("nikita-2201", "End of level reached"); ++} ++ ++carry_node *find_carry_node(carry_level * level, const znode * node) ++{ ++ carry_node *scan; ++ carry_node *tmp_scan; ++ ++ assert("nikita-2202", level != NULL); ++ assert("nikita-2203", node != NULL); ++ ++ for_all_nodes(level, scan, tmp_scan) { ++ if (reiser4_carry_real(scan) == node) ++ return scan; ++ } ++ return NULL; ++} ++ ++znode *reiser4_carry_real(const carry_node * node) ++{ ++ assert("nikita-3061", node != NULL); ++ ++ return node->lock_handle.node; ++} ++ ++carry_node *insert_carry_node(carry_level * doing, carry_level * todo, ++ const znode * node) ++{ ++ carry_node *base; ++ carry_node *scan; ++ carry_node *tmp_scan; ++ carry_node *proj; ++ ++ base = find_carry_node(doing, node); ++ assert("nikita-2204", base != NULL); ++ ++ for_all_nodes(todo, scan, tmp_scan) { ++ proj = find_carry_node(doing, scan->node); ++ assert("nikita-2205", proj != NULL); ++ if (carry_node_cmp(doing, proj, base) != LESS_THAN) ++ break; ++ } ++ return scan; ++} ++ ++static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo, ++ znode * node) ++{ ++ carry_node *reference; ++ ++ assert("nikita-2994", doing != NULL); ++ assert("nikita-2995", todo != NULL); ++ assert("nikita-2996", node != NULL); ++ ++ reference = insert_carry_node(doing, todo, node); ++ assert("nikita-2997", reference != NULL); ++ ++ return reiser4_add_carry(todo, POOLO_BEFORE, reference); ++} ++ ++/* like reiser4_post_carry(), but designed to be called from node plugin ++ methods. This function is different from reiser4_post_carry() in that it ++ finds proper place to insert node in the queue. */ ++carry_op *node_post_carry(carry_plugin_info * info /* carry parameters ++ * passed down to node ++ * plugin */ , ++ carry_opcode op /* opcode of operation */ , ++ znode * node /* node on which this ++ * operation will operate */ , ++ int apply_to_parent_p /* whether operation will ++ * operate directly on @node ++ * or on it parent. 
*/ ) ++{ ++ carry_op *result; ++ carry_node *child; ++ ++ assert("nikita-2207", info != NULL); ++ assert("nikita-2208", info->todo != NULL); ++ ++ if (info->doing == NULL) ++ return reiser4_post_carry(info->todo, op, node, ++ apply_to_parent_p); ++ ++ result = add_op(info->todo, POOLO_LAST, NULL); ++ if (IS_ERR(result)) ++ return result; ++ child = add_carry_atplace(info->doing, info->todo, node); ++ if (IS_ERR(child)) { ++ reiser4_pool_free(&info->todo->pool->op_pool, &result->header); ++ return (carry_op *) child; ++ } ++ result->node = child; ++ result->op = op; ++ child->parent = apply_to_parent_p; ++ if (ZF_ISSET(node, JNODE_ORPHAN)) ++ child->left_before = 1; ++ child->node = node; ++ return result; ++} ++ ++/* lock all carry nodes in @level */ ++static int lock_carry_level(carry_level * level/* level to lock */) ++{ ++ int result; ++ carry_node *node; ++ carry_node *tmp_node; ++ ++ assert("nikita-881", level != NULL); ++ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO)); ++ ++ /* lock nodes from left to right */ ++ result = 0; ++ for_all_nodes(level, node, tmp_node) { ++ result = lock_carry_node(level, node); ++ if (result != 0) ++ break; ++ } ++ return result; ++} ++ ++/* Synchronize delimiting keys between @node and its left neighbor. ++ ++ To reduce contention on dk key and simplify carry code, we synchronize ++ delimiting keys only when carry ultimately leaves tree level (carrying ++ changes upward) and unlocks nodes at this level. ++ ++ This function first finds left neighbor of @node and then updates left ++ neighbor's right delimiting key to conincide with least key in @node. ++ ++*/ ++ ++ON_DEBUG(extern atomic_t delim_key_version; ++ ) ++ ++static void sync_dkeys(znode * spot/* node to update */) ++{ ++ reiser4_key pivot; ++ reiser4_tree *tree; ++ ++ assert("nikita-1610", spot != NULL); ++ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk)); ++ ++ tree = znode_get_tree(spot); ++ read_lock_tree(tree); ++ write_lock_dk(tree); ++ ++ assert("nikita-2192", znode_is_loaded(spot)); ++ ++ /* sync left delimiting key of @spot with key in its leftmost item */ ++ if (node_is_empty(spot)) ++ pivot = *znode_get_rd_key(spot); ++ else ++ leftmost_key_in_node(spot, &pivot); ++ ++ znode_set_ld_key(spot, &pivot); ++ ++ /* there can be sequence of empty nodes pending removal on the left of ++ @spot. Scan them and update their left and right delimiting keys to ++ match left delimiting key of @spot. Also, update right delimiting ++ key of first non-empty left neighbor. ++ */ ++ while (1) { ++ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED)) ++ break; ++ ++ spot = spot->left; ++ if (spot == NULL) ++ break; ++ ++ znode_set_rd_key(spot, &pivot); ++ /* don't sink into the domain of another balancing */ ++ if (!znode_is_write_locked(spot)) ++ break; ++ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE)) ++ znode_set_ld_key(spot, &pivot); ++ else ++ break; ++ } ++ ++ write_unlock_dk(tree); ++ read_unlock_tree(tree); ++} ++ ++/* unlock all carry nodes in @level */ ++static void unlock_carry_level(carry_level * level /* level to unlock */ , ++ int failure /* true if unlocking owing to ++ * failure */ ) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ ++ assert("nikita-889", level != NULL); ++ ++ if (!failure) { ++ znode *spot; ++ ++ spot = NULL; ++ /* update delimiting keys */ ++ for_all_nodes(level, node, tmp_node) { ++ if (reiser4_carry_real(node) != spot) { ++ spot = reiser4_carry_real(node); ++ sync_dkeys(spot); ++ } ++ } ++ } ++ ++ /* nodes can be unlocked in arbitrary order. 
In preemptible ++ environment it's better to unlock in reverse order of locking, ++ though. ++ */ ++ for_all_nodes_back(level, node, tmp_node) { ++ /* all allocated nodes should be already linked to their ++ parents at this moment. */ ++ assert("nikita-1631", ++ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node), ++ JNODE_ORPHAN))); ++ ON_DEBUG(check_dkeys(reiser4_carry_real(node))); ++ unlock_carry_node(level, node, failure); ++ } ++ level->new_root = NULL; ++} ++ ++/* finish with @level ++ ++ Unlock nodes and release all allocated resources */ ++static void done_carry_level(carry_level * level/* level to finish */) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ carry_op *op; ++ carry_op *tmp_op; ++ ++ assert("nikita-1076", level != NULL); ++ ++ unlock_carry_level(level, 0); ++ for_all_nodes(level, node, tmp_node) { ++ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link)); ++ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link)); ++ reiser4_pool_free(&level->pool->node_pool, &node->header); ++ } ++ for_all_ops(level, op, tmp_op) ++ reiser4_pool_free(&level->pool->op_pool, &op->header); ++} ++ ++/* helper function to complete locking of carry node ++ ++ Finish locking of carry node. There are several ways in which new carry ++ node can be added into carry level and locked. Normal is through ++ lock_carry_node(), but also from find_{left|right}_neighbor(). This ++ function factors out common final part of all locking scenarios. It ++ supposes that @node -> lock_handle is lock handle for lock just taken and ++ fills ->real_node from this lock handle. ++ ++*/ ++int lock_carry_node_tail(carry_node * node/* node to complete locking of */) ++{ ++ assert("nikita-1052", node != NULL); ++ assert("nikita-1187", reiser4_carry_real(node) != NULL); ++ assert("nikita-1188", !node->unlock); ++ ++ node->unlock = 1; ++ /* Load node content into memory and install node plugin by ++ looking at the node header. ++ ++ Most of the time this call is cheap because the node is ++ already in memory. ++ ++ Corresponding zrelse() is in unlock_carry_node() ++ */ ++ return zload(reiser4_carry_real(node)); ++} ++ ++/* lock carry node ++ ++ "Resolve" node to real znode, lock it and mark as locked. ++ This requires recursive locking of znodes. ++ ++ When operation is posted to the parent level, node it will be applied to is ++ not yet known. For example, when shifting data between two nodes, ++ delimiting has to be updated in parent or parents of nodes involved. But ++ their parents is not yet locked and, moreover said nodes can be reparented ++ by concurrent balancing. ++ ++ To work around this, carry operation is applied to special "carry node" ++ rather than to the znode itself. Carry node consists of some "base" or ++ "reference" znode and flags indicating how to get to the target of carry ++ operation (->real_node field of carry_node) from base. ++ ++*/ ++int lock_carry_node(carry_level * level /* level @node is in */ , ++ carry_node * node/* node to lock */) ++{ ++ int result; ++ znode *reference_point; ++ lock_handle lh; ++ lock_handle tmp_lh; ++ reiser4_tree *tree; ++ ++ assert("nikita-887", level != NULL); ++ assert("nikita-882", node != NULL); ++ ++ result = 0; ++ reference_point = node->node; ++ init_lh(&lh); ++ init_lh(&tmp_lh); ++ if (node->left_before) { ++ /* handling of new nodes, allocated on the previous level: ++ ++ some carry ops were propably posted from the new node, but ++ this node neither has parent pointer set, nor is ++ connected. 
This will be done in ->create_hook() for
++ internal item.
++
++ Nonetheless, the parent of the new node has to be locked. To do
++ this, first go to the "left" in the carry order. This
++ depends on the decision to always allocate new node on the
++ right of existing one.
++
++ Loop handles case when multiple nodes, all orphans, were
++ inserted.
++
++ Strictly speaking, taking tree lock is not necessary here,
++ because all nodes scanned by loop in
++ find_begetting_brother() are write-locked by this thread,
++ and thus, their sibling linkage cannot change.
++
++ */
++ tree = znode_get_tree(reference_point);
++ read_lock_tree(tree);
++ reference_point = find_begetting_brother(node, level)->node;
++ read_unlock_tree(tree);
++ assert("nikita-1186", reference_point != NULL);
++ }
++ if (node->parent && (result == 0)) {
++ result =
++ reiser4_get_parent(&tmp_lh, reference_point,
++ ZNODE_WRITE_LOCK);
++ if (result != 0) {
++ ; /* nothing */
++ } else if (znode_get_level(tmp_lh.node) == 0) {
++ assert("nikita-1347", znode_above_root(tmp_lh.node));
++ result = add_new_root(level, node, tmp_lh.node);
++ if (result == 0) {
++ reference_point = level->new_root;
++ move_lh(&lh, &node->lock_handle);
++ }
++ } else if ((level->new_root != NULL)
++ && (level->new_root !=
++ znode_parent_nolock(reference_point))) {
++ /* parent of node exists, but this level already
++ created a different new root, so */
++ warning("nikita-1109",
++ /* it should be "radicis", but tradition is
++ tradition. do banshees read latin? */
++ "hodie natus est radici frater");
++ result = -EIO;
++ } else {
++ move_lh(&lh, &tmp_lh);
++ reference_point = lh.node;
++ }
++ }
++ if (node->left && (result == 0)) {
++ assert("nikita-1183", node->parent);
++ assert("nikita-883", reference_point != NULL);
++ result =
++ reiser4_get_left_neighbor(&tmp_lh, reference_point,
++ ZNODE_WRITE_LOCK,
++ GN_CAN_USE_UPPER_LEVELS);
++ if (result == 0) {
++ done_lh(&lh);
++ move_lh(&lh, &tmp_lh);
++ reference_point = lh.node;
++ }
++ }
++ if (!node->parent && !node->left && !node->left_before) {
++ result =
++ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
++ ZNODE_LOCK_HIPRI);
++ }
++ if (result == 0) {
++ move_lh(&node->lock_handle, &lh);
++ result = lock_carry_node_tail(node);
++ }
++ done_lh(&tmp_lh);
++ done_lh(&lh);
++ return result;
++}
++
++/* release a lock on &carry_node.
++
++ Release, if necessary, the lock on @node. This operation is the pair of
++ lock_carry_node() and is idempotent: you can call it more than once on the
++ same node.
++
++*/
++static void
++unlock_carry_node(carry_level * level,
++ carry_node * node /* node to be released */ ,
++ int failure /* true if node is unlocked owing
++ * to some error */ )
++{
++ znode *real_node;
++
++ assert("nikita-884", node != NULL);
++
++ real_node = reiser4_carry_real(node);
++ /* pair to zload() in lock_carry_node_tail() */
++ zrelse(real_node);
++ if (node->unlock && (real_node != NULL)) {
++ assert("nikita-899", real_node == node->lock_handle.node);
++ longterm_unlock_znode(&node->lock_handle);
++ }
++ if (failure) {
++ if (node->deallocate && (real_node != NULL)) {
++ /* free node in bitmap
++
++ Prepare node for removal. Last zput() will finish
++ with it.
++ */ ++ ZF_SET(real_node, JNODE_HEARD_BANSHEE); ++ } ++ if (node->free) { ++ assert("nikita-2177", ++ list_empty_careful(&node->lock_handle.locks_link)); ++ assert("nikita-2112", ++ list_empty_careful(&node->lock_handle.owners_link)); ++ reiser4_pool_free(&level->pool->node_pool, ++ &node->header); ++ } ++ } ++} ++ ++/* fatal_carry_error() - all-catching error handling function ++ ++ It is possible that carry faces unrecoverable error, like unability to ++ insert pointer at the internal level. Our simple solution is just panic in ++ this situation. More sophisticated things like attempt to remount ++ file-system as read-only can be implemented without much difficlties. ++ ++ It is believed, that: ++ ++ 1. in stead of panicking, all current transactions can be aborted rolling ++ system back to the consistent state. ++ ++Umm, if you simply panic without doing anything more at all, then all current ++transactions are aborted and the system is rolled back to a consistent state, ++by virtue of the design of the transactional mechanism. Well, wait, let's be ++precise. If an internal node is corrupted on disk due to hardware failure, ++then there may be no consistent state that can be rolled back to, so instead ++we should say that it will rollback the transactions, which barring other ++factors means rolling back to a consistent state. ++ ++# Nikita: there is a subtle difference between panic and aborting ++# transactions: machine doesn't reboot. Processes aren't killed. Processes ++# don't using reiser4 (not that we care about such processes), or using other ++# reiser4 mounts (about them we do care) will simply continue to run. With ++# some luck, even application using aborted file system can survive: it will ++# get some error, like EBADF, from each file descriptor on failed file system, ++# but applications that do care about tolerance will cope with this (squid ++# will). ++ ++It would be a nice feature though to support rollback without rebooting ++followed by remount, but this can wait for later versions. ++ ++ 2. once isolated transactions will be implemented it will be possible to ++ roll back offending transaction. ++ ++2. is additional code complexity of inconsistent value (it implies that a ++broken tree should be kept in operation), so we must think about it more ++before deciding if it should be done. -Hans ++ ++*/ ++static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level ++ * where ++ * unrecoverable ++ * error ++ * occurred */ , ++ int ecode/* error code */) ++{ ++ assert("nikita-1230", doing != NULL); ++ assert("nikita-1231", ecode < 0); ++ ++ reiser4_panic("nikita-1232", "Carry failed: %i", ecode); ++} ++ ++/** ++ * Add new root to the tree ++ * ++ * This function itself only manages changes in carry structures and delegates ++ * all hard work (allocation of znode for new root, changes of parent and ++ * sibling pointers) to the reiser4_add_tree_root(). ++ * ++ * Locking: old tree root is locked by carry at this point. Fake znode is also ++ * locked. ++ */ ++static int add_new_root(carry_level * level,/* carry level in context of which ++ * operation is performed */ ++ carry_node * node, /* carry node for existing root */ ++ znode * fake /* "fake" znode already locked by ++ * us */) ++{ ++ int result; ++ ++ assert("nikita-1104", level != NULL); ++ assert("nikita-1105", node != NULL); ++ ++ assert("nikita-1403", znode_is_write_locked(node->node)); ++ assert("nikita-1404", znode_is_write_locked(fake)); ++ ++ /* trying to create new root. 
*/ ++ /* @node is root and it's already locked by us. This ++ means that nobody else can be trying to add/remove ++ tree root right now. ++ */ ++ if (level->new_root == NULL) ++ level->new_root = reiser4_add_tree_root(node->node, fake); ++ if (!IS_ERR(level->new_root)) { ++ assert("nikita-1210", znode_is_root(level->new_root)); ++ node->deallocate = 1; ++ result = ++ longterm_lock_znode(&node->lock_handle, level->new_root, ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); ++ if (result == 0) ++ zput(level->new_root); ++ } else { ++ result = PTR_ERR(level->new_root); ++ level->new_root = NULL; ++ } ++ return result; ++} ++ ++/* allocate new znode and add the operation that inserts the ++ pointer to it into the parent node into the todo level ++ ++ Allocate new znode, add it into carry queue and post into @todo queue ++ request to add pointer to new node into its parent. ++ ++ This is carry related routing that calls reiser4_new_node() to allocate new ++ node. ++*/ ++carry_node *add_new_znode(znode * brother /* existing left neighbor of new ++ * node */ , ++ carry_node * ref /* carry node after which new ++ * carry node is to be inserted ++ * into queue. This affects ++ * locking. */ , ++ carry_level * doing /* carry queue where new node is ++ * to be added */ , ++ carry_level * todo /* carry queue where COP_INSERT ++ * operation to add pointer to ++ * new node will ne added */ ) ++{ ++ carry_node *fresh; ++ znode *new_znode; ++ carry_op *add_pointer; ++ carry_plugin_info info; ++ ++ assert("nikita-1048", brother != NULL); ++ assert("nikita-1049", todo != NULL); ++ ++ /* There is a lot of possible variations here: to what parent ++ new node will be attached and where. For simplicity, always ++ do the following: ++ ++ (1) new node and @brother will have the same parent. ++ ++ (2) new node is added on the right of @brother ++ ++ */ ++ ++ fresh = reiser4_add_carry_skip(doing, ++ ref ? POOLO_AFTER : POOLO_LAST, ref); ++ if (IS_ERR(fresh)) ++ return fresh; ++ ++ fresh->deallocate = 1; ++ fresh->free = 1; ++ ++ new_znode = reiser4_new_node(brother, znode_get_level(brother)); ++ if (IS_ERR(new_znode)) ++ /* @fresh will be deallocated automatically by error ++ handling code in the caller. */ ++ return (carry_node *) new_znode; ++ ++ /* new_znode returned znode with x_count 1. Caller has to decrease ++ it. make_space() does. */ ++ ++ ZF_SET(new_znode, JNODE_ORPHAN); ++ fresh->node = new_znode; ++ ++ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) { ++ ref = carry_node_prev(ref); ++ assert("nikita-1606", !carry_node_end(doing, ref)); ++ } ++ ++ info.todo = todo; ++ info.doing = doing; ++ add_pointer = node_post_carry(&info, COP_INSERT, ++ reiser4_carry_real(ref), 1); ++ if (IS_ERR(add_pointer)) { ++ /* no need to deallocate @new_znode here: it will be ++ deallocated during carry error handling. */ ++ return (carry_node *) add_pointer; ++ } ++ ++ add_pointer->u.insert.type = COPT_CHILD; ++ add_pointer->u.insert.child = fresh; ++ add_pointer->u.insert.brother = brother; ++ /* initially new node spawns empty key range */ ++ write_lock_dk(znode_get_tree(brother)); ++ znode_set_ld_key(new_znode, ++ znode_set_rd_key(new_znode, ++ znode_get_rd_key(brother))); ++ write_unlock_dk(znode_get_tree(brother)); ++ return fresh; ++} ++ ++/* DEBUGGING FUNCTIONS. ++ ++ Probably we also should leave them on even when ++ debugging is turned off to print dumps at errors. 
++*/ ++#if REISER4_DEBUG ++static int carry_level_invariant(carry_level * level, carry_queue_state state) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ ++ if (level == NULL) ++ return 0; ++ ++ if (level->track_type != 0 && ++ level->track_type != CARRY_TRACK_NODE && ++ level->track_type != CARRY_TRACK_CHANGE) ++ return 0; ++ ++ /* check that nodes are in ascending order */ ++ for_all_nodes(level, node, tmp_node) { ++ znode *left; ++ znode *right; ++ ++ reiser4_key lkey; ++ reiser4_key rkey; ++ ++ if (node != carry_node_front(level)) { ++ if (state == CARRY_TODO) { ++ right = node->node; ++ left = carry_node_prev(node)->node; ++ } else { ++ right = reiser4_carry_real(node); ++ left = reiser4_carry_real(carry_node_prev(node)); ++ } ++ if (right == NULL || left == NULL) ++ continue; ++ if (node_is_empty(right) || node_is_empty(left)) ++ continue; ++ if (!keyle(leftmost_key_in_node(left, &lkey), ++ leftmost_key_in_node(right, &rkey))) { ++ warning("", "wrong key order"); ++ return 0; ++ } ++ } ++ } ++ return 1; ++} ++#endif ++ ++/* get symbolic name for boolean */ ++static const char *tf(int boolean/* truth value */) ++{ ++ return boolean ? "t" : "f"; ++} ++ ++/* symbolic name for carry operation */ ++static const char *carry_op_name(carry_opcode op/* carry opcode */) ++{ ++ switch (op) { ++ case COP_INSERT: ++ return "COP_INSERT"; ++ case COP_DELETE: ++ return "COP_DELETE"; ++ case COP_CUT: ++ return "COP_CUT"; ++ case COP_PASTE: ++ return "COP_PASTE"; ++ case COP_UPDATE: ++ return "COP_UPDATE"; ++ case COP_EXTENT: ++ return "COP_EXTENT"; ++ case COP_INSERT_FLOW: ++ return "COP_INSERT_FLOW"; ++ default:{ ++ /* not mt safe, but who cares? */ ++ static char buf[20]; ++ ++ sprintf(buf, "unknown op: %x", op); ++ return buf; ++ } ++ } ++} ++ ++/* dump information about carry node */ ++static void print_carry(const char *prefix /* prefix to print */ , ++ carry_node * node/* node to print */) ++{ ++ if (node == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ printk ++ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n", ++ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock), ++ tf(node->free), tf(node->deallocate)); ++} ++ ++/* dump information about carry operation */ ++static void print_op(const char *prefix /* prefix to print */ , ++ carry_op * op/* operation to print */) ++{ ++ if (op == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op)); ++ print_carry("\tnode", op->node); ++ switch (op->op) { ++ case COP_INSERT: ++ case COP_PASTE: ++ print_coord("\tcoord", ++ op->u.insert.d ? op->u.insert.d->coord : NULL, 0); ++ reiser4_print_key("\tkey", ++ op->u.insert.d ? 
op->u.insert.d->key : NULL); ++ print_carry("\tchild", op->u.insert.child); ++ break; ++ case COP_DELETE: ++ print_carry("\tchild", op->u.delete.child); ++ break; ++ case COP_CUT: ++ if (op->u.cut_or_kill.is_cut) { ++ print_coord("\tfrom", ++ op->u.cut_or_kill.u.kill->params.from, 0); ++ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to, ++ 0); ++ } else { ++ print_coord("\tfrom", ++ op->u.cut_or_kill.u.cut->params.from, 0); ++ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to, ++ 0); ++ } ++ break; ++ case COP_UPDATE: ++ print_carry("\tleft", op->u.update.left); ++ break; ++ default: ++ /* do nothing */ ++ break; ++ } ++} ++ ++/* dump information about all nodes and operations in a @level */ ++static void print_level(const char *prefix /* prefix to print */ , ++ carry_level * level/* level to print */) ++{ ++ carry_node *node; ++ carry_node *tmp_node; ++ carry_op *op; ++ carry_op *tmp_op; ++ ++ if (level == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ printk("%s: %p, restartable: %s\n", ++ prefix, level, tf(level->restartable)); ++ ++ for_all_nodes(level, node, tmp_node) ++ print_carry("\tcarry node", node); ++ for_all_ops(level, op, tmp_op) ++ print_op("\tcarry op", op); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/carry.h linux-2.6.33/fs/reiser4/carry.h +--- linux-2.6.33.orig/fs/reiser4/carry.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/carry.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,445 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* Functions and data types to "carry" tree modification(s) upward. ++ See fs/reiser4/carry.c for details. */ ++ ++#if !defined(__FS_REISER4_CARRY_H__) ++#define __FS_REISER4_CARRY_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "pool.h" ++#include "znode.h" ++ ++#include <linux/types.h> ++ ++/* &carry_node - "location" of carry node. ++ ++ "location" of a node that is involved or going to be involved in the ++ carry process. The node where an operation will be carried to on the ++ parent level cannot be recorded explicitly. The operation will usually ++ be carried to the parent of some node (where changes are performed at ++ the current level) or to the left neighbor of its parent. But while ++ modifications are performed at the current level, the parent may ++ change. So, we have to allow some indirection (or, positively, ++ flexibility) in locating carry nodes. ++ ++*/ ++typedef struct carry_node { ++ /* pool linkage */ ++ struct reiser4_pool_header header; ++ ++ /* base node from which real_node is calculated. See ++ fs/reiser4/carry.c:lock_carry_node(). 
*/ ++ znode *node; ++ ++ /* how to get ->real_node */ ++ /* to get ->real_node obtain parent of ->node */ ++ __u32 parent:1; ++ /* to get ->real_node obtain left neighbor of parent of ++ ->node */ ++ __u32 left:1; ++ __u32 left_before:1; ++ ++ /* locking */ ++ ++ /* this node was locked by carry process and should be ++ unlocked when carry leaves a level */ ++ __u32 unlock:1; ++ ++ /* disk block for this node was allocated by carry process and ++ should be deallocated when carry leaves a level */ ++ __u32 deallocate:1; ++ /* this carry node was allocated by carry process and should be ++ freed when carry leaves a level */ ++ __u32 free:1; ++ ++ /* type of lock we want to take on this node */ ++ lock_handle lock_handle; ++} carry_node; ++ ++/* &carry_opcode - elementary operations that can be carried upward ++ ++ Operations that carry() can handle. This list is supposed to be ++ expanded. ++ ++ Each carry operation (cop) is handled by an appropriate function defined ++ in fs/reiser4/carry.c. For example COP_INSERT is handled by ++ fs/reiser4/carry.c:carry_insert() etc. These functions in turn ++ call plugins of nodes affected by operation to modify nodes' content ++ and to gather operations to be performed on the next level. ++ ++*/ ++typedef enum { ++ /* insert new item into node. */ ++ COP_INSERT, ++ /* delete pointer from parent node */ ++ COP_DELETE, ++ /* remove part of or whole node. */ ++ COP_CUT, ++ /* increase size of item. */ ++ COP_PASTE, ++ /* insert extent (that is, a sequence of unformatted nodes). */ ++ COP_EXTENT, ++ /* update delimiting key in least common ancestor of two ++ nodes. This is performed when items are moved between two ++ nodes. ++ */ ++ COP_UPDATE, ++ /* insert flow */ ++ COP_INSERT_FLOW, ++ COP_LAST_OP, ++} carry_opcode; ++ ++#define CARRY_FLOW_NEW_NODES_LIMIT 20 ++ ++/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target ++ item is determined. */ ++typedef enum { ++ /* target item is one containing pointer to the ->child node */ ++ COPT_CHILD, ++ /* target item is given explicitly by @coord */ ++ COPT_ITEM_DATA, ++ /* target item is given by key */ ++ COPT_KEY, ++ /* see insert_paste_common() for more comments on this. */ ++ COPT_PASTE_RESTARTED, ++} cop_insert_pos_type; ++ ++/* flags to cut and delete */ ++typedef enum { ++ /* don't kill node even if it became completely empty as a result of ++ * cut. This is needed for eottl handling. See carry_extent() for ++ * details. */ ++ DELETE_RETAIN_EMPTY = (1 << 0) ++} cop_delete_flag; ++ ++/* ++ * carry() implements "lock handle tracking" feature. ++ * ++ * Callers supply carry with the node where to perform the initial operation ++ * and a lock handle on this node. Trying to optimize node utilization, carry ++ * may actually move the insertion point to a different node. Callers expect ++ * that the lock handle will be transferred to the new node also. ++ * ++ */ ++typedef enum { ++ /* transfer lock handle along with insertion point */ ++ CARRY_TRACK_CHANGE = 1, ++ /* acquire new lock handle to the node where insertion point is. This ++ * is used when carry() client doesn't initially possess lock handle ++ * on the insertion point node, for example, by extent insertion ++ * code. See carry_extent(). 
*/ ++ CARRY_TRACK_NODE = 2 ++} carry_track_type; ++ ++/* data supplied to COP_{INSERT|PASTE} by callers */ ++typedef struct carry_insert_data { ++ /* position where new item is to be inserted */ ++ coord_t *coord; ++ /* new item description */ ++ reiser4_item_data * data; ++ /* key of new item */ ++ const reiser4_key * key; ++} carry_insert_data; ++ ++/* cut and kill are similar, so carry_cut_data and carry_kill_data share the ++ below structure of parameters */ ++struct cut_kill_params { ++ /* coord where cut starts (inclusive) */ ++ coord_t *from; ++ /* coord where cut stops (inclusive, this item/unit will also be ++ * cut) */ ++ coord_t *to; ++ /* starting key. This is necessary when item and unit pos don't ++ * uniquely identify what portion of the tree to remove. For example, ++ * this indicates what portion of extent unit will be affected. */ ++ const reiser4_key * from_key; ++ /* exclusive stop key */ ++ const reiser4_key * to_key; ++ /* if this is not NULL, smallest actually removed key is stored ++ * here. */ ++ reiser4_key *smallest_removed; ++ /* kill_node_content() is called for file truncate */ ++ int truncate; ++}; ++ ++struct carry_cut_data { ++ struct cut_kill_params params; ++}; ++ ++struct carry_kill_data { ++ struct cut_kill_params params; ++ /* parameter to be passed to the ->kill_hook() method of item ++ * plugin */ ++ /*void *iplug_params; *//* FIXME: unused currently */ ++ /* if not NULL---inode whose items are being removed. This is needed ++ * for ->kill_hook() of extent item to update VM structures when ++ * removing pages. */ ++ struct inode *inode; ++ /* sibling list maintenance is complicated by existence of eottl. When ++ * eottl whose left and right neighbors are formatted leaves is ++ * removed, one has to connect said leaves in the sibling list. This ++ * cannot be done when extent removal is just started as locking rules ++ * require sibling list update to happen atomically with removal of ++ * extent item. Therefore: 1. pointers to left and right neighbors ++ * have to be passed down to the ->kill_hook() of extent item, and ++ * 2. said neighbors have to be locked. */ ++ lock_handle *left; ++ lock_handle *right; ++ /* flags modifying behavior of kill. Currently, it may have ++ DELETE_RETAIN_EMPTY set. */ ++ unsigned flags; ++ char *buf; ++}; ++ ++/* &carry_tree_op - operation to "carry" upward. ++ ++ Description of an operation we want to "carry" to the upper level of ++ a tree: e.g., when we insert something and there is not enough space ++ we allocate a new node and "carry" the operation of inserting a ++ pointer to the new node to the upper level; on removal of an empty node, ++ we carry up the operation of removing the appropriate entry from the ++ parent. ++ ++ There are two types of carry ops: when adding or deleting a node, the ++ node at the parent level where the appropriate modification has to be ++ performed is known in advance. When shifting items between nodes ++ (split, merge), the delimiting key should be changed in the least common ++ parent of the nodes involved, which is not known in advance. ++ ++ For the operations of the first type we store in &carry_op a pointer to ++ the &carry_node at the parent level. For the operations of the second ++ type we store the &carry_node for the parents of the left and right nodes ++ modified and keep track of them upward until they coincide.
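
The "track upward until they coincide" rule is, at heart, a lowest-common-ancestor walk. A minimal standalone sketch of that idea (toy_node is an invented stand-in, not reiser4 API; real znodes carry locks, reference counts and delimiting keys on top of this, and both nodes are assumed to be in one tree):

	#include <stddef.h>

	struct toy_node {
		struct toy_node *parent;	/* NULL for the root */
		int level;			/* leaf level is 1, grows upward */
	};

	/* Ascend from @a and @b until both chains meet: first equalize
	   levels, then climb in lockstep. The node returned is where a
	   COP_UPDATE of the delimiting key would be applied. */
	static struct toy_node *least_common_parent(struct toy_node *a,
						    struct toy_node *b)
	{
		while (a->level < b->level)
			a = a->parent;
		while (b->level < a->level)
			b = b->parent;
		while (a != b) {
			a = a->parent;
			b = b->parent;
		}
		return a;
	}
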
++ ++*/ ++typedef struct carry_op { ++ /* pool linkage */ ++ struct reiser4_pool_header header; ++ carry_opcode op; ++ /* node on which operation is to be performed: ++ ++ for insert, paste: node where new item is to be inserted ++ ++ for delete: node where pointer is to be deleted ++ ++ for cut: node to cut from ++ ++ for update: node where delimiting key is to be modified ++ ++ for modify: parent of modified node ++ ++ */ ++ carry_node *node; ++ union { ++ struct { ++ /* (sub-)type of insertion/paste. Taken from ++ cop_insert_pos_type. */ ++ __u8 type; ++ /* various operation flags. Taken from ++ cop_insert_flag. */ ++ __u8 flags; ++ carry_insert_data *d; ++ carry_node *child; ++ znode *brother; ++ } insert, paste, extent; ++ ++ struct { ++ int is_cut; ++ union { ++ carry_kill_data *kill; ++ carry_cut_data *cut; ++ } u; ++ } cut_or_kill; ++ ++ struct { ++ carry_node *left; ++ } update; ++ struct { ++ /* changed child */ ++ carry_node *child; ++ /* bitmask of changes. See &cop_modify_flag */ ++ __u32 flag; ++ } modify; ++ struct { ++ /* flags to deletion operation. Are taken from ++ cop_delete_flag */ ++ __u32 flags; ++ /* child to delete from parent. If this is ++ NULL, delete op->node. */ ++ carry_node *child; ++ } delete; ++ struct { ++ /* various operation flags. Taken from ++ cop_insert_flag. */ ++ __u32 flags; ++ flow_t *flow; ++ coord_t *insert_point; ++ reiser4_item_data *data; ++ /* flow insertion is limited by the number of new blocks ++ added in that operation which do not get any data ++ but a part of the flow. This limit is set by macro ++ CARRY_FLOW_NEW_NODES_LIMIT. This field stores the number ++ of nodes already added during one carry_flow */ ++ int new_nodes; ++ } insert_flow; ++ } u; ++} carry_op; ++ ++/* &carry_op_pool - preallocated pool of carry operations and nodes */ ++typedef struct carry_pool { ++ carry_op op[CARRIES_POOL_SIZE]; ++ struct reiser4_pool op_pool; ++ carry_node node[NODES_LOCKED_POOL_SIZE]; ++ struct reiser4_pool node_pool; ++} carry_pool; ++ ++/* &carry_tree_level - carry process on given level ++ ++ Description of balancing process on the given level. ++ ++ No need for locking here, as carry_tree_level is essentially a per-thread ++ thing (for now). ++ ++*/ ++struct carry_level { ++ /* this level may be restarted */ ++ __u32 restartable:1; ++ /* list of carry nodes on this level, ordered by key order */ ++ struct list_head nodes; ++ struct list_head ops; ++ /* pool where new objects are allocated from */ ++ carry_pool *pool; ++ int ops_num; ++ int nodes_num; ++ /* new root created on this level, if any */ ++ znode *new_root; ++ /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.) ++ when they want ->tracked to automagically wander to the node where ++ the insertion point moved after insert or paste. ++ */ ++ carry_track_type track_type; ++ /* lock handle supplied by user that we are tracking. See ++ above.
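
carry_pool preallocates every carry_op and carry_node a balancing pass may need, so no memory allocation can fail in the middle of a carry. A simplified model of that idea (the real reiser4_pool additionally keeps free/used lists and per-object headers; all names below are invented for illustration):

	#include <stddef.h>

	enum { TOY_POOL_SIZE = 64 };	/* invented; cf. CARRIES_POOL_SIZE */

	struct toy_op {
		int opcode;
		int in_use;
	};

	/* all objects live in one preallocated array */
	struct toy_pool {
		struct toy_op slot[TOY_POOL_SIZE];
	};

	/* "allocation" is finding a free slot; it can only fail when the
	   pass was under-provisioned, in which case carry must restart */
	static struct toy_op *toy_pool_alloc(struct toy_pool *pool)
	{
		size_t i;

		for (i = 0; i < TOY_POOL_SIZE; i++)
			if (!pool->slot[i].in_use) {
				pool->slot[i].in_use = 1;
				return &pool->slot[i];
			}
		return NULL;
	}

	static void toy_pool_free(struct toy_op *op)
	{
		op->in_use = 0;
	}
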
*/ ++ lock_handle *tracked; ++}; ++ ++/* information carry passes to plugin methods that may add new operations to ++ the @todo queue */ ++struct carry_plugin_info { ++ carry_level *doing; ++ carry_level *todo; ++}; ++ ++int reiser4_carry(carry_level * doing, carry_level * done); ++ ++carry_node *reiser4_add_carry(carry_level * level, pool_ordering order, ++ carry_node * reference); ++carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order, ++ carry_node * reference); ++ ++extern carry_node *insert_carry_node(carry_level * doing, ++ carry_level * todo, const znode * node); ++ ++extern carry_pool *init_carry_pool(int); ++extern void done_carry_pool(carry_pool * pool); ++ ++extern void init_carry_level(carry_level * level, carry_pool * pool); ++ ++extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op, ++ znode * node, int apply_to_parent); ++extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, ++ znode * node, int apply_to_parent_p); ++ ++carry_node *add_new_znode(znode * brother, carry_node * reference, ++ carry_level * doing, carry_level * todo); ++ ++carry_node *find_carry_node(carry_level * level, const znode * node); ++ ++extern znode *reiser4_carry_real(const carry_node * node); ++ ++/* helper macros to iterate over carry queues */ ++ ++#define carry_node_next(node) \ ++ list_entry((node)->header.level_linkage.next, carry_node, \ ++ header.level_linkage) ++ ++#define carry_node_prev(node) \ ++ list_entry((node)->header.level_linkage.prev, carry_node, \ ++ header.level_linkage) ++ ++#define carry_node_front(level) \ ++ list_entry((level)->nodes.next, carry_node, header.level_linkage) ++ ++#define carry_node_back(level) \ ++ list_entry((level)->nodes.prev, carry_node, header.level_linkage) ++ ++#define carry_node_end(level, node) \ ++ (&(level)->nodes == &(node)->header.level_linkage) ++ ++/* macro to iterate over all operations in a @level */ ++#define for_all_ops(level /* carry level (of type carry_level *) */, \ ++ op /* pointer to carry operation, modified by loop (of \ ++ * type carry_op *) */, \ ++ tmp /* pointer to carry operation (of type carry_op *), \ ++ * used to make iterator stable in the face of \ ++ * deletions from the level */ ) \ ++for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \ ++ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \ ++ &op->header.level_linkage != &level->ops; \ ++ op = tmp, \ ++ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage)) ++ ++#if 0 ++for (op = (carry_op *) pool_level_list_front(&level->ops), \ ++ tmp = (carry_op *) pool_level_list_next(&op->header) ; \ ++ !pool_level_list_end(&level->ops, &op->header) ; \ ++ op = tmp, tmp = (carry_op *) pool_level_list_next(&op->header)) ++#endif ++ ++/* macro to iterate over all nodes in a @level */ \ ++#define for_all_nodes(level /* carry level (of type carry_level *) */, \ ++ node /* pointer to carry node, modified by loop (of \ ++ * type carry_node *) */, \ ++ tmp /* pointer to carry node (of type carry_node *), \ ++ * used to make iterator stable in the face of * \ ++ * deletions from the level */ ) \ ++for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \ ++ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \ ++ &node->header.level_linkage != &level->nodes; \ ++ node = tmp, \ ++ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage)) ++ ++#if 0 ++for (node = 
carry_node_front(level), \ ++ tmp = carry_node_next(node) ; !carry_node_end(level, node) ; \ ++ node = tmp, tmp = carry_node_next(node)) ++#endif ++ ++/* macro to iterate over all nodes in a @level in reverse order ++ ++ This is used, because nodes are unlocked in reversed order of locking */ ++#define for_all_nodes_back(level /* carry level (of type carry_level *) */, \ ++ node /* pointer to carry node, modified by loop \ ++ * (of type carry_node *) */, \ ++ tmp /* pointer to carry node (of type carry_node \ ++ * *), used to make iterator stable in the \ ++ * face of deletions from the level */ ) \ ++for (node = carry_node_back(level), \ ++ tmp = carry_node_prev(node) ; !carry_node_end(level, node) ; \ ++ node = tmp, tmp = carry_node_prev(node)) ++ ++/* __FS_REISER4_CARRY_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/carry_ops.c linux-2.6.33/fs/reiser4/carry_ops.c +--- linux-2.6.33.orig/fs/reiser4/carry_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/carry_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,2132 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* implementation of carry operations */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "pool.h" ++#include "tree_mod.h" ++#include "carry.h" ++#include "carry_ops.h" ++#include "tree.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/types.h> ++#include <linux/err.h> ++ ++static int carry_shift_data(sideof side, coord_t *insert_coord, znode * node, ++ carry_level * doing, carry_level * todo, ++ unsigned int including_insert_coord_p); ++ ++extern int lock_carry_node(carry_level * level, carry_node * node); ++extern int lock_carry_node_tail(carry_node * node); ++ ++/* find left neighbor of a carry node ++ ++ Look for left neighbor of @node and add it to the @doing queue. See ++ comments in the body. ++ ++*/ ++static carry_node *find_left_neighbor(carry_op * op /* node to find left ++ * neighbor of */ , ++ carry_level * doing/* level to scan */) ++{ ++ int result; ++ carry_node *node; ++ carry_node *left; ++ int flags; ++ reiser4_tree *tree; ++ ++ node = op->node; ++ ++ tree = current_tree; ++ read_lock_tree(tree); ++ /* first, check whether left neighbor is already in a @doing queue */ ++ if (reiser4_carry_real(node)->left != NULL) { ++ /* NOTE: there is locking subtlety here. Look into ++ * find_right_neighbor() for more info */ ++ if (find_carry_node(doing, ++ reiser4_carry_real(node)->left) != NULL) { ++ read_unlock_tree(tree); ++ left = node; ++ do { ++ left = list_entry(left->header.level_linkage.prev, ++ carry_node, header.level_linkage); ++ assert("nikita-3408", !carry_node_end(doing, ++ left)); ++ } while (reiser4_carry_real(left) == ++ reiser4_carry_real(node)); ++ return left; ++ } ++ } ++ read_unlock_tree(tree); ++ ++ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node); ++ if (IS_ERR(left)) ++ return left; ++ ++ left->node = node->node; ++ left->free = 1; ++ ++ flags = GN_TRY_LOCK; ++ if (!(op->u.insert.flags & COPI_LOAD_LEFT)) ++ flags |= GN_NO_ALLOC; ++ ++ /* then, feeling lucky, peek left neighbor in the cache. 
*/ ++ result = reiser4_get_left_neighbor(&left->lock_handle, ++ reiser4_carry_real(node), ++ ZNODE_WRITE_LOCK, flags); ++ if (result == 0) { ++ /* ok, node found and locked. */ ++ result = lock_carry_node_tail(left); ++ if (result != 0) ++ left = ERR_PTR(result); ++ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) { ++ /* node is leftmost node in a tree, or neighbor wasn't in ++ cache, or there is an extent on the left. */ ++ reiser4_pool_free(&doing->pool->node_pool, &left->header); ++ left = NULL; ++ } else if (doing->restartable) { ++ /* if left neighbor is locked, and level is restartable, add ++ new node to @doing and restart. */ ++ assert("nikita-913", node->parent != 0); ++ assert("nikita-914", node->node != NULL); ++ left->left = 1; ++ left->free = 0; ++ left = ERR_PTR(-E_REPEAT); ++ } else { ++ /* left neighbor is locked, level cannot be restarted. Just ++ ignore left neighbor. */ ++ reiser4_pool_free(&doing->pool->node_pool, &left->header); ++ left = NULL; ++ } ++ return left; ++} ++ ++/* find right neighbor of a carry node ++ ++ Look for right neighbor of @node and add it to the @doing queue. See ++ comments in the body. ++ ++*/ ++static carry_node *find_right_neighbor(carry_op * op /* node to find right ++ * neighbor of */ , ++ carry_level * doing/* level to scan */) ++{ ++ int result; ++ carry_node *node; ++ carry_node *right; ++ lock_handle lh; ++ int flags; ++ reiser4_tree *tree; ++ ++ init_lh(&lh); ++ ++ node = op->node; ++ ++ tree = current_tree; ++ read_lock_tree(tree); ++ /* first, check whether right neighbor is already in a @doing queue */ ++ if (reiser4_carry_real(node)->right != NULL) { ++ /* ++ * Tree lock is taken here anyway, because, even if _outcome_ ++ * of (find_carry_node() != NULL) doesn't depends on ++ * concurrent updates to ->right, find_carry_node() cannot ++ * work with second argument NULL. Hence, following comment is ++ * of historic importance only. ++ * ++ * Subtle: ++ * ++ * Q: why don't we need tree lock here, looking for the right ++ * neighbor? ++ * ++ * A: even if value of node->real_node->right were changed ++ * during find_carry_node() execution, outcome of execution ++ * wouldn't change, because (in short) other thread cannot add ++ * elements to the @doing, and if node->real_node->right ++ * already was in @doing, value of node->real_node->right ++ * couldn't change, because node cannot be inserted between ++ * locked neighbors. ++ */ ++ if (find_carry_node(doing, ++ reiser4_carry_real(node)->right) != NULL) { ++ read_unlock_tree(tree); ++ /* ++ * What we are doing here (this is also applicable to ++ * the find_left_neighbor()). ++ * ++ * tree_walk.c code requires that insertion of a ++ * pointer to a child, modification of parent pointer ++ * in the child, and insertion of the child into ++ * sibling list are atomic (see ++ * plugin/item/internal.c:create_hook_internal()). ++ * ++ * carry allocates new node long before pointer to it ++ * is inserted into parent and, actually, long before ++ * parent is even known. Such allocated-but-orphaned ++ * nodes are only trackable through carry level lists. ++ * ++ * Situation that is handled here is following: @node ++ * has valid ->right pointer, but there is ++ * allocated-but-orphaned node in the carry queue that ++ * is logically between @node and @node->right. Here ++ * we are searching for it. 
Critical point is that ++ * this is only possible if @node->right is also in ++ * the carry queue (this is checked above), because ++ * this is the only way new orphaned node could be ++ * inserted between them (before inserting new node, ++ * make_space() first tries to shift to the right, so, ++ * right neighbor will be locked and queued). ++ * ++ */ ++ right = node; ++ do { ++ right = list_entry(right->header.level_linkage.next, ++ carry_node, header.level_linkage); ++ assert("nikita-3408", !carry_node_end(doing, ++ right)); ++ } while (reiser4_carry_real(right) == ++ reiser4_carry_real(node)); ++ return right; ++ } ++ } ++ read_unlock_tree(tree); ++ ++ flags = GN_CAN_USE_UPPER_LEVELS; ++ if (!(op->u.insert.flags & COPI_LOAD_RIGHT)) ++ flags = GN_NO_ALLOC; ++ ++ /* then, try to lock right neighbor */ ++ init_lh(&lh); ++ result = reiser4_get_right_neighbor(&lh, ++ reiser4_carry_real(node), ++ ZNODE_WRITE_LOCK, flags); ++ if (result == 0) { ++ /* ok, node found and locked. */ ++ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node); ++ if (!IS_ERR(right)) { ++ right->node = lh.node; ++ move_lh(&right->lock_handle, &lh); ++ right->free = 1; ++ result = lock_carry_node_tail(right); ++ if (result != 0) ++ right = ERR_PTR(result); ++ } ++ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) { ++ /* node is rightmost node in a tree, or neighbor wasn't in ++ cache, or there is an extent on the right. */ ++ right = NULL; ++ } else ++ right = ERR_PTR(result); ++ done_lh(&lh); ++ return right; ++} ++ ++/* how much free space in a @node is needed for @op ++ ++ How much space in @node is required for completion of @op, where @op is ++ insert or paste operation. ++*/ ++static unsigned int space_needed_for_op(znode * node /* znode data are ++ * inserted or ++ * pasted in */ , ++ carry_op * op /* carry ++ operation */ ) ++{ ++ assert("nikita-919", op != NULL); ++ ++ switch (op->op) { ++ default: ++ impossible("nikita-1701", "Wrong opcode"); ++ case COP_INSERT: ++ return space_needed(node, NULL, op->u.insert.d->data, 1); ++ case COP_PASTE: ++ return space_needed(node, op->u.insert.d->coord, ++ op->u.insert.d->data, 0); ++ } ++} ++ ++/* how much space in @node is required to insert or paste @data at ++ @coord. */ ++unsigned int space_needed(const znode * node /* node data are inserted or ++ * pasted in */ , ++ const coord_t *coord /* coord where data are ++ * inserted or pasted ++ * at */ , ++ const reiser4_item_data * data /* data to insert or ++ * paste */ , ++ int insertion/* non-0 is inserting, 0---paste */) ++{ ++ int result; ++ item_plugin *iplug; ++ ++ assert("nikita-917", node != NULL); ++ assert("nikita-918", node_plugin_by_node(node) != NULL); ++ assert("vs-230", !insertion || (coord == NULL)); ++ ++ result = 0; ++ iplug = data->iplug; ++ if (iplug->b.estimate != NULL) { ++ /* ask item plugin how much space is needed to insert this ++ item */ ++ result += iplug->b.estimate(insertion ? NULL : coord, data); ++ } else { ++ /* reasonable default */ ++ result += data->length; ++ } ++ if (insertion) { ++ node_plugin *nplug; ++ ++ nplug = node->nplug; ++ /* and add node overhead */ ++ if (nplug->item_overhead != NULL) ++ result += nplug->item_overhead(node, NULL); ++ } ++ return result; ++} ++ ++/* find &coord in parent where pointer to new child is to be stored. 
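
The estimate above, in one self-contained piece: ask the item plugin for its size when it provides an estimate hook, fall back to the raw data length, and charge the per-item node overhead only for a fresh insertion (a paste extends an existing item). All names below are toy stand-ins, not reiser4 API:

	#include <stddef.h>

	struct toy_item_data {
		size_t length;
		/* optional hook, may be NULL (cf. iplug->b.estimate) */
		size_t (*estimate)(const struct toy_item_data *data);
	};

	/* bytes of node space needed to insert (@insertion != 0) or
	   paste (@insertion == 0) @data into a node whose item header
	   costs @item_overhead bytes */
	static size_t toy_space_needed(const struct toy_item_data *data,
				       size_t item_overhead, int insertion)
	{
		size_t result;

		result = data->estimate != NULL ? data->estimate(data)
						: data->length;
		if (insertion)
			result += item_overhead;
		return result;
	}
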
*/ ++static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to ++ * insert pointer to new ++ * child */ ) ++{ ++ int result; ++ znode *node; ++ znode *child; ++ ++ assert("nikita-941", op != NULL); ++ assert("nikita-942", op->op == COP_INSERT); ++ ++ node = reiser4_carry_real(op->node); ++ assert("nikita-943", node != NULL); ++ assert("nikita-944", node_plugin_by_node(node) != NULL); ++ ++ child = reiser4_carry_real(op->u.insert.child); ++ result = ++ find_new_child_ptr(node, child, op->u.insert.brother, ++ op->u.insert.d->coord); ++ ++ build_child_ptr_data(child, op->u.insert.d->data); ++ return result; ++} ++ ++/* additional amount of free space in @node required to complete @op */ ++static int free_space_shortage(znode * node /* node to check */ , ++ carry_op * op/* operation being performed */) ++{ ++ assert("nikita-1061", node != NULL); ++ assert("nikita-1062", op != NULL); ++ ++ switch (op->op) { ++ default: ++ impossible("nikita-1702", "Wrong opcode"); ++ case COP_INSERT: ++ case COP_PASTE: ++ return space_needed_for_op(node, op) - znode_free_space(node); ++ case COP_EXTENT: ++ /* when inserting extent shift data around until insertion ++ point is utmost in the node. */ ++ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE) ++ return +1; ++ else ++ return -1; ++ } ++} ++ ++/* helper function: update node pointer in operation after insertion ++ point was probably shifted into @target. */ ++static znode *sync_op(carry_op * op, carry_node * target) ++{ ++ znode *insertion_node; ++ ++ /* reget node from coord: shift might move insertion coord to ++ the neighbor */ ++ insertion_node = op->u.insert.d->coord->node; ++ /* if insertion point was actually moved into new node, ++ update carry node pointer in operation. */ ++ if (insertion_node != reiser4_carry_real(op->node)) { ++ op->node = target; ++ assert("nikita-2540", ++ reiser4_carry_real(target) == insertion_node); ++ } ++ assert("nikita-2541", ++ reiser4_carry_real(op->node) == op->u.insert.d->coord->node); ++ return insertion_node; ++} ++ ++/* ++ * complete make_space() call: update tracked lock handle if necessary. See ++ * comments for fs/reiser4/carry.h:carry_track_type ++ */ ++static int ++make_space_tail(carry_op * op, carry_level * doing, znode * orig_node) ++{ ++ int result; ++ carry_track_type tracking; ++ znode *node; ++ ++ tracking = doing->track_type; ++ node = op->u.insert.d->coord->node; ++ ++ if (tracking == CARRY_TRACK_NODE || ++ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) { ++ /* inserting or pasting into node different from ++ original. Update lock handle supplied by caller. */ ++ assert("nikita-1417", doing->tracked != NULL); ++ done_lh(doing->tracked); ++ init_lh(doing->tracked); ++ result = longterm_lock_znode(doing->tracked, node, ++ ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_HIPRI); ++ } else ++ result = 0; ++ return result; ++} ++ ++/* This is insertion policy function. It shifts data to the left and right ++ neighbors of insertion coord and allocates new nodes until there is enough ++ free space to complete @op. ++ ++ See comments in the body. ++ ++ Assumes that the node format favors insertions at the right end of the node ++ as node40 does. 
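
The policy make_space() implements, reduced to its control flow: shift to the left neighbor, then to the right one, then allocate at most two fresh nodes. The callbacks below are stubs standing in for free_space_shortage(), carry_shift_data() and add_new_znode(); this is a sketch of the shape only, not of the locking or error handling:

	/* callbacks keep the sketch self-contained */
	struct toy_policy {
		int (*shortage)(void *node);	/* > 0 means more space needed */
		void *(*shift_left)(void *node);	/* both return the node */
		void *(*shift_right)(void *node);	/* holding the insert point */
		void *(*alloc_right)(void *node);	/* NULL when allocation fails */
	};

	enum { TOY_E_NODE_FULL = -1 };

	static int toy_make_space(const struct toy_policy *p, void *node)
	{
		int fresh;

		if (p->shortage(node) <= 0)
			return 0;
		node = p->shift_left(node);
		if (p->shortage(node) <= 0)
			return 0;
		node = p->shift_right(node);
		/* at most two new nodes per operation, as in make_space() */
		for (fresh = 0; fresh < 2 && p->shortage(node) > 0; fresh++) {
			void *right = p->alloc_right(node);

			if (right == NULL)
				return TOY_E_NODE_FULL;
			node = right;
		}
		return p->shortage(node) > 0 ? TOY_E_NODE_FULL : 0;
	}
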
++ ++ See carry_flow() for details about flow insertion ++*/ ++static int make_space(carry_op * op /* carry operation, insert or paste */ , ++ carry_level * doing /* current carry queue */ , ++ carry_level * todo/* carry queue on the parent level */) ++{ ++ znode *node; ++ int result; ++ int not_enough_space; ++ int blk_alloc; ++ znode *orig_node; ++ __u32 flags; ++ ++ coord_t *coord; ++ ++ assert("nikita-890", op != NULL); ++ assert("nikita-891", todo != NULL); ++ assert("nikita-892", ++ op->op == COP_INSERT || ++ op->op == COP_PASTE || op->op == COP_EXTENT); ++ assert("nikita-1607", ++ reiser4_carry_real(op->node) == op->u.insert.d->coord->node); ++ ++ flags = op->u.insert.flags; ++ ++ /* NOTE check that new node can only be allocated after checking left ++ * and right neighbors. This is necessary for proper work of ++ * find_{left,right}_neighbor(). */ ++ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE, ++ flags & COPI_DONT_SHIFT_LEFT)); ++ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE, ++ flags & COPI_DONT_SHIFT_RIGHT)); ++ ++ coord = op->u.insert.d->coord; ++ orig_node = node = coord->node; ++ ++ assert("nikita-908", node != NULL); ++ assert("nikita-909", node_plugin_by_node(node) != NULL); ++ ++ result = 0; ++ /* If there is not enough space in a node, try to shift something to ++ the left neighbor. This is a bit tricky, as locking to the left is ++ low priority. This is handled by restart logic in carry(). ++ */ ++ not_enough_space = free_space_shortage(node, op); ++ if (not_enough_space <= 0) ++ /* it is possible that carry was called when there actually ++ was enough space in the node. For example, when inserting ++ leftmost item so that delimiting keys have to be updated. ++ */ ++ return make_space_tail(op, doing, orig_node); ++ if (!(flags & COPI_DONT_SHIFT_LEFT)) { ++ carry_node *left; ++ /* make note in statistics of an attempt to move ++ something into the left neighbor */ ++ left = find_left_neighbor(op, doing); ++ if (unlikely(IS_ERR(left))) { ++ if (PTR_ERR(left) == -E_REPEAT) ++ return -E_REPEAT; ++ else { ++ /* some error other than restart request ++ occurred. This shouldn't happen. Issue a ++ warning and continue as if the left ++ neighbor didn't exist. ++ */ ++ warning("nikita-924", ++ "Error accessing left neighbor: %li", ++ PTR_ERR(left)); ++ } ++ } else if (left != NULL) { ++ ++ /* shift everything possible on the left of and ++ including insertion coord into the left neighbor */ ++ result = carry_shift_data(LEFT_SIDE, coord, ++ reiser4_carry_real(left), ++ doing, todo, ++ flags & COPI_GO_LEFT); ++ ++ /* reget node from coord: shift_left() might move ++ insertion coord to the left neighbor */ ++ node = sync_op(op, left); ++ ++ not_enough_space = free_space_shortage(node, op); ++ /* There is not enough free space in @node, but ++ maybe there is enough free space in ++ @left. Various balancing decisions are valid here. ++ The same holds for shifting to the right. ++ */ ++ } ++ } ++ /* If there still is not enough space, shift to the right */ ++ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) { ++ carry_node *right; ++ ++ right = find_right_neighbor(op, doing); ++ if (IS_ERR(right)) { ++ warning("nikita-1065", ++ "Error accessing right neighbor: %li", ++ PTR_ERR(right)); ++ } else if (right != NULL) { ++ /* node containing insertion point, and its right ++ neighbor node are write locked by now. 
++ ++ shift everything possible on the right of but ++ excluding insertion coord into the right neighbor ++ */ ++ result = carry_shift_data(RIGHT_SIDE, coord, ++ reiser4_carry_real(right), ++ doing, todo, ++ flags & COPI_GO_RIGHT); ++ /* reget node from coord: shift_right() might move ++ insertion coord to the right neighbor */ ++ node = sync_op(op, right); ++ not_enough_space = free_space_shortage(node, op); ++ } ++ } ++ /* If there is still not enough space, allocate new node(s). ++ ++ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in ++ the carry operation flags (currently this is needed during flush ++ only). ++ */ ++ for (blk_alloc = 0; ++ not_enough_space > 0 && result == 0 && blk_alloc < 2 && ++ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) { ++ carry_node *fresh; /* new node we are allocating */ ++ coord_t coord_shadow; /* remembered insertion point before ++ * shifting data into new node */ ++ carry_node *node_shadow; /* remembered insertion node ++ * before shifting */ ++ unsigned int gointo; /* whether insertion point should move ++ * into newly allocated node */ ++ ++ /* allocate new node on the right of @node. Znode and disk ++ fake block number for new node are allocated. ++ ++ add_new_znode() posts carry operation COP_INSERT with ++ COPT_CHILD option to the parent level to add ++ pointer to newly created node to its parent. ++ ++ Subtle point: if several new nodes are required to complete ++ insertion operation at this level, they will be inserted ++ into their parents in the order of creation, which means ++ that @node will be valid "cookie" at the time of insertion. ++ ++ */ ++ fresh = add_new_znode(node, op->node, doing, todo); ++ if (IS_ERR(fresh)) ++ return PTR_ERR(fresh); ++ ++ /* Try to shift into new node. */ ++ result = lock_carry_node(doing, fresh); ++ zput(reiser4_carry_real(fresh)); ++ if (result != 0) { ++ warning("nikita-947", ++ "Cannot lock new node: %i", result); ++ return result; ++ } ++ ++ /* both nodes are write locked by now. ++ ++ shift everything possible on the right of and ++ including insertion coord into the right neighbor. ++ */ ++ coord_dup(&coord_shadow, op->u.insert.d->coord); ++ node_shadow = op->node; ++ /* move insertion point into newly created node if: ++ ++ . insertion point is rightmost in the source node, or ++ . this is not the first node we are allocating in a row. ++ */ ++ gointo = ++ (blk_alloc > 0) || ++ coord_is_after_rightmost(op->u.insert.d->coord); ++ ++ if (gointo && ++ op->op == COP_PASTE && ++ coord_is_existing_item(op->u.insert.d->coord) && ++ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) { ++ /* paste into solid (atomic) item, which can contain ++ only one unit, so we need to shift it right, where ++ insertion point supposed to be */ ++ ++ assert("edward-1444", op->u.insert.d->data->iplug == ++ item_plugin_by_id(STATIC_STAT_DATA_ID)); ++ assert("edward-1445", ++ op->u.insert.d->data->length > ++ node_plugin_by_node(coord->node)->free_space ++ (coord->node)); ++ ++ op->u.insert.d->coord->between = BEFORE_UNIT; ++ } ++ ++ result = carry_shift_data(RIGHT_SIDE, coord, ++ reiser4_carry_real(fresh), ++ doing, todo, gointo); ++ /* if insertion point was actually moved into new node, ++ update carry node pointer in operation. */ ++ node = sync_op(op, fresh); ++ not_enough_space = free_space_shortage(node, op); ++ if ((not_enough_space > 0) && (node != coord_shadow.node)) { ++ /* there is not enough free in new node. 
Shift ++ insertion point back to the @shadow_node so that ++ next new node would be inserted between ++ @shadow_node and @fresh. ++ */ ++ coord_normalize(&coord_shadow); ++ coord_dup(coord, &coord_shadow); ++ node = coord->node; ++ op->node = node_shadow; ++ if (1 || (flags & COPI_STEP_BACK)) { ++ /* still not enough space?! Maybe there is ++ enough space in the source node (i.e., the ++ node data are moved from) now. ++ */ ++ not_enough_space = ++ free_space_shortage(node, op); ++ } ++ } ++ } ++ if (not_enough_space > 0) { ++ if (!(flags & COPI_DONT_ALLOCATE)) ++ warning("nikita-948", "Cannot insert new item"); ++ result = -E_NODE_FULL; ++ } ++ assert("nikita-1622", ergo(result == 0, ++ reiser4_carry_real(op->node) == coord->node)); ++ assert("nikita-2616", coord == op->u.insert.d->coord); ++ if (result == 0) ++ result = make_space_tail(op, doing, orig_node); ++ return result; ++} ++ ++/* insert_paste_common() - common part of insert and paste operations ++ ++ This function performs common part of COP_INSERT and COP_PASTE. ++ ++ There are three ways in which insertion/paste can be requested: ++ ++ . by directly supplying reiser4_item_data. In this case, op -> ++ u.insert.type is set to COPT_ITEM_DATA. ++ ++ . by supplying a child node, the pointer to which is to be inserted into ++ the parent. In this case op -> u.insert.type == COPT_CHILD. ++ ++ . by supplying key of new item/unit. This is currently only used during ++ extent insertion. ++ ++ This is required, because when new node is allocated we don't know at what ++ position pointer to it is to be stored in the parent. Actually, we don't ++ even know what its parent will be, because parent can be re-balanced ++ concurrently and new node re-parented, and because parent can be full and ++ pointer to the new node will go into some other node. ++ ++ insert_paste_common() resolves pointer to child node into position in the ++ parent by calling find_new_child_coord(), which fills ++ reiser4_item_data. After this, insertion/paste proceeds uniformly. ++ ++ Another complication is with finding free space during pasting. It may ++ happen that while shifting items to the neighbors and newly allocated ++ nodes, the insertion coord can no longer be in the item we wanted to paste ++ into. At this point, paste becomes (morphs) into insert. Moreover, free ++ space analysis has to be repeated, because amount of space required for ++ insertion is different from that of paste (item header overhead, etc). ++ ++ This function "unifies" different insertion modes (by resolving child ++ pointer or key into insertion coord), and then calls make_space() to free ++ enough space in the node by shifting data to the left and right and by ++ allocating new nodes if necessary. Carry operation knows amount of space ++ required for its completion. After enough free space is obtained, caller of ++ this function (carry_{insert,paste,etc.}) performs actual insertion/paste ++ by calling the item plugin method.
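
The three request forms just listed reduce to one dispatch: every "locator" is resolved to an in-node position before make_space() runs, and insertion then proceeds uniformly. Schematically (the lookup functions below are invented stubs, not reiser4 calls):

	/* mirrors cop_insert_pos_type above */
	enum toy_locator { TOY_BY_COORD, TOY_BY_KEY, TOY_BY_CHILD };

	struct toy_target {
		enum toy_locator how;
		int coord;		/* valid for TOY_BY_COORD */
		unsigned long key;	/* valid for TOY_BY_KEY */
		int child_id;		/* valid for TOY_BY_CHILD */
	};

	/* stand-ins for node_plugin->lookup() and find_new_child_coord() */
	static int toy_lookup_by_key(unsigned long key)
	{
		return (int)(key % 16);
	}

	static int toy_find_child_slot(int child_id)
	{
		return child_id % 16;
	}

	/* resolve any locator to one position: the unification step */
	static int toy_resolve(const struct toy_target *t)
	{
		switch (t->how) {
		case TOY_BY_COORD:
			return t->coord;
		case TOY_BY_KEY:
			return toy_lookup_by_key(t->key);
		case TOY_BY_CHILD:
			return toy_find_child_slot(t->child_id);
		}
		return -1;
	}
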
++ ++*/ ++static int insert_paste_common(carry_op * op /* carry operation being ++ * performed */ , ++ carry_level * doing /* current carry level */ , ++ carry_level * todo /* next carry level */ , ++ carry_insert_data * cdata /* pointer to ++ * cdata */ , ++ coord_t *coord /* insertion/paste coord */ , ++ reiser4_item_data * data /* data to be ++ * inserted/pasted */ ) ++{ ++ assert("nikita-981", op != NULL); ++ assert("nikita-980", todo != NULL); ++ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) ++ || (op->op == COP_EXTENT)); ++ ++ if (op->u.insert.type == COPT_PASTE_RESTARTED) { ++ /* nothing to do. Fall through to make_space(). */ ++ ; ++ } else if (op->u.insert.type == COPT_KEY) { ++ node_search_result intra_node; ++ znode *node; ++ /* Problem with doing batching at the lowest level, is that ++ operations here are given by coords where modification is ++ to be performed, and one modification can invalidate coords ++ of all following operations. ++ ++ So, we are implementing yet another type for operation that ++ will use (the only) "locator" stable across shifting of ++ data between nodes, etc.: key (COPT_KEY). ++ ++ This clause resolves key to the coord in the node. ++ ++ But node can change also. Probably some pieces have to be ++ added to the lock_carry_node(), to lock node by its key. ++ ++ */ ++ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain ++ if you need something else. */ ++ op->u.insert.d->coord = coord; ++ node = reiser4_carry_real(op->node); ++ intra_node = node_plugin_by_node(node)->lookup ++ (node, op->u.insert.d->key, FIND_EXACT, ++ op->u.insert.d->coord); ++ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) { ++ warning("nikita-1715", "Intra node lookup failure: %i", ++ intra_node); ++ return intra_node; ++ } ++ } else if (op->u.insert.type == COPT_CHILD) { ++ /* if we are asked to insert pointer to the child into ++ internal node, first convert pointer to the child into ++ coord within parent node. ++ */ ++ znode *child; ++ int result; ++ ++ op->u.insert.d = cdata; ++ op->u.insert.d->coord = coord; ++ op->u.insert.d->data = data; ++ op->u.insert.d->coord->node = reiser4_carry_real(op->node); ++ result = find_new_child_coord(op); ++ child = reiser4_carry_real(op->u.insert.child); ++ if (result != NS_NOT_FOUND) { ++ warning("nikita-993", ++ "Cannot find a place for child pointer: %i", ++ result); ++ return result; ++ } ++ /* This only happens when we did multiple insertions at ++ the previous level, trying to insert single item and ++ it so happened, that insertion of pointers to all new ++ nodes before this one already caused parent node to ++ split (may be several times). ++ ++ I am going to come up with better solution. ++ ++ You are not expected to understand this. ++ -- v6root/usr/sys/ken/slp.c ++ ++ Basically, what happens here is the following: carry came ++ to the parent level and is about to insert internal item ++ pointing to the child node that it just inserted in the ++ level below. Position where internal item is to be inserted ++ was found by find_new_child_coord() above, but node of the ++ current carry operation (that is, parent node of child ++ inserted on the previous level), was determined earlier in ++ the lock_carry_level/lock_carry_node. It could so happen ++ that other carry operations already performed on the parent ++ level already split parent node, so that insertion point ++ moved into another node. Handle this by creating new carry ++ node for insertion point if necessary. 
++ */ ++ if (reiser4_carry_real(op->node) != ++ op->u.insert.d->coord->node) { ++ pool_ordering direction; ++ znode *z1; ++ znode *z2; ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ /* ++ * determine in what direction insertion point ++ * moved. Do this by comparing delimiting keys. ++ */ ++ z1 = op->u.insert.d->coord->node; ++ z2 = reiser4_carry_real(op->node); ++ if (keyle(leftmost_key_in_node(z1, &k1), ++ leftmost_key_in_node(z2, &k2))) ++ /* insertion point moved to the left */ ++ direction = POOLO_BEFORE; ++ else ++ /* insertion point moved to the right */ ++ direction = POOLO_AFTER; ++ ++ op->node = reiser4_add_carry_skip(doing, ++ direction, op->node); ++ if (IS_ERR(op->node)) ++ return PTR_ERR(op->node); ++ op->node->node = op->u.insert.d->coord->node; ++ op->node->free = 1; ++ result = lock_carry_node(doing, op->node); ++ if (result != 0) ++ return result; ++ } ++ ++ /* ++ * set up key of an item being inserted: we are inserting an ++ * internal item and its key is (by the very definition of ++ * a search tree) the leftmost key in the child node. ++ */ ++ write_lock_dk(znode_get_tree(child)); ++ op->u.insert.d->key = leftmost_key_in_node(child, ++ znode_get_ld_key(child)); ++ write_unlock_dk(znode_get_tree(child)); ++ op->u.insert.d->data->arg = op->u.insert.brother; ++ } else { ++ assert("vs-243", op->u.insert.d->coord != NULL); ++ op->u.insert.d->coord->node = reiser4_carry_real(op->node); ++ } ++ ++ /* find free space. */ ++ return make_space(op, doing, todo); ++} ++ ++/* handle carry COP_INSERT operation. ++ ++ Insert new item into node. New item can be given in one of two ways: ++ ++ - by passing &tree_coord and &reiser4_item_data as part of @op. This is ++ only applicable at the leaf/twig level. ++ ++ - by passing a child node, the pointer to which is to be inserted by this ++ operation. ++ ++*/ ++static int carry_insert(carry_op * op /* operation to perform */ , ++ carry_level * doing /* queue of operations @op ++ * is part of */ , ++ carry_level * todo /* queue where new operations ++ * are accumulated */ ) ++{ ++ znode *node; ++ carry_insert_data cdata; ++ coord_t coord; ++ reiser4_item_data data; ++ carry_plugin_info info; ++ int result; ++ ++ assert("nikita-1036", op != NULL); ++ assert("nikita-1037", todo != NULL); ++ assert("nikita-1038", op->op == COP_INSERT); ++ ++ coord_init_zero(&coord); ++ ++ /* perform common functionality of insert and paste. */ ++ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); ++ if (result != 0) ++ return result; ++ ++ node = op->u.insert.d->coord->node; ++ assert("nikita-1039", node != NULL); ++ assert("nikita-1040", node_plugin_by_node(node) != NULL); ++ ++ assert("nikita-949", ++ space_needed_for_op(node, op) <= znode_free_space(node)); ++ ++ /* ask node layout to create new item. */ ++ info.doing = doing; ++ info.todo = todo; ++ result = node_plugin_by_node(node)->create_item ++ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, ++ &info); ++ doing->restartable = 0; ++ znode_make_dirty(node); ++ ++ return result; ++} ++ ++/* ++ * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is ++ * supplied with a "flow" (that is, a stream of data) and inserts it into the ++ * tree by slicing it into multiple items.
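
Flow insertion in miniature: carve the stream into node-sized slices and stop once a budget of fresh nodes is spent, mirroring CARRY_FLOW_NEW_NODES_LIMIT. Purely illustrative; the capacity and names below are invented:

	#include <stddef.h>

	enum {
		TOY_NODE_CAPACITY = 4096,	/* free bytes in an empty node */
		TOY_NEW_NODES_LIMIT = 20	/* cf. CARRY_FLOW_NEW_NODES_LIMIT */
	};

	/* slice @length bytes across nodes; returns 0 on success, -1 when
	   the fresh-node budget runs out (cf. -E_NODE_FULL) */
	static int toy_insert_flow(size_t length, size_t room_in_first_node)
	{
		size_t room = room_in_first_node;
		int new_nodes = 0;

		while (length > 0) {
			size_t slice = length < room ? length : room;

			length -= slice;	/* "paste" slice into current node */
			if (length == 0)
				break;
			if (++new_nodes > TOY_NEW_NODES_LIMIT)
				return -1;
			room = TOY_NODE_CAPACITY;	/* continue in a fresh node */
		}
		return 0;
	}
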
++ */ ++ ++#define flow_insert_point(op) ((op)->u.insert_flow.insert_point) ++#define flow_insert_flow(op) ((op)->u.insert_flow.flow) ++#define flow_insert_data(op) ((op)->u.insert_flow.data) ++ ++static size_t item_data_overhead(carry_op * op) ++{ ++ if (flow_insert_data(op)->iplug->b.estimate == NULL) ++ return 0; ++ return (flow_insert_data(op)->iplug->b. ++ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) - ++ flow_insert_data(op)->length); ++} ++ ++/* FIXME-VS: this is called several times during one make_flow_for_insertion ++ and it will always return the same result. Some optimization could be made ++ by calculating this value once at the beginning and passing it around. That ++ would reduce some flexibility in future changes ++*/ ++static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *); ++static size_t flow_insertion_overhead(carry_op * op) ++{ ++ znode *node; ++ size_t insertion_overhead; ++ ++ node = flow_insert_point(op)->node; ++ insertion_overhead = 0; ++ if (node->nplug->item_overhead && ++ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, ++ flow_insert_data(op))) ++ insertion_overhead = ++ node->nplug->item_overhead(node, NULL) + ++ item_data_overhead(op); ++ return insertion_overhead; ++} ++ ++/* how many bytes of the flow fit into the node */ ++static int what_can_fit_into_node(carry_op * op) ++{ ++ size_t free, overhead; ++ ++ overhead = flow_insertion_overhead(op); ++ free = znode_free_space(flow_insert_point(op)->node); ++ if (free <= overhead) ++ return 0; ++ free -= overhead; ++ /* FIXME: flow->length is loff_t only to not get overflowed in case of ++ expanding truncate */ ++ if (free < op->u.insert_flow.flow->length) ++ return free; ++ return (int)op->u.insert_flow.flow->length; ++} ++ ++/* in make_space_for_flow_insertion we need to check either whether the whole ++ flow fits into a node or whether a minimal fraction of it does */ ++static int enough_space_for_whole_flow(carry_op * op) ++{ ++ return (unsigned)what_can_fit_into_node(op) == ++ op->u.insert_flow.flow->length; ++} ++ ++#define MIN_FLOW_FRACTION 1 ++static int enough_space_for_min_flow_fraction(carry_op * op) ++{ ++ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op))); ++ ++ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION; ++} ++ ++/* this returns 0 if the left neighbor was obtained successfully, everything ++ up to and including the insertion point was shifted into it, and the left ++ neighbor still has some free space to put a minimal fraction of the flow ++ into */ ++static int ++make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo) ++{ ++ carry_node *left; ++ znode *orig; ++ ++ left = find_left_neighbor(op, doing); ++ if (unlikely(IS_ERR(left))) { ++ warning("vs-899", ++ "make_space_by_shift_left: " ++ "error accessing left neighbor: %li", PTR_ERR(left)); ++ return 1; ++ } ++ if (left == NULL) ++ /* left neighbor either does not exist or is unformatted ++ node */ ++ return 1; ++ ++ orig = flow_insert_point(op)->node; ++ /* try to shift the content of node @orig, from its head up to and ++ including the insertion point, into the left neighbor */ ++ carry_shift_data(LEFT_SIDE, flow_insert_point(op), ++ reiser4_carry_real(left), doing, todo, ++ 1/* including insert point */); ++ if (reiser4_carry_real(left) != flow_insert_point(op)->node) { ++ /* insertion point did not move */ ++ return 1; ++ } ++ ++ /* insertion point is set after last item in the node */ ++ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op))); 
++ ++ if (!enough_space_for_min_flow_fraction(op)) { ++ /* insertion point node does not have enough free space to put ++ even minimal portion of flow into it, therefore, move ++ insertion point back to orig node (before first item) */ ++ coord_init_before_first_item(flow_insert_point(op), orig); ++ return 1; ++ } ++ ++ /* part of flow is to be written to the end of node */ ++ op->node = left; ++ return 0; ++} ++ ++/* this returns 0 if right neighbor was obtained successfully and everything to ++ the right of insertion point was shifted to it and node got enough free ++ space to put minimal fraction of flow into it */ ++static int ++make_space_by_shift_right(carry_op * op, carry_level * doing, ++ carry_level * todo) ++{ ++ carry_node *right; ++ ++ right = find_right_neighbor(op, doing); ++ if (unlikely(IS_ERR(right))) { ++ warning("nikita-1065", "shift_right_excluding_insert_point: " ++ "error accessing right neighbor: %li", PTR_ERR(right)); ++ return 1; ++ } ++ if (right) { ++ /* shift everything possible on the right of but excluding ++ insertion coord into the right neighbor */ ++ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), ++ reiser4_carry_real(right), doing, todo, ++ 0/* not including insert point */); ++ } else { ++ /* right neighbor either does not exist or is unformatted ++ node */ ++ ; ++ } ++ if (coord_is_after_rightmost(flow_insert_point(op))) { ++ if (enough_space_for_min_flow_fraction(op)) { ++ /* part of flow is to be written to the end of node */ ++ return 0; ++ } ++ } ++ ++ /* new node is to be added if insert point node did not get enough ++ space for whole flow */ ++ return 1; ++} ++ ++/* this returns 0 when insert coord is set at the node end and fraction of flow ++ fits into that node */ ++static int ++make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo) ++{ ++ int result; ++ znode *node; ++ carry_node *new; ++ ++ node = flow_insert_point(op)->node; ++ ++ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) ++ return RETERR(-E_NODE_FULL); ++ /* add new node after insert point node */ ++ new = add_new_znode(node, op->node, doing, todo); ++ if (unlikely(IS_ERR(new))) ++ return PTR_ERR(new); ++ result = lock_carry_node(doing, new); ++ zput(reiser4_carry_real(new)); ++ if (unlikely(result)) ++ return result; ++ op->u.insert_flow.new_nodes++; ++ if (!coord_is_after_rightmost(flow_insert_point(op))) { ++ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), ++ reiser4_carry_real(new), doing, todo, ++ 0/* not including insert point */); ++ assert("vs-901", ++ coord_is_after_rightmost(flow_insert_point(op))); ++ ++ if (enough_space_for_min_flow_fraction(op)) ++ return 0; ++ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) ++ return RETERR(-E_NODE_FULL); ++ ++ /* add one more new node */ ++ new = add_new_znode(node, op->node, doing, todo); ++ if (unlikely(IS_ERR(new))) ++ return PTR_ERR(new); ++ result = lock_carry_node(doing, new); ++ zput(reiser4_carry_real(new)); ++ if (unlikely(result)) ++ return result; ++ op->u.insert_flow.new_nodes++; ++ } ++ ++ /* move insertion point to new node */ ++ coord_init_before_first_item(flow_insert_point(op), ++ reiser4_carry_real(new)); ++ op->node = new; ++ return 0; ++} ++ ++static int ++make_space_for_flow_insertion(carry_op * op, carry_level * doing, ++ carry_level * todo) ++{ ++ __u32 flags = op->u.insert_flow.flags; ++ ++ if (enough_space_for_whole_flow(op)) { ++ /* whole flow fits into insert point node */ ++ return 0; ++ } ++ ++ if (!(flags & COPI_DONT_SHIFT_LEFT) ++ && 
(make_space_by_shift_left(op, doing, todo) == 0)) { ++ /* insert point is shifted to the left neighbor of the original ++ insert point node and is set after the last unit in that node. ++ It has enough space to fit at least a minimal fraction of the ++ flow. */ ++ return 0; ++ } ++ ++ if (enough_space_for_whole_flow(op)) { ++ /* whole flow fits into insert point node */ ++ return 0; ++ } ++ ++ if (!(flags & COPI_DONT_SHIFT_RIGHT) ++ && (make_space_by_shift_right(op, doing, todo) == 0)) { ++ /* insert point is still set to the same node, but there is ++ nothing to the right of insert point. */ ++ return 0; ++ } ++ ++ if (enough_space_for_whole_flow(op)) { ++ /* whole flow fits into insert point node */ ++ return 0; ++ } ++ ++ return make_space_by_new_nodes(op, doing, todo); ++} ++ ++/* implements COP_INSERT_FLOW operation */ ++static int ++carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo) ++{ ++ int result; ++ flow_t *f; ++ coord_t *insert_point; ++ node_plugin *nplug; ++ carry_plugin_info info; ++ znode *orig_node; ++ lock_handle *orig_lh; ++ ++ f = op->u.insert_flow.flow; ++ result = 0; ++ ++ /* carry system needs this to work */ ++ info.doing = doing; ++ info.todo = todo; ++ ++ orig_node = flow_insert_point(op)->node; ++ orig_lh = doing->tracked; ++ ++ while (f->length) { ++ result = make_space_for_flow_insertion(op, doing, todo); ++ if (result) ++ break; ++ ++ insert_point = flow_insert_point(op); ++ nplug = node_plugin_by_node(insert_point->node); ++ ++ /* compose item data for insertion/pasting */ ++ flow_insert_data(op)->data = f->data; ++ flow_insert_data(op)->length = what_can_fit_into_node(op); ++ ++ if (can_paste(insert_point, &f->key, flow_insert_data(op))) { ++ /* insert point is set to the item of the file we are ++ writing to and we have to append to it */ ++ assert("vs-903", insert_point->between == AFTER_UNIT); ++ nplug->change_item_size(insert_point, ++ flow_insert_data(op)->length); ++ flow_insert_data(op)->iplug->b.paste(insert_point, ++ flow_insert_data ++ (op), &info); ++ } else { ++ /* new item must be inserted */ ++ pos_in_node_t new_pos; ++ flow_insert_data(op)->length += item_data_overhead(op); ++ ++ /* FIXME-VS: this is because node40_create_item changes ++ insert_point for obscure reasons */ ++ switch (insert_point->between) { ++ case AFTER_ITEM: ++ new_pos = insert_point->item_pos + 1; ++ break; ++ case EMPTY_NODE: ++ new_pos = 0; ++ break; ++ case BEFORE_ITEM: ++ assert("vs-905", insert_point->item_pos == 0); ++ new_pos = 0; ++ break; ++ default: ++ impossible("vs-906", ++ "carry_insert_flow: invalid coord"); ++ new_pos = 0; ++ break; ++ } ++ ++ nplug->create_item(insert_point, &f->key, ++ flow_insert_data(op), &info); ++ coord_set_item_pos(insert_point, new_pos); ++ } ++ coord_init_after_item_end(insert_point); ++ doing->restartable = 0; ++ znode_make_dirty(insert_point->node); ++ ++ move_flow_forward(f, (unsigned)flow_insert_data(op)->length); ++ } ++ ++ if (orig_node != flow_insert_point(op)->node) { ++ /* move lock to new insert point */ ++ done_lh(orig_lh); ++ init_lh(orig_lh); ++ result = ++ longterm_lock_znode(orig_lh, flow_insert_point(op)->node, ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); ++ } ++ ++ return result; ++} ++ ++/* implements COP_DELETE operation ++ ++ Remove pointer to @op -> u.delete.child from its parent. ++ ++ This function also handles killing of the tree root if the last pointer ++ was removed from it. This is complicated by our handling of "twig" level: ++ root on twig level is never killed.
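
The twig-level rule above as a predicate: the root may only be cut down when it sits strictly above the minimal tree height and holds a single remaining item, matching the checks carry_delete() makes below. A sketch with plain ints in place of znode queries; the height value is invented:

	enum { TOY_MIN_TREE_HEIGHT = 2 };	/* invented; cf. REISER4_MIN_TREE_HEIGHT */

	/* may deleting the last child pointer kill this node?  Roots at or
	   below twig level survive even when almost empty. */
	static int toy_root_may_die(int is_root, int level, int num_items)
	{
		return is_root &&
		       level > TOY_MIN_TREE_HEIGHT &&
		       num_items == 1;
	}
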
++ ++*/ ++static int carry_delete(carry_op * op /* operation to be performed */ , ++ carry_level * doing UNUSED_ARG /* current carry ++ * level */ , ++ carry_level * todo/* next carry level */) ++{ ++ int result; ++ coord_t coord; ++ coord_t coord2; ++ znode *parent; ++ znode *child; ++ carry_plugin_info info; ++ reiser4_tree *tree; ++ ++ /* ++ * This operation is called to delete internal item pointing to the ++ * child node that was removed by carry from the tree on the previous ++ * tree level. ++ */ ++ ++ assert("nikita-893", op != NULL); ++ assert("nikita-894", todo != NULL); ++ assert("nikita-895", op->op == COP_DELETE); ++ ++ coord_init_zero(&coord); ++ coord_init_zero(&coord2); ++ ++ parent = reiser4_carry_real(op->node); ++ child = op->u.delete.child ? ++ reiser4_carry_real(op->u.delete.child) : op->node->node; ++ tree = znode_get_tree(child); ++ read_lock_tree(tree); ++ ++ /* ++ * @parent was determined when carry entered parent level ++ * (lock_carry_level/lock_carry_node). Since then, actual parent of ++ * @child node could change due to other carry operations performed on ++ * the parent level. Check for this. ++ */ ++ ++ if (znode_parent(child) != parent) { ++ /* NOTE-NIKITA add stat counter for this. */ ++ parent = znode_parent(child); ++ assert("nikita-2581", find_carry_node(doing, parent)); ++ } ++ read_unlock_tree(tree); ++ ++ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL); ++ ++ /* Twig level horrors: tree should be of height at least 2. So, last ++ pointer from the root at twig level is preserved even if child is ++ empty. This is ugly, but so it was architectured. ++ */ ++ ++ if (znode_is_root(parent) && ++ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT && ++ node_num_items(parent) == 1) { ++ /* Delimiting key manipulations. */ ++ write_lock_dk(tree); ++ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key())); ++ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key())); ++ ZF_SET(child, JNODE_DKSET); ++ write_unlock_dk(tree); ++ ++ /* @child escaped imminent death! */ ++ ZF_CLR(child, JNODE_HEARD_BANSHEE); ++ return 0; ++ } ++ ++ /* convert child pointer to the coord_t */ ++ result = find_child_ptr(parent, child, &coord); ++ if (result != NS_FOUND) { ++ warning("nikita-994", "Cannot find child pointer: %i", result); ++ print_coord_content("coord", &coord); ++ return result; ++ } ++ ++ coord_dup(&coord2, &coord); ++ info.doing = doing; ++ info.todo = todo; ++ { ++ /* ++ * Actually kill internal item: prepare structure with ++ * arguments for ->cut_and_kill() method... ++ */ ++ ++ struct carry_kill_data kdata; ++ kdata.params.from = &coord; ++ kdata.params.to = &coord2; ++ kdata.params.from_key = NULL; ++ kdata.params.to_key = NULL; ++ kdata.params.smallest_removed = NULL; ++ kdata.params.truncate = 1; ++ kdata.flags = op->u.delete.flags; ++ kdata.inode = NULL; ++ kdata.left = NULL; ++ kdata.right = NULL; ++ kdata.buf = NULL; ++ /* ... and call it. */ ++ result = node_plugin_by_node(parent)->cut_and_kill(&kdata, ++ &info); ++ } ++ doing->restartable = 0; ++ ++ /* check whether root should be killed violently */ ++ if (znode_is_root(parent) && ++ /* don't kill roots at and lower than twig level */ ++ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT && ++ node_num_items(parent) == 1) ++ result = reiser4_kill_tree_root(coord.node); ++ ++ return result < 0 ? : 0; ++} ++ ++/* implements COP_CUT opration ++ ++ Cuts part or whole content of node. 
++ ++*/ ++static int carry_cut(carry_op * op /* operation to be performed */ , ++ carry_level * doing /* current carry level */ , ++ carry_level * todo/* next carry level */) ++{ ++ int result; ++ carry_plugin_info info; ++ node_plugin *nplug; ++ ++ assert("nikita-896", op != NULL); ++ assert("nikita-897", todo != NULL); ++ assert("nikita-898", op->op == COP_CUT); ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ nplug = node_plugin_by_node(reiser4_carry_real(op->node)); ++ if (op->u.cut_or_kill.is_cut) ++ result = nplug->cut(op->u.cut_or_kill.u.cut, &info); ++ else ++ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info); ++ ++ doing->restartable = 0; ++ return result < 0 ? : 0; ++} ++ ++/* helper function for carry_paste(): returns true if @op can be continued as ++ paste */ ++static int ++can_paste(coord_t *icoord, const reiser4_key * key, ++ const reiser4_item_data * data) ++{ ++ coord_t circa; ++ item_plugin *new_iplug; ++ item_plugin *old_iplug; ++ int result = 0; /* to keep gcc shut */ ++ ++ assert("", icoord->between != AT_UNIT); ++ ++ /* obviously, one cannot paste when node is empty---there is nothing ++ to paste into. */ ++ if (node_is_empty(icoord->node)) ++ return 0; ++ /* if insertion point is at the middle of the item, then paste */ ++ if (!coord_is_between_items(icoord)) ++ return 1; ++ coord_dup(&circa, icoord); ++ circa.between = AT_UNIT; ++ ++ old_iplug = item_plugin_by_coord(&circa); ++ new_iplug = data->iplug; ++ ++ /* check whether we can paste to the item @icoord is "at" when we ++ ignore ->between field */ ++ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) ++ result = 1; ++ else if (icoord->between == BEFORE_UNIT ++ || icoord->between == BEFORE_ITEM) { ++ /* otherwise, try to glue to the item at the left, if any */ ++ coord_dup(&circa, icoord); ++ if (coord_set_to_left(&circa)) { ++ result = 0; ++ coord_init_before_item(icoord); ++ } else { ++ old_iplug = item_plugin_by_coord(&circa); ++ result = (old_iplug == new_iplug) ++ && item_can_contain_key(icoord, key, data); ++ if (result) { ++ coord_dup(icoord, &circa); ++ icoord->between = AFTER_UNIT; ++ } ++ } ++ } else if (icoord->between == AFTER_UNIT ++ || icoord->between == AFTER_ITEM) { ++ coord_dup(&circa, icoord); ++ /* otherwise, try to glue to the item at the right, if any */ ++ if (coord_set_to_right(&circa)) { ++ result = 0; ++ coord_init_after_item(icoord); ++ } else { ++ int (*cck) (const coord_t *, const reiser4_key *, ++ const reiser4_item_data *); ++ ++ old_iplug = item_plugin_by_coord(&circa); ++ ++ cck = old_iplug->b.can_contain_key; ++ if (cck == NULL) ++ /* item doesn't define ->can_contain_key ++ method? So it is not expandable. */ ++ result = 0; ++ else { ++ result = (old_iplug == new_iplug) ++ && cck(&circa /*icoord */ , key, data); ++ if (result) { ++ coord_dup(icoord, &circa); ++ icoord->between = BEFORE_UNIT; ++ } ++ } ++ } ++ } else ++ impossible("nikita-2513", "Nothing works"); ++ if (result) { ++ if (icoord->between == BEFORE_ITEM) { ++ assert("vs-912", icoord->unit_pos == 0); ++ icoord->between = BEFORE_UNIT; ++ } else if (icoord->between == AFTER_ITEM) { ++ coord_init_after_item_end(icoord); ++ } ++ } ++ return result; ++} ++ ++/* implements COP_PASTE operation ++ ++ Paste data into existing item. This is complicated by the fact that after ++ we shifted something to the left or right neighbors trying to free some ++ space, item we were supposed to paste into can be in different node than ++ insertion coord. If so, we are no longer doing paste, but insert. 
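Editor's sketch (not part of the patch): the paste-to-insert restart just described, shown as a redispatch through an opcode table. Everything here is an illustrative toy, not the carry types.
--8<----------------------------------------------------------------------
enum toy_opcode { TOY_INSERT, TOY_PASTE, TOY_LAST_OP };
struct toy_op { enum toy_opcode op; };

static int toy_insert(struct toy_op *op);
static int toy_paste(struct toy_op *op);

static int (*toy_dispatch[TOY_LAST_OP])(struct toy_op *) = {
	[TOY_INSERT] = toy_insert,
	[TOY_PASTE]  = toy_paste,
};

static int toy_insert(struct toy_op *op) { (void)op; return 0; }

static int toy_paste(struct toy_op *op)
{
	int can_still_paste = 0;	/* say the target item moved away */

	if (!can_still_paste) {
		/* same op object, re-entered through the table as insert */
		op->op = TOY_INSERT;
		return toy_dispatch[TOY_INSERT](op);
	}
	return 0;
}
--8<----------------------------------------------------------------------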
See ++ comments in insert_paste_common(). ++ ++*/ ++static int carry_paste(carry_op * op /* operation to be performed */ , ++ carry_level * doing UNUSED_ARG /* current carry ++ * level */ , ++ carry_level * todo/* next carry level */) ++{ ++ znode *node; ++ carry_insert_data cdata; ++ coord_t dcoord; ++ reiser4_item_data data; ++ int result; ++ int real_size; ++ item_plugin *iplug; ++ carry_plugin_info info; ++ coord_t *coord; ++ ++ assert("nikita-982", op != NULL); ++ assert("nikita-983", todo != NULL); ++ assert("nikita-984", op->op == COP_PASTE); ++ ++ coord_init_zero(&dcoord); ++ ++ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data); ++ if (result != 0) ++ return result; ++ ++ coord = op->u.insert.d->coord; ++ ++ /* handle case when op -> u.insert.coord doesn't point to the item ++ of required type. restart as insert. */ ++ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) { ++ op->op = COP_INSERT; ++ op->u.insert.type = COPT_PASTE_RESTARTED; ++ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo); ++ ++ return result; ++ } ++ ++ node = coord->node; ++ iplug = item_plugin_by_coord(coord); ++ assert("nikita-992", iplug != NULL); ++ ++ assert("nikita-985", node != NULL); ++ assert("nikita-986", node_plugin_by_node(node) != NULL); ++ ++ assert("nikita-987", ++ space_needed_for_op(node, op) <= znode_free_space(node)); ++ ++ assert("nikita-1286", coord_is_existing_item(coord)); ++ ++ /* ++ * if item is expanded as a result of this operation, we should first ++ * change item size, than call ->b.paste item method. If item is ++ * shrunk, it should be done other way around: first call ->b.paste ++ * method, then reduce item size. ++ */ ++ ++ real_size = space_needed_for_op(node, op); ++ if (real_size > 0) ++ node->nplug->change_item_size(coord, real_size); ++ ++ doing->restartable = 0; ++ info.doing = doing; ++ info.todo = todo; ++ ++ result = iplug->b.paste(coord, op->u.insert.d->data, &info); ++ ++ if (real_size < 0) ++ node->nplug->change_item_size(coord, real_size); ++ ++ /* if we pasted at the beginning of the item, update item's key. */ ++ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT) ++ node->nplug->update_item_key(coord, op->u.insert.d->key, &info); ++ ++ znode_make_dirty(node); ++ return result; ++} ++ ++/* handle carry COP_EXTENT operation. */ ++static int carry_extent(carry_op * op /* operation to perform */ , ++ carry_level * doing /* queue of operations @op ++ * is part of */ , ++ carry_level * todo /* queue where new operations ++ * are accumulated */ ) ++{ ++ znode *node; ++ carry_insert_data cdata; ++ coord_t coord; ++ reiser4_item_data data; ++ carry_op *delete_dummy; ++ carry_op *insert_extent; ++ int result; ++ carry_plugin_info info; ++ ++ assert("nikita-1751", op != NULL); ++ assert("nikita-1752", todo != NULL); ++ assert("nikita-1753", op->op == COP_EXTENT); ++ ++ /* extent insertion overview: ++ ++ extents live on the TWIG LEVEL, which is level one above the leaf ++ one. This complicates extent insertion logic somewhat: it may ++ happen (and going to happen all the time) that in logical key ++ ordering extent has to be placed between items I1 and I2, located ++ at the leaf level, but I1 and I2 are in the same formatted leaf ++ node N1. To insert extent one has to ++ ++ (1) reach node N1 and shift data between N1, its neighbors and ++ possibly newly allocated nodes until I1 and I2 fall into different ++ nodes. 
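Editor's sketch (not part of the patch), stepping back to carry_paste() above: the grow-before, shrink-after rule around the ->b.paste() call, with an illustrative item type.
--8<----------------------------------------------------------------------
struct toy_item { int size; };

static void toy_change_item_size(struct toy_item *it, int delta)
{
	it->size += delta;
}

static void toy_paste_body(struct toy_item *it) { (void)it; /* move bytes */ }

static void toy_paste(struct toy_item *it, int delta)
{
	if (delta > 0)                      /* expanding: make room first */
		toy_change_item_size(it, delta);
	toy_paste_body(it);                 /* then rewrite the body */
	if (delta < 0)                      /* shrinking: trim afterwards */
		toy_change_item_size(it, delta);
}
--8<----------------------------------------------------------------------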
Since I1 and I2 are still neighboring items in logical key ++ order, they will be necessary utmost items in their respective ++ nodes. ++ ++ (2) After this new extent item is inserted into node on the twig ++ level. ++ ++ Fortunately this process can reuse almost all code from standard ++ insertion procedure (viz. make_space() and insert_paste_common()), ++ due to the following observation: make_space() only shifts data up ++ to and excluding or including insertion point. It never ++ "over-moves" through insertion point. Thus, one can use ++ make_space() to perform step (1). All required for this is just to ++ instruct free_space_shortage() to keep make_space() shifting data ++ until insertion point is at the node border. ++ ++ */ ++ ++ /* perform common functionality of insert and paste. */ ++ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); ++ if (result != 0) ++ return result; ++ ++ node = op->u.extent.d->coord->node; ++ assert("nikita-1754", node != NULL); ++ assert("nikita-1755", node_plugin_by_node(node) != NULL); ++ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE); ++ ++ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that ++ extent fits between items. */ ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ /* there is another complication due to placement of extents on the ++ twig level: extents are "rigid" in the sense that key-range ++ occupied by extent cannot grow indefinitely to the right as it is ++ for the formatted leaf nodes. Because of this when search finds two ++ adjacent extents on the twig level, it has to "drill" to the leaf ++ level, creating new node. Here we are removing this node. ++ */ ++ if (node_is_empty(node)) { ++ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1); ++ if (IS_ERR(delete_dummy)) ++ return PTR_ERR(delete_dummy); ++ delete_dummy->u.delete.child = NULL; ++ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY; ++ ZF_SET(node, JNODE_HEARD_BANSHEE); ++ } ++ ++ /* proceed with inserting extent item into parent. We are definitely ++ inserting rather than pasting if we get that far. */ ++ insert_extent = node_post_carry(&info, COP_INSERT, node, 1); ++ if (IS_ERR(insert_extent)) ++ /* @delete_dummy will be automatically destroyed on the level ++ exiting */ ++ return PTR_ERR(insert_extent); ++ /* NOTE-NIKITA insertion by key is simplest option here. Another ++ possibility is to insert on the left or right of already existing ++ item. ++ */ ++ insert_extent->u.insert.type = COPT_KEY; ++ insert_extent->u.insert.d = op->u.extent.d; ++ assert("nikita-1719", op->u.extent.d->key != NULL); ++ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord; ++ insert_extent->u.insert.flags = ++ znode_get_tree(node)->carry.new_extent_flags; ++ ++ /* ++ * if carry was asked to track lock handle we should actually track ++ * lock handle on the twig node rather than on the leaf where ++ * operation was started from. Transfer tracked lock handle. ++ */ ++ if (doing->track_type) { ++ assert("nikita-3242", doing->tracked != NULL); ++ assert("nikita-3244", todo->tracked == NULL); ++ todo->tracked = doing->tracked; ++ todo->track_type = CARRY_TRACK_NODE; ++ doing->tracked = NULL; ++ doing->track_type = 0; ++ } ++ ++ return 0; ++} ++ ++/* update key in @parent between pointers to @left and @right. ++ ++ Find coords of @left and @right and update delimiting key between them. ++ This is helper function called by carry_update(). Finds position of ++ internal item involved. Updates item key. 
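Editor's sketch (not part of the patch): the two operations carry_extent() above posts when search had to "drill" to an empty leaf. The queue API is an illustrative toy, not node_post_carry().
--8<----------------------------------------------------------------------
enum toy_cop { TOY_COP_DELETE, TOY_COP_INSERT };

struct toy_cop_rec { enum toy_cop op; };
struct toy_queue   { struct toy_cop_rec rec[8]; int n; };

/* post one follow-up operation to the next-level queue (bounds assumed) */
static struct toy_cop_rec *toy_post(struct toy_queue *q, enum toy_cop op)
{
	q->rec[q->n].op = op;
	return &q->rec[q->n++];
}

static void toy_carry_extent(struct toy_queue *todo, int leaf_is_empty)
{
	if (leaf_is_empty)
		toy_post(todo, TOY_COP_DELETE); /* drop the drilled-to leaf */
	toy_post(todo, TOY_COP_INSERT);         /* extent goes to the twig */
}
--8<----------------------------------------------------------------------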
Updates delimiting keys of child ++ nodes involved. ++*/ ++static int update_delimiting_key(znode * parent /* node key is updated ++ * in */ , ++ znode * left /* child of @parent */ , ++ znode * right /* child of @parent */ , ++ carry_level * doing /* current carry ++ * level */ , ++ carry_level * todo /* parent carry ++ * level */ , ++ const char **error_msg /* place to ++ * store error ++ * message */ ) ++{ ++ coord_t left_pos; ++ coord_t right_pos; ++ int result; ++ reiser4_key ldkey; ++ carry_plugin_info info; ++ ++ assert("nikita-1177", right != NULL); ++ /* find position of right left child in a parent */ ++ result = find_child_ptr(parent, right, &right_pos); ++ if (result != NS_FOUND) { ++ *error_msg = "Cannot find position of right child"; ++ return result; ++ } ++ ++ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) { ++ /* find position of the left child in a parent */ ++ result = find_child_ptr(parent, left, &left_pos); ++ if (result != NS_FOUND) { ++ *error_msg = "Cannot find position of left child"; ++ return result; ++ } ++ assert("nikita-1355", left_pos.node != NULL); ++ } else ++ left_pos.node = NULL; ++ ++ /* check that they are separated by exactly one key and are basically ++ sane */ ++ if (REISER4_DEBUG) { ++ if ((left_pos.node != NULL) ++ && !coord_is_existing_unit(&left_pos)) { ++ *error_msg = "Left child is bastard"; ++ return RETERR(-EIO); ++ } ++ if (!coord_is_existing_unit(&right_pos)) { ++ *error_msg = "Right child is bastard"; ++ return RETERR(-EIO); ++ } ++ if (left_pos.node != NULL && ++ !coord_are_neighbors(&left_pos, &right_pos)) { ++ *error_msg = "Children are not direct siblings"; ++ return RETERR(-EIO); ++ } ++ } ++ *error_msg = NULL; ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ /* ++ * If child node is not empty, new key of internal item is a key of ++ * leftmost item in the child node. If the child is empty, take its ++ * right delimiting key as a new key of the internal item. Precise key ++ * in the latter case is not important per se, because the child (and ++ * the internal item) are going to be killed shortly anyway, but we ++ * have to preserve correct order of keys in the parent node. ++ */ ++ ++ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE)) ++ leftmost_key_in_node(right, &ldkey); ++ else { ++ read_lock_dk(znode_get_tree(parent)); ++ ldkey = *znode_get_rd_key(right); ++ read_unlock_dk(znode_get_tree(parent)); ++ } ++ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info); ++ doing->restartable = 0; ++ znode_make_dirty(parent); ++ return 0; ++} ++ ++/* implements COP_UPDATE opration ++ ++ Update delimiting keys. ++ ++*/ ++static int carry_update(carry_op * op /* operation to be performed */ , ++ carry_level * doing /* current carry level */ , ++ carry_level * todo/* next carry level */) ++{ ++ int result; ++ carry_node *missing UNUSED_ARG; ++ znode *left; ++ znode *right; ++ carry_node *lchild; ++ carry_node *rchild; ++ const char *error_msg; ++ reiser4_tree *tree; ++ ++ /* ++ * This operation is called to update key of internal item. This is ++ * necessary when carry shifted of cut data on the child ++ * level. Arguments of this operation are: ++ * ++ * @right --- child node. Operation should update key of internal ++ * item pointing to @right. ++ * ++ * @left --- left neighbor of @right. This parameter is optional. 
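Editor's sketch (not part of the patch): the key-selection rule in update_delimiting_key() above. Types are illustrative; "doomed" stands in for JNODE_HEARD_BANSHEE.
--8<----------------------------------------------------------------------
typedef unsigned long long toy_key;

struct toy_child {
	int     doomed;     /* analogue of JNODE_HEARD_BANSHEE */
	toy_key leftmost;   /* key of first item, meaningful if !doomed */
	toy_key rd_key;     /* right delimiting key */
};

/* a live child contributes its leftmost key; a dying one any key that
 * keeps the parent's key order, so its right delimiting key is used */
static toy_key new_internal_key(const struct toy_child *c)
{
	return c->doomed ? c->rd_key : c->leftmost;
}
--8<----------------------------------------------------------------------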
++ */ ++ ++ assert("nikita-902", op != NULL); ++ assert("nikita-903", todo != NULL); ++ assert("nikita-904", op->op == COP_UPDATE); ++ ++ lchild = op->u.update.left; ++ rchild = op->node; ++ ++ if (lchild != NULL) { ++ assert("nikita-1001", lchild->parent); ++ assert("nikita-1003", !lchild->left); ++ left = reiser4_carry_real(lchild); ++ } else ++ left = NULL; ++ ++ tree = znode_get_tree(rchild->node); ++ read_lock_tree(tree); ++ right = znode_parent(rchild->node); ++ read_unlock_tree(tree); ++ ++ if (right != NULL) { ++ result = update_delimiting_key(right, ++ lchild ? lchild->node : NULL, ++ rchild->node, ++ doing, todo, &error_msg); ++ } else { ++ error_msg = "Cannot find node to update key in"; ++ result = RETERR(-EIO); ++ } ++ /* operation will be reposted to the next level by the ++ ->update_item_key() method of node plugin, if necessary. */ ++ ++ if (result != 0) { ++ warning("nikita-999", "Error updating delimiting key: %s (%i)", ++ error_msg ? : "", result); ++ } ++ return result; ++} ++ ++/* move items from @node during carry */ ++static int carry_shift_data(sideof side /* in what direction to move data */ , ++ coord_t *insert_coord /* coord where new item ++ * is to be inserted */, ++ znode * node /* node which data are moved from */ , ++ carry_level * doing /* active carry queue */ , ++ carry_level * todo /* carry queue where new ++ * operations are to be put ++ * in */ , ++ unsigned int including_insert_coord_p ++ /* true if @insertion_coord can be moved */ ) ++{ ++ int result; ++ znode *source; ++ carry_plugin_info info; ++ node_plugin *nplug; ++ ++ source = insert_coord->node; ++ ++ info.doing = doing; ++ info.todo = todo; ++ ++ nplug = node_plugin_by_node(node); ++ result = nplug->shift(insert_coord, node, ++ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0, ++ (int)including_insert_coord_p, &info); ++ /* the only error ->shift() method of node plugin can return is ++ -ENOMEM due to carry node/operation allocation. */ ++ assert("nikita-915", result >= 0 || result == -ENOMEM); ++ if (result > 0) { ++ /* ++ * if some number of bytes was actually shifted, mark nodes ++ * dirty, and carry level as non-restartable. ++ */ ++ doing->restartable = 0; ++ znode_make_dirty(source); ++ znode_make_dirty(node); ++ } ++ ++ assert("nikita-2077", coord_check(insert_coord)); ++ return 0; ++} ++ ++typedef carry_node *(*carry_iterator) (carry_node * node); ++static carry_node *find_dir_carry(carry_node * node, carry_level * level, ++ carry_iterator iterator); ++ ++static carry_node *pool_level_list_prev(carry_node *node) ++{ ++ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage); ++} ++ ++/* look for the left neighbor of given carry node in a carry queue. ++ ++ This is used by find_left_neighbor(), but I am not sure that this ++ really gives any advantage. More statistics required. ++ ++*/ ++carry_node *find_left_carry(carry_node * node /* node to find left neighbor ++ * of */ , ++ carry_level * level/* level to scan */) ++{ ++ return find_dir_carry(node, level, ++ (carry_iterator) pool_level_list_prev); ++} ++ ++static carry_node *pool_level_list_next(carry_node *node) ++{ ++ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); ++} ++ ++/* look for the right neighbor of given carry node in a ++ carry queue. ++ ++ This is used by find_right_neighbor(), but I am not sure that this ++ really gives any advantage. More statistics required. 
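Editor's sketch (not part of the patch): the direction-agnostic scan that find_dir_carry() below implements, one loop with the step direction passed as a function pointer. The circular list here is an illustrative toy, not the kernel's list_head.
--8<----------------------------------------------------------------------
#include <stddef.h>

struct toy_carry { struct toy_carry *prev, *next; void *real_znode; };

typedef struct toy_carry *(*toy_iter)(struct toy_carry *);

static struct toy_carry *step_prev(struct toy_carry *n) { return n->prev; }
static struct toy_carry *step_next(struct toy_carry *n) { return n->next; }

/* walk in the direction chosen by @step, skipping entries that
 * reference the same underlying node; NULL once @head is reached */
static struct toy_carry *find_dir(struct toy_carry *start,
				  struct toy_carry *head, toy_iter step)
{
	struct toy_carry *n = start;

	while ((n = step(n)) != head)
		if (n->real_znode != start->real_znode)
			return n;
	return NULL;
}
--8<----------------------------------------------------------------------
find_left_carry()/find_right_carry() then correspond to calling this with step_prev and step_next respectively.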
++
++*/
++carry_node *find_right_carry(carry_node * node /* node to find right neighbor
++                                                * of */ ,
++                             carry_level * level/* level to scan */)
++{
++	return find_dir_carry(node, level,
++			      (carry_iterator) pool_level_list_next);
++}
++
++/* look for the left or right neighbor of given carry node in a carry
++   queue.
++
++   Helper function used by find_{left|right}_carry().
++*/
++static carry_node *find_dir_carry(carry_node * node /* node to start
++                                                     * scanning from */ ,
++                                  carry_level * level /* level to scan */ ,
++                                  carry_iterator iterator /* operation to
++                                                           * move to the
++                                                           * next node */)
++{
++	carry_node *neighbor;
++
++	assert("nikita-1059", node != NULL);
++	assert("nikita-1060", level != NULL);
++
++	/* scan list of carry nodes on this list dir-ward, skipping all
++	   carry nodes referencing the same znode. */
++	neighbor = node;
++	while (1) {
++		neighbor = iterator(neighbor);
++		if (carry_node_end(level, neighbor))
++			/* list head is reached */
++			return NULL;
++		if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
++			return neighbor;
++	}
++}
++
++/*
++ * Memory reservation estimation.
++ *
++ * Carry process proceeds through tree levels upwards. Carry assumes that it
++ * takes tree in consistent state (e.g., that search tree invariants hold),
++ * and leaves tree consistent after it finishes. This means that when some
++ * error occurs carry cannot simply return if there are pending carry
++ * operations. Generic solution for this problem is carry-undo either as
++ * transaction manager feature (requiring checkpoints and isolation), or
++ * through some carry specific mechanism.
++ *
++ * Our current approach is to panic if carry hits an error while tree is
++ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
++ * this, a "memory reservation" mechanism was added.
++ *
++ * Memory reservation is implemented by perthread-pages.diff patch from
++ * core-patches. Its API is defined in <linux/gfp.h>
++ *
++ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
++ * void perthread_pages_release(int nrpages);
++ * int perthread_pages_count(void);
++ *
++ * carry estimates its worst case memory requirements at the entry, reserves
++ * enough memory, and releases unused pages before returning.
++ *
++ * Code below estimates worst case memory requirements for a given carry
++ * queue. This is done by summing worst case memory requirements for each
++ * operation in the queue.
++ *
++ */
++
++/*
++ * Memory requirements of many operations depend on the tree
++ * height. For example, item insertion requires a new node to be inserted at
++ * each tree level in the worst case. What tree height should be used for
++ * estimation? Current tree height is wrong, because tree height can change
++ * between the time when estimation was done and the time when operation is
++ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
++ * is also not desirable, because it would lead to huge over-estimation
++ * all the time. Plausible solution is "capped tree height": if current tree
++ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
++ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
++ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
++ * to be increased even more during short interval of time.
++ */
++#define TREE_HEIGHT_CAP (5)
++
++/* return capped tree height for the @tree. See comment above.
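Editor's worked example (not part of the patch) of the capping rule and the bytes-to-pages rounding used below; the per-znode size is made up purely for the arithmetic.
--8<----------------------------------------------------------------------
#include <stdio.h>

#define HEIGHT_CAP 5
#define PAGE_SZ    4096   /* stand-in for PAGE_CACHE_SIZE */

static int capped(int h) { return h > HEIGHT_CAP ? h : HEIGHT_CAP; }
static int pages(int bytes) { return (bytes + PAGE_SZ - 1) / PAGE_SZ; }

int main(void)
{
	/* 3 znodes per level; 900 bytes per znode is invented */
	int h = capped(3);                         /* height 3 -> 5 */
	printf("%d pages\n", pages(h * 3 * 900));  /* 13500 B -> 4 pages */
	return 0;
}
--8<----------------------------------------------------------------------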
*/ ++static int cap_tree_height(reiser4_tree * tree) ++{ ++ return max_t(int, tree->height, TREE_HEIGHT_CAP); ++} ++ ++/* return capped tree height for the current tree. */ ++static int capped_height(void) ++{ ++ return cap_tree_height(current_tree); ++} ++ ++/* return number of pages required to store given number of bytes */ ++static int bytes_to_pages(int bytes) ++{ ++ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++} ++ ++/* how many pages are required to allocate znodes during item insertion. */ ++static int carry_estimate_znodes(void) ++{ ++ /* ++ * Note, that there we have some problem here: there is no way to ++ * reserve pages specifically for the given slab. This means that ++ * these pages can be hijacked for some other end. ++ */ ++ ++ /* in the worst case we need 3 new znode on each tree level */ ++ return bytes_to_pages(capped_height() * sizeof(znode) * 3); ++} ++ ++/* ++ * how many pages are required to load bitmaps. One bitmap per level. ++ */ ++static int carry_estimate_bitmaps(void) ++{ ++ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) { ++ int bytes; ++ ++ bytes = capped_height() * (0 + /* bnode should be added, but ++ * it is private to bitmap.c, ++ * skip for now. */ ++ 2 * sizeof(jnode)); ++ /* working and commit jnodes */ ++ return bytes_to_pages(bytes) + 2; /* and their contents */ ++ } else ++ /* bitmaps were pre-loaded during mount */ ++ return 0; ++} ++ ++/* worst case item insertion memory requirements */ ++static int carry_estimate_insert(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + ++ /* new atom */ ++ capped_height() + /* new block on each level */ ++ 1 + /* and possibly extra new block at the leaf level */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case item deletion memory requirements */ ++static int carry_estimate_delete(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + ++ /* new atom */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case tree cut memory requirements */ ++static int carry_estimate_cut(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + ++ /* new atom */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case memory requirements of pasting into item */ ++static int carry_estimate_paste(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + ++ /* new atom */ ++ capped_height() + /* new block on each level */ ++ 1 + /* and possibly extra new block at the leaf level */ ++ 3; /* loading of leaves into memory */ ++} ++ ++/* worst case memory requirements of extent insertion */ ++static int carry_estimate_extent(carry_op * op, carry_level * level) ++{ ++ return carry_estimate_insert(op, level) + /* insert extent */ ++ carry_estimate_delete(op, level); /* kill leaf */ ++} ++ ++/* worst case memory requirements of key update */ ++static int carry_estimate_update(carry_op * op, carry_level * level) ++{ ++ return 0; ++} ++ ++/* worst case memory requirements of flow insertion */ ++static int carry_estimate_insert_flow(carry_op * op, carry_level * level) ++{ ++ int newnodes; ++ ++ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length), ++ CARRY_FLOW_NEW_NODES_LIMIT); ++ /* ++ * roughly estimate insert_flow as a sequence of insertions. 
++ */ ++ return newnodes * carry_estimate_insert(op, level); ++} ++ ++/* This is dispatch table for carry operations. It can be trivially ++ abstracted into useful plugin: tunable balancing policy is a good ++ thing. */ ++carry_op_handler op_dispatch_table[COP_LAST_OP] = { ++ [COP_INSERT] = { ++ .handler = carry_insert, ++ .estimate = carry_estimate_insert} ++ , ++ [COP_DELETE] = { ++ .handler = carry_delete, ++ .estimate = carry_estimate_delete} ++ , ++ [COP_CUT] = { ++ .handler = carry_cut, ++ .estimate = carry_estimate_cut} ++ , ++ [COP_PASTE] = { ++ .handler = carry_paste, ++ .estimate = carry_estimate_paste} ++ , ++ [COP_EXTENT] = { ++ .handler = carry_extent, ++ .estimate = carry_estimate_extent} ++ , ++ [COP_UPDATE] = { ++ .handler = carry_update, ++ .estimate = carry_estimate_update} ++ , ++ [COP_INSERT_FLOW] = { ++ .handler = carry_insert_flow, ++ .estimate = carry_estimate_insert_flow} ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/carry_ops.h linux-2.6.33/fs/reiser4/carry_ops.h +--- linux-2.6.33.orig/fs/reiser4/carry_ops.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/carry_ops.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,43 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* implementation of carry operations. See carry_ops.c for details. */ ++ ++#if !defined(__CARRY_OPS_H__) ++#define __CARRY_OPS_H__ ++ ++#include "forward.h" ++#include "znode.h" ++#include "carry.h" ++ ++/* carry operation handlers */ ++typedef struct carry_op_handler { ++ /* perform operation */ ++ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo); ++ /* estimate memory requirements for @op */ ++ int (*estimate) (carry_op * op, carry_level * level); ++} carry_op_handler; ++ ++/* This is dispatch table for carry operations. It can be trivially ++ abstracted into useful plugin: tunable balancing policy is a good ++ thing. */ ++extern carry_op_handler op_dispatch_table[COP_LAST_OP]; ++ ++unsigned int space_needed(const znode * node, const coord_t *coord, ++ const reiser4_item_data * data, int inserting); ++extern carry_node *find_left_carry(carry_node * node, carry_level * level); ++extern carry_node *find_right_carry(carry_node * node, carry_level * level); ++ ++/* __CARRY_OPS_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/context.c linux-2.6.33/fs/reiser4/context.c +--- linux-2.6.33.orig/fs/reiser4/context.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/context.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,289 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Manipulation of reiser4_context */ ++ ++/* ++ * global context used during system call. Variable of this type is allocated ++ * on the stack at the beginning of the reiser4 part of the system call and ++ * pointer to it is stored in the current->fs_context. This allows us to avoid ++ * passing pointer to current transaction and current lockstack (both in ++ * one-to-one mapping with threads) all over the call chain. 
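Editor's sketch (not part of the patch), before context.c continues: the shape of op_dispatch_table above, using C99 designated array initializers so that handler and estimate travel together in one row. A minimal standalone version:
--8<----------------------------------------------------------------------
enum { OP_A, OP_B, OP_LAST };

typedef struct {
	int (*handler)(void);
	int (*estimate)(void);
} toy_op_handler;

static int handle_a(void)   { return 0; }
static int estimate_a(void) { return 1; }

static const toy_op_handler toy_table[OP_LAST] = {
	[OP_A] = { .handler = handle_a, .estimate = estimate_a },
	/* unmentioned slots ([OP_B]) are zero-filled; dispatching on a
	 * NULL handler would be a bug the caller must rule out */
};
--8<----------------------------------------------------------------------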
++ *
++ * It's kind of like those global variables the prof used to tell you not to
++ * use in CS1, except thread specific. ;-) Nikita, this was a good idea.
++ *
++ * In some situations it is desirable to have the ability to enter
++ * reiser4_context more than once for the same thread (nested contexts). For
++ * example, there are some functions that can be called either directly from
++ * VFS/VM or from an already active reiser4 context (->writepage, for
++ * example).
++ *
++ * In such situations the "child" context acts like a dummy: all activity is
++ * actually performed in the top level context, and get_current_context()
++ * always returns the top level context.
++ * Of course, reiser4_init_context()/reiser4_done_context() have to be
++ * properly nested anyway.
++ *
++ * Note that there is an important difference between the way reiser4 uses
++ * ->fs_context and the way other file systems use it. Other file systems
++ * (ext3 and reiserfs) use ->fs_context only for the duration of a
++ * _transaction_ (this is why ->fs_context was initially called
++ * ->journal_info). This means that when ext3 or reiserfs finds that
++ * ->fs_context is not NULL on entry to the file system, they assume that
++ * some transaction is already underway, and usually bail out, because
++ * starting a nested transaction would most likely lead to deadlock. This
++ * gives false positives with reiser4, because we set ->fs_context before
++ * starting a transaction.
++ */
++
++#include "debug.h"
++#include "super.h"
++#include "context.h"
++#include "vfs_ops.h" /* for reiser4_throttle_write() */
++
++#include <linux/writeback.h> /* for current_is_pdflush() */
++#include <linux/hardirq.h>
++
++static void _reiser4_init_context(reiser4_context * context,
++                                  struct super_block *super)
++{
++	memset(context, 0, sizeof(*context));
++
++	context->super = super;
++	context->magic = context_magic;
++	context->outer = current->journal_info;
++	current->journal_info = (void *)context;
++	context->nr_children = 0;
++	context->gfp_mask = GFP_KERNEL;
++
++	init_lock_stack(&context->stack);
++
++	reiser4_txn_begin(context);
++
++	/* initialize head of tap list */
++	INIT_LIST_HEAD(&context->taps);
++#if REISER4_DEBUG
++	context->task = current;
++#endif
++	grab_space_enable();
++}
++
++/* initialize context and bind it to the current thread
++
++   This function should be called at the beginning of the reiser4 part of a
++   syscall.
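Editor's sketch (not part of the patch): the nesting rule described above, where re-entering the same filesystem reuses the top-level context and only bumps a counter. journal_info is modeled as a plain global; everything is an illustrative toy.
--8<----------------------------------------------------------------------
#include <stddef.h>

struct toy_ctx { int nr_children; void *fs; };

/* stand-in for current->journal_info */
static struct toy_ctx *toy_journal_info;

static struct toy_ctx *toy_enter(struct toy_ctx *fresh, void *fs)
{
	struct toy_ctx *cur = toy_journal_info;

	if (cur != NULL && cur->fs == fs) {
		cur->nr_children++;     /* nested call: reuse top level */
		return cur;
	}
	fresh->nr_children = 0;         /* genuine top-level entry */
	fresh->fs = fs;
	toy_journal_info = fresh;
	return fresh;
}
--8<----------------------------------------------------------------------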
++*/ ++reiser4_context * reiser4_init_context(struct super_block *super) ++{ ++ reiser4_context *context; ++ ++ assert("nikita-2662", !in_interrupt() && !in_irq()); ++ assert("nikita-3357", super != NULL); ++ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); ++ ++ context = get_current_context_check(); ++ if (context && context->super == super) { ++ context = (reiser4_context *) current->journal_info; ++ context->nr_children++; ++ return context; ++ } ++ ++ context = kmalloc(sizeof(*context), GFP_KERNEL); ++ if (context == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ _reiser4_init_context(context, super); ++ return context; ++} ++ ++/* this is used in scan_mgr which is called with spinlock held and in ++ reiser4_fill_super magic */ ++void init_stack_context(reiser4_context *context, struct super_block *super) ++{ ++ assert("nikita-2662", !in_interrupt() && !in_irq()); ++ assert("nikita-3357", super != NULL); ++ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); ++ assert("vs-12", !is_in_reiser4_context()); ++ ++ _reiser4_init_context(context, super); ++ context->on_stack = 1; ++ return; ++} ++ ++/* cast lock stack embedded into reiser4 context up to its container */ ++reiser4_context *get_context_by_lock_stack(lock_stack * owner) ++{ ++ return container_of(owner, reiser4_context, stack); ++} ++ ++/* true if there is already _any_ reiser4 context for the current thread */ ++int is_in_reiser4_context(void) ++{ ++ reiser4_context *ctx; ++ ++ ctx = current->journal_info; ++ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic; ++} ++ ++/* ++ * call balance dirty pages for the current context. ++ * ++ * File system is expected to call balance_dirty_pages_ratelimited() whenever ++ * it dirties a page. reiser4 does this for unformatted nodes (that is, during ++ * write---this covers vast majority of all dirty traffic), but we cannot do ++ * this immediately when formatted node is dirtied, because long term lock is ++ * usually held at that time. To work around this, dirtying of formatted node ++ * simply increases ->nr_marked_dirty counter in the current reiser4 ++ * context. When we are about to leave this context, ++ * balance_dirty_pages_ratelimited() is called, if necessary. ++ * ++ * This introduces another problem: sometimes we do not want to run ++ * balance_dirty_pages_ratelimited() when leaving a context, for example ++ * because some important lock (like ->i_mutex on the parent directory) is ++ * held. To achieve this, ->nobalance flag can be set in the current context. ++ */ ++static void reiser4_throttle_write_at(reiser4_context *context) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(context->super); ++ ++ /* ++ * call balance_dirty_pages_ratelimited() to process formatted nodes ++ * dirtied during this system call. Do that only if we are not in mount ++ * and there were nodes dirtied in this context and we are not in ++ * writepage (to avoid deadlock) and not in pdflush ++ */ ++ if (sbinfo != NULL && sbinfo->fake != NULL && ++ context->nr_marked_dirty != 0 && ++ !(current->flags & PF_MEMALLOC) && ++ !current_is_flush_bd_task()) ++ /* FIXME-EDWARD: throttle with nr_marked_dirty? */ ++ reiser4_throttle_write(sbinfo->fake, 1); ++} ++ ++/* release resources associated with context. ++ ++ This function should be called at the end of "session" with reiser4, ++ typically just before leaving reiser4 driver back to VFS. 
++ ++ This is good place to put some degugging consistency checks, like that ++ thread released all locks and closed transcrash etc. ++ ++*/ ++static void reiser4_done_context(reiser4_context * context) ++ /* context being released */ ++{ ++ assert("nikita-860", context != NULL); ++ assert("nikita-859", context->magic == context_magic); ++ assert("vs-646", (reiser4_context *) current->journal_info == context); ++ assert("zam-686", !in_interrupt() && !in_irq()); ++ ++ /* only do anything when leaving top-level reiser4 context. All nested ++ * contexts are just dummies. */ ++ if (context->nr_children == 0) { ++ assert("jmacd-673", context->trans == NULL); ++ assert("jmacd-1002", lock_stack_isclean(&context->stack)); ++ assert("nikita-1936", reiser4_no_counters_are_held()); ++ assert("nikita-2626", list_empty_careful(reiser4_taps_list())); ++ assert("zam-1004", ergo(get_super_private(context->super), ++ get_super_private(context->super)->delete_mutex_owner != ++ current)); ++ ++ /* release all grabbed but as yet unused blocks */ ++ if (context->grabbed_blocks != 0) ++ all_grabbed2free(); ++ ++ /* ++ * synchronize against longterm_unlock_znode(): ++ * wake_up_requestor() wakes up requestors without holding ++ * zlock (otherwise they will immediately bump into that lock ++ * after wake up on another CPU). To work around (rare) ++ * situation where requestor has been woken up asynchronously ++ * and managed to run until completion (and destroy its ++ * context and lock stack) before wake_up_requestor() called ++ * wake_up() on it, wake_up_requestor() synchronize on lock ++ * stack spin lock. It has actually been observed that spin ++ * lock _was_ locked at this point, because ++ * wake_up_requestor() took interrupt. ++ */ ++ spin_lock_stack(&context->stack); ++ spin_unlock_stack(&context->stack); ++ ++ assert("zam-684", context->nr_children == 0); ++ /* restore original ->fs_context value */ ++ current->journal_info = context->outer; ++ if (context->on_stack == 0) ++ kfree(context); ++ } else { ++ context->nr_children--; ++#if REISER4_DEBUG ++ assert("zam-685", context->nr_children >= 0); ++#endif ++ } ++} ++ ++/* ++ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close ++ * transaction. Call done_context() to do context related book-keeping. ++ */ ++void reiser4_exit_context(reiser4_context * context) ++{ ++ assert("nikita-3021", reiser4_schedulable()); ++ ++ if (context->nr_children == 0) { ++ if (!context->nobalance) ++ reiser4_throttle_write_at(context); ++ ++ /* if filesystem is mounted with -o sync or -o dirsync - commit ++ transaction. FIXME: TXNH_DONT_COMMIT is used to avoid ++ commiting on exit_context when inode semaphore is held and ++ to have ktxnmgrd to do commit instead to get better ++ concurrent filesystem accesses. But, when one mounts with -o ++ sync, he cares more about reliability than about ++ performance. So, for now we have this simple mount -o sync ++ support. 
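Editor's sketch (not part of the patch): the tear-down rule from reiser4_done_context() above, where only the outermost exit does real work, and only kmalloc-ed contexts are freed. Illustrative toy types.
--8<----------------------------------------------------------------------
#include <stdlib.h>

struct toy_ctx {
	int nr_children;
	int on_stack;    /* set for stack-allocated contexts */
};

static void toy_done(struct toy_ctx *ctx)
{
	if (ctx->nr_children > 0) {
		ctx->nr_children--;   /* nested exit: just unwind */
		return;
	}
	/* top level: this is where locks, taps and the transaction
	 * would be checked and released ... */
	if (!ctx->on_stack)
		free(ctx);            /* stack contexts belong to the caller */
}
--8<----------------------------------------------------------------------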
*/ ++ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) { ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked_nocheck(); ++ if (atom) { ++ atom->flags |= ATOM_FORCE_COMMIT; ++ context->trans->flags &= ~TXNH_DONT_COMMIT; ++ spin_unlock_atom(atom); ++ } ++ } ++ reiser4_txn_end(context); ++ } ++ reiser4_done_context(context); ++} ++ ++void reiser4_ctx_gfp_mask_set(void) ++{ ++ reiser4_context *ctx; ++ ++ ctx = get_current_context(); ++ if (ctx->entd == 0 && ++ list_empty(&ctx->stack.locks) && ++ ctx->trans->atom == NULL) ++ ctx->gfp_mask = GFP_KERNEL; ++ else ++ ctx->gfp_mask = GFP_NOFS; ++} ++ ++void reiser4_ctx_gfp_mask_force(gfp_t mask) ++{ ++ reiser4_context *ctx; ++ ctx = get_current_context(); ++ ++ assert("edward-1454", ctx != NULL); ++ ++ ctx->gfp_mask = mask; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/context.h linux-2.6.33/fs/reiser4/context.h +--- linux-2.6.33.orig/fs/reiser4/context.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/context.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,228 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Reiser4 context. See context.c for details. */ ++ ++#if !defined( __REISER4_CONTEXT_H__ ) ++#define __REISER4_CONTEXT_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "tap.h" ++#include "lock.h" ++ ++#include <linux/types.h> /* for __u?? */ ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/spinlock.h> ++#include <linux/sched.h> /* for struct task_struct */ ++ ++/* reiser4 per-thread context */ ++struct reiser4_context { ++ /* magic constant. For identification of reiser4 contexts. */ ++ __u32 magic; ++ ++ /* current lock stack. See lock.[ch]. This is where list of all ++ locks taken by current thread is kept. This is also used in ++ deadlock detection. */ ++ lock_stack stack; ++ ++ /* current transcrash. */ ++ txn_handle *trans; ++ /* transaction handle embedded into reiser4_context. ->trans points ++ * here by default. */ ++ txn_handle trans_in_ctx; ++ ++ /* super block we are working with. To get the current tree ++ use &get_super_private (reiser4_get_current_sb ())->tree. */ ++ struct super_block *super; ++ ++ /* parent fs activation */ ++ struct fs_activation *outer; ++ ++ /* per-thread grabbed (for further allocation) blocks counter */ ++ reiser4_block_nr grabbed_blocks; ++ ++ /* list of taps currently monitored. See tap.c */ ++ struct list_head taps; ++ ++ /* grabbing space is enabled */ ++ unsigned int grab_enabled:1; ++ /* should be set when we are write dirty nodes to disk in jnode_flush or ++ * reiser4_write_logs() */ ++ unsigned int writeout_mode:1; ++ /* true, if current thread is an ent thread */ ++ unsigned int entd:1; ++ /* true, if balance_dirty_pages() should not be run when leaving this ++ * context. 
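Editor's sketch (not part of the patch): the allocation-mask policy of reiser4_ctx_gfp_mask_set() above. GFP_KERNEL is safe only while no long-term locks or atom are held; otherwise GFP_NOFS keeps memory reclaim from re-entering the filesystem. Flags and types are illustrative.
--8<----------------------------------------------------------------------
enum toy_gfp { TOY_GFP_KERNEL, TOY_GFP_NOFS };

struct toy_ctx {
	int is_entd;      /* ent thread must not recurse into the fs */
	int locks_held;   /* long-term znode locks on the stack */
	int in_atom;      /* transaction handle bound to an atom */
};

static enum toy_gfp toy_gfp_mask(const struct toy_ctx *c)
{
	if (!c->is_entd && !c->locks_held && !c->in_atom)
		return TOY_GFP_KERNEL;  /* reclaim may enter the fs */
	return TOY_GFP_NOFS;            /* otherwise keep reclaim out */
}
--8<----------------------------------------------------------------------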
This is used to avoid lengthly balance_dirty_pages() ++ * operation when holding some important resource, like directory ++ * ->i_mutex */ ++ unsigned int nobalance:1; ++ ++ /* this bit is used on reiser4_done_context to decide whether context is ++ kmalloc-ed and has to be kfree-ed */ ++ unsigned int on_stack:1; ++ ++ /* count non-trivial jnode_set_dirty() calls */ ++ unsigned long nr_marked_dirty; ++ ++ /* reiser4_writeback_inodes calls (via generic_writeback_sb_inodes) ++ * reiser4_writepages for each of dirty inodes. Reiser4_writepages ++ * captures pages. When number of pages captured in one ++ * reiser4_sync_inodes reaches some threshold - some atoms get ++ * flushed */ ++ int nr_captured; ++ int nr_children; /* number of child contexts */ ++#if REISER4_DEBUG ++ /* debugging information about reiser4 locks held by the current ++ * thread */ ++ reiser4_lock_cnt_info locks; ++ struct task_struct *task; /* so we can easily find owner of the stack */ ++ ++ /* ++ * disk space grabbing debugging support ++ */ ++ /* how many disk blocks were grabbed by the first call to ++ * reiser4_grab_space() in this context */ ++ reiser4_block_nr grabbed_initially; ++ ++ /* list of all threads doing flush currently */ ++ struct list_head flushers_link; ++ /* information about last error encountered by reiser4 */ ++ err_site err; ++#endif ++ void *vp; ++ gfp_t gfp_mask; ++}; ++ ++extern reiser4_context *get_context_by_lock_stack(lock_stack *); ++ ++/* Debugging helps. */ ++#if REISER4_DEBUG ++extern void print_contexts(void); ++#endif ++ ++#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree)) ++#define current_blocksize reiser4_get_current_sb()->s_blocksize ++#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits ++ ++extern reiser4_context *reiser4_init_context(struct super_block *); ++extern void init_stack_context(reiser4_context *, struct super_block *); ++extern void reiser4_exit_context(reiser4_context *); ++ ++/* magic constant we store in reiser4_context allocated at the stack. Used to ++ catch accesses to staled or uninitialized contexts. */ ++#define context_magic ((__u32) 0x4b1b5d0b) ++ ++extern int is_in_reiser4_context(void); ++ ++/* ++ * return reiser4_context for the thread @tsk ++ */ ++static inline reiser4_context *get_context(const struct task_struct *tsk) ++{ ++ assert("vs-1682", ++ ((reiser4_context *) tsk->journal_info)->magic == context_magic); ++ return (reiser4_context *) tsk->journal_info; ++} ++ ++/* ++ * return reiser4 context of the current thread, or NULL if there is none. ++ */ ++static inline reiser4_context *get_current_context_check(void) ++{ ++ if (is_in_reiser4_context()) ++ return get_context(current); ++ else ++ return NULL; ++} ++ ++static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */ ++ ++/* return context associated with current thread */ ++static inline reiser4_context *get_current_context(void) ++{ ++ return get_context(current); ++} ++ ++static inline gfp_t reiser4_ctx_gfp_mask_get(void) ++{ ++ reiser4_context *ctx; ++ ++ ctx = get_current_context_check(); ++ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask; ++} ++ ++void reiser4_ctx_gfp_mask_set(void); ++void reiser4_ctx_gfp_mask_force (gfp_t mask); ++ ++/* ++ * true if current thread is in the write-out mode. Thread enters write-out ++ * mode during jnode_flush and reiser4_write_logs(). 
++ */ ++static inline int is_writeout_mode(void) ++{ ++ return get_current_context()->writeout_mode; ++} ++ ++/* ++ * enter write-out mode ++ */ ++static inline void writeout_mode_enable(void) ++{ ++ assert("zam-941", !get_current_context()->writeout_mode); ++ get_current_context()->writeout_mode = 1; ++} ++ ++/* ++ * leave write-out mode ++ */ ++static inline void writeout_mode_disable(void) ++{ ++ assert("zam-942", get_current_context()->writeout_mode); ++ get_current_context()->writeout_mode = 0; ++} ++ ++static inline void grab_space_enable(void) ++{ ++ get_current_context()->grab_enabled = 1; ++} ++ ++static inline void grab_space_disable(void) ++{ ++ get_current_context()->grab_enabled = 0; ++} ++ ++static inline void grab_space_set_enabled(int enabled) ++{ ++ get_current_context()->grab_enabled = enabled; ++} ++ ++static inline int is_grab_enabled(reiser4_context * ctx) ++{ ++ return ctx->grab_enabled; ++} ++ ++/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or ++ * flush would be performed when it is closed. This is necessary when handle ++ * has to be closed under some coarse semaphore, like i_mutex of ++ * directory. Commit will be performed by ktxnmgrd. */ ++static inline void context_set_commit_async(reiser4_context * context) ++{ ++ context->nobalance = 1; ++ context->trans->flags |= TXNH_DONT_COMMIT; ++} ++ ++/* __REISER4_CONTEXT_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/coord.c linux-2.6.33/fs/reiser4/coord.c +--- linux-2.6.33.orig/fs/reiser4/coord.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/coord.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,928 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "tree.h" ++#include "plugin/item/item.h" ++#include "znode.h" ++#include "coord.h" ++ ++/* Internal constructor. */ ++static inline void ++coord_init_values(coord_t *coord, const znode * node, pos_in_node_t item_pos, ++ pos_in_node_t unit_pos, between_enum between) ++{ ++ coord->node = (znode *) node; ++ coord_set_item_pos(coord, item_pos); ++ coord->unit_pos = unit_pos; ++ coord->between = between; ++ ON_DEBUG(coord->plug_v = 0); ++ ON_DEBUG(coord->body_v = 0); ++ ++ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, ++ node, item_pos, unit_pos, coord_tween_tostring (between)); */ ++} ++ ++/* after shifting of node content, coord previously set properly may become ++ invalid, try to "normalize" it. 
*/ ++void coord_normalize(coord_t *coord) ++{ ++ znode *node; ++ ++ node = coord->node; ++ assert("vs-683", node); ++ ++ coord_clear_iplug(coord); ++ ++ if (node_is_empty(node)) { ++ coord_init_first_unit(coord, node); ++ } else if ((coord->between == AFTER_ITEM) ++ || (coord->between == AFTER_UNIT)) { ++ return; ++ } else if (coord->item_pos == coord_num_items(coord) ++ && coord->between == BEFORE_ITEM) { ++ coord_dec_item_pos(coord); ++ coord->between = AFTER_ITEM; ++ } else if (coord->unit_pos == coord_num_units(coord) ++ && coord->between == BEFORE_UNIT) { ++ coord->unit_pos--; ++ coord->between = AFTER_UNIT; ++ } else if (coord->item_pos == coord_num_items(coord) ++ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) { ++ coord_dec_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++ } ++} ++ ++/* Copy a coordinate. */ ++void coord_dup(coord_t *coord, const coord_t *old_coord) ++{ ++ assert("jmacd-9800", coord_check(old_coord)); ++ coord_dup_nocheck(coord, old_coord); ++} ++ ++/* Copy a coordinate without check. Useful when old_coord->node is not ++ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */ ++void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord) ++{ ++ coord->node = old_coord->node; ++ coord_set_item_pos(coord, old_coord->item_pos); ++ coord->unit_pos = old_coord->unit_pos; ++ coord->between = old_coord->between; ++ coord->iplugid = old_coord->iplugid; ++ ON_DEBUG(coord->plug_v = old_coord->plug_v); ++ ON_DEBUG(coord->body_v = old_coord->body_v); ++} ++ ++/* Initialize an invalid coordinate. */ ++void coord_init_invalid(coord_t *coord, const znode * node) ++{ ++ coord_init_values(coord, node, 0, 0, INVALID_COORD); ++} ++ ++void coord_init_first_unit_nocheck(coord_t *coord, const znode * node) ++{ ++ coord_init_values(coord, node, 0, 0, AT_UNIT); ++} ++ ++/* Initialize a coordinate to point at the first unit of the first item. If the ++ node is empty, it is positioned at the EMPTY_NODE. */ ++void coord_init_first_unit(coord_t *coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT)); ++ ++ assert("jmacd-9801", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to point at the last unit of the last item. If the ++ node is empty, it is positioned at the EMPTY_NODE. */ ++void coord_init_last_unit(coord_t *coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, ++ (is_empty ? 0 : node_num_items(node) - 1), 0, ++ (is_empty ? EMPTY_NODE : AT_UNIT)); ++ if (!is_empty) ++ coord->unit_pos = coord_last_unit_pos(coord); ++ assert("jmacd-9802", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to before the first item. If the node is empty, it is ++ positioned at the EMPTY_NODE. */ ++void coord_init_before_first_item(coord_t *coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, 0, 0, ++ (is_empty ? EMPTY_NODE : BEFORE_UNIT)); ++ ++ assert("jmacd-9803", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to after the last item. If the node is empty, it is ++ positioned at the EMPTY_NODE. */ ++void coord_init_after_last_item(coord_t *coord, const znode * node) ++{ ++ int is_empty = node_is_empty(node); ++ ++ coord_init_values(coord, node, ++ (is_empty ? 0 : node_num_items(node) - 1), 0, ++ (is_empty ? 
EMPTY_NODE : AFTER_ITEM)); ++ ++ assert("jmacd-9804", coord_check(coord)); ++} ++ ++/* Initialize a coordinate to after last unit in the item. Coord must be set ++ already to existing item */ ++void coord_init_after_item_end(coord_t *coord) ++{ ++ coord->between = AFTER_UNIT; ++ coord->unit_pos = coord_last_unit_pos(coord); ++} ++ ++/* Initialize a coordinate to before the item. Coord must be set already to ++ existing item */ ++void coord_init_before_item(coord_t *coord) ++{ ++ coord->unit_pos = 0; ++ coord->between = BEFORE_ITEM; ++} ++ ++/* Initialize a coordinate to after the item. Coord must be set already to ++ existing item */ ++void coord_init_after_item(coord_t *coord) ++{ ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++} ++ ++/* Initialize a coordinate by 0s. Used in places where init_coord was used and ++ it was not clear how actually */ ++void coord_init_zero(coord_t *coord) ++{ ++ memset(coord, 0, sizeof(*coord)); ++} ++ ++/* Return the number of units at the present item. ++ Asserts coord_is_existing_item(). */ ++unsigned coord_num_units(const coord_t *coord) ++{ ++ assert("jmacd-9806", coord_is_existing_item(coord)); ++ ++ return item_plugin_by_coord(coord)->b.nr_units(coord); ++} ++ ++/* Returns true if the coord was initializewd by coord_init_invalid (). */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_invalid(const coord_t *coord) ++{ ++ return coord->between == INVALID_COORD; ++} ++ ++/* Returns true if the coordinate is positioned at an existing item, not before ++ or after an item. It may be placed at, before, or after any unit within the ++ item, whether existing or not. */ ++int coord_is_existing_item(const coord_t *coord) ++{ ++ switch (coord->between) { ++ case EMPTY_NODE: ++ case BEFORE_ITEM: ++ case AFTER_ITEM: ++ case INVALID_COORD: ++ return 0; ++ ++ case BEFORE_UNIT: ++ case AT_UNIT: ++ case AFTER_UNIT: ++ return coord->item_pos < coord_num_items(coord); ++ } ++ ++ impossible("jmacd-9900", "unreachable coord: %p", coord); ++ return 0; ++} ++ ++/* Returns true if the coordinate is positioned at an existing unit, not before ++ or after a unit. */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_existing_unit(const coord_t *coord) ++{ ++ switch (coord->between) { ++ case EMPTY_NODE: ++ case BEFORE_UNIT: ++ case AFTER_UNIT: ++ case BEFORE_ITEM: ++ case AFTER_ITEM: ++ case INVALID_COORD: ++ return 0; ++ ++ case AT_UNIT: ++ return (coord->item_pos < coord_num_items(coord) ++ && coord->unit_pos < coord_num_units(coord)); ++ } ++ ++ impossible("jmacd-9902", "unreachable"); ++ return 0; ++} ++ ++/* Returns true if the coordinate is positioned at the first unit of the first ++ item. Not true for empty nodes nor coordinates positioned before the first ++ item. */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_leftmost_unit(const coord_t *coord) ++{ ++ return (coord->between == AT_UNIT && coord->item_pos == 0 ++ && coord->unit_pos == 0); ++} ++ ++#if REISER4_DEBUG ++/* For assertions only, checks for a valid coordinate. 
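Editor's sketch (not part of the patch): the coordinate model that coord_check() below validates. A coordinate is an (item_pos, unit_pos) pair plus a "between" tag saying how the pair is to be read. Reduced toy version:
--8<----------------------------------------------------------------------
enum toy_between {
	TOY_INVALID, TOY_EMPTY_NODE,
	TOY_BEFORE_UNIT, TOY_AT_UNIT, TOY_AFTER_UNIT,
	TOY_BEFORE_ITEM, TOY_AFTER_ITEM
};

struct toy_coord {
	unsigned item_pos;         /* which item in the node */
	unsigned unit_pos;         /* which unit inside that item */
	enum toy_between between;  /* how the two numbers are to be read */
};

/* the coordinate names an existing unit only in the AT_UNIT state and
 * only when both positions are in range */
static int toy_is_existing_unit(const struct toy_coord *c,
				unsigned items, unsigned units)
{
	return c->between == TOY_AT_UNIT &&
	       c->item_pos < items && c->unit_pos < units;
}
--8<----------------------------------------------------------------------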
*/ ++int coord_check(const coord_t *coord) ++{ ++ if (coord->node == NULL) ++ return 0; ++ if (znode_above_root(coord->node)) ++ return 1; ++ ++ switch (coord->between) { ++ default: ++ case INVALID_COORD: ++ return 0; ++ case EMPTY_NODE: ++ if (!node_is_empty(coord->node)) ++ return 0; ++ return coord->item_pos == 0 && coord->unit_pos == 0; ++ ++ case BEFORE_UNIT: ++ case AFTER_UNIT: ++ if (node_is_empty(coord->node) && (coord->item_pos == 0) ++ && (coord->unit_pos == 0)) ++ return 1; ++ case AT_UNIT: ++ break; ++ case AFTER_ITEM: ++ case BEFORE_ITEM: ++ /* before/after item should not set unit_pos. */ ++ if (coord->unit_pos != 0) ++ return 0; ++ break; ++ } ++ ++ if (coord->item_pos >= node_num_items(coord->node)) ++ return 0; ++ ++ /* FIXME-VS: we are going to check unit_pos. This makes no sense when ++ between is set either AFTER_ITEM or BEFORE_ITEM */ ++ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM) ++ return 1; ++ ++ if (coord_is_iplug_set(coord) && ++ coord->unit_pos > ++ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) ++ return 0; ++ return 1; ++} ++#endif ++ ++/* Adjust coordinate boundaries based on the number of items prior to ++ coord_next/prev. Returns 1 if the new position is does not exist. */ ++static int coord_adjust_items(coord_t *coord, unsigned items, int is_next) ++{ ++ /* If the node is invalid, leave it. */ ++ if (coord->between == INVALID_COORD) ++ return 1; ++ ++ /* If the node is empty, set it appropriately. */ ++ if (items == 0) { ++ coord->between = EMPTY_NODE; ++ coord_set_item_pos(coord, 0); ++ coord->unit_pos = 0; ++ return 1; ++ } ++ ++ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */ ++ if (coord->between == EMPTY_NODE) { ++ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM); ++ coord_set_item_pos(coord, 0); ++ coord->unit_pos = 0; ++ return 0; ++ } ++ ++ /* If the item_pos is out-of-range, set it appropriatly. */ ++ if (coord->item_pos >= items) { ++ coord->between = AFTER_ITEM; ++ coord_set_item_pos(coord, items - 1); ++ coord->unit_pos = 0; ++ /* If is_next, return 1 (can't go any further). */ ++ return is_next; ++ } ++ ++ return 0; ++} ++ ++/* Advances the coordinate by one unit to the right. If empty, no change. If ++ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new ++ position is an existing unit. */ ++int coord_next_unit(coord_t *coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 1) == 1) ++ return 1; ++ ++ switch (coord->between) { ++ case BEFORE_UNIT: ++ /* Now it is positioned at the same unit. */ ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_UNIT: ++ case AT_UNIT: ++ /* If it was at or after a unit and there are more units in this ++ item, advance to the next one. */ ++ if (coord->unit_pos < coord_last_unit_pos(coord)) { ++ coord->unit_pos += 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ /* Otherwise, it is crossing an item boundary and treated as if ++ it was after the current item. */ ++ coord->between = AFTER_ITEM; ++ coord->unit_pos = 0; ++ /* FALLTHROUGH */ ++ ++ case AFTER_ITEM: ++ /* Check for end-of-node. */ ++ if (coord->item_pos == items - 1) ++ return 1; ++ ++ coord_inc_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case BEFORE_ITEM: ++ /* The adjust_items checks ensure that we are valid here. */ ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ /* Handled in coord_adjust_items(). 
*/ ++ break; ++ } ++ ++ impossible("jmacd-9902", "unreachable"); ++ return 0; ++} ++ ++/* Advances the coordinate by one item to the right. If empty, no change. If ++ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new ++ position is an existing item. */ ++int coord_next_item(coord_t *coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 1) == 1) ++ return 1; ++ ++ switch (coord->between) { ++ case AFTER_UNIT: ++ case AT_UNIT: ++ case BEFORE_UNIT: ++ case AFTER_ITEM: ++ /* Check for end-of-node. */ ++ if (coord->item_pos == items - 1) { ++ coord->between = AFTER_ITEM; ++ coord->unit_pos = 0; ++ coord_clear_iplug(coord); ++ return 1; ++ } ++ ++ /* Anywhere in an item, go to the next one. */ ++ coord->between = AT_UNIT; ++ coord_inc_item_pos(coord); ++ coord->unit_pos = 0; ++ return 0; ++ ++ case BEFORE_ITEM: ++ /* The out-of-range check ensures that we are valid here. */ ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ /* Handled in coord_adjust_items(). */ ++ break; ++ } ++ ++ impossible("jmacd-9903", "unreachable"); ++ return 0; ++} ++ ++/* Advances the coordinate by one unit to the left. If empty, no change. If ++ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new ++ position is an existing unit. */ ++int coord_prev_unit(coord_t *coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 0) == 1) ++ return 1; ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ case BEFORE_UNIT: ++ if (coord->unit_pos > 0) { ++ coord->unit_pos -= 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ if (coord->item_pos == 0) { ++ coord->between = BEFORE_ITEM; ++ return 1; ++ } ++ ++ coord_dec_item_pos(coord); ++ coord->unit_pos = coord_last_unit_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_UNIT: ++ /* What if unit_pos is out-of-range? */ ++ assert("jmacd-5442", ++ coord->unit_pos <= coord_last_unit_pos(coord)); ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case BEFORE_ITEM: ++ if (coord->item_pos == 0) ++ return 1; ++ ++ coord_dec_item_pos(coord); ++ /* FALLTHROUGH */ ++ ++ case AFTER_ITEM: ++ coord->between = AT_UNIT; ++ coord->unit_pos = coord_last_unit_pos(coord); ++ return 0; ++ ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ break; ++ } ++ ++ impossible("jmacd-9904", "unreachable"); ++ return 0; ++} ++ ++/* Advances the coordinate by one item to the left. If empty, no change. If ++ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new ++ position is an existing item. */ ++int coord_prev_item(coord_t *coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 0) == 1) ++ return 1; ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ case AFTER_UNIT: ++ case BEFORE_UNIT: ++ case BEFORE_ITEM: ++ ++ if (coord->item_pos == 0) { ++ coord->between = BEFORE_ITEM; ++ coord->unit_pos = 0; ++ return 1; ++ } ++ ++ coord_dec_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_ITEM: ++ coord->between = AT_UNIT; ++ coord->unit_pos = 0; ++ return 0; ++ ++ case INVALID_COORD: ++ case EMPTY_NODE: ++ break; ++ } ++ ++ impossible("jmacd-9905", "unreachable"); ++ return 0; ++} ++ ++/* Calls either coord_init_first_unit or coord_init_last_unit depending on ++ sideof argument. 
*/ ++void coord_init_sideof_unit(coord_t *coord, const znode * node, sideof dir) ++{ ++ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE); ++ if (dir == LEFT_SIDE) { ++ coord_init_first_unit(coord, node); ++ } else { ++ coord_init_last_unit(coord, node); ++ } ++} ++ ++/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending ++ on sideof argument. */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_after_sideof_unit(coord_t *coord, sideof dir) ++{ ++ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE); ++ if (dir == LEFT_SIDE) { ++ return coord_is_before_leftmost(coord); ++ } else { ++ return coord_is_after_rightmost(coord); ++ } ++} ++ ++/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. ++ */ ++/* Audited by: green(2002.06.15) */ ++int coord_sideof_unit(coord_t *coord, sideof dir) ++{ ++ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE); ++ if (dir == LEFT_SIDE) { ++ return coord_prev_unit(coord); ++ } else { ++ return coord_next_unit(coord); ++ } ++} ++ ++#if REISER4_DEBUG ++int coords_equal(const coord_t *c1, const coord_t *c2) ++{ ++ assert("nikita-2840", c1 != NULL); ++ assert("nikita-2841", c2 != NULL); ++ ++ return ++ c1->node == c2->node && ++ c1->item_pos == c2->item_pos && ++ c1->unit_pos == c2->unit_pos && c1->between == c2->between; ++} ++#endif /* REISER4_DEBUG */ ++ ++/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if ++ coord_is_after_leftmost return NCOORD_ON_THE_LEFT, otherwise return ++ NCOORD_INSIDE. */ ++/* Audited by: green(2002.06.15) */ ++coord_wrt_node coord_wrt(const coord_t *coord) ++{ ++ if (coord_is_before_leftmost(coord)) ++ return COORD_ON_THE_LEFT; ++ ++ if (coord_is_after_rightmost(coord)) ++ return COORD_ON_THE_RIGHT; ++ ++ return COORD_INSIDE; ++} ++ ++/* Returns true if the coordinate is positioned after the last item or after the ++ last unit of the last item or it is an empty node. */ ++/* Audited by: green(2002.06.15) */ ++int coord_is_after_rightmost(const coord_t *coord) ++{ ++ assert("jmacd-7313", coord_check(coord)); ++ ++ switch (coord->between) { ++ case INVALID_COORD: ++ case AT_UNIT: ++ case BEFORE_UNIT: ++ case BEFORE_ITEM: ++ return 0; ++ ++ case EMPTY_NODE: ++ return 1; ++ ++ case AFTER_ITEM: ++ return (coord->item_pos == node_num_items(coord->node) - 1); ++ ++ case AFTER_UNIT: ++ return ((coord->item_pos == node_num_items(coord->node) - 1) && ++ coord->unit_pos == coord_last_unit_pos(coord)); ++ } ++ ++ impossible("jmacd-9908", "unreachable"); ++ return 0; ++} ++ ++/* Returns true if the coordinate is positioned before the first item or it is ++ an empty node. */ ++int coord_is_before_leftmost(const coord_t *coord) ++{ ++ /* FIXME-VS: coord_check requires node to be loaded whereas it is not ++ necessary to check if coord is set before leftmost ++ assert ("jmacd-7313", coord_check (coord)); */ ++ switch (coord->between) { ++ case INVALID_COORD: ++ case AT_UNIT: ++ case AFTER_ITEM: ++ case AFTER_UNIT: ++ return 0; ++ ++ case EMPTY_NODE: ++ return 1; ++ ++ case BEFORE_ITEM: ++ case BEFORE_UNIT: ++ return (coord->item_pos == 0) && (coord->unit_pos == 0); ++ } ++ ++ impossible("jmacd-9908", "unreachable"); ++ return 0; ++} ++ ++/* Returns true if the coordinate is positioned after a item, before a item, ++ after the last unit of an item, before the first unit of an item, or at an ++ empty node. 
*/ ++/* Audited by: green(2002.06.15) */ ++int coord_is_between_items(const coord_t *coord) ++{ ++ assert("jmacd-7313", coord_check(coord)); ++ ++ switch (coord->between) { ++ case INVALID_COORD: ++ case AT_UNIT: ++ return 0; ++ ++ case AFTER_ITEM: ++ case BEFORE_ITEM: ++ case EMPTY_NODE: ++ return 1; ++ ++ case BEFORE_UNIT: ++ return coord->unit_pos == 0; ++ ++ case AFTER_UNIT: ++ return coord->unit_pos == coord_last_unit_pos(coord); ++ } ++ ++ impossible("jmacd-9908", "unreachable"); ++ return 0; ++} ++ ++#if REISER4_DEBUG ++/* Returns true if the coordinates are positioned at adjacent units, regardless ++ of before-after or item boundaries. */ ++int coord_are_neighbors(coord_t *c1, coord_t *c2) ++{ ++ coord_t *left; ++ coord_t *right; ++ ++ assert("nikita-1241", c1 != NULL); ++ assert("nikita-1242", c2 != NULL); ++ assert("nikita-1243", c1->node == c2->node); ++ assert("nikita-1244", coord_is_existing_unit(c1)); ++ assert("nikita-1245", coord_is_existing_unit(c2)); ++ ++ left = right = NULL; ++ switch (coord_compare(c1, c2)) { ++ case COORD_CMP_ON_LEFT: ++ left = c1; ++ right = c2; ++ break; ++ case COORD_CMP_ON_RIGHT: ++ left = c2; ++ right = c1; ++ break; ++ case COORD_CMP_SAME: ++ return 0; ++ default: ++ wrong_return_value("nikita-1246", "compare_coords()"); ++ } ++ assert("vs-731", left && right); ++ if (left->item_pos == right->item_pos) { ++ return left->unit_pos + 1 == right->unit_pos; ++ } else if (left->item_pos + 1 == right->item_pos) { ++ return (left->unit_pos == coord_last_unit_pos(left)) ++ && (right->unit_pos == 0); ++ } else { ++ return 0; ++ } ++} ++#endif /* REISER4_DEBUG */ ++ ++/* Assuming two coordinates are positioned in the same node, return ++ COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's ++ position relative to c2. */ ++/* Audited by: green(2002.06.15) */ ++coord_cmp coord_compare(coord_t *c1, coord_t *c2) ++{ ++ assert("vs-209", c1->node == c2->node); ++ assert("vs-194", coord_is_existing_unit(c1) ++ && coord_is_existing_unit(c2)); ++ ++ if (c1->item_pos > c2->item_pos) ++ return COORD_CMP_ON_RIGHT; ++ if (c1->item_pos < c2->item_pos) ++ return COORD_CMP_ON_LEFT; ++ if (c1->unit_pos > c2->unit_pos) ++ return COORD_CMP_ON_RIGHT; ++ if (c1->unit_pos < c2->unit_pos) ++ return COORD_CMP_ON_LEFT; ++ return COORD_CMP_SAME; ++} ++ ++/* If the coordinate is between items, shifts it to the right. Returns 0 on ++ success and non-zero if there is no position to the right. */ ++int coord_set_to_right(coord_t *coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 1) == 1) ++ return 1; ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ return 0; ++ ++ case BEFORE_ITEM: ++ case BEFORE_UNIT: ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_UNIT: ++ if (coord->unit_pos < coord_last_unit_pos(coord)) { ++ coord->unit_pos += 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } else { ++ ++ coord->unit_pos = 0; ++ ++ if (coord->item_pos == items - 1) { ++ coord->between = AFTER_ITEM; ++ return 1; ++ } ++ ++ coord_inc_item_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ case AFTER_ITEM: ++ if (coord->item_pos == items - 1) ++ return 1; ++ ++ coord_inc_item_pos(coord); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case EMPTY_NODE: ++ return 1; ++ ++ case INVALID_COORD: ++ break; ++ } ++ ++ impossible("jmacd-9920", "unreachable"); ++ return 0; ++} ++ ++/* If the coordinate is between items, shifts it to the left. 
Returns 0 on ++ success and non-zero if there is no position to the left. */ ++int coord_set_to_left(coord_t *coord) ++{ ++ unsigned items = coord_num_items(coord); ++ ++ if (coord_adjust_items(coord, items, 0) == 1) ++ return 1; ++ ++ switch (coord->between) { ++ case AT_UNIT: ++ return 0; ++ ++ case AFTER_UNIT: ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case AFTER_ITEM: ++ coord->between = AT_UNIT; ++ coord->unit_pos = coord_last_unit_pos(coord); ++ return 0; ++ ++ case BEFORE_UNIT: ++ if (coord->unit_pos > 0) { ++ coord->unit_pos -= 1; ++ coord->between = AT_UNIT; ++ return 0; ++ } else { ++ ++ if (coord->item_pos == 0) { ++ coord->between = BEFORE_ITEM; ++ return 1; ++ } ++ ++ coord->unit_pos = coord_last_unit_pos(coord); ++ coord_dec_item_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ } ++ ++ case BEFORE_ITEM: ++ if (coord->item_pos == 0) ++ return 1; ++ ++ coord_dec_item_pos(coord); ++ coord->unit_pos = coord_last_unit_pos(coord); ++ coord->between = AT_UNIT; ++ return 0; ++ ++ case EMPTY_NODE: ++ return 1; ++ ++ case INVALID_COORD: ++ break; ++ } ++ ++ impossible("jmacd-9920", "unreachable"); ++ return 0; ++} ++ ++static const char *coord_tween_tostring(between_enum n) ++{ ++ switch (n) { ++ case BEFORE_UNIT: ++ return "before unit"; ++ case BEFORE_ITEM: ++ return "before item"; ++ case AT_UNIT: ++ return "at unit"; ++ case AFTER_UNIT: ++ return "after unit"; ++ case AFTER_ITEM: ++ return "after item"; ++ case EMPTY_NODE: ++ return "empty node"; ++ case INVALID_COORD: ++ return "invalid"; ++ default: ++ { ++ static char buf[30]; ++ ++ sprintf(buf, "unknown: %i", n); ++ return buf; ++ } ++ } ++} ++ ++void print_coord(const char *mes, const coord_t *coord, int node) ++{ ++ if (coord == NULL) { ++ printk("%s: null\n", mes); ++ return; ++ } ++ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n", ++ mes, coord->item_pos, coord->unit_pos, ++ coord_tween_tostring(coord->between), coord->iplugid); ++} ++ ++int ++item_utmost_child_real_block(const coord_t *coord, sideof side, ++ reiser4_block_nr * blk) ++{ ++ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, ++ side, ++ blk); ++} ++ ++int item_utmost_child(const coord_t *coord, sideof side, jnode ** child) ++{ ++ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child); ++} ++ ++/* @count bytes of flow @f got written, update correspondingly f->length, ++ f->data and f->key */ ++void move_flow_forward(flow_t *f, unsigned count) ++{ ++ if (f->data) ++ f->data += count; ++ f->length -= count; ++ set_key_offset(&f->key, get_key_offset(&f->key) + count); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/coord.h linux-2.6.33/fs/reiser4/coord.h +--- linux-2.6.33.orig/fs/reiser4/coord.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/coord.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,399 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* Coords */ ++ ++#if !defined(__REISER4_COORD_H__) ++#define __REISER4_COORD_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++ ++/* insertions happen between coords in the tree, so we need some means ++ of specifying the sense of betweenness. */ ++typedef enum { ++ BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. 
*/ ++ AT_UNIT, ++ AFTER_UNIT, ++ BEFORE_ITEM, ++ AFTER_ITEM, ++ INVALID_COORD, ++ EMPTY_NODE, ++} between_enum; ++ ++/* location of coord w.r.t. its node */ ++typedef enum { ++ COORD_ON_THE_LEFT = -1, ++ COORD_ON_THE_RIGHT = +1, ++ COORD_INSIDE = 0 ++} coord_wrt_node; ++ ++typedef enum { ++ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1 ++} coord_cmp; ++ ++struct coord { ++ /* node in a tree */ ++ /* 0 */ znode *node; ++ ++ /* position of item within node */ ++ /* 4 */ pos_in_node_t item_pos; ++ /* position of unit within item */ ++ /* 6 */ pos_in_node_t unit_pos; ++ /* optimization: plugin of item is stored in coord_t. Until this was ++ implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid ++ is invalidated (set to 0xff) on each modification of ->item_pos, ++ and all such modifications are funneled through coord_*_item_pos() ++ functions below. ++ */ ++ /* 8 */ char iplugid; ++ /* position of coord w.r.t. to neighboring items and/or units. ++ Values are taken from &between_enum above. ++ */ ++ /* 9 */ char between; ++ /* padding. It will be added by the compiler anyway to conform to the ++ * C language alignment requirements. We keep it here to be on the ++ * safe side and to have a clear picture of the memory layout of this ++ * structure. */ ++ /* 10 */ __u16 pad; ++ /* 12 */ int offset; ++#if REISER4_DEBUG ++ unsigned long plug_v; ++ unsigned long body_v; ++#endif ++}; ++ ++#define INVALID_PLUGID ((char)((1 << 8) - 1)) ++#define INVALID_OFFSET -1 ++ ++static inline void coord_clear_iplug(coord_t *coord) ++{ ++ assert("nikita-2835", coord != NULL); ++ coord->iplugid = INVALID_PLUGID; ++ coord->offset = INVALID_OFFSET; ++} ++ ++static inline int coord_is_iplug_set(const coord_t *coord) ++{ ++ assert("nikita-2836", coord != NULL); ++ return coord->iplugid != INVALID_PLUGID; ++} ++ ++static inline void coord_set_item_pos(coord_t *coord, pos_in_node_t pos) ++{ ++ assert("nikita-2478", coord != NULL); ++ coord->item_pos = pos; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_dec_item_pos(coord_t *coord) ++{ ++ assert("nikita-2480", coord != NULL); ++ --coord->item_pos; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_inc_item_pos(coord_t *coord) ++{ ++ assert("nikita-2481", coord != NULL); ++ ++coord->item_pos; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_add_item_pos(coord_t *coord, int delta) ++{ ++ assert("nikita-2482", coord != NULL); ++ coord->item_pos += delta; ++ coord_clear_iplug(coord); ++} ++ ++static inline void coord_invalid_item_pos(coord_t *coord) ++{ ++ assert("nikita-2832", coord != NULL); ++ coord->item_pos = (unsigned short)~0; ++ coord_clear_iplug(coord); ++} ++ ++/* Reverse a direction. */ ++static inline sideof sideof_reverse(sideof side) ++{ ++ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE; ++} ++ ++/* NOTE: There is a somewhat odd mixture of the following opposed terms: ++ ++ "first" and "last" ++ "next" and "prev" ++ "before" and "after" ++ "leftmost" and "rightmost" ++ ++ But I think the chosen names are decent the way they are. ++*/ ++ ++/* COORD INITIALIZERS */ ++ ++/* Initialize an invalid coordinate. */ ++extern void coord_init_invalid(coord_t *coord, const znode * node); ++ ++extern void coord_init_first_unit_nocheck(coord_t *coord, const znode * node); ++ ++/* Initialize a coordinate to point at the first unit of the first item. If the ++ node is empty, it is positioned at the EMPTY_NODE. 
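The ->iplugid caching rule in the struct comment above only works because every mutation of ->item_pos is funneled through coord_set_item_pos() and its inc/dec/add relatives, each of which drops the cached plugin id. A userspace sketch of that invariant; all names here are invented, and the "slow lookup" merely stands in for item_plugin_by_coord():

    /* Sketch of the iplugid caching invariant: the cache is only
     * trustworthy if every item_pos change goes through one setter
     * that invalidates it. */
    #include <stdio.h>

    #define TOY_INVALID_PLUG 0xff

    struct toy_coord { unsigned short item_pos; unsigned char iplugid; };

    /* All item_pos changes funnel through here, like coord_set_item_pos(). */
    static void toy_set_item_pos(struct toy_coord *c, unsigned short pos)
    {
        c->item_pos = pos;
        c->iplugid = TOY_INVALID_PLUG;      /* stale cache must be dropped */
    }

    static unsigned char toy_plug(struct toy_coord *c)
    {
        if (c->iplugid == TOY_INVALID_PLUG) {
            c->iplugid = (unsigned char)(c->item_pos % 3); /* "slow" lookup */
            printf("cache miss at item %u\n", c->item_pos);
        }
        return c->iplugid;                  /* cheap on repeated calls */
    }

    int main(void)
    {
        struct toy_coord c = { 0, TOY_INVALID_PLUG };

        toy_set_item_pos(&c, 2);
        toy_plug(&c);                       /* miss: does the lookup  */
        toy_plug(&c);                       /* hit: uses cached value */
        return 0;
    }

Funneling the writes through one setter is what turned item_plugin_by_coord() from a major CPU consumer into a cheap field read, per the comment above.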
*/ ++extern void coord_init_first_unit(coord_t *coord, const znode * node); ++ ++/* Initialize a coordinate to point at the last unit of the last item. If the ++ node is empty, it is positioned at the EMPTY_NODE. */ ++extern void coord_init_last_unit(coord_t *coord, const znode * node); ++ ++/* Initialize a coordinate to before the first item. If the node is empty, it is ++ positioned at the EMPTY_NODE. */ ++extern void coord_init_before_first_item(coord_t *coord, const znode * node); ++ ++/* Initialize a coordinate to after the last item. If the node is empty, it is ++ positioned at the EMPTY_NODE. */ ++extern void coord_init_after_last_item(coord_t *coord, const znode * node); ++ ++/* Initialize a coordinate to after last unit in the item. Coord must be set ++ already to existing item */ ++void coord_init_after_item_end(coord_t *coord); ++ ++/* Initialize a coordinate to before the item. Coord must be set already to ++ existing item */ ++void coord_init_before_item(coord_t *); ++/* Initialize a coordinate to after the item. Coord must be set already to ++ existing item */ ++void coord_init_after_item(coord_t *); ++ ++/* Calls either coord_init_first_unit or coord_init_last_unit depending on ++ sideof argument. */ ++extern void coord_init_sideof_unit(coord_t *coord, const znode * node, ++ sideof dir); ++ ++/* Initialize a coordinate by 0s. Used in places where init_coord was used and ++ it was not clear how actually ++ FIXME-VS: added by vs (2002, june, 8) */ ++extern void coord_init_zero(coord_t *coord); ++ ++/* COORD METHODS */ ++ ++/* after shifting of node content, coord previously set properly may become ++ invalid, try to "normalize" it. */ ++void coord_normalize(coord_t *coord); ++ ++/* Copy a coordinate. */ ++extern void coord_dup(coord_t *coord, const coord_t *old_coord); ++ ++/* Copy a coordinate without check. */ ++void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord); ++ ++unsigned coord_num_units(const coord_t *coord); ++ ++/* Return the last valid unit number at the present item (i.e., ++ coord_num_units() - 1). */ ++static inline unsigned coord_last_unit_pos(const coord_t *coord) ++{ ++ return coord_num_units(coord) - 1; ++} ++ ++#if REISER4_DEBUG ++/* For assertions only, checks for a valid coordinate. */ ++extern int coord_check(const coord_t *coord); ++ ++extern unsigned long znode_times_locked(const znode * z); ++ ++static inline void coord_update_v(coord_t *coord) ++{ ++ coord->plug_v = coord->body_v = znode_times_locked(coord->node); ++} ++#endif ++ ++extern int coords_equal(const coord_t *c1, const coord_t *c2); ++ ++extern void print_coord(const char *mes, const coord_t *coord, int print_node); ++ ++/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if ++ coord_is_after_leftmost return NCOORD_ON_THE_LEFT, otherwise return ++ NCOORD_INSIDE. */ ++extern coord_wrt_node coord_wrt(const coord_t *coord); ++ ++/* Returns true if the coordinates are positioned at adjacent units, regardless ++ of before-after or item boundaries. */ ++extern int coord_are_neighbors(coord_t *c1, coord_t *c2); ++ ++/* Assuming two coordinates are positioned in the same node, return ++ NCOORD_CMP_ON_RIGHT, NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's ++ position relative to c2. */ ++extern coord_cmp coord_compare(coord_t *c1, coord_t *c2); ++ ++/* COORD PREDICATES */ ++ ++/* Returns true if the coord was initializewd by coord_init_invalid (). 
*/ ++extern int coord_is_invalid(const coord_t *coord); ++ ++/* Returns true if the coordinate is positioned at an existing item, not before ++ or after an item. It may be placed at, before, or after any unit within the ++ item, whether existing or not. If this is true you can call methods of the ++ item plugin. */ ++extern int coord_is_existing_item(const coord_t *coord); ++ ++/* Returns true if the coordinate is positioned after a item, before a item, ++ after the last unit of an item, before the first unit of an item, or at an ++ empty node. */ ++extern int coord_is_between_items(const coord_t *coord); ++ ++/* Returns true if the coordinate is positioned at an existing unit, not before ++ or after a unit. */ ++extern int coord_is_existing_unit(const coord_t *coord); ++ ++/* Returns true if the coordinate is positioned at an empty node. */ ++extern int coord_is_empty(const coord_t *coord); ++ ++/* Returns true if the coordinate is positioned at the first unit of the first ++ item. Not true for empty nodes nor coordinates positioned before the first ++ item. */ ++extern int coord_is_leftmost_unit(const coord_t *coord); ++ ++/* Returns true if the coordinate is positioned after the last item or after the ++ last unit of the last item or it is an empty node. */ ++extern int coord_is_after_rightmost(const coord_t *coord); ++ ++/* Returns true if the coordinate is positioned before the first item or it is ++ an empty node. */ ++extern int coord_is_before_leftmost(const coord_t *coord); ++ ++/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending ++ on sideof argument. */ ++extern int coord_is_after_sideof_unit(coord_t *coord, sideof dir); ++ ++/* COORD MODIFIERS */ ++ ++/* Advances the coordinate by one unit to the right. If empty, no change. If ++ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new ++ position is an existing unit. */ ++extern int coord_next_unit(coord_t *coord); ++ ++/* Advances the coordinate by one item to the right. If empty, no change. If ++ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new ++ position is an existing item. */ ++extern int coord_next_item(coord_t *coord); ++ ++/* Advances the coordinate by one unit to the left. If empty, no change. If ++ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new ++ position is an existing unit. */ ++extern int coord_prev_unit(coord_t *coord); ++ ++/* Advances the coordinate by one item to the left. If empty, no change. If ++ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new ++ position is an existing item. */ ++extern int coord_prev_item(coord_t *coord); ++ ++/* If the coordinate is between items, shifts it to the right. Returns 0 on ++ success and non-zero if there is no position to the right. */ ++extern int coord_set_to_right(coord_t *coord); ++ ++/* If the coordinate is between items, shifts it to the left. Returns 0 on ++ success and non-zero if there is no position to the left. */ ++extern int coord_set_to_left(coord_t *coord); ++ ++/* If the coordinate is at an existing unit, set to after that unit. Returns 0 ++ on success and non-zero if the unit did not exist. */ ++extern int coord_set_after_unit(coord_t *coord); ++ ++/* Calls either coord_next_unit or coord_prev_unit depending on sideof ++ argument. 
*/ ++extern int coord_sideof_unit(coord_t *coord, sideof dir); ++ ++/* iterate over all units in @node */ ++#define for_all_units(coord, node) \ ++ for (coord_init_before_first_item((coord), (node)) ; \ ++ coord_next_unit(coord) == 0 ;) ++ ++/* iterate over all items in @node */ ++#define for_all_items(coord, node) \ ++ for (coord_init_before_first_item((coord), (node)) ; \ ++ coord_next_item(coord) == 0 ;) ++ ++/* COORD/ITEM METHODS */ ++ ++extern int item_utmost_child_real_block(const coord_t *coord, sideof side, ++ reiser4_block_nr * blk); ++extern int item_utmost_child(const coord_t *coord, sideof side, ++ jnode ** child); ++ ++/* a flow is a sequence of bytes being written to or read from the tree. The ++ tree will slice the flow into items while storing it into nodes, but all of ++ that is hidden from anything outside the tree. */ ++ ++struct flow { ++ reiser4_key key; /* key of start of flow's sequence of bytes */ ++ loff_t length; /* length of flow's sequence of bytes */ ++ char *data; /* start of flow's sequence of bytes */ ++ int user; /* if 1 data is user space, 0 - kernel space */ ++ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */ ++}; ++ ++void move_flow_forward(flow_t *f, unsigned count); ++ ++/* &reiser4_item_data - description of data to be inserted or pasted ++ ++ Q: articulate the reasons for the difference between this and flow. ++ ++ A: Becides flow we insert into tree other things: stat data, directory ++ entry, etc. To insert them into tree one has to provide this structure. If ++ one is going to insert flow - he can use insert_flow, where this structure ++ does not have to be created ++*/ ++struct reiser4_item_data { ++ /* actual data to be inserted. If NULL, ->create_item() will not ++ do xmemcpy itself, leaving this up to the caller. This can ++ save some amount of unnecessary memory copying, for example, ++ during insertion of stat data. ++ ++ */ ++ char *data; ++ /* 1 if 'char * data' contains pointer to user space and 0 if it is ++ kernel space */ ++ int user; ++ /* amount of data we are going to insert or paste */ ++ int length; ++ /* "Arg" is opaque data that is passed down to the ++ ->create_item() method of node layout, which in turn ++ hands it to the ->create_hook() of item being created. This ++ arg is currently used by: ++ ++ . ->create_hook() of internal item ++ (fs/reiser4/plugin/item/internal.c:internal_create_hook()), ++ . ->paste() method of directory item. ++ . ->create_hook() of extent item ++ ++ For internal item, this is left "brother" of new node being ++ inserted and it is used to add new node into sibling list ++ after parent to it was just inserted into parent. ++ ++ While ->arg does look somewhat of unnecessary compication, ++ it actually saves a lot of headache in many places, because ++ all data necessary to insert or paste new data into tree are ++ collected in one place, and this eliminates a lot of extra ++ argument passing and storing everywhere. ++ ++ */ ++ void *arg; ++ /* plugin of item we are inserting */ ++ item_plugin *iplug; ++}; ++ ++/* __REISER4_COORD_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/debug.c linux-2.6.33/fs/reiser4/debug.c +--- linux-2.6.33.orig/fs/reiser4/debug.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/debug.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,308 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Debugging facilities. */ ++ ++/* ++ * This file contains generic debugging functions used by reiser4. Roughly ++ * following: ++ * ++ * panicking: reiser4_do_panic(), reiser4_print_prefix(). ++ * ++ * locking: ++ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(), ++ * reiser4_no_counters_are_held(), reiser4_commit_check_locks() ++ * ++ * error code monitoring (see comment before RETERR macro): ++ * reiser4_return_err(), reiser4_report_err(). ++ * ++ * stack back-tracing: fill_backtrace() ++ * ++ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(), ++ * reiser4_debugtrap(). ++ * ++ */ ++ ++#include "reiser4.h" ++#include "context.h" ++#include "super.h" ++#include "txnmgr.h" ++#include "znode.h" ++ ++#include <linux/sysfs.h> ++#include <linux/slab.h> ++#include <linux/types.h> ++#include <linux/fs.h> ++#include <linux/spinlock.h> ++#include <linux/kallsyms.h> ++#include <linux/vmalloc.h> ++#include <linux/ctype.h> ++#include <linux/sysctl.h> ++#include <linux/hardirq.h> ++ ++#if 0 ++#if REISER4_DEBUG ++static void reiser4_report_err(void); ++#else ++#define reiser4_report_err() noop ++#endif ++#endif /* 0 */ ++ ++/* ++ * global buffer where message given to reiser4_panic is formatted. ++ */ ++static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE]; ++ ++/* ++ * lock protecting consistency of panic_buf under concurrent panics ++ */ ++static DEFINE_SPINLOCK(panic_guard); ++ ++/* Your best friend. Call it on each occasion. This is called by ++ fs/reiser4/debug.h:reiser4_panic(). */ ++void reiser4_do_panic(const char *format/* format string */ , ... /* rest */) ++{ ++ static int in_panic = 0; ++ va_list args; ++ ++ /* ++ * check for recursive panic. ++ */ ++ if (in_panic == 0) { ++ in_panic = 1; ++ ++ spin_lock(&panic_guard); ++ va_start(args, format); ++ vsnprintf(panic_buf, sizeof(panic_buf), format, args); ++ va_end(args); ++ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf); ++ spin_unlock(&panic_guard); ++ ++ /* ++ * if kernel debugger is configured---drop in. Early dropping ++ * into kgdb is not always convenient, because panic message ++ * is not yet printed most of the times. But: ++ * ++ * (1) message can be extracted from printk_buf[] ++ * (declared static inside of printk()), and ++ * ++ * (2) sometimes serial/kgdb combo dies while printing ++ * long panic message, so it's more prudent to break into ++ * debugger earlier. 
++ * ++ */ ++ DEBUGON(1); ++ } ++ /* to make gcc happy about noreturn attribute */ ++ panic("%s", panic_buf); ++} ++ ++#if 0 ++void ++reiser4_print_prefix(const char *level, int reperr, const char *mid, ++ const char *function, const char *file, int lineno) ++{ ++ const char *comm; ++ int pid; ++ ++ if (unlikely(in_interrupt() || in_irq())) { ++ comm = "interrupt"; ++ pid = 0; ++ } else { ++ comm = current->comm; ++ pid = current->pid; ++ } ++ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n", ++ level, comm, pid, function, file, lineno, mid); ++ if (reperr) ++ reiser4_report_err(); ++} ++#endif /* 0 */ ++ ++/* Preemption point: this should be called periodically during long running ++ operations (carry, allocate, and squeeze are best examples) */ ++int reiser4_preempt_point(void) ++{ ++ assert("nikita-3008", reiser4_schedulable()); ++ cond_resched(); ++ return signal_pending(current); ++} ++ ++#if REISER4_DEBUG ++/* Debugging aid: return struct where information about locks taken by current ++ thread is accumulated. This can be used to formulate lock ordering ++ constraints and various assertions. ++ ++*/ ++reiser4_lock_cnt_info *reiser4_lock_counters(void) ++{ ++ reiser4_context *ctx = get_current_context(); ++ assert("jmacd-1123", ctx != NULL); ++ return &ctx->locks; ++} ++ ++/* ++ * print human readable information about locks held by the reiser4 context. ++ */ ++static void print_lock_counters(const char *prefix, ++ const reiser4_lock_cnt_info * info) ++{ ++ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n" ++ "jload: %i, " ++ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, " ++ "ktxnmgrd: %i, fq: %i\n" ++ "inode: %i, " ++ "cbk_cache: %i (r:%i,w%i), " ++ "eflush: %i, " ++ "zlock: %i,\n" ++ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n" ++ "d: %i, x: %i, t: %i\n", prefix, ++ info->spin_locked_jnode, ++ info->rw_locked_tree, info->read_locked_tree, ++ info->write_locked_tree, ++ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk, ++ info->spin_locked_jload, ++ info->spin_locked_txnh, ++ info->spin_locked_atom, info->spin_locked_stack, ++ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd, ++ info->spin_locked_fq, ++ info->spin_locked_inode, ++ info->rw_locked_cbk_cache, ++ info->read_locked_cbk_cache, ++ info->write_locked_cbk_cache, ++ info->spin_locked_super_eflush, ++ info->spin_locked_zlock, ++ info->spin_locked, ++ info->long_term_locked_znode, ++ info->inode_sem_r, info->inode_sem_w, ++ info->d_refs, info->x_refs, info->t_refs); ++} ++ ++/* check that no spinlocks are held */ ++int reiser4_schedulable(void) ++{ ++ if (get_current_context_check() != NULL) { ++ if (!LOCK_CNT_NIL(spin_locked)) { ++ print_lock_counters("in atomic", reiser4_lock_counters()); ++ return 0; ++ } ++ } ++ might_sleep(); ++ return 1; ++} ++/* ++ * return true, iff no locks are held. 
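reiser4_do_panic() above guards against recursive panics with a static in_panic flag, so a second failure triggered while formatting the first message cannot deadlock on panic_guard or scribble over panic_buf. A stripped-down userspace sketch of the same pattern; the names are invented and abort() stands in for the kernel's panic():

    /* Userspace sketch of the recursive-panic guard: only the first
     * failure formats a message; a nested failure skips straight to
     * termination instead of re-entering the formatting path. */
    #include <stdio.h>
    #include <stdarg.h>
    #include <stdlib.h>

    static void toy_panic(const char *fmt, ...)
    {
        static int in_panic;
        va_list ap;

        if (!in_panic) {            /* first failure only */
            in_panic = 1;
            va_start(ap, fmt);
            vfprintf(stderr, fmt, ap);
            va_end(ap);
        }
        abort();                    /* never returns, like the kernel panic() */
    }

    int main(void)
    {
        toy_panic("toy panic: %s\n", "something impossible happened");
        return 0;                   /* unreachable */
    }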
++ */ ++int reiser4_no_counters_are_held(void) ++{ ++ reiser4_lock_cnt_info *counters; ++ ++ counters = reiser4_lock_counters(); ++ return ++ (counters->spin_locked_zlock == 0) && ++ (counters->spin_locked_jnode == 0) && ++ (counters->rw_locked_tree == 0) && ++ (counters->read_locked_tree == 0) && ++ (counters->write_locked_tree == 0) && ++ (counters->rw_locked_dk == 0) && ++ (counters->read_locked_dk == 0) && ++ (counters->write_locked_dk == 0) && ++ (counters->spin_locked_txnh == 0) && ++ (counters->spin_locked_atom == 0) && ++ (counters->spin_locked_stack == 0) && ++ (counters->spin_locked_txnmgr == 0) && ++ (counters->spin_locked_inode == 0) && ++ (counters->spin_locked == 0) && ++ (counters->long_term_locked_znode == 0) && ++ (counters->inode_sem_r == 0) && ++ (counters->inode_sem_w == 0) && (counters->d_refs == 0); ++} ++ ++/* ++ * return true, iff transaction commit can be done under locks held by the ++ * current thread. ++ */ ++int reiser4_commit_check_locks(void) ++{ ++ reiser4_lock_cnt_info *counters; ++ int inode_sem_r; ++ int inode_sem_w; ++ int result; ++ ++ /* ++ * inode's read/write semaphore is the only reiser4 lock that can be ++ * held during commit. ++ */ ++ ++ counters = reiser4_lock_counters(); ++ inode_sem_r = counters->inode_sem_r; ++ inode_sem_w = counters->inode_sem_w; ++ ++ counters->inode_sem_r = counters->inode_sem_w = 0; ++ result = reiser4_no_counters_are_held(); ++ counters->inode_sem_r = inode_sem_r; ++ counters->inode_sem_w = inode_sem_w; ++ return result; ++} ++ ++/* ++ * fill "error site" in the current reiser4 context. See comment before RETERR ++ * macro for more details. ++ */ ++void reiser4_return_err(int code, const char *file, int line) ++{ ++ if (code < 0 && is_in_reiser4_context()) { ++ reiser4_context *ctx = get_current_context(); ++ ++ if (ctx != NULL) { ++ ctx->err.code = code; ++ ctx->err.file = file; ++ ctx->err.line = line; ++ } ++ } ++} ++ ++#if 0 ++/* ++ * report error information recorder by reiser4_return_err(). ++ */ ++static void reiser4_report_err(void) ++{ ++ reiser4_context *ctx = get_current_context_check(); ++ ++ if (ctx != NULL) { ++ if (ctx->err.code != 0) { ++ printk("code: %i at %s:%i\n", ++ ctx->err.code, ctx->err.file, ctx->err.line); ++ } ++ } ++} ++#endif /* 0 */ ++ ++#endif /* REISER4_DEBUG */ ++ ++#if KERNEL_DEBUGGER ++ ++/* ++ * this functions just drops into kernel debugger. It is a convenient place to ++ * put breakpoint in. ++ */ ++void reiser4_debugtrap(void) ++{ ++ /* do nothing. Put break point here. */ ++#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE) ++ extern void kgdb_breakpoint(void); ++ kgdb_breakpoint(); ++#endif ++} ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/debug.h linux-2.6.33/fs/reiser4/debug.h +--- linux-2.6.33.orig/fs/reiser4/debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/debug.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,351 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* Declarations of debug macros. */ ++ ++#if !defined(__FS_REISER4_DEBUG_H__) ++#define __FS_REISER4_DEBUG_H__ ++ ++#include "forward.h" ++#include "reiser4.h" ++ ++/* generic function to produce formatted output, decorating it with ++ whatever standard prefixes/postfixes we want. "Fun" is a function ++ that will be actually called, can be printk, panic etc. 
++ This is for use by other debugging macros, not by users. */ ++#define DCALL(lev, fun, reperr, label, format, ...) \ ++({ \ ++ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \ ++ current->comm, current->pid, __FUNCTION__, \ ++ __FILE__, __LINE__, label, ## __VA_ARGS__); \ ++}) ++ ++/* ++ * cause kernel to crash ++ */ ++#define reiser4_panic(mid, format, ...) \ ++ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__) ++ ++/* print message with indication of current process, file, line and ++ function */ ++#define reiser4_log(label, format, ...) \ ++ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__) ++ ++/* Assertion checked during compilation. ++ If "cond" is false (0) we get duplicate case label in switch. ++ Use this to check something like famous ++ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ; ++ in 3.x journal.c. If cassertion fails you get compiler error, ++ so no "maintainer-id". ++*/ ++#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } }) ++ ++#define noop do {; } while (0) ++ ++#if REISER4_DEBUG ++/* version of info that only actually prints anything when _d_ebugging ++ is on */ ++#define dinfo(format, ...) printk(format , ## __VA_ARGS__) ++/* macro to catch logical errors. Put it into `default' clause of ++ switch() statement. */ ++#define impossible(label, format, ...) \ ++ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__) ++/* assert assures that @cond is true. If it is not, reiser4_panic() is ++ called. Use this for checking logical consistency and _never_ call ++ this to check correctness of external data: disk blocks and user-input . */ ++#define assert(label, cond) \ ++({ \ ++ /* call_on_each_assert(); */ \ ++ if (cond) { \ ++ /* put negated check to avoid using !(cond) that would lose \ ++ * warnings for things like assert(a = b); */ \ ++ ; \ ++ } else { \ ++ DEBUGON(1); \ ++ reiser4_panic(label, "assertion failed: %s", #cond); \ ++ } \ ++}) ++ ++/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */ ++#define check_me(label, expr) assert(label, (expr)) ++ ++#define ON_DEBUG(exp) exp ++ ++extern int reiser4_schedulable(void); ++extern void call_on_each_assert(void); ++ ++#else ++ ++#define dinfo(format, args...) noop ++#define impossible(label, format, args...) noop ++#define assert(label, cond) noop ++#define check_me(label, expr) ((void) (expr)) ++#define ON_DEBUG(exp) ++#define reiser4_schedulable() might_sleep() ++ ++/* REISER4_DEBUG */ ++#endif ++ ++#if REISER4_DEBUG ++/* per-thread information about lock acquired by this thread. Used by lock ++ * ordering checking in spin_macros.h */ ++typedef struct reiser4_lock_cnt_info { ++ int rw_locked_tree; ++ int read_locked_tree; ++ int write_locked_tree; ++ ++ int rw_locked_dk; ++ int read_locked_dk; ++ int write_locked_dk; ++ ++ int rw_locked_cbk_cache; ++ int read_locked_cbk_cache; ++ int write_locked_cbk_cache; ++ ++ int spin_locked_zlock; ++ int spin_locked_jnode; ++ int spin_locked_jload; ++ int spin_locked_txnh; ++ int spin_locked_atom; ++ int spin_locked_stack; ++ int spin_locked_txnmgr; ++ int spin_locked_ktxnmgrd; ++ int spin_locked_fq; ++ int spin_locked_inode; ++ int spin_locked_super_eflush; ++ int spin_locked; ++ int long_term_locked_znode; ++ ++ int inode_sem_r; ++ int inode_sem_w; ++ ++ int d_refs; ++ int x_refs; ++ int t_refs; ++} reiser4_lock_cnt_info; ++ ++extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void); ++#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? 
(a) : (b)) ++ ++/* increment lock-counter @counter, if present */ ++#define LOCK_CNT_INC(counter) \ ++ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0) ++ ++/* decrement lock-counter @counter, if present */ ++#define LOCK_CNT_DEC(counter) \ ++ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0) ++ ++/* check that lock-counter is zero. This is for use in assertions */ ++#define LOCK_CNT_NIL(counter) \ ++ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1) ++ ++/* check that lock-counter is greater than zero. This is for use in ++ * assertions */ ++#define LOCK_CNT_GTZ(counter) \ ++ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1) ++#define LOCK_CNT_LT(counter,n) \ ++ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1) ++ ++#else /* REISER4_DEBUG */ ++ ++/* no-op versions on the above */ ++ ++typedef struct reiser4_lock_cnt_info { ++} reiser4_lock_cnt_info; ++ ++#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL) ++#define LOCK_CNT_INC(counter) noop ++#define LOCK_CNT_DEC(counter) noop ++#define LOCK_CNT_NIL(counter) (1) ++#define LOCK_CNT_GTZ(counter) (1) ++#define LOCK_CNT_LT(counter, n) (1) ++ ++#endif /* REISER4_DEBUG */ ++ ++#define assert_spin_not_locked(lock) BUG_ON(0) ++#define assert_rw_write_locked(lock) BUG_ON(0) ++#define assert_rw_read_locked(lock) BUG_ON(0) ++#define assert_rw_locked(lock) BUG_ON(0) ++#define assert_rw_not_write_locked(lock) BUG_ON(0) ++#define assert_rw_not_read_locked(lock) BUG_ON(0) ++#define assert_rw_not_locked(lock) BUG_ON(0) ++ ++/* flags controlling debugging behavior. Are set through debug_flags=N mount ++ option. */ ++typedef enum { ++ /* print a lot of information during panic. When this is on all jnodes ++ * are listed. This can be *very* large output. Usually you don't want ++ * this. Especially over serial line. */ ++ REISER4_VERBOSE_PANIC = 0x00000001, ++ /* print a lot of information during umount */ ++ REISER4_VERBOSE_UMOUNT = 0x00000002, ++ /* print gathered statistics on umount */ ++ REISER4_STATS_ON_UMOUNT = 0x00000004, ++ /* check node consistency */ ++ REISER4_CHECK_NODE = 0x00000008 ++} reiser4_debug_flags; ++ ++extern int is_in_reiser4_context(void); ++ ++/* ++ * evaluate expression @e only if with reiser4 context ++ */ ++#define ON_CONTEXT(e) do { \ ++ if (is_in_reiser4_context()) { \ ++ e; \ ++ } } while (0) ++ ++/* ++ * evaluate expression @e only when within reiser4_context and debugging is ++ * on. ++ */ ++#define ON_DEBUG_CONTEXT(e) ON_DEBUG(ON_CONTEXT(e)) ++ ++/* ++ * complain about unexpected function result and crash. Used in "default" ++ * branches of switch statements and alike to assert that invalid results are ++ * not silently ignored. ++ */ ++#define wrong_return_value(label, function) \ ++ impossible(label, "wrong return value from " function) ++ ++/* Issue different types of reiser4 messages to the console */ ++#define warning(label, format, ...) \ ++ DCALL(KERN_WARNING, \ ++ printk, 1, label, "WARNING: " format , ## __VA_ARGS__) ++#define notice(label, format, ...) \ ++ DCALL(KERN_NOTICE, \ ++ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__) ++ ++/* mark not yet implemented functionality */ ++#define not_yet(label, format, ...) \ ++ reiser4_panic(label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__) ++ ++extern void reiser4_do_panic(const char *format, ...) 
++ __attribute__ ((noreturn, format(printf, 1, 2))); ++ ++extern int reiser4_preempt_point(void); ++extern void reiser4_print_stats(void); ++ ++#if REISER4_DEBUG ++extern int reiser4_no_counters_are_held(void); ++extern int reiser4_commit_check_locks(void); ++#else ++#define reiser4_no_counters_are_held() (1) ++#define reiser4_commit_check_locks() (1) ++#endif ++ ++/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */ ++#define IS_POW(i) \ ++({ \ ++ typeof(i) __i; \ ++ \ ++ __i = (i); \ ++ !(__i & (__i - 1)); \ ++}) ++ ++#define KERNEL_DEBUGGER (1) ++ ++#if KERNEL_DEBUGGER ++ ++extern void reiser4_debugtrap(void); ++ ++/* ++ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If ++ * kgdb is not compiled in, do nothing. ++ */ ++#define DEBUGON(cond) \ ++({ \ ++ if (unlikely(cond)) \ ++ reiser4_debugtrap(); \ ++}) ++#else ++#define DEBUGON(cond) noop ++#endif ++ ++/* ++ * Error code tracing facility. (Idea is borrowed from XFS code.) ++ * ++ * Suppose some strange and/or unexpected code is returned from some function ++ * (for example, write(2) returns -EEXIST). It is possible to place a ++ * breakpoint in the reiser4_write(), but it is too late here. How to find out ++ * in what particular place -EEXIST was generated first? ++ * ++ * In reiser4 all places where actual error codes are produced (that is, ++ * statements of the form ++ * ++ * return -EFOO; // (1), or ++ * ++ * result = -EFOO; // (2) ++ * ++ * are replaced with ++ * ++ * return RETERR(-EFOO); // (1a), and ++ * ++ * result = RETERR(-EFOO); // (2a) respectively ++ * ++ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is ++ * printed in error and warning messages. Moreover, it's possible to put a ++ * conditional breakpoint in reiser4_return_err (low-level function called ++ * by RETERR() to do the actual work) to break into debugger immediately ++ * when particular error happens. ++ * ++ */ ++ ++#if REISER4_DEBUG ++ ++/* ++ * data-type to store information about where error happened ("error site"). ++ */ ++typedef struct err_site { ++ int code; /* error code */ ++ const char *file; /* source file, filled by __FILE__ */ ++ int line; /* source file line, filled by __LINE__ */ ++} err_site; ++ ++extern void reiser4_return_err(int code, const char *file, int line); ++ ++/* ++ * fill &get_current_context()->err_site with error information. ++ */ ++#define RETERR(code) \ ++({ \ ++ typeof(code) __code; \ ++ \ ++ __code = (code); \ ++ reiser4_return_err(__code, __FILE__, __LINE__); \ ++ __code; \ ++}) ++ ++#else ++ ++/* ++ * no-op versions of the above ++ */ ++ ++typedef struct err_site { ++} err_site; ++#define RETERR(code) code ++#endif ++ ++#if REISER4_LARGE_KEY ++/* ++ * conditionally compile arguments only if REISER4_LARGE_KEY is on. ++ */ ++#define ON_LARGE_KEY(...) __VA_ARGS__ ++#else ++#define ON_LARGE_KEY(...) ++#endif ++ ++/* __FS_REISER4_DEBUG_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/dformat.h linux-2.6.33/fs/reiser4/dformat.h +--- linux-2.6.33.orig/fs/reiser4/dformat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/dformat.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,71 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* Formats of on-disk data and conversion functions. 
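The RETERR() scheme explained above is easy to model outside the kernel: every site that generates an error code records __FILE__/__LINE__ as a side effect, so a later report can name the origin of, say, -EEXIST instead of the function that merely propagated it. A hedged userspace sketch with invented names; note that unlike the real macro, which evaluates its argument exactly once via a temporary, this simplified one evaluates it twice:

    /* Sketch of the error-site idea: record where an error code was
     * first generated, not where it was observed. */
    #include <stdio.h>

    struct err_site { int code; const char *file; int line; };
    static struct err_site last_err;

    #define TOY_RETERR(code) \
        (last_err = (struct err_site){ (code), __FILE__, __LINE__ }, (code))

    static int open_thing(int exists)
    {
        if (exists)
            return TOY_RETERR(-17);     /* -EEXIST generated *here* */
        return 0;
    }

    int main(void)
    {
        if (open_thing(1) < 0)
            printf("error %d first raised at %s:%d\n",
                   last_err.code, last_err.file, last_err.line);
        return 0;
    }

A conditional breakpoint on the recording function (reiser4_return_err() in the real code) then breaks exactly when a particular error code is born, which is the debugging win the comment block describes.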
*/ ++ ++/* put all item formats in the files describing the particular items, ++ our model is, everything you need to do to add an item to reiser4, ++ (excepting the changes to the plugin that uses the item which go ++ into the file defining that plugin), you put into one file. */ ++/* Data on disk are stored in little-endian format. ++ To declare fields of on-disk structures, use d8, d16, d32 and d64. ++ d??tocpu() and cputod??() to convert. */ ++ ++#if !defined(__FS_REISER4_DFORMAT_H__) ++#define __FS_REISER4_DFORMAT_H__ ++ ++#include <asm/byteorder.h> ++#include <asm/unaligned.h> ++#include <linux/types.h> ++ ++typedef __u8 d8; ++typedef __le16 d16; ++typedef __le32 d32; ++typedef __le64 d64; ++ ++#define PACKED __attribute__((packed)) ++ ++/* data-type for block number */ ++typedef __u64 reiser4_block_nr; ++ ++/* data-type for block number on disk, disk format */ ++typedef __le64 reiser4_dblock_nr; ++ ++/** ++ * disk_addr_eq - compare disk addresses ++ * @b1: pointer to block number ot compare ++ * @b2: pointer to block number ot compare ++ * ++ * Returns true if if disk addresses are the same ++ */ ++static inline int disk_addr_eq(const reiser4_block_nr * b1, ++ const reiser4_block_nr * b2) ++{ ++ assert("nikita-1033", b1 != NULL); ++ assert("nikita-1266", b2 != NULL); ++ ++ return !memcmp(b1, b2, sizeof *b1); ++} ++ ++/* structure of master reiser4 super block */ ++typedef struct reiser4_master_sb { ++ char magic[16]; /* "ReIsEr4" */ ++ __le16 disk_plugin_id; /* id of disk layout plugin */ ++ __le16 blocksize; ++ char uuid[16]; /* unique id */ ++ char label[16]; /* filesystem label */ ++ __le64 diskmap; /* location of the diskmap. 0 if not present */ ++} reiser4_master_sb; ++ ++/* __FS_REISER4_DFORMAT_H__ */ ++#endif ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/dscale.c linux-2.6.33/fs/reiser4/dscale.c +--- linux-2.6.33.orig/fs/reiser4/dscale.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/dscale.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,192 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Scalable on-disk integers */ ++ ++/* ++ * Various on-disk structures contain integer-like structures. Stat-data ++ * contain [yes, "data" is plural, check the dictionary] file size, link ++ * count; extent unit contains extent width etc. To accommodate for general ++ * case enough space is reserved to keep largest possible value. 64 bits in ++ * all cases above. But in overwhelming majority of cases numbers actually ++ * stored in these fields will be comparatively small and reserving 8 bytes is ++ * a waste of precious disk bandwidth. ++ * ++ * Scalable integers are one way to solve this problem. dscale_write() ++ * function stores __u64 value in the given area consuming from 1 to 9 bytes, ++ * depending on the magnitude of the value supplied. dscale_read() reads value ++ * previously stored by dscale_write(). ++ * ++ * dscale_write() produces format not completely unlike of UTF: two highest ++ * bits of the first byte are used to store "tag". One of 4 possible tag ++ * values is chosen depending on the number being encoded: ++ * ++ * 0 ... 0x3f => 0 [table 1] ++ * 0x40 ... 0x3fff => 1 ++ * 0x4000 ... 0x3fffffff => 2 ++ * 0x40000000 ... 0xffffffffffffffff => 3 ++ * ++ * (see dscale_range() function) ++ * ++ * Values in the range 0x40000000 ... 
0xffffffffffffffff require 8 full bytes ++ * to be stored, so in this case there is no place in the first byte to store ++ * tag. For such values tag is stored in an extra 9th byte. ++ * ++ * As _highest_ bits are used for the test (which is natural) scaled integers ++ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which ++ * uses LITTLE-ENDIAN. ++ * ++ */ ++ ++#include "debug.h" ++#include "dscale.h" ++ ++/* return tag of scaled integer stored at @address */ ++static int gettag(const unsigned char *address) ++{ ++ /* tag is stored in two highest bits */ ++ return (*address) >> 6; ++} ++ ++/* clear tag from value. Clear tag embedded into @value. */ ++static void cleartag(__u64 *value, int tag) ++{ ++ /* ++ * W-w-what ?! ++ * ++ * Actually, this is rather simple: @value passed here was read by ++ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by ++ * zeroes. Tag is still stored in the highest (arithmetically) ++ * non-zero bits of @value, but relative position of tag within __u64 ++ * depends on @tag. ++ * ++ * For example if @tag is 0, it's stored 2 highest bits of lowest ++ * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits. ++ * ++ * If tag is 1, it's stored in two highest bits of 2nd lowest byte, ++ * and it's offset if (2 * 8) - 2 == 14 bits. ++ * ++ * See table 1 above for details. ++ * ++ * All these cases are captured by the formula: ++ */ ++ *value &= ~(3 << (((1 << tag) << 3) - 2)); ++ /* ++ * That is, clear two (3 == 0t11) bits at the offset ++ * ++ * 8 * (2 ^ tag) - 2, ++ * ++ * that is, two highest bits of (2 ^ tag)-th byte of @value. ++ */ ++} ++ ++/* return tag for @value. See table 1 above for details. */ ++static int dscale_range(__u64 value) ++{ ++ if (value > 0x3fffffff) ++ return 3; ++ if (value > 0x3fff) ++ return 2; ++ if (value > 0x3f) ++ return 1; ++ return 0; ++} ++ ++/* restore value stored at @adderss by dscale_write() and return number of ++ * bytes consumed */ ++int dscale_read(unsigned char *address, __u64 *value) ++{ ++ int tag; ++ ++ /* read tag */ ++ tag = gettag(address); ++ switch (tag) { ++ case 3: ++ /* In this case tag is stored in an extra byte, skip this byte ++ * and decode value stored in the next 8 bytes.*/ ++ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1))); ++ /* worst case: 8 bytes for value itself plus one byte for ++ * tag. */ ++ return 9; ++ case 0: ++ *value = get_unaligned(address); ++ break; ++ case 1: ++ *value = __be16_to_cpu(get_unaligned((__be16 *)address)); ++ break; ++ case 2: ++ *value = __be32_to_cpu(get_unaligned((__be32 *)address)); ++ break; ++ default: ++ return RETERR(-EIO); ++ } ++ /* clear tag embedded into @value */ ++ cleartag(value, tag); ++ /* number of bytes consumed is (2 ^ tag)---see table 1. */ ++ return 1 << tag; ++} ++ ++/* number of bytes consumed */ ++int dscale_bytes_to_read(unsigned char *address) ++{ ++ int tag; ++ ++ tag = gettag(address); ++ switch (tag) { ++ case 0: ++ case 1: ++ case 2: ++ return 1 << tag; ++ case 3: ++ return 9; ++ default: ++ return RETERR(-EIO); ++ } ++} ++ ++/* store @value at @address and return number of bytes consumed */ ++int dscale_write(unsigned char *address, __u64 value) ++{ ++ int tag; ++ int shift; ++ __be64 v; ++ unsigned char *valarr; ++ ++ tag = dscale_range(value); ++ v = __cpu_to_be64(value); ++ valarr = (unsigned char *)&v; ++ shift = (tag == 3) ? 
1 : 0; ++ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag); ++ *address |= (tag << 6); ++ return shift + (1 << tag); ++} ++ ++/* number of bytes required to store @value */ ++int dscale_bytes_to_write(__u64 value) ++{ ++ int bytes; ++ ++ bytes = 1 << dscale_range(value); ++ if (bytes == 8) ++ ++bytes; ++ return bytes; ++} ++ ++/* returns true if @value and @other require the same number of bytes to be ++ * stored. Used by detect when data structure (like stat-data) has to be ++ * expanded or contracted. */ ++int dscale_fit(__u64 value, __u64 other) ++{ ++ return dscale_range(value) == dscale_range(other); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/dscale.h linux-2.6.33/fs/reiser4/dscale.h +--- linux-2.6.33.orig/fs/reiser4/dscale.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/dscale.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,28 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Scalable on-disk integers. See dscale.h for details. */ ++ ++#if !defined(__FS_REISER4_DSCALE_H__) ++#define __FS_REISER4_DSCALE_H__ ++ ++#include "dformat.h" ++ ++extern int dscale_read(unsigned char *address, __u64 *value); ++extern int dscale_write(unsigned char *address, __u64 value); ++extern int dscale_bytes_to_read(unsigned char *address); ++extern int dscale_bytes_to_write(__u64 value); ++extern int dscale_fit(__u64 value, __u64 other); ++ ++/* __FS_REISER4_DSCALE_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/entd.c linux-2.6.33/fs/reiser4/entd.c +--- linux-2.6.33.orig/fs/reiser4/entd.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/entd.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,338 @@ ++/* Copyright 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Ent daemon. */ ++ ++#include "debug.h" ++#include "txnmgr.h" ++#include "tree.h" ++#include "entd.h" ++#include "super.h" ++#include "context.h" ++#include "reiser4.h" ++#include "vfs_ops.h" ++#include "page_cache.h" ++#include "inode.h" ++ ++#include <linux/sched.h> /* struct task_struct */ ++#include <linux/suspend.h> ++#include <linux/kernel.h> ++#include <linux/writeback.h> ++#include <linux/time.h> /* INITIAL_JIFFIES */ ++#include <linux/backing-dev.h> /* bdi_write_congested */ ++#include <linux/wait.h> ++#include <linux/kthread.h> ++#include <linux/freezer.h> ++ ++#define DEF_PRIORITY 12 ++#define MAX_ENTD_ITERS 10 ++ ++static void entd_flush(struct super_block *, struct wbq *); ++static int entd(void *arg); ++ ++/* ++ * set ->comm field of end thread to make its state visible to the user level ++ */ ++#define entd_set_comm(state) \ ++ snprintf(current->comm, sizeof(current->comm), \ ++ "ent:%s%s", super->s_id, (state)) ++ ++/** ++ * reiser4_init_entd - initialize entd context and start kernel daemon ++ * @super: super block to start ent thread for ++ * ++ * Creates entd contexts, starts kernel thread and waits until it ++ * initializes. 
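Stepping back to the dscale format described above: the tag is the two high bits of the first byte, the value is stored big-endian in 2^tag bytes, and tag 3 spends a separate ninth byte on the tag so the full 64-bit value survives intact. A userspace round-trip sketch under those rules; the helper names are invented and the caller is assumed to provide a 9-byte buffer:

    /* Round-trip sketch of the dscale encoding (see table 1 above). */
    #include <stdio.h>
    #include <stdint.h>

    static int tag_for(uint64_t v)
    {
        return v > 0x3fffffffull ? 3 : v > 0x3fffull ? 2 : v > 0x3full ? 1 : 0;
    }

    static int toy_dscale_write(unsigned char *buf, uint64_t v)
    {
        int tag = tag_for(v);
        int len = 1 << tag;                 /* 1, 2, 4 or 8 value bytes */
        int shift = (tag == 3) ? 1 : 0;     /* tag 3 gets its own byte  */

        if (shift)
            buf[0] = 0;                     /* dedicated tag byte       */
        for (int i = 0; i < len; i++)       /* big-endian value bytes   */
            buf[shift + i] = (unsigned char)(v >> (8 * (len - 1 - i)));
        buf[0] |= (unsigned char)(tag << 6);
        return shift + len;
    }

    static int toy_dscale_read(const unsigned char *buf, uint64_t *v)
    {
        int tag = buf[0] >> 6;
        int len = 1 << tag;
        int shift = (tag == 3) ? 1 : 0;

        *v = 0;
        for (int i = 0; i < len; i++)
            *v = (*v << 8) | buf[shift + i];
        if (tag != 3)                       /* tag bits live inside the value */
            *v &= ~(3ull << (8 * len - 2)); /* same formula as cleartag()     */
        return shift + len;
    }

    int main(void)
    {
        unsigned char buf[9] = { 0 };
        uint64_t out, in = 0x1234;          /* needs tag 1: two bytes */

        int n = toy_dscale_write(buf, in);
        toy_dscale_read(buf, &out);
        printf("encoded in %d byte(s), decoded 0x%llx\n",
               n, (unsigned long long)out);
        return 0;
    }

Encoding 0x1234 picks tag 1 and consumes two bytes instead of eight, which is exactly the disk-bandwidth saving the comment block above is after.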
++ */ ++int reiser4_init_entd(struct super_block *super) ++{ ++ entd_context *ctx; ++ ++ assert("nikita-3104", super != NULL); ++ ++ ctx = get_entd_context(super); ++ ++ memset(ctx, 0, sizeof *ctx); ++ spin_lock_init(&ctx->guard); ++ init_waitqueue_head(&ctx->wait); ++#if REISER4_DEBUG ++ INIT_LIST_HEAD(&ctx->flushers_list); ++#endif ++ /* lists of writepage requests */ ++ INIT_LIST_HEAD(&ctx->todo_list); ++ INIT_LIST_HEAD(&ctx->done_list); ++ /* start entd */ ++ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id); ++ if (IS_ERR(ctx->tsk)) ++ return PTR_ERR(ctx->tsk); ++ return 0; ++} ++ ++static void put_wbq(struct wbq *rq) ++{ ++ iput(rq->mapping->host); ++ complete(&rq->completion); ++} ++ ++/* ent should be locked */ ++static struct wbq *__get_wbq(entd_context * ent) ++{ ++ struct wbq *wbq; ++ ++ if (list_empty(&ent->todo_list)) ++ return NULL; ++ ++ ent->nr_todo_reqs--; ++ wbq = list_entry(ent->todo_list.next, struct wbq, link); ++ list_del_init(&wbq->link); ++ return wbq; ++} ++ ++/* ent thread function */ ++static int entd(void *arg) ++{ ++ struct super_block *super; ++ entd_context *ent; ++ int done = 0; ++ ++ super = arg; ++ /* do_fork() just copies task_struct into the new ++ thread. ->fs_context shouldn't be copied of course. This shouldn't ++ be a problem for the rest of the code though. ++ */ ++ current->journal_info = NULL; ++ ++ ent = get_entd_context(super); ++ ++ while (!done) { ++ try_to_freeze(); ++ ++ spin_lock(&ent->guard); ++ while (ent->nr_todo_reqs != 0) { ++ struct wbq *rq; ++ ++ assert("", list_empty(&ent->done_list)); ++ ++ /* take request from the queue head */ ++ rq = __get_wbq(ent); ++ assert("", rq != NULL); ++ ent->cur_request = rq; ++ spin_unlock(&ent->guard); ++ ++ entd_set_comm("!"); ++ entd_flush(super, rq); ++ ++ put_wbq(rq); ++ ++ /* ++ * wakeup all requestors and iput their inodes ++ */ ++ spin_lock(&ent->guard); ++ while (!list_empty(&ent->done_list)) { ++ rq = list_entry(ent->done_list.next, struct wbq, link); ++ list_del_init(&rq->link); ++ ent->nr_done_reqs--; ++ spin_unlock(&ent->guard); ++ assert("", rq->written == 1); ++ put_wbq(rq); ++ spin_lock(&ent->guard); ++ } ++ } ++ spin_unlock(&ent->guard); ++ ++ entd_set_comm("."); ++ ++ { ++ DEFINE_WAIT(__wait); ++ ++ do { ++ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) { ++ done = 1; ++ break; ++ } ++ if (ent->nr_todo_reqs != 0) ++ break; ++ schedule(); ++ } while (0); ++ finish_wait(&ent->wait, &__wait); ++ } ++ } ++ BUG_ON(ent->nr_todo_reqs != 0); ++ return 0; ++} ++ ++/** ++ * reiser4_done_entd - stop entd kernel thread ++ * @super: super block to stop ent thread for ++ * ++ * It is called on umount. Sends stop signal to entd and wait until it handles ++ * it. 
++ */ ++void reiser4_done_entd(struct super_block *super) ++{ ++ entd_context *ent; ++ ++ assert("nikita-3103", super != NULL); ++ ++ ent = get_entd_context(super); ++ assert("zam-1055", ent->tsk != NULL); ++ kthread_stop(ent->tsk); ++} ++ ++/* called at the beginning of jnode_flush to register flusher thread with ent ++ * daemon */ ++void reiser4_enter_flush(struct super_block *super) ++{ ++ entd_context *ent; ++ ++ assert("zam-1029", super != NULL); ++ ent = get_entd_context(super); ++ ++ assert("zam-1030", ent != NULL); ++ ++ spin_lock(&ent->guard); ++ ent->flushers++; ++#if REISER4_DEBUG ++ list_add(&get_current_context()->flushers_link, &ent->flushers_list); ++#endif ++ spin_unlock(&ent->guard); ++} ++ ++/* called at the end of jnode_flush */ ++void reiser4_leave_flush(struct super_block *super) ++{ ++ entd_context *ent; ++ int wake_up_ent; ++ ++ assert("zam-1027", super != NULL); ++ ent = get_entd_context(super); ++ ++ assert("zam-1028", ent != NULL); ++ ++ spin_lock(&ent->guard); ++ ent->flushers--; ++ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0); ++#if REISER4_DEBUG ++ list_del_init(&get_current_context()->flushers_link); ++#endif ++ spin_unlock(&ent->guard); ++ if (wake_up_ent) ++ wake_up_process(ent->tsk); ++} ++ ++#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX ++ ++static void entd_flush(struct super_block *super, struct wbq *rq) ++{ ++ reiser4_context ctx; ++ int tmp; ++ ++ init_stack_context(&ctx, super); ++ ctx.entd = 1; ++ ctx.gfp_mask = GFP_NOFS; ++ ++ rq->wbc->range_start = page_offset(rq->page); ++ rq->wbc->range_end = rq->wbc->range_start + ++ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT); ++ tmp = rq->wbc->nr_to_write; ++ ++ assert("edward-1561", super == rq->wbc->sb); ++ ++ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc); ++ ++ if (rq->wbc->nr_to_write > 0) { ++ rq->wbc->range_start = 0; ++ rq->wbc->range_end = LLONG_MAX; ++ writeback_inodes_wbc(rq->wbc); ++ } ++ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST; ++ ++ reiser4_writeout(super, rq->wbc); ++ context_set_commit_async(&ctx); ++ reiser4_exit_context(&ctx); ++} ++ ++/** ++ * write_page_by_ent - ask entd thread to flush this page as part of slum ++ * @page: page to be written ++ * @wbc: writeback control passed to reiser4_writepage ++ * ++ * Creates a request, puts it on entd list of requests, wakeups entd if ++ * necessary, waits until entd completes with the request. ++ */ ++int write_page_by_ent(struct page *page, struct writeback_control *wbc) ++{ ++ struct super_block *sb; ++ struct inode *inode; ++ entd_context *ent; ++ struct wbq rq; ++ ++ assert("", PageLocked(page)); ++ assert("", page->mapping != NULL); ++ ++ sb = page->mapping->host->i_sb; ++ ent = get_entd_context(sb); ++ assert("", ent && ent->done == 0); ++ ++ /* ++ * we are going to unlock page and ask ent thread to write the ++ * page. Re-dirty page before unlocking so that if ent thread fails to ++ * write it - it will remain dirty ++ */ ++ set_page_dirty_notag(page); ++ ++ /* ++ * pin inode in memory, unlock page, entd_flush will iput. 
We can not ++ * iput here because we can not allow delete_inode to be called here ++ */ ++ inode = igrab(page->mapping->host); ++ unlock_page(page); ++ if (inode == NULL) ++ /* inode is getting freed */ ++ return 0; ++ ++ /* init wbq */ ++ INIT_LIST_HEAD(&rq.link); ++ rq.magic = WBQ_MAGIC; ++ rq.wbc = wbc; ++ rq.page = page; ++ rq.mapping = inode->i_mapping; ++ rq.node = NULL; ++ rq.written = 0; ++ init_completion(&rq.completion); ++ ++ /* add request to entd's list of writepage requests */ ++ spin_lock(&ent->guard); ++ ent->nr_todo_reqs++; ++ list_add_tail(&rq.link, &ent->todo_list); ++ if (ent->nr_todo_reqs == 1) ++ wake_up_process(ent->tsk); ++ ++ spin_unlock(&ent->guard); ++ ++ /* wait until entd finishes */ ++ wait_for_completion(&rq.completion); ++ ++ if (rq.written) ++ /* Eventually ENTD has written the page to disk. */ ++ return 0; ++ return 0; ++} ++ ++int wbq_available(void) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ entd_context *ent = get_entd_context(sb); ++ return ent->nr_todo_reqs; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/entd.h linux-2.6.33/fs/reiser4/entd.h +--- linux-2.6.33.orig/fs/reiser4/entd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/entd.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,90 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Ent daemon. */ ++ ++#ifndef __ENTD_H__ ++#define __ENTD_H__ ++ ++#include "context.h" ++ ++#include <linux/fs.h> ++#include <linux/completion.h> ++#include <linux/wait.h> ++#include <linux/spinlock.h> ++#include <linux/sched.h> /* for struct task_struct */ ++ ++#define WBQ_MAGIC 0x7876dc76 ++ ++/* write-back request. */ ++struct wbq { ++ int magic; ++ struct list_head link; /* list head of this list is in entd context */ ++ struct writeback_control *wbc; ++ struct page *page; ++ struct address_space *mapping; ++ struct completion completion; ++ jnode *node; /* set if ent thread captured requested page */ ++ int written; /* set if ent thread wrote requested page */ ++}; ++ ++/* ent-thread context. This is used to synchronize starting/stopping ent ++ * threads. */ ++typedef struct entd_context { ++ /* wait queue that ent thread waits on for more work. It's ++ * signaled by write_page_by_ent(). */ ++ wait_queue_head_t wait; ++ /* spinlock protecting other fields */ ++ spinlock_t guard; ++ /* ent thread */ ++ struct task_struct *tsk; ++ /* set to indicate that ent thread should leave. */ ++ int done; ++ /* counter of active flushers */ ++ int flushers; ++ /* ++ * when reiser4_writepage asks entd to write a page - it adds struct ++ * wbq to this list ++ */ ++ struct list_head todo_list; ++ /* number of elements on the above list */ ++ int nr_todo_reqs; ++ ++ struct wbq *cur_request; ++ /* ++ * when entd writes a page it moves write-back request from todo_list ++ * to done_list. This list is used at the end of entd iteration to ++ * wakeup requestors and iput inodes.
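The todo_list/done_list handshake above (write_page_by_ent() queues a stack-allocated struct wbq, wakes the ent thread, and blocks on the completion; the ent thread dequeues the request, does the write-back, and completes it) is a plain producer/consumer protocol. A minimal userspace sketch of the same protocol, with pthreads standing in for the kernel waitqueue/completion primitives -- all names here are illustrative, not part of the patch:

#include <pthread.h>
#include <stdio.h>

/* analog of struct wbq: one queued request plus its completion */
struct request {
	struct request *next;
	int done;			/* analog of rq.written */
	pthread_cond_t completed;	/* analog of rq.completion */
};

static struct request *todo;		/* analog of ent->todo_list */
static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t more_work = PTHREAD_COND_INITIALIZER;

/* analog of entd(): dequeue requests, do the work, complete them */
static void *worker(void *arg)
{
	pthread_mutex_lock(&guard);
	for (;;) {
		struct request *rq;

		while (todo == NULL)
			pthread_cond_wait(&more_work, &guard);
		rq = todo;
		todo = rq->next;
		pthread_mutex_unlock(&guard);
		/* ... the write-back work would happen here ... */
		pthread_mutex_lock(&guard);
		rq->done = 1;
		pthread_cond_signal(&rq->completed);
	}
	return arg;
}

/* analog of write_page_by_ent(): queue one request, wait for completion */
static void submit_and_wait(void)
{
	struct request rq = { NULL, 0, PTHREAD_COND_INITIALIZER };

	pthread_mutex_lock(&guard);
	rq.next = todo;
	todo = &rq;
	pthread_cond_signal(&more_work);	/* wake_up_process(ent->tsk) */
	while (!rq.done)			/* wait_for_completion() */
		pthread_cond_wait(&rq.completed, &guard);
	pthread_mutex_unlock(&guard);
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	submit_and_wait();
	puts("request completed");
	return 0;
}

As in the patch, the request lives on the submitter's stack; that is safe because the submitter cannot return before the worker has completed the request.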
++ */ ++ struct list_head done_list; ++ /* number of elements on the above list */ ++ int nr_done_reqs; ++ ++#if REISER4_DEBUG ++ /* list of all active flushers */ ++ struct list_head flushers_list; ++#endif ++} entd_context; ++ ++extern int reiser4_init_entd(struct super_block *); ++extern void reiser4_done_entd(struct super_block *); ++ ++extern void reiser4_enter_flush(struct super_block *); ++extern void reiser4_leave_flush(struct super_block *); ++ ++extern int write_page_by_ent(struct page *, struct writeback_control *); ++extern int wbq_available(void); ++extern void ent_writes_page(struct super_block *, struct page *); ++ ++extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *); ++/* __ENTD_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/eottl.c linux-2.6.33/fs/reiser4/eottl.c +--- linux-2.6.33.orig/fs/reiser4/eottl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/eottl.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,510 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "tree_mod.h" ++#include "carry.h" ++#include "tree.h" ++#include "super.h" ++ ++#include <linux/types.h> /* for __u?? */ ++ ++/* ++ * Extents on the twig level (EOTTL) handling. ++ * ++ * EOTTL poses some problems to the tree traversal, that are better explained ++ * by example. ++ * ++ * Suppose we have block B1 on the twig level with the following items: ++ * ++ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id, ++ * offset) ++ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each ++ * 2. internal item I2 with key (10:0:0:0) ++ * ++ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and ++ * then intra-node lookup is done. This lookup finished on the E1, because the ++ * key we are looking for is larger than the key of E1 and is smaller than key ++ * the of I2. ++ * ++ * Here search is stuck. ++ * ++ * After some thought it is clear what is wrong here: extents on the twig level ++ * break some basic property of the *search* tree (on the pretext, that they ++ * restore property of balanced tree). ++ * ++ * Said property is the following: if in the internal node of the search tree ++ * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be ++ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible ++ * through the Pointer. ++ * ++ * This is not true, when Pointer is Extent-Pointer, simply because extent ++ * cannot expand indefinitely to the right to include any item with ++ * ++ * Key1 <= Key <= Key2. ++ * ++ * For example, our E1 extent is only responsible for the data with keys ++ * ++ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and ++ * ++ * so, key range ++ * ++ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) ) ++ * ++ * is orphaned: there is no way to get there from the tree root. ++ * ++ * In other words, extent pointers are different than normal child pointers as ++ * far as search tree is concerned, and this creates such problems. ++ * ++ * Possible solution for this problem is to insert our item into node pointed ++ * to by I2. 
There are some problems, though: ++ * ++ * (1) I2 can be in a different node. ++ * (2) E1 can be immediately followed by another extent E2. ++ * ++ * (1) is solved by calling reiser4_get_right_neighbor() and accounting ++ * for locks/coords as necessary. ++ * ++ * (2) is more complex. Solution here is to insert new empty leaf node and ++ * insert internal item between E1 and E2 pointing to said leaf node. This is ++ * further complicated by possibility that E2 is in a different node, etc. ++ * ++ * Problems: ++ * ++ * (1) if there was internal item I2 immediately on the right of an extent E1 ++ * and we decided to insert new item S1 into node N2 pointed to by I2, then ++ * key of S1 will be less than smallest key in the N2. Normally, the search ++ * checks that the key we are looking for is in the range of keys covered by ++ * the node the key is being looked up in. To work around this situation, while ++ * preserving a useful consistency check, a new flag CBK_TRUST_DK was added to ++ * the cbk flags bitmask. This flag is automatically set on entrance to ++ * coord_by_key() and is only cleared when we are about to enter the situation ++ * described above. ++ * ++ * (2) If extent E1 is immediately followed by another extent E2 and we are ++ * searching for the key that is between E1 and E2 we only have to insert new ++ * empty leaf node when coord_by_key was called for insertion, rather than just ++ * for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was added ++ * to the cbk flags bitmask. This flag is automatically set by coord_by_key ++ * calls performed by insert_by_key() and friends. ++ * ++ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any ++ * case it requires modification of node content which is only possible under ++ * write lock. It may well happen that we only have read lock on the node where ++ * new internal pointer is to be inserted (common case: lookup of non-existent ++ * stat-data that falls between two extents). If only read lock is held, tree ++ * traversal is restarted with lock_level modified so that next time we hit ++ * this problem, write lock will be held. Once we have write lock, balancing ++ * will be performed. ++ */ ++ ++/** ++ * is_next_item_internal - check whether next item is internal ++ * @coord: coordinate of extent item in twig node ++ * @key: search key ++ * @lh: twig node lock handle ++ * ++ * Looks at the unit next to @coord. If it is an internal one - 1 is returned, ++ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved ++ * to that node, @coord is set to its first unit. If next item is not internal ++ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2 ++ * is returned if search restart has to be done. ++ */ ++static int ++is_next_item_internal(coord_t *coord, const reiser4_key * key, ++ lock_handle * lh) ++{ ++ coord_t next; ++ lock_handle rn; ++ int result; ++ ++ coord_dup(&next, coord); ++ if (coord_next_unit(&next) == 0) { ++ /* next unit is in this node */ ++ if (item_is_internal(&next)) { ++ coord_dup(coord, &next); ++ return 1; ++ } ++ assert("vs-3", item_is_extent(&next)); ++ return 0; ++ } ++ ++ /* ++ * next unit either does not exist or is in right neighbor.
If it is in ++ * right neighbor we have to check right delimiting key because ++ * concurrent thread could get there first and insert item with a key ++ * smaller than @key ++ */ ++ read_lock_dk(current_tree); ++ result = keycmp(key, znode_get_rd_key(coord->node)); ++ read_unlock_dk(current_tree); ++ assert("vs-6", result != EQUAL_TO); ++ if (result == GREATER_THAN) ++ return 2; ++ ++ /* lock right neighbor */ ++ init_lh(&rn); ++ result = reiser4_get_right_neighbor(&rn, coord->node, ++ znode_is_wlocked(coord->node) ? ++ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result == -E_NO_NEIGHBOR) { ++ /* we are on the rightmost edge of the tree */ ++ done_lh(&rn); ++ return 0; ++ } ++ ++ if (result) { ++ assert("vs-4", result < 0); ++ done_lh(&rn); ++ return result; ++ } ++ ++ /* ++ * check whether concurrent thread managed to insert item with a key ++ * smaller than @key ++ */ ++ read_lock_dk(current_tree); ++ result = keycmp(key, znode_get_ld_key(rn.node)); ++ read_unlock_dk(current_tree); ++ assert("vs-6", result != EQUAL_TO); ++ if (result == GREATER_THAN) { ++ done_lh(&rn); ++ return 2; ++ } ++ ++ result = zload(rn.node); ++ if (result) { ++ assert("vs-5", result < 0); ++ done_lh(&rn); ++ return result; ++ } ++ ++ coord_init_first_unit(&next, rn.node); ++ if (item_is_internal(&next)) { ++ /* ++ * next unit is in right neighbor and it is a unit of an ++ * internal item. Unlock coord->node. Move @lh to right ++ * neighbor. @coord is set to the first unit of right neighbor. ++ */ ++ coord_dup(coord, &next); ++ zrelse(rn.node); ++ done_lh(lh); ++ move_lh(lh, &rn); ++ return 1; ++ } ++ ++ /* ++ * next unit is a unit of an extent item. Return without changing @lh ++ * and @coord. ++ */ ++ assert("vs-6", item_is_extent(&next)); ++ zrelse(rn.node); ++ done_lh(&rn); ++ return 0; ++} ++ ++/** ++ * rd_key - calculate key of an item next to the given one ++ * @coord: position in a node ++ * @key: storage for result key ++ * ++ * @coord is set between items or after the last item in a node. Calculate key ++ * of item to the right of @coord. ++ */ ++static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key) ++{ ++ coord_t dup; ++ ++ assert("nikita-2281", coord_is_between_items(coord)); ++ coord_dup(&dup, coord); ++ ++ if (coord_set_to_right(&dup) == 0) ++ /* next item is in this node. Return its key. */ ++ unit_key_by_coord(&dup, key); ++ else { ++ /* ++ * next item either does not exist or is in right ++ * neighbor. Return znode's right delimiting key. ++ */ ++ read_lock_dk(current_tree); ++ *key = *znode_get_rd_key(coord->node); ++ read_unlock_dk(current_tree); ++ } ++ return key; ++} ++ ++/** ++ * add_empty_leaf - insert empty leaf between two extents ++ * @insert_coord: position in twig node between two extents ++ * @lh: twig node lock handle ++ * @key: left delimiting key of new node ++ * @rdkey: right delimiting key of new node ++ * ++ * Inserts empty leaf node between two extent items. It is necessary when we ++ * have to insert an item on leaf level between two extents (items on the twig ++ * level).
++ */ ++static int ++add_empty_leaf(coord_t *insert_coord, lock_handle *lh, ++ const reiser4_key *key, const reiser4_key *rdkey) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *todo; ++ reiser4_item_data *item; ++ carry_insert_data *cdata; ++ carry_op *op; ++ znode *node; ++ reiser4_tree *tree; ++ ++ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key)); ++ tree = znode_get_tree(insert_coord->node); ++ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL); ++ if (IS_ERR(node)) ++ return PTR_ERR(node); ++ ++ /* setup delimiting keys for node being inserted */ ++ write_lock_dk(tree); ++ znode_set_ld_key(node, key); ++ znode_set_rd_key(node, rdkey); ++ ON_DEBUG(node->creator = current); ++ ON_DEBUG(node->first_key = *key); ++ write_unlock_dk(tree); ++ ++ ZF_SET(node, JNODE_ORPHAN); ++ ++ /* ++ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and ++ * carry_insert_data ++ */ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + ++ sizeof(*item) + sizeof(*cdata)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ todo = (carry_level *) (pool + 1); ++ init_carry_level(todo, pool); ++ ++ item = (reiser4_item_data *) (todo + 3); ++ cdata = (carry_insert_data *) (item + 1); ++ ++ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0); ++ if (!IS_ERR(op)) { ++ cdata->coord = insert_coord; ++ cdata->key = key; ++ cdata->data = item; ++ op->u.insert.d = cdata; ++ op->u.insert.type = COPT_ITEM_DATA; ++ build_child_ptr_data(node, item); ++ item->arg = NULL; ++ /* have @insert_coord to be set at inserted item after ++ insertion is done */ ++ todo->track_type = CARRY_TRACK_CHANGE; ++ todo->tracked = lh; ++ ++ result = reiser4_carry(todo, NULL); ++ if (result == 0) { ++ /* ++ * pin node in memory. This is necessary for ++ * znode_make_dirty() below. ++ */ ++ result = zload(node); ++ if (result == 0) { ++ lock_handle local_lh; ++ ++ /* ++ * if we inserted new child into tree we have ++ * to mark it dirty so that flush will be able ++ * to process it. ++ */ ++ init_lh(&local_lh); ++ result = longterm_lock_znode(&local_lh, node, ++ ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_LOPRI); ++ if (result == 0) { ++ znode_make_dirty(node); ++ ++ /* ++ * when internal item pointing to @node ++ * was inserted into twig node ++ * create_hook_internal did not connect ++ * it properly because its right ++ * neighbor was not known. Do it ++ * here ++ */ ++ write_lock_tree(tree); ++ assert("nikita-3312", ++ znode_is_right_connected(node)); ++ assert("nikita-2984", ++ node->right == NULL); ++ ZF_CLR(node, JNODE_RIGHT_CONNECTED); ++ write_unlock_tree(tree); ++ result = ++ connect_znode(insert_coord, node); ++ ON_DEBUG(if (result == 0) check_dkeys(node);); ++ ++ done_lh(lh); ++ move_lh(lh, &local_lh); ++ assert("vs-1676", node_is_empty(node)); ++ coord_init_first_unit(insert_coord, ++ node); ++ } else { ++ warning("nikita-3136", ++ "Cannot lock child"); ++ } ++ done_lh(&local_lh); ++ zrelse(node); ++ } ++ } ++ } else ++ result = PTR_ERR(op); ++ zput(node); ++ done_carry_pool(pool); ++ return result; ++} ++ ++/** ++ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal ++ * @h: search handle ++ * @outcome: flag saying whether search has to restart or is done ++ * ++ * Handles search on twig level. If this function completes search itself then ++ * it returns 1. If search has to go one level down then 0 is returned. If ++ * error happens then LOOKUP_DONE is returned via @outcome and error code is ++ * saved in @h->result. 
++ */ ++int handle_eottl(cbk_handle *h, int *outcome) ++{ ++ int result; ++ reiser4_key key; ++ coord_t *coord; ++ ++ coord = h->coord; ++ ++ if (h->level != TWIG_LEVEL || ++ (coord_is_existing_item(coord) && item_is_internal(coord))) { ++ /* Continue to traverse tree downward. */ ++ return 0; ++ } ++ ++ /* ++ * make sure that @h->coord is set to twig node and that it is either ++ * set to extent item or after extent item ++ */ ++ assert("vs-356", h->level == TWIG_LEVEL); ++ assert("vs-357", ({ ++ coord_t lcoord; ++ coord_dup(&lcoord, coord); ++ check_me("vs-733", coord_set_to_left(&lcoord) == 0); ++ item_is_extent(&lcoord); ++ } ++ )); ++ ++ if (*outcome == NS_FOUND) { ++ /* we have found desired key on twig level in extent item */ ++ h->result = CBK_COORD_FOUND; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ ++ if (!(h->flags & CBK_FOR_INSERT)) { ++ /* tree traversal is not for insertion. Just return ++ CBK_COORD_NOTFOUND. */ ++ h->result = CBK_COORD_NOTFOUND; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ ++ /* take a look at the item to the right of h -> coord */ ++ result = is_next_item_internal(coord, h->key, h->active_lh); ++ if (unlikely(result < 0)) { ++ h->error = "get_right_neighbor failed"; ++ h->result = result; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ if (result == 0) { ++ /* ++ * item to the right is also an extent one. Allocate a new node ++ * and insert pointer to it after item h -> coord. ++ * ++ * This is a result of extents being located at the twig ++ * level. For explanation, see comment just above ++ * is_next_item_internal(). ++ */ ++ znode *loaded; ++ ++ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) { ++ /* ++ * we got node read locked, restart coord_by_key to ++ * have write lock on twig level ++ */ ++ h->lock_level = TWIG_LEVEL; ++ h->lock_mode = ZNODE_WRITE_LOCK; ++ *outcome = LOOKUP_REST; ++ return 1; ++ } ++ ++ loaded = coord->node; ++ result = ++ add_empty_leaf(coord, h->active_lh, h->key, ++ rd_key(coord, &key)); ++ if (result) { ++ h->error = "could not add empty leaf"; ++ h->result = result; ++ *outcome = LOOKUP_DONE; ++ return 1; ++ } ++ /* added empty leaf is locked (h->active_lh), its parent node ++ is unlocked, h->coord is set as EMPTY */ ++ assert("vs-13", coord->between == EMPTY_NODE); ++ assert("vs-14", znode_is_write_locked(coord->node)); ++ assert("vs-15", ++ WITH_DATA(coord->node, node_is_empty(coord->node))); ++ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node))); ++ assert("vs-17", coord->node == h->active_lh->node); ++ *outcome = LOOKUP_DONE; ++ h->result = CBK_COORD_NOTFOUND; ++ return 1; ++ } else if (result == 1) { ++ /* ++ * this is special case mentioned in the comment on ++ * tree.h:cbk_flags. We have found internal item immediately on ++ * the right of extent, and we are going to insert new item ++ * there. Key of item we are going to insert is smaller than ++ * leftmost key in the node pointed to by said internal item ++ * (otherwise search wouldn't come to the extent in the first ++ * place). ++ * ++ * This is a result of extents being located at the twig ++ * level. For explanation, see comment just above ++ * is_next_item_internal(). 
++ */ ++ h->flags &= ~CBK_TRUST_DK; ++ } else { ++ assert("vs-8", result == 2); ++ *outcome = LOOKUP_REST; ++ return 1; ++ } ++ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord))); ++ return 0; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/estimate.c linux-2.6.33/fs/reiser4/estimate.c +--- linux-2.6.33.orig/fs/reiser4/estimate.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/estimate.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,129 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "tree.h" ++#include "carry.h" ++#include "inode.h" ++#include "plugin/cluster.h" ++#include "plugin/item/ctail.h" ++ ++/* This returns how many nodes might get dirty and added nodes if @children ++ nodes are dirtied ++ ++ Amount of internals which will get dirty or get allocated we estimate as 10% ++ of the children + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and ++ the current block on the leaf level, 2 neighbour nodes + the current (or 1 ++ neighbour and 1 new and the current) on twig level, 2 neighbour nodes on ++ upper levels and 1 for a new root. So 5 for leaf level, 3 for twig level, ++ 2 on upper + 1 for root. ++ ++ Do not calculate the current node of the lowest level here - this is overhead ++ only. ++ ++ children is almost always 1 here. Exception is flow insertion ++*/ ++static reiser4_block_nr ++max_balance_overhead(reiser4_block_nr children, tree_level tree_height) ++{ ++ reiser4_block_nr ten_percent; ++ ++ ten_percent = ((103 * children) >> 10); ++ ++ /* If we have too many balancings at the time, tree height can rise by ++ more than 1. Assume that if tree_height is 5, it can rise by 1 only. ++ */ ++ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent)); ++} ++ ++/* this returns maximal possible number of nodes which can be modified plus ++ number of new nodes which can be required to perform insertion of one item ++ into the tree */ ++/* it is only called when tree height changes, or gets initialized */ ++reiser4_block_nr calc_estimate_one_insert(tree_level height) ++{ ++ return 1 + max_balance_overhead(1, height); ++} ++ ++reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree) ++{ ++ return tree->estimate_one_insert; ++} ++ ++/* this returns maximal possible number of nodes which can be modified plus ++ number of new nodes which can be required to perform insertion of one unit ++ into an item in the tree */ ++reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree) ++{ ++ /* estimate insert into item just like item insertion */ ++ return tree->estimate_one_insert; ++} ++ ++reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree) ++{ ++ /* on item removal reiser4 does not try to pack nodes more compactly, ++ so only one node may be dirtied on leaf level */ ++ return tree->estimate_one_insert; ++} ++ ++/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and ++ dirty 3 existing nodes (insert point and both its neighbors).
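A quick sanity check on the arithmetic above: (103 * children) >> 10 is children * 103 / 1024, i.e. roughly 10% of children rounded down, so the common single-child case contributes nothing. The following standalone mirror of the estimate (a hypothetical userspace driver, not part of the patch) can be compiled to experiment with the bound:

#include <stdio.h>

/* userspace mirror of max_balance_overhead() above */
static unsigned long max_balance_overhead(unsigned long children,
					  unsigned int tree_height)
{
	unsigned long ten_percent = (103 * children) >> 10;

	return (tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent);
}

int main(void)
{
	/* one child in a 4-level tree: ten_percent == 0, so the bound is
	 * 5 * 2 + (4 + 0) == 14, and calc_estimate_one_insert() would
	 * return 1 + 14 == 15 blocks */
	printf("%lu\n", max_balance_overhead(1, 4));
	return 0;
}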
++ Max_balance_overhead should estimate number of blocks which may change/get ++ added on internal levels */ ++reiser4_block_nr estimate_insert_flow(tree_level height) ++{ ++ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 + ++ CARRY_FLOW_NEW_NODES_LIMIT, ++ height); ++} ++ ++/* returns max number of nodes which can be occupied by a disk cluster */ ++static reiser4_block_nr estimate_cluster(struct inode *inode, int unprepped) ++{ ++ int per_cluster; ++ per_cluster = (unprepped ? 1 : cluster_nrpages(inode)); ++ return 3 + per_cluster + ++ max_balance_overhead(3 + per_cluster, ++ REISER4_MAX_ZTREE_HEIGHT); ++} ++ ++/* how many nodes might get dirty and added ++ during insertion of a disk cluster */ ++reiser4_block_nr estimate_insert_cluster(struct inode *inode) ++{ ++ return estimate_cluster(inode, 1); /* 24 */ ++} ++ ++/* how many nodes might get dirty and added ++ during update of a (prepped or unprepped) disk cluster */ ++reiser4_block_nr estimate_update_cluster(struct inode *inode) ++{ ++ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */ ++} ++ ++/* How many nodes occupied by a disk cluster might get dirty. ++ Note that this estimation is not precise (i.e. disk cluster ++ can occupy more nodes). ++ Q: Why don't we use precise estimation? ++ A: 1. Because precise estimation is fairly bad: 65536 nodes ++ for 64K logical cluster, it means 256M of dead space on ++ a partition ++ 2. It is a very rare case when disk cluster occupies more ++ nodes than this estimation returns. ++*/ ++reiser4_block_nr estimate_dirty_cluster(struct inode *inode) ++{ ++ return cluster_nrpages(inode) + 4; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/export_ops.c linux-2.6.33/fs/reiser4/export_ops.c +--- linux-2.6.33.orig/fs/reiser4/export_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/export_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,328 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "inode.h" ++#include "plugin/plugin.h" ++ ++/* ++ * Supported file-handle types ++ */ ++typedef enum { ++ FH_WITH_PARENT = 0x10, /* file handle with parent */ ++ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */ ++} reiser4_fhtype; ++ ++#define NFSERROR (255) ++ ++/* initialize place-holder for object */ ++static void object_on_wire_init(reiser4_object_on_wire *o) ++{ ++ o->plugin = NULL; ++} ++ ++/* finish with @o */ ++static void object_on_wire_done(reiser4_object_on_wire *o) ++{ ++ if (o->plugin != NULL) ++ o->plugin->wire.done(o); ++} ++ ++/* ++ * read serialized object identity from @addr and store information about ++ * object in @obj. This is dual to encode_inode(). ++ */ ++static char *decode_inode(struct super_block *s, char *addr, ++ reiser4_object_on_wire * obj) ++{ ++ file_plugin *fplug; ++ ++ /* identifier of object plugin is stored in the first two bytes, ++ * followed by... */ ++ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr); ++ if (fplug != NULL) { ++ addr += sizeof(d16); ++ obj->plugin = fplug; ++ assert("nikita-3520", fplug->wire.read != NULL); ++ /* plugin-specific encoding of object identity.
*/ ++ addr = fplug->wire.read(addr, obj); ++ } else ++ addr = ERR_PTR(RETERR(-EINVAL)); ++ return addr; ++} ++ ++static struct dentry *reiser4_get_dentry(struct super_block *super, ++ void *data); ++/** ++ * reiser4_decode_fh: decode on-wire object - helper function ++ * for fh_to_dentry, fh_to_parent export operations; ++ * @super: super block; ++ * @addr: onwire object to be decoded; ++ * ++ * Returns dentry referring to the object being decoded. ++ */ ++static struct dentry *reiser4_decode_fh(struct super_block * super, ++ char * addr) ++{ ++ reiser4_object_on_wire object; ++ ++ object_on_wire_init(&object); ++ ++ addr = decode_inode(super, addr, &object); ++ if (!IS_ERR(addr)) { ++ struct dentry *d; ++ d = reiser4_get_dentry(super, &object); ++ if (d != NULL && !IS_ERR(d)) ++ /* FIXME check for -ENOMEM */ ++ reiser4_get_dentry_fsdata(d)->stateless = 1; ++ addr = (char *)d; ++ } ++ object_on_wire_done(&object); ++ return (void *)addr; ++} ++ ++static struct dentry *reiser4_fh_to_dentry(struct super_block *sb, ++ struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ reiser4_context *ctx; ++ struct dentry *d; ++ ++ assert("edward-1536", ++ fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT); ++ ++ ctx = reiser4_init_context(sb); ++ if (IS_ERR(ctx)) ++ return (struct dentry *)ctx; ++ ++ d = reiser4_decode_fh(sb, (char *)fid->raw); ++ ++ reiser4_exit_context(ctx); ++ return d; ++} ++ ++static struct dentry *reiser4_fh_to_parent(struct super_block *sb, ++ struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ char * addr; ++ struct dentry * d; ++ reiser4_context *ctx; ++ file_plugin *fplug; ++ ++ if (fh_type == FH_WITHOUT_PARENT) ++ return NULL; ++ assert("edward-1537", fh_type == FH_WITH_PARENT); ++ ++ ctx = reiser4_init_context(sb); ++ if (IS_ERR(ctx)) ++ return (struct dentry *)ctx; ++ addr = (char *)fid->raw; ++ /* extract 2-bytes file plugin id */ ++ fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr); ++ if (fplug == NULL) { ++ d = ERR_PTR(RETERR(-EINVAL)); ++ goto exit; ++ } ++ addr += sizeof(d16); ++ /* skip previously encoded object */ ++ addr = fplug->wire.read(addr, NULL /* skip */); ++ if (IS_ERR(addr)) { ++ d = (struct dentry *)addr; ++ goto exit; ++ } ++ /* @extract and decode parent object */ ++ d = reiser4_decode_fh(sb, addr); ++ exit: ++ reiser4_exit_context(ctx); ++ return d; ++} ++ ++/* ++ * Object serialization support. ++ * ++ * To support knfsd file system provides export_operations that are used to ++ * construct and interpret NFS file handles. As a generalization of this, ++ * reiser4 object plugins have serialization support: it provides methods to ++ * create on-wire representation of identity of reiser4 object, and ++ * re-create/locate object given its on-wire identity. ++ * ++ */ ++ ++/* ++ * return number of bytes that on-wire representation of @inode's identity ++ * consumes. ++ */ ++static int encode_inode_size(struct inode *inode) ++{ ++ assert("nikita-3514", inode != NULL); ++ assert("nikita-3515", inode_file_plugin(inode) != NULL); ++ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL); ++ ++ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16); ++} ++ ++/* ++ * store on-wire representation of @inode's identity at the area beginning at ++ * @start. 
++ */ ++static char *encode_inode(struct inode *inode, char *start) ++{ ++ assert("nikita-3517", inode != NULL); ++ assert("nikita-3518", inode_file_plugin(inode) != NULL); ++ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL); ++ ++ /* ++ * first, store two-byte identifier of object plugin, then ++ */ ++ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)), ++ (d16 *) start); ++ start += sizeof(d16); ++ /* ++ * call plugin to serialize object's identity ++ */ ++ return inode_file_plugin(inode)->wire.write(inode, start); ++} ++ ++/* this stores the number of 32-bit words of the file handle in @lenp; 255 is ++ * returned if the file handle can not be stored */ ++/** ++ * reiser4_encode_fh - encode_fh of export operations ++ * @dentry: ++ * @fh: ++ * @lenp: ++ * @need_parent: ++ * ++ */ ++static int ++reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, ++ int need_parent) ++{ ++ struct inode *inode; ++ struct inode *parent; ++ char *addr; ++ int need; ++ int delta; ++ int result; ++ reiser4_context *ctx; ++ ++ /* ++ * knfsd asks us to serialize object in @dentry, and, optionally its ++ * parent (if need_parent != 0). ++ * ++ * encode_inode() and encode_inode_size() are used to build ++ * representation of object and its parent. All hard work is done by ++ * object plugins. ++ */ ++ inode = dentry->d_inode; ++ parent = dentry->d_parent->d_inode; ++ ++ addr = (char *)fh; ++ ++ need = encode_inode_size(inode); ++ if (need < 0) ++ return NFSERROR; ++ if (need_parent) { ++ delta = encode_inode_size(parent); ++ if (delta < 0) ++ return NFSERROR; ++ need += delta; ++ } ++ ++ ctx = reiser4_init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ if (need <= sizeof(__u32) * (*lenp)) { ++ addr = encode_inode(inode, addr); ++ if (need_parent) ++ addr = encode_inode(parent, addr); ++ ++ /* store in lenp the number of 32-bit words required for the ++ * file handle. */ ++ *lenp = (need + sizeof(__u32) - 1) >> 2; ++ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT; ++ } else ++ /* not enough space in file handle */ ++ result = NFSERROR; ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/** ++ * reiser4_get_dentry_parent - get_parent of export operations ++ * @child: ++ * ++ */ ++static struct dentry *reiser4_get_dentry_parent(struct dentry *child) ++{ ++ struct inode *dir; ++ dir_plugin *dplug; ++ struct dentry *result; ++ reiser4_context *ctx; ++ ++ assert("nikita-3527", child != NULL); ++ ++ dir = child->d_inode; ++ assert("nikita-3529", dir != NULL); ++ ++ ctx = reiser4_init_context(dir->i_sb); ++ if (IS_ERR(ctx)) ++ return (void *)ctx; ++ ++ dplug = inode_dir_plugin(dir); ++ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL)); ++ ++ if (unlikely(dplug == NULL)) { ++ reiser4_exit_context(ctx); ++ return ERR_PTR(RETERR(-ENOTDIR)); ++ } ++ result = dplug->get_parent(dir); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/** ++ * reiser4_get_dentry - get_dentry of export operations ++ * @super: ++ * @data: ++ * ++ * ++ */ ++static struct dentry *reiser4_get_dentry(struct super_block *super, void *data) ++{ ++ reiser4_object_on_wire *o; ++ ++ assert("nikita-3522", super != NULL); ++ assert("nikita-3523", data != NULL); ++ /* ++ * this is only supposed to be called by ++ * ++ * reiser4_decode_fh->find_exported_dentry ++ * ++ * so, reiser4_context should be here already.
++ */ ++ assert("nikita-3526", is_in_reiser4_context()); ++ ++ o = (reiser4_object_on_wire *)data; ++ assert("nikita-3524", o->plugin != NULL); ++ assert("nikita-3525", o->plugin->wire.get != NULL); ++ ++ return o->plugin->wire.get(super, o); ++} ++ ++struct export_operations reiser4_export_operations = { ++ .encode_fh = reiser4_encode_fh, ++ .fh_to_dentry = reiser4_fh_to_dentry, ++ .fh_to_parent = reiser4_fh_to_parent, ++ .get_parent = reiser4_get_dentry_parent, ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/flush.c linux-2.6.33/fs/reiser4/flush.c +--- linux-2.6.33.orig/fs/reiser4/flush.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/flush.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,3703 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* The design document for this file is at http://www.namesys.com/v4/v4.html. */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/plugin.h" ++#include "plugin/object.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "carry.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "page_cache.h" ++#include "wander.h" ++#include "super.h" ++#include "entd.h" ++#include "reiser4.h" ++#include "flush.h" ++#include "writeout.h" ++ ++#include <asm/atomic.h> ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/mm.h> /* for struct page */ ++#include <linux/bio.h> /* for struct bio */ ++#include <linux/pagemap.h> ++#include <linux/blkdev.h> ++ ++/* IMPLEMENTATION NOTES */ ++ ++/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of ++ assigning a total order to the nodes of the tree in which the parent is ++ placed before its children, which are ordered (recursively) in left-to-right ++ order. When we speak of a "parent-first preceder", it describes the node that ++ "came before in forward parent-first order". When we speak of a "parent-first ++ follower", it describes the node that "comes next in parent-first order" ++ (alternatively the node that "came before in reverse parent-first order"). ++ ++ The following pseudo-code prints the nodes of a tree in forward parent-first ++ order: ++ ++ void parent_first (node) ++ { ++ print_node (node); ++ if (node->level > leaf) { ++ for (i = 0; i < num_children; i += 1) { ++ parent_first (node->child[i]); ++ } ++ } ++ } ++*/ ++ ++/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block ++ allocation so that a left-to-right scan of the tree's data (i.e., the leaves ++ in left-to-right order) can be accomplished with sequential reads, which ++ results in reading nodes in their parent-first order. This is a ++ read-optimization aspect of the flush algorithm, and there is also a ++ write-optimization aspect, which is that we wish to make large sequential ++ writes to the disk by allocating or reallocating blocks so that they can be ++ written in sequence. Sometimes the read-optimization and write-optimization ++ goals conflict with each other, as we discuss in more detail below. ++*/ ++ ++/* STATE BITS: The flush code revolves around the state of the jnodes it covers. 
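The parent_first() pseudo-code in the notes above can be tried out directly. A compilable toy version over a hard-coded three-level tree (node names invented for illustration, not part of the patch) prints "root twig leaf1 leaf2 leaf3", i.e. every parent strictly before its children, with children visited left to right:

#include <stdio.h>

struct node {
	const char *name;
	struct node *child[3];
	int num_children;
};

/* concrete version of the parent_first() pseudo-code above */
static void parent_first(struct node *node)
{
	int i;

	printf("%s ", node->name);		/* parent first... */
	for (i = 0; i < node->num_children; i++)
		parent_first(node->child[i]);	/* ...children, left to right */
}

int main(void)
{
	struct node leaf1 = { "leaf1", { NULL }, 0 };
	struct node leaf2 = { "leaf2", { NULL }, 0 };
	struct node leaf3 = { "leaf3", { NULL }, 0 };
	struct node twig = { "twig", { &leaf1, &leaf2 }, 2 };
	struct node root = { "root", { &twig, &leaf3 }, 2 };

	parent_first(&root);
	printf("\n");
	return 0;
}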
++ Here are the relevant jnode->state bits and their relevance to flush: ++ ++ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be ++ written it must be allocated first. In order to be considered allocated, ++ the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These ++ two bits are exclusive, and all dirtied jnodes eventually have one of these ++ bits set during each transaction. ++ ++ JNODE_CREATED: The node was freshly created in its transaction and has no ++ previous block address, so it is unconditionally assigned to be relocated, ++ although this is mainly for code-convenience. It is not being 'relocated' ++ from anything, but in almost every regard it is treated as part of the ++ relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is ++ set, so the actual relocate can be distinguished from the ++ created-and-allocated set easily: relocate-set members (belonging to the ++ preserve-set) have (JNODE_RELOC) set and created-set members which have no ++ previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set. ++ ++ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm ++ made the decision to maintain the pre-existing location for this node and ++ it will be written to the wandered-log. ++ ++ JNODE_RELOC: The flush algorithm made the decision to relocate this block ++ (if it was not created, see note above). A block with JNODE_RELOC set is ++ eligible for early-flushing and may be submitted during flush_empty_queues. ++ When the JNODE_RELOC bit is set on a znode, the parent node's internal item ++ is modified and the znode is rehashed. ++ ++ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm ++ scans the node and calls plugin->f.squeeze() method for its items. By this ++ technique we update disk clusters of cryptcompress objects. Also if the ++ leftmost point that was found by flush scan has this flag (races with ++ write(), rare case) the flush algorithm makes the decision to pass it to ++ squalloc() in spite of its flushprepped status, for squeezing, not for ++ repeated allocation. ++ ++ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode ++ into its flush queue. This means the jnode is not on any clean or dirty ++ list, instead it is moved to one of the flush queue (see flush_queue.h) ++ object private list. This prevents multiple concurrent flushes from ++ attempting to start flushing from the same node. ++ ++ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up ++ squeeze-and-allocate on a node while its children are actively being ++ squeezed and allocated. This flag was created to avoid submitting a write ++ request for a node while its children are still being allocated and ++ squeezed. Then flush queue was re-implemented to allow an unlimited number ++ of nodes to be queued. This flag support was commented out in source code ++ because we decided that there was no reason to submit queued nodes before ++ jnode_flush() finishes. However, current code calls fq_write() during a ++ slum traversal and may submit "busy nodes" to disk. Probably we can ++ re-enable the JNODE_FLUSH_BUSY bit support in future. ++ ++ With these state bits, we describe a test used frequently in the code below, ++ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).
++ The test for "flushprepped" returns true if any of the following are true: ++ ++ - The node is not dirty ++ - The node has JNODE_RELOC set ++ - The node has JNODE_OVRWR set ++ ++ If either the node is not dirty or it has already been processed by flush ++ (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If ++ jnode_is_flushprepped() returns false then flush has work to do on that node. ++*/ ++ ++/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never ++ flushprepped twice (unless an explicit call to flush_unprep is made as ++ described in detail below). For example a node is dirtied, allocated, and ++ then early-flushed to disk and set clean. Before the transaction commits, the ++ page is dirtied again and, due to memory pressure, the node is flushed again. ++ The flush algorithm will not relocate the node to a new disk location, it ++ will simply write it to the same, previously relocated position again. ++*/ ++ ++/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm ++ where we start at a leaf node and allocate in parent-first order by iterating ++ to the right. At each step of the iteration, we check for the right neighbor. ++ Before advancing to the right neighbor, we check if the current position and ++ the right neighbor share the same parent. If they do not share the same ++ parent, the parent is allocated before the right neighbor. ++ ++ This process goes recursively up the tree and squeezes nodes level by level ++ as long as the right neighbor and the current position have different ++ parents, then it allocates the right-neighbors-with-different-parents on the ++ way back down. This process is described in more detail in ++ flush_squalloc_changed_ancestor and the recursive function ++ squalloc_one_changed_ancestor. But the purpose here is not so much to discuss ++ the specifics of the bottom-up approach as to contrast the bottom-up and ++ top-down approaches. ++ ++ The top-down algorithm was implemented earlier (April-May 2002). In the ++ top-down approach, we find a starting point by scanning left along each level ++ past dirty nodes, then going up and repeating the process until the left node ++ and the parent node are clean. We then perform a parent-first traversal from ++ the starting point, which makes allocating in parent-first order trivial. ++ After one subtree has been allocated in this manner, we move to the right, ++ try moving upward, then repeat the parent-first traversal. ++ ++ Both approaches have problems that need to be addressed. Both are ++ approximately the same amount of code, but the bottom-up approach has ++ advantages in the order it acquires locks which, at the very least, make it ++ the better approach. At first glance each one makes the other one look ++ simpler, so it is important to remember a few of the problems with each one. ++ ++ Main problem with the top-down approach: When you encounter a clean child ++ during the parent-first traversal, what do you do? You would like to avoid ++ searching through a large tree of nodes just to find a few dirty leaves at ++ the bottom, and there is not an obvious solution. One of the advantages of ++ the top-down approach is that during the parent-first traversal you check ++ every child of a parent to see if it is dirty. In this way, the top-down ++ approach easily handles the main problem of the bottom-up approach: ++ unallocated children.
++ ++ The unallocated children problem is that before writing a node to disk we ++ must make sure that all of its children are allocated. Otherwise, writing ++ the node means extra I/O because the node will have to be written again when ++ the child is finally allocated. ++ ++ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, ++ this should not cause any file system corruption, it only degrades I/O ++ performance because a node may be written when it is sure to be written at ++ least one more time in the same transaction when the remaining children are ++ allocated. What follows is a description of how we will solve the problem. ++*/ ++ ++/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node, ++ then, proceeding in parent-first order, allocate some of its left-children, ++ then encounter a clean child in the middle of the parent. We do not allocate ++ the clean child, but there may remain unallocated (dirty) children to the ++ right of the clean child. If we were to stop flushing at this moment and ++ write everything to disk, the parent might still contain unallocated ++ children. ++ ++ We could try to allocate all the descendants of every node that we allocate, ++ but this is not necessary. Doing so could result in allocating the entire ++ tree: if the root node is allocated then every unallocated node would have to ++ be allocated before flushing. Actually, we do not have to write a node just ++ because we allocate it. It is possible to allocate but not write a node ++ during flush, when it still has unallocated children. However, this approach ++ is probably not optimal for the following reason. ++ ++ The flush algorithm is designed to allocate nodes in parent-first order in an ++ attempt to optimize reads that occur in the same order. Thus we are ++ read-optimizing for a left-to-right scan through all the leaves in the ++ system, and we are hoping to write-optimize at the same time because those ++ nodes will be written together in batch. What happens, however, if we assign ++ a block number to a node in its read-optimized order but then avoid writing ++ it because it has unallocated children? In that situation, we lose out on the ++ write-optimization aspect because a node will have to be written again to ++ its location on the device, later, which likely means seeking back to that ++ location. ++ ++ So there are tradeoffs. We can choose either: ++ ++ A. Allocate all unallocated children to preserve both write-optimization and ++ read-optimization, but this is not always desirable because it may mean ++ having to allocate and flush very many nodes at once. ++ ++ B. Defer writing nodes with unallocated children, keep their read-optimized ++ locations, but sacrifice write-optimization because those nodes will be ++ written again. ++ ++ C. Defer writing nodes with unallocated children, but do not keep their ++ read-optimized locations. Instead, choose to write-optimize them later, when ++ they are written. To facilitate this, we "undo" the read-optimized allocation ++ that was given to the node so that later it can be write-optimized, thus ++ "unpreparing" the flush decision. This is a case where we disturb the ++ FLUSH_PREP_ONCE_PER_TRANSACTION rule described above.
By a call to ++ flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit; ++ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate ++ its block location, and set the JNODE_CREATED bit, effectively setting the ++ node back to an unallocated state. ++ ++ We will take the following approach in v4.0: for twig nodes we will always ++ finish allocating unallocated children (A). For nodes with (level > TWIG) ++ we will defer writing and choose write-optimization (C). ++ ++ To summarize, there are several parts to a solution that avoids the problem ++ with unallocated children: ++ ++ FIXME-ZAM: Still, no approach has been implemented to eliminate the ++ "UNALLOCATED CHILDREN" problem, because an experiment showed that we have ++ 1-2 nodes with unallocated children for thousands of written nodes. The ++ experiment was simple, like copying/deleting the Linux kernel sources. ++ However, the problem can arise in more complex tests. I think we have ++ jnode_io_hook to insert a check for unallocated children and see what kind of ++ problem we have. ++ ++ 1. When flush reaches a stopping point (e.g. a clean node) it should continue ++ calling squeeze-and-allocate on any remaining unallocated children. ++ FIXME: Difficulty to implement: should be simple -- amounts to adding a while ++ loop to jnode_flush, see comments in that function. ++ ++ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes ++ may still have unallocated children. If the twig level has unallocated ++ children it is an assertion failure. If a higher-level node has unallocated ++ children, then it should be explicitly de-allocated by a call to ++ flush_unprep(). ++ FIXME: Difficulty to implement: should be simple. ++ ++ 3. (CPU-Optimization) Checking whether a node has unallocated children may ++ consume more CPU cycles than we would like, and it is possible (but medium ++ complexity) to optimize this somewhat in the case where large sub-trees are ++ flushed. The following observation helps: if both the left- and ++ right-neighbor of a node are processed by the flush algorithm then the node ++ itself is guaranteed to have all of its children allocated. However, the cost ++ of this check may not be so expensive after all: it is not needed for leaves ++ and flush can guarantee this property for twigs. That leaves only (level > ++ TWIG) nodes that have to be checked, so this optimization only helps if at ++ least three (level > TWIG) nodes are flushed in one pass, and the savings ++ will be very small unless there are many more (level > TWIG) nodes. But if ++ there are many (level > TWIG) nodes then the number of blocks being written ++ will be very large, so the savings may be insignificant. That said, the idea ++ is to maintain both the left and right edges of nodes that are processed in ++ flush. When flush_empty_queue() is called, a relatively simple test will ++ tell whether the (level > TWIG) node is on the edge. If it is on the edge, ++ the slow check is necessary, but if it is in the interior then it can be ++ assumed to have all of its children allocated. FIXME: medium complexity to ++ implement, but simple to verify given that we must have a slow check anyway. ++ ++ 4. (Optional) This part is optional, not for v4.0--flush should work ++ independently of whether this option is used or not. Called RAPID_SCAN, the ++ idea is to amend the left-scan operation to take unallocated children into ++ account.
Normally, the left-scan operation goes left as long as adjacent ++ nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at ++ which point it stops and begins flushing. But scan-left may stop at a ++ position where there are unallocated children to the left with the same ++ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops ++ after FLUSH_RELOCATE_THRESHOLD, which is much smaller than ++ FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips ++ all the interior children of a node--if the leftmost child of a twig is ++ dirty, check its left neighbor (the rightmost child of the twig to the left). ++ If the left neighbor of the leftmost child is also dirty, then continue the ++ scan at the left twig and repeat. This option will cause flush to allocate ++ more twigs in a single pass, but it also has the potential to write many more ++ nodes than would otherwise be written without the RAPID_SCAN option. ++ RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD. ++*/ ++ ++/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that ++ the starting point for flush is a leaf node, but actually the flush code ++ cares very little about whether or not this is true. It is possible that all ++ the leaf nodes are flushed and dirty parent nodes still remain, in which case ++ jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats ++ the argument node as if it were a leaf, even when it is not. This is a simple ++ approach, and there may be a more optimal policy but until a problem with ++ this approach is discovered, simplest is probably best. ++ ++ NOTE: In this case, the ordering produced by flush is parent-first only if ++ you ignore the leaves. This is done as a matter of simplicity and there is ++ only one (shaky) justification. When an atom commits, it flushes all leaf ++ level nodes first, followed by twigs, and so on. With flushing done in this ++ order, if flush is eventually called on a non-leaf node it means that ++ (somehow) we reached a point where all leaves are clean and only internal ++ nodes need to be flushed. If that is the case, then it means there were no ++ leaves that were the parent-first preceder/follower of the parent. This is ++ expected to be a rare case, which is why we do nothing special about it. ++ However, memory pressure may pass an internal node to flush when there are ++ still dirty leaf nodes that need to be flushed, which could prove our ++ original assumptions "inoperative". If this needs to be fixed, then ++ scan_left/right should have special checks for the non-leaf levels. For ++ example, instead of passing from a node to the left neighbor, it should pass ++ from the node to the left neighbor's rightmost descendant (if dirty). ++ ++*/ ++ ++/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB ++ chunks, dirtying everything and putting it into a transaction. We tell the ++ allocator to allocate the blocks as far as possible towards one end of the ++ logical device--the left (starting) end of the device if we are walking from ++ left to right, the right end of the device if we are walking from right to ++ left. We then make passes in alternating directions, and as we do this the ++ device becomes sorted such that tree order and block number order fully ++ correlate. ++ ++ Resizing is done by shifting everything either all the way to the left or all ++ the way to the right, and then reporting the last block.
++*/ ++ ++/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. ++ This describes the policy from the highest level: ++ ++ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive ++ nodes on the leaf level during flush-scan (right, left), then we ++ unconditionally decide to relocate leaf nodes. ++ ++ Otherwise, there are two contexts in which we make a decision to relocate: ++ ++ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test(). ++ During the initial stages of flush, after scan-right completes, we want to ++ ask the question: should we relocate this leaf node and thus dirty the parent ++ node. Then if the node is a leftmost child its parent is its own parent-first ++ preceder, thus we repeat the question at the next level up, and so on. In ++ these cases we are moving in the reverse parent-first direction. ++ ++ There is another case which is considered the reverse direction, which comes ++ at the end of a twig in reverse_relocate_end_of_twig(). As we finish ++ processing a twig we may reach a point where there is a clean twig to the ++ right with a dirty leftmost child. In this case, we may wish to relocate the ++ child by testing if it should be relocated relative to its parent. ++ ++ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done ++ in allocate_znode. What distinguishes the forward parent-first case from the ++ reverse parent-first case is that the preceder has already been allocated in ++ the forward case, whereas in the reverse case we don't know what the preceder ++ is until we finish "going in reverse". That simplifies the forward case ++ considerably, and there we actually use the block allocator to determine ++ whether, e.g., a block closer to the preceder is available. ++*/ ++ ++/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, ++ once we finish scan-left and find a starting point, if the parent's left ++ neighbor is dirty then squeeze the parent's left neighbor and the parent. ++ This may change the flush-starting-node's parent. Repeat until the child's ++ parent is stable. If the child is a leftmost child, repeat this left-edge ++ squeezing operation at the next level up. Note that we cannot allocate ++ extents during this or they will be out of parent-first order. There are also ++ some difficult coordinate maintenance issues. We can't do a tree search to ++ find coordinates again (because we hold locks), we have to determine them ++ from the two nodes being squeezed. Looks difficult, but has potential to ++ increase space utilization. */ ++ ++/* Flush-scan helper functions. */ ++static void scan_init(flush_scan * scan); ++static void scan_done(flush_scan * scan); ++ ++/* Flush-scan algorithm. */ ++static int scan_left(flush_scan * scan, flush_scan * right, jnode * node, ++ unsigned limit); ++static int scan_right(flush_scan * scan, jnode * node, unsigned limit); ++static int scan_common(flush_scan * scan, flush_scan * other); ++static int scan_formatted(flush_scan * scan); ++static int scan_unformatted(flush_scan * scan, flush_scan * other); ++static int scan_by_coord(flush_scan * scan); ++ ++/* Initial flush-point ancestor allocation. */ ++static int alloc_pos_and_ancestors(flush_pos_t *pos); ++static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos); ++static int set_preceder(const coord_t *coord_in, flush_pos_t *pos); ++ ++/* Main flush algorithm. ++ Note on abbreviation: "squeeze and allocate" == "squalloc".
*/ ++static int squalloc(flush_pos_t *pos); ++ ++/* Flush squeeze implementation. */ ++static int squeeze_right_non_twig(znode * left, znode * right); ++static int shift_one_internal_unit(znode * left, znode * right); ++ ++/* Flush reverse parent-first relocation routines. */ ++static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, ++ const reiser4_block_nr * nblk); ++static int reverse_relocate_test(jnode * node, const coord_t *parent_coord, ++ flush_pos_t *pos); ++static int reverse_relocate_check_dirty_parent(jnode * node, ++ const coord_t *parent_coord, ++ flush_pos_t *pos); ++ ++/* Flush allocate write-queueing functions: */ ++static int allocate_znode(znode * node, const coord_t *parent_coord, ++ flush_pos_t *pos); ++static int allocate_znode_update(znode * node, const coord_t *parent_coord, ++ flush_pos_t *pos); ++static int lock_parent_and_allocate_znode(znode *, flush_pos_t *); ++ ++/* Flush helper functions: */ ++static int jnode_lock_parent_coord(jnode * node, ++ coord_t *coord, ++ lock_handle * parent_lh, ++ load_count * parent_zh, ++ znode_lock_mode mode, int try); ++static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side, ++ znode_lock_mode mode, int check_dirty, int expected); ++static int znode_same_parents(znode * a, znode * b); ++ ++static int znode_check_flushprepped(znode * node) ++{ ++ return jnode_check_flushprepped(ZJNODE(node)); ++} ++ ++/* Flush position functions */ ++static void pos_init(flush_pos_t *pos); ++static int pos_valid(flush_pos_t *pos); ++static void pos_done(flush_pos_t *pos); ++static int pos_stop(flush_pos_t *pos); ++ ++/* check that @org is first jnode extent unit, if extent is unallocated, ++ * because all jnodes of unallocated extent are dirty and of the same atom. */ ++#define checkchild(scan) \ ++assert("nikita-3435", \ ++ ergo(scan->direction == LEFT_SIDE && \ ++ (scan->parent_coord.node->level == TWIG_LEVEL) && \ ++ jnode_is_unformatted(scan->node) && \ ++ extent_is_unallocated(&scan->parent_coord), \ ++ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node))) ++ ++/* This flush_cnt variable is used to track the number of concurrent flush ++ operations, useful for debugging. It is initialized in txnmgr.c out of ++ laziness (because flush has no static initializer function...) */ ++ON_DEBUG(atomic_t flush_cnt; ++ ) ++ ++/* check fs backing device for write congestion */ ++static int check_write_congestion(void) ++{ ++ struct super_block *sb; ++ struct backing_dev_info *bdi; ++ ++ sb = reiser4_get_current_sb(); ++ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info; ++ return bdi_write_congested(bdi); ++} ++ ++/* conditionally write flush queue */ ++static int write_prepped_nodes(flush_pos_t *pos) ++{ ++ int ret; ++ ++ assert("zam-831", pos); ++ assert("zam-832", pos->fq); ++ ++ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS)) ++ return 0; ++ ++ if (check_write_congestion()) ++ return 0; ++ ++ ret = reiser4_write_fq(pos->fq, pos->nr_written, ++ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); ++ return ret; ++} ++ ++/* Proper release all flush pos. 
resources, then move the flush position to a new
++ locked node */
++static void move_flush_pos(flush_pos_t *pos, lock_handle * new_lock,
++ load_count * new_load, const coord_t *new_coord)
++{
++ assert("zam-857", new_lock->node == new_load->node);
++
++ if (new_coord) {
++ assert("zam-858", new_coord->node == new_lock->node);
++ coord_dup(&pos->coord, new_coord);
++ } else {
++ coord_init_first_unit(&pos->coord, new_lock->node);
++ }
++
++ if (pos->child) {
++ jput(pos->child);
++ pos->child = NULL;
++ }
++
++ move_load_count(&pos->load, new_load);
++ done_lh(&pos->lock);
++ move_lh(&pos->lock, new_lock);
++}
++
++/* delete an empty node whose link from the parent still exists. */
++static int delete_empty_node(znode * node)
++{
++ reiser4_key smallest_removed;
++
++ assert("zam-1019", node != NULL);
++ assert("zam-1020", node_is_empty(node));
++ assert("zam-1023", znode_is_wlocked(node));
++
++ return reiser4_delete_node(node, &smallest_removed, NULL, 1);
++}
++
++/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
++static int prepare_flush_pos(flush_pos_t *pos, jnode * org)
++{
++ int ret;
++ load_count load;
++ lock_handle lock;
++
++ init_lh(&lock);
++ init_load_count(&load);
++
++ if (jnode_is_znode(org)) {
++ ret = longterm_lock_znode(&lock, JZNODE(org),
++ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
++ if (ret)
++ return ret;
++
++ ret = incr_load_count_znode(&load, JZNODE(org));
++ if (ret)
++ return ret;
++
++ pos->state =
++ (jnode_get_level(org) ==
++ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
++ move_flush_pos(pos, &lock, &load, NULL);
++ } else {
++ coord_t parent_coord;
++ ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
++ &load, ZNODE_WRITE_LOCK, 0);
++ if (ret)
++ goto done;
++ if (!item_is_extent(&parent_coord)) {
++ /* file was converted to tail, org became HB, we found
++ an internal item */
++ ret = -EAGAIN;
++ goto done;
++ }
++
++ pos->state = POS_ON_EPOINT;
++ move_flush_pos(pos, &lock, &load, &parent_coord);
++ pos->child = jref(org);
++ if (extent_is_unallocated(&parent_coord)
++ && extent_unit_index(&parent_coord) != index_jnode(org)) {
++ /* @org is not the first child of its parent unit. This
++ may happen because the long term lock of its parent
++ node was released between scan_left and scan_right.
++ For now, work around this by having flush repeat */
++ ret = -EAGAIN;
++ }
++ }
++
++done:
++ done_load_count(&load);
++ done_lh(&lock);
++ return ret;
++}
++
++/* TODO LIST (no particular order): */
++/* I have labelled most of the legitimate FIXME comments in this file with
++ letters to indicate which issue they relate to. There are a few miscellaneous
++ FIXMEs with specific names mentioned instead that need to be
++ inspected/resolved. */
++/* B. There is an issue, described in reverse_relocate_test, with an imprecise
++ is_preceder? check for partially-dirty extents. The code that sets preceder
++ hints and computes the preceder is basically untested. Careful testing needs
++ to be done to verify that preceder calculations are done correctly, since,
++ if it doesn't affect correctness, we will not catch this stuff during
++ regular testing. */
++/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
++ these are considered expected but unlikely conditions. Flush currently
++ returns 0 (i.e., success but no progress, i.e., restart) whenever it receives
++ any of these in jnode_flush().
++ Many of the calls that may produce one of
++ these return values (e.g., longterm_lock_znode, reiser4_get_parent,
++ reiser4_get_neighbor, ...) check some of these values themselves and, for
++ instance, stop flushing instead of resulting in a restart. If any of these
++ results are true error conditions then flush will go into a busy-loop, as we
++ noticed during testing when a corrupt tree caused find_child_ptr to return
++ ENOENT. It needs careful thought and testing of corner conditions.
++*/
++/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
++ created block is assigned a block number then early-flushed to disk. It is
++ dirtied again and flush is called again. Concurrently, that block is deleted,
++ and the de-allocation of its block number does not need to be deferred, since
++ it is not part of the preserve set (i.e., it didn't exist before the
++ transaction). I think there may be a race condition where flush writes the
++ dirty, created block after the non-deferred deallocated block number is
++ re-allocated, making it possible to write deleted data on top of non-deleted
++ data. It's just a theory, but it needs to be thought out. */
++/* F. bio_alloc() failure is not handled gracefully. */
++/* G. Unallocated children. */
++/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered
++ blocks. */
++/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
++
++/* JNODE_FLUSH: MAIN ENTRY POINT */
++/* This is the main entry point for flushing a jnode and its dirty neighborhood
++ (the dirty neighborhood is called a "slum"). jnode_flush() is called whenever
++ reiser4 has to write dirty blocks to disk, which happens when the Linux VM
++ decides to reduce the number of dirty pages or as part of transaction commit.
++
++ Our objective here is to prep and flush the slum the jnode belongs to. We
++ want to squish the slum together, and allocate the nodes in it as we squish
++ because allocation of children affects squishing of parents.
++
++ The "argument" @node tells flush where to start. From there, flush finds the
++ left edge of the slum, and calls squalloc (in which nodes are squeezed and
++ allocated). To find a "better place" to start squalloc, we first perform a
++ flush_scan.
++
++ Flush-scanning may be performed in both left and right directions, but for
++ different purposes. When scanning to the left, we are searching for a node
++ that precedes a sequence of parent-first-ordered nodes which we will then
++ flush in parent-first order. During flush-scanning, we also take the
++ opportunity to count the number of consecutive leaf nodes. If this number is
++ past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
++ reallocate leaf nodes (thus favoring write-optimization).
++
++ Since the flush argument node can be anywhere in a sequence of dirty leaves,
++ there may also be dirty nodes to the right of the argument. If the scan-left
++ operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
++ follow it with a right-scan operation to see whether there are, in fact,
++ enough nodes to meet the relocate threshold. Each right- and left-scan
++ operation uses a single flush_scan object.
++
++ After left-scan and possibly right-scan, we prepare a flush_position object
++ with the starting flush point or parent coordinate, which was determined
++ using scan-left.
++ ++ Next we call the main flush routine, squalloc, which iterates along the leaf ++ level, squeezing and allocating nodes (and placing them into the flush ++ queue). ++ ++ After squalloc returns we take extra steps to ensure that all the children ++ of the final twig node are allocated--this involves repeating squalloc ++ until we finish at a twig with no unallocated children. ++ ++ Finally, we call flush_empty_queue to submit write-requests to disk. If we ++ encounter any above-twig nodes during flush_empty_queue that still have ++ unallocated children, we flush_unprep them. ++ ++ Flush treats several "failure" cases as non-failures, essentially causing ++ them to start over. E_DEADLOCK is one example. ++ FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled ++ properly rather than restarting, but there are a bunch of cases to audit. ++*/ ++ ++static int ++jnode_flush(jnode * node, long nr_to_write, long *nr_written, ++ flush_queue_t *fq, int flags) ++{ ++ long ret = 0; ++ flush_scan *right_scan; ++ flush_scan *left_scan; ++ flush_pos_t *flush_pos; ++ int todo; ++ struct super_block *sb; ++ reiser4_super_info_data *sbinfo; ++ jnode *leftmost_in_slum = NULL; ++ ++ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack())); ++ assert("nikita-3022", reiser4_schedulable()); ++ ++ assert("nikita-3185", ++ get_current_super_private()->delete_mutex_owner != current); ++ ++ /* allocate right_scan, left_scan and flush_pos */ ++ right_scan = ++ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), ++ reiser4_ctx_gfp_mask_get()); ++ if (right_scan == NULL) ++ return RETERR(-ENOMEM); ++ left_scan = right_scan + 1; ++ flush_pos = (flush_pos_t *) (left_scan + 1); ++ ++ sb = reiser4_get_current_sb(); ++ sbinfo = get_super_private(sb); ++ ++ /* Flush-concurrency debug code */ ++#if REISER4_DEBUG ++ atomic_inc(&flush_cnt); ++#endif ++ ++ reiser4_enter_flush(sb); ++ ++ /* Initialize a flush position. */ ++ pos_init(flush_pos); ++ ++ flush_pos->nr_written = nr_written; ++ flush_pos->fq = fq; ++ flush_pos->flags = flags; ++ flush_pos->nr_to_write = nr_to_write; ++ ++ scan_init(right_scan); ++ scan_init(left_scan); ++ ++ /* First scan left and remember the leftmost scan position. If the ++ leftmost position is unformatted we remember its parent_coord. We ++ scan until counting FLUSH_SCAN_MAXNODES. ++ ++ If starting @node is unformatted, at the beginning of left scan its ++ parent (twig level node, containing extent item) will be long term ++ locked and lock handle will be stored in the ++ @right_scan->parent_lock. This lock is used to start the rightward ++ scan without redoing the tree traversal (necessary to find parent) ++ and, hence, is kept during leftward scan. As a result, we have to ++ use try-lock when taking long term locks during the leftward scan. ++ */ ++ ret = scan_left(left_scan, right_scan, ++ node, sbinfo->flush.scan_maxnodes); ++ if (ret != 0) ++ goto failed; ++ ++ leftmost_in_slum = jref(left_scan->node); ++ scan_done(left_scan); ++ ++ /* Then possibly go right to decide if we will use a policy of ++ relocating leaves. This is only done if we did not scan past (and ++ count) enough nodes during the leftward scan. If we do scan right, ++ we only care to go far enough to establish that at least ++ FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan ++ limit is the difference between left_scan.count and the threshold. 
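++
++ For example, if relocate_threshold is, say, 64 and scan-left counted 50
++ dirty leaves, the remaining "todo" is 14: the right scan is capped at 14
++ nodes, because finding 14 more already proves that at least 64 nodes are
++ being flushed and that relocation should be preferred.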
*/
++
++ todo = sbinfo->flush.relocate_threshold - left_scan->count;
++ /* scan right is inherently deadlock prone, because we are
++ * (potentially) holding a lock on the twig node at this moment.
++ * FIXME: this comment is incorrect: the lock is not held */
++ if (todo > 0) {
++ ret = scan_right(right_scan, node, (unsigned)todo);
++ if (ret != 0)
++ goto failed;
++ }
++
++ /* Only the right-scan count is needed, release any rightward locks
++ right away. */
++ scan_done(right_scan);
++
++ /* ... and the answer is: we should relocate leaf nodes if at least
++ FLUSH_RELOCATE_THRESHOLD nodes were found. */
++ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
++ (left_scan->count + right_scan->count >=
++ sbinfo->flush.relocate_threshold);
++
++ /* Funny business here. We set the 'point' in the flush_position
++ prior to starting squalloc regardless of whether the first point is
++ formatted or unformatted. Without this there would be an invariant,
++ in the rest of the code, that if the flush_position is unformatted
++ then flush_position->point is NULL and
++ flush_position->parent_{lock,coord} is set, and if the flush_position
++ is formatted then flush_position->point is non-NULL and no parent
++ info is set.
++
++ This seems lazy, but it makes the initial calls to
++ reverse_relocate_test (which ask whether pos->point is the leftmost
++ child of its parent) much easier because we know the first child
++ already. Nothing is broken by this, but the reasoning is subtle.
++ Holding an extra reference on a jnode during flush can cause us to
++ see nodes with HEARD_BANSHEE during squalloc, because nodes are not
++ removed from sibling lists until they have zero reference count.
++ Flush would never observe a HEARD_BANSHEE node on the left-edge of
++ flush, nodes are only deleted to the right. So if nothing is broken,
++ why fix it?
++
++ NOTE-NIKITA actually, flush can meet a HEARD_BANSHEE node at any
++ point and at any moment, because of concurrent file system
++ activity (for example, truncate). */
++
++ /* Check jnode state after flush_scan completed. Having a lock on this
++ node or its parent (in case of unformatted) helps us in case of
++ concurrent flushing. */
++ if (jnode_check_flushprepped(leftmost_in_slum)
++ && !jnode_convertible(leftmost_in_slum)) {
++ ret = 0;
++ goto failed;
++ }
++
++ /* Now setup flush_pos using scan_left's endpoint. */
++ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
++ if (ret)
++ goto failed;
++
++ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
++ && node_is_empty(flush_pos->coord.node)) {
++ znode *empty = flush_pos->coord.node;
++
++ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
++ ret = delete_empty_node(empty);
++ goto failed;
++ }
++
++ if (jnode_check_flushprepped(leftmost_in_slum)
++ && !jnode_convertible(leftmost_in_slum)) {
++ ret = 0;
++ goto failed;
++ }
++
++ /* Set pos->preceder and (re)allocate pos and its ancestors if needed */
++ ret = alloc_pos_and_ancestors(flush_pos);
++ if (ret)
++ goto failed;
++
++ /* Do the main rightward-bottom-up squeeze and allocate loop. */
++ ret = squalloc(flush_pos);
++ pos_stop(flush_pos);
++ if (ret)
++ goto failed;
++
++ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated
++ children. First, the pos_stop() and pos_valid() routines should be
++ modified so that pos_stop() sets a flush_position->stop flag to 1
++ without releasing the current position immediately--instead release
++ it in pos_done().
++ This is a better implementation than the current one anyway.
++
++ It is not clear that all fields of the flush_position should not be
++ released, but at the very least the parent_lock, parent_coord, and
++ parent_load should remain held because they hold the last twig
++ when pos_stop() is called.
++
++ When we reach this point in the code, if the parent_coord is set to
++ after the last item then we know that flush reached the end of a twig
++ (and according to the new flush queueing design, we will return now).
++ If parent_coord is not past the last item, we should check if the
++ current twig has any unallocated children to the right (we are not
++ concerned with unallocated children to the left--in that case the
++ twig itself should not have been allocated). If the twig has
++ unallocated children to the right, set the parent_coord to that
++ position and then repeat the call to squalloc.
++
++ Testing for unallocated children may be defined in two ways: if any
++ internal item has a fake block number, it is unallocated; if any
++ extent item is unallocated then all of its children are unallocated.
++ But there is a more aggressive approach: if there are any dirty
++ children of the twig to the right of the current position, we may
++ wish to relocate those nodes now. Checking for potential relocation
++ is more expensive as it requires knowing whether there are any dirty
++ children that are not unallocated. The extent_needs_allocation should
++ be used after setting the correct preceder.
++
++ When we reach the end of a twig at this point in the code, if the
++ flush can continue (when the queue is ready) it will need some
++ information on the future starting point. That should be stored away
++ in the flush_handle using a seal, I believe. Holding a jref() on the
++ future starting point may break other code that deletes that node.
++ */
++
++ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
++ called above the twig level. If the VM calls flush above the twig
++ level, do nothing and return (but figure out why this happens). The
++ txnmgr should be modified to only flush its leaf-level dirty list.
++ This will do all the necessary squeeze and allocate steps but leave
++ unallocated branches and possibly unallocated twigs (when the twig's
++ leftmost child is not dirty). After flushing the leaf level, the
++ remaining unallocated nodes should be given write-optimized
++ locations. (Possibly, the remaining unallocated twigs should be
++ allocated just before their leftmost child.)
++ */
++
++ /* Any failure reaches this point. */
++failed:
++
++ switch (ret) {
++ case -E_REPEAT:
++ case -EINVAL:
++ case -E_DEADLOCK:
++ case -E_NO_NEIGHBOR:
++ case -ENOENT:
++ /* FIXME(C): Except for E_DEADLOCK, these should probably be
++ handled properly in each case. They already are handled in
++ many cases. */
++ /* Something bad happened, but difficult to avoid... Try again!
++ */
++ ret = 0;
++ }
++
++ if (leftmost_in_slum)
++ jput(leftmost_in_slum);
++
++ pos_done(flush_pos);
++ scan_done(left_scan);
++ scan_done(right_scan);
++ kfree(right_scan);
++
++ ON_DEBUG(atomic_dec(&flush_cnt));
++
++ reiser4_leave_flush(sb);
++
++ return ret;
++}
++
++/* The reiser4 flush subsystem can be switched into "rapid flush mode", which
++ * means that the flusher should submit all prepped nodes immediately, without
++ * keeping them in flush queues for a long time. The reason for rapid flush
++ * mode is to free memory as fast as possible.
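++ *
++ * For orientation: rapid_flush(pos), defined below, is invoked once per
++ * iteration of the squeeze-and-allocate loop (see
++ * handle_pos_on_formatted()), so in this mode each iteration submits its
++ * prepped nodes through write_prepped_nodes(pos) instead of batching them
++ * until the queue is written at the end.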
*/ ++ ++#if REISER4_USE_RAPID_FLUSH ++ ++/** ++ * submit all prepped nodes if rapid flush mode is set, ++ * turn rapid flush mode off. ++ */ ++ ++static int rapid_flush(flush_pos_t *pos) ++{ ++ if (!wbq_available()) ++ return 0; ++ ++ return write_prepped_nodes(pos); ++} ++ ++#else ++ ++#define rapid_flush(pos) (0) ++ ++#endif /* REISER4_USE_RAPID_FLUSH */ ++ ++static jnode *find_flush_start_jnode(jnode *start, txn_atom * atom, ++ flush_queue_t *fq, int *nr_queued, ++ int flags) ++{ ++ jnode * node; ++ ++ if (start != NULL) { ++ spin_lock_jnode(start); ++ if (!jnode_is_flushprepped(start)) { ++ assert("zam-1056", start->atom == atom); ++ node = start; ++ goto enter; ++ } ++ spin_unlock_jnode(start); ++ } ++ /* ++ * In this loop we process all already prepped (RELOC or OVRWR) and ++ * dirtied again nodes. The atom spin lock is not released until all ++ * dirty nodes processed or not prepped node found in the atom dirty ++ * lists. ++ */ ++ while ((node = find_first_dirty_jnode(atom, flags))) { ++ spin_lock_jnode(node); ++enter: ++ assert("zam-881", JF_ISSET(node, JNODE_DIRTY)); ++ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR)); ++ ++ if (JF_ISSET(node, JNODE_WRITEBACK)) { ++ /* move node to the end of atom's writeback list */ ++ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom)); ++ ++ /* ++ * jnode is not necessarily on dirty list: if it was ++ * dirtied when it was on flush queue - it does not get ++ * moved to dirty list ++ */ ++ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), ++ WB_LIST, 1)); ++ ++ } else if (jnode_is_znode(node) ++ && znode_above_root(JZNODE(node))) { ++ /* ++ * A special case for znode-above-root. The above-root ++ * (fake) znode is captured and dirtied when the tree ++ * height changes or when the root node is relocated. ++ * This causes atoms to fuse so that changes at the root ++ * are serialized. However, this node is never flushed. ++ * This special case used to be in lock.c to prevent the ++ * above-root node from ever being captured, but now ++ * that it is captured we simply prevent it from ++ * flushing. The log-writer code relies on this to ++ * properly log superblock modifications of the tree ++ * height. ++ */ ++ jnode_make_wander_nolock(node); ++ } else if (JF_ISSET(node, JNODE_RELOC)) { ++ queue_jnode(fq, node); ++ ++(*nr_queued); ++ } else ++ break; ++ ++ spin_unlock_jnode(node); ++ } ++ return node; ++} ++ ++/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are ++ * more nodes to flush, return 0 if atom's dirty lists empty and keep current ++ * atom locked, return other errors as they are. 
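++ *
++ * A typical caller therefore loops until the atom runs dry (a sketch,
++ * with error handling elided):
++ *
++ * do {
++ * ret = flush_current_atom(flags, LONG_MAX, &nr, &atom, NULL);
++ * } while (ret == -E_REPEAT);
++ *
++ * On ret == 0 the atom is still locked and its dirty lists are empty;
++ * any other value is a genuine error.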
*/ ++int ++flush_current_atom(int flags, long nr_to_write, long *nr_submitted, ++ txn_atom ** atom, jnode *start) ++{ ++ reiser4_super_info_data *sinfo = get_current_super_private(); ++ flush_queue_t *fq = NULL; ++ jnode *node; ++ int nr_queued; ++ int ret; ++ ++ assert("zam-889", atom != NULL && *atom != NULL); ++ assert_spin_locked(&((*atom)->alock)); ++ assert("zam-892", get_current_context()->trans->atom == *atom); ++ ++ nr_to_write = LONG_MAX; ++ while (1) { ++ ret = reiser4_fq_by_atom(*atom, &fq); ++ if (ret != -E_REPEAT) ++ break; ++ *atom = get_current_atom_locked(); ++ } ++ if (ret) ++ return ret; ++ ++ assert_spin_locked(&((*atom)->alock)); ++ ++ /* parallel flushers limit */ ++ if (sinfo->tmgr.atom_max_flushers != 0) { ++ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) { ++ /* An reiser4_atom_send_event() call is inside ++ reiser4_fq_put_nolock() which is called when flush is ++ finished and nr_flushers is decremented. */ ++ reiser4_atom_wait_event(*atom); ++ *atom = get_current_atom_locked(); ++ } ++ } ++ ++ /* count ourself as a flusher */ ++ (*atom)->nr_flushers++; ++ ++ writeout_mode_enable(); ++ ++ nr_queued = 0; ++ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags); ++ ++ if (node == NULL) { ++ if (nr_queued == 0) { ++ (*atom)->nr_flushers--; ++ reiser4_fq_put_nolock(fq); ++ reiser4_atom_send_event(*atom); ++ /* current atom remains locked */ ++ writeout_mode_disable(); ++ return 0; ++ } ++ spin_unlock_atom(*atom); ++ } else { ++ jref(node); ++ BUG_ON((*atom)->super != node->tree->super); ++ spin_unlock_atom(*atom); ++ spin_unlock_jnode(node); ++ BUG_ON(nr_to_write == 0); ++ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags); ++ jput(node); ++ } ++ ++ ret = ++ reiser4_write_fq(fq, nr_submitted, ++ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); ++ ++ *atom = get_current_atom_locked(); ++ (*atom)->nr_flushers--; ++ reiser4_fq_put_nolock(fq); ++ reiser4_atom_send_event(*atom); ++ spin_unlock_atom(*atom); ++ ++ writeout_mode_disable(); ++ ++ if (ret == 0) ++ ret = -E_REPEAT; ++ ++ return ret; ++} ++ ++/* REVERSE PARENT-FIRST RELOCATION POLICIES */ ++ ++/* This implements the is-it-close-enough-to-its-preceder? test for relocation ++ in the reverse parent-first relocate context. Here all we know is the ++ preceder and the block number. Since we are going in reverse, the preceder ++ may still be relocated as well, so we can't ask the block allocator "is there ++ a closer block available to relocate?" here. In the _forward_ parent-first ++ relocate context (not here) we actually call the block allocator to try and ++ find a closer location. */ ++static int ++reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, ++ const reiser4_block_nr * nblk) ++{ ++ reiser4_block_nr dist; ++ ++ assert("jmacd-7710", *pblk != 0 && *nblk != 0); ++ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk)); ++ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk)); ++ ++ /* Distance is the absolute value. */ ++ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk); ++ ++ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from ++ its preceder block, do not relocate. */ ++ if (dist <= get_current_super_private()->flush.relocate_distance) ++ return 0; ++ ++ return 1; ++} ++ ++/* This function is a predicate that tests for relocation. 
Always called in the ++ reverse-parent-first context, when we are asking whether the current node ++ should be relocated in order to expand the flush by dirtying the parent level ++ (and thus proceeding to flush that level). When traversing in the forward ++ parent-first direction (not here), relocation decisions are handled in two ++ places: allocate_znode() and extent_needs_allocation(). */ ++static int ++reverse_relocate_test(jnode * node, const coord_t *parent_coord, ++ flush_pos_t *pos) ++{ ++ reiser4_block_nr pblk = 0; ++ reiser4_block_nr nblk = 0; ++ ++ assert("jmacd-8989", !jnode_is_root(node)); ++ ++ /* ++ * This function is called only from the ++ * reverse_relocate_check_dirty_parent() and only if the parent ++ * node is clean. This implies that the parent has the real (i.e., not ++ * fake) block number, and, so does the child, because otherwise the ++ * parent would be dirty. ++ */ ++ ++ /* New nodes are treated as if they are being relocated. */ ++ if (JF_ISSET(node, JNODE_CREATED) || ++ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) ++ return 1; ++ ++ /* Find the preceder. FIXME(B): When the child is an unformatted, ++ previously existing node, the coord may be leftmost even though the ++ child is not the parent-first preceder of the parent. If the first ++ dirty node appears somewhere in the middle of the first extent unit, ++ this preceder calculation is wrong. ++ Needs more logic in here. */ ++ if (coord_is_leftmost_unit(parent_coord)) { ++ pblk = *znode_get_block(parent_coord->node); ++ } else { ++ pblk = pos->preceder.blk; ++ } ++ check_preceder(pblk); ++ ++ /* If (pblk == 0) then the preceder isn't allocated or isn't known: ++ relocate. */ ++ if (pblk == 0) ++ return 1; ++ ++ nblk = *jnode_get_block(node); ++ ++ if (reiser4_blocknr_is_fake(&nblk)) ++ /* child is unallocated, mark parent dirty */ ++ return 1; ++ ++ return reverse_relocate_if_close_enough(&pblk, &nblk); ++} ++ ++/* This function calls reverse_relocate_test to make a reverse-parent-first ++ relocation decision and then, if yes, it marks the parent dirty. */ ++static int ++reverse_relocate_check_dirty_parent(jnode * node, const coord_t *parent_coord, ++ flush_pos_t *pos) ++{ ++ int ret; ++ ++ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) { ++ ++ ret = reverse_relocate_test(node, parent_coord, pos); ++ if (ret < 0) ++ return ret; ++ ++ /* FIXME-ZAM ++ if parent is already relocated - we do not want to grab space, ++ right? */ ++ if (ret == 1) { ++ int grabbed; ++ ++ grabbed = get_current_context()->grabbed_blocks; ++ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) != ++ 0) ++ reiser4_panic("umka-1250", ++ "No space left during flush."); ++ ++ assert("jmacd-18923", ++ znode_is_write_locked(parent_coord->node)); ++ znode_make_dirty(parent_coord->node); ++ grabbed2free_mark(grabbed); ++ } ++ } ++ ++ return 0; ++} ++ ++/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE ++ FORWARD PARENT-FIRST LOOP BEGINS) */ ++ ++/* Get the leftmost child for given coord. */ ++static int get_leftmost_child_of_unit(const coord_t *coord, jnode ** child) ++{ ++ int ret; ++ ++ ret = item_utmost_child(coord, LEFT_SIDE, child); ++ ++ if (ret) ++ return ret; ++ ++ if (IS_ERR(*child)) ++ return PTR_ERR(*child); ++ ++ return 0; ++} ++ ++/* This step occurs after the left- and right-scans are completed, before ++ starting the forward parent-first traversal. 
Here we attempt to allocate ++ ancestors of the starting flush point, which means continuing in the reverse ++ parent-first direction to the parent, grandparent, and so on (as long as the ++ child is a leftmost child). This routine calls a recursive process, ++ alloc_one_ancestor, which does the real work, except there is special-case ++ handling here for the first ancestor, which may be a twig. At each level ++ (here and alloc_one_ancestor), we check for relocation and then, if the child ++ is a leftmost child, repeat at the next level. On the way back down (the ++ recursion), we allocate the ancestors in parent-first order. */ ++static int alloc_pos_and_ancestors(flush_pos_t *pos) ++{ ++ int ret = 0; ++ lock_handle plock; ++ load_count pload; ++ coord_t pcoord; ++ ++ if (znode_check_flushprepped(pos->lock.node)) ++ return 0; ++ ++ coord_init_invalid(&pcoord, NULL); ++ init_lh(&plock); ++ init_load_count(&pload); ++ ++ if (pos->state == POS_ON_EPOINT) { ++ /* a special case for pos on twig level, where we already have ++ a lock on parent node. */ ++ /* The parent may not be dirty, in which case we should decide ++ whether to relocate the child now. If decision is made to ++ relocate the child, the parent is marked dirty. */ ++ ret = ++ reverse_relocate_check_dirty_parent(pos->child, &pos->coord, ++ pos); ++ if (ret) ++ goto exit; ++ ++ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child ++ is leftmost) and the leaf/child, so recursion is not needed. ++ Levels above the twig will be allocated for ++ write-optimization before the transaction commits. */ ++ ++ /* Do the recursive step, allocating zero or more of our ++ * ancestors. */ ++ ret = alloc_one_ancestor(&pos->coord, pos); ++ ++ } else { ++ if (!znode_is_root(pos->lock.node)) { ++ /* all formatted nodes except tree root */ ++ ret = ++ reiser4_get_parent(&plock, pos->lock.node, ++ ZNODE_WRITE_LOCK); ++ if (ret) ++ goto exit; ++ ++ ret = incr_load_count_znode(&pload, plock.node); ++ if (ret) ++ goto exit; ++ ++ ret = ++ find_child_ptr(plock.node, pos->lock.node, &pcoord); ++ if (ret) ++ goto exit; ++ ++ ret = ++ reverse_relocate_check_dirty_parent(ZJNODE ++ (pos->lock. ++ node), &pcoord, ++ pos); ++ if (ret) ++ goto exit; ++ ++ ret = alloc_one_ancestor(&pcoord, pos); ++ if (ret) ++ goto exit; ++ } ++ ++ ret = allocate_znode(pos->lock.node, &pcoord, pos); ++ } ++exit: ++ done_load_count(&pload); ++ done_lh(&plock); ++ return ret; ++} ++ ++/* This is the recursive step described in alloc_pos_and_ancestors, above. ++ Ignoring the call to set_preceder, which is the next function described, this ++ checks if the child is a leftmost child and returns if it is not. If the ++ child is a leftmost child it checks for relocation, possibly dirtying the ++ parent. Then it performs the recursive step. */ ++static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos) ++{ ++ int ret = 0; ++ lock_handle alock; ++ load_count aload; ++ coord_t acoord; ++ ++ /* As we ascend at the left-edge of the region to flush, take this ++ opportunity at the twig level to find our parent-first preceder ++ unless we have already set it. */ ++ if (pos->preceder.blk == 0) { ++ ret = set_preceder(coord, pos); ++ if (ret != 0) ++ return ret; ++ } ++ ++ /* If the ancestor is clean or already allocated, or if the child is not ++ a leftmost child, stop going up, even leaving coord->node not ++ flushprepped. 
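++
++ To make the ascent concrete: if leaf L is the leftmost child of twig T,
++ and T is in turn the leftmost child of branch B, the recursion climbs
++ L -> T -> B and then allocates B, T and finally L on the way back down,
++ which is exactly parent-first order for that left edge of the slum.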
*/ ++ if (znode_check_flushprepped(coord->node) ++ || !coord_is_leftmost_unit(coord)) ++ return 0; ++ ++ init_lh(&alock); ++ init_load_count(&aload); ++ coord_init_invalid(&acoord, NULL); ++ ++ /* Only ascend to the next level if it is a leftmost child, but ++ write-lock the parent in case we will relocate the child. */ ++ if (!znode_is_root(coord->node)) { ++ ++ ret = ++ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord, ++ &alock, &aload, ZNODE_WRITE_LOCK, ++ 0); ++ if (ret != 0) { ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ goto exit; ++ } ++ ++ ret = ++ reverse_relocate_check_dirty_parent(ZJNODE(coord->node), ++ &acoord, pos); ++ if (ret != 0) ++ goto exit; ++ ++ /* Recursive call. */ ++ if (!znode_check_flushprepped(acoord.node)) { ++ ret = alloc_one_ancestor(&acoord, pos); ++ if (ret) ++ goto exit; ++ } ++ } ++ ++ /* Note: we call allocate with the parent write-locked (except at the ++ root) in case we relocate the child, in which case it will modify the ++ parent during this call. */ ++ ret = allocate_znode(coord->node, &acoord, pos); ++ ++exit: ++ done_load_count(&aload); ++ done_lh(&alock); ++ return ret; ++} ++ ++/* During the reverse parent-first alloc_pos_and_ancestors process described ++ above there is a call to this function at the twig level. During ++ alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse ++ parent-first context)? We repeat this process as long as the child is the ++ leftmost child, eventually reaching an ancestor of the flush point that is ++ not a leftmost child. The preceder of that ancestors, which is not a leftmost ++ child, is actually on the leaf level. The preceder of that block is the ++ left-neighbor of the flush point. The preceder of that block is the rightmost ++ child of the twig on the left. So, when alloc_pos_and_ancestors passes upward ++ through the twig level, it stops momentarily to remember the block of the ++ rightmost child of the twig on the left and sets it to the flush_position's ++ preceder_hint. ++ ++ There is one other place where we may set the flush_position's preceder hint, ++ which is during scan-left. ++*/ ++static int set_preceder(const coord_t *coord_in, flush_pos_t *pos) ++{ ++ int ret; ++ coord_t coord; ++ lock_handle left_lock; ++ load_count left_load; ++ ++ coord_dup(&coord, coord_in); ++ ++ init_lh(&left_lock); ++ init_load_count(&left_load); ++ ++ /* FIXME(B): Same FIXME as in "Find the preceder" in ++ reverse_relocate_test. coord_is_leftmost_unit is not the right test ++ if the unformatted child is in the middle of the first extent unit.*/ ++ if (!coord_is_leftmost_unit(&coord)) { ++ coord_prev_unit(&coord); ++ } else { ++ ret = ++ reiser4_get_left_neighbor(&left_lock, coord.node, ++ ZNODE_READ_LOCK, GN_SAME_ATOM); ++ if (ret) { ++ /* If we fail for any reason it doesn't matter because ++ the preceder is only a hint. We are low-priority at ++ this point, so this must be the case. 
*/
++ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
++ ret == -ENOENT || ret == -EINVAL
++ || ret == -E_DEADLOCK)
++ ret = 0;
++ goto exit;
++ }
++
++ ret = incr_load_count_znode(&left_load, left_lock.node);
++ if (ret)
++ goto exit;
++
++ coord_init_last_unit(&coord, left_lock.node);
++ }
++
++ ret =
++ item_utmost_child_real_block(&coord, RIGHT_SIDE,
++ &pos->preceder.blk);
++exit:
++ check_preceder(pos->preceder.blk);
++ done_load_count(&left_load);
++ done_lh(&left_lock);
++ return ret;
++}
++
++/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
++
++/* This procedure implements the outer loop of the flush algorithm. To put this
++ in context, here is the general list of steps taken by the flush routine as a
++ whole:
++
++ 1. Scan-left
++ 2. Scan-right (maybe)
++ 3. Allocate initial flush position and its ancestors
++ 4. <handle extents>
++ 5. <squeeze and allocate the next position and its ancestors to the right,
++ then update the position to the right>
++ 6. <repeat from #4 until flush is stopped>
++
++ This procedure implements the loop in steps 4 through 6 in the above listing.
++
++ Step 4: if the current flush position is an extent item (position on the twig
++ level), it allocates the extent (allocate_extent_item_in_place) then shifts
++ to the next coordinate. If the next coordinate's leftmost child needs
++ flushprep, we will continue. If the next coordinate is an internal item, we
++ descend back to the leaf level, otherwise we repeat step #4 (labeled
++ ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the
++ twig level, then we call reverse_relocate_end_of_twig to possibly dirty the
++ next (right) twig, prior to step #5 which moves to the right.
++
++ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up
++ the tree to allocate any ancestors of the next-right flush position that are
++ not also ancestors of the current position. Those ancestors (in top-down
++ order) are the next in parent-first order. We squeeze adjacent nodes on the
++ way up until the right node and current node share the same parent, then
++ allocate on the way back down. Finally, this step sets the flush position to
++ the next-right node. Then repeat steps 4 and 5.
++*/
++
++/* SQUEEZE CODE */
++
++/* squalloc_right_twig helper function, cut a range of extent items from
++ node @to->node, from the beginning up to coord @to. */
++static int squalloc_right_twig_cut(coord_t *to, reiser4_key * to_key,
++ znode * left)
++{
++ coord_t from;
++ reiser4_key from_key;
++
++ coord_init_first_unit(&from, to->node);
++ item_key_by_coord(&from, &from_key);
++
++ return cut_node_content(&from, to, &from_key, to_key, NULL);
++}
++
++/* Copy as much of the leading extents from @right to @left, allocating
++ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
++ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
++ internal item it calls shift_one_internal_unit and may then return
++ SUBTREE_MOVED.
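++
++ A caller is expected to branch on the result roughly like this (an
++ illustrative sketch; squeeze_right_neighbor() below is a real caller,
++ and the three helpers here are placeholders for the caller's policy):
++
++ ret = squeeze_right_twig(left, right, pos);
++ if (ret < 0)
++ return ret; // hard error
++ else if (ret == SQUEEZE_SOURCE_EMPTY)
++ advance_past(right); // @right fully drained
++ else if (ret == SQUEEZE_TARGET_FULL)
++ stop_squeezing(); // @left has no more room
++ else // SUBTREE_MOVED
++ allocate_shifted_child(); // child must be allocated first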
*/ ++static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos) ++{ ++ int ret = SUBTREE_MOVED; ++ coord_t coord; /* used to iterate over items */ ++ reiser4_key stop_key; ++ ++ assert("jmacd-2008", !node_is_empty(right)); ++ coord_init_first_unit(&coord, right); ++ ++ /* FIXME: can be optimized to cut once */ ++ while (!node_is_empty(coord.node) && item_is_extent(&coord)) { ++ ON_DEBUG(void *vp); ++ ++ assert("vs-1468", coord_is_leftmost_unit(&coord)); ++ ON_DEBUG(vp = shift_check_prepare(left, coord.node)); ++ ++ /* stop_key is used to find what was copied and what to cut */ ++ stop_key = *reiser4_min_key(); ++ ret = squalloc_extent(left, &coord, pos, &stop_key); ++ if (ret != SQUEEZE_CONTINUE) { ++ ON_DEBUG(kfree(vp)); ++ break; ++ } ++ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key())); ++ ++ /* Helper function to do the cutting. */ ++ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1); ++ check_me("vs-1466", ++ squalloc_right_twig_cut(&coord, &stop_key, left) == 0); ++ ++ ON_DEBUG(shift_check(vp, left, coord.node)); ++ } ++ ++ if (node_is_empty(coord.node)) ++ ret = SQUEEZE_SOURCE_EMPTY; ++ ++ if (ret == SQUEEZE_TARGET_FULL) ++ goto out; ++ ++ if (node_is_empty(right)) { ++ /* The whole right node was copied into @left. */ ++ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY); ++ goto out; ++ } ++ ++ coord_init_first_unit(&coord, right); ++ ++ if (!item_is_internal(&coord)) { ++ /* we do not want to squeeze anything else to left neighbor ++ because "slum" is over */ ++ ret = SQUEEZE_TARGET_FULL; ++ goto out; ++ } ++ assert("jmacd-433", item_is_internal(&coord)); ++ ++ /* Shift an internal unit. The child must be allocated before shifting ++ any more extents, so we stop here. */ ++ ret = shift_one_internal_unit(left, right); ++ ++out: ++ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL ++ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY); ++ ++ if (ret == SQUEEZE_TARGET_FULL) { ++ /* We submit prepped nodes here and expect that this @left twig ++ * will not be modified again during this jnode_flush() call. */ ++ int ret1; ++ ++ /* NOTE: seems like io is done under long term locks. */ ++ ret1 = write_prepped_nodes(pos); ++ if (ret1 < 0) ++ return ret1; ++ } ++ ++ return ret; ++} ++ ++#if REISER4_DEBUG ++static void item_convert_invariant(flush_pos_t *pos) ++{ ++ assert("edward-1225", coord_is_existing_item(&pos->coord)); ++ if (chaining_data_present(pos)) { ++ item_plugin *iplug = item_convert_plug(pos); ++ ++ assert("edward-1000", ++ iplug == item_plugin_by_coord(&pos->coord)); ++ assert("edward-1001", iplug->f.convert != NULL); ++ } else ++ assert("edward-1226", pos->child == NULL); ++} ++#else ++ ++#define item_convert_invariant(pos) noop ++ ++#endif ++ ++/* Scan node items starting from the first one and apply for each ++ item its flush ->convert() method (if any). This method may ++ resize/kill the item so the tree will be changed. 
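++
++ Shape of such a hook (a sketch only; my_convert is hypothetical):
++
++ static int my_convert(flush_pos_t *pos)
++ {
++ // examine the item at pos->coord and, if needed, replace it
++ // with its converted form; return 0 on success, nonzero to
++ // abort convert_node()
++ return 0;
++ }
++
++ convert_node() below walks the items of one leaf and calls the hook
++ for every item whose plugin defines f.convert.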
++*/ ++static int convert_node(flush_pos_t *pos, znode * node) ++{ ++ int ret = 0; ++ item_plugin *iplug; ++ ++ assert("edward-304", pos != NULL); ++ assert("edward-305", pos->child == NULL); ++ assert("edward-475", znode_convertible(node)); ++ assert("edward-669", znode_is_wlocked(node)); ++ assert("edward-1210", !node_is_empty(node)); ++ ++ if (znode_get_level(node) != LEAF_LEVEL) ++ /* unsupported */ ++ goto exit; ++ ++ coord_init_first_unit(&pos->coord, node); ++ ++ while (1) { ++ ret = 0; ++ coord_set_to_left(&pos->coord); ++ item_convert_invariant(pos); ++ ++ iplug = item_plugin_by_coord(&pos->coord); ++ assert("edward-844", iplug != NULL); ++ ++ if (iplug->f.convert) { ++ ret = iplug->f.convert(pos); ++ if (ret) ++ goto exit; ++ } ++ assert("edward-307", pos->child == NULL); ++ ++ if (coord_next_item(&pos->coord)) { ++ /* node is over */ ++ ++ if (!chaining_data_present(pos)) ++ /* finished this node */ ++ break; ++ if (should_chain_next_node(pos)) { ++ /* go to next node */ ++ move_chaining_data(pos, 0/* to next node */); ++ break; ++ } ++ /* repeat this node */ ++ move_chaining_data(pos, 1/* this node */); ++ continue; ++ } ++ /* Node is not over. ++ Check if there is attached convert data. ++ If so roll one item position back and repeat ++ on this node ++ */ ++ if (chaining_data_present(pos)) { ++ ++ if (iplug != item_plugin_by_coord(&pos->coord)) ++ set_item_convert_count(pos, 0); ++ ++ ret = coord_prev_item(&pos->coord); ++ assert("edward-1003", !ret); ++ ++ move_chaining_data(pos, 1/* this node */); ++ } ++ } ++ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE); ++ znode_make_dirty(node); ++exit: ++ assert("edward-1004", !ret); ++ return ret; ++} ++ ++/* Squeeze and allocate the right neighbor. This is called after @left and ++ its current children have been squeezed and allocated already. This ++ procedure's job is to squeeze and items from @right to @left. ++ ++ If at the leaf level, use the shift_everything_left memcpy-optimized ++ version of shifting (squeeze_right_leaf). ++ ++ If at the twig level, extents are allocated as they are shifted from @right ++ to @left (squalloc_right_twig). ++ ++ At any other level, shift one internal item and return to the caller ++ (squalloc_parent_first) so that the shifted-subtree can be processed in ++ parent-first order. ++ ++ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is ++ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is ++ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL ++ is returned. ++*/ ++ ++static int squeeze_right_neighbor(flush_pos_t *pos, znode * left, ++ znode * right) ++{ ++ int ret; ++ ++ /* FIXME it is possible to see empty hasn't-heard-banshee node in a ++ * tree owing to error (for example, ENOSPC) in write */ ++ /* assert("jmacd-9321", !node_is_empty(left)); */ ++ assert("jmacd-9322", !node_is_empty(right)); ++ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right)); ++ ++ switch (znode_get_level(left)) { ++ case TWIG_LEVEL: ++ /* Shift with extent allocating until either an internal item ++ is encountered or everything is shifted or no free space ++ left in @left */ ++ ret = squeeze_right_twig(left, right, pos); ++ break; ++ ++ default: ++ /* All other levels can use shift_everything until we implement ++ per-item flush plugins. 
*/
++ ret = squeeze_right_non_twig(left, right);
++ break;
++ }
++
++ assert("jmacd-2011", (ret < 0 ||
++ ret == SQUEEZE_SOURCE_EMPTY
++ || ret == SQUEEZE_TARGET_FULL
++ || ret == SUBTREE_MOVED));
++ return ret;
++}
++
++static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos,
++ znode * right)
++{
++ int ret;
++
++ ret = squeeze_right_twig(pos->lock.node, right, pos);
++ if (ret < 0)
++ return ret;
++ if (ret > 0) {
++ coord_init_after_last_item(&pos->coord, pos->lock.node);
++ return ret;
++ }
++
++ coord_init_last_unit(&pos->coord, pos->lock.node);
++ return 0;
++}
++
++/* forward declaration */
++static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
++
++/* do a fast check for "same parents" condition before calling
++ * squalloc_upper_levels() */
++static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos,
++ znode * left,
++ znode * right)
++{
++ if (znode_same_parents(left, right))
++ return 0;
++
++ return squalloc_upper_levels(pos, left, right);
++}
++
++/* Check whether the parent of the given @right node needs to be processed
++ ((re)allocated) prior to processing of the child. If @left and @right do not
++ share a parent, then the parent of @right comes after @left but before
++ @right in parent-first order, and we have to (re)allocate it before @right
++ gets (re)allocated. */
++static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right)
++{
++ int ret;
++
++ lock_handle left_parent_lock;
++ lock_handle right_parent_lock;
++
++ load_count left_parent_load;
++ load_count right_parent_load;
++
++ init_lh(&left_parent_lock);
++ init_lh(&right_parent_lock);
++
++ init_load_count(&left_parent_load);
++ init_load_count(&right_parent_load);
++
++ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
++ if (ret)
++ goto out;
++
++ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
++ if (ret)
++ goto out;
++
++ /* Check for same parents */
++ if (left_parent_lock.node == right_parent_lock.node)
++ goto out;
++
++ if (znode_check_flushprepped(right_parent_lock.node)) {
++ /* Keep parent-first order. In the order, the right parent node
++ stands before the @right node. If it is already allocated,
++ we set the preceder (next block search start point) to its
++ block number; the @right node should be allocated after it.
++
++ However, the preceder is set only if the right parent is on
++ the twig level. The explanation is the following: new branch
++ nodes are allocated over already allocated children while the
++ tree grows, so it is difficult to keep the tree ordered, and
++ we assume that only leaves and twigs are correctly allocated.
++ So, only twigs are used as a preceder for allocating the rest
++ of the slum. */
++ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
++ pos->preceder.blk =
++ *znode_get_block(right_parent_lock.node);
++ check_preceder(pos->preceder.blk);
++ }
++ goto out;
++ }
++
++ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
++ if (ret)
++ goto out;
++
++ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
++ if (ret)
++ goto out;
++
++ ret =
++ squeeze_right_neighbor(pos, left_parent_lock.node,
++ right_parent_lock.node);
++ /* We stop if error. We stop if some items/units were shifted (ret == 0)
++ * and thus @right changed its parent. It means we have not processed
++ * right_parent node prior to processing of @right. Positive return
++ * values say that shifting did not happen because of "empty
++ * source" or "target full" conditions. */
++ if (ret <= 0)
++ goto out;
++
++ /* parent(@left) and parent(@right) may have different parents also. We
++ * do a recursive call for checking that. */
++ ret =
++ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
++ right_parent_lock.node);
++ if (ret)
++ goto out;
++
++ /* allocate znode when going down */
++ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
++
++out:
++ done_load_count(&left_parent_load);
++ done_load_count(&right_parent_load);
++
++ done_lh(&left_parent_lock);
++ done_lh(&right_parent_lock);
++
++ return ret;
++}
++
++/* Check the leftmost child "flushprepped" status; also returns true if the
++ * child node was not found in cache. */
++static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord)
++{
++ int ret;
++ int prepped;
++
++ jnode *child;
++
++ ret = get_leftmost_child_of_unit(coord, &child);
++
++ if (ret)
++ return ret;
++
++ if (child) {
++ prepped = jnode_check_flushprepped(child);
++ jput(child);
++ } else {
++ /* We treat a nonexistent child as a node to which slum
++ processing should not continue. A node that is not cached
++ is clean, so it is flushprepped. */
++ prepped = 1;
++ }
++
++ return prepped;
++}
++
++/* (re)allocate znode, taking its parent node lock automatically */
++static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos)
++{
++ int ret;
++ lock_handle parent_lock;
++ load_count parent_load;
++ coord_t pcoord;
++
++ assert("zam-851", znode_is_write_locked(node));
++
++ init_lh(&parent_lock);
++ init_load_count(&parent_load);
++
++ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
++ if (ret)
++ goto out;
++
++ ret = incr_load_count_znode(&parent_load, parent_lock.node);
++ if (ret)
++ goto out;
++
++ ret = find_child_ptr(parent_lock.node, node, &pcoord);
++ if (ret)
++ goto out;
++
++ ret = allocate_znode(node, &pcoord, pos);
++
++out:
++ done_load_count(&parent_load);
++ done_lh(&parent_lock);
++ return ret;
++}
++
++/* Process nodes on the leaf level until an unformatted node or the rightmost
++ * node in the slum is reached. */
++static int handle_pos_on_formatted(flush_pos_t *pos)
++{
++ int ret;
++ lock_handle right_lock;
++ load_count right_load;
++
++ init_lh(&right_lock);
++ init_load_count(&right_load);
++
++ if (should_convert_node(pos, pos->lock.node)) {
++ ret = convert_node(pos, pos->lock.node);
++ if (ret)
++ return ret;
++ }
++
++ while (1) {
++ int expected;
++ expected = should_convert_next_node(pos);
++ ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
++ ZNODE_WRITE_LOCK, !expected, expected);
++ if (ret) {
++ if (expected)
++ warning("edward-1495",
++ "Expected neighbor not found (ret = %d). Fsck?",
++ ret);
++ break;
++ }
++
++ /* we don't prep(allocate) nodes for flushing twice. This can be
++ * suboptimal, or it can be optimal. For now we choose to live
++ * with the risk that it will be suboptimal because it would be
++ * quite complex to code it to be smarter.
*/
++ if (znode_check_flushprepped(right_lock.node)
++ && !znode_convertible(right_lock.node)) {
++ assert("edward-1005", !should_convert_next_node(pos));
++ pos_stop(pos);
++ break;
++ }
++
++ ret = incr_load_count_znode(&right_load, right_lock.node);
++ if (ret)
++ break;
++ if (should_convert_node(pos, right_lock.node)) {
++ ret = convert_node(pos, right_lock.node);
++ if (ret)
++ break;
++ if (node_is_empty(right_lock.node)) {
++ /* node became empty after converting, repeat */
++ done_load_count(&right_load);
++ done_lh(&right_lock);
++ continue;
++ }
++ }
++
++ /* squeeze _before_ going upward. */
++ ret =
++ squeeze_right_neighbor(pos, pos->lock.node,
++ right_lock.node);
++ if (ret < 0)
++ break;
++
++ if (znode_check_flushprepped(right_lock.node)) {
++ if (should_convert_next_node(pos)) {
++ /* in spite of the flushprepped status of the
++ node, its right slum neighbor should be
++ converted */
++ assert("edward-953", convert_data(pos));
++ assert("edward-954", item_convert_data(pos));
++
++ if (node_is_empty(right_lock.node)) {
++ done_load_count(&right_load);
++ done_lh(&right_lock);
++ } else
++ move_flush_pos(pos, &right_lock,
++ &right_load, NULL);
++ continue;
++ }
++ pos_stop(pos);
++ break;
++ }
++
++ if (node_is_empty(right_lock.node)) {
++ /* repeat if right node was squeezed completely */
++ done_load_count(&right_load);
++ done_lh(&right_lock);
++ continue;
++ }
++
++ /* parent(right_lock.node) has to be processed before
++ * (right_lock.node) due to "parent-first" allocation order. */
++ ret =
++ check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
++ right_lock.node);
++ if (ret)
++ break;
++ /* (re)allocate _after_ going upward */
++ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
++ if (ret)
++ break;
++ if (should_terminate_squalloc(pos)) {
++ set_item_convert_count(pos, 0);
++ break;
++ }
++
++ /* advance the flush position to the right neighbor */
++ move_flush_pos(pos, &right_lock, &right_load, NULL);
++
++ ret = rapid_flush(pos);
++ if (ret)
++ break;
++ }
++ check_convert_info(pos);
++ done_load_count(&right_load);
++ done_lh(&right_lock);
++
++ /* This function indicates via pos whether to stop or go to twig or
++ * continue on the current level. */
++ return ret;
++
++}
++
++/* Process nodes on the leaf level until an unformatted node or the rightmost
++ * node in the slum is reached. */
++static int handle_pos_on_leaf(flush_pos_t *pos)
++{
++ int ret;
++
++ assert("zam-845", pos->state == POS_ON_LEAF);
++
++ ret = handle_pos_on_formatted(pos);
++
++ if (ret == -E_NO_NEIGHBOR) {
++ /* cannot get right neighbor, go process extents. */
++ pos->state = POS_TO_TWIG;
++ return 0;
++ }
++
++ return ret;
++}
++
++/* Process slum on level > 1 */
++static int handle_pos_on_internal(flush_pos_t *pos)
++{
++ assert("zam-850", pos->state == POS_ON_INTERNAL);
++ return handle_pos_on_formatted(pos);
++}
++
++/* check whether squalloc should stop before processing the given extent */
++static int squalloc_extent_should_stop(flush_pos_t *pos)
++{
++ assert("zam-869", item_is_extent(&pos->coord));
++
++ /* pos->child is the jnode handle_pos_on_extent() should start with,
++ * instead of the first child of the first extent unit.
*/
++ if (pos->child) {
++ int prepped;
++
++ assert("vs-1383", jnode_is_unformatted(pos->child));
++ prepped = jnode_check_flushprepped(pos->child);
++ pos->pos_in_unit =
++ jnode_get_index(pos->child) -
++ extent_unit_index(&pos->coord);
++ assert("vs-1470",
++ pos->pos_in_unit < extent_unit_width(&pos->coord));
++ assert("nikita-3434",
++ ergo(extent_is_unallocated(&pos->coord),
++ pos->pos_in_unit == 0));
++ jput(pos->child);
++ pos->child = NULL;
++
++ return prepped;
++ }
++
++ pos->pos_in_unit = 0;
++ if (extent_is_unallocated(&pos->coord))
++ return 0;
++
++ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
++}
++
++/* Handle the case when the regular reiser4 tree (znodes connected to their
++ * neighbors by sibling pointers) is interrupted on the leaf level by one or
++ * more unformatted nodes. By holding a lock on the twig level and using extent
++ * code routines to process unformatted nodes, we swim around an irregular part
++ * of the reiser4 tree. */
++static int handle_pos_on_twig(flush_pos_t *pos)
++{
++ int ret;
++
++ assert("zam-844", pos->state == POS_ON_EPOINT);
++ assert("zam-843", item_is_extent(&pos->coord));
++
++ /* We decide whether we should continue slum processing with the
++ current extent unit: if the leftmost child of the current extent
++ unit is flushprepped (i.e. clean or already processed by flush) we
++ stop squalloc(). There is a fast check for unallocated extents,
++ which we assume contain only not-yet-flushprepped nodes. */
++ /* FIXME: Here we implement a simple check; we are only looking at the
++ leftmost child. */
++ ret = squalloc_extent_should_stop(pos);
++ if (ret != 0) {
++ pos_stop(pos);
++ return ret;
++ }
++
++ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
++ && item_is_extent(&pos->coord)) {
++ ret = reiser4_alloc_extent(pos);
++ if (ret)
++ break;
++ coord_next_unit(&pos->coord);
++ }
++
++ if (coord_is_after_rightmost(&pos->coord)) {
++ pos->state = POS_END_OF_TWIG;
++ return 0;
++ }
++ if (item_is_internal(&pos->coord)) {
++ pos->state = POS_TO_LEAF;
++ return 0;
++ }
++
++ assert("zam-860", item_is_extent(&pos->coord));
++
++ /* "slum" is over */
++ pos->state = POS_INVALID;
++ return 0;
++}
++
++/* When we are about to return the flush position from the twig level back to
++ * the leaf level, we can process the right twig node or move the position to
++ * the leaf. This processes the right twig if possible and jumps to the leaf
++ * level if not. */
++static int handle_pos_end_of_twig(flush_pos_t *pos)
++{
++ int ret;
++ lock_handle right_lock;
++ load_count right_load;
++ coord_t at_right;
++ jnode *child = NULL;
++
++ assert("zam-848", pos->state == POS_END_OF_TWIG);
++ assert("zam-849", coord_is_after_rightmost(&pos->coord));
++
++ init_lh(&right_lock);
++ init_load_count(&right_load);
++
++ /* We take a lock on the right twig node even if it is not dirty,
++ * because the slum continues or discontinues on the leaf level, not
++ * on the next twig. This lock on the right twig is needed for getting
++ * its leftmost child. */
++ ret =
++ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
++ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
++ if (ret)
++ goto out;
++
++ ret = incr_load_count_znode(&right_load, right_lock.node);
++ if (ret)
++ goto out;
++
++ /* the right twig could be not dirty */
++ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
++ /* If the right twig node is dirty we always attempt to squeeze
++ * its content to the left...
*/ ++became_dirty: ++ ret = ++ squeeze_right_twig_and_advance_coord(pos, right_lock.node); ++ if (ret <= 0) { ++ /* pos->coord is on internal item, go to leaf level, or ++ * we have an error which will be caught in squalloc() ++ */ ++ pos->state = POS_TO_LEAF; ++ goto out; ++ } ++ ++ /* If the right twig was squeezed completely we have to re-lock the ++ * right twig. Now it is done through the top-level squalloc ++ * routine. */ ++ if (node_is_empty(right_lock.node)) ++ goto out; ++ ++ /* ... and prep it if it is not yet prepped */ ++ if (!znode_check_flushprepped(right_lock.node)) { ++ /* As usual, process parent before ... */ ++ ret = ++ check_parents_and_squalloc_upper_levels(pos, ++ pos->lock. ++ node, ++ right_lock. ++ node); ++ if (ret) ++ goto out; ++ ++ /* ... processing the child */ ++ ret = ++ lock_parent_and_allocate_znode(right_lock.node, ++ pos); ++ if (ret) ++ goto out; ++ } ++ } else { ++ coord_init_first_unit(&at_right, right_lock.node); ++ ++ /* check the first child of the next twig: should we continue there? */ ++ ret = get_leftmost_child_of_unit(&at_right, &child); ++ if (ret || child == NULL || jnode_check_flushprepped(child)) { ++ pos_stop(pos); ++ goto out; ++ } ++ ++ /* check clean twig for possible relocation */ ++ if (!znode_check_flushprepped(right_lock.node)) { ++ ret = ++ reverse_relocate_check_dirty_parent(child, ++ &at_right, pos); ++ if (ret) ++ goto out; ++ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) ++ goto became_dirty; ++ } ++ } ++ ++ assert("zam-875", znode_check_flushprepped(right_lock.node)); ++ ++ /* Update the preceder with the block number of the just-processed right ++ * twig node. The code above could miss the preceder update because ++ * allocate_znode() might not have been called for this node. */ ++ pos->preceder.blk = *znode_get_block(right_lock.node); ++ check_preceder(pos->preceder.blk); ++ ++ coord_init_first_unit(&at_right, right_lock.node); ++ assert("zam-868", coord_is_existing_unit(&at_right)); ++ ++ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF; ++ move_flush_pos(pos, &right_lock, &right_load, &at_right); ++ ++out: ++ done_load_count(&right_load); ++ done_lh(&right_lock); ++ ++ if (child) ++ jput(child); ++ ++ return ret; ++} ++ ++/* Move pos->lock to the leaf node pointed to by pos->coord and check whether ++ * we should continue there. 
*/ ++static int handle_pos_to_leaf(flush_pos_t *pos) ++{ ++ int ret; ++ lock_handle child_lock; ++ load_count child_load; ++ jnode *child; ++ ++ assert("zam-846", pos->state == POS_TO_LEAF); ++ assert("zam-847", item_is_internal(&pos->coord)); ++ ++ init_lh(&child_lock); ++ init_load_count(&child_load); ++ ++ ret = get_leftmost_child_of_unit(&pos->coord, &child); ++ if (ret) ++ return ret; ++ if (child == NULL) { ++ pos_stop(pos); ++ return 0; ++ } ++ ++ if (jnode_check_flushprepped(child)) { ++ pos->state = POS_INVALID; ++ goto out; ++ } ++ ++ ret = ++ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_LOPRI); ++ if (ret) ++ goto out; ++ ++ ret = incr_load_count_znode(&child_load, JZNODE(child)); ++ if (ret) ++ goto out; ++ ++ ret = allocate_znode(JZNODE(child), &pos->coord, pos); ++ if (ret) ++ goto out; ++ ++ /* move flush position to leaf level */ ++ pos->state = POS_ON_LEAF; ++ move_flush_pos(pos, &child_lock, &child_load, NULL); ++ ++ if (node_is_empty(JZNODE(child))) { ++ ret = delete_empty_node(JZNODE(child)); ++ pos->state = POS_INVALID; ++ } ++out: ++ done_load_count(&child_load); ++ done_lh(&child_lock); ++ jput(child); ++ ++ return ret; ++} ++ ++/* move pos from leaf to twig, and move lock from leaf to twig. */ ++/* Move pos->lock to upper (twig) level */ ++static int handle_pos_to_twig(flush_pos_t *pos) ++{ ++ int ret; ++ ++ lock_handle parent_lock; ++ load_count parent_load; ++ coord_t pcoord; ++ ++ assert("zam-852", pos->state == POS_TO_TWIG); ++ ++ init_lh(&parent_lock); ++ init_load_count(&parent_load); ++ ++ ret = ++ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK); ++ if (ret) ++ goto out; ++ ++ ret = incr_load_count_znode(&parent_load, parent_lock.node); ++ if (ret) ++ goto out; ++ ++ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord); ++ if (ret) ++ goto out; ++ ++ assert("zam-870", item_is_internal(&pcoord)); ++ coord_next_item(&pcoord); ++ ++ if (coord_is_after_rightmost(&pcoord)) ++ pos->state = POS_END_OF_TWIG; ++ else if (item_is_extent(&pcoord)) ++ pos->state = POS_ON_EPOINT; ++ else { ++ /* Here we understand that getting -E_NO_NEIGHBOR in ++ * handle_pos_on_leaf() was because of just a reaching edge of ++ * slum */ ++ pos_stop(pos); ++ goto out; ++ } ++ ++ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord); ++ ++out: ++ done_load_count(&parent_load); ++ done_lh(&parent_lock); ++ ++ return ret; ++} ++ ++typedef int (*pos_state_handle_t) (flush_pos_t *); ++static pos_state_handle_t flush_pos_handlers[] = { ++ /* process formatted nodes on leaf level, keep lock on a leaf node */ ++ [POS_ON_LEAF] = handle_pos_on_leaf, ++ /* process unformatted nodes, keep lock on twig node, pos->coord points ++ * to extent currently being processed */ ++ [POS_ON_EPOINT] = handle_pos_on_twig, ++ /* move a lock from leaf node to its parent for further processing of ++ unformatted nodes */ ++ [POS_TO_TWIG] = handle_pos_to_twig, ++ /* move a lock from twig to leaf level when a processing of unformatted ++ * nodes finishes, pos->coord points to the leaf node we jump to */ ++ [POS_TO_LEAF] = handle_pos_to_leaf, ++ /* after processing last extent in the twig node, attempting to shift ++ * items from the twigs right neighbor and process them while shifting*/ ++ [POS_END_OF_TWIG] = handle_pos_end_of_twig, ++ /* process formatted nodes on internal level, keep lock on an internal ++ node */ ++ [POS_ON_INTERNAL] = handle_pos_on_internal ++}; ++ ++/* Advance flush position horizontally, prepare for flushing ((re)allocate, ++ * 
squeeze, encrypt) nodes and their ancestors in "parent-first" order */ ++static int squalloc(flush_pos_t *pos) ++{ ++ int ret = 0; ++ ++ /* maybe needs to be made a case statement with handle_pos_on_leaf as ++ * first case, for greater CPU efficiency? Measure and see.... -Hans */ ++ while (pos_valid(pos)) { ++ ret = flush_pos_handlers[pos->state] (pos); ++ if (ret < 0) ++ break; ++ ++ ret = rapid_flush(pos); ++ if (ret) ++ break; ++ } ++ ++ /* any positive value or -E_NO_NEIGHBOR are legal return codes for ++ handle_pos* routines, -E_NO_NEIGHBOR means that slum edge was ++ reached */ ++ if (ret > 0 || ret == -E_NO_NEIGHBOR) ++ ret = 0; ++ ++ return ret; ++} ++ ++static void update_ldkey(znode * node) ++{ ++ reiser4_key ldkey; ++ ++ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); ++ if (node_is_empty(node)) ++ return; ++ ++ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey)); ++} ++ ++/* this is to be called after calling of shift node's method to shift data from ++ @right to @left. It sets left delimiting keys of @left and @right to keys of ++ first items of @left and @right correspondingly and sets right delimiting key ++ of @left to first key of @right */ ++static void update_znode_dkeys(znode * left, znode * right) ++{ ++ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock)); ++ assert("vs-1629", (znode_is_write_locked(left) && ++ znode_is_write_locked(right))); ++ ++ /* we need to update left delimiting of left if it was empty before ++ shift */ ++ update_ldkey(left); ++ update_ldkey(right); ++ if (node_is_empty(right)) ++ znode_set_rd_key(left, znode_get_rd_key(right)); ++ else ++ znode_set_rd_key(left, znode_get_ld_key(right)); ++} ++ ++/* try to shift everything from @right to @left. If everything was shifted - ++ @right is removed from the tree. Result is the number of bytes shifted. */ ++static int ++shift_everything_left(znode * right, znode * left, carry_level * todo) ++{ ++ coord_t from; ++ node_plugin *nplug; ++ carry_plugin_info info; ++ ++ coord_init_after_last_item(&from, right); ++ ++ nplug = node_plugin_by_node(right); ++ info.doing = NULL; ++ info.todo = todo; ++ return nplug->shift(&from, left, SHIFT_LEFT, ++ 1 /* delete @right if it becomes empty */ , ++ 1 ++ /* move coord @from to node @left if everything will ++ be shifted */ ++ , ++ &info); ++} ++ ++/* Shift as much as possible from @right to @left using the memcpy-optimized ++ shift_everything_left. @left and @right are formatted neighboring nodes on ++ leaf level. */ ++static int squeeze_right_non_twig(znode * left, znode * right) ++{ ++ int ret; ++ carry_pool *pool; ++ carry_level *todo; ++ ++ assert("nikita-2246", znode_get_level(left) == znode_get_level(right)); ++ ++ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) || ++ !JF_ISSET(ZJNODE(right), JNODE_DIRTY)) ++ return SQUEEZE_TARGET_FULL; ++ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ todo = (carry_level *) (pool + 1); ++ init_carry_level(todo, pool); ++ ++ ret = shift_everything_left(right, left, todo); ++ if (ret > 0) { ++ /* something was shifted */ ++ reiser4_tree *tree; ++ __u64 grabbed; ++ ++ znode_make_dirty(left); ++ znode_make_dirty(right); ++ ++ /* update delimiting keys of nodes which participated in ++ shift. FIXME: it would be better to have this in shift ++ node's operation. But it can not be done there. 
Nobody ++ remembers why, though */ ++ tree = znode_get_tree(left); ++ write_lock_dk(tree); ++ update_znode_dkeys(left, right); ++ write_unlock_dk(tree); ++ ++ /* Carry is called to update delimiting key and, maybe, to ++ remove empty node. */ ++ grabbed = get_current_context()->grabbed_blocks; ++ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); ++ assert("nikita-3003", ret == 0); /* reserved space is ++ exhausted. Ask Hans. */ ++ ret = reiser4_carry(todo, NULL/* previous level */); ++ grabbed2free_mark(grabbed); ++ } else { ++ /* Shifting impossible, we return appropriate result code */ ++ ret = ++ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY : ++ SQUEEZE_TARGET_FULL; ++ } ++ ++ done_carry_pool(pool); ++ ++ return ret; ++} ++ ++#if REISER4_DEBUG ++static int sibling_link_is_ok(const znode *left, const znode *right) ++{ ++ int result; ++ ++ read_lock_tree(znode_get_tree(left)); ++ result = (left->right == right && left == right->left); ++ read_unlock_tree(znode_get_tree(left)); ++ return result; ++} ++#endif ++ ++/* Shift first unit of first item if it is an internal one. Return ++ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return ++ SUBTREE_MOVED. */ ++static int shift_one_internal_unit(znode * left, znode * right) ++{ ++ int ret; ++ carry_pool *pool; ++ carry_level *todo; ++ coord_t *coord; ++ carry_plugin_info *info; ++ int size, moved; ++ ++ assert("nikita-2247", znode_get_level(left) == znode_get_level(right)); ++ assert("nikita-2435", znode_is_write_locked(left)); ++ assert("nikita-2436", znode_is_write_locked(right)); ++ assert("nikita-2434", sibling_link_is_ok(left, right)); ++ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + ++ sizeof(*coord) + sizeof(*info) ++#if REISER4_DEBUG ++ + sizeof(*coord) + 2 * sizeof(reiser4_key) ++#endif ++ ); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ todo = (carry_level *) (pool + 1); ++ init_carry_level(todo, pool); ++ ++ coord = (coord_t *) (todo + 3); ++ coord_init_first_unit(coord, right); ++ info = (carry_plugin_info *) (coord + 1); ++ ++#if REISER4_DEBUG ++ if (!node_is_empty(left)) { ++ coord_t *last; ++ reiser4_key *right_key; ++ reiser4_key *left_key; ++ ++ last = (coord_t *) (info + 1); ++ right_key = (reiser4_key *) (last + 1); ++ left_key = right_key + 1; ++ coord_init_last_unit(last, left); ++ ++ assert("nikita-2463", ++ keyle(item_key_by_coord(last, left_key), ++ item_key_by_coord(coord, right_key))); ++ } ++#endif ++ ++ assert("jmacd-2007", item_is_internal(coord)); ++ ++ size = item_length_by_coord(coord); ++ info->todo = todo; ++ info->doing = NULL; ++ ++ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT, ++ 1 ++ /* delete @right if it becomes ++ empty */ ++ , ++ 0 ++ /* do not move coord @coord to ++ node @left */ ++ , ++ info); ++ ++ /* If shift returns positive, then we shifted the item. */ ++ assert("vs-423", ret <= 0 || size == ret); ++ moved = (ret > 0); ++ ++ if (moved) { ++ /* something was moved */ ++ reiser4_tree *tree; ++ int grabbed; ++ ++ znode_make_dirty(left); ++ znode_make_dirty(right); ++ tree = znode_get_tree(left); ++ write_lock_dk(tree); ++ update_znode_dkeys(left, right); ++ write_unlock_dk(tree); ++ ++ /* reserve space for delimiting keys after shifting */ ++ grabbed = get_current_context()->grabbed_blocks; ++ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); ++ assert("nikita-3003", ret == 0); /* reserved space is ++ exhausted. Ask Hans. 
*/ ++ ++ ret = reiser4_carry(todo, NULL/* previous level */); ++ grabbed2free_mark(grabbed); ++ } ++ ++ done_carry_pool(pool); ++ ++ if (ret != 0) { ++ /* Shift or carry operation failed. */ ++ assert("jmacd-7325", ret < 0); ++ return ret; ++ } ++ ++ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL; ++} ++ ++/* Make the final relocate/wander decision during forward parent-first squalloc ++ for a znode. For unformatted nodes this is done in ++ plugin/item/extent.c:extent_needs_allocation(). */ ++static int ++allocate_znode_loaded(znode * node, ++ const coord_t *parent_coord, flush_pos_t *pos) ++{ ++ int ret; ++ reiser4_super_info_data *sbinfo = get_current_super_private(); ++ /* FIXME(D): We have the node write-locked and should have checked for ! ++ allocated() somewhere before reaching this point, but there can be a ++ race, so this assertion is bogus. */ ++ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node))); ++ assert("jmacd-7988", znode_is_write_locked(node)); ++ assert("jmacd-7989", coord_is_invalid(parent_coord) ++ || znode_is_write_locked(parent_coord->node)); ++ ++ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) || ++ znode_is_root(node) || ++ /* We have enough nodes to relocate no matter what. */ ++ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) { ++ /* No need to decide with new nodes, they are treated the same ++ as relocate. If the root node is dirty, relocate. */ ++ if (pos->preceder.blk == 0) { ++ /* preceder is unknown and we have decided to relocate ++ node -- using of default value for search start is ++ better than search from block #0. */ ++ get_blocknr_hint_default(&pos->preceder.blk); ++ check_preceder(pos->preceder.blk); ++ } ++ ++ goto best_reloc; ++ ++ } else if (pos->preceder.blk == 0) { ++ /* If we don't know the preceder, leave it where it is. */ ++ jnode_make_wander(ZJNODE(node)); ++ } else { ++ /* Make a decision based on block distance. */ ++ reiser4_block_nr dist; ++ reiser4_block_nr nblk = *znode_get_block(node); ++ ++ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk)); ++ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk)); ++ assert("jmacd-6174", pos->preceder.blk != 0); ++ ++ if (pos->preceder.blk == nblk - 1) { ++ /* Ideal. */ ++ jnode_make_wander(ZJNODE(node)); ++ } else { ++ ++ dist = ++ (nblk < ++ pos->preceder.blk) ? (pos->preceder.blk - ++ nblk) : (nblk - ++ pos->preceder.blk); ++ ++ /* See if we can find a closer block ++ (forward direction only). */ ++ pos->preceder.max_dist = ++ min((reiser4_block_nr) sbinfo->flush. ++ relocate_distance, dist); ++ pos->preceder.level = znode_get_level(node); ++ ++ ret = allocate_znode_update(node, parent_coord, pos); ++ ++ pos->preceder.max_dist = 0; ++ ++ if (ret && (ret != -ENOSPC)) ++ return ret; ++ ++ if (ret == 0) { ++ /* Got a better allocation. */ ++ znode_make_reloc(node, pos->fq); ++ } else if (dist < sbinfo->flush.relocate_distance) { ++ /* The present allocation is good enough. */ ++ jnode_make_wander(ZJNODE(node)); ++ } else { ++ /* Otherwise, try to relocate to the best ++ position. */ ++best_reloc: ++ ret = ++ allocate_znode_update(node, parent_coord, ++ pos); ++ if (ret != 0) ++ return ret; ++ ++ /* set JNODE_RELOC bit _after_ node gets ++ allocated */ ++ znode_make_reloc(node, pos->fq); ++ } ++ } ++ } ++ ++ /* This is the new preceder. 
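
The relocate/wander choice above condenses to the following decision table; this is an illustrative summary of allocate_znode_loaded(), omitting error paths and the details of the allocation attempt:

    forced (JNODE_REPACK, JNODE_CREATED, root,
            or the leaf_relocate policy)          -> relocate to best position
    preceder unknown                              -> wander (leave in place)
    block is already preceder + 1                 -> wander (ideal placement)
    closer block found within relocate_distance   -> relocate to closer block
    distance below relocate_distance, none closer -> wander (good enough)
    otherwise                                     -> relocate to best position
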
*/ ++ pos->preceder.blk = *znode_get_block(node); ++ check_preceder(pos->preceder.blk); ++ pos->alloc_cnt += 1; ++ ++ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk)); ++ ++ return 0; ++} ++ ++static int ++allocate_znode(znode * node, const coord_t *parent_coord, flush_pos_t *pos) ++{ ++ /* ++ * perform znode allocation with znode pinned in memory to avoid races ++ * with asynchronous emergency flush (which plays with ++ * JNODE_FLUSH_RESERVED bit). ++ */ ++ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos)); ++} ++ ++/* A subroutine of allocate_znode, this is called first to see if there is a ++ close position to relocate to. It may return ENOSPC if there is no close ++ position. If there is no close position it may not relocate. This takes care ++ of updating the parent node with the relocated block address. */ ++static int ++allocate_znode_update(znode * node, const coord_t *parent_coord, ++ flush_pos_t *pos) ++{ ++ int ret; ++ reiser4_block_nr blk; ++ lock_handle uber_lock; ++ int flush_reserved_used = 0; ++ int grabbed; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ init_lh(&uber_lock); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ grabbed = ctx->grabbed_blocks; ++ ++ /* discard e-flush allocation */ ++ ret = zload(node); ++ if (ret) ++ return ret; ++ ++ if (ZF_ISSET(node, JNODE_CREATED)) { ++ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node))); ++ pos->preceder.block_stage = BLOCK_UNALLOCATED; ++ } else { ++ pos->preceder.block_stage = BLOCK_GRABBED; ++ ++ /* The disk space for relocating the @node is already reserved ++ * in "flush reserved" counter if @node is leaf, otherwise we ++ * grab space using BA_RESERVED (means grab space from whole ++ * disk not from only 95%). */ ++ if (znode_get_level(node) == LEAF_LEVEL) { ++ /* ++ * earlier (during do_jnode_make_dirty()) we decided ++ * that @node can possibly go into overwrite set and ++ * reserved block for its wandering location. ++ */ ++ txn_atom *atom = get_current_atom_locked(); ++ assert("nikita-3449", ++ ZF_ISSET(node, JNODE_FLUSH_RESERVED)); ++ flush_reserved2grabbed(atom, (__u64) 1); ++ spin_unlock_atom(atom); ++ /* ++ * we are trying to move node into relocate ++ * set. Allocation of relocated position "uses" ++ * reserved block. ++ */ ++ ZF_CLR(node, JNODE_FLUSH_RESERVED); ++ flush_reserved_used = 1; ++ } else { ++ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED); ++ if (ret != 0) ++ goto exit; ++ } ++ } ++ ++ /* We may do not use 5% of reserved disk space here and flush will not ++ pack tightly. */ ++ ret = reiser4_alloc_block(&pos->preceder, &blk, ++ BA_FORMATTED | BA_PERMANENT); ++ if (ret) ++ goto exit; ++ ++ if (!ZF_ISSET(node, JNODE_CREATED) && ++ (ret = ++ reiser4_dealloc_block(znode_get_block(node), 0, ++ BA_DEFER | BA_FORMATTED))) ++ goto exit; ++ ++ if (likely(!znode_is_root(node))) { ++ item_plugin *iplug; ++ ++ iplug = item_plugin_by_coord(parent_coord); ++ assert("nikita-2954", iplug->f.update != NULL); ++ iplug->f.update(parent_coord, &blk); ++ ++ znode_make_dirty(parent_coord->node); ++ ++ } else { ++ reiser4_tree *tree = znode_get_tree(node); ++ znode *uber; ++ ++ /* We take a longterm lock on the fake node in order to change ++ the root block number. This may cause atom fusion. */ ++ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, ++ &uber_lock); ++ /* The fake node cannot be deleted, and we must have priority ++ here, and may not be confused with ENOSPC. 
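
The block-reservation states touched here for a relocated leaf, summarized (names as in the patch; the failure path appears in the exit code just below):

    do_jnode_make_dirty():  one block set aside in "flush reserved"
    this function:          flush_reserved -> grabbed
                            (flush_reserved2grabbed), then the grabbed
                            block is consumed by reiser4_alloc_block()
    on failure:             grabbed -> flush_reserved is restored
                            (grabbed2flush_reserved) so callers see no change
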
*/ ++ assert("jmacd-74412", ++ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC); ++ ++ if (ret) ++ goto exit; ++ ++ uber = uber_lock.node; ++ ++ write_lock_tree(tree); ++ tree->root_block = blk; ++ write_unlock_tree(tree); ++ ++ znode_make_dirty(uber); ++ } ++ ++ ret = znode_rehash(node, &blk); ++exit: ++ if (ret) { ++ /* Get flush reserved block back if something fails, because ++ * callers assume that on error block wasn't relocated and its ++ * flush reserved block wasn't used. */ ++ if (flush_reserved_used) { ++ /* ++ * ok, we failed to move node into relocate ++ * set. Restore status quo. ++ */ ++ grabbed2flush_reserved((__u64) 1); ++ ZF_SET(node, JNODE_FLUSH_RESERVED); ++ } ++ } ++ zrelse(node); ++ done_lh(&uber_lock); ++ grabbed2free_mark(grabbed); ++ return ret; ++} ++ ++/* JNODE INTERFACE */ ++ ++/* Lock a node (if formatted) and then get its parent locked, set the child's ++ coordinate in the parent. If the child is the root node, the above_root ++ znode is returned but the coord is not set. This function may cause atom ++ fusion, but it is only used for read locks (at this point) and therefore ++ fusion only occurs when the parent is already dirty. */ ++/* Hans adds this note: remember to ask how expensive this operation is vs. ++ storing parent pointer in jnodes. */ ++static int ++jnode_lock_parent_coord(jnode * node, ++ coord_t *coord, ++ lock_handle * parent_lh, ++ load_count * parent_zh, ++ znode_lock_mode parent_mode, int try) ++{ ++ int ret; ++ ++ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node)); ++ assert("edward-54", jnode_is_unformatted(node) ++ || znode_is_any_locked(JZNODE(node))); ++ ++ if (!jnode_is_znode(node)) { ++ reiser4_key key; ++ tree_level stop_level = TWIG_LEVEL; ++ lookup_bias bias = FIND_EXACT; ++ ++ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP)); ++ ++ /* The case when node is not znode, but can have parent coord ++ (unformatted node, node which represents cluster page, ++ etc..). Generate a key for the appropriate entry, search ++ in the tree using coord_by_key, which handles locking for ++ us. */ ++ ++ /* ++ * nothing is locked at this moment, so, nothing prevents ++ * concurrent truncate from removing jnode from inode. To ++ * prevent this spin-lock jnode. jnode can be truncated just ++ * after call to the jnode_build_key(), but this is ok, ++ * because coord_by_key() will just fail to find appropriate ++ * extent. 
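
In outline, jnode_lock_parent_coord() handles two cases; this is an illustrative summary of the surrounding code:

    unformatted jnode:  build a key under the jnode spin lock (so a
                        concurrent truncate shows up as JNODE_HEARD_BANSHEE),
                        then walk down with coord_by_key() to the twig (or
                        leaf, for cluster pages) and load the parent;
    formatted znode:    take the parent with reiser4_get_parent_flags(),
                        using a try-lock when the caller requests it, then
                        set the child's coord with find_child_ptr().
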
++ */ ++ spin_lock_jnode(node); ++ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { ++ jnode_build_key(node, &key); ++ ret = 0; ++ } else ++ ret = RETERR(-ENOENT); ++ spin_unlock_jnode(node); ++ ++ if (ret != 0) ++ return ret; ++ ++ if (jnode_is_cluster_page(node)) ++ stop_level = LEAF_LEVEL; ++ ++ assert("jmacd-1812", coord != NULL); ++ ++ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh, ++ parent_mode, bias, stop_level, stop_level, ++ CBK_UNIQUE, NULL/*ra_info */); ++ switch (ret) { ++ case CBK_COORD_NOTFOUND: ++ assert("edward-1038", ++ ergo(jnode_is_cluster_page(node), ++ JF_ISSET(node, JNODE_HEARD_BANSHEE))); ++ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) ++ warning("nikita-3177", "Parent not found"); ++ return ret; ++ case CBK_COORD_FOUND: ++ if (coord->between != AT_UNIT) { ++ /* FIXME: comment needed */ ++ done_lh(parent_lh); ++ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { ++ warning("nikita-3178", ++ "Found but not happy: %i", ++ coord->between); ++ } ++ return RETERR(-ENOENT); ++ } ++ ret = incr_load_count_znode(parent_zh, parent_lh->node); ++ if (ret != 0) ++ return ret; ++ /* if (jnode_is_cluster_page(node)) { ++ races with write() are possible ++ check_child_cluster (parent_lh->node); ++ } ++ */ ++ break; ++ default: ++ return ret; ++ } ++ ++ } else { ++ int flags; ++ znode *z; ++ ++ z = JZNODE(node); ++ /* Formatted node case: */ ++ assert("jmacd-2061", !znode_is_root(z)); ++ ++ flags = GN_ALLOW_NOT_CONNECTED; ++ if (try) ++ flags |= GN_TRY_LOCK; ++ ++ ret = ++ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags); ++ if (ret != 0) ++ /* -E_REPEAT is ok here, it is handled by the caller. */ ++ return ret; ++ ++ /* Make the child's position "hint" up-to-date. (Unless above ++ root, which caller must check.) */ ++ if (coord != NULL) { ++ ++ ret = incr_load_count_znode(parent_zh, parent_lh->node); ++ if (ret != 0) { ++ warning("jmacd-976812386", ++ "incr_load_count_znode failed: %d", ++ ret); ++ return ret; ++ } ++ ++ ret = find_child_ptr(parent_lh->node, z, coord); ++ if (ret != 0) { ++ warning("jmacd-976812", ++ "find_child_ptr failed: %d", ret); ++ return ret; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++/* Get the (locked) next neighbor of a znode which is dirty and a member of the ++ same atom. If there is no next neighbor or the neighbor is not in memory or ++ if there is a neighbor but it is not dirty or not in the same atom, ++ -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which ++ are not dirty, if so @check_dirty should be 0 */ ++static int neighbor_in_slum(znode * node, /* starting point */ ++ lock_handle * lock, /* lock on starting point */ ++ sideof side, /* left or right direction we ++ seek the next node in */ ++ znode_lock_mode mode, /* kind of lock we want */ ++ int check_dirty, /* true if the neighbor should ++ be dirty */ ++ int use_upper_levels /* get neighbor by going though ++ upper levels */) ++{ ++ int ret; ++ int flags; ++ ++ assert("jmacd-6334", znode_is_connected(node)); ++ ++ flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0); ++ if (use_upper_levels) ++ flags |= GN_CAN_USE_UPPER_LEVELS; ++ ++ ret = reiser4_get_neighbor(lock, node, mode, flags); ++ if (ret) { ++ /* May return -ENOENT or -E_NO_NEIGHBOR. 
*/ ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ if (ret == -ENOENT) ++ ret = RETERR(-E_NO_NEIGHBOR); ++ return ret; ++ } ++ if (!check_dirty) ++ return 0; ++ /* Check dirty bit of locked znode, no races here */ ++ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY)) ++ return 0; ++ ++ done_lh(lock); ++ return RETERR(-E_NO_NEIGHBOR); ++} ++ ++/* Return true if two znodes have the same parent. This is called with both ++ nodes write-locked (for squeezing) so no tree lock is needed. */ ++static int znode_same_parents(znode * a, znode * b) ++{ ++ int result; ++ ++ assert("jmacd-7011", znode_is_write_locked(a)); ++ assert("jmacd-7012", znode_is_write_locked(b)); ++ ++ /* We lock the whole tree for this check.... I really don't like whole ++ * tree locks... -Hans */ ++ read_lock_tree(znode_get_tree(a)); ++ result = (znode_parent(a) == znode_parent(b)); ++ read_unlock_tree(znode_get_tree(a)); ++ return result; ++} ++ ++/* FLUSH SCAN */ ++ ++/* Initialize the flush_scan data structure. */ ++static void scan_init(flush_scan * scan) ++{ ++ memset(scan, 0, sizeof(*scan)); ++ init_lh(&scan->node_lock); ++ init_lh(&scan->parent_lock); ++ init_load_count(&scan->parent_load); ++ init_load_count(&scan->node_load); ++ coord_init_invalid(&scan->parent_coord, NULL); ++} ++ ++/* Release any resources held by the flush scan, e.g. release locks, ++ free memory, etc. */ ++static void scan_done(flush_scan * scan) ++{ ++ done_load_count(&scan->node_load); ++ if (scan->node != NULL) { ++ jput(scan->node); ++ scan->node = NULL; ++ } ++ done_load_count(&scan->parent_load); ++ done_lh(&scan->parent_lock); ++ done_lh(&scan->node_lock); ++} ++ ++/* Returns true if flush scanning is finished. */ ++int reiser4_scan_finished(flush_scan * scan) ++{ ++ return scan->stop || (scan->direction == RIGHT_SIDE && ++ scan->count >= scan->max_count); ++} ++ ++/* Return true if the scan should continue to the @tonode. True if the node ++ meets the same_slum_check condition. If not, deref the "left" node and stop ++ the scan. */ ++int reiser4_scan_goto(flush_scan * scan, jnode * tonode) ++{ ++ int go = same_slum_check(scan->node, tonode, 1, 0); ++ ++ if (!go) { ++ scan->stop = 1; ++ jput(tonode); ++ } ++ ++ return go; ++} ++ ++/* Set the current scan->node, refcount it, increment count by the @add_count ++ (number to count, e.g., skipped unallocated nodes), deref previous current, ++ and copy the current parent coordinate. */ ++int ++scan_set_current(flush_scan * scan, jnode * node, unsigned add_count, ++ const coord_t *parent) ++{ ++ /* Release the old references, take the new reference. */ ++ done_load_count(&scan->node_load); ++ ++ if (scan->node != NULL) ++ jput(scan->node); ++ scan->node = node; ++ scan->count += add_count; ++ ++ /* This next stmt is somewhat inefficient. The reiser4_scan_extent() ++ code could delay this update step until it finishes and update the ++ parent_coord only once. It did that before, but there was a bug and ++ this was the easiest way to make it correct. */ ++ if (parent != NULL) ++ coord_dup(&scan->parent_coord, parent); ++ ++ /* Failure may happen at the incr_load_count call, but the caller can ++ assume the reference is safely taken. */ ++ return incr_load_count_jnode(&scan->node_load, node); ++} ++ ++/* Return true if scanning in the leftward direction. */ ++int reiser4_scanning_left(flush_scan * scan) ++{ ++ return scan->direction == LEFT_SIDE; ++} ++ ++/* Performs leftward scanning starting from either kind of node. Counts the ++ starting node. 
The right-scan object is passed in for the left-scan in order ++ to copy the parent of an unformatted starting position. This way we avoid ++ searching for the unformatted node's parent when scanning in each direction: ++ if we search for the parent once, it is set in both scan objects. The limit ++ parameter tells flush-scan when to stop. ++ ++ Rapid scanning is used only during scan_left, where we are interested in ++ finding the 'leftpoint' where we begin flushing. We are interested in ++ stopping at the left child of a twig that does not have a dirty left ++ neighbour. THIS IS A SPECIAL CASE. The problem is finding a way to flush only ++ those nodes without unallocated children, and it is difficult to solve in the ++ bottom-up flushing algorithm we are currently using. The problem can be ++ solved by scanning left at every level as we go upward, but this would ++ basically bring us back to using a top-down allocation strategy, which we ++ already tried (see BK history from May 2002), and has a different set of ++ problems. The top-down strategy makes avoiding unallocated children easier, ++ but makes it difficult to properly flush dirty children with clean parents ++ that would otherwise stop the top-down flush, only later to dirty the parent ++ once the children are flushed. So we solve the problem in the bottom-up ++ algorithm with a special case for twigs and leaves only. ++ ++ The first step in solving the problem is this rapid leftward scan. After we ++ determine that enough nodes have been counted to qualify for ++ FLUSH_RELOCATE_THRESHOLD, we are no longer interested in the exact count; we ++ are only interested in finding the best place to start the flush. ++ ++ We could choose one of two possibilities: ++ ++ 1. Stop at the leftmost child (of a twig) that does not have a dirty left ++ neighbor. This requires checking one leaf per rapid-scan twig. ++ ++ 2. Stop at the leftmost child (of a twig) where there are no dirty children ++ of the twig to the left. This requires checking possibly all of the in-memory ++ children of each twig during the rapid scan. ++ ++ For now we implement the first policy. ++*/ ++static int ++scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit) ++{ ++ int ret = 0; ++ ++ scan->max_count = limit; ++ scan->direction = LEFT_SIDE; ++ ++ ret = scan_set_current(scan, jref(node), 1, NULL); ++ if (ret != 0) ++ return ret; ++ ++ ret = scan_common(scan, right); ++ if (ret != 0) ++ return ret; ++ ++ /* Before rapid scanning, we need a lock on scan->node so that we can ++ get its parent, only if formatted. */ ++ if (jnode_is_znode(scan->node)) { ++ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node), ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); ++ } ++ ++ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD) ++ */ ++ return ret; ++} ++ ++/* Performs rightward scanning... Does not count the starting node. The limit ++ parameter is described in scan_left. If the starting node is unformatted then ++ the parent_coord was already set during scan_left. The rapid_after parameter ++ is not used during right-scanning. ++ ++ scan_right is only called if the scan_left operation does not count at least ++ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter ++ is set to the difference between scan-left's count and ++ FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as ++ FLUSH_RELOCATE_THRESHOLD and then stops. 
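
How the two scans are combined by the flush entry point is not part of this hunk; the glue below is an illustrative sketch built from the names and limits described above:

    scan_left(&left, &right, start, FLUSH_SCAN_MAXNODES);
    if (left.count < FLUSH_RELOCATE_THRESHOLD)
            scan_right(&right, start,
                       FLUSH_RELOCATE_THRESHOLD - left.count);
    leaf_relocate = (left.count + right.count >= FLUSH_RELOCATE_THRESHOLD);
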
*/ ++static int scan_right(flush_scan * scan, jnode * node, unsigned limit) ++{ ++ int ret; ++ ++ scan->max_count = limit; ++ scan->direction = RIGHT_SIDE; ++ ++ ret = scan_set_current(scan, jref(node), 0, NULL); ++ if (ret != 0) ++ return ret; ++ ++ return scan_common(scan, NULL); ++} ++ ++/* Common code to perform left or right scanning. */ ++static int scan_common(flush_scan * scan, flush_scan * other) ++{ ++ int ret; ++ ++ assert("nikita-2376", scan->node != NULL); ++ assert("edward-54", jnode_is_unformatted(scan->node) ++ || jnode_is_znode(scan->node)); ++ ++ /* Special case for starting at an unformatted node. Optimization: we ++ only want to search for the parent (which requires a tree traversal) ++ once. Obviously, we shouldn't have to call it once for the left scan ++ and once for the right scan. For this reason, if we search for the ++ parent during scan-left we then duplicate the coord/lock/load into ++ the scan-right object. */ ++ if (jnode_is_unformatted(scan->node)) { ++ ret = scan_unformatted(scan, other); ++ if (ret != 0) ++ return ret; ++ } ++ /* This loop expects to start at a formatted position and performs ++ chaining of formatted regions */ ++ while (!reiser4_scan_finished(scan)) { ++ ++ ret = scan_formatted(scan); ++ if (ret != 0) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int scan_unformatted(flush_scan * scan, flush_scan * other) ++{ ++ int ret = 0; ++ int try = 0; ++ ++ if (!coord_is_invalid(&scan->parent_coord)) ++ goto scan; ++ ++ /* set parent coord from */ ++ if (!jnode_is_unformatted(scan->node)) { ++ /* formatted position */ ++ ++ lock_handle lock; ++ assert("edward-301", jnode_is_znode(scan->node)); ++ init_lh(&lock); ++ ++ /* ++ * when flush starts from unformatted node, first thing it ++ * does is tree traversal to find formatted parent of starting ++ * node. This parent is then kept lock across scans to the ++ * left and to the right. This means that during scan to the ++ * left we cannot take left-ward lock, because this is ++ * dead-lock prone. So, if we are scanning to the left and ++ * there is already lock held by this thread, ++ * jnode_lock_parent_coord() should use try-lock. ++ */ ++ try = reiser4_scanning_left(scan) ++ && !lock_stack_isclean(get_current_lock_stack()); ++ /* Need the node locked to get the parent lock, We have to ++ take write lock since there is at least one call path ++ where this znode is already write-locked by us. */ ++ ret = ++ longterm_lock_znode(&lock, JZNODE(scan->node), ++ ZNODE_WRITE_LOCK, ++ reiser4_scanning_left(scan) ? ++ ZNODE_LOCK_LOPRI : ++ ZNODE_LOCK_HIPRI); ++ if (ret != 0) ++ /* EINVAL or E_DEADLOCK here mean... try again! At this ++ point we've scanned too far and can't back out, just ++ start over. */ ++ return ret; ++ ++ ret = jnode_lock_parent_coord(scan->node, ++ &scan->parent_coord, ++ &scan->parent_lock, ++ &scan->parent_load, ++ ZNODE_WRITE_LOCK, try); ++ ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ done_lh(&lock); ++ if (ret == -E_REPEAT) { ++ scan->stop = 1; ++ return 0; ++ } ++ if (ret) ++ return ret; ++ ++ } else { ++ /* unformatted position */ ++ ++ ret = ++ jnode_lock_parent_coord(scan->node, &scan->parent_coord, ++ &scan->parent_lock, ++ &scan->parent_load, ++ ZNODE_WRITE_LOCK, try); ++ ++ if (IS_CBKERR(ret)) ++ return ret; ++ ++ if (ret == CBK_COORD_NOTFOUND) ++ /* FIXME(C): check EINVAL, E_DEADLOCK */ ++ return ret; ++ ++ /* parent was found */ ++ assert("jmacd-8661", other != NULL); ++ /* Duplicate the reference into the other flush_scan. 
*/ ++ coord_dup(&other->parent_coord, &scan->parent_coord); ++ copy_lh(&other->parent_lock, &scan->parent_lock); ++ copy_load_count(&other->parent_load, &scan->parent_load); ++ } ++scan: ++ return scan_by_coord(scan); ++} ++ ++/* Performs left- or rightward scanning starting from a formatted node. Follow ++ left pointers under tree lock as long as: ++ ++ - node->left/right is non-NULL ++ - node->left/right is connected, dirty ++ - node->left/right belongs to the same atom ++ - scan has not reached maximum count ++*/ ++static int scan_formatted(flush_scan * scan) ++{ ++ int ret; ++ znode *neighbor = NULL; ++ ++ assert("jmacd-1401", !reiser4_scan_finished(scan)); ++ ++ do { ++ znode *node = JZNODE(scan->node); ++ ++ /* Node should be connected, but if not stop the scan. */ ++ if (!znode_is_connected(node)) { ++ scan->stop = 1; ++ break; ++ } ++ ++ /* Lock the tree, check-for and reference the next sibling. */ ++ read_lock_tree(znode_get_tree(node)); ++ ++ /* It may be that a node is inserted or removed between a node ++ and its left sibling while the tree lock is released, but the ++ flush-scan count does not need to be precise. Thus, we ++ release the tree lock as soon as we get the neighboring node. ++ */ ++ neighbor = ++ reiser4_scanning_left(scan) ? node->left : node->right; ++ if (neighbor != NULL) ++ zref(neighbor); ++ ++ read_unlock_tree(znode_get_tree(node)); ++ ++ /* If neighbor is NULL at the leaf level, need to check for an ++ unformatted sibling using the parent--break in any case. */ ++ if (neighbor == NULL) ++ break; ++ ++ /* Check the condition for going left, break if it is not met. ++ This also releases (jputs) the neighbor if false. */ ++ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) ++ break; ++ ++ /* Advance the flush_scan state to the left, repeat. */ ++ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL); ++ if (ret != 0) ++ return ret; ++ ++ } while (!reiser4_scan_finished(scan)); ++ ++ /* If neighbor is NULL then we reached the end of a formatted region, or ++ else the sibling is out of memory, now check for an extent to the ++ left (as long as LEAF_LEVEL). */ ++ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL ++ || reiser4_scan_finished(scan)) { ++ scan->stop = 1; ++ return 0; ++ } ++ /* Otherwise, calls scan_by_coord for the right(left)most item of the ++ left(right) neighbor on the parent level, then possibly continue. */ ++ ++ coord_init_invalid(&scan->parent_coord, NULL); ++ return scan_unformatted(scan, NULL); ++} ++ ++/* NOTE-EDWARD: ++ This scans adjacent items of the same type and calls scan flush plugin for ++ each one. Performs left(right)ward scanning starting from a (possibly) ++ unformatted node. If we start from unformatted node, then we continue only if ++ the next neighbor is also unformatted. When called from scan_formatted, we ++ skip first iteration (to make sure that right(left)most item of the ++ left(right) neighbor on the parent level is of the same type and set ++ appropriate coord). */ ++static int scan_by_coord(flush_scan * scan) ++{ ++ int ret = 0; ++ int scan_this_coord; ++ lock_handle next_lock; ++ load_count next_load; ++ coord_t next_coord; ++ jnode *child; ++ item_plugin *iplug; ++ ++ init_lh(&next_lock); ++ init_load_count(&next_load); ++ scan_this_coord = (jnode_is_unformatted(scan->node) ? 
1 : 0); ++ ++ /* set initial item id */ ++ iplug = item_plugin_by_coord(&scan->parent_coord); ++ ++ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) { ++ if (scan_this_coord) { ++ /* Here we expect the unit to be scannable. It might not ++ * be, due to a race with extent->tail conversion. */ ++ if (iplug->f.scan == NULL) { ++ scan->stop = 1; ++ ret = -E_REPEAT; ++ /* skip the check at the end. */ ++ goto race; ++ } ++ ++ ret = iplug->f.scan(scan); ++ if (ret != 0) ++ goto exit; ++ ++ if (reiser4_scan_finished(scan)) { ++ checkchild(scan); ++ break; ++ } ++ } else { ++ /* the same race against truncate as above is possible ++ * here, it seems */ ++ ++ /* NOTE-JMACD: In this case, apply the same end-of-node ++ logic but don't scan the first coordinate. */ ++ assert("jmacd-1231", ++ item_is_internal(&scan->parent_coord)); ++ } ++ ++ if (iplug->f.utmost_child == NULL ++ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) { ++ /* stop at this coord and continue on the parent level */ ++ ret = ++ scan_set_current(scan, ++ ZJNODE(zref ++ (scan->parent_coord.node)), ++ 1, NULL); ++ if (ret != 0) ++ goto exit; ++ break; ++ } ++ ++ /* Either way, the invariant is that scan->parent_coord is set ++ to the parent of scan->node. Now get the next unit. */ ++ coord_dup(&next_coord, &scan->parent_coord); ++ coord_sideof_unit(&next_coord, scan->direction); ++ ++ /* If off-the-end of the twig, try the next twig. */ ++ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) { ++ /* We take the write lock because we may start flushing ++ * from this coordinate. */ ++ ret = neighbor_in_slum(next_coord.node, ++ &next_lock, ++ scan->direction, ++ ZNODE_WRITE_LOCK, ++ 1 /* check dirty */, ++ 0 /* don't go through upper ++ levels */); ++ if (ret == -E_NO_NEIGHBOR) { ++ scan->stop = 1; ++ ret = 0; ++ break; ++ } ++ ++ if (ret != 0) ++ goto exit; ++ ++ ret = incr_load_count_znode(&next_load, next_lock.node); ++ if (ret != 0) ++ goto exit; ++ ++ coord_init_sideof_unit(&next_coord, next_lock.node, ++ sideof_reverse(scan->direction)); ++ } ++ ++ iplug = item_plugin_by_coord(&next_coord); ++ ++ /* Get the next child. */ ++ ret = ++ iplug->f.utmost_child(&next_coord, ++ sideof_reverse(scan->direction), ++ &child); ++ if (ret != 0) ++ goto exit; ++ /* If the next child is not in memory, or item_utmost_child ++ failed (most probably due to a race with unlink), stop ++ here. */ ++ if (child == NULL || IS_ERR(child)) { ++ scan->stop = 1; ++ checkchild(scan); ++ break; ++ } ++ ++ assert("nikita-2374", jnode_is_unformatted(child) ++ || jnode_is_znode(child)); ++ ++ /* See if it is dirty, part of the same atom. */ ++ if (!reiser4_scan_goto(scan, child)) { ++ checkchild(scan); ++ break; ++ } ++ ++ /* If so, make this child current. */ ++ ret = scan_set_current(scan, child, 1, &next_coord); ++ if (ret != 0) ++ goto exit; ++ ++ /* Now continue. If formatted we release the parent lock and ++ return, then proceed. */ ++ if (jnode_is_znode(child)) ++ break; ++ ++ /* Otherwise, repeat the above loop with next_coord. 
*/ ++ if (next_load.node != NULL) { ++ done_lh(&scan->parent_lock); ++ move_lh(&scan->parent_lock, &next_lock); ++ move_load_count(&scan->parent_load, &next_load); ++ } ++ } ++ ++ assert("jmacd-6233", ++ reiser4_scan_finished(scan) || jnode_is_znode(scan->node)); ++exit: ++ checkchild(scan); ++race: /* skip the above check */ ++ if (jnode_is_znode(scan->node)) { ++ done_lh(&scan->parent_lock); ++ done_load_count(&scan->parent_load); ++ } ++ ++ done_load_count(&next_load); ++ done_lh(&next_lock); ++ return ret; ++} ++ ++/* FLUSH POS HELPERS */ ++ ++/* Initialize the fields of a flush_position. */ ++static void pos_init(flush_pos_t *pos) ++{ ++ memset(pos, 0, sizeof *pos); ++ ++ pos->state = POS_INVALID; ++ coord_init_invalid(&pos->coord, NULL); ++ init_lh(&pos->lock); ++ init_load_count(&pos->load); ++ ++ reiser4_blocknr_hint_init(&pos->preceder); ++} ++ ++/* The flush loop inside squalloc periodically checks pos_valid to determine ++ when "enough flushing" has been performed. This will return true until one ++ of the following conditions is met: ++ ++ 1. the number of flush-queued nodes has reached the kernel-supplied ++ "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the ++ kernel requested. When flushing to commit, this parameter is NULL. ++ ++ 2. pos_stop() is called because squalloc discovers that the "next" node in ++ the flush order is either non-existant, not dirty, or not in the same atom. ++*/ ++ ++static int pos_valid(flush_pos_t *pos) ++{ ++ return pos->state != POS_INVALID; ++} ++ ++/* Release any resources of a flush_position. Called when jnode_flush ++ finishes. */ ++static void pos_done(flush_pos_t *pos) ++{ ++ pos_stop(pos); ++ reiser4_blocknr_hint_done(&pos->preceder); ++ if (convert_data(pos)) ++ free_convert_data(pos); ++} ++ ++/* Reset the point and parent. Called during flush subroutines to terminate the ++ squalloc loop. */ ++static int pos_stop(flush_pos_t *pos) ++{ ++ pos->state = POS_INVALID; ++ done_lh(&pos->lock); ++ done_load_count(&pos->load); ++ coord_init_invalid(&pos->coord, NULL); ++ ++ if (pos->child) { ++ jput(pos->child); ++ pos->child = NULL; ++ } ++ ++ return 0; ++} ++ ++/* Return the flush_position's block allocator hint. */ ++reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos) ++{ ++ return &pos->preceder; ++} ++ ++flush_queue_t *reiser4_pos_fq(flush_pos_t *pos) ++{ ++ return pos->fq; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 90 ++ LocalWords: preceder ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/flush.h linux-2.6.33/fs/reiser4/flush.h +--- linux-2.6.33.orig/fs/reiser4/flush.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/flush.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,300 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* DECLARATIONS: */ ++ ++#if !defined(__REISER4_FLUSH_H__) ++#define __REISER4_FLUSH_H__ ++ ++#include "plugin/cluster.h" ++ ++/* The flush_scan data structure maintains the state of an in-progress ++ flush-scan on a single level of the tree. A flush-scan is used for counting ++ the number of adjacent nodes to flush, which is used to determine whether we ++ should relocate, and it is also used to find a starting point for flush. A ++ flush-scan object can scan in both right and left directions via the ++ scan_left() and scan_right() interfaces. The right- and left-variations are ++ similar but perform different functions. 
When scanning left we (optionally ++ perform rapid scanning and then) longterm-lock the endpoint node. When ++ scanning right we are simply counting the number of adjacent, dirty nodes. */ ++struct flush_scan { ++ ++ /* The current number of nodes scanned on this level. */ ++ unsigned count; ++ ++ /* There may be a maximum number of nodes for a scan on any single ++ level. When going leftward, max_count is determined by ++ FLUSH_SCAN_MAXNODES (see reiser4.h) */ ++ unsigned max_count; ++ ++ /* Direction: Set to one of the sideof enumeration: ++ { LEFT_SIDE, RIGHT_SIDE }. */ ++ sideof direction; ++ ++ /* Initially @stop is set to false then set true once some condition ++ stops the search (e.g., we found a clean node before reaching ++ max_count or we found a node belonging to another atom). */ ++ int stop; ++ ++ /* The current scan position. If @node is non-NULL then its reference ++ count has been incremented to reflect this reference. */ ++ jnode *node; ++ ++ /* A handle for zload/zrelse of current scan position node. */ ++ load_count node_load; ++ ++ /* During left-scan, if the final position (a.k.a. endpoint node) is ++ formatted the node is locked using this lock handle. The endpoint ++ needs to be locked for transfer to the flush_position object after ++ scanning finishes. */ ++ lock_handle node_lock; ++ ++ /* When the position is unformatted, its parent, coordinate, and parent ++ zload/zrelse handle. */ ++ lock_handle parent_lock; ++ coord_t parent_coord; ++ load_count parent_load; ++ ++ /* The block allocator preceder hint. Sometimes flush_scan determines ++ what the preceder is and if so it sets it here, after which it is ++ copied into the flush_position. Otherwise, the preceder is computed ++ later. */ ++ reiser4_block_nr preceder_blk; ++}; ++ ++struct convert_item_info { ++ dc_item_stat d_cur; /* disk cluster state of the current item */ ++ dc_item_stat d_next; /* disk cluster state of the next slum item */ ++ int cluster_shift; /* disk cluster shift */ ++ flow_t flow; /* disk cluster data */ ++}; ++ ++struct convert_info { ++ int count; /* for squalloc terminating */ ++ item_plugin *iplug; /* current item plugin */ ++ struct convert_item_info *itm; /* current item info */ ++ struct cluster_handle clust; /* transform cluster */ ++}; ++ ++typedef enum flush_position_state { ++ POS_INVALID, /* Invalid or stopped pos, do not continue slum ++ * processing */ ++ POS_ON_LEAF, /* pos points to already prepped, locked ++ * formatted node at leaf level */ ++ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field ++ * is used to traverse unformatted nodes */ ++ POS_TO_LEAF, /* pos is being moved to leaf level */ ++ POS_TO_TWIG, /* pos is being moved to twig level */ ++ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is ++ * after rightmost unit of the current twig */ ++ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal ++ * node */ ++} flushpos_state_t; ++ ++/* An encapsulation of the current flush point and all the parameters that are ++ passed through the entire squeeze-and-allocate stage of the flush routine. ++ A single flush_position object is constructed after left- and right-scanning ++ finishes. 
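
An illustrative handoff from scanning to squeeze-and-allocate (a sketch; the driver itself is earlier in flush.c and not part of this hunk):

    flush_scan left, right;                  (scan_init() both)
    scan_left(&left, &right, start, limit);  (then optionally scan_right())
    set pos->leaf_relocate from the combined counts;
    move the left endpoint's lock and load into the flush_position;
    scan_done() both, then run squalloc(pos).
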
*/ ++struct flush_position { ++ flushpos_state_t state; ++ ++ coord_t coord; /* coord to traverse unformatted nodes */ ++ lock_handle lock; /* current lock we hold */ ++ load_count load; /* load status for current locked formatted node ++ */ ++ jnode *child; /* for passing a reference to unformatted child ++ * across pos state changes */ ++ ++ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */ ++ int leaf_relocate; /* True if enough leaf-level nodes were ++ * found to suggest a relocate policy. */ ++ int alloc_cnt; /* The number of nodes allocated during squeeze ++ and allococate. */ ++ int prep_or_free_cnt; /* The number of nodes prepared for write ++ (allocate) or squeezed and freed. */ ++ flush_queue_t *fq; ++ long *nr_written; /* number of nodes submitted to disk */ ++ int flags; /* a copy of jnode_flush flags argument */ ++ ++ znode *prev_twig; /* previous parent pointer value, used to catch ++ * processing of new twig node */ ++ struct convert_info *sq; /* convert info */ ++ ++ unsigned long pos_in_unit; /* for extents only. Position ++ within an extent unit of first ++ jnode of slum */ ++ long nr_to_write; /* number of unformatted nodes to handle on ++ flush */ ++}; ++ ++static inline int item_convert_count(flush_pos_t *pos) ++{ ++ return pos->sq->count; ++} ++static inline void inc_item_convert_count(flush_pos_t *pos) ++{ ++ pos->sq->count++; ++} ++static inline void set_item_convert_count(flush_pos_t *pos, int count) ++{ ++ pos->sq->count = count; ++} ++static inline item_plugin *item_convert_plug(flush_pos_t *pos) ++{ ++ return pos->sq->iplug; ++} ++ ++static inline struct convert_info *convert_data(flush_pos_t *pos) ++{ ++ return pos->sq; ++} ++ ++static inline struct convert_item_info *item_convert_data(flush_pos_t *pos) ++{ ++ assert("edward-955", convert_data(pos)); ++ return pos->sq->itm; ++} ++ ++static inline struct tfm_cluster *tfm_cluster_sq(flush_pos_t *pos) ++{ ++ return &pos->sq->clust.tc; ++} ++ ++static inline struct tfm_stream *tfm_stream_sq(flush_pos_t *pos, ++ tfm_stream_id id) ++{ ++ assert("edward-854", pos->sq != NULL); ++ return get_tfm_stream(tfm_cluster_sq(pos), id); ++} ++ ++static inline int chaining_data_present(flush_pos_t *pos) ++{ ++ return convert_data(pos) && item_convert_data(pos); ++} ++ ++/* Returns true if next node contains next item of the disk cluster ++ so item convert data should be moved to the right slum neighbor. 
++*/ ++static inline int should_chain_next_node(flush_pos_t *pos) ++{ ++ int result = 0; ++ ++ assert("edward-1007", chaining_data_present(pos)); ++ ++ switch (item_convert_data(pos)->d_next) { ++ case DC_CHAINED_ITEM: ++ result = 1; ++ break; ++ case DC_AFTER_CLUSTER: ++ break; ++ default: ++ impossible("edward-1009", "bad state of next slum item"); ++ } ++ return result; ++} ++ ++/* update item state in a disk cluster to assign conversion mode */ ++static inline void ++move_chaining_data(flush_pos_t *pos, int this_node/* where is next item */) ++{ ++ ++ assert("edward-1010", chaining_data_present(pos)); ++ ++ if (this_node == 0) { ++ /* next item is on the right neighbor */ ++ assert("edward-1011", ++ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || ++ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); ++ assert("edward-1012", ++ item_convert_data(pos)->d_next == DC_CHAINED_ITEM); ++ ++ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM; ++ item_convert_data(pos)->d_next = DC_INVALID_STATE; ++ } else { ++ /* next item is on the same node */ ++ assert("edward-1013", ++ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || ++ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); ++ assert("edward-1227", ++ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER || ++ item_convert_data(pos)->d_next == DC_INVALID_STATE); ++ ++ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER; ++ item_convert_data(pos)->d_next = DC_INVALID_STATE; ++ } ++} ++ ++static inline int should_convert_node(flush_pos_t *pos, znode * node) ++{ ++ return znode_convertible(node); ++} ++ ++/* true if there is attached convert item info */ ++static inline int should_convert_next_node(flush_pos_t *pos) ++{ ++ return convert_data(pos) && item_convert_data(pos); ++} ++ ++#define SQUALLOC_THRESHOLD 256 ++ ++static inline int should_terminate_squalloc(flush_pos_t *pos) ++{ ++ return convert_data(pos) && ++ !item_convert_data(pos) && ++ item_convert_count(pos) >= SQUALLOC_THRESHOLD; ++} ++ ++#if 1 ++#define check_convert_info(pos) \ ++do { \ ++ if (unlikely(should_convert_next_node(pos))) { \ ++ warning("edward-1006", "unprocessed chained data"); \ ++ printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \ ++ item_convert_data(pos)->d_cur, \ ++ item_convert_data(pos)->d_next, \ ++ item_convert_data(pos)->flow.length); \ ++ } \ ++} while (0) ++#else ++#define check_convert_info(pos) ++#endif /* REISER4_DEBUG */ ++ ++void free_convert_data(flush_pos_t *pos); ++/* used in extent.c */ ++int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size, ++ const coord_t *parent); ++int reiser4_scan_finished(flush_scan * scan); ++int reiser4_scanning_left(flush_scan * scan); ++int reiser4_scan_goto(flush_scan * scan, jnode * tonode); ++txn_atom *atom_locked_by_fq(flush_queue_t *fq); ++int reiser4_alloc_extent(flush_pos_t *flush_pos); ++squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *, ++ reiser4_key *stop_key); ++extern int reiser4_init_fqs(void); ++extern void reiser4_done_fqs(void); ++ ++#if REISER4_DEBUG ++ ++extern void reiser4_check_fq(const txn_atom *atom); ++extern atomic_t flush_cnt; ++ ++#define check_preceder(blk) \ ++assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb())); ++extern void check_pos(flush_pos_t *pos); ++#else ++#define check_preceder(b) noop ++#define check_pos(pos) noop ++#endif ++ ++/* __REISER4_FLUSH_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 90 ++ LocalWords: preceder ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/flush_queue.c linux-2.6.33/fs/reiser4/flush_queue.c +--- linux-2.6.33.orig/fs/reiser4/flush_queue.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/flush_queue.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,678 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++#include "debug.h" ++#include "super.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "page_cache.h" ++#include "wander.h" ++#include "vfs_ops.h" ++#include "writeout.h" ++#include "flush.h" ++ ++#include <linux/bio.h> ++#include <linux/mm.h> ++#include <linux/pagemap.h> ++#include <linux/blkdev.h> ++#include <linux/writeback.h> ++ ++/* A flush queue object is an accumulator for keeping jnodes prepared ++ by the jnode_flush() function for writing to disk. Those "queued" jnodes are ++ kept on the flush queue until memory pressure or atom commit asks ++ flush queues to write some or all from their jnodes. */ ++ ++/* ++ LOCKING: ++ ++ fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped ++ list protected by atom spin lock. fq->prepped list uses the following ++ locking: ++ ++ two ways to protect fq->prepped list for read-only list traversal: ++ ++ 1. atom spin-lock atom. ++ 2. fq is IN_USE, atom->nr_running_queues increased. ++ ++ and one for list modification: ++ ++ 1. atom is spin-locked and one condition is true: fq is IN_USE or ++ atom->nr_running_queues == 0. ++ ++ The deadlock-safe order for flush queues and atoms is: first lock atom, then ++ lock flush queue, then lock jnode. ++*/ ++ ++#define fq_in_use(fq) ((fq)->state & FQ_IN_USE) ++#define fq_ready(fq) (!fq_in_use(fq)) ++ ++#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0) ++#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0) ++ ++/* get lock on atom from locked flush queue object */ ++static txn_atom *atom_locked_by_fq_nolock(flush_queue_t *fq) ++{ ++ /* This code is similar to jnode_get_atom(), look at it for the ++ * explanation. */ ++ txn_atom *atom; ++ ++ assert_spin_locked(&(fq->guard)); ++ ++ while (1) { ++ atom = fq->atom; ++ if (atom == NULL) ++ break; ++ ++ if (spin_trylock_atom(atom)) ++ break; ++ ++ atomic_inc(&atom->refcount); ++ spin_unlock(&(fq->guard)); ++ spin_lock_atom(atom); ++ spin_lock(&(fq->guard)); ++ ++ if (fq->atom == atom) { ++ atomic_dec(&atom->refcount); ++ break; ++ } ++ ++ spin_unlock(&(fq->guard)); ++ atom_dec_and_unlock(atom); ++ spin_lock(&(fq->guard)); ++ } ++ ++ return atom; ++} ++ ++txn_atom *atom_locked_by_fq(flush_queue_t *fq) ++{ ++ txn_atom *atom; ++ ++ spin_lock(&(fq->guard)); ++ atom = atom_locked_by_fq_nolock(fq); ++ spin_unlock(&(fq->guard)); ++ return atom; ++} ++ ++static void init_fq(flush_queue_t *fq) ++{ ++ memset(fq, 0, sizeof *fq); ++ ++ atomic_set(&fq->nr_submitted, 0); ++ ++ INIT_LIST_HEAD(ATOM_FQ_LIST(fq)); ++ ++ init_waitqueue_head(&fq->wait); ++ spin_lock_init(&fq->guard); ++} ++ ++/* slab for flush queues */ ++static struct kmem_cache *fq_slab; ++ ++/** ++ * reiser4_init_fqs - create flush queue cache ++ * ++ * Initializes slab cache of flush queues. It is part of reiser4 module ++ * initialization. 
++ */ ++int reiser4_init_fqs(void) ++{ ++ fq_slab = kmem_cache_create("fq", ++ sizeof(flush_queue_t), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (fq_slab == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * reiser4_done_fqs - delete flush queue cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void reiser4_done_fqs(void) ++{ ++ destroy_reiser4_cache(&fq_slab); ++} ++ ++/* create new flush queue object */ ++static flush_queue_t *create_fq(gfp_t gfp) ++{ ++ flush_queue_t *fq; ++ ++ fq = kmem_cache_alloc(fq_slab, gfp); ++ if (fq) ++ init_fq(fq); ++ ++ return fq; ++} ++ ++/* adjust atom's and flush queue's counters of queued nodes */ ++static void count_enqueued_node(flush_queue_t *fq) ++{ ++ ON_DEBUG(fq->atom->num_queued++); ++} ++ ++static void count_dequeued_node(flush_queue_t *fq) ++{ ++ assert("zam-993", fq->atom->num_queued > 0); ++ ON_DEBUG(fq->atom->num_queued--); ++} ++ ++/* attach flush queue object to the atom */ ++static void attach_fq(txn_atom *atom, flush_queue_t *fq) ++{ ++ assert_spin_locked(&(atom->alock)); ++ list_add(&fq->alink, &atom->flush_queues); ++ fq->atom = atom; ++ ON_DEBUG(atom->nr_flush_queues++); ++} ++ ++static void detach_fq(flush_queue_t *fq) ++{ ++ assert_spin_locked(&(fq->atom->alock)); ++ ++ spin_lock(&(fq->guard)); ++ list_del_init(&fq->alink); ++ assert("vs-1456", fq->atom->nr_flush_queues > 0); ++ ON_DEBUG(fq->atom->nr_flush_queues--); ++ fq->atom = NULL; ++ spin_unlock(&(fq->guard)); ++} ++ ++/* destroy flush queue object */ ++static void done_fq(flush_queue_t *fq) ++{ ++ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq))); ++ assert("zam-766", atomic_read(&fq->nr_submitted) == 0); ++ ++ kmem_cache_free(fq_slab, fq); ++} ++ ++/* */ ++static void mark_jnode_queued(flush_queue_t *fq, jnode * node) ++{ ++ JF_SET(node, JNODE_FLUSH_QUEUED); ++ count_enqueued_node(fq); ++} ++ ++/* Putting jnode into the flush queue. Both atom and jnode should be ++ spin-locked. */ ++void queue_jnode(flush_queue_t *fq, jnode * node) ++{ ++ assert_spin_locked(&(node->guard)); ++ assert("zam-713", node->atom != NULL); ++ assert_spin_locked(&(node->atom->alock)); ++ assert("zam-716", fq->atom != NULL); ++ assert("zam-717", fq->atom == node->atom); ++ assert("zam-907", fq_in_use(fq)); ++ ++ assert("zam-714", JF_ISSET(node, JNODE_DIRTY)); ++ assert("zam-826", JF_ISSET(node, JNODE_RELOC)); ++ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); ++ assert("vs-1481", NODE_LIST(node) != FQ_LIST); ++ ++ mark_jnode_queued(fq, node); ++ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq)); ++ ++ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), ++ FQ_LIST, 1)); ++} ++ ++/* repeatable process for waiting io completion on a flush queue object */ ++static int wait_io(flush_queue_t *fq, int *nr_io_errors) ++{ ++ assert("zam-738", fq->atom != NULL); ++ assert_spin_locked(&(fq->atom->alock)); ++ assert("zam-736", fq_in_use(fq)); ++ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq))); ++ ++ if (atomic_read(&fq->nr_submitted) != 0) { ++ struct super_block *super; ++ ++ spin_unlock_atom(fq->atom); ++ ++ assert("nikita-3013", reiser4_schedulable()); ++ ++ super = reiser4_get_current_sb(); ++ ++ /* FIXME: this is instead of blk_run_queues() */ ++ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping); ++ ++ if (!(super->s_flags & MS_RDONLY)) ++ wait_event(fq->wait, ++ atomic_read(&fq->nr_submitted) == 0); ++ ++ /* Ask the caller to re-acquire the locks and call this ++ function again. 
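++
++	   For illustration (editorial sketch; compare the caller
++	   current_atom_finish_all_fq() below), the retry loop driven by
++	   -E_REPEAT typically has the shape
++
++		do {
++			atom = get_current_atom_locked();
++			ret = finish_all_fq(atom, &nr_io_errors);
++		} while (ret == -E_REPEAT);
++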
Note: this technique is commonly used in ++ the txnmgr code. */ ++ return -E_REPEAT; ++ } ++ ++ *nr_io_errors += atomic_read(&fq->nr_errors); ++ return 0; ++} ++ ++/* wait on I/O completion, re-submit dirty nodes to write */ ++static int finish_fq(flush_queue_t *fq, int *nr_io_errors) ++{ ++ int ret; ++ txn_atom *atom = fq->atom; ++ ++ assert("zam-801", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("zam-762", fq_in_use(fq)); ++ ++ ret = wait_io(fq, nr_io_errors); ++ if (ret) ++ return ret; ++ ++ detach_fq(fq); ++ done_fq(fq); ++ ++ reiser4_atom_send_event(atom); ++ ++ return 0; ++} ++ ++/* wait for all i/o for given atom to be completed, actually do one iteration ++ on that and return -E_REPEAT if there more iterations needed */ ++static int finish_all_fq(txn_atom * atom, int *nr_io_errors) ++{ ++ flush_queue_t *fq; ++ ++ assert_spin_locked(&(atom->alock)); ++ ++ if (list_empty_careful(&atom->flush_queues)) ++ return 0; ++ ++ list_for_each_entry(fq, &atom->flush_queues, alink) { ++ if (fq_ready(fq)) { ++ int ret; ++ ++ mark_fq_in_use(fq); ++ assert("vs-1247", fq->owner == NULL); ++ ON_DEBUG(fq->owner = current); ++ ret = finish_fq(fq, nr_io_errors); ++ ++ if (*nr_io_errors) ++ reiser4_handle_error(); ++ ++ if (ret) { ++ reiser4_fq_put(fq); ++ return ret; ++ } ++ ++ spin_unlock_atom(atom); ++ ++ return -E_REPEAT; ++ } ++ } ++ ++ /* All flush queues are in use; atom remains locked */ ++ return -EBUSY; ++} ++ ++/* wait all i/o for current atom */ ++int current_atom_finish_all_fq(void) ++{ ++ txn_atom *atom; ++ int nr_io_errors = 0; ++ int ret = 0; ++ ++ do { ++ while (1) { ++ atom = get_current_atom_locked(); ++ ret = finish_all_fq(atom, &nr_io_errors); ++ if (ret != -EBUSY) ++ break; ++ reiser4_atom_wait_event(atom); ++ } ++ } while (ret == -E_REPEAT); ++ ++ /* we do not need locked atom after this function finishes, SUCCESS or ++ -EBUSY are two return codes when atom remains locked after ++ finish_all_fq */ ++ if (!ret) ++ spin_unlock_atom(atom); ++ ++ assert_spin_not_locked(&(atom->alock)); ++ ++ if (ret) ++ return ret; ++ ++ if (nr_io_errors) ++ return RETERR(-EIO); ++ ++ return 0; ++} ++ ++/* change node->atom field for all jnode from given list */ ++static void ++scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom) ++{ ++ jnode *cur; ++ ++ list_for_each_entry(cur, list, capture_link) { ++ spin_lock_jnode(cur); ++ cur->atom = atom; ++ spin_unlock_jnode(cur); ++ } ++} ++ ++/* support for atom fusion operation */ ++void reiser4_fuse_fq(txn_atom *to, txn_atom *from) ++{ ++ flush_queue_t *fq; ++ ++ assert_spin_locked(&(to->alock)); ++ assert_spin_locked(&(from->alock)); ++ ++ list_for_each_entry(fq, &from->flush_queues, alink) { ++ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to); ++ spin_lock(&(fq->guard)); ++ fq->atom = to; ++ spin_unlock(&(fq->guard)); ++ } ++ ++ list_splice_init(&from->flush_queues, to->flush_queues.prev); ++ ++#if REISER4_DEBUG ++ to->num_queued += from->num_queued; ++ to->nr_flush_queues += from->nr_flush_queues; ++ from->nr_flush_queues = 0; ++#endif ++} ++ ++#if REISER4_DEBUG ++int atom_fq_parts_are_clean(txn_atom * atom) ++{ ++ assert("zam-915", atom != NULL); ++ return list_empty_careful(&atom->flush_queues); ++} ++#endif ++/* Bio i/o completion routine for reiser4 write operations. 
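++
++   For illustration (editorial sketch; the actual submission path in
++   this patch is write_jnode_list(), not shown here): a bio is wired to
++   its flush queue before submission roughly as
++
++	bio = bio_alloc(GFP_NOFS, nr_pages);
++	(set bi_bdev/bi_sector, bio_add_page() each jnode's page)
++	add_fq_to_bio(fq, bio);		(sets bi_private and bi_end_io)
++	submit_bio(WRITE, bio);
++
++   so that this handler can locate the fq, count errors and wake the
++   waiter once the last request completes.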
*/ ++static void ++end_io_handler(struct bio *bio, int err) ++{ ++ int i; ++ int nr_errors = 0; ++ flush_queue_t *fq; ++ ++ assert("zam-958", bio->bi_rw & WRITE); ++ ++ if (err == -EOPNOTSUPP) ++ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); ++ ++ /* we expect that bio->private is set to NULL or fq object which is used ++ * for synchronization and error counting. */ ++ fq = bio->bi_private; ++ /* Check all elements of io_vec for correct write completion. */ ++ for (i = 0; i < bio->bi_vcnt; i += 1) { ++ struct page *pg = bio->bi_io_vec[i].bv_page; ++ ++ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { ++ SetPageError(pg); ++ nr_errors++; ++ } ++ ++ { ++ /* jnode WRITEBACK ("write is in progress bit") is ++ * atomically cleared here. */ ++ jnode *node; ++ ++ assert("zam-736", pg != NULL); ++ assert("zam-736", PagePrivate(pg)); ++ node = jprivate(pg); ++ ++ JF_CLR(node, JNODE_WRITEBACK); ++ } ++ ++ end_page_writeback(pg); ++ page_cache_release(pg); ++ } ++ ++ if (fq) { ++ /* count i/o error in fq object */ ++ atomic_add(nr_errors, &fq->nr_errors); ++ ++ /* If all write requests registered in this "fq" are done we up ++ * the waiter. */ ++ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted)) ++ wake_up(&fq->wait); ++ } ++ ++ bio_put(bio); ++} ++ ++/* Count I/O requests which will be submitted by @bio in given flush queues ++ @fq */ ++void add_fq_to_bio(flush_queue_t *fq, struct bio *bio) ++{ ++ bio->bi_private = fq; ++ bio->bi_end_io = end_io_handler; ++ ++ if (fq) ++ atomic_add(bio->bi_vcnt, &fq->nr_submitted); ++} ++ ++/* Move all queued nodes out from @fq->prepped list. */ ++static void release_prepped_list(flush_queue_t *fq) ++{ ++ txn_atom *atom; ++ ++ assert("zam-904", fq_in_use(fq)); ++ atom = atom_locked_by_fq(fq); ++ ++ while (!list_empty(ATOM_FQ_LIST(fq))) { ++ jnode *cur; ++ ++ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link); ++ list_del_init(&cur->capture_link); ++ ++ count_dequeued_node(fq); ++ spin_lock_jnode(cur); ++ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR)); ++ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC)); ++ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED)); ++ JF_CLR(cur, JNODE_FLUSH_QUEUED); ++ ++ if (JF_ISSET(cur, JNODE_DIRTY)) { ++ list_add_tail(&cur->capture_link, ++ ATOM_DIRTY_LIST(atom, ++ jnode_get_level(cur))); ++ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, ++ DIRTY_LIST, 1)); ++ } else { ++ list_add_tail(&cur->capture_link, ++ ATOM_CLEAN_LIST(atom)); ++ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, ++ CLEAN_LIST, 1)); ++ } ++ ++ spin_unlock_jnode(cur); ++ } ++ ++ if (--atom->nr_running_queues == 0) ++ reiser4_atom_send_event(atom); ++ ++ spin_unlock_atom(atom); ++} ++ ++/* Submit write requests for nodes on the already filled flush queue @fq. ++ ++ @fq: flush queue object which contains jnodes we can (and will) write. ++ @return: number of submitted blocks (>=0) if success, otherwise -- an error ++ code (<0). */ ++int reiser4_write_fq(flush_queue_t *fq, long *nr_submitted, int flags) ++{ ++ int ret; ++ txn_atom *atom; ++ ++ while (1) { ++ atom = atom_locked_by_fq(fq); ++ assert("zam-924", atom); ++ /* do not write fq in parallel. */ ++ if (atom->nr_running_queues == 0 ++ || !(flags & WRITEOUT_SINGLE_STREAM)) ++ break; ++ reiser4_atom_wait_event(atom); ++ } ++ ++ atom->nr_running_queues++; ++ spin_unlock_atom(atom); ++ ++ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags); ++ release_prepped_list(fq); ++ ++ return ret; ++} ++ ++/* Getting flush queue object for exclusive use by one thread. 
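++
++   For illustration (editorial sketch, error handling elided): a
++   typical exclusive-use cycle built from the helpers below is
++
++	fq = get_fq_for_current_atom();	(returns with fq->atom locked)
++	spin_unlock_atom(fq->atom);
++	reiser4_write_fq(fq, &nr_submitted, WRITEOUT_SINGLE_STREAM);
++	reiser4_fq_put(fq);
++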
May require ++ several iterations which is indicated by -E_REPEAT return code. ++ ++ This function does not contain code for obtaining an atom lock because an ++ atom lock is obtained by different ways in different parts of reiser4, ++ usually it is current atom, but we need a possibility for getting fq for the ++ atom of given jnode. */ ++static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp) ++{ ++ flush_queue_t *fq; ++ ++ assert_spin_locked(&(atom->alock)); ++ ++ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink); ++ while (&atom->flush_queues != &fq->alink) { ++ spin_lock(&(fq->guard)); ++ ++ if (fq_ready(fq)) { ++ mark_fq_in_use(fq); ++ assert("vs-1246", fq->owner == NULL); ++ ON_DEBUG(fq->owner = current); ++ spin_unlock(&(fq->guard)); ++ ++ if (*new_fq) ++ done_fq(*new_fq); ++ ++ *new_fq = fq; ++ ++ return 0; ++ } ++ ++ spin_unlock(&(fq->guard)); ++ ++ fq = list_entry(fq->alink.next, flush_queue_t, alink); ++ } ++ ++ /* Use previously allocated fq object */ ++ if (*new_fq) { ++ mark_fq_in_use(*new_fq); ++ assert("vs-1248", (*new_fq)->owner == 0); ++ ON_DEBUG((*new_fq)->owner = current); ++ attach_fq(atom, *new_fq); ++ ++ return 0; ++ } ++ ++ spin_unlock_atom(atom); ++ ++ *new_fq = create_fq(gfp); ++ ++ if (*new_fq == NULL) ++ return RETERR(-ENOMEM); ++ ++ return RETERR(-E_REPEAT); ++} ++ ++int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t **new_fq) ++{ ++ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get()); ++} ++ ++/* A wrapper around reiser4_fq_by_atom for getting a flush queue ++ object for current atom, if success fq->atom remains locked. */ ++flush_queue_t *get_fq_for_current_atom(void) ++{ ++ flush_queue_t *fq = NULL; ++ txn_atom *atom; ++ int ret; ++ ++ do { ++ atom = get_current_atom_locked(); ++ ret = reiser4_fq_by_atom(atom, &fq); ++ } while (ret == -E_REPEAT); ++ ++ if (ret) ++ return ERR_PTR(ret); ++ return fq; ++} ++ ++/* Releasing flush queue object after exclusive use */ ++void reiser4_fq_put_nolock(flush_queue_t *fq) ++{ ++ assert("zam-747", fq->atom != NULL); ++ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq))); ++ mark_fq_ready(fq); ++ assert("vs-1245", fq->owner == current); ++ ON_DEBUG(fq->owner = NULL); ++} ++ ++void reiser4_fq_put(flush_queue_t *fq) ++{ ++ txn_atom *atom; ++ ++ spin_lock(&(fq->guard)); ++ atom = atom_locked_by_fq_nolock(fq); ++ ++ assert("zam-746", atom != NULL); ++ ++ reiser4_fq_put_nolock(fq); ++ reiser4_atom_send_event(atom); ++ ++ spin_unlock(&(fq->guard)); ++ spin_unlock_atom(atom); ++} ++ ++/* A part of atom object initialization related to the embedded flush queue ++ list head */ ++ ++void init_atom_fq_parts(txn_atom *atom) ++{ ++ INIT_LIST_HEAD(&atom->flush_queues); ++} ++ ++#if REISER4_DEBUG ++ ++void reiser4_check_fq(const txn_atom *atom) ++{ ++ /* check number of nodes on all atom's flush queues */ ++ flush_queue_t *fq; ++ int count; ++ struct list_head *pos; ++ ++ count = 0; ++ list_for_each_entry(fq, &atom->flush_queues, alink) { ++ spin_lock(&(fq->guard)); ++ /* calculate number of jnodes on fq' list of prepped jnodes */ ++ list_for_each(pos, ATOM_FQ_LIST(fq)) ++ count++; ++ spin_unlock(&(fq->guard)); ++ } ++ if (count != atom->fq) ++ warning("", "fq counter %d, real %d\n", atom->fq, count); ++ ++} ++ ++#endif ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/forward.h linux-2.6.33/fs/reiser4/forward.h +--- 
linux-2.6.33.orig/fs/reiser4/forward.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/forward.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,256 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* Forward declarations. Thank you Kernighan. */ ++ ++#if !defined(__REISER4_FORWARD_H__) ++#define __REISER4_FORWARD_H__ ++ ++#include <asm/errno.h> ++#include <linux/types.h> ++ ++typedef struct zlock zlock; ++typedef struct lock_stack lock_stack; ++typedef struct lock_handle lock_handle; ++typedef struct znode znode; ++typedef struct flow flow_t; ++typedef struct coord coord_t; ++typedef struct tree_access_pointer tap_t; ++typedef struct reiser4_object_create_data reiser4_object_create_data; ++typedef union reiser4_plugin reiser4_plugin; ++typedef __u16 reiser4_plugin_id; ++typedef __u64 reiser4_plugin_groups; ++typedef struct item_plugin item_plugin; ++typedef struct jnode_plugin jnode_plugin; ++typedef struct reiser4_item_data reiser4_item_data; ++typedef union reiser4_key reiser4_key; ++typedef struct reiser4_tree reiser4_tree; ++typedef struct carry_cut_data carry_cut_data; ++typedef struct carry_kill_data carry_kill_data; ++typedef struct carry_tree_op carry_tree_op; ++typedef struct carry_tree_node carry_tree_node; ++typedef struct carry_plugin_info carry_plugin_info; ++typedef struct reiser4_journal reiser4_journal; ++typedef struct txn_atom txn_atom; ++typedef struct txn_handle txn_handle; ++typedef struct txn_mgr txn_mgr; ++typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc; ++typedef struct reiser4_context reiser4_context; ++typedef struct carry_level carry_level; ++typedef struct blocknr_set_entry blocknr_set_entry; ++/* super_block->s_fs_info points to this */ ++typedef struct reiser4_super_info_data reiser4_super_info_data; ++/* next two objects are fields of reiser4_super_info_data */ ++typedef struct reiser4_oid_allocator reiser4_oid_allocator; ++typedef struct reiser4_space_allocator reiser4_space_allocator; ++ ++typedef struct flush_scan flush_scan; ++typedef struct flush_position flush_pos_t; ++ ++typedef unsigned short pos_in_node_t; ++#define MAX_POS_IN_NODE 65535 ++ ++typedef struct jnode jnode; ++typedef struct reiser4_blocknr_hint reiser4_blocknr_hint; ++ ++typedef struct uf_coord uf_coord_t; ++typedef struct hint hint_t; ++ ++typedef struct ktxnmgrd_context ktxnmgrd_context; ++ ++struct inode; ++struct page; ++struct file; ++struct dentry; ++struct super_block; ++ ++/* return values of coord_by_key(). cbk == coord_by_key */ ++typedef enum { ++ CBK_COORD_FOUND = 0, ++ CBK_COORD_NOTFOUND = -ENOENT, ++} lookup_result; ++ ++/* results of lookup with directory file */ ++typedef enum { ++ FILE_NAME_FOUND = 0, ++ FILE_NAME_NOTFOUND = -ENOENT, ++ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, ++ IO_ERROR return codes for each search. */ ++ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, ++ IO_ERROR return codes for each search. */ ++} file_lookup_result; ++ ++/* behaviors of lookup. If coord we are looking for is actually in a tree, ++ both coincide. */ ++typedef enum { ++ /* search exactly for the coord with key given */ ++ FIND_EXACT, ++ /* search for coord with the maximal key not greater than one ++ given */ ++ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */ ++} lookup_bias; ++ ++typedef enum { ++ /* number of leaf level of the tree ++ The fake root has (tree_level=0). */ ++ LEAF_LEVEL = 1, ++ ++ /* number of level one above leaf level of the tree. 
++ ++ It is supposed that internal tree used by reiser4 to store file ++ system data and meta data will have height 2 initially (when ++ created by mkfs). ++ */ ++ TWIG_LEVEL = 2, ++} tree_level; ++ ++/* The "real" maximum ztree height is the 0-origin size of any per-level ++ array, since the zero'th level is not used. */ ++#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL) ++ ++/* enumeration of possible mutual position of item and coord. This enum is ++ return type of ->is_in_item() item plugin method which see. */ ++typedef enum { ++ /* coord is on the left of an item */ ++ IP_ON_THE_LEFT, ++ /* coord is inside item */ ++ IP_INSIDE, ++ /* coord is inside item, but to the right of the rightmost unit of ++ this item */ ++ IP_RIGHT_EDGE, ++ /* coord is on the right of an item */ ++ IP_ON_THE_RIGHT ++} interposition; ++ ++/* type of lock to acquire on znode before returning it to caller */ ++typedef enum { ++ ZNODE_NO_LOCK = 0, ++ ZNODE_READ_LOCK = 1, ++ ZNODE_WRITE_LOCK = 2, ++} znode_lock_mode; ++ ++/* type of lock request */ ++typedef enum { ++ ZNODE_LOCK_LOPRI = 0, ++ ZNODE_LOCK_HIPRI = (1 << 0), ++ ++ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to ++ longterm_lock_znode will not sleep waiting for the lock to become ++ available. If the lock is unavailable, reiser4_znode_lock will ++ immediately return the value -E_REPEAT. */ ++ ZNODE_LOCK_NONBLOCK = (1 << 1), ++ /* An option for longterm_lock_znode which prevents atom fusion */ ++ ZNODE_LOCK_DONT_FUSE = (1 << 2) ++} znode_lock_request; ++ ++typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op; ++ ++/* used to specify direction of shift. These must be -1 and 1 */ ++typedef enum { ++ SHIFT_LEFT = 1, ++ SHIFT_RIGHT = -1 ++} shift_direction; ++ ++typedef enum { ++ LEFT_SIDE, ++ RIGHT_SIDE ++} sideof; ++ ++#define round_up(value, order) \ ++ ((typeof(value))(((long) (value) + (order) - 1U) & \ ++ ~((order) - 1))) ++ ++/* values returned by squalloc_right_neighbor and its auxiliary functions */ ++typedef enum { ++ /* unit of internal item is moved */ ++ SUBTREE_MOVED = 0, ++ /* nothing else can be squeezed into left neighbor */ ++ SQUEEZE_TARGET_FULL = 1, ++ /* all content of node is squeezed into its left neighbor */ ++ SQUEEZE_SOURCE_EMPTY = 2, ++ /* one more item is copied (this is only returned by ++ allocate_and_copy_extent to squalloc_twig)) */ ++ SQUEEZE_CONTINUE = 3 ++} squeeze_result; ++ ++/* Do not change items ids. If you do - there will be format change */ ++typedef enum { ++ STATIC_STAT_DATA_ID = 0x0, ++ SIMPLE_DIR_ENTRY_ID = 0x1, ++ COMPOUND_DIR_ID = 0x2, ++ NODE_POINTER_ID = 0x3, ++ EXTENT_POINTER_ID = 0x5, ++ FORMATTING_ID = 0x6, ++ CTAIL_ID = 0x7, ++ BLACK_BOX_ID = 0x8, ++ LAST_ITEM_ID = 0x9 ++} item_id; ++ ++/* Flags passed to jnode_flush() to allow it to distinguish default settings ++ based on whether commit() was called or VM memory pressure was applied. */ ++typedef enum { ++ /* submit flush queue to disk at jnode_flush completion */ ++ JNODE_FLUSH_WRITE_BLOCKS = 1, ++ ++ /* flush is called for commit */ ++ JNODE_FLUSH_COMMIT = 2, ++ /* not implemented */ ++ JNODE_FLUSH_MEMORY_FORMATTED = 4, ++ ++ /* not implemented */ ++ JNODE_FLUSH_MEMORY_UNFORMATTED = 8, ++} jnode_flush_flags; ++ ++/* Flags to insert/paste carry operations. Currently they only used in ++ flushing code, but in future, they can be used to optimize for repetitive ++ accesses. 
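++
++   For illustration (editorial note): these are single-bit values meant
++   to be OR-ed into a mask, e.g. a caller wanting an in-place squeeze
++   might pass
++
++	COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE
++
++   to forbid both shifts and new-node allocation.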
*/ ++typedef enum { ++ /* carry is not allowed to shift data to the left when trying to find ++ free space */ ++ COPI_DONT_SHIFT_LEFT = (1 << 0), ++ /* carry is not allowed to shift data to the right when trying to find ++ free space */ ++ COPI_DONT_SHIFT_RIGHT = (1 << 1), ++ /* carry is not allowed to allocate new node(s) when trying to find ++ free space */ ++ COPI_DONT_ALLOCATE = (1 << 2), ++ /* try to load left neighbor if its not in a cache */ ++ COPI_LOAD_LEFT = (1 << 3), ++ /* try to load right neighbor if its not in a cache */ ++ COPI_LOAD_RIGHT = (1 << 4), ++ /* shift insertion point to the left neighbor */ ++ COPI_GO_LEFT = (1 << 5), ++ /* shift insertion point to the right neighbor */ ++ COPI_GO_RIGHT = (1 << 6), ++ /* try to step back into original node if insertion into new node ++ fails after shifting data there. */ ++ COPI_STEP_BACK = (1 << 7) ++} cop_insert_flag; ++ ++typedef enum { ++ SAFE_UNLINK, /* safe-link for unlink */ ++ SAFE_TRUNCATE /* safe-link for truncate */ ++} reiser4_safe_link_t; ++ ++/* this is to show on which list of atom jnode is */ ++typedef enum { ++ NOT_CAPTURED, ++ DIRTY_LIST, ++ CLEAN_LIST, ++ FQ_LIST, ++ WB_LIST, ++ OVRWR_LIST ++} atom_list; ++ ++/* __REISER4_FORWARD_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/fsdata.c linux-2.6.33/fs/reiser4/fsdata.c +--- linux-2.6.33.orig/fs/reiser4/fsdata.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/fsdata.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,804 @@ ++/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "fsdata.h" ++#include "inode.h" ++ ++ ++/* cache or dir_cursors */ ++static struct kmem_cache *d_cursor_cache; ++ ++/* list of unused cursors */ ++static LIST_HEAD(cursor_cache); ++ ++/* number of cursors in list of ununsed cursors */ ++static unsigned long d_cursor_unused = 0; ++ ++/* spinlock protecting manipulations with dir_cursor's hash table and lists */ ++DEFINE_SPINLOCK(d_lock); ++ ++static reiser4_file_fsdata *create_fsdata(struct file *file); ++static int file_is_stateless(struct file *file); ++static void free_fsdata(reiser4_file_fsdata *fsdata); ++static void kill_cursor(dir_cursor *); ++ ++/** ++ * d_cursor_shrink - shrink callback for cache of dir_cursor-s ++ * @nr: number of objects to free ++ * @mask: GFP mask ++ * ++ * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested ++ * number. Return number of still freeable cursors. ++ */ ++static int d_cursor_shrink(int nr, gfp_t mask) ++{ ++ if (nr != 0) { ++ dir_cursor *scan; ++ int killed; ++ ++ killed = 0; ++ spin_lock(&d_lock); ++ while (!list_empty(&cursor_cache)) { ++ scan = list_entry(cursor_cache.next, dir_cursor, alist); ++ assert("nikita-3567", scan->ref == 0); ++ kill_cursor(scan); ++ ++killed; ++ --nr; ++ if (nr == 0) ++ break; ++ } ++ spin_unlock(&d_lock); ++ } ++ return d_cursor_unused; ++} ++ ++/* ++ * actually, d_cursors are "priceless", because there is no way to ++ * recover information stored in them. On the other hand, we don't ++ * want to consume all kernel memory by them. As a compromise, just ++ * assign higher "seeks" value to d_cursor cache, so that it will be ++ * shrunk only if system is really tight on memory. 
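++ *
++ * For illustration (editorial note, matching the 2.6.33 shrinker API
++ * used below): the callback is invoked as ->shrink(nr, gfp_mask); with
++ * nr == 0 it only reports the pool size, with nr > 0 it frees up to nr
++ * objects, and in both cases it returns the number of objects that
++ * remain freeable:
++ *
++ *	remaining = d_cursor_shrink(0, GFP_KERNEL);	(query only)
++ *	d_cursor_shrink(remaining, GFP_KERNEL);		(free them)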
++ */ ++static struct shrinker d_cursor_shrinker = { ++ .shrink = d_cursor_shrink, ++ .seeks = DEFAULT_SEEKS << 3, ++}; ++ ++/** ++ * reiser4_init_d_cursor - create d_cursor cache ++ * ++ * Initializes slab cache of d_cursors. It is part of reiser4 module ++ * initialization. ++ */ ++int reiser4_init_d_cursor(void) ++{ ++ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0, ++ SLAB_HWCACHE_ALIGN, NULL); ++ if (d_cursor_cache == NULL) ++ return RETERR(-ENOMEM); ++ ++ register_shrinker(&d_cursor_shrinker); ++ return 0; ++} ++ ++/** ++ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void reiser4_done_d_cursor(void) ++{ ++ unregister_shrinker(&d_cursor_shrinker); ++ ++ destroy_reiser4_cache(&d_cursor_cache); ++} ++ ++#define D_CURSOR_TABLE_SIZE (256) ++ ++static inline unsigned long ++d_cursor_hash(d_cursor_hash_table * table, const struct d_cursor_key *key) ++{ ++ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE)); ++ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1); ++} ++ ++static inline int d_cursor_eq(const struct d_cursor_key *k1, ++ const struct d_cursor_key *k2) ++{ ++ return k1->cid == k2->cid && k1->oid == k2->oid; ++} ++ ++/* ++ * define functions to manipulate reiser4 super block's hash table of ++ * dir_cursors ++ */ ++#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) ++#define KFREE(ptr, size) kfree(ptr) ++TYPE_SAFE_HASH_DEFINE(d_cursor, ++ dir_cursor, ++ struct d_cursor_key, ++ key, hash, d_cursor_hash, d_cursor_eq); ++#undef KFREE ++#undef KMALLOC ++ ++/** ++ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources ++ * @super: super block to initialize ++ * ++ * Initializes per-super-block d_cursor's hash table and radix tree. It is part ++ * of mount. ++ */ ++int reiser4_init_super_d_info(struct super_block *super) ++{ ++ struct d_cursor_info *p; ++ ++ p = &get_super_private(super)->d_info; ++ ++ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get()); ++ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE); ++} ++ ++/** ++ * reiser4_done_super_d_info - release per-super-block d_cursor resources ++ * @super: super block being umounted ++ * ++ * It is called on umount. Kills all directory cursors attached to suoer block. ++ */ ++void reiser4_done_super_d_info(struct super_block *super) ++{ ++ struct d_cursor_info *d_info; ++ dir_cursor *cursor, *next; ++ ++ d_info = &get_super_private(super)->d_info; ++ for_all_in_htable(&d_info->table, d_cursor, cursor, next) ++ kill_cursor(cursor); ++ ++ BUG_ON(d_info->tree.rnode != NULL); ++ d_cursor_hash_done(&d_info->table); ++} ++ ++/** ++ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it ++ * @cursor: cursor to free ++ * ++ * Removes reiser4_file_fsdata attached to @cursor from readdir list of ++ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from ++ * indices, hash table, list of unused cursors and frees it. ++ */ ++static void kill_cursor(dir_cursor *cursor) ++{ ++ unsigned long index; ++ ++ assert("nikita-3566", cursor->ref == 0); ++ assert("nikita-3572", cursor->fsdata != NULL); ++ ++ index = (unsigned long)cursor->key.oid; ++ list_del_init(&cursor->fsdata->dir.linkage); ++ free_fsdata(cursor->fsdata); ++ cursor->fsdata = NULL; ++ ++ if (list_empty_careful(&cursor->list)) ++ /* this is last cursor for a file. 
Kill radix-tree entry */ ++ radix_tree_delete(&cursor->info->tree, index); ++ else { ++ void **slot; ++ ++ /* ++ * there are other cursors for the same oid. ++ */ ++ ++ /* ++ * if radix tree point to the cursor being removed, re-target ++ * radix tree slot to the next cursor in the (non-empty as was ++ * checked above) element of the circular list of all cursors ++ * for this oid. ++ */ ++ slot = radix_tree_lookup_slot(&cursor->info->tree, index); ++ assert("nikita-3571", *slot != NULL); ++ if (*slot == cursor) ++ *slot = list_entry(cursor->list.next, dir_cursor, list); ++ /* remove cursor from circular list */ ++ list_del_init(&cursor->list); ++ } ++ /* remove cursor from the list of unused cursors */ ++ list_del_init(&cursor->alist); ++ /* remove cursor from the hash table */ ++ d_cursor_hash_remove(&cursor->info->table, cursor); ++ /* and free it */ ++ kmem_cache_free(d_cursor_cache, cursor); ++ --d_cursor_unused; ++} ++ ++/* possible actions that can be performed on all cursors for the given file */ ++enum cursor_action { ++ /* ++ * load all detached state: this is called when stat-data is loaded ++ * from the disk to recover information about all pending readdirs ++ */ ++ CURSOR_LOAD, ++ /* ++ * detach all state from inode, leaving it in the cache. This is called ++ * when inode is removed form the memory by memory pressure ++ */ ++ CURSOR_DISPOSE, ++ /* ++ * detach cursors from the inode, and free them. This is called when ++ * inode is destroyed ++ */ ++ CURSOR_KILL ++}; ++ ++/* ++ * return d_cursor data for the file system @inode is in. ++ */ ++static inline struct d_cursor_info *d_info(struct inode *inode) ++{ ++ return &get_super_private(inode->i_sb)->d_info; ++} ++ ++/* ++ * lookup d_cursor in the per-super-block radix tree. ++ */ ++static inline dir_cursor *lookup(struct d_cursor_info *info, ++ unsigned long index) ++{ ++ return (dir_cursor *) radix_tree_lookup(&info->tree, index); ++} ++ ++/* ++ * attach @cursor to the radix tree. There may be multiple cursors for the ++ * same oid, they are chained into circular list. ++ */ ++static void bind_cursor(dir_cursor * cursor, unsigned long index) ++{ ++ dir_cursor *head; ++ ++ head = lookup(cursor->info, index); ++ if (head == NULL) { ++ /* this is the first cursor for this index */ ++ INIT_LIST_HEAD(&cursor->list); ++ radix_tree_insert(&cursor->info->tree, index, cursor); ++ } else { ++ /* some cursor already exists. Chain ours */ ++ list_add(&cursor->list, &head->list); ++ } ++} ++ ++/* ++ * detach fsdata (if detachable) from file descriptor, and put cursor on the ++ * "unused" list. Called when file descriptor is not longer in active use. ++ */ ++static void clean_fsdata(struct file *file) ++{ ++ dir_cursor *cursor; ++ reiser4_file_fsdata *fsdata; ++ ++ assert("nikita-3570", file_is_stateless(file)); ++ ++ fsdata = (reiser4_file_fsdata *) file->private_data; ++ if (fsdata != NULL) { ++ cursor = fsdata->cursor; ++ if (cursor != NULL) { ++ spin_lock(&d_lock); ++ --cursor->ref; ++ if (cursor->ref == 0) { ++ list_add_tail(&cursor->alist, &cursor_cache); ++ ++d_cursor_unused; ++ } ++ spin_unlock(&d_lock); ++ file->private_data = NULL; ++ } ++ } ++} ++ ++/* ++ * global counter used to generate "client ids". These ids are encoded into ++ * high bits of fpos. 
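++ *
++ * For illustration (editorial sketch): with CID_SHIFT == 20 and
++ * CID_MASK == 0xfffff as defined below, the packing is
++ *
++ *	f_pos      = ((__u64)cid << CID_SHIFT) | dir_offset;
++ *	cid        = f_pos >> CID_SHIFT;
++ *	dir_offset = f_pos & CID_MASK;
++ *
++ * i.e. the low 20 bits carry the readdir position and the bits above
++ * them the client id; insert_cursor() below additionally masks the cid
++ * with 0x7ff so that f_pos can never go negative for nfsd_readdir().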
++ */ ++static __u32 cid_counter = 0; ++#define CID_SHIFT (20) ++#define CID_MASK (0xfffffull) ++ ++static void free_file_fsdata_nolock(struct file *); ++ ++/** ++ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table ++ * @cursor: ++ * @file: ++ * @inode: ++ * ++ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to ++ * reiser4 super block's hash table and radix tree. ++ add detachable readdir ++ * state to the @f ++ */ ++static int insert_cursor(dir_cursor *cursor, struct file *file, ++ struct inode *inode) ++{ ++ int result; ++ reiser4_file_fsdata *fsdata; ++ ++ memset(cursor, 0, sizeof *cursor); ++ ++ /* this is either first call to readdir, or rewind. Anyway, create new ++ * cursor. */ ++ fsdata = create_fsdata(NULL); ++ if (fsdata != NULL) { ++ result = radix_tree_preload(reiser4_ctx_gfp_mask_get()); ++ if (result == 0) { ++ struct d_cursor_info *info; ++ oid_t oid; ++ ++ info = d_info(inode); ++ oid = get_inode_oid(inode); ++ /* cid occupies higher 12 bits of f->f_pos. Don't ++ * allow it to become negative: this confuses ++ * nfsd_readdir() */ ++ cursor->key.cid = (++cid_counter) & 0x7ff; ++ cursor->key.oid = oid; ++ cursor->fsdata = fsdata; ++ cursor->info = info; ++ cursor->ref = 1; ++ ++ spin_lock_inode(inode); ++ /* install cursor as @f's private_data, discarding old ++ * one if necessary */ ++#if REISER4_DEBUG ++ if (file->private_data) ++ warning("", "file has fsdata already"); ++#endif ++ clean_fsdata(file); ++ free_file_fsdata_nolock(file); ++ file->private_data = fsdata; ++ fsdata->cursor = cursor; ++ spin_unlock_inode(inode); ++ spin_lock(&d_lock); ++ /* insert cursor into hash table */ ++ d_cursor_hash_insert(&info->table, cursor); ++ /* and chain it into radix-tree */ ++ bind_cursor(cursor, (unsigned long)oid); ++ spin_unlock(&d_lock); ++ radix_tree_preload_end(); ++ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT; ++ } ++ } else ++ result = RETERR(-ENOMEM); ++ return result; ++} ++ ++/** ++ * process_cursors - do action on each cursor attached to inode ++ * @inode: ++ * @act: action to do ++ * ++ * Finds all cursors of @inode in reiser4's super block radix tree of cursors ++ * and performs action specified by @act on each of cursors. 
++ */ ++static void process_cursors(struct inode *inode, enum cursor_action act) ++{ ++ oid_t oid; ++ dir_cursor *start; ++ struct list_head *head; ++ reiser4_context *ctx; ++ struct d_cursor_info *info; ++ ++ /* this can be called by ++ * ++ * kswapd->...->prune_icache->..reiser4_destroy_inode ++ * ++ * without reiser4_context ++ */ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ warning("vs-23", "failed to init context"); ++ return; ++ } ++ ++ assert("nikita-3558", inode != NULL); ++ ++ info = d_info(inode); ++ oid = get_inode_oid(inode); ++ spin_lock_inode(inode); ++ head = get_readdir_list(inode); ++ spin_lock(&d_lock); ++ /* find any cursor for this oid: reference to it is hanging of radix ++ * tree */ ++ start = lookup(info, (unsigned long)oid); ++ if (start != NULL) { ++ dir_cursor *scan; ++ reiser4_file_fsdata *fsdata; ++ ++ /* process circular list of cursors for this oid */ ++ scan = start; ++ do { ++ dir_cursor *next; ++ ++ next = list_entry(scan->list.next, dir_cursor, list); ++ fsdata = scan->fsdata; ++ assert("nikita-3557", fsdata != NULL); ++ if (scan->key.oid == oid) { ++ switch (act) { ++ case CURSOR_DISPOSE: ++ list_del_init(&fsdata->dir.linkage); ++ break; ++ case CURSOR_LOAD: ++ list_add(&fsdata->dir.linkage, head); ++ break; ++ case CURSOR_KILL: ++ kill_cursor(scan); ++ break; ++ } ++ } ++ if (scan == next) ++ /* last cursor was just killed */ ++ break; ++ scan = next; ++ } while (scan != start); ++ } ++ spin_unlock(&d_lock); ++ /* check that we killed 'em all */ ++ assert("nikita-3568", ++ ergo(act == CURSOR_KILL, ++ list_empty_careful(get_readdir_list(inode)))); ++ assert("nikita-3569", ++ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL)); ++ spin_unlock_inode(inode); ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_dispose_cursors - removes cursors from inode's list ++ * @inode: inode to dispose cursors of ++ * ++ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata ++ * attached to cursor from inode's readdir list. This is called when inode is ++ * removed from the memory by memory pressure. ++ */ ++void reiser4_dispose_cursors(struct inode *inode) ++{ ++ process_cursors(inode, CURSOR_DISPOSE); ++} ++ ++/** ++ * reiser4_load_cursors - attach cursors to inode ++ * @inode: inode to load cursors to ++ * ++ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata ++ * attached to cursor to inode's readdir list. This is done when inode is ++ * loaded into memory. ++ */ ++void reiser4_load_cursors(struct inode *inode) ++{ ++ process_cursors(inode, CURSOR_LOAD); ++} ++ ++/** ++ * reiser4_kill_cursors - kill all inode cursors ++ * @inode: inode to kill cursors of ++ * ++ * Frees all cursors for this inode. This is called when inode is destroyed. ++ */ ++void reiser4_kill_cursors(struct inode *inode) ++{ ++ process_cursors(inode, CURSOR_KILL); ++} ++ ++/** ++ * file_is_stateless - ++ * @file: ++ * ++ * true, if file descriptor @f is created by NFS server by "demand" to serve ++ * one file system operation. This means that there may be "detached state" ++ * for underlying inode. ++ */ ++static int file_is_stateless(struct file *file) ++{ ++ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless; ++} ++ ++/** ++ * reiser4_get_dir_fpos - ++ * @dir: ++ * ++ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but ++ * in the case of stateless directory operation (readdir-over-nfs), client id ++ * was encoded in the high bits of cookie and should me masked off. 
++ */ ++loff_t reiser4_get_dir_fpos(struct file *dir) ++{ ++ if (file_is_stateless(dir)) ++ return dir->f_pos & CID_MASK; ++ else ++ return dir->f_pos; ++} ++ ++/** ++ * reiser4_attach_fsdata - try to attach fsdata ++ * @file: ++ * @inode: ++ * ++ * Finds or creates cursor for readdir-over-nfs. ++ */ ++int reiser4_attach_fsdata(struct file *file, struct inode *inode) ++{ ++ loff_t pos; ++ int result; ++ dir_cursor *cursor; ++ ++ /* ++ * we are serialized by inode->i_mutex ++ */ ++ if (!file_is_stateless(file)) ++ return 0; ++ ++ pos = file->f_pos; ++ result = 0; ++ if (pos == 0) { ++ /* ++ * first call to readdir (or rewind to the beginning of ++ * directory) ++ */ ++ cursor = kmem_cache_alloc(d_cursor_cache, ++ reiser4_ctx_gfp_mask_get()); ++ if (cursor != NULL) ++ result = insert_cursor(cursor, file, inode); ++ else ++ result = RETERR(-ENOMEM); ++ } else { ++ /* try to find existing cursor */ ++ struct d_cursor_key key; ++ ++ key.cid = pos >> CID_SHIFT; ++ key.oid = get_inode_oid(inode); ++ spin_lock(&d_lock); ++ cursor = d_cursor_hash_find(&d_info(inode)->table, &key); ++ if (cursor != NULL) { ++ /* cursor was found */ ++ if (cursor->ref == 0) { ++ /* move it from unused list */ ++ list_del_init(&cursor->alist); ++ --d_cursor_unused; ++ } ++ ++cursor->ref; ++ } ++ spin_unlock(&d_lock); ++ if (cursor != NULL) { ++ spin_lock_inode(inode); ++ assert("nikita-3556", cursor->fsdata->back == NULL); ++ clean_fsdata(file); ++ free_file_fsdata_nolock(file); ++ file->private_data = cursor->fsdata; ++ spin_unlock_inode(inode); ++ } ++ } ++ return result; ++} ++ ++/** ++ * reiser4_detach_fsdata - ??? ++ * @file: ++ * ++ * detach fsdata, if necessary ++ */ ++void reiser4_detach_fsdata(struct file *file) ++{ ++ struct inode *inode; ++ ++ if (!file_is_stateless(file)) ++ return; ++ ++ inode = file->f_dentry->d_inode; ++ spin_lock_inode(inode); ++ clean_fsdata(file); ++ spin_unlock_inode(inode); ++} ++ ++/* slab for reiser4_dentry_fsdata */ ++static struct kmem_cache *dentry_fsdata_cache; ++ ++/** ++ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata ++ * ++ * Initializes slab cache of structures attached to denty->d_fsdata. It is ++ * part of reiser4 module initialization. ++ */ ++int reiser4_init_dentry_fsdata(void) ++{ ++ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata", ++ sizeof(struct reiser4_dentry_fsdata), ++ 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, ++ NULL); ++ if (dentry_fsdata_cache == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void reiser4_done_dentry_fsdata(void) ++{ ++ destroy_reiser4_cache(&dentry_fsdata_cache); ++} ++ ++/** ++ * reiser4_get_dentry_fsdata - get fs-specific dentry data ++ * @dentry: queried dentry ++ * ++ * Allocates if necessary and returns per-dentry data that we attach to each ++ * dentry. 
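++ *
++ * For illustration (editorial sketch): the result is either a valid
++ * pointer or ERR_PTR(-ENOMEM), so callers follow the usual idiom
++ *
++ *	fsdata = reiser4_get_dentry_fsdata(dentry);
++ *	if (IS_ERR(fsdata))
++ *		return PTR_ERR(fsdata);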
++ */ ++struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry) ++{ ++ assert("nikita-1365", dentry != NULL); ++ ++ if (dentry->d_fsdata == NULL) { ++ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache, ++ reiser4_ctx_gfp_mask_get()); ++ if (dentry->d_fsdata == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ memset(dentry->d_fsdata, 0, ++ sizeof(struct reiser4_dentry_fsdata)); ++ } ++ return dentry->d_fsdata; ++} ++ ++/** ++ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata ++ * @dentry: dentry to free fsdata of ++ * ++ * Detaches and frees fs-specific dentry data ++ */ ++void reiser4_free_dentry_fsdata(struct dentry *dentry) ++{ ++ if (dentry->d_fsdata != NULL) { ++ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata); ++ dentry->d_fsdata = NULL; ++ } ++} ++ ++/* slab for reiser4_file_fsdata */ ++static struct kmem_cache *file_fsdata_cache; ++ ++/** ++ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata ++ * ++ * Initializes slab cache of structures attached to file->private_data. It is ++ * part of reiser4 module initialization. ++ */ ++int reiser4_init_file_fsdata(void) ++{ ++ file_fsdata_cache = kmem_cache_create("file_fsdata", ++ sizeof(reiser4_file_fsdata), ++ 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL); ++ if (file_fsdata_cache == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void reiser4_done_file_fsdata(void) ++{ ++ destroy_reiser4_cache(&file_fsdata_cache); ++} ++ ++/** ++ * create_fsdata - allocate and initialize reiser4_file_fsdata ++ * @file: what to create file_fsdata for, may be NULL ++ * ++ * Allocates and initializes reiser4_file_fsdata structure. ++ */ ++static reiser4_file_fsdata *create_fsdata(struct file *file) ++{ ++ reiser4_file_fsdata *fsdata; ++ ++ fsdata = kmem_cache_alloc(file_fsdata_cache, ++ reiser4_ctx_gfp_mask_get()); ++ if (fsdata != NULL) { ++ memset(fsdata, 0, sizeof *fsdata); ++ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024; ++ fsdata->back = file; ++ INIT_LIST_HEAD(&fsdata->dir.linkage); ++ } ++ return fsdata; ++} ++ ++/** ++ * free_fsdata - free reiser4_file_fsdata ++ * @fsdata: object to free ++ * ++ * Dual to create_fsdata(). Free reiser4_file_fsdata. ++ */ ++static void free_fsdata(reiser4_file_fsdata *fsdata) ++{ ++ BUG_ON(fsdata == NULL); ++ kmem_cache_free(file_fsdata_cache, fsdata); ++} ++ ++/** ++ * reiser4_get_file_fsdata - get fs-specific file data ++ * @file: queried file ++ * ++ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches ++ * to @file. 
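++ *
++ * For illustration (editorial note): the body below is the usual
++ * optimistic-allocation idiom:
++ *
++ *	fsdata = create_fsdata(file);	(allocate outside the lock)
++ *	spin_lock_inode(inode);
++ *	(install only if ->private_data is still NULL)
++ *	spin_unlock_inode(inode);
++ *	(free the spare copy if another thread won the race)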
++ */ ++reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file) ++{ ++ assert("nikita-1603", file != NULL); ++ ++ if (file->private_data == NULL) { ++ reiser4_file_fsdata *fsdata; ++ struct inode *inode; ++ ++ fsdata = create_fsdata(file); ++ if (fsdata == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ inode = file->f_dentry->d_inode; ++ spin_lock_inode(inode); ++ if (file->private_data == NULL) { ++ file->private_data = fsdata; ++ fsdata = NULL; ++ } ++ spin_unlock_inode(inode); ++ if (fsdata != NULL) ++ /* other thread initialized ->fsdata */ ++ kmem_cache_free(file_fsdata_cache, fsdata); ++ } ++ assert("nikita-2665", file->private_data != NULL); ++ return file->private_data; ++} ++ ++/** ++ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata ++ * @file: ++ * ++ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from ++ * readdir list, frees if it is not linked to d_cursor object. ++ */ ++static void free_file_fsdata_nolock(struct file *file) ++{ ++ reiser4_file_fsdata *fsdata; ++ ++ assert("", spin_inode_is_locked(file->f_dentry->d_inode)); ++ fsdata = file->private_data; ++ if (fsdata != NULL) { ++ list_del_init(&fsdata->dir.linkage); ++ if (fsdata->cursor == NULL) ++ free_fsdata(fsdata); ++ } ++ file->private_data = NULL; ++} ++ ++/** ++ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata ++ * @file: ++ * ++ * Spinlocks inode and calls free_file_fsdata_nolock to do the work. ++ */ ++void reiser4_free_file_fsdata(struct file *file) ++{ ++ spin_lock_inode(file->f_dentry->d_inode); ++ free_file_fsdata_nolock(file); ++ spin_unlock_inode(file->f_dentry->d_inode); ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/fsdata.h linux-2.6.33/fs/reiser4/fsdata.h +--- linux-2.6.33.orig/fs/reiser4/fsdata.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/fsdata.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,205 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#if !defined(__REISER4_FSDATA_H__) ++#define __REISER4_FSDATA_H__ ++ ++#include "debug.h" ++#include "kassign.h" ++#include "seal.h" ++#include "type_safe_hash.h" ++#include "plugin/file/file.h" ++#include "readahead.h" ++ ++/* ++ * comment about reiser4_dentry_fsdata ++ * ++ * ++ */ ++ ++/* ++ * locking: fields of per file descriptor readdir_pos and ->f_pos are ++ * protected by ->i_mutex on inode. Under this lock following invariant ++ * holds: ++ * ++ * file descriptor is "looking" at the entry_no-th directory entry from ++ * the beginning of directory. This entry has key dir_entry_key and is ++ * pos-th entry with duplicate-key sequence. ++ * ++ */ ++ ++/* logical position within directory */ ++struct dir_pos { ++ /* key of directory entry (actually, part of a key sufficient to ++ identify directory entry) */ ++ de_id dir_entry_key; ++ /* ordinal number of directory entry among all entries with the same ++ key. (Starting from 0.) 
*/ ++ unsigned pos; ++}; ++ ++struct readdir_pos { ++ /* f_pos corresponding to this readdir position */ ++ __u64 fpos; ++ /* logical position within directory */ ++ struct dir_pos position; ++ /* logical number of directory entry within ++ directory */ ++ __u64 entry_no; ++}; ++ ++/* ++ * this is used to speed up lookups for directory entry: on initial call to ++ * ->lookup() seal and coord of directory entry (if found, that is) are stored ++ * in struct dentry and reused later to avoid tree traversals. ++ */ ++struct de_location { ++ /* seal covering directory entry */ ++ seal_t entry_seal; ++ /* coord of directory entry */ ++ coord_t entry_coord; ++ /* ordinal number of directory entry among all entries with the same ++ key. (Starting from 0.) */ ++ int pos; ++}; ++ ++/** ++ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries ++ * ++ * This is allocated dynamically and released in d_op->d_release() ++ * ++ * Currently it only contains cached location (hint) of directory entry, but ++ * it is expected that other information will be accumulated here. ++ */ ++struct reiser4_dentry_fsdata { ++ /* ++ * here will go fields filled by ->lookup() to speedup next ++ * create/unlink, like blocknr of znode with stat-data, or key of ++ * stat-data. ++ */ ++ struct de_location dec; ++ int stateless; /* created through reiser4_decode_fh, needs ++ * special treatment in readdir. */ ++}; ++ ++extern int reiser4_init_dentry_fsdata(void); ++extern void reiser4_done_dentry_fsdata(void); ++extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *); ++extern void reiser4_free_dentry_fsdata(struct dentry *dentry); ++ ++/** ++ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data ++ * ++ * This is allocated dynamically and released in inode->i_fop->release ++ */ ++typedef struct reiser4_file_fsdata { ++ /* ++ * pointer back to the struct file which this reiser4_file_fsdata is ++ * part of ++ */ ++ struct file *back; ++ /* detached cursor for stateless readdir. */ ++ struct dir_cursor *cursor; ++ /* ++ * We need both directory and regular file parts here, because there ++ * are file system objects that are files and directories. ++ */ ++ struct { ++ /* ++ * position in directory. It is updated each time directory is ++ * modified ++ */ ++ struct readdir_pos readdir; ++ /* head of this list is reiser4_inode->lists.readdir_list */ ++ struct list_head linkage; ++ } dir; ++ /* hints to speed up operations with regular files: read and write. */ ++ struct { ++ hint_t hint; ++ } reg; ++ struct reiser4_file_ra_state ra1; ++ ++} reiser4_file_fsdata; ++ ++extern int reiser4_init_file_fsdata(void); ++extern void reiser4_done_file_fsdata(void); ++extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *); ++extern void reiser4_free_file_fsdata(struct file *); ++ ++/* ++ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are ++ * used to address problem reiser4 has with readdir accesses via NFS. See ++ * plugin/file_ops_readdir.c for more details. 
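++ *
++ * For illustration (editorial sketch, mirroring reiser4_attach_fsdata()
++ * in fsdata.c): a cursor is addressed by (client id, object id):
++ *
++ *	key.cid = file->f_pos >> CID_SHIFT;
++ *	key.oid = get_inode_oid(inode);
++ *	cursor = d_cursor_hash_find(&d_info(inode)->table, &key);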
++ */ ++struct d_cursor_key{ ++ __u16 cid; ++ __u64 oid; ++}; ++ ++/* ++ * define structures d_cursor_hash_table d_cursor_hash_link which are used to ++ * maintain hash table of dir_cursor-s in reiser4's super block ++ */ ++typedef struct dir_cursor dir_cursor; ++TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor); ++ ++struct dir_cursor { ++ int ref; ++ reiser4_file_fsdata *fsdata; ++ ++ /* link to reiser4 super block hash table of cursors */ ++ d_cursor_hash_link hash; ++ ++ /* ++ * this is to link cursors to reiser4 super block's radix tree of ++ * cursors if there are more than one cursor of the same objectid ++ */ ++ struct list_head list; ++ struct d_cursor_key key; ++ struct d_cursor_info *info; ++ /* list of unused cursors */ ++ struct list_head alist; ++}; ++ ++extern int reiser4_init_d_cursor(void); ++extern void reiser4_done_d_cursor(void); ++ ++extern int reiser4_init_super_d_info(struct super_block *); ++extern void reiser4_done_super_d_info(struct super_block *); ++ ++extern loff_t reiser4_get_dir_fpos(struct file *); ++extern int reiser4_attach_fsdata(struct file *, struct inode *); ++extern void reiser4_detach_fsdata(struct file *); ++ ++/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for ++ more details */ ++void reiser4_dispose_cursors(struct inode *inode); ++void reiser4_load_cursors(struct inode *inode); ++void reiser4_kill_cursors(struct inode *inode); ++void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de, ++ int offset, int adj); ++ ++/* ++ * this structure is embedded to reise4_super_info_data. It maintains d_cursors ++ * (detached readdir state). See plugin/file_ops_readdir.c for more details. ++ */ ++struct d_cursor_info { ++ d_cursor_hash_table table; ++ struct radix_tree_root tree; ++}; ++ ++/* spinlock protecting readdir cursors */ ++extern spinlock_t d_lock; ++ ++/* __REISER4_FSDATA_H__ */ ++#endif ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/init_super.c linux-2.6.33/fs/reiser4/init_super.c +--- linux-2.6.33.orig/fs/reiser4/init_super.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/init_super.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,761 @@ ++/* Copyright by Hans Reiser, 2003 */ ++ ++#include "super.h" ++#include "inode.h" ++#include "plugin/plugin_set.h" ++ ++#include <linux/swap.h> ++ ++/** ++ * init_fs_info - allocate reiser4 specific super block ++ * @super: super block of filesystem ++ * ++ * Allocates and initialize reiser4_super_info_data, attaches it to ++ * super->s_fs_info, initializes structures maintaining d_cursor-s. ++ */ ++int reiser4_init_fs_info(struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = kzalloc(sizeof(reiser4_super_info_data), ++ reiser4_ctx_gfp_mask_get()); ++ if (!sbinfo) ++ return RETERR(-ENOMEM); ++ ++ super->s_fs_info = sbinfo; ++ super->s_op = NULL; ++ ++ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes)); ++ ON_DEBUG(spin_lock_init(&sbinfo->all_guard)); ++ ++ mutex_init(&sbinfo->delete_mutex); ++ spin_lock_init(&(sbinfo->guard)); ++ ++ /* initialize per-super-block d_cursor resources */ ++ reiser4_init_super_d_info(super); ++ ++ return 0; ++} ++ ++/** ++ * reiser4_done_fs_info - free reiser4 specific super block ++ * @super: super block of filesystem ++ * ++ * Performs some sanity checks, releases structures maintaining d_cursor-s, ++ * frees reiser4_super_info_data. 
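++ *
++ * For illustration (editorial sketch; the actual callers live in the
++ * mount/umount paths elsewhere in this patch): the pair is used
++ * roughly as
++ *
++ *	if (reiser4_init_fs_info(super) != 0)
++ *		return RETERR(-ENOMEM);
++ *	...
++ *	reiser4_done_fs_info(super);	(on umount or failed mount)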
++ */ ++void reiser4_done_fs_info(struct super_block *super) ++{ ++ assert("zam-990", super->s_fs_info != NULL); ++ ++ /* release per-super-block d_cursor resources */ ++ reiser4_done_super_d_info(super); ++ ++ /* make sure that there are not jnodes already */ ++ assert("", list_empty(&get_super_private(super)->all_jnodes)); ++ assert("", get_current_context()->trans->atom == NULL); ++ reiser4_check_block_counters(super); ++ kfree(super->s_fs_info); ++ super->s_fs_info = NULL; ++} ++ ++/* type of option parseable by parse_option() */ ++typedef enum { ++ /* value of option is arbitrary string */ ++ OPT_STRING, ++ ++ /* ++ * option specifies bit in a bitmask. When option is set - bit in ++ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush, ++ * dont_load_bitmap, atomic_write. ++ */ ++ OPT_BIT, ++ ++ /* ++ * value of option should conform to sprintf() format. Examples are ++ * tmgr.atom_max_size=N, tmgr.atom_max_age=N ++ */ ++ OPT_FORMAT, ++ ++ /* ++ * option can take one of predefined values. Example is onerror=panic or ++ * onerror=remount-ro ++ */ ++ OPT_ONEOF, ++} opt_type_t; ++ ++#if 0 ++struct opt_bitmask_bit { ++ const char *bit_name; ++ int bit_nr; ++}; ++#endif ++ ++/* description of option parseable by parse_option() */ ++struct opt_desc { ++ /* option name. ++ ++ parsed portion of string has a form "name=value". ++ */ ++ const char *name; ++ /* type of option */ ++ opt_type_t type; ++ union { ++ /* where to store value of string option (type == OPT_STRING) */ ++ char **string; ++ /* description of bits for bit option (type == OPT_BIT) */ ++ struct { ++ int nr; ++ void *addr; ++ } bit; ++ /* description of format and targets for format option (type ++ == OPT_FORMAT) */ ++ struct { ++ const char *format; ++ int nr_args; ++ void *arg1; ++ void *arg2; ++ void *arg3; ++ void *arg4; ++ } f; ++ struct { ++ int *result; ++ const char *list[10]; ++ } oneof; ++ struct { ++ void *addr; ++ int nr_bits; ++ /* struct opt_bitmask_bit *bits; */ ++ } bitmask; ++ } u; ++}; ++ ++/** ++ * parse_option - parse one option ++ * @opt_strin: starting point of parsing ++ * @opt: option description ++ * ++ * foo=bar, ++ * ^ ^ ^ ++ * | | +-- replaced to '\0' ++ * | +-- val_start ++ * +-- opt_string ++ * Figures out option type and handles option correspondingly. ++ */ ++static int parse_option(char *opt_string, struct opt_desc *opt) ++{ ++ char *val_start; ++ int result; ++ const char *err_msg; ++ ++ /* NOTE-NIKITA think about using lib/cmdline.c functions here. 
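++
++	   For illustration (editorial sketch, panic_flag being a
++	   stand-in int): given an OPT_ONEOF descriptor such as
++
++		{ .name = "onerror",
++		  .type = OPT_ONEOF,
++		  .u = { .oneof = { .result = &panic_flag,
++				    .list = { "panic", "remount-ro" } } }
++		}
++
++	   an option string "onerror=panic" is split at '=' below and the
++	   value is matched against .list, storing the match index in
++	   *opt->u.oneof.result.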
*/ ++ ++ val_start = strchr(opt_string, '='); ++ if (val_start != NULL) { ++ *val_start = '\0'; ++ ++val_start; ++ } ++ ++ err_msg = NULL; ++ result = 0; ++ switch (opt->type) { ++ case OPT_STRING: ++ if (val_start == NULL) { ++ err_msg = "String arg missing"; ++ result = RETERR(-EINVAL); ++ } else ++ *opt->u.string = val_start; ++ break; ++ case OPT_BIT: ++ if (val_start != NULL) ++ err_msg = "Value ignored"; ++ else ++ set_bit(opt->u.bit.nr, opt->u.bit.addr); ++ break; ++ case OPT_FORMAT: ++ if (val_start == NULL) { ++ err_msg = "Formatted arg missing"; ++ result = RETERR(-EINVAL); ++ break; ++ } ++ if (sscanf(val_start, opt->u.f.format, ++ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3, ++ opt->u.f.arg4) != opt->u.f.nr_args) { ++ err_msg = "Wrong conversion"; ++ result = RETERR(-EINVAL); ++ } ++ break; ++ case OPT_ONEOF: ++ { ++ int i = 0; ++ ++ if (val_start == NULL) { ++ err_msg = "Value is missing"; ++ result = RETERR(-EINVAL); ++ break; ++ } ++ err_msg = "Wrong option value"; ++ result = RETERR(-EINVAL); ++ while (opt->u.oneof.list[i]) { ++ if (!strcmp(opt->u.oneof.list[i], val_start)) { ++ result = 0; ++ err_msg = NULL; ++ *opt->u.oneof.result = i; ++ break; ++ } ++ i++; ++ } ++ break; ++ } ++ default: ++ wrong_return_value("nikita-2100", "opt -> type"); ++ break; ++ } ++ if (err_msg != NULL) { ++ warning("nikita-2496", "%s when parsing option "%s%s%s"", ++ err_msg, opt->name, val_start ? "=" : "", ++ val_start ? : ""); ++ } ++ return result; ++} ++ ++/** ++ * parse_options - parse reiser4 mount options ++ * @opt_string: starting point ++ * @opts: array of option description ++ * @nr_opts: number of elements in @opts ++ * ++ * Parses comma separated list of reiser4 mount options. ++ */ ++static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts) ++{ ++ int result; ++ ++ result = 0; ++ while ((result == 0) && opt_string && *opt_string) { ++ int j; ++ char *next; ++ ++ next = strchr(opt_string, ','); ++ if (next != NULL) { ++ *next = '\0'; ++ ++next; ++ } ++ for (j = 0; j < nr_opts; ++j) { ++ if (!strncmp(opt_string, opts[j].name, ++ strlen(opts[j].name))) { ++ result = parse_option(opt_string, &opts[j]); ++ break; ++ } ++ } ++ if (j == nr_opts) { ++ warning("nikita-2307", "Unrecognized option: "%s"", ++ opt_string); ++ /* traditionally, -EINVAL is returned on wrong mount ++ option */ ++ result = RETERR(-EINVAL); ++ } ++ opt_string = next; ++ } ++ return result; ++} ++ ++#define NUM_OPT(label, fmt, addr) \ ++ { \ ++ .name = (label), \ ++ .type = OPT_FORMAT, \ ++ .u = { \ ++ .f = { \ ++ .format = (fmt), \ ++ .nr_args = 1, \ ++ .arg1 = (addr), \ ++ .arg2 = NULL, \ ++ .arg3 = NULL, \ ++ .arg4 = NULL \ ++ } \ ++ } \ ++ } ++ ++#define SB_FIELD_OPT(field, fmt) NUM_OPT(#field, fmt, &sbinfo->field) ++ ++#define BIT_OPT(label, bitnr) \ ++ { \ ++ .name = label, \ ++ .type = OPT_BIT, \ ++ .u = { \ ++ .bit = { \ ++ .nr = bitnr, \ ++ .addr = &sbinfo->fs_flags \ ++ } \ ++ } \ ++ } ++ ++#define MAX_NR_OPTIONS (30) ++ ++#if REISER4_DEBUG ++# define OPT_ARRAY_CHECK(opt, array) \ ++ if ((opt) > (array) + MAX_NR_OPTIONS) { \ ++ warning("zam-1046", "opt array is overloaded"); break; \ ++ } ++#else ++# define OPT_ARRAY_CHECK(opt, array) noop ++#endif ++ ++#define PUSH_OPT(opt, array, ...) 
\ ++do { \ ++ struct opt_desc o = __VA_ARGS__; \ ++ OPT_ARRAY_CHECK(opt, array); \ ++ *(opt) ++ = o; \ ++} while (0) ++ ++static noinline void push_sb_field_opts(struct opt_desc **p, ++ struct opt_desc *opts, ++ reiser4_super_info_data *sbinfo) ++{ ++#define PUSH_SB_FIELD_OPT(field, format) \ ++ PUSH_OPT(*p, opts, SB_FIELD_OPT(field, format)) ++ /* ++ * tmgr.atom_max_size=N ++ * Atoms containing more than N blocks will be forced to commit. N is ++ * decimal. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u"); ++ /* ++ * tmgr.atom_max_age=N ++ * Atoms older than N seconds will be forced to commit. N is decimal. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u"); ++ /* ++ * tmgr.atom_min_size=N ++ * In committing an atom to free dirty pages, force the atom less than ++ * N in size to fuse with another one. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u"); ++ /* ++ * tmgr.atom_max_flushers=N ++ * limit of concurrent flushers for one atom. 0 means no limit. ++ */ ++ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u"); ++ /* ++ * tree.cbk_cache_slots=N ++ * Number of slots in the cbk cache. ++ */ ++ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u"); ++ /* ++ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty ++ * leaf-level blocks it will force them to be relocated. ++ */ ++ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u"); ++ /* ++ * If flush finds can find a block allocation closer than at most ++ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that ++ * position. ++ */ ++ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u"); ++ /* ++ * If we have written this much or more blocks before encountering busy ++ * jnode in flush list - abort flushing hoping that next time we get ++ * called this jnode will be clean already, and we will save some ++ * seeks. ++ */ ++ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u"); ++ /* The maximum number of nodes to scan left on a level during flush. */ ++ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u"); ++ /* preferred IO size */ ++ PUSH_SB_FIELD_OPT(optimal_io_size, "%u"); ++ /* carry flags used for insertion of new nodes */ ++ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u"); ++ /* carry flags used for insertion of new extents */ ++ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u"); ++ /* carry flags used for paste operations */ ++ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u"); ++ /* carry flags used for insert operations */ ++ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u"); ++ ++#ifdef CONFIG_REISER4_BADBLOCKS ++ /* ++ * Alternative master superblock location in case if it's original ++ * location is not writeable/accessable. This is offset in BYTES. ++ */ ++ PUSH_SB_FIELD_OPT(altsuper, "%lu"); ++#endif ++} ++ ++/** ++ * reiser4_init_super_data - initialize reiser4 private super block ++ * @super: super block to initialize ++ * @opt_string: list of reiser4 mount options ++ * ++ * Sets various reiser4 parameters to default values. Parses mount options and ++ * overwrites default settings. 
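The option-parsing protocol above (a comma-separated list of name=value pairs, split destructively with strchr()) is easy to exercise outside the kernel. Below is a minimal userland sketch of the same splitting technique; it is an illustration only, not part of the patch:

    #include <stdio.h>
    #include <string.h>

    /* Minimal userland analogue of parse_options()/parse_option() above:
     * split a comma-separated "name=value" list destructively, in place. */
    static void parse(char *s)
    {
            char *next;
            char *val;

            while (s != NULL && *s != '\0') {
                    next = strchr(s, ',');
                    if (next != NULL)
                            *next++ = '\0';
                    val = strchr(s, '=');
                    if (val != NULL)
                            *val++ = '\0';
                    printf("option \"%s\", value \"%s\"\n",
                           s, val != NULL ? val : "(none)");
                    s = next;
            }
    }

    int main(void)
    {
            /* same syntax the mount options above use */
            char opts[] = "tmgr.atom_max_age=600,dont_load_bitmap";

            parse(opts);
            return 0;
    }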
++ */ ++int reiser4_init_super_data(struct super_block *super, char *opt_string) ++{ ++ int result; ++ struct opt_desc *opts, *p; ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ ++ /* initialize super, export, dentry operations */ ++ sbinfo->ops.super = reiser4_super_operations; ++ sbinfo->ops.export = reiser4_export_operations; ++ sbinfo->ops.dentry = reiser4_dentry_operations; ++ super->s_op = &sbinfo->ops.super; ++ super->s_export_op = &sbinfo->ops.export; ++ ++ /* initialize transaction manager parameters to default values */ ++ sbinfo->tmgr.atom_max_size = totalram_pages / 4; ++ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ; ++ sbinfo->tmgr.atom_min_size = 256; ++ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS; ++ ++ /* initialize cbk cache parameter */ ++ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS; ++ ++ /* initialize flush parameters */ ++ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD; ++ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE; ++ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD; ++ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES; ++ ++ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE; ++ ++ /* preliminary tree initializations */ ++ sbinfo->tree.super = super; ++ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS; ++ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS; ++ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS; ++ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS; ++ rwlock_init(&(sbinfo->tree.tree_lock)); ++ spin_lock_init(&(sbinfo->tree.epoch_lock)); ++ ++ /* initialize default readahead params */ ++ sbinfo->ra_params.max = num_physpages / 4; ++ sbinfo->ra_params.flags = 0; ++ ++ /* allocate memory for structure describing reiser4 mount options */ ++ opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS, ++ reiser4_ctx_gfp_mask_get()); ++ if (opts == NULL) ++ return RETERR(-ENOMEM); ++ ++ /* initialize structure describing reiser4 mount options */ ++ p = opts; ++ ++ push_sb_field_opts(&p, opts, sbinfo); ++ /* turn on BSD-style gid assignment */ ++ ++#define PUSH_BIT_OPT(name, bit) \ ++ PUSH_OPT(p, opts, BIT_OPT(name, bit)) ++ ++ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID); ++ /* turn on 32 bit times */ ++ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES); ++ /* ++ * Don't load all bitmap blocks at mount time, it is useful for ++ * machines with tiny RAM and large disks. ++ */ ++ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP); ++ /* disable transaction commits during write() */ ++ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE); ++ /* disable use of write barriers in the reiser4 log writer. 
*/ ++ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER); ++ ++ PUSH_OPT(p, opts, ++ { ++ /* ++ * tree traversal readahead parameters: ++ * -o readahead:MAXNUM:FLAGS ++ * MAXNUM - max number of nodes to request readahead for: -1UL ++ * will set it to max_sane_readahead() ++ * FLAGS - combination of bits: RA_ADJACENT_ONLY, RA_ALL_LEVELS, ++ * CONTINUE_ON_PRESENT ++ */ ++ .name = "readahead", ++ .type = OPT_FORMAT, ++ .u = { ++ .f = { ++ .format = "%u:%u", ++ .nr_args = 2, ++ .arg1 = &sbinfo->ra_params.max, ++ .arg2 = &sbinfo->ra_params.flags, ++ .arg3 = NULL, ++ .arg4 = NULL ++ } ++ } ++ } ++ ); ++ ++ /* What to do in case of fs error */ ++ PUSH_OPT(p, opts, ++ { ++ .name = "onerror", ++ .type = OPT_ONEOF, ++ .u = { ++ .oneof = { ++ .result = &sbinfo->onerror, ++ .list = { ++ "panic", "remount-ro", NULL ++ }, ++ } ++ } ++ } ++ ); ++ ++ /* modify default settings to values set by mount options */ ++ result = parse_options(opt_string, opts, p - opts); ++ kfree(opts); ++ if (result != 0) ++ return result; ++ ++ /* correct settings to sanity values */ ++ sbinfo->tmgr.atom_max_age *= HZ; ++ if (sbinfo->tmgr.atom_max_age <= 0) ++ /* overflow */ ++ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE; ++ ++ /* round optimal io size down to a multiple of 512 bytes */ ++ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS; ++ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS; ++ if (sbinfo->optimal_io_size == 0) { ++ warning("nikita-2497", "optimal_io_size is too small"); ++ return RETERR(-EINVAL); ++ } ++ return result; ++} ++ ++/** ++ * reiser4_init_read_super - read reiser4 master super block ++ * @super: super block to fill ++ * @silent: if 0 - print warnings ++ * ++ * Reads the reiser4 master super block either from the predefined location or ++ * from the location specified by the altsuper mount option, and initializes ++ * the disk format plugin. ++ */ ++int reiser4_init_read_super(struct super_block *super, int silent) ++{ ++ struct buffer_head *super_bh; ++ struct reiser4_master_sb *master_sb; ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ unsigned long blocksize; ++ ++ read_super_block: ++#ifdef CONFIG_REISER4_BADBLOCKS ++ if (sbinfo->altsuper) ++ /* ++ * read reiser4 master super block at position specified by ++ * mount option ++ */ ++ super_bh = sb_bread(super, ++ (sector_t)(sbinfo->altsuper / super->s_blocksize)); ++ else ++#endif ++ /* read reiser4 master super block at the 16th 4096-byte block */ ++ super_bh = sb_bread(super, ++ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize)); ++ if (!super_bh) ++ return RETERR(-EIO); ++ ++ master_sb = (struct reiser4_master_sb *)super_bh->b_data; ++ /* check reiser4 magic string */ ++ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING, ++ sizeof(REISER4_SUPER_MAGIC_STRING))) { ++ /* reiser4 master super block contains filesystem blocksize */ ++ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize)); ++ ++ if (blocksize != PAGE_CACHE_SIZE) { ++ /* ++ * currently reiser4's blocksize must be equal to ++ * pagesize ++ */ ++ if (!silent) ++ warning("nikita-2609", ++ "%s: wrong block size %ld\n", super->s_id, ++ blocksize); ++ brelse(super_bh); ++ return RETERR(-EINVAL); ++ } ++ if (blocksize != super->s_blocksize) { ++ /* ++ * filesystem uses different blocksize. 
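reiser4_init_read_super() follows a common probing pattern: read at a fixed byte offset, compare a magic string, then pull the real block size out of the super block and retry with it. A userland sketch of that pattern follows; the magic value and the byte offset of the blocksize field inside struct reiser4_master_sb are assumptions made for illustration, not taken from this hunk:

    #include <stdio.h>
    #include <string.h>

    #define MAGIC_OFFSET (16 * 4096) /* REISER4_MAGIC_OFFSET per the comment above */
    #define MAGIC "ReIsEr4"          /* assumed value of REISER4_SUPER_MAGIC_STRING */

    /* Probe an image for a master super block: fixed-offset read, magic
     * compare, little-endian decode of the blocksize field. The field
     * offset (18) is an assumption for illustration. */
    static long probe_blocksize(FILE *dev)
    {
            unsigned char buf[32];

            if (fseek(dev, MAGIC_OFFSET, SEEK_SET) != 0 ||
                fread(buf, sizeof(buf), 1, dev) != 1)
                    return -1;
            if (memcmp(buf, MAGIC, sizeof(MAGIC)) != 0)
                    return -1; /* no reiser4 magic at this offset */
            return (long)buf[18] | ((long)buf[19] << 8);
    }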
Reread master ++ * super block with correct blocksize ++ */ ++ brelse(super_bh); ++ if (!sb_set_blocksize(super, (int)blocksize)) ++ return RETERR(-EINVAL); ++ goto read_super_block; ++ } ++ ++ sbinfo->df_plug = ++ disk_format_plugin_by_id( ++ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); ++ if (sbinfo->df_plug == NULL) { ++ if (!silent) ++ warning("nikita-26091", ++ "%s: unknown disk format plugin %d\n", ++ super->s_id, ++ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); ++ brelse(super_bh); ++ return RETERR(-EINVAL); ++ } ++ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap)); ++ brelse(super_bh); ++ return 0; ++ } ++ ++ /* there is no reiser4 on the device */ ++ if (!silent) ++ warning("nikita-2608", ++ "%s: wrong master super block magic", super->s_id); ++ brelse(super_bh); ++ return RETERR(-EINVAL); ++} ++ ++static struct { ++ reiser4_plugin_type type; ++ reiser4_plugin_id id; ++} default_plugins[PSET_LAST] = { ++ [PSET_FILE] = { ++ .type = REISER4_FILE_PLUGIN_TYPE, ++ .id = UNIX_FILE_PLUGIN_ID ++ }, ++ [PSET_DIR] = { ++ .type = REISER4_DIR_PLUGIN_TYPE, ++ .id = HASHED_DIR_PLUGIN_ID ++ }, ++ [PSET_HASH] = { ++ .type = REISER4_HASH_PLUGIN_TYPE, ++ .id = R5_HASH_ID ++ }, ++ [PSET_FIBRATION] = { ++ .type = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_DOT_O ++ }, ++ [PSET_PERM] = { ++ .type = REISER4_PERM_PLUGIN_TYPE, ++ .id = NULL_PERM_ID ++ }, ++ [PSET_FORMATTING] = { ++ .type = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = SMALL_FILE_FORMATTING_ID ++ }, ++ [PSET_SD] = { ++ .type = REISER4_ITEM_PLUGIN_TYPE, ++ .id = STATIC_STAT_DATA_ID ++ }, ++ [PSET_DIR_ITEM] = { ++ .type = REISER4_ITEM_PLUGIN_TYPE, ++ .id = COMPOUND_DIR_ID ++ }, ++ [PSET_CIPHER] = { ++ .type = REISER4_CIPHER_PLUGIN_TYPE, ++ .id = NONE_CIPHER_ID ++ }, ++ [PSET_DIGEST] = { ++ .type = REISER4_DIGEST_PLUGIN_TYPE, ++ .id = SHA256_32_DIGEST_ID ++ }, ++ [PSET_COMPRESSION] = { ++ .type = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .id = LZO1_COMPRESSION_ID ++ }, ++ [PSET_COMPRESSION_MODE] = { ++ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = CONVX_COMPRESSION_MODE_ID ++ }, ++ [PSET_CLUSTER] = { ++ .type = REISER4_CLUSTER_PLUGIN_TYPE, ++ .id = CLUSTER_64K_ID ++ }, ++ [PSET_CREATE] = { ++ .type = REISER4_FILE_PLUGIN_TYPE, ++ .id = UNIX_FILE_PLUGIN_ID ++ } ++}; ++ ++/* access to default plugin table */ ++reiser4_plugin *get_default_plugin(pset_member memb) ++{ ++ return plugin_by_id(default_plugins[memb].type, ++ default_plugins[memb].id); ++} ++ ++/** ++ * reiser4_init_root_inode - obtain inode of root directory ++ * @super: super block of filesystem ++ * ++ * Obtains inode of root directory (reading it from disk), initializes plugin ++ * set it was not initialized. 
++ */ ++int reiser4_init_root_inode(struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ struct inode *inode; ++ int result = 0; ++ ++ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0); ++ if (IS_ERR(inode)) ++ return RETERR(PTR_ERR(inode)); ++ ++ super->s_root = d_alloc_root(inode); ++ if (!super->s_root) { ++ iput(inode); ++ return RETERR(-ENOMEM); ++ } ++ ++ super->s_root->d_op = &sbinfo->ops.dentry; ++ ++ if (!is_inode_loaded(inode)) { ++ pset_member memb; ++ plugin_set *pset; ++ ++ pset = reiser4_inode_data(inode)->pset; ++ for (memb = 0; memb < PSET_LAST; ++memb) { ++ ++ if (aset_get(pset, memb) != NULL) ++ continue; ++ ++ result = grab_plugin_pset(inode, NULL, memb); ++ if (result != 0) ++ break; ++ ++ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); ++ } ++ ++ if (result == 0) { ++ if (REISER4_DEBUG) { ++ for (memb = 0; memb < PSET_LAST; ++memb) ++ assert("nikita-3500", ++ aset_get(pset, memb) != NULL); ++ } ++ } else ++ warning("nikita-3448", "Cannot set plugins of root: %i", ++ result); ++ reiser4_iget_complete(inode); ++ ++ /* As the default pset kept in the root dir may has been changed ++ (length is unknown), call update_sd. */ ++ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { ++ result = reiser4_grab_space( ++ inode_file_plugin(inode)->estimate.update(inode), ++ BA_CAN_COMMIT); ++ ++ if (result == 0) ++ result = reiser4_update_sd(inode); ++ ++ all_grabbed2free(); ++ } ++ } ++ ++ super->s_maxbytes = MAX_LFS_FILESIZE; ++ return result; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/inode.c linux-2.6.33/fs/reiser4/inode.c +--- linux-2.6.33.orig/fs/reiser4/inode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/inode.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,711 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* Inode specific operations. 
*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "kassign.h" ++#include "coord.h" ++#include "seal.h" ++#include "dscale.h" ++#include "plugin/item/item.h" ++#include "plugin/security/perm.h" ++#include "plugin/plugin.h" ++#include "plugin/object.h" ++#include "znode.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/fs.h> /* for struct super_block, address_space */ ++ ++/* return reiser4 internal tree which inode belongs to */ ++/* Audited by: green(2002.06.17) */ ++reiser4_tree *reiser4_tree_by_inode(const struct inode *inode/* inode queried*/) ++{ ++ assert("nikita-256", inode != NULL); ++ assert("nikita-257", inode->i_sb != NULL); ++ return reiser4_get_tree(inode->i_sb); ++} ++ ++/* return reiser4-specific inode flags */ ++static inline unsigned long *inode_flags(const struct inode *const inode) ++{ ++ assert("nikita-2842", inode != NULL); ++ return &reiser4_inode_data(inode)->flags; ++} ++ ++/* set reiser4-specific flag @f in @inode */ ++void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f) ++{ ++ assert("nikita-2248", inode != NULL); ++ set_bit((int)f, inode_flags(inode)); ++} ++ ++/* clear reiser4-specific flag @f in @inode */ ++void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f) ++{ ++ assert("nikita-2250", inode != NULL); ++ clear_bit((int)f, inode_flags(inode)); ++} ++ ++/* true if reiser4-specific flag @f is set in @inode */ ++int reiser4_inode_get_flag(const struct inode *inode, ++ reiser4_file_plugin_flags f) ++{ ++ assert("nikita-2251", inode != NULL); ++ return test_bit((int)f, inode_flags(inode)); ++} ++ ++/* convert oid to inode number */ ++ino_t oid_to_ino(oid_t oid) ++{ ++ return (ino_t) oid; ++} ++ ++/* convert oid to user visible inode number */ ++ino_t oid_to_uino(oid_t oid) ++{ ++ /* reiser4 object is uniquely identified by oid which is 64 bit ++ quantity. Kernel in-memory inode is indexed (in the hash table) by ++ 32 bit i_ino field, but this is not a problem, because there is a ++ way to further distinguish inodes with identical inode numbers ++ (find_actor supplied to iget()). ++ ++ But user space expects unique 32 bit inode number. Obviously this ++ is impossible. Work-around is to somehow hash oid into user visible ++ inode number. ++ */ ++ oid_t max_ino = (ino_t) ~0; ++ ++ if (REISER4_INO_IS_OID || (oid <= max_ino)) ++ return oid; ++ else ++ /* this is remotely similar to algorithm used to find next pid ++ to use for process: after wrap-around start from some ++ offset rather than from 0. Idea is that there are some long ++ living objects with which we don't want to collide. ++ */ ++ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1)); ++} ++ ++/* check that "inode" is on reiser4 file-system */ ++int is_reiser4_inode(const struct inode *inode/* inode queried */) ++{ ++ return inode != NULL && is_reiser4_super(inode->i_sb); ++} ++ ++/* Maximal length of a name that can be stored in directory @inode. ++ ++ This is used in check during file creation and lookup. */ ++int reiser4_max_filename_len(const struct inode *inode/* inode queried */) ++{ ++ assert("nikita-287", is_reiser4_inode(inode)); ++ assert("nikita-1710", inode_dir_item_plugin(inode)); ++ if (inode_dir_item_plugin(inode)->s.dir.max_name_len) ++ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode); ++ else ++ return 255; ++} ++ ++#if REISER4_USE_COLLISION_LIMIT ++/* Maximal number of hash collisions for this directory. 
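The wrap-around mapping in oid_to_uino() above can be checked in isolation; this standalone snippet re-evaluates the same expression for a small and a large oid. REISER4_UINO_SHIFT is given a placeholder value here; the real constant is defined elsewhere in reiser4:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t oid_t;
    typedef uint32_t ino32_t; /* stand-in for a 32-bit ino_t */

    #define REISER4_UINO_SHIFT 0x10000 /* placeholder, not the real constant */

    /* Same expression as oid_to_uino() above, for the 32-bit ino_t case. */
    static ino32_t uino(oid_t oid)
    {
            oid_t max_ino = (ino32_t)~0;

            if (oid <= max_ino)
                    return (ino32_t)oid;
            return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
    }

    int main(void)
    {
            printf("%u\n", uino(42));             /* small oids map 1:1 */
            printf("%u\n", uino((oid_t)1 << 40)); /* large oids wrap above the shift */
            return 0;
    }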
*/ ++int max_hash_collisions(const struct inode *dir/* inode queried */) ++{ ++ assert("nikita-1711", dir != NULL); ++ return reiser4_inode_data(dir)->plugin.max_collisions; ++} ++#endif /* REISER4_USE_COLLISION_LIMIT */ ++ ++/* Install file, inode, and address_space operation on @inode, depending on ++ its mode. */ ++int setup_inode_ops(struct inode *inode /* inode to intialize */ , ++ reiser4_object_create_data * data /* parameters to create ++ * object */ ) ++{ ++ reiser4_super_info_data *sinfo; ++ file_plugin *fplug; ++ dir_plugin *dplug; ++ ++ fplug = inode_file_plugin(inode); ++ dplug = inode_dir_plugin(inode); ++ ++ sinfo = get_super_private(inode->i_sb); ++ ++ switch (inode->i_mode & S_IFMT) { ++ case S_IFSOCK: ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ { ++ dev_t rdev; /* to keep gcc happy */ ++ ++ assert("vs-46", fplug != NULL); ++ /* ugly hack with rdev */ ++ if (data == NULL) { ++ rdev = inode->i_rdev; ++ inode->i_rdev = 0; ++ } else ++ rdev = data->rdev; ++ inode->i_blocks = 0; ++ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID); ++ inode->i_op = file_plugins[fplug->h.id].inode_ops; ++ /* initialize inode->i_fop and inode->i_rdev for block ++ and char devices */ ++ init_special_inode(inode, inode->i_mode, rdev); ++ /* all address space operations are null */ ++ inode->i_mapping->a_ops = ++ file_plugins[fplug->h.id].as_ops; ++ break; ++ } ++ case S_IFLNK: ++ assert("vs-46", fplug != NULL); ++ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID); ++ inode->i_op = file_plugins[fplug->h.id].inode_ops; ++ inode->i_fop = NULL; ++ /* all address space operations are null */ ++ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops; ++ break; ++ case S_IFDIR: ++ assert("vs-46", dplug != NULL); ++ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID || ++ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID)); ++ inode->i_op = dir_plugins[dplug->h.id].inode_ops; ++ inode->i_fop = dir_plugins[dplug->h.id].file_ops; ++ inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops; ++ break; ++ case S_IFREG: ++ assert("vs-46", fplug != NULL); ++ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID || ++ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID)); ++ inode->i_op = file_plugins[fplug->h.id].inode_ops; ++ inode->i_fop = file_plugins[fplug->h.id].file_ops; ++ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops; ++ break; ++ default: ++ warning("nikita-291", "wrong file mode: %o for %llu", ++ inode->i_mode, ++ (unsigned long long)get_inode_oid(inode)); ++ reiser4_make_bad_inode(inode); ++ return RETERR(-EINVAL); ++ } ++ return 0; ++} ++ ++/* Initialize inode from disk data. Called with inode locked. ++ Return inode locked. 
*/ ++static int init_inode(struct inode *inode /* inode to intialise */ , ++ coord_t *coord/* coord of stat data */) ++{ ++ int result; ++ item_plugin *iplug; ++ void *body; ++ int length; ++ reiser4_inode *state; ++ ++ assert("nikita-292", coord != NULL); ++ assert("nikita-293", inode != NULL); ++ ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (result) ++ return result; ++ iplug = item_plugin_by_coord(coord); ++ body = item_body_by_coord(coord); ++ length = item_length_by_coord(coord); ++ ++ assert("nikita-295", iplug != NULL); ++ assert("nikita-296", body != NULL); ++ assert("nikita-297", length > 0); ++ ++ /* inode is under I_LOCK now */ ++ ++ state = reiser4_inode_data(inode); ++ /* call stat-data plugin method to load sd content into inode */ ++ result = iplug->s.sd.init_inode(inode, body, length); ++ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug)); ++ if (result == 0) { ++ result = setup_inode_ops(inode, NULL); ++ if (result == 0 && inode->i_sb->s_root && ++ inode->i_sb->s_root->d_inode) ++ result = finish_pset(inode); ++ } ++ zrelse(coord->node); ++ return result; ++} ++ ++/* read `inode' from the disk. This is what was previously in ++ reiserfs_read_inode2(). ++ ++ Must be called with inode locked. Return inode still locked. ++*/ ++static int read_inode(struct inode *inode /* inode to read from disk */ , ++ const reiser4_key * key /* key of stat data */ , ++ int silent) ++{ ++ int result; ++ lock_handle lh; ++ reiser4_inode *info; ++ coord_t coord; ++ ++ assert("nikita-298", inode != NULL); ++ assert("nikita-1945", !is_inode_loaded(inode)); ++ ++ info = reiser4_inode_data(inode); ++ assert("nikita-300", info->locality_id != 0); ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ /* locate stat-data in a tree and return znode locked */ ++ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent); ++ assert("nikita-301", !is_inode_loaded(inode)); ++ if (result == 0) { ++ /* use stat-data plugin to load sd into inode. */ ++ result = init_inode(inode, &coord); ++ if (result == 0) { ++ /* initialize stat-data seal */ ++ spin_lock_inode(inode); ++ reiser4_seal_init(&info->sd_seal, &coord, key); ++ info->sd_coord = coord; ++ spin_unlock_inode(inode); ++ ++ /* call file plugin's method to initialize plugin ++ * specific part of inode */ ++ if (inode_file_plugin(inode)->init_inode_data) ++ inode_file_plugin(inode)->init_inode_data(inode, ++ NULL, ++ 0); ++ /* load detached directory cursors for stateless ++ * directory readers (NFS). */ ++ reiser4_load_cursors(inode); ++ ++ /* Check the opened inode for consistency. */ ++ result = ++ get_super_private(inode->i_sb)->df_plug-> ++ check_open(inode); ++ } ++ } ++ /* lookup_sd() doesn't release coord because we want znode ++ stay read-locked while stat-data fields are accessed in ++ init_inode() */ ++ done_lh(&lh); ++ ++ if (result != 0) ++ reiser4_make_bad_inode(inode); ++ return result; ++} ++ ++/* initialise new reiser4 inode being inserted into hash table. */ ++static int init_locked_inode(struct inode *inode /* new inode */ , ++ void *opaque /* key of stat data passed to ++ * the iget5_locked as cookie */) ++{ ++ reiser4_key *key; ++ ++ assert("nikita-1995", inode != NULL); ++ assert("nikita-1996", opaque != NULL); ++ key = opaque; ++ set_inode_oid(inode, get_key_objectid(key)); ++ reiser4_inode_data(inode)->locality_id = get_key_locality(key); ++ return 0; ++} ++ ++/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to ++ iget5_locked(). 
++ ++ This function is called by iget5_locked() to distinguish reiser4 inodes ++ having the same inode numbers. Such inodes can only exist due to some error ++ condition. One of them should be bad. Inodes with identical inode numbers ++ (objectids) are distinguished by their packing locality. ++ ++*/ ++static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table ++ * to check */ , ++ void *opaque /* "cookie" passed to ++ * iget5_locked(). This ++ * is stat-data key */) ++{ ++ reiser4_key *key; ++ ++ key = opaque; ++ return ++ /* oid is unique, so first term is enough, actually. */ ++ get_inode_oid(inode) == get_key_objectid(key) && ++ /* ++ * also, locality should be checked, but locality is stored in ++ * the reiser4-specific part of the inode, and actor can be ++ * called against arbitrary inode that happened to be in this ++ * hash chain. Hence we first have to check that this is ++ * reiser4 inode at least. is_reiser4_inode() is probably too ++ * early to call, as inode may have ->i_op not yet ++ * initialised. ++ */ ++ is_reiser4_super(inode->i_sb) && ++ /* ++ * usually objectid is unique, but pseudo files use counter to ++ * generate objectid. All pseudo files are placed into special ++ * (otherwise unused) locality. ++ */ ++ reiser4_inode_data(inode)->locality_id == get_key_locality(key); ++} ++ ++/* hook for kmem_cache_create */ ++void loading_init_once(reiser4_inode * info) ++{ ++ mutex_init(&info->loading); ++} ++ ++/* for reiser4_alloc_inode */ ++void loading_alloc(reiser4_inode * info) ++{ ++ assert("vs-1717", !mutex_is_locked(&info->loading)); ++} ++ ++/* for reiser4_destroy */ ++void loading_destroy(reiser4_inode * info) ++{ ++ assert("vs-1717a", !mutex_is_locked(&info->loading)); ++} ++ ++static void loading_begin(reiser4_inode * info) ++{ ++ mutex_lock(&info->loading); ++} ++ ++static void loading_end(reiser4_inode * info) ++{ ++ mutex_unlock(&info->loading); ++} ++ ++/** ++ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary ++ * @super: super block of filesystem ++ * @key: key of inode's stat-data ++ * @silent: ++ * ++ * This is our helper function a la iget(). This is be called by ++ * lookup_common() and reiser4_read_super(). Return inode locked or error ++ * encountered. ++ */ ++struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key, ++ int silent) ++{ ++ struct inode *inode; ++ int result; ++ reiser4_inode *info; ++ ++ assert("nikita-302", super != NULL); ++ assert("nikita-303", key != NULL); ++ ++ result = 0; ++ ++ /* call iget(). Our ->read_inode() is dummy, so this will either ++ find inode in cache or return uninitialised inode */ ++ inode = iget5_locked(super, ++ (unsigned long)get_key_objectid(key), ++ reiser4_inode_find_actor, ++ init_locked_inode, (reiser4_key *) key); ++ if (inode == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ if (is_bad_inode(inode)) { ++ warning("nikita-304", "Bad inode found"); ++ reiser4_print_key("key", key); ++ iput(inode); ++ return ERR_PTR(RETERR(-EIO)); ++ } ++ ++ info = reiser4_inode_data(inode); ++ ++ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully ++ loaded and initialized inode from just allocated inode. If ++ REISER4_LOADED bit is not set, reiser4_iget() completes loading under ++ info->loading. 
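The is_inode_loaded()/loading_begin() dance in reiser4_iget() is the classic check/lock/re-check initialization idiom. A compact pthread-based sketch of the same shape (simplified: the kernel version hands the still-locked mutex over to reiser4_iget_complete(), while this sketch folds lock and unlock into one function):

    #include <pthread.h>
    #include <stdbool.h>

    struct object {
            pthread_mutex_t loading; /* plays the role of info->loading */
            bool loaded;             /* plays the role of REISER4_LOADED */
    };

    static int load_from_disk(struct object *o)
    {
            (void)o; /* stand-in for the read_inode() slow path */
            return 0;
    }

    /* Check/lock/re-check: only one thread runs the expensive load, and
     * threads that lose the race still see a fully initialized object. */
    static int object_get(struct object *o)
    {
            int result = 0;

            if (!o->loaded) {
                    pthread_mutex_lock(&o->loading);
                    if (!o->loaded) {
                            result = load_from_disk(o);
                            if (result == 0)
                                    o->loaded = true;
                    }
                    pthread_mutex_unlock(&o->loading);
            }
            return result;
    }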
The place in reiser4 which uses not initialized inode ++ is the reiser4 repacker, see repacker-related functions in ++ plugin/item/extent.c */ ++ if (!is_inode_loaded(inode)) { ++ loading_begin(info); ++ if (!is_inode_loaded(inode)) { ++ /* locking: iget5_locked returns locked inode */ ++ assert("nikita-1941", !is_inode_loaded(inode)); ++ assert("nikita-1949", ++ reiser4_inode_find_actor(inode, ++ (reiser4_key *) key)); ++ /* now, inode has objectid as ->i_ino and locality in ++ reiser4-specific part. This is enough for ++ read_inode() to read stat data from the disk */ ++ result = read_inode(inode, key, silent); ++ } else ++ loading_end(info); ++ } ++ ++ if (inode->i_state & I_NEW) ++ unlock_new_inode(inode); ++ ++ if (is_bad_inode(inode)) { ++ assert("vs-1717", result != 0); ++ loading_end(info); ++ iput(inode); ++ inode = ERR_PTR(result); ++ } else if (REISER4_DEBUG) { ++ reiser4_key found_key; ++ ++ assert("vs-1717", result == 0); ++ build_sd_key(inode, &found_key); ++ if (!keyeq(&found_key, key)) { ++ warning("nikita-305", "Wrong key in sd"); ++ reiser4_print_key("sought for", key); ++ reiser4_print_key("found", &found_key); ++ } ++ if (inode->i_nlink == 0) { ++ warning("nikita-3559", "Unlinked inode found: %llu\n", ++ (unsigned long long)get_inode_oid(inode)); ++ } ++ } ++ return inode; ++} ++ ++/* reiser4_iget() may return not fully initialized inode, this function should ++ * be called after one completes reiser4 inode initializing. */ ++void reiser4_iget_complete(struct inode *inode) ++{ ++ assert("zam-988", is_reiser4_inode(inode)); ++ ++ if (!is_inode_loaded(inode)) { ++ reiser4_inode_set_flag(inode, REISER4_LOADED); ++ loading_end(reiser4_inode_data(inode)); ++ } ++} ++ ++void reiser4_make_bad_inode(struct inode *inode) ++{ ++ assert("nikita-1934", inode != NULL); ++ ++ /* clear LOADED bit */ ++ reiser4_inode_clr_flag(inode, REISER4_LOADED); ++ make_bad_inode(inode); ++ return; ++} ++ ++file_plugin *inode_file_plugin(const struct inode *inode) ++{ ++ assert("nikita-1997", inode != NULL); ++ return reiser4_inode_data(inode)->pset->file; ++} ++ ++dir_plugin *inode_dir_plugin(const struct inode *inode) ++{ ++ assert("nikita-1998", inode != NULL); ++ return reiser4_inode_data(inode)->pset->dir; ++} ++ ++formatting_plugin *inode_formatting_plugin(const struct inode *inode) ++{ ++ assert("nikita-2000", inode != NULL); ++ return reiser4_inode_data(inode)->pset->formatting; ++} ++ ++hash_plugin *inode_hash_plugin(const struct inode *inode) ++{ ++ assert("nikita-2001", inode != NULL); ++ return reiser4_inode_data(inode)->pset->hash; ++} ++ ++fibration_plugin *inode_fibration_plugin(const struct inode *inode) ++{ ++ assert("nikita-2001", inode != NULL); ++ return reiser4_inode_data(inode)->pset->fibration; ++} ++ ++cipher_plugin *inode_cipher_plugin(const struct inode *inode) ++{ ++ assert("edward-36", inode != NULL); ++ return reiser4_inode_data(inode)->pset->cipher; ++} ++ ++compression_plugin *inode_compression_plugin(const struct inode *inode) ++{ ++ assert("edward-37", inode != NULL); ++ return reiser4_inode_data(inode)->pset->compression; ++} ++ ++compression_mode_plugin *inode_compression_mode_plugin(const struct inode * ++ inode) ++{ ++ assert("edward-1330", inode != NULL); ++ return reiser4_inode_data(inode)->pset->compression_mode; ++} ++ ++cluster_plugin *inode_cluster_plugin(const struct inode *inode) ++{ ++ assert("edward-1328", inode != NULL); ++ return reiser4_inode_data(inode)->pset->cluster; ++} ++ ++file_plugin *inode_create_plugin(const struct inode *inode) ++{ ++ 
assert("edward-1329", inode != NULL); ++ return reiser4_inode_data(inode)->pset->create; ++} ++ ++digest_plugin *inode_digest_plugin(const struct inode *inode) ++{ ++ assert("edward-86", inode != NULL); ++ return reiser4_inode_data(inode)->pset->digest; ++} ++ ++item_plugin *inode_sd_plugin(const struct inode *inode) ++{ ++ assert("vs-534", inode != NULL); ++ return reiser4_inode_data(inode)->pset->sd; ++} ++ ++item_plugin *inode_dir_item_plugin(const struct inode *inode) ++{ ++ assert("vs-534", inode != NULL); ++ return reiser4_inode_data(inode)->pset->dir_item; ++} ++ ++file_plugin *child_create_plugin(const struct inode *inode) ++{ ++ assert("edward-1329", inode != NULL); ++ return reiser4_inode_data(inode)->hset->create; ++} ++ ++void inode_set_extension(struct inode *inode, sd_ext_bits ext) ++{ ++ reiser4_inode *state; ++ ++ assert("nikita-2716", inode != NULL); ++ assert("nikita-2717", ext < LAST_SD_EXTENSION); ++ assert("nikita-3491", spin_inode_is_locked(inode)); ++ ++ state = reiser4_inode_data(inode); ++ state->extmask |= 1 << ext; ++ /* force re-calculation of stat-data length on next call to ++ update_sd(). */ ++ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); ++} ++ ++void inode_clr_extension(struct inode *inode, sd_ext_bits ext) ++{ ++ reiser4_inode *state; ++ ++ assert("vpf-1926", inode != NULL); ++ assert("vpf-1927", ext < LAST_SD_EXTENSION); ++ assert("vpf-1928", spin_inode_is_locked(inode)); ++ ++ state = reiser4_inode_data(inode); ++ state->extmask &= ~(1 << ext); ++ /* force re-calculation of stat-data length on next call to ++ update_sd(). */ ++ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); ++} ++ ++void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new) ++{ ++ assert("edward-1287", inode != NULL); ++ if (!dscale_fit(old, new)) ++ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); ++ return; ++} ++ ++void inode_check_scale(struct inode *inode, __u64 old, __u64 new) ++{ ++ assert("nikita-2875", inode != NULL); ++ spin_lock_inode(inode); ++ inode_check_scale_nolock(inode, old, new); ++ spin_unlock_inode(inode); ++} ++ ++/* ++ * initialize ->ordering field of inode. This field defines how file stat-data ++ * and body is ordered within a tree with respect to other objects within the ++ * same parent directory. 
++ */ ++void ++init_inode_ordering(struct inode *inode, ++ reiser4_object_create_data * crd, int create) ++{ ++ reiser4_key key; ++ ++ if (create) { ++ struct inode *parent; ++ ++ parent = crd->parent; ++ assert("nikita-3224", inode_dir_plugin(parent) != NULL); ++ inode_dir_plugin(parent)->build_entry_key(parent, ++ &crd->dentry->d_name, ++ &key); ++ } else { ++ coord_t *coord; ++ ++ coord = &reiser4_inode_data(inode)->sd_coord; ++ coord_clear_iplug(coord); ++ /* safe to use ->sd_coord, because node is under long term ++ * lock */ ++ WITH_DATA(coord->node, item_key_by_coord(coord, &key)); ++ } ++ ++ set_inode_ordering(inode, get_key_ordering(&key)); ++} ++ ++znode *inode_get_vroot(struct inode *inode) ++{ ++ reiser4_block_nr blk; ++ znode *result; ++ ++ spin_lock_inode(inode); ++ blk = reiser4_inode_data(inode)->vroot; ++ spin_unlock_inode(inode); ++ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk)) ++ result = zlook(reiser4_tree_by_inode(inode), &blk); ++ else ++ result = NULL; ++ return result; ++} ++ ++void inode_set_vroot(struct inode *inode, znode *vroot) ++{ ++ spin_lock_inode(inode); ++ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot); ++ spin_unlock_inode(inode); ++} ++ ++#if REISER4_DEBUG ++ ++void reiser4_inode_invariant(const struct inode *inode) ++{ ++ assert("nikita-3077", spin_inode_is_locked(inode)); ++} ++ ++int inode_has_no_jnodes(reiser4_inode * r4_inode) ++{ ++ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL && ++ r4_inode->nr_jnodes == 0; ++} ++ ++#endif ++ ++/* true if directory is empty (only contains dot and dotdot) */ ++/* FIXME: shouldn't it be dir plugin method? */ ++int is_dir_empty(const struct inode *dir) ++{ ++ assert("nikita-1976", dir != NULL); ++ ++ /* rely on our method to maintain directory i_size being equal to the ++ number of entries. */ ++ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/inode.h linux-2.6.33/fs/reiser4/inode.h +--- linux-2.6.33.orig/fs/reiser4/inode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/inode.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,453 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* Inode functions. */ ++ ++#if !defined(__REISER4_INODE_H__) ++#define __REISER4_INODE_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "seal.h" ++#include "plugin/plugin.h" ++#include "plugin/file/cryptcompress.h" ++#include "plugin/file/file.h" ++#include "plugin/dir/dir.h" ++#include "plugin/plugin_set.h" ++#include "plugin/security/perm.h" ++#include "vfs_ops.h" ++#include "jnode.h" ++#include "fsdata.h" ++ ++#include <linux/types.h> /* for __u?? , ino_t */ ++#include <linux/fs.h> /* for struct super_block, struct ++ * rw_semaphore, etc */ ++#include <linux/spinlock.h> ++#include <asm/types.h> ++ ++/* reiser4-specific inode flags. They are "transient" and are not ++ supposed to be stored on disk. Used to trace "state" of ++ inode ++*/ ++typedef enum { ++ /* this is light-weight inode, inheriting some state from its ++ parent */ ++ REISER4_LIGHT_WEIGHT = 0, ++ /* stat data wasn't yet created */ ++ REISER4_NO_SD = 1, ++ /* internal immutable flag. Currently is only used ++ to avoid race condition during file creation. ++ See comment in create_object(). 
*/ ++ REISER4_IMMUTABLE = 2, ++ /* inode was read from storage */ ++ REISER4_LOADED = 3, ++ /* this bit is set for symlinks. inode->i_private points to target ++ name of symlink. */ ++ REISER4_GENERIC_PTR_USED = 4, ++ /* set if size of stat-data item for this inode is known. If this is ++ * set we can avoid recalculating size of stat-data on each update. */ ++ REISER4_SDLEN_KNOWN = 5, ++ /* reiser4_inode->crypt points to the crypto stat */ ++ REISER4_CRYPTO_STAT_LOADED = 6, ++ /* cryptcompress_inode_data points to the secret key */ ++ REISER4_SECRET_KEY_INSTALLED = 7, ++ /* File (possibly) has pages corresponding to the tail items, that ++ * were created by ->readpage. It is set by mmap_unix_file() and ++ * sendfile_unix_file(). This bit is inspected by write_unix_file and ++ * kill-hook of tail items. It is never cleared once set. This bit is ++ * modified and inspected under i_mutex. */ ++ REISER4_HAS_MMAP = 8, ++ REISER4_PART_MIXED = 9, ++ REISER4_PART_IN_CONV = 10, ++ /* This flag indicates that file plugin conversion is in progress */ ++ REISER4_FILE_CONV_IN_PROGRESS = 11 ++} reiser4_file_plugin_flags; ++ ++/* state associated with each inode. ++ reiser4 inode. ++ ++ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes ++ be of the same size. File-system allocates inodes by itself through ++ s_op->allocate_inode() method. So, it is possible to adjust size of inode ++ at the time of its creation. ++ ++ Invariants involving parts of this data-type: ++ ++ [inode->eflushed] ++ ++*/ ++ ++typedef struct reiser4_inode reiser4_inode; ++/* return pointer to reiser4-specific part of inode */ ++static inline reiser4_inode *reiser4_inode_data(const struct inode *inode ++ /* inode queried */ ); ++ ++#if BITS_PER_LONG == 64 ++ ++#define REISER4_INO_IS_OID (1) ++typedef struct {; ++} oid_hi_t; ++ ++/* BITS_PER_LONG == 64 */ ++#else ++ ++#define REISER4_INO_IS_OID (0) ++typedef __u32 oid_hi_t; ++ ++/* BITS_PER_LONG == 64 */ ++#endif ++ ++struct reiser4_inode { ++ /* spin lock protecting fields of this structure. */ ++ spinlock_t guard; ++ /* main plugin set that control the file ++ (see comments in plugin/plugin_set.c) */ ++ plugin_set *pset; ++ /* plugin set for inheritance ++ (see comments in plugin/plugin_set.c) */ ++ plugin_set *hset; ++ /* high 32 bits of object id */ ++ oid_hi_t oid_hi; ++ /* seal for stat-data */ ++ seal_t sd_seal; ++ /* locality id for this file */ ++ oid_t locality_id; ++#if REISER4_LARGE_KEY ++ __u64 ordering; ++#endif ++ /* coord of stat-data in sealed node */ ++ coord_t sd_coord; ++ /* bit-mask of stat-data extentions used by this file */ ++ __u64 extmask; ++ /* bitmask of non-default plugins for this inode */ ++ __u16 plugin_mask; ++ /* bitmask of set heir plugins for this inode. */ ++ __u16 heir_mask; ++ union { ++ struct list_head readdir_list; ++ struct list_head not_used; ++ } lists; ++ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */ ++ unsigned long flags; ++ union { ++ /* fields specific to unix_file plugin */ ++ struct unix_file_info unix_file_info; ++ /* fields specific to cryptcompress file plugin */ ++ struct cryptcompress_info cryptcompress_info; ++ } file_plugin_data; ++ ++ /* this semaphore is to serialize readers and writers of @pset->file ++ * when file plugin conversion is enabled ++ */ ++ struct rw_semaphore conv_sem; ++ ++ /* tree of jnodes. 
Phantom jnodes (ones not attched to any atom) are ++ tagged in that tree by EFLUSH_TAG_ANONYMOUS */ ++ struct radix_tree_root jnodes_tree; ++#if REISER4_DEBUG ++ /* number of unformatted node jnodes of this file in jnode hash table */ ++ unsigned long nr_jnodes; ++#endif ++ ++ /* block number of virtual root for this object. See comment above ++ * fs/reiser4/search.c:handle_vroot() */ ++ reiser4_block_nr vroot; ++ struct mutex loading; ++}; ++ ++void loading_init_once(reiser4_inode *); ++void loading_alloc(reiser4_inode *); ++void loading_destroy(reiser4_inode *); ++ ++struct reiser4_inode_object { ++ /* private part */ ++ reiser4_inode p; ++ /* generic fields not specific to reiser4, but used by VFS */ ++ struct inode vfs_inode; ++}; ++ ++/* return pointer to the reiser4 specific portion of @inode */ ++static inline reiser4_inode *reiser4_inode_data(const struct inode *inode ++ /* inode queried */ ) ++{ ++ assert("nikita-254", inode != NULL); ++ return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p; ++} ++ ++static inline struct inode *inode_by_reiser4_inode(const reiser4_inode * ++ r4_inode /* inode queried */ ++ ) ++{ ++ return &container_of(r4_inode, struct reiser4_inode_object, ++ p)->vfs_inode; ++} ++ ++/* ++ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct ++ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64 ++ * bits. ++ * ++ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part ++ * of inode, otherwise whole oid is stored in i_ino. ++ * ++ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference. ++ */ ++ ++#define OID_HI_SHIFT (sizeof(ino_t) * 8) ++ ++#if REISER4_INO_IS_OID ++ ++static inline oid_t get_inode_oid(const struct inode *inode) ++{ ++ return inode->i_ino; ++} ++ ++static inline void set_inode_oid(struct inode *inode, oid_t oid) ++{ ++ inode->i_ino = oid; ++} ++ ++/* REISER4_INO_IS_OID */ ++#else ++ ++static inline oid_t get_inode_oid(const struct inode *inode) ++{ ++ return ++ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) | ++ inode->i_ino; ++} ++ ++static inline void set_inode_oid(struct inode *inode, oid_t oid) ++{ ++ assert("nikita-2519", inode != NULL); ++ inode->i_ino = (ino_t) (oid); ++ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT; ++ assert("nikita-2521", get_inode_oid(inode) == (oid)); ++} ++ ++/* REISER4_INO_IS_OID */ ++#endif ++ ++static inline oid_t get_inode_locality(const struct inode *inode) ++{ ++ return reiser4_inode_data(inode)->locality_id; ++} ++ ++#if REISER4_LARGE_KEY ++static inline __u64 get_inode_ordering(const struct inode *inode) ++{ ++ return reiser4_inode_data(inode)->ordering; ++} ++ ++static inline void set_inode_ordering(const struct inode *inode, __u64 ordering) ++{ ++ reiser4_inode_data(inode)->ordering = ordering; ++} ++ ++#else ++ ++#define get_inode_ordering(inode) (0) ++#define set_inode_ordering(inode, val) noop ++ ++#endif ++ ++/* return inode in which @uf_info is embedded */ ++static inline struct inode * ++unix_file_info_to_inode(const struct unix_file_info *uf_info) ++{ ++ return &container_of(uf_info, struct reiser4_inode_object, ++ p.file_plugin_data.unix_file_info)->vfs_inode; ++} ++ ++extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const)); ++extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const)); ++ ++extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode); ++ ++#if REISER4_DEBUG ++extern void reiser4_inode_invariant(const struct inode *inode); ++extern int 
inode_has_no_jnodes(reiser4_inode *); ++#else ++#define reiser4_inode_invariant(inode) noop ++#endif ++ ++static inline int spin_inode_is_locked(const struct inode *inode) ++{ ++ assert_spin_locked(&reiser4_inode_data(inode)->guard); ++ return 1; ++} ++ ++/** ++ * spin_lock_inode - lock reiser4_inode' embedded spinlock ++ * @inode: inode to lock ++ * ++ * In debug mode it checks that lower priority locks are not held and ++ * increments reiser4_context's lock counters on which lock ordering checking ++ * is based. ++ */ ++static inline void spin_lock_inode(struct inode *inode) ++{ ++ assert("", LOCK_CNT_NIL(spin_locked)); ++ /* check lock ordering */ ++ assert_spin_not_locked(&d_lock); ++ ++ spin_lock(&reiser4_inode_data(inode)->guard); ++ ++ LOCK_CNT_INC(spin_locked_inode); ++ LOCK_CNT_INC(spin_locked); ++ ++ reiser4_inode_invariant(inode); ++} ++ ++/** ++ * spin_unlock_inode - unlock reiser4_inode' embedded spinlock ++ * @inode: inode to unlock ++ * ++ * In debug mode it checks that spinlock is held and decrements ++ * reiser4_context's lock counters on which lock ordering checking is based. ++ */ ++static inline void spin_unlock_inode(struct inode *inode) ++{ ++ assert_spin_locked(&reiser4_inode_data(inode)->guard); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ reiser4_inode_invariant(inode); ++ ++ LOCK_CNT_DEC(spin_locked_inode); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&reiser4_inode_data(inode)->guard); ++} ++ ++extern znode *inode_get_vroot(struct inode *inode); ++extern void inode_set_vroot(struct inode *inode, znode * vroot); ++ ++extern int reiser4_max_filename_len(const struct inode *inode); ++extern int max_hash_collisions(const struct inode *dir); ++extern void reiser4_unlock_inode(struct inode *inode); ++extern int is_reiser4_inode(const struct inode *inode); ++extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *); ++extern struct inode *reiser4_iget(struct super_block *super, ++ const reiser4_key * key, int silent); ++extern void reiser4_iget_complete(struct inode *inode); ++extern void reiser4_inode_set_flag(struct inode *inode, ++ reiser4_file_plugin_flags f); ++extern void reiser4_inode_clr_flag(struct inode *inode, ++ reiser4_file_plugin_flags f); ++extern int reiser4_inode_get_flag(const struct inode *inode, ++ reiser4_file_plugin_flags f); ++ ++/* has inode been initialized? 
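The LOCK_CNT_* assertions in spin_lock_inode()/spin_unlock_inode() above implement per-thread lock-class accounting on which the lock-ordering checks are based. A minimal userland sketch of that bookkeeping (names invented for illustration):

    #include <assert.h>

    /* Per-thread lock-class counters in the spirit of LOCK_CNT_NIL()/
     * LOCK_CNT_GTZ()/LOCK_CNT_INC()/LOCK_CNT_DEC() above. */
    static __thread int spin_locked;       /* spinlocks held by this thread */
    static __thread int spin_locked_inode; /* inode spinlocks held */

    static void lock_cnt_acquire_inode(void)
    {
            /* ordering rule: no other spinlock may already be held */
            assert(spin_locked == 0);
            spin_locked_inode++;
            spin_locked++;
    }

    static void lock_cnt_release_inode(void)
    {
            assert(spin_locked_inode > 0);
            assert(spin_locked > 0);
            spin_locked_inode--;
            spin_locked--;
    }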
*/ ++static inline int ++is_inode_loaded(const struct inode *inode/* inode queried */) ++{ ++ assert("nikita-1120", inode != NULL); ++ return reiser4_inode_get_flag(inode, REISER4_LOADED); ++} ++ ++extern file_plugin *inode_file_plugin(const struct inode *inode); ++extern dir_plugin *inode_dir_plugin(const struct inode *inode); ++extern formatting_plugin *inode_formatting_plugin(const struct inode *inode); ++extern hash_plugin *inode_hash_plugin(const struct inode *inode); ++extern fibration_plugin *inode_fibration_plugin(const struct inode *inode); ++extern cipher_plugin *inode_cipher_plugin(const struct inode *inode); ++extern digest_plugin *inode_digest_plugin(const struct inode *inode); ++extern compression_plugin *inode_compression_plugin(const struct inode *inode); ++extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode ++ *inode); ++extern cluster_plugin *inode_cluster_plugin(const struct inode *inode); ++extern file_plugin *inode_create_plugin(const struct inode *inode); ++extern item_plugin *inode_sd_plugin(const struct inode *inode); ++extern item_plugin *inode_dir_item_plugin(const struct inode *inode); ++extern file_plugin *child_create_plugin(const struct inode *inode); ++ ++extern void reiser4_make_bad_inode(struct inode *inode); ++ ++extern void inode_set_extension(struct inode *inode, sd_ext_bits ext); ++extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext); ++extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new); ++extern void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new); ++ ++#define INODE_SET_SIZE(i, value) \ ++({ \ ++ struct inode *__i; \ ++ typeof(value) __v; \ ++ \ ++ __i = (i); \ ++ __v = (value); \ ++ inode_check_scale(__i, __i->i_size, __v); \ ++ i_size_write(__i, __v); \ ++}) ++ ++/* ++ * update field @field in inode @i to contain value @value. ++ */ ++#define INODE_SET_FIELD(i, field, value) \ ++({ \ ++ struct inode *__i; \ ++ typeof(value) __v; \ ++ \ ++ __i = (i); \ ++ __v = (value); \ ++ inode_check_scale(__i, __i->field, __v); \ ++ __i->field = __v; \ ++}) ++ ++#define INODE_INC_FIELD(i, field) \ ++({ \ ++ struct inode *__i; \ ++ \ ++ __i = (i); \ ++ inode_check_scale(__i, __i->field, __i->field + 1); \ ++ ++ __i->field; \ ++}) ++ ++#define INODE_DEC_FIELD(i, field) \ ++({ \ ++ struct inode *__i; \ ++ \ ++ __i = (i); \ ++ inode_check_scale(__i, __i->field, __i->field - 1); \ ++ -- __i->field; \ ++}) ++ ++/* See comment before reiser4_readdir_common() for description. */ ++static inline struct list_head *get_readdir_list(const struct inode *inode) ++{ ++ return &reiser4_inode_data(inode)->lists.readdir_list; ++} ++ ++extern void init_inode_ordering(struct inode *inode, ++ reiser4_object_create_data * crd, int create); ++ ++static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode) ++{ ++ return &reiser4_inode_data(inode)->jnodes_tree; ++} ++ ++static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode ++ *r4_inode) ++{ ++ return &r4_inode->jnodes_tree; ++} ++ ++#if REISER4_DEBUG ++extern void print_inode(const char *prefix, const struct inode *i); ++#endif ++ ++int is_dir_empty(const struct inode *); ++ ++/* __REISER4_INODE_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/ioctl.h linux-2.6.33/fs/reiser4/ioctl.h +--- linux-2.6.33.orig/fs/reiser4/ioctl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/ioctl.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,41 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#if !defined(__REISER4_IOCTL_H__) ++#define __REISER4_IOCTL_H__ ++ ++#include <linux/fs.h> ++ ++/* ++ * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into ++ * extents and fix in this state. This is used by applications that rely on ++ * ++ * . files being block aligned, and ++ * ++ * . files never migrating on disk ++ * ++ * for example, boot loaders (LILO) need this. ++ * ++ * This ioctl should be used as ++ * ++ * result = ioctl(fd, REISER4_IOC_UNPACK); ++ * ++ * File behind fd descriptor will be converted to the extents (if necessary), ++ * and its stat-data will be updated so that it will never be converted back ++ * into tails again. ++ */ ++#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long) ++ ++/* __REISER4_IOCTL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/jnode.c linux-2.6.33/fs/reiser4/jnode.c +--- linux-2.6.33.orig/fs/reiser4/jnode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/jnode.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1923 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++/* Jnode manipulation functions. */ ++/* Jnode is entity used to track blocks with data and meta-data in reiser4. ++ ++ In particular, jnodes are used to track transactional information ++ associated with each block. Each znode contains jnode as ->zjnode field. ++ ++ Jnode stands for either Josh or Journal node. ++*/ ++ ++/* ++ * Taxonomy. ++ * ++ * Jnode represents block containing data or meta-data. There are jnodes ++ * for: ++ * ++ * unformatted blocks (jnodes proper). There are plans, however to ++ * have a handle per extent unit rather than per each unformatted ++ * block, because there are so many of them. ++ * ++ * For bitmaps. Each bitmap is actually represented by two jnodes--one ++ * for working and another for "commit" data, together forming bnode. ++ * ++ * For io-heads. These are used by log writer. ++ * ++ * For formatted nodes (znode). See comment at the top of znode.c for ++ * details specific to the formatted nodes (znodes). ++ * ++ * Node data. ++ * ++ * Jnode provides access to the data of node it represents. Data are ++ * stored in a page. Page is kept in a page cache. This means, that jnodes ++ * are highly interconnected with page cache and VM internals. ++ * ++ * jnode has a pointer to page (->pg) containing its data. Pointer to data ++ * themselves is cached in ->data field to avoid frequent calls to ++ * page_address(). ++ * ++ * jnode and page are attached to each other by jnode_attach_page(). This ++ * function places pointer to jnode in set_page_private(), sets PG_private ++ * flag and increments page counter. ++ * ++ * Opposite operation is performed by page_clear_jnode(). ++ * ++ * jnode->pg is protected by jnode spin lock, and page->private is ++ * protected by page lock. 
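The jnode/page attach step described above is small enough to sketch from the comment alone: stash the jnode pointer in page->private, set the PG_private flag, and take a page reference. The real jnode_attach_page() may do more (locking, assertions); this kernel-side fragment only restates the described protocol:

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /* Restates only what the comment above says jnode_attach_page() does. */
    static void attach_sketch(struct page *pg, void *node)
    {
            set_page_private(pg, (unsigned long)node);
            SetPagePrivate(pg);
            get_page(pg);
    }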
See comment at the top of page_cache.c for ++ * more. ++ * ++ * page can be detached from jnode for two reasons: ++ * ++ * . jnode is removed from a tree (file is truncated, of formatted ++ * node is removed by balancing). ++ * ++ * . during memory pressure, VM calls ->releasepage() method ++ * (reiser4_releasepage()) to evict page from memory. ++ * ++ * (there, of course, is also umount, but this is special case we are not ++ * concerned with here). ++ * ++ * To protect jnode page from eviction, one calls jload() function that ++ * "pins" page in memory (loading it if necessary), increments ++ * jnode->d_count, and kmap()s page. Page is unpinned through call to ++ * jrelse(). ++ * ++ * Jnode life cycle. ++ * ++ * jnode is created, placed in hash table, and, optionally, in per-inode ++ * radix tree. Page can be attached to jnode, pinned, released, etc. ++ * ++ * When jnode is captured into atom its reference counter is ++ * increased. While being part of an atom, jnode can be "early ++ * flushed". This means that as part of flush procedure, jnode is placed ++ * into "relocate set", and its page is submitted to the disk. After io ++ * completes, page can be detached, then loaded again, re-dirtied, etc. ++ * ++ * Thread acquired reference to jnode by calling jref() and releases it by ++ * jput(). When last reference is removed, jnode is still retained in ++ * memory (cached) if it has page attached, _unless_ it is scheduled for ++ * destruction (has JNODE_HEARD_BANSHEE bit set). ++ * ++ * Tree read-write lock was used as "existential" lock for jnodes. That is, ++ * jnode->x_count could be changed from 0 to 1 only under tree write lock, ++ * that is, tree lock protected unreferenced jnodes stored in the hash ++ * table, from recycling. ++ * ++ * This resulted in high contention on tree lock, because jref()/jput() is ++ * frequent operation. To ameliorate this problem, RCU is used: when jput() ++ * is just about to release last reference on jnode it sets JNODE_RIP bit ++ * on it, and then proceed with jnode destruction (removing jnode from hash ++ * table, cbk_cache, detaching page, etc.). All places that change jnode ++ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and ++ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by ++ * jnode_rip_check() function), and pretend that nothing was found in hash ++ * table if bit is set. ++ * ++ * jput defers actual return of jnode into slab cache to some later time ++ * (by call_rcu()), this guarantees that other threads can safely continue ++ * working with JNODE_RIP-ped jnode. 
++ * ++ */ ++ ++#include "reiser4.h" ++#include "debug.h" ++#include "dformat.h" ++#include "jnode.h" ++#include "plugin/plugin_header.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++/*#include "jnode.h"*/ ++#include "znode.h" ++#include "tree.h" ++#include "tree_walk.h" ++#include "super.h" ++#include "inode.h" ++#include "page_cache.h" ++ ++#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */ ++#include <linux/types.h> ++#include <linux/slab.h> ++#include <linux/pagemap.h> ++#include <linux/swap.h> ++#include <linux/fs.h> /* for struct address_space */ ++#include <linux/writeback.h> /* for inode_lock */ ++ ++static struct kmem_cache *_jnode_slab = NULL; ++ ++static void jnode_set_type(jnode * node, jnode_type type); ++static int jdelete(jnode * node); ++static int jnode_try_drop(jnode * node); ++ ++#if REISER4_DEBUG ++static int jnode_invariant(jnode * node, int tlocked, int jlocked); ++#endif ++ ++/* true if valid page is attached to jnode */ ++static inline int jnode_is_parsed(jnode * node) ++{ ++ return JF_ISSET(node, JNODE_PARSED); ++} ++ ++/* hash table support */ ++ ++/* compare two jnode keys for equality. Used by hash-table macros */ ++static inline int jnode_key_eq(const struct jnode_key *k1, ++ const struct jnode_key *k2) ++{ ++ assert("nikita-2350", k1 != NULL); ++ assert("nikita-2351", k2 != NULL); ++ ++ return (k1->index == k2->index && k1->objectid == k2->objectid); ++} ++ ++/* Hash jnode by its key (inode plus offset). Used by hash-table macros */ ++static inline __u32 jnode_key_hashfn(j_hash_table * table, ++ const struct jnode_key *key) ++{ ++ assert("nikita-2352", key != NULL); ++ assert("nikita-3346", IS_POW(table->_buckets)); ++ ++ /* yes, this is a remarkably simple (if not stupid) hash function. */ ++ return (key->objectid + key->index) & (table->_buckets - 1); ++} ++ ++/* The hash table definition */ ++#define KMALLOC(size) reiser4_vmalloc(size) ++#define KFREE(ptr, size) vfree(ptr) ++TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j, ++ jnode_key_hashfn, jnode_key_eq); ++#undef KFREE ++#undef KMALLOC ++ ++/* call this to initialise jnode hash table */ ++int jnodes_tree_init(reiser4_tree * tree/* tree to initialise jnodes for */) ++{ ++ assert("nikita-2359", tree != NULL); ++ return j_hash_init(&tree->jhash_table, 16384); ++} ++ ++/* call this to destroy jnode hash table. This is called during umount. */ ++int jnodes_tree_done(reiser4_tree * tree/* tree to destroy jnodes for */) ++{ ++ j_hash_table *jtable; ++ jnode *node; ++ jnode *next; ++ ++ assert("nikita-2360", tree != NULL); ++ ++ /* ++ * Scan hash table and free all jnodes. ++ */ ++ jtable = &tree->jhash_table; ++ if (jtable->_table) { ++ for_all_in_htable(jtable, j, node, next) { ++ assert("nikita-2361", !atomic_read(&node->x_count)); ++ jdrop(node); ++ } ++ ++ j_hash_done(&tree->jhash_table); ++ } ++ return 0; ++} ++ ++/** ++ * init_jnodes - create jnode cache ++ * ++ * Initializes the jnode slab cache. It is part of reiser4 module initialization. ++ */ ++int init_jnodes(void) ++{ ++ assert("umka-168", _jnode_slab == NULL); ++ ++ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL); ++ if (_jnode_slab == NULL) ++ return RETERR(-ENOMEM); ++ ++ return 0; ++} ++ ++/** ++ * done_jnodes - delete jnode cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_jnodes(void) ++{ ++ destroy_reiser4_cache(&_jnode_slab); ++} ++ ++/* Initialize a jnode. 
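The bucket computation in jnode_key_hashfn() above relies on the table size being a power of two, so that masking replaces the modulo. In isolation (illustration only):

    #include <assert.h>
    #include <stdint.h>

    /* The bucket computation from jnode_key_hashfn() above: with a
     * power-of-two table size, "hash & (buckets - 1)" is an exact and
     * cheap replacement for "hash % buckets". */
    static uint32_t bucket_of(uint64_t objectid, uint64_t index,
                              uint32_t buckets)
    {
            assert((buckets & (buckets - 1)) == 0); /* the IS_POW() check */
            return (uint32_t)((objectid + index) & (buckets - 1));
    }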
*/ ++void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type) ++{ ++ assert("umka-175", node != NULL); ++ ++ memset(node, 0, sizeof(jnode)); ++ ON_DEBUG(node->magic = JMAGIC); ++ jnode_set_type(node, type); ++ atomic_set(&node->d_count, 0); ++ atomic_set(&node->x_count, 0); ++ spin_lock_init(&node->guard); ++ spin_lock_init(&node->load); ++ node->atom = NULL; ++ node->tree = tree; ++ INIT_LIST_HEAD(&node->capture_link); ++ ++ ASSIGN_NODE_LIST(node, NOT_CAPTURED); ++ ++ INIT_RCU_HEAD(&node->rcu); ++ ++#if REISER4_DEBUG ++ { ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(tree->super); ++ spin_lock_irq(&sbinfo->all_guard); ++ list_add(&node->jnodes, &sbinfo->all_jnodes); ++ spin_unlock_irq(&sbinfo->all_guard); ++ } ++#endif ++} ++ ++#if REISER4_DEBUG ++/* ++ * Remove jnode from ->all_jnodes list. ++ */ ++static void jnode_done(jnode * node, reiser4_tree * tree) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(tree->super); ++ ++ spin_lock_irq(&sbinfo->all_guard); ++ assert("nikita-2422", !list_empty(&node->jnodes)); ++ list_del_init(&node->jnodes); ++ spin_unlock_irq(&sbinfo->all_guard); ++} ++#endif ++ ++/* return already existing jnode of page */ ++jnode *jnode_by_page(struct page *pg) ++{ ++ assert("nikita-2066", pg != NULL); ++ assert("nikita-2400", PageLocked(pg)); ++ assert("nikita-2068", PagePrivate(pg)); ++ assert("nikita-2067", jprivate(pg) != NULL); ++ return jprivate(pg); ++} ++ ++/* exported functions to allocate/free jnode objects outside this file */ ++jnode *jalloc(void) ++{ ++ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get()); ++ return jal; ++} ++ ++/* return jnode back to the slab allocator */ ++inline void jfree(jnode * node) ++{ ++ assert("zam-449", node != NULL); ++ ++ assert("nikita-2663", (list_empty_careful(&node->capture_link) && ++ NODE_LIST(node) == NOT_CAPTURED)); ++ assert("nikita-3222", list_empty(&node->jnodes)); ++ assert("nikita-3221", jnode_page(node) == NULL); ++ ++ /* not yet phash_jnode_destroy(node); */ ++ ++ kmem_cache_free(_jnode_slab, node); ++} ++ ++/* ++ * This function is supplied as RCU callback. It actually frees jnode when ++ * last reference to it is gone. ++ */ ++static void jnode_free_actor(struct rcu_head *head) ++{ ++ jnode *node; ++ jnode_type jtype; ++ ++ node = container_of(head, jnode, rcu); ++ jtype = jnode_get_type(node); ++ ++ ON_DEBUG(jnode_done(node, jnode_get_tree(node))); ++ ++ switch (jtype) { ++ case JNODE_IO_HEAD: ++ case JNODE_BITMAP: ++ case JNODE_UNFORMATTED_BLOCK: ++ jfree(node); ++ break; ++ case JNODE_FORMATTED_BLOCK: ++ zfree(JZNODE(node)); ++ break; ++ case JNODE_INODE: ++ default: ++ wrong_return_value("nikita-3197", "Wrong jnode type"); ++ } ++} ++ ++/* ++ * Free a jnode. Post a callback to be executed later through RCU when all ++ * references to @node are released. 
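jnode_free_actor() above recovers the enclosing jnode from the embedded rcu_head with container_of(). A self-contained userspace illustration of that pattern, with invented struct names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct cb_head { int pending; };

struct node {
        int payload;
        struct cb_head head;    /* embedded, as rcu is embedded in jnode */
};

int main(void)
{
        struct node n = { .payload = 42 };
        struct cb_head *hp = &n.head;   /* all the callback receives */

        printf("%d\n", container_of(hp, struct node, head)->payload);
        return 0;
}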
++ */ ++static inline void jnode_free(jnode * node, jnode_type jtype) ++{ ++ if (jtype != JNODE_INODE) { ++ /*assert("nikita-3219", list_empty(&node->rcu.list)); */ ++ call_rcu(&node->rcu, jnode_free_actor); ++ } else ++ jnode_list_remove(node); ++} ++ ++/* allocate new unformatted jnode */ ++static jnode *jnew_unformatted(void) ++{ ++ jnode *jal; ++ ++ jal = jalloc(); ++ if (jal == NULL) ++ return NULL; ++ ++ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK); ++ jal->key.j.mapping = NULL; ++ jal->key.j.index = (unsigned long)-1; ++ jal->key.j.objectid = 0; ++ return jal; ++} ++ ++/* look for jnode with given mapping and offset within hash table */ ++jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index) ++{ ++ struct jnode_key jkey; ++ jnode *node; ++ ++ assert("nikita-2353", tree != NULL); ++ ++ jkey.objectid = objectid; ++ jkey.index = index; ++ ++ /* ++ * hash table is _not_ protected by any lock during lookups. All we ++ * have to do is to disable preemption to keep RCU happy. ++ */ ++ ++ rcu_read_lock(); ++ node = j_hash_find(&tree->jhash_table, &jkey); ++ if (node != NULL) { ++ /* protect @node from recycling */ ++ jref(node); ++ assert("nikita-2955", jnode_invariant(node, 0, 0)); ++ node = jnode_rip_check(tree, node); ++ } ++ rcu_read_unlock(); ++ return node; ++} ++ ++/* per inode radix tree of jnodes is protected by tree's read write spin lock */ ++static jnode *jfind_nolock(struct address_space *mapping, unsigned long index) ++{ ++ assert("vs-1694", mapping->host != NULL); ++ ++ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index); ++} ++ ++jnode *jfind(struct address_space *mapping, unsigned long index) ++{ ++ reiser4_tree *tree; ++ jnode *node; ++ ++ assert("vs-1694", mapping->host != NULL); ++ tree = reiser4_tree_by_inode(mapping->host); ++ ++ read_lock_tree(tree); ++ node = jfind_nolock(mapping, index); ++ if (node != NULL) ++ jref(node); ++ read_unlock_tree(tree); ++ return node; ++} ++ ++static void inode_attach_jnode(jnode * node) ++{ ++ struct inode *inode; ++ reiser4_inode *info; ++ struct radix_tree_root *rtree; ++ ++ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); ++ assert("zam-1043", node->key.j.mapping != NULL); ++ inode = node->key.j.mapping->host; ++ info = reiser4_inode_data(inode); ++ rtree = jnode_tree_by_reiser4_inode(info); ++ if (rtree->rnode == NULL) { ++ /* prevent inode from being pruned when it has jnodes attached ++ to it */ ++ spin_lock_irq(&inode->i_data.tree_lock); ++ inode->i_data.nrpages++; ++ spin_unlock_irq(&inode->i_data.tree_lock); ++ } ++ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0)); ++ check_me("zam-1045", ++ !radix_tree_insert(rtree, node->key.j.index, node)); ++ ON_DEBUG(info->nr_jnodes++); ++} ++ ++static void inode_detach_jnode(jnode * node) ++{ ++ struct inode *inode; ++ reiser4_inode *info; ++ struct radix_tree_root *rtree; ++ ++ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); ++ assert("zam-1044", node->key.j.mapping != NULL); ++ inode = node->key.j.mapping->host; ++ info = reiser4_inode_data(inode); ++ rtree = jnode_tree_by_reiser4_inode(info); ++ ++ assert("zam-1051", info->nr_jnodes != 0); ++ assert("zam-1052", rtree->rnode != NULL); ++ ON_DEBUG(info->nr_jnodes--); ++ ++ /* delete jnode from inode's radix tree of jnodes */ ++ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index)); ++ if (rtree->rnode == NULL) { ++ /* inode can be pruned now */ ++ spin_lock_irq(&inode->i_data.tree_lock); ++ inode->i_data.nrpages--; ++ 
spin_unlock_irq(&inode->i_data.tree_lock); ++ } ++} ++ ++/* put jnode into hash table (where they can be found by flush who does not know ++ mapping) and to inode's tree of jnodes (where they can be found (hopefully ++ faster) in places where mapping is known). Currently it is used by ++ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is ++ created */ ++static void ++hash_unformatted_jnode(jnode * node, struct address_space *mapping, ++ unsigned long index) ++{ ++ j_hash_table *jtable; ++ ++ assert("vs-1446", jnode_is_unformatted(node)); ++ assert("vs-1442", node->key.j.mapping == 0); ++ assert("vs-1443", node->key.j.objectid == 0); ++ assert("vs-1444", node->key.j.index == (unsigned long)-1); ++ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); ++ ++ node->key.j.mapping = mapping; ++ node->key.j.objectid = get_inode_oid(mapping->host); ++ node->key.j.index = index; ++ ++ jtable = &jnode_get_tree(node)->jhash_table; ++ ++ /* race with some other thread inserting jnode into the hash table is ++ * impossible, because we keep the page lock. */ ++ /* ++ * following assertion no longer holds because of RCU: it is possible ++ * jnode is in the hash table, but with JNODE_RIP bit set. ++ */ ++ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */ ++ j_hash_insert_rcu(jtable, node); ++ inode_attach_jnode(node); ++} ++ ++static void unhash_unformatted_node_nolock(jnode * node) ++{ ++ assert("vs-1683", node->key.j.mapping != NULL); ++ assert("vs-1684", ++ node->key.j.objectid == ++ get_inode_oid(node->key.j.mapping->host)); ++ ++ /* remove jnode from hash-table */ ++ j_hash_remove_rcu(&node->tree->jhash_table, node); ++ inode_detach_jnode(node); ++ node->key.j.mapping = NULL; ++ node->key.j.index = (unsigned long)-1; ++ node->key.j.objectid = 0; ++ ++} ++ ++/* remove jnode from hash table and from inode's tree of jnodes. This is used in ++ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes -> ++ reiser4_uncapture_jnode */ ++void unhash_unformatted_jnode(jnode * node) ++{ ++ assert("vs-1445", jnode_is_unformatted(node)); ++ ++ write_lock_tree(node->tree); ++ unhash_unformatted_node_nolock(node); ++ write_unlock_tree(node->tree); ++} ++ ++/* ++ * search hash table for a jnode with given oid and index. If not found, ++ * allocate new jnode, insert it, and also insert into radix tree for the ++ * given inode/mapping. ++ */ ++static jnode *find_get_jnode(reiser4_tree * tree, ++ struct address_space *mapping, ++ oid_t oid, unsigned long index) ++{ ++ jnode *result; ++ jnode *shadow; ++ int preload; ++ ++ result = jnew_unformatted(); ++ ++ if (unlikely(result == NULL)) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get()); ++ if (preload != 0) ++ return ERR_PTR(preload); ++ ++ write_lock_tree(tree); ++ shadow = jfind_nolock(mapping, index); ++ if (likely(shadow == NULL)) { ++ /* add new jnode to hash table and inode's radix tree of ++ * jnodes */ ++ jref(result); ++ hash_unformatted_jnode(result, mapping, index); ++ } else { ++ /* jnode is found in inode's radix tree of jnodes */ ++ jref(shadow); ++ jnode_free(result, JNODE_UNFORMATTED_BLOCK); ++ assert("vs-1498", shadow->key.j.mapping == mapping); ++ result = shadow; ++ } ++ write_unlock_tree(tree); ++ ++ assert("nikita-2955", ++ ergo(result != NULL, jnode_invariant(result, 0, 0))); ++ radix_tree_preload_end(); ++ return result; ++} ++ ++/* jget() (a la zget() but for unformatted nodes). 
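find_get_jnode() above uses the classic optimistic-insert pattern: allocate outside the lock, re-check for a concurrently inserted "shadow" under the lock, and discard the loser. A condensed userspace sketch of the same shape; find_get(), table_slot and the single-slot "table" are invented for the example:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *table_slot;        /* stand-in for the hash table */

static void *find_get(void)
{
        void *fresh = malloc(64);       /* allocated outside the lock */
        void *result;

        if (fresh == NULL)
                return NULL;

        pthread_mutex_lock(&table_lock);
        if (table_slot == NULL) {
                table_slot = fresh;     /* we won: publish our object */
                result = fresh;
        } else {
                result = table_slot;    /* lost the race: use the winner */
                free(fresh);
        }
        pthread_mutex_unlock(&table_lock);
        return result;
}

int main(void)
{
        return find_get() != NULL ? 0 : 1;
}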
Returns (and possibly ++ creates) jnode corresponding to page @pg. jnode is attached to page and ++ inserted into jnode hash-table. */ ++static jnode *do_jget(reiser4_tree * tree, struct page *pg) ++{ ++ /* ++ * There are two ways to create jnode: starting with pre-existing page ++ * and without page. ++ * ++ * When page already exists, jnode is created ++ * (jnode_of_page()->do_jget()) under page lock. This is done in ++ * ->writepage(), or when capturing anonymous page dirtied through ++ * mmap. ++ * ++ * Jnode without page is created by index_extent_jnode(). ++ * ++ */ ++ ++ jnode *result; ++ oid_t oid = get_inode_oid(pg->mapping->host); ++ ++ assert("umka-176", pg != NULL); ++ assert("nikita-2394", PageLocked(pg)); ++ ++ result = jprivate(pg); ++ if (likely(result != NULL)) ++ return jref(result); ++ ++ tree = reiser4_tree_by_page(pg); ++ ++ /* check hash-table first */ ++ result = jfind(pg->mapping, pg->index); ++ if (unlikely(result != NULL)) { ++ spin_lock_jnode(result); ++ jnode_attach_page(result, pg); ++ spin_unlock_jnode(result); ++ result->key.j.mapping = pg->mapping; ++ return result; ++ } ++ ++ /* since page is locked, jnode should be allocated with GFP_NOFS flag */ ++ reiser4_ctx_gfp_mask_force(GFP_NOFS); ++ result = find_get_jnode(tree, pg->mapping, oid, pg->index); ++ if (unlikely(IS_ERR(result))) ++ return result; ++ /* attach jnode to page */ ++ spin_lock_jnode(result); ++ jnode_attach_page(result, pg); ++ spin_unlock_jnode(result); ++ return result; ++} ++ ++/* ++ * return jnode for @pg, creating it if necessary. ++ */ ++jnode *jnode_of_page(struct page *pg) ++{ ++ jnode *result; ++ ++ assert("umka-176", pg != NULL); ++ assert("nikita-2394", PageLocked(pg)); ++ ++ result = do_jget(reiser4_tree_by_page(pg), pg); ++ ++ if (REISER4_DEBUG && !IS_ERR(result)) { ++ assert("nikita-3210", result == jprivate(pg)); ++ assert("nikita-2046", jnode_page(jprivate(pg)) == pg); ++ if (jnode_is_unformatted(jprivate(pg))) { ++ assert("nikita-2364", ++ jprivate(pg)->key.j.index == pg->index); ++ assert("nikita-2367", ++ jprivate(pg)->key.j.mapping == pg->mapping); ++ assert("nikita-2365", ++ jprivate(pg)->key.j.objectid == ++ get_inode_oid(pg->mapping->host)); ++ assert("vs-1200", ++ jprivate(pg)->key.j.objectid == ++ pg->mapping->host->i_ino); ++ assert("nikita-2356", ++ jnode_is_unformatted(jnode_by_page(pg))); ++ } ++ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0)); ++ } ++ return result; ++} ++ ++/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the ++ * page.*/ ++void jnode_attach_page(jnode * node, struct page *pg) ++{ ++ assert("nikita-2060", node != NULL); ++ assert("nikita-2061", pg != NULL); ++ ++ assert("nikita-2050", jprivate(pg) == 0ul); ++ assert("nikita-2393", !PagePrivate(pg)); ++ assert("vs-1741", node->pg == NULL); ++ ++ assert("nikita-2396", PageLocked(pg)); ++ assert_spin_locked(&(node->guard)); ++ ++ page_cache_get(pg); ++ set_page_private(pg, (unsigned long)node); ++ node->pg = pg; ++ SetPagePrivate(pg); ++} ++ ++/* Dual to jnode_attach_page: break a binding between page and jnode */ ++void page_clear_jnode(struct page *page, jnode * node) ++{ ++ assert("nikita-2424", page != NULL); ++ assert("nikita-2425", PageLocked(page)); ++ assert("nikita-2426", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ assert("nikita-2428", PagePrivate(page)); ++ ++ assert("nikita-3551", !PageWriteback(page)); ++ ++ JF_CLR(node, JNODE_PARSED); ++ set_page_private(page, 0ul); ++ ClearPagePrivate(page); ++ node->pg = NULL; ++ 
page_cache_release(page);
++}
++
++#if 0
++/* this is only used in one place, to handle an error */
++void
++page_detach_jnode(struct page *page, struct address_space *mapping,
++                  unsigned long index)
++{
++        assert("nikita-2395", page != NULL);
++
++        lock_page(page);
++        if ((page->mapping == mapping) && (page->index == index)
++            && PagePrivate(page)) {
++                jnode *node;
++
++                node = jprivate(page);
++                spin_lock_jnode(node);
++                page_clear_jnode(page, node);
++                spin_unlock_jnode(node);
++        }
++        unlock_page(page);
++}
++#endif /* 0 */
++
++/* return @node page locked.
++
++   Lock ordering requires that one first takes the page lock and afterwards
++   the spin lock on the node attached to this page. Sometimes it is necessary
++   to go in the opposite direction. This is done through a standard
++   trylock-and-release loop.
++*/
++static struct page *jnode_lock_page(jnode * node)
++{
++        struct page *page;
++
++        assert("nikita-2052", node != NULL);
++        assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
++
++        while (1) {
++
++                spin_lock_jnode(node);
++                page = jnode_page(node);
++                if (page == NULL)
++                        break;
++
++                /* no need to page_cache_get( page ) here, because the page
++                   cannot be evicted from memory without detaching it from the
++                   jnode, and this requires the spin lock on the jnode that we
++                   already hold.
++                */
++                if (trylock_page(page)) {
++                        /* We won a lock on the jnode page, proceed. */
++                        break;
++                }
++
++                /* Page is locked by someone else. */
++                page_cache_get(page);
++                spin_unlock_jnode(node);
++                wait_on_page_locked(page);
++                /* it is possible that the page was detached from the jnode
++                   and returned to the free pool, or re-assigned, while we
++                   were waiting on the locked bit. This will be rechecked on
++                   the next loop iteration.
++                */
++                page_cache_release(page);
++
++                /* try again */
++        }
++        return page;
++}
++
++/*
++ * if the JNODE_PARSED bit is not set, call the ->parse() method of the
++ * jnode to verify the validity of the jnode content.
++ */
++static inline int jparse(jnode * node)
++{
++        int result;
++
++        assert("nikita-2466", node != NULL);
++
++        spin_lock_jnode(node);
++        if (likely(!jnode_is_parsed(node))) {
++                result = jnode_ops(node)->parse(node);
++                if (likely(result == 0))
++                        JF_SET(node, JNODE_PARSED);
++        } else
++                result = 0;
++        spin_unlock_jnode(node);
++        return result;
++}
++
++/* Lock the page attached to a jnode; create and attach a page to the jnode
++ * if it has none. */
++static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
++{
++        struct page *page;
++
++        spin_lock_jnode(node);
++        page = jnode_page(node);
++
++        if (page == NULL) {
++                spin_unlock_jnode(node);
++                page = find_or_create_page(jnode_get_mapping(node),
++                                           jnode_get_index(node), gfp_flags);
++                if (page == NULL)
++                        return ERR_PTR(RETERR(-ENOMEM));
++        } else {
++                if (trylock_page(page)) {
++                        spin_unlock_jnode(node);
++                        return page;
++                }
++                page_cache_get(page);
++                spin_unlock_jnode(node);
++                lock_page(page);
++                assert("nikita-3134", page->mapping == jnode_get_mapping(node));
++        }
++
++        spin_lock_jnode(node);
++        if (!jnode_page(node))
++                jnode_attach_page(node, page);
++        spin_unlock_jnode(node);
++
++        page_cache_release(page);
++        assert("zam-894", jnode_page(node) == page);
++        return page;
++}
++
++/* Start a read operation for the jnode's page if the page is not up-to-date.
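The trylock-and-release loop of jnode_lock_page() above is a standard way to take two locks against their documented order without deadlocking. A minimal pthread rendering, assuming plain mutexes as stand-ins for the page lock and the jnode spin lock (the real code waits on the page's lock bit instead of yielding):

#include <pthread.h>
#include <sched.h>

static void lock_in_reverse_order(pthread_mutex_t *page_lock,
                                  pthread_mutex_t *jnode_lock)
{
        for (;;) {
                pthread_mutex_lock(jnode_lock);
                if (pthread_mutex_trylock(page_lock) == 0)
                        return;         /* both held; caller drops both */
                /* someone holds the page lock: back off and retry */
                pthread_mutex_unlock(jnode_lock);
                sched_yield();
        }
}

int main(void)
{
        pthread_mutex_t page = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t node = PTHREAD_MUTEX_INITIALIZER;

        lock_in_reverse_order(&page, &node);
        pthread_mutex_unlock(&page);
        pthread_mutex_unlock(&node);
        return 0;
}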
*/ ++static int jnode_start_read(jnode * node, struct page *page) ++{ ++ assert("zam-893", PageLocked(page)); ++ ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ return 0; ++ } ++ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get()); ++} ++ ++#if REISER4_DEBUG ++static void check_jload(jnode * node, struct page *page) ++{ ++ if (jnode_is_znode(node)) { ++ node40_header *nh; ++ znode *z; ++ ++ z = JZNODE(node); ++ if (znode_is_any_locked(z)) { ++ nh = (node40_header *) kmap(page); ++ /* this only works for node40-only file systems. For ++ * debugging. */ ++ assert("nikita-3253", ++ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items))); ++ kunmap(page); ++ } ++ assert("nikita-3565", znode_invariant(z)); ++ } ++} ++#else ++#define check_jload(node, page) noop ++#endif ++ ++/* prefetch jnode to speed up next call to jload. Call this when you are going ++ * to call jload() shortly. This will bring appropriate portion of jnode into ++ * CPU cache. */ ++void jload_prefetch(jnode * node) ++{ ++ prefetchw(&node->x_count); ++} ++ ++/* load jnode's data into memory */ ++int jload_gfp(jnode * node /* node to load */ , ++ gfp_t gfp_flags /* allocation flags */ , ++ int do_kmap/* true if page should be kmapped */) ++{ ++ struct page *page; ++ int result = 0; ++ int parsed; ++ ++ assert("nikita-3010", reiser4_schedulable()); ++ ++ prefetchw(&node->pg); ++ ++ /* taking d-reference implies taking x-reference. */ ++ jref(node); ++ ++ /* ++ * acquiring d-reference to @jnode and check for JNODE_PARSED bit ++ * should be atomic, otherwise there is a race against ++ * reiser4_releasepage(). ++ */ ++ spin_lock(&(node->load)); ++ add_d_ref(node); ++ parsed = jnode_is_parsed(node); ++ spin_unlock(&(node->load)); ++ ++ if (unlikely(!parsed)) { ++ page = jnode_get_page_locked(node, gfp_flags); ++ if (unlikely(IS_ERR(page))) { ++ result = PTR_ERR(page); ++ goto failed; ++ } ++ ++ result = jnode_start_read(node, page); ++ if (unlikely(result != 0)) ++ goto failed; ++ ++ wait_on_page_locked(page); ++ if (unlikely(!PageUptodate(page))) { ++ result = RETERR(-EIO); ++ goto failed; ++ } ++ ++ if (do_kmap) ++ node->data = kmap(page); ++ ++ result = jparse(node); ++ if (unlikely(result != 0)) { ++ if (do_kmap) ++ kunmap(page); ++ goto failed; ++ } ++ check_jload(node, page); ++ } else { ++ page = jnode_page(node); ++ check_jload(node, page); ++ if (do_kmap) ++ node->data = kmap(page); ++ } ++ ++ if (!is_writeout_mode()) ++ /* We do not mark pages active if jload is called as a part of ++ * jnode_flush() or reiser4_write_logs(). Both jnode_flush() ++ * and write_logs() add no value to cached data, there is no ++ * sense to mark pages as active when they go to disk, it just ++ * confuses vm scanning routines because clean page could be ++ * moved out from inactive list as a result of this ++ * mark_page_accessed() call. */ ++ mark_page_accessed(page); ++ ++ return 0; ++ ++failed: ++ jrelse_tail(node); ++ return result; ++ ++} ++ ++/* start asynchronous reading for given jnode's page. */ ++int jstartio(jnode * node) ++{ ++ struct page *page; ++ ++ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get()); ++ if (IS_ERR(page)) ++ return PTR_ERR(page); ++ ++ return jnode_start_read(node, page); ++} ++ ++/* Initialize a node by calling appropriate plugin instead of reading ++ * node from disk as in jload(). 
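jload_gfp() above pairs a d-reference with an implicit x-reference, and jrelse()/jrelse_tail() drop them in the opposite order. A tiny runnable model of that pairing, with invented names (this is a sketch of the counting discipline, not code compilable against reiser4):

#include <assert.h>

struct node_model { int x_count, d_count; };

static void jref_(struct node_model *n)   { n->x_count++; }
static void jput_(struct node_model *n)   { n->x_count--; }
static void jload_(struct node_model *n)  { jref_(n); n->d_count++; }
static void jrelse_(struct node_model *n) { n->d_count--; jput_(n); }

int main(void)
{
        struct node_model n = { 0, 0 };

        jload_(&n);     /* taking a d-reference implies an x-reference */
        assert(n.x_count == 1 && n.d_count == 1);
        jrelse_(&n);    /* both are dropped together */
        assert(n.x_count == 0 && n.d_count == 0);
        return 0;
}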
*/ ++int jinit_new(jnode * node, gfp_t gfp_flags) ++{ ++ struct page *page; ++ int result; ++ ++ jref(node); ++ add_d_ref(node); ++ ++ page = jnode_get_page_locked(node, gfp_flags); ++ if (IS_ERR(page)) { ++ result = PTR_ERR(page); ++ goto failed; ++ } ++ ++ SetPageUptodate(page); ++ unlock_page(page); ++ ++ node->data = kmap(page); ++ ++ if (!jnode_is_parsed(node)) { ++ jnode_plugin *jplug = jnode_ops(node); ++ spin_lock_jnode(node); ++ result = jplug->init(node); ++ spin_unlock_jnode(node); ++ if (result) { ++ kunmap(page); ++ goto failed; ++ } ++ JF_SET(node, JNODE_PARSED); ++ } ++ ++ return 0; ++ ++failed: ++ jrelse(node); ++ return result; ++} ++ ++/* release a reference to jnode acquired by jload(), decrement ->d_count */ ++void jrelse_tail(jnode * node/* jnode to release references to */) ++{ ++ assert("nikita-489", atomic_read(&node->d_count) > 0); ++ atomic_dec(&node->d_count); ++ /* release reference acquired in jload_gfp() or jinit_new() */ ++ jput(node); ++ if (jnode_is_unformatted(node) || jnode_is_znode(node)) ++ LOCK_CNT_DEC(d_refs); ++} ++ ++/* drop reference to node data. When last reference is dropped, data are ++ unloaded. */ ++void jrelse(jnode * node/* jnode to release references to */) ++{ ++ struct page *page; ++ ++ assert("nikita-487", node != NULL); ++ assert_spin_not_locked(&(node->guard)); ++ ++ page = jnode_page(node); ++ if (likely(page != NULL)) { ++ /* ++ * it is safe not to lock jnode here, because at this point ++ * @node->d_count is greater than zero (if jrelse() is used ++ * correctly, that is). JNODE_PARSED may be not set yet, if, ++ * for example, we got here as a result of error handling path ++ * in jload(). Anyway, page cannot be detached by ++ * reiser4_releasepage(). truncate will invalidate page ++ * regardless, but this should not be a problem. ++ */ ++ kunmap(page); ++ } ++ jrelse_tail(node); ++} ++ ++/* called from jput() to wait for io completion */ ++static void jnode_finish_io(jnode * node) ++{ ++ struct page *page; ++ ++ assert("nikita-2922", node != NULL); ++ ++ spin_lock_jnode(node); ++ page = jnode_page(node); ++ if (page != NULL) { ++ page_cache_get(page); ++ spin_unlock_jnode(node); ++ wait_on_page_writeback(page); ++ page_cache_release(page); ++ } else ++ spin_unlock_jnode(node); ++} ++ ++/* ++ * This is called by jput() when last reference to jnode is released. This is ++ * separate function, because we want fast path of jput() to be inline and, ++ * therefore, small. ++ */ ++void jput_final(jnode * node) ++{ ++ int r_i_p; ++ ++ /* A fast check for keeping node in cache. We always keep node in cache ++ * if its page is present and node was not marked for deletion */ ++ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) { ++ rcu_read_unlock(); ++ return; ++ } ++ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP); ++ /* ++ * if r_i_p is true, we were first to set JNODE_RIP on this node. In ++ * this case it is safe to access node after unlock. ++ */ ++ rcu_read_unlock(); ++ if (r_i_p) { ++ jnode_finish_io(node); ++ if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) ++ /* node is removed from the tree. 
 */
++                        jdelete(node);
++                else
++                        jnode_try_drop(node);
++        }
++        /* if !r_i_p, some other thread is already killing it */
++}
++
++int jwait_io(jnode * node, int rw)
++{
++        struct page *page;
++        int result;
++
++        assert("zam-447", node != NULL);
++        assert("zam-448", jnode_page(node) != NULL);
++
++        page = jnode_page(node);
++
++        result = 0;
++        if (rw == READ) {
++                wait_on_page_locked(page);
++        } else {
++                assert("nikita-2227", rw == WRITE);
++                wait_on_page_writeback(page);
++        }
++        if (PageError(page))
++                result = RETERR(-EIO);
++
++        return result;
++}
++
++/*
++ * jnode types and plugins.
++ *
++ * jnode by itself is a "base type". There are several different jnode
++ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
++ * has to do different things based on jnode type. In the standard reiser4
++ * way this is done by having a jnode plugin (see
++ * fs/reiser4/plugin.h:jnode_plugin).
++ *
++ * Functions below deal with jnode types and define the methods of the jnode
++ * plugin.
++ *
++ */
++
++/* set the jnode type. This is done during jnode initialization. */
++static void jnode_set_type(jnode * node, jnode_type type)
++{
++        static unsigned long type_to_mask[] = {
++                [JNODE_UNFORMATTED_BLOCK] = 1,
++                [JNODE_FORMATTED_BLOCK] = 0,
++                [JNODE_BITMAP] = 2,
++                [JNODE_IO_HEAD] = 6,
++                [JNODE_INODE] = 4
++        };
++
++        assert("zam-647", type < LAST_JNODE_TYPE);
++        assert("nikita-2815", !jnode_is_loaded(node));
++        assert("nikita-3386", node->state == 0);
++
++        node->state |= (type_to_mask[type] << JNODE_TYPE_1);
++}
++
++/* ->init() method of the jnode plugin for jnodes that don't require
++ * plugin-specific initialization. */
++static int init_noinit(jnode * node UNUSED_ARG)
++{
++        return 0;
++}
++
++/* ->parse() method of the jnode plugin for jnodes that don't require
++ * plugin-specific parsing.
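jnode_set_type() above packs the type into three ->state bits starting at JNODE_TYPE_1 (bit 13, per jnode.h below), and jnode_get_type() in jnode.h reverses the mapping. A self-contained round-trip check of exactly that encoding (enum order: unformatted, formatted, bitmap, io head, inode):

#include <assert.h>

enum { TYPE_SHIFT = 13 };       /* value of JNODE_TYPE_1 in jnode.h */

/* masks written by jnode_set_type(), indexed by type */
static const unsigned long type_to_mask[] = { 1, 0, 2, 6, 4 };
/* inverse table used by jnode_get_type(); -1 marks invalid masks */
static const int mask_to_type[8] = { 1, 0, 2, -1, 4, -1, 3, -1 };

int main(void)
{
        for (int t = 0; t < 5; t++) {
                unsigned long state = type_to_mask[t] << TYPE_SHIFT;

                assert(mask_to_type[(state >> TYPE_SHIFT) & 7] == t);
        }
        return 0;
}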
 */
++static int parse_noparse(jnode * node UNUSED_ARG)
++{
++        return 0;
++}
++
++/* ->mapping() method for unformatted jnodes */
++struct address_space *mapping_jnode(const jnode * node)
++{
++        struct address_space *map;
++
++        assert("nikita-2713", node != NULL);
++
++        /* the mapping is stored in the jnode */
++
++        map = node->key.j.mapping;
++        assert("nikita-2714", map != NULL);
++        assert("nikita-2897", is_reiser4_inode(map->host));
++        assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
++        return map;
++}
++
++/* ->index() method for unformatted jnodes */
++unsigned long index_jnode(const jnode * node)
++{
++        /* the index is stored in the jnode */
++        return node->key.j.index;
++}
++
++/* ->remove() method for unformatted jnodes */
++static inline void remove_jnode(jnode * node, reiser4_tree * tree)
++{
++        /* remove the jnode from the hash table and the radix tree */
++        if (node->key.j.mapping)
++                unhash_unformatted_node_nolock(node);
++}
++
++/* ->mapping() method for znodes */
++static struct address_space *mapping_znode(const jnode * node)
++{
++        /* all znodes belong to the fake inode */
++        return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
++}
++
++/* ->index() method for znodes */
++static unsigned long index_znode(const jnode * node)
++{
++        unsigned long addr;
++        assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
++
++        /* the index of a znode is just its address (shifted) */
++        addr = (unsigned long)node;
++        return (addr - PAGE_OFFSET) >> znode_shift_order;
++}
++
++/* ->mapping() method for bitmap jnodes */
++static struct address_space *mapping_bitmap(const jnode * node)
++{
++        /* all bitmap blocks belong to the special bitmap inode */
++        return get_super_private(jnode_get_tree(node)->super)->bitmap->
++            i_mapping;
++}
++
++/* ->index() method for jnodes that are indexed by address */
++static unsigned long index_is_address(const jnode * node)
++{
++        unsigned long ind;
++
++        ind = (unsigned long)node;
++        return ind - PAGE_OFFSET;
++}
++
++/* resolve a race with jput */
++jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
++{
++        /*
++         * This is used as part of the RCU-based jnode handling.
++         *
++         * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to
++         * work with unreferenced jnodes (ones with ->x_count == 0). The hash
++         * table is not protected during this, so a concurrent thread may
++         * execute zget-set-HEARD_BANSHEE-zput, or otherwise cause the jnode
++         * to be freed in jput_final(). To avoid such races, jput_final()
++         * sets JNODE_RIP on the jnode (under the tree lock). All places that
++         * work with unreferenced jnodes call this function. It checks for
++         * the JNODE_RIP bit (first without taking the tree lock), and if
++         * this bit is set, releases the reference acquired by the current
++         * thread and returns NULL.
++         *
++         * As a result, if the jnode is being concurrently freed, NULL is
++         * returned and the caller should pretend that the jnode wasn't found
++         * in the first place.
++         *
++         * Otherwise it's safe to release the "rcu-read-lock" and continue
++         * with the jnode.
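index_znode() and index_is_address() above derive a pseudo page index from the object's own kernel address. A userspace model of the same trick, with BASE and SHIFT as invented stand-ins for PAGE_OFFSET and znode_shift_order:

#include <assert.h>
#include <stdint.h>

#define BASE  0x10000000UL      /* stand-in for PAGE_OFFSET */
#define SHIFT 9                 /* stand-in for znode_shift_order */

static unsigned long index_of(const void *obj)
{
        return ((uintptr_t)obj - BASE) >> SHIFT;
}

int main(void)
{
        /* two objects one "znode size" apart get distinct indices */
        assert(index_of((void *)BASE) == 0);
        assert(index_of((void *)(BASE + (1UL << SHIFT))) == 1);
        return 0;
}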
++ */ ++ if (unlikely(JF_ISSET(node, JNODE_RIP))) { ++ read_lock_tree(tree); ++ if (JF_ISSET(node, JNODE_RIP)) { ++ dec_x_ref(node); ++ node = NULL; ++ } ++ read_unlock_tree(tree); ++ } ++ return node; ++} ++ ++reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key) ++{ ++ struct inode *inode; ++ item_plugin *iplug; ++ loff_t off; ++ ++ assert("nikita-3092", node != NULL); ++ assert("nikita-3093", key != NULL); ++ assert("nikita-3094", jnode_is_unformatted(node)); ++ ++ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT; ++ inode = mapping_jnode(node)->host; ++ ++ if (node->parent_item_id != 0) ++ iplug = item_plugin_by_id(node->parent_item_id); ++ else ++ iplug = NULL; ++ ++ if (iplug != NULL && iplug->f.key_by_offset) ++ iplug->f.key_by_offset(inode, off, key); ++ else { ++ file_plugin *fplug; ++ ++ fplug = inode_file_plugin(inode); ++ assert("zam-1007", fplug != NULL); ++ assert("zam-1008", fplug->key_by_inode != NULL); ++ ++ fplug->key_by_inode(inode, off, key); ++ } ++ ++ return key; ++} ++ ++/* ->parse() method for formatted nodes */ ++static int parse_znode(jnode * node) ++{ ++ return zparse(JZNODE(node)); ++} ++ ++/* ->delete() method for formatted nodes */ ++static void delete_znode(jnode * node, reiser4_tree * tree) ++{ ++ znode *z; ++ ++ assert_rw_write_locked(&(tree->tree_lock)); ++ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ z = JZNODE(node); ++ assert("vs-899", z->c_count == 0); ++ ++ /* delete znode from sibling list. */ ++ sibling_list_remove(z); ++ ++ znode_remove(z, tree); ++} ++ ++/* ->remove() method for formatted nodes */ ++static int remove_znode(jnode * node, reiser4_tree * tree) ++{ ++ znode *z; ++ ++ assert_rw_write_locked(&(tree->tree_lock)); ++ z = JZNODE(node); ++ ++ if (z->c_count == 0) { ++ /* detach znode from sibling list. */ ++ sibling_list_drop(z); ++ /* this is called with tree spin-lock held, so call ++ znode_remove() directly (rather than znode_lock_remove()). */ ++ znode_remove(z, tree); ++ return 0; ++ } ++ return RETERR(-EBUSY); ++} ++ ++/* ->init() method for formatted nodes */ ++static int init_znode(jnode * node) ++{ ++ znode *z; ++ ++ z = JZNODE(node); ++ /* call node plugin to do actual initialization */ ++ return z->nplug->init(z); ++} ++ ++/* ->clone() method for formatted nodes */ ++static jnode *clone_formatted(jnode * node) ++{ ++ znode *clone; ++ ++ assert("vs-1430", jnode_is_znode(node)); ++ clone = zalloc(reiser4_ctx_gfp_mask_get()); ++ if (clone == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ zinit(clone, NULL, current_tree); ++ jnode_set_block(ZJNODE(clone), jnode_get_block(node)); ++ /* ZJNODE(clone)->key.z is not initialized */ ++ clone->level = JZNODE(node)->level; ++ ++ return ZJNODE(clone); ++} ++ ++/* jplug->clone for unformatted nodes */ ++static jnode *clone_unformatted(jnode * node) ++{ ++ jnode *clone; ++ ++ assert("vs-1431", jnode_is_unformatted(node)); ++ clone = jalloc(); ++ if (clone == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ ++ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK); ++ jnode_set_block(clone, jnode_get_block(node)); ++ ++ return clone; ++ ++} ++ ++/* ++ * Setup jnode plugin methods for various jnode types. 
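The jnode_plugins[] table that follows is a per-type method table selected via jnode_get_type(). The dispatch style, in miniature (an invented two-entry table, not reiser4 code):

#include <stdio.h>

struct ops {
        const char *label;
        int (*parse)(void);
};

static int parse_noop(void) { return 0; }

static const struct ops ops_table[] = {
        [0] = { .label = "unformatted", .parse = parse_noop },
        [1] = { .label = "formatted",   .parse = parse_noop },
};

int main(void)
{
        int type = 0;   /* in the kernel: jnode_get_type(node) */

        printf("%s -> %d\n", ops_table[type].label, ops_table[type].parse());
        return 0;
}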
++ */ ++jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = { ++ [JNODE_UNFORMATTED_BLOCK] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_UNFORMATTED_BLOCK, ++ .pops = NULL, ++ .label = "unformatted", ++ .desc = "unformatted node", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_noinit, ++ .parse = parse_noparse, ++ .mapping = mapping_jnode, ++ .index = index_jnode, ++ .clone = clone_unformatted ++ }, ++ [JNODE_FORMATTED_BLOCK] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_FORMATTED_BLOCK, ++ .pops = NULL, ++ .label = "formatted", ++ .desc = "formatted tree node", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_znode, ++ .parse = parse_znode, ++ .mapping = mapping_znode, ++ .index = index_znode, ++ .clone = clone_formatted ++ }, ++ [JNODE_BITMAP] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_BITMAP, ++ .pops = NULL, ++ .label = "bitmap", ++ .desc = "bitmap node", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_noinit, ++ .parse = parse_noparse, ++ .mapping = mapping_bitmap, ++ .index = index_is_address, ++ .clone = NULL ++ }, ++ [JNODE_IO_HEAD] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_IO_HEAD, ++ .pops = NULL, ++ .label = "io head", ++ .desc = "io head", ++ .linkage = {NULL, NULL} ++ }, ++ .init = init_noinit, ++ .parse = parse_noparse, ++ .mapping = mapping_bitmap, ++ .index = index_is_address, ++ .clone = NULL ++ }, ++ [JNODE_INODE] = { ++ .h = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .id = JNODE_INODE, ++ .pops = NULL, ++ .label = "inode", ++ .desc = "inode's builtin jnode", ++ .linkage = {NULL, NULL} ++ }, ++ .init = NULL, ++ .parse = NULL, ++ .mapping = NULL, ++ .index = NULL, ++ .clone = NULL ++ } ++}; ++ ++/* ++ * jnode destruction. ++ * ++ * Thread may use a jnode after it acquired a reference to it. References are ++ * counted in ->x_count field. Reference protects jnode from being ++ * recycled. This is different from protecting jnode data (that are stored in ++ * jnode page) from being evicted from memory. Data are protected by jload() ++ * and released by jrelse(). ++ * ++ * If thread already possesses a reference to the jnode it can acquire another ++ * one through jref(). Initial reference is obtained (usually) by locating ++ * jnode in some indexing structure that depends on jnode type: formatted ++ * nodes are kept in global hash table, where they are indexed by block ++ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash ++ * table, which is indexed by oid and offset within file, and in per-inode ++ * radix tree. ++ * ++ * Reference to jnode is released by jput(). If last reference is released, ++ * jput_final() is called. This function determines whether jnode has to be ++ * deleted (this happens when corresponding node is removed from the file ++ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it ++ * should be just "removed" (deleted from memory). ++ * ++ * Jnode destruction is signally delicate dance because of locking and RCU. ++ */ ++ ++/* ++ * Returns true if jnode cannot be removed right now. This check is called ++ * under tree lock. If it returns true, jnode is irrevocably committed to be ++ * deleted/removed. ++ */ ++static inline int jnode_is_busy(const jnode * node, jnode_type jtype) ++{ ++ /* if other thread managed to acquire a reference to this jnode, don't ++ * free it. 
*/ ++ if (atomic_read(&node->x_count) > 0) ++ return 1; ++ /* also, don't free znode that has children in memory */ ++ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0) ++ return 1; ++ return 0; ++} ++ ++/* ++ * this is called as part of removing jnode. Based on jnode type, call ++ * corresponding function that removes jnode from indices and returns it back ++ * to the appropriate slab (through RCU). ++ */ ++static inline void ++jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree) ++{ ++ switch (jtype) { ++ case JNODE_UNFORMATTED_BLOCK: ++ remove_jnode(node, tree); ++ break; ++ case JNODE_IO_HEAD: ++ case JNODE_BITMAP: ++ break; ++ case JNODE_INODE: ++ break; ++ case JNODE_FORMATTED_BLOCK: ++ remove_znode(node, tree); ++ break; ++ default: ++ wrong_return_value("nikita-3196", "Wrong jnode type"); ++ } ++} ++ ++/* ++ * this is called as part of deleting jnode. Based on jnode type, call ++ * corresponding function that removes jnode from indices and returns it back ++ * to the appropriate slab (through RCU). ++ * ++ * This differs from jnode_remove() only for formatted nodes---for them ++ * sibling list handling is different for removal and deletion. ++ */ ++static inline void ++jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG) ++{ ++ switch (jtype) { ++ case JNODE_UNFORMATTED_BLOCK: ++ remove_jnode(node, tree); ++ break; ++ case JNODE_IO_HEAD: ++ case JNODE_BITMAP: ++ break; ++ case JNODE_FORMATTED_BLOCK: ++ delete_znode(node, tree); ++ break; ++ case JNODE_INODE: ++ default: ++ wrong_return_value("nikita-3195", "Wrong jnode type"); ++ } ++} ++ ++#if REISER4_DEBUG ++/* ++ * remove jnode from the debugging list of all jnodes hanging off super-block. ++ */ ++void jnode_list_remove(jnode * node) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(jnode_get_tree(node)->super); ++ ++ spin_lock_irq(&sbinfo->all_guard); ++ assert("nikita-2422", !list_empty(&node->jnodes)); ++ list_del_init(&node->jnodes); ++ spin_unlock_irq(&sbinfo->all_guard); ++} ++#endif ++ ++/* ++ * this is called by jput_final() to remove jnode when last reference to it is ++ * released. ++ */ ++static int jnode_try_drop(jnode * node) ++{ ++ int result; ++ reiser4_tree *tree; ++ jnode_type jtype; ++ ++ assert("nikita-2491", node != NULL); ++ assert("nikita-2583", JF_ISSET(node, JNODE_RIP)); ++ ++ tree = jnode_get_tree(node); ++ jtype = jnode_get_type(node); ++ ++ spin_lock_jnode(node); ++ write_lock_tree(tree); ++ /* ++ * if jnode has a page---leave it alone. Memory pressure will ++ * eventually kill page and jnode. ++ */ ++ if (jnode_page(node) != NULL) { ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ JF_CLR(node, JNODE_RIP); ++ return RETERR(-EBUSY); ++ } ++ ++ /* re-check ->x_count under tree lock. */ ++ result = jnode_is_busy(node, jtype); ++ if (result == 0) { ++ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert("jmacd-511/b", atomic_read(&node->d_count) == 0); ++ ++ spin_unlock_jnode(node); ++ /* no page and no references---despatch him. */ ++ jnode_remove(node, jtype, tree); ++ write_unlock_tree(tree); ++ jnode_free(node, jtype); ++ } else { ++ /* busy check failed: reference was acquired by concurrent ++ * thread. 
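jnode_try_drop() and jdelete() both re-run this busy check under the tree lock and back out (clearing JNODE_RIP) if a concurrent thread re-found the node. A condensed userspace sketch of that decision, assuming a mutex-guarded index and C11 atomics; try_drop() and struct obj are invented, and the kernel defers the final free through call_rcu() rather than freeing directly:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct obj { atomic_int refs; atomic_bool rip; };

static pthread_mutex_t index_lock = PTHREAD_MUTEX_INITIALIZER;

static bool try_drop(struct obj *o)
{
        pthread_mutex_lock(&index_lock);
        if (atomic_load(&o->refs) > 0) {        /* busy check, under lock */
                atomic_store(&o->rip, false);   /* resurrect and back out */
                pthread_mutex_unlock(&index_lock);
                return false;
        }
        /* ...remove o from the index here, lock still held... */
        pthread_mutex_unlock(&index_lock);
        free(o);        /* the kernel defers this via call_rcu() */
        return true;
}

int main(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        return (o != NULL && try_drop(o)) ? 0 : 1;
}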
*/ ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ JF_CLR(node, JNODE_RIP); ++ } ++ return result; ++} ++ ++/* jdelete() -- Delete jnode from the tree and file system */ ++static int jdelete(jnode * node/* jnode to finish with */) ++{ ++ struct page *page; ++ int result; ++ reiser4_tree *tree; ++ jnode_type jtype; ++ ++ assert("nikita-467", node != NULL); ++ assert("nikita-2531", JF_ISSET(node, JNODE_RIP)); ++ ++ jtype = jnode_get_type(node); ++ ++ page = jnode_lock_page(node); ++ assert_spin_locked(&(node->guard)); ++ ++ tree = jnode_get_tree(node); ++ ++ write_lock_tree(tree); ++ /* re-check ->x_count under tree lock. */ ++ result = jnode_is_busy(node, jtype); ++ if (likely(!result)) { ++ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert("jmacd-511", atomic_read(&node->d_count) == 0); ++ ++ /* detach page */ ++ if (page != NULL) { ++ /* ++ * FIXME this is racy against jnode_extent_write(). ++ */ ++ page_clear_jnode(page, node); ++ } ++ spin_unlock_jnode(node); ++ /* goodbye */ ++ jnode_delete(node, jtype, tree); ++ write_unlock_tree(tree); ++ jnode_free(node, jtype); ++ /* @node is no longer valid pointer */ ++ if (page != NULL) ++ reiser4_drop_page(page); ++ } else { ++ /* busy check failed: reference was acquired by concurrent ++ * thread. */ ++ JF_CLR(node, JNODE_RIP); ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ if (page != NULL) ++ unlock_page(page); ++ } ++ return result; ++} ++ ++/* drop jnode on the floor. ++ ++ Return value: ++ ++ -EBUSY: failed to drop jnode, because there are still references to it ++ ++ 0: successfully dropped jnode ++ ++*/ ++static int jdrop_in_tree(jnode * node, reiser4_tree * tree) ++{ ++ struct page *page; ++ jnode_type jtype; ++ int result; ++ ++ assert("zam-602", node != NULL); ++ assert_rw_not_read_locked(&(tree->tree_lock)); ++ assert_rw_not_write_locked(&(tree->tree_lock)); ++ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ jtype = jnode_get_type(node); ++ ++ page = jnode_lock_page(node); ++ assert_spin_locked(&(node->guard)); ++ ++ write_lock_tree(tree); ++ ++ /* re-check ->x_count under tree lock. */ ++ result = jnode_is_busy(node, jtype); ++ if (!result) { ++ assert("nikita-2488", page == jnode_page(node)); ++ assert("nikita-2533", atomic_read(&node->d_count) == 0); ++ if (page != NULL) { ++ assert("nikita-2126", !PageDirty(page)); ++ assert("nikita-2127", PageUptodate(page)); ++ assert("nikita-2181", PageLocked(page)); ++ page_clear_jnode(page, node); ++ } ++ spin_unlock_jnode(node); ++ jnode_remove(node, jtype, tree); ++ write_unlock_tree(tree); ++ jnode_free(node, jtype); ++ if (page != NULL) ++ reiser4_drop_page(page); ++ } else { ++ /* busy check failed: reference was acquired by concurrent ++ * thread. */ ++ JF_CLR(node, JNODE_RIP); ++ write_unlock_tree(tree); ++ spin_unlock_jnode(node); ++ if (page != NULL) ++ unlock_page(page); ++ } ++ return result; ++} ++ ++/* This function frees jnode "if possible". In particular, [dcx]_count has to ++ be 0 (where applicable). */ ++void jdrop(jnode * node) ++{ ++ jdrop_in_tree(node, jnode_get_tree(node)); ++} ++ ++/* IO head jnode implementation; The io heads are simple j-nodes with limited ++ functionality (these j-nodes are not in any hash table) just for reading ++ from and writing to disk. 
*/ ++ ++jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) ++{ ++ jnode *jal = jalloc(); ++ ++ if (jal != NULL) { ++ jnode_init(jal, current_tree, JNODE_IO_HEAD); ++ jnode_set_block(jal, block); ++ } ++ ++ jref(jal); ++ ++ return jal; ++} ++ ++void reiser4_drop_io_head(jnode * node) ++{ ++ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD); ++ ++ jput(node); ++ jdrop(node); ++} ++ ++/* protect keep jnode data from reiser4_releasepage() */ ++void pin_jnode_data(jnode * node) ++{ ++ assert("zam-671", jnode_page(node) != NULL); ++ page_cache_get(jnode_page(node)); ++} ++ ++/* make jnode data free-able again */ ++void unpin_jnode_data(jnode * node) ++{ ++ assert("zam-672", jnode_page(node) != NULL); ++ page_cache_release(jnode_page(node)); ++} ++ ++struct address_space *jnode_get_mapping(const jnode * node) ++{ ++ assert("nikita-3162", node != NULL); ++ return jnode_ops(node)->mapping(node); ++} ++ ++#if REISER4_DEBUG ++/* debugging aid: jnode invariant */ ++int jnode_invariant_f(const jnode * node, char const **msg) ++{ ++#define _ergo(ant, con) \ ++ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) ++#define _check(exp) ((*msg) = #exp, (exp)) ++ ++ return _check(node != NULL) && ++ /* [jnode-queued] */ ++ /* only relocated node can be queued, except that when znode ++ * is being deleted, its JNODE_RELOC bit is cleared */ ++ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED), ++ JF_ISSET(node, JNODE_RELOC) || ++ JF_ISSET(node, JNODE_HEARD_BANSHEE)) && ++ _check(node->jnodes.prev != NULL) && ++ _check(node->jnodes.next != NULL) && ++ /* [jnode-dirty] invariant */ ++ /* dirty inode is part of atom */ ++ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) && ++ /* [jnode-oid] invariant */ ++ /* for unformatted node ->objectid and ->mapping fields are ++ * consistent */ ++ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL, ++ node->key.j.objectid == ++ get_inode_oid(node->key.j.mapping->host)) && ++ /* [jnode-atom-valid] invariant */ ++ /* node atom has valid state */ ++ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) && ++ /* [jnode-page-binding] invariant */ ++ /* if node points to page, it points back to node */ ++ _ergo(node->pg != NULL, jprivate(node->pg) == node) && ++ /* [jnode-refs] invariant */ ++ /* only referenced jnode can be loaded */ ++ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count)); ++ ++} ++ ++static const char *jnode_type_name(jnode_type type) ++{ ++ switch (type) { ++ case JNODE_UNFORMATTED_BLOCK: ++ return "unformatted"; ++ case JNODE_FORMATTED_BLOCK: ++ return "formatted"; ++ case JNODE_BITMAP: ++ return "bitmap"; ++ case JNODE_IO_HEAD: ++ return "io head"; ++ case JNODE_INODE: ++ return "inode"; ++ case LAST_JNODE_TYPE: ++ return "last"; ++ default:{ ++ static char unknown[30]; ++ ++ sprintf(unknown, "unknown %i", type); ++ return unknown; ++ } ++ } ++} ++ ++#define jnode_state_name(node, flag) \ ++ (JF_ISSET((node), (flag)) ? 
((#flag "|")+6) : "") ++ ++/* debugging aid: output human readable information about @node */ ++static void info_jnode(const char *prefix /* prefix to print */ , ++ const jnode * node/* node to print */) ++{ ++ assert("umka-068", prefix != NULL); ++ ++ if (node == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ ++ printk ++ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i," ++ " block: %s, d_count: %d, x_count: %d, " ++ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node, ++ node->state, ++ jnode_state_name(node, JNODE_PARSED), ++ jnode_state_name(node, JNODE_HEARD_BANSHEE), ++ jnode_state_name(node, JNODE_LEFT_CONNECTED), ++ jnode_state_name(node, JNODE_RIGHT_CONNECTED), ++ jnode_state_name(node, JNODE_ORPHAN), ++ jnode_state_name(node, JNODE_CREATED), ++ jnode_state_name(node, JNODE_RELOC), ++ jnode_state_name(node, JNODE_OVRWR), ++ jnode_state_name(node, JNODE_DIRTY), ++ jnode_state_name(node, JNODE_IS_DYING), ++ jnode_state_name(node, JNODE_RIP), ++ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE), ++ jnode_state_name(node, JNODE_WRITEBACK), ++ jnode_state_name(node, JNODE_NEW), ++ jnode_state_name(node, JNODE_DKSET), ++ jnode_state_name(node, JNODE_REPACK), ++ jnode_state_name(node, JNODE_CLUSTER_PAGE), ++ jnode_get_level(node), sprint_address(jnode_get_block(node)), ++ atomic_read(&node->d_count), atomic_read(&node->x_count), ++ jnode_page(node), node->atom, 0, 0, ++ jnode_type_name(jnode_get_type(node))); ++ if (jnode_is_unformatted(node)) { ++ printk("inode: %llu, index: %lu, ", ++ node->key.j.objectid, node->key.j.index); ++ } ++} ++ ++/* debugging aid: check znode invariant and panic if it doesn't hold */ ++static int jnode_invariant(jnode * node, int tlocked, int jlocked) ++{ ++ char const *failed_msg; ++ int result; ++ reiser4_tree *tree; ++ ++ tree = jnode_get_tree(node); ++ ++ assert("umka-063312", node != NULL); ++ assert("umka-064321", tree != NULL); ++ ++ if (!jlocked && !tlocked) ++ spin_lock_jnode((jnode *) node); ++ if (!tlocked) ++ read_lock_tree(jnode_get_tree(node)); ++ result = jnode_invariant_f(node, &failed_msg); ++ if (!result) { ++ info_jnode("corrupted node", node); ++ warning("jmacd-555", "Condition %s failed", failed_msg); ++ } ++ if (!tlocked) ++ read_unlock_tree(jnode_get_tree(node)); ++ if (!jlocked && !tlocked) ++ spin_unlock_jnode((jnode *) node); ++ return result; ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/jnode.h linux-2.6.33/fs/reiser4/jnode.h +--- linux-2.6.33.orig/fs/reiser4/jnode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/jnode.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,704 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Declaration of jnode. See jnode.c for details. 
*/ ++ ++#ifndef __JNODE_H__ ++#define __JNODE_H__ ++ ++#include "forward.h" ++#include "type_safe_hash.h" ++#include "txnmgr.h" ++#include "key.h" ++#include "debug.h" ++#include "dformat.h" ++#include "page_cache.h" ++#include "context.h" ++ ++#include "plugin/plugin.h" ++ ++#include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/spinlock.h> ++#include <asm/atomic.h> ++#include <linux/bitops.h> ++#include <linux/list.h> ++#include <linux/rcupdate.h> ++ ++/* declare hash table of jnodes (jnodes proper, that is, unformatted ++ nodes) */ ++TYPE_SAFE_HASH_DECLARE(j, jnode); ++ ++/* declare hash table of znodes */ ++TYPE_SAFE_HASH_DECLARE(z, znode); ++ ++struct jnode_key { ++ __u64 objectid; ++ unsigned long index; ++ struct address_space *mapping; ++}; ++ ++/* ++ Jnode is the "base class" of other nodes in reiser4. It is also happens to ++ be exactly the node we use for unformatted tree nodes. ++ ++ Jnode provides following basic functionality: ++ ++ . reference counting and indexing. ++ ++ . integration with page cache. Jnode has ->pg reference to which page can ++ be attached. ++ ++ . interface to transaction manager. It is jnode that is kept in transaction ++ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this ++ means, there should be special type of jnode for inode.) ++ ++ Locking: ++ ++ Spin lock: the following fields are protected by the per-jnode spin lock: ++ ++ ->state ++ ->atom ++ ->capture_link ++ ++ Following fields are protected by the global tree lock: ++ ++ ->link ++ ->key.z (content of ->key.z is only changed in znode_rehash()) ++ ->key.j ++ ++ Atomic counters ++ ++ ->x_count ++ ->d_count ++ ++ ->pg, and ->data are protected by spin lock for unused jnode and are ++ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable() ++ is false). ++ ++ ->tree is immutable after creation ++ ++ Unclear ++ ++ ->blocknr: should be under jnode spin-lock, but current interface is based ++ on passing of block address. ++ ++ If you ever need to spin lock two nodes at once, do this in "natural" ++ memory order: lock znode with lower address first. (See lock_two_nodes().) ++ ++ Invariants involving this data-type: ++ ++ [jnode-dirty] ++ [jnode-refs] ++ [jnode-oid] ++ [jnode-queued] ++ [jnode-atom-valid] ++ [jnode-page-binding] ++*/ ++ ++struct jnode { ++#if REISER4_DEBUG ++#define JMAGIC 0x52654973 /* "ReIs" */ ++ int magic; ++#endif ++ /* FIRST CACHE LINE (16 bytes): data used by jload */ ++ ++ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */ ++ /* 0 */ unsigned long state; ++ ++ /* lock, protecting jnode's fields. */ ++ /* 4 */ spinlock_t load; ++ ++ /* counter of references to jnode itself. Increased on jref(). ++ Decreased on jput(). ++ */ ++ /* 8 */ atomic_t x_count; ++ ++ /* counter of references to jnode's data. Pin data page(s) in ++ memory while this is greater than 0. Increased on jload(). ++ Decreased on jrelse(). ++ */ ++ /* 12 */ atomic_t d_count; ++ ++ /* SECOND CACHE LINE: data used by hash table lookups */ ++ ++ /* 16 */ union { ++ /* znodes are hashed by block number */ ++ reiser4_block_nr z; ++ /* unformatted nodes are hashed by mapping plus offset */ ++ struct jnode_key j; ++ } key; ++ ++ /* THIRD CACHE LINE */ ++ ++ /* 32 */ union { ++ /* pointers to maintain hash-table */ ++ z_hash_link z; ++ j_hash_link j; ++ } link; ++ ++ /* pointer to jnode page. */ ++ /* 36 */ struct page *pg; ++ /* pointer to node itself. 
This is page_address(node->pg) when page is ++ attached to the jnode ++ */ ++ /* 40 */ void *data; ++ ++ /* 44 */ reiser4_tree *tree; ++ ++ /* FOURTH CACHE LINE: atom related fields */ ++ ++ /* 48 */ spinlock_t guard; ++ ++ /* atom the block is in, if any */ ++ /* 52 */ txn_atom *atom; ++ ++ /* capture list */ ++ /* 56 */ struct list_head capture_link; ++ ++ /* FIFTH CACHE LINE */ ++ ++ /* 64 */ struct rcu_head rcu; ++ /* crosses cache line */ ++ ++ /* SIXTH CACHE LINE */ ++ ++ /* the real blocknr (where io is going to/from) */ ++ /* 80 */ reiser4_block_nr blocknr; ++ /* Parent item type, unformatted and CRC need it for ++ * offset => key conversion. */ ++ /* NOTE: this parent_item_id looks like jnode type. */ ++ /* 88 */ reiser4_plugin_id parent_item_id; ++ /* 92 */ ++#if REISER4_DEBUG ++ /* list of all jnodes for debugging purposes. */ ++ struct list_head jnodes; ++ /* how many times this jnode was written in one transaction */ ++ int written; ++ /* this indicates which atom's list the jnode is on */ ++ atom_list list; ++#endif ++} __attribute__ ((aligned(16))); ++ ++/* ++ * jnode types. Enumeration of existing jnode types. ++ */ ++typedef enum { ++ JNODE_UNFORMATTED_BLOCK, /* unformatted block */ ++ JNODE_FORMATTED_BLOCK, /* formatted block, znode */ ++ JNODE_BITMAP, /* bitmap */ ++ JNODE_IO_HEAD, /* jnode representing a block in the ++ * wandering log */ ++ JNODE_INODE, /* jnode embedded into inode */ ++ LAST_JNODE_TYPE ++} jnode_type; ++ ++/* jnode states */ ++typedef enum { ++ /* jnode's page is loaded and data checked */ ++ JNODE_PARSED = 0, ++ /* node was deleted, not all locks on it were released. This ++ node is empty and is going to be removed from the tree ++ shortly. */ ++ JNODE_HEARD_BANSHEE = 1, ++ /* left sibling pointer is valid */ ++ JNODE_LEFT_CONNECTED = 2, ++ /* right sibling pointer is valid */ ++ JNODE_RIGHT_CONNECTED = 3, ++ ++ /* znode was just created and doesn't yet have a pointer from ++ its parent */ ++ JNODE_ORPHAN = 4, ++ ++ /* this node was created by its transaction and has not been assigned ++ a block address. */ ++ JNODE_CREATED = 5, ++ ++ /* this node is currently relocated */ ++ JNODE_RELOC = 6, ++ /* this node is currently wandered */ ++ JNODE_OVRWR = 7, ++ ++ /* this znode has been modified */ ++ JNODE_DIRTY = 8, ++ ++ /* znode lock is being invalidated */ ++ JNODE_IS_DYING = 9, ++ ++ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */ ++ ++ /* jnode is queued for flushing. */ ++ JNODE_FLUSH_QUEUED = 12, ++ ++ /* In the following bits jnode type is encoded. */ ++ JNODE_TYPE_1 = 13, ++ JNODE_TYPE_2 = 14, ++ JNODE_TYPE_3 = 15, ++ ++ /* jnode is being destroyed */ ++ JNODE_RIP = 16, ++ ++ /* znode was not captured during locking (it might so be because ++ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */ ++ JNODE_MISSED_IN_CAPTURE = 17, ++ ++ /* write is in progress */ ++ JNODE_WRITEBACK = 18, ++ ++ /* FIXME: now it is used by crypto-compress plugin only */ ++ JNODE_NEW = 19, ++ ++ /* delimiting keys are already set for this znode. 
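Each reiser4_jnode_state value above is a bit index into the unsigned long ->state word, not a mask; the JF_SET/JF_ISSET/JF_CLR helpers that follow wrap the kernel's atomic bit operations. A non-atomic miniature of the same idiom:

#include <assert.h>

enum { F_PARSED = 0, F_DIRTY = 8, F_RIP = 16 };  /* bit indices */

static unsigned long state;

static void f_set(int f)   { state |=  (1UL << f); }
static void f_clr(int f)   { state &= ~(1UL << f); }
static int  f_isset(int f) { return !!(state & (1UL << f)); }

int main(void)
{
        f_set(F_DIRTY);
        assert(f_isset(F_DIRTY) && !f_isset(F_RIP));
        f_clr(F_DIRTY);
        assert(!f_isset(F_DIRTY));
        return 0;
}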
*/ ++ JNODE_DKSET = 20, ++ ++ /* when this bit is set page and jnode can not be disconnected */ ++ JNODE_WRITE_PREPARED = 21, ++ ++ JNODE_CLUSTER_PAGE = 22, ++ /* Jnode is marked for repacking, that means the reiser4 flush and the ++ * block allocator should process this node special way */ ++ JNODE_REPACK = 23, ++ /* node should be converted by flush in squalloc phase */ ++ JNODE_CONVERTIBLE = 24, ++ /* ++ * When jnode is dirtied for the first time in given transaction, ++ * do_jnode_make_dirty() checks whether this jnode can possible became ++ * member of overwrite set. If so, this bit is set, and one block is ++ * reserved in the ->flush_reserved space of atom. ++ * ++ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when ++ * ++ * (1) flush decides that we want this block to go into relocate ++ * set after all. ++ * ++ * (2) wandering log is allocated (by log writer) ++ * ++ * (3) extent is allocated ++ * ++ */ ++ JNODE_FLUSH_RESERVED = 29 ++} reiser4_jnode_state; ++ ++/* Macros for accessing the jnode state. */ ++ ++static inline void JF_CLR(jnode * j, int f) ++{ ++ assert("unknown-1", j->magic == JMAGIC); ++ clear_bit(f, &j->state); ++} ++static inline int JF_ISSET(const jnode * j, int f) ++{ ++ assert("unknown-2", j->magic == JMAGIC); ++ return test_bit(f, &((jnode *) j)->state); ++} ++static inline void JF_SET(jnode * j, int f) ++{ ++ assert("unknown-3", j->magic == JMAGIC); ++ set_bit(f, &j->state); ++} ++ ++static inline int JF_TEST_AND_SET(jnode * j, int f) ++{ ++ assert("unknown-4", j->magic == JMAGIC); ++ return test_and_set_bit(f, &j->state); ++} ++ ++static inline void spin_lock_jnode(jnode *node) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(rw_locked_tree) && ++ LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_LT(spin_locked_jnode, 2))); ++ ++ spin_lock(&(node->guard)); ++ ++ LOCK_CNT_INC(spin_locked_jnode); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void spin_unlock_jnode(jnode *node) ++{ ++ assert_spin_locked(&(node->guard)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_jnode); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(node->guard)); ++} ++ ++static inline int jnode_is_in_deleteset(const jnode * node) ++{ ++ return JF_ISSET(node, JNODE_RELOC); ++} ++ ++extern int init_jnodes(void); ++extern void done_jnodes(void); ++ ++/* Jnode routines */ ++extern jnode *jalloc(void); ++extern void jfree(jnode * node) NONNULL; ++extern jnode *jclone(jnode *); ++extern jnode *jlookup(reiser4_tree * tree, ++ oid_t objectid, unsigned long ind) NONNULL; ++extern jnode *jfind(struct address_space *, unsigned long index) NONNULL; ++extern jnode *jnode_by_page(struct page *pg) NONNULL; ++extern jnode *jnode_of_page(struct page *pg) NONNULL; ++void jnode_attach_page(jnode * node, struct page *pg); ++ ++void unhash_unformatted_jnode(jnode *); ++extern jnode *page_next_jnode(jnode * node) NONNULL; ++extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL; ++extern void jnode_make_dirty(jnode * node) NONNULL; ++extern void jnode_make_clean(jnode * node) NONNULL; ++extern void jnode_make_wander_nolock(jnode * node) NONNULL; ++extern void jnode_make_wander(jnode *) NONNULL; ++extern void znode_make_reloc(znode * , flush_queue_t *) NONNULL; ++extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL; ++extern struct 
address_space *jnode_get_mapping(const jnode * node) NONNULL; ++ ++/** ++ * jnode_get_block ++ * @node: jnode to query ++ * ++ */ ++static inline const reiser4_block_nr *jnode_get_block(const jnode *node) ++{ ++ assert("nikita-528", node != NULL); ++ ++ return &node->blocknr; ++} ++ ++/** ++ * jnode_set_block ++ * @node: jnode to update ++ * @blocknr: new block nr ++ */ ++static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr) ++{ ++ assert("nikita-2020", node != NULL); ++ assert("umka-055", blocknr != NULL); ++ node->blocknr = *blocknr; ++} ++ ++ ++/* block number for IO. Usually this is the same as jnode_get_block(), unless ++ * jnode was emergency flushed---then block number chosen by eflush is ++ * used. */ ++static inline const reiser4_block_nr *jnode_get_io_block(jnode * node) ++{ ++ assert("nikita-2768", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ ++ return jnode_get_block(node); ++} ++ ++/* Jnode flush interface. */ ++extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos); ++extern flush_queue_t *reiser4_pos_fq(flush_pos_t *pos); ++ ++/* FIXME-VS: these are used in plugin/item/extent.c */ ++ ++/* does extent_get_block have to be called */ ++#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED) ++#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED) ++ ++/* the node should be converted during flush squalloc phase */ ++#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE) ++#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE) ++ ++/* Macros to convert from jnode to znode, znode to jnode. These are macros ++ because C doesn't allow overloading of const prototypes. */ ++#define ZJNODE(x) (&(x)->zjnode) ++#define JZNODE(x) \ ++({ \ ++ typeof(x) __tmp_x; \ ++ \ ++ __tmp_x = (x); \ ++ assert("jmacd-1300", jnode_is_znode(__tmp_x)); \ ++ (znode*) __tmp_x; \ ++}) ++ ++extern int jnodes_tree_init(reiser4_tree * tree); ++extern int jnodes_tree_done(reiser4_tree * tree); ++ ++#if REISER4_DEBUG ++ ++extern int znode_is_any_locked(const znode * node); ++extern void jnode_list_remove(jnode * node); ++ ++#else ++ ++#define jnode_list_remove(node) noop ++ ++#endif ++ ++int znode_is_root(const znode * node) NONNULL; ++ ++/* bump reference counter on @node */ ++static inline void add_x_ref(jnode * node/* node to increase x_count of */) ++{ ++ assert("nikita-1911", node != NULL); ++ ++ atomic_inc(&node->x_count); ++ LOCK_CNT_INC(x_refs); ++} ++ ++static inline void dec_x_ref(jnode * node) ++{ ++ assert("nikita-3215", node != NULL); ++ assert("nikita-3216", atomic_read(&node->x_count) > 0); ++ ++ atomic_dec(&node->x_count); ++ assert("nikita-3217", LOCK_CNT_GTZ(x_refs)); ++ LOCK_CNT_DEC(x_refs); ++} ++ ++/* jref() - increase counter of references to jnode/znode (x_count) */ ++static inline jnode *jref(jnode * node) ++{ ++ assert("jmacd-508", (node != NULL) && !IS_ERR(node)); ++ add_x_ref(node); ++ return node; ++} ++ ++/* get the page of jnode */ ++static inline struct page *jnode_page(const jnode * node) ++{ ++ return node->pg; ++} ++ ++/* return pointer to jnode data */ ++static inline char *jdata(const jnode * node) ++{ ++ assert("nikita-1415", node != NULL); ++ assert("nikita-3198", jnode_page(node) != NULL); ++ return node->data; ++} ++ ++static inline int jnode_is_loaded(const jnode * node) ++{ ++ assert("zam-506", node != NULL); ++ return atomic_read(&node->d_count) > 0; ++} ++ ++extern void page_clear_jnode(struct page *page, jnode * node) NONNULL; ++ ++static inline void jnode_set_reloc(jnode * node) ++{ ++ 
assert("nikita-2431", node != NULL); ++ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR)); ++ JF_SET(node, JNODE_RELOC); ++} ++ ++/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */ ++ ++extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL; ++ ++static inline int jload(jnode *node) ++{ ++ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1); ++} ++ ++extern int jinit_new(jnode *, gfp_t) NONNULL; ++extern int jstartio(jnode *) NONNULL; ++ ++extern void jdrop(jnode *) NONNULL; ++extern int jwait_io(jnode *, int rw) NONNULL; ++ ++void jload_prefetch(jnode *); ++ ++extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL; ++extern void reiser4_drop_io_head(jnode * node) NONNULL; ++ ++static inline reiser4_tree *jnode_get_tree(const jnode * node) ++{ ++ assert("nikita-2691", node != NULL); ++ return node->tree; ++} ++ ++extern void pin_jnode_data(jnode *); ++extern void unpin_jnode_data(jnode *); ++ ++static inline jnode_type jnode_get_type(const jnode * node) ++{ ++ static const unsigned long state_mask = ++ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3); ++ ++ static jnode_type mask_to_type[] = { ++ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */ ++ ++ /* 000 */ ++ [0] = JNODE_FORMATTED_BLOCK, ++ /* 001 */ ++ [1] = JNODE_UNFORMATTED_BLOCK, ++ /* 010 */ ++ [2] = JNODE_BITMAP, ++ /* 011 */ ++ [3] = LAST_JNODE_TYPE, /*invalid */ ++ /* 100 */ ++ [4] = JNODE_INODE, ++ /* 101 */ ++ [5] = LAST_JNODE_TYPE, ++ /* 110 */ ++ [6] = JNODE_IO_HEAD, ++ /* 111 */ ++ [7] = LAST_JNODE_TYPE, /* invalid */ ++ }; ++ ++ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1]; ++} ++ ++/* returns true if node is a znode */ ++static inline int jnode_is_znode(const jnode * node) ++{ ++ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK; ++} ++ ++static inline int jnode_is_flushprepped(jnode * node) ++{ ++ assert("jmacd-78212", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) || ++ JF_ISSET(node, JNODE_OVRWR); ++} ++ ++/* Return true if @node has already been processed by the squeeze and allocate ++ process. This implies the block address has been finalized for the ++ duration of this atom (or it is clean and will remain in place). If this ++ returns true you may use the block number as a hint. */ ++static inline int jnode_check_flushprepped(jnode * node) ++{ ++ int result; ++ ++ /* It must be clean or relocated or wandered. New allocations are set ++ * to relocate. 
*/ ++ spin_lock_jnode(node); ++ result = jnode_is_flushprepped(node); ++ spin_unlock_jnode(node); ++ return result; ++} ++ ++/* returns true if node is unformatted */ ++static inline int jnode_is_unformatted(const jnode * node) ++{ ++ assert("jmacd-0123", node != NULL); ++ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK; ++} ++ ++/* returns true if node represents a cluster cache page */ ++static inline int jnode_is_cluster_page(const jnode * node) ++{ ++ assert("edward-50", node != NULL); ++ return (JF_ISSET(node, JNODE_CLUSTER_PAGE)); ++} ++ ++/* returns true is node is builtin inode's jnode */ ++static inline int jnode_is_inode(const jnode * node) ++{ ++ assert("vs-1240", node != NULL); ++ return jnode_get_type(node) == JNODE_INODE; ++} ++ ++static inline jnode_plugin *jnode_ops_of(const jnode_type type) ++{ ++ assert("nikita-2367", type < LAST_JNODE_TYPE); ++ return jnode_plugin_by_id((reiser4_plugin_id) type); ++} ++ ++static inline jnode_plugin *jnode_ops(const jnode * node) ++{ ++ assert("nikita-2366", node != NULL); ++ ++ return jnode_ops_of(jnode_get_type(node)); ++} ++ ++/* Get the index of a block. */ ++static inline unsigned long jnode_get_index(jnode * node) ++{ ++ return jnode_ops(node)->index(node); ++} ++ ++/* return true if "node" is the root */ ++static inline int jnode_is_root(const jnode * node) ++{ ++ return jnode_is_znode(node) && znode_is_root(JZNODE(node)); ++} ++ ++extern struct address_space *mapping_jnode(const jnode * node); ++extern unsigned long index_jnode(const jnode * node); ++ ++static inline void jput(jnode * node); ++extern void jput_final(jnode * node); ++ ++/* bump data counter on @node */ ++static inline void add_d_ref(jnode * node/* node to increase d_count of */) ++{ ++ assert("nikita-1962", node != NULL); ++ ++ atomic_inc(&node->d_count); ++ if (jnode_is_unformatted(node) || jnode_is_znode(node)) ++ LOCK_CNT_INC(d_refs); ++} ++ ++/* jput() - decrement x_count reference counter on znode. ++ ++ Count may drop to 0, jnode stays in cache until memory pressure causes the ++ eviction of its page. The c_count variable also ensures that children are ++ pressured out of memory before the parent. The jnode remains hashed as ++ long as the VM allows its page to stay in memory. ++*/ ++static inline void jput(jnode * node) ++{ ++ assert("jmacd-509", node != NULL); ++ assert("jmacd-510", atomic_read(&node->x_count) > 0); ++ assert("zam-926", reiser4_schedulable()); ++ LOCK_CNT_DEC(x_refs); ++ ++ rcu_read_lock(); ++ /* ++ * we don't need any kind of lock here--jput_final() uses RCU. ++ */ ++ if (unlikely(atomic_dec_and_test(&node->x_count))) ++ jput_final(node); ++ else ++ rcu_read_unlock(); ++ assert("nikita-3473", reiser4_schedulable()); ++} ++ ++extern void jrelse(jnode * node); ++extern void jrelse_tail(jnode * node); ++ ++extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node); ++ ++/* resolve race with jput */ ++static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node) ++{ ++ if (unlikely(JF_ISSET(node, JNODE_RIP))) ++ node = jnode_rip_sync(tree, node); ++ return node; ++} ++ ++extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key); ++ ++#if REISER4_DEBUG ++extern int jnode_invariant_f(const jnode *node, char const **msg); ++#endif ++ ++extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE]; ++ ++/* __JNODE_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/kassign.c linux-2.6.33/fs/reiser4/kassign.c +--- linux-2.6.33.orig/fs/reiser4/kassign.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/kassign.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,677 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Key assignment policy implementation */ ++ ++/* ++ * In reiser4 every piece of file system data and meta-data has a key. Keys ++ * are used to store information in and retrieve it from reiser4 internal ++ * tree. In addition to this, keys define _ordering_ of all file system ++ * information: things having close keys are placed into the same or ++ * neighboring (in the tree order) nodes of the tree. As our block allocator ++ * tries to respect tree order (see flush.c), keys also define order in which ++ * things are laid out on the disk, and hence, affect performance directly. ++ * ++ * Obviously, assignment of keys to data and meta-data should be consistent ++ * across whole file system. Algorithm that calculates a key for a given piece ++ * of data or meta-data is referred to as "key assignment". ++ * ++ * Key assignment is too expensive to be implemented as a plugin (that is, ++ * with an ability to support different key assignment schemas in the same ++ * compiled kernel image). As a compromise, all key-assignment functions and ++ * data-structures are collected in this single file, so that modifications to ++ * key assignment algorithm can be localized. Additional changes may be ++ * required in key.[ch]. ++ * ++ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one ++ * may guess, there is "Plan B" too. ++ * ++ */ ++ ++/* ++ * Additional complication with key assignment implementation is a requirement ++ * to support different key length. ++ */ ++ ++/* ++ * KEY ASSIGNMENT: PLAN A, LONG KEYS. ++ * ++ * DIRECTORY ITEMS ++ * ++ * | 60 | 4 | 7 |1| 56 | 64 | 64 | ++ * +--------------+---+---+-+-------------+------------------+-----------------+ ++ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash | ++ * +--------------+---+---+-+-------------+------------------+-----------------+ ++ * | | | | | ++ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | ++ * ++ * dirid objectid of directory this item is for ++ * ++ * F fibration, see fs/reiser4/plugin/fibration.[ch] ++ * ++ * H 1 if last 8 bytes of the key contain hash, ++ * 0 if last 8 bytes of the key contain prefix-3 ++ * ++ * prefix-1 first 7 characters of file name. ++ * Padded by zeroes if name is not long enough. ++ * ++ * prefix-2 next 8 characters of the file name. ++ * ++ * prefix-3 next 8 characters of the file name. ++ * ++ * hash hash of the rest of file name (i.e., portion of file ++ * name not included into prefix-1 and prefix-2). ++ * ++ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded ++ * in the key. Such file names are called "short". They are distinguished by H ++ * bit set 0 in the key. ++ * ++ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7 ++ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the ++ * key. Last 8 bytes of the key are occupied by hash of the remaining ++ * characters of the name. 
++ * ++ * This key assignment reaches the following important goals: ++ * ++ * (1) directory entries are sorted in approximately lexicographical ++ * order. ++ * ++ * (2) collisions (when multiple directory items have the same key), while ++ * principally unavoidable in a tree with fixed length keys, are rare. ++ * ++ * STAT DATA ++ * ++ * | 60 | 4 | 64 | 4 | 60 | 64 | ++ * +--------------+---+-----------------+---+--------------+-----------------+ ++ * | locality id | 1 | ordering | 0 | objectid | 0 | ++ * +--------------+---+-----------------+---+--------------+-----------------+ ++ * | | | | | ++ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | ++ * ++ * locality id object id of a directory where first name was created for ++ * the object ++ * ++ * ordering copy of second 8-byte portion of the key of directory ++ * entry for the first name of this object. Ordering has a form ++ * { ++ * fibration :7; ++ * h :1; ++ * prefix1 :56; ++ * } ++ * see description of key for directory entry above. ++ * ++ * objectid object id for this object ++ * ++ * This key assignment policy is designed to keep stat-data in the same order ++ * as corresponding directory items, thus speeding up readdir/stat types of ++ * workload. ++ * ++ * FILE BODY ++ * ++ * | 60 | 4 | 64 | 4 | 60 | 64 | ++ * +--------------+---+-----------------+---+--------------+-----------------+ ++ * | locality id | 4 | ordering | 0 | objectid | offset | ++ * +--------------+---+-----------------+---+--------------+-----------------+ ++ * | | | | | ++ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | ++ * ++ * locality id object id of a directory where first name was created for ++ * the object ++ * ++ * ordering the same as in the key of stat-data for this object ++ * ++ * objectid object id for this object ++ * ++ * offset logical offset from the beginning of this file. ++ * Measured in bytes. ++ * ++ * ++ * KEY ASSIGNMENT: PLAN A, SHORT KEYS. ++ * ++ * DIRECTORY ITEMS ++ * ++ * | 60 | 4 | 7 |1| 56 | 64 | ++ * +--------------+---+---+-+-------------+-----------------+ ++ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash | ++ * +--------------+---+---+-+-------------+-----------------+ ++ * | | | | ++ * | 8 bytes | 8 bytes | 8 bytes | ++ * ++ * dirid objectid of directory this item is for ++ * ++ * F fibration, see fs/reiser4/plugin/fibration.[ch] ++ * ++ * H 1 if last 8 bytes of the key contain hash, ++ * 0 if last 8 bytes of the key contain prefix-2 ++ * ++ * prefix-1 first 7 characters of file name. ++ * Padded by zeroes if name is not long enough. ++ * ++ * prefix-2 next 8 characters of the file name. ++ * ++ * hash hash of the rest of file name (i.e., portion of file ++ * name not included into prefix-1). ++ * ++ * File names shorter than 15 (== 7 + 8) characters are completely encoded in ++ * the key. Such file names are called "short". They are distinguished by the H ++ * bit set to 0 in the key. ++ * ++ * Other file names are "long". For a long name, the H bit is 1, and the first 7 ++ * characters are encoded in the prefix-1 portion of the key. The last 8 bytes of ++ * the key are occupied by the hash of the remaining characters of the name.
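++ * ++ * A worked example of the short-key scheme (illustrative names): the name ++ * "foo" (3 characters, shorter than 15) is encoded completely: H = 0, ++ * prefix-1 = "foo" padded with zeroes, prefix-2 = 0. A 20-character name ++ * is "long": H = 1, its first 7 characters go into prefix-1, and the hash ++ * of the remaining 13 characters occupies the last 8 bytes.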
++ * ++ * STAT DATA ++ * ++ * | 60 | 4 | 4 | 60 | 64 | ++ * +--------------+---+---+--------------+-----------------+ ++ * | locality id | 1 | 0 | objectid | 0 | ++ * +--------------+---+---+--------------+-----------------+ ++ * | | | | ++ * | 8 bytes | 8 bytes | 8 bytes | ++ * ++ * locality id object id of a directory where first name was created for ++ * the object ++ * ++ * objectid object id for this object ++ * ++ * FILE BODY ++ * ++ * | 60 | 4 | 4 | 60 | 64 | ++ * +--------------+---+---+--------------+-----------------+ ++ * | locality id | 4 | 0 | objectid | offset | ++ * +--------------+---+---+--------------+-----------------+ ++ * | | | | ++ * | 8 bytes | 8 bytes | 8 bytes | ++ * ++ * locality id object id of a directory where first name was created for ++ * the object ++ * ++ * objectid object id for this object ++ * ++ * offset logical offset from the beginning of this file. ++ * Measured in bytes. ++ * ++ * ++ */ ++ ++#include "debug.h" ++#include "key.h" ++#include "kassign.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "super.h" ++#include "dscale.h" ++ ++#include <linux/types.h> /* for __u?? */ ++#include <linux/fs.h> /* for struct super_block, etc */ ++ ++/* bitmask for H bit (see comment at the beginning of this file */ ++static const __u64 longname_mark = 0x0100000000000000ull; ++/* bitmask for F and H portions of the key. */ ++static const __u64 fibration_mask = 0xff00000000000000ull; ++ ++/* return true if name is not completely encoded in @key */ ++int is_longname_key(const reiser4_key * key) ++{ ++ __u64 highpart; ++ ++ assert("nikita-2863", key != NULL); ++ if (get_key_type(key) != KEY_FILE_NAME_MINOR) ++ reiser4_print_key("oops", key); ++ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR); ++ ++ if (REISER4_LARGE_KEY) ++ highpart = get_key_ordering(key); ++ else ++ highpart = get_key_objectid(key); ++ ++ return (highpart & longname_mark) ? 1 : 0; ++} ++ ++/* return true if @name is too long to be completely encoded in the key */ ++int is_longname(const char *name UNUSED_ARG, int len) ++{ ++ if (REISER4_LARGE_KEY) ++ return len > 23; ++ else ++ return len > 15; ++} ++ ++/* code ascii string into __u64. ++ ++ Put characters of @name into result (@str) one after another starting ++ from @start_idx-th highest (arithmetically) byte. This produces ++ endian-safe encoding. memcpy(2) will not do. ++ ++*/ ++static __u64 pack_string(const char *name /* string to encode */ , ++ int start_idx /* highest byte in result from ++ * which to start encoding */ ) ++{ ++ unsigned i; ++ __u64 str; ++ ++ str = 0; ++ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) { ++ str <<= 8; ++ str |= (unsigned char)name[i]; ++ } ++ str <<= (sizeof str - i - start_idx) << 3; ++ return str; ++} ++ ++/* opposite to pack_string(). 
Takes value produced by pack_string(), restores ++ * string encoded in it and stores result in @buf */ ++char *reiser4_unpack_string(__u64 value, char *buf) ++{ ++ do { ++ *buf = value >> (64 - 8); ++ if (*buf) ++ ++buf; ++ value <<= 8; ++ } while (value != 0); ++ *buf = 0; ++ return buf; ++} ++ ++/* obtain name encoded in @key and store it in @buf */ ++char *extract_name_from_key(const reiser4_key * key, char *buf) ++{ ++ char *c; ++ ++ assert("nikita-2868", !is_longname_key(key)); ++ ++ c = buf; ++ if (REISER4_LARGE_KEY) { ++ c = reiser4_unpack_string(get_key_ordering(key) & ++ ~fibration_mask, c); ++ c = reiser4_unpack_string(get_key_fulloid(key), c); ++ } else ++ c = reiser4_unpack_string(get_key_fulloid(key) & ++ ~fibration_mask, c); ++ reiser4_unpack_string(get_key_offset(key), c); ++ return buf; ++} ++ ++/** ++ * complete_entry_key - calculate entry key by name ++ * @dir: directory the entry is (or will be) in ++ * @name: name to calculate key of ++ * @len: length of name ++ * @result: place to store result in ++ * ++ * Sets fields of entry key @result which depend on file name. ++ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering, ++ * objectid and offset. Otherwise, objectid and offset are set. ++ */ ++void complete_entry_key(const struct inode *dir, const char *name, ++ int len, reiser4_key *result) ++{ ++#if REISER4_LARGE_KEY ++ __u64 ordering; ++ __u64 objectid; ++ __u64 offset; ++ ++ assert("nikita-1139", dir != NULL); ++ assert("nikita-1142", result != NULL); ++ assert("nikita-2867", strlen(name) == len); ++ ++ /* ++ * key allocation algorithm for directory entries in case of large ++ * keys: ++ * ++ * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7 ++ * characters into ordering field of key, next 8 characters (if any) ++ * into objectid field of key and next 8 (if any) into offset ++ * field of key ++ * ++ * If file name is longer than 23 characters, put first 7 characters ++ * into key's ordering, next 8 to objectid and hash of remaining ++ * characters into offset field. ++ * ++ * To distinguish the above cases, in the latter case the unused high ++ * bit is set in the ordering field.
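++ * ++ * For instance (an illustrative, made-up name): for the 27-character ++ * name "quite_long_file_name_here.c", ordering receives "quite_l" plus ++ * the longname_mark and fibration bits, objectid receives "ong_file", ++ * and offset receives the hash of the remaining tail "_name_here.c".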
++ */ ++ ++ /* [0-6] characters to ordering */ ++ ordering = pack_string(name, 1); ++ if (len > 7) { ++ /* [7-14] characters to objectid */ ++ objectid = pack_string(name + 7, 0); ++ if (len > 15) { ++ if (len <= 23) { ++ /* [15-23] characters to offset */ ++ offset = pack_string(name + 15, 0); ++ } else { ++ /* note in a key the fact that offset contains ++ * hash */ ++ ordering |= longname_mark; ++ ++ /* offset is the hash of the file name's tail */ ++ offset = inode_hash_plugin(dir)->hash(name + 15, ++ len - 15); ++ } ++ } else { ++ offset = 0ull; ++ } ++ } else { ++ objectid = 0ull; ++ offset = 0ull; ++ } ++ ++ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); ++ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len); ++ ++ set_key_ordering(result, ordering); ++ set_key_fulloid(result, objectid); ++ set_key_offset(result, offset); ++ return; ++ ++#else ++ __u64 objectid; ++ __u64 offset; ++ ++ assert("nikita-1139", dir != NULL); ++ assert("nikita-1142", result != NULL); ++ assert("nikita-2867", strlen(name) == len); ++ ++ /* ++ * key allocation algorithm for directory entries in case of non-large ++ * keys: ++ * ++ * If name is not longer than 7 + 8 = 15 characters, put first 7 ++ * characters into objectid field of key, next 8 characters (if any) ++ * into offset field of key ++ * ++ * If file name is longer than 15 characters, put first 7 characters ++ * into key's objectid, and hash of remaining characters into offset ++ * field. ++ * ++ * To distinguish the above cases, in the latter case the unused high ++ * bit is set in the objectid field. ++ */ ++ ++ /* [0-6] characters to objectid */ ++ objectid = pack_string(name, 1); ++ if (len > 7) { ++ if (len <= 15) { ++ /* [7-14] characters to offset */ ++ offset = pack_string(name + 7, 0); ++ } else { ++ /* note in a key the fact that offset contains hash. */ ++ objectid |= longname_mark; ++ ++ /* offset is the hash of the file name. */ ++ offset = inode_hash_plugin(dir)->hash(name + 7, ++ len - 7); ++ } ++ } else ++ offset = 0ull; ++ ++ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); ++ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len); ++ ++ set_key_fulloid(result, objectid); ++ set_key_offset(result, offset); ++ return; ++#endif /* ! REISER4_LARGE_KEY */ ++} ++ ++/* true, if @key is the key of "." */ ++int is_dot_key(const reiser4_key * key/* key to check */) ++{ ++ assert("nikita-1717", key != NULL); ++ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR); ++ return ++ (get_key_ordering(key) == 0ull) && ++ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull); ++} ++ ++/* build key for stat-data. ++ ++ return key of stat-data of this object. This should become an sd plugin ++ method in the future. For now, let it be here. ++ ++*/ ++reiser4_key *build_sd_key(const struct inode *target /* inode of an object */ , ++ reiser4_key * result /* resulting key of @target ++ stat-data */ ) ++{ ++ assert("nikita-261", result != NULL); ++ ++ reiser4_key_init(result); ++ set_key_locality(result, reiser4_inode_data(target)->locality_id); ++ set_key_ordering(result, get_inode_ordering(target)); ++ set_key_objectid(result, get_inode_oid(target)); ++ set_key_type(result, KEY_SD_MINOR); ++ set_key_offset(result, (__u64) 0); ++ return result; ++} ++ ++/* encode part of key into &obj_key_id ++ ++ This encodes into @id part of @key sufficient to restore @key later, ++ given that the latter is the key of an object (key of stat-data).
++ ++ See &obj_key_id ++*/ ++int build_obj_key_id(const reiser4_key * key /* key to encode */ , ++ obj_key_id * id/* id where key is encoded in */) ++{ ++ assert("nikita-1151", key != NULL); ++ assert("nikita-1152", id != NULL); ++ ++ memcpy(id, key, sizeof *id); ++ return 0; ++} ++ ++/* encode reference to @obj in @id. ++ ++ This is like build_obj_key_id() above, but takes inode as parameter. */ ++int build_inode_key_id(const struct inode *obj /* object to build key of */ , ++ obj_key_id * id/* result */) ++{ ++ reiser4_key sdkey; ++ ++ assert("nikita-1166", obj != NULL); ++ assert("nikita-1167", id != NULL); ++ ++ build_sd_key(obj, &sdkey); ++ build_obj_key_id(&sdkey, id); ++ return 0; ++} ++ ++/* decode @id back into @key ++ ++ Restore key of object stat-data from @id. This is dual to ++ build_obj_key_id() above. ++*/ ++int extract_key_from_id(const obj_key_id * id /* object key id to extract key ++ * from */ , ++ reiser4_key * key/* result */) ++{ ++ assert("nikita-1153", id != NULL); ++ assert("nikita-1154", key != NULL); ++ ++ reiser4_key_init(key); ++ memcpy(key, id, sizeof *id); ++ return 0; ++} ++ ++/* extract objectid of directory from key of directory entry within said ++ directory. ++ */ ++oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of ++ * directory ++ * entry */ ) ++{ ++ assert("nikita-1314", de_key != NULL); ++ return get_key_locality(de_key); ++} ++ ++/* encode into @id key of directory entry. ++ ++ Encode into @id information sufficient to later distinguish directory ++ entries within the same directory. This is not whole key, because all ++ directory entries within directory item share locality which is equal ++ to objectid of their directory. ++ ++*/ ++int build_de_id(const struct inode *dir /* inode of directory */ , ++ const struct qstr *name /* name to be given to @obj by ++ * directory entry being ++ * constructed */ , ++ de_id * id/* short key of directory entry */) ++{ ++ reiser4_key key; ++ ++ assert("nikita-1290", dir != NULL); ++ assert("nikita-1292", id != NULL); ++ ++ /* NOTE-NIKITA this is suboptimal. */ ++ inode_dir_plugin(dir)->build_entry_key(dir, name, &key); ++ return build_de_id_by_key(&key, id); ++} ++ ++/* encode into @id key of directory entry. ++ ++ Encode into @id information sufficient to later distinguish directory ++ entries within the same directory. This is not whole key, because all ++ directory entries within directory item share locality which is equal ++ to objectid of their directory. ++ ++*/ ++int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory ++ * entry */ , ++ de_id * id/* short key of directory entry */) ++{ ++ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id); ++ return 0; ++} ++ ++/* restore from @id key of directory entry. ++ ++ Function dual to build_de_id(): given @id and locality, build full ++ key of directory entry within directory item. 
++ ++*/ ++int extract_key_from_de_id(const oid_t locality /* locality of directory ++ * entry */ , ++ const de_id * id /* directory entry id */ , ++ reiser4_key * key/* result */) ++{ ++ /* no need to initialise key here: all fields are overwritten */ ++ memcpy(((__u64 *) key) + 1, id, sizeof *id); ++ set_key_locality(key, locality); ++ set_key_type(key, KEY_FILE_NAME_MINOR); ++ return 0; ++} ++ ++/* compare two &de_id's */ ++cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ , ++ const de_id * id2/* second &de_id to compare */) ++{ ++ /* NOTE-NIKITA ugly implementation */ ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ extract_key_from_de_id((oid_t) 0, id1, &k1); ++ extract_key_from_de_id((oid_t) 0, id2, &k2); ++ return keycmp(&k1, &k2); ++} ++ ++/* compare &de_id with key */ ++cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ , ++ const reiser4_key * key/* key to compare */) ++{ ++ cmp_t result; ++ reiser4_key *k1; ++ ++ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]); ++ result = KEY_DIFF_EL(k1, key, 1); ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF_EL(k1, key, 2); ++ if (REISER4_LARGE_KEY && result == EQUAL_TO) ++ result = KEY_DIFF_EL(k1, key, 3); ++ } ++ return result; ++} ++ ++/* ++ * return number of bytes necessary to encode @inode identity. ++ */ ++int inode_onwire_size(const struct inode *inode) ++{ ++ int result; ++ ++ result = dscale_bytes_to_write(get_inode_oid(inode)); ++ result += dscale_bytes_to_write(get_inode_locality(inode)); ++ ++ /* ++ * ordering is large (it usually has highest bits set), so it makes ++ * little sense to dscale it. ++ */ ++ if (REISER4_LARGE_KEY) ++ result += sizeof(get_inode_ordering(inode)); ++ return result; ++} ++ ++/* ++ * encode @inode identity at @start ++ */ ++char *build_inode_onwire(const struct inode *inode, char *start) ++{ ++ start += dscale_write(start, get_inode_locality(inode)); ++ start += dscale_write(start, get_inode_oid(inode)); ++ ++ if (REISER4_LARGE_KEY) { ++ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start); ++ start += sizeof(get_inode_ordering(inode)); ++ } ++ return start; ++} ++ ++/* ++ * extract key that was previously encoded by build_inode_onwire() at @addr ++ */ ++char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id) ++{ ++ __u64 val; ++ ++ addr += dscale_read(addr, &val); ++ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR; ++ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality); ++ addr += dscale_read(addr, &val); ++ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid); ++#if REISER4_LARGE_KEY ++ memcpy(&key_id->ordering, addr, sizeof key_id->ordering); ++ addr += sizeof key_id->ordering; ++#endif ++ return addr; ++} ++ ++/* ++ * skip a key that was previously encoded by build_inode_onwire() at @addr ++ * FIXME: handle IO errors. ++ */ ++char * locate_obj_key_id_onwire(char * addr) ++{ ++ /* locality */ ++ addr += dscale_bytes_to_read(addr); ++ /* objectid */ ++ addr += dscale_bytes_to_read(addr); ++#if REISER4_LARGE_KEY ++ addr += sizeof ((obj_key_id *)0)->ordering; ++#endif ++ return addr; ++} ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/kassign.h linux-2.6.33/fs/reiser4/kassign.h +--- linux-2.6.33.orig/fs/reiser4/kassign.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/kassign.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,111 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Key assignment policy interface. See kassign.c for details. */ ++ ++#if !defined(__KASSIGN_H__) ++#define __KASSIGN_H__ ++ ++#include "forward.h" ++#include "key.h" ++#include "dformat.h" ++ ++#include <linux/types.h> /* for __u?? */ ++#include <linux/fs.h> /* for struct super_block, etc */ ++#include <linux/dcache.h> /* for struct qstr */ ++ ++/* key assignment functions */ ++ ++/* Information from which key of file stat-data can be uniquely ++ restored. This depends on key assignment policy for ++ stat-data. Currently it's enough to store object id and locality id ++ (60+60==120) bits, because minor packing locality and offset of ++ stat-data key are always known constants: KEY_SD_MINOR and 0 ++ respectively. For simplicity 4 bits are wasted in each id, and just ++ two 64 bit integers are stored. ++ ++ This field has to be byte-aligned, because we don't want to waste ++ space in directory entries. There is another side of the coin, of ++ course: we waste CPU and bus bandwidth instead, by copying data back ++ and forth. ++ ++ Next optimization: &obj_key_id is mainly used to address stat data from ++ directory entries. Under the assumption that the majority of files have ++ only one name (one hard link) from *the* parent directory, it seems reasonable ++ to only store objectid of stat data and take its locality from key of ++ directory item. ++ ++ This requires some flag to be added to the &obj_key_id to distinguish ++ between these two cases. The remaining bits in the flag byte are then ++ available to store the file type. ++ ++ This optimization requires changes in directory item handling code. ++ ++*/ ++typedef struct obj_key_id { ++ d8 locality[sizeof(__u64)]; ++ ON_LARGE_KEY(d8 ordering[sizeof(__u64)]; ++ ) ++ d8 objectid[sizeof(__u64)]; ++} ++obj_key_id; ++ ++/* Information sufficient to uniquely identify directory entry within ++ compressed directory item. ++ ++ For alignment issues see &obj_key_id above.
++*/ ++typedef struct de_id { ++ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];) ++ d8 objectid[sizeof(__u64)]; ++ d8 offset[sizeof(__u64)]; ++} ++de_id; ++ ++extern int inode_onwire_size(const struct inode *obj); ++extern char *build_inode_onwire(const struct inode *obj, char *area); ++extern char *locate_obj_key_id_onwire(char *area); ++extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id); ++ ++extern int build_inode_key_id(const struct inode *obj, obj_key_id * id); ++extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key); ++extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id); ++extern oid_t extract_dir_id_from_key(const reiser4_key * de_key); ++extern int build_de_id(const struct inode *dir, const struct qstr *name, ++ de_id * id); ++extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id); ++extern int extract_key_from_de_id(const oid_t locality, const de_id * id, ++ reiser4_key * key); ++extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2); ++extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key); ++ ++extern int build_readdir_key_common(struct file *dir, reiser4_key * result); ++extern void build_entry_key_common(const struct inode *dir, ++ const struct qstr *name, ++ reiser4_key * result); ++extern void build_entry_key_stable_entry(const struct inode *dir, ++ const struct qstr *name, ++ reiser4_key * result); ++extern int is_dot_key(const reiser4_key * key); ++extern reiser4_key *build_sd_key(const struct inode *target, ++ reiser4_key * result); ++ ++extern int is_longname_key(const reiser4_key * key); ++extern int is_longname(const char *name, int len); ++extern char *extract_name_from_key(const reiser4_key * key, char *buf); ++extern char *reiser4_unpack_string(__u64 value, char *buf); ++extern void complete_entry_key(const struct inode *dir, const char *name, ++ int len, reiser4_key *result); ++ ++/* __KASSIGN_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/Kconfig linux-2.6.33/fs/reiser4/Kconfig +--- linux-2.6.33.orig/fs/reiser4/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/Kconfig 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,34 @@ ++config REISER4_FS ++ tristate "Reiser4 (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ select LZO_COMPRESS ++ select LZO_DECOMPRESS ++ select CRYPTO ++ help ++ Reiser4 is a filesystem that performs all filesystem operations ++ as atomic transactions, which means that it either performs a ++ write, or it does not, and in the event of a crash it does not ++ partially perform it or corrupt it. ++ ++ It stores files in dancing trees, which are like balanced trees but ++ faster. It packs small files together so that they share blocks ++ without wasting space. This means you can use it to store really ++ small files. It also means that it saves you disk space. It avoids ++ hassling you with anachronisms like having a maximum number of ++ inodes, and wasting space if you use less than that number. ++ ++ Reiser4 is a distinct filesystem type from reiserfs (V3). ++ It's therefore not possible to use reiserfs file systems ++ with reiser4. 
++ ++ To learn more about reiser4, go to http://www.namesys.com ++ ++config REISER4_DEBUG ++ bool "Enable reiser4 debug mode" ++ depends on REISER4_FS ++ help ++ Don't use this unless you are debugging reiser4. ++ ++ If unsure, say N. +diff -urN linux-2.6.33.orig/fs/reiser4/key.c linux-2.6.33/fs/reiser4/key.c +--- linux-2.6.33.orig/fs/reiser4/key.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/key.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,138 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Key manipulations. */ ++ ++#include "debug.h" ++#include "key.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/types.h> /* for __u?? */ ++ ++/* Minimal possible key: all components are zero. It is presumed that this is ++ independent of key scheme. */ ++static const reiser4_key MINIMAL_KEY = { ++ .el = { ++ 0ull, ++ ON_LARGE_KEY(0ull,) ++ 0ull, ++ 0ull ++ } ++}; ++ ++/* Maximal possible key: all components are ~0. It is presumed that this is ++ independent of key scheme. */ ++static const reiser4_key MAXIMAL_KEY = { ++ .el = { ++ __constant_cpu_to_le64(~0ull), ++ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),) ++ __constant_cpu_to_le64(~0ull), ++ __constant_cpu_to_le64(~0ull) ++ } ++}; ++ ++/* Initialize key. */ ++void reiser4_key_init(reiser4_key * key/* key to init */) ++{ ++ assert("nikita-1169", key != NULL); ++ memset(key, 0, sizeof *key); ++} ++ ++/* minimal possible key in the tree. Return pointer to the static storage. */ ++const reiser4_key * reiser4_min_key(void) ++{ ++ return &MINIMAL_KEY; ++} ++ ++/* maximum possible key in the tree. Return pointer to the static storage. */ ++const reiser4_key * reiser4_max_key(void) ++{ ++ return &MAXIMAL_KEY; ++} ++ ++#if REISER4_DEBUG ++/* debugging aid: print symbolic name of key type */ ++static const char *type_name(unsigned int key_type/* key type */) ++{ ++ switch (key_type) { ++ case KEY_FILE_NAME_MINOR: ++ return "file name"; ++ case KEY_SD_MINOR: ++ return "stat data"; ++ case KEY_ATTR_NAME_MINOR: ++ return "attr name"; ++ case KEY_ATTR_BODY_MINOR: ++ return "attr body"; ++ case KEY_BODY_MINOR: ++ return "file body"; ++ default: ++ return "unknown"; ++ } ++} ++ ++/* debugging aid: print human readable information about key */ ++void reiser4_print_key(const char *prefix /* prefix to print */ , ++ const reiser4_key * key/* key to print */) ++{ ++ /* turn bold on */ ++ /* printf ("\033[1m"); */ ++ if (key == NULL) ++ printk("%s: null key\n", prefix); ++ else { ++ if (REISER4_LARGE_KEY) ++ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix, ++ get_key_locality(key), ++ get_key_type(key), ++ get_key_ordering(key), ++ get_key_band(key), ++ get_key_objectid(key), get_key_offset(key)); ++ else ++ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix, ++ get_key_locality(key), ++ get_key_type(key), ++ get_key_band(key), ++ get_key_objectid(key), get_key_offset(key)); ++ /* ++ * if this is a key of directory entry, try to decode part of ++ * a name stored in the key, and output it. ++ */ ++ if (get_key_type(key) == KEY_FILE_NAME_MINOR) { ++ char buf[DE_NAME_BUF_LEN]; ++ char *c; ++ ++ c = buf; ++ c = reiser4_unpack_string(get_key_ordering(key), c); ++ reiser4_unpack_string(get_key_fulloid(key), c); ++ printk("[%s", buf); ++ if (is_longname_key(key)) ++ /* ++ * only part of the name is stored in the key. ++ */ ++ printk("...]\n"); ++ else { ++ /* ++ * whole name is stored in the key. 
++ */ ++ reiser4_unpack_string(get_key_offset(key), buf); ++ printk("%s]\n", buf); ++ } ++ } else { ++ printk("[%s]\n", type_name(get_key_type(key))); ++ } ++ } ++ /* turn bold off */ ++ /* printf ("\033[m\017"); */ ++} ++ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/key.h linux-2.6.33/fs/reiser4/key.h +--- linux-2.6.33.orig/fs/reiser4/key.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/key.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,392 @@ ++/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Declarations of key-related data-structures and operations on keys. */ ++ ++#if !defined(__REISER4_KEY_H__) ++#define __REISER4_KEY_H__ ++ ++#include "dformat.h" ++#include "forward.h" ++#include "debug.h" ++ ++#include <linux/types.h> /* for __u?? */ ++ ++/* Operations on keys in reiser4 tree */ ++ ++/* No access to any of these fields shall be done except via a ++ wrapping macro/function, and that wrapping macro/function shall ++ convert to little endian order. Compare keys will consider cpu byte order. */ ++ ++/* A storage layer implementation difference between a regular unix file body ++ and its attributes is in the typedef below which causes all of the attributes ++ of a file to be near in key to all of the other attributes for all of the ++ files within that directory, and not near to the file itself. It is ++ interesting to consider whether this is the wrong approach, and whether there ++ should be no difference at all. For current usage patterns this choice is ++ probably the right one. */ ++ ++/* possible values for minor packing locality (4 bits required) */ ++typedef enum { ++ /* file name */ ++ KEY_FILE_NAME_MINOR = 0, ++ /* stat-data */ ++ KEY_SD_MINOR = 1, ++ /* file attribute name */ ++ KEY_ATTR_NAME_MINOR = 2, ++ /* file attribute value */ ++ KEY_ATTR_BODY_MINOR = 3, ++ /* file body (tail or extent) */ ++ KEY_BODY_MINOR = 4, ++} key_minor_locality; ++ ++/* Everything stored in the tree has a unique key, which means that the tree is ++ (logically) fully ordered by key. Physical order is determined by dynamic ++ heuristics that attempt to reflect key order when allocating available space, ++ and by the repacker. It is stylistically better to put aggregation ++ information into the key. Thus, if you want to segregate extents from tails, ++ it is better to give them distinct minor packing localities rather than ++ changing block_alloc.c to check the node type when deciding where to allocate ++ the node. ++ ++ The need to randomly displace new directories and large files disturbs this ++ symmetry unfortunately. However, it should be noted that this is a need that ++ is not clearly established given the existence of a repacker. Also, in our ++ current implementation tails have a different minor packing locality from ++ extents, and no files have both extents and tails, so maybe symmetry can be ++ had without performance cost after all. Symmetry is what we ship for now.... ++*/ ++ ++/* Arbitrary major packing localities can be assigned to objects using ++ the reiser4(filenameA/..packing<=some_number) system call. ++ ++ In reiser4, the creat() syscall creates a directory ++ ++ whose default flow (that which is referred to if the directory is ++ read as a file) is the traditional unix file body. 
++ ++ whose directory plugin is the 'filedir' ++ ++ whose major packing locality is that of the parent of the object created. ++ ++ The static_stat item is a particular commonly used directory ++ compression (the one for normal unix files). ++ ++ The filedir plugin checks to see if the static_stat item exists. ++ There is a unique key for static_stat. If yes, then it uses the ++ static_stat item for all of the values that it contains. The ++ static_stat item contains a flag for each stat it contains which ++ indicates whether one should look outside the static_stat item for its ++ contents. ++*/ ++ ++/* offset of fields in reiser4_key. Value of each element of this enum ++ is index within key (thought as array of __u64's) where this field ++ is. */ ++typedef enum { ++ /* major "locale", aka dirid. Sits in 1st element */ ++ KEY_LOCALITY_INDEX = 0, ++ /* minor "locale", aka item type. Sits in 1st element */ ++ KEY_TYPE_INDEX = 0, ++ ON_LARGE_KEY(KEY_ORDERING_INDEX,) ++ /* "object band". Sits in 2nd element */ ++ KEY_BAND_INDEX, ++ /* objectid. Sits in 2nd element */ ++ KEY_OBJECTID_INDEX = KEY_BAND_INDEX, ++ /* full objectid. Sits in 2nd element */ ++ KEY_FULLOID_INDEX = KEY_BAND_INDEX, ++ /* Offset. Sits in 3rd element */ ++ KEY_OFFSET_INDEX, ++ /* Name hash. Sits in 3rd element */ ++ KEY_HASH_INDEX = KEY_OFFSET_INDEX, ++ KEY_CACHELINE_END = KEY_OFFSET_INDEX, ++ KEY_LAST_INDEX ++} reiser4_key_field_index; ++ ++/* key in reiser4 internal "balanced" tree. It is just array of three ++ 64bit integers in disk byte order (little-endian by default). This ++ array is actually indexed by reiser4_key_field. Each __u64 within ++ this array is called "element". Logical key component encoded within ++ elements are called "fields". ++ ++ We declare this as union with second component dummy to suppress ++ inconvenient array<->pointer casts implied in C. */ ++union reiser4_key { ++ __le64 el[KEY_LAST_INDEX]; ++ int pad; ++}; ++ ++/* bitmasks showing where within reiser4_key particular key is stored. 
*/ ++/* major locality occupies higher 60 bits of the first element */ ++#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull ++ ++/* minor locality occupies lower 4 bits of the first element */ ++#define KEY_TYPE_MASK 0xfull ++ ++/* controversial band occupies higher 4 bits of the 2nd element */ ++#define KEY_BAND_MASK 0xf000000000000000ull ++ ++/* objectid occupies lower 60 bits of the 2nd element */ ++#define KEY_OBJECTID_MASK 0x0fffffffffffffffull ++ ++/* full 64bit objectid*/ ++#define KEY_FULLOID_MASK 0xffffffffffffffffull ++ ++/* offset is just 3rd L.M.Nt itself */ ++#define KEY_OFFSET_MASK 0xffffffffffffffffull ++ ++/* ordering is whole second element */ ++#define KEY_ORDERING_MASK 0xffffffffffffffffull ++ ++/* how many bits key element should be shifted to left to get particular field ++ */ ++typedef enum { ++ KEY_LOCALITY_SHIFT = 4, ++ KEY_TYPE_SHIFT = 0, ++ KEY_BAND_SHIFT = 60, ++ KEY_OBJECTID_SHIFT = 0, ++ KEY_FULLOID_SHIFT = 0, ++ KEY_OFFSET_SHIFT = 0, ++ KEY_ORDERING_SHIFT = 0, ++} reiser4_key_field_shift; ++ ++static inline __u64 ++get_key_el(const reiser4_key * key, reiser4_key_field_index off) ++{ ++ assert("nikita-753", key != NULL); ++ assert("nikita-754", off < KEY_LAST_INDEX); ++ return le64_to_cpu(get_unaligned(&key->el[off])); ++} ++ ++static inline void ++set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value) ++{ ++ assert("nikita-755", key != NULL); ++ assert("nikita-756", off < KEY_LAST_INDEX); ++ put_unaligned(cpu_to_le64(value), &key->el[off]); ++} ++ ++/* macro to define getter and setter functions for field F with type T */ ++#define DEFINE_KEY_FIELD(L, U, T) \ ++static inline T get_key_ ## L(const reiser4_key *key) \ ++{ \ ++ assert("nikita-750", key != NULL); \ ++ return (T) (get_key_el(key, KEY_ ## U ## _INDEX) & \ ++ KEY_ ## U ## _MASK) >> KEY_ ## U ## _SHIFT; \ ++} \ ++ \ ++static inline void set_key_ ## L(reiser4_key * key, T loc) \ ++{ \ ++ __u64 el; \ ++ \ ++ assert("nikita-752", key != NULL); \ ++ \ ++ el = get_key_el(key, KEY_ ## U ## _INDEX); \ ++ /* clear field bits in the key */ \ ++ el &= ~KEY_ ## U ## _MASK; \ ++ /* actually it should be \ ++ \ ++ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \ ++ \ ++ but we trust user to never pass values that wouldn't fit \ ++ into field. Clearing extra bits is one operation, but this \ ++ function is time-critical. \ ++ But check this in assertion. 
*/ \ ++ assert("nikita-759", ((loc << KEY_ ## U ## _SHIFT) & \ ++ ~KEY_ ## U ## _MASK) == 0); \ ++ el |= (loc << KEY_ ## U ## _SHIFT); \ ++ set_key_el(key, KEY_ ## U ## _INDEX, el); \ ++} ++ ++typedef __u64 oid_t; ++ ++/* define get_key_locality(), set_key_locality() */ ++DEFINE_KEY_FIELD(locality, LOCALITY, oid_t); ++/* define get_key_type(), set_key_type() */ ++DEFINE_KEY_FIELD(type, TYPE, key_minor_locality); ++/* define get_key_band(), set_key_band() */ ++DEFINE_KEY_FIELD(band, BAND, __u64); ++/* define get_key_objectid(), set_key_objectid() */ ++DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t); ++/* define get_key_fulloid(), set_key_fulloid() */ ++DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t); ++/* define get_key_offset(), set_key_offset() */ ++DEFINE_KEY_FIELD(offset, OFFSET, __u64); ++#if (REISER4_LARGE_KEY) ++/* define get_key_ordering(), set_key_ordering() */ ++DEFINE_KEY_FIELD(ordering, ORDERING, __u64); ++#else ++static inline __u64 get_key_ordering(const reiser4_key * key) ++{ ++ return 0; ++} ++ ++static inline void set_key_ordering(reiser4_key * key, __u64 val) ++{ ++} ++#endif ++ ++/* key comparison result */ ++typedef enum { LESS_THAN = -1, /* if first key is less than second */ ++ EQUAL_TO = 0, /* if keys are equal */ ++ GREATER_THAN = +1 /* if first key is greater than second */ ++} cmp_t; ++ ++void reiser4_key_init(reiser4_key * key); ++ ++/* minimal possible key in the tree. Return pointer to the static storage. */ ++extern const reiser4_key *reiser4_min_key(void); ++extern const reiser4_key *reiser4_max_key(void); ++ ++/* helper macro for keycmp() */ ++#define KEY_DIFF(k1, k2, field) \ ++({ \ ++ typeof(get_key_ ## field(k1)) f1; \ ++ typeof(get_key_ ## field(k2)) f2; \ ++ \ ++ f1 = get_key_ ## field(k1); \ ++ f2 = get_key_ ## field(k2); \ ++ \ ++ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \ ++}) ++ ++/* helper macro for keycmp() */ ++#define KEY_DIFF_EL(k1, k2, off) \ ++({ \ ++ __u64 e1; \ ++ __u64 e2; \ ++ \ ++ e1 = get_key_el(k1, off); \ ++ e2 = get_key_el(k2, off); \ ++ \ ++ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \ ++}) ++ ++/* compare `k1' and `k2'. This function is a heart of "key allocation ++ policy". All you need to implement new policy is to add yet another ++ clause here. */ ++static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2/* second key to compare */) ++{ ++ cmp_t result; ++ ++ /* ++ * This function is the heart of reiser4 tree-routines. Key comparison ++ * is among most heavily used operations in the file system. ++ */ ++ ++ assert("nikita-439", k1 != NULL); ++ assert("nikita-440", k2 != NULL); ++ ++ /* there is no actual branch here: condition is compile time constant ++ * and constant folding and propagation ensures that only one branch ++ * is actually compiled in. */ ++ ++ if (REISER4_PLANA_KEY_ALLOCATION) { ++ /* if physical order of fields in a key is identical ++ with logical order, we can implement key comparison ++ as three 64bit comparisons. */ ++ /* logical order of fields in plan-a: ++ locality->type->objectid->offset. 
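++ ++ For example (illustrative, small-key case): two keys addressing ++ different offsets within the same file body agree in elements 0 and 1, ++ so the first two KEY_DIFF_EL() comparisons return EQUAL_TO and the ++ keys are ordered by the offset element alone.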
*/ ++ /* compare locality and type at once */ ++ result = KEY_DIFF_EL(k1, k2, 0); ++ if (result == EQUAL_TO) { ++ /* compare objectid (and band if it's there) */ ++ result = KEY_DIFF_EL(k1, k2, 1); ++ /* compare offset */ ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF_EL(k1, k2, 2); ++ if (REISER4_LARGE_KEY && result == EQUAL_TO) ++ result = KEY_DIFF_EL(k1, k2, 3); ++ } ++ } ++ } else if (REISER4_3_5_KEY_ALLOCATION) { ++ result = KEY_DIFF(k1, k2, locality); ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF(k1, k2, objectid); ++ if (result == EQUAL_TO) { ++ result = KEY_DIFF(k1, k2, type); ++ if (result == EQUAL_TO) ++ result = KEY_DIFF(k1, k2, offset); ++ } ++ } ++ } else ++ impossible("nikita-441", "Unknown key allocation scheme!"); ++ return result; ++} ++ ++/* true if @k1 equals @k2 */ ++static inline int keyeq(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2/* second key to compare */) ++{ ++ assert("nikita-1879", k1 != NULL); ++ assert("nikita-1880", k2 != NULL); ++ return !memcmp(k1, k2, sizeof *k1); ++} ++ ++/* true if @k1 is less than @k2 */ ++static inline int keylt(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2/* second key to compare */) ++{ ++ assert("nikita-1952", k1 != NULL); ++ assert("nikita-1953", k2 != NULL); ++ return keycmp(k1, k2) == LESS_THAN; ++} ++ ++/* true if @k1 is less than or equal to @k2 */ ++static inline int keyle(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2/* second key to compare */) ++{ ++ assert("nikita-1954", k1 != NULL); ++ assert("nikita-1955", k2 != NULL); ++ return keycmp(k1, k2) != GREATER_THAN; ++} ++ ++/* true if @k1 is greater than @k2 */ ++static inline int keygt(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2/* second key to compare */) ++{ ++ assert("nikita-1959", k1 != NULL); ++ assert("nikita-1960", k2 != NULL); ++ return keycmp(k1, k2) == GREATER_THAN; ++} ++ ++/* true if @k1 is greater than or equal to @k2 */ ++static inline int keyge(const reiser4_key * k1 /* first key to compare */ , ++ const reiser4_key * k2/* second key to compare */) ++{ ++ assert("nikita-1956", k1 != NULL); ++ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched ++ * November 3: Laika */ ++ return keycmp(k1, k2) != LESS_THAN; ++} ++ ++static inline void prefetchkey(reiser4_key * key) ++{ ++ prefetch(key); ++ prefetch(&key->el[KEY_CACHELINE_END]); ++} ++ ++/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) = ++ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */ ++/* size of a buffer suitable to hold human readable key representation */ ++#define KEY_BUF_LEN (80) ++ ++#if REISER4_DEBUG ++extern void reiser4_print_key(const char *prefix, const reiser4_key * key); ++#else ++#define reiser4_print_key(p, k) noop ++#endif ++ ++/* __FS_REISERFS_KEY_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/ktxnmgrd.c linux-2.6.33/fs/reiser4/ktxnmgrd.c +--- linux-2.6.33.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/ktxnmgrd.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,215 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* Transaction manager daemon. */ ++ ++/* ++ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is ++ * needed/important for the following reasons: ++ * ++ * 1. 
in reiser4 an atom is not committed immediately when the last transaction ++ * handle closes, unless the atom is either too old or too large (see ++ * atom_should_commit()). This is done to avoid committing too frequently, ++ * because: ++ * ++ * 2. sometimes we don't want to commit an atom when closing the last transaction ++ * handle even if it is old and fat enough. For example, because we are at ++ * this point under directory semaphore, and committing would stall all ++ * accesses to this directory. ++ * ++ * ktxnmgrd bides its time sleeping on a condition variable. When it awakes, ++ * either due to a (tunable) timeout or because it was explicitly woken up by ++ * a call to ktxnmgrd_kick(), it scans the list of all atoms and commits the ++ * eligible ones. ++ * ++ */ ++ ++#include "debug.h" ++#include "txnmgr.h" ++#include "tree.h" ++#include "ktxnmgrd.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/sched.h> /* for struct task_struct */ ++#include <linux/wait.h> ++#include <linux/suspend.h> ++#include <linux/kernel.h> ++#include <linux/writeback.h> ++#include <linux/kthread.h> ++#include <linux/freezer.h> ++ ++static int scan_mgr(struct super_block *); ++ ++/* ++ * change current->comm so that ps, top, and friends will see changed ++ * state. This serves no useful purpose whatsoever, but also costs nothing. ++ * Maybe it will make a lonely system administrator feel less alone at 3 A.M. ++ */ ++#define set_comm(state) \ ++ snprintf(current->comm, sizeof(current->comm), \ ++ "%s:%s:%s", __FUNCTION__, (super)->s_id, (state)) ++ ++/** ++ * ktxnmgrd - kernel txnmgr daemon ++ * @arg: pointer to super block ++ * ++ * The background transaction manager daemon, started as a kernel thread during ++ * reiser4 initialization. ++ */ ++static int ktxnmgrd(void *arg) ++{ ++ struct super_block *super; ++ ktxnmgrd_context *ctx; ++ txn_mgr *mgr; ++ int done = 0; ++ ++ super = arg; ++ mgr = &get_super_private(super)->tmgr; ++ ++ /* ++ * do_fork() just copies task_struct into the new thread. ->fs_context ++ * shouldn't be copied of course. This shouldn't be a problem for the ++ * rest of the code though. ++ */ ++ current->journal_info = NULL; ++ ctx = mgr->daemon; ++ while (1) { ++ try_to_freeze(); ++ set_comm("wait"); ++ { ++ DEFINE_WAIT(__wait); ++ ++ prepare_to_wait(&ctx->wait, &__wait, ++ TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) ++ done = 1; ++ else ++ schedule_timeout(ctx->timeout); ++ finish_wait(&ctx->wait, &__wait); ++ } ++ if (done) ++ break; ++ set_comm("run"); ++ spin_lock(&ctx->guard); ++ /* ++ * wait timed out or ktxnmgrd was woken up by explicit request ++ * to commit something. Scan list of atoms in txnmgr and look ++ * for too old atoms. ++ */ ++ do { ++ ctx->rescan = 0; ++ /* drop the guard while scanning: scan_mgr() may sleep */ ++ spin_unlock(&ctx->guard); ++ scan_mgr(super); ++ spin_lock(&ctx->guard); ++ if (ctx->rescan) { ++ /* ++ * the list could be modified while ctx ++ * spinlock was released, we have to repeat ++ * scanning from the beginning ++ */ ++ break; ++ } ++ } while (ctx->rescan); ++ spin_unlock(&ctx->guard); ++ } ++ return 0; ++} ++ ++#undef set_comm ++ ++/** ++ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon ++ * @super: pointer to super block ++ * ++ * Allocates and initializes ktxnmgrd_context, attaches it to transaction ++ * manager. Starts kernel txnmgr daemon. This is called on mount.
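++ * ++ * Illustrative call sequence (a sketch; error handling omitted): ++ * ++ *	err = reiser4_init_ktxnmgrd(super);	(on mount) ++ *	ktxnmgrd_kick(&get_super_private(super)->tmgr);	(to force a scan) ++ *	reiser4_done_ktxnmgrd(super);	(on umount)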
++ */
++int reiser4_init_ktxnmgrd(struct super_block *super)
++{
++	txn_mgr *mgr;
++	ktxnmgrd_context *ctx;
++
++	mgr = &get_super_private(super)->tmgr;
++
++	assert("zam-1014", mgr->daemon == NULL);
++
++	ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
++	if (!ctx)
++		return RETERR(-ENOMEM);
++
++	assert("nikita-2442", ctx != NULL);
++
++	init_waitqueue_head(&ctx->wait);
++
++	/*kcond_init(&ctx->startup);*/
++	spin_lock_init(&ctx->guard);
++	ctx->timeout = REISER4_TXNMGR_TIMEOUT;
++	ctx->rescan = 1;
++	mgr->daemon = ctx;
++
++	ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
++	if (IS_ERR(ctx->tsk)) {
++		int ret = PTR_ERR(ctx->tsk);
++		mgr->daemon = NULL;
++		kfree(ctx);
++		return RETERR(ret);
++	}
++	return 0;
++}
++
++void ktxnmgrd_kick(txn_mgr *mgr)
++{
++	assert("nikita-3234", mgr != NULL);
++	assert("nikita-3235", mgr->daemon != NULL);
++	wake_up(&mgr->daemon->wait);
++}
++
++int is_current_ktxnmgrd(void)
++{
++	return (get_current_super_private()->tmgr.daemon->tsk == current);
++}
++
++/**
++ * scan_mgr - commit atoms which are to be committed
++ * @super: super block to commit atoms of
++ *
++ * Commits old atoms.
++ */
++static int scan_mgr(struct super_block *super)
++{
++	int ret;
++	reiser4_context ctx;
++
++	init_stack_context(&ctx, super);
++
++	ret = commit_some_atoms(&get_super_private(super)->tmgr);
++
++	reiser4_exit_context(&ctx);
++	return ret;
++}
++
++/**
++ * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
++ * @super: pointer to super block
++ *
++ * This is called on umount. Stops ktxnmgrd and frees the attached
++ * ktxnmgrd_context.
++ */
++void reiser4_done_ktxnmgrd(struct super_block *super)
++{
++	txn_mgr *mgr;
++
++	mgr = &get_super_private(super)->tmgr;
++	assert("zam-1012", mgr->daemon != NULL);
++
++	kthread_stop(mgr->daemon->tsk);
++	kfree(mgr->daemon);
++	mgr->daemon = NULL;
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 120
++ * End:
++ */
+diff -urN linux-2.6.33.orig/fs/reiser4/ktxnmgrd.h linux-2.6.33/fs/reiser4/ktxnmgrd.h
+--- linux-2.6.33.orig/fs/reiser4/ktxnmgrd.h	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/ktxnmgrd.h	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,52 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Transaction manager daemon. See ktxnmgrd.c for comments. */
++
++#ifndef __KTXNMGRD_H__
++#define __KTXNMGRD_H__
++
++#include "txnmgr.h"
++
++#include <linux/fs.h>
++#include <linux/wait.h>
++#include <linux/completion.h>
++#include <linux/spinlock.h>
++#include <asm/atomic.h>
++#include <linux/sched.h>	/* for struct task_struct */
++
++/* in this structure all data necessary to start up, shut down and communicate
++ * with ktxnmgrd are kept. */
++struct ktxnmgrd_context {
++	/* wait queue head on which ktxnmgrd sleeps */
++	wait_queue_head_t wait;
++	/* spin lock protecting all fields of this structure */
++	spinlock_t guard;
++	/* timeout of sleeping on ->wait */
++	signed long timeout;
++	/* kernel thread running ktxnmgrd */
++	struct task_struct *tsk;
++	/* list of all file systems served by this ktxnmgrd */
++	struct list_head queue;
++	/* should ktxnmgrd repeat scanning of atoms?
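++	   Checked under ->guard by ktxnmgrd() after each scan; when set, the
++	   atom list may have been modified while the scan ran with ->guard
++	   released, so the result of that scan may be stale.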
*/ ++ unsigned int rescan:1; ++}; ++ ++extern int reiser4_init_ktxnmgrd(struct super_block *); ++extern void reiser4_done_ktxnmgrd(struct super_block *); ++ ++extern void ktxnmgrd_kick(txn_mgr * mgr); ++extern int is_current_ktxnmgrd(void); ++ ++/* __KTXNMGRD_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/lock.c linux-2.6.33/fs/reiser4/lock.c +--- linux-2.6.33.orig/fs/reiser4/lock.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/lock.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1237 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Traditional deadlock avoidance is achieved by acquiring all locks in a single ++ order. V4 balances the tree from the bottom up, and searches the tree from ++ the top down, and that is really the way we want it, so tradition won't work ++ for us. ++ ++ Instead we have two lock orderings, a high priority lock ordering, and a low ++ priority lock ordering. Each node in the tree has a lock in its znode. ++ ++ Suppose we have a set of processes which lock (R/W) tree nodes. Each process ++ has a set (maybe empty) of already locked nodes ("process locked set"). Each ++ process may have a pending lock request to a node locked by another process. ++ Note: we lock and unlock, but do not transfer locks: it is possible ++ transferring locks instead would save some bus locking.... ++ ++ Deadlock occurs when we have a loop constructed from process locked sets and ++ lock request vectors. ++ ++ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in ++ memory is extended with "znodes" with which we connect nodes with their left ++ and right neighbors using sibling pointers stored in the znodes. When we ++ perform balancing operations we often go from left to right and from right to ++ left. ++ ++ +-P1-+ +-P3-+ ++ |+--+| V1 |+--+| ++ ||N1|| -------> ||N3|| ++ |+--+| |+--+| ++ +----+ +----+ ++ ^ | ++ |V2 |V3 ++ | v ++ +---------P2---------+ ++ |+--+ +--+| ++ ||N2| -------- |N4|| ++ |+--+ +--+| ++ +--------------------+ ++ ++ We solve this by ensuring that only low priority processes lock in top to ++ bottom order and from right to left, and high priority processes lock from ++ bottom to top and left to right. ++ ++ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and ++ kill those damn busy loops. ++ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom ++ stage) cannot be ordered that way. There are no rules what nodes can belong ++ to the atom and what nodes cannot. We cannot define what is right or left ++ direction, what is top or bottom. We can take immediate parent or side ++ neighbor of one node, but nobody guarantees that, say, left neighbor node is ++ not a far right neighbor for other nodes from the same atom. It breaks ++ deadlock avoidance rules and hi-low priority locking cannot be applied for ++ atom locks. ++ ++ How does it help to avoid deadlocks ? ++ ++ Suppose we have a deadlock with n processes. Processes from one priority ++ class never deadlock because they take locks in one consistent ++ order. ++ ++ So, any possible deadlock loop must have low priority as well as high ++ priority processes. There are no other lock priority levels except low and ++ high. 
We know that any deadlock loop contains at least one node locked by a ++ low priority process and requested by a high priority process. If this ++ situation is caught and resolved it is sufficient to avoid deadlocks. ++ ++ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION. ++ ++ The deadlock prevention algorithm is based on comparing ++ priorities of node owners (processes which keep znode locked) and ++ requesters (processes which want to acquire a lock on znode). We ++ implement a scheme where low-priority owners yield locks to ++ high-priority requesters. We created a signal passing system that ++ is used to ask low-priority processes to yield one or more locked ++ znodes. ++ ++ The condition when a znode needs to change its owners is described by the ++ following formula: ++ ++ ############################################# ++ # # ++ # (number of high-priority requesters) > 0 # ++ # AND # ++ # (numbers of high-priority owners) == 0 # ++ # # ++ ############################################# ++ ++ Note that a low-priority process delays node releasing if another ++ high-priority process owns this node. So, slightly more strictly speaking, ++ to have a deadlock capable cycle you must have a loop in which a high ++ priority process is waiting on a low priority process to yield a node, which ++ is slightly different from saying a high priority process is waiting on a ++ node owned by a low priority process. ++ ++ It is enough to avoid deadlocks if we prevent any low-priority process from ++ falling asleep if its locked set contains a node which satisfies the ++ deadlock condition. ++ ++ That condition is implicitly or explicitly checked in all places where new ++ high-priority requests may be added or removed from node request queue or ++ high-priority process takes or releases a lock on node. The main ++ goal of these checks is to never lose the moment when node becomes "has ++ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners ++ at that time. ++ ++ The information about received signals is stored in the per-process ++ structure (lock stack) and analyzed before a low-priority process goes to ++ sleep but after a "fast" attempt to lock a node fails. Any signal wakes ++ sleeping process up and forces him to re-check lock status and received ++ signal info. If "must-yield-this-lock" signals were received the locking ++ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code. ++ ++ V4 LOCKING DRAWBACKS ++ ++ If we have already balanced on one level, and we are propagating our changes ++ upward to a higher level, it could be very messy to surrender all locks on ++ the lower level because we put so much computational work into it, and ++ reverting them to their state before they were locked might be very complex. ++ We also don't want to acquire all locks before performing balancing because ++ that would either be almost as much work as the balancing, or it would be ++ too conservative and lock too much. We want balancing to be done only at ++ high priority. Yet, we might want to go to the left one node and use some ++ of its empty space... So we make one attempt at getting the node to the left ++ using try_lock, and if it fails we do without it, because we didn't really ++ need it, it was only a nice to have. ++ ++ LOCK STRUCTURES DESCRIPTION ++ ++ The following data structures are used in the reiser4 locking ++ implementation: ++ ++ All fields related to long-term locking are stored in znode->lock. ++ ++ The lock stack is a per thread object. 
It owns all znodes locked by the
++   thread. One znode may be read locked by several threads, or write locked
++   several times by a single thread. The special link objects (lock handles)
++   support the n<->m relation between znodes and lock owners.
++
++   <Thread 1>                       <Thread 2>
++
++   +---------+                     +---------+
++   |   LS1   |                     |   LS2   |
++   +---------+                     +---------+
++        ^                               ^
++        |---------------+   +----------+
++        v               v   v          v
++   +---------+      +---------+   +---------+   +---------+
++   |   LH1   |      |   LH2   |   |   LH3   |   |   LH4   |
++   +---------+      +---------+   +---------+   +---------+
++        ^                ^             ^             ^
++        |                +------------+|             |
++        v                v             v             v
++   +---------+      +---------+   +---------+
++   |   Z1    |      |   Z2    |   |   Z3    |
++   +---------+      +---------+   +---------+
++
++   Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
++   picture above shows that lock stack LS1 has a list of two lock handles,
++   LH1 and LH2, and lock stack LS2 has a list with lock handles LH3 and LH4
++   on it. Znode Z1 is locked by only one thread and has only one lock handle
++   LH1 on its list; the situation is similar for Z3, which is locked by
++   thread 2 only. Z2 is locked (for read) twice by different threads, so two
++   lock handles are on its list. Each lock handle represents a single
++   locking relation between a znode and a thread. Locking a znode means
++   establishing such a relation by adding a new lock handle to both lists:
++   the lock stack links all lock handles for all znodes locked by that lock
++   stack, and the znode list groups all lock handles for all lock stacks
++   which locked the znode.
++
++   Yet another relation may exist between a znode and lock owners. If the
++   lock procedure cannot immediately take a lock on an object, it adds the
++   lock owner to the special `requestors' list that belongs to the znode.
++   That list represents a queue of pending lock requests. Because one lock
++   owner may request only one lock object at a time, it is a 1->n relation
++   between lock objects and a lock owner, implemented as described above.
++   Full information (priority, pointers to lock and link objects) about
++   each lock request is stored in the lock owner structure in the `request'
++   field.
++
++   SHORT-TERM LOCKING
++
++   This is a list of primitive operations over lock stacks / lock handles /
++   znodes and locking descriptions for them.
++
++   1. locking/unlocking, which is done by two list insertions/deletions:
++      one to/from the znode's list of lock handles, the other to/from the
++      lock stack's list of lock handles. The first insertion is protected
++      by the znode->lock.guard spinlock. The list owned by the lock stack
++      can be modified only by the thread that owns the lock stack, and
++      nobody else can modify/read it. There is nothing to be protected by a
++      spinlock or anything else.
++
++   2. adding/removing a lock request to/from the znode's requestors list.
++      The rule is that the znode->lock.guard spinlock should be taken for
++      this.
++
++   3. we can traverse the list of lock handles and use references to the
++      lock stacks that locked a given znode if the znode->lock.guard
++      spinlock is taken.
++
++   4. If a lock stack is associated with a znode as a lock requestor or
++      lock owner, its existence is guaranteed by the znode->lock.guard
++      spinlock. Some of its (the lock stack's) fields should be protected
++      from being accessed in parallel by two or more threads. Please look
++      at the lock_stack structure definition for info on how those fields
++      are protected. */
++
++/* Znode lock and capturing intertwining. */
++/* In current implementation we capture formatted nodes before locking
++   them. Take a look at longterm_lock_znode(): the reiser4_try_capture()
++   request precedes the locking request. The longterm_lock_znode() function
++   unconditionally captures the znode before even checking the locking
++   conditions.
++
++   Another variant is to capture the znode after locking it. It was not
++   tested, but at least one deadlock condition is supposed to be there. One
++   thread has locked a znode (Node-1) and calls reiser4_try_capture() for
++   it. reiser4_try_capture() sleeps because the znode's atom has
++   CAPTURE_WAIT state. The second thread is a flushing thread whose current
++   atom is the atom Node-1 belongs to. The second thread wants to lock
++   Node-1 and sleeps because Node-1 is locked by the first thread. The
++   described situation is a deadlock. */
++
++#include "debug.h"
++#include "txnmgr.h"
++#include "znode.h"
++#include "jnode.h"
++#include "tree.h"
++#include "plugin/node/node.h"
++#include "super.h"
++
++#include <linux/spinlock.h>
++
++#if REISER4_DEBUG
++static int request_is_deadlock_safe(znode * , znode_lock_mode,
++				    znode_lock_request);
++#endif
++
++/* Returns a lock owner associated with current thread */
++lock_stack *get_current_lock_stack(void)
++{
++	return &get_current_context()->stack;
++}
++
++/* Wakes up all low priority owners informing them about possible deadlock */
++static void wake_up_all_lopri_owners(znode * node)
++{
++	lock_handle *handle;
++
++	assert_spin_locked(&(node->lock.guard));
++	list_for_each_entry(handle, &node->lock.owners, owners_link) {
++		assert("nikita-1832", handle->node == node);
++		/* count this signal in owner->nr_signaled */
++		if (!handle->signaled) {
++			handle->signaled = 1;
++			atomic_inc(&handle->owner->nr_signaled);
++			/* Wake up a single process */
++			reiser4_wake_up(handle->owner);
++		}
++	}
++}
++
++/* Adds a lock to a lock owner, which means creating a link to the lock and
++   putting the link into the two lists all links are on (the doubly linked
++   list that forms the lock_stack, and the doubly linked list of links
++   attached to a lock).
++*/ ++static inline void ++link_object(lock_handle * handle, lock_stack * owner, znode * node) ++{ ++ assert("jmacd-810", handle->owner == NULL); ++ assert_spin_locked(&(node->lock.guard)); ++ ++ handle->owner = owner; ++ handle->node = node; ++ ++ assert("reiser4-4", ++ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0)); ++ ++ /* add lock handle to the end of lock_stack's list of locks */ ++ list_add_tail(&handle->locks_link, &owner->locks); ++ ON_DEBUG(owner->nr_locks++); ++ reiser4_ctx_gfp_mask_set(); ++ ++ /* add lock handle to the head of znode's list of owners */ ++ list_add(&handle->owners_link, &node->lock.owners); ++ handle->signaled = 0; ++} ++ ++/* Breaks a relation between a lock and its owner */ ++static inline void unlink_object(lock_handle * handle) ++{ ++ assert("zam-354", handle->owner != NULL); ++ assert("nikita-1608", handle->node != NULL); ++ assert_spin_locked(&(handle->node->lock.guard)); ++ assert("nikita-1829", handle->owner == get_current_lock_stack()); ++ assert("reiser4-5", handle->owner->nr_locks > 0); ++ ++ /* remove lock handle from lock_stack's list of locks */ ++ list_del(&handle->locks_link); ++ ON_DEBUG(handle->owner->nr_locks--); ++ reiser4_ctx_gfp_mask_set(); ++ assert("reiser4-6", ++ ergo(list_empty_careful(&handle->owner->locks), ++ handle->owner->nr_locks == 0)); ++ /* remove lock handle from znode's list of owners */ ++ list_del(&handle->owners_link); ++ /* indicates that lock handle is free now */ ++ handle->node = NULL; ++#if REISER4_DEBUG ++ INIT_LIST_HEAD(&handle->locks_link); ++ INIT_LIST_HEAD(&handle->owners_link); ++ handle->owner = NULL; ++#endif ++} ++ ++/* Actually locks an object knowing that we are able to do this */ ++static void lock_object(lock_stack * owner) ++{ ++ struct lock_request *request; ++ znode *node; ++ ++ request = &owner->request; ++ node = request->node; ++ assert_spin_locked(&(node->lock.guard)); ++ if (request->mode == ZNODE_READ_LOCK) { ++ node->lock.nr_readers++; ++ } else { ++ /* check that we don't switched from read to write lock */ ++ assert("nikita-1840", node->lock.nr_readers <= 0); ++ /* We allow recursive locking; a node can be locked several ++ times for write by same process */ ++ node->lock.nr_readers--; ++ } ++ ++ link_object(request->handle, owner, node); ++ ++ if (owner->curpri) ++ node->lock.nr_hipri_owners++; ++} ++ ++/* Check for recursive write locking */ ++static int recursive(lock_stack * owner) ++{ ++ int ret; ++ znode *node; ++ lock_handle *lh; ++ ++ node = owner->request.node; ++ ++ /* Owners list is not empty for a locked node */ ++ assert("zam-314", !list_empty_careful(&node->lock.owners)); ++ assert("nikita-1841", owner == get_current_lock_stack()); ++ assert_spin_locked(&(node->lock.guard)); ++ ++ lh = list_entry(node->lock.owners.next, lock_handle, owners_link); ++ ret = (lh->owner == owner); ++ ++ /* Recursive read locking should be done usual way */ ++ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK); ++ /* mixing of read/write locks is not allowed */ ++ assert("zam-341", !ret || znode_is_wlocked(node)); ++ ++ return ret; ++} ++ ++#if REISER4_DEBUG ++/* Returns true if the lock is held by the calling thread. 
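++   Read and write locks both count: the calling thread's lock stack is
++   walked under its spin lock, looking for a lock handle that points at
++   @node.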
*/ ++int znode_is_any_locked(const znode * node) ++{ ++ lock_handle *handle; ++ lock_stack *stack; ++ int ret; ++ ++ if (!znode_is_locked(node)) ++ return 0; ++ ++ stack = get_current_lock_stack(); ++ ++ spin_lock_stack(stack); ++ ++ ret = 0; ++ ++ list_for_each_entry(handle, &stack->locks, locks_link) { ++ if (handle->node == node) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ spin_unlock_stack(stack); ++ ++ return ret; ++} ++ ++#endif ++ ++/* Returns true if a write lock is held by the calling thread. */ ++int znode_is_write_locked(const znode * node) ++{ ++ lock_stack *stack; ++ lock_handle *handle; ++ ++ assert("jmacd-8765", node != NULL); ++ ++ if (!znode_is_wlocked(node)) ++ return 0; ++ ++ stack = get_current_lock_stack(); ++ ++ /* ++ * When znode is write locked, all owner handles point to the same lock ++ * stack. Get pointer to lock stack from the first lock handle from ++ * znode's owner list ++ */ ++ handle = list_entry(node->lock.owners.next, lock_handle, owners_link); ++ ++ return (handle->owner == stack); ++} ++ ++/* This "deadlock" condition is the essential part of reiser4 locking ++ implementation. This condition is checked explicitly by calling ++ check_deadlock_condition() or implicitly in all places where znode lock ++ state (set of owners and request queue) is changed. Locking code is ++ designed to use this condition to trigger procedure of passing object from ++ low priority owner(s) to high priority one(s). ++ ++ The procedure results in passing an event (setting lock_handle->signaled ++ flag) and counting this event in nr_signaled field of owner's lock stack ++ object and wakeup owner's process. ++*/ ++static inline int check_deadlock_condition(znode * node) ++{ ++ assert_spin_locked(&(node->lock.guard)); ++ return node->lock.nr_hipri_requests > 0 ++ && node->lock.nr_hipri_owners == 0; ++} ++ ++static int check_livelock_condition(znode * node, znode_lock_mode mode) ++{ ++ zlock * lock = &node->lock; ++ ++ return mode == ZNODE_READ_LOCK && ++ lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0; ++} ++ ++/* checks lock/request compatibility */ ++static int can_lock_object(lock_stack * owner) ++{ ++ znode *node = owner->request.node; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ /* See if the node is disconnected. */ ++ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING))) ++ return RETERR(-EINVAL); ++ ++ /* Do not ever try to take a lock if we are going in low priority ++ direction and a node have a high priority request without high ++ priority owners. */ ++ if (unlikely(!owner->curpri && check_deadlock_condition(node))) ++ return RETERR(-E_REPEAT); ++ if (unlikely(owner->curpri && ++ check_livelock_condition(node, owner->request.mode))) ++ return RETERR(-E_REPEAT); ++ if (unlikely(!is_lock_compatible(node, owner->request.mode))) ++ return RETERR(-E_REPEAT); ++ return 0; ++} ++ ++/* Setting of a high priority to the process. It clears "signaled" flags ++ because znode locked by high-priority process can't satisfy our "deadlock ++ condition". */ ++static void set_high_priority(lock_stack * owner) ++{ ++ assert("nikita-1846", owner == get_current_lock_stack()); ++ /* Do nothing if current priority is already high */ ++ if (!owner->curpri) { ++ /* We don't need locking for owner->locks list, because, this ++ * function is only called with the lock stack of the current ++ * thread, and no other thread can play with owner->locks list ++ * and/or change ->node pointers of lock handles in this list. ++ * ++ * (Interrupts also are not involved.) 
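++		 *
++		 * For every lock held, the znode's nr_hipri_owners counter
++		 * is incremented and any pending "yield" signal on the lock
++		 * handle becomes stale, so it is cleared.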
++ */ ++ lock_handle *item = list_entry(owner->locks.next, lock_handle, ++ locks_link); ++ while (&owner->locks != &item->locks_link) { ++ znode *node = item->node; ++ ++ spin_lock_zlock(&node->lock); ++ ++ node->lock.nr_hipri_owners++; ++ ++ /* we can safely set signaled to zero, because ++ previous statement (nr_hipri_owners ++) guarantees ++ that signaled will be never set again. */ ++ item->signaled = 0; ++ spin_unlock_zlock(&node->lock); ++ ++ item = list_entry(item->locks_link.next, lock_handle, ++ locks_link); ++ } ++ owner->curpri = 1; ++ atomic_set(&owner->nr_signaled, 0); ++ } ++} ++ ++/* Sets a low priority to the process. */ ++static void set_low_priority(lock_stack * owner) ++{ ++ assert("nikita-3075", owner == get_current_lock_stack()); ++ /* Do nothing if current priority is already low */ ++ if (owner->curpri) { ++ /* scan all locks (lock handles) held by @owner, which is ++ actually current thread, and check whether we are reaching ++ deadlock possibility anywhere. ++ */ ++ lock_handle *handle = list_entry(owner->locks.next, lock_handle, ++ locks_link); ++ while (&owner->locks != &handle->locks_link) { ++ znode *node = handle->node; ++ spin_lock_zlock(&node->lock); ++ /* this thread just was hipri owner of @node, so ++ nr_hipri_owners has to be greater than zero. */ ++ assert("nikita-1835", node->lock.nr_hipri_owners > 0); ++ node->lock.nr_hipri_owners--; ++ /* If we have deadlock condition, adjust a nr_signaled ++ field. It is enough to set "signaled" flag only for ++ current process, other low-pri owners will be ++ signaled and waken up after current process unlocks ++ this object and any high-priority requestor takes ++ control. */ ++ if (check_deadlock_condition(node) ++ && !handle->signaled) { ++ handle->signaled = 1; ++ atomic_inc(&owner->nr_signaled); ++ } ++ spin_unlock_zlock(&node->lock); ++ handle = list_entry(handle->locks_link.next, ++ lock_handle, locks_link); ++ } ++ owner->curpri = 0; ++ } ++} ++ ++static void remove_lock_request(lock_stack * requestor) ++{ ++ zlock * lock = &requestor->request.node->lock; ++ ++ if (requestor->curpri) { ++ assert("nikita-1838", lock->nr_hipri_requests > 0); ++ lock->nr_hipri_requests--; ++ if (requestor->request.mode == ZNODE_WRITE_LOCK) ++ lock->nr_hipri_write_requests--; ++ } ++ list_del(&requestor->requestors_link); ++} ++ ++static void invalidate_all_lock_requests(znode * node) ++{ ++ lock_stack *requestor, *tmp; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, ++ requestors_link) { ++ remove_lock_request(requestor); ++ requestor->request.ret_code = -EINVAL; ++ reiser4_wake_up(requestor); ++ requestor->request.mode = ZNODE_NO_LOCK; ++ } ++} ++ ++static void dispatch_lock_requests(znode * node) ++{ ++ lock_stack *requestor, *tmp; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, ++ requestors_link) { ++ if (znode_is_write_locked(node)) ++ break; ++ if (!can_lock_object(requestor)) { ++ lock_object(requestor); ++ remove_lock_request(requestor); ++ requestor->request.ret_code = 0; ++ reiser4_wake_up(requestor); ++ requestor->request.mode = ZNODE_NO_LOCK; ++ } ++ } ++} ++ ++/* release long-term lock, acquired by longterm_lock_znode() */ ++void longterm_unlock_znode(lock_handle * handle) ++{ ++ znode *node = handle->node; ++ lock_stack *oldowner = handle->owner; ++ int hipri; ++ int readers; ++ int rdelta; ++ int youdie; ++ ++ /* ++ * this is time-critical and highly optimized code. 
Modify carefully. ++ */ ++ ++ assert("jmacd-1021", handle != NULL); ++ assert("jmacd-1022", handle->owner != NULL); ++ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode)); ++ ++ assert("zam-130", oldowner == get_current_lock_stack()); ++ ++ LOCK_CNT_DEC(long_term_locked_znode); ++ ++ /* ++ * to minimize amount of operations performed under lock, pre-compute ++ * all variables used within critical section. This makes code ++ * obscure. ++ */ ++ ++ /* was this lock of hi or lo priority */ ++ hipri = oldowner->curpri ? 1 : 0; ++ /* number of readers */ ++ readers = node->lock.nr_readers; ++ /* +1 if write lock, -1 if read lock */ ++ rdelta = (readers > 0) ? -1 : +1; ++ /* true if node is to die and write lock is released */ ++ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0); ++ ++ spin_lock_zlock(&node->lock); ++ ++ assert("zam-101", znode_is_locked(node)); ++ ++ /* Adjust a number of high priority owners of this lock */ ++ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri); ++ node->lock.nr_hipri_owners -= hipri; ++ ++ /* Handle znode deallocation on last write-lock release. */ ++ if (znode_is_wlocked_once(node)) { ++ if (youdie) { ++ forget_znode(handle); ++ assert("nikita-2191", znode_invariant(node)); ++ zput(node); ++ return; ++ } ++ } ++ ++ if (handle->signaled) ++ atomic_dec(&oldowner->nr_signaled); ++ ++ /* Unlocking means owner<->object link deletion */ ++ unlink_object(handle); ++ ++ /* This is enough to be sure whether an object is completely ++ unlocked. */ ++ node->lock.nr_readers += rdelta; ++ ++ /* If the node is locked it must have an owners list. Likewise, if ++ the node is unlocked it must have an empty owners list. */ ++ assert("zam-319", equi(znode_is_locked(node), ++ !list_empty_careful(&node->lock.owners))); ++ ++#if REISER4_DEBUG ++ if (!znode_is_locked(node)) ++ ++node->times_locked; ++#endif ++ ++ /* If there are pending lock requests we wake up a requestor */ ++ if (!znode_is_wlocked(node)) ++ dispatch_lock_requests(node); ++ if (check_deadlock_condition(node)) ++ wake_up_all_lopri_owners(node); ++ spin_unlock_zlock(&node->lock); ++ ++ /* minus one reference from handle->node */ ++ assert("nikita-2190", znode_invariant(node)); ++ ON_DEBUG(check_lock_data()); ++ ON_DEBUG(check_lock_node_data(node)); ++ zput(node); ++} ++ ++/* final portion of longterm-lock */ ++static int ++lock_tail(lock_stack * owner, int ok, znode_lock_mode mode) ++{ ++ znode *node = owner->request.node; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ ++ /* If we broke with (ok == 0) it means we can_lock, now do it. */ ++ if (ok == 0) { ++ lock_object(owner); ++ owner->request.mode = 0; ++ /* count a reference from lockhandle->node ++ ++ znode was already referenced at the entry to this function, ++ hence taking spin-lock here is not necessary (see comment ++ in the zref()). ++ */ ++ zref(node); ++ ++ LOCK_CNT_INC(long_term_locked_znode); ++ } ++ spin_unlock_zlock(&node->lock); ++ ON_DEBUG(check_lock_data()); ++ ON_DEBUG(check_lock_node_data(node)); ++ return ok; ++} ++ ++/* ++ * version of longterm_znode_lock() optimized for the most common case: read ++ * lock without any special flags. This is the kind of lock that any tree ++ * traversal takes on the root node of the tree, which is very frequent. 
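++ *
++ * Returns 0 on success, a negative error code on failure, and 1 when the
++ * fast path does not apply and the caller must fall back to the general
++ * longterm_lock_znode() path.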
++ */ ++static int longterm_lock_tryfast(lock_stack * owner) ++{ ++ int result; ++ znode *node; ++ zlock *lock; ++ ++ node = owner->request.node; ++ lock = &node->lock; ++ ++ assert("nikita-3340", reiser4_schedulable()); ++ assert("nikita-3341", request_is_deadlock_safe(node, ++ ZNODE_READ_LOCK, ++ ZNODE_LOCK_LOPRI)); ++ spin_lock_zlock(lock); ++ result = can_lock_object(owner); ++ spin_unlock_zlock(lock); ++ ++ if (likely(result != -EINVAL)) { ++ spin_lock_znode(node); ++ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0); ++ spin_unlock_znode(node); ++ spin_lock_zlock(lock); ++ if (unlikely(result != 0)) { ++ owner->request.mode = 0; ++ } else { ++ result = can_lock_object(owner); ++ if (unlikely(result == -E_REPEAT)) { ++ /* fall back to longterm_lock_znode() */ ++ spin_unlock_zlock(lock); ++ return 1; ++ } ++ } ++ return lock_tail(owner, result, ZNODE_READ_LOCK); ++ } else ++ return 1; ++} ++ ++/* locks given lock object */ ++int longterm_lock_znode( ++ /* local link object (allocated by lock owner ++ * thread, usually on its own stack) */ ++ lock_handle * handle, ++ /* znode we want to lock. */ ++ znode * node, ++ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */ ++ znode_lock_mode mode, ++ /* {0, -EINVAL, -E_DEADLOCK}, see return codes ++ description. */ ++ znode_lock_request request) { ++ int ret; ++ int hipri = (request & ZNODE_LOCK_HIPRI) != 0; ++ int non_blocking = 0; ++ int has_atom; ++ txn_capture cap_flags; ++ zlock *lock; ++ txn_handle *txnh; ++ tree_level level; ++ ++ /* Get current process context */ ++ lock_stack *owner = get_current_lock_stack(); ++ ++ /* Check that the lock handle is initialized and isn't already being ++ * used. */ ++ assert("jmacd-808", handle->owner == NULL); ++ assert("nikita-3026", reiser4_schedulable()); ++ assert("nikita-3219", request_is_deadlock_safe(node, mode, request)); ++ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0); ++ /* long term locks are not allowed in the VM contexts (->writepage(), ++ * prune_{d,i}cache()). ++ * ++ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode ++ * bug caused by d_splice_alias() only working for directories. ++ */ ++ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0)); ++ assert("zam-1055", mode != ZNODE_NO_LOCK); ++ ++ cap_flags = 0; ++ if (request & ZNODE_LOCK_NONBLOCK) { ++ cap_flags |= TXN_CAPTURE_NONBLOCKING; ++ non_blocking = 1; ++ } ++ ++ if (request & ZNODE_LOCK_DONT_FUSE) ++ cap_flags |= TXN_CAPTURE_DONT_FUSE; ++ ++ /* If we are changing our process priority we must adjust a number ++ of high priority owners for each znode that we already lock */ ++ if (hipri) { ++ set_high_priority(owner); ++ } else { ++ set_low_priority(owner); ++ } ++ ++ level = znode_get_level(node); ++ ++ /* Fill request structure with our values. */ ++ owner->request.mode = mode; ++ owner->request.handle = handle; ++ owner->request.node = node; ++ ++ txnh = get_current_context()->trans; ++ lock = &node->lock; ++ ++ if (mode == ZNODE_READ_LOCK && request == 0) { ++ ret = longterm_lock_tryfast(owner); ++ if (ret <= 0) ++ return ret; ++ } ++ ++ has_atom = (txnh->atom != NULL); ++ ++ /* Synchronize on node's zlock guard lock. */ ++ spin_lock_zlock(lock); ++ ++ if (znode_is_locked(node) && ++ mode == ZNODE_WRITE_LOCK && recursive(owner)) ++ return lock_tail(owner, 0, mode); ++ ++ for (;;) { ++ /* Check the lock's availability: if it is unavaiable we get ++ E_REPEAT, 0 indicates "can_lock", otherwise the node is ++ invalid. 
*/ ++ ret = can_lock_object(owner); ++ ++ if (unlikely(ret == -EINVAL)) { ++ /* @node is dying. Leave it alone. */ ++ break; ++ } ++ ++ if (unlikely(ret == -E_REPEAT && non_blocking)) { ++ /* either locking of @node by the current thread will ++ * lead to the deadlock, or lock modes are ++ * incompatible. */ ++ break; ++ } ++ ++ assert("nikita-1844", (ret == 0) ++ || ((ret == -E_REPEAT) && !non_blocking)); ++ /* If we can get the lock... Try to capture first before ++ taking the lock. */ ++ ++ /* first handle commonest case where node and txnh are already ++ * in the same atom. */ ++ /* safe to do without taking locks, because: ++ * ++ * 1. read of aligned word is atomic with respect to writes to ++ * this word ++ * ++ * 2. false negatives are handled in reiser4_try_capture(). ++ * ++ * 3. false positives are impossible. ++ * ++ * PROOF: left as an exercise to the curious reader. ++ * ++ * Just kidding. Here is one: ++ * ++ * At the time T0 txnh->atom is stored in txnh_atom. ++ * ++ * At the time T1 node->atom is stored in node_atom. ++ * ++ * At the time T2 we observe that ++ * ++ * txnh_atom != NULL && node_atom == txnh_atom. ++ * ++ * Imagine that at this moment we acquire node and txnh spin ++ * lock in this order. Suppose that under spin lock we have ++ * ++ * node->atom != txnh->atom, (S1) ++ * ++ * at the time T3. ++ * ++ * txnh->atom != NULL still, because txnh is open by the ++ * current thread. ++ * ++ * Suppose node->atom == NULL, that is, node was un-captured ++ * between T1, and T3. But un-capturing of formatted node is ++ * always preceded by the call to reiser4_invalidate_lock(), ++ * which marks znode as JNODE_IS_DYING under zlock spin ++ * lock. Contradiction, because can_lock_object() above checks ++ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3. ++ * ++ * Suppose that node->atom != node_atom, that is, atom, node ++ * belongs to was fused into another atom: node_atom was fused ++ * into node->atom. Atom of txnh was equal to node_atom at T2, ++ * which means that under spin lock, txnh->atom == node->atom, ++ * because txnh->atom can only follow fusion ++ * chain. Contradicts S1. ++ * ++ * The same for hypothesis txnh->atom != txnh_atom. Hence, ++ * node->atom == node_atom == txnh_atom == txnh->atom. Again ++ * contradicts S1. Hence S1 is false. QED. ++ * ++ */ ++ ++ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) { ++ ; ++ } else { ++ /* ++ * unlock zlock spin lock here. It is possible for ++ * longterm_unlock_znode() to sneak in here, but there ++ * is no harm: reiser4_invalidate_lock() will mark znode ++ * as JNODE_IS_DYING and this will be noted by ++ * can_lock_object() below. ++ */ ++ spin_unlock_zlock(lock); ++ spin_lock_znode(node); ++ ret = reiser4_try_capture(ZJNODE(node), mode, ++ cap_flags); ++ spin_unlock_znode(node); ++ spin_lock_zlock(lock); ++ if (unlikely(ret != 0)) { ++ /* In the failure case, the txnmgr releases ++ the znode's lock (or in some cases, it was ++ released a while ago). There's no need to ++ reacquire it so we should return here, ++ avoid releasing the lock. */ ++ owner->request.mode = 0; ++ break; ++ } ++ ++ /* Check the lock's availability again -- this is ++ because under some circumstances the capture code ++ has to release and reacquire the znode spinlock. */ ++ ret = can_lock_object(owner); ++ } ++ ++ /* This time, a return of (ret == 0) means we can lock, so we ++ should break out of the loop. */ ++ if (likely(ret != -E_REPEAT || non_blocking)) ++ break; ++ ++ /* Lock is unavailable, we have to wait. 
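++		   The protocol below: reiser4_prepare_to_sleep() fails with
++		   -E_DEADLOCK if this thread was signaled to yield its locks;
++		   otherwise the request is queued on ->requestors (bumping
++		   the high priority counters and, if necessary, waking low
++		   priority owners), the zlock is dropped, and we sleep until
++		   dispatch_lock_requests() grants the request or we are woken
++		   up to retry.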
*/ ++ ret = reiser4_prepare_to_sleep(owner); ++ if (unlikely(ret != 0)) ++ break; ++ ++ assert_spin_locked(&(node->lock.guard)); ++ if (hipri) { ++ /* If we are going in high priority direction then ++ increase high priority requests counter for the ++ node */ ++ lock->nr_hipri_requests++; ++ if (mode == ZNODE_WRITE_LOCK) ++ lock->nr_hipri_write_requests++; ++ /* If there are no high priority owners for a node, ++ then immediately wake up low priority owners, so ++ they can detect possible deadlock */ ++ if (lock->nr_hipri_owners == 0) ++ wake_up_all_lopri_owners(node); ++ } ++ list_add_tail(&owner->requestors_link, &lock->requestors); ++ ++ /* Ok, here we have prepared a lock request, so unlock ++ a znode ... */ ++ spin_unlock_zlock(lock); ++ /* ... and sleep */ ++ reiser4_go_to_sleep(owner); ++ if (owner->request.mode == ZNODE_NO_LOCK) ++ goto request_is_done; ++ spin_lock_zlock(lock); ++ if (owner->request.mode == ZNODE_NO_LOCK) { ++ spin_unlock_zlock(lock); ++request_is_done: ++ if (owner->request.ret_code == 0) { ++ LOCK_CNT_INC(long_term_locked_znode); ++ zref(node); ++ } ++ return owner->request.ret_code; ++ } ++ remove_lock_request(owner); ++ } ++ ++ return lock_tail(owner, ret, mode); ++} ++ ++/* lock object invalidation means changing of lock object state to `INVALID' ++ and waiting for all other processes to cancel theirs lock requests. */ ++void reiser4_invalidate_lock(lock_handle * handle /* path to lock ++ * owner and lock ++ * object is being ++ * invalidated. */ ) ++{ ++ znode *node = handle->node; ++ lock_stack *owner = handle->owner; ++ ++ assert("zam-325", owner == get_current_lock_stack()); ++ assert("zam-103", znode_is_write_locked(node)); ++ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED)); ++ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED)); ++ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert("nikita-3097", znode_is_wlocked_once(node)); ++ assert_spin_locked(&(node->lock.guard)); ++ ++ if (handle->signaled) ++ atomic_dec(&owner->nr_signaled); ++ ++ ZF_SET(node, JNODE_IS_DYING); ++ unlink_object(handle); ++ node->lock.nr_readers = 0; ++ ++ invalidate_all_lock_requests(node); ++ spin_unlock_zlock(&node->lock); ++} ++ ++/* Initializes lock_stack. */ ++void init_lock_stack(lock_stack * owner /* pointer to ++ * allocated ++ * structure. */ ) ++{ ++ INIT_LIST_HEAD(&owner->locks); ++ INIT_LIST_HEAD(&owner->requestors_link); ++ spin_lock_init(&owner->sguard); ++ owner->curpri = 1; ++ init_waitqueue_head(&owner->wait); ++} ++ ++/* Initializes lock object. */ ++void reiser4_init_lock(zlock * lock /* pointer on allocated ++ * uninitialized lock object ++ * structure. */ ) ++{ ++ memset(lock, 0, sizeof(zlock)); ++ spin_lock_init(&lock->guard); ++ INIT_LIST_HEAD(&lock->requestors); ++ INIT_LIST_HEAD(&lock->owners); ++} ++ ++/* Transfer a lock handle (presumably so that variables can be moved between ++ stack and heap locations). */ ++static void ++move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old) ++{ ++ znode *node = old->node; ++ lock_stack *owner = old->owner; ++ int signaled; ++ ++ /* locks_list, modified by link_object() is not protected by ++ anything. This is valid because only current thread ever modifies ++ locks_list of its lock_stack. 
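++
++	   In the copy case (unlink_old == 0) the znode gains one more lock
++	   of the same mode and one more reference, so the lock counters and
++	   reference count are adjusted below as if a second lock had been
++	   taken.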
++ */ ++ assert("nikita-1827", owner == get_current_lock_stack()); ++ assert("nikita-1831", new->owner == NULL); ++ ++ spin_lock_zlock(&node->lock); ++ ++ signaled = old->signaled; ++ if (unlink_old) { ++ unlink_object(old); ++ } else { ++ if (node->lock.nr_readers > 0) { ++ node->lock.nr_readers += 1; ++ } else { ++ node->lock.nr_readers -= 1; ++ } ++ if (signaled) ++ atomic_inc(&owner->nr_signaled); ++ if (owner->curpri) ++ node->lock.nr_hipri_owners += 1; ++ LOCK_CNT_INC(long_term_locked_znode); ++ ++ zref(node); ++ } ++ link_object(new, owner, node); ++ new->signaled = signaled; ++ ++ spin_unlock_zlock(&node->lock); ++} ++ ++void move_lh(lock_handle * new, lock_handle * old) ++{ ++ move_lh_internal(new, old, /*unlink_old */ 1); ++} ++ ++void copy_lh(lock_handle * new, lock_handle * old) ++{ ++ move_lh_internal(new, old, /*unlink_old */ 0); ++} ++ ++/* after getting -E_DEADLOCK we unlock znodes until this function returns false ++ */ ++int reiser4_check_deadlock(void) ++{ ++ lock_stack *owner = get_current_lock_stack(); ++ return atomic_read(&owner->nr_signaled) != 0; ++} ++ ++/* Before going to sleep we re-check "release lock" requests which might come ++ from threads with hi-pri lock priorities. */ ++int reiser4_prepare_to_sleep(lock_stack * owner) ++{ ++ assert("nikita-1847", owner == get_current_lock_stack()); ++ ++ /* We return -E_DEADLOCK if one or more "give me the lock" messages are ++ * counted in nr_signaled */ ++ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) { ++ assert("zam-959", !owner->curpri); ++ return RETERR(-E_DEADLOCK); ++ } ++ return 0; ++} ++ ++/* Wakes up a single thread */ ++void __reiser4_wake_up(lock_stack * owner) ++{ ++ atomic_set(&owner->wakeup, 1); ++ wake_up(&owner->wait); ++} ++ ++/* Puts a thread to sleep */ ++void reiser4_go_to_sleep(lock_stack * owner) ++{ ++ /* Well, we might sleep here, so holding of any spinlocks is no-no */ ++ assert("nikita-3027", reiser4_schedulable()); ++ ++ wait_event(owner->wait, atomic_read(&owner->wakeup)); ++ atomic_set(&owner->wakeup, 0); ++} ++ ++int lock_stack_isclean(lock_stack * owner) ++{ ++ if (list_empty_careful(&owner->locks)) { ++ assert("zam-353", atomic_read(&owner->nr_signaled) == 0); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++#if REISER4_DEBUG ++ ++/* ++ * debugging functions ++ */ ++ ++static void list_check(struct list_head *head) ++{ ++ struct list_head *pos; ++ ++ list_for_each(pos, head) ++ assert("", (pos->prev != NULL && pos->next != NULL && ++ pos->prev->next == pos && pos->next->prev == pos)); ++} ++ ++/* check consistency of locking data-structures hanging of the @stack */ ++static void check_lock_stack(lock_stack * stack) ++{ ++ spin_lock_stack(stack); ++ /* check that stack->locks is not corrupted */ ++ list_check(&stack->locks); ++ spin_unlock_stack(stack); ++} ++ ++/* check consistency of locking data structures */ ++void check_lock_data(void) ++{ ++ check_lock_stack(&get_current_context()->stack); ++} ++ ++/* check consistency of locking data structures for @node */ ++void check_lock_node_data(znode * node) ++{ ++ spin_lock_zlock(&node->lock); ++ list_check(&node->lock.owners); ++ list_check(&node->lock.requestors); ++ spin_unlock_zlock(&node->lock); ++} ++ ++/* check that given lock request is dead lock safe. This check is, of course, ++ * not exhaustive. 
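++ * In particular, it only rejects a blocking high-priority request issued
++ * while the current thread already holds a lock on a higher tree level,
++ * which would violate the bottom-up ordering required of high-priority
++ * processes.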
*/ ++static int ++request_is_deadlock_safe(znode * node, znode_lock_mode mode, ++ znode_lock_request request) ++{ ++ lock_stack *owner; ++ ++ owner = get_current_lock_stack(); ++ /* ++ * check that hipri lock request is not issued when there are locked ++ * nodes at the higher levels. ++ */ ++ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) && ++ znode_get_level(node) != 0) { ++ lock_handle *item; ++ ++ list_for_each_entry(item, &owner->locks, locks_link) { ++ znode *other; ++ ++ other = item->node; ++ ++ if (znode_get_level(other) == 0) ++ continue; ++ if (znode_get_level(other) > znode_get_level(node)) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++#endif ++ ++/* return pointer to static storage with name of lock_mode. For ++ debugging */ ++const char *lock_mode_name(znode_lock_mode lock/* lock mode to get name of */) ++{ ++ if (lock == ZNODE_READ_LOCK) ++ return "read"; ++ else if (lock == ZNODE_WRITE_LOCK) ++ return "write"; ++ else { ++ static char buf[30]; ++ ++ sprintf(buf, "unknown: %i", lock); ++ return buf; ++ } ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 79 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/lock.h linux-2.6.33/fs/reiser4/lock.h +--- linux-2.6.33.orig/fs/reiser4/lock.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/lock.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,250 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Long term locking data structures. See lock.c for details. */ ++ ++#ifndef __LOCK_H__ ++#define __LOCK_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/node/node.h" ++#include "txnmgr.h" ++#include "readahead.h" ++ ++#include <linux/types.h> ++#include <linux/spinlock.h> ++#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */ ++#include <asm/atomic.h> ++#include <linux/wait.h> ++ ++/* Per-znode lock object */ ++struct zlock { ++ spinlock_t guard; ++ /* The number of readers if positive; the number of recursively taken ++ write locks if negative. Protected by zlock spin lock. 
*/ ++ int nr_readers; ++ /* A number of processes (lock_stacks) that have this object ++ locked with high priority */ ++ unsigned nr_hipri_owners; ++ /* A number of attempts to lock znode in high priority direction */ ++ unsigned nr_hipri_requests; ++ /* A linked list of lock_handle objects that contains pointers ++ for all lock_stacks which have this lock object locked */ ++ unsigned nr_hipri_write_requests; ++ struct list_head owners; ++ /* A linked list of lock_stacks that wait for this lock */ ++ struct list_head requestors; ++}; ++ ++static inline void spin_lock_zlock(zlock *lock) ++{ ++ /* check that zlock is not locked */ ++ assert("", LOCK_CNT_NIL(spin_locked_zlock)); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", LOCK_CNT_NIL(spin_locked_stack)); ++ ++ spin_lock(&lock->guard); ++ ++ LOCK_CNT_INC(spin_locked_zlock); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void spin_unlock_zlock(zlock *lock) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_zlock); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&lock->guard); ++} ++ ++#define lock_is_locked(lock) ((lock)->nr_readers != 0) ++#define lock_is_rlocked(lock) ((lock)->nr_readers > 0) ++#define lock_is_wlocked(lock) ((lock)->nr_readers < 0) ++#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1) ++#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0) ++#define lock_mode_compatible(lock, mode) \ ++ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \ ++ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock))) ++ ++/* Since we have R/W znode locks we need additional bidirectional `link' ++ objects to implement n<->m relationship between lock owners and lock ++ objects. We call them `lock handles'. ++ ++ Locking: see lock.c/"SHORT-TERM LOCKING" ++*/ ++struct lock_handle { ++ /* This flag indicates that a signal to yield a lock was passed to ++ lock owner and counted in owner->nr_signalled ++ ++ Locking: this is accessed under spin lock on ->node. ++ */ ++ int signaled; ++ /* A link to owner of a lock */ ++ lock_stack *owner; ++ /* A link to znode locked */ ++ znode *node; ++ /* A list of all locks for a process */ ++ struct list_head locks_link; ++ /* A list of all owners for a znode */ ++ struct list_head owners_link; ++}; ++ ++struct lock_request { ++ /* A pointer to uninitialized link object */ ++ lock_handle *handle; ++ /* A pointer to the object we want to lock */ ++ znode *node; ++ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */ ++ znode_lock_mode mode; ++ /* how dispatch_lock_requests() returns lock request result code */ ++ int ret_code; ++}; ++ ++/* A lock stack structure for accumulating locks owned by a process */ ++struct lock_stack { ++ /* A guard lock protecting a lock stack */ ++ spinlock_t sguard; ++ /* number of znodes which were requested by high priority processes */ ++ atomic_t nr_signaled; ++ /* Current priority of a process ++ ++ This is only accessed by the current thread and thus requires no ++ locking. ++ */ ++ int curpri; ++ /* A list of all locks owned by this process. Elements can be added to ++ * this list only by the current thread. ->node pointers in this list ++ * can be only changed by the current thread. */ ++ struct list_head locks; ++ /* When lock_stack waits for the lock, it puts itself on double-linked ++ requestors list of that lock */ ++ struct list_head requestors_link; ++ /* Current lock request info. 
++ ++ This is only accessed by the current thread and thus requires no ++ locking. ++ */ ++ struct lock_request request; ++ /* the following two fields are the lock stack's ++ * synchronization object to use with the standard linux/wait.h ++ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for ++ * usage details. */ ++ wait_queue_head_t wait; ++ atomic_t wakeup; ++#if REISER4_DEBUG ++ int nr_locks; /* number of lock handles in the above list */ ++#endif ++}; ++ ++/* ++ User-visible znode locking functions ++*/ ++ ++extern int longterm_lock_znode(lock_handle * handle, ++ znode * node, ++ znode_lock_mode mode, ++ znode_lock_request request); ++ ++extern void longterm_unlock_znode(lock_handle * handle); ++ ++extern int reiser4_check_deadlock(void); ++ ++extern lock_stack *get_current_lock_stack(void); ++ ++extern void init_lock_stack(lock_stack * owner); ++extern void reiser4_init_lock(zlock * lock); ++ ++static inline void init_lh(lock_handle *lh) ++{ ++#if REISER4_DEBUG ++ memset(lh, 0, sizeof *lh); ++ INIT_LIST_HEAD(&lh->locks_link); ++ INIT_LIST_HEAD(&lh->owners_link); ++#else ++ lh->node = NULL; ++#endif ++} ++ ++static inline void done_lh(lock_handle *lh) ++{ ++ assert("zam-342", lh != NULL); ++ if (lh->node != NULL) ++ longterm_unlock_znode(lh); ++} ++ ++extern void move_lh(lock_handle * new, lock_handle * old); ++extern void copy_lh(lock_handle * new, lock_handle * old); ++ ++extern int reiser4_prepare_to_sleep(lock_stack * owner); ++extern void reiser4_go_to_sleep(lock_stack * owner); ++extern void __reiser4_wake_up(lock_stack * owner); ++ ++extern int lock_stack_isclean(lock_stack * owner); ++ ++/* zlock object state check macros: only used in assertions. Both forms imply ++ that the lock is held by the current thread. */ ++extern int znode_is_write_locked(const znode *); ++extern void reiser4_invalidate_lock(lock_handle *); ++ ++/* lock ordering is: first take zlock spin lock, then lock stack spin lock */ ++#define spin_ordering_pred_stack(stack) \ ++ (LOCK_CNT_NIL(spin_locked_stack) && \ ++ LOCK_CNT_NIL(spin_locked_txnmgr) && \ ++ LOCK_CNT_NIL(spin_locked_inode) && \ ++ LOCK_CNT_NIL(rw_locked_cbk_cache) && \ ++ LOCK_CNT_NIL(spin_locked_super_eflush)) ++ ++static inline void spin_lock_stack(lock_stack *stack) ++{ ++ assert("", spin_ordering_pred_stack(stack)); ++ spin_lock(&(stack->sguard)); ++ LOCK_CNT_INC(spin_locked_stack); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void spin_unlock_stack(lock_stack *stack) ++{ ++ assert_spin_locked(&(stack->sguard)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ LOCK_CNT_DEC(spin_locked_stack); ++ LOCK_CNT_DEC(spin_locked); ++ spin_unlock(&(stack->sguard)); ++} ++ ++static inline void reiser4_wake_up(lock_stack * owner) ++{ ++ spin_lock_stack(owner); ++ __reiser4_wake_up(owner); ++ spin_unlock_stack(owner); ++} ++ ++const char *lock_mode_name(znode_lock_mode lock); ++ ++#if REISER4_DEBUG ++extern void check_lock_data(void); ++extern void check_lock_node_data(znode * node); ++#else ++#define check_lock_data() noop ++#define check_lock_node_data() noop ++#endif ++ ++/* __LOCK_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/Makefile linux-2.6.33/fs/reiser4/Makefile +--- linux-2.6.33.orig/fs/reiser4/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,98 @@ ++# ++# reiser4/Makefile ++# ++ ++obj-$(CONFIG_REISER4_FS) += reiser4.o ++ ++reiser4-y := \ ++ debug.o \ ++ jnode.o \ ++ znode.o \ ++ key.o \ ++ pool.o \ ++ tree_mod.o \ ++ estimate.o \ ++ carry.o \ ++ carry_ops.o \ ++ lock.o \ ++ tree.o \ ++ context.o \ ++ tap.o \ ++ coord.o \ ++ block_alloc.o \ ++ txnmgr.o \ ++ kassign.o \ ++ flush.o \ ++ wander.o \ ++ eottl.o \ ++ search.o \ ++ page_cache.o \ ++ seal.o \ ++ dscale.o \ ++ flush_queue.o \ ++ ktxnmgrd.o \ ++ blocknrset.o \ ++ super.o \ ++ super_ops.o \ ++ fsdata.o \ ++ export_ops.o \ ++ oid.o \ ++ tree_walk.o \ ++ inode.o \ ++ vfs_ops.o \ ++ as_ops.o \ ++ entd.o\ ++ readahead.o \ ++ status_flags.o \ ++ init_super.o \ ++ safe_link.o \ ++ \ ++ plugin/plugin.o \ ++ plugin/plugin_set.o \ ++ plugin/node/node.o \ ++ plugin/object.o \ ++ plugin/cluster.o \ ++ plugin/inode_ops.o \ ++ plugin/inode_ops_rename.o \ ++ plugin/file_ops.o \ ++ plugin/file_ops_readdir.o \ ++ plugin/file_plugin_common.o \ ++ plugin/file/file.o \ ++ plugin/file/tail_conversion.o \ ++ plugin/file/file_conversion.o \ ++ plugin/file/symlink.o \ ++ plugin/file/cryptcompress.o \ ++ plugin/dir_plugin_common.o \ ++ plugin/dir/hashed_dir.o \ ++ plugin/dir/seekable_dir.o \ ++ plugin/node/node40.o \ ++ \ ++ plugin/crypto/cipher.o \ ++ plugin/crypto/digest.o \ ++ \ ++ plugin/compress/compress.o \ ++ plugin/compress/compress_mode.o \ ++ \ ++ plugin/item/static_stat.o \ ++ plugin/item/sde.o \ ++ plugin/item/cde.o \ ++ plugin/item/blackbox.o \ ++ plugin/item/internal.o \ ++ plugin/item/tail.o \ ++ plugin/item/ctail.o \ ++ plugin/item/extent.o \ ++ plugin/item/extent_item_ops.o \ ++ plugin/item/extent_file_ops.o \ ++ plugin/item/extent_flush_ops.o \ ++ \ ++ plugin/hash.o \ ++ plugin/fibration.o \ ++ plugin/tail_policy.o \ ++ plugin/item/item.o \ ++ \ ++ plugin/security/perm.o \ ++ plugin/space/bitmap.o \ ++ \ ++ plugin/disk_format/disk_format40.o \ ++ plugin/disk_format/disk_format.o ++ +diff -urN linux-2.6.33.orig/fs/reiser4/oid.c linux-2.6.33/fs/reiser4/oid.c +--- linux-2.6.33.orig/fs/reiser4/oid.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/oid.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,141 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "debug.h" ++#include "super.h" ++#include "txnmgr.h" ++ ++/* we used to have oid allocation plugin. It was removed because it ++ was recognized as providing unneeded level of abstraction. If one ++ ever will find it useful - look at yet_unneeded_abstractions/oid ++*/ ++ ++/* ++ * initialize in-memory data for oid allocator at @super. @nr_files and @next ++ * are provided by disk format plugin that reads them from the disk during ++ * mount. ++ */ ++int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(super); ++ ++ sbinfo->next_to_use = next; ++ sbinfo->oids_in_use = nr_files; ++ return 0; ++} ++ ++/* ++ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator ++ * runs out of oids. 
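++ *
++ * For illustration only, a sketch of the intended calling convention based
++ * on the comments in this file (error handling elided):
++ *
++ *	oid = oid_allocate(super);
++ *	if (oid == ABSOLUTE_MAX_OID)
++ *		...fail: the allocator is exhausted...
++ *	...create the file...
++ *	oid_count_allocated();	(once creation can no longer be rolled back)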
++ */ ++oid_t oid_allocate(struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ oid_t oid; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) { ++ oid = sbinfo->next_to_use++; ++ sbinfo->oids_in_use++; ++ } else ++ oid = ABSOLUTE_MAX_OID; ++ spin_unlock_reiser4_super(sbinfo); ++ return oid; ++} ++ ++/* ++ * Tell oid allocator that @oid is now free. ++ */ ++int oid_release(struct super_block *super, oid_t oid UNUSED_ARG) ++{ ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ sbinfo->oids_in_use--; ++ spin_unlock_reiser4_super(sbinfo); ++ return 0; ++} ++ ++/* ++ * return next @oid that would be allocated (i.e., returned by oid_allocate()) ++ * without actually allocating it. This is used by disk format plugin to save ++ * oid allocator state on the disk. ++ */ ++oid_t oid_next(const struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ oid_t oid; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ oid = sbinfo->next_to_use; ++ spin_unlock_reiser4_super(sbinfo); ++ return oid; ++} ++ ++/* ++ * returns number of currently used oids. This is used by statfs(2) to report ++ * number of "inodes" and by disk format plugin to save oid allocator state on ++ * the disk. ++ */ ++long oids_used(const struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ oid_t used; ++ ++ sbinfo = get_super_private(super); ++ ++ spin_lock_reiser4_super(sbinfo); ++ used = sbinfo->oids_in_use; ++ spin_unlock_reiser4_super(sbinfo); ++ if (used < (__u64) ((long)~0) >> 1) ++ return (long)used; ++ else ++ return (long)-1; ++} ++ ++/* ++ * Count oid as allocated in atom. This is done after call to oid_allocate() ++ * at the point when we are irrevocably committed to creation of the new file ++ * (i.e., when oid allocation cannot be any longer rolled back due to some ++ * error). ++ */ ++void oid_count_allocated(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ atom->nr_objects_created++; ++ spin_unlock_atom(atom); ++} ++ ++/* ++ * Count oid as free in atom. This is done after call to oid_release() at the ++ * point when we are irrevocably committed to the deletion of the file (i.e., ++ * when oid release cannot be any longer rolled back due to some error). ++ */ ++void oid_count_released(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked(); ++ atom->nr_objects_deleted++; ++ spin_unlock_atom(atom); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/page_cache.c linux-2.6.33/fs/reiser4/page_cache.c +--- linux-2.6.33.orig/fs/reiser4/page_cache.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/page_cache.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,693 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Memory pressure hooks. Fake inodes handling. */ ++ ++/* GLOSSARY ++ ++ . Formatted and unformatted nodes. ++ Elements of reiser4 balanced tree to store data and metadata. ++ Unformatted nodes are pointed to by extent pointers. Such nodes ++ are used to store data of large objects. Unlike unformatted nodes, ++ formatted ones have associated format described by node4X plugin. ++ ++ . 
Jnode (or journal node)
++ The in-memory header which is used to track formatted and unformatted
++ nodes, bitmap nodes, etc. In particular, jnodes are used to track
++ transactional information associated with each block (see reiser4/jnode.c
++ for details).
++
++ . Znode
++ The in-memory header which is used to track formatted nodes. Contains
++ an embedded jnode (see reiser4/znode.c for details).
++*/
++
++/* We store all file system meta data (and data, of course) in the page cache.
++
++ What does this mean? Instead of using bread/brelse we create a special
++ "fake" inode (one per super block) and store the content of formatted nodes
++ in pages bound to this inode in the page cache. In newer kernels bread()
++ already uses the inode attached to the block device (bd_inode). The
++ advantage of having our own fake inode is that we can install appropriate
++ methods in its address_space operations. Such methods are called by the VM
++ on memory pressure (or during background page flushing) and we can use them
++ to react appropriately.
++
++ In the initial version we only support one block per page. Support for
++ multiple blocks per page is complicated by relocation.
++
++ To each page used by reiser4 a jnode is attached. A jnode is analogous to a
++ buffer head. The difference is that a jnode is bound to the page
++ permanently: a jnode cannot be removed from memory until its backing page
++ is.
++
++ A jnode contains a pointer to its page (->pg field) and the page contains a
++ pointer to the jnode in its ->private field. The pointer from jnode to page
++ is protected by the jnode's spinlock, and the pointer from page to jnode is
++ protected by the page lock (PG_locked bit). The lock ordering is: first
++ take the page lock, then the jnode spin lock. To go in the reverse
++ direction use the jnode_lock_page() function, which uses the standard
++ try-lock-and-release device.
++
++ Properties:
++
++ 1. when the jnode-to-page mapping is established (by jnode_attach_page()),
++ the page reference counter is increased.
++
++ 2. when the jnode-to-page mapping is destroyed (by page_clear_jnode()),
++ the page reference counter is decreased.
++
++ 3. on jload() the reference counter on the jnode page is increased, the
++ page is kmapped and `referenced'.
++
++ 4. on jrelse() the inverse operations are performed.
++
++ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
++
++ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
++ historically.]
++
++ [In the following discussion, `lock' invariably means a long term lock on
++ a znode.] (What about page locks?)
++
++ There is a special class of deadlock possibilities related to memory
++ pressure. Locks acquired by other reiser4 threads are accounted for in the
++ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
++ invoked an additional hidden arc is added to the locking graph: the thread
++ that tries to allocate memory waits for ->vm_writeback() to finish. If this
++ thread holds a lock and ->vm_writeback() tries to acquire this lock, the
++ deadlock prevention is useless.
++
++ Another related problem is the possibility for ->vm_writeback() to run out
++ of memory itself. This is not a problem for ext2 and friends, because their
++ ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
++ definitely able to allocate huge amounts of memory.
++
++ It seems that there is no reliable way to cope with the problems above.
++ Instead it was decided that ->vm_writeback() (as invoked in the kswapd
++ context) wouldn't perform any flushing itself, but rather should just wake
++ up some auxiliary thread dedicated to this purpose (or, the same thread
++ that does periodic commit of old atoms (ktxnmgrd.c)).
++
++ Details:
++
++ 1. A page is called `reclaimable' against a particular reiser4 mount F if
++ this page can be ultimately released by try_to_free_pages() under the
++ presumptions that:
++
++	a. ->vm_writeback() for F is a no-op, and
++
++	b. none of the threads accessing F are making any progress, and
++
++	c. other reiser4 mounts obey the same memory reservation protocol
++	as F (described below).
++
++ For example, a clean un-pinned page, or a page occupied by ext2 data, is
++ reclaimable against any reiser4 mount.
++
++ When there is more than one reiser4 mount in a system, condition (c) makes
++ reclaim-ability not easily verifiable beyond the trivial cases mentioned
++ above.
++
++ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
++
++ The fake inode is used to bind formatted nodes, and each node is indexed
++ within the fake inode by its block number. If the block size is smaller
++ than the page size, it may so happen that a block mapped to the page with
++ a formatted node is occupied by an unformatted node or is unallocated.
++ This leads to some complications, because flushing the whole page can lead
++ to an incorrect overwrite of an unformatted node which, moreover, can be
++ cached in some other place as part of the file body. To avoid this,
++ buffers for unformatted nodes are never marked dirty. Also, pages in the
++ fake inode are never marked dirty. This rules out the usage of
++ ->writepage() as a memory pressure hook. Instead ->releasepage() is used.
++
++ Josh is concerned that page->buffer is going to die. This should not pose
++ a significant problem though, because we need to add some data structures
++ to the page anyway (jnode) and all the necessary bookkeeping can be put
++ there.
++
++*/
++
++/* Life cycle of pages/nodes.
++
++ A jnode contains a reference to its page, and the page contains a
++ reference back to the jnode. This reference is counted in the page
++ ->count. Thus, a page bound to a jnode cannot be released back into the
++ free pool.
++
++ 1. Formatted nodes.
++
++	1. a formatted node is represented by a znode. When a new znode is
++	created its ->pg pointer is NULL initially.
++
++	2. when node content is loaded into the znode (by a call to zload())
++	for the first time the following happens (in the call to
++	->read_node() or ->allocate_node()):
++
++		1. a new page is added to the page cache.
++
++		2. this page is attached to the znode and its ->count is
++		increased.
++
++		3. the page is kmapped.
++
++	3. if more calls to zload() follow (without corresponding zrelses),
++	the page counter is left intact and in its stead ->d_count is
++	increased in the znode.
++
++	4. each call to zrelse decreases ->d_count. When ->d_count drops to
++	zero, ->release_node() is called and the page is kunmapped as a
++	result.
++
++	5. at some moment the node can be captured by a transaction. Its
++	->x_count is then increased by the transaction manager.
++
++	6. if the node is removed from the tree (an empty node with the
++	JNODE_HEARD_BANSHEE bit set) the following will happen (also see the
++	comment at the top of znode.c):
++
++		1. when the last lock is released, the node will be
++		uncaptured from its transaction. This releases the reference
++		that the transaction manager acquired at step 5.
++
++		2. when the last reference is released, zput() detects that
++		the node is actually deleted and calls the ->delete_node()
++		operation.
page_cache_delete_node() implementation detaches jnode from ++ page and releases page. ++ ++ 7. otherwise (node wasn't removed from the tree), last reference to ++ znode will be released after transaction manager committed transaction ++ node was in. This implies squallocing of this node (see ++ flush.c). Nothing special happens at this point. Znode is still in the ++ hash table and page is still attached to it. ++ ++ 8. znode is actually removed from the memory because of the memory ++ pressure, or during umount (znodes_tree_done()). Anyway, znode is ++ removed by the call to zdrop(). At this moment, page is detached from ++ znode and removed from the inode address space. ++ ++*/ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "super.h" ++#include "entd.h" ++#include "page_cache.h" ++#include "ktxnmgrd.h" ++ ++#include <linux/types.h> ++#include <linux/fs.h> ++#include <linux/mm.h> /* for struct page */ ++#include <linux/swap.h> /* for struct page */ ++#include <linux/pagemap.h> ++#include <linux/bio.h> ++#include <linux/writeback.h> ++#include <linux/blkdev.h> ++ ++static struct bio *page_bio(struct page *, jnode * , int rw, gfp_t gfp); ++ ++static struct address_space_operations formatted_fake_as_ops; ++ ++static const oid_t fake_ino = 0x1; ++static const oid_t bitmap_ino = 0x2; ++static const oid_t cc_ino = 0x3; ++ ++static void ++init_fake_inode(struct super_block *super, struct inode *fake, ++ struct inode **pfake) ++{ ++ assert("nikita-2168", fake->i_state & I_NEW); ++ fake->i_mapping->a_ops = &formatted_fake_as_ops; ++ *pfake = fake; ++ /* NOTE-NIKITA something else? */ ++ unlock_new_inode(fake); ++} ++ ++/** ++ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps ++ * @super: super block to init fake inode for ++ * ++ * Initializes fake inode to which formatted nodes are bound in the page cache ++ * and inode for bitmaps. ++ */ ++int reiser4_init_formatted_fake(struct super_block *super) ++{ ++ struct inode *fake; ++ struct inode *bitmap; ++ struct inode *cc; ++ reiser4_super_info_data *sinfo; ++ ++ assert("nikita-1703", super != NULL); ++ ++ sinfo = get_super_private_nocheck(super); ++ fake = iget_locked(super, oid_to_ino(fake_ino)); ++ ++ if (fake != NULL) { ++ init_fake_inode(super, fake, &sinfo->fake); ++ ++ bitmap = iget_locked(super, oid_to_ino(bitmap_ino)); ++ if (bitmap != NULL) { ++ init_fake_inode(super, bitmap, &sinfo->bitmap); ++ ++ cc = iget_locked(super, oid_to_ino(cc_ino)); ++ if (cc != NULL) { ++ init_fake_inode(super, cc, &sinfo->cc); ++ return 0; ++ } else { ++ iput(sinfo->fake); ++ iput(sinfo->bitmap); ++ sinfo->fake = NULL; ++ sinfo->bitmap = NULL; ++ } ++ } else { ++ iput(sinfo->fake); ++ sinfo->fake = NULL; ++ } ++ } ++ return RETERR(-ENOMEM); ++} ++ ++/** ++ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps ++ * @super: super block to init fake inode for ++ * ++ * Releases inodes which were used as address spaces of bitmap and formatted ++ * nodes. 
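++ *
++ * A rough pairing sketch (illustrative only; the real call sites are
++ * assumed to be the superblock init/done paths):
++ *
++ *	if (reiser4_init_formatted_fake(super))	/* at mount */
++ *		return RETERR(-ENOMEM);
++ *	...
++ *	reiser4_done_formatted_fake(super);	/* at umount */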
++ */ ++void reiser4_done_formatted_fake(struct super_block *super) ++{ ++ reiser4_super_info_data *sinfo; ++ ++ sinfo = get_super_private_nocheck(super); ++ ++ if (sinfo->fake != NULL) { ++ iput(sinfo->fake); ++ sinfo->fake = NULL; ++ } ++ ++ if (sinfo->bitmap != NULL) { ++ iput(sinfo->bitmap); ++ sinfo->bitmap = NULL; ++ } ++ ++ if (sinfo->cc != NULL) { ++ iput(sinfo->cc); ++ sinfo->cc = NULL; ++ } ++ return; ++} ++ ++void reiser4_wait_page_writeback(struct page *page) ++{ ++ assert("zam-783", PageLocked(page)); ++ ++ do { ++ unlock_page(page); ++ wait_on_page_writeback(page); ++ lock_page(page); ++ } while (PageWriteback(page)); ++} ++ ++/* return tree @page is in */ ++reiser4_tree *reiser4_tree_by_page(const struct page *page/* page to query */) ++{ ++ assert("nikita-2461", page != NULL); ++ return &get_super_private(page->mapping->host->i_sb)->tree; ++} ++ ++/* completion handler for single page bio-based read. ++ ++ mpage_end_io_read() would also do. But it's static. ++ ++*/ ++static void ++end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG) ++{ ++ struct page *page; ++ ++ page = bio->bi_io_vec[0].bv_page; ++ ++ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ bio_put(bio); ++} ++ ++/* completion handler for single page bio-based write. ++ ++ mpage_end_io_write() would also do. But it's static. ++ ++*/ ++static void ++end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG) ++{ ++ struct page *page; ++ ++ page = bio->bi_io_vec[0].bv_page; ++ ++ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) ++ SetPageError(page); ++ end_page_writeback(page); ++ bio_put(bio); ++} ++ ++/* ->readpage() method for formatted nodes */ ++static int formatted_readpage(struct file *f UNUSED_ARG, ++ struct page *page/* page to read */) ++{ ++ assert("nikita-2412", PagePrivate(page) && jprivate(page)); ++ return reiser4_page_io(page, jprivate(page), READ, ++ reiser4_ctx_gfp_mask_get()); ++} ++ ++/** ++ * reiser4_page_io - submit single-page bio request ++ * @page: page to perform io for ++ * @node: jnode of page ++ * @rw: read or write ++ * @gfp: gfp mask for bio allocation ++ * ++ * Submits single page read or write. ++ */ ++int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp) ++{ ++ struct bio *bio; ++ int result; ++ ++ assert("nikita-2094", page != NULL); ++ assert("nikita-2226", PageLocked(page)); ++ assert("nikita-2634", node != NULL); ++ assert("nikita-2893", rw == READ || rw == WRITE); ++ ++ if (rw) { ++ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) { ++ unlock_page(page); ++ return 0; ++ } ++ } ++ ++ bio = page_bio(page, node, rw, gfp); ++ if (!IS_ERR(bio)) { ++ if (rw == WRITE) { ++ set_page_writeback(page); ++ unlock_page(page); ++ } ++ reiser4_submit_bio(rw, bio); ++ result = 0; ++ } else { ++ unlock_page(page); ++ result = PTR_ERR(bio); ++ } ++ ++ return result; ++} ++ ++/* helper function to construct bio for page */ ++static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp) ++{ ++ struct bio *bio; ++ assert("nikita-2092", page != NULL); ++ assert("nikita-2633", node != NULL); ++ ++ /* Simple implementation in the assumption that blocksize == pagesize. ++ ++ We only have to submit one block, but submit_bh() will allocate bio ++ anyway, so lets use all the bells-and-whistles of bio code. 
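++
++	   As a worked example: with a 4096-byte block size, the code below
++	   maps block number N to 512-byte sector N * (4096 >> 9) == N * 8.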
++ */ ++ ++ bio = bio_alloc(gfp, 1); ++ if (bio != NULL) { ++ int blksz; ++ struct super_block *super; ++ reiser4_block_nr blocknr; ++ ++ super = page->mapping->host->i_sb; ++ assert("nikita-2029", super != NULL); ++ blksz = super->s_blocksize; ++ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE); ++ ++ spin_lock_jnode(node); ++ blocknr = *jnode_get_io_block(node); ++ spin_unlock_jnode(node); ++ ++ assert("nikita-2275", blocknr != (reiser4_block_nr) 0); ++ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr)); ++ ++ bio->bi_bdev = super->s_bdev; ++ /* fill bio->bi_sector before calling bio_add_page(), because ++ * q->merge_bvec_fn may want to inspect it (see ++ * drivers/md/linear.c:linear_mergeable_bvec() for example. */ ++ bio->bi_sector = blocknr * (blksz >> 9); ++ ++ if (!bio_add_page(bio, page, blksz, 0)) { ++ warning("nikita-3452", ++ "Single page bio cannot be constructed"); ++ return ERR_PTR(RETERR(-EINVAL)); ++ } ++ ++ /* bio -> bi_idx is filled by bio_init() */ ++ bio->bi_end_io = (rw == READ) ? ++ end_bio_single_page_read : end_bio_single_page_write; ++ ++ return bio; ++ } else ++ return ERR_PTR(RETERR(-ENOMEM)); ++} ++ ++#if 0 ++static int can_hit_entd(reiser4_context *ctx, struct super_block *s) ++{ ++ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic) ++ return 1; ++ if (ctx->super != s) ++ return 1; ++ if (get_super_private(s)->entd.tsk == current) ++ return 0; ++ if (!lock_stack_isclean(&ctx->stack)) ++ return 0; ++ if (ctx->trans->atom != NULL) ++ return 0; ++ return 1; ++} ++#endif ++ ++/** ++ * reiser4_writepage - writepage of struct address_space_operations ++ * @page: page to write ++ * @wbc: ++ * ++ * ++ */ ++/* Common memory pressure notification. */ ++int reiser4_writepage(struct page *page, ++ struct writeback_control *wbc) ++{ ++ /* ++ * assert("edward-1562", ++ * can_hit_entd(get_current_context_check(), sb)); ++ */ ++ assert("vs-828", PageLocked(page)); ++ ++ wbc->sb = page->mapping->host->i_sb; ++ wbc->bdi = page->mapping->backing_dev_info; ++ ++ return write_page_by_ent(page, wbc); ++} ++ ++/* ->set_page_dirty() method of formatted address_space */ ++static int formatted_set_page_dirty(struct page *page) ++{ ++ assert("nikita-2173", page != NULL); ++ BUG(); ++ return __set_page_dirty_nobuffers(page); ++} ++ ++/* writepages method of address space operations in reiser4 is used to involve ++ into transactions pages which are dirtied via mmap. Only regular files can ++ have such pages. Fake inode is used to access formatted nodes via page ++ cache. As formatted nodes can never be mmaped, fake inode's writepages has ++ nothing to do */ ++static int ++writepages_fake(struct address_space *mapping, struct writeback_control *wbc) ++{ ++ return 0; ++} ++ ++/* address space operations for the fake inode */ ++static struct address_space_operations formatted_fake_as_ops = { ++ /* Perform a writeback of a single page as a memory-freeing ++ * operation. */ ++ .writepage = reiser4_writepage, ++ /* this is called to read formatted node */ ++ .readpage = formatted_readpage, ++ /* ->sync_page() method of fake inode address space operations. Called ++ from wait_on_page() and lock_page(). ++ ++ This is most annoyingly misnomered method. Actually it is called ++ from wait_on_page_bit() and lock_page() and its purpose is to ++ actually start io by jabbing device drivers. ++ */ ++ .sync_page = block_sync_page, ++ /* Write back some dirty pages from this mapping. Called from sync. 
++ called during sync (pdflush) */
++	.writepages = writepages_fake,
++	/* Set a page dirty */
++	.set_page_dirty = formatted_set_page_dirty,
++	/* used for read-ahead. Not applicable */
++	.readpages = NULL,
++	.write_begin = NULL,
++	.write_end = NULL,
++	.bmap = NULL,
++	/* called just before a page is being detached from the inode
++	   mapping and removed from memory. Called on truncate, cut/squeeze,
++	   and umount. */
++	.invalidatepage = reiser4_invalidatepage,
++	/* this is called by shrink_cache() so that the file system can try
++	   to release objects (jnodes, buffers, journal heads) attached to
++	   the page and maybe make the page itself freeable.
++	 */
++	.releasepage = reiser4_releasepage,
++	.direct_IO = NULL
++};
++
++/* called just before a page is released (no longer used by reiser4).
++   Callers: jdelete() and extent2tail(). */
++void reiser4_drop_page(struct page *page)
++{
++	assert("nikita-2181", PageLocked(page));
++	clear_page_dirty_for_io(page);
++	ClearPageUptodate(page);
++#if defined(PG_skipped)
++	ClearPageSkipped(page);
++#endif
++	unlock_page(page);
++}
++
++#define JNODE_GANG_SIZE (16)
++
++/* find all jnodes from the range specified and invalidate them */
++static int
++truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
++{
++	reiser4_inode *info;
++	int truncated_jnodes;
++	reiser4_tree *tree;
++	unsigned long index;
++	unsigned long end;
++
++	if (inode_file_plugin(inode) ==
++	    file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
++		/*
++		 * No need to get rid of jnodes here: if the single jnode of
++		 * a page cluster did not have a page, then it was found and
++		 * killed before in
++		 * truncate_complete_page_cluster()->jput()->jput_final(),
++		 * otherwise it will be dropped by reiser4_invalidatepage()
++		 */
++		return 0;
++	truncated_jnodes = 0;
++
++	info = reiser4_inode_data(inode);
++	tree = reiser4_tree_by_inode(inode);
++
++	index = from;
++	end = from + count;
++
++	while (1) {
++		jnode *gang[JNODE_GANG_SIZE];
++		int taken;
++		int i;
++		jnode *node;
++
++		assert("nikita-3466", index <= end);
++
++		read_lock_tree(tree);
++		taken =
++		    radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
++					   (void **)gang, index,
++					   JNODE_GANG_SIZE);
++		for (i = 0; i < taken; ++i) {
++			node = gang[i];
++			if (index_jnode(node) < end)
++				jref(node);
++			else
++				gang[i] = NULL;
++		}
++		read_unlock_tree(tree);
++
++		for (i = 0; i < taken; ++i) {
++			node = gang[i];
++			if (node != NULL) {
++				index = max(index, index_jnode(node));
++				spin_lock_jnode(node);
++				assert("edward-1457", node->pg == NULL);
++				/* this is always called after
++				   truncate_inode_pages_range(). Therefore
++				   the jnode cannot have a page here. New
++				   pages cannot be created, because
++				   truncate_jnodes_range() runs under
++				   exclusive access to the file, whereas new
++				   page creation requires non-exclusive
++				   access */
++				JF_SET(node, JNODE_HEARD_BANSHEE);
++				reiser4_uncapture_jnode(node);
++				unhash_unformatted_jnode(node);
++				truncated_jnodes++;
++				jput(node);
++			} else
++				break;
++		}
++		if (i != taken || taken == 0)
++			break;
++	}
++	return truncated_jnodes;
++}
++
++/* Truncating files in reiser4: problems and solutions.
++
++   VFS calls the fs's truncate after it has called truncate_inode_pages()
++   to get rid of the pages corresponding to the part of the file being
++   truncated. In reiser4 this may result in unallocated extents which do
++   not have jnodes. The flush code does not expect that. The solution of
++   this problem is straightforward.
++   As vfs's truncate is implemented via the setattr operation, it seems
++   reasonable to have a ->setattr() that will cut the file body. However,
++   the flush code also does not expect dirty pages without parent items, so
++   it is impossible to first cut all items and then truncate all pages as
++   two separate steps. We resolve this problem by cutting items one-by-one.
++   Each such fine-grained step, performed under a long-term znode lock,
++   ends by calling the ->kill_hook() method of the killed item to remove
++   its bound pages and jnodes.
++
++   The following function is a common part of the mentioned kill hooks.
++   Also, this is called before tail-to-extent conversion (so as not to
++   manage several copies of the data).
++*/
++void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
++			      unsigned long count, int even_cows)
++{
++	loff_t from_bytes, count_bytes;
++
++	if (count == 0)
++		return;
++	from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
++	count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
++
++	unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
++	truncate_inode_pages_range(mapping, from_bytes,
++				   from_bytes + count_bytes - 1);
++	truncate_jnodes_range(mapping->host, from, count);
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 120
++ * scroll-step: 1
++ * End:
++ */
+diff -urN linux-2.6.33.orig/fs/reiser4/page_cache.h linux-2.6.33/fs/reiser4/page_cache.h
+--- linux-2.6.33.orig/fs/reiser4/page_cache.h	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/page_cache.h	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,66 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
++
++#if !defined(__REISER4_PAGE_CACHE_H__)
++#define __REISER4_PAGE_CACHE_H__
++
++#include "forward.h"
++#include "context.h"		/* for reiser4_ctx_gfp_mask_get() */
++
++#include <linux/fs.h>		/* for struct super_block, address_space */
++#include <linux/mm.h>		/* for struct page */
++#include <linux/pagemap.h>	/* for lock_page() */
++#include <linux/vmalloc.h>	/* for __vmalloc() */
++
++extern int reiser4_init_formatted_fake(struct super_block *);
++extern void reiser4_done_formatted_fake(struct super_block *);
++
++extern reiser4_tree *reiser4_tree_by_page(const struct page *);
++
++#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
++
++extern void reiser4_wait_page_writeback(struct page *);
++static inline void lock_and_wait_page_writeback(struct page *page)
++{
++	lock_page(page);
++	if (unlikely(PageWriteback(page)))
++		reiser4_wait_page_writeback(page);
++}
++
++#define jprivate(page) ((jnode *)page_private(page))
++
++extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
++extern void reiser4_drop_page(struct page *);
++extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
++				     unsigned long count, int even_cows);
++extern void capture_reiser4_inodes(struct super_block *,
++				   struct writeback_control *);
++static inline void *reiser4_vmalloc(unsigned long size)
++{
++	return __vmalloc(size,
++			 reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
++			 PAGE_KERNEL);
++}
++
++#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
++
++#if REISER4_DEBUG
++extern void print_page(const char *prefix, struct page *page);
++#else
++#define print_page(prf, p) noop
++#endif
++
++/* __REISER4_PAGE_CACHE_H__ */
++#endif
++
++/* Make Linus happy.
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/cluster.c linux-2.6.33/fs/reiser4/plugin/cluster.c +--- linux-2.6.33.orig/fs/reiser4/plugin/cluster.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/cluster.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,72 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Contains reiser4 cluster plugins (see ++ http://www.namesys.com/cryptcompress_design.html ++ "Concepts of clustering" for details). */ ++ ++#include "plugin_header.h" ++#include "plugin.h" ++#include "../inode.h" ++ ++static int change_cluster(struct inode *inode, ++ reiser4_plugin * plugin, ++ pset_member memb) ++{ ++ assert("edward-1324", inode != NULL); ++ assert("edward-1325", plugin != NULL); ++ assert("edward-1326", is_reiser4_inode(inode)); ++ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE); ++ ++ /* Can't change the cluster plugin for already existent regular files */ ++ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) ++ return RETERR(-EINVAL); ++ ++ /* If matches, nothing to change. */ ++ if (inode_hash_plugin(inode) != NULL && ++ inode_hash_plugin(inode)->h.id == plugin->h.id) ++ return 0; ++ ++ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, ++ PSET_CLUSTER, plugin); ++} ++ ++static reiser4_plugin_ops cluster_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = &change_cluster ++}; ++ ++#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \ ++ [CLUSTER_ ## ID ## _ID] = { \ ++ .h = { \ ++ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \ ++ .id = CLUSTER_ ## ID ## _ID, \ ++ .pops = &cluster_plugin_ops, \ ++ .label = LABEL, \ ++ .desc = DESC, \ ++ .linkage = {NULL, NULL} \ ++ }, \ ++ .shift = SHIFT \ ++ } ++ ++cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = { ++ SUPPORT_CLUSTER(16, 64K, "64K", "Large"), ++ SUPPORT_CLUSTER(15, 32K, "32K", "Big"), ++ SUPPORT_CLUSTER(14, 16K, "16K", "Average"), ++ SUPPORT_CLUSTER(13, 8K, "8K", "Small"), ++ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal") ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/cluster.h linux-2.6.33/fs/reiser4/plugin/cluster.h +--- linux-2.6.33.orig/fs/reiser4/plugin/cluster.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/cluster.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,410 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* This file contains size/offset translators, modulators ++ and other helper functions. 
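++
++   A worked example, assuming 4K pages (PAGE_CACHE_SHIFT == 12) and the
++   64K cluster plugin (shift == 16): cluster_nrpages() is then
++   1 << (16 - 12) == 16 pages per cluster, and for file offset 70000:
++   off_to_pg(70000) == 17, off_to_clust(70000) == 1, and
++   off_to_cloff(70000) == 70000 & 65535 == 4464.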
*/ ++ ++#if !defined(__FS_REISER4_CLUSTER_H__) ++#define __FS_REISER4_CLUSTER_H__ ++ ++#include "../inode.h" ++ ++static inline int inode_cluster_shift(struct inode *inode) ++{ ++ assert("edward-92", inode != NULL); ++ assert("edward-93", reiser4_inode_data(inode) != NULL); ++ ++ return inode_cluster_plugin(inode)->shift; ++} ++ ++static inline unsigned cluster_nrpages_shift(struct inode *inode) ++{ ++ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT; ++} ++ ++/* cluster size in page units */ ++static inline unsigned cluster_nrpages(struct inode *inode) ++{ ++ return 1U << cluster_nrpages_shift(inode); ++} ++ ++static inline size_t inode_cluster_size(struct inode *inode) ++{ ++ assert("edward-96", inode != NULL); ++ ++ return 1U << inode_cluster_shift(inode); ++} ++ ++static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode) ++{ ++ return idx >> cluster_nrpages_shift(inode); ++} ++ ++static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode) ++{ ++ return idx << cluster_nrpages_shift(inode); ++} ++ ++static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode) ++{ ++ return clust_to_pg(pg_to_clust(idx, inode), inode); ++} ++ ++static inline pgoff_t off_to_pg(loff_t off) ++{ ++ return (off >> PAGE_CACHE_SHIFT); ++} ++ ++static inline loff_t pg_to_off(pgoff_t idx) ++{ ++ return ((loff_t) (idx) << PAGE_CACHE_SHIFT); ++} ++ ++static inline cloff_t off_to_clust(loff_t off, struct inode *inode) ++{ ++ return off >> inode_cluster_shift(inode); ++} ++ ++static inline loff_t clust_to_off(cloff_t idx, struct inode *inode) ++{ ++ return (loff_t) idx << inode_cluster_shift(inode); ++} ++ ++static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode) ++{ ++ return clust_to_off(off_to_clust(off, inode), inode); ++} ++ ++static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode) ++{ ++ return clust_to_pg(off_to_clust(off, inode), inode); ++} ++ ++static inline unsigned off_to_pgoff(loff_t off) ++{ ++ return off & (PAGE_CACHE_SIZE - 1); ++} ++ ++static inline unsigned off_to_cloff(loff_t off, struct inode *inode) ++{ ++ return off & ((loff_t) (inode_cluster_size(inode)) - 1); ++} ++ ++static inline pgoff_t offset_in_clust(struct page *page) ++{ ++ assert("edward-1488", page != NULL); ++ assert("edward-1489", page->mapping != NULL); ++ ++ return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1); ++} ++ ++static inline int first_page_in_cluster(struct page *page) ++{ ++ return offset_in_clust(page) == 0; ++} ++ ++static inline int last_page_in_cluster(struct page *page) ++{ ++ return offset_in_clust(page) == ++ cluster_nrpages(page->mapping->host) - 1; ++} ++ ++static inline unsigned ++pg_to_off_to_cloff(unsigned long idx, struct inode *inode) ++{ ++ return off_to_cloff(pg_to_off(idx), inode); ++} ++ ++/*********************** Size translators **************************/ ++ ++/* Translate linear size. ++ * New units are (1 << @blk_shift) times larger, then old ones. 
++ * In other words, calculate the number of logical blocks occupied
++ * by @count elements.
++ */
++static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
++{
++	return (count + (1UL << blkbits) - 1) >> blkbits;
++}
++
++/* size in pages */
++static inline pgoff_t size_in_pages(loff_t size)
++{
++	return size_in_blocks(size, PAGE_CACHE_SHIFT);
++}
++
++/* size in logical clusters */
++static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
++{
++	return size_in_blocks(size, inode_cluster_shift(inode));
++}
++
++/* size in pages to the size in page clusters */
++static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
++{
++	return size_in_blocks(size, cluster_nrpages_shift(inode));
++}
++
++/*********************** Size modulators ***************************/
++
++/*
++   Modulate a linear size by the nominated block size and offset.
++
++   This is the "finite" function (which is zero almost everywhere):
++   how high is the figure at position @pos, when a rectangle of height
++   (1 << @blkbits) and total area @size is being constructed.
++
++   ******
++   *******
++   *******
++   *******
++   ----------> pos
++*/
++static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
++{
++	unsigned end = size >> blkbits;
++	if (pos < end)
++		return 1U << blkbits;
++	if (unlikely(pos > end))
++		return 0;
++	return size & ~(~0ull << blkbits);
++}
++
++/* the same as above, but the block size is the page size */
++static inline unsigned __mbp(loff_t size, pgoff_t pos)
++{
++	return __mbb(size, pos, PAGE_CACHE_SHIFT);
++}
++
++/* number of the file's bytes in the nominated logical cluster */
++static inline unsigned lbytes(cloff_t index, struct inode *inode)
++{
++	return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
++}
++
++/* number of the file's bytes in the nominated page */
++static inline unsigned pbytes(pgoff_t index, struct inode *inode)
++{
++	return __mbp(i_size_read(inode), index);
++}
++
++/**
++ * number of pages occupied by @win->count bytes starting from
++ * @win->off at the logical cluster defined by @win. This is exactly
++ * the number of pages to be modified and dirtied in any cluster operation.
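++ *
++ * For example, assuming 4K pages (PAGE_CACHE_SHIFT == 12), @win->off ==
++ * 5000 and @win->count == 9000: the window spans bytes [5000, 14000),
++ * i.e. pages 1..3, and the formula below yields
++ * ((5000 + 9000 + 4095) >> 12) - (5000 >> 12) == 4 - 1 == 3.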
++ */ ++static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win) ++{ ++ return ((win->off + win->count + ++ (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) - ++ off_to_pg(win->off); ++} ++ ++/* return true, if logical cluster is not occupied by the file */ ++static inline int new_logical_cluster(struct cluster_handle *clust, ++ struct inode *inode) ++{ ++ return clust_to_off(clust->index, inode) >= i_size_read(inode); ++} ++ ++/* return true, if pages @p1 and @p2 are of the same page cluster */ ++static inline int same_page_cluster(struct page *p1, struct page *p2) ++{ ++ assert("edward-1490", p1 != NULL); ++ assert("edward-1491", p2 != NULL); ++ assert("edward-1492", p1->mapping != NULL); ++ assert("edward-1493", p2->mapping != NULL); ++ ++ return (pg_to_clust(page_index(p1), p1->mapping->host) == ++ pg_to_clust(page_index(p2), p2->mapping->host)); ++} ++ ++static inline int cluster_is_complete(struct cluster_handle *clust, ++ struct inode *inode) ++{ ++ return clust->tc.lsize == inode_cluster_size(inode); ++} ++ ++static inline void reiser4_slide_init(struct reiser4_slide *win) ++{ ++ assert("edward-1084", win != NULL); ++ memset(win, 0, sizeof *win); ++} ++ ++static inline tfm_action ++cluster_get_tfm_act(struct tfm_cluster *tc) ++{ ++ assert("edward-1356", tc != NULL); ++ return tc->act; ++} ++ ++static inline void ++cluster_set_tfm_act(struct tfm_cluster *tc, tfm_action act) ++{ ++ assert("edward-1356", tc != NULL); ++ tc->act = act; ++} ++ ++static inline void cluster_init_act(struct cluster_handle *clust, ++ tfm_action act, ++ struct reiser4_slide *window) ++{ ++ assert("edward-84", clust != NULL); ++ memset(clust, 0, sizeof *clust); ++ cluster_set_tfm_act(&clust->tc, act); ++ clust->dstat = INVAL_DISK_CLUSTER; ++ clust->win = window; ++} ++ ++static inline void cluster_init_read(struct cluster_handle *clust, ++ struct reiser4_slide *window) ++{ ++ cluster_init_act(clust, TFMA_READ, window); ++} ++ ++static inline void cluster_init_write(struct cluster_handle *clust, ++ struct reiser4_slide *window) ++{ ++ cluster_init_act(clust, TFMA_WRITE, window); ++} ++ ++/* true if @p1 and @p2 are items of the same disk cluster */ ++static inline int same_disk_cluster(const coord_t *p1, const coord_t *p2) ++{ ++ /* drop this if you have other items to aggregate */ ++ assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID); ++ ++ return item_plugin_by_coord(p1)->b.mergeable(p1, p2); ++} ++ ++static inline int dclust_get_extension_dsize(hint_t *hint) ++{ ++ return hint->ext_coord.extension.ctail.dsize; ++} ++ ++static inline void dclust_set_extension_dsize(hint_t *hint, int dsize) ++{ ++ hint->ext_coord.extension.ctail.dsize = dsize; ++} ++ ++static inline int dclust_get_extension_shift(hint_t *hint) ++{ ++ return hint->ext_coord.extension.ctail.shift; ++} ++ ++static inline int dclust_get_extension_ncount(hint_t *hint) ++{ ++ return hint->ext_coord.extension.ctail.ncount; ++} ++ ++static inline void dclust_inc_extension_ncount(hint_t *hint) ++{ ++ hint->ext_coord.extension.ctail.ncount++; ++} ++ ++static inline void dclust_init_extension(hint_t *hint) ++{ ++ memset(&hint->ext_coord.extension.ctail, 0, ++ sizeof(hint->ext_coord.extension.ctail)); ++} ++ ++static inline int hint_is_unprepped_dclust(hint_t *hint) ++{ ++ assert("edward-1451", hint_is_valid(hint)); ++ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT; ++} ++ ++static inline void coord_set_between_clusters(coord_t *coord) ++{ ++#if REISER4_DEBUG ++ int result; ++ result = zload(coord->node); ++ 
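++	/* zload() loads and pins the node so that coord_is_between_items()
++	   below can look at it; it is assumed to return 0 on success, hence
++	   the assertion. */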
assert("edward-1296", !result); ++#endif ++ if (!coord_is_between_items(coord)) { ++ coord->between = AFTER_ITEM; ++ coord->unit_pos = 0; ++ } ++#if REISER4_DEBUG ++ zrelse(coord->node); ++#endif ++} ++ ++int reiser4_inflate_cluster(struct cluster_handle *, struct inode *); ++int find_disk_cluster(struct cluster_handle *, struct inode *, int read, ++ znode_lock_mode mode); ++int checkout_logical_cluster(struct cluster_handle *, jnode * , struct inode *); ++int reiser4_deflate_cluster(struct cluster_handle *, struct inode *); ++void truncate_complete_page_cluster(struct inode *inode, cloff_t start, ++ int even_cows); ++void invalidate_hint_cluster(struct cluster_handle *clust); ++int get_disk_cluster_locked(struct cluster_handle *clust, struct inode *inode, ++ znode_lock_mode lock_mode); ++void reset_cluster_params(struct cluster_handle *clust); ++int set_cluster_by_page(struct cluster_handle *clust, struct page *page, ++ int count); ++int prepare_page_cluster(struct inode *inode, struct cluster_handle *clust, ++ rw_op rw); ++void __put_page_cluster(int from, int count, struct page **pages, ++ struct inode *inode); ++void put_page_cluster(struct cluster_handle *clust, ++ struct inode *inode, rw_op rw); ++void put_cluster_handle(struct cluster_handle *clust); ++int grab_tfm_stream(struct inode *inode, struct tfm_cluster *tc, ++ tfm_stream_id id); ++int tfm_cluster_is_uptodate(struct tfm_cluster *tc); ++void tfm_cluster_set_uptodate(struct tfm_cluster *tc); ++void tfm_cluster_clr_uptodate(struct tfm_cluster *tc); ++ ++/* move cluster handle to the target position ++ specified by the page of index @pgidx */ ++static inline void move_cluster_forward(struct cluster_handle *clust, ++ struct inode *inode, ++ pgoff_t pgidx) ++{ ++ assert("edward-1297", clust != NULL); ++ assert("edward-1298", inode != NULL); ++ ++ reset_cluster_params(clust); ++ if (clust->index_valid && ++ /* Hole in the indices. Hint became invalid and can not be ++ used by find_cluster_item() even if seal/node versions ++ will coincide */ ++ pg_to_clust(pgidx, inode) != clust->index + 1) { ++ reiser4_unset_hint(clust->hint); ++ invalidate_hint_cluster(clust); ++ } ++ clust->index = pg_to_clust(pgidx, inode); ++ clust->index_valid = 1; ++} ++ ++static inline int alloc_clust_pages(struct cluster_handle *clust, ++ struct inode *inode) ++{ ++ assert("edward-791", clust != NULL); ++ assert("edward-792", inode != NULL); ++ clust->pages = ++ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode), ++ reiser4_ctx_gfp_mask_get()); ++ if (!clust->pages) ++ return -ENOMEM; ++ return 0; ++} ++ ++static inline void free_clust_pages(struct cluster_handle *clust) ++{ ++ kfree(clust->pages); ++} ++ ++#endif /* __FS_REISER4_CLUSTER_H__ */ ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.33/fs/reiser4/plugin/compress/compress.c +--- linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/compress/compress.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,355 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* reiser4 compression transform plugins */ ++ ++#include "../../debug.h" ++#include "../../inode.h" ++#include "../plugin.h" ++ ++#include <linux/lzo.h> ++#include <linux/zlib.h> ++#include <linux/types.h> ++#include <linux/hardirq.h> ++ ++static int change_compression(struct inode *inode, ++ reiser4_plugin * plugin, ++ pset_member memb) ++{ ++ assert("edward-1316", inode != NULL); ++ assert("edward-1317", plugin != NULL); ++ assert("edward-1318", is_reiser4_inode(inode)); ++ assert("edward-1319", ++ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE); ++ ++ /* cannot change compression plugin of already existing regular object */ ++ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) ++ return RETERR(-EINVAL); ++ ++ /* If matches, nothing to change. */ ++ if (inode_hash_plugin(inode) != NULL && ++ inode_hash_plugin(inode)->h.id == plugin->h.id) ++ return 0; ++ ++ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, ++ PSET_COMPRESSION, plugin); ++} ++ ++static reiser4_plugin_ops compression_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = &change_compression ++}; ++ ++/******************************************************************************/ ++/* gzip1 compression */ ++/******************************************************************************/ ++ ++#define GZIP1_DEF_LEVEL Z_BEST_SPEED ++#define GZIP1_DEF_WINBITS 15 ++#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL ++ ++static int gzip1_init(void) ++{ ++ return 0; ++} ++ ++static int gzip1_overrun(unsigned src_len UNUSED_ARG) ++{ ++ return 0; ++} ++ ++static coa_t gzip1_alloc(tfm_action act) ++{ ++ coa_t coa = NULL; ++ int ret = 0; ++ switch (act) { ++ case TFMA_WRITE: /* compress */ ++ coa = reiser4_vmalloc(zlib_deflate_workspacesize()); ++ if (!coa) { ++ ret = -ENOMEM; ++ break; ++ } ++ break; ++ case TFMA_READ: /* decompress */ ++ coa = reiser4_vmalloc(zlib_inflate_workspacesize()); ++ if (!coa) { ++ ret = -ENOMEM; ++ break; ++ } ++ break; ++ default: ++ impossible("edward-767", ++ "trying to alloc workspace for unknown tfm action"); ++ } ++ if (ret) { ++ warning("edward-768", ++ "alloc workspace for gzip1 (tfm action = %d) failed\n", ++ act); ++ return ERR_PTR(ret); ++ } ++ return coa; ++} ++ ++static void gzip1_free(coa_t coa, tfm_action act) ++{ ++ assert("edward-769", coa != NULL); ++ ++ switch (act) { ++ case TFMA_WRITE: /* compress */ ++ vfree(coa); ++ break; ++ case TFMA_READ: /* decompress */ ++ vfree(coa); ++ break; ++ default: ++ impossible("edward-770", "unknown tfm action"); ++ } ++ return; ++} ++ ++static int gzip1_min_size_deflate(void) ++{ ++ return 64; ++} ++ ++static void ++gzip1_compress(coa_t coa, __u8 * src_first, size_t src_len, ++ __u8 * dst_first, size_t *dst_len) ++{ ++ int ret = 0; ++ struct z_stream_s stream; ++ ++ assert("edward-842", coa != NULL); ++ assert("edward-875", src_len != 0); ++ ++ stream.workspace = coa; ++ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, 
Z_DEFLATED, ++ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL, ++ Z_DEFAULT_STRATEGY); ++ if (ret != Z_OK) { ++ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret); ++ goto rollback; ++ } ++ ret = zlib_deflateReset(&stream); ++ if (ret != Z_OK) { ++ warning("edward-772", "zlib_deflateReset returned %d\n", ret); ++ goto rollback; ++ } ++ stream.next_in = src_first; ++ stream.avail_in = src_len; ++ stream.next_out = dst_first; ++ stream.avail_out = *dst_len; ++ ++ ret = zlib_deflate(&stream, Z_FINISH); ++ if (ret != Z_STREAM_END) { ++ if (ret != Z_OK) ++ warning("edward-773", ++ "zlib_deflate returned %d\n", ret); ++ goto rollback; ++ } ++ *dst_len = stream.total_out; ++ return; ++ rollback: ++ *dst_len = src_len; ++ return; ++} ++ ++static void ++gzip1_decompress(coa_t coa, __u8 * src_first, size_t src_len, ++ __u8 * dst_first, size_t *dst_len) ++{ ++ int ret = 0; ++ struct z_stream_s stream; ++ ++ assert("edward-843", coa != NULL); ++ assert("edward-876", src_len != 0); ++ ++ stream.workspace = coa; ++ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS); ++ if (ret != Z_OK) { ++ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret); ++ return; ++ } ++ ret = zlib_inflateReset(&stream); ++ if (ret != Z_OK) { ++ warning("edward-775", "zlib_inflateReset returned %d\n", ret); ++ return; ++ } ++ ++ stream.next_in = src_first; ++ stream.avail_in = src_len; ++ stream.next_out = dst_first; ++ stream.avail_out = *dst_len; ++ ++ ret = zlib_inflate(&stream, Z_SYNC_FLUSH); ++ /* ++ * Work around a bug in zlib, which sometimes wants to taste an extra ++ * byte when being used in the (undocumented) raw deflate mode. ++ * (From USAGI). ++ */ ++ if (ret == Z_OK && !stream.avail_in && stream.avail_out) { ++ u8 zerostuff = 0; ++ stream.next_in = &zerostuff; ++ stream.avail_in = 1; ++ ret = zlib_inflate(&stream, Z_FINISH); ++ } ++ if (ret != Z_STREAM_END) { ++ warning("edward-776", "zlib_inflate returned %d\n", ret); ++ return; ++ } ++ *dst_len = stream.total_out; ++ return; ++} ++ ++/******************************************************************************/ ++/* lzo1 compression */ ++/******************************************************************************/ ++ ++static int lzo1_init(void) ++{ ++ return 0; ++} ++ ++static int lzo1_overrun(unsigned in_len) ++{ ++ return in_len / 64 + 16 + 3; ++} ++ ++static coa_t lzo1_alloc(tfm_action act) ++{ ++ int ret = 0; ++ coa_t coa = NULL; ++ ++ switch (act) { ++ case TFMA_WRITE: /* compress */ ++ coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS); ++ if (!coa) { ++ ret = -ENOMEM; ++ break; ++ } ++ case TFMA_READ: /* decompress */ ++ break; ++ default: ++ impossible("edward-877", ++ "trying to alloc workspace for unknown tfm action"); ++ } ++ if (ret) { ++ warning("edward-878", ++ "alloc workspace for lzo1 (tfm action = %d) failed\n", ++ act); ++ return ERR_PTR(ret); ++ } ++ return coa; ++} ++ ++static void lzo1_free(coa_t coa, tfm_action act) ++{ ++ assert("edward-879", coa != NULL); ++ ++ switch (act) { ++ case TFMA_WRITE: /* compress */ ++ vfree(coa); ++ break; ++ case TFMA_READ: /* decompress */ ++ impossible("edward-1304", ++ "trying to free non-allocated workspace"); ++ default: ++ impossible("edward-880", "unknown tfm action"); ++ } ++ return; ++} ++ ++static int lzo1_min_size_deflate(void) ++{ ++ return 256; ++} ++ ++static void ++lzo1_compress(coa_t coa, __u8 * src_first, size_t src_len, ++ __u8 * dst_first, size_t *dst_len) ++{ ++ int result; ++ ++ assert("edward-846", coa != NULL); ++ assert("edward-847", src_len != 0); ++ ++ result = 
lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa); ++ if (unlikely(result != LZO_E_OK)) { ++ warning("edward-849", "lzo1x_1_compress failed\n"); ++ goto out; ++ } ++ if (*dst_len >= src_len) { ++ //warning("edward-850", "lzo1x_1_compress: incompressible data\n"); ++ goto out; ++ } ++ return; ++ out: ++ *dst_len = src_len; ++ return; ++} ++ ++static void ++lzo1_decompress(coa_t coa, __u8 * src_first, size_t src_len, ++ __u8 * dst_first, size_t *dst_len) ++{ ++ int result; ++ ++ assert("edward-851", coa == NULL); ++ assert("edward-852", src_len != 0); ++ ++ result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len); ++ if (result != LZO_E_OK) ++ warning("edward-853", "lzo1x_1_decompress failed\n"); ++ return; ++} ++ ++compression_plugin compression_plugins[LAST_COMPRESSION_ID] = { ++ [LZO1_COMPRESSION_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .id = LZO1_COMPRESSION_ID, ++ .pops = &compression_plugin_ops, ++ .label = "lzo1", ++ .desc = "lzo1 compression transform", ++ .linkage = {NULL, NULL} ++ }, ++ .init = lzo1_init, ++ .overrun = lzo1_overrun, ++ .alloc = lzo1_alloc, ++ .free = lzo1_free, ++ .min_size_deflate = lzo1_min_size_deflate, ++ .checksum = reiser4_adler32, ++ .compress = lzo1_compress, ++ .decompress = lzo1_decompress ++ }, ++ [GZIP1_COMPRESSION_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .id = GZIP1_COMPRESSION_ID, ++ .pops = &compression_plugin_ops, ++ .label = "gzip1", ++ .desc = "gzip1 compression transform", ++ .linkage = {NULL, NULL} ++ }, ++ .init = gzip1_init, ++ .overrun = gzip1_overrun, ++ .alloc = gzip1_alloc, ++ .free = gzip1_free, ++ .min_size_deflate = gzip1_min_size_deflate, ++ .checksum = reiser4_adler32, ++ .compress = gzip1_compress, ++ .decompress = gzip1_decompress ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.33/fs/reiser4/plugin/compress/compress.h +--- linux-2.6.33.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/compress/compress.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,43 @@ ++#if !defined( __FS_REISER4_COMPRESS_H__ ) ++#define __FS_REISER4_COMPRESS_H__ ++ ++#include <linux/types.h> ++#include <linux/string.h> ++ ++/* transform direction */ ++typedef enum { ++ TFMA_READ, /* decrypt, decompress */ ++ TFMA_WRITE, /* encrypt, compress */ ++ TFMA_LAST ++} tfm_action; ++ ++/* supported compression algorithms */ ++typedef enum { ++ LZO1_COMPRESSION_ID, ++ GZIP1_COMPRESSION_ID, ++ LAST_COMPRESSION_ID, ++} reiser4_compression_id; ++ ++/* the same as pgoff, but units are page clusters */ ++typedef unsigned long cloff_t; ++ ++/* working data of a (de)compression algorithm */ ++typedef void *coa_t; ++ ++/* table for all supported (de)compression algorithms */ ++typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST]; ++ ++__u32 reiser4_adler32(char *data, __u32 len); ++ ++#endif /* __FS_REISER4_COMPRESS_H__ */ ++ ++/* Make Linus happy. 
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 120
++ scroll-step: 1
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.33/fs/reiser4/plugin/compress/compress_mode.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/compress/compress_mode.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/compress/compress_mode.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,162 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++/* This file contains Reiser4 compression mode plugins.
++
++   A compression mode plugin is a set of handlers called by the compressor
++   at flush time. They implement heuristics, including ones meant to avoid
++   compressing incompressible data; see
++   http://www.namesys.com/cryptcompress_design.html for more details.
++*/
++#include "../../inode.h"
++#include "../plugin.h"
++
++static int should_deflate_none(struct inode * inode, cloff_t index)
++{
++	return 0;
++}
++
++static int should_deflate_common(struct inode * inode, cloff_t index)
++{
++	return compression_is_on(cryptcompress_inode_data(inode));
++}
++
++static int discard_hook_ultim(struct inode *inode, cloff_t index)
++{
++	turn_off_compression(cryptcompress_inode_data(inode));
++	return 0;
++}
++
++static int discard_hook_lattd(struct inode *inode, cloff_t index)
++{
++	struct cryptcompress_info * info = cryptcompress_inode_data(inode);
++
++	assert("edward-1462",
++	       get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
++	       get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
++
++	turn_off_compression(info);
++	if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
++		set_lattice_factor(info, get_lattice_factor(info) << 1);
++	return 0;
++}
++
++static int accept_hook_lattd(struct inode *inode, cloff_t index)
++{
++	turn_on_compression(cryptcompress_inode_data(inode));
++	set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
++	return 0;
++}
++
++/* Check on dynamic lattice: the adaptive compression mode, which defines
++   the following behavior:
++
++   Compression is on: try to compress everything, and turn
++   it off whenever a cluster is incompressible.
++
++   Compression is off: try to compress clusters of indexes
++   k * FACTOR (k = 0, 1, 2, ...) and turn it on if some of
++   them are compressible; if they are incompressible, then
++   increase FACTOR. For example, with FACTOR == 4 only the
++   clusters 0, 4, 8, ... are probed. */
++
++/* check if @index belongs to the one-dimensional lattice
++   of sparse factor @factor */
++static int is_on_lattice(cloff_t index, int factor)
++{
++	return (factor ?
index % factor == 0: index == 0); ++} ++ ++static int should_deflate_lattd(struct inode * inode, cloff_t index) ++{ ++ return should_deflate_common(inode, index) || ++ is_on_lattice(index, ++ get_lattice_factor ++ (cryptcompress_inode_data(inode))); ++} ++ ++/* compression mode_plugins */ ++compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = { ++ [NONE_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = NONE_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "none", ++ .desc = "Compress nothing", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = should_deflate_none, ++ .accept_hook = NULL, ++ .discard_hook = NULL ++ }, ++ /* Check-on-dynamic-lattice adaptive compression mode */ ++ [LATTD_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = LATTD_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "lattd", ++ .desc = "Check on dynamic lattice", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = should_deflate_lattd, ++ .accept_hook = accept_hook_lattd, ++ .discard_hook = discard_hook_lattd ++ }, ++ /* Check-ultimately compression mode: ++ Turn off compression forever as soon as we meet ++ incompressible data */ ++ [ULTIM_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = ULTIM_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "ultim", ++ .desc = "Check ultimately", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = should_deflate_common, ++ .accept_hook = NULL, ++ .discard_hook = discard_hook_ultim ++ }, ++ /* Force-to-compress-everything compression mode */ ++ [FORCE_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = FORCE_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "force", ++ .desc = "Force to compress everything", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = NULL, ++ .accept_hook = NULL, ++ .discard_hook = NULL ++ }, ++ /* Convert-to-extent compression mode. ++ In this mode items will be converted to extents and management ++ will be passed to (classic) unix file plugin as soon as ->write() ++ detects that the first complete logical cluster (of index #0) is ++ incompressible. 
*/ ++ [CONVX_COMPRESSION_MODE_ID] = { ++ .h = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .id = CONVX_COMPRESSION_MODE_ID, ++ .pops = NULL, ++ .label = "conv", ++ .desc = "Convert to extent", ++ .linkage = {NULL, NULL} ++ }, ++ .should_deflate = should_deflate_common, ++ .accept_hook = NULL, ++ .discard_hook = NULL ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.33/fs/reiser4/plugin/compress/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/compress/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_REISER4_FS) += compress_plugins.o ++ ++compress_plugins-objs := \ ++ compress.o \ ++ compress_mode.o +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.33/fs/reiser4/plugin/crypto/cipher.c +--- linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/crypto/cipher.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,37 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, ++ licensing governed by reiser4/README */ ++/* Reiser4 cipher transform plugins */ ++ ++#include "../../debug.h" ++#include "../plugin.h" ++ ++cipher_plugin cipher_plugins[LAST_CIPHER_ID] = { ++ [NONE_CIPHER_ID] = { ++ .h = { ++ .type_id = REISER4_CIPHER_PLUGIN_TYPE, ++ .id = NONE_CIPHER_ID, ++ .pops = NULL, ++ .label = "none", ++ .desc = "no cipher transform", ++ .linkage = {NULL, NULL} ++ }, ++ .alloc = NULL, ++ .free = NULL, ++ .scale = NULL, ++ .align_stream = NULL, ++ .setkey = NULL, ++ .encrypt = NULL, ++ .decrypt = NULL ++ } ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.33/fs/reiser4/plugin/crypto/cipher.h +--- linux-2.6.33.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/crypto/cipher.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,55 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* This file contains definitions for the objects operated ++ by reiser4 key manager, which is something like keyring ++ wrapped by appropriate reiser4 plugin */ ++ ++#if !defined( __FS_REISER4_CRYPT_H__ ) ++#define __FS_REISER4_CRYPT_H__ ++ ++#include <linux/crypto.h> ++ ++/* key info imported from user space */ ++struct reiser4_crypto_data { ++ int keysize; /* uninstantiated key size */ ++ __u8 * key; /* uninstantiated key */ ++ int keyid_size; /* size of passphrase */ ++ __u8 * keyid; /* passphrase */ ++}; ++ ++/* This object contains all needed infrastructure to implement ++ cipher transform. This is operated (allocating, inheriting, ++ validating, binding to host inode, etc..) by reiser4 key manager. ++ ++ This info can be allocated in two cases: ++ 1. importing a key from user space. ++ 2. reading inode from disk */ ++struct reiser4_crypto_info { ++ struct inode * host; ++ struct crypto_hash * digest; ++ struct crypto_blkcipher * cipher; ++#if 0 ++ cipher_key_plugin * kplug; /* key manager */ ++#endif ++ __u8 * keyid; /* key fingerprint, created by digest plugin, ++ using uninstantiated key and passphrase. 
++ supposed to be stored in disk stat-data */ ++ int inst; /* this indicates if the cipher key is ++ instantiated (case 1 above) */ ++ int keysize; /* uninstantiated key size (bytes), supposed ++ to be stored in disk stat-data */ ++ int keyload_count; /* number of the objects which has this ++ crypto-stat attached */ ++}; ++ ++#endif /* __FS_REISER4_CRYPT_H__ */ ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.33/fs/reiser4/plugin/crypto/digest.c +--- linux-2.6.33.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/crypto/digest.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,58 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */ ++/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */ ++#include "../../debug.h" ++#include "../plugin_header.h" ++#include "../plugin.h" ++#include "../file/cryptcompress.h" ++ ++#include <linux/types.h> ++ ++extern digest_plugin digest_plugins[LAST_DIGEST_ID]; ++ ++static struct crypto_hash * alloc_sha256 (void) ++{ ++#if REISER4_SHA256 ++ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC); ++#else ++ warning("edward-1418", "sha256 unsupported"); ++ return ERR_PTR(-EINVAL); ++#endif ++} ++ ++static void free_sha256 (struct crypto_hash * tfm) ++{ ++#if REISER4_SHA256 ++ crypto_free_hash(tfm); ++#endif ++ return; ++} ++ ++/* digest plugins */ ++digest_plugin digest_plugins[LAST_DIGEST_ID] = { ++ [SHA256_32_DIGEST_ID] = { ++ .h = { ++ .type_id = REISER4_DIGEST_PLUGIN_TYPE, ++ .id = SHA256_32_DIGEST_ID, ++ .pops = NULL, ++ .label = "sha256_32", ++ .desc = "sha256_32 digest transform", ++ .linkage = {NULL, NULL} ++ }, ++ .fipsize = sizeof(__u32), ++ .alloc = alloc_sha256, ++ .free = free_sha256 ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.33/fs/reiser4/plugin/dir/dir.h +--- linux-2.6.33.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/dir/dir.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,36 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* this file contains declarations of methods implementing directory plugins */ ++ ++#if !defined( __REISER4_DIR_H__ ) ++#define __REISER4_DIR_H__ ++ ++/*#include "../../key.h" ++ ++#include <linux/fs.h>*/ ++ ++/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */ ++ ++/* "hashed" directory methods of dir plugin */ ++void build_entry_key_hashed(const struct inode *, const struct qstr *, ++ reiser4_key *); ++ ++/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */ ++ ++/* "seekable" directory methods of dir plugin */ ++void build_entry_key_seekable(const struct inode *, const struct qstr *, ++ reiser4_key *); ++ ++/* __REISER4_DIR_H__ */ ++#endif ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/hashed_dir.c 
linux-2.6.33/fs/reiser4/plugin/dir/hashed_dir.c +--- linux-2.6.33.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/dir/hashed_dir.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,81 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file ++ names to the files. */ ++ ++/* ++ * Hashed directory logically consists of persistent directory ++ * entries. Directory entry is a pair of a file name and a key of stat-data of ++ * a file that has this name in the given directory. ++ * ++ * Directory entries are stored in the tree in the form of directory ++ * items. Directory item should implement dir_entry_ops portion of item plugin ++ * interface (see plugin/item/item.h). Hashed directory interacts with ++ * directory item plugin exclusively through dir_entry_ops operations. ++ * ++ * Currently there are two implementations of directory items: "simple ++ * directory item" (plugin/item/sde.[ch]), and "compound directory item" ++ * (plugin/item/cde.[ch]) with the latter being the default. ++ * ++ * There is, however some delicate way through which directory code interferes ++ * with item plugin: key assignment policy. A key for a directory item is ++ * chosen by directory code, and as described in kassign.c, this key contains ++ * a portion of file name. Directory item uses this knowledge to avoid storing ++ * this portion of file name twice: in the key and in the directory item body. ++ * ++ */ ++ ++#include "../../inode.h" ++ ++void complete_entry_key(const struct inode *, const char *name, ++ int len, reiser4_key * result); ++ ++/* this is implementation of build_entry_key method of dir ++ plugin for HASHED_DIR_PLUGIN_ID ++ */ ++void build_entry_key_hashed(const struct inode *dir, /* directory where entry is ++ * (or will be) in.*/ ++ const struct qstr *qname, /* name of file referenced ++ * by this entry */ ++ reiser4_key * result /* resulting key of directory ++ * entry */ ) ++{ ++ const char *name; ++ int len; ++ ++ assert("nikita-1139", dir != NULL); ++ assert("nikita-1140", qname != NULL); ++ assert("nikita-1141", qname->name != NULL); ++ assert("nikita-1142", result != NULL); ++ ++ name = qname->name; ++ len = qname->len; ++ ++ assert("nikita-2867", strlen(name) == len); ++ ++ reiser4_key_init(result); ++ /* locality of directory entry's key is objectid of parent ++ directory */ ++ set_key_locality(result, get_inode_oid(dir)); ++ /* minor packing locality is constant */ ++ set_key_type(result, KEY_FILE_NAME_MINOR); ++ /* dot is special case---we always want it to be first entry in ++ a directory. Actually, we just want to have smallest ++ directory entry. 
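++	   Returning early for "." skips complete_entry_key(), leaving the
++	   name-dependent part of the key at its initial, lowest value.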
++ */ ++ if (len == 1 && name[0] == '.') ++ return; ++ ++ /* initialize part of entry key which depends on file name */ ++ complete_entry_key(dir, name, len, result); ++} ++ ++/* Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.33/fs/reiser4/plugin/dir/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/dir/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_REISER4_FS) += dir_plugins.o ++ ++dir_plugins-objs := \ ++ hashed_dir.o \ ++ seekable_dir.o +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.33/fs/reiser4/plugin/dir/seekable_dir.c +--- linux-2.6.33.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/dir/seekable_dir.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,46 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "../../inode.h" ++ ++/* this is implementation of build_entry_key method of dir ++ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID ++ This is for directories where we want repeatable and restartable readdir() ++ even in case 32bit user level struct dirent (readdir(3)). ++*/ ++void ++build_entry_key_seekable(const struct inode *dir, const struct qstr *name, ++ reiser4_key * result) ++{ ++ oid_t objectid; ++ ++ assert("nikita-2283", dir != NULL); ++ assert("nikita-2284", name != NULL); ++ assert("nikita-2285", name->name != NULL); ++ assert("nikita-2286", result != NULL); ++ ++ reiser4_key_init(result); ++ /* locality of directory entry's key is objectid of parent ++ directory */ ++ set_key_locality(result, get_inode_oid(dir)); ++ /* minor packing locality is constant */ ++ set_key_type(result, KEY_FILE_NAME_MINOR); ++ /* dot is special case---we always want it to be first entry in ++ a directory. Actually, we just want to have smallest ++ directory entry. ++ */ ++ if ((name->len == 1) && (name->name[0] == '.')) ++ return; ++ ++ /* objectid of key is 31 lowest bits of hash. */ ++ objectid = ++ inode_hash_plugin(dir)->hash(name->name, ++ (int)name->len) & 0x7fffffff; ++ ++ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK)); ++ set_key_objectid(result, objectid); ++ ++ /* offset is always 0. 
*/
++	set_key_offset(result, (__u64) 0);
++	return;
++}
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.33/fs/reiser4/plugin/dir_plugin_common.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/dir_plugin_common.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/dir_plugin_common.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,865 @@
++/* Copyright 2005 by Hans Reiser, licensing governed by
++   reiser4/README */
++
++/* this file contains typical implementations for most of methods of
++   directory plugin
++*/
++
++#include "../inode.h"
++
++int reiser4_find_entry(struct inode *dir, struct dentry *name,
++		       lock_handle * , znode_lock_mode, reiser4_dir_entry_desc *);
++int reiser4_lookup_name(struct inode *parent, struct dentry *dentry,
++			reiser4_key * key);
++void check_light_weight(struct inode *inode, struct inode *parent);
++
++/* this is common implementation of get_parent method of dir plugin
++   this is used by NFS kernel server to "climb" up directory tree to
++   check permissions
++ */
++struct dentry *get_parent_common(struct inode *child)
++{
++	struct super_block *s;
++	struct inode *parent;
++	struct dentry dotdot;
++	struct dentry *dentry;
++	reiser4_key key;
++	int result;
++
++	/*
++	 * lookup dotdot entry.
++	 */
++
++	s = child->i_sb;
++	memset(&dotdot, 0, sizeof(dotdot));
++	dotdot.d_name.name = "..";
++	dotdot.d_name.len = 2;
++	dotdot.d_op = &get_super_private(s)->ops.dentry;
++
++	result = reiser4_lookup_name(child, &dotdot, &key);
++	if (result != 0)
++		return ERR_PTR(result);
++
++	parent = reiser4_iget(s, &key, 1);
++	if (!IS_ERR(parent)) {
++		/*
++		 * FIXME-NIKITA dubious: attributes are inherited from @child
++		 * to @parent. But:
++		 *
++		 * (*) this is the only thing we can do
++		 *
++		 * (*) attributes of light-weight object are inherited
++		 * from a parent through which object was looked up first,
++		 * so it is ambiguous anyway.
++		 *
++		 */
++		check_light_weight(parent, child);
++		reiser4_iget_complete(parent);
++		dentry = d_obtain_alias(parent);
++		if (!IS_ERR(dentry))
++			dentry->d_op = &get_super_private(s)->ops.dentry;
++	} else if (PTR_ERR(parent) == -ENOENT)
++		dentry = ERR_PTR(RETERR(-ESTALE));
++	else
++		dentry = (void *)parent;
++	return dentry;
++}
++
++/* this is common implementation of is_name_acceptable method of dir
++   plugin
++ */
++int is_name_acceptable_common(const struct inode *inode,	/* directory to check*/
++			      const char *name UNUSED_ARG,	/* name to check */
++			      int len/* @name's length */)
++{
++	assert("nikita-733", inode != NULL);
++	assert("nikita-734", name != NULL);
++	assert("nikita-735", len > 0);
++
++	return len <= reiser4_max_filename_len(inode);
++}
++
++/* there is no common implementation of build_entry_key method of dir
++   plugin. 
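++   Callers always go through the plugin operation instead, as
++   reiser4_find_entry() does below:
++
++	inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
++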
See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
++   plugin/dir/seekable_dir.c:build_entry_key_seekable() for example
++*/
++
++/* this is common implementation of build_readdir_key method of dir
++   plugin
++   see reiser4_readdir_common for more details
++*/
++int build_readdir_key_common(struct file *dir /* directory being read */ ,
++			     reiser4_key * result/* where to store key */)
++{
++	reiser4_file_fsdata *fdata;
++	struct inode *inode;
++
++	assert("nikita-1361", dir != NULL);
++	assert("nikita-1362", result != NULL);
++	assert("nikita-1363", dir->f_dentry != NULL);
++	inode = dir->f_dentry->d_inode;
++	assert("nikita-1373", inode != NULL);
++
++	fdata = reiser4_get_file_fsdata(dir);
++	if (IS_ERR(fdata))
++		return PTR_ERR(fdata);
++	assert("nikita-1364", fdata != NULL);
++	return extract_key_from_de_id(get_inode_oid(inode),
++				      &fdata->dir.readdir.position.
++				      dir_entry_key, result);
++
++}
++
++void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
++			     int adj);
++
++/* this is common implementation of add_entry method of dir plugin
++*/
++int reiser4_add_entry_common(struct inode *object,	/* directory to add new name
++							 * in */
++			     struct dentry *where,	/* new name */
++			     reiser4_object_create_data * data,	/* parameters of
++								 * new object */
++			     reiser4_dir_entry_desc * entry	/* parameters of
++								 * new directory
++								 * entry */)
++{
++	int result;
++	coord_t *coord;
++	lock_handle lh;
++	struct reiser4_dentry_fsdata *fsdata;
++	reiser4_block_nr reserve;
++
++	assert("nikita-1114", object != NULL);
++	assert("nikita-1250", where != NULL);
++
++	fsdata = reiser4_get_dentry_fsdata(where);
++	if (unlikely(IS_ERR(fsdata)))
++		return PTR_ERR(fsdata);
++
++	reserve = inode_dir_plugin(object)->estimate.add_entry(object);
++	if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
++		return RETERR(-ENOSPC);
++
++	init_lh(&lh);
++	coord = &fsdata->dec.entry_coord;
++	coord_clear_iplug(coord);
++
++	/* check for this entry in a directory. This is plugin method. */
++	result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
++				    entry);
++	if (likely(result == -ENOENT)) {
++		/* add new entry. Just pass control to the directory
++		   item plugin. */
++		assert("nikita-1709", inode_dir_item_plugin(object));
++		assert("nikita-2230", coord->node == lh.node);
++		reiser4_seal_done(&fsdata->dec.entry_seal);
++		result =
++		    inode_dir_item_plugin(object)->s.dir.add_entry(object,
++								   coord, &lh,
++								   where,
++								   entry);
++		if (result == 0) {
++			reiser4_adjust_dir_file(object, where,
++						fsdata->dec.pos + 1, +1);
++			INODE_INC_FIELD(object, i_size);
++		}
++	} else if (result == 0) {
++		assert("nikita-2232", coord->node == lh.node);
++		result = RETERR(-EEXIST);
++	}
++	done_lh(&lh);
++
++	return result;
++}
++
++/**
++ * rem_entry - remove entry from directory item
++ * @dir: directory the entry is removed from
++ * @dentry: name that is being removed
++ * @entry: description of the entry being removed
++ * @coord: coordinate of the entry in the tree
++ * @lh: lock handle held on @coord's node
++ *
++ * Checks that coordinate @coord is set properly and calls item plugin
++ * method to cut entry. 
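++ * Under REISER4_DEBUG the key stored in the entry is additionally
++ * cross-checked against @dentry->d_inode, so a stale coordinate fails
++ * with -EIO instead of cutting an unrelated entry.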
++ */ ++static int ++rem_entry(struct inode *dir, struct dentry *dentry, ++ reiser4_dir_entry_desc * entry, coord_t *coord, lock_handle * lh) ++{ ++ item_plugin *iplug; ++ struct inode *child; ++ ++ iplug = inode_dir_item_plugin(dir); ++ child = dentry->d_inode; ++ assert("nikita-3399", child != NULL); ++ ++ /* check that we are really destroying an entry for @child */ ++ if (REISER4_DEBUG) { ++ int result; ++ reiser4_key key; ++ ++ result = iplug->s.dir.extract_key(coord, &key); ++ if (result != 0) ++ return result; ++ if (get_key_objectid(&key) != get_inode_oid(child)) { ++ warning("nikita-3397", ++ "rem_entry: %#llx != %#llx\n", ++ get_key_objectid(&key), ++ (unsigned long long)get_inode_oid(child)); ++ return RETERR(-EIO); ++ } ++ } ++ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry); ++} ++ ++/** ++ * reiser4_rem_entry_common - remove entry from a directory ++ * @dir: directory to remove entry from ++ * @where: name that is being removed ++ * @entry: description of entry being removed ++ * ++ * This is common implementation of rem_entry method of dir plugin. ++ */ ++int reiser4_rem_entry_common(struct inode *dir, ++ struct dentry *dentry, ++ reiser4_dir_entry_desc * entry) ++{ ++ int result; ++ coord_t *coord; ++ lock_handle lh; ++ struct reiser4_dentry_fsdata *fsdata; ++ __u64 tograb; ++ ++ assert("nikita-1124", dir != NULL); ++ assert("nikita-1125", dentry != NULL); ++ ++ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir); ++ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED); ++ if (result != 0) ++ return RETERR(-ENOSPC); ++ ++ init_lh(&lh); ++ ++ /* check for this entry in a directory. This is plugin method. */ ++ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry); ++ fsdata = reiser4_get_dentry_fsdata(dentry); ++ if (IS_ERR(fsdata)) { ++ done_lh(&lh); ++ return PTR_ERR(fsdata); ++ } ++ ++ coord = &fsdata->dec.entry_coord; ++ ++ assert("nikita-3404", ++ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) || ++ dir->i_size <= 1); ++ ++ coord_clear_iplug(coord); ++ if (result == 0) { ++ /* remove entry. Just pass control to the directory item ++ plugin. */ ++ assert("vs-542", inode_dir_item_plugin(dir)); ++ reiser4_seal_done(&fsdata->dec.entry_seal); ++ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1); ++ result = ++ WITH_COORD(coord, ++ rem_entry(dir, dentry, entry, coord, &lh)); ++ if (result == 0) { ++ if (dir->i_size >= 1) ++ INODE_DEC_FIELD(dir, i_size); ++ else { ++ warning("nikita-2509", "Dir %llu is runt", ++ (unsigned long long) ++ get_inode_oid(dir)); ++ result = RETERR(-EIO); ++ } ++ ++ assert("nikita-3405", dentry->d_inode->i_nlink != 1 || ++ dentry->d_inode->i_size != 2 || ++ inode_dir_plugin(dentry->d_inode) == NULL); ++ } ++ } ++ done_lh(&lh); ++ ++ return result; ++} ++ ++static reiser4_block_nr estimate_init(struct inode *parent, ++ struct inode *object); ++static int create_dot_dotdot(struct inode *object, struct inode *parent); ++ ++/* this is common implementation of init method of dir plugin ++ create "." and ".." 
entries ++*/ ++int reiser4_dir_init_common(struct inode *object, /* new directory */ ++ struct inode *parent, /* parent directory */ ++ reiser4_object_create_data * data /* info passed ++ * to us, this ++ * is filled by ++ * reiser4() ++ * syscall in ++ * particular */) ++{ ++ reiser4_block_nr reserve; ++ ++ assert("nikita-680", object != NULL); ++ assert("nikita-681", S_ISDIR(object->i_mode)); ++ assert("nikita-682", parent != NULL); ++ assert("nikita-684", data != NULL); ++ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID); ++ assert("nikita-687", object->i_mode & S_IFDIR); ++ ++ reserve = estimate_init(parent, object); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ ++ return create_dot_dotdot(object, parent); ++} ++ ++/* this is common implementation of done method of dir plugin ++ remove "." entry ++*/ ++int reiser4_dir_done_common(struct inode *object/* object being deleted */) ++{ ++ int result; ++ reiser4_block_nr reserve; ++ struct dentry goodby_dots; ++ reiser4_dir_entry_desc entry; ++ ++ assert("nikita-1449", object != NULL); ++ ++ if (reiser4_inode_get_flag(object, REISER4_NO_SD)) ++ return 0; ++ ++ /* of course, this can be rewritten to sweep everything in one ++ reiser4_cut_tree(). */ ++ memset(&entry, 0, sizeof entry); ++ ++ /* FIXME: this done method is called from reiser4_delete_dir_common ++ * which reserved space already */ ++ reserve = inode_dir_plugin(object)->estimate.rem_entry(object); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED)) ++ return RETERR(-ENOSPC); ++ ++ memset(&goodby_dots, 0, sizeof goodby_dots); ++ entry.obj = goodby_dots.d_inode = object; ++ goodby_dots.d_name.name = "."; ++ goodby_dots.d_name.len = 1; ++ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); ++ reiser4_free_dentry_fsdata(&goodby_dots); ++ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT)) ++ warning("nikita-2252", "Cannot remove dot of %lli: %i", ++ (unsigned long long)get_inode_oid(object), result); ++ return 0; ++} ++ ++/* this is common implementation of attach method of dir plugin ++*/ ++int reiser4_attach_common(struct inode *child UNUSED_ARG, ++ struct inode *parent UNUSED_ARG) ++{ ++ assert("nikita-2647", child != NULL); ++ assert("nikita-2648", parent != NULL); ++ ++ return 0; ++} ++ ++/* this is common implementation of detach method of dir plugin ++ remove "..", decrease nlink on parent ++*/ ++int reiser4_detach_common(struct inode *object, struct inode *parent) ++{ ++ int result; ++ struct dentry goodby_dots; ++ reiser4_dir_entry_desc entry; ++ ++ assert("nikita-2885", object != NULL); ++ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD)); ++ ++ memset(&entry, 0, sizeof entry); ++ ++ /* NOTE-NIKITA this only works if @parent is -the- parent of ++ @object, viz. object whose key is stored in dotdot ++ entry. Wouldn't work with hard-links on directories. */ ++ memset(&goodby_dots, 0, sizeof goodby_dots); ++ entry.obj = goodby_dots.d_inode = parent; ++ goodby_dots.d_name.name = ".."; ++ goodby_dots.d_name.len = 2; ++ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); ++ reiser4_free_dentry_fsdata(&goodby_dots); ++ if (result == 0) { ++ /* the dot should be the only entry remaining at this time... */ ++ assert("nikita-3400", ++ object->i_size == 1 && object->i_nlink <= 2); ++#if 0 ++ /* and, together with the only name directory can have, they ++ * provides for the last 2 remaining references. 
If we get ++ * here as part of error handling during mkdir, @object ++ * possibly has no name yet, so its nlink == 1. If we get here ++ * from rename (targeting empty directory), it has no name ++ * already, so its nlink == 1. */ ++ assert("nikita-3401", ++ object->i_nlink == 2 || object->i_nlink == 1); ++#endif ++ ++ /* decrement nlink of directory removed ".." pointed ++ to */ ++ reiser4_del_nlink(parent, NULL, 0); ++ } ++ return result; ++} ++ ++/* this is common implementation of estimate.add_entry method of ++ dir plugin ++ estimation of adding entry which supposes that entry is inserting a ++ unit into item ++*/ ++reiser4_block_nr estimate_add_entry_common(const struct inode *inode) ++{ ++ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); ++} ++ ++/* this is common implementation of estimate.rem_entry method of dir ++ plugin ++*/ ++reiser4_block_nr estimate_rem_entry_common(const struct inode *inode) ++{ ++ return estimate_one_item_removal(reiser4_tree_by_inode(inode)); ++} ++ ++/* this is common implementation of estimate.unlink method of dir ++ plugin ++*/ ++reiser4_block_nr ++dir_estimate_unlink_common(const struct inode *parent, ++ const struct inode *object) ++{ ++ reiser4_block_nr res; ++ ++ /* hashed_rem_entry(object) */ ++ res = inode_dir_plugin(object)->estimate.rem_entry(object); ++ /* del_nlink(parent) */ ++ res += 2 * inode_file_plugin(parent)->estimate.update(parent); ++ ++ return res; ++} ++ ++/* ++ * helper for inode_ops ->lookup() and dir plugin's ->get_parent() ++ * methods: if @inode is a light-weight file, setup its credentials ++ * that are not stored in the stat-data in this case ++ */ ++void check_light_weight(struct inode *inode, struct inode *parent) ++{ ++ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) { ++ inode->i_uid = parent->i_uid; ++ inode->i_gid = parent->i_gid; ++ /* clear light-weight flag. If inode would be read by any ++ other name, [ug]id wouldn't change. */ ++ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT); ++ } ++} ++ ++/* looks for name specified in @dentry in directory @parent and if name is ++ found - key of object found entry points to is stored in @entry->key */ ++int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup ++ * for name in */ ++ struct dentry *dentry, /* name to look for */ ++ reiser4_key * key/* place to store key */) ++{ ++ int result; ++ coord_t *coord; ++ lock_handle lh; ++ const char *name; ++ int len; ++ reiser4_dir_entry_desc entry; ++ struct reiser4_dentry_fsdata *fsdata; ++ ++ assert("nikita-1247", parent != NULL); ++ assert("nikita-1248", dentry != NULL); ++ assert("nikita-1123", dentry->d_name.name != NULL); ++ assert("vs-1486", ++ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry); ++ ++ name = dentry->d_name.name; ++ len = dentry->d_name.len; ++ ++ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len)) ++ /* some arbitrary error code to return */ ++ return RETERR(-ENAMETOOLONG); ++ ++ fsdata = reiser4_get_dentry_fsdata(dentry); ++ if (IS_ERR(fsdata)) ++ return PTR_ERR(fsdata); ++ ++ coord = &fsdata->dec.entry_coord; ++ coord_clear_iplug(coord); ++ init_lh(&lh); ++ ++ /* find entry in a directory. This is plugin method. */ ++ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, ++ &entry); ++ if (result == 0) { ++ /* entry was found, extract object key from it. */ ++ result = ++ WITH_COORD(coord, ++ item_plugin_by_coord(coord)->s.dir. 
++					   extract_key(coord, key));
++	}
++	done_lh(&lh);
++	return result;
++
++}
++
++/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
++static reiser4_block_nr
++estimate_init(struct inode *parent, struct inode *object)
++{
++	reiser4_block_nr res = 0;
++
++	assert("vpf-321", parent != NULL);
++	assert("vpf-322", object != NULL);
++
++	/* hashed_add_entry(object) */
++	res += inode_dir_plugin(object)->estimate.add_entry(object);
++	/* reiser4_add_nlink(object) */
++	res += inode_file_plugin(object)->estimate.update(object);
++	/* hashed_add_entry(object) */
++	res += inode_dir_plugin(object)->estimate.add_entry(object);
++	/* reiser4_add_nlink(parent) */
++	res += inode_file_plugin(parent)->estimate.update(parent);
++
++	return res;
++}
++
++/* helper function for reiser4_dir_init_common(). Create "." and ".." */
++static int create_dot_dotdot(struct inode *object/* object to create dot and
++						   * dotdot for */ ,
++			     struct inode *parent/* parent of @object */)
++{
++	int result;
++	struct dentry dots_entry;
++	reiser4_dir_entry_desc entry;
++
++	assert("nikita-688", object != NULL);
++	assert("nikita-689", S_ISDIR(object->i_mode));
++	assert("nikita-691", parent != NULL);
++
++	/* We store dot and dotdot as normal directory entries. This is
++	   not necessary, because almost all information stored in them
++	   is already in the stat-data of directory, the only thing
++	   being missed is objectid of grand-parent directory that can
++	   easily be added there as extension.
++
++	   But it is done the way it is done, because not storing dot
++	   and dotdot will lead to the following complications:
++
++	   . special case handling in ->lookup().
++	   . addition of another extension to the sd.
++	   . dependency on key allocation policy for stat data.
++
++	 */
++
++	memset(&entry, 0, sizeof entry);
++	memset(&dots_entry, 0, sizeof dots_entry);
++	entry.obj = dots_entry.d_inode = object;
++	dots_entry.d_name.name = ".";
++	dots_entry.d_name.len = 1;
++	result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
++	reiser4_free_dentry_fsdata(&dots_entry);
++
++	if (result == 0) {
++		result = reiser4_add_nlink(object, object, 0);
++		if (result == 0) {
++			entry.obj = dots_entry.d_inode = parent;
++			dots_entry.d_name.name = "..";
++			dots_entry.d_name.len = 2;
++			result = reiser4_add_entry_common(object,
++						  &dots_entry, NULL, &entry);
++			reiser4_free_dentry_fsdata(&dots_entry);
++			/* if creation of ".." failed, iput() will delete
++			   object with ".". */
++			if (result == 0) {
++				result = reiser4_add_nlink(parent, object, 0);
++				if (result != 0)
++					/*
++					 * if we failed to bump i_nlink, try
++					 * to remove ".."
++					 */
++					reiser4_detach_common(object, parent);
++			}
++		}
++	}
++
++	if (result != 0) {
++		/*
++		 * in the case of error, at least update stat-data so that
++		 * ->i_nlink updates are not lingering.
++		 */
++		reiser4_update_sd(object);
++		reiser4_update_sd(parent);
++	}
++
++	return result;
++}
++
++/*
++ * return 0 iff @coord contains a directory entry for the file with the name
++ * @name. 
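++ * A return of 1 (from the !!strcmp() below) means the item is valid but
++ * holds a different name; a negative return value is an error.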
++ */ ++static int ++check_item(const struct inode *dir, const coord_t *coord, const char *name) ++{ ++ item_plugin *iplug; ++ char buf[DE_NAME_BUF_LEN]; ++ ++ iplug = item_plugin_by_coord(coord); ++ if (iplug == NULL) { ++ warning("nikita-1135", "Cannot get item plugin"); ++ print_coord("coord", coord, 1); ++ return RETERR(-EIO); ++ } else if (item_id_by_coord(coord) != ++ item_id_by_plugin(inode_dir_item_plugin(dir))) { ++ /* item id of current item does not match to id of items a ++ directory is built of */ ++ warning("nikita-1136", "Wrong item plugin"); ++ print_coord("coord", coord, 1); ++ return RETERR(-EIO); ++ } ++ assert("nikita-1137", iplug->s.dir.extract_name); ++ ++ /* Compare name stored in this entry with name we are looking for. ++ ++ NOTE-NIKITA Here should go code for support of something like ++ unicode, code tables, etc. ++ */ ++ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf)); ++} ++ ++static int ++check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name) ++{ ++ return WITH_COORD(coord, check_item(dir, coord, name->name)); ++} ++ ++/* ++ * argument package used by entry_actor to scan entries with identical keys. ++ */ ++struct entry_actor_args { ++ /* name we are looking for */ ++ const char *name; ++ /* key of directory entry. entry_actor() scans through sequence of ++ * items/units having the same key */ ++ reiser4_key *key; ++ /* how many entries with duplicate key was scanned so far. */ ++ int non_uniq; ++#if REISER4_USE_COLLISION_LIMIT ++ /* scan limit */ ++ int max_non_uniq; ++#endif ++ /* return parameter: set to true, if ->name wasn't found */ ++ int not_found; ++ /* what type of lock to take when moving to the next node during ++ * scan */ ++ znode_lock_mode mode; ++ ++ /* last coord that was visited during scan */ ++ coord_t last_coord; ++ /* last node locked during scan */ ++ lock_handle last_lh; ++ /* inode of directory */ ++ const struct inode *inode; ++}; ++ ++/* Function called by reiser4_find_entry() to look for given name ++ in the directory. */ ++static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ , ++ coord_t *coord /* current coord */ , ++ lock_handle * lh /* current lock handle */ , ++ void *entry_actor_arg/* argument to scan */) ++{ ++ reiser4_key unit_key; ++ struct entry_actor_args *args; ++ ++ assert("nikita-1131", tree != NULL); ++ assert("nikita-1132", coord != NULL); ++ assert("nikita-1133", entry_actor_arg != NULL); ++ ++ args = entry_actor_arg; ++ ++args->non_uniq; ++#if REISER4_USE_COLLISION_LIMIT ++ if (args->non_uniq > args->max_non_uniq) { ++ args->not_found = 1; ++ /* hash collision overflow. */ ++ return RETERR(-EBUSY); ++ } ++#endif ++ ++ /* ++ * did we just reach the end of the sequence of items/units with ++ * identical keys? ++ */ ++ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) { ++ assert("nikita-1791", ++ keylt(args->key, unit_key_by_coord(coord, &unit_key))); ++ args->not_found = 1; ++ args->last_coord.between = AFTER_UNIT; ++ return 0; ++ } ++ ++ coord_dup(&args->last_coord, coord); ++ /* ++ * did scan just moved to the next node? 
++ */ ++ if (args->last_lh.node != lh->node) { ++ int lock_result; ++ ++ /* ++ * if so, lock new node with the mode requested by the caller ++ */ ++ done_lh(&args->last_lh); ++ assert("nikita-1896", znode_is_any_locked(lh->node)); ++ lock_result = longterm_lock_znode(&args->last_lh, lh->node, ++ args->mode, ZNODE_LOCK_HIPRI); ++ if (lock_result != 0) ++ return lock_result; ++ } ++ return check_item(args->inode, coord, args->name); ++} ++ ++/* Look for given @name within directory @dir. ++ ++ This is called during lookup, creation and removal of directory ++ entries and on reiser4_rename_common ++ ++ First calculate key that directory entry for @name would have. Search ++ for this key in the tree. If such key is found, scan all items with ++ the same key, checking name in each directory entry along the way. ++*/ ++int reiser4_find_entry(struct inode *dir, /* directory to scan */ ++ struct dentry *de, /* name to search for */ ++ lock_handle * lh, /* resulting lock handle */ ++ znode_lock_mode mode, /* required lock mode */ ++ reiser4_dir_entry_desc * entry /* parameters of found ++ directory entry */) ++{ ++ const struct qstr *name; ++ seal_t *seal; ++ coord_t *coord; ++ int result; ++ __u32 flags; ++ struct de_location *dec; ++ struct reiser4_dentry_fsdata *fsdata; ++ ++ assert("nikita-1130", lh != NULL); ++ assert("nikita-1128", dir != NULL); ++ ++ name = &de->d_name; ++ assert("nikita-1129", name != NULL); ++ ++ /* dentry private data don't require lock, because dentry ++ manipulations are protected by i_mutex on parent. ++ ++ This is not so for inodes, because there is no -the- parent in ++ inode case. ++ */ ++ fsdata = reiser4_get_dentry_fsdata(de); ++ if (IS_ERR(fsdata)) ++ return PTR_ERR(fsdata); ++ dec = &fsdata->dec; ++ ++ coord = &dec->entry_coord; ++ coord_clear_iplug(coord); ++ seal = &dec->entry_seal; ++ /* compose key of directory entry for @name */ ++ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key); ++ ++ if (reiser4_seal_is_set(seal)) { ++ /* check seal */ ++ result = reiser4_seal_validate(seal, coord, &entry->key, ++ lh, mode, ZNODE_LOCK_LOPRI); ++ if (result == 0) { ++ /* key was found. Check that it is really item we are ++ looking for. */ ++ result = check_entry(dir, coord, name); ++ if (result == 0) ++ return 0; ++ } ++ } ++ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; ++ /* ++ * find place in the tree where directory item should be located. ++ */ ++ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode, ++ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, ++ flags, NULL/*ra_info */); ++ if (result == CBK_COORD_FOUND) { ++ struct entry_actor_args arg; ++ ++ /* fast path: no hash collisions */ ++ result = check_entry(dir, coord, name); ++ if (result == 0) { ++ reiser4_seal_init(seal, coord, &entry->key); ++ dec->pos = 0; ++ } else if (result > 0) { ++ /* Iterate through all units with the same keys. */ ++ arg.name = name->name; ++ arg.key = &entry->key; ++ arg.not_found = 0; ++ arg.non_uniq = 0; ++#if REISER4_USE_COLLISION_LIMIT ++ arg.max_non_uniq = max_hash_collisions(dir); ++ assert("nikita-2851", arg.max_non_uniq > 1); ++#endif ++ arg.mode = mode; ++ arg.inode = dir; ++ coord_init_zero(&arg.last_coord); ++ init_lh(&arg.last_lh); ++ ++ result = reiser4_iterate_tree ++ (reiser4_tree_by_inode(dir), ++ coord, lh, ++ entry_actor, &arg, mode, 1); ++ /* if end of the tree or extent was reached during ++ scanning. 
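++			   The caller still gets a usable coordinate in that
++			   case: the code below steps back to the last
++			   position visited and reports -ENOENT from there.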
*/ ++ if (arg.not_found || (result == -E_NO_NEIGHBOR)) { ++ /* step back */ ++ done_lh(lh); ++ ++ result = zload(arg.last_coord.node); ++ if (result == 0) { ++ coord_clear_iplug(&arg.last_coord); ++ coord_dup(coord, &arg.last_coord); ++ move_lh(lh, &arg.last_lh); ++ result = RETERR(-ENOENT); ++ zrelse(arg.last_coord.node); ++ --arg.non_uniq; ++ } ++ } ++ ++ done_lh(&arg.last_lh); ++ if (result == 0) ++ reiser4_seal_init(seal, coord, &entry->key); ++ ++ if (result == 0 || result == -ENOENT) { ++ assert("nikita-2580", arg.non_uniq > 0); ++ dec->pos = arg.non_uniq - 1; ++ } ++ } ++ } else ++ dec->pos = -1; ++ return result; ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.c +--- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,655 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../key.h" ++#include "../node/node.h" ++#include "../space/space_allocator.h" ++#include "disk_format40.h" ++#include "../plugin.h" ++#include "../../txnmgr.h" ++#include "../../jnode.h" ++#include "../../tree.h" ++#include "../../super.h" ++#include "../../wander.h" ++#include "../../inode.h" ++#include "../../ktxnmgrd.h" ++#include "../../status_flags.h" ++ ++#include <linux/types.h> /* for __u?? */ ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/buffer_head.h> ++ ++/* reiser 4.0 default disk layout */ ++ ++/* Amount of free blocks needed to perform release_format40 when fs gets ++ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header ++ & tx record. 
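++   That is 1 + 1 + 2 = 4 blocks, matching RELEASE_RESERVED below.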
*/ ++#define RELEASE_RESERVED 4 ++ ++/* The greatest supported format40 version number */ ++#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION ++ ++/* This flag indicates that backup should be updated ++ (the update is performed by fsck) */ ++#define FORMAT40_UPDATE_BACKUP (1 << 31) ++ ++/* functions to access fields of format40_disk_super_block */ ++static __u64 get_format40_block_count(const format40_disk_super_block * sb) ++{ ++ return le64_to_cpu(get_unaligned(&sb->block_count)); ++} ++ ++static __u64 get_format40_free_blocks(const format40_disk_super_block * sb) ++{ ++ return le64_to_cpu(get_unaligned(&sb->free_blocks)); ++} ++ ++static __u64 get_format40_root_block(const format40_disk_super_block * sb) ++{ ++ return le64_to_cpu(get_unaligned(&sb->root_block)); ++} ++ ++static __u16 get_format40_tree_height(const format40_disk_super_block * sb) ++{ ++ return le16_to_cpu(get_unaligned(&sb->tree_height)); ++} ++ ++static __u64 get_format40_file_count(const format40_disk_super_block * sb) ++{ ++ return le64_to_cpu(get_unaligned(&sb->file_count)); ++} ++ ++static __u64 get_format40_oid(const format40_disk_super_block * sb) ++{ ++ return le64_to_cpu(get_unaligned(&sb->oid)); ++} ++ ++static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb) ++{ ++ return le32_to_cpu(get_unaligned(&sb->mkfs_id)); ++} ++ ++static __u64 get_format40_flags(const format40_disk_super_block * sb) ++{ ++ return le64_to_cpu(get_unaligned(&sb->flags)); ++} ++ ++static __u32 get_format40_version(const format40_disk_super_block * sb) ++{ ++ return le32_to_cpu(get_unaligned(&sb->version)) & ++ ~FORMAT40_UPDATE_BACKUP; ++} ++ ++static int update_backup_version(const format40_disk_super_block * sb) ++{ ++ return (le32_to_cpu(get_unaligned(&sb->version)) & ++ FORMAT40_UPDATE_BACKUP); ++} ++ ++static int update_disk_version(const format40_disk_super_block * sb) ++{ ++ return (get_format40_version(sb) < FORMAT40_VERSION); ++} ++ ++static int incomplete_compatibility(const format40_disk_super_block * sb) ++{ ++ return (get_format40_version(sb) > FORMAT40_VERSION); ++} ++ ++static format40_super_info *get_sb_info(struct super_block *super) ++{ ++ return &get_super_private(super)->u.format40; ++} ++ ++static int consult_diskmap(struct super_block *s) ++{ ++ format40_super_info *info; ++ journal_location *jloc; ++ ++ info = get_sb_info(s); ++ jloc = &get_super_private(s)->jloc; ++ /* Default format-specific locations, if there is nothing in ++ * diskmap */ ++ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR; ++ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR; ++ info->loc.super = FORMAT40_OFFSET / s->s_blocksize; ++#ifdef CONFIG_REISER4_BADBLOCKS ++ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF, ++ &jloc->footer); ++ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH, ++ &jloc->header); ++ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER, ++ &info->loc.super); ++#endif ++ return 0; ++} ++ ++/* find any valid super block of disk_format40 (even if the first ++ super block is destroyed), will change block numbers of actual journal header/footer (jf/jh) ++ if needed */ ++static struct buffer_head *find_a_disk_format40_super_block(struct super_block ++ *s) ++{ ++ struct buffer_head *super_bh; ++ format40_disk_super_block *disk_sb; ++ format40_super_info *info; ++ ++ assert("umka-487", s != NULL); ++ ++ info = get_sb_info(s); ++ ++ super_bh = sb_bread(s, info->loc.super); ++ if (super_bh == NULL) ++ return ERR_PTR(RETERR(-EIO)); ++ ++ disk_sb = (format40_disk_super_block 
*) super_bh->b_data; ++ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) { ++ brelse(super_bh); ++ return ERR_PTR(RETERR(-EINVAL)); ++ } ++ ++ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count))); ++ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) - ++ le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); ++ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); ++ ++ return super_bh; ++} ++ ++/* find the most recent version of super block. This is called after journal is ++ replayed */ ++static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG) ++{ ++ /* Here the most recent superblock copy has to be read. However, as ++ journal replay isn't complete, we are using ++ find_a_disk_format40_super_block() function. */ ++ return find_a_disk_format40_super_block(s); ++} ++ ++static int get_super_jnode(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ jnode *sb_jnode; ++ int ret; ++ ++ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super); ++ ++ ret = jload(sb_jnode); ++ ++ if (ret) { ++ reiser4_drop_io_head(sb_jnode); ++ return ret; ++ } ++ ++ pin_jnode_data(sb_jnode); ++ jrelse(sb_jnode); ++ ++ sbinfo->u.format40.sb_jnode = sb_jnode; ++ ++ return 0; ++} ++ ++static void done_super_jnode(struct super_block *s) ++{ ++ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode; ++ ++ if (sb_jnode) { ++ unpin_jnode_data(sb_jnode); ++ reiser4_drop_io_head(sb_jnode); ++ } ++} ++ ++typedef enum format40_init_stage { ++ NONE_DONE = 0, ++ CONSULT_DISKMAP, ++ FIND_A_SUPER, ++ INIT_JOURNAL_INFO, ++ INIT_STATUS, ++ JOURNAL_REPLAY, ++ READ_SUPER, ++ KEY_CHECK, ++ INIT_OID, ++ INIT_TREE, ++ JOURNAL_RECOVER, ++ INIT_SA, ++ INIT_JNODE, ++ ALL_DONE ++} format40_init_stage; ++ ++static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh) ++{ ++ format40_disk_super_block *sb_copy; ++ ++ sb_copy = kmalloc(sizeof(format40_disk_super_block), ++ reiser4_ctx_gfp_mask_get()); ++ if (sb_copy == NULL) ++ return ERR_PTR(RETERR(-ENOMEM)); ++ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data), ++ sizeof(format40_disk_super_block)); ++ return sb_copy; ++} ++ ++static int check_key_format(const format40_disk_super_block *sb_copy) ++{ ++ if (!equi(REISER4_LARGE_KEY, ++ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) { ++ warning("nikita-3228", "Key format mismatch. " ++ "Only %s keys are supported.", ++ REISER4_LARGE_KEY ? 
"large" : "small"); ++ return RETERR(-EINVAL); ++ } ++ return 0; ++} ++ ++/** ++ * try_init_format40 ++ * @super: ++ * @stage: ++ * ++ */ ++static int try_init_format40(struct super_block *super, ++ format40_init_stage *stage) ++{ ++ int result; ++ struct buffer_head *super_bh; ++ reiser4_super_info_data *sbinfo; ++ format40_disk_super_block *sb_copy; ++ tree_level height; ++ reiser4_block_nr root_block; ++ node_plugin *nplug; ++ ++ assert("vs-475", super != NULL); ++ assert("vs-474", get_super_private(super)); ++ ++ *stage = NONE_DONE; ++ ++ result = consult_diskmap(super); ++ if (result) ++ return result; ++ *stage = CONSULT_DISKMAP; ++ ++ super_bh = find_a_disk_format40_super_block(super); ++ if (IS_ERR(super_bh)) ++ return PTR_ERR(super_bh); ++ brelse(super_bh); ++ *stage = FIND_A_SUPER; ++ ++ /* ok, we are sure that filesystem format is a format40 format */ ++ ++ /* map jnodes for journal control blocks (header, footer) to disk */ ++ result = reiser4_init_journal_info(super); ++ if (result) ++ return result; ++ *stage = INIT_JOURNAL_INFO; ++ ++ /* ok, we are sure that filesystem format is a format40 format */ ++ /* Now check it's state */ ++ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR); ++ if (result != 0 && result != -EINVAL) ++ /* -EINVAL means there is no magic, so probably just old ++ * fs. */ ++ return result; ++ *stage = INIT_STATUS; ++ ++ result = reiser4_status_query(NULL, NULL); ++ if (result == REISER4_STATUS_MOUNT_WARN) ++ notice("vpf-1363", "Warning: mounting %s with errors.", ++ super->s_id); ++ if (result == REISER4_STATUS_MOUNT_RO) ++ notice("vpf-1364", "Warning: mounting %s with fatal errors," ++ " forcing read-only mount.", super->s_id); ++ result = reiser4_journal_replay(super); ++ if (result) ++ return result; ++ *stage = JOURNAL_REPLAY; ++ ++ super_bh = read_super_block(super); ++ if (IS_ERR(super_bh)) ++ return PTR_ERR(super_bh); ++ *stage = READ_SUPER; ++ ++ /* allocate and make a copy of format40_disk_super_block */ ++ sb_copy = copy_sb(super_bh); ++ brelse(super_bh); ++ ++ if (IS_ERR(sb_copy)) ++ return PTR_ERR(sb_copy); ++ printk("reiser4: %s: found disk format 4.0.%u.\n", ++ super->s_id, ++ get_format40_version(sb_copy)); ++ if (incomplete_compatibility(sb_copy)) ++ printk("reiser4: Warning: The last completely supported " ++ "version of disk format40 is %u. 
Some objects of "
++		       "the semantic tree can be inaccessible.\n",
++		       FORMAT40_VERSION);
++	/* make sure that key format of kernel and filesystem match */
++	result = check_key_format(sb_copy);
++	if (result) {
++		kfree(sb_copy);
++		return result;
++	}
++	*stage = KEY_CHECK;
++
++	result = oid_init_allocator(super, get_format40_file_count(sb_copy),
++				    get_format40_oid(sb_copy));
++	if (result) {
++		kfree(sb_copy);
++		return result;
++	}
++	*stage = INIT_OID;
++
++	/* get things necessary to init reiser4_tree */
++	root_block = get_format40_root_block(sb_copy);
++	height = get_format40_tree_height(sb_copy);
++	nplug = node_plugin_by_id(NODE40_ID);
++
++	/* initialize reiser4_super_info_data */
++	sbinfo = get_super_private(super);
++	assert("", sbinfo->tree.super == super);
++	/* init reiser4_tree for the filesystem */
++	result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
++	if (result) {
++		kfree(sb_copy);
++		return result;
++	}
++	*stage = INIT_TREE;
++
++	/*
++	 * initialize reiser4_super_info_data with data from format40 super
++	 * block
++	 */
++	sbinfo->default_uid = 0;
++	sbinfo->default_gid = 0;
++	sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
++	/* number of blocks in filesystem and reserved space */
++	reiser4_set_block_count(super, get_format40_block_count(sb_copy));
++	sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
++	sbinfo->version = get_format40_version(sb_copy);
++
++	if (update_backup_version(sb_copy))
++		printk("reiser4: Warning: metadata backup is not updated. "
++		       "Please run 'fsck.reiser4 --fix' on %s.\n",
++		       super->s_id);
++	kfree(sb_copy);
++
++	sbinfo->fsuid = 0;
++	sbinfo->fs_flags |= (1 << REISER4_ADG);	/* hard links for directories
++						 * are not supported */
++	sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN);	/* all nodes in
++								 * layout 40 are
++								 * of one
++								 * plugin */
++	/* sbinfo->tmgr is initialized already */
++
++	/* recover sb data which were logged separately from sb block */
++
++	/* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
++	 * oid_init_allocator() and reiser4_set_free_blocks() with new
++	 * data. What's the reason to call them above? */
++	result = reiser4_journal_recover_sb_data(super);
++	if (result != 0)
++		return result;
++	*stage = JOURNAL_RECOVER;
++
++	/*
++	 * Set number of used blocks. The number of used blocks is stored
++	 * neither in the on-disk super block nor in the journal footer
++	 * blocks. At this moment actual values of total blocks and free
++	 * block counters are set in the reiser4 super block (in-memory
++	 * structure) and we can
++	 * calculate number of used blocks from them. 
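++	 * (used = total block count - free block count, which is what the
++	 * call below computes.)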
++ */ ++ reiser4_set_data_blocks(super, ++ reiser4_block_count(super) - ++ reiser4_free_blocks(super)); ++ ++#if REISER4_DEBUG ++ sbinfo->min_blocks_used = 16 /* reserved area */ + ++ 2 /* super blocks */ + ++ 2 /* journal footer and header */ ; ++#endif ++ ++ /* init disk space allocator */ ++ result = sa_init_allocator(reiser4_get_space_allocator(super), ++ super, NULL); ++ if (result) ++ return result; ++ *stage = INIT_SA; ++ ++ result = get_super_jnode(super); ++ if (result == 0) ++ *stage = ALL_DONE; ++ return result; ++} ++ ++/* plugin->u.format.get_ready */ ++int init_format_format40(struct super_block *s, void *data UNUSED_ARG) ++{ ++ int result; ++ format40_init_stage stage; ++ ++ result = try_init_format40(s, &stage); ++ switch (stage) { ++ case ALL_DONE: ++ assert("nikita-3458", result == 0); ++ break; ++ case INIT_JNODE: ++ done_super_jnode(s); ++ case INIT_SA: ++ sa_destroy_allocator(reiser4_get_space_allocator(s), s); ++ case JOURNAL_RECOVER: ++ case INIT_TREE: ++ reiser4_done_tree(&get_super_private(s)->tree); ++ case INIT_OID: ++ case KEY_CHECK: ++ case READ_SUPER: ++ case JOURNAL_REPLAY: ++ case INIT_STATUS: ++ reiser4_status_finish(); ++ case INIT_JOURNAL_INFO: ++ reiser4_done_journal_info(s); ++ case FIND_A_SUPER: ++ case CONSULT_DISKMAP: ++ case NONE_DONE: ++ break; ++ default: ++ impossible("nikita-3457", "init stage: %i", stage); ++ } ++ ++ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED) ++ return RETERR(-ENOSPC); ++ ++ return result; ++} ++ ++static void pack_format40_super(const struct super_block *s, char *data) ++{ ++ format40_disk_super_block *super_data = ++ (format40_disk_super_block *) data; ++ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ ++ assert("zam-591", data != NULL); ++ ++ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)), ++ &super_data->free_blocks); ++ ++ put_unaligned(cpu_to_le64(sbinfo->tree.root_block), ++ &super_data->root_block); ++ ++ put_unaligned(cpu_to_le64(oid_next(s)), ++ &super_data->oid); ++ ++ put_unaligned(cpu_to_le64(oids_used(s)), ++ &super_data->file_count); ++ ++ put_unaligned(cpu_to_le16(sbinfo->tree.height), ++ &super_data->tree_height); ++ ++ if (update_disk_version(super_data)) { ++ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP; ++ ++ put_unaligned(cpu_to_le32(version), &super_data->version); ++ } ++} ++ ++/* plugin->u.format.log_super ++ return a jnode which should be added to transaction when the super block ++ gets logged */ ++jnode *log_super_format40(struct super_block *s) ++{ ++ jnode *sb_jnode; ++ ++ sb_jnode = get_super_private(s)->u.format40.sb_jnode; ++ ++ jload(sb_jnode); ++ ++ pack_format40_super(s, jdata(sb_jnode)); ++ ++ jrelse(sb_jnode); ++ ++ return sb_jnode; ++} ++ ++/* plugin->u.format.release */ ++int release_format40(struct super_block *s) ++{ ++ int ret; ++ reiser4_super_info_data *sbinfo; ++ ++ sbinfo = get_super_private(s); ++ assert("zam-579", sbinfo != NULL); ++ ++ if (!rofs_super(s)) { ++ ret = reiser4_capture_super_block(s); ++ if (ret != 0) ++ warning("vs-898", ++ "reiser4_capture_super_block failed: %d", ++ ret); ++ ++ ret = txnmgr_force_commit_all(s, 1); ++ if (ret != 0) ++ warning("jmacd-74438", "txn_force failed: %d", ret); ++ ++ all_grabbed2free(); ++ } ++ ++ sa_destroy_allocator(&sbinfo->space_allocator, s); ++ reiser4_done_journal_info(s); ++ done_super_jnode(s); ++ ++ rcu_barrier(); ++ reiser4_done_tree(&sbinfo->tree); ++ /* call finish_rcu(), because some znode were "released" in ++ * reiser4_done_tree(). 
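++	 * rcu_barrier() waits for all of those pending RCU callbacks to
++	 * run to completion.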
*/
++	rcu_barrier();
++
++	return 0;
++}
++
++#define FORMAT40_ROOT_LOCALITY 41
++#define FORMAT40_ROOT_OBJECTID 42
++
++/* plugin->u.format.root_dir_key */
++const reiser4_key *root_dir_key_format40(const struct super_block *super
++					 UNUSED_ARG)
++{
++	static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
++		.el = {
++			__constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
++#if REISER4_LARGE_KEY
++			ON_LARGE_KEY(0ull,)
++#endif
++			__constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
++			0ull
++		}
++	};
++
++	return &FORMAT40_ROOT_DIR_KEY;
++}
++
++/* plugin->u.format.check_open.
++   Check the opened object for validity. For now it checks for the valid oid &
++   locality only; it can be improved later, and its work may depend on the
++   mount options. */
++int check_open_format40(const struct inode *object)
++{
++	oid_t max, oid;
++
++	max = oid_next(object->i_sb) - 1;
++
++	/* Check the oid. */
++	oid = get_inode_oid(object);
++	if (oid > max) {
++		warning("vpf-1360", "The object with the oid %llu "
++			"greater than the max used oid %llu found.",
++			(unsigned long long)oid, (unsigned long long)max);
++
++		return RETERR(-EIO);
++	}
++
++	/* Check the locality. */
++	oid = reiser4_inode_data(object)->locality_id;
++	if (oid > max) {
++		warning("vpf-1361", "The object with the locality %llu "
++			"greater than the max used oid %llu found.",
++			(unsigned long long)oid, (unsigned long long)max);
++
++		return RETERR(-EIO);
++	}
++
++	return 0;
++}
++
++/* plugin->u.format.version_update.
++   Perform all version update operations to bring the on-disk
++   format40_disk_super_block.version up to FORMAT40_VERSION.
++ */
++int version_update_format40(struct super_block *super) {
++	txn_handle * trans;
++	lock_handle lh;
++	txn_atom *atom;
++	int ret;
++
++	/* Nothing to do if RO mount or the on-disk version is not less. */
++	if (super->s_flags & MS_RDONLY)
++		return 0;
++
++	if (get_super_private(super)->version >= FORMAT40_VERSION)
++		return 0;
++
++	printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
++	       "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
++	       "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
++
++	/* Mark the uber znode dirty to call log_super on write_logs. */
++	init_lh(&lh);
++	ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
++			     ZNODE_LOCK_HIPRI, &lh);
++	if (ret != 0)
++		return ret;
++
++	znode_make_dirty(lh.node);
++	done_lh(&lh);
++
++	/* Update the backup blocks. */
++
++	/* Force write_logs immediately. */
++	trans = get_current_context()->trans;
++	atom = get_current_atom_locked();
++	assert("vpf-1906", atom != NULL);
++
++	spin_lock_txnh(trans);
++	return force_commit_atom(trans);
++}
++
++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.h +--- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format40.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,109 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* this file contains: ++ - definition of ondisk super block of standart disk layout for ++ reiser 4.0 (layout 40) ++ - definition of layout 40 specific portion of in-core super block ++ - declarations of functions implementing methods of layout plugin ++ for layout 40 ++ - declarations of functions used to get/set fields in layout 40 super block ++*/ ++ ++#ifndef __DISK_FORMAT40_H__ ++#define __DISK_FORMAT40_H__ ++ ++/* magic for default reiser4 layout */ ++#define FORMAT40_MAGIC "ReIsEr40FoRmAt" ++#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE) ++ ++#include "../../dformat.h" ++ ++#include <linux/fs.h> /* for struct super_block */ ++ ++typedef enum { ++ FORMAT40_LARGE_KEYS ++} format40_flags; ++ ++/* ondisk super block for format 40. It is 512 bytes long */ ++typedef struct format40_disk_super_block { ++ /* 0 */ d64 block_count; ++ /* number of block in a filesystem */ ++ /* 8 */ d64 free_blocks; ++ /* number of free blocks */ ++ /* 16 */ d64 root_block; ++ /* filesystem tree root block */ ++ /* 24 */ d64 oid; ++ /* smallest free objectid */ ++ /* 32 */ d64 file_count; ++ /* number of files in a filesystem */ ++ /* 40 */ d64 flushes; ++ /* number of times super block was ++ flushed. Needed if format 40 ++ will have few super blocks */ ++ /* 48 */ d32 mkfs_id; ++ /* unique identifier of fs */ ++ /* 52 */ char magic[16]; ++ /* magic string ReIsEr40FoRmAt */ ++ /* 68 */ d16 tree_height; ++ /* height of filesystem tree */ ++ /* 70 */ d16 formatting_policy; ++ /* not used anymore */ ++ /* 72 */ d64 flags; ++ /* 80 */ d32 version; ++ /* on-disk format version number ++ initially assigned by mkfs as the greatest format40 ++ version number supported by reiser4progs and updated ++ in mount time in accordance with the greatest format40 ++ version number supported by kernel. ++ Is used by fsck to catch possible corruption and ++ for various compatibility issues */ ++ /* 84 */ char not_used[428]; ++} format40_disk_super_block; ++ ++/* format 40 specific part of reiser4_super_info_data */ ++typedef struct format40_super_info { ++/* format40_disk_super_block actual_sb; */ ++ jnode *sb_jnode; ++ struct { ++ reiser4_block_nr super; ++ } loc; ++} format40_super_info; ++ ++/* Defines for journal header and footer respectively. */ ++#define FORMAT40_JOURNAL_HEADER_BLOCKNR \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3) ++ ++#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4) ++ ++#define FORMAT40_STATUS_BLOCKNR \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5) ++ ++/* Diskmap declarations */ ++#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID)) ++#define FORMAT40_SUPER 1 ++#define FORMAT40_JH 2 ++#define FORMAT40_JF 3 ++ ++/* declarations of functions implementing methods of layout plugin for ++ format 40. 
The functions theirself are in disk_format40.c */ ++extern int init_format_format40(struct super_block *, void *data); ++extern const reiser4_key *root_dir_key_format40(const struct super_block *); ++extern int release_format40(struct super_block *s); ++extern jnode *log_super_format40(struct super_block *s); ++extern int check_open_format40(const struct inode *object); ++extern int version_update_format40(struct super_block *super); ++ ++/* __DISK_FORMAT40_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.c +--- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,38 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../debug.h" ++#include "../plugin_header.h" ++#include "disk_format40.h" ++#include "disk_format.h" ++#include "../plugin.h" ++ ++/* initialization of disk layout plugins */ ++disk_format_plugin format_plugins[LAST_FORMAT_ID] = { ++ [FORMAT40_ID] = { ++ .h = { ++ .type_id = REISER4_FORMAT_PLUGIN_TYPE, ++ .id = FORMAT40_ID, ++ .pops = NULL, ++ .label = "reiser40", ++ .desc = "standard disk layout for reiser40", ++ .linkage = {NULL, NULL} ++ }, ++ .init_format = init_format_format40, ++ .root_dir_key = root_dir_key_format40, ++ .release = release_format40, ++ .log_super = log_super_format40, ++ .check_open = check_open_format40, ++ .version_update = version_update_format40 ++ } ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.h +--- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/disk_format/disk_format.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,27 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* identifiers for disk layouts, they are also used as indexes in array of disk ++ plugins */ ++ ++#if !defined( __REISER4_DISK_FORMAT_H__ ) ++#define __REISER4_DISK_FORMAT_H__ ++ ++typedef enum { ++ /* standard reiser4 disk layout plugin id */ ++ FORMAT40_ID, ++ LAST_FORMAT_ID ++} disk_format_id; ++ ++/* __REISER4_DISK_FORMAT_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.33/fs/reiser4/plugin/disk_format/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/disk_format/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_REISER4_FS) += df_plugins.o ++ ++df_plugins-objs := \ ++ disk_format40.o \ ++ disk_format.o +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/fibration.c linux-2.6.33/fs/reiser4/plugin/fibration.c +--- linux-2.6.33.orig/fs/reiser4/plugin/fibration.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/fibration.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,175 @@ ++/* Copyright 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Directory fibrations */ ++ ++/* ++ * Suppose we have a directory tree with sources of some project. During ++ * compilation .o files are created within this tree. This makes access ++ * to the original source files less efficient, because source files are ++ * now "diluted" by object files: default directory plugin uses prefix ++ * of a file name as a part of the key for directory entry (and this ++ * part is also inherited by the key of file body). This means that ++ * foo.o will be located close to foo.c and foo.h in the tree. ++ * ++ * To avoid this effect directory plugin fill highest 7 (unused ++ * originally) bits of the second component of the directory entry key ++ * by bit-pattern depending on the file name (see ++ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called ++ * "fibre". Fibre of the file name key is inherited by key of stat data ++ * and keys of file body (in the case of REISER4_LARGE_KEY). ++ * ++ * Fibre for a given file is chosen by per-directory fibration ++ * plugin. Names within given fibre are ordered lexicographically. ++ */ ++ ++#include "../debug.h" ++#include "plugin_header.h" ++#include "plugin.h" ++#include "../super.h" ++#include "../inode.h" ++ ++#include <linux/types.h> ++ ++static const int fibre_shift = 57; ++ ++#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift) ++ ++/* ++ * Trivial fibration: all files of directory are just ordered ++ * lexicographically. ++ */ ++static __u64 fibre_trivial(const struct inode *dir, const char *name, int len) ++{ ++ return FIBRE_NO(0); ++} ++ ++/* ++ * dot-o fibration: place .o files after all others. ++ */ ++static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len) ++{ ++ /* special treatment for .*.o */ ++ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.') ++ return FIBRE_NO(1); ++ else ++ return FIBRE_NO(0); ++} ++ ++/* ++ * ext.1 fibration: subdivide directory into 128 fibrations one for each ++ * 7bit extension character (file "foo.h" goes into fibre "h"), plus ++ * default fibre for the rest. ++ */ ++static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len) ++{ ++ if (len > 2 && name[len - 2] == '.') ++ return FIBRE_NO(name[len - 1]); ++ else ++ return FIBRE_NO(0); ++} ++ ++/* ++ * ext.3 fibration: try to separate files with different 3-character ++ * extensions from each other. 
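++ * For instance, "foo.txt" is fibred by 't' + 'x' + 't' while "foo.jpg"
++ * is fibred by 'j' + 'p' + 'g', so the two groups sort apart.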
++ */ ++static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len) ++{ ++ if (len > 4 && name[len - 4] == '.') ++ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]); ++ else ++ return FIBRE_NO(0); ++} ++ ++static int change_fibration(struct inode *inode, ++ reiser4_plugin * plugin, ++ pset_member memb) ++{ ++ int result; ++ ++ assert("nikita-3503", inode != NULL); ++ assert("nikita-3504", plugin != NULL); ++ ++ assert("nikita-3505", is_reiser4_inode(inode)); ++ assert("nikita-3506", inode_dir_plugin(inode) != NULL); ++ assert("nikita-3507", ++ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE); ++ ++ result = 0; ++ if (inode_fibration_plugin(inode) == NULL || ++ inode_fibration_plugin(inode)->h.id != plugin->h.id) { ++ if (is_dir_empty(inode) == 0) ++ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, ++ PSET_FIBRATION, plugin); ++ else ++ result = RETERR(-ENOTEMPTY); ++ ++ } ++ return result; ++} ++ ++static reiser4_plugin_ops fibration_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = change_fibration ++}; ++ ++/* fibration plugins */ ++fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = { ++ [FIBRATION_LEXICOGRAPHIC] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_LEXICOGRAPHIC, ++ .pops = &fibration_plugin_ops, ++ .label = "lexicographic", ++ .desc = "no fibration", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_trivial ++ }, ++ [FIBRATION_DOT_O] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_DOT_O, ++ .pops = &fibration_plugin_ops, ++ .label = "dot-o", ++ .desc = "fibrate .o files separately", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_dot_o ++ }, ++ [FIBRATION_EXT_1] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_EXT_1, ++ .pops = &fibration_plugin_ops, ++ .label = "ext-1", ++ .desc = "fibrate file by single character extension", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_ext_1 ++ }, ++ [FIBRATION_EXT_3] = { ++ .h = { ++ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, ++ .id = FIBRATION_EXT_3, ++ .pops = &fibration_plugin_ops, ++ .label = "ext-3", ++ .desc = "fibrate file by three character extension", ++ .linkage = {NULL, NULL} ++ }, ++ .fibre = fibre_ext_3 ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/fibration.h linux-2.6.33/fs/reiser4/plugin/fibration.h +--- linux-2.6.33.orig/fs/reiser4/plugin/fibration.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/fibration.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,37 @@ ++/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Fibration plugin used by hashed directory plugin to segment content ++ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */ ++ ++#if !defined(__FS_REISER4_PLUGIN_FIBRATION_H__) ++#define __FS_REISER4_PLUGIN_FIBRATION_H__ ++ ++#include "plugin_header.h" ++ ++typedef struct fibration_plugin { ++ /* generic fields */ ++ plugin_header h; ++ ++ __u64(*fibre) (const struct inode *dir, const char *name, int len); ++} fibration_plugin; ++ ++typedef enum { ++ FIBRATION_LEXICOGRAPHIC, ++ FIBRATION_DOT_O, ++ FIBRATION_EXT_1, ++ FIBRATION_EXT_3, ++ LAST_FIBRATION_ID ++} reiser4_fibration_id; ++ ++/* __FS_REISER4_PLUGIN_FIBRATION_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.c +--- linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,3803 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ reiser4/README */ ++/* ++ * Written by Edward Shishkin. ++ * ++ * Implementations of inode/file/address_space operations ++ * specific for cryptcompress file plugin which manages ++ * regular files built of compressed and(or) encrypted bodies. ++ * See http://dev.namesys.com/CryptcompressPlugin for details. ++ */ ++ ++#include "../../inode.h" ++#include "../cluster.h" ++#include "../object.h" ++#include "../../tree_walk.h" ++#include "cryptcompress.h" ++ ++#include <linux/pagevec.h> ++#include <asm/uaccess.h> ++#include <linux/swap.h> ++#include <linux/writeback.h> ++#include <linux/random.h> ++#include <linux/scatterlist.h> ++ ++/* ++ Managing primary and secondary caches by Reiser4 ++ cryptcompress file plugin. Synchronization scheme. ++ ++ ++ +------------------+ ++ +------------------->| tfm stream | ++ | | (compressed data)| ++ flush | +------------------+ ++ +-----------------+ | ++ |(->)longterm lock| V ++--+ writepages() | | +-***-+ reiser4 +---+ ++ | | +--+ | *** | storage tree | | ++ | | | +-***-+ (primary cache)| | ++u | write() (secondary| cache) V / | \ | | ++s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d | ++e | | | |page cluster | | | **disk cluster** | | i | ++r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s | ++ | read() ^ ^ | | k | ++ | | (->)longterm lock| | page_io()| | ++ | | +------+ | | ++--+ readpages() | | +---+ ++ | V ++ | +------------------+ ++ +--------------------| tfm stream | ++ | (plain text) | ++ +------------------+ ++*/ ++ ++/* get cryptcompress specific portion of inode */ ++struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode) ++{ ++ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info; ++} ++ ++/* plugin->u.file.init_inode_data */ ++void init_inode_data_cryptcompress(struct inode *inode, ++ reiser4_object_create_data * crd, ++ int create) ++{ ++ struct cryptcompress_info *data; ++ ++ data = cryptcompress_inode_data(inode); ++ assert("edward-685", data != NULL); ++ ++ memset(data, 0, sizeof(*data)); ++ ++ mutex_init(&data->checkin_mutex); ++ data->trunc_index = ULONG_MAX; ++ turn_on_compression(data); ++ set_lattice_factor(data, MIN_LATTICE_FACTOR); ++ init_inode_ordering(inode, crd, create); ++} ++ ++/* The following is a part of reiser4 cipher key manager ++ which is called when opening/creating a cryptcompress file */ ++ ++/* get/set cipher key info */ ++struct reiser4_crypto_info * inode_crypto_info (struct inode * inode) ++{ ++ assert("edward-90", inode != NULL); ++ assert("edward-91", reiser4_inode_data(inode) != NULL); ++ return cryptcompress_inode_data(inode)->crypt; ++} ++ ++static void set_inode_crypto_info (struct inode * inode, ++ struct reiser4_crypto_info * info) ++{ ++ cryptcompress_inode_data(inode)->crypt = info; ++} ++ ++/* allocate a cipher key info */ ++struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode) ++{ ++ struct reiser4_crypto_info *info; ++ int fipsize; ++ ++ info = 
kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get()); ++ if (!info) ++ return ERR_PTR(-ENOMEM); ++ ++ fipsize = inode_digest_plugin(inode)->fipsize; ++ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get()); ++ if (!info->keyid) { ++ kfree(info); ++ return ERR_PTR(-ENOMEM); ++ } ++ info->host = inode; ++ return info; ++} ++ ++#if 0 ++/* allocate/free low-level info for cipher and digest ++ transforms */ ++static int alloc_crypto_tfms(struct reiser4_crypto_info * info) ++{ ++ struct crypto_blkcipher * ctfm = NULL; ++ struct crypto_hash * dtfm = NULL; ++ cipher_plugin * cplug = inode_cipher_plugin(info->host); ++ digest_plugin * dplug = inode_digest_plugin(info->host); ++ ++ if (cplug->alloc) { ++ ctfm = cplug->alloc(); ++ if (IS_ERR(ctfm)) { ++ warning("edward-1364", ++ "Can not allocate info for %s\n", ++ cplug->h.desc); ++ return RETERR(PTR_ERR(ctfm)); ++ } ++ } ++ info_set_cipher(info, ctfm); ++ if (dplug->alloc) { ++ dtfm = dplug->alloc(); ++ if (IS_ERR(dtfm)) { ++ warning("edward-1365", ++ "Can not allocate info for %s\n", ++ dplug->h.desc); ++ goto unhappy_with_digest; ++ } ++ } ++ info_set_digest(info, dtfm); ++ return 0; ++ unhappy_with_digest: ++ if (cplug->free) { ++ cplug->free(ctfm); ++ info_set_cipher(info, NULL); ++ } ++ return RETERR(PTR_ERR(dtfm)); ++} ++#endif ++ ++static void ++free_crypto_tfms(struct reiser4_crypto_info * info) ++{ ++ assert("edward-1366", info != NULL); ++ if (!info_get_cipher(info)) { ++ assert("edward-1601", !info_get_digest(info)); ++ return; ++ } ++ inode_cipher_plugin(info->host)->free(info_get_cipher(info)); ++ info_set_cipher(info, NULL); ++ inode_digest_plugin(info->host)->free(info_get_digest(info)); ++ info_set_digest(info, NULL); ++ return; ++} ++ ++#if 0 ++/* create a key fingerprint for disk stat-data */ ++static int create_keyid (struct reiser4_crypto_info * info, ++ struct reiser4_crypto_data * data) ++{ ++ int ret = -ENOMEM; ++ size_t blk, pad; ++ __u8 * dmem; ++ __u8 * cmem; ++ struct hash_desc ddesc; ++ struct blkcipher_desc cdesc; ++ struct scatterlist sg; ++ ++ assert("edward-1367", info != NULL); ++ assert("edward-1368", info->keyid != NULL); ++ ++ ddesc.tfm = info_get_digest(info); ++ ddesc.flags = 0; ++ cdesc.tfm = info_get_cipher(info); ++ cdesc.flags = 0; ++ ++ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm), ++ reiser4_ctx_gfp_mask_get()); ++ if (!dmem) ++ goto exit1; ++ ++ blk = crypto_blkcipher_blocksize(cdesc.tfm); ++ ++ pad = data->keyid_size % blk; ++ pad = (pad ? 
blk - pad : 0); ++ ++ cmem = kmalloc((size_t)data->keyid_size + pad, ++ reiser4_ctx_gfp_mask_get()); ++ if (!cmem) ++ goto exit2; ++ memcpy(cmem, data->keyid, data->keyid_size); ++ memset(cmem + data->keyid_size, 0, pad); ++ ++ sg_init_one(&sg, cmem, data->keyid_size + pad); ++ ++ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg, ++ data->keyid_size + pad); ++ if (ret) { ++ warning("edward-1369", ++ "encryption failed flags=%x\n", cdesc.flags); ++ goto exit3; ++ } ++ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem); ++ if (ret) { ++ warning("edward-1602", ++ "digest failed flags=%x\n", ddesc.flags); ++ goto exit3; ++ } ++ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize); ++ exit3: ++ kfree(cmem); ++ exit2: ++ kfree(dmem); ++ exit1: ++ return ret; ++} ++#endif ++ ++static void destroy_keyid(struct reiser4_crypto_info * info) ++{ ++ assert("edward-1370", info != NULL); ++ assert("edward-1371", info->keyid != NULL); ++ kfree(info->keyid); ++ return; ++} ++ ++static void __free_crypto_info (struct inode * inode) ++{ ++ struct reiser4_crypto_info * info = inode_crypto_info(inode); ++ assert("edward-1372", info != NULL); ++ ++ free_crypto_tfms(info); ++ destroy_keyid(info); ++ kfree(info); ++} ++ ++#if 0 ++static void instantiate_crypto_info(struct reiser4_crypto_info * info) ++{ ++ assert("edward-1373", info != NULL); ++ assert("edward-1374", info->inst == 0); ++ info->inst = 1; ++} ++#endif ++ ++static void uninstantiate_crypto_info(struct reiser4_crypto_info * info) ++{ ++ assert("edward-1375", info != NULL); ++ info->inst = 0; ++} ++ ++#if 0 ++static int is_crypto_info_instantiated(struct reiser4_crypto_info * info) ++{ ++ return info->inst; ++} ++ ++static int inode_has_cipher_key(struct inode * inode) ++{ ++ assert("edward-1376", inode != NULL); ++ return inode_crypto_info(inode) && ++ is_crypto_info_instantiated(inode_crypto_info(inode)); ++} ++#endif ++ ++static void free_crypto_info (struct inode * inode) ++{ ++ uninstantiate_crypto_info(inode_crypto_info(inode)); ++ __free_crypto_info(inode); ++} ++ ++static int need_cipher(struct inode * inode) ++{ ++ return inode_cipher_plugin(inode) != ++ cipher_plugin_by_id(NONE_CIPHER_ID); ++} ++ ++/* Parse @data which contains a (uninstantiated) cipher key imported ++ from user space, create a low-level cipher info and attach it to ++ the @object. 
If success, then info contains an instantiated key */ ++#if 0 ++struct reiser4_crypto_info * create_crypto_info(struct inode * object, ++ struct reiser4_crypto_data * data) ++{ ++ int ret; ++ struct reiser4_crypto_info * info; ++ ++ assert("edward-1377", data != NULL); ++ assert("edward-1378", need_cipher(object)); ++ ++ if (inode_file_plugin(object) != ++ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID)) ++ return ERR_PTR(-EINVAL); ++ ++ info = reiser4_alloc_crypto_info(object); ++ if (IS_ERR(info)) ++ return info; ++ ret = alloc_crypto_tfms(info); ++ if (ret) ++ goto err; ++ /* instantiating a key */ ++ ret = crypto_blkcipher_setkey(info_get_cipher(info), ++ data->key, ++ data->keysize); ++ if (ret) { ++ warning("edward-1379", ++ "setkey failed flags=%x", ++ crypto_blkcipher_get_flags(info_get_cipher(info))); ++ goto err; ++ } ++ info->keysize = data->keysize; ++ ret = create_keyid(info, data); ++ if (ret) ++ goto err; ++ instantiate_crypto_info(info); ++ return info; ++ err: ++ __free_crypto_info(object); ++ return ERR_PTR(ret); ++} ++#endif ++ ++/* increment/decrement a load counter when ++ attaching/detaching the crypto-stat to any object */ ++static void load_crypto_info(struct reiser4_crypto_info * info) ++{ ++ assert("edward-1380", info != NULL); ++ inc_keyload_count(info); ++} ++ ++static void unload_crypto_info(struct inode * inode) ++{ ++ struct reiser4_crypto_info * info = inode_crypto_info(inode); ++ assert("edward-1381", info->keyload_count > 0); ++ ++ dec_keyload_count(inode_crypto_info(inode)); ++ if (info->keyload_count == 0) ++ /* final release */ ++ free_crypto_info(inode); ++} ++ ++/* attach/detach an existing crypto-stat */ ++void reiser4_attach_crypto_info(struct inode * inode, ++ struct reiser4_crypto_info * info) ++{ ++ assert("edward-1382", inode != NULL); ++ assert("edward-1383", info != NULL); ++ assert("edward-1384", inode_crypto_info(inode) == NULL); ++ ++ set_inode_crypto_info(inode, info); ++ load_crypto_info(info); ++} ++ ++/* returns true, if crypto stat can be attached to the @host */ ++#if REISER4_DEBUG ++static int host_allows_crypto_info(struct inode * host) ++{ ++ int ret; ++ file_plugin * fplug = inode_file_plugin(host); ++ ++ switch (fplug->h.id) { ++ case CRYPTCOMPRESS_FILE_PLUGIN_ID: ++ ret = 1; ++ break; ++ default: ++ ret = 0; ++ } ++ return ret; ++} ++#endif /* REISER4_DEBUG */ ++ ++static void reiser4_detach_crypto_info(struct inode * inode) ++{ ++ assert("edward-1385", inode != NULL); ++ assert("edward-1386", host_allows_crypto_info(inode)); ++ ++ if (inode_crypto_info(inode)) ++ unload_crypto_info(inode); ++ set_inode_crypto_info(inode, NULL); ++} ++ ++#if 0 ++ ++/* compare fingerprints of @child and @parent */ ++static int keyid_eq(struct reiser4_crypto_info * child, ++ struct reiser4_crypto_info * parent) ++{ ++ return !memcmp(child->keyid, ++ parent->keyid, ++ info_digest_plugin(parent)->fipsize); ++} ++ ++/* check if a crypto-stat (which is bound to @parent) can be inherited */ ++int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent) ++{ ++ if (!need_cipher(child)) ++ return 0; ++ /* the child is created */ ++ if (!inode_crypto_info(child)) ++ return 1; ++ /* the child is looked up */ ++ if (!inode_crypto_info(parent)) ++ return 0; ++ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) && ++ inode_digest_plugin(child) == inode_digest_plugin(parent) && ++ inode_crypto_info(child)->keysize == ++ inode_crypto_info(parent)->keysize && ++ keyid_eq(inode_crypto_info(child), inode_crypto_info(parent))); ++} 
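++ ++/* Note: all four conditions above must hold for inheritance: same cipher ++ plugin, same digest plugin, equal key sizes and matching key fingerprints; ++ a mismatch in any of them forces the child to get its own crypto-stat. */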
++#endif ++ ++/* helper functions for ->create() method of the cryptcompress plugin */ ++static int inode_set_crypto(struct inode * object) ++{ ++ reiser4_inode * info; ++ if (!inode_crypto_info(object)) { ++ if (need_cipher(object)) ++ return RETERR(-EINVAL); ++ /* the file is not to be encrypted */ ++ return 0; ++ } ++ info = reiser4_inode_data(object); ++ info->extmask |= (1 << CRYPTO_STAT); ++ return 0; ++} ++ ++static int inode_init_compression(struct inode * object) ++{ ++ int result = 0; ++ assert("edward-1461", object != NULL); ++ if (inode_compression_plugin(object)->init) ++ result = inode_compression_plugin(object)->init(); ++ return result; ++} ++ ++static int inode_check_cluster(struct inode * object) ++{ ++ assert("edward-696", object != NULL); ++ ++ if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) { ++ warning("edward-1320", "Can not support '%s' " ++ "logical clusters (less than page size)", ++ inode_cluster_plugin(object)->h.label); ++ return RETERR(-EINVAL); ++ } ++ if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))){ ++ warning("edward-1463", "Can not support '%s' " ++ "logical clusters (too big for transform)", ++ inode_cluster_plugin(object)->h.label); ++ return RETERR(-EINVAL); ++ } ++ return 0; ++} ++ ++/* plugin->destroy_inode() */ ++void destroy_inode_cryptcompress(struct inode * inode) ++{ ++ assert("edward-1464", INODE_PGCOUNT(inode) == 0); ++ reiser4_detach_crypto_info(inode); ++ return; ++} ++ ++/* plugin->create_object(): ++. install plugins ++. attach crypto info if specified ++. attach compression info if specified ++. attach cluster info ++*/ ++int create_object_cryptcompress(struct inode *object, struct inode *parent, ++ reiser4_object_create_data * data) ++{ ++ int result; ++ reiser4_inode *info; ++ ++ assert("edward-23", object != NULL); ++ assert("edward-24", parent != NULL); ++ assert("edward-30", data != NULL); ++ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD)); ++ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID); ++ ++ info = reiser4_inode_data(object); ++ ++ assert("edward-29", info != NULL); ++ ++ /* set file bit */ ++ info->plugin_mask |= (1 << PSET_FILE); ++ ++ /* set crypto */ ++ result = inode_set_crypto(object); ++ if (result) ++ goto error; ++ /* set compression */ ++ result = inode_init_compression(object); ++ if (result) ++ goto error; ++ /* set cluster */ ++ result = inode_check_cluster(object); ++ if (result) ++ goto error; ++ ++ /* save everything in disk stat-data */ ++ result = write_sd_by_inode_common(object); ++ if (!result) ++ return 0; ++ error: ++ reiser4_detach_crypto_info(object); ++ return result; ++} ++ ++/* plugin->open() */ ++int open_cryptcompress(struct inode * inode, struct file * file) ++{ ++ return 0; ++} ++ ++/* returns the blocksize, an attribute of the cipher algorithm */ ++static unsigned int ++cipher_blocksize(struct inode * inode) ++{ ++ assert("edward-758", need_cipher(inode)); ++ assert("edward-1400", inode_crypto_info(inode) != NULL); ++ return crypto_blkcipher_blocksize ++ (info_get_cipher(inode_crypto_info(inode))); ++} ++ ++/* returns the offset translated by the scale factor of the crypto algorithm */ ++static loff_t inode_scaled_offset (struct inode * inode, ++ const loff_t src_off /* input offset */) ++{ ++ assert("edward-97", inode != NULL); ++ ++ if (!need_cipher(inode) || ++ src_off == get_key_offset(reiser4_min_key()) || ++ src_off == get_key_offset(reiser4_max_key())) ++ return src_off; ++ ++ return inode_cipher_plugin(inode)->scale(inode, ++ 
cipher_blocksize(inode), ++ src_off); ++} ++ ++/* returns disk cluster size */ ++size_t inode_scaled_cluster_size(struct inode * inode) ++{ ++ assert("edward-110", inode != NULL); ++ ++ return inode_scaled_offset(inode, inode_cluster_size(inode)); ++} ++ ++/* set number of cluster pages */ ++static void set_cluster_nrpages(struct cluster_handle * clust, ++ struct inode *inode) ++{ ++ struct reiser4_slide * win; ++ ++ assert("edward-180", clust != NULL); ++ assert("edward-1040", inode != NULL); ++ ++ clust->old_nrpages = size_in_pages(lbytes(clust->index, inode)); ++ win = clust->win; ++ if (!win) { ++ clust->nr_pages = size_in_pages(lbytes(clust->index, inode)); ++ return; ++ } ++ assert("edward-1176", clust->op != LC_INVAL); ++ assert("edward-1064", win->off + win->count + win->delta != 0); ++ ++ if (win->stat == HOLE_WINDOW && ++ win->off == 0 && win->count == inode_cluster_size(inode)) { ++ /* special case: writing a "fake" logical cluster */ ++ clust->nr_pages = 0; ++ return; ++ } ++ clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta, ++ lbytes(clust->index, inode))); ++ return; ++} ++ ++/* plugin->key_by_inode() ++ build key of a disk cluster */ ++int key_by_inode_cryptcompress(struct inode *inode, loff_t off, ++ reiser4_key * key) ++{ ++ assert("edward-64", inode != 0); ++ ++ if (likely(off != get_key_offset(reiser4_max_key()))) ++ off = off_to_clust_to_off(off, inode); ++ if (inode_crypto_info(inode)) ++ off = inode_scaled_offset(inode, off); ++ ++ key_by_inode_and_offset_common(inode, 0, key); ++ set_key_offset(key, (__u64)off); ++ return 0; ++} ++ ++/* plugin->flow_by_inode() */ ++/* flow is used to read/write disk clusters */ ++int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf, ++ int user, /* 1: @buf is of user space, ++ 0: kernel space */ ++ loff_t size, /* @buf size */ ++ loff_t off, /* offset to start io from */ ++ rw_op op, /* READ or WRITE */ ++ flow_t * f /* resulting flow */) ++{ ++ assert("edward-436", f != NULL); ++ assert("edward-149", inode != NULL); ++ assert("edward-150", inode_file_plugin(inode) != NULL); ++ assert("edward-1465", user == 0); /* we use flow to read/write ++ disk clusters located in ++ kernel space */ ++ f->length = size; ++ memcpy(&f->data, &buf, sizeof(buf)); ++ f->user = user; ++ f->op = op; ++ ++ return key_by_inode_cryptcompress(inode, off, &f->key); ++} ++ ++static int ++cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key, ++ znode_lock_mode lock_mode) ++{ ++ coord_t *coord; ++ ++ assert("edward-704", hint != NULL); ++ assert("edward-1089", !hint_is_valid(hint)); ++ assert("edward-706", hint->lh.owner == NULL); ++ ++ coord = &hint->ext_coord.coord; ++ ++ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) ++ /* hint either not set or set by different operation */ ++ return RETERR(-E_REPEAT); ++ ++ if (get_key_offset(key) != hint->offset) ++ /* hint is set for different key */ ++ return RETERR(-E_REPEAT); ++ ++ assert("edward-707", reiser4_schedulable()); ++ ++ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, ++ key, &hint->lh, lock_mode, ++ ZNODE_LOCK_LOPRI); ++} ++ ++/* reserve disk space when writing a logical cluster */ ++static int reserve4cluster(struct inode *inode, struct cluster_handle *clust) ++{ ++ int result = 0; ++ ++ assert("edward-965", reiser4_schedulable()); ++ assert("edward-439", inode != NULL); ++ assert("edward-440", clust != NULL); ++ assert("edward-441", clust->pages != NULL); ++ ++ if (clust->nr_pages == 0) { ++ assert("edward-1152", 
clust->win != NULL); ++ assert("edward-1153", clust->win->stat == HOLE_WINDOW); ++ /* don't reserve disk space for fake logical cluster */ ++ return 0; ++ } ++ assert("edward-442", jprivate(clust->pages[0]) != NULL); ++ ++ result = reiser4_grab_space_force(estimate_insert_cluster(inode) + ++ estimate_update_cluster(inode), ++ BA_CAN_COMMIT); ++ if (result) ++ return result; ++ clust->reserved = 1; ++ grabbed2cluster_reserved(estimate_insert_cluster(inode) + ++ estimate_update_cluster(inode)); ++#if REISER4_DEBUG ++ clust->reserved_prepped = estimate_update_cluster(inode); ++ clust->reserved_unprepped = estimate_insert_cluster(inode); ++#endif ++ /* there can be space grabbed by txnmgr_force_commit_all */ ++ return 0; ++} ++ ++/* free reserved disk space if writing a logical cluster fails */ ++static void free_reserved4cluster(struct inode *inode, ++ struct cluster_handle *ch, int count) ++{ ++ assert("edward-967", ch->reserved == 1); ++ ++ cluster_reserved2free(count); ++ ch->reserved = 0; ++} ++ ++/* The core search procedure of the cryptcompress plugin. ++ If returned value is not cbk_errored, then current znode is locked */ ++static int find_cluster_item(hint_t * hint, ++ const reiser4_key * key, /* key of the item we are ++ looking for */ ++ znode_lock_mode lock_mode /* which lock */ , ++ ra_info_t * ra_info, lookup_bias bias, __u32 flags) ++{ ++ int result; ++ reiser4_key ikey; ++ int went_right = 0; ++ coord_t *coord = &hint->ext_coord.coord; ++ coord_t orig = *coord; ++ ++ assert("edward-152", hint != NULL); ++ ++ if (!hint_is_valid(hint)) { ++ result = cryptcompress_hint_validate(hint, key, lock_mode); ++ if (result == -E_REPEAT) ++ goto traverse_tree; ++ else if (result) { ++ assert("edward-1216", 0); ++ return result; ++ } ++ hint_set_valid(hint); ++ } ++ assert("edward-709", znode_is_any_locked(coord->node)); ++ ++ /* In-place lookup is going here, it means we just need to ++ check if next item of the @coord match to the @keyhint) */ ++ ++ if (equal_to_rdk(coord->node, key)) { ++ result = goto_right_neighbor(coord, &hint->lh); ++ if (result == -E_NO_NEIGHBOR) { ++ assert("edward-1217", 0); ++ return RETERR(-EIO); ++ } ++ if (result) ++ return result; ++ assert("edward-1218", equal_to_ldk(coord->node, key)); ++ went_right = 1; ++ } else { ++ coord->item_pos++; ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ } ++ result = zload(coord->node); ++ if (result) ++ return result; ++ assert("edward-1219", !node_is_empty(coord->node)); ++ ++ if (!coord_is_existing_item(coord)) { ++ zrelse(coord->node); ++ goto not_found; ++ } ++ item_key_by_coord(coord, &ikey); ++ zrelse(coord->node); ++ if (!keyeq(key, &ikey)) ++ goto not_found; ++ /* Ok, item is found, update node counts */ ++ if (went_right) ++ dclust_inc_extension_ncount(hint); ++ return CBK_COORD_FOUND; ++ ++ not_found: ++ assert("edward-1220", coord->item_pos > 0); ++ //coord->item_pos--; ++ /* roll back */ ++ *coord = orig; ++ ON_DEBUG(coord_update_v(coord)); ++ return CBK_COORD_NOTFOUND; ++ ++ traverse_tree: ++ assert("edward-713", hint->lh.owner == NULL); ++ assert("edward-714", reiser4_schedulable()); ++ ++ reiser4_unset_hint(hint); ++ dclust_init_extension(hint); ++ coord_init_zero(coord); ++ result = coord_by_key(current_tree, key, coord, &hint->lh, ++ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL, ++ CBK_UNIQUE | flags, ra_info); ++ if (cbk_errored(result)) ++ return result; ++ if(result == CBK_COORD_FOUND) ++ dclust_inc_extension_ncount(hint); ++ hint_set_valid(hint); ++ return result; ++} ++ ++/* This function is called 
by deflate[inflate] manager when ++ creating a transformed/plain stream to check if we should ++ create/cut some overhead. If this returns true, then @oh ++ contains the size of this overhead. ++ */ ++static int need_cut_or_align(struct inode * inode, ++ struct cluster_handle * ch, rw_op rw, int * oh) ++{ ++ struct tfm_cluster * tc = &ch->tc; ++ switch (rw) { ++ case WRITE_OP: /* estimate align */ ++ *oh = tc->len % cipher_blocksize(inode); ++ if (*oh != 0) ++ return 1; ++ break; ++ case READ_OP: /* estimate cut */ ++ *oh = *(tfm_output_data(ch) + tc->len - 1); ++ break; ++ default: ++ impossible("edward-1401", "bad option"); ++ } ++ return (tc->len != tc->lsize); ++} ++ ++/* create/cut an overhead of transformed/plain stream */ ++static void align_or_cut_overhead(struct inode * inode, ++ struct cluster_handle * ch, rw_op rw) ++{ ++ unsigned int oh; ++ cipher_plugin * cplug = inode_cipher_plugin(inode); ++ ++ assert("edward-1402", need_cipher(inode)); ++ ++ if (!need_cut_or_align(inode, ch, rw, &oh)) ++ return; ++ switch (rw) { ++ case WRITE_OP: /* do align */ ++ ch->tc.len += ++ cplug->align_stream(tfm_input_data(ch) + ++ ch->tc.len, ch->tc.len, ++ cipher_blocksize(inode)); ++ *(tfm_input_data(ch) + ch->tc.len - 1) = ++ cipher_blocksize(inode) - oh; ++ break; ++ case READ_OP: /* do cut */ ++ assert("edward-1403", oh <= cipher_blocksize(inode)); ++ ch->tc.len -= oh; ++ break; ++ default: ++ impossible("edward-1404", "bad option"); ++ } ++ return; ++} ++ ++static unsigned max_cipher_overhead(struct inode * inode) ++{ ++ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream) ++ return 0; ++ return cipher_blocksize(inode); ++} ++ ++static int deflate_overhead(struct inode *inode) ++{ ++ return (inode_compression_plugin(inode)-> ++ checksum ? DC_CHECKSUM_SIZE : 0); ++} ++ ++static unsigned deflate_overrun(struct inode * inode, int ilen) ++{ ++ return coa_overrun(inode_compression_plugin(inode), ilen); ++} ++ ++/* Estimating compressibility of a logical cluster by various ++ policies represented by compression mode plugin. ++ If this returns false, then compressor won't be called for ++ the cluster of index @index. ++*/ ++static int should_compress(struct tfm_cluster * tc, cloff_t index, ++ struct inode *inode) ++{ ++ compression_plugin *cplug = inode_compression_plugin(inode); ++ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode); ++ ++ assert("edward-1321", tc->len != 0); ++ assert("edward-1322", cplug != NULL); ++ assert("edward-1323", mplug != NULL); ++ ++ return /* estimate by size */ ++ (cplug->min_size_deflate ? ++ tc->len >= cplug->min_size_deflate() : ++ 1) && ++ /* estimate by compression mode plugin */ ++ (mplug->should_deflate ? ++ mplug->should_deflate(inode, index) : ++ 1); ++} ++ ++/* Evaluating results of compression transform. ++ Returns true, if we need to accept this results */ ++static int save_compressed(int size_before, int size_after, struct inode *inode) ++{ ++ return (size_after + deflate_overhead(inode) + ++ max_cipher_overhead(inode) < size_before); ++} ++ ++/* Guess result of the evaluation above */ ++static int need_inflate(struct cluster_handle * ch, struct inode * inode, ++ int encrypted /* is cluster encrypted */ ) ++{ ++ struct tfm_cluster * tc = &ch->tc; ++ ++ assert("edward-142", tc != 0); ++ assert("edward-143", inode != NULL); ++ ++ return tc->len < ++ (encrypted ? 
++ inode_scaled_offset(inode, tc->lsize) : ++ tc->lsize); ++} ++ ++/* If results of compression were accepted, then we add ++ a checksum to catch possible disk cluster corruption. ++ The following is a format of the data stored in disk clusters: ++ ++ data This is (transformed) logical cluster. ++ cipher_overhead This is created by ->align() method ++ of cipher plugin. May be absent. ++ checksum (4) This is created by ->checksum method ++ of compression plugin to check ++ integrity. May be absent. ++ ++ Crypto overhead format: ++ ++ data ++ control_byte (1) contains aligned overhead size: ++ 1 <= overhead <= cipher_blksize ++*/ ++/* Append a checksum at the end of a transformed stream */ ++static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc) ++{ ++ __u32 checksum; ++ ++ assert("edward-1309", tc != NULL); ++ assert("edward-1310", tc->len > 0); ++ assert("edward-1311", cplug->checksum != NULL); ++ ++ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len); ++ put_unaligned(cpu_to_le32(checksum), ++ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len)); ++ tc->len += (int)DC_CHECKSUM_SIZE; ++} ++ ++/* Check a disk cluster checksum. ++ Returns 0 if checksum is correct, otherwise returns 1 */ ++static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc) ++{ ++ assert("edward-1312", tc != NULL); ++ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE); ++ assert("edward-1314", cplug->checksum != NULL); ++ ++ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM), ++ tc->len - (int)DC_CHECKSUM_SIZE) != ++ le32_to_cpu(get_unaligned((d32 *) ++ (tfm_stream_data(tc, INPUT_STREAM) ++ + tc->len - (int)DC_CHECKSUM_SIZE)))) { ++ warning("edward-156", ++ "Bad disk cluster checksum %d, (should be %d) Fsck?\n", ++ (int)le32_to_cpu ++ (get_unaligned((d32 *) ++ (tfm_stream_data(tc, INPUT_STREAM) + ++ tc->len - (int)DC_CHECKSUM_SIZE))), ++ (int)cplug->checksum ++ (tfm_stream_data(tc, INPUT_STREAM), ++ tc->len - (int)DC_CHECKSUM_SIZE)); ++ return 1; ++ } ++ tc->len -= (int)DC_CHECKSUM_SIZE; ++ return 0; ++} ++ ++/* get input/output stream for some transform action */ ++int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc, ++ tfm_stream_id id) ++{ ++ size_t size = inode_scaled_cluster_size(inode); ++ ++ assert("edward-901", tc != NULL); ++ assert("edward-1027", inode_compression_plugin(inode) != NULL); ++ ++ if (cluster_get_tfm_act(tc) == TFMA_WRITE) ++ size += deflate_overrun(inode, inode_cluster_size(inode)); ++ ++ if (!get_tfm_stream(tc, id) && id == INPUT_STREAM) ++ alternate_streams(tc); ++ if (!get_tfm_stream(tc, id)) ++ return alloc_tfm_stream(tc, size, id); ++ ++ assert("edward-902", tfm_stream_is_set(tc, id)); ++ ++ if (tfm_stream_size(tc, id) < size) ++ return realloc_tfm_stream(tc, size, id); ++ return 0; ++} ++ ++/* Common deflate manager */ ++int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode) ++{ ++ int result = 0; ++ int compressed = 0; ++ int encrypted = 0; ++ struct tfm_cluster * tc = &clust->tc; ++ compression_plugin * coplug; ++ ++ assert("edward-401", inode != NULL); ++ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM)); ++ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE); ++ assert("edward-498", !tfm_cluster_is_uptodate(tc)); ++ ++ coplug = inode_compression_plugin(inode); ++ if (should_compress(tc, clust->index, inode)) { ++ /* try to compress, discard bad results */ ++ size_t dst_len; ++ compression_mode_plugin * mplug = ++ inode_compression_mode_plugin(inode); ++ 
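++ /* The compressor writes into OUTPUT_STREAM; the result is accepted ++ only if save_compressed() finds it smaller than the input even after ++ checksum and cipher overhead are added; otherwise the plain data is ++ kept and the mode plugin's discard hook (if any) is notified. */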
assert("edward-602", coplug != NULL); ++ assert("edward-1423", coplug->compress != NULL); ++ ++ result = grab_coa(tc, coplug); ++ if (result) { ++ warning("edward-1424", ++ "alloc_coa failed with ret=%d, skipped compression", ++ result); ++ goto cipher; ++ } ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) { ++ warning("edward-1425", ++ "alloc stream failed with ret=%d, skipped compression", ++ result); ++ goto cipher; ++ } ++ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); ++ coplug->compress(get_coa(tc, coplug->h.id, tc->act), ++ tfm_input_data(clust), tc->len, ++ tfm_output_data(clust), &dst_len); ++ /* make sure we didn't overwrite extra bytes */ ++ assert("edward-603", ++ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); ++ ++ /* evaluate results of compression transform */ ++ if (save_compressed(tc->len, dst_len, inode)) { ++ /* good result, accept */ ++ tc->len = dst_len; ++ if (mplug->accept_hook != NULL) { ++ result = mplug->accept_hook(inode, clust->index); ++ if (result) ++ warning("edward-1426", ++ "accept_hook failed with ret=%d", ++ result); ++ } ++ compressed = 1; ++ } ++ else { ++ /* bad result, discard */ ++#if 0 ++ if (cluster_is_complete(clust, inode)) ++ warning("edward-1496", ++ "incompressible cluster %lu (inode %llu)", ++ clust->index, ++ (unsigned long long)get_inode_oid(inode)); ++#endif ++ if (mplug->discard_hook != NULL && ++ cluster_is_complete(clust, inode)) { ++ result = mplug->discard_hook(inode, ++ clust->index); ++ if (result) ++ warning("edward-1427", ++ "discard_hook failed with ret=%d", ++ result); ++ } ++ } ++ } ++ cipher: ++ if (need_cipher(inode)) { ++ cipher_plugin * ciplug; ++ struct blkcipher_desc desc; ++ struct scatterlist src; ++ struct scatterlist dst; ++ ++ ciplug = inode_cipher_plugin(inode); ++ desc.tfm = info_get_cipher(inode_crypto_info(inode)); ++ desc.flags = 0; ++ if (compressed) ++ alternate_streams(tc); ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) ++ return result; ++ ++ align_or_cut_overhead(inode, clust, WRITE_OP); ++ sg_init_one(&src, tfm_input_data(clust), tc->len); ++ sg_init_one(&dst, tfm_output_data(clust), tc->len); ++ ++ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len); ++ if (result) { ++ warning("edward-1405", ++ "encryption failed flags=%x\n", desc.flags); ++ return result; ++ } ++ encrypted = 1; ++ } ++ if (compressed && coplug->checksum != NULL) ++ dc_set_checksum(coplug, tc); ++ if (!compressed && !encrypted) ++ alternate_streams(tc); ++ return result; ++} ++ ++/* Common inflate manager. 
*/ ++int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode) ++{ ++ int result = 0; ++ int transformed = 0; ++ struct tfm_cluster * tc = &clust->tc; ++ compression_plugin * coplug; ++ ++ assert("edward-905", inode != NULL); ++ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER); ++ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM)); ++ assert("edward-1349", tc->act == TFMA_READ); ++ assert("edward-907", !tfm_cluster_is_uptodate(tc)); ++ ++ /* Handle a checksum (if any) */ ++ coplug = inode_compression_plugin(inode); ++ if (need_inflate(clust, inode, need_cipher(inode)) && ++ coplug->checksum != NULL) { ++ result = dc_check_checksum(coplug, tc); ++ if (unlikely(result)) { ++ warning("edward-1460", ++ "Inode %llu: disk cluster %lu looks corrupted", ++ (unsigned long long)get_inode_oid(inode), ++ clust->index); ++ return RETERR(-EIO); ++ } ++ } ++ if (need_cipher(inode)) { ++ cipher_plugin * ciplug; ++ struct blkcipher_desc desc; ++ struct scatterlist src; ++ struct scatterlist dst; ++ ++ ciplug = inode_cipher_plugin(inode); ++ desc.tfm = info_get_cipher(inode_crypto_info(inode)); ++ desc.flags = 0; ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) ++ return result; ++ assert("edward-909", tfm_cluster_is_set(tc)); ++ ++ sg_init_one(&src, tfm_input_data(clust), tc->len); ++ sg_init_one(&dst, tfm_output_data(clust), tc->len); ++ ++ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len); ++ if (result) { ++ warning("edward-1600", "decrypt failed flags=%x\n", ++ desc.flags); ++ return result; ++ } ++ align_or_cut_overhead(inode, clust, READ_OP); ++ transformed = 1; ++ } ++ if (need_inflate(clust, inode, 0)) { ++ size_t dst_len = inode_cluster_size(inode); ++ if(transformed) ++ alternate_streams(tc); ++ ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) ++ return result; ++ assert("edward-1305", coplug->decompress != NULL); ++ assert("edward-910", tfm_cluster_is_set(tc)); ++ ++ coplug->decompress(get_coa(tc, coplug->h.id, tc->act), ++ tfm_input_data(clust), tc->len, ++ tfm_output_data(clust), &dst_len); ++ /* check length */ ++ tc->len = dst_len; ++ assert("edward-157", dst_len == tc->lsize); ++ transformed = 1; ++ } ++ if (!transformed) ++ alternate_streams(tc); ++ return result; ++} ++ ++/* This is implementation of readpage method of struct ++ address_space_operations for cryptcompress plugin. 
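++ It builds a read cluster handle and delegates the actual page read to the ++ ->readpage() method of the ctail item plugin (CTAIL_ID).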
*/ ++int readpage_cryptcompress(struct file *file, struct page *page) ++{ ++ reiser4_context *ctx; ++ struct cluster_handle clust; ++ item_plugin *iplug; ++ int result; ++ ++ assert("edward-88", PageLocked(page)); ++ assert("vs-976", !PageUptodate(page)); ++ assert("edward-89", page->mapping && page->mapping->host); ++ ++ ctx = reiser4_init_context(page->mapping->host->i_sb); ++ if (IS_ERR(ctx)) { ++ unlock_page(page); ++ return PTR_ERR(ctx); ++ } ++ assert("edward-113", ++ ergo(file != NULL, ++ page->mapping == file->f_dentry->d_inode->i_mapping)); ++ ++ if (PageUptodate(page)) { ++ warning("edward-1338", "page is already uptodate\n"); ++ unlock_page(page); ++ reiser4_exit_context(ctx); ++ return 0; ++ } ++ cluster_init_read(&clust, NULL); ++ clust.file = file; ++ iplug = item_plugin_by_id(CTAIL_ID); ++ if (!iplug->s.file.readpage) { ++ unlock_page(page); ++ put_cluster_handle(&clust); ++ reiser4_exit_context(ctx); ++ return -EINVAL; ++ } ++ result = iplug->s.file.readpage(&clust, page); ++ ++ put_cluster_handle(&clust); ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* number of pages to check in */ ++static int get_new_nrpages(struct cluster_handle * clust) ++{ ++ switch (clust->op) { ++ case LC_APPOV: ++ return clust->nr_pages; ++ case LC_TRUNC: ++ assert("edward-1179", clust->win != NULL); ++ return size_in_pages(clust->win->off + clust->win->count); ++ default: ++ impossible("edward-1180", "bad page cluster option"); ++ return 0; ++ } ++} ++ ++static void set_cluster_pages_dirty(struct cluster_handle * clust, ++ struct inode * inode) ++{ ++ int i; ++ struct page *pg; ++ int nrpages = get_new_nrpages(clust); ++ ++ for (i = 0; i < nrpages; i++) { ++ ++ pg = clust->pages[i]; ++ assert("edward-968", pg != NULL); ++ lock_page(pg); ++ assert("edward-1065", PageUptodate(pg)); ++ set_page_dirty_notag(pg); ++ unlock_page(pg); ++ mark_page_accessed(pg); ++ } ++} ++ ++/* Grab a page cluster for read/write operations. ++ Attach a jnode for write operations (when preparing for modifications, which ++ are supposed to be committed). ++ ++ We allocate only one jnode per page cluster; this jnode is bound to the ++ first page of this cluster, so we have an extra reference that will be put ++ as soon as the jnode is evicted from memory; other references will be cleaned ++ up at flush time (assuming the page cluster was checked in successfully). 
++*/ ++int grab_page_cluster(struct inode * inode, ++ struct cluster_handle * clust, rw_op rw) ++{ ++ int i; ++ int result = 0; ++ jnode *node = NULL; ++ ++ assert("edward-182", clust != NULL); ++ assert("edward-183", clust->pages != NULL); ++ assert("edward-1466", clust->node == NULL); ++ assert("edward-1428", inode != NULL); ++ assert("edward-1429", inode->i_mapping != NULL); ++ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode)); ++ ++ if (clust->nr_pages == 0) ++ return 0; ++ ++ for (i = 0; i < clust->nr_pages; i++) { ++ ++ assert("edward-1044", clust->pages[i] == NULL); ++ ++ clust->pages[i] = ++ find_or_create_page(inode->i_mapping, ++ clust_to_pg(clust->index, inode) + i, ++ reiser4_ctx_gfp_mask_get()); ++ if (!clust->pages[i]) { ++ result = RETERR(-ENOMEM); ++ break; ++ } ++ if (i == 0 && rw == WRITE_OP) { ++ node = jnode_of_page(clust->pages[i]); ++ if (IS_ERR(node)) { ++ result = PTR_ERR(node); ++ unlock_page(clust->pages[i]); ++ break; ++ } ++ JF_SET(node, JNODE_CLUSTER_PAGE); ++ assert("edward-920", jprivate(clust->pages[0])); ++ } ++ INODE_PGCOUNT_INC(inode); ++ unlock_page(clust->pages[i]); ++ } ++ if (unlikely(result)) { ++ while (i) { ++ put_cluster_page(clust->pages[--i]); ++ INODE_PGCOUNT_DEC(inode); ++ } ++ if (node && !IS_ERR(node)) ++ jput(node); ++ return result; ++ } ++ clust->node = node; ++ return 0; ++} ++ ++static void truncate_page_cluster_range(struct inode * inode, ++ struct page ** pages, ++ cloff_t index, ++ int from, int count, ++ int even_cows) ++{ ++ assert("edward-1467", count > 0); ++ reiser4_invalidate_pages(inode->i_mapping, ++ clust_to_pg(index, inode) + from, ++ count, even_cows); ++} ++ ++/* Put @count pages starting at offset @from */ ++void __put_page_cluster(int from, int count, ++ struct page ** pages, struct inode * inode) ++{ ++ int i; ++ assert("edward-1468", pages != NULL); ++ assert("edward-1469", inode != NULL); ++ assert("edward-1470", from >= 0 && count >= 0); ++ ++ for (i = 0; i < count; i++) { ++ assert("edward-1471", pages[from + i] != NULL); ++ assert("edward-1472", ++ pages[from + i]->index == pages[from]->index + i); ++ ++ put_cluster_page(pages[from + i]); ++ INODE_PGCOUNT_DEC(inode); ++ } ++} ++ ++/* ++ * This is the dual of grab_page_cluster; however, if @rw == WRITE_OP, ++ * we call this function only if something failed before the page ++ * cluster was checked in. 
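++ * After a successful checkin, control references are instead dropped at ++ * checkout time (see checkout_page_cluster()).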
++ */ ++void put_page_cluster(struct cluster_handle * clust, ++ struct inode * inode, rw_op rw) ++{ ++ assert("edward-445", clust != NULL); ++ assert("edward-922", clust->pages != NULL); ++ assert("edward-446", ++ ergo(clust->nr_pages != 0, clust->pages[0] != NULL)); ++ ++ __put_page_cluster(0, clust->nr_pages, clust->pages, inode); ++ if (rw == WRITE_OP) { ++ if (unlikely(clust->node)) { ++ assert("edward-447", ++ clust->node == jprivate(clust->pages[0])); ++ jput(clust->node); ++ clust->node = NULL; ++ } ++ } ++} ++ ++#if REISER4_DEBUG ++int cryptcompress_inode_ok(struct inode *inode) ++{ ++ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE))) ++ return 0; ++ if (!cluster_shift_ok(inode_cluster_shift(inode))) ++ return 0; ++ return 1; ++} ++ ++static int window_ok(struct reiser4_slide * win, struct inode *inode) ++{ ++ assert("edward-1115", win != NULL); ++ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW)); ++ ++ return (win->off != inode_cluster_size(inode)) && ++ (win->off + win->count + win->delta <= inode_cluster_size(inode)); ++} ++ ++static int cluster_ok(struct cluster_handle * clust, struct inode *inode) ++{ ++ assert("edward-279", clust != NULL); ++ ++ if (!clust->pages) ++ return 0; ++ return (clust->win ? window_ok(clust->win, inode) : 1); ++} ++#if 0 ++static int pages_truncate_ok(struct inode *inode, pgoff_t start) ++{ ++ int found; ++ struct page * page; ++ ++ found = find_get_pages(inode->i_mapping, start, 1, &page); ++ if (found) ++ put_cluster_page(page); ++ return !found; ++} ++#else ++#define pages_truncate_ok(inode, start) 1 ++#endif ++ ++static int jnode_truncate_ok(struct inode *inode, cloff_t index) ++{ ++ jnode *node; ++ node = jlookup(current_tree, get_inode_oid(inode), ++ clust_to_pg(index, inode)); ++ if (likely(!node)) ++ return 1; ++ jput(node); ++ return 0; ++} ++ ++static int find_fake_appended(struct inode *inode, cloff_t * index); ++ ++static int body_truncate_ok(struct inode *inode, cloff_t aidx) ++{ ++ int result; ++ cloff_t raidx; ++ ++ result = find_fake_appended(inode, &raidx); ++ return !result && (aidx == raidx); ++} ++#endif ++ ++/* guess next window stat */ ++static inline window_stat next_window_stat(struct reiser4_slide * win) ++{ ++ assert("edward-1130", win != NULL); ++ return ((win->stat == HOLE_WINDOW && win->delta == 0) ? 
++ HOLE_WINDOW : DATA_WINDOW); ++} ++ ++/* guess and set next cluster index and window params */ ++static void move_update_window(struct inode * inode, ++ struct cluster_handle * clust, ++ loff_t file_off, loff_t to_file) ++{ ++ struct reiser4_slide * win; ++ ++ assert("edward-185", clust != NULL); ++ assert("edward-438", clust->pages != NULL); ++ assert("edward-281", cluster_ok(clust, inode)); ++ ++ win = clust->win; ++ if (!win) ++ return; ++ ++ switch (win->stat) { ++ case DATA_WINDOW: ++ /* increment */ ++ clust->index++; ++ win->stat = DATA_WINDOW; ++ win->off = 0; ++ win->count = min((loff_t)inode_cluster_size(inode), to_file); ++ break; ++ case HOLE_WINDOW: ++ switch (next_window_stat(win)) { ++ case HOLE_WINDOW: ++ /* skip */ ++ clust->index = off_to_clust(file_off, inode); ++ win->stat = HOLE_WINDOW; ++ win->off = 0; ++ win->count = off_to_cloff(file_off, inode); ++ win->delta = min((loff_t)(inode_cluster_size(inode) - ++ win->count), to_file); ++ break; ++ case DATA_WINDOW: ++ /* stay */ ++ win->stat = DATA_WINDOW; ++ /* off+count+delta=inv */ ++ win->off = win->off + win->count; ++ win->count = win->delta; ++ win->delta = 0; ++ break; ++ default: ++ impossible("edward-282", "wrong next window state"); ++ } ++ break; ++ default: ++ impossible("edward-283", "wrong current window state"); ++ } ++ assert("edward-1068", cluster_ok(clust, inode)); ++} ++ ++static int update_sd_cryptcompress(struct inode *inode) ++{ ++ int result = 0; ++ ++ assert("edward-978", reiser4_schedulable()); ++ ++ result = reiser4_grab_space_force(/* one for stat data update */ ++ estimate_update_common(inode), ++ BA_CAN_COMMIT); ++ if (result) ++ return result; ++ inode->i_ctime = inode->i_mtime = CURRENT_TIME; ++ result = reiser4_update_sd(inode); ++ ++ return result; ++} ++ ++static void uncapture_cluster_jnode(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert_spin_locked(&(node->guard)); ++ ++ atom = jnode_get_atom(node); ++ if (atom == NULL) { ++ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); ++ spin_unlock_jnode(node); ++ return; ++ } ++ reiser4_uncapture_block(node); ++ spin_unlock_atom(atom); ++ jput(node); ++} ++ ++static void put_found_pages(struct page **pages, int nr) ++{ ++ int i; ++ for (i = 0; i < nr; i++) { ++ assert("edward-1045", pages[i] != NULL); ++ put_cluster_page(pages[i]); ++ } ++} ++ ++/* Lifecycle of a logical cluster in the system. ++ * ++ * ++ * Logical cluster of a cryptcompress file is represented in the system by ++ * . page cluster (in memory, primary cache, contains plain text); ++ * . disk cluster (in memory, secondary cache, contains transformed text). ++ * Primary cache is to reduce number of transform operations (compression, ++ * encryption), i.e. to implement transform-caching strategy. ++ * Secondary cache is to reduce number of I/O operations, i.e. for usual ++ * write-caching strategy. Page cluster is a set of pages, i.e. mapping of ++ * a logical cluster to the primary cache. Disk cluster is a set of items ++ * of the same type defined by some reiser4 item plugin id. ++ * ++ * 1. Performing modifications ++ * ++ * Every modification of a cryptcompress file is considered as a set of ++ * operations performed on file's logical clusters. Every such "atomic" ++ * modification is truncate, append and(or) overwrite some bytes of a ++ * logical cluster performed in the primary cache with the following ++ * synchronization with the secondary cache (in flush time). Disk clusters, ++ * which live in the secondary cache, are supposed to be synchronized with ++ * disk. 
The mechanism of synchronization of primary and secondary caches ++ * includes the so-called checkin/checkout technique described below. ++ * ++ * 2. Submitting modifications ++ * ++ * Each page cluster has an associated jnode (a special in-memory header to ++ * keep track of transactions in reiser4), which is attached to its first ++ * page when grabbing the page cluster for modifications (see grab_page_cluster). ++ * Submitting modifications (see checkin_logical_cluster) proceeds per logical ++ * cluster and includes: ++ * . checkin_cluster_size; ++ * . checkin_page_cluster. ++ * checkin_cluster_size() resolves to a file size update, which completely ++ * defines the new size of the logical cluster (the number of the file's bytes ++ * in that logical cluster). ++ * checkin_page_cluster() captures the jnode of a page cluster and sets the ++ * jnode's dirty flag (if needed) to indicate that modifications were ++ * successfully checked in. ++ * ++ * 3. Checking out modifications ++ * ++ * Proceeds per logical cluster at flush time (see checkout_logical_cluster). ++ * This is the time of synchronizing primary and secondary caches. ++ * checkout_logical_cluster() includes: ++ * . checkout_page_cluster (retrieving checked in pages); ++ * . uncapturing the jnode (including clearing the dirty flag and unlocking). ++ * ++ * 4. Committing modifications ++ * ++ * This completes the synchronization of primary and secondary caches. When ++ * checking out a page cluster (the phase above) its pages are ++ * locked/flushed/unlocked one-by-one in ascending order of their indices ++ * into a contiguous stream, which is then transformed (compressed, encrypted), ++ * chopped up into items and committed to disk as a disk cluster. ++ * ++ * 5. Managing page references ++ * ++ * Every checked in page has a special additional "control" reference, ++ * which is dropped at checkout. We need this to avoid pages being ++ * unexpectedly evicted from memory before checkout. Control references are ++ * managed so they do not accumulate with every checkin: ++ * ++ * 0 ++ * checkin -> 1 ++ * 0 -> checkout ++ * checkin -> 1 ++ * checkin -> 1 ++ * checkin -> 1 ++ * 0 -> checkout ++ * ... ++ * ++ * Every page cluster has its own unique "cluster lock". Updating/dropping ++ * references is serialized via this lock. The number of checked in cluster ++ * pages is calculated from i_size under the cluster lock. The file size is ++ * updated at every checkin action, also under the cluster lock (except in the ++ * cases of appending/truncating fake logical clusters). ++ * ++ * Proof of correctness: ++ * ++ * Since we update the file size under the cluster lock, in the case of a ++ * non-fake logical cluster with its lock held we do have the expected number ++ * of checked in pages. On the other hand, appending/truncating fake logical ++ * clusters doesn't change the number of checked in pages of any cluster. ++ * ++ * NOTE-EDWARD: As the cluster lock we use the guard (spinlock_t) of its jnode. ++ * Currently, I don't see any reason to create a special lock for those ++ * needs. ++ */ ++ ++static inline void lock_cluster(jnode * node) ++{ ++ spin_lock_jnode(node); ++} ++ ++static inline void unlock_cluster(jnode * node) ++{ ++ spin_unlock_jnode(node); ++} ++ ++static inline void unlock_cluster_uncapture(jnode * node) ++{ ++ uncapture_cluster_jnode(node); ++} ++ ++/* Set new file size by window. Cluster lock is required. 
*/ ++static void checkin_file_size(struct cluster_handle * clust, ++ struct inode * inode) ++{ ++ loff_t new_size; ++ struct reiser4_slide * win; ++ ++ assert("edward-1181", clust != NULL); ++ assert("edward-1182", inode != NULL); ++ assert("edward-1473", clust->pages != NULL); ++ assert("edward-1474", clust->pages[0] != NULL); ++ assert("edward-1475", jprivate(clust->pages[0]) != NULL); ++ assert_spin_locked(&(jprivate(clust->pages[0])->guard)); ++ ++ ++ win = clust->win; ++ assert("edward-1183", win != NULL); ++ ++ new_size = clust_to_off(clust->index, inode) + win->off; ++ ++ switch (clust->op) { ++ case LC_APPOV: ++ if (new_size + win->count <= i_size_read(inode)) ++ /* overwrite only */ ++ return; ++ new_size += win->count; ++ break; ++ case LC_TRUNC: ++ break; ++ default: ++ impossible("edward-1184", "bad page cluster option"); ++ break; ++ } ++ inode_check_scale_nolock(inode, i_size_read(inode), new_size); ++ i_size_write(inode, new_size); ++ return; ++} ++ ++static inline void checkin_cluster_size(struct cluster_handle * clust, ++ struct inode * inode) ++{ ++ if (clust->win) ++ checkin_file_size(clust, inode); ++} ++ ++static int checkin_page_cluster(struct cluster_handle * clust, ++ struct inode * inode) ++{ ++ int result; ++ jnode * node; ++ int old_nrpages = clust->old_nrpages; ++ int new_nrpages = get_new_nrpages(clust); ++ ++ node = clust->node; ++ ++ assert("edward-221", node != NULL); ++ assert("edward-971", clust->reserved == 1); ++ assert("edward-1263", ++ clust->reserved_prepped == estimate_update_cluster(inode)); ++ assert("edward-1264", clust->reserved_unprepped == 0); ++ ++ if (JF_ISSET(node, JNODE_DIRTY)) { ++ /* ++ * page cluster was checked in, but not yet ++ * checked out, so release related resources ++ */ ++ free_reserved4cluster(inode, clust, ++ estimate_update_cluster(inode)); ++ __put_page_cluster(0, clust->old_nrpages, ++ clust->pages, inode); ++ } else { ++ result = capture_cluster_jnode(node); ++ if (unlikely(result)) { ++ unlock_cluster(node); ++ return result; ++ } ++ jnode_make_dirty_locked(node); ++ clust->reserved = 0; ++ } ++ unlock_cluster(node); ++ ++ if (new_nrpages < old_nrpages) { ++ /* truncate >= 1 complete pages */ ++ __put_page_cluster(new_nrpages, ++ old_nrpages - new_nrpages, ++ clust->pages, inode); ++ truncate_page_cluster_range(inode, ++ clust->pages, clust->index, ++ new_nrpages, ++ old_nrpages - new_nrpages, ++ 0); ++ } ++#if REISER4_DEBUG ++ clust->reserved_prepped -= estimate_update_cluster(inode); ++#endif ++ return 0; ++} ++ ++/* Submit modifications of a logical cluster */ ++static int checkin_logical_cluster(struct cluster_handle * clust, ++ struct inode *inode) ++{ ++ int result = 0; ++ jnode * node; ++ ++ node = clust->node; ++ ++ assert("edward-1035", node != NULL); ++ assert("edward-1029", clust != NULL); ++ assert("edward-1030", clust->reserved == 1); ++ assert("edward-1031", clust->nr_pages != 0); ++ assert("edward-1032", clust->pages != NULL); ++ assert("edward-1033", clust->pages[0] != NULL); ++ assert("edward-1446", jnode_is_cluster_page(node)); ++ assert("edward-1476", node == jprivate(clust->pages[0])); ++ ++ lock_cluster(node); ++ checkin_cluster_size(clust, inode); ++ /* this will unlock cluster */ ++ result = checkin_page_cluster(clust, inode); ++ jput(node); ++ clust->node = NULL; ++ return result; ++} ++ ++/* ++ * Retrieve size of logical cluster that was checked in at ++ * the latest modifying session (cluster lock is required) ++ */ ++static inline void checkout_cluster_size(struct cluster_handle * clust, ++ 
struct inode * inode) ++{ ++ struct tfm_cluster *tc = &clust->tc; ++ ++ tc->len = lbytes(clust->index, inode); ++ assert("edward-1478", tc->len != 0); ++} ++ ++/* ++ * Retrieve a page cluster with the latest submitted modifications ++ * and flush its pages to previously allocated contiguous stream. ++ */ ++static void checkout_page_cluster(struct cluster_handle * clust, ++ jnode * node, struct inode * inode) ++{ ++ int i; ++ int found; ++ int to_put; ++ struct tfm_cluster *tc = &clust->tc; ++ ++ /* find and put checked in pages: cluster is locked, ++ * so we must get expected number (to_put) of pages ++ */ ++ to_put = size_in_pages(lbytes(clust->index, inode)); ++ found = find_get_pages(inode->i_mapping, ++ clust_to_pg(clust->index, inode), ++ to_put, clust->pages); ++ BUG_ON(found != to_put); ++ ++ __put_page_cluster(0, to_put, clust->pages, inode); ++ unlock_cluster_uncapture(node); ++ ++ /* Flush found pages. ++ * ++ * Note, that we don't disable modifications while flushing, ++ * moreover, some found pages can be truncated, as we have ++ * released cluster lock. ++ */ ++ for (i = 0; i < found; i++) { ++ int in_page; ++ char * data; ++ assert("edward-1479", ++ clust->pages[i]->index == clust->pages[0]->index + i); ++ ++ lock_page(clust->pages[i]); ++ if (!PageUptodate(clust->pages[i])) { ++ /* page was truncated */ ++ assert("edward-1480", ++ i_size_read(inode) <= page_offset(clust->pages[i])); ++ assert("edward-1481", ++ clust->pages[i]->mapping != inode->i_mapping); ++ unlock_page(clust->pages[i]); ++ break; ++ } ++ /* Update the number of bytes in the logical cluster, ++ * as it could be partially truncated. Note, that only ++ * partial truncate is possible (complete truncate can ++ * not go here, as it is performed via ->kill_hook() ++ * called by cut_file_items(), and the last one must ++ * wait for znode locked with parent coord). 
++ */ ++ checkout_cluster_size(clust, inode); ++ ++ /* this can be zero, as new file size is ++ checked in before truncating pages */ ++ in_page = __mbp(tc->len, i); ++ ++ data = kmap(clust->pages[i]); ++ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), ++ data, in_page); ++ kunmap(clust->pages[i]); ++ ++ if (PageDirty(clust->pages[i])) ++ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE); ++ ++ unlock_page(clust->pages[i]); ++ ++ if (in_page < PAGE_CACHE_SIZE) ++ /* end of the file */ ++ break; ++ } ++ put_found_pages(clust->pages, found); /* find_get_pages */ ++ tc->lsize = tc->len; ++ return; ++} ++ ++/* Check out modifications of a logical cluster */ ++int checkout_logical_cluster(struct cluster_handle * clust, ++ jnode * node, struct inode *inode) ++{ ++ int result; ++ struct tfm_cluster *tc = &clust->tc; ++ ++ assert("edward-980", node != NULL); ++ assert("edward-236", inode != NULL); ++ assert("edward-237", clust != NULL); ++ assert("edward-240", !clust->win); ++ assert("edward-241", reiser4_schedulable()); ++ assert("edward-718", cryptcompress_inode_ok(inode)); ++ ++ result = grab_tfm_stream(inode, tc, INPUT_STREAM); ++ if (result) { ++ warning("edward-1430", "alloc stream failed with ret=%d", ++ result); ++ return RETERR(-E_REPEAT); ++ } ++ lock_cluster(node); ++ ++ if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) { ++ /* race with another flush */ ++ warning("edward-982", ++ "checking out logical cluster %lu of inode %llu: " ++ "jnode is not dirty", clust->index, ++ (unsigned long long)get_inode_oid(inode)); ++ unlock_cluster(node); ++ return RETERR(-E_REPEAT); ++ } ++ cluster_reserved2grabbed(estimate_update_cluster(inode)); ++ ++ /* this will unlock cluster */ ++ checkout_page_cluster(clust, node, inode); ++ return 0; ++} ++ ++/* set hint for the cluster of the index @index */ ++static void set_hint_cluster(struct inode *inode, hint_t * hint, ++ cloff_t index, znode_lock_mode mode) ++{ ++ reiser4_key key; ++ assert("edward-722", cryptcompress_inode_ok(inode)); ++ assert("edward-723", ++ inode_file_plugin(inode) == ++ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); ++ ++ inode_file_plugin(inode)->key_by_inode(inode, ++ clust_to_off(index, inode), ++ &key); ++ ++ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key); ++ hint->offset = get_key_offset(&key); ++ hint->mode = mode; ++} ++ ++void invalidate_hint_cluster(struct cluster_handle * clust) ++{ ++ assert("edward-1291", clust != NULL); ++ assert("edward-1292", clust->hint != NULL); ++ ++ done_lh(&clust->hint->lh); ++ hint_clr_valid(clust->hint); ++} ++ ++static void put_hint_cluster(struct cluster_handle * clust, ++ struct inode *inode, znode_lock_mode mode) ++{ ++ assert("edward-1286", clust != NULL); ++ assert("edward-1287", clust->hint != NULL); ++ ++ set_hint_cluster(inode, clust->hint, clust->index + 1, mode); ++ invalidate_hint_cluster(clust); ++} ++ ++static int balance_dirty_page_cluster(struct cluster_handle * clust, ++ struct inode *inode, loff_t off, ++ loff_t to_file, ++ int nr_dirtied) ++{ ++ int result; ++ struct cryptcompress_info * info; ++ ++ assert("edward-724", inode != NULL); ++ assert("edward-725", cryptcompress_inode_ok(inode)); ++ assert("edward-1547", ++ nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode)); ++ ++ /* set next window params */ ++ move_update_window(inode, clust, off, to_file); ++ ++ result = update_sd_cryptcompress(inode); ++ if (result) ++ return result; ++ assert("edward-726", clust->hint->lh.owner == NULL); ++ info = cryptcompress_inode_data(inode); ++ ++ 
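++        /* (editorial note, not in the original patch:) checkin_mutex is
++         * dropped across the writeback throttling below so that flushers
++         * can make progress, then re-taken before the caller continues
++         * its modifying session. */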
++        mutex_unlock(&info->checkin_mutex);
++        reiser4_throttle_write(inode, nr_dirtied);
++        mutex_lock(&info->checkin_mutex);
++        return 0;
++}
++
++/* Set zeroes to the page cluster, process it and, maybe, try to capture
++   its pages */
++static int write_hole(struct inode *inode, struct cluster_handle * clust,
++                      loff_t file_off, loff_t to_file)
++{
++        int result = 0;
++        unsigned cl_off, cl_count = 0;
++        unsigned to_pg, pg_off;
++        struct reiser4_slide * win;
++
++        assert("edward-190", clust != NULL);
++        assert("edward-1069", clust->win != NULL);
++        assert("edward-191", inode != NULL);
++        assert("edward-727", cryptcompress_inode_ok(inode));
++        assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
++        assert("edward-1154",
++               ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
++
++        win = clust->win;
++
++        assert("edward-1070", win != NULL);
++        assert("edward-201", win->stat == HOLE_WINDOW);
++        assert("edward-192", cluster_ok(clust, inode));
++
++        if (win->off == 0 && win->count == inode_cluster_size(inode)) {
++                /* This part of the hole will be represented by a "fake"
++                 * logical cluster, i.e. one which doesn't have an
++                 * appropriate disk cluster until someone modifies this
++                 * logical cluster and makes it dirty.
++                 * So just go forward here..
++                 */
++                move_update_window(inode, clust, file_off, to_file);
++                return 0;
++        }
++        cl_count = win->count;        /* number of zeroes to write */
++        cl_off = win->off;
++        pg_off = off_to_pgoff(win->off);
++
++        while (cl_count) {
++                struct page *page;
++                page = clust->pages[off_to_pg(cl_off)];
++
++                assert("edward-284", page != NULL);
++
++                to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
++                lock_page(page);
++                zero_user(page, pg_off, to_pg);
++                SetPageUptodate(page);
++                set_page_dirty_notag(page);
++                mark_page_accessed(page);
++                unlock_page(page);
++
++                cl_off += to_pg;
++                cl_count -= to_pg;
++                pg_off = 0;
++        }
++        if (!win->delta) {
++                /* only zeroes in this window, try to capture
++                 */
++                result = checkin_logical_cluster(clust, inode);
++                if (result)
++                        return result;
++                put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
++                result = balance_dirty_page_cluster(clust,
++                                                    inode, file_off, to_file,
++                                                    win_count_to_nrpages(win));
++        } else
++                move_update_window(inode, clust, file_off, to_file);
++        return result;
++}
++
++/*
++  The main disk search procedure for the cryptcompress plugin, which
++  . scans all items of the disk cluster with the lock mode @mode
++  . maybe reads each one (if @read)
++  . 
maybe makes its znode dirty (if write lock mode was specified) ++ ++ NOTE-EDWARD: Callers should handle the case when disk cluster ++ is incomplete (-EIO) ++*/ ++int find_disk_cluster(struct cluster_handle * clust, ++ struct inode *inode, int read, znode_lock_mode mode) ++{ ++ flow_t f; ++ hint_t *hint; ++ int result = 0; ++ int was_grabbed; ++ ra_info_t ra_info; ++ file_plugin *fplug; ++ item_plugin *iplug; ++ struct tfm_cluster *tc; ++ struct cryptcompress_info * info; ++ ++ assert("edward-138", clust != NULL); ++ assert("edward-728", clust->hint != NULL); ++ assert("edward-226", reiser4_schedulable()); ++ assert("edward-137", inode != NULL); ++ assert("edward-729", cryptcompress_inode_ok(inode)); ++ ++ hint = clust->hint; ++ fplug = inode_file_plugin(inode); ++ was_grabbed = get_current_context()->grabbed_blocks; ++ info = cryptcompress_inode_data(inode); ++ tc = &clust->tc; ++ ++ assert("edward-462", !tfm_cluster_is_uptodate(tc)); ++ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM))); ++ ++ dclust_init_extension(hint); ++ ++ /* set key of the first disk cluster item */ ++ fplug->flow_by_inode(inode, ++ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL), ++ 0 /* kernel space */ , ++ inode_scaled_cluster_size(inode), ++ clust_to_off(clust->index, inode), READ_OP, &f); ++ if (mode == ZNODE_WRITE_LOCK) { ++ /* reserve for flush to make dirty all the leaf nodes ++ which contain disk cluster */ ++ result = ++ reiser4_grab_space_force(estimate_dirty_cluster(inode), ++ BA_CAN_COMMIT); ++ if (result) ++ goto out; ++ } ++ ++ ra_info.key_to_stop = f.key; ++ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key())); ++ ++ while (f.length) { ++ result = find_cluster_item(hint, &f.key, mode, ++ NULL, FIND_EXACT, ++ (mode == ZNODE_WRITE_LOCK ? ++ CBK_FOR_INSERT : 0)); ++ switch (result) { ++ case CBK_COORD_NOTFOUND: ++ result = 0; ++ if (inode_scaled_offset ++ (inode, clust_to_off(clust->index, inode)) == ++ get_key_offset(&f.key)) { ++ /* first item not found, this is treated ++ as disk cluster is absent */ ++ clust->dstat = FAKE_DISK_CLUSTER; ++ goto out; ++ } ++ /* we are outside the cluster, stop search here */ ++ assert("edward-146", ++ f.length != inode_scaled_cluster_size(inode)); ++ goto ok; ++ case CBK_COORD_FOUND: ++ assert("edward-148", ++ hint->ext_coord.coord.between == AT_UNIT); ++ assert("edward-460", ++ hint->ext_coord.coord.unit_pos == 0); ++ ++ coord_clear_iplug(&hint->ext_coord.coord); ++ result = zload_ra(hint->ext_coord.coord.node, &ra_info); ++ if (unlikely(result)) ++ goto out; ++ iplug = item_plugin_by_coord(&hint->ext_coord.coord); ++ assert("edward-147", ++ item_id_by_coord(&hint->ext_coord.coord) == ++ CTAIL_ID); ++ ++ result = iplug->s.file.read(NULL, &f, hint); ++ if (result) { ++ zrelse(hint->ext_coord.coord.node); ++ goto out; ++ } ++ if (mode == ZNODE_WRITE_LOCK) { ++ /* Don't make dirty more nodes then it was ++ estimated (see comments before ++ estimate_dirty_cluster). Missed nodes will be ++ read up in flush time if they are evicted from ++ memory */ ++ if (dclust_get_extension_ncount(hint) <= ++ estimate_dirty_cluster(inode)) ++ znode_make_dirty(hint->ext_coord.coord.node); ++ ++ znode_set_convertible(hint->ext_coord.coord. 
++ node); ++ } ++ zrelse(hint->ext_coord.coord.node); ++ break; ++ default: ++ goto out; ++ } ++ } ++ ok: ++ /* at least one item was found */ ++ /* NOTE-EDWARD: Callers should handle the case ++ when disk cluster is incomplete (-EIO) */ ++ tc->len = inode_scaled_cluster_size(inode) - f.length; ++ tc->lsize = lbytes(clust->index, inode); ++ assert("edward-1196", tc->len > 0); ++ assert("edward-1406", tc->lsize > 0); ++ ++ if (hint_is_unprepped_dclust(clust->hint)) { ++ clust->dstat = UNPR_DISK_CLUSTER; ++ } else if (clust->index == info->trunc_index) { ++ clust->dstat = TRNC_DISK_CLUSTER; ++ } else { ++ clust->dstat = PREP_DISK_CLUSTER; ++ dclust_set_extension_dsize(clust->hint, tc->len); ++ } ++ out: ++ assert("edward-1339", ++ get_current_context()->grabbed_blocks >= was_grabbed); ++ grabbed2free(get_current_context(), ++ get_current_super_private(), ++ get_current_context()->grabbed_blocks - was_grabbed); ++ return result; ++} ++ ++int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode, ++ znode_lock_mode lock_mode) ++{ ++ reiser4_key key; ++ ra_info_t ra_info; ++ ++ assert("edward-730", reiser4_schedulable()); ++ assert("edward-731", clust != NULL); ++ assert("edward-732", inode != NULL); ++ ++ if (hint_is_valid(clust->hint)) { ++ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER); ++ assert("edward-1294", ++ znode_is_write_locked(clust->hint->lh.node)); ++ /* already have a valid locked position */ ++ return (clust->dstat == ++ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND : ++ CBK_COORD_FOUND); ++ } ++ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode), ++ &key); ++ ra_info.key_to_stop = key; ++ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key())); ++ ++ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT, ++ CBK_FOR_INSERT); ++} ++ ++/* Read needed cluster pages before modifying. ++ If success, @clust->hint contains locked position in the tree. ++ Also: ++ . find and set disk cluster state ++ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER. ++*/ ++static int read_some_cluster_pages(struct inode * inode, ++ struct cluster_handle * clust) ++{ ++ int i; ++ int result = 0; ++ item_plugin *iplug; ++ struct reiser4_slide * win = clust->win; ++ znode_lock_mode mode = ZNODE_WRITE_LOCK; ++ ++ iplug = item_plugin_by_id(CTAIL_ID); ++ ++ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc)); ++ ++#if REISER4_DEBUG ++ if (clust->nr_pages == 0) { ++ /* start write hole from fake disk cluster */ ++ assert("edward-1117", win != NULL); ++ assert("edward-1118", win->stat == HOLE_WINDOW); ++ assert("edward-1119", new_logical_cluster(clust, inode)); ++ } ++#endif ++ if (new_logical_cluster(clust, inode)) { ++ /* ++ new page cluster is about to be written, nothing to read, ++ */ ++ assert("edward-734", reiser4_schedulable()); ++ assert("edward-735", clust->hint->lh.owner == NULL); ++ ++ if (clust->nr_pages) { ++ int off; ++ struct page * pg; ++ assert("edward-1419", clust->pages != NULL); ++ pg = clust->pages[clust->nr_pages - 1]; ++ assert("edward-1420", pg != NULL); ++ off = off_to_pgoff(win->off+win->count+win->delta); ++ if (off) { ++ lock_page(pg); ++ zero_user_segment(pg, off, PAGE_CACHE_SIZE); ++ unlock_page(pg); ++ } ++ } ++ clust->dstat = FAKE_DISK_CLUSTER; ++ return 0; ++ } ++ /* ++ Here we should search for disk cluster to figure out its real state. 
++  Also there is one more important reason to do disk search: we need
++  to make the disk cluster _dirty_ if it exists.
++ */
++
++        /* if a window is specified, read only the pages
++           that will be modified partially */
++
++        for (i = 0; i < clust->nr_pages; i++) {
++                struct page *pg = clust->pages[i];
++
++                lock_page(pg);
++                if (PageUptodate(pg)) {
++                        unlock_page(pg);
++                        continue;
++                }
++                unlock_page(pg);
++
++                if (win &&
++                    i >= size_in_pages(win->off) &&
++                    i < off_to_pg(win->off + win->count + win->delta))
++                        /* page will be completely overwritten */
++                        continue;
++
++                if (win && (i == clust->nr_pages - 1) &&
++                    /* the last page is
++                       partially modified,
++                       not uptodate .. */
++                    (size_in_pages(i_size_read(inode)) <= pg->index)) {
++                        /* .. and appended,
++                           so set zeroes to the rest */
++                        int offset;
++                        lock_page(pg);
++                        assert("edward-1260",
++                               size_in_pages(win->off + win->count +
++                                             win->delta) - 1 == i);
++
++                        offset =
++                            off_to_pgoff(win->off + win->count + win->delta);
++                        zero_user_segment(pg, offset, PAGE_CACHE_SIZE);
++                        unlock_page(pg);
++                        /* still not uptodate */
++                        break;
++                }
++                lock_page(pg);
++                result = do_readpage_ctail(inode, clust, pg, mode);
++
++                assert("edward-1526", ergo(!result, PageUptodate(pg)));
++                unlock_page(pg);
++                if (result) {
++                        warning("edward-219", "do_readpage_ctail failed");
++                        goto out;
++                }
++        }
++        if (!tfm_cluster_is_uptodate(&clust->tc)) {
++                /* disk cluster is unclaimed, but we need to make its znodes
++                 * dirty so that flush will convert its content
++                 */
++                result = find_disk_cluster(clust, inode,
++                                           0 /* do not read items */,
++                                           mode);
++        }
++ out:
++        tfm_cluster_clr_uptodate(&clust->tc);
++        return result;
++}
++
++static int should_create_unprepped_cluster(struct cluster_handle * clust,
++                                           struct inode * inode)
++{
++        assert("edward-737", clust != NULL);
++
++        switch (clust->dstat) {
++        case PREP_DISK_CLUSTER:
++        case UNPR_DISK_CLUSTER:
++                return 0;
++        case FAKE_DISK_CLUSTER:
++                if (clust->win &&
++                    clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
++                        assert("edward-1172",
++                               new_logical_cluster(clust, inode));
++                        return 0;
++                }
++                return 1;
++        default:
++                impossible("edward-1173", "bad disk cluster state");
++                return 0;
++        }
++}
++
++static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
++                                                struct inode *inode)
++{
++        int result;
++
++        assert("edward-1123", reiser4_schedulable());
++        assert("edward-737", clust != NULL);
++        assert("edward-738", inode != NULL);
++        assert("edward-739", cryptcompress_inode_ok(inode));
++        assert("edward-1053", clust->hint != NULL);
++
++        if (!should_create_unprepped_cluster(clust, inode)) {
++                if (clust->reserved) {
++                        cluster_reserved2free(estimate_insert_cluster(inode));
++#if REISER4_DEBUG
++                        assert("edward-1267",
++                               clust->reserved_unprepped ==
++                               estimate_insert_cluster(inode));
++                        clust->reserved_unprepped -=
++                                estimate_insert_cluster(inode);
++#endif
++                }
++                return 0;
++        }
++        assert("edward-1268", clust->reserved);
++        cluster_reserved2grabbed(estimate_insert_cluster(inode));
++#if REISER4_DEBUG
++        assert("edward-1441",
++               clust->reserved_unprepped == estimate_insert_cluster(inode));
++        clust->reserved_unprepped -= estimate_insert_cluster(inode);
++#endif
++        result = ctail_insert_unprepped_cluster(clust, inode);
++        if (result)
++                return result;
++
++        inode_add_bytes(inode, inode_cluster_size(inode));
++
++        assert("edward-743", cryptcompress_inode_ok(inode));
++        assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
++
++        clust->dstat = UNPR_DISK_CLUSTER;
++        return 0;
++}
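Editor's aside (not part of the patch above or below): the functions in this hunk all lean on the same cluster arithmetic, in which a byte offset maps to a logical cluster index via a per-inode cluster shift, and each logical cluster covers a fixed number of pages. The following is a minimal, self-contained sketch of that mapping; the *_demo names and the 64K cluster / 4K page sizes are assumptions made for the example, not identifiers from reiser4.

#include <stdio.h>

#define PAGE_SHIFT_DEMO    12   /* 4K pages (assumed) */
#define CLUSTER_SHIFT_DEMO 16   /* 64K logical clusters (assumed) */

/* byte offset -> index of the logical cluster containing it */
static unsigned long off_to_clust_demo(unsigned long long off)
{
        return (unsigned long)(off >> CLUSTER_SHIFT_DEMO);
}

/* logical cluster index -> byte offset of its first byte */
static unsigned long long clust_to_off_demo(unsigned long idx)
{
        return (unsigned long long)idx << CLUSTER_SHIFT_DEMO;
}

/* number of pages covered by one logical cluster */
static unsigned cluster_nrpages_demo(void)
{
        return 1U << (CLUSTER_SHIFT_DEMO - PAGE_SHIFT_DEMO);
}

int main(void)
{
        unsigned long long pos = 100000;        /* arbitrary file offset */
        unsigned long idx = off_to_clust_demo(pos);

        printf("offset %llu lies in cluster %lu, which starts at %llu "
               "and spans %u pages\n",
               pos, idx, clust_to_off_demo(idx), cluster_nrpages_demo());
        return 0;
}

With the assumed sizes this prints that offset 100000 lies in cluster 1, which starts at byte 65536 and spans 16 pages. End of editorial aside; the patch continues below.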
++ ++/* . Grab page cluster for read, write, setattr, etc. operations; ++ * . Truncate its complete pages, if needed; ++ */ ++int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust, ++ rw_op rw) ++{ ++ assert("edward-177", inode != NULL); ++ assert("edward-741", cryptcompress_inode_ok(inode)); ++ assert("edward-740", clust->pages != NULL); ++ ++ set_cluster_nrpages(clust, inode); ++ reset_cluster_pgset(clust, cluster_nrpages(inode)); ++ return grab_page_cluster(inode, clust, rw); ++} ++ ++/* Truncate complete page cluster of index @index. ++ * This is called by ->kill_hook() method of item ++ * plugin when deleting a disk cluster of such index. ++ */ ++void truncate_complete_page_cluster(struct inode *inode, cloff_t index, ++ int even_cows) ++{ ++ int found; ++ int nr_pages; ++ jnode *node; ++ struct page *pages[MAX_CLUSTER_NRPAGES]; ++ ++ node = jlookup(current_tree, get_inode_oid(inode), ++ clust_to_pg(index, inode)); ++ nr_pages = size_in_pages(lbytes(index, inode)); ++ assert("edward-1483", nr_pages != 0); ++ if (!node) ++ goto truncate; ++ found = find_get_pages(inode->i_mapping, ++ clust_to_pg(index, inode), ++ cluster_nrpages(inode), pages); ++ if (!found) { ++ assert("edward-1484", jnode_truncate_ok(inode, index)); ++ return; ++ } ++ lock_cluster(node); ++ ++ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) ++ && index == 0) ++ /* converting to unix_file is in progress */ ++ JF_CLR(node, JNODE_CLUSTER_PAGE); ++ if (JF_ISSET(node, JNODE_DIRTY)) { ++ /* ++ * @nr_pages were checked in, but not yet checked out - ++ * we need to release them. (also there can be pages ++ * attached to page cache by read(), etc. - don't take ++ * them into account). ++ */ ++ assert("edward-1198", found >= nr_pages); ++ ++ /* free disk space grabbed for disk cluster converting */ ++ cluster_reserved2grabbed(estimate_update_cluster(inode)); ++ grabbed2free(get_current_context(), ++ get_current_super_private(), ++ estimate_update_cluster(inode)); ++ __put_page_cluster(0, nr_pages, pages, inode); ++ ++ /* This will clear dirty bit, uncapture and unlock jnode */ ++ unlock_cluster_uncapture(node); ++ } else ++ unlock_cluster(node); ++ jput(node); /* jlookup */ ++ put_found_pages(pages, found); /* find_get_pages */ ++ truncate: ++ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) && ++ index == 0) ++ return; ++ truncate_page_cluster_range(inode, pages, index, 0, ++ cluster_nrpages(inode), ++ even_cows); ++ assert("edward-1201", ++ ergo(!reiser4_inode_get_flag(inode, ++ REISER4_FILE_CONV_IN_PROGRESS), ++ jnode_truncate_ok(inode, index))); ++ return; ++} ++ ++/* ++ * Set cluster handle @clust of a logical cluster before ++ * modifications which are supposed to be committed. ++ * ++ * . grab cluster pages; ++ * . reserve disk space; ++ * . maybe read pages from disk and set the disk cluster dirty; ++ * . maybe write hole and check in (partially zeroed) logical cluster; ++ * . create 'unprepped' disk cluster for new or fake logical one. 
++ */ ++static int prepare_logical_cluster(struct inode *inode, ++ loff_t file_off, /* write position ++ in the file */ ++ loff_t to_file, /* bytes of users data ++ to write to the file */ ++ struct cluster_handle * clust, ++ logical_cluster_op op) ++{ ++ int result = 0; ++ struct reiser4_slide * win = clust->win; ++ ++ reset_cluster_params(clust); ++ cluster_set_tfm_act(&clust->tc, TFMA_READ); ++#if REISER4_DEBUG ++ clust->ctx = get_current_context(); ++#endif ++ assert("edward-1190", op != LC_INVAL); ++ ++ clust->op = op; ++ ++ result = prepare_page_cluster(inode, clust, WRITE_OP); ++ if (result) ++ return result; ++ assert("edward-1447", ++ ergo(clust->nr_pages != 0, jprivate(clust->pages[0]))); ++ assert("edward-1448", ++ ergo(clust->nr_pages != 0, ++ jnode_is_cluster_page(jprivate(clust->pages[0])))); ++ ++ result = reserve4cluster(inode, clust); ++ if (result) ++ goto err1; ++ result = read_some_cluster_pages(inode, clust); ++ if (result) { ++ free_reserved4cluster(inode, ++ clust, ++ estimate_update_cluster(inode) + ++ estimate_insert_cluster(inode)); ++ goto err1; ++ } ++ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER); ++ ++ result = cryptcompress_make_unprepped_cluster(clust, inode); ++ if (result) ++ goto err2; ++ if (win && win->stat == HOLE_WINDOW) { ++ result = write_hole(inode, clust, file_off, to_file); ++ if (result) ++ goto err2; ++ } ++ return 0; ++ err2: ++ free_reserved4cluster(inode, clust, ++ estimate_update_cluster(inode)); ++ err1: ++ put_page_cluster(clust, inode, WRITE_OP); ++ assert("edward-1125", result == -ENOSPC); ++ return result; ++} ++ ++/* set window by two offsets */ ++static void set_window(struct cluster_handle * clust, ++ struct reiser4_slide * win, struct inode *inode, ++ loff_t o1, loff_t o2) ++{ ++ assert("edward-295", clust != NULL); ++ assert("edward-296", inode != NULL); ++ assert("edward-1071", win != NULL); ++ assert("edward-297", o1 <= o2); ++ ++ clust->index = off_to_clust(o1, inode); ++ ++ win->off = off_to_cloff(o1, inode); ++ win->count = min((loff_t)(inode_cluster_size(inode) - win->off), ++ o2 - o1); ++ win->delta = 0; ++ ++ clust->win = win; ++} ++ ++static int set_cluster_by_window(struct inode *inode, ++ struct cluster_handle * clust, ++ struct reiser4_slide * win, size_t length, ++ loff_t file_off) ++{ ++ int result; ++ ++ assert("edward-197", clust != NULL); ++ assert("edward-1072", win != NULL); ++ assert("edward-198", inode != NULL); ++ ++ result = alloc_cluster_pgset(clust, cluster_nrpages(inode)); ++ if (result) ++ return result; ++ ++ if (file_off > i_size_read(inode)) { ++ /* Uhmm, hole in cryptcompress file... */ ++ loff_t hole_size; ++ hole_size = file_off - inode->i_size; ++ ++ set_window(clust, win, inode, inode->i_size, file_off); ++ win->stat = HOLE_WINDOW; ++ if (win->off + hole_size < inode_cluster_size(inode)) ++ /* there is also user's data to append to the hole */ ++ win->delta = min(inode_cluster_size(inode) - ++ (win->off + win->count), length); ++ return 0; ++ } ++ set_window(clust, win, inode, file_off, file_off + length); ++ win->stat = DATA_WINDOW; ++ return 0; ++} ++ ++int set_cluster_by_page(struct cluster_handle * clust, struct page * page, ++ int count) ++{ ++ int result = 0; ++ int (*setting_actor)(struct cluster_handle * clust, int count); ++ ++ assert("edward-1358", clust != NULL); ++ assert("edward-1359", page != NULL); ++ assert("edward-1360", page->mapping != NULL); ++ assert("edward-1361", page->mapping->host != NULL); ++ ++ setting_actor = ++ (clust->pages ? 
reset_cluster_pgset : alloc_cluster_pgset); ++ result = setting_actor(clust, count); ++ clust->index = pg_to_clust(page->index, page->mapping->host); ++ return result; ++} ++ ++/* reset all the params that not get updated */ ++void reset_cluster_params(struct cluster_handle * clust) ++{ ++ assert("edward-197", clust != NULL); ++ ++ clust->dstat = INVAL_DISK_CLUSTER; ++ clust->tc.uptodate = 0; ++ clust->tc.len = 0; ++} ++ ++/* the heart of write_cryptcompress */ ++static loff_t do_write_cryptcompress(struct file *file, struct inode *inode, ++ const char __user *buf, size_t to_write, ++ loff_t pos, struct dispatch_context *cont) ++{ ++ int i; ++ hint_t *hint; ++ int result = 0; ++ size_t count; ++ struct reiser4_slide win; ++ struct cluster_handle clust; ++ struct cryptcompress_info * info; ++ ++ assert("edward-154", buf != NULL); ++ assert("edward-161", reiser4_schedulable()); ++ assert("edward-748", cryptcompress_inode_ok(inode)); ++ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE); ++ assert("edward-1274", get_current_context()->grabbed_blocks == 0); ++ ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ ++ result = load_file_hint(file, hint); ++ if (result) { ++ kfree(hint); ++ return result; ++ } ++ count = to_write; ++ ++ reiser4_slide_init(&win); ++ cluster_init_read(&clust, &win); ++ clust.hint = hint; ++ info = cryptcompress_inode_data(inode); ++ ++ mutex_lock(&info->checkin_mutex); ++ ++ result = set_cluster_by_window(inode, &clust, &win, to_write, pos); ++ if (result) ++ goto out; ++ ++ if (next_window_stat(&win) == HOLE_WINDOW) { ++ /* write hole in this iteration ++ separated from the loop below */ ++ result = write_dispatch_hook(file, inode, ++ pos, &clust, cont); ++ if (result) ++ goto out; ++ result = prepare_logical_cluster(inode, pos, count, &clust, ++ LC_APPOV); ++ if (result) ++ goto out; ++ } ++ do { ++ const char __user * src; ++ unsigned page_off, to_page; ++ ++ assert("edward-750", reiser4_schedulable()); ++ ++ result = write_dispatch_hook(file, inode, ++ pos + to_write - count, ++ &clust, cont); ++ if (result) ++ goto out; ++ if (cont->state == DISPATCH_ASSIGNED_NEW) ++ /* done_lh was called in write_dispatch_hook */ ++ goto out_no_longterm_lock; ++ ++ result = prepare_logical_cluster(inode, pos, count, &clust, ++ LC_APPOV); ++ if (result) ++ goto out; ++ ++ assert("edward-751", cryptcompress_inode_ok(inode)); ++ assert("edward-204", win.stat == DATA_WINDOW); ++ assert("edward-1288", hint_is_valid(clust.hint)); ++ assert("edward-752", ++ znode_is_write_locked(hint->ext_coord.coord.node)); ++ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK); ++ ++ /* set write position in page */ ++ page_off = off_to_pgoff(win.off); ++ ++ /* copy user's data to cluster pages */ ++ for (i = off_to_pg(win.off), src = buf; ++ i < size_in_pages(win.off + win.count); ++ i++, src += to_page) { ++ to_page = __mbp(win.off + win.count, i) - page_off; ++ assert("edward-1039", ++ page_off + to_page <= PAGE_CACHE_SIZE); ++ assert("edward-287", clust.pages[i] != NULL); ++ ++ fault_in_pages_readable(src, to_page); ++ ++ lock_page(clust.pages[i]); ++ result = ++ __copy_from_user((char *)kmap(clust.pages[i]) + ++ page_off, src, to_page); ++ kunmap(clust.pages[i]); ++ if (unlikely(result)) { ++ unlock_page(clust.pages[i]); ++ result = -EFAULT; ++ goto err2; ++ } ++ SetPageUptodate(clust.pages[i]); ++ set_page_dirty_notag(clust.pages[i]); ++ flush_dcache_page(clust.pages[i]); ++ mark_page_accessed(clust.pages[i]); ++ 
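++                        /* (editorial note, not in the original patch:) the
++                         * page is marked uptodate and dirty while still
++                         * locked; it is only unlocked below, once its
++                         * contents are fully published. */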
unlock_page(clust.pages[i]); ++ page_off = 0; ++ } ++ assert("edward-753", cryptcompress_inode_ok(inode)); ++ ++ result = checkin_logical_cluster(&clust, inode); ++ if (result) ++ goto err2; ++ ++ buf += win.count; ++ count -= win.count; ++ ++ result = balance_dirty_page_cluster(&clust, inode, 0, count, ++ win_count_to_nrpages(&win)); ++ if (result) ++ goto err1; ++ assert("edward-755", hint->lh.owner == NULL); ++ reset_cluster_params(&clust); ++ continue; ++ err2: ++ put_page_cluster(&clust, inode, WRITE_OP); ++ err1: ++ if (clust.reserved) ++ free_reserved4cluster(inode, ++ &clust, ++ estimate_update_cluster(inode)); ++ break; ++ } while (count); ++ out: ++ done_lh(&hint->lh); ++ save_file_hint(file, hint); ++ out_no_longterm_lock: ++ mutex_unlock(&info->checkin_mutex); ++ kfree(hint); ++ put_cluster_handle(&clust); ++ assert("edward-195", ++ ergo((to_write == count), ++ (result < 0 || cont->state == DISPATCH_ASSIGNED_NEW))); ++ return (to_write - count) ? (to_write - count) : result; ++} ++ ++/** ++ * plugin->write() ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @read_amount: number of bytes to write ++ * @off: position in file to write to ++ */ ++ssize_t write_cryptcompress(struct file *file, const char __user *buf, ++ size_t count, loff_t *off, ++ struct dispatch_context *cont) ++{ ++ ssize_t result; ++ struct inode *inode; ++ reiser4_context *ctx; ++ loff_t pos = *off; ++ struct cryptcompress_info *info; ++ ++ assert("edward-1449", cont->state == DISPATCH_INVAL_STATE); ++ ++ inode = file->f_dentry->d_inode; ++ assert("edward-196", cryptcompress_inode_ok(inode)); ++ ++ info = cryptcompress_inode_data(inode); ++ ctx = get_current_context(); ++ ++ result = generic_write_checks(file, &pos, &count, 0); ++ if (unlikely(result != 0)) { ++ context_set_commit_async(ctx); ++ return result; ++ } ++ if (unlikely(count == 0)) ++ return 0; ++ result = file_remove_suid(file); ++ if (unlikely(result != 0)) { ++ context_set_commit_async(ctx); ++ return result; ++ } ++ /* remove_suid might create a transaction */ ++ reiser4_txn_restart(ctx); ++ ++ result = do_write_cryptcompress(file, inode, buf, count, pos, cont); ++ ++ if (unlikely(result < 0)) { ++ context_set_commit_async(ctx); ++ return result; ++ } ++ /* update position in a file */ ++ *off = pos + result; ++ return result; ++} ++ ++/* plugin->readpages */ ++int readpages_cryptcompress(struct file *file, struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ reiser4_context * ctx; ++ int ret; ++ ++ ctx = reiser4_init_context(mapping->host->i_sb); ++ if (IS_ERR(ctx)) { ++ ret = PTR_ERR(ctx); ++ goto err; ++ } ++ /* cryptcompress file can be built of ctail items only */ ++ ret = readpages_ctail(file, mapping, pages); ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ if (ret) { ++err: ++ put_pages_list(pages); ++ } ++ return ret; ++} ++ ++static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode) ++{ ++ /* reserve one block to update stat data item */ ++ assert("edward-1193", ++ inode_file_plugin(inode)->estimate.update == ++ estimate_update_common); ++ return estimate_update_common(inode); ++} ++ ++/** ++ * plugin->read ++ * @file: file to read from ++ * @buf: address of user-space buffer ++ * @read_amount: number of bytes to read ++ * @off: position in file to read from ++ */ ++ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size, ++ loff_t * off) ++{ ++ ssize_t result; ++ struct inode *inode; ++ reiser4_context *ctx; ++ struct 
cryptcompress_info *info; ++ reiser4_block_nr needed; ++ ++ inode = file->f_dentry->d_inode; ++ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ info = cryptcompress_inode_data(inode); ++ needed = cryptcompress_estimate_read(inode); ++ ++ result = reiser4_grab_space(needed, BA_CAN_COMMIT); ++ if (result != 0) { ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ result = do_sync_read(file, buf, size, off); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ ++ return result; ++} ++ ++/* Look for a disk cluster and keep lookup result in @found. ++ * If @index > 0, then find disk cluster of the index (@index - 1); ++ * If @index == 0, then find the rightmost disk cluster. ++ * Keep incremented index of the found disk cluster in @found. ++ * @found == 0 means that disk cluster was not found (in the last ++ * case (@index == 0) it means that file doesn't have disk clusters). ++ */ ++static int lookup_disk_cluster(struct inode *inode, cloff_t * found, ++ cloff_t index) ++{ ++ int result; ++ reiser4_key key; ++ loff_t offset; ++ hint_t *hint; ++ lock_handle *lh; ++ lookup_bias bias; ++ coord_t *coord; ++ item_plugin *iplug; ++ ++ assert("edward-1131", inode != NULL); ++ assert("edward-95", cryptcompress_inode_ok(inode)); ++ ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN); ++ offset = ++ (index ? clust_to_off(index, inode) - ++ 1 : get_key_offset(reiser4_max_key())); ++ ++ key_by_inode_cryptcompress(inode, offset, &key); ++ ++ /* find the last item of this object */ ++ result = ++ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */, ++ bias, 0); ++ if (cbk_errored(result)) { ++ done_lh(lh); ++ kfree(hint); ++ return result; ++ } ++ if (result == CBK_COORD_NOTFOUND) { ++ /* no real disk clusters */ ++ done_lh(lh); ++ kfree(hint); ++ *found = 0; ++ return 0; ++ } ++ /* disk cluster is found */ ++ coord = &hint->ext_coord.coord; ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (unlikely(result)) { ++ done_lh(lh); ++ kfree(hint); ++ return result; ++ } ++ iplug = item_plugin_by_coord(coord); ++ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID)); ++ assert("edward-1202", ctail_ok(coord)); ++ ++ item_key_by_coord(coord, &key); ++ *found = off_to_clust(get_key_offset(&key), inode) + 1; ++ ++ assert("edward-1132", ergo(index, index == *found)); ++ ++ zrelse(coord->node); ++ done_lh(lh); ++ kfree(hint); ++ return 0; ++} ++ ++static int find_fake_appended(struct inode *inode, cloff_t * index) ++{ ++ return lookup_disk_cluster(inode, index, ++ 0 /* find last real one */ ); ++} ++ ++/* Set left coord when unit is not found after node_lookup() ++ This takes into account that there can be holes in a sequence ++ of disk clusters */ ++ ++static void adjust_left_coord(coord_t * left_coord) ++{ ++ switch (left_coord->between) { ++ case AFTER_UNIT: ++ left_coord->between = AFTER_ITEM; ++ case AFTER_ITEM: ++ case BEFORE_UNIT: ++ break; ++ default: ++ impossible("edward-1204", "bad left coord to cut"); ++ } ++ return; ++} ++ ++#define CRC_CUT_TREE_MIN_ITERATIONS 64 ++ ++/* plugin->cut_tree_worker */ ++int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key, ++ const reiser4_key * to_key, ++ reiser4_key * smallest_removed, ++ struct inode *object, int 
truncate, ++ int *progress) ++{ ++ lock_handle next_node_lock; ++ coord_t left_coord; ++ int result; ++ ++ assert("edward-1158", tap->coord->node != NULL); ++ assert("edward-1159", znode_is_write_locked(tap->coord->node)); ++ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL); ++ ++ *progress = 0; ++ init_lh(&next_node_lock); ++ ++ while (1) { ++ znode *node; /* node from which items are cut */ ++ node_plugin *nplug; /* node plugin for @node */ ++ ++ node = tap->coord->node; ++ ++ /* Move next_node_lock to the next node on the left. */ ++ result = ++ reiser4_get_left_neighbor(&next_node_lock, node, ++ ZNODE_WRITE_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result != 0 && result != -E_NO_NEIGHBOR) ++ break; ++ /* FIXME-EDWARD: Check can we delete the node as a whole. */ ++ result = reiser4_tap_load(tap); ++ if (result) ++ return result; ++ ++ /* Prepare the second (right) point for cut_node() */ ++ if (*progress) ++ coord_init_last_unit(tap->coord, node); ++ ++ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL) ++ /* set rightmost unit for the items without lookup method */ ++ tap->coord->unit_pos = coord_last_unit_pos(tap->coord); ++ ++ nplug = node->nplug; ++ ++ assert("edward-1161", nplug); ++ assert("edward-1162", nplug->lookup); ++ ++ /* left_coord is leftmost unit cut from @node */ ++ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord); ++ ++ if (IS_CBKERR(result)) ++ break; ++ ++ if (result == CBK_COORD_NOTFOUND) ++ adjust_left_coord(&left_coord); ++ ++ /* adjust coordinates so that they are set to existing units */ ++ if (coord_set_to_right(&left_coord) ++ || coord_set_to_left(tap->coord)) { ++ result = 0; ++ break; ++ } ++ ++ if (coord_compare(&left_coord, tap->coord) == ++ COORD_CMP_ON_RIGHT) { ++ /* keys from @from_key to @to_key are not in the tree */ ++ result = 0; ++ break; ++ } ++ ++ /* cut data from one node */ ++ *smallest_removed = *reiser4_min_key(); ++ result = kill_node_content(&left_coord, ++ tap->coord, ++ from_key, ++ to_key, ++ smallest_removed, ++ next_node_lock.node, ++ object, truncate); ++ reiser4_tap_relse(tap); ++ ++ if (result) ++ break; ++ ++ ++(*progress); ++ ++ /* Check whether all items with keys >= from_key were removed ++ * from the tree. */ ++ if (keyle(smallest_removed, from_key)) ++ /* result = 0; */ ++ break; ++ ++ if (next_node_lock.node == NULL) ++ break; ++ ++ result = reiser4_tap_move(tap, &next_node_lock); ++ done_lh(&next_node_lock); ++ if (result) ++ break; ++ ++ /* Break long cut_tree operation (deletion of a large file) if ++ * atom requires commit. 
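++ * (Editorial note: -E_REPEAT below asks the caller to commit the
++ * current atom and restart the cut.)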
*/ ++ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS ++ && current_atom_should_commit()) { ++ result = -E_REPEAT; ++ break; ++ } ++ } ++ done_lh(&next_node_lock); ++ return result; ++} ++ ++/* Append or expand hole in two steps: ++ * 1) set zeroes to the rightmost page of the rightmost non-fake ++ * logical cluster; ++ * 2) expand hole via fake logical clusters (just increase i_size) ++ */ ++static int cryptcompress_append_hole(struct inode *inode /* with old size */, ++ loff_t new_size) ++{ ++ int result = 0; ++ hint_t *hint; ++ lock_handle *lh; ++ loff_t hole_size; ++ int nr_zeroes; ++ struct reiser4_slide win; ++ struct cluster_handle clust; ++ ++ assert("edward-1133", inode->i_size < new_size); ++ assert("edward-1134", reiser4_schedulable()); ++ assert("edward-1135", cryptcompress_inode_ok(inode)); ++ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE); ++ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0); ++ ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ reiser4_slide_init(&win); ++ cluster_init_read(&clust, &win); ++ clust.hint = hint; ++ ++ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); ++ if (result) ++ goto out; ++ if (off_to_cloff(inode->i_size, inode) == 0) ++ goto append_fake; ++ hole_size = new_size - inode->i_size; ++ nr_zeroes = ++ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode); ++ if (hole_size < nr_zeroes) ++ nr_zeroes = hole_size; ++ set_window(&clust, &win, inode, inode->i_size, ++ inode->i_size + nr_zeroes); ++ win.stat = HOLE_WINDOW; ++ ++ assert("edward-1137", ++ clust.index == off_to_clust(inode->i_size, inode)); ++ ++ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV); ++ ++ assert("edward-1271", !result || result == -ENOSPC); ++ if (result) ++ goto out; ++ assert("edward-1139", ++ clust.dstat == PREP_DISK_CLUSTER || ++ clust.dstat == UNPR_DISK_CLUSTER); ++ ++ assert("edward-1431", hole_size >= nr_zeroes); ++ if (hole_size == nr_zeroes) ++ /* nothing to append anymore */ ++ goto out; ++ append_fake: ++ INODE_SET_SIZE(inode, new_size); ++ out: ++ done_lh(lh); ++ kfree(hint); ++ put_cluster_handle(&clust); ++ return result; ++} ++ ++static int update_cryptcompress_size(struct inode *inode, loff_t new_size, ++ int update_sd) ++{ ++ return (new_size & ((loff_t) (inode_cluster_size(inode)) - 1) ++ ? 0 : reiser4_update_file_size(inode, new_size, update_sd)); ++} ++ ++/* Prune cryptcompress file in two steps: ++ * 1) cut all nominated logical clusters except the leftmost one which ++ * is to be partially truncated. Note, that there can be "holes" ++ * represented by fake logical clusters. ++ * 2) set zeroes and capture leftmost partially truncated logical ++ * cluster, if it is not fake; otherwise prune fake logical cluster ++ * (just decrease i_size). 
++ */ ++static int prune_cryptcompress(struct inode *inode, loff_t new_size, ++ int update_sd, cloff_t aidx) ++{ ++ int result = 0; ++ unsigned nr_zeroes; ++ loff_t to_prune; ++ loff_t old_size; ++ cloff_t ridx; ++ ++ hint_t *hint; ++ lock_handle *lh; ++ struct reiser4_slide win; ++ struct cluster_handle clust; ++ ++ assert("edward-1140", inode->i_size >= new_size); ++ assert("edward-1141", reiser4_schedulable()); ++ assert("edward-1142", cryptcompress_inode_ok(inode)); ++ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE); ++ ++ old_size = inode->i_size; ++ ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ reiser4_slide_init(&win); ++ cluster_init_read(&clust, &win); ++ clust.hint = hint; ++ ++ /* calculate index of the rightmost logical cluster ++ that will be completely truncated */ ++ ridx = size_in_lc(new_size, inode); ++ ++ /* truncate all disk clusters starting from @ridx */ ++ assert("edward-1174", ridx <= aidx); ++ old_size = inode->i_size; ++ if (ridx != aidx) { ++ struct cryptcompress_info * info; ++ info = cryptcompress_inode_data(inode); ++ result = cut_file_items(inode, ++ clust_to_off(ridx, inode), ++ update_sd, ++ clust_to_off(aidx, inode), ++ update_cryptcompress_size); ++ info->trunc_index = ULONG_MAX; ++ if (result) ++ goto out; ++ } ++ /* ++ * there can be pages of fake logical clusters, truncate them ++ */ ++ truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode)); ++ assert("edward-1524", ++ pages_truncate_ok(inode, clust_to_pg(ridx, inode))); ++ /* ++ * now perform partial truncate of last logical cluster ++ */ ++ if (!off_to_cloff(new_size, inode)) { ++ /* no partial truncate is needed */ ++ assert("edward-1145", inode->i_size == new_size); ++ goto truncate_fake; ++ } ++ assert("edward-1146", new_size < inode->i_size); ++ ++ to_prune = inode->i_size - new_size; ++ ++ /* check if the last logical cluster is fake */ ++ result = lookup_disk_cluster(inode, &aidx, ridx); ++ if (result) ++ goto out; ++ if (!aidx) ++ /* yup, this is fake one */ ++ goto truncate_fake; ++ ++ assert("edward-1148", aidx == ridx); ++ ++ /* do partial truncate of the last page cluster, ++ and try to capture this one */ ++ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); ++ if (result) ++ goto out; ++ nr_zeroes = (off_to_pgoff(new_size) ? ++ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0); ++ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes); ++ win.stat = HOLE_WINDOW; ++ ++ assert("edward-1149", clust.index == ridx - 1); ++ ++ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC); ++ if (result) ++ goto out; ++ assert("edward-1151", ++ clust.dstat == PREP_DISK_CLUSTER || ++ clust.dstat == UNPR_DISK_CLUSTER); ++ ++ assert("edward-1191", inode->i_size == new_size); ++ assert("edward-1206", body_truncate_ok(inode, ridx)); ++ truncate_fake: ++ /* drop all the pages that don't have jnodes (i.e. 
pages
++           which cannot be truncated by cut_file_items() because
++           of holes represented by fake disk clusters) including
++           the pages of the partially truncated cluster which was
++           released by prepare_logical_cluster() */
++        INODE_SET_SIZE(inode, new_size);
++        truncate_inode_pages(inode->i_mapping, new_size);
++ out:
++        assert("edward-1334", !result || result == -ENOSPC);
++        assert("edward-1497",
++               pages_truncate_ok(inode, size_in_pages(new_size)));
++
++        done_lh(lh);
++        kfree(hint);
++        put_cluster_handle(&clust);
++        return result;
++}
++
++/* Prepare cryptcompress file for truncate:
++ * prune or append rightmost fake logical clusters (if any)
++ */
++static int start_truncate_fake(struct inode *inode, cloff_t aidx,
++                               loff_t new_size, int update_sd)
++{
++        int result = 0;
++        int bytes;
++
++        if (new_size > inode->i_size) {
++                /* append */
++                if (inode->i_size < clust_to_off(aidx, inode))
++                        /* no fake bytes */
++                        return 0;
++                bytes = new_size - inode->i_size;
++                INODE_SET_SIZE(inode, inode->i_size + bytes);
++        } else {
++                /* prune */
++                if (inode->i_size <= clust_to_off(aidx, inode))
++                        /* no fake bytes */
++                        return 0;
++                bytes = inode->i_size -
++                        max(new_size, clust_to_off(aidx, inode));
++                if (!bytes)
++                        return 0;
++                INODE_SET_SIZE(inode, inode->i_size - bytes);
++                /* In the case of fake prune we need to drop the page
++                   cluster. There are only 2 cases for a partially
++                   truncated page:
++                   1. If it is dirty, then it is anonymous
++                      (was dirtied via mmap), and will be captured
++                      later via ->capture().
++                   2. If it is clean, then it is filled with zeroes.
++                   In both cases we don't need to make it dirty and
++                   capture it here.
++                 */
++                truncate_inode_pages(inode->i_mapping, inode->i_size);
++        }
++        if (update_sd)
++                result = update_sd_cryptcompress(inode);
++        return result;
++}
++
++/**
++ * This is called in setattr_cryptcompress when it is used to truncate,
++ * and in delete_object_cryptcompress
++ */
++static int cryptcompress_truncate(struct inode *inode, /* old size */
++                                  loff_t new_size, /* new size */
++                                  int update_sd)
++{
++        int result;
++        cloff_t aidx;
++
++        result = find_fake_appended(inode, &aidx);
++        if (result)
++                return result;
++        assert("edward-1208",
++               ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
++
++        result = start_truncate_fake(inode, aidx, new_size, update_sd);
++        if (result)
++                return result;
++        if (inode->i_size == new_size)
++                /* nothing to truncate anymore */
++                return 0;
++        result = (inode->i_size < new_size ?
++                  cryptcompress_append_hole(inode, new_size) :
++                  prune_cryptcompress(inode, new_size, update_sd, aidx));
++        if (!result && update_sd)
++                result = update_sd_cryptcompress(inode);
++        return result;
++}
++
++/**
++ * Capture a page cluster.
++ * @clust must be set up by the caller.
++ */
++static int capture_page_cluster(struct cluster_handle * clust,
++                                struct inode * inode)
++{
++        int result;
++
++        assert("edward-1073", clust != NULL);
++        assert("edward-1074", inode != NULL);
++        assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
++
++        result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
++        if (result)
++                return result;
++
++        set_cluster_pages_dirty(clust, inode);
++        result = checkin_logical_cluster(clust, inode);
++        put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
++        if (unlikely(result))
++                put_page_cluster(clust, inode, WRITE_OP);
++        return result;
++}
++
++/* Starting from @index find tagged pages of the same page cluster.
++ * Clear the tag for each of them. Return number of found pages.
++ */ ++static int find_anon_page_cluster(struct address_space * mapping, ++ pgoff_t * index, struct page ** pages) ++{ ++ int i = 0; ++ int found; ++ spin_lock_irq(&mapping->tree_lock); ++ do { ++ /* looking for one page */ ++ found = radix_tree_gang_lookup_tag(&mapping->page_tree, ++ (void **)&pages[i], ++ *index, 1, ++ PAGECACHE_TAG_REISER4_MOVED); ++ if (!found) ++ break; ++ if (!same_page_cluster(pages[0], pages[i])) ++ break; ++ ++ /* found */ ++ page_cache_get(pages[i]); ++ *index = pages[i]->index + 1; ++ ++ radix_tree_tag_clear(&mapping->page_tree, ++ pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ if (last_page_in_cluster(pages[i++])) ++ break; ++ } while (1); ++ spin_unlock_irq(&mapping->tree_lock); ++ return i; ++} ++ ++#define MAX_PAGES_TO_CAPTURE (1024) ++ ++/* Capture anonymous page clusters */ ++static int capture_anon_pages(struct address_space * mapping, pgoff_t * index, ++ int to_capture) ++{ ++ int count = 0; ++ int found = 0; ++ int result = 0; ++ hint_t *hint; ++ lock_handle *lh; ++ struct inode * inode; ++ struct cluster_handle clust; ++ struct page * pages[MAX_CLUSTER_NRPAGES]; ++ ++ assert("edward-1127", mapping != NULL); ++ assert("edward-1128", mapping->host != NULL); ++ assert("edward-1440", mapping->host->i_mapping == mapping); ++ ++ inode = mapping->host; ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ cluster_init_read(&clust, NULL); ++ clust.hint = hint; ++ ++ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); ++ if (result) ++ goto out; ++ ++ while (to_capture > 0) { ++ found = find_anon_page_cluster(mapping, index, pages); ++ if (!found) { ++ *index = (pgoff_t) - 1; ++ break; ++ } ++ move_cluster_forward(&clust, inode, pages[0]->index); ++ result = capture_page_cluster(&clust, inode); ++ ++ put_found_pages(pages, found); /* find_anon_page_cluster */ ++ if (result) ++ break; ++ to_capture -= clust.nr_pages; ++ count += clust.nr_pages; ++ } ++ if (result) { ++ warning("edward-1077", ++ "Capture failed (inode %llu, result=%i, captured=%d)\n", ++ (unsigned long long)get_inode_oid(inode), result, count); ++ } else { ++ assert("edward-1078", ergo(found > 0, count > 0)); ++ if (to_capture <= 0) ++ /* there may be left more pages */ ++ __mark_inode_dirty(inode, I_DIRTY_PAGES); ++ result = count; ++ } ++ out: ++ done_lh(lh); ++ kfree(hint); ++ put_cluster_handle(&clust); ++ return result; ++} ++ ++/* Returns true if inode's mapping has dirty pages ++ which do not belong to any atom */ ++static int cryptcompress_inode_has_anon_pages(struct inode *inode) ++{ ++ int result; ++ spin_lock_irq(&inode->i_mapping->tree_lock); ++ result = radix_tree_tagged(&inode->i_mapping->page_tree, ++ PAGECACHE_TAG_REISER4_MOVED); ++ spin_unlock_irq(&inode->i_mapping->tree_lock); ++ return result; ++} ++ ++/* plugin->writepages */ ++int writepages_cryptcompress(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ int result = 0; ++ long to_capture; ++ pgoff_t nrpages; ++ pgoff_t index = 0; ++ struct inode *inode; ++ struct cryptcompress_info *info; ++ ++ inode = mapping->host; ++ if (!cryptcompress_inode_has_anon_pages(inode)) ++ goto end; ++ info = cryptcompress_inode_data(inode); ++ nrpages = size_in_pages(i_size_read(inode)); ++ ++ if (wbc->sync_mode != WB_SYNC_ALL) ++ to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE); ++ else ++ to_capture = MAX_PAGES_TO_CAPTURE; ++ do { ++ reiser4_context *ctx; ++ ++ ctx = 
reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ result = PTR_ERR(ctx); ++ break; ++ } ++ /* avoid recursive calls to ->sync_inodes */ ++ ctx->nobalance = 1; ++ ++ assert("edward-1079", ++ lock_stack_isclean(get_current_lock_stack())); ++ ++ reiser4_txn_restart_current(); ++ ++ if (get_current_context()->entd) { ++ if (mutex_trylock(&info->checkin_mutex) == 0) { ++ /* the mutex might be occupied by ++ entd caller */ ++ result = RETERR(-EBUSY); ++ reiser4_exit_context(ctx); ++ break; ++ } ++ } else ++ mutex_lock(&info->checkin_mutex); ++ ++ result = capture_anon_pages(inode->i_mapping, &index, ++ to_capture); ++ mutex_unlock(&info->checkin_mutex); ++ ++ if (result < 0) { ++ reiser4_exit_context(ctx); ++ break; ++ } ++ wbc->nr_to_write -= result; ++ if (wbc->sync_mode != WB_SYNC_ALL) { ++ reiser4_exit_context(ctx); ++ break; ++ } ++ result = txnmgr_force_commit_all(inode->i_sb, 0); ++ reiser4_exit_context(ctx); ++ } while (result >= 0 && index < nrpages); ++ ++ end: ++ if (is_in_reiser4_context()) { ++ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { ++ /* there are already pages to flush, flush them out, ++ do not delay until end of reiser4_sync_inodes */ ++ reiser4_writeout(inode->i_sb, wbc); ++ get_current_context()->nr_captured = 0; ++ } ++ } ++ return result; ++} ++ ++/* plugin->ioctl */ ++int ioctl_cryptcompress(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ return RETERR(-ENOSYS); ++} ++ ++/* plugin->mmap */ ++int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma) ++{ ++ int result; ++ struct inode *inode; ++ reiser4_context *ctx; ++ ++ inode = file->f_dentry->d_inode; ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ /* ++ * generic_file_mmap will do update_atime. Grab space for stat data ++ * update. 
++ */ ++ result = reiser4_grab_space_force ++ (inode_file_plugin(inode)->estimate.update(inode), ++ BA_CAN_COMMIT); ++ if (result) { ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ result = generic_file_mmap(file, vma); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* plugin->delete_object */ ++int delete_object_cryptcompress(struct inode *inode) ++{ ++ int result; ++ struct cryptcompress_info * info; ++ ++ assert("edward-429", inode->i_nlink == 0); ++ ++ reiser4_txn_restart_current(); ++ info = cryptcompress_inode_data(inode); ++ ++ mutex_lock(&info->checkin_mutex); ++ result = cryptcompress_truncate(inode, 0, 0); ++ mutex_unlock(&info->checkin_mutex); ++ ++ if (result) { ++ warning("edward-430", ++ "cannot truncate cryptcompress file %lli: %i", ++ (unsigned long long)get_inode_oid(inode), ++ result); ++ } ++ truncate_inode_pages(inode->i_mapping, 0); ++ assert("edward-1487", pages_truncate_ok(inode, 0)); ++ /* and remove stat data */ ++ return reiser4_delete_object_common(inode); ++} ++ ++/* ++ * plugin->setattr ++ * This implements actual truncate (see comments in reiser4/page_cache.c) ++ */ ++int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr) ++{ ++ int result; ++ struct inode *inode; ++ struct cryptcompress_info * info; ++ ++ inode = dentry->d_inode; ++ info = cryptcompress_inode_data(inode); ++ ++ if (attr->ia_valid & ATTR_SIZE) { ++ if (i_size_read(inode) != attr->ia_size) { ++ reiser4_context *ctx; ++ loff_t old_size; ++ ++ ctx = reiser4_init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ result = setattr_dispatch_hook(inode); ++ if (result) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ old_size = i_size_read(inode); ++ inode_check_scale(inode, old_size, attr->ia_size); ++ ++ mutex_lock(&info->checkin_mutex); ++ result = cryptcompress_truncate(inode, ++ attr->ia_size, ++ 1/* update sd */); ++ mutex_unlock(&info->checkin_mutex); ++ if (result) { ++ warning("edward-1192", ++ "truncate_cryptcompress failed: oid %lli, " ++ "old size %lld, new size %lld, retval %d", ++ (unsigned long long) ++ get_inode_oid(inode), old_size, ++ attr->ia_size, result); ++ } ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ } else ++ result = 0; ++ } else ++ result = reiser4_setattr_common(dentry, attr); ++ return result; ++} ++ ++/* plugin->release */ ++int release_cryptcompress(struct inode *inode, struct file *file) ++{ ++ reiser4_context *ctx = reiser4_init_context(inode->i_sb); ++ ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ reiser4_free_file_fsdata(file); ++ reiser4_exit_context(ctx); ++ return 0; ++} ++ ++/* plugin->prepare_write */ ++int write_begin_cryptcompress(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ return do_prepare_write(file, page, from, to); ++} ++ ++/* plugin->commit_write */ ++int write_end_cryptcompress(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ int ret; ++ hint_t *hint; ++ lock_handle *lh; ++ struct inode * inode; ++ struct cluster_handle clust; ++ ++ unlock_page(page); ++ ++ inode = page->mapping->host; ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == NULL) ++ return RETERR(-ENOMEM); ++ hint_init_zero(hint); ++ lh = &hint->lh; ++ ++ cluster_init_read(&clust, NULL); ++ clust.hint = hint; ++ ++ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); ++ if (ret) ++ goto out; ++ clust.index = pg_to_clust(page->index, inode); ++ ret = 
capture_page_cluster(&clust, inode); ++ if (ret) ++ warning("edward-1557", ++ "Capture failed (inode %llu, result=%i)", ++ (unsigned long long)get_inode_oid(inode), ret); ++ out: ++ done_lh(lh); ++ kfree(hint); ++ put_cluster_handle(&clust); ++ return ret; ++} ++ ++/* plugin->bmap */ ++sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock) ++{ ++ return -EINVAL; ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.h +--- linux-2.6.33.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file/cryptcompress.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,616 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* See http://www.namesys.com/cryptcompress_design.html */ ++ ++#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ ) ++#define __FS_REISER4_CRYPTCOMPRESS_H__ ++ ++#include "../../page_cache.h" ++#include "../compress/compress.h" ++#include "../crypto/cipher.h" ++ ++#include <linux/pagemap.h> ++ ++#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT ++#define MAX_CLUSTER_SHIFT 16 ++#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT) ++#define DC_CHECKSUM_SIZE 4 ++ ++#define MIN_LATTICE_FACTOR 1 ++#define MAX_LATTICE_FACTOR 32 ++ ++/* this mask contains all non-standard plugins that might ++ be present in reiser4-specific part of inode managed by ++ cryptcompress file plugin */ ++#define cryptcompress_mask \ ++ ((1 << PSET_FILE) | \ ++ (1 << PSET_CLUSTER) | \ ++ (1 << PSET_CIPHER) | \ ++ (1 << PSET_DIGEST) | \ ++ (1 << PSET_COMPRESSION) | \ ++ (1 << PSET_COMPRESSION_MODE)) ++ ++#if REISER4_DEBUG ++static inline int cluster_shift_ok(int shift) ++{ ++ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT); ++} ++#endif ++ ++#if REISER4_DEBUG ++#define INODE_PGCOUNT(inode) \ ++({ \ ++ assert("edward-1530", inode_file_plugin(inode) == \ ++ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \ ++ atomic_read(&cryptcompress_inode_data(inode)->pgcount); \ ++ }) ++#define INODE_PGCOUNT_INC(inode) \ ++do { \ ++ assert("edward-1531", inode_file_plugin(inode) == \ ++ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \ ++ atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \ ++} while (0) ++#define INODE_PGCOUNT_DEC(inode) \ ++do { \ ++ if (inode_file_plugin(inode) == \ ++ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \ ++ atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \ ++} while (0) ++#else ++#define INODE_PGCOUNT(inode) (0) ++#define INODE_PGCOUNT_INC(inode) ++#define INODE_PGCOUNT_DEC(inode) ++#endif /* REISER4_DEBUG */ ++ ++struct tfm_stream { ++ __u8 *data; ++ size_t size; ++}; ++ ++typedef enum { ++ INPUT_STREAM, ++ OUTPUT_STREAM, ++ LAST_STREAM ++} tfm_stream_id; ++ ++typedef struct tfm_stream * tfm_unit[LAST_STREAM]; ++ ++static inline __u8 *ts_data(struct tfm_stream * stm) ++{ ++ assert("edward-928", stm != NULL); ++ return stm->data; ++} ++ ++static inline size_t ts_size(struct tfm_stream * stm) ++{ ++ assert("edward-929", stm != NULL); ++ return stm->size; ++} ++ ++static inline void set_ts_size(struct tfm_stream * stm, size_t size) ++{ ++ assert("edward-930", stm != NULL); ++ ++ stm->size = size; ++} ++ ++static inline int alloc_ts(struct tfm_stream ** stm) ++{ ++ assert("edward-931", stm); ++ 
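++        /* (editorial note, not in the original patch:) the caller passes
++         * the address of a NULL stream pointer; kzalloc() below installs
++         * the newly allocated tfm_stream there. */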
assert("edward-932", *stm == NULL); ++ ++ *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get()); ++ if (!*stm) ++ return -ENOMEM; ++ return 0; ++} ++ ++static inline void free_ts(struct tfm_stream * stm) ++{ ++ assert("edward-933", !ts_data(stm)); ++ assert("edward-934", !ts_size(stm)); ++ ++ kfree(stm); ++} ++ ++static inline int alloc_ts_data(struct tfm_stream * stm, size_t size) ++{ ++ assert("edward-935", !ts_data(stm)); ++ assert("edward-936", !ts_size(stm)); ++ assert("edward-937", size != 0); ++ ++ stm->data = reiser4_vmalloc(size); ++ if (!stm->data) ++ return -ENOMEM; ++ set_ts_size(stm, size); ++ return 0; ++} ++ ++static inline void free_ts_data(struct tfm_stream * stm) ++{ ++ assert("edward-938", equi(ts_data(stm), ts_size(stm))); ++ ++ if (ts_data(stm)) ++ vfree(ts_data(stm)); ++ memset(stm, 0, sizeof *stm); ++} ++ ++/* Write modes for item conversion in flush convert phase */ ++typedef enum { ++ CRC_APPEND_ITEM = 1, ++ CRC_OVERWRITE_ITEM = 2, ++ CRC_CUT_ITEM = 3 ++} cryptcompress_write_mode_t; ++ ++typedef enum { ++ LC_INVAL = 0, /* invalid value */ ++ LC_APPOV = 1, /* append and/or overwrite */ ++ LC_TRUNC = 2 /* truncate */ ++} logical_cluster_op; ++ ++/* Transform cluster. ++ * Intermediate state between page cluster and disk cluster ++ * Is used for data transform (compression/encryption) ++ */ ++struct tfm_cluster { ++ coa_set coa; /* compression algorithms info */ ++ tfm_unit tun; /* plain and transformed streams */ ++ tfm_action act; ++ int uptodate; ++ int lsize; /* number of bytes in logical cluster */ ++ int len; /* length of the transform stream */ ++}; ++ ++static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id, ++ tfm_action act) ++{ ++ return tc->coa[id][act]; ++} ++ ++static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id, ++ tfm_action act, coa_t coa) ++{ ++ tc->coa[id][act] = coa; ++} ++ ++static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug) ++{ ++ coa_t coa; ++ ++ coa = cplug->alloc(tc->act); ++ if (IS_ERR(coa)) ++ return PTR_ERR(coa); ++ set_coa(tc, cplug->h.id, tc->act, coa); ++ return 0; ++} ++ ++static inline int ++grab_coa(struct tfm_cluster * tc, compression_plugin * cplug) ++{ ++ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ? 
++ alloc_coa(tc, cplug) : 0); ++} ++ ++static inline void free_coa_set(struct tfm_cluster * tc) ++{ ++ tfm_action j; ++ reiser4_compression_id i; ++ compression_plugin *cplug; ++ ++ assert("edward-810", tc != NULL); ++ ++ for (j = 0; j < TFMA_LAST; j++) ++ for (i = 0; i < LAST_COMPRESSION_ID; i++) { ++ if (!get_coa(tc, i, j)) ++ continue; ++ cplug = compression_plugin_by_id(i); ++ assert("edward-812", cplug->free != NULL); ++ cplug->free(get_coa(tc, i, j), j); ++ set_coa(tc, i, j, 0); ++ } ++ return; ++} ++ ++static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc, ++ tfm_stream_id id) ++{ ++ return tc->tun[id]; ++} ++ ++static inline void set_tfm_stream(struct tfm_cluster * tc, ++ tfm_stream_id id, struct tfm_stream * ts) ++{ ++ tc->tun[id] = ts; ++} ++ ++static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id) ++{ ++ return ts_data(get_tfm_stream(tc, id)); ++} ++ ++static inline void set_tfm_stream_data(struct tfm_cluster * tc, ++ tfm_stream_id id, __u8 * data) ++{ ++ get_tfm_stream(tc, id)->data = data; ++} ++ ++static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id) ++{ ++ return ts_size(get_tfm_stream(tc, id)); ++} ++ ++static inline void ++set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size) ++{ ++ get_tfm_stream(tc, id)->size = size; ++} ++ ++static inline int ++alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id) ++{ ++ assert("edward-939", tc != NULL); ++ assert("edward-940", !get_tfm_stream(tc, id)); ++ ++ tc->tun[id] = kzalloc(sizeof(struct tfm_stream), ++ reiser4_ctx_gfp_mask_get()); ++ if (!tc->tun[id]) ++ return -ENOMEM; ++ return alloc_ts_data(get_tfm_stream(tc, id), size); ++} ++ ++static inline int ++realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id) ++{ ++ assert("edward-941", tfm_stream_size(tc, id) < size); ++ free_ts_data(get_tfm_stream(tc, id)); ++ return alloc_ts_data(get_tfm_stream(tc, id), size); ++} ++ ++static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id) ++{ ++ free_ts_data(get_tfm_stream(tc, id)); ++ free_ts(get_tfm_stream(tc, id)); ++ set_tfm_stream(tc, id, 0); ++} ++ ++static inline unsigned coa_overrun(compression_plugin * cplug, int ilen) ++{ ++ return (cplug->overrun != NULL ? 
cplug->overrun(ilen) : 0); ++} ++ ++static inline void free_tfm_unit(struct tfm_cluster * tc) ++{ ++ tfm_stream_id id; ++ for (id = 0; id < LAST_STREAM; id++) { ++ if (!get_tfm_stream(tc, id)) ++ continue; ++ free_tfm_stream(tc, id); ++ } ++} ++ ++static inline void put_tfm_cluster(struct tfm_cluster * tc) ++{ ++ assert("edward-942", tc != NULL); ++ free_coa_set(tc); ++ free_tfm_unit(tc); ++} ++ ++static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc) ++{ ++ assert("edward-943", tc != NULL); ++ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1); ++ return (tc->uptodate == 1); ++} ++ ++static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc) ++{ ++ assert("edward-945", tc != NULL); ++ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1); ++ tc->uptodate = 1; ++ return; ++} ++ ++static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc) ++{ ++ assert("edward-947", tc != NULL); ++ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1); ++ tc->uptodate = 0; ++ return; ++} ++ ++static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id) ++{ ++ return (get_tfm_stream(tc, id) && ++ tfm_stream_data(tc, id) && tfm_stream_size(tc, id)); ++} ++ ++static inline int tfm_cluster_is_set(struct tfm_cluster * tc) ++{ ++ int i; ++ for (i = 0; i < LAST_STREAM; i++) ++ if (!tfm_stream_is_set(tc, i)) ++ return 0; ++ return 1; ++} ++ ++static inline void alternate_streams(struct tfm_cluster * tc) ++{ ++ struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM); ++ ++ set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM)); ++ set_tfm_stream(tc, OUTPUT_STREAM, tmp); ++} ++ ++/* Set of states to indicate a kind of data ++ * that will be written to the window */ ++typedef enum { ++ DATA_WINDOW, /* user's data */ ++ HOLE_WINDOW /* zeroes (such kind of data can be written ++ * if we start to write from offset > i_size) */ ++} window_stat; ++ ++/* Window (of logical cluster size) discretely sliding along a file. ++ * Is used to locate hole region in a logical cluster to be properly ++ * represented on disk. ++ * We split a write to cryptcompress file into writes to its logical ++ * clusters. Before writing to a logical cluster we set a window, i.e. ++ * calculate values of the following fields: ++ */ ++struct reiser4_slide { ++ unsigned off; /* offset to write from */ ++ unsigned count; /* number of bytes to write */ ++ unsigned delta; /* number of bytes to append to the hole */ ++ window_stat stat; /* what kind of data will be written starting ++ from @off */ ++}; ++ ++/* Possible states of a disk cluster */ ++typedef enum { ++ INVAL_DISK_CLUSTER, /* unknown state */ ++ PREP_DISK_CLUSTER, /* disk cluster got converted by flush ++ * at least 1 time */ ++ UNPR_DISK_CLUSTER, /* disk cluster just created and should be ++ * converted by flush */ ++ FAKE_DISK_CLUSTER, /* disk cluster doesn't exist neither in memory ++ * nor on disk */ ++ TRNC_DISK_CLUSTER /* disk cluster is partially truncated */ ++} disk_cluster_stat; ++ ++/* The following structure represents various stages of the same logical ++ * cluster of index @index: ++ * . fixed slide ++ * . page cluster (stage in primary cache) ++ * . transform cluster (transition stage) ++ * . disk cluster (stage in secondary cache) ++ * This structure is used in transition and synchronizing operations, e.g. ++ * transform cluster is a transition state when synchronizing page cluster ++ * and disk cluster. ++ * FIXME: Encapsulate page cluster, disk cluster. 
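++ * Example (assuming 4K pages, i.e. PAGE_CACHE_SHIFT == 12): with the ++ * maximal cluster shift of 16 a logical cluster covers 64K, that is ++ * MAX_CLUSTER_NRPAGES == (1U << 16 >> 12) == 16 pages, so the page ++ * cluster of index 3 would cache file pages 48..63.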
++ */ ++struct cluster_handle { ++ cloff_t index; /* offset in a file (unit is a cluster size) */ ++ int index_valid; /* for validating the index above, if needed */ ++ struct file *file; /* host file */ ++ ++ /* logical cluster */ ++ struct reiser4_slide *win; /* sliding window to locate holes */ ++ logical_cluster_op op; /* logical cluster operation (truncate or ++ append/overwrite) */ ++ /* transform cluster */ ++ struct tfm_cluster tc; /* contains all needed info to synchronize ++ page cluster and disk cluster) */ ++ /* page cluster */ ++ int nr_pages; /* number of pages of current checkin action */ ++ int old_nrpages; /* number of pages of last checkin action */ ++ struct page **pages; /* attached pages */ ++ jnode * node; /* jnode for capture */ ++ ++ /* disk cluster */ ++ hint_t *hint; /* current position in the tree */ ++ disk_cluster_stat dstat; /* state of the current disk cluster */ ++ int reserved; /* is space for disk cluster reserved */ ++#if REISER4_DEBUG ++ reiser4_context *ctx; ++ int reserved_prepped; ++ int reserved_unprepped; ++#endif ++ ++}; ++ ++static inline __u8 * tfm_input_data (struct cluster_handle * clust) ++{ ++ return tfm_stream_data(&clust->tc, INPUT_STREAM); ++} ++ ++static inline __u8 * tfm_output_data (struct cluster_handle * clust) ++{ ++ return tfm_stream_data(&clust->tc, OUTPUT_STREAM); ++} ++ ++static inline int reset_cluster_pgset(struct cluster_handle * clust, ++ int nrpages) ++{ ++ assert("edward-1057", clust->pages != NULL); ++ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages); ++ return 0; ++} ++ ++static inline int alloc_cluster_pgset(struct cluster_handle * clust, ++ int nrpages) ++{ ++ assert("edward-949", clust != NULL); ++ assert("edward-1362", clust->pages == NULL); ++ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES); ++ ++ clust->pages = kzalloc(sizeof(*clust->pages) * nrpages, ++ reiser4_ctx_gfp_mask_get()); ++ if (!clust->pages) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++static inline void move_cluster_pgset(struct cluster_handle *clust, ++ struct page ***pages, int * nr_pages) ++{ ++ assert("edward-1545", clust != NULL && clust->pages != NULL); ++ assert("edward-1546", pages != NULL && *pages == NULL); ++ *pages = clust->pages; ++ *nr_pages = clust->nr_pages; ++ clust->pages = NULL; ++} ++ ++static inline void free_cluster_pgset(struct cluster_handle * clust) ++{ ++ assert("edward-951", clust->pages != NULL); ++ kfree(clust->pages); ++ clust->pages = NULL; ++} ++ ++static inline void put_cluster_handle(struct cluster_handle * clust) ++{ ++ assert("edward-435", clust != NULL); ++ ++ put_tfm_cluster(&clust->tc); ++ if (clust->pages) ++ free_cluster_pgset(clust); ++ memset(clust, 0, sizeof *clust); ++} ++ ++static inline void inc_keyload_count(struct reiser4_crypto_info * data) ++{ ++ assert("edward-1410", data != NULL); ++ data->keyload_count++; ++} ++ ++static inline void dec_keyload_count(struct reiser4_crypto_info * data) ++{ ++ assert("edward-1411", data != NULL); ++ assert("edward-1412", data->keyload_count > 0); ++ data->keyload_count--; ++} ++ ++static inline int capture_cluster_jnode(jnode * node) ++{ ++ return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); ++} ++ ++/* cryptcompress specific part of reiser4_inode */ ++struct cryptcompress_info { ++ struct mutex checkin_mutex; /* This is to serialize ++ * checkin_logical_cluster operations */ ++ cloff_t trunc_index; /* Index of the leftmost truncated disk ++ * cluster (to resolve races with read) */ ++ struct reiser4_crypto_info *crypt; ++ /* ++ * 
the following 2 fields are controlled by compression mode plugin ++ */ ++ int compress_toggle; /* Current status of compressibility */ ++ int lattice_factor; /* Factor of dynamic lattice. FIXME: Have ++ * a compression_toggle to keep the factor ++ */ ++#if REISER4_DEBUG ++ atomic_t pgcount; /* number of grabbed pages */ ++#endif ++}; ++ ++static inline void set_compression_toggle (struct cryptcompress_info * info, int val) ++{ ++ info->compress_toggle = val; ++} ++ ++static inline int get_compression_toggle (struct cryptcompress_info * info) ++{ ++ return info->compress_toggle; ++} ++ ++static inline int compression_is_on(struct cryptcompress_info * info) ++{ ++ return get_compression_toggle(info) == 1; ++} ++ ++static inline void turn_on_compression(struct cryptcompress_info * info) ++{ ++ set_compression_toggle(info, 1); ++} ++ ++static inline void turn_off_compression(struct cryptcompress_info * info) ++{ ++ set_compression_toggle(info, 0); ++} ++ ++static inline void set_lattice_factor(struct cryptcompress_info * info, int val) ++{ ++ info->lattice_factor = val; ++} ++ ++static inline int get_lattice_factor(struct cryptcompress_info * info) ++{ ++ return info->lattice_factor; ++} ++ ++struct cryptcompress_info *cryptcompress_inode_data(const struct inode *); ++int equal_to_rdk(znode *, const reiser4_key *); ++int goto_right_neighbor(coord_t *, lock_handle *); ++int cryptcompress_inode_ok(struct inode *inode); ++int coord_is_unprepped_ctail(const coord_t * coord); ++extern int do_readpage_ctail(struct inode *, struct cluster_handle *, ++ struct page * page, znode_lock_mode mode); ++extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust, ++ struct inode * inode); ++extern int readpages_cryptcompress(struct file*, struct address_space*, ++ struct list_head*, unsigned); ++int bind_cryptcompress(struct inode *child, struct inode *parent); ++void destroy_inode_cryptcompress(struct inode * inode); ++int grab_page_cluster(struct inode *inode, struct cluster_handle * clust, ++ rw_op rw); ++int write_dispatch_hook(struct file *file, struct inode * inode, ++ loff_t pos, struct cluster_handle * clust, ++ struct dispatch_context * cont); ++int setattr_dispatch_hook(struct inode * inode); ++struct reiser4_crypto_info * inode_crypto_info(struct inode * inode); ++void inherit_crypto_info_common(struct inode * parent, struct inode * object, ++ int (*can_inherit)(struct inode * child, ++ struct inode * parent)); ++void reiser4_attach_crypto_info(struct inode * inode, ++ struct reiser4_crypto_info * info); ++void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new); ++struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode); ++ ++static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info) ++{ ++ return info->cipher; ++} ++ ++static inline void info_set_cipher(struct reiser4_crypto_info * info, ++ struct crypto_blkcipher * tfm) ++{ ++ info->cipher = tfm; ++} ++ ++static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info) ++{ ++ return info->digest; ++} ++ ++static inline void info_set_digest(struct reiser4_crypto_info * info, ++ struct crypto_hash * tfm) ++{ ++ info->digest = tfm; ++} ++ ++static inline void put_cluster_page(struct page * page) ++{ ++ page_cache_release(page); ++} ++ ++#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */ ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/file.c linux-2.6.33/fs/reiser4/plugin/file/file.c +--- linux-2.6.33.orig/fs/reiser4/plugin/file/file.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file/file.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,2688 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * this file contains implementations of inode/file/address_space/file plugin ++ * operations specific for "unix file plugin" (plugin id is ++ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only ++ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have ++ * no items but stat data) ++ */ ++ ++#include "../../inode.h" ++#include "../../super.h" ++#include "../../tree_walk.h" ++#include "../../carry.h" ++#include "../../page_cache.h" ++#include "../../ioctl.h" ++#include "../object.h" ++#include "../cluster.h" ++#include "../../safe_link.h" ++ ++#include <linux/writeback.h> ++#include <linux/pagevec.h> ++#include <linux/syscalls.h> ++ ++ ++static int unpack(struct file *file, struct inode *inode, int forever); ++static void drop_access(struct unix_file_info *); ++static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key, ++ znode_lock_mode lock_mode); ++ ++/* Get exclusive access and make sure that file is not partially ++ * converted (It may happen that another process is doing tail ++ * conversion. If so, wait until it completes) ++ */ ++static inline void get_exclusive_access_careful(struct unix_file_info * uf_info, ++ struct inode *inode) ++{ ++ do { ++ get_exclusive_access(uf_info); ++ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) ++ break; ++ drop_exclusive_access(uf_info); ++ schedule(); ++ } while (1); ++} ++ ++/* get unix file plugin specific portion of inode */ ++struct unix_file_info *unix_file_inode_data(const struct inode *inode) ++{ ++ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info; ++} ++ ++/** ++ * equal_to_rdk - compare key and znode's right delimiting key ++ * @node: node whose right delimiting key to compare with @key ++ * @key: key to compare with @node's right delimiting key ++ * ++ * Returns true if @key is equal to right delimiting key of @node. ++ */ ++int equal_to_rdk(znode *node, const reiser4_key *key) ++{ ++ int result; ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = keyeq(key, znode_get_rd_key(node)); ++ read_unlock_dk(znode_get_tree(node)); ++ return result; ++} ++ ++#if REISER4_DEBUG ++ ++/** ++ * equal_to_ldk - compare key and znode's left delimiting key ++ * @node: node whose left delimiting key to compare with @key ++ * @key: key to compare with @node's left delimiting key ++ * ++ * Returns true if @key is equal to left delimiting key of @node. ++ */ ++int equal_to_ldk(znode *node, const reiser4_key *key) ++{ ++ int result; ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = keyeq(key, znode_get_ld_key(node)); ++ read_unlock_dk(znode_get_tree(node)); ++ return result; ++} ++ ++/** ++ * check_coord - check whether coord corresponds to key ++ * @coord: coord to check ++ * @key: key @coord has to correspond to ++ * ++ * Returns true if @coord is set as if it was set as result of lookup with @key ++ * in coord->node. 
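++ * ++ * Compiled under REISER4_DEBUG only; used in assertions (e.g. "vs-1207" ++ * in reiser4_set_hint()) to verify that a coord still matches its key.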
++ */ ++static int check_coord(const coord_t *coord, const reiser4_key *key) ++{ ++ coord_t twin; ++ ++ node_plugin_by_node(coord->node)->lookup(coord->node, key, ++ FIND_MAX_NOT_MORE_THAN, &twin); ++ return coords_equal(coord, &twin); ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/** ++ * init_uf_coord - initialize extended coord ++ * @uf_coord: ++ * @lh: ++ * ++ * ++ */ ++void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh) ++{ ++ coord_init_zero(&uf_coord->coord); ++ coord_clear_iplug(&uf_coord->coord); ++ uf_coord->lh = lh; ++ init_lh(lh); ++ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension)); ++ uf_coord->valid = 0; ++} ++ ++static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) ++{ ++ assert("vs-1333", uf_coord->valid == 0); ++ ++ if (coord_is_between_items(&uf_coord->coord)) ++ return; ++ ++ assert("vs-1348", ++ item_plugin_by_coord(&uf_coord->coord)->s.file. ++ init_coord_extension); ++ ++ item_body_by_coord(&uf_coord->coord); ++ item_plugin_by_coord(&uf_coord->coord)->s.file. ++ init_coord_extension(uf_coord, offset); ++} ++ ++/** ++ * goto_right_neighbor - lock right neighbor, drop current node lock ++ * @coord: ++ * @lh: ++ * ++ * Obtain lock on right neighbor and drop lock on current node. ++ */ ++int goto_right_neighbor(coord_t *coord, lock_handle *lh) ++{ ++ int result; ++ lock_handle lh_right; ++ ++ assert("vs-1100", znode_is_locked(coord->node)); ++ ++ init_lh(&lh_right); ++ result = reiser4_get_right_neighbor(&lh_right, coord->node, ++ znode_is_wlocked(coord->node) ? ++ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result) { ++ done_lh(&lh_right); ++ return result; ++ } ++ ++ /* ++ * we hold two longterm locks on neighboring nodes. Unlock left of ++ * them ++ */ ++ done_lh(lh); ++ ++ coord_init_first_unit_nocheck(coord, lh_right.node); ++ move_lh(lh, &lh_right); ++ ++ return 0; ++ ++} ++ ++/** ++ * set_file_state ++ * @uf_info: ++ * @cbk_result: ++ * @level: ++ * ++ * This is to be used by find_file_item and in find_file_state to ++ * determine real state of file ++ */ ++static void set_file_state(struct unix_file_info *uf_info, int cbk_result, ++ tree_level level) ++{ ++ if (cbk_errored(cbk_result)) ++ /* error happened in find_file_item */ ++ return; ++ ++ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL); ++ ++ if (uf_info->container == UF_CONTAINER_UNKNOWN) { ++ if (cbk_result == CBK_COORD_NOTFOUND) ++ uf_info->container = UF_CONTAINER_EMPTY; ++ else if (level == LEAF_LEVEL) ++ uf_info->container = UF_CONTAINER_TAILS; ++ else ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ } else { ++ /* ++ * file state is known, check whether it is set correctly if ++ * file is not being tail converted ++ */ ++ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info), ++ REISER4_PART_IN_CONV)) { ++ assert("vs-1162", ++ ergo(level == LEAF_LEVEL && ++ cbk_result == CBK_COORD_FOUND, ++ uf_info->container == UF_CONTAINER_TAILS)); ++ assert("vs-1165", ++ ergo(level == TWIG_LEVEL && ++ cbk_result == CBK_COORD_FOUND, ++ uf_info->container == UF_CONTAINER_EXTENTS)); ++ } ++ } ++} ++ ++int find_file_item_nohint(coord_t *coord, lock_handle *lh, ++ const reiser4_key *key, znode_lock_mode lock_mode, ++ struct inode *inode) ++{ ++ return reiser4_object_lookup(inode, key, coord, lh, lock_mode, ++ FIND_MAX_NOT_MORE_THAN, ++ TWIG_LEVEL, LEAF_LEVEL, ++ (lock_mode == ZNODE_READ_LOCK) ? 
CBK_UNIQUE : ++ (CBK_UNIQUE | CBK_FOR_INSERT), ++ NULL /* ra_info */ ); ++} ++ ++/** ++ * find_file_item - look for file item in the tree ++ * @hint: provides coordinate, lock handle, seal ++ * @key: key for search ++ * @mode: mode of lock to put on returned node ++ * @ra_info: ++ * @inode: ++ * ++ * This finds position in the tree corresponding to @key. It first tries to use ++ * @hint's seal if it is set. ++ */ ++int find_file_item(hint_t *hint, const reiser4_key *key, ++ znode_lock_mode lock_mode, ++ struct inode *inode) ++{ ++ int result; ++ coord_t *coord; ++ lock_handle *lh; ++ ++ assert("nikita-3030", reiser4_schedulable()); ++ assert("vs-1707", hint != NULL); ++ assert("vs-47", inode != NULL); ++ ++ coord = &hint->ext_coord.coord; ++ lh = hint->ext_coord.lh; ++ init_lh(lh); ++ ++ result = hint_validate(hint, key, 1 /* check key */, lock_mode); ++ if (!result) { ++ if (coord->between == AFTER_UNIT && ++ equal_to_rdk(coord->node, key)) { ++ result = goto_right_neighbor(coord, lh); ++ if (result == -E_NO_NEIGHBOR) ++ return RETERR(-EIO); ++ if (result) ++ return result; ++ assert("vs-1152", equal_to_ldk(coord->node, key)); ++ /* ++ * we moved to different node. Invalidate coord ++ * extension, zload is necessary to init it again ++ */ ++ hint->ext_coord.valid = 0; ++ } ++ ++ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND, ++ znode_get_level(coord->node)); ++ ++ return CBK_COORD_FOUND; ++ } ++ ++ coord_init_zero(coord); ++ result = find_file_item_nohint(coord, lh, key, lock_mode, inode); ++ set_file_state(unix_file_inode_data(inode), result, ++ znode_get_level(coord->node)); ++ ++ /* FIXME: we might already have coord extension initialized */ ++ hint->ext_coord.valid = 0; ++ return result; ++} ++ ++/* plugin->u.file.write_flowom = NULL ++ plugin->u.file.read_flow = NULL */ ++ ++void hint_init_zero(hint_t * hint) ++{ ++ memset(hint, 0, sizeof(*hint)); ++ init_lh(&hint->lh); ++ hint->ext_coord.lh = &hint->lh; ++} ++ ++static int find_file_state(struct inode *inode, struct unix_file_info *uf_info) ++{ ++ int result; ++ reiser4_key key; ++ coord_t coord; ++ lock_handle lh; ++ ++ assert("vs-1628", ea_obtained(uf_info)); ++ ++ if (uf_info->container == UF_CONTAINER_UNKNOWN) { ++ key_by_inode_and_offset_common(inode, 0, &key); ++ init_lh(&lh); ++ result = find_file_item_nohint(&coord, &lh, &key, ++ ZNODE_READ_LOCK, inode); ++ set_file_state(uf_info, result, znode_get_level(coord.node)); ++ done_lh(&lh); ++ if (!cbk_errored(result)) ++ result = 0; ++ } else ++ result = 0; ++ assert("vs-1074", ++ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN)); ++ reiser4_txn_restart_current(); ++ return result; ++} ++ ++/** ++ * Estimate and reserve space needed to truncate page ++ * which gets partially truncated: one block for page ++ * itself, stat-data update (estimate_one_insert_into_item) ++ * and one item insertion (estimate_one_insert_into_item) ++ * which may happen if page corresponds to hole extent and ++ * unallocated one will have to be created ++ */ ++static int reserve_partial_page(reiser4_tree * tree) ++{ ++ grab_space_enable(); ++ return reiser4_grab_reserved(reiser4_get_current_sb(), ++ 1 + ++ 2 * estimate_one_insert_into_item(tree), ++ BA_CAN_COMMIT); ++} ++ ++/* estimate and reserve space needed to cut one item and update one stat data */ ++static int reserve_cut_iteration(reiser4_tree * tree) ++{ ++ __u64 estimate = estimate_one_item_removal(tree) ++ + estimate_one_insert_into_item(tree); ++ ++ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack())); 
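++ /* ++ * Reserving with BA_CAN_COMMIT may commit an atom to free space; ++ * presumably that is why the lock stack must be clean at this point: ++ * a commit must not happen while long-term znode locks are held. ++ */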
++ ++ grab_space_enable(); ++ /* We need to double our estimate now that we can delete more than one ++ node. */ ++ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2, ++ BA_CAN_COMMIT); ++} ++ ++int reiser4_update_file_size(struct inode *inode, loff_t new_size, ++ int update_sd) ++{ ++ int result = 0; ++ ++ INODE_SET_SIZE(inode, new_size); ++ if (update_sd) { ++ inode->i_ctime = inode->i_mtime = CURRENT_TIME; ++ result = reiser4_update_sd(inode); ++ } ++ return result; ++} ++ ++/** ++ * Cut file items one by one starting from the last one until ++ * new file size (inode->i_size) is reached. Reserve space ++ * and update file stat data on every single cut from the tree. ++ */ ++int cut_file_items(struct inode *inode, loff_t new_size, ++ int update_sd, loff_t cur_size, ++ int (*update_actor) (struct inode *, loff_t, int)) ++{ ++ reiser4_key from_key, to_key; ++ reiser4_key smallest_removed; ++ file_plugin *fplug = inode_file_plugin(inode); ++ int result; ++ int progress = 0; ++ ++ assert("vs-1248", ++ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) || ++ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); ++ ++ fplug->key_by_inode(inode, new_size, &from_key); ++ to_key = from_key; ++ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ ); ++ /* this loop normally runs just once */ ++ while (1) { ++ result = reserve_cut_iteration(reiser4_tree_by_inode(inode)); ++ if (result) ++ break; ++ ++ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key, ++ &smallest_removed, inode, 1, ++ &progress); ++ if (result == -E_REPEAT) { ++ /** ++ * -E_REPEAT is a signal to interrupt a long ++ * file truncation process ++ */ ++ if (progress) { ++ result = update_actor(inode, ++ get_key_offset(&smallest_removed), ++ update_sd); ++ if (result) ++ break; ++ } ++ /* the below does up(sbinfo->delete_mutex). ++ * Do not get fooled */ ++ reiser4_release_reserved(inode->i_sb); ++ /** ++ * reiser4_cut_tree_object() was probably interrupted ++ * because the current atom requires commit; we have to ++ * release the transaction handle to allow atom commit. ++ */ ++ reiser4_txn_restart_current(); ++ continue; ++ } ++ if (result ++ && !(result == CBK_COORD_NOTFOUND && new_size == 0 ++ && inode->i_size == 0)) ++ break; ++ ++ set_key_offset(&smallest_removed, new_size); ++ /* Final sd update after the file gets its correct size */ ++ result = update_actor(inode, get_key_offset(&smallest_removed), ++ update_sd); ++ break; ++ } ++ ++ /* the below does up(sbinfo->delete_mutex). Do not get fooled */ ++ reiser4_release_reserved(inode->i_sb); ++ ++ return result; ++} ++ ++int find_or_create_extent(struct page *page); ++ ++/* part of truncate_file_body: it is called when truncate is used to make file ++ shorter */ ++static int shorten_file(struct inode *inode, loff_t new_size) ++{ ++ int result; ++ struct page *page; ++ int padd_from; ++ unsigned long index; ++ struct unix_file_info *uf_info; ++ ++ /* ++ * all items of ordinary reiser4 file are grouped together. That is why ++ * we can use reiser4_cut_tree. 
Plan B files (for instance) can not be ++ * truncated that simply ++ */ ++ result = cut_file_items(inode, new_size, 1 /*update_sd */ , ++ get_key_offset(reiser4_max_key()), ++ reiser4_update_file_size); ++ if (result) ++ return result; ++ ++ uf_info = unix_file_inode_data(inode); ++ assert("vs-1105", new_size == inode->i_size); ++ if (new_size == 0) { ++ uf_info->container = UF_CONTAINER_EMPTY; ++ return 0; ++ } ++ ++ result = find_file_state(inode, uf_info); ++ if (result) ++ return result; ++ if (uf_info->container == UF_CONTAINER_TAILS) ++ /* ++ * No need to worry about zeroing last page after new file ++ * end ++ */ ++ return 0; ++ ++ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1); ++ if (!padd_from) ++ /* file is truncated to page boundary */ ++ return 0; ++ ++ result = reserve_partial_page(reiser4_tree_by_inode(inode)); ++ if (result) { ++ reiser4_release_reserved(inode->i_sb); ++ return result; ++ } ++ ++ /* last page is partially truncated - zero its content */ ++ index = (inode->i_size >> PAGE_CACHE_SHIFT); ++ page = read_mapping_page(inode->i_mapping, index, NULL); ++ if (IS_ERR(page)) { ++ /* ++ * the below does up(sbinfo->delete_mutex). Do not get ++ * confused ++ */ ++ reiser4_release_reserved(inode->i_sb); ++ if (likely(PTR_ERR(page) == -EINVAL)) { ++ /* looks like file is built of tail items */ ++ return 0; ++ } ++ return PTR_ERR(page); ++ } ++ wait_on_page_locked(page); ++ if (!PageUptodate(page)) { ++ page_cache_release(page); ++ /* ++ * the below does up(sbinfo->delete_mutex). Do not get ++ * confused ++ */ ++ reiser4_release_reserved(inode->i_sb); ++ return RETERR(-EIO); ++ } ++ ++ /* ++ * if page correspons to hole extent unit - unallocated one will be ++ * created here. This is not necessary ++ */ ++ result = find_or_create_extent(page); ++ ++ /* ++ * FIXME: cut_file_items has already updated inode. Probably it would ++ * be better to update it here when file is really truncated ++ */ ++ if (result) { ++ page_cache_release(page); ++ /* ++ * the below does up(sbinfo->delete_mutex). Do not get ++ * confused ++ */ ++ reiser4_release_reserved(inode->i_sb); ++ return result; ++ } ++ ++ lock_page(page); ++ assert("vs-1066", PageLocked(page)); ++ zero_user_segment(page, padd_from, PAGE_CACHE_SIZE); ++ unlock_page(page); ++ page_cache_release(page); ++ /* the below does up(sbinfo->delete_mutex). Do not get confused */ ++ reiser4_release_reserved(inode->i_sb); ++ return 0; ++} ++ ++/** ++ * should_have_notail ++ * @uf_info: ++ * @new_size: ++ * ++ * Calls formatting plugin to see whether file of size @new_size has to be ++ * stored in unformatted nodes or in tail items. 0 is returned for later case. ++ */ ++static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size) ++{ ++ if (!uf_info->tplug) ++ return 1; ++ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info), ++ new_size); ++ ++} ++ ++/** ++ * truncate_file_body - change length of file ++ * @inode: inode of file ++ * @new_size: new file length ++ * ++ * Adjusts items file @inode is built of to match @new_size. It may either cut ++ * items or add them to represent a hole at the end of file. The caller has to ++ * obtain exclusive access to the file. 
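++ * ++ * An expanding truncate builds the hole by issuing a zero-length write ++ * at @new_size (converting tails to extents first if the formatting ++ * plugin prescribes extents for the grown file); a shrinking truncate ++ * is delegated to shorten_file().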
++ */ ++static int truncate_file_body(struct inode *inode, struct iattr *attr) ++{ ++ int result; ++ loff_t new_size = attr->ia_size; ++ ++ if (inode->i_size < new_size) { ++ /* expanding truncate */ ++ struct unix_file_info *uf_info = unix_file_inode_data(inode); ++ ++ result = find_file_state(inode, uf_info); ++ if (result) ++ return result; ++ ++ if (should_have_notail(uf_info, new_size)) { ++ /* ++ * file of size @new_size has to be built of ++ * extents. If it is built of tails - convert to ++ * extents ++ */ ++ if (uf_info->container == UF_CONTAINER_TAILS) { ++ /* ++ * if file is being converted by another process ++ * - wait until it completes ++ */ ++ while (1) { ++ if (reiser4_inode_get_flag(inode, ++ REISER4_PART_IN_CONV)) { ++ drop_exclusive_access(uf_info); ++ schedule(); ++ get_exclusive_access(uf_info); ++ continue; ++ } ++ break; ++ } ++ ++ if (uf_info->container == UF_CONTAINER_TAILS) { ++ result = tail2extent(uf_info); ++ if (result) ++ return result; ++ } ++ } ++ result = reiser4_write_extent(NULL, inode, NULL, ++ 0, &new_size); ++ if (result) ++ return result; ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ } else { ++ if (uf_info->container == UF_CONTAINER_EXTENTS) { ++ result = reiser4_write_extent(NULL, inode, NULL, ++ 0, &new_size); ++ if (result) ++ return result; ++ } else { ++ result = reiser4_write_tail(NULL, inode, NULL, ++ 0, &new_size); ++ if (result) ++ return result; ++ uf_info->container = UF_CONTAINER_TAILS; ++ } ++ } ++ BUG_ON(result > 0); ++ result = reiser4_update_file_size(inode, new_size, 1); ++ BUG_ON(result != 0); ++ } else ++ result = shorten_file(inode, new_size); ++ return result; ++} ++ ++/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */ ++ ++/** ++ * load_file_hint - copy hint from struct file to local variable ++ * @file: file to get hint from ++ * @hint: structure to fill ++ * ++ * Reiser4 specific portion of struct file may contain information (hint) ++ * stored on exiting from previous read or write. That information includes ++ * seal of znode and coord within that znode where previous read or write ++ * stopped. This function copies that information to @hint if it was stored or ++ * initializes @hint by 0s otherwise. ++ */ ++int load_file_hint(struct file *file, hint_t *hint) ++{ ++ reiser4_file_fsdata *fsdata; ++ ++ if (file) { ++ fsdata = reiser4_get_file_fsdata(file); ++ if (IS_ERR(fsdata)) ++ return PTR_ERR(fsdata); ++ ++ spin_lock_inode(file->f_dentry->d_inode); ++ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) { ++ *hint = fsdata->reg.hint; ++ init_lh(&hint->lh); ++ hint->ext_coord.lh = &hint->lh; ++ spin_unlock_inode(file->f_dentry->d_inode); ++ /* ++ * force re-validation of the coord on the first ++ * iteration of the read/write loop. ++ */ ++ hint->ext_coord.valid = 0; ++ assert("nikita-19892", coords_equal(&hint->seal.coord1, ++ &hint->ext_coord. ++ coord)); ++ return 0; ++ } ++ memset(&fsdata->reg.hint, 0, sizeof(hint_t)); ++ spin_unlock_inode(file->f_dentry->d_inode); ++ } ++ hint_init_zero(hint); ++ return 0; ++} ++ ++/** ++ * save_file_hint - copy hint to reiser4 private struct file's part ++ * @file: file to save hint in ++ * @hint: hint to save ++ * ++ * This copies @hint to reiser4 private part of struct file. It can help ++ * speed up future accesses to the file. 
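++ * ++ * The hint is saved only if its seal is set, and never with a long-term ++ * lock handle still held (see assertion "vs-30" below).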
++ */ ++void save_file_hint(struct file *file, const hint_t *hint) ++{ ++ reiser4_file_fsdata *fsdata; ++ ++ assert("edward-1337", hint != NULL); ++ ++ if (!file || !reiser4_seal_is_set(&hint->seal)) ++ return; ++ fsdata = reiser4_get_file_fsdata(file); ++ assert("vs-965", !IS_ERR(fsdata)); ++ assert("nikita-19891", ++ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord)); ++ assert("vs-30", hint->lh.owner == NULL); ++ spin_lock_inode(file->f_dentry->d_inode); ++ fsdata->reg.hint = *hint; ++ spin_unlock_inode(file->f_dentry->d_inode); ++ return; ++} ++ ++void reiser4_unset_hint(hint_t * hint) ++{ ++ assert("vs-1315", hint); ++ hint->ext_coord.valid = 0; ++ reiser4_seal_done(&hint->seal); ++ done_lh(&hint->lh); ++} ++ ++/* coord must be set properly. So, that reiser4_set_hint ++ has nothing to do */ ++void reiser4_set_hint(hint_t * hint, const reiser4_key * key, ++ znode_lock_mode mode) ++{ ++ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord); ++ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key))); ++ ++ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key); ++ hint->offset = get_key_offset(key); ++ hint->mode = mode; ++ done_lh(&hint->lh); ++} ++ ++int hint_is_set(const hint_t * hint) ++{ ++ return reiser4_seal_is_set(&hint->seal); ++} ++ ++#if REISER4_DEBUG ++static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) ++{ ++ return (get_key_locality(k1) == get_key_locality(k2) && ++ get_key_type(k1) == get_key_type(k2) && ++ get_key_band(k1) == get_key_band(k2) && ++ get_key_ordering(k1) == get_key_ordering(k2) && ++ get_key_objectid(k1) == get_key_objectid(k2)); ++} ++#endif ++ ++static int ++hint_validate(hint_t * hint, const reiser4_key * key, int check_key, ++ znode_lock_mode lock_mode) ++{ ++ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) ++ /* hint either not set or set by different operation */ ++ return RETERR(-E_REPEAT); ++ ++ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key)); ++ ++ if (check_key && get_key_offset(key) != hint->offset) ++ /* hint is set for different key */ ++ return RETERR(-E_REPEAT); ++ ++ assert("vs-31", hint->ext_coord.lh == &hint->lh); ++ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key, ++ hint->ext_coord.lh, lock_mode, ++ ZNODE_LOCK_LOPRI); ++} ++ ++/** ++ * Look for place at twig level for extent corresponding to page, ++ * call extent's writepage method to create unallocated extent if ++ * it does not exist yet, initialize jnode, capture page ++ */ ++int find_or_create_extent(struct page *page) ++{ ++ int result; ++ struct inode *inode; ++ int plugged_hole; ++ ++ jnode *node; ++ ++ assert("vs-1065", page->mapping && page->mapping->host); ++ inode = page->mapping->host; ++ ++ lock_page(page); ++ node = jnode_of_page(page); ++ if (IS_ERR(node)) { ++ unlock_page(page); ++ return PTR_ERR(node); ++ } ++ JF_SET(node, JNODE_WRITE_PREPARED); ++ unlock_page(page); ++ ++ if (node->blocknr == 0) { ++ plugged_hole = 0; ++ result = reiser4_update_extent(inode, node, page_offset(page), ++ &plugged_hole); ++ if (result) { ++ JF_CLR(node, JNODE_WRITE_PREPARED); ++ jput(node); ++ warning("edward-1549", ++ "reiser4_update_extent failed: %d", result); ++ return result; ++ } ++ if (plugged_hole) ++ reiser4_update_sd(inode); ++ } else { ++ spin_lock_jnode(node); ++ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ } ++ ++ BUG_ON(node->atom == NULL); ++ JF_CLR(node, JNODE_WRITE_PREPARED); 
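++ /* ++ * If we are running on behalf of the ent daemon and this is the ++ * page it is currently servicing, hand a jnode reference over so ++ * that reiser4_writeout() can use it (and drop it) later. ++ */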
++ ++ if (get_current_context()->entd) { ++ entd_context *ent = get_entd_context(node->tree->super); ++ ++ if (ent->cur_request->page == page) ++ /* the following reference will be ++ dropped in reiser4_writeout */ ++ ent->cur_request->node = jref(node); ++ } ++ jput(node); ++ return 0; ++} ++ ++/** ++ * has_anonymous_pages - check whether inode has pages dirtied via mmap ++ * @inode: inode to check ++ * ++ * Returns true if inode's mapping has dirty pages which do not belong to any ++ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page ++ * tree or were eflushed and can be found via jnodes tagged ++ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes. ++ */ ++static int has_anonymous_pages(struct inode *inode) ++{ ++ int result; ++ ++ spin_lock_irq(&inode->i_mapping->tree_lock); ++ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED); ++ spin_unlock_irq(&inode->i_mapping->tree_lock); ++ return result; ++} ++ ++/** ++ * capture_page_and_create_extent - ++ * @page: page to be captured ++ * ++ * Grabs space for extent creation and stat data update and calls function to ++ * do actual work. ++ */ ++static int capture_page_and_create_extent(struct page *page) ++{ ++ int result; ++ struct inode *inode; ++ ++ assert("vs-1084", page->mapping && page->mapping->host); ++ inode = page->mapping->host; ++ assert("vs-1139", ++ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS); ++ /* page belongs to file */ ++ assert("vs-1393", ++ inode->i_size > page_offset(page)); ++ ++ /* page capture may require extent creation (if it does not exist yet) ++ and stat data's update (number of blocks changes on extent ++ creation) */ ++ grab_space_enable(); ++ result = reiser4_grab_space(2 * estimate_one_insert_into_item ++ (reiser4_tree_by_inode(inode)), ++ BA_CAN_COMMIT); ++ if (likely(!result)) ++ result = find_or_create_extent(page); ++ ++ if (result != 0) ++ SetPageError(page); ++ return result; ++} ++ ++/* plugin->write_end() */ ++int write_end_unix_file(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ unlock_page(page); ++ return capture_page_and_create_extent(page); ++} ++ ++/* ++ * Support for "anonymous" pages and jnodes. ++ * ++ * When file is write-accessed through mmap pages can be dirtied from the user ++ * level. In this case kernel is not notified until one of following happens: ++ * ++ * (1) msync() ++ * ++ * (2) truncate() (either explicit or through unlink) ++ * ++ * (3) VM scanner starts reclaiming mapped pages, dirtying them before ++ * starting write-back. ++ * ++ * As a result of (3) ->writepage may be called on a dirty page without ++ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads ++ * (iozone) generate huge number of anonymous pages. ++ * ++ * reiser4_sync_sb() method tries to insert anonymous pages into ++ * tree. This is done by capture_anonymous_*() functions below. ++ */ ++ ++/** ++ * capture_anonymous_page - involve page into transaction ++ * @pg: page to deal with ++ * ++ * Takes care that @page has corresponding metadata in the tree, creates jnode ++ * for @page and captures it. On success 1 is returned. ++ */ ++static int capture_anonymous_page(struct page *page) ++{ ++ int result; ++ ++ if (PageWriteback(page)) ++ /* FIXME: do nothing? 
*/ ++ return 0; ++ ++ result = capture_page_and_create_extent(page); ++ if (result == 0) { ++ result = 1; ++ } else ++ warning("nikita-3329", ++ "Cannot capture anon page: %i", result); ++ ++ return result; ++} ++ ++/** ++ * capture_anonymous_pages - find and capture pages dirtied via mmap ++ * @mapping: address space where to look for pages ++ * @index: start index ++ * @to_capture: maximum number of pages to capture ++ * ++ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page, ++ * captures (involves into atom) them, returns number of captured pages, ++ * updates @index to next page after the last captured one. ++ */ ++static int ++capture_anonymous_pages(struct address_space *mapping, pgoff_t *index, ++ unsigned int to_capture) ++{ ++ int result; ++ struct pagevec pvec; ++ unsigned int i, count; ++ int nr; ++ ++ pagevec_init(&pvec, 0); ++ count = min(pagevec_space(&pvec), to_capture); ++ nr = 0; ++ ++ /* find pages tagged MOVED */ ++ spin_lock_irq(&mapping->tree_lock); ++ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree, ++ (void **)pvec.pages, *index, count, ++ PAGECACHE_TAG_REISER4_MOVED); ++ if (pagevec_count(&pvec) == 0) { ++ /* ++ * there are no pages tagged MOVED in mapping->page_tree ++ * starting from *index ++ */ ++ spin_unlock_irq(&mapping->tree_lock); ++ *index = (pgoff_t)-1; ++ return 0; ++ } ++ ++ /* clear MOVED tag for all found pages */ ++ for (i = 0; i < pagevec_count(&pvec); i++) { ++ page_cache_get(pvec.pages[i]); ++ radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ } ++ spin_unlock_irq(&mapping->tree_lock); ++ ++ ++ *index = pvec.pages[i - 1]->index + 1; ++ ++ for (i = 0; i < pagevec_count(&pvec); i++) { ++ result = capture_anonymous_page(pvec.pages[i]); ++ if (result == 1) ++ nr++; ++ else { ++ if (result < 0) { ++ warning("vs-1454", ++ "failed to capture page: " ++ "result=%d, captured=%d)\n", ++ result, i); ++ ++ /* ++ * set MOVED tag to all pages which left not ++ * captured ++ */ ++ spin_lock_irq(&mapping->tree_lock); ++ for (; i < pagevec_count(&pvec); i ++) { ++ radix_tree_tag_set(&mapping->page_tree, ++ pvec.pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ } ++ spin_unlock_irq(&mapping->tree_lock); ++ ++ pagevec_release(&pvec); ++ return result; ++ } else { ++ /* ++ * result == 0. capture_anonymous_page returns ++ * 0 for Writeback-ed page. Set MOVED tag on ++ * that page ++ */ ++ spin_lock_irq(&mapping->tree_lock); ++ radix_tree_tag_set(&mapping->page_tree, ++ pvec.pages[i]->index, ++ PAGECACHE_TAG_REISER4_MOVED); ++ spin_unlock_irq(&mapping->tree_lock); ++ if (i == 0) ++ *index = pvec.pages[0]->index; ++ else ++ *index = pvec.pages[i - 1]->index + 1; ++ } ++ } ++ } ++ pagevec_release(&pvec); ++ return nr; ++} ++ ++/** ++ * capture_anonymous_jnodes - find and capture anonymous jnodes ++ * @mapping: address space where to look for jnodes ++ * @from: start index ++ * @to: end index ++ * @to_capture: maximum number of jnodes to capture ++ * ++ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in ++ * the range of indexes @from-@to and captures them, returns number of captured ++ * jnodes, updates @from to next jnode after the last captured one. ++ */ ++static int ++capture_anonymous_jnodes(struct address_space *mapping, ++ pgoff_t *from, pgoff_t to, int to_capture) ++{ ++ *from = to; ++ return 0; ++} ++ ++/* ++ * Commit atom of the jnode of a page. 
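++ * Loops on -E_REPEAT, apparently to retry when the jnode's atom changes ++ * before the commit completes (see the ZAM-FIXME in the body).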
++ */ ++static int sync_page(struct page *page) ++{ ++ int result; ++ do { ++ jnode *node; ++ txn_atom *atom; ++ ++ lock_page(page); ++ node = jprivate(page); ++ if (node != NULL) { ++ spin_lock_jnode(node); ++ atom = jnode_get_atom(node); ++ spin_unlock_jnode(node); ++ } else ++ atom = NULL; ++ unlock_page(page); ++ result = reiser4_sync_atom(atom); ++ } while (result == -E_REPEAT); ++ /* ++ * ZAM-FIXME-HANS: document the logic of this loop, is it just to ++ * handle the case where more pages get added to the atom while we are ++ * syncing it? ++ */ ++ assert("nikita-3485", ergo(result == 0, ++ get_current_context()->trans->atom == NULL)); ++ return result; ++} ++ ++/* ++ * Commit atoms of pages on @pages list. ++ * call sync_page for each page from mapping's page tree ++ */ ++static int sync_page_list(struct inode *inode) ++{ ++ int result; ++ struct address_space *mapping; ++ unsigned long from; /* start index for radix_tree_gang_lookup */ ++ unsigned int found; /* return value for radix_tree_gang_lookup */ ++ ++ mapping = inode->i_mapping; ++ from = 0; ++ result = 0; ++ spin_lock_irq(&mapping->tree_lock); ++ while (result == 0) { ++ struct page *page; ++ ++ found = ++ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page, ++ from, 1); ++ assert("edward-1550", found < 2); ++ if (found == 0) ++ break; ++ /** ++ * page may not leave radix tree because it is protected from ++ * truncating by inode->i_mutex locked by sys_fsync ++ */ ++ page_cache_get(page); ++ spin_unlock_irq(&mapping->tree_lock); ++ ++ from = page->index + 1; ++ ++ result = sync_page(page); ++ ++ page_cache_release(page); ++ spin_lock_irq(&mapping->tree_lock); ++ } ++ ++ spin_unlock_irq(&mapping->tree_lock); ++ return result; ++} ++ ++static int commit_file_atoms(struct inode *inode) ++{ ++ int result; ++ struct unix_file_info *uf_info; ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ get_exclusive_access(uf_info); ++ /* ++ * find what items file is made from ++ */ ++ result = find_file_state(inode, uf_info); ++ drop_exclusive_access(uf_info); ++ if (result != 0) ++ return result; ++ ++ /* ++ * file state cannot change because we are under ->i_mutex ++ */ ++ switch (uf_info->container) { ++ case UF_CONTAINER_EXTENTS: ++ /* find_file_state might open join an atom */ ++ reiser4_txn_restart_current(); ++ result = ++ /* ++ * when we are called by ++ * filemap_fdatawrite-> ++ * do_writepages()-> ++ * reiser4_writepages() ++ * ++ * inode->i_mapping->dirty_pages are spices into ++ * ->io_pages, leaving ->dirty_pages dirty. ++ * ++ * When we are called from ++ * reiser4_fsync()->sync_unix_file(), we have to ++ * commit atoms of all pages on the ->dirty_list. ++ * ++ * So for simplicity we just commit ->io_pages and ++ * ->dirty_pages. ++ */ ++ sync_page_list(inode); ++ break; ++ case UF_CONTAINER_TAILS: ++ /* ++ * NOTE-NIKITA probably we can be smarter for tails. For now ++ * just commit all existing atoms. ++ */ ++ result = txnmgr_force_commit_all(inode->i_sb, 0); ++ break; ++ case UF_CONTAINER_EMPTY: ++ result = 0; ++ break; ++ case UF_CONTAINER_UNKNOWN: ++ default: ++ result = -EIO; ++ break; ++ } ++ ++ /* ++ * commit current transaction: there can be captured nodes from ++ * find_file_state() and finish_conversion(). ++ */ ++ reiser4_txn_restart_current(); ++ return result; ++} ++ ++/** ++ * writepages_unix_file - writepages of struct address_space_operations ++ * @mapping: ++ * @wbc: ++ * ++ * This captures anonymous pages and anonymous jnodes. Anonymous pages are ++ * pages which are dirtied via mmapping. 
Anonymous jnodes are ones which were ++ * created by reiser4_writepage. ++ */ ++int writepages_unix_file(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ int result; ++ struct unix_file_info *uf_info; ++ pgoff_t pindex, jindex, nr_pages; ++ long to_capture; ++ struct inode *inode; ++ ++ inode = mapping->host; ++ if (!has_anonymous_pages(inode)) { ++ result = 0; ++ goto end; ++ } ++ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT; ++ result = 0; ++ nr_pages = size_in_pages(i_size_read(inode)); ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ do { ++ reiser4_context *ctx; ++ ++ if (wbc->sync_mode != WB_SYNC_ALL) ++ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST); ++ else ++ to_capture = CAPTURE_APAGE_BURST; ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ result = PTR_ERR(ctx); ++ break; ++ } ++ /* avoid recursive calls to ->sync_inodes */ ++ ctx->nobalance = 1; ++ assert("zam-760", lock_stack_isclean(get_current_lock_stack())); ++ assert("edward-1551", LOCK_CNT_NIL(inode_sem_w)); ++ assert("edward-1552", LOCK_CNT_NIL(inode_sem_r)); ++ ++ reiser4_txn_restart_current(); ++ ++ /* we have to get nonexclusive access to the file */ ++ if (get_current_context()->entd) { ++ /* ++ * use nonblocking version of nonexclusive_access to ++ * avoid deadlock which might look like the following: ++ * process P1 holds NEA on file F1 and called entd to ++ * reclaim some memory. Entd works for P1 and is going ++ * to capture pages of file F2. To do that entd has to ++ * get NEA to F2. F2 is held by process P2 which also ++ * called entd. But entd is serving P1 at the moment ++ * and P2 has to wait. Process P3 is trying to get EA to ++ * file F2. The existence of a pending EA request to file F2 ++ * makes it impossible for entd to get NEA to file ++ * F2. Neither of these processes can continue. Using ++ * the nonblocking version of getting NEA is supposed to ++ * avoid this deadlock. 
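++ * In short: entd must never block on an access request that only a ++ * process waiting for entd itself could release.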
++ */ ++ if (try_to_get_nonexclusive_access(uf_info) == 0) { ++ result = RETERR(-EBUSY); ++ reiser4_exit_context(ctx); ++ break; ++ } ++ } else ++ get_nonexclusive_access(uf_info); ++ ++ while (to_capture > 0) { ++ pgoff_t start; ++ ++ assert("vs-1727", jindex <= pindex); ++ if (pindex == jindex) { ++ start = pindex; ++ result = ++ capture_anonymous_pages(inode->i_mapping, ++ &pindex, ++ to_capture); ++ if (result <= 0) ++ break; ++ to_capture -= result; ++ wbc->nr_to_write -= result; ++ if (start + result == pindex) { ++ jindex = pindex; ++ continue; ++ } ++ if (to_capture <= 0) ++ break; ++ } ++ /* deal with anonymous jnodes between jindex and pindex */ ++ result = ++ capture_anonymous_jnodes(inode->i_mapping, &jindex, ++ pindex, to_capture); ++ if (result < 0) ++ break; ++ to_capture -= result; ++ get_current_context()->nr_captured += result; ++ ++ if (jindex == (pgoff_t) - 1) { ++ assert("vs-1728", pindex == (pgoff_t) - 1); ++ break; ++ } ++ } ++ if (to_capture <= 0) ++ /* there may be left more pages */ ++ __mark_inode_dirty(inode, I_DIRTY_PAGES); ++ ++ drop_nonexclusive_access(uf_info); ++ if (result < 0) { ++ /* error happened */ ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ if (wbc->sync_mode != WB_SYNC_ALL) { ++ reiser4_exit_context(ctx); ++ return 0; ++ } ++ result = commit_file_atoms(inode); ++ reiser4_exit_context(ctx); ++ if (pindex >= nr_pages && jindex == pindex) ++ break; ++ } while (1); ++ ++ end: ++ if (is_in_reiser4_context()) { ++ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { ++ /* ++ * there are already pages to flush, flush them out, do ++ * not delay until end of reiser4_sync_inodes ++ */ ++ reiser4_writeout(inode->i_sb, wbc); ++ get_current_context()->nr_captured = 0; ++ } ++ } ++ return result; ++} ++ ++/** ++ * readpage_unix_file_nolock - readpage of struct address_space_operations ++ * @file: ++ * @page: ++ * ++ * Compose a key and search for item containing information about @page ++ * data. If item is found - its readpage method is called. ++ */ ++int readpage_unix_file(struct file *file, struct page *page) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *inode; ++ reiser4_key key; ++ item_plugin *iplug; ++ hint_t *hint; ++ lock_handle *lh; ++ coord_t *coord; ++ ++ assert("vs-1062", PageLocked(page)); ++ assert("vs-976", !PageUptodate(page)); ++ assert("vs-1061", page->mapping && page->mapping->host); ++ ++ if (page->mapping->host->i_size <= page_offset(page)) { ++ /* page is out of file */ ++ zero_user(page, 0, PAGE_CACHE_SIZE); ++ SetPageUptodate(page); ++ unlock_page(page); ++ return 0; ++ } ++ ++ inode = page->mapping->host; ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ unlock_page(page); ++ return PTR_ERR(ctx); ++ } ++ ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == NULL) { ++ unlock_page(page); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOMEM); ++ } ++ ++ result = load_file_hint(file, hint); ++ if (result) { ++ kfree(hint); ++ unlock_page(page); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ lh = &hint->lh; ++ ++ /* get key of first byte of the page */ ++ key_by_inode_and_offset_common(inode, page_offset(page), &key); ++ ++ /* look for file metadata corresponding to first byte of page */ ++ page_cache_get(page); ++ unlock_page(page); ++ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode); ++ lock_page(page); ++ page_cache_release(page); ++ ++ if (page->mapping == NULL) { ++ /* ++ * readpage allows truncate to run concurrently. 
Page was ++ * truncated while it was not locked ++ */ ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return -EINVAL; ++ } ++ ++ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) { ++ if (result == CBK_COORD_FOUND && ++ hint->ext_coord.coord.between != AT_UNIT) ++ /* file is truncated */ ++ result = -EINVAL; ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ /* ++ * item corresponding to page is found. It can not be removed because ++ * znode lock is held ++ */ ++ if (PageUptodate(page)) { ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return 0; ++ } ++ ++ coord = &hint->ext_coord.coord; ++ result = zload(coord->node); ++ if (result) { ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ validate_extended_coord(&hint->ext_coord, page_offset(page)); ++ ++ if (!coord_is_existing_unit(coord)) { ++ /* this indicates corruption */ ++ warning("vs-280", ++ "Looking for page %lu of file %llu (size %lli). " ++ "No file items found (%d). File is corrupted?\n", ++ page->index, (unsigned long long)get_inode_oid(inode), ++ inode->i_size, result); ++ zrelse(coord->node); ++ done_lh(lh); ++ kfree(hint); ++ unlock_page(page); ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-EIO); ++ } ++ ++ /* ++ * get plugin of found item or use plugin if extent if there are no ++ * one ++ */ ++ iplug = item_plugin_by_coord(coord); ++ if (iplug->s.file.readpage) ++ result = iplug->s.file.readpage(coord, page); ++ else ++ result = RETERR(-EINVAL); ++ ++ if (!result) { ++ set_key_offset(&key, ++ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT); ++ /* FIXME should call reiser4_set_hint() */ ++ reiser4_unset_hint(hint); ++ } else { ++ unlock_page(page); ++ reiser4_unset_hint(hint); ++ } ++ assert("vs-979", ++ ergo(result == 0, (PageLocked(page) || PageUptodate(page)))); ++ assert("vs-9791", ergo(result != 0, !PageLocked(page))); ++ ++ zrelse(coord->node); ++ done_lh(lh); ++ ++ save_file_hint(file, hint); ++ kfree(hint); ++ ++ /* ++ * FIXME: explain why it is needed. HINT: page allocation in write can ++ * not be done when atom is not NULL because reiser4_writepage can not ++ * kick entd and have to eflush ++ */ ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++struct uf_readpages_context { ++ lock_handle lh; ++ coord_t coord; ++}; ++ ++/* A callback function for readpages_unix_file/read_cache_pages. ++ * If the file is build of tails, then return error (-ENOENT). ++ * ++ * @data -- a pointer to reiser4_readpages_context object, ++ * to save the twig lock and the coord between ++ * read_cache_page iterations. ++ * @page -- page to start read. ++ */ ++static int uf_readpages_filler(void * data, struct page * page) ++{ ++ struct uf_readpages_context *rc = data; ++ jnode * node; ++ int ret = 0; ++ reiser4_extent *ext; ++ __u64 ext_index; ++ int cbk_done = 0; ++ struct address_space * mapping = page->mapping; ++ ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ return 0; ++ } ++ page_cache_get(page); ++ ++ if (rc->lh.node == 0) { ++ /* no twig lock - have to do tree search. 
*/ ++ reiser4_key key; ++ repeat: ++ unlock_page(page); ++ key_by_inode_and_offset_common( ++ mapping->host, page_offset(page), &key); ++ ret = coord_by_key( ++ &get_super_private(mapping->host->i_sb)->tree, ++ &key, &rc->coord, &rc->lh, ++ ZNODE_READ_LOCK, FIND_EXACT, ++ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL); ++ if (unlikely(ret)) ++ goto exit; ++ lock_page(page); ++ if (PageUptodate(page)) ++ goto unlock; ++ cbk_done = 1; ++ } ++ ret = zload(rc->coord.node); ++ if (unlikely(ret)) ++ goto unlock; ++ if (!coord_is_existing_item(&rc->coord) || ++ !item_is_extent(&rc->coord)) { ++ zrelse(rc->coord.node); ++ ret = RETERR(-EIO); ++ goto unlock; ++ } ++ ext = extent_by_coord(&rc->coord); ++ ext_index = extent_unit_index(&rc->coord); ++ if (page->index < ext_index || ++ page->index >= ext_index + extent_get_width(ext)) { ++ /* the page index doesn't belong to the extent unit ++ which the coord points to - release the lock and ++ repeat with tree search. */ ++ zrelse(rc->coord.node); ++ done_lh(&rc->lh); ++ /* we can be here after a CBK call only in case of ++ corruption of the tree or the tree lookup algorithm bug. */ ++ if (unlikely(cbk_done)) { ++ ret = RETERR(-EIO); ++ goto unlock; ++ } ++ goto repeat; ++ } ++ node = jnode_of_page(page); ++ if (unlikely(IS_ERR(node))) { ++ zrelse(rc->coord.node); ++ ret = PTR_ERR(node); ++ goto unlock; ++ } ++ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page); ++ jput(node); ++ zrelse(rc->coord.node); ++ if (likely(!ret)) ++ goto exit; ++ unlock: ++ unlock_page(page); ++ exit: ++ page_cache_release(page); ++ return ret; ++} ++ ++/** ++ * readpages_unix_file - called by the readahead code, starts reading for each ++ * page of given list of pages ++ */ ++int readpages_unix_file( ++ struct file *file, struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ reiser4_context *ctx; ++ struct uf_readpages_context rc; ++ int ret; ++ ++ ctx = reiser4_init_context(mapping->host->i_sb); ++ if (IS_ERR(ctx)) { ++ put_pages_list(pages); ++ return PTR_ERR(ctx); ++ } ++ init_lh(&rc.lh); ++ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc); ++ done_lh(&rc.lh); ++ context_set_commit_async(ctx); ++ /* close the transaction to protect further page allocation from deadlocks */ ++ reiser4_txn_restart(ctx); ++ reiser4_exit_context(ctx); ++ return ret; ++} ++ ++static reiser4_block_nr unix_file_estimate_read(struct inode *inode, ++ loff_t count UNUSED_ARG) ++{ ++ /* We should reserve one block, because of updating of the stat data ++ item */ ++ assert("vs-1249", ++ inode_file_plugin(inode)->estimate.update == ++ estimate_update_common); ++ return estimate_update_common(inode); ++} ++ ++/* this is called with nonexclusive access obtained, file's container can not change */ ++static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */ ++ char __user *buf, /* address of user-space buffer */ ++ size_t count, /* number of bytes to read */ ++ loff_t *off) ++{ ++ int result; ++ struct inode *inode; ++ flow_t flow; ++ int (*read_f) (struct file *, flow_t *, hint_t *); ++ coord_t *coord; ++ znode *loaded; ++ ++ inode = file->f_dentry->d_inode; ++ ++ /* build flow */ ++ assert("vs-1250", ++ inode_file_plugin(inode)->flow_by_inode == ++ flow_by_inode_unix_file); ++ result = ++ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count, ++ *off, READ_OP, &flow); ++ if (unlikely(result)) ++ return result; ++ ++ /* get seal and coord sealed with it from reiser4 private data ++ of struct file. 
The coord will tell us where our last read ++ of this file finished, and the seal will help to determine ++ if that location is still valid. ++ */ ++ coord = &hint->ext_coord.coord; ++ while (flow.length && result == 0) { ++ result = ++ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode); ++ if (cbk_errored(result)) ++ /* error happened */ ++ break; ++ ++ if (coord->between != AT_UNIT) { ++ /* there were no items corresponding to given offset */ ++ done_lh(hint->ext_coord.lh); ++ break; ++ } ++ ++ loaded = coord->node; ++ result = zload(loaded); ++ if (unlikely(result)) { ++ done_lh(hint->ext_coord.lh); ++ break; ++ } ++ ++ if (hint->ext_coord.valid == 0) ++ validate_extended_coord(&hint->ext_coord, ++ get_key_offset(&flow.key)); ++ ++ assert("vs-4", hint->ext_coord.valid == 1); ++ assert("vs-33", hint->ext_coord.lh == &hint->lh); ++ /* call item's read method */ ++ read_f = item_plugin_by_coord(coord)->s.file.read; ++ result = read_f(file, &flow, hint); ++ zrelse(loaded); ++ done_lh(hint->ext_coord.lh); ++ } ++ ++ return (count - flow.length) ? (count - flow.length) : result; ++} ++ ++static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*); ++ ++/** ++ * read_unix_file - read of struct file_operations ++ * @file: file to read from ++ * @buf: address of user-space buffer ++ * @read_amount: number of bytes to read ++ * @off: position in file to read from ++ * ++ * This is implementation of vfs's read method of struct file_operations for ++ * unix file plugin. ++ */ ++ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount, ++ loff_t *off) ++{ ++ reiser4_context *ctx; ++ ssize_t result; ++ struct inode *inode; ++ struct unix_file_info *uf_info; ++ ++ if (unlikely(read_amount == 0)) ++ return 0; ++ ++ assert("umka-072", file != NULL); ++ assert("umka-074", off != NULL); ++ inode = file->f_dentry->d_inode; ++ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ uf_info = unix_file_inode_data(inode); ++ if (uf_info->container == UF_CONTAINER_UNKNOWN) { ++ get_exclusive_access(uf_info); ++ result = find_file_state(inode, uf_info); ++ if (unlikely(result != 0)) ++ goto out; ++ } else ++ get_nonexclusive_access(uf_info); ++ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount), ++ BA_CAN_COMMIT); ++ if (unlikely(result != 0)) ++ goto out; ++ if (uf_info->container == UF_CONTAINER_EXTENTS){ ++ result = do_sync_read(file, buf, read_amount, off); ++ } else if (uf_info->container == UF_CONTAINER_TAILS || ++ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) || ++ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { ++ result = read_unix_file_container_tails(file, buf, read_amount, off); ++ } else { ++ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY); ++ result = 0; ++ } ++out: ++ drop_access(uf_info); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++static ssize_t read_unix_file_container_tails( ++ struct file *file, char __user *buf, size_t read_amount, loff_t *off) ++{ ++ int result; ++ struct inode *inode; ++ hint_t *hint; ++ struct unix_file_info *uf_info; ++ size_t count, read, left; ++ loff_t size; ++ ++ assert("umka-072", file != NULL); ++ assert("umka-074", off != NULL); ++ inode = file->f_dentry->d_inode; ++ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); ++ ++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); ++ if (hint == 
NULL)
++ return RETERR(-ENOMEM);
++
++ result = load_file_hint(file, hint);
++ if (result) {
++ kfree(hint);
++ return result;
++ }
++
++ left = read_amount;
++ count = 0;
++ uf_info = unix_file_inode_data(inode);
++ while (left > 0) {
++ reiser4_txn_restart_current();
++ size = i_size_read(inode);
++ if (*off >= size)
++ /* position to read from is past the end of file */
++ break;
++ if (*off + left > size)
++ left = size - *off;
++ /* faultin user page */
++ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
++ if (result)
++ return RETERR(-EFAULT);
++
++ read = read_file(hint, file, buf,
++ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
++ off);
++ if (read < 0) {
++ result = read;
++ break;
++ }
++ left -= read;
++ buf += read;
++
++ /* update position in a file */
++ *off += read;
++ /* total number of read bytes */
++ count += read;
++ }
++ done_lh(&hint->lh);
++ save_file_hint(file, hint);
++ kfree(hint);
++ if (count)
++ file_accessed(file);
++ /* return number of read bytes or error code if nothing is read */
++ return count ? count : result;
++}
++
++/* This function takes care of @file's pages. First of all it checks if
++ the filesystem is readonly and if so gets out. Otherwise, it throws out all
++ pages of file if it was mapped for read and going to be mapped for write
++ and consists of tails. This is done in order not to manage several copies
++ of the data (one in the page cache and a second one in the tail items
++ themselves) for the case of mapping files consisting of tails.
++
++ Here also tail2extent conversion is performed if it is allowed and file
++ is going to be written or mapped for write. This function may be called
++ from write_unix_file() or mmap_unix_file(). */
++static int check_pages_unix_file(struct file *file, struct inode *inode)
++{
++ reiser4_invalidate_pages(inode->i_mapping, 0,
++ (inode->i_size + PAGE_CACHE_SIZE -
++ 1) >> PAGE_CACHE_SHIFT, 0);
++ return unpack(file, inode, 0 /* not forever */ );
++}
++
++/**
++ * mmap_unix_file - mmap of struct file_operations
++ * @file: file to mmap
++ * @vma:
++ *
++ * This is implementation of vfs's mmap method of struct file_operations for
++ * unix file plugin. It converts file to extent if necessary. Sets
++ * reiser4_inode's flag - REISER4_HAS_MMAP.
++ */
++int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
++{
++ reiser4_context *ctx;
++ int result;
++ struct inode *inode;
++ struct unix_file_info *uf_info;
++ reiser4_block_nr needed;
++
++ inode = file->f_dentry->d_inode;
++ ctx = reiser4_init_context(inode->i_sb);
++ if (IS_ERR(ctx))
++ return PTR_ERR(ctx);
++
++ uf_info = unix_file_inode_data(inode);
++
++ get_exclusive_access_careful(uf_info, inode);
++
++ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
++ /*
++ * we need file built of extent items. If it is still built of
++ * tail items we have to convert it.
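The motivation is the one given in check_pages_unix_file() above: tail items keep file data inside tree nodes, so pages of a shared writable mapping would be a second copy of the same bytes. Schematically (a sketch, not part of the patch):

        /* shared writable mmap of a tail-packed file:
         *   reiser4_invalidate_pages(mapping, 0, nr_pages, 0);
         *   unpack(file, inode, 0);     - tail2extent, not permanent
         * after which the page cache holds the only copy of the data */
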
Find what items the file ++ * is built of ++ */ ++ result = find_file_state(inode, uf_info); ++ if (result != 0) { ++ drop_exclusive_access(uf_info); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS || ++ uf_info->container == UF_CONTAINER_EXTENTS || ++ uf_info->container == UF_CONTAINER_EMPTY)); ++ if (uf_info->container == UF_CONTAINER_TAILS) { ++ /* ++ * invalidate all pages and convert file from tails to ++ * extents ++ */ ++ result = check_pages_unix_file(file, inode); ++ if (result) { ++ drop_exclusive_access(uf_info); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ } ++ } ++ ++ /* ++ * generic_file_mmap will do update_atime. Grab space for stat data ++ * update. ++ */ ++ needed = inode_file_plugin(inode)->estimate.update(inode); ++ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT); ++ if (result) { ++ drop_exclusive_access(uf_info); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ result = generic_file_mmap(file, vma); ++ if (result == 0) { ++ /* mark file as having mapping. */ ++ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP); ++ } ++ ++ drop_exclusive_access(uf_info); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/** ++ * find_first_item ++ * @inode: ++ * ++ * Finds file item which is responsible for first byte in the file. ++ */ ++static int find_first_item(struct inode *inode) ++{ ++ coord_t coord; ++ lock_handle lh; ++ reiser4_key key; ++ int result; ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ inode_file_plugin(inode)->key_by_inode(inode, 0, &key); ++ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, ++ inode); ++ if (result == CBK_COORD_FOUND) { ++ if (coord.between == AT_UNIT) { ++ result = zload(coord.node); ++ if (result == 0) { ++ result = item_id_by_coord(&coord); ++ zrelse(coord.node); ++ if (result != EXTENT_POINTER_ID && ++ result != FORMATTING_ID) ++ result = RETERR(-EIO); ++ } ++ } else ++ result = RETERR(-EIO); ++ } ++ done_lh(&lh); ++ return result; ++} ++ ++/** ++ * open_unix_file ++ * @inode: ++ * @file: ++ * ++ * If filesystem is not readonly - complete uncompleted tail conversion if ++ * there was one ++ */ ++int open_unix_file(struct inode *inode, struct file *file) ++{ ++ int result; ++ reiser4_context *ctx; ++ struct unix_file_info *uf_info; ++ ++ if (IS_RDONLY(inode)) ++ return 0; ++ ++ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) ++ return 0; ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ get_exclusive_access_careful(uf_info, inode); ++ ++ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { ++ /* ++ * other process completed the conversion ++ */ ++ drop_exclusive_access(uf_info); ++ reiser4_exit_context(ctx); ++ return 0; ++ } ++ ++ /* ++ * file left in semi converted state after unclean shutdown or another ++ * thread is doing conversion and dropped exclusive access which doing ++ * balance dirty pages. Complete the conversion ++ */ ++ result = find_first_item(inode); ++ if (result == EXTENT_POINTER_ID) ++ /* ++ * first item is extent, therefore there was incomplete ++ * tail2extent conversion. Complete it ++ */ ++ result = tail2extent(unix_file_inode_data(inode)); ++ else if (result == FORMATTING_ID) ++ /* ++ * first item is formatting item, therefore there was ++ * incomplete extent2tail conversion. 
Complete it ++ */ ++ result = extent2tail(file, unix_file_inode_data(inode)); ++ else ++ result = -EIO; ++ ++ assert("vs-1712", ++ ergo(result == 0, ++ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) && ++ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)))); ++ drop_exclusive_access(uf_info); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++#define NEITHER_OBTAINED 0 ++#define EA_OBTAINED 1 ++#define NEA_OBTAINED 2 ++ ++static void drop_access(struct unix_file_info *uf_info) ++{ ++ if (uf_info->exclusive_use) ++ drop_exclusive_access(uf_info); ++ else ++ drop_nonexclusive_access(uf_info); ++} ++ ++#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \ ++ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) ++ ++/** ++ * write_unix_file - private ->write() method of unix_file plugin. ++ * ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @count: number of bytes to write ++ * @pos: position in file to write to ++ * @cont: unused argument, as we don't perform plugin conversion when being ++ * managed by unix_file plugin. ++ */ ++ssize_t write_unix_file(struct file *file, ++ const char __user *buf, ++ size_t count, loff_t *pos, ++ struct dispatch_context *cont) ++{ ++ int result; ++ reiser4_context *ctx; ++ struct inode *inode; ++ struct unix_file_info *uf_info; ++ ssize_t written; ++ int try_free_space; ++ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY; ++ size_t left; ++ ssize_t (*write_op)(struct file *, struct inode *, ++ const char __user *, size_t, ++ loff_t *pos); ++ int ea; ++ loff_t new_size; ++ ++ ctx = get_current_context(); ++ inode = file->f_dentry->d_inode; ++ ++ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); ++ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))); ++ ++ /* check amount of bytes to write and writing position */ ++ result = generic_write_checks(file, pos, &count, 0); ++ if (result) { ++ context_set_commit_async(ctx); ++ return result; ++ } ++ ++ result = file_remove_suid(file); ++ if (result) { ++ context_set_commit_async(ctx); ++ return result; ++ } ++ /* remove_suid might create a transaction */ ++ reiser4_txn_restart(ctx); ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ current->backing_dev_info = inode->i_mapping->backing_dev_info; ++ written = 0; ++ try_free_space = 0; ++ left = count; ++ ea = NEITHER_OBTAINED; ++ ++ new_size = i_size_read(inode); ++ if (*pos + count > new_size) ++ new_size = *pos + count; ++ ++ while (left) { ++ if (left < to_write) ++ to_write = left; ++ ++ if (uf_info->container == UF_CONTAINER_EMPTY) { ++ get_exclusive_access(uf_info); ++ ea = EA_OBTAINED; ++ if (uf_info->container != UF_CONTAINER_EMPTY) { ++ /* file is made not empty by another process */ ++ drop_exclusive_access(uf_info); ++ ea = NEITHER_OBTAINED; ++ continue; ++ } ++ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) { ++ /* ++ * get exclusive access directly just to not have to ++ * re-obtain it if file will appear empty ++ */ ++ get_exclusive_access(uf_info); ++ ea = EA_OBTAINED; ++ result = find_file_state(inode, uf_info); ++ if (result) { ++ drop_exclusive_access(uf_info); ++ ea = NEITHER_OBTAINED; ++ break; ++ } ++ } else { ++ get_nonexclusive_access(uf_info); ++ ea = NEA_OBTAINED; ++ } ++ ++ /* either EA or NEA is obtained. 
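Summarized, the access mode chosen at the top of each loop iteration follows the container state (a sketch using the constants defined above; not part of the patch):

        /* exclusive access (EA) whenever the container may change,
         * nonexclusive access (NEA) otherwise:
         *
         *   UF_CONTAINER_EMPTY   -> EA  (first write picks the item type)
         *   UF_CONTAINER_UNKNOWN -> EA  (state must be probed first)
         *   UF_CONTAINER_TAILS   -> NEA (upgraded to EA only when a
         *                                tail2extent conversion is due)
         *   UF_CONTAINER_EXTENTS -> NEA
         */
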
Choose item write method */ ++ if (uf_info->container == UF_CONTAINER_EXTENTS) { ++ /* file is built of extent items */ ++ write_op = reiser4_write_extent; ++ } else if (uf_info->container == UF_CONTAINER_EMPTY) { ++ /* file is empty */ ++ if (should_have_notail(uf_info, new_size)) ++ write_op = reiser4_write_extent; ++ else ++ write_op = reiser4_write_tail; ++ } else { ++ /* file is built of tail items */ ++ if (should_have_notail(uf_info, new_size)) { ++ if (ea == NEA_OBTAINED) { ++ drop_nonexclusive_access(uf_info); ++ get_exclusive_access(uf_info); ++ ea = EA_OBTAINED; ++ } ++ if (uf_info->container == UF_CONTAINER_TAILS) { ++ /* ++ * if file is being convered by another ++ * process - wait until it completes ++ */ ++ while (1) { ++ if (reiser4_inode_get_flag(inode, ++ REISER4_PART_IN_CONV)) { ++ drop_exclusive_access(uf_info); ++ schedule(); ++ get_exclusive_access(uf_info); ++ continue; ++ } ++ break; ++ } ++ if (uf_info->container == UF_CONTAINER_TAILS) { ++ result = tail2extent(uf_info); ++ if (result) { ++ drop_exclusive_access(uf_info); ++ context_set_commit_async(ctx); ++ break; ++ } ++ } ++ } ++ drop_exclusive_access(uf_info); ++ ea = NEITHER_OBTAINED; ++ continue; ++ } ++ write_op = reiser4_write_tail; ++ } ++ ++ written = write_op(file, inode, buf, to_write, pos); ++ if (written == -ENOSPC && try_free_space) { ++ drop_access(uf_info); ++ txnmgr_force_commit_all(inode->i_sb, 0); ++ try_free_space = 0; ++ continue; ++ } ++ if (written < 0) { ++ drop_access(uf_info); ++ result = written; ++ break; ++ } ++ /* something is written. */ ++ if (uf_info->container == UF_CONTAINER_EMPTY) { ++ assert("edward-1553", ea == EA_OBTAINED); ++ uf_info->container = ++ (write_op == reiser4_write_extent) ? ++ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS; ++ } else { ++ assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS, ++ write_op == reiser4_write_extent)); ++ assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS, ++ write_op == reiser4_write_tail)); ++ } ++ if (*pos + written > inode->i_size) ++ INODE_SET_FIELD(inode, i_size, *pos + written); ++ file_update_time(file); ++ result = reiser4_update_sd(inode); ++ if (result) { ++ current->backing_dev_info = NULL; ++ drop_access(uf_info); ++ context_set_commit_async(ctx); ++ break; ++ } ++ drop_access(uf_info); ++ ea = NEITHER_OBTAINED; ++ ++ /* ++ * tell VM how many pages were dirtied. Maybe number of pages ++ * which were dirty already should not be counted ++ */ ++ reiser4_throttle_write(inode, ++ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE); ++ left -= written; ++ buf += written; ++ *pos += written; ++ } ++ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ++ reiser4_txn_restart_current(); ++ grab_space_enable(); ++ result = reiser4_sync_file_common(file, file->f_dentry, ++ 0 /* data and stat data */); ++ if (result) ++ warning("reiser4-7", "failed to sync file %llu", ++ (unsigned long long)get_inode_oid(inode)); ++ } ++ ++ current->backing_dev_info = NULL; ++ ++ /* ++ * return number of written bytes or error code if nothing is ++ * written. Note, that it does not work correctly in case when ++ * sync_unix_file returns error ++ */ ++ return (count - left) ? (count - left) : result; ++} ++ ++/** ++ * release_unix_file - release of struct file_operations ++ * @inode: inode of released file ++ * @file: file to release ++ * ++ * Implementation of release method of struct file_operations for unix file ++ * plugin. 
If last reference to inode is released - convert all extent items
++ * into tail items if necessary. Frees reiser4 specific file data.
++ */
++int release_unix_file(struct inode *inode, struct file *file)
++{
++ reiser4_context *ctx;
++ struct unix_file_info *uf_info;
++ int result;
++ int in_reiser4;
++
++ in_reiser4 = is_in_reiser4_context();
++
++ ctx = reiser4_init_context(inode->i_sb);
++ if (IS_ERR(ctx))
++ return PTR_ERR(ctx);
++
++ result = 0;
++ if (in_reiser4 == 0) {
++ uf_info = unix_file_inode_data(inode);
++
++ get_exclusive_access_careful(uf_info, inode);
++ if (atomic_read(&file->f_dentry->d_count) == 1 &&
++ uf_info->container == UF_CONTAINER_EXTENTS &&
++ !should_have_notail(uf_info, inode->i_size) &&
++ !rofs_inode(inode)) {
++ result = extent2tail(file, uf_info);
++ if (result != 0) {
++ context_set_commit_async(ctx);
++ warning("nikita-3233",
++ "Failed (%d) to convert in %s (%llu)",
++ result, __FUNCTION__,
++ (unsigned long long)
++ get_inode_oid(inode));
++ }
++ }
++ drop_exclusive_access(uf_info);
++ } else {
++ /*
++ we are within reiser4 context already. How is the latter
++ possible? Simple:
++
++ (gdb) bt
++ #0 get_exclusive_access ()
++ #2 0xc01e56d3 in release_unix_file ()
++ #3 0xc01c3643 in reiser4_release ()
++ #4 0xc014cae0 in __fput ()
++ #5 0xc013ffc3 in remove_vm_struct ()
++ #6 0xc0141786 in exit_mmap ()
++ #7 0xc0118480 in mmput ()
++ #8 0xc0133205 in oom_kill ()
++ #9 0xc01332d1 in out_of_memory ()
++ #10 0xc013bc1d in try_to_free_pages ()
++ #11 0xc013427b in __alloc_pages ()
++ #12 0xc013f058 in do_anonymous_page ()
++ #13 0xc013f19d in do_no_page ()
++ #14 0xc013f60e in handle_mm_fault ()
++ #15 0xc01131e5 in do_page_fault ()
++ #16 0xc0104935 in error_code ()
++ #17 0xc025c0c6 in __copy_to_user_ll ()
++ #18 0xc01d496f in reiser4_read_tail ()
++ #19 0xc01e4def in read_unix_file ()
++ #20 0xc01c3504 in reiser4_read ()
++ #21 0xc014bd4f in vfs_read ()
++ #22 0xc014bf66 in sys_read ()
++ */
++ warning("vs-44", "out of memory?");
++ }
++
++ reiser4_free_file_fsdata(file);
++
++ reiser4_exit_context(ctx);
++ return result;
++}
++
++static void set_file_notail(struct inode *inode)
++{
++ reiser4_inode *state;
++ formatting_plugin *tplug;
++
++ state = reiser4_inode_data(inode);
++ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
++ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
++}
++
++/* if file is built of tails - convert it to extents */
++static int unpack(struct file *filp, struct inode *inode, int forever)
++{
++ int result = 0;
++ struct unix_file_info *uf_info;
++
++ uf_info = unix_file_inode_data(inode);
++ assert("vs-1628", ea_obtained(uf_info));
++
++ result = find_file_state(inode, uf_info);
++ if (result)
++ return result;
++ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
++
++ if (uf_info->container == UF_CONTAINER_TAILS) {
++ /*
++ * if file is being converted by another process - wait until it
++ * completes
++ */
++ while (1) {
++ if (reiser4_inode_get_flag(inode,
++ REISER4_PART_IN_CONV)) {
++ drop_exclusive_access(uf_info);
++ schedule();
++ get_exclusive_access(uf_info);
++ continue;
++ }
++ break;
++ }
++ if (uf_info->container == UF_CONTAINER_TAILS) {
++ result = tail2extent(uf_info);
++ if (result)
++ return result;
++ }
++ }
++ if (forever) {
++ /* save new formatting plugin in stat data */
++ __u64 tograb;
++
++ set_file_notail(inode);
++
++ grab_space_enable();
++ tograb = inode_file_plugin(inode)->estimate.update(inode);
++ result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
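        /* reviewer note, not part of the patch: the return value of the
         * reiser4_grab_space() call above is overwritten by the following
         * statement, so a failed grab is not reported to the caller */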
++ result = reiser4_update_sd(inode);
++ }
++
++ return result;
++}
++
++/* implementation of vfs' ioctl method of struct file_operations for unix file
++ plugin
++*/
++int
++ioctl_unix_file(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg UNUSED_ARG)
++{
++ reiser4_context *ctx;
++ int result;
++
++ ctx = reiser4_init_context(inode->i_sb);
++ if (IS_ERR(ctx))
++ return PTR_ERR(ctx);
++
++ switch (cmd) {
++ case REISER4_IOC_UNPACK:
++ get_exclusive_access(unix_file_inode_data(inode));
++ result = unpack(filp, inode, 1 /* forever */ );
++ drop_exclusive_access(unix_file_inode_data(inode));
++ break;
++
++ default:
++ result = RETERR(-ENOSYS);
++ break;
++ }
++ reiser4_exit_context(ctx);
++ return result;
++}
++
++/* implementation of vfs' bmap method of struct address_space_operations for unix
++ file plugin
++*/
++sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
++{
++ reiser4_context *ctx;
++ sector_t result;
++ reiser4_key key;
++ coord_t coord;
++ lock_handle lh;
++ struct inode *inode;
++ item_plugin *iplug;
++ sector_t block;
++
++ inode = mapping->host;
++
++ ctx = reiser4_init_context(inode->i_sb);
++ if (IS_ERR(ctx))
++ return PTR_ERR(ctx);
++ key_by_inode_and_offset_common(inode,
++ (loff_t) lblock * current_blocksize,
++ &key);
++
++ init_lh(&lh);
++ result =
++ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
++ if (cbk_errored(result)) {
++ done_lh(&lh);
++ reiser4_exit_context(ctx);
++ return result;
++ }
++
++ result = zload(coord.node);
++ if (result) {
++ done_lh(&lh);
++ reiser4_exit_context(ctx);
++ return result;
++ }
++
++ iplug = item_plugin_by_coord(&coord);
++ if (iplug->s.file.get_block) {
++ result = iplug->s.file.get_block(&coord, lblock, &block);
++ if (result == 0)
++ result = block;
++ } else
++ result = RETERR(-EINVAL);
++
++ zrelse(coord.node);
++ done_lh(&lh);
++ reiser4_exit_context(ctx);
++ return result;
++}
++
++/**
++ * flow_by_inode_unix_file - initialize structure flow
++ * @inode: inode of file for which read or write is about
++ * @buf: buffer to perform read to or write from
++ * @user: flag showing whether @buf is user space or kernel space
++ * @size: size of buffer @buf
++ * @off: start offset for read or write
++ * @op: READ or WRITE
++ * @flow:
++ *
++ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
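For context, this is how the read path shown earlier consumes a flow once it has been built (a condensed sketch of the loop in read_file(); not part of the patch):

        flow_t flow;
        result = flow_by_inode_unix_file(inode, buf, 1 /* user space */,
                                         count, *off, READ_OP, &flow);
        while (flow.length && result == 0) {
                result = find_file_item(hint, &flow.key,
                                        ZNODE_READ_LOCK, inode);
                /* the item plugin copies data and advances flow.key,
                 * flow.data and flow.length:
                 * item_plugin_by_coord(coord)->s.file.read(...) */
        }
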
++ */ ++int flow_by_inode_unix_file(struct inode *inode, ++ const char __user *buf, int user, ++ loff_t size, loff_t off, ++ rw_op op, flow_t *flow) ++{ ++ assert("nikita-1100", inode != NULL); ++ ++ flow->length = size; ++ memcpy(&flow->data, &buf, sizeof(buf)); ++ flow->user = user; ++ flow->op = op; ++ assert("nikita-1931", inode_file_plugin(inode) != NULL); ++ assert("nikita-1932", ++ inode_file_plugin(inode)->key_by_inode == ++ key_by_inode_and_offset_common); ++ /* calculate key of write position and insert it into flow->key */ ++ return key_by_inode_and_offset_common(inode, off, &flow->key); ++} ++ ++/* plugin->u.file.set_plug_in_sd = NULL ++ plugin->u.file.set_plug_in_inode = NULL ++ plugin->u.file.create_blank_sd = NULL */ ++/* plugin->u.file.delete */ ++/* ++ plugin->u.file.add_link = reiser4_add_link_common ++ plugin->u.file.rem_link = NULL */ ++ ++/* plugin->u.file.owns_item ++ this is common_file_owns_item with assertion */ ++/* Audited by: green(2002.06.15) */ ++int ++owns_item_unix_file(const struct inode *inode /* object to check against */ , ++ const coord_t * coord /* coord to check */ ) ++{ ++ int result; ++ ++ result = owns_item_common(inode, coord); ++ if (!result) ++ return 0; ++ if (!plugin_of_group(item_plugin_by_coord(coord), ++ UNIX_FILE_METADATA_ITEM_TYPE)) ++ return 0; ++ assert("vs-547", ++ item_id_by_coord(coord) == EXTENT_POINTER_ID || ++ item_id_by_coord(coord) == FORMATTING_ID); ++ return 1; ++} ++ ++static int setattr_truncate(struct inode *inode, struct iattr *attr) ++{ ++ int result; ++ int s_result; ++ loff_t old_size; ++ reiser4_tree *tree; ++ ++ inode_check_scale(inode, inode->i_size, attr->ia_size); ++ ++ old_size = inode->i_size; ++ tree = reiser4_tree_by_inode(inode); ++ ++ result = safe_link_grab(tree, BA_CAN_COMMIT); ++ if (result == 0) ++ result = safe_link_add(inode, SAFE_TRUNCATE); ++ if (result == 0) ++ result = truncate_file_body(inode, attr); ++ if (result) ++ warning("vs-1588", "truncate_file failed: oid %lli, " ++ "old size %lld, new size %lld, retval %d", ++ (unsigned long long)get_inode_oid(inode), ++ old_size, attr->ia_size, result); ++ ++ s_result = safe_link_grab(tree, BA_CAN_COMMIT); ++ if (s_result == 0) ++ s_result = ++ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE); ++ if (s_result != 0) { ++ warning("nikita-3417", "Cannot kill safelink %lli: %i", ++ (unsigned long long)get_inode_oid(inode), s_result); ++ } ++ safe_link_release(tree); ++ return result; ++} ++ ++/* plugin->u.file.setattr method */ ++/* This calls inode_setattr and if truncate is in effect it also takes ++ exclusive inode access to avoid races */ ++int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */ ++ struct iattr *attr /* change description */ ) ++{ ++ int result; ++ ++ if (attr->ia_valid & ATTR_SIZE) { ++ reiser4_context *ctx; ++ struct unix_file_info *uf_info; ++ ++ /* truncate does reservation itself and requires exclusive ++ access obtained */ ++ ctx = reiser4_init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ uf_info = unix_file_inode_data(dentry->d_inode); ++ get_exclusive_access_careful(uf_info, dentry->d_inode); ++ result = setattr_truncate(dentry->d_inode, attr); ++ drop_exclusive_access(uf_info); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ } else ++ result = reiser4_setattr_common(dentry, attr); ++ ++ return result; ++} ++ ++/* plugin->u.file.init_inode_data */ ++void ++init_inode_data_unix_file(struct inode *inode, ++ reiser4_object_create_data * crd, int 
create) ++{ ++ struct unix_file_info *data; ++ ++ data = unix_file_inode_data(inode); ++ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN; ++ init_rwsem(&data->latch); ++ data->tplug = inode_formatting_plugin(inode); ++ data->exclusive_use = 0; ++ ++#if REISER4_DEBUG ++ data->ea_owner = NULL; ++ atomic_set(&data->nr_neas, 0); ++#endif ++ init_inode_ordering(inode, crd, create); ++} ++ ++/** ++ * delete_unix_file - delete_object of file_plugin ++ * @inode: inode to be deleted ++ * ++ * Truncates file to length 0, removes stat data and safe link. ++ */ ++int delete_object_unix_file(struct inode *inode) ++{ ++ struct unix_file_info *uf_info; ++ int result; ++ ++ if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) ++ return 0; ++ ++ /* truncate file bogy first */ ++ uf_info = unix_file_inode_data(inode); ++ get_exclusive_access(uf_info); ++ result = shorten_file(inode, 0 /* size */ ); ++ drop_exclusive_access(uf_info); ++ ++ if (result) ++ warning("edward-1556", ++ "failed to truncate file (%llu) on removal: %d", ++ get_inode_oid(inode), result); ++ ++ /* remove stat data and safe link */ ++ return reiser4_delete_object_common(inode); ++} ++ ++/* plugin->write_begin() */ ++int write_begin_unix_file(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ int ret; ++ struct unix_file_info *info; ++ ++ info = unix_file_inode_data(file->f_dentry->d_inode); ++ get_exclusive_access(info); ++ ret = find_file_state(file->f_dentry->d_inode, info); ++ if (likely(ret == 0)) { ++ if (info->container == UF_CONTAINER_TAILS) ++ ret = -EINVAL; ++ else ++ ret = do_prepare_write(file, page, from, to); ++ } ++ drop_exclusive_access(info); ++ return ret; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.33/fs/reiser4/plugin/file/file_conversion.c +--- linux-2.6.33.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file/file_conversion.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,747 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, ++ licensing governed by reiser4/README */ ++ ++/** ++ * This file contains dispatching hooks, and conversion methods, which ++ * implement transitions in the FILE interface. ++ * ++ * Dispatching hook makes a decision (at dispatching point) about the ++ * most reasonable plugin. Such decision is made in accordance with some ++ * O(1)-heuristic. ++ * ++ * We implement a transition CRYPTCOMPRESS -> UNIX_FILE for files with ++ * incompressible data. Current heuristic to estimate compressibility is ++ * very simple: if first complete logical cluster (64K by default) of a ++ * file is incompressible, then we make a decision, that the whole file ++ * is incompressible. ++ * ++ * To enable dispatching we install a special "magic" compression mode ++ * plugin CONVX_COMPRESSION_MODE_ID at file creation time. ++ * ++ * Note, that we don't perform back conversion (UNIX_FILE->CRYPTCOMPRESS) ++ * because of compatibility reasons). ++ * ++ * In conversion time we protect CS, the conversion set (file's (meta)data ++ * and plugin table (pset)) via special per-inode rw-semaphore (conv_sem). ++ * The methods which implement conversion are CS writers. The methods of FS ++ * interface (file_operations, inode_operations, address_space_operations) ++ * are CS readers. 
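One detail of the passive protection defined below is worth spelling out: should_protect() is tested a second time after the semaphore is taken, because a conversion may have completed in the window between the test and down_read(). In outline (a sketch, not part of the patch; conv_sem stands for &reiser4_inode_data(inode)->conv_sem):

        if (should_protect(inode)) {
                down_read(conv_sem);
                if (!should_protect(inode))
                        up_read(conv_sem); /* lost the race: conversion
                                            * finished, lock not needed */
        }
        inode_file_plugin(inode)->method(...);  /* plugin now stable */
        if (should_protect(inode))
                up_read(conv_sem);
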
++ */ ++ ++#include "../../inode.h" ++#include "../cluster.h" ++#include "file.h" ++ ++#define conversion_enabled(inode) \ ++ (inode_compression_mode_plugin(inode) == \ ++ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID)) ++ ++/** ++ * Located sections (readers and writers of @pset) are not permanently ++ * critical: cryptcompress file can be converted only if the conversion ++ * is enabled (see the macrio above). Also we don't perform back ++ * conversion. The following helper macro is a sanity check to decide ++ * if we need the protection (locks are always additional overheads). ++ */ ++#define should_protect(inode) \ ++ (inode_file_plugin(inode) == \ ++ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \ ++ conversion_enabled(inode)) ++/** ++ * To avoid confusion with read/write file operations, we'll speak about ++ * "passive" protection for FCS readers and "active" protection for FCS ++ * writers. All methods with active or passive protection have suffix ++ * "careful". ++ */ ++/** ++ * Macros for passive protection. ++ * ++ * Construct invariant operation to be supplied to VFS. ++ * The macro accepts the following lexemes: ++ * @type - type of the value represented by the compound statement; ++ * @method - name of an operation to be supplied to VFS (reiser4 file ++ * plugin also should contain a method with such name). ++ */ ++#define PROT_PASSIVE(type, method, args) \ ++({ \ ++ type _result; \ ++ struct rw_semaphore * guard = \ ++ &reiser4_inode_data(inode)->conv_sem; \ ++ \ ++ if (should_protect(inode)) { \ ++ down_read(guard); \ ++ if (!should_protect(inode)) \ ++ up_read(guard); \ ++ } \ ++ _result = inode_file_plugin(inode)->method args; \ ++ if (should_protect(inode)) \ ++ up_read(guard); \ ++ _result; \ ++}) ++ ++#define PROT_PASSIVE_VOID(method, args) \ ++({ \ ++ struct rw_semaphore * guard = \ ++ &reiser4_inode_data(inode)->conv_sem; \ ++ \ ++ if (should_protect(inode)) { \ ++ down_read(guard); \ ++ if (!should_protect(inode)) \ ++ up_read(guard); \ ++ } \ ++ inode_file_plugin(inode)->method args; \ ++ \ ++ if (should_protect(inode)) \ ++ up_read(guard); \ ++}) ++ ++/* Pass management to the unix-file plugin with "notail" policy */ ++static int __cryptcompress2unixfile(struct file *file, struct inode * inode) ++{ ++ int result; ++ reiser4_inode *info; ++ struct unix_file_info * uf; ++ info = reiser4_inode_data(inode); ++ ++ result = aset_set_unsafe(&info->pset, ++ PSET_FILE, ++ (reiser4_plugin *) ++ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)); ++ if (result) ++ return result; ++ result = aset_set_unsafe(&info->pset, ++ PSET_FORMATTING, ++ (reiser4_plugin *) ++ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID)); ++ if (result) ++ return result; ++ /* get rid of non-standard plugins */ ++ info->plugin_mask &= ~cryptcompress_mask; ++ /* get rid of plugin stat-data extension */ ++ info->extmask &= ~(1 << PLUGIN_STAT); ++ ++ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); ++ ++ /* FIXME use init_inode_data_unix_file() instead, ++ but aviod init_inode_ordering() */ ++ /* Init unix-file specific part of inode */ ++ uf = unix_file_inode_data(inode); ++ uf->container = UF_CONTAINER_UNKNOWN; ++ init_rwsem(&uf->latch); ++ uf->tplug = inode_formatting_plugin(inode); ++ uf->exclusive_use = 0; ++#if REISER4_DEBUG ++ uf->ea_owner = NULL; ++ atomic_set(&uf->nr_neas, 0); ++#endif ++ /** ++ * we was carefull for file_ops, inode_ops and as_ops ++ * to be invariant for plugin conversion, so there is ++ * no need to update ones already installed in the ++ * vfs's residence. 
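The invariance relied on above means the function pointers registered with the VFS never change; only the plugin looked up on each call does. A sketch of how a registered slot dispatches (cf. the *_careful() wrappers later in this patch; illustrative only, the _sketch name is not in the patch):

        ssize_t reiser4_read_careful_sketch(struct file *file,
                                            char __user *buf,
                                            size_t size, loff_t *off)
        {
                struct inode *inode = file->f_dentry->d_inode;
                /* PROT_PASSIVE(ssize_t, read, (file, buf, size, off)):
                 * re-reads the (possibly converted) plugin every call */
                return inode_file_plugin(inode)->read(file, buf, size, off);
        }
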
++ */ ++ return 0; ++} ++ ++#if REISER4_DEBUG ++static int disabled_conversion_inode_ok(struct inode * inode) ++{ ++ __u64 extmask = reiser4_inode_data(inode)->extmask; ++ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask; ++ ++ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) && ++ (extmask & (1 << UNIX_STAT)) && ++ (extmask & (1 << LARGE_TIMES_STAT)) && ++ (extmask & (1 << PLUGIN_STAT)) && ++ (plugin_mask & (1 << PSET_COMPRESSION_MODE))); ++} ++#endif ++ ++/** ++ * Disable future attempts to schedule/convert file plugin. ++ * This function is called by plugin schedule hooks. ++ * ++ * To disable conversion we assign any compression mode plugin id ++ * different from CONVX_COMPRESSION_MODE_ID. ++ */ ++static int disable_conversion(struct inode * inode) ++{ ++ int result; ++ result = ++ force_plugin_pset(inode, ++ PSET_COMPRESSION_MODE, ++ (reiser4_plugin *)compression_mode_plugin_by_id ++ (LATTD_COMPRESSION_MODE_ID)); ++ assert("edward-1500", ++ ergo(!result, disabled_conversion_inode_ok(inode))); ++ return result; ++} ++ ++/** ++ * Check if we really have achieved plugin scheduling point ++ */ ++static int check_dispatch_point(struct inode * inode, ++ loff_t pos /* position in the ++ file to write from */, ++ struct cluster_handle * clust, ++ struct dispatch_context * cont) ++{ ++ assert("edward-1505", conversion_enabled(inode)); ++ /* ++ * if file size is more then cluster size, then compressible ++ * status must be figured out (i.e. compression was disabled, ++ * or file plugin was converted to unix_file) ++ */ ++ assert("edward-1506", inode->i_size <= inode_cluster_size(inode)); ++ ++ if (pos > inode->i_size) ++ /* first logical cluster will contain a (partial) hole */ ++ return disable_conversion(inode); ++ if (pos < inode_cluster_size(inode)) ++ /* writing to the first logical cluster */ ++ return 0; ++ /* ++ * here we have: ++ * cluster_size <= pos <= i_size <= cluster_size, ++ * and, hence, pos == i_size == cluster_size ++ */ ++ assert("edward-1498", ++ pos == inode->i_size && ++ pos == inode_cluster_size(inode)); ++ assert("edward-1539", cont != NULL); ++ assert("edward-1540", cont->state == DISPATCH_INVAL_STATE); ++ ++ cont->state = DISPATCH_POINT; ++ return 0; ++} ++ ++static void start_check_compressibility(struct inode * inode, ++ struct cluster_handle * clust, ++ hint_t * hint) ++{ ++ assert("edward-1507", clust->index == 1); ++ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc)); ++ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ); ++ ++ hint_init_zero(hint); ++ clust->hint = hint; ++ clust->index --; ++ clust->nr_pages = size_in_pages(lbytes(clust->index, inode)); ++ ++ /* first logical cluster (of index #0) must be complete */ ++ assert("edward-1510", lbytes(clust->index, inode) == ++ inode_cluster_size(inode)); ++} ++ ++static void finish_check_compressibility(struct inode * inode, ++ struct cluster_handle * clust, ++ hint_t * hint) ++{ ++ reiser4_unset_hint(clust->hint); ++ clust->hint = hint; ++ clust->index ++; ++} ++ ++#if REISER4_DEBUG ++static int prepped_dclust_ok(hint_t * hint) ++{ ++ reiser4_key key; ++ coord_t * coord = &hint->ext_coord.coord; ++ ++ item_key_by_coord(coord, &key); ++ return (item_id_by_coord(coord) == CTAIL_ID && ++ !coord_is_unprepped_ctail(coord) && ++ (get_key_offset(&key) + nr_units_ctail(coord) == ++ dclust_get_extension_dsize(hint))); ++} ++#endif ++ ++#define fifty_persent(size) (size >> 1) ++/* evaluation of data compressibility */ ++#define data_is_compressible(osize, isize) \ ++ (osize < 
fifty_persent(isize)) ++ ++/** ++ * A simple O(1)-heuristic for compressibility. ++ * This is called not more then one time per file's life. ++ * Read first logical cluster (of index #0) and estimate its compressibility. ++ * Save estimation result in @cont. ++ */ ++static int read_check_compressibility(struct inode * inode, ++ struct cluster_handle * clust, ++ struct dispatch_context * cont) ++{ ++ int i; ++ int result; ++ size_t dst_len; ++ hint_t tmp_hint; ++ hint_t * cur_hint = clust->hint; ++ assert("edward-1541", cont->state == DISPATCH_POINT); ++ ++ start_check_compressibility(inode, clust, &tmp_hint); ++ ++ reset_cluster_pgset(clust, cluster_nrpages(inode)); ++ result = grab_page_cluster(inode, clust, READ_OP); ++ if (result) ++ return result; ++ /* Read page cluster here */ ++ for (i = 0; i < clust->nr_pages; i++) { ++ struct page *page = clust->pages[i]; ++ lock_page(page); ++ result = do_readpage_ctail(inode, clust, page, ++ ZNODE_READ_LOCK); ++ unlock_page(page); ++ if (result) ++ goto error; ++ } ++ tfm_cluster_clr_uptodate(&clust->tc); ++ ++ cluster_set_tfm_act(&clust->tc, TFMA_WRITE); ++ ++ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) { ++ /* lenght of compressed data is known, no need to compress */ ++ assert("edward-1511", ++ znode_is_any_locked(tmp_hint.lh.node)); ++ assert("edward-1512", ++ WITH_DATA(tmp_hint.ext_coord.coord.node, ++ prepped_dclust_ok(&tmp_hint))); ++ dst_len = dclust_get_extension_dsize(&tmp_hint); ++ } ++ else { ++ struct tfm_cluster * tc = &clust->tc; ++ compression_plugin * cplug = inode_compression_plugin(inode); ++ result = grab_tfm_stream(inode, tc, INPUT_STREAM); ++ if (result) ++ goto error; ++ for (i = 0; i < clust->nr_pages; i++) { ++ char *data; ++ lock_page(clust->pages[i]); ++ BUG_ON(!PageUptodate(clust->pages[i])); ++ data = kmap(clust->pages[i]); ++ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), ++ data, PAGE_CACHE_SIZE); ++ kunmap(clust->pages[i]); ++ unlock_page(clust->pages[i]); ++ } ++ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); ++ if (result) ++ goto error; ++ result = grab_coa(tc, cplug); ++ if (result) ++ goto error; ++ tc->len = tc->lsize = lbytes(clust->index, inode); ++ assert("edward-1513", tc->len == inode_cluster_size(inode)); ++ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); ++ cplug->compress(get_coa(tc, cplug->h.id, tc->act), ++ tfm_input_data(clust), tc->len, ++ tfm_output_data(clust), &dst_len); ++ assert("edward-1514", ++ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); ++ } ++ finish_check_compressibility(inode, clust, cur_hint); ++ cont->state = ++ (data_is_compressible(dst_len, inode_cluster_size(inode)) ? 
++ DISPATCH_REMAINS_OLD : ++ DISPATCH_ASSIGNED_NEW); ++ return 0; ++ error: ++ put_page_cluster(clust, inode, READ_OP); ++ return result; ++} ++ ++/* Cut disk cluster of index @idx */ ++static int cut_disk_cluster(struct inode * inode, cloff_t idx) ++{ ++ reiser4_key from, to; ++ assert("edward-1515", inode_file_plugin(inode) == ++ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); ++ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from); ++ to = from; ++ set_key_offset(&to, ++ get_key_offset(&from) + inode_cluster_size(inode) - 1); ++ return reiser4_cut_tree(reiser4_tree_by_inode(inode), ++ &from, &to, inode, 0); ++} ++ ++static int reserve_cryptcompress2unixfile(struct inode *inode) ++{ ++ reiser4_block_nr unformatted_nodes; ++ reiser4_tree *tree; ++ ++ tree = reiser4_tree_by_inode(inode); ++ ++ /* number of unformatted nodes which will be created */ ++ unformatted_nodes = cluster_nrpages(inode); /* N */ ++ ++ /* ++ * space required for one iteration of extent->tail conversion: ++ * ++ * 1. kill ctail items ++ * ++ * 2. insert N unformatted nodes ++ * ++ * 3. insert N (worst-case single-block ++ * extents) extent units. ++ * ++ * 4. drilling to the leaf level by coord_by_key() ++ * ++ * 5. possible update of stat-data ++ * ++ */ ++ grab_space_enable(); ++ return reiser4_grab_space ++ (2 * tree->height + ++ unformatted_nodes + ++ unformatted_nodes * estimate_one_insert_into_item(tree) + ++ 1 + estimate_one_insert_item(tree) + ++ inode_file_plugin(inode)->estimate.update(inode), ++ BA_CAN_COMMIT); ++} ++ ++/** ++ * Convert cryptcompress file plugin to unix_file plugin. ++ */ ++static int cryptcompress2unixfile(struct file *file, struct inode *inode, ++ struct dispatch_context *cont) ++{ ++ int i; ++ int result = 0; ++ struct cryptcompress_info *cr_info; ++ struct unix_file_info *uf_info; ++ assert("edward-1516", cont->pages[0]->index == 0); ++ ++ /* release all cryptcompress-specific resources */ ++ cr_info = cryptcompress_inode_data(inode); ++ result = reserve_cryptcompress2unixfile(inode); ++ if (result) ++ goto out; ++ /* tell kill_hook to not truncate pages */ ++ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS); ++ result = cut_disk_cluster(inode, 0); ++ if (result) ++ goto out; ++ /* captured jnode of cluster and assotiated resources (pages, ++ reserved disk space) were released by ->kill_hook() method ++ of the item plugin */ ++ ++ result = __cryptcompress2unixfile(file, inode); ++ if (result) ++ goto out; ++ /* At this point file is managed by unix file plugin */ ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ assert("edward-1518", ++ ergo(jprivate(cont->pages[0]), ++ !jnode_is_cluster_page(jprivate(cont->pages[0])))); ++ for(i = 0; i < cont->nr_pages; i++) { ++ assert("edward-1519", cont->pages[i]); ++ assert("edward-1520", PageUptodate(cont->pages[i])); ++ ++ result = find_or_create_extent(cont->pages[i]); ++ if (result) ++ break; ++ } ++ if (unlikely(result)) ++ goto out; ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ result = reiser4_update_sd(inode); ++ out: ++ all_grabbed2free(); ++ return result; ++} ++ ++#define convert_file_plugin cryptcompress2unixfile ++ ++/** ++ * This is called by ->write() method of a cryptcompress file plugin. ++ * Make a decision about the most reasonable file plugin id to manage ++ * the file. 
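The outcome recorded in @cont maps onto the dispatch_state enum declared in file.h further down in this patch; in summary (not part of the patch):

        /* DISPATCH_POINT        - first complete logical cluster is being
         *                         written: run the compressibility check
         * DISPATCH_REMAINS_OLD  - cluster compressed to less than half its
         *                         size: keep cryptcompress and disable
         *                         further checks
         * DISPATCH_ASSIGNED_NEW - incompressible: convert the file to the
         *                         unix_file plugin */
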
++ */ ++int write_dispatch_hook(struct file *file, struct inode *inode, ++ loff_t pos, struct cluster_handle *clust, ++ struct dispatch_context *cont) ++{ ++ int result; ++ if (!conversion_enabled(inode)) ++ return 0; ++ result = check_dispatch_point(inode, pos, clust, cont); ++ if (result || cont->state != DISPATCH_POINT) ++ return result; ++ result = read_check_compressibility(inode, clust, cont); ++ if (result) ++ return result; ++ if (cont->state == DISPATCH_REMAINS_OLD) { ++ put_page_cluster(clust, inode, READ_OP); ++ return disable_conversion(inode); ++ } ++ assert("edward-1543", cont->state == DISPATCH_ASSIGNED_NEW); ++ /* ++ * page cluster is grabbed and uptodate. It will be ++ * released with a pgset after plugin conversion is ++ * finished, see put_dispatch_context(). ++ */ ++ reiser4_unset_hint(clust->hint); ++ move_cluster_pgset(clust, &cont->pages, &cont->nr_pages); ++ return 0; ++} ++ ++/** ++ * This is called by ->setattr() method of cryptcompress file plugin. ++ */ ++int setattr_dispatch_hook(struct inode * inode) ++{ ++ if (conversion_enabled(inode)) ++ return disable_conversion(inode); ++ return 0; ++} ++ ++static inline void init_dispatch_context(struct dispatch_context * cont) ++{ ++ memset(cont, 0, sizeof(*cont)); ++} ++ ++static inline void done_dispatch_context(struct dispatch_context * cont, ++ struct inode * inode) ++{ ++ if (cont->pages) { ++ __put_page_cluster(0, cont->nr_pages, cont->pages, inode); ++ kfree(cont->pages); ++ } ++} ++/** ++ * Here are wrappers with "protection", aka Reiser4 "careful" methods. ++ * They are used by vfs (as methods of file_ops, inode_ops or as_ops), ++ * which is not aware of plugin conversion performed by Reiser4. ++ */ ++ ++/* ++ * Wrappers with active protection for: ++ * ++ * ->write(); ++ */ ++ ++/* ++ * ->write() file operation supplied to VFS. ++ * Write a file in 3 steps (some of them can be optional). ++ */ ++ssize_t reiser4_write_careful(struct file *file, const char __user *buf, ++ size_t count, loff_t *off) ++{ ++ int result; ++ reiser4_context *ctx; ++ ssize_t written_old = 0; /* bytes written with initial plugin */ ++ ssize_t written_new = 0; /* bytes written with new plugin */ ++ struct dispatch_context cont; ++ struct inode * inode = file->f_dentry->d_inode; ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ init_dispatch_context(&cont); ++ mutex_lock(&inode->i_mutex); ++ /** ++ * First step. ++ * Start write with initial file plugin. ++ * Keep a plugin schedule status at @cont (if any). ++ */ ++ written_old = inode_file_plugin(inode)->write(file, ++ buf, ++ count, ++ off, ++ &cont); ++ if (cont.state != DISPATCH_ASSIGNED_NEW || written_old < 0) ++ goto exit; ++ /** ++ * Second step. ++ * New file plugin has been scheduled. ++ * Perform conversion to the new plugin. ++ */ ++ down_read(&reiser4_inode_data(inode)->conv_sem); ++ result = convert_file_plugin(file, inode, &cont); ++ up_read(&reiser4_inode_data(inode)->conv_sem); ++ if (result) { ++ warning("edward-1544", ++ "Inode %llu: file plugin conversion failed (%d)", ++ (unsigned long long)get_inode_oid(inode), ++ result); ++ context_set_commit_async(ctx); ++ goto exit; ++ } ++ reiser4_txn_restart(ctx); ++ /** ++ * Third step: ++ * Finish write with the new file plugin. 
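Condensed, the careful write implemented here is a three-step protocol (a sketch; locking and error handling elided, not part of the patch):

        /* 1 */ written_old = inode_file_plugin(inode)->write(file, buf,
                                        count, off, &cont);
        /* 2 */ if (cont.state == DISPATCH_ASSIGNED_NEW)
                        convert_file_plugin(file, inode, &cont);
        /* 3 */ written_new = inode_file_plugin(inode)->write(file,
                                        buf + written_old,
                                        count - written_old, off, NULL);
        return written_old + written_new;
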
++ */ ++ assert("edward-1536", ++ inode_file_plugin(inode) == ++ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)); ++ ++ written_new = inode_file_plugin(inode)->write(file, ++ buf + written_old, ++ count - written_old, ++ off, ++ NULL); ++ exit: ++ mutex_unlock(&inode->i_mutex); ++ done_dispatch_context(&cont, inode); ++ reiser4_exit_context(ctx); ++ ++ return written_old + (written_new < 0 ? 0 : written_new); ++} ++ ++/* Wrappers with passive protection for: ++ * ++ * ->open(); ++ * ->read(); ++ * ->ioctl(); ++ * ->mmap(); ++ * ->release(); ++ * ->bmap(). ++ */ ++ ++int reiser4_open_careful(struct inode *inode, struct file *file) ++{ ++ return PROT_PASSIVE(int, open, (inode, file)); ++} ++ ++ssize_t reiser4_read_careful(struct file * file, char __user * buf, ++ size_t size, loff_t * off) ++{ ++ struct inode * inode = file->f_dentry->d_inode; ++ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off)); ++} ++ ++int reiser4_ioctl_careful(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg)); ++} ++ ++int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ return PROT_PASSIVE(int, mmap, (file, vma)); ++} ++ ++int reiser4_release_careful(struct inode *inode, struct file *file) ++{ ++ return PROT_PASSIVE(int, release, (inode, file)); ++} ++ ++sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock) ++{ ++ struct inode *inode = mapping->host; ++ return PROT_PASSIVE(sector_t, bmap, (mapping, lblock)); ++} ++ ++/** ++ * NOTE: The following two methods are ++ * used only for loopback functionality. ++ * reiser4_write_end() can not cope with ++ * short writes for now. ++ */ ++int reiser4_write_begin_careful(struct file *file, ++ struct address_space *mapping, ++ loff_t pos, ++ unsigned len, ++ unsigned flags, ++ struct page **pagep, ++ void **fsdata) ++{ ++ int ret = 0; ++ unsigned start, end; ++ struct page *page; ++ pgoff_t index; ++ reiser4_context *ctx; ++ struct inode * inode = file->f_dentry->d_inode; ++ ++ index = pos >> PAGE_CACHE_SHIFT; ++ start = pos & (PAGE_CACHE_SIZE - 1); ++ end = start + len; ++ ++ page = grab_cache_page_write_begin(mapping, index, ++ flags & AOP_FLAG_NOFS); ++ *pagep = page; ++ if (!page) ++ return -ENOMEM; ++ ++ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) { ++ ret = PTR_ERR(ctx); ++ goto out; ++ } ++ ret = PROT_PASSIVE(int, write_begin, (file, page, start, end)); ++ ++ /* don't commit transaction under inode semaphore */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ out: ++ if (unlikely(ret)) { ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return ret; ++} ++ ++int reiser4_write_end_careful(struct file *file, ++ struct address_space *mapping, ++ loff_t pos, ++ unsigned len, ++ unsigned copied, ++ struct page *page, ++ void *fsdata) ++{ ++ int ret; ++ reiser4_context *ctx; ++ unsigned start, end; ++ struct inode *inode = page->mapping->host; ++ ++ assert("umka-3101", file != NULL); ++ assert("umka-3102", page != NULL); ++ assert("umka-3093", PageLocked(page)); ++ ++ start = pos & (PAGE_CACHE_SIZE - 1); ++ end = start + len; ++ ++ flush_dcache_page(page); ++ SetPageUptodate(page); ++ ++ ctx = reiser4_init_context(page->mapping->host->i_sb); ++ if (IS_ERR(ctx)){ ++ unlock_page(page); ++ ret = PTR_ERR(ctx); ++ goto out; ++ } ++ ret = PROT_PASSIVE(int, write_end, (file, page, start, end)); ++ ++ /* don't commit transaction under 
inode semaphore */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ out: ++ page_cache_release(page); ++ if (!ret) ++ ret = copied; ++ return ret; ++} ++ ++/* ++ * Wrappers without protection for: ++ * ++ * ->setattr() ++ */ ++int reiser4_setattr(struct dentry *dentry, struct iattr *attr) ++{ ++ return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/file.h linux-2.6.33/fs/reiser4/plugin/file/file.h +--- linux-2.6.33.orig/fs/reiser4/plugin/file/file.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file/file.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,336 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* this file contains declarations of methods implementing ++ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID ++ and SYMLINK_FILE_PLUGIN_ID) */ ++ ++#if !defined( __REISER4_FILE_H__ ) ++#define __REISER4_FILE_H__ ++ ++/* possible states in dispatching process */ ++typedef enum { ++ DISPATCH_INVAL_STATE, /* invalid state */ ++ DISPATCH_POINT, /* dispatching point has been achieved */ ++ DISPATCH_REMAINS_OLD, /* made a decision to manage by old plugin */ ++ DISPATCH_ASSIGNED_NEW /* a new plugin has been assigned */ ++} dispatch_state; ++ ++struct dispatch_context { ++ int nr_pages; ++ struct page **pages; ++ dispatch_state state; ++}; ++ ++/** ++ * Declarations of common/careful/generic methods. ++ * Suppose ->foo() is a vs method (of f_ops, i_ops, or a_ops); ++ * Then common reiser4 method for foo looks like reiser4_foo_common; ++ * careful method looks like reiser4_foo_careful; ++ * generic method looks like reiser4_foo. ++ * ++ * Common method is a simple instruction set eligible for more ++ * then one plugin id. ++ * ++ * Generic method looks at the plugin installed in inode's ++ * plugin set and calls its appropriate method. ++ * ++ * Careful method looks like generic method with protected pset ++ * (see plugin/file/file_conversion.c for details). 
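As a concrete instance of the convention, using names that appear in this patch: reiser4_read_careful() below is the careful method wired into the VFS, reiser4_sync_file_common() is a common method shared by more than one plugin id, and reiser4_read(), visible in the backtrace quoted earlier, is a generic dispatcher.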
++ */ ++ ++/* inode operations */ ++int reiser4_setattr(struct dentry *, struct iattr *); ++ ++/* file operations */ ++ssize_t reiser4_read_careful(struct file *, char __user *buf, ++ size_t count, loff_t *off); ++ssize_t reiser4_write_careful(struct file *, const char __user *buf, ++ size_t count, loff_t * off); ++int reiser4_ioctl_careful(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); ++int reiser4_mmap_careful(struct file *, struct vm_area_struct *); ++int reiser4_open_careful(struct inode *inode, struct file *file); ++int reiser4_release_careful(struct inode *, struct file *); ++int reiser4_sync_file_common(struct file *, struct dentry *, int datasync); ++ ++/* address space operations */ ++int reiser4_readpage(struct file *, struct page *); ++int reiser4_readpages(struct file*, struct address_space*, struct list_head*, ++ unsigned); ++int reiser4_writepages(struct address_space *, struct writeback_control *); ++int reiser4_write_begin_careful(struct file *file, ++ struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata); ++int reiser4_write_end_careful(struct file *file, ++ struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata); ++sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock); ++ ++/* ++ * Private methods of unix-file plugin ++ * (UNIX_FILE_PLUGIN_ID) ++ */ ++ ++/* private inode operations */ ++int setattr_unix_file(struct dentry *, struct iattr *); ++ ++/* private file operations */ ++ ++ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount, ++ loff_t *off); ++ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount, ++ loff_t * off, struct dispatch_context * cont); ++int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd, ++ unsigned long arg); ++int mmap_unix_file(struct file *, struct vm_area_struct *); ++int open_unix_file(struct inode *, struct file *); ++int release_unix_file(struct inode *, struct file *); ++ ++/* private address space operations */ ++int readpage_unix_file(struct file *, struct page *); ++int readpages_unix_file(struct file*, struct address_space*, struct list_head*, ++ unsigned); ++int writepages_unix_file(struct address_space *, struct writeback_control *); ++int write_begin_unix_file(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++int write_end_unix_file(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++sector_t bmap_unix_file(struct address_space *, sector_t lblock); ++ ++/* other private methods */ ++int delete_object_unix_file(struct inode *); ++int flow_by_inode_unix_file(struct inode *, const char __user *buf, ++ int user, loff_t, loff_t, rw_op, flow_t *); ++int owns_item_unix_file(const struct inode *, const coord_t *); ++void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *, ++ int create); ++ ++/* ++ * Private methods of cryptcompress file plugin ++ * (CRYPTCOMPRESS_FILE_PLUGIN_ID) ++ */ ++ ++/* private inode operations */ ++int setattr_cryptcompress(struct dentry *, struct iattr *); ++ ++/* private file operations */ ++ssize_t read_cryptcompress(struct file *, char __user *buf, ++ size_t count, loff_t *off); ++ssize_t write_cryptcompress(struct file *, const char __user *buf, ++ size_t count, loff_t * off, ++ struct dispatch_context *cont); ++int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd, ++ unsigned long arg); ++int 
mmap_cryptcompress(struct file *, struct vm_area_struct *);
++int open_cryptcompress(struct inode *, struct file *);
++int release_cryptcompress(struct inode *, struct file *);
++
++/* private address space operations */
++int readpage_cryptcompress(struct file *, struct page *);
++int readpages_cryptcompress(struct file*, struct address_space*,
++			    struct list_head*, unsigned);
++int writepages_cryptcompress(struct address_space *,
++			     struct writeback_control *);
++int write_begin_cryptcompress(struct file *file, struct page *page,
++			      unsigned from, unsigned to);
++int write_end_cryptcompress(struct file *file, struct page *page,
++			    unsigned from, unsigned to);
++sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
++
++/* other private methods */
++int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
++				int user, loff_t, loff_t, rw_op, flow_t *);
++int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
++int create_object_cryptcompress(struct inode *, struct inode *,
++				reiser4_object_create_data *);
++int delete_object_cryptcompress(struct inode *);
++void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
++				   int create);
++int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
++				  const reiser4_key * to_key,
++				  reiser4_key * smallest_removed,
++				  struct inode *object, int truncate,
++				  int *progress);
++void destroy_inode_cryptcompress(struct inode *);
++
++/*
++ * Private methods of symlink file plugin
++ * (SYMLINK_FILE_PLUGIN_ID)
++ */
++int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
++			   reiser4_object_create_data *);
++void destroy_inode_symlink(struct inode *);
++
++/*
++ * all writing to a unix file is performed by the item write method. The write
++ * method of the unix file plugin only decides which item plugin (extent or
++ * tail) to call, and in which mode (one from the enum below)
++ */
++typedef enum {
++	FIRST_ITEM = 1,
++	APPEND_ITEM = 2,
++	OVERWRITE_ITEM = 3
++} write_mode_t;
++
++/* a unix file may be in one of the following states */
++typedef enum {
++	UF_CONTAINER_UNKNOWN = 0,
++	UF_CONTAINER_TAILS = 1,
++	UF_CONTAINER_EXTENTS = 2,
++	UF_CONTAINER_EMPTY = 3
++} file_container_t;
++
++struct formatting_plugin;
++struct inode;
++
++/* unix file plugin specific part of reiser4 inode */
++struct unix_file_info {
++	/*
++	 * this read-write lock protects file containerization changes.
++	 * Accesses which do not change file containerization (see
++	 * file_container_t) (read, readpage, writepage, write (until tail
++	 * conversion is involved)) take the read lock. Accesses which modify
++	 * file containerization (truncate, conversion from tail to extent
++	 * and back) take the write lock.
++ */ ++ struct rw_semaphore latch; ++ /* this enum specifies which items are used to build the file */ ++ file_container_t container; ++ /* ++ * plugin which controls when file is to be converted to extents and ++ * back to tail ++ */ ++ struct formatting_plugin *tplug; ++ /* if this is set, file is in exclusive use */ ++ int exclusive_use; ++#if REISER4_DEBUG ++ /* pointer to task struct of thread owning exclusive access to file */ ++ void *ea_owner; ++ atomic_t nr_neas; ++ void *last_reader; ++#endif ++}; ++ ++struct unix_file_info *unix_file_inode_data(const struct inode *inode); ++void get_exclusive_access(struct unix_file_info *); ++void drop_exclusive_access(struct unix_file_info *); ++void get_nonexclusive_access(struct unix_file_info *); ++void drop_nonexclusive_access(struct unix_file_info *); ++int try_to_get_nonexclusive_access(struct unix_file_info *); ++int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode, ++ struct inode *); ++int find_file_item_nohint(coord_t *, lock_handle *, ++ const reiser4_key *, znode_lock_mode, ++ struct inode *); ++ ++int load_file_hint(struct file *, hint_t *); ++void save_file_hint(struct file *, const hint_t *); ++ ++#include "../item/extent.h" ++#include "../item/tail.h" ++#include "../item/ctail.h" ++ ++struct uf_coord { ++ coord_t coord; ++ lock_handle *lh; ++ int valid; ++ union { ++ struct extent_coord_extension extent; ++ struct tail_coord_extension tail; ++ struct ctail_coord_extension ctail; ++ } extension; ++}; ++ ++#include "../../forward.h" ++#include "../../seal.h" ++#include "../../lock.h" ++ ++/* ++ * This structure is used to speed up file operations (reads and writes). A ++ * hint is a suggestion about where a key resolved to last time. A seal ++ * indicates whether a node has been modified since a hint was last recorded. ++ * You check the seal, and if the seal is still valid, you can use the hint ++ * without traversing the tree again. 
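The check-the-seal-then-reuse pattern described above is worth a concrete sketch. The following is editorial and illustrative only, assuming the hint API declared in this header and the reiser4_seal_validate() signature used later in file_ops_readdir.c; it is not code from this patch:

	/* sketch: try the cached position first, fall back to a tree search */
	static int sketch_lookup_with_hint(struct file *file, struct inode *inode,
					   const reiser4_key *key)
	{
		hint_t hint;
		int result;

		result = load_file_hint(file, &hint);
		if (result != 0)
			return result;

		if (hint_is_valid(&hint) &&
		    reiser4_seal_validate(&hint.seal, &hint.ext_coord.coord,
					  key, &hint.lh, hint.mode,
					  ZNODE_LOCK_HIPRI) == 0)
			return 0;	/* seal intact: the cached coord is usable */

		/* seal broken: the node changed since the hint was recorded,
		 * so a full tree traversal is needed */
		return find_file_item(&hint, key, ZNODE_READ_LOCK, inode);
	}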
++ */
++struct hint {
++	seal_t seal;		/* a seal over the last file item accessed */
++	uf_coord_t ext_coord;
++	loff_t offset;
++	znode_lock_mode mode;
++	lock_handle lh;
++};
++
++static inline int hint_is_valid(hint_t * hint)
++{
++	return hint->ext_coord.valid;
++}
++
++static inline void hint_set_valid(hint_t * hint)
++{
++	hint->ext_coord.valid = 1;
++}
++
++static inline void hint_clr_valid(hint_t * hint)
++{
++	hint->ext_coord.valid = 0;
++}
++
++int load_file_hint(struct file *, hint_t *);
++void save_file_hint(struct file *, const hint_t *);
++void hint_init_zero(hint_t *);
++void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
++int hint_is_set(const hint_t *);
++void reiser4_unset_hint(hint_t *);
++
++int reiser4_update_file_size(struct inode *, loff_t, int update_sd);
++int cut_file_items(struct inode *, loff_t new_size,
++		   int update_sd, loff_t cur_size,
++		   int (*update_actor) (struct inode *, loff_t, int));
++#if REISER4_DEBUG
++
++/* return 1 if exclusive access is obtained, 0 otherwise */
++static inline int ea_obtained(struct unix_file_info * uf_info)
++{
++	int ret;
++
++	ret = down_read_trylock(&uf_info->latch);
++	if (ret)
++		up_read(&uf_info->latch);
++	return !ret;
++}
++
++#endif
++
++#define WRITE_GRANULARITY 32
++
++int tail2extent(struct unix_file_info *);
++int extent2tail(struct file *, struct unix_file_info *);
++
++int goto_right_neighbor(coord_t *, lock_handle *);
++int find_or_create_extent(struct page *);
++int equal_to_ldk(znode *, const reiser4_key *);
++
++void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
++
++static inline int cbk_errored(int cbk_result)
++{
++	return (cbk_result != CBK_COORD_NOTFOUND
++		&& cbk_result != CBK_COORD_FOUND);
++}
++
++/* __REISER4_FILE_H__ */
++#endif
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * scroll-step: 1
++ * End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/Makefile linux-2.6.33/fs/reiser4/plugin/file/Makefile
+--- linux-2.6.33.orig/fs/reiser4/plugin/file/Makefile	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/file/Makefile	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,7 @@
++obj-$(CONFIG_REISER4_FS) += file_plugins.o
++
++file_plugins-objs := \
++	file.o \
++	tail_conversion.o \
++	symlink.o \
++	cryptcompress.o
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.33/fs/reiser4/plugin/file/symfile.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/file/symfile.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/file/symfile.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,87 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* Symfiles are a generalization of Unix symlinks.
++
++   A symfile when read behaves as though you took its contents and
++   substituted them into the reiser4 naming system as the right hand side
++   of an assignment, and then read that which you had assigned to it.
++
++   A key issue for symfiles is how to implement writes through to
++   subfiles. In general, one must have some method of determining what
++   of that which is written to the symfile is written to what subfile.
++   This can be done by use of custom plugin methods written by users, or
++   by using a few general methods we provide for those willing to endure
++   the insertion of delimiters into what is read.
++ ++ Writing to symfiles without delimiters to denote what is written to ++ what subfile is not supported by any plugins we provide in this ++ release. Our most sophisticated support for writes is that embodied ++ by the invert plugin (see invert.c). ++ ++ A read only version of the /etc/passwd file might be ++ constructed as a symfile whose contents are as follows: ++ ++ /etc/passwd/userlines/* ++ ++ or ++ ++ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root ++ ++ or ++ ++ /etc/passwd/userlines/(demidov+edward+reiser+root) ++ ++ A symfile with contents ++ ++ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB ++ ++ will return when read ++ ++ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB ++ ++ and write of what has been read will not be possible to implement as ++ an identity operation because there are no delimiters denoting the ++ boundaries of what is to be written to what subfile. ++ ++ Note that one could make this a read/write symfile if one specified ++ delimiters, and the write method understood those delimiters delimited ++ what was written to subfiles. ++ ++ So, specifying the symfile in a manner that allows writes: ++ ++ /etc/passwd/userlines/demidov+"( ++ )+/etc/passwd/userlines/edward+"( ++ )+/etc/passwd/userlines/reiser+"( ++ )+/etc/passwd/userlines/root+"( ++ ) ++ ++ or ++ ++ /etc/passwd/userlines/(demidov+"( ++ )+edward+"( ++ )+reiser+"( ++ )+root+"( ++ )) ++ ++ and the file demidov might be specified as: ++ ++ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell ++ ++ or ++ ++ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell) ++ ++ Notice that if the file demidov has a carriage return in it, the ++ parsing fails, but then if you put carriage returns in the wrong place ++ in a normal /etc/passwd file it breaks things also. ++ ++ Note that it is forbidden to have no text between two interpolations ++ if one wants to be able to define what parts of a write go to what ++ subfiles referenced in an interpolation. ++ ++ If one wants to be able to add new lines by writing to the file, one ++ must either write a custom plugin for /etc/passwd that knows how to ++ name an added line, or one must use an invert, or one must use a more ++ sophisticated symfile syntax that we are not planning to write for ++ version 4.0. 
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.33/fs/reiser4/plugin/file/symlink.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/file/symlink.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/file/symlink.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,95 @@
++/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
++
++#include "../../inode.h"
++
++#include <linux/types.h>
++#include <linux/fs.h>
++
++/* file plugin methods specific to symlink files
++   (SYMLINK_FILE_PLUGIN_ID) */
++
++/* this is the implementation of the create_object method of the file plugin
++   for SYMLINK_FILE_PLUGIN_ID
++ */
++
++/**
++ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
++ * @symlink: inode of symlink object
++ * @dir: inode of parent directory
++ * @data: parameters of new object
++ *
++ * Inserts stat data with a symlink extension into the tree.
++ */
++int reiser4_create_symlink(struct inode *symlink,
++			   struct inode *dir UNUSED_ARG,
++			   reiser4_object_create_data *data	/* info passed to us;
++								 * this is filled by
++								 * the reiser4() syscall
++								 * in particular */)
++{
++	int result;
++
++	assert("nikita-680", symlink != NULL);
++	assert("nikita-681", S_ISLNK(symlink->i_mode));
++	assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
++	assert("nikita-682", dir != NULL);
++	assert("nikita-684", data != NULL);
++	assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
++
++	/*
++	 * the stat data of a symlink has a symlink extension in which we
++	 * store the symlink content, that is, the path the symlink points to.
++	 */
++	reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
++
++	assert("vs-838", symlink->i_private == NULL);
++	symlink->i_private = (void *)data->name;
++
++	assert("vs-843", symlink->i_size == 0);
++	INODE_SET_FIELD(symlink, i_size, strlen(data->name));
++
++	/* insert stat data appended with data->name */
++	result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
++	if (result) {
++		/* FIXME-VS: Make sure that symlink->i_private is not attached
++		   to kmalloced data */
++		INODE_SET_FIELD(symlink, i_size, 0);
++	} else {
++		assert("vs-849", symlink->i_private
++		       && reiser4_inode_get_flag(symlink,
++						 REISER4_GENERIC_PTR_USED));
++		assert("vs-850",
++		       !memcmp((char *)symlink->i_private, data->name,
++			       (size_t) symlink->i_size + 1));
++	}
++	return result;
++}
++
++/* this is the implementation of the destroy_inode method of the file plugin
++   for SYMLINK_FILE_PLUGIN_ID
++ */
++void destroy_inode_symlink(struct inode *inode)
++{
++	assert("edward-799",
++	       inode_file_plugin(inode) ==
++	       file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
++	assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
++	assert("edward-801", reiser4_inode_get_flag(inode,
++						    REISER4_GENERIC_PTR_USED));
++	assert("vs-839", S_ISLNK(inode->i_mode));
++
++	kfree(inode->i_private);
++	inode->i_private = NULL;
++	reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
++}
++
++/*
++  Local variables:
++  c-indentation-style: "K&R"
++  mode-name: "LC"
++  c-basic-offset: 8
++  tab-width: 8
++  fill-column: 80
++  scroll-step: 1
++  End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.33/fs/reiser4/plugin/file/tail_conversion.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/file/tail_conversion.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/file/tail_conversion.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,743 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#include "../../inode.h"
++#include "../../super.h"
++#include "../../page_cache.h"
++#include "../../carry.h"
++#include "../../safe_link.h"
++#include "../../vfs_ops.h"
++
++#include <linux/writeback.h>
++
++/* this file contains:
++   tail2extent and extent2tail */
++
++/* exclusive access to a file is acquired when the file state changes:
++   tail2extent, empty2tail, extent2tail, etc */
++void get_exclusive_access(struct unix_file_info * uf_info)
++{
++	assert("nikita-3028", reiser4_schedulable());
++	assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
++	assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
++	/*
++	 * "deadlock avoidance": sometimes we commit a transaction under an
++	 * rw-semaphore on a file. Such a commit can deadlock with another
++	 * thread that captured some block (hence preventing the atom from
++	 * being committed) and waits on the rw-semaphore.
++	 */
++	reiser4_txn_restart_current();
++	LOCK_CNT_INC(inode_sem_w);
++	down_write(&uf_info->latch);
++	uf_info->exclusive_use = 1;
++	assert("vs-1713", uf_info->ea_owner == NULL);
++	assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
++	ON_DEBUG(uf_info->ea_owner = current);
++}
++
++void drop_exclusive_access(struct unix_file_info * uf_info)
++{
++	assert("vs-1714", uf_info->ea_owner == current);
++	assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
++	ON_DEBUG(uf_info->ea_owner = NULL);
++	uf_info->exclusive_use = 0;
++	up_write(&uf_info->latch);
++	assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
++	assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
++	LOCK_CNT_DEC(inode_sem_w);
++	reiser4_txn_restart_current();
++}
++
++/**
++ * nea_grabbed - do something when the file semaphore is down_read-ed
++ * @uf_info:
++ *
++ * This is called when nonexclusive access is obtained on a file. Everything
++ * it does is for debugging purposes.
++ */
++static void nea_grabbed(struct unix_file_info *uf_info)
++{
++#if REISER4_DEBUG
++	LOCK_CNT_INC(inode_sem_r);
++	assert("vs-1716", uf_info->ea_owner == NULL);
++	atomic_inc(&uf_info->nr_neas);
++	uf_info->last_reader = current;
++#endif
++}
++
++/**
++ * get_nonexclusive_access - get nonexclusive access to a file
++ * @uf_info: unix file specific part of inode to obtain access to
++ *
++ * Nonexclusive access is obtained on a file before read, write, readpage.
++ */
++void get_nonexclusive_access(struct unix_file_info *uf_info)
++{
++	assert("nikita-3029", reiser4_schedulable());
++	assert("nikita-3361", get_current_context()->trans->atom == NULL);
++
++	down_read(&uf_info->latch);
++	nea_grabbed(uf_info);
++}
++
++/**
++ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
++ * @uf_info: unix file specific part of inode to obtain access to
++ *
++ * Non-blocking version of nonexclusive access obtaining.
++ */
++int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
++{
++	int result;
++
++	result = down_read_trylock(&uf_info->latch);
++	if (result)
++		nea_grabbed(uf_info);
++	return result;
++}
++
++void drop_nonexclusive_access(struct unix_file_info * uf_info)
++{
++	assert("vs-1718", uf_info->ea_owner == NULL);
++	assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
++	ON_DEBUG(atomic_dec(&uf_info->nr_neas));
++
++	up_read(&uf_info->latch);
++
++	LOCK_CNT_DEC(inode_sem_r);
++	reiser4_txn_restart_current();
++}
++
++/* part of tail2extent. Cut all items covering @count bytes starting from
++   @offset */
++/* Audited by: green(2002.06.15) */
++static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
++{
++	reiser4_key from, to;
++
++	/* AUDIT: How about putting an assertion here, what would check
++	   all provided range is covered by tail items only? */
++	/* key of first byte in the range to be cut */
++	inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
++
++	/* key of last byte in that range */
++	to = from;
++	set_key_offset(&to, (__u64) (offset + count - 1));
++
++	/* cut everything between those keys */
++	return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
++				inode, 0);
++}
++
++static void release_all_pages(struct page **pages, unsigned nr_pages)
++{
++	unsigned i;
++
++	for (i = 0; i < nr_pages; i++) {
++		if (pages[i] == NULL) {
++#if REISER4_DEBUG
++			unsigned j;
++			for (j = i + 1; j < nr_pages; j++)
++				assert("vs-1620", pages[j] == NULL);
++#endif
++			break;
++		}
++		page_cache_release(pages[i]);
++		pages[i] = NULL;
++	}
++}
++
++/* part of tail2extent. Replace tail items with an extent item. The content of
++   the tail items (@count bytes) being cut has already been copied into the
++   pages. The extent_writepage method is called to create extents
++   corresponding to those pages */
++static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
++{
++	int result;
++	unsigned i;
++	STORE_COUNTERS;
++
++	if (nr_pages == 0)
++		return 0;
++
++	assert("vs-596", pages[0]);
++
++	/* cut copied items */
++	result = cut_formatting_items(inode, page_offset(pages[0]), count);
++	if (result)
++		return result;
++
++	CHECK_COUNTERS;
++
++	/* put into the tree a replacement for the just removed items: an
++	   extent item, namely */
++	for (i = 0; i < nr_pages; i++) {
++		result = add_to_page_cache_lru(pages[i], inode->i_mapping,
++					       pages[i]->index,
++					       mapping_gfp_mask(inode->
++								i_mapping));
++		if (result)
++			break;
++		unlock_page(pages[i]);
++		result = find_or_create_extent(pages[i]);
++		if (result)
++			break;
++		SetPageUptodate(pages[i]);
++	}
++	return result;
++}
++
++#define TAIL2EXTENT_PAGE_NUM 3	/* number of pages to fill before cutting tail
++				 * items */
++
++static int reserve_tail2extent_iteration(struct inode *inode)
++{
++	reiser4_block_nr unformatted_nodes;
++	reiser4_tree *tree;
++
++	tree = reiser4_tree_by_inode(inode);
++
++	/* number of unformatted nodes which will be created */
++	unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
++
++	/*
++	 * space required for one iteration of tail->extent conversion:
++	 *
++	 * 1. kill N tail items
++	 *
++	 * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
++	 *
++	 * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
++	 * extents) extent units.
++	 *
++	 * 4. drilling to the leaf level by coord_by_key()
++	 *
++	 * 5. possible update of stat-data
++	 *
++	 */
++	grab_space_enable();
++	return reiser4_grab_space
++	    (2 * tree->height +
++	     TAIL2EXTENT_PAGE_NUM +
++	     TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
++	     1 + estimate_one_insert_item(tree) +
++	     inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
++}
++
++/* clear the stat-data flag indicating that the file is being converted */
++static int complete_conversion(struct inode *inode)
++{
++	int result;
++
++	grab_space_enable();
++	result =
++	    reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
++			       BA_CAN_COMMIT);
++	if (result == 0) {
++		reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
++		result = reiser4_update_sd(inode);
++	}
++	if (result)
++		warning("vs-1696", "Failed to clear converting bit of %llu: %i",
++			(unsigned long long)get_inode_oid(inode), result);
++	return 0;
++}
++
++/**
++ * find_start
++ * @inode:
++ * @id:
++ * @offset:
++ *
++ * this is used by tail2extent and extent2tail to detect where a previous
++ * incomplete conversion stopped
++ */
++static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
++{
++	int result;
++	lock_handle lh;
++	coord_t coord;
++	struct unix_file_info *ufo;
++	int found;
++	reiser4_key key;
++
++	ufo = unix_file_inode_data(inode);
++	init_lh(&lh);
++	result = 0;
++	found = 0;
++	inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
++	do {
++		init_lh(&lh);
++		result = find_file_item_nohint(&coord, &lh, &key,
++					       ZNODE_READ_LOCK, inode);
++
++		if (result == CBK_COORD_FOUND) {
++			if (coord.between == AT_UNIT) {
++				/*coord_clear_iplug(&coord); */
++				result = zload(coord.node);
++				if (result == 0) {
++					if (item_id_by_coord(&coord) == id)
++						found = 1;
++					else
++						item_plugin_by_coord(&coord)->s.
++						    file.append_key(&coord,
++								    &key);
++					zrelse(coord.node);
++				}
++			} else
++				result = RETERR(-ENOENT);
++		}
++		done_lh(&lh);
++	} while (result == 0 && !found);
++	*offset = get_key_offset(&key);
++	return result;
++}
++
++/**
++ * tail2extent
++ * @uf_info:
++ *
++ *
++ */
++int tail2extent(struct unix_file_info *uf_info)
++{
++	int result;
++	reiser4_key key;	/* key of next byte to be moved to page */
++	char *p_data;		/* data of page */
++	unsigned page_off = 0,	/* offset within the page where to copy data */
++	    count;		/* number of bytes of item which can be
++				 * copied to page */
++	struct page *pages[TAIL2EXTENT_PAGE_NUM];
++	struct page *page;
++	int done;		/* set to 1 when all file is read */
++	char *item;
++	int i;
++	struct inode *inode;
++	int first_iteration;
++	int bytes;
++	__u64 offset;
++
++	assert("nikita-3362", ea_obtained(uf_info));
++	inode = unix_file_info_to_inode(uf_info);
++	assert("nikita-3412", !IS_RDONLY(inode));
++	assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
++	assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
++
++	offset = 0;
++	first_iteration = 1;
++	result = 0;
++	if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
++		/*
++		 * the file is marked on disk as having a conversion which did
++		 * not complete due to either a crash or some error.
++		 * Find at which offset the tail conversion stopped
++		 */
++		result = find_start(inode, FORMATTING_ID, &offset);
++		if (result == -ENOENT) {
++			/* no tail items found, everything is converted */
++			uf_info->container = UF_CONTAINER_EXTENTS;
++			complete_conversion(inode);
++			return 0;
++		} else if (result != 0)
++			/* some other error */
++			return result;
++		first_iteration = 0;
++	}
++
++	reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
++
++	/* get key of first byte of a file */
++	inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
++
++	done = 0;
++	while (done == 0) {
++		memset(pages, 0, sizeof(pages));
++		result = reserve_tail2extent_iteration(inode);
++		if (result != 0) {
++			reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
++			goto out;
++		}
++		if (first_iteration) {
++			reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
++			reiser4_update_sd(inode);
++			first_iteration = 0;
++		}
++		bytes = 0;
++		for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
++			assert("vs-598",
++			       (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
++			page = alloc_page(reiser4_ctx_gfp_mask_get());
++			if (!page) {
++				result = RETERR(-ENOMEM);
++				goto error;
++			}
++
++			page->index =
++			    (unsigned long)(get_key_offset(&key) >>
++					    PAGE_CACHE_SHIFT);
++			/*
++			 * usually one who is going to longterm lock a znode
++			 * (as find_file_item does, for instance) must not hold
++			 * locked pages. However, the tail2extent case is an
++			 * exception. Pages appearing here are not reachable by
++			 * anyone else yet, they are clean, and they do not
++			 * have jnodes attached, so keeping them locked does
++			 * not risk a deadlock
++			 */
++			assert("vs-983", !PagePrivate(page));
++			reiser4_invalidate_pages(inode->i_mapping, page->index,
++						 1, 0);
++
++			for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
++				coord_t coord;
++				lock_handle lh;
++
++				/* get next item */
++				/* FIXME: we might want to readahead here */
++				init_lh(&lh);
++				result =
++				    find_file_item_nohint(&coord, &lh, &key,
++							  ZNODE_READ_LOCK,
++							  inode);
++				if (result != CBK_COORD_FOUND) {
++					/*
++					 * an error happened, or no items of
++					 * the file were found
++					 */
++					done_lh(&lh);
++					page_cache_release(page);
++					goto error;
++				}
++
++				if (coord.between == AFTER_UNIT) {
++					/*
++					 * end of file is reached. Pad the
++					 * page with zeros
++					 */
++					done_lh(&lh);
++					done = 1;
++					p_data = kmap_atomic(page, KM_USER0);
++					memset(p_data + page_off, 0,
++					       PAGE_CACHE_SIZE - page_off);
++					kunmap_atomic(p_data, KM_USER0);
++					break;
++				}
++
++				result = zload(coord.node);
++				if (result) {
++					page_cache_release(page);
++					done_lh(&lh);
++					goto error;
++				}
++				assert("vs-856", coord.between == AT_UNIT);
++				item = ((char *)item_body_by_coord(&coord)) +
++					coord.unit_pos;
++
++				/* how many bytes to copy */
++				count =
++				    item_length_by_coord(&coord) -
++				    coord.unit_pos;
++				/* limit length of copy to end of page */
++				if (count > PAGE_CACHE_SIZE - page_off)
++					count = PAGE_CACHE_SIZE - page_off;
++
++				/*
++				 * copy the item (as much as will fit starting
++				 * from the beginning of the item) into the
++				 * page
++				 */
++				p_data = kmap_atomic(page, KM_USER0);
++				memcpy(p_data + page_off, item, count);
++				kunmap_atomic(p_data, KM_USER0);
++
++				page_off += count;
++				bytes += count;
++				set_key_offset(&key,
++					       get_key_offset(&key) + count);
++
++				zrelse(coord.node);
++				done_lh(&lh);
++			} /* end of the loop which fills one page with the
++			   * content of formatting items */
++
++			if (page_off) {
++				/* something was copied into the page */
++				pages[i] = page;
++			} else {
++				page_cache_release(page);
++				assert("vs-1648", done == 1);
++				break;
++			}
++		} /* end of the loop through the pages of one conversion
++		   * iteration */
++
++		if (i > 0) {
++			result = replace(inode, pages, i, bytes);
++			release_all_pages(pages, sizeof_array(pages));
++			if (result)
++				goto error;
++			/*
++			 * We have to drop exclusive access to avoid a deadlock
++			 * which may happen because capture_unix_file, called
++			 * by reiser4_writepages, needs to get non-exclusive
++			 * access to the file. It is safe to drop EA in the
++			 * middle of tail2extent conversion because
++			 * write_unix_file, setattr_unix_file(truncate),
++			 * mmap_unix_file, release_unix_file(extent2tail)
++			 * check whether a conversion is in progress (see the
++			 * comments before get_exclusive_access_careful()).
++			 * Other processes that acquire non-exclusive access
++			 * (read_unix_file, reiser4_writepages, etc) should
++			 * work on partially converted files.
++			 */
++			drop_exclusive_access(uf_info);
++			/* throttle the conversion
++			   FIXME-EDWARD: Pass the precise number of pages
++			   that were dirtied */
++			reiser4_throttle_write(inode, 1);
++			get_exclusive_access(uf_info);
++
++			/*
++			 * nobody is allowed to complete the conversion but
++			 * the process which started it
++			 */
++			assert("", reiser4_inode_get_flag(inode,
++							  REISER4_PART_MIXED));
++		}
++	}
++	if (result == 0) {
++		/* the file is converted to extent items */
++		reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
++		assert("vs-1697", reiser4_inode_get_flag(inode,
++							 REISER4_PART_MIXED));
++
++		uf_info->container = UF_CONTAINER_EXTENTS;
++		complete_conversion(inode);
++	} else {
++		/*
++		 * the conversion is not complete. The inode was already
++		 * marked as REISER4_PART_MIXED and the stat-data were updated
++		 * at the first iteration of the loop above.
++ */ ++ error: ++ release_all_pages(pages, sizeof_array(pages)); ++ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); ++ warning("edward-1548", "Partial conversion of %llu: %i", ++ (unsigned long long)get_inode_oid(inode), result); ++ } ++ ++ out: ++ /* this flag should be cleared, otherwise get_exclusive_access_careful() ++ will fall into infinite loop */ ++ assert("edward-1549", !reiser4_inode_get_flag(inode, ++ REISER4_PART_IN_CONV)); ++ return result; ++} ++ ++static int reserve_extent2tail_iteration(struct inode *inode) ++{ ++ reiser4_tree *tree; ++ ++ tree = reiser4_tree_by_inode(inode); ++ /* ++ * reserve blocks for (in this order): ++ * ++ * 1. removal of extent item ++ * ++ * 2. insertion of tail by insert_flow() ++ * ++ * 3. drilling to the leaf level by coord_by_key() ++ * ++ * 4. possible update of stat-data ++ */ ++ grab_space_enable(); ++ return reiser4_grab_space ++ (estimate_one_item_removal(tree) + ++ estimate_insert_flow(tree->height) + ++ 1 + estimate_one_insert_item(tree) + ++ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT); ++} ++ ++/* for every page of file: read page, cut part of extent pointing to this page, ++ put data of page tree by tail item */ ++int extent2tail(struct file * file, struct unix_file_info *uf_info) ++{ ++ int result; ++ struct inode *inode; ++ struct page *page; ++ unsigned long num_pages, i; ++ unsigned long start_page; ++ reiser4_key from; ++ reiser4_key to; ++ unsigned count; ++ __u64 offset; ++ ++ assert("nikita-3362", ea_obtained(uf_info)); ++ inode = unix_file_info_to_inode(uf_info); ++ assert("nikita-3412", !IS_RDONLY(inode)); ++ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS); ++ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)); ++ ++ offset = 0; ++ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { ++ /* ++ * file is marked on disk as there was a conversion which did ++ * not complete due to either crash or some error. Find which ++ * offset tail conversion stopped at ++ */ ++ result = find_start(inode, EXTENT_POINTER_ID, &offset); ++ if (result == -ENOENT) { ++ /* no extent found, everything is converted */ ++ uf_info->container = UF_CONTAINER_TAILS; ++ complete_conversion(inode); ++ return 0; ++ } else if (result != 0) ++ /* some other error */ ++ return result; ++ } ++ ++ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV); ++ ++ /* number of pages in the file */ ++ num_pages = ++ (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++ start_page = offset >> PAGE_CACHE_SHIFT; ++ ++ inode_file_plugin(inode)->key_by_inode(inode, offset, &from); ++ to = from; ++ ++ result = 0; ++ for (i = 0; i < num_pages; i++) { ++ __u64 start_byte; ++ ++ result = reserve_extent2tail_iteration(inode); ++ if (result != 0) ++ break; ++ if (i == 0 && offset == 0) { ++ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); ++ reiser4_update_sd(inode); ++ } ++ ++ page = read_mapping_page(inode->i_mapping, ++ (unsigned)(i + start_page), NULL); ++ if (IS_ERR(page)) { ++ result = PTR_ERR(page); ++ break; ++ } ++ ++ wait_on_page_locked(page); ++ ++ if (!PageUptodate(page)) { ++ page_cache_release(page); ++ result = RETERR(-EIO); ++ break; ++ } ++ ++ /* cut part of file we have read */ ++ start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT); ++ set_key_offset(&from, start_byte); ++ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1); ++ /* ++ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom ++ * commits during over-long truncates. 
But ++ * extent->tail conversion should be performed in one ++ * transaction. ++ */ ++ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, ++ &to, inode, 0); ++ ++ if (result) { ++ page_cache_release(page); ++ break; ++ } ++ ++ /* put page data into tree via tail_write */ ++ count = PAGE_CACHE_SIZE; ++ if ((i == (num_pages - 1)) && ++ (inode->i_size & ~PAGE_CACHE_MASK)) ++ /* last page can be incompleted */ ++ count = (inode->i_size & ~PAGE_CACHE_MASK); ++ while (count) { ++ loff_t pos = start_byte; ++ ++ assert("edward-1537", ++ file != NULL && file->f_dentry != NULL); ++ assert("edward-1538", ++ file->f_dentry->d_inode == inode); ++ ++ result = reiser4_write_tail(file, inode, ++ (char __user *)kmap(page), ++ count, &pos); ++ reiser4_free_file_fsdata(file); ++ if (result <= 0) { ++ warning("", "reiser4_write_tail failed"); ++ page_cache_release(page); ++ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); ++ return result; ++ } ++ count -= result; ++ } ++ ++ /* release page */ ++ lock_page(page); ++ /* page is already detached from jnode and mapping. */ ++ assert("vs-1086", page->mapping == NULL); ++ assert("nikita-2690", ++ (!PagePrivate(page) && jprivate(page) == 0)); ++ /* waiting for writeback completion with page lock held is ++ * perfectly valid. */ ++ wait_on_page_writeback(page); ++ reiser4_drop_page(page); ++ /* release reference taken by read_cache_page() above */ ++ page_cache_release(page); ++ ++ drop_exclusive_access(uf_info); ++ /* ++ * throttle the conversion. ++ * FIXME-EDWARD: Calculate and pass the precise number ++ * of pages that was dirtied ++ */ ++ reiser4_throttle_write(inode, 1); ++ get_exclusive_access(uf_info); ++ /* ++ * nobody is allowed to complete conversion but a process which ++ * started it ++ */ ++ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED)); ++ } ++ ++ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); ++ ++ if (i == num_pages) { ++ /* file is converted to formatted items */ ++ assert("vs-1698", reiser4_inode_get_flag(inode, ++ REISER4_PART_MIXED)); ++ assert("vs-1260", ++ inode_has_no_jnodes(reiser4_inode_data(inode))); ++ ++ uf_info->container = UF_CONTAINER_TAILS; ++ complete_conversion(inode); ++ return 0; ++ } ++ /* ++ * conversion is not complete. Inode was already marked as ++ * REISER4_PART_MIXED and stat-data were updated at the first ++ * iteration of the loop above. 
++ */ ++ warning("nikita-2282", ++ "Partial conversion of %llu: %lu of %lu: %i", ++ (unsigned long long)get_inode_oid(inode), i, ++ num_pages, result); ++ ++ /* this flag should be cleared, otherwise get_exclusive_access_careful() ++ will fall into infinite loop */ ++ assert("edward-1550", !reiser4_inode_get_flag(inode, ++ REISER4_PART_IN_CONV)); ++ return result; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file_ops.c linux-2.6.33/fs/reiser4/plugin/file_ops.c +--- linux-2.6.33.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,162 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* this file contains typical implementations for some of methods of ++ struct file_operations and of struct address_space_operations ++*/ ++ ++#include "../inode.h" ++#include "object.h" ++ ++/* file operations */ ++ ++/* implementation of vfs's llseek method of struct file_operations for ++ typical directory can be found in readdir_common.c ++*/ ++loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin); ++ ++/* implementation of vfs's readdir method of struct file_operations for ++ typical directory can be found in readdir_common.c ++*/ ++int reiser4_readdir_common(struct file *, void *dirent, filldir_t); ++ ++/** ++ * reiser4_release_dir_common - release of struct file_operations ++ * @inode: inode of released file ++ * @file: file to release ++ * ++ * Implementation of release method of struct file_operations for typical ++ * directory. All it does is freeing of reiser4 specific file data. ++*/ ++int reiser4_release_dir_common(struct inode *inode, struct file *file) ++{ ++ reiser4_context *ctx; ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ reiser4_free_file_fsdata(file); ++ reiser4_exit_context(ctx); ++ return 0; ++} ++ ++/* this is common implementation of vfs's fsync method of struct ++ file_operations ++*/ ++int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync) ++{ ++ reiser4_context *ctx; ++ int result; ++ ++ ctx = reiser4_init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* ++ * common sync method for regular files. ++ * ++ * We are trying to be smart here. Instead of committing all atoms (original ++ * solution), we scan dirty pages of this file and commit all atoms they are ++ * part of. ++ * ++ * Situation is complicated by anonymous pages: i.e., extent-less pages ++ * dirtied through mmap. Fortunately sys_fsync() first calls ++ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert ++ * all missing extents and capture anonymous pages. 
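The ordering that the comment above relies on is easiest to see from the caller's side. The following is a simplified, hedged sketch of the 2.6.33-era VFS path (editorial, not part of the patch; assumes <linux/fs.h>):

	/* sketch: roughly what sys_fsync() does before ->fsync() runs */
	static int sketch_vfs_fsync(struct file *file, int datasync)
	{
		int err;

		/* writeback first: this reaches reiser4_writepages(), which
		 * inserts missing extents and captures anonymous pages */
		err = filemap_fdatawrite(file->f_mapping);
		if (err != 0)
			return err;

		/* only then the filesystem's ->fsync() method, i.e.
		 * reiser4_sync_file_common() below, commits the atoms the
		 * file's dirty pages belong to */
		return file->f_op->fsync(file, file->f_dentry, datasync);
	}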
++ */ ++int reiser4_sync_file_common(struct file *file, ++ struct dentry *dentry, int datasync) ++{ ++ reiser4_context *ctx; ++ txn_atom *atom; ++ reiser4_block_nr reserve; ++ ++ ctx = reiser4_init_context(dentry->d_inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ reserve = estimate_update_common(dentry->d_inode); ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOSPC); ++ } ++ write_sd_by_inode_common(dentry->d_inode); ++ ++ atom = get_current_atom_locked(); ++ spin_lock_txnh(ctx->trans); ++ force_commit_atom(ctx->trans); ++ reiser4_exit_context(ctx); ++ return 0; ++} ++ ++ ++/* address space operations */ ++ ++ ++/* this is helper for plugin->write_begin() */ ++int do_prepare_write(struct file *file, struct page *page, unsigned from, ++ unsigned to) ++{ ++ int result; ++ file_plugin *fplug; ++ struct inode *inode; ++ ++ assert("umka-3099", file != NULL); ++ assert("umka-3100", page != NULL); ++ assert("umka-3095", PageLocked(page)); ++ ++ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page)) ++ return 0; ++ ++ inode = page->mapping->host; ++ fplug = inode_file_plugin(inode); ++ ++ if (page->mapping->a_ops->readpage == NULL) ++ return RETERR(-EINVAL); ++ ++ result = page->mapping->a_ops->readpage(file, page); ++ if (result != 0) { ++ SetPageError(page); ++ ClearPageUptodate(page); ++ /* All reiser4 readpage() implementations should return the ++ * page locked in case of error. */ ++ assert("nikita-3472", PageLocked(page)); ++ } else { ++ /* ++ * ->readpage() either: ++ * ++ * 1. starts IO against @page. @page is locked for IO in ++ * this case. ++ * ++ * 2. doesn't start IO. @page is unlocked. ++ * ++ * In either case, page should be locked. ++ */ ++ lock_page(page); ++ /* ++ * IO (if any) is completed at this point. Check for IO ++ * errors. ++ */ ++ if (!PageUptodate(page)) ++ result = RETERR(-EIO); ++ } ++ assert("umka-3098", PageLocked(page)); ++ return result; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.33/fs/reiser4/plugin/file_ops_readdir.c +--- linux-2.6.33.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file_ops_readdir.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,658 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "../inode.h" ++ ++/* return true, iff @coord points to the valid directory item that is part of ++ * @inode directory. */ ++static int is_valid_dir_coord(struct inode *inode, coord_t *coord) ++{ ++ return plugin_of_group(item_plugin_by_coord(coord), ++ DIR_ENTRY_ITEM_TYPE) && ++ inode_file_plugin(inode)->owns_item(inode, coord); ++} ++ ++/* compare two logical positions within the same directory */ ++static cmp_t dir_pos_cmp(const struct dir_pos *p1, const struct dir_pos *p2) ++{ ++ cmp_t result; ++ ++ assert("nikita-2534", p1 != NULL); ++ assert("nikita-2535", p2 != NULL); ++ ++ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key); ++ if (result == EQUAL_TO) { ++ int diff; ++ ++ diff = p1->pos - p2->pos; ++ result = ++ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO); ++ } ++ return result; ++} ++ ++/* see comment before reiser4_readdir_common() for overview of why "adjustment" ++ * is necessary. 
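A toy model of the adjustment rule may help before reading adjust_dir_pos() itself. This is an editorial simplification (the real function below also tracks sequences of duplicate keys); it is self-contained, runnable C:

	#include <assert.h>

	/* toy model: keep a readdir cursor stable across one directory
	 * modification. adj is +1 for an insertion, -1 for a removal;
	 * `w` says where the modification happened relative to the cursor
	 * in key order. */
	enum where { BEFORE_CURSOR, AT_CURSOR, AFTER_CURSOR };

	static void toy_adjust(unsigned long long *entry_no, enum where w, int adj)
	{
		if (*entry_no == 0)
			return;			/* cursor at the start: nothing to fix */
		if (w == BEFORE_CURSOR)
			*entry_no += adj;	/* shift along with the entries */
		else if (w == AT_CURSOR)
			*entry_no = 0;		/* entry under cursor removed: rewind */
		/* AFTER_CURSOR: the readdir position is unaffected */
	}

	int main(void)
	{
		unsigned long long pos = 7;

		toy_adjust(&pos, BEFORE_CURSOR, -1);	/* a name before us was removed */
		assert(pos == 6);
		toy_adjust(&pos, AFTER_CURSOR, +1);	/* a name after us was added */
		assert(pos == 6);
		return 0;
	}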
*/ ++static void ++adjust_dir_pos(struct file *dir, struct readdir_pos *readdir_spot, ++ const struct dir_pos *mod_point, int adj) ++{ ++ struct dir_pos *pos; ++ ++ /* ++ * new directory entry was added (adj == +1) or removed (adj == -1) at ++ * the @mod_point. Directory file descriptor @dir is doing readdir and ++ * is currently positioned at @readdir_spot. Latter has to be updated ++ * to maintain stable readdir. ++ */ ++ /* directory is positioned to the beginning. */ ++ if (readdir_spot->entry_no == 0) ++ return; ++ ++ pos = &readdir_spot->position; ++ switch (dir_pos_cmp(mod_point, pos)) { ++ case LESS_THAN: ++ /* @mod_pos is _before_ @readdir_spot, that is, entry was ++ * added/removed on the left (in key order) of current ++ * position. */ ++ /* logical number of directory entry readdir is "looking" at ++ * changes */ ++ readdir_spot->entry_no += adj; ++ assert("nikita-2577", ++ ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0)); ++ if (de_id_cmp(&pos->dir_entry_key, ++ &mod_point->dir_entry_key) == EQUAL_TO) { ++ assert("nikita-2575", mod_point->pos < pos->pos); ++ /* ++ * if entry added/removed has the same key as current ++ * for readdir, update counter of duplicate keys in ++ * @readdir_spot. ++ */ ++ pos->pos += adj; ++ } ++ break; ++ case GREATER_THAN: ++ /* directory is modified after @pos: nothing to do. */ ++ break; ++ case EQUAL_TO: ++ /* cannot insert an entry readdir is looking at, because it ++ already exists. */ ++ assert("nikita-2576", adj < 0); ++ /* directory entry to which @pos points to is being ++ removed. ++ ++ NOTE-NIKITA: Right thing to do is to update @pos to point ++ to the next entry. This is complex (we are under spin-lock ++ for one thing). Just rewind it to the beginning. Next ++ readdir will have to scan the beginning of ++ directory. Proper solution is to use semaphore in ++ spin lock's stead and use rewind_right() here. ++ ++ NOTE-NIKITA: now, semaphore is used, so... ++ */ ++ memset(readdir_spot, 0, sizeof *readdir_spot); ++ } ++} ++ ++/* scan all file-descriptors for this directory and adjust their ++ positions respectively. Should be used by implementations of ++ add_entry and rem_entry of dir plugin */ ++void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de, ++ int offset, int adj) ++{ ++ reiser4_file_fsdata *scan; ++ struct dir_pos mod_point; ++ ++ assert("nikita-2536", dir != NULL); ++ assert("nikita-2538", de != NULL); ++ assert("nikita-2539", adj != 0); ++ ++ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key); ++ mod_point.pos = offset; ++ ++ spin_lock_inode(dir); ++ ++ /* ++ * new entry was added/removed in directory @dir. Scan all file ++ * descriptors for @dir that are currently involved into @readdir and ++ * update them. ++ */ ++ ++ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage) ++ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj); ++ ++ spin_unlock_inode(dir); ++} ++ ++/* ++ * traverse tree to start/continue readdir from the readdir position @pos. 
++ */ ++static int dir_go_to(struct file *dir, struct readdir_pos *pos, tap_t *tap) ++{ ++ reiser4_key key; ++ int result; ++ struct inode *inode; ++ ++ assert("nikita-2554", pos != NULL); ++ ++ inode = dir->f_dentry->d_inode; ++ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key); ++ if (result != 0) ++ return result; ++ result = reiser4_object_lookup(inode, ++ &key, ++ tap->coord, ++ tap->lh, ++ tap->mode, ++ FIND_EXACT, ++ LEAF_LEVEL, LEAF_LEVEL, ++ 0, &tap->ra_info); ++ if (result == CBK_COORD_FOUND) ++ result = rewind_right(tap, (int)pos->position.pos); ++ else { ++ tap->coord->node = NULL; ++ done_lh(tap->lh); ++ result = RETERR(-EIO); ++ } ++ return result; ++} ++ ++/* ++ * handling of non-unique keys: calculate at what ordinal position within ++ * sequence of directory items with identical keys @pos is. ++ */ ++static int set_pos(struct inode *inode, struct readdir_pos *pos, tap_t *tap) ++{ ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ tap_t scan; ++ de_id *did; ++ reiser4_key de_key; ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK); ++ reiser4_tap_copy(&scan, tap); ++ reiser4_tap_load(&scan); ++ pos->position.pos = 0; ++ ++ did = &pos->position.dir_entry_key; ++ ++ if (is_valid_dir_coord(inode, scan.coord)) { ++ ++ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did); ++ ++ while (1) { ++ ++ result = go_prev_unit(&scan); ++ if (result != 0) ++ break; ++ ++ if (!is_valid_dir_coord(inode, scan.coord)) { ++ result = -EINVAL; ++ break; ++ } ++ ++ /* get key of directory entry */ ++ unit_key_by_coord(scan.coord, &de_key); ++ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) { ++ /* duplicate-sequence is over */ ++ break; ++ } ++ pos->position.pos++; ++ } ++ } else ++ result = RETERR(-ENOENT); ++ reiser4_tap_relse(&scan); ++ reiser4_tap_done(&scan); ++ return result; ++} ++ ++/* ++ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly. ++ */ ++static int dir_rewind(struct file *dir, struct readdir_pos *pos, tap_t *tap) ++{ ++ __u64 destination; ++ __s64 shift; ++ int result; ++ struct inode *inode; ++ loff_t dirpos; ++ ++ assert("nikita-2553", dir != NULL); ++ assert("nikita-2548", pos != NULL); ++ assert("nikita-2551", tap->coord != NULL); ++ assert("nikita-2552", tap->lh != NULL); ++ ++ dirpos = reiser4_get_dir_fpos(dir); ++ shift = dirpos - pos->fpos; ++ /* this is logical directory entry within @dir which we are rewinding ++ * to */ ++ destination = pos->entry_no + shift; ++ ++ inode = dir->f_dentry->d_inode; ++ if (dirpos < 0) ++ return RETERR(-EINVAL); ++ else if (destination == 0ll || dirpos == 0) { ++ /* rewind to the beginning of directory */ ++ memset(pos, 0, sizeof *pos); ++ return dir_go_to(dir, pos, tap); ++ } else if (destination >= inode->i_size) ++ return RETERR(-ENOENT); ++ ++ if (shift < 0) { ++ /* I am afraid of negative numbers */ ++ shift = -shift; ++ /* rewinding to the left */ ++ if (shift <= (int)pos->position.pos) { ++ /* destination is within sequence of entries with ++ duplicate keys. */ ++ result = dir_go_to(dir, pos, tap); ++ } else { ++ shift -= pos->position.pos; ++ while (1) { ++ /* repetitions: deadlock is possible when ++ going to the left. 
*/ ++ result = dir_go_to(dir, pos, tap); ++ if (result == 0) { ++ result = rewind_left(tap, shift); ++ if (result == -E_DEADLOCK) { ++ reiser4_tap_done(tap); ++ continue; ++ } ++ } ++ break; ++ } ++ } ++ } else { ++ /* rewinding to the right */ ++ result = dir_go_to(dir, pos, tap); ++ if (result == 0) ++ result = rewind_right(tap, shift); ++ } ++ if (result == 0) { ++ result = set_pos(inode, pos, tap); ++ if (result == 0) { ++ /* update pos->position.pos */ ++ pos->entry_no = destination; ++ pos->fpos = dirpos; ++ } ++ } ++ return result; ++} ++ ++/* ++ * Function that is called by common_readdir() on each directory entry while ++ * doing readdir. ->filldir callback may block, so we had to release long term ++ * lock while calling it. To avoid repeating tree traversal, seal is used. If ++ * seal is broken, we return -E_REPEAT. Node is unlocked in this case. ++ * ++ * Whether node is unlocked in case of any other error is undefined. It is ++ * guaranteed to be still locked if success (0) is returned. ++ * ++ * When ->filldir() wants no more, feed_entry() returns 1, and node is ++ * unlocked. ++ */ ++static int ++feed_entry(struct file *f, struct readdir_pos *pos, tap_t *tap, ++ filldir_t filldir, void *dirent) ++{ ++ item_plugin *iplug; ++ char *name; ++ reiser4_key sd_key; ++ int result; ++ char buf[DE_NAME_BUF_LEN]; ++ char name_buf[32]; ++ char *local_name; ++ unsigned file_type; ++ seal_t seal; ++ coord_t *coord; ++ reiser4_key entry_key; ++ ++ coord = tap->coord; ++ iplug = item_plugin_by_coord(coord); ++ ++ /* pointer to name within the node */ ++ name = iplug->s.dir.extract_name(coord, buf); ++ assert("nikita-1371", name != NULL); ++ ++ /* key of object the entry points to */ ++ if (iplug->s.dir.extract_key(coord, &sd_key) != 0) ++ return RETERR(-EIO); ++ ++ /* we must release longterm znode lock before calling filldir to avoid ++ deadlock which may happen if filldir causes page fault. So, copy ++ name to intermediate buffer */ ++ if (strlen(name) + 1 > sizeof(name_buf)) { ++ local_name = kmalloc(strlen(name) + 1, ++ reiser4_ctx_gfp_mask_get()); ++ if (local_name == NULL) ++ return RETERR(-ENOMEM); ++ } else ++ local_name = name_buf; ++ ++ strcpy(local_name, name); ++ file_type = iplug->s.dir.extract_file_type(coord); ++ ++ unit_key_by_coord(coord, &entry_key); ++ reiser4_seal_init(&seal, coord, &entry_key); ++ ++ longterm_unlock_znode(tap->lh); ++ ++ /* ++ * send information about directory entry to the ->filldir() filler ++ * supplied to us by caller (VFS). ++ * ++ * ->filldir is entitled to do weird things. For example, ->filldir ++ * supplied by knfsd re-enters file system. Make sure no locks are ++ * held. ++ */ ++ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack())); ++ ++ reiser4_txn_restart_current(); ++ result = filldir(dirent, name, (int)strlen(name), ++ /* offset of this entry */ ++ f->f_pos, ++ /* inode number of object bounden by this entry */ ++ oid_to_uino(get_key_objectid(&sd_key)), file_type); ++ if (local_name != name_buf) ++ kfree(local_name); ++ if (result < 0) ++ /* ->filldir() is satisfied. 
(no space in buffer, IOW) */
++		result = 1;
++	else
++		result = reiser4_seal_validate(&seal, coord, &entry_key,
++					       tap->lh, tap->mode,
++					       ZNODE_LOCK_HIPRI);
++	return result;
++}
++
++static void move_entry(struct readdir_pos *pos, coord_t *coord)
++{
++	reiser4_key de_key;
++	de_id *did;
++
++	/* update @pos */
++	++pos->entry_no;
++	did = &pos->position.dir_entry_key;
++
++	/* get key of directory entry */
++	unit_key_by_coord(coord, &de_key);
++
++	if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
++		/* we are within a sequence of directory entries
++		   with duplicate keys. */
++		++pos->position.pos;
++	else {
++		pos->position.pos = 0;
++		build_de_id_by_key(&de_key, did);
++	}
++	++pos->fpos;
++}
++
++/*
++ * STATELESS READDIR
++ *
++ * readdir support in reiser4 relies on the ability to update readdir_pos
++ * embedded into reiser4_file_fsdata on each directory modification (name
++ * insertion and removal), see the reiser4_readdir_common() function below.
++ * This obviously doesn't work when reiser4 is accessed over NFS, because NFS
++ * doesn't keep any state across client READDIR requests for the same
++ * directory.
++ *
++ * To address this we maintain a "pool" of detached reiser4_file_fsdata
++ * (d_cursor). Whenever an NFS readdir request comes, we detect this, and try
++ * to find the detached reiser4_file_fsdata corresponding to the previous
++ * readdir request. In other words, additional state is maintained on the
++ * server. (This is somewhat contrary to the design goals of the NFS
++ * protocol.)
++ *
++ * To efficiently detect when our ->readdir() method is called by the NFS
++ * server, the dentry is marked as "stateless" in reiser4_decode_fh() (this
++ * is checked by the file_is_stateless() function).
++ *
++ * To find the d_cursor in the pool, we encode the client id (cid) in the
++ * highest bits of the NFS readdir cookie: when the first readdir request
++ * comes to the given directory from the given client, the cookie is set to
++ * 0. This situation is detected, the global cid_counter is incremented and
++ * stored in the highest bits of all direntry offsets returned to the client,
++ * including the last one. As the only valid readdir cookie is one obtained
++ * as direntry->offset, we are guaranteed that the next readdir request
++ * (continuing the current one) will have the current cid in the highest bits
++ * of its starting readdir cookie. All d_cursors are hashed into a
++ * per-super-block hash table by the (oid, cid) key.
++ *
++ * In addition, d_cursors are placed into a per-super-block radix tree where
++ * they are keyed by oid alone. This is necessary to efficiently remove them
++ * during rmdir.
++ *
++ * Finally, currently unused d_cursors are linked into a special list. This
++ * list is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
++ *
++ */
++
++/*
++ * prepare for readdir.
++ */
++static int dir_readdir_init(struct file *f, tap_t *tap,
++			    struct readdir_pos **pos)
++{
++	struct inode *inode;
++	reiser4_file_fsdata *fsdata;
++	int result;
++
++	assert("nikita-1359", f != NULL);
++	inode = f->f_dentry->d_inode;
++	assert("nikita-1360", inode != NULL);
++
++	if (!S_ISDIR(inode->i_mode))
++		return RETERR(-ENOTDIR);
++
++	/* try to find detached readdir state */
++	result = reiser4_attach_fsdata(f, inode);
++	if (result != 0)
++		return result;
++
++	fsdata = reiser4_get_file_fsdata(f);
++	assert("nikita-2571", fsdata != NULL);
++	if (IS_ERR(fsdata))
++		return PTR_ERR(fsdata);
++
++	/* add the file descriptor to the readdir list hanging off the
++	 * directory inode.
++	 * This list is used to scan "readdirs-in-progress" while
++	 * inserting or removing names in the directory. */
++	spin_lock_inode(inode);
++	if (list_empty_careful(&fsdata->dir.linkage))
++		list_add(&fsdata->dir.linkage, get_readdir_list(inode));
++	*pos = &fsdata->dir.readdir;
++	spin_unlock_inode(inode);
++
++	/* move @tap to the current position */
++	return dir_rewind(f, *pos, tap);
++}
++
++/* this is the implementation of vfs's llseek method of struct
++   file_operations for a typical directory.
++   See the comment before reiser4_readdir_common() for an explanation.
++*/
++loff_t reiser4_llseek_dir_common(struct file *file, loff_t off, int origin)
++{
++	reiser4_context *ctx;
++	loff_t result;
++	struct inode *inode;
++
++	inode = file->f_dentry->d_inode;
++
++	ctx = reiser4_init_context(inode->i_sb);
++	if (IS_ERR(ctx))
++		return PTR_ERR(ctx);
++
++	mutex_lock(&inode->i_mutex);
++
++	/* update ->f_pos */
++	result = default_llseek(file, off, origin);
++	if (result >= 0) {
++		int ff;
++		coord_t coord;
++		lock_handle lh;
++		tap_t tap;
++		struct readdir_pos *pos;
++
++		coord_init_zero(&coord);
++		init_lh(&lh);
++		reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
++
++		ff = dir_readdir_init(file, &tap, &pos);
++		reiser4_detach_fsdata(file);
++		if (ff != 0)
++			result = (loff_t) ff;
++		reiser4_tap_done(&tap);
++	}
++	reiser4_detach_fsdata(file);
++	mutex_unlock(&inode->i_mutex);
++
++	reiser4_exit_context(ctx);
++	return result;
++}
++
++/* this is the common implementation of vfs's readdir method of struct
++   file_operations
++
++   readdir problems:
++
++   The readdir(2)/getdents(2) interface is based on the implicit assumption
++   that readdir can be restarted from any particular point by supplying the
++   file system with an off_t-full of data. That is, the file system fills
++   the ->d_off field in struct dirent and later the user passes ->d_off to
++   seekdir(3), which is actually implemented by glibc as lseek(2) on the
++   directory.
++
++   Reiser4 cannot restart readdir from 64 bits of data, because the two last
++   components of the key of a directory entry are unknown, which leaves 128
++   bits: the locality and type fields in the key of a directory entry are
++   always known; to start readdir() from a given point, the objectid and
++   offset fields have to be filled in.
++
++   The traditional UNIX API for scanning through a directory
++   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
++   the assumption that a directory is structured very much like a regular
++   file; in particular, it is implied that each name within a given
++   directory (directory entry) can be uniquely identified by a scalar
++   offset, and that such an offset is stable across the life-time of the
++   name it identifies.
++
++   This is manifestly not so for reiser4. In reiser4 the only stable unique
++   identifier of a directory entry is its key, which doesn't fit into the
++   seekdir/telldir API.
++
++   solution:
++
++   Within each file descriptor participating in readdir-ing of a directory,
++   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
++   the "current" directory entry that the file descriptor looks at. It
++   contains a key of the directory entry (plus some additional info to deal
++   with non-unique keys that we won't dwell on here) and the logical
++   position of this directory entry starting from the beginning of the
++   directory, that is, the ordinal number of this entry in the readdir
++   order.
++
++   Obviously this logical position is not stable in the face of directory
++   modifications.
To work around this, on each addition or removal of directory ++ entry all file descriptors for directory inode are scanned and their ++ readdir_pos are updated accordingly (adjust_dir_pos()). ++*/ ++int reiser4_readdir_common(struct file *f /* directory file being read */, ++ void *dirent /* opaque data passed to us by VFS */, ++ filldir_t filld /* filler function passed to us ++ * by VFS */) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *inode; ++ coord_t coord; ++ lock_handle lh; ++ tap_t tap; ++ struct readdir_pos *pos; ++ ++ assert("nikita-1359", f != NULL); ++ inode = f->f_dentry->d_inode; ++ assert("nikita-1360", inode != NULL); ++ ++ if (!S_ISDIR(inode->i_mode)) ++ return RETERR(-ENOTDIR); ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); ++ ++ reiser4_readdir_readahead_init(inode, &tap); ++ ++repeat: ++ result = dir_readdir_init(f, &tap, &pos); ++ if (result == 0) { ++ result = reiser4_tap_load(&tap); ++ /* scan entries one by one feeding them to @filld */ ++ while (result == 0) { ++ coord_t *coord; ++ ++ coord = tap.coord; ++ assert("nikita-2572", coord_is_existing_unit(coord)); ++ assert("nikita-3227", is_valid_dir_coord(inode, coord)); ++ ++ result = feed_entry(f, pos, &tap, filld, dirent); ++ if (result > 0) { ++ break; ++ } else if (result == 0) { ++ ++f->f_pos; ++ result = go_next_unit(&tap); ++ if (result == -E_NO_NEIGHBOR || ++ result == -ENOENT) { ++ result = 0; ++ break; ++ } else if (result == 0) { ++ if (is_valid_dir_coord(inode, coord)) ++ move_entry(pos, coord); ++ else ++ break; ++ } ++ } else if (result == -E_REPEAT) { ++ /* feed_entry() had to restart. */ ++ ++f->f_pos; ++ reiser4_tap_relse(&tap); ++ goto repeat; ++ } else ++ warning("vs-1617", ++ "reiser4_readdir_common: unexpected error %d", ++ result); ++ } ++ reiser4_tap_relse(&tap); ++ ++ if (result >= 0) ++ f->f_version = inode->i_version; ++ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) ++ result = 0; ++ reiser4_tap_done(&tap); ++ reiser4_detach_fsdata(f); ++ ++ /* try to update directory's atime */ ++ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode), ++ BA_CAN_COMMIT) != 0) ++ warning("", "failed to update atime on readdir: %llu", ++ get_inode_oid(inode)); ++ else ++ file_accessed(f); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ ++ return (result <= 0) ? 
result : 0; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.33/fs/reiser4/plugin/file_plugin_common.c +--- linux-2.6.33.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/file_plugin_common.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1008 @@ ++/* Copyright 2005 by Hans Reiser, licensing governed by ++ reiser4/README */ ++ ++/* this file contains typical implementations for most of methods of ++ file plugin ++*/ ++ ++#include "../inode.h" ++#include "object.h" ++#include "../safe_link.h" ++ ++#include <linux/quotaops.h> ++ ++static int insert_new_sd(struct inode *inode); ++static int update_sd(struct inode *inode); ++ ++/* this is common implementation of write_sd_by_inode method of file plugin ++ either insert stat data or update it ++ */ ++int write_sd_by_inode_common(struct inode *inode/* object to save */) ++{ ++ int result; ++ ++ assert("nikita-730", inode != NULL); ++ ++ if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) ++ /* object doesn't have stat-data yet */ ++ result = insert_new_sd(inode); ++ else ++ result = update_sd(inode); ++ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM) ++ /* Don't issue warnings about "name is too long" */ ++ warning("nikita-2221", "Failed to save sd for %llu: %i", ++ (unsigned long long)get_inode_oid(inode), result); ++ return result; ++} ++ ++/* this is common implementation of key_by_inode method of file plugin ++ */ ++int ++key_by_inode_and_offset_common(struct inode *inode, loff_t off, ++ reiser4_key * key) ++{ ++ reiser4_key_init(key); ++ set_key_locality(key, reiser4_inode_data(inode)->locality_id); ++ set_key_ordering(key, get_inode_ordering(inode)); ++ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */ ++ set_key_type(key, KEY_BODY_MINOR); ++ set_key_offset(key, (__u64) off); ++ return 0; ++} ++ ++/* this is common implementation of set_plug_in_inode method of file plugin ++ */ ++int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ , ++ struct inode *parent /* parent object */ , ++ reiser4_object_create_data * data /* creational ++ * data */ ) ++{ ++ __u64 mask; ++ ++ object->i_mode = data->mode; ++ /* this should be plugin decision */ ++ object->i_uid = current->cred->fsuid; ++ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME; ++ ++ /* support for BSD style group-id assignment. 
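
[Editor's aside: the group-id rule that this comment and the code just below implement condenses to a three-way choice. The helper is hypothetical and assumes that REISER4_BSD_GID mirrors the ext2 bsdgroups semantics referenced in the comment.]

/* hypothetical condensation of the gid rule implemented below */
static gid_t example_new_object_gid(const struct inode *parent,
				    int bsdgroups, gid_t fsgid)
{
	if (bsdgroups)
		return parent->i_gid;	/* bsdgroups: always inherit */
	if (parent->i_mode & S_ISGID)
		return parent->i_gid;	/* setgid directory: inherit group */
	return fsgid;			/* default: creator's fsgid */
}
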
See mount's manual page ++ description of bsdgroups ext2 mount options for more details */ ++ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID)) ++ object->i_gid = parent->i_gid; ++ else if (parent->i_mode & S_ISGID) { ++ /* parent directory has sguid bit */ ++ object->i_gid = parent->i_gid; ++ if (S_ISDIR(object->i_mode)) ++ /* sguid is inherited by sub-directories */ ++ object->i_mode |= S_ISGID; ++ } else ++ object->i_gid = current->cred->fsgid; ++ ++ /* this object doesn't have stat-data yet */ ++ reiser4_inode_set_flag(object, REISER4_NO_SD); ++#if 0 ++ /* this is now called after all inode plugins are initialized: ++ do_create_vfs_child after adjust_to_parent */ ++ /* setup inode and file-operations for this inode */ ++ setup_inode_ops(object, data); ++#endif ++ object->i_nlink = 0; ++ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL); ++ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT); ++ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES)) ++ mask |= (1 << LARGE_TIMES_STAT); ++ ++ reiser4_inode_data(object)->extmask = mask; ++ return 0; ++} ++ ++/* this is common implementation of adjust_to_parent method of file plugin for ++ regular files ++ */ ++int adjust_to_parent_common(struct inode *object /* new object */ , ++ struct inode *parent /* parent directory */ , ++ struct inode *root/* root directory */) ++{ ++ assert("nikita-2165", object != NULL); ++ if (parent == NULL) ++ parent = root; ++ assert("nikita-2069", parent != NULL); ++ ++ /* ++ * inherit missing plugins from parent ++ */ ++ ++ grab_plugin_pset(object, parent, PSET_FILE); ++ grab_plugin_pset(object, parent, PSET_SD); ++ grab_plugin_pset(object, parent, PSET_FORMATTING); ++ grab_plugin_pset(object, parent, PSET_PERM); ++ return 0; ++} ++ ++/* this is common implementation of adjust_to_parent method of file plugin for ++ typical directories ++ */ ++int adjust_to_parent_common_dir(struct inode *object /* new object */ , ++ struct inode *parent /* parent directory */ , ++ struct inode *root/* root directory */) ++{ ++ int result = 0; ++ pset_member memb; ++ ++ assert("nikita-2166", object != NULL); ++ if (parent == NULL) ++ parent = root; ++ assert("nikita-2167", parent != NULL); ++ ++ /* ++ * inherit missing plugins from parent ++ */ ++ for (memb = 0; memb < PSET_LAST; ++memb) { ++ result = grab_plugin_pset(object, parent, memb); ++ if (result != 0) ++ break; ++ } ++ return result; ++} ++ ++int adjust_to_parent_cryptcompress(struct inode *object /* new object */ , ++ struct inode *parent /* parent directory */, ++ struct inode *root/* root directory */) ++{ ++ int result; ++ result = adjust_to_parent_common(object, parent, root); ++ if (result) ++ return result; ++ assert("edward-1416", parent != NULL); ++ ++ grab_plugin_pset(object, parent, PSET_CLUSTER); ++ grab_plugin_pset(object, parent, PSET_CIPHER); ++ grab_plugin_pset(object, parent, PSET_DIGEST); ++ grab_plugin_pset(object, parent, PSET_COMPRESSION); ++ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE); ++ ++ return 0; ++} ++ ++/* this is common implementation of create_object method of file plugin ++ */ ++int reiser4_create_object_common(struct inode *object, struct inode *parent, ++ reiser4_object_create_data * data) ++{ ++ reiser4_block_nr reserve; ++ assert("nikita-744", object != NULL); ++ assert("nikita-745", parent != NULL); ++ assert("nikita-747", data != NULL); ++ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD)); ++ ++ reserve = estimate_create_common(object); ++ if (reiser4_grab_space(reserve, 
BA_CAN_COMMIT))
++		return RETERR(-ENOSPC);
++	return write_sd_by_inode_common(object);
++}
++
++static int common_object_delete_no_reserve(struct inode *inode);
++
++/**
++ * reiser4_delete_object_common - delete_object of file_plugin
++ * @inode: inode to be deleted
++ *
++ * This is common implementation of delete_object method of file_plugin. It
++ * applies to objects whose deletion consists of removing two items: stat
++ * data and safe-link.
++ */
++int reiser4_delete_object_common(struct inode *inode)
++{
++	int result;
++
++	assert("nikita-1477", inode != NULL);
++	/* FIXME: if file body deletion failed (i/o error, for instance),
++	   inode->i_size can be != 0 here */
++	assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
++	assert("nikita-3421", inode->i_nlink == 0);
++
++	if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
++		reiser4_block_nr reserve;
++
++		/* grab space which is needed to remove 2 items from the tree:
++		   stat data and safe-link */
++		reserve = 2 *
++		    estimate_one_item_removal(reiser4_tree_by_inode(inode));
++		if (reiser4_grab_space_force(reserve,
++					     BA_RESERVED | BA_CAN_COMMIT))
++			return RETERR(-ENOSPC);
++		result = common_object_delete_no_reserve(inode);
++	} else
++		result = 0;
++	return result;
++}
++
++/**
++ * reiser4_delete_dir_common - delete_object of file_plugin
++ * @inode: inode to be deleted
++ *
++ * This is common implementation of delete_object method of file_plugin for
++ * typical directory. It calls done method of dir_plugin to remove "." and
++ * removes stat data and safe-link.
++ */
++int reiser4_delete_dir_common(struct inode *inode)
++{
++	int result;
++	dir_plugin *dplug;
++
++	assert("", (get_current_context() &&
++		    get_current_context()->trans->atom == NULL));
++
++	dplug = inode_dir_plugin(inode);
++	assert("vs-1101", dplug && dplug->done);
++
++	/* kill cursors which might be attached to inode */
++	reiser4_kill_cursors(inode);
++
++	/* grab space enough for removing two items */
++	if (reiser4_grab_space
++	    (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
++	     BA_RESERVED | BA_CAN_COMMIT))
++		return RETERR(-ENOSPC);
++
++	result = dplug->done(inode);
++	if (!result)
++		result = common_object_delete_no_reserve(inode);
++	return result;
++}
++
++/* this is common implementation of add_link method of file plugin
++ */
++int reiser4_add_link_common(struct inode *object, struct inode *parent)
++{
++	/*
++	 * increment ->i_nlink and update ->i_ctime
++	 */
++
++	INODE_INC_FIELD(object, i_nlink);
++	object->i_ctime = CURRENT_TIME;
++	return 0;
++}
++
++/* this is common implementation of rem_link method of file plugin
++ */
++int reiser4_rem_link_common(struct inode *object, struct inode *parent)
++{
++	assert("nikita-2021", object != NULL);
++	assert("nikita-2163", object->i_nlink > 0);
++
++	/*
++	 * decrement ->i_nlink and update ->i_ctime
++	 */
++
++	INODE_DEC_FIELD(object, i_nlink);
++	object->i_ctime = CURRENT_TIME;
++	return 0;
++}
++
++/* this is common implementation of rem_link method of file plugin for typical
++   directory
++*/
++int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
++{
++	assert("nikita-20211", object != NULL);
++	assert("nikita-21631", object->i_nlink > 0);
++
++	/*
++	 * decrement ->i_nlink and update ->i_ctime
++	 */
++	INODE_DEC_FIELD(object, i_nlink);
++	if (object->i_nlink == 1)
++		INODE_DEC_FIELD(object, i_nlink);
++	object->i_ctime = CURRENT_TIME;
++	return 0;
++}
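
[A brief worked example of the double decrement in rem_link_common_dir() above (editor's note; the helper function is hypothetical): an empty directory has i_nlink == 2, one reference for its name and one for its own "." entry, so removing the name must drop both.]

/* link count of an empty directory across rmdir-style name removal */
static void example_dir_nlink(void)
{
	unsigned int nlink = 2;	/* "." plus the directory's own name */

	--nlink;		/* name removed: nlink == 1 */
	if (nlink == 1)
		--nlink;	/* only "." remains and dies too: nlink == 0 */
}
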
++/* this is common implementation of owns_item method of file plugin:
++   compare objectids of keys in inode and coord */
++int owns_item_common(const struct inode *inode,	/* object to check
++							 * against */
++		     const coord_t *coord/* coord to check */)
++{
++	reiser4_key item_key;
++	reiser4_key file_key;
++
++	assert("nikita-760", inode != NULL);
++	assert("nikita-761", coord != NULL);
++
++	return coord_is_existing_item(coord) &&
++	    (get_key_objectid(build_sd_key(inode, &file_key)) ==
++	     get_key_objectid(item_key_by_coord(coord, &item_key)));
++}
++
++/* this is common implementation of owns_item method of file plugin
++   for typical directory
++*/
++int owns_item_common_dir(const struct inode *inode,/* object to check against */
++			 const coord_t *coord/* coord of item to check */)
++{
++	reiser4_key item_key;
++
++	assert("nikita-1335", inode != NULL);
++	assert("nikita-1334", coord != NULL);
++
++	if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
++		return get_key_locality(item_key_by_coord(coord, &item_key)) ==
++		    get_inode_oid(inode);
++	else
++		return owns_item_common(inode, coord);
++}
++
++/* this is common implementation of can_add_link method of file plugin;
++   checks whether yet another hard link to this object can be added
++*/
++int can_add_link_common(const struct inode *object/* object to check */)
++{
++	assert("nikita-732", object != NULL);
++
++	/* inode->i_nlink is unsigned int, so just check for integer
++	   overflow */
++	return object->i_nlink + 1 != 0;
++}
++
++/* this is common implementation of can_rem_link method of file plugin for
++   typical directory
++*/
++int can_rem_link_common_dir(const struct inode *inode)
++{
++	/* is_dir_empty() returns 0 if dir is empty */
++	return !is_dir_empty(inode);
++}
++
++/* this is common implementation of detach method of file plugin for typical
++   directory
++*/
++int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
++{
++	dir_plugin *dplug;
++
++	dplug = inode_dir_plugin(child);
++	assert("nikita-2883", dplug != NULL);
++	assert("nikita-2884", dplug->detach != NULL);
++	return dplug->detach(child, parent);
++}
++
++/* this is common implementation of bind method of file plugin for typical
++   directory
++*/
++int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
++{
++	dir_plugin *dplug;
++
++	dplug = inode_dir_plugin(child);
++	assert("nikita-2646", dplug != NULL);
++	return dplug->attach(child, parent);
++}
++
++static int process_truncate(struct inode *, __u64 size);
++
++/* this is common implementation of safelink method of file plugin
++ */
++int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
++{
++	int result;
++
++	assert("vs-1705", get_current_context()->trans->atom == NULL);
++	if (link == SAFE_UNLINK)
++		/* nothing to do.
iput() in the caller (process_safelink) will ++ * finish with file */ ++ result = 0; ++ else if (link == SAFE_TRUNCATE) ++ result = process_truncate(object, value); ++ else { ++ warning("nikita-3438", "Unrecognized safe-link type: %i", link); ++ result = RETERR(-EIO); ++ } ++ return result; ++} ++ ++/* this is common implementation of estimate.create method of file plugin ++ can be used when object creation involves insertion of one item (usually stat ++ data) into tree ++*/ ++reiser4_block_nr estimate_create_common(const struct inode *object) ++{ ++ return estimate_one_insert_item(reiser4_tree_by_inode(object)); ++} ++ ++/* this is common implementation of estimate.create method of file plugin for ++ typical directory ++ can be used when directory creation involves insertion of two items (usually ++ stat data and item containing "." and "..") into tree ++*/ ++reiser4_block_nr estimate_create_common_dir(const struct inode *object) ++{ ++ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object)); ++} ++ ++/* this is common implementation of estimate.update method of file plugin ++ can be used when stat data update does not do more than inserting a unit ++ into a stat data item which is probably true for most cases ++*/ ++reiser4_block_nr estimate_update_common(const struct inode *inode) ++{ ++ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); ++} ++ ++/* this is common implementation of estimate.unlink method of file plugin ++ */ ++reiser4_block_nr ++estimate_unlink_common(const struct inode *object UNUSED_ARG, ++ const struct inode *parent UNUSED_ARG) ++{ ++ return 0; ++} ++ ++/* this is common implementation of estimate.unlink method of file plugin for ++ typical directory ++*/ ++reiser4_block_nr ++estimate_unlink_common_dir(const struct inode *object, ++ const struct inode *parent) ++{ ++ dir_plugin *dplug; ++ ++ dplug = inode_dir_plugin(object); ++ assert("nikita-2888", dplug != NULL); ++ assert("nikita-2887", dplug->estimate.unlink != NULL); ++ return dplug->estimate.unlink(object, parent); ++} ++ ++char *wire_write_common(struct inode *inode, char *start) ++{ ++ return build_inode_onwire(inode, start); ++} ++ ++char *wire_read_common(char *addr, reiser4_object_on_wire * obj) ++{ ++ if (!obj) ++ return locate_obj_key_id_onwire(addr); ++ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id); ++} ++ ++struct dentry *wire_get_common(struct super_block *sb, ++ reiser4_object_on_wire * obj) ++{ ++ struct inode *inode; ++ struct dentry *dentry; ++ reiser4_key key; ++ ++ extract_key_from_id(&obj->u.std.key_id, &key); ++ inode = reiser4_iget(sb, &key, 1); ++ if (!IS_ERR(inode)) { ++ reiser4_iget_complete(inode); ++ dentry = d_obtain_alias(inode); ++ if (!IS_ERR(dentry)) ++ dentry->d_op = &get_super_private(sb)->ops.dentry; ++ } else if (PTR_ERR(inode) == -ENOENT) ++ /* ++ * inode wasn't found at the key encoded in the file ++ * handle. Hence, file handle is stale. 
++ */ ++ dentry = ERR_PTR(RETERR(-ESTALE)); ++ else ++ dentry = (void *)inode; ++ return dentry; ++} ++ ++int wire_size_common(struct inode *inode) ++{ ++ return inode_onwire_size(inode); ++} ++ ++void wire_done_common(reiser4_object_on_wire * obj) ++{ ++ /* nothing to do */ ++} ++ ++/* helper function to print errors */ ++static void key_warning(const reiser4_key * key /* key to print */ , ++ const struct inode *inode, ++ int code/* error code to print */) ++{ ++ assert("nikita-716", key != NULL); ++ ++ if (code != -ENOMEM) { ++ warning("nikita-717", "Error for inode %llu (%i)", ++ (unsigned long long)get_key_objectid(key), code); ++ reiser4_print_key("for key", key); ++ } ++} ++ ++/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */ ++#if REISER4_DEBUG ++static void ++check_inode_seal(const struct inode *inode, ++ const coord_t *coord, const reiser4_key * key) ++{ ++ reiser4_key unit_key; ++ ++ unit_key_by_coord(coord, &unit_key); ++ assert("nikita-2752", ++ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key))); ++ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key)); ++} ++ ++static void check_sd_coord(coord_t *coord, const reiser4_key * key) ++{ ++ reiser4_key ukey; ++ ++ coord_clear_iplug(coord); ++ if (zload(coord->node)) ++ return; ++ ++ if (!coord_is_existing_unit(coord) || ++ !item_plugin_by_coord(coord) || ++ !keyeq(unit_key_by_coord(coord, &ukey), key) || ++ (znode_get_level(coord->node) != LEAF_LEVEL) || ++ !item_is_statdata(coord)) { ++ warning("nikita-1901", "Conspicuous seal"); ++ reiser4_print_key("key", key); ++ print_coord("coord", coord, 1); ++ impossible("nikita-2877", "no way"); ++ } ++ zrelse(coord->node); ++} ++ ++#else ++#define check_inode_seal(inode, coord, key) noop ++#define check_sd_coord(coord, key) noop ++#endif ++ ++/* insert new stat-data into tree. Called with inode state ++ locked. Return inode state locked. */ ++static int insert_new_sd(struct inode *inode/* inode to create sd for */) ++{ ++ int result; ++ reiser4_key key; ++ coord_t coord; ++ reiser4_item_data data; ++ char *area; ++ reiser4_inode *ref; ++ lock_handle lh; ++ oid_t oid; ++ ++ assert("nikita-723", inode != NULL); ++ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD)); ++ ++ ref = reiser4_inode_data(inode); ++ spin_lock_inode(inode); ++ ++ if (ref->plugin_mask != 0) ++ /* inode has non-standard plugins */ ++ inode_set_extension(inode, PLUGIN_STAT); ++ /* ++ * prepare specification of new item to be inserted ++ */ ++ ++ data.iplug = inode_sd_plugin(inode); ++ data.length = data.iplug->s.sd.save_len(inode); ++ spin_unlock_inode(inode); ++ ++ data.data = NULL; ++ data.user = 0; ++/* could be optimized for case where there is only one node format in ++ * use in the filesystem, probably there are lots of such ++ * places we could optimize for only one node layout.... -Hans */ ++ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()) { ++ /* This is silly check, but we don't know actual node where ++ insertion will go into. */ ++ return RETERR(-ENAMETOOLONG); ++ } ++ oid = oid_allocate(inode->i_sb); ++/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be ++ * encapsulated into oid_allocate? 
*/ ++ if (oid == ABSOLUTE_MAX_OID) ++ return RETERR(-EOVERFLOW); ++ ++ set_inode_oid(inode, oid); ++ ++ coord_init_zero(&coord); ++ init_lh(&lh); ++ ++ result = insert_by_key(reiser4_tree_by_inode(inode), ++ build_sd_key(inode, &key), &data, &coord, &lh, ++ /* stat data lives on a leaf level */ ++ LEAF_LEVEL, CBK_UNIQUE); ++ ++ /* we don't want to re-check that somebody didn't insert ++ stat-data while we were doing io, because if it did, ++ insert_by_key() returned error. */ ++ /* but what _is_ possible is that plugin for inode's stat-data, ++ list of non-standard plugins or their state would change ++ during io, so that stat-data wouldn't fit into sd. To avoid ++ this race we keep inode_state lock. This lock has to be ++ taken each time you access inode in a way that would cause ++ changes in sd size: changing plugins etc. ++ */ ++ ++ if (result == IBK_INSERT_OK) { ++ coord_clear_iplug(&coord); ++ result = zload(coord.node); ++ if (result == 0) { ++ /* have we really inserted stat data? */ ++ assert("nikita-725", item_is_statdata(&coord)); ++ ++ /* inode was just created. It is inserted into hash ++ table, but no directory entry was yet inserted into ++ parent. So, inode is inaccessible through ++ ->lookup(). All places that directly grab inode ++ from hash-table (like old knfsd), should check ++ IMMUTABLE flag that is set by common_create_child. ++ */ ++ assert("nikita-3240", data.iplug != NULL); ++ assert("nikita-3241", data.iplug->s.sd.save != NULL); ++ area = item_body_by_coord(&coord); ++ result = data.iplug->s.sd.save(inode, &area); ++ znode_make_dirty(coord.node); ++ if (result == 0) { ++ /* object has stat-data now */ ++ reiser4_inode_clr_flag(inode, REISER4_NO_SD); ++ reiser4_inode_set_flag(inode, ++ REISER4_SDLEN_KNOWN); ++ /* initialise stat-data seal */ ++ reiser4_seal_init(&ref->sd_seal, &coord, &key); ++ ref->sd_coord = coord; ++ check_inode_seal(inode, &coord, &key); ++ } else if (result != -ENOMEM) ++ /* ++ * convert any other error code to -EIO to ++ * avoid confusing user level with unexpected ++ * errors. ++ */ ++ result = RETERR(-EIO); ++ zrelse(coord.node); ++ } ++ } ++ done_lh(&lh); ++ ++ if (result != 0) ++ key_warning(&key, inode, result); ++ else ++ oid_count_allocated(); ++ ++ return result; ++} ++ ++/* find sd of inode in a tree, deal with errors */ ++int lookup_sd(struct inode *inode /* inode to look sd for */ , ++ znode_lock_mode lock_mode /* lock mode */ , ++ coord_t *coord /* resulting coord */ , ++ lock_handle * lh /* resulting lock handle */ , ++ const reiser4_key * key /* resulting key */ , ++ int silent) ++{ ++ int result; ++ __u32 flags; ++ ++ assert("nikita-1692", inode != NULL); ++ assert("nikita-1693", coord != NULL); ++ assert("nikita-1694", key != NULL); ++ ++ /* look for the object's stat data in a tree. ++ This returns in "node" pointer to a locked znode and in "pos" ++ position of an item found in node. Both are only valid if ++ coord_found is returned. */ ++ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; ++ flags |= CBK_UNIQUE; ++ /* ++ * traverse tree to find stat data. We cannot use vroot here, because ++ * it only covers _body_ of the file, and stat data don't belong ++ * there. 
++ */ ++ result = coord_by_key(reiser4_tree_by_inode(inode), ++ key, ++ coord, ++ lh, ++ lock_mode, ++ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL); ++ if (REISER4_DEBUG && result == 0) ++ check_sd_coord(coord, key); ++ ++ if (result != 0 && !silent) ++ key_warning(key, inode, result); ++ return result; ++} ++ ++static int ++locate_inode_sd(struct inode *inode, ++ reiser4_key * key, coord_t *coord, lock_handle * lh) ++{ ++ reiser4_inode *state; ++ seal_t seal; ++ int result; ++ ++ assert("nikita-3483", inode != NULL); ++ ++ state = reiser4_inode_data(inode); ++ spin_lock_inode(inode); ++ *coord = state->sd_coord; ++ coord_clear_iplug(coord); ++ seal = state->sd_seal; ++ spin_unlock_inode(inode); ++ ++ build_sd_key(inode, key); ++ if (reiser4_seal_is_set(&seal)) { ++ /* first, try to use seal */ ++ result = reiser4_seal_validate(&seal, ++ coord, ++ key, ++ lh, ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_LOPRI); ++ if (result == 0) ++ check_sd_coord(coord, key); ++ } else ++ result = -E_REPEAT; ++ ++ if (result != 0) { ++ coord_init_zero(coord); ++ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0); ++ } ++ return result; ++} ++ ++#if REISER4_DEBUG ++static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) ++{ ++ return (get_key_locality(k1) == get_key_locality(k2) && ++ get_key_type(k1) == get_key_type(k2) && ++ get_key_band(k1) == get_key_band(k2) && ++ get_key_ordering(k1) == get_key_ordering(k2) && ++ get_key_objectid(k1) == get_key_objectid(k2)); ++} ++ ++#include "../tree_walk.h" ++ ++/* make some checks before and after stat-data resize operation */ ++static int check_sd_resize(struct inode *inode, coord_t *coord, ++ int length, int progress/* 1 means after resize */) ++{ ++ int ret = 0; ++ lock_handle left_lock; ++ coord_t left_coord; ++ reiser4_key left_key; ++ reiser4_key key; ++ ++ if (inode_file_plugin(inode) != ++ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) ++ return 0; ++ if (!length) ++ return 0; ++ if (coord->item_pos != 0) ++ return 0; ++ ++ init_lh(&left_lock); ++ ret = reiser4_get_left_neighbor(&left_lock, ++ coord->node, ++ ZNODE_WRITE_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || ++ ret == -ENOENT || ret == -EINVAL ++ || ret == -E_DEADLOCK) { ++ ret = 0; ++ goto exit; ++ } ++ ret = zload(left_lock.node); ++ if (ret) ++ goto exit; ++ coord_init_last_unit(&left_coord, left_lock.node); ++ item_key_by_coord(&left_coord, &left_key); ++ item_key_by_coord(coord, &key); ++ ++ if (all_but_offset_key_eq(&key, &left_key)) ++ /* corruption occured */ ++ ret = 1; ++ zrelse(left_lock.node); ++ exit: ++ done_lh(&left_lock); ++ return ret; ++} ++#endif ++ ++/* update stat-data at @coord */ ++static int ++update_sd_at(struct inode *inode, coord_t *coord, reiser4_key * key, ++ lock_handle * lh) ++{ ++ int result; ++ reiser4_item_data data; ++ char *area; ++ reiser4_inode *state; ++ znode *loaded; ++ ++ state = reiser4_inode_data(inode); ++ ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (result != 0) ++ return result; ++ loaded = coord->node; ++ ++ spin_lock_inode(inode); ++ assert("nikita-728", inode_sd_plugin(inode) != NULL); ++ data.iplug = inode_sd_plugin(inode); ++ ++ /* if inode has non-standard plugins, add appropriate stat data ++ * extension */ ++ if (state->extmask & (1 << PLUGIN_STAT)) { ++ if (state->plugin_mask == 0) ++ inode_clr_extension(inode, PLUGIN_STAT); ++ } else if (state->plugin_mask != 0) ++ inode_set_extension(inode, PLUGIN_STAT); ++ ++ if (state->extmask & (1 << 
HEIR_STAT)) { ++ if (state->heir_mask == 0) ++ inode_clr_extension(inode, HEIR_STAT); ++ } else if (state->heir_mask != 0) ++ inode_set_extension(inode, HEIR_STAT); ++ ++ /* data.length is how much space to add to (or remove ++ from if negative) sd */ ++ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { ++ /* recalculate stat-data length */ ++ data.length = ++ data.iplug->s.sd.save_len(inode) - ++ item_length_by_coord(coord); ++ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); ++ } else ++ data.length = 0; ++ spin_unlock_inode(inode); ++ ++ /* if on-disk stat data is of different length than required ++ for this inode, resize it */ ++ ++ if (data.length != 0) { ++ data.data = NULL; ++ data.user = 0; ++ ++ assert("edward-1441", ++ !check_sd_resize(inode, coord, ++ data.length, 0/* before resize */)); ++ ++ /* insertion code requires that insertion point (coord) was ++ * between units. */ ++ coord->between = AFTER_UNIT; ++ result = reiser4_resize_item(coord, &data, key, lh, ++ COPI_DONT_SHIFT_LEFT); ++ if (result != 0) { ++ key_warning(key, inode, result); ++ zrelse(loaded); ++ return result; ++ } ++ if (loaded != coord->node) { ++ /* reiser4_resize_item moved coord to another node. ++ Zload it */ ++ zrelse(loaded); ++ coord_clear_iplug(coord); ++ result = zload(coord->node); ++ if (result != 0) ++ return result; ++ loaded = coord->node; ++ } ++ assert("edward-1442", ++ !check_sd_resize(inode, coord, ++ data.length, 1/* after resize */)); ++ } ++ area = item_body_by_coord(coord); ++ spin_lock_inode(inode); ++ result = data.iplug->s.sd.save(inode, &area); ++ znode_make_dirty(coord->node); ++ ++ /* re-initialise stat-data seal */ ++ ++ /* ++ * coord.between was possibly skewed from AT_UNIT when stat-data size ++ * was changed and new extensions were pasted into item. ++ */ ++ coord->between = AT_UNIT; ++ reiser4_seal_init(&state->sd_seal, coord, key); ++ state->sd_coord = *coord; ++ spin_unlock_inode(inode); ++ check_inode_seal(inode, coord, key); ++ zrelse(loaded); ++ return result; ++} ++ ++/* Update existing stat-data in a tree. Called with inode state locked. Return ++ inode state locked. */ ++static int update_sd(struct inode *inode/* inode to update sd for */) ++{ ++ int result; ++ reiser4_key key; ++ coord_t coord; ++ lock_handle lh; ++ ++ assert("nikita-726", inode != NULL); ++ ++ /* no stat-data, nothing to update?! */ ++ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); ++ ++ init_lh(&lh); ++ ++ result = locate_inode_sd(inode, &key, &coord, &lh); ++ if (result == 0) ++ result = update_sd_at(inode, &coord, &key, &lh); ++ done_lh(&lh); ++ ++ return result; ++} ++ ++/* helper for reiser4_delete_object_common and reiser4_delete_dir_common. ++ Remove object stat data. 
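
[Stepping back from the quoted code for a moment: locate_inode_sd() above follows a recurring reiser4 pattern, "validate the seal, else do a full lookup". The sketch below is an editor's condensation of that pattern using only functions visible in this patch; it is not itself part of the patch, and it omits the spin_lock_inode() copying of the seal and coord that the real function performs.]

/* try the cached stat-data location first; fall back to tree lookup */
static int example_find_sd(struct inode *inode, coord_t *coord,
			   lock_handle *lh, reiser4_key *key)
{
	seal_t *seal = &reiser4_inode_data(inode)->sd_seal;
	int ret = -E_REPEAT;	/* forces the lookup path if no seal is set */

	build_sd_key(inode, key);
	if (reiser4_seal_is_set(seal))
		ret = reiser4_seal_validate(seal, coord, key, lh,
					    ZNODE_WRITE_LOCK,
					    ZNODE_LOCK_LOPRI);
	if (ret != 0)
		ret = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
	return ret;
}
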
Space for that must be reserved by caller before ++*/ ++static int ++common_object_delete_no_reserve(struct inode *inode/* object to remove */) ++{ ++ int result; ++ ++ assert("nikita-1477", inode != NULL); ++ ++ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) { ++ reiser4_key sd_key; ++ ++ vfs_dq_free_inode(inode); ++ vfs_dq_drop(inode); ++ ++ build_sd_key(inode, &sd_key); ++ result = ++ reiser4_cut_tree(reiser4_tree_by_inode(inode), ++ &sd_key, &sd_key, NULL, 0); ++ if (result == 0) { ++ reiser4_inode_set_flag(inode, REISER4_NO_SD); ++ result = oid_release(inode->i_sb, get_inode_oid(inode)); ++ if (result == 0) { ++ oid_count_released(); ++ ++ result = safe_link_del(reiser4_tree_by_inode(inode), ++ get_inode_oid(inode), ++ SAFE_UNLINK); ++ } ++ } ++ } else ++ result = 0; ++ return result; ++} ++ ++/* helper for safelink_common */ ++static int process_truncate(struct inode *inode, __u64 size) ++{ ++ int result; ++ struct iattr attr; ++ file_plugin *fplug; ++ reiser4_context *ctx; ++ struct dentry dentry; ++ ++ assert("vs-21", is_in_reiser4_context()); ++ ctx = reiser4_init_context(inode->i_sb); ++ assert("vs-22", !IS_ERR(ctx)); ++ ++ attr.ia_size = size; ++ attr.ia_valid = ATTR_SIZE | ATTR_CTIME; ++ fplug = inode_file_plugin(inode); ++ ++ mutex_lock(&inode->i_mutex); ++ assert("vs-1704", get_current_context()->trans->atom == NULL); ++ dentry.d_inode = inode; ++ result = inode->i_op->setattr(&dentry, &attr); ++ mutex_unlock(&inode->i_mutex); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ ++ return result; ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/hash.c linux-2.6.33/fs/reiser4/plugin/hash.c +--- linux-2.6.33.orig/fs/reiser4/plugin/hash.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/hash.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,352 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Hash functions */ ++ ++#include "../debug.h" ++#include "plugin_header.h" ++#include "plugin.h" ++#include "../super.h" ++#include "../inode.h" ++ ++#include <linux/types.h> ++ ++/* old rupasov (yura) hash */ ++static __u64 hash_rupasov(const unsigned char *name /* name to hash */ , ++ int len/* @name's length */) ++{ ++ int i; ++ int j; ++ int pow; ++ __u64 a; ++ __u64 c; ++ ++ assert("nikita-672", name != NULL); ++ assert("nikita-673", len >= 0); ++ ++ for (pow = 1, i = 1; i < len; ++i) ++ pow = pow * 10; ++ ++ if (len == 1) ++ a = name[0] - 48; ++ else ++ a = (name[0] - 48) * pow; ++ ++ for (i = 1; i < len; ++i) { ++ c = name[i] - 48; ++ for (pow = 1, j = i; j < len - 1; ++j) ++ pow = pow * 10; ++ a = a + c * pow; ++ } ++ for (; i < 40; ++i) { ++ c = '0' - 48; ++ for (pow = 1, j = i; j < len - 1; ++j) ++ pow = pow * 10; ++ a = a + c * pow; ++ } ++ ++ for (; i < 256; ++i) { ++ c = i; ++ for (pow = 1, j = i; j < len - 1; ++j) ++ pow = pow * 10; ++ a = a + c * pow; ++ } ++ ++ a = a << 7; ++ return a; ++} ++ ++/* r5 hash */ ++static __u64 hash_r5(const unsigned char *name /* name to hash */ , ++ int len UNUSED_ARG/* @name's length */) ++{ ++ __u64 a = 0; ++ ++ assert("nikita-674", name != NULL); ++ assert("nikita-675", len >= 0); ++ ++ while (*name) { ++ a += *name << 4; ++ a += *name >> 4; ++ a *= 11; ++ name++; ++ } ++ return a; ++} ++ ++/* Keyed 32-bit hash function using TEA in a Davis-Meyer function ++ H0 = Key ++ Hi = E 
Mi(Hi-1) + Hi-1 ++ ++ (see Applied Cryptography, 2nd edition, p448). ++ ++ Jeremy Fitzhardinge jeremy@zip.com.au 1998 ++ ++ Jeremy has agreed to the contents of reiserfs/README. -Hans ++ ++ This code was blindly upgraded to __u64 by s/__u32/__u64/g. ++*/ ++static __u64 hash_tea(const unsigned char *name /* name to hash */ , ++ int len/* @name's length */) ++{ ++ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u }; ++ ++ __u64 h0 = k[0], h1 = k[1]; ++ __u64 a, b, c, d; ++ __u64 pad; ++ int i; ++ ++ assert("nikita-676", name != NULL); ++ assert("nikita-677", len >= 0); ++ ++#define DELTA 0x9E3779B9u ++#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ ++#define PARTROUNDS 6 /* 6 gets complete mixing */ ++ ++/* a, b, c, d - data; h0, h1 - accumulated hash */ ++#define TEACORE(rounds) \ ++ do { \ ++ __u64 sum = 0; \ ++ int n = rounds; \ ++ __u64 b0, b1; \ ++ \ ++ b0 = h0; \ ++ b1 = h1; \ ++ \ ++ do { \ ++ sum += DELTA; \ ++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ ++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ ++ } while (--n); \ ++ \ ++ h0 += b0; \ ++ h1 += b1; \ ++ } while (0) ++ ++ pad = (__u64) len | ((__u64) len << 8); ++ pad |= pad << 16; ++ ++ while (len >= 16) { ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << ++ 16 | (__u64) name[7] << 24; ++ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << ++ 16 | (__u64) name[11] << 24; ++ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14] ++ << 16 | (__u64) name[15] << 24; ++ ++ TEACORE(PARTROUNDS); ++ ++ len -= 16; ++ name += 16; ++ } ++ ++ if (len >= 12) { ++ /* assert(len < 16); */ ++ if (len >= 16) ++ *(int *)0 = 0; ++ ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << ++ 16 | (__u64) name[7] << 24; ++ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << ++ 16 | (__u64) name[11] << 24; ++ ++ d = pad; ++ for (i = 12; i < len; i++) { ++ d <<= 8; ++ d |= name[i]; ++ } ++ } else if (len >= 8) { ++ /* assert(len < 12); */ ++ if (len >= 12) ++ *(int *)0 = 0; ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << ++ 16 | (__u64) name[7] << 24; ++ ++ c = d = pad; ++ for (i = 8; i < len; i++) { ++ c <<= 8; ++ c |= name[i]; ++ } ++ } else if (len >= 4) { ++ /* assert(len < 8); */ ++ if (len >= 8) ++ *(int *)0 = 0; ++ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << ++ 16 | (__u64) name[3] << 24; ++ ++ b = c = d = pad; ++ for (i = 4; i < len; i++) { ++ b <<= 8; ++ b |= name[i]; ++ } ++ } else { ++ /* assert(len < 4); */ ++ if (len >= 4) ++ *(int *)0 = 0; ++ a = b = c = d = pad; ++ for (i = 0; i < len; i++) { ++ a <<= 8; ++ a |= name[i]; ++ } ++ } ++ ++ TEACORE(FULLROUNDS); ++ ++/* return 0;*/ ++ return h0 ^ h1; ++ ++} ++ ++/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash. ++ ++ See http://www.isthe.com/chongo/tech/comp/fnv/ for details. ++ ++ Excerpts: ++ ++ FNV hashes are designed to be fast while maintaining a low collision ++ rate. ++ ++ [This version also seems to preserve lexicographical order locally.] ++ ++ FNV hash algorithms and source code have been released into the public ++ domain. 
++ ++*/ ++static __u64 hash_fnv1(const unsigned char *name /* name to hash */ , ++ int len UNUSED_ARG/* @name's length */) ++{ ++ unsigned long long a = 0xcbf29ce484222325ull; ++ const unsigned long long fnv_64_prime = 0x100000001b3ull; ++ ++ assert("nikita-678", name != NULL); ++ assert("nikita-679", len >= 0); ++ ++ /* FNV-1 hash each octet in the buffer */ ++ for (; *name; ++name) { ++ /* multiply by the 32 bit FNV magic prime mod 2^64 */ ++ a *= fnv_64_prime; ++ /* xor the bottom with the current octet */ ++ a ^= (unsigned long long)(*name); ++ } ++ /* return our new hash value */ ++ return a; ++} ++ ++/* degenerate hash function used to simplify testing of non-unique key ++ handling */ ++static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ , ++ int len UNUSED_ARG/* @name's length */) ++{ ++ return 0xc0c0c0c010101010ull; ++} ++ ++static int change_hash(struct inode *inode, ++ reiser4_plugin * plugin, ++ pset_member memb) ++{ ++ int result; ++ ++ assert("nikita-3503", inode != NULL); ++ assert("nikita-3504", plugin != NULL); ++ ++ assert("nikita-3505", is_reiser4_inode(inode)); ++ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE); ++ ++ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) ++ return RETERR(-EINVAL); ++ ++ result = 0; ++ if (inode_hash_plugin(inode) == NULL || ++ inode_hash_plugin(inode)->h.id != plugin->h.id) { ++ if (is_dir_empty(inode) == 0) ++ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, ++ PSET_HASH, plugin); ++ else ++ result = RETERR(-ENOTEMPTY); ++ ++ } ++ return result; ++} ++ ++static reiser4_plugin_ops hash_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = change_hash ++}; ++ ++/* hash plugins */ ++hash_plugin hash_plugins[LAST_HASH_ID] = { ++ [RUPASOV_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = RUPASOV_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "rupasov", ++ .desc = "Original Yura's hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_rupasov ++ }, ++ [R5_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = R5_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "r5", ++ .desc = "r5 hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_r5 ++ }, ++ [TEA_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = TEA_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "tea", ++ .desc = "tea hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_tea ++ }, ++ [FNV1_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = FNV1_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "fnv1", ++ .desc = "fnv1 hash", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_fnv1 ++ }, ++ [DEGENERATE_HASH_ID] = { ++ .h = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .id = DEGENERATE_HASH_ID, ++ .pops = &hash_plugin_ops, ++ .label = "degenerate hash", ++ .desc = "Degenerate hash: only for testing", ++ .linkage = {NULL, NULL} ++ }, ++ .hash = hash_deg ++ } ++}; ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.33/fs/reiser4/plugin/inode_ops.c +--- linux-2.6.33.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/inode_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,906 @@ ++/* ++ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README ++ */ ++ ++/* ++ * this file contains typical implementations for most of methods of struct ++ * inode_operations ++ */ ++ ++#include "../inode.h" ++#include "../safe_link.h" ++ ++#include <linux/quotaops.h> ++#include <linux/namei.h> ++ ++static int create_vfs_object(struct inode *parent, struct dentry *dentry, ++ reiser4_object_create_data *data); ++ ++/** ++ * reiser4_create_common - create of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of new object to create ++ * @mode: the permissions to use ++ * @nameidata: ++ * ++ * This is common implementation of vfs's create method of struct ++ * inode_operations. ++ * Creates regular file using file plugin from parent directory plugin set. ++ */ ++int reiser4_create_common(struct inode *parent, struct dentry *dentry, ++ int mode, struct nameidata *nameidata) ++{ ++ reiser4_object_create_data data; ++ file_plugin *fplug; ++ ++ memset(&data, 0, sizeof data); ++ data.mode = S_IFREG | mode; ++ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent); ++ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) { ++ warning("vpf-1900", "'%s' is not a regular file plugin.", ++ fplug->h.label); ++ return RETERR(-EIO); ++ } ++ data.id = fplug->h.id; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *); ++void check_light_weight(struct inode *inode, struct inode *parent); ++ ++/** ++ * reiser4_lookup_common - lookup of inode operations ++ * @parent: inode of directory to lookup into ++ * @dentry: name to look for ++ * @nameidata: ++ * ++ * This is common implementation of vfs's lookup method of struct ++ * inode_operations. ++ */ ++struct dentry *reiser4_lookup_common(struct inode *parent, ++ struct dentry *dentry, ++ struct nameidata *nameidata) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct dentry *new; ++ struct inode *inode; ++ reiser4_dir_entry_desc entry; ++ ++ ctx = reiser4_init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return (struct dentry *)ctx; ++ ++ /* set up operations on dentry. */ ++ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry; ++ ++ result = reiser4_lookup_name(parent, dentry, &entry.key); ++ if (result) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ if (result == -ENOENT) { ++ /* object not found */ ++ if (!IS_DEADDIR(parent)) ++ d_add(dentry, NULL); ++ return NULL; ++ } ++ return ERR_PTR(result); ++ } ++ ++ inode = reiser4_iget(parent->i_sb, &entry.key, 0); ++ if (IS_ERR(inode)) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return ERR_PTR(PTR_ERR(inode)); ++ } ++ ++ /* success */ ++ check_light_weight(inode, parent); ++ new = d_splice_alias(inode, dentry); ++ reiser4_iget_complete(inode); ++ ++ /* prevent balance_dirty_pages() from being called: we don't want to ++ * do this under directory i_mutex. 
*/ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return new; ++} ++ ++static reiser4_block_nr common_estimate_link(struct inode *parent, ++ struct inode *object); ++int reiser4_update_dir(struct inode *); ++ ++/** ++ * reiser4_link_common - link of inode operations ++ * @existing: dentry of object which is to get new name ++ * @parent: directory where new name is to be created ++ * @newname: new name ++ * ++ * This is common implementation of vfs's link method of struct ++ * inode_operations. ++ */ ++int reiser4_link_common(struct dentry *existing, struct inode *parent, ++ struct dentry *newname) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *object; ++ dir_plugin *parent_dplug; ++ reiser4_dir_entry_desc entry; ++ reiser4_object_create_data data; ++ reiser4_block_nr reserve; ++ ++ ctx = reiser4_init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ assert("nikita-1431", existing != NULL); ++ assert("nikita-1432", parent != NULL); ++ assert("nikita-1433", newname != NULL); ++ ++ object = existing->d_inode; ++ assert("nikita-1434", object != NULL); ++ ++ /* check for race with create_object() */ ++ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-E_REPEAT); ++ } ++ ++ parent_dplug = inode_dir_plugin(parent); ++ ++ memset(&entry, 0, sizeof entry); ++ entry.obj = object; ++ ++ data.mode = object->i_mode; ++ data.id = inode_file_plugin(object)->h.id; ++ ++ reserve = common_estimate_link(parent, existing->d_inode); ++ if ((__s64) reserve < 0) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return reserve; ++ } ++ ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOSPC); ++ } ++ ++ /* ++ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It ++ * means that link(2) can race against unlink(2) or rename(2), and ++ * inode is dead (->i_nlink == 0) when reiser4_link() is entered. ++ * ++ * For such inode we have to undo special processing done in ++ * reiser4_unlink() viz. creation of safe-link. ++ */ ++ if (unlikely(object->i_nlink == 0)) { ++ result = safe_link_del(reiser4_tree_by_inode(object), ++ get_inode_oid(object), SAFE_UNLINK); ++ if (result != 0) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ } ++ ++ /* increment nlink of @existing and update its stat data */ ++ result = reiser4_add_nlink(object, parent, 1); ++ if (result == 0) { ++ /* add entry to the parent */ ++ result = ++ parent_dplug->add_entry(parent, newname, &data, &entry); ++ if (result != 0) { ++ /* failed to add entry to the parent, decrement nlink ++ of @existing */ ++ reiser4_del_nlink(object, parent, 1); ++ /* ++ * now, if that failed, we have a file with too big ++ * nlink---space leak, much better than directory ++ * entry pointing to nowhere ++ */ ++ } ++ } ++ if (result == 0) { ++ atomic_inc(&object->i_count); ++ /* ++ * Upon successful completion, link() shall mark for update ++ * the st_ctime field of the file. Also, the st_ctime and ++ * st_mtime fields of the directory that contains the new ++ * entry shall be marked for update. 
--SUS ++ */ ++ result = reiser4_update_dir(parent); ++ } ++ if (result == 0) ++ d_instantiate(newname, existing->d_inode); ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++static int unlink_check_and_grab(struct inode *parent, struct dentry *victim); ++ ++/** ++ * reiser4_unlink_common - unlink of inode operations ++ * @parent: inode of directory to remove name from ++ * @victim: name to be removed ++ * ++ * This is common implementation of vfs's unlink method of struct ++ * inode_operations. ++ */ ++int reiser4_unlink_common(struct inode *parent, struct dentry *victim) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *object; ++ file_plugin *fplug; ++ ++ ctx = reiser4_init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ object = victim->d_inode; ++ fplug = inode_file_plugin(object); ++ assert("nikita-2882", fplug->detach != NULL); ++ ++ result = unlink_check_and_grab(parent, victim); ++ if (result != 0) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ result = fplug->detach(object, parent); ++ if (result == 0) { ++ dir_plugin *parent_dplug; ++ reiser4_dir_entry_desc entry; ++ ++ parent_dplug = inode_dir_plugin(parent); ++ memset(&entry, 0, sizeof entry); ++ ++ /* first, delete directory entry */ ++ result = parent_dplug->rem_entry(parent, victim, &entry); ++ if (result == 0) { ++ /* ++ * if name was removed successfully, we _have_ to ++ * return 0 from this function, because upper level ++ * caller (vfs_{rmdir,unlink}) expect this. ++ * ++ * now that directory entry is removed, update ++ * stat-data ++ */ ++ reiser4_del_nlink(object, parent, 1); ++ /* ++ * Upon successful completion, unlink() shall mark for ++ * update the st_ctime and st_mtime fields of the ++ * parent directory. Also, if the file's link count is ++ * not 0, the st_ctime field of the file shall be ++ * marked for update. --SUS ++ */ ++ reiser4_update_dir(parent); ++ /* add safe-link for this file */ ++ if (object->i_nlink == 0) ++ safe_link_add(object, SAFE_UNLINK); ++ } ++ } ++ ++ if (unlikely(result != 0)) { ++ if (result != -ENOMEM) ++ warning("nikita-3398", "Cannot unlink %llu (%i)", ++ (unsigned long long)get_inode_oid(object), ++ result); ++ /* if operation failed commit pending inode modifications to ++ * the stat-data */ ++ reiser4_update_sd(object); ++ reiser4_update_sd(parent); ++ } ++ ++ reiser4_release_reserved(object->i_sb); ++ ++ /* @object's i_ctime was updated by ->rem_link() method(). */ ++ ++ /* @victim can be already removed from the disk by this time. Inode is ++ then marked so that iput() wouldn't try to remove stat data. But ++ inode itself is still there. ++ */ ++ ++ /* ++ * we cannot release directory semaphore here, because name has ++ * already been deleted, but dentry (@victim) still exists. Prevent ++ * balance_dirty_pages() from being called on exiting this context: we ++ * don't want to do this under directory i_mutex. ++ */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/** ++ * reiser4_symlink_common - symlink of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of object to be created ++ * @linkname: string symlink is to contain ++ * ++ * This is common implementation of vfs's symlink method of struct ++ * inode_operations. ++ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID. 
++ */ ++int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, ++ const char *linkname) ++{ ++ reiser4_object_create_data data; ++ ++ memset(&data, 0, sizeof data); ++ data.name = linkname; ++ data.id = SYMLINK_FILE_PLUGIN_ID; ++ data.mode = S_IFLNK | S_IRWXUGO; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++/** ++ * reiser4_mkdir_common - mkdir of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of object to be created ++ * @mode: the permissions to use ++ * ++ * This is common implementation of vfs's mkdir method of struct ++ * inode_operations. ++ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID. ++ */ ++int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode) ++{ ++ reiser4_object_create_data data; ++ ++ memset(&data, 0, sizeof data); ++ data.mode = S_IFDIR | mode; ++ data.id = DIRECTORY_FILE_PLUGIN_ID; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++/** ++ * reiser4_mknod_common - mknod of inode operations ++ * @parent: inode of parent directory ++ * @dentry: dentry of object to be created ++ * @mode: the permissions to use and file type ++ * @rdev: minor and major of new device file ++ * ++ * This is common implementation of vfs's mknod method of struct ++ * inode_operations. ++ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID. ++ */ ++int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, ++ int mode, dev_t rdev) ++{ ++ reiser4_object_create_data data; ++ ++ memset(&data, 0, sizeof data); ++ data.mode = mode; ++ data.rdev = rdev; ++ data.id = SPECIAL_FILE_PLUGIN_ID; ++ return create_vfs_object(parent, dentry, &data); ++} ++ ++/* ++ * implementation of vfs's rename method of struct inode_operations for typical ++ * directory is in inode_ops_rename.c ++ */ ++ ++/** ++ * reiser4_follow_link_common - follow_link of inode operations ++ * @dentry: dentry of symlink ++ * @data: ++ * ++ * This is common implementation of vfs's followlink method of struct ++ * inode_operations. ++ * Assumes that inode's i_private points to the content of symbolic link. ++ */ ++void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd) ++{ ++ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode)); ++ ++ if (!dentry->d_inode->i_private ++ || !reiser4_inode_get_flag(dentry->d_inode, ++ REISER4_GENERIC_PTR_USED)) ++ return ERR_PTR(RETERR(-EINVAL)); ++ nd_set_link(nd, dentry->d_inode->i_private); ++ return NULL; ++} ++ ++/** ++ * reiser4_permission_common - permission of inode operations ++ * @inode: inode to check permissions for ++ * @mask: mode bits to check permissions for ++ * @nameidata: ++ * ++ * Uses generic function to check for rwx permissions. ++ */ ++int reiser4_permission_common(struct inode *inode, int mask) ++{ ++ return generic_permission(inode, mask, NULL); ++} ++ ++static int setattr_reserve(reiser4_tree *); ++ ++/* this is common implementation of vfs's setattr method of struct ++ inode_operations ++*/ ++int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr) ++{ ++ reiser4_context *ctx; ++ struct inode *inode; ++ int result; ++ ++ inode = dentry->d_inode; ++ result = inode_change_ok(inode, attr); ++ if (result) ++ return result; ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE)); ++ ++ /* ++ * grab disk space and call standard inode_setattr(). 
++ */ ++ result = setattr_reserve(reiser4_tree_by_inode(inode)); ++ if (!result) { ++ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ++ || (attr->ia_valid & ATTR_GID ++ && attr->ia_gid != inode->i_gid)) { ++ result = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; ++ if (result) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ } ++ result = inode_setattr(inode, attr); ++ if (!result) ++ reiser4_update_sd(inode); ++ } ++ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/* this is common implementation of vfs's getattr method of struct ++ inode_operations ++*/ ++int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG, ++ struct dentry *dentry, struct kstat *stat) ++{ ++ struct inode *obj; ++ ++ assert("nikita-2298", dentry != NULL); ++ assert("nikita-2299", stat != NULL); ++ assert("nikita-2300", dentry->d_inode != NULL); ++ ++ obj = dentry->d_inode; ++ ++ stat->dev = obj->i_sb->s_dev; ++ stat->ino = oid_to_uino(get_inode_oid(obj)); ++ stat->mode = obj->i_mode; ++ /* don't confuse userland with huge nlink. This is not entirely ++ * correct, because nlink_t is not necessary 16 bit signed. */ ++ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff); ++ stat->uid = obj->i_uid; ++ stat->gid = obj->i_gid; ++ stat->rdev = obj->i_rdev; ++ stat->atime = obj->i_atime; ++ stat->mtime = obj->i_mtime; ++ stat->ctime = obj->i_ctime; ++ stat->size = obj->i_size; ++ stat->blocks = ++ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS; ++ /* "preferred" blocksize for efficient file system I/O */ ++ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size; ++ ++ return 0; ++} ++ ++/* Estimate the maximum amount of nodes which might be allocated or changed on ++ typical new object creation. Typical creation consists of calling create ++ method of file plugin, adding directory entry to parent and update parent ++ directory's stat data. ++*/ ++static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, ++ /* parent object */ ++ struct inode *object ++ /* object */) ++{ ++ assert("vpf-309", parent != NULL); ++ assert("vpf-307", object != NULL); ++ ++ return ++ /* object creation estimation */ ++ inode_file_plugin(object)->estimate.create(object) + ++ /* stat data of parent directory estimation */ ++ inode_file_plugin(parent)->estimate.update(parent) + ++ /* adding entry estimation */ ++ inode_dir_plugin(parent)->estimate.add_entry(parent) + ++ /* to undo in the case of failure */ ++ inode_dir_plugin(parent)->estimate.rem_entry(parent); ++} ++ ++/* Create child in directory. ++ ++ . get object's plugin ++ . get fresh inode ++ . initialize inode ++ . add object's stat-data ++ . initialize object's directory ++ . add entry to the parent ++ . 
instantiate dentry ++ ++*/ ++static int do_create_vfs_child(reiser4_object_create_data * data,/* parameters ++ of new ++ object */ ++ struct inode **retobj) ++{ ++ int result; ++ ++ struct dentry *dentry; /* parent object */ ++ struct inode *parent; /* new name */ ++ ++ dir_plugin *par_dir; /* directory plugin on the parent */ ++ dir_plugin *obj_dir; /* directory plugin on the new object */ ++ file_plugin *obj_plug; /* object plugin on the new object */ ++ struct inode *object; /* new object */ ++ reiser4_block_nr reserve; ++ ++ reiser4_dir_entry_desc entry; /* new directory entry */ ++ ++ assert("nikita-1420", data != NULL); ++ parent = data->parent; ++ dentry = data->dentry; ++ ++ assert("nikita-1418", parent != NULL); ++ assert("nikita-1419", dentry != NULL); ++ ++ /* check, that name is acceptable for parent */ ++ par_dir = inode_dir_plugin(parent); ++ if (par_dir->is_name_acceptable && ++ !par_dir->is_name_acceptable(parent, ++ dentry->d_name.name, ++ (int)dentry->d_name.len)) ++ return RETERR(-ENAMETOOLONG); ++ ++ result = 0; ++ obj_plug = file_plugin_by_id((int)data->id); ++ if (obj_plug == NULL) { ++ warning("nikita-430", "Cannot find plugin %i", data->id); ++ return RETERR(-ENOENT); ++ } ++ object = new_inode(parent->i_sb); ++ if (object == NULL) ++ return RETERR(-ENOMEM); ++ /* we'll update i_nlink below */ ++ object->i_nlink = 0; ++ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0, ++ * to simplify error handling: if some error occurs before i_ino is ++ * initialized with oid, i_ino should already be set to some ++ * distinguished value. */ ++ object->i_ino = 0; ++ ++ /* So that on error iput will be called. */ ++ *retobj = object; ++ ++ if (vfs_dq_alloc_inode(object)) { ++ vfs_dq_drop(object); ++ object->i_flags |= S_NOQUOTA; ++ return RETERR(-EDQUOT); ++ } ++ ++ memset(&entry, 0, sizeof entry); ++ entry.obj = object; ++ ++ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE, ++ file_plugin_to_plugin(obj_plug)); ++ result = obj_plug->set_plug_in_inode(object, parent, data); ++ if (result) { ++ warning("nikita-431", "Cannot install plugin %i on %llx", ++ data->id, (unsigned long long)get_inode_oid(object)); ++ vfs_dq_free_inode(object); ++ object->i_flags |= S_NOQUOTA; ++ return result; ++ } ++ ++ /* reget plugin after installation */ ++ obj_plug = inode_file_plugin(object); ++ ++ if (obj_plug->create_object == NULL) { ++ vfs_dq_free_inode(object); ++ object->i_flags |= S_NOQUOTA; ++ return RETERR(-EPERM); ++ } ++ ++ /* if any of hash, tail, sd or permission plugins for newly created ++ object are not set yet set them here inheriting them from parent ++ directory ++ */ ++ assert("nikita-2070", obj_plug->adjust_to_parent != NULL); ++ result = obj_plug->adjust_to_parent(object, ++ parent, ++ object->i_sb->s_root->d_inode); ++ if (result == 0) ++ result = finish_pset(object); ++ if (result != 0) { ++ warning("nikita-432", "Cannot inherit from %llx to %llx", ++ (unsigned long long)get_inode_oid(parent), ++ (unsigned long long)get_inode_oid(object)); ++ vfs_dq_free_inode(object); ++ object->i_flags |= S_NOQUOTA; ++ return result; ++ } ++ ++ /* setup inode and file-operations for this inode */ ++ setup_inode_ops(object, data); ++ ++ /* call file plugin's method to initialize plugin specific part of ++ * inode */ ++ if (obj_plug->init_inode_data) ++ obj_plug->init_inode_data(object, data, 1/*create */); ++ ++ /* obtain directory plugin (if any) for new object. 
*/
++ obj_dir = inode_dir_plugin(object);
++ if (obj_dir != NULL && obj_dir->init == NULL) {
++ vfs_dq_free_inode(object);
++ object->i_flags |= S_NOQUOTA;
++ return RETERR(-EPERM);
++ }
++
++ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
++
++ reserve = estimate_create_vfs_object(parent, object);
++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
++ vfs_dq_free_inode(object);
++ object->i_flags |= S_NOQUOTA;
++ return RETERR(-ENOSPC);
++ }
++
++ /* mark inode `immutable'. We disable changes to the file being
++ created until a valid directory entry for it is inserted. Otherwise,
++ if the file were expanded and insertion of the directory entry fails,
++ we would have to remove the file, but we only allotted enough space
++ in the transaction to remove an _empty_ file. 3.x code used to remove
++ stat data in a different transaction, thus possibly leaking disk
++ space on crash. This all only matters if it's possible to access a
++ file without a name, for example, by inode number
++ */
++ reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
++
++ /* create empty object, this includes allocation of new objectid. For
++ directories this implies creation of dot and dotdot */
++ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
++
++ /* mark inode as `loaded'. From this point onward
++ reiser4_delete_inode() will try to remove its stat-data. */
++ reiser4_inode_set_flag(object, REISER4_LOADED);
++
++ result = obj_plug->create_object(object, parent, data);
++ if (result != 0) {
++ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
++ if (result != -ENAMETOOLONG && result != -ENOMEM)
++ warning("nikita-2219",
++ "Failed to create sd for %llu",
++ (unsigned long long)get_inode_oid(object));
++ vfs_dq_free_inode(object);
++ object->i_flags |= S_NOQUOTA;
++ return result;
++ }
++
++ if (obj_dir != NULL)
++ result = obj_dir->init(object, parent, data);
++ if (result == 0) {
++ assert("nikita-434", !reiser4_inode_get_flag(object,
++ REISER4_NO_SD));
++ /* insert inode into VFS hash table */
++ insert_inode_hash(object);
++ /* create entry */
++ result = par_dir->add_entry(parent, dentry, data, &entry);
++ if (result == 0) {
++ result = reiser4_add_nlink(object, parent, 0);
++ /* If O_CREAT is set and the file did not previously
++ exist, upon successful completion, open() shall
++ mark for update the st_atime, st_ctime, and
++ st_mtime fields of the file and the st_ctime and
++ st_mtime fields of the parent directory. --SUS
++ */
++ /* @object times are already updated by
++ reiser4_add_nlink() */
++ if (result == 0)
++ reiser4_update_dir(parent);
++ if (result != 0)
++ /* cleanup failure to add nlink */
++ par_dir->rem_entry(parent, dentry, &entry);
++ }
++ if (result != 0)
++ /* cleanup failure to add entry */
++ obj_plug->detach(object, parent);
++ } else if (result != -ENOMEM)
++ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
++ (unsigned long long)get_inode_oid(object), result);
++
++ /*
++ * update stat-data, committing all pending modifications to the inode
++ * fields.
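++ * Note that this runs on the failure path as well: the inode still
++ * exists at this point and its fields were modified above, even when
++ * the object is deleted again right below.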
++ */ ++ reiser4_update_sd(object); ++ if (result != 0) { ++ vfs_dq_free_inode(object); ++ object->i_flags |= S_NOQUOTA; ++ /* if everything was ok (result == 0), parent stat-data is ++ * already updated above (update_parent_dir()) */ ++ reiser4_update_sd(parent); ++ /* failure to create entry, remove object */ ++ obj_plug->delete_object(object); ++ } ++ ++ /* file has name now, clear immutable flag */ ++ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE); ++ ++ /* on error, iput() will call ->delete_inode(). We should keep track ++ of the existence of stat-data for this inode and avoid attempt to ++ remove it in reiser4_delete_inode(). This is accomplished through ++ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags ++ */ ++ return result; ++} ++ ++/* this is helper for common implementations of reiser4_mkdir, reiser4_create, ++ reiser4_mknod and reiser4_symlink ++*/ ++static int ++create_vfs_object(struct inode *parent, ++ struct dentry *dentry, reiser4_object_create_data * data) ++{ ++ reiser4_context *ctx; ++ int result; ++ struct inode *child; ++ ++ ctx = reiser4_init_context(parent->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ context_set_commit_async(ctx); ++ ++ data->parent = parent; ++ data->dentry = dentry; ++ child = NULL; ++ result = do_create_vfs_child(data, &child); ++ if (unlikely(result != 0)) { ++ if (child != NULL) { ++ reiser4_make_bad_inode(child); ++ iput(child); ++ } ++ } else ++ d_instantiate(dentry, child); ++ ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++/** ++ * helper for link_common. Estimate disk space necessary to add a link ++ * from @parent to @object ++ */ ++static reiser4_block_nr common_estimate_link(struct inode *parent /* parent ++ * directory ++ */, ++ struct inode *object /* object to ++ * which new ++ * link is ++ * being ++ * created */) ++{ ++ reiser4_block_nr res = 0; ++ file_plugin *fplug; ++ dir_plugin *dplug; ++ ++ assert("vpf-317", object != NULL); ++ assert("vpf-318", parent != NULL); ++ ++ fplug = inode_file_plugin(object); ++ dplug = inode_dir_plugin(parent); ++ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice ++ * instead of multiplying by 2? */ ++ /* reiser4_add_nlink(object) */ ++ res += fplug->estimate.update(object); ++ /* add_entry(parent) */ ++ res += dplug->estimate.add_entry(parent); ++ /* reiser4_del_nlink(object) */ ++ res += fplug->estimate.update(object); ++ /* update_dir(parent) */ ++ res += inode_file_plugin(parent)->estimate.update(parent); ++ /* safe-link */ ++ res += estimate_one_item_removal(reiser4_tree_by_inode(object)); ++ ++ return res; ++} ++ ++/* Estimate disk space necessary to remove a link between @parent and ++ @object. 
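++ The components parallel common_estimate_link() above, with entry
++ removal in place of insertion, the file plugin's own unlink cost,
++ and a safe-link item that is inserted rather than removed.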
++*/
++static reiser4_block_nr estimate_unlink(struct inode *parent /* parent
++ * directory */,
++ struct inode *object /* object whose
++ * link is being
++ * removed
++ */)
++{
++ reiser4_block_nr res = 0;
++ file_plugin *fplug;
++ dir_plugin *dplug;
++
++ assert("vpf-317", object != NULL);
++ assert("vpf-318", parent != NULL);
++
++ fplug = inode_file_plugin(object);
++ dplug = inode_dir_plugin(parent);
++
++ /* rem_entry(parent) */
++ res += dplug->estimate.rem_entry(parent);
++ /* reiser4_del_nlink(object) */
++ res += fplug->estimate.update(object);
++ /* update_dir(parent) */
++ res += inode_file_plugin(parent)->estimate.update(parent);
++ /* fplug->unlink */
++ res += fplug->estimate.unlink(object, parent);
++ /* safe-link */
++ res += estimate_one_insert_item(reiser4_tree_by_inode(object));
++
++ return res;
++}
++
++/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
++static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
++{
++ file_plugin *fplug;
++ struct inode *child;
++ int result;
++
++ result = 0;
++ child = victim->d_inode;
++ fplug = inode_file_plugin(child);
++
++ /* check for race with create_object() */
++ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
++ return RETERR(-E_REPEAT);
++ /* object being deleted should have stat data */
++ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
++
++ /* ask object plugin */
++ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
++ return RETERR(-ENOTEMPTY);
++
++ result = (int)estimate_unlink(parent, child);
++ if (result < 0)
++ return result;
++
++ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
++}
++
++/* helper for reiser4_setattr_common */
++static int setattr_reserve(reiser4_tree * tree)
++{
++ assert("vs-1096", is_grab_enabled(get_current_context()));
++ return reiser4_grab_space(estimate_one_insert_into_item(tree),
++ BA_CAN_COMMIT);
++}
++
++/* helper function. Standards require that for many file-system operations
++ ctime and mtime of the parent directory are to be updated on success. */
++int reiser4_update_dir(struct inode *dir)
++{
++ assert("nikita-2525", dir != NULL);
++
++ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
++ return reiser4_update_sd(dir);
++}
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.33/fs/reiser4/plugin/inode_ops_rename.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/inode_ops_rename.c 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,925 @@
++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++#include "../inode.h"
++#include "../safe_link.h"
++
++static const char *possible_leak = "Possible disk space leak.";
++
++/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
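++ Re-binding means overwriting the object key stored in the entry and
++ moving one link count from @from_inode to @to_inode, as done below.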
++ ++ Helper function called from hashed_rename() */ ++static int replace_name(struct inode *to_inode, /* inode where @from_coord is ++ * to be re-targeted at */ ++ struct inode *from_dir, /* directory where @from_coord ++ * lives */ ++ struct inode *from_inode, /* inode @from_coord ++ * originally point to */ ++ coord_t *from_coord, /* where directory entry is in ++ * the tree */ ++ lock_handle * from_lh/* lock handle on @from_coord */) ++{ ++ item_plugin *from_item; ++ int result; ++ znode *node; ++ ++ coord_clear_iplug(from_coord); ++ node = from_coord->node; ++ result = zload(node); ++ if (result != 0) ++ return result; ++ from_item = item_plugin_by_coord(from_coord); ++ if (plugin_of_group(item_plugin_by_coord(from_coord), ++ DIR_ENTRY_ITEM_TYPE)) { ++ reiser4_key to_key; ++ ++ build_sd_key(to_inode, &to_key); ++ ++ /* everything is found and prepared to change directory entry ++ at @from_coord to point to @to_inode. ++ ++ @to_inode is just about to get new name, so bump its link ++ counter. ++ ++ */ ++ result = reiser4_add_nlink(to_inode, from_dir, 0); ++ if (result != 0) { ++ /* Don't issue warning: this may be plain -EMLINK */ ++ zrelse(node); ++ return result; ++ } ++ ++ result = ++ from_item->s.dir.update_key(from_coord, &to_key, from_lh); ++ if (result != 0) { ++ reiser4_del_nlink(to_inode, from_dir, 0); ++ zrelse(node); ++ return result; ++ } ++ ++ /* @from_inode just lost its name, he-he. ++ ++ If @from_inode was directory, it contained dotdot pointing ++ to @from_dir. @from_dir i_nlink will be decreased when ++ iput() will be called on @from_inode. ++ ++ If file-system is not ADG (hard-links are ++ supported on directories), iput(from_inode) will not remove ++ @from_inode, and thus above is incorrect, but hard-links on ++ directories are problematic in many other respects. ++ */ ++ result = reiser4_del_nlink(from_inode, from_dir, 0); ++ if (result != 0) { ++ warning("nikita-2330", ++ "Cannot remove link from source: %i. %s", ++ result, possible_leak); ++ } ++ /* Has to return success, because entry is already ++ * modified. */ ++ result = 0; ++ ++ /* NOTE-NIKITA consider calling plugin method in stead of ++ accessing inode fields directly. */ ++ from_dir->i_mtime = CURRENT_TIME; ++ } else { ++ warning("nikita-2326", "Unexpected item type"); ++ result = RETERR(-EIO); ++ } ++ zrelse(node); ++ return result; ++} ++ ++/* add new entry pointing to @inode into @dir at @coord, locked by @lh ++ ++ Helper function used by hashed_rename(). */ ++static int add_name(struct inode *inode, /* inode where @coord is to be ++ * re-targeted at */ ++ struct inode *dir, /* directory where @coord lives */ ++ struct dentry *name, /* new name */ ++ coord_t *coord, /* where directory entry is in the tree ++ */ ++ lock_handle * lh, /* lock handle on @coord */ ++ int is_dir/* true, if @inode is directory */) ++{ ++ int result; ++ reiser4_dir_entry_desc entry; ++ ++ assert("nikita-2333", lh->node == coord->node); ++ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode)); ++ ++ memset(&entry, 0, sizeof entry); ++ entry.obj = inode; ++ /* build key of directory entry description */ ++ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key); ++ ++ /* ext2 does this in different order: first inserts new entry, ++ then increases directory nlink. We don't want do this, ++ because reiser4_add_nlink() calls ->add_link() plugin ++ method that can fail for whatever reason, leaving as with ++ cleanup problems. 
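++ That is why the code below bumps @inode's link count first and rolls
++ it back with reiser4_del_nlink() if ->add_entry() fails.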
++ */ ++ /* @inode is getting new name */ ++ reiser4_add_nlink(inode, dir, 0); ++ /* create @new_name in @new_dir pointing to ++ @old_inode */ ++ result = WITH_COORD(coord, ++ inode_dir_item_plugin(dir)->s.dir.add_entry(dir, ++ coord, ++ lh, ++ name, ++ &entry)); ++ if (result != 0) { ++ int result2; ++ result2 = reiser4_del_nlink(inode, dir, 0); ++ if (result2 != 0) { ++ warning("nikita-2327", ++ "Cannot drop link on %lli %i. %s", ++ (unsigned long long)get_inode_oid(inode), ++ result2, possible_leak); ++ } ++ } else ++ INODE_INC_FIELD(dir, i_size); ++ return result; ++} ++ ++static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory ++ * where @old is ++ * located */ ++ struct dentry *old_name,/* old name */ ++ struct inode *new_dir, /* directory ++ * where @new is ++ * located */ ++ struct dentry *new_name /* new name */) ++{ ++ reiser4_block_nr res1, res2; ++ dir_plugin * p_parent_old, *p_parent_new; ++ file_plugin * p_child_old, *p_child_new; ++ ++ assert("vpf-311", old_dir != NULL); ++ assert("vpf-312", new_dir != NULL); ++ assert("vpf-313", old_name != NULL); ++ assert("vpf-314", new_name != NULL); ++ ++ p_parent_old = inode_dir_plugin(old_dir); ++ p_parent_new = inode_dir_plugin(new_dir); ++ p_child_old = inode_file_plugin(old_name->d_inode); ++ if (new_name->d_inode) ++ p_child_new = inode_file_plugin(new_name->d_inode); ++ else ++ p_child_new = NULL; ++ ++ /* find_entry - can insert one leaf. */ ++ res1 = res2 = 1; ++ ++ /* replace_name */ ++ { ++ /* reiser4_add_nlink(p_child_old) and ++ * reiser4_del_nlink(p_child_old) */ ++ res1 += 2 * p_child_old->estimate.update(old_name->d_inode); ++ /* update key */ ++ res1 += 1; ++ /* reiser4_del_nlink(p_child_new) */ ++ if (p_child_new) ++ res1 += p_child_new->estimate.update(new_name->d_inode); ++ } ++ ++ /* else add_name */ ++ { ++ /* reiser4_add_nlink(p_parent_new) and ++ * reiser4_del_nlink(p_parent_new) */ ++ res2 += ++ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir); ++ /* reiser4_add_nlink(p_parent_old) */ ++ res2 += p_child_old->estimate.update(old_name->d_inode); ++ /* add_entry(p_parent_new) */ ++ res2 += p_parent_new->estimate.add_entry(new_dir); ++ /* reiser4_del_nlink(p_parent_old) */ ++ res2 += p_child_old->estimate.update(old_name->d_inode); ++ } ++ ++ res1 = res1 < res2 ? 
res2 : res1; ++ ++ /* reiser4_write_sd(p_parent_new) */ ++ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); ++ ++ /* reiser4_write_sd(p_child_new) */ ++ if (p_child_new) ++ res1 += p_child_new->estimate.update(new_name->d_inode); ++ ++ /* hashed_rem_entry(p_parent_old) */ ++ res1 += p_parent_old->estimate.rem_entry(old_dir); ++ ++ /* reiser4_del_nlink(p_child_old) */ ++ res1 += p_child_old->estimate.update(old_name->d_inode); ++ ++ /* replace_name */ ++ { ++ /* reiser4_add_nlink(p_parent_dir_new) */ ++ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); ++ /* update_key */ ++ res1 += 1; ++ /* reiser4_del_nlink(p_parent_new) */ ++ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); ++ /* reiser4_del_nlink(p_parent_old) */ ++ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); ++ } ++ ++ /* reiser4_write_sd(p_parent_old) */ ++ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); ++ ++ /* reiser4_write_sd(p_child_old) */ ++ res1 += p_child_old->estimate.update(old_name->d_inode); ++ ++ return res1; ++} ++ ++static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory ++ * where @old ++ * is located ++ */ ++ struct dentry *old_name,/* old name ++ */ ++ struct inode *new_dir, /* directory ++ * where @new ++ * is located ++ */ ++ struct dentry *new_name /* new name ++ */) ++{ ++ reiser4_block_nr reserve; ++ ++ reserve = estimate_rename(old_dir, old_name, new_dir, new_name); ++ ++ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) ++ return RETERR(-ENOSPC); ++ ++ return 0; ++} ++ ++/* check whether @old_inode and @new_inode can be moved within file system ++ * tree. This singles out attempts to rename pseudo-files, for example. */ ++static int can_rename(struct inode *old_dir, struct inode *old_inode, ++ struct inode *new_dir, struct inode *new_inode) ++{ ++ file_plugin *fplug; ++ dir_plugin *dplug; ++ ++ assert("nikita-3370", old_inode != NULL); ++ ++ dplug = inode_dir_plugin(new_dir); ++ fplug = inode_file_plugin(old_inode); ++ ++ if (dplug == NULL) ++ return RETERR(-ENOTDIR); ++ else if (new_dir->i_op->create == NULL) ++ return RETERR(-EPERM); ++ else if (!fplug->can_add_link(old_inode)) ++ return RETERR(-EMLINK); ++ else if (new_inode != NULL) { ++ fplug = inode_file_plugin(new_inode); ++ if (fplug->can_rem_link != NULL && ++ !fplug->can_rem_link(new_inode)) ++ return RETERR(-EBUSY); ++ } ++ return 0; ++} ++ ++int reiser4_find_entry(struct inode *, struct dentry *, lock_handle * , ++ znode_lock_mode, reiser4_dir_entry_desc *); ++int reiser4_update_dir(struct inode *); ++ ++/* this is common implementation of vfs's rename method of struct ++ inode_operations ++ See comments in the body. ++ ++ It is arguable that this function can be made generic so, that it ++ will be applicable to any kind of directory plugin that deals with ++ directories composed out of directory entries. The only obstacle ++ here is that we don't have any data-type to represent directory ++ entry. This should be re-considered when more than one different ++ directory plugin will be implemented. 
++*/ ++int reiser4_rename_common(struct inode *old_dir /* directory where @old ++ * is located */ , ++ struct dentry *old_name /* old name */ , ++ struct inode *new_dir /* directory where @new ++ * is located */ , ++ struct dentry *new_name/* new name */) ++{ ++ /* From `The Open Group Base Specifications Issue 6' ++ ++ If either the old or new argument names a symbolic link, rename() ++ shall operate on the symbolic link itself, and shall not resolve ++ the last component of the argument. If the old argument and the new ++ argument resolve to the same existing file, rename() shall return ++ successfully and perform no other action. ++ ++ [this is done by VFS: vfs_rename()] ++ ++ If the old argument points to the pathname of a file that is not a ++ directory, the new argument shall not point to the pathname of a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the link named by the new argument exists, it shall ++ be removed and old renamed to new. In this case, a link named new ++ shall remain visible to other processes throughout the renaming ++ operation and refer either to the file referred to by new or old ++ before the operation began. ++ ++ [we should assure this] ++ ++ Write access permission is required for ++ both the directory containing old and the directory containing new. ++ ++ [checked by VFS: vfs_rename->may_delete(), may_create()] ++ ++ If the old argument points to the pathname of a directory, the new ++ argument shall not point to the pathname of a file that is not a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the directory named by the new argument exists, it ++ shall be removed and old renamed to new. In this case, a link named ++ new shall exist throughout the renaming operation and shall refer ++ either to the directory referred to by new or old before the ++ operation began. ++ ++ [we should assure this] ++ ++ If new names an existing directory, it shall be ++ required to be an empty directory. ++ ++ [we should check this] ++ ++ If the old argument points to a pathname of a symbolic link, the ++ symbolic link shall be renamed. If the new argument points to a ++ pathname of a symbolic link, the symbolic link shall be removed. ++ ++ The new pathname shall not contain a path prefix that names ++ old. Write access permission is required for the directory ++ containing old and the directory containing new. If the old ++ argument points to the pathname of a directory, write access ++ permission may be required for the directory named by old, and, if ++ it exists, the directory named by new. ++ ++ [checked by VFS: vfs_rename(), vfs_rename_dir()] ++ ++ If the link named by the new argument exists and the file's link ++ count becomes 0 when it is removed and no process has the file ++ open, the space occupied by the file shall be freed and the file ++ shall no longer be accessible. If one or more processes have the ++ file open when the last link is removed, the link shall be removed ++ before rename() returns, but the removal of the file contents shall ++ be postponed until all references to the file are closed. ++ ++ [iput() handles this, but we can do this manually, a la ++ reiser4_unlink()] ++ ++ Upon successful completion, rename() shall mark for update the ++ st_ctime and st_mtime fields of the parent directory of each file. 
++ ++ [N/A] ++ ++ */ ++ reiser4_context *ctx; ++ int result; ++ int is_dir; /* is @old_name directory */ ++ ++ struct inode *old_inode; ++ struct inode *new_inode; ++ coord_t *new_coord; ++ ++ struct reiser4_dentry_fsdata *new_fsdata; ++ dir_plugin *dplug; ++ file_plugin *fplug; ++ ++ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry; ++ lock_handle * new_lh, *dotdot_lh; ++ struct dentry *dotdot_name; ++ struct reiser4_dentry_fsdata *dataonstack; ++ ++ ctx = reiser4_init_context(old_dir->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) + ++ sizeof(*dotdot_name) + sizeof(*dataonstack), ++ reiser4_ctx_gfp_mask_get()); ++ if (!old_entry) { ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOMEM); ++ } ++ ++ new_entry = old_entry + 1; ++ dotdot_entry = old_entry + 2; ++ new_lh = (lock_handle *)(old_entry + 3); ++ dotdot_lh = new_lh + 1; ++ dotdot_name = (struct dentry *)(new_lh + 2); ++ dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1); ++ ++ assert("nikita-2318", old_dir != NULL); ++ assert("nikita-2319", new_dir != NULL); ++ assert("nikita-2320", old_name != NULL); ++ assert("nikita-2321", new_name != NULL); ++ ++ old_inode = old_name->d_inode; ++ new_inode = new_name->d_inode; ++ ++ dplug = inode_dir_plugin(old_dir); ++ fplug = NULL; ++ ++ new_fsdata = reiser4_get_dentry_fsdata(new_name); ++ if (IS_ERR(new_fsdata)) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return PTR_ERR(new_fsdata); ++ } ++ ++ new_coord = &new_fsdata->dec.entry_coord; ++ coord_clear_iplug(new_coord); ++ ++ is_dir = S_ISDIR(old_inode->i_mode); ++ ++ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); ++ ++ /* if target is existing directory and it's not empty---return error. ++ ++ This check is done specifically, because is_dir_empty() requires ++ tree traversal and have to be done before locks are taken. ++ */ ++ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return RETERR(-ENOTEMPTY); ++ } ++ ++ result = can_rename(old_dir, old_inode, new_dir, new_inode); ++ if (result != 0) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ result = hashed_rename_estimate_and_grab(old_dir, old_name, ++ new_dir, new_name); ++ if (result != 0) { ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ init_lh(new_lh); ++ ++ /* find entry for @new_name */ ++ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK, ++ new_entry); ++ ++ if (IS_CBKERR(result)) { ++ done_lh(new_lh); ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++ } ++ ++ reiser4_seal_done(&new_fsdata->dec.entry_seal); ++ ++ /* add or replace name for @old_inode as @new_name */ ++ if (new_inode != NULL) { ++ /* target (@new_name) exists. */ ++ /* Not clear what to do with objects that are ++ both directories and files at the same time. */ ++ if (result == CBK_COORD_FOUND) { ++ result = replace_name(old_inode, ++ new_dir, ++ new_inode, new_coord, new_lh); ++ if (result == 0) ++ fplug = inode_file_plugin(new_inode); ++ } else if (result == CBK_COORD_NOTFOUND) { ++ /* VFS told us that @new_name is bound to existing ++ inode, but we failed to find directory entry. 
*/
++ warning("nikita-2324", "Target not found");
++ result = RETERR(-ENOENT);
++ }
++ } else {
++ /* target (@new_name) doesn't exist. */
++ if (result == CBK_COORD_NOTFOUND)
++ result = add_name(old_inode,
++ new_dir,
++ new_name, new_coord, new_lh, is_dir);
++ else if (result == CBK_COORD_FOUND) {
++ /* VFS told us that @new_name is "negative" dentry,
++ but we found directory entry. */
++ warning("nikita-2331", "Target found unexpectedly");
++ result = RETERR(-EIO);
++ }
++ }
++
++ assert("nikita-3462", ergo(result == 0,
++ old_inode->i_nlink >= 2 + !!is_dir));
++
++ /* We are done with all modifications to the @new_dir, release lock on
++ node. */
++ done_lh(new_lh);
++
++ if (fplug != NULL) {
++ /* detach @new_inode from name-space */
++ result = fplug->detach(new_inode, new_dir);
++ if (result != 0)
++ warning("nikita-2330", "Cannot detach %lli: %i. %s",
++ (unsigned long long)get_inode_oid(new_inode),
++ result, possible_leak);
++ }
++
++ if (new_inode != NULL)
++ reiser4_update_sd(new_inode);
++
++ if (result == 0) {
++ old_entry->obj = old_inode;
++
++ dplug->build_entry_key(old_dir,
++ &old_name->d_name, &old_entry->key);
++
++ /* At this stage new name was introduced for
++ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
++ counters were updated.
++
++ We want to remove @old_name now. If @old_inode wasn't
++ directory this is simple.
++ */
++ result = dplug->rem_entry(old_dir, old_name, old_entry);
++ if (result != 0 && result != -ENOMEM) {
++ warning("nikita-2335",
++ "Cannot remove old name: %i", result);
++ } else {
++ result = reiser4_del_nlink(old_inode, old_dir, 0);
++ if (result != 0 && result != -ENOMEM) {
++ warning("nikita-2337",
++ "Cannot drop link on old: %i", result);
++ }
++ }
++
++ if (result == 0 && is_dir) {
++ /* @old_inode is directory. We also have to update
++ dotdot entry. */
++ coord_t *dotdot_coord;
++
++ memset(dataonstack, 0, sizeof *dataonstack);
++ memset(dotdot_entry, 0, sizeof *dotdot_entry);
++ dotdot_entry->obj = old_dir;
++ memset(dotdot_name, 0, sizeof *dotdot_name);
++ dotdot_name->d_name.name = "..";
++ dotdot_name->d_name.len = 2;
++ /*
++ * the scratch ->d_fsdata comes from the kzalloc()ed
++ * block above rather than through
++ * reiser4_get_dentry_fsdata(). Locking is not needed,
++ * because the dentry is private to the current thread.
++ */ ++ dotdot_name->d_fsdata = dataonstack; ++ init_lh(dotdot_lh); ++ ++ dotdot_coord = &dataonstack->dec.entry_coord; ++ coord_clear_iplug(dotdot_coord); ++ ++ result = reiser4_find_entry(old_inode, dotdot_name, ++ dotdot_lh, ZNODE_WRITE_LOCK, ++ dotdot_entry); ++ if (result == 0) { ++ /* replace_name() decreases i_nlink on ++ * @old_dir */ ++ result = replace_name(new_dir, ++ old_inode, ++ old_dir, ++ dotdot_coord, dotdot_lh); ++ } else ++ result = RETERR(-EIO); ++ done_lh(dotdot_lh); ++ } ++ } ++ reiser4_update_dir(new_dir); ++ reiser4_update_dir(old_dir); ++ reiser4_update_sd(old_inode); ++ if (result == 0) { ++ file_plugin *fplug; ++ ++ if (new_inode != NULL) { ++ /* add safe-link for target file (in case we removed ++ * last reference to the poor fellow */ ++ fplug = inode_file_plugin(new_inode); ++ if (new_inode->i_nlink == 0) ++ result = safe_link_add(new_inode, SAFE_UNLINK); ++ } ++ } ++ kfree(old_entry); ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++ ++#if 0 ++int reiser4_rename_common(struct inode *old_dir /* directory where @old ++ * is located */ , ++ struct dentry *old_name /* old name */ , ++ struct inode *new_dir /* directory where @new ++ * is located */ , ++ struct dentry *new_name/* new name */) ++{ ++ /* From `The Open Group Base Specifications Issue 6' ++ ++ If either the old or new argument names a symbolic link, rename() ++ shall operate on the symbolic link itself, and shall not resolve ++ the last component of the argument. If the old argument and the new ++ argument resolve to the same existing file, rename() shall return ++ successfully and perform no other action. ++ ++ [this is done by VFS: vfs_rename()] ++ ++ If the old argument points to the pathname of a file that is not a ++ directory, the new argument shall not point to the pathname of a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the link named by the new argument exists, it shall ++ be removed and old renamed to new. In this case, a link named new ++ shall remain visible to other processes throughout the renaming ++ operation and refer either to the file referred to by new or old ++ before the operation began. ++ ++ [we should assure this] ++ ++ Write access permission is required for ++ both the directory containing old and the directory containing new. ++ ++ [checked by VFS: vfs_rename->may_delete(), may_create()] ++ ++ If the old argument points to the pathname of a directory, the new ++ argument shall not point to the pathname of a file that is not a ++ directory. ++ ++ [checked by VFS: vfs_rename->may_delete()] ++ ++ If the directory named by the new argument exists, it ++ shall be removed and old renamed to new. In this case, a link named ++ new shall exist throughout the renaming operation and shall refer ++ either to the directory referred to by new or old before the ++ operation began. ++ ++ [we should assure this] ++ ++ If new names an existing directory, it shall be ++ required to be an empty directory. ++ ++ [we should check this] ++ ++ If the old argument points to a pathname of a symbolic link, the ++ symbolic link shall be renamed. If the new argument points to a ++ pathname of a symbolic link, the symbolic link shall be removed. ++ ++ The new pathname shall not contain a path prefix that names ++ old. Write access permission is required for the directory ++ containing old and the directory containing new. 
If the old ++ argument points to the pathname of a directory, write access ++ permission may be required for the directory named by old, and, if ++ it exists, the directory named by new. ++ ++ [checked by VFS: vfs_rename(), vfs_rename_dir()] ++ ++ If the link named by the new argument exists and the file's link ++ count becomes 0 when it is removed and no process has the file ++ open, the space occupied by the file shall be freed and the file ++ shall no longer be accessible. If one or more processes have the ++ file open when the last link is removed, the link shall be removed ++ before rename() returns, but the removal of the file contents shall ++ be postponed until all references to the file are closed. ++ ++ [iput() handles this, but we can do this manually, a la ++ reiser4_unlink()] ++ ++ Upon successful completion, rename() shall mark for update the ++ st_ctime and st_mtime fields of the parent directory of each file. ++ ++ [N/A] ++ ++ */ ++ reiser4_context *ctx; ++ int result; ++ int is_dir; /* is @old_name directory */ ++ struct inode *old_inode; ++ struct inode *new_inode; ++ reiser4_dir_entry_desc old_entry; ++ reiser4_dir_entry_desc new_entry; ++ coord_t *new_coord; ++ struct reiser4_dentry_fsdata *new_fsdata; ++ lock_handle new_lh; ++ dir_plugin *dplug; ++ file_plugin *fplug; ++ ++ ctx = reiser4_init_context(old_dir->i_sb); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ assert("nikita-2318", old_dir != NULL); ++ assert("nikita-2319", new_dir != NULL); ++ assert("nikita-2320", old_name != NULL); ++ assert("nikita-2321", new_name != NULL); ++ ++ old_inode = old_name->d_inode; ++ new_inode = new_name->d_inode; ++ ++ dplug = inode_dir_plugin(old_dir); ++ fplug = NULL; ++ ++ new_fsdata = reiser4_get_dentry_fsdata(new_name); ++ if (IS_ERR(new_fsdata)) { ++ result = PTR_ERR(new_fsdata); ++ goto exit; ++ } ++ ++ new_coord = &new_fsdata->dec.entry_coord; ++ coord_clear_iplug(new_coord); ++ ++ is_dir = S_ISDIR(old_inode->i_mode); ++ ++ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); ++ ++ /* if target is existing directory and it's not empty---return error. ++ ++ This check is done specifically, because is_dir_empty() requires ++ tree traversal and have to be done before locks are taken. ++ */ ++ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) ++ return RETERR(-ENOTEMPTY); ++ ++ result = can_rename(old_dir, old_inode, new_dir, new_inode); ++ if (result != 0) ++ goto exit; ++ ++ result = hashed_rename_estimate_and_grab(old_dir, old_name, ++ new_dir, new_name); ++ if (result != 0) ++ goto exit; ++ ++ init_lh(&new_lh); ++ ++ /* find entry for @new_name */ ++ result = reiser4_find_entry(new_dir, new_name, &new_lh, ++ ZNODE_WRITE_LOCK, &new_entry); ++ ++ if (IS_CBKERR(result)) { ++ done_lh(&new_lh); ++ goto exit; ++ } ++ ++ reiser4_seal_done(&new_fsdata->dec.entry_seal); ++ ++ /* add or replace name for @old_inode as @new_name */ ++ if (new_inode != NULL) { ++ /* target (@new_name) exists. */ ++ /* Not clear what to do with objects that are ++ both directories and files at the same time. */ ++ if (result == CBK_COORD_FOUND) { ++ result = replace_name(old_inode, ++ new_dir, ++ new_inode, new_coord, &new_lh); ++ if (result == 0) ++ fplug = inode_file_plugin(new_inode); ++ } else if (result == CBK_COORD_NOTFOUND) { ++ /* VFS told us that @new_name is bound to existing ++ inode, but we failed to find directory entry. */ ++ warning("nikita-2324", "Target not found"); ++ result = RETERR(-ENOENT); ++ } ++ } else { ++ /* target (@new_name) doesn't exists. 
*/ ++ if (result == CBK_COORD_NOTFOUND) ++ result = add_name(old_inode, ++ new_dir, ++ new_name, new_coord, &new_lh, is_dir); ++ else if (result == CBK_COORD_FOUND) { ++ /* VFS told us that @new_name is "negative" dentry, ++ but we found directory entry. */ ++ warning("nikita-2331", "Target found unexpectedly"); ++ result = RETERR(-EIO); ++ } ++ } ++ ++ assert("nikita-3462", ergo(result == 0, ++ old_inode->i_nlink >= 2 + !!is_dir)); ++ ++ /* We are done with all modifications to the @new_dir, release lock on ++ node. */ ++ done_lh(&new_lh); ++ ++ if (fplug != NULL) { ++ /* detach @new_inode from name-space */ ++ result = fplug->detach(new_inode, new_dir); ++ if (result != 0) ++ warning("nikita-2330", "Cannot detach %lli: %i. %s", ++ (unsigned long long)get_inode_oid(new_inode), ++ result, possible_leak); ++ } ++ ++ if (new_inode != NULL) ++ reiser4_update_sd(new_inode); ++ ++ if (result == 0) { ++ memset(&old_entry, 0, sizeof old_entry); ++ old_entry.obj = old_inode; ++ ++ dplug->build_entry_key(old_dir, ++ &old_name->d_name, &old_entry.key); ++ ++ /* At this stage new name was introduced for ++ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink ++ counters were updated. ++ ++ We want to remove @old_name now. If @old_inode wasn't ++ directory this is simple. ++ */ ++ result = dplug->rem_entry(old_dir, old_name, &old_entry); ++ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */ ++ if (result != 0 && result != -ENOMEM) { ++ warning("nikita-2335", ++ "Cannot remove old name: %i", result); ++ } else { ++ result = reiser4_del_nlink(old_inode, old_dir, 0); ++ if (result != 0 && result != -ENOMEM) { ++ warning("nikita-2337", ++ "Cannot drop link on old: %i", result); ++ } ++ } ++ ++ if (result == 0 && is_dir) { ++ /* @old_inode is directory. We also have to update ++ dotdot entry. */ ++ coord_t *dotdot_coord; ++ lock_handle dotdot_lh; ++ struct dentry dotdot_name; ++ reiser4_dir_entry_desc dotdot_entry; ++ struct reiser4_dentry_fsdata dataonstack; ++ struct reiser4_dentry_fsdata *fsdata; ++ ++ memset(&dataonstack, 0, sizeof dataonstack); ++ memset(&dotdot_entry, 0, sizeof dotdot_entry); ++ dotdot_entry.obj = old_dir; ++ memset(&dotdot_name, 0, sizeof dotdot_name); ++ dotdot_name.d_name.name = ".."; ++ dotdot_name.d_name.len = 2; ++ /* ++ * allocate ->d_fsdata on the stack to avoid using ++ * reiser4_get_dentry_fsdata(). Locking is not needed, ++ * because dentry is private to the current thread. 
++ */ ++ dotdot_name.d_fsdata = &dataonstack; ++ init_lh(&dotdot_lh); ++ ++ fsdata = &dataonstack; ++ dotdot_coord = &fsdata->dec.entry_coord; ++ coord_clear_iplug(dotdot_coord); ++ ++ result = reiser4_find_entry(old_inode, ++ &dotdot_name, ++ &dotdot_lh, ++ ZNODE_WRITE_LOCK, ++ &dotdot_entry); ++ if (result == 0) { ++ /* replace_name() decreases i_nlink on ++ * @old_dir */ ++ result = replace_name(new_dir, ++ old_inode, ++ old_dir, ++ dotdot_coord, &dotdot_lh); ++ } else ++ result = RETERR(-EIO); ++ done_lh(&dotdot_lh); ++ } ++ } ++ reiser4_update_dir(new_dir); ++ reiser4_update_dir(old_dir); ++ reiser4_update_sd(old_inode); ++ if (result == 0) { ++ file_plugin *fplug; ++ ++ if (new_inode != NULL) { ++ /* add safe-link for target file (in case we removed ++ * last reference to the poor fellow */ ++ fplug = inode_file_plugin(new_inode); ++ if (new_inode->i_nlink == 0) ++ result = safe_link_add(new_inode, SAFE_UNLINK); ++ } ++ } ++exit: ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ return result; ++} ++#endif +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/acl.h linux-2.6.33/fs/reiser4/plugin/item/acl.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/acl.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,66 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Directory entry. */ ++ ++#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) ++#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../key.h" ++ ++#include <linux/fs.h> ++#include <linux/dcache.h> /* for struct dentry */ ++ ++typedef struct directory_entry_format { ++ /* key of object stat-data. It's not necessary to store whole ++ key here, because it's always key of stat-data, so minor ++ packing locality and offset can be omitted here. But this ++ relies on particular key allocation scheme for stat-data, so, ++ for extensibility sake, whole key can be stored here. ++ ++ We store key as array of bytes, because we don't want 8-byte ++ alignment of dir entries. ++ */ ++ obj_key_id id; ++ /* file name. Null terminated string. */ ++ d8 name[0]; ++} directory_entry_format; ++ ++void print_de(const char *prefix, coord_t * coord); ++int extract_key_de(const coord_t * coord, reiser4_key * key); ++int update_key_de(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh); ++char *extract_name_de(const coord_t * coord, char *buf); ++unsigned extract_file_type_de(const coord_t * coord); ++int add_entry_de(struct inode *dir, coord_t * coord, ++ lock_handle * lh, const struct dentry *name, ++ reiser4_dir_entry_desc * entry); ++int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, ++ lock_handle * lh, reiser4_dir_entry_desc * entry); ++int max_name_len_de(const struct inode *dir); ++ ++int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); ++ ++char *extract_dent_name(const coord_t * coord, ++ directory_entry_format * dent, char *buf); ++ ++#if REISER4_LARGE_KEY ++#define DE_NAME_BUF_LEN (24) ++#else ++#define DE_NAME_BUF_LEN (16) ++#endif ++ ++/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.33/fs/reiser4/plugin/item/blackbox.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/blackbox.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,142 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Black box item implementation */ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../coord.h" ++#include "../../tree.h" ++#include "../../lock.h" ++ ++#include "blackbox.h" ++#include "item.h" ++#include "../plugin.h" ++ ++int ++store_black_box(reiser4_tree * tree, ++ const reiser4_key * key, void *data, int length) ++{ ++ int result; ++ reiser4_item_data idata; ++ coord_t coord; ++ lock_handle lh; ++ ++ memset(&idata, 0, sizeof idata); ++ ++ idata.data = data; ++ idata.user = 0; ++ idata.length = length; ++ idata.iplug = item_plugin_by_id(BLACK_BOX_ID); ++ ++ init_lh(&lh); ++ result = insert_by_key(tree, key, ++ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE); ++ ++ assert("nikita-3413", ++ ergo(result == 0, ++ WITH_COORD(&coord, ++ item_length_by_coord(&coord) == length))); ++ ++ done_lh(&lh); ++ return result; ++} ++ ++int ++load_black_box(reiser4_tree * tree, ++ reiser4_key * key, void *data, int length, int exact) ++{ ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ ++ init_lh(&lh); ++ result = coord_by_key(tree, key, ++ &coord, &lh, ZNODE_READ_LOCK, ++ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN, ++ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); ++ ++ if (result == 0) { ++ int ilen; ++ ++ result = zload(coord.node); ++ if (result == 0) { ++ ilen = item_length_by_coord(&coord); ++ if (ilen <= length) { ++ memcpy(data, item_body_by_coord(&coord), ilen); ++ unit_key_by_coord(&coord, key); ++ } else if (exact) { ++ /* ++ * item is larger than buffer provided by the ++ * user. Only issue a warning if @exact is ++ * set. If @exact is false, we are iterating ++ * over all safe-links and here we are reaching ++ * the end of the iteration. ++ */ ++ warning("nikita-3415", ++ "Wrong black box length: %i > %i", ++ ilen, length); ++ result = RETERR(-EIO); ++ } ++ zrelse(coord.node); ++ } ++ } ++ ++ done_lh(&lh); ++ return result; ++ ++} ++ ++int ++update_black_box(reiser4_tree * tree, ++ const reiser4_key * key, void *data, int length) ++{ ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ ++ init_lh(&lh); ++ result = coord_by_key(tree, key, ++ &coord, &lh, ZNODE_READ_LOCK, ++ FIND_EXACT, ++ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); ++ if (result == 0) { ++ int ilen; ++ ++ result = zload(coord.node); ++ if (result == 0) { ++ ilen = item_length_by_coord(&coord); ++ if (length <= ilen) { ++ memcpy(item_body_by_coord(&coord), data, ++ length); ++ } else { ++ warning("nikita-3437", ++ "Wrong black box length: %i < %i", ++ ilen, length); ++ result = RETERR(-EIO); ++ } ++ zrelse(coord.node); ++ } ++ } ++ ++ done_lh(&lh); ++ return result; ++ ++} ++ ++int kill_black_box(reiser4_tree * tree, const reiser4_key * key) ++{ ++ return reiser4_cut_tree(tree, key, key, NULL, 1); ++} ++ ++/* Make Linus happy. 
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 120
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.33/fs/reiser4/plugin/item/blackbox.h
+--- linux-2.6.33.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/item/blackbox.h 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,33 @@
++/* Copyright 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* "Black box" entry: a fixed-width item that contains user supplied data */
++
++#if !defined( __FS_REISER4_BLACK_BOX_H__ )
++#define __FS_REISER4_BLACK_BOX_H__
++
++#include "../../forward.h"
++#include "../../dformat.h"
++#include "../../kassign.h"
++#include "../../key.h"
++
++extern int store_black_box(reiser4_tree * tree,
++ const reiser4_key * key, void *data, int length);
++extern int load_black_box(reiser4_tree * tree,
++ reiser4_key * key, void *data, int length, int exact);
++extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
++extern int update_black_box(reiser4_tree * tree,
++ const reiser4_key * key, void *data, int length);
++
++/* __FS_REISER4_BLACK_BOX_H__ */
++#endif
++
++/* Make Linus happy.
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 120
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/cde.c linux-2.6.33/fs/reiser4/plugin/item/cde.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/item/cde.c 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,1008 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* Directory entry implementation */
++
++/* DESCRIPTION:
++
++ This is "compound" directory item plugin implementation. This directory
++ item type is compound (as opposed to the "simple directory item" in
++ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
++ entries.
++
++ The reason behind this decision is disk space efficiency: all directory
++ entries inside the same directory have identical fragment in their
++ keys. This, of course, depends on key assignment policy. In our default key
++ assignment policy, all directory entries have the same locality which is
++ equal to the object id of their directory.
++
++ Composing directory item out of several directory entries for the same
++ directory allows us to store said key fragment only once. That is, this is
++ some ad hoc form of key compression (stem compression) that is implemented
++ here, because general key compression is not supposed to be implemented in
++ v4.0.
++
++ Another decision that was made regarding all directory item plugins is
++ that they will store entry keys unaligned. This is for the sake of disk
++ space efficiency again.
++
++ It should be noted that storing keys unaligned increases CPU consumption,
++ at least on some architectures.
++
++ Internal on-disk structure of the compound directory item is the following:
++
++ HEADER cde_item_format. Here number of entries is stored.
++ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
++ ENTRY_HEADER_1 offset of entry body are stored.
++ ENTRY_HEADER_2 (basically two last parts of key)
++ ...
++ ENTRY_HEADER_N
++ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
++ ENTRY_BODY_1 NUL-terminated name are stored.
++ ENTRY_BODY_2 (part of the stat-data key, in the
++ sense that since all SDs have
++ zero offset, the offset is not
++ stored on disk).
++ ...
++ ENTRY_BODY_N
++
++ When it comes to the balancing, each directory entry in a compound
++ directory item is a unit, that is, something that can be cut from one item
++ and pasted into another item of the same type. Handling of unit cut and
++ paste is the major reason for the complexity of the code below.
++
++*/
++
++#include "../../forward.h"
++#include "../../debug.h"
++#include "../../dformat.h"
++#include "../../kassign.h"
++#include "../../key.h"
++#include "../../coord.h"
++#include "sde.h"
++#include "cde.h"
++#include "item.h"
++#include "../node/node.h"
++#include "../plugin.h"
++#include "../../znode.h"
++#include "../../carry.h"
++#include "../../tree.h"
++#include "../../inode.h"
++
++#include <linux/fs.h> /* for struct inode */
++#include <linux/dcache.h> /* for struct dentry */
++#include <linux/quotaops.h>
++
++#if 0
++#define CHECKME(coord) \
++({ \
++ const char *message; \
++ coord_t dup; \
++ \
++ coord_dup_nocheck(&dup, (coord)); \
++ dup.unit_pos = 0; \
++ assert("nikita-2871", cde_check(&dup, &message) == 0); \
++})
++#else
++#define CHECKME(coord) noop
++#endif
++
++/* return body of compound directory item at @coord */
++static inline cde_item_format *formatted_at(const coord_t * coord)
++{
++ assert("nikita-1282", coord != NULL);
++ return item_body_by_coord(coord);
++}
++
++/* return entry header at @coord */
++static inline cde_unit_header *header_at(const coord_t *
++ coord /* coord of item */ ,
++ int idx /* index of unit */ )
++{
++ assert("nikita-1283", coord != NULL);
++ return &formatted_at(coord)->entry[idx];
++}
++
++/* return number of units in compound directory item at @coord */
++static int units(const coord_t * coord /* coord of item */ )
++{
++ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
++}
++
++/* return offset of the body of @idx-th entry in @coord */
++static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
++ int idx /* index of unit */ )
++{
++ if (idx < units(coord))
++ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
++ else if (idx == units(coord))
++ return item_length_by_coord(coord);
++ else
++ impossible("nikita-1308", "Wrong idx");
++ return 0;
++}
++
++/* set offset of the body of @idx-th entry in @coord */
++static void set_offset(const coord_t * coord /* coord of item */ ,
++ int idx /* index of unit */ ,
++ unsigned int offset /* new offset */ )
++{
++ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
++}
++
++static void adj_offset(const coord_t * coord /* coord of item */ ,
++ int idx /* index of unit */ ,
++ int delta /* offset change */ )
++{
++ d16 *doffset;
++ __u16 offset;
++
++ doffset = &header_at(coord, idx)->offset;
++ offset = le16_to_cpu(get_unaligned(doffset));
++ offset += delta;
++ put_unaligned(cpu_to_le16((__u16) offset), doffset);
++}
++
++/* return pointer to @offset-th byte from the beginning of @coord */
++static char *address(const coord_t * coord /* coord of item */ ,
++ int offset)
++{
++ return ((char *)item_body_by_coord(coord)) + offset;
++}
++
++/* return pointer to the body of @idx-th entry in @coord */
++static directory_entry_format *entry_at(const coord_t * coord /* coord of
++ * item */ ,
++ int idx /* index of unit */ )
++{
++ return (directory_entry_format *) address(coord,
++ (int)offset_of(coord, idx));
++}
++
++/* return the index of the unit referenced by @coord */
++static int
idx_of(const coord_t * coord /* coord of item */ ) ++{ ++ assert("nikita-1285", coord != NULL); ++ return coord->unit_pos; ++} ++ ++/* find position where entry with @entry_key would be inserted into @coord */ ++static int find(const coord_t * coord /* coord of item */ , ++ const reiser4_key * entry_key /* key to look for */ , ++ cmp_t * last /* result of last comparison */ ) ++{ ++ int entries; ++ ++ int left; ++ int right; ++ ++ cde_unit_header *header; ++ ++ assert("nikita-1295", coord != NULL); ++ assert("nikita-1296", entry_key != NULL); ++ assert("nikita-1297", last != NULL); ++ ++ entries = units(coord); ++ left = 0; ++ right = entries - 1; ++ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { ++ int median; ++ ++ median = (left + right) >> 1; ++ ++ header = header_at(coord, median); ++ *last = de_id_key_cmp(&header->hash, entry_key); ++ switch (*last) { ++ case LESS_THAN: ++ left = median; ++ break; ++ case GREATER_THAN: ++ right = median; ++ break; ++ case EQUAL_TO:{ ++ do { ++ median--; ++ header--; ++ } while (median >= 0 && ++ de_id_key_cmp(&header->hash, ++ entry_key) == EQUAL_TO); ++ return median + 1; ++ } ++ } ++ } ++ header = header_at(coord, left); ++ for (; left < entries; ++left, ++header) { ++ prefetch(header + 1); ++ *last = de_id_key_cmp(&header->hash, entry_key); ++ if (*last != LESS_THAN) ++ break; ++ } ++ if (left < entries) ++ return left; ++ else ++ return RETERR(-ENOENT); ++ ++} ++ ++/* expand @coord as to accommodate for insertion of @no new entries starting ++ from @pos, with total bodies size @size. */ ++static int expand_item(const coord_t * coord /* coord of item */ , ++ int pos /* unit position */ , int no /* number of new ++ * units*/ , ++ int size /* total size of new units' data */ , ++ unsigned int data_size /* free space already reserved ++ * in the item for insertion */ ) ++{ ++ int entries; ++ cde_unit_header *header; ++ char *dent; ++ int i; ++ ++ assert("nikita-1310", coord != NULL); ++ assert("nikita-1311", pos >= 0); ++ assert("nikita-1312", no > 0); ++ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format)); ++ assert("nikita-1343", ++ item_length_by_coord(coord) >= ++ (int)(size + data_size + no * sizeof *header)); ++ ++ entries = units(coord); ++ ++ if (pos == entries) ++ dent = address(coord, size); ++ else ++ dent = (char *)entry_at(coord, pos); ++ /* place where new header will be in */ ++ header = header_at(coord, pos); ++ /* free space for new entry headers */ ++ memmove(header + no, header, ++ (unsigned)(address(coord, size) - (char *)header)); ++ /* if adding to the end initialise first new header */ ++ if (pos == entries) { ++ set_offset(coord, pos, (unsigned)size); ++ } ++ ++ /* adjust entry pointer and size */ ++ dent = dent + no * sizeof *header; ++ size += no * sizeof *header; ++ /* free space for new entries */ ++ memmove(dent + data_size, dent, ++ (unsigned)(address(coord, size) - dent)); ++ ++ /* increase counter */ ++ entries += no; ++ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries); ++ ++ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header ) ++ bytes. */ ++ for (i = 0; i <= pos; ++i) ++ adj_offset(coord, i, no * sizeof *header); ++ /* [ pos + no ... 
+\infty ) entries were shifted by ( no * ++ sizeof *header + data_size ) bytes */ ++ for (i = pos + no; i < entries; ++i) ++ adj_offset(coord, i, no * sizeof *header + data_size); ++ return 0; ++} ++ ++/* insert new @entry into item */ ++static int expand(const coord_t * coord /* coord of item */ , ++ struct cde_entry * entry /* entry to insert */ , ++ int len /* length of @entry data */ , ++ int *pos /* position to insert */ , ++ reiser4_dir_entry_desc * dir_entry /* parameters for new ++ * entry */ ) ++{ ++ cmp_t cmp_res; ++ int datasize; ++ ++ *pos = find(coord, &dir_entry->key, &cmp_res); ++ if (*pos < 0) ++ *pos = units(coord); ++ ++ datasize = sizeof(directory_entry_format); ++ if (is_longname(entry->name->name, entry->name->len)) ++ datasize += entry->name->len + 1; ++ ++ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len, ++ datasize); ++ return 0; ++} ++ ++/* paste body of @entry into item */ ++static int paste_entry(const coord_t * coord /* coord of item */ , ++ struct cde_entry * entry /* new entry */ , ++ int pos /* position to insert */ , ++ reiser4_dir_entry_desc * dir_entry /* parameters for ++ * new entry */ ) ++{ ++ cde_unit_header *header; ++ directory_entry_format *dent; ++ const char *name; ++ int len; ++ ++ header = header_at(coord, pos); ++ dent = entry_at(coord, pos); ++ ++ build_de_id_by_key(&dir_entry->key, &header->hash); ++ build_inode_key_id(entry->obj, &dent->id); ++ /* AUDIT unsafe strcpy() operation! It should be replaced with ++ much less CPU hungry ++ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len ); ++ ++ Also a more major thing is that there should be a way to figure out ++ amount of space in dent -> name and be able to check that we are ++ not going to overwrite more than we supposed to */ ++ name = entry->name->name; ++ len = entry->name->len; ++ if (is_longname(name, len)) { ++ strcpy((unsigned char *)dent->name, name); ++ put_unaligned(0, &dent->name[len]); ++ } ++ return 0; ++} ++ ++/* estimate how much space is necessary in item to insert/paste set of entries ++ described in @data. */ ++int estimate_cde(const coord_t * coord /* coord of item */ , ++ const reiser4_item_data * data /* parameters for new item */ ) ++{ ++ struct cde_entry_data *e; ++ int result; ++ int i; ++ ++ e = (struct cde_entry_data *) data->data; ++ ++ assert("nikita-1288", e != NULL); ++ assert("nikita-1289", e->num_of_entries >= 0); ++ ++ if (coord == NULL) ++ /* insert */ ++ result = sizeof(cde_item_format); ++ else ++ /* paste */ ++ result = 0; ++ ++ result += e->num_of_entries * ++ (sizeof(cde_unit_header) + sizeof(directory_entry_format)); ++ for (i = 0; i < e->num_of_entries; ++i) { ++ const char *name; ++ int len; ++ ++ name = e->entry[i].name->name; ++ len = e->entry[i].name->len; ++ assert("nikita-2054", strlen(name) == len); ++ if (is_longname(name, len)) ++ result += len + 1; ++ } ++ ((reiser4_item_data *) data)->length = result; ++ return result; ++} ++ ++/* ->nr_units() method for this item plugin. */ ++pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ ) ++{ ++ return units(coord); ++} ++ ++/* ->unit_key() method for this item plugin. 
*/ ++reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ , ++ reiser4_key * key /* resulting key */ ) ++{ ++ assert("nikita-1452", coord != NULL); ++ assert("nikita-1345", idx_of(coord) < units(coord)); ++ assert("nikita-1346", key != NULL); ++ ++ item_key_by_coord(coord, key); ++ extract_key_from_de_id(extract_dir_id_from_key(key), ++ &header_at(coord, idx_of(coord))->hash, key); ++ return key; ++} ++ ++/* mergeable_cde(): implementation of ->mergeable() item method. ++ ++ Two directory items are mergeable iff they are from the same ++ directory. That simple. ++ ++*/ ++int mergeable_cde(const coord_t * p1 /* coord of first item */ , ++ const coord_t * p2 /* coord of second item */ ) ++{ ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ assert("nikita-1339", p1 != NULL); ++ assert("nikita-1340", p2 != NULL); ++ ++ return ++ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) && ++ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) == ++ extract_dir_id_from_key(item_key_by_coord(p2, &k2))); ++ ++} ++ ++/* ->max_key_inside() method for this item plugin. */ ++reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ , ++ reiser4_key * result /* resulting key */ ) ++{ ++ assert("nikita-1342", coord != NULL); ++ ++ item_key_by_coord(coord, result); ++ set_key_ordering(result, get_key_ordering(reiser4_max_key())); ++ set_key_fulloid(result, get_key_fulloid(reiser4_max_key())); ++ set_key_offset(result, get_key_offset(reiser4_max_key())); ++ return result; ++} ++ ++/* @data contains data which are to be put into tree */ ++int can_contain_key_cde(const coord_t * coord /* coord of item */ , ++ const reiser4_key * key /* key to check */ , ++ const reiser4_item_data * data /* parameters of new ++ * item/unit being ++ * created */ ) ++{ ++ reiser4_key item_key; ++ ++ /* FIXME-VS: do not rely on anything but iplug field of @data. 
Only ++ data->iplug is initialized */ ++ assert("vs-457", data && data->iplug); ++/* assert( "vs-553", data -> user == 0 );*/ ++ item_key_by_coord(coord, &item_key); ++ ++ return (item_plugin_by_coord(coord) == data->iplug) && ++ (extract_dir_id_from_key(&item_key) == ++ extract_dir_id_from_key(key)); ++} ++ ++#if REISER4_DEBUG ++/* cde_check ->check() method for compressed directory items ++ ++ used for debugging, every item should have here the most complete ++ possible check of the consistency of the item that the inventor can ++ construct ++*/ ++int reiser4_check_cde(const coord_t * coord /* coord of item to check */, ++ const char **error /* where to store error message */) ++{ ++ int i; ++ int result; ++ char *item_start; ++ char *item_end; ++ reiser4_key key; ++ ++ coord_t c; ++ ++ assert("nikita-1357", coord != NULL); ++ assert("nikita-1358", error != NULL); ++ ++ if (!ergo(coord->item_pos != 0, ++ is_dot_key(item_key_by_coord(coord, &key)))) { ++ *error = "CDE doesn't start with dot"; ++ return -1; ++ } ++ item_start = item_body_by_coord(coord); ++ item_end = item_start + item_length_by_coord(coord); ++ ++ coord_dup(&c, coord); ++ result = 0; ++ for (i = 0; i < units(coord); ++i) { ++ directory_entry_format *entry; ++ ++ if ((char *)(header_at(coord, i) + 1) > ++ item_end - units(coord) * sizeof *entry) { ++ *error = "CDE header is out of bounds"; ++ result = -1; ++ break; ++ } ++ entry = entry_at(coord, i); ++ if ((char *)entry < item_start + sizeof(cde_item_format)) { ++ *error = "CDE header is too low"; ++ result = -1; ++ break; ++ } ++ if ((char *)(entry + 1) > item_end) { ++ *error = "CDE header is too high"; ++ result = -1; ++ break; ++ } ++ } ++ ++ return result; ++} ++#endif ++ ++/* ->init() method for this item plugin. */ ++int init_cde(coord_t * coord /* coord of item */ , ++ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */ ++ UNUSED_ARG) ++{ ++ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries); ++ return 0; ++} ++ ++/* ->lookup() method for this item plugin. */ ++lookup_result lookup_cde(const reiser4_key * key /* key to search for */ , ++ lookup_bias bias /* search bias */ , ++ coord_t * coord /* coord of item to lookup in */ ) ++{ ++ cmp_t last_comp; ++ int pos; ++ ++ reiser4_key utmost_key; ++ ++ assert("nikita-1293", coord != NULL); ++ assert("nikita-1294", key != NULL); ++ ++ CHECKME(coord); ++ ++ if (keygt(item_key_by_coord(coord, &utmost_key), key)) { ++ coord->unit_pos = 0; ++ coord->between = BEFORE_UNIT; ++ return CBK_COORD_NOTFOUND; ++ } ++ pos = find(coord, key, &last_comp); ++ if (pos >= 0) { ++ coord->unit_pos = (int)pos; ++ switch (last_comp) { ++ case EQUAL_TO: ++ coord->between = AT_UNIT; ++ return CBK_COORD_FOUND; ++ case GREATER_THAN: ++ coord->between = BEFORE_UNIT; ++ return RETERR(-ENOENT); ++ case LESS_THAN: ++ default: ++ impossible("nikita-1298", "Broken find"); ++ return RETERR(-EIO); ++ } ++ } else { ++ coord->unit_pos = units(coord) - 1; ++ coord->between = AFTER_UNIT; ++ return (bias == ++ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND : ++ CBK_COORD_NOTFOUND; ++ } ++} ++ ++/* ->paste() method for this item plugin. 
*/ ++int paste_cde(coord_t * coord /* coord of item */ , ++ reiser4_item_data * data /* parameters of new unit being ++ * inserted */ , ++ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ ) ++{ ++ struct cde_entry_data *e; ++ int result; ++ int i; ++ ++ CHECKME(coord); ++ e = (struct cde_entry_data *) data->data; ++ ++ result = 0; ++ for (i = 0; i < e->num_of_entries; ++i) { ++ int pos; ++ int phantom_size; ++ ++ phantom_size = data->length; ++ if (units(coord) == 0) ++ phantom_size -= sizeof(cde_item_format); ++ ++ result = ++ expand(coord, e->entry + i, phantom_size, &pos, data->arg); ++ if (result != 0) ++ break; ++ result = paste_entry(coord, e->entry + i, pos, data->arg); ++ if (result != 0) ++ break; ++ } ++ CHECKME(coord); ++ return result; ++} ++ ++/* amount of space occupied by all entries starting from @idx both headers and ++ bodies. */ ++static unsigned int part_size(const coord_t * coord /* coord of item */ , ++ int idx /* index of unit */ ) ++{ ++ assert("nikita-1299", coord != NULL); ++ assert("nikita-1300", idx < (int)units(coord)); ++ ++ return sizeof(cde_item_format) + ++ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord, ++ idx + 1) - ++ offset_of(coord, 0); ++} ++ ++/* how many but not more than @want units of @source can be merged with ++ item in @target node. If pend == append - we try to append last item ++ of @target by first units of @source. If pend == prepend - we try to ++ "prepend" first item in @target by last units of @source. @target ++ node has @free_space bytes of free space. Total size of those units ++ are returned via @size */ ++int can_shift_cde(unsigned free_space /* free space in item */ , ++ coord_t * coord /* coord of source item */ , ++ znode * target /* target node */ , ++ shift_direction pend /* shift direction */ , ++ unsigned *size /* resulting number of shifted bytes */ , ++ unsigned want /* maximal number of bytes to shift */ ) ++{ ++ int shift; ++ ++ CHECKME(coord); ++ if (want == 0) { ++ *size = 0; ++ return 0; ++ } ++ ++ /* pend == SHIFT_LEFT <==> shifting to the left */ ++ if (pend == SHIFT_LEFT) { ++ for (shift = min((int)want - 1, units(coord)); shift >= 0; ++ --shift) { ++ *size = part_size(coord, shift); ++ if (target != NULL) ++ *size -= sizeof(cde_item_format); ++ if (*size <= free_space) ++ break; ++ } ++ shift = shift + 1; ++ } else { ++ int total_size; ++ ++ assert("nikita-1301", pend == SHIFT_RIGHT); ++ ++ total_size = item_length_by_coord(coord); ++ for (shift = units(coord) - want - 1; shift < units(coord) - 1; ++ ++shift) { ++ *size = total_size - part_size(coord, shift); ++ if (target == NULL) ++ *size += sizeof(cde_item_format); ++ if (*size <= free_space) ++ break; ++ } ++ shift = units(coord) - shift - 1; ++ } ++ if (shift == 0) ++ *size = 0; ++ CHECKME(coord); ++ return shift; ++} ++ ++/* ->copy_units() method for this item plugin. 
*/ ++void copy_units_cde(coord_t * target /* coord of target item */ , ++ coord_t * source /* coord of source item */ , ++ unsigned from /* starting unit */ , ++ unsigned count /* how many units to copy */ , ++ shift_direction where_is_free_space /* shift direction */ , ++ unsigned free_space /* free space in item */ ) ++{ ++ char *header_from; ++ char *header_to; ++ ++ char *entry_from; ++ char *entry_to; ++ ++ int pos_in_target; ++ int data_size; ++ int data_delta; ++ int i; ++ ++ assert("nikita-1303", target != NULL); ++ assert("nikita-1304", source != NULL); ++ assert("nikita-1305", (int)from < units(source)); ++ assert("nikita-1307", (int)(from + count) <= units(source)); ++ ++ if (where_is_free_space == SHIFT_LEFT) { ++ assert("nikita-1453", from == 0); ++ pos_in_target = units(target); ++ } else { ++ assert("nikita-1309", (int)(from + count) == units(source)); ++ pos_in_target = 0; ++ memmove(item_body_by_coord(target), ++ (char *)item_body_by_coord(target) + free_space, ++ item_length_by_coord(target) - free_space); ++ } ++ ++ CHECKME(target); ++ CHECKME(source); ++ ++ /* expand @target */ ++ data_size = ++ offset_of(source, (int)(from + count)) - offset_of(source, ++ (int)from); ++ ++ if (units(target) == 0) ++ free_space -= sizeof(cde_item_format); ++ ++ expand_item(target, pos_in_target, (int)count, ++ (int)(item_length_by_coord(target) - free_space), ++ (unsigned)data_size); ++ ++ /* copy first @count units of @source into @target */ ++ data_delta = ++ offset_of(target, pos_in_target) - offset_of(source, (int)from); ++ ++ /* copy entries */ ++ entry_from = (char *)entry_at(source, (int)from); ++ entry_to = (char *)entry_at(source, (int)(from + count)); ++ memmove(entry_at(target, pos_in_target), entry_from, ++ (unsigned)(entry_to - entry_from)); ++ ++ /* copy headers */ ++ header_from = (char *)header_at(source, (int)from); ++ header_to = (char *)header_at(source, (int)(from + count)); ++ memmove(header_at(target, pos_in_target), header_from, ++ (unsigned)(header_to - header_from)); ++ ++ /* update offsets */ ++ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i) ++ adj_offset(target, i, data_delta); ++ CHECKME(target); ++ CHECKME(source); ++} ++ ++/* ->cut_units() method for this item plugin. 
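   For illustration, editorial and not part of the patch (sk_ names are
   hypothetical): cut_units_cde() below removes a run of units by sliding
   first the header region and then the entry-body region over the freed
   bytes with memmove(), after which the surviving units' stored offsets are
   rebased by -header_delta before the cut and by -(header_delta +
   entry_delta) after it. The gap-closing step, as a plain buffer analog:

      #include <string.h>
      // Close the gap [cut_off, cut_off + cut_len) in a buffer holding
      // 'used' bytes; memmove() is required because the regions overlap.
      static size_t sk_close_gap(char *buf, size_t used,
                                 size_t cut_off, size_t cut_len)
      {
              memmove(buf + cut_off, buf + cut_off + cut_len,
                      used - cut_off - cut_len);
              return used - cut_len;
      }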
*/
++int cut_units_cde(coord_t * coord /* coord of item */ ,
++ pos_in_node_t from /* start unit pos */ ,
++ pos_in_node_t to /* stop unit pos */ ,
++ struct carry_cut_data *cdata UNUSED_ARG,
++ reiser4_key * smallest_removed, reiser4_key * new_first)
++{
++ char *header_from;
++ char *header_to;
++
++ char *entry_from;
++ char *entry_to;
++
++ int size;
++ int entry_delta;
++ int header_delta;
++ int i;
++
++ unsigned count;
++
++ CHECKME(coord);
++
++ count = to - from + 1;
++
++ assert("nikita-1454", coord != NULL);
++ assert("nikita-1455", (int)(from + count) <= units(coord));
++
++ if (smallest_removed)
++ unit_key_by_coord(coord, smallest_removed);
++
++ if (new_first) {
++ coord_t next;
++
++ /* not everything is cut from item head */
++ assert("vs-1527", from == 0);
++ assert("vs-1528", to < units(coord) - 1);
++
++ coord_dup(&next, coord);
++ next.unit_pos++;
++ unit_key_by_coord(&next, new_first);
++ }
++
++ size = item_length_by_coord(coord);
++ if (count == (unsigned)units(coord)) {
++ return size;
++ }
++
++ header_from = (char *)header_at(coord, (int)from);
++ header_to = (char *)header_at(coord, (int)(from + count));
++
++ entry_from = (char *)entry_at(coord, (int)from);
++ entry_to = (char *)entry_at(coord, (int)(from + count));
++
++ /* move headers */
++ memmove(header_from, header_to,
++ (unsigned)(address(coord, size) - header_to));
++
++ header_delta = header_to - header_from;
++
++ entry_from -= header_delta;
++ entry_to -= header_delta;
++ size -= header_delta;
++
++ /* copy entries */
++ memmove(entry_from, entry_to,
++ (unsigned)(address(coord, size) - entry_to));
++
++ entry_delta = entry_to - entry_from;
++ size -= entry_delta;
++
++ /* update offsets */
++
++ for (i = 0; i < (int)from; ++i)
++ adj_offset(coord, i, -header_delta);
++
++ for (i = from; i < units(coord) - (int)count; ++i)
++ adj_offset(coord, i, -header_delta - entry_delta);
++
++ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
++ &formatted_at(coord)->num_of_entries);
++
++ if (from == 0) {
++ /* entries from head were removed - move remaining to the right */
++ memmove((char *)item_body_by_coord(coord) +
++ header_delta + entry_delta, item_body_by_coord(coord),
++ (unsigned)size);
++ if (REISER4_DEBUG)
++ memset(item_body_by_coord(coord), 0,
++ (unsigned)header_delta + entry_delta);
++ } else {
++ /* freed space is already at the end of item */
++ if (REISER4_DEBUG)
++ memset((char *)item_body_by_coord(coord) + size, 0,
++ (unsigned)header_delta + entry_delta);
++ }
++
++ return header_delta + entry_delta;
++}
++
++int kill_units_cde(coord_t * coord /* coord of item */ ,
++ pos_in_node_t from /* start unit pos */ ,
++ pos_in_node_t to /* stop unit pos */ ,
++ struct carry_kill_data *kdata UNUSED_ARG,
++ reiser4_key * smallest_removed, reiser4_key * new_first)
++{
++ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
++}
++
++/* ->s.dir.extract_key() method for this item plugin.
*/ ++int extract_key_cde(const coord_t * coord /* coord of item */ , ++ reiser4_key * key /* resulting key */ ) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1155", coord != NULL); ++ assert("nikita-1156", key != NULL); ++ ++ dent = entry_at(coord, idx_of(coord)); ++ return extract_key_from_id(&dent->id, key); ++} ++ ++int ++update_key_cde(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh UNUSED_ARG) ++{ ++ directory_entry_format *dent; ++ obj_key_id obj_id; ++ int result; ++ ++ assert("nikita-2344", coord != NULL); ++ assert("nikita-2345", key != NULL); ++ ++ dent = entry_at(coord, idx_of(coord)); ++ result = build_obj_key_id(key, &obj_id); ++ if (result == 0) { ++ dent->id = obj_id; ++ znode_make_dirty(coord->node); ++ } ++ return 0; ++} ++ ++/* ->s.dir.extract_name() method for this item plugin. */ ++char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1157", coord != NULL); ++ ++ dent = entry_at(coord, idx_of(coord)); ++ return extract_dent_name(coord, dent, buf); ++} ++ ++static int cde_bytes(int pasting, const reiser4_item_data * data) ++{ ++ int result; ++ ++ result = data->length; ++ if (!pasting) ++ result -= sizeof(cde_item_format); ++ return result; ++} ++ ++/* ->s.dir.add_entry() method for this item plugin */ ++int add_entry_cde(struct inode *dir /* directory object */ , ++ coord_t * coord /* coord of item */ , ++ lock_handle * lh /* lock handle for insertion */ , ++ const struct dentry *name /* name to insert */ , ++ reiser4_dir_entry_desc * dir_entry /* parameters of new ++ * directory entry */ ) ++{ ++ reiser4_item_data data; ++ struct cde_entry entry; ++ struct cde_entry_data edata; ++ int result; ++ ++ assert("nikita-1656", coord->node == lh->node); ++ assert("nikita-1657", znode_is_write_locked(coord->node)); ++ ++ edata.num_of_entries = 1; ++ edata.entry = &entry; ++ ++ entry.dir = dir; ++ entry.obj = dir_entry->obj; ++ entry.name = &name->d_name; ++ ++ data.data = (char *)&edata; ++ data.user = 0; /* &edata is not user space */ ++ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID); ++ data.arg = dir_entry; ++ assert("nikita-1302", data.iplug != NULL); ++ ++ result = is_dot_key(&dir_entry->key); ++ data.length = estimate_cde(result ? coord : NULL, &data); ++ ++ /* NOTE-NIKITA quota plugin? 
*/ ++ if (vfs_dq_alloc_space_nodirty(dir, cde_bytes(result, &data))) ++ return RETERR(-EDQUOT); ++ ++ if (result) ++ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0); ++ else ++ result = reiser4_resize_item(coord, &data, &dir_entry->key, ++ lh, 0); ++ return result; ++} ++ ++/* ->s.dir.rem_entry() */ ++int rem_entry_cde(struct inode *dir /* directory of item */ , ++ const struct qstr *name, coord_t * coord /* coord of item */ , ++ lock_handle * lh UNUSED_ARG /* lock handle for ++ * removal */ , ++ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of ++ * directory entry ++ * being removed */ ) ++{ ++ coord_t shadow; ++ int result; ++ int length; ++ ON_DEBUG(char buf[DE_NAME_BUF_LEN]); ++ ++ assert("nikita-2870", strlen(name->name) == name->len); ++ assert("nikita-2869", ++ !strcmp(name->name, extract_name_cde(coord, buf))); ++ ++ length = sizeof(directory_entry_format) + sizeof(cde_unit_header); ++ if (is_longname(name->name, name->len)) ++ length += name->len + 1; ++ ++ if (inode_get_bytes(dir) < length) { ++ warning("nikita-2628", "Dir is broke: %llu: %llu", ++ (unsigned long long)get_inode_oid(dir), ++ inode_get_bytes(dir)); ++ ++ return RETERR(-EIO); ++ } ++ ++ /* cut_node() is supposed to take pointers to _different_ ++ coords, because it will modify them without respect to ++ possible aliasing. To work around this, create temporary copy ++ of @coord. ++ */ ++ coord_dup(&shadow, coord); ++ result = ++ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); ++ if (result == 0) { ++ /* NOTE-NIKITA quota plugin? */ ++ vfs_dq_free_space_nodirty(dir, length); ++ } ++ return result; ++} ++ ++/* ->s.dir.max_name_len() method for this item plugin */ ++int max_name_len_cde(const struct inode *dir /* directory */ ) ++{ ++ return ++ reiser4_tree_by_inode(dir)->nplug->max_item_size() - ++ sizeof(directory_entry_format) - sizeof(cde_item_format) - ++ sizeof(cde_unit_header) - 2; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/cde.h linux-2.6.33/fs/reiser4/plugin/item/cde.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/cde.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,87 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Compound directory item. See cde.c for description. 
*/ ++ ++#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ ) ++#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ ++ ++#include "../../forward.h" ++#include "../../kassign.h" ++#include "../../dformat.h" ++ ++#include <linux/fs.h> /* for struct inode */ ++#include <linux/dcache.h> /* for struct dentry, etc */ ++ ++typedef struct cde_unit_header { ++ de_id hash; ++ d16 offset; ++} cde_unit_header; ++ ++typedef struct cde_item_format { ++ d16 num_of_entries; ++ cde_unit_header entry[0]; ++} cde_item_format; ++ ++struct cde_entry { ++ const struct inode *dir; ++ const struct inode *obj; ++ const struct qstr *name; ++}; ++ ++struct cde_entry_data { ++ int num_of_entries; ++ struct cde_entry *entry; ++}; ++ ++/* plugin->item.b.* */ ++reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result); ++int can_contain_key_cde(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data *); ++int mergeable_cde(const coord_t * p1, const coord_t * p2); ++pos_in_node_t nr_units_cde(const coord_t * coord); ++reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key); ++int estimate_cde(const coord_t * coord, const reiser4_item_data * data); ++void print_cde(const char *prefix, coord_t * coord); ++int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data); ++lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias, ++ coord_t * coord); ++int paste_cde(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG); ++int can_shift_cde(unsigned free_space, coord_t * coord, znode * target, ++ shift_direction pend, unsigned *size, unsigned want); ++void copy_units_cde(coord_t * target, coord_t * source, unsigned from, ++ unsigned count, shift_direction where_is_free_space, ++ unsigned free_space); ++int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++void print_cde(const char *prefix, coord_t * coord); ++int reiser4_check_cde(const coord_t * coord, const char **error); ++ ++/* plugin->u.item.s.dir.* */ ++int extract_key_cde(const coord_t * coord, reiser4_key * key); ++int update_key_cde(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh); ++char *extract_name_cde(const coord_t * coord, char *buf); ++int add_entry_cde(struct inode *dir, coord_t * coord, ++ lock_handle * lh, const struct dentry *name, ++ reiser4_dir_entry_desc * entry); ++int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord, ++ lock_handle * lh, reiser4_dir_entry_desc * entry); ++int max_name_len_cde(const struct inode *dir); ++ ++/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 120
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.33/fs/reiser4/plugin/item/ctail.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/item/ctail.c 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,1613 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++/* ctails (aka "clustered tails") are items for cryptcompress objects */
++
++/* DESCRIPTION:
++
++Each cryptcompress object is stored on disk as a set of clusters sliced
++into ctails.
++
++Internal on-disk structure:
++
++ HEADER (1) Here the disk cluster shift is stored
++ BODY
++*/
++
++#include "../../forward.h"
++#include "../../debug.h"
++#include "../../dformat.h"
++#include "../../kassign.h"
++#include "../../key.h"
++#include "../../coord.h"
++#include "item.h"
++#include "../node/node.h"
++#include "../plugin.h"
++#include "../object.h"
++#include "../../znode.h"
++#include "../../carry.h"
++#include "../../tree.h"
++#include "../../inode.h"
++#include "../../super.h"
++#include "../../context.h"
++#include "../../page_cache.h"
++#include "../cluster.h"
++#include "../../flush.h"
++#include "../../tree_walk.h"
++
++#include <linux/pagevec.h>
++#include <linux/swap.h>
++#include <linux/fs.h>
++
++/* return body of ctail item at @coord */
++static ctail_item_format *ctail_formatted_at(const coord_t * coord)
++{
++ assert("edward-60", coord != NULL);
++ return item_body_by_coord(coord);
++}
++
++static int cluster_shift_by_coord(const coord_t * coord)
++{
++ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
++}
++
++static inline void dclust_set_extension_shift(hint_t * hint)
++{
++ assert("edward-1270",
++ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
++ hint->ext_coord.extension.ctail.shift =
++ cluster_shift_by_coord(&hint->ext_coord.coord);
++}
++
++static loff_t off_by_coord(const coord_t * coord)
++{
++ reiser4_key key;
++ return get_key_offset(item_key_by_coord(coord, &key));
++}
++
++int coord_is_unprepped_ctail(const coord_t * coord)
++{
++ assert("edward-1233", coord != NULL);
++ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
++ assert("edward-1235",
++ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
++ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
++
++ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
++}
++
++static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
++{
++ int shift;
++
++ if (inode != NULL) {
++ shift = inode_cluster_shift(inode);
++ assert("edward-1236",
++ ergo(!coord_is_unprepped_ctail(coord),
++ shift == cluster_shift_by_coord(coord)));
++ } else {
++ assert("edward-1237", !coord_is_unprepped_ctail(coord));
++ shift = cluster_shift_by_coord(coord);
++ }
++ return off_by_coord(coord) >> shift;
++}
++
++static int disk_cluster_size(const coord_t * coord)
++{
++ assert("edward-1156",
++ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
++ /* calculation of disk cluster size
++ is meaningless if ctail is unprepped */
++ assert("edward-1238", !coord_is_unprepped_ctail(coord));
++
++ return 1 << cluster_shift_by_coord(coord);
++}
++
++/* true if the key is of first disk cluster item */
++static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
++{
++ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
++
++ 
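   Editorial sketch, not part of the patch (sk_ names are hypothetical): the
   test completed just below works because disk_cluster_size() is a power of
   two, so "this key starts a disk cluster" reduces to a mask check on the
   key offset.

      // An offset starts a disk cluster iff its low 'shift' bits are zero.
      static int sk_offset_starts_cluster(unsigned long long off, int shift)
      {
              unsigned long long size = 1ULL << shift;  // disk cluster size
              return (off & (size - 1)) == 0;
      }
      // With a 64K disk cluster (shift 16): 0 and 65536 qualify, 4096 does not.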
return coord_is_unprepped_ctail(coord) || ++ ((get_key_offset(key) & ++ ((loff_t) disk_cluster_size(coord) - 1)) == 0); ++} ++ ++static char *first_unit(coord_t * coord) ++{ ++ /* FIXME: warning: pointer of type `void *' used in arithmetic */ ++ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format); ++} ++ ++/* plugin->u.item.b.max_key_inside : ++ tail_max_key_inside */ ++ ++/* plugin->u.item.b.can_contain_key */ ++int ++can_contain_key_ctail(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data * data) ++{ ++ reiser4_key item_key; ++ ++ if (item_plugin_by_coord(coord) != data->iplug) ++ return 0; ++ ++ item_key_by_coord(coord, &item_key); ++ if (get_key_locality(key) != get_key_locality(&item_key) || ++ get_key_objectid(key) != get_key_objectid(&item_key)) ++ return 0; ++ if (get_key_offset(&item_key) + nr_units_ctail(coord) != ++ get_key_offset(key)) ++ return 0; ++ if (is_disk_cluster_key(key, coord)) ++ return 0; ++ return 1; ++} ++ ++/* plugin->u.item.b.mergeable */ ++int mergeable_ctail(const coord_t * p1, const coord_t * p2) ++{ ++ reiser4_key key1, key2; ++ ++ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID); ++ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1), ++ UNIX_FILE_METADATA_ITEM_TYPE)); ++ ++ if (item_id_by_coord(p2) != CTAIL_ID) { ++ /* second item is of another type */ ++ return 0; ++ } ++ ++ item_key_by_coord(p1, &key1); ++ item_key_by_coord(p2, &key2); ++ if (get_key_locality(&key1) != get_key_locality(&key2) || ++ get_key_objectid(&key1) != get_key_objectid(&key2) || ++ get_key_type(&key1) != get_key_type(&key2)) { ++ /* items of different objects */ ++ return 0; ++ } ++ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2)) ++ /* not adjacent items */ ++ return 0; ++ if (is_disk_cluster_key(&key2, p2)) ++ return 0; ++ return 1; ++} ++ ++/* plugin->u.item.b.nr_units */ ++pos_in_node_t nr_units_ctail(const coord_t * coord) ++{ ++ return (item_length_by_coord(coord) - ++ sizeof(ctail_formatted_at(coord)->cluster_shift)); ++} ++ ++/* plugin->u.item.b.estimate: ++ estimate how much space is needed to insert/paste @data->length bytes ++ into ctail at @coord */ ++int estimate_ctail(const coord_t * coord /* coord of item */ , ++ const reiser4_item_data * ++ data /* parameters for new item */ ) ++{ ++ if (coord == NULL) ++ /* insert */ ++ return (sizeof(ctail_item_format) + data->length); ++ else ++ /* paste */ ++ return data->length; ++} ++ ++/* ->init() method for this item plugin. 
*/ ++int init_ctail(coord_t * to /* coord of item */ , ++ coord_t * from /* old_item */ , ++ reiser4_item_data * data /* structure used for insertion */ ) ++{ ++ int cluster_shift; /* cpu value to convert */ ++ ++ if (data) { ++ assert("edward-463", data->length > sizeof(ctail_item_format)); ++ cluster_shift = *((int *)(data->arg)); ++ data->length -= sizeof(ctail_item_format); ++ } else { ++ assert("edward-464", from != NULL); ++ assert("edward-855", ctail_ok(from)); ++ cluster_shift = (int)(cluster_shift_by_coord(from)); ++ } ++ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift); ++ assert("edward-856", ctail_ok(to)); ++ return 0; ++} ++ ++/* plugin->u.item.b.lookup: ++ NULL: We are looking for item keys only */ ++ ++#if REISER4_DEBUG ++int ctail_ok(const coord_t * coord) ++{ ++ return coord_is_unprepped_ctail(coord) || ++ cluster_shift_ok(cluster_shift_by_coord(coord)); ++} ++ ++/* plugin->u.item.b.check */ ++int check_ctail(const coord_t * coord, const char **error) ++{ ++ if (!ctail_ok(coord)) { ++ if (error) ++ *error = "bad cluster shift in ctail"; ++ return 1; ++ } ++ return 0; ++} ++#endif ++ ++/* plugin->u.item.b.paste */ ++int ++paste_ctail(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG) ++{ ++ unsigned old_nr_units; ++ ++ assert("edward-268", data->data != NULL); ++ /* copy only from kernel space */ ++ assert("edward-66", data->user == 0); ++ ++ old_nr_units = ++ item_length_by_coord(coord) - sizeof(ctail_item_format) - ++ data->length; ++ ++ /* ctail items never get pasted in the middle */ ++ ++ if (coord->unit_pos == 0 && coord->between == AT_UNIT) { ++ ++ /* paste at the beginning when create new item */ ++ assert("edward-450", ++ item_length_by_coord(coord) == ++ data->length + sizeof(ctail_item_format)); ++ assert("edward-451", old_nr_units == 0); ++ } else if (coord->unit_pos == old_nr_units - 1 ++ && coord->between == AFTER_UNIT) { ++ ++ /* paste at the end */ ++ coord->unit_pos++; ++ } else ++ impossible("edward-453", "bad paste position"); ++ ++ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length); ++ ++ assert("edward-857", ctail_ok(coord)); ++ ++ return 0; ++} ++ ++/* plugin->u.item.b.fast_paste */ ++ ++/* plugin->u.item.b.can_shift ++ number of units is returned via return value, number of bytes via @size. 
For
++ ctail items they coincide */
++int
++can_shift_ctail(unsigned free_space, coord_t * source,
++ znode * target, shift_direction direction UNUSED_ARG,
++ unsigned *size /* number of bytes */ , unsigned want)
++{
++ /* make sure that we do not want to shift more than we have */
++ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
++
++ *size = min(want, free_space);
++
++ if (!target) {
++ /* new item will be created */
++ if (*size <= sizeof(ctail_item_format)) {
++ *size = 0;
++ return 0;
++ }
++ return *size - sizeof(ctail_item_format);
++ }
++ return *size;
++}
++
++/* plugin->u.item.b.copy_units
++ cooperates with ->can_shift() */
++void
++copy_units_ctail(coord_t * target, coord_t * source,
++ unsigned from, unsigned count /* units */ ,
++ shift_direction where_is_free_space,
++ unsigned free_space /* bytes */ )
++{
++ /* make sure that item @target is expanded already */
++ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
++ assert("edward-70", free_space == count || free_space == count + 1);
++
++ assert("edward-858", ctail_ok(source));
++
++ if (where_is_free_space == SHIFT_LEFT) {
++ /* append item @target with @count first bytes of @source:
++ this restriction came from ordinary tails */
++ assert("edward-71", from == 0);
++ assert("edward-860", ctail_ok(target));
++
++ memcpy(first_unit(target) + nr_units_ctail(target) - count,
++ first_unit(source), count);
++ } else {
++ /* target item is moved to the right already */
++ reiser4_key key;
++
++ assert("edward-72", nr_units_ctail(source) == from + count);
++
++ if (free_space == count) {
++ init_ctail(target, source, NULL);
++ } else {
++ /* new item has been created */
++ assert("edward-862", ctail_ok(target));
++ }
++ memcpy(first_unit(target), first_unit(source) + from, count);
++
++ assert("edward-863", ctail_ok(target));
++
++ /* new units are inserted before first unit in an item,
++ therefore, we have to update item key */
++ item_key_by_coord(source, &key);
++ set_key_offset(&key, get_key_offset(&key) + from);
++
++ node_plugin_by_node(target->node)->update_item_key(target, &key,
++ NULL /*info */);
++ }
++}
++
++/* plugin->u.item.b.create_hook */
++int create_hook_ctail(const coord_t * coord, void *arg)
++{
++ assert("edward-864", znode_is_loaded(coord->node));
++
++ znode_set_convertible(coord->node);
++ return 0;
++}
++
++/* plugin->u.item.b.kill_hook */
++int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
++ pos_in_node_t count, carry_kill_data * kdata)
++{
++ struct inode *inode;
++
++ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
++ assert("edward-291", znode_is_write_locked(coord->node));
++
++ inode = kdata->inode;
++ if (inode) {
++ reiser4_key key;
++ struct cryptcompress_info * info;
++ cloff_t index;
++
++ item_key_by_coord(coord, &key);
++ info = cryptcompress_inode_data(inode);
++ index = off_to_clust(get_key_offset(&key), inode);
++
++ if (from == 0) {
++ info->trunc_index = index;
++ if (is_disk_cluster_key(&key, coord)) {
++ /*
++ * first item of disk cluster is to be killed
++ */
++ truncate_complete_page_cluster(
++ inode, index, kdata->params.truncate);
++ inode_sub_bytes(inode,
++ inode_cluster_size(inode));
++ }
++ }
++ }
++ return 0;
++}
++
++/* for shift_hook_ctail(),
++ return true if the first disk cluster item has dirty child
++*/
++static int ctail_convertible(const coord_t * coord)
++{
++ int result;
++ reiser4_key key;
++ jnode *child = NULL;
++
++ assert("edward-477", coord != NULL);
++ assert("edward-478",
item_id_by_coord(coord) == CTAIL_ID);
++
++ if (coord_is_unprepped_ctail(coord))
++ /* unprepped ctail should be converted */
++ return 1;
++
++ item_key_by_coord(coord, &key);
++ child = jlookup(current_tree,
++ get_key_objectid(&key),
++ off_to_pg(off_by_coord(coord)));
++ if (!child)
++ return 0;
++ result = JF_ISSET(child, JNODE_DIRTY);
++ jput(child);
++ return result;
++}
++
++/* FIXME-EDWARD */
++/* plugin->u.item.b.shift_hook */
++int shift_hook_ctail(const coord_t * item /* coord of item */ ,
++ unsigned from UNUSED_ARG /* start unit */ ,
++ unsigned count UNUSED_ARG /* stop unit */ ,
++ znode * old_node /* old parent */ )
++{
++ assert("edward-479", item != NULL);
++ assert("edward-480", item->node != old_node);
++
++ if (!znode_convertible(old_node) || znode_convertible(item->node))
++ return 0;
++ if (ctail_convertible(item))
++ znode_set_convertible(item->node);
++ return 0;
++}
++
++static int
++cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
++ int cut, void *p, reiser4_key * smallest_removed,
++ reiser4_key * new_first)
++{
++ pos_in_node_t count; /* number of units to cut */
++ char *item;
++
++ count = to - from + 1;
++ item = item_body_by_coord(coord);
++
++ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
++
++ if (smallest_removed) {
++ /* store smallest key removed */
++ item_key_by_coord(coord, smallest_removed);
++ set_key_offset(smallest_removed,
++ get_key_offset(smallest_removed) + from);
++ }
++
++ if (new_first) {
++ assert("vs-1531", from == 0);
++
++ item_key_by_coord(coord, new_first);
++ set_key_offset(new_first,
++ get_key_offset(new_first) + from + count);
++ }
++
++ if (!cut)
++ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
++
++ if (from == 0) {
++ if (count != nr_units_ctail(coord)) {
++ /* part of item is removed, so move free space to the beginning
++ of the item and update item key */
++ reiser4_key key;
++ memcpy(item + to + 1, item, sizeof(ctail_item_format));
++ item_key_by_coord(coord, &key);
++ set_key_offset(&key, get_key_offset(&key) + count);
++ node_plugin_by_node(coord->node)->update_item_key(coord,
++ &key,
++ NULL);
++ } else {
++ /* cut_units should not be called to cut everything */
++ assert("vs-1532", ergo(cut, 0));
++ /* whole item is cut, so more than the amount of space occupied
++ by units is freed */
++ count += sizeof(ctail_item_format);
++ }
++ if (REISER4_DEBUG)
++ memset(item, 0, count);
++ } else if (REISER4_DEBUG)
++ memset(item + sizeof(ctail_item_format) + from, 0, count);
++ return count;
++}
++
++/* plugin->u.item.b.cut_units */
++int
++cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
++ carry_cut_data * cdata, reiser4_key * smallest_removed,
++ reiser4_key * new_first)
++{
++ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
++ smallest_removed, new_first);
++}
++
++/* plugin->u.item.b.kill_units */
++int
++kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
++ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
++ reiser4_key * new_first)
++{
++ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
++ smallest_removed, new_first);
++}
++
++/* plugin->u.item.s.file.read */
++int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
++{
++ uf_coord_t *uf_coord;
++ coord_t *coord;
++
++ uf_coord = &hint->ext_coord;
++ coord = &uf_coord->coord;
++ assert("edward-127", f->user == 0);
++ assert("edward-129", coord && coord->node);
++ assert("edward-130", coord_is_existing_unit(coord));
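   Editorial sketch, not part of the patch (the sk_ flow type is a
   simplification; the real flow_t also tracks a key): read_ctail() consumes
   whole items, copying nr_units_ctail() bytes into the flow's buffer and
   then advancing the flow cursor via move_flow_forward(), conceptually:

      #include <stddef.h>
      struct sk_flow { char *data; size_t length; };
      // consume 'count' bytes from the front of the flow
      static void sk_flow_advance(struct sk_flow *f, size_t count)
      {
              f->data += count;
              f->length -= count;
      }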
++ assert("edward-132", znode_is_loaded(coord->node)); ++ ++ /* start read only from the beginning of ctail */ ++ assert("edward-133", coord->unit_pos == 0); ++ /* read only whole ctails */ ++ assert("edward-135", nr_units_ctail(coord) <= f->length); ++ ++ assert("edward-136", reiser4_schedulable()); ++ assert("edward-886", ctail_ok(coord)); ++ ++ if (f->data) ++ memcpy(f->data, (char *)first_unit(coord), ++ (size_t) nr_units_ctail(coord)); ++ ++ dclust_set_extension_shift(hint); ++ mark_page_accessed(znode_page(coord->node)); ++ move_flow_forward(f, nr_units_ctail(coord)); ++ ++ return 0; ++} ++ ++/** ++ * Prepare transform stream with plain text for page ++ * @page taking into account synchronization issues. ++ */ ++static int ctail_read_disk_cluster(struct cluster_handle * clust, ++ struct inode * inode, struct page * page, ++ znode_lock_mode mode) ++{ ++ int result; ++ ++ assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK); ++ assert("edward-671", clust->hint != NULL); ++ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER); ++ assert("edward-672", cryptcompress_inode_ok(inode)); ++ assert("edward-1527", PageLocked(page)); ++ ++ unlock_page(page); ++ ++ /* set input stream */ ++ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM); ++ if (result) { ++ lock_page(page); ++ return result; ++ } ++ result = find_disk_cluster(clust, inode, 1 /* read items */, mode); ++ lock_page(page); ++ if (result) ++ return result; ++ /* ++ * at this point we have locked position in the tree ++ */ ++ assert("edward-1528", znode_is_any_locked(clust->hint->lh.node)); ++ ++ if (page->mapping != inode->i_mapping) { ++ /* page was truncated */ ++ reiser4_unset_hint(clust->hint); ++ reset_cluster_params(clust); ++ return AOP_TRUNCATED_PAGE; ++ } ++ if (PageUptodate(page)) { ++ /* disk cluster can be obsolete, don't use it! */ ++ reiser4_unset_hint(clust->hint); ++ reset_cluster_params(clust); ++ return 0; ++ } ++ if (clust->dstat == FAKE_DISK_CLUSTER || ++ clust->dstat == UNPR_DISK_CLUSTER || ++ clust->dstat == TRNC_DISK_CLUSTER) { ++ /* ++ * this information about disk cluster will be valid ++ * as long as we keep the position in the tree locked ++ */ ++ tfm_cluster_set_uptodate(&clust->tc); ++ return 0; ++ } ++ /* now prepare output stream.. */ ++ result = grab_coa(&clust->tc, inode_compression_plugin(inode)); ++ if (result) ++ return result; ++ /* ..and fill this with plain text */ ++ result = reiser4_inflate_cluster(clust, inode); ++ if (result) ++ return result; ++ /* ++ * The stream is ready! It won't be obsolete as ++ * long as we keep last disk cluster item locked. ++ */ ++ tfm_cluster_set_uptodate(&clust->tc); ++ return 0; ++} ++ ++/* ++ * fill one page with plain text. 
++ */
++int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
++ struct page *page, znode_lock_mode mode)
++{
++ int ret;
++ unsigned cloff;
++ char *data;
++ size_t to_page;
++ struct tfm_cluster * tc = &clust->tc;
++
++ assert("edward-212", PageLocked(page));
++
++ if (unlikely(page->mapping != inode->i_mapping))
++ return AOP_TRUNCATED_PAGE;
++ if (PageUptodate(page))
++ goto exit;
++ to_page = pbytes(page_index(page), inode);
++ if (to_page == 0) {
++ zero_user(page, 0, PAGE_CACHE_SIZE);
++ SetPageUptodate(page);
++ goto exit;
++ }
++ if (!tfm_cluster_is_uptodate(&clust->tc)) {
++ clust->index = pg_to_clust(page->index, inode);
++
++ /* this will unlock/lock the page */
++ ret = ctail_read_disk_cluster(clust, inode, page, mode);
++
++ assert("edward-212", PageLocked(page));
++ if (ret)
++ return ret;
++
++ /* refresh bytes */
++ to_page = pbytes(page_index(page), inode);
++ if (to_page == 0) {
++ zero_user(page, 0, PAGE_CACHE_SIZE);
++ SetPageUptodate(page);
++ goto exit;
++ }
++ }
++ if (PageUptodate(page))
++ /* somebody else filled it already */
++ goto exit;
++
++ assert("edward-119", tfm_cluster_is_uptodate(tc));
++ assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
++
++ switch (clust->dstat) {
++ case UNPR_DISK_CLUSTER:
++ BUG_ON(1);
++ case TRNC_DISK_CLUSTER:
++ /*
++ * Race with truncate!
++ * We resolve it in favour of the last one (the only way,
++ * as in this case plain text is unrecoverable)
++ */
++ case FAKE_DISK_CLUSTER:
++ /* fill the page by zeroes */
++ zero_user(page, 0, PAGE_CACHE_SIZE);
++ SetPageUptodate(page);
++ break;
++ case PREP_DISK_CLUSTER:
++ /* fill page by transformed stream with plain text */
++ assert("edward-1058", !PageUptodate(page));
++ assert("edward-120", tc->len <= inode_cluster_size(inode));
++
++ /* page index in this logical cluster */
++ cloff = pg_to_off_to_cloff(page->index, inode);
++
++ data = kmap(page);
++ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
++ memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
++ flush_dcache_page(page);
++ kunmap(page);
++ SetPageUptodate(page);
++ break;
++ default:
++ impossible("edward-1169", "bad disk cluster state");
++ }
++ exit:
++ return 0;
++}
++
++/* plugin->u.item.s.file.readpage */
++int readpage_ctail(void *vp, struct page *page)
++{
++ int result;
++ hint_t * hint;
++ struct cluster_handle * clust = vp;
++
++ assert("edward-114", clust != NULL);
++ assert("edward-115", PageLocked(page));
++ assert("edward-116", !PageUptodate(page));
++ assert("edward-118", page->mapping && page->mapping->host);
++ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
++
++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
++ if (hint == NULL) {
++ unlock_page(page);
++ return RETERR(-ENOMEM);
++ }
++ clust->hint = hint;
++ result = load_file_hint(clust->file, hint);
++ if (result) {
++ kfree(hint);
++ unlock_page(page);
++ return result;
++ }
++ assert("vs-25", hint->ext_coord.lh == &hint->lh);
++
++ result = do_readpage_ctail(page->mapping->host, clust, page,
++ ZNODE_READ_LOCK);
++ assert("edward-213", PageLocked(page));
++ assert("edward-1163", ergo(!result, PageUptodate(page)));
++
++ unlock_page(page);
++ done_lh(&hint->lh);
++ hint->ext_coord.valid = 0;
++ save_file_hint(clust->file, hint);
++ kfree(hint);
++ tfm_cluster_clr_uptodate(&clust->tc);
++
++ return result;
++}
++
++/* Helper function for ->readpages() */
++static int ctail_read_page_cluster(struct cluster_handle * clust,
++ struct inode *inode)
++{
++ int i;
++ 
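   Editorial sketch, not part of the patch (sk_ names and the 4096-byte page
   size are hypothetical): the PREP_DISK_CLUSTER branch of do_readpage_ctail()
   above copies the page-sized slice at the page's offset inside the inflated
   cluster stream and zero-fills whatever the file does not cover.

      #include <string.h>
      #define SK_PAGE_SIZE 4096u
      static void sk_fill_page(char *page, const char *stream,
                               unsigned pg_in_cluster, unsigned to_page)
      {
              unsigned cloff = pg_in_cluster * SK_PAGE_SIZE; // cf. pg_to_off_to_cloff()
              memcpy(page, stream + cloff, to_page);         // valid file bytes
              memset(page + to_page, 0, SK_PAGE_SIZE - to_page); // zeroed tail
      }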
int result;
++ assert("edward-779", clust != NULL);
++ assert("edward-1059", clust->win == NULL);
++ assert("edward-780", inode != NULL);
++
++ result = prepare_page_cluster(inode, clust, READ_OP);
++ if (result)
++ return result;
++
++ assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
++
++ for (i = 0; i < clust->nr_pages; i++) {
++ struct page *page = clust->pages[i];
++ lock_page(page);
++ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
++ unlock_page(page);
++ if (result)
++ break;
++ }
++ tfm_cluster_clr_uptodate(&clust->tc);
++ put_page_cluster(clust, inode, READ_OP);
++ return result;
++}
++
++/* filler for read_cache_pages() */
++static int ctail_readpages_filler(void * data, struct page * page)
++{
++ int ret = 0;
++ struct cluster_handle * clust = data;
++ struct inode * inode = clust->file->f_dentry->d_inode;
++
++ assert("edward-1525", page->mapping == inode->i_mapping);
++
++ if (PageUptodate(page)) {
++ unlock_page(page);
++ return 0;
++ }
++ if (pbytes(page_index(page), inode) == 0) {
++ zero_user(page, 0, PAGE_CACHE_SIZE);
++ SetPageUptodate(page);
++ unlock_page(page);
++ return 0;
++ }
++ move_cluster_forward(clust, inode, page->index);
++ unlock_page(page);
++ /*
++ * read the whole page cluster
++ */
++ ret = ctail_read_page_cluster(clust, inode);
++
++ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
++ return ret;
++}
++
++/*
++ * We populate a bit more than upper readahead suggests:
++ * with each nominated page we read the whole page cluster
++ * this page belongs to.
++ */
++int readpages_ctail(struct file *file, struct address_space *mapping,
++ struct list_head *pages)
++{
++ int ret = 0;
++ hint_t *hint;
++ struct cluster_handle clust;
++ struct inode *inode = mapping->host;
++
++ assert("edward-1521", inode == file->f_dentry->d_inode);
++
++ cluster_init_read(&clust, NULL);
++ clust.file = file;
++ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
++ if (hint == NULL) {
++ warning("vs-28", "failed to allocate hint");
++ ret = RETERR(-ENOMEM);
++ goto exit1;
++ }
++ clust.hint = hint;
++ ret = load_file_hint(clust.file, hint);
++ if (ret) {
++ warning("edward-1522", "failed to load hint");
++ goto exit2;
++ }
++ assert("vs-26", hint->ext_coord.lh == &hint->lh);
++ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
++ if (ret) {
++ warning("edward-1523", "failed to alloc pgset");
++ goto exit3;
++ }
++ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
++
++ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
++ exit3:
++ done_lh(&hint->lh);
++ save_file_hint(file, hint);
++ hint->ext_coord.valid = 0;
++ exit2:
++ kfree(hint);
++ exit1:
++ put_cluster_handle(&clust);
++ return ret;
++}
++
++/*
++ plugin->u.item.s.file.append_key
++ key of the first item of the next disk cluster
++*/
++reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
++{
++ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
++ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
++
++ item_key_by_coord(coord, key);
++ set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
++ << cluster_shift_by_coord(coord));
++ return key;
++}
++
++static int insert_unprepped_ctail(struct cluster_handle * clust,
++ struct inode *inode)
++{
++ int result;
++ char buf[UCTAIL_NR_UNITS];
++ reiser4_item_data data;
++ reiser4_key key;
++ int shift = (int)UCTAIL_SHIFT;
++
++ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
++ result = key_by_inode_cryptcompress(inode,
++ 
clust_to_off(clust->index, inode), ++ &key); ++ if (result) ++ return result; ++ data.user = 0; ++ data.iplug = item_plugin_by_id(CTAIL_ID); ++ data.arg = &shift; ++ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS; ++ data.data = buf; ++ ++ result = insert_by_coord(&clust->hint->ext_coord.coord, ++ &data, &key, clust->hint->ext_coord.lh, 0); ++ return result; ++} ++ ++static int ++insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f, ++ int cluster_shift) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ reiser4_item_data *data; ++ carry_op *op; ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*data)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ data = (reiser4_item_data *) (lowest_level + 3); ++ ++ assert("edward-466", coord->between == AFTER_ITEM ++ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM ++ || coord->between == EMPTY_NODE ++ || coord->between == BEFORE_UNIT); ++ ++ if (coord->between == AFTER_UNIT) { ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++ } ++ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node, ++ 0 /* operate directly on coord -> node */); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ data->user = 0; ++ data->iplug = item_plugin_by_id(CTAIL_ID); ++ data->arg = &cluster_shift; ++ ++ data->length = 0; ++ data->data = NULL; ++ ++ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT; ++ op->u.insert_flow.insert_point = coord; ++ op->u.insert_flow.flow = f; ++ op->u.insert_flow.data = data; ++ op->u.insert_flow.new_nodes = 0; ++ ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ ++ result = reiser4_carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */ ++static int insert_cryptcompress_flow_in_place(coord_t * coord, ++ lock_handle * lh, flow_t * f, ++ int cluster_shift) ++{ ++ int ret; ++ coord_t pos; ++ lock_handle lock; ++ ++ assert("edward-484", ++ coord->between == AT_UNIT || coord->between == AFTER_ITEM); ++ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID); ++ ++ coord_dup(&pos, coord); ++ pos.unit_pos = 0; ++ pos.between = AFTER_ITEM; ++ ++ init_lh(&lock); ++ copy_lh(&lock, lh); ++ ++ ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift); ++ done_lh(&lock); ++ assert("edward-1347", znode_is_write_locked(lh->node)); ++ assert("edward-1228", !ret); ++ return ret; ++} ++ ++/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */ ++static int overwrite_ctail(coord_t * coord, flow_t * f) ++{ ++ unsigned count; ++ ++ assert("edward-269", f->user == 0); ++ assert("edward-270", f->data != NULL); ++ assert("edward-271", f->length > 0); ++ assert("edward-272", coord_is_existing_unit(coord)); ++ assert("edward-273", coord->unit_pos == 0); ++ assert("edward-274", znode_is_write_locked(coord->node)); ++ assert("edward-275", reiser4_schedulable()); ++ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID); ++ assert("edward-1243", ctail_ok(coord)); ++ ++ count = nr_units_ctail(coord); ++ ++ if (count > f->length) ++ count = f->length; ++ memcpy(first_unit(coord), f->data, count); ++ move_flow_forward(f, count); ++ coord->unit_pos += count; ++ return 0; ++} ++ ++/* Implementation of CRC_CUT_ITEM mode of ctail 
conversion: ++ cut ctail (part or whole) starting from next unit position */ ++static int cut_ctail(coord_t * coord) ++{ ++ coord_t stop; ++ ++ assert("edward-435", coord->between == AT_UNIT && ++ coord->item_pos < coord_num_items(coord) && ++ coord->unit_pos <= coord_num_units(coord)); ++ ++ if (coord->unit_pos == coord_num_units(coord)) ++ /* nothing to cut */ ++ return 0; ++ coord_dup(&stop, coord); ++ stop.unit_pos = coord_last_unit_pos(coord); ++ ++ return cut_node_content(coord, &stop, NULL, NULL, NULL); ++} ++ ++int ctail_insert_unprepped_cluster(struct cluster_handle * clust, ++ struct inode * inode) ++{ ++ int result; ++ assert("edward-1244", inode != NULL); ++ assert("edward-1245", clust->hint != NULL); ++ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER); ++ assert("edward-1247", clust->reserved == 1); ++ ++ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK); ++ if (cbk_errored(result)) ++ return result; ++ assert("edward-1249", result == CBK_COORD_NOTFOUND); ++ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node)); ++ ++ assert("edward-1295", ++ clust->hint->ext_coord.lh->node == ++ clust->hint->ext_coord.coord.node); ++ ++ coord_set_between_clusters(&clust->hint->ext_coord.coord); ++ ++ result = insert_unprepped_ctail(clust, inode); ++ all_grabbed2free(); ++ ++ assert("edward-1251", !result); ++ assert("edward-1252", cryptcompress_inode_ok(inode)); ++ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node)); ++ assert("edward-1254", ++ reiser4_clustered_blocks(reiser4_get_current_sb())); ++ assert("edward-1255", ++ znode_convertible(clust->hint->ext_coord.coord.node)); ++ ++ return result; ++} ++ ++static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode) ++{ ++ int result = 0; ++ struct convert_item_info * info; ++ ++ assert("edward-468", pos != NULL); ++ assert("edward-469", pos->sq != NULL); ++ assert("edward-845", item_convert_data(pos) != NULL); ++ ++ info = item_convert_data(pos); ++ assert("edward-679", info->flow.data != NULL); ++ ++ switch (mode) { ++ case CRC_APPEND_ITEM: ++ assert("edward-1229", info->flow.length != 0); ++ assert("edward-1256", ++ cluster_shift_ok(cluster_shift_by_coord(&pos->coord))); ++ result = ++ insert_cryptcompress_flow_in_place(&pos->coord, ++ &pos->lock, ++ &info->flow, ++ info->cluster_shift); ++ break; ++ case CRC_OVERWRITE_ITEM: ++ assert("edward-1230", info->flow.length != 0); ++ overwrite_ctail(&pos->coord, &info->flow); ++ if (info->flow.length != 0) ++ break; ++ case CRC_CUT_ITEM: ++ assert("edward-1231", info->flow.length == 0); ++ result = cut_ctail(&pos->coord); ++ break; ++ default: ++ result = RETERR(-EIO); ++ impossible("edward-244", "bad convert mode"); ++ } ++ return result; ++} ++ ++/* plugin->u.item.f.scan */ ++int scan_ctail(flush_scan * scan) ++{ ++ int result = 0; ++ struct page *page; ++ struct inode *inode; ++ jnode *node = scan->node; ++ ++ assert("edward-227", scan->node != NULL); ++ assert("edward-228", jnode_is_cluster_page(scan->node)); ++ assert("edward-639", znode_is_write_locked(scan->parent_lock.node)); ++ ++ page = jnode_page(node); ++ inode = page->mapping->host; ++ ++ if (!reiser4_scanning_left(scan)) ++ return result; ++ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY)) ++ znode_make_dirty(scan->parent_lock.node); ++ ++ if (!znode_convertible(scan->parent_lock.node)) { ++ if (JF_ISSET(scan->node, JNODE_DIRTY)) ++ znode_set_convertible(scan->parent_lock.node); ++ else { ++ warning("edward-681", ++ "cluster page is already processed"); ++ 
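   Editorial sketch, not part of the patch (sk_ names hypothetical): note the
   deliberate fallthrough in do_convert_ctail() above -- once overwrite_ctail()
   drains the flow, CRC_OVERWRITE_ITEM falls into CRC_CUT_ITEM so the unused
   remainder of the item is cut in the same pass. Reduced to its skeleton:

      enum sk_mode { SK_OVERWRITE, SK_CUT };
      // overwrite consumes up to 'avail' bytes of the flow, then cuts
      static int sk_convert(unsigned long *flow_len, unsigned long avail,
                            enum sk_mode mode)
      {
              switch (mode) {
              case SK_OVERWRITE:
                      *flow_len -= (*flow_len < avail) ? *flow_len : avail;
                      if (*flow_len != 0)
                              break;  // keep writing into the next item
                      // fall through: flow drained, cut what is left
              case SK_CUT:
                      return 0;       // stand-in for cut_ctail()
              }
              return 0;
      }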
return -EAGAIN; ++ } ++ } ++ return result; ++} ++ ++/* If true, this function attaches children */ ++static int should_attach_convert_idata(flush_pos_t * pos) ++{ ++ int result; ++ assert("edward-431", pos != NULL); ++ assert("edward-432", pos->child == NULL); ++ assert("edward-619", znode_is_write_locked(pos->coord.node)); ++ assert("edward-470", ++ item_plugin_by_coord(&pos->coord) == ++ item_plugin_by_id(CTAIL_ID)); ++ ++ /* check for leftmost child */ ++ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child); ++ ++ if (!pos->child) ++ return 0; ++ spin_lock_jnode(pos->child); ++ result = (JF_ISSET(pos->child, JNODE_DIRTY) && ++ pos->child->atom == ZJNODE(pos->coord.node)->atom); ++ spin_unlock_jnode(pos->child); ++ if (!result && pos->child) { ++ /* existing child isn't to attach, clear up this one */ ++ jput(pos->child); ++ pos->child = NULL; ++ } ++ return result; ++} ++ ++/** ++ * Collect all needed information about the object here, ++ * as in-memory inode can be evicted from memory before ++ * disk update completion. ++ */ ++static int init_convert_data_ctail(struct convert_item_info * idata, ++ struct inode *inode) ++{ ++ assert("edward-813", idata != NULL); ++ assert("edward-814", inode != NULL); ++ ++ idata->cluster_shift = inode_cluster_shift(inode); ++ idata->d_cur = DC_FIRST_ITEM; ++ idata->d_next = DC_INVALID_STATE; ++ ++ return 0; ++} ++ ++static int alloc_item_convert_data(struct convert_info * sq) ++{ ++ assert("edward-816", sq != NULL); ++ assert("edward-817", sq->itm == NULL); ++ ++ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get()); ++ if (sq->itm == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++static void free_item_convert_data(struct convert_info * sq) ++{ ++ assert("edward-818", sq != NULL); ++ assert("edward-819", sq->itm != NULL); ++ assert("edward-820", sq->iplug != NULL); ++ ++ kfree(sq->itm); ++ sq->itm = NULL; ++ return; ++} ++ ++static int alloc_convert_data(flush_pos_t * pos) ++{ ++ assert("edward-821", pos != NULL); ++ assert("edward-822", pos->sq == NULL); ++ ++ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get()); ++ if (!pos->sq) ++ return RETERR(-ENOMEM); ++ memset(pos->sq, 0, sizeof(*pos->sq)); ++ cluster_init_write(&pos->sq->clust, NULL); ++ return 0; ++} ++ ++void free_convert_data(flush_pos_t * pos) ++{ ++ struct convert_info *sq; ++ ++ assert("edward-823", pos != NULL); ++ assert("edward-824", pos->sq != NULL); ++ ++ sq = pos->sq; ++ if (sq->itm) ++ free_item_convert_data(sq); ++ put_cluster_handle(&sq->clust); ++ kfree(pos->sq); ++ pos->sq = NULL; ++ return; ++} ++ ++static int init_item_convert_data(flush_pos_t * pos, struct inode *inode) ++{ ++ struct convert_info *sq; ++ ++ assert("edward-825", pos != NULL); ++ assert("edward-826", pos->sq != NULL); ++ assert("edward-827", item_convert_data(pos) != NULL); ++ assert("edward-828", inode != NULL); ++ ++ sq = pos->sq; ++ ++ memset(sq->itm, 0, sizeof(*sq->itm)); ++ ++ /* iplug->init_convert_data() */ ++ return init_convert_data_ctail(sq->itm, inode); ++} ++ ++/* create and attach disk cluster info used by 'convert' phase of the flush ++ squalloc() */ ++static int attach_convert_idata(flush_pos_t * pos, struct inode *inode) ++{ ++ int ret = 0; ++ struct convert_item_info *info; ++ struct cluster_handle *clust; ++ file_plugin *fplug = inode_file_plugin(inode); ++ compression_plugin *cplug = inode_compression_plugin(inode); ++ ++ assert("edward-248", pos != NULL); ++ assert("edward-249", pos->child != NULL); ++ assert("edward-251", inode != NULL); ++ 
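   Editorial note, not part of the patch: alloc_convert_data() above is the
   kmalloc-then-memset(0) idiom that kzalloc() abbreviates in kernel code. A
   userland analog with plain libc (sk_ names hypothetical):

      #include <stdlib.h>
      struct sk_convert_info { int cluster_shift; };
      // calloc() hands back zeroed memory, like kmalloc() + memset(.., 0, ..)
      static struct sk_convert_info *sk_alloc_convert_info(void)
      {
              return calloc(1, sizeof(struct sk_convert_info));
      }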
assert("edward-682", cryptcompress_inode_ok(inode)); ++ assert("edward-252", ++ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); ++ assert("edward-473", ++ item_plugin_by_coord(&pos->coord) == ++ item_plugin_by_id(CTAIL_ID)); ++ ++ if (!pos->sq) { ++ ret = alloc_convert_data(pos); ++ if (ret) ++ return ret; ++ } ++ clust = &pos->sq->clust; ++ ret = grab_coa(&clust->tc, cplug); ++ if (ret) ++ goto err; ++ ret = set_cluster_by_page(clust, ++ jnode_page(pos->child), ++ MAX_CLUSTER_NRPAGES); ++ if (ret) ++ goto err; ++ ++ assert("edward-829", pos->sq != NULL); ++ assert("edward-250", item_convert_data(pos) == NULL); ++ ++ pos->sq->iplug = item_plugin_by_id(CTAIL_ID); ++ ++ ret = alloc_item_convert_data(pos->sq); ++ if (ret) ++ goto err; ++ ret = init_item_convert_data(pos, inode); ++ if (ret) ++ goto err; ++ info = item_convert_data(pos); ++ ++ ret = checkout_logical_cluster(clust, pos->child, inode); ++ if (ret) ++ goto err; ++ ++ reiser4_deflate_cluster(clust, inode); ++ inc_item_convert_count(pos); ++ ++ /* prepare flow for insertion */ ++ fplug->flow_by_inode(inode, ++ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM), ++ 0 /* kernel space */ , ++ clust->tc.len, ++ clust_to_off(clust->index, inode), ++ WRITE_OP, &info->flow); ++ jput(pos->child); ++ return 0; ++ err: ++ jput(pos->child); ++ free_convert_data(pos); ++ return ret; ++} ++ ++/* clear up disk cluster info */ ++static void detach_convert_idata(struct convert_info * sq) ++{ ++ struct convert_item_info *info; ++ ++ assert("edward-253", sq != NULL); ++ assert("edward-840", sq->itm != NULL); ++ ++ info = sq->itm; ++ assert("edward-1212", info->flow.length == 0); ++ ++ free_item_convert_data(sq); ++ return; ++} ++ ++/* plugin->u.item.f.utmost_child */ ++ ++/* This function sets leftmost child for a first cluster item, ++ if the child exists, and NULL in other cases. ++ NOTE-EDWARD: Do not call this for RIGHT_SIDE */ ++ ++int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child) ++{ ++ reiser4_key key; ++ ++ item_key_by_coord(coord, &key); ++ ++ assert("edward-257", coord != NULL); ++ assert("edward-258", child != NULL); ++ assert("edward-259", side == LEFT_SIDE); ++ assert("edward-260", ++ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); ++ ++ if (!is_disk_cluster_key(&key, coord)) ++ *child = NULL; ++ else ++ *child = jlookup(current_tree, ++ get_key_objectid(item_key_by_coord ++ (coord, &key)), ++ off_to_pg(get_key_offset(&key))); ++ return 0; ++} ++ ++/* Returns true if @p2 is the next item to @p1 ++ in the _same_ disk cluster. ++ Disk cluster is a set of items. If ->clustered() != NULL, ++ with each item the whole disk cluster should be read/modified ++*/ ++ ++/* Go rightward and check for next disk cluster item, set ++ * d_next to DC_CHAINED_ITEM, if the last one exists. ++ * If the current position is last item, go to right neighbor. ++ * Skip empty nodes. Note, that right neighbors may be not in ++ * the slum because of races. If so, make it dirty and ++ * convertible. 
++ */ ++static int next_item_dc_stat(flush_pos_t * pos) ++{ ++ int ret = 0; ++ int stop = 0; ++ znode *cur; ++ coord_t coord; ++ lock_handle lh; ++ lock_handle right_lock; ++ ++ assert("edward-1232", !node_is_empty(pos->coord.node)); ++ assert("edward-1014", ++ pos->coord.item_pos < coord_num_items(&pos->coord)); ++ assert("edward-1015", chaining_data_present(pos)); ++ assert("edward-1017", ++ item_convert_data(pos)->d_next == DC_INVALID_STATE); ++ ++ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER; ++ ++ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER) ++ return ret; ++ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1) ++ return ret; ++ ++ /* Check next slum item. ++ * Note, that it can not be killed by concurrent truncate, ++ * as the last one will want the lock held by us. ++ */ ++ init_lh(&right_lock); ++ cur = pos->coord.node; ++ ++ while (!stop) { ++ init_lh(&lh); ++ ret = reiser4_get_right_neighbor(&lh, ++ cur, ++ ZNODE_WRITE_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (ret) ++ break; ++ ret = zload(lh.node); ++ if (ret) { ++ done_lh(&lh); ++ break; ++ } ++ coord_init_before_first_item(&coord, lh.node); ++ ++ if (node_is_empty(lh.node)) { ++ znode_make_dirty(lh.node); ++ znode_set_convertible(lh.node); ++ stop = 0; ++ } else if (same_disk_cluster(&pos->coord, &coord)) { ++ ++ item_convert_data(pos)->d_next = DC_CHAINED_ITEM; ++ ++ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) { ++ /* ++ warning("edward-1024", ++ "next slum item mergeable, " ++ "but znode %p isn't dirty\n", ++ lh.node); ++ */ ++ znode_make_dirty(lh.node); ++ } ++ if (!znode_convertible(lh.node)) { ++ /* ++ warning("edward-1272", ++ "next slum item mergeable, " ++ "but znode %p isn't convertible\n", ++ lh.node); ++ */ ++ znode_set_convertible(lh.node); ++ } ++ stop = 1; ++ } else ++ stop = 1; ++ zrelse(lh.node); ++ done_lh(&right_lock); ++ copy_lh(&right_lock, &lh); ++ done_lh(&lh); ++ cur = right_lock.node; ++ } ++ done_lh(&right_lock); ++ ++ if (ret == -E_NO_NEIGHBOR) ++ ret = 0; ++ return ret; ++} ++ ++static int ++assign_convert_mode(struct convert_item_info * idata, ++ cryptcompress_write_mode_t * mode) ++{ ++ int result = 0; ++ ++ assert("edward-1025", idata != NULL); ++ ++ if (idata->flow.length) { ++ /* append or overwrite */ ++ switch (idata->d_cur) { ++ case DC_FIRST_ITEM: ++ case DC_CHAINED_ITEM: ++ *mode = CRC_OVERWRITE_ITEM; ++ break; ++ case DC_AFTER_CLUSTER: ++ *mode = CRC_APPEND_ITEM; ++ break; ++ default: ++ impossible("edward-1018", "wrong current item state"); ++ } ++ } else { ++ /* cut or invalidate */ ++ switch (idata->d_cur) { ++ case DC_FIRST_ITEM: ++ case DC_CHAINED_ITEM: ++ *mode = CRC_CUT_ITEM; ++ break; ++ case DC_AFTER_CLUSTER: ++ result = 1; ++ break; ++ default: ++ impossible("edward-1019", "wrong current item state"); ++ } ++ } ++ return result; ++} ++ ++/* plugin->u.item.f.convert */ ++/* write ctail in guessed mode */ ++int convert_ctail(flush_pos_t * pos) ++{ ++ int result; ++ int nr_items; ++ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM; ++ ++ assert("edward-1020", pos != NULL); ++ assert("edward-1213", coord_num_items(&pos->coord) != 0); ++ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID); ++ assert("edward-1258", ctail_ok(&pos->coord)); ++ assert("edward-261", pos->coord.node != NULL); ++ ++ nr_items = coord_num_items(&pos->coord); ++ if (!chaining_data_present(pos)) { ++ if (should_attach_convert_idata(pos)) { ++ /* attach convert item info */ ++ struct inode *inode; ++ ++ assert("edward-264", pos->child != NULL); ++ assert("edward-265", 
jnode_page(pos->child) != NULL); ++ assert("edward-266", ++ jnode_page(pos->child)->mapping != NULL); ++ ++ inode = jnode_page(pos->child)->mapping->host; ++ ++ assert("edward-267", inode != NULL); ++ ++ /* attach item convert info by child and put the latter */ ++ result = attach_convert_idata(pos, inode); ++ pos->child = NULL; ++ if (result == -E_REPEAT) { ++ /* jnode became clean, or there are no dirty ++ pages (nothing to update in disk cluster) */ ++ warning("edward-1021", ++ "convert_ctail: nothing to attach"); ++ return 0; ++ } ++ if (result != 0) ++ return result; ++ } else ++ /* unconvertible */ ++ return 0; ++ } else { ++ /* use old convert info */ ++ ++ struct convert_item_info *idata; ++ ++ idata = item_convert_data(pos); ++ ++ result = assign_convert_mode(idata, &mode); ++ if (result) { ++ /* disk cluster is over, ++ nothing to update anymore */ ++ detach_convert_idata(pos->sq); ++ return 0; ++ } ++ } ++ ++ assert("edward-433", chaining_data_present(pos)); ++ assert("edward-1022", ++ pos->coord.item_pos < coord_num_items(&pos->coord)); ++ ++ /* check if next item is of current disk cluster */ ++ result = next_item_dc_stat(pos); ++ if (result) { ++ detach_convert_idata(pos->sq); ++ return result; ++ } ++ result = do_convert_ctail(pos, mode); ++ if (result) { ++ detach_convert_idata(pos->sq); ++ return result; ++ } ++ switch (mode) { ++ case CRC_CUT_ITEM: ++ assert("edward-1214", item_convert_data(pos)->flow.length == 0); ++ assert("edward-1215", ++ coord_num_items(&pos->coord) == nr_items || ++ coord_num_items(&pos->coord) == nr_items - 1); ++ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM) ++ break; ++ if (coord_num_items(&pos->coord) != nr_items) { ++ /* the item was killed, no more chained items */ ++ detach_convert_idata(pos->sq); ++ if (!node_is_empty(pos->coord.node)) ++ /* make sure the next item will be scanned */ ++ coord_init_before_item(&pos->coord); ++ break; ++ } ++ case CRC_APPEND_ITEM: ++ assert("edward-434", item_convert_data(pos)->flow.length == 0); ++ detach_convert_idata(pos->sq); ++ break; ++ case CRC_OVERWRITE_ITEM: ++ if (coord_is_unprepped_ctail(&pos->coord)) { ++ /* convert unprepped ctail to prepped one */ ++ assert("edward-1259", ++ cluster_shift_ok(item_convert_data(pos)-> ++ cluster_shift)); ++ put_unaligned((d8)item_convert_data(pos)->cluster_shift, ++ &ctail_formatted_at(&pos->coord)-> ++ cluster_shift); ++ } ++ break; ++ } ++ return result; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.33/fs/reiser4/plugin/item/ctail.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/ctail.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,102 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Ctail items are fragments (or bodies) of special type to provide ++ optimal storage of encrypted and/or compressed files. 
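++ Each ctail item carries a fragment of one "disk cluster" - the unit
++ in which compressed data is stored on disk (see the definitions below).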
*/ ++ ++ ++#if !defined( __FS_REISER4_CTAIL_H__ ) ++#define __FS_REISER4_CTAIL_H__ ++ ++/* Disk format of ctail item */ ++typedef struct ctail_item_format { ++ /* packed shift; ++ if its value is different from UCTAIL_SHIFT (see below), then ++ size of disk cluster is calculated as (1 << cluster_shift) */ ++ d8 cluster_shift; ++ /* ctail body */ ++ d8 body[0]; ++} __attribute__ ((packed)) ctail_item_format; ++ ++/* "Unprepped" disk cluster is represented by a single ctail item ++ with the following "magic" attributes: */ ++/* "magic" cluster_shift */ ++#define UCTAIL_SHIFT 0xff ++/* How many units unprepped ctail item has */ ++#define UCTAIL_NR_UNITS 1 ++ ++/* The following is a set of various item states in a disk cluster. ++ Disk cluster is a set of items whose keys belong to the interval ++ [dc_key , dc_key + disk_cluster_size - 1] */ ++typedef enum { ++ DC_INVALID_STATE = 0, ++ DC_FIRST_ITEM = 1, ++ DC_CHAINED_ITEM = 2, ++ DC_AFTER_CLUSTER = 3 ++} dc_item_stat; ++ ++/* ctail-specific extension. ++ In particular this describes parameters of disk cluster an item belongs to */ ++struct ctail_coord_extension { ++ int shift; /* this contains cluster_shift extracted from ++ ctail_item_format (above), or UCTAIL_SHIFT ++ (the last one is the "magic" of unprepped disk clusters)*/ ++ int dsize; /* size of a prepped disk cluster */ ++ int ncount; /* count of nodes occupied by a disk cluster */ ++}; ++ ++struct cut_list; ++ ++/* plugin->item.b.* */ ++int can_contain_key_ctail(const coord_t *, const reiser4_key *, ++ const reiser4_item_data *); ++int mergeable_ctail(const coord_t * p1, const coord_t * p2); ++pos_in_node_t nr_units_ctail(const coord_t * coord); ++int estimate_ctail(const coord_t * coord, const reiser4_item_data * data); ++void print_ctail(const char *prefix, coord_t * coord); ++lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *); ++ ++int paste_ctail(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG); ++int init_ctail(coord_t *, coord_t *, reiser4_item_data *); ++int can_shift_ctail(unsigned free_space, coord_t * coord, ++ znode * target, shift_direction pend, unsigned *size, ++ unsigned want); ++void copy_units_ctail(coord_t * target, coord_t * source, unsigned from, ++ unsigned count, shift_direction where_is_free_space, ++ unsigned free_space); ++int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ carry_cut_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ carry_kill_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int ctail_ok(const coord_t * coord); ++int check_ctail(const coord_t * coord, const char **error); ++ ++/* plugin->u.item.s.* */ ++int read_ctail(struct file *, flow_t *, hint_t *); ++int readpage_ctail(void *, struct page *); ++int readpages_ctail(struct file *, struct address_space *, struct list_head *); ++reiser4_key *append_key_ctail(const coord_t *, reiser4_key *); ++int create_hook_ctail(const coord_t * coord, void *arg); ++int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t, ++ carry_kill_data *); ++int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *); ++ ++/* plugin->u.item.f */ ++int utmost_child_ctail(const coord_t *, sideof, jnode **); ++int scan_ctail(flush_scan *); ++int convert_ctail(flush_pos_t *); ++size_t inode_scaled_cluster_size(struct inode *); ++ ++#endif /* __FS_REISER4_CTAIL_H__ */ ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent.c linux-2.6.33/fs/reiser4/plugin/item/extent.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/extent.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,197 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../key.h" ++#include "../../super.h" ++#include "../../carry.h" ++#include "../../inode.h" ++#include "../../page_cache.h" ++#include "../../flush.h" ++#include "../object.h" ++ ++/* prepare structure reiser4_item_data. It is used to put one extent unit into tree */ ++/* Audited by: green(2002.06.13) */ ++reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, ++ int nr_extents) ++{ ++ data->data = ext_unit; ++ /* data->data is kernel space */ ++ data->user = 0; ++ data->length = sizeof(reiser4_extent) * nr_extents; ++ data->arg = NULL; ++ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID); ++ return data; ++} ++ ++/* how many bytes are addressed by @nr first extents of the extent item */ ++reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr) ++{ ++ pos_in_node_t i; ++ reiser4_block_nr blocks; ++ reiser4_extent *ext; ++ ++ ext = item_body_by_coord(coord); ++ assert("vs-263", nr <= nr_units_extent(coord)); ++ ++ blocks = 0; ++ for (i = 0; i < nr; i++, ext++) { ++ blocks += extent_get_width(ext); ++ } ++ ++ return blocks * current_blocksize; ++} ++ ++extent_state state_of_extent(reiser4_extent * ext) ++{ ++ switch ((int)extent_get_start(ext)) { ++ case 0: ++ return HOLE_EXTENT; ++ case 1: ++ return UNALLOCATED_EXTENT; ++ default: ++ break; ++ } ++ return ALLOCATED_EXTENT; ++} ++ ++int extent_is_unallocated(const coord_t * item) ++{ ++ assert("jmacd-5133", item_is_extent(item)); ++ ++ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT; ++} ++ ++/* set extent's start and width */ ++void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start, ++ reiser4_block_nr width) ++{ ++ extent_set_start(ext, start); ++ extent_set_width(ext, width); ++} ++ ++/** ++ * reiser4_replace_extent - replace extent and paste 1 or 2 after it ++ * @un_extent: coordinate of extent to be overwritten ++ * @lh: need better comment ++ * @key: need better comment ++ * @exts_to_add: data prepared for insertion into tree ++ * @replace: need better comment ++ * @flags: need better comment ++ * @return_insert_position: need better comment ++ * ++ * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. If ++ * @return_inserted_position is 1 - @un_extent and @lh are returned set to ++ * first of newly inserted units, if it is 0 - @un_extent and @lh are returned ++ * set to extent which was overwritten. 
++ */ ++int reiser4_replace_extent(struct replace_handle *h, ++ int return_inserted_position) ++{ ++ int result; ++ znode *orig_znode; ++ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */ ++ ++ assert("vs-990", coord_is_existing_unit(h->coord)); ++ assert("vs-1375", znode_is_write_locked(h->coord->node)); ++ assert("vs-1426", extent_get_width(&h->overwrite) != 0); ++ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0); ++ assert("vs-1427", ergo(h->nr_new_extents == 2, ++ extent_get_width(&h->new_extents[1]) != 0)); ++ ++ /* compose structure for paste */ ++ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents); ++ ++ coord_dup(&h->coord_after, h->coord); ++ init_lh(&h->lh_after); ++ copy_lh(&h->lh_after, h->lh); ++ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK); ++ reiser4_tap_monitor(&h->watch); ++ ++ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord)); ++ orig_znode = h->coord->node; ++ ++#if REISER4_DEBUG ++ /* make sure that key is set properly */ ++ unit_key_by_coord(h->coord, &h->tmp); ++ set_key_offset(&h->tmp, ++ get_key_offset(&h->tmp) + ++ extent_get_width(&h->overwrite) * current_blocksize); ++ assert("vs-1080", keyeq(&h->tmp, &h->paste_key)); ++#endif ++ ++ /* set insert point after unit to be replaced */ ++ h->coord->between = AFTER_UNIT; ++ ++ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL, ++ &h->paste_key, &h->item, h->flags); ++ if (!result) { ++ /* now we have to replace the unit after which new units were ++ inserted. Its position is tracked by @watch */ ++ reiser4_extent *ext; ++ znode *node; ++ ++ node = h->coord_after.node; ++ if (node != orig_znode) { ++ coord_clear_iplug(&h->coord_after); ++ result = zload(node); ++ } ++ ++ if (likely(!result)) { ++ ext = extent_by_coord(&h->coord_after); ++ ++ assert("vs-987", znode_is_loaded(node)); ++ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext))); ++ ++ /* overwrite extent unit */ ++ memcpy(ext, &h->overwrite, sizeof(reiser4_extent)); ++ znode_make_dirty(node); ++ ++ if (node != orig_znode) ++ zrelse(node); ++ ++ if (return_inserted_position == 0) { ++ /* coord and lh are to be set to overwritten ++ extent */ ++ assert("vs-1662", ++ WITH_DATA(node, !memcmp(&h->overwrite, ++ extent_by_coord( ++ &h->coord_after), ++ sizeof(reiser4_extent)))); ++ ++ *h->coord = h->coord_after; ++ done_lh(h->lh); ++ copy_lh(h->lh, &h->lh_after); ++ } else { ++ /* h->coord and h->lh are to be set to first of ++ inserted units */ ++ assert("vs-1663", ++ WITH_DATA(h->coord->node, ++ !memcmp(&h->new_extents[0], ++ extent_by_coord(h->coord), ++ sizeof(reiser4_extent)))); ++ assert("vs-1664", h->lh->node == h->coord->node); ++ } ++ } ++ } ++ reiser4_tap_done(&h->watch); ++ ++ return result; ++} ++ ++lock_handle *znode_lh(znode *node) ++{ ++ assert("vs-1371", znode_is_write_locked(node)); ++ assert("vs-1372", znode_is_wlocked_once(node)); ++ return list_entry(node->lock.owners.next, lock_handle, owners_link); ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.33/fs/reiser4/plugin/item/extent_file_ops.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/extent_file_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1453 @@ ++/* COPYRIGHT 2001, 2002, 
2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../inode.h" ++#include "../../page_cache.h" ++#include "../object.h" ++ ++#include <linux/quotaops.h> ++#include <linux/swap.h> ++ ++static inline reiser4_extent *ext_by_offset(const znode *node, int offset) ++{ ++ reiser4_extent *ext; ++ ++ ext = (reiser4_extent *) (zdata(node) + offset); ++ return ext; ++} ++ ++/** ++ * check_uf_coord - verify coord extension ++ * @uf_coord: ++ * @key: ++ * ++ * Makes sure that all fields of @uf_coord are set properly. If @key is ++ * specified - check whether @uf_coord is set correspondingly. ++ */ ++static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key) ++{ ++#if REISER4_DEBUG ++ const coord_t *coord; ++ const struct extent_coord_extension *ext_coord; ++ reiser4_extent *ext; ++ ++ coord = &uf_coord->coord; ++ ext_coord = &uf_coord->extension.extent; ++ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset); ++ ++ assert("", ++ WITH_DATA(coord->node, ++ (uf_coord->valid == 1 && ++ coord_is_iplug_set(coord) && ++ item_is_extent(coord) && ++ ext_coord->nr_units == nr_units_extent(coord) && ++ ext == extent_by_coord(coord) && ++ ext_coord->width == extent_get_width(ext) && ++ coord->unit_pos < ext_coord->nr_units && ++ ext_coord->pos_in_unit < ext_coord->width && ++ memcmp(ext, &ext_coord->extent, ++ sizeof(reiser4_extent)) == 0))); ++ if (key) { ++ reiser4_key coord_key; ++ ++ unit_key_by_coord(&uf_coord->coord, &coord_key); ++ set_key_offset(&coord_key, ++ get_key_offset(&coord_key) + ++ (uf_coord->extension.extent. ++ pos_in_unit << PAGE_CACHE_SHIFT)); ++ assert("", keyeq(key, &coord_key)); ++ } ++#endif ++} ++ ++static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord) ++{ ++ check_uf_coord(uf_coord, NULL); ++ ++ return ext_by_offset(uf_coord->coord.node, ++ uf_coord->extension.extent.ext_offset); ++} ++ ++#if REISER4_DEBUG ++ ++/** ++ * offset_is_in_unit ++ * ++ * ++ * ++ */ ++/* return 1 if offset @off is inside of extent unit pointed to by @coord. 
Set ++ pos_in_unit inside of unit correspondingly */ ++static int offset_is_in_unit(const coord_t *coord, loff_t off) ++{ ++ reiser4_key unit_key; ++ __u64 unit_off; ++ reiser4_extent *ext; ++ ++ ext = extent_by_coord(coord); ++ ++ unit_key_extent(coord, &unit_key); ++ unit_off = get_key_offset(&unit_key); ++ if (off < unit_off) ++ return 0; ++ if (off >= (unit_off + (current_blocksize * extent_get_width(ext)))) ++ return 0; ++ return 1; ++} ++ ++static int ++coord_matches_key_extent(const coord_t * coord, const reiser4_key * key) ++{ ++ reiser4_key item_key; ++ ++ assert("vs-771", coord_is_existing_unit(coord)); ++ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key))); ++ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key))); ++ ++ return offset_is_in_unit(coord, get_key_offset(key)); ++} ++ ++#endif ++ ++/** ++ * can_append - ++ * @key: ++ * @coord: ++ * ++ * Returns 1 if @key is equal to an append key of item @coord is set to ++ */ ++static int can_append(const reiser4_key *key, const coord_t *coord) ++{ ++ reiser4_key append_key; ++ ++ return keyeq(key, append_key_extent(coord, &append_key)); ++} ++ ++/** ++ * append_hole ++ * @coord: ++ * @lh: ++ * @key: ++ * ++ */ ++static int append_hole(coord_t *coord, lock_handle *lh, ++ const reiser4_key *key) ++{ ++ reiser4_key append_key; ++ reiser4_block_nr hole_width; ++ reiser4_extent *ext, new_ext; ++ reiser4_item_data idata; ++ ++ /* last item of file may have to be appended with hole */ ++ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL); ++ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID); ++ ++ /* key of first byte which is not addressed by this extent */ ++ append_key_extent(coord, &append_key); ++ ++ assert("", keyle(&append_key, key)); ++ ++ /* ++ * extent item has to be appended with hole. Calculate length of that ++ * hole ++ */ ++ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) + ++ current_blocksize - 1) >> current_blocksize_bits); ++ assert("vs-954", hole_width > 0); ++ ++ /* set coord after last unit */ ++ coord_init_after_item_end(coord); ++ ++ /* get last extent in the item */ ++ ext = extent_by_coord(coord); ++ if (state_of_extent(ext) == HOLE_EXTENT) { ++ /* ++ * last extent of a file is hole extent. Widen that extent by ++ * @hole_width blocks. 
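++ * (For example, a trailing hole of width 3 widened by
++ * @hole_width == 2 becomes a single hole unit of width 5.)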
Note that we do not worry about ++ * overflowing - extent width is 64 bits ++ */ ++ reiser4_set_extent(ext, HOLE_EXTENT_START, ++ extent_get_width(ext) + hole_width); ++ znode_make_dirty(coord->node); ++ return 0; ++ } ++ ++ /* append last item of the file with hole extent unit */ ++ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT || ++ state_of_extent(ext) == UNALLOCATED_EXTENT)); ++ ++ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); ++ init_new_extent(&idata, &new_ext, 1); ++ return insert_into_item(coord, lh, &append_key, &idata, 0); ++} ++ ++/** ++ * check_jnodes ++ * @twig: longterm locked twig node ++ * @key: ++ * ++ */ ++static void check_jnodes(znode *twig, const reiser4_key *key, int count) ++{ ++#if REISER4_DEBUG ++ coord_t c; ++ reiser4_key node_key, jnode_key; ++ ++ jnode_key = *key; ++ ++ assert("", twig != NULL); ++ assert("", znode_get_level(twig) == TWIG_LEVEL); ++ assert("", znode_is_write_locked(twig)); ++ ++ zload(twig); ++ /* get the smallest key in twig node */ ++ coord_init_first_unit(&c, twig); ++ unit_key_by_coord(&c, &node_key); ++ assert("", keyle(&node_key, &jnode_key)); ++ ++ coord_init_last_unit(&c, twig); ++ unit_key_by_coord(&c, &node_key); ++ if (item_plugin_by_coord(&c)->s.file.append_key) ++ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key); ++ set_key_offset(&jnode_key, ++ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1); ++ assert("", keylt(&jnode_key, &node_key)); ++ zrelse(twig); ++#endif ++} ++ ++/** ++ * append_last_extent - append last file item ++ * @uf_coord: coord to start insertion from ++ * @jnodes: array of jnodes ++ * @count: number of jnodes in the array ++ * ++ * There is already at least one extent item of file @inode in the tree. Append ++ * the last of them with unallocated extent unit of width @count. Assign ++ * fake block numbers to jnodes corresponding to the inserted extent. ++ */ ++static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode **jnodes, int count) ++{ ++ int result; ++ reiser4_extent new_ext; ++ reiser4_item_data idata; ++ coord_t *coord; ++ struct extent_coord_extension *ext_coord; ++ reiser4_extent *ext; ++ reiser4_block_nr block; ++ jnode *node; ++ int i; ++ ++ coord = &uf_coord->coord; ++ ext_coord = &uf_coord->extension.extent; ++ ext = ext_by_ext_coord(uf_coord); ++ ++ /* check correctness of position in the item */ ++ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord)); ++ assert("vs-1311", coord->between == AFTER_UNIT); ++ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1); ++ ++ if (!can_append(key, coord)) { ++ /* hole extent has to be inserted */ ++ result = append_hole(coord, uf_coord->lh, key); ++ uf_coord->valid = 0; ++ return result; ++ } ++ ++ if (count == 0) ++ return 0; ++ ++ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE); ++ ++ result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host, ++ count); ++ BUG_ON(result != 0); ++ ++ switch (state_of_extent(ext)) { ++ case UNALLOCATED_EXTENT: ++ /* ++ * last extent unit of the file is unallocated one. 
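++ * (E.g. an unallocated unit of width 8 extended by @count == 3
++ * becomes a single unit of width 11.)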
Increase ++ * its width by @count ++ */ ++ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, ++ extent_get_width(ext) + count); ++ znode_make_dirty(coord->node); ++ ++ /* update coord extension */ ++ ext_coord->width += count; ++ ON_DEBUG(extent_set_width ++ (&uf_coord->extension.extent.extent, ++ ext_coord->width)); ++ break; ++ ++ case HOLE_EXTENT: ++ case ALLOCATED_EXTENT: ++ /* ++ * last extent unit of the file is either hole or allocated ++ * one. Append one unallocated extent of width @count ++ */ ++ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); ++ init_new_extent(&idata, &new_ext, 1); ++ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0); ++ uf_coord->valid = 0; ++ if (result) ++ return result; ++ break; ++ ++ default: ++ return RETERR(-EIO); ++ } ++ ++ /* ++ * make sure that we hold long term locked twig node containing all ++ * jnodes we are about to capture ++ */ ++ check_jnodes(uf_coord->lh->node, key, count); ++ ++ /* ++ * assign fake block numbers to all jnodes. FIXME: make sure whether ++ * twig node containing inserted extent item is locked ++ */ ++ block = fake_blocknr_unformatted(count); ++ for (i = 0; i < count; i ++, block ++) { ++ node = jnodes[i]; ++ spin_lock_jnode(node); ++ JF_SET(node, JNODE_CREATED); ++ jnode_set_block(node, &block); ++ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ } ++ return count; ++} ++ ++/** ++ * insert_first_hole - insert hole extent into tree ++ * @coord: ++ * @lh: ++ * @key: ++ * ++ * ++ */ ++static int insert_first_hole(coord_t *coord, lock_handle *lh, ++ const reiser4_key *key) ++{ ++ reiser4_extent new_ext; ++ reiser4_item_data idata; ++ reiser4_key item_key; ++ reiser4_block_nr hole_width; ++ ++ /* @coord must be set for inserting of new item */ ++ assert("vs-711", coord_is_between_items(coord)); ++ ++ item_key = *key; ++ set_key_offset(&item_key, 0ull); ++ ++ hole_width = ((get_key_offset(key) + current_blocksize - 1) >> ++ current_blocksize_bits); ++ assert("vs-710", hole_width > 0); ++ ++ /* compose body of hole extent and insert item into tree */ ++ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); ++ init_new_extent(&idata, &new_ext, 1); ++ return insert_extent_by_coord(coord, &idata, &item_key, lh); ++} ++ ++ ++/** ++ * insert_first_extent - insert first file item ++ * @inode: inode of file ++ * @uf_coord: coord to start insertion from ++ * @jnodes: array of jnodes ++ * @count: number of jnodes in the array ++ * ++ * There are no items of file @inode in the tree yet. Insert unallocated extent ++ * of width @count into the tree, or a hole extent if the write does not start ++ * at the beginning of the file. Assign fake block numbers to jnodes ++ * corresponding to the inserted unallocated extent. Returns number of jnodes ++ * or error code. 
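++ * E.g. if the first write starts at offset 8192 with 4K blocks, a hole
++ * extent of width 2 (covering bytes 0..8191) is inserted instead.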
++ */ ++static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode **jnodes, int count, ++ struct inode *inode) ++{ ++ int result; ++ int i; ++ reiser4_extent new_ext; ++ reiser4_item_data idata; ++ reiser4_block_nr block; ++ struct unix_file_info *uf_info; ++ jnode *node; ++ ++ /* first extent insertion starts at leaf level */ ++ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL); ++ assert("vs-711", coord_is_between_items(&uf_coord->coord)); ++ ++ if (get_key_offset(key) != 0) { ++ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key); ++ uf_coord->valid = 0; ++ uf_info = unix_file_inode_data(inode); ++ ++ /* ++ * first item insertion is only possible when writing to empty ++ * file or performing tail conversion ++ */ ++ assert("", (uf_info->container == UF_CONTAINER_EMPTY || ++ (reiser4_inode_get_flag(inode, ++ REISER4_PART_MIXED) && ++ reiser4_inode_get_flag(inode, ++ REISER4_PART_IN_CONV)))); ++ /* if file was empty - update its state */ ++ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) ++ uf_info->container = UF_CONTAINER_EXTENTS; ++ return result; ++ } ++ ++ if (count == 0) ++ return 0; ++ ++ result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host, ++ count); ++ BUG_ON(result != 0); ++ ++ /* ++ * prepare for tree modification: compose body of item and item data ++ * structure needed for insertion ++ */ ++ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); ++ init_new_extent(&idata, &new_ext, 1); ++ ++ /* insert extent item into the tree */ ++ result = insert_extent_by_coord(&uf_coord->coord, &idata, key, ++ uf_coord->lh); ++ if (result) ++ return result; ++ ++ /* ++ * make sure that we hold long term locked twig node containing all ++ * jnodes we are about to capture ++ */ ++ check_jnodes(uf_coord->lh->node, key, count); ++ /* ++ * assign fake block numbers to all jnodes, capture and mark them dirty ++ */ ++ block = fake_blocknr_unformatted(count); ++ for (i = 0; i < count; i ++, block ++) { ++ node = jnodes[i]; ++ spin_lock_jnode(node); ++ JF_SET(node, JNODE_CREATED); ++ jnode_set_block(node, &block); ++ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ } ++ ++ /* ++ * invalidate coordinate, research must be performed to continue ++ * because write will continue on twig level ++ */ ++ uf_coord->valid = 0; ++ return count; ++} ++ ++/** ++ * plug_hole - replace hole extent with unallocated and holes ++ * @uf_coord: ++ * @key: ++ * @node: ++ * @h: structure containing coordinate, lock handle, key, etc ++ * ++ * Creates an unallocated extent of width 1 within a hole. In worst case two ++ * additional extents can be created. 
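++ * E.g. plugging position 5 of a hole of width 9 replaces the single
++ * unit with three units: hole(5), unallocated(1), hole(3).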
++ */ ++static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how) ++{ ++ struct replace_handle rh; ++ reiser4_extent *ext; ++ reiser4_block_nr width, pos_in_unit; ++ coord_t *coord; ++ struct extent_coord_extension *ext_coord; ++ int return_inserted_position; ++ ++ check_uf_coord(uf_coord, key); ++ ++ rh.coord = coord_by_uf_coord(uf_coord); ++ rh.lh = uf_coord->lh; ++ rh.flags = 0; ++ ++ coord = coord_by_uf_coord(uf_coord); ++ ext_coord = ext_coord_by_uf_coord(uf_coord); ++ ext = ext_by_ext_coord(uf_coord); ++ ++ width = ext_coord->width; ++ pos_in_unit = ext_coord->pos_in_unit; ++ ++ *how = 0; ++ if (width == 1) { ++ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1); ++ znode_make_dirty(coord->node); ++ /* update uf_coord */ ++ ON_DEBUG(ext_coord->extent = *ext); ++ *how = 1; ++ return 0; ++ } else if (pos_in_unit == 0) { ++ /* we deal with first element of extent */ ++ if (coord->unit_pos) { ++ /* there is an extent to the left */ ++ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) { ++ /* ++ * left neighboring unit is an unallocated ++ * extent. Increase its width and decrease ++ * width of hole ++ */ ++ extent_set_width(ext - 1, ++ extent_get_width(ext - 1) + 1); ++ extent_set_width(ext, width - 1); ++ znode_make_dirty(coord->node); ++ ++ /* update coord extension */ ++ coord->unit_pos--; ++ ext_coord->width = extent_get_width(ext - 1); ++ ext_coord->pos_in_unit = ext_coord->width - 1; ++ ext_coord->ext_offset -= sizeof(reiser4_extent); ++ ON_DEBUG(ext_coord->extent = ++ *extent_by_coord(coord)); ++ *how = 2; ++ return 0; ++ } ++ } ++ /* extent for replace */ ++ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1); ++ /* extent to be inserted */ ++ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START, ++ width - 1); ++ rh.nr_new_extents = 1; ++ ++ /* have reiser4_replace_extent to return with @coord and ++ @uf_coord->lh set to unit which was replaced */ ++ return_inserted_position = 0; ++ *how = 3; ++ } else if (pos_in_unit == width - 1) { ++ /* we deal with last element of extent */ ++ if (coord->unit_pos < nr_units_extent(coord) - 1) { ++ /* there is an extent unit to the right */ ++ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) { ++ /* ++ * right neighboring unit is an unallocated ++ * extent. 
Increase its width and decrease ++ * width of hole ++ */ ++ extent_set_width(ext + 1, ++ extent_get_width(ext + 1) + 1); ++ extent_set_width(ext, width - 1); ++ znode_make_dirty(coord->node); ++ ++ /* update coord extension */ ++ coord->unit_pos++; ++ ext_coord->width = extent_get_width(ext + 1); ++ ext_coord->pos_in_unit = 0; ++ ext_coord->ext_offset += sizeof(reiser4_extent); ++ ON_DEBUG(ext_coord->extent = ++ *extent_by_coord(coord)); ++ *how = 4; ++ return 0; ++ } ++ } ++ /* extent for replace */ ++ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1); ++ /* extent to be inserted */ ++ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, ++ 1); ++ rh.nr_new_extents = 1; ++ ++ /* have reiser4_replace_extent to return with @coord and ++ @uf_coord->lh set to unit which was inserted */ ++ return_inserted_position = 1; ++ *how = 5; ++ } else { ++ /* extent for replace */ ++ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, ++ pos_in_unit); ++ /* extents to be inserted */ ++ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, ++ 1); ++ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START, ++ width - pos_in_unit - 1); ++ rh.nr_new_extents = 2; ++ ++ /* have reiser4_replace_extent to return with @coord and ++ @uf_coord->lh set to first of units which were inserted */ ++ return_inserted_position = 1; ++ *how = 6; ++ } ++ unit_key_by_coord(coord, &rh.paste_key); ++ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) + ++ extent_get_width(&rh.overwrite) * current_blocksize); ++ ++ uf_coord->valid = 0; ++ return reiser4_replace_extent(&rh, return_inserted_position); ++} ++ ++/** ++ * overwrite_one_block - ++ * @uf_coord: ++ * @key: ++ * @node: ++ * ++ * If @node corresponds to hole extent - create unallocated extent for it and ++ * assign fake block number. If @node corresponds to allocated extent - assign ++ * block number of jnode ++ */ ++static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode *node, int *hole_plugged) ++{ ++ int result; ++ struct extent_coord_extension *ext_coord; ++ reiser4_extent *ext; ++ reiser4_block_nr block; ++ int how; ++ ++ assert("vs-1312", uf_coord->coord.between == AT_UNIT); ++ ++ result = 0; ++ ext_coord = ext_coord_by_uf_coord(uf_coord); ++ ext = ext_by_ext_coord(uf_coord); ++ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT); ++ ++ switch (state_of_extent(ext)) { ++ case ALLOCATED_EXTENT: ++ block = extent_get_start(ext) + ext_coord->pos_in_unit; ++ break; ++ ++ case HOLE_EXTENT: ++ result = vfs_dq_alloc_block_nodirty(mapping_jnode(node)->host, ++ 1); ++ BUG_ON(result != 0); ++ result = plug_hole(uf_coord, key, &how); ++ if (result) ++ return result; ++ block = fake_blocknr_unformatted(1); ++ if (hole_plugged) ++ *hole_plugged = 1; ++ JF_SET(node, JNODE_CREATED); ++ break; ++ ++ default: ++ return RETERR(-EIO); ++ } ++ ++ jnode_set_block(node, &block); ++ return 0; ++} ++ ++/** ++ * move_coord - move coordinate forward ++ * @uf_coord: ++ * ++ * Move coordinate one data block pointer forward. Return 1 if coord is set to ++ * the last one already or is invalid. ++ */ ++static int move_coord(uf_coord_t *uf_coord) ++{ ++ struct extent_coord_extension *ext_coord; ++ ++ if (uf_coord->valid == 0) ++ return 1; ++ ext_coord = &uf_coord->extension.extent; ++ ext_coord->pos_in_unit ++; ++ if (ext_coord->pos_in_unit < ext_coord->width) ++ /* coordinate moved within the unit */ ++ return 0; ++ ++ /* end of unit is reached. 
Try to move to next unit */ ++ ext_coord->pos_in_unit = 0; ++ uf_coord->coord.unit_pos ++; ++ if (uf_coord->coord.unit_pos < ext_coord->nr_units) { ++ /* coordinate moved to next unit */ ++ ext_coord->ext_offset += sizeof(reiser4_extent); ++ ext_coord->width = ++ extent_get_width(ext_by_offset ++ (uf_coord->coord.node, ++ ext_coord->ext_offset)); ++ ON_DEBUG(ext_coord->extent = ++ *ext_by_offset(uf_coord->coord.node, ++ ext_coord->ext_offset)); ++ return 0; ++ } ++ /* end of item is reached */ ++ uf_coord->valid = 0; ++ return 1; ++} ++ ++/** ++ * overwrite_extent - ++ * @uf_coord: ++ * @key: ++ * @jnodes: ++ * @count: ++ * @plugged_hole: ++ * ++ * Returns number of handled jnodes. ++ */ ++static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key, ++ jnode **jnodes, int count, int *plugged_hole) ++{ ++ int result; ++ reiser4_key k; ++ int i; ++ jnode *node; ++ ++ k = *key; ++ for (i = 0; i < count; i ++) { ++ node = jnodes[i]; ++ if (*jnode_get_block(node) == 0) { ++ result = overwrite_one_block(uf_coord, &k, node, plugged_hole); ++ if (result) ++ return result; ++ } ++ /* ++ * make sure that we hold long term locked twig node containing ++ * all jnodes we are about to capture ++ */ ++ check_jnodes(uf_coord->lh->node, &k, 1); ++ /* ++ * capture all jnodes and mark them dirty; block numbers were ++ * assigned by overwrite_one_block() above where needed ++ */ ++ spin_lock_jnode(node); ++ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); ++ BUG_ON(result != 0); ++ jnode_make_dirty_locked(node); ++ spin_unlock_jnode(node); ++ ++ if (uf_coord->valid == 0) ++ return i + 1; ++ ++ check_uf_coord(uf_coord, &k); ++ ++ if (move_coord(uf_coord)) { ++ /* ++ * failed to move to the next node pointer. Either end ++ * of file or end of twig node is reached. In the latter ++ * case we might go to the right neighbor. ++ */ ++ uf_coord->valid = 0; ++ return i + 1; ++ } ++ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE); ++ } ++ ++ return count; ++} ++ ++/** ++ * reiser4_update_extent ++ * @inode: ++ * @node: ++ * @pos: ++ * @plugged_hole: ++ * ++ */ ++int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos, ++ int *plugged_hole) ++{ ++ int result; ++ znode *loaded; ++ uf_coord_t uf_coord; ++ coord_t *coord; ++ lock_handle lh; ++ reiser4_key key; ++ ++ assert("", reiser4_lock_counters()->d_refs == 0); ++ ++ key_by_inode_and_offset_common(inode, pos, &key); ++ ++ init_uf_coord(&uf_coord, &lh); ++ coord = &uf_coord.coord; ++ result = find_file_item_nohint(coord, &lh, &key, ++ ZNODE_WRITE_LOCK, inode); ++ if (IS_CBKERR(result)) { ++ assert("", reiser4_lock_counters()->d_refs == 0); ++ return result; ++ } ++ ++ result = zload(coord->node); ++ BUG_ON(result != 0); ++ loaded = coord->node; ++ ++ if (coord->between == AFTER_UNIT) { ++ /* ++ * append existing extent item with unallocated extent of width ++ * nr_jnodes ++ */ ++ init_coord_extension_extent(&uf_coord, ++ get_key_offset(&key)); ++ result = append_last_extent(&uf_coord, &key, ++ &node, 1); ++ } else if (coord->between == AT_UNIT) { ++ /* ++ * overwrite ++ * not optimal yet. Will be optimized if new write will show ++ * performance win. ++ */ ++ init_coord_extension_extent(&uf_coord, ++ get_key_offset(&key)); ++ result = overwrite_extent(&uf_coord, &key, ++ &node, 1, plugged_hole); ++ } else { ++ /* ++ * there are no items of this file in the tree yet. 
Create ++ * first item of the file inserting one unallocated extent of ++ * width nr_jnodes ++ */ ++ result = insert_first_extent(&uf_coord, &key, &node, 1, inode); ++ } ++ assert("", result == 1 || result < 0); ++ zrelse(loaded); ++ done_lh(&lh); ++ assert("", reiser4_lock_counters()->d_refs == 0); ++ return (result == 1) ? 0 : result; ++} ++ ++/** ++ * update_extents ++ * @file: ++ * @inode: ++ * @jnodes: ++ * @count: ++ * @pos: ++ * ++ */ ++static int update_extents(struct file *file, struct inode *inode, ++ jnode **jnodes, int count, loff_t pos) ++{ ++ struct hint hint; ++ reiser4_key key; ++ int result; ++ znode *loaded; ++ ++ result = load_file_hint(file, &hint); ++ BUG_ON(result != 0); ++ ++ if (count != 0) ++ /* ++ * count == 0 is special case: expanding truncate ++ */ ++ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT; ++ key_by_inode_and_offset_common(inode, pos, &key); ++ ++ assert("", reiser4_lock_counters()->d_refs == 0); ++ ++ do { ++ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode); ++ if (IS_CBKERR(result)) { ++ assert("", reiser4_lock_counters()->d_refs == 0); ++ return result; ++ } ++ ++ result = zload(hint.ext_coord.coord.node); ++ BUG_ON(result != 0); ++ loaded = hint.ext_coord.coord.node; ++ ++ if (hint.ext_coord.coord.between == AFTER_UNIT) { ++ /* ++ * append existing extent item with unallocated extent ++ * of width nr_jnodes ++ */ ++ if (hint.ext_coord.valid == 0) ++ /* NOTE: get statistics on this */ ++ init_coord_extension_extent(&hint.ext_coord, ++ get_key_offset(&key)); ++ result = append_last_extent(&hint.ext_coord, &key, ++ jnodes, count); ++ } else if (hint.ext_coord.coord.between == AT_UNIT) { ++ /* ++ * overwrite ++ * not optimal yet. Will be optimized if new write will ++ * show performance win. ++ */ ++ if (hint.ext_coord.valid == 0) ++ /* NOTE: get statistics on this */ ++ init_coord_extension_extent(&hint.ext_coord, ++ get_key_offset(&key)); ++ result = overwrite_extent(&hint.ext_coord, &key, ++ jnodes, count, NULL); ++ } else { ++ /* ++ * there are no items of this file in the tree ++ * yet. Create first item of the file inserting one ++ * unallocated extent of width nr_jnodes ++ */ ++ result = insert_first_extent(&hint.ext_coord, &key, ++ jnodes, count, inode); ++ } ++ zrelse(loaded); ++ if (result < 0) { ++ done_lh(hint.ext_coord.lh); ++ break; ++ } ++ ++ jnodes += result; ++ count -= result; ++ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE); ++ ++ /* seal and unlock znode */ ++ if (hint.ext_coord.valid) ++ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK); ++ else ++ reiser4_unset_hint(&hint); ++ ++ } while (count > 0); ++ ++ save_file_hint(file, &hint); ++ assert("", reiser4_lock_counters()->d_refs == 0); ++ return result; ++} ++ ++/** ++ * write_extent_reserve_space - reserve space for extent write operation ++ * @inode: ++ * ++ * Estimates and reserves space which may be required for writing ++ * WRITE_GRANULARITY pages of file. ++ */ ++static int write_extent_reserve_space(struct inode *inode) ++{ ++ __u64 count; ++ reiser4_tree *tree; ++ ++ /* ++ * to write WRITE_GRANULARITY pages to a file by extents we have to ++ * reserve disk space for: ++ ++ * 1. find_file_item may have to insert empty node to the tree (empty ++ * leaf node between two extent items). This requires 1 block and ++ * number of blocks which are necessary to perform insertion of an ++ * internal item into twig level. ++ ++ * 2. 
for each of written pages there might be needed 1 block and ++ * number of blocks which might be necessary to perform insertion of or ++ * paste to an extent item. ++ ++ * 3. stat data update ++ */ ++ tree = reiser4_tree_by_inode(inode); ++ count = estimate_one_insert_item(tree) + ++ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) + ++ estimate_one_insert_item(tree); ++ grab_space_enable(); ++ return reiser4_grab_space(count, 0 /* flags */); ++} ++ ++/* ++ * filemap_copy_from_user no longer exists in generic code, because it ++ * is deadlocky (copying from user while holding the page lock is bad). ++ * As a temporary fix for reiser4, just define it here. ++ */ ++static inline size_t ++filemap_copy_from_user(struct page *page, unsigned long offset, ++ const char __user *buf, unsigned bytes) ++{ ++ char *kaddr; ++ int left; ++ ++ kaddr = kmap_atomic(page, KM_USER0); ++ left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); ++ kunmap_atomic(kaddr, KM_USER0); ++ ++ if (left != 0) { ++ /* Do it the slow way */ ++ kaddr = kmap(page); ++ left = __copy_from_user_nocache(kaddr + offset, buf, bytes); ++ kunmap(page); ++ } ++ return bytes - left; ++} ++ ++/** ++ * reiser4_write_extent - write method of extent item plugin ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @count: number of bytes to write ++ * @pos: position in file to write to ++ * ++ */ ++ssize_t reiser4_write_extent(struct file *file, struct inode * inode, ++ const char __user *buf, size_t count, loff_t *pos) ++{ ++ int have_to_update_extent; ++ int nr_pages, nr_dirty; ++ struct page *page; ++ jnode *jnodes[WRITE_GRANULARITY + 1]; ++ unsigned long index; ++ unsigned long end; ++ int i; ++ int to_page, page_off; ++ size_t left, written; ++ int result = 0; ++ ++ if (write_extent_reserve_space(inode)) ++ return RETERR(-ENOSPC); ++ ++ if (count == 0) { ++ /* truncate case */ ++ update_extents(file, inode, jnodes, 0, *pos); ++ return 0; ++ } ++ ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ ++ left = count; ++ index = *pos >> PAGE_CACHE_SHIFT; ++ /* calculate number of pages which are to be written */ ++ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT); ++ nr_pages = end - index + 1; ++ nr_dirty = 0; ++ assert("", nr_pages <= WRITE_GRANULARITY + 1); ++ ++ /* get pages and jnodes */ ++ for (i = 0; i < nr_pages; i ++) { ++ page = find_or_create_page(inode->i_mapping, index + i, ++ reiser4_ctx_gfp_mask_get()); ++ if (page == NULL) { ++ nr_pages = i; ++ result = RETERR(-ENOMEM); ++ goto out; ++ } ++ ++ jnodes[i] = jnode_of_page(page); ++ if (IS_ERR(jnodes[i])) { ++ unlock_page(page); ++ page_cache_release(page); ++ nr_pages = i; ++ result = RETERR(-ENOMEM); ++ goto out; ++ } ++ /* prevent jnode and page from disconnecting */ ++ JF_SET(jnodes[i], JNODE_WRITE_PREPARED); ++ unlock_page(page); ++ } ++ ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ ++ have_to_update_extent = 0; ++ ++ page_off = (*pos & (PAGE_CACHE_SIZE - 1)); ++ for (i = 0; i < nr_pages; i ++) { ++ to_page = PAGE_CACHE_SIZE - page_off; ++ if (to_page > left) ++ to_page = left; ++ page = jnode_page(jnodes[i]); ++ if (page_offset(page) < inode->i_size && ++ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) { ++ /* ++ * the above is not optimal for partial write to last ++ * page of file when file size is not at boundary of ++ * page ++ */ ++ lock_page(page); ++ if (!PageUptodate(page)) { ++ result = readpage_unix_file(NULL, page); ++ BUG_ON(result != 0); ++ /* wait for read completion */ ++ lock_page(page); ++ 
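++ /* the lock_page() above also acts as the wait:
++ read I/O unlocks the page on completion */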
BUG_ON(!PageUptodate(page)); ++ } else ++ result = 0; ++ unlock_page(page); ++ } ++ ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ fault_in_pages_readable(buf, to_page); ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ ++ lock_page(page); ++ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) ++ zero_user_segments(page, 0, page_off, ++ page_off + to_page, ++ PAGE_CACHE_SIZE); ++ ++ written = filemap_copy_from_user(page, page_off, buf, to_page); ++ if (unlikely(written != to_page)) { ++ unlock_page(page); ++ result = RETERR(-EFAULT); ++ break; ++ } ++ ++ flush_dcache_page(page); ++ set_page_dirty_notag(page); ++ unlock_page(page); ++ nr_dirty++; ++ ++ mark_page_accessed(page); ++ SetPageUptodate(page); ++ ++ if (jnodes[i]->blocknr == 0) ++ have_to_update_extent ++; ++ ++ page_off = 0; ++ buf += to_page; ++ left -= to_page; ++ BUG_ON(get_current_context()->trans->atom != NULL); ++ } ++ ++ if (have_to_update_extent) { ++ update_extents(file, inode, jnodes, nr_dirty, *pos); ++ } else { ++ for (i = 0; i < nr_dirty; i ++) { ++ int ret; ++ spin_lock_jnode(jnodes[i]); ++ ret = reiser4_try_capture(jnodes[i], ++ ZNODE_WRITE_LOCK, 0); ++ BUG_ON(ret != 0); ++ jnode_make_dirty_locked(jnodes[i]); ++ spin_unlock_jnode(jnodes[i]); ++ } ++ } ++out: ++ for (i = 0; i < nr_pages; i ++) { ++ page_cache_release(jnode_page(jnodes[i])); ++ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED); ++ jput(jnodes[i]); ++ } ++ ++ /* the only errors handled so far is ENOMEM and ++ EFAULT on copy_from_user */ ++ ++ return (count - left) ? (count - left) : result; ++} ++ ++int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos, ++ struct page *page) ++{ ++ jnode *j; ++ struct address_space *mapping; ++ unsigned long index; ++ oid_t oid; ++ reiser4_block_nr block; ++ ++ mapping = page->mapping; ++ oid = get_inode_oid(mapping->host); ++ index = page->index; ++ ++ switch (state_of_extent(ext)) { ++ case HOLE_EXTENT: ++ /* ++ * it is possible to have hole page with jnode, if page was ++ * eflushed previously. ++ */ ++ j = jfind(mapping, index); ++ if (j == NULL) { ++ zero_user(page, 0, PAGE_CACHE_SIZE); ++ SetPageUptodate(page); ++ unlock_page(page); ++ return 0; ++ } ++ spin_lock_jnode(j); ++ if (!jnode_page(j)) { ++ jnode_attach_page(j, page); ++ } else { ++ BUG_ON(jnode_page(j) != page); ++ assert("vs-1504", jnode_page(j) == page); ++ } ++ block = *jnode_get_io_block(j); ++ spin_unlock_jnode(j); ++ if (block == 0) { ++ zero_user(page, 0, PAGE_CACHE_SIZE); ++ SetPageUptodate(page); ++ unlock_page(page); ++ jput(j); ++ return 0; ++ } ++ break; ++ ++ case ALLOCATED_EXTENT: ++ j = jnode_of_page(page); ++ if (IS_ERR(j)) ++ return PTR_ERR(j); ++ if (*jnode_get_block(j) == 0) { ++ reiser4_block_nr blocknr; ++ ++ blocknr = extent_get_start(ext) + pos; ++ jnode_set_block(j, &blocknr); ++ } else ++ assert("vs-1403", ++ j->blocknr == extent_get_start(ext) + pos); ++ break; ++ ++ case UNALLOCATED_EXTENT: ++ j = jfind(mapping, index); ++ assert("nikita-2688", j); ++ assert("vs-1426", jnode_page(j) == NULL); ++ ++ spin_lock_jnode(j); ++ jnode_attach_page(j, page); ++ spin_unlock_jnode(j); ++ break; ++ ++ default: ++ warning("vs-957", "wrong extent\n"); ++ return RETERR(-EIO); ++ } ++ ++ BUG_ON(j == 0); ++ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get()); ++ jput(j); ++ return 0; ++} ++ ++/* Implements plugin->u.item.s.file.read operation for extent items. 
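++ The flow describes the user buffer and the key range still to be
++ read; move_flow_forward() advances it as pages are copied.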
*/ ++int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint) ++{ ++ int result; ++ struct page *page; ++ unsigned long cur_page, next_page; ++ unsigned long page_off, count; ++ struct address_space *mapping; ++ loff_t file_off; ++ uf_coord_t *uf_coord; ++ coord_t *coord; ++ struct extent_coord_extension *ext_coord; ++ unsigned long nr_pages; ++ char *kaddr; ++ ++ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE); ++ assert("vs-572", flow->user == 1); ++ assert("vs-1351", flow->length > 0); ++ ++ uf_coord = &hint->ext_coord; ++ ++ check_uf_coord(uf_coord, NULL); ++ assert("vs-33", uf_coord->lh == &hint->lh); ++ ++ coord = &uf_coord->coord; ++ assert("vs-1119", znode_is_rlocked(coord->node)); ++ assert("vs-1120", znode_is_loaded(coord->node)); ++ assert("vs-1256", coord_matches_key_extent(coord, &flow->key)); ++ ++ mapping = file->f_dentry->d_inode->i_mapping; ++ ext_coord = &uf_coord->extension.extent; ++ ++ /* offset in a file to start read from */ ++ file_off = get_key_offset(&flow->key); ++ /* offset within the page to start read from */ ++ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1)); ++ /* bytes which can be read from the page which contains file_off */ ++ count = PAGE_CACHE_SIZE - page_off; ++ ++ /* index of the page containing the offset the read starts from */ ++ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT); ++ next_page = cur_page; ++ /* number of pages flow spans over */ ++ nr_pages = ++ ((file_off + flow->length + PAGE_CACHE_SIZE - ++ 1) >> PAGE_CACHE_SHIFT) - cur_page; ++ ++ /* we start with the twig node read locked. However, we do not want to ++ keep that lock for the whole time readahead is working. So, set a ++ seal and release the twig node. */ ++ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK); ++ /* &hint->lh is done-ed */ ++ ++ do { ++ reiser4_txn_restart_current(); ++ page = read_mapping_page(mapping, cur_page, file); ++ if (IS_ERR(page)) ++ return PTR_ERR(page); ++ lock_page(page); ++ if (!PageUptodate(page)) { ++ unlock_page(page); ++ page_cache_release(page); ++ warning("jmacd-97178", "extent_read: page is not up to date"); ++ return RETERR(-EIO); ++ } ++ mark_page_accessed(page); ++ unlock_page(page); ++ ++ /* If users can be writing to this page using arbitrary virtual ++ addresses, take care about potential aliasing before reading ++ the page on the kernel side. 
++ */ ++ if (mapping_writably_mapped(mapping)) ++ flush_dcache_page(page); ++ ++ assert("nikita-3034", reiser4_schedulable()); ++ ++ /* number of bytes which are to be read from the page */ ++ if (count > flow->length) ++ count = flow->length; ++ ++ result = fault_in_pages_writeable(flow->data, count); ++ if (result) { ++ page_cache_release(page); ++ return RETERR(-EFAULT); ++ } ++ ++ kaddr = kmap_atomic(page, KM_USER0); ++ result = __copy_to_user_inatomic(flow->data, ++ kaddr + page_off, count); ++ kunmap_atomic(kaddr, KM_USER0); ++ if (result != 0) { ++ kaddr = kmap(page); ++ result = __copy_to_user(flow->data, kaddr + page_off, count); ++ kunmap(page); ++ if (unlikely(result)) ++ return RETERR(-EFAULT); ++ } ++ ++ page_cache_release(page); ++ ++ /* increase key (flow->key), update user area pointer (flow->data) */ ++ move_flow_forward(flow, count); ++ ++ page_off = 0; ++ cur_page ++; ++ count = PAGE_CACHE_SIZE; ++ nr_pages--; ++ } while (flow->length); ++ ++ return 0; ++} ++ ++/* ++ plugin->s.file.readpage ++ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage ++ or ++ filemap_fault->reiser4_readpage->readpage_unix_file->readpage_extent ++ ++ At the beginning: coord->node is read locked, zloaded, page is ++ locked, coord is set to existing unit inside extent item (coord does not necessarily match page->index) ++*/ ++int reiser4_readpage_extent(void *vp, struct page *page) ++{ ++ uf_coord_t *uf_coord = vp; ++ ON_DEBUG(coord_t * coord = &uf_coord->coord); ++ ON_DEBUG(reiser4_key key); ++ ++ assert("vs-1040", PageLocked(page)); ++ assert("vs-1050", !PageUptodate(page)); ++ assert("vs-1039", page->mapping && page->mapping->host); ++ ++ assert("vs-1044", znode_is_loaded(coord->node)); ++ assert("vs-758", item_is_extent(coord)); ++ assert("vs-1046", coord_is_existing_unit(coord)); ++ assert("vs-1045", znode_is_rlocked(coord->node)); ++ assert("vs-1047", ++ page->mapping->host->i_ino == ++ get_key_objectid(item_key_by_coord(coord, &key))); ++ check_uf_coord(uf_coord, NULL); ++ ++ return reiser4_do_readpage_extent( ++ ext_by_ext_coord(uf_coord), ++ uf_coord->extension.extent.pos_in_unit, page); ++} ++ ++/** ++ * get_block_address_extent ++ * @coord: ++ * @block: ++ * @result: ++ * ++ * ++ */ ++int get_block_address_extent(const coord_t *coord, sector_t block, ++ sector_t *result) ++{ ++ reiser4_extent *ext; ++ ++ if (!coord_is_existing_unit(coord)) ++ return RETERR(-EINVAL); ++ ++ ext = extent_by_coord(coord); ++ ++ if (state_of_extent(ext) != ALLOCATED_EXTENT) ++ /* FIXME: bad things may happen if it is unallocated extent */ ++ *result = 0; ++ else { ++ reiser4_key key; ++ ++ unit_key_by_coord(coord, &key); ++ assert("vs-1645", ++ block >= get_key_offset(&key) >> current_blocksize_bits); ++ assert("vs-1646", ++ block < ++ (get_key_offset(&key) >> current_blocksize_bits) + ++ extent_get_width(ext)); ++ *result = ++ extent_get_start(ext) + (block - ++ (get_key_offset(&key) >> ++ current_blocksize_bits)); ++ } ++ return 0; ++} ++ ++/* ++ plugin->u.item.s.file.append_key ++ key of the first byte just past the last byte addressed by this extent ++*/ ++reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key) ++{ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, ++ get_key_offset(key) + reiser4_extent_size(coord, ++ nr_units_extent ++ (coord))); ++ ++ assert("vs-610", get_key_offset(key) ++ && (get_key_offset(key) & (current_blocksize - 1)) == 0); ++ return key; ++} ++ ++/* 
plugin->u.item.s.file.init_coord_extension */ ++void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped) ++{ ++ coord_t *coord; ++ struct extent_coord_extension *ext_coord; ++ reiser4_key key; ++ loff_t offset; ++ ++ assert("vs-1295", uf_coord->valid == 0); ++ ++ coord = &uf_coord->coord; ++ assert("vs-1288", coord_is_iplug_set(coord)); ++ assert("vs-1327", znode_is_loaded(coord->node)); ++ ++ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT) ++ return; ++ ++ ext_coord = &uf_coord->extension.extent; ++ ext_coord->nr_units = nr_units_extent(coord); ++ ext_coord->ext_offset = ++ (char *)extent_by_coord(coord) - zdata(coord->node); ++ ext_coord->width = extent_get_width(extent_by_coord(coord)); ++ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord)); ++ uf_coord->valid = 1; ++ ++ /* pos_in_unit is the only uninitialized field in extended coord */ ++ if (coord->between == AFTER_UNIT) { ++ assert("vs-1330", ++ coord->unit_pos == nr_units_extent(coord) - 1); ++ ++ ext_coord->pos_in_unit = ext_coord->width - 1; ++ } else { ++ /* AT_UNIT */ ++ unit_key_by_coord(coord, &key); ++ offset = get_key_offset(&key); ++ ++ assert("vs-1328", offset <= lookuped); ++ assert("vs-1329", ++ lookuped < ++ offset + ext_coord->width * current_blocksize); ++ ext_coord->pos_in_unit = ++ ((lookuped - offset) >> current_blocksize_bits); ++ } ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.33/fs/reiser4/plugin/item/extent_flush_ops.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/extent_flush_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1028 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../tree.h" ++#include "../../jnode.h" ++#include "../../super.h" ++#include "../../flush.h" ++#include "../../carry.h" ++#include "../object.h" ++ ++#include <linux/pagemap.h> ++ ++static reiser4_block_nr extent_unit_start(const coord_t * item); ++ ++/* Return either first or last extent (depending on @side) of the item ++ @coord is set to. Set @pos_in_unit either to first or to last block ++ of extent. */ ++static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side, ++ reiser4_block_nr * pos_in_unit) ++{ ++ reiser4_extent *ext; ++ ++ if (side == LEFT_SIDE) { ++ /* get first extent of item */ ++ ext = extent_item(coord); ++ *pos_in_unit = 0; ++ } else { ++ /* get last extent of item and last position within it */ ++ assert("vs-363", side == RIGHT_SIDE); ++ ext = extent_item(coord) + coord_last_unit_pos(coord); ++ *pos_in_unit = extent_get_width(ext) - 1; ++ } ++ ++ return ext; ++} ++ ++/* item_plugin->f.utmost_child */ ++/* Return the child. Coord is set to extent item. 
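The state switch inside the function below depends on how a unit's state is encoded in its on-disk start field. Going by the constants declared in extent.h later in this patch (HOLE_EXTENT_START 0, UNALLOCATED_EXTENT_START 1, UNALLOCATED_EXTENT_START2 2) and by the start < 2 test in reiser4_check_extent(), the convention is roughly the sketch below; the real state_of_extent() is authoritative:

#include <stdio.h>

typedef unsigned long long blocknr_t;	/* stand-in for reiser4_block_nr */

enum ext_state { HOLE_EXTENT, UNALLOCATED_EXTENT, ALLOCATED_EXTENT };

/* rough sketch of the encoding: 0 = hole, 1 = unallocated,
   larger values = start block of an allocated extent */
static enum ext_state state_of(blocknr_t start)
{
	if (start == 0)		/* HOLE_EXTENT_START */
		return HOLE_EXTENT;
	if (start == 1)		/* UNALLOCATED_EXTENT_START */
		return UNALLOCATED_EXTENT;
	return ALLOCATED_EXTENT;	/* a real disk address */
}

int main(void)
{
	blocknr_t starts[] = { 0, 1, 4711 };
	int i;

	for (i = 0; i < 3; i++)
		printf("start=%llu state=%d\n", starts[i], state_of(starts[i]));
	return 0;
}

This also shows why a start value below 2 is never treated as an allocated block address in the consistency checks further down.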
Find jnode corresponding ++ either to first or to last unformatted node pointed by the item */ ++int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp) ++{ ++ reiser4_extent *ext; ++ reiser4_block_nr pos_in_unit; ++ ++ ext = extent_utmost_ext(coord, side, &pos_in_unit); ++ ++ switch (state_of_extent(ext)) { ++ case HOLE_EXTENT: ++ *childp = NULL; ++ return 0; ++ case ALLOCATED_EXTENT: ++ case UNALLOCATED_EXTENT: ++ break; ++ default: ++ /* this should never happen */ ++ assert("vs-1417", 0); ++ } ++ ++ { ++ reiser4_key key; ++ reiser4_tree *tree; ++ unsigned long index; ++ ++ if (side == LEFT_SIDE) { ++ /* get key of first byte addressed by the extent */ ++ item_key_by_coord(coord, &key); ++ } else { ++ /* get key of byte which next after last byte addressed by the extent */ ++ append_key_extent(coord, &key); ++ } ++ ++ assert("vs-544", ++ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul); ++ /* index of first or last (depending on @side) page addressed ++ by the extent */ ++ index = ++ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT); ++ if (side == RIGHT_SIDE) ++ index--; ++ ++ tree = coord->node->zjnode.tree; ++ *childp = jlookup(tree, get_key_objectid(&key), index); ++ } ++ ++ return 0; ++} ++ ++/* item_plugin->f.utmost_child_real_block */ ++/* Return the child's block, if allocated. */ ++int ++utmost_child_real_block_extent(const coord_t * coord, sideof side, ++ reiser4_block_nr * block) ++{ ++ reiser4_extent *ext; ++ ++ ext = extent_by_coord(coord); ++ ++ switch (state_of_extent(ext)) { ++ case ALLOCATED_EXTENT: ++ *block = extent_get_start(ext); ++ if (side == RIGHT_SIDE) ++ *block += extent_get_width(ext) - 1; ++ break; ++ case HOLE_EXTENT: ++ case UNALLOCATED_EXTENT: ++ *block = 0; ++ break; ++ default: ++ /* this should never happen */ ++ assert("vs-1418", 0); ++ } ++ ++ return 0; ++} ++ ++/* item_plugin->f.scan */ ++/* Performs leftward scanning starting from an unformatted node and its parent coordinate. ++ This scan continues, advancing the parent coordinate, until either it encounters a ++ formatted child or it finishes scanning this node. ++ ++ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm ++ not sure this is last property (same atom) is enforced, but it should be the case since ++ one atom must write the parent and the others must read the parent, thus fusing?). In ++ any case, the code below asserts this case for unallocated extents. Unallocated ++ extents are thus optimized because we can skip to the endpoint when scanning. ++ ++ It returns control to reiser4_scan_extent, handles these terminating conditions, ++ e.g., by loading the next twig. ++*/ ++int reiser4_scan_extent(flush_scan * scan) ++{ ++ coord_t coord; ++ jnode *neighbor; ++ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist; ++ reiser4_block_nr unit_start; ++ __u64 oid; ++ reiser4_key key; ++ int ret = 0, allocated, incr; ++ reiser4_tree *tree; ++ ++ if (!JF_ISSET(scan->node, JNODE_DIRTY)) { ++ scan->stop = 1; ++ return 0; /* Race with truncate, this node is already ++ * truncated. */ ++ } ++ ++ coord_dup(&coord, &scan->parent_coord); ++ ++ assert("jmacd-1404", !reiser4_scan_finished(scan)); ++ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL); ++ assert("jmacd-1406", jnode_is_unformatted(scan->node)); ++ ++ /* The scan_index variable corresponds to the current page index of the ++ unformatted block scan position. 
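A little further down, scan_max, scan_dist and incr are set per direction so that a single loop body serves both scan directions; their interplay, sketched with plain integers:

#include <stdio.h>

int main(void)
{
	unsigned long unit_index = 100, unit_width = 8, scan_index = 103;
	int left;

	for (left = 1; left >= 0; left--) {
		unsigned long scan_max, i = scan_index;
		int incr;

		if (left) {		/* scanning left: stop at unit_index */
			scan_max = unit_index;
			incr = -1;
		} else {		/* scanning right: stop at last block */
			scan_max = unit_index + unit_width - 1;
			incr = +1;
		}
		printf(left ? "left :" : "right:");
		do {			/* same termination test as the kernel loop */
			printf(" %lu", i);
			i += incr;
		} while (incr + scan_max != i);
		printf("\n");
	}
	return 0;
}

Starting at block 103 of a unit [100, 107], the left pass visits 103..100 and the right pass 103..107, each stopping exactly one step past scan_max.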
*/ ++ scan_index = index_jnode(scan->node); ++ ++ assert("jmacd-7889", item_is_extent(&coord)); ++ ++ repeat: ++ /* objectid of file */ ++ oid = get_key_objectid(item_key_by_coord(&coord, &key)); ++ ++ allocated = !extent_is_unallocated(&coord); ++ /* Get the values of this extent unit: */ ++ unit_index = extent_unit_index(&coord); ++ unit_width = extent_unit_width(&coord); ++ unit_start = extent_unit_start(&coord); ++ ++ assert("jmacd-7187", unit_width > 0); ++ assert("jmacd-7188", scan_index >= unit_index); ++ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1); ++ ++ /* Depending on the scan direction, we set different maximum values for scan_index ++ (scan_max) and the number of nodes that would be passed if the scan goes the ++ entire way (scan_dist). Incr is an integer reflecting the incremental ++ direction of scan_index. */ ++ if (reiser4_scanning_left(scan)) { ++ scan_max = unit_index; ++ scan_dist = scan_index - unit_index; ++ incr = -1; ++ } else { ++ scan_max = unit_index + unit_width - 1; ++ scan_dist = scan_max - unit_index; ++ incr = +1; ++ } ++ ++ tree = coord.node->zjnode.tree; ++ ++ /* If the extent is allocated we have to check each of its blocks. If the extent ++ is unallocated we can skip to the scan_max. */ ++ if (allocated) { ++ do { ++ neighbor = jlookup(tree, oid, scan_index); ++ if (neighbor == NULL) ++ goto stop_same_parent; ++ ++ if (scan->node != neighbor ++ && !reiser4_scan_goto(scan, neighbor)) { ++ /* @neighbor was jput() by reiser4_scan_goto */ ++ goto stop_same_parent; ++ } ++ ++ ret = scan_set_current(scan, neighbor, 1, &coord); ++ if (ret != 0) { ++ goto exit; ++ } ++ ++ /* reference to @neighbor is stored in @scan, no need ++ to jput(). */ ++ scan_index += incr; ++ ++ } while (incr + scan_max != scan_index); ++ ++ } else { ++ /* Optimized case for unallocated extents, skip to the end. */ ++ neighbor = jlookup(tree, oid, scan_max /*index */ ); ++ if (neighbor == NULL) { ++ /* Race with truncate */ ++ scan->stop = 1; ++ ret = 0; ++ goto exit; ++ } ++ ++ assert("zam-1043", ++ reiser4_blocknr_is_fake(jnode_get_block(neighbor))); ++ ++ ret = scan_set_current(scan, neighbor, scan_dist, &coord); ++ if (ret != 0) { ++ goto exit; ++ } ++ } ++ ++ if (coord_sideof_unit(&coord, scan->direction) == 0 ++ && item_is_extent(&coord)) { ++ /* Continue as long as there are more extent units. */ ++ ++ scan_index = ++ extent_unit_index(&coord) + ++ (reiser4_scanning_left(scan) ? ++ extent_unit_width(&coord) - 1 : 0); ++ goto repeat; ++ } ++ ++ if (0) { ++ stop_same_parent: ++ ++ /* If we are scanning left and we stop in the middle of an allocated ++ extent, we know the preceder immediately.. */ ++ /* middle of extent is (scan_index - unit_index) != 0. */ ++ if (reiser4_scanning_left(scan) && ++ (scan_index - unit_index) != 0) { ++ /* FIXME(B): Someone should step-through and verify that this preceder ++ calculation is indeed correct. */ ++ /* @unit_start is starting block (number) of extent ++ unit. Flush stopped at the @scan_index block from ++ the beginning of the file, which is (scan_index - ++ unit_index) block within extent. ++ */ ++ if (unit_start) { ++ /* skip preceder update when we are at hole */ ++ scan->preceder_blk = ++ unit_start + scan_index - unit_index; ++ check_preceder(scan->preceder_blk); ++ } ++ } ++ ++ /* In this case, we leave coord set to the parent of scan->node. */ ++ scan->stop = 1; ++ ++ } else { ++ /* In this case, we are still scanning, coord is set to the next item which is ++ either off-the-end of the node or not an extent. 
*/ ++ assert("jmacd-8912", scan->stop == 0); ++ assert("jmacd-7812", ++ (coord_is_after_sideof_unit(&coord, scan->direction) ++ || !item_is_extent(&coord))); ++ } ++ ++ ret = 0; ++ exit: ++ return ret; ++} ++ ++/* ask block allocator for some blocks */ ++static void extent_allocate_blocks(reiser4_blocknr_hint *preceder, ++ reiser4_block_nr wanted_count, ++ reiser4_block_nr *first_allocated, ++ reiser4_block_nr *allocated, ++ block_stage_t block_stage) ++{ ++ *allocated = wanted_count; ++ preceder->max_dist = 0; /* scan whole disk, if needed */ ++ ++ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */ ++ preceder->block_stage = block_stage; ++ ++ /* FIXME: we do not handle errors here now */ ++ check_me("vs-420", ++ reiser4_alloc_blocks(preceder, first_allocated, allocated, ++ BA_PERMANENT) == 0); ++ /* update flush_pos's preceder to last allocated block number */ ++ preceder->blk = *first_allocated + *allocated - 1; ++} ++ ++/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent ++ will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have ++ to add new nodes into tree. Space for that is taken from inviolable reserve (5%). */ ++static reiser4_block_nr reserve_replace(void) ++{ ++ reiser4_block_nr grabbed, needed; ++ ++ grabbed = get_current_context()->grabbed_blocks; ++ needed = estimate_one_insert_into_item(current_tree); ++ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED)); ++ return grabbed; ++} ++ ++static void free_replace_reserved(reiser4_block_nr grabbed) ++{ ++ reiser4_context *ctx; ++ ++ ctx = get_current_context(); ++ grabbed2free(ctx, get_super_private(ctx->super), ++ ctx->grabbed_blocks - grabbed); ++} ++ ++/* Block offset of first block addressed by unit */ ++__u64 extent_unit_index(const coord_t * item) ++{ ++ reiser4_key key; ++ ++ assert("vs-648", coord_is_existing_unit(item)); ++ unit_key_by_coord(item, &key); ++ return get_key_offset(&key) >> current_blocksize_bits; ++} ++ ++/* AUDIT shouldn't return value be of reiser4_block_nr type? ++ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? 
*/ ++__u64 extent_unit_width(const coord_t * item) ++{ ++ assert("vs-649", coord_is_existing_unit(item)); ++ return width_by_coord(item); ++} ++ ++/* Starting block location of this unit */ ++static reiser4_block_nr extent_unit_start(const coord_t * item) ++{ ++ return extent_get_start(extent_by_coord(item)); ++} ++ ++/** ++ * split_allocated_extent - ++ * @coord: ++ * @pos_in_unit: ++ * ++ * replace allocated extent with two allocated extents ++ */ ++static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit) ++{ ++ int result; ++ struct replace_handle *h; ++ reiser4_extent *ext; ++ reiser4_block_nr grabbed; ++ ++ ext = extent_by_coord(coord); ++ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT); ++ assert("vs-1411", extent_get_width(ext) > pos_in_unit); ++ ++ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); ++ if (h == NULL) ++ return RETERR(-ENOMEM); ++ h->coord = coord; ++ h->lh = znode_lh(coord->node); ++ h->pkey = &h->key; ++ unit_key_by_coord(coord, h->pkey); ++ set_key_offset(h->pkey, ++ (get_key_offset(h->pkey) + ++ pos_in_unit * current_blocksize)); ++ reiser4_set_extent(&h->overwrite, extent_get_start(ext), ++ pos_in_unit); ++ reiser4_set_extent(&h->new_extents[0], ++ extent_get_start(ext) + pos_in_unit, ++ extent_get_width(ext) - pos_in_unit); ++ h->nr_new_extents = 1; ++ h->flags = COPI_DONT_SHIFT_LEFT; ++ h->paste_key = h->key; ++ ++ /* reserve space for extent unit paste, @grabbed is reserved before */ ++ grabbed = reserve_replace(); ++ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten ++ extent */); ++ /* restore reserved */ ++ free_replace_reserved(grabbed); ++ kfree(h); ++ return result; ++} ++ ++/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is ++ one). Return 1 if it succeeded, 0 - otherwise */ ++static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext, ++ reiser4_extent *replace) ++{ ++ assert("vs-1415", extent_by_coord(coord) == ext); ++ ++ if (coord->unit_pos == 0 ++ || state_of_extent(ext - 1) != ALLOCATED_EXTENT) ++ /* @ext either does not exist or is not allocated extent */ ++ return 0; ++ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) != ++ extent_get_start(replace)) ++ return 0; ++ ++ /* we can glue, widen previous unit */ ++ extent_set_width(ext - 1, ++ extent_get_width(ext - 1) + extent_get_width(replace)); ++ ++ if (extent_get_width(ext) != extent_get_width(replace)) { ++ /* make current extent narrower */ ++ if (state_of_extent(ext) == ALLOCATED_EXTENT) ++ extent_set_start(ext, ++ extent_get_start(ext) + ++ extent_get_width(replace)); ++ extent_set_width(ext, ++ extent_get_width(ext) - ++ extent_get_width(replace)); ++ } else { ++ /* current extent completely glued with its left neighbor, remove it */ ++ coord_t from, to; ++ ++ coord_dup(&from, coord); ++ from.unit_pos = nr_units_extent(coord) - 1; ++ coord_dup(&to, &from); ++ ++ /* currently cut from extent can cut either from the beginning or from the end. 
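Stepping back, the test that gates this whole merge path is simple adjacency: two allocated units glue exactly when the left one ends where the right one begins. In isolation:

#include <stdio.h>

typedef unsigned long long u64;

/* the mergeability test from try_to_merge_with_left(): the runs
   [s1, s1 + w1) and [s2, s2 + w2) glue iff s1 + w1 == s2 */
static int glueable(u64 s1, u64 w1, u64 s2)
{
	return s1 + w1 == s2;
}

int main(void)
{
	printf("%d\n", glueable(100, 8, 108));	/* 1: contiguous, widen left */
	printf("%d\n", glueable(100, 8, 110));	/* 0: gap, keep separate */
	return 0;
}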
Move place which got ++ freed after unit removal to end of item */ ++ memmove(ext, ext + 1, ++ (from.unit_pos - ++ coord->unit_pos) * sizeof(reiser4_extent)); ++ /* wipe part of item which is going to be cut, so that node_check will not be confused */ ++ cut_node_content(&from, &to, NULL, NULL, NULL); ++ } ++ znode_make_dirty(coord->node); ++ /* move coord back */ ++ coord->unit_pos--; ++ return 1; ++} ++ ++/** ++ * conv_extent - replace extent with 2 ones ++ * @coord: coordinate of extent to be replaced ++ * @replace: extent to overwrite the one @coord is set to ++ * ++ * Overwrites extent @coord is set to and paste one extent unit after ++ * overwritten one if @replace is shorter than initial extent ++ */ ++static int conv_extent(coord_t *coord, reiser4_extent *replace) ++{ ++ int result; ++ struct replace_handle *h; ++ reiser4_extent *ext; ++ reiser4_block_nr start, width, new_width; ++ reiser4_block_nr grabbed; ++ extent_state state; ++ ++ ext = extent_by_coord(coord); ++ state = state_of_extent(ext); ++ start = extent_get_start(ext); ++ width = extent_get_width(ext); ++ new_width = extent_get_width(replace); ++ ++ assert("vs-1458", (state == UNALLOCATED_EXTENT || ++ state == ALLOCATED_EXTENT)); ++ assert("vs-1459", width >= new_width); ++ ++ if (try_to_merge_with_left(coord, ext, replace)) { ++ /* merged @replace with left neighbor. Current unit is either ++ removed or narrowed */ ++ return 0; ++ } ++ ++ if (width == new_width) { ++ /* replace current extent with @replace */ ++ *ext = *replace; ++ znode_make_dirty(coord->node); ++ return 0; ++ } ++ ++ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); ++ if (h == NULL) ++ return RETERR(-ENOMEM); ++ h->coord = coord; ++ h->lh = znode_lh(coord->node); ++ h->pkey = &h->key; ++ unit_key_by_coord(coord, h->pkey); ++ set_key_offset(h->pkey, ++ (get_key_offset(h->pkey) + new_width * current_blocksize)); ++ h->overwrite = *replace; ++ ++ /* replace @ext with @replace and padding extent */ ++ reiser4_set_extent(&h->new_extents[0], ++ (state == ALLOCATED_EXTENT) ? ++ (start + new_width) : ++ UNALLOCATED_EXTENT_START, ++ width - new_width); ++ h->nr_new_extents = 1; ++ h->flags = COPI_DONT_SHIFT_LEFT; ++ h->paste_key = h->key; ++ ++ /* reserve space for extent unit paste, @grabbed is reserved before */ ++ grabbed = reserve_replace(); ++ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten ++ extent */); ++ ++ /* restore reserved */ ++ free_replace_reserved(grabbed); ++ kfree(h); ++ return result; ++} ++ ++/** ++ * assign_real_blocknrs ++ * @flush_pos: ++ * @oid: objectid of file jnodes to assign block number to belongs to ++ * @index: first jnode on the range ++ * @count: number of jnodes to assign block numbers to ++ * @first: start of allocated block range ++ * ++ * Assigns block numbers to each of @count jnodes. Index of first jnode is ++ * @index. Jnodes get lookuped with jlookup. 
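In effect the loop hands the freshly allocated run out sequentially: the jnode at index + i receives block number first + i. Reduced to its arithmetic:

#include <stdio.h>

int main(void)
{
	/* sketch: a run of @count blocks starting at @first maps one-to-one
	   onto the jnodes at indices @index, @index + 1, ... */
	unsigned long long first = 5000, index = 12, count = 4, i;

	for (i = 0; i < count; i++)
		printf("jnode(oid, %llu) -> block %llu\n", index + i, first + i);
	return 0;
}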
++ */ ++static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid, ++ unsigned long index, reiser4_block_nr count, ++ reiser4_block_nr first) ++{ ++ unsigned long i; ++ reiser4_tree *tree; ++ txn_atom *atom; ++ int nr; ++ ++ atom = atom_locked_by_fq(flush_pos->fq); ++ assert("vs-1468", atom); ++ BUG_ON(atom == NULL); ++ ++ nr = 0; ++ tree = current_tree; ++ for (i = 0; i < count; ++i, ++index) { ++ jnode *node; ++ ++ node = jlookup(tree, oid, index); ++ assert("", node != NULL); ++ BUG_ON(node == NULL); ++ ++ spin_lock_jnode(node); ++ assert("", !jnode_is_flushprepped(node)); ++ assert("vs-1475", node->atom == atom); ++ assert("vs-1476", atomic_read(&node->x_count) > 0); ++ ++ JF_CLR(node, JNODE_FLUSH_RESERVED); ++ jnode_set_block(node, &first); ++ unformatted_make_reloc(node, flush_pos->fq); ++ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), ++ FQ_LIST, 0)); ++ spin_unlock_jnode(node); ++ first++; ++ ++ atomic_dec(&node->x_count); ++ nr ++; ++ } ++ ++ spin_unlock_atom(atom); ++ return; ++} ++ ++/** ++ * make_node_ovrwr - assign node to overwrite set ++ * @jnodes: overwrite set list head ++ * @node: jnode to belong to overwrite set ++ * ++ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes ++ * which is an accumulator for nodes before they get to overwrite set list of ++ * atom. ++ */ ++static void make_node_ovrwr(struct list_head *jnodes, jnode *node) ++{ ++ spin_lock_jnode(node); ++ ++ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); ++ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); ++ ++ JF_SET(node, JNODE_OVRWR); ++ list_move_tail(&node->capture_link, jnodes); ++ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0)); ++ ++ spin_unlock_jnode(node); ++} ++ ++/** ++ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set ++ * @flush_pos: flush position ++ * @oid: objectid of file jnodes belong to ++ * @index: starting index ++ * @width: extent width ++ * ++ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's ++ * overwrite set. Starting from the one with index @index. If end of slum is ++ * detected (node is not found or flushprepped) - stop iterating and set flush ++ * position's state to POS_INVALID. 
++ */ ++static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid, ++ unsigned long index, reiser4_block_nr width) ++{ ++ unsigned long i; ++ reiser4_tree *tree; ++ jnode *node; ++ txn_atom *atom; ++ LIST_HEAD(jnodes); ++ ++ tree = current_tree; ++ ++ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); ++ assert("vs-1478", atom); ++ ++ for (i = flush_pos->pos_in_unit; i < width; i++, index++) { ++ node = jlookup(tree, oid, index); ++ if (!node) { ++ flush_pos->state = POS_INVALID; ++ break; ++ } ++ if (jnode_check_flushprepped(node)) { ++ flush_pos->state = POS_INVALID; ++ atomic_dec(&node->x_count); ++ break; ++ } ++ if (node->atom != atom) { ++ flush_pos->state = POS_INVALID; ++ atomic_dec(&node->x_count); ++ break; ++ } ++ make_node_ovrwr(&jnodes, node); ++ atomic_dec(&node->x_count); ++ } ++ ++ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev); ++ spin_unlock_atom(atom); ++} ++ ++/** ++ * allocated_extent_slum_size ++ * @flush_pos: ++ * @oid: ++ * @index: ++ * @count: ++ * ++ * ++ */ ++static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid, ++ unsigned long index, unsigned long count) ++{ ++ unsigned long i; ++ reiser4_tree *tree; ++ txn_atom *atom; ++ int nr; ++ ++ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); ++ assert("vs-1468", atom); ++ ++ nr = 0; ++ tree = current_tree; ++ for (i = 0; i < count; ++i, ++index) { ++ jnode *node; ++ ++ node = jlookup(tree, oid, index); ++ if (!node) ++ break; ++ ++ if (jnode_check_flushprepped(node)) { ++ atomic_dec(&node->x_count); ++ break; ++ } ++ ++ if (node->atom != atom) { ++ /* ++ * this is possible on overwrite: extent_write may ++ * capture several unformatted nodes without capturing ++ * any formatted nodes. ++ */ ++ atomic_dec(&node->x_count); ++ break; ++ } ++ ++ assert("vs-1476", atomic_read(&node->x_count) > 1); ++ atomic_dec(&node->x_count); ++ nr ++; ++ } ++ ++ spin_unlock_atom(atom); ++ return nr; ++} ++ ++/** ++ * alloc_extent ++ * @flush_pos: ++ * ++ * ++ * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord ++ * is set to. It is to prepare for flushing sequence of not flushprepped nodes ++ * (slum). It supposes that slum starts at flush_pos->pos_in_unit position ++ * within the extent. 
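That two-way choice (relocate when leaf relocation is requested or the extent was never allocated on disk, overwrite otherwise) reduces to a small truth table:

#include <stdio.h>

/* the policy from reiser4_alloc_extent(): relocate iff leaf_relocate
   is set or the extent state is UNALLOCATED_EXTENT */
static const char *policy(int leaf_relocate, int unallocated)
{
	return (leaf_relocate || unallocated) ? "relocate" : "overwrite";
}

int main(void)
{
	int lr, un;

	for (lr = 0; lr <= 1; lr++)
		for (un = 0; un <= 1; un++)
			printf("leaf_relocate=%d unallocated=%d -> %s\n",
			       lr, un, policy(lr, un));
	return 0;
}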
Slum gets to relocate set if flush_pos->leaf_relocate is ++ * set to 1 and to overwrite set otherwise ++ */ ++int reiser4_alloc_extent(flush_pos_t *flush_pos) ++{ ++ coord_t *coord; ++ reiser4_extent *ext; ++ reiser4_extent replace_ext; ++ oid_t oid; ++ reiser4_block_nr protected; ++ reiser4_block_nr start; ++ __u64 index; ++ __u64 width; ++ extent_state state; ++ int result; ++ reiser4_block_nr first_allocated; ++ __u64 allocated; ++ reiser4_key key; ++ block_stage_t block_stage; ++ ++ assert("vs-1468", flush_pos->state == POS_ON_EPOINT); ++ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord) ++ && item_is_extent(&flush_pos->coord)); ++ ++ coord = &flush_pos->coord; ++ ++ ext = extent_by_coord(coord); ++ state = state_of_extent(ext); ++ if (state == HOLE_EXTENT) { ++ flush_pos->state = POS_INVALID; ++ return 0; ++ } ++ ++ item_key_by_coord(coord, &key); ++ oid = get_key_objectid(&key); ++ index = extent_unit_index(coord) + flush_pos->pos_in_unit; ++ start = extent_get_start(ext); ++ width = extent_get_width(ext); ++ ++ assert("vs-1457", width > flush_pos->pos_in_unit); ++ ++ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) { ++ /* relocate */ ++ if (flush_pos->pos_in_unit) { ++ /* split extent unit into two */ ++ result = ++ split_allocated_extent(coord, ++ flush_pos->pos_in_unit); ++ flush_pos->pos_in_unit = 0; ++ return result; ++ } ++ ++ /* limit number of nodes to allocate */ ++ if (flush_pos->nr_to_write < width) ++ width = flush_pos->nr_to_write; ++ ++ if (state == ALLOCATED_EXTENT) { ++ /* ++ * all protected nodes are not flushprepped, therefore ++ * they are counted as flush_reserved ++ */ ++ block_stage = BLOCK_FLUSH_RESERVED; ++ protected = allocated_extent_slum_size(flush_pos, oid, ++ index, width); ++ if (protected == 0) { ++ flush_pos->state = POS_INVALID; ++ flush_pos->pos_in_unit = 0; ++ return 0; ++ } ++ } else { ++ block_stage = BLOCK_UNALLOCATED; ++ protected = width; ++ } ++ ++ /* ++ * look at previous unit if possible. If it is allocated, make ++ * preceder more precise ++ */ ++ if (coord->unit_pos && ++ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) ++ reiser4_pos_hint(flush_pos)->blk = ++ extent_get_start(ext - 1) + ++ extent_get_width(ext - 1); ++ ++ /* allocate new block numbers for protected nodes */ ++ extent_allocate_blocks(reiser4_pos_hint(flush_pos), ++ protected, ++ &first_allocated, &allocated, ++ block_stage); ++ ++ if (state == ALLOCATED_EXTENT) ++ /* ++ * on relocating - free nodes which are going to be ++ * relocated ++ */ ++ reiser4_dealloc_blocks(&start, &allocated, ++ BLOCK_ALLOCATED, BA_DEFER); ++ ++ /* assign new block numbers to protected nodes */ ++ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated); ++ ++ /* prepare extent which will replace current one */ ++ reiser4_set_extent(&replace_ext, first_allocated, allocated); ++ ++ /* adjust extent item */ ++ result = conv_extent(coord, &replace_ext); ++ if (result != 0 && result != -ENOMEM) { ++ warning("vs-1461", ++ "Failed to allocate extent. 
Should not happen\n"); ++ return result; ++ } ++ ++ /* ++ * break flush: we prepared for flushing as many blocks as we ++ * were asked for ++ */ ++ if (flush_pos->nr_to_write == allocated) ++ flush_pos->state = POS_INVALID; ++ } else { ++ /* overwrite */ ++ mark_jnodes_overwrite(flush_pos, oid, index, width); ++ } ++ flush_pos->pos_in_unit = 0; ++ return 0; ++} ++ ++/* if @key is glueable to the item @coord is set to */ ++static int must_insert(const coord_t *coord, const reiser4_key *key) ++{ ++ reiser4_key last; ++ ++ if (item_id_by_coord(coord) == EXTENT_POINTER_ID ++ && keyeq(append_key_extent(coord, &last), key)) ++ return 0; ++ return 1; ++} ++ ++/* copy extent @copy to the end of @node. It may have to either insert new item after the last one, or append last item, ++ or modify last unit of last item to have greater width */ ++static int put_unit_to_end(znode *node, const reiser4_key *key, ++ reiser4_extent *copy_ext) ++{ ++ int result; ++ coord_t coord; ++ cop_insert_flag flags; ++ reiser4_extent *last_ext; ++ reiser4_item_data data; ++ ++ /* set coord after last unit in an item */ ++ coord_init_last_unit(&coord, node); ++ coord.between = AFTER_UNIT; ++ ++ flags = ++ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE; ++ if (must_insert(&coord, key)) { ++ result = ++ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1), ++ key, NULL /*lh */ , flags); ++ ++ } else { ++ /* try to glue with last unit */ ++ last_ext = extent_by_coord(&coord); ++ if (state_of_extent(last_ext) && ++ extent_get_start(last_ext) + extent_get_width(last_ext) == ++ extent_get_start(copy_ext)) { ++ /* widen last unit of node */ ++ extent_set_width(last_ext, ++ extent_get_width(last_ext) + ++ extent_get_width(copy_ext)); ++ znode_make_dirty(node); ++ return 0; ++ } ++ ++ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */ ++ result = ++ insert_into_item(&coord, NULL /*lh */ , key, ++ init_new_extent(&data, copy_ext, 1), ++ flags); ++ } ++ ++ assert("vs-438", result == 0 || result == -E_NODE_FULL); ++ return result; ++} ++ ++/* @coord is set to extent unit */ ++squeeze_result squalloc_extent(znode *left, const coord_t *coord, ++ flush_pos_t *flush_pos, ++ reiser4_key *stop_key) ++{ ++ reiser4_extent *ext; ++ __u64 index; ++ __u64 width; ++ reiser4_block_nr start; ++ extent_state state; ++ oid_t oid; ++ reiser4_block_nr first_allocated; ++ __u64 allocated; ++ __u64 protected; ++ reiser4_extent copy_extent; ++ reiser4_key key; ++ int result; ++ block_stage_t block_stage; ++ ++ assert("vs-1457", flush_pos->pos_in_unit == 0); ++ assert("vs-1467", coord_is_leftmost_unit(coord)); ++ assert("vs-1467", item_is_extent(coord)); ++ ++ ext = extent_by_coord(coord); ++ index = extent_unit_index(coord); ++ start = extent_get_start(ext); ++ width = extent_get_width(ext); ++ state = state_of_extent(ext); ++ unit_key_by_coord(coord, &key); ++ oid = get_key_objectid(&key); ++ ++ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) || ++ (state == UNALLOCATED_EXTENT)) { ++ /* relocate */ ++ if (state == ALLOCATED_EXTENT) { ++ /* all protected nodes are not flushprepped, therefore ++ * they are counted as flush_reserved */ ++ block_stage = BLOCK_FLUSH_RESERVED; ++ protected = allocated_extent_slum_size(flush_pos, oid, ++ index, width); ++ if (protected == 0) { ++ flush_pos->state = POS_INVALID; ++ flush_pos->pos_in_unit = 0; ++ return 0; ++ } ++ } else { ++ block_stage = BLOCK_UNALLOCATED; ++ protected = width; ++ } ++ ++ /* ++ * look at previous unit if possible. 
If it is allocated, make ++ * preceder more precise ++ */ ++ if (coord->unit_pos && ++ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) ++ reiser4_pos_hint(flush_pos)->blk = ++ extent_get_start(ext - 1) + ++ extent_get_width(ext - 1); ++ ++ /* allocate new block numbers for protected nodes */ ++ extent_allocate_blocks(reiser4_pos_hint(flush_pos), ++ protected, ++ &first_allocated, &allocated, ++ block_stage); ++ ++ /* prepare extent which will be copied to left */ ++ reiser4_set_extent(©_extent, first_allocated, allocated); ++ ++ result = put_unit_to_end(left, &key, ©_extent); ++ if (result == -E_NODE_FULL) { ++ int target_block_stage; ++ ++ /* free blocks which were just allocated */ ++ target_block_stage = ++ (state == ++ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED : ++ BLOCK_UNALLOCATED; ++ reiser4_dealloc_blocks(&first_allocated, &allocated, ++ target_block_stage, ++ BA_PERMANENT); ++ ++ /* rewind the preceder. */ ++ flush_pos->preceder.blk = first_allocated; ++ check_preceder(flush_pos->preceder.blk); ++ ++ return SQUEEZE_TARGET_FULL; ++ } ++ ++ if (state == ALLOCATED_EXTENT) { ++ /* free nodes which were relocated */ ++ reiser4_dealloc_blocks(&start, &allocated, ++ BLOCK_ALLOCATED, BA_DEFER); ++ } ++ ++ /* assign new block numbers to protected nodes */ ++ assign_real_blocknrs(flush_pos, oid, index, allocated, ++ first_allocated); ++ ++ set_key_offset(&key, ++ get_key_offset(&key) + ++ (allocated << current_blocksize_bits)); ++ } else { ++ /* ++ * overwrite: try to copy unit as it is to left neighbor and ++ * make all first not flushprepped nodes overwrite nodes ++ */ ++ reiser4_set_extent(©_extent, start, width); ++ result = put_unit_to_end(left, &key, ©_extent); ++ if (result == -E_NODE_FULL) ++ return SQUEEZE_TARGET_FULL; ++ ++ if (state != HOLE_EXTENT) ++ mark_jnodes_overwrite(flush_pos, oid, index, width); ++ set_key_offset(&key, ++ get_key_offset(&key) + ++ (width << current_blocksize_bits)); ++ } ++ *stop_key = key; ++ return SQUEEZE_CONTINUE; ++} ++ ++int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key) ++{ ++ return key_by_inode_and_offset_common(inode, off, key); ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent.h linux-2.6.33/fs/reiser4/plugin/item/extent.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/extent.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,231 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#ifndef __REISER4_EXTENT_H__ ++#define __REISER4_EXTENT_H__ ++ ++/* on disk extent */ ++typedef struct { ++ reiser4_dblock_nr start; ++ reiser4_dblock_nr width; ++} reiser4_extent; ++ ++struct extent_stat { ++ int unallocated_units; ++ int unallocated_blocks; ++ int allocated_units; ++ int allocated_blocks; ++ int hole_units; ++ int hole_blocks; ++}; ++ ++/* extents in an extent item can be either holes, or unallocated or allocated ++ extents */ ++typedef enum { ++ HOLE_EXTENT, ++ UNALLOCATED_EXTENT, ++ ALLOCATED_EXTENT ++} extent_state; ++ ++#define HOLE_EXTENT_START 0 ++#define UNALLOCATED_EXTENT_START 1 ++#define UNALLOCATED_EXTENT_START2 2 ++ ++struct extent_coord_extension { ++ reiser4_block_nr pos_in_unit; ++ reiser4_block_nr width; /* width of current unit */ ++ pos_in_node_t nr_units; /* number of units */ ++ int 
ext_offset; /* offset from the beginning of zdata() */ ++ unsigned long expected_page; ++#if REISER4_DEBUG ++ reiser4_extent extent; ++#endif ++}; ++ ++/* macros to set/get fields of on-disk extent */ ++static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext) ++{ ++ return le64_to_cpu(ext->start); ++} ++ ++static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext) ++{ ++ return le64_to_cpu(ext->width); ++} ++ ++extern __u64 reiser4_current_block_count(void); ++ ++static inline void ++extent_set_start(reiser4_extent * ext, reiser4_block_nr start) ++{ ++ cassert(sizeof(ext->start) == 8); ++ assert("nikita-2510", ++ ergo(start > 1, start < reiser4_current_block_count())); ++ put_unaligned(cpu_to_le64(start), &ext->start); ++} ++ ++static inline void ++extent_set_width(reiser4_extent * ext, reiser4_block_nr width) ++{ ++ cassert(sizeof(ext->width) == 8); ++ assert("", width > 0); ++ put_unaligned(cpu_to_le64(width), &ext->width); ++ assert("nikita-2511", ++ ergo(extent_get_start(ext) > 1, ++ extent_get_start(ext) + width <= ++ reiser4_current_block_count())); ++} ++ ++#define extent_item(coord) \ ++({ \ ++ assert("nikita-3143", item_is_extent(coord)); \ ++ ((reiser4_extent *)item_body_by_coord (coord)); \ ++}) ++ ++#define extent_by_coord(coord) \ ++({ \ ++ assert("nikita-3144", item_is_extent(coord)); \ ++ (extent_item (coord) + (coord)->unit_pos); \ ++}) ++ ++#define width_by_coord(coord) \ ++({ \ ++ assert("nikita-3145", item_is_extent(coord)); \ ++ extent_get_width (extent_by_coord(coord)); \ ++}) ++ ++struct carry_cut_data; ++struct carry_kill_data; ++ ++/* plugin->u.item.b.* */ ++reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *); ++int can_contain_key_extent(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data *); ++int mergeable_extent(const coord_t * p1, const coord_t * p2); ++pos_in_node_t nr_units_extent(const coord_t *); ++lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *); ++void init_coord_extent(coord_t *); ++int init_extent(coord_t *, reiser4_item_data *); ++int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *); ++int can_shift_extent(unsigned free_space, ++ coord_t * source, znode * target, shift_direction, ++ unsigned *size, unsigned want); ++void copy_units_extent(coord_t * target, coord_t * source, unsigned from, ++ unsigned count, shift_direction where_is_free_space, ++ unsigned free_space); ++int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count, ++ struct carry_kill_data *); ++int create_hook_extent(const coord_t * coord, void *arg); ++int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *, reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++reiser4_key *unit_key_extent(const coord_t *, reiser4_key *); ++reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *); ++void print_extent(const char *, coord_t *); ++int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child); ++int utmost_child_real_block_extent(const coord_t * coord, sideof side, ++ reiser4_block_nr * block); ++void item_stat_extent(const coord_t * coord, void *vp); ++int reiser4_check_extent(const coord_t * coord, const char **error); ++ ++/* plugin->u.item.s.file.* */ ++ssize_t reiser4_write_extent(struct file *, struct inode * 
inode, ++ const char __user *, size_t, loff_t *); ++int reiser4_read_extent(struct file *, flow_t *, hint_t *); ++int reiser4_readpage_extent(void *, struct page *); ++int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*); ++reiser4_key *append_key_extent(const coord_t *, reiser4_key *); ++void init_coord_extension_extent(uf_coord_t *, loff_t offset); ++int get_block_address_extent(const coord_t *, sector_t block, ++ sector_t * result); ++ ++/* these are used in flush.c ++ FIXME-VS: should they be somewhere in item_plugin? */ ++int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos); ++int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos, ++ reiser4_key * stop_key); ++ ++int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */ ++__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */ ++__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */ ++ ++/* plugin->u.item.f. */ ++int reiser4_scan_extent(flush_scan * scan); ++extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *); ++ ++reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, ++ int nr_extents); ++reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr); ++extent_state state_of_extent(reiser4_extent * ext); ++void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start, ++ reiser4_block_nr width); ++int reiser4_update_extent(struct inode *, jnode *, loff_t pos, ++ int *plugged_hole); ++ ++#include "../../coord.h" ++#include "../../lock.h" ++#include "../../tap.h" ++ ++struct replace_handle { ++ /* these are to be set before calling reiser4_replace_extent */ ++ coord_t *coord; ++ lock_handle *lh; ++ reiser4_key key; ++ reiser4_key *pkey; ++ reiser4_extent overwrite; ++ reiser4_extent new_extents[2]; ++ int nr_new_extents; ++ unsigned flags; ++ ++ /* these are used by reiser4_replace_extent */ ++ reiser4_item_data item; ++ coord_t coord_after; ++ lock_handle lh_after; ++ tap_t watch; ++ reiser4_key paste_key; ++#if REISER4_DEBUG ++ reiser4_extent orig_ext; ++ reiser4_key tmp; ++#endif ++}; ++ ++/* this structure is kmalloced before calling make_extent to avoid excessive ++ stack consumption on plug_hole->reiser4_replace_extent */ ++struct make_extent_handle { ++ uf_coord_t *uf_coord; ++ reiser4_block_nr blocknr; ++ int created; ++ struct inode *inode; ++ union { ++ struct { ++ } append; ++ struct replace_handle replace; ++ } u; ++}; ++ ++int reiser4_replace_extent(struct replace_handle *, ++ int return_inserted_position); ++lock_handle *znode_lh(znode *); ++ ++/* the reiser4 repacker support */ ++struct repacker_cursor; ++extern int process_extent_backward_for_repacking(tap_t *, ++ struct repacker_cursor *); ++extern int mark_extent_for_repacking(tap_t *, int); ++ ++#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord)) ++#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent)) ++ ++/* __REISER4_EXTENT_H__ */ ++#endif ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.33/fs/reiser4/plugin/item/extent_item_ops.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/extent_item_ops.c 2010-03-04 
19:33:22.000000000 +0100 +@@ -0,0 +1,889 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../inode.h" ++#include "../../tree_walk.h" /* check_sibling_list() */ ++#include "../../page_cache.h" ++#include "../../carry.h" ++ ++#include <linux/quotaops.h> ++ ++/* item_plugin->b.max_key_inside */ ++reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key) ++{ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, get_key_offset(reiser4_max_key())); ++ return key; ++} ++ ++/* item_plugin->b.can_contain_key ++ this checks whether @key of @data is matching to position set by @coord */ ++int ++can_contain_key_extent(const coord_t * coord, const reiser4_key * key, ++ const reiser4_item_data * data) ++{ ++ reiser4_key item_key; ++ ++ if (item_plugin_by_coord(coord) != data->iplug) ++ return 0; ++ ++ item_key_by_coord(coord, &item_key); ++ if (get_key_locality(key) != get_key_locality(&item_key) || ++ get_key_objectid(key) != get_key_objectid(&item_key) || ++ get_key_ordering(key) != get_key_ordering(&item_key)) ++ return 0; ++ ++ return 1; ++} ++ ++/* item_plugin->b.mergeable ++ first item is of extent type */ ++/* Audited by: green(2002.06.13) */ ++int mergeable_extent(const coord_t * p1, const coord_t * p2) ++{ ++ reiser4_key key1, key2; ++ ++ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID); ++ /* FIXME-VS: Which is it? Assert or return 0 */ ++ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) { ++ return 0; ++ } ++ ++ item_key_by_coord(p1, &key1); ++ item_key_by_coord(p2, &key2); ++ if (get_key_locality(&key1) != get_key_locality(&key2) || ++ get_key_objectid(&key1) != get_key_objectid(&key2) || ++ get_key_ordering(&key1) != get_key_ordering(&key2) || ++ get_key_type(&key1) != get_key_type(&key2)) ++ return 0; ++ if (get_key_offset(&key1) + ++ reiser4_extent_size(p1, nr_units_extent(p1)) != ++ get_key_offset(&key2)) ++ return 0; ++ return 1; ++} ++ ++/* item_plugin->b.nr_units */ ++pos_in_node_t nr_units_extent(const coord_t * coord) ++{ ++ /* length of extent item has to be multiple of extent size */ ++ assert("vs-1424", ++ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0); ++ return item_length_by_coord(coord) / sizeof(reiser4_extent); ++} ++ ++/* item_plugin->b.lookup */ ++lookup_result ++lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG, ++ coord_t * coord) ++{ /* znode and item_pos are ++ set to an extent item to ++ look through */ ++ reiser4_key item_key; ++ reiser4_block_nr lookuped, offset; ++ unsigned i, nr_units; ++ reiser4_extent *ext; ++ unsigned blocksize; ++ unsigned char blocksize_bits; ++ ++ item_key_by_coord(coord, &item_key); ++ offset = get_key_offset(&item_key); ++ ++ /* key we are looking for must be greater than key of item @coord */ ++ assert("vs-414", keygt(key, &item_key)); ++ ++ assert("umka-99945", ++ !keygt(key, max_key_inside_extent(coord, &item_key))); ++ ++ ext = extent_item(coord); ++ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset)); ++ ++ blocksize = current_blocksize; ++ blocksize_bits = current_blocksize_bits; ++ ++ /* offset we are looking for */ ++ lookuped = get_key_offset(key); ++ ++ nr_units = nr_units_extent(coord); ++ /* go through all extents until the one which address given offset */ ++ for (i = 0; i < nr_units; i++, ext++) { ++ offset += (extent_get_width(ext) << blocksize_bits); ++ if (offset > lookuped) { ++ /* desired byte is somewhere in this extent */ ++ coord->unit_pos = i; ++ 
coord->between = AT_UNIT; ++ return CBK_COORD_FOUND; ++ } ++ } ++ ++ /* set coord after last unit */ ++ coord->unit_pos = nr_units - 1; ++ coord->between = AFTER_UNIT; ++ return CBK_COORD_FOUND; ++} ++ ++/* item_plugin->b.paste ++ item @coord is set to has been appended with @data->length of free ++ space. data->data contains data to be pasted into the item in position ++ @coord->in_item.unit_pos. It must fit into that free space. ++ @coord must be set between units. ++*/ ++int ++paste_extent(coord_t * coord, reiser4_item_data * data, ++ carry_plugin_info * info UNUSED_ARG) ++{ ++ unsigned old_nr_units; ++ reiser4_extent *ext; ++ int item_length; ++ ++ ext = extent_item(coord); ++ item_length = item_length_by_coord(coord); ++ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent); ++ ++ /* this is also used to copy extent into newly created item, so ++ old_nr_units could be 0 */ ++ assert("vs-260", item_length >= data->length); ++ ++ /* make sure that coord is set properly */ ++ assert("vs-35", ++ ((!coord_is_existing_unit(coord)) ++ || (!old_nr_units && !coord->unit_pos))); ++ ++ /* first unit to be moved */ ++ switch (coord->between) { ++ case AFTER_UNIT: ++ coord->unit_pos++; ++ case BEFORE_UNIT: ++ coord->between = AT_UNIT; ++ break; ++ case AT_UNIT: ++ assert("vs-331", !old_nr_units && !coord->unit_pos); ++ break; ++ default: ++ impossible("vs-330", "coord is set improperly"); ++ } ++ ++ /* prepare space for new units */ ++ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent), ++ ext + coord->unit_pos, ++ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent)); ++ ++ /* copy new data from kernel space */ ++ assert("vs-556", data->user == 0); ++ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length); ++ ++ /* after paste @coord is set to first of pasted units */ ++ assert("vs-332", coord_is_existing_unit(coord)); ++ assert("vs-333", ++ !memcmp(data->data, extent_by_coord(coord), ++ (unsigned)data->length)); ++ return 0; ++} ++ ++/* item_plugin->b.can_shift */ ++int ++can_shift_extent(unsigned free_space, coord_t * source, ++ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG, ++ unsigned *size, unsigned want) ++{ ++ *size = item_length_by_coord(source); ++ if (*size > free_space) ++ /* never split a unit of extent item */ ++ *size = free_space - free_space % sizeof(reiser4_extent); ++ ++ /* we can shift *size bytes, calculate how many do we want to shift */ ++ if (*size > want * sizeof(reiser4_extent)) ++ *size = want * sizeof(reiser4_extent); ++ ++ if (*size % sizeof(reiser4_extent) != 0) ++ impossible("vs-119", "Wrong extent size: %i %zd", *size, ++ sizeof(reiser4_extent)); ++ return *size / sizeof(reiser4_extent); ++ ++} ++ ++/* item_plugin->b.copy_units */ ++void ++copy_units_extent(coord_t * target, coord_t * source, ++ unsigned from, unsigned count, ++ shift_direction where_is_free_space, unsigned free_space) ++{ ++ char *from_ext, *to_ext; ++ ++ assert("vs-217", free_space == count * sizeof(reiser4_extent)); ++ ++ from_ext = item_body_by_coord(source); ++ to_ext = item_body_by_coord(target); ++ ++ if (where_is_free_space == SHIFT_LEFT) { ++ assert("vs-215", from == 0); ++ ++ /* At this moment, item length was already updated in the item ++ header by shifting code, hence nr_units_extent() will ++ return "new" number of units---one we obtain after copying ++ units. 
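So the destination pointer must skip the units already in the target: with post-shift unit count n and c units arriving, the copy lands at byte (n - c) * sizeof(reiser4_extent) of the item body. Numerically:

#include <stdio.h>

struct ext { unsigned long long start, width; };	/* 16-byte on-disk unit */

int main(void)
{
	/* sketch: after the shift the target item holds n = 5 units,
	   c = 2 of which are the ones being copied in */
	size_t n = 5, c = 2;

	printf("copy lands at byte %zu of the item body\n",
	       (n - c) * sizeof(struct ext));	/* 48 */
	return 0;
}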
++ */ ++ to_ext += ++ (nr_units_extent(target) - count) * sizeof(reiser4_extent); ++ } else { ++ reiser4_key key; ++ coord_t coord; ++ ++ assert("vs-216", ++ from + count == coord_last_unit_pos(source) + 1); ++ ++ from_ext += item_length_by_coord(source) - free_space; ++ ++ /* new units are inserted before first unit in an item, ++ therefore, we have to update item key */ ++ coord = *source; ++ coord.unit_pos = from; ++ unit_key_extent(&coord, &key); ++ ++ node_plugin_by_node(target->node)->update_item_key(target, &key, ++ NULL /*info */); ++ } ++ ++ memcpy(to_ext, from_ext, free_space); ++} ++ ++/* item_plugin->b.create_hook ++ @arg is znode of leaf node for which we need to update right delimiting key */ ++int create_hook_extent(const coord_t * coord, void *arg) ++{ ++ coord_t *child_coord; ++ znode *node; ++ reiser4_key key; ++ reiser4_tree *tree; ++ ++ if (!arg) ++ return 0; ++ ++ child_coord = arg; ++ tree = znode_get_tree(coord->node); ++ ++ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL); ++ ++ write_lock_tree(tree); ++ write_lock_dk(tree); ++ /* find a node on the left level for which right delimiting key has to ++ be updated */ ++ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) { ++ assert("vs-411", znode_is_left_connected(child_coord->node)); ++ node = child_coord->node->left; ++ } else { ++ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT); ++ node = child_coord->node; ++ assert("nikita-3314", node != NULL); ++ } ++ ++ if (node != NULL) { ++ znode_set_rd_key(node, item_key_by_coord(coord, &key)); ++ ++ assert("nikita-3282", check_sibling_list(node)); ++ /* break sibling links */ ++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) { ++ ON_DEBUG(node->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ node->right_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ node->right->left = NULL; ++ node->right = NULL; ++ } ++ } ++ write_unlock_dk(tree); ++ write_unlock_tree(tree); ++ return 0; ++} ++ ++#define ITEM_TAIL_KILLED 0 ++#define ITEM_HEAD_KILLED 1 ++#define ITEM_KILLED 2 ++ ++/* item_plugin->b.kill_hook ++ this is called when @count units starting from @from-th one are going to be removed ++ */ ++int ++kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count, ++ struct carry_kill_data *kdata) ++{ ++ reiser4_extent *ext; ++ reiser4_block_nr start, length; ++ const reiser4_key *pfrom_key, *pto_key; ++ struct inode *inode; ++ reiser4_tree *tree; ++ pgoff_t from_off, to_off, offset, skip; ++ int retval; ++ ++ /* these are located in memory kmalloc-ed by kill_node_content */ ++ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key; ++ coord_t *dup, *next; ++ ++ assert("zam-811", znode_is_write_locked(coord->node)); ++ assert("nikita-3315", kdata != NULL); ++ assert("vs-34", kdata->buf != NULL); ++ ++ /* map structures to kdata->buf */ ++ min_item_key = (reiser4_key *) (kdata->buf); ++ max_item_key = min_item_key + 1; ++ from_key = max_item_key + 1; ++ to_key = from_key + 1; ++ key = to_key + 1; ++ dup = (coord_t *) (key + 1); ++ next = dup + 1; ++ ++ item_key_by_coord(coord, min_item_key); ++ max_item_key_by_coord(coord, max_item_key); ++ ++ if (kdata->params.from_key) { ++ pfrom_key = kdata->params.from_key; ++ pto_key = kdata->params.to_key; ++ } else { ++ assert("vs-1549", from == coord->unit_pos); ++ unit_key_by_coord(coord, from_key); ++ pfrom_key = from_key; ++ ++ coord_dup(dup, coord); ++ dup->unit_pos = from + count - 1; ++ max_unit_key_by_coord(dup, to_key); ++ 
pto_key = to_key; ++ } ++ ++ if (!keylt(pto_key, max_item_key)) { ++ if (!keygt(pfrom_key, min_item_key)) { ++ znode *left, *right; ++ ++ /* item is to be removed completely */ ++ assert("nikita-3316", kdata->left != NULL ++ && kdata->right != NULL); ++ ++ left = kdata->left->node; ++ right = kdata->right->node; ++ ++ tree = current_tree; ++ /* we have to do two things: ++ * ++ * 1. link left and right formatted neighbors of ++ * extent being removed, and ++ * ++ * 2. update their delimiting keys. ++ * ++ * atomicity of these operations is protected by ++ * taking dk-lock and tree-lock. ++ */ ++ /* if neighbors of item being removed are znodes - ++ * link them */ ++ write_lock_tree(tree); ++ write_lock_dk(tree); ++ link_left_and_right(left, right); ++ if (left) { ++ /* update right delimiting key of left ++ * neighbor of extent item */ ++ /*coord_t next; ++ reiser4_key key; */ ++ ++ coord_dup(next, coord); ++ ++ if (coord_next_item(next)) ++ *key = *znode_get_rd_key(coord->node); ++ else ++ item_key_by_coord(next, key); ++ znode_set_rd_key(left, key); ++ } ++ write_unlock_dk(tree); ++ write_unlock_tree(tree); ++ ++ from_off = ++ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT; ++ to_off = ++ (get_key_offset(max_item_key) + ++ 1) >> PAGE_CACHE_SHIFT; ++ retval = ITEM_KILLED; ++ } else { ++ /* tail of item is to be removed */ ++ from_off = ++ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE - ++ 1) >> PAGE_CACHE_SHIFT; ++ to_off = ++ (get_key_offset(max_item_key) + ++ 1) >> PAGE_CACHE_SHIFT; ++ retval = ITEM_TAIL_KILLED; ++ } ++ } else { ++ /* head of item is to be removed */ ++ assert("vs-1571", keyeq(pfrom_key, min_item_key)); ++ assert("vs-1572", ++ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == ++ 0); ++ assert("vs-1573", ++ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - ++ 1)) == 0); ++ ++ if (kdata->left->node) { ++ /* update right delimiting key of left neighbor of extent item */ ++ /*reiser4_key key; */ ++ ++ *key = *pto_key; ++ set_key_offset(key, get_key_offset(pto_key) + 1); ++ ++ write_lock_dk(current_tree); ++ znode_set_rd_key(kdata->left->node, key); ++ write_unlock_dk(current_tree); ++ } ++ ++ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT; ++ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT; ++ retval = ITEM_HEAD_KILLED; ++ } ++ ++ inode = kdata->inode; ++ assert("vs-1545", inode != NULL); ++ if (inode != NULL) ++ /* take care of pages and jnodes corresponding to part of item being killed */ ++ reiser4_invalidate_pages(inode->i_mapping, from_off, ++ to_off - from_off, ++ kdata->params.truncate); ++ ++ ext = extent_item(coord) + from; ++ offset = ++ (get_key_offset(min_item_key) + ++ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT; ++ ++ assert("vs-1551", from_off >= offset); ++ assert("vs-1552", from_off - offset <= extent_get_width(ext)); ++ skip = from_off - offset; ++ offset = from_off; ++ ++ while (offset < to_off) { ++ length = extent_get_width(ext) - skip; ++ if (state_of_extent(ext) == HOLE_EXTENT) { ++ skip = 0; ++ offset += length; ++ ext++; ++ continue; ++ } ++ ++ if (offset + length > to_off) { ++ length = to_off - offset; ++ } ++ ++ vfs_dq_free_block_nodirty(inode, length); ++ ++ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { ++ /* some jnodes corresponding to this unallocated extent */ ++ fake_allocated2free(length, 0 /* unformatted */ ); ++ ++ skip = 0; ++ offset += length; ++ ext++; ++ continue; ++ } ++ ++ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT); ++ ++ if (length != 0) { ++ start = extent_get_start(ext) 
+ skip; ++ ++ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed ++ immediately */ ++ reiser4_dealloc_blocks(&start, &length, ++ 0 /* not used */ , ++ BA_DEFER ++ /* unformatted with defer */ ); ++ } ++ skip = 0; ++ offset += length; ++ ext++; ++ } ++ return retval; ++} ++ ++/* item_plugin->b.kill_units */ ++int ++kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *kdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ reiser4_extent *ext; ++ reiser4_key item_key; ++ pos_in_node_t count; ++ reiser4_key from_key, to_key; ++ const reiser4_key *pfrom_key, *pto_key; ++ loff_t off; ++ int result; ++ ++ assert("vs-1541", ++ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL) ++ || (kdata->params.from_key != NULL ++ && kdata->params.to_key != NULL))); ++ ++ if (kdata->params.from_key) { ++ pfrom_key = kdata->params.from_key; ++ pto_key = kdata->params.to_key; ++ } else { ++ coord_t dup; ++ ++ /* calculate key range of kill */ ++ assert("vs-1549", from == coord->unit_pos); ++ unit_key_by_coord(coord, &from_key); ++ pfrom_key = &from_key; ++ ++ coord_dup(&dup, coord); ++ dup.unit_pos = to; ++ max_unit_key_by_coord(&dup, &to_key); ++ pto_key = &to_key; ++ } ++ ++ item_key_by_coord(coord, &item_key); ++ ++#if REISER4_DEBUG ++ { ++ reiser4_key max_item_key; ++ ++ max_item_key_by_coord(coord, &max_item_key); ++ ++ if (new_first) { ++ /* head of item is to be cut */ ++ assert("vs-1542", keyeq(pfrom_key, &item_key)); ++ assert("vs-1538", keylt(pto_key, &max_item_key)); ++ } else { ++ /* tail of item is to be cut */ ++ assert("vs-1540", keygt(pfrom_key, &item_key)); ++ assert("vs-1543", !keylt(pto_key, &max_item_key)); ++ } ++ } ++#endif ++ ++ if (smallest_removed) ++ *smallest_removed = *pfrom_key; ++ ++ if (new_first) { ++ /* item head is cut. Item key will change. This new key is calculated here */ ++ assert("vs-1556", ++ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == ++ (PAGE_CACHE_SIZE - 1)); ++ *new_first = *pto_key; ++ set_key_offset(new_first, get_key_offset(new_first) + 1); ++ } ++ ++ count = to - from + 1; ++ result = kill_hook_extent(coord, from, count, kdata); ++ if (result == ITEM_TAIL_KILLED) { ++ assert("vs-1553", ++ get_key_offset(pfrom_key) >= ++ get_key_offset(&item_key) + ++ reiser4_extent_size(coord, from)); ++ off = ++ get_key_offset(pfrom_key) - ++ (get_key_offset(&item_key) + ++ reiser4_extent_size(coord, from)); ++ if (off) { ++ /* unit @from is to be cut partially. 
Its width decreases */ ++ ext = extent_item(coord) + from; ++ extent_set_width(ext, ++ (off + PAGE_CACHE_SIZE - ++ 1) >> PAGE_CACHE_SHIFT); ++ count--; ++ } ++ } else { ++ __u64 max_to_offset; ++ __u64 rest; ++ ++ assert("vs-1575", result == ITEM_HEAD_KILLED); ++ assert("", from == 0); ++ assert("", ++ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - ++ 1)) == 0); ++ assert("", ++ get_key_offset(pto_key) + 1 > ++ get_key_offset(&item_key) + ++ reiser4_extent_size(coord, to)); ++ max_to_offset = ++ get_key_offset(&item_key) + ++ reiser4_extent_size(coord, to + 1) - 1; ++ assert("", get_key_offset(pto_key) <= max_to_offset); ++ ++ rest = ++ (max_to_offset - ++ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT; ++ if (rest) { ++ /* unit @to is to be cut partially */ ++ ext = extent_item(coord) + to; ++ ++ assert("", extent_get_width(ext) > rest); ++ ++ if (state_of_extent(ext) == ALLOCATED_EXTENT) ++ extent_set_start(ext, ++ extent_get_start(ext) + ++ (extent_get_width(ext) - ++ rest)); ++ ++ extent_set_width(ext, rest); ++ count--; ++ } ++ } ++ return count * sizeof(reiser4_extent); ++} ++ ++/* item_plugin->b.cut_units ++ this is too similar to kill_units_extent */ ++int ++cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *cdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ reiser4_extent *ext; ++ reiser4_key item_key; ++ pos_in_node_t count; ++ reiser4_key from_key, to_key; ++ const reiser4_key *pfrom_key, *pto_key; ++ loff_t off; ++ ++ assert("vs-1541", ++ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL) ++ || (cdata->params.from_key != NULL ++ && cdata->params.to_key != NULL))); ++ ++ if (cdata->params.from_key) { ++ pfrom_key = cdata->params.from_key; ++ pto_key = cdata->params.to_key; ++ } else { ++ coord_t dup; ++ ++ /* calculate key range of kill */ ++ coord_dup(&dup, coord); ++ dup.unit_pos = from; ++ unit_key_by_coord(&dup, &from_key); ++ ++ dup.unit_pos = to; ++ max_unit_key_by_coord(&dup, &to_key); ++ ++ pfrom_key = &from_key; ++ pto_key = &to_key; ++ } ++ ++ assert("vs-1555", ++ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0); ++ assert("vs-1556", ++ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == ++ (PAGE_CACHE_SIZE - 1)); ++ ++ item_key_by_coord(coord, &item_key); ++ ++#if REISER4_DEBUG ++ { ++ reiser4_key max_item_key; ++ ++ assert("vs-1584", ++ get_key_locality(pfrom_key) == ++ get_key_locality(&item_key)); ++ assert("vs-1585", ++ get_key_type(pfrom_key) == get_key_type(&item_key)); ++ assert("vs-1586", ++ get_key_objectid(pfrom_key) == ++ get_key_objectid(&item_key)); ++ assert("vs-1587", ++ get_key_ordering(pfrom_key) == ++ get_key_ordering(&item_key)); ++ ++ max_item_key_by_coord(coord, &max_item_key); ++ ++ if (new_first != NULL) { ++ /* head of item is to be cut */ ++ assert("vs-1542", keyeq(pfrom_key, &item_key)); ++ assert("vs-1538", keylt(pto_key, &max_item_key)); ++ } else { ++ /* tail of item is to be cut */ ++ assert("vs-1540", keygt(pfrom_key, &item_key)); ++ assert("vs-1543", keyeq(pto_key, &max_item_key)); ++ } ++ } ++#endif ++ ++ if (smallest_removed) ++ *smallest_removed = *pfrom_key; ++ ++ if (new_first) { ++ /* item head is cut. Item key will change. 
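The calculation is a single increment: the new first key addresses the byte right after the removed range, offset(pto_key) + 1, and the alignment asserts above guarantee that this lands on a page boundary. Numerically:

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	/* sketch: the cut removes bytes [from, to_off]; to_off is asserted
	   to be the last byte of a page, so the next key is page-aligned */
	unsigned long long to_off = 3 * PAGE_SIZE - 1;
	unsigned long long new_first = to_off + 1;

	printf("new first offset %llu, page %llu, aligned=%d\n",
	       new_first, new_first / PAGE_SIZE,
	       (int)((new_first & (PAGE_SIZE - 1)) == 0));
	return 0;
}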
This new key is calculated here */ ++ *new_first = *pto_key; ++ set_key_offset(new_first, get_key_offset(new_first) + 1); ++ } ++ ++ count = to - from + 1; ++ ++ assert("vs-1553", ++ get_key_offset(pfrom_key) >= ++ get_key_offset(&item_key) + reiser4_extent_size(coord, from)); ++ off = ++ get_key_offset(pfrom_key) - (get_key_offset(&item_key) + ++ reiser4_extent_size(coord, from)); ++ if (off) { ++ /* tail of unit @from is to be cut partially. Its width decreases */ ++ assert("vs-1582", new_first == NULL); ++ ext = extent_item(coord) + from; ++ extent_set_width(ext, off >> PAGE_CACHE_SHIFT); ++ count--; ++ } ++ ++ assert("vs-1554", ++ get_key_offset(pto_key) <= ++ get_key_offset(&item_key) + ++ reiser4_extent_size(coord, to + 1) - 1); ++ off = ++ (get_key_offset(&item_key) + ++ reiser4_extent_size(coord, to + 1) - 1) - ++ get_key_offset(pto_key); ++ if (off) { ++ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased ++ and width decreased. */ ++ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0); ++ ext = extent_item(coord) + to; ++ if (state_of_extent(ext) == ALLOCATED_EXTENT) ++ extent_set_start(ext, ++ extent_get_start(ext) + ++ (extent_get_width(ext) - ++ (off >> PAGE_CACHE_SHIFT))); ++ ++ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT)); ++ count--; ++ } ++ return count * sizeof(reiser4_extent); ++} ++ ++/* item_plugin->b.unit_key */ ++reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key) ++{ ++ assert("vs-300", coord_is_existing_unit(coord)); ++ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, ++ (get_key_offset(key) + ++ reiser4_extent_size(coord, coord->unit_pos))); ++ ++ return key; ++} ++ ++/* item_plugin->b.max_unit_key */ ++reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key) ++{ ++ assert("vs-300", coord_is_existing_unit(coord)); ++ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, ++ (get_key_offset(key) + ++ reiser4_extent_size(coord, coord->unit_pos + 1) - 1)); ++ return key; ++} ++ ++/* item_plugin->b.estimate ++ item_plugin->b.item_data_by_flow */ ++ ++#if REISER4_DEBUG ++ ++/* item_plugin->b.check ++ used for debugging, every item should have here the most complete ++ possible check of the consistency of the item that the inventor can ++ construct ++*/ ++int reiser4_check_extent(const coord_t * coord /* coord of item to check */, ++ const char **error /* where to store error message */) ++{ ++ reiser4_extent *ext, *first; ++ unsigned i, j; ++ reiser4_block_nr start, width, blk_cnt; ++ unsigned num_units; ++ reiser4_tree *tree; ++ oid_t oid; ++ reiser4_key key; ++ coord_t scan; ++ ++ assert("vs-933", REISER4_DEBUG); ++ ++ if (znode_get_level(coord->node) != TWIG_LEVEL) { ++ *error = "Extent on the wrong level"; ++ return -1; ++ } ++ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) { ++ *error = "Wrong item size"; ++ return -1; ++ } ++ ext = first = extent_item(coord); ++ blk_cnt = reiser4_block_count(reiser4_get_current_sb()); ++ num_units = coord_num_units(coord); ++ tree = znode_get_tree(coord->node); ++ item_key_by_coord(coord, &key); ++ oid = get_key_objectid(&key); ++ coord_dup(&scan, coord); ++ ++ for (i = 0; i < num_units; ++i, ++ext) { ++ __u64 index; ++ ++ scan.unit_pos = i; ++ index = extent_unit_index(&scan); ++ ++#if 0 ++ /* check that all jnodes are present for the unallocated ++ * extent */ ++ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { ++ for (j = 0; j < extent_get_width(ext); j++) { ++ jnode *node; ++ ++ node = jlookup(tree, oid, 
index + j); ++ if (node == NULL) { ++ print_coord("scan", &scan, 0); ++ *error = "Jnode missing"; ++ return -1; ++ } ++ jput(node); ++ } ++ } ++#endif ++ ++ start = extent_get_start(ext); ++ if (start < 2) ++ continue; ++ /* extent is allocated one */ ++ width = extent_get_width(ext); ++ if (start >= blk_cnt) { ++ *error = "Start too large"; ++ return -1; ++ } ++ if (start + width > blk_cnt) { ++ *error = "End too large"; ++ return -1; ++ } ++ /* make sure that this extent does not overlap with other ++ allocated extents */ ++ for (j = 0; j < i; j++) { ++ if (state_of_extent(first + j) != ALLOCATED_EXTENT) ++ continue; ++ if (! ++ ((extent_get_start(ext) >= ++ extent_get_start(first + j) + ++ extent_get_width(first + j)) ++ || (extent_get_start(ext) + ++ extent_get_width(ext) <= ++ extent_get_start(first + j)))) { ++ *error = "Extent overlaps with others"; ++ return -1; ++ } ++ } ++ ++ } ++ ++ return 0; ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/internal.c linux-2.6.33/fs/reiser4/plugin/item/internal.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/internal.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,404 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Implementation of internal-item plugin methods. */ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "internal.h" ++#include "item.h" ++#include "../node/node.h" ++#include "../plugin.h" ++#include "../../jnode.h" ++#include "../../znode.h" ++#include "../../tree_walk.h" ++#include "../../tree_mod.h" ++#include "../../tree.h" ++#include "../../super.h" ++#include "../../block_alloc.h" ++ ++/* see internal.h for explanation */ ++ ++/* plugin->u.item.b.mergeable */ ++int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ , ++ const coord_t * p2 UNUSED_ARG /* second item */ ) ++{ ++ /* internal items are not mergeable */ ++ return 0; ++} ++ ++/* ->lookup() method for internal items */ ++lookup_result lookup_internal(const reiser4_key * key /* key to look up */ , ++ lookup_bias bias UNUSED_ARG /* lookup bias */ , ++ coord_t * coord /* coord of item */ ) ++{ ++ reiser4_key ukey; ++ ++ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) { ++ default: ++ impossible("", "keycmp()?!"); ++ case LESS_THAN: ++ /* FIXME-VS: AFTER_ITEM used to be here.
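The overlap test in reiser4_check_extent() above is the classic half-open-interval check: two allocated extents [s1, s1+w1) and [s2, s2+w2) are disjoint exactly when one ends at or before the start of the other. A standalone sketch of just that predicate:

#include <stdio.h>

/* extents [s1, s1 + w1) and [s2, s2 + w2) overlap unless one ends
   at or before the start of the other */
static int extents_overlap(unsigned long s1, unsigned long w1,
                           unsigned long s2, unsigned long w2)
{
        return !(s1 >= s2 + w2 || s1 + w1 <= s2);
}

int main(void)
{
        printf("%d\n", extents_overlap(10, 5, 14, 3)); /* 1: [10,15) meets [14,17) */
        printf("%d\n", extents_overlap(10, 5, 15, 3)); /* 0: they merely touch */
        return 0;
}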
But with new coord ++ item plugin can not be taken using coord set this way */ ++ assert("vs-681", coord->unit_pos == 0); ++ coord->between = AFTER_UNIT; ++ case EQUAL_TO: ++ return CBK_COORD_FOUND; ++ case GREATER_THAN: ++ return CBK_COORD_NOTFOUND; ++ } ++} ++ ++/* return body of internal item at @coord */ ++static internal_item_layout *internal_at(const coord_t * coord /* coord of ++ * item */ ) ++{ ++ assert("nikita-607", coord != NULL); ++ assert("nikita-1650", ++ item_plugin_by_coord(coord) == ++ item_plugin_by_id(NODE_POINTER_ID)); ++ return (internal_item_layout *) item_body_by_coord(coord); ++} ++ ++void reiser4_update_internal(const coord_t * coord, ++ const reiser4_block_nr * blocknr) ++{ ++ internal_item_layout *item = internal_at(coord); ++ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr)); ++ ++ put_unaligned(cpu_to_le64(*blocknr), &item->pointer); ++} ++ ++/* return child block number stored in the internal item at @coord */ ++static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ ) ++{ ++ assert("nikita-608", coord != NULL); ++ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer)); ++} ++ ++/* get znode pointed to by internal @item */ ++static znode *znode_at(const coord_t * item /* coord of item */ , ++ znode * parent /* parent node */ ) ++{ ++ return child_znode(item, parent, 1, 0); ++} ++ ++/* store pointer from internal item into "block". Implementation of ++ ->down_link() method */ ++void down_link_internal(const coord_t * coord /* coord of item */ , ++ const reiser4_key * key UNUSED_ARG /* key to get ++ * pointer for */ , ++ reiser4_block_nr * block /* resulting block number */ ) ++{ ++ ON_DEBUG(reiser4_key item_key); ++ ++ assert("nikita-609", coord != NULL); ++ assert("nikita-611", block != NULL); ++ assert("nikita-612", (key == NULL) || ++ /* twig horrors */ ++ (znode_get_level(coord->node) == TWIG_LEVEL) ++ || keyle(item_key_by_coord(coord, &item_key), key)); ++ ++ *block = pointer_at(coord); ++ assert("nikita-2960", reiser4_blocknr_is_sane(block)); ++} ++ ++/* Get the child's block number, or 0 if the block is unallocated. */ ++int ++utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG, ++ reiser4_block_nr * block) ++{ ++ assert("jmacd-2059", coord != NULL); ++ ++ *block = pointer_at(coord); ++ assert("nikita-2961", reiser4_blocknr_is_sane(block)); ++ ++ if (reiser4_blocknr_is_fake(block)) { ++ *block = 0; ++ } ++ ++ return 0; ++} ++ ++/* Return the child. 
*/ ++int ++utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG, ++ jnode ** childp) ++{ ++ reiser4_block_nr block = pointer_at(coord); ++ znode *child; ++ ++ assert("jmacd-2059", childp != NULL); ++ assert("nikita-2962", reiser4_blocknr_is_sane(&block)); ++ ++ child = zlook(znode_get_tree(coord->node), &block); ++ ++ if (IS_ERR(child)) { ++ return PTR_ERR(child); ++ } ++ ++ *childp = ZJNODE(child); ++ ++ return 0; ++} ++ ++#if REISER4_DEBUG ++ ++static void check_link(znode * left, znode * right) ++{ ++ znode *scan; ++ ++ for (scan = left; scan != right; scan = scan->right) { ++ if (ZF_ISSET(scan, JNODE_RIP)) ++ break; ++ if (znode_is_right_connected(scan) && scan->right != NULL) { ++ if (ZF_ISSET(scan->right, JNODE_RIP)) ++ break; ++ assert("nikita-3285", ++ znode_is_left_connected(scan->right)); ++ assert("nikita-3265", ++ ergo(scan != left, ++ ZF_ISSET(scan, JNODE_HEARD_BANSHEE))); ++ assert("nikita-3284", scan->right->left == scan); ++ } else ++ break; ++ } ++} ++ ++int check__internal(const coord_t * coord, const char **error) ++{ ++ reiser4_block_nr blk; ++ znode *child; ++ coord_t cpy; ++ ++ blk = pointer_at(coord); ++ if (!reiser4_blocknr_is_sane(&blk)) { ++ *error = "Invalid pointer"; ++ return -1; ++ } ++ coord_dup(&cpy, coord); ++ child = znode_at(&cpy, cpy.node); ++ if (child != NULL) { ++ znode *left_child; ++ znode *right_child; ++ ++ left_child = right_child = NULL; ++ ++ assert("nikita-3256", znode_invariant(child)); ++ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) { ++ left_child = znode_at(&cpy, cpy.node); ++ if (left_child != NULL) { ++ read_lock_tree(znode_get_tree(child)); ++ check_link(left_child, child); ++ read_unlock_tree(znode_get_tree(child)); ++ zput(left_child); ++ } ++ } ++ coord_dup(&cpy, coord); ++ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) { ++ right_child = znode_at(&cpy, cpy.node); ++ if (right_child != NULL) { ++ read_lock_tree(znode_get_tree(child)); ++ check_link(child, right_child); ++ read_unlock_tree(znode_get_tree(child)); ++ zput(right_child); ++ } ++ } ++ zput(child); ++ } ++ return 0; ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/* return true only if this item really points to "block" */ ++/* Audited by: green(2002.06.14) */ ++int has_pointer_to_internal(const coord_t * coord /* coord of item */ , ++ const reiser4_block_nr * block /* block number to ++ * check */ ) ++{ ++ assert("nikita-613", coord != NULL); ++ assert("nikita-614", block != NULL); ++ ++ return pointer_at(coord) == *block; ++} ++ ++/* hook called by ->create_item() method of node plugin after new internal ++ item was just created. ++ ++ This is point where pointer to new node is inserted into tree. Initialize ++ parent pointer in child znode, insert child into sibling list and slum. ++ ++*/ ++int create_hook_internal(const coord_t * item /* coord of item */ , ++ void *arg /* child's left neighbor, if any */ ) ++{ ++ znode *child; ++ __u64 child_ptr; ++ ++ assert("nikita-1252", item != NULL); ++ assert("nikita-1253", item->node != NULL); ++ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL); ++ assert("nikita-1450", item->unit_pos == 0); ++ ++ /* ++ * preparing to item insertion build_child_ptr_data sets pointer to ++ * data to be inserted to jnode's blocknr which is in cpu byte ++ * order. Node's create_item simply copied those data. As result we ++ * have child pointer in cpu's byte order. Convert content of internal ++ * item to little endian byte order. 
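The byte-order round-trip just described is the usual unaligned little-endian store/load pattern (put_unaligned(cpu_to_le64(...)) on the way to disk, le64_to_cpu(get_unaligned(...)) on the way back). A portable standalone sketch of the same idea, without the kernel helpers:

#include <stdint.h>
#include <stdio.h>

/* store a 64-bit value little-endian at a possibly unaligned address */
static void store_le64(unsigned char *dst, uint64_t v)
{
        int i;

        for (i = 0; i < 8; i++)
                dst[i] = (unsigned char)(v >> (8 * i));
}

/* read it back into CPU byte order, whatever that is */
static uint64_t load_le64(const unsigned char *src)
{
        uint64_t v = 0;
        int i;

        for (i = 0; i < 8; i++)
                v |= (uint64_t)src[i] << (8 * i);
        return v;
}

int main(void)
{
        unsigned char body[9];          /* odd size: force misalignment */

        store_le64(body + 1, 0x1234);
        printf("%llx\n", (unsigned long long)load_le64(body + 1)); /* 1234 */
        return 0;
}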
++ */ ++ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item)); ++ reiser4_update_internal(item, &child_ptr); ++ ++ child = znode_at(item, item->node); ++ if (child != NULL && !IS_ERR(child)) { ++ znode *left; ++ int result = 0; ++ reiser4_tree *tree; ++ ++ left = arg; ++ tree = znode_get_tree(item->node); ++ write_lock_tree(tree); ++ write_lock_dk(tree); ++ assert("nikita-1400", (child->in_parent.node == NULL) ++ || (znode_above_root(child->in_parent.node))); ++ ++item->node->c_count; ++ coord_to_parent_coord(item, &child->in_parent); ++ sibling_list_insert_nolock(child, left); ++ ++ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN)); ++ ZF_CLR(child, JNODE_ORPHAN); ++ ++ if ((left != NULL) && !keyeq(znode_get_rd_key(left), ++ znode_get_rd_key(child))) { ++ znode_set_rd_key(child, znode_get_rd_key(left)); ++ } ++ write_unlock_dk(tree); ++ write_unlock_tree(tree); ++ zput(child); ++ return result; ++ } else { ++ if (child == NULL) ++ child = ERR_PTR(-EIO); ++ return PTR_ERR(child); ++ } ++} ++ ++/* hook called by ->cut_and_kill() method of node plugin just before internal ++ item is removed. ++ ++ This is the point where an empty node is removed from the tree. Clear parent ++ pointer in child, and mark node for pending deletion. ++ ++ The node will actually be deleted later, in several installments: ++ ++ . when the last lock on this node is released, the node will be removed from ++ the sibling list and its lock will be invalidated ++ ++ . when the last reference to this node is dropped, the bitmap will be updated ++ and the node will actually be removed from memory. ++ ++*/ ++int kill_hook_internal(const coord_t * item /* coord of item */ , ++ pos_in_node_t from UNUSED_ARG /* start unit */ , ++ pos_in_node_t count UNUSED_ARG /* stop unit */ , ++ struct carry_kill_data *p UNUSED_ARG) ++{ ++ znode *child; ++ int result = 0; ++ ++ assert("nikita-1222", item != NULL); ++ assert("nikita-1224", from == 0); ++ assert("nikita-1225", count == 1); ++ ++ child = znode_at(item, item->node); ++ if (child == NULL) ++ return 0; ++ if (IS_ERR(child)) ++ return PTR_ERR(child); ++ result = zload(child); ++ if (result) { ++ zput(child); ++ return result; ++ } ++ if (node_is_empty(child)) { ++ reiser4_tree *tree; ++ ++ assert("nikita-1397", znode_is_write_locked(child)); ++ assert("nikita-1398", child->c_count == 0); ++ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE)); ++ ++ tree = znode_get_tree(item->node); ++ write_lock_tree(tree); ++ init_parent_coord(&child->in_parent, NULL); ++ --item->node->c_count; ++ write_unlock_tree(tree); ++ } else { ++ warning("nikita-1223", ++ "Cowardly refuse to remove link to non-empty node"); ++ result = RETERR(-EIO); ++ } ++ zrelse(child); ++ zput(child); ++ return result; ++} ++ ++/* hook called by ->shift() node plugin method when internal item was just ++ moved from one node to another.
++ ++ Update parent pointer in child and c_counts in old and new parent ++ ++*/ ++int shift_hook_internal(const coord_t * item /* coord of item */ , ++ unsigned from UNUSED_ARG /* start unit */ , ++ unsigned count UNUSED_ARG /* stop unit */ , ++ znode * old_node /* old parent */ ) ++{ ++ znode *child; ++ znode *new_node; ++ reiser4_tree *tree; ++ ++ assert("nikita-1276", item != NULL); ++ assert("nikita-1277", from == 0); ++ assert("nikita-1278", count == 1); ++ assert("nikita-1451", item->unit_pos == 0); ++ ++ new_node = item->node; ++ assert("nikita-2132", new_node != old_node); ++ tree = znode_get_tree(item->node); ++ child = child_znode(item, old_node, 1, 0); ++ if (child == NULL) ++ return 0; ++ if (!IS_ERR(child)) { ++ write_lock_tree(tree); ++ ++new_node->c_count; ++ assert("nikita-1395", znode_parent(child) == old_node); ++ assert("nikita-1396", old_node->c_count > 0); ++ coord_to_parent_coord(item, &child->in_parent); ++ assert("nikita-1781", znode_parent(child) == new_node); ++ assert("nikita-1782", ++ check_tree_pointer(item, child) == NS_FOUND); ++ --old_node->c_count; ++ write_unlock_tree(tree); ++ zput(child); ++ return 0; ++ } else ++ return PTR_ERR(child); ++} ++ ++/* plugin->u.item.b.max_key_inside - not defined */ ++ ++/* plugin->u.item.b.nr_units - item.c:single_unit */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/internal.h linux-2.6.33/fs/reiser4/plugin/item/internal.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/internal.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,57 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* Internal item contains down-link to the child of the internal/twig ++ node in a tree. It is internal items that are actually used during ++ tree traversal. */ ++ ++#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ ) ++#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++ ++/* on-disk layout of internal item */ ++typedef struct internal_item_layout { ++ /* 0 */ reiser4_dblock_nr pointer; ++ /* 4 */ ++} internal_item_layout; ++ ++struct cut_list; ++ ++int mergeable_internal(const coord_t * p1, const coord_t * p2); ++lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias, ++ coord_t * coord); ++/* store pointer from internal item into "block". 
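Taken together, the three hooks above maintain a simple invariant: a parent node's c_count equals the number of children whose in_parent coordinates point back at it. create_hook_internal() increments it, kill_hook_internal() decrements it, and shift_hook_internal() moves one count from the old parent to the new. A toy model of that bookkeeping (not kernel code; locking and the real znode structure are omitted):

#include <assert.h>

struct toy_node { int c_count; };

static void create_hook(struct toy_node *parent) { parent->c_count++; }
static void kill_hook(struct toy_node *parent)   { parent->c_count--; }

static void shift_hook(struct toy_node *old_parent, struct toy_node *new_parent)
{
        new_parent->c_count++;  /* child now pointed to from the new parent */
        old_parent->c_count--;  /* ... and no longer from the old one */
}

int main(void)
{
        struct toy_node a = { 0 }, b = { 0 };

        create_hook(&a);        /* internal item inserted into a */
        shift_hook(&a, &b);     /* item shifted from a to b */
        assert(a.c_count == 0 && b.c_count == 1);
        kill_hook(&b);          /* item (and empty child) removed */
        assert(b.c_count == 0);
        return 0;
}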
Implementation of ++ ->down_link() method */ ++extern void down_link_internal(const coord_t * coord, const reiser4_key * key, ++ reiser4_block_nr * block); ++extern int has_pointer_to_internal(const coord_t * coord, ++ const reiser4_block_nr * block); ++extern int create_hook_internal(const coord_t * item, void *arg); ++extern int kill_hook_internal(const coord_t * item, pos_in_node_t from, ++ pos_in_node_t count, struct carry_kill_data *); ++extern int shift_hook_internal(const coord_t * item, unsigned from, ++ unsigned count, znode * old_node); ++extern void reiser4_print_internal(const char *prefix, coord_t * coord); ++ ++extern int utmost_child_internal(const coord_t * coord, sideof side, ++ jnode ** child); ++int utmost_child_real_block_internal(const coord_t * coord, sideof side, ++ reiser4_block_nr * block); ++ ++extern void reiser4_update_internal(const coord_t * coord, ++ const reiser4_block_nr * blocknr); ++/* FIXME: reiserfs has check_internal */ ++extern int check__internal(const coord_t * coord, const char **error); ++ ++/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/item.c linux-2.6.33/fs/reiser4/plugin/item/item.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/item.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/item.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,719 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* definition of item plugins. */ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "../plugin_header.h" ++#include "sde.h" ++#include "internal.h" ++#include "item.h" ++#include "static_stat.h" ++#include "../plugin.h" ++#include "../../znode.h" ++#include "../../tree.h" ++#include "../../context.h" ++#include "ctail.h" ++ ++/* return pointer to item body */ ++void item_body_by_coord_hard(coord_t * coord /* coord to query */ ) ++{ ++ assert("nikita-324", coord != NULL); ++ assert("nikita-325", coord->node != NULL); ++ assert("nikita-326", znode_is_loaded(coord->node)); ++ assert("nikita-3200", coord->offset == INVALID_OFFSET); ++ ++ coord->offset = ++ node_plugin_by_node(coord->node)->item_by_coord(coord) - ++ zdata(coord->node); ++ ON_DEBUG(coord->body_v = coord->node->times_locked); ++} ++ ++void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ ) ++{ ++ return zdata(coord->node) + coord->offset; ++} ++ ++#if REISER4_DEBUG ++ ++int item_body_is_valid(const coord_t * coord) ++{ ++ return ++ coord->offset == ++ node_plugin_by_node(coord->node)->item_by_coord(coord) - ++ zdata(coord->node); ++} ++ ++#endif ++ ++/* return length of item at @coord */ ++pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ ) ++{ ++ int len; ++ ++ assert("nikita-327", coord != NULL); ++ assert("nikita-328", coord->node != NULL); ++ assert("nikita-329", znode_is_loaded(coord->node)); ++ ++ len = node_plugin_by_node(coord->node)->length_by_coord(coord); ++ return len; ++} ++ ++void obtain_item_plugin(const coord_t * coord) ++{ ++ assert("nikita-330", coord != NULL); ++ assert("nikita-331", coord->node != NULL); ++ assert("nikita-332", znode_is_loaded(coord->node)); ++ ++ coord_set_iplug((coord_t *) coord, ++ node_plugin_by_node(coord->node)-> ++ plugin_by_coord(coord)); ++ 
assert("nikita-2479", ++ coord_iplug(coord) == ++ node_plugin_by_node(coord->node)->plugin_by_coord(coord)); ++} ++ ++/* return id of item */ ++/* Audited by: green(2002.06.15) */ ++item_id item_id_by_coord(const coord_t * coord /* coord to query */ ) ++{ ++ assert("vs-539", coord != NULL); ++ assert("vs-538", coord->node != NULL); ++ assert("vs-537", znode_is_loaded(coord->node)); ++ assert("vs-536", item_plugin_by_coord(coord) != NULL); ++ assert("vs-540", ++ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID); ++ ++ return item_id_by_plugin(item_plugin_by_coord(coord)); ++} ++ ++/* return key of item at @coord */ ++/* Audited by: green(2002.06.15) */ ++reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ assert("nikita-338", coord != NULL); ++ assert("nikita-339", coord->node != NULL); ++ assert("nikita-340", znode_is_loaded(coord->node)); ++ ++ return node_plugin_by_node(coord->node)->key_at(coord, key); ++} ++ ++/* this returns max key in the item */ ++reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ coord_t last; ++ ++ assert("nikita-338", coord != NULL); ++ assert("nikita-339", coord->node != NULL); ++ assert("nikita-340", znode_is_loaded(coord->node)); ++ ++ /* make coord pointing to last item's unit */ ++ coord_dup(&last, coord); ++ last.unit_pos = coord_num_units(&last) - 1; ++ assert("vs-1560", coord_is_existing_unit(&last)); ++ ++ max_unit_key_by_coord(&last, key); ++ return key; ++} ++ ++/* return key of unit at @coord */ ++reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ assert("nikita-772", coord != NULL); ++ assert("nikita-774", coord->node != NULL); ++ assert("nikita-775", znode_is_loaded(coord->node)); ++ ++ if (item_plugin_by_coord(coord)->b.unit_key != NULL) ++ return item_plugin_by_coord(coord)->b.unit_key(coord, key); ++ else ++ return item_key_by_coord(coord, key); ++} ++ ++/* return the biggest key contained the unit @coord */ ++reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ , ++ reiser4_key * key /* result */ ) ++{ ++ assert("nikita-772", coord != NULL); ++ assert("nikita-774", coord->node != NULL); ++ assert("nikita-775", znode_is_loaded(coord->node)); ++ ++ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL) ++ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key); ++ else ++ return unit_key_by_coord(coord, key); ++} ++ ++/* ->max_key_inside() method for items consisting of exactly one key (like ++ stat-data) */ ++static reiser4_key *max_key_inside_single_key(const coord_t * ++ coord /* coord of item */ , ++ reiser4_key * ++ result /* resulting key */ ) ++{ ++ assert("nikita-604", coord != NULL); ++ ++ /* coord -> key is starting key of this item and it has to be already ++ filled in */ ++ return unit_key_by_coord(coord, result); ++} ++ ++/* ->nr_units() method for items consisting of exactly one unit always */ ++pos_in_node_t ++nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ ) ++{ ++ return 1; ++} ++ ++static int ++paste_no_paste(coord_t * coord UNUSED_ARG, ++ reiser4_item_data * data UNUSED_ARG, ++ carry_plugin_info * info UNUSED_ARG) ++{ ++ return 0; ++} ++ ++/* default ->fast_paste() method */ ++static int ++agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ ) ++{ ++ return 1; ++} ++ ++int item_can_contain_key(const coord_t * item /* coord of item */ , 
++ const reiser4_key * key /* key to check */ , ++ const reiser4_item_data * data /* parameters of item ++ * being created */ ) ++{ ++ item_plugin *iplug; ++ reiser4_key min_key_in_item; ++ reiser4_key max_key_in_item; ++ ++ assert("nikita-1658", item != NULL); ++ assert("nikita-1659", key != NULL); ++ ++ iplug = item_plugin_by_coord(item); ++ if (iplug->b.can_contain_key != NULL) ++ return iplug->b.can_contain_key(item, key, data); ++ else { ++ assert("nikita-1681", iplug->b.max_key_inside != NULL); ++ item_key_by_coord(item, &min_key_in_item); ++ iplug->b.max_key_inside(item, &max_key_in_item); ++ ++ /* can contain key if ++ min_key_in_item <= key && ++ key <= max_key_in_item ++ */ ++ return keyle(&min_key_in_item, key) ++ && keyle(key, &max_key_in_item); ++ } ++} ++ ++/* mergeable method for non mergeable items */ ++static int ++not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG) ++{ ++ return 0; ++} ++ ++/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */ ++int are_items_mergeable(const coord_t * i1 /* coord of first item */ , ++ const coord_t * i2 /* coord of second item */ ) ++{ ++ item_plugin *iplug; ++ reiser4_key k1; ++ reiser4_key k2; ++ ++ assert("nikita-1336", i1 != NULL); ++ assert("nikita-1337", i2 != NULL); ++ ++ iplug = item_plugin_by_coord(i1); ++ assert("nikita-1338", iplug != NULL); ++ ++ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in ++ shifting code when nodes are in "suspended" state. */ ++ assert("nikita-1663", ++ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2))); ++ ++ if (iplug->b.mergeable != NULL) { ++ return iplug->b.mergeable(i1, i2); ++ } else if (iplug->b.max_key_inside != NULL) { ++ iplug->b.max_key_inside(i1, &k1); ++ item_key_by_coord(i2, &k2); ++ ++ /* mergeable if ->max_key_inside() >= key of i2; */ ++ return keyge(iplug->b.max_key_inside(i1, &k1), ++ item_key_by_coord(i2, &k2)); ++ } else { ++ item_key_by_coord(i1, &k1); ++ item_key_by_coord(i2, &k2); ++ ++ return ++ (get_key_locality(&k1) == get_key_locality(&k2)) && ++ (get_key_objectid(&k1) == get_key_objectid(&k2)) ++ && (iplug == item_plugin_by_coord(i2)); ++ } ++} ++ ++int item_is_extent(const coord_t * item) ++{ ++ assert("vs-482", coord_is_existing_item(item)); ++ return item_id_by_coord(item) == EXTENT_POINTER_ID; ++} ++ ++int item_is_tail(const coord_t * item) ++{ ++ assert("vs-482", coord_is_existing_item(item)); ++ return item_id_by_coord(item) == FORMATTING_ID; ++} ++ ++#if REISER4_DEBUG ++ ++int item_is_statdata(const coord_t * item) ++{ ++ assert("vs-516", coord_is_existing_item(item)); ++ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE); ++} ++ ++int item_is_ctail(const coord_t * item) ++{ ++ assert("edward-xx", coord_is_existing_item(item)); ++ return item_id_by_coord(item) == CTAIL_ID; ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++static int change_item(struct inode *inode, ++ reiser4_plugin * plugin, ++ pset_member memb) ++{ ++ /* cannot change constituent item (sd, or dir_item) */ ++ return RETERR(-EINVAL); ++} ++ ++static reiser4_plugin_ops item_plugin_ops = { ++ .init = NULL, ++ .load = NULL, ++ .save_len = NULL, ++ .save = NULL, ++ .change = change_item ++}; ++ ++item_plugin item_plugins[LAST_ITEM_ID] = { ++ [STATIC_STAT_DATA_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = STATIC_STAT_DATA_ID, ++ .groups = (1 << STAT_DATA_ITEM_TYPE), ++ .pops = &item_plugin_ops, ++ .label = "sd", ++ .desc = "stat-data", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .max_key_inside = 
max_key_inside_single_key, ++ .can_contain_key = NULL, ++ .mergeable = not_mergeable, ++ .nr_units = nr_units_single_unit, ++ .lookup = NULL, ++ .init = NULL, ++ .paste = paste_no_paste, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .sd = { ++ .init_inode = init_inode_static_sd, ++ .save_len = save_len_static_sd, ++ .save = save_static_sd ++ } ++ } ++ }, ++ [SIMPLE_DIR_ENTRY_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = SIMPLE_DIR_ENTRY_ID, ++ .groups = (1 << DIR_ENTRY_ITEM_TYPE), ++ .pops = &item_plugin_ops, ++ .label = "de", ++ .desc = "directory entry", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .max_key_inside = max_key_inside_single_key, ++ .can_contain_key = NULL, ++ .mergeable = NULL, ++ .nr_units = nr_units_single_unit, ++ .lookup = NULL, ++ .init = NULL, ++ .paste = NULL, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .dir = { ++ .extract_key = extract_key_de, ++ .update_key = update_key_de, ++ .extract_name = extract_name_de, ++ .extract_file_type = extract_file_type_de, ++ .add_entry = add_entry_de, ++ .rem_entry = rem_entry_de, ++ .max_name_len = max_name_len_de ++ } ++ } ++ }, ++ [COMPOUND_DIR_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = COMPOUND_DIR_ID, ++ .groups = (1 << DIR_ENTRY_ITEM_TYPE), ++ .pops = &item_plugin_ops, ++ .label = "cde", ++ .desc = "compressed directory entry", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .max_key_inside = max_key_inside_cde, ++ .can_contain_key = can_contain_key_cde, ++ .mergeable = mergeable_cde, ++ .nr_units = nr_units_cde, ++ .lookup = lookup_cde, ++ .init = init_cde, ++ .paste = paste_cde, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_cde, ++ .copy_units = copy_units_cde, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = cut_units_cde, ++ .kill_units = kill_units_cde, ++ .unit_key = unit_key_cde, ++ .max_unit_key = unit_key_cde, ++ .estimate = estimate_cde, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = reiser4_check_cde ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .dir = { ++ .extract_key = extract_key_cde, ++ .update_key = update_key_cde, ++ .extract_name = extract_name_cde, ++ .extract_file_type = extract_file_type_de, ++ .add_entry = add_entry_cde, ++ .rem_entry = rem_entry_cde, ++ .max_name_len = max_name_len_cde ++ } ++ } ++ }, ++ [NODE_POINTER_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = NODE_POINTER_ID, ++ .groups = (1 << INTERNAL_ITEM_TYPE), ++ .pops = NULL, ++ .label = "internal", ++ .desc = "internal item", ++ .linkage = {NULL, NULL} ++ 
}, ++ .b = { ++ .max_key_inside = NULL, ++ .can_contain_key = NULL, ++ .mergeable = mergeable_internal, ++ .nr_units = nr_units_single_unit, ++ .lookup = lookup_internal, ++ .init = NULL, ++ .paste = NULL, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = create_hook_internal, ++ .kill_hook = kill_hook_internal, ++ .shift_hook = shift_hook_internal, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = check__internal ++#endif ++ }, ++ .f = { ++ .utmost_child = utmost_child_internal, ++ .utmost_child_real_block = ++ utmost_child_real_block_internal, ++ .update = reiser4_update_internal, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .internal = { ++ .down_link = down_link_internal, ++ .has_pointer_to = has_pointer_to_internal ++ } ++ } ++ }, ++ [EXTENT_POINTER_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = EXTENT_POINTER_ID, ++ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), ++ .pops = NULL, ++ .label = "extent", ++ .desc = "extent item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .max_key_inside = max_key_inside_extent, ++ .can_contain_key = can_contain_key_extent, ++ .mergeable = mergeable_extent, ++ .nr_units = nr_units_extent, ++ .lookup = lookup_extent, ++ .init = NULL, ++ .paste = paste_extent, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_extent, ++ .create_hook = create_hook_extent, ++ .copy_units = copy_units_extent, ++ .kill_hook = kill_hook_extent, ++ .shift_hook = NULL, ++ .cut_units = cut_units_extent, ++ .kill_units = kill_units_extent, ++ .unit_key = unit_key_extent, ++ .max_unit_key = max_unit_key_extent, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = reiser4_check_extent ++#endif ++ }, ++ .f = { ++ .utmost_child = utmost_child_extent, ++ .utmost_child_real_block = ++ utmost_child_real_block_extent, ++ .update = NULL, ++ .scan = reiser4_scan_extent, ++ .convert = NULL, ++ .key_by_offset = key_by_offset_extent ++ }, ++ .s = { ++ .file = { ++ .write = reiser4_write_extent, ++ .read = reiser4_read_extent, ++ .readpage = reiser4_readpage_extent, ++ .get_block = get_block_address_extent, ++ .append_key = append_key_extent, ++ .init_coord_extension = ++ init_coord_extension_extent ++ } ++ } ++ }, ++ [FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = FORMATTING_ID, ++ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), ++ .pops = NULL, ++ .label = "body", ++ .desc = "body (or tail?) 
item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .max_key_inside = max_key_inside_tail, ++ .can_contain_key = can_contain_key_tail, ++ .mergeable = mergeable_tail, ++ .nr_units = nr_units_tail, ++ .lookup = lookup_tail, ++ .init = NULL, ++ .paste = paste_tail, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_tail, ++ .create_hook = NULL, ++ .copy_units = copy_units_tail, ++ .kill_hook = kill_hook_tail, ++ .shift_hook = NULL, ++ .cut_units = cut_units_tail, ++ .kill_units = kill_units_tail, ++ .unit_key = unit_key_tail, ++ .max_unit_key = unit_key_tail, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ }, ++ .f = { ++ .utmost_child = NULL, ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = NULL, ++ .convert = NULL ++ }, ++ .s = { ++ .file = { ++ .write = reiser4_write_tail, ++ .read = reiser4_read_tail, ++ .readpage = readpage_tail, ++ .get_block = get_block_address_tail, ++ .append_key = append_key_tail, ++ .init_coord_extension = ++ init_coord_extension_tail ++ } ++ } ++ }, ++ [CTAIL_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = CTAIL_ID, ++ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), ++ .pops = NULL, ++ .label = "ctail", ++ .desc = "cryptcompress tail item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .max_key_inside = max_key_inside_tail, ++ .can_contain_key = can_contain_key_ctail, ++ .mergeable = mergeable_ctail, ++ .nr_units = nr_units_ctail, ++ .lookup = NULL, ++ .init = init_ctail, ++ .paste = paste_ctail, ++ .fast_paste = agree_to_fast_op, ++ .can_shift = can_shift_ctail, ++ .create_hook = create_hook_ctail, ++ .copy_units = copy_units_ctail, ++ .kill_hook = kill_hook_ctail, ++ .shift_hook = shift_hook_ctail, ++ .cut_units = cut_units_ctail, ++ .kill_units = kill_units_ctail, ++ .unit_key = unit_key_tail, ++ .max_unit_key = unit_key_tail, ++ .estimate = estimate_ctail, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = check_ctail ++#endif ++ }, ++ .f = { ++ .utmost_child = utmost_child_ctail, ++ /* FIXME-EDWARD: write this */ ++ .utmost_child_real_block = NULL, ++ .update = NULL, ++ .scan = scan_ctail, ++ .convert = convert_ctail ++ }, ++ .s = { ++ .file = { ++ .write = NULL, ++ .read = read_ctail, ++ .readpage = readpage_ctail, ++ .get_block = get_block_address_tail, ++ .append_key = append_key_ctail, ++ .init_coord_extension = ++ init_coord_extension_tail ++ } ++ } ++ }, ++ [BLACK_BOX_ID] = { ++ .h = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .id = BLACK_BOX_ID, ++ .groups = (1 << OTHER_ITEM_TYPE), ++ .pops = NULL, ++ .label = "blackbox", ++ .desc = "black box item", ++ .linkage = {NULL, NULL} ++ }, ++ .b = { ++ .max_key_inside = NULL, ++ .can_contain_key = NULL, ++ .mergeable = not_mergeable, ++ .nr_units = nr_units_single_unit, ++ /* to need for ->lookup method */ ++ .lookup = NULL, ++ .init = NULL, ++ .paste = NULL, ++ .fast_paste = NULL, ++ .can_shift = NULL, ++ .copy_units = NULL, ++ .create_hook = NULL, ++ .kill_hook = NULL, ++ .shift_hook = NULL, ++ .cut_units = NULL, ++ .kill_units = NULL, ++ .unit_key = NULL, ++ .max_unit_key = NULL, ++ .estimate = NULL, ++ .item_data_by_flow = NULL, ++#if REISER4_DEBUG ++ .check = NULL ++#endif ++ } ++ } ++}; ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/item.h linux-2.6.33/fs/reiser4/plugin/item/item.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/item.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/item.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,398 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* first read balance.c comments before reading this */ ++ ++/* An item_plugin implements all of the operations required for ++ balancing that are item specific. */ ++ ++/* an item plugin also implements other operations that are specific to that ++ item. These go into the item specific operations portion of the item ++ handler, and all of the item specific portions of the item handler are put ++ into a union. */ ++ ++#if !defined( __REISER4_ITEM_H__ ) ++#define __REISER4_ITEM_H__ ++ ++#include "../../forward.h" ++#include "../plugin_header.h" ++#include "../../dformat.h" ++#include "../../seal.h" ++#include "../../plugin/file/file.h" ++ ++#include <linux/fs.h> /* for struct file, struct inode */ ++#include <linux/mm.h> /* for struct page */ ++#include <linux/dcache.h> /* for struct dentry */ ++ ++typedef enum { ++ STAT_DATA_ITEM_TYPE, ++ DIR_ENTRY_ITEM_TYPE, ++ INTERNAL_ITEM_TYPE, ++ UNIX_FILE_METADATA_ITEM_TYPE, ++ OTHER_ITEM_TYPE ++} item_type_id; ++ ++/* this is the part of each item plugin that all items are expected to ++ support or at least explicitly fail to support by setting the ++ pointer to null. */ ++struct balance_ops { ++ /* operations called by balancing ++ ++ It is interesting to consider that some of these item ++ operations could be given sources or targets that are not ++ really items in nodes. This could be ok/useful. ++ ++ */ ++ /* maximal key that can _possibly_ be occupied by this item ++ ++ When inserting, once the node ->lookup() method (called by ++ coord_by_key()) reaches an item after binary search, ++ the ->max_key_inside() item plugin method is used to determine ++ whether the new item should be pasted into an existing item ++ (new_key<=max_key_inside()) or a new item has to be created ++ (new_key>max_key_inside()). ++ ++ For items that occupy exactly one key (like stat-data) ++ this method should return this key. For items that can ++ grow indefinitely (extent, directory item) this should ++ return reiser4_max_key(). ++ ++ For example, for an extent with the key ++ ++ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, ++ ++ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff). ++ */ ++ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *); ++ ++ /* true if item @coord can merge data at @key. */ ++ int (*can_contain_key) (const coord_t *, const reiser4_key *, ++ const reiser4_item_data *); ++ /* mergeable() - check items for mergeability ++ ++ Optional method. Returns true if two items can be merged. ++ ++ */ ++ int (*mergeable) (const coord_t *, const coord_t *); ++ ++ /* number of atomic things in an item. ++ NOTE FOR CONTRIBUTORS: use a generic method ++ nr_units_single_unit() for solid (atomic) items, as ++ tree operations use it as a criterion of solidness ++ (see is_solid_item macro) */ ++ pos_in_node_t(*nr_units) (const coord_t *); ++ ++ /* search within item for a unit within the item, and return a ++ pointer to it.
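The paste-versus-insert rule described under ->max_key_inside() above can be made concrete with a one-dimensional stand-in for reiser4_key: a single-key item (stat-data) reports its own key as the maximum, a growable item (extent, directory) reports the maximal possible key, and the caller pastes only when the new key fits under that bound. A sketch, with toy_key as an assumed simplification:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_key;       /* one-dimensional stand-in for reiser4_key */

/* stat-data-like item: occupies exactly its own key */
static toy_key max_key_inside_single(toy_key item_key)
{
        return item_key;
}

/* extent-like item: can grow indefinitely */
static toy_key max_key_inside_growable(toy_key item_key)
{
        (void)item_key;
        return UINT64_MAX;
}

int main(void)
{
        toy_key item_key = 100, new_key = 4196;

        /* paste iff new_key <= max_key_inside(), otherwise create anew */
        puts(new_key <= max_key_inside_single(item_key) ? "paste" : "create");
        puts(new_key <= max_key_inside_growable(item_key) ? "paste" : "create");
        return 0;
}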
This can be used to calculate how many ++ bytes to shrink an item if you use pointer arithmetic and ++ compare to the start of the item body if the item's data ++ are continuous in the node; if the item's data are not ++ continuous in the node, all sorts of other things are maybe ++ going to break as well. */ ++ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *); ++ /* method called by node_plugin->create_item() to initialise new ++ item */ ++ int (*init) (coord_t * target, coord_t * from, ++ reiser4_item_data * data); ++ /* method called (e.g., by reiser4_resize_item()) to place new data ++ into item when it grows */ ++ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *); ++ /* return true if paste into @coord is allowed to skip ++ carry. That is, if such a paste would not require any changes ++ at the parent level ++ */ ++ int (*fast_paste) (const coord_t *); ++ /* how many but not more than @want units of @source can be ++ shifted into @target node. If pend == append - we try to ++ append last item of @target by first units of @source. If ++ pend == prepend - we try to "prepend" first item in @target ++ by last units of @source. @target node has @free_space ++ bytes of free space. Total size of those units is returned ++ via @size. ++ ++ @target is not NULL if shifting to a mergeable item, and ++ NULL if a new item will be created during shifting. ++ */ ++ int (*can_shift) (unsigned free_space, coord_t *, ++ znode *, shift_direction, unsigned *size, ++ unsigned want); ++ ++ /* starting off @from-th unit of item @source append or ++ prepend @count units to @target. @target has been already ++ expanded by @free_space bytes. That must be exactly what is ++ needed for those items in @target. If @where_is_free_space ++ == SHIFT_LEFT - free space is at the end of @target item, ++ otherwise - it is in the beginning of it. */ ++ void (*copy_units) (coord_t *, coord_t *, ++ unsigned from, unsigned count, ++ shift_direction where_is_free_space, ++ unsigned free_space); ++ ++ int (*create_hook) (const coord_t *, void *); ++ /* do whatever is necessary to do when @count units starting ++ from @from-th one are removed from the tree */ ++ /* FIXME-VS: this used to be here for, in particular, ++ extents and items of internal type to free blocks they point ++ to at the same time with removing items from a ++ tree. Problems start, however, when dealloc_block fails due ++ to some reason. Item gets removed, but blocks it pointed to ++ are not freed. It is not clear how to fix this for items of ++ internal type because a need to remove internal item may ++ appear in the middle of balancing, and there is no way to ++ undo changes made. OTOH, if space allocator involves ++ balancing to perform dealloc_block - this will probably ++ break balancing due to deadlock issues ++ */ ++ int (*kill_hook) (const coord_t *, pos_in_node_t from, ++ pos_in_node_t count, struct carry_kill_data *); ++ int (*shift_hook) (const coord_t *, unsigned from, unsigned count, ++ znode * _node); ++ ++ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key ++ including boundaries. When units are cut from item beginning - move space which gets freed to head of ++ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of ++ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in ++ @smallest_removed if it is not 0.
Save new first item key in @new_first_key if it is not 0 ++ */ ++ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *, ++ reiser4_key * smallest_removed, ++ reiser4_key * new_first_key); ++ ++ /* like cut_units, except that these units are removed from the ++ tree, not only from a node */ ++ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *, ++ reiser4_key * smallest_removed, ++ reiser4_key * new_first); ++ ++ /* if @key_of_coord == 1 - key of coord is returned, otherwise - ++ key of unit is returned. If @coord is not set to certain ++ unit - ERR_PTR(-ENOENT) is returned */ ++ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *); ++ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *); ++ /* estimate how much space is needed for paste @data into item at ++ @coord. if @coord==0 - estimate insertion, otherwise - estimate ++ pasting ++ */ ++ int (*estimate) (const coord_t *, const reiser4_item_data *); ++ ++ /* converts flow @f to item data. @coord == 0 on insert */ ++ int (*item_data_by_flow) (const coord_t *, const flow_t *, ++ reiser4_item_data *); ++ ++ /*void (*show) (struct seq_file *, coord_t *); */ ++ ++#if REISER4_DEBUG ++ /* used for debugging, every item should have here the most ++ complete possible check of the consistency of the item that ++ the inventor can construct */ ++ int (*check) (const coord_t *, const char **error); ++#endif ++ ++}; ++ ++struct flush_ops { ++ /* return the right or left child of @coord, only if it is in memory */ ++ int (*utmost_child) (const coord_t *, sideof side, jnode ** child); ++ ++ /* return whether the right or left child of @coord has a non-fake ++ block number. */ ++ int (*utmost_child_real_block) (const coord_t *, sideof side, ++ reiser4_block_nr *); ++ /* relocate child at @coord to the @block */ ++ void (*update) (const coord_t *, const reiser4_block_nr *); ++ /* count unformatted nodes per item for leaf relocation policy, etc. */ ++ int (*scan) (flush_scan * scan); ++ /* convert item by flush */ ++ int (*convert) (flush_pos_t * pos); ++ /* backward mapping from jnode offset to a key. */ ++ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *); ++}; ++ ++/* operations specific to the directory item */ ++struct dir_entry_iops { ++ /* extract stat-data key from directory entry at @coord and place it ++ into @key. */ ++ int (*extract_key) (const coord_t *, reiser4_key * key); ++ /* update object key in item.
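For the extent flavour shown earlier in this patch, the "return amount of space which got freed" contract of ->cut_units()/->kill_units() reduces to counting removed units, with partially cut boundary units surviving. A standalone sketch of that accounting (toy_extent is an assumed stand-in for reiser4_extent):

#include <stdio.h>

struct toy_extent { unsigned long start, width; };

/* remove units [from, to]; a boundary unit that is only shrunk
   (cut partially) survives and is not counted as freed */
static size_t bytes_freed(unsigned from, unsigned to,
                          int head_partial, int tail_partial)
{
        unsigned count = to - from + 1;

        if (head_partial)
                count--;        /* unit @from survives, narrower */
        if (tail_partial)
                count--;        /* unit @to survives, narrower */
        return count * sizeof(struct toy_extent);
}

int main(void)
{
        /* cut units 2..5 with both boundary units kept: 2 whole units freed */
        printf("%zu\n", bytes_freed(2, 5, 1, 1));
        return 0;
}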
*/ ++ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *); ++ /* extract name from directory entry at @coord and return it */ ++ char *(*extract_name) (const coord_t *, char *buf); ++ /* extract file type (DT_* stuff) from directory entry at @coord and ++ return it */ ++ unsigned (*extract_file_type) (const coord_t *); ++ int (*add_entry) (struct inode * dir, ++ coord_t *, lock_handle *, ++ const struct dentry * name, ++ reiser4_dir_entry_desc * entry); ++ int (*rem_entry) (struct inode * dir, const struct qstr * name, ++ coord_t *, lock_handle *, ++ reiser4_dir_entry_desc * entry); ++ int (*max_name_len) (const struct inode * dir); ++}; ++ ++/* operations specific to items regular (unix) file metadata are built of */ ++struct file_iops{ ++ ssize_t (*write) (struct file *, struct inode *, ++ const char __user *, size_t, loff_t *pos); ++ int (*read) (struct file *, flow_t *, hint_t *); ++ int (*readpage) (void *, struct page *); ++ int (*get_block) (const coord_t *, sector_t, sector_t *); ++ /* ++ * key of first byte which is not addressed by the item @coord is set ++ * to. ++ * For example, for extent item with the key ++ * ++ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, ++ * ++ * ->append_key is ++ * ++ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size) ++ */ ++ reiser4_key *(*append_key) (const coord_t *, reiser4_key *); ++ ++ void (*init_coord_extension) (uf_coord_t *, loff_t); ++}; ++ ++/* operations specific to items of stat data type */ ++struct sd_iops { ++ int (*init_inode) (struct inode * inode, char *sd, int len); ++ int (*save_len) (struct inode * inode); ++ int (*save) (struct inode * inode, char **area); ++}; ++ ++/* operations specific to internal item */ ++struct internal_iops{ ++ /* all tree traversal want to know from internal item is where ++ to go next. */ ++ void (*down_link) (const coord_t * coord, ++ const reiser4_key * key, reiser4_block_nr * block); ++ /* check that given internal item contains given pointer. 
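The ->append_key() example in the comment above is plain arithmetic: the first byte not addressed by an extent item is its starting offset plus its length in blocks times the block size. With an assumed 4 KiB block size:

#include <stdio.h>

int main(void)
{
        unsigned long starting_offset = 8192;   /* key offset of the item */
        unsigned long blk = 3;                  /* length in blocks */
        unsigned long block_size = 4096;        /* assumed block size */

        /* first byte beyond the extent: 8192 + 3 * 4096 = 20480 */
        printf("%lu\n", starting_offset + blk * block_size);
        return 0;
}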
*/ ++ int (*has_pointer_to) (const coord_t * coord, ++ const reiser4_block_nr * block); ++}; ++ ++struct item_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* methods common for all item types */ ++ struct balance_ops b; /* balance operations */ ++ struct flush_ops f; /* flush operates with items via this methods */ ++ ++ /* methods specific to particular type of item */ ++ union { ++ struct dir_entry_iops dir; ++ struct file_iops file; ++ struct sd_iops sd; ++ struct internal_iops internal; ++ } s; ++}; ++ ++#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit) ++ ++static inline item_id item_id_by_plugin(item_plugin * plugin) ++{ ++ return plugin->h.id; ++} ++ ++static inline char get_iplugid(item_plugin * iplug) ++{ ++ assert("nikita-2838", iplug != NULL); ++ assert("nikita-2839", iplug->h.id < 0xff); ++ return (char)item_id_by_plugin(iplug); ++} ++ ++extern unsigned long znode_times_locked(const znode * z); ++ ++static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug) ++{ ++ assert("nikita-2837", coord != NULL); ++ assert("nikita-2838", iplug != NULL); ++ coord->iplugid = get_iplugid(iplug); ++ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node)); ++} ++ ++static inline item_plugin *coord_iplug(const coord_t * coord) ++{ ++ assert("nikita-2833", coord != NULL); ++ assert("nikita-2834", coord->iplugid != INVALID_PLUGID); ++ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node)); ++ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE, ++ coord->iplugid); ++} ++ ++extern int item_can_contain_key(const coord_t * item, const reiser4_key * key, ++ const reiser4_item_data *); ++extern int are_items_mergeable(const coord_t * i1, const coord_t * i2); ++extern int item_is_extent(const coord_t *); ++extern int item_is_tail(const coord_t *); ++extern int item_is_statdata(const coord_t * item); ++extern int item_is_ctail(const coord_t *); ++ ++extern pos_in_node_t item_length_by_coord(const coord_t * coord); ++extern pos_in_node_t nr_units_single_unit(const coord_t * coord); ++extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ ); ++extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key); ++extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *); ++extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key); ++extern reiser4_key *max_unit_key_by_coord(const coord_t * coord, ++ reiser4_key * key); ++extern void obtain_item_plugin(const coord_t * coord); ++ ++#if defined(REISER4_DEBUG) ++extern int znode_is_loaded(const znode * node); ++#endif ++ ++/* return plugin of item at @coord */ ++static inline item_plugin *item_plugin_by_coord(const coord_t * ++ coord /* coord to query */ ) ++{ ++ assert("nikita-330", coord != NULL); ++ assert("nikita-331", coord->node != NULL); ++ assert("nikita-332", znode_is_loaded(coord->node)); ++ ++ if (unlikely(!coord_is_iplug_set(coord))) ++ obtain_item_plugin(coord); ++ return coord_iplug(coord); ++} ++ ++/* this returns true if item is of internal type */ ++static inline int item_is_internal(const coord_t * item) ++{ ++ assert("vs-483", coord_is_existing_item(item)); ++ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE); ++} ++ ++extern void item_body_by_coord_hard(coord_t * coord); ++extern void *item_body_by_coord_easy(const coord_t * coord); ++#if REISER4_DEBUG ++extern int item_body_is_valid(const coord_t * coord); ++#endif ++ ++/* return pointer to item body */ ++static inline void 
*item_body_by_coord(const coord_t * ++ coord /* coord to query */ ) ++{ ++ assert("nikita-324", coord != NULL); ++ assert("nikita-325", coord->node != NULL); ++ assert("nikita-326", znode_is_loaded(coord->node)); ++ ++ if (coord->offset == INVALID_OFFSET) ++ item_body_by_coord_hard((coord_t *) coord); ++ assert("nikita-3201", item_body_is_valid(coord)); ++ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node)); ++ return item_body_by_coord_easy(coord); ++} ++ ++/* __REISER4_ITEM_H__ */ ++#endif ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/Makefile linux-2.6.33/fs/reiser4/plugin/item/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,18 @@ ++obj-$(CONFIG_REISER4_FS) += item_plugins.o ++ ++item_plugins-objs := \ ++ item.o \ ++ static_stat.o \ ++ sde.o \ ++ cde.o \ ++ blackbox.o \ ++ internal.o \ ++ tail.o \ ++ ctail.o \ ++ extent.o \ ++ extent_item_ops.o \ ++ extent_file_ops.o \ ++ extent_flush_ops.o ++ ++ ++ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/sde.c linux-2.6.33/fs/reiser4/plugin/item/sde.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/sde.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,190 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Directory entry implementation */ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../coord.h" ++#include "sde.h" ++#include "item.h" ++#include "../plugin.h" ++#include "../../znode.h" ++#include "../../carry.h" ++#include "../../tree.h" ++#include "../../inode.h" ++ ++#include <linux/fs.h> /* for struct inode */ ++#include <linux/dcache.h> /* for struct dentry */ ++#include <linux/quotaops.h> ++ ++/* ->extract_key() method of simple directory item plugin. 
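item_body_by_coord() above (with its _hard/_easy halves defined in item.c earlier) is a memoization pattern: the node plugin is asked for the item's byte offset once, the result is cached in coord->offset, and later calls reduce to pointer arithmetic. A standalone sketch of the same two-level lookup:

#include <stdio.h>

#define INVALID_OFFSET (-1L)

struct toy_coord { long offset; };

/* stands in for node_plugin_by_node(...)->item_by_coord(coord) */
static long expensive_item_by_coord(void)
{
        puts("slow path");
        return 128;
}

static char *item_body(struct toy_coord *c, char *node_data)
{
        if (c->offset == INVALID_OFFSET)        /* "hard" half: runs once */
                c->offset = expensive_item_by_coord();
        return node_data + c->offset;           /* "easy" half: every call */
}

int main(void)
{
        static char node_data[4096];
        struct toy_coord c = { INVALID_OFFSET };

        item_body(&c, node_data);       /* prints "slow path", caches 128 */
        item_body(&c, node_data);       /* served from the cached offset */
        return 0;
}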
*/ ++int extract_key_de(const coord_t * coord /* coord of item */ , ++ reiser4_key * key /* resulting key */ ) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1458", coord != NULL); ++ assert("nikita-1459", key != NULL); ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent); ++ return extract_key_from_id(&dent->id, key); ++} ++ ++int ++update_key_de(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh UNUSED_ARG) ++{ ++ directory_entry_format *dent; ++ obj_key_id obj_id; ++ int result; ++ ++ assert("nikita-2342", coord != NULL); ++ assert("nikita-2343", key != NULL); ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ result = build_obj_key_id(key, &obj_id); ++ if (result == 0) { ++ dent->id = obj_id; ++ znode_make_dirty(coord->node); ++ } ++ return 0; ++} ++ ++char *extract_dent_name(const coord_t * coord, directory_entry_format * dent, ++ char *buf) ++{ ++ reiser4_key key; ++ ++ unit_key_by_coord(coord, &key); ++ if (get_key_type(&key) != KEY_FILE_NAME_MINOR) ++ reiser4_print_address("oops", znode_get_block(coord->node)); ++ if (!is_longname_key(&key)) { ++ if (is_dot_key(&key)) ++ return (char *)"."; ++ else ++ return extract_name_from_key(&key, buf); ++ } else ++ return (char *)dent->name; ++} ++ ++/* ->extract_name() method of simple directory item plugin. */ ++char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf) ++{ ++ directory_entry_format *dent; ++ ++ assert("nikita-1460", coord != NULL); ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ return extract_dent_name(coord, dent, buf); ++} ++ ++/* ->extract_file_type() method of simple directory item plugin. */ ++unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of ++ * item */ ) ++{ ++ assert("nikita-1764", coord != NULL); ++ /* we don't store file type in the directory entry yet. 
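extract_dent_name() above picks the name from one of three places: "." is synthesized for the dot key, short names are decoded out of the key itself, and only long names are read from the entry body. A toy sketch of that dispatch (the two flags are assumed inputs standing in for is_dot_key()/is_longname_key()):

#include <stdio.h>

struct toy_entry { const char *body_name; };    /* set only for long names */

static const char *extract_name(int is_dot, int is_long,
                                const struct toy_entry *dent,
                                const char *name_from_key)
{
        if (is_dot)
                return ".";                     /* synthesized, never stored */
        return is_long ? dent->body_name : name_from_key;
}

int main(void)
{
        struct toy_entry e = { "a-rather-long-file-name" };

        puts(extract_name(0, 1, &e, NULL));     /* long name: from item body */
        puts(extract_name(0, 0, &e, "abc"));    /* short name: from the key */
        return 0;
}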
++ ++ But see comments at kassign.h:obj_key_id ++ */ ++ return DT_UNKNOWN; ++} ++ ++int add_entry_de(struct inode *dir /* directory of item */ , ++ coord_t * coord /* coord of item */ , ++ lock_handle * lh /* insertion lock handle */ , ++ const struct dentry *de /* name to add */ , ++ reiser4_dir_entry_desc * entry /* parameters of new directory ++ * entry */ ) ++{ ++ reiser4_item_data data; ++ directory_entry_format *dent; ++ int result; ++ const char *name; ++ int len; ++ int longname; ++ ++ name = de->d_name.name; ++ len = de->d_name.len; ++ assert("nikita-1163", strlen(name) == len); ++ ++ longname = is_longname(name, len); ++ ++ data.length = sizeof *dent; ++ if (longname) ++ data.length += len + 1; ++ data.data = NULL; ++ data.user = 0; ++ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID); ++ ++ /* NOTE-NIKITA quota plugin */ ++ if (vfs_dq_alloc_space_nodirty(dir, data.length)) ++ return -EDQUOT; ++ ++ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ ); ++ if (result != 0) ++ return result; ++ ++ dent = (directory_entry_format *) item_body_by_coord(coord); ++ build_inode_key_id(entry->obj, &dent->id); ++ if (longname) { ++ memcpy(dent->name, name, len); ++ put_unaligned(0, &dent->name[len]); ++ } ++ return 0; ++} ++ ++int rem_entry_de(struct inode *dir /* directory of item */ , ++ const struct qstr *name UNUSED_ARG, ++ coord_t * coord /* coord of item */ , ++ lock_handle * lh UNUSED_ARG /* lock handle for ++ * removal */ , ++ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of ++ * directory entry ++ * being removed */ ) ++{ ++ coord_t shadow; ++ int result; ++ int length; ++ ++ length = item_length_by_coord(coord); ++ if (inode_get_bytes(dir) < length) { ++ warning("nikita-2627", "Dir is broke: %llu: %llu", ++ (unsigned long long)get_inode_oid(dir), ++ inode_get_bytes(dir)); ++ ++ return RETERR(-EIO); ++ } ++ ++ /* cut_node() is supposed to take pointers to _different_ ++ coords, because it will modify them without respect to ++ possible aliasing. To work around this, create temporary copy ++ of @coord. ++ */ ++ coord_dup(&shadow, coord); ++ result = ++ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); ++ if (result == 0) { ++ /* NOTE-NIKITA quota plugin */ ++ vfs_dq_free_space_nodirty(dir, length); ++ } ++ return result; ++} ++ ++int max_name_len_de(const struct inode *dir) ++{ ++ return reiser4_tree_by_inode(dir)->nplug->max_item_size() - ++ sizeof(directory_entry_format) - 2; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/sde.h linux-2.6.33/fs/reiser4/plugin/item/sde.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/sde.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,66 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Directory entry. */ ++ ++#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) ++#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++#include "../../kassign.h" ++#include "../../key.h" ++ ++#include <linux/fs.h> ++#include <linux/dcache.h> /* for struct dentry */ ++ ++typedef struct directory_entry_format { ++ /* key of object stat-data. 
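An aside on rem_entry_de() above: cut_node()-style helpers mutate both coord arguments without checking for aliasing, so the caller duplicates one of them first. A minimal userspace sketch of that defensive-copy pattern (the toy_* names are hypothetical, standing in for coord_dup()/kill_node_content()):

#include <string.h>

struct toy_coord { int node, pos; };

/* mutates *from and *to without checking whether they alias */
static void toy_cut(struct toy_coord *from, struct toy_coord *to)
{
	from->pos = 0;
	to->pos = -1;
}

static void toy_remove(struct toy_coord *coord)
{
	struct toy_coord shadow;

	memcpy(&shadow, coord, sizeof(shadow));	/* like coord_dup() */
	toy_cut(coord, &shadow);	/* safe: the pointers are distinct */
}

Passing coord twice instead would leave it in whichever state the second write happens to produce.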
It's not necessary to store whole ++ key here, because it's always key of stat-data, so minor ++ packing locality and offset can be omitted here. But this ++ relies on particular key allocation scheme for stat-data, so, ++ for extensibility sake, whole key can be stored here. ++ ++ We store key as array of bytes, because we don't want 8-byte ++ alignment of dir entries. ++ */ ++ obj_key_id id; ++ /* file name. Null terminated string. */ ++ d8 name[0]; ++} directory_entry_format; ++ ++void print_de(const char *prefix, coord_t * coord); ++int extract_key_de(const coord_t * coord, reiser4_key * key); ++int update_key_de(const coord_t * coord, const reiser4_key * key, ++ lock_handle * lh); ++char *extract_name_de(const coord_t * coord, char *buf); ++unsigned extract_file_type_de(const coord_t * coord); ++int add_entry_de(struct inode *dir, coord_t * coord, ++ lock_handle * lh, const struct dentry *name, ++ reiser4_dir_entry_desc * entry); ++int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, ++ lock_handle * lh, reiser4_dir_entry_desc * entry); ++int max_name_len_de(const struct inode *dir); ++ ++int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); ++ ++char *extract_dent_name(const coord_t * coord, ++ directory_entry_format * dent, char *buf); ++ ++#if REISER4_LARGE_KEY ++#define DE_NAME_BUF_LEN (24) ++#else ++#define DE_NAME_BUF_LEN (16) ++#endif ++ ++/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.33/fs/reiser4/plugin/item/static_stat.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/static_stat.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1107 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* stat data manipulation. */ ++ ++#include "../../forward.h" ++#include "../../super.h" ++#include "../../vfs_ops.h" ++#include "../../inode.h" ++#include "../../debug.h" ++#include "../../dformat.h" ++#include "../object.h" ++#include "../plugin.h" ++#include "../plugin_header.h" ++#include "static_stat.h" ++#include "item.h" ++ ++#include <linux/types.h> ++#include <linux/fs.h> ++ ++/* see static_stat.h for explanation */ ++ ++/* helper function used while we are dumping/loading inode/plugin state ++ to/from the stat-data. */ ++ ++static void move_on(int *length /* space remaining in stat-data */ , ++ char **area /* current coord in stat data */ , ++ int size_of /* how many bytes to move forward */ ) ++{ ++ assert("nikita-615", length != NULL); ++ assert("nikita-616", area != NULL); ++ ++ *length -= size_of; ++ *area += size_of; ++ ++ assert("nikita-617", *length >= 0); ++} ++ ++/* helper function used while loading inode/plugin state from stat-data. ++ Complain if there is less space in stat-data than was expected. ++ Can only happen on disk corruption. */ ++static int not_enough_space(struct inode *inode /* object being processed */ , ++ const char *where /* error message */ ) ++{ ++ assert("nikita-618", inode != NULL); ++ ++ warning("nikita-619", "Not enough space in %llu while loading %s", ++ (unsigned long long)get_inode_oid(inode), where); ++ ++ return RETERR(-EINVAL); ++} ++ ++/* helper function used while loading inode/plugin state from ++ stat-data. 
Call it if invalid plugin id was found. */ ++static int unknown_plugin(reiser4_plugin_id id /* invalid id */ , ++ struct inode *inode /* object being processed */ ) ++{ ++ warning("nikita-620", "Unknown plugin %i in %llu", ++ id, (unsigned long long)get_inode_oid(inode)); ++ ++ return RETERR(-EINVAL); ++} ++ ++/* this is installed as ->init_inode() method of ++ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). ++ Copies data from on-disk stat-data format into inode. ++ Handles stat-data extensions. */ ++/* was sd_load */ ++int init_inode_static_sd(struct inode *inode /* object being processed */ , ++ char *sd /* stat-data body */ , ++ int len /* length of stat-data */ ) ++{ ++ int result; ++ int bit; ++ int chunk; ++ __u16 mask; ++ __u64 bigmask; ++ reiser4_stat_data_base *sd_base; ++ reiser4_inode *state; ++ ++ assert("nikita-625", inode != NULL); ++ assert("nikita-626", sd != NULL); ++ ++ result = 0; ++ sd_base = (reiser4_stat_data_base *) sd; ++ state = reiser4_inode_data(inode); ++ mask = le16_to_cpu(get_unaligned(&sd_base->extmask)); ++ bigmask = mask; ++ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); ++ ++ move_on(&len, &sd, sizeof *sd_base); ++ for (bit = 0, chunk = 0; ++ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION; ++ ++bit, mask >>= 1) { ++ if (((bit + 1) % 16) != 0) { ++ /* handle extension */ ++ sd_ext_plugin *sdplug; ++ ++ if (bit >= LAST_SD_EXTENSION) { ++ warning("vpf-1904", ++ "No such extension %i in inode %llu", ++ bit, ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ ++ sdplug = sd_ext_plugin_by_id(bit); ++ if (sdplug == NULL) { ++ warning("nikita-627", ++ "No such extension %i in inode %llu", ++ bit, ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ if (mask & 1) { ++ assert("nikita-628", sdplug->present); ++ /* alignment is not supported in node layout ++ plugin yet. ++ result = align( inode, &len, &sd, ++ sdplug -> alignment ); ++ if( result != 0 ) ++ return result; */ ++ result = sdplug->present(inode, &sd, &len); ++ } else if (sdplug->absent != NULL) ++ result = sdplug->absent(inode); ++ if (result) ++ break; ++ /* else, we are looking at the last bit in 16-bit ++ portion of bitmask */ ++ } else if (mask & 1) { ++ /* next portion of bitmask */ ++ if (len < (int)sizeof(d16)) { ++ warning("nikita-629", ++ "No space for bitmap in inode %llu", ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ mask = le16_to_cpu(get_unaligned((d16 *)sd)); ++ bigmask <<= 16; ++ bigmask |= mask; ++ move_on(&len, &sd, sizeof(d16)); ++ ++chunk; ++ if (chunk == 3) { ++ if (!(mask & 0x8000)) { ++ /* clear last bit */ ++ mask &= ~0x8000; ++ continue; ++ } ++ /* too much */ ++ warning("nikita-630", ++ "Too many extensions in %llu", ++ (unsigned long long) ++ get_inode_oid(inode)); ++ ++ result = RETERR(-EINVAL); ++ break; ++ } ++ } else ++ /* bitmask exhausted */ ++ break; ++ } ++ state->extmask = bigmask; ++ /* common initialisations */ ++ if (len - (bit / 16 * sizeof(d16)) > 0) { ++ /* alignment in save_len_static_sd() is taken into account ++ -edward */ ++ warning("nikita-631", "unused space in inode %llu", ++ (unsigned long long)get_inode_oid(inode)); ++ } ++ ++ return result; ++} ++ ++/* estimates size of stat-data required to store inode. ++ Installed as ->save_len() method of ++ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). 
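The extension mask walked by init_inode_static_sd() above is a chained bitmap: bits 0-14 of each 16-bit chunk flag individual stat-data extensions, and bit 15 says that another 16-bit chunk follows (the code allows up to three follow-on chunks, matching the 64-bit bigmask). A minimal self-contained decoder for that layout (userspace C, names hypothetical; endian conversion omitted):

#include <stdint.h>
#include <stdio.h>

#define CHAIN_BIT 0x8000u

static void decode_extmask(const uint16_t *chunks, int max_chunks)
{
	for (int c = 0; c < max_chunks; c++) {
		uint16_t mask = chunks[c];	/* assumed CPU-endian */

		for (int bit = 0; bit < 15; bit++)
			if (mask & (1u << bit))
				printf("extension %d present\n",
				       c * 16 + bit);
		if (!(mask & CHAIN_BIT))
			break;		/* no further chunk follows */
	}
}

int main(void)
{
	/* bit 1 (UNIX_STAT-style) set, chained to a second chunk */
	uint16_t sd[] = { 0x0002u | CHAIN_BIT, 0x0001u };

	decode_extmask(sd, 2);
	return 0;
}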
*/ ++/* was sd_len */ ++int save_len_static_sd(struct inode *inode /* object being processed */ ) ++{ ++ unsigned int result; ++ __u64 mask; ++ int bit; ++ ++ assert("nikita-632", inode != NULL); ++ ++ result = sizeof(reiser4_stat_data_base); ++ mask = reiser4_inode_data(inode)->extmask; ++ for (bit = 0; mask != 0; ++bit, mask >>= 1) { ++ if (mask & 1) { ++ sd_ext_plugin *sdplug; ++ ++ sdplug = sd_ext_plugin_by_id(bit); ++ assert("nikita-633", sdplug != NULL); ++ /* no aligment support ++ result += ++ round_up( result, sdplug -> alignment ) - result; */ ++ result += sdplug->save_len(inode); ++ } ++ } ++ result += bit / 16 * sizeof(d16); ++ return result; ++} ++ ++/* saves inode into stat-data. ++ Installed as ->save() method of ++ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */ ++/* was sd_save */ ++int save_static_sd(struct inode *inode /* object being processed */ , ++ char **area /* where to save stat-data */ ) ++{ ++ int result; ++ __u64 emask; ++ int bit; ++ unsigned int len; ++ reiser4_stat_data_base *sd_base; ++ ++ assert("nikita-634", inode != NULL); ++ assert("nikita-635", area != NULL); ++ ++ result = 0; ++ emask = reiser4_inode_data(inode)->extmask; ++ sd_base = (reiser4_stat_data_base *) * area; ++ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask); ++ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/ ++ ++ *area += sizeof *sd_base; ++ len = 0xffffffffu; ++ for (bit = 0; emask != 0; ++bit, emask >>= 1) { ++ if (emask & 1) { ++ if ((bit + 1) % 16 != 0) { ++ sd_ext_plugin *sdplug; ++ sdplug = sd_ext_plugin_by_id(bit); ++ assert("nikita-636", sdplug != NULL); ++ /* no alignment support yet ++ align( inode, &len, area, ++ sdplug -> alignment ); */ ++ result = sdplug->save(inode, area); ++ if (result) ++ break; ++ } else { ++ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), ++ (d16 *)(*area)); ++ /*cputod16((unsigned)(emask & 0xffff), ++ (d16 *) * area);*/ ++ *area += sizeof(d16); ++ } ++ } ++ } ++ return result; ++} ++ ++/* stat-data extension handling functions. */ ++ ++static int present_lw_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ if (*len >= (int)sizeof(reiser4_light_weight_stat)) { ++ reiser4_light_weight_stat *sd_lw; ++ ++ sd_lw = (reiser4_light_weight_stat *) * area; ++ ++ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode)); ++ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink)); ++ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size)); ++ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) { ++ inode->i_mode &= ~S_IFIFO; ++ warning("", "partially converted file is encountered"); ++ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); ++ } ++ move_on(len, area, sizeof *sd_lw); ++ return 0; ++ } else ++ return not_enough_space(inode, "lw sd"); ++} ++ ++static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being ++ * processed */ ) ++{ ++ return sizeof(reiser4_light_weight_stat); ++} ++ ++static int save_lw_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_light_weight_stat *sd; ++ mode_t delta; ++ ++ assert("nikita-2705", inode != NULL); ++ assert("nikita-2706", area != NULL); ++ assert("nikita-2707", *area != NULL); ++ ++ sd = (reiser4_light_weight_stat *) * area; ++ ++ delta = (reiser4_inode_get_flag(inode, ++ REISER4_PART_MIXED) ? 
S_IFIFO : 0); ++ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode); ++ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink); ++ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size); ++ *area += sizeof *sd; ++ return 0; ++} ++ ++static int present_unix_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ assert("nikita-637", inode != NULL); ++ assert("nikita-638", area != NULL); ++ assert("nikita-639", *area != NULL); ++ assert("nikita-640", len != NULL); ++ assert("nikita-641", *len > 0); ++ ++ if (*len >= (int)sizeof(reiser4_unix_stat)) { ++ reiser4_unix_stat *sd; ++ ++ sd = (reiser4_unix_stat *) * area; ++ ++ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid)); ++ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid)); ++ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime)); ++ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime)); ++ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime)); ++ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) ++ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev)); ++ else ++ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes))); ++ move_on(len, area, sizeof *sd); ++ return 0; ++ } else ++ return not_enough_space(inode, "unix sd"); ++} ++ ++static int absent_unix_sd(struct inode *inode /* object being processed */ ) ++{ ++ inode->i_uid = get_super_private(inode->i_sb)->default_uid; ++ inode->i_gid = get_super_private(inode->i_sb)->default_gid; ++ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ inode_set_bytes(inode, inode->i_size); ++ /* mark inode as lightweight, so that caller (lookup_common) will ++ complete initialisation by copying [ug]id from a parent. 
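present_unix_sd() above reads every on-disk field through get_unaligned() plus le32_to_cpu()/le64_to_cpu(), because stat-data fields are little-endian and not naturally aligned. Outside the kernel the same effect is usually had with byte-wise accessors; a sketch (these helpers are illustrative, not kernel API):

#include <stdint.h>

static uint32_t le32_get(const unsigned char *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static void le32_put(unsigned char *p, uint32_t v)
{
	p[0] = v & 0xff;
	p[1] = (v >> 8) & 0xff;
	p[2] = (v >> 16) & 0xff;
	p[3] = (v >> 24) & 0xff;
}

Byte-wise access sidesteps both traps at once: no unaligned loads are issued, and the host byte order never enters the picture.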
*/ ++ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT); ++ return 0; ++} ++ ++/* Audited by: green(2002.06.14) */ ++static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being ++ * processed */ ) ++{ ++ return sizeof(reiser4_unix_stat); ++} ++ ++static int save_unix_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_unix_stat *sd; ++ ++ assert("nikita-642", inode != NULL); ++ assert("nikita-643", area != NULL); ++ assert("nikita-644", *area != NULL); ++ ++ sd = (reiser4_unix_stat *) * area; ++ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid); ++ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid); ++ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime); ++ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) ++ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev); ++ else ++ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes); ++ *area += sizeof *sd; ++ return 0; ++} ++ ++static int ++present_large_times_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ if (*len >= (int)sizeof(reiser4_large_times_stat)) { ++ reiser4_large_times_stat *sd_lt; ++ ++ sd_lt = (reiser4_large_times_stat *) * area; ++ ++ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime)); ++ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime)); ++ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime)); ++ ++ move_on(len, area, sizeof *sd_lt); ++ return 0; ++ } else ++ return not_enough_space(inode, "large times sd"); ++} ++ ++static int ++save_len_large_times_sd(struct inode *inode UNUSED_ARG ++ /* object being processed */ ) ++{ ++ return sizeof(reiser4_large_times_stat); ++} ++ ++static int ++save_large_times_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_large_times_stat *sd; ++ ++ assert("nikita-2817", inode != NULL); ++ assert("nikita-2818", area != NULL); ++ assert("nikita-2819", *area != NULL); ++ ++ sd = (reiser4_large_times_stat *) * area; ++ ++ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime); ++ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime); ++ ++ *area += sizeof *sd; ++ return 0; ++} ++ ++/* symlink stat data extension */ ++ ++/* allocate memory for symlink target and attach it to inode->i_private */ ++static int ++symlink_target_to_inode(struct inode *inode, const char *target, int len) ++{ ++ assert("vs-845", inode->i_private == NULL); ++ assert("vs-846", !reiser4_inode_get_flag(inode, ++ REISER4_GENERIC_PTR_USED)); ++ /* FIXME-VS: this is prone to deadlock. Not more than other similar ++ places, though */ ++ inode->i_private = kmalloc((size_t) len + 1, ++ reiser4_ctx_gfp_mask_get()); ++ if (!inode->i_private) ++ return RETERR(-ENOMEM); ++ ++ memcpy((char *)(inode->i_private), target, (size_t) len); ++ ((char *)(inode->i_private))[len] = 0; ++ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED); ++ return 0; ++} ++ ++/* this is called on read_inode. 
There is nothing to do actually, but some ++ sanity checks */ ++static int present_symlink_sd(struct inode *inode, char **area, int *len) ++{ ++ int result; ++ int length; ++ reiser4_symlink_stat *sd; ++ ++ length = (int)inode->i_size; ++ /* ++ * *len is number of bytes in stat data item from *area to the end of ++ * item. It must be not less than size of symlink + 1 for ending 0 ++ */ ++ if (length > *len) ++ return not_enough_space(inode, "symlink"); ++ ++ if (*(*area + length) != 0) { ++ warning("vs-840", "Symlink is not zero terminated"); ++ return RETERR(-EIO); ++ } ++ ++ sd = (reiser4_symlink_stat *) * area; ++ result = symlink_target_to_inode(inode, sd->body, length); ++ ++ move_on(len, area, length + 1); ++ return result; ++} ++ ++static int save_len_symlink_sd(struct inode *inode) ++{ ++ return inode->i_size + 1; ++} ++ ++/* this is called on create and update stat data. Do nothing on update but ++ update @area */ ++static int save_symlink_sd(struct inode *inode, char **area) ++{ ++ int result; ++ int length; ++ reiser4_symlink_stat *sd; ++ ++ length = (int)inode->i_size; ++ /* inode->i_size must be set already */ ++ assert("vs-841", length); ++ ++ result = 0; ++ sd = (reiser4_symlink_stat *) * area; ++ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) { ++ const char *target; ++ ++ target = (const char *)(inode->i_private); ++ inode->i_private = NULL; ++ ++ result = symlink_target_to_inode(inode, target, length); ++ ++ /* copy symlink to stat data */ ++ memcpy(sd->body, target, (size_t) length); ++ (*area)[length] = 0; ++ } else { ++ /* there is nothing to do in update but move area */ ++ assert("vs-844", ++ !memcmp(inode->i_private, sd->body, ++ (size_t) length + 1)); ++ } ++ ++ *area += (length + 1); ++ return result; ++} ++ ++static int present_flags_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */ ) ++{ ++ assert("nikita-645", inode != NULL); ++ assert("nikita-646", area != NULL); ++ assert("nikita-647", *area != NULL); ++ assert("nikita-648", len != NULL); ++ assert("nikita-649", *len > 0); ++ ++ if (*len >= (int)sizeof(reiser4_flags_stat)) { ++ reiser4_flags_stat *sd; ++ ++ sd = (reiser4_flags_stat *) * area; ++ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags)); ++ move_on(len, area, sizeof *sd); ++ return 0; ++ } else ++ return not_enough_space(inode, "generation and attrs"); ++} ++ ++/* Audited by: green(2002.06.14) */ ++static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being ++ * processed */ ) ++{ ++ return sizeof(reiser4_flags_stat); ++} ++ ++static int save_flags_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ ) ++{ ++ reiser4_flags_stat *sd; ++ ++ assert("nikita-650", inode != NULL); ++ assert("nikita-651", area != NULL); ++ assert("nikita-652", *area != NULL); ++ ++ sd = (reiser4_flags_stat *) * area; ++ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags); ++ *area += sizeof *sd; ++ return 0; ++} ++ ++static int absent_plugin_sd(struct inode *inode); ++static int present_plugin_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */ , ++ int *len /* remaining length */, ++ int is_pset /* 1 if plugin set, 0 if heir set. 
*/) ++{ ++ reiser4_plugin_stat *sd; ++ reiser4_plugin *plugin; ++ reiser4_inode *info; ++ int i; ++ __u16 mask; ++ int result; ++ int num_of_plugins; ++ ++ assert("nikita-653", inode != NULL); ++ assert("nikita-654", area != NULL); ++ assert("nikita-655", *area != NULL); ++ assert("nikita-656", len != NULL); ++ assert("nikita-657", *len > 0); ++ ++ if (*len < (int)sizeof(reiser4_plugin_stat)) ++ return not_enough_space(inode, "plugin"); ++ ++ sd = (reiser4_plugin_stat *) * area; ++ info = reiser4_inode_data(inode); ++ ++ mask = 0; ++ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no)); ++ move_on(len, area, sizeof *sd); ++ result = 0; ++ for (i = 0; i < num_of_plugins; ++i) { ++ reiser4_plugin_slot *slot; ++ reiser4_plugin_type type; ++ pset_member memb; ++ ++ slot = (reiser4_plugin_slot *) * area; ++ if (*len < (int)sizeof *slot) ++ return not_enough_space(inode, "additional plugin"); ++ ++ memb = le16_to_cpu(get_unaligned(&slot->pset_memb)); ++ type = aset_member_to_type_unsafe(memb); ++ ++ if (type == REISER4_PLUGIN_TYPES) { ++ warning("nikita-3502", ++ "wrong %s member (%i) for %llu", is_pset ? ++ "pset" : "hset", memb, ++ (unsigned long long)get_inode_oid(inode)); ++ return RETERR(-EINVAL); ++ } ++ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode), ++ type, &slot->id); ++ if (plugin == NULL) ++ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode); ++ ++ /* plugin is loaded into inode, mark this into inode's ++ bitmask of loaded non-standard plugins */ ++ if (!(mask & (1 << memb))) { ++ mask |= (1 << memb); ++ } else { ++ warning("nikita-658", "duplicate plugin for %llu", ++ (unsigned long long)get_inode_oid(inode)); ++ return RETERR(-EINVAL); ++ } ++ move_on(len, area, sizeof *slot); ++ /* load plugin data, if any */ ++ if (plugin->h.pops != NULL && plugin->h.pops->load) ++ result = plugin->h.pops->load(inode, plugin, area, len); ++ else ++ result = aset_set_unsafe(is_pset ? &info->pset : ++ &info->hset, memb, plugin); ++ if (result) ++ return result; ++ } ++ if (is_pset) { ++ /* if object plugin wasn't loaded from stat-data, guess it by ++ mode bits */ ++ plugin = file_plugin_to_plugin(inode_file_plugin(inode)); ++ if (plugin == NULL) ++ result = absent_plugin_sd(inode); ++ info->plugin_mask = mask; ++ } else ++ info->heir_mask = mask; ++ ++ return result; ++} ++ ++static int present_pset_sd(struct inode *inode, char **area, int *len) { ++ return present_plugin_sd(inode, area, len, 1 /* pset */); ++} ++ ++/* Determine object plugin for @inode based on i_mode. ++ ++ Many objects in reiser4 file system are controlled by standard object ++ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on. ++ ++ For such files we don't explicitly store plugin id in object stat ++ data. Rather required plugin is guessed from mode bits, where file "type" ++ is encoded (see stat(2)). 
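present_plugin_sd() above walks a count-prefixed sequence of (pset member, plugin id) slots, checking the remaining length before each slot and rejecting duplicate members via a bitmask. A compact model of that validation loop (userspace C, hypothetical names; the original additionally lets each plugin consume private state after its slot, which this sketch omits):

#include <errno.h>
#include <stdint.h>

struct slot { uint16_t memb; uint16_t id; };

#define MEMB_LAST 16	/* assumed bound, like PSET_LAST */

static int parse_slots(const struct slot *s, int count, int bytes_left)
{
	uint16_t seen = 0;

	for (int i = 0; i < count; i++) {
		if (bytes_left < (int)sizeof(*s))
			return -EINVAL;	/* truncated stat-data */
		if (s[i].memb >= MEMB_LAST)
			return -EINVAL;	/* unknown pset member */
		if (seen & (1u << s[i].memb))
			return -EINVAL;	/* duplicate plugin slot */
		seen |= 1u << s[i].memb;
		bytes_left -= sizeof(*s);
	}
	return 0;
}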
++*/ ++static int ++guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ ) ++{ ++ int fplug_id; ++ int dplug_id; ++ reiser4_inode *info; ++ ++ assert("nikita-736", inode != NULL); ++ ++ dplug_id = fplug_id = -1; ++ ++ switch (inode->i_mode & S_IFMT) { ++ case S_IFSOCK: ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ fplug_id = SPECIAL_FILE_PLUGIN_ID; ++ break; ++ case S_IFLNK: ++ fplug_id = SYMLINK_FILE_PLUGIN_ID; ++ break; ++ case S_IFDIR: ++ fplug_id = DIRECTORY_FILE_PLUGIN_ID; ++ dplug_id = HASHED_DIR_PLUGIN_ID; ++ break; ++ default: ++ warning("nikita-737", "wrong file mode: %o", inode->i_mode); ++ return RETERR(-EIO); ++ case S_IFREG: ++ fplug_id = UNIX_FILE_PLUGIN_ID; ++ break; ++ } ++ info = reiser4_inode_data(inode); ++ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ? ++ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL); ++ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ? ++ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL); ++ return 0; ++} ++ ++/* Audited by: green(2002.06.14) */ ++static int absent_plugin_sd(struct inode *inode /* object being processed */ ) ++{ ++ int result; ++ ++ assert("nikita-659", inode != NULL); ++ ++ result = guess_plugin_by_mode(inode); ++ /* if mode was wrong, guess_plugin_by_mode() returns "regular file", ++ but setup_inode_ops() will call make_bad_inode(). ++ Another, more logical but bit more complex solution is to add ++ "bad-file plugin". */ ++ /* FIXME-VS: activate was called here */ ++ return result; ++} ++ ++/* helper function for plugin_sd_save_len(): calculate how much space ++ required to save state of given plugin */ ++/* Audited by: green(2002.06.14) */ ++static int len_for(reiser4_plugin * plugin /* plugin to save */ , ++ struct inode *inode /* object being processed */ , ++ pset_member memb, ++ int len, int is_pset) ++{ ++ reiser4_inode *info; ++ assert("nikita-661", inode != NULL); ++ ++ if (plugin == NULL) ++ return len; ++ ++ info = reiser4_inode_data(inode); ++ if (is_pset ? ++ info->plugin_mask & (1 << memb) : ++ info->heir_mask & (1 << memb)) { ++ len += sizeof(reiser4_plugin_slot); ++ if (plugin->h.pops && plugin->h.pops->save_len != NULL) { ++ /* non-standard plugin, call method */ ++ /* commented as it is incompatible with alignment ++ * policy in save_plug() -edward */ ++ /* len = round_up(len, plugin->h.pops->alignment); */ ++ len += plugin->h.pops->save_len(inode, plugin); ++ } ++ } ++ return len; ++} ++ ++/* calculate how much space is required to save state of all plugins, ++ associated with inode */ ++static int save_len_plugin_sd(struct inode *inode /* object being processed */, ++ int is_pset) ++{ ++ int len; ++ int last; ++ reiser4_inode *state; ++ pset_member memb; ++ ++ assert("nikita-663", inode != NULL); ++ ++ state = reiser4_inode_data(inode); ++ ++ /* common case: no non-standard plugins */ ++ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) ++ return 0; ++ len = sizeof(reiser4_plugin_stat); ++ last = PSET_LAST; ++ ++ for (memb = 0; memb < last; ++memb) { ++ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb), ++ inode, memb, len, is_pset); ++ } ++ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat)); ++ return len; ++} ++ ++static int save_len_pset_sd(struct inode *inode) { ++ return save_len_plugin_sd(inode, 1 /* pset */); ++} ++ ++/* helper function for plugin_sd_save(): save plugin, associated with ++ inode. 
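guess_plugin_by_mode() above boils down to a switch on the S_IFMT bits of i_mode; note that the patch places its default: label before case S_IFREG:, which is legal C and changes nothing about dispatch. The same mapping as a standalone sketch (the *_ID values are illustrative):

#include <sys/stat.h>

enum { SPECIAL_ID, SYMLINK_ID, DIR_ID, REG_ID, BAD_ID };

static int plugin_for_mode(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		return SPECIAL_ID;	/* device/fifo/socket files */
	case S_IFLNK:
		return SYMLINK_ID;
	case S_IFDIR:
		return DIR_ID;		/* a directory plugin is set too */
	case S_IFREG:
		return REG_ID;
	default:
		return BAD_ID;		/* corrupt mode: caller gets -EIO */
	}
}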
*/ ++static int save_plug(reiser4_plugin * plugin /* plugin to save */ , ++ struct inode *inode /* object being processed */ , ++ int memb /* what element of pset is saved */ , ++ char **area /* position in stat-data */ , ++ int *count /* incremented if plugin were actually saved. */, ++ int is_pset /* 1 for plugin set, 0 for heir set */) ++{ ++ reiser4_plugin_slot *slot; ++ int fake_len; ++ int result; ++ ++ assert("nikita-665", inode != NULL); ++ assert("nikita-666", area != NULL); ++ assert("nikita-667", *area != NULL); ++ ++ if (plugin == NULL) ++ return 0; ++ ++ if (is_pset ? ++ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) : ++ !(reiser4_inode_data(inode)->heir_mask & (1 << memb))) ++ return 0; ++ slot = (reiser4_plugin_slot *) * area; ++ put_unaligned(cpu_to_le16(memb), &slot->pset_memb); ++ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id); ++ fake_len = (int)0xffff; ++ move_on(&fake_len, area, sizeof *slot); ++ ++*count; ++ result = 0; ++ if (plugin->h.pops != NULL) { ++ if (plugin->h.pops->save != NULL) ++ result = plugin->h.pops->save(inode, plugin, area); ++ } ++ return result; ++} ++ ++/* save state of all non-standard plugins associated with inode */ ++static int save_plugin_sd(struct inode *inode /* object being processed */ , ++ char **area /* position in stat-data */, ++ int is_pset /* 1 for pset, 0 for hset */) ++{ ++ int fake_len; ++ int result = 0; ++ int num_of_plugins; ++ reiser4_plugin_stat *sd; ++ reiser4_inode *state; ++ pset_member memb; ++ ++ assert("nikita-669", inode != NULL); ++ assert("nikita-670", area != NULL); ++ assert("nikita-671", *area != NULL); ++ ++ state = reiser4_inode_data(inode); ++ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) ++ return 0; ++ sd = (reiser4_plugin_stat *) * area; ++ fake_len = (int)0xffff; ++ move_on(&fake_len, area, sizeof *sd); ++ ++ num_of_plugins = 0; ++ for (memb = 0; memb < PSET_LAST; ++memb) { ++ result = save_plug(aset_get(is_pset ? state->pset : state->hset, ++ memb), ++ inode, memb, area, &num_of_plugins, is_pset); ++ if (result != 0) ++ break; ++ } ++ ++ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no); ++ return result; ++} ++ ++static int save_pset_sd(struct inode *inode, char **area) { ++ return save_plugin_sd(inode, area, 1 /* pset */); ++} ++ ++static int present_hset_sd(struct inode *inode, char **area, int *len) { ++ return present_plugin_sd(inode, area, len, 0 /* hset */); ++} ++ ++static int save_len_hset_sd(struct inode *inode) { ++ return save_len_plugin_sd(inode, 0 /* pset */); ++} ++ ++static int save_hset_sd(struct inode *inode, char **area) { ++ return save_plugin_sd(inode, area, 0 /* hset */); ++} ++ ++/* helper function for crypto_sd_present(), crypto_sd_save. 
++ Extract crypto info from stat-data and attach it to inode */ ++static int extract_crypto_info (struct inode * inode, ++ reiser4_crypto_stat * sd) ++{ ++ struct reiser4_crypto_info * info; ++ assert("edward-11", !inode_crypto_info(inode)); ++ assert("edward-1413", ++ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)); ++ /* create and attach a crypto-stat without secret key loaded */ ++ info = reiser4_alloc_crypto_info(inode); ++ if (IS_ERR(info)) ++ return PTR_ERR(info); ++ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize)); ++ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize); ++ reiser4_attach_crypto_info(inode, info); ++ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); ++ return 0; ++} ++ ++/* crypto stat-data extension */ ++ ++static int present_crypto_sd(struct inode *inode, char **area, int *len) ++{ ++ int result; ++ reiser4_crypto_stat *sd; ++ digest_plugin *dplug = inode_digest_plugin(inode); ++ ++ assert("edward-06", dplug != NULL); ++ assert("edward-684", dplug->fipsize); ++ assert("edward-07", area != NULL); ++ assert("edward-08", *area != NULL); ++ assert("edward-09", len != NULL); ++ assert("edward-10", *len > 0); ++ ++ if (*len < (int)sizeof(reiser4_crypto_stat)) { ++ return not_enough_space(inode, "crypto-sd"); ++ } ++ /* *len is number of bytes in stat data item from *area to the end of ++ item. It must be not less than size of this extension */ ++ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len); ++ ++ sd = (reiser4_crypto_stat *) * area; ++ result = extract_crypto_info(inode, sd); ++ move_on(len, area, sizeof(*sd) + dplug->fipsize); ++ ++ return result; ++} ++ ++static int save_len_crypto_sd(struct inode *inode) ++{ ++ return sizeof(reiser4_crypto_stat) + ++ inode_digest_plugin(inode)->fipsize; ++} ++ ++static int save_crypto_sd(struct inode *inode, char **area) ++{ ++ int result = 0; ++ reiser4_crypto_stat *sd; ++ struct reiser4_crypto_info * info = inode_crypto_info(inode); ++ digest_plugin *dplug = inode_digest_plugin(inode); ++ ++ assert("edward-12", dplug != NULL); ++ assert("edward-13", area != NULL); ++ assert("edward-14", *area != NULL); ++ assert("edward-15", info != NULL); ++ assert("edward-1414", info->keyid != NULL); ++ assert("edward-1415", info->keysize != 0); ++ assert("edward-76", reiser4_inode_data(inode) != NULL); ++ ++ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) { ++ /* file is just created */ ++ sd = (reiser4_crypto_stat *) *area; ++ /* copy everything but private key to the disk stat-data */ ++ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize); ++ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize); ++ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); ++ } ++ *area += (sizeof(*sd) + dplug->fipsize); ++ return result; ++} ++ ++static int eio(struct inode *inode, char **area, int *len) ++{ ++ return RETERR(-EIO); ++} ++ ++sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { ++ [LIGHT_WEIGHT_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = LIGHT_WEIGHT_STAT, ++ .pops = NULL, ++ .label = "light-weight sd", ++ .desc = "sd for light-weight files", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_lw_sd, ++ .absent = NULL, ++ .save_len = save_len_lw_sd, ++ .save = save_lw_sd, ++ .alignment = 8 ++ }, ++ [UNIX_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = UNIX_STAT, ++ .pops = NULL, ++ .label = "unix-sd", ++ .desc = "unix stat-data fields", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_unix_sd, ++ .absent = 
absent_unix_sd, ++ .save_len = save_len_unix_sd, ++ .save = save_unix_sd, ++ .alignment = 8 ++ }, ++ [LARGE_TIMES_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = LARGE_TIMES_STAT, ++ .pops = NULL, ++ .label = "64time-sd", ++ .desc = "nanosecond resolution for times", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_large_times_sd, ++ .absent = NULL, ++ .save_len = save_len_large_times_sd, ++ .save = save_large_times_sd, ++ .alignment = 8 ++ }, ++ [SYMLINK_STAT] = { ++ /* stat data of symlink has this extension */ ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = SYMLINK_STAT, ++ .pops = NULL, ++ .label = "symlink-sd", ++ .desc = ++ "stat data is appended with symlink name", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_symlink_sd, ++ .absent = NULL, ++ .save_len = save_len_symlink_sd, ++ .save = save_symlink_sd, ++ .alignment = 8 ++ }, ++ [PLUGIN_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = PLUGIN_STAT, ++ .pops = NULL, ++ .label = "plugin-sd", ++ .desc = "plugin stat-data fields", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_pset_sd, ++ .absent = absent_plugin_sd, ++ .save_len = save_len_pset_sd, ++ .save = save_pset_sd, ++ .alignment = 8 ++ }, ++ [HEIR_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = HEIR_STAT, ++ .pops = NULL, ++ .label = "heir-plugin-sd", ++ .desc = "heir plugin stat-data fields", ++ .linkage = {NULL,NULL} ++ }, ++ .present = present_hset_sd, ++ .absent = NULL, ++ .save_len = save_len_hset_sd, ++ .save = save_hset_sd, ++ .alignment = 8 ++ }, ++ [FLAGS_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = FLAGS_STAT, ++ .pops = NULL, ++ .label = "flags-sd", ++ .desc = "inode bit flags", ++ .linkage = {NULL, NULL} ++ }, ++ .present = present_flags_sd, ++ .absent = NULL, ++ .save_len = save_len_flags_sd, ++ .save = save_flags_sd, ++ .alignment = 8 ++ }, ++ [CAPABILITIES_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = CAPABILITIES_STAT, ++ .pops = NULL, ++ .label = "capabilities-sd", ++ .desc = "capabilities", ++ .linkage = {NULL, NULL} ++ }, ++ .present = eio, ++ .absent = NULL, ++ .save_len = save_len_flags_sd, ++ .save = save_flags_sd, ++ .alignment = 8 ++ }, ++ [CRYPTO_STAT] = { ++ .h = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .id = CRYPTO_STAT, ++ .pops = NULL, ++ .label = "crypto-sd", ++ .desc = "secret key size and id", ++ .linkage = {NULL, NULL} ++ }, ++ .present = present_crypto_sd, ++ .absent = NULL, ++ .save_len = save_len_crypto_sd, ++ .save = save_crypto_sd, ++ .alignment = 8 ++ } ++}; ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.33/fs/reiser4/plugin/item/static_stat.h +--- linux-2.6.33.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/static_stat.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,224 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* This describes the static_stat item, used to hold all information needed by the stat() syscall. ++ ++In the case where each file has not less than the fields needed by the ++stat() syscall, it is more compact to store those fields in this ++struct. ++ ++If this item does not exist, then all stats are dynamically resolved. 
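The sd_ext_plugins[] array that closes above is a designated-initializer dispatch table: the sd_ext_bits value doubles as the array index, so looking a plugin up by id is a bounds-checked array access, and a NULL method simply means "nothing to do for this extension". A stripped-down sketch of the same shape (types and names are illustrative):

struct ext_ops {
	const char *label;
	int (*present)(void *inode, char **area, int *len);
	int (*save)(void *inode, char **area);
};

enum { LW_EXT, UNIX_EXT, LAST_EXT };

static int lw_present(void *inode, char **area, int *len) { return 0; }
static int lw_save(void *inode, char **area) { return 0; }

static const struct ext_ops ext_table[LAST_EXT] = {
	[LW_EXT] = {
		.label   = "light-weight",
		.present = lw_present,
		.save    = lw_save,
	},
	/* [UNIX_EXT] left zeroed: all methods NULL, i.e. no-ops */
};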
++At the moment, we either resolve all stats dynamically or all of them ++statically. If you think this is not fully optimal, and the rest of ++reiser4 is working, then fix it...:-) ++ ++*/ ++ ++#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ) ++#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++ ++#include <linux/fs.h> /* for struct inode */ ++ ++/* Stat data layout: goals and implementation. ++ ++ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to ++ them, including not having semantic metadata attached to them. ++ ++ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you ++ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically ++ sized structure because the statically sized structure knows without recording it what the names and lengths of the ++ attributes are. ++ ++ This leads to a natural compromise, which is to special case those files which have simply the standard unix file ++ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix ++ file in their use of file attributes. ++ ++ Yet this compromise deserves to be compromised a little. ++ ++ We accommodate the case where you have no more than the standard unix file attributes by using an "extension ++ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum). ++ ++ If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited ++ from parent directory (as uid, gid) or initialised to some sane values. ++ ++ To capitalize on existing code infrastructure, extensions are ++ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE. ++ Each stat-data extension plugin implements four methods: ++ ++ ->present() called by sd_load() when this extension is found in stat-data ++ ->absent() called by sd_load() when this extension is not found in stat-data ++ ->save_len() called by sd_len() to calculate total length of stat-data ++ ->save() called by sd_save() to store extension data into stat-data ++ ++ Implementation is in fs/reiser4/plugin/item/static_stat.c ++*/ ++ ++/* stat-data extension. Please order this by presumed frequency of use */ ++typedef enum { ++ /* support for light-weight files */ ++ LIGHT_WEIGHT_STAT, ++ /* data required to implement unix stat(2) call. Layout is in ++ reiser4_unix_stat. If this is not present, file is light-weight */ ++ UNIX_STAT, ++ /* this contains additional set of 32bit [anc]time fields to implement ++ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage ++ if this extension is governed by 32bittimes mount option. */ ++ LARGE_TIMES_STAT, ++ /* stat data has link name included */ ++ SYMLINK_STAT, ++ /* on-disk slots of non-standard plugins for main plugin table ++ (@reiser4_inode->pset), that is, plugins that cannot be deduced ++ from file mode bits), for example, aggregation, interpolation etc. */ ++ PLUGIN_STAT, ++ /* this extension contains persistent inode flags. These flags are ++ single bits: immutable, append, only, etc. Layout is in ++ reiser4_flags_stat. */ ++ FLAGS_STAT, ++ /* this extension contains capabilities sets, associated with this ++ file. 
Layout is in reiser4_capabilities_stat */ ++ CAPABILITIES_STAT, ++ /* this extension contains size and public id of the secret key. ++ Layout is in reiser4_crypto_stat */ ++ CRYPTO_STAT, ++ /* on-disk slots of non-default plugins for inheritance, which ++ are extracted to special plugin table (@reiser4_inode->hset). ++ By default, children of the object will inherit plugins from ++ its main plugin table (pset). */ ++ HEIR_STAT, ++ LAST_SD_EXTENSION, ++ /* ++ * init_inode_static_sd() iterates over extension mask until all ++ * non-zero bits are processed. This means, that neither ->present(), ++ * nor ->absent() methods will be called for stat-data extensions that ++ * go after last present extension. But some basic extensions, we want ++ * either ->absent() or ->present() method to be called, because these ++ * extensions set up something in inode even when they are not ++ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all ++ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either ++ * ->present(), or ->absent() method will be called, independently of ++ * what other extensions are present. ++ */ ++ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT ++} sd_ext_bits; ++ ++/* minimal stat-data. This allows to support light-weight files. */ ++typedef struct reiser4_stat_data_base { ++ /* 0 */ __le16 extmask; ++ /* 2 */ ++} PACKED reiser4_stat_data_base; ++ ++typedef struct reiser4_light_weight_stat { ++ /* 0 */ __le16 mode; ++ /* 2 */ __le32 nlink; ++ /* 6 */ __le64 size; ++ /* size in bytes */ ++ /* 14 */ ++} PACKED reiser4_light_weight_stat; ++ ++typedef struct reiser4_unix_stat { ++ /* owner id */ ++ /* 0 */ __le32 uid; ++ /* group id */ ++ /* 4 */ __le32 gid; ++ /* access time */ ++ /* 8 */ __le32 atime; ++ /* modification time */ ++ /* 12 */ __le32 mtime; ++ /* change time */ ++ /* 16 */ __le32 ctime; ++ union { ++ /* minor:major for device files */ ++ /* 20 */ __le64 rdev; ++ /* bytes used by file */ ++ /* 20 */ __le64 bytes; ++ } u; ++ /* 28 */ ++} PACKED reiser4_unix_stat; ++ ++/* symlink stored as part of inode */ ++typedef struct reiser4_symlink_stat { ++ char body[0]; ++} PACKED reiser4_symlink_stat; ++ ++typedef struct reiser4_plugin_slot { ++ /* 0 */ __le16 pset_memb; ++ /* 2 */ __le16 id; ++ /* 4 *//* here plugin stores its persistent state */ ++} PACKED reiser4_plugin_slot; ++ ++/* stat-data extension for files with non-standard plugin. */ ++typedef struct reiser4_plugin_stat { ++ /* number of additional plugins, associated with this object */ ++ /* 0 */ __le16 plugins_no; ++ /* 2 */ reiser4_plugin_slot slot[0]; ++ /* 2 */ ++} PACKED reiser4_plugin_stat; ++ ++/* stat-data extension for inode flags. Currently it is just fixed-width 32 ++ * bit mask. If need arise, this can be replaced with variable width ++ * bitmask. 
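The PACKED on-disk structs above annotate every field with its byte offset in a comment (/* 0 */, /* 2 */, ...). Such comments rot silently; C11 lets them be enforced at compile time, as in this sketch mirroring reiser4_light_weight_stat (the kernel itself would reach for BUILD_BUG_ON rather than static_assert):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct lw_stat {
	uint16_t mode;		/* offset 0 */
	uint32_t nlink;		/* offset 2 */
	uint64_t size;		/* offset 6 */
} __attribute__((packed));

static_assert(offsetof(struct lw_stat, nlink) == 2, "nlink moved");
static_assert(offsetof(struct lw_stat, size) == 6, "size moved");
static_assert(sizeof(struct lw_stat) == 14, "on-disk record is 14 bytes");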
*/ ++typedef struct reiser4_flags_stat { ++ /* 0 */ __le32 flags; ++ /* 4 */ ++} PACKED reiser4_flags_stat; ++ ++typedef struct reiser4_capabilities_stat { ++ /* 0 */ __le32 effective; ++ /* 8 */ __le32 permitted; ++ /* 16 */ ++} PACKED reiser4_capabilities_stat; ++ ++typedef struct reiser4_cluster_stat { ++/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */ ++ /* 0 */ d8 cluster_shift; ++ /* 1 */ ++} PACKED reiser4_cluster_stat; ++ ++typedef struct reiser4_crypto_stat { ++ /* secret key size, bits */ ++ /* 0 */ d16 keysize; ++ /* secret key id */ ++ /* 2 */ d8 keyid[0]; ++ /* 2 */ ++} PACKED reiser4_crypto_stat; ++ ++typedef struct reiser4_large_times_stat { ++ /* access time */ ++ /* 0 */ d32 atime; ++ /* modification time */ ++ /* 4 */ d32 mtime; ++ /* change time */ ++ /* 8 */ d32 ctime; ++ /* 12 */ ++} PACKED reiser4_large_times_stat; ++ ++/* this structure is filled by sd_item_stat */ ++typedef struct sd_stat { ++ int dirs; ++ int files; ++ int others; ++} sd_stat; ++ ++/* plugin->item.common.* */ ++extern void print_sd(const char *prefix, coord_t * coord); ++extern void item_stat_static_sd(const coord_t * coord, void *vp); ++ ++/* plugin->item.s.sd.* */ ++extern int init_inode_static_sd(struct inode *inode, char *sd, int len); ++extern int save_len_static_sd(struct inode *inode); ++extern int save_static_sd(struct inode *inode, char **area); ++ ++/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/tail.c linux-2.6.33/fs/reiser4/plugin/item/tail.c +--- linux-2.6.33.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/item/tail.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,807 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "item.h" ++#include "../../inode.h" ++#include "../../page_cache.h" ++#include "../../carry.h" ++#include "../../vfs_ops.h" ++ ++#include <linux/quotaops.h> ++#include <asm/uaccess.h> ++#include <linux/swap.h> ++#include <linux/writeback.h> ++ ++/* plugin->u.item.b.max_key_inside */ ++reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key) ++{ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, get_key_offset(reiser4_max_key())); ++ return key; ++} ++ ++/* plugin->u.item.b.can_contain_key */ ++int can_contain_key_tail(const coord_t *coord, const reiser4_key *key, ++ const reiser4_item_data *data) ++{ ++ reiser4_key item_key; ++ ++ if (item_plugin_by_coord(coord) != data->iplug) ++ return 0; ++ ++ item_key_by_coord(coord, &item_key); ++ if (get_key_locality(key) != get_key_locality(&item_key) || ++ get_key_objectid(key) != get_key_objectid(&item_key)) ++ return 0; ++ ++ return 1; ++} ++ ++/* plugin->u.item.b.mergeable ++ first item is of tail type */ ++/* Audited by: green(2002.06.14) */ ++int mergeable_tail(const coord_t *p1, const coord_t *p2) ++{ ++ reiser4_key key1, key2; ++ ++ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1), ++ UNIX_FILE_METADATA_ITEM_TYPE)); ++ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID); ++ ++ if (item_id_by_coord(p2) != FORMATTING_ID) { ++ /* second item is of another type */ ++ return 0; ++ } ++ ++ item_key_by_coord(p1, &key1); ++ item_key_by_coord(p2, &key2); ++ if (get_key_locality(&key1) != get_key_locality(&key2) 
|| ++ get_key_objectid(&key1) != get_key_objectid(&key2) ++ || get_key_type(&key1) != get_key_type(&key2)) { ++ /* items of different objects */ ++ return 0; ++ } ++ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) { ++ /* not adjacent items */ ++ return 0; ++ } ++ return 1; ++} ++ ++/* plugin->u.item.b.print ++ plugin->u.item.b.check */ ++ ++/* plugin->u.item.b.nr_units */ ++pos_in_node_t nr_units_tail(const coord_t * coord) ++{ ++ return item_length_by_coord(coord); ++} ++ ++/* plugin->u.item.b.lookup */ ++lookup_result ++lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord) ++{ ++ reiser4_key item_key; ++ __u64 lookuped, offset; ++ unsigned nr_units; ++ ++ item_key_by_coord(coord, &item_key); ++ offset = get_key_offset(item_key_by_coord(coord, &item_key)); ++ nr_units = nr_units_tail(coord); ++ ++ /* key we are looking for must be greater than key of item @coord */ ++ assert("vs-416", keygt(key, &item_key)); ++ ++ /* offset we are looking for */ ++ lookuped = get_key_offset(key); ++ ++ if (lookuped >= offset && lookuped < offset + nr_units) { ++ /* byte we are looking for is in this item */ ++ coord->unit_pos = lookuped - offset; ++ coord->between = AT_UNIT; ++ return CBK_COORD_FOUND; ++ } ++ ++ /* set coord after last unit */ ++ coord->unit_pos = nr_units - 1; ++ coord->between = AFTER_UNIT; ++ return bias == ++ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND; ++} ++ ++/* plugin->u.item.b.paste */ ++int ++paste_tail(coord_t *coord, reiser4_item_data *data, ++ carry_plugin_info *info UNUSED_ARG) ++{ ++ unsigned old_item_length; ++ char *item; ++ ++ /* length the item had before resizing has been performed */ ++ old_item_length = item_length_by_coord(coord) - data->length; ++ ++ /* tail items never get pasted in the middle */ ++ assert("vs-363", ++ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) || ++ (coord->unit_pos == old_item_length - 1 && ++ coord->between == AFTER_UNIT) || ++ (coord->unit_pos == 0 && old_item_length == 0 ++ && coord->between == AT_UNIT)); ++ ++ item = item_body_by_coord(coord); ++ if (coord->unit_pos == 0) ++ /* make space for pasted data when pasting at the beginning of ++ the item */ ++ memmove(item + data->length, item, old_item_length); ++ ++ if (coord->between == AFTER_UNIT) ++ coord->unit_pos++; ++ ++ if (data->data) { ++ assert("vs-554", data->user == 0 || data->user == 1); ++ if (data->user) { ++ assert("nikita-3035", reiser4_schedulable()); ++ /* copy from user space */ ++ if (__copy_from_user(item + coord->unit_pos, ++ (const char __user *)data->data, ++ (unsigned)data->length)) ++ return RETERR(-EFAULT); ++ } else ++ /* copy from kernel space */ ++ memcpy(item + coord->unit_pos, data->data, ++ (unsigned)data->length); ++ } else { ++ memset(item + coord->unit_pos, 0, (unsigned)data->length); ++ } ++ return 0; ++} ++ ++/* plugin->u.item.b.fast_paste */ ++ ++/* plugin->u.item.b.can_shift ++ number of units is returned via return value, number of bytes via @size. 
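lookup_tail() above is plain interval arithmetic: a tail item covers bytes [offset, offset + nr_units) of the file, and the sought key either falls inside that range or the coord is parked after the last unit. A userspace reduction (hypothetical names; the lookup_bias handling of the original is dropped):

#include <stdint.h>

struct pos { uint64_t unit; int after_last; };

/* returns 1 when @want lies inside the item, 0 otherwise */
static int tail_lookup(uint64_t item_off, uint64_t nr_units,
		       uint64_t want, struct pos *out)
{
	if (want >= item_off && want < item_off + nr_units) {
		out->unit = want - item_off;	/* byte within the item */
		out->after_last = 0;
		return 1;
	}
	out->unit = nr_units - 1;	/* park after the last byte */
	out->after_last = 1;
	return 0;
}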
For ++ tail items they coincide */ ++int ++can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG, ++ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG, ++ unsigned *size, unsigned want) ++{ ++ /* make sure that that we do not want to shift more than we have */ ++ assert("vs-364", want > 0 ++ && want <= (unsigned)item_length_by_coord(source)); ++ ++ *size = min(want, free_space); ++ return *size; ++} ++ ++/* plugin->u.item.b.copy_units */ ++void ++copy_units_tail(coord_t * target, coord_t * source, ++ unsigned from, unsigned count, ++ shift_direction where_is_free_space, ++ unsigned free_space UNUSED_ARG) ++{ ++ /* make sure that item @target is expanded already */ ++ assert("vs-366", (unsigned)item_length_by_coord(target) >= count); ++ assert("vs-370", free_space >= count); ++ ++ if (where_is_free_space == SHIFT_LEFT) { ++ /* append item @target with @count first bytes of @source */ ++ assert("vs-365", from == 0); ++ ++ memcpy((char *)item_body_by_coord(target) + ++ item_length_by_coord(target) - count, ++ (char *)item_body_by_coord(source), count); ++ } else { ++ /* target item is moved to right already */ ++ reiser4_key key; ++ ++ assert("vs-367", ++ (unsigned)item_length_by_coord(source) == from + count); ++ ++ memcpy((char *)item_body_by_coord(target), ++ (char *)item_body_by_coord(source) + from, count); ++ ++ /* new units are inserted before first unit in an item, ++ therefore, we have to update item key */ ++ item_key_by_coord(source, &key); ++ set_key_offset(&key, get_key_offset(&key) + from); ++ ++ node_plugin_by_node(target->node)->update_item_key(target, &key, ++ NULL /*info */); ++ } ++} ++ ++/* plugin->u.item.b.create_hook */ ++ ++/* item_plugin->b.kill_hook ++ this is called when @count units starting from @from-th one are going to be removed ++ */ ++int ++kill_hook_tail(const coord_t * coord, pos_in_node_t from, ++ pos_in_node_t count, struct carry_kill_data *kdata) ++{ ++ reiser4_key key; ++ loff_t start, end; ++ ++ assert("vs-1577", kdata); ++ assert("vs-1579", kdata->inode); ++ ++ item_key_by_coord(coord, &key); ++ start = get_key_offset(&key) + from; ++ end = start + count; ++ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate); ++ return 0; ++} ++ ++/* plugin->u.item.b.shift_hook */ ++ ++/* helper for kill_units_tail and cut_units_tail */ ++static int ++do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ reiser4_key * smallest_removed, reiser4_key * new_first) ++{ ++ pos_in_node_t count; ++ ++ /* this method is only called to remove part of item */ ++ assert("vs-374", (to - from + 1) < item_length_by_coord(coord)); ++ /* tails items are never cut from the middle of an item */ ++ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord))); ++ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord))); ++ ++ count = to - from + 1; ++ ++ if (smallest_removed) { ++ /* store smallest key removed */ ++ item_key_by_coord(coord, smallest_removed); ++ set_key_offset(smallest_removed, ++ get_key_offset(smallest_removed) + from); ++ } ++ if (new_first) { ++ /* head of item is cut */ ++ assert("vs-1529", from == 0); ++ ++ item_key_by_coord(coord, new_first); ++ set_key_offset(new_first, ++ get_key_offset(new_first) + from + count); ++ } ++ ++ if (REISER4_DEBUG) ++ memset((char *)item_body_by_coord(coord) + from, 0, count); ++ return count; ++} ++ ++/* plugin->u.item.b.cut_units */ ++int ++cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_cut_data *cdata UNUSED_ARG, ++ 
reiser4_key * smallest_removed, reiser4_key * new_first) ++{ ++ return do_cut_or_kill(coord, from, to, smallest_removed, new_first); ++} ++ ++/* plugin->u.item.b.kill_units */ ++int ++kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, ++ struct carry_kill_data *kdata, reiser4_key * smallest_removed, ++ reiser4_key * new_first) ++{ ++ kill_hook_tail(coord, from, to - from + 1, kdata); ++ return do_cut_or_kill(coord, from, to, smallest_removed, new_first); ++} ++ ++/* plugin->u.item.b.unit_key */ ++reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key) ++{ ++ assert("vs-375", coord_is_existing_unit(coord)); ++ ++ item_key_by_coord(coord, key); ++ set_key_offset(key, (get_key_offset(key) + coord->unit_pos)); ++ ++ return key; ++} ++ ++/* plugin->u.item.b.estimate ++ plugin->u.item.b.item_data_by_flow */ ++ ++/* tail redpage function. It is called from readpage_tail(). */ ++static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page) ++{ ++ tap_t tap; ++ int result; ++ coord_t coord; ++ lock_handle lh; ++ int count, mapped; ++ struct inode *inode; ++ char *pagedata; ++ ++ /* saving passed coord in order to do not move it by tap. */ ++ init_lh(&lh); ++ copy_lh(&lh, uf_coord->lh); ++ inode = page->mapping->host; ++ coord_dup(&coord, &uf_coord->coord); ++ ++ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); ++ ++ if ((result = reiser4_tap_load(&tap))) ++ goto out_tap_done; ++ ++ /* lookup until page is filled up. */ ++ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) { ++ /* number of bytes to be copied to page */ ++ count = item_length_by_coord(&coord) - coord.unit_pos; ++ if (count > PAGE_CACHE_SIZE - mapped) ++ count = PAGE_CACHE_SIZE - mapped; ++ ++ /* attach @page to address space and get data address */ ++ pagedata = kmap_atomic(page, KM_USER0); ++ ++ /* copy tail item to page */ ++ memcpy(pagedata + mapped, ++ ((char *)item_body_by_coord(&coord) + coord.unit_pos), ++ count); ++ mapped += count; ++ ++ flush_dcache_page(page); ++ ++ /* dettach page from address space */ ++ kunmap_atomic(pagedata, KM_USER0); ++ ++ /* Getting next tail item. */ ++ if (mapped < PAGE_CACHE_SIZE) { ++ /* ++ * unlock page in order to avoid keep it locked ++ * during tree lookup, which takes long term locks ++ */ ++ unlock_page(page); ++ ++ /* getting right neighbour. */ ++ result = go_dir_el(&tap, RIGHT_SIDE, 0); ++ ++ /* lock page back */ ++ lock_page(page); ++ if (PageUptodate(page)) { ++ /* ++ * another thread read the page, we have ++ * nothing to do ++ */ ++ result = 0; ++ goto out_unlock_page; ++ } ++ ++ if (result) { ++ if (result == -E_NO_NEIGHBOR) { ++ /* ++ * rigth neighbor is not a formatted ++ * node ++ */ ++ result = 0; ++ goto done; ++ } else { ++ goto out_tap_relse; ++ } ++ } else { ++ if (!inode_file_plugin(inode)-> ++ owns_item(inode, &coord)) { ++ /* item of another file is found */ ++ result = 0; ++ goto done; ++ } ++ } ++ } ++ } ++ ++ done: ++ if (mapped != PAGE_CACHE_SIZE) ++ zero_user_segment(page, mapped, PAGE_CACHE_SIZE); ++ SetPageUptodate(page); ++ out_unlock_page: ++ unlock_page(page); ++ out_tap_relse: ++ reiser4_tap_relse(&tap); ++ out_tap_done: ++ reiser4_tap_done(&tap); ++ return result; ++} ++ ++/* ++ plugin->s.file.readpage ++ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail ++ or ++ filemap_fault->reiser4_readpage->readpage_unix_file->->readpage_tail ++ ++ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail ++ item. 
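The loop in do_readpage_tail() above has a standard shape: fill a fixed-size destination from a chain of variable-length records, clamping each copy to whichever side runs out first, then zero whatever is left. The same shape with plain memory records instead of tree items (all names hypothetical):

#include <stddef.h>
#include <string.h>

struct rec { const char *data; size_t len; };

static void fill_page(char *page, size_t page_size,
		      const struct rec *recs, int nrecs)
{
	size_t mapped = 0;

	for (int i = 0; i < nrecs && mapped < page_size; i++) {
		size_t count = recs[i].len;

		if (count > page_size - mapped)
			count = page_size - mapped;	/* clamp to page */
		memcpy(page + mapped, recs[i].data, count);
		mapped += count;
	}
	if (mapped < page_size)		/* short file or hole: zero-fill */
		memset(page + mapped, 0, page_size - mapped);
}

What the sketch cannot show is the locking choreography around the copy: the real function unlocks the page before walking to the right neighbour and rechecks PageUptodate() after relocking, since another thread may have filled the page in the meantime.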
*/ ++int readpage_tail(void *vp, struct page *page) ++{ ++ uf_coord_t *uf_coord = vp; ++ ON_DEBUG(coord_t * coord = &uf_coord->coord); ++ ON_DEBUG(reiser4_key key); ++ ++ assert("umka-2515", PageLocked(page)); ++ assert("umka-2516", !PageUptodate(page)); ++ assert("umka-2517", !jprivate(page) && !PagePrivate(page)); ++ assert("umka-2518", page->mapping && page->mapping->host); ++ ++ assert("umka-2519", znode_is_loaded(coord->node)); ++ assert("umka-2520", item_is_tail(coord)); ++ assert("umka-2521", coord_is_existing_unit(coord)); ++ assert("umka-2522", znode_is_rlocked(coord->node)); ++ assert("umka-2523", ++ page->mapping->host->i_ino == ++ get_key_objectid(item_key_by_coord(coord, &key))); ++ ++ return do_readpage_tail(uf_coord, page); ++} ++ ++/** ++ * overwrite_tail ++ * @flow: ++ * @coord: ++ * ++ * Overwrites tail item or its part by user data. Returns number of bytes ++ * written or error code. ++ */ ++static int overwrite_tail(flow_t *flow, coord_t *coord) ++{ ++ unsigned count; ++ ++ assert("vs-570", flow->user == 1); ++ assert("vs-946", flow->data); ++ assert("vs-947", coord_is_existing_unit(coord)); ++ assert("vs-948", znode_is_write_locked(coord->node)); ++ assert("nikita-3036", reiser4_schedulable()); ++ ++ count = item_length_by_coord(coord) - coord->unit_pos; ++ if (count > flow->length) ++ count = flow->length; ++ ++ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos, ++ (const char __user *)flow->data, count)) ++ return RETERR(-EFAULT); ++ ++ znode_make_dirty(coord->node); ++ return count; ++} ++ ++/** ++ * insert_first_tail ++ * @inode: ++ * @flow: ++ * @coord: ++ * @lh: ++ * ++ * Returns number of bytes written or error code. ++ */ ++static ssize_t insert_first_tail(struct inode *inode, flow_t *flow, ++ coord_t *coord, lock_handle *lh) ++{ ++ int result; ++ loff_t to_write; ++ struct unix_file_info *uf_info; ++ ++ if (get_key_offset(&flow->key) != 0) { ++ /* ++ * file is empty and we have to write not to the beginning of ++ * file. Create a hole at the beginning of file. On success ++ * insert_flow returns 0 as number of written bytes which is ++ * what we have to return on padding a file with holes ++ */ ++ flow->data = NULL; ++ flow->length = get_key_offset(&flow->key); ++ set_key_offset(&flow->key, 0); ++ /* ++ * holes in files built of tails are stored just like if there ++ * were real data which are all zeros. Therefore we have to ++ * allocate quota here as well ++ */ ++ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ result = reiser4_insert_flow(coord, lh, flow); ++ if (flow->length) ++ vfs_dq_free_space_nodirty(inode, flow->length); ++ ++ uf_info = unix_file_inode_data(inode); ++ ++ /* ++ * first item insertion is only possible when writing to empty ++ * file or performing tail conversion ++ */ ++ assert("", (uf_info->container == UF_CONTAINER_EMPTY || ++ (reiser4_inode_get_flag(inode, ++ REISER4_PART_MIXED) && ++ reiser4_inode_get_flag(inode, ++ REISER4_PART_IN_CONV)))); ++ /* if file was empty - update its state */ ++ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) ++ uf_info->container = UF_CONTAINER_TAILS; ++ return result; ++ } ++ ++ /* check quota before appending data */ ++ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ ++ to_write = flow->length; ++ result = reiser4_insert_flow(coord, lh, flow); ++ if (flow->length) ++ vfs_dq_free_space_nodirty(inode, flow->length); ++ return (to_write - flow->length) ? 
(to_write - flow->length) : result; ++} ++ ++/** ++ * append_tail ++ * @inode: ++ * @flow: ++ * @coord: ++ * @lh: ++ * ++ * Returns number of bytes written or error code. ++ */ ++static ssize_t append_tail(struct inode *inode, ++ flow_t *flow, coord_t *coord, lock_handle *lh) ++{ ++ int result; ++ reiser4_key append_key; ++ loff_t to_write; ++ ++ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) { ++ flow->data = NULL; ++ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key); ++ set_key_offset(&flow->key, get_key_offset(&append_key)); ++ /* ++ * holes in files built of tails are stored just like if there ++ * were real data which are all zeros. Therefore we have to ++ * allocate quota here as well ++ */ ++ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ result = reiser4_insert_flow(coord, lh, flow); ++ if (flow->length) ++ vfs_dq_free_space_nodirty(inode, flow->length); ++ return result; ++ } ++ ++ /* check quota before appending data */ ++ if (vfs_dq_alloc_space_nodirty(inode, flow->length)) ++ return RETERR(-EDQUOT); ++ ++ to_write = flow->length; ++ result = reiser4_insert_flow(coord, lh, flow); ++ if (flow->length) ++ vfs_dq_free_space_nodirty(inode, flow->length); ++ return (to_write - flow->length) ? (to_write - flow->length) : result; ++} ++ ++/** ++ * write_tail_reserve_space - reserve space for tail write operation ++ * @inode: ++ * ++ * Estimates and reserves space which may be required for writing one flow to a ++ * file ++ */ ++static int write_extent_reserve_space(struct inode *inode) ++{ ++ __u64 count; ++ reiser4_tree *tree; ++ ++ /* ++ * to write one flow to a file by tails we have to reserve disk space for: ++ ++ * 1. find_file_item may have to insert empty node to the tree (empty ++ * leaf node between two extent items). This requires 1 block and ++ * number of blocks which are necessary to perform insertion of an ++ * internal item into twig level. ++ * ++ * 2. flow insertion ++ * ++ * 3. stat data update ++ */ ++ tree = reiser4_tree_by_inode(inode); ++ count = estimate_one_insert_item(tree) + ++ estimate_insert_flow(tree->height) + ++ estimate_one_insert_item(tree); ++ grab_space_enable(); ++ return reiser4_grab_space(count, 0 /* flags */); ++} ++ ++#define PAGE_PER_FLOW 4 ++ ++static loff_t faultin_user_pages(const char __user *buf, size_t count) ++{ ++ loff_t faulted; ++ int to_fault; ++ ++ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE) ++ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE; ++ faulted = 0; ++ while (count > 0) { ++ to_fault = PAGE_CACHE_SIZE; ++ if (count < to_fault) ++ to_fault = count; ++ fault_in_pages_readable(buf + faulted, to_fault); ++ count -= to_fault; ++ faulted += to_fault; ++ } ++ return faulted; ++} ++ ++/** ++ * reiser4_write_tail - write method of tail item plugin ++ * @file: file to write to ++ * @buf: address of user-space buffer ++ * @count: number of bytes to write ++ * @pos: position in file to write to ++ * ++ * Returns number of written bytes or error code. 
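faultin_user_pages() above pre-faults at most PAGE_PER_FLOW pages of the user buffer, and the flow is clamped to what was faulted in, apparently so that copying from the buffer later, while long-term znode locks are held, cannot trigger a major page fault (the read path below makes the same point in a comment). Here is a user-space model of just the clamping and page-sized stepping; faultin() is a made-up name, and fault_in_pages_readable() is reduced to a comment since it has no user-space equivalent:

#include <stdio.h>

#define PAGE_SIZE 4096
#define PAGE_PER_FLOW 4

static long faultin(size_t count)
{
        long faulted = 0;

        /* never fault in more than PAGE_PER_FLOW pages per pass */
        if (count > PAGE_PER_FLOW * PAGE_SIZE)
                count = PAGE_PER_FLOW * PAGE_SIZE;
        while (count > 0) {
                size_t to_fault = PAGE_SIZE;

                if (count < to_fault)
                        to_fault = count;
                /* fault_in_pages_readable(buf + faulted, to_fault) here */
                count -= to_fault;
                faulted += to_fault;
        }
        return faulted;
}

int main(void)
{
        /* a 100 KB write is clamped to 4 pages = 16384 bytes per pass */
        printf("%ld\n", faultin(100 * 1024));
        return 0;
}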
++ */ ++ssize_t reiser4_write_tail(struct file *file, struct inode * inode, ++ const char __user *buf, size_t count, loff_t *pos) ++{ ++ struct hint hint; ++ int result; ++ flow_t flow; ++ coord_t *coord; ++ lock_handle *lh; ++ znode *loaded; ++ ++ assert("edward-1548", inode != NULL); ++ ++ if (write_extent_reserve_space(inode)) ++ return RETERR(-ENOSPC); ++ ++ result = load_file_hint(file, &hint); ++ BUG_ON(result != 0); ++ ++ flow.length = faultin_user_pages(buf, count); ++ flow.user = 1; ++ memcpy(&flow.data, &buf, sizeof(buf)); ++ flow.op = WRITE_OP; ++ key_by_inode_and_offset_common(inode, *pos, &flow.key); ++ ++ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode); ++ if (IS_CBKERR(result)) ++ return result; ++ ++ coord = &hint.ext_coord.coord; ++ lh = hint.ext_coord.lh; ++ ++ result = zload(coord->node); ++ BUG_ON(result != 0); ++ loaded = coord->node; ++ ++ if (coord->between == AFTER_UNIT) { ++ /* append with data or hole */ ++ result = append_tail(inode, &flow, coord, lh); ++ } else if (coord->between == AT_UNIT) { ++ /* overwrite */ ++ result = overwrite_tail(&flow, coord); ++ } else { ++ /* no items of this file yet. insert data or hole */ ++ result = insert_first_tail(inode, &flow, coord, lh); ++ } ++ zrelse(loaded); ++ if (result < 0) { ++ done_lh(lh); ++ return result; ++ } ++ ++ /* seal and unlock znode */ ++ hint.ext_coord.valid = 0; ++ if (hint.ext_coord.valid) ++ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK); ++ else ++ reiser4_unset_hint(&hint); ++ ++ save_file_hint(file, &hint); ++ return result; ++} ++ ++#if REISER4_DEBUG ++ ++static int ++coord_matches_key_tail(const coord_t * coord, const reiser4_key * key) ++{ ++ reiser4_key item_key; ++ ++ assert("vs-1356", coord_is_existing_unit(coord)); ++ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key))); ++ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key))); ++ return get_key_offset(key) == ++ get_key_offset(&item_key) + coord->unit_pos; ++ ++} ++ ++#endif ++ ++/* plugin->u.item.s.file.read */ ++int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint) ++{ ++ unsigned count; ++ int item_length; ++ coord_t *coord; ++ uf_coord_t *uf_coord; ++ ++ uf_coord = &hint->ext_coord; ++ coord = &uf_coord->coord; ++ ++ assert("vs-571", f->user == 1); ++ assert("vs-571", f->data); ++ assert("vs-967", coord && coord->node); ++ assert("vs-1117", znode_is_rlocked(coord->node)); ++ assert("vs-1118", znode_is_loaded(coord->node)); ++ ++ assert("nikita-3037", reiser4_schedulable()); ++ assert("vs-1357", coord_matches_key_tail(coord, &f->key)); ++ ++ /* calculate number of bytes to read off the item */ ++ item_length = item_length_by_coord(coord); ++ count = item_length_by_coord(coord) - coord->unit_pos; ++ if (count > f->length) ++ count = f->length; ++ ++ /* user page has to be brought in so that major page fault does not ++ * occur here when longtem lock is held */ ++ if (__copy_to_user((char __user *)f->data, ++ ((char *)item_body_by_coord(coord) + coord->unit_pos), ++ count)) ++ return RETERR(-EFAULT); ++ ++ /* probably mark_page_accessed() should only be called if ++ * coord->unit_pos is zero. 
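After find_file_item(), reiser4_write_tail() above chooses among three helpers purely from coord->between. A compressed restatement of that dispatch; the enum here is a local stand-in for the patch's real coord state, used only to make the three cases explicit:

#include <stdio.h>

enum between { AT_UNIT, AFTER_UNIT, ELSEWHERE };

static const char *tail_write_strategy(enum between b)
{
        switch (b) {
        case AFTER_UNIT:
                return "append_tail: extend the file with data or a hole";
        case AT_UNIT:
                return "overwrite_tail: replace bytes of an existing item";
        default:
                return "insert_first_tail: no items of this file yet";
        }
}

int main(void)
{
        printf("%s\n", tail_write_strategy(AFTER_UNIT));
        return 0;
}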
*/
++ mark_page_accessed(znode_page(coord->node));
++ move_flow_forward(f, count);
++
++ coord->unit_pos += count;
++ if (item_length == coord->unit_pos) {
++ coord->unit_pos--;
++ coord->between = AFTER_UNIT;
++ }
++ reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
++ return 0;
++}
++
++/*
++ plugin->u.item.s.file.append_key
++ key of the first byte right after the last byte addressed by this item
++*/
++reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
++{
++ item_key_by_coord(coord, key);
++ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
++ return key;
++}
++
++/* plugin->u.item.s.file.init_coord_extension */
++void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
++{
++ uf_coord->valid = 1;
++}
++
++/*
++ plugin->u.item.s.file.get_block
++*/
++int
++get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
++{
++ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
++
++ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
++ /* if the node hasn't obtained its block number yet, return 0.
++ * Let's avoid upsetting users with cosmic numbers beyond
++ * the device capacity.*/
++ *block = 0;
++ else
++ *block = *znode_get_block(coord->node);
++ return 0;
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * scroll-step: 1
++ * End:
++ */
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/item/tail.h linux-2.6.33/fs/reiser4/plugin/item/tail.h
+--- linux-2.6.33.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/item/tail.h 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,56 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#if !defined( __REISER4_TAIL_H__ )
++#define __REISER4_TAIL_H__
++
++struct tail_coord_extension {
++ int not_used;
++};
++
++struct cut_list;
++
++/* plugin->u.item.b.* */
++reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
++int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
++ const reiser4_item_data *);
++int mergeable_tail(const coord_t * p1, const coord_t * p2);
++pos_in_node_t nr_units_tail(const coord_t *);
++lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
++int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
++int can_shift_tail(unsigned free_space, coord_t * source,
++ znode * target, shift_direction, unsigned *size,
++ unsigned want);
++void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
++ unsigned count, shift_direction, unsigned free_space);
++int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
++ struct carry_kill_data *);
++int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
++ struct carry_cut_data *, reiser4_key * smallest_removed,
++ reiser4_key * new_first);
++int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
++ struct carry_kill_data *, reiser4_key * smallest_removed,
++ reiser4_key * new_first);
++reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
++
++/* plugin->u.item.s.* */
++ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
++ const char __user *buf, size_t count, loff_t *pos);
++int reiser4_read_tail(struct file *, flow_t *, hint_t *);
++int readpage_tail(void *vp, struct page *page);
++reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
++void
init_coord_extension_tail(uf_coord_t *, loff_t offset); ++int get_block_address_tail(const coord_t *, sector_t, sector_t *); ++ ++/* __REISER4_TAIL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/Makefile linux-2.6.33/fs/reiser4/plugin/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,26 @@ ++obj-$(CONFIG_REISER4_FS) += plugins.o ++ ++plugins-objs := \ ++ plugin.o \ ++ plugin_set.o \ ++ object.o \ ++ inode_ops.o \ ++ inode_ops_rename.o \ ++ file_ops.o \ ++ file_ops_readdir.o \ ++ file_plugin_common.o \ ++ dir_plugin_common.o \ ++ digest.o \ ++ hash.o \ ++ fibration.o \ ++ tail_policy.o \ ++ regular.o ++ ++obj-$(CONFIG_REISER4_FS) += item/ ++obj-$(CONFIG_REISER4_FS) += file/ ++obj-$(CONFIG_REISER4_FS) += dir/ ++obj-$(CONFIG_REISER4_FS) += node/ ++obj-$(CONFIG_REISER4_FS) += compress/ ++obj-$(CONFIG_REISER4_FS) += space/ ++obj-$(CONFIG_REISER4_FS) += disk_format/ ++obj-$(CONFIG_REISER4_FS) += security/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/Makefile linux-2.6.33/fs/reiser4/plugin/node/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/node/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_REISER4_FS) += node_plugins.o ++ ++node_plugins-objs := \ ++ node.o \ ++ node40.o +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node40.c linux-2.6.33/fs/reiser4/plugin/node/node40.c +--- linux-2.6.33.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/node/node40.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,2924 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#include "../../debug.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "../plugin_header.h" ++#include "../item/item.h" ++#include "node.h" ++#include "node40.h" ++#include "../plugin.h" ++#include "../../jnode.h" ++#include "../../znode.h" ++#include "../../pool.h" ++#include "../../carry.h" ++#include "../../tap.h" ++#include "../../tree.h" ++#include "../../super.h" ++#include "../../reiser4.h" ++ ++#include <asm/uaccess.h> ++#include <linux/types.h> ++#include <linux/prefetch.h> ++ ++/* leaf 40 format: ++ ++ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ] ++ plugin_id (16) key ++ free_space (16) pluginid (16) ++ free_space_start (16) offset (16) ++ level (8) ++ num_items (16) ++ magic (32) ++ flush_time (32) ++*/ ++/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". 
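The "leaf 40 format" comment above describes item bodies growing up from the node header while item headers grow down from the end of the block, with the free space in between; node40_ih_at() below encodes exactly that as (zdata(node) + znode_size(node)) - pos - 1. A toy user-space model of the arithmetic, with made-up sizes rather than the real sizeof(node40_header)/sizeof(item_header40):

#include <stdio.h>

#define BLOCK_SIZE 4096u
#define IH_SIZE 24u /* assumed size of one item header */

/* byte offset of the header of item @pos, counting down from block end */
static unsigned ih_offset(unsigned pos)
{
        return BLOCK_SIZE - (pos + 1) * IH_SIZE;
}

int main(void)
{
        unsigned nr_items = 3, free_space_start = 200;
        unsigned pos;

        for (pos = 0; pos < nr_items; pos++)
                printf("item_head %u at [%u..%u)\n",
                       pos, ih_offset(pos), ih_offset(pos) + IH_SIZE);
        /* the free space is squeezed between the last item body and the
           lowest item header */
        printf("free space: [%u..%u)\n",
               free_space_start, ih_offset(nr_items - 1));
        return 0;
}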
*/ ++/* magic number that is stored in ->magic field of node header */ ++static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */ ++ ++static int prepare_for_update(znode * left, znode * right, ++ carry_plugin_info * info); ++ ++/* header of node of reiser40 format is at the beginning of node */ ++static inline node40_header *node40_node_header(const znode * node /* node to ++ * query */ ) ++{ ++ assert("nikita-567", node != NULL); ++ assert("nikita-568", znode_page(node) != NULL); ++ assert("nikita-569", zdata(node) != NULL); ++ return (node40_header *) zdata(node); ++} ++ ++/* functions to get/set fields of node40_header */ ++#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic)) ++#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space)) ++#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start)) ++#define nh40_get_level(nh) get_unaligned(&(nh)->level) ++#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items)) ++#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id)) ++ ++#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic) ++#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space) ++#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start) ++#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level) ++#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items) ++#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id) ++ ++/* plugin field of node header should be read/set by ++ plugin_by_disk_id/save_disk_plugin */ ++ ++/* array of item headers is at the end of node */ ++static inline item_header40 *node40_ih_at(const znode * node, unsigned pos) ++{ ++ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1; ++} ++ ++/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1 ++ */ ++static inline item_header40 *node40_ih_at_coord(const coord_t * coord) ++{ ++ return (item_header40 *) (zdata(coord->node) + ++ znode_size(coord->node)) - (coord->item_pos) - ++ 1; ++} ++ ++/* functions to get/set fields of item_header40 */ ++#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset)) ++ ++#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset) ++ ++/* plugin field of item header should be read/set by ++ plugin_by_disk_id/save_disk_plugin */ ++ ++/* plugin methods */ ++ ++/* plugin->u.node.item_overhead ++ look for description of this method in plugin/node/node.h */ ++size_t ++item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG) ++{ ++ return sizeof(item_header40); ++} ++ ++/* plugin->u.node.free_space ++ look for description of this method in plugin/node/node.h */ ++size_t free_space_node40(znode * node) ++{ ++ assert("nikita-577", node != NULL); ++ assert("nikita-578", znode_is_loaded(node)); ++ assert("nikita-579", zdata(node) != NULL); ++ ++ return nh40_get_free_space(node40_node_header(node)); ++} ++ ++/* private inline version of node40_num_of_items() for use in this file. This ++ is necessary, because address of node40_num_of_items() is taken and it is ++ never inlined as a result. 
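The nh40_*/ih40_* accessors above wrap every field access in get_unaligned()/put_unaligned() plus a le*_to_cpu()/cpu_to_le*() conversion: the on-disk format is little-endian regardless of the host CPU, and headers may sit at unaligned offsets inside the block. A portable user-space equivalent of the 16-bit pair (get_le16/put_le16 are illustrative names, not kernel API); going through memcpy keeps the compiler from emitting a potentially faulting unaligned load:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t get_le16(const void *p)
{
        uint8_t b[2];

        memcpy(b, p, 2);
        return (uint16_t)(b[0] | (b[1] << 8));
}

static void put_le16(void *p, uint16_t v)
{
        uint8_t b[2] = { v & 0xff, v >> 8 };

        memcpy(p, b, 2);
}

int main(void)
{
        unsigned char raw[3] = { 0, 0x34, 0x12 }; /* field at odd offset */

        printf("0x%x\n", get_le16(raw + 1)); /* 0x1234 on any host */
        put_le16(raw + 1, 0xbeef);
        printf("0x%x\n", get_le16(raw + 1)); /* 0xbeef */
        return 0;
}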
*/ ++static inline short node40_num_of_items_internal(const znode * node) ++{ ++ return nh40_get_num_items(node40_node_header(node)); ++} ++ ++#if REISER4_DEBUG ++static inline void check_num_items(const znode * node) ++{ ++ assert("nikita-2749", ++ node40_num_of_items_internal(node) == node->nr_items); ++ assert("nikita-2746", znode_is_write_locked(node)); ++} ++#else ++#define check_num_items(node) noop ++#endif ++ ++/* plugin->u.node.num_of_items ++ look for description of this method in plugin/node/node.h */ ++int num_of_items_node40(const znode * node) ++{ ++ return node40_num_of_items_internal(node); ++} ++ ++static void ++node40_set_num_items(znode * node, node40_header * nh, unsigned value) ++{ ++ assert("nikita-2751", node != NULL); ++ assert("nikita-2750", nh == node40_node_header(node)); ++ ++ check_num_items(node); ++ nh40_set_num_items(nh, value); ++ node->nr_items = value; ++ check_num_items(node); ++} ++ ++/* plugin->u.node.item_by_coord ++ look for description of this method in plugin/node/node.h */ ++char *item_by_coord_node40(const coord_t * coord) ++{ ++ item_header40 *ih; ++ char *p; ++ ++ /* @coord is set to existing item */ ++ assert("nikita-596", coord != NULL); ++ assert("vs-255", coord_is_existing_item(coord)); ++ ++ ih = node40_ih_at_coord(coord); ++ p = zdata(coord->node) + ih40_get_offset(ih); ++ return p; ++} ++ ++/* plugin->u.node.length_by_coord ++ look for description of this method in plugin/node/node.h */ ++int length_by_coord_node40(const coord_t * coord) ++{ ++ item_header40 *ih; ++ int result; ++ ++ /* @coord is set to existing item */ ++ assert("vs-256", coord != NULL); ++ assert("vs-257", coord_is_existing_item(coord)); ++ ++ ih = node40_ih_at_coord(coord); ++ if ((int)coord->item_pos == ++ node40_num_of_items_internal(coord->node) - 1) ++ result = ++ nh40_get_free_space_start(node40_node_header(coord->node)) - ++ ih40_get_offset(ih); ++ else ++ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); ++ ++ return result; ++} ++ ++static pos_in_node_t ++node40_item_length(const znode * node, pos_in_node_t item_pos) ++{ ++ item_header40 *ih; ++ pos_in_node_t result; ++ ++ /* @coord is set to existing item */ ++ assert("vs-256", node != NULL); ++ assert("vs-257", node40_num_of_items_internal(node) > item_pos); ++ ++ ih = node40_ih_at(node, item_pos); ++ if (item_pos == node40_num_of_items_internal(node) - 1) ++ result = ++ nh40_get_free_space_start(node40_node_header(node)) - ++ ih40_get_offset(ih); ++ else ++ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); ++ ++ return result; ++} ++ ++/* plugin->u.node.plugin_by_coord ++ look for description of this method in plugin/node/node.h */ ++item_plugin *plugin_by_coord_node40(const coord_t * coord) ++{ ++ item_header40 *ih; ++ item_plugin *result; ++ ++ /* @coord is set to existing item */ ++ assert("vs-258", coord != NULL); ++ assert("vs-259", coord_is_existing_item(coord)); ++ ++ ih = node40_ih_at_coord(coord); ++ /* pass NULL in stead of current tree. This is time critical call. 
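node40 stores no per-item length field: length_by_coord_node40() and node40_item_length() above derive the length from neighbouring offsets, with free_space_start serving as the "next offset" for the last item. Since item headers are laid out right to left, "ih - 1" is the header of the *next* item; the sketch below flattens that detail into a plain offsets array (names and numbers are illustrative):

#include <stdio.h>

static int item_length(const unsigned short *offsets, int nr_items,
                       unsigned short free_space_start, int pos)
{
        if (pos == nr_items - 1)
                return free_space_start - offsets[pos];
        return offsets[pos + 1] - offsets[pos];
}

int main(void)
{
        /* body offsets of three items; free space begins at 200 */
        unsigned short offsets[] = { 40, 100, 160 };
        int pos;

        for (pos = 0; pos < 3; pos++)
                printf("item %d: %d bytes\n",
                       pos, item_length(offsets, 3, 200, pos));
        return 0;
}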
*/ ++ result = item_plugin_by_disk_id(NULL, &ih->plugin_id); ++ return result; ++} ++ ++/* plugin->u.node.key_at ++ look for description of this method in plugin/node/node.h */ ++reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key) ++{ ++ item_header40 *ih; ++ ++ assert("nikita-1765", coord_is_existing_item(coord)); ++ ++ /* @coord is set to existing item */ ++ ih = node40_ih_at_coord(coord); ++ memcpy(key, &ih->key, sizeof(reiser4_key)); ++ return key; ++} ++ ++/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */ ++ ++#define NODE_INCSTAT(n, counter) \ ++ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter) ++ ++#define NODE_ADDSTAT(n, counter, val) \ ++ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val) ++ ++/* plugin->u.node.lookup ++ look for description of this method in plugin/node/node.h */ ++node_search_result lookup_node40(znode * node /* node to query */ , ++ const reiser4_key * key /* key to look for */ , ++ lookup_bias bias /* search bias */ , ++ coord_t * coord /* resulting coord */ ) ++{ ++ int left; ++ int right; ++ int found; ++ int items; ++ ++ item_header40 *lefth; ++ item_header40 *righth; ++ ++ item_plugin *iplug; ++ item_header40 *bstop; ++ item_header40 *ih; ++ cmp_t order; ++ ++ assert("nikita-583", node != NULL); ++ assert("nikita-584", key != NULL); ++ assert("nikita-585", coord != NULL); ++ assert("nikita-2693", znode_is_any_locked(node)); ++ cassert(REISER4_SEQ_SEARCH_BREAK > 2); ++ ++ items = node_num_items(node); ++ ++ if (unlikely(items == 0)) { ++ coord_init_first_unit(coord, node); ++ return NS_NOT_FOUND; ++ } ++ ++ /* binary search for item that can contain given key */ ++ left = 0; ++ right = items - 1; ++ coord->node = node; ++ coord_clear_iplug(coord); ++ found = 0; ++ ++ lefth = node40_ih_at(node, left); ++ righth = node40_ih_at(node, right); ++ ++ /* It is known that for small arrays sequential search is on average ++ more efficient than binary. This is because sequential search is ++ coded as tight loop that can be better optimized by compilers and ++ for small array size gain from this optimization makes sequential ++ search the winner. Another, maybe more important, reason for this, ++ is that sequential array is more CPU cache friendly, whereas binary ++ search effectively destroys CPU caching. ++ ++ Critical here is the notion of "smallness". Reasonable value of ++ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in ++ fs/reiser4/ulevel/ulevel.c:test_search(). ++ ++ Don't try to further optimize sequential search by scanning from ++ right to left in attempt to use more efficient loop termination ++ condition (comparison with 0). This doesn't work. ++ ++ */ ++ ++ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { ++ int median; ++ item_header40 *medianh; ++ ++ median = (left + right) / 2; ++ medianh = node40_ih_at(node, median); ++ ++ assert("nikita-1084", median >= 0); ++ assert("nikita-1085", median < items); ++ switch (keycmp(key, &medianh->key)) { ++ case LESS_THAN: ++ right = median; ++ righth = medianh; ++ break; ++ default: ++ wrong_return_value("nikita-586", "keycmp"); ++ case GREATER_THAN: ++ left = median; ++ lefth = medianh; ++ break; ++ case EQUAL_TO: ++ do { ++ --median; ++ /* headers are ordered from right to left */ ++ ++medianh; ++ } while (median >= 0 && keyeq(key, &medianh->key)); ++ right = left = median + 1; ++ ih = lefth = righth = medianh - 1; ++ found = 1; ++ break; ++ } ++ } ++ /* sequential scan. 
Item headers, and, therefore, keys are stored at ++ the rightmost part of a node from right to left. We are trying to ++ access memory from left to right, and hence, scan in _descending_ ++ order of item numbers. ++ */ ++ if (!found) { ++ for (left = right, ih = righth; left >= 0; ++ih, --left) { ++ cmp_t comparison; ++ ++ prefetchkey(&(ih + 1)->key); ++ comparison = keycmp(&ih->key, key); ++ if (comparison == GREATER_THAN) ++ continue; ++ if (comparison == EQUAL_TO) { ++ found = 1; ++ do { ++ --left; ++ ++ih; ++ } while (left >= 0 && keyeq(&ih->key, key)); ++ ++left; ++ --ih; ++ } else { ++ assert("nikita-1256", comparison == LESS_THAN); ++ } ++ break; ++ } ++ if (unlikely(left < 0)) ++ left = 0; ++ } ++ ++ assert("nikita-3212", right >= left); ++ assert("nikita-3214", ++ equi(found, keyeq(&node40_ih_at(node, left)->key, key))); ++ ++ coord_set_item_pos(coord, left); ++ coord->unit_pos = 0; ++ coord->between = AT_UNIT; ++ ++ /* key < leftmost key in a mode or node is corrupted and keys ++ are not sorted */ ++ bstop = node40_ih_at(node, (unsigned)left); ++ order = keycmp(&bstop->key, key); ++ if (unlikely(order == GREATER_THAN)) { ++ if (unlikely(left != 0)) { ++ /* screw up */ ++ warning("nikita-587", "Key less than %i key in a node", ++ left); ++ reiser4_print_key("key", key); ++ reiser4_print_key("min", &bstop->key); ++ print_coord_content("coord", coord); ++ return RETERR(-EIO); ++ } else { ++ coord->between = BEFORE_UNIT; ++ return NS_NOT_FOUND; ++ } ++ } ++ /* left <= key, ok */ ++ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id); ++ ++ if (unlikely(iplug == NULL)) { ++ warning("nikita-588", "Unknown plugin %i", ++ le16_to_cpu(get_unaligned(&bstop->plugin_id))); ++ reiser4_print_key("key", key); ++ print_coord_content("coord", coord); ++ return RETERR(-EIO); ++ } ++ ++ coord_set_iplug(coord, iplug); ++ ++ /* if exact key from item header was found by binary search, no ++ further checks are necessary. */ ++ if (found) { ++ assert("nikita-1259", order == EQUAL_TO); ++ return NS_FOUND; ++ } ++ if (iplug->b.max_key_inside != NULL) { ++ reiser4_key max_item_key; ++ ++ /* key > max_item_key --- outside of an item */ ++ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) { ++ coord->unit_pos = 0; ++ coord->between = AFTER_ITEM; ++ /* FIXME-VS: key we are looking for does not fit into ++ found item. Return NS_NOT_FOUND then. Without that ++ the following case does not work: there is extent of ++ file 10000, 10001. File 10000, 10002 has been just ++ created. When writing to position 0 in that file - ++ traverse_tree will stop here on twig level. When we ++ want it to go down to leaf level ++ */ ++ return NS_NOT_FOUND; ++ } ++ } ++ ++ if (iplug->b.lookup != NULL) { ++ return iplug->b.lookup(key, bias, coord); ++ } else { ++ assert("nikita-1260", order == LESS_THAN); ++ coord->between = AFTER_UNIT; ++ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND; ++ } ++} ++ ++#undef NODE_ADDSTAT ++#undef NODE_INCSTAT ++ ++/* plugin->u.node.estimate ++ look for description of this method in plugin/node/node.h */ ++size_t estimate_node40(znode * node) ++{ ++ size_t result; ++ ++ assert("nikita-597", node != NULL); ++ ++ result = free_space_node40(node) - sizeof(item_header40); ++ ++ return (result > 0) ? 
result : 0; ++} ++ ++/* plugin->u.node.check ++ look for description of this method in plugin/node/node.h */ ++int check_node40(const znode * node /* node to check */ , ++ __u32 flags /* check flags */ , ++ const char **error /* where to store error message */ ) ++{ ++ int nr_items; ++ int i; ++ reiser4_key prev; ++ unsigned old_offset; ++ tree_level level; ++ coord_t coord; ++ int result; ++ ++ assert("nikita-580", node != NULL); ++ assert("nikita-581", error != NULL); ++ assert("nikita-2948", znode_is_loaded(node)); ++ ++ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE)) ++ return 0; ++ ++ assert("nikita-582", zdata(node) != NULL); ++ ++ nr_items = node40_num_of_items_internal(node); ++ if (nr_items < 0) { ++ *error = "Negative number of items"; ++ return -1; ++ } ++ ++ if (flags & REISER4_NODE_DKEYS) ++ prev = *znode_get_ld_key((znode *) node); ++ else ++ prev = *reiser4_min_key(); ++ ++ old_offset = 0; ++ coord_init_zero(&coord); ++ coord.node = (znode *) node; ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ level = znode_get_level(node); ++ for (i = 0; i < nr_items; i++) { ++ item_header40 *ih; ++ reiser4_key unit_key; ++ unsigned j; ++ ++ ih = node40_ih_at(node, (unsigned)i); ++ coord_set_item_pos(&coord, i); ++ if ((ih40_get_offset(ih) >= ++ znode_size(node) - nr_items * sizeof(item_header40)) || ++ (ih40_get_offset(ih) < sizeof(node40_header))) { ++ *error = "Offset is out of bounds"; ++ return -1; ++ } ++ if (ih40_get_offset(ih) <= old_offset) { ++ *error = "Offsets are in wrong order"; ++ return -1; ++ } ++ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) { ++ *error = "Wrong offset of first item"; ++ return -1; ++ } ++ old_offset = ih40_get_offset(ih); ++ ++ if (keygt(&prev, &ih->key)) { ++ *error = "Keys are in wrong order"; ++ return -1; ++ } ++ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) { ++ *error = "Wrong key of first unit"; ++ return -1; ++ } ++ prev = ih->key; ++ for (j = 0; j < coord_num_units(&coord); ++j) { ++ coord.unit_pos = j; ++ unit_key_by_coord(&coord, &unit_key); ++ if (keygt(&prev, &unit_key)) { ++ *error = "Unit keys are in wrong order"; ++ return -1; ++ } ++ prev = unit_key; ++ } ++ coord.unit_pos = 0; ++ if (level != TWIG_LEVEL && item_is_extent(&coord)) { ++ *error = "extent on the wrong level"; ++ return -1; ++ } ++ if (level == LEAF_LEVEL && item_is_internal(&coord)) { ++ *error = "internal item on the wrong level"; ++ return -1; ++ } ++ if (level != LEAF_LEVEL && ++ !item_is_internal(&coord) && !item_is_extent(&coord)) { ++ *error = "wrong item on the internal level"; ++ return -1; ++ } ++ if (level > TWIG_LEVEL && !item_is_internal(&coord)) { ++ *error = "non-internal item on the internal level"; ++ return -1; ++ } ++#if REISER4_DEBUG ++ if (item_plugin_by_coord(&coord)->b.check ++ && item_plugin_by_coord(&coord)->b.check(&coord, error)) ++ return -1; ++#endif ++ if (i) { ++ coord_t prev_coord; ++ /* two neighboring items can not be mergeable */ ++ coord_dup(&prev_coord, &coord); ++ coord_prev_item(&prev_coord); ++ if (are_items_mergeable(&prev_coord, &coord)) { ++ *error = "mergeable items in one node"; ++ return -1; ++ } ++ ++ } ++ } ++ ++ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) { ++ coord_t coord; ++ item_plugin *iplug; ++ ++ coord_init_last_unit(&coord, node); ++ iplug = item_plugin_by_coord(&coord); ++ if ((item_is_extent(&coord) || item_is_tail(&coord)) && ++ iplug->s.file.append_key != NULL) { ++ reiser4_key mkey; ++ ++ iplug->s.file.append_key(&coord, &mkey); ++ set_key_offset(&mkey, 
get_key_offset(&mkey) - 1); ++ read_lock_dk(current_tree); ++ result = keygt(&mkey, znode_get_rd_key((znode *) node)); ++ read_unlock_dk(current_tree); ++ if (result) { ++ *error = "key of rightmost item is too large"; ++ return -1; ++ } ++ } ++ } ++ if (flags & REISER4_NODE_DKEYS) { ++ read_lock_tree(current_tree); ++ read_lock_dk(current_tree); ++ ++ flags |= REISER4_NODE_TREE_STABLE; ++ ++ if (keygt(&prev, znode_get_rd_key((znode *) node))) { ++ if (flags & REISER4_NODE_TREE_STABLE) { ++ *error = "Last key is greater than rdkey"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ } ++ if (keygt ++ (znode_get_ld_key((znode *) node), ++ znode_get_rd_key((znode *) node))) { ++ *error = "ldkey is greater than rdkey"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ++ (node->left != NULL) && ++ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) && ++ ergo(flags & REISER4_NODE_TREE_STABLE, ++ !keyeq(znode_get_rd_key(node->left), ++ znode_get_ld_key((znode *) node))) ++ && ergo(!(flags & REISER4_NODE_TREE_STABLE), ++ keygt(znode_get_rd_key(node->left), ++ znode_get_ld_key((znode *) node)))) { ++ *error = "left rdkey or ldkey is wrong"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ++ (node->right != NULL) && ++ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) && ++ ergo(flags & REISER4_NODE_TREE_STABLE, ++ !keyeq(znode_get_rd_key((znode *) node), ++ znode_get_ld_key(node->right))) ++ && ergo(!(flags & REISER4_NODE_TREE_STABLE), ++ keygt(znode_get_rd_key((znode *) node), ++ znode_get_ld_key(node->right)))) { ++ *error = "rdkey or right ldkey is wrong"; ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ return -1; ++ } ++ ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++ } ++ ++ return 0; ++} ++ ++/* plugin->u.node.parse ++ look for description of this method in plugin/node/node.h */ ++int parse_node40(znode * node /* node to parse */ ) ++{ ++ node40_header *header; ++ int result; ++ d8 level; ++ ++ header = node40_node_header((znode *) node); ++ result = -EIO; ++ level = nh40_get_level(header); ++ if (unlikely(((__u8) znode_get_level(node)) != level)) ++ warning("nikita-494", "Wrong level found in node: %i != %i", ++ znode_get_level(node), level); ++ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC)) ++ warning("nikita-495", ++ "Wrong magic in tree node: want %x, got %x", ++ REISER4_NODE_MAGIC, nh40_get_magic(header)); ++ else { ++ node->nr_items = node40_num_of_items_internal(node); ++ result = 0; ++ } ++ return RETERR(result); ++} ++ ++/* plugin->u.node.init ++ look for description of this method in plugin/node/node.h */ ++int init_node40(znode * node /* node to initialise */ ) ++{ ++ node40_header *header; ++ ++ assert("nikita-570", node != NULL); ++ assert("nikita-572", zdata(node) != NULL); ++ ++ header = node40_node_header(node); ++ memset(header, 0, sizeof(node40_header)); ++ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header)); ++ nh40_set_free_space_start(header, sizeof(node40_header)); ++ /* sane hypothesis: 0 in CPU format is 0 in disk format */ ++ /* items: 0 */ ++ save_plugin_id(node_plugin_to_plugin(node->nplug), ++ &header->common_header.plugin_id); ++ nh40_set_level(header, znode_get_level(node)); ++ nh40_set_magic(header, REISER4_NODE_MAGIC); ++ node->nr_items = 0; ++ nh40_set_mkfs_id(header, 
reiser4_mkfs_id(reiser4_get_current_sb()));
++
++ /* flags: 0 */
++ return 0;
++}
++
++#ifdef GUESS_EXISTS
++int guess_node40(const znode * node /* node to guess plugin of */ )
++{
++ node40_header *nethack;
++
++ assert("nikita-1058", node != NULL);
++ nethack = node40_node_header(node);
++ return
++ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
++ (plugin_by_disk_id(znode_get_tree(node),
++ REISER4_NODE_PLUGIN_TYPE,
++ &nethack->common_header.plugin_id)->h.id ==
++ NODE40_ID);
++}
++#endif
++
++/* plugin->u.node.change_item_size
++ look for description of this method in plugin/node/node.h */
++void change_item_size_node40(coord_t * coord, int by)
++{
++ node40_header *nh;
++ item_header40 *ih;
++ char *item_data;
++ int item_length;
++ unsigned i;
++
++ /* make sure that @item is coord of existing item */
++ assert("vs-210", coord_is_existing_item(coord));
++
++ nh = node40_node_header(coord->node);
++
++ item_data = item_by_coord_node40(coord);
++ item_length = length_by_coord_node40(coord);
++
++ /* move item bodies */
++ ih = node40_ih_at_coord(coord);
++ memmove(item_data + item_length + by, item_data + item_length,
++ nh40_get_free_space_start(node40_node_header(coord->node)) -
++ (ih40_get_offset(ih) + item_length));
++
++ /* update offsets of moved items */
++ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
++ ih = node40_ih_at(coord->node, i);
++ ih40_set_offset(ih, ih40_get_offset(ih) + by);
++ }
++
++ /* update node header */
++ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
++ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
++}
++
++static int should_notify_parent(const znode * node)
++{
++ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
++ return !disk_addr_eq(znode_get_block(node),
++ &znode_get_tree(node)->root_block);
++}
++
++/* plugin->u.node.create_item
++ look for description of this method in plugin/node/node.h */
++int
++create_item_node40(coord_t *target, const reiser4_key *key,
++ reiser4_item_data *data, carry_plugin_info *info)
++{
++ node40_header *nh;
++ item_header40 *ih;
++ unsigned offset;
++ unsigned i;
++
++ nh = node40_node_header(target->node);
++
++ assert("vs-212", coord_is_between_items(target));
++ /* node must have enough free space */
++ assert("vs-254",
++ free_space_node40(target->node) >=
++ data->length + sizeof(item_header40));
++ assert("vs-1410", data->length >= 0);
++
++ if (coord_set_to_right(target))
++ /* there are no items to the right of @target, so the new
++ item will be inserted after the last one */
++ coord_set_item_pos(target, nh40_get_num_items(nh));
++
++ if (target->item_pos < nh40_get_num_items(nh)) {
++ /* there are items to be moved to prepare space for new
++ item */
++ ih = node40_ih_at_coord(target);
++ /* new item will start at this offset */
++ offset = ih40_get_offset(ih);
++
++ memmove(zdata(target->node) + offset + data->length,
++ zdata(target->node) + offset,
++ nh40_get_free_space_start(nh) - offset);
++ /* update headers of moved items */
++ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
++ ih = node40_ih_at(target->node, i);
++ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
++ }
++
++ /* @ih is set to item header of the last item, move item headers */
++ memmove(ih - 1, ih,
++ sizeof(item_header40) * (nh40_get_num_items(nh) -
++ target->item_pos));
++ } else {
++ /* new item will start at this offset */
++ offset = nh40_get_free_space_start(nh);
++ }
++
++ /* make item header for the new item */
++ ih = node40_ih_at_coord(target);
++ memcpy(&ih->key, key, sizeof(reiser4_key));
++ ih40_set_offset(ih, offset);
++ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
++
++ /* update node header */
++ nh40_set_free_space(nh,
++ nh40_get_free_space(nh) - data->length -
++ sizeof(item_header40));
++ nh40_set_free_space_start(nh,
++ nh40_get_free_space_start(nh) + data->length);
++ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
++
++ /* FIXME: check how create_item works when between is set to BEFORE_UNIT */
++ target->unit_pos = 0;
++ target->between = AT_UNIT;
++ coord_clear_iplug(target);
++
++ /* initialize item */
++ if (data->iplug->b.init != NULL) {
++ data->iplug->b.init(target, NULL, data);
++ }
++ /* copy item body */
++ if (data->iplug->b.paste != NULL) {
++ data->iplug->b.paste(target, data, info);
++ } else if (data->data != NULL) {
++ if (data->user) {
++ /* AUDIT: should we really not check that the pointer
++ from userspace is valid and the data bytes are
++ available? How would we return -EFAULT of some kind
++ without such a check? */
++ assert("nikita-3038", reiser4_schedulable());
++ /* copy data from user space */
++ __copy_from_user(zdata(target->node) + offset,
++ (const char __user *)data->data,
++ (unsigned)data->length);
++ } else
++ /* copy from kernel space */
++ memcpy(zdata(target->node) + offset, data->data,
++ (unsigned)data->length);
++ }
++
++ if (target->item_pos == 0) {
++ /* left delimiting key has to be updated */
++ prepare_for_update(NULL, target->node, info);
++ }
++
++ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
++ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
++ }
++
++ return 0;
++}
++
++/* plugin->u.node.update_item_key
++ look for description of this method in plugin/node/node.h */
++void
++update_item_key_node40(coord_t * target, const reiser4_key * key,
++ carry_plugin_info * info)
++{
++ item_header40 *ih;
++
++ ih = node40_ih_at_coord(target);
++ memcpy(&ih->key, key, sizeof(reiser4_key));
++
++ if (target->item_pos == 0) {
++ prepare_for_update(NULL, target->node, info);
++ }
++}
++
++/* these bits encode the cut mode */
++#define CMODE_TAIL 1
++#define CMODE_WHOLE 2
++#define CMODE_HEAD 4
++
++struct cut40_info {
++ int mode;
++ pos_in_node_t tail_removed; /* position of item which gets tail removed */
++ pos_in_node_t first_removed; /* position of the leftmost item among items removed completely */
++ pos_in_node_t removed_count; /* number of items removed completely */
++ pos_in_node_t head_removed; /* position of item which gets head removed */
++
++ pos_in_node_t freed_space_start;
++ pos_in_node_t freed_space_end;
++ pos_in_node_t first_moved;
++ pos_in_node_t head_removed_location;
++};
++
++static void init_cinfo(struct cut40_info *cinfo)
++{
++ cinfo->mode = 0;
++ cinfo->tail_removed = MAX_POS_IN_NODE;
++ cinfo->first_removed = MAX_POS_IN_NODE;
++ cinfo->removed_count = MAX_POS_IN_NODE;
++ cinfo->head_removed = MAX_POS_IN_NODE;
++ cinfo->freed_space_start = MAX_POS_IN_NODE;
++ cinfo->freed_space_end = MAX_POS_IN_NODE;
++ cinfo->first_moved = MAX_POS_IN_NODE;
++ cinfo->head_removed_location = MAX_POS_IN_NODE;
++}
++
++/* complete cut_node40/kill_node40 content by removing the gap created by the cut */
++static void compact(znode * node, struct cut40_info *cinfo)
++{
++ node40_header *nh;
++ item_header40 *ih;
++ pos_in_node_t freed;
++ pos_in_node_t pos, nr_items;
++
++ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
++ cinfo->freed_space_end != MAX_POS_IN_NODE
&& ++ cinfo->first_moved != MAX_POS_IN_NODE)); ++ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start); ++ ++ nh = node40_node_header(node); ++ nr_items = nh40_get_num_items(nh); ++ ++ /* remove gap made up by removal */ ++ memmove(zdata(node) + cinfo->freed_space_start, ++ zdata(node) + cinfo->freed_space_end, ++ nh40_get_free_space_start(nh) - cinfo->freed_space_end); ++ ++ /* update item headers of moved items - change their locations */ ++ pos = cinfo->first_moved; ++ ih = node40_ih_at(node, pos); ++ if (cinfo->head_removed_location != MAX_POS_IN_NODE) { ++ assert("vs-1580", pos == cinfo->head_removed); ++ ih40_set_offset(ih, cinfo->head_removed_location); ++ pos++; ++ ih--; ++ } ++ ++ freed = cinfo->freed_space_end - cinfo->freed_space_start; ++ for (; pos < nr_items; pos++, ih--) { ++ assert("vs-1581", ih == node40_ih_at(node, pos)); ++ ih40_set_offset(ih, ih40_get_offset(ih) - freed); ++ } ++ ++ /* free space start moved to right */ ++ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed); ++ ++ if (cinfo->removed_count != MAX_POS_IN_NODE) { ++ /* number of items changed. Remove item headers of those items */ ++ ih = node40_ih_at(node, nr_items - 1); ++ memmove(ih + cinfo->removed_count, ih, ++ sizeof(item_header40) * (nr_items - ++ cinfo->removed_count - ++ cinfo->first_removed)); ++ freed += sizeof(item_header40) * cinfo->removed_count; ++ node40_set_num_items(node, nh, nr_items - cinfo->removed_count); ++ } ++ ++ /* total amount of free space increased */ ++ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed); ++} ++ ++int shrink_item_node40(coord_t * coord, int delta) ++{ ++ node40_header *nh; ++ item_header40 *ih; ++ pos_in_node_t pos; ++ pos_in_node_t nr_items; ++ char *end; ++ znode *node; ++ int off; ++ ++ assert("nikita-3487", coord != NULL); ++ assert("nikita-3488", delta >= 0); ++ ++ node = coord->node; ++ nh = node40_node_header(node); ++ nr_items = nh40_get_num_items(nh); ++ ++ ih = node40_ih_at_coord(coord); ++ assert("nikita-3489", delta <= length_by_coord_node40(coord)); ++ off = ih40_get_offset(ih) + length_by_coord_node40(coord); ++ end = zdata(node) + off; ++ ++ /* remove gap made up by removal */ ++ memmove(end - delta, end, nh40_get_free_space_start(nh) - off); ++ ++ /* update item headers of moved items - change their locations */ ++ pos = coord->item_pos + 1; ++ ih = node40_ih_at(node, pos); ++ for (; pos < nr_items; pos++, ih--) { ++ assert("nikita-3490", ih == node40_ih_at(node, pos)); ++ ih40_set_offset(ih, ih40_get_offset(ih) - delta); ++ } ++ ++ /* free space start moved to left */ ++ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta); ++ /* total amount of free space increased */ ++ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta); ++ /* ++ * This method does _not_ changes number of items. Hence, it cannot ++ * make node empty. Also it doesn't remove items at all, which means ++ * that no keys have to be updated either. ++ */ ++ return 0; ++} ++ ++/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types ++ of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the ++ rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item ++ getting head cut. 
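A simplified model of the classification that parse_cut() (below) performs may help: from the touched item range and whether its boundary items are only partially covered, it derives which of CMODE_TAIL, CMODE_WHOLE and CMODE_HEAD apply. The cut that falls strictly inside a single item, for which parse_cut() returns 1, is special-cased in the patch and not modelled here; cut_mode() and its arguments are illustrative only:

#include <stdio.h>

#define CMODE_TAIL 1 /* leftmost touched item loses its tail */
#define CMODE_WHOLE 2 /* some items are removed completely */
#define CMODE_HEAD 4 /* rightmost touched item loses its head */

static int cut_mode(int from_pos, int to_pos,
                    int first_partial, int last_partial)
{
        int mode = 0;
        int removed = to_pos - from_pos + 1;

        if (first_partial) { /* cut starts inside the first item */
                mode |= CMODE_TAIL;
                removed--;
        }
        if (last_partial) { /* cut ends inside the last item */
                mode |= CMODE_HEAD;
                removed--;
        }
        if (removed > 0) /* everything in between goes away whole */
                mode |= CMODE_WHOLE;
        return mode;
}

int main(void)
{
        /* items 2..5 touched, item 2 partially kept: TAIL|WHOLE == 3 */
        printf("mode = %d\n", cut_mode(2, 5, 1, 0));
        return 0;
}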
Function returns 0 in this case */ ++static int ++parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params) ++{ ++ reiser4_key left_key, right_key; ++ reiser4_key min_from_key, max_to_key; ++ const reiser4_key *from_key, *to_key; ++ ++ init_cinfo(cinfo); ++ ++ /* calculate minimal key stored in first item of items to be cut (params->from) */ ++ item_key_by_coord(params->from, &min_from_key); ++ /* and max key stored in last item of items to be cut (params->to) */ ++ max_item_key_by_coord(params->to, &max_to_key); ++ ++ /* if cut key range is not defined in input parameters - define it using cut coord range */ ++ if (params->from_key == NULL) { ++ assert("vs-1513", params->to_key == NULL); ++ unit_key_by_coord(params->from, &left_key); ++ from_key = &left_key; ++ max_unit_key_by_coord(params->to, &right_key); ++ to_key = &right_key; ++ } else { ++ from_key = params->from_key; ++ to_key = params->to_key; ++ } ++ ++ if (params->from->item_pos == params->to->item_pos) { ++ if (keylt(&min_from_key, from_key) ++ && keylt(to_key, &max_to_key)) ++ return 1; ++ ++ if (keygt(from_key, &min_from_key)) { ++ /* tail of item is to be cut cut */ ++ cinfo->tail_removed = params->from->item_pos; ++ cinfo->mode |= CMODE_TAIL; ++ } else if (keylt(to_key, &max_to_key)) { ++ /* head of item is to be cut */ ++ cinfo->head_removed = params->from->item_pos; ++ cinfo->mode |= CMODE_HEAD; ++ } else { ++ /* item is removed completely */ ++ cinfo->first_removed = params->from->item_pos; ++ cinfo->removed_count = 1; ++ cinfo->mode |= CMODE_WHOLE; ++ } ++ } else { ++ cinfo->first_removed = params->from->item_pos + 1; ++ cinfo->removed_count = ++ params->to->item_pos - params->from->item_pos - 1; ++ ++ if (keygt(from_key, &min_from_key)) { ++ /* first item is not cut completely */ ++ cinfo->tail_removed = params->from->item_pos; ++ cinfo->mode |= CMODE_TAIL; ++ } else { ++ cinfo->first_removed--; ++ cinfo->removed_count++; ++ } ++ if (keylt(to_key, &max_to_key)) { ++ /* last item is not cut completely */ ++ cinfo->head_removed = params->to->item_pos; ++ cinfo->mode |= CMODE_HEAD; ++ } else { ++ cinfo->removed_count++; ++ } ++ if (cinfo->removed_count) ++ cinfo->mode |= CMODE_WHOLE; ++ } ++ ++ return 0; ++} ++ ++static void ++call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count, ++ carry_kill_data * kdata) ++{ ++ coord_t coord; ++ item_plugin *iplug; ++ pos_in_node_t pos; ++ ++ coord.node = node; ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ for (pos = 0; pos < count; pos++) { ++ coord_set_item_pos(&coord, from + pos); ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ iplug = item_plugin_by_coord(&coord); ++ if (iplug->b.kill_hook) { ++ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord), ++ kdata); ++ } ++ } ++} ++ ++/* this is used to kill item partially */ ++static pos_in_node_t ++kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, ++ reiser4_key * smallest_removed, reiser4_key * new_first_key) ++{ ++ struct carry_kill_data *kdata; ++ item_plugin *iplug; ++ ++ kdata = data; ++ iplug = item_plugin_by_coord(coord); ++ ++ assert("vs-1524", iplug->b.kill_units); ++ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed, ++ new_first_key); ++} ++ ++/* call item plugin to cut tail of file */ ++static pos_in_node_t ++kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) ++{ ++ struct carry_kill_data *kdata; ++ pos_in_node_t to; ++ ++ kdata = data; ++ to = coord_last_unit_pos(coord); ++ return kill_units(coord, 
coord->unit_pos, to, kdata, smallest_removed, ++ NULL); ++} ++ ++/* call item plugin to cut head of item */ ++static pos_in_node_t ++kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed, ++ reiser4_key * new_first_key) ++{ ++ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed, ++ new_first_key); ++} ++ ++/* this is used to cut item partially */ ++static pos_in_node_t ++cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, ++ reiser4_key * smallest_removed, reiser4_key * new_first_key) ++{ ++ carry_cut_data *cdata; ++ item_plugin *iplug; ++ ++ cdata = data; ++ iplug = item_plugin_by_coord(coord); ++ assert("vs-302", iplug->b.cut_units); ++ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed, ++ new_first_key); ++} ++ ++/* call item plugin to cut tail of file */ ++static pos_in_node_t ++cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) ++{ ++ carry_cut_data *cdata; ++ pos_in_node_t to; ++ ++ cdata = data; ++ to = coord_last_unit_pos(cdata->params.from); ++ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL); ++} ++ ++/* call item plugin to cut head of item */ ++static pos_in_node_t ++cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed, ++ reiser4_key * new_first_key) ++{ ++ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed, ++ new_first_key); ++} ++ ++/* this returns 1 of key of first item changed, 0 - if it did not */ ++static int ++prepare_for_compact(struct cut40_info *cinfo, ++ const struct cut_kill_params *params, int is_cut, ++ void *data, carry_plugin_info * info) ++{ ++ znode *node; ++ item_header40 *ih; ++ pos_in_node_t freed; ++ pos_in_node_t item_pos; ++ coord_t coord; ++ reiser4_key new_first_key; ++ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t, ++ void *, reiser4_key *, reiser4_key *); ++ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *); ++ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *, ++ reiser4_key *); ++ int retval; ++ ++ retval = 0; ++ ++ node = params->from->node; ++ ++ assert("vs-184", node == params->to->node); ++ assert("vs-312", !node_is_empty(node)); ++ assert("vs-297", ++ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT); ++ ++ if (is_cut) { ++ kill_units_f = cut_units; ++ kill_tail_f = cut_tail; ++ kill_head_f = cut_head; ++ } else { ++ kill_units_f = kill_units; ++ kill_tail_f = kill_tail; ++ kill_head_f = kill_head; ++ } ++ ++ if (parse_cut(cinfo, params) == 1) { ++ /* cut from the middle of item */ ++ freed = ++ kill_units_f(params->from, params->from->unit_pos, ++ params->to->unit_pos, data, ++ params->smallest_removed, NULL); ++ ++ item_pos = params->from->item_pos; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos) - freed; ++ cinfo->freed_space_end = cinfo->freed_space_start + freed; ++ cinfo->first_moved = item_pos + 1; ++ } else { ++ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE || ++ cinfo->first_removed != MAX_POS_IN_NODE || ++ cinfo->head_removed != MAX_POS_IN_NODE)); ++ ++ switch (cinfo->mode) { ++ case CMODE_TAIL: ++ /* one item gets cut partially from its end */ ++ assert("vs-1562", ++ cinfo->tail_removed == params->from->item_pos); ++ ++ freed = ++ kill_tail_f(params->from, data, ++ params->smallest_removed); ++ ++ item_pos = cinfo->tail_removed; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ++ ih40_get_offset(ih) + 
node40_item_length(node, ++ item_pos) - ++ freed; ++ cinfo->freed_space_end = ++ cinfo->freed_space_start + freed; ++ cinfo->first_moved = cinfo->tail_removed + 1; ++ break; ++ ++ case CMODE_WHOLE: ++ /* one or more items get removed completely */ ++ assert("vs-1563", ++ cinfo->first_removed == params->from->item_pos); ++ assert("vs-1564", cinfo->removed_count > 0 ++ && cinfo->removed_count != MAX_POS_IN_NODE); ++ ++ /* call kill hook for all items removed completely */ ++ if (is_cut == 0) ++ call_kill_hooks(node, cinfo->first_removed, ++ cinfo->removed_count, data); ++ ++ item_pos = cinfo->first_removed; ++ ih = node40_ih_at(node, item_pos); ++ ++ if (params->smallest_removed) ++ memcpy(params->smallest_removed, &ih->key, ++ sizeof(reiser4_key)); ++ ++ cinfo->freed_space_start = ih40_get_offset(ih); ++ ++ item_pos += (cinfo->removed_count - 1); ++ ih -= (cinfo->removed_count - 1); ++ cinfo->freed_space_end = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos); ++ cinfo->first_moved = item_pos + 1; ++ if (cinfo->first_removed == 0) ++ /* key of first item of the node changes */ ++ retval = 1; ++ break; ++ ++ case CMODE_HEAD: ++ /* one item gets cut partially from its head */ ++ assert("vs-1565", ++ cinfo->head_removed == params->from->item_pos); ++ ++ freed = ++ kill_head_f(params->to, data, ++ params->smallest_removed, ++ &new_first_key); ++ ++ item_pos = cinfo->head_removed; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ih40_get_offset(ih); ++ cinfo->freed_space_end = ih40_get_offset(ih) + freed; ++ cinfo->first_moved = cinfo->head_removed + 1; ++ ++ /* item head is removed, therefore, item key changed */ ++ coord.node = node; ++ coord_set_item_pos(&coord, item_pos); ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ update_item_key_node40(&coord, &new_first_key, NULL); ++ if (item_pos == 0) ++ /* key of first item of the node changes */ ++ retval = 1; ++ break; ++ ++ case CMODE_TAIL | CMODE_WHOLE: ++ /* one item gets cut from its end and one or more items get removed completely */ ++ assert("vs-1566", ++ cinfo->tail_removed == params->from->item_pos); ++ assert("vs-1567", ++ cinfo->first_removed == cinfo->tail_removed + 1); ++ assert("vs-1564", cinfo->removed_count > 0 ++ && cinfo->removed_count != MAX_POS_IN_NODE); ++ ++ freed = ++ kill_tail_f(params->from, data, ++ params->smallest_removed); ++ ++ item_pos = cinfo->tail_removed; ++ ih = node40_ih_at(node, item_pos); ++ cinfo->freed_space_start = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos) - ++ freed; ++ ++ /* call kill hook for all items removed completely */ ++ if (is_cut == 0) ++ call_kill_hooks(node, cinfo->first_removed, ++ cinfo->removed_count, data); ++ ++ item_pos += cinfo->removed_count; ++ ih -= cinfo->removed_count; ++ cinfo->freed_space_end = ++ ih40_get_offset(ih) + node40_item_length(node, ++ item_pos); ++ cinfo->first_moved = item_pos + 1; ++ break; ++ ++ case CMODE_WHOLE | CMODE_HEAD: ++ /* one or more items get removed completely and one item gets cut partially from its head */ ++ assert("vs-1568", ++ cinfo->first_removed == params->from->item_pos); ++ assert("vs-1564", cinfo->removed_count > 0 ++ && cinfo->removed_count != MAX_POS_IN_NODE); ++ assert("vs-1569", ++ cinfo->head_removed == ++ cinfo->first_removed + cinfo->removed_count); ++ ++ /* call kill hook for all items removed completely */ ++ if (is_cut == 0) ++ call_kill_hooks(node, cinfo->first_removed, ++ cinfo->removed_count, data); ++ ++ item_pos = cinfo->first_removed; ++ ih = node40_ih_at(node, 
item_pos); ++ ++ if (params->smallest_removed) ++ memcpy(params->smallest_removed, &ih->key, ++ sizeof(reiser4_key)); ++ ++ freed = ++ kill_head_f(params->to, data, NULL, &new_first_key); ++ ++ cinfo->freed_space_start = ih40_get_offset(ih); ++ ++ ih = node40_ih_at(node, cinfo->head_removed); ++ /* this is the most complex case. Item which got head removed and items which are to be moved ++ intact change their location differently. */ ++ cinfo->freed_space_end = ih40_get_offset(ih) + freed; ++ cinfo->first_moved = cinfo->head_removed; ++ cinfo->head_removed_location = cinfo->freed_space_start; ++ ++ /* item head is removed, therefore, item key changed */ ++ coord.node = node; ++ coord_set_item_pos(&coord, cinfo->head_removed); ++ coord.unit_pos = 0; ++ coord.between = AT_UNIT; ++ update_item_key_node40(&coord, &new_first_key, NULL); ++ ++ assert("vs-1579", cinfo->first_removed == 0); ++ /* key of first item of the node changes */ ++ retval = 1; ++ break; ++ ++ case CMODE_TAIL | CMODE_HEAD: ++ /* one item gets cut from its end and its neighbor gets cut from its head */ ++ impossible("vs-1576", "this cannot happen currently"); ++ break; ++ ++ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD: ++ impossible("vs-1577", "this cannot happen currently"); ++ break; ++ default: ++ impossible("vs-1578", "unexpected cut mode"); ++ break; ++ } ++ } ++ return retval; ++} ++ ++/* plugin->u.node.kill ++ return value is number of items removed completely */ ++int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info) ++{ ++ znode *node; ++ struct cut40_info cinfo; ++ int first_key_changed; ++ ++ node = kdata->params.from->node; ++ ++ first_key_changed = ++ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata, ++ info); ++ compact(node, &cinfo); ++ ++ if (info) { ++ /* it is not called by node40_shift, so we have to take care ++ of changes on upper levels */ ++ if (node_is_empty(node) ++ && !(kdata->flags & DELETE_RETAIN_EMPTY)) ++ /* all contents of the node are deleted */ ++ prepare_removal_node40(node, info); ++ else if (first_key_changed) { ++ prepare_for_update(NULL, node, info); ++ } ++ } ++ ++ coord_clear_iplug(kdata->params.from); ++ coord_clear_iplug(kdata->params.to); ++ ++ znode_make_dirty(node); ++ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count; ++} ++ ++/* plugin->u.node.cut ++ return value is number of items removed completely */ ++int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info) ++{ ++ znode *node; ++ struct cut40_info cinfo; ++ int first_key_changed; ++ ++ node = cdata->params.from->node; ++ ++ first_key_changed = ++ prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata, ++ info); ++ compact(node, &cinfo); ++ ++ if (info) { ++ /* it is not called by node40_shift, so we have to take care ++ of changes on upper levels */ ++ if (node_is_empty(node)) ++ /* all contents of the node are deleted */ ++ prepare_removal_node40(node, info); ++ else if (first_key_changed) { ++ prepare_for_update(NULL, node, info); ++ } ++ } ++ ++ coord_clear_iplug(cdata->params.from); ++ coord_clear_iplug(cdata->params.to); ++ ++ znode_make_dirty(node); ++ return cinfo.removed_count == MAX_POS_IN_NODE ? 
0 : cinfo.removed_count; ++} ++ ++/* this structure is used by shift method of node40 plugin */ ++struct shift_params { ++ shift_direction pend; /* when @pend == append - we are shifting to ++ left, when @pend == prepend - to right */ ++ coord_t wish_stop; /* when shifting to left this is last unit we ++ want shifted, when shifting to right - this ++ is set to unit we want to start shifting ++ from */ ++ znode *target; ++ int everything; /* it is set to 1 if everything we have to shift is ++ shifted, 0 - otherwise */ ++ ++ /* FIXME-VS: get rid of read_stop */ ++ ++ /* these are set by estimate_shift */ ++ coord_t real_stop; /* this will be set to last unit which will be ++ really shifted */ ++ ++ /* coordinate in source node before operation of unit which becomes ++ first after shift to left of last after shift to right */ ++ union { ++ coord_t future_first; ++ coord_t future_last; ++ } u; ++ ++ unsigned merging_units; /* number of units of first item which have to ++ be merged with last item of target node */ ++ unsigned merging_bytes; /* number of bytes in those units */ ++ ++ unsigned entire; /* items shifted in their entirety */ ++ unsigned entire_bytes; /* number of bytes in those items */ ++ ++ unsigned part_units; /* number of units of partially copied item */ ++ unsigned part_bytes; /* number of bytes in those units */ ++ ++ unsigned shift_bytes; /* total number of bytes in items shifted (item ++ headers not included) */ ++ ++}; ++ ++static int item_creation_overhead(coord_t *item) ++{ ++ return node_plugin_by_coord(item)->item_overhead(item->node, NULL); ++} ++ ++/* how many units are there in @source starting from source->unit_pos ++ but not further than @stop_coord */ ++static int ++wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend) ++{ ++ if (pend == SHIFT_LEFT) { ++ assert("vs-181", source->unit_pos == 0); ++ } else { ++ assert("vs-182", ++ source->unit_pos == coord_last_unit_pos(source)); ++ } ++ ++ if (source->item_pos != stop_coord->item_pos) { ++ /* @source and @stop_coord are different items */ ++ return coord_last_unit_pos(source) + 1; ++ } ++ ++ if (pend == SHIFT_LEFT) { ++ return stop_coord->unit_pos + 1; ++ } else { ++ return source->unit_pos - stop_coord->unit_pos + 1; ++ } ++} ++ ++/* this calculates what can be copied from @shift->wish_stop.node to ++ @shift->target */ ++static void ++estimate_shift(struct shift_params *shift, const reiser4_context * ctx) ++{ ++ unsigned target_free_space, size; ++ pos_in_node_t stop_item; /* item which estimating should not consider */ ++ unsigned want; /* number of units of item we want shifted */ ++ coord_t source; /* item being estimated */ ++ item_plugin *iplug; ++ ++ /* shifting to left/right starts from first/last units of ++ @shift->wish_stop.node */ ++ if (shift->pend == SHIFT_LEFT) { ++ coord_init_first_unit(&source, shift->wish_stop.node); ++ } else { ++ coord_init_last_unit(&source, shift->wish_stop.node); ++ } ++ shift->real_stop = source; ++ ++ /* free space in target node and number of items in source */ ++ target_free_space = znode_free_space(shift->target); ++ ++ shift->everything = 0; ++ if (!node_is_empty(shift->target)) { ++ /* target node is not empty, check for boundary items ++ mergeability */ ++ coord_t to; ++ ++ /* item we try to merge @source with */ ++ if (shift->pend == SHIFT_LEFT) { ++ coord_init_last_unit(&to, shift->target); ++ } else { ++ coord_init_first_unit(&to, shift->target); ++ } ++ ++ if ((shift->pend == SHIFT_LEFT) ? 
are_items_mergeable(&to, ++ &source) : ++ are_items_mergeable(&source, &to)) { ++ /* how many units of @source do we want to merge to ++ item @to */ ++ want = ++ wanted_units(&source, &shift->wish_stop, ++ shift->pend); ++ ++ /* how many units of @source we can merge to item ++ @to */ ++ iplug = item_plugin_by_coord(&source); ++ if (iplug->b.can_shift != NULL) ++ shift->merging_units = ++ iplug->b.can_shift(target_free_space, ++ &source, shift->target, ++ shift->pend, &size, ++ want); ++ else { ++ shift->merging_units = 0; ++ size = 0; ++ } ++ shift->merging_bytes = size; ++ shift->shift_bytes += size; ++ /* update stop coord to be set to last unit of @source ++ we can merge to @target */ ++ if (shift->merging_units) ++ /* at least one unit can be shifted */ ++ shift->real_stop.unit_pos = ++ (shift->merging_units - source.unit_pos - ++ 1) * shift->pend; ++ else { ++ /* nothing can be shifted */ ++ if (shift->pend == SHIFT_LEFT) ++ coord_init_before_first_item(&shift-> ++ real_stop, ++ source. ++ node); ++ else ++ coord_init_after_last_item(&shift-> ++ real_stop, ++ source.node); ++ } ++ assert("nikita-2081", shift->real_stop.unit_pos + 1); ++ ++ if (shift->merging_units != want) { ++ /* we could not copy as many as we want, so ++ there is no reason for estimating any ++ longer */ ++ return; ++ } ++ ++ target_free_space -= size; ++ coord_add_item_pos(&source, shift->pend); ++ } ++ } ++ ++ /* number of the item, no part of which we want to shift */ ++ stop_item = shift->wish_stop.item_pos + shift->pend; ++ ++ /* calculate how many items can be copied into given free ++ space as a whole */ ++ for (; source.item_pos != stop_item; ++ coord_add_item_pos(&source, shift->pend)) { ++ if (shift->pend == SHIFT_RIGHT) ++ source.unit_pos = coord_last_unit_pos(&source); ++ ++ /* how many units of @source do we want to copy */ ++ want = wanted_units(&source, &shift->wish_stop, shift->pend); ++ ++ if (want == coord_last_unit_pos(&source) + 1) { ++ /* we want this item to be copied entirely */ ++ size = ++ item_length_by_coord(&source) + ++ item_creation_overhead(&source); ++ if (size <= target_free_space) { ++ /* item fits into target node as a whole */ ++ target_free_space -= size; ++ shift->shift_bytes += ++ size - item_creation_overhead(&source); ++ shift->entire_bytes += ++ size - item_creation_overhead(&source); ++ shift->entire++; ++ ++ /* update shift->real_stop coord to be set to ++ last unit of @source we can merge to ++ @target */ ++ shift->real_stop = source; ++ if (shift->pend == SHIFT_LEFT) ++ shift->real_stop.unit_pos = ++ coord_last_unit_pos(&shift-> ++ real_stop); ++ else ++ shift->real_stop.unit_pos = 0; ++ continue; ++ } ++ } ++ ++ /* we reach here only for an item which does not fit into ++ target node in its entirety. This item may be either ++ partially shifted, or not shifted at all. We will have to ++ create new item in target node, so decrease amount of free ++ space by an item creation overhead. 
We can reach here also ++ if stop coord is in this item */ ++ if (target_free_space >= ++ (unsigned)item_creation_overhead(&source)) { ++ target_free_space -= item_creation_overhead(&source); ++ iplug = item_plugin_by_coord(&source); ++ if (iplug->b.can_shift) { ++ shift->part_units = iplug->b.can_shift(target_free_space, ++ &source, ++ NULL, /* target */ ++ shift->pend, ++ &size, ++ want); ++ } else { ++ target_free_space = 0; ++ shift->part_units = 0; ++ size = 0; ++ } ++ } else { ++ target_free_space = 0; ++ shift->part_units = 0; ++ size = 0; ++ } ++ shift->part_bytes = size; ++ shift->shift_bytes += size; ++ ++ /* set @shift->real_stop to last unit of @source we can merge ++ to @shift->target */ ++ if (shift->part_units) { ++ shift->real_stop = source; ++ shift->real_stop.unit_pos = ++ (shift->part_units - source.unit_pos - ++ 1) * shift->pend; ++ assert("nikita-2082", shift->real_stop.unit_pos + 1); ++ } ++ ++ if (want != shift->part_units) ++ /* not everything wanted were shifted */ ++ return; ++ break; ++ } ++ ++ shift->everything = 1; ++} ++ ++static void ++copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count, ++ shift_direction dir, unsigned free_space) ++{ ++ item_plugin *iplug; ++ ++ assert("nikita-1463", target != NULL); ++ assert("nikita-1464", source != NULL); ++ assert("nikita-1465", from + count <= coord_num_units(source)); ++ ++ iplug = item_plugin_by_coord(source); ++ assert("nikita-1468", iplug == item_plugin_by_coord(target)); ++ iplug->b.copy_units(target, source, from, count, dir, free_space); ++ ++ if (dir == SHIFT_RIGHT) { ++ /* FIXME-VS: this looks not necessary. update_item_key was ++ called already by copy_units method */ ++ reiser4_key split_key; ++ ++ assert("nikita-1469", target->unit_pos == 0); ++ ++ unit_key_by_coord(target, &split_key); ++ node_plugin_by_coord(target)->update_item_key(target, ++ &split_key, NULL); ++ } ++} ++ ++/* copy part of @shift->real_stop.node starting either from its beginning or ++ from its end and ending at @shift->real_stop to either the end or the ++ beginning of @shift->target */ ++static void copy(struct shift_params *shift) ++{ ++ node40_header *nh; ++ coord_t from; ++ coord_t to; ++ item_header40 *from_ih, *to_ih; ++ int free_space_start; ++ int new_items; ++ unsigned old_items; ++ int old_offset; ++ unsigned i; ++ ++ nh = node40_node_header(shift->target); ++ free_space_start = nh40_get_free_space_start(nh); ++ old_items = nh40_get_num_items(nh); ++ new_items = shift->entire + (shift->part_units ? 1 : 0); ++ assert("vs-185", ++ shift->shift_bytes == ++ shift->merging_bytes + shift->entire_bytes + shift->part_bytes); ++ ++ from = shift->wish_stop; ++ ++ coord_init_first_unit(&to, shift->target); ++ ++ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty, ++ hence to.between is set to EMPTY_NODE above. Looks like we want it ++ to be AT_UNIT. ++ ++ Oh, wonders of ->betweeness... 
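(An editorial sketch of what this note means, restating the two statements around this comment rather than adding anything new: on an empty target,

	coord_init_first_unit(&to, shift->target);  -- leaves to.between == EMPTY_NODE
	to.between = AT_UNIT;                       -- what the copy_units() calls expect

so the assignment just below only forces the coord to look like a real unit position before the plugin methods see it.)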
++ ++ */ ++ to.between = AT_UNIT; ++ ++ if (shift->pend == SHIFT_LEFT) { ++ /* copying to left */ ++ ++ coord_set_item_pos(&from, 0); ++ from_ih = node40_ih_at(from.node, 0); ++ ++ coord_set_item_pos(&to, ++ node40_num_of_items_internal(to.node) - 1); ++ if (shift->merging_units) { ++ /* expand last item, so that plugin methods will see ++ correct data */ ++ free_space_start += shift->merging_bytes; ++ nh40_set_free_space_start(nh, ++ (unsigned)free_space_start); ++ nh40_set_free_space(nh, ++ nh40_get_free_space(nh) - ++ shift->merging_bytes); ++ ++ /* appending last item of @target */ ++ copy_units(&to, &from, 0, /* starting from 0-th unit */ ++ shift->merging_units, SHIFT_LEFT, ++ shift->merging_bytes); ++ coord_inc_item_pos(&from); ++ from_ih--; ++ coord_inc_item_pos(&to); ++ } ++ ++ to_ih = node40_ih_at(shift->target, old_items); ++ if (shift->entire) { ++ /* copy @entire items entirely */ ++ ++ /* copy item headers */ ++ memcpy(to_ih - shift->entire + 1, ++ from_ih - shift->entire + 1, ++ shift->entire * sizeof(item_header40)); ++ /* update item header offset */ ++ old_offset = ih40_get_offset(from_ih); ++ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */ ++ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--) ++ ih40_set_offset(to_ih, ++ ih40_get_offset(from_ih) - ++ old_offset + free_space_start); ++ ++ /* copy item bodies */ ++ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */ ++ shift->entire_bytes); ++ ++ coord_add_item_pos(&from, (int)shift->entire); ++ coord_add_item_pos(&to, (int)shift->entire); ++ } ++ ++ nh40_set_free_space_start(nh, ++ free_space_start + ++ shift->shift_bytes - ++ shift->merging_bytes); ++ nh40_set_free_space(nh, ++ nh40_get_free_space(nh) - ++ (shift->shift_bytes - shift->merging_bytes + ++ sizeof(item_header40) * new_items)); ++ ++ /* update node header */ ++ node40_set_num_items(shift->target, nh, old_items + new_items); ++ assert("vs-170", ++ nh40_get_free_space(nh) < znode_size(shift->target)); ++ ++ if (shift->part_units) { ++ /* copy heading part (@part units) of @source item as ++ a new item into @target->node */ ++ ++ /* copy item header of partially copied item */ ++ coord_set_item_pos(&to, ++ node40_num_of_items_internal(to.node) ++ - 1); ++ memcpy(to_ih, from_ih, sizeof(item_header40)); ++ ih40_set_offset(to_ih, ++ nh40_get_free_space_start(nh) - ++ shift->part_bytes); ++ if (item_plugin_by_coord(&to)->b.init) ++ item_plugin_by_coord(&to)->b.init(&to, &from, ++ NULL); ++ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT, ++ shift->part_bytes); ++ } ++ ++ } else { ++ /* copying to right */ ++ ++ coord_set_item_pos(&from, ++ node40_num_of_items_internal(from.node) - 1); ++ from_ih = node40_ih_at_coord(&from); ++ ++ coord_set_item_pos(&to, 0); ++ ++ /* prepare space for new items */ ++ memmove(zdata(to.node) + sizeof(node40_header) + ++ shift->shift_bytes, ++ zdata(to.node) + sizeof(node40_header), ++ free_space_start - sizeof(node40_header)); ++ /* update item headers of moved items */ ++ to_ih = node40_ih_at(to.node, 0); ++ /* first item gets @merging_bytes longer. 
free space appears ++ at its beginning */ ++ if (!node_is_empty(to.node)) ++ ih40_set_offset(to_ih, ++ ih40_get_offset(to_ih) + ++ shift->shift_bytes - ++ shift->merging_bytes); ++ ++ for (i = 1; i < old_items; i++) ++ ih40_set_offset(to_ih - i, ++ ih40_get_offset(to_ih - i) + ++ shift->shift_bytes); ++ ++ /* move item headers to make space for new items */ ++ memmove(to_ih - old_items + 1 - new_items, ++ to_ih - old_items + 1, ++ sizeof(item_header40) * old_items); ++ to_ih -= (new_items - 1); ++ ++ nh40_set_free_space_start(nh, ++ free_space_start + ++ shift->shift_bytes); ++ nh40_set_free_space(nh, ++ nh40_get_free_space(nh) - ++ (shift->shift_bytes + ++ sizeof(item_header40) * new_items)); ++ ++ /* update node header */ ++ node40_set_num_items(shift->target, nh, old_items + new_items); ++ assert("vs-170", ++ nh40_get_free_space(nh) < znode_size(shift->target)); ++ ++ if (shift->merging_units) { ++ coord_add_item_pos(&to, new_items); ++ to.unit_pos = 0; ++ to.between = AT_UNIT; ++ /* prepend first item of @to */ ++ copy_units(&to, &from, ++ coord_last_unit_pos(&from) - ++ shift->merging_units + 1, ++ shift->merging_units, SHIFT_RIGHT, ++ shift->merging_bytes); ++ coord_dec_item_pos(&from); ++ from_ih++; ++ } ++ ++ if (shift->entire) { ++ /* copy @entire items entirely */ ++ ++ /* copy item headers */ ++ memcpy(to_ih, from_ih, ++ shift->entire * sizeof(item_header40)); ++ ++ /* update item header offset */ ++ old_offset = ++ ih40_get_offset(from_ih + shift->entire - 1); ++ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */ ++ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++) ++ ih40_set_offset(to_ih, ++ ih40_get_offset(from_ih) - ++ old_offset + ++ sizeof(node40_header) + ++ shift->part_bytes); ++ /* copy item bodies */ ++ coord_add_item_pos(&from, -(int)(shift->entire - 1)); ++ memcpy(zdata(to.node) + sizeof(node40_header) + ++ shift->part_bytes, item_by_coord_node40(&from), ++ shift->entire_bytes); ++ coord_dec_item_pos(&from); ++ } ++ ++ if (shift->part_units) { ++ coord_set_item_pos(&to, 0); ++ to.unit_pos = 0; ++ to.between = AT_UNIT; ++ /* copy heading part (@part units) of @source item as ++ a new item into @target->node */ ++ ++ /* copy item header of partially copied item */ ++ memcpy(to_ih, from_ih, sizeof(item_header40)); ++ ih40_set_offset(to_ih, sizeof(node40_header)); ++ if (item_plugin_by_coord(&to)->b.init) ++ item_plugin_by_coord(&to)->b.init(&to, &from, ++ NULL); ++ copy_units(&to, &from, ++ coord_last_unit_pos(&from) - ++ shift->part_units + 1, shift->part_units, ++ SHIFT_RIGHT, shift->part_bytes); ++ } ++ } ++} ++ ++/* remove everything either before or after @fact_stop. 
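(An editorial restatement of the two branches below, using the fields set up by estimate_shift(): for SHIFT_LEFT the cut range is

	from = first unit of the node, to = shift->real_stop

and for SHIFT_RIGHT it is

	from = shift->real_stop, to = last unit of the node;

cut_node40() is then invoked with info == NULL, i.e. without posting carry operations, since shift_node40() takes care of the upper levels itself.)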
Number of items ++ removed completely is returned */ ++static int delete_copied(struct shift_params *shift) ++{ ++ coord_t from; ++ coord_t to; ++ struct carry_cut_data cdata; ++ ++ if (shift->pend == SHIFT_LEFT) { ++ /* we were shifting to left, remove everything from the ++ beginning of @shift->wish_stop->node upto ++ @shift->wish_stop */ ++ coord_init_first_unit(&from, shift->real_stop.node); ++ to = shift->real_stop; ++ ++ /* store old coordinate of unit which will be first after ++ shift to left */ ++ shift->u.future_first = to; ++ coord_next_unit(&shift->u.future_first); ++ } else { ++ /* we were shifting to right, remove everything from ++ @shift->stop_coord upto to end of ++ @shift->stop_coord->node */ ++ from = shift->real_stop; ++ coord_init_last_unit(&to, from.node); ++ ++ /* store old coordinate of unit which will be last after ++ shift to right */ ++ shift->u.future_last = from; ++ coord_prev_unit(&shift->u.future_last); ++ } ++ ++ cdata.params.from = &from; ++ cdata.params.to = &to; ++ cdata.params.from_key = NULL; ++ cdata.params.to_key = NULL; ++ cdata.params.smallest_removed = NULL; ++ return cut_node40(&cdata, NULL); ++} ++ ++/* something was moved between @left and @right. Add carry operation to @info ++ list to have carry to update delimiting key between them */ ++static int ++prepare_for_update(znode * left, znode * right, carry_plugin_info * info) ++{ ++ carry_op *op; ++ carry_node *cn; ++ ++ if (info == NULL) ++ /* nowhere to send operation to. */ ++ return 0; ++ ++ if (!should_notify_parent(right)) ++ return 0; ++ ++ op = node_post_carry(info, COP_UPDATE, right, 1); ++ if (IS_ERR(op) || op == NULL) ++ return op ? PTR_ERR(op) : -EIO; ++ ++ if (left != NULL) { ++ carry_node *reference; ++ ++ if (info->doing) ++ reference = insert_carry_node(info->doing, ++ info->todo, left); ++ else ++ reference = op->node; ++ assert("nikita-2992", reference != NULL); ++ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference); ++ if (IS_ERR(cn)) ++ return PTR_ERR(cn); ++ cn->parent = 1; ++ cn->node = left; ++ if (ZF_ISSET(left, JNODE_ORPHAN)) ++ cn->left_before = 1; ++ op->u.update.left = cn; ++ } else ++ op->u.update.left = NULL; ++ return 0; ++} ++ ++/* plugin->u.node.prepare_removal ++ to delete a pointer to @empty from the tree add corresponding carry ++ operation (delete) to @info list */ ++int prepare_removal_node40(znode * empty, carry_plugin_info * info) ++{ ++ carry_op *op; ++ reiser4_tree *tree; ++ ++ if (!should_notify_parent(empty)) ++ return 0; ++ /* already on a road to Styx */ ++ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE)) ++ return 0; ++ op = node_post_carry(info, COP_DELETE, empty, 1); ++ if (IS_ERR(op) || op == NULL) ++ return RETERR(op ? 
PTR_ERR(op) : -EIO); ++ ++ op->u.delete.child = NULL; ++ op->u.delete.flags = 0; ++ ++ /* fare thee well */ ++ tree = znode_get_tree(empty); ++ read_lock_tree(tree); ++ write_lock_dk(tree); ++ znode_set_ld_key(empty, znode_get_rd_key(empty)); ++ if (znode_is_left_connected(empty) && empty->left) ++ znode_set_rd_key(empty->left, znode_get_rd_key(empty)); ++ write_unlock_dk(tree); ++ read_unlock_tree(tree); ++ ++ ZF_SET(empty, JNODE_HEARD_BANSHEE); ++ return 0; ++} ++ ++/* something were shifted from @insert_coord->node to @shift->target, update ++ @insert_coord correspondingly */ ++static void ++adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed, ++ int including_insert_coord) ++{ ++ /* item plugin was invalidated by shifting */ ++ coord_clear_iplug(insert_coord); ++ ++ if (node_is_empty(shift->wish_stop.node)) { ++ assert("vs-242", shift->everything); ++ if (including_insert_coord) { ++ if (shift->pend == SHIFT_RIGHT) { ++ /* set @insert_coord before first unit of ++ @shift->target node */ ++ coord_init_before_first_item(insert_coord, ++ shift->target); ++ } else { ++ /* set @insert_coord after last in target node */ ++ coord_init_after_last_item(insert_coord, ++ shift->target); ++ } ++ } else { ++ /* set @insert_coord inside of empty node. There is ++ only one possible coord within an empty ++ node. init_first_unit will set that coord */ ++ coord_init_first_unit(insert_coord, ++ shift->wish_stop.node); ++ } ++ return; ++ } ++ ++ if (shift->pend == SHIFT_RIGHT) { ++ /* there was shifting to right */ ++ if (shift->everything) { ++ /* everything wanted was shifted */ ++ if (including_insert_coord) { ++ /* @insert_coord is set before first unit of ++ @to node */ ++ coord_init_before_first_item(insert_coord, ++ shift->target); ++ insert_coord->between = BEFORE_UNIT; ++ } else { ++ /* @insert_coord is set after last unit of ++ @insert->node */ ++ coord_init_last_unit(insert_coord, ++ shift->wish_stop.node); ++ insert_coord->between = AFTER_UNIT; ++ } ++ } ++ return; ++ } ++ ++ /* there was shifting to left */ ++ if (shift->everything) { ++ /* everything wanted was shifted */ ++ if (including_insert_coord) { ++ /* @insert_coord is set after last unit in @to node */ ++ coord_init_after_last_item(insert_coord, shift->target); ++ } else { ++ /* @insert_coord is set before first unit in the same ++ node */ ++ coord_init_before_first_item(insert_coord, ++ shift->wish_stop.node); ++ } ++ return; ++ } ++ ++ /* FIXME-VS: the code below is complicated because with between == ++ AFTER_ITEM unit_pos is set to 0 */ ++ ++ if (!removed) { ++ /* no items were shifted entirely */ ++ assert("vs-195", shift->merging_units == 0 ++ || shift->part_units == 0); ++ ++ if (shift->real_stop.item_pos == insert_coord->item_pos) { ++ if (shift->merging_units) { ++ if (insert_coord->between == AFTER_UNIT) { ++ assert("nikita-1441", ++ insert_coord->unit_pos >= ++ shift->merging_units); ++ insert_coord->unit_pos -= ++ shift->merging_units; ++ } else if (insert_coord->between == BEFORE_UNIT) { ++ assert("nikita-2090", ++ insert_coord->unit_pos > ++ shift->merging_units); ++ insert_coord->unit_pos -= ++ shift->merging_units; ++ } ++ ++ assert("nikita-2083", ++ insert_coord->unit_pos + 1); ++ } else { ++ if (insert_coord->between == AFTER_UNIT) { ++ assert("nikita-1442", ++ insert_coord->unit_pos >= ++ shift->part_units); ++ insert_coord->unit_pos -= ++ shift->part_units; ++ } else if (insert_coord->between == BEFORE_UNIT) { ++ assert("nikita-2089", ++ insert_coord->unit_pos > ++ shift->part_units); ++ 
insert_coord->unit_pos -= ++ shift->part_units; ++ } ++ ++ assert("nikita-2084", ++ insert_coord->unit_pos + 1); ++ } ++ } ++ return; ++ } ++ ++ /* we shifted to left and there was not enough space for everything */ ++ switch (insert_coord->between) { ++ case AFTER_UNIT: ++ case BEFORE_UNIT: ++ if (shift->real_stop.item_pos == insert_coord->item_pos) ++ insert_coord->unit_pos -= shift->part_units; ++ case AFTER_ITEM: ++ coord_add_item_pos(insert_coord, -removed); ++ break; ++ default: ++ impossible("nikita-2087", "not ready"); ++ } ++ assert("nikita-2085", insert_coord->unit_pos + 1); ++} ++ ++static int call_shift_hooks(struct shift_params *shift) ++{ ++ unsigned i, shifted; ++ coord_t coord; ++ item_plugin *iplug; ++ ++ assert("vs-275", !node_is_empty(shift->target)); ++ ++ /* number of items shift touches */ ++ shifted = ++ shift->entire + (shift->merging_units ? 1 : 0) + ++ (shift->part_units ? 1 : 0); ++ ++ if (shift->pend == SHIFT_LEFT) { ++ /* moved items are at the end */ ++ coord_init_last_unit(&coord, shift->target); ++ coord.unit_pos = 0; ++ ++ assert("vs-279", shift->pend == 1); ++ for (i = 0; i < shifted; i++) { ++ unsigned from, count; ++ ++ iplug = item_plugin_by_coord(&coord); ++ if (i == 0 && shift->part_units) { ++ assert("vs-277", ++ coord_num_units(&coord) == ++ shift->part_units); ++ count = shift->part_units; ++ from = 0; ++ } else if (i == shifted - 1 && shift->merging_units) { ++ count = shift->merging_units; ++ from = coord_num_units(&coord) - count; ++ } else { ++ count = coord_num_units(&coord); ++ from = 0; ++ } ++ ++ if (iplug->b.shift_hook) { ++ iplug->b.shift_hook(&coord, from, count, ++ shift->wish_stop.node); ++ } ++ coord_add_item_pos(&coord, -shift->pend); ++ } ++ } else { ++ /* moved items are at the beginning */ ++ coord_init_first_unit(&coord, shift->target); ++ ++ assert("vs-278", shift->pend == -1); ++ for (i = 0; i < shifted; i++) { ++ unsigned from, count; ++ ++ iplug = item_plugin_by_coord(&coord); ++ if (i == 0 && shift->part_units) { ++ assert("vs-277", ++ coord_num_units(&coord) == ++ shift->part_units); ++ count = coord_num_units(&coord); ++ from = 0; ++ } else if (i == shifted - 1 && shift->merging_units) { ++ count = shift->merging_units; ++ from = 0; ++ } else { ++ count = coord_num_units(&coord); ++ from = 0; ++ } ++ ++ if (iplug->b.shift_hook) { ++ iplug->b.shift_hook(&coord, from, count, ++ shift->wish_stop.node); ++ } ++ coord_add_item_pos(&coord, -shift->pend); ++ } ++ } ++ ++ return 0; ++} ++ ++/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */ ++static int ++unit_moved_left(const struct shift_params *shift, const coord_t * old) ++{ ++ assert("vs-944", shift->real_stop.node == old->node); ++ ++ if (shift->real_stop.item_pos < old->item_pos) ++ return 0; ++ if (shift->real_stop.item_pos == old->item_pos) { ++ if (shift->real_stop.unit_pos < old->unit_pos) ++ return 0; ++ } ++ return 1; ++} ++ ++/* shift to right is completed. Return 1 if unit @old was moved to right ++ neighbor */ ++static int ++unit_moved_right(const struct shift_params *shift, const coord_t * old) ++{ ++ assert("vs-944", shift->real_stop.node == old->node); ++ ++ if (shift->real_stop.item_pos > old->item_pos) ++ return 0; ++ if (shift->real_stop.item_pos == old->item_pos) { ++ if (shift->real_stop.unit_pos > old->unit_pos) ++ return 0; ++ } ++ return 1; ++} ++ ++/* coord @old was set in node from which shift was performed. What was shifted ++ is stored in @shift. 
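(Worked illustration of the helpers above, editorial: suppose a completed left shift left @shift->real_stop at item 2, unit 4 of the source node; then

	unit_moved_left(shift, old)

returns 1 exactly when old->item_pos < 2, or old->item_pos == 2 with old->unit_pos <= 4 -- everything up to and including the last really shifted unit now lives in the left neighbor.)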
Update @old correspondingly to performed shift */ ++static coord_t *adjust_coord2(const struct shift_params *shift, ++ const coord_t * old, coord_t * new) ++{ ++ coord_clear_iplug(new); ++ new->between = old->between; ++ ++ coord_clear_iplug(new); ++ if (old->node == shift->target) { ++ if (shift->pend == SHIFT_LEFT) { ++ /* coord which is set inside of left neighbor does not ++ change during shift to left */ ++ coord_dup(new, old); ++ return new; ++ } ++ new->node = old->node; ++ coord_set_item_pos(new, ++ old->item_pos + shift->entire + ++ (shift->part_units ? 1 : 0)); ++ new->unit_pos = old->unit_pos; ++ if (old->item_pos == 0 && shift->merging_units) ++ new->unit_pos += shift->merging_units; ++ return new; ++ } ++ ++ assert("vs-977", old->node == shift->wish_stop.node); ++ if (shift->pend == SHIFT_LEFT) { ++ if (unit_moved_left(shift, old)) { ++ /* unit @old moved to left neighbor. Calculate its ++ coordinate there */ ++ new->node = shift->target; ++ coord_set_item_pos(new, ++ node_num_items(shift->target) - ++ shift->entire - ++ (shift->part_units ? 1 : 0) + ++ old->item_pos); ++ ++ new->unit_pos = old->unit_pos; ++ if (shift->merging_units) { ++ coord_dec_item_pos(new); ++ if (old->item_pos == 0) { ++ /* unit_pos only changes if item got ++ merged */ ++ new->unit_pos = ++ coord_num_units(new) - ++ (shift->merging_units - ++ old->unit_pos); ++ } ++ } ++ } else { ++ /* unit @old did not move to left neighbor. ++ ++ Use _nocheck, because @old is outside of its node. ++ */ ++ coord_dup_nocheck(new, old); ++ coord_add_item_pos(new, ++ -shift->u.future_first.item_pos); ++ if (new->item_pos == 0) ++ new->unit_pos -= shift->u.future_first.unit_pos; ++ } ++ } else { ++ if (unit_moved_right(shift, old)) { ++ /* unit @old moved to right neighbor */ ++ new->node = shift->target; ++ coord_set_item_pos(new, ++ old->item_pos - ++ shift->real_stop.item_pos); ++ if (new->item_pos == 0) { ++ /* unit @old might change unit pos */ ++ coord_set_item_pos(new, ++ old->unit_pos - ++ shift->real_stop.unit_pos); ++ } ++ } else { ++ /* unit @old did not move to right neighbor, therefore ++ it did not change */ ++ coord_dup(new, old); ++ } ++ } ++ coord_set_iplug(new, item_plugin_by_coord(new)); ++ return new; ++} ++ ++/* this is called when shift is completed (something of source node is copied ++ to target and deleted in source) to update all taps set in current ++ context */ ++static void update_taps(const struct shift_params *shift) ++{ ++ tap_t *tap; ++ coord_t new; ++ ++ for_all_taps(tap) { ++ /* update only taps set to nodes participating in shift */ ++ if (tap->coord->node == shift->wish_stop.node ++ || tap->coord->node == shift->target) ++ tap_to_coord(tap, ++ adjust_coord2(shift, tap->coord, &new)); ++ } ++} ++ ++#if REISER4_DEBUG ++ ++struct shift_check { ++ reiser4_key key; ++ __u16 plugin_id; ++ union { ++ __u64 bytes; ++ __u64 entries; ++ void *unused; ++ } u; ++}; ++ ++void *shift_check_prepare(const znode * left, const znode * right) ++{ ++ pos_in_node_t i, nr_items; ++ int mergeable; ++ struct shift_check *data; ++ item_header40 *ih; ++ ++ if (node_is_empty(left) || node_is_empty(right)) ++ mergeable = 0; ++ else { ++ coord_t l, r; ++ ++ coord_init_last_unit(&l, left); ++ coord_init_first_unit(&r, right); ++ mergeable = are_items_mergeable(&l, &r); ++ } ++ nr_items = ++ node40_num_of_items_internal(left) + ++ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); ++ data = ++ kmalloc(sizeof(struct shift_check) * nr_items, ++ reiser4_ctx_gfp_mask_get()); ++ if (data != NULL) { ++ coord_t coord; ++ pos_in_node_t item_pos; ++ ++ coord_init_first_unit(&coord, left); ++ i = 0; ++ ++ for (item_pos = 0; ++ item_pos < node40_num_of_items_internal(left); ++ item_pos++) { ++ ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ data[i].key = ih->key; ++ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ data[i].u.bytes = coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ data[i].u.bytes = ++ reiser4_extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ data[i].u.entries = coord_num_units(&coord); ++ break; ++ default: ++ data[i].u.unused = NULL; ++ break; ++ } ++ i++; ++ } ++ ++ coord_init_first_unit(&coord, right); ++ ++ if (mergeable) { ++ assert("vs-1609", i != 0); ++ ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1589", ++ data[i - 1].plugin_id == ++ le16_to_cpu(get_unaligned(&ih->plugin_id))); ++ switch (data[i - 1].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ data[i - 1].u.bytes += coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ data[i - 1].u.bytes += ++ reiser4_extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ data[i - 1].u.entries += ++ coord_num_units(&coord); ++ break; ++ default: ++ impossible("vs-1605", "wrong mergeable item"); ++ break; ++ } ++ item_pos = 1; ++ } else ++ item_pos = 0; ++ for (; item_pos < node40_num_of_items_internal(right); ++ item_pos++) { ++ ++ assert("vs-1604", i < nr_items); ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ data[i].key = ih->key; ++ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ data[i].u.bytes = coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ data[i].u.bytes = ++ reiser4_extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ data[i].u.entries = coord_num_units(&coord); ++ break; ++ default: ++ data[i].u.unused = NULL; ++ break; ++ } ++ i++; ++ } ++ assert("vs-1606", i == nr_items); ++ } ++ return data; ++} ++ ++void shift_check(void *vp, const znode * left, const znode * right) ++{ ++ pos_in_node_t i, nr_items; ++ coord_t coord; ++ __u64 last_bytes; ++ int mergeable; ++ item_header40 *ih; ++ pos_in_node_t item_pos; ++ struct shift_check *data; ++ ++ data = (struct shift_check *)vp; ++ ++ if (data == NULL) ++ return; ++ ++ if (node_is_empty(left) || node_is_empty(right)) ++ mergeable = 0; ++ else { ++ coord_t l, r; ++ ++ coord_init_last_unit(&l, left); ++ coord_init_first_unit(&r, right); ++ mergeable = are_items_mergeable(&l, &r); ++ } ++ ++ nr_items = ++ node40_num_of_items_internal(left) + ++ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); ++ ++ i = 0; ++ last_bytes = 0; ++ ++ coord_init_first_unit(&coord, left); ++ ++ for (item_pos = 0; item_pos < node40_num_of_items_internal(left); ++ item_pos++) { ++ ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1611", i == item_pos); ++ assert("vs-1590", keyeq(&ih->key, &data[i].key)); ++ assert("vs-1591", ++ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); ++ if ((i < (node40_num_of_items_internal(left) - 1)) ++ || !mergeable) { ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ assert("vs-1592", ++ data[i].u.bytes == ++ coord_num_units(&coord)); ++ break; ++ case EXTENT_POINTER_ID: ++ assert("vs-1593", ++ data[i].u.bytes == ++ reiser4_extent_size(&coord, ++ coord_num_units ++ (&coord))); ++ break; ++ case COMPOUND_DIR_ID: ++ assert("vs-1594", ++ data[i].u.entries == ++ coord_num_units(&coord)); ++ break; ++ default: ++ break; ++ } ++ } ++ if (item_pos == (node40_num_of_items_internal(left) - 1) ++ && mergeable) { ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ last_bytes = coord_num_units(&coord); ++ break; ++ case EXTENT_POINTER_ID: ++ last_bytes = ++ reiser4_extent_size(&coord, ++ coord_num_units(&coord)); ++ break; ++ case COMPOUND_DIR_ID: ++ last_bytes = coord_num_units(&coord); ++ break; ++ default: ++ impossible("vs-1595", "wrong mergeable item"); ++ break; ++ } ++ } ++ i++; ++ } ++ ++ coord_init_first_unit(&coord, right); ++ if (mergeable) { ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1589", ++ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id))); ++ assert("vs-1608", last_bytes != 0); ++ switch (data[i - 1].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ assert("vs-1596", ++ data[i - 1].u.bytes == ++ last_bytes + coord_num_units(&coord)); ++ break; ++ ++ case EXTENT_POINTER_ID: ++ assert("vs-1597", ++ data[i - 1].u.bytes == ++ last_bytes + reiser4_extent_size(&coord, ++ coord_num_units ++ (&coord))); ++ break; ++ ++ case COMPOUND_DIR_ID: ++ assert("vs-1598", ++ data[i - 1].u.bytes == ++ last_bytes + coord_num_units(&coord)); ++ break; ++ default: ++ impossible("vs-1599", "wrong mergeable item"); ++ break; ++ } ++ item_pos = 1; ++ } else ++ item_pos = 0; ++ ++ for (; item_pos < node40_num_of_items_internal(right); item_pos++) { ++ ++ coord_set_item_pos(&coord, item_pos); ++ ih = node40_ih_at_coord(&coord); ++ ++ assert("vs-1612", keyeq(&ih->key, &data[i].key)); ++ assert("vs-1613", ++ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); ++ switch (data[i].plugin_id) { ++ case CTAIL_ID: ++ case FORMATTING_ID: ++ assert("vs-1600", ++ data[i].u.bytes == coord_num_units(&coord)); ++ break; ++ case EXTENT_POINTER_ID: ++ assert("vs-1601", ++ data[i].u.bytes == ++ reiser4_extent_size(&coord, ++ coord_num_units ++ (&coord))); ++ break; ++ case COMPOUND_DIR_ID: ++ assert("vs-1602", ++ data[i].u.entries == coord_num_units(&coord)); ++ break; ++ default: ++ break; ++ } ++ i++; ++ } ++ ++ assert("vs-1603", i == nr_items); ++ kfree(data); ++} ++ ++#endif ++ ++/* plugin->u.node.shift ++ look for description of this method in plugin/node/node.h */ ++int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be ++ deleted from the tree if this is set to 1 */ ++ int including_stop_coord, carry_plugin_info * info) ++{ ++ struct shift_params shift; ++ int result; ++ znode *left, *right; ++ znode *source; ++ int target_empty; ++ ++ assert("nikita-2161", 
coord_check(from)); ++ ++ memset(&shift, 0, sizeof(shift)); ++ shift.pend = pend; ++ shift.wish_stop = *from; ++ shift.target = to; ++ ++ assert("nikita-1473", znode_is_write_locked(from->node)); ++ assert("nikita-1474", znode_is_write_locked(to)); ++ ++ source = from->node; ++ ++ /* set @shift.wish_stop to rightmost/leftmost unit among units we want ++ shifted */ ++ if (pend == SHIFT_LEFT) { ++ result = coord_set_to_left(&shift.wish_stop); ++ left = to; ++ right = from->node; ++ } else { ++ result = coord_set_to_right(&shift.wish_stop); ++ left = from->node; ++ right = to; ++ } ++ ++ if (result) { ++ /* move insertion coord even if there is nothing to move */ ++ if (including_stop_coord) { ++ /* move insertion coord (@from) */ ++ if (pend == SHIFT_LEFT) { ++ /* after last item in target node */ ++ coord_init_after_last_item(from, to); ++ } else { ++ /* before first item in target node */ ++ coord_init_before_first_item(from, to); ++ } ++ } ++ ++ if (delete_child && node_is_empty(shift.wish_stop.node)) ++ result = ++ prepare_removal_node40(shift.wish_stop.node, info); ++ else ++ result = 0; ++ /* there is nothing to shift */ ++ assert("nikita-2078", coord_check(from)); ++ return result; ++ } ++ ++ target_empty = node_is_empty(to); ++ ++ /* when first node plugin with item body compression is implemented, ++ this must be changed to call node specific plugin */ ++ ++ /* shift->stop_coord is updated to last unit which really will be ++ shifted */ ++ estimate_shift(&shift, get_current_context()); ++ if (!shift.shift_bytes) { ++ /* we could not shift anything */ ++ assert("nikita-2079", coord_check(from)); ++ return 0; ++ } ++ ++ copy(&shift); ++ ++ /* result value of this is important. It is used by adjust_coord below */ ++ result = delete_copied(&shift); ++ ++ assert("vs-1610", result >= 0); ++ assert("vs-1471", ++ ((reiser4_context *) current->journal_info)->magic == ++ context_magic); ++ ++ /* item which has been moved from one node to another might want to do ++ something on that event. This can be done by item's shift_hook ++ method, which will be now called for every moved items */ ++ call_shift_hooks(&shift); ++ ++ assert("vs-1472", ++ ((reiser4_context *) current->journal_info)->magic == ++ context_magic); ++ ++ update_taps(&shift); ++ ++ assert("vs-1473", ++ ((reiser4_context *) current->journal_info)->magic == ++ context_magic); ++ ++ /* adjust @from pointer in accordance with @including_stop_coord flag ++ and amount of data which was really shifted */ ++ adjust_coord(from, &shift, result, including_stop_coord); ++ ++ if (target_empty) ++ /* ++ * items were shifted into empty node. Update delimiting key. ++ */ ++ result = prepare_for_update(NULL, left, info); ++ ++ /* add update operation to @info, which is the list of operations to ++ be performed on a higher level */ ++ result = prepare_for_update(left, right, info); ++ if (!result && node_is_empty(source) && delete_child) { ++ /* all contents of @from->node is moved to @to and @from->node ++ has to be removed from the tree, so, on higher level we ++ will be removing the pointer to node @from->node */ ++ result = prepare_removal_node40(source, info); ++ } ++ assert("nikita-2080", coord_check(from)); ++ return result ? 
result : (int)shift.shift_bytes; ++} ++ ++/* plugin->u.node.fast_insert() ++ look for description of this method in plugin/node/node.h */ ++int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) ++{ ++ return 1; ++} ++ ++/* plugin->u.node.fast_paste() ++ look for description of this method in plugin/node/node.h */ ++int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) ++{ ++ return 1; ++} ++ ++/* plugin->u.node.fast_cut() ++ look for description of this method in plugin/node/node.h */ ++int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) ++{ ++ return 1; ++} ++ ++/* plugin->u.node.modify - not defined */ ++ ++/* plugin->u.node.max_item_size */ ++int max_item_size_node40(void) ++{ ++ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) - ++ sizeof(item_header40); ++} ++ ++/* plugin->u.node.set_item_plugin */ ++int set_item_plugin_node40(coord_t *coord, item_id id) ++{ ++ item_header40 *ih; ++ ++ ih = node40_ih_at_coord(coord); ++ put_unaligned(cpu_to_le16(id), &ih->plugin_id); ++ coord->iplugid = id; ++ return 0; ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node40.h linux-2.6.33/fs/reiser4/plugin/node/node40.h +--- linux-2.6.33.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/node/node40.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,125 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#if !defined( __REISER4_NODE40_H__ ) ++#define __REISER4_NODE40_H__ ++ ++#include "../../forward.h" ++#include "../../dformat.h" ++#include "node.h" ++ ++#include <linux/types.h> ++ ++/* format of node header for 40 node layouts. Keep bloat out of this struct. */ ++typedef struct node40_header { ++ /* identifier of node plugin. Must be located at the very beginning ++ of a node. */ ++ common_node_header common_header; /* this is 16 bits */ ++ /* number of items. Should be first element in the node header, ++ because we haven't yet finally decided whether it shouldn't go into ++ common_header. ++ */ ++/* NIKITA-FIXME-HANS: Create a macro such that if there is only one ++ * node format at compile time, and it is this one, accesses do not function dereference when ++ * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */ ++ d16 nr_items; ++ /* free space in node measured in bytes */ ++ d16 free_space; ++ /* offset to start of free space in node */ ++ d16 free_space_start; ++ /* for reiser4_fsck. When information about what is a free ++ block is corrupted, and we try to recover everything even ++ if marked as freed, then old versions of data may ++ duplicate newer versions, and this field allows us to ++ restore the newer version. Also useful for when users ++ who don't have the new trashcan installed on their linux distro ++ delete the wrong files and send us desperate emails ++ offering $25 for them back. */ ++ ++ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */ ++ d32 magic; ++ /* flushstamp is made of mk_id and write_counter. mk_id is an ++ id generated randomly at mkreiserfs time. So we can just ++ skip all nodes with different mk_id. write_counter is d64 ++ incrementing counter of writes on disk. 
It is used for ++ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */ ++ ++ d32 mkfs_id; ++ d64 flush_id; ++ /* node flags to be used by fsck (reiser4ck or reiser4fsck?) ++ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */ ++ d16 flags; ++ ++ /* 1 is leaf level, 2 is twig level, root is the numerically ++ largest level */ ++ d8 level; ++ ++ d8 pad; ++} PACKED node40_header; ++ ++/* item headers are not standard across all node layouts, pass ++ pos_in_node to functions instead */ ++typedef struct item_header40 { ++ /* key of item */ ++ /* 0 */ reiser4_key key; ++ /* offset from start of a node measured in 8-byte chunks */ ++ /* 24 */ d16 offset; ++ /* 26 */ d16 flags; ++ /* 28 */ d16 plugin_id; ++} PACKED item_header40; ++ ++size_t item_overhead_node40(const znode * node, flow_t * aflow); ++size_t free_space_node40(znode * node); ++node_search_result lookup_node40(znode * node, const reiser4_key * key, ++ lookup_bias bias, coord_t * coord); ++int num_of_items_node40(const znode * node); ++char *item_by_coord_node40(const coord_t * coord); ++int length_by_coord_node40(const coord_t * coord); ++item_plugin *plugin_by_coord_node40(const coord_t * coord); ++reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key); ++size_t estimate_node40(znode * node); ++int check_node40(const znode * node, __u32 flags, const char **error); ++int parse_node40(znode * node); ++int init_node40(znode * node); ++#ifdef GUESS_EXISTS ++int guess_node40(const znode * node); ++#endif ++void change_item_size_node40(coord_t * coord, int by); ++int create_item_node40(coord_t * target, const reiser4_key * key, ++ reiser4_item_data * data, carry_plugin_info * info); ++void update_item_key_node40(coord_t * target, const reiser4_key * key, ++ carry_plugin_info * info); ++int kill_node40(struct carry_kill_data *, carry_plugin_info *); ++int cut_node40(struct carry_cut_data *, carry_plugin_info *); ++int shift_node40(coord_t * from, znode * to, shift_direction pend, ++ /* if @from->node becomes ++ empty - it will be deleted from ++ the tree if this is set to 1 ++ */ ++ int delete_child, int including_stop_coord, ++ carry_plugin_info * info); ++ ++int fast_insert_node40(const coord_t * coord); ++int fast_paste_node40(const coord_t * coord); ++int fast_cut_node40(const coord_t * coord); ++int max_item_size_node40(void); ++int prepare_removal_node40(znode * empty, carry_plugin_info * info); ++int set_item_plugin_node40(coord_t * coord, item_id id); ++int shrink_item_node40(coord_t * coord, int delta); ++ ++#if REISER4_DEBUG ++void *shift_check_prepare(const znode *left, const znode *right); ++void shift_check(void *vp, const znode *left, const znode *right); ++#endif ++ ++/* __REISER4_NODE40_H__ */ ++#endif ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node.c linux-2.6.33/fs/reiser4/plugin/node/node.c +--- linux-2.6.33.orig/fs/reiser4/plugin/node/node.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/node/node.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,131 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Node plugin interface. ++ ++ Description: The tree provides the abstraction of flows, which it ++ internally fragments into items which it stores in nodes. 
++ ++ A key_atom is a piece of data bound to a single key. ++ ++ For reasonable space efficiency to be achieved it is often ++ necessary to store key_atoms in the nodes in the form of items, where ++ an item is a sequence of key_atoms of the same or similar type. It is ++ more space-efficient, because the item can implement (very) ++ efficient compression of key_atom's bodies using internal knowledge ++ about their semantics, and it can often avoid having a key for each ++ key_atom. Each type of item has specific operations implemented by its ++ item handler (see balance.c). ++ ++ Rationale: the rest of the code (specifically balancing routines) ++ accesses leaf level nodes through this interface. This way we can ++ implement various block layouts and even combine various layouts ++ within the same tree. Balancing/allocating algorithms should not ++ care about peculiarities of splitting/merging specific item types, ++ but rather should leave that to the item's item handler. ++ ++ Items, including those that provide the abstraction of flows, have ++ the property that if you move them in part or in whole to another ++ node, the balancing code invokes their is_left_mergeable() ++ item_operation to determine if they are mergeable with their new ++ neighbor in the node you have moved them to. For some items the ++ is_left_mergeable() function always returns null. ++ ++ When moving the bodies of items from one node to another: ++ ++ if a partial item is shifted to another node the balancing code invokes ++ an item handler method to handle the item splitting. ++ ++ if the balancing code needs to merge with an item in the node it ++ is shifting to, it will invoke an item handler method to handle ++ the item merging. ++ ++ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy() ++ adjusting the item headers after the move is done using the node handler. ++*/ ++ ++#include "../../forward.h" ++#include "../../debug.h" ++#include "../../key.h" ++#include "../../coord.h" ++#include "../plugin_header.h" ++#include "../item/item.h" ++#include "node.h" ++#include "../plugin.h" ++#include "../../znode.h" ++#include "../../tree.h" ++#include "../../super.h" ++#include "../../reiser4.h" ++ ++/** ++ * leftmost_key_in_node - get the smallest key in node ++ * @node: ++ * @key: store result here ++ * ++ * Stores the leftmost key of @node in @key. 
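 * A minimal usage sketch (editorial; the caller and the locking of @node
 * are assumed here, they are not part of this patch):
 *
 *	reiser4_key key;
 *
 *	leftmost_key_in_node(node, &key);
 *	-- if @node was empty, key now equals *reiser4_max_key()
 *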
++ */ ++reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key) ++{ ++ assert("nikita-1634", node != NULL); ++ assert("nikita-1635", key != NULL); ++ ++ if (!node_is_empty(node)) { ++ coord_t first_item; ++ ++ coord_init_first_unit(&first_item, (znode *) node); ++ item_key_by_coord(&first_item, key); ++ } else ++ *key = *reiser4_max_key(); ++ return key; ++} ++ ++node_plugin node_plugins[LAST_NODE_ID] = { ++ [NODE40_ID] = { ++ .h = { ++ .type_id = REISER4_NODE_PLUGIN_TYPE, ++ .id = NODE40_ID, ++ .pops = NULL, ++ .label = "unified", ++ .desc = "unified node layout", ++ .linkage = {NULL, NULL} ++ }, ++ .item_overhead = item_overhead_node40, ++ .free_space = free_space_node40, ++ .lookup = lookup_node40, ++ .num_of_items = num_of_items_node40, ++ .item_by_coord = item_by_coord_node40, ++ .length_by_coord = length_by_coord_node40, ++ .plugin_by_coord = plugin_by_coord_node40, ++ .key_at = key_at_node40, ++ .estimate = estimate_node40, ++ .check = check_node40, ++ .parse = parse_node40, ++ .init = init_node40, ++#ifdef GUESS_EXISTS ++ .guess = guess_node40, ++#endif ++ .change_item_size = change_item_size_node40, ++ .create_item = create_item_node40, ++ .update_item_key = update_item_key_node40, ++ .cut_and_kill = kill_node40, ++ .cut = cut_node40, ++ .shift = shift_node40, ++ .shrink_item = shrink_item_node40, ++ .fast_insert = fast_insert_node40, ++ .fast_paste = fast_paste_node40, ++ .fast_cut = fast_cut_node40, ++ .max_item_size = max_item_size_node40, ++ .prepare_removal = prepare_removal_node40, ++ .set_item_plugin = set_item_plugin_node40 ++ } ++}; ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/node/node.h linux-2.6.33/fs/reiser4/plugin/node/node.h +--- linux-2.6.33.orig/fs/reiser4/plugin/node/node.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/node/node.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,272 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* We need a definition of the default node layout here. */ ++ ++/* Generally speaking, it is best to have free space in the middle of the ++ node so that two sets of things can grow towards it, and to have the ++ item bodies on the left so that the last one of them grows into free ++ space. We optimize for the case where we append new items to the end ++ of the node, or grow the last item, because it hurts nothing to so ++ optimize and it is a common special case to do massive insertions in ++ increasing key order (and one of cases more likely to have a real user ++ notice the delay time for). ++ ++ formatted leaf default layout: (leaf1) ++ ++ |node header:item bodies:free space:key + pluginid + item offset| ++ ++ We grow towards the middle, optimizing layout for the case where we ++ append new items to the end of the node. The node header is fixed ++ length. Keys, and item offsets plus pluginids for the items ++ corresponding to them are in increasing key order, and are fixed ++ length. Item offsets are relative to start of node (16 bits creating ++ a node size limit of 64k, 12 bits might be a better choice....). Item ++ bodies are in decreasing key order. Item bodies have a variable size. ++ There is a one to one to one mapping of keys to item offsets to item ++ bodies. Item offsets consist of pointers to the zeroth byte of the ++ item body. 
Item length equals the start of the next item minus the ++ start of this item, except the zeroth item whose length equals the end ++ of the node minus the start of that item (plus a byte). In other ++ words, the item length is not recorded anywhere, and it does not need ++ to be since it is computable. ++ ++ Leaf variable length items and keys layout : (lvar) ++ ++ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies| ++ ++ We grow towards the middle, optimizing layout for the case where we ++ append new items to the end of the node. The node header is fixed ++ length. Keys and item offsets for the items corresponding to them are ++ in increasing key order, and keys are variable length. Item offsets ++ are relative to start of node (16 bits). Item bodies are in ++ decreasing key order. Item bodies have a variable size. There is a ++ one to one to one mapping of keys to item offsets to item bodies. ++ Item offsets consist of pointers to the zeroth byte of the item body. ++ Item length equals the start of the next item's key minus the start of ++ this item, except the zeroth item whose length equals the end of the ++ node minus the start of that item (plus a byte). ++ ++ leaf compressed keys layout: (lcomp) ++ ++ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies| ++ ++ We grow towards the middle, optimizing layout for the case where we ++ append new items to the end of the node. The node header is fixed ++ length. Keys and item offsets for the items corresponding to them are ++ in increasing key order, and keys are variable length. The "key ++ inherit" field indicates how much of the key prefix is identical to ++ the previous key (stem compression as described in "Managing ++ Gigabytes" is used). key_inherit is a one byte integer. The ++ intra-node searches performed through this layout are linear searches, ++ and this is theorized to not hurt performance much due to the high ++ cost of processor stalls on modern CPUs, and the small number of keys ++ in a single node. Item offsets are relative to start of node (16 ++ bits). Item bodies are in decreasing key order. Item bodies have a ++ variable size. There is a one to one to one mapping of keys to item ++ offsets to item bodies. Item offsets consist of pointers to the ++ zeroth byte of the item body. Item length equals the start of the ++ next item minus the start of this item, except the zeroth item whose ++ length equals the end of the node minus the start of that item (plus a ++ byte). In other words, item length and key length is not recorded ++ anywhere, and it does not need to be since it is computable. ++ ++ internal node default layout: (idef1) ++ ++ just like ldef1 except that item bodies are either blocknrs of ++ children or extents, and moving them may require updating parent ++ pointers in the nodes that they point to. ++*/ ++ ++/* There is an inherent 3-way tradeoff between optimizing and ++ exchanging disks between different architectures and code ++ complexity. This is optimal and simple and inexchangeable. ++ Someone else can do the code for exchanging disks and make it ++ complex. It would not be that hard. Using other than the PAGE_SIZE ++ might be suboptimal. 
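 An editorial sketch of the "item length is computable" claim, for the
 leaf1 layout described above (hypothetical helper; start[] holds the item
 body offsets, with bodies in decreasing key order so that item 0's body is
 the rightmost one):

	unsigned leaf1_item_length(const unsigned *start, unsigned i,
				   unsigned node_end)
	{
		return i == 0 ? node_end - start[0]
			      : start[i - 1] - start[i];
	}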
++*/ ++ ++#if !defined( __REISER4_NODE_H__ ) ++#define __REISER4_NODE_H__ ++ ++#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE ++ ++#include "../../dformat.h" ++#include "../plugin_header.h" ++ ++#include <linux/types.h> ++ ++typedef enum { ++ NS_FOUND = 0, ++ NS_NOT_FOUND = -ENOENT ++} node_search_result; ++ ++/* Maximal possible space overhead for creation of new item in a node */ ++#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 ) ++ ++typedef enum { ++ REISER4_NODE_DKEYS = (1 << 0), ++ REISER4_NODE_TREE_STABLE = (1 << 1) ++} reiser4_node_check_flag; ++ ++/* cut and cut_and_kill have too long a list of parameters. This structure exists just to save some space on the stack */ ++struct cut_list { ++ coord_t *from; ++ coord_t *to; ++ const reiser4_key *from_key; ++ const reiser4_key *to_key; ++ reiser4_key *smallest_removed; ++ carry_plugin_info *info; ++ __u32 flags; ++ struct inode *inode; /* this is to pass the list of eflushed jnodes down to extent_kill_hook */ ++ lock_handle *left; ++ lock_handle *right; ++}; ++ ++struct carry_cut_data; ++struct carry_kill_data; ++ ++/* The responsibility of the node plugin is to store and give access ++ to the sequence of items within the node. */ ++typedef struct node_plugin { ++ /* generic plugin fields */ ++ plugin_header h; ++ ++ /* calculates the amount of space that will be required to store an ++ item, in addition to the space consumed by the item body. ++ (the space consumed by the item body can be obtained by calling ++ item->estimate) */ ++ size_t(*item_overhead) (const znode * node, flow_t * f); ++ ++ /* returns free space by looking into node (i.e., without using ++ znode->free_space). */ ++ size_t(*free_space) (znode * node); ++ /* search within the node for the one item which might ++ contain the key, invoking item->search_within to search within ++ that item to see if it is in there */ ++ node_search_result(*lookup) (znode * node, const reiser4_key * key, ++ lookup_bias bias, coord_t * coord); ++ /* number of items in node */ ++ int (*num_of_items) (const znode * node); ++ ++ /* store information about item in @coord in @data */ ++ /* break into several node ops, don't add any more uses of this before doing so */ ++ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */ ++ char *(*item_by_coord) (const coord_t * coord); ++ int (*length_by_coord) (const coord_t * coord); ++ item_plugin *(*plugin_by_coord) (const coord_t * coord); ++ ++ /* store item key in @key */ ++ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key); ++ /* conservatively estimate what size of unit can fit ++ into the node. This estimation should be performed without ++ actually looking into the node's content (free space is saved in ++ znode). */ ++ size_t(*estimate) (znode * node); ++ ++ /* performs every consistency check the node plugin author could ++ imagine. Optional. */ ++ int (*check) (const znode * node, __u32 flags, const char **error); ++ ++ /* Called when node is read into memory and node plugin is ++ already detected. This should read some data into znode (like free ++ space counter) and, optionally, check data consistency. ++ */ ++ int (*parse) (znode * node); ++ /* This method is called on a new node to initialise plugin specific ++ data (header, etc.) */ ++ int (*init) (znode * node); ++ /* Check whether @node content conforms to this plugin format. ++ Probably only useful after support for old V3.x formats is added. ++ Uncomment after 4.0 only.
++ */ ++ /* int ( *guess )( const znode *node ); */ ++#if REISER4_DEBUG ++ void (*print) (const char *prefix, const znode * node, __u32 flags); ++#endif ++ /* change size of @item by @by bytes. @item->node has enough free ++ space. When @by > 0 - free space is appended to end of item. When ++ @by < 0 - item is truncated - it is assumed that last @by bytes of ++ the item are freed already */ ++ void (*change_item_size) (coord_t * item, int by); ++ ++ /* create new item @length bytes long in coord @target */ ++ int (*create_item) (coord_t * target, const reiser4_key * key, ++ reiser4_item_data * data, carry_plugin_info * info); ++ ++ /* update key of item. */ ++ void (*update_item_key) (coord_t * target, const reiser4_key * key, ++ carry_plugin_info * info); ++ ++ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *); ++ int (*cut) (struct carry_cut_data *, carry_plugin_info *); ++ ++ /* ++ * shrink item pointed to by @coord by @delta bytes. ++ */ ++ int (*shrink_item) (coord_t * coord, int delta); ++ ++ /* copy as much as possible but not more than up to @stop from ++ @stop->node to @target. If (pend == append) then data from beginning of ++ @stop->node are copied to the end of @target. If (pend == prepend) then ++ data from the end of @stop->node are copied to the beginning of ++ @target. Copied data are removed from @stop->node. Information ++ about what to do on upper level is stored in @todo */ ++ int (*shift) (coord_t * stop, znode * target, shift_direction pend, ++ int delete_node, int including_insert_coord, ++ carry_plugin_info * info); ++ /* return true if this node allows skipping carry() in some situations ++ (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format ++ emulation doesn't. ++ ++ This will speed up insertions that don't require updates to the ++ parent, by bypassing initialisation of carry() structures. It's ++ believed that the majority of insertions will fit there. ++ ++ */ ++ int (*fast_insert) (const coord_t * coord); ++ int (*fast_paste) (const coord_t * coord); ++ int (*fast_cut) (const coord_t * coord); ++ /* this limits the maximal size of an item which can be inserted into a node, and ++ the number of bytes an item in a node may be appended with */ ++ int (*max_item_size) (void); ++ int (*prepare_removal) (znode * empty, carry_plugin_info * info); ++ /* change plugin id of items which are already in a node. Currently it is used in tail conversion for regular ++ * files */ ++ int (*set_item_plugin) (coord_t * coord, item_id); ++} node_plugin; ++ ++typedef enum { ++ /* standard unified node layout used for both leaf and internal ++ nodes */ ++ NODE40_ID, ++ LAST_NODE_ID ++} reiser4_node_id; ++ ++extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key); ++#if REISER4_DEBUG ++extern void print_node_content(const char *prefix, const znode * node, ++ __u32 flags); ++#endif ++ ++extern void indent_znode(const znode * node); ++ ++typedef struct common_node_header { ++ /* ++ * identifier of node plugin. Must be located at the very beginning of ++ * a node.
++ */ ++ __le16 plugin_id; ++} common_node_header; ++ ++/* __REISER4_NODE_H__ */ ++#endif ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * scroll-step: 1 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/object.c linux-2.6.33/fs/reiser4/plugin/object.c +--- linux-2.6.33.orig/fs/reiser4/plugin/object.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/object.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,531 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * Examples of object plugins: file, directory, symlink, special file. ++ * ++ * Plugins associated with inode: ++ * ++ * Plugin of inode is plugin referenced by plugin-id field of on-disk ++ * stat-data. How we store this plugin in in-core inode is not ++ * important. Currently pointers are used, another variant is to store offsets ++ * and do array lookup on each access. ++ * ++ * Now, each inode has one selected plugin: object plugin that ++ * determines what type of file this object is: directory, regular etc. ++ * ++ * This main plugin can use other plugins that are thus subordinated to ++ * it. Directory instance of object plugin uses hash; regular file ++ * instance uses tail policy plugin. ++ * ++ * Object plugin is either taken from id in stat-data or guessed from ++ * i_mode bits. Once it is established we ask it to install its ++ * subordinate plugins, by looking again in stat-data or inheriting them ++ * from parent. ++ * ++ * How new inode is initialized during ->read_inode(): ++ * 1 read stat-data and initialize inode fields: i_size, i_mode, ++ * i_generation, capabilities etc. ++ * 2 read plugin id from stat data or try to guess plugin id ++ * from inode->i_mode bits if plugin id is missing. ++ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields. ++ * ++ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What ++ * if stat data does contain i_size, etc., due to it being an unusual plugin? ++ * ++ * 4 Call ->activate() method of object's plugin. Plugin is either read ++ * from stat-data or guessed from mode bits ++ * 5 Call ->inherit() method of object plugin to inherit as-yet-uninitialized ++ * plugins from parent. ++ * ++ * Easy induction proves that after the last step all plugins of the inode ++ * will be initialized. ++ * ++ * When creating new object: ++ * 1 obtain object plugin id (see next period) ++ * NIKITA-FIXME-HANS: period? ++ * 2 ->install() this plugin ++ * 3 ->inherit() the rest from the parent ++ * ++ * We need some examples of creating an object with default and non-default ++ * plugin ids. Nikita, please create them.
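++ ++ (A hedged sketch, not verified against this API: with the default plugin id, creation would take UNIX_FILE_PLUGIN_ID from the parent at step 1 and ->inherit() the remaining plugins at step 3, after which ->create_object() inserts the stat-data; with a non-default id, the desired id would arrive in reiser4_object_create_data at step 1 and be installed by ->install() at step 2.) ++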
++ */ ++ ++#include "../inode.h" ++ ++static int _bugop(void) ++{ ++ BUG_ON(1); ++ return 0; ++} ++ ++#define bugop ((void *)_bugop) ++ ++static int _dummyop(void) ++{ ++ return 0; ++} ++ ++#define dummyop ((void *)_dummyop) ++ ++static int change_file(struct inode *inode, ++ reiser4_plugin * plugin, ++ pset_member memb) ++{ ++ /* cannot change object plugin of already existing object */ ++ if (memb == PSET_FILE) ++ return RETERR(-EINVAL); ++ ++ /* Change PSET_CREATE */ ++ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin); ++} ++ ++static reiser4_plugin_ops file_plugin_ops = { ++ .change = change_file ++}; ++ ++static struct inode_operations null_i_ops = {.create = NULL}; ++static struct file_operations null_f_ops = {.owner = NULL}; ++static struct address_space_operations null_a_ops = {.writepage = NULL}; ++ ++/* VFS methods for regular files */ ++static struct inode_operations regular_file_i_ops = { ++ .permission = reiser4_permission_common, ++ .setattr = reiser4_setattr, ++ .getattr = reiser4_getattr_common ++}; ++static struct file_operations regular_file_f_ops = { ++ .llseek = generic_file_llseek, ++ .read = reiser4_read_careful, ++ .write = reiser4_write_careful, ++ .aio_read = generic_file_aio_read, ++ .ioctl = reiser4_ioctl_careful, ++ .mmap = reiser4_mmap_careful, ++ .open = reiser4_open_careful, ++ .release = reiser4_release_careful, ++ .fsync = reiser4_sync_file_common, ++ .splice_read = generic_file_splice_read, ++ .splice_write = generic_file_splice_write ++}; ++static struct address_space_operations regular_file_a_ops = { ++ .writepage = reiser4_writepage, ++ .readpage = reiser4_readpage, ++ .sync_page = block_sync_page, ++ .writepages = reiser4_writepages, ++ .set_page_dirty = reiser4_set_page_dirty, ++ .readpages = reiser4_readpages, ++ .write_begin = reiser4_write_begin_careful, ++ .write_end = reiser4_write_end_careful, ++ .bmap = reiser4_bmap_careful, ++ .invalidatepage = reiser4_invalidatepage, ++ .releasepage = reiser4_releasepage ++}; ++ ++/* VFS methods for symlink files */ ++static struct inode_operations symlink_file_i_ops = { ++ .readlink = generic_readlink, ++ .follow_link = reiser4_follow_link_common, ++ .permission = reiser4_permission_common, ++ .setattr = reiser4_setattr_common, ++ .getattr = reiser4_getattr_common ++}; ++ ++/* VFS methods for special files */ ++static struct inode_operations special_file_i_ops = { ++ .permission = reiser4_permission_common, ++ .setattr = reiser4_setattr_common, ++ .getattr = reiser4_getattr_common ++}; ++ ++/* VFS methods for directories */ ++static struct inode_operations directory_i_ops = { ++ .create = reiser4_create_common, ++ .lookup = reiser4_lookup_common, ++ .link = reiser4_link_common, ++ .unlink = reiser4_unlink_common, ++ .symlink = reiser4_symlink_common, ++ .mkdir = reiser4_mkdir_common, ++ .rmdir = reiser4_unlink_common, ++ .mknod = reiser4_mknod_common, ++ .rename = reiser4_rename_common, ++ .permission = reiser4_permission_common, ++ .setattr = reiser4_setattr_common, ++ .getattr = reiser4_getattr_common ++}; ++static struct file_operations directory_f_ops = { ++ .llseek = reiser4_llseek_dir_common, ++ .read = generic_read_dir, ++ .readdir = reiser4_readdir_common, ++ .release = reiser4_release_dir_common, ++ .fsync = reiser4_sync_common ++}; ++static struct address_space_operations directory_a_ops = { ++ .writepage = bugop, ++ .sync_page = bugop, ++ .writepages = dummyop, ++ .set_page_dirty = bugop, ++ .readpages = bugop, ++ .write_begin = bugop, ++ .write_end = bugop, ++ .bmap = bugop, 
++ .invalidatepage = bugop, ++ .releasepage = bugop ++}; ++ ++/* ++ * Definitions of object plugins. ++ */ ++ ++file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = { ++ [UNIX_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = UNIX_FILE_PLUGIN_ID, ++ .groups = (1 << REISER4_REGULAR_FILE), ++ .pops = &file_plugin_ops, ++ .label = "reg", ++ .desc = "regular file", ++ .linkage = {NULL, NULL}, ++ }, ++ /* ++ * invariant vfs ops ++ */ ++ .inode_ops = ®ular_file_i_ops, ++ .file_ops = ®ular_file_f_ops, ++ .as_ops = ®ular_file_a_ops, ++ /* ++ * private i_ops ++ */ ++ .setattr = setattr_unix_file, ++ .open = open_unix_file, ++ .read = read_unix_file, ++ .write = write_unix_file, ++ .ioctl = ioctl_unix_file, ++ .mmap = mmap_unix_file, ++ .release = release_unix_file, ++ /* ++ * private f_ops ++ */ ++ .readpage = readpage_unix_file, ++ .readpages = readpages_unix_file, ++ .writepages = writepages_unix_file, ++ .write_begin = write_begin_unix_file, ++ .write_end = write_end_unix_file, ++ /* ++ * private a_ops ++ */ ++ .bmap = bmap_unix_file, ++ /* ++ * other private methods ++ */ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .flow_by_inode = flow_by_inode_unix_file, ++ .key_by_inode = key_by_inode_and_offset_common, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common, ++ .create_object = reiser4_create_object_common, ++ .delete_object = delete_object_unix_file, ++ .add_link = reiser4_add_link_common, ++ .rem_link = reiser4_rem_link_common, ++ .owns_item = owns_item_unix_file, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_data_unix_file, ++ .cut_tree_worker = cut_tree_worker_common, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ }, ++ [DIRECTORY_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = DIRECTORY_FILE_PLUGIN_ID, ++ .groups = (1 << REISER4_DIRECTORY_FILE), ++ .pops = &file_plugin_ops, ++ .label = "dir", ++ .desc = "directory", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = &null_i_ops, ++ .file_ops = &null_f_ops, ++ .as_ops = &null_a_ops, ++ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .flow_by_inode = bugop, ++ .key_by_inode = bugop, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common_dir, ++ .create_object = reiser4_create_object_common, ++ .delete_object = reiser4_delete_dir_common, ++ .add_link = reiser4_add_link_common, ++ .rem_link = rem_link_common_dir, ++ .owns_item = owns_item_common_dir, ++ .can_add_link = can_add_link_common, ++ .can_rem_link = can_rem_link_common_dir, ++ .detach = reiser4_detach_common_dir, ++ .bind = reiser4_bind_common_dir, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common_dir, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common_dir ++ }, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ }, ++ .init_inode_data = init_inode_ordering, ++ .cut_tree_worker = cut_tree_worker_common, ++ }, ++ [SYMLINK_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = SYMLINK_FILE_PLUGIN_ID, ++ 
.groups = (1 << REISER4_SYMLINK_FILE), ++ .pops = &file_plugin_ops, ++ .label = "symlink", ++ .desc = "symbolic link", ++ .linkage = {NULL,NULL} ++ }, ++ .inode_ops = &symlink_file_i_ops, ++ /* inode->i_fop of symlink is initialized ++ by NULL in setup_inode_ops */ ++ .file_ops = &null_f_ops, ++ .as_ops = &null_a_ops, ++ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common, ++ .create_object = reiser4_create_symlink, ++ .delete_object = reiser4_delete_object_common, ++ .add_link = reiser4_add_link_common, ++ .rem_link = reiser4_rem_link_common, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_ordering, ++ .cut_tree_worker = cut_tree_worker_common, ++ .destroy_inode = destroy_inode_symlink, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ }, ++ [SPECIAL_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = SPECIAL_FILE_PLUGIN_ID, ++ .groups = (1 << REISER4_SPECIAL_FILE), ++ .pops = &file_plugin_ops, ++ .label = "special", ++ .desc = ++ "special: fifo, device or socket", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = &special_file_i_ops, ++ /* file_ops of special files (sockets, block, char, fifo) are ++ initialized by init_special_inode. */ ++ .file_ops = &null_f_ops, ++ .as_ops = &null_a_ops, ++ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_common, ++ .create_object = reiser4_create_object_common, ++ .delete_object = reiser4_delete_object_common, ++ .add_link = reiser4_add_link_common, ++ .rem_link = reiser4_rem_link_common, ++ .owns_item = owns_item_common, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_ordering, ++ .cut_tree_worker = cut_tree_worker_common, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ }, ++ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID, ++ .groups = (1 << REISER4_REGULAR_FILE), ++ .pops = &file_plugin_ops, ++ .label = "cryptcompress", ++ .desc = "cryptcompress file", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = ®ular_file_i_ops, ++ .file_ops = ®ular_file_f_ops, ++ .as_ops = ®ular_file_a_ops, ++ ++ .setattr = setattr_cryptcompress, ++ .open = open_cryptcompress, ++ .read = read_cryptcompress, ++ .write = write_cryptcompress, ++ .ioctl = ioctl_cryptcompress, ++ .mmap = mmap_cryptcompress, ++ .release = release_cryptcompress, ++ ++ .readpage = readpage_cryptcompress, ++ .readpages = readpages_cryptcompress, ++ .writepages = writepages_cryptcompress, ++ .write_begin = write_begin_cryptcompress, ++ .write_end = write_end_cryptcompress, ++ ++ .bmap = bmap_cryptcompress, ++ ++ .write_sd_by_inode = write_sd_by_inode_common, ++ .flow_by_inode = flow_by_inode_cryptcompress, ++ .key_by_inode = 
key_by_inode_cryptcompress, ++ .set_plug_in_inode = set_plug_in_inode_common, ++ .adjust_to_parent = adjust_to_parent_cryptcompress, ++ .create_object = create_object_cryptcompress, ++ .delete_object = delete_object_cryptcompress, ++ .add_link = reiser4_add_link_common, ++ .rem_link = reiser4_rem_link_common, ++ .owns_item = owns_item_common, ++ .can_add_link = can_add_link_common, ++ .detach = dummyop, ++ .bind = dummyop, ++ .safelink = safelink_common, ++ .estimate = { ++ .create = estimate_create_common, ++ .update = estimate_update_common, ++ .unlink = estimate_unlink_common ++ }, ++ .init_inode_data = init_inode_data_cryptcompress, ++ .cut_tree_worker = cut_tree_worker_cryptcompress, ++ .destroy_inode = destroy_inode_cryptcompress, ++ .wire = { ++ .write = wire_write_common, ++ .read = wire_read_common, ++ .get = wire_get_common, ++ .size = wire_size_common, ++ .done = wire_done_common ++ } ++ } ++}; ++ ++static int change_dir(struct inode *inode, ++ reiser4_plugin * plugin, ++ pset_member memb) ++{ ++ /* cannot change dir plugin of already existing object */ ++ return RETERR(-EINVAL); ++} ++ ++static reiser4_plugin_ops dir_plugin_ops = { ++ .change = change_dir ++}; ++ ++/* ++ * definition of directory plugins ++ */ ++ ++dir_plugin dir_plugins[LAST_DIR_ID] = { ++ /* standard hashed directory plugin */ ++ [HASHED_DIR_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_DIR_PLUGIN_TYPE, ++ .id = HASHED_DIR_PLUGIN_ID, ++ .pops = &dir_plugin_ops, ++ .label = "dir", ++ .desc = "hashed directory", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = &directory_i_ops, ++ .file_ops = &directory_f_ops, ++ .as_ops = &directory_a_ops, ++ ++ .get_parent = get_parent_common, ++ .is_name_acceptable = is_name_acceptable_common, ++ .build_entry_key = build_entry_key_hashed, ++ .build_readdir_key = build_readdir_key_common, ++ .add_entry = reiser4_add_entry_common, ++ .rem_entry = reiser4_rem_entry_common, ++ .init = reiser4_dir_init_common, ++ .done = reiser4_dir_done_common, ++ .attach = reiser4_attach_common, ++ .detach = reiser4_detach_common, ++ .estimate = { ++ .add_entry = estimate_add_entry_common, ++ .rem_entry = estimate_rem_entry_common, ++ .unlink = dir_estimate_unlink_common ++ } ++ }, ++ /* hashed directory for which seekdir/telldir are guaranteed to ++ * work. Brain-damage. */ ++ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = { ++ .h = { ++ .type_id = REISER4_DIR_PLUGIN_TYPE, ++ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID, ++ .pops = &dir_plugin_ops, ++ .label = "dir32", ++ .desc = "directory hashed with 31 bit hash", ++ .linkage = {NULL, NULL} ++ }, ++ .inode_ops = &directory_i_ops, ++ .file_ops = &directory_f_ops, ++ .as_ops = &directory_a_ops, ++ ++ .get_parent = get_parent_common, ++ .is_name_acceptable = is_name_acceptable_common, ++ .build_entry_key = build_entry_key_seekable, ++ .build_readdir_key = build_readdir_key_common, ++ .add_entry = reiser4_add_entry_common, ++ .rem_entry = reiser4_rem_entry_common, ++ .init = reiser4_dir_init_common, ++ .done = reiser4_dir_done_common, ++ .attach = reiser4_attach_common, ++ .detach = reiser4_detach_common, ++ .estimate = { ++ .add_entry = estimate_add_entry_common, ++ .rem_entry = estimate_rem_entry_common, ++ .unlink = dir_estimate_unlink_common ++ } ++ } ++}; ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/object.h linux-2.6.33/fs/reiser4/plugin/object.h +--- linux-2.6.33.orig/fs/reiser4/plugin/object.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/object.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,117 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Declaration of object plugin functions. */ ++ ++#if !defined(__FS_REISER4_PLUGIN_OBJECT_H__) ++#define __FS_REISER4_PLUGIN_OBJECT_H__ ++ ++#include "../type_safe_hash.h" ++ ++/* common implementations of inode operations */ ++int reiser4_create_common(struct inode *parent, struct dentry *dentry, ++ int mode, struct nameidata *); ++struct dentry *reiser4_lookup_common(struct inode *parent, ++ struct dentry *dentry, ++ struct nameidata *nameidata); ++int reiser4_link_common(struct dentry *existing, struct inode *parent, ++ struct dentry *newname); ++int reiser4_unlink_common(struct inode *parent, struct dentry *victim); ++int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode); ++int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, ++ const char *linkname); ++int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, ++ int mode, dev_t rdev); ++int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name, ++ struct inode *new_dir, struct dentry *new_name); ++void *reiser4_follow_link_common(struct dentry *, struct nameidata *data); ++int reiser4_permission_common(struct inode *, int mask); ++int reiser4_setattr_common(struct dentry *, struct iattr *); ++int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *, ++ struct kstat *); ++ ++/* common implementations of file operations */ ++loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin); ++int reiser4_readdir_common(struct file *, void *dirent, filldir_t); ++int reiser4_release_dir_common(struct inode *, struct file *); ++int reiser4_sync_common(struct file *, struct dentry *, int datasync); ++ ++ ++/* file plugin operations: common implementations */ ++int write_sd_by_inode_common(struct inode *); ++int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *); ++int set_plug_in_inode_common(struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++int adjust_to_parent_common(struct inode *object, struct inode *parent, ++ struct inode *root); ++int adjust_to_parent_common_dir(struct inode *object, struct inode *parent, ++ struct inode *root); ++int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent, ++ struct inode *root); ++int reiser4_create_object_common(struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++int reiser4_delete_object_common(struct inode *); ++int reiser4_delete_dir_common(struct inode *); ++int reiser4_add_link_common(struct inode *object, struct inode *parent); ++int reiser4_rem_link_common(struct inode *object, struct inode *parent); ++int rem_link_common_dir(struct inode *object, struct inode *parent); ++int owns_item_common(const struct inode *, const coord_t *); ++int owns_item_common_dir(const struct inode *, const coord_t *); ++int can_add_link_common(const struct inode *); ++int can_rem_link_common_dir(const struct inode *); ++int reiser4_detach_common_dir(struct inode *child, struct inode *parent); ++int reiser4_bind_common_dir(struct 
inode *child, struct inode *parent); ++int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value); ++reiser4_block_nr estimate_create_common(const struct inode *); ++reiser4_block_nr estimate_create_common_dir(const struct inode *); ++reiser4_block_nr estimate_update_common(const struct inode *); ++reiser4_block_nr estimate_unlink_common(const struct inode *, ++ const struct inode *); ++reiser4_block_nr estimate_unlink_common_dir(const struct inode *, ++ const struct inode *); ++char *wire_write_common(struct inode *, char *start); ++char *wire_read_common(char *addr, reiser4_object_on_wire *); ++struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *); ++int wire_size_common(struct inode *); ++void wire_done_common(reiser4_object_on_wire *); ++ ++/* dir plugin operations: common implementations */ ++struct dentry *get_parent_common(struct inode *child); ++int is_name_acceptable_common(const struct inode *, const char *name, int len); ++void build_entry_key_common(const struct inode *, ++ const struct qstr *qname, reiser4_key *); ++int build_readdir_key_common(struct file *dir, reiser4_key *); ++int reiser4_add_entry_common(struct inode *object, struct dentry *where, ++ reiser4_object_create_data * , reiser4_dir_entry_desc *); ++int reiser4_rem_entry_common(struct inode *object, struct dentry *where, ++ reiser4_dir_entry_desc *); ++int reiser4_dir_init_common(struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++int reiser4_dir_done_common(struct inode *); ++int reiser4_attach_common(struct inode *child, struct inode *parent); ++int reiser4_detach_common(struct inode *object, struct inode *parent); ++reiser4_block_nr estimate_add_entry_common(const struct inode *); ++reiser4_block_nr estimate_rem_entry_common(const struct inode *); ++reiser4_block_nr dir_estimate_unlink_common(const struct inode *, ++ const struct inode *); ++ ++/* these are essential parts of common implementations, they are to make ++ customized implementations easier */ ++int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to); ++ ++/* merely useful functions */ ++int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle * , ++ const reiser4_key * , int silent); ++ ++/* __FS_REISER4_PLUGIN_OBJECT_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin.c linux-2.6.33/fs/reiser4/plugin/plugin.c +--- linux-2.6.33.orig/fs/reiser4/plugin/plugin.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/plugin.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,560 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Basic plugin infrastructure, lookup etc. */ ++ ++/* PLUGINS: ++ ++ Plugins are internal Reiser4 "modules" or "objects" used to increase ++ extensibility and allow external users to easily adapt reiser4 to ++ their needs. ++ ++ Plugins are classified into several disjoint "types". Plugins ++ belonging to the particular plugin type are termed "instances" of ++ this type. Existing types are listed by enum reiser4_plugin_type ++ (see plugin/plugin_header.h) ++ ++NIKITA-FIXME-HANS: update this list, and review this entire comment for currency ++ ++ Object (file) plugin determines how given file-system object serves ++ standard VFS requests for read, write, seek, mmap etc. 
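++ ++ For example (illustrative only, using definitions from object.c above): file_plugin *fplug = &file_plugins[UNIX_FILE_PLUGIN_ID]; here fplug->h.type_id is REISER4_FILE_PLUGIN_TYPE, fplug->h.label is "reg", and fplug->inode_ops, fplug->file_ops and fplug->as_ops supply the VFS methods just mentioned. ++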
Instances of ++ file plugins are: regular file, directory, symlink. Another example ++ of file plugin is audit plugin, which optionally records accesses to ++ underlying object and forwards requests to it. ++ ++ Hash plugins compute hashes used by reiser4 to store and locate ++ files within directories. Instances of hash plugin type are: r5, ++ tea, rupasov. ++ ++ Tail plugins (or, more precisely, tail policy plugins) determine ++ when the last part of the file should be stored in a formatted item. ++ ++ Scope and lookup: ++ ++ Each plugin type and each plugin has a label such that the pair ++ ( type_label, plugin_label ) is unique. This ++ pair is a globally persistent and user-visible plugin ++ identifier. Internally kernel maintains plugins and plugin types in ++ arrays using an index into those arrays as plugin and plugin type ++ identifiers. File-system, in turn, also maintains persistent ++ "dictionary" which is mapping from plugin label to numerical ++ identifier which is stored in file-system objects. That is, we ++ store the offset into the plugin array for that plugin type as the ++ plugin id in the stat data of the filesystem object. ++ ++ Internal kernel plugin type identifier (index in plugins[] array) is ++ of type reiser4_plugin_type. Set of available plugin types is ++ currently static, but dynamic loading doesn't seem to pose ++ insurmountable problems. ++ ++ Within each type plugins are addressed by the identifiers of type ++ reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]). ++ Such identifiers are only required to be unique within one type, ++ not globally. ++ ++ Thus, plugin in memory is uniquely identified by the pair (type_id, ++ id). ++ ++ Usage: ++ ++ There exists only one instance of each plugin, but this ++ single instance can be associated with many entities (file-system ++ objects, items, nodes, transactions, file-descriptors etc.). Entity ++ to which plugin of given type is attached is termed (due to the lack of ++ imagination) "subject" of this plugin type and, by abuse of ++ terminology, subject of particular instance of this type to which ++ it's attached currently. For example, inode is subject of object ++ plugin type. Inode representing directory is subject of directory ++ plugin, hash plugin type and some particular instance of hash plugin ++ type. Inode representing regular file is subject of "regular file" ++ plugin, tail-policy plugin type etc. ++ ++ With each subject the plugin possibly stores some state. For example, ++ the state of a directory plugin (instance of object plugin type) is pointer ++ to hash plugin (if directories always use hashing that is). ++ ++ Interface: ++ ++ In addition to a scalar identifier, each plugin type and plugin ++ proper has a "label": short string and a "description"---longer ++ descriptive string. Labels and descriptions of plugin types are ++ hard-coded into plugins[] array, declared and defined in ++ plugin.c. Label and description of plugin are stored in .label and ++ .desc fields of reiser4_plugin_header respectively. It's possible to ++ locate plugin by the pair of labels. ++ ++ Features (not implemented): ++ ++ . user-level plugin manipulations: ++ + reiser4("filename/..file_plugin<='audit'"); ++ + write(open("filename/..file_plugin"), "audit", 8); ++ ++ . user level utilities lsplug and chplug to manipulate plugins. ++ Utilities are not of primary priority. Possibly they will not be ++ working on v4.0 ++ ++ NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount ++ option, do you agree?
I don't think that specifying it at mount time, ++ and then changing it with each mount, is a good model for usage. ++ ++ . mount option "plug" to set up plugins of root-directory. ++ "plug=foo:bar" will set "bar" as default plugin of type "foo". ++ ++ Limitations: ++ ++ . each plugin type has to provide at least one builtin ++ plugin. This is a technical limitation and it can be lifted in the ++ future. ++ ++ TODO: ++ ++ New plugin types/plugins: ++ Things we should be able to separately choose to inherit: ++ ++ security plugins ++ ++ stat data ++ ++ file bodies ++ ++ file plugins ++ ++ dir plugins ++ ++ . perm:acl ++ ++ . audi---audit plugin intercepting and possibly logging all ++ accesses to object. Requires putting stub functions in file_operations ++ instead of generic_file_*. ++ ++NIKITA-FIXME-HANS: why make overflows a plugin? ++ . over---handle hash overflows ++ ++ . sqnt---handle different access patterns and instruments read-ahead ++ ++NIKITA-FIXME-HANS: describe the line below in more detail. ++ ++ . hier---handle inheritance of plugins along file-system hierarchy ++ ++ Different kinds of inheritance: on creation vs. on access. ++ Compatible/incompatible plugins. ++ Inheritance for multi-linked files. ++ Layered plugins. ++ Notion of plugin context is abandoned. ++ ++Each file is associated ++ with one plugin and dependent plugins (hash, etc.) are stored as ++ main plugin state. Now, if we have plugins used for regular files ++ but not for directories, how would such plugins be inherited? ++ . always store them with directories also ++ ++NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing ++the line below which is also useful. ++ ++ . use inheritance hierarchy, independent of file-system namespace ++*/ ++ ++#include "../debug.h" ++#include "../dformat.h" ++#include "plugin_header.h" ++#include "item/static_stat.h" ++#include "node/node.h" ++#include "security/perm.h" ++#include "space/space_allocator.h" ++#include "disk_format/disk_format.h" ++#include "plugin.h" ++#include "../reiser4.h" ++#include "../jnode.h" ++#include "../inode.h" ++ ++#include <linux/fs.h> /* for struct super_block */ ++ ++/* ++ * init_plugins - initialize plugin sub-system. ++ * Just call this once on reiser4 startup. ++ * ++ * Initializes plugin sub-system. It is part of reiser4 module ++ * initialization. For each plugin of each type init method is called and each ++ * plugin is put into list of plugins. ++ */ ++int init_plugins(void) ++{ ++ reiser4_plugin_type type_id; ++ ++ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) { ++ struct reiser4_plugin_type_data *ptype; ++ int i; ++ ++ ptype = &plugins[type_id]; ++ assert("nikita-3508", ptype->label != NULL); ++ assert("nikita-3509", ptype->type_id == type_id); ++ ++ INIT_LIST_HEAD(&ptype->plugins_list); ++/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term ++ * builtin.
*/ ++ for (i = 0; i < ptype->builtin_num; ++i) { ++ reiser4_plugin *plugin; ++ ++ plugin = plugin_at(ptype, i); ++ ++ if (plugin->h.label == NULL) ++ /* uninitialized slot encountered */ ++ continue; ++ assert("nikita-3445", plugin->h.type_id == type_id); ++ plugin->h.id = i; ++ if (plugin->h.pops != NULL && ++ plugin->h.pops->init != NULL) { ++ int result; ++ ++ result = plugin->h.pops->init(plugin); ++ if (result != 0) ++ return result; ++ } ++ INIT_LIST_HEAD(&plugin->h.linkage); ++ list_add_tail(&plugin->h.linkage, &ptype->plugins_list); ++ } ++ } ++ return 0; ++} ++ ++/* true if plugin type id is valid */ ++int is_plugin_type_valid(reiser4_plugin_type type) ++{ ++ /* "type" is unsigned, so no comparison with 0 is ++ necessary */ ++ return (type < REISER4_PLUGIN_TYPES); ++} ++ ++/* true if plugin id is valid */ ++int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id) ++{ ++ assert("nikita-1653", is_plugin_type_valid(type)); ++ return id < plugins[type].builtin_num; ++} ++ ++/* return plugin by its @type and @id. ++ ++ Both arguments are checked for validness: this is supposed to be called ++ from user-level. ++ ++NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in ++user space, and passed to the filesystem by use of method files? Your ++comment really confused me on the first reading.... ++ ++*/ ++reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type ++ * unchecked */, ++ reiser4_plugin_id id /* plugin id, ++ * unchecked */) ++{ ++ if (is_plugin_type_valid(type)) { ++ if (is_plugin_id_valid(type, id)) ++ return plugin_at(&plugins[type], id); ++ else ++ /* id out of bounds */ ++ warning("nikita-2913", ++ "Invalid plugin id: [%i:%i]", type, id); ++ } else ++ /* type_id out of bounds */ ++ warning("nikita-2914", "Invalid type_id: %i", type); ++ return NULL; ++} ++ ++/** ++ * save_plugin_id - store plugin id in disk format ++ * @plugin: plugin to convert ++ * @area: where to store result ++ * ++ * Puts id of @plugin in little endian format to address @area. ++ */ ++int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ , ++ d16 * area/* where to store result */) ++{ ++ assert("nikita-1261", plugin != NULL); ++ assert("nikita-1262", area != NULL); ++ ++ put_unaligned(cpu_to_le16(plugin->h.id), area); ++ return 0; ++} ++ ++/* list of all plugins of given type */ ++struct list_head *get_plugin_list(reiser4_plugin_type type) ++{ ++ assert("nikita-1056", is_plugin_type_valid(type)); ++ return &plugins[type].plugins_list; ++} ++ ++static void update_pset_mask(reiser4_inode * info, pset_member memb) ++{ ++ struct dentry *rootdir; ++ reiser4_inode *root; ++ ++ assert("edward-1443", memb != PSET_FILE); ++ ++ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root; ++ if (rootdir != NULL) { ++ root = reiser4_inode_data(rootdir->d_inode); ++ /* ++ * if inode is different from the default one, or we are ++ * changing plugin of root directory, update plugin_mask ++ */ ++ if (aset_get(info->pset, memb) != ++ aset_get(root->pset, memb) || ++ info == root) ++ info->plugin_mask |= (1 << memb); ++ else ++ info->plugin_mask &= ~(1 << memb); ++ } ++} ++ ++/* Get specified plugin set member from parent, ++ or from fs-defaults (if no parent is given) and ++ install the result to pset of @self */ ++int grab_plugin_pset(struct inode *self, ++ struct inode *ancestor, ++ pset_member memb) ++{ ++ reiser4_plugin *plug; ++ reiser4_inode *info; ++ int result = 0; ++ ++ /* Do not grab if initialised already. 
*/ ++ info = reiser4_inode_data(self); ++ if (aset_get(info->pset, memb) != NULL) ++ return 0; ++ if (ancestor) { ++ reiser4_inode *parent; ++ ++ parent = reiser4_inode_data(ancestor); ++ plug = aset_get(parent->hset, memb) ? : ++ aset_get(parent->pset, memb); ++ } else ++ plug = get_default_plugin(memb); ++ ++ result = set_plugin(&info->pset, memb, plug); ++ if (result == 0) { ++ if (!ancestor || self->i_sb->s_root->d_inode != self) ++ update_pset_mask(info, memb); ++ } ++ return result; ++} ++ ++/* Take missing pset members from root inode */ ++int finish_pset(struct inode *inode) ++{ ++ reiser4_plugin *plug; ++ reiser4_inode *root; ++ reiser4_inode *info; ++ pset_member memb; ++ int result = 0; ++ ++ root = reiser4_inode_data(inode->i_sb->s_root->d_inode); ++ info = reiser4_inode_data(inode); ++ ++ assert("edward-1455", root != NULL); ++ assert("edward-1456", info != NULL); ++ ++ /* file and directory plugins are already initialized. */ ++ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) { ++ ++ /* Do not grab if initialised already. */ ++ if (aset_get(info->pset, memb) != NULL) ++ continue; ++ ++ plug = aset_get(root->pset, memb); ++ result = set_plugin(&info->pset, memb, plug); ++ if (result != 0) ++ break; ++ } ++ if (result != 0) { ++ warning("nikita-3447", ++ "Cannot set up plugins for %lli", ++ (unsigned long long) ++ get_inode_oid(inode)); ++ } ++ return result; ++} ++ ++int force_plugin_pset(struct inode *self, pset_member memb, ++ reiser4_plugin * plug) ++{ ++ reiser4_inode *info; ++ int result = 0; ++ ++ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) { ++ /* Changing pset in the root object. */ ++ return RETERR(-EINVAL); ++ } ++ ++ info = reiser4_inode_data(self); ++ if (plug->h.pops != NULL && plug->h.pops->change != NULL) ++ result = plug->h.pops->change(self, plug, memb); ++ else ++ result = aset_set_unsafe(&info->pset, memb, plug); ++ if (result == 0) { ++ __u16 oldmask = info->plugin_mask; ++ ++ update_pset_mask(info, memb); ++ if (oldmask != info->plugin_mask) ++ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN); ++ } ++ return result; ++} ++ ++struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = { ++ /* C90 initializers */ ++ [REISER4_FILE_PLUGIN_TYPE] = { ++ .type_id = REISER4_FILE_PLUGIN_TYPE, ++ .label = "file", ++ .desc = "Object plugins", ++ .builtin_num = sizeof_array(file_plugins), ++ .builtin = file_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(file_plugin) ++ }, ++ [REISER4_DIR_PLUGIN_TYPE] = { ++ .type_id = REISER4_DIR_PLUGIN_TYPE, ++ .label = "dir", ++ .desc = "Directory plugins", ++ .builtin_num = sizeof_array(dir_plugins), ++ .builtin = dir_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(dir_plugin) ++ }, ++ [REISER4_HASH_PLUGIN_TYPE] = { ++ .type_id = REISER4_HASH_PLUGIN_TYPE, ++ .label = "hash", ++ .desc = "Directory hashes", ++ .builtin_num = sizeof_array(hash_plugins), ++ .builtin = hash_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(hash_plugin) ++ }, ++ [REISER4_FIBRATION_PLUGIN_TYPE] = { ++ .type_id = ++ REISER4_FIBRATION_PLUGIN_TYPE, ++ .label = "fibration", ++ .desc = "Directory fibrations", ++ .builtin_num = sizeof_array(fibration_plugins), ++ .builtin = fibration_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(fibration_plugin) ++ }, ++ [REISER4_CIPHER_PLUGIN_TYPE] = { ++ .type_id = REISER4_CIPHER_PLUGIN_TYPE, ++ .label = "cipher", ++ .desc = "Cipher plugins", ++ .builtin_num = sizeof_array(cipher_plugins), ++ .builtin = cipher_plugins, ++ .plugins_list = {NULL, 
NULL}, ++ .size = sizeof(cipher_plugin) ++ }, ++ [REISER4_DIGEST_PLUGIN_TYPE] = { ++ .type_id = REISER4_DIGEST_PLUGIN_TYPE, ++ .label = "digest", ++ .desc = "Digest plugins", ++ .builtin_num = sizeof_array(digest_plugins), ++ .builtin = digest_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(digest_plugin) ++ }, ++ [REISER4_COMPRESSION_PLUGIN_TYPE] = { ++ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .label = "compression", ++ .desc = "Compression plugins", ++ .builtin_num = sizeof_array(compression_plugins), ++ .builtin = compression_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(compression_plugin) ++ }, ++ [REISER4_FORMATTING_PLUGIN_TYPE] = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .label = "formatting", ++ .desc = "Tail inlining policies", ++ .builtin_num = sizeof_array(formatting_plugins), ++ .builtin = formatting_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(formatting_plugin) ++ }, ++ [REISER4_PERM_PLUGIN_TYPE] = { ++ .type_id = REISER4_PERM_PLUGIN_TYPE, ++ .label = "perm", ++ .desc = "Permission checks", ++ .builtin_num = sizeof_array(perm_plugins), ++ .builtin = perm_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(perm_plugin) ++ }, ++ [REISER4_ITEM_PLUGIN_TYPE] = { ++ .type_id = REISER4_ITEM_PLUGIN_TYPE, ++ .label = "item", ++ .desc = "Item handlers", ++ .builtin_num = sizeof_array(item_plugins), ++ .builtin = item_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(item_plugin) ++ }, ++ [REISER4_NODE_PLUGIN_TYPE] = { ++ .type_id = REISER4_NODE_PLUGIN_TYPE, ++ .label = "node", ++ .desc = "node layout handlers", ++ .builtin_num = sizeof_array(node_plugins), ++ .builtin = node_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(node_plugin) ++ }, ++ [REISER4_SD_EXT_PLUGIN_TYPE] = { ++ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, ++ .label = "sd_ext", ++ .desc = "Parts of stat-data", ++ .builtin_num = sizeof_array(sd_ext_plugins), ++ .builtin = sd_ext_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(sd_ext_plugin) ++ }, ++ [REISER4_FORMAT_PLUGIN_TYPE] = { ++ .type_id = REISER4_FORMAT_PLUGIN_TYPE, ++ .label = "disk_layout", ++ .desc = "defines filesystem on disk layout", ++ .builtin_num = sizeof_array(format_plugins), ++ .builtin = format_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(disk_format_plugin) ++ }, ++ [REISER4_JNODE_PLUGIN_TYPE] = { ++ .type_id = REISER4_JNODE_PLUGIN_TYPE, ++ .label = "jnode", ++ .desc = "defines kind of jnode", ++ .builtin_num = sizeof_array(jnode_plugins), ++ .builtin = jnode_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(jnode_plugin) ++ }, ++ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = { ++ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .label = "compression_mode", ++ .desc = "Defines compression mode", ++ .builtin_num = sizeof_array(compression_mode_plugins), ++ .builtin = compression_mode_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(compression_mode_plugin) ++ }, ++ [REISER4_CLUSTER_PLUGIN_TYPE] = { ++ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, ++ .label = "cluster", ++ .desc = "Defines cluster size", ++ .builtin_num = sizeof_array(cluster_plugins), ++ .builtin = cluster_plugins, ++ .plugins_list = {NULL, NULL}, ++ .size = sizeof(cluster_plugin) ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin.h linux-2.6.33/fs/reiser4/plugin/plugin.h +--- 
linux-2.6.33.orig/fs/reiser4/plugin/plugin.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/plugin.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,942 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Basic plugin data-types. ++ see fs/reiser4/plugin/plugin.c for details */ ++ ++#if !defined(__FS_REISER4_PLUGIN_TYPES_H__) ++#define __FS_REISER4_PLUGIN_TYPES_H__ ++ ++#include "../forward.h" ++#include "../debug.h" ++#include "../dformat.h" ++#include "../key.h" ++#include "compress/compress.h" ++#include "crypto/cipher.h" ++#include "plugin_header.h" ++#include "item/static_stat.h" ++#include "item/internal.h" ++#include "item/sde.h" ++#include "item/cde.h" ++#include "item/item.h" ++#include "node/node.h" ++#include "node/node40.h" ++#include "security/perm.h" ++#include "fibration.h" ++ ++#include "space/bitmap.h" ++#include "space/space_allocator.h" ++ ++#include "disk_format/disk_format40.h" ++#include "disk_format/disk_format.h" ++ ++#include <linux/fs.h> /* for struct super_block, address_space */ ++#include <linux/mm.h> /* for struct page */ ++#include <linux/buffer_head.h> /* for struct buffer_head */ ++#include <linux/dcache.h> /* for struct dentry */ ++#include <linux/types.h> ++#include <linux/crypto.h> ++ ++typedef struct reiser4_object_on_wire reiser4_object_on_wire; ++ ++/* ++ * File plugin. Defines the set of methods that file plugins implement, some ++ * of which are optional. ++ * ++ * A file plugin offers to the caller an interface for IO ( writing to and/or ++ * reading from) to what the caller sees as one sequence of bytes. An IO to it ++ * may affect more than one physical sequence of bytes, or no physical sequence ++ * of bytes, it may affect sequences of bytes offered by other file plugins to ++ * the semantic layer, and the file plugin may invoke other plugins and ++ * delegate work to them, but its interface is structured for offering the ++ * caller the ability to read and/or write what the caller sees as being a ++ * single sequence of bytes. ++ * ++ * The file plugin must present a sequence of bytes to the caller, but it does ++ * not necessarily have to store a sequence of bytes, it does not necessarily ++ * have to support efficient tree traversal to any offset in the sequence of ++ * bytes (tail and extent items, whose keys contain offsets, do however provide ++ * efficient non-sequential lookup of any offset in the sequence of bytes). ++ * ++ * Directory plugins provide methods for selecting file plugins by resolving a ++ * name for them. ++ * ++ * The functionality other filesystems call an attribute, and rigidly tie ++ * together, we decompose into orthogonal selectable features of files. Using ++ * the terminology we will define next, an attribute is a perhaps constrained, ++ * perhaps static length, file whose parent has a uni-count-intra-link to it, ++ * which might be grandparent-major-packed, and whose parent has a deletion ++ * method that deletes it. ++ * ++ * File plugins can implement constraints. ++ * ++ * Files can be of variable length (e.g. regular unix files), or of static ++ * length (e.g. static sized attributes). ++ * ++ * An object may have many sequences of bytes, and many file plugins, but, it ++ * has exactly one objectid. It is usually desirable that an object has a ++ * deletion method which deletes every item with that objectid. Items cannot ++ * in general be found by just their objectids. 
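++ ++ (Concretely, in the file_plugins[] table defined in object.c above, this deletion method is the ->delete_object() slot: delete_object_unix_file for regular files, reiser4_delete_dir_common for directories.) ++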
This means that an object must ++ * have either a method built into its deletion plugin method for knowing what ++ * items need to be deleted, or links stored with the object that provide the ++ * plugin with a method for finding those items. Deleting a file within an ++ * object may or may not have the effect of deleting the entire object, ++ * depending on the file plugin's deletion method. ++ * ++ * LINK TAXONOMY: ++ * ++ * Many objects have a reference count, and when the reference count reaches 0 ++ * the object's deletion method is invoked. Some links embody a reference ++ * count increase ("countlinks"), and others do not ("nocountlinks"). ++ * ++ * Some links are bi-directional links ("bilinks"), and some are ++ * uni-directional("unilinks"). ++ * ++ * Some links are between parts of the same object ("intralinks"), and some are ++ * between different objects ("interlinks"). ++ * ++ * PACKING TAXONOMY: ++ * ++ * Some items of an object are stored with a major packing locality based on ++ * their object's objectid (e.g. unix directory items in plan A), and these are ++ * called "self-major-packed". ++ * ++ * Some items of an object are stored with a major packing locality based on ++ * their semantic parent object's objectid (e.g. unix file bodies in plan A), ++ * and these are called "parent-major-packed". ++ * ++ * Some items of an object are stored with a major packing locality based on ++ * their semantic grandparent, and these are called "grandparent-major-packed". ++ * Now carefully notice that we run into trouble with key length if we have to ++ * store a 8 byte major+minor grandparent based packing locality, an 8 byte ++ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in ++ * a 24 byte key. One of these fields must be sacrificed if an item is to be ++ * grandparent-major-packed, and which to sacrifice is left to the item author ++ * choosing to make the item grandparent-major-packed. You cannot make tail ++ * items and extent items grandparent-major-packed, though you could make them ++ * self-major-packed (usually they are parent-major-packed). ++ * ++ * In the case of ACLs (which are composed of fixed length ACEs which consist ++ * of {subject-type, subject, and permission bitmask} triples), it makes sense ++ * to not have an offset field in the ACE item key, and to allow duplicate keys ++ * for ACEs. Thus, the set of ACES for a given file is found by looking for a ++ * key consisting of the objectid of the grandparent (thus grouping all ACLs in ++ * a directory together), the minor packing locality of ACE, the objectid of ++ * the file, and 0. ++ * ++ * IO involves moving data from one location to another, which means that two ++ * locations must be specified, source and destination. ++ * ++ * This source and destination can be in the filesystem, or they can be a ++ * pointer in the user process address space plus a byte count. ++ * ++ * If both source and destination are in the filesystem, then at least one of ++ * them must be representable as a pure stream of bytes (which we call a flow, ++ * and define as a struct containing a key, a data pointer, and a length). ++ * This may mean converting one of them into a flow. We provide a generic ++ * cast_into_flow() method, which will work for any plugin supporting ++ * read_flow(), though it is inefficiently implemented in that it temporarily ++ * stores the flow in a buffer (Question: what to do with huge flows that ++ * cannot fit into memory? Answer: we must not convert them all at once. 
) ++ * ++ * Performing a write requires resolving the write request into a flow defining ++ * the source, and a method that performs the write, and a key that defines ++ * where in the tree the write is to go. ++ * ++ * Performing a read requires resolving the read request into a flow defining ++ * the target, and a method that performs the read, and a key that defines ++ * where in the tree the read is to come from. ++ * ++ * There will exist file plugins which have no pluginid stored on the disk for ++ * them, and which are only invoked by other plugins. ++ */ ++ ++/* This should be incremented with each new contributed ++ pair (plugin type, plugin id). ++ NOTE: Make sure there is a release of reiser4progs ++ with the corresponding version number */ ++#define PLUGIN_LIBRARY_VERSION 0 ++ ++ /* enumeration of fields within plugin_set */ ++typedef enum { ++ PSET_FILE, ++ PSET_DIR, /* PSET_FILE and PSET_DIR should be first ++ * elements: inode.c:read_inode() depends on ++ * this. */ ++ PSET_PERM, ++ PSET_FORMATTING, ++ PSET_HASH, ++ PSET_FIBRATION, ++ PSET_SD, ++ PSET_DIR_ITEM, ++ PSET_CIPHER, ++ PSET_DIGEST, ++ PSET_COMPRESSION, ++ PSET_COMPRESSION_MODE, ++ PSET_CLUSTER, ++ PSET_CREATE, ++ PSET_LAST ++} pset_member; ++ ++/* builtin file-plugins */ ++typedef enum { ++ /* regular file */ ++ UNIX_FILE_PLUGIN_ID, ++ /* directory */ ++ DIRECTORY_FILE_PLUGIN_ID, ++ /* symlink */ ++ SYMLINK_FILE_PLUGIN_ID, ++ /* for objects completely handled by the VFS: fifos, devices, ++ sockets */ ++ SPECIAL_FILE_PLUGIN_ID, ++ /* regular cryptcompress file */ ++ CRYPTCOMPRESS_FILE_PLUGIN_ID, ++ /* number of file plugins. Used as size of arrays to hold ++ file plugins. */ ++ LAST_FILE_PLUGIN_ID ++} reiser4_file_id; ++ ++typedef struct file_plugin { ++ ++ /* generic fields */ ++ plugin_header h; ++ ++ /* VFS methods. ++ * Must be invariant with respect to plugin conversion. ++ * It can be achieved by using "common" methods, which ++ * are the same for all plugins that participate in ++ * conversion, or by using "generic" or "careful" methods, ++ * which provide automatic redirection to proper private ++ * plugin methods ("careful" are the same as "generic", ++ * but with protection of pset and other disk structures ++ * from being rebuilt during conversion.) ++ */ ++ struct inode_operations * inode_ops; ++ struct file_operations * file_ops; ++ struct address_space_operations * as_ops; ++ /** ++ * Private methods. These are optional. If used they will allow you ++ * to minimize the amount of code needed to implement a deviation ++ * from some other method that also uses them. ++ */ ++ /* ++ * private inode_ops ++ */ ++ int (*setattr)(struct dentry *, struct iattr *); ++ /* ++ * private file_ops ++ */ ++ /* do whatever is necessary to do when object is opened */ ++ int (*open) (struct inode *inode, struct file *file); ++ ssize_t (*read) (struct file *, char __user *buf, size_t read_amount, ++ loff_t *off); ++ /* write as many bytes as possible from the nominated @write_amount ++ * before plugin scheduling occurs.
Save scheduling state ++ * in @cont */ ++ ssize_t (*write) (struct file *, const char __user *buf, ++ size_t write_amount, loff_t * off, ++ struct dispatch_context * cont); ++ int (*ioctl) (struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); ++ int (*mmap) (struct file *, struct vm_area_struct *); ++ int (*release) (struct inode *, struct file *); ++ /* ++ * private a_ops ++ */ ++ int (*readpage) (struct file *file, struct page *page); ++ int (*readpages)(struct file *file, struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages); ++ int (*writepages)(struct address_space *mapping, ++ struct writeback_control *wbc); ++ int (*write_begin)(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++ int (*write_end)(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++ sector_t (*bmap) (struct address_space * mapping, sector_t lblock); ++ /* other private methods */ ++ /* save inode cached stat-data onto disk. It was called ++ reiserfs_update_sd() in 3.x */ ++ int (*write_sd_by_inode) (struct inode *); ++ /* ++ * Construct flow into @flow according to user-supplied data. ++ * ++ * This is used by read/write methods to construct a flow to ++ * write/read. ->flow_by_inode() is plugin method, rather than single ++ * global implementation, because key in a flow used by plugin may ++ * depend on data in a @buf. ++ * ++ * NIKITA-FIXME-HANS: please create statistics on what functions are ++ * dereferenced how often for the mongo benchmark. You can supervise ++ * Elena doing this for you if that helps. Email me the list of the ++ * top 10, with their counts, and an estimate of the total number of ++ * CPU cycles spent dereferencing as a percentage of CPU cycles spent ++ * processing (non-idle processing). If the total percent is, say, ++ * less than 1%, it will make our coding discussions much easier, and ++ * keep me from questioning whether functions like the below are too ++ * frequently called to be dereferenced. If the total percent is more ++ * than 1%, perhaps private methods should be listed in a "required" ++ * comment at the top of each plugin (with stern language about how if ++ * the comment is missing it will not be accepted by the maintainer), ++ * and implemented using macros not dereferenced functions. How about ++ * replacing this whole private methods part of the struct with a ++ * thorough documentation of what the standard helper functions are for ++ * use in constructing plugins? I think users have been asking for ++ * that, though not in so many words. ++ */ ++ int (*flow_by_inode) (struct inode *, const char __user *buf, ++ int user, loff_t size, ++ loff_t off, rw_op op, flow_t *); ++ /* ++ * Return the key used to retrieve an offset of a file. It is used by ++ * default implementation of ->flow_by_inode() method ++ * (common_build_flow()) and, among other things, to get to the extent ++ * from jnode of unformatted node. ++ */ ++ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *); ++ ++ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you ++ * think.... */ ++ /* ++ * set the plugin for a file. Called during file creation in creat() ++ * but not reiser4() unless an inode already exists for the file. ++ */ ++ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent, ++ reiser4_object_create_data *); ++ ++ /* NIKITA-FIXME-HANS: comment and name seem to say different things, ++ * are you setting up the object itself also or just adjusting the ++ * parent?.... 
*/ ++ /* set up plugins for new @object created in @parent. @root is root ++ directory. */ ++ int (*adjust_to_parent) (struct inode *object, struct inode *parent, ++ struct inode *root); ++ /* ++ * this does whatever is necessary to do when object is created. For ++ * instance, for unix files stat data is inserted. It is supposed to be ++ * called by create of struct inode_operations. ++ */ ++ int (*create_object) (struct inode *object, struct inode *parent, ++ reiser4_object_create_data *); ++ /* ++ * this method should check REISER4_NO_SD and set REISER4_NO_SD on ++ * success. Deletion of an object usually includes removal of items ++ * building file body (for directories this is removal of "." and "..") ++ * and removal of stat-data item. ++ */ ++ int (*delete_object) (struct inode *); ++ ++ /* add link from @parent to @object */ ++ int (*add_link) (struct inode *object, struct inode *parent); ++ ++ /* remove link from @parent to @object */ ++ int (*rem_link) (struct inode *object, struct inode *parent); ++ ++ /* ++ * return true if item addressed by @coord belongs to @inode. This is ++ * used by read/write to properly slice flow into items in presence of ++ * multiple key assignment policies, because items of a file are not ++ * necessarily contiguous in a key space, for example, in a plan-b. ++ */ ++ int (*owns_item) (const struct inode *, const coord_t *); ++ ++ /* checks whether yet another hard links to this object can be ++ added */ ++ int (*can_add_link) (const struct inode *); ++ ++ /* checks whether hard links to this object can be removed */ ++ int (*can_rem_link) (const struct inode *); ++ ++ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls ++ detach of directory plugin to remove ".." */ ++ int (*detach) (struct inode *child, struct inode *parent); ++ ++ /* called when @child was just looked up in the @parent. It is not ++ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of ++ directory plugin */ ++ int (*bind) (struct inode *child, struct inode *parent); ++ ++ /* process safe-link during mount */ ++ int (*safelink) (struct inode *object, reiser4_safe_link_t link, ++ __u64 value); ++ ++ /* The couple of estimate methods for all file operations */ ++ struct { ++ reiser4_block_nr(*create) (const struct inode *); ++ reiser4_block_nr(*update) (const struct inode *); ++ reiser4_block_nr(*unlink) (const struct inode *, ++ const struct inode *); ++ } estimate; ++ ++ /* ++ * reiser4 specific part of inode has a union of structures which are ++ * specific to a plugin. This method is called when inode is read ++ * (read_inode) and when file is created (common_create_child) so that ++ * file plugin could initialize its inode data ++ */ ++ void (*init_inode_data) (struct inode *, reiser4_object_create_data * , ++ int); ++ ++ /* ++ * This method performs progressive deletion of items and whole nodes ++ * from right to left. ++ * ++ * @tap: the point deletion process begins from, ++ * @from_key: the beginning of the deleted key range, ++ * @to_key: the end of the deleted key range, ++ * @smallest_removed: the smallest removed key, ++ * ++ * @return: 0 if success, error code otherwise, -E_REPEAT means that ++ * long cut_tree operation was interrupted for allowing atom commit . 
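As a stand-alone illustration of the -E_REPEAT contract just described for the cut_tree_worker method declared below (all names here are invented stand-ins, not reiser4 code), the caller-side pattern is a retry loop that lets the atom commit and then calls the worker again:

    #include <stdio.h>

    #define E_REPEAT 1024   /* assumed positive code, returned negated */

    static int passes;

    /* stand-in for a ->cut_tree_worker() style method */
    static int demo_cut_tree_worker(void)
    {
        /* pretend the first two passes are interrupted for an atom commit */
        return ++passes < 3 ? -E_REPEAT : 0;
    }

    int main(void)
    {
        int ret;

        do {
            ret = demo_cut_tree_worker();
            if (ret == -E_REPEAT)
                printf("interrupted: let the atom commit, then retry\n");
        } while (ret == -E_REPEAT);
        printf("finished after %d passes (ret=%d)\n", passes, ret);
        return 0;
    }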
++ */ ++ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key, ++ const reiser4_key * to_key, ++ reiser4_key * smallest_removed, struct inode *, ++ int, int *); ++ ++ /* called from ->destroy_inode() */ ++ void (*destroy_inode) (struct inode *); ++ ++ /* ++ * methods to serialize object identify. This is used, for example, by ++ * reiser4_{en,de}code_fh(). ++ */ ++ struct { ++ /* store object's identity at @area */ ++ char *(*write) (struct inode *inode, char *area); ++ /* parse object from wire to the @obj */ ++ char *(*read) (char *area, reiser4_object_on_wire * obj); ++ /* given object identity in @obj, find or create its dentry */ ++ struct dentry *(*get) (struct super_block *s, ++ reiser4_object_on_wire * obj); ++ /* how many bytes ->wire.write() consumes */ ++ int (*size) (struct inode *inode); ++ /* finish with object identify */ ++ void (*done) (reiser4_object_on_wire * obj); ++ } wire; ++} file_plugin; ++ ++extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; ++ ++struct reiser4_object_on_wire { ++ file_plugin *plugin; ++ union { ++ struct { ++ obj_key_id key_id; ++ } std; ++ void *generic; ++ } u; ++}; ++ ++/* builtin dir-plugins */ ++typedef enum { ++ HASHED_DIR_PLUGIN_ID, ++ SEEKABLE_HASHED_DIR_PLUGIN_ID, ++ LAST_DIR_ID ++} reiser4_dir_id; ++ ++typedef struct dir_plugin { ++ /* generic fields */ ++ plugin_header h; ++ ++ struct inode_operations * inode_ops; ++ struct file_operations * file_ops; ++ struct address_space_operations * as_ops; ++ ++ /* ++ * private methods: These are optional. If used they will allow you to ++ * minimize the amount of code needed to implement a deviation from ++ * some other method that uses them. You could logically argue that ++ * they should be a separate type of plugin. ++ */ ++ ++ struct dentry *(*get_parent) (struct inode *childdir); ++ ++ /* ++ * check whether "name" is acceptable name to be inserted into this ++ * object. Optionally implemented by directory-like objects. Can check ++ * for maximal length, reserved symbols etc ++ */ ++ int (*is_name_acceptable) (const struct inode *inode, const char *name, ++ int len); ++ ++ void (*build_entry_key) (const struct inode *dir /* directory where ++ * entry is (or will ++ * be) in.*/ , ++ const struct qstr *name /* name of file ++ * referenced by this ++ * entry */ , ++ reiser4_key * result /* resulting key of ++ * directory entry */ ); ++ int (*build_readdir_key) (struct file *dir, reiser4_key * result); ++ int (*add_entry) (struct inode *object, struct dentry *where, ++ reiser4_object_create_data * data, ++ reiser4_dir_entry_desc * entry); ++ int (*rem_entry) (struct inode *object, struct dentry *where, ++ reiser4_dir_entry_desc * entry); ++ ++ /* ++ * initialize directory structure for newly created object. For normal ++ * unix directories, insert dot and dotdot. 
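To make the ->is_name_acceptable() hook described above concrete, here is a toy stand-alone version that enforces a maximal length and rejects separator characters and reserved names; the constant and function names are invented for the example and are not reiser4's:

    #include <stdio.h>
    #include <string.h>

    #define DEMO_NAME_MAX 255   /* assumed limit for the example */

    static int demo_is_name_acceptable(const char *name, int len)
    {
        if (len <= 0 || len > DEMO_NAME_MAX)
            return 0;
        /* '/' and NUL cannot appear inside a directory entry name */
        if (memchr(name, '/', len) || memchr(name, '\0', len))
            return 0;
        /* "." and ".." are reserved by the directory plugin itself */
        if ((len == 1 && name[0] == '.') ||
            (len == 2 && name[0] == '.' && name[1] == '.'))
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d %d\n", demo_is_name_acceptable("foo", 3),
               demo_is_name_acceptable("..", 2));   /* prints: 1 0 */
        return 0;
    }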
++ */ ++ int (*init) (struct inode *object, struct inode *parent, ++ reiser4_object_create_data * data); ++ ++ /* destroy directory */ ++ int (*done) (struct inode *child); ++ ++ /* called when @subdir was just looked up in the @dir */ ++ int (*attach) (struct inode *subdir, struct inode *dir); ++ int (*detach) (struct inode *subdir, struct inode *dir); ++ ++ struct { ++ reiser4_block_nr(*add_entry) (const struct inode *); ++ reiser4_block_nr(*rem_entry) (const struct inode *); ++ reiser4_block_nr(*unlink) (const struct inode *, ++ const struct inode *); ++ } estimate; ++} dir_plugin; ++ ++extern dir_plugin dir_plugins[LAST_DIR_ID]; ++ ++typedef struct formatting_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* returns non-zero iff file's tail has to be stored ++ in a direct item. */ ++ int (*have_tail) (const struct inode *inode, loff_t size); ++} formatting_plugin; ++ ++typedef struct hash_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* computes hash of the given name */ ++ __u64(*hash) (const unsigned char *name, int len); ++} hash_plugin; ++ ++typedef struct cipher_plugin { ++ /* generic fields */ ++ plugin_header h; ++ struct crypto_blkcipher * (*alloc) (void); ++ void (*free) (struct crypto_blkcipher *tfm); ++ /* Offset translator. For each offset this returns (k * offset), where ++ k (k >= 1) is an expansion factor of the cipher algorithm. ++ For all symmetric algorithms k == 1. For asymmetric algorithms (which ++ inflate data) offset translation guarantees that all disk cluster's ++ units will have keys smaller then next cluster's one. ++ */ ++ loff_t(*scale) (struct inode *inode, size_t blocksize, loff_t src); ++ /* Cipher algorithms can accept data only by chunks of cipher block ++ size. This method is to align any flow up to cipher block size when ++ we pass it to cipher algorithm. To align means to append padding of ++ special format specific to the cipher algorithm */ ++ int (*align_stream) (__u8 *tail, int clust_size, int blocksize); ++ /* low-level key manager (check, install, etc..) */ ++ int (*setkey) (struct crypto_tfm *tfm, const __u8 *key, ++ unsigned int keylen); ++ /* main text processing procedures */ ++ void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src); ++ void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src); ++} cipher_plugin; ++ ++typedef struct digest_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* fingerprint size in bytes */ ++ int fipsize; ++ struct crypto_hash * (*alloc) (void); ++ void (*free) (struct crypto_hash *tfm); ++} digest_plugin; ++ ++typedef struct compression_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int (*init) (void); ++ /* the maximum number of bytes the size of the "compressed" data can ++ * exceed the uncompressed data. 
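The ->overrun() method documented above (and declared just below) exists so that callers can size the destination buffer for the worst case before calling ->compress(). A stand-alone sketch of that calling convention, with an identity "codec" and invented names standing in for a real algorithm:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int demo_overrun(unsigned src_len)
    {
        (void)src_len;
        return 64;   /* assumed worst-case expansion for the example */
    }

    static void demo_compress(const unsigned char *src, size_t src_len,
                              unsigned char *dst, size_t *dst_len)
    {
        memcpy(dst, src, src_len);   /* identity transform */
        *dst_len = src_len;
    }

    int main(void)
    {
        unsigned char src[1000] = { 0 };
        size_t dst_len;
        /* destination sized for the worst case reported by ->overrun() */
        unsigned char *dst = malloc(sizeof(src) + demo_overrun(sizeof(src)));

        if (dst == NULL)
            return 1;
        demo_compress(src, sizeof(src), dst, &dst_len);
        printf("compressed %zu -> %zu bytes\n", sizeof(src), dst_len);
        free(dst);
        return 0;
    }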
*/ ++ int (*overrun) (unsigned src_len); ++ coa_t(*alloc) (tfm_action act); ++ void (*free) (coa_t coa, tfm_action act); ++ /* minimal size of the flow we still try to compress */ ++ int (*min_size_deflate) (void); ++ __u32(*checksum) (char *data, __u32 length); ++ /* main transform procedures */ ++ void (*compress) (coa_t coa, __u8 *src_first, size_t src_len, ++ __u8 *dst_first, size_t *dst_len); ++ void (*decompress) (coa_t coa, __u8 *src_first, size_t src_len, ++ __u8 *dst_first, size_t *dst_len); ++} compression_plugin; ++ ++typedef struct compression_mode_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* this is called when estimating compressibility ++ of a logical cluster by its content */ ++ int (*should_deflate) (struct inode *inode, cloff_t index); ++ /* this is called when results of compression should be saved */ ++ int (*accept_hook) (struct inode *inode, cloff_t index); ++ /* this is called when results of compression should be discarded */ ++ int (*discard_hook) (struct inode *inode, cloff_t index); ++} compression_mode_plugin; ++ ++typedef struct cluster_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int shift; ++} cluster_plugin; ++ ++typedef struct sd_ext_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int (*present) (struct inode *inode, char **area, int *len); ++ int (*absent) (struct inode *inode); ++ int (*save_len) (struct inode *inode); ++ int (*save) (struct inode *inode, char **area); ++ /* alignment requirement for this stat-data part */ ++ int alignment; ++} sd_ext_plugin; ++ ++/* this plugin contains methods to allocate objectid for newly created files, ++ to deallocate objectid when file gets removed, to report number of used and ++ free objectids */ ++typedef struct oid_allocator_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files, ++ __u64 oids); ++ /* used to report statfs->f_files */ ++ __u64(*oids_used) (reiser4_oid_allocator * map); ++ /* get next oid to use */ ++ __u64(*next_oid) (reiser4_oid_allocator * map); ++ /* used to report statfs->f_ffree */ ++ __u64(*oids_free) (reiser4_oid_allocator * map); ++ /* allocate new objectid */ ++ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *); ++ /* release objectid */ ++ int (*release_oid) (reiser4_oid_allocator * map, oid_t); ++ /* how many pages to reserve in transaction for allocation of new ++ objectid */ ++ int (*oid_reserve_allocate) (reiser4_oid_allocator * map); ++ /* how many pages to reserve in transaction for freeing of an ++ objectid */ ++ int (*oid_reserve_release) (reiser4_oid_allocator * map); ++ void (*print_info) (const char *, reiser4_oid_allocator *); ++} oid_allocator_plugin; ++ ++/* disk layout plugin: this specifies super block, journal, bitmap (if there ++ are any) locations, etc */ ++typedef struct disk_format_plugin { ++ /* generic fields */ ++ plugin_header h; ++ /* replay journal, initialize super_info_data, etc */ ++ int (*init_format) (struct super_block *, void *data); ++ ++ /* key of root directory stat data */ ++ const reiser4_key * (*root_dir_key) (const struct super_block *); ++ ++ int (*release) (struct super_block *); ++ jnode * (*log_super) (struct super_block *); ++ int (*check_open) (const struct inode *object); ++ int (*version_update) (struct super_block *); ++} disk_format_plugin; ++ ++struct jnode_plugin { ++ /* generic fields */ ++ plugin_header h; ++ int (*init) (jnode * node); ++ int (*parse) (jnode * node); ++ struct address_space *(*mapping) (const jnode * 
node);
++	unsigned long (*index) (const jnode * node);
++	jnode * (*clone) (jnode * node);
++};
++
++/*
++ * plugin instance.
++ *
++ * This is a "wrapper" union for all types of plugins. Most of the code uses
++ * plugins of a particular type (file_plugin, dir_plugin, etc.) rather than
++ * operating on pointers to reiser4_plugin. This union is only used in some
++ * generic code in plugin/plugin.c that operates on all plugins. Technically
++ * speaking, the purpose of this union is to add type safety to said generic
++ * code: each plugin type (file_plugin, for example) contains plugin_header
++ * as its first member. This first member is located at the same place in
++ * memory as the .h member of reiser4_plugin. Generic code obtains a pointer
++ * to reiser4_plugin and looks at .h, which is the header of the plugin type
++ * located in the union. This makes type-casts unnecessary.
++ */
++union reiser4_plugin {
++	/* generic fields */
++	plugin_header h;
++	/* file plugin */
++	file_plugin file;
++	/* directory plugin */
++	dir_plugin dir;
++	/* hash plugin, used by directory plugin */
++	hash_plugin hash;
++	/* fibration plugin used by directory plugin */
++	fibration_plugin fibration;
++	/* cipher transform plugin, used by file plugin */
++	cipher_plugin cipher;
++	/* digest transform plugin, used by file plugin */
++	digest_plugin digest;
++	/* compression transform plugin, used by file plugin */
++	compression_plugin compression;
++	/* tail plugin, used by file plugin */
++	formatting_plugin formatting;
++	/* permission plugin */
++	perm_plugin perm;
++	/* node plugin */
++	node_plugin node;
++	/* item plugin */
++	item_plugin item;
++	/* stat-data extension plugin */
++	sd_ext_plugin sd_ext;
++	/* disk layout plugin */
++	disk_format_plugin format;
++	/* object id allocator plugin */
++	oid_allocator_plugin oid_allocator;
++	/* plugin for different jnode types */
++	jnode_plugin jnode;
++	/* compression mode plugin, used by object plugin */
++	compression_mode_plugin compression_mode;
++	/* cluster plugin, used by object plugin */
++	cluster_plugin clust;
++	/* place-holder for new plugin types that can be registered
++	   dynamically, and used by other dynamically loaded plugins. */
++	void *generic;
++};
++
++struct reiser4_plugin_ops {
++	/* called when plugin is initialized */
++	int (*init) (reiser4_plugin * plugin);
++	/* called when plugin is unloaded */
++	int (*done) (reiser4_plugin * plugin);
++	/* load given plugin from disk */
++	int (*load) (struct inode *inode,
++		     reiser4_plugin * plugin, char **area, int *len);
++	/* how much space is required to store this plugin's state
++	   in stat-data */
++	int (*save_len) (struct inode *inode, reiser4_plugin * plugin);
++	/* save persistent plugin-data to disk */
++	int (*save) (struct inode *inode, reiser4_plugin * plugin,
++		     char **area);
++	/* alignment requirement for on-disk state of this plugin
++	   in number of bytes */
++	int alignment;
++	/* install itself into the given inode. This can return an error
++	   (e.g., you cannot change the hash of a non-empty directory). */
++	int (*change) (struct inode *inode, reiser4_plugin * plugin,
++		       pset_member memb);
++	/* inherit itself from the parent inode into the given inode. This
++	   can return an error as well.
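Returning to the "wrapper" union comment above: the first-member layout trick is easy to demonstrate outside the kernel. In this stand-alone sketch (all demo_* types are invented for the example), generic code inspects the common header through the union and then uses the typed member without a cast:

    #include <stdio.h>

    typedef enum { DEMO_FILE_TYPE, DEMO_HASH_TYPE } demo_type;

    typedef struct { demo_type type_id; const char *label; } demo_header;

    typedef struct { demo_header h; int (*open)(void); } demo_file_plugin;

    typedef union {
        demo_header h;            /* same address as the first member of */
        demo_file_plugin file;    /* every concrete plugin type below    */
    } demo_plugin;

    static int demo_open(void) { return 0; }

    static demo_plugin plug = {
        .file = { .h = { DEMO_FILE_TYPE, "unix-file" }, .open = demo_open }
    };

    int main(void)
    {
        demo_plugin *p = &plug;

        /* generic code: look at the header without knowing the type */
        printf("label=%s\n", p->h.label);
        /* typed access, no cast needed thanks to the union layout */
        if (p->h.type_id == DEMO_FILE_TYPE)
            return p->file.open();
        return 1;
    }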
*/ ++ int (*inherit) (struct inode *inode, struct inode *parent, ++ reiser4_plugin * plugin); ++}; ++ ++/* functions implemented in fs/reiser4/plugin/plugin.c */ ++ ++/* stores plugin reference in reiser4-specific part of inode */ ++extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id); ++extern int init_plugins(void); ++ ++/* builtin plugins */ ++ ++/* builtin hash-plugins */ ++ ++typedef enum { ++ RUPASOV_HASH_ID, ++ R5_HASH_ID, ++ TEA_HASH_ID, ++ FNV1_HASH_ID, ++ DEGENERATE_HASH_ID, ++ LAST_HASH_ID ++} reiser4_hash_id; ++ ++/* builtin cipher plugins */ ++ ++typedef enum { ++ NONE_CIPHER_ID, ++ LAST_CIPHER_ID ++} reiser4_cipher_id; ++ ++/* builtin digest plugins */ ++ ++typedef enum { ++ SHA256_32_DIGEST_ID, ++ LAST_DIGEST_ID ++} reiser4_digest_id; ++ ++/* builtin compression mode plugins */ ++typedef enum { ++ NONE_COMPRESSION_MODE_ID, ++ LATTD_COMPRESSION_MODE_ID, ++ ULTIM_COMPRESSION_MODE_ID, ++ FORCE_COMPRESSION_MODE_ID, ++ CONVX_COMPRESSION_MODE_ID, ++ LAST_COMPRESSION_MODE_ID ++} reiser4_compression_mode_id; ++ ++/* builtin cluster plugins */ ++typedef enum { ++ CLUSTER_64K_ID, ++ CLUSTER_32K_ID, ++ CLUSTER_16K_ID, ++ CLUSTER_8K_ID, ++ CLUSTER_4K_ID, ++ LAST_CLUSTER_ID ++} reiser4_cluster_id; ++ ++/* builtin tail-plugins */ ++ ++typedef enum { ++ NEVER_TAILS_FORMATTING_ID, ++ ALWAYS_TAILS_FORMATTING_ID, ++ SMALL_FILE_FORMATTING_ID, ++ LAST_TAIL_FORMATTING_ID ++} reiser4_formatting_id; ++ ++/* data type used to pack parameters that we pass to vfs object creation ++ function create_object() */ ++struct reiser4_object_create_data { ++ /* plugin to control created object */ ++ reiser4_file_id id; ++ /* mode of regular file, directory or special file */ ++/* what happens if some other sort of perm plugin is in use? */ ++ int mode; ++ /* rdev of special file */ ++ dev_t rdev; ++ /* symlink target */ ++ const char *name; ++ /* add here something for non-standard objects you invent, like ++ query for interpolation file etc. */ ++ ++ struct reiser4_crypto_info *crypto; ++ ++ struct inode *parent; ++ struct dentry *dentry; ++}; ++ ++/* description of directory entry being created/destroyed/sought for ++ ++ It is passed down to the directory plugin and farther to the ++ directory item plugin methods. Creation of new directory is done in ++ several stages: first we search for an entry with the same name, then ++ create new one. reiser4_dir_entry_desc is used to store some information ++ collected at some stage of this process and required later: key of ++ item that we want to insert/delete and pointer to an object that will ++ be bound by the new directory entry. Probably some more fields will ++ be added there. ++ ++*/ ++struct reiser4_dir_entry_desc { ++ /* key of directory entry */ ++ reiser4_key key; ++ /* object bound by this entry. */ ++ struct inode *obj; ++}; ++ ++#define MAX_PLUGIN_TYPE_LABEL_LEN 32 ++#define MAX_PLUGIN_PLUG_LABEL_LEN 32 ++ ++#define PLUGIN_BY_ID(TYPE, ID, FIELD) \ ++static inline TYPE *TYPE ## _by_id(reiser4_plugin_id id) \ ++{ \ ++ reiser4_plugin *plugin = plugin_by_id(ID, id); \ ++ return plugin ? &plugin->FIELD : NULL; \ ++} \ ++static inline TYPE *TYPE ## _by_disk_id(reiser4_tree * tree, d16 *id) \ ++{ \ ++ reiser4_plugin *plugin = plugin_by_disk_id(tree, ID, id); \ ++ return plugin ? &plugin->FIELD : NULL; \ ++} \ ++static inline TYPE *TYPE ## _by_unsafe_id(reiser4_plugin_id id) \ ++{ \ ++ reiser4_plugin *plugin = plugin_by_unsafe_id(ID, id); \ ++ return plugin ? 
&plugin->FIELD : NULL; \ ++} \ ++static inline reiser4_plugin* TYPE ## _to_plugin(TYPE* plugin) \ ++{ \ ++ return (reiser4_plugin *) plugin; \ ++} \ ++static inline reiser4_plugin_id TYPE ## _id(TYPE* plugin) \ ++{ \ ++ return TYPE ## _to_plugin(plugin)->h.id; \ ++} \ ++typedef struct { int foo; } TYPE ## _plugin_dummy ++ ++PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item); ++PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file); ++PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir); ++PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node); ++PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext); ++PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm); ++PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash); ++PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration); ++PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher); ++PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest); ++PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression); ++PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting); ++PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format); ++PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode); ++PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ compression_mode); ++PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust); ++ ++extern int save_plugin_id(reiser4_plugin * plugin, d16 * area); ++ ++extern struct list_head *get_plugin_list(reiser4_plugin_type type_id); ++ ++#define for_all_plugins(ptype, plugin) \ ++for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \ ++ get_plugin_list(ptype) != &plugin->h.linkage; \ ++ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage)) ++ ++ ++extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, ++ pset_member memb); ++extern int force_plugin_pset(struct inode *self, pset_member memb, ++ reiser4_plugin *plug); ++extern int finish_pset(struct inode *inode); ++ ++/* defined in fs/reiser4/plugin/object.c */ ++extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; ++/* defined in fs/reiser4/plugin/object.c */ ++extern dir_plugin dir_plugins[LAST_DIR_ID]; ++/* defined in fs/reiser4/plugin/item/static_stat.c */ ++extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION]; ++/* defined in fs/reiser4/plugin/hash.c */ ++extern hash_plugin hash_plugins[LAST_HASH_ID]; ++/* defined in fs/reiser4/plugin/fibration.c */ ++extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID]; ++/* defined in fs/reiser4/plugin/crypt.c */ ++extern cipher_plugin cipher_plugins[LAST_CIPHER_ID]; ++/* defined in fs/reiser4/plugin/digest.c */ ++extern digest_plugin digest_plugins[LAST_DIGEST_ID]; ++/* defined in fs/reiser4/plugin/compress/compress.c */ ++extern compression_plugin compression_plugins[LAST_COMPRESSION_ID]; ++/* defined in fs/reiser4/plugin/compress/compression_mode.c */ ++extern compression_mode_plugin ++compression_mode_plugins[LAST_COMPRESSION_MODE_ID]; ++/* defined in fs/reiser4/plugin/cluster.c */ ++extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID]; ++/* defined in fs/reiser4/plugin/tail.c */ ++extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID]; ++/* defined in fs/reiser4/plugin/security/security.c */ ++extern perm_plugin perm_plugins[LAST_PERM_ID]; ++/* defined in fs/reiser4/plugin/item/item.c */ ++extern item_plugin item_plugins[LAST_ITEM_ID]; ++/* defined in fs/reiser4/plugin/node/node.c */ 
++extern node_plugin node_plugins[LAST_NODE_ID]; ++/* defined in fs/reiser4/plugin/disk_format/disk_format.c */ ++extern disk_format_plugin format_plugins[LAST_FORMAT_ID]; ++ ++/* __FS_REISER4_PLUGIN_TYPES_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.33/fs/reiser4/plugin/plugin_header.h +--- linux-2.6.33.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/plugin_header.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,149 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* plugin header. Data structures required by all plugin types. */ ++ ++#if !defined(__PLUGIN_HEADER_H__) ++#define __PLUGIN_HEADER_H__ ++ ++/* plugin data-types and constants */ ++ ++#include "../debug.h" ++#include "../dformat.h" ++ ++/* The list of Reiser4 interfaces */ ++typedef enum { ++ REISER4_FILE_PLUGIN_TYPE, /* manage VFS objects */ ++ REISER4_DIR_PLUGIN_TYPE, /* manage directories */ ++ REISER4_ITEM_PLUGIN_TYPE, /* manage items */ ++ REISER4_NODE_PLUGIN_TYPE, /* manage formatted nodes */ ++ REISER4_HASH_PLUGIN_TYPE, /* hash methods */ ++ REISER4_FIBRATION_PLUGIN_TYPE, /* directory fibrations */ ++ REISER4_FORMATTING_PLUGIN_TYPE, /* dispatching policy */ ++ REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */ ++ REISER4_SD_EXT_PLUGIN_TYPE, /* manage stat-data extensions */ ++ REISER4_FORMAT_PLUGIN_TYPE, /* disk format specifications */ ++ REISER4_JNODE_PLUGIN_TYPE, /* manage in-memory headers */ ++ REISER4_CIPHER_PLUGIN_TYPE, /* cipher transform methods */ ++ REISER4_DIGEST_PLUGIN_TYPE, /* digest transform methods */ ++ REISER4_COMPRESSION_PLUGIN_TYPE, /* compression methods */ ++ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* dispatching policies */ ++ REISER4_CLUSTER_PLUGIN_TYPE, /* manage logical clusters */ ++ REISER4_PLUGIN_TYPES ++} reiser4_plugin_type; ++ ++/* Supported plugin groups */ ++typedef enum { ++ REISER4_DIRECTORY_FILE, ++ REISER4_REGULAR_FILE, ++ REISER4_SYMLINK_FILE, ++ REISER4_SPECIAL_FILE, ++} file_plugin_group; ++ ++struct reiser4_plugin_ops; ++/* generic plugin operations, supported by each ++ plugin type. */ ++typedef struct reiser4_plugin_ops reiser4_plugin_ops; ++ ++/* the common part of all plugin instances. */ ++typedef struct plugin_header { ++ /* plugin type */ ++ reiser4_plugin_type type_id; ++ /* id of this plugin */ ++ reiser4_plugin_id id; ++ /* bitmask of groups the plugin belongs to. */ ++ reiser4_plugin_groups groups; ++ /* plugin operations */ ++ reiser4_plugin_ops *pops; ++/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and ++ * defined. */ ++ /* short label of this plugin */ ++ const char *label; ++ /* descriptive string.. */ ++ const char *desc; ++ /* list linkage */ ++ struct list_head linkage; ++} plugin_header; ++ ++#define plugin_of_group(plug, group) (plug->h.groups & (1 << group)) ++ ++/* PRIVATE INTERFACES */ ++/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in ++ * plugin_header? */ ++/* plugin type representation. */ ++struct reiser4_plugin_type_data { ++ /* internal plugin type identifier. Should coincide with ++ index of this item in plugins[] array. */ ++ reiser4_plugin_type type_id; ++ /* short symbolic label of this plugin type. Should be no longer ++ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. 
 */
++	const char *label;
++	/* plugin type description longer than .label */
++	const char *desc;
++
++/* NIKITA-FIXME-HANS: define built-in */
++	/* number of built-in plugin instances of this type */
++	int builtin_num;
++	/* array of built-in plugins */
++	void *builtin;
++	struct list_head plugins_list;
++	size_t size;
++};
++
++extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
++
++int is_plugin_type_valid(reiser4_plugin_type type);
++int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
++
++static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data *ptype,
++					int i)
++{
++	char *builtin;
++
++	builtin = ptype->builtin;
++	return (reiser4_plugin *) (builtin + i * ptype->size);
++}
++
++/* return plugin by its @type_id and @id */
++static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
++					   reiser4_plugin_id id)
++{
++	assert("nikita-1651", is_plugin_type_valid(type));
++	assert("nikita-1652", is_plugin_id_valid(type, id));
++	return plugin_at(&plugins[type], id);
++}
++
++extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
++					   reiser4_plugin_id id);
++
++/**
++ * plugin_by_disk_id - get reiser4_plugin
++ * @type_id: plugin type id
++ * @plugin_id: plugin id in disk format
++ *
++ * Returns reiser4_plugin by plugin type id and plugin id.
++ */
++static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
++						reiser4_plugin_type type_id,
++						__le16 *plugin_id)
++{
++	/*
++	 * What we should properly do is maintain, within each file system, a
++	 * dictionary that maps on-disk plugin ids to "universal" ids. This
++	 * dictionary would be resolved at mount time, so that this function
++	 * would perform just one additional array lookup.
++	 */
++	return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
++}
++
++/* __PLUGIN_HEADER_H__ */
++#endif
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * End:
++ */
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.33/fs/reiser4/plugin/plugin_set.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/plugin_set.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,380 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++/* This file contains Reiser4 plugin set operations */
++
++/* plugin sets
++ *
++ * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
++ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
++ * assigned (inherited, deduced from mode bits, etc.) at creation time. This
++ * set of plugins (the so-called pset) is described by the structure
++ * plugin_set (see plugin/plugin_set.h), which contains pointers to all
++ * required plugins.
++ *
++ * Children can inherit some pset members from their parent; however,
++ * sometimes it is useful to specify members different from the parent's.
++ * Since an object's pset cannot be easily changed without fatal
++ * consequences, we use another special plugin table for this purpose (the
++ * so-called hset, or heir set), described by the same structure.
++ *
++ * An inode only stores pointers to its pset and hset. Different inodes with
++ * the same set of pset (hset) members point to the same pset (hset). This is
++ * achieved by storing psets and hsets in a global hash table.
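The sharing scheme just described is classic interning (hash-consing): equal sets collapse to a single instance, so set equality becomes pointer equality. A single-threaded toy model follows (the kernel code additionally uses RCU and per-bucket spinlocks, and never frees interned sets; all names below are invented):

    #include <stdio.h>
    #include <stdlib.h>

    #define TABLE_SIZE 32

    struct demo_set { unsigned long a, b; struct demo_set *next; };

    static struct demo_set *table[TABLE_SIZE];

    static unsigned long demo_hash(const struct demo_set *s)
    {
        return (s->a + s->b) & (TABLE_SIZE - 1);
    }

    /* return the shared instance for (a, b), creating it on first use */
    static struct demo_set *demo_intern(unsigned long a, unsigned long b)
    {
        struct demo_set key = { a, b, NULL };
        struct demo_set *cur = table[demo_hash(&key)];

        for (; cur != NULL; cur = cur->next)
            if (cur->a == a && cur->b == b)
                return cur;               /* share the existing set */

        cur = malloc(sizeof(*cur));
        if (cur) {
            *cur = key;
            cur->next = table[demo_hash(&key)];
            table[demo_hash(&key)] = cur; /* interned, never recycled */
        }
        return cur;
    }

    int main(void)
    {
        /* two "inodes" with equal members share one set object */
        printf("%d\n", demo_intern(1, 2) == demo_intern(1, 2)); /* 1 */
        return 0;
    }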
Races are avoided ++ * by simple (and efficient so far) solution of never recycling psets, even ++ * when last inode pointing to it is destroyed. ++ */ ++ ++#include "../debug.h" ++#include "../super.h" ++#include "plugin_set.h" ++ ++#include <linux/slab.h> ++#include <linux/stddef.h> ++ ++/* slab for plugin sets */ ++static struct kmem_cache *plugin_set_slab; ++ ++static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = { ++ [0 ... 7] = SPIN_LOCK_UNLOCKED ++}; ++ ++/* hash table support */ ++ ++#define PS_TABLE_SIZE (32) ++ ++static inline plugin_set *cast_to(const unsigned long *a) ++{ ++ return container_of(a, plugin_set, hashval); ++} ++ ++static inline int pseq(const unsigned long *a1, const unsigned long *a2) ++{ ++ plugin_set *set1; ++ plugin_set *set2; ++ ++ /* make sure fields are not missed in the code below */ ++ cassert(sizeof *set1 == ++ sizeof set1->hashval + ++ sizeof set1->link + ++ sizeof set1->file + ++ sizeof set1->dir + ++ sizeof set1->perm + ++ sizeof set1->formatting + ++ sizeof set1->hash + ++ sizeof set1->fibration + ++ sizeof set1->sd + ++ sizeof set1->dir_item + ++ sizeof set1->cipher + ++ sizeof set1->digest + ++ sizeof set1->compression + ++ sizeof set1->compression_mode + ++ sizeof set1->cluster + ++ sizeof set1->create); ++ ++ set1 = cast_to(a1); ++ set2 = cast_to(a2); ++ return ++ set1->hashval == set2->hashval && ++ set1->file == set2->file && ++ set1->dir == set2->dir && ++ set1->perm == set2->perm && ++ set1->formatting == set2->formatting && ++ set1->hash == set2->hash && ++ set1->fibration == set2->fibration && ++ set1->sd == set2->sd && ++ set1->dir_item == set2->dir_item && ++ set1->cipher == set2->cipher && ++ set1->digest == set2->digest && ++ set1->compression == set2->compression && ++ set1->compression_mode == set2->compression_mode && ++ set1->cluster == set2->cluster && ++ set1->create == set2->create; ++} ++ ++#define HASH_FIELD(hash, set, field) \ ++({ \ ++ (hash) += (unsigned long)(set)->field >> 2; \ ++}) ++ ++static inline unsigned long calculate_hash(const plugin_set * set) ++{ ++ unsigned long result; ++ ++ result = 0; ++ HASH_FIELD(result, set, file); ++ HASH_FIELD(result, set, dir); ++ HASH_FIELD(result, set, perm); ++ HASH_FIELD(result, set, formatting); ++ HASH_FIELD(result, set, hash); ++ HASH_FIELD(result, set, fibration); ++ HASH_FIELD(result, set, sd); ++ HASH_FIELD(result, set, dir_item); ++ HASH_FIELD(result, set, cipher); ++ HASH_FIELD(result, set, digest); ++ HASH_FIELD(result, set, compression); ++ HASH_FIELD(result, set, compression_mode); ++ HASH_FIELD(result, set, cluster); ++ HASH_FIELD(result, set, create); ++ return result & (PS_TABLE_SIZE - 1); ++} ++ ++static inline unsigned long ++pshash(ps_hash_table * table, const unsigned long *a) ++{ ++ return *a; ++} ++ ++/* The hash table definition */ ++#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) ++#define KFREE(ptr, size) kfree(ptr) ++TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash, ++ pseq); ++#undef KFREE ++#undef KMALLOC ++ ++static ps_hash_table ps_table; ++static plugin_set empty_set = { ++ .hashval = 0, ++ .file = NULL, ++ .dir = NULL, ++ .perm = NULL, ++ .formatting = NULL, ++ .hash = NULL, ++ .fibration = NULL, ++ .sd = NULL, ++ .dir_item = NULL, ++ .cipher = NULL, ++ .digest = NULL, ++ .compression = NULL, ++ .compression_mode = NULL, ++ .cluster = NULL, ++ .create = NULL, ++ .link = {NULL} ++}; ++ ++plugin_set *plugin_set_get_empty(void) ++{ ++ return &empty_set; ++} ++ ++void plugin_set_put(plugin_set * set) 
++{ ++} ++ ++static inline unsigned long *pset_field(plugin_set * set, int offset) ++{ ++ return (unsigned long *)(((char *)set) + offset); ++} ++ ++static int plugin_set_field(plugin_set ** set, const unsigned long val, ++ const int offset) ++{ ++ unsigned long *spot; ++ spinlock_t *lock; ++ plugin_set replica; ++ plugin_set *twin; ++ plugin_set *psal; ++ plugin_set *orig; ++ ++ assert("nikita-2902", set != NULL); ++ assert("nikita-2904", *set != NULL); ++ ++ spot = pset_field(*set, offset); ++ if (unlikely(*spot == val)) ++ return 0; ++ ++ replica = *(orig = *set); ++ *pset_field(&replica, offset) = val; ++ replica.hashval = calculate_hash(&replica); ++ rcu_read_lock(); ++ twin = ps_hash_find(&ps_table, &replica.hashval); ++ if (unlikely(twin == NULL)) { ++ rcu_read_unlock(); ++ psal = kmem_cache_alloc(plugin_set_slab, ++ reiser4_ctx_gfp_mask_get()); ++ if (psal == NULL) ++ return RETERR(-ENOMEM); ++ *psal = replica; ++ lock = &plugin_set_lock[replica.hashval & 7]; ++ spin_lock(lock); ++ twin = ps_hash_find(&ps_table, &replica.hashval); ++ if (likely(twin == NULL)) { ++ *set = psal; ++ ps_hash_insert_rcu(&ps_table, psal); ++ } else { ++ *set = twin; ++ kmem_cache_free(plugin_set_slab, psal); ++ } ++ spin_unlock(lock); ++ } else { ++ rcu_read_unlock(); ++ *set = twin; ++ } ++ return 0; ++} ++ ++static struct { ++ int offset; ++ reiser4_plugin_groups groups; ++ reiser4_plugin_type type; ++} pset_descr[PSET_LAST] = { ++ [PSET_FILE] = { ++ .offset = offsetof(plugin_set, file), ++ .type = REISER4_FILE_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_DIR] = { ++ .offset = offsetof(plugin_set, dir), ++ .type = REISER4_DIR_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_PERM] = { ++ .offset = offsetof(plugin_set, perm), ++ .type = REISER4_PERM_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_FORMATTING] = { ++ .offset = offsetof(plugin_set, formatting), ++ .type = REISER4_FORMATTING_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_HASH] = { ++ .offset = offsetof(plugin_set, hash), ++ .type = REISER4_HASH_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_FIBRATION] = { ++ .offset = offsetof(plugin_set, fibration), ++ .type = REISER4_FIBRATION_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_SD] = { ++ .offset = offsetof(plugin_set, sd), ++ .type = REISER4_ITEM_PLUGIN_TYPE, ++ .groups = (1 << STAT_DATA_ITEM_TYPE) ++ }, ++ [PSET_DIR_ITEM] = { ++ .offset = offsetof(plugin_set, dir_item), ++ .type = REISER4_ITEM_PLUGIN_TYPE, ++ .groups = (1 << DIR_ENTRY_ITEM_TYPE) ++ }, ++ [PSET_CIPHER] = { ++ .offset = offsetof(plugin_set, cipher), ++ .type = REISER4_CIPHER_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_DIGEST] = { ++ .offset = offsetof(plugin_set, digest), ++ .type = REISER4_DIGEST_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_COMPRESSION] = { ++ .offset = offsetof(plugin_set, compression), ++ .type = REISER4_COMPRESSION_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_COMPRESSION_MODE] = { ++ .offset = offsetof(plugin_set, compression_mode), ++ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_CLUSTER] = { ++ .offset = offsetof(plugin_set, cluster), ++ .type = REISER4_CLUSTER_PLUGIN_TYPE, ++ .groups = 0 ++ }, ++ [PSET_CREATE] = { ++ .offset = offsetof(plugin_set, create), ++ .type = REISER4_FILE_PLUGIN_TYPE, ++ .groups = (1 << REISER4_REGULAR_FILE) ++ } ++}; ++ ++#define DEFINE_PSET_OPS(PREFIX) \ ++ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \ ++{ \ ++ if (memb > PSET_LAST) \ ++ return REISER4_PLUGIN_TYPES; \ ++ return pset_descr[memb].type; \ ++} \ ++ \ ++int PREFIX##_set_unsafe(plugin_set ** 
set, pset_member memb, \ ++ reiser4_plugin * plugin) \ ++{ \ ++ assert("nikita-3492", set != NULL); \ ++ assert("nikita-3493", *set != NULL); \ ++ assert("nikita-3494", plugin != NULL); \ ++ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \ ++ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \ ++ \ ++ if (pset_descr[memb].groups) \ ++ if (!(pset_descr[memb].groups & plugin->h.groups)) \ ++ return -EINVAL; \ ++ \ ++ return plugin_set_field(set, \ ++ (unsigned long)plugin, pset_descr[memb].offset); \ ++} \ ++ \ ++reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \ ++{ \ ++ assert("nikita-3497", set != NULL); \ ++ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \ ++ \ ++ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \ ++} ++ ++DEFINE_PSET_OPS(aset); ++ ++int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) ++{ ++ return plugin_set_field(set, ++ (unsigned long)plugin, pset_descr[memb].offset); ++} ++ ++/** ++ * init_plugin_set - create plugin set cache and hash table ++ * ++ * Initializes slab cache of plugin_set-s and their hash table. It is part of ++ * reiser4 module initialization. ++ */ ++int init_plugin_set(void) ++{ ++ int result; ++ ++ result = ps_hash_init(&ps_table, PS_TABLE_SIZE); ++ if (result == 0) { ++ plugin_set_slab = kmem_cache_create("plugin_set", ++ sizeof(plugin_set), 0, ++ SLAB_HWCACHE_ALIGN, ++ NULL); ++ if (plugin_set_slab == NULL) ++ result = RETERR(-ENOMEM); ++ } ++ return result; ++} ++ ++/** ++ * done_plugin_set - delete plugin_set cache and plugin_set hash table ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_plugin_set(void) ++{ ++ plugin_set *cur, *next; ++ ++ for_all_in_htable(&ps_table, ps, cur, next) { ++ ps_hash_remove(&ps_table, cur); ++ kmem_cache_free(plugin_set_slab, cur); ++ } ++ destroy_reiser4_cache(&plugin_set_slab); ++ ps_hash_done(&ps_table); ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 120 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.33/fs/reiser4/plugin/plugin_set.h +--- linux-2.6.33.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/plugin_set.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,78 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Reiser4 plugin set definition. ++ See fs/reiser4/plugin/plugin_set.c for details */ ++ ++#if !defined(__PLUGIN_SET_H__) ++#define __PLUGIN_SET_H__ ++ ++#include "../type_safe_hash.h" ++#include "plugin.h" ++ ++#include <linux/rcupdate.h> ++ ++struct plugin_set; ++typedef struct plugin_set plugin_set; ++ ++TYPE_SAFE_HASH_DECLARE(ps, plugin_set); ++ ++struct plugin_set { ++ unsigned long hashval; ++ /* plugin of file */ ++ file_plugin *file; ++ /* plugin of dir */ ++ dir_plugin *dir; ++ /* perm plugin for this file */ ++ perm_plugin *perm; ++ /* tail policy plugin. Only meaningful for regular files */ ++ formatting_plugin *formatting; ++ /* hash plugin. Only meaningful for directories. */ ++ hash_plugin *hash; ++ /* fibration plugin. Only meaningful for directories. 
*/ ++ fibration_plugin *fibration; ++ /* plugin of stat-data */ ++ item_plugin *sd; ++ /* plugin of items a directory is built of */ ++ item_plugin *dir_item; ++ /* cipher plugin */ ++ cipher_plugin *cipher; ++ /* digest plugin */ ++ digest_plugin *digest; ++ /* compression plugin */ ++ compression_plugin *compression; ++ /* compression mode plugin */ ++ compression_mode_plugin *compression_mode; ++ /* cluster plugin */ ++ cluster_plugin *cluster; ++ /* this specifies file plugin of regular children. ++ only meaningful for directories */ ++ file_plugin *create; ++ ps_hash_link link; ++}; ++ ++extern plugin_set *plugin_set_get_empty(void); ++extern void plugin_set_put(plugin_set * set); ++ ++extern int init_plugin_set(void); ++extern void done_plugin_set(void); ++ ++extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb); ++extern int set_plugin(plugin_set ** set, pset_member memb, ++ reiser4_plugin * plugin); ++extern int aset_set_unsafe(plugin_set ** set, pset_member memb, ++ reiser4_plugin * plugin); ++extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb); ++ ++/* __PLUGIN_SET_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/security/Makefile linux-2.6.33/fs/reiser4/plugin/security/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/security/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,4 @@ ++obj-$(CONFIG_REISER4_FS) += security_plugins.o ++ ++security_plugins-objs := \ ++ perm.o +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/security/perm.c linux-2.6.33/fs/reiser4/plugin/security/perm.c +--- linux-2.6.33.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/security/perm.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,33 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* ++ * This file contains implementation of permission plugins. ++ * See the comments in perm.h ++ */ ++ ++#include "../plugin.h" ++#include "../plugin_header.h" ++#include "../../debug.h" ++ ++perm_plugin perm_plugins[LAST_PERM_ID] = { ++ [NULL_PERM_ID] = { ++ .h = { ++ .type_id = REISER4_PERM_PLUGIN_TYPE, ++ .id = NULL_PERM_ID, ++ .pops = NULL, ++ .label = "null", ++ .desc = "stub permission plugin", ++ .linkage = {NULL, NULL} ++ } ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/security/perm.h linux-2.6.33/fs/reiser4/plugin/security/perm.h +--- linux-2.6.33.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/security/perm.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,38 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Perm (short for "permissions") plugins common stuff. */ ++ ++#if !defined( __REISER4_PERM_H__ ) ++#define __REISER4_PERM_H__ ++ ++#include "../../forward.h" ++#include "../plugin_header.h" ++ ++#include <linux/types.h> ++ ++/* Definition of permission plugin */ ++/* NIKITA-FIXME-HANS: define what this is targeted for. ++ It does not seem to be intended for use with sys_reiser4. Explain. 
 */
++
++/* NOTE-EDWARD: This seems to be intended for the deprecated sys_reiser4.
++   Consider it a temporary "seam" and a reserved pset member.
++   If you have something useful to add, then rename this plugin and add it
++   here */
++typedef struct perm_plugin {
++	/* generic plugin fields */
++	plugin_header h;
++} perm_plugin;
++
++typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
++
++/* __REISER4_PERM_H__ */
++#endif
++
++/* Make Linus happy.
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.33/fs/reiser4/plugin/space/bitmap.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/space/bitmap.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,1585 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#include "../../debug.h"
++#include "../../dformat.h"
++#include "../../txnmgr.h"
++#include "../../jnode.h"
++#include "../../block_alloc.h"
++#include "../../tree.h"
++#include "../../super.h"
++#include "../plugin.h"
++#include "space_allocator.h"
++#include "bitmap.h"
++
++#include <linux/types.h>
++#include <linux/fs.h>		/* for struct super_block */
++#include <linux/mutex.h>
++#include <asm/div64.h>
++
++/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
++ * blocks
++
++   A useful optimization of reiser4 bitmap handling would be dynamic loading
++   and unloading of bitmap blocks, unlike v3.x where all bitmap blocks are
++   loaded at mount time.
++
++   To implement bitmap block unloading we need to count bitmap block usage
++   and detect currently unused blocks, allowing them to be unloaded. It is
++   not a simple task, since we allow several threads to modify one bitmap
++   block simultaneously.
++
++   Briefly speaking, the following scheme is proposed: keep a counter in a
++   special variable associated with each bitmap block, counting block
++   alloc/dealloc operations on that bitmap block. With the deferred block
++   deallocation feature of reiser4, all those operations will be represented
++   in the atom's dirty/deleted lists as jnodes for freshly allocated or
++   deleted nodes.
++
++   So, we increment the usage counter for each new node allocated or
++   deleted, and decrement it once at atom commit for each node from the
++   atom's dirty/deleted lists. Of course, deletion of a freshly allocated
++   node, and node reuse from the atom's deleted list (if we do that), should
++   also decrement the bitmap usage counter.
++
++   This scheme seems workable, but such reference counting is not easy to
++   debug. I think we should agree with Hans and not implement it in v4.0.
++   The current code implements "on-demand" bitmap block loading only.
++
++   For simplicity, all bitmap nodes (both commit and working bitmap blocks)
++   are either loaded into memory at fs mount time, or each bitmap node is
++   loaded at the first access to it; the "dont_load_bitmap" mount option
++   controls whether bitmap nodes should be loaded at mount time. Dynamic
++   unloading of bitmap nodes is currently not supported.
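The "on-demand" loading mentioned above boils down to a load-once check under the bitmap node's long-term lock; the struct bitmap_node defined below carries exactly such a mutex and a loaded flag. A portable stand-alone sketch, with pthreads standing in for the kernel primitives and all names invented:

    #include <pthread.h>
    #include <stdio.h>

    struct demo_bnode {
        pthread_mutex_t mutex;   /* long-term lock */
        int loaded;              /* set once the block is in memory */
    };

    static void demo_load_block(struct demo_bnode *b)
    {
        /* a real implementation reads the bitmap block from disk here */
        printf("loading bitmap block\n");
        b->loaded = 1;
    }

    static void demo_access(struct demo_bnode *b)
    {
        pthread_mutex_lock(&b->mutex);
        if (!b->loaded)              /* first access loads the block */
            demo_load_block(b);
        /* ... allocate/deallocate bits under the mutex ... */
        pthread_mutex_unlock(&b->mutex);
    }

    int main(void)
    {
        struct demo_bnode b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        demo_access(&b);   /* loads the block */
        demo_access(&b);   /* already loaded, skips the read */
        return 0;
    }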
*/ ++ ++#define CHECKSUM_SIZE 4 ++ ++#define BYTES_PER_LONG (sizeof(long)) ++ ++#if BITS_PER_LONG == 64 ++# define LONG_INT_SHIFT (6) ++#else ++# define LONG_INT_SHIFT (5) ++#endif ++ ++#define LONG_INT_MASK (BITS_PER_LONG - 1UL) ++ ++typedef unsigned long ulong_t; ++ ++#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE) ++#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3) ++ ++/* Block allocation/deallocation are done through special bitmap objects which ++ are allocated in an array at fs mount. */ ++struct bitmap_node { ++ struct mutex mutex; /* long term lock object */ ++ ++ jnode *wjnode; /* j-nodes for WORKING ... */ ++ jnode *cjnode; /* ... and COMMIT bitmap blocks */ ++ ++ bmap_off_t first_zero_bit; /* for skip_busy option implementation */ ++ ++ atomic_t loaded; /* a flag which shows that bnode is loaded ++ * already */ ++}; ++ ++static inline char *bnode_working_data(struct bitmap_node *bnode) ++{ ++ char *data; ++ ++ data = jdata(bnode->wjnode); ++ assert("zam-429", data != NULL); ++ ++ return data + CHECKSUM_SIZE; ++} ++ ++static inline char *bnode_commit_data(const struct bitmap_node *bnode) ++{ ++ char *data; ++ ++ data = jdata(bnode->cjnode); ++ assert("zam-430", data != NULL); ++ ++ return data + CHECKSUM_SIZE; ++} ++ ++static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode) ++{ ++ char *data; ++ ++ data = jdata(bnode->cjnode); ++ assert("vpf-261", data != NULL); ++ ++ return le32_to_cpu(get_unaligned((d32 *)data)); ++} ++ ++static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc) ++{ ++ char *data; ++ ++ data = jdata(bnode->cjnode); ++ assert("vpf-261", data != NULL); ++ ++ put_unaligned(cpu_to_le32(crc), (d32 *)data); ++} ++ ++/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having ++ * written the code, does this added abstraction still have */ ++/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the ++ * reiser4_space_allocator structure) */ ++/* ZAM-FIXME-HANS: I don't understand your english in comment above. */ ++/* FIXME-HANS(Zam): I don't understand the questions like "might be a union ++ * someday?". What they about? If there is a reason to have a union, it should ++ * be a union, if not, it should not be a union. "..might be someday" means no ++ * reason. 
 */
++struct bitmap_allocator_data {
++	/* an array for bitmap blocks direct access */
++	struct bitmap_node *bitmap;
++};
++
++#define get_barray(super) \
++(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
++
++#define get_bnode(super, i) (get_barray(super) + i)
++
++/* allocate and initialize jnode with JNODE_BITMAP type */
++static jnode *bnew(void)
++{
++	jnode *jal = jalloc();
++
++	if (jal)
++		jnode_init(jal, current_tree, JNODE_BITMAP);
++
++	return jal;
++}
++
++/* this file contains:
++   - bitmap based implementation of space allocation plugin
++   - all the helper functions like set bit, find_first_zero_bit, etc */
++
++/* Audited by: green(2002.06.12) */
++static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
++{
++	ulong_t mask = 1UL << start_bit;
++	int i = start_bit;
++
++	while ((word & mask) != 0) {
++		mask <<= 1;
++		if (++i >= BITS_PER_LONG)
++			break;
++	}
++
++	return i;
++}
++
++#include <linux/bitops.h>
++
++#if BITS_PER_LONG == 64
++
++#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
++#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
++
++static inline void reiser4_set_bit(int nr, void *addr)
++{
++	ext2_set_bit(nr + OFF(addr), BASE(addr));
++}
++
++static inline void reiser4_clear_bit(int nr, void *addr)
++{
++	ext2_clear_bit(nr + OFF(addr), BASE(addr));
++}
++
++static inline int reiser4_test_bit(int nr, void *addr)
++{
++	return ext2_test_bit(nr + OFF(addr), BASE(addr));
++}
++static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
++					     int offset)
++{
++	int off = OFF(addr);
++
++	return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
++				       offset + off) - off;
++}
++
++#else
++
++#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
++#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
++#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
++
++#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
++ext2_find_next_zero_bit(addr, maxoffset, offset)
++#endif
++
++/* Search for a set bit in the bit array [@start_offset, @max_offset), where
++ * offsets are counted from @addr. Return the offset of the first set bit if
++ * one is found, @max_offset otherwise. */
++static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
++					      bmap_off_t start_offset)
++{
++	ulong_t *base = addr;
++	/* start_offset is in bits, convert it to a word index within the
++	   bitmap. */
++	int word_nr = start_offset >> LONG_INT_SHIFT;
++	/* bit number within the word. */
++	int bit_nr = start_offset & LONG_INT_MASK;
++	int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
++
++	assert("zam-387", max_offset != 0);
++
++	/* Unaligned @start_offset case. */
++	if (bit_nr != 0) {
++		bmap_nr_t nr;
++
++		nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
++
++		if (nr < BITS_PER_LONG)
++			return (word_nr << LONG_INT_SHIFT) + nr;
++
++		++word_nr;
++	}
++
++	/* Fast scan through aligned words.
*/ ++ while (word_nr <= max_word_nr) { ++ if (base[word_nr] != 0) { ++ return (word_nr << LONG_INT_SHIFT) ++ + find_next_zero_bit_in_word(~(base[word_nr]), 0); ++ } ++ ++ ++word_nr; ++ } ++ ++ return max_offset; ++} ++ ++#if BITS_PER_LONG == 64 ++ ++static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, ++ bmap_off_t start_offset) ++{ ++ bmap_off_t off = OFF(addr); ++ ++ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off, ++ start_offset + off) - off; ++} ++ ++#else ++#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \ ++ __reiser4_find_next_set_bit(addr, max_offset, start_offset) ++#endif ++ ++/* search for the first set bit in single word. */ ++static int find_last_set_bit_in_word(ulong_t word, int start_bit) ++{ ++ ulong_t bit_mask; ++ int nr = start_bit; ++ ++ assert("zam-965", start_bit < BITS_PER_LONG); ++ assert("zam-966", start_bit >= 0); ++ ++ bit_mask = (1UL << nr); ++ ++ while (bit_mask != 0) { ++ if (bit_mask & word) ++ return nr; ++ bit_mask >>= 1; ++ nr--; ++ } ++ return BITS_PER_LONG; ++} ++ ++/* Search bitmap for a set bit in backward direction from the end to the ++ * beginning of given region ++ * ++ * @result: result offset of the last set bit ++ * @addr: base memory address, ++ * @low_off: low end of the search region, edge bit included into the region, ++ * @high_off: high end of the search region, edge bit included into the region, ++ * ++ * @return: 0 - set bit was found, -1 otherwise. ++ */ ++static int ++reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, ++ bmap_off_t high_off) ++{ ++ ulong_t *base = addr; ++ int last_word; ++ int first_word; ++ int last_bit; ++ int nr; ++ ++ assert("zam-962", high_off >= low_off); ++ ++ last_word = high_off >> LONG_INT_SHIFT; ++ last_bit = high_off & LONG_INT_MASK; ++ first_word = low_off >> LONG_INT_SHIFT; ++ ++ if (last_bit < BITS_PER_LONG) { ++ nr = find_last_set_bit_in_word(base[last_word], last_bit); ++ if (nr < BITS_PER_LONG) { ++ *result = (last_word << LONG_INT_SHIFT) + nr; ++ return 0; ++ } ++ --last_word; ++ } ++ while (last_word >= first_word) { ++ if (base[last_word] != 0x0) { ++ last_bit = ++ find_last_set_bit_in_word(base[last_word], ++ BITS_PER_LONG - 1); ++ assert("zam-972", last_bit < BITS_PER_LONG); ++ *result = (last_word << LONG_INT_SHIFT) + last_bit; ++ return 0; ++ } ++ --last_word; ++ } ++ ++ return -1; /* set bit not found */ ++} ++ ++/* Search bitmap for a clear bit in backward direction from the end to the ++ * beginning of given region */ ++static int ++reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, ++ bmap_off_t high_off) ++{ ++ ulong_t *base = addr; ++ int last_word; ++ int first_word; ++ int last_bit; ++ int nr; ++ ++ last_word = high_off >> LONG_INT_SHIFT; ++ last_bit = high_off & LONG_INT_MASK; ++ first_word = low_off >> LONG_INT_SHIFT; ++ ++ if (last_bit < BITS_PER_LONG) { ++ nr = find_last_set_bit_in_word(~base[last_word], last_bit); ++ if (nr < BITS_PER_LONG) { ++ *result = (last_word << LONG_INT_SHIFT) + nr; ++ return 0; ++ } ++ --last_word; ++ } ++ while (last_word >= first_word) { ++ if (base[last_word] != (ulong_t) (-1)) { ++ *result = (last_word << LONG_INT_SHIFT) + ++ find_last_set_bit_in_word(~base[last_word], ++ BITS_PER_LONG - 1); ++ return 0; ++ } ++ --last_word; ++ } ++ ++ return -1; /* zero bit not found */ ++} ++ ++/* Audited by: green(2002.06.12) */ ++static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end) ++{ ++ int first_byte; ++ int last_byte; ++ 
++ unsigned char first_byte_mask = 0xFF; ++ unsigned char last_byte_mask = 0xFF; ++ ++ assert("zam-410", start < end); ++ ++ first_byte = start >> 3; ++ last_byte = (end - 1) >> 3; ++ ++ if (last_byte > first_byte + 1) ++ memset(addr + first_byte + 1, 0, ++ (size_t) (last_byte - first_byte - 1)); ++ ++ first_byte_mask >>= 8 - (start & 0x7); ++ last_byte_mask <<= ((end - 1) & 0x7) + 1; ++ ++ if (first_byte == last_byte) { ++ addr[first_byte] &= (first_byte_mask | last_byte_mask); ++ } else { ++ addr[first_byte] &= first_byte_mask; ++ addr[last_byte] &= last_byte_mask; ++ } ++} ++ ++/* Audited by: green(2002.06.12) */ ++/* ZAM-FIXME-HANS: comment this */ ++static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end) ++{ ++ int first_byte; ++ int last_byte; ++ ++ unsigned char first_byte_mask = 0xFF; ++ unsigned char last_byte_mask = 0xFF; ++ ++ assert("zam-386", start < end); ++ ++ first_byte = start >> 3; ++ last_byte = (end - 1) >> 3; ++ ++ if (last_byte > first_byte + 1) ++ memset(addr + first_byte + 1, 0xFF, ++ (size_t) (last_byte - first_byte - 1)); ++ ++ first_byte_mask <<= start & 0x7; ++ last_byte_mask >>= 7 - ((end - 1) & 0x7); ++ ++ if (first_byte == last_byte) { ++ addr[first_byte] |= (first_byte_mask & last_byte_mask); ++ } else { ++ addr[first_byte] |= first_byte_mask; ++ addr[last_byte] |= last_byte_mask; ++ } ++} ++ ++#define ADLER_BASE 65521 ++#define ADLER_NMAX 5552 ++ ++/* Calculates the adler32 checksum for the data pointed by `data` of the ++ length `len`. This function was originally taken from zlib, version 1.1.3, ++ July 9th, 1998. ++ ++ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler ++ ++ This software is provided 'as-is', without any express or implied ++ warranty. In no event will the authors be held liable for any damages ++ arising from the use of this software. ++ ++ Permission is granted to anyone to use this software for any purpose, ++ including commercial applications, and to alter it and redistribute it ++ freely, subject to the following restrictions: ++ ++ 1. The origin of this software must not be misrepresented; you must not ++ claim that you wrote the original software. If you use this software ++ in a product, an acknowledgment in the product documentation would be ++ appreciated but is not required. ++ 2. Altered source versions must be plainly marked as such, and must not be ++ misrepresented as being the original software. ++ 3. This notice may not be removed or altered from any source distribution. ++ ++ Jean-loup Gailly Mark Adler ++ jloup@gzip.org madler@alumni.caltech.edu ++ ++ The above comment applies only to the reiser4_adler32 function. ++*/ ++ ++__u32 reiser4_adler32(char *data, __u32 len) ++{ ++ unsigned char *t = data; ++ __u32 s1 = 1; ++ __u32 s2 = 0; ++ int k; ++ ++ while (len > 0) { ++ k = len < ADLER_NMAX ? 
		    len : ADLER_NMAX;
++		len -= k;
++
++		while (k--) {
++			s1 += *t++;
++			s2 += s1;
++		}
++
++		s1 %= ADLER_BASE;
++		s2 %= ADLER_BASE;
++	}
++	return (s2 << 16) | s1;
++}
++
++#define sb_by_bnode(bnode) \
++	((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
++
++static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
++{
++	return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
++}
++
++static int
++bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
++{
++	if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
++		bmap_nr_t bmap;
++
++		bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
++
++		warning("vpf-263",
++			"Checksum for the bitmap block %llu is incorrect",
++			bmap);
++
++		return RETERR(-EIO);
++	}
++
++	return 0;
++}
++
++#define REISER4_CHECK_BMAP_CRC (0)
++
++#if REISER4_CHECK_BMAP_CRC
++static int bnode_check_crc(const struct bitmap_node *bnode)
++{
++	return bnode_check_adler32(bnode,
++				   bmap_size(sb_by_bnode(bnode)->s_blocksize));
++}
++
++/* REISER4_CHECK_BMAP_CRC */
++#else
++
++#define bnode_check_crc(bnode) (0)
++
++/* REISER4_CHECK_BMAP_CRC */
++#endif
++
++/* Recalculates the adler32 checksum for a single changed byte.
++   adler - the previous adler checksum
++   old_data, data - the old and new byte values
++   tail - the distance from the changed byte to the end of the checksummed
++   chunk, i.e. (chunk length - byte offset)
++   This function can be used as a checksum calculation optimisation.
++*/
++
++static __u32
++adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
++	       __u32 tail)
++{
++	__u32 delta = data - old_data + 2 * ADLER_BASE;
++	__u32 s1 = adler & 0xffff;
++	__u32 s2 = (adler >> 16) & 0xffff;
++
++	s1 = (delta + s1) % ADLER_BASE;
++	s2 = (delta * tail + s2) % ADLER_BASE;
++
++	return (s2 << 16) | s1;
++}
++
++#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
++
++/**
++ * get_nr_bmap - calculate number of bitmap blocks
++ * @super: super block with initialized blocksize and block count
++ *
++ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
++ * to maintain free disk space. It assumes that each bitmap addresses the
++ * same number of blocks, which is calculated by the bmap_bit_count() macro
++ * defined above. The number of blocks in the filesystem has to be
++ * initialized in the reiser4 private data of the super block already, so
++ * that it can be obtained via reiser4_block_count(). Unfortunately, the
++ * number of blocks addressed by a bitmap is not a power of 2, because 4
++ * bytes are used for the checksum. Therefore, we have to use a special
++ * function to divide and modulo 64-bit filesystem block counters.
++ *
++ * Example: suppose the filesystem has 32768 blocks and the blocksize is
++ * 4096. Each bitmap block addresses (4096 - 4) * 8 = 32736 blocks. The
++ * number of bitmaps needed to address all 32768 blocks is calculated as
++ * (32768 - 1) / 32736 + 1 = 2.
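
The arithmetic above is easy to sanity-check outside the kernel. A minimal
user-space sketch of the same calculation (plain C; ordinary 64-bit division
stands in for the do_div() that the kernel function below needs because the
block counter is 64-bit):

#include <stdio.h>
#include <stdint.h>

/* bits addressed by one bitmap block: (blocksize - 4 CRC bytes) * 8 */
static uint64_t bmap_bits(uint32_t blocksize)
{
        return (uint64_t)(blocksize - 4) * 8;
}

static uint64_t nr_bmap(uint64_t block_count, uint32_t blocksize)
{
        return (block_count - 1) / bmap_bits(blocksize) + 1;
}

int main(void)
{
        /* the example above: 32768 blocks of 4096 bytes -> 2 bitmap blocks */
        printf("%llu\n", (unsigned long long)nr_bmap(32768, 4096));
        return 0;
}
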
++ */ ++static bmap_nr_t get_nr_bmap(const struct super_block *super) ++{ ++ u64 quotient; ++ ++ assert("zam-393", reiser4_block_count(super) != 0); ++ ++ quotient = reiser4_block_count(super) - 1; ++ do_div(quotient, bmap_bit_count(super->s_blocksize)); ++ return quotient + 1; ++} ++ ++/** ++ * parse_blocknr - calculate bitmap number and offset in it by block number ++ * @block: pointer to block number to calculate location in bitmap of ++ * @bmap: pointer where to store bitmap block number ++ * @offset: pointer where to store offset within bitmap block ++ * ++ * Calculates location of bit which is responsible for allocation/freeing of ++ * block @*block. That location is represented by bitmap block number and offset ++ * within that bitmap block. ++ */ ++static void ++parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap, ++ bmap_off_t *offset) ++{ ++ struct super_block *super = get_current_context()->super; ++ u64 quotient = *block; ++ ++ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize)); ++ *bmap = quotient; ++ ++ assert("zam-433", *bmap < get_nr_bmap(super)); ++ assert("", *offset < bmap_bit_count(super->s_blocksize)); ++} ++ ++#if REISER4_DEBUG ++/* Audited by: green(2002.06.12) */ ++static void ++check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ ++ assert("zam-436", sb != NULL); ++ ++ assert("zam-455", start != NULL); ++ assert("zam-437", *start != 0); ++ assert("zam-541", !reiser4_blocknr_is_fake(start)); ++ assert("zam-441", *start < reiser4_block_count(sb)); ++ ++ if (len != NULL) { ++ assert("zam-438", *len != 0); ++ assert("zam-442", *start + *len <= reiser4_block_count(sb)); ++ } ++} ++ ++static void check_bnode_loaded(const struct bitmap_node *bnode) ++{ ++ assert("zam-485", bnode != NULL); ++ assert("zam-483", jnode_page(bnode->wjnode) != NULL); ++ assert("zam-484", jnode_page(bnode->cjnode) != NULL); ++ assert("nikita-2820", jnode_is_loaded(bnode->wjnode)); ++ assert("nikita-2821", jnode_is_loaded(bnode->cjnode)); ++} ++ ++#else ++ ++# define check_block_range(start, len) do { /* nothing */} while(0) ++# define check_bnode_loaded(bnode) do { /* nothing */} while(0) ++ ++#endif ++ ++/* modify bnode->first_zero_bit (if we free bits before); bnode should be ++ spin-locked */ ++static inline void ++adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset) ++{ ++ if (offset < bnode->first_zero_bit) ++ bnode->first_zero_bit = offset; ++} ++ ++/* return a physical disk address for logical bitmap number @bmap */ ++/* FIXME-VS: this is somehow related to disk layout? */ ++/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference ++ * per block allocation so that performance is not affected. Probably this ++ * whole file should be considered part of the disk layout plugin, and other ++ * disk layouts can use other defines and efficiency will not be significantly ++ * affected. */ ++ ++#define REISER4_FIRST_BITMAP_BLOCK \ ++ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2) ++ ++/* Audited by: green(2002.06.12) */ ++static void ++get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap, ++ reiser4_block_nr * bnr) ++{ ++ ++ assert("zam-390", bmap < get_nr_bmap(super)); ++ ++#ifdef CONFIG_REISER4_BADBLOCKS ++#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff)) ++ /* Check if the diskmap have this already, first. 
*/ ++ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0) ++ return; /* Found it in diskmap */ ++#endif ++ /* FIXME_ZAM: before discussing of disk layouts and disk format ++ plugins I implement bitmap location scheme which is close to scheme ++ used in reiser 3.6 */ ++ if (bmap == 0) { ++ *bnr = REISER4_FIRST_BITMAP_BLOCK; ++ } else { ++ *bnr = bmap * bmap_bit_count(super->s_blocksize); ++ } ++} ++ ++/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */ ++/* Audited by: green(2002.06.12) */ ++static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr) ++{ ++ *bnr = ++ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) | ++ REISER4_BITMAP_BLOCKS_STATUS_VALUE); ++} ++ ++/* bnode structure initialization */ ++static void ++init_bnode(struct bitmap_node *bnode, ++ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG) ++{ ++ memset(bnode, 0, sizeof(struct bitmap_node)); ++ ++ mutex_init(&bnode->mutex); ++ atomic_set(&bnode->loaded, 0); ++} ++ ++static void release(jnode * node) ++{ ++ jrelse(node); ++ JF_SET(node, JNODE_HEARD_BANSHEE); ++ jput(node); ++} ++ ++/* This function is for internal bitmap.c use because it assumes that jnode is ++ in under full control of this thread */ ++static void done_bnode(struct bitmap_node *bnode) ++{ ++ if (bnode) { ++ atomic_set(&bnode->loaded, 0); ++ if (bnode->wjnode != NULL) ++ release(bnode->wjnode); ++ if (bnode->cjnode != NULL) ++ release(bnode->cjnode); ++ bnode->wjnode = bnode->cjnode = NULL; ++ } ++} ++ ++/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/ ++static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret, ++ jnode **wjnode_ret) ++{ ++ struct super_block *super; ++ jnode *cjnode; ++ jnode *wjnode; ++ bmap_nr_t bmap; ++ int ret; ++ ++ super = reiser4_get_current_sb(); ++ ++ *wjnode_ret = wjnode = bnew(); ++ if (wjnode == NULL) { ++ *cjnode_ret = NULL; ++ return RETERR(-ENOMEM); ++ } ++ ++ *cjnode_ret = cjnode = bnew(); ++ if (cjnode == NULL) ++ return RETERR(-ENOMEM); ++ ++ bmap = bnode - get_bnode(super, 0); ++ ++ get_working_bitmap_blocknr(bmap, &wjnode->blocknr); ++ get_bitmap_blocknr(super, bmap, &cjnode->blocknr); ++ ++ jref(cjnode); ++ jref(wjnode); ++ ++ /* load commit bitmap */ ++ ret = jload_gfp(cjnode, GFP_NOFS, 1); ++ ++ if (ret) ++ goto error; ++ ++ /* allocate memory for working bitmap block. Note that for ++ * bitmaps jinit_new() doesn't actually modifies node content, ++ * so parallel calls to this are ok. */ ++ ret = jinit_new(wjnode, GFP_NOFS); ++ ++ if (ret != 0) { ++ jrelse(cjnode); ++ goto error; ++ } ++ ++ return 0; ++ ++ error: ++ jput(cjnode); ++ jput(wjnode); ++ *wjnode_ret = *cjnode_ret = NULL; ++ return ret; ++ ++} ++ ++/* Check the bnode data on read. */ ++static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize) ++{ ++ void *data; ++ int ret; ++ ++ /* Check CRC */ ++ ret = bnode_check_adler32(bnode, blksize); ++ ++ if (ret) { ++ return ret; ++ } ++ ++ data = jdata(bnode->cjnode) + CHECKSUM_SIZE; ++ ++ /* Check the very first bit -- it must be busy. 
*/ ++ if (!reiser4_test_bit(0, data)) { ++ warning("vpf-1362", "The allocator block %llu is not marked " ++ "as used.", (unsigned long long)bnode->cjnode->blocknr); ++ ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* load bitmap blocks "on-demand" */ ++static int load_and_lock_bnode(struct bitmap_node *bnode) ++{ ++ int ret; ++ ++ jnode *cjnode; ++ jnode *wjnode; ++ ++ assert("nikita-3040", reiser4_schedulable()); ++ ++/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not ++ * need to be atomic, right? Just leave a comment that if bitmaps were ++ * unloadable, this would need to be atomic. */ ++ if (atomic_read(&bnode->loaded)) { ++ /* bitmap is already loaded, nothing to do */ ++ check_bnode_loaded(bnode); ++ mutex_lock(&bnode->mutex); ++ assert("nikita-2827", atomic_read(&bnode->loaded)); ++ return 0; ++ } ++ ++ ret = prepare_bnode(bnode, &cjnode, &wjnode); ++ if (ret == 0) { ++ mutex_lock(&bnode->mutex); ++ ++ if (!atomic_read(&bnode->loaded)) { ++ assert("nikita-2822", cjnode != NULL); ++ assert("nikita-2823", wjnode != NULL); ++ assert("nikita-2824", jnode_is_loaded(cjnode)); ++ assert("nikita-2825", jnode_is_loaded(wjnode)); ++ ++ bnode->wjnode = wjnode; ++ bnode->cjnode = cjnode; ++ ++ ret = check_struct_bnode(bnode, current_blocksize); ++ if (!ret) { ++ cjnode = wjnode = NULL; ++ atomic_set(&bnode->loaded, 1); ++ /* working bitmap is initialized by on-disk ++ * commit bitmap. This should be performed ++ * under mutex. */ ++ memcpy(bnode_working_data(bnode), ++ bnode_commit_data(bnode), ++ bmap_size(current_blocksize)); ++ } else ++ mutex_unlock(&bnode->mutex); ++ } else ++ /* race: someone already loaded bitmap while we were ++ * busy initializing data. */ ++ check_bnode_loaded(bnode); ++ } ++ ++ if (wjnode != NULL) { ++ release(wjnode); ++ bnode->wjnode = NULL; ++ } ++ if (cjnode != NULL) { ++ release(cjnode); ++ bnode->cjnode = NULL; ++ } ++ ++ return ret; ++} ++ ++static void release_and_unlock_bnode(struct bitmap_node *bnode) ++{ ++ check_bnode_loaded(bnode); ++ mutex_unlock(&bnode->mutex); ++} ++ ++/* This function does all block allocation work but only for one bitmap ++ block.*/ ++/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap ++ block responsibility zone boundaries. This had no sense in v3.6 but may ++ have it in v4.x */ ++/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? 
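
The forward search below is, at its core, a zero-run scan: find the next zero
bit, find the next set bit after it, and the gap in between is a candidate
extent. A self-contained user-space sketch of that pattern (next_bit() is a
simplified stand-in for the reiser4_find_next_{zero,set}_bit helpers defined
earlier in this file):

#include <stdint.h>
#include <stddef.h>

static size_t next_bit(const uint8_t *map, size_t max, size_t start,
                       int want_set)
{
        size_t i;

        for (i = start; i < max; i++) {
                int bit = (map[i >> 3] >> (i & 7)) & 1;
                if (bit == want_set)
                        return i;
        }
        return max;     /* not found */
}

/*
 * Find a run of at least min_len zero bits, at most max_len long.
 * Returns the run length and stores its start in *offset, or 0 if none.
 */
static size_t find_zero_run(const uint8_t *map, size_t max, size_t *offset,
                            size_t min_len, size_t max_len)
{
        size_t start = *offset;

        while (start + min_len <= max) {
                size_t end, limit;

                start = next_bit(map, max, start, 0);
                if (start >= max)
                        break;
                limit = start + max_len < max ? start + max_len : max;
                end = next_bit(map, limit, start, 1);
                if (end - start >= min_len) {
                        *offset = start;
                        return end - start;     /* length of the run */
                }
                start = end + 1;
        }
        return 0;
}
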
 */
++static int
++search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
++			  bmap_off_t max_offset, int min_len, int max_len)
++{
++	struct super_block *super = get_current_context()->super;
++	struct bitmap_node *bnode = get_bnode(super, bmap);
++
++	char *data;
++
++	bmap_off_t search_end;
++	bmap_off_t start;
++	bmap_off_t end;
++
++	int set_first_zero_bit = 0;
++
++	int ret;
++
++	assert("zam-364", min_len > 0);
++	assert("zam-365", max_len >= min_len);
++	assert("zam-366", *offset <= max_offset);
++
++	ret = load_and_lock_bnode(bnode);
++
++	if (ret)
++		return ret;
++
++	data = bnode_working_data(bnode);
++
++	start = *offset;
++
++	if (bnode->first_zero_bit >= start) {
++		start = bnode->first_zero_bit;
++		set_first_zero_bit = 1;
++	}
++
++	while (start + min_len < max_offset) {
++
++		start =
++		    reiser4_find_next_zero_bit((long *)data, max_offset, start);
++		if (set_first_zero_bit) {
++			bnode->first_zero_bit = start;
++			set_first_zero_bit = 0;
++		}
++		if (start >= max_offset)
++			break;
++
++		search_end = LIMIT(start + max_len, max_offset);
++		end =
++		    reiser4_find_next_set_bit((long *)data, search_end, start);
++		if (end >= start + min_len) {
++			/* we cannot trust the find_next_set_bit result if no
++			   set bit was found, as the result may be bigger than
++			   max_offset */
++			if (end > search_end)
++				end = search_end;
++
++			ret = end - start;
++			*offset = start;
++
++			reiser4_set_bits(data, start, end);
++
++			/* FIXME: we may advance first_zero_bit if [start,
++			   end] region overlaps the first_zero_bit point */
++
++			break;
++		}
++
++		start = end + 1;
++	}
++
++	release_and_unlock_bnode(bnode);
++
++	return ret;
++}
++
++static int
++search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
++			   bmap_off_t end_offset, int min_len, int max_len)
++{
++	struct super_block *super = get_current_context()->super;
++	struct bitmap_node *bnode = get_bnode(super, bmap);
++	char *data;
++	bmap_off_t start;
++	int ret;
++
++	assert("zam-958", min_len > 0);
++	assert("zam-959", max_len >= min_len);
++	assert("zam-960", *start_offset >= end_offset);
++
++	ret = load_and_lock_bnode(bnode);
++	if (ret)
++		return ret;
++
++	data = bnode_working_data(bnode);
++	start = *start_offset;
++
++	while (1) {
++		bmap_off_t end, search_end;
++
++		/* Find the beginning of the zero filled region */
++		if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
++			break;
++		/* Are there at least `min_len' bits from `start' down to
++		 * `end_offset'? */
++		if (start < end_offset + min_len - 1)
++			break;
++
++		/* Do not search to `end_offset' if we need to find less than
++		 * `max_len' zero bits. */
++		if (end_offset + max_len - 1 < start)
++			search_end = start - max_len + 1;
++		else
++			search_end = end_offset;
++
++		if (reiser4_find_last_set_bit(&end, data, search_end, start))
++			end = search_end;
++		else
++			end++;
++
++		if (end + min_len <= start + 1) {
++			if (end < search_end)
++				end = search_end;
++			ret = start - end + 1;
++			*start_offset = end;	/* `end' is lowest offset */
++			assert("zam-987",
++			       reiser4_find_next_set_bit(data, start + 1,
++							 end) >= start + 1);
++			reiser4_set_bits(data, end, start + 1);
++			break;
++		}
++
++		if (end <= end_offset)
++			/* left search boundary reached.
*/ ++ break; ++ start = end - 1; ++ } ++ ++ release_and_unlock_bnode(bnode); ++ return ret; ++} ++ ++/* allocate contiguous range of blocks in bitmap */ ++static int bitmap_alloc_forward(reiser4_block_nr * start, ++ const reiser4_block_nr * end, int min_len, ++ int max_len) ++{ ++ bmap_nr_t bmap, end_bmap; ++ bmap_off_t offset, end_offset; ++ int len; ++ ++ reiser4_block_nr tmp; ++ ++ struct super_block *super = get_current_context()->super; ++ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); ++ ++ parse_blocknr(start, &bmap, &offset); ++ ++ tmp = *end - 1; ++ parse_blocknr(&tmp, &end_bmap, &end_offset); ++ ++end_offset; ++ ++ assert("zam-358", end_bmap >= bmap); ++ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset)); ++ ++ for (; bmap < end_bmap; bmap++, offset = 0) { ++ len = ++ search_one_bitmap_forward(bmap, &offset, max_offset, ++ min_len, max_len); ++ if (len != 0) ++ goto out; ++ } ++ ++ len = ++ search_one_bitmap_forward(bmap, &offset, end_offset, min_len, ++ max_len); ++ out: ++ *start = bmap * max_offset + offset; ++ return len; ++} ++ ++/* allocate contiguous range of blocks in bitmap (from @start to @end in ++ * backward direction) */ ++static int bitmap_alloc_backward(reiser4_block_nr * start, ++ const reiser4_block_nr * end, int min_len, ++ int max_len) ++{ ++ bmap_nr_t bmap, end_bmap; ++ bmap_off_t offset, end_offset; ++ int len; ++ struct super_block *super = get_current_context()->super; ++ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); ++ ++ parse_blocknr(start, &bmap, &offset); ++ parse_blocknr(end, &end_bmap, &end_offset); ++ ++ assert("zam-961", end_bmap <= bmap); ++ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset)); ++ ++ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) { ++ len = ++ search_one_bitmap_backward(bmap, &offset, 0, min_len, ++ max_len); ++ if (len != 0) ++ goto out; ++ } ++ ++ len = ++ search_one_bitmap_backward(bmap, &offset, end_offset, min_len, ++ max_len); ++ out: ++ *start = bmap * max_offset + offset; ++ return len; ++} ++ ++/* plugin->u.space_allocator.alloc_blocks() */ ++static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed, ++ reiser4_block_nr *start, reiser4_block_nr *len) ++{ ++ struct super_block *super = get_current_context()->super; ++ int actual_len; ++ ++ reiser4_block_nr search_start; ++ reiser4_block_nr search_end; ++ ++ assert("zam-398", super != NULL); ++ assert("zam-412", hint != NULL); ++ assert("zam-397", hint->blk <= reiser4_block_count(super)); ++ ++ if (hint->max_dist == 0) ++ search_end = reiser4_block_count(super); ++ else ++ search_end = ++ LIMIT(hint->blk + hint->max_dist, ++ reiser4_block_count(super)); ++ ++ /* We use @hint -> blk as a search start and search from it to the end ++ of the disk or in given region if @hint -> max_dist is not zero */ ++ search_start = hint->blk; ++ ++ actual_len = ++ bitmap_alloc_forward(&search_start, &search_end, 1, needed); ++ ++ /* There is only one bitmap search if max_dist was specified or first ++ pass was from the beginning of the bitmap. We also do one pass for ++ scanning bitmap in backward direction. 
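
In other words, the fallback that follows implements a simple wrap-around
policy: one pass from the hint to the end of the device and, if that finds
nothing and no max_dist was given, a second pass from block 0 up to the
original hint. Reduced to its skeleton (search() is a hypothetical stand-in
for bitmap_alloc_forward(), so this compiles but does not link on its own):

#include <stdint.h>

/* stand-in for bitmap_alloc_forward(): searches [*start, end), updates
 * *start and returns the extent length found, or 0 on failure */
extern int search(uint64_t *start, uint64_t end, int min_len, int max_len);

static int alloc_with_wraparound(uint64_t hint, uint64_t block_count,
                                 uint64_t *start, int needed)
{
        int len;

        *start = hint;
        len = search(start, block_count, 1, needed);    /* pass 1: [hint, end) */
        if (len == 0 && hint != 0) {
                *start = 0;                             /* pass 2: [0, hint) */
                len = search(start, hint, 1, needed);
        }
        return len;     /* 0 becomes -ENOSPC in the caller */
}
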
 */
++	if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
++		/* next step is a scanning from 0 to search_start */
++		search_end = search_start;
++		search_start = 0;
++		actual_len =
++		    bitmap_alloc_forward(&search_start, &search_end, 1, needed);
++	}
++	if (actual_len == 0)
++		return RETERR(-ENOSPC);
++	if (actual_len < 0)
++		return RETERR(actual_len);
++	*len = actual_len;
++	*start = search_start;
++	return 0;
++}
++
++static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
++				 reiser4_block_nr * start,
++				 reiser4_block_nr * len)
++{
++	reiser4_block_nr search_start;
++	reiser4_block_nr search_end;
++	int actual_len;
++
++	ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
++
++	assert("zam-969", super != NULL);
++	assert("zam-970", hint != NULL);
++	assert("zam-971", hint->blk <= reiser4_block_count(super));
++
++	search_start = hint->blk;
++	if (hint->max_dist == 0 || search_start <= hint->max_dist)
++		search_end = 0;
++	else
++		search_end = search_start - hint->max_dist;
++
++	actual_len =
++	    bitmap_alloc_backward(&search_start, &search_end, 1, needed);
++	if (actual_len == 0)
++		return RETERR(-ENOSPC);
++	if (actual_len < 0)
++		return RETERR(actual_len);
++	*len = actual_len;
++	*start = search_start;
++	return 0;
++}
++
++/* plugin->u.space_allocator.alloc_blocks() */
++int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
++				reiser4_blocknr_hint * hint, int needed,
++				reiser4_block_nr * start, reiser4_block_nr * len)
++{
++	if (hint->backward)
++		return alloc_blocks_backward(hint, needed, start, len);
++	return alloc_blocks_forward(hint, needed, start, len);
++}
++
++/* plugin->u.space_allocator.dealloc_blocks(). */
++/* It just frees blocks in the WORKING BITMAP. Usually deletion of formatted
++   and unformatted nodes is deferred until transaction commit. However,
++   deallocation of temporary objects like wandered blocks and transaction
++   commit records requires immediate node deletion from the WORKING BITMAP. */
++void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
++				   reiser4_block_nr start, reiser4_block_nr len)
++{
++	struct super_block *super = reiser4_get_current_sb();
++
++	bmap_nr_t bmap;
++	bmap_off_t offset;
++
++	struct bitmap_node *bnode;
++	int ret;
++
++	assert("zam-468", len != 0);
++	check_block_range(&start, &len);
++
++	parse_blocknr(&start, &bmap, &offset);
++
++	assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
++
++	bnode = get_bnode(super, bmap);
++
++	assert("zam-470", bnode != NULL);
++
++	ret = load_and_lock_bnode(bnode);
++	assert("zam-481", ret == 0);
++
++	reiser4_clear_bits(bnode_working_data(bnode), offset,
++			   (bmap_off_t) (offset + len));
++
++	adjust_first_zero_bit(bnode, offset);
++
++	release_and_unlock_bnode(bnode);
++}
++
++/* plugin->u.space_allocator.check_blocks().
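
A note on adjust_first_zero_bit() above: each bitmap_node caches
first_zero_bit, the lowest offset below which no zero bit can exist, so that
search_one_bitmap_forward() may skip straight to it, and freeing bits can
only ever lower the hint. The invariant in isolation (a sketch, not the
reiser4 types):

struct bmap_hint {
        unsigned int first_zero_bit;    /* no zero bit exists below this */
};

/* freeing bits starting at offset can only lower the hint */
static void hint_on_free(struct bmap_hint *h, unsigned int offset)
{
        if (offset < h->first_zero_bit)
                h->first_zero_bit = offset;
}

/* a forward search never needs to look below the hint */
static unsigned int hint_search_start(const struct bmap_hint *h,
                                      unsigned int requested)
{
        return requested > h->first_zero_bit ? requested : h->first_zero_bit;
}
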
*/ ++void reiser4_check_blocks_bitmap(const reiser4_block_nr * start, ++ const reiser4_block_nr * len, int desired) ++{ ++#if REISER4_DEBUG ++ struct super_block *super = reiser4_get_current_sb(); ++ ++ bmap_nr_t bmap; ++ bmap_off_t start_offset; ++ bmap_off_t end_offset; ++ ++ struct bitmap_node *bnode; ++ int ret; ++ ++ assert("zam-622", len != NULL); ++ check_block_range(start, len); ++ parse_blocknr(start, &bmap, &start_offset); ++ ++ end_offset = start_offset + *len; ++ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize)); ++ ++ bnode = get_bnode(super, bmap); ++ ++ assert("nikita-2215", bnode != NULL); ++ ++ ret = load_and_lock_bnode(bnode); ++ assert("zam-626", ret == 0); ++ ++ assert("nikita-2216", jnode_is_loaded(bnode->wjnode)); ++ ++ if (desired) { ++ assert("zam-623", ++ reiser4_find_next_zero_bit(bnode_working_data(bnode), ++ end_offset, start_offset) ++ >= end_offset); ++ } else { ++ assert("zam-624", ++ reiser4_find_next_set_bit(bnode_working_data(bnode), ++ end_offset, start_offset) ++ >= end_offset); ++ } ++ ++ release_and_unlock_bnode(bnode); ++#endif ++} ++ ++/* conditional insertion of @node into atom's overwrite set if it was not there */ ++static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node) ++{ ++ assert("zam-546", atom != NULL); ++ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT); ++ assert("zam-548", node != NULL); ++ ++ spin_lock_atom(atom); ++ spin_lock_jnode(node); ++ ++ if (node->atom == NULL) { ++ JF_SET(node, JNODE_OVRWR); ++ insert_into_atom_ovrwr_list(atom, node); ++ } else { ++ assert("zam-549", node->atom == atom); ++ } ++ ++ spin_unlock_jnode(node); ++ spin_unlock_atom(atom); ++} ++ ++/* an actor which applies delete set to COMMIT bitmap pages and link modified ++ pages in a single-linked list */ ++static int ++apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start, ++ const reiser4_block_nr * len, void *data) ++{ ++ ++ bmap_nr_t bmap; ++ bmap_off_t offset; ++ int ret; ++ ++ long long *blocks_freed_p = data; ++ ++ struct bitmap_node *bnode; ++ ++ struct super_block *sb = reiser4_get_current_sb(); ++ ++ check_block_range(start, len); ++ ++ parse_blocknr(start, &bmap, &offset); ++ ++ /* FIXME-ZAM: we assume that all block ranges are allocated by this ++ bitmap-based allocator and each block range can't go over a zone of ++ responsibility of one bitmap block; same assumption is used in ++ other journal hooks in bitmap code. */ ++ bnode = get_bnode(sb, bmap); ++ assert("zam-448", bnode != NULL); ++ ++ /* it is safe to unlock atom with is in ASTAGE_PRE_COMMIT */ ++ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT); ++ ret = load_and_lock_bnode(bnode); ++ if (ret) ++ return ret; ++ ++ /* put bnode into atom's overwrite set */ ++ cond_add_to_overwrite_set(atom, bnode->cjnode); ++ ++ data = bnode_commit_data(bnode); ++ ++ ret = bnode_check_crc(bnode); ++ if (ret != 0) ++ return ret; ++ ++ if (len != NULL) { ++ /* FIXME-ZAM: a check that all bits are set should be there */ ++ assert("zam-443", ++ offset + *len <= bmap_bit_count(sb->s_blocksize)); ++ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len)); ++ ++ (*blocks_freed_p) += *len; ++ } else { ++ reiser4_clear_bit(offset, data); ++ (*blocks_freed_p)++; ++ } ++ ++ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize)); ++ ++ release_and_unlock_bnode(bnode); ++ ++ return 0; ++} ++ ++/* plugin->u.space_allocator.pre_commit_hook(). 
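
reiser4_pre_commit_hook_bitmap() below refreshes the commit-bitmap checksum
with adler32_recalc() rather than rehashing the whole block. The identity it
relies on: changing one byte at distance tail from the end of the chunk
shifts the Adler-32 partial sums by s1 += (new - old) and
s2 += tail * (new - old), both mod 65521. A user-space cross-check of the
shortcut against a full recomputation (same formulas as reiser4_adler32()
and adler32_recalc() in this patch):

#include <stdio.h>
#include <stdint.h>

#define BASE 65521u

static uint32_t adler32(const unsigned char *p, uint32_t len)
{
        uint32_t s1 = 1, s2 = 0, i;

        for (i = 0; i < len; i++) {
                s1 = (s1 + p[i]) % BASE;
                s2 = (s2 + s1) % BASE;
        }
        return (s2 << 16) | s1;
}

/* incremental update for one changed byte; tail = len - byte_index */
static uint32_t recalc(uint32_t adler, unsigned char old, unsigned char new_,
                       uint32_t tail)
{
        uint32_t delta = new_ - old + 2 * BASE;
        uint32_t s1 = adler & 0xffff;
        uint32_t s2 = (adler >> 16) & 0xffff;

        s1 = (delta + s1) % BASE;
        s2 = (delta * tail + s2) % BASE;
        return (s2 << 16) | s1;
}

int main(void)
{
        unsigned char buf[256];
        uint32_t before, i;

        for (i = 0; i < sizeof(buf); i++)
                buf[i] = (unsigned char)(i * 7);
        before = adler32(buf, sizeof(buf));

        buf[100] ^= 0x5a;       /* flip one byte at index 100 */
        printf("%s\n", recalc(before, buf[100] ^ 0x5a, buf[100],
                              sizeof(buf) - 100) == adler32(buf, sizeof(buf))
               ? "match" : "MISMATCH");
        return 0;
}

The incremental form touches one byte instead of a whole 4 KB block, which
is why the hook can afford to update the checksum once per freshly allocated
node.
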
 */
++/* It just applies transaction changes to the fs-wide COMMIT BITMAP, hoping
++   the rest is done by the transaction manager (allocate wandered locations
++   for COMMIT BITMAP blocks, copy COMMIT BITMAP blocks data). */
++/* Only one instance of this function can be running at any given time,
++   because only one transaction can be committed at a time, therefore it is
++   safe to access some global variables without any locking */
++
++int reiser4_pre_commit_hook_bitmap(void)
++{
++	struct super_block *super = reiser4_get_current_sb();
++	txn_atom *atom;
++
++	long long blocks_freed = 0;
++
++	atom = get_current_atom_locked();
++	assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
++	spin_unlock_atom(atom);
++
++	{	/* scan atom's captured list and find all freshly allocated
++		 * nodes, mark the corresponding bits in COMMIT BITMAP as
++		 * used */
++		struct list_head *head = ATOM_CLEAN_LIST(atom);
++		jnode *node = list_entry(head->next, jnode, capture_link);
++
++		while (head != &node->capture_link) {
++			/* we detect freshly allocated jnodes */
++			if (JF_ISSET(node, JNODE_RELOC)) {
++				int ret;
++				bmap_nr_t bmap;
++
++				bmap_off_t offset;
++				bmap_off_t index;
++				struct bitmap_node *bn;
++				__u32 size = bmap_size(super->s_blocksize);
++				__u32 crc;
++				char byte;
++
++				assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
++				assert("zam-460",
++				       !reiser4_blocknr_is_fake(&node->blocknr));
++
++				parse_blocknr(&node->blocknr, &bmap, &offset);
++				bn = get_bnode(super, bmap);
++
++				index = offset >> 3;
++				assert("vpf-276", index < size);
++
++				ret = bnode_check_crc(bn);
++				if (ret != 0)
++					return ret;
++
++				check_bnode_loaded(bn);
++				load_and_lock_bnode(bn);
++
++				byte = *(bnode_commit_data(bn) + index);
++				reiser4_set_bit(offset, bnode_commit_data(bn));
++
++				crc = adler32_recalc(bnode_commit_crc(bn), byte,
++						     *(bnode_commit_data(bn) +
++						       index),
++						     size - index);
++				bnode_set_commit_crc(bn, crc);
++
++				release_and_unlock_bnode(bn);
++
++				ret = bnode_check_crc(bn);
++				if (ret != 0)
++					return ret;
++
++				/* the correctness of this depends on how a new
++				   j-node is inserted into the clean list,
++				   because we are scanning that same list now.
++				   It is OK if insertion is done at the list
++				   front */
++				cond_add_to_overwrite_set(atom, bn->cjnode);
++			}
++
++			node = list_entry(node->capture_link.next, jnode,
++					  capture_link);
++		}
++	}
++
++	blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
++			     &blocks_freed, 0);
++
++	blocks_freed -= atom->nr_blocks_allocated;
++
++	{
++		reiser4_super_info_data *sbinfo;
++
++		sbinfo = get_super_private(super);
++
++		spin_lock_reiser4_super(sbinfo);
++		sbinfo->blocks_free_committed += blocks_freed;
++		spin_unlock_reiser4_super(sbinfo);
++	}
++
++	return 0;
++}
++
++/* plugin->u.space_allocator.init_allocator
++   constructor of reiser4_space_allocator object.
It is called on fs mount */ ++int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator, ++ struct super_block *super, void *arg) ++{ ++ struct bitmap_allocator_data *data = NULL; ++ bmap_nr_t bitmap_blocks_nr; ++ bmap_nr_t i; ++ ++ assert("nikita-3039", reiser4_schedulable()); ++ ++ /* getting memory for bitmap allocator private data holder */ ++ data = ++ kmalloc(sizeof(struct bitmap_allocator_data), ++ reiser4_ctx_gfp_mask_get()); ++ ++ if (data == NULL) ++ return RETERR(-ENOMEM); ++ ++ /* allocation and initialization for the array of bnodes */ ++ bitmap_blocks_nr = get_nr_bmap(super); ++ ++ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps ++ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17, ++ may I never meet someone who still uses the ia32 architecture when ++ storage devices of that size enter the market, and wants to use ia32 ++ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and, ++ probably, another dynamic data structure should replace a static ++ array of bnodes. */ ++ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */ ++ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr); ++ if (data->bitmap == NULL) { ++ kfree(data); ++ return RETERR(-ENOMEM); ++ } ++ ++ for (i = 0; i < bitmap_blocks_nr; i++) ++ init_bnode(data->bitmap + i, super, i); ++ ++ allocator->u.generic = data; ++ ++#if REISER4_DEBUG ++ get_super_private(super)->min_blocks_used += bitmap_blocks_nr; ++#endif ++ ++ /* Load all bitmap blocks at mount time. */ ++ if (!test_bit ++ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) { ++ __u64 start_time, elapsed_time; ++ struct bitmap_node *bnode; ++ int ret; ++ ++ if (REISER4_DEBUG) ++ printk(KERN_INFO "loading reiser4 bitmap..."); ++ start_time = jiffies; ++ ++ for (i = 0; i < bitmap_blocks_nr; i++) { ++ bnode = data->bitmap + i; ++ ret = load_and_lock_bnode(bnode); ++ if (ret) { ++ reiser4_destroy_allocator_bitmap(allocator, ++ super); ++ return ret; ++ } ++ release_and_unlock_bnode(bnode); ++ } ++ ++ elapsed_time = jiffies - start_time; ++ if (REISER4_DEBUG) ++ printk("...done (%llu jiffies)\n", ++ (unsigned long long)elapsed_time); ++ } ++ ++ return 0; ++} ++ ++/* plugin->u.space_allocator.destroy_allocator ++ destructor. 
It is called on fs unmount */
++int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
++				     struct super_block *super)
++{
++	bmap_nr_t bitmap_blocks_nr;
++	bmap_nr_t i;
++
++	struct bitmap_allocator_data *data = allocator->u.generic;
++
++	assert("zam-414", data != NULL);
++	assert("zam-376", data->bitmap != NULL);
++
++	bitmap_blocks_nr = get_nr_bmap(super);
++
++	for (i = 0; i < bitmap_blocks_nr; i++) {
++		struct bitmap_node *bnode = data->bitmap + i;
++
++		mutex_lock(&bnode->mutex);
++
++#if REISER4_DEBUG
++		if (atomic_read(&bnode->loaded)) {
++			jnode *wj = bnode->wjnode;
++			jnode *cj = bnode->cjnode;
++
++			assert("zam-480", jnode_page(cj) != NULL);
++			assert("zam-633", jnode_page(wj) != NULL);
++
++			assert("zam-634",
++			       memcmp(jdata(wj), jdata(cj),
++				      bmap_size(super->s_blocksize)) == 0);
++
++		}
++#endif
++		done_bnode(bnode);
++		mutex_unlock(&bnode->mutex);
++	}
++
++	vfree(data->bitmap);
++	kfree(data);
++
++	allocator->u.generic = NULL;
++
++	return 0;
++}
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 79
++ * scroll-step: 1
++ * End:
++ */
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.33/fs/reiser4/plugin/space/bitmap.h
+--- linux-2.6.33.orig/fs/reiser4/plugin/space/bitmap.h	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/space/bitmap.h	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,47 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
++#define __REISER4_PLUGIN_SPACE_BITMAP_H__
++
++#include "../../dformat.h"
++#include "../../block_alloc.h"
++
++#include <linux/types.h>	/* for __u??  */
++#include <linux/fs.h>		/* for struct super_block  */
++/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
++/* Declarations of functions implementing methods of the space allocator
++   plugin for the bitmap-based allocator. The functions themselves are in
++   bitmap.c */
++extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
++					 struct super_block *, void *);
++extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
++					    struct super_block *);
++extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
++				       reiser4_blocknr_hint *, int needed,
++				       reiser4_block_nr * start,
++				       reiser4_block_nr * len);
++extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
++					const reiser4_block_nr *, int);
++extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
++					  reiser4_block_nr,
++					  reiser4_block_nr);
++extern int reiser4_pre_commit_hook_bitmap(void);
++
++#define reiser4_post_commit_hook_bitmap() do{}while(0)
++#define reiser4_post_write_back_hook_bitmap() do{}while(0)
++#define reiser4_print_info_bitmap(pref, al) do{}while(0)
++
++typedef __u64 bmap_nr_t;
++typedef __u32 bmap_off_t;
++
++#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
++
++/* Make Linus happy.
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/Makefile linux-2.6.33/fs/reiser4/plugin/space/Makefile +--- linux-2.6.33.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/space/Makefile 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,4 @@ ++obj-$(CONFIG_REISER4_FS) += space_plugins.o ++ ++space_plugins-objs := \ ++ bitmap.o +diff -urN linux-2.6.33.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.33/fs/reiser4/plugin/space/space_allocator.h +--- linux-2.6.33.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/plugin/space/space_allocator.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,80 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++#ifndef __SPACE_ALLOCATOR_H__ ++#define __SPACE_ALLOCATOR_H__ ++ ++#include "../../forward.h" ++#include "bitmap.h" ++/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now, ++ * but... */ ++#define DEF_SPACE_ALLOCATOR(allocator) \ ++ \ ++static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \ ++{ \ ++ return reiser4_init_allocator_##allocator (al, s, opaque); \ ++} \ ++ \ ++static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \ ++{ \ ++ reiser4_destroy_allocator_##allocator (al, s); \ ++} \ ++ \ ++static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \ ++ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \ ++{ \ ++ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \ ++} \ ++static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \ ++{ \ ++ reiser4_dealloc_blocks_##allocator (al, start, len); \ ++} \ ++ \ ++static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \ ++{ \ ++ reiser4_check_blocks_##allocator (start, end, desired); \ ++} \ ++ \ ++static inline void sa_pre_commit_hook (void) \ ++{ \ ++ reiser4_pre_commit_hook_##allocator (); \ ++} \ ++ \ ++static inline void sa_post_commit_hook (void) \ ++{ \ ++ reiser4_post_commit_hook_##allocator (); \ ++} \ ++ \ ++static inline void sa_post_write_back_hook (void) \ ++{ \ ++ reiser4_post_write_back_hook_##allocator(); \ ++} \ ++ \ ++static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \ ++{ \ ++ reiser4_print_info_##allocator (prefix, al); \ ++} ++ ++DEF_SPACE_ALLOCATOR(bitmap) ++ ++/* this object is part of reiser4 private in-core super block */ ++struct reiser4_space_allocator { ++ union { ++ /* space allocators might use this pointer to reference their ++ * data. */ ++ void *generic; ++ } u; ++}; ++ ++/* __SPACE_ALLOCATOR_H__ */ ++#endif ++ ++/* Make Linus happy. 
++   Local variables:
++   c-indentation-style: "K&R"
++   mode-name: "LC"
++   c-basic-offset: 8
++   tab-width: 8
++   fill-column: 120
++   scroll-step: 1
++   End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.33/fs/reiser4/plugin/tail_policy.c
+--- linux-2.6.33.orig/fs/reiser4/plugin/tail_policy.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/plugin/tail_policy.c	2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,113 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Formatting policy plugins */
++
++/*
++ * A formatting policy plugin is used by the object plugin (of a regular
++ * file) to convert a file between its two representations.
++ *
++ * Currently the following policies are implemented:
++ *	never store file in formatted nodes
++ *	always store file in formatted nodes
++ *	store file in formatted nodes if file is smaller than 4 blocks (default)
++ */
++
++#include "../tree.h"
++#include "../inode.h"
++#include "../super.h"
++#include "object.h"
++#include "plugin.h"
++#include "node/node.h"
++#include "plugin_header.h"
++
++#include <linux/pagemap.h>
++#include <linux/fs.h>		/* For struct inode */
++
++/**
++ * have_formatting_never - never store the file's tail as a direct item
++ * @inode: inode to operate on
++ * @size: new object size
++ */
++/* Audited by: green(2002.06.12) */
++static int have_formatting_never(const struct inode *inode UNUSED_ARG
++				 /* inode to operate on */ ,
++				 loff_t size UNUSED_ARG/* new object size */)
++{
++	return 0;
++}
++
++/* Always store file's tail as direct item */
++/* Audited by: green(2002.06.12) */
++static int
++have_formatting_always(const struct inode *inode UNUSED_ARG
++		       /* inode to operate on */ ,
++		       loff_t size UNUSED_ARG/* new object size */)
++{
++	return 1;
++}
++
++/* This function tests whether the file denoted by @inode should be stored
++   as tail items only or as extents only.
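
All three policies share one predicate shape and are selected by indexing the
formatting_plugins[] table defined at the end of this file. The dispatch,
boiled down to standalone C (hypothetical types, not the reiser4 plugin
structs):

#include <stdio.h>

typedef long long off_type;

struct policy {
        const char *label;
        int (*have_tail)(off_type size, off_type blocksize);
};

static int never(off_type s, off_type b)  { (void)s; (void)b; return 0; }
static int always(off_type s, off_type b) { (void)s; (void)b; return 1; }
static int small4(off_type s, off_type b) { return s <= 4 * b; }

static const struct policy policies[] = {
        { "never",   never },
        { "always",  always },
        { "4blocks", small4 },
};

int main(void)
{
        off_type size = 10000, bs = 4096;
        unsigned i;

        for (i = 0; i < sizeof(policies) / sizeof(policies[0]); i++)
                printf("%-8s -> %s\n", policies[i].label,
                       policies[i].have_tail(size, bs) ? "tails" : "extents");
        return 0;
}
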
*/ ++static int ++have_formatting_default(const struct inode *inode UNUSED_ARG ++ /* inode to operate on */ , ++ loff_t size/* new object size */) ++{ ++ assert("umka-1253", inode != NULL); ++ ++ if (size > inode->i_sb->s_blocksize * 4) ++ return 0; ++ ++ return 1; ++} ++ ++/* tail plugins */ ++formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = { ++ [NEVER_TAILS_FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = NEVER_TAILS_FORMATTING_ID, ++ .pops = NULL, ++ .label = "never", ++ .desc = "Never store file's tail", ++ .linkage = {NULL, NULL} ++ }, ++ .have_tail = have_formatting_never ++ }, ++ [ALWAYS_TAILS_FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = ALWAYS_TAILS_FORMATTING_ID, ++ .pops = NULL, ++ .label = "always", ++ .desc = "Always store file's tail", ++ .linkage = {NULL, NULL} ++ }, ++ .have_tail = have_formatting_always ++ }, ++ [SMALL_FILE_FORMATTING_ID] = { ++ .h = { ++ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, ++ .id = SMALL_FILE_FORMATTING_ID, ++ .pops = NULL, ++ .label = "4blocks", ++ .desc = "store files shorter than 4 blocks in tail items", ++ .linkage = {NULL, NULL} ++ }, ++ .have_tail = have_formatting_default ++ } ++}; ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/pool.c linux-2.6.33/fs/reiser4/pool.c +--- linux-2.6.33.orig/fs/reiser4/pool.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/pool.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,231 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Fast pool allocation. ++ ++ There are situations when some sub-system normally asks memory allocator ++ for only few objects, but under some circumstances could require much ++ more. Typical and actually motivating example is tree balancing. It needs ++ to keep track of nodes that were involved into it, and it is well-known ++ that in reasonable packed balanced tree most (92.938121%) percent of all ++ balancings end up after working with only few nodes (3.141592 on ++ average). But in rare cases balancing can involve much more nodes ++ (3*tree_height+1 in extremal situation). ++ ++ On the one hand, we don't want to resort to dynamic allocation (slab, ++ malloc(), etc.) to allocate data structures required to keep track of ++ nodes during balancing. On the other hand, we cannot statically allocate ++ required amount of space on the stack, because first: it is useless wastage ++ of precious resource, and second: this amount is unknown in advance (tree ++ height can change). ++ ++ Pools, implemented in this file are solution for this problem: ++ ++ - some configurable amount of objects is statically preallocated on the ++ stack ++ ++ - if this preallocated pool is exhausted and more objects is requested ++ they are allocated dynamically. ++ ++ Pools encapsulate distinction between statically and dynamically allocated ++ objects. Both allocation and recycling look exactly the same. ++ ++ To keep track of dynamically allocated objects, pool adds its own linkage ++ to each object. ++ ++ NOTE-NIKITA This linkage also contains some balancing-specific data. This ++ is not perfect. On the other hand, balancing is currently the only client ++ of pool code. 
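
Stripped of the reiser4 types, the scheme described above, a fixed
preallocated array that overflows into the general-purpose allocator, looks
roughly like this (user-space sketch, malloc() standing in for kmalloc(); the
real pool below additionally keeps free/used/extra lists so that objects can
be recycled, which this sketch omits):

#include <stdlib.h>

#define PREALLOC 8

struct obj {
        int from_heap;          /* 0: lives in the preallocated array */
        /* ... payload ... */
};

struct pool {
        struct obj prealloc[PREALLOC];  /* cheap, always available */
        int used;
};

static struct obj *pool_alloc(struct pool *p)
{
        struct obj *o;

        if (p->used < PREALLOC) {       /* fast path: preallocated slot */
                o = &p->prealloc[p->used++];
                o->from_heap = 0;
                return o;
        }
        o = malloc(sizeof(*o));         /* rare overflow path */
        if (o)
                o->from_heap = 1;
        return o;
}

static void pool_free(struct pool *p, struct obj *o)
{
        (void)p;
        if (o->from_heap)
                free(o);
        /* preallocated slots are reclaimed with the pool itself */
}
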
++ ++ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation ++ functions in the style of tslist/tshash, i.e., make them unreadable, but ++ type-safe. ++ ++*/ ++ ++#include "debug.h" ++#include "pool.h" ++#include "super.h" ++ ++#include <linux/types.h> ++#include <linux/err.h> ++ ++/* initialize new pool object @h */ ++static void reiser4_init_pool_obj(struct reiser4_pool_header *h) ++{ ++ INIT_LIST_HEAD(&h->usage_linkage); ++ INIT_LIST_HEAD(&h->level_linkage); ++ INIT_LIST_HEAD(&h->extra_linkage); ++} ++ ++/* initialize new pool */ ++void reiser4_init_pool(struct reiser4_pool *pool /* pool to initialize */ , ++ size_t obj_size /* size of objects in @pool */ , ++ int num_of_objs /* number of preallocated objects */ , ++ char *data/* area for preallocated objects */) ++{ ++ struct reiser4_pool_header *h; ++ int i; ++ ++ assert("nikita-955", pool != NULL); ++ assert("nikita-1044", obj_size > 0); ++ assert("nikita-956", num_of_objs >= 0); ++ assert("nikita-957", data != NULL); ++ ++ memset(pool, 0, sizeof *pool); ++ pool->obj_size = obj_size; ++ pool->data = data; ++ INIT_LIST_HEAD(&pool->free); ++ INIT_LIST_HEAD(&pool->used); ++ INIT_LIST_HEAD(&pool->extra); ++ memset(data, 0, obj_size * num_of_objs); ++ for (i = 0; i < num_of_objs; ++i) { ++ h = (struct reiser4_pool_header *) (data + i * obj_size); ++ reiser4_init_pool_obj(h); ++ /* add pool header to the end of pool's free list */ ++ list_add_tail(&h->usage_linkage, &pool->free); ++ } ++} ++ ++/* release pool resources ++ ++ Release all resources acquired by this pool, specifically, dynamically ++ allocated objects. ++ ++*/ ++void reiser4_done_pool(struct reiser4_pool *pool UNUSED_ARG) ++{ ++} ++ ++/* allocate carry object from @pool ++ ++ First, try to get preallocated object. If this fails, resort to dynamic ++ allocation. ++ ++*/ ++static void *reiser4_pool_alloc(struct reiser4_pool *pool) ++{ ++ struct reiser4_pool_header *result; ++ ++ assert("nikita-959", pool != NULL); ++ ++ if (!list_empty(&pool->free)) { ++ struct list_head *linkage; ++ ++ linkage = pool->free.next; ++ list_del(linkage); ++ INIT_LIST_HEAD(linkage); ++ result = list_entry(linkage, struct reiser4_pool_header, ++ usage_linkage); ++ BUG_ON(!list_empty(&result->level_linkage) || ++ !list_empty(&result->extra_linkage)); ++ } else { ++ /* pool is empty. Extra allocations don't deserve dedicated ++ slab to be served from, as they are expected to be rare. */ ++ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get()); ++ if (result != 0) { ++ reiser4_init_pool_obj(result); ++ list_add(&result->extra_linkage, &pool->extra); ++ } else ++ return ERR_PTR(RETERR(-ENOMEM)); ++ BUG_ON(!list_empty(&result->usage_linkage) || ++ !list_empty(&result->level_linkage)); ++ } ++ ++pool->objs; ++ list_add(&result->usage_linkage, &pool->used); ++ memset(result + 1, 0, pool->obj_size - sizeof *result); ++ return result; ++} ++ ++/* return object back to the pool */ ++void reiser4_pool_free(struct reiser4_pool *pool, ++ struct reiser4_pool_header *h) ++{ ++ assert("nikita-961", h != NULL); ++ assert("nikita-962", pool != NULL); ++ ++ --pool->objs; ++ assert("nikita-963", pool->objs >= 0); ++ ++ list_del_init(&h->usage_linkage); ++ list_del_init(&h->level_linkage); ++ ++ if (list_empty(&h->extra_linkage)) ++ /* ++ * pool header is not an extra one. 
Push it onto free list ++ * using usage_linkage ++ */ ++ list_add(&h->usage_linkage, &pool->free); ++ else { ++ /* remove pool header from pool's extra list and kfree it */ ++ list_del(&h->extra_linkage); ++ kfree(h); ++ } ++} ++ ++/* add new object to the carry level list ++ ++ Carry level is FIFO most of the time, but not always. Complications arise ++ when make_space() function tries to go to the left neighbor and thus adds ++ carry node before existing nodes, and also, when updating delimiting keys ++ after moving data between two nodes, we want left node to be locked before ++ right node. ++ ++ Latter case is confusing at the first glance. Problem is that COP_UPDATE ++ opration that updates delimiting keys is sometimes called with two nodes ++ (when data are moved between two nodes) and sometimes with only one node ++ (when leftmost item is deleted in a node). In any case operation is ++ supplied with at least node whose left delimiting key is to be updated ++ (that is "right" node). ++ ++ @pool - from which to allocate new object; ++ @list - where to add object; ++ @reference - after (or before) which existing object to add ++*/ ++struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool, ++ struct list_head *list, ++ pool_ordering order, ++ struct reiser4_pool_header *reference) ++{ ++ struct reiser4_pool_header *result; ++ ++ assert("nikita-972", pool != NULL); ++ ++ result = reiser4_pool_alloc(pool); ++ if (IS_ERR(result)) ++ return result; ++ ++ assert("nikita-973", result != NULL); ++ ++ switch (order) { ++ case POOLO_BEFORE: ++ __list_add(&result->level_linkage, ++ reference->level_linkage.prev, ++ &reference->level_linkage); ++ break; ++ case POOLO_AFTER: ++ __list_add(&result->level_linkage, ++ &reference->level_linkage, ++ reference->level_linkage.next); ++ break; ++ case POOLO_LAST: ++ list_add_tail(&result->level_linkage, list); ++ break; ++ case POOLO_FIRST: ++ list_add(&result->level_linkage, list); ++ break; ++ default: ++ wrong_return_value("nikita-927", "order"); ++ } ++ return result; ++} ++ ++/* Make Linus happy. 
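
As far as can be read off the declarations above, a caller embeds a
reiser4_pool_header at the start of its object, preallocates an array, and
hands both to reiser4_init_pool(). A hypothetical caller (illustration only;
it compiles only in-tree, and LIST_HEAD/IS_ERR come from the list and err
headers the pool code already includes):

#include "pool.h"

#define MY_PREALLOC 4

struct my_item {
        struct reiser4_pool_header header; /* must come first: pool links */
        int payload;
};

static void pool_usage_example(void)
{
        struct reiser4_pool pool;
        char prealloc[sizeof(struct my_item) * MY_PREALLOC];
        LIST_HEAD(level);
        struct reiser4_pool_header *h;

        reiser4_init_pool(&pool, sizeof(struct my_item), MY_PREALLOC,
                          prealloc);

        /* append one object at the end of the carry-level list */
        h = reiser4_add_obj(&pool, &level, POOLO_LAST, NULL);
        if (!IS_ERR(h)) {
                struct my_item *it = (struct my_item *)h;

                it->payload = 42;       /* payload was zeroed by the pool */
                reiser4_pool_free(&pool, h);
        }
        reiser4_done_pool(&pool);
}
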
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/pool.h linux-2.6.33/fs/reiser4/pool.h +--- linux-2.6.33.orig/fs/reiser4/pool.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/pool.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,57 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Fast pool allocation */ ++ ++#ifndef __REISER4_POOL_H__ ++#define __REISER4_POOL_H__ ++ ++#include <linux/types.h> ++ ++struct reiser4_pool { ++ size_t obj_size; ++ int objs; ++ char *data; ++ struct list_head free; ++ struct list_head used; ++ struct list_head extra; ++}; ++ ++struct reiser4_pool_header { ++ /* object is either on free or "used" lists */ ++ struct list_head usage_linkage; ++ struct list_head level_linkage; ++ struct list_head extra_linkage; ++}; ++ ++typedef enum { ++ POOLO_BEFORE, ++ POOLO_AFTER, ++ POOLO_LAST, ++ POOLO_FIRST ++} pool_ordering; ++ ++/* pool manipulation functions */ ++ ++extern void reiser4_init_pool(struct reiser4_pool *pool, size_t obj_size, ++ int num_of_objs, char *data); ++extern void reiser4_done_pool(struct reiser4_pool *pool); ++extern void reiser4_pool_free(struct reiser4_pool *pool, ++ struct reiser4_pool_header *h); ++struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool, ++ struct list_head *list, ++ pool_ordering order, ++ struct reiser4_pool_header *reference); ++ ++/* __REISER4_POOL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/readahead.c linux-2.6.33/fs/reiser4/readahead.c +--- linux-2.6.33.orig/fs/reiser4/readahead.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/readahead.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,140 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "forward.h" ++#include "tree.h" ++#include "tree_walk.h" ++#include "super.h" ++#include "inode.h" ++#include "key.h" ++#include "znode.h" ++ ++#include <linux/swap.h> /* for totalram_pages */ ++ ++void reiser4_init_ra_info(ra_info_t *rai) ++{ ++ rai->key_to_stop = *reiser4_min_key(); ++} ++ ++/* global formatted node readahead parameter. It can be set by mount option ++ * -o readahead:NUM:1 */ ++static inline int ra_adjacent_only(int flags) ++{ ++ return flags & RA_ADJACENT_ONLY; ++} ++ ++/* this is used by formatted_readahead to decide whether read for right neighbor ++ * of node is to be issued. 
It returns 1 if right neighbor's first key is less ++ * or equal to readahead's stop key */ ++static int should_readahead_neighbor(znode * node, ra_info_t *info) ++{ ++ int result; ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = keyle(znode_get_rd_key(node), &info->key_to_stop); ++ read_unlock_dk(znode_get_tree(node)); ++ return result; ++} ++ ++#define LOW_MEM_PERCENTAGE (5) ++ ++static int low_on_memory(void) ++{ ++ unsigned int freepages; ++ ++ freepages = nr_free_pages(); ++ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100); ++} ++ ++/* start read for @node and for a few of its right neighbors */ ++void formatted_readahead(znode * node, ra_info_t *info) ++{ ++ struct formatted_ra_params *ra_params; ++ znode *cur; ++ int i; ++ int grn_flags; ++ lock_handle next_lh; ++ ++ /* do nothing if node block number has not been assigned to node (which ++ * means it is still in cache). */ ++ if (reiser4_blocknr_is_fake(znode_get_block(node))) ++ return; ++ ++ ra_params = get_current_super_ra_params(); ++ ++ if (znode_page(node) == NULL) ++ jstartio(ZJNODE(node)); ++ ++ if (znode_get_level(node) != LEAF_LEVEL) ++ return; ++ ++ /* don't waste memory for read-ahead when low on memory */ ++ if (low_on_memory()) ++ return; ++ ++ /* We can have locked nodes on upper tree levels, in this situation lock ++ priorities do not help to resolve deadlocks, we have to use TRY_LOCK ++ here. */ ++ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK); ++ ++ i = 0; ++ cur = zref(node); ++ init_lh(&next_lh); ++ while (i < ra_params->max) { ++ const reiser4_block_nr * nextblk; ++ ++ if (!should_readahead_neighbor(cur, info)) ++ break; ++ ++ if (reiser4_get_right_neighbor ++ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags)) ++ break; ++ ++ nextblk = znode_get_block(next_lh.node); ++ if (reiser4_blocknr_is_fake(nextblk) || ++ (ra_adjacent_only(ra_params->flags) ++ && *nextblk != *znode_get_block(cur) + 1)) ++ break; ++ ++ zput(cur); ++ cur = zref(next_lh.node); ++ done_lh(&next_lh); ++ if (znode_page(cur) == NULL) ++ jstartio(ZJNODE(cur)); ++ else ++ /* Do not scan read-ahead window if pages already ++ * allocated (and i/o already started). */ ++ break; ++ ++ i++; ++ } ++ zput(cur); ++ done_lh(&next_lh); ++} ++ ++void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap) ++{ ++ reiser4_key *stop_key; ++ ++ assert("nikita-3542", dir != NULL); ++ assert("nikita-3543", tap != NULL); ++ ++ stop_key = &tap->ra_info.key_to_stop; ++ /* initialize readdir readahead information: include into readahead ++ * stat data of all files of the directory */ ++ set_key_locality(stop_key, get_inode_oid(dir)); ++ set_key_type(stop_key, KEY_SD_MINOR); ++ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key())); ++ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key())); ++ set_key_offset(stop_key, get_key_offset(reiser4_max_key())); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/readahead.h linux-2.6.33/fs/reiser4/readahead.h +--- linux-2.6.33.orig/fs/reiser4/readahead.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/readahead.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,52 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#ifndef __READAHEAD_H__ ++#define __READAHEAD_H__ ++ ++#include "key.h" ++ ++typedef enum { ++ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. 
++ Default is NO (not only adjacent) */ ++} ra_global_flags; ++ ++/* reiser4 super block has a field of this type. ++ It controls readahead during tree traversals */ ++struct formatted_ra_params { ++ unsigned long max; /* request not more than this amount of nodes. ++ Default is totalram_pages / 4 */ ++ int flags; ++}; ++ ++typedef struct { ++ reiser4_key key_to_stop; ++} ra_info_t; ++ ++void formatted_readahead(znode * , ra_info_t *); ++void reiser4_init_ra_info(ra_info_t *rai); ++ ++struct reiser4_file_ra_state { ++ loff_t start; /* Current window */ ++ loff_t size; ++ loff_t next_size; /* Next window size */ ++ loff_t ahead_start; /* Ahead window */ ++ loff_t ahead_size; ++ loff_t max_window_size; /* Maximum readahead window */ ++ loff_t slow_start; /* enlarging r/a size algorithm. */ ++}; ++ ++extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap); ++ ++/* __READAHEAD_H__ */ ++#endif ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/README linux-2.6.33/fs/reiser4/README +--- linux-2.6.33.orig/fs/reiser4/README 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/README 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,128 @@ ++[LICENSING] ++ ++Reiser4 is hereby licensed under the GNU General ++Public License version 2. ++ ++Source code files that contain the phrase "licensing governed by ++reiser4/README" are "governed files" throughout this file. Governed ++files are licensed under the GPL. The portions of them owned by Hans ++Reiser, or authorized to be licensed by him, have been in the past, ++and likely will be in the future, licensed to other parties under ++other licenses. If you add your code to governed files, and don't ++want it to be owned by Hans Reiser, put your copyright label on that ++code so the poor blight and his customers can keep things straight. ++All portions of governed files not labeled otherwise are owned by Hans ++Reiser, and by adding your code to it, widely distributing it to ++others or sending us a patch, and leaving the sentence in stating that ++licensing is governed by the statement in this file, you accept this. ++It will be a kindness if you identify whether Hans Reiser is allowed ++to license code labeled as owned by you on your behalf other than ++under the GPL, because he wants to know if it is okay to do so and put ++a check in the mail to you (for non-trivial improvements) when he ++makes his next sale. He makes no guarantees as to the amount if any, ++though he feels motivated to motivate contributors, and you can surely ++discuss this with him before or after contributing. You have the ++right to decline to allow him to license your code contribution other ++than under the GPL. ++ ++Further licensing options are available for commercial and/or other ++interests directly from Hans Reiser: reiser@namesys.com. If you interpret ++the GPL as not allowing those additional licensing options, you read ++it wrongly, and Richard Stallman agrees with me, when carefully read ++you can see that those restrictions on additional terms do not apply ++to the owner of the copyright, and my interpretation of this shall ++govern for this license. ++ ++[END LICENSING] ++ ++Reiser4 is a file system based on dancing tree algorithms, and is ++described at http://www.namesys.com ++ ++mkfs.reiser4 and other utilities are on our webpage or wherever your ++Linux provider put them. 
You really want to be running the latest ++version off the website if you use fsck. ++ ++Yes, if you update your reiser4 kernel module you do have to ++recompile your kernel, most of the time. The errors you get will be ++quite cryptic if you forget to do so. ++ ++Hideous Commercial Pitch: Spread your development costs across other OS ++vendors. Select from the best in the world, not the best in your ++building, by buying from third party OS component suppliers. Leverage ++the software component development power of the internet. Be the most ++aggressive in taking advantage of the commercial possibilities of ++decentralized internet development, and add value through your branded ++integration that you sell as an operating system. Let your competitors ++be the ones to compete against the entire internet by themselves. Be ++hip, get with the new economic trend, before your competitors do. Send ++email to reiser@namesys.com ++ ++Hans Reiser was the primary architect of Reiser4, but a whole team ++chipped in their ideas. He invested everything he had into Namesys ++for 5.5 dark years of no money before Reiser3 finally started to work well ++enough to bring in money. He owns the copyright. ++ ++DARPA was the primary sponsor of Reiser4. DARPA does not endorse ++Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal ++opinion, unique in its willingness to invest into things more ++theoretical than the VC community can readily understand, and more ++longterm than allows them to be sure that they will be the ones to ++extract the economic benefits from. DARPA also integrated us into a ++security community that transformed our security worldview. ++ ++Vladimir Saveliev is our lead programmer, with us from the beginning, ++and he worked long hours writing the cleanest code. This is why he is ++now the lead programmer after years of commitment to our work. He ++always made the effort to be the best he could be, and to make his ++code the best that it could be. What resulted was quite remarkable. I ++don't think that money can ever motivate someone to work the way he ++did, he is one of the most selfless men I know. ++ ++Alexander Lyamin was our sysadmin, and helped to educate us in ++security issues. Moscow State University and IMT were very generous ++in the internet access they provided us, and in lots of other little ++ways that a generous institution can be. ++ ++Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the ++locking code, the block allocator, and finished the flushing code. ++His code is always crystal clean and well structured. ++ ++Nikita Danilov wrote the core of the balancing code, the core of the ++plugins code, and the directory code. He worked at a steady pace of long ++hours that produced a whole lot of well abstracted code. He is our ++senior computer scientist. ++ ++Vladimir Demidov wrote the parser. Writing an in-kernel parser is ++something very few persons have the skills for, and it is thanks to ++him that we can say that the parser is really not so big compared to ++various bits of our other code, and making a parser work in the kernel ++was not so complicated as everyone would imagine mainly because it was ++him doing it... ++ ++Joshua McDonald wrote the transaction manager, and the flush code. ++The flush code unexpectedly turned out to be extremely hairy for reasons ++you can read about on our web page, and he did a great job on an ++extremely difficult task. ++ ++Nina Reiser handled our accounting, government relations, and much ++more. 
++ ++Ramon Reiser developed our website. ++ ++Beverly Palmer drew our graphics. ++ ++Vitaly Fertman developed librepair, userspace plugins repair code, fsck ++and worked with Umka on developing libreiser4 and userspace plugins. ++ ++Yury Umanets (aka Umka) developed libreiser4, userspace plugins and ++userspace tools (reiser4progs). ++ ++Oleg Drokin (aka Green) is the release manager who fixes everything. ++It is so nice to have someone like that on the team. He (plus Chris ++and Jeff) make it possible for the entire rest of the Namesys team to ++focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It ++is just amazing to watch his talent for spotting bugs in action. ++ ++Edward Shishkin wrote the cryptcompress file plugin (which manages files ++built of encrypted and/or compressed bodies) and other plugins related ++to transparent encryption and compression support. +diff -urN linux-2.6.33.orig/fs/reiser4/reiser4.h linux-2.6.33/fs/reiser4/reiser4.h +--- linux-2.6.33.orig/fs/reiser4/reiser4.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/reiser4.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,259 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* definitions of common constants used by reiser4 */ ++ ++#if !defined( __REISER4_H__ ) ++#define __REISER4_H__ ++ ++#include <asm/param.h> /* for HZ */ ++#include <linux/errno.h> ++#include <linux/types.h> ++#include <linux/fs.h> ++#include <linux/hardirq.h> ++#include <linux/sched.h> ++ ++/* ++ * reiser4 compilation options. ++ */ ++ ++#if defined(CONFIG_REISER4_DEBUG) ++/* turn on assertion checks */ ++#define REISER4_DEBUG (1) ++#else ++#define REISER4_DEBUG (0) ++#endif ++ ++#define REISER4_SHA256 (0) ++ ++/* ++ * Turn on large keys mode. In this mode (which is the default), a reiser4 key ++ * has 4 8-byte components. In the old "small key" mode, it's 3 8-byte ++ * components. The additional component, referred to as "ordering", is used to ++ * order the items that a given object is composed of. As such, ordering is ++ * placed between locality and objectid. For a directory item, ordering contains ++ * the initial prefix of the file name the item is for. This sorts all directory ++ * items within a given directory lexicographically (but see ++ * fibration.[ch]). For file body and stat-data, ordering contains the initial ++ * prefix of the name the file was initially created with. In the common case ++ * (files with a single name) this orders file bodies and stat-datas in ++ * the same order as their respective directory entries, thus speeding up ++ * readdir. ++ * ++ * Note that the kernel can only mount a file system with the same key size as ++ * the one it was compiled for, so flipping this option may render your data ++ * inaccessible. ++ */ ++#define REISER4_LARGE_KEY (1) ++/*#define REISER4_LARGE_KEY (0)*/ ++ ++/*#define GUESS_EXISTS 1*/ ++ ++/* ++ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation ++ * option ++ */ ++ ++extern const char *REISER4_SUPER_MAGIC_STRING; ++extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the ++ * beginning of device */ ++ ++/* here go tunable parameters that are not worth special entry in kernel ++ configuration */ ++ ++/* default number of slots in coord-by-key caches */ ++#define CBK_CACHE_SLOTS (16) ++/* how many elementary tree operations to carry on the next level */ ++#define CARRIES_POOL_SIZE (5) ++/* size of pool of preallocated nodes for carry process. 
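A hedged illustration, not part of the patch, of the large-key layout described above: four 8-byte components, with "ordering" between locality and objectid. It reuses the set_key_* accessors and KEY_SD_MINOR seen elsewhere in this patch; dir_oid, file_oid and name_prefix are hypothetical, and set_key_type() is assumed to pack the minor type into the first component, as the readdir readahead code does above.

static void compose_sd_key_sketch(reiser4_key *key, oid_t dir_oid,
				  oid_t file_oid, __u64 name_prefix)
{
	reiser4_key_init(key);
	set_key_locality(key, dir_oid);		/* component 1: locality */
	set_key_type(key, KEY_SD_MINOR);	/* stat-data minor type */
	set_key_ordering(key, name_prefix);	/* component 2: large keys only */
	set_key_objectid(key, file_oid);	/* component 3: objectid */
	set_key_offset(key, 0ULL);		/* component 4: offset */
}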
*/ ++#define NODES_LOCKED_POOL_SIZE (5) ++ ++#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) ++#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) ++#define REISER4_PASTE_FLAGS (COPI_GO_LEFT) ++#define REISER4_INSERT_FLAGS (COPI_GO_LEFT) ++ ++/* we are supporting reservation of disk space on uid basis */ ++#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0) ++/* we are supporting reservation of disk space for groups */ ++#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0) ++/* we are supporting reservation of disk space for root */ ++#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0) ++/* we use rapid flush mode, see flush.c for comments. */ ++#define REISER4_USE_RAPID_FLUSH (1) ++ ++/* ++ * set this to 0 if you don't want to use wait-for-flush in ->writepage(). ++ */ ++#define REISER4_USE_ENTD (1) ++ ++/* key allocation is Plan-A */ ++#define REISER4_PLANA_KEY_ALLOCATION (1) ++/* key allocation follows good old 3.x scheme */ ++#define REISER4_3_5_KEY_ALLOCATION (0) ++ ++/* size of hash-table for znodes */ ++#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13) ++ ++/* number of buckets in lnode hash-table */ ++#define LNODE_HTABLE_BUCKETS (1024) ++ ++/* some ridiculously high maximal limit on height of znode tree. This ++ is used in declaration of various per level arrays and ++ to allocate stattistics gathering array for per-level stats. */ ++#define REISER4_MAX_ZTREE_HEIGHT (8) ++ ++#define REISER4_PANIC_MSG_BUFFER_SIZE (1024) ++ ++/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then, ++ sequential search is on average faster than binary. This is because ++ of better optimization and because sequential search is more CPU ++ cache friendly. This number (25) was found by experiments on dual AMD ++ Athlon(tm), 1400MHz. ++ ++ NOTE: testing in kernel has shown that binary search is more effective than ++ implied by results of the user level benchmarking. Probably because in the ++ node keys are separated by other data. So value was adjusted after few ++ tests. More thorough tuning is needed. ++*/ ++#define REISER4_SEQ_SEARCH_BREAK (3) ++ ++/* don't allow tree to be lower than this */ ++#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL) ++ ++/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to ++ * available memory. */ ++/* Default value of maximal atom size. Can be ovewritten by ++ tmgr.atom_max_size mount option. By default infinity. */ ++#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0)) ++ ++/* Default value of maximal atom age (in jiffies). After reaching this age ++ atom will be forced to commit, either synchronously or asynchronously. Can ++ be overwritten by tmgr.atom_max_age mount option. */ ++#define REISER4_ATOM_MAX_AGE (600 * HZ) ++ ++/* sleeping period for ktxnmrgd */ ++#define REISER4_TXNMGR_TIMEOUT (5 * HZ) ++ ++/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */ ++#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000) ++ ++/* start complaining after that many restarts in coord_by_key(). ++ ++ This either means incredibly heavy contention for this part of a tree, or ++ some corruption or bug. ++*/ ++#define REISER4_CBK_ITERATIONS_LIMIT (100) ++ ++/* return -EIO after that many iterations in coord_by_key(). ++ ++ I have witnessed more than 800 iterations (in 30 thread test) before cbk ++ finished. --nikita ++*/ ++#define REISER4_MAX_CBK_ITERATIONS 500000 ++ ++/* put a per-inode limit on maximal number of directory entries with identical ++ keys in hashed directory. 
++ ++ Disable this until inheritance interfaces stabilize: we need some way to ++ set per directory limit. ++*/ ++#define REISER4_USE_COLLISION_LIMIT (0) ++ ++/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level ++ blocks it will force them to be relocated. */ ++#define FLUSH_RELOCATE_THRESHOLD 64 ++/* If flush finds can find a block allocation closer than at most ++ FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that position. ++ */ ++#define FLUSH_RELOCATE_DISTANCE 64 ++ ++/* If we have written this much or more blocks before encountering busy jnode ++ in flush list - abort flushing hoping that next time we get called ++ this jnode will be clean already, and we will save some seeks. */ ++#define FLUSH_WRITTEN_THRESHOLD 50 ++ ++/* The maximum number of nodes to scan left on a level during flush. */ ++#define FLUSH_SCAN_MAXNODES 10000 ++ ++/* per-atom limit of flushers */ ++#define ATOM_MAX_FLUSHERS (1) ++ ++/* default tracing buffer size */ ++#define REISER4_TRACE_BUF_SIZE (1 << 15) ++ ++/* what size units of IO we would like cp, etc., to use, in writing to ++ reiser4. In bytes. ++ ++ Can be overwritten by optimal_io_size mount option. ++*/ ++#define REISER4_OPTIMAL_IO_SIZE (64 * 1024) ++ ++/* see comments in inode.c:oid_to_uino() */ ++#define REISER4_UINO_SHIFT (1 << 30) ++ ++/* Mark function argument as unused to avoid compiler warnings. */ ++#define UNUSED_ARG __attribute__((unused)) ++ ++#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3) ++#define NONNULL __attribute__((nonnull)) ++#else ++#define NONNULL ++#endif ++ ++/* master super block offset in bytes.*/ ++#define REISER4_MASTER_OFFSET 65536 ++ ++/* size of VFS block */ ++#define VFS_BLKSIZE 512 ++/* number of bits in size of VFS block (512==2^9) */ ++#define VFS_BLKSIZE_BITS 9 ++ ++#define REISER4_I reiser4_inode_data ++ ++/* implication */ ++#define ergo(antecedent, consequent) (!(antecedent) || (consequent)) ++/* logical equivalence */ ++#define equi(p1, p2) (ergo((p1), (p2)) && ergo((p2), (p1))) ++ ++#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0]))) ++ ++#define NOT_YET (0) ++ ++/** Reiser4 specific error codes **/ ++ ++#define REISER4_ERROR_CODE_BASE 10000 ++ ++/* Neighbor is not available (side neighbor or parent) */ ++#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE) ++ ++/* Node was not found in cache */ ++#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1) ++ ++/* node has no free space enough for completion of balancing operation */ ++#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2) ++ ++/* repeat operation */ ++#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3) ++ ++/* deadlock happens */ ++#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4) ++ ++/* operation cannot be performed, because it would block and non-blocking mode ++ * was requested. */ ++#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5) ++ ++/* wait some event (depends on context), then repeat */ ++#define E_WAIT (REISER4_ERROR_CODE_BASE + 6) ++ ++#endif /* __REISER4_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/safe_link.c linux-2.6.33/fs/reiser4/safe_link.c +--- linux-2.6.33.orig/fs/reiser4/safe_link.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/safe_link.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,354 @@ ++/* Copyright 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Safe-links. 
*/ ++ ++/* ++ * Safe-links are used to maintain file system consistency during operations ++ * that spawns multiple transactions. For example: ++ * ++ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files ++ * without user-visible names in the file system, but still opened by some ++ * active process. What happens here is that unlink proper (i.e., removal ++ * of the last file name) and file deletion (truncate of file body to zero ++ * and deletion of stat-data, that happens when last file descriptor is ++ * closed), may belong to different transactions T1 and T2. If a crash ++ * happens after T1 commit, but before T2 commit, on-disk file system has ++ * a file without name, that is, disk space leak. ++ * ++ * 2. Truncate. Truncate of large file may spawn multiple transactions. If ++ * system crashes while truncate was in-progress, file is left partially ++ * truncated, which violates "atomicity guarantees" of reiser4, viz. that ++ * every system is atomic. ++ * ++ * Safe-links address both above cases. Basically, safe-link is a way post ++ * some operation to be executed during commit of some other transaction than ++ * current one. (Another way to look at the safe-link is to interpret it as a ++ * logical logging.) ++ * ++ * Specifically, at the beginning of unlink safe-link in inserted in the ++ * tree. This safe-link is normally removed by file deletion code (during ++ * transaction T2 in the above terms). Truncate also inserts safe-link that is ++ * normally removed when truncate operation is finished. ++ * ++ * This means, that in the case of "clean umount" there are no safe-links in ++ * the tree. If safe-links are observed during mount, it means that (a) system ++ * was terminated abnormally, and (b) safe-link correspond to the "pending" ++ * (i.e., not finished) operations that were in-progress during system ++ * termination. Each safe-link record enough information to complete ++ * corresponding operation, and mount simply "replays" them (hence, the ++ * analogy with the logical logging). ++ * ++ * Safe-links are implemented as blackbox items (see ++ * plugin/item/blackbox.[ch]). ++ * ++ * For the reference: ext3 also has similar mechanism, it's called "an orphan ++ * list" there. ++ */ ++ ++#include "safe_link.h" ++#include "debug.h" ++#include "inode.h" ++ ++#include "plugin/item/blackbox.h" ++ ++#include <linux/fs.h> ++ ++/* ++ * On-disk format of safe-link. ++ */ ++typedef struct safelink { ++ reiser4_key sdkey; /* key of stat-data for the file safe-link is ++ * for */ ++ d64 size; /* size to which file should be truncated */ ++} safelink_t; ++ ++/* ++ * locality where safe-link items are stored. Next to the objectid of root ++ * directory. ++ */ ++static oid_t safe_link_locality(reiser4_tree * tree) ++{ ++ return get_key_objectid(get_super_private(tree->super)->df_plug-> ++ root_dir_key(tree->super)) + 1; ++} ++ ++/* ++ Construct a key for the safe-link. Key has the following format: ++ ++| 60 | 4 | 64 | 4 | 60 | 64 | +++---------------+---+------------------+---+---------------+------------------+ ++| locality | 0 | 0 | 0 | objectid | link type | +++---------------+---+------------------+---+---------------+------------------+ ++| | | | | ++| 8 bytes | 8 bytes | 8 bytes | 8 bytes | ++ ++ This is in large keys format. In small keys format second 8 byte chunk is ++ out. Locality is a constant returned by safe_link_locality(). objectid is ++ an oid of a file on which operation protected by this safe-link is ++ performed. 
link-type is used to distinguish safe-links for different ++ operations. ++ ++ */ ++static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid, ++ reiser4_safe_link_t link, reiser4_key * key) ++{ ++ reiser4_key_init(key); ++ set_key_locality(key, safe_link_locality(tree)); ++ set_key_objectid(key, oid); ++ set_key_offset(key, link); ++ return key; ++} ++ ++/* ++ * how much disk space is necessary to insert and remove (in the ++ * error-handling path) safe-link. ++ */ ++static __u64 safe_link_tograb(reiser4_tree * tree) ++{ ++ return ++ /* insert safe link */ ++ estimate_one_insert_item(tree) + ++ /* remove safe link */ ++ estimate_one_item_removal(tree) + ++ /* drill to the leaf level during insertion */ ++ 1 + estimate_one_insert_item(tree) + ++ /* ++ * possible update of existing safe-link. Actually, if ++ * safe-link existed already (we failed to remove it), then no ++ * insertion is necessary, so this term is already "covered", ++ * but for simplicity let's left it. ++ */ ++ 1; ++} ++ ++/* ++ * grab enough disk space to insert and remove (in the error-handling path) ++ * safe-link. ++ */ ++int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags) ++{ ++ int result; ++ ++ grab_space_enable(); ++ /* The sbinfo->delete_mutex can be taken here. ++ * safe_link_release() should be called before leaving reiser4 ++ * context. */ ++ result = ++ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags); ++ grab_space_enable(); ++ return result; ++} ++ ++/* ++ * release unused disk space reserved by safe_link_grab(). ++ */ ++void safe_link_release(reiser4_tree * tree) ++{ ++ reiser4_release_reserved(tree->super); ++} ++ ++/* ++ * insert into tree safe-link for operation @link on inode @inode. ++ */ ++int safe_link_add(struct inode *inode, reiser4_safe_link_t link) ++{ ++ reiser4_key key; ++ safelink_t sl; ++ int length; ++ int result; ++ reiser4_tree *tree; ++ ++ build_sd_key(inode, &sl.sdkey); ++ length = sizeof sl.sdkey; ++ ++ if (link == SAFE_TRUNCATE) { ++ /* ++ * for truncate we have to store final file length also, ++ * expand item. ++ */ ++ length += sizeof(sl.size); ++ put_unaligned(cpu_to_le64(inode->i_size), &sl.size); ++ } ++ tree = reiser4_tree_by_inode(inode); ++ build_link_key(tree, get_inode_oid(inode), link, &key); ++ ++ result = store_black_box(tree, &key, &sl, length); ++ if (result == -EEXIST) ++ result = update_black_box(tree, &key, &sl, length); ++ return result; ++} ++ ++/* ++ * remove safe-link corresponding to the operation @link on inode @inode from ++ * the tree. ++ */ ++int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link) ++{ ++ reiser4_key key; ++ ++ return kill_black_box(tree, build_link_key(tree, oid, link, &key)); ++} ++ ++/* ++ * in-memory structure to keep information extracted from safe-link. This is ++ * used to iterate over all safe-links. ++ */ ++struct safe_link_context { ++ reiser4_tree *tree; /* internal tree */ ++ reiser4_key key; /* safe-link key */ ++ reiser4_key sdkey; /* key of object stat-data */ ++ reiser4_safe_link_t link; /* safe-link type */ ++ oid_t oid; /* object oid */ ++ __u64 size; /* final size for truncate */ ++}; ++ ++/* ++ * start iterating over all safe-links. 
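A sketch, not part of the patch, of the two-transaction call pattern described at the top of safe_link.c: pin a safe-link before the first transaction of an unlink, drop it when the object is really gone. SAFE_UNLINK is an assumed link type (only SAFE_TRUNCATE appears in the text above); error handling is elided.

static void unlink_safe_link_sketch(struct inode *inode)
{
	reiser4_tree *tree = reiser4_tree_by_inode(inode);

	/* transaction T1: name removal; insert the safe-link first */
	if (safe_link_grab(tree, BA_CAN_COMMIT) == 0) {
		safe_link_add(inode, SAFE_UNLINK);	/* assumed link type */
		safe_link_release(tree);
	}
	/* ... transaction T2: object fully deleted; drop the safe-link */
	if (safe_link_grab(tree, BA_CAN_COMMIT) == 0) {
		safe_link_del(tree, get_inode_oid(inode), SAFE_UNLINK);
		safe_link_release(tree);
	}
}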
++ */ ++static void safe_link_iter_begin(reiser4_tree * tree, ++ struct safe_link_context *ctx) ++{ ++ ctx->tree = tree; ++ reiser4_key_init(&ctx->key); ++ set_key_locality(&ctx->key, safe_link_locality(tree)); ++ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key())); ++ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key())); ++} ++ ++/* ++ * return next safe-link. ++ */ ++static int safe_link_iter_next(struct safe_link_context *ctx) ++{ ++ int result; ++ safelink_t sl; ++ ++ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0); ++ if (result == 0) { ++ ctx->oid = get_key_objectid(&ctx->key); ++ ctx->link = get_key_offset(&ctx->key); ++ ctx->sdkey = sl.sdkey; ++ if (ctx->link == SAFE_TRUNCATE) ++ ctx->size = le64_to_cpu(get_unaligned(&sl.size)); ++ } ++ return result; ++} ++ ++/* ++ * check whether there are any more safe-links left in the tree. ++ */ ++static int safe_link_iter_finished(struct safe_link_context *ctx) ++{ ++ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree); ++} ++ ++/* ++ * finish safe-link iteration. ++ */ ++static void safe_link_iter_end(struct safe_link_context *ctx) ++{ ++ /* nothing special */ ++} ++ ++/* ++ * process a single safe-link. ++ */ ++static int process_safelink(struct super_block *super, reiser4_safe_link_t link, ++ reiser4_key * sdkey, oid_t oid, __u64 size) ++{ ++ struct inode *inode; ++ int result; ++ ++ /* ++ * obtain object inode by reiser4_iget(), then call object plugin ++ * ->safelink() method to do actual work, then delete safe-link on ++ * success. ++ */ ++ inode = reiser4_iget(super, sdkey, 1); ++ if (!IS_ERR(inode)) { ++ file_plugin *fplug; ++ ++ fplug = inode_file_plugin(inode); ++ assert("nikita-3428", fplug != NULL); ++ assert("", oid == get_inode_oid(inode)); ++ if (fplug->safelink != NULL) { ++ /* reiser4_txn_restart_current is not necessary because ++ * mounting is single-threaded. However, without it ++ * deadlock detection code will complain (see ++ * nikita-3361). */ ++ reiser4_txn_restart_current(); ++ result = fplug->safelink(inode, link, size); ++ } else { ++ warning("nikita-3430", ++ "Cannot handle safelink for %lli", ++ (unsigned long long)oid); ++ reiser4_print_key("key", sdkey); ++ result = 0; ++ } ++ if (result != 0) { ++ warning("nikita-3431", ++ "Error processing safelink for %lli: %i", ++ (unsigned long long)oid, result); ++ } ++ reiser4_iget_complete(inode); ++ iput(inode); ++ if (result == 0) { ++ result = safe_link_grab(reiser4_get_tree(super), ++ BA_CAN_COMMIT); ++ if (result == 0) ++ result = ++ safe_link_del(reiser4_get_tree(super), oid, ++ link); ++ safe_link_release(reiser4_get_tree(super)); ++ /* ++ * restart transaction: if there was a large number of ++ * safe-links, their processing may fail to fit into ++ * single transaction. ++ */ ++ if (result == 0) ++ reiser4_txn_restart_current(); ++ } ++ } else ++ result = PTR_ERR(inode); ++ return result; ++} ++ ++/* ++ * iterate over all safe-links in the file-system processing them one by one. 
++ */ ++int process_safelinks(struct super_block *super) ++{ ++ struct safe_link_context ctx; ++ int result; ++ ++ if (rofs_super(super)) ++ /* do nothing on the read-only file system */ ++ return 0; ++ safe_link_iter_begin(&get_super_private(super)->tree, &ctx); ++ result = 0; ++ do { ++ result = safe_link_iter_next(&ctx); ++ if (safe_link_iter_finished(&ctx) || result == -ENOENT) { ++ result = 0; ++ break; ++ } ++ if (result == 0) ++ result = process_safelink(super, ctx.link, ++ &ctx.sdkey, ctx.oid, ++ ctx.size); ++ } while (result == 0); ++ safe_link_iter_end(&ctx); ++ return result; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/safe_link.h linux-2.6.33/fs/reiser4/safe_link.h +--- linux-2.6.33.orig/fs/reiser4/safe_link.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/safe_link.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,29 @@ ++/* Copyright 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Safe-links. See safe_link.c for details. */ ++ ++#if !defined(__FS_SAFE_LINK_H__) ++#define __FS_SAFE_LINK_H__ ++ ++#include "tree.h" ++ ++int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags); ++void safe_link_release(reiser4_tree * tree); ++int safe_link_add(struct inode *inode, reiser4_safe_link_t link); ++int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link); ++ ++int process_safelinks(struct super_block *super); ++ ++/* __FS_SAFE_LINK_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/seal.c linux-2.6.33/fs/reiser4/seal.c +--- linux-2.6.33.orig/fs/reiser4/seal.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/seal.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,218 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++/* Seals implementation. */ ++/* Seals are "weak" tree pointers. They are analogous to tree coords in ++ allowing to bypass tree traversal. But normal usage of coords implies that ++ node pointed to by coord is locked, whereas seals don't keep a lock (or ++ even a reference) to znode. In stead, each znode contains a version number, ++ increased on each znode modification. This version number is copied into a ++ seal when seal is created. Later, one can "validate" seal by calling ++ reiser4_seal_validate(). If znode is in cache and its version number is ++ still the same, seal is "pristine" and coord associated with it can be ++ re-used immediately. ++ ++ If, on the other hand, znode is out of cache, or it is obviously different ++ one from the znode seal was initially attached to (for example, it is on ++ the different level, or is being removed from the tree), seal is ++ irreparably invalid ("burned") and tree traversal has to be repeated. ++ ++ Otherwise, there is some hope, that while znode was modified (and seal was ++ "broken" as a result), key attached to the seal is still in the node. This ++ is checked by first comparing this key with delimiting keys of node and, if ++ key is ok, doing intra-node lookup. ++ ++ Znode version is maintained in the following way: ++ ++ there is reiser4_tree.znode_epoch counter. 
Whenever new znode is created, ++ znode_epoch is incremented and its new value is stored in ->version field ++ of new znode. Whenever znode is dirtied (which means it was probably ++ modified), znode_epoch is also incremented and its new value is stored in ++ znode->version. This is done so, because just incrementing znode->version ++ on each update is not enough: it may so happen, that znode get deleted, new ++ znode is allocated for the same disk block and gets the same version ++ counter, tricking seal code into false positive. ++*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "key.h" ++#include "coord.h" ++#include "seal.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "znode.h" ++#include "super.h" ++ ++static znode *seal_node(const seal_t *seal); ++static int seal_matches(const seal_t *seal, znode * node); ++ ++/* initialise seal. This can be called several times on the same seal. @coord ++ and @key can be NULL. */ ++void reiser4_seal_init(seal_t *seal /* seal to initialise */ , ++ const coord_t *coord /* coord @seal will be ++ * attached to */ , ++ const reiser4_key * key UNUSED_ARG /* key @seal will be ++ * attached to */ ) ++{ ++ assert("nikita-1886", seal != NULL); ++ memset(seal, 0, sizeof *seal); ++ if (coord != NULL) { ++ znode *node; ++ ++ node = coord->node; ++ assert("nikita-1987", node != NULL); ++ spin_lock_znode(node); ++ seal->version = node->version; ++ assert("nikita-1988", seal->version != 0); ++ seal->block = *znode_get_block(node); ++#if REISER4_DEBUG ++ seal->coord1 = *coord; ++ if (key != NULL) ++ seal->key = *key; ++#endif ++ spin_unlock_znode(node); ++ } ++} ++ ++/* finish with seal */ ++void reiser4_seal_done(seal_t *seal/* seal to clear */) ++{ ++ assert("nikita-1887", seal != NULL); ++ seal->version = 0; ++} ++ ++/* true if seal was initialised */ ++int reiser4_seal_is_set(const seal_t *seal/* seal to query */) ++{ ++ assert("nikita-1890", seal != NULL); ++ return seal->version != 0; ++} ++ ++#if REISER4_DEBUG ++/* helper function for reiser4_seal_validate(). It checks that item at @coord ++ * has expected key. This is to detect cases where node was modified but wasn't ++ * marked dirty. */ ++static inline int check_seal_match(const coord_t *coord /* coord to check */ , ++ const reiser4_key * k/* expected key */) ++{ ++ reiser4_key ukey; ++ ++ return (coord->between != AT_UNIT) || ++ /* FIXME-VS: we only can compare keys for items whose units ++ represent exactly one key */ ++ ((coord_is_existing_unit(coord)) ++ && (item_is_extent(coord) ++ || keyeq(k, unit_key_by_coord(coord, &ukey)))) ++ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord)) ++ && keyge(k, unit_key_by_coord(coord, &ukey))); ++} ++#endif ++ ++/* this is used by reiser4_seal_validate. It accepts return value of ++ * longterm_lock_znode and returns 1 if it can be interpreted as seal ++ * validation failure. For instance, when longterm_lock_znode returns -EINVAL, ++ * reiser4_seal_validate returns -E_REPEAT and caller will call tre search. ++ * We cannot do this in longterm_lock_znode(), because sometimes we want to ++ * distinguish between -EINVAL and -E_REPEAT. */ ++static int should_repeat(int return_code) ++{ ++ return return_code == -EINVAL; ++} ++ ++/* (re-)validate seal. ++ ++ Checks whether seal is pristine, and try to revalidate it if possible. ++ ++ If seal was burned, or broken irreparably, return -E_REPEAT. 
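A sketch, not part of the patch, of the seal lifecycle described above: try to revalidate a remembered coord and, if the seal was burned, fall back to a fresh top-to-bottom coord_by_key() and re-arm the seal. All names are from this patch; the choice of LEAF_LEVEL/FIND_EXACT/CBK_UNIQUE is illustrative only.

static int lookup_with_seal(seal_t *seal, coord_t *coord,
			    const reiser4_key *key, lock_handle *lh)
{
	int ret;

	ret = reiser4_seal_validate(seal, coord, key, lh,
				    ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
	if (ret == -E_REPEAT) {
		/* seal burned or broken: repeat the full tree traversal */
		ret = coord_by_key(current_tree, key, coord, lh,
				   ZNODE_READ_LOCK, FIND_EXACT,
				   LEAF_LEVEL, LEAF_LEVEL,
				   CBK_UNIQUE, NULL /* no readahead info */);
		if (ret == CBK_COORD_FOUND)
			reiser4_seal_init(seal, coord, key);	/* re-arm */
	}
	return ret;
}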
++ ++ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are ++ looking for is in range of keys covered by the sealed node, but item wasn't ++ found by node ->lookup() method. Alternative is to return -ENOENT in this ++ case, but this would complicate callers logic. ++ ++*/ ++int reiser4_seal_validate(seal_t *seal /* seal to validate */, ++ coord_t *coord /* coord to validate against */, ++ const reiser4_key * key /* key to validate against */, ++ lock_handle * lh /* resulting lock handle */, ++ znode_lock_mode mode /* lock node */, ++ znode_lock_request request/* locking priority */) ++{ ++ znode *node; ++ int result; ++ ++ assert("nikita-1889", seal != NULL); ++ assert("nikita-1881", reiser4_seal_is_set(seal)); ++ assert("nikita-1882", key != NULL); ++ assert("nikita-1883", coord != NULL); ++ assert("nikita-1884", lh != NULL); ++ assert("nikita-1885", keyeq(&seal->key, key)); ++ assert("nikita-1989", coords_equal(&seal->coord1, coord)); ++ ++ /* obtain znode by block number */ ++ node = seal_node(seal); ++ if (node != NULL) { ++ /* znode was in cache, lock it */ ++ result = longterm_lock_znode(lh, node, mode, request); ++ zput(node); ++ if (result == 0) { ++ if (seal_matches(seal, node)) { ++ /* if seal version and znode version ++ coincide */ ++ ON_DEBUG(coord_update_v(coord)); ++ assert("nikita-1990", ++ node == seal->coord1.node); ++ assert("nikita-1898", ++ WITH_DATA_RET(coord->node, 1, ++ check_seal_match(coord, ++ key))); ++ } else ++ result = RETERR(-E_REPEAT); ++ } ++ if (result != 0) { ++ if (should_repeat(result)) ++ result = RETERR(-E_REPEAT); ++ /* unlock node on failure */ ++ done_lh(lh); ++ } ++ } else { ++ /* znode wasn't in cache */ ++ result = RETERR(-E_REPEAT); ++ } ++ return result; ++} ++ ++/* helpers functions */ ++ ++/* obtain reference to znode seal points to, if in cache */ ++static znode *seal_node(const seal_t *seal/* seal to query */) ++{ ++ assert("nikita-1891", seal != NULL); ++ return zlook(current_tree, &seal->block); ++} ++ ++/* true if @seal version and @node version coincide */ ++static int seal_matches(const seal_t *seal /* seal to check */ , ++ znode * node/* node to check */) ++{ ++ int result; ++ ++ assert("nikita-1991", seal != NULL); ++ assert("nikita-1993", node != NULL); ++ ++ spin_lock_znode(node); ++ result = (seal->version == node->version); ++ spin_unlock_znode(node); ++ return result; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/seal.h linux-2.6.33/fs/reiser4/seal.h +--- linux-2.6.33.orig/fs/reiser4/seal.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/seal.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,49 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */ ++ ++#ifndef __SEAL_H__ ++#define __SEAL_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++ ++/* for __u?? types */ ++/*#include <linux/types.h>*/ ++ ++/* seal. See comment at the top of seal.c */ ++typedef struct seal_s { ++ /* version of znode recorder at the time of seal creation */ ++ __u64 version; ++ /* block number of znode attached to this seal */ ++ reiser4_block_nr block; ++#if REISER4_DEBUG ++ /* coord this seal is attached to. For debugging. 
*/ ++ coord_t coord1; ++ /* key this seal is attached to. For debugging. */ ++ reiser4_key key; ++#endif ++} seal_t; ++ ++extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *); ++extern void reiser4_seal_done(seal_t *); ++extern int reiser4_seal_is_set(const seal_t *); ++extern int reiser4_seal_validate(seal_t *, coord_t *, ++ const reiser4_key *, lock_handle * , ++ znode_lock_mode mode, znode_lock_request request); ++ ++/* __SEAL_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/search.c linux-2.6.33/fs/reiser4/search.c +--- linux-2.6.33.orig/fs/reiser4/search.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/search.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1612 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "seal.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "plugin/plugin.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "tree.h" ++#include "reiser4.h" ++#include "super.h" ++#include "inode.h" ++ ++#include <linux/slab.h> ++ ++static const char *bias_name(lookup_bias bias); ++ ++/* tree searching algorithm, intranode searching algorithms are in ++ plugin/node/ */ ++ ++/* tree lookup cache ++ * ++ * The coord by key cache consists of small list of recently accessed nodes ++ * maintained according to the LRU discipline. Before doing real top-to-down ++ * tree traversal this cache is scanned for nodes that can contain key ++ * requested. ++ * ++ * The efficiency of coord cache depends heavily on locality of reference for ++ * tree accesses. Our user level simulations show reasonably good hit ratios ++ * for coord cache under most loads so far. 
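A sketch, not part of the patch, of how the look-aside cache could be set up with the default slot count from reiser4.h; note that cbk_cache_init() below reads ->nr_slots before allocating the slot array and returns RETERR(-ENOMEM) on failure.

static int setup_cbk_cache_sketch(cbk_cache *cache)
{
	cache->nr_slots = CBK_CACHE_SLOTS;	/* 16, from reiser4.h above */
	return cbk_cache_init(cache);		/* allocates cache->slot */
}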
++ */ ++ ++/* Initialise coord cache slot */ ++static void cbk_cache_init_slot(cbk_cache_slot *slot) ++{ ++ assert("nikita-345", slot != NULL); ++ ++ INIT_LIST_HEAD(&slot->lru); ++ slot->node = NULL; ++} ++ ++/* Initialize coord cache */ ++int cbk_cache_init(cbk_cache * cache/* cache to init */) ++{ ++ int i; ++ ++ assert("nikita-346", cache != NULL); ++ ++ cache->slot = ++ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, ++ reiser4_ctx_gfp_mask_get()); ++ if (cache->slot == NULL) ++ return RETERR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&cache->lru); ++ for (i = 0; i < cache->nr_slots; ++i) { ++ cbk_cache_init_slot(cache->slot + i); ++ list_add_tail(&((cache->slot + i)->lru), &cache->lru); ++ } ++ rwlock_init(&cache->guard); ++ return 0; ++} ++ ++/* free cbk cache data */ ++void cbk_cache_done(cbk_cache * cache/* cache to release */) ++{ ++ assert("nikita-2493", cache != NULL); ++ if (cache->slot != NULL) { ++ kfree(cache->slot); ++ cache->slot = NULL; ++ } ++} ++ ++/* macro to iterate over all cbk cache slots */ ++#define for_all_slots(cache, slot) \ ++ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \ ++ &(cache)->lru != &(slot)->lru; \ ++ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru)) ++ ++#if REISER4_DEBUG ++/* this function assures that [cbk-cache-invariant] invariant holds */ ++static int cbk_cache_invariant(const cbk_cache * cache) ++{ ++ cbk_cache_slot *slot; ++ int result; ++ int unused; ++ ++ if (cache->nr_slots == 0) ++ return 1; ++ ++ assert("nikita-2469", cache != NULL); ++ unused = 0; ++ result = 1; ++ read_lock(&((cbk_cache *)cache)->guard); ++ for_all_slots(cache, slot) { ++ /* in LRU first go all `used' slots followed by `unused' */ ++ if (unused && (slot->node != NULL)) ++ result = 0; ++ if (slot->node == NULL) ++ unused = 1; ++ else { ++ cbk_cache_slot *scan; ++ ++ /* all cached nodes are different */ ++ scan = slot; ++ while (result) { ++ scan = list_entry(scan->lru.next, ++ cbk_cache_slot, lru); ++ if (&cache->lru == &scan->lru) ++ break; ++ if (slot->node == scan->node) ++ result = 0; ++ } ++ } ++ if (!result) ++ break; ++ } ++ read_unlock(&((cbk_cache *)cache)->guard); ++ return result; ++} ++ ++#endif ++ ++/* Remove references, if any, to @node from coord cache */ ++void cbk_cache_invalidate(const znode * node /* node to remove from cache */ , ++ reiser4_tree * tree/* tree to remove node from */) ++{ ++ cbk_cache_slot *slot; ++ cbk_cache *cache; ++ int i; ++ ++ assert("nikita-350", node != NULL); ++ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree)); ++ ++ cache = &tree->cbk_cache; ++ assert("nikita-2470", cbk_cache_invariant(cache)); ++ ++ write_lock(&(cache->guard)); ++ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { ++ if (slot->node == node) { ++ list_move_tail(&slot->lru, &cache->lru); ++ slot->node = NULL; ++ break; ++ } ++ } ++ write_unlock(&(cache->guard)); ++ assert("nikita-2471", cbk_cache_invariant(cache)); ++} ++ ++/* add to the cbk-cache in the "tree" information about "node". This ++ can actually be update of existing slot in a cache. 
*/ ++static void cbk_cache_add(const znode * node/* node to add to the cache */) ++{ ++ cbk_cache *cache; ++ ++ cbk_cache_slot *slot; ++ int i; ++ ++ assert("nikita-352", node != NULL); ++ ++ cache = &znode_get_tree(node)->cbk_cache; ++ assert("nikita-2472", cbk_cache_invariant(cache)); ++ ++ if (cache->nr_slots == 0) ++ return; ++ ++ write_lock(&(cache->guard)); ++ /* find slot to update/add */ ++ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { ++ /* oops, this node is already in a cache */ ++ if (slot->node == node) ++ break; ++ } ++ /* if all slots are used, reuse least recently used one */ ++ if (i == cache->nr_slots) { ++ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru); ++ slot->node = (znode *) node; ++ } ++ list_move(&slot->lru, &cache->lru); ++ write_unlock(&(cache->guard)); ++ assert("nikita-2473", cbk_cache_invariant(cache)); ++} ++ ++static int setup_delimiting_keys(cbk_handle * h); ++static lookup_result coord_by_handle(cbk_handle * handle); ++static lookup_result traverse_tree(cbk_handle * h); ++static int cbk_cache_search(cbk_handle * h); ++ ++static level_lookup_result cbk_level_lookup(cbk_handle * h); ++static level_lookup_result cbk_node_lookup(cbk_handle * h); ++ ++/* helper functions */ ++ ++static void update_stale_dk(reiser4_tree * tree, znode * node); ++ ++/* release parent node during traversal */ ++static void put_parent(cbk_handle * h); ++/* check consistency of fields */ ++static int sanity_check(cbk_handle * h); ++/* release resources in handle */ ++static void hput(cbk_handle * h); ++ ++static level_lookup_result search_to_left(cbk_handle * h); ++ ++/* pack numerous (numberous I should say) arguments of coord_by_key() into ++ * cbk_handle */ ++static cbk_handle *cbk_pack(cbk_handle * handle, ++ reiser4_tree * tree, ++ const reiser4_key * key, ++ coord_t *coord, ++ lock_handle * active_lh, ++ lock_handle * parent_lh, ++ znode_lock_mode lock_mode, ++ lookup_bias bias, ++ tree_level lock_level, ++ tree_level stop_level, ++ __u32 flags, ra_info_t *info) ++{ ++ memset(handle, 0, sizeof *handle); ++ ++ handle->tree = tree; ++ handle->key = key; ++ handle->lock_mode = lock_mode; ++ handle->bias = bias; ++ handle->lock_level = lock_level; ++ handle->stop_level = stop_level; ++ handle->coord = coord; ++ /* set flags. See comment in tree.h:cbk_flags */ ++ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK; ++ ++ handle->active_lh = active_lh; ++ handle->parent_lh = parent_lh; ++ handle->ra_info = info; ++ return handle; ++} ++ ++/* main tree lookup procedure ++ ++ Check coord cache. If key we are looking for is not found there, call cbk() ++ to do real tree traversal. ++ ++ As we have extents on the twig level, @lock_level and @stop_level can ++ be different from LEAF_LEVEL and each other. ++ ++ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode ++ long term locks) while calling this. ++*/ ++lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search ++ * in. Usually this tree is ++ * part of file-system ++ * super-block */ , ++ const reiser4_key * key /* key to look for */ , ++ coord_t *coord /* where to store found ++ * position in a tree. Fields ++ * in "coord" are only valid if ++ * coord_by_key() returned ++ * "CBK_COORD_FOUND" */ , ++ lock_handle * lh, /* resulting lock handle */ ++ znode_lock_mode lock_mode /* type of lookup we ++ * want on node. 
Pass ++ * ZNODE_READ_LOCK here ++ * if you only want to ++ * read item found and ++ * ZNODE_WRITE_LOCK if ++ * you want to modify ++ * it */ , ++ lookup_bias bias /* what to return if coord ++ * with exactly the @key is ++ * not in the tree */ , ++ tree_level lock_level/* tree level where to start ++ * taking @lock type of ++ * locks */ , ++ tree_level stop_level/* tree level to stop. Pass ++ * LEAF_LEVEL or TWIG_LEVEL ++ * here Item being looked ++ * for has to be between ++ * @lock_level and ++ * @stop_level, inclusive */ , ++ __u32 flags /* search flags */ , ++ ra_info_t * ++ info ++ /* information about desired tree traversal ++ * readahead */ ++ ) ++{ ++ cbk_handle handle; ++ lock_handle parent_lh; ++ lookup_result result; ++ ++ init_lh(lh); ++ init_lh(&parent_lh); ++ ++ assert("nikita-3023", reiser4_schedulable()); ++ ++ assert("nikita-353", tree != NULL); ++ assert("nikita-354", key != NULL); ++ assert("nikita-355", coord != NULL); ++ assert("nikita-356", (bias == FIND_EXACT) ++ || (bias == FIND_MAX_NOT_MORE_THAN)); ++ assert("nikita-357", stop_level >= LEAF_LEVEL); ++ /* no locks can be held during tree traversal */ ++ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); ++ ++ cbk_pack(&handle, ++ tree, ++ key, ++ coord, ++ lh, ++ &parent_lh, ++ lock_mode, bias, lock_level, stop_level, flags, info); ++ ++ result = coord_by_handle(&handle); ++ assert("nikita-3247", ++ ergo(!IS_CBKERR(result), coord->node == lh->node)); ++ return result; ++} ++ ++/* like coord_by_key(), but starts traversal from vroot of @object rather than ++ * from tree root. */ ++lookup_result reiser4_object_lookup(struct inode *object, ++ const reiser4_key * key, ++ coord_t *coord, ++ lock_handle * lh, ++ znode_lock_mode lock_mode, ++ lookup_bias bias, ++ tree_level lock_level, ++ tree_level stop_level, __u32 flags, ++ ra_info_t *info) ++{ ++ cbk_handle handle; ++ lock_handle parent_lh; ++ lookup_result result; ++ ++ init_lh(lh); ++ init_lh(&parent_lh); ++ ++ assert("nikita-3023", reiser4_schedulable()); ++ ++ assert("nikita-354", key != NULL); ++ assert("nikita-355", coord != NULL); ++ assert("nikita-356", (bias == FIND_EXACT) ++ || (bias == FIND_MAX_NOT_MORE_THAN)); ++ assert("nikita-357", stop_level >= LEAF_LEVEL); ++ /* no locks can be held during tree search by key */ ++ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); ++ ++ cbk_pack(&handle, ++ object != NULL ? reiser4_tree_by_inode(object) : current_tree, ++ key, ++ coord, ++ lh, ++ &parent_lh, ++ lock_mode, bias, lock_level, stop_level, flags, info); ++ handle.object = object; ++ ++ result = coord_by_handle(&handle); ++ assert("nikita-3247", ++ ergo(!IS_CBKERR(result), coord->node == lh->node)); ++ return result; ++} ++ ++/* lookup by cbk_handle. Common part of coord_by_key() and ++ reiser4_object_lookup(). */ ++static lookup_result coord_by_handle(cbk_handle * handle) ++{ ++ /* ++ * first check cbk_cache (which is look-aside cache for our tree) and ++ * of this fails, start traversal. ++ */ ++ /* first check whether "key" is in cache of recent lookups. */ ++ if (cbk_cache_search(handle) == 0) ++ return handle->result; ++ else ++ return traverse_tree(handle); ++} ++ ++/* Execute actor for each item (or unit, depending on @through_units_p), ++ starting from @coord, right-ward, until either: ++ ++ - end of the tree is reached ++ - unformatted node is met ++ - error occurred ++ - @actor returns 0 or less ++ ++ Error code, or last actor return value is returned. 
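A sketch, not part of the patch, of a matching actor for reiser4_iterate_tree() below. The signature follows the actor(tree, coord, lh, arg) call site; @arg carries a hypothetical unit budget, and returning 0 or less stops the walk.

static int budget_actor(reiser4_tree *tree, coord_t *coord,
			lock_handle *lh, void *arg)
{
	int *budget = arg;

	return --(*budget) > 0;	/* > 0 keeps the iteration going */
}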
++ ++ This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through ++ sequence of entries with identical keys and alikes. ++*/ ++int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ , ++ coord_t *coord /* coord to start from */ , ++ lock_handle * lh /* lock handle to start with and to ++ * update along the way */ , ++ tree_iterate_actor_t actor /* function to call on each ++ * item/unit */ , ++ void *arg /* argument to pass to @actor */ , ++ znode_lock_mode mode /* lock mode on scanned nodes */ , ++ int through_units_p /* call @actor on each item or on ++ * each unit */ ) ++{ ++ int result; ++ ++ assert("nikita-1143", tree != NULL); ++ assert("nikita-1145", coord != NULL); ++ assert("nikita-1146", lh != NULL); ++ assert("nikita-1147", actor != NULL); ++ ++ result = zload(coord->node); ++ coord_clear_iplug(coord); ++ if (result != 0) ++ return result; ++ if (!coord_is_existing_unit(coord)) { ++ zrelse(coord->node); ++ return -ENOENT; ++ } ++ while ((result = actor(tree, coord, lh, arg)) > 0) { ++ /* move further */ ++ if ((through_units_p && coord_next_unit(coord)) || ++ (!through_units_p && coord_next_item(coord))) { ++ do { ++ lock_handle couple; ++ ++ /* move to the next node */ ++ init_lh(&couple); ++ result = ++ reiser4_get_right_neighbor(&couple, ++ coord->node, ++ (int)mode, ++ GN_CAN_USE_UPPER_LEVELS); ++ zrelse(coord->node); ++ if (result == 0) { ++ ++ result = zload(couple.node); ++ if (result != 0) { ++ done_lh(&couple); ++ return result; ++ } ++ ++ coord_init_first_unit(coord, ++ couple.node); ++ done_lh(lh); ++ move_lh(lh, &couple); ++ } else ++ return result; ++ } while (node_is_empty(coord->node)); ++ } ++ ++ assert("nikita-1149", coord_is_existing_unit(coord)); ++ } ++ zrelse(coord->node); ++ return result; ++} ++ ++/* return locked uber znode for @tree */ ++int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, ++ znode_lock_request pri, lock_handle * lh) ++{ ++ int result; ++ ++ result = longterm_lock_znode(lh, tree->uber, mode, pri); ++ return result; ++} ++ ++/* true if @key is strictly within @node ++ ++ we are looking for possibly non-unique key and it is item is at the edge of ++ @node. May be it is in the neighbor. ++*/ ++static int znode_contains_key_strict(znode * node /* node to check key ++ * against */ , ++ const reiser4_key * ++ key /* key to check */ , ++ int isunique) ++{ ++ int answer; ++ ++ assert("nikita-1760", node != NULL); ++ assert("nikita-1722", key != NULL); ++ ++ if (keyge(key, &node->rd_key)) ++ return 0; ++ ++ answer = keycmp(&node->ld_key, key); ++ ++ if (isunique) ++ return answer != GREATER_THAN; ++ else ++ return answer == LESS_THAN; ++} ++ ++/* ++ * Virtual Root (vroot) code. ++ * ++ * For given file system object (e.g., regular file or directory) let's ++ * define its "virtual root" as lowest in the tree (that is, furtherest ++ * from the tree root) node such that all body items of said object are ++ * located in a tree rooted at this node. ++ * ++ * Once vroot of object is found all tree lookups for items within body of ++ * this object ("object lookups") can be started from its vroot rather ++ * than from real root. This has following advantages: ++ * ++ * 1. amount of nodes traversed during lookup (and, hence, amount of ++ * key comparisons made) decreases, and ++ * ++ * 2. contention on tree root is decreased. This latter was actually ++ * motivating reason behind vroot, because spin lock of root node, ++ * which is taken when acquiring long-term lock on root node is the ++ * hottest lock in the reiser4. 
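A sketch, not part of the patch: an object-scoped lookup that can benefit from the cached vroot just described, wrapping reiser4_object_lookup() defined earlier in this file; the lock/bias/level arguments are illustrative.

static int lookup_in_object_sketch(struct inode *inode,
				   const reiser4_key *key,
				   coord_t *coord, lock_handle *lh)
{
	/* starts from inode's vroot when one is known, else the tree root */
	return reiser4_object_lookup(inode, key, coord, lh,
				     ZNODE_READ_LOCK, FIND_EXACT,
				     LEAF_LEVEL, LEAF_LEVEL,
				     CBK_UNIQUE, NULL /* no readahead info */);
}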
++ * ++ * How to find vroot. ++ * ++ * When vroot of object F is not yet determined, all object lookups start ++ * from the root of the tree. At each tree level during traversal we have ++ * a node N such that a key we are looking for (which is the key inside ++ * object's body) is located within N. In function handle_vroot() called ++ * from cbk_level_lookup() we check whether N is possible vroot for ++ * F. Check is trivial---if neither leftmost nor rightmost item of N ++ * belongs to F (and we already have helpful ->owns_item() method of ++ * object plugin for this), then N is possible vroot of F. This, of ++ * course, relies on the assumption that each object occupies contiguous ++ * range of keys in the tree. ++ * ++ * Thus, traversing tree downward and checking each node as we go, we can ++ * find lowest such node, which, by definition, is vroot. ++ * ++ * How to track vroot. ++ * ++ * Nohow. If actual vroot changes, next object lookup will just restart ++ * from the actual tree root, refreshing object's vroot along the way. ++ * ++ */ ++ ++/* ++ * Check whether @node is possible vroot of @object. ++ */ ++static void handle_vroot(struct inode *object, znode * node) ++{ ++ file_plugin *fplug; ++ coord_t coord; ++ ++ fplug = inode_file_plugin(object); ++ assert("nikita-3353", fplug != NULL); ++ assert("nikita-3354", fplug->owns_item != NULL); ++ ++ if (unlikely(node_is_empty(node))) ++ return; ++ ++ coord_init_first_unit(&coord, node); ++ /* ++ * if leftmost item of @node belongs to @object, we cannot be sure ++ * that @node is vroot of @object, because, some items of @object are ++ * probably in the sub-tree rooted at the left neighbor of @node. ++ */ ++ if (fplug->owns_item(object, &coord)) ++ return; ++ coord_init_last_unit(&coord, node); ++ /* mutatis mutandis for the rightmost item */ ++ if (fplug->owns_item(object, &coord)) ++ return; ++ /* otherwise, @node is possible vroot of @object */ ++ inode_set_vroot(object, node); ++} ++ ++/* ++ * helper function used by traverse tree to start tree traversal not from the ++ * tree root, but from @h->object's vroot, if possible. ++ */ ++static int prepare_object_lookup(cbk_handle * h) ++{ ++ znode *vroot; ++ int result; ++ ++ vroot = inode_get_vroot(h->object); ++ if (vroot == NULL) { ++ /* ++ * object doesn't have known vroot, start from real tree root. ++ */ ++ return LOOKUP_CONT; ++ } ++ ++ h->level = znode_get_level(vroot); ++ /* take a long-term lock on vroot */ ++ h->result = longterm_lock_znode(h->active_lh, vroot, ++ cbk_lock_mode(h->level, h), ++ ZNODE_LOCK_LOPRI); ++ result = LOOKUP_REST; ++ if (h->result == 0) { ++ int isunique; ++ int inside; ++ ++ isunique = h->flags & CBK_UNIQUE; ++ /* check that key is inside vroot */ ++ read_lock_dk(h->tree); ++ inside = (znode_contains_key_strict(vroot, h->key, isunique) && ++ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE)); ++ read_unlock_dk(h->tree); ++ if (inside) { ++ h->result = zload(vroot); ++ if (h->result == 0) { ++ /* search for key in vroot. 
*/ ++ result = cbk_node_lookup(h); ++ zrelse(vroot); /*h->active_lh->node); */ ++ if (h->active_lh->node != vroot) { ++ result = LOOKUP_REST; ++ } else if (result == LOOKUP_CONT) { ++ move_lh(h->parent_lh, h->active_lh); ++ h->flags &= ~CBK_DKSET; ++ } ++ } ++ } ++ } ++ ++ zput(vroot); ++ ++ if (IS_CBKERR(h->result) || result == LOOKUP_REST) ++ hput(h); ++ return result; ++} ++ ++/* main function that handles common parts of tree traversal: starting ++ (fake znode handling), restarts, error handling, completion */ ++static lookup_result traverse_tree(cbk_handle * h/* search handle */) ++{ ++ int done; ++ int iterations; ++ int vroot_used; ++ ++ assert("nikita-365", h != NULL); ++ assert("nikita-366", h->tree != NULL); ++ assert("nikita-367", h->key != NULL); ++ assert("nikita-368", h->coord != NULL); ++ assert("nikita-369", (h->bias == FIND_EXACT) ++ || (h->bias == FIND_MAX_NOT_MORE_THAN)); ++ assert("nikita-370", h->stop_level >= LEAF_LEVEL); ++ assert("nikita-2949", !(h->flags & CBK_DKSET)); ++ assert("zam-355", lock_stack_isclean(get_current_lock_stack())); ++ ++ done = 0; ++ iterations = 0; ++ vroot_used = 0; ++ ++ /* loop for restarts */ ++restart: ++ ++ assert("nikita-3024", reiser4_schedulable()); ++ ++ h->result = CBK_COORD_FOUND; ++ /* connect_znode() needs it */ ++ h->ld_key = *reiser4_min_key(); ++ h->rd_key = *reiser4_max_key(); ++ h->flags |= CBK_DKSET; ++ h->error = NULL; ++ ++ if (!vroot_used && h->object != NULL) { ++ vroot_used = 1; ++ done = prepare_object_lookup(h); ++ if (done == LOOKUP_REST) ++ goto restart; ++ else if (done == LOOKUP_DONE) ++ return h->result; ++ } ++ if (h->parent_lh->node == NULL) { ++ done = ++ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI, ++ h->parent_lh); ++ ++ assert("nikita-1637", done != -E_DEADLOCK); ++ ++ h->block = h->tree->root_block; ++ h->level = h->tree->height; ++ h->coord->node = h->parent_lh->node; ++ ++ if (done != 0) ++ return done; ++ } ++ ++ /* loop descending a tree */ ++ while (!done) { ++ ++ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) && ++ IS_POW(iterations))) { ++ warning("nikita-1481", "Too many iterations: %i", ++ iterations); ++ reiser4_print_key("key", h->key); ++ ++iterations; ++ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) { ++ h->error = ++ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring."; ++ h->result = RETERR(-EIO); ++ break; ++ } ++ switch (cbk_level_lookup(h)) { ++ case LOOKUP_CONT: ++ move_lh(h->parent_lh, h->active_lh); ++ continue; ++ default: ++ wrong_return_value("nikita-372", "cbk_level"); ++ case LOOKUP_DONE: ++ done = 1; ++ break; ++ case LOOKUP_REST: ++ hput(h); ++ /* deadlock avoidance is normal case. */ ++ if (h->result != -E_DEADLOCK) ++ ++iterations; ++ reiser4_preempt_point(); ++ goto restart; ++ } ++ } ++ /* that's all. The rest is error handling */ ++ if (unlikely(h->error != NULL)) { ++ warning("nikita-373", "%s: level: %i, " ++ "lock_level: %i, stop_level: %i " ++ "lock_mode: %s, bias: %s", ++ h->error, h->level, h->lock_level, h->stop_level, ++ lock_mode_name(h->lock_mode), bias_name(h->bias)); ++ reiser4_print_address("block", &h->block); ++ reiser4_print_key("key", h->key); ++ print_coord_content("coord", h->coord); ++ } ++ /* `unlikely' error case */ ++ if (unlikely(IS_CBKERR(h->result))) { ++ /* failure. 
do cleanup */ ++ hput(h); ++ } else { ++ assert("nikita-1605", WITH_DATA_RET ++ (h->coord->node, 1, ++ ergo((h->result == CBK_COORD_FOUND) && ++ (h->bias == FIND_EXACT) && ++ (!node_is_empty(h->coord->node)), ++ coord_is_existing_item(h->coord)))); ++ } ++ return h->result; ++} ++ ++/* find delimiting keys of child ++ ++ Determine left and right delimiting keys for child pointed to by ++ @parent_coord. ++ ++*/ ++static void find_child_delimiting_keys(znode * parent /* parent znode, passed ++ * locked */ , ++ const coord_t *parent_coord ++ /* coord where pointer ++ * to child is stored ++ */ , ++ reiser4_key * ld /* where to store left ++ * delimiting key */ , ++ reiser4_key * rd /* where to store right ++ * delimiting key */ ) ++{ ++ coord_t neighbor; ++ ++ assert("nikita-1484", parent != NULL); ++ assert_rw_locked(&(znode_get_tree(parent)->dk_lock)); ++ ++ coord_dup(&neighbor, parent_coord); ++ ++ if (neighbor.between == AT_UNIT) ++ /* imitate item ->lookup() behavior. */ ++ neighbor.between = AFTER_UNIT; ++ ++ if (coord_set_to_left(&neighbor) == 0) ++ unit_key_by_coord(&neighbor, ld); ++ else { ++ assert("nikita-14851", 0); ++ *ld = *znode_get_ld_key(parent); ++ } ++ ++ coord_dup(&neighbor, parent_coord); ++ if (neighbor.between == AT_UNIT) ++ neighbor.between = AFTER_UNIT; ++ if (coord_set_to_right(&neighbor) == 0) ++ unit_key_by_coord(&neighbor, rd); ++ else ++ *rd = *znode_get_rd_key(parent); ++} ++ ++/* ++ * setup delimiting keys for a child ++ * ++ * @parent parent node ++ * ++ * @coord location in @parent where pointer to @child is ++ * ++ * @child child node ++ */ ++int ++set_child_delimiting_keys(znode * parent, const coord_t *coord, znode * child) ++{ ++ reiser4_tree *tree; ++ ++ assert("nikita-2952", ++ znode_get_level(parent) == znode_get_level(coord->node)); ++ ++ /* fast check without taking dk lock. This is safe, because ++ * JNODE_DKSET is never cleared once set. */ ++ if (!ZF_ISSET(child, JNODE_DKSET)) { ++ tree = znode_get_tree(parent); ++ write_lock_dk(tree); ++ if (likely(!ZF_ISSET(child, JNODE_DKSET))) { ++ find_child_delimiting_keys(parent, coord, ++ &child->ld_key, ++ &child->rd_key); ++ ON_DEBUG(child->ld_key_version = ++ atomic_inc_return(&delim_key_version); ++ child->rd_key_version = ++ atomic_inc_return(&delim_key_version);); ++ ZF_SET(child, JNODE_DKSET); ++ } ++ write_unlock_dk(tree); ++ return 1; ++ } ++ return 0; ++} ++ ++/* Perform tree lookup at one level. This is called from cbk_traverse() ++ function that drives lookup through tree and calls cbk_node_lookup() to ++ perform lookup within one node. ++ ++ See comments in a code. ++*/ ++static level_lookup_result cbk_level_lookup(cbk_handle * h/* search handle */) ++{ ++ int ret; ++ int setdk; ++ int ldkeyset = 0; ++ reiser4_key ldkey; ++ reiser4_key key; ++ znode *active; ++ ++ assert("nikita-3025", reiser4_schedulable()); ++ ++ /* acquire reference to @active node */ ++ active = ++ zget(h->tree, &h->block, h->parent_lh->node, h->level, ++ reiser4_ctx_gfp_mask_get()); ++ ++ if (IS_ERR(active)) { ++ h->result = PTR_ERR(active); ++ return LOOKUP_DONE; ++ } ++ ++ /* lock @active */ ++ h->result = longterm_lock_znode(h->active_lh, ++ active, ++ cbk_lock_mode(h->level, h), ++ ZNODE_LOCK_LOPRI); ++ /* longterm_lock_znode() acquires additional reference to znode (which ++ will be later released by longterm_unlock_znode()). Release ++ reference acquired by zget(). 
++ */ ++ zput(active); ++ if (unlikely(h->result != 0)) ++ goto fail_or_restart; ++ ++ setdk = 0; ++ /* if @active is accessed for the first time, setup delimiting keys on ++ it. Delimiting keys are taken from the parent node. See ++ setup_delimiting_keys() for details. ++ */ ++ if (h->flags & CBK_DKSET) { ++ setdk = setup_delimiting_keys(h); ++ h->flags &= ~CBK_DKSET; ++ } else { ++ znode *parent; ++ ++ parent = h->parent_lh->node; ++ h->result = zload(parent); ++ if (unlikely(h->result != 0)) ++ goto fail_or_restart; ++ ++ if (!ZF_ISSET(active, JNODE_DKSET)) ++ setdk = set_child_delimiting_keys(parent, ++ h->coord, active); ++ else { ++ read_lock_dk(h->tree); ++ find_child_delimiting_keys(parent, h->coord, &ldkey, ++ &key); ++ read_unlock_dk(h->tree); ++ ldkeyset = 1; ++ } ++ zrelse(parent); ++ } ++ ++ /* this is ugly kludge. Reminder: this is necessary, because ++ ->lookup() method returns coord with ->between field probably set ++ to something different from AT_UNIT. ++ */ ++ h->coord->between = AT_UNIT; ++ ++ if (znode_just_created(active) && (h->coord->node != NULL)) { ++ write_lock_tree(h->tree); ++ /* if we are going to load znode right now, setup ++ ->in_parent: coord where pointer to this node is stored in ++ parent. ++ */ ++ coord_to_parent_coord(h->coord, &active->in_parent); ++ write_unlock_tree(h->tree); ++ } ++ ++ /* check connectedness without holding tree lock---false negatives ++ * will be re-checked by connect_znode(), and false positives are ++ * impossible---@active cannot suddenly turn into unconnected ++ * state. */ ++ if (!znode_is_connected(active)) { ++ h->result = connect_znode(h->coord, active); ++ if (unlikely(h->result != 0)) { ++ put_parent(h); ++ goto fail_or_restart; ++ } ++ } ++ ++ jload_prefetch(ZJNODE(active)); ++ ++ if (setdk) ++ update_stale_dk(h->tree, active); ++ ++ /* put_parent() cannot be called earlier, because connect_znode() ++ assumes parent node is referenced; */ ++ put_parent(h); ++ ++ if ((!znode_contains_key_lock(active, h->key) && ++ (h->flags & CBK_TRUST_DK)) ++ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) { ++ /* 1. key was moved out of this node while this thread was ++ waiting for the lock. Restart. More elaborate solution is ++ to determine where key moved (to the left, or to the right) ++ and try to follow it through sibling pointers. ++ ++ 2. or, node itself is going to be removed from the ++ tree. Release lock and restart. ++ */ ++ h->result = -E_REPEAT; ++ } ++ if (h->result == -E_REPEAT) ++ return LOOKUP_REST; ++ ++ h->result = zload_ra(active, h->ra_info); ++ if (h->result) ++ return LOOKUP_DONE; ++ ++ /* sanity checks */ ++ if (sanity_check(h)) { ++ zrelse(active); ++ return LOOKUP_DONE; ++ } ++ ++ /* check that key of leftmost item in the @active is the same as in ++ * its parent */ ++ if (ldkeyset && !node_is_empty(active) && ++ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) { ++ warning("vs-3533", "Keys are inconsistent. 
Fsck?"); ++ reiser4_print_key("inparent", &ldkey); ++ reiser4_print_key("inchild", &key); ++ h->result = RETERR(-EIO); ++ zrelse(active); ++ return LOOKUP_DONE; ++ } ++ ++ if (h->object != NULL) ++ handle_vroot(h->object, active); ++ ++ ret = cbk_node_lookup(h); ++ ++ /* h->active_lh->node might change, but active is yet to be zrelsed */ ++ zrelse(active); ++ ++ return ret; ++ ++fail_or_restart: ++ if (h->result == -E_DEADLOCK) ++ return LOOKUP_REST; ++ return LOOKUP_DONE; ++} ++ ++#if REISER4_DEBUG ++/* check left and right delimiting keys of a znode */ ++void check_dkeys(znode * node) ++{ ++ znode *left; ++ znode *right; ++ ++ read_lock_tree(current_tree); ++ read_lock_dk(current_tree); ++ ++ assert("vs-1710", znode_is_any_locked(node)); ++ assert("vs-1197", ++ !keygt(znode_get_ld_key(node), znode_get_rd_key(node))); ++ ++ left = node->left; ++ right = node->right; ++ ++ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) ++ && left != NULL && ZF_ISSET(left, JNODE_DKSET)) ++ /* check left neighbor. Note that left neighbor is not locked, ++ so it might get wrong delimiting keys therefore */ ++ assert("vs-1198", ++ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node)) ++ || ZF_ISSET(left, JNODE_HEARD_BANSHEE))); ++ ++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) ++ && right != NULL && ZF_ISSET(right, JNODE_DKSET)) ++ /* check right neighbor. Note that right neighbor is not ++ locked, so it might get wrong delimiting keys therefore */ ++ assert("vs-1199", ++ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right)) ++ || ZF_ISSET(right, JNODE_HEARD_BANSHEE))); ++ ++ read_unlock_dk(current_tree); ++ read_unlock_tree(current_tree); ++} ++#endif ++ ++/* true if @key is left delimiting key of @node */ ++static int key_is_ld(znode * node, const reiser4_key * key) ++{ ++ int ld; ++ ++ assert("nikita-1716", node != NULL); ++ assert("nikita-1758", key != NULL); ++ ++ read_lock_dk(znode_get_tree(node)); ++ assert("nikita-1759", znode_contains_key(node, key)); ++ ld = keyeq(znode_get_ld_key(node), key); ++ read_unlock_dk(znode_get_tree(node)); ++ return ld; ++} ++ ++/* Process one node during tree traversal. ++ ++ This is called by cbk_level_lookup(). 
*/ ++static level_lookup_result cbk_node_lookup(cbk_handle * h/* search handle */) ++{ ++ /* node plugin of @active */ ++ node_plugin *nplug; ++ /* item plugin of item that was found */ ++ item_plugin *iplug; ++ /* search bias */ ++ lookup_bias node_bias; ++ /* node we are operating upon */ ++ znode *active; ++ /* tree we are searching in */ ++ reiser4_tree *tree; ++ /* result */ ++ int result; ++ ++ assert("nikita-379", h != NULL); ++ ++ active = h->active_lh->node; ++ tree = h->tree; ++ ++ nplug = active->nplug; ++ assert("nikita-380", nplug != NULL); ++ ++ ON_DEBUG(check_dkeys(active)); ++ ++ /* return item from "active" node with maximal key not greater than ++ "key" */ ++ node_bias = h->bias; ++ result = nplug->lookup(active, h->key, node_bias, h->coord); ++ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) { ++ /* error occurred */ ++ h->result = result; ++ return LOOKUP_DONE; ++ } ++ if (h->level == h->stop_level) { ++ /* welcome to the stop level */ ++ assert("nikita-381", h->coord->node == active); ++ if (result == NS_FOUND) { ++ /* success of tree lookup */ ++ if (!(h->flags & CBK_UNIQUE) ++ && key_is_ld(active, h->key)) ++ return search_to_left(h); ++ else ++ h->result = CBK_COORD_FOUND; ++ } else { ++ h->result = CBK_COORD_NOTFOUND; ++ } ++ if (!(h->flags & CBK_IN_CACHE)) ++ cbk_cache_add(active); ++ return LOOKUP_DONE; ++ } ++ ++ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) { ++ h->error = "not found on internal node"; ++ h->result = result; ++ return LOOKUP_DONE; ++ } ++ ++ assert("vs-361", h->level > h->stop_level); ++ ++ if (handle_eottl(h, &result)) { ++ assert("vs-1674", (result == LOOKUP_DONE || ++ result == LOOKUP_REST)); ++ return result; ++ } ++ ++ /* go down to next level */ ++ check_me("vs-12", zload(h->coord->node) == 0); ++ assert("nikita-2116", item_is_internal(h->coord)); ++ iplug = item_plugin_by_coord(h->coord); ++ iplug->s.internal.down_link(h->coord, h->key, &h->block); ++ zrelse(h->coord->node); ++ --h->level; ++ return LOOKUP_CONT; /* continue */ ++} ++ ++/* scan cbk_cache slots looking for a match for @h */ ++static int cbk_cache_scan_slots(cbk_handle * h/* cbk handle */) ++{ ++ level_lookup_result llr; ++ znode *node; ++ reiser4_tree *tree; ++ cbk_cache_slot *slot; ++ cbk_cache *cache; ++ tree_level level; ++ int isunique; ++ const reiser4_key *key; ++ int result; ++ ++ assert("nikita-1317", h != NULL); ++ assert("nikita-1315", h->tree != NULL); ++ assert("nikita-1316", h->key != NULL); ++ ++ tree = h->tree; ++ cache = &tree->cbk_cache; ++ if (cache->nr_slots == 0) ++ /* size of cbk cache was set to 0 by mount time option. */ ++ return RETERR(-ENOENT); ++ ++ assert("nikita-2474", cbk_cache_invariant(cache)); ++ node = NULL; /* to keep gcc happy */ ++ level = h->level; ++ key = h->key; ++ isunique = h->flags & CBK_UNIQUE; ++ result = RETERR(-ENOENT); ++ ++ /* ++ * this is time-critical function and dragons had, hence, been settled ++ * here. ++ * ++ * Loop below scans cbk cache slots trying to find matching node with ++ * suitable range of delimiting keys and located at the h->level. ++ * ++ * Scan is done under cbk cache spin lock that protects slot->node ++ * pointers. If suitable node is found we want to pin it in ++ * memory. But slot->node can point to the node with x_count 0 ++ * (unreferenced). Such node can be recycled at any moment, or can ++ * already be in the process of being recycled (within jput()). ++ * ++ * As we found node in the cbk cache, it means that jput() hasn't yet ++ * called cbk_cache_invalidate(). 
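++ *
++ * (The shape of that race avoidance, as a hedged sketch with invented
++ * names rather than the actual jnode calls:
++ *
++ * rcu_read_lock();
++ * obj = find_slot(cache, key); // may be mid-teardown
++ * if (obj)
++ * take_ref(obj); // pin before validating
++ * rcu_read_unlock();
++ * if (obj && is_being_freed(obj)) { // the RIP-bit recheck
++ * drop_ref(obj);
++ * obj = NULL;
++ * }
++ *
++ * i.e. reference first, validate second; RCU only guarantees the
++ * memory stays readable while we look.)
++ *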
++ * ++ * We acquire reference to the node without holding tree lock, and ++ * later, check node's RIP bit. This avoids races with jput(). ++ */ ++ ++ rcu_read_lock(); ++ read_lock(&((cbk_cache *)cache)->guard); ++ ++ slot = list_entry(cache->lru.next, cbk_cache_slot, lru); ++ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru); ++ BUG_ON(&slot->lru != &cache->lru);/*????*/ ++ while (1) { ++ ++ slot = list_entry(slot->lru.next, cbk_cache_slot, lru); ++ ++ if (&cache->lru != &slot->lru) ++ node = slot->node; ++ else ++ node = NULL; ++ ++ if (unlikely(node == NULL)) ++ break; ++ ++ /* ++ * this is (hopefully) the only place in the code where we are ++ * working with delimiting keys without holding dk lock. This ++ * is fine here, because this is only "guess" anyway---keys ++ * are rechecked under dk lock below. ++ */ ++ if (znode_get_level(node) == level && ++ /* reiser4_min_key < key < reiser4_max_key */ ++ znode_contains_key_strict(node, key, isunique)) { ++ zref(node); ++ result = 0; ++ spin_lock_prefetch(&tree->tree_lock); ++ break; ++ } ++ } ++ read_unlock(&((cbk_cache *)cache)->guard); ++ ++ assert("nikita-2475", cbk_cache_invariant(cache)); ++ ++ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP))) ++ result = -ENOENT; ++ ++ rcu_read_unlock(); ++ ++ if (result != 0) { ++ h->result = CBK_COORD_NOTFOUND; ++ return RETERR(-ENOENT); ++ } ++ ++ result = ++ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h), ++ ZNODE_LOCK_LOPRI); ++ zput(node); ++ if (result != 0) ++ return result; ++ result = zload(node); ++ if (result != 0) ++ return result; ++ ++ /* recheck keys */ ++ read_lock_dk(tree); ++ result = (znode_contains_key_strict(node, key, isunique) && ++ !ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ read_unlock_dk(tree); ++ if (result) { ++ /* do lookup inside node */ ++ llr = cbk_node_lookup(h); ++ /* if cbk_node_lookup() wandered to another node (due to eottl ++ or non-unique keys), adjust @node */ ++ /*node = h->active_lh->node; */ ++ ++ if (llr != LOOKUP_DONE) { ++ /* restart or continue on the next level */ ++ result = RETERR(-ENOENT); ++ } else if (IS_CBKERR(h->result)) ++ /* io or oom */ ++ result = RETERR(-ENOENT); ++ else { ++ /* good. Either item found or definitely not found. */ ++ result = 0; ++ ++ write_lock(&(cache->guard)); ++ if (slot->node == h->active_lh->node) { ++ /* if this node is still in cbk cache---move ++ its slot to the head of the LRU list. */ ++ list_move(&slot->lru, &cache->lru); ++ } ++ write_unlock(&(cache->guard)); ++ } ++ } else { ++ /* race. While this thread was waiting for the lock, node was ++ rebalanced and item we are looking for, shifted out of it ++ (if it ever was here). ++ ++ Continuing scanning is almost hopeless: node key range was ++ moved to, is almost certainly at the beginning of the LRU ++ list at this time, because it's hot, but restarting ++ scanning from the very beginning is complex. Just return, ++ so that cbk() will be performed. This is not that ++ important, because such races should be rare. Are they? ++ */ ++ result = RETERR(-ENOENT); /* -ERAUGHT */ ++ } ++ zrelse(node); ++ assert("nikita-2476", cbk_cache_invariant(cache)); ++ return result; ++} ++ ++/* look for item with given key in the coord cache ++ ++ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache) ++ which is a small LRU list of znodes accessed lately. For each znode in ++ znode in this list, it checks whether key we are looking for fits into key ++ range covered by this node. 
If so, and in addition, node lies at allowed
++ level (this is to handle extents on a twig level), node is locked, and
++ lookup inside it is performed.
++
++ we need a measurement of the cost of this cache search compared to the cost
++ of coord_by_key.
++
++*/
++static int cbk_cache_search(cbk_handle * h/* cbk handle */)
++{
++ int result = 0;
++ tree_level level;
++
++ /* add CBK_IN_CACHE to the handle flags. This means that
++ * cbk_node_lookup() assumes that cbk_cache is scanned and would add
++ * found node to the cache. */
++ h->flags |= CBK_IN_CACHE;
++ for (level = h->stop_level; level <= h->lock_level; ++level) {
++ h->level = level;
++ result = cbk_cache_scan_slots(h);
++ if (result != 0) {
++ done_lh(h->active_lh);
++ done_lh(h->parent_lh);
++ } else {
++ assert("nikita-1319", !IS_CBKERR(h->result));
++ break;
++ }
++ }
++ h->flags &= ~CBK_IN_CACHE;
++ return result;
++}
++
++/* type of lock we want to obtain during tree traversal. On stop level
++ we want type of lock user asked for, on upper levels: read lock. */
++znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
++{
++ assert("nikita-382", h != NULL);
++
++ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
++}
++
++/* update outdated delimiting keys */
++static void stale_dk(reiser4_tree * tree, znode * node)
++{
++ znode *right;
++
++ read_lock_tree(tree);
++ write_lock_dk(tree);
++ right = node->right;
++
++ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
++ right && ZF_ISSET(right, JNODE_DKSET) &&
++ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
++ znode_set_rd_key(node, znode_get_ld_key(right));
++
++ write_unlock_dk(tree);
++ read_unlock_tree(tree);
++}
++
++/* check for possibly outdated delimiting keys, and update them if
++ * necessary. */
++static void update_stale_dk(reiser4_tree * tree, znode * node)
++{
++ znode *right;
++ reiser4_key rd;
++
++ read_lock_tree(tree);
++ read_lock_dk(tree);
++ rd = *znode_get_rd_key(node);
++ right = node->right;
++ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
++ right && ZF_ISSET(right, JNODE_DKSET) &&
++ !keyeq(&rd, znode_get_ld_key(right)))) {
++ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
++ read_unlock_dk(tree);
++ read_unlock_tree(tree);
++ stale_dk(tree, node);
++ return;
++ }
++ read_unlock_dk(tree);
++ read_unlock_tree(tree);
++}
++
++/*
++ * handle searches for a non-unique key.
++ *
++ * Suppose that we are looking for an item with possibly non-unique key 100.
++ *
++ * Root node contains two pointers: one to a node with left delimiting key 0,
++ * and another to a node with left delimiting key 100. Item we are interested
++ * in may well be in the sub-tree rooted at the first pointer.
++ *
++ * To handle this search_to_left() is called when search reaches stop
++ * level. This function checks whether it is _possible_ that item we are
++ * looking for is in the left neighbor (this can be done by comparing
++ * delimiting keys) and if so, tries to lock left neighbor (this is low
++ * priority lock, so it can deadlock, tree traversal is just restarted if it
++ * did) and then checks whether left neighbor actually contains items with
++ * our key.
++ *
++ * Note that this is done on the stop level only. It is possible to try such
++ * left-check on each level, but as duplicate keys are supposed to be rare
++ * (very unlikely that more than one node is completely filled with items with
++ * duplicate keys), it is cheaper to scan to the left on the stop level once.
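++ *
++ * (A concrete made-up picture: with duplicate key 100, the parent may
++ * hold child pointers with left delimiting keys 0 and 100, and items
++ * keyed 100 can legally end one child and begin the next. In sketch
++ * form, with invented helper names:
++ *
++ * if (keyeq(leftmost_key(node), key)) // key == left delimiting key
++ * check_left_neighbor(node, key); // duplicates may continue there
++ *
++ * which mirrors the leftmost-unit test performed below.)
++ *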
++ * ++ */ ++static level_lookup_result search_to_left(cbk_handle * h/* search handle */) ++{ ++ level_lookup_result result; ++ coord_t *coord; ++ znode *node; ++ znode *neighbor; ++ ++ lock_handle lh; ++ ++ assert("nikita-1761", h != NULL); ++ assert("nikita-1762", h->level == h->stop_level); ++ ++ init_lh(&lh); ++ coord = h->coord; ++ node = h->active_lh->node; ++ assert("nikita-1763", coord_is_leftmost_unit(coord)); ++ ++ h->result = ++ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode, ++ GN_CAN_USE_UPPER_LEVELS); ++ neighbor = NULL; ++ switch (h->result) { ++ case -E_DEADLOCK: ++ result = LOOKUP_REST; ++ break; ++ case 0:{ ++ node_plugin *nplug; ++ coord_t crd; ++ lookup_bias bias; ++ ++ neighbor = lh.node; ++ h->result = zload(neighbor); ++ if (h->result != 0) { ++ result = LOOKUP_DONE; ++ break; ++ } ++ ++ nplug = neighbor->nplug; ++ ++ coord_init_zero(&crd); ++ bias = h->bias; ++ h->bias = FIND_EXACT; ++ h->result = ++ nplug->lookup(neighbor, h->key, h->bias, &crd); ++ h->bias = bias; ++ ++ if (h->result == NS_NOT_FOUND) { ++ case -E_NO_NEIGHBOR: ++ h->result = CBK_COORD_FOUND; ++ if (!(h->flags & CBK_IN_CACHE)) ++ cbk_cache_add(node); ++ default: /* some other error */ ++ result = LOOKUP_DONE; ++ } else if (h->result == NS_FOUND) { ++ read_lock_dk(znode_get_tree(neighbor)); ++ h->rd_key = *znode_get_ld_key(node); ++ leftmost_key_in_node(neighbor, &h->ld_key); ++ read_unlock_dk(znode_get_tree(neighbor)); ++ h->flags |= CBK_DKSET; ++ ++ h->block = *znode_get_block(neighbor); ++ /* clear coord->node so that cbk_level_lookup() ++ wouldn't overwrite parent hint in neighbor. ++ ++ Parent hint was set up by ++ reiser4_get_left_neighbor() ++ */ ++ /* FIXME: why do we have to spinlock here? */ ++ write_lock_tree(znode_get_tree(neighbor)); ++ h->coord->node = NULL; ++ write_unlock_tree(znode_get_tree(neighbor)); ++ result = LOOKUP_CONT; ++ } else { ++ result = LOOKUP_DONE; ++ } ++ if (neighbor != NULL) ++ zrelse(neighbor); ++ } ++ } ++ done_lh(&lh); ++ return result; ++} ++ ++/* debugging aid: return symbolic name of search bias */ ++static const char *bias_name(lookup_bias bias/* bias to get name of */) ++{ ++ if (bias == FIND_EXACT) ++ return "exact"; ++ else if (bias == FIND_MAX_NOT_MORE_THAN) ++ return "left-slant"; ++/* else if( bias == RIGHT_SLANT_BIAS ) */ ++/* return "right-bias"; */ ++ else { ++ static char buf[30]; ++ ++ sprintf(buf, "unknown: %i", bias); ++ return buf; ++ } ++} ++ ++#if REISER4_DEBUG ++/* debugging aid: print human readable information about @p */ ++void print_coord_content(const char *prefix /* prefix to print */ , ++ coord_t *p/* coord to print */) ++{ ++ reiser4_key key; ++ ++ if (p == NULL) { ++ printk("%s: null\n", prefix); ++ return; ++ } ++ if ((p->node != NULL) && znode_is_loaded(p->node) ++ && coord_is_existing_item(p)) ++ printk("%s: data: %p, length: %i\n", prefix, ++ item_body_by_coord(p), item_length_by_coord(p)); ++ if (znode_is_loaded(p->node)) { ++ item_key_by_coord(p, &key); ++ reiser4_print_key(prefix, &key); ++ } ++} ++ ++/* debugging aid: print human readable information about @block */ ++void reiser4_print_address(const char *prefix /* prefix to print */ , ++ const reiser4_block_nr * block/* block number to print */) ++{ ++ printk("%s: %s\n", prefix, sprint_address(block)); ++} ++#endif ++ ++/* return string containing human readable representation of @block */ ++char *sprint_address(const reiser4_block_nr * ++ block/* block number to print */) ++{ ++ static char address[30]; ++ ++ if (block == NULL) ++ sprintf(address, "null"); ++ else 
if (reiser4_blocknr_is_fake(block)) ++ sprintf(address, "%llx", (unsigned long long)(*block)); ++ else ++ sprintf(address, "%llu", (unsigned long long)(*block)); ++ return address; ++} ++ ++/* release parent node during traversal */ ++static void put_parent(cbk_handle * h/* search handle */) ++{ ++ assert("nikita-383", h != NULL); ++ if (h->parent_lh->node != NULL) ++ longterm_unlock_znode(h->parent_lh); ++} ++ ++/* helper function used by coord_by_key(): release reference to parent znode ++ stored in handle before processing its child. */ ++static void hput(cbk_handle * h/* search handle */) ++{ ++ assert("nikita-385", h != NULL); ++ done_lh(h->parent_lh); ++ done_lh(h->active_lh); ++} ++ ++/* Helper function used by cbk(): update delimiting keys of child node (stored ++ in h->active_lh->node) using key taken from parent on the parent level. */ ++static int setup_delimiting_keys(cbk_handle * h/* search handle */) ++{ ++ znode *active; ++ reiser4_tree *tree; ++ ++ assert("nikita-1088", h != NULL); ++ ++ active = h->active_lh->node; ++ ++ /* fast check without taking dk lock. This is safe, because ++ * JNODE_DKSET is never cleared once set. */ ++ if (!ZF_ISSET(active, JNODE_DKSET)) { ++ tree = znode_get_tree(active); ++ write_lock_dk(tree); ++ if (!ZF_ISSET(active, JNODE_DKSET)) { ++ znode_set_ld_key(active, &h->ld_key); ++ znode_set_rd_key(active, &h->rd_key); ++ ZF_SET(active, JNODE_DKSET); ++ } ++ write_unlock_dk(tree); ++ return 1; ++ } ++ return 0; ++} ++ ++/* true if @block makes sense for the @tree. Used to detect corrupted node ++ * pointers */ ++static int ++block_nr_is_correct(reiser4_block_nr * block /* block number to check */ , ++ reiser4_tree * tree/* tree to check against */) ++{ ++ assert("nikita-757", block != NULL); ++ assert("nikita-758", tree != NULL); ++ ++ /* check to see if it exceeds the size of the device. */ ++ return reiser4_blocknr_is_sane_for(tree->super, block); ++} ++ ++/* check consistency of fields */ ++static int sanity_check(cbk_handle * h/* search handle */) ++{ ++ assert("nikita-384", h != NULL); ++ ++ if (h->level < h->stop_level) { ++ h->error = "Buried under leaves"; ++ h->result = RETERR(-EIO); ++ return LOOKUP_DONE; ++ } else if (!block_nr_is_correct(&h->block, h->tree)) { ++ h->error = "bad block number"; ++ h->result = RETERR(-EIO); ++ return LOOKUP_DONE; ++ } else ++ return 0; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/status_flags.c linux-2.6.33/fs/reiser4/status_flags.c +--- linux-2.6.33.orig/fs/reiser4/status_flags.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/status_flags.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,174 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Functions that deal with reiser4 status block, query status and update it, ++ * if needed */ ++ ++#include <linux/bio.h> ++#include <linux/highmem.h> ++#include <linux/fs.h> ++#include <linux/blkdev.h> ++#include "debug.h" ++#include "dformat.h" ++#include "status_flags.h" ++#include "super.h" ++ ++/* This is our end I/O handler that marks page uptodate if IO was successful. ++ It also unconditionally unlocks the page, so we can see that io was done. ++ We do not free bio, because we hope to reuse that. 
*/
++static void reiser4_status_endio(struct bio *bio, int err)
++{
++ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
++ SetPageUptodate(bio->bi_io_vec->bv_page);
++ } else {
++ ClearPageUptodate(bio->bi_io_vec->bv_page);
++ SetPageError(bio->bi_io_vec->bv_page);
++ }
++ unlock_page(bio->bi_io_vec->bv_page);
++}
++
++/* Initialise status code. This is expected to be called from the disk format
++ code. The block parameter is where the status block lives. */
++int reiser4_status_init(reiser4_block_nr block)
++{
++ struct super_block *sb = reiser4_get_current_sb();
++ struct reiser4_status *statuspage;
++ struct bio *bio;
++ struct page *page;
++
++ get_super_private(sb)->status_page = NULL;
++ get_super_private(sb)->status_bio = NULL;
++
++ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
++ if (!page)
++ return -ENOMEM;
++
++ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
++ if (bio != NULL) {
++ bio->bi_sector = block * (sb->s_blocksize >> 9);
++ bio->bi_bdev = sb->s_bdev;
++ bio->bi_io_vec[0].bv_page = page;
++ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
++ bio->bi_io_vec[0].bv_offset = 0;
++ bio->bi_vcnt = 1;
++ bio->bi_size = sb->s_blocksize;
++ bio->bi_end_io = reiser4_status_endio;
++ } else {
++ __free_pages(page, 0);
++ return -ENOMEM;
++ }
++ lock_page(page);
++ submit_bio(READ, bio);
++ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
++ wait_on_page_locked(page);
++ if (!PageUptodate(page)) {
++ warning("green-2007",
++ "I/O error while trying to read status page\n");
++ return -EIO;
++ }
++
++ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
++ if (memcmp
++ (statuspage->magic, REISER4_STATUS_MAGIC,
++ sizeof(REISER4_STATUS_MAGIC))) {
++ /* Magic does not match. */
++ kunmap_atomic((char *)statuspage, KM_USER0);
++ warning("green-2008", "Wrong magic in status block\n");
++ __free_pages(page, 0);
++ bio_put(bio);
++ return -EINVAL;
++ }
++ kunmap_atomic((char *)statuspage, KM_USER0);
++
++ get_super_private(sb)->status_page = page;
++ get_super_private(sb)->status_bio = bio;
++ return 0;
++}
++
++/* Query the status of fs. Returns whether the FS can be safely mounted.
++ Also if "status" and "extended" parameters are given, it will fill
++ actual parts of status from disk there. */
++int reiser4_status_query(u64 *status, u64 *extended)
++{
++ struct super_block *sb = reiser4_get_current_sb();
++ struct reiser4_status *statuspage;
++ int retval;
++
++ if (!get_super_private(sb)->status_page)
++ /* No status page? */
++ return REISER4_STATUS_MOUNT_UNKNOWN;
++ statuspage = (struct reiser4_status *)
++ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
++ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) {
++ /* FIXME: this cast is a hack for 32 bit arches to work. */
++ case REISER4_STATUS_OK:
++ retval = REISER4_STATUS_MOUNT_OK;
++ break;
++ case REISER4_STATUS_CORRUPTED:
++ retval = REISER4_STATUS_MOUNT_WARN;
++ break;
++ case REISER4_STATUS_DAMAGED:
++ case REISER4_STATUS_DESTROYED:
++ case REISER4_STATUS_IOERROR:
++ retval = REISER4_STATUS_MOUNT_RO;
++ break;
++ default:
++ retval = REISER4_STATUS_MOUNT_UNKNOWN;
++ break;
++ }
++
++ if (status)
++ *status = le64_to_cpu(get_unaligned(&statuspage->status));
++ if (extended)
++ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
++
++ kunmap_atomic((char *)statuspage, KM_USER0);
++ return retval;
++}
++
++/* This function should be called when something bad happens (e.g. from
++ reiser4_panic).
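++
++ (For illustration, the magic check done at init time can be reproduced
++ from userspace; a minimal hedged sketch -- fd, blocknr and blocksize
++ are assumptions of the sketch, the magic string is REISER4_STATUS_MAGIC
++ from status_flags.h:
++
++ char buf[512];
++ if (pread(fd, buf, sizeof(buf), (off_t)blocknr * blocksize) == sizeof(buf)
++ && memcmp(buf, "ReiSeR4StATusBl", 16) == 0)
++ puts("status block looks valid");
++
++ The 16-byte compare matches the kernel's memcmp over sizeof(magic),
++ trailing NUL included.)
++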
It fills the status structure and tries to push it to disk.*/ ++int reiser4_status_write(__u64 status, __u64 extended_status, char *message) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ struct reiser4_status *statuspage; ++ struct bio *bio = get_super_private(sb)->status_bio; ++ ++ if (!get_super_private(sb)->status_page) ++ /* No status page? */ ++ return -1; ++ statuspage = (struct reiser4_status *) ++ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); ++ ++ put_unaligned(cpu_to_le64(status), &statuspage->status); ++ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status); ++ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN); ++ ++ kunmap_atomic((char *)statuspage, KM_USER0); ++ bio->bi_bdev = sb->s_bdev; ++ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page; ++ bio->bi_io_vec[0].bv_len = sb->s_blocksize; ++ bio->bi_io_vec[0].bv_offset = 0; ++ bio->bi_vcnt = 1; ++ bio->bi_size = sb->s_blocksize; ++ bio->bi_end_io = reiser4_status_endio; ++ lock_page(get_super_private(sb)->status_page); /* Safe as nobody should ++ * touch our page. */ ++ /* We can block now, but we have no other choice anyway */ ++ submit_bio(WRITE, bio); ++ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping); ++ return 0; /* We do not wait for io to finish. */ ++} ++ ++/* Frees the page with status and bio structure. Should be called by disk format ++ * at umount time */ ++int reiser4_status_finish(void) ++{ ++ struct super_block *sb = reiser4_get_current_sb(); ++ ++ __free_pages(get_super_private(sb)->status_page, 0); ++ get_super_private(sb)->status_page = NULL; ++ bio_put(get_super_private(sb)->status_bio); ++ get_super_private(sb)->status_bio = NULL; ++ return 0; ++} +diff -urN linux-2.6.33.orig/fs/reiser4/status_flags.h linux-2.6.33/fs/reiser4/status_flags.h +--- linux-2.6.33.orig/fs/reiser4/status_flags.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/status_flags.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,47 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Here we declare structures and flags that store reiser4 status on disk. ++ The status that helps us to find out if the filesystem is valid or if it ++ contains some critical, or not so critical errors */ ++ ++#if !defined(__REISER4_STATUS_FLAGS_H__) ++#define __REISER4_STATUS_FLAGS_H__ ++ ++#include "dformat.h" ++/* These are major status flags */ ++#define REISER4_STATUS_OK 0 ++#define REISER4_STATUS_CORRUPTED 0x1 ++#define REISER4_STATUS_DAMAGED 0x2 ++#define REISER4_STATUS_DESTROYED 0x4 ++#define REISER4_STATUS_IOERROR 0x8 ++ ++/* Return values for reiser4_status_query() */ ++#define REISER4_STATUS_MOUNT_OK 0 ++#define REISER4_STATUS_MOUNT_WARN 1 ++#define REISER4_STATUS_MOUNT_RO 2 ++#define REISER4_STATUS_MOUNT_UNKNOWN -1 ++ ++#define REISER4_TEXTERROR_LEN 256 ++ ++#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl" ++/* We probably need to keep its size under sector size which is 512 bytes */ ++struct reiser4_status { ++ char magic[16]; ++ d64 status; /* Current FS state */ ++ d64 extended_status; /* Any additional info that might have sense in ++ * addition to "status". E.g. 
last sector where
++ * io error happened if status is
++ * "io error encountered" */
++ d64 stacktrace[10]; /* Last ten function calls made (addresses) */
++ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if
++ * appropriate, otherwise filled
++ * with zeroes */
++};
++
++int reiser4_status_init(reiser4_block_nr block);
++int reiser4_status_query(u64 *status, u64 *extended);
++int reiser4_status_write(u64 status, u64 extended_status, char *message);
++int reiser4_status_finish(void);
++
++#endif
+diff -urN linux-2.6.33.orig/fs/reiser4/super.c linux-2.6.33/fs/reiser4/super.c
+--- linux-2.6.33.orig/fs/reiser4/super.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/super.c 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,306 @@
++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Super-block manipulations. */
++
++#include "debug.h"
++#include "dformat.h"
++#include "key.h"
++#include "plugin/security/perm.h"
++#include "plugin/space/space_allocator.h"
++#include "plugin/plugin.h"
++#include "tree.h"
++#include "vfs_ops.h"
++#include "super.h"
++#include "reiser4.h"
++
++#include <linux/types.h> /* for __u?? */
++#include <linux/fs.h> /* for struct super_block */
++
++static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
++static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
++static __u64 reserved_for_root(const struct super_block *super);
++
++/* Return reiser4-specific part of super block */
++reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super)
++{
++ return (reiser4_super_info_data *) super->s_fs_info;
++}
++
++/* Return reiser4 fstype: value that is returned in ->f_type field by statfs()
++ */
++long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
++{
++ assert("nikita-448", super != NULL);
++ assert("nikita-449", is_reiser4_super(super));
++ return (long)REISER4_SUPER_MAGIC;
++}
++
++/* functions to read/modify fields of reiser4_super_info_data */
++
++/* get number of blocks in file system */
++__u64 reiser4_block_count(const struct super_block *super /* super block
++ queried */ )
++{
++ assert("vs-494", super != NULL);
++ assert("vs-495", is_reiser4_super(super));
++ return get_super_private(super)->block_count;
++}
++
++#if REISER4_DEBUG
++/*
++ * number of blocks in the current file system
++ */
++__u64 reiser4_current_block_count(void)
++{
++ return get_current_super_private()->block_count;
++}
++#endif /* REISER4_DEBUG */
++
++/* set number of blocks in filesystem */
++void reiser4_set_block_count(const struct super_block *super, __u64 nr)
++{
++ assert("vs-501", super != NULL);
++ assert("vs-502", is_reiser4_super(super));
++ get_super_private(super)->block_count = nr;
++ /*
++ * For the proper calculation of the reserved space counter (5% of the
++ * device block counter) we need a 64 bit division, which is missing
++ * in Linux on the i386 platform. Because we do not need a precise
++ * calculation here we can replace a div64 operation by this
++ * combination of multiplication and shift: 51. / (2^10) == .0498 .
++ * FIXME: this is a bug. It comes up only for very small filesystems
++ * which probably are never used. Nevertheless, it is a bug. Number of
++ * reserved blocks must not be less than maximal number of blocks which
++ * get grabbed with BA_RESERVED.
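++ *
++ * Worked example: for nr = 1000000 blocks the substitute computes
++ * (1000000 * 51) >> 10 = 49804 blocks, about 4.98%, against the exact
++ * 50000 -- close enough for a reserve, and it needs only one 64-bit
++ * multiply plus a shift:
++ *
++ * reserved = (nr * 51) >> 10; // ~nr * 0.0498, no division
++ *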
++ */
++ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
++}
++
++/* amount of blocks used (allocated for data) in file system */
++__u64 reiser4_data_blocks(const struct super_block *super /* super block
++ queried */ )
++{
++ assert("nikita-452", super != NULL);
++ assert("nikita-453", is_reiser4_super(super));
++ return get_super_private(super)->blocks_used;
++}
++
++/* set number of blocks used in filesystem */
++void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
++{
++ assert("vs-503", super != NULL);
++ assert("vs-504", is_reiser4_super(super));
++ get_super_private(super)->blocks_used = nr;
++}
++
++/* amount of free blocks in file system */
++__u64 reiser4_free_blocks(const struct super_block *super /* super block
++ queried */ )
++{
++ assert("nikita-454", super != NULL);
++ assert("nikita-455", is_reiser4_super(super));
++ return get_super_private(super)->blocks_free;
++}
++
++/* set number of blocks free in filesystem */
++void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
++{
++ assert("vs-505", super != NULL);
++ assert("vs-506", is_reiser4_super(super));
++ get_super_private(super)->blocks_free = nr;
++}
++
++/* get mkfs unique identifier */
++__u32 reiser4_mkfs_id(const struct super_block *super /* super block
++ queried */ )
++{
++ assert("vpf-221", super != NULL);
++ assert("vpf-222", is_reiser4_super(super));
++ return get_super_private(super)->mkfs_id;
++}
++
++/* amount of committed free blocks in file system */
++__u64 reiser4_free_committed_blocks(const struct super_block *super)
++{
++ assert("vs-497", super != NULL);
++ assert("vs-498", is_reiser4_super(super));
++ return get_super_private(super)->blocks_free_committed;
++}
++
++/* amount of blocks in the file system reserved for @uid and @gid */
++long reiser4_reserved_blocks(const struct super_block *super /* super block
++ queried */ ,
++ uid_t uid /* user id */ ,
++ gid_t gid/* group id */)
++{
++ long reserved;
++
++ assert("nikita-456", super != NULL);
++ assert("nikita-457", is_reiser4_super(super));
++
++ reserved = 0;
++ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
++ reserved += reserved_for_gid(super, gid);
++ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
++ reserved += reserved_for_uid(super, uid);
++ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
++ reserved += reserved_for_root(super);
++ return reserved;
++}
++
++/* get/set value of/to grabbed blocks counter */
++__u64 reiser4_grabbed_blocks(const struct super_block * super)
++{
++ assert("zam-512", super != NULL);
++ assert("zam-513", is_reiser4_super(super));
++
++ return get_super_private(super)->blocks_grabbed;
++}
++
++__u64 reiser4_flush_reserved(const struct super_block *super)
++{
++ assert("vpf-285", super != NULL);
++ assert("vpf-286", is_reiser4_super(super));
++
++ return get_super_private(super)->blocks_flush_reserved;
++}
++
++/* get/set value of/to counter of fake allocated formatted blocks */
++__u64 reiser4_fake_allocated(const struct super_block *super)
++{
++ assert("zam-516", super != NULL);
++ assert("zam-517", is_reiser4_super(super));
++
++ return get_super_private(super)->blocks_fake_allocated;
++}
++
++/* get/set value of/to counter of fake allocated unformatted blocks */
++__u64 reiser4_fake_allocated_unformatted(const struct super_block *super)
++{
++ assert("zam-516", super != NULL);
++ assert("zam-517", is_reiser4_super(super));
++
++ return get_super_private(super)->blocks_fake_allocated_unformatted;
++}
++
++/* get/set value of/to counter of clustered blocks */
++__u64 reiser4_clustered_blocks(const struct super_block *super)
++{
++ assert("edward-601", super != NULL);
++ assert("edward-602", is_reiser4_super(super));
++
++ return get_super_private(super)->blocks_clustered;
++}
++
++/* space allocator used by this file system */
++reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
++ *super)
++{
++ assert("nikita-1965", super != NULL);
++ assert("nikita-1966", is_reiser4_super(super));
++ return &get_super_private(super)->space_allocator;
++}
++
++/* return fake inode used to bind formatted nodes in the page cache */
++struct inode *reiser4_get_super_fake(const struct super_block *super)
++{
++ assert("nikita-1757", super != NULL);
++ return get_super_private(super)->fake;
++}
++
++/* return fake inode used to bind copied on capture nodes in the page cache */
++struct inode *reiser4_get_cc_fake(const struct super_block *super)
++{
++ assert("nikita-1757", super != NULL);
++ return get_super_private(super)->cc;
++}
++
++/* return fake inode used to bind bitmaps and journal heads */
++struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
++{
++ assert("nikita-17571", super != NULL);
++ return get_super_private(super)->bitmap;
++}
++
++/* tree used by this file system */
++reiser4_tree *reiser4_get_tree(const struct super_block *super)
++{
++ assert("nikita-460", super != NULL);
++ assert("nikita-461", is_reiser4_super(super));
++ return &get_super_private(super)->tree;
++}
++
++/* Check that @super is (looks like) a reiser4 super block. This is mainly
++ for use in assertions. */
++int is_reiser4_super(const struct super_block *super)
++{
++ return
++ super != NULL &&
++ get_super_private(super) != NULL &&
++ super->s_op == &(get_super_private(super)->ops.super);
++}
++
++int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
++{
++ return test_bit((int)f, &get_super_private(super)->fs_flags);
++}
++
++/* amount of blocks reserved for given group in file system */
++static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG,
++ gid_t gid UNUSED_ARG/* group id */)
++{
++ return 0;
++}
++
++/* amount of blocks reserved for given user in file system */
++static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG,
++ uid_t uid UNUSED_ARG/* user id */)
++{
++ return 0;
++}
++
++/* amount of blocks reserved for super user in file system */
++static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG)
++{
++ return 0;
++}
++
++/*
++ * true if block number @blk makes sense for the file system at @super.
++ */
++int
++reiser4_blocknr_is_sane_for(const struct super_block *super,
++ const reiser4_block_nr * blk)
++{
++ reiser4_super_info_data *sbinfo;
++
++ assert("nikita-2957", super != NULL);
++ assert("nikita-2958", blk != NULL);
++
++ if (reiser4_blocknr_is_fake(blk))
++ return 1;
++
++ sbinfo = get_super_private(super);
++ return *blk < sbinfo->block_count;
++}
++
++#if REISER4_DEBUG
++/*
++ * true, if block number @blk makes sense for the current file system
++ */
++int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
++{
++ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
++}
++#endif /* REISER4_DEBUG */
++
++/* Make Linus happy.
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 120
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/super.h linux-2.6.33/fs/reiser4/super.h
+--- linux-2.6.33.orig/fs/reiser4/super.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/super.h 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,466 @@
++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++/* Super-block functions. See super.c for details. */
++
++#if !defined(__REISER4_SUPER_H__)
++#define __REISER4_SUPER_H__
++
++#include <linux/exportfs.h>
++
++#include "tree.h"
++#include "entd.h"
++#include "wander.h"
++#include "fsdata.h"
++#include "plugin/object.h"
++#include "plugin/space/space_allocator.h"
++
++/*
++ * Flush algorithms parameters.
++ */
++struct flush_params {
++ unsigned relocate_threshold;
++ unsigned relocate_distance;
++ unsigned written_threshold;
++ unsigned scan_maxnodes;
++};
++
++typedef enum {
++ /*
++ * True if this file system doesn't support hard-links (multiple names)
++ * for directories: this is the default UNIX behavior.
++ *
++ * If hard-links on directories are not allowed, the file system is an
++ * Acyclic Directed Graph (modulo dot, and dotdot, of course).
++ *
++ * This is used by reiser4_link().
++ */
++ REISER4_ADG = 0,
++ /*
++ * set if all nodes in internal tree have the same node layout plugin.
++ * If so, znode_guess_plugin() will return tree->node_plugin instead
++ * of guessing the plugin by the plugin id stored in the node.
++ */
++ REISER4_ONE_NODE_PLUGIN = 1,
++ /* if set, bsd gid assignment is supported. */
++ REISER4_BSD_GID = 2,
++ /* [mac]_time are 32 bit in inode */
++ REISER4_32_BIT_TIMES = 3,
++ /* load all bitmap blocks at mount time */
++ REISER4_DONT_LOAD_BITMAP = 5,
++ /* enforce atomicity during write(2) */
++ REISER4_ATOMIC_WRITE = 6,
++ /* don't use write barriers in the log writer code. */
++ REISER4_NO_WRITE_BARRIER = 7
++} reiser4_fs_flag;
++
++/*
++ * VFS related operation vectors.
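++ *
++ * Embedding the tables in one aggregate lets a single private pointer
++ * reach every VFS-facing vector; a hedged sketch of the pattern with
++ * invented names:
++ *
++ * struct my_ops {
++ * struct super_operations super; // embedded, not pointed-to
++ * struct dentry_operations dentry;
++ * };
++ * // sb->s_op = &get_private(sb)->ops.super;
++ *
++ * It also gives is_reiser4_super() its identity test: a super block is
++ * ours iff s_op points at our embedded table.
++ *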
++ */
++struct object_ops {
++ struct super_operations super;
++ struct dentry_operations dentry;
++ struct export_operations export;
++};
++
++/* reiser4-specific part of super block
++
++ Locking
++
++ Fields immutable after mount:
++
++ ->oid*
++ ->space*
++ ->default_[ug]id
++ ->mkfs_id
++ ->trace_flags
++ ->debug_flags
++ ->fs_flags
++ ->df_plug
++ ->optimal_io_size
++ ->plug
++ ->flush
++ ->u (bad name)
++ ->txnmgr
++ ->ra_params
++ ->fsuid
++ ->journal_header
++ ->journal_footer
++
++ Fields protected by ->lnode_guard
++
++ ->lnode_htable
++
++ Fields protected by per-super block spin lock
++
++ ->block_count
++ ->blocks_used
++ ->blocks_free
++ ->blocks_free_committed
++ ->blocks_grabbed
++ ->blocks_fake_allocated_unformatted
++ ->blocks_fake_allocated
++ ->blocks_flush_reserved
++ ->eflushed
++ ->blocknr_hint_default
++
++ After journal replaying during mount,
++
++ ->last_committed_tx
++
++ is protected by ->tmgr.commit_mutex
++
++ Invariants involving this data-type:
++
++ [sb-block-counts]
++ [sb-grabbed]
++ [sb-fake-allocated]
++*/
++struct reiser4_super_info_data {
++ /*
++ * guard spinlock which protects reiser4 super block fields (currently
++ * blocks_free, blocks_free_committed)
++ */
++ spinlock_t guard;
++
++ /* next oid that will be returned by oid_allocate() */
++ oid_t next_to_use;
++ /* total number of used oids */
++ oid_t oids_in_use;
++
++ /* space manager plugin */
++ reiser4_space_allocator space_allocator;
++
++ /* reiser4 internal tree */
++ reiser4_tree tree;
++
++ /*
++ * default user id used for light-weight files without their own
++ * stat-data.
++ */
++ uid_t default_uid;
++
++ /*
++ * default group id used for light-weight files without their own
++ * stat-data.
++ */
++ gid_t default_gid;
++
++ /* mkfs identifier generated at mkfs time. */
++ __u32 mkfs_id;
++ /* amount of blocks in a file system */
++ __u64 block_count;
++
++ /* inviolable reserve */
++ __u64 blocks_reserved;
++
++ /* amount of blocks used by file system data and meta-data. */
++ __u64 blocks_used;
++
++ /*
++ * amount of free blocks. This is "working" free blocks counter. It is
++ * like "working" bitmap, please see block_alloc.c for description.
++ */
++ __u64 blocks_free;
++
++ /*
++ * free block count for fs committed state. This is "commit" version of
++ * free block counter.
++ */
++ __u64 blocks_free_committed;
++
++ /*
++ * number of blocks reserved for further allocation, for all
++ * threads.
++ */
++ __u64 blocks_grabbed;
++
++ /* number of fake allocated unformatted blocks in tree. */
++ __u64 blocks_fake_allocated_unformatted;
++
++ /* number of fake allocated formatted blocks in tree. */
++ __u64 blocks_fake_allocated;
++
++ /* number of blocks reserved for flush operations. */
++ __u64 blocks_flush_reserved;
++
++ /* number of blocks reserved for cluster operations. */
++ __u64 blocks_clustered;
++
++ /* unique file-system identifier */
++ __u32 fsuid;
++
++ /* On-disk format version. If it does not equal the disk_format
++ plugin version, some format updates (e.g. enlarging the plugin
++ set) may take place on mount. */
++ int version;
++
++ /* file-system wide flags.
See reiser4_fs_flag enum */ ++ unsigned long fs_flags; ++ ++ /* transaction manager */ ++ txn_mgr tmgr; ++ ++ /* ent thread */ ++ entd_context entd; ++ ++ /* fake inode used to bind formatted nodes */ ++ struct inode *fake; ++ /* inode used to bind bitmaps (and journal heads) */ ++ struct inode *bitmap; ++ /* inode used to bind copied on capture nodes */ ++ struct inode *cc; ++ ++ /* disk layout plugin */ ++ disk_format_plugin *df_plug; ++ ++ /* disk layout specific part of reiser4 super info data */ ++ union { ++ format40_super_info format40; ++ } u; ++ ++ /* value we return in st_blksize on stat(2) */ ++ unsigned long optimal_io_size; ++ ++ /* parameters for the flush algorithm */ ++ struct flush_params flush; ++ ++ /* pointers to jnodes for journal header and footer */ ++ jnode *journal_header; ++ jnode *journal_footer; ++ ++ journal_location jloc; ++ ++ /* head block number of last committed transaction */ ++ __u64 last_committed_tx; ++ ++ /* ++ * we remember last written location for using as a hint for new block ++ * allocation ++ */ ++ __u64 blocknr_hint_default; ++ ++ /* committed number of files (oid allocator state variable ) */ ++ __u64 nr_files_committed; ++ ++ struct formatted_ra_params ra_params; ++ ++ /* ++ * A mutex for serializing cut tree operation if out-of-free-space: ++ * the only one cut_tree thread is allowed to grab space from reserved ++ * area (it is 5% of disk space) ++ */ ++ struct mutex delete_mutex; ++ /* task owning ->delete_mutex */ ++ struct task_struct *delete_mutex_owner; ++ ++ /* Diskmap's blocknumber */ ++ __u64 diskmap_block; ++ ++ /* What to do in case of error */ ++ int onerror; ++ ++ /* operations for objects on this file system */ ++ struct object_ops ops; ++ ++ /* ++ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for ++ * more details ++ */ ++ struct d_cursor_info d_info; ++ ++#ifdef CONFIG_REISER4_BADBLOCKS ++ /* Alternative master superblock offset (in bytes) */ ++ unsigned long altsuper; ++#endif ++ struct repacker *repacker; ++ struct page *status_page; ++ struct bio *status_bio; ++ ++#if REISER4_DEBUG ++ /* ++ * minimum used blocks value (includes super blocks, bitmap blocks and ++ * other fs reserved areas), depends on fs format and fs size. ++ */ ++ __u64 min_blocks_used; ++ ++ /* ++ * when debugging is on, all jnodes (including znodes, bitmaps, etc.) ++ * are kept on a list anchored at sbinfo->all_jnodes. This list is ++ * protected by sbinfo->all_guard spin lock. This lock should be taken ++ * with _irq modifier, because it is also modified from interrupt ++ * contexts (by RCU). ++ */ ++ spinlock_t all_guard; ++ /* list of all jnodes */ ++ struct list_head all_jnodes; ++#endif ++ struct dentry *debugfs_root; ++}; ++ ++extern reiser4_super_info_data *get_super_private_nocheck(const struct ++ super_block * super); ++ ++/* Return reiser4-specific part of super block */ ++static inline reiser4_super_info_data *get_super_private(const struct ++ super_block * super) ++{ ++ assert("nikita-447", super != NULL); ++ ++ return (reiser4_super_info_data *) super->s_fs_info; ++} ++ ++/* get ent context for the @super */ ++static inline entd_context *get_entd_context(struct super_block *super) ++{ ++ return &get_super_private(super)->entd; ++} ++ ++/* "Current" super-block: main super block used during current system ++ call. Reference to this super block is stored in reiser4_context. 
*/ ++static inline struct super_block *reiser4_get_current_sb(void) ++{ ++ return get_current_context()->super; ++} ++ ++/* Reiser4-specific part of "current" super-block: main super block used ++ during current system call. Reference to this super block is stored in ++ reiser4_context. */ ++static inline reiser4_super_info_data *get_current_super_private(void) ++{ ++ return get_super_private(reiser4_get_current_sb()); ++} ++ ++static inline struct formatted_ra_params *get_current_super_ra_params(void) ++{ ++ return &(get_current_super_private()->ra_params); ++} ++ ++/* ++ * true, if file system on @super is read-only ++ */ ++static inline int rofs_super(struct super_block *super) ++{ ++ return super->s_flags & MS_RDONLY; ++} ++ ++/* ++ * true, if @tree represents read-only file system ++ */ ++static inline int rofs_tree(reiser4_tree * tree) ++{ ++ return rofs_super(tree->super); ++} ++ ++/* ++ * true, if file system where @inode lives on, is read-only ++ */ ++static inline int rofs_inode(struct inode *inode) ++{ ++ return rofs_super(inode->i_sb); ++} ++ ++/* ++ * true, if file system where @node lives on, is read-only ++ */ ++static inline int rofs_jnode(jnode * node) ++{ ++ return rofs_tree(jnode_get_tree(node)); ++} ++ ++extern __u64 reiser4_current_block_count(void); ++ ++extern void build_object_ops(struct super_block *super, struct object_ops *ops); ++ ++#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */ ++ ++static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo) ++{ ++ spin_lock(&(sbinfo->guard)); ++} ++ ++static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo) ++{ ++ assert_spin_locked(&(sbinfo->guard)); ++ spin_unlock(&(sbinfo->guard)); ++} ++ ++extern __u64 reiser4_flush_reserved(const struct super_block *); ++extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f); ++extern long reiser4_statfs_type(const struct super_block *super); ++extern __u64 reiser4_block_count(const struct super_block *super); ++extern void reiser4_set_block_count(const struct super_block *super, __u64 nr); ++extern __u64 reiser4_data_blocks(const struct super_block *super); ++extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr); ++extern __u64 reiser4_free_blocks(const struct super_block *super); ++extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr); ++extern __u32 reiser4_mkfs_id(const struct super_block *super); ++ ++extern __u64 reiser4_free_committed_blocks(const struct super_block *super); ++ ++extern __u64 reiser4_grabbed_blocks(const struct super_block *); ++extern __u64 reiser4_fake_allocated(const struct super_block *); ++extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *); ++extern __u64 reiser4_clustered_blocks(const struct super_block *); ++ ++extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid, ++ gid_t gid); ++ ++extern reiser4_space_allocator * ++reiser4_get_space_allocator(const struct super_block *super); ++extern reiser4_oid_allocator * ++reiser4_get_oid_allocator(const struct super_block *super); ++extern struct inode *reiser4_get_super_fake(const struct super_block *super); ++extern struct inode *reiser4_get_cc_fake(const struct super_block *super); ++extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super); ++extern reiser4_tree *reiser4_get_tree(const struct super_block *super); ++extern int is_reiser4_super(const struct super_block *super); ++ ++extern int 
reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
++extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
++ const reiser4_block_nr * blk);
++extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
++extern int reiser4_done_super(struct super_block *s);
++
++/* step of fill super */
++extern int reiser4_init_fs_info(struct super_block *);
++extern void reiser4_done_fs_info(struct super_block *);
++extern int reiser4_init_super_data(struct super_block *, char *opt_string);
++extern int reiser4_init_read_super(struct super_block *, int silent);
++extern int reiser4_init_root_inode(struct super_block *);
++extern reiser4_plugin *get_default_plugin(pset_member memb);
++
++/* Maximal possible object id. */
++#define ABSOLUTE_MAX_OID ((oid_t)~0)
++
++#define OIDS_RESERVED (1 << 16)
++int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
++oid_t oid_allocate(struct super_block *);
++int oid_release(struct super_block *, oid_t);
++oid_t oid_next(const struct super_block *);
++void oid_count_allocated(void);
++void oid_count_released(void);
++long oids_used(const struct super_block *);
++
++#if REISER4_DEBUG
++void print_fs_info(const char *prefix, const struct super_block *);
++#endif
++
++extern void destroy_reiser4_cache(struct kmem_cache **);
++
++extern struct super_operations reiser4_super_operations;
++extern struct export_operations reiser4_export_operations;
++extern struct dentry_operations reiser4_dentry_operations;
++
++/* __REISER4_SUPER_H__ */
++#endif
++
++/*
++ * Local variables:
++ * c-indentation-style: "K&R"
++ * mode-name: "LC"
++ * c-basic-offset: 8
++ * tab-width: 8
++ * fill-column: 120
++ * End:
++ */
+diff -urN linux-2.6.33.orig/fs/reiser4/super_ops.c linux-2.6.33/fs/reiser4/super_ops.c
+--- linux-2.6.33.orig/fs/reiser4/super_ops.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/super_ops.c 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,736 @@
++/* Copyright 2005 by Hans Reiser, licensing governed by
++ * reiser4/README */
++
++#include "inode.h"
++#include "page_cache.h"
++#include "ktxnmgrd.h"
++#include "flush.h"
++#include "safe_link.h"
++
++#include <linux/vfs.h>
++#include <linux/writeback.h>
++#include <linux/mount.h>
++#include <linux/seq_file.h>
++#include <linux/debugfs.h>
++
++/* slab cache for inodes */
++static struct kmem_cache *inode_cache;
++
++static struct dentry *reiser4_debugfs_root = NULL;
++
++/**
++ * init_once - constructor for reiser4 inodes
++ * @cache: cache @obj belongs to
++ * @obj: inode to be initialized
++ *
++ * Initialization function to be called when new page is allocated by reiser4
++ * inode cache. It is set on inode cache creation.
++ */
++static void init_once(void *obj)
++{
++ struct reiser4_inode_object *info;
++
++ info = obj;
++
++ /* initialize vfs inode */
++ inode_init_once(&info->vfs_inode);
++
++ /*
++ * initialize reiser4 specific part of inode.
++ * NOTE-NIKITA add here initializations for locks, list heads,
++ * etc. that will be added to our private inode part.
++ */
++ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
++ init_rwsem(&info->p.conv_sem);
++ /* init semaphore which is used during inode loading */
++ loading_init_once(&info->p);
++ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
++ GFP_ATOMIC);
++#if REISER4_DEBUG
++ info->p.nr_jnodes = 0;
++#endif
++}
++
++/**
++ * init_inodes - create inode cache
++ *
++ * Initializes slab cache of inodes.
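++ *
++ * (The cache holds reiser4_inode_object, which embeds the VFS inode;
++ * getting from the embedded member back to the container is the usual
++ * container_of step. Sketch with an invented wrapper type:
++ *
++ * struct wrapper { long priv; struct inode vfs_inode; };
++ *
++ * static struct wrapper *to_wrapper(struct inode *inode)
++ * {
++ * return container_of(inode, struct wrapper, vfs_inode);
++ * }
++ *
++ * Note that init_once() runs when a slab object is first created, not
++ * on every allocation, so only state that survives a free/alloc cycle
++ * belongs in it.)
++ *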
It is part of reiser4 module initialization ++ */ ++static int init_inodes(void) ++{ ++ inode_cache = kmem_cache_create("reiser4_inode", ++ sizeof(struct reiser4_inode_object), ++ 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, init_once); ++ if (inode_cache == NULL) ++ return RETERR(-ENOMEM); ++ return 0; ++} ++ ++/** ++ * done_inodes - delete inode cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++static void done_inodes(void) ++{ ++ destroy_reiser4_cache(&inode_cache); ++} ++ ++/** ++ * reiser4_alloc_inode - alloc_inode of super operations ++ * @super: super block new inode is allocated for ++ * ++ * Allocates new inode, initializes reiser4 specific part of it. ++ */ ++static struct inode *reiser4_alloc_inode(struct super_block *super) ++{ ++ struct reiser4_inode_object *obj; ++ ++ assert("nikita-1696", super != NULL); ++ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get()); ++ if (obj != NULL) { ++ reiser4_inode *info; ++ ++ info = &obj->p; ++ ++ info->pset = plugin_set_get_empty(); ++ info->hset = plugin_set_get_empty(); ++ info->extmask = 0; ++ info->locality_id = 0ull; ++ info->plugin_mask = 0; ++ info->heir_mask = 0; ++#if !REISER4_INO_IS_OID ++ info->oid_hi = 0; ++#endif ++ reiser4_seal_init(&info->sd_seal, NULL, NULL); ++ coord_init_invalid(&info->sd_coord, NULL); ++ info->flags = 0; ++ spin_lock_init(&info->guard); ++ /* this deals with info's loading semaphore */ ++ loading_alloc(info); ++ info->vroot = UBER_TREE_ADDR; ++ return &obj->vfs_inode; ++ } else ++ return NULL; ++} ++ ++/** ++ * reiser4_destroy_inode - destroy_inode of super operations ++ * @inode: inode being destroyed ++ * ++ * Puts reiser4 specific portion of inode, frees memory occupied by inode. ++ */ ++static void reiser4_destroy_inode(struct inode *inode) ++{ ++ reiser4_inode *info; ++ ++ info = reiser4_inode_data(inode); ++ ++ assert("vs-1220", inode_has_no_jnodes(info)); ++ ++ if (!is_bad_inode(inode) && is_inode_loaded(inode)) { ++ file_plugin *fplug = inode_file_plugin(inode); ++ if (fplug->destroy_inode != NULL) ++ fplug->destroy_inode(inode); ++ } ++ reiser4_dispose_cursors(inode); ++ if (info->pset) ++ plugin_set_put(info->pset); ++ if (info->hset) ++ plugin_set_put(info->hset); ++ ++ /* ++ * cannot add similar assertion about ->i_list as prune_icache return ++ * inode into slab with dangling ->list.{next,prev}. This is safe, ++ * because they are re-initialized in the new_inode(). ++ */ ++ assert("nikita-2895", list_empty(&inode->i_dentry)); ++ assert("nikita-2896", hlist_unhashed(&inode->i_hash)); ++ assert("nikita-2898", list_empty_careful(get_readdir_list(inode))); ++ ++ /* this deals with info's loading semaphore */ ++ loading_destroy(info); ++ ++ kmem_cache_free(inode_cache, ++ container_of(info, struct reiser4_inode_object, p)); ++} ++ ++/** ++ * reiser4_dirty_inode - dirty_inode of super operations ++ * @inode: inode being dirtied ++ * ++ * Updates stat data. 
++ */ ++static void reiser4_dirty_inode(struct inode *inode) ++{ ++ int result; ++ ++ if (!is_in_reiser4_context()) ++ return; ++ assert("", !IS_RDONLY(inode)); ++ assert("", (inode_file_plugin(inode)->estimate.update(inode) <= ++ get_current_context()->grabbed_blocks)); ++ ++ result = reiser4_update_sd(inode); ++ if (result) ++ warning("", "failed to dirty inode for %llu: %d", ++ get_inode_oid(inode), result); ++} ++ ++/** ++ * reiser4_delete_inode - delete_inode of super operations ++ * @inode: inode to delete ++ * ++ * Calls file plugin's delete_object method to delete object items from ++ * filesystem tree and calls clear_inode. ++ */ ++static void reiser4_delete_inode(struct inode *inode) ++{ ++ reiser4_context *ctx; ++ file_plugin *fplug; ++ ++ ctx = reiser4_init_context(inode->i_sb); ++ if (IS_ERR(ctx)) { ++ warning("vs-15", "failed to init context"); ++ return; ++ } ++ ++ if (is_inode_loaded(inode)) { ++ fplug = inode_file_plugin(inode); ++ if (fplug != NULL && fplug->delete_object != NULL) ++ fplug->delete_object(inode); ++ } ++ ++ truncate_inode_pages(&inode->i_data, 0); ++ inode->i_blocks = 0; ++ clear_inode(inode); ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_put_super - put_super of super operations ++ * @super: super block to free ++ * ++ * Stops daemons, release resources, umounts in short. ++ */ ++static void reiser4_put_super(struct super_block *super) ++{ ++ reiser4_super_info_data *sbinfo; ++ reiser4_context *ctx; ++ ++ sbinfo = get_super_private(super); ++ assert("vs-1699", sbinfo); ++ ++ debugfs_remove(sbinfo->tmgr.debugfs_atom_count); ++ debugfs_remove(sbinfo->tmgr.debugfs_id_count); ++ debugfs_remove(sbinfo->debugfs_root); ++ ++ ctx = reiser4_init_context(super); ++ if (IS_ERR(ctx)) { ++ warning("vs-17", "failed to init context"); ++ return; ++ } ++ ++ /* have disk format plugin to free its resources */ ++ if (get_super_private(super)->df_plug->release) ++ get_super_private(super)->df_plug->release(super); ++ ++ reiser4_done_formatted_fake(super); ++ ++ /* stop daemons: ktxnmgr and entd */ ++ reiser4_done_entd(super); ++ reiser4_done_ktxnmgrd(super); ++ reiser4_done_txnmgr(&sbinfo->tmgr); ++ ++ reiser4_done_fs_info(super); ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_write_super - write_super of super operations ++ * @super: super block to write ++ * ++ * Captures znode associated with super block, comit all transactions. ++ */ ++static void reiser4_write_super(struct super_block *super) ++{ ++ int ret; ++ reiser4_context *ctx; ++ ++ assert("vs-1700", !rofs_super(super)); ++ ++ ctx = reiser4_init_context(super); ++ if (IS_ERR(ctx)) { ++ warning("vs-16", "failed to init context"); ++ return; ++ } ++ ++ ret = reiser4_capture_super_block(super); ++ if (ret != 0) ++ warning("vs-1701", ++ "reiser4_capture_super_block failed in write_super: %d", ++ ret); ++ ret = txnmgr_force_commit_all(super, 0); ++ if (ret != 0) ++ warning("jmacd-77113", ++ "txn_force failed in write_super: %d", ret); ++ ++ super->s_dirt = 0; ++ ++ reiser4_exit_context(ctx); ++} ++ ++/** ++ * reiser4_statfs - statfs of super operations ++ * @super: super block of file system in queried ++ * @stafs: buffer to fill with statistics ++ * ++ * Returns information about filesystem. 
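The statfs implementation below samples block counters without a global spinlock, so every derived value is clamped: free is capped by total, and each subtraction is floored at zero so a racy read can never produce an underflowed count. A minimal userspace sketch of that arithmetic follows; the variable names mirror the function below but the program itself is invented.

/* Clamped accounting as in reiser4_statfs() below: counters are
 * sampled racily, so cap free by total and floor subtractions at 0. */
#include <stdint.h>
#include <stdio.h>

static uint64_t sub_floor0(uint64_t a, uint64_t b)
{
	return a > b ? a - b : 0;
}

int main(void)
{
	uint64_t total = 1000, reserved = 50, forroot = 20;
	uint64_t free = 1010;		/* racy sample: may exceed total */
	uint64_t f_blocks, f_bfree, f_bavail;

	if (free > total)
		free = total;		/* sanity clamp */
	f_blocks = total - reserved;
	f_bfree  = sub_floor0(free, reserved);
	f_bavail = sub_floor0(f_bfree, forroot);

	printf("blocks=%llu bfree=%llu bavail=%llu\n",
	       (unsigned long long)f_blocks,
	       (unsigned long long)f_bfree,
	       (unsigned long long)f_bavail);
	return 0;
}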
++ */ ++static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs) ++{ ++ sector_t total; ++ sector_t reserved; ++ sector_t free; ++ sector_t forroot; ++ sector_t deleted; ++ reiser4_context *ctx; ++ struct super_block *super = dentry->d_sb; ++ ++ assert("nikita-408", super != NULL); ++ assert("nikita-409", statfs != NULL); ++ ++ ctx = reiser4_init_context(super); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ statfs->f_type = reiser4_statfs_type(super); ++ statfs->f_bsize = super->s_blocksize; ++ ++ /* ++ * 5% of total block space is reserved. This is needed for flush and ++ * for truncates (so that we are able to perform truncate/unlink even ++ * on the otherwise completely full file system). If this reservation ++ * is hidden from statfs(2), users will mistakenly guess that they ++ * have enough free space to complete some operation, which is ++ * frustrating. ++ * ++ * Another possible solution is to subtract ->blocks_reserved from ++ * ->f_bfree, but changing available space seems less intrusive than ++ * letting user to see 5% of disk space to be used directly after ++ * mkfs. ++ */ ++ total = reiser4_block_count(super); ++ reserved = get_super_private(super)->blocks_reserved; ++ deleted = txnmgr_count_deleted_blocks(); ++ free = reiser4_free_blocks(super) + deleted; ++ forroot = reiser4_reserved_blocks(super, 0, 0); ++ ++ /* ++ * These counters may be in inconsistent state because we take the ++ * values without keeping any global spinlock. Here we do a sanity ++ * check that free block counter does not exceed the number of all ++ * blocks. ++ */ ++ if (free > total) ++ free = total; ++ statfs->f_blocks = total - reserved; ++ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */ ++ if (free > reserved) ++ free -= reserved; ++ else ++ free = 0; ++ statfs->f_bfree = free; ++ ++ if (free > forroot) ++ free -= forroot; ++ else ++ free = 0; ++ statfs->f_bavail = free; ++ ++ statfs->f_files = 0; ++ statfs->f_ffree = 0; ++ ++ /* maximal acceptable name length depends on directory plugin. */ ++ assert("nikita-3351", super->s_root->d_inode != NULL); ++ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode); ++ reiser4_exit_context(ctx); ++ return 0; ++} ++ ++/** ++ * reiser4_clear_inode - clear_inode of super operation ++ * @inode: inode about to destroy ++ * ++ * Does sanity checks: being destroyed should have all jnodes detached. ++ */ ++static void reiser4_clear_inode(struct inode *inode) ++{ ++#if REISER4_DEBUG ++ reiser4_inode *r4_inode; ++ ++ r4_inode = reiser4_inode_data(inode); ++ if (!inode_has_no_jnodes(r4_inode)) ++ warning("vs-1732", "reiser4 inode has %ld jnodes\n", ++ r4_inode->nr_jnodes); ++#endif ++} ++ ++/** ++ * reiser4_writeback_inodes - writeback_inodes of super operations ++ * @super: ++ * @wb: ++ * @wbc: ++ * ++ * This method is called by background and non-backgound writeback. Reiser4's ++ * implementation uses generic_writeback_sb_inodes to call reiser4_writepages ++ * for each of dirty inodes. reiser4_writepages handles pages dirtied via shared ++ * mapping - dirty pages get into atoms. Writeout is called to flush some atoms. 
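One detail worth flagging in the function below: it is entered with inode_lock held, drops it around the blocking context setup and flush work, and re-takes it before returning, so the caller's locking assumptions stay intact. A simplified pthread sketch of that drop/retake discipline follows; it is an analogy, not kernel code, and all names are invented.

/* Lock-juggling as in reiser4_writeback_inodes() below: called with
 * the lock held, dropped around blocking work, re-taken on return. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

static void blocking_step(void)
{
	puts("blocking work, lock dropped");
}

/* called with inode_lock held; returns with inode_lock held */
static void writeback(void)
{
	pthread_mutex_unlock(&inode_lock);
	blocking_step();
	pthread_mutex_lock(&inode_lock);
}

int main(void)
{
	pthread_mutex_lock(&inode_lock);
	writeback();
	pthread_mutex_unlock(&inode_lock);
	puts("done, lock invariant preserved");
	return 0;
}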
++ */ ++static int reiser4_writeback_inodes(struct super_block *super, ++ struct bdi_writeback *wb, ++ struct writeback_control *wbc) ++{ ++ int ret; ++ long to_write; ++ reiser4_context *ctx; ++ ++ if (wbc->for_kupdate) ++ /* reiser4 has its own means of periodical write-out */ ++ goto skip; ++ assert("vs-49", wbc->older_than_this == NULL); ++ ++ spin_unlock(&inode_lock); ++ ctx = reiser4_init_context(super); ++ if (IS_ERR(ctx)) { ++ warning("vs-13", "failed to init context"); ++ spin_lock(&inode_lock); ++ goto skip; ++ } ++ to_write = wbc->nr_to_write; ++ /* ++ * call reiser4_writepages for each of dirty inodes to turn ++ * dirty pages into transactions if they were not yet. ++ */ ++ spin_lock(&inode_lock); ++ ret = generic_writeback_sb_inodes(super, wb, wbc); ++ spin_unlock(&inode_lock); ++ ++ wbc->nr_to_write = to_write; ++ ++ /* flush goes here */ ++ reiser4_writeout(super, wbc); ++ ++ /* avoid recursive calls to ->writeback_inodes */ ++ context_set_commit_async(ctx); ++ reiser4_exit_context(ctx); ++ spin_lock(&inode_lock); ++ ++ return wbc->nr_to_write <= 0 ? 1 : ret; ++ skip: ++ writeback_skip_sb_inodes(super, wb); ++ return 0; ++} ++ ++/** ++ * reiser4_show_options - show_options of super operations ++ * @m: file where to write information ++ * @mnt: mount structure ++ * ++ * Makes reiser4 mount options visible in /proc/mounts. ++ */ ++static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt) ++{ ++ struct super_block *super; ++ reiser4_super_info_data *sbinfo; ++ ++ super = mnt->mnt_sb; ++ sbinfo = get_super_private(super); ++ ++ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size); ++ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age); ++ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size); ++ seq_printf(m, ",atom_max_flushers=0x%x", ++ sbinfo->tmgr.atom_max_flushers); ++ seq_printf(m, ",cbk_cache_slots=0x%x", ++ sbinfo->tree.cbk_cache.nr_slots); ++ ++ return 0; ++} ++ ++struct super_operations reiser4_super_operations = { ++ .alloc_inode = reiser4_alloc_inode, ++ .destroy_inode = reiser4_destroy_inode, ++ .dirty_inode = reiser4_dirty_inode, ++ .delete_inode = reiser4_delete_inode, ++ .put_super = reiser4_put_super, ++ .write_super = reiser4_write_super, ++ .statfs = reiser4_statfs, ++ .clear_inode = reiser4_clear_inode, ++ .writeback_inodes = reiser4_writeback_inodes, ++ .show_options = reiser4_show_options ++}; ++ ++/** ++ * fill_super - initialize super block on mount ++ * @super: super block to fill ++ * @data: reiser4 specific mount option ++ * @silent: ++ * ++ * This is to be called by reiser4_get_sb. Mounts filesystem. 
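The reiser4_super_operations table assigned above is the standard VFS dispatch pattern: generic code calls through a struct of function pointers, and a filesystem participates by filling in the slots it implements, checking optional ones for NULL. A minimal userspace sketch of that pattern follows; struct super_ops and all function names here are invented stand-ins, not the kernel API.

/* Operations-table dispatch as in reiser4_super_operations above. */
#include <stdio.h>

struct super;				/* opaque to the "VFS" side */

struct super_ops {
	void (*put_super)(struct super *);
	void (*write_super)(struct super *);	/* optional slot */
};

struct super {
	const struct super_ops *s_op;
};

static void my_put_super(struct super *s)
{
	puts("put_super called");
}

static const struct super_ops my_ops = {
	.put_super = my_put_super,
	/* .write_super left NULL: generic code checks before calling */
};

static void vfs_drop_super(struct super *s)
{
	if (s->s_op->write_super)	/* optional method */
		s->s_op->write_super(s);
	s->s_op->put_super(s);		/* mandatory method */
}

int main(void)
{
	struct super s = { &my_ops };

	vfs_drop_super(&s);
	return 0;
}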
++ */ ++static int fill_super(struct super_block *super, void *data, int silent) ++{ ++ reiser4_context ctx; ++ int result; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("zam-989", super != NULL); ++ ++ super->s_op = NULL; ++ init_stack_context(&ctx, super); ++ ++ /* allocate reiser4 specific super block */ ++ if ((result = reiser4_init_fs_info(super)) != 0) ++ goto failed_init_sinfo; ++ ++ sbinfo = get_super_private(super); ++ /* initialize various reiser4 parameters, parse mount options */ ++ if ((result = reiser4_init_super_data(super, data)) != 0) ++ goto failed_init_super_data; ++ ++ /* read reiser4 master super block, initialize disk format plugin */ ++ if ((result = reiser4_init_read_super(super, silent)) != 0) ++ goto failed_init_read_super; ++ ++ /* initialize transaction manager */ ++ reiser4_init_txnmgr(&sbinfo->tmgr); ++ ++ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */ ++ if ((result = reiser4_init_ktxnmgrd(super)) != 0) ++ goto failed_init_ktxnmgrd; ++ ++ /* initialize entd context and start kernel thread entd */ ++ if ((result = reiser4_init_entd(super)) != 0) ++ goto failed_init_entd; ++ ++ /* initialize address spaces for formatted nodes and bitmaps */ ++ if ((result = reiser4_init_formatted_fake(super)) != 0) ++ goto failed_init_formatted_fake; ++ ++ /* initialize disk format plugin */ ++ if ((result = get_super_private(super)->df_plug->init_format(super, ++ data)) != 0) ++ goto failed_init_disk_format; ++ ++ /* ++ * There are some 'committed' versions of reiser4 super block counters, ++ * which correspond to reiser4 on-disk state. These counters are ++ * initialized here ++ */ ++ sbinfo->blocks_free_committed = sbinfo->blocks_free; ++ sbinfo->nr_files_committed = oids_used(super); ++ ++ /* get inode of root directory */ ++ if ((result = reiser4_init_root_inode(super)) != 0) ++ goto failed_init_root_inode; ++ ++ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0) ++ goto failed_update_format_version; ++ ++ process_safelinks(super); ++ reiser4_exit_context(&ctx); ++ ++ sbinfo->debugfs_root = debugfs_create_dir(super->s_id, ++ reiser4_debugfs_root); ++ if (sbinfo->debugfs_root) { ++ sbinfo->tmgr.debugfs_atom_count = ++ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR, ++ sbinfo->debugfs_root, ++ &sbinfo->tmgr.atom_count); ++ sbinfo->tmgr.debugfs_id_count = ++ debugfs_create_u32("id_count", S_IFREG|S_IRUSR, ++ sbinfo->debugfs_root, ++ &sbinfo->tmgr.id_count); ++ } ++ return 0; ++ ++ failed_update_format_version: ++ failed_init_root_inode: ++ if (sbinfo->df_plug->release) ++ sbinfo->df_plug->release(super); ++ failed_init_disk_format: ++ reiser4_done_formatted_fake(super); ++ failed_init_formatted_fake: ++ reiser4_done_entd(super); ++ failed_init_entd: ++ reiser4_done_ktxnmgrd(super); ++ failed_init_ktxnmgrd: ++ reiser4_done_txnmgr(&sbinfo->tmgr); ++ failed_init_read_super: ++ failed_init_super_data: ++ reiser4_done_fs_info(super); ++ failed_init_sinfo: ++ reiser4_exit_context(&ctx); ++ return result; ++} ++ ++/** ++ * reiser4_get_sb - get_sb of file_system_type operations ++ * @fs_type: ++ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc ++ * @dev_name: block device file name ++ * @data: specific mount options ++ * ++ * Reiser4 mount entry. 
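fill_super() above and init_reiser4() below both follow the staged-init/goto-unwind idiom: each failure label tears down exactly the stages that already succeeded, in reverse order, so there is a single exit path per failure depth. A compact userspace sketch of the idiom follows; the three init/done pairs are invented for the demo.

/* Staged init with goto unwind, as in fill_super()/init_reiser4(). */
#include <stdio.h>

static int init_a(void)  { puts("init a"); return 0; }
static int init_b(void)  { puts("init b"); return 0; }
static int init_c(void)  { puts("init c"); return -1; }	/* fails */
static void done_b(void) { puts("undo b"); }
static void done_a(void) { puts("undo a"); }

static int setup(void)
{
	int result;

	if ((result = init_a()) != 0)
		goto failed_a;
	if ((result = init_b()) != 0)
		goto failed_b;
	if ((result = init_c()) != 0)
		goto failed_c;
	return 0;

 failed_c:
	done_b();	/* undo only what succeeded, newest first */
 failed_b:
	done_a();
 failed_a:
	return result;
}

int main(void)
{
	return setup() ? 1 : 0;
}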
++ */ ++static int reiser4_get_sb(struct file_system_type *fs_type, int flags, ++ const char *dev_name, void *data, struct vfsmount *mnt) ++{ ++ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); ++} ++ ++/* structure describing the reiser4 filesystem implementation */ ++static struct file_system_type reiser4_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "reiser4", ++ .fs_flags = FS_REQUIRES_DEV, ++ .get_sb = reiser4_get_sb, ++ .kill_sb = kill_block_super, ++ .next = NULL ++}; ++ ++void destroy_reiser4_cache(struct kmem_cache **cachep) ++{ ++ BUG_ON(*cachep == NULL); ++ kmem_cache_destroy(*cachep); ++ *cachep = NULL; ++} ++ ++/** ++ * init_reiser4 - reiser4 initialization entry point ++ * ++ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called ++ * on kernel initialization or during reiser4 module load. ++ */ ++static int __init init_reiser4(void) ++{ ++ int result; ++ ++ printk(KERN_INFO ++ "Loading Reiser4. " ++ "See www.namesys.com for a description of Reiser4.\n"); ++ ++ /* initialize slab cache of inodes */ ++ if ((result = init_inodes()) != 0) ++ goto failed_inode_cache; ++ ++ /* initialize cache of znodes */ ++ if ((result = init_znodes()) != 0) ++ goto failed_init_znodes; ++ ++ /* initialize all plugins */ ++ if ((result = init_plugins()) != 0) ++ goto failed_init_plugins; ++ ++ /* initialize cache of plugin_set-s and plugin_set's hash table */ ++ if ((result = init_plugin_set()) != 0) ++ goto failed_init_plugin_set; ++ ++ /* initialize caches of txn_atom-s and txn_handle-s */ ++ if ((result = init_txnmgr_static()) != 0) ++ goto failed_init_txnmgr_static; ++ ++ /* initialize cache of jnodes */ ++ if ((result = init_jnodes()) != 0) ++ goto failed_init_jnodes; ++ ++ /* initialize cache of flush queues */ ++ if ((result = reiser4_init_fqs()) != 0) ++ goto failed_init_fqs; ++ ++ /* initialize cache of structures attached to dentry->d_fsdata */ ++ if ((result = reiser4_init_dentry_fsdata()) != 0) ++ goto failed_init_dentry_fsdata; ++ ++ /* initialize cache of structures attached to file->private_data */ ++ if ((result = reiser4_init_file_fsdata()) != 0) ++ goto failed_init_file_fsdata; ++ ++ /* ++ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for ++ * more details ++ */ ++ if ((result = reiser4_init_d_cursor()) != 0) ++ goto failed_init_d_cursor; ++ ++ if ((result = register_filesystem(&reiser4_fs_type)) == 0) { ++ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL); ++ return 0; ++ } ++ ++ reiser4_done_d_cursor(); ++ failed_init_d_cursor: ++ reiser4_done_file_fsdata(); ++ failed_init_file_fsdata: ++ reiser4_done_dentry_fsdata(); ++ failed_init_dentry_fsdata: ++ reiser4_done_fqs(); ++ failed_init_fqs: ++ done_jnodes(); ++ failed_init_jnodes: ++ done_txnmgr_static(); ++ failed_init_txnmgr_static: ++ done_plugin_set(); ++ failed_init_plugin_set: ++ failed_init_plugins: ++ done_znodes(); ++ failed_init_znodes: ++ done_inodes(); ++ failed_inode_cache: ++ return result; ++} ++ ++/** ++ * done_reiser4 - reiser4 exit entry point ++ * ++ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown ++ * or at module unload. 
++ */ ++static void __exit done_reiser4(void) ++{ ++ int result; ++ ++ debugfs_remove(reiser4_debugfs_root); ++ result = unregister_filesystem(&reiser4_fs_type); ++ BUG_ON(result != 0); ++ reiser4_done_d_cursor(); ++ reiser4_done_file_fsdata(); ++ reiser4_done_dentry_fsdata(); ++ reiser4_done_fqs(); ++ done_jnodes(); ++ done_txnmgr_static(); ++ done_plugin_set(); ++ done_znodes(); ++ destroy_reiser4_cache(&inode_cache); ++} ++ ++module_init(init_reiser4); ++module_exit(done_reiser4); ++ ++MODULE_DESCRIPTION("Reiser4 filesystem"); ++MODULE_AUTHOR("Hans Reiser Reiser@Namesys.COM"); ++ ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/tap.c linux-2.6.33/fs/reiser4/tap.c +--- linux-2.6.33.orig/fs/reiser4/tap.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tap.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,376 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ Tree Access Pointer (tap). ++ ++ tap is data structure combining coord and lock handle (mostly). It is ++ useful when one has to scan tree nodes (for example, in readdir, or flush), ++ for tap functions allow to move tap in either direction transparently ++ crossing unit/item/node borders. ++ ++ Tap doesn't provide automatic synchronization of its fields as it is ++ supposed to be per-thread object. ++*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "coord.h" ++#include "tree.h" ++#include "context.h" ++#include "tap.h" ++#include "znode.h" ++#include "tree_walk.h" ++ ++#if REISER4_DEBUG ++static int tap_invariant(const tap_t *tap); ++static void tap_check(const tap_t *tap); ++#else ++#define tap_check(tap) noop ++#endif ++ ++/** load node tap is pointing to, if not loaded already */ ++int reiser4_tap_load(tap_t *tap) ++{ ++ tap_check(tap); ++ if (tap->loaded == 0) { ++ int result; ++ ++ result = zload_ra(tap->coord->node, &tap->ra_info); ++ if (result != 0) ++ return result; ++ coord_clear_iplug(tap->coord); ++ } ++ ++tap->loaded; ++ tap_check(tap); ++ return 0; ++} ++ ++/** release node tap is pointing to. Dual to tap_load() */ ++void reiser4_tap_relse(tap_t *tap) ++{ ++ tap_check(tap); ++ if (tap->loaded > 0) { ++ --tap->loaded; ++ if (tap->loaded == 0) ++ zrelse(tap->coord->node); ++ } ++ tap_check(tap); ++} ++ ++/** ++ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with ++ * @mode ++ */ ++void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh, ++ znode_lock_mode mode) ++{ ++ tap->coord = coord; ++ tap->lh = lh; ++ tap->mode = mode; ++ tap->loaded = 0; ++ INIT_LIST_HEAD(&tap->linkage); ++ reiser4_init_ra_info(&tap->ra_info); ++} ++ ++/** add @tap to the per-thread list of all taps */ ++void reiser4_tap_monitor(tap_t *tap) ++{ ++ assert("nikita-2623", tap != NULL); ++ tap_check(tap); ++ list_add(&tap->linkage, reiser4_taps_list()); ++ tap_check(tap); ++} ++ ++/* duplicate @src into @dst. Copy lock handle. @dst is not initially ++ * loaded. 
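reiser4_tap_load() and reiser4_tap_relse() above maintain a nesting counter rather than a boolean: the node is actually pinned only on the 0 to 1 transition and released only on 1 to 0, so load/relse pairs may nest freely. A minimal userspace sketch follows, with pin_node()/unpin_node() as invented stand-ins for zload()/zrelse().

/* Nesting load counter, as in tap->loaded above. */
#include <stdio.h>

struct tap {
	int loaded;
};

static int pin_node(void)
{
	puts("node pinned");
	return 0;
}

static void unpin_node(void)
{
	puts("node released");
}

static int tap_load(struct tap *tap)
{
	if (tap->loaded == 0) {
		int result = pin_node();	/* only on 0 -> 1 */

		if (result != 0)
			return result;
	}
	++tap->loaded;
	return 0;
}

static void tap_relse(struct tap *tap)
{
	if (tap->loaded > 0 && --tap->loaded == 0)
		unpin_node();			/* only on 1 -> 0 */
}

int main(void)
{
	struct tap tap = { 0 };

	tap_load(&tap);		/* pins */
	tap_load(&tap);		/* nested: no second pin */
	tap_relse(&tap);	/* still held */
	tap_relse(&tap);	/* releases */
	return 0;
}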
*/ ++void reiser4_tap_copy(tap_t *dst, tap_t *src) ++{ ++ assert("nikita-3193", src != NULL); ++ assert("nikita-3194", dst != NULL); ++ ++ *dst->coord = *src->coord; ++ if (src->lh->node) ++ copy_lh(dst->lh, src->lh); ++ dst->mode = src->mode; ++ dst->loaded = 0; ++ INIT_LIST_HEAD(&dst->linkage); ++ dst->ra_info = src->ra_info; ++} ++ ++/** finish with @tap */ ++void reiser4_tap_done(tap_t *tap) ++{ ++ assert("nikita-2565", tap != NULL); ++ tap_check(tap); ++ if (tap->loaded > 0) ++ zrelse(tap->coord->node); ++ done_lh(tap->lh); ++ tap->loaded = 0; ++ list_del_init(&tap->linkage); ++ tap->coord->node = NULL; ++} ++ ++/** ++ * move @tap to the new node, locked with @target. Load @target, if @tap was ++ * already loaded. ++ */ ++int reiser4_tap_move(tap_t *tap, lock_handle * target) ++{ ++ int result = 0; ++ ++ assert("nikita-2567", tap != NULL); ++ assert("nikita-2568", target != NULL); ++ assert("nikita-2570", target->node != NULL); ++ assert("nikita-2569", tap->coord->node == tap->lh->node); ++ ++ tap_check(tap); ++ if (tap->loaded > 0) ++ result = zload_ra(target->node, &tap->ra_info); ++ ++ if (result == 0) { ++ if (tap->loaded > 0) ++ zrelse(tap->coord->node); ++ done_lh(tap->lh); ++ copy_lh(tap->lh, target); ++ tap->coord->node = target->node; ++ coord_clear_iplug(tap->coord); ++ } ++ tap_check(tap); ++ return result; ++} ++ ++/** ++ * move @tap to @target. Acquire lock on @target, if @tap was already ++ * loaded. ++ */ ++static int tap_to(tap_t *tap, znode * target) ++{ ++ int result; ++ ++ assert("nikita-2624", tap != NULL); ++ assert("nikita-2625", target != NULL); ++ ++ tap_check(tap); ++ result = 0; ++ if (tap->coord->node != target) { ++ lock_handle here; ++ ++ init_lh(&here); ++ result = longterm_lock_znode(&here, target, ++ tap->mode, ZNODE_LOCK_HIPRI); ++ if (result == 0) { ++ result = reiser4_tap_move(tap, &here); ++ done_lh(&here); ++ } ++ } ++ tap_check(tap); ++ return result; ++} ++ ++/** ++ * move @tap to given @target, loading and locking @target->node if ++ * necessary ++ */ ++int tap_to_coord(tap_t *tap, coord_t *target) ++{ ++ int result; ++ ++ tap_check(tap); ++ result = tap_to(tap, target->node); ++ if (result == 0) ++ coord_dup(tap->coord, target); ++ tap_check(tap); ++ return result; ++} ++ ++/** return list of all taps */ ++struct list_head *reiser4_taps_list(void) ++{ ++ return &get_current_context()->taps; ++} ++ ++/** helper function for go_{next,prev}_{item,unit,node}() */ ++int go_dir_el(tap_t *tap, sideof dir, int units_p) ++{ ++ coord_t dup; ++ coord_t *coord; ++ int result; ++ ++ int (*coord_dir) (coord_t *); ++ int (*get_dir_neighbor) (lock_handle *, znode *, int, int); ++ void (*coord_init) (coord_t *, const znode *); ++ ON_DEBUG(int (*coord_check) (const coord_t *)); ++ ++ assert("nikita-2556", tap != NULL); ++ assert("nikita-2557", tap->coord != NULL); ++ assert("nikita-2558", tap->lh != NULL); ++ assert("nikita-2559", tap->coord->node != NULL); ++ ++ tap_check(tap); ++ if (dir == LEFT_SIDE) { ++ coord_dir = units_p ? coord_prev_unit : coord_prev_item; ++ get_dir_neighbor = reiser4_get_left_neighbor; ++ coord_init = coord_init_last_unit; ++ } else { ++ coord_dir = units_p ? coord_next_unit : coord_next_item; ++ get_dir_neighbor = reiser4_get_right_neighbor; ++ coord_init = coord_init_first_unit; ++ } ++ ON_DEBUG(coord_check = ++ units_p ? 
coord_is_existing_unit : coord_is_existing_item); ++ assert("nikita-2560", coord_check(tap->coord)); ++ ++ coord = tap->coord; ++ coord_dup(&dup, coord); ++ if (coord_dir(&dup) != 0) { ++ do { ++ /* move to the left neighboring node */ ++ lock_handle dup; ++ ++ init_lh(&dup); ++ result = ++ get_dir_neighbor(&dup, coord->node, (int)tap->mode, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result == 0) { ++ result = reiser4_tap_move(tap, &dup); ++ if (result == 0) ++ coord_init(tap->coord, dup.node); ++ done_lh(&dup); ++ } ++ /* skip empty nodes */ ++ } while ((result == 0) && node_is_empty(coord->node)); ++ } else { ++ result = 0; ++ coord_dup(coord, &dup); ++ } ++ assert("nikita-2564", ergo(!result, coord_check(tap->coord))); ++ tap_check(tap); ++ return result; ++} ++ ++/** ++ * move @tap to the next unit, transparently crossing item and node ++ * boundaries ++ */ ++int go_next_unit(tap_t *tap) ++{ ++ return go_dir_el(tap, RIGHT_SIDE, 1); ++} ++ ++/** ++ * move @tap to the previous unit, transparently crossing item and node ++ * boundaries ++ */ ++int go_prev_unit(tap_t *tap) ++{ ++ return go_dir_el(tap, LEFT_SIDE, 1); ++} ++ ++/** ++ * @shift times apply @actor to the @tap. This is used to move @tap by ++ * @shift units (or items, or nodes) in either direction. ++ */ ++static int rewind_to(tap_t *tap, go_actor_t actor, int shift) ++{ ++ int result; ++ ++ assert("nikita-2555", shift >= 0); ++ assert("nikita-2562", tap->coord->node == tap->lh->node); ++ ++ tap_check(tap); ++ result = reiser4_tap_load(tap); ++ if (result != 0) ++ return result; ++ ++ for (; shift > 0; --shift) { ++ result = actor(tap); ++ assert("nikita-2563", tap->coord->node == tap->lh->node); ++ if (result != 0) ++ break; ++ } ++ reiser4_tap_relse(tap); ++ tap_check(tap); ++ return result; ++} ++ ++/** move @tap @shift units rightward */ ++int rewind_right(tap_t *tap, int shift) ++{ ++ return rewind_to(tap, go_next_unit, shift); ++} ++ ++/** move @tap @shift units leftward */ ++int rewind_left(tap_t *tap, int shift) ++{ ++ return rewind_to(tap, go_prev_unit, shift); ++} ++ ++#if REISER4_DEBUG ++/** debugging function: print @tap content in human readable form */ ++static void print_tap(const char *prefix, const tap_t *tap) ++{ ++ if (tap == NULL) { ++ printk("%s: null tap\n", prefix); ++ return; ++ } ++ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix, ++ tap->loaded, (&tap->linkage == tap->linkage.next && ++ &tap->linkage == tap->linkage.prev), ++ tap->lh->node, ++ lock_mode_name(tap->mode)); ++ print_coord("\tcoord", tap->coord, 0); ++} ++ ++/** check [tap-sane] invariant */ ++static int tap_invariant(const tap_t *tap) ++{ ++ /* [tap-sane] invariant */ ++ ++ if (tap == NULL) ++ return 1; ++ /* tap->mode is one of ++ * ++ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and ++ */ ++ if (tap->mode != ZNODE_NO_LOCK && ++ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK) ++ return 2; ++ /* tap->coord != NULL, and */ ++ if (tap->coord == NULL) ++ return 3; ++ /* tap->lh != NULL, and */ ++ if (tap->lh == NULL) ++ return 4; ++ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */ ++ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node))) ++ return 5; ++ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */ ++ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node) ++ return 6; ++ return 0; ++} ++ ++/** debugging function: check internal @tap consistency */ ++static void tap_check(const tap_t *tap) ++{ ++ int result; ++ ++ result = tap_invariant(tap); ++ if (result != 
0) { ++ print_tap("broken", tap); ++ reiser4_panic("nikita-2831", "tap broken: %i\n", result); ++ } ++} ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/tap.h linux-2.6.33/fs/reiser4/tap.h +--- linux-2.6.33.orig/fs/reiser4/tap.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tap.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,70 @@ ++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* Tree Access Pointers. See tap.c for more details. */ ++ ++#if !defined(__REISER4_TAP_H__) ++#define __REISER4_TAP_H__ ++ ++#include "forward.h" ++#include "readahead.h" ++ ++/** ++ tree_access_pointer aka tap. Data structure combining coord_t and lock ++ handle. ++ Invariants involving this data-type, see doc/lock-ordering for details: ++ ++ [tap-sane] ++ */ ++struct tree_access_pointer { ++ /* coord tap is at */ ++ coord_t *coord; ++ /* lock handle on ->coord->node */ ++ lock_handle *lh; ++ /* mode of lock acquired by this tap */ ++ znode_lock_mode mode; ++ /* incremented by reiser4_tap_load(). ++ Decremented by reiser4_tap_relse(). */ ++ int loaded; ++ /* list of taps */ ++ struct list_head linkage; ++ /* read-ahead hint */ ++ ra_info_t ra_info; ++}; ++ ++typedef int (*go_actor_t) (tap_t *tap); ++ ++extern int reiser4_tap_load(tap_t *tap); ++extern void reiser4_tap_relse(tap_t *tap); ++extern void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh, ++ znode_lock_mode mode); ++extern void reiser4_tap_monitor(tap_t *tap); ++extern void reiser4_tap_copy(tap_t *dst, tap_t *src); ++extern void reiser4_tap_done(tap_t *tap); ++extern int reiser4_tap_move(tap_t *tap, lock_handle * target); ++extern int tap_to_coord(tap_t *tap, coord_t *target); ++ ++extern int go_dir_el(tap_t *tap, sideof dir, int units_p); ++extern int go_next_unit(tap_t *tap); ++extern int go_prev_unit(tap_t *tap); ++extern int rewind_right(tap_t *tap, int shift); ++extern int rewind_left(tap_t *tap, int shift); ++ ++extern struct list_head *reiser4_taps_list(void); ++ ++#define for_all_taps(tap) \ ++ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \ ++ reiser4_taps_list() != &tap->linkage; \ ++ tap = list_entry(tap->linkage.next, tap_t, linkage)) ++ ++/* __REISER4_TAP_H__ */ ++#endif ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/tree.c linux-2.6.33/fs/reiser4/tree.c +--- linux-2.6.33.orig/fs/reiser4/tree.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tree.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1878 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * KEYS IN A TREE. ++ * ++ * The tree consists of nodes located on the disk. Node in the tree is either ++ * formatted or unformatted. Formatted node is one that has structure ++ * understood by the tree balancing and traversal code. Formatted nodes are ++ * further classified into leaf and internal nodes. Latter distinctions is ++ * (almost) of only historical importance: general structure of leaves and ++ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data ++ * that are part of bodies of ordinary files and attributes. 
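Since items within a node are ordered by their smallest key, the per-node step of a lookup reduces to a binary search for the last key that is not greater than the target: the search bias discussed further down in this header comment. A self-contained userspace sketch of that search follows; the key array and function name are invented.

/* "Largest key not greater than" lookup over sorted per-item keys. */
#include <stdio.h>

/* returns index of last element <= key, or -1 if all are greater */
static int lookup_not_more_than(const unsigned long *keys, int nr,
				unsigned long key)
{
	int lo = 0, hi = nr;		/* half-open [lo, hi) */

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] <= key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo - 1;
}

int main(void)
{
	unsigned long item_keys[] = { 10, 20, 40, 80 };

	printf("%d\n", lookup_not_more_than(item_keys, 4, 40));	/* 2 */
	printf("%d\n", lookup_not_more_than(item_keys, 4, 35));	/* 1 */
	printf("%d\n", lookup_not_more_than(item_keys, 4, 5));		/* -1 */
	return 0;
}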
++ * ++ * Each node in the tree spawns some interval in the key space. Key ranges for ++ * all nodes in the tree are disjoint. Actually, this only holds in some weak ++ * sense, because of the non-unique keys: intersection of key ranges for ++ * different nodes is either empty, or consists of exactly one key. ++ * ++ * Formatted node consists of a sequence of items. Each item spawns some ++ * interval in key space. Key ranges for all items in a tree are disjoint, ++ * modulo non-unique keys again. Items within nodes are ordered in the key ++ * order of the smallest key in a item. ++ * ++ * Particular type of item can be further split into units. Unit is piece of ++ * item that can be cut from item and moved into another item of the same ++ * time. Units are used by balancing code to repack data during balancing. ++ * ++ * Unit can be further split into smaller entities (for example, extent unit ++ * represents several pages, and it is natural for extent code to operate on ++ * particular pages and even bytes within one unit), but this is of no ++ * relevance to the generic balancing and lookup code. ++ * ++ * Although item is said to "spawn" range or interval of keys, it is not ++ * necessary that item contains piece of data addressable by each and every ++ * key in this range. For example, compound directory item, consisting of ++ * units corresponding to directory entries and keyed by hashes of file names, ++ * looks more as having "discrete spectrum": only some disjoint keys inside ++ * range occupied by this item really address data. ++ * ++ * No than less, each item always has well-defined least (minimal) key, that ++ * is recorded in item header, stored in the node this item is in. Also, item ++ * plugin can optionally define method ->max_key_inside() returning maximal ++ * key that can _possibly_ be located within this item. This method is used ++ * (mainly) to determine when given piece of data should be merged into ++ * existing item, in stead of creating new one. Because of this, even though ++ * ->max_key_inside() can be larger that any key actually located in the item, ++ * intervals ++ * ++ * [ reiser4_min_key( item ), ->max_key_inside( item ) ] ++ * ++ * are still disjoint for all items within the _same_ node. ++ * ++ * In memory node is represented by znode. It plays several roles: ++ * ++ * . something locks are taken on ++ * ++ * . something tracked by transaction manager (this is going to change) ++ * ++ * . something used to access node data ++ * ++ * . something used to maintain tree structure in memory: sibling and ++ * parental linkage. ++ * ++ * . something used to organize nodes into "slums" ++ * ++ * More on znodes see in znode.[ch] ++ * ++ * DELIMITING KEYS ++ * ++ * To simplify balancing, allow some flexibility in locking and speed up ++ * important coord cache optimization, we keep delimiting keys of nodes in ++ * memory. Depending on disk format (implemented by appropriate node plugin) ++ * node on disk can record both left and right delimiting key, only one of ++ * them, or none. Still, our balancing and tree traversal code keep both ++ * delimiting keys for a node that is in memory stored in the znode. When ++ * node is first brought into memory during tree traversal, its left ++ * delimiting key is taken from its parent, and its right delimiting key is ++ * either next key in its parent, or is right delimiting key of parent if ++ * node is the rightmost child of parent. 
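The inheritance rule just described is mechanical enough to sketch: a child's left delimiting key is the parent key at the child's position, and its right delimiting key is the next key in the parent, or the parent's own right delimiting key when the child is rightmost. The userspace demo below encodes that rule under invented structures; it is an illustration, not the znode code.

/* Deriving a child's delimiting keys from its parent, per the rule
 * above. struct parent and child_dks() are invented for the demo. */
#include <stdio.h>

struct parent {
	unsigned long keys[4];	/* keys of the child pointers */
	int nr_children;
	unsigned long right_dk;	/* parent's own right delimiting key */
};

static void child_dks(const struct parent *p, int pos,
		      unsigned long *ld, unsigned long *rd)
{
	*ld = p->keys[pos];
	*rd = (pos + 1 < p->nr_children) ? p->keys[pos + 1] : p->right_dk;
}

int main(void)
{
	struct parent p = { { 10, 20, 40, 80 }, 4, 100 };
	unsigned long ld, rd;

	child_dks(&p, 1, &ld, &rd);
	printf("child 1: [%lu, %lu)\n", ld, rd);	/* [20, 40) */
	child_dks(&p, 3, &ld, &rd);			/* rightmost child */
	printf("child 3: [%lu, %lu)\n", ld, rd);	/* [80, 100) */
	return 0;
}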
++ * ++ * Physical consistency of delimiting key is protected by special dk ++ * read-write lock. That is, delimiting keys can only be inspected or ++ * modified under this lock. But dk lock is only sufficient for fast ++ * "pessimistic" check, because to simplify code and to decrease lock ++ * contention, balancing (carry) only updates delimiting keys right before ++ * unlocking all locked nodes on the given tree level. For example, ++ * coord-by-key cache scans LRU list of recently accessed znodes. For each ++ * node it first does fast check under dk spin lock. If key looked for is ++ * not between delimiting keys for this node, next node is inspected and so ++ * on. If key is inside of the key range, long term lock is taken on node ++ * and key range is rechecked. ++ * ++ * COORDINATES ++ * ++ * To find something in the tree, you supply a key, and the key is resolved ++ * by coord_by_key() into a coord (coordinate) that is valid as long as the ++ * node the coord points to remains locked. As mentioned above trees ++ * consist of nodes that consist of items that consist of units. A unit is ++ * the smallest and indivisible piece of tree as far as balancing and tree ++ * search are concerned. Each node, item, and unit can be addressed by ++ * giving its level in the tree and the key occupied by this entity. A node ++ * knows what the key ranges are of the items within it, and how to find its ++ * items and invoke their item handlers, but it does not know how to access ++ * individual units within its items except through the item handlers. ++ * coord is a structure containing a pointer to the node, the ordinal number ++ * of the item within this node (a sort of item offset), and the ordinal ++ * number of the unit within this item. ++ * ++ * TREE LOOKUP ++ * ++ * There are two types of access to the tree: lookup and modification. ++ * ++ * Lookup is a search for the key in the tree. Search can look for either ++ * exactly the key given to it, or for the largest key that is not greater ++ * than the key given to it. This distinction is determined by "bias" ++ * parameter of search routine (coord_by_key()). coord_by_key() either ++ * returns error (key is not in the tree, or some kind of external error ++ * occurred), or successfully resolves key into coord. ++ * ++ * This resolution is done by traversing tree top-to-bottom from root level ++ * to the desired level. On levels above twig level (level one above the ++ * leaf level) nodes consist exclusively of internal items. Internal item is ++ * nothing more than pointer to the tree node on the child level. On twig ++ * level nodes consist of internal items intermixed with extent ++ * items. Internal items form normal search tree structure used by traversal ++ * to descent through the tree. ++ * ++ * TREE LOOKUP OPTIMIZATIONS ++ * ++ * Tree lookup described above is expensive even if all nodes traversed are ++ * already in the memory: for each node binary search within it has to be ++ * performed and binary searches are CPU consuming and tend to destroy CPU ++ * caches. ++ * ++ * Several optimizations are used to work around this: ++ * ++ * . cbk_cache (look-aside cache for tree traversals, see search.c for ++ * details) ++ * ++ * . seals (see seal.[ch]) ++ * ++ * . 
vroot (see search.c) ++ * ++ * General search-by-key is layered thusly: ++ * ++ * [check seal, if any] --ok--> done ++ * | ++ * failed ++ * | ++ * V ++ * [vroot defined] --no--> node = tree_root ++ * | | ++ * yes | ++ * | | ++ * V | ++ * node = vroot | ++ * | | ++ * | | ++ * | | ++ * V V ++ * [check cbk_cache for key] --ok--> done ++ * | ++ * failed ++ * | ++ * V ++ * [start tree traversal from node] ++ * ++ */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/item/static_stat.h" ++#include "plugin/item/item.h" ++#include "plugin/node/node.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "carry.h" ++#include "carry_ops.h" ++#include "tap.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "page_cache.h" ++#include "super.h" ++#include "reiser4.h" ++#include "inode.h" ++ ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/spinlock.h> ++ ++/* Disk address (block number) never ever used for any real tree node. This is ++ used as block number of "uber" znode. ++ ++ Invalid block addresses are 0 by tradition. ++ ++*/ ++const reiser4_block_nr UBER_TREE_ADDR = 0ull; ++ ++#define CUT_TREE_MIN_ITERATIONS 64 ++ ++static int find_child_by_addr(znode * parent, znode * child, coord_t *result); ++ ++/* return node plugin of coord->node */ ++node_plugin *node_plugin_by_coord(const coord_t *coord) ++{ ++ assert("vs-1", coord != NULL); ++ assert("vs-2", coord->node != NULL); ++ ++ return coord->node->nplug; ++} ++ ++/* insert item into tree. Fields of @coord are updated so that they can be ++ * used by consequent insert operation. */ ++insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item ++ * into */ , ++ const reiser4_key * key /* key of new item */ , ++ reiser4_item_data * data /* parameters for item ++ * creation */ , ++ coord_t *coord /* resulting insertion coord */ , ++ lock_handle * lh /* resulting lock ++ * handle */ , ++ tree_level stop_level /* level where to insert */ , ++ __u32 flags/* insertion flags */) ++{ ++ int result; ++ ++ assert("nikita-358", tree != NULL); ++ assert("nikita-360", coord != NULL); ++ ++ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK, ++ FIND_EXACT, stop_level, stop_level, ++ flags | CBK_FOR_INSERT, NULL/*ra_info */); ++ switch (result) { ++ default: ++ break; ++ case CBK_COORD_FOUND: ++ result = IBK_ALREADY_EXISTS; ++ break; ++ case CBK_COORD_NOTFOUND: ++ assert("nikita-2017", coord->node != NULL); ++ result = insert_by_coord(coord, data, key, lh, 0/*flags */); ++ break; ++ } ++ return result; ++} ++ ++/* insert item by calling carry. 
Helper function called if short-cut ++ insertion failed */ ++static insert_result insert_with_carry_by_coord(coord_t *coord, ++ /* coord where to insert */ ++ lock_handle * lh, ++ /* lock handle of insertion node */ ++ reiser4_item_data * data, ++ /* parameters of new item */ ++ const reiser4_key * key, ++ /* key of new item */ ++ carry_opcode cop, ++ /* carry operation to perform */ ++ cop_insert_flag flags ++ /* carry flags */ ) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_insert_data *cdata; ++ carry_op *op; ++ ++ assert("umka-314", coord != NULL); ++ ++ /* allocate carry_pool and 3 carry_level-s */ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*cdata)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = reiser4_post_carry(lowest_level, cop, coord->node, 0); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ cdata = (carry_insert_data *) (lowest_level + 3); ++ cdata->coord = coord; ++ cdata->data = data; ++ cdata->key = key; ++ op->u.insert.d = cdata; ++ if (flags == 0) ++ flags = znode_get_tree(coord->node)->carry.insert_flags; ++ op->u.insert.flags = flags; ++ op->u.insert.type = COPT_ITEM_DATA; ++ op->u.insert.child = NULL; ++ if (lh != NULL) { ++ assert("nikita-3245", lh->node == coord->node); ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ } ++ ++ result = reiser4_carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* form carry queue to perform paste of @data with @key at @coord, and launch ++ its execution by calling carry(). ++ ++ Instruct carry to update @lh it after balancing insertion coord moves into ++ different block. ++ ++*/ ++static int paste_with_carry(coord_t *coord, /* coord of paste */ ++ lock_handle * lh, /* lock handle of node ++ * where item is ++ * pasted */ ++ reiser4_item_data * data, /* parameters of new ++ * item */ ++ const reiser4_key * key, /* key of new item */ ++ unsigned flags/* paste flags */) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_insert_data *cdata; ++ carry_op *op; ++ ++ assert("umka-315", coord != NULL); ++ assert("umka-316", key != NULL); ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*cdata)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ cdata = (carry_insert_data *) (lowest_level + 3); ++ cdata->coord = coord; ++ cdata->data = data; ++ cdata->key = key; ++ op->u.paste.d = cdata; ++ if (flags == 0) ++ flags = znode_get_tree(coord->node)->carry.paste_flags; ++ op->u.paste.flags = flags; ++ op->u.paste.type = COPT_ITEM_DATA; ++ if (lh != NULL) { ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ } ++ ++ result = reiser4_carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* insert item at the given coord. ++ ++ First try to skip carry by directly calling ->create_item() method of node ++ plugin. 
If this is impossible (there is not enough free space in the node, ++ or leftmost item in the node is created), call insert_with_carry_by_coord() ++ that will do full carry(). ++ ++*/ ++insert_result insert_by_coord(coord_t *coord /* coord where to ++ * insert. coord->node has ++ * to be write locked by ++ * caller */ , ++ reiser4_item_data * data /* data to be ++ * inserted */ , ++ const reiser4_key * key /* key of new item */ , ++ lock_handle * lh /* lock handle of write ++ * lock on node */ , ++ __u32 flags/* insertion flags */) ++{ ++ unsigned item_size; ++ int result; ++ znode *node; ++ ++ assert("vs-247", coord != NULL); ++ assert("vs-248", data != NULL); ++ assert("vs-249", data->length >= 0); ++ assert("nikita-1191", znode_is_write_locked(coord->node)); ++ ++ node = coord->node; ++ coord_clear_iplug(coord); ++ result = zload(node); ++ if (result != 0) ++ return result; ++ ++ item_size = space_needed(node, NULL, data, 1); ++ if (item_size > znode_free_space(node) && ++ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) ++ && (flags & COPI_DONT_ALLOCATE)) { ++ /* we are forced to use free space of coord->node and new item ++ does not fit into it. ++ ++ Currently we get here only when we allocate and copy units ++ of extent item from a node to its left neighbor during ++ "squalloc"-ing. If @node (this is left neighbor) does not ++ have enough free space - we do not want to attempt any ++ shifting and allocations because we are in squeezing and ++ everything to the left of @node is tightly packed. ++ */ ++ result = -E_NODE_FULL; ++ } else if ((item_size <= znode_free_space(node)) && ++ !coord_is_before_leftmost(coord) && ++ (node_plugin_by_node(node)->fast_insert != NULL) ++ && node_plugin_by_node(node)->fast_insert(coord)) { ++ /* shortcut insertion without carry() overhead. ++ ++ Only possible if: ++ ++ - there is enough free space ++ ++ - insertion is not into the leftmost position in a node ++ (otherwise it would require updating of delimiting key in a ++ parent) ++ ++ - node plugin agrees with this ++ ++ */ ++ result = ++ node_plugin_by_node(node)->create_item(coord, key, data, ++ NULL); ++ znode_make_dirty(node); ++ } else { ++ /* otherwise do full-fledged carry(). */ ++ result = ++ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT, ++ flags); ++ } ++ zrelse(node); ++ return result; ++} ++ ++/* @coord is set to leaf level and @data is to be inserted to twig level */ ++insert_result ++insert_extent_by_coord(coord_t *coord, /* coord where to insert. ++ * coord->node has to be write ++ * locked by caller */ ++ reiser4_item_data *data,/* data to be inserted */ ++ const reiser4_key *key, /* key of new item */ ++ lock_handle *lh /* lock handle of write lock ++ on node */) ++{ ++ assert("vs-405", coord != NULL); ++ assert("vs-406", data != NULL); ++ assert("vs-407", data->length > 0); ++ assert("vs-408", znode_is_write_locked(coord->node)); ++ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL); ++ ++ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT, ++ 0 /*flags */ ); ++} ++ ++/* Insert into the item at the given coord. ++ ++ First try to skip carry by directly calling ->paste() method of item ++ plugin. If this is impossible (there is not enough free space in the node, ++ or we are pasting into leftmost position in the node), call ++ paste_with_carry() that will do full carry(). 
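insert_by_coord() above and insert_into_item() below share the same three-way decision: fail outright when the data does not fit and rebalancing is forbidden, take the cheap in-node shortcut when it fits, is not leftmost (no parent delimiting-key update needed) and the plugins agree, and otherwise fall back to the full carry() path. A compressed userspace sketch of that decision follows; the enum and flag names are invented.

/* Shortcut-vs-carry decision, condensed from the two functions here. */
#include <stdio.h>

enum path { PATH_FAST, PATH_CARRY, PATH_FULL_ERR };

static enum path choose_insert_path(unsigned need, unsigned free_space,
				    int leftmost, int plugin_ok,
				    int may_rebalance)
{
	if (need > free_space && !may_rebalance)
		return PATH_FULL_ERR;	/* -E_NODE_FULL in reiser4 */
	if (need <= free_space && !leftmost && plugin_ok)
		return PATH_FAST;	/* ->create_item()/->paste() */
	return PATH_CARRY;		/* full-fledged carry() */
}

int main(void)
{
	printf("%d\n", choose_insert_path(100, 200, 0, 1, 1));	/* 0: fast */
	printf("%d\n", choose_insert_path(100, 200, 1, 1, 1));	/* 1: carry */
	printf("%d\n", choose_insert_path(300, 200, 0, 1, 0));	/* 2: error */
	return 0;
}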
++ ++*/ ++/* paste_into_item */ ++int insert_into_item(coord_t * coord /* coord of pasting */ , ++ lock_handle * lh /* lock handle on node involved */ , ++ const reiser4_key * key /* key of unit being pasted */ , ++ reiser4_item_data * data /* parameters for new unit */ , ++ unsigned flags /* insert/paste flags */ ) ++{ ++ int result; ++ int size_change; ++ node_plugin *nplug; ++ item_plugin *iplug; ++ ++ assert("umka-317", coord != NULL); ++ assert("umka-318", key != NULL); ++ ++ iplug = item_plugin_by_coord(coord); ++ nplug = node_plugin_by_coord(coord); ++ ++ assert("nikita-1480", iplug == data->iplug); ++ ++ size_change = space_needed(coord->node, coord, data, 0); ++ if (size_change > (int)znode_free_space(coord->node) && ++ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) ++ && (flags & COPI_DONT_ALLOCATE)) { ++ /* we are forced to use free space of coord->node and new data ++ does not fit into it. */ ++ return -E_NODE_FULL; ++ } ++ ++ /* shortcut paste without carry() overhead. ++ ++ Only possible if: ++ ++ - there is enough free space ++ ++ - paste is not into the leftmost unit in a node (otherwise ++ it would require updating of delimiting key in a parent) ++ ++ - node plugin agrees with this ++ ++ - item plugin agrees with us ++ */ ++ if (size_change <= (int)znode_free_space(coord->node) && ++ (coord->item_pos != 0 || ++ coord->unit_pos != 0 || coord->between == AFTER_UNIT) && ++ coord->unit_pos != 0 && nplug->fast_paste != NULL && ++ nplug->fast_paste(coord) && ++ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) { ++ if (size_change > 0) ++ nplug->change_item_size(coord, size_change); ++ /* NOTE-NIKITA: huh? where @key is used? */ ++ result = iplug->b.paste(coord, data, NULL); ++ if (size_change < 0) ++ nplug->change_item_size(coord, size_change); ++ znode_make_dirty(coord->node); ++ } else ++ /* otherwise do full-fledged carry(). */ ++ result = paste_with_carry(coord, lh, data, key, flags); ++ return result; ++} ++ ++/* this either appends or truncates item @coord */ ++int reiser4_resize_item(coord_t * coord /* coord of item being resized */ , ++ reiser4_item_data * data /* parameters of resize */ , ++ reiser4_key * key /* key of new unit */ , ++ lock_handle * lh /* lock handle of node ++ * being modified */ , ++ cop_insert_flag flags /* carry flags */ ) ++{ ++ int result; ++ znode *node; ++ ++ assert("nikita-362", coord != NULL); ++ assert("nikita-363", data != NULL); ++ assert("vs-245", data->length != 0); ++ ++ node = coord->node; ++ coord_clear_iplug(coord); ++ result = zload(node); ++ if (result != 0) ++ return result; ++ ++ if (data->length < 0) ++ result = node_plugin_by_coord(coord)->shrink_item(coord, ++ -data->length); ++ else ++ result = insert_into_item(coord, lh, key, data, flags); ++ ++ zrelse(node); ++ return result; ++} ++ ++/* insert flow @f */ ++int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ reiser4_item_data *data; ++ carry_op *op; ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*data)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node, ++ 0 /* operate directly on coord -> node */ ); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_carry_pool(pool); ++ return RETERR(op ? 
PTR_ERR(op) : -EIO); ++ } ++ ++ /* these are permanent during insert_flow */ ++ data = (reiser4_item_data *) (lowest_level + 3); ++ data->user = 1; ++ data->iplug = item_plugin_by_id(FORMATTING_ID); ++ data->arg = NULL; ++ /* data.length and data.data will be set before calling paste or ++ insert */ ++ data->length = 0; ++ data->data = NULL; ++ ++ op->u.insert_flow.flags = 0; ++ op->u.insert_flow.insert_point = coord; ++ op->u.insert_flow.flow = f; ++ op->u.insert_flow.data = data; ++ op->u.insert_flow.new_nodes = 0; ++ ++ lowest_level->track_type = CARRY_TRACK_CHANGE; ++ lowest_level->tracked = lh; ++ ++ result = reiser4_carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* Given a coord in parent node, obtain a znode for the corresponding child */ ++znode *child_znode(const coord_t * parent_coord /* coord of pointer to ++ * child */ , ++ znode * parent /* parent of child */ , ++ int incore_p /* if !0 only return child if already in ++ * memory */ , ++ int setup_dkeys_p /* if !0 update delimiting keys of ++ * child */ ) ++{ ++ znode *child; ++ ++ assert("nikita-1374", parent_coord != NULL); ++ assert("nikita-1482", parent != NULL); ++#if REISER4_DEBUG ++ if (setup_dkeys_p) ++ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock)); ++#endif ++ assert("nikita-2947", znode_is_any_locked(parent)); ++ ++ if (znode_get_level(parent) <= LEAF_LEVEL) { ++ /* trying to get child of leaf node */ ++ warning("nikita-1217", "Child of maize?"); ++ return ERR_PTR(RETERR(-EIO)); ++ } ++ if (item_is_internal(parent_coord)) { ++ reiser4_block_nr addr; ++ item_plugin *iplug; ++ reiser4_tree *tree; ++ ++ iplug = item_plugin_by_coord(parent_coord); ++ assert("vs-512", iplug->s.internal.down_link); ++ iplug->s.internal.down_link(parent_coord, NULL, &addr); ++ ++ tree = znode_get_tree(parent); ++ if (incore_p) ++ child = zlook(tree, &addr); ++ else ++ child = ++ zget(tree, &addr, parent, ++ znode_get_level(parent) - 1, ++ reiser4_ctx_gfp_mask_get()); ++ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p) ++ set_child_delimiting_keys(parent, parent_coord, child); ++ } else { ++ warning("nikita-1483", "Internal item expected"); ++ child = ERR_PTR(RETERR(-EIO)); ++ } ++ return child; ++} ++ ++/* remove znode from transaction */ ++static void uncapture_znode(znode * node) ++{ ++ struct page *page; ++ ++ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ if (!reiser4_blocknr_is_fake(znode_get_block(node))) { ++ int ret; ++ ++ /* An already allocated block goes right to the atom's delete set. */ ++ ret = ++ reiser4_dealloc_block(znode_get_block(node), 0, ++ BA_DEFER | BA_FORMATTED); ++ if (ret) ++ warning("zam-942", ++ "can't add a block (%llu) number to atom's delete set\n", ++ (unsigned long long)(*znode_get_block(node))); ++ ++ spin_lock_znode(node); ++ /* Here we return flush reserved block which was reserved at the ++ * moment when this allocated node was marked dirty and still ++ * not used by flush in node relocation procedure. */ ++ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) { ++ txn_atom *atom; ++ ++ atom = jnode_get_atom(ZJNODE(node)); ++ assert("zam-939", atom != NULL); ++ spin_unlock_znode(node); ++ flush_reserved2grabbed(atom, (__u64) 1); ++ spin_unlock_atom(atom); ++ } else ++ spin_unlock_znode(node); ++ } else { ++ /* znode has assigned block which is counted as "fake ++ allocated". Return it back to "free blocks") */ ++ fake_allocated2free((__u64) 1, BA_FORMATTED); ++ } ++ ++ /* ++ * uncapture page from transaction. 
There is a possibility of a race ++ * with ->releasepage(): reiser4_releasepage() detaches page from this ++ * jnode and we have nothing to uncapture. To avoid this, get ++ * reference of node->pg under jnode spin lock. reiser4_uncapture_page() ++ * will deal with released page itself. ++ */ ++ spin_lock_znode(node); ++ page = znode_page(node); ++ if (likely(page != NULL)) { ++ /* ++ * reiser4_uncapture_page() can only be called when we are sure ++ * that znode is pinned in memory, which we are, because ++ * forget_znode() is only called from longterm_unlock_znode(). ++ */ ++ page_cache_get(page); ++ spin_unlock_znode(node); ++ lock_page(page); ++ reiser4_uncapture_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } else { ++ txn_atom *atom; ++ ++ /* handle "flush queued" znodes */ ++ while (1) { ++ atom = jnode_get_atom(ZJNODE(node)); ++ assert("zam-943", atom != NULL); ++ ++ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED) ++ || !atom->nr_running_queues) ++ break; ++ ++ spin_unlock_znode(node); ++ reiser4_atom_wait_event(atom); ++ spin_lock_znode(node); ++ } ++ ++ reiser4_uncapture_block(ZJNODE(node)); ++ spin_unlock_atom(atom); ++ zput(node); ++ } ++} ++ ++/* This is called from longterm_unlock_znode() when last lock is released from ++ the node that has been removed from the tree. At this point node is removed ++ from sibling list and its lock is invalidated. */ ++void forget_znode(lock_handle * handle) ++{ ++ znode *node; ++ reiser4_tree *tree; ++ ++ assert("umka-319", handle != NULL); ++ ++ node = handle->node; ++ tree = znode_get_tree(node); ++ ++ assert("vs-164", znode_is_write_locked(node)); ++ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ assert_rw_locked(&(node->lock.guard)); ++ ++ /* We assume that this node was detached from its parent before ++ * unlocking, it gives no way to reach this node from parent through a ++ * down link. The node should have no children and, thereby, can't be ++ * reached from them by their parent pointers. The only way to obtain a ++ * reference to the node is to use sibling pointers from its left and ++ * right neighbors. In the next several lines we remove the node from ++ * the sibling list. */ ++ ++ write_lock_tree(tree); ++ sibling_list_remove(node); ++ znode_remove(node, tree); ++ write_unlock_tree(tree); ++ ++ /* Here we set JNODE_DYING and cancel all pending lock requests. It ++ * forces all lock requestor threads to repeat iterations of getting ++ * lock on a child, neighbor or parent node. But, those threads can't ++ * come to this node again, because this node is no longer a child, ++ * neighbor or parent of any other node. This order of znode ++ * invalidation does not allow other threads to waste cpu time is a busy ++ * loop, trying to lock dying object. The exception is in the flush ++ * code when we take node directly from atom's capture list.*/ ++ reiser4_invalidate_lock(handle); ++ uncapture_znode(node); ++} ++ ++/* Check that internal item at @pointer really contains pointer to @child. 
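check_tree_pointer() and find_child_ptr() below implement a three-tier lookup: try the cached in-parent position and verify it, fall back to a keyed search, and only as a last resort scan the node linearly (find_child_by_addr()). A toy userspace version of that verify-or-fall-back shape follows; the slot array and helpers are invented, and the keyed middle tier is elided to a comment.

/* Cached-position lookup with verification and fallback scan. */
#include <stdio.h>

#define NR 4
static const int parent_slots[NR] = { 11, 22, 33, 44 };

static int verify(int pos, int child_id)	/* check_tree_pointer() role */
{
	return pos >= 0 && pos < NR && parent_slots[pos] == child_id;
}

static int find_child(int *cached_pos, int child_id)
{
	int pos;

	/* tier 1: cached position, cheap but possibly stale */
	if (verify(*cached_pos, child_id))
		return *cached_pos;
	*cached_pos = -1;			/* invalidate stale cache */

	/* tier 2 (keyed search) elided; tier 3: linear scan */
	for (pos = 0; pos < NR; pos++) {
		if (verify(pos, child_id)) {
			*cached_pos = pos;	/* refresh the cache */
			return pos;
		}
	}
	return -1;				/* NS_NOT_FOUND */
}

int main(void)
{
	int cached = 3;				/* deliberately stale */

	printf("found at %d\n", find_child(&cached, 22));	/* 1 */
	return 0;
}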
*/ ++int check_tree_pointer(const coord_t * pointer /* would-be pointer to ++ * @child */ , ++ const znode * child /* child znode */ ) ++{ ++ assert("nikita-1016", pointer != NULL); ++ assert("nikita-1017", child != NULL); ++ assert("nikita-1018", pointer->node != NULL); ++ ++ assert("nikita-1325", znode_is_any_locked(pointer->node)); ++ ++ assert("nikita-2985", ++ znode_get_level(pointer->node) == znode_get_level(child) + 1); ++ ++ coord_clear_iplug((coord_t *) pointer); ++ ++ if (coord_is_existing_unit(pointer)) { ++ item_plugin *iplug; ++ reiser4_block_nr addr; ++ ++ if (item_is_internal(pointer)) { ++ iplug = item_plugin_by_coord(pointer); ++ assert("vs-513", iplug->s.internal.down_link); ++ iplug->s.internal.down_link(pointer, NULL, &addr); ++ /* check that cached value is correct */ ++ if (disk_addr_eq(&addr, znode_get_block(child))) { ++ return NS_FOUND; ++ } ++ } ++ } ++ /* warning ("jmacd-1002", "tree pointer incorrect"); */ ++ return NS_NOT_FOUND; ++} ++ ++/* find coord of pointer to new @child in @parent. ++ ++ Find the &coord_t in the @parent where pointer to a given @child will ++ be in. ++ ++*/ ++int find_new_child_ptr(znode * parent /* parent znode, passed locked */ , ++ znode * ++ child UNUSED_ARG /* child znode, passed locked */ , ++ znode * left /* left brother of new node */ , ++ coord_t * result /* where result is stored in */ ) ++{ ++ int ret; ++ ++ assert("nikita-1486", parent != NULL); ++ assert("nikita-1487", child != NULL); ++ assert("nikita-1488", result != NULL); ++ ++ ret = find_child_ptr(parent, left, result); ++ if (ret != NS_FOUND) { ++ warning("nikita-1489", "Cannot find brother position: %i", ret); ++ return RETERR(-EIO); ++ } else { ++ result->between = AFTER_UNIT; ++ return RETERR(NS_NOT_FOUND); ++ } ++} ++ ++/* find coord of pointer to @child in @parent. ++ ++ Find the &coord_t in the @parent where pointer to a given @child is in. ++ ++*/ ++int find_child_ptr(znode * parent /* parent znode, passed locked */ , ++ znode * child /* child znode, passed locked */ , ++ coord_t * result /* where result is stored in */ ) ++{ ++ int lookup_res; ++ node_plugin *nplug; ++ /* left delimiting key of a child */ ++ reiser4_key ld; ++ reiser4_tree *tree; ++ ++ assert("nikita-934", parent != NULL); ++ assert("nikita-935", child != NULL); ++ assert("nikita-936", result != NULL); ++ assert("zam-356", znode_is_loaded(parent)); ++ ++ coord_init_zero(result); ++ result->node = parent; ++ ++ nplug = parent->nplug; ++ assert("nikita-939", nplug != NULL); ++ ++ tree = znode_get_tree(parent); ++ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is ++ * not aliased to ->in_parent of some znode. Otherwise, ++ * parent_coord_to_coord() below would modify data protected by tree ++ * lock. */ ++ read_lock_tree(tree); ++ /* fast path. Try to use cached value. Lock tree to keep ++ node->pos_in_parent and pos->*_blocknr consistent. */ ++ if (child->in_parent.item_pos + 1 != 0) { ++ parent_coord_to_coord(&child->in_parent, result); ++ if (check_tree_pointer(result, child) == NS_FOUND) { ++ read_unlock_tree(tree); ++ return NS_FOUND; ++ } ++ ++ child->in_parent.item_pos = (unsigned short)~0; ++ } ++ read_unlock_tree(tree); ++ ++ /* is above failed, find some key from @child. We are looking for the ++ least key in a child. */ ++ read_lock_dk(tree); ++ ld = *znode_get_ld_key(child); ++ read_unlock_dk(tree); ++ /* ++ * now, lookup parent with key just found. 
Note, that left delimiting ++ * key doesn't identify node uniquely, because (in extremely rare ++ * case) two nodes can have equal left delimiting keys, if one of them ++ * is completely filled with directory entries that all happened to be ++ * hash collision. But, we check block number in check_tree_pointer() ++ * and, so, are safe. ++ */ ++ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result); ++ /* update cached pos_in_node */ ++ if (lookup_res == NS_FOUND) { ++ write_lock_tree(tree); ++ coord_to_parent_coord(result, &child->in_parent); ++ write_unlock_tree(tree); ++ lookup_res = check_tree_pointer(result, child); ++ } ++ if (lookup_res == NS_NOT_FOUND) ++ lookup_res = find_child_by_addr(parent, child, result); ++ return lookup_res; ++} ++ ++/* find coord of pointer to @child in @parent by scanning ++ ++ Find the &coord_t in the @parent where pointer to a given @child ++ is in by scanning all internal items in @parent and comparing block ++ numbers in them with that of @child. ++ ++*/ ++static int find_child_by_addr(znode * parent /* parent znode, passed locked */ , ++ znode * child /* child znode, passed locked */ , ++ coord_t * result /* where result is stored in */ ) ++{ ++ int ret; ++ ++ assert("nikita-1320", parent != NULL); ++ assert("nikita-1321", child != NULL); ++ assert("nikita-1322", result != NULL); ++ ++ ret = NS_NOT_FOUND; ++ ++ for_all_units(result, parent) { ++ if (check_tree_pointer(result, child) == NS_FOUND) { ++ write_lock_tree(znode_get_tree(parent)); ++ coord_to_parent_coord(result, &child->in_parent); ++ write_unlock_tree(znode_get_tree(parent)); ++ ret = NS_FOUND; ++ break; ++ } ++ } ++ return ret; ++} ++ ++/* true, if @addr is "unallocated block number", which is just address, with ++ highest bit set. */ ++int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to ++ * check */ ) ++{ ++ assert("nikita-1766", addr != NULL); ++ cassert(sizeof(reiser4_block_nr) == 8); ++ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) == ++ REISER4_UNALLOCATED_STATUS_VALUE; ++} ++ ++/* returns true if removing bytes of given range of key [from_key, to_key] ++ causes removing of whole item @from */ ++static int ++item_removed_completely(coord_t * from, const reiser4_key * from_key, ++ const reiser4_key * to_key) ++{ ++ item_plugin *iplug; ++ reiser4_key key_in_item; ++ ++ assert("umka-325", from != NULL); ++ assert("", item_is_extent(from)); ++ ++ /* check first key just for case */ ++ item_key_by_coord(from, &key_in_item); ++ if (keygt(from_key, &key_in_item)) ++ return 0; ++ ++ /* check last key */ ++ iplug = item_plugin_by_coord(from); ++ assert("vs-611", iplug && iplug->s.file.append_key); ++ ++ iplug->s.file.append_key(from, &key_in_item); ++ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1); ++ ++ if (keylt(to_key, &key_in_item)) ++ /* last byte is not removed */ ++ return 0; ++ return 1; ++} ++ ++/* helper function for prepare_twig_kill(): @left and @right are formatted ++ * neighbors of extent item being completely removed. 
Load and lock neighbors ++ * and store lock handles into @cdata for later use by kill_hook_extent() */ ++static int ++prepare_children(znode * left, znode * right, carry_kill_data * kdata) ++{ ++ int result; ++ int left_loaded; ++ int right_loaded; ++ ++ result = 0; ++ left_loaded = right_loaded = 0; ++ ++ if (left != NULL) { ++ result = zload(left); ++ if (result == 0) { ++ left_loaded = 1; ++ result = longterm_lock_znode(kdata->left, left, ++ ZNODE_READ_LOCK, ++ ZNODE_LOCK_LOPRI); ++ } ++ } ++ if (result == 0 && right != NULL) { ++ result = zload(right); ++ if (result == 0) { ++ right_loaded = 1; ++ result = longterm_lock_znode(kdata->right, right, ++ ZNODE_READ_LOCK, ++ ZNODE_LOCK_HIPRI | ++ ZNODE_LOCK_NONBLOCK); ++ } ++ } ++ if (result != 0) { ++ done_lh(kdata->left); ++ done_lh(kdata->right); ++ if (left_loaded != 0) ++ zrelse(left); ++ if (right_loaded != 0) ++ zrelse(right); ++ } ++ return result; ++} ++ ++static void done_children(carry_kill_data * kdata) ++{ ++ if (kdata->left != NULL && kdata->left->node != NULL) { ++ zrelse(kdata->left->node); ++ done_lh(kdata->left); ++ } ++ if (kdata->right != NULL && kdata->right->node != NULL) { ++ zrelse(kdata->right->node); ++ done_lh(kdata->right); ++ } ++} ++ ++/* part of cut_node. It is called when cut_node is called to remove or cut part ++ of extent item. When head of that item is removed - we have to update right ++ delimiting of left neighbor of extent. When item is removed completely - we ++ have to set sibling link between left and right neighbor of removed ++ extent. This may return -E_DEADLOCK because of trying to get left neighbor ++ locked. So, caller should repeat an attempt ++*/ ++/* Audited by: umka (2002.06.16) */ ++static int ++prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor) ++{ ++ int result; ++ reiser4_key key; ++ lock_handle left_lh; ++ lock_handle right_lh; ++ coord_t left_coord; ++ coord_t *from; ++ znode *left_child; ++ znode *right_child; ++ reiser4_tree *tree; ++ int left_zloaded_here, right_zloaded_here; ++ ++ from = kdata->params.from; ++ assert("umka-326", from != NULL); ++ assert("umka-327", kdata->params.to != NULL); ++ ++ /* for one extent item only yet */ ++ assert("vs-591", item_is_extent(from)); ++ assert("vs-592", from->item_pos == kdata->params.to->item_pos); ++ ++ if ((kdata->params.from_key ++ && keygt(kdata->params.from_key, item_key_by_coord(from, &key))) ++ || from->unit_pos != 0) { ++ /* head of item @from is not removed, there is nothing to ++ worry about */ ++ return 0; ++ } ++ ++ result = 0; ++ left_zloaded_here = 0; ++ right_zloaded_here = 0; ++ ++ left_child = right_child = NULL; ++ ++ coord_dup(&left_coord, from); ++ init_lh(&left_lh); ++ init_lh(&right_lh); ++ if (coord_prev_unit(&left_coord)) { ++ /* @from is leftmost item in its node */ ++ if (!locked_left_neighbor) { ++ result = ++ reiser4_get_left_neighbor(&left_lh, from->node, ++ ZNODE_READ_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ switch (result) { ++ case 0: ++ break; ++ case -E_NO_NEIGHBOR: ++ /* there is no formatted node to the left of ++ from->node */ ++ warning("vs-605", ++ "extent item has smallest key in " ++ "the tree and it is about to be removed"); ++ return 0; ++ case -E_DEADLOCK: ++ /* need to restart */ ++ default: ++ return result; ++ } ++ ++ /* we have acquired left neighbor of from->node */ ++ result = zload(left_lh.node); ++ if (result) ++ goto done; ++ ++ locked_left_neighbor = left_lh.node; ++ } else { ++ /* squalloc_right_twig_cut should have supplied locked ++ * left neighbor */ ++ 
assert("vs-834", ++ znode_is_write_locked(locked_left_neighbor)); ++ result = zload(locked_left_neighbor); ++ if (result) ++ return result; ++ } ++ ++ left_zloaded_here = 1; ++ coord_init_last_unit(&left_coord, locked_left_neighbor); ++ } ++ ++ if (!item_is_internal(&left_coord)) { ++ /* what else but extent can be on twig level */ ++ assert("vs-606", item_is_extent(&left_coord)); ++ ++ /* there is no left formatted child */ ++ if (left_zloaded_here) ++ zrelse(locked_left_neighbor); ++ done_lh(&left_lh); ++ return 0; ++ } ++ ++ tree = znode_get_tree(left_coord.node); ++ left_child = child_znode(&left_coord, left_coord.node, 1, 0); ++ ++ if (IS_ERR(left_child)) { ++ result = PTR_ERR(left_child); ++ goto done; ++ } ++ ++ /* left child is acquired, calculate new right delimiting key for it ++ and get right child if it is necessary */ ++ if (item_removed_completely ++ (from, kdata->params.from_key, kdata->params.to_key)) { ++ /* try to get right child of removed item */ ++ coord_t right_coord; ++ ++ assert("vs-607", ++ kdata->params.to->unit_pos == ++ coord_last_unit_pos(kdata->params.to)); ++ coord_dup(&right_coord, kdata->params.to); ++ if (coord_next_unit(&right_coord)) { ++ /* @to is rightmost unit in the node */ ++ result = ++ reiser4_get_right_neighbor(&right_lh, from->node, ++ ZNODE_READ_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ switch (result) { ++ case 0: ++ result = zload(right_lh.node); ++ if (result) ++ goto done; ++ ++ right_zloaded_here = 1; ++ coord_init_first_unit(&right_coord, ++ right_lh.node); ++ item_key_by_coord(&right_coord, &key); ++ break; ++ ++ case -E_NO_NEIGHBOR: ++ /* there is no formatted node to the right of ++ from->node */ ++ read_lock_dk(tree); ++ key = *znode_get_rd_key(from->node); ++ read_unlock_dk(tree); ++ right_coord.node = NULL; ++ result = 0; ++ break; ++ default: ++ /* real error */ ++ goto done; ++ } ++ } else { ++ /* there is an item to the right of @from - take its key */ ++ item_key_by_coord(&right_coord, &key); ++ } ++ ++ /* try to get right child of @from */ ++ if (right_coord.node && /* there is right neighbor of @from */ ++ item_is_internal(&right_coord)) { /* it is internal item */ ++ right_child = child_znode(&right_coord, ++ right_coord.node, 1, 0); ++ ++ if (IS_ERR(right_child)) { ++ result = PTR_ERR(right_child); ++ goto done; ++ } ++ ++ } ++ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and ++ update of right delimiting key of left_child */ ++ result = prepare_children(left_child, right_child, kdata); ++ } else { ++ /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */ ++ result = prepare_children(left_child, NULL, kdata); ++ } ++ ++ done: ++ if (right_child) ++ zput(right_child); ++ if (right_zloaded_here) ++ zrelse(right_lh.node); ++ done_lh(&right_lh); ++ ++ if (left_child) ++ zput(left_child); ++ if (left_zloaded_here) ++ zrelse(locked_left_neighbor); ++ done_lh(&left_lh); ++ return result; ++} ++ ++/* this is used to remove part of node content between coordinates @from and @to. 
Units to which @from and @to are set ++ are to be cut completely */ ++/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */ ++int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */ ++ const reiser4_key * to_key, /* last key to be removed */ ++ reiser4_key * ++ smallest_removed /* smallest key actually removed */ ) ++{ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_cut_data *cut_data; ++ carry_op *op; ++ ++ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT); ++ ++ pool = ++ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(*cut_data)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); ++ assert("vs-1509", op != 0); ++ if (IS_ERR(op)) { ++ done_carry_pool(pool); ++ return PTR_ERR(op); ++ } ++ ++ cut_data = (carry_cut_data *) (lowest_level + 3); ++ cut_data->params.from = from; ++ cut_data->params.to = to; ++ cut_data->params.from_key = from_key; ++ cut_data->params.to_key = to_key; ++ cut_data->params.smallest_removed = smallest_removed; ++ ++ op->u.cut_or_kill.is_cut = 1; ++ op->u.cut_or_kill.u.cut = cut_data; ++ ++ result = reiser4_carry(lowest_level, NULL); ++ done_carry_pool(pool); ++ ++ return result; ++} ++ ++/* cut part of the node ++ ++ Cut part or whole content of node. ++ ++ cut data between @from and @to of @from->node and call carry() to make ++ corresponding changes in the tree. @from->node may become empty. If so - ++ pointer to it will be removed. Neighboring nodes are not changed. Smallest ++ removed key is stored in @smallest_removed ++ ++*/ ++int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */ ++ coord_t * to, /* coord of the last unit/item that will be eliminated */ ++ const reiser4_key * from_key, /* first key to be removed */ ++ const reiser4_key * to_key, /* last key to be removed */ ++ reiser4_key * smallest_removed, /* smallest key actually removed */ ++ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor ++ * locked (in squalloc_right_twig_cut, namely) */ ++ struct inode *inode, /* inode of file whose item (or its part) is to be killed. 
This is necessary to ++ invalidate pages together with item pointing to them */ ++ int truncate) ++{ /* this call is made for file truncate) */ ++ int result; ++ carry_pool *pool; ++ carry_level *lowest_level; ++ carry_kill_data *kdata; ++ lock_handle *left_child; ++ lock_handle *right_child; ++ carry_op *op; ++ ++ assert("umka-328", from != NULL); ++ assert("vs-316", !node_is_empty(from->node)); ++ assert("nikita-1812", coord_is_existing_unit(from) ++ && coord_is_existing_unit(to)); ++ ++ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */ ++ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + ++ sizeof(carry_kill_data) + ++ 2 * sizeof(lock_handle) + ++ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t)); ++ if (IS_ERR(pool)) ++ return PTR_ERR(pool); ++ ++ lowest_level = (carry_level *) (pool + 1); ++ init_carry_level(lowest_level, pool); ++ ++ kdata = (carry_kill_data *) (lowest_level + 3); ++ left_child = (lock_handle *) (kdata + 1); ++ right_child = left_child + 1; ++ ++ init_lh(left_child); ++ init_lh(right_child); ++ ++ kdata->params.from = from; ++ kdata->params.to = to; ++ kdata->params.from_key = from_key; ++ kdata->params.to_key = to_key; ++ kdata->params.smallest_removed = smallest_removed; ++ kdata->params.truncate = truncate; ++ kdata->flags = 0; ++ kdata->inode = inode; ++ kdata->left = left_child; ++ kdata->right = right_child; ++ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */ ++ kdata->buf = (char *)(right_child + 1); ++ ++ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) { ++ /* left child of extent item may have to get updated right ++ delimiting key and to get linked with right child of extent ++ @from if it will be removed completely */ ++ result = prepare_twig_kill(kdata, locked_left_neighbor); ++ if (result) { ++ done_children(kdata); ++ done_carry_pool(pool); ++ return result; ++ } ++ } ++ ++ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); ++ if (IS_ERR(op) || (op == NULL)) { ++ done_children(kdata); ++ done_carry_pool(pool); ++ return RETERR(op ? PTR_ERR(op) : -EIO); ++ } ++ ++ op->u.cut_or_kill.is_cut = 0; ++ op->u.cut_or_kill.u.kill = kdata; ++ ++ result = reiser4_carry(lowest_level, NULL); ++ ++ done_children(kdata); ++ done_carry_pool(pool); ++ return result; ++} ++ ++void ++fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate) ++{ ++ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) { ++ pgoff_t start_pg, end_pg; ++ ++ start_pg = start >> PAGE_CACHE_SHIFT; ++ end_pg = (end - 1) >> PAGE_CACHE_SHIFT; ++ ++ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) { ++ /* ++ * kill up to the page boundary. ++ */ ++ assert("vs-123456", start_pg == end_pg); ++ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1, ++ truncate); ++ } else if (start_pg != end_pg) { ++ /* ++ * page boundary is within killed portion of node. ++ */ ++ assert("vs-654321", end_pg - start_pg == 1); ++ reiser4_invalidate_pages(inode->i_mapping, end_pg, ++ end_pg - start_pg, 1); ++ } ++ } ++ inode_sub_bytes(inode, end - start); ++} ++ ++/** ++ * Delete whole @node from the reiser4 tree without loading it. ++ * ++ * @left: locked left neighbor, ++ * @node: node to be deleted, ++ * @smallest_removed: leftmost key of deleted node, ++ * @object: inode pointer, if we truncate a file body. ++ * @truncate: true if called for file truncate. ++ * ++ * @return: 0 if success, error code otherwise. 
++ * ++ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it ++ * contains the right value of the smallest removed key from the previous ++ * cut_worker() iteration. This is needed for proper accounting of ++ * "i_blocks" and "i_bytes" fields of the @object. ++ */ ++int reiser4_delete_node(znode * node, reiser4_key * smallest_removed, ++ struct inode *object, int truncate) ++{ ++ lock_handle parent_lock; ++ coord_t cut_from; ++ coord_t cut_to; ++ reiser4_tree *tree; ++ int ret; ++ ++ assert("zam-937", node != NULL); ++ assert("zam-933", znode_is_write_locked(node)); ++ assert("zam-999", smallest_removed != NULL); ++ ++ init_lh(&parent_lock); ++ ++ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK); ++ if (ret) ++ return ret; ++ ++ assert("zam-934", !znode_above_root(parent_lock.node)); ++ ++ ret = zload(parent_lock.node); ++ if (ret) ++ goto failed_nozrelse; ++ ++ ret = find_child_ptr(parent_lock.node, node, &cut_from); ++ if (ret) ++ goto failed; ++ ++ /* decrement child counter and set parent pointer to NULL before ++ deleting the list from parent node because of checks in ++ internal_kill_item_hook (we can delete the last item from the parent ++ node, the parent node is going to be deleted and its c_count should ++ be zero). */ ++ ++ tree = znode_get_tree(node); ++ write_lock_tree(tree); ++ init_parent_coord(&node->in_parent, NULL); ++ --parent_lock.node->c_count; ++ write_unlock_tree(tree); ++ ++ assert("zam-989", item_is_internal(&cut_from)); ++ ++ /* @node should be deleted after unlocking. */ ++ ZF_SET(node, JNODE_HEARD_BANSHEE); ++ ++ /* remove a pointer from the parent node to the node being deleted. */ ++ coord_dup(&cut_to, &cut_from); ++ /* FIXME: shouldn't this be kill_node_content */ ++ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL); ++ if (ret) ++ /* FIXME(Zam): Should we re-connect the node to its parent if ++ * cut_node fails? */ ++ goto failed; ++ ++ { ++ reiser4_tree *tree = current_tree; ++ __u64 start_offset = 0, end_offset = 0; ++ ++ read_lock_tree(tree); ++ write_lock_dk(tree); ++ if (object) { ++ /* We use @smallest_removed and the left delimiting of ++ * the current node for @object->i_blocks, i_bytes ++ * calculation. We assume that the items after the ++ * *@smallest_removed key have been deleted from the ++ * file body. */ ++ start_offset = get_key_offset(znode_get_ld_key(node)); ++ end_offset = get_key_offset(smallest_removed); ++ } ++ ++ assert("zam-1021", znode_is_connected(node)); ++ if (node->left) ++ znode_set_rd_key(node->left, znode_get_rd_key(node)); ++ ++ *smallest_removed = *znode_get_ld_key(node); ++ ++ write_unlock_dk(tree); ++ read_unlock_tree(tree); ++ ++ if (object) { ++ /* we used to perform actions which are to be performed on items on their removal from tree in ++ special item method - kill_hook. Here for optimization reasons we avoid reading node ++ containing item we remove and can not call item's kill hook. Instead we call function which ++ does exactly the same things as tail kill hook in assumption that node we avoid reading ++ contains only one item and that item is a tail one. 
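
The byte accounting described here can be spelled out: the range a skipped (never loaded) node contributed to the file body is bounded below by that node's left delimiting key and above by the smallest key removed in the previous iteration. A compilable toy with invented offsets:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            // hypothetical key offsets, not taken from a real tree
            uint64_t ld_key_offset = 4096;          // leftmost byte in node
            uint64_t prev_smallest_removed = 12288; // from last iteration

            // bytes the unloaded node contributed to the file body
            uint64_t removed = prev_smallest_removed - ld_key_offset;
            assert(removed == 8192);
            printf("i_bytes -= %llu\n", (unsigned long long)removed);
            return 0;
    }
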
*/ ++ fake_kill_hook_tail(object, start_offset, end_offset, ++ truncate); ++ } ++ } ++ failed: ++ zrelse(parent_lock.node); ++ failed_nozrelse: ++ done_lh(&parent_lock); ++ ++ return ret; ++} ++ ++static int can_delete(const reiser4_key *key, znode *node) ++{ ++ int result; ++ ++ read_lock_dk(current_tree); ++ result = keyle(key, znode_get_ld_key(node)); ++ read_unlock_dk(current_tree); ++ return result; ++} ++ ++/** ++ * This subroutine is not optimal but implementation seems to ++ * be easier). ++ * ++ * @tap: the point deletion process begins from, ++ * @from_key: the beginning of the deleted key range, ++ * @to_key: the end of the deleted key range, ++ * @smallest_removed: the smallest removed key, ++ * @truncate: true if called for file truncate. ++ * @progress: return true if a progress in file items deletions was made, ++ * @smallest_removed value is actual in that case. ++ * ++ * @return: 0 if success, error code otherwise, -E_REPEAT means that long ++ * reiser4_cut_tree operation was interrupted for allowing atom commit. ++ */ ++int ++cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key, ++ const reiser4_key * to_key, ++ reiser4_key * smallest_removed, struct inode *object, ++ int truncate, int *progress) ++{ ++ lock_handle next_node_lock; ++ coord_t left_coord; ++ int result; ++ ++ assert("zam-931", tap->coord->node != NULL); ++ assert("zam-932", znode_is_write_locked(tap->coord->node)); ++ ++ *progress = 0; ++ init_lh(&next_node_lock); ++ ++ while (1) { ++ znode *node; /* node from which items are cut */ ++ node_plugin *nplug; /* node plugin for @node */ ++ ++ node = tap->coord->node; ++ ++ /* Move next_node_lock to the next node on the left. */ ++ result = ++ reiser4_get_left_neighbor(&next_node_lock, node, ++ ZNODE_WRITE_LOCK, ++ GN_CAN_USE_UPPER_LEVELS); ++ if (result != 0 && result != -E_NO_NEIGHBOR) ++ break; ++ /* Check can we delete the node as a whole. */ ++ if (*progress && znode_get_level(node) == LEAF_LEVEL && ++ can_delete(from_key, node)) { ++ result = reiser4_delete_node(node, smallest_removed, ++ object, truncate); ++ } else { ++ result = reiser4_tap_load(tap); ++ if (result) ++ return result; ++ ++ /* Prepare the second (right) point for cut_node() */ ++ if (*progress) ++ coord_init_last_unit(tap->coord, node); ++ ++ else if (item_plugin_by_coord(tap->coord)->b.lookup == ++ NULL) ++ /* set rightmost unit for the items without lookup method */ ++ tap->coord->unit_pos = ++ coord_last_unit_pos(tap->coord); ++ ++ nplug = node->nplug; ++ ++ assert("vs-686", nplug); ++ assert("vs-687", nplug->lookup); ++ ++ /* left_coord is leftmost unit cut from @node */ ++ result = nplug->lookup(node, from_key, ++ FIND_MAX_NOT_MORE_THAN, ++ &left_coord); ++ ++ if (IS_CBKERR(result)) ++ break; ++ ++ /* adjust coordinates so that they are set to existing units */ ++ if (coord_set_to_right(&left_coord) ++ || coord_set_to_left(tap->coord)) { ++ result = 0; ++ break; ++ } ++ ++ if (coord_compare(&left_coord, tap->coord) == ++ COORD_CMP_ON_RIGHT) { ++ /* keys from @from_key to @to_key are not in the tree */ ++ result = 0; ++ break; ++ } ++ ++ if (left_coord.item_pos != tap->coord->item_pos) { ++ /* do not allow to cut more than one item. It is added to solve problem of truncating ++ partially converted files. If file is partially converted there may exist a twig node ++ containing both internal item or items pointing to leaf nodes with formatting items ++ and extent item. 
We do not want to kill internal items being at twig node here
++			   because cut_tree_worker assumes killing them from the
++			   leaf level */
++				coord_dup(&left_coord, tap->coord);
++				assert("vs-1652",
++				       coord_is_existing_unit(&left_coord));
++				left_coord.unit_pos = 0;
++			}
++
++			/* cut data from one node */
++			/* *smallest_removed = *reiser4_min_key(); */
++			result =
++			    kill_node_content(&left_coord, tap->coord, from_key,
++					      to_key, smallest_removed,
++					      next_node_lock.node, object,
++					      truncate);
++			reiser4_tap_relse(tap);
++		}
++		if (result)
++			break;
++
++		++(*progress);
++
++		/* Check whether all items with keys >= from_key were removed
++		 * from the tree. */
++		if (keyle(smallest_removed, from_key))
++			/* result = 0; */
++			break;
++
++		if (next_node_lock.node == NULL)
++			break;
++
++		result = reiser4_tap_move(tap, &next_node_lock);
++		done_lh(&next_node_lock);
++		if (result)
++			break;
++
++		/* Break a long reiser4_cut_tree operation (deletion of a
++		   large file) if the atom requires a commit. */
++		if (*progress > CUT_TREE_MIN_ITERATIONS
++		    && current_atom_should_commit()) {
++			result = -E_REPEAT;
++			break;
++		}
++	}
++	done_lh(&next_node_lock);
++	/* assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); */
++	return result;
++}
++
++/* There is a fundamental problem with optimizing deletes: the VFS does it
++   one file at a time. Another problem is that if an item can be
++   anything, then deleting items must be done one at a time. It just
++   seems cleaner to write this so that it takes a from key and a to key
++   and cuts everything between them. */
++
++/* use this function with care if deleting more than what is part of a single file. */
++/* do not use this when cutting a single item, it is suboptimal for that */
++
++/* You are encouraged to write plugin-specific versions of this. It
++   cannot be optimal for all plugins because it works item at a time,
++   and some plugins could sometimes work node at a time. Regular files,
++   however, cannot be optimized to work node at a time, because extents
++   need to free the blocks they point to.
++
++   Optimizations compared to the v3 code:
++
++   It does not balance (that task is left to the memory pressure code).
++
++   Nodes are deleted only if empty.
++
++   Uses extents.
++
++   Performs read-ahead of formatted nodes whose contents are part of
++   the deletion.
++*/
++
++/**
++ * Delete everything from the reiser4 tree between two keys: @from_key and
++ * @to_key.
++ *
++ * @from_key: the beginning of the deleted key range,
++ * @to_key: the end of the deleted key range,
++ * @smallest_removed: the smallest removed key,
++ * @object: owner of the items being cut.
++ * @truncate: true if called for file truncate.
++ * @progress: set to true if progress was made in deleting file items;
++ * the @smallest_removed value is valid in that case.
++ *
++ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
++ * cut_tree operation was interrupted to allow an atom commit.
++ */ ++ ++int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key, ++ const reiser4_key * to_key, ++ reiser4_key * smallest_removed_p, ++ struct inode *object, int truncate, int *progress) ++{ ++ lock_handle lock; ++ int result; ++ tap_t tap; ++ coord_t right_coord; ++ reiser4_key smallest_removed; ++ int (*cut_tree_worker) (tap_t *, const reiser4_key *, ++ const reiser4_key *, reiser4_key *, ++ struct inode *, int, int *); ++ STORE_COUNTERS; ++ ++ assert("umka-329", tree != NULL); ++ assert("umka-330", from_key != NULL); ++ assert("umka-331", to_key != NULL); ++ assert("zam-936", keyle(from_key, to_key)); ++ ++ if (smallest_removed_p == NULL) ++ smallest_removed_p = &smallest_removed; ++ ++ init_lh(&lock); ++ ++ do { ++ /* Find rightmost item to cut away from the tree. */ ++ result = reiser4_object_lookup(object, to_key, &right_coord, ++ &lock, ZNODE_WRITE_LOCK, ++ FIND_MAX_NOT_MORE_THAN, ++ TWIG_LEVEL, LEAF_LEVEL, ++ CBK_UNIQUE, NULL /*ra_info */); ++ if (result != CBK_COORD_FOUND) ++ break; ++ if (object == NULL ++ || inode_file_plugin(object)->cut_tree_worker == NULL) ++ cut_tree_worker = cut_tree_worker_common; ++ else ++ cut_tree_worker = ++ inode_file_plugin(object)->cut_tree_worker; ++ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK); ++ result = ++ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p, ++ object, truncate, progress); ++ reiser4_tap_done(&tap); ++ ++ reiser4_preempt_point(); ++ ++ } while (0); ++ ++ done_lh(&lock); ++ ++ if (result) { ++ switch (result) { ++ case -E_NO_NEIGHBOR: ++ result = 0; ++ break; ++ case -E_DEADLOCK: ++ result = -E_REPEAT; ++ case -E_REPEAT: ++ case -ENOMEM: ++ case -ENOENT: ++ break; ++ default: ++ warning("nikita-2861", "failure: %i", result); ++ } ++ } ++ ++ CHECK_COUNTERS; ++ return result; ++} ++ ++/* repeat reiser4_cut_tree_object until everything is deleted. ++ * unlike cut_file_items, it does not end current transaction if -E_REPEAT ++ * is returned by cut_tree_object. 
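
The error translation at the end of reiser4_cut_tree_object() above relies on a deliberate switch fall-through: -E_DEADLOCK is first rewritten to -E_REPEAT and then handled by the same silent case. A standalone model of just that pattern, with invented error values:

    #include <stdio.h>

    enum { E_OK = 0, E_DEADLOCK = -1, E_REPEAT = -2, E_NO_NEIGHBOR = -3 };

    static int translate(int result)
    {
            if (result == E_OK)
                    return result;
            switch (result) {
            case E_NO_NEIGHBOR:
                    result = E_OK;     // ran off the tree: nothing left
                    break;
            case E_DEADLOCK:
                    result = E_REPEAT; // fall through: both mean
            case E_REPEAT:             // "try again" to the caller
                    break;
            default:
                    fprintf(stderr, "failure: %i\n", result);
            }
            return result;
    }

    int main(void)
    {
            return translate(E_DEADLOCK) == E_REPEAT ? 0 : 1;
    }
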
*/ ++int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, ++ const reiser4_key * to, struct inode *inode, int truncate) ++{ ++ int result; ++ int progress; ++ ++ do { ++ result = reiser4_cut_tree_object(tree, from, to, NULL, ++ inode, truncate, &progress); ++ } while (result == -E_REPEAT); ++ ++ return result; ++} ++ ++/* finishing reiser4 initialization */ ++int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being ++ * initialized */ , ++ const reiser4_block_nr * root_block /* address of a root block ++ * on a disk */ , ++ tree_level height /* height of a tree */ , ++ node_plugin * nplug /* default node plugin */ ) ++{ ++ int result; ++ ++ assert("nikita-306", tree != NULL); ++ assert("nikita-307", root_block != NULL); ++ assert("nikita-308", height > 0); ++ assert("nikita-309", nplug != NULL); ++ assert("zam-587", tree->super != NULL); ++ ++ tree->root_block = *root_block; ++ tree->height = height; ++ tree->estimate_one_insert = calc_estimate_one_insert(height); ++ tree->nplug = nplug; ++ ++ tree->znode_epoch = 1ull; ++ ++ cbk_cache_init(&tree->cbk_cache); ++ ++ result = znodes_tree_init(tree); ++ if (result == 0) ++ result = jnodes_tree_init(tree); ++ if (result == 0) { ++ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, ++ reiser4_ctx_gfp_mask_get()); ++ if (IS_ERR(tree->uber)) { ++ result = PTR_ERR(tree->uber); ++ tree->uber = NULL; ++ } ++ } ++ return result; ++} ++ ++/* release resources associated with @tree */ ++void reiser4_done_tree(reiser4_tree * tree /* tree to release */ ) ++{ ++ if (tree == NULL) ++ return; ++ ++ if (tree->uber != NULL) { ++ zput(tree->uber); ++ tree->uber = NULL; ++ } ++ znodes_tree_done(tree); ++ jnodes_tree_done(tree); ++ cbk_cache_done(&tree->cbk_cache); ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/tree.h linux-2.6.33/fs/reiser4/tree.h +--- linux-2.6.33.orig/fs/reiser4/tree.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tree.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,577 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Tree operations. See fs/reiser4/tree.c for comments */ ++ ++#if !defined( __REISER4_TREE_H__ ) ++#define __REISER4_TREE_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "plugin/node/node.h" ++#include "plugin/plugin.h" ++#include "znode.h" ++#include "tap.h" ++ ++#include <linux/types.h> /* for __u?? */ ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/spinlock.h> ++#include <linux/sched.h> /* for struct task_struct */ ++ ++/* fictive block number never actually used */ ++extern const reiser4_block_nr UBER_TREE_ADDR; ++ ++/* &cbk_cache_slot - entry in a coord cache. ++ ++ This is entry in a coord_by_key (cbk) cache, represented by ++ &cbk_cache. ++ ++*/ ++typedef struct cbk_cache_slot { ++ /* cached node */ ++ znode *node; ++ /* linkage to the next cbk cache slot in a LRU order */ ++ struct list_head lru; ++} cbk_cache_slot; ++ ++/* &cbk_cache - coord cache. This is part of reiser4_tree. ++ ++ cbk_cache is supposed to speed up tree lookups by caching results of recent ++ successful lookups (we don't cache negative results as dentry cache ++ does). Cache consists of relatively small number of entries kept in a LRU ++ order. 
Each entry (&cbk_cache_slot) contains a pointer to znode, from ++ which we can obtain a range of keys that covered by this znode. Before ++ embarking into real tree traversal we scan cbk_cache slot by slot and for ++ each slot check whether key we are looking for is between minimal and ++ maximal keys for node pointed to by this slot. If no match is found, real ++ tree traversal is performed and if result is successful, appropriate entry ++ is inserted into cache, possibly pulling least recently used entry out of ++ it. ++ ++ Tree spin lock is used to protect coord cache. If contention for this ++ lock proves to be too high, more finer grained locking can be added. ++ ++ Invariants involving parts of this data-type: ++ ++ [cbk-cache-invariant] ++*/ ++typedef struct cbk_cache { ++ /* serializator */ ++ rwlock_t guard; ++ int nr_slots; ++ /* head of LRU list of cache slots */ ++ struct list_head lru; ++ /* actual array of slots */ ++ cbk_cache_slot *slot; ++} cbk_cache; ++ ++/* level_lookup_result - possible outcome of looking up key at some level. ++ This is used by coord_by_key when traversing tree downward. */ ++typedef enum { ++ /* continue to the next level */ ++ LOOKUP_CONT, ++ /* done. Either required item was found, or we can prove it ++ doesn't exist, or some error occurred. */ ++ LOOKUP_DONE, ++ /* restart traversal from the root. Infamous "repetition". */ ++ LOOKUP_REST ++} level_lookup_result; ++ ++/* This is representation of internal reiser4 tree where all file-system ++ data and meta-data are stored. This structure is passed to all tree ++ manipulation functions. It's different from the super block because: ++ we don't want to limit ourselves to strictly one to one mapping ++ between super blocks and trees, and, because they are logically ++ different: there are things in a super block that have no relation to ++ the tree (bitmaps, journalling area, mount options, etc.) and there ++ are things in a tree that bear no relation to the super block, like ++ tree of znodes. ++ ++ At this time, there is only one tree ++ per filesystem, and this struct is part of the super block. We only ++ call the super block the super block for historical reasons (most ++ other filesystems call the per filesystem metadata the super block). ++*/ ++ ++struct reiser4_tree { ++ /* block_nr == 0 is fake znode. Write lock it, while changing ++ tree height. */ ++ /* disk address of root node of a tree */ ++ reiser4_block_nr root_block; ++ ++ /* level of the root node. If this is 1, tree consists of root ++ node only */ ++ tree_level height; ++ ++ /* ++ * this is cached here avoid calling plugins through function ++ * dereference all the time. ++ */ ++ __u64 estimate_one_insert; ++ ++ /* cache of recent tree lookup results */ ++ cbk_cache cbk_cache; ++ ++ /* hash table to look up znodes by block number. */ ++ z_hash_table zhash_table; ++ z_hash_table zfake_table; ++ /* hash table to look up jnodes by inode and offset. */ ++ j_hash_table jhash_table; ++ ++ /* lock protecting: ++ - parent pointers, ++ - sibling pointers, ++ - znode hash table ++ - coord cache ++ */ ++ /* NOTE: The "giant" tree lock can be replaced by more spin locks, ++ hoping they will be less contented. We can use one spin lock per one ++ znode hash bucket. With adding of some code complexity, sibling ++ pointers can be protected by both znode spin locks. However it looks ++ more SMP scalable we should test this locking change on n-ways (n > ++ 4) SMP machines. 
Current 4-ways machine test does not show that tree ++ lock is contented and it is a bottleneck (2003.07.25). */ ++ ++ rwlock_t tree_lock; ++ ++ /* lock protecting delimiting keys */ ++ rwlock_t dk_lock; ++ ++ /* spin lock protecting znode_epoch */ ++ spinlock_t epoch_lock; ++ /* version stamp used to mark znode updates. See seal.[ch] for more ++ * information. */ ++ __u64 znode_epoch; ++ ++ znode *uber; ++ node_plugin *nplug; ++ struct super_block *super; ++ struct { ++ /* carry flags used for insertion of new nodes */ ++ __u32 new_node_flags; ++ /* carry flags used for insertion of new extents */ ++ __u32 new_extent_flags; ++ /* carry flags used for paste operations */ ++ __u32 paste_flags; ++ /* carry flags used for insert operations */ ++ __u32 insert_flags; ++ } carry; ++}; ++ ++extern int reiser4_init_tree(reiser4_tree * tree, ++ const reiser4_block_nr * root_block, ++ tree_level height, node_plugin * default_plugin); ++extern void reiser4_done_tree(reiser4_tree * tree); ++ ++/* cbk flags: options for coord_by_key() */ ++typedef enum { ++ /* coord_by_key() is called for insertion. This is necessary because ++ of extents being located at the twig level. For explanation, see ++ comment just above is_next_item_internal(). ++ */ ++ CBK_FOR_INSERT = (1 << 0), ++ /* coord_by_key() is called with key that is known to be unique */ ++ CBK_UNIQUE = (1 << 1), ++ /* coord_by_key() can trust delimiting keys. This options is not user ++ accessible. coord_by_key() will set it automatically. It will be ++ only cleared by special-case in extents-on-the-twig-level handling ++ where it is necessary to insert item with a key smaller than ++ leftmost key in a node. This is necessary because of extents being ++ located at the twig level. For explanation, see comment just above ++ is_next_item_internal(). ++ */ ++ CBK_TRUST_DK = (1 << 2), ++ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */ ++ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */ ++ CBK_DKSET = (1 << 5), ++ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */ ++ CBK_IN_CACHE = (1 << 7), /* node is already in cache */ ++ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock in stead of long term ++ * lock */ ++} cbk_flags; ++ ++/* insertion outcome. 
IBK = insert by key */ ++typedef enum { ++ IBK_INSERT_OK = 0, ++ IBK_ALREADY_EXISTS = -EEXIST, ++ IBK_IO_ERROR = -EIO, ++ IBK_NO_SPACE = -E_NODE_FULL, ++ IBK_OOM = -ENOMEM ++} insert_result; ++ ++#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND) ++ ++typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord, ++ lock_handle * lh, void *arg); ++extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord, ++ lock_handle * lh, ++ tree_iterate_actor_t actor, void *arg, ++ znode_lock_mode mode, int through_units_p); ++extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, ++ znode_lock_request pri, lock_handle * lh); ++ ++/* return node plugin of @node */ ++static inline node_plugin *node_plugin_by_node(const znode * ++ node /* node to query */ ) ++{ ++ assert("vs-213", node != NULL); ++ assert("vs-214", znode_is_loaded(node)); ++ ++ return node->nplug; ++} ++ ++/* number of items in @node */ ++static inline pos_in_node_t node_num_items(const znode * node) ++{ ++ assert("nikita-2754", znode_is_loaded(node)); ++ assert("nikita-2468", ++ node_plugin_by_node(node)->num_of_items(node) == node->nr_items); ++ ++ return node->nr_items; ++} ++ ++/* Return the number of items at the present node. Asserts coord->node != ++ NULL. */ ++static inline unsigned coord_num_items(const coord_t * coord) ++{ ++ assert("jmacd-9805", coord->node != NULL); ++ ++ return node_num_items(coord->node); ++} ++ ++/* true if @node is empty */ ++static inline int node_is_empty(const znode * node) ++{ ++ return node_num_items(node) == 0; ++} ++ ++typedef enum { ++ SHIFTED_SOMETHING = 0, ++ SHIFT_NO_SPACE = -E_NODE_FULL, ++ SHIFT_IO_ERROR = -EIO, ++ SHIFT_OOM = -ENOMEM, ++} shift_result; ++ ++extern node_plugin *node_plugin_by_coord(const coord_t * coord); ++extern int is_coord_in_node(const coord_t * coord); ++extern int key_in_node(const reiser4_key *, const coord_t *); ++extern void coord_item_move_to(coord_t * coord, int items); ++extern void coord_unit_move_to(coord_t * coord, int units); ++ ++/* there are two types of repetitive accesses (ra): intra-syscall ++ (local) and inter-syscall (global). Local ra is used when ++ during single syscall we add/delete several items and units in the ++ same place in a tree. Note that plan-A fragments local ra by ++ separating stat-data and file body in key-space. Global ra is ++ used when user does repetitive modifications in the same place in a ++ tree. ++ ++ Our ra implementation serves following purposes: ++ 1 it affects balancing decisions so that next operation in a row ++ can be performed faster; ++ 2 it affects lower-level read-ahead in page-cache; ++ 3 it allows to avoid unnecessary lookups by maintaining some state ++ across several operations (this is only for local ra); ++ 4 it leaves room for lazy-micro-balancing: when we start a sequence of ++ operations they are performed without actually doing any intra-node ++ shifts, until we finish sequence or scope of sequence leaves ++ current node, only then we really pack node (local ra only). ++*/ ++ ++/* another thing that can be useful is to keep per-tree and/or ++ per-process cache of recent lookups. This cache can be organised as a ++ list of block numbers of formatted nodes sorted by starting key in ++ this node. Balancings should invalidate appropriate parts of this ++ cache. 
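
Both the cbk cache and the per-process lookup cache sketched above amount to the same policy: scan a short recency-ordered list of candidates, and fall back to a full traversal on a miss. A minimal self-contained model of the lookup side (key ranges invented, no locking):

    #include <stdio.h>

    struct slot { int lo, hi; };   // key range covered by a cached node

    static struct slot cache[4] = { {0,9}, {10,19}, {20,29}, {30,39} };

    static int cbk_cache_hit(int key)
    {
            for (int i = 0; i < 4; i++) {
                    if (key >= cache[i].lo && key <= cache[i].hi) {
                            // move the hit to the front: most recently used
                            struct slot s = cache[i];
                            for (int j = i; j > 0; j--)
                                    cache[j] = cache[j - 1];
                            cache[0] = s;
                            return 1;
                    }
            }
            return 0;   // miss: the caller does the real tree traversal
    }

    int main(void)
    {
            printf("%d %d\n", cbk_cache_hit(25), cbk_cache_hit(99)); // 1 0
            return 0;
    }
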
++*/ ++ ++lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key, ++ coord_t * coord, lock_handle * handle, ++ znode_lock_mode lock, lookup_bias bias, ++ tree_level lock_level, tree_level stop_level, ++ __u32 flags, ra_info_t *); ++ ++lookup_result reiser4_object_lookup(struct inode *object, ++ const reiser4_key * key, ++ coord_t * coord, ++ lock_handle * lh, ++ znode_lock_mode lock_mode, ++ lookup_bias bias, ++ tree_level lock_level, ++ tree_level stop_level, ++ __u32 flags, ra_info_t * info); ++ ++insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key, ++ reiser4_item_data * data, coord_t * coord, ++ lock_handle * lh, ++ tree_level stop_level, __u32 flags); ++insert_result insert_by_coord(coord_t * coord, ++ reiser4_item_data * data, const reiser4_key * key, ++ lock_handle * lh, __u32); ++insert_result insert_extent_by_coord(coord_t * coord, ++ reiser4_item_data * data, ++ const reiser4_key * key, lock_handle * lh); ++int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, ++ const reiser4_key * to_key, ++ reiser4_key * smallest_removed); ++int kill_node_content(coord_t * from, coord_t * to, ++ const reiser4_key * from_key, const reiser4_key * to_key, ++ reiser4_key * smallest_removed, ++ znode * locked_left_neighbor, struct inode *inode, ++ int truncate); ++ ++int reiser4_resize_item(coord_t * coord, reiser4_item_data * data, ++ reiser4_key * key, lock_handle * lh, cop_insert_flag); ++int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key, ++ reiser4_item_data * data, unsigned); ++int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f); ++int find_new_child_ptr(znode * parent, znode * child, znode * left, ++ coord_t * result); ++ ++int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord); ++int shift_left_of_and_including_insert_coord(coord_t * insert_coord); ++ ++void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int); ++ ++extern int cut_tree_worker_common(tap_t *, const reiser4_key *, ++ const reiser4_key *, reiser4_key *, ++ struct inode *, int, int *); ++extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *, ++ const reiser4_key *, reiser4_key *, ++ struct inode *, int, int *); ++extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, ++ const reiser4_key * to, struct inode *, int); ++ ++extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int); ++extern int check_tree_pointer(const coord_t * pointer, const znode * child); ++extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG, ++ znode * left, coord_t * result); ++extern int find_child_ptr(znode * parent, znode * child, coord_t * result); ++extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent, ++ znode * child); ++extern znode *child_znode(const coord_t * in_parent, znode * parent, ++ int incore_p, int setup_dkeys_p); ++ ++extern int cbk_cache_init(cbk_cache * cache); ++extern void cbk_cache_done(cbk_cache * cache); ++extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree); ++ ++extern char *sprint_address(const reiser4_block_nr * block); ++ ++#if REISER4_DEBUG ++extern void print_coord_content(const char *prefix, coord_t * p); ++extern void reiser4_print_address(const char *prefix, ++ const reiser4_block_nr * block); ++extern void print_tree_rec(const char *prefix, reiser4_tree * tree, ++ __u32 flags); ++extern void check_dkeys(znode *node); ++#else ++#define print_coord_content(p, c) noop 
++#define reiser4_print_address(p, b) noop ++#endif ++ ++extern void forget_znode(lock_handle * handle); ++extern int deallocate_znode(znode * node); ++ ++extern int is_disk_addr_unallocated(const reiser4_block_nr * addr); ++ ++/* struct used internally to pack all numerous arguments of tree lookup. ++ Used to avoid passing a lot of arguments to helper functions. */ ++typedef struct cbk_handle { ++ /* tree we are in */ ++ reiser4_tree *tree; ++ /* key we are going after */ ++ const reiser4_key *key; ++ /* coord we will store result in */ ++ coord_t *coord; ++ /* type of lock to take on target node */ ++ znode_lock_mode lock_mode; ++ /* lookup bias. See comments at the declaration of lookup_bias */ ++ lookup_bias bias; ++ /* lock level: level starting from which tree traversal starts taking ++ * write locks. */ ++ tree_level lock_level; ++ /* level where search will stop. Either item will be found between ++ lock_level and stop_level, or CBK_COORD_NOTFOUND will be ++ returned. ++ */ ++ tree_level stop_level; ++ /* level we are currently at */ ++ tree_level level; ++ /* block number of @active node. Tree traversal operates on two ++ nodes: active and parent. */ ++ reiser4_block_nr block; ++ /* put here error message to be printed by caller */ ++ const char *error; ++ /* result passed back to caller */ ++ lookup_result result; ++ /* lock handles for active and parent */ ++ lock_handle *parent_lh; ++ lock_handle *active_lh; ++ reiser4_key ld_key; ++ reiser4_key rd_key; ++ /* flags, passed to the cbk routine. Bits of this bitmask are defined ++ in tree.h:cbk_flags enum. */ ++ __u32 flags; ++ ra_info_t *ra_info; ++ struct inode *object; ++} cbk_handle; ++ ++extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h); ++ ++/* eottl.c */ ++extern int handle_eottl(cbk_handle *h, int *outcome); ++ ++int lookup_multikey(cbk_handle * handle, int nr_keys); ++int lookup_couple(reiser4_tree * tree, ++ const reiser4_key * key1, const reiser4_key * key2, ++ coord_t * coord1, coord_t * coord2, ++ lock_handle * lh1, lock_handle * lh2, ++ znode_lock_mode lock_mode, lookup_bias bias, ++ tree_level lock_level, tree_level stop_level, __u32 flags, ++ int *result1, int *result2); ++ ++static inline void read_lock_tree(reiser4_tree *tree) ++{ ++ /* check that tree is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_tree) && ++ LOCK_CNT_NIL(read_locked_tree) && ++ LOCK_CNT_NIL(write_locked_tree))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(spin_locked_stack))); ++ ++ read_lock(&(tree->tree_lock)); ++ ++ LOCK_CNT_INC(read_locked_tree); ++ LOCK_CNT_INC(rw_locked_tree); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void read_unlock_tree(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(read_locked_tree); ++ LOCK_CNT_DEC(rw_locked_tree); ++ LOCK_CNT_DEC(spin_locked); ++ ++ read_unlock(&(tree->tree_lock)); ++} ++ ++static inline void write_lock_tree(reiser4_tree *tree) ++{ ++ /* check that tree is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_tree) && ++ LOCK_CNT_NIL(read_locked_tree) && ++ LOCK_CNT_NIL(write_locked_tree))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(spin_locked_stack))); ++ ++ 
write_lock(&(tree->tree_lock)); ++ ++ LOCK_CNT_INC(write_locked_tree); ++ LOCK_CNT_INC(rw_locked_tree); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void write_unlock_tree(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(write_locked_tree); ++ LOCK_CNT_DEC(rw_locked_tree); ++ LOCK_CNT_DEC(spin_locked); ++ ++ write_unlock(&(tree->tree_lock)); ++} ++ ++static inline void read_lock_dk(reiser4_tree *tree) ++{ ++ /* check that dk is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(read_locked_dk) && ++ LOCK_CNT_NIL(write_locked_dk))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", LOCK_CNT_NIL(spin_locked_stack)); ++ ++ read_lock(&((tree)->dk_lock)); ++ ++ LOCK_CNT_INC(read_locked_dk); ++ LOCK_CNT_INC(rw_locked_dk); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void read_unlock_dk(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(read_locked_dk); ++ LOCK_CNT_DEC(rw_locked_dk); ++ LOCK_CNT_DEC(spin_locked); ++ ++ read_unlock(&(tree->dk_lock)); ++} ++ ++static inline void write_lock_dk(reiser4_tree *tree) ++{ ++ /* check that dk is not locked */ ++ assert("", (LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(read_locked_dk) && ++ LOCK_CNT_NIL(write_locked_dk))); ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", LOCK_CNT_NIL(spin_locked_stack)); ++ ++ write_lock(&((tree)->dk_lock)); ++ ++ LOCK_CNT_INC(write_locked_dk); ++ LOCK_CNT_INC(rw_locked_dk); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void write_unlock_dk(reiser4_tree *tree) ++{ ++ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(write_locked_dk); ++ LOCK_CNT_DEC(rw_locked_dk); ++ LOCK_CNT_DEC(spin_locked); ++ ++ write_unlock(&(tree->dk_lock)); ++} ++ ++/* estimate api. Implementation is in estimate.c */ ++reiser4_block_nr estimate_one_insert_item(reiser4_tree *); ++reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *); ++reiser4_block_nr estimate_insert_flow(tree_level); ++reiser4_block_nr estimate_one_item_removal(reiser4_tree *); ++reiser4_block_nr calc_estimate_one_insert(tree_level); ++reiser4_block_nr estimate_dirty_cluster(struct inode *); ++reiser4_block_nr estimate_insert_cluster(struct inode *); ++reiser4_block_nr estimate_update_cluster(struct inode *); ++ ++/* __REISER4_TREE_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/tree_mod.c linux-2.6.33/fs/reiser4/tree_mod.c +--- linux-2.6.33.orig/fs/reiser4/tree_mod.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tree_mod.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,386 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* ++ * Functions to add/delete new nodes to/from the tree. ++ * ++ * Functions from this file are used by carry (see carry*) to handle: ++ * ++ * . insertion of new formatted node into tree ++ * ++ * . 
addition of new tree root, increasing tree height ++ * ++ * . removing tree root, decreasing tree height ++ * ++ */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/plugin.h" ++#include "jnode.h" ++#include "znode.h" ++#include "tree_mod.h" ++#include "block_alloc.h" ++#include "tree_walk.h" ++#include "tree.h" ++#include "super.h" ++ ++#include <linux/err.h> ++ ++static int add_child_ptr(znode * parent, znode * child); ++/* warning only issued if error is not -E_REPEAT */ ++#define ewarning( error, ... ) \ ++ if( ( error ) != -E_REPEAT ) \ ++ warning( __VA_ARGS__ ) ++ ++/* allocate new node on the @level and immediately on the right of @brother. */ ++znode * reiser4_new_node(znode * brother /* existing left neighbor ++ * of new node */, ++ tree_level level /* tree level at which new node is to ++ * be allocated */) ++{ ++ znode *result; ++ int retcode; ++ reiser4_block_nr blocknr; ++ ++ assert("nikita-930", brother != NULL); ++ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT); ++ ++ retcode = assign_fake_blocknr_formatted(&blocknr); ++ if (retcode == 0) { ++ result = ++ zget(znode_get_tree(brother), &blocknr, NULL, level, ++ reiser4_ctx_gfp_mask_get()); ++ if (IS_ERR(result)) { ++ ewarning(PTR_ERR(result), "nikita-929", ++ "Cannot allocate znode for carry: %li", ++ PTR_ERR(result)); ++ return result; ++ } ++ /* cheap test, can be executed even when debugging is off */ ++ if (!znode_just_created(result)) { ++ warning("nikita-2213", ++ "Allocated already existing block: %llu", ++ (unsigned long long)blocknr); ++ zput(result); ++ return ERR_PTR(RETERR(-EIO)); ++ } ++ ++ assert("nikita-931", result != NULL); ++ result->nplug = znode_get_tree(brother)->nplug; ++ assert("nikita-933", result->nplug != NULL); ++ ++ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get()); ++ if (retcode == 0) { ++ ZF_SET(result, JNODE_CREATED); ++ zrelse(result); ++ } else { ++ zput(result); ++ result = ERR_PTR(retcode); ++ } ++ } else { ++ /* failure to allocate new node during balancing. ++ This should never happen. Ever. Returning -E_REPEAT ++ is not viable solution, because "out of disk space" ++ is not transient error that will go away by itself. ++ */ ++ ewarning(retcode, "nikita-928", ++ "Cannot allocate block for carry: %i", retcode); ++ result = ERR_PTR(retcode); ++ } ++ assert("nikita-1071", result != NULL); ++ return result; ++} ++ ++/* allocate new root and add it to the tree ++ ++ This helper function is called by add_new_root(). ++ ++*/ ++znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ , ++ znode * fake /* "fake" znode */ ) ++{ ++ reiser4_tree *tree = znode_get_tree(old_root); ++ znode *new_root = NULL; /* to shut gcc up */ ++ int result; ++ ++ assert("nikita-1069", old_root != NULL); ++ assert("umka-262", fake != NULL); ++ assert("umka-263", tree != NULL); ++ ++ /* "fake" znode---one always hanging just above current root. This ++ node is locked when new root is created or existing root is ++ deleted. Downward tree traversal takes lock on it before taking ++ lock on a root node. This avoids race conditions with root ++ manipulations. 
++ ++ */ ++ assert("nikita-1348", znode_above_root(fake)); ++ assert("nikita-1211", znode_is_root(old_root)); ++ ++ result = 0; ++ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) { ++ warning("nikita-1344", "Tree is too tall: %i", tree->height); ++ /* ext2 returns -ENOSPC when it runs out of free inodes with a ++ following comment (fs/ext2/ialloc.c:441): Is it really ++ ENOSPC? ++ ++ -EXFULL? -EINVAL? ++ */ ++ result = RETERR(-ENOSPC); ++ } else { ++ /* Allocate block for new root. It's not that ++ important where it will be allocated, as root is ++ almost always in memory. Moreover, allocate on ++ flush can be going here. ++ */ ++ assert("nikita-1448", znode_is_root(old_root)); ++ new_root = reiser4_new_node(fake, tree->height + 1); ++ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) { ++ lock_handle rlh; ++ ++ init_lh(&rlh); ++ result = ++ longterm_lock_znode(&rlh, new_root, ++ ZNODE_WRITE_LOCK, ++ ZNODE_LOCK_LOPRI); ++ if (result == 0) { ++ parent_coord_t *in_parent; ++ ++ znode_make_dirty(fake); ++ ++ /* new root is a child of "fake" node */ ++ write_lock_tree(tree); ++ ++ ++tree->height; ++ ++ /* recalculate max balance overhead */ ++ tree->estimate_one_insert = ++ estimate_one_insert_item(tree); ++ ++ tree->root_block = *znode_get_block(new_root); ++ in_parent = &new_root->in_parent; ++ init_parent_coord(in_parent, fake); ++ /* manually insert new root into sibling ++ * list. With this all nodes involved into ++ * balancing are connected after balancing is ++ * done---useful invariant to check. */ ++ sibling_list_insert_nolock(new_root, NULL); ++ write_unlock_tree(tree); ++ ++ /* insert into new root pointer to the ++ @old_root. */ ++ assert("nikita-1110", ++ WITH_DATA(new_root, ++ node_is_empty(new_root))); ++ write_lock_dk(tree); ++ znode_set_ld_key(new_root, reiser4_min_key()); ++ znode_set_rd_key(new_root, reiser4_max_key()); ++ write_unlock_dk(tree); ++ if (REISER4_DEBUG) { ++ ZF_CLR(old_root, JNODE_LEFT_CONNECTED); ++ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED); ++ ZF_SET(old_root, JNODE_ORPHAN); ++ } ++ result = add_child_ptr(new_root, old_root); ++ done_lh(&rlh); ++ } ++ zrelse(new_root); ++ } ++ } ++ if (result != 0) ++ new_root = ERR_PTR(result); ++ return new_root; ++} ++ ++/* build &reiser4_item_data for inserting child pointer ++ ++ Build &reiser4_item_data that can be later used to insert pointer to @child ++ in its parent. ++ ++*/ ++void build_child_ptr_data(znode * child /* node pointer to which will be ++ * inserted */ , ++ reiser4_item_data * data /* where to store result */ ) ++{ ++ assert("nikita-1116", child != NULL); ++ assert("nikita-1117", data != NULL); ++ ++ /* ++ * NOTE: use address of child's blocknr as address of data to be ++ * inserted. As result of this data gets into on-disk structure in cpu ++ * byte order. internal's create_hook converts it to little endian byte ++ * order. ++ */ ++ data->data = (char *)znode_get_block(child); ++ /* data -> data is kernel space */ ++ data->user = 0; ++ data->length = sizeof(reiser4_block_nr); ++ /* FIXME-VS: hardcoded internal item? */ ++ ++ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */ ++ data->iplug = item_plugin_by_id(NODE_POINTER_ID); ++} ++ ++/* add pointer to @child into empty @parent. ++ ++ This is used when pointer to old root is inserted into new root which is ++ empty. 
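
Growing the tree in reiser4_add_tree_root() above is the classic B-tree move: allocate a node one level above the old root, make the old root its sole child, and bump the height. A toy in-memory version with no locking or disk allocation; the structure names are invented:

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int level; struct node *child; };
    struct tree { int height; struct node *root; };

    static int add_tree_root(struct tree *t)
    {
            struct node *nr = malloc(sizeof(*nr));
            if (nr == NULL)
                    return -1;              // allocation failed
            nr->level = t->root->level + 1; // one level above old root
            nr->child = t->root;            // sole child: the old root
            t->root = nr;
            t->height++;
            return 0;
    }

    int main(void)
    {
            struct node leaf = { 1, NULL };
            struct tree t = { 1, &leaf };
            if (add_tree_root(&t) == 0)
                    printf("height=%d root level=%d\n",
                           t.height, t.root->level); // height=2 root level=2
            if (t.root != &leaf)
                    free(t.root);
            return 0;
    }
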
++*/ ++static int add_child_ptr(znode * parent, znode * child) ++{ ++ coord_t coord; ++ reiser4_item_data data; ++ int result; ++ reiser4_key key; ++ ++ assert("nikita-1111", parent != NULL); ++ assert("nikita-1112", child != NULL); ++ assert("nikita-1115", ++ znode_get_level(parent) == znode_get_level(child) + 1); ++ ++ result = zload(parent); ++ if (result != 0) ++ return result; ++ assert("nikita-1113", node_is_empty(parent)); ++ coord_init_first_unit(&coord, parent); ++ ++ build_child_ptr_data(child, &data); ++ data.arg = NULL; ++ ++ read_lock_dk(znode_get_tree(parent)); ++ key = *znode_get_ld_key(child); ++ read_unlock_dk(znode_get_tree(parent)); ++ ++ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data, ++ NULL); ++ znode_make_dirty(parent); ++ zrelse(parent); ++ return result; ++} ++ ++/* actually remove tree root */ ++static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is ++ * being removed */, ++ znode * old_root /* root node that is being ++ * removed */ , ++ znode * new_root /* new root---sole child of ++ * @old_root */, ++ const reiser4_block_nr * new_root_blk /* disk address of ++ * @new_root */) ++{ ++ znode *uber; ++ int result; ++ lock_handle handle_for_uber; ++ ++ assert("umka-265", tree != NULL); ++ assert("nikita-1198", new_root != NULL); ++ assert("nikita-1199", ++ znode_get_level(new_root) + 1 == znode_get_level(old_root)); ++ ++ assert("nikita-1201", znode_is_write_locked(old_root)); ++ ++ assert("nikita-1203", ++ disk_addr_eq(new_root_blk, znode_get_block(new_root))); ++ ++ init_lh(&handle_for_uber); ++ /* obtain and lock "fake" znode protecting changes in tree height. */ ++ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, ++ &handle_for_uber); ++ if (result == 0) { ++ uber = handle_for_uber.node; ++ ++ znode_make_dirty(uber); ++ ++ /* don't take long term lock a @new_root. Take spinlock. */ ++ ++ write_lock_tree(tree); ++ ++ tree->root_block = *new_root_blk; ++ --tree->height; ++ ++ /* recalculate max balance overhead */ ++ tree->estimate_one_insert = estimate_one_insert_item(tree); ++ ++ assert("nikita-1202", ++ tree->height == znode_get_level(new_root)); ++ ++ /* new root is child on "fake" node */ ++ init_parent_coord(&new_root->in_parent, uber); ++ ++uber->c_count; ++ ++ /* sibling_list_insert_nolock(new_root, NULL); */ ++ write_unlock_tree(tree); ++ ++ /* reinitialise old root. */ ++ result = node_plugin_by_node(old_root)->init(old_root); ++ znode_make_dirty(old_root); ++ if (result == 0) { ++ assert("nikita-1279", node_is_empty(old_root)); ++ ZF_SET(old_root, JNODE_HEARD_BANSHEE); ++ old_root->c_count = 0; ++ } ++ } ++ done_lh(&handle_for_uber); ++ ++ return result; ++} ++ ++/* remove tree root ++ ++ This function removes tree root, decreasing tree height by one. Tree root ++ and its only child (that is going to become new tree root) are write locked ++ at the entry. ++ ++ To remove tree root we need to take lock on special "fake" znode that ++ protects changes of tree height. See comments in reiser4_add_tree_root() for ++ more on this. ++ ++ Also parent pointers have to be updated in ++ old and new root. To simplify code, function is split into two parts: outer ++ reiser4_kill_tree_root() collects all necessary arguments and calls ++ reiser4_kill_root() to do the actual job. 
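
reiser4_kill_root() above (wrapped by reiser4_kill_tree_root() just below) performs the inverse move: when the root holds exactly one child pointer, that child becomes the new root and the height drops by one. The same toy model, kept self-contained, with invented names:

    #include <stdio.h>

    struct node { int level; struct node *child; };
    struct tree { int height; struct node *root; };

    static int kill_tree_root(struct tree *t)
    {
            struct node *old = t->root;
            if (old->child == NULL)
                    return -1;            // already at the leaf level
            t->root = old->child;         // sole child becomes the root
            t->height--;
            // the caller retires 'old' once its last reference is gone
            return 0;
    }

    int main(void)
    {
            struct node leaf = { 1, NULL };
            struct node root = { 2, &leaf };
            struct tree t = { 2, &root };
            kill_tree_root(&t);
            printf("height=%d root level=%d\n",
                   t.height, t.root->level);  // height=1 root level=1
            return 0;
    }
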
++ ++*/ ++int reiser4_kill_tree_root(znode * old_root /* tree root that we are ++ removing*/) ++{ ++ int result; ++ coord_t down_link; ++ znode *new_root; ++ reiser4_tree *tree; ++ ++ assert("umka-266", current_tree != NULL); ++ assert("nikita-1194", old_root != NULL); ++ assert("nikita-1196", znode_is_root(old_root)); ++ assert("nikita-1200", node_num_items(old_root) == 1); ++ assert("nikita-1401", znode_is_write_locked(old_root)); ++ ++ coord_init_first_unit(&down_link, old_root); ++ ++ tree = znode_get_tree(old_root); ++ new_root = child_znode(&down_link, old_root, 0, 1); ++ if (!IS_ERR(new_root)) { ++ result = ++ reiser4_kill_root(tree, old_root, new_root, ++ znode_get_block(new_root)); ++ zput(new_root); ++ } else ++ result = PTR_ERR(new_root); ++ ++ return result; ++} ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/tree_mod.h linux-2.6.33/fs/reiser4/tree_mod.h +--- linux-2.6.33.orig/fs/reiser4/tree_mod.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tree_mod.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,29 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for ++ * comments. */ ++ ++#if !defined( __REISER4_TREE_MOD_H__ ) ++#define __REISER4_TREE_MOD_H__ ++ ++#include "forward.h" ++ ++znode *reiser4_new_node(znode * brother, tree_level level); ++znode *reiser4_add_tree_root(znode * old_root, znode * fake); ++int reiser4_kill_tree_root(znode * old_root); ++void build_child_ptr_data(znode * child, reiser4_item_data * data); ++ ++/* __REISER4_TREE_MOD_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/tree_walk.c linux-2.6.33/fs/reiser4/tree_walk.c +--- linux-2.6.33.orig/fs/reiser4/tree_walk.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tree_walk.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,927 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Routines and macros to: ++ ++ get_left_neighbor() ++ ++ get_right_neighbor() ++ ++ get_parent() ++ ++ get_first_child() ++ ++ get_last_child() ++ ++ various routines to walk the whole tree and do things to it like ++ repack it, or move it to tertiary storage. Please make them as ++ generic as is reasonable. ++ ++*/ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "jnode.h" ++#include "znode.h" ++#include "tree_walk.h" ++#include "tree.h" ++#include "super.h" ++ ++/* These macros are used internally in tree_walk.c in attempt to make ++ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor, ++ lock_left_neighbor */ ++#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off))) ++#define FIELD_OFFSET(name) offsetof(znode, name) ++#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node) ++#define LEFT_PTR_OFFSET FIELD_OFFSET(left) ++#define RIGHT_PTR_OFFSET FIELD_OFFSET(right) ++ ++/* This is the generic procedure to get and lock `generic' neighbor (left or ++ right neighbor or parent). 
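The offsetof()-based macros above let a single routine walk three different znode pointer fields. The same technique in a self-contained program (struct item and field_at are invented for illustration, not the znode layout):

#include <stddef.h>
#include <stdio.h>

struct item {
        struct item *left;
        struct item *right;
        struct item *parent;
};

/* Read a 'struct item *' field located at byte offset 'off' inside 'it'. */
static struct item *field_at(struct item *it, size_t off)
{
        return *(struct item **)((char *)it + off);
}

int main(void)
{
        struct item a = { 0 }, b = { 0 };

        a.right = &b;
        /* the same call works for .left, .right or .parent: */
        printf("%p\n", (void *)field_at(&a, offsetof(struct item, right)));
        return 0;
}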
It implements common algorithm for all cases of ++ getting lock on neighbor node, only znode structure field is different in ++ each case. This is parameterized by ptr_offset argument, which is byte ++ offset for the pointer to the desired neighbor within the current node's ++ znode structure. This function should be called with the tree lock held */ ++static int lock_neighbor( ++ /* resulting lock handle */ ++ lock_handle * result, ++ /* znode to lock */ ++ znode * node, ++ /* pointer to neighbor (or parent) znode field offset, in bytes from ++ the base address of znode structure */ ++ int ptr_offset, ++ /* lock mode for longterm_lock_znode call */ ++ znode_lock_mode mode, ++ /* lock request for longterm_lock_znode call */ ++ znode_lock_request req, ++ /* GN_* flags */ ++ int flags, int rlocked) ++{ ++ reiser4_tree *tree = znode_get_tree(node); ++ znode *neighbor; ++ int ret; ++ ++ assert("umka-236", node != NULL); ++ assert("umka-237", tree != NULL); ++ assert_rw_locked(&(tree->tree_lock)); ++ ++ if (flags & GN_TRY_LOCK) ++ req |= ZNODE_LOCK_NONBLOCK; ++ if (flags & GN_SAME_ATOM) ++ req |= ZNODE_LOCK_DONT_FUSE; ++ ++ /* get neighbor's address by using of sibling link, quit while loop ++ (and return) if link is not available. */ ++ while (1) { ++ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset); ++ ++ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if ++ * node pointed by it is not connected. ++ * ++ * However, GN_ALLOW_NOT_CONNECTED option masks "connected" ++ * check and allows passing reference to not connected znode to ++ * subsequent longterm_lock_znode() call. This kills possible ++ * busy loop if we are trying to get longterm lock on locked but ++ * not yet connected parent node. */ ++ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED) ++ || znode_is_connected(neighbor))) { ++ return RETERR(-E_NO_NEIGHBOR); ++ } ++ ++ /* protect it from deletion. */ ++ zref(neighbor); ++ ++ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); ++ ++ ret = longterm_lock_znode(result, neighbor, mode, req); ++ ++ /* The lock handle obtains its own reference, release the one from above. */ ++ zput(neighbor); ++ ++ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); ++ ++ /* restart if node we got reference to is being ++ invalidated. we should not get reference to this node ++ again. */ ++ if (ret == -EINVAL) ++ continue; ++ if (ret) ++ return ret; ++ ++ /* check if neighbor link still points to just locked znode; ++ the link could have been changed while the process slept. */ ++ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset)) ++ return 0; ++ ++ /* znode was locked by mistake; unlock it and restart locking ++ process from beginning. */ ++ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); ++ longterm_unlock_znode(result); ++ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); ++ } ++} ++ ++/* get parent node with longterm lock, accepts GN* flags. 
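lock_neighbor() above follows a classic pattern: pin the target under a spinlock, drop the spinlock to take the sleeping lock, then re-check that the link still points at what was locked. A rough pthreads rendering of that shape, with invented obj/holder types standing in for znodes (the real code also restarts on -EINVAL and distinguishes read and write tree locks):

#include <pthread.h>
#include <stddef.h>

struct obj {
        pthread_mutex_t long_lock;      /* plays the longterm znode lock */
        int refcount;
};

struct holder {
        pthread_mutex_t guard;          /* plays the tree spinlock */
        struct obj *target;             /* plays the sibling/parent pointer */
};

/* Lock h's current target. Returns it with ->long_lock held (guard
 * released), or NULL if the link is gone. */
static struct obj *lock_target(struct holder *h)
{
        struct obj *t;

        pthread_mutex_lock(&h->guard);
        for (;;) {
                t = h->target;
                if (t == NULL)
                        break;                          /* cf. -E_NO_NEIGHBOR */
                t->refcount++;                          /* cf. zref() */
                pthread_mutex_unlock(&h->guard);        /* cannot sleep under guard */
                pthread_mutex_lock(&t->long_lock);      /* may block */
                pthread_mutex_lock(&h->guard);
                t->refcount--;                          /* cf. zput(): the lock now pins it */
                if (h->target == t)
                        break;                          /* link unchanged: success */
                /* the link moved while we slept: undo and retry */
                pthread_mutex_unlock(&t->long_lock);
        }
        pthread_mutex_unlock(&h->guard);
        return t;
}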
*/ ++int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ , ++ znode * node /* child node */ , ++ znode_lock_mode mode ++ /* type of lock: read or write */ , ++ int flags /* GN_* flags */ ) ++{ ++ int result; ++ ++ read_lock_tree(znode_get_tree(node)); ++ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode, ++ ZNODE_LOCK_HIPRI, flags, 1); ++ read_unlock_tree(znode_get_tree(node)); ++ return result; ++} ++ ++/* wrapper function to lock the right or left neighbor, depending on the ++ GN_GO_LEFT bit in the @flags parameter */ ++/* Audited by: umka (2002.06.14) */ ++static inline int ++lock_side_neighbor(lock_handle * result, ++ znode * node, znode_lock_mode mode, int flags, int rlocked) ++{ ++ int ret; ++ int ptr_offset; ++ znode_lock_request req; ++ ++ if (flags & GN_GO_LEFT) { ++ ptr_offset = LEFT_PTR_OFFSET; ++ req = ZNODE_LOCK_LOPRI; ++ } else { ++ ptr_offset = RIGHT_PTR_OFFSET; ++ req = ZNODE_LOCK_HIPRI; ++ } ++ ++ ret = ++ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked); ++ ++ if (ret == -E_NO_NEIGHBOR) /* when walking left or right, -E_NO_NEIGHBOR ++ * does not guarantee that the neighbor is ++ * absent from the tree; in this case we return ++ * -ENOENT, meaning the neighbor was at least ++ * not found in the cache */ ++ return RETERR(-ENOENT); ++ ++ return ret; ++} ++ ++#if REISER4_DEBUG ++ ++int check_sibling_list(znode * node) ++{ ++ znode *scan; ++ znode *next; ++ ++ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree)); ++ ++ if (node == NULL) ++ return 1; ++ ++ if (ZF_ISSET(node, JNODE_RIP)) ++ return 1; ++ ++ assert("nikita-3270", node != NULL); ++ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock)); ++ ++ for (scan = node; znode_is_left_connected(scan); scan = next) { ++ next = scan->left; ++ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { ++ assert("nikita-3271", znode_is_right_connected(next)); ++ assert("nikita-3272", next->right == scan); ++ } else ++ break; ++ } ++ for (scan = node; znode_is_right_connected(scan); scan = next) { ++ next = scan->right; ++ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { ++ assert("nikita-3273", znode_is_left_connected(next)); ++ assert("nikita-3274", next->left == scan); ++ } else ++ break; ++ } ++ return 1; ++} ++ ++#endif ++ ++/* Znode sibling pointers maintenance. */ ++ ++/* Znode sibling pointers are established between any neighboring nodes that ++ are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED, ++ JNODE_RIGHT_CONNECTED); if the left or right sibling pointer contains an ++ actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set. ++ ++ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing) ++ take care of searching for znode neighbors (a hash table lookup may be ++ required), establishing sibling pointers between them and setting the ++ JNODE_*_CONNECTED state bits. */ ++ ++/* adjusts sibling pointers and `connected' states for two ++ neighbors; works even if one neighbor is NULL (was not found).
*/ ++ ++/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */ ++void link_left_and_right(znode * left, znode * right) ++{ ++ assert("nikita-3275", check_sibling_list(left)); ++ assert("nikita-3275", check_sibling_list(right)); ++ ++ if (left != NULL) { ++ if (left->right == NULL) { ++ left->right = right; ++ ZF_SET(left, JNODE_RIGHT_CONNECTED); ++ ++ ON_DEBUG(left->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ ++ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE) ++ && left->right != right) { ++ ++ ON_DEBUG(left->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ left->right_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ left->right->left = NULL; ++ left->right = right; ++ ZF_SET(left, JNODE_RIGHT_CONNECTED); ++ } else ++ /* ++ * there is a race condition in renew_sibling_link() ++ * and assertions below check that it is only one ++ * there. Thread T1 calls renew_sibling_link() without ++ * GN_NO_ALLOC flag. zlook() doesn't find neighbor ++ * node, but before T1 gets to the ++ * link_left_and_right(), another thread T2 creates ++ * neighbor node and connects it. check for ++ * left->right == NULL above protects T1 from ++ * overwriting correct left->right pointer installed ++ * by T2. ++ */ ++ assert("nikita-3302", ++ right == NULL || left->right == right); ++ } ++ if (right != NULL) { ++ if (right->left == NULL) { ++ right->left = left; ++ ZF_SET(right, JNODE_LEFT_CONNECTED); ++ ++ ON_DEBUG(right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ ++ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE) ++ && right->left != left) { ++ ++ ON_DEBUG(right->left->right_version = ++ atomic_inc_return(&delim_key_version); ++ right->left_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ right->left->right = NULL; ++ right->left = left; ++ ZF_SET(right, JNODE_LEFT_CONNECTED); ++ ++ } else ++ assert("nikita-3303", ++ left == NULL || right->left == left); ++ } ++ assert("nikita-3275", check_sibling_list(left)); ++ assert("nikita-3275", check_sibling_list(right)); ++} ++ ++/* Audited by: umka (2002.06.14) */ ++static void link_znodes(znode * first, znode * second, int to_left) ++{ ++ if (to_left) ++ link_left_and_right(second, first); ++ else ++ link_left_and_right(first, second); ++} ++ ++/* getting of next (to left or to right, depend on gn_to_left bit in flags) ++ coord's unit position in horizontal direction, even across node ++ boundary. Should be called under tree lock, it protects nonexistence of ++ sibling link on parent level, if lock_side_neighbor() fails with ++ -ENOENT. */ ++static int far_next_coord(coord_t * coord, lock_handle * handle, int flags) ++{ ++ int ret; ++ znode *node; ++ reiser4_tree *tree; ++ ++ assert("umka-243", coord != NULL); ++ assert("umka-244", handle != NULL); ++ assert("zam-1069", handle->node == NULL); ++ ++ ret = ++ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) : ++ coord_next_unit(coord); ++ if (!ret) ++ return 0; ++ ++ ret = ++ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0); ++ if (ret) ++ return ret; ++ ++ node = handle->node; ++ tree = znode_get_tree(node); ++ write_unlock_tree(tree); ++ ++ coord_init_zero(coord); ++ ++ /* We avoid synchronous read here if it is specified by flag. 
*/ ++ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) { ++ ret = jstartio(ZJNODE(handle->node)); ++ if (!ret) ++ ret = -E_REPEAT; ++ goto error_locked; ++ } ++ ++ /* the corresponding zrelse() should be called by the clients of ++ far_next_coord(), at the place where this node gets unlocked. */ ++ ret = zload(handle->node); ++ if (ret) ++ goto error_locked; ++ ++ if (flags & GN_GO_LEFT) ++ coord_init_last_unit(coord, node); ++ else ++ coord_init_first_unit(coord, node); ++ ++ if (0) { ++ error_locked: ++ longterm_unlock_znode(handle); ++ } ++ write_lock_tree(tree); ++ return ret; ++} ++ ++/* A very significant function which performs a step in the horizontal ++ direction when the sibling pointer is not available. In fact, it is the only ++ function which does so. ++ Note: this function does not restore the locking status at exit; ++ the caller must take care of proper unlocking and zrelse-ing */ ++static int ++renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child, ++ tree_level level, int flags, int *nr_locked) ++{ ++ int ret; ++ int to_left = flags & GN_GO_LEFT; ++ reiser4_block_nr da; ++ /* parent of the neighbor node; we set it to the child's parent until it ++ is detected that child and neighbor do not share one parent */ ++ znode *side_parent = coord->node; ++ reiser4_tree *tree = znode_get_tree(child); ++ znode *neighbor = NULL; ++ ++ assert("umka-245", coord != NULL); ++ assert("umka-246", handle != NULL); ++ assert("umka-247", child != NULL); ++ assert("umka-303", tree != NULL); ++ ++ init_lh(handle); ++ write_lock_tree(tree); ++ ret = far_next_coord(coord, handle, flags); ++ ++ if (ret) { ++ if (ret != -ENOENT) { ++ write_unlock_tree(tree); ++ return ret; ++ } ++ } else { ++ item_plugin *iplug; ++ ++ if (handle->node != NULL) { ++ (*nr_locked)++; ++ side_parent = handle->node; ++ } ++ ++ /* does the coord object point to an internal item? We do not ++ support sibling pointers between znodes for formatted and ++ unformatted nodes, and return -E_NO_NEIGHBOR in that case. */ ++ iplug = item_plugin_by_coord(coord); ++ if (!item_is_internal(coord)) { ++ link_znodes(child, NULL, to_left); ++ write_unlock_tree(tree); ++ /* we know there can't be a formatted neighbor */ ++ return RETERR(-E_NO_NEIGHBOR); ++ } ++ write_unlock_tree(tree); ++ ++ iplug->s.internal.down_link(coord, NULL, &da); ++ ++ if (flags & GN_NO_ALLOC) { ++ neighbor = zlook(tree, &da); ++ } else { ++ neighbor = ++ zget(tree, &da, side_parent, level, ++ reiser4_ctx_gfp_mask_get()); ++ } ++ ++ if (IS_ERR(neighbor)) { ++ ret = PTR_ERR(neighbor); ++ return ret; ++ } ++ ++ if (neighbor) ++ /* update delimiting keys */ ++ set_child_delimiting_keys(coord->node, coord, neighbor); ++ ++ write_lock_tree(tree); ++ } ++ ++ if (likely(neighbor == NULL || ++ (znode_get_level(child) == znode_get_level(neighbor) ++ && child != neighbor))) ++ link_znodes(child, neighbor, to_left); ++ else { ++ warning("nikita-3532", ++ "Sibling nodes on the different levels: %i != %i\n", ++ znode_get_level(child), znode_get_level(neighbor)); ++ ret = RETERR(-EIO); ++ } ++ ++ write_unlock_tree(tree); ++ ++ /* if GN_NO_ALLOC isn't set we keep the reference to the neighbor znode */ ++ if (neighbor != NULL && (flags & GN_NO_ALLOC)) ++ /* atomic_dec(&ZJNODE(neighbor)->x_count); */ ++ zput(neighbor); ++ ++ return ret; ++} ++ ++/* This function establishes a one-side relation.
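The GN_ASYNC branch above is a small instance of the "start I/O and ask the caller to retry" idiom: rather than sleeping on a page read while locks are held, it queues the read with jstartio() and returns -E_REPEAT. A self-contained sketch of the same shape, with invented names (nbuf, WANT_ASYNC and the stubbed helpers):

#include <errno.h>

struct nbuf { int uptodate; };

/* Hypothetical I/O helpers, stubbed for the sketch. */
static void start_async_read(struct nbuf *n) { (void)n; /* queue the read */ }
static int read_sync(struct nbuf *n) { n->uptodate = 1; return 0; }

#define WANT_ASYNC 0x1

/* Mirrors the GN_ASYNC shape: kick off I/O and have the caller unwind
 * its locks and retry, instead of blocking here. */
static int load_node(struct nbuf *n, int flags)
{
        if ((flags & WANT_ASYNC) && !n->uptodate) {
                start_async_read(n);    /* cf. jstartio() */
                return -EAGAIN;         /* cf. -E_REPEAT */
        }
        return read_sync(n);            /* synchronous fallback */
}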
*/ ++/* Audited by: umka (2002.06.14) */ ++static int connect_one_side(coord_t * coord, znode * node, int flags) ++{ ++ coord_t local; ++ lock_handle handle; ++ int nr_locked; ++ int ret; ++ ++ assert("umka-248", coord != NULL); ++ assert("umka-249", node != NULL); ++ ++ coord_dup_nocheck(&local, coord); ++ ++ init_lh(&handle); ++ ++ ret = ++ renew_sibling_link(&local, &handle, node, znode_get_level(node), ++ flags | GN_NO_ALLOC, &nr_locked); ++ ++ if (handle.node != NULL) { ++ /* complementary operations for zload() and lock() in far_next_coord() */ ++ zrelse(handle.node); ++ longterm_unlock_znode(&handle); ++ } ++ ++ /* we catch error codes which are not interesting for us because we ++ run renew_sibling_link() only for znode connection. */ ++ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR) ++ return 0; ++ ++ return ret; ++} ++ ++/* if @child is not in `connected' state, performs hash searches for left and ++ right neighbor nodes and establishes horizontal sibling links */ ++/* Audited by: umka (2002.06.14), umka (2002.06.15) */ ++int connect_znode(coord_t * parent_coord, znode * child) ++{ ++ reiser4_tree *tree = znode_get_tree(child); ++ int ret = 0; ++ ++ assert("zam-330", parent_coord != NULL); ++ assert("zam-331", child != NULL); ++ assert("zam-332", parent_coord->node != NULL); ++ assert("umka-305", tree != NULL); ++ ++ /* it is trivial to `connect' root znode because it can't have ++ neighbors */ ++ if (znode_above_root(parent_coord->node)) { ++ child->left = NULL; ++ child->right = NULL; ++ ZF_SET(child, JNODE_LEFT_CONNECTED); ++ ZF_SET(child, JNODE_RIGHT_CONNECTED); ++ ++ ON_DEBUG(child->left_version = ++ atomic_inc_return(&delim_key_version); ++ child->right_version = ++ atomic_inc_return(&delim_key_version);); ++ ++ return 0; ++ } ++ ++ /* load parent node */ ++ coord_clear_iplug(parent_coord); ++ ret = zload(parent_coord->node); ++ ++ if (ret != 0) ++ return ret; ++ ++ /* protect `connected' state check by tree_lock */ ++ read_lock_tree(tree); ++ ++ if (!znode_is_right_connected(child)) { ++ read_unlock_tree(tree); ++ /* connect right (default is right) */ ++ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC); ++ if (ret) ++ goto zrelse_and_ret; ++ ++ read_lock_tree(tree); ++ } ++ ++ ret = znode_is_left_connected(child); ++ ++ read_unlock_tree(tree); ++ ++ if (!ret) { ++ ret = ++ connect_one_side(parent_coord, child, ++ GN_NO_ALLOC | GN_GO_LEFT); ++ } else ++ ret = 0; ++ ++ zrelse_and_ret: ++ zrelse(parent_coord->node); ++ ++ return ret; ++} ++ ++/* this function is like renew_sibling_link() but allocates neighbor node if ++ it doesn't exist and `connects' it. It may require making two steps in ++ horizontal direction, first one for neighbor node finding/allocation, ++ second one is for finding neighbor of neighbor to connect freshly allocated ++ znode. */ ++/* Audited by: umka (2002.06.14), umka (2002.06.15) */ ++static int ++renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags) ++{ ++ coord_t local; ++ lock_handle empty[2]; ++ reiser4_tree *tree = znode_get_tree(node); ++ znode *neighbor = NULL; ++ int nr_locked = 0; ++ int ret; ++ ++ assert("umka-250", coord != NULL); ++ assert("umka-251", node != NULL); ++ assert("umka-307", tree != NULL); ++ assert("umka-308", level <= tree->height); ++ ++ /* umka (2002.06.14) ++ Here probably should be a check for given "level" validness. 
++ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT); ++ */ ++ ++ coord_dup(&local, coord); ++ ++ ret = ++ renew_sibling_link(&local, &empty[0], node, level, ++ flags & ~GN_NO_ALLOC, &nr_locked); ++ if (ret) ++ goto out; ++ ++ /* tree lock is not needed here because we keep parent node(s) locked ++ and reference to neighbor znode incremented */ ++ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right; ++ ++ read_lock_tree(tree); ++ ret = znode_is_connected(neighbor); ++ read_unlock_tree(tree); ++ if (ret) { ++ ret = 0; ++ goto out; ++ } ++ ++ ret = ++ renew_sibling_link(&local, &empty[nr_locked], neighbor, level, ++ flags | GN_NO_ALLOC, &nr_locked); ++ /* second renew_sibling_link() call is used for znode connection only, ++ so we can live with these errors */ ++ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret) ++ ret = 0; ++ ++ out: ++ ++ for (--nr_locked; nr_locked >= 0; --nr_locked) { ++ zrelse(empty[nr_locked].node); ++ longterm_unlock_znode(&empty[nr_locked]); ++ } ++ ++ if (neighbor != NULL) ++ /* decrement znode reference counter without actually ++ releasing it. */ ++ atomic_dec(&ZJNODE(neighbor)->x_count); ++ ++ return ret; ++} ++ ++/* ++ reiser4_get_neighbor() -- lock node's neighbor. ++ ++ reiser4_get_neighbor() locks node's neighbor (left or right one, depends on ++ given parameter) using sibling link to it. If sibling link is not available ++ (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one ++ level up for information about neighbor's disk address. We lock node's ++ parent, if it is common parent for both 'node' and its neighbor, neighbor's ++ disk address is in next (to left or to right) down link from link that points ++ to original node. If not, we need to lock parent's neighbor, read its content ++ and take first(last) downlink with neighbor's disk address. That locking ++ could be done by using sibling link and lock_neighbor() function, if sibling ++ link exists. In another case we have to go level up again until we find ++ common parent or valid sibling link. Then go down ++ allocating/connecting/locking/reading nodes until neighbor of first one is ++ locked. ++ ++ @neighbor: result lock handle, ++ @node: a node which we lock neighbor of, ++ @lock_mode: lock mode {LM_READ, LM_WRITE}, ++ @flags: logical OR of {GN_*} (see description above) subset. ++ ++ @return: 0 if success, negative value if lock was impossible due to an error ++ or lack of neighbor node. 
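The climb-and-descend strategy this comment describes is easier to see on a plain in-memory tree without sibling pointers: walk up until some ancestor can step one child to the right, step once, then descend along leftmost children the same number of levels. A sketch with invented types (the real code must additionally lock, load and possibly allocate each node on the way):

#include <stddef.h>

struct bnode {
        struct bnode *parent;
        struct bnode **child;           /* child[0 .. nr_child-1] */
        int nr_child;
        int idx_in_parent;              /* position among parent's children */
};

/* Right neighbor of @n on its own level, or NULL at the tree edge. */
static struct bnode *right_neighbor(struct bnode *n)
{
        struct bnode *p = n->parent;
        int levels = 0;

        /* climb while @n is the rightmost child of its parent */
        while (p != NULL && n->idx_in_parent == p->nr_child - 1) {
                n = p;
                p = p->parent;
                levels++;
        }
        if (p == NULL)
                return NULL;                    /* rightmost node of the whole tree */
        n = p->child[n->idx_in_parent + 1];     /* the single step right */
        while (levels-- > 0)
                n = n->child[0];                /* descend leftmost, back to the start level */
        return n;
}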
++*/ ++ ++/* Audited by: umka (2002.06.14), umka (2002.06.15) */ ++int ++reiser4_get_neighbor(lock_handle * neighbor, znode * node, ++ znode_lock_mode lock_mode, int flags) ++{ ++ reiser4_tree *tree = znode_get_tree(node); ++ lock_handle path[REAL_MAX_ZTREE_HEIGHT]; ++ ++ coord_t coord; ++ ++ tree_level base_level; ++ tree_level h = 0; ++ int ret; ++ ++ assert("umka-252", tree != NULL); ++ assert("umka-253", neighbor != NULL); ++ assert("umka-254", node != NULL); ++ ++ base_level = znode_get_level(node); ++ ++ assert("umka-310", base_level <= tree->height); ++ ++ coord_init_zero(&coord); ++ ++ again: ++ /* first, we try to use simple lock_neighbor() which requires sibling ++ link existence */ ++ read_lock_tree(tree); ++ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1); ++ read_unlock_tree(tree); ++ if (!ret) { ++ /* load znode content if it was specified */ ++ if (flags & GN_LOAD_NEIGHBOR) { ++ ret = zload(node); ++ if (ret) ++ longterm_unlock_znode(neighbor); ++ } ++ return ret; ++ } ++ ++ /* only -ENOENT means we may look upward and try to connect ++ @node with its neighbor (if @flags allow us to do it) */ ++ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS)) ++ return ret; ++ ++ /* before establishing of sibling link we lock parent node; it is ++ required by renew_neighbor() to work. */ ++ init_lh(&path[0]); ++ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK); ++ if (ret) ++ return ret; ++ if (znode_above_root(path[0].node)) { ++ longterm_unlock_znode(&path[0]); ++ return RETERR(-E_NO_NEIGHBOR); ++ } ++ ++ while (1) { ++ znode *child = (h == 0) ? node : path[h - 1].node; ++ znode *parent = path[h].node; ++ ++ ret = zload(parent); ++ if (ret) ++ break; ++ ++ ret = find_child_ptr(parent, child, &coord); ++ ++ if (ret) { ++ zrelse(parent); ++ break; ++ } ++ ++ /* try to establish missing sibling link */ ++ ret = renew_neighbor(&coord, child, h + base_level, flags); ++ ++ zrelse(parent); ++ ++ switch (ret) { ++ case 0: ++ /* unlocking of parent znode prevents simple ++ deadlock situation */ ++ done_lh(&path[h]); ++ ++ /* depend on tree level we stay on we repeat first ++ locking attempt ... */ ++ if (h == 0) ++ goto again; ++ ++ /* ... or repeat establishing of sibling link at ++ one level below. */ ++ --h; ++ break; ++ ++ case -ENOENT: ++ /* sibling link is not available -- we go ++ upward. */ ++ init_lh(&path[h + 1]); ++ ret = ++ reiser4_get_parent(&path[h + 1], parent, ++ ZNODE_READ_LOCK); ++ if (ret) ++ goto fail; ++ ++h; ++ if (znode_above_root(path[h].node)) { ++ ret = RETERR(-E_NO_NEIGHBOR); ++ goto fail; ++ } ++ break; ++ ++ case -E_DEADLOCK: ++ /* there was lock request from hi-pri locker. if ++ it is possible we unlock last parent node and ++ re-lock it again. */ ++ for (; reiser4_check_deadlock(); h--) { ++ done_lh(&path[h]); ++ if (h == 0) ++ goto fail; ++ } ++ ++ break; ++ ++ default: /* other errors. 
*/ ++ goto fail; ++ } ++ } ++ fail: ++ ON_DEBUG(check_lock_node_data(node)); ++ ON_DEBUG(check_lock_data()); ++ ++ /* unlock path */ ++ do { ++ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto ++ fail; path[0] is already done_lh-ed, therefore ++ longterm_unlock_znode(&path[h]); is not applicable */ ++ done_lh(&path[h]); ++ --h; ++ } while (h + 1 != 0); ++ ++ return ret; ++} ++ ++/* remove node from sibling list */ ++/* Audited by: umka (2002.06.14) */ ++void sibling_list_remove(znode * node) ++{ ++ reiser4_tree *tree; ++ ++ tree = znode_get_tree(node); ++ assert("umka-255", node != NULL); ++ assert_rw_write_locked(&(tree->tree_lock)); ++ assert("nikita-3275", check_sibling_list(node)); ++ ++ write_lock_dk(tree); ++ if (znode_is_right_connected(node) && node->right != NULL && ++ znode_is_left_connected(node) && node->left != NULL) { ++ assert("zam-32245", ++ keyeq(znode_get_rd_key(node), ++ znode_get_ld_key(node->right))); ++ znode_set_rd_key(node->left, znode_get_ld_key(node->right)); ++ } ++ write_unlock_dk(tree); ++ ++ if (znode_is_right_connected(node) && node->right != NULL) { ++ assert("zam-322", znode_is_left_connected(node->right)); ++ node->right->left = node->left; ++ ON_DEBUG(node->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ if (znode_is_left_connected(node) && node->left != NULL) { ++ assert("zam-323", znode_is_right_connected(node->left)); ++ node->left->right = node->right; ++ ON_DEBUG(node->left->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ ++ ZF_CLR(node, JNODE_LEFT_CONNECTED); ++ ZF_CLR(node, JNODE_RIGHT_CONNECTED); ++ ON_DEBUG(node->left = node->right = NULL; ++ node->left_version = atomic_inc_return(&delim_key_version); ++ node->right_version = atomic_inc_return(&delim_key_version);); ++ assert("nikita-3276", check_sibling_list(node)); ++} ++ ++/* disconnect node from sibling list */ ++void sibling_list_drop(znode * node) ++{ ++ znode *right; ++ znode *left; ++ ++ assert("nikita-2464", node != NULL); ++ assert("nikita-3277", check_sibling_list(node)); ++ ++ right = node->right; ++ if (right != NULL) { ++ assert("nikita-2465", znode_is_left_connected(right)); ++ right->left = NULL; ++ ON_DEBUG(right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ left = node->left; ++ if (left != NULL) { ++ assert("zam-323", znode_is_right_connected(left)); ++ left->right = NULL; ++ ON_DEBUG(left->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ ZF_CLR(node, JNODE_LEFT_CONNECTED); ++ ZF_CLR(node, JNODE_RIGHT_CONNECTED); ++ ON_DEBUG(node->left = node->right = NULL; ++ node->left_version = atomic_inc_return(&delim_key_version); ++ node->right_version = atomic_inc_return(&delim_key_version);); ++} ++ ++/* Insert new node into sibling list. Regular balancing inserts new node ++ after (at right side) existing and locked node (@before), except one case ++ of adding new tree root node. @before should be NULL in that case. 
*/ ++void sibling_list_insert_nolock(znode * new, znode * before) ++{ ++ assert("zam-334", new != NULL); ++ assert("nikita-3298", !znode_is_left_connected(new)); ++ assert("nikita-3299", !znode_is_right_connected(new)); ++ assert("nikita-3300", new->left == NULL); ++ assert("nikita-3301", new->right == NULL); ++ assert("nikita-3278", check_sibling_list(new)); ++ assert("nikita-3279", check_sibling_list(before)); ++ ++ if (before != NULL) { ++ assert("zam-333", znode_is_connected(before)); ++ new->right = before->right; ++ new->left = before; ++ ON_DEBUG(new->right_version = ++ atomic_inc_return(&delim_key_version); ++ new->left_version = ++ atomic_inc_return(&delim_key_version);); ++ if (before->right != NULL) { ++ before->right->left = new; ++ ON_DEBUG(before->right->left_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } ++ before->right = new; ++ ON_DEBUG(before->right_version = ++ atomic_inc_return(&delim_key_version); ++ ); ++ } else { ++ new->right = NULL; ++ new->left = NULL; ++ ON_DEBUG(new->right_version = ++ atomic_inc_return(&delim_key_version); ++ new->left_version = ++ atomic_inc_return(&delim_key_version);); ++ } ++ ZF_SET(new, JNODE_LEFT_CONNECTED); ++ ZF_SET(new, JNODE_RIGHT_CONNECTED); ++ assert("nikita-3280", check_sibling_list(new)); ++ assert("nikita-3281", check_sibling_list(before)); ++} ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 80 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/tree_walk.h linux-2.6.33/fs/reiser4/tree_walk.h +--- linux-2.6.33.orig/fs/reiser4/tree_walk.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/tree_walk.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,125 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ ++ ++/* definitions of reiser4 tree walk functions */ ++ ++#ifndef __FS_REISER4_TREE_WALK_H__ ++#define __FS_REISER4_TREE_WALK_H__ ++ ++#include "debug.h" ++#include "forward.h" ++ ++/* establishes horizontal links between cached znodes */ ++int connect_znode(coord_t * coord, znode * node); ++ ++/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor()) ++ have the following common arguments: ++ ++ return codes: ++ ++ @return : 0 - OK, ++ ++ZAM-FIXME-HANS: wrong return code name. Change them all. ++ -ENOENT - neighbor is not in cache, what is detected by sibling ++ link absence. ++ ++ -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be ++ found (because we are left-/right- most node of the ++ tree, for example). Also, this return code is for ++ reiser4_get_parent() when we see no parent link -- it ++ means that our node is root node. ++ ++ -E_DEADLOCK - deadlock detected (request from high-priority process ++ received), other error codes are conformed to ++ /usr/include/asm/errno.h . ++*/ ++ ++int ++reiser4_get_parent_flags(lock_handle * result, znode * node, ++ znode_lock_mode mode, int flags); ++ ++/* bits definition for reiser4_get_neighbor function `flags' arg. 
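As a usage sketch of the flag bits and wrappers defined just below: a caller that wants to visit the node to the right and treat "no neighbor" as a normal end-of-level condition. The function name and the error policy are invented; the calls and return codes are the ones documented above:

/* Hypothetical caller: read-lock the right neighbor of @node. */
static int visit_right_neighbor(znode * node)
{
        lock_handle lh;
        int ret;

        init_lh(&lh);
        ret = reiser4_get_right_neighbor(&lh, node, ZNODE_READ_LOCK,
                                         GN_CAN_USE_UPPER_LEVELS);
        if (ret == 0) {
                ret = zload(lh.node);   /* make node content addressable */
                if (ret == 0) {
                        /* ... inspect lh.node: read-locked and loaded ... */
                        zrelse(lh.node);
                }
                done_lh(&lh);
        } else if (ret == -E_NO_NEIGHBOR || ret == -ENOENT)
                ret = 0;        /* rightmost on this level, or not cached */
        return ret;
}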
*/ ++typedef enum { ++ /* If sibling pointer is NULL, this flag allows get_neighbor() to try to ++ * find not allocated not connected neigbor by going though upper ++ * levels */ ++ GN_CAN_USE_UPPER_LEVELS = 0x1, ++ /* locking left neighbor instead of right one */ ++ GN_GO_LEFT = 0x2, ++ /* automatically load neighbor node content */ ++ GN_LOAD_NEIGHBOR = 0x4, ++ /* return -E_REPEAT if can't lock */ ++ GN_TRY_LOCK = 0x8, ++ /* used internally in tree_walk.c, causes renew_sibling to not ++ allocate neighbor znode, but only search for it in znode cache */ ++ GN_NO_ALLOC = 0x10, ++ /* do not go across atom boundaries */ ++ GN_SAME_ATOM = 0x20, ++ /* allow to lock not connected nodes */ ++ GN_ALLOW_NOT_CONNECTED = 0x40, ++ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */ ++ GN_ASYNC = 0x80 ++} znode_get_neigbor_flags; ++ ++/* A commonly used wrapper for reiser4_get_parent_flags(). */ ++static inline int reiser4_get_parent(lock_handle * result, znode * node, ++ znode_lock_mode mode) ++{ ++ return reiser4_get_parent_flags(result, node, mode, ++ GN_ALLOW_NOT_CONNECTED); ++} ++ ++int reiser4_get_neighbor(lock_handle * neighbor, znode * node, ++ znode_lock_mode lock_mode, int flags); ++ ++/* there are wrappers for most common usages of reiser4_get_neighbor() */ ++static inline int ++reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode, ++ int flags) ++{ ++ return reiser4_get_neighbor(result, node, lock_mode, ++ flags | GN_GO_LEFT); ++} ++ ++static inline int ++reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode, ++ int flags) ++{ ++ ON_DEBUG(check_lock_node_data(node)); ++ ON_DEBUG(check_lock_data()); ++ return reiser4_get_neighbor(result, node, lock_mode, ++ flags & (~GN_GO_LEFT)); ++} ++ ++extern void sibling_list_remove(znode * node); ++extern void sibling_list_drop(znode * node); ++extern void sibling_list_insert_nolock(znode * new, znode * before); ++extern void link_left_and_right(znode * left, znode * right); ++ ++/* Functions called by tree_walk() when tree_walk() ... */ ++struct tree_walk_actor { ++ /* ... meets a formatted node, */ ++ int (*process_znode) (tap_t *, void *); ++ /* ... meets an extent, */ ++ int (*process_extent) (tap_t *, void *); ++ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by ++ * node or extent processing functions. */ ++ int (*before) (void *); ++}; ++ ++#if REISER4_DEBUG ++int check_sibling_list(znode * node); ++#else ++#define check_sibling_list(n) (1) ++#endif ++ ++#endif /* __FS_REISER4_TREE_WALK_H__ */ ++ ++/* ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/txnmgr.c linux-2.6.33/fs/reiser4/txnmgr.c +--- linux-2.6.33.orig/fs/reiser4/txnmgr.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/txnmgr.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,3165 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Joshua MacDonald wrote the first draft of this code. */ ++ ++/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a ++filesystem scales only as well as its worst locking design. You need to ++substantially restructure this code. Josh was not as experienced a programmer ++as you. 
Particularly review how the locking style differs from what you did ++for znodes usingt hi-lo priority locking, and present to me an opinion on ++whether the differences are well founded. */ ++ ++/* I cannot help but to disagree with the sentiment above. Locking of ++ * transaction manager is _not_ badly designed, and, at the very least, is not ++ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority ++ * locking on znodes, especially on the root node of the tree. --nikita, ++ * 2003.10.13 */ ++ ++/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The ++ txnmgr processes capture_block requests and manages the relationship between jnodes and ++ atoms through the various stages of a transcrash, and it also oversees the fusion and ++ capture-on-copy processes. The main difficulty with this task is maintaining a ++ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the ++ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle ++ must be broken. The main requirement is that atom-fusion be deadlock free, so once you ++ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies ++ that any time you check the atom-pointer of a jnode or handle and then try to lock that ++ atom, you must use trylock() and possibly reverse the order. ++ ++ This code implements the design documented at: ++ ++ http://namesys.com/txn-doc.html ++ ++ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the ++above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this ++topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12 ++year old --- define all technical terms used. ++ ++*/ ++ ++/* Thoughts on the external transaction interface: ++ ++ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which ++ creates state that lasts for the duration of a system call and is called at the start ++ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(), ++ occupying the scope of a single system call. We wish to give certain applications an ++ interface to begin and close (commit) transactions. Since our implementation of ++ transactions does not yet support isolation, allowing an application to open a ++ transaction implies trusting it to later close the transaction. Part of the ++ transaction interface will be aimed at enabling that trust, but the interface for ++ actually using transactions is fairly narrow. ++ ++ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate ++ this identifier into a string that a shell-script could use, allowing you to start a ++ transaction by issuing a command. Once open, the transcrash should be set in the task ++ structure, and there should be options (I suppose) to allow it to be carried across ++ fork/exec. A transcrash has several options: ++ ++ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only ++ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to ++ capture on reads as well, it should set READ_FUSING. ++ ++ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must ++ eventually close (or else the machine must crash). 
If the application dies an ++ unexpected death with an open transcrash, for example, or if it hangs for a long ++ duration, one solution (to avoid crashing the machine) is to simply close it anyway. ++ This is a dangerous option, but it is one way to solve the problem until isolated ++ transcrashes are available for untrusted applications. ++ ++ It seems to be what databases do, though it is unclear how one avoids a DoS attack ++ creating a vulnerability based on resource starvation. Guaranteeing that some ++ minimum amount of computational resources are made available would seem more correct ++ than guaranteeing some amount of time. When we again have someone to code the work, ++ this issue should be considered carefully. -Hans ++ ++ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how ++ many dirty blocks it expects. The reserve_blocks interface should be called at a point ++ where it is safe for the application to fail, because the system may not be able to ++ grant the allocation and the application must be able to back-out. For this reason, ++ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but ++ the application may also wish to extend the allocation after beginning its transcrash. ++ ++ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making ++ modifications that require transaction protection. When isolated transactions are ++ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a ++ RESERVE_BLOCKS call fails for the application, it should "abort" by calling ++ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is ++ why, for safety, the application should call RESERVE_BLOCKS before making any changes). ++ ++ For actually implementing these out-of-system-call-scopped transcrashes, the ++ reiser4_context has a "txn_handle *trans" pointer that may be set to an open ++ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a ++ "struct kmem_cache *_txnh_slab" created for that purpose in this file. ++*/ ++ ++/* Extending the other system call interfaces for future transaction features: ++ ++ Specialized applications may benefit from passing flags to the ordinary system call ++ interface such as read(), write(), or stat(). For example, the application specifies ++ WRITE_FUSING by default but wishes to add that a certain read() command should be ++ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data ++ read, or the file-data read? These issues are straight-forward, but there are a lot of ++ them and adding the necessary flags-passing code will be tedious. ++ ++ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW) ++ flag, which specifies that although it is a read operation being requested, a ++ write-lock should be taken. The reason is that read-locks are shared while write-locks ++ are exclusive, so taking a read-lock when a later-write is known in advance will often ++ leads to deadlock. If a reader knows it will write later, it should issue read ++ requests with the RMW flag set. ++*/ ++ ++/* ++ The znode/atom deadlock avoidance. ++ ++ FIXME(Zam): writing of this comment is in progress. ++ ++ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's ++ long-term locking, which makes reiser4 locking scheme more complex. It had ++ deadlocks until we implement deadlock avoidance algorithms. 
Those deadlocks ++ looked as follows: one stopped thread waits ++ for a long-term lock on a znode, while the thread who owns that lock waits until fusion with another atom is ++ allowed. ++ ++ The source of the deadlocks is an optimization of not capturing index nodes ++ for read. Let's prove it. Suppose we have a dumb node capturing scheme which ++ unconditionally captures each block before locking it. ++ ++ That scheme has no deadlocks. Let's begin with a thread whose atom is in the ++ ASTAGE_CAPTURE_WAIT stage and which waits for a znode lock. The thread can't wait for ++ a capture because its stage allows fusion with any atom except those which are ++ currently being committed. Atom commit can't deadlock because ++ the commit procedure does not acquire locks and does not fuse with other ++ atoms. Reiser4 does capturing right before going to sleep inside the ++ longterm_lock_znode() function, so the znode which we want to lock is ++ already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we ++ continue the analysis we see that no process in the sequence may be ++ waiting for atom fusion. Thus there are no deadlocks of the described kind. ++ ++ The capturing optimization makes the deadlocks possible. A thread can wait for a ++ lock whose owner did not capture that node. The lock owner's current atom ++ is not fused with the first atom and does not enter the ASTAGE_CAPTURE_WAIT ++ stage. A deadlock is possible when that atom meets another one which is in ++ ASTAGE_CAPTURE_WAIT already. ++ ++ The deadlock avoidance scheme includes two algorithms: ++ ++ The first algorithm is used when a thread captures a node which is locked but not ++ captured by another thread. Such nodes are marked MISSED_IN_CAPTURE at the ++ moment their capture is skipped. If such a node (marked MISSED_IN_CAPTURE) is ++ being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the ++ routine which forces all lock owners to join the current atom is executed. ++ ++ The second algorithm does not allow the capture of already captured nodes to be ++ skipped. ++ ++ Together, both algorithms prevent waiting for a longterm lock without fusing ++ with the atoms of all lock owners, which is the key condition for atom/znode ++ locking deadlocks. ++*/ ++ ++/* ++ * Transactions and mmap(2). ++ * ++ * 1. Transactions are not supported for accesses through mmap(2), because ++ * this would effectively amount to user-level transactions whose duration ++ * is beyond control of the kernel. ++ * ++ * 2. That said, we still want to preserve some decency with regard to ++ * mmap(2). During a normal write(2) call, the following sequence of events ++ * happens: ++ * ++ * 1. the page is created; ++ * ++ * 2. a jnode is created, dirtied and captured into the current atom. ++ * ++ * 3. the extent is inserted and modified. ++ * ++ * Steps (2) and (3) take place under a long term lock on the twig node. ++ * ++ * When a file is accessed through mmap(2), the page is always created during ++ * a page fault. ++ * After this (in reiser4_readpage()->reiser4_readpage_extent()): ++ * ++ * 1. if access is made to a non-hole page, a new jnode is created (if ++ * necessary); ++ * ++ * 2. if access is made to a hole page, the jnode is not created (XXX ++ * not clear why). ++ * ++ * Also, even if a page is created by a write page fault, it is not marked ++ * dirty immediately by handle_mm_fault(). Probably this is to avoid races ++ * with page write-out.
++ * ++ * Dirty bit installed by hardware is only transferred to the struct page ++ * later, when page is unmapped (in zap_pte_range(), or ++ * try_to_unmap_one()). ++ * ++ * So, with mmap(2) we have to handle following irksome situations: ++ * ++ * 1. there exists modified page (clean or dirty) without jnode ++ * ++ * 2. there exists modified page (clean or dirty) with clean jnode ++ * ++ * 3. clean page which is a part of atom can be transparently modified ++ * at any moment through mapping without becoming dirty. ++ * ++ * (1) and (2) can lead to the out-of-memory situation: ->writepage() ++ * doesn't know what to do with such pages and ->sync_sb()/->writepages() ++ * don't see them, because these methods operate on atoms. ++ * ++ * (3) can lead to the loss of data: suppose we have dirty page with dirty ++ * captured jnode captured by some atom. As part of early flush (for ++ * example) page was written out. Dirty bit was cleared on both page and ++ * jnode. After this page is modified through mapping, but kernel doesn't ++ * notice and just discards page and jnode as part of commit. (XXX ++ * actually it doesn't, because to reclaim page ->releasepage() has to be ++ * called and before this dirty bit will be transferred to the struct ++ * page). ++ * ++ */ ++ ++#include "debug.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "wander.h" ++#include "ktxnmgrd.h" ++#include "super.h" ++#include "page_cache.h" ++#include "reiser4.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "flush.h" ++ ++#include <asm/atomic.h> ++#include <linux/types.h> ++#include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/slab.h> ++#include <linux/pagemap.h> ++#include <linux/writeback.h> ++#include <linux/swap.h> /* for totalram_pages */ ++ ++static void atom_free(txn_atom * atom); ++ ++static int commit_txnh(txn_handle * txnh); ++ ++static void wakeup_atom_waitfor_list(txn_atom * atom); ++static void wakeup_atom_waiting_list(txn_atom * atom); ++ ++static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh); ++ ++static void capture_assign_block_nolock(txn_atom * atom, jnode * node); ++ ++static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node); ++ ++static int capture_init_fusion(jnode * node, txn_handle * txnh, ++ txn_capture mode); ++ ++static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture); ++ ++static void capture_fuse_into(txn_atom * small, txn_atom * large); ++ ++void reiser4_invalidate_list(struct list_head *); ++ ++/* GENERIC STRUCTURES */ ++ ++typedef struct _txn_wait_links txn_wait_links; ++ ++struct _txn_wait_links { ++ lock_stack *_lock_stack; ++ struct list_head _fwaitfor_link; ++ struct list_head _fwaiting_link; ++ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); ++ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); ++}; ++ ++/* FIXME: In theory, we should be using the slab cache init & destructor ++ methods instead of, e.g., jnode_init, etc. */ ++static struct kmem_cache *_atom_slab = NULL; ++/* this is for user-visible, cross system-call transactions. */ ++static struct kmem_cache *_txnh_slab = NULL; ++ ++/** ++ * init_txnmgr_static - create transaction manager slab caches ++ * ++ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module ++ * initialization. 
++ */ ++int init_txnmgr_static(void) ++{ ++ assert("jmacd-600", _atom_slab == NULL); ++ assert("jmacd-601", _txnh_slab == NULL); ++ ++ ON_DEBUG(atomic_set(&flush_cnt, 0)); ++ ++ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL); ++ if (_atom_slab == NULL) ++ return RETERR(-ENOMEM); ++ ++ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0, ++ SLAB_HWCACHE_ALIGN, NULL); ++ if (_txnh_slab == NULL) { ++ kmem_cache_destroy(_atom_slab); ++ _atom_slab = NULL; ++ return RETERR(-ENOMEM); ++ } ++ ++ return 0; ++} ++ ++/** ++ * done_txnmgr_static - delete txn_atom and txn_handle caches ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_txnmgr_static(void) ++{ ++ destroy_reiser4_cache(&_atom_slab); ++ destroy_reiser4_cache(&_txnh_slab); ++} ++ ++/** ++ * init_txnmgr - initialize a new transaction manager ++ * @mgr: pointer to transaction manager embedded in reiser4 super block ++ * ++ * This is called on mount. Makes necessary initializations. ++ */ ++void reiser4_init_txnmgr(txn_mgr *mgr) ++{ ++ assert("umka-169", mgr != NULL); ++ ++ mgr->atom_count = 0; ++ mgr->id_count = 1; ++ INIT_LIST_HEAD(&mgr->atoms_list); ++ spin_lock_init(&mgr->tmgr_lock); ++ mutex_init(&mgr->commit_mutex); ++} ++ ++/** ++ * reiser4_done_txnmgr - stop transaction manager ++ * @mgr: pointer to transaction manager embedded in reiser4 super block ++ * ++ * This is called on umount. Does sanity checks. ++ */ ++void reiser4_done_txnmgr(txn_mgr *mgr) ++{ ++ assert("umka-170", mgr != NULL); ++ assert("umka-1701", list_empty_careful(&mgr->atoms_list)); ++ assert("umka-1702", mgr->atom_count == 0); ++} ++ ++/* Initialize a transaction handle. */ ++/* Audited by: umka (2002.06.13) */ ++static void txnh_init(txn_handle * txnh, txn_mode mode) ++{ ++ assert("umka-171", txnh != NULL); ++ ++ txnh->mode = mode; ++ txnh->atom = NULL; ++ reiser4_ctx_gfp_mask_set(); ++ txnh->flags = 0; ++ spin_lock_init(&txnh->hlock); ++ INIT_LIST_HEAD(&txnh->txnh_link); ++} ++ ++#if REISER4_DEBUG ++/* Check if a transaction handle is clean. */ ++static int txnh_isclean(txn_handle * txnh) ++{ ++ assert("umka-172", txnh != NULL); ++ return txnh->atom == NULL && ++ LOCK_CNT_NIL(spin_locked_txnh); ++} ++#endif ++ ++/* Initialize an atom. */ ++static void atom_init(txn_atom * atom) ++{ ++ int level; ++ ++ assert("umka-173", atom != NULL); ++ ++ memset(atom, 0, sizeof(txn_atom)); ++ ++ atom->stage = ASTAGE_FREE; ++ atom->start_time = jiffies; ++ ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) ++ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level)); ++ ++ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom)); ++ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom)); ++ INIT_LIST_HEAD(ATOM_WB_LIST(atom)); ++ INIT_LIST_HEAD(&atom->inodes); ++ spin_lock_init(&(atom->alock)); ++ /* list of transaction handles */ ++ INIT_LIST_HEAD(&atom->txnh_list); ++ /* link to transaction manager's list of atoms */ ++ INIT_LIST_HEAD(&atom->atom_link); ++ INIT_LIST_HEAD(&atom->fwaitfor_list); ++ INIT_LIST_HEAD(&atom->fwaiting_list); ++ blocknr_set_init(&atom->delete_set); ++ blocknr_set_init(&atom->wandered_map); ++ ++ init_atom_fq_parts(atom); ++} ++ ++#if REISER4_DEBUG ++/* Check if an atom is clean. 
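init_txnmgr_static() above uses the standard unwind idiom: when a later allocation fails, every earlier one is torn down before returning. The shape in miniature, with invented resources standing in for the two slab caches:

#include <stdlib.h>

struct two_caches {
        void *a;
        void *b;
};

/* Create both resources or neither (cf. _atom_slab/_txnh_slab above). */
static int two_caches_init(struct two_caches *c)
{
        c->a = malloc(64);
        if (c->a == NULL)
                return -1;
        c->b = malloc(64);
        if (c->b == NULL) {
                free(c->a);     /* unwind the earlier success */
                c->a = NULL;
                return -1;
        }
        return 0;
}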
*/ ++static int atom_isclean(txn_atom * atom) ++{ ++ int level; ++ ++ assert("umka-174", atom != NULL); ++ ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { ++ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) { ++ return 0; ++ } ++ } ++ ++ return atom->stage == ASTAGE_FREE && ++ atom->txnh_count == 0 && ++ atom->capture_count == 0 && ++ atomic_read(&atom->refcount) == 0 && ++ (&atom->atom_link == atom->atom_link.next && ++ &atom->atom_link == atom->atom_link.prev) && ++ list_empty_careful(&atom->txnh_list) && ++ list_empty_careful(ATOM_CLEAN_LIST(atom)) && ++ list_empty_careful(ATOM_OVRWR_LIST(atom)) && ++ list_empty_careful(ATOM_WB_LIST(atom)) && ++ list_empty_careful(&atom->fwaitfor_list) && ++ list_empty_careful(&atom->fwaiting_list) && ++ atom_fq_parts_are_clean(atom); ++} ++#endif ++ ++/* Begin a transaction in this context. Currently this uses the reiser4_context's ++ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually ++ this will be extended to allow transaction handles to span several contexts. */ ++/* Audited by: umka (2002.06.13) */ ++void reiser4_txn_begin(reiser4_context * context) ++{ ++ assert("jmacd-544", context->trans == NULL); ++ ++ context->trans = &context->trans_in_ctx; ++ ++ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING ++ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is ++ stack allocated right now, but we would like to allow for dynamically allocated ++ transcrashes that span multiple system calls. ++ */ ++ txnh_init(context->trans, TXN_WRITE_FUSING); ++} ++ ++/* Finish a transaction handle context. */ ++int reiser4_txn_end(reiser4_context * context) ++{ ++ long ret = 0; ++ txn_handle *txnh; ++ ++ assert("umka-283", context != NULL); ++ assert("nikita-3012", reiser4_schedulable()); ++ assert("vs-24", context == get_current_context()); ++ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack())); ++ ++ txnh = context->trans; ++ if (txnh != NULL) { ++ if (txnh->atom != NULL) ++ ret = commit_txnh(txnh); ++ assert("jmacd-633", txnh_isclean(txnh)); ++ context->trans = NULL; ++ } ++ return ret; ++} ++ ++void reiser4_txn_restart(reiser4_context * context) ++{ ++ reiser4_txn_end(context); ++ reiser4_preempt_point(); ++ reiser4_txn_begin(context); ++} ++ ++void reiser4_txn_restart_current(void) ++{ ++ reiser4_txn_restart(get_current_context()); ++} ++ ++/* TXN_ATOM */ ++ ++/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom ++ is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May ++ return NULL. */ ++static txn_atom *txnh_get_atom(txn_handle * txnh) ++{ ++ txn_atom *atom; ++ ++ assert("umka-180", txnh != NULL); ++ assert_spin_not_locked(&(txnh->hlock)); ++ ++ while (1) { ++ spin_lock_txnh(txnh); ++ atom = txnh->atom; ++ ++ if (atom == NULL) ++ break; ++ ++ if (spin_trylock_atom(atom)) ++ break; ++ ++ atomic_inc(&atom->refcount); ++ ++ spin_unlock_txnh(txnh); ++ spin_lock_atom(atom); ++ spin_lock_txnh(txnh); ++ ++ if (txnh->atom == atom) { ++ atomic_dec(&atom->refcount); ++ break; ++ } ++ ++ spin_unlock_txnh(txnh); ++ atom_dec_and_unlock(atom); ++ } ++ ++ return atom; ++} ++ ++/* Get the current atom and spinlock it if current atom present. 
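txnh_get_atom() above resolves the lock-ordering problem with a trylock fast path: if the atom lock cannot be taken while the handle lock is held, it pins the atom, drops the handle lock, takes both in the safe atom-then-handle order, and revalidates. A compact pthreads model with invented types (refs would be an atomic counter in the real code, and the final unpin may free the atom via atom_dec_and_unlock()):

#include <pthread.h>
#include <stddef.h>

struct toy_atom {
        pthread_mutex_t alock;
        int refs;
};

struct toy_txnh {
        pthread_mutex_t hlock;
        struct toy_atom *atom;
};

/* Return the handle's atom with both locks held (the atom lock only
 * when the result is non-NULL), never blocking in the wrong order. */
static struct toy_atom *toy_txnh_get_atom(struct toy_txnh *h)
{
        struct toy_atom *a;

        for (;;) {
                pthread_mutex_lock(&h->hlock);
                a = h->atom;
                if (a == NULL || pthread_mutex_trylock(&a->alock) == 0)
                        return a;               /* fast path, order never violated */

                a->refs++;                      /* pin before dropping hlock */
                pthread_mutex_unlock(&h->hlock);
                pthread_mutex_lock(&a->alock);  /* safe order: atom, then handle */
                pthread_mutex_lock(&h->hlock);
                if (h->atom == a) {
                        a->refs--;
                        return a;               /* still the same atom: done */
                }
                /* the handle moved to another atom while we waited: undo */
                pthread_mutex_unlock(&h->hlock);
                a->refs--;                      /* cf. atom_dec_and_unlock() */
                pthread_mutex_unlock(&a->alock);
        }
}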
May return NULL */ ++txn_atom *get_current_atom_locked_nocheck(void) ++{ ++ reiser4_context *cx; ++ txn_atom *atom; ++ txn_handle *txnh; ++ ++ cx = get_current_context(); ++ assert("zam-437", cx != NULL); ++ ++ txnh = cx->trans; ++ assert("zam-435", txnh != NULL); ++ ++ atom = txnh_get_atom(txnh); ++ ++ spin_unlock_txnh(txnh); ++ return atom; ++} ++ ++/* Get the atom belonging to a jnode, which is initially locked. Return with ++ both jnode and atom locked. This performs the necessary spin_trylock to ++ break the lock-ordering cycle. Assumes the jnode is already locked, and ++ returns NULL if atom is not set. */ ++txn_atom *jnode_get_atom(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert("umka-181", node != NULL); ++ ++ while (1) { ++ assert_spin_locked(&(node->guard)); ++ ++ atom = node->atom; ++ /* node is not in any atom */ ++ if (atom == NULL) ++ break; ++ ++ /* If atom is not locked, grab the lock and return */ ++ if (spin_trylock_atom(atom)) ++ break; ++ ++ /* At least one jnode belongs to this atom it guarantees that ++ * atom->refcount > 0, we can safely increment refcount. */ ++ atomic_inc(&atom->refcount); ++ spin_unlock_jnode(node); ++ ++ /* re-acquire spin locks in the right order */ ++ spin_lock_atom(atom); ++ spin_lock_jnode(node); ++ ++ /* check if node still points to the same atom. */ ++ if (node->atom == atom) { ++ atomic_dec(&atom->refcount); ++ break; ++ } ++ ++ /* releasing of atom lock and reference requires not holding ++ * locks on jnodes. */ ++ spin_unlock_jnode(node); ++ ++ /* We do not sure that this atom has extra references except our ++ * one, so we should call proper function which may free atom if ++ * last reference is released. */ ++ atom_dec_and_unlock(atom); ++ ++ /* lock jnode again for getting valid node->atom pointer ++ * value. */ ++ spin_lock_jnode(node); ++ } ++ ++ return atom; ++} ++ ++/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used ++ by flush code to indicate whether the next node (in some direction) is suitable for ++ flushing. */ ++int ++same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value) ++{ ++ int compat; ++ txn_atom *atom; ++ ++ assert("umka-182", node != NULL); ++ assert("umka-183", check != NULL); ++ ++ /* Not sure what this function is supposed to do if supplied with @check that is ++ neither formatted nor unformatted (bitmap or so). */ ++ assert("nikita-2373", jnode_is_znode(check) ++ || jnode_is_unformatted(check)); ++ ++ /* Need a lock on CHECK to get its atom and to check various state bits. ++ Don't need a lock on NODE once we get the atom lock. */ ++ /* It is not enough to lock two nodes and check (node->atom == ++ check->atom) because atom could be locked and being fused at that ++ moment, jnodes of the atom of that state (being fused) can point to ++ different objects, but the atom is the same. */ ++ spin_lock_jnode(check); ++ ++ atom = jnode_get_atom(check); ++ ++ if (atom == NULL) { ++ compat = 0; ++ } else { ++ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY)); ++ ++ if (compat && jnode_is_znode(check)) { ++ compat &= znode_is_connected(JZNODE(check)); ++ } ++ ++ if (compat && alloc_check) { ++ compat &= (alloc_value == jnode_is_flushprepped(check)); ++ } ++ ++ spin_unlock_atom(atom); ++ } ++ ++ spin_unlock_jnode(check); ++ ++ return compat; ++} ++ ++/* Decrement the atom's reference count and if it falls to zero, free it. 
*/ ++void atom_dec_and_unlock(txn_atom * atom) ++{ ++ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ ++ assert("umka-186", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("zam-1039", atomic_read(&atom->refcount) > 0); ++ ++ if (atomic_dec_and_test(&atom->refcount)) { ++ /* take txnmgr lock and atom lock in proper order. */ ++ if (!spin_trylock_txnmgr(mgr)) { ++ /* This atom should exist after we re-acquire its ++ * spinlock, so we increment its reference counter. */ ++ atomic_inc(&atom->refcount); ++ spin_unlock_atom(atom); ++ spin_lock_txnmgr(mgr); ++ spin_lock_atom(atom); ++ ++ if (!atomic_dec_and_test(&atom->refcount)) { ++ spin_unlock_atom(atom); ++ spin_unlock_txnmgr(mgr); ++ return; ++ } ++ } ++ assert_spin_locked(&(mgr->tmgr_lock)); ++ atom_free(atom); ++ spin_unlock_txnmgr(mgr); ++ } else ++ spin_unlock_atom(atom); ++} ++ ++/* Create new atom and connect it to given transaction handle. This adds the ++ atom to the transaction manager's list and sets its reference count to 1, an ++ artificial reference which is kept until it commits. We play strange games ++ to avoid allocation under jnode & txnh spinlocks.*/ ++ ++static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh) ++{ ++ txn_atom *atom; ++ txn_mgr *mgr; ++ ++ if (REISER4_DEBUG && rofs_tree(current_tree)) { ++ warning("nikita-3366", "Creating atom on rofs"); ++ dump_stack(); ++ } ++ ++ if (*atom_alloc == NULL) { ++ (*atom_alloc) = kmem_cache_alloc(_atom_slab, ++ reiser4_ctx_gfp_mask_get()); ++ ++ if (*atom_alloc == NULL) ++ return RETERR(-ENOMEM); ++ } ++ ++ /* and, also, txnmgr spin lock should be taken before jnode and txnh ++ locks. */ ++ mgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ spin_lock_txnmgr(mgr); ++ spin_lock_txnh(txnh); ++ ++ /* Check whether new atom still needed */ ++ if (txnh->atom != NULL) { ++ /* NOTE-NIKITA probably it is rather better to free ++ * atom_alloc here than thread it up to reiser4_try_capture() */ ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_txnmgr(mgr); ++ ++ return -E_REPEAT; ++ } ++ ++ atom = *atom_alloc; ++ *atom_alloc = NULL; ++ ++ atom_init(atom); ++ ++ assert("jmacd-17", atom_isclean(atom)); ++ ++ /* ++ * lock ordering is broken here. It is ok, as long as @atom is new ++ * and inaccessible for others. We can't use spin_lock_atom or ++ * spin_lock(&atom->alock) because they care about locking ++ * dependencies. spin_trylock_lock doesn't. ++ */ ++ check_me("", spin_trylock_atom(atom)); ++ ++ /* add atom to the end of transaction manager's list of atoms */ ++ list_add_tail(&atom->atom_link, &mgr->atoms_list); ++ atom->atom_id = mgr->id_count++; ++ mgr->atom_count += 1; ++ ++ /* Release txnmgr lock */ ++ spin_unlock_txnmgr(mgr); ++ ++ /* One reference until it commits. */ ++ atomic_inc(&atom->refcount); ++ atom->stage = ASTAGE_CAPTURE_FUSE; ++ atom->super = reiser4_get_current_sb(); ++ capture_assign_txnh_nolock(atom, txnh); ++ ++ spin_unlock_atom(atom); ++ spin_unlock_txnh(txnh); ++ ++ return -E_REPEAT; ++} ++ ++/* Return true if an atom is currently "open". */ ++static int atom_isopen(const txn_atom * atom) ++{ ++ assert("umka-185", atom != NULL); ++ ++ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT; ++} ++ ++/* Return the number of pointers to this atom that must be updated during fusion. This ++ approximates the amount of work to be done. Fusion chooses the atom with fewer ++ pointers to fuse into the atom with more pointers. 
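++ For example, an atom with 2 open handles and 10 captured nodes weighs
++ 12, one with 3 handles and 500 captured nodes weighs 503; fusing the
++ former into the latter updates roughly 12 pointers instead of 503.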
*/ ++static int atom_pointer_count(const txn_atom * atom) ++{ ++ assert("umka-187", atom != NULL); ++ ++ /* This is a measure of the amount of work needed to fuse this atom ++ * into another. */ ++ return atom->txnh_count + atom->capture_count; ++} ++ ++/* Called holding the atom lock, this removes the atom from the transaction manager list ++ and frees it. */ ++static void atom_free(txn_atom * atom) ++{ ++ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ ++ assert("umka-188", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ /* Remove from the txn_mgr's atom list */ ++ assert_spin_locked(&(mgr->tmgr_lock)); ++ mgr->atom_count -= 1; ++ list_del_init(&atom->atom_link); ++ ++ /* Clean the atom */ ++ assert("jmacd-16", ++ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE)); ++ atom->stage = ASTAGE_FREE; ++ ++ blocknr_set_destroy(&atom->delete_set); ++ blocknr_set_destroy(&atom->wandered_map); ++ ++ assert("jmacd-16", atom_isclean(atom)); ++ ++ spin_unlock_atom(atom); ++ ++ kmem_cache_free(_atom_slab, atom); ++} ++ ++static int atom_is_dotard(const txn_atom * atom) ++{ ++ return time_after(jiffies, atom->start_time + ++ get_current_super_private()->tmgr.atom_max_age); ++} ++ ++static int atom_can_be_committed(txn_atom * atom) ++{ ++ assert_spin_locked(&(atom->alock)); ++ assert("zam-885", atom->txnh_count > atom->nr_waiters); ++ return atom->txnh_count == atom->nr_waiters + 1; ++} ++ ++/* Return true if an atom should commit now. This is determined by aging, atom ++ size or atom flags. */ ++static int atom_should_commit(const txn_atom * atom) ++{ ++ assert("umka-189", atom != NULL); ++ return ++ (atom->flags & ATOM_FORCE_COMMIT) || ++ ((unsigned)atom_pointer_count(atom) > ++ get_current_super_private()->tmgr.atom_max_size) ++ || atom_is_dotard(atom); ++} ++ ++/* return 1 if current atom exists and requires commit. */ ++int current_atom_should_commit(void) ++{ ++ txn_atom *atom; ++ int result = 0; ++ ++ atom = get_current_atom_locked_nocheck(); ++ if (atom) { ++ result = atom_should_commit(atom); ++ spin_unlock_atom(atom); ++ } ++ return result; ++} ++ ++static int atom_should_commit_asap(const txn_atom * atom) ++{ ++ unsigned int captured; ++ unsigned int pinnedpages; ++ ++ assert("nikita-3309", atom != NULL); ++ ++ captured = (unsigned)atom->capture_count; ++ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode); ++ ++ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100); ++} ++ ++static jnode *find_first_dirty_in_list(struct list_head *head, int flags) ++{ ++ jnode *first_dirty; ++ ++ list_for_each_entry(first_dirty, head, capture_link) { ++ if (!(flags & JNODE_FLUSH_COMMIT)) { ++ /* ++ * skip jnodes which "heard banshee" or having active ++ * I/O ++ */ ++ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) || ++ JF_ISSET(first_dirty, JNODE_WRITEBACK)) ++ continue; ++ } ++ return first_dirty; ++ } ++ return NULL; ++} ++ ++/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty ++ nodes on atom's lists */ ++jnode *find_first_dirty_jnode(txn_atom * atom, int flags) ++{ ++ jnode *first_dirty; ++ tree_level level; ++ ++ assert_spin_locked(&(atom->alock)); ++ ++ /* The flush starts from LEAF_LEVEL (=1). 
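++ Dirty list 0 is reserved for the znode-above-root and is scanned only
++ as a fallback, so the effective order is levels 1, 2, ...,
++ REAL_MAX_ZTREE_HEIGHT, and then list 0.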
*/ ++ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { ++ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level))) ++ continue; ++ ++ first_dirty = ++ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level), ++ flags); ++ if (first_dirty) ++ return first_dirty; ++ } ++ ++ /* znode-above-root is on the list #0. */ ++ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags); ++} ++ ++static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq) ++{ ++ jnode *cur; ++ ++ assert("zam-905", atom_is_protected(atom)); ++ ++ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link); ++ while (ATOM_WB_LIST(atom) != &cur->capture_link) { ++ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); ++ ++ spin_lock_jnode(cur); ++ if (!JF_ISSET(cur, JNODE_WRITEBACK)) { ++ if (JF_ISSET(cur, JNODE_DIRTY)) { ++ queue_jnode(fq, cur); ++ } else { ++ /* move jnode to atom's clean list */ ++ list_move_tail(&cur->capture_link, ++ ATOM_CLEAN_LIST(atom)); ++ } ++ } ++ spin_unlock_jnode(cur); ++ ++ cur = next; ++ } ++} ++ ++/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback ++ * jnodes to disk. */ ++static int submit_wb_list(void) ++{ ++ int ret; ++ flush_queue_t *fq; ++ ++ fq = get_fq_for_current_atom(); ++ if (IS_ERR(fq)) ++ return PTR_ERR(fq); ++ ++ dispatch_wb_list(fq->atom, fq); ++ spin_unlock_atom(fq->atom); ++ ++ ret = reiser4_write_fq(fq, NULL, 1); ++ reiser4_fq_put(fq); ++ ++ return ret; ++} ++ ++/* Wait completion of all writes, re-submit atom writeback list if needed. */ ++static int current_atom_complete_writes(void) ++{ ++ int ret; ++ ++ /* Each jnode from that list was modified and dirtied when it had i/o ++ * request running already. After i/o completion we have to resubmit ++ * them to disk again.*/ ++ ret = submit_wb_list(); ++ if (ret < 0) ++ return ret; ++ ++ /* Wait all i/o completion */ ++ ret = current_atom_finish_all_fq(); ++ if (ret) ++ return ret; ++ ++ /* Scan wb list again; all i/o should be completed, we re-submit dirty ++ * nodes to disk */ ++ ret = submit_wb_list(); ++ if (ret < 0) ++ return ret; ++ ++ /* Wait all nodes we just submitted */ ++ return current_atom_finish_all_fq(); ++} ++ ++#if REISER4_DEBUG ++ ++static void reiser4_info_atom(const char *prefix, const txn_atom * atom) ++{ ++ if (atom == NULL) { ++ printk("%s: no atom\n", prefix); ++ return; ++ } ++ ++ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i" ++ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix, ++ atomic_read(&atom->refcount), atom->atom_id, atom->flags, ++ atom->txnh_count, atom->capture_count, atom->stage, ++ atom->start_time, atom->flushed); ++} ++ ++#else /* REISER4_DEBUG */ ++ ++static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {} ++ ++#endif /* REISER4_DEBUG */ ++ ++#define TOOMANYFLUSHES (1 << 13) ++ ++/* Called with the atom locked and no open "active" transaction handlers except ++ ours, this function calls flush_current_atom() until all dirty nodes are ++ processed. Then it initiates commit processing. ++ ++ Called by the single remaining open "active" txnh, which is closing. Other ++ open txnhs belong to processes which wait atom commit in commit_txnh() ++ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as ++ long as we hold the atom lock none of the jnodes can be captured and/or ++ locked. ++ ++ Return value is an error code if commit fails. 
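++ In outline, the body below (1) calls flush_current_atom() until it
++ stops returning -E_REPEAT, (2) re-checks that the atom can be
++ committed (exactly one non-waiter handle left), (3) moves the atom to
++ ASTAGE_PRE_COMMIT and completes all outstanding writeback, (4) writes
++ the logs under the commit mutex, (5) invalidates the overwrite, clean
++ and writeback lists, and (6) sets ASTAGE_DONE and wakes all waiters.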
++*/ ++static int commit_current_atom(long *nr_submitted, txn_atom ** atom) ++{ ++ reiser4_super_info_data *sbinfo = get_current_super_private(); ++ long ret = 0; ++ /* how many times jnode_flush() was called as a part of attempt to ++ * commit this atom. */ ++ int flushiters; ++ ++ assert("zam-888", atom != NULL && *atom != NULL); ++ assert_spin_locked(&((*atom)->alock)); ++ assert("zam-887", get_current_context()->trans->atom == *atom); ++ assert("jmacd-151", atom_isopen(*atom)); ++ ++ assert("nikita-3184", ++ get_current_super_private()->delete_mutex_owner != current); ++ ++ for (flushiters = 0;; ++flushiters) { ++ ret = ++ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS | ++ JNODE_FLUSH_COMMIT, ++ LONG_MAX /* nr_to_write */ , ++ nr_submitted, atom, NULL); ++ if (ret != -E_REPEAT) ++ break; ++ ++ /* if atom's dirty list contains one znode which is ++ HEARD_BANSHEE and is locked we have to allow lock owner to ++ continue and uncapture that znode */ ++ reiser4_preempt_point(); ++ ++ *atom = get_current_atom_locked(); ++ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) { ++ warning("nikita-3176", ++ "Flushing like mad: %i", flushiters); ++ reiser4_info_atom("atom", *atom); ++ DEBUGON(flushiters > (1 << 20)); ++ } ++ } ++ ++ if (ret) ++ return ret; ++ ++ assert_spin_locked(&((*atom)->alock)); ++ ++ if (!atom_can_be_committed(*atom)) { ++ spin_unlock_atom(*atom); ++ return RETERR(-E_REPEAT); ++ } ++ ++ if ((*atom)->capture_count == 0) ++ goto done; ++ ++ /* Up to this point we have been flushing and after flush is called we ++ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT ++ at this point, commit should be successful. */ ++ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT); ++ ON_DEBUG(((*atom)->committer = current)); ++ spin_unlock_atom(*atom); ++ ++ ret = current_atom_complete_writes(); ++ if (ret) ++ return ret; ++ ++ assert("zam-906", list_empty(ATOM_WB_LIST(*atom))); ++ ++ /* isolate critical code path which should be executed by only one ++ * thread using tmgr mutex */ ++ mutex_lock(&sbinfo->tmgr.commit_mutex); ++ ++ ret = reiser4_write_logs(nr_submitted); ++ if (ret < 0) ++ reiser4_panic("zam-597", "write log failed (%ld)\n", ret); ++ ++ /* The atom->ovrwr_nodes list is processed under commit mutex held ++ because of bitmap nodes which are captured by special way in ++ reiser4_pre_commit_hook_bitmap(), that way does not include ++ capture_fuse_wait() as a capturing of other nodes does -- the commit ++ mutex is used for transaction isolation instead. */ ++ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom)); ++ mutex_unlock(&sbinfo->tmgr.commit_mutex); ++ ++ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom)); ++ reiser4_invalidate_list(ATOM_WB_LIST(*atom)); ++ assert("zam-927", list_empty(&(*atom)->inodes)); ++ ++ spin_lock_atom(*atom); ++ done: ++ reiser4_atom_set_stage(*atom, ASTAGE_DONE); ++ ON_DEBUG((*atom)->committer = NULL); ++ ++ /* Atom's state changes, so wake up everybody waiting for this ++ event. */ ++ wakeup_atom_waiting_list(*atom); ++ ++ /* Decrement the "until commit" reference, at least one txnh (the caller) is ++ still open. 
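++ (The count stays positive because each open transaction handle holds
++ its own reference, taken in capture_assign_txnh_nolock(); the
++ assertion just below relies on that.)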
*/ ++ atomic_dec(&(*atom)->refcount); ++ ++ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0); ++ assert("jmacd-1062", (*atom)->capture_count == 0); ++ BUG_ON((*atom)->capture_count != 0); ++ assert_spin_locked(&((*atom)->alock)); ++ ++ return ret; ++} ++ ++/* TXN_TXNH */ ++ ++/** ++ * force_commit_atom - commit current atom and wait commit completion ++ * @txnh: ++ * ++ * Commits current atom and wait commit completion; current atom and @txnh have ++ * to be spinlocked before call, this function unlocks them on exit. ++ */ ++int force_commit_atom(txn_handle *txnh) ++{ ++ txn_atom *atom; ++ ++ assert("zam-837", txnh != NULL); ++ assert_spin_locked(&(txnh->hlock)); ++ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack())); ++ ++ atom = txnh->atom; ++ ++ assert("zam-834", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ /* ++ * Set flags for atom and txnh: forcing atom commit and waiting for ++ * commit completion ++ */ ++ txnh->flags |= TXNH_WAIT_COMMIT; ++ atom->flags |= ATOM_FORCE_COMMIT; ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(atom); ++ ++ /* commit is here */ ++ reiser4_txn_restart_current(); ++ return 0; ++} ++ ++/* Called to force commit of any outstanding atoms. @commit_all_atoms controls ++ * should we commit all atoms including new ones which are created after this ++ * functions is called. */ ++int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms) ++{ ++ int ret; ++ txn_atom *atom; ++ txn_mgr *mgr; ++ txn_handle *txnh; ++ unsigned long start_time = jiffies; ++ reiser4_context *ctx = get_current_context(); ++ ++ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack())); ++ assert("nikita-3058", reiser4_commit_check_locks()); ++ ++ reiser4_txn_restart_current(); ++ ++ mgr = &get_super_private(super)->tmgr; ++ ++ txnh = ctx->trans; ++ ++ again: ++ ++ spin_lock_txnmgr(mgr); ++ ++ list_for_each_entry(atom, &mgr->atoms_list, atom_link) { ++ spin_lock_atom(atom); ++ ++ /* Commit any atom which can be committed. If @commit_new_atoms ++ * is not set we commit only atoms which were created before ++ * this call is started. */ ++ if (commit_all_atoms ++ || time_before_eq(atom->start_time, start_time)) { ++ if (atom->stage <= ASTAGE_POST_COMMIT) { ++ spin_unlock_txnmgr(mgr); ++ ++ if (atom->stage < ASTAGE_PRE_COMMIT) { ++ spin_lock_txnh(txnh); ++ /* Add force-context txnh */ ++ capture_assign_txnh_nolock(atom, txnh); ++ ret = force_commit_atom(txnh); ++ if (ret) ++ return ret; ++ } else ++ /* wait atom commit */ ++ reiser4_atom_wait_event(atom); ++ ++ goto again; ++ } ++ } ++ ++ spin_unlock_atom(atom); ++ } ++ ++#if REISER4_DEBUG ++ if (commit_all_atoms) { ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ spin_lock_reiser4_super(sbinfo); ++ assert("zam-813", ++ sbinfo->blocks_fake_allocated_unformatted == 0); ++ assert("zam-812", sbinfo->blocks_fake_allocated == 0); ++ spin_unlock_reiser4_super(sbinfo); ++ } ++#endif ++ ++ spin_unlock_txnmgr(mgr); ++ ++ return 0; ++} ++ ++/* check whether commit_some_atoms() can commit @atom. Locking is up to the ++ * caller */ ++static int atom_is_committable(txn_atom * atom) ++{ ++ return ++ atom->stage < ASTAGE_PRE_COMMIT && ++ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom); ++} ++ ++/* called periodically from ktxnmgrd to commit old atoms. 
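++ It is entered with the daemon's guard spin lock held; when an atom
++ has been queued for commit, mgr->daemon->rescan is set so the daemon
++ re-walks its atom list on the next pass.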
Releases ktxnmgrd spin ++ * lock at exit */ ++int commit_some_atoms(txn_mgr * mgr) ++{ ++ int ret = 0; ++ txn_atom *atom; ++ txn_handle *txnh; ++ reiser4_context *ctx; ++ struct list_head *pos, *tmp; ++ ++ ctx = get_current_context(); ++ assert("nikita-2444", ctx != NULL); ++ ++ txnh = ctx->trans; ++ spin_lock_txnmgr(mgr); ++ ++ /* ++ * this is to avoid gcc complain that atom might be used ++ * uninitialized ++ */ ++ atom = NULL; ++ ++ /* look for atom to commit */ ++ list_for_each_safe(pos, tmp, &mgr->atoms_list) { ++ atom = list_entry(pos, txn_atom, atom_link); ++ /* ++ * first test without taking atom spin lock, whether it is ++ * eligible for committing at all ++ */ ++ if (atom_is_committable(atom)) { ++ /* now, take spin lock and re-check */ ++ spin_lock_atom(atom); ++ if (atom_is_committable(atom)) ++ break; ++ spin_unlock_atom(atom); ++ } ++ } ++ ++ ret = (&mgr->atoms_list == pos); ++ spin_unlock_txnmgr(mgr); ++ ++ if (ret) { ++ /* nothing found */ ++ spin_unlock(&mgr->daemon->guard); ++ return 0; ++ } ++ ++ spin_lock_txnh(txnh); ++ ++ BUG_ON(atom == NULL); ++ /* Set the atom to force committing */ ++ atom->flags |= ATOM_FORCE_COMMIT; ++ ++ /* Add force-context txnh */ ++ capture_assign_txnh_nolock(atom, txnh); ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(atom); ++ ++ /* we are about to release daemon spin lock, notify daemon it ++ has to rescan atoms */ ++ mgr->daemon->rescan = 1; ++ spin_unlock(&mgr->daemon->guard); ++ reiser4_txn_restart_current(); ++ return 0; ++} ++ ++static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom) ++{ ++ int atom_stage; ++ txn_atom *atom_2; ++ int repeat; ++ ++ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT); ++ ++ atom_stage = atom->stage; ++ repeat = 0; ++ ++ if (!spin_trylock_txnmgr(tmgr)) { ++ atomic_inc(&atom->refcount); ++ spin_unlock_atom(atom); ++ spin_lock_txnmgr(tmgr); ++ spin_lock_atom(atom); ++ repeat = 1; ++ if (atom->stage != atom_stage) { ++ spin_unlock_txnmgr(tmgr); ++ atom_dec_and_unlock(atom); ++ return -E_REPEAT; ++ } ++ atomic_dec(&atom->refcount); ++ } ++ ++ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) { ++ if (atom == atom_2) ++ continue; ++ /* ++ * if trylock does not succeed we just do not fuse with that ++ * atom. ++ */ ++ if (spin_trylock_atom(atom_2)) { ++ if (atom_2->stage < ASTAGE_PRE_COMMIT) { ++ spin_unlock_txnmgr(tmgr); ++ capture_fuse_into(atom_2, atom); ++ /* all locks are lost we can only repeat here */ ++ return -E_REPEAT; ++ } ++ spin_unlock_atom(atom_2); ++ } ++ } ++ atom->flags |= ATOM_CANCEL_FUSION; ++ spin_unlock_txnmgr(tmgr); ++ if (repeat) { ++ spin_unlock_atom(atom); ++ return -E_REPEAT; ++ } ++ return 0; ++} ++ ++/* Calls jnode_flush for current atom if it exists; if not, just take another ++ atom and call jnode_flush() for him. If current transaction handle has ++ already assigned atom (current atom) we have to close current transaction ++ prior to switch to another atom or do something with current atom. This ++ code tries to flush current atom. ++ ++ flush_some_atom() is called as part of memory clearing process. It is ++ invoked from balance_dirty_pages(), pdflushd, and entd. ++ ++ If we can flush no nodes, atom is committed, because this frees memory. ++ ++ If atom is too large or too old it is committed also. 
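++ An illustrative caller, with made-up wbc values (sketch only; in
++ reality the writeback paths named above supply the writeback_control):
++
++	struct writeback_control wbc = { .nr_to_write = 32 };
++	long written = 0;
++	int err = flush_some_atom(NULL, &written, &wbc,
++				  JNODE_FLUSH_WRITE_BLOCKS);
++
++ A NULL start simply lets the flush code pick its own starting node,
++ and *nr_submitted has to be 0 on entry, as the BUG_ON below insists.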
++*/ ++int ++flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc, ++ int flags) ++{ ++ reiser4_context *ctx = get_current_context(); ++ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr; ++ txn_handle *txnh = ctx->trans; ++ txn_atom *atom; ++ int ret; ++ ++ BUG_ON(wbc->nr_to_write == 0); ++ BUG_ON(*nr_submitted != 0); ++ assert("zam-1042", txnh != NULL); ++ repeat: ++ if (txnh->atom == NULL) { ++ /* current atom is not available, take first from txnmgr */ ++ spin_lock_txnmgr(tmgr); ++ ++ /* traverse the list of all atoms */ ++ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { ++ /* lock atom before checking its state */ ++ spin_lock_atom(atom); ++ ++ /* ++ * we need an atom which is not being committed and ++ * which has no flushers (jnode_flush() add one flusher ++ * at the beginning and subtract one at the end). ++ */ ++ if (atom->stage < ASTAGE_PRE_COMMIT && ++ atom->nr_flushers == 0) { ++ spin_lock_txnh(txnh); ++ capture_assign_txnh_nolock(atom, txnh); ++ spin_unlock_txnh(txnh); ++ ++ goto found; ++ } ++ ++ spin_unlock_atom(atom); ++ } ++ ++ /* ++ * Write throttling is case of no one atom can be ++ * flushed/committed. ++ */ ++ if (!current_is_flush_bd_task() && !wbc->nonblocking) { ++ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { ++ spin_lock_atom(atom); ++ /* Repeat the check from the above. */ ++ if (atom->stage < ASTAGE_PRE_COMMIT ++ && atom->nr_flushers == 0) { ++ spin_lock_txnh(txnh); ++ capture_assign_txnh_nolock(atom, txnh); ++ spin_unlock_txnh(txnh); ++ ++ goto found; ++ } ++ if (atom->stage <= ASTAGE_POST_COMMIT) { ++ spin_unlock_txnmgr(tmgr); ++ /* ++ * we just wait until atom's flusher ++ * makes a progress in flushing or ++ * committing the atom ++ */ ++ reiser4_atom_wait_event(atom); ++ goto repeat; ++ } ++ spin_unlock_atom(atom); ++ } ++ } ++ spin_unlock_txnmgr(tmgr); ++ return 0; ++ found: ++ spin_unlock_txnmgr(tmgr); ++ } else ++ atom = get_current_atom_locked(); ++ ++ BUG_ON(atom->super != ctx->super); ++ assert("vs-35", atom->super == ctx->super); ++ if (start) { ++ spin_lock_jnode(start); ++ ret = (atom == start->atom) ? 1 : 0; ++ spin_unlock_jnode(start); ++ if (ret == 0) ++ start = NULL; ++ } ++ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start); ++ if (ret == 0) { ++ /* flush_current_atom returns 0 only if it submitted for write ++ nothing */ ++ BUG_ON(*nr_submitted != 0); ++ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) { ++ if (atom->capture_count < tmgr->atom_min_size && ++ !(atom->flags & ATOM_CANCEL_FUSION)) { ++ ret = txn_try_to_fuse_small_atom(tmgr, atom); ++ if (ret == -E_REPEAT) { ++ reiser4_preempt_point(); ++ goto repeat; ++ } ++ } ++ /* if early flushing could not make more nodes clean, ++ * or atom is too old/large, ++ * we force current atom to commit */ ++ /* wait for commit completion but only if this ++ * wouldn't stall pdflushd and ent thread. 
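++ That is, only a caller that may block (a user process throttled in
++ balance_dirty_pages(), say) is made to wait for the commit itself;
++ pdflush-style threads and entd only flag ATOM_FORCE_COMMIT and
++ return, so they stay free to service other atoms.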
*/ ++ if (!wbc->nonblocking && !ctx->entd) ++ txnh->flags |= TXNH_WAIT_COMMIT; ++ atom->flags |= ATOM_FORCE_COMMIT; ++ } ++ spin_unlock_atom(atom); ++ } else if (ret == -E_REPEAT) { ++ if (*nr_submitted == 0) { ++ /* let others who hampers flushing (hold longterm locks, ++ for instance) to free the way for flush */ ++ reiser4_preempt_point(); ++ goto repeat; ++ } ++ ret = 0; ++ } ++/* ++ if (*nr_submitted > wbc->nr_to_write) ++ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted); ++*/ ++ reiser4_txn_restart(ctx); ++ ++ return ret; ++} ++ ++/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */ ++void reiser4_invalidate_list(struct list_head *head) ++{ ++ while (!list_empty(head)) { ++ jnode *node; ++ ++ node = list_entry(head->next, jnode, capture_link); ++ spin_lock_jnode(node); ++ reiser4_uncapture_block(node); ++ jput(node); ++ } ++} ++ ++static void init_wlinks(txn_wait_links * wlinks) ++{ ++ wlinks->_lock_stack = get_current_lock_stack(); ++ INIT_LIST_HEAD(&wlinks->_fwaitfor_link); ++ INIT_LIST_HEAD(&wlinks->_fwaiting_link); ++ wlinks->waitfor_cb = NULL; ++ wlinks->waiting_cb = NULL; ++} ++ ++/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */ ++void reiser4_atom_wait_event(txn_atom * atom) ++{ ++ txn_wait_links _wlinks; ++ ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-3156", ++ lock_stack_isclean(get_current_lock_stack()) || ++ atom->nr_running_queues > 0); ++ ++ init_wlinks(&_wlinks); ++ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list); ++ atomic_inc(&atom->refcount); ++ spin_unlock_atom(atom); ++ ++ reiser4_prepare_to_sleep(_wlinks._lock_stack); ++ reiser4_go_to_sleep(_wlinks._lock_stack); ++ ++ spin_lock_atom(atom); ++ list_del(&_wlinks._fwaitfor_link); ++ atom_dec_and_unlock(atom); ++} ++ ++void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage) ++{ ++ assert("nikita-3535", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-3536", stage <= ASTAGE_INVALID); ++ /* Excelsior! */ ++ assert("nikita-3537", stage >= atom->stage); ++ if (atom->stage != stage) { ++ atom->stage = stage; ++ reiser4_atom_send_event(atom); ++ } ++} ++ ++/* wake all threads which wait for an event */ ++void reiser4_atom_send_event(txn_atom * atom) ++{ ++ assert_spin_locked(&(atom->alock)); ++ wakeup_atom_waitfor_list(atom); ++} ++ ++/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for ++ example, because it does fsync(2)) */ ++static int should_wait_commit(txn_handle * h) ++{ ++ return h->flags & TXNH_WAIT_COMMIT; ++} ++ ++typedef struct commit_data { ++ txn_atom *atom; ++ txn_handle *txnh; ++ long nr_written; ++ /* as an optimization we start committing atom by first trying to ++ * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This ++ * allows to reduce stalls due to other threads waiting for atom in ++ * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these ++ * preliminary flushes. */ ++ int preflush; ++ /* have we waited on atom. */ ++ int wait; ++ int failed; ++ int wake_ktxnmgrd_up; ++} commit_data; ++ ++/* ++ * Called from commit_txnh() repeatedly, until either error happens, or atom ++ * commits successfully. ++ */ ++static int try_commit_txnh(commit_data * cd) ++{ ++ int result; ++ ++ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack())); ++ ++ /* Get the atom and txnh locked. 
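++ One pass of the commit loop: a nonzero return means all locks were
++ dropped on the way (the atom may have fused or advanced its stage),
++ so the caller must re-resolve cd->atom; only a 0 return leaves
++ cd->atom spin-locked, as the debug assertion at the end checks.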
*/ ++ cd->atom = txnh_get_atom(cd->txnh); ++ assert("jmacd-309", cd->atom != NULL); ++ spin_unlock_txnh(cd->txnh); ++ ++ if (cd->wait) { ++ cd->atom->nr_waiters--; ++ cd->wait = 0; ++ } ++ ++ if (cd->atom->stage == ASTAGE_DONE) ++ return 0; ++ ++ if (cd->failed) ++ return 0; ++ ++ if (atom_should_commit(cd->atom)) { ++ /* if atom is _very_ large schedule it for commit as soon as ++ * possible. */ ++ if (atom_should_commit_asap(cd->atom)) { ++ /* ++ * When atom is in PRE_COMMIT or later stage following ++ * invariant (encoded in atom_can_be_committed()) ++ * holds: there is exactly one non-waiter transaction ++ * handle opened on this atom. When thread wants to ++ * wait until atom commits (for example sync()) it ++ * waits on atom event after increasing ++ * atom->nr_waiters (see blow in this function). It ++ * cannot be guaranteed that atom is already committed ++ * after receiving event, so loop has to be ++ * re-started. But if atom switched into PRE_COMMIT ++ * stage and became too large, we cannot change its ++ * state back to CAPTURE_WAIT (atom stage can only ++ * increase monotonically), hence this check. ++ */ ++ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT) ++ reiser4_atom_set_stage(cd->atom, ++ ASTAGE_CAPTURE_WAIT); ++ cd->atom->flags |= ATOM_FORCE_COMMIT; ++ } ++ if (cd->txnh->flags & TXNH_DONT_COMMIT) { ++ /* ++ * this thread (transaction handle that is) doesn't ++ * want to commit atom. Notify waiters that handle is ++ * closed. This can happen, for example, when we are ++ * under VFS directory lock and don't want to commit ++ * atom right now to avoid stalling other threads ++ * working in the same directory. ++ */ ++ ++ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to ++ * commit this atom: no atom waiters and only one ++ * (our) open transaction handle. */ ++ cd->wake_ktxnmgrd_up = ++ cd->atom->txnh_count == 1 && ++ cd->atom->nr_waiters == 0; ++ reiser4_atom_send_event(cd->atom); ++ result = 0; ++ } else if (!atom_can_be_committed(cd->atom)) { ++ if (should_wait_commit(cd->txnh)) { ++ /* sync(): wait for commit */ ++ cd->atom->nr_waiters++; ++ cd->wait = 1; ++ reiser4_atom_wait_event(cd->atom); ++ result = RETERR(-E_REPEAT); ++ } else { ++ result = 0; ++ } ++ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) { ++ /* ++ * optimization: flush atom without switching it into ++ * ASTAGE_CAPTURE_WAIT. ++ * ++ * But don't do this for ktxnmgrd, because ktxnmgrd ++ * should never block on atom fusion. ++ */ ++ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS, ++ LONG_MAX, &cd->nr_written, ++ &cd->atom, NULL); ++ if (result == 0) { ++ spin_unlock_atom(cd->atom); ++ cd->preflush = 0; ++ result = RETERR(-E_REPEAT); ++ } else /* Atoms wasn't flushed ++ * completely. Rinse. Repeat. */ ++ --cd->preflush; ++ } else { ++ /* We change atom state to ASTAGE_CAPTURE_WAIT to ++ prevent atom fusion and count ourself as an active ++ flusher */ ++ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT); ++ cd->atom->flags |= ATOM_FORCE_COMMIT; ++ ++ result = ++ commit_current_atom(&cd->nr_written, &cd->atom); ++ if (result != 0 && result != -E_REPEAT) ++ cd->failed = 1; ++ } ++ } else ++ result = 0; ++ ++#if REISER4_DEBUG ++ if (result == 0) ++ assert_spin_locked(&(cd->atom->alock)); ++#endif ++ ++ /* perfectly valid assertion, except that when atom/txnh is not locked ++ * fusion can take place, and cd->atom points nowhere. */ ++ /* ++ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom))); ++ */ ++ return result; ++} ++ ++/* Called to commit a transaction handle. 
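++ For context, the full handle life cycle as driven through the context
++ API defined earlier in this file (sketch only):
++
++	reiser4_txn_begin(ctx);		(ctx->trans = &ctx->trans_in_ctx)
++	...				(capture and modify blocks)
++	ret = reiser4_txn_end(ctx);	(which lands here, in commit_txnh())
++
++ commit_txnh() is the closing half of that cycle.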
This decrements the atom's number of open ++ handles and if it is the last handle to commit and the atom should commit, initiates ++ atom commit. if commit does not fail, return number of written blocks */ ++static int commit_txnh(txn_handle * txnh) ++{ ++ commit_data cd; ++ assert("umka-192", txnh != NULL); ++ ++ memset(&cd, 0, sizeof cd); ++ cd.txnh = txnh; ++ cd.preflush = 10; ++ ++ /* calls try_commit_txnh() until either atom commits, or error ++ * happens */ ++ while (try_commit_txnh(&cd) != 0) ++ reiser4_preempt_point(); ++ ++ spin_lock_txnh(txnh); ++ ++ cd.atom->txnh_count -= 1; ++ txnh->atom = NULL; ++ /* remove transaction handle from atom's list of transaction handles */ ++ list_del_init(&txnh->txnh_link); ++ ++ spin_unlock_txnh(txnh); ++ atom_dec_and_unlock(cd.atom); ++ /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably ++ * because it takes time) by current thread, we do that work ++ * asynchronously by ktxnmgrd daemon. */ ++ if (cd.wake_ktxnmgrd_up) ++ ktxnmgrd_kick(&get_current_super_private()->tmgr); ++ ++ return 0; ++} ++ ++/* TRY_CAPTURE */ ++ ++/* This routine attempts a single block-capture request. It may return -E_REPEAT if some ++ condition indicates that the request should be retried, and it may block if the ++ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag. ++ ++ This routine encodes the basic logic of block capturing described by: ++ ++ http://namesys.com/v4/v4.html ++ ++ Our goal here is to ensure that any two blocks that contain dependent modifications ++ should commit at the same time. This function enforces this discipline by initiating ++ fusion whenever a transaction handle belonging to one atom requests to read or write a ++ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC). ++ ++ In addition, this routine handles the initial assignment of atoms to blocks and ++ transaction handles. These are possible outcomes of this function: ++ ++ 1. The block and handle are already part of the same atom: return immediate success ++ ++ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign ++ the handle to the block's atom. ++ ++ 3. The handle is assigned but the block is not: call capture_assign_block to assign ++ the block to the handle's atom. ++ ++ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion ++ to fuse atoms. ++ ++ 5. Neither block nor handle are assigned: create a new atom and assign them both. ++ ++ 6. A read request for a non-captured block: return immediate success. ++ ++ This function acquires and releases the handle's spinlock. This function is called ++ under the jnode lock and if the return value is 0, it returns with the jnode lock still ++ held. If the return is -E_REPEAT or some other error condition, the jnode lock is ++ released. The external interface (reiser4_try_capture) manages re-aquiring the jnode ++ lock in the failure case. ++*/ ++static int try_capture_block( ++ txn_handle * txnh, jnode * node, txn_capture mode, ++ txn_atom ** atom_alloc) ++{ ++ txn_atom *block_atom; ++ txn_atom *txnh_atom; ++ ++ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */ ++ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM); ++ ++ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree == ++ * node->tree somewhere. */ ++ assert("umka-194", txnh != NULL); ++ assert("umka-195", node != NULL); ++ ++ /* The jnode is already locked! 
Being called from reiser4_try_capture(). */ ++ assert_spin_locked(&(node->guard)); ++ block_atom = node->atom; ++ ++ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't ++ let us touch the atoms themselves. */ ++ spin_lock_txnh(txnh); ++ txnh_atom = txnh->atom; ++ /* Process of capturing continues into one of four branches depends on ++ which atoms from (block atom (node->atom), current atom (txnh->atom)) ++ exist. */ ++ if (txnh_atom == NULL) { ++ if (block_atom == NULL) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ /* assign empty atom to the txnh and repeat */ ++ return atom_begin_and_assign_to_txnh(atom_alloc, txnh); ++ } else { ++ atomic_inc(&block_atom->refcount); ++ /* node spin-lock isn't needed anymore */ ++ spin_unlock_jnode(node); ++ if (!spin_trylock_atom(block_atom)) { ++ spin_unlock_txnh(txnh); ++ spin_lock_atom(block_atom); ++ spin_lock_txnh(txnh); ++ } ++ /* re-check state after getting txnh and the node ++ * atom spin-locked */ ++ if (node->atom != block_atom || txnh->atom != NULL) { ++ spin_unlock_txnh(txnh); ++ atom_dec_and_unlock(block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ atomic_dec(&block_atom->refcount); ++ if (block_atom->stage > ASTAGE_CAPTURE_WAIT || ++ (block_atom->stage == ASTAGE_CAPTURE_WAIT && ++ block_atom->txnh_count != 0)) ++ return capture_fuse_wait(txnh, block_atom, NULL, mode); ++ capture_assign_txnh_nolock(block_atom, txnh); ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ } else { ++ /* It is time to perform deadlock prevention check over the ++ node we want to capture. It is possible this node was locked ++ for read without capturing it. The optimization which allows ++ to do it helps us in keeping atoms independent as long as ++ possible but it may cause lock/fuse deadlock problems. ++ ++ A number of similar deadlock situations with locked but not ++ captured nodes were found. In each situation there are two ++ or more threads: one of them does flushing while another one ++ does routine balancing or tree lookup. The flushing thread ++ (F) sleeps in long term locking request for node (N), another ++ thread (A) sleeps in trying to capture some node already ++ belonging the atom F, F has a state which prevents ++ immediately fusion . ++ ++ Deadlocks of this kind cannot happen if node N was properly ++ captured by thread A. The F thread fuse atoms before locking ++ therefore current atom of thread F and current atom of thread ++ A became the same atom and thread A may proceed. This does ++ not work if node N was not captured because the fusion of ++ atom does not happens. ++ ++ The following scheme solves the deadlock: If ++ longterm_lock_znode locks and does not capture a znode, that ++ znode is marked as MISSED_IN_CAPTURE. A node marked this way ++ is processed by the code below which restores the missed ++ capture and fuses current atoms of all the node lock owners ++ by calling the fuse_not_fused_lock_owners() function. 
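++ Concretely: flusher F holds atom Af and sleeps in a long-term lock
++ request for znode N, which thread A holds locked but never captured;
++ A in turn sleeps trying to capture a node of Af, whose stage blocks
++ immediate fusion, and neither side can proceed until the atoms fuse.
++ The MISSED_IN_CAPTURE handling below forces exactly that fusion.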
*/ ++ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) { ++ JF_CLR(node, JNODE_MISSED_IN_CAPTURE); ++ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ fuse_not_fused_lock_owners(txnh, JZNODE(node)); ++ return RETERR(-E_REPEAT); ++ } ++ } ++ if (block_atom == NULL) { ++ atomic_inc(&txnh_atom->refcount); ++ spin_unlock_txnh(txnh); ++ if (!spin_trylock_atom(txnh_atom)) { ++ spin_unlock_jnode(node); ++ spin_lock_atom(txnh_atom); ++ spin_lock_jnode(node); ++ } ++ if (txnh->atom != txnh_atom || node->atom != NULL ++ || JF_ISSET(node, JNODE_IS_DYING)) { ++ spin_unlock_jnode(node); ++ atom_dec_and_unlock(txnh_atom); ++ return RETERR(-E_REPEAT); ++ } ++ atomic_dec(&txnh_atom->refcount); ++ capture_assign_block_nolock(txnh_atom, node); ++ spin_unlock_atom(txnh_atom); ++ } else { ++ if (txnh_atom != block_atom) { ++ if (mode & TXN_CAPTURE_DONT_FUSE) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ /* we are in a "no-fusion" mode and @node is ++ * already part of transaction. */ ++ return RETERR(-E_NO_NEIGHBOR); ++ } ++ return capture_init_fusion(node, txnh, mode); ++ } ++ spin_unlock_txnh(txnh); ++ } ++ } ++ return 0; ++} ++ ++static txn_capture ++build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags) ++{ ++ txn_capture cap_mode; ++ ++ assert_spin_locked(&(node->guard)); ++ ++ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */ ++ ++ if (lock_mode == ZNODE_WRITE_LOCK) { ++ cap_mode = TXN_CAPTURE_WRITE; ++ } else if (node->atom != NULL) { ++ cap_mode = TXN_CAPTURE_WRITE; ++ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */ ++ jnode_get_level(node) == LEAF_LEVEL) { ++ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */ ++ /* We only need a READ_FUSING capture at the leaf level. This ++ is because the internal levels of the tree (twigs included) ++ are redundant from the point of the user that asked for a ++ read-fusing transcrash. The user only wants to read-fuse ++ atoms due to reading uncommitted data that another user has ++ written. It is the file system that reads/writes the ++ internal tree levels, the user only reads/writes leaves. */ ++ cap_mode = TXN_CAPTURE_READ_ATOMIC; ++ } else { ++ /* In this case (read lock at a non-leaf) there's no reason to ++ * capture. */ ++ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */ ++ return 0; ++ } ++ ++ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE)); ++ assert("nikita-3186", cap_mode != 0); ++ return cap_mode; ++} ++ ++/* This is an external interface to try_capture_block(), it calls ++ try_capture_block() repeatedly as long as -E_REPEAT is returned. ++ ++ @node: node to capture, ++ @lock_mode: read or write lock is used in capture mode calculation, ++ @flags: see txn_capture flags enumeration, ++ @can_coc : can copy-on-capture ++ ++ @return: 0 - node was successfully captured, -E_REPEAT - capture request ++ cannot be processed immediately as it was requested in flags, ++ < 0 - other errors. 
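++ A minimal caller sketch (illustrative only; it simply mirrors the
++ locking contract stated above and the real use in
++ try_capture_page_to_invalidate() further down):
++
++	spin_lock_jnode(node);
++	ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
++	...				(jnode spin lock is held again
++					here whatever ret says)
++	spin_unlock_jnode(node);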
++*/ ++int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode, ++ txn_capture flags) ++{ ++ txn_atom *atom_alloc = NULL; ++ txn_capture cap_mode; ++ txn_handle *txnh = get_current_context()->trans; ++ int ret; ++ ++ assert_spin_locked(&(node->guard)); ++ ++ repeat: ++ if (JF_ISSET(node, JNODE_IS_DYING)) ++ return RETERR(-EINVAL); ++ if (node->atom != NULL && txnh->atom == node->atom) ++ return 0; ++ cap_mode = build_capture_mode(node, lock_mode, flags); ++ if (cap_mode == 0 || ++ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) { ++ /* Mark this node as "MISSED". It helps in further deadlock ++ * analysis */ ++ if (jnode_is_znode(node)) ++ JF_SET(node, JNODE_MISSED_IN_CAPTURE); ++ return 0; ++ } ++ /* Repeat try_capture as long as -E_REPEAT is returned. */ ++ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc); ++ /* Regardless of non_blocking: ++ ++ If ret == 0 then jnode is still locked. ++ If ret != 0 then jnode is unlocked. ++ */ ++#if REISER4_DEBUG ++ if (ret == 0) ++ assert_spin_locked(&(node->guard)); ++ else ++ assert_spin_not_locked(&(node->guard)); ++#endif ++ assert_spin_not_locked(&(txnh->guard)); ++ ++ if (ret == -E_REPEAT) { ++ /* E_REPEAT implies all locks were released, therefore we need ++ to take the jnode's lock again. */ ++ spin_lock_jnode(node); ++ ++ /* Although this may appear to be a busy loop, it is not. ++ There are several conditions that cause E_REPEAT to be ++ returned by the call to try_capture_block, all cases ++ indicating some kind of state change that means you should ++ retry the request and will get a different result. In some ++ cases this could be avoided with some extra code, but ++ generally it is done because the necessary locks were ++ released as a result of the operation and repeating is the ++ simplest thing to do (less bug potential). The cases are: ++ atom fusion returns E_REPEAT after it completes (jnode and ++ txnh were unlocked); race conditions in assign_block, ++ assign_txnh, and init_fusion return E_REPEAT (trylock ++ failure); after going to sleep in capture_fuse_wait ++ (request was blocked but may now succeed). I'm not quite ++ sure how capture_copy works yet, but it may also return ++ E_REPEAT. When the request is legitimately blocked, the ++ requestor goes to sleep in fuse_wait, so this is not a busy ++ loop. */ ++ /* NOTE-NIKITA: still don't understand: ++ ++ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT ++ ++ looks like busy loop? ++ */ ++ goto repeat; ++ } ++ ++ /* free extra atom object that was possibly allocated by ++ try_capture_block(). ++ ++ Do this before acquiring jnode spin lock to ++ minimize time spent under lock. --nikita */ ++ if (atom_alloc != NULL) { ++ kmem_cache_free(_atom_slab, atom_alloc); ++ } ++ ++ if (ret != 0) { ++ if (ret == -E_BLOCK) { ++ assert("nikita-3360", ++ cap_mode & TXN_CAPTURE_NONBLOCKING); ++ ret = -E_REPEAT; ++ } ++ ++ /* Failure means jnode is not locked. FIXME_LATER_JMACD May ++ want to fix the above code to avoid releasing the lock and ++ re-acquiring it, but there are cases were failure occurs ++ when the lock is not held, and those cases would need to be ++ modified to re-take the lock. */ ++ spin_lock_jnode(node); ++ } ++ ++ /* Jnode is still locked. 
*/ ++ assert_spin_locked(&(node->guard)); ++ return ret; ++} ++ ++static void release_two_atoms(txn_atom *one, txn_atom *two) ++{ ++ spin_unlock_atom(one); ++ atom_dec_and_unlock(two); ++ spin_lock_atom(one); ++ atom_dec_and_unlock(one); ++} ++ ++/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is ++ returned by that routine. The txn_capture request mode is computed here depending on ++ the transaction handle's type and the lock request. This is called from the depths of ++ the lock manager with the jnode lock held and it always returns with the jnode lock ++ held. ++*/ ++ ++/* fuse all 'active' atoms of lock owners of given node. */ ++static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node) ++{ ++ lock_handle *lh; ++ int repeat; ++ txn_atom *atomh, *atomf; ++ reiser4_context *me = get_current_context(); ++ reiser4_context *ctx = NULL; ++ ++ assert_spin_not_locked(&(ZJNODE(node)->guard)); ++ assert_spin_not_locked(&(txnh->hlock)); ++ ++ repeat: ++ repeat = 0; ++ atomh = txnh_get_atom(txnh); ++ spin_unlock_txnh(txnh); ++ assert("zam-692", atomh != NULL); ++ ++ spin_lock_zlock(&node->lock); ++ /* inspect list of lock owners */ ++ list_for_each_entry(lh, &node->lock.owners, owners_link) { ++ ctx = get_context_by_lock_stack(lh->owner); ++ if (ctx == me) ++ continue; ++ /* below we use two assumptions to avoid addition spin-locks ++ for checking the condition : ++ ++ 1) if the lock stack has lock, the transaction should be ++ opened, i.e. ctx->trans != NULL; ++ ++ 2) reading of well-aligned ctx->trans->atom is atomic, if it ++ equals to the address of spin-locked atomh, we take that ++ the atoms are the same, nothing has to be captured. */ ++ if (atomh != ctx->trans->atom) { ++ reiser4_wake_up(lh->owner); ++ repeat = 1; ++ break; ++ } ++ } ++ if (repeat) { ++ if (!spin_trylock_txnh(ctx->trans)) { ++ spin_unlock_zlock(&node->lock); ++ spin_unlock_atom(atomh); ++ goto repeat; ++ } ++ atomf = ctx->trans->atom; ++ if (atomf == NULL) { ++ capture_assign_txnh_nolock(atomh, ctx->trans); ++ /* release zlock lock _after_ assigning the atom to the ++ * transaction handle, otherwise the lock owner thread ++ * may unlock all znodes, exit kernel context and here ++ * we would access an invalid transaction handle. */ ++ spin_unlock_zlock(&node->lock); ++ spin_unlock_atom(atomh); ++ spin_unlock_txnh(ctx->trans); ++ goto repeat; ++ } ++ assert("zam-1059", atomf != atomh); ++ spin_unlock_zlock(&node->lock); ++ atomic_inc(&atomh->refcount); ++ atomic_inc(&atomf->refcount); ++ spin_unlock_txnh(ctx->trans); ++ if (atomf > atomh) { ++ spin_lock_atom_nested(atomf); ++ } else { ++ spin_unlock_atom(atomh); ++ spin_lock_atom(atomf); ++ spin_lock_atom_nested(atomh); ++ } ++ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) { ++ release_two_atoms(atomf, atomh); ++ goto repeat; ++ } ++ atomic_dec(&atomh->refcount); ++ atomic_dec(&atomf->refcount); ++ capture_fuse_into(atomf, atomh); ++ goto repeat; ++ } ++ spin_unlock_zlock(&node->lock); ++ spin_unlock_atom(atomh); ++} ++ ++/* This is the interface to capture unformatted nodes via their struct page ++ reference. 
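++ The jnode is reached through the page (jnode_of_page() below), the
++ page lock is traded for the jnode spin lock around the capture
++ attempt, and the page lock is re-taken before returning.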
Currently it is only used in reiser4_invalidatepage */ ++int try_capture_page_to_invalidate(struct page *pg) ++{ ++ int ret; ++ jnode *node; ++ ++ assert("umka-292", pg != NULL); ++ assert("nikita-2597", PageLocked(pg)); ++ ++ if (IS_ERR(node = jnode_of_page(pg))) { ++ return PTR_ERR(node); ++ } ++ ++ spin_lock_jnode(node); ++ unlock_page(pg); ++ ++ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); ++ spin_unlock_jnode(node); ++ jput(node); ++ lock_page(pg); ++ return ret; ++} ++ ++/* This informs the transaction manager when a node is deleted. Add the block to the ++ atom's delete set and uncapture the block. ++ ++VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for ++explanations. find all the functions that use it, and unless there is some very ++good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....), ++move the loop to inside the function. ++ ++VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times? ++ */ ++void reiser4_uncapture_page(struct page *pg) ++{ ++ jnode *node; ++ txn_atom *atom; ++ ++ assert("umka-199", pg != NULL); ++ assert("nikita-3155", PageLocked(pg)); ++ ++ clear_page_dirty_for_io(pg); ++ ++ reiser4_wait_page_writeback(pg); ++ ++ node = jprivate(pg); ++ BUG_ON(node == NULL); ++ ++ spin_lock_jnode(node); ++ ++ atom = jnode_get_atom(node); ++ if (atom == NULL) { ++ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); ++ spin_unlock_jnode(node); ++ return; ++ } ++ ++ /* We can remove jnode from transaction even if it is on flush queue ++ * prepped list, we only need to be sure that flush queue is not being ++ * written by reiser4_write_fq(). reiser4_write_fq() does not use atom ++ * spin lock for protection of the prepped nodes list, instead ++ * write_fq() increments atom's nr_running_queues counters for the time ++ * when prepped list is not protected by spin lock. Here we check this ++ * counter if we want to remove jnode from flush queue and, if the ++ * counter is not zero, wait all reiser4_write_fq() for this atom to ++ * complete. This is not significant overhead. */ ++ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) { ++ spin_unlock_jnode(node); ++ /* ++ * at this moment we want to wait for "atom event", viz. wait ++ * until @node can be removed from flush queue. But ++ * reiser4_atom_wait_event() cannot be called with page locked, ++ * because it deadlocks with jnode_extent_write(). Unlock page, ++ * after making sure (through page_cache_get()) that it cannot ++ * be released from memory. ++ */ ++ page_cache_get(pg); ++ unlock_page(pg); ++ reiser4_atom_wait_event(atom); ++ lock_page(pg); ++ /* ++ * page may has been detached by ->writepage()->releasepage(). 
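++ (that is, the binding between page and jnode may have gone away while
++ we slept in reiser4_atom_wait_event(), which is why the writeback
++ wait and the atom lookup are both redone below)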
++ */ ++ reiser4_wait_page_writeback(pg); ++ spin_lock_jnode(node); ++ page_cache_release(pg); ++ atom = jnode_get_atom(node); ++/* VS-FIXME-HANS: improve the commenting in this function */ ++ if (atom == NULL) { ++ spin_unlock_jnode(node); ++ return; ++ } ++ } ++ reiser4_uncapture_block(node); ++ spin_unlock_atom(atom); ++ jput(node); ++} ++ ++/* this is used in extent's kill hook to uncapture and unhash jnodes attached to ++ * inode's tree of jnodes */ ++void reiser4_uncapture_jnode(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert_spin_locked(&(node->guard)); ++ assert("", node->pg == 0); ++ ++ atom = jnode_get_atom(node); ++ if (atom == NULL) { ++ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); ++ spin_unlock_jnode(node); ++ return; ++ } ++ ++ reiser4_uncapture_block(node); ++ spin_unlock_atom(atom); ++ jput(node); ++} ++ ++/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer, ++ increases atom refcount and txnh_count, adds to txnh_list. */ ++static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh) ++{ ++ assert("umka-200", atom != NULL); ++ assert("umka-201", txnh != NULL); ++ ++ assert_spin_locked(&(txnh->hlock)); ++ assert_spin_locked(&(atom->alock)); ++ assert("jmacd-824", txnh->atom == NULL); ++ assert("nikita-3540", atom_isopen(atom)); ++ BUG_ON(txnh->atom != NULL); ++ ++ atomic_inc(&atom->refcount); ++ txnh->atom = atom; ++ reiser4_ctx_gfp_mask_set(); ++ list_add_tail(&txnh->txnh_link, &atom->txnh_list); ++ atom->txnh_count += 1; ++} ++ ++/* No-locking version of assign_block. Sets the block's atom pointer, references the ++ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */ ++static void capture_assign_block_nolock(txn_atom *atom, jnode *node) ++{ ++ assert("umka-202", atom != NULL); ++ assert("umka-203", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ assert_spin_locked(&(atom->alock)); ++ assert("jmacd-323", node->atom == NULL); ++ BUG_ON(!list_empty_careful(&node->capture_link)); ++ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY)); ++ ++ /* Pointer from jnode to atom is not counted in atom->refcount. */ ++ node->atom = atom; ++ ++ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom)); ++ atom->capture_count += 1; ++ /* reference to jnode is acquired by atom. */ ++ jref(node); ++ ++ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1)); ++ ++ LOCK_CNT_INC(t_refs); ++} ++ ++/* common code for dirtying both unformatted jnodes and formatted znodes. */ ++static void do_jnode_make_dirty(jnode * node, txn_atom * atom) ++{ ++ assert_spin_locked(&(node->guard)); ++ assert_spin_locked(&(atom->alock)); ++ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY)); ++ ++ JF_SET(node, JNODE_DIRTY); ++ ++ if (!JF_ISSET(node, JNODE_CLUSTER_PAGE)) ++ get_current_context()->nr_marked_dirty++; ++ ++ /* We grab2flush_reserve one additional block only if node was ++ not CREATED and jnode_flush did not sort it into neither ++ relocate set nor overwrite one. If node is in overwrite or ++ relocate set we assume that atom's flush reserved counter was ++ already adjusted. 
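++ In effect exactly one block of flush reserve is charged for each
++ newly dirtied leaf that already carries a real block number, and the
++ JNODE_FLUSH_RESERVED bit records that this atom was charged for it.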
*/ ++ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC) ++ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node) ++ && !jnode_is_cluster_page(node)) { ++ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr)); ++ assert("vs-1506", *jnode_get_block(node) != 0); ++ grabbed2flush_reserved_nolock(atom, (__u64) 1); ++ JF_SET(node, JNODE_FLUSH_RESERVED); ++ } ++ ++ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) { ++ /* If the atom is not set yet, it will be added to the appropriate list in ++ capture_assign_block_nolock. */ ++ /* Sometimes a node is set dirty before being captured -- the case for new ++ jnodes. In that case the jnode will be added to the appropriate list ++ in capture_assign_block_nolock. Another reason not to re-link jnode is ++ that jnode is on a flush queue (see flush.c for details) */ ++ ++ int level = jnode_get_level(node); ++ ++ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR)); ++ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT); ++ assert("nikita-2607", 0 <= level); ++ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT); ++ ++ /* move node to atom's dirty list */ ++ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level)); ++ ON_DEBUG(count_jnode ++ (atom, node, NODE_LIST(node), DIRTY_LIST, 1)); ++ } ++} ++ ++/* Set the dirty status for this (spin locked) jnode. */ ++void jnode_make_dirty_locked(jnode * node) ++{ ++ assert("umka-204", node != NULL); ++ assert_spin_locked(&(node->guard)); ++ ++ if (REISER4_DEBUG && rofs_jnode(node)) { ++ warning("nikita-3365", "Dirtying jnode on rofs"); ++ dump_stack(); ++ } ++ ++ /* Fast check for already dirty node */ ++ if (!JF_ISSET(node, JNODE_DIRTY)) { ++ txn_atom *atom; ++ ++ atom = jnode_get_atom(node); ++ assert("vs-1094", atom); ++ /* Check jnode dirty status again because node spin lock might ++ * be released inside jnode_get_atom(). */ ++ if (likely(!JF_ISSET(node, JNODE_DIRTY))) ++ do_jnode_make_dirty(node, atom); ++ spin_unlock_atom(atom); ++ } ++} ++ ++/* Set the dirty status for this znode. */ ++void znode_make_dirty(znode * z) ++{ ++ jnode *node; ++ struct page *page; ++ ++ assert("umka-204", z != NULL); ++ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z)); ++ assert("nikita-3560", znode_is_write_locked(z)); ++ ++ node = ZJNODE(z); ++ /* znode is longterm locked, we can check dirty bit without spinlock */ ++ if (JF_ISSET(node, JNODE_DIRTY)) { ++ /* znode is dirty already. All we have to do is to change znode version */ ++ z->version = znode_build_version(jnode_get_tree(node)); ++ return; ++ } ++ ++ spin_lock_jnode(node); ++ jnode_make_dirty_locked(node); ++ page = jnode_page(node); ++ if (page != NULL) { ++ /* this is useful assertion (allows one to check that no ++ * modifications are lost due to update of in-flight page), ++ * but it requires locking on page to check PG_writeback ++ * bit. */ ++ /* assert("nikita-3292", ++ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */ ++ page_cache_get(page); ++ ++ /* jnode lock is not needed for the rest of ++ * znode_set_dirty(). */ ++ spin_unlock_jnode(node); ++ /* reiser4 file write code calls set_page_dirty for ++ * unformatted nodes, for formatted nodes we do it here. 
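++ (set_page_dirty_notag() presumably marks the page dirty without
++ setting the radix-tree dirty tag, so that generic VM writeback does
++ not pick formatted nodes up behind the transaction manager's back)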
*/ ++ set_page_dirty_notag(page); ++ page_cache_release(page); ++ /* bump version counter in znode */ ++ z->version = znode_build_version(jnode_get_tree(node)); ++ } else { ++ assert("zam-596", znode_above_root(JZNODE(node))); ++ spin_unlock_jnode(node); ++ } ++ ++ assert("nikita-1900", znode_is_write_locked(z)); ++ assert("jmacd-9777", node->atom != NULL); ++} ++ ++int reiser4_sync_atom(txn_atom * atom) ++{ ++ int result; ++ txn_handle *txnh; ++ ++ txnh = get_current_context()->trans; ++ ++ result = 0; ++ if (atom != NULL) { ++ if (atom->stage < ASTAGE_PRE_COMMIT) { ++ spin_lock_txnh(txnh); ++ capture_assign_txnh_nolock(atom, txnh); ++ result = force_commit_atom(txnh); ++ } else if (atom->stage < ASTAGE_POST_COMMIT) { ++ /* wait atom commit */ ++ reiser4_atom_wait_event(atom); ++ /* try once more */ ++ result = RETERR(-E_REPEAT); ++ } else ++ spin_unlock_atom(atom); ++ } ++ return result; ++} ++ ++#if REISER4_DEBUG ++ ++/* move jnode form one list to another ++ call this after atom->capture_count is updated */ ++void ++count_jnode(txn_atom * atom, jnode * node, atom_list old_list, ++ atom_list new_list, int check_lists) ++{ ++ struct list_head *pos; ++ ++ assert("zam-1018", atom_is_protected(atom)); ++ assert_spin_locked(&(node->guard)); ++ assert("", NODE_LIST(node) == old_list); ++ ++ switch (NODE_LIST(node)) { ++ case NOT_CAPTURED: ++ break; ++ case DIRTY_LIST: ++ assert("", atom->dirty > 0); ++ atom->dirty--; ++ break; ++ case CLEAN_LIST: ++ assert("", atom->clean > 0); ++ atom->clean--; ++ break; ++ case FQ_LIST: ++ assert("", atom->fq > 0); ++ atom->fq--; ++ break; ++ case WB_LIST: ++ assert("", atom->wb > 0); ++ atom->wb--; ++ break; ++ case OVRWR_LIST: ++ assert("", atom->ovrwr > 0); ++ atom->ovrwr--; ++ break; ++ default: ++ impossible("", ""); ++ } ++ ++ switch (new_list) { ++ case NOT_CAPTURED: ++ break; ++ case DIRTY_LIST: ++ atom->dirty++; ++ break; ++ case CLEAN_LIST: ++ atom->clean++; ++ break; ++ case FQ_LIST: ++ atom->fq++; ++ break; ++ case WB_LIST: ++ atom->wb++; ++ break; ++ case OVRWR_LIST: ++ atom->ovrwr++; ++ break; ++ default: ++ impossible("", ""); ++ } ++ ASSIGN_NODE_LIST(node, new_list); ++ if (0 && check_lists) { ++ int count; ++ tree_level level; ++ ++ count = 0; ++ ++ /* flush queue list */ ++ /* reiser4_check_fq(atom); */ ++ ++ /* dirty list */ ++ count = 0; ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { ++ list_for_each(pos, ATOM_DIRTY_LIST(atom, level)) ++ count++; ++ } ++ if (count != atom->dirty) ++ warning("", "dirty counter %d, real %d\n", atom->dirty, ++ count); ++ ++ /* clean list */ ++ count = 0; ++ list_for_each(pos, ATOM_CLEAN_LIST(atom)) ++ count++; ++ if (count != atom->clean) ++ warning("", "clean counter %d, real %d\n", atom->clean, ++ count); ++ ++ /* wb list */ ++ count = 0; ++ list_for_each(pos, ATOM_WB_LIST(atom)) ++ count++; ++ if (count != atom->wb) ++ warning("", "wb counter %d, real %d\n", atom->wb, ++ count); ++ ++ /* overwrite list */ ++ count = 0; ++ list_for_each(pos, ATOM_OVRWR_LIST(atom)) ++ count++; ++ ++ if (count != atom->ovrwr) ++ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr, ++ count); ++ } ++ assert("vs-1624", atom->num_queued == atom->fq); ++ if (atom->capture_count != ++ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) { ++ printk ++ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n", ++ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr, ++ atom->wb, atom->fq); ++ assert("vs-1622", ++ atom->capture_count == ++ atom->dirty + atom->clean + atom->ovrwr + atom->wb + ++ 
atom->fq); ++ } ++} ++ ++#endif ++ ++/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode ++ * lock should be taken before calling this function. */ ++void jnode_make_wander_nolock(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert("nikita-2431", node != NULL); ++ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC)); ++ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY)); ++ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); ++ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); ++ ++ atom = node->atom; ++ ++ assert("zam-895", atom != NULL); ++ assert("zam-894", atom_is_protected(atom)); ++ ++ JF_SET(node, JNODE_OVRWR); ++ /* move node to atom's overwrite list */ ++ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom)); ++ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1)); ++} ++ ++/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside ++ * this function. */ ++void jnode_make_wander(jnode * node) ++{ ++ txn_atom *atom; ++ ++ spin_lock_jnode(node); ++ atom = jnode_get_atom(node); ++ assert("zam-913", atom != NULL); ++ assert("zam-914", !JF_ISSET(node, JNODE_RELOC)); ++ ++ jnode_make_wander_nolock(node); ++ spin_unlock_atom(atom); ++ spin_unlock_jnode(node); ++} ++ ++/* this just sets RELOC bit */ ++static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node) ++{ ++ assert_spin_locked(&(node->guard)); ++ assert("zam-916", JF_ISSET(node, JNODE_DIRTY)); ++ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); ++ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); ++ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); ++ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); ++ jnode_set_reloc(node); ++} ++ ++/* Make znode RELOC and put it on flush queue */ ++void znode_make_reloc(znode * z, flush_queue_t * fq) ++{ ++ jnode *node; ++ txn_atom *atom; ++ ++ node = ZJNODE(z); ++ spin_lock_jnode(node); ++ ++ atom = jnode_get_atom(node); ++ assert("zam-919", atom != NULL); ++ ++ jnode_make_reloc_nolock(fq, node); ++ queue_jnode(fq, node); ++ ++ spin_unlock_atom(atom); ++ spin_unlock_jnode(node); ++ ++} ++ ++/* Make unformatted node RELOC and put it on flush queue */ ++void unformatted_make_reloc(jnode *node, flush_queue_t *fq) ++{ ++ assert("vs-1479", jnode_is_unformatted(node)); ++ ++ jnode_make_reloc_nolock(fq, node); ++ queue_jnode(fq, node); ++} ++ ++int reiser4_capture_super_block(struct super_block *s) ++{ ++ int result; ++ znode *uber; ++ lock_handle lh; ++ ++ init_lh(&lh); ++ result = get_uber_znode(reiser4_get_tree(s), ++ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh); ++ if (result) ++ return result; ++ ++ uber = lh.node; ++ /* Grabbing one block for superblock */ ++ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED); ++ if (result != 0) ++ return result; ++ ++ znode_make_dirty(uber); ++ ++ done_lh(&lh); ++ return 0; ++} ++ ++/* Wakeup every handle on the atom's WAITFOR list */ ++static void wakeup_atom_waitfor_list(txn_atom * atom) ++{ ++ txn_wait_links *wlinks; ++ ++ assert("umka-210", atom != NULL); ++ ++ /* atom is locked */ ++ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) { ++ if (wlinks->waitfor_cb == NULL || ++ wlinks->waitfor_cb(atom, wlinks)) ++ /* Wake up. 
*/ ++ reiser4_wake_up(wlinks->_lock_stack); ++ } ++} ++ ++/* Wakeup every handle on the atom's WAITING list */ ++static void wakeup_atom_waiting_list(txn_atom * atom) ++{ ++ txn_wait_links *wlinks; ++ ++ assert("umka-211", atom != NULL); ++ ++ /* atom is locked */ ++ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) { ++ if (wlinks->waiting_cb == NULL || ++ wlinks->waiting_cb(atom, wlinks)) ++ /* Wake up. */ ++ reiser4_wake_up(wlinks->_lock_stack); ++ } ++} ++ ++/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */ ++static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks) ++{ ++ assert("nikita-3330", atom != NULL); ++ assert_spin_locked(&(atom->alock)); ++ ++ /* atom->txnh_count == 1 is for waking waiters up if we are releasing ++ * last transaction handle. */ ++ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1; ++} ++ ++/* The general purpose of this function is to wait on the first of two possible events. ++ The situation is that a handle (and its atom atomh) is blocked trying to capture a ++ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The ++ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with ++ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it ++ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will ++ proceed and fuse the two atoms in the CAPTURE_WAIT state. ++ ++ In other words, if either atomh or atomf change state, the handle will be awakened, ++ thus there are two lists per atom: WAITING and WAITFOR. ++ ++ This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to ++ close but it is not assigned to an atom of its own. ++ ++ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK, ++ BOTH_ATOM_LOCKS. Result: all four locks are released. ++*/ ++static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf, ++ txn_atom * atomh, txn_capture mode) ++{ ++ int ret; ++ txn_wait_links wlinks; ++ ++ assert("umka-213", txnh != NULL); ++ assert("umka-214", atomf != NULL); ++ ++ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) { ++ spin_unlock_txnh(txnh); ++ spin_unlock_atom(atomf); ++ ++ if (atomh) { ++ spin_unlock_atom(atomh); ++ } ++ ++ return RETERR(-E_BLOCK); ++ } ++ ++ /* Initialize the waiting list links. */ ++ init_wlinks(&wlinks); ++ ++ /* Add txnh to atomf's waitfor list, unlock atomf. */ ++ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list); ++ wlinks.waitfor_cb = wait_for_fusion; ++ atomic_inc(&atomf->refcount); ++ spin_unlock_atom(atomf); ++ ++ if (atomh) { ++ /* Add txnh to atomh's waiting list, unlock atomh. */ ++ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list); ++ atomic_inc(&atomh->refcount); ++ spin_unlock_atom(atomh); ++ } ++ ++ /* Go to sleep. */ ++ spin_unlock_txnh(txnh); ++ ++ ret = reiser4_prepare_to_sleep(wlinks._lock_stack); ++ if (ret == 0) { ++ reiser4_go_to_sleep(wlinks._lock_stack); ++ ret = RETERR(-E_REPEAT); ++ } ++ ++ /* Remove from the waitfor list. */ ++ spin_lock_atom(atomf); ++ ++ list_del(&wlinks._fwaitfor_link); ++ atom_dec_and_unlock(atomf); ++ ++ if (atomh) { ++ /* Remove from the waiting list. 
*/ ++ spin_lock_atom(atomh); ++ list_del(&wlinks._fwaiting_link); ++ atom_dec_and_unlock(atomh); ++ } ++ return ret; ++} ++ ++static void lock_two_atoms(txn_atom * one, txn_atom * two) ++{ ++ assert("zam-1067", one != two); ++ ++ /* lock the atom with lesser address first */ ++ if (one < two) { ++ spin_lock_atom(one); ++ spin_lock_atom_nested(two); ++ } else { ++ spin_lock_atom(two); ++ spin_lock_atom_nested(one); ++ } ++} ++ ++/* Perform the necessary work to prepare for fusing two atoms, which involves ++ * acquiring two atom locks in the proper order. If one of the node's atom is ++ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's ++ * atom is not then the handle's request is put to sleep. If the node's atom ++ * is committing, then the node can be copy-on-captured. Otherwise, pick the ++ * atom with fewer pointers to be fused into the atom with more pointer and ++ * call capture_fuse_into. ++ */ ++static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode) ++{ ++ txn_atom * txnh_atom = txnh->atom; ++ txn_atom * block_atom = node->atom; ++ ++ atomic_inc(&txnh_atom->refcount); ++ atomic_inc(&block_atom->refcount); ++ ++ spin_unlock_txnh(txnh); ++ spin_unlock_jnode(node); ++ ++ lock_two_atoms(txnh_atom, block_atom); ++ ++ if (txnh->atom != txnh_atom || node->atom != block_atom ) { ++ release_two_atoms(txnh_atom, block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ ++ atomic_dec(&txnh_atom->refcount); ++ atomic_dec(&block_atom->refcount); ++ ++ assert ("zam-1066", atom_isopen(txnh_atom)); ++ ++ if (txnh_atom->stage >= block_atom->stage || ++ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) { ++ capture_fuse_into(txnh_atom, block_atom); ++ return RETERR(-E_REPEAT); ++ } ++ spin_lock_txnh(txnh); ++ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode); ++} ++ ++/* This function splices together two jnode lists (small and large) and sets all jnodes in ++ the small list to point to the large atom. Returns the length of the list. */ ++static int ++capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head, ++ struct list_head *small_head) ++{ ++ int count = 0; ++ jnode *node; ++ ++ assert("umka-218", large != NULL); ++ assert("umka-219", large_head != NULL); ++ assert("umka-220", small_head != NULL); ++ /* small atom should be locked also. */ ++ assert_spin_locked(&(large->alock)); ++ ++ /* For every jnode on small's capture list... */ ++ list_for_each_entry(node, small_head, capture_link) { ++ count += 1; ++ ++ /* With the jnode lock held, update atom pointer. */ ++ spin_lock_jnode(node); ++ node->atom = large; ++ spin_unlock_jnode(node); ++ } ++ ++ /* Splice the lists. */ ++ list_splice_init(small_head, large_head->prev); ++ ++ return count; ++} ++ ++/* This function splices together two txnh lists (small and large) and sets all txn handles in ++ the small list to point to the large atom. Returns the length of the list. */ ++static int ++capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head, ++ struct list_head *small_head) ++{ ++ int count = 0; ++ txn_handle *txnh; ++ ++ assert("umka-221", large != NULL); ++ assert("umka-222", large_head != NULL); ++ assert("umka-223", small_head != NULL); ++ ++ /* Adjust every txnh to the new atom. */ ++ list_for_each_entry(txnh, small_head, txnh_link) { ++ count += 1; ++ ++ /* With the txnh lock held, update atom pointer. */ ++ spin_lock_txnh(txnh); ++ txnh->atom = large; ++ spin_unlock_txnh(txnh); ++ } ++ ++ /* Splice the txn_handle list. 
*/
++ list_splice_init(small_head, large_head->prev);
++
++ return count;
++}
++
++/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
++ added to LARGE and their ->atom pointers are all updated. The associated counts are
++ updated as well, and any waiting handles belonging to either are awakened. Finally the
++ smaller atom's refcount is decremented.
++*/
++static void capture_fuse_into(txn_atom * small, txn_atom * large)
++{
++ int level;
++ unsigned zcount = 0;
++ unsigned tcount = 0;
++
++ assert("umka-224", small != NULL);
++ assert("umka-225", large != NULL);
++
++ assert_spin_locked(&(large->alock));
++ assert_spin_locked(&(small->alock));
++
++ assert("jmacd-201", atom_isopen(small));
++ assert("jmacd-202", atom_isopen(large));
++
++ /* Splice and update the per-level dirty jnode lists */
++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
++ zcount +=
++ capture_fuse_jnode_lists(large,
++ ATOM_DIRTY_LIST(large, level),
++ ATOM_DIRTY_LIST(small, level));
++ }
++
++ /* Splice and update the remaining (clean, overwrite, writeback, inode)
++ jnode lists and the txnh list */
++ zcount +=
++ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
++ ATOM_CLEAN_LIST(small));
++ zcount +=
++ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
++ ATOM_OVRWR_LIST(small));
++ zcount +=
++ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
++ ATOM_WB_LIST(small));
++ zcount +=
++ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
++ tcount +=
++ capture_fuse_txnh_lists(large, &large->txnh_list,
++ &small->txnh_list);
++
++ /* Check our accounting. */
++ assert("jmacd-1063",
++ zcount + small->num_queued == small->capture_count);
++ assert("jmacd-1065", tcount == small->txnh_count);
++
++ /* sum the numbers of waiting threads */
++ large->nr_waiters += small->nr_waiters;
++ small->nr_waiters = 0;
++
++ /* splice flush queues */
++ reiser4_fuse_fq(large, small);
++
++ /* update the per-list jnode counters on both atoms */
++ ON_DEBUG(large->dirty += small->dirty;
++ small->dirty = 0;
++ large->clean += small->clean;
++ small->clean = 0;
++ large->ovrwr += small->ovrwr;
++ small->ovrwr = 0;
++ large->wb += small->wb;
++ small->wb = 0;
++ large->fq += small->fq;
++ small->fq = 0;);
++
++ /* count flushers in the resulting atom */
++ large->nr_flushers += small->nr_flushers;
++ small->nr_flushers = 0;
++
++ /* update counts of flushed nodes */
++ large->flushed += small->flushed;
++ small->flushed = 0;
++
++ /* Transfer list counts to large. */
++ large->txnh_count += small->txnh_count;
++ large->capture_count += small->capture_count;
++
++ /* Add all txnh references to large. */
++ atomic_add(small->txnh_count, &large->refcount);
++ atomic_sub(small->txnh_count, &small->refcount);
++
++ /* Reset small counts */
++ small->txnh_count = 0;
++ small->capture_count = 0;
++
++ /* Assign the oldest start_time, merge flags. */
++ large->start_time = min(large->start_time, small->start_time);
++ large->flags |= small->flags;
++
++ /* Merge blocknr sets.
*/ ++ blocknr_set_merge(&small->delete_set, &large->delete_set); ++ blocknr_set_merge(&small->wandered_map, &large->wandered_map); ++ ++ /* Merge allocated/deleted file counts */ ++ large->nr_objects_deleted += small->nr_objects_deleted; ++ large->nr_objects_created += small->nr_objects_created; ++ ++ small->nr_objects_deleted = 0; ++ small->nr_objects_created = 0; ++ ++ /* Merge allocated blocks counts */ ++ large->nr_blocks_allocated += small->nr_blocks_allocated; ++ ++ large->nr_running_queues += small->nr_running_queues; ++ small->nr_running_queues = 0; ++ ++ /* Merge blocks reserved for overwrite set. */ ++ large->flush_reserved += small->flush_reserved; ++ small->flush_reserved = 0; ++ ++ if (large->stage < small->stage) { ++ /* Large only needs to notify if it has changed state. */ ++ reiser4_atom_set_stage(large, small->stage); ++ wakeup_atom_waiting_list(large); ++ } ++ ++ reiser4_atom_set_stage(small, ASTAGE_INVALID); ++ ++ /* Notify any waiters--small needs to unload its wait lists. Waiters ++ actually remove themselves from the list before returning from the ++ fuse_wait function. */ ++ wakeup_atom_waiting_list(small); ++ ++ /* Unlock atoms */ ++ spin_unlock_atom(large); ++ atom_dec_and_unlock(small); ++} ++ ++/* TXNMGR STUFF */ ++ ++/* Release a block from the atom, reversing the effects of being captured, ++ do not release atom's reference to jnode due to holding spin-locks. ++ Currently this is only called when the atom commits. ++ ++ NOTE: this function does not release a (journal) reference to jnode ++ due to locking optimizations, you should call jput() somewhere after ++ calling reiser4_uncapture_block(). */ ++void reiser4_uncapture_block(jnode * node) ++{ ++ txn_atom *atom; ++ ++ assert("umka-226", node != NULL); ++ atom = node->atom; ++ assert("umka-228", atom != NULL); ++ ++ assert("jmacd-1021", node->atom == atom); ++ assert_spin_locked(&(node->guard)); ++ assert("jmacd-1023", atom_is_protected(atom)); ++ ++ JF_CLR(node, JNODE_DIRTY); ++ JF_CLR(node, JNODE_RELOC); ++ JF_CLR(node, JNODE_OVRWR); ++ JF_CLR(node, JNODE_CREATED); ++ JF_CLR(node, JNODE_WRITEBACK); ++ JF_CLR(node, JNODE_REPACK); ++ ++ list_del_init(&node->capture_link); ++ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { ++ assert("zam-925", atom_isopen(atom)); ++ assert("vs-1623", NODE_LIST(node) == FQ_LIST); ++ ON_DEBUG(atom->num_queued--); ++ JF_CLR(node, JNODE_FLUSH_QUEUED); ++ } ++ atom->capture_count -= 1; ++ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1)); ++ node->atom = NULL; ++ ++ spin_unlock_jnode(node); ++ LOCK_CNT_DEC(t_refs); ++} ++ ++/* Unconditional insert of jnode into atom's overwrite list. Currently used in ++ bitmap-based allocator code for adding modified bitmap blocks the ++ transaction. 
@atom and @node are spin locked */ ++void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node) ++{ ++ assert("zam-538", atom_is_protected(atom)); ++ assert_spin_locked(&(node->guard)); ++ assert("zam-899", JF_ISSET(node, JNODE_OVRWR)); ++ assert("zam-543", node->atom == NULL); ++ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node)); ++ ++ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom)); ++ jref(node); ++ node->atom = atom; ++ atom->capture_count++; ++ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1)); ++} ++ ++static int count_deleted_blocks_actor(txn_atom * atom, ++ const reiser4_block_nr * a, ++ const reiser4_block_nr * b, void *data) ++{ ++ reiser4_block_nr *counter = data; ++ ++ assert("zam-995", data != NULL); ++ assert("zam-996", a != NULL); ++ if (b == NULL) ++ *counter += 1; ++ else ++ *counter += *b; ++ return 0; ++} ++ ++reiser4_block_nr txnmgr_count_deleted_blocks(void) ++{ ++ reiser4_block_nr result; ++ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr; ++ txn_atom *atom; ++ ++ result = 0; ++ ++ spin_lock_txnmgr(tmgr); ++ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { ++ spin_lock_atom(atom); ++ if (atom_isopen(atom)) ++ blocknr_set_iterator( ++ atom, &atom->delete_set, ++ count_deleted_blocks_actor, &result, 0); ++ spin_unlock_atom(atom); ++ } ++ spin_unlock_txnmgr(tmgr); ++ ++ return result; ++} ++ ++/* ++ * Local variables: ++ * c-indentation-style: "K&R" ++ * mode-name: "LC" ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * fill-column: 79 ++ * End: ++ */ +diff -urN linux-2.6.33.orig/fs/reiser4/txnmgr.h linux-2.6.33/fs/reiser4/txnmgr.h +--- linux-2.6.33.orig/fs/reiser4/txnmgr.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/txnmgr.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,701 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* data-types and function declarations for transaction manager. See txnmgr.c ++ * for details. */ ++ ++#ifndef __REISER4_TXNMGR_H__ ++#define __REISER4_TXNMGR_H__ ++ ++#include "forward.h" ++#include "dformat.h" ++ ++#include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/types.h> ++#include <linux/spinlock.h> ++#include <asm/atomic.h> ++#include <linux/wait.h> ++ ++/* TYPE DECLARATIONS */ ++ ++/* This enumeration describes the possible types of a capture request (reiser4_try_capture). ++ A capture request dynamically assigns a block to the calling thread's transaction ++ handle. */ ++typedef enum { ++ /* A READ_ATOMIC request indicates that a block will be read and that the caller's ++ atom should fuse in order to ensure that the block commits atomically with the ++ caller. */ ++ TXN_CAPTURE_READ_ATOMIC = (1 << 0), ++ ++ /* A READ_NONCOM request indicates that a block will be read and that the caller is ++ willing to read a non-committed block without causing atoms to fuse. */ ++ TXN_CAPTURE_READ_NONCOM = (1 << 1), ++ ++ /* A READ_MODIFY request indicates that a block will be read but that the caller ++ wishes for the block to be captured as it will be written. This capture request ++ mode is not currently used, but eventually it will be useful for preventing ++ deadlock in read-modify-write cycles. */ ++ TXN_CAPTURE_READ_MODIFY = (1 << 2), ++ ++ /* A WRITE capture request indicates that a block will be modified and that atoms ++ should fuse to make the commit atomic. 
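++
++ For example, a writer that must not sleep waiting for fusion can pass a
++ combined request such as (TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING)
++ to reiser4_try_capture(); the CAPTURE_TYPE() macro defined at the end
++ of this enumeration strips the option bits again and yields the bare
++ TXN_CAPTURE_WRITE type.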
*/
++ TXN_CAPTURE_WRITE = (1 << 3),
++
++ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
++ exclusive type designation from extra bits that may be supplied -- see
++ below. */
++ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
++ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
++ TXN_CAPTURE_WRITE),
++
++ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
++ indicate modification will occur. */
++ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
++
++ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
++ prefer not to sleep waiting for an aging atom to commit. */
++ TXN_CAPTURE_NONBLOCKING = (1 << 4),
++
++ /* An option to reiser4_try_capture to prevent atom fusion; only simple
++ capturing is allowed */
++ TXN_CAPTURE_DONT_FUSE = (1 << 5)
++
++ /* This macro selects only the exclusive capture request types, stripping out any
++ options that were supplied (i.e., NONBLOCKING). */
++#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
++} txn_capture;
++
++/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
++ difference is in the handling of read requests. A WRITE_FUSING transaction handle
++ defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
++ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
++typedef enum {
++ TXN_WRITE_FUSING = (1 << 0),
++ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
++} txn_mode;
++
++/* Every atom has a stage, which is one of these exclusive values: */
++typedef enum {
++ /* Initially an atom is free. */
++ ASTAGE_FREE = 0,
++
++ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
++ blocks and fuse with other atoms. */
++ ASTAGE_CAPTURE_FUSE = 1,
++
++ /* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
++
++ /* When an atom reaches a certain age it must do all it can to commit. An atom in
++ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
++ atoms in the CAPTURE_FUSE stage. */
++ ASTAGE_CAPTURE_WAIT = 2,
++
++ /* Waiting for I/O before commit. Copy-on-capture (see
++ http://namesys.com/v4/v4.html). */
++ ASTAGE_PRE_COMMIT = 3,
++
++ /* Post-commit overwrite I/O. Steal-on-capture. */
++ ASTAGE_POST_COMMIT = 4,
++
++ /* Atom which waits for the removal of the last reference to it before
++ * being deleted from memory */
++ ASTAGE_DONE = 5,
++
++ /* invalid atom. */
++ ASTAGE_INVALID = 6,
++
++} txn_stage;
++
++/* Certain flags may be set in the txn_atom->flags field. */
++typedef enum {
++ /* Indicates that the atom should commit as soon as possible. */
++ ATOM_FORCE_COMMIT = (1 << 0),
++ /* to avoid an endless loop, mark the atom (which was considered too
++ * small) after a failed attempt to fuse it. */
++ ATOM_CANCEL_FUSION = (1 << 1)
++} txn_flags;
++
++/* Flags for controlling commit_txnh */
++typedef enum {
++ /* Wait for atom commit completion in commit_txnh */
++ TXNH_WAIT_COMMIT = 0x2,
++ /* Don't commit atom when this handle is closed */
++ TXNH_DONT_COMMIT = 0x4
++} txn_handle_flags_t;
++
++/* TYPE DEFINITIONS */
++
++/* A note on lock ordering: the handle & jnode spinlocks protect reading of their ->atom
++ fields, so typically an operation on the atom through either of these objects must (1)
++ lock the object, (2) read the atom pointer, (3) lock the atom.
++
++ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
++ through the list of handles and pages held by the smaller of the two atoms. For each
++ handle and page referencing the smaller atom, the fusing process must: (1) lock the
++ object, and (2) update the atom pointer.
++
++ You can see that there is a conflict of lock ordering here, so the more-complex
++ procedure should have priority, i.e., the fusing process has priority so that it is
++ guaranteed to make progress and to avoid restarts.
++
++ This decision, however, means additional complexity for acquiring the atom lock in the
++ first place.
++
++ The original procedure followed in the code was:
++
++ TXN_OBJECT *obj = ...;
++ TXN_ATOM *atom;
++
++ spin_lock (& obj->_lock);
++
++ atom = obj->_atom;
++
++ if (! spin_trylock_atom (atom))
++ {
++ spin_unlock (& obj->_lock);
++ RESTART OPERATION, THERE WAS A RACE;
++ }
++
++ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
++
++ It has, however, been found that this wastes a lot of CPU in a manner
++ that is hard to profile. So, proper refcounting was added to atoms, and
++ the new standard locking sequence is as follows:
++
++ TXN_OBJECT *obj = ...;
++ TXN_ATOM *atom;
++
++ spin_lock (& obj->_lock);
++
++ atom = obj->_atom;
++
++ if (! spin_trylock_atom (atom))
++ {
++ atomic_inc (& atom->refcount);
++ spin_unlock (& obj->_lock);
++ spin_lock (&atom->_lock);
++ atomic_dec (& atom->refcount);
++ // HERE atom is locked
++ spin_unlock (&atom->_lock);
++ RESTART OPERATION, THERE WAS A RACE;
++ }
++
++ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
++
++ (the core of this is implemented in the trylock_throttle() function)
++
++ See the jnode_get_atom() function for a common case.
++
++ As an additional (and important) optimization that avoids restarts,
++ it is possible to re-check required pre-conditions at the HERE point in
++ the code above and proceed without restarting if they are still satisfied.
++*/
++
++/* An atomic transaction: this is the underlying system representation
++ of a transaction, not the one seen by clients.
++
++ Invariants involving this data-type:
++
++ [sb-fake-allocated]
++*/
++struct txn_atom {
++ /* The spinlock protecting the atom, held during fusion and various other state
++ changes. */
++ spinlock_t alock;
++
++ /* The atom's reference counter. Increasing it (when duplicating an
++ existing reference, or when we are sure that some other reference
++ exists) may be done without taking the spinlock; decrementing it
++ requires the spinlock to be held.
++
++ Each transaction handle counts in ->refcount. All jnodes count as
++ one reference acquired in atom_begin_andlock(), released in
++ commit_current_atom().
++ */
++ atomic_t refcount;
++
++ /* The atom_id identifies the atom in persistent records such as the log. */
++ __u32 atom_id;
++
++ /* Flags holding any of the txn_flags enumerated values (e.g.,
++ ATOM_FORCE_COMMIT). */
++ __u32 flags;
++
++ /* Number of open handles. */
++ __u32 txnh_count;
++
++ /* The number of znodes captured by this atom. Equal to the sum of lengths of the
++ dirty_nodes[level] and clean_nodes lists. */
++ __u32 capture_count;
++
++#if REISER4_DEBUG
++ int clean;
++ int dirty;
++ int ovrwr;
++ int wb;
++ int fq;
++#endif
++
++ __u32 flushed;
++
++ /* Current transaction stage. */
++ txn_stage stage;
++
++ /* Start time. */
++ unsigned long start_time;
++
++ /* The atom's delete set. It collects block numbers of the nodes
++ which were deleted during the transaction.
*/ ++ struct list_head delete_set; ++ ++ /* The atom's wandered_block mapping. */ ++ struct list_head wandered_map; ++ ++ /* The transaction's list of dirty captured nodes--per level. Index ++ by (level). dirty_nodes[0] is for znode-above-root */ ++ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1]; ++ ++ /* The transaction's list of clean captured nodes. */ ++ struct list_head clean_nodes; ++ ++ /* The atom's overwrite set */ ++ struct list_head ovrwr_nodes; ++ ++ /* nodes which are being written to disk */ ++ struct list_head writeback_nodes; ++ ++ /* list of inodes */ ++ struct list_head inodes; ++ ++ /* List of handles associated with this atom. */ ++ struct list_head txnh_list; ++ ++ /* Transaction list link: list of atoms in the transaction manager. */ ++ struct list_head atom_link; ++ ++ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */ ++ struct list_head fwaitfor_list; ++ ++ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */ ++ struct list_head fwaiting_list; ++ ++ /* Numbers of objects which were deleted/created in this transaction ++ thereby numbers of objects IDs which were released/deallocated. */ ++ int nr_objects_deleted; ++ int nr_objects_created; ++ /* number of blocks allocated during the transaction */ ++ __u64 nr_blocks_allocated; ++ /* All atom's flush queue objects are on this list */ ++ struct list_head flush_queues; ++#if REISER4_DEBUG ++ /* number of flush queues for this atom. */ ++ int nr_flush_queues; ++ /* Number of jnodes which were removed from atom's lists and put ++ on flush_queue */ ++ int num_queued; ++#endif ++ /* number of threads who wait for this atom to complete commit */ ++ int nr_waiters; ++ /* number of threads which do jnode_flush() over this atom */ ++ int nr_flushers; ++ /* number of flush queues which are IN_USE and jnodes from fq->prepped ++ are submitted to disk by the reiser4_write_fq() routine. */ ++ int nr_running_queues; ++ /* A counter of grabbed unformatted nodes, see a description of the ++ * reiser4 space reservation scheme at block_alloc.c */ ++ reiser4_block_nr flush_reserved; ++#if REISER4_DEBUG ++ void *committer; ++#endif ++ struct super_block *super; ++}; ++ ++#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level]) ++#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes) ++#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes) ++#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes) ++#define ATOM_FQ_LIST(fq) (&(fq)->prepped) ++ ++#define NODE_LIST(node) (node)->list ++#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list) ++ON_DEBUG(void ++ count_jnode(txn_atom *, jnode *, atom_list old_list, ++ atom_list new_list, int check_lists)); ++ ++/* A transaction handle: the client obtains and commits this handle which is assigned by ++ the system to a txn_atom. */ ++struct txn_handle { ++ /* Spinlock protecting ->atom pointer */ ++ spinlock_t hlock; ++ ++ /* Flags for controlling commit_txnh() behavior */ ++ /* from txn_handle_flags_t */ ++ txn_handle_flags_t flags; ++ ++ /* Whether it is READ_FUSING or WRITE_FUSING. */ ++ txn_mode mode; ++ ++ /* If assigned, the atom it is part of. */ ++ txn_atom *atom; ++ ++ /* Transaction list link. Head is in txn_atom. */ ++ struct list_head txnh_link; ++}; ++ ++/* The transaction manager: one is contained in the reiser4_super_info_data */ ++struct txn_mgr { ++ /* A spinlock protecting the atom list, id_count, flush_control */ ++ spinlock_t tmgr_lock; ++ ++ /* List of atoms. 
*/ ++ struct list_head atoms_list; ++ ++ /* Number of atoms. */ ++ int atom_count; ++ ++ /* A counter used to assign atom->atom_id values. */ ++ __u32 id_count; ++ ++ /* a mutex object for commit serialization */ ++ struct mutex commit_mutex; ++ ++ /* a list of all txnmrgs served by particular daemon. */ ++ struct list_head linkage; ++ ++ /* description of daemon for this txnmgr */ ++ ktxnmgrd_context *daemon; ++ ++ /* parameters. Adjustable through mount options. */ ++ unsigned int atom_max_size; ++ unsigned int atom_max_age; ++ unsigned int atom_min_size; ++ /* max number of concurrent flushers for one atom, 0 - unlimited. */ ++ unsigned int atom_max_flushers; ++ struct dentry *debugfs_atom_count; ++ struct dentry *debugfs_id_count; ++}; ++ ++/* FUNCTION DECLARATIONS */ ++ ++/* These are the externally (within Reiser4) visible transaction functions, therefore they ++ are prefixed with "txn_". For comments, see txnmgr.c. */ ++ ++extern int init_txnmgr_static(void); ++extern void done_txnmgr_static(void); ++ ++extern void reiser4_init_txnmgr(txn_mgr *); ++extern void reiser4_done_txnmgr(txn_mgr *); ++ ++extern int reiser4_txn_reserve(int reserved); ++ ++extern void reiser4_txn_begin(reiser4_context * context); ++extern int reiser4_txn_end(reiser4_context * context); ++ ++extern void reiser4_txn_restart(reiser4_context * context); ++extern void reiser4_txn_restart_current(void); ++ ++extern int txnmgr_force_commit_all(struct super_block *, int); ++extern int current_atom_should_commit(void); ++ ++extern jnode *find_first_dirty_jnode(txn_atom *, int); ++ ++extern int commit_some_atoms(txn_mgr *); ++extern int force_commit_atom(txn_handle *); ++extern int flush_current_atom(int, long, long *, txn_atom **, jnode *); ++ ++extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int); ++ ++extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage); ++ ++extern int same_slum_check(jnode * base, jnode * check, int alloc_check, ++ int alloc_value); ++extern void atom_dec_and_unlock(txn_atom * atom); ++ ++extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags); ++extern int try_capture_page_to_invalidate(struct page *pg); ++ ++extern void reiser4_uncapture_page(struct page *pg); ++extern void reiser4_uncapture_block(jnode *); ++extern void reiser4_uncapture_jnode(jnode *); ++ ++extern int reiser4_capture_inode(struct inode *); ++extern int reiser4_uncapture_inode(struct inode *); ++ ++extern txn_atom *get_current_atom_locked_nocheck(void); ++ ++#if REISER4_DEBUG ++ ++/** ++ * atom_is_protected - make sure that nobody but us can do anything with atom ++ * @atom: atom to be checked ++ * ++ * This is used to assert that atom either entered commit stages or is spin ++ * locked. ++ */ ++static inline int atom_is_protected(txn_atom *atom) ++{ ++ if (atom->stage >= ASTAGE_PRE_COMMIT) ++ return 1; ++ assert_spin_locked(&(atom->alock)); ++ return 1; ++} ++ ++#endif ++ ++/* Get the current atom and spinlock it if current atom present. 
May not return NULL */ ++static inline txn_atom *get_current_atom_locked(void) ++{ ++ txn_atom *atom; ++ ++ atom = get_current_atom_locked_nocheck(); ++ assert("zam-761", atom != NULL); ++ ++ return atom; ++} ++ ++extern txn_atom *jnode_get_atom(jnode *); ++ ++extern void reiser4_atom_wait_event(txn_atom *); ++extern void reiser4_atom_send_event(txn_atom *); ++ ++extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node); ++extern int reiser4_capture_super_block(struct super_block *s); ++int capture_bulk(jnode **, int count); ++ ++/* See the comment on the function blocknrset.c:blocknr_set_add for the ++ calling convention of these three routines. */ ++extern void blocknr_set_init(struct list_head * bset); ++extern void blocknr_set_destroy(struct list_head * bset); ++extern void blocknr_set_merge(struct list_head * from, struct list_head * into); ++extern int blocknr_set_add_extent(txn_atom * atom, ++ struct list_head * bset, ++ blocknr_set_entry ** new_bsep, ++ const reiser4_block_nr * start, ++ const reiser4_block_nr * len); ++extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset, ++ blocknr_set_entry ** new_bsep, ++ const reiser4_block_nr * a, ++ const reiser4_block_nr * b); ++ ++typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *, ++ const reiser4_block_nr *, void *); ++ ++extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset, ++ blocknr_set_actor_f actor, void *data, ++ int delete); ++ ++/* flush code takes care about how to fuse flush queues */ ++extern void flush_init_atom(txn_atom * atom); ++extern void flush_fuse_queues(txn_atom * large, txn_atom * small); ++ ++static inline void spin_lock_atom(txn_atom *atom) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(spin_locked_atom) && ++ LOCK_CNT_NIL(spin_locked_jnode) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(rw_locked_tree))); ++ ++ spin_lock(&(atom->alock)); ++ ++ LOCK_CNT_INC(spin_locked_atom); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline void spin_lock_atom_nested(txn_atom *atom) ++{ ++ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(spin_locked_jnode) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(rw_locked_tree))); ++ ++ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING); ++ ++ LOCK_CNT_INC(spin_locked_atom); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline int spin_trylock_atom(txn_atom *atom) ++{ ++ if (spin_trylock(&(atom->alock))) { ++ LOCK_CNT_INC(spin_locked_atom); ++ LOCK_CNT_INC(spin_locked); ++ return 1; ++ } ++ return 0; ++} ++ ++static inline void spin_unlock_atom(txn_atom *atom) ++{ ++ assert_spin_locked(&(atom->alock)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_atom); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(atom->alock)); ++} ++ ++static inline void spin_lock_txnh(txn_handle *txnh) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_tree))); ++ ++ spin_lock(&(txnh->hlock)); ++ ++ LOCK_CNT_INC(spin_locked_txnh); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline int spin_trylock_txnh(txn_handle *txnh) ++{ ++ if (spin_trylock(&(txnh->hlock))) { ++ LOCK_CNT_INC(spin_locked_txnh); ++ LOCK_CNT_INC(spin_locked); ++ return 
1; ++ } ++ return 0; ++} ++ ++static inline void spin_unlock_txnh(txn_handle *txnh) ++{ ++ assert_spin_locked(&(txnh->hlock)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_txnh); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(txnh->hlock)); ++} ++ ++#define spin_ordering_pred_txnmgr(tmgr) \ ++ ( LOCK_CNT_NIL(spin_locked_atom) && \ ++ LOCK_CNT_NIL(spin_locked_txnh) && \ ++ LOCK_CNT_NIL(spin_locked_jnode) && \ ++ LOCK_CNT_NIL(rw_locked_zlock) && \ ++ LOCK_CNT_NIL(rw_locked_dk) && \ ++ LOCK_CNT_NIL(rw_locked_tree) ) ++ ++static inline void spin_lock_txnmgr(txn_mgr *mgr) ++{ ++ /* check that spinlocks of lower priorities are not held */ ++ assert("", (LOCK_CNT_NIL(spin_locked_atom) && ++ LOCK_CNT_NIL(spin_locked_txnh) && ++ LOCK_CNT_NIL(spin_locked_jnode) && ++ LOCK_CNT_NIL(spin_locked_zlock) && ++ LOCK_CNT_NIL(rw_locked_dk) && ++ LOCK_CNT_NIL(rw_locked_tree))); ++ ++ spin_lock(&(mgr->tmgr_lock)); ++ ++ LOCK_CNT_INC(spin_locked_txnmgr); ++ LOCK_CNT_INC(spin_locked); ++} ++ ++static inline int spin_trylock_txnmgr(txn_mgr *mgr) ++{ ++ if (spin_trylock(&(mgr->tmgr_lock))) { ++ LOCK_CNT_INC(spin_locked_txnmgr); ++ LOCK_CNT_INC(spin_locked); ++ return 1; ++ } ++ return 0; ++} ++ ++static inline void spin_unlock_txnmgr(txn_mgr *mgr) ++{ ++ assert_spin_locked(&(mgr->tmgr_lock)); ++ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr)); ++ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); ++ ++ LOCK_CNT_DEC(spin_locked_txnmgr); ++ LOCK_CNT_DEC(spin_locked); ++ ++ spin_unlock(&(mgr->tmgr_lock)); ++} ++ ++typedef enum { ++ FQ_IN_USE = 0x1 ++} flush_queue_state_t; ++ ++typedef struct flush_queue flush_queue_t; ++ ++/* This is an accumulator for jnodes prepared for writing to disk. A flush queue ++ is filled by the jnode_flush() routine, and written to disk under memory ++ pressure or at atom commit time. */ ++/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued ++ field and fq->prepped list can be modified if atom is spin-locked and fq ++ object is "in-use" state. For read-only traversal of the fq->prepped list ++ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or ++ only have atom spin-locked. */ ++struct flush_queue { ++ /* linkage element is the first in this structure to make debugging ++ easier. See field in atom struct for description of list. */ ++ struct list_head alink; ++ /* A spinlock to protect changes of fq state and fq->atom pointer */ ++ spinlock_t guard; ++ /* flush_queue state: [in_use | ready] */ ++ flush_queue_state_t state; ++ /* A list which contains queued nodes, queued nodes are removed from any ++ * atom's list and put on this ->prepped one. */ ++ struct list_head prepped; ++ /* number of submitted i/o requests */ ++ atomic_t nr_submitted; ++ /* number of i/o errors */ ++ atomic_t nr_errors; ++ /* An atom this flush queue is attached to */ ++ txn_atom *atom; ++ /* A wait queue head to wait on i/o completion */ ++ wait_queue_head_t wait; ++#if REISER4_DEBUG ++ /* A thread which took this fq in exclusive use, NULL if fq is free, ++ * used for debugging. 
*/ ++ struct task_struct *owner; ++#endif ++}; ++ ++extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **); ++extern void reiser4_fq_put_nolock(flush_queue_t *); ++extern void reiser4_fq_put(flush_queue_t *); ++extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from); ++extern void queue_jnode(flush_queue_t *, jnode *); ++ ++extern int reiser4_write_fq(flush_queue_t *, long *, int); ++extern int current_atom_finish_all_fq(void); ++extern void init_atom_fq_parts(txn_atom *); ++ ++extern reiser4_block_nr txnmgr_count_deleted_blocks(void); ++ ++extern void znode_make_dirty(znode * node); ++extern void jnode_make_dirty_locked(jnode * node); ++ ++extern int reiser4_sync_atom(txn_atom * atom); ++ ++#if REISER4_DEBUG ++extern int atom_fq_parts_are_clean(txn_atom *); ++#endif ++ ++extern void add_fq_to_bio(flush_queue_t *, struct bio *); ++extern flush_queue_t *get_fq_for_current_atom(void); ++ ++void reiser4_invalidate_list(struct list_head * head); ++ ++# endif /* __REISER4_TXNMGR_H__ */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/type_safe_hash.h linux-2.6.33/fs/reiser4/type_safe_hash.h +--- linux-2.6.33.orig/fs/reiser4/type_safe_hash.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/type_safe_hash.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,320 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* A hash table class that uses hash chains (singly-linked) and is ++ parametrized to provide type safety. */ ++ ++#ifndef __REISER4_TYPE_SAFE_HASH_H__ ++#define __REISER4_TYPE_SAFE_HASH_H__ ++ ++#include "debug.h" ++ ++#include <asm/errno.h> ++/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects ++ based on the object type. You need to declare the item type before ++ this definition, define it after this definition. */ ++#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \ ++ \ ++typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \ ++typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \ ++ \ ++struct PREFIX##_hash_table_ \ ++{ \ ++ ITEM_TYPE **_table; \ ++ __u32 _buckets; \ ++}; \ ++ \ ++struct PREFIX##_hash_link_ \ ++{ \ ++ ITEM_TYPE *_next; \ ++} ++ ++/* Step 2: Define the object type of the hash: give it field of type ++ PREFIX_hash_link. */ ++ ++/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using ++ the type and field name used in step 3. The arguments are: ++ ++ ITEM_TYPE The item type being hashed ++ KEY_TYPE The type of key being hashed ++ KEY_NAME The name of the key field within the item ++ LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link) ++ HASH_FUNC The name of the hash function (or macro, takes const pointer to key) ++ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys) ++ ++ It implements these functions: ++ ++ prefix_hash_init Initialize the table given its size. ++ prefix_hash_insert Insert an item ++ prefix_hash_insert_index Insert an item w/ precomputed hash_index ++ prefix_hash_find Find an item by key ++ prefix_hash_find_index Find an item w/ precomputed hash_index ++ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found ++ prefix_hash_remove_index Remove an item w/ precomputed hash_index ++ ++ If you'd like something to be done differently, feel free to ask me ++ for modifications. 
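++
++ As a usage sketch (the "foo" names below are hypothetical and serve
++ only to illustrate the three steps above):
++
++ TYPE_SAFE_HASH_DECLARE(foo, struct foo);
++
++ struct foo {
++ int key;
++ foo_hash_link link;
++ };
++
++ static __u32 foo_hashfn(foo_hash_table *table, const int *key)
++ {
++ return ((__u32) *key) % table->_buckets;
++ }
++
++ static int foo_eqfn(const int *a, const int *b)
++ {
++ return *a == *b;
++ }
++
++ TYPE_SAFE_HASH_DEFINE(foo, struct foo, int, key, link,
++ foo_hashfn, foo_eqfn);
++
++ After foo_hash_init(&table, 128) succeeds, items can be added with
++ foo_hash_insert(&table, item), looked up with foo_hash_find(&table, &key),
++ and removed with foo_hash_remove(&table, item); foo_hash_done(&table)
++ releases the bucket array.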
Additional features that could be added but ++ have not been: ++ ++ prefix_hash_remove_key Find and remove an item by key ++ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index ++ ++ The hash_function currently receives only the key as an argument, ++ meaning it must somehow know the number of buckets. If this is a ++ problem let me know. ++ ++ This hash table uses a single-linked hash chain. This means ++ insertion is fast but deletion requires searching the chain. ++ ++ There is also the doubly-linked hash chain approach, under which ++ deletion requires no search but the code is longer and it takes two ++ pointers per item. ++ ++ The circularly-linked approach has the shortest code but requires ++ two pointers per bucket, doubling the size of the bucket array (in ++ addition to two pointers per item). ++*/ ++#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \ ++ \ ++static __inline__ void \ ++PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \ ++ __u32 hash UNUSED_ARG) \ ++{ \ ++ assert("nikita-2780", hash < table->_buckets); \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_init (PREFIX##_hash_table *hash, \ ++ __u32 buckets) \ ++{ \ ++ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \ ++ hash->_buckets = buckets; \ ++ if (hash->_table == NULL) \ ++ { \ ++ return RETERR(-ENOMEM); \ ++ } \ ++ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \ ++ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \ ++ return 0; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_done (PREFIX##_hash_table *hash) \ ++{ \ ++ if (REISER4_DEBUG && hash->_table != NULL) { \ ++ __u32 i; \ ++ for (i = 0 ; i < hash->_buckets ; ++ i) \ ++ assert("nikita-2905", hash->_table[i] == NULL); \ ++ } \ ++ if (hash->_table != NULL) \ ++ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \ ++ hash->_table = NULL; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \ ++{ \ ++ prefetch(item->LINK_NAME._next); \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \ ++ __u32 index) \ ++{ \ ++ prefetch(hash->_table[index]); \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ ITEM_TYPE *item; \ ++ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ for (item = hash->_table[hash_index]; \ ++ item != NULL; \ ++ item = item->LINK_NAME._next) \ ++ { \ ++ prefetch(item->LINK_NAME._next); \ ++ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \ ++ if (EQ_FUNC (& item->KEY_NAME, find_key)) \ ++ { \ ++ return item; \ ++ } \ ++ } \ ++ \ ++ return NULL; \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ ITEM_TYPE ** item = &hash->_table[hash_index]; \ ++ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ while (*item != NULL) { \ ++ prefetch(&(*item)->LINK_NAME._next); \ ++ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \ ++ ITEM_TYPE *found; \ ++ \ ++ found = *item; \ ++ *item = found->LINK_NAME._next; \ ++ found->LINK_NAME._next = hash->_table[hash_index]; \ ++ hash->_table[hash_index] = found; \ ++ return found; \ ++ } \ ++ item = &(*item)->LINK_NAME._next; \ ++ } \ ++ return NULL; \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_remove_index 
(PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ ITEM_TYPE *del_item) \ ++{ \ ++ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \ ++ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ while (*hash_item_p != NULL) { \ ++ prefetch(&(*hash_item_p)->LINK_NAME._next); \ ++ if (*hash_item_p == del_item) { \ ++ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \ ++ return 1; \ ++ } \ ++ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \ ++ } \ ++ return 0; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ ++ hash->_table[hash_index] = ins_item; \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \ ++ __u32 hash_index, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ PREFIX##_check_hash(hash, hash_index); \ ++ \ ++ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ ++ smp_wmb(); \ ++ hash->_table[hash_index] = ins_item; \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find (PREFIX##_hash_table *hash, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \ ++} \ ++ \ ++static __inline__ ITEM_TYPE* \ ++PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \ ++ KEY_TYPE const *find_key) \ ++{ \ ++ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_remove (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *del_item) \ ++{ \ ++ return PREFIX##_hash_remove_index (hash, \ ++ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \ ++} \ ++ \ ++static __inline__ int \ ++PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *del_item) \ ++{ \ ++ return PREFIX##_hash_remove (hash, del_item); \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ return PREFIX##_hash_insert_index (hash, \ ++ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \ ++} \ ++ \ ++static __inline__ void \ ++PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *ins_item) \ ++{ \ ++ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \ ++ ins_item); \ ++} \ ++ \ ++static __inline__ ITEM_TYPE * \ ++PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \ ++{ \ ++ ITEM_TYPE *first; \ ++ \ ++ for (first = NULL; ind < hash->_buckets; ++ ind) { \ ++ first = hash->_table[ind]; \ ++ if (first != NULL) \ ++ break; \ ++ } \ ++ return first; \ ++} \ ++ \ ++static __inline__ ITEM_TYPE * \ ++PREFIX##_hash_next (PREFIX##_hash_table *hash, \ ++ ITEM_TYPE *item) \ ++{ \ ++ ITEM_TYPE *next; \ ++ \ ++ if (item == NULL) \ ++ return NULL; \ ++ next = item->LINK_NAME._next; \ ++ if (next == NULL) \ ++ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \ ++ return next; \ ++} \ ++ \ ++typedef struct {} PREFIX##_hash_dummy ++ ++#define for_all_ht_buckets(table, head) \ ++for ((head) = &(table) -> _table[ 0 ] ; \ ++ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head)) ++ ++#define for_all_in_bucket(bucket, item, next, field) \ ++for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \ ++ (item) != NULL ; \ ++ (item) = (next), (next) = (item) ? 
(item) -> field._next : NULL ) ++ ++#define for_all_in_htable(table, prefix, item, next) \ ++for ((item) = prefix ## _hash_first ((table), 0), \ ++ (next) = prefix ## _hash_next ((table), (item)) ; \ ++ (item) != NULL ; \ ++ (item) = (next), \ ++ (next) = prefix ## _hash_next ((table), (item))) ++ ++/* __REISER4_TYPE_SAFE_HASH_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/vfs_ops.c linux-2.6.33/fs/reiser4/vfs_ops.c +--- linux-2.6.33.orig/fs/reiser4/vfs_ops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/vfs_ops.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,267 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined ++ here. */ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "coord.h" ++#include "plugin/item/item.h" ++#include "plugin/file/file.h" ++#include "plugin/security/perm.h" ++#include "plugin/disk_format/disk_format.h" ++#include "plugin/plugin.h" ++#include "plugin/plugin_set.h" ++#include "plugin/object.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "vfs_ops.h" ++#include "inode.h" ++#include "page_cache.h" ++#include "ktxnmgrd.h" ++#include "super.h" ++#include "reiser4.h" ++#include "entd.h" ++#include "status_flags.h" ++#include "flush.h" ++#include "dscale.h" ++ ++#include <linux/profile.h> ++#include <linux/types.h> ++#include <linux/mount.h> ++#include <linux/vfs.h> ++#include <linux/mm.h> ++#include <linux/buffer_head.h> ++#include <linux/dcache.h> ++#include <linux/list.h> ++#include <linux/pagemap.h> ++#include <linux/slab.h> ++#include <linux/seq_file.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/writeback.h> ++#include <linux/blkdev.h> ++#include <linux/quotaops.h> ++#include <linux/security.h> ++#include <linux/reboot.h> ++#include <linux/rcupdate.h> ++ ++/* update inode stat-data by calling plugin */ ++int reiser4_update_sd(struct inode *object) ++{ ++ file_plugin *fplug; ++ ++ assert("nikita-2338", object != NULL); ++ /* check for read-only file system. */ ++ if (IS_RDONLY(object)) ++ return 0; ++ ++ fplug = inode_file_plugin(object); ++ assert("nikita-2339", fplug != NULL); ++ return fplug->write_sd_by_inode(object); ++} ++ ++/* helper function: increase inode nlink count and call plugin method to save ++ updated stat-data. 
++
++ Used by link/create and during creation of dot and dotdot in mkdir
++*/
++int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
++ struct inode *parent /* parent where new entry will be */
++ ,
++ int write_sd_p /* true if stat-data has to be
++ * updated */ )
++{
++ file_plugin *fplug;
++ int result;
++
++ assert("nikita-1351", object != NULL);
++
++ fplug = inode_file_plugin(object);
++ assert("nikita-1445", fplug != NULL);
++
++ /* ask plugin whether it can add yet another link to this
++ object */
++ if (!fplug->can_add_link(object))
++ return RETERR(-EMLINK);
++
++ assert("nikita-2211", fplug->add_link != NULL);
++ /* call plugin to do actual addition of link */
++ result = fplug->add_link(object, parent);
++
++ /* optionally update stat data */
++ if (result == 0 && write_sd_p)
++ result = fplug->write_sd_by_inode(object);
++ return result;
++}
++
++/* helper function: decrease inode nlink count and call plugin method to save
++ updated stat-data.
++
++ Used by unlink/create
++*/
++int reiser4_del_nlink(struct inode *object /* object from which link is
++ * removed */ ,
++ struct inode *parent /* parent where entry was */ ,
++ int write_sd_p /* true if stat-data has to be
++ * updated */ )
++{
++ file_plugin *fplug;
++ int result;
++
++ assert("nikita-1349", object != NULL);
++
++ fplug = inode_file_plugin(object);
++ assert("nikita-1350", fplug != NULL);
++ assert("nikita-1446", object->i_nlink > 0);
++ assert("nikita-2210", fplug->rem_link != NULL);
++
++ /* call plugin to do actual deletion of link */
++ result = fplug->rem_link(object, parent);
++
++ /* optionally update stat data */
++ if (result == 0 && write_sd_p)
++ result = fplug->write_sd_by_inode(object);
++ return result;
++}
++
++/* Release reiser4 dentry. This is d_op->d_release() method. */
++static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
++{
++ reiser4_free_dentry_fsdata(dentry);
++}
++
++/*
++ * Called by reiser4_sync_inodes(), during speculative write-back (through
++ * pdflush, or balance_dirty_pages()).
++ */
++void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
++{
++ long written = 0;
++ int repeats = 0;
++ int result;
++ struct address_space *mapping;
++
++ /*
++ * Performs early flushing, trying to free some memory. If there is
++ * nothing to flush, commits some atoms.
++ */
++
++ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
++ sys_fsync(). */
++ if (wbc->sync_mode != WB_SYNC_NONE) {
++ txnmgr_force_commit_all(sb, 0);
++ return;
++ }
++
++ BUG_ON(reiser4_get_super_fake(sb) == NULL);
++ mapping = reiser4_get_super_fake(sb)->i_mapping;
++ do {
++ long nr_submitted = 0;
++ jnode *node = NULL;
++
++ /* do not pile more requests onto an overloaded write queue */
++ if (wbc->nonblocking &&
++ bdi_write_congested(mapping->backing_dev_info)) {
++ blk_run_address_space(mapping);
++ wbc->encountered_congestion = 1;
++ break;
++ }
++ repeats++;
++ BUG_ON(wbc->nr_to_write <= 0);
++
++ if (get_current_context()->entd) {
++ entd_context *ent = get_entd_context(sb);
++
++ if (ent->cur_request->node)
++ /*
++ * this is the entd thread and it managed to
++ * capture the requested page itself - start
++ * flush from that page
++ */
++ node = ent->cur_request->node;
++ }
++
++ result = flush_some_atom(node, &nr_submitted, wbc,
++ JNODE_FLUSH_WRITE_BLOCKS);
++ if (result != 0)
++ warning("nikita-31001", "Flush failed: %i", result);
++ if (node)
++ /* drop the reference acquired
++ in find_or_create_extent() */
++ jput(node);
++ if (!nr_submitted)
++ break;
++
++ wbc->nr_to_write -= nr_submitted;
++ written += nr_submitted;
++ } while (wbc->nr_to_write > 0);
++}
++
++/* tell VM how many pages were dirtied */
++void reiser4_throttle_write(struct inode *inode, int nrpages)
++{
++ reiser4_context *ctx;
++
++ ctx = get_current_context();
++ reiser4_txn_restart(ctx);
++ current->journal_info = NULL;
++ balance_dirty_pages_ratelimited_nr(inode->i_mapping, nrpages);
++ current->journal_info = ctx;
++}
++
++const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
++const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
++ * beginning of the device */
++
++/*
++ * Reiser4 initialization/shutdown.
++ *
++ * Code below performs global reiser4 initialization that is done either as
++ * part of kernel initialization (when reiser4 is statically built-in), or
++ * during reiser4 module load (when compiled as module).
++ */
++
++void reiser4_handle_error(void)
++{
++ struct super_block *sb = reiser4_get_current_sb();
++
++ if (!sb)
++ return;
++ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
++ "Filesystem error occurred");
++ switch (get_super_private(sb)->onerror) {
++ case 0:
++ reiser4_panic("foobar-42", "Filesystem error occurred\n");
++ case 1:
++ default:
++ if (sb->s_flags & MS_RDONLY)
++ return;
++ sb->s_flags |= MS_RDONLY;
++ break;
++ }
++}
++
++struct dentry_operations reiser4_dentry_operations = {
++ .d_revalidate = NULL,
++ .d_hash = NULL,
++ .d_compare = NULL,
++ .d_delete = NULL,
++ .d_release = reiser4_d_release,
++ .d_iput = NULL,
++};
++
++/* Make Linus happy.
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/vfs_ops.h linux-2.6.33/fs/reiser4/vfs_ops.h +--- linux-2.6.33.orig/fs/reiser4/vfs_ops.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/vfs_ops.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,53 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* vfs_ops.c's exported symbols */ ++ ++#if !defined( __FS_REISER4_VFS_OPS_H__ ) ++#define __FS_REISER4_VFS_OPS_H__ ++ ++#include "forward.h" ++#include "coord.h" ++#include "seal.h" ++#include "plugin/file/file.h" ++#include "super.h" ++#include "readahead.h" ++ ++#include <linux/types.h> /* for loff_t */ ++#include <linux/fs.h> /* for struct address_space */ ++#include <linux/dcache.h> /* for struct dentry */ ++#include <linux/mm.h> ++#include <linux/backing-dev.h> ++ ++/* address space operations */ ++int reiser4_writepage(struct page *, struct writeback_control *); ++int reiser4_set_page_dirty(struct page *); ++void reiser4_invalidatepage(struct page *, unsigned long offset); ++int reiser4_releasepage(struct page *, gfp_t); ++ ++extern int reiser4_update_sd(struct inode *); ++extern int reiser4_add_nlink(struct inode *, struct inode *, int); ++extern int reiser4_del_nlink(struct inode *, struct inode *, int); ++ ++extern int reiser4_start_up_io(struct page *page); ++extern void reiser4_throttle_write(struct inode *, int nrpages); ++extern int jnode_is_releasable(jnode *); ++ ++#define CAPTURE_APAGE_BURST (1024l) ++void reiser4_writeout(struct super_block *, struct writeback_control *); ++ ++extern void reiser4_handle_error(void); ++ ++/* __FS_REISER4_VFS_OPS_H__ */ ++#endif ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ scroll-step: 1 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/wander.c linux-2.6.33/fs/reiser4/wander.c +--- linux-2.6.33.orig/fs/reiser4/wander.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/wander.c 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,1798 @@ ++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Reiser4 Wandering Log */ ++ ++/* You should read http://www.namesys.com/txn-doc.html ++ ++ That describes how filesystem operations are performed as atomic ++ transactions, and how we try to arrange it so that we can write most of the ++ data only once while performing the operation atomically. ++ ++ For the purposes of this code, it is enough for it to understand that it ++ has been told a given block should be written either once, or twice (if ++ twice then once to the wandered location and once to the real location). ++ ++ This code guarantees that those blocks that are defined to be part of an ++ atom either all take effect or none of them take effect. ++ ++ The "relocate set" of nodes are submitted to write by the jnode_flush() ++ routine, and the "overwrite set" is submitted by reiser4_write_log(). ++ This is because with the overwrite set we seek to optimize writes, and ++ with the relocate set we seek to cause disk order to correlate with the ++ "parent first order" (preorder). 
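++
++   Concretely, for each captured block this works out to the following
++   (a rough sketch in pseudo-code, not code from this file):
++
++     if (block is in the relocate set)
++             write it once, directly to its final location;
++     else    /* block is in the overwrite set */
++             write it to a wandered location at commit time,
++             and to its real location when the atom is played;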
++
++ reiser4_write_log() allocates and writes wandered blocks and maintains
++ additional on-disk structures of the atom as wander records (each wander
++ record occupies one block) for storing the "wandered map" (a table which
++ contains a relation between wandered and real block numbers) and other
++ information which might be needed at transaction recovery time.
++
++ The wander records are unidirectionally linked into a circle: each wander
++ record contains a block number of the next wander record; the last wander
++ record points to the first one.
++
++ One wander record (named "tx head" in this file) has a format which is
++ different from the other wander records. The "tx head" has a reference to the
++ "tx head" block of the previously committed atom. Also, "tx head" contains
++ fs information (the free blocks counter, and the oid allocator state) which
++ is logged in a special way.
++
++ There are two journal control blocks, named journal header and journal
++ footer, which have fixed on-disk locations. The journal header has a
++ reference to the "tx head" block of the last committed atom. The journal
++ footer points to the "tx head" of the last flushed atom. The atom is
++ "played" when all blocks from its overwrite set are written to disk the
++ second time (i.e. written to their real locations).
++
++ NOTE: People who know reiserfs internals and its journal structure might be
++ confused by the terms journal footer and journal header. There is a table
++ with terms of similar semantics in reiserfs (reiser3) and reiser4:
++
++ REISER3 TERM        | REISER4 TERM          | DESCRIPTION
++ --------------------+-----------------------+----------------------------
++ commit record       | journal header        | atomic write of this record
++                     |                       | ends transaction commit
++ --------------------+-----------------------+----------------------------
++ journal header      | journal footer        | atomic write of this record
++                     |                       | ends post-commit writes.
++                     |                       | After it is successfully
++                     |                       | written, the journal blocks
++                     |                       | (in reiser3) or wandered
++                     |                       | blocks/records (in reiser4)
++                     |                       | are free for re-use.
++ --------------------+-----------------------+----------------------------
++
++ The atom commit process is the following:
++
++ 1. The overwrite set is taken from atom's clean list, and its size is
++ counted.
++
++ 2. The number of necessary wander records (including tx head) is calculated,
++ and the wander record blocks are allocated.
++
++ 3. Allocate wandered blocks and populate wander records with the wandered
++ map.
++
++ 4. Submit write requests for wander records and wandered blocks.
++
++ 5. Wait until submitted write requests complete.
++
++ 6. Update journal header: change the pointer to the block number of the just
++ written tx head, submit an i/o for the modified journal header block and
++ wait for i/o completion.
++
++ NOTE: The special logging for bitmap blocks and some reiser4 super block
++ fields makes the processes of atom commit, flush and recovery a bit more
++ complex (see comments in the source code for details).
++
++ The atom playing process is the following:
++
++ 1. Write atom's overwrite set in-place.
++
++ 2. Wait on i/o.
++
++ 3. Update journal footer: change the pointer to the block number of the tx
++ head block of the atom we are currently flushing, submit an i/o, wait on
++ i/o completion.
++
++ 4. Free disk space which was used for wandered blocks and wander records. 
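++
++   In code terms the playing sequence roughly corresponds to
++   write_tx_back() below, together with the dealloc helpers called from
++   reiser4_write_logs() (a simplified sketch; error and write-barrier
++   handling omitted):
++
++     write_jnode_list(ch->overwrite_set, fq, NULL,
++                      WRITEOUT_FOR_PAGE_RECLAIM);     /* steps 1 and 2 */
++     update_journal_footer(ch, barrier);              /* step 3 */
++     dealloc_tx_list(ch);                             /* step 4 */
++     dealloc_wmap(ch);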
++
++ After the freeing of wandered blocks and wander records the journal footer
++ points to an on-disk structure which might be overwritten soon. Neither the
++ log writer nor the journal recovery procedure uses that pointer to access
++ the data. When the journal recovery procedure finds the oldest transaction,
++ it compares the journal footer pointer value with the "prev_tx" pointer
++ value in the tx head; if the values are equal, the oldest unflushed
++ transaction has been found.
++
++ NOTE on disk space leakage: the information about which blocks and how many
++ were allocated for wandered blocks and wander records is not written to
++ disk because of the special logging for bitmaps and some super block
++ counters. After a system crash reiser4 does not remember those allocations,
++ so there is no disk space leakage of this kind.
++*/
++
++/* Special logging of reiser4 super block fields. */
++
++/* There are some reiser4 super block fields (free block count and OID allocator
++ state (number of files and next free OID)) which are logged separately from
++ the super block to avoid unnecessary atom fusion.
++
++ So the reiser4 super block need not be captured by a transaction that
++ allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
++ the reiser4 on-disk super block is not touched when such a transaction is
++ committed and flushed. Those "counters logged specially" are logged in "tx
++ head" blocks and in the journal footer block.
++
++ A step-by-step description of special logging:
++
++ 0. The per-atom information about deleted or created files and allocated or
++ freed blocks is collected during the transaction. The atom's
++ ->nr_objects_created and ->nr_objects_deleted are for object
++ deletion/creation tracking; the numbers of allocated and freed blocks are
++ calculated using atom's delete set and atom's capture list -- all new and
++ relocated nodes should be on atom's clean list and should have JNODE_RELOC
++ bit set.
++
++ 1. The "logged specially" reiser4 super block fields have their "committed"
++ versions in the reiser4 in-memory super block. They get modified only at
++ atom commit time. The atom's commit thread has exclusive access to those
++ "committed" fields because the log writer implementation supports only one
++ atom commit at a time (there is a per-fs "commit" mutex). At
++ that time "committed" counters are modified using per-atom information
++ collected during the transaction. These counters are stored on disk as
++ part of the tx head block when the atom is committed.
++
++ 2. When the atom is flushed the value of the free block counter and the OID
++ allocator state get written to the journal footer block. A special journal
++ procedure (journal_recover_sb_data()) takes those values from the journal
++ footer and updates the reiser4 in-memory super block.
++
++ NOTE: That means free block count and OID allocator state are logged
++ separately from the reiser4 super block regardless of the fact that the
++ reiser4 super block has fields to store both the free block counter and the
++ OID allocator.
++
++ Writing the whole super block at commit time requires knowing true values of
++ all its fields without changes made by not yet committed transactions. That
++ would be possible by keeping a "committed" version of the super block, just
++ as the reiser4 bitmap blocks have "committed" and "working" versions. 
However, ++ another scheme was implemented which stores special logged values in the ++ unused free space inside transaction head block. In my opinion it has an ++ advantage of not writing whole super block when only part of it was ++ modified. */ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "page_cache.h" ++#include "wander.h" ++#include "reiser4.h" ++#include "super.h" ++#include "vfs_ops.h" ++#include "writeout.h" ++#include "inode.h" ++#include "entd.h" ++ ++#include <linux/types.h> ++#include <linux/fs.h> /* for struct super_block */ ++#include <linux/mm.h> /* for struct page */ ++#include <linux/pagemap.h> ++#include <linux/bio.h> /* for struct bio */ ++#include <linux/blkdev.h> ++ ++static int write_jnodes_to_disk_extent( ++ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int); ++ ++/* The commit_handle is a container for objects needed at atom commit time */ ++struct commit_handle { ++ /* A pointer to atom's list of OVRWR nodes */ ++ struct list_head *overwrite_set; ++ /* atom's overwrite set size */ ++ int overwrite_set_size; ++ /* jnodes for wander record blocks */ ++ struct list_head tx_list; ++ /* number of wander records */ ++ __u32 tx_size; ++ /* 'committed' sb counters are saved here until atom is completely ++ flushed */ ++ __u64 free_blocks; ++ __u64 nr_files; ++ __u64 next_oid; ++ /* A pointer to the atom which is being committed */ ++ txn_atom *atom; ++ /* A pointer to current super block */ ++ struct super_block *super; ++ /* The counter of modified bitmaps */ ++ reiser4_block_nr nr_bitmap; ++}; ++ ++static void init_commit_handle(struct commit_handle *ch, txn_atom *atom) ++{ ++ memset(ch, 0, sizeof(struct commit_handle)); ++ INIT_LIST_HEAD(&ch->tx_list); ++ ++ ch->atom = atom; ++ ch->super = reiser4_get_current_sb(); ++} ++ ++static void done_commit_handle(struct commit_handle *ch) ++{ ++ assert("zam-690", list_empty(&ch->tx_list)); ++} ++ ++static inline int reiser4_use_write_barrier(struct super_block * s) ++{ ++ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER); ++} ++ ++static void disable_write_barrier(struct super_block * s) ++{ ++ notice("zam-1055", "%s does not support write barriers," ++ " using synchronous write instead.", s->s_id); ++ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags); ++} ++ ++/* fill journal header block data */ ++static void format_journal_header(struct commit_handle *ch) ++{ ++ struct reiser4_super_info_data *sbinfo; ++ struct journal_header *header; ++ jnode *txhead; ++ ++ sbinfo = get_super_private(ch->super); ++ assert("zam-479", sbinfo != NULL); ++ assert("zam-480", sbinfo->journal_header != NULL); ++ ++ txhead = list_entry(ch->tx_list.next, jnode, capture_link); ++ ++ jload(sbinfo->journal_header); ++ ++ header = (struct journal_header *)jdata(sbinfo->journal_header); ++ assert("zam-484", header != NULL); ++ ++ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)), ++ &header->last_committed_tx); ++ ++ jrelse(sbinfo->journal_header); ++} ++ ++/* fill journal footer block data */ ++static void format_journal_footer(struct commit_handle *ch) ++{ ++ struct reiser4_super_info_data *sbinfo; ++ struct journal_footer *footer; ++ jnode *tx_head; ++ ++ sbinfo = get_super_private(ch->super); ++ ++ tx_head = list_entry(ch->tx_list.next, jnode, capture_link); ++ ++ assert("zam-493", sbinfo != NULL); ++ assert("zam-494", sbinfo->journal_header != NULL); ++ ++ check_me("zam-691", jload(sbinfo->journal_footer) == 
0);
++
++ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
++ assert("zam-495", footer != NULL);
++
++ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
++ &footer->last_flushed_tx);
++ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
++
++ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
++ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
++
++ jrelse(sbinfo->journal_footer);
++}
++
++/* wander record capacity depends on current block size */
++static int wander_record_capacity(const struct super_block *super)
++{
++ return (super->s_blocksize -
++ sizeof(struct wander_record_header)) /
++ sizeof(struct wander_entry);
++}
++
++/* Fill the first wander record (tx head) in accordance with the supplied data */
++static void format_tx_head(struct commit_handle *ch)
++{
++ jnode *tx_head;
++ jnode *next;
++ struct tx_header *header;
++
++ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
++ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
++
++ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
++ if (&ch->tx_list == &next->capture_link)
++ next = tx_head;
++
++ header = (struct tx_header *)jdata(tx_head);
++
++ assert("zam-460", header != NULL);
++ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
++
++ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
++ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
++
++ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
++ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
++ &header->prev_tx);
++ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
++ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
++ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
++ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
++}
++
++/* prepare ordinary wander record block (fill all service fields) */
++static void
++format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
++{
++ struct wander_record_header *LRH;
++ jnode *next;
++
++ assert("zam-464", node != NULL);
++
++ LRH = (struct wander_record_header *)jdata(node);
++ next = list_entry(node->capture_link.next, jnode, capture_link);
++
++ if (&ch->tx_list == &next->capture_link)
++ next = list_entry(ch->tx_list.next, jnode, capture_link);
++
++ assert("zam-465", LRH != NULL);
++ assert("zam-463",
++ ch->super->s_blocksize > sizeof(struct wander_record_header));
++
++ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
++ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
++
++ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
++ put_unaligned(cpu_to_le32(serial), &LRH->serial);
++ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
++}
++
++/* add one wandered map entry to formatted wander record */
++static void
++store_entry(jnode * node, int index, const reiser4_block_nr * a,
++ const reiser4_block_nr * b)
++{
++ char *data;
++ struct wander_entry *pairs;
++
++ data = jdata(node);
++ assert("zam-451", data != NULL);
++
++ pairs =
++ (struct wander_entry *)(data + sizeof(struct wander_record_header));
++
++ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
++ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
++}
++
++/* currently, wander records contain only the wandered map, whose size depends
++ on the overwrite set size */
++static void get_tx_size(struct commit_handle *ch)
++{
++ assert("zam-440", 
ch->overwrite_set_size != 0); ++ assert("zam-695", ch->tx_size == 0); ++ ++ /* count all ordinary wander records ++ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one ++ for tx head block */ ++ ch->tx_size = ++ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) + ++ 2; ++} ++ ++/* A special structure for using in store_wmap_actor() for saving its state ++ between calls */ ++struct store_wmap_params { ++ jnode *cur; /* jnode of current wander record to fill */ ++ int idx; /* free element index in wander record */ ++ int capacity; /* capacity */ ++ ++#if REISER4_DEBUG ++ struct list_head *tx_list; ++#endif ++}; ++ ++/* an actor for use in blocknr_set_iterator routine which populates the list ++ of pre-formatted wander records by wandered map info */ ++static int ++store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, ++ const reiser4_block_nr * b, void *data) ++{ ++ struct store_wmap_params *params = data; ++ ++ if (params->idx >= params->capacity) { ++ /* a new wander record should be taken from the tx_list */ ++ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link); ++ assert("zam-454", ++ params->tx_list != ¶ms->cur->capture_link); ++ ++ params->idx = 0; ++ } ++ ++ store_entry(params->cur, params->idx, a, b); ++ params->idx++; ++ ++ return 0; ++} ++ ++/* This function is called after Relocate set gets written to disk, Overwrite ++ set is written to wandered locations and all wander records are written ++ also. Updated journal header blocks contains a pointer (block number) to ++ first wander record of the just written transaction */ ++static int update_journal_header(struct commit_handle *ch, int use_barrier) ++{ ++ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super); ++ jnode *jh = sbinfo->journal_header; ++ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link); ++ int ret; ++ ++ format_journal_header(ch); ++ ++ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL, ++ use_barrier ? WRITEOUT_BARRIER : 0); ++ if (ret) ++ return ret; ++ ++ /* blk_run_address_space(sbinfo->fake->i_mapping); ++ * blk_run_queues(); */ ++ ++ ret = jwait_io(jh, WRITE); ++ ++ if (ret) ++ return ret; ++ ++ sbinfo->last_committed_tx = *jnode_get_block(head); ++ ++ return 0; ++} ++ ++/* This function is called after write-back is finished. We update journal ++ footer block and free blocks which were occupied by wandered blocks and ++ transaction wander records */ ++static int update_journal_footer(struct commit_handle *ch, int use_barrier) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(ch->super); ++ ++ jnode *jf = sbinfo->journal_footer; ++ ++ int ret; ++ ++ format_journal_footer(ch); ++ ++ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL, ++ use_barrier ? 
WRITEOUT_BARRIER : 0); ++ if (ret) ++ return ret; ++ ++ /* blk_run_address_space(sbinfo->fake->i_mapping); ++ * blk_run_queue(); */ ++ ++ ret = jwait_io(jf, WRITE); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++/* free block numbers of wander records of already written in place transaction */ ++static void dealloc_tx_list(struct commit_handle *ch) ++{ ++ while (!list_empty(&ch->tx_list)) { ++ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link); ++ list_del(&cur->capture_link); ++ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link)); ++ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED, ++ BA_FORMATTED); ++ ++ unpin_jnode_data(cur); ++ reiser4_drop_io_head(cur); ++ } ++} ++ ++/* An actor for use in block_nr_iterator() routine which frees wandered blocks ++ from atom's overwrite set. */ ++static int ++dealloc_wmap_actor(txn_atom * atom UNUSED_ARG, ++ const reiser4_block_nr * a UNUSED_ARG, ++ const reiser4_block_nr * b, void *data UNUSED_ARG) ++{ ++ ++ assert("zam-499", b != NULL); ++ assert("zam-500", *b != 0); ++ assert("zam-501", !reiser4_blocknr_is_fake(b)); ++ ++ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED); ++ return 0; ++} ++ ++/* free wandered block locations of already written in place transaction */ ++static void dealloc_wmap(struct commit_handle *ch) ++{ ++ assert("zam-696", ch->atom != NULL); ++ ++ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map, ++ dealloc_wmap_actor, NULL, 1); ++} ++ ++/* helper function for alloc wandered blocks, which refill set of block ++ numbers needed for wandered blocks */ ++static int ++get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len) ++{ ++ reiser4_blocknr_hint hint; ++ int ret; ++ ++ reiser4_block_nr wide_len = count; ++ ++ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks ++ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed ++ reserved allocation area so as to get the best qualities of fixed ++ journals? */ ++ reiser4_blocknr_hint_init(&hint); ++ hint.block_stage = BLOCK_GRABBED; ++ ++ ret = reiser4_alloc_blocks(&hint, start, &wide_len, ++ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START); ++ *len = (int)wide_len; ++ ++ return ret; ++} ++ ++/* ++ * roll back changes made before issuing BIO in the case of IO error. ++ */ ++static void undo_bio(struct bio *bio) ++{ ++ int i; ++ ++ for (i = 0; i < bio->bi_vcnt; ++i) { ++ struct page *pg; ++ jnode *node; ++ ++ pg = bio->bi_io_vec[i].bv_page; ++ end_page_writeback(pg); ++ node = jprivate(pg); ++ spin_lock_jnode(node); ++ JF_CLR(node, JNODE_WRITEBACK); ++ JF_SET(node, JNODE_DIRTY); ++ spin_unlock_jnode(node); ++ } ++ bio_put(bio); ++} ++ ++/* put overwrite set back to atom's clean list */ ++static void put_overwrite_set(struct commit_handle *ch) ++{ ++ jnode *cur; ++ ++ list_for_each_entry(cur, ch->overwrite_set, capture_link) ++ jrelse_tail(cur); ++} ++ ++/* Count overwrite set size, grab disk space for wandered blocks allocation. ++ Since we have a separate list for atom's overwrite set we just scan the list, ++ count bitmap and other not leaf nodes which wandered blocks allocation we ++ have to grab space for. 
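++
++   Roughly, the function below amounts to (a simplified sketch; the
++   fake-znode replacement and debug-only counters are omitted):
++
++     for each jnode on the atom's overwrite list:
++             if (!jnode_is_leaf(cur))
++                     nr_not_leaves++;    /* needs freshly grabbed space */
++             else
++                     JF_CLR(cur, JNODE_FLUSH_RESERVED);
++     reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
++     flush_reserved2grabbed(atom, atom->flush_reserved);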
*/ ++static int get_overwrite_set(struct commit_handle *ch) ++{ ++ int ret; ++ jnode *cur; ++ __u64 nr_not_leaves = 0; ++#if REISER4_DEBUG ++ __u64 nr_formatted_leaves = 0; ++ __u64 nr_unformatted_leaves = 0; ++#endif ++ ++ assert("zam-697", ch->overwrite_set_size == 0); ++ ++ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom); ++ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); ++ ++ while (ch->overwrite_set != &cur->capture_link) { ++ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); ++ ++ /* Count bitmap locks for getting correct statistics what number ++ * of blocks were cleared by the transaction commit. */ ++ if (jnode_get_type(cur) == JNODE_BITMAP) ++ ch->nr_bitmap++; ++ ++ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) ++ || jnode_get_type(cur) == JNODE_BITMAP); ++ ++ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) { ++ /* we replace fake znode by another (real) ++ znode which is suggested by disk_layout ++ plugin */ ++ ++ /* FIXME: it looks like fake znode should be ++ replaced by jnode supplied by ++ disk_layout. */ ++ ++ struct super_block *s = reiser4_get_current_sb(); ++ reiser4_super_info_data *sbinfo = ++ get_current_super_private(); ++ ++ if (sbinfo->df_plug->log_super) { ++ jnode *sj = sbinfo->df_plug->log_super(s); ++ ++ assert("zam-593", sj != NULL); ++ ++ if (IS_ERR(sj)) ++ return PTR_ERR(sj); ++ ++ spin_lock_jnode(sj); ++ JF_SET(sj, JNODE_OVRWR); ++ insert_into_atom_ovrwr_list(ch->atom, sj); ++ spin_unlock_jnode(sj); ++ ++ /* jload it as the rest of overwrite set */ ++ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0); ++ ++ ch->overwrite_set_size++; ++ } ++ spin_lock_jnode(cur); ++ reiser4_uncapture_block(cur); ++ jput(cur); ++ ++ } else { ++ int ret; ++ ch->overwrite_set_size++; ++ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0); ++ if (ret) ++ reiser4_panic("zam-783", ++ "cannot load e-flushed jnode back (ret = %d)\n", ++ ret); ++ } ++ ++ /* Count not leaves here because we have to grab disk space ++ * for wandered blocks. They were not counted as "flush ++ * reserved". Counting should be done _after_ nodes are pinned ++ * into memory by jload(). */ ++ if (!jnode_is_leaf(cur)) ++ nr_not_leaves++; ++ else { ++#if REISER4_DEBUG ++ /* at this point @cur either has JNODE_FLUSH_RESERVED ++ * or is eflushed. Locking is not strong enough to ++ * write an assertion checking for this. */ ++ if (jnode_is_znode(cur)) ++ nr_formatted_leaves++; ++ else ++ nr_unformatted_leaves++; ++#endif ++ JF_CLR(cur, JNODE_FLUSH_RESERVED); ++ } ++ ++ cur = next; ++ } ++ ++ /* Grab space for writing (wandered blocks) of not leaves found in ++ * overwrite set. */ ++ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED); ++ if (ret) ++ return ret; ++ ++ /* Disk space for allocation of wandered blocks of leaf nodes already ++ * reserved as "flush reserved", move it to grabbed space counter. */ ++ spin_lock_atom(ch->atom); ++ assert("zam-940", ++ nr_formatted_leaves + nr_unformatted_leaves <= ++ ch->atom->flush_reserved); ++ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved); ++ spin_unlock_atom(ch->atom); ++ ++ return ch->overwrite_set_size; ++} ++ ++/** ++ * write_jnodes_to_disk_extent - submit write request ++ * @head: ++ * @first: first jnode of the list ++ * @nr: number of jnodes on the list ++ * @block_p: ++ * @fq: ++ * @flags: used to decide whether page is to get PG_reclaim flag ++ * ++ * Submits a write request for @nr jnodes beginning from the @first, other ++ * jnodes are after the @first on the double-linked "capture" list. 
All jnodes
++ * will be written to the disk region of @nr blocks starting with @block_p block
++ * number. If @fq is not NULL it means that waiting for i/o completion will be
++ * done more efficiently by using flush_queue_t objects.
++ * This function writes a list of jnodes in batch mode. It does all the
++ * low-level work such as bio construction and page state manipulation.
++ *
++ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
++ * aggregated in this function instead of being left to the layers below
++ *
++ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
++ * Why is that layer needed? Why can BIOs not be constructed here?
++ */
++static int write_jnodes_to_disk_extent(
++ jnode *first, int nr, const reiser4_block_nr *block_p,
++ flush_queue_t *fq, int flags)
++{
++ struct super_block *super = reiser4_get_current_sb();
++ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
++ int max_blocks;
++ jnode *cur = first;
++ reiser4_block_nr block;
++
++ assert("zam-571", first != NULL);
++ assert("zam-572", block_p != NULL);
++ assert("zam-570", nr > 0);
++
++ block = *block_p;
++ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
++
++ while (nr > 0) {
++ struct bio *bio;
++ int nr_blocks = min(nr, max_blocks);
++ int i;
++ int nr_used;
++
++ bio = bio_alloc(GFP_NOIO, nr_blocks);
++ if (!bio)
++ return RETERR(-ENOMEM);
++
++ bio->bi_bdev = super->s_bdev;
++ bio->bi_sector = block * (super->s_blocksize >> 9);
++ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
++ struct page *pg;
++
++ pg = jnode_page(cur);
++ assert("zam-573", pg != NULL);
++
++ page_cache_get(pg);
++
++ lock_and_wait_page_writeback(pg);
++
++ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
++ /*
++ * underlying device is saturated. Stop adding
++ * pages to the bio.
++ */
++ unlock_page(pg);
++ page_cache_release(pg);
++ break;
++ }
++
++ spin_lock_jnode(cur);
++ assert("nikita-3166",
++ pg->mapping == jnode_get_mapping(cur));
++ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
++#if REISER4_DEBUG
++ spin_lock(&cur->load);
++ assert("nikita-3165", !jnode_is_releasable(cur));
++ spin_unlock(&cur->load);
++#endif
++ JF_SET(cur, JNODE_WRITEBACK);
++ JF_CLR(cur, JNODE_DIRTY);
++ ON_DEBUG(cur->written++);
++ spin_unlock_jnode(cur);
++
++ ClearPageError(pg);
++ set_page_writeback(pg);
++
++ if (get_current_context()->entd) {
++ /* this is the entd thread */
++ entd_context *ent = get_entd_context(super);
++ struct wbq *rq, *next;
++
++ spin_lock(&ent->guard);
++
++ if (pg == ent->cur_request->page) {
++ /*
++ * entd is called for this page. This
++ * request is not in the todo list
++ */
++ ent->cur_request->written = 1;
++ } else {
++ /*
++ * if we have written a page for which writepage
++ * was called, move the request to another list. 
++ */ ++ list_for_each_entry_safe(rq, next, &ent->todo_list, link) { ++ assert("", rq->magic == WBQ_MAGIC); ++ if (pg == rq->page) { ++ /* ++ * remove request from ++ * entd's queue, but do ++ * not wake up a thread ++ * which put this ++ * request ++ */ ++ list_del_init(&rq->link); ++ ent->nr_todo_reqs --; ++ list_add_tail(&rq->link, &ent->done_list); ++ ent->nr_done_reqs ++; ++ rq->written = 1; ++ break; ++ } ++ } ++ } ++ spin_unlock(&ent->guard); ++ } ++ ++ clear_page_dirty_for_io(pg); ++ ++ unlock_page(pg); ++ ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ nr_used++; ++ } ++ if (nr_used > 0) { ++ assert("nikita-3453", ++ bio->bi_size == super->s_blocksize * nr_used); ++ assert("nikita-3454", bio->bi_vcnt == nr_used); ++ ++ /* Check if we are allowed to write at all */ ++ if (super->s_flags & MS_RDONLY) ++ undo_bio(bio); ++ else { ++ int not_supported; ++ ++ add_fq_to_bio(fq, bio); ++ bio_get(bio); ++ reiser4_submit_bio(write_op, bio); ++ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP); ++ bio_put(bio); ++ if (not_supported) ++ return -EOPNOTSUPP; ++ } ++ ++ block += nr_used - 1; ++ update_blocknr_hint_default(super, &block); ++ block += 1; ++ } else { ++ bio_put(bio); ++ } ++ nr -= nr_used; ++ } ++ ++ return 0; ++} ++ ++/* This is a procedure which recovers a contiguous sequences of disk block ++ numbers in the given list of j-nodes and submits write requests on this ++ per-sequence basis */ ++int ++write_jnode_list(struct list_head *head, flush_queue_t *fq, ++ long *nr_submitted, int flags) ++{ ++ int ret; ++ jnode *beg = list_entry(head->next, jnode, capture_link); ++ ++ while (head != &beg->capture_link) { ++ int nr = 1; ++ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link); ++ ++ while (head != &cur->capture_link) { ++ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr) ++ break; ++ ++nr; ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ ++ ret = write_jnodes_to_disk_extent( ++ beg, nr, jnode_get_block(beg), fq, flags); ++ if (ret) ++ return ret; ++ ++ if (nr_submitted) ++ *nr_submitted += nr; ++ ++ beg = cur; ++ } ++ ++ return 0; ++} ++ ++/* add given wandered mapping to atom's wandered map */ ++static int ++add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p) ++{ ++ int ret; ++ blocknr_set_entry *new_bsep = NULL; ++ reiser4_block_nr block; ++ ++ txn_atom *atom; ++ ++ assert("zam-568", block_p != NULL); ++ block = *block_p; ++ assert("zam-569", len > 0); ++ ++ while ((len--) > 0) { ++ do { ++ atom = get_current_atom_locked(); ++ assert("zam-536", ++ !reiser4_blocknr_is_fake(jnode_get_block(cur))); ++ ret = ++ blocknr_set_add_pair(atom, &atom->wandered_map, ++ &new_bsep, ++ jnode_get_block(cur), &block); ++ } while (ret == -E_REPEAT); ++ ++ if (ret) { ++ /* deallocate blocks which were not added to wandered ++ map */ ++ reiser4_block_nr wide_len = len; ++ ++ reiser4_dealloc_blocks(&block, &wide_len, ++ BLOCK_NOT_COUNTED, ++ BA_FORMATTED ++ /* formatted, without defer */ ); ++ ++ return ret; ++ } ++ ++ spin_unlock_atom(atom); ++ ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ ++block; ++ } ++ ++ return 0; ++} ++ ++/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately ++ submit IO for allocated blocks. We assume that current atom is in a stage ++ when any atom fusion is impossible and atom is unlocked and it is safe. 
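++
++   In outline (a simplified sketch of the function below; error handling
++   omitted):
++
++     rest = ch->overwrite_set_size;
++     while (rest > 0) {
++             get_more_wandered_blocks(rest, &block, &len);
++             add_region_to_wmap(cur, len, &block);
++             write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
++             rest -= len;
++             advance cur by len jnodes along the overwrite list;
++     }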
*/ ++static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq) ++{ ++ reiser4_block_nr block; ++ ++ int rest; ++ int len; ++ int ret; ++ ++ jnode *cur; ++ ++ assert("zam-534", ch->overwrite_set_size > 0); ++ ++ rest = ch->overwrite_set_size; ++ ++ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); ++ while (ch->overwrite_set != &cur->capture_link) { ++ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR)); ++ ++ ret = get_more_wandered_blocks(rest, &block, &len); ++ if (ret) ++ return ret; ++ ++ rest -= len; ++ ++ ret = add_region_to_wmap(cur, len, &block); ++ if (ret) ++ return ret; ++ ++ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0); ++ if (ret) ++ return ret; ++ ++ while ((len--) > 0) { ++ assert("zam-604", ++ ch->overwrite_set != &cur->capture_link); ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ } ++ ++ return 0; ++} ++ ++/* allocate given number of nodes over the journal area and link them into a ++ list, return pointer to the first jnode in the list */ ++static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq) ++{ ++ reiser4_blocknr_hint hint; ++ reiser4_block_nr allocated = 0; ++ reiser4_block_nr first, len; ++ jnode *cur; ++ jnode *txhead; ++ int ret; ++ reiser4_context *ctx; ++ reiser4_super_info_data *sbinfo; ++ ++ assert("zam-698", ch->tx_size > 0); ++ assert("zam-699", list_empty_careful(&ch->tx_list)); ++ ++ ctx = get_current_context(); ++ sbinfo = get_super_private(ctx->super); ++ ++ while (allocated < (unsigned)ch->tx_size) { ++ len = (ch->tx_size - allocated); ++ ++ reiser4_blocknr_hint_init(&hint); ++ ++ hint.block_stage = BLOCK_GRABBED; ++ ++ /* FIXME: there should be some block allocation policy for ++ nodes which contain wander records */ ++ ++ /* We assume that disk space for wandered record blocks can be ++ * taken from reserved area. 
*/ ++ ret = reiser4_alloc_blocks(&hint, &first, &len, ++ BA_FORMATTED | BA_RESERVED | ++ BA_USE_DEFAULT_SEARCH_START); ++ reiser4_blocknr_hint_done(&hint); ++ ++ if (ret) ++ return ret; ++ ++ allocated += len; ++ ++ /* create jnodes for all wander records */ ++ while (len--) { ++ cur = reiser4_alloc_io_head(&first); ++ ++ if (cur == NULL) { ++ ret = RETERR(-ENOMEM); ++ goto free_not_assigned; ++ } ++ ++ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get()); ++ ++ if (ret != 0) { ++ jfree(cur); ++ goto free_not_assigned; ++ } ++ ++ pin_jnode_data(cur); ++ ++ list_add_tail(&cur->capture_link, &ch->tx_list); ++ ++ first++; ++ } ++ } ++ ++ { /* format a on-disk linked list of wander records */ ++ int serial = 1; ++ ++ txhead = list_entry(ch->tx_list.next, jnode, capture_link); ++ format_tx_head(ch); ++ ++ cur = list_entry(txhead->capture_link.next, jnode, capture_link); ++ while (&ch->tx_list != &cur->capture_link) { ++ format_wander_record(ch, cur, serial++); ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ } ++ ++ { /* Fill wander records with Wandered Set */ ++ struct store_wmap_params params; ++ txn_atom *atom; ++ ++ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link); ++ ++ params.idx = 0; ++ params.capacity = ++ wander_record_capacity(reiser4_get_current_sb()); ++ ++ atom = get_current_atom_locked(); ++ blocknr_set_iterator(atom, &atom->wandered_map, ++ &store_wmap_actor, ¶ms, 0); ++ spin_unlock_atom(atom); ++ } ++ ++ { /* relse all jnodes from tx_list */ ++ cur = list_entry(ch->tx_list.next, jnode, capture_link); ++ while (&ch->tx_list != &cur->capture_link) { ++ jrelse(cur); ++ cur = list_entry(cur->capture_link.next, jnode, capture_link); ++ } ++ } ++ ++ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0); ++ ++ return ret; ++ ++ free_not_assigned: ++ /* We deallocate blocks not yet assigned to jnodes on tx_list. The ++ caller takes care about invalidating of tx list */ ++ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED); ++ ++ return ret; ++} ++ ++static int commit_tx(struct commit_handle *ch) ++{ ++ flush_queue_t *fq; ++ int barrier; ++ int ret; ++ ++ /* Grab more space for wandered records. 
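++
++	   After grabbing, the sequence below is roughly (a sketch; the
++	   flush-queue handling and error paths are simplified):
++
++	     alloc_wandered_blocks(ch, fq);  /* overwrite set -> wandered */
++	     alloc_tx(ch, fq);               /* build and write wander records */
++	     update_journal_header(ch, barrier);
++	     /* a failed barrier write (-EOPNOTSUPP) disables barriers and
++	        retries without them */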
*/ ++ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED); ++ if (ret) ++ return ret; ++ ++ fq = get_fq_for_current_atom(); ++ if (IS_ERR(fq)) ++ return PTR_ERR(fq); ++ ++ spin_unlock_atom(fq->atom); ++ do { ++ ret = alloc_wandered_blocks(ch, fq); ++ if (ret) ++ break; ++ ret = alloc_tx(ch, fq); ++ if (ret) ++ break; ++ } while (0); ++ ++ reiser4_fq_put(fq); ++ if (ret) ++ return ret; ++ repeat_wo_barrier: ++ barrier = reiser4_use_write_barrier(ch->super); ++ if (!barrier) { ++ ret = current_atom_finish_all_fq(); ++ if (ret) ++ return ret; ++ } ++ ret = update_journal_header(ch, barrier); ++ if (barrier) { ++ if (ret) { ++ if (ret == -EOPNOTSUPP) { ++ disable_write_barrier(ch->super); ++ goto repeat_wo_barrier; ++ } ++ return ret; ++ } ++ ret = current_atom_finish_all_fq(); ++ } ++ return ret; ++} ++ ++static int write_tx_back(struct commit_handle * ch) ++{ ++ flush_queue_t *fq; ++ int ret; ++ int barrier; ++ ++ reiser4_post_commit_hook(); ++ fq = get_fq_for_current_atom(); ++ if (IS_ERR(fq)) ++ return PTR_ERR(fq); ++ spin_unlock_atom(fq->atom); ++ ret = write_jnode_list( ++ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM); ++ reiser4_fq_put(fq); ++ if (ret) ++ return ret; ++ repeat_wo_barrier: ++ barrier = reiser4_use_write_barrier(ch->super); ++ if (!barrier) { ++ ret = current_atom_finish_all_fq(); ++ if (ret) ++ return ret; ++ } ++ ret = update_journal_footer(ch, barrier); ++ if (barrier) { ++ if (ret) { ++ if (ret == -EOPNOTSUPP) { ++ disable_write_barrier(ch->super); ++ goto repeat_wo_barrier; ++ } ++ return ret; ++ } ++ ret = current_atom_finish_all_fq(); ++ } ++ if (ret) ++ return ret; ++ reiser4_post_write_back_hook(); ++ return 0; ++} ++ ++/* We assume that at this moment all captured blocks are marked as RELOC or ++ WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set ++ are submitted to write. ++*/ ++ ++int reiser4_write_logs(long *nr_submitted) ++{ ++ txn_atom *atom; ++ struct super_block *super = reiser4_get_current_sb(); ++ reiser4_super_info_data *sbinfo = get_super_private(super); ++ struct commit_handle ch; ++ int ret; ++ ++ writeout_mode_enable(); ++ ++ /* block allocator may add j-nodes to the clean_list */ ++ ret = reiser4_pre_commit_hook(); ++ if (ret) ++ return ret; ++ ++ /* No locks are required if we take atom which stage >= ++ * ASTAGE_PRE_COMMIT */ ++ atom = get_current_context()->trans->atom; ++ assert("zam-965", atom != NULL); ++ ++ /* relocate set is on the atom->clean_nodes list after ++ * current_atom_complete_writes() finishes. It can be safely ++ * uncaptured after commit_mutex is locked, because any atom that ++ * captures these nodes is guaranteed to commit after current one. ++ * ++ * This can only be done after reiser4_pre_commit_hook(), because it is where ++ * early flushed jnodes with CREATED bit are transferred to the ++ * overwrite list. */ ++ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom)); ++ spin_lock_atom(atom); ++ /* There might be waiters for the relocate nodes which we have ++ * released, wake them up. 
*/ ++ reiser4_atom_send_event(atom); ++ spin_unlock_atom(atom); ++ ++ if (REISER4_DEBUG) { ++ int level; ++ ++ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level) ++ assert("nikita-3352", ++ list_empty_careful(ATOM_DIRTY_LIST(atom, level))); ++ } ++ ++ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created; ++ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted; ++ ++ init_commit_handle(&ch, atom); ++ ++ ch.free_blocks = sbinfo->blocks_free_committed; ++ ch.nr_files = sbinfo->nr_files_committed; ++ /* ZAM-FIXME-HANS: email me what the contention level is for the super ++ * lock. */ ++ ch.next_oid = oid_next(super); ++ ++ /* count overwrite set and place it in a separate list */ ++ ret = get_overwrite_set(&ch); ++ ++ if (ret <= 0) { ++ /* It is possible that overwrite set is empty here, it means ++ all captured nodes are clean */ ++ goto up_and_ret; ++ } ++ ++ /* Inform the caller about what number of dirty pages will be ++ * submitted to disk. */ ++ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap; ++ ++ /* count all records needed for storing of the wandered set */ ++ get_tx_size(&ch); ++ ++ ret = commit_tx(&ch); ++ if (ret) ++ goto up_and_ret; ++ ++ spin_lock_atom(atom); ++ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT); ++ spin_unlock_atom(atom); ++ ++ ret = write_tx_back(&ch); ++ reiser4_post_write_back_hook(); ++ ++ up_and_ret: ++ if (ret) { ++ /* there could be fq attached to current atom; the only way to ++ remove them is: */ ++ current_atom_finish_all_fq(); ++ } ++ ++ /* free blocks of flushed transaction */ ++ dealloc_tx_list(&ch); ++ dealloc_wmap(&ch); ++ ++ put_overwrite_set(&ch); ++ ++ done_commit_handle(&ch); ++ ++ writeout_mode_disable(); ++ ++ return ret; ++} ++ ++/* consistency checks for journal data/control blocks: header, footer, log ++ records, transactions head blocks. All functions return zero on success. */ ++ ++static int check_journal_header(const jnode * node UNUSED_ARG) ++{ ++ /* FIXME: journal header has no magic field yet. */ ++ return 0; ++} ++ ++/* wait for write completion for all jnodes from given list */ ++static int wait_on_jnode_list(struct list_head *head) ++{ ++ jnode *scan; ++ int ret = 0; ++ ++ list_for_each_entry(scan, head, capture_link) { ++ struct page *pg = jnode_page(scan); ++ ++ if (pg) { ++ if (PageWriteback(pg)) ++ wait_on_page_writeback(pg); ++ ++ if (PageError(pg)) ++ ret++; ++ } ++ } ++ ++ return ret; ++} ++ ++static int check_journal_footer(const jnode * node UNUSED_ARG) ++{ ++ /* FIXME: journal footer has no magic field yet. 
*/ ++ return 0; ++} ++ ++static int check_tx_head(const jnode * node) ++{ ++ struct tx_header *header = (struct tx_header *)jdata(node); ++ ++ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) { ++ warning("zam-627", "tx head at block %s corrupted\n", ++ sprint_address(jnode_get_block(node))); ++ return RETERR(-EIO); ++ } ++ ++ return 0; ++} ++ ++static int check_wander_record(const jnode * node) ++{ ++ struct wander_record_header *RH = ++ (struct wander_record_header *)jdata(node); ++ ++ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) != ++ 0) { ++ warning("zam-628", "wander record at block %s corrupted\n", ++ sprint_address(jnode_get_block(node))); ++ return RETERR(-EIO); ++ } ++ ++ return 0; ++} ++ ++/* fill commit_handler structure by everything what is needed for update_journal_footer */ ++static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head) ++{ ++ struct tx_header *TXH; ++ int ret; ++ ++ ret = jload(tx_head); ++ if (ret) ++ return ret; ++ ++ TXH = (struct tx_header *)jdata(tx_head); ++ ++ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks)); ++ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files)); ++ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid)); ++ ++ jrelse(tx_head); ++ ++ list_add(&tx_head->capture_link, &ch->tx_list); ++ ++ return 0; ++} ++ ++/* replay one transaction: restore and write overwrite set in place */ ++static int replay_transaction(const struct super_block *s, ++ jnode * tx_head, ++ const reiser4_block_nr * log_rec_block_p, ++ const reiser4_block_nr * end_block, ++ unsigned int nr_wander_records) ++{ ++ reiser4_block_nr log_rec_block = *log_rec_block_p; ++ struct commit_handle ch; ++ LIST_HEAD(overwrite_set); ++ jnode *log; ++ int ret; ++ ++ init_commit_handle(&ch, NULL); ++ ch.overwrite_set = &overwrite_set; ++ ++ restore_commit_handle(&ch, tx_head); ++ ++ while (log_rec_block != *end_block) { ++ struct wander_record_header *header; ++ struct wander_entry *entry; ++ ++ int i; ++ ++ if (nr_wander_records == 0) { ++ warning("zam-631", ++ "number of wander records in the linked list" ++ " greater than number stored in tx head.\n"); ++ ret = RETERR(-EIO); ++ goto free_ow_set; ++ } ++ ++ log = reiser4_alloc_io_head(&log_rec_block); ++ if (log == NULL) ++ return RETERR(-ENOMEM); ++ ++ ret = jload(log); ++ if (ret < 0) { ++ reiser4_drop_io_head(log); ++ return ret; ++ } ++ ++ ret = check_wander_record(log); ++ if (ret) { ++ jrelse(log); ++ reiser4_drop_io_head(log); ++ return ret; ++ } ++ ++ header = (struct wander_record_header *)jdata(log); ++ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block)); ++ ++ entry = (struct wander_entry *)(header + 1); ++ ++ /* restore overwrite set from wander record content */ ++ for (i = 0; i < wander_record_capacity(s); i++) { ++ reiser4_block_nr block; ++ jnode *node; ++ ++ block = le64_to_cpu(get_unaligned(&entry->wandered)); ++ if (block == 0) ++ break; ++ ++ node = reiser4_alloc_io_head(&block); ++ if (node == NULL) { ++ ret = RETERR(-ENOMEM); ++ /* ++ * FIXME-VS:??? ++ */ ++ jrelse(log); ++ reiser4_drop_io_head(log); ++ goto free_ow_set; ++ } ++ ++ ret = jload(node); ++ ++ if (ret < 0) { ++ reiser4_drop_io_head(node); ++ /* ++ * FIXME-VS:??? 
++ */ ++ jrelse(log); ++ reiser4_drop_io_head(log); ++ goto free_ow_set; ++ } ++ ++ block = le64_to_cpu(get_unaligned(&entry->original)); ++ ++ assert("zam-603", block != 0); ++ ++ jnode_set_block(node, &block); ++ ++ list_add_tail(&node->capture_link, ch.overwrite_set); ++ ++ ++entry; ++ } ++ ++ jrelse(log); ++ reiser4_drop_io_head(log); ++ ++ --nr_wander_records; ++ } ++ ++ if (nr_wander_records != 0) { ++ warning("zam-632", "number of wander records in the linked list" ++ " less than number stored in tx head.\n"); ++ ret = RETERR(-EIO); ++ goto free_ow_set; ++ } ++ ++ { /* write wandered set in place */ ++ write_jnode_list(ch.overwrite_set, NULL, NULL, 0); ++ ret = wait_on_jnode_list(ch.overwrite_set); ++ ++ if (ret) { ++ ret = RETERR(-EIO); ++ goto free_ow_set; ++ } ++ } ++ ++ ret = update_journal_footer(&ch, 0); ++ ++ free_ow_set: ++ ++ while (!list_empty(ch.overwrite_set)) { ++ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link); ++ list_del_init(&cur->capture_link); ++ jrelse(cur); ++ reiser4_drop_io_head(cur); ++ } ++ ++ list_del_init(&tx_head->capture_link); ++ ++ done_commit_handle(&ch); ++ ++ return ret; ++} ++ ++/* find oldest committed and not played transaction and play it. The transaction ++ * was committed and journal header block was updated but the blocks from the ++ * process of writing the atom's overwrite set in-place and updating of journal ++ * footer block were not completed. This function completes the process by ++ * recovering the atom's overwrite set from their wandered locations and writes ++ * them in-place and updating the journal footer. */ ++static int replay_oldest_transaction(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ jnode *jf = sbinfo->journal_footer; ++ unsigned int total; ++ struct journal_footer *F; ++ struct tx_header *T; ++ ++ reiser4_block_nr prev_tx; ++ reiser4_block_nr last_flushed_tx; ++ reiser4_block_nr log_rec_block = 0; ++ ++ jnode *tx_head; ++ ++ int ret; ++ ++ if ((ret = jload(jf)) < 0) ++ return ret; ++ ++ F = (struct journal_footer *)jdata(jf); ++ ++ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx)); ++ ++ jrelse(jf); ++ ++ if (sbinfo->last_committed_tx == last_flushed_tx) { ++ /* all transactions are replayed */ ++ return 0; ++ } ++ ++ prev_tx = sbinfo->last_committed_tx; ++ ++ /* searching for oldest not flushed transaction */ ++ while (1) { ++ tx_head = reiser4_alloc_io_head(&prev_tx); ++ if (!tx_head) ++ return RETERR(-ENOMEM); ++ ++ ret = jload(tx_head); ++ if (ret < 0) { ++ reiser4_drop_io_head(tx_head); ++ return ret; ++ } ++ ++ ret = check_tx_head(tx_head); ++ if (ret) { ++ jrelse(tx_head); ++ reiser4_drop_io_head(tx_head); ++ return ret; ++ } ++ ++ T = (struct tx_header *)jdata(tx_head); ++ ++ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx)); ++ ++ if (prev_tx == last_flushed_tx) ++ break; ++ ++ jrelse(tx_head); ++ reiser4_drop_io_head(tx_head); ++ } ++ ++ total = le32_to_cpu(get_unaligned(&T->total)); ++ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block)); ++ ++ pin_jnode_data(tx_head); ++ jrelse(tx_head); ++ ++ ret = ++ replay_transaction(s, tx_head, &log_rec_block, ++ jnode_get_block(tx_head), total - 1); ++ ++ unpin_jnode_data(tx_head); ++ reiser4_drop_io_head(tx_head); ++ ++ if (ret) ++ return ret; ++ return -E_REPEAT; ++} ++ ++/* The reiser4 journal current implementation was optimized to not to capture ++ super block if certain super blocks fields are modified. Currently, the set ++ is (<free block count>, <OID allocator>). 
These fields are logged by ++ special way which includes storing them in each transaction head block at ++ atom commit time and writing that information to journal footer block at ++ atom flush time. For getting info from journal footer block to the ++ in-memory super block there is a special function ++ reiser4_journal_recover_sb_data() which should be called after disk format ++ plugin re-reads super block after journal replaying. ++*/ ++ ++/* get the information from journal footer in-memory super block */ ++int reiser4_journal_recover_sb_data(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ struct journal_footer *jf; ++ int ret; ++ ++ assert("zam-673", sbinfo->journal_footer != NULL); ++ ++ ret = jload(sbinfo->journal_footer); ++ if (ret != 0) ++ return ret; ++ ++ ret = check_journal_footer(sbinfo->journal_footer); ++ if (ret != 0) ++ goto out; ++ ++ jf = (struct journal_footer *)jdata(sbinfo->journal_footer); ++ ++ /* was there at least one flushed transaction? */ ++ if (jf->last_flushed_tx) { ++ ++ /* restore free block counter logged in this transaction */ ++ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks))); ++ ++ /* restore oid allocator state */ ++ oid_init_allocator(s, ++ le64_to_cpu(get_unaligned(&jf->nr_files)), ++ le64_to_cpu(get_unaligned(&jf->next_oid))); ++ } ++ out: ++ jrelse(sbinfo->journal_footer); ++ return ret; ++} ++ ++/* reiser4 replay journal procedure */ ++int reiser4_journal_replay(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ jnode *jh, *jf; ++ struct journal_header *header; ++ int nr_tx_replayed = 0; ++ int ret; ++ ++ assert("zam-582", sbinfo != NULL); ++ ++ jh = sbinfo->journal_header; ++ jf = sbinfo->journal_footer; ++ ++ if (!jh || !jf) { ++ /* it is possible that disk layout does not support journal ++ structures, we just warn about this */ ++ warning("zam-583", ++ "journal control blocks were not loaded by disk layout plugin. " ++ "journal replaying is not possible.\n"); ++ return 0; ++ } ++ ++ /* Take free block count from journal footer block. 
The free block ++ counter value corresponds the last flushed transaction state */ ++ ret = jload(jf); ++ if (ret < 0) ++ return ret; ++ ++ ret = check_journal_footer(jf); ++ if (ret) { ++ jrelse(jf); ++ return ret; ++ } ++ ++ jrelse(jf); ++ ++ /* store last committed transaction info in reiser4 in-memory super ++ block */ ++ ret = jload(jh); ++ if (ret < 0) ++ return ret; ++ ++ ret = check_journal_header(jh); ++ if (ret) { ++ jrelse(jh); ++ return ret; ++ } ++ ++ header = (struct journal_header *)jdata(jh); ++ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx)); ++ ++ jrelse(jh); ++ ++ /* replay committed transactions */ ++ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT) ++ nr_tx_replayed++; ++ ++ return ret; ++} ++ ++/* load journal control block (either journal header or journal footer block) */ ++static int ++load_journal_control_block(jnode ** node, const reiser4_block_nr * block) ++{ ++ int ret; ++ ++ *node = reiser4_alloc_io_head(block); ++ if (!(*node)) ++ return RETERR(-ENOMEM); ++ ++ ret = jload(*node); ++ ++ if (ret) { ++ reiser4_drop_io_head(*node); ++ *node = NULL; ++ return ret; ++ } ++ ++ pin_jnode_data(*node); ++ jrelse(*node); ++ ++ return 0; ++} ++ ++/* unload journal header or footer and free jnode */ ++static void unload_journal_control_block(jnode ** node) ++{ ++ if (*node) { ++ unpin_jnode_data(*node); ++ reiser4_drop_io_head(*node); ++ *node = NULL; ++ } ++} ++ ++/* release journal control blocks */ ++void reiser4_done_journal_info(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ ++ assert("zam-476", sbinfo != NULL); ++ ++ unload_journal_control_block(&sbinfo->journal_header); ++ unload_journal_control_block(&sbinfo->journal_footer); ++ rcu_barrier(); ++} ++ ++/* load journal control blocks */ ++int reiser4_init_journal_info(struct super_block *s) ++{ ++ reiser4_super_info_data *sbinfo = get_super_private(s); ++ journal_location *loc; ++ int ret; ++ ++ loc = &sbinfo->jloc; ++ ++ assert("zam-651", loc != NULL); ++ assert("zam-652", loc->header != 0); ++ assert("zam-653", loc->footer != 0); ++ ++ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header); ++ ++ if (ret) ++ return ret; ++ ++ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer); ++ ++ if (ret) { ++ unload_journal_control_block(&sbinfo->journal_header); ++ } ++ ++ return ret; ++} ++ ++/* Make Linus happy. 
++
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 80
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/wander.h linux-2.6.33/fs/reiser4/wander.h
+--- linux-2.6.33.orig/fs/reiser4/wander.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/wander.h 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,135 @@
++/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
++
++#if !defined (__FS_REISER4_WANDER_H__)
++#define __FS_REISER4_WANDER_H__
++
++#include "dformat.h"
++
++#include <linux/fs.h> /* for struct super_block */
++
++/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
++
++#define TX_HEADER_MAGIC "TxMagic4"
++#define WANDER_RECORD_MAGIC "LogMagc4"
++
++#define TX_HEADER_MAGIC_SIZE (8)
++#define WANDER_RECORD_MAGIC_SIZE (8)
++
++/* journal header block format */
++struct journal_header {
++ /* last written transaction head location */
++ d64 last_committed_tx;
++};
++
++typedef struct journal_location {
++ reiser4_block_nr footer;
++ reiser4_block_nr header;
++} journal_location;
++
++/* The wander.c head comment describes the usage and semantics of all these
++ structures */
++/* journal footer block format */
++struct journal_footer {
++ /* last flushed transaction location. */
++ /* This block number is no longer valid once the transaction it points
++ to has been flushed; it is used only at journal replay time for
++ detection of the end of the on-disk list of committed transactions
++ which were not flushed completely */
++ d64 last_flushed_tx;
++
++ /* the free block counter is written to the journal footer at
++ transaction flush time, not to the super block, because the free
++ block counter is logged differently from super block fields (the
++ root pointer, for example). */
++ d64 free_blocks;
++
++ /* number of used OIDs and maximal used OID are logged separately from
++ the super block */
++ d64 nr_files;
++ d64 next_oid;
++};
++
++/* Each wander record (except the first one) has a unified format: a wander
++ record header followed by an array of log entries */
++struct wander_record_header {
++ /* when there is no predefined location for wander records, this magic
++ string should help reiser4fsck. */
++ char magic[WANDER_RECORD_MAGIC_SIZE];
++
++ /* transaction id */
++ d64 id;
++
++ /* total number of wander records in current transaction */
++ d32 total;
++
++ /* sequence number of this block within the transaction */
++ d32 serial;
++
++ /* block number of the next wander record in the transaction */
++ d64 next_block;
++};
++
++/* The first wander record (transaction head) of a written transaction has a
++ special format */
++struct tx_header {
++ /* magic string makes the first block in the transaction different from
++ other logged blocks; it should help fsck. 
*/
++ char magic[TX_HEADER_MAGIC_SIZE];
++
++ /* transaction id */
++ d64 id;
++
++ /* total number of records (including this first tx head) in the
++ transaction */
++ d32 total;
++
++ /* align next field to 8-byte boundary; this field is always zero */
++ d32 padding;
++
++ /* block number of previous transaction head */
++ d64 prev_tx;
++
++ /* next wander record location */
++ d64 next_block;
++
++ /* committed version of the free block counter */
++ d64 free_blocks;
++
++ /* number of used OIDs (nr_files) and maximal used OID are logged
++ separately from super block */
++ d64 nr_files;
++ d64 next_oid;
++};
++
++/* A transaction gets written to disk as a set of wander records (each wander
++ record is one fs block in size) */
++
++/* As noted above, the rest of a wander record is filled with these log
++ entries; unused space is filled with zeroes */
++struct wander_entry {
++ d64 original; /* block original location */
++ d64 wandered; /* block wandered location */
++};
++
++/* REISER4 JOURNAL WRITER FUNCTIONS */
++
++extern int reiser4_write_logs(long *);
++extern int reiser4_journal_replay(struct super_block *);
++extern int reiser4_journal_recover_sb_data(struct super_block *);
++
++extern int reiser4_init_journal_info(struct super_block *);
++extern void reiser4_done_journal_info(struct super_block *);
++
++extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
++
++#endif /* __FS_REISER4_WANDER_H__ */
++
++/* Make Linus happy.
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 80
++ scroll-step: 1
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/writeout.h linux-2.6.33/fs/reiser4/writeout.h
+--- linux-2.6.33.orig/fs/reiser4/writeout.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/writeout.h 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,21 @@
++/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
++
++#if !defined (__FS_REISER4_WRITEOUT_H__)
++
++#define WRITEOUT_SINGLE_STREAM (0x1)
++#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
++#define WRITEOUT_BARRIER (0x4)
++
++extern int reiser4_get_writeout_flags(void);
++
++#endif /* __FS_REISER4_WRITEOUT_H__ */
++
++/* Make Linus happy.
++ Local variables:
++ c-indentation-style: "K&R"
++ mode-name: "LC"
++ c-basic-offset: 8
++ tab-width: 8
++ fill-column: 80
++ End:
++*/
+diff -urN linux-2.6.33.orig/fs/reiser4/znode.c linux-2.6.33/fs/reiser4/znode.c
+--- linux-2.6.33.orig/fs/reiser4/znode.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.33/fs/reiser4/znode.c 2010-03-04 19:33:22.000000000 +0100
+@@ -0,0 +1,1029 @@
++/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
++ * reiser4/README */
++/* Znode manipulation functions. */
++/* Znode is the in-memory header for a tree node. It is stored
++ separately from the node itself so that it does not get written to
++ disk. In this respect znode is like buffer head or page head. We
++ also use znodes for additional reiser4 specific purposes:
++
++ . they are organized into a tree structure which is a part of the
++ whole reiser4 tree.
++ . they are used to implement node-grained locking
++ . they are used to keep additional state associated with a
++ node
++ . they contain links to lists used by the transaction manager
++
++ Znode is attached to some variable "block number" which is an instance of
++ the fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without the
++ appropriate node actually being loaded in memory. 
Existence of znode itself ++ is regulated by reference count (->x_count) in it. Each time thread ++ acquires reference to znode through call to zget(), ->x_count is ++ incremented and decremented on call to zput(). Data (content of node) are ++ brought in memory through call to zload(), which also increments ->d_count ++ reference counter. zload can block waiting on IO. Call to zrelse() ++ decreases this counter. Also, ->c_count keeps track of number of child ++ znodes and prevents parent znode from being recycled until all of its ++ children are. ->c_count is decremented whenever child goes out of existence ++ (being actually recycled in zdestroy()) which can be some time after last ++ reference to this child dies if we support some form of LRU cache for ++ znodes. ++ ++*/ ++/* EVERY ZNODE'S STORY ++ ++ 1. His infancy. ++ ++ Once upon a time, the znode was born deep inside of zget() by call to ++ zalloc(). At the return from zget() znode had: ++ ++ . reference counter (x_count) of 1 ++ . assigned block number, marked as used in bitmap ++ . pointer to parent znode. Root znode parent pointer points ++ to its father: "fake" znode. This, in turn, has NULL parent pointer. ++ . hash table linkage ++ . no data loaded from disk ++ . no node plugin ++ . no sibling linkage ++ ++ 2. His childhood ++ ++ Each node is either brought into memory as a result of tree traversal, or ++ created afresh, creation of the root being a special case of the latter. In ++ either case it's inserted into sibling list. This will typically require ++ some ancillary tree traversing, but ultimately both sibling pointers will ++ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in ++ zjnode.state. ++ ++ 3. His youth. ++ ++ If znode is bound to already existing node in a tree, its content is read ++ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set ++ in zjnode.state and zdata() function starts to return non null for this ++ znode. zload() further calls zparse() that determines which node layout ++ this node is rendered in, and sets ->nplug on success. ++ ++ If znode is for new node just created, memory for it is allocated and ++ zinit_new() function is called to initialise data, according to selected ++ node layout. ++ ++ 4. His maturity. ++ ++ After this point, znode lingers in memory for some time. Threads can ++ acquire references to znode either by blocknr through call to zget(), or by ++ following a pointer to unallocated znode from internal item. Each time ++ reference to znode is obtained, x_count is increased. Thread can read/write ++ lock znode. Znode data can be loaded through calls to zload(), d_count will ++ be increased appropriately. If all references to znode are released ++ (x_count drops to 0), znode is not recycled immediately. Rather, it is ++ still cached in the hash table in the hope that it will be accessed ++ shortly. ++ ++ There are two ways in which znode existence can be terminated: ++ ++ . sudden death: node bound to this znode is removed from the tree ++ . overpopulation: znode is purged out of memory due to memory pressure ++ ++ 5. His death. ++ ++ Death is complex process. ++ ++ When we irrevocably commit ourselves to decision to remove node from the ++ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding ++ znode. This is done either in ->kill_hook() of internal item or in ++ reiser4_kill_root() function when tree root is removed. ++ ++ At this moment znode still has: ++ ++ . locks held on it, necessary write ones ++ . 
references to it ++ . disk block assigned to it ++ . data loaded from the disk ++ . pending requests for lock ++ ++ But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node ++ deletion. Node deletion includes two phases. First all ways to get ++ references to that znode (sibling and parent links and hash lookup using ++ block number stored in parent node) should be deleted -- it is done through ++ sibling_list_remove(), also we assume that nobody uses down link from ++ parent node due to its nonexistence or proper parent node locking and ++ nobody uses parent pointers from children due to absence of them. Second we ++ invalidate all pending lock requests which still are on znode's lock ++ request queue, this is done by reiser4_invalidate_lock(). Another ++ JNODE_IS_DYING znode status bit is used to invalidate pending lock requests. ++ Once it set all requesters are forced to return -EINVAL from ++ longterm_lock_znode(). Future locking attempts are not possible because all ++ ways to get references to that znode are removed already. Last, node is ++ uncaptured from transaction. ++ ++ When last reference to the dying znode is just about to be released, ++ block number for this lock is released and znode is removed from the ++ hash table. ++ ++ Now znode can be recycled. ++ ++ [it's possible to free bitmap block and remove znode from the hash ++ table when last lock is released. This will result in having ++ referenced but completely orphaned znode] ++ ++ 6. Limbo ++ ++ As have been mentioned above znodes with reference counter 0 are ++ still cached in a hash table. Once memory pressure increases they are ++ purged out of there [this requires something like LRU list for ++ efficient implementation. LRU list would also greatly simplify ++ implementation of coord cache that would in this case morph to just ++ scanning some initial segment of LRU list]. Data loaded into ++ unreferenced znode are flushed back to the durable storage if ++ necessary and memory is freed. Znodes themselves can be recycled at ++ this point too. ++ ++*/ ++ ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/plugin_header.h" ++#include "plugin/node/node.h" ++#include "plugin/plugin.h" ++#include "txnmgr.h" ++#include "jnode.h" ++#include "znode.h" ++#include "block_alloc.h" ++#include "tree.h" ++#include "tree_walk.h" ++#include "super.h" ++#include "reiser4.h" ++ ++#include <linux/pagemap.h> ++#include <linux/spinlock.h> ++#include <linux/slab.h> ++#include <linux/err.h> ++ ++static z_hash_table *get_htable(reiser4_tree *, ++ const reiser4_block_nr * const blocknr); ++static z_hash_table *znode_get_htable(const znode *); ++static void zdrop(znode *); ++ ++/* hash table support */ ++ ++/* compare two block numbers for equality. Used by hash-table macros */ ++static inline int ++blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2) ++{ ++ assert("nikita-534", b1 != NULL); ++ assert("nikita-535", b2 != NULL); ++ ++ return *b1 == *b2; ++} ++ ++/* Hash znode by block number. 
Used by hash-table macros */ ++/* Audited by: umka (2002.06.11) */ ++static inline __u32 ++blknrhashfn(z_hash_table * table, const reiser4_block_nr * b) ++{ ++ assert("nikita-536", b != NULL); ++ ++ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1); ++} ++ ++/* The hash table definition */ ++#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) ++#define KFREE(ptr, size) kfree(ptr) ++TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z, ++ blknrhashfn, blknreq); ++#undef KFREE ++#undef KMALLOC ++ ++/* slab for znodes */ ++static struct kmem_cache *znode_cache; ++ ++int znode_shift_order; ++ ++/** ++ * init_znodes - create znode cache ++ * ++ * Initializes slab cache of znodes. It is part of reiser4 module initialization. ++ */ ++int init_znodes(void) ++{ ++ znode_cache = kmem_cache_create("znode", sizeof(znode), 0, ++ SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT, NULL); ++ if (znode_cache == NULL) ++ return RETERR(-ENOMEM); ++ ++ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode); ++ ++znode_shift_order); ++ --znode_shift_order; ++ return 0; ++} ++ ++/** ++ * done_znodes - delete znode cache ++ * ++ * This is called on reiser4 module unloading or system shutdown. ++ */ ++void done_znodes(void) ++{ ++ destroy_reiser4_cache(&znode_cache); ++} ++ ++/* call this to initialise tree of znodes */ ++int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ ) ++{ ++ int result; ++ assert("umka-050", tree != NULL); ++ ++ rwlock_init(&tree->dk_lock); ++ ++ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE); ++ if (result != 0) ++ return result; ++ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE); ++ return result; ++} ++ ++/* free this znode */ ++void zfree(znode * node /* znode to free */ ) ++{ ++ assert("nikita-465", node != NULL); ++ assert("nikita-2120", znode_page(node) == NULL); ++ assert("nikita-2301", list_empty_careful(&node->lock.owners)); ++ assert("nikita-2302", list_empty_careful(&node->lock.requestors)); ++ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) && ++ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED)); ++ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes)); ++ assert("nikita-3293", !znode_is_right_connected(node)); ++ assert("nikita-3294", !znode_is_left_connected(node)); ++ assert("nikita-3295", node->left == NULL); ++ assert("nikita-3296", node->right == NULL); ++ ++ /* not yet phash_jnode_destroy(ZJNODE(node)); */ ++ ++ kmem_cache_free(znode_cache, node); ++} ++ ++/* call this to free tree of znodes */ ++void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ ) ++{ ++ znode *node; ++ znode *next; ++ z_hash_table *ztable; ++ ++ /* scan znode hash-tables and kill all znodes, then free hash tables ++ * themselves. 
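Taking the lifecycle comment above together with the declarations in znode.h, the reference discipline (x_count via zget()/zput(), d_count via zload()/zrelse()) reduces to one canonical pattern. A minimal sketch, illustration only; GFP_KERNEL stands in for whatever gfp mask a real caller would pass.

/* Illustration only: the canonical way to look at a node's data.
 * zget() takes an x_count reference; zload() brings the data into memory
 * and takes a d_count reference; both must be dropped on every path. */
static int peek_at_block(reiser4_tree *tree, const reiser4_block_nr *blocknr,
			 znode *parent, tree_level level)
{
	znode *node;
	int ret;

	node = zget(tree, blocknr, parent, level, GFP_KERNEL);
	if (IS_ERR(node))
		return PTR_ERR(node);

	ret = zload(node);	/* may block on IO */
	if (ret == 0) {
		/* ... inspect zdata(node) while d_count is held ... */
		zrelse(node);	/* drop d_count; data may now be unloaded */
	}
	zput(node);		/* drop x_count; znode may now be recycled */
	return ret;
}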
*/ ++ ++ assert("nikita-795", tree != NULL); ++ ++ ztable = &tree->zhash_table; ++ ++ if (ztable->_table != NULL) { ++ for_all_in_htable(ztable, z, node, next) { ++ node->c_count = 0; ++ node->in_parent.node = NULL; ++ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); ++ zdrop(node); ++ } ++ ++ z_hash_done(&tree->zhash_table); ++ } ++ ++ ztable = &tree->zfake_table; ++ ++ if (ztable->_table != NULL) { ++ for_all_in_htable(ztable, z, node, next) { ++ node->c_count = 0; ++ node->in_parent.node = NULL; ++ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); ++ zdrop(node); ++ } ++ ++ z_hash_done(&tree->zfake_table); ++ } ++} ++ ++/* ZNODE STRUCTURES */ ++ ++/* allocate fresh znode */ ++znode *zalloc(gfp_t gfp_flag /* allocation flag */ ) ++{ ++ znode *node; ++ ++ node = kmem_cache_alloc(znode_cache, gfp_flag); ++ return node; ++} ++ ++/* Initialize fields of znode ++ @node: znode to initialize; ++ @parent: parent znode; ++ @tree: tree we are in. */ ++void zinit(znode * node, const znode * parent, reiser4_tree * tree) ++{ ++ assert("nikita-466", node != NULL); ++ assert("umka-268", current_tree != NULL); ++ ++ memset(node, 0, sizeof *node); ++ ++ assert("umka-051", tree != NULL); ++ ++ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK); ++ reiser4_init_lock(&node->lock); ++ init_parent_coord(&node->in_parent, parent); ++} ++ ++/* ++ * remove znode from indices. This is called jput() when last reference on ++ * znode is released. ++ */ ++void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree) ++{ ++ assert("nikita-2108", node != NULL); ++ assert("nikita-470", node->c_count == 0); ++ assert_rw_write_locked(&(tree->tree_lock)); ++ ++ /* remove reference to this znode from cbk cache */ ++ cbk_cache_invalidate(node, tree); ++ ++ /* update c_count of parent */ ++ if (znode_parent(node) != NULL) { ++ assert("nikita-472", znode_parent(node)->c_count > 0); ++ /* father, onto your hands I forward my spirit... */ ++ znode_parent(node)->c_count--; ++ node->in_parent.node = NULL; ++ } else { ++ /* orphaned znode?! Root? */ ++ } ++ ++ /* remove znode from hash-table */ ++ z_hash_remove_rcu(znode_get_htable(node), node); ++} ++ ++/* zdrop() -- Remove znode from the tree. ++ ++ This is called when znode is removed from the memory. */ ++static void zdrop(znode * node /* znode to finish with */ ) ++{ ++ jdrop(ZJNODE(node)); ++} ++ ++/* ++ * put znode into right place in the hash table. This is called by relocate ++ * code. ++ */ ++int znode_rehash(znode * node /* node to rehash */ , ++ const reiser4_block_nr * new_block_nr /* new block number */ ) ++{ ++ z_hash_table *oldtable; ++ z_hash_table *newtable; ++ reiser4_tree *tree; ++ ++ assert("nikita-2018", node != NULL); ++ ++ tree = znode_get_tree(node); ++ oldtable = znode_get_htable(node); ++ newtable = get_htable(tree, new_block_nr); ++ ++ write_lock_tree(tree); ++ /* remove znode from hash-table */ ++ z_hash_remove_rcu(oldtable, node); ++ ++ /* assertion no longer valid due to RCU */ ++ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */ ++ ++ /* update blocknr */ ++ znode_set_block(node, new_block_nr); ++ node->zjnode.key.z = *new_block_nr; ++ ++ /* insert it into hash */ ++ z_hash_insert_rcu(newtable, node); ++ write_unlock_tree(tree); ++ return 0; ++} ++ ++/* ZNODE LOOKUP, GET, PUT */ ++ ++/* zlook() - get znode with given block_nr in a hash table or return NULL ++ ++ If result is non-NULL then the znode's x_count is incremented. Internal version ++ accepts pre-computed hash index. 
The hash table is accessed under caller's ++ tree->hash_lock. ++*/ ++znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr) ++{ ++ znode *result; ++ __u32 hash; ++ z_hash_table *htable; ++ ++ assert("jmacd-506", tree != NULL); ++ assert("jmacd-507", blocknr != NULL); ++ ++ htable = get_htable(tree, blocknr); ++ hash = blknrhashfn(htable, blocknr); ++ ++ rcu_read_lock(); ++ result = z_hash_find_index(htable, hash, blocknr); ++ ++ if (result != NULL) { ++ add_x_ref(ZJNODE(result)); ++ result = znode_rip_check(tree, result); ++ } ++ rcu_read_unlock(); ++ ++ return result; ++} ++ ++/* return hash table where znode with block @blocknr is (or should be) ++ * stored */ ++static z_hash_table *get_htable(reiser4_tree * tree, ++ const reiser4_block_nr * const blocknr) ++{ ++ z_hash_table *table; ++ if (is_disk_addr_unallocated(blocknr)) ++ table = &tree->zfake_table; ++ else ++ table = &tree->zhash_table; ++ return table; ++} ++ ++/* return hash table where znode @node is (or should be) stored */ ++static z_hash_table *znode_get_htable(const znode * node) ++{ ++ return get_htable(znode_get_tree(node), znode_get_block(node)); ++} ++ ++/* zget() - get znode from hash table, allocating it if necessary. ++ ++ First a call to zlook, locating a x-referenced znode if one ++ exists. If znode is not found, allocate new one and return. Result ++ is returned with x_count reference increased. ++ ++ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK ++ LOCK ORDERING: NONE ++*/ ++znode *zget(reiser4_tree * tree, ++ const reiser4_block_nr * const blocknr, ++ znode * parent, tree_level level, gfp_t gfp_flag) ++{ ++ znode *result; ++ __u32 hashi; ++ ++ z_hash_table *zth; ++ ++ assert("jmacd-512", tree != NULL); ++ assert("jmacd-513", blocknr != NULL); ++ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT); ++ ++ zth = get_htable(tree, blocknr); ++ hashi = blknrhashfn(zth, blocknr); ++ ++ /* NOTE-NIKITA address-as-unallocated-blocknr still is not ++ implemented. */ ++ ++ z_hash_prefetch_bucket(zth, hashi); ++ ++ rcu_read_lock(); ++ /* Find a matching BLOCKNR in the hash table. If the znode is found, ++ we obtain an reference (x_count) but the znode remains unlocked. ++ Have to worry about race conditions later. */ ++ result = z_hash_find_index(zth, hashi, blocknr); ++ /* According to the current design, the hash table lock protects new ++ znode references. */ ++ if (result != NULL) { ++ add_x_ref(ZJNODE(result)); ++ /* NOTE-NIKITA it should be so, but special case during ++ creation of new root makes such assertion highly ++ complicated. 
*/ ++ assert("nikita-2131", 1 || znode_parent(result) == parent || ++ (ZF_ISSET(result, JNODE_ORPHAN) ++ && (znode_parent(result) == NULL))); ++ result = znode_rip_check(tree, result); ++ } ++ ++ rcu_read_unlock(); ++ ++ if (!result) { ++ znode *shadow; ++ ++ result = zalloc(gfp_flag); ++ if (!result) { ++ return ERR_PTR(RETERR(-ENOMEM)); ++ } ++ ++ zinit(result, parent, tree); ++ ZJNODE(result)->blocknr = *blocknr; ++ ZJNODE(result)->key.z = *blocknr; ++ result->level = level; ++ ++ write_lock_tree(tree); ++ ++ shadow = z_hash_find_index(zth, hashi, blocknr); ++ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) { ++ jnode_list_remove(ZJNODE(result)); ++ zfree(result); ++ result = shadow; ++ } else { ++ result->version = znode_build_version(tree); ++ z_hash_insert_index_rcu(zth, hashi, result); ++ ++ if (parent != NULL) ++ ++parent->c_count; ++ } ++ ++ add_x_ref(ZJNODE(result)); ++ ++ write_unlock_tree(tree); ++ } ++#if REISER4_DEBUG ++ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0) ++ reiser4_check_block(blocknr, 1); ++#endif ++ /* Check for invalid tree level, return -EIO */ ++ if (unlikely(znode_get_level(result) != level)) { ++ warning("jmacd-504", ++ "Wrong level for cached block %llu: %i expecting %i", ++ (unsigned long long)(*blocknr), znode_get_level(result), ++ level); ++ zput(result); ++ return ERR_PTR(RETERR(-EIO)); ++ } ++ ++ assert("nikita-1227", znode_invariant(result)); ++ ++ return result; ++} ++ ++/* ZNODE PLUGINS/DATA */ ++ ++/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is ++ stored at the fixed offset from the beginning of the node. */ ++static node_plugin *znode_guess_plugin(const znode * node /* znode to guess ++ * plugin of */ ) ++{ ++ reiser4_tree *tree; ++ ++ assert("nikita-1053", node != NULL); ++ assert("nikita-1055", zdata(node) != NULL); ++ ++ tree = znode_get_tree(node); ++ assert("umka-053", tree != NULL); ++ ++ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) { ++ return tree->nplug; ++ } else { ++ return node_plugin_by_disk_id ++ (tree, &((common_node_header *) zdata(node))->plugin_id); ++#ifdef GUESS_EXISTS ++ reiser4_plugin *plugin; ++ ++ /* NOTE-NIKITA add locking here when dynamic plugins will be ++ * implemented */ ++ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) { ++ if ((plugin->u.node.guess != NULL) ++ && plugin->u.node.guess(node)) ++ return plugin; ++ } ++ warning("nikita-1057", "Cannot guess node plugin"); ++ print_znode("node", node); ++ return NULL; ++#endif ++ } ++} ++ ++/* parse node header and install ->node_plugin */ ++int zparse(znode * node /* znode to parse */ ) ++{ ++ int result; ++ ++ assert("nikita-1233", node != NULL); ++ assert("nikita-2370", zdata(node) != NULL); ++ ++ if (node->nplug == NULL) { ++ node_plugin *nplug; ++ ++ nplug = znode_guess_plugin(node); ++ if (likely(nplug != NULL)) { ++ result = nplug->parse(node); ++ if (likely(result == 0)) ++ node->nplug = nplug; ++ } else { ++ result = RETERR(-EIO); ++ } ++ } else ++ result = 0; ++ return result; ++} ++ ++/* zload with readahead */ ++int zload_ra(znode * node /* znode to load */ , ra_info_t * info) ++{ ++ int result; ++ ++ assert("nikita-484", node != NULL); ++ assert("nikita-1377", znode_invariant(node)); ++ assert("jmacd-7771", !znode_above_root(node)); ++ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0); ++ assert("nikita-3016", reiser4_schedulable()); ++ ++ if (info) ++ formatted_readahead(node, info); ++ ++ result = jload(ZJNODE(node)); ++ assert("nikita-1378", znode_invariant(node)); ++ 
return result; ++} ++ ++/* load content of node into memory */ ++int zload(znode * node) ++{ ++ return zload_ra(node, NULL); ++} ++ ++/* call node plugin to initialise newly allocated node. */ ++int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags) ++{ ++ return jinit_new(ZJNODE(node), gfp_flags); ++} ++ ++/* drop reference to node data. When last reference is dropped, data are ++ unloaded. */ ++void zrelse(znode * node /* znode to release references to */ ) ++{ ++ assert("nikita-1381", znode_invariant(node)); ++ ++ jrelse(ZJNODE(node)); ++} ++ ++/* returns free space in node */ ++unsigned znode_free_space(znode * node /* znode to query */ ) ++{ ++ assert("nikita-852", node != NULL); ++ return node_plugin_by_node(node)->free_space(node); ++} ++ ++/* left delimiting key of znode */ ++reiser4_key *znode_get_rd_key(znode * node /* znode to query */ ) ++{ ++ assert("nikita-958", node != NULL); ++ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-30671", node->rd_key_version != 0); ++ return &node->rd_key; ++} ++ ++/* right delimiting key of znode */ ++reiser4_key *znode_get_ld_key(znode * node /* znode to query */ ) ++{ ++ assert("nikita-974", node != NULL); ++ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk)); ++ assert("nikita-30681", node->ld_key_version != 0); ++ return &node->ld_key; ++} ++ ++ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0); ++ ) ++ ++/* update right-delimiting key of @node */ ++reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key) ++{ ++ assert("nikita-2937", node != NULL); ++ assert("nikita-2939", key != NULL); ++ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk)); ++ assert("nikita-2944", ++ znode_is_any_locked(node) || ++ znode_get_level(node) != LEAF_LEVEL || ++ keyge(key, &node->rd_key) || ++ keyeq(&node->rd_key, reiser4_min_key()) || ++ ZF_ISSET(node, JNODE_HEARD_BANSHEE)); ++ ++ node->rd_key = *key; ++ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version)); ++ return &node->rd_key; ++} ++ ++/* update left-delimiting key of @node */ ++reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key) ++{ ++ assert("nikita-2940", node != NULL); ++ assert("nikita-2941", key != NULL); ++ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); ++ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk)); ++ assert("nikita-2943", ++ znode_is_any_locked(node) || keyeq(&node->ld_key, ++ reiser4_min_key())); ++ ++ node->ld_key = *key; ++ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version)); ++ return &node->ld_key; ++} ++ ++/* true if @key is inside key range for @node */ ++int znode_contains_key(znode * node /* znode to look in */ , ++ const reiser4_key * key /* key to look for */ ) ++{ ++ assert("nikita-1237", node != NULL); ++ assert("nikita-1238", key != NULL); ++ ++ /* left_delimiting_key <= key <= right_delimiting_key */ ++ return keyle(znode_get_ld_key(node), key) ++ && keyle(key, znode_get_rd_key(node)); ++} ++ ++/* same as znode_contains_key(), but lock dk lock */ ++int znode_contains_key_lock(znode * node /* znode to look in */ , ++ const reiser4_key * key /* key to look for */ ) ++{ ++ int result; ++ ++ assert("umka-056", node != NULL); ++ assert("umka-057", key != NULL); ++ ++ read_lock_dk(znode_get_tree(node)); ++ result = znode_contains_key(node, key); ++ read_unlock_dk(znode_get_tree(node)); ++ return 
result;
++}
++
++/* get parent pointer, assuming tree is not locked */
++znode *znode_parent_nolock(const znode * node /* child znode */ )
++{
++	assert("nikita-1444", node != NULL);
++	return node->in_parent.node;
++}
++
++/* get parent pointer of znode */
++znode *znode_parent(const znode * node /* child znode */ )
++{
++	assert("nikita-1226", node != NULL);
++	assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
++	return znode_parent_nolock(node);
++}
++
++/* detect uber znode used to protect in-superblock tree root pointer */
++int znode_above_root(const znode * node /* znode to query */ )
++{
++	assert("umka-059", node != NULL);
++
++	return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
++}
++
++/* check that @node is root, i.e., that its block number is recorded in the
++   tree as that of the root node */
++#if REISER4_DEBUG
++static int znode_is_true_root(const znode * node /* znode to query */ )
++{
++	assert("umka-060", node != NULL);
++	assert("umka-061", current_tree != NULL);
++
++	return disk_addr_eq(znode_get_block(node),
++			    &znode_get_tree(node)->root_block);
++}
++#endif
++
++/* check that @node is root */
++int znode_is_root(const znode * node /* znode to query */ )
++{
++	assert("nikita-1206", node != NULL);
++
++	return znode_get_level(node) == znode_get_tree(node)->height;
++}
++
++/* Returns true if @node was just created by zget() and wasn't ever loaded
++   into memory. */
++/* NIKITA-HANS: yes */
++int znode_just_created(const znode * node)
++{
++	assert("nikita-2188", node != NULL);
++	return (znode_page(node) == NULL);
++}
++
++/* obtain updated ->znode_epoch. See seal.c for description. */
++__u64 znode_build_version(reiser4_tree * tree)
++{
++	__u64 result;
++
++	spin_lock(&tree->epoch_lock);
++	result = ++tree->znode_epoch;
++	spin_unlock(&tree->epoch_lock);
++	return result;
++}
++
++void init_load_count(load_count * dh)
++{
++	assert("nikita-2105", dh != NULL);
++	memset(dh, 0, sizeof *dh);
++}
++
++void done_load_count(load_count * dh)
++{
++	assert("nikita-2106", dh != NULL);
++	if (dh->node != NULL) {
++		for (; dh->d_ref > 0; --dh->d_ref)
++			zrelse(dh->node);
++		dh->node = NULL;
++	}
++}
++
++static int incr_load_count(load_count * dh)
++{
++	int result;
++
++	assert("nikita-2110", dh != NULL);
++	assert("nikita-2111", dh->node != NULL);
++
++	result = zload(dh->node);
++	if (result == 0)
++		++dh->d_ref;
++	return result;
++}
++
++int incr_load_count_znode(load_count * dh, znode * node)
++{
++	assert("nikita-2107", dh != NULL);
++	assert("nikita-2158", node != NULL);
++	assert("nikita-2109",
++	       ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
++
++	dh->node = node;
++	return incr_load_count(dh);
++}
++
++int incr_load_count_jnode(load_count * dh, jnode * node)
++{
++	if (jnode_is_znode(node)) {
++		return incr_load_count_znode(dh, JZNODE(node));
++	}
++	return 0;
++}
++
++void copy_load_count(load_count * new, load_count * old)
++{
++	int ret = 0;
++	done_load_count(new);
++	new->node = old->node;
++	new->d_ref = 0;
++
++	while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
++	}
++
++	assert("jmacd-87589", ret == 0);
++}
++
++void move_load_count(load_count * new, load_count * old)
++{
++	done_load_count(new);
++	new->node = old->node;
++	new->d_ref = old->d_ref;
++	old->node = NULL;
++	old->d_ref = 0;
++}
++
++/* convert parent pointer into coord */
++void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
++{
++	assert("nikita-3204", pcoord != NULL);
++	assert("nikita-3205", coord != NULL);
++
++	
coord_init_first_unit_nocheck(coord, pcoord->node); ++ coord_set_item_pos(coord, pcoord->item_pos); ++ coord->between = AT_UNIT; ++} ++ ++/* pack coord into parent_coord_t */ ++void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord) ++{ ++ assert("nikita-3206", pcoord != NULL); ++ assert("nikita-3207", coord != NULL); ++ ++ pcoord->node = coord->node; ++ pcoord->item_pos = coord->item_pos; ++} ++ ++/* Initialize a parent hint pointer. (parent hint pointer is a field in znode, ++ look for comments there) */ ++void init_parent_coord(parent_coord_t * pcoord, const znode * node) ++{ ++ pcoord->node = (znode *) node; ++ pcoord->item_pos = (unsigned short)~0; ++} ++ ++#if REISER4_DEBUG ++ ++/* debugging aid: znode invariant */ ++static int znode_invariant_f(const znode * node /* znode to check */ , ++ char const **msg /* where to store error ++ * message, if any */ ) ++{ ++#define _ergo(ant, con) \ ++ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) ++ ++#define _equi(e1, e2) \ ++ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2))) ++ ++#define _check(exp) ((*msg) = #exp, (exp)) ++ ++ return jnode_invariant_f(ZJNODE(node), msg) && ++ /* [znode-fake] invariant */ ++ /* fake znode doesn't have a parent, and */ ++ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) && ++ /* there is another way to express this very check, and */ ++ _ergo(znode_above_root(node), znode_parent(node) == NULL) && ++ /* it has special block number, and */ ++ _ergo(znode_get_level(node) == 0, ++ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && ++ /* it is the only znode with such block number, and */ ++ _ergo(!znode_above_root(node) && znode_is_loaded(node), ++ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && ++ /* it is parent of the tree root node */ ++ _ergo(znode_is_true_root(node), ++ znode_above_root(znode_parent(node))) && ++ /* [znode-level] invariant */ ++ /* level of parent znode is one larger than that of child, ++ except for the fake znode, and */ ++ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)), ++ znode_get_level(znode_parent(node)) == ++ znode_get_level(node) + 1) && ++ /* left neighbor is at the same level, and */ ++ _ergo(znode_is_left_connected(node) && node->left != NULL, ++ znode_get_level(node) == znode_get_level(node->left)) && ++ /* right neighbor is at the same level */ ++ _ergo(znode_is_right_connected(node) && node->right != NULL, ++ znode_get_level(node) == znode_get_level(node->right)) && ++ /* [znode-connected] invariant */ ++ _ergo(node->left != NULL, znode_is_left_connected(node)) && ++ _ergo(node->right != NULL, znode_is_right_connected(node)) && ++ _ergo(!znode_is_root(node) && node->left != NULL, ++ znode_is_right_connected(node->left) && ++ node->left->right == node) && ++ _ergo(!znode_is_root(node) && node->right != NULL, ++ znode_is_left_connected(node->right) && ++ node->right->left == node) && ++ /* [znode-c_count] invariant */ ++ /* for any znode, c_count of its parent is greater than 0 */ ++ _ergo(znode_parent(node) != NULL && ++ !znode_above_root(znode_parent(node)), ++ znode_parent(node)->c_count > 0) && ++ /* leaves don't have children */ ++ _ergo(znode_get_level(node) == LEAF_LEVEL, ++ node->c_count == 0) && ++ _check(node->zjnode.jnodes.prev != NULL) && ++ _check(node->zjnode.jnodes.next != NULL) && ++ /* orphan doesn't have a parent */ ++ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) && ++ /* [znode-modify] invariant */ ++ /* if znode is not write-locked, its checksum remains ++ * 
invariant */ ++ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we ++ * cannot check this. */ ++ /* [znode-refs] invariant */ ++ /* only referenced znode can be long-term locked */ ++ _ergo(znode_is_locked(node), ++ atomic_read(&ZJNODE(node)->x_count) != 0); ++} ++ ++/* debugging aid: check znode invariant and panic if it doesn't hold */ ++int znode_invariant(znode * node /* znode to check */ ) ++{ ++ char const *failed_msg; ++ int result; ++ ++ assert("umka-063", node != NULL); ++ assert("umka-064", current_tree != NULL); ++ ++ spin_lock_znode(node); ++ read_lock_tree(znode_get_tree(node)); ++ result = znode_invariant_f(node, &failed_msg); ++ if (!result) { ++ /* print_znode("corrupted node", node); */ ++ warning("jmacd-555", "Condition %s failed", failed_msg); ++ } ++ read_unlock_tree(znode_get_tree(node)); ++ spin_unlock_znode(node); ++ return result; ++} ++ ++/* return non-0 iff data are loaded into znode */ ++int znode_is_loaded(const znode * node /* znode to query */ ) ++{ ++ assert("nikita-497", node != NULL); ++ return jnode_is_loaded(ZJNODE(node)); ++} ++ ++unsigned long znode_times_locked(const znode * z) ++{ ++ return z->times_locked; ++} ++ ++#endif /* REISER4_DEBUG */ ++ ++/* Make Linus happy. ++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/fs/reiser4/znode.h linux-2.6.33/fs/reiser4/znode.h +--- linux-2.6.33.orig/fs/reiser4/znode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.33/fs/reiser4/znode.h 2010-03-04 19:33:22.000000000 +0100 +@@ -0,0 +1,433 @@ ++/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by ++ * reiser4/README */ ++ ++/* Declaration of znode (Zam's node). See znode.c for more details. */ ++ ++#ifndef __ZNODE_H__ ++#define __ZNODE_H__ ++ ++#include "forward.h" ++#include "debug.h" ++#include "dformat.h" ++#include "key.h" ++#include "coord.h" ++#include "plugin/node/node.h" ++#include "jnode.h" ++#include "lock.h" ++#include "readahead.h" ++ ++#include <linux/types.h> ++#include <linux/spinlock.h> ++#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */ ++#include <asm/atomic.h> ++ ++/* znode tracks its position within parent (internal item in a parent node, ++ * that contains znode's block number). */ ++typedef struct parent_coord { ++ znode *node; ++ pos_in_node_t item_pos; ++} parent_coord_t; ++ ++/* &znode - node in a reiser4 tree. ++ ++ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce ++ cacheline pressure. ++ ++ Locking: ++ ++ Long term: data in a disk node attached to this znode are protected ++ by long term, deadlock aware lock ->lock; ++ ++ Spin lock: the following fields are protected by the spin lock: ++ ++ ->lock ++ ++ Following fields are protected by the global tree lock: ++ ++ ->left ++ ->right ++ ->in_parent ++ ->c_count ++ ++ Following fields are protected by the global delimiting key lock (dk_lock): ++ ++ ->ld_key (to update ->ld_key long-term lock on the node is also required) ++ ->rd_key ++ ++ Following fields are protected by the long term lock: ++ ++ ->nr_items ++ ++ ->node_plugin is never changed once set. This means that after code made ++ itself sure that field is valid it can be accessed without any additional ++ locking. ++ ++ ->level is immutable. 
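To make the field/lock table above concrete: the delimiting keys may only be read under dk_lock, the same discipline znode_contains_key_lock() follows in znode.c. An illustration-only sketch; the helper name is invented.

/* Illustration only: ->ld_key/->rd_key are guarded by the per-tree dk_lock,
 * so a reader copies the key out while holding the read side of that lock. */
static reiser4_key read_ld_key(znode *node)
{
	reiser4_key key;

	read_lock_dk(znode_get_tree(node));
	key = *znode_get_ld_key(node);
	read_unlock_dk(znode_get_tree(node));
	return key;
}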
++ ++ Invariants involving this data-type: ++ ++ [znode-fake] ++ [znode-level] ++ [znode-connected] ++ [znode-c_count] ++ [znode-refs] ++ [jnode-refs] ++ [jnode-queued] ++ [znode-modify] ++ ++ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks. ++ Suggestions for how to do that are desired.*/ ++struct znode { ++ /* Embedded jnode. */ ++ jnode zjnode; ++ ++ /* contains three subfields, node, pos_in_node, and pos_in_unit. ++ ++ pos_in_node and pos_in_unit are only hints that are cached to ++ speed up lookups during balancing. They are not required to be up to ++ date. Synched in find_child_ptr(). ++ ++ This value allows us to avoid expensive binary searches. ++ ++ in_parent->node points to the parent of this node, and is NOT a ++ hint. ++ */ ++ parent_coord_t in_parent; ++ ++ /* ++ * sibling list pointers ++ */ ++ ++ /* left-neighbor */ ++ znode *left; ++ /* right-neighbor */ ++ znode *right; ++ ++ /* long term lock on node content. This lock supports deadlock ++ detection. See lock.c ++ */ ++ zlock lock; ++ ++ /* You cannot remove from memory a node that has children in ++ memory. This is because we rely on the fact that parent of given ++ node can always be reached without blocking for io. When reading a ++ node into memory you must increase the c_count of its parent, when ++ removing it from memory you must decrease the c_count. This makes ++ the code simpler, and the cases where it is suboptimal are truly ++ obscure. ++ */ ++ int c_count; ++ ++ /* plugin of node attached to this znode. NULL if znode is not ++ loaded. */ ++ node_plugin *nplug; ++ ++ /* version of znode data. This is increased on each modification. This ++ * is necessary to implement seals (see seal.[ch]) efficiently. */ ++ __u64 version; ++ ++ /* left delimiting key. Necessary to efficiently perform ++ balancing with node-level locking. Kept in memory only. */ ++ reiser4_key ld_key; ++ /* right delimiting key. */ ++ reiser4_key rd_key; ++ ++ /* znode's tree level */ ++ __u16 level; ++ /* number of items in this node. This field is modified by node ++ * plugin. */ ++ __u16 nr_items; ++ ++#if REISER4_DEBUG ++ void *creator; ++ reiser4_key first_key; ++ unsigned long times_locked; ++ int left_version; /* when node->left was updated */ ++ int right_version; /* when node->right was updated */ ++ int ld_key_version; /* when node->ld_key was updated */ ++ int rd_key_version; /* when node->rd_key was updated */ ++#endif ++ ++} __attribute__ ((aligned(16))); ++ ++ON_DEBUG(extern atomic_t delim_key_version; ++ ) ++ ++/* In general I think these macros should not be exposed. */ ++#define znode_is_locked(node) (lock_is_locked(&node->lock)) ++#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock)) ++#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock)) ++#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock)) ++#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock)) ++#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode)) ++/* Macros for accessing the znode state. 
*/ ++#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f)) ++#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f)) ++#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f)) ++extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block, ++ znode * parent, tree_level level, gfp_t gfp_flag); ++extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block); ++extern int zload(znode * node); ++extern int zload_ra(znode * node, ra_info_t * info); ++extern int zinit_new(znode * node, gfp_t gfp_flags); ++extern void zrelse(znode * node); ++extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block); ++ ++/* size of data in znode */ ++static inline unsigned ++znode_size(const znode * node UNUSED_ARG /* znode to query */ ) ++{ ++ assert("nikita-1416", node != NULL); ++ return PAGE_CACHE_SIZE; ++} ++ ++extern void parent_coord_to_coord(const parent_coord_t * pcoord, ++ coord_t * coord); ++extern void coord_to_parent_coord(const coord_t * coord, ++ parent_coord_t * pcoord); ++extern void init_parent_coord(parent_coord_t * pcoord, const znode * node); ++ ++extern unsigned znode_free_space(znode * node); ++ ++extern reiser4_key *znode_get_rd_key(znode * node); ++extern reiser4_key *znode_get_ld_key(znode * node); ++ ++extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key); ++extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key); ++ ++/* `connected' state checks */ ++static inline int znode_is_right_connected(const znode * node) ++{ ++ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED); ++} ++ ++static inline int znode_is_left_connected(const znode * node) ++{ ++ return ZF_ISSET(node, JNODE_LEFT_CONNECTED); ++} ++ ++static inline int znode_is_connected(const znode * node) ++{ ++ return znode_is_right_connected(node) && znode_is_left_connected(node); ++} ++ ++extern int znode_shift_order; ++extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr); ++extern void znode_remove(znode *, reiser4_tree *); ++extern znode *znode_parent(const znode * node); ++extern znode *znode_parent_nolock(const znode * node); ++extern int znode_above_root(const znode * node); ++extern int init_znodes(void); ++extern void done_znodes(void); ++extern int znodes_tree_init(reiser4_tree * ztree); ++extern void znodes_tree_done(reiser4_tree * ztree); ++extern int znode_contains_key(znode * node, const reiser4_key * key); ++extern int znode_contains_key_lock(znode * node, const reiser4_key * key); ++extern unsigned znode_save_free_space(znode * node); ++extern unsigned znode_recover_free_space(znode * node); ++extern znode *zalloc(gfp_t gfp_flag); ++extern void zinit(znode *, const znode * parent, reiser4_tree *); ++extern int zparse(znode * node); ++ ++extern int znode_just_created(const znode * node); ++ ++extern void zfree(znode * node); ++ ++#if REISER4_DEBUG ++extern void print_znode(const char *prefix, const znode * node); ++#else ++#define print_znode( p, n ) noop ++#endif ++ ++/* Make it look like various znode functions exist instead of treating znodes as ++ jnodes in znode-specific code. 
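This delegation works because struct znode embeds its jnode as the zjnode field shown earlier, so converting between the two views is pure pointer arithmetic. Roughly, and only as an illustration of the idea (the real converter definitions live in the reiser4 headers and may differ):

/* Illustration only: plausible shape of the converters used below.
 * container_of() comes from <linux/kernel.h>. */
#define ZJNODE(x) (&(x)->zjnode)                   /* znode -> embedded jnode */
#define JZNODE(x) container_of((x), znode, zjnode) /* jnode -> owning znode */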
*/ ++#define znode_page(x) jnode_page ( ZJNODE(x) ) ++#define zdata(x) jdata ( ZJNODE(x) ) ++#define znode_get_block(x) jnode_get_block ( ZJNODE(x) ) ++#define znode_created(x) jnode_created ( ZJNODE(x) ) ++#define znode_set_created(x) jnode_set_created ( ZJNODE(x) ) ++#define znode_convertible(x) jnode_convertible (ZJNODE(x)) ++#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x)) ++ ++#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) ) ++#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) ) ++#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) ) ++#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) ) ++ ++#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) ) ++#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) ) ++#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) ) ++#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) ) ++#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) ) ++ ++#if REISER4_DEBUG ++extern int znode_x_count_is_protected(const znode * node); ++extern int znode_invariant(znode * node); ++#endif ++ ++/* acquire reference to @node */ ++static inline znode *zref(znode * node) ++{ ++ /* change of x_count from 0 to 1 is protected by tree spin-lock */ ++ return JZNODE(jref(ZJNODE(node))); ++} ++ ++/* release reference to @node */ ++static inline void zput(znode * node) ++{ ++ assert("nikita-3564", znode_invariant(node)); ++ jput(ZJNODE(node)); ++} ++ ++/* get the level field for a znode */ ++static inline tree_level znode_get_level(const znode * node) ++{ ++ return node->level; ++} ++ ++/* get the level field for a jnode */ ++static inline tree_level jnode_get_level(const jnode * node) ++{ ++ if (jnode_is_znode(node)) ++ return znode_get_level(JZNODE(node)); ++ else ++ /* unformatted nodes are all at the LEAF_LEVEL and for ++ "semi-formatted" nodes like bitmaps, level doesn't matter. */ ++ return LEAF_LEVEL; ++} ++ ++/* true if jnode is on leaf level */ ++static inline int jnode_is_leaf(const jnode * node) ++{ ++ if (jnode_is_znode(node)) ++ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL); ++ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK) ++ return 1; ++ return 0; ++} ++ ++/* return znode's tree */ ++static inline reiser4_tree *znode_get_tree(const znode * node) ++{ ++ assert("nikita-2692", node != NULL); ++ return jnode_get_tree(ZJNODE(node)); ++} ++ ++/* resolve race with zput */ ++static inline znode *znode_rip_check(reiser4_tree * tree, znode * node) ++{ ++ jnode *j; ++ ++ j = jnode_rip_sync(tree, ZJNODE(node)); ++ if (likely(j != NULL)) ++ node = JZNODE(j); ++ else ++ node = NULL; ++ return node; ++} ++ ++#if defined(REISER4_DEBUG) ++int znode_is_loaded(const znode * node /* znode to query */ ); ++#endif ++ ++extern __u64 znode_build_version(reiser4_tree * tree); ++ ++/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We ++ must load the data for a node in many places. We could do this by simply calling ++ zload() everywhere, the difficulty arises when we must release the loaded data by ++ calling zrelse. In a function with many possible error/return paths, it requires extra ++ work to figure out which exit paths must call zrelse and those which do not. The data ++ handle automatically calls zrelse for every zload that it is responsible for. In that ++ sense, it acts much like a lock_handle. 
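One illustration-only sketch of the payoff before the declarations: with a load_count handle, a function with several failure exits needs a single cleanup point instead of a zrelse() call on every path. The helper name and the -ENOSPC case are invented.

/* Illustration only: done_load_count() undoes however many zload()s the
 * handle performed, so every exit path can share one cleanup label. */
static int check_node(znode *node)
{
	load_count lc;
	int ret;

	init_load_count(&lc);
	ret = incr_load_count_znode(&lc, node);	/* zload() under the hood */
	if (ret != 0)
		goto out;

	if (znode_free_space(node) == 0) {
		ret = RETERR(-ENOSPC);
		goto out;	/* no explicit zrelse() needed here */
	}
	/* ... more checks that may jump to out ... */
	ret = 0;
out:
	done_load_count(&lc);	/* zrelse() exactly d_ref times */
	return ret;
}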
++*/ ++typedef struct load_count { ++ znode *node; ++ int d_ref; ++} load_count; ++ ++extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */ ++extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */ ++extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */ ++extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as ++ * incr_load_count_znode, otherwise do nothing (unformatted nodes ++ * don't require zload/zrelse treatment). */ ++extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */ ++extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */ ++ ++/* Variable initializers for load_count. */ ++#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 } ++#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 } ++/* A convenience macro for use in assertions or debug-only code, where loaded ++ data is only required to perform the debugging check. This macro ++ encapsulates an expression inside a pair of calls to zload()/zrelse(). */ ++#define WITH_DATA( node, exp ) \ ++({ \ ++ long __with_dh_result; \ ++ znode *__with_dh_node; \ ++ \ ++ __with_dh_node = ( node ); \ ++ __with_dh_result = zload( __with_dh_node ); \ ++ if( __with_dh_result == 0 ) { \ ++ __with_dh_result = ( long )( exp ); \ ++ zrelse( __with_dh_node ); \ ++ } \ ++ __with_dh_result; \ ++}) ++ ++/* Same as above, but accepts a return value in case zload fails. */ ++#define WITH_DATA_RET( node, ret, exp ) \ ++({ \ ++ int __with_dh_result; \ ++ znode *__with_dh_node; \ ++ \ ++ __with_dh_node = ( node ); \ ++ __with_dh_result = zload( __with_dh_node ); \ ++ if( __with_dh_result == 0 ) { \ ++ __with_dh_result = ( int )( exp ); \ ++ zrelse( __with_dh_node ); \ ++ } else \ ++ __with_dh_result = ( ret ); \ ++ __with_dh_result; \ ++}) ++ ++#define WITH_COORD(coord, exp) \ ++({ \ ++ coord_t *__coord; \ ++ \ ++ __coord = (coord); \ ++ coord_clear_iplug(__coord); \ ++ WITH_DATA(__coord->node, exp); \ ++}) ++ ++#if REISER4_DEBUG ++#define STORE_COUNTERS \ ++ reiser4_lock_cnt_info __entry_counters = \ ++ *reiser4_lock_counters() ++#define CHECK_COUNTERS \ ++ON_DEBUG_CONTEXT( \ ++({ \ ++ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \ ++ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \ ++ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \ ++ assert("nikita-2159", \ ++ !memcmp(&__entry_counters, reiser4_lock_counters(), \ ++ sizeof __entry_counters)); \ ++}) ) ++ ++#else ++#define STORE_COUNTERS ++#define CHECK_COUNTERS noop ++#endif ++ ++/* __ZNODE_H__ */ ++#endif ++ ++/* Make Linus happy. 
++ Local variables: ++ c-indentation-style: "K&R" ++ mode-name: "LC" ++ c-basic-offset: 8 ++ tab-width: 8 ++ fill-column: 120 ++ End: ++*/ +diff -urN linux-2.6.33.orig/include/linux/fs.h linux-2.6.33/include/linux/fs.h +--- linux-2.6.33.orig/include/linux/fs.h 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/include/linux/fs.h 2010-03-04 19:33:22.000000000 +0100 +@@ -511,6 +511,7 @@ + struct page; + struct address_space; + struct writeback_control; ++struct bdi_writeback; + + struct iov_iter { + const struct iovec *iov; +@@ -1567,7 +1568,11 @@ + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); +- ++ int (*writeback_inodes)(struct super_block *sb, ++ struct bdi_writeback *wb, ++ struct writeback_control *wbc); ++ void (*sync_inodes) (struct super_block *sb, ++ struct writeback_control *wbc); + int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_stats)(struct seq_file *, struct vfsmount *); + #ifdef CONFIG_QUOTA +@@ -2074,6 +2079,12 @@ + extern int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); + extern int write_inode_now(struct inode *, int); ++extern void writeback_skip_sb_inodes(struct super_block *sb, ++ struct bdi_writeback *wb); ++extern void writeback_inodes_wbc(struct writeback_control *wbc); ++extern int generic_writeback_sb_inodes(struct super_block *sb, ++ struct bdi_writeback *wb, ++ struct writeback_control *wbc); + extern int filemap_fdatawrite(struct address_space *); + extern int filemap_flush(struct address_space *); + extern int filemap_fdatawait(struct address_space *); +diff -urN linux-2.6.33.orig/include/linux/mm.h linux-2.6.33/include/linux/mm.h +--- linux-2.6.33.orig/include/linux/mm.h 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/include/linux/mm.h 2010-03-04 19:33:22.000000000 +0100 +@@ -850,6 +850,7 @@ + void account_page_dirtied(struct page *page, struct address_space *mapping); + int set_page_dirty(struct page *page); + int set_page_dirty_lock(struct page *page); ++int set_page_dirty_notag(struct page *page); + int clear_page_dirty_for_io(struct page *page); + + extern unsigned long move_page_tables(struct vm_area_struct *vma, +diff -urN linux-2.6.33.orig/include/linux/writeback.h linux-2.6.33/include/linux/writeback.h +--- linux-2.6.33.orig/include/linux/writeback.h 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/include/linux/writeback.h 2010-03-04 19:33:22.000000000 +0100 +@@ -13,6 +13,12 @@ + extern struct list_head inode_in_use; + extern struct list_head inode_unused; + ++static inline int is_flush_bd_task(struct task_struct *task) ++{ ++ return task->flags & PF_FLUSHER; ++} ++#define current_is_flush_bd_task() is_flush_bd_task(current) ++ + /* + * fs/fs-writeback.c + */ +@@ -34,6 +40,9 @@ + enum writeback_sync_modes sync_mode; + unsigned long *older_than_this; /* If !NULL, only write back inodes + older than this */ ++ unsigned long wb_start; /* Time writeback_inodes_wb was ++ called. 
This is needed to avoid ++ extra jobs and livelock */ + long nr_to_write; /* Write this many pages, and decrement + this for each page written */ + long pages_skipped; /* Pages which were not written */ +diff -urN linux-2.6.33.orig/mm/filemap.c linux-2.6.33/mm/filemap.c +--- linux-2.6.33.orig/mm/filemap.c 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/mm/filemap.c 2010-03-04 19:33:22.000000000 +0100 +@@ -139,6 +139,7 @@ + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } + } ++EXPORT_SYMBOL(__remove_from_page_cache); + + void remove_from_page_cache(struct page *page) + { +@@ -151,6 +152,7 @@ + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + } ++EXPORT_SYMBOL(remove_from_page_cache); + + static int sync_page(void *word) + { +@@ -948,6 +950,7 @@ + { + ra->ra_pages /= 4; + } ++EXPORT_SYMBOL(find_get_pages); + + /** + * do_generic_file_read - generic file read routine +diff -urN linux-2.6.33.orig/mm/page-writeback.c linux-2.6.33/mm/page-writeback.c +--- linux-2.6.33.orig/mm/page-writeback.c 2010-02-24 19:52:17.000000000 +0100 ++++ linux-2.6.33/mm/page-writeback.c 2010-03-04 19:33:22.000000000 +0100 +@@ -1130,6 +1130,32 @@ + EXPORT_SYMBOL(__set_page_dirty_nobuffers); + + /* ++ * set_page_dirty_notag() -- similar to __set_page_dirty_nobuffers() ++ * except it doesn't tag the page dirty in the page-cache radix tree. ++ * This means that the address space using this cannot use the regular ++ * filemap ->writepages() helpers and must provide its own means of ++ * tracking and finding non-tagged dirty pages. ++ * ++ * NOTE: furthermore, this version also doesn't handle truncate races. ++ */ ++int set_page_dirty_notag(struct page *page) ++{ ++ struct address_space *mapping = page->mapping; ++ ++ if (!TestSetPageDirty(page)) { ++ unsigned long flags; ++ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); ++ local_irq_save(flags); ++ account_page_dirtied(page, mapping); ++ local_irq_restore(flags); ++ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(set_page_dirty_notag); ++ ++/* + * When a writepage implementation decides that it doesn't want to write this + * page for some reason, it should redirty the locked page via + * redirty_page_for_writepage() and it should then unlock the page and return 0 diff --git a/pkgs/core/kernel/patches/routes-2.6.31.1-16.diff b/pkgs/core/kernel/patches/routes-2.6.31.1-16.diff deleted file mode 100644 index ece47c5..0000000 --- a/pkgs/core/kernel/patches/routes-2.6.31.1-16.diff +++ /dev/null @@ -1,1333 +0,0 @@ -diff -urp v2.6.31/linux/include/linux/rtnetlink.h linux/include/linux/rtnetlink.h ---- v2.6.31/linux/include/linux/rtnetlink.h 2009-06-13 10:53:56.000000000 +0300 -+++ linux/include/linux/rtnetlink.h 2009-09-11 22:11:20.000000000 +0300 -@@ -311,6 +311,8 @@ struct rtnexthop - #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ - #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ - #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ -+#define RTNH_F_SUSPECT 8 /* We don't know the real state */ -+#define RTNH_F_BADSTATE (RTNH_F_DEAD | RTNH_F_SUSPECT) - - /* Macros to handle hexthops */ - -diff -urp v2.6.31/linux/include/net/flow.h linux/include/net/flow.h ---- v2.6.31/linux/include/net/flow.h 2009-03-25 09:48:32.000000000 +0200 -+++ linux/include/net/flow.h 2009-09-11 22:12:39.000000000 +0300 -@@ -19,6 +19,8 @@ struct flowi { - struct { - __be32 daddr; - __be32 saddr; -+ __be32 lsrc; -+ __be32 gw; - __u8 tos; - __u8 scope; - } ip4_u; 
-@@ -43,6 +45,8 @@ struct flowi { - #define fl6_flowlabel nl_u.ip6_u.flowlabel - #define fl4_dst nl_u.ip4_u.daddr - #define fl4_src nl_u.ip4_u.saddr -+#define fl4_lsrc nl_u.ip4_u.lsrc -+#define fl4_gw nl_u.ip4_u.gw - #define fl4_tos nl_u.ip4_u.tos - #define fl4_scope nl_u.ip4_u.scope - -diff -urp v2.6.31/linux/include/net/ip_fib.h linux/include/net/ip_fib.h ---- v2.6.31/linux/include/net/ip_fib.h 2009-09-11 10:27:15.000000000 +0300 -+++ linux/include/net/ip_fib.h 2009-09-11 22:11:20.000000000 +0300 -@@ -204,6 +204,8 @@ extern int fib_lookup(struct net *n, str - extern struct fib_table *fib_new_table(struct net *net, u32 id); - extern struct fib_table *fib_get_table(struct net *net, u32 id); - -+extern int fib_result_table(struct fib_result *res); -+ - #endif /* CONFIG_IP_MULTIPLE_TABLES */ - - /* Exported by fib_frontend.c */ -@@ -273,4 +275,6 @@ static inline void fib_proc_exit(struct - } - #endif - -+extern rwlock_t fib_nhflags_lock; -+ - #endif /* _NET_FIB_H */ -diff -urp v2.6.31/linux/include/net/netfilter/nf_nat.h linux/include/net/netfilter/nf_nat.h ---- v2.6.31/linux/include/net/netfilter/nf_nat.h 2009-06-13 10:53:57.000000000 +0300 -+++ linux/include/net/netfilter/nf_nat.h 2009-09-11 22:12:39.000000000 +0300 -@@ -78,6 +78,13 @@ struct nf_conn_nat - #endif - }; - -+/* Call input routing for SNAT-ed traffic */ -+extern unsigned int ip_nat_route_input(unsigned int hooknum, -+ struct sk_buff *skb, -+ const struct net_device *in, -+ const struct net_device *out, -+ int (*okfn)(struct sk_buff *)); -+ - /* Set up the info structure to map into this range. */ - extern unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, -diff -urp v2.6.31/linux/include/net/route.h linux/include/net/route.h ---- v2.6.31/linux/include/net/route.h 2009-09-11 10:27:15.000000000 +0300 -+++ linux/include/net/route.h 2009-09-11 22:12:39.000000000 +0300 -@@ -116,6 +116,7 @@ extern int __ip_route_output_key(struct - extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp); - extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); - extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin); -+extern int ip_route_input_lookup(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin, __be32 lsrc); - extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev); - extern void ip_rt_send_redirect(struct sk_buff *skb); - -diff -urp v2.6.31/linux/net/bridge/br_netfilter.c linux/net/bridge/br_netfilter.c ---- v2.6.31/linux/net/bridge/br_netfilter.c 2009-09-11 10:27:16.000000000 +0300 -+++ linux/net/bridge/br_netfilter.c 2009-09-11 22:13:17.000000000 +0300 -@@ -343,6 +343,9 @@ static int br_nf_pre_routing_finish(stru - struct rtable *rt; - int err; - -+ /* Old skb->dst is not expected, it is lost in all cases */ -+ skb_dst_drop(skb); -+ - if (nf_bridge->mask & BRNF_PKT_TYPE) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->mask ^= BRNF_PKT_TYPE; -diff -urp v2.6.31/linux/net/ipv4/fib_frontend.c linux/net/ipv4/fib_frontend.c ---- v2.6.31/linux/net/ipv4/fib_frontend.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/fib_frontend.c 2009-09-11 22:11:20.000000000 +0300 -@@ -46,6 +46,8 @@ - - #ifndef CONFIG_IP_MULTIPLE_TABLES - -+#define FIB_RES_TABLE(r) (RT_TABLE_MAIN) -+ - static int __net_init fib4_rules_init(struct net *net) - { - struct fib_table *local_table, *main_table; -@@ -70,6 
+72,8 @@ fail: - } - #else - -+#define FIB_RES_TABLE(r) (fib_result_table(r)) -+ - struct fib_table *fib_new_table(struct net *net, u32 id) - { - struct fib_table *tb; -@@ -124,7 +128,8 @@ void fib_select_default(struct net *net, - table = res->r->table; - #endif - tb = fib_get_table(net, table); -- if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) -+ if ((FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) || -+ FIB_RES_NH(*res).nh_scope == RT_SCOPE_HOST) - tb->tb_select_default(tb, flp, res); - } - -@@ -238,6 +243,9 @@ int fib_validate_source(__be32 src, __be - .tos = tos } }, - .iif = oif }; - struct fib_result res; -+ int table; -+ unsigned char prefixlen; -+ unsigned char scope; - int no_addr, rpf; - int ret; - struct net *net; -@@ -261,31 +269,35 @@ int fib_validate_source(__be32 src, __be - goto e_inval_res; - *spec_dst = FIB_RES_PREFSRC(res); - fib_combine_itag(itag, &res); --#ifdef CONFIG_IP_ROUTE_MULTIPATH -- if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) --#else - if (FIB_RES_DEV(res) == dev) --#endif - { - ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; - fib_res_put(&res); - return ret; - } -+ table = FIB_RES_TABLE(&res); -+ prefixlen = res.prefixlen; -+ scope = res.scope; - fib_res_put(&res); - if (no_addr) - goto last_resort; -- if (rpf == 1) -- goto e_inval; - fl.oif = dev->ifindex; - - ret = 0; - if (fib_lookup(net, &fl, &res) == 0) { -- if (res.type == RTN_UNICAST) { -+ if (res.type == RTN_UNICAST && -+ ((table == FIB_RES_TABLE(&res) && -+ res.prefixlen >= prefixlen && res.scope >= scope) || -+ !rpf)) { - *spec_dst = FIB_RES_PREFSRC(res); - ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; -+ fib_res_put(&res); -+ return ret; - } - fib_res_put(&res); - } -+ if (rpf == 1) -+ goto e_inval; - return ret; - - last_resort: -@@ -908,9 +920,7 @@ static int fib_inetaddr_event(struct not - switch (event) { - case NETDEV_UP: - fib_add_ifaddr(ifa); --#ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); --#endif - rt_cache_flush(dev_net(dev), -1); - break; - case NETDEV_DOWN: -@@ -946,9 +956,7 @@ static int fib_netdev_event(struct notif - for_ifa(in_dev) { - fib_add_ifaddr(ifa); - } endfor_ifa(in_dev); --#ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); --#endif - rt_cache_flush(dev_net(dev), -1); - break; - case NETDEV_DOWN: -diff -urp v2.6.31/linux/net/ipv4/fib_hash.c linux/net/ipv4/fib_hash.c ---- v2.6.31/linux/net/ipv4/fib_hash.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/fib_hash.c 2009-09-11 22:11:20.000000000 +0300 -@@ -277,25 +277,35 @@ out: - static void - fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) - { -- int order, last_idx; -+ int order, last_idx, last_dflt, last_nhsel; -+ struct fib_alias *first_fa = NULL; -+ struct hlist_head *head; - struct hlist_node *node; - struct fib_node *f; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; -- struct fn_zone *fz = t->fn_zones[0]; -+ struct fn_zone *fz = t->fn_zones[res->prefixlen]; -+ __be32 k; - - if (fz == NULL) - return; - -+ k = fz_key(flp->fl4_dst, fz); -+ last_dflt = -2; -+ last_nhsel = 0; - last_idx = -1; - last_resort = NULL; - order = -1; - - read_lock(&fib_hash_lock); -- hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { -+ head = &fz->fz_hash[fn_hash(k, fz)]; -+ hlist_for_each_entry(f, node, head, fn_hash) { - struct fib_alias *fa; - -+ if (f->fn_key != k) -+ continue; -+ - list_for_each_entry(fa, &f->fn_alias, fa_list) { - struct fib_info *next_fi = 
fa->fa_info; - -@@ -303,42 +313,56 @@ fn_hash_select_default(struct fib_table - fa->fa_type != RTN_UNICAST) - continue; - -+ if (fa->fa_tos && -+ fa->fa_tos != flp->fl4_tos) -+ continue; - if (next_fi->fib_priority > res->fi->fib_priority) - break; -- if (!next_fi->fib_nh[0].nh_gw || -- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) -- continue; - fa->fa_state |= FA_S_ACCESSED; - -- if (fi == NULL) { -- if (next_fi != res->fi) -- break; -- } else if (!fib_detect_death(fi, order, &last_resort, -- &last_idx, tb->tb_default)) { -+ if (!first_fa) { -+ last_dflt = fa->fa_last_dflt; -+ first_fa = fa; -+ } -+ if (fi && !fib_detect_death(fi, order, &last_resort, -+ &last_idx, &last_dflt, &last_nhsel, flp)) { - fib_result_assign(res, fi); -- tb->tb_default = order; -+ first_fa->fa_last_dflt = order; - goto out; - } - fi = next_fi; - order++; - } -+ break; - } - - if (order <= 0 || fi == NULL) { -- tb->tb_default = -1; -+ if (fi && fi->fib_nhs > 1 && -+ fib_detect_death(fi, order, &last_resort, &last_idx, -+ &last_dflt, &last_nhsel, flp) && -+ last_resort == fi) { -+ read_lock_bh(&fib_nhflags_lock); -+ fi->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; -+ read_unlock_bh(&fib_nhflags_lock); -+ } -+ if (first_fa) first_fa->fa_last_dflt = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, -- tb->tb_default)) { -+ &last_dflt, &last_nhsel, flp)) { - fib_result_assign(res, fi); -- tb->tb_default = order; -+ first_fa->fa_last_dflt = order; - goto out; - } - -- if (last_idx >= 0) -+ if (last_idx >= 0) { - fib_result_assign(res, last_resort); -- tb->tb_default = last_idx; -+ read_lock_bh(&fib_nhflags_lock); -+ last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; -+ read_unlock_bh(&fib_nhflags_lock); -+ first_fa->fa_last_dflt = last_idx; -+ } - out: - read_unlock(&fib_hash_lock); - } -@@ -462,6 +486,7 @@ static int fn_hash_insert(struct fib_tab - write_lock_bh(&fib_hash_lock); - fi_drop = fa->fa_info; - fa->fa_info = fi; -+ fa->fa_last_dflt = -1; - fa->fa_type = cfg->fc_type; - fa->fa_scope = cfg->fc_scope; - state = fa->fa_state; -@@ -516,6 +541,7 @@ static int fn_hash_insert(struct fib_tab - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - new_fa->fa_state = 0; -+ new_fa->fa_last_dflt = -1; - - /* - * Insert new entry to the list. 
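(For orientation: the fib_hash.c hunks above replace the old "probe only fib_nh[0] of zone 0" default-route logic. The reworked fn_hash_select_default() hashes the flow's destination into the matching zone, remembers the last chosen default per alias in the new fa_last_dflt field, and lets fib_detect_death() walk every nexthop, marking unverified gateways RTNH_F_SUSPECT instead of discarding them. Below is a minimal user-space sketch of that liveness idea; the NUD_* constants mirror include/net/neighbour.h, but the types and the detect_death() helper are invented for illustration and are not the kernel API.)

    /* Hedged sketch of per-nexthop gateway liveness, modeled on the
     * patched fib_detect_death(): a nexthop whose neighbour-cache entry
     * is not in a VALID state is kept only as a suspect last resort. */
    #include <stdbool.h>
    #include <stdint.h>

    #define NUD_REACHABLE 0x02
    #define NUD_STALE     0x04
    #define NUD_VALID     (NUD_REACHABLE | NUD_STALE)  /* simplified subset */

    struct nexthop {
        uint32_t gw;         /* gateway address */
        int      nud_state;  /* ARP state for gw, as neigh_lookup() reports */
        bool     suspect;    /* stands in for RTNH_F_SUSPECT */
    };

    /* Returns true when no nexthop could be verified alive, after
     * marking each unverified one suspect, like the loop the patch
     * adds over fi->fib_nh[0..fib_nhs-1]. */
    static bool detect_death(struct nexthop *nh, int nhs)
    {
        bool dead = true;

        for (int i = 0; i < nhs; i++) {
            if (nh[i].nud_state & NUD_VALID) {
                nh[i].suspect = false;
                dead = false;
            } else {
                nh[i].suspect = true;
            }
        }
        return dead;
    }

The next hunks extend the fib_detect_death() prototype in fib_lookup.h so this per-nexthop state (last default index, selected nexthop, flow key) can be carried between callers.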
-diff -urp v2.6.31/linux/net/ipv4/fib_lookup.h linux/net/ipv4/fib_lookup.h ---- v2.6.31/linux/net/ipv4/fib_lookup.h 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/fib_lookup.h 2009-09-11 22:11:20.000000000 +0300 -@@ -8,6 +8,7 @@ - struct fib_alias { - struct list_head fa_list; - struct fib_info *fa_info; -+ int fa_last_dflt; - u8 fa_tos; - u8 fa_type; - u8 fa_scope; -@@ -37,7 +38,8 @@ extern struct fib_alias *fib_find_alias( - u8 tos, u32 prio); - extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, -- int *last_idx, int dflt); -+ int *last_idx, int *dflt, int *last_nhsel, -+ const struct flowi *flp); - - static inline void fib_result_assign(struct fib_result *res, - struct fib_info *fi) -diff -urp v2.6.31/linux/net/ipv4/fib_rules.c linux/net/ipv4/fib_rules.c ---- v2.6.31/linux/net/ipv4/fib_rules.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/fib_rules.c 2009-09-11 22:11:20.000000000 +0300 -@@ -54,6 +54,11 @@ u32 fib_rules_tclass(struct fib_result * - } - #endif - -+int fib_result_table(struct fib_result *res) -+{ -+ return res->r->table; -+} -+ - int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) - { - struct fib_lookup_arg arg = { -diff -urp v2.6.31/linux/net/ipv4/fib_semantics.c linux/net/ipv4/fib_semantics.c ---- v2.6.31/linux/net/ipv4/fib_semantics.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/fib_semantics.c 2009-09-11 22:12:39.000000000 +0300 -@@ -50,6 +50,7 @@ static struct hlist_head *fib_info_hash; - static struct hlist_head *fib_info_laddrhash; - static unsigned int fib_hash_size; - static unsigned int fib_info_cnt; -+rwlock_t fib_nhflags_lock = RW_LOCK_UNLOCKED; - - #define DEVINDEX_HASHBITS 8 - #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) -@@ -186,7 +187,7 @@ static __inline__ int nh_comp(const stru - #ifdef CONFIG_NET_CLS_ROUTE - nh->nh_tclassid != onh->nh_tclassid || - #endif -- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) -+ ((nh->nh_flags^onh->nh_flags)&~RTNH_F_BADSTATE)) - return -1; - onh++; - } endfor_nexthops(fi); -@@ -237,7 +238,7 @@ static struct fib_info *fib_find_info(co - nfi->fib_priority == fi->fib_priority && - memcmp(nfi->fib_metrics, fi->fib_metrics, - sizeof(fi->fib_metrics)) == 0 && -- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && -+ ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_BADSTATE) == 0 && - (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) - return fi; - } -@@ -349,26 +350,70 @@ struct fib_alias *fib_find_alias(struct - } - - int fib_detect_death(struct fib_info *fi, int order, -- struct fib_info **last_resort, int *last_idx, int dflt) -+ struct fib_info **last_resort, int *last_idx, int *dflt, -+ int *last_nhsel, const struct flowi *flp) - { - struct neighbour *n; -- int state = NUD_NONE; -+ int nhsel; -+ int state; -+ struct fib_nh * nh; -+ __be32 dst; -+ int flag, dead = 1; -+ -+ /* change_nexthops(fi) { */ -+ for (nhsel = 0, nh = fi->fib_nh; nhsel < fi->fib_nhs; nh++, nhsel++) { -+ if (flp->oif && flp->oif != nh->nh_oif) -+ continue; -+ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && nh->nh_gw && -+ nh->nh_scope == RT_SCOPE_LINK) -+ continue; -+ if (nh->nh_flags & RTNH_F_DEAD) -+ continue; - -- n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); -- if (n) { -- state = n->nud_state; -- neigh_release(n); -- } -- if (state == NUD_REACHABLE) -- return 0; -- if ((state&NUD_VALID) && order != dflt) -- return 0; -- if ((state&NUD_VALID) || -- (*last_idx<0 && order > dflt)) { -- *last_resort = fi; -- *last_idx = order; -+ flag = 0; -+ if 
(nh->nh_dev->flags & IFF_NOARP) { -+ dead = 0; -+ goto setfl; -+ } -+ -+ dst = nh->nh_gw; -+ if (!nh->nh_gw || nh->nh_scope != RT_SCOPE_LINK) -+ dst = flp->fl4_dst; -+ -+ state = NUD_NONE; -+ n = neigh_lookup(&arp_tbl, &dst, nh->nh_dev); -+ if (n) { -+ state = n->nud_state; -+ neigh_release(n); -+ } -+ if (state==NUD_REACHABLE || -+ ((state&NUD_VALID) && order != *dflt)) { -+ dead = 0; -+ goto setfl; -+ } -+ if (!(state&NUD_VALID)) -+ flag = 1; -+ if (!dead) -+ goto setfl; -+ if ((state&NUD_VALID) || -+ (*last_idx<0 && order >= *dflt)) { -+ *last_resort = fi; -+ *last_idx = order; -+ *last_nhsel = nhsel; -+ } -+ -+ setfl: -+ -+ read_lock_bh(&fib_nhflags_lock); -+ if (flag) -+ nh->nh_flags |= RTNH_F_SUSPECT; -+ else -+ nh->nh_flags &= ~RTNH_F_SUSPECT; -+ read_unlock_bh(&fib_nhflags_lock); - } -- return 1; -+ /* } endfor_nexthops(fi) */ -+ -+ return dead; - } - - #ifdef CONFIG_IP_ROUTE_MULTIPATH -@@ -540,8 +585,11 @@ static int fib_check_nh(struct fib_confi - return -EINVAL; - if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) - return -ENODEV; -- if (!(dev->flags&IFF_UP)) -- return -ENETDOWN; -+ if (!(dev->flags&IFF_UP)) { -+ if (fi->fib_protocol != RTPROT_STATIC) -+ return -ENETDOWN; -+ nh->nh_flags |= RTNH_F_DEAD; -+ } - nh->nh_dev = dev; - dev_hold(dev); - nh->nh_scope = RT_SCOPE_LINK; -@@ -561,24 +609,48 @@ static int fib_check_nh(struct fib_confi - /* It is not necessary, but requires a bit of thinking */ - if (fl.fl4_scope < RT_SCOPE_LINK) - fl.fl4_scope = RT_SCOPE_LINK; -- if ((err = fib_lookup(net, &fl, &res)) != 0) -- return err; -+ err = fib_lookup(net, &fl, &res); - } -- err = -EINVAL; -- if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) -- goto out; -- nh->nh_scope = res.scope; -- nh->nh_oif = FIB_RES_OIF(res); -- if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) -- goto out; -- dev_hold(nh->nh_dev); -- err = -ENETDOWN; -- if (!(nh->nh_dev->flags & IFF_UP)) -- goto out; -- err = 0; -+ if (err) { -+ struct in_device *in_dev; -+ -+ if (err != -ENETUNREACH || -+ fi->fib_protocol != RTPROT_STATIC) -+ return err; -+ -+ in_dev = inetdev_by_index(net, nh->nh_oif); -+ if (in_dev == NULL || -+ in_dev->dev->flags & IFF_UP) { -+ if (in_dev) -+ in_dev_put(in_dev); -+ return err; -+ } -+ nh->nh_flags |= RTNH_F_DEAD; -+ nh->nh_scope = RT_SCOPE_LINK; -+ nh->nh_dev = in_dev->dev; -+ dev_hold(nh->nh_dev); -+ in_dev_put(in_dev); -+ } else { -+ err = -EINVAL; -+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) -+ goto out; -+ nh->nh_scope = res.scope; -+ nh->nh_oif = FIB_RES_OIF(res); -+ if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) -+ goto out; -+ dev_hold(nh->nh_dev); -+ if (!(nh->nh_dev->flags & IFF_UP)) { -+ if (fi->fib_protocol != RTPROT_STATIC) { -+ err = -ENETDOWN; -+ goto out; -+ } -+ nh->nh_flags |= RTNH_F_DEAD; -+ } -+ err = 0; - out: -- fib_res_put(&res); -- return err; -+ fib_res_put(&res); -+ return err; -+ } - } else { - struct in_device *in_dev; - -@@ -589,8 +661,11 @@ out: - if (in_dev == NULL) - return -ENODEV; - if (!(in_dev->dev->flags&IFF_UP)) { -- in_dev_put(in_dev); -- return -ENETDOWN; -+ if (fi->fib_protocol != RTPROT_STATIC) { -+ in_dev_put(in_dev); -+ return -ENETDOWN; -+ } -+ nh->nh_flags |= RTNH_F_DEAD; - } - nh->nh_dev = in_dev->dev; - dev_hold(nh->nh_dev); -@@ -899,8 +974,12 @@ int fib_semantic_match(struct list_head - for_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) - continue; -- if (!flp->oif || flp->oif == nh->nh_oif) -- break; -+ if (flp->oif && flp->oif != nh->nh_oif) -+ continue; -+ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && -+ nh->nh_gw && 
nh->nh_scope == RT_SCOPE_LINK) -+ continue; -+ break; - } - #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (nhsel < fi->fib_nhs) { -@@ -1080,18 +1159,29 @@ int fib_sync_down_dev(struct net_device - prev_fi = fi; - dead = 0; - change_nexthops(fi) { -- if (nh->nh_flags&RTNH_F_DEAD) -- dead++; -- else if (nh->nh_dev == dev && -- nh->nh_scope != scope) { -- nh->nh_flags |= RTNH_F_DEAD; -+ if (nh->nh_flags&RTNH_F_DEAD) { -+ if (fi->fib_protocol!=RTPROT_STATIC || -+ nh->nh_dev == NULL || -+ __in_dev_get_rtnl(nh->nh_dev) == NULL || -+ nh->nh_dev->flags&IFF_UP) -+ dead++; -+ } else if (nh->nh_dev == dev && -+ nh->nh_scope != scope) { -+ write_lock_bh(&fib_nhflags_lock); - #ifdef CONFIG_IP_ROUTE_MULTIPATH -- spin_lock_bh(&fib_multipath_lock); -+ spin_lock(&fib_multipath_lock); -+ nh->nh_flags |= RTNH_F_DEAD; - fi->fib_power -= nh->nh_power; - nh->nh_power = 0; -- spin_unlock_bh(&fib_multipath_lock); -+ spin_unlock(&fib_multipath_lock); -+#else -+ nh->nh_flags |= RTNH_F_DEAD; - #endif -- dead++; -+ write_unlock_bh(&fib_nhflags_lock); -+ if (fi->fib_protocol!=RTPROT_STATIC || -+ force || -+ __in_dev_get_rtnl(dev) == NULL) -+ dead++; - } - #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nh->nh_dev == dev) { -@@ -1109,11 +1199,8 @@ int fib_sync_down_dev(struct net_device - return ret; - } - --#ifdef CONFIG_IP_ROUTE_MULTIPATH -- - /* -- Dead device goes up. We wake up dead nexthops. -- It takes sense only on multipath routes. -+ Dead device goes up or new address is added. We wake up dead nexthops. - */ - - int fib_sync_up(struct net_device *dev) -@@ -1123,8 +1210,10 @@ int fib_sync_up(struct net_device *dev) - struct hlist_head *head; - struct hlist_node *node; - struct fib_nh *nh; -- int ret; -+ struct fib_result res; -+ int ret, rep; - -+repeat: - if (!(dev->flags&IFF_UP)) - return 0; - -@@ -1132,6 +1221,7 @@ int fib_sync_up(struct net_device *dev) - hash = fib_devindex_hashfn(dev->ifindex); - head = &fib_info_devhash[hash]; - ret = 0; -+ rep = 0; - - hlist_for_each_entry(nh, node, head, nh_hash) { - struct fib_info *fi = nh->nh_parent; -@@ -1144,19 +1234,39 @@ int fib_sync_up(struct net_device *dev) - prev_fi = fi; - alive = 0; - change_nexthops(fi) { -- if (!(nh->nh_flags&RTNH_F_DEAD)) { -- alive++; -+ if (!(nh->nh_flags&RTNH_F_DEAD)) - continue; -- } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) - continue; - if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) - continue; -+ if (nh->nh_gw && fi->fib_protocol == RTPROT_STATIC) { -+ struct flowi fl = { -+ .nl_u = { .ip4_u = -+ { .daddr = nh->nh_gw, -+ .scope = nh->nh_scope } }, -+ .oif = nh->nh_oif, -+ }; -+ if (fib_lookup(dev_net(dev), &fl, &res) != 0) -+ continue; -+ if (res.type != RTN_UNICAST && -+ res.type != RTN_LOCAL) { -+ fib_res_put(&res); -+ continue; -+ } -+ nh->nh_scope = res.scope; -+ fib_res_put(&res); -+ rep = 1; -+ } - alive++; -+#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - nh->nh_power = 0; -+#endif - nh->nh_flags &= ~RTNH_F_DEAD; -+#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_unlock_bh(&fib_multipath_lock); -+#endif - } endfor_nexthops(fi) - - if (alive > 0) { -@@ -1164,10 +1274,14 @@ int fib_sync_up(struct net_device *dev) - ret++; - } - } -+ if (rep) -+ goto repeat; - - return ret; - } - -+#ifdef CONFIG_IP_ROUTE_MULTIPATH -+ - /* - The algorithm is suboptimal, but it provides really - fair weighted route distribution. 
-@@ -1176,24 +1290,45 @@ int fib_sync_up(struct net_device *dev) - void fib_select_multipath(const struct flowi *flp, struct fib_result *res) - { - struct fib_info *fi = res->fi; -- int w; -+ int w, alive; - - spin_lock_bh(&fib_multipath_lock); -+ if (flp->oif) { -+ int sel = -1; -+ w = -1; -+ change_nexthops(fi) { -+ if (flp->oif != nh->nh_oif) -+ continue; -+ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && -+ nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) -+ continue; -+ if (!(nh->nh_flags&RTNH_F_BADSTATE)) { -+ if (nh->nh_power > w) { -+ w = nh->nh_power; -+ sel = nhsel; -+ } -+ } -+ } endfor_nexthops(fi); -+ if (sel >= 0) { -+ spin_unlock_bh(&fib_multipath_lock); -+ res->nh_sel = sel; -+ return; -+ } -+ goto last_resort; -+ } -+ -+repeat: - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { -- if (!(nh->nh_flags&RTNH_F_DEAD)) { -+ if (!(nh->nh_flags&RTNH_F_BADSTATE)) { - power += nh->nh_weight; - nh->nh_power = nh->nh_weight; - } - } endfor_nexthops(fi); - fi->fib_power = power; -- if (power <= 0) { -- spin_unlock_bh(&fib_multipath_lock); -- /* Race condition: route has just become dead. */ -- res->nh_sel = 0; -- return; -- } -+ if (power <= 0) -+ goto last_resort; - } - - -@@ -1203,20 +1338,40 @@ void fib_select_multipath(const struct f - - w = jiffies % fi->fib_power; - -+ alive = 0; - change_nexthops(fi) { -- if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { -+ if (!(nh->nh_flags&RTNH_F_BADSTATE) && nh->nh_power) { - if ((w -= nh->nh_power) <= 0) { - nh->nh_power--; - fi->fib_power--; -- res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); -+ res->nh_sel = nhsel; - return; - } -+ alive = 1; -+ } -+ } endfor_nexthops(fi); -+ if (alive) { -+ fi->fib_power = 0; -+ goto repeat; -+ } -+ -+last_resort: -+ -+ for_nexthops(fi) { -+ if (!(nh->nh_flags&RTNH_F_DEAD)) { -+ if (flp->oif && flp->oif != nh->nh_oif) -+ continue; -+ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && -+ nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) -+ continue; -+ spin_unlock_bh(&fib_multipath_lock); -+ res->nh_sel = nhsel; -+ return; - } - } endfor_nexthops(fi); - - /* Race condition: route has just become dead. */ -- res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); - } - #endif -diff -urp v2.6.31/linux/net/ipv4/fib_trie.c linux/net/ipv4/fib_trie.c ---- v2.6.31/linux/net/ipv4/fib_trie.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/fib_trie.c 2009-09-11 22:11:20.000000000 +0300 -@@ -1291,6 +1291,7 @@ static int fn_trie_insert(struct fib_tab - fi_drop = fa->fa_info; - new_fa->fa_tos = fa->fa_tos; - new_fa->fa_info = fi; -+ new_fa->fa_last_dflt = -1; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - state = fa->fa_state; -@@ -1331,6 +1332,7 @@ static int fn_trie_insert(struct fib_tab - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - new_fa->fa_state = 0; -+ new_fa->fa_last_dflt = -1; - /* - * Insert new entry to the list. 
- */ -@@ -1831,24 +1833,31 @@ static void fn_trie_select_default(struc - struct fib_result *res) - { - struct trie *t = (struct trie *) tb->tb_data; -- int order, last_idx; -+ int order, last_idx, last_dflt, last_nhsel; -+ struct fib_alias *first_fa = NULL; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fib_alias *fa = NULL; - struct list_head *fa_head; - struct leaf *l; -+ u32 key, mask; - -+ last_dflt = -2; -+ last_nhsel = 0; - last_idx = -1; - last_resort = NULL; - order = -1; - -+ mask = inet_make_mask(res->prefixlen); -+ key = ntohl(flp->fl4_dst & mask); -+ - rcu_read_lock(); - -- l = fib_find_node(t, 0); -+ l = fib_find_node(t, key); - if (!l) - goto out; - -- fa_head = get_fa_head(l, 0); -+ fa_head = get_fa_head(l, res->prefixlen); - if (!fa_head) - goto out; - -@@ -1862,39 +1871,52 @@ static void fn_trie_select_default(struc - fa->fa_type != RTN_UNICAST) - continue; - -+ if (fa->fa_tos && -+ fa->fa_tos != flp->fl4_tos) -+ continue; - if (next_fi->fib_priority > res->fi->fib_priority) - break; -- if (!next_fi->fib_nh[0].nh_gw || -- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) -- continue; - fa->fa_state |= FA_S_ACCESSED; - -- if (fi == NULL) { -- if (next_fi != res->fi) -- break; -- } else if (!fib_detect_death(fi, order, &last_resort, -- &last_idx, tb->tb_default)) { -+ if (!first_fa) { -+ last_dflt = fa->fa_last_dflt; -+ first_fa = fa; -+ } -+ if (fi && !fib_detect_death(fi, order, &last_resort, -+ &last_idx, &last_dflt, &last_nhsel, flp)) { - fib_result_assign(res, fi); -- tb->tb_default = order; -+ first_fa->fa_last_dflt = order; - goto out; - } - fi = next_fi; - order++; - } - if (order <= 0 || fi == NULL) { -- tb->tb_default = -1; -+ if (fi && fi->fib_nhs > 1 && -+ fib_detect_death(fi, order, &last_resort, &last_idx, -+ &last_dflt, &last_nhsel, flp) && -+ last_resort == fi) { -+ read_lock_bh(&fib_nhflags_lock); -+ fi->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; -+ read_unlock_bh(&fib_nhflags_lock); -+ } -+ if (first_fa) first_fa->fa_last_dflt = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, -- tb->tb_default)) { -+ &last_dflt, &last_nhsel, flp)) { - fib_result_assign(res, fi); -- tb->tb_default = order; -+ first_fa->fa_last_dflt = order; - goto out; - } -- if (last_idx >= 0) -+ if (last_idx >= 0) { - fib_result_assign(res, last_resort); -- tb->tb_default = last_idx; -+ read_lock_bh(&fib_nhflags_lock); -+ last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; -+ read_unlock_bh(&fib_nhflags_lock); -+ first_fa->fa_last_dflt = last_idx; -+ } - out: - rcu_read_unlock(); - } -diff -urp v2.6.31/linux/net/ipv4/netfilter/ipt_MASQUERADE.c linux/net/ipv4/netfilter/ipt_MASQUERADE.c ---- v2.6.31/linux/net/ipv4/netfilter/ipt_MASQUERADE.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/netfilter/ipt_MASQUERADE.c 2009-09-11 22:14:42.000000000 +0300 -@@ -51,7 +51,7 @@ masquerade_tg(struct sk_buff *skb, const - enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; - const struct nf_nat_multi_range_compat *mr; -- const struct rtable *rt; -+ struct rtable *rt; - __be32 newsrc; - - NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); -@@ -69,13 +69,28 @@ masquerade_tg(struct sk_buff *skb, const - return NF_ACCEPT; - - mr = par->targinfo; -- rt = skb_rtable(skb); -- newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); -- if (!newsrc) { -- printk("MASQUERADE: %s ate my IP address\n", par->out->name); -- return NF_DROP; -+ -+ { -+ struct flowi fl = { .nl_u = { .ip4_u = -+ { .daddr = 
ip_hdr(skb)->daddr, -+ .tos = (RT_TOS(ip_hdr(skb)->tos) | -+ RTO_CONN), -+ .gw = skb_rtable(skb)->rt_gateway, -+ } }, -+ .mark = skb->mark, -+ .oif = par->out->ifindex }; -+ if (ip_route_output_key(dev_net(par->out), &rt, &fl) != 0) { -+ /* Funky routing can do this. */ -+ if (net_ratelimit()) -+ printk("MASQUERADE:" -+ " No route: Rusty's brain broke!\n"); -+ return NF_DROP; -+ } - } - -+ newsrc = rt->rt_src; -+ ip_rt_put(rt); -+ - nat->masq_index = par->out->ifindex; - - /* Transfer from original range. */ -diff -urp v2.6.31/linux/net/ipv4/netfilter/nf_nat_core.c linux/net/ipv4/netfilter/nf_nat_core.c ---- v2.6.31/linux/net/ipv4/netfilter/nf_nat_core.c 2009-06-13 10:53:58.000000000 +0300 -+++ linux/net/ipv4/netfilter/nf_nat_core.c 2009-09-11 22:13:59.000000000 +0300 -@@ -711,6 +711,52 @@ static struct pernet_operations nf_nat_n - .exit = nf_nat_net_exit, - }; - -+unsigned int -+ip_nat_route_input(unsigned int hooknum, -+ struct sk_buff *skb, -+ const struct net_device *in, -+ const struct net_device *out, -+ int (*okfn)(struct sk_buff *)) -+{ -+ struct iphdr *iph; -+ struct nf_conn *conn; -+ enum ip_conntrack_info ctinfo; -+ enum ip_conntrack_dir dir; -+ unsigned long statusbit; -+ __be32 saddr; -+ -+ if (!(conn = nf_ct_get(skb, &ctinfo))) -+ return NF_ACCEPT; -+ -+ if (!(conn->status & IPS_NAT_DONE_MASK)) -+ return NF_ACCEPT; -+ dir = CTINFO2DIR(ctinfo); -+ statusbit = IPS_SRC_NAT; -+ if (dir == IP_CT_DIR_REPLY) -+ statusbit ^= IPS_NAT_MASK; -+ if (!(conn->status & statusbit)) -+ return NF_ACCEPT; -+ -+ if (skb_dst(skb)) -+ return NF_ACCEPT; -+ -+ if (skb->len < sizeof(struct iphdr)) -+ return NF_ACCEPT; -+ -+ /* use daddr in other direction as masquerade address (lsrc) */ -+ iph = ip_hdr(skb); -+ saddr = conn->tuplehash[!dir].tuple.dst.u3.ip; -+ if (saddr == iph->saddr) -+ return NF_ACCEPT; -+ -+ if (ip_route_input_lookup(skb, iph->daddr, iph->saddr, iph->tos, -+ skb->dev, saddr)) -+ return NF_DROP; -+ -+ return NF_ACCEPT; -+} -+EXPORT_SYMBOL_GPL(ip_nat_route_input); -+ - static int __init nf_nat_init(void) - { - size_t i; -diff -urp v2.6.31/linux/net/ipv4/netfilter/nf_nat_standalone.c linux/net/ipv4/netfilter/nf_nat_standalone.c ---- v2.6.31/linux/net/ipv4/netfilter/nf_nat_standalone.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/netfilter/nf_nat_standalone.c 2009-09-11 22:12:39.000000000 +0300 -@@ -255,6 +255,14 @@ static struct nf_hook_ops nf_nat_ops[] _ - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_NAT_DST, - }, -+ /* Before routing, route before mangling */ -+ { -+ .hook = ip_nat_route_input, -+ .owner = THIS_MODULE, -+ .pf = PF_INET, -+ .hooknum = NF_INET_PRE_ROUTING, -+ .priority = NF_IP_PRI_LAST-1, -+ }, - /* After packet filtering, change source */ - { - .hook = nf_nat_out, -diff -urp v2.6.31/linux/net/ipv4/route.c linux/net/ipv4/route.c ---- v2.6.31/linux/net/ipv4/route.c 2009-09-11 10:27:17.000000000 +0300 -+++ linux/net/ipv4/route.c 2009-09-11 22:12:39.000000000 +0300 -@@ -695,6 +695,8 @@ static inline int compare_keys(struct fl - return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | - (fl1->mark ^ fl2->mark) | -+ ((__force u32)(fl1->nl_u.ip4_u.lsrc ^ fl2->nl_u.ip4_u.lsrc)) | -+ ((__force u32)(fl1->nl_u.ip4_u.gw ^ fl2->nl_u.ip4_u.gw)) | - (*(u16 *)&fl1->nl_u.ip4_u.tos ^ - *(u16 *)&fl2->nl_u.ip4_u.tos) | - (fl1->oif ^ fl2->oif) | -@@ -1424,6 +1426,7 @@ void ip_rt_redirect(__be32 old_gw, __be3 - - /* Gateway is different ... 
*/ - rt->rt_gateway = new_gw; -+ if (rt->fl.fl4_gw) rt->fl.fl4_gw = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->u.dst); -@@ -1870,6 +1873,7 @@ static int ip_route_input_mc(struct sk_b - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; -+ rth->fl.fl4_lsrc = 0; - rth->rt_src = saddr; - #ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; -@@ -1880,6 +1884,7 @@ static int ip_route_input_mc(struct sk_b - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; -+ rth->fl.fl4_gw = 0; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_genid = rt_genid(dev_net(dev)); -@@ -1944,7 +1949,7 @@ static int __mkroute_input(struct sk_buf - struct fib_result *res, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, -- struct rtable **result) -+ __be32 lsrc, struct rtable **result) - { - - struct rtable *rth; -@@ -1978,6 +1983,7 @@ static int __mkroute_input(struct sk_buf - flags |= RTCF_DIRECTSRC; - - if (out_dev == in_dev && err && -+ !lsrc && - (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) - flags |= RTCF_DOREDIRECT; -@@ -2011,6 +2017,7 @@ static int __mkroute_input(struct sk_buf - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; -+ rth->fl.fl4_lsrc = lsrc; - rth->rt_gateway = daddr; - rth->rt_iif = - rth->fl.iif = in_dev->dev->ifindex; -@@ -2018,6 +2025,7 @@ static int __mkroute_input(struct sk_buf - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; -+ rth->fl.fl4_gw = 0; - rth->rt_spec_dst= spec_dst; - - rth->u.dst.input = ip_forward; -@@ -2038,21 +2046,23 @@ static int __mkroute_input(struct sk_buf - - static int ip_mkroute_input(struct sk_buff *skb, - struct fib_result *res, -+ struct net *net, - const struct flowi *fl, - struct in_device *in_dev, -- __be32 daddr, __be32 saddr, u32 tos) -+ __be32 daddr, __be32 saddr, u32 tos, __be32 lsrc) - { - struct rtable* rth = NULL; - int err; - unsigned hash; - -+ fib_select_default(net, fl, res); - #ifdef CONFIG_IP_ROUTE_MULTIPATH -- if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) -+ if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(fl, res); - #endif - - /* create a routing cache entry */ -- err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); -+ err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, lsrc, &rth); - if (err) - return err; - -@@ -2073,18 +2083,19 @@ static int ip_mkroute_input(struct sk_bu - */ - - static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- u8 tos, struct net_device *dev) -+ u8 tos, struct net_device *dev, __be32 lsrc) - { - struct fib_result res; - struct in_device *in_dev = in_dev_get(dev); - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = daddr, -- .saddr = saddr, -+ .saddr = lsrc? : saddr, - .tos = tos, - .scope = RT_SCOPE_UNIVERSE, - } }, - .mark = skb->mark, -- .iif = dev->ifindex }; -+ .iif = lsrc? -+ dev_net(dev)->loopback_dev->ifindex : dev->ifindex }; - unsigned flags = 0; - u32 itag = 0; - struct rtable * rth; -@@ -2120,6 +2131,12 @@ static int ip_route_input_slow(struct sk - ipv4_is_loopback(daddr)) - goto martian_destination; - -+ if (lsrc) { -+ if (ipv4_is_multicast(lsrc) || ipv4_is_lbcast(lsrc) || -+ ipv4_is_zeronet(lsrc) || ipv4_is_loopback(lsrc)) -+ goto e_inval; -+ } -+ - /* - * Now we are ready to route packet. 
- */ -@@ -2129,6 +2146,8 @@ static int ip_route_input_slow(struct sk - goto no_route; - } - free_res = 1; -+ fl.iif = dev->ifindex; -+ fl.fl4_src = saddr; - - RT_CACHE_STAT_INC(in_slow_tot); - -@@ -2153,7 +2172,7 @@ static int ip_route_input_slow(struct sk - if (res.type != RTN_UNICAST) - goto martian_destination; - -- err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); -+ err = ip_mkroute_input(skb, &res, net, &fl, in_dev, daddr, saddr, tos, lsrc); - done: - in_dev_put(in_dev); - if (free_res) -@@ -2163,6 +2182,8 @@ out: return err; - brd_input: - if (skb->protocol != htons(ETH_P_IP)) - goto e_inval; -+ if (lsrc) -+ goto e_inval; - - if (ipv4_is_zeronet(saddr)) - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); -@@ -2204,6 +2225,7 @@ local_input: - rth->u.dst.dev = net->loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); -+ rth->fl.fl4_gw = 0; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->u.dst.input= ip_local_deliver; -@@ -2254,8 +2276,9 @@ martian_source: - goto e_inval; - } - --int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- u8 tos, struct net_device *dev) -+static inline int -+ip_route_input_cached(struct sk_buff *skb, __be32 daddr, __be32 saddr, -+ u8 tos, struct net_device *dev, __be32 lsrc) - { - struct rtable * rth; - unsigned hash; -@@ -2276,6 +2299,7 @@ int ip_route_input(struct sk_buff *skb, - if (((rth->fl.fl4_dst ^ daddr) | - (rth->fl.fl4_src ^ saddr) | - (rth->fl.iif ^ iif) | -+ (rth->fl.fl4_lsrc ^ lsrc) | - rth->fl.oif | - (rth->fl.fl4_tos ^ tos)) == 0 && - rth->fl.mark == skb->mark && -@@ -2324,7 +2348,19 @@ skip_cache: - rcu_read_unlock(); - return -EINVAL; - } -- return ip_route_input_slow(skb, daddr, saddr, tos, dev); -+ return ip_route_input_slow(skb, daddr, saddr, tos, dev, lsrc); -+} -+ -+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, -+ u8 tos, struct net_device *dev) -+{ -+ return ip_route_input_cached(skb, daddr, saddr, tos, dev, 0); -+} -+ -+int ip_route_input_lookup(struct sk_buff *skb, __be32 daddr, __be32 saddr, -+ u8 tos, struct net_device *dev, __be32 lsrc) -+{ -+ return ip_route_input_cached(skb, daddr, saddr, tos, dev, lsrc); - } - - static int __mkroute_output(struct rtable **result, -@@ -2396,6 +2432,7 @@ static int __mkroute_output(struct rtabl - rth->fl.fl4_tos = tos; - rth->fl.fl4_src = oldflp->fl4_src; - rth->fl.oif = oldflp->oif; -+ rth->fl.fl4_gw = oldflp->fl4_gw; - rth->fl.mark = oldflp->mark; - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; -@@ -2477,6 +2514,7 @@ static int ip_route_output_slow(struct n - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = oldflp->fl4_dst, - .saddr = oldflp->fl4_src, -+ .gw = oldflp->fl4_gw, - .tos = tos & IPTOS_RT_MASK, - .scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : -@@ -2588,6 +2626,7 @@ static int ip_route_output_slow(struct n - dev_out = net->loopback_dev; - dev_hold(dev_out); - fl.oif = net->loopback_dev->ifindex; -+ fl.fl4_gw = 0; - res.type = RTN_LOCAL; - flags |= RTCF_LOCAL; - goto make_route; -@@ -2595,7 +2634,7 @@ static int ip_route_output_slow(struct n - - if (fib_lookup(net, &fl, &res)) { - res.fi = NULL; -- if (oldflp->oif) { -+ if (oldflp->oif && dev_out->flags & IFF_UP) { - /* Apparently, routing tables are wrong. Assume, - that the destination is on link. 
- -@@ -2635,6 +2674,7 @@ static int ip_route_output_slow(struct n - dev_out = net->loopback_dev; - dev_hold(dev_out); - fl.oif = dev_out->ifindex; -+ fl.fl4_gw = 0; - if (res.fi) - fib_info_put(res.fi); - res.fi = NULL; -@@ -2642,13 +2682,12 @@ static int ip_route_output_slow(struct n - goto make_route; - } - -+ if (res.type == RTN_UNICAST) -+ fib_select_default(net, &fl, &res); - #ifdef CONFIG_IP_ROUTE_MULTIPATH -- if (res.fi->fib_nhs > 1 && fl.oif == 0) -+ if (res.fi->fib_nhs > 1) - fib_select_multipath(&fl, &res); -- else - #endif -- if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) -- fib_select_default(net, &fl, &res); - - if (!fl.fl4_src) - fl.fl4_src = FIB_RES_PREFSRC(res); -@@ -2689,6 +2728,7 @@ int __ip_route_output_key(struct net *ne - rth->fl.fl4_src == flp->fl4_src && - rth->fl.iif == 0 && - rth->fl.oif == flp->oif && -+ rth->fl.fl4_gw == flp->fl4_gw && - rth->fl.mark == flp->mark && - !((rth->fl.fl4_tos ^ flp->fl4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK)) && -@@ -3466,3 +3506,4 @@ void __init ip_static_sysctl_init(void) - EXPORT_SYMBOL(__ip_select_ident); - EXPORT_SYMBOL(ip_route_input); - EXPORT_SYMBOL(ip_route_output_key); -+EXPORT_SYMBOL(ip_route_input_lookup); diff --git a/pkgs/core/kernel/patches/routes-2.6.33-16.patch b/pkgs/core/kernel/patches/routes-2.6.33-16.patch new file mode 100644 index 0000000..5f68494 --- /dev/null +++ b/pkgs/core/kernel/patches/routes-2.6.33-16.patch @@ -0,0 +1,1333 @@ +diff -urp v2.6.33/linux/include/linux/rtnetlink.h linux/include/linux/rtnetlink.h +--- v2.6.33/linux/include/linux/rtnetlink.h 2010-02-25 09:01:36.000000000 +0200 ++++ linux/include/linux/rtnetlink.h 2010-02-25 11:11:52.000000000 +0200 +@@ -304,6 +304,8 @@ struct rtnexthop { + #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ + #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ + #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ ++#define RTNH_F_SUSPECT 8 /* We don't know the real state */ ++#define RTNH_F_BADSTATE (RTNH_F_DEAD | RTNH_F_SUSPECT) + + /* Macros to handle hexthops */ + +diff -urp v2.6.33/linux/include/net/flow.h linux/include/net/flow.h +--- v2.6.33/linux/include/net/flow.h 2009-03-25 09:48:32.000000000 +0200 ++++ linux/include/net/flow.h 2010-02-25 11:11:52.000000000 +0200 +@@ -19,6 +19,8 @@ struct flowi { + struct { + __be32 daddr; + __be32 saddr; ++ __be32 lsrc; ++ __be32 gw; + __u8 tos; + __u8 scope; + } ip4_u; +@@ -43,6 +45,8 @@ struct flowi { + #define fl6_flowlabel nl_u.ip6_u.flowlabel + #define fl4_dst nl_u.ip4_u.daddr + #define fl4_src nl_u.ip4_u.saddr ++#define fl4_lsrc nl_u.ip4_u.lsrc ++#define fl4_gw nl_u.ip4_u.gw + #define fl4_tos nl_u.ip4_u.tos + #define fl4_scope nl_u.ip4_u.scope + +diff -urp v2.6.33/linux/include/net/ip_fib.h linux/include/net/ip_fib.h +--- v2.6.33/linux/include/net/ip_fib.h 2010-02-25 09:01:36.000000000 +0200 ++++ linux/include/net/ip_fib.h 2010-02-25 11:11:52.000000000 +0200 +@@ -207,6 +207,8 @@ extern int fib_lookup(struct net *n, str + extern struct fib_table *fib_new_table(struct net *net, u32 id); + extern struct fib_table *fib_get_table(struct net *net, u32 id); + ++extern int fib_result_table(struct fib_result *res); ++ + #endif /* CONFIG_IP_MULTIPLE_TABLES */ + + /* Exported by fib_frontend.c */ +@@ -277,4 +279,6 @@ static inline void fib_proc_exit(struct + } + #endif + ++extern rwlock_t fib_nhflags_lock; ++ + #endif /* _NET_FIB_H */ +diff -urp v2.6.33/linux/include/net/netfilter/nf_nat.h linux/include/net/netfilter/nf_nat.h +--- v2.6.33/linux/include/net/netfilter/nf_nat.h 2010-02-25 
09:01:36.000000000 +0200 ++++ linux/include/net/netfilter/nf_nat.h 2010-02-25 11:11:52.000000000 +0200 +@@ -73,6 +73,13 @@ struct nf_conn_nat { + #endif + }; + ++/* Call input routing for SNAT-ed traffic */ ++extern unsigned int ip_nat_route_input(unsigned int hooknum, ++ struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ int (*okfn)(struct sk_buff *)); ++ + /* Set up the info structure to map into this range. */ + extern unsigned int nf_nat_setup_info(struct nf_conn *ct, + const struct nf_nat_range *range, +diff -urp v2.6.33/linux/include/net/route.h linux/include/net/route.h +--- v2.6.33/linux/include/net/route.h 2010-02-25 09:01:36.000000000 +0200 ++++ linux/include/net/route.h 2010-02-25 11:11:52.000000000 +0200 +@@ -113,6 +113,7 @@ extern int __ip_route_output_key(struct + extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp); + extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); + extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin); ++extern int ip_route_input_lookup(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin, __be32 lsrc); + extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev); + extern void ip_rt_send_redirect(struct sk_buff *skb); + +diff -urp v2.6.33/linux/net/bridge/br_netfilter.c linux/net/bridge/br_netfilter.c +--- v2.6.33/linux/net/bridge/br_netfilter.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/bridge/br_netfilter.c 2010-02-25 11:11:52.000000000 +0200 +@@ -343,6 +343,9 @@ static int br_nf_pre_routing_finish(stru + struct rtable *rt; + int err; + ++ /* Old skb->dst is not expected, it is lost in all cases */ ++ skb_dst_drop(skb); ++ + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; +diff -urp v2.6.33/linux/net/ipv4/fib_frontend.c linux/net/ipv4/fib_frontend.c +--- v2.6.33/linux/net/ipv4/fib_frontend.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/fib_frontend.c 2010-02-25 11:13:43.000000000 +0200 +@@ -46,6 +46,8 @@ + + #ifndef CONFIG_IP_MULTIPLE_TABLES + ++#define FIB_RES_TABLE(r) (RT_TABLE_MAIN) ++ + static int __net_init fib4_rules_init(struct net *net) + { + struct fib_table *local_table, *main_table; +@@ -70,6 +72,8 @@ fail: + } + #else + ++#define FIB_RES_TABLE(r) (fib_result_table(r)) ++ + struct fib_table *fib_new_table(struct net *net, u32 id) + { + struct fib_table *tb; +@@ -124,7 +128,8 @@ void fib_select_default(struct net *net, + table = res->r->table; + #endif + tb = fib_get_table(net, table); +- if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) ++ if ((FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) || ++ FIB_RES_NH(*res).nh_scope == RT_SCOPE_HOST) + fib_table_select_default(tb, flp, res); + } + +@@ -241,6 +246,9 @@ int fib_validate_source(__be32 src, __be + .iif = oif }; + + struct fib_result res; ++ int table; ++ unsigned char prefixlen; ++ unsigned char scope; + int no_addr, rpf, accept_local; + int ret; + struct net *net; +@@ -269,31 +277,35 @@ int fib_validate_source(__be32 src, __be + } + *spec_dst = FIB_RES_PREFSRC(res); + fib_combine_itag(itag, &res); +-#ifdef CONFIG_IP_ROUTE_MULTIPATH +- if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) +-#else + if (FIB_RES_DEV(res) == dev) +-#endif + { + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + fib_res_put(&res); + 
return ret; + } ++ table = FIB_RES_TABLE(&res); ++ prefixlen = res.prefixlen; ++ scope = res.scope; + fib_res_put(&res); + if (no_addr) + goto last_resort; +- if (rpf == 1) +- goto e_inval; + fl.oif = dev->ifindex; + + ret = 0; + if (fib_lookup(net, &fl, &res) == 0) { +- if (res.type == RTN_UNICAST) { ++ if (res.type == RTN_UNICAST && ++ ((table == FIB_RES_TABLE(&res) && ++ res.prefixlen >= prefixlen && res.scope >= scope) || ++ !rpf)) { + *spec_dst = FIB_RES_PREFSRC(res); + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; ++ fib_res_put(&res); ++ return ret; + } + fib_res_put(&res); + } ++ if (rpf == 1) ++ goto e_inval; + return ret; + + last_resort: +@@ -916,9 +928,7 @@ static int fib_inetaddr_event(struct not + switch (event) { + case NETDEV_UP: + fib_add_ifaddr(ifa); +-#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +-#endif + rt_cache_flush(dev_net(dev), -1); + break; + case NETDEV_DOWN: +@@ -954,9 +964,7 @@ static int fib_netdev_event(struct notif + for_ifa(in_dev) { + fib_add_ifaddr(ifa); + } endfor_ifa(in_dev); +-#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +-#endif + rt_cache_flush(dev_net(dev), -1); + break; + case NETDEV_DOWN: +diff -urp v2.6.33/linux/net/ipv4/fib_hash.c linux/net/ipv4/fib_hash.c +--- v2.6.33/linux/net/ipv4/fib_hash.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/fib_hash.c 2010-02-25 11:11:52.000000000 +0200 +@@ -277,25 +277,35 @@ out: + void fib_table_select_default(struct fib_table *tb, + const struct flowi *flp, struct fib_result *res) + { +- int order, last_idx; ++ int order, last_idx, last_dflt, last_nhsel; ++ struct fib_alias *first_fa = NULL; ++ struct hlist_head *head; + struct hlist_node *node; + struct fib_node *f; + struct fib_info *fi = NULL; + struct fib_info *last_resort; + struct fn_hash *t = (struct fn_hash *)tb->tb_data; +- struct fn_zone *fz = t->fn_zones[0]; ++ struct fn_zone *fz = t->fn_zones[res->prefixlen]; ++ __be32 k; + + if (fz == NULL) + return; + ++ k = fz_key(flp->fl4_dst, fz); ++ last_dflt = -2; ++ last_nhsel = 0; + last_idx = -1; + last_resort = NULL; + order = -1; + + read_lock(&fib_hash_lock); +- hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { ++ head = &fz->fz_hash[fn_hash(k, fz)]; ++ hlist_for_each_entry(f, node, head, fn_hash) { + struct fib_alias *fa; + ++ if (f->fn_key != k) ++ continue; ++ + list_for_each_entry(fa, &f->fn_alias, fa_list) { + struct fib_info *next_fi = fa->fa_info; + +@@ -303,42 +313,56 @@ void fib_table_select_default(struct fib + fa->fa_type != RTN_UNICAST) + continue; + ++ if (fa->fa_tos && ++ fa->fa_tos != flp->fl4_tos) ++ continue; + if (next_fi->fib_priority > res->fi->fib_priority) + break; +- if (!next_fi->fib_nh[0].nh_gw || +- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) +- continue; + fa->fa_state |= FA_S_ACCESSED; + +- if (fi == NULL) { +- if (next_fi != res->fi) +- break; +- } else if (!fib_detect_death(fi, order, &last_resort, +- &last_idx, tb->tb_default)) { ++ if (!first_fa) { ++ last_dflt = fa->fa_last_dflt; ++ first_fa = fa; ++ } ++ if (fi && !fib_detect_death(fi, order, &last_resort, ++ &last_idx, &last_dflt, &last_nhsel, flp)) { + fib_result_assign(res, fi); +- tb->tb_default = order; ++ first_fa->fa_last_dflt = order; + goto out; + } + fi = next_fi; + order++; + } ++ break; + } + + if (order <= 0 || fi == NULL) { +- tb->tb_default = -1; ++ if (fi && fi->fib_nhs > 1 && ++ fib_detect_death(fi, order, &last_resort, &last_idx, ++ &last_dflt, &last_nhsel, flp) && ++ last_resort == fi) { ++ read_lock_bh(&fib_nhflags_lock); ++ fi->fib_nh[last_nhsel].nh_flags &= 
~RTNH_F_SUSPECT; ++ read_unlock_bh(&fib_nhflags_lock); ++ } ++ if (first_fa) first_fa->fa_last_dflt = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, +- tb->tb_default)) { ++ &last_dflt, &last_nhsel, flp)) { + fib_result_assign(res, fi); +- tb->tb_default = order; ++ first_fa->fa_last_dflt = order; + goto out; + } + +- if (last_idx >= 0) ++ if (last_idx >= 0) { + fib_result_assign(res, last_resort); +- tb->tb_default = last_idx; ++ read_lock_bh(&fib_nhflags_lock); ++ last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; ++ read_unlock_bh(&fib_nhflags_lock); ++ first_fa->fa_last_dflt = last_idx; ++ } + out: + read_unlock(&fib_hash_lock); + } +@@ -462,6 +486,7 @@ int fib_table_insert(struct fib_table *t + write_lock_bh(&fib_hash_lock); + fi_drop = fa->fa_info; + fa->fa_info = fi; ++ fa->fa_last_dflt = -1; + fa->fa_type = cfg->fc_type; + fa->fa_scope = cfg->fc_scope; + state = fa->fa_state; +@@ -516,6 +541,7 @@ int fib_table_insert(struct fib_table *t + new_fa->fa_type = cfg->fc_type; + new_fa->fa_scope = cfg->fc_scope; + new_fa->fa_state = 0; ++ new_fa->fa_last_dflt = -1; + + /* + * Insert new entry to the list. +diff -urp v2.6.33/linux/net/ipv4/fib_lookup.h linux/net/ipv4/fib_lookup.h +--- v2.6.33/linux/net/ipv4/fib_lookup.h 2009-09-11 10:27:17.000000000 +0300 ++++ linux/net/ipv4/fib_lookup.h 2010-02-25 11:11:52.000000000 +0200 +@@ -8,6 +8,7 @@ + struct fib_alias { + struct list_head fa_list; + struct fib_info *fa_info; ++ int fa_last_dflt; + u8 fa_tos; + u8 fa_type; + u8 fa_scope; +@@ -37,7 +38,8 @@ extern struct fib_alias *fib_find_alias( + u8 tos, u32 prio); + extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, +- int *last_idx, int dflt); ++ int *last_idx, int *dflt, int *last_nhsel, ++ const struct flowi *flp); + + static inline void fib_result_assign(struct fib_result *res, + struct fib_info *fi) +diff -urp v2.6.33/linux/net/ipv4/fib_rules.c linux/net/ipv4/fib_rules.c +--- v2.6.33/linux/net/ipv4/fib_rules.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/fib_rules.c 2010-02-25 11:11:52.000000000 +0200 +@@ -54,6 +54,11 @@ u32 fib_rules_tclass(struct fib_result * + } + #endif + ++int fib_result_table(struct fib_result *res) ++{ ++ return res->r->table; ++} ++ + int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) + { + struct fib_lookup_arg arg = { +diff -urp v2.6.33/linux/net/ipv4/fib_semantics.c linux/net/ipv4/fib_semantics.c +--- v2.6.33/linux/net/ipv4/fib_semantics.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/fib_semantics.c 2010-02-25 11:11:52.000000000 +0200 +@@ -50,6 +50,7 @@ static struct hlist_head *fib_info_hash; + static struct hlist_head *fib_info_laddrhash; + static unsigned int fib_hash_size; + static unsigned int fib_info_cnt; ++rwlock_t fib_nhflags_lock = RW_LOCK_UNLOCKED; + + #define DEVINDEX_HASHBITS 8 + #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) +@@ -186,7 +187,7 @@ static __inline__ int nh_comp(const stru + #ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid != onh->nh_tclassid || + #endif +- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) ++ ((nh->nh_flags^onh->nh_flags)&~RTNH_F_BADSTATE)) + return -1; + onh++; + } endfor_nexthops(fi); +@@ -237,7 +238,7 @@ static struct fib_info *fib_find_info(co + nfi->fib_priority == fi->fib_priority && + memcmp(nfi->fib_metrics, fi->fib_metrics, + sizeof(fi->fib_metrics)) == 0 && +- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && ++ ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_BADSTATE) == 0 && + 
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + return fi; + } +@@ -349,26 +350,70 @@ struct fib_alias *fib_find_alias(struct + } + + int fib_detect_death(struct fib_info *fi, int order, +- struct fib_info **last_resort, int *last_idx, int dflt) ++ struct fib_info **last_resort, int *last_idx, int *dflt, ++ int *last_nhsel, const struct flowi *flp) + { + struct neighbour *n; +- int state = NUD_NONE; ++ int nhsel; ++ int state; ++ struct fib_nh * nh; ++ __be32 dst; ++ int flag, dead = 1; ++ ++ /* change_nexthops(fi) { */ ++ for (nhsel = 0, nh = fi->fib_nh; nhsel < fi->fib_nhs; nh++, nhsel++) { ++ if (flp->oif && flp->oif != nh->nh_oif) ++ continue; ++ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && nh->nh_gw && ++ nh->nh_scope == RT_SCOPE_LINK) ++ continue; ++ if (nh->nh_flags & RTNH_F_DEAD) ++ continue; + +- n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); +- if (n) { +- state = n->nud_state; +- neigh_release(n); +- } +- if (state == NUD_REACHABLE) +- return 0; +- if ((state&NUD_VALID) && order != dflt) +- return 0; +- if ((state&NUD_VALID) || +- (*last_idx<0 && order > dflt)) { +- *last_resort = fi; +- *last_idx = order; ++ flag = 0; ++ if (nh->nh_dev->flags & IFF_NOARP) { ++ dead = 0; ++ goto setfl; ++ } ++ ++ dst = nh->nh_gw; ++ if (!nh->nh_gw || nh->nh_scope != RT_SCOPE_LINK) ++ dst = flp->fl4_dst; ++ ++ state = NUD_NONE; ++ n = neigh_lookup(&arp_tbl, &dst, nh->nh_dev); ++ if (n) { ++ state = n->nud_state; ++ neigh_release(n); ++ } ++ if (state==NUD_REACHABLE || ++ ((state&NUD_VALID) && order != *dflt)) { ++ dead = 0; ++ goto setfl; ++ } ++ if (!(state&NUD_VALID)) ++ flag = 1; ++ if (!dead) ++ goto setfl; ++ if ((state&NUD_VALID) || ++ (*last_idx<0 && order >= *dflt)) { ++ *last_resort = fi; ++ *last_idx = order; ++ *last_nhsel = nhsel; ++ } ++ ++ setfl: ++ ++ read_lock_bh(&fib_nhflags_lock); ++ if (flag) ++ nh->nh_flags |= RTNH_F_SUSPECT; ++ else ++ nh->nh_flags &= ~RTNH_F_SUSPECT; ++ read_unlock_bh(&fib_nhflags_lock); + } +- return 1; ++ /* } endfor_nexthops(fi) */ ++ ++ return dead; + } + + #ifdef CONFIG_IP_ROUTE_MULTIPATH +@@ -540,8 +585,11 @@ static int fib_check_nh(struct fib_confi + return -EINVAL; + if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) + return -ENODEV; +- if (!(dev->flags&IFF_UP)) +- return -ENETDOWN; ++ if (!(dev->flags&IFF_UP)) { ++ if (fi->fib_protocol != RTPROT_STATIC) ++ return -ENETDOWN; ++ nh->nh_flags |= RTNH_F_DEAD; ++ } + nh->nh_dev = dev; + dev_hold(dev); + nh->nh_scope = RT_SCOPE_LINK; +@@ -561,24 +609,48 @@ static int fib_check_nh(struct fib_confi + /* It is not necessary, but requires a bit of thinking */ + if (fl.fl4_scope < RT_SCOPE_LINK) + fl.fl4_scope = RT_SCOPE_LINK; +- if ((err = fib_lookup(net, &fl, &res)) != 0) +- return err; ++ err = fib_lookup(net, &fl, &res); + } +- err = -EINVAL; +- if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) +- goto out; +- nh->nh_scope = res.scope; +- nh->nh_oif = FIB_RES_OIF(res); +- if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) +- goto out; +- dev_hold(nh->nh_dev); +- err = -ENETDOWN; +- if (!(nh->nh_dev->flags & IFF_UP)) +- goto out; +- err = 0; ++ if (err) { ++ struct in_device *in_dev; ++ ++ if (err != -ENETUNREACH || ++ fi->fib_protocol != RTPROT_STATIC) ++ return err; ++ ++ in_dev = inetdev_by_index(net, nh->nh_oif); ++ if (in_dev == NULL || ++ in_dev->dev->flags & IFF_UP) { ++ if (in_dev) ++ in_dev_put(in_dev); ++ return err; ++ } ++ nh->nh_flags |= RTNH_F_DEAD; ++ nh->nh_scope = RT_SCOPE_LINK; ++ nh->nh_dev = in_dev->dev; ++ dev_hold(nh->nh_dev); ++ in_dev_put(in_dev); ++ } else { 
++ err = -EINVAL; ++ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) ++ goto out; ++ nh->nh_scope = res.scope; ++ nh->nh_oif = FIB_RES_OIF(res); ++ if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) ++ goto out; ++ dev_hold(nh->nh_dev); ++ if (!(nh->nh_dev->flags & IFF_UP)) { ++ if (fi->fib_protocol != RTPROT_STATIC) { ++ err = -ENETDOWN; ++ goto out; ++ } ++ nh->nh_flags |= RTNH_F_DEAD; ++ } ++ err = 0; + out: +- fib_res_put(&res); +- return err; ++ fib_res_put(&res); ++ return err; ++ } + } else { + struct in_device *in_dev; + +@@ -589,8 +661,11 @@ out: + if (in_dev == NULL) + return -ENODEV; + if (!(in_dev->dev->flags&IFF_UP)) { +- in_dev_put(in_dev); +- return -ENETDOWN; ++ if (fi->fib_protocol != RTPROT_STATIC) { ++ in_dev_put(in_dev); ++ return -ENETDOWN; ++ } ++ nh->nh_flags |= RTNH_F_DEAD; + } + nh->nh_dev = in_dev->dev; + dev_hold(nh->nh_dev); +@@ -899,8 +974,12 @@ int fib_semantic_match(struct list_head + for_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + continue; +- if (!flp->oif || flp->oif == nh->nh_oif) +- break; ++ if (flp->oif && flp->oif != nh->nh_oif) ++ continue; ++ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && ++ nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) ++ continue; ++ break; + } + #ifdef CONFIG_IP_ROUTE_MULTIPATH + if (nhsel < fi->fib_nhs) { +@@ -1080,18 +1159,29 @@ int fib_sync_down_dev(struct net_device + prev_fi = fi; + dead = 0; + change_nexthops(fi) { +- if (nh->nh_flags&RTNH_F_DEAD) +- dead++; +- else if (nh->nh_dev == dev && +- nh->nh_scope != scope) { +- nh->nh_flags |= RTNH_F_DEAD; ++ if (nh->nh_flags&RTNH_F_DEAD) { ++ if (fi->fib_protocol!=RTPROT_STATIC || ++ nh->nh_dev == NULL || ++ __in_dev_get_rtnl(nh->nh_dev) == NULL || ++ nh->nh_dev->flags&IFF_UP) ++ dead++; ++ } else if (nh->nh_dev == dev && ++ nh->nh_scope != scope) { ++ write_lock_bh(&fib_nhflags_lock); + #ifdef CONFIG_IP_ROUTE_MULTIPATH +- spin_lock_bh(&fib_multipath_lock); ++ spin_lock(&fib_multipath_lock); ++ nh->nh_flags |= RTNH_F_DEAD; + fi->fib_power -= nh->nh_power; + nh->nh_power = 0; +- spin_unlock_bh(&fib_multipath_lock); ++ spin_unlock(&fib_multipath_lock); ++#else ++ nh->nh_flags |= RTNH_F_DEAD; + #endif +- dead++; ++ write_unlock_bh(&fib_nhflags_lock); ++ if (fi->fib_protocol!=RTPROT_STATIC || ++ force || ++ __in_dev_get_rtnl(dev) == NULL) ++ dead++; + } + #ifdef CONFIG_IP_ROUTE_MULTIPATH + if (force > 1 && nh->nh_dev == dev) { +@@ -1109,11 +1199,8 @@ int fib_sync_down_dev(struct net_device + return ret; + } + +-#ifdef CONFIG_IP_ROUTE_MULTIPATH +- + /* +- Dead device goes up. We wake up dead nexthops. +- It takes sense only on multipath routes. ++ Dead device goes up or new address is added. We wake up dead nexthops. 
+ */ + + int fib_sync_up(struct net_device *dev) +@@ -1123,8 +1210,10 @@ int fib_sync_up(struct net_device *dev) + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; +- int ret; ++ struct fib_result res; ++ int ret, rep; + ++repeat: + if (!(dev->flags&IFF_UP)) + return 0; + +@@ -1132,6 +1221,7 @@ int fib_sync_up(struct net_device *dev) + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + ret = 0; ++ rep = 0; + + hlist_for_each_entry(nh, node, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; +@@ -1144,19 +1234,39 @@ int fib_sync_up(struct net_device *dev) + prev_fi = fi; + alive = 0; + change_nexthops(fi) { +- if (!(nh->nh_flags&RTNH_F_DEAD)) { +- alive++; ++ if (!(nh->nh_flags&RTNH_F_DEAD)) + continue; +- } + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + continue; + if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) + continue; ++ if (nh->nh_gw && fi->fib_protocol == RTPROT_STATIC) { ++ struct flowi fl = { ++ .nl_u = { .ip4_u = ++ { .daddr = nh->nh_gw, ++ .scope = nh->nh_scope } }, ++ .oif = nh->nh_oif, ++ }; ++ if (fib_lookup(dev_net(dev), &fl, &res) != 0) ++ continue; ++ if (res.type != RTN_UNICAST && ++ res.type != RTN_LOCAL) { ++ fib_res_put(&res); ++ continue; ++ } ++ nh->nh_scope = res.scope; ++ fib_res_put(&res); ++ rep = 1; ++ } + alive++; ++#ifdef CONFIG_IP_ROUTE_MULTIPATH + spin_lock_bh(&fib_multipath_lock); + nh->nh_power = 0; ++#endif + nh->nh_flags &= ~RTNH_F_DEAD; ++#ifdef CONFIG_IP_ROUTE_MULTIPATH + spin_unlock_bh(&fib_multipath_lock); ++#endif + } endfor_nexthops(fi) + + if (alive > 0) { +@@ -1164,10 +1274,14 @@ int fib_sync_up(struct net_device *dev) + ret++; + } + } ++ if (rep) ++ goto repeat; + + return ret; + } + ++#ifdef CONFIG_IP_ROUTE_MULTIPATH ++ + /* + The algorithm is suboptimal, but it provides really + fair weighted route distribution. +@@ -1176,24 +1290,45 @@ int fib_sync_up(struct net_device *dev) + void fib_select_multipath(const struct flowi *flp, struct fib_result *res) + { + struct fib_info *fi = res->fi; +- int w; ++ int w, alive; + + spin_lock_bh(&fib_multipath_lock); ++ if (flp->oif) { ++ int sel = -1; ++ w = -1; ++ change_nexthops(fi) { ++ if (flp->oif != nh->nh_oif) ++ continue; ++ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && ++ nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) ++ continue; ++ if (!(nh->nh_flags&RTNH_F_BADSTATE)) { ++ if (nh->nh_power > w) { ++ w = nh->nh_power; ++ sel = nhsel; ++ } ++ } ++ } endfor_nexthops(fi); ++ if (sel >= 0) { ++ spin_unlock_bh(&fib_multipath_lock); ++ res->nh_sel = sel; ++ return; ++ } ++ goto last_resort; ++ } ++ ++repeat: + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { +- if (!(nh->nh_flags&RTNH_F_DEAD)) { ++ if (!(nh->nh_flags&RTNH_F_BADSTATE)) { + power += nh->nh_weight; + nh->nh_power = nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; +- if (power <= 0) { +- spin_unlock_bh(&fib_multipath_lock); +- /* Race condition: route has just become dead. 
*/ +- res->nh_sel = 0; +- return; +- } ++ if (power <= 0) ++ goto last_resort; + } + + +@@ -1203,20 +1338,40 @@ void fib_select_multipath(const struct f + + w = jiffies % fi->fib_power; + ++ alive = 0; + change_nexthops(fi) { +- if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { ++ if (!(nh->nh_flags&RTNH_F_BADSTATE) && nh->nh_power) { + if ((w -= nh->nh_power) <= 0) { + nh->nh_power--; + fi->fib_power--; +- res->nh_sel = nhsel; + spin_unlock_bh(&fib_multipath_lock); ++ res->nh_sel = nhsel; + return; + } ++ alive = 1; ++ } ++ } endfor_nexthops(fi); ++ if (alive) { ++ fi->fib_power = 0; ++ goto repeat; ++ } ++ ++last_resort: ++ ++ for_nexthops(fi) { ++ if (!(nh->nh_flags&RTNH_F_DEAD)) { ++ if (flp->oif && flp->oif != nh->nh_oif) ++ continue; ++ if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && ++ nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) ++ continue; ++ spin_unlock_bh(&fib_multipath_lock); ++ res->nh_sel = nhsel; ++ return; + } + } endfor_nexthops(fi); + + /* Race condition: route has just become dead. */ +- res->nh_sel = 0; + spin_unlock_bh(&fib_multipath_lock); + } + #endif +diff -urp v2.6.33/linux/net/ipv4/fib_trie.c linux/net/ipv4/fib_trie.c +--- v2.6.33/linux/net/ipv4/fib_trie.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/fib_trie.c 2010-02-25 11:11:52.000000000 +0200 +@@ -1272,6 +1272,7 @@ int fib_table_insert(struct fib_table *t + fi_drop = fa->fa_info; + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; ++ new_fa->fa_last_dflt = -1; + new_fa->fa_type = cfg->fc_type; + new_fa->fa_scope = cfg->fc_scope; + state = fa->fa_state; +@@ -1312,6 +1313,7 @@ int fib_table_insert(struct fib_table *t + new_fa->fa_type = cfg->fc_type; + new_fa->fa_scope = cfg->fc_scope; + new_fa->fa_state = 0; ++ new_fa->fa_last_dflt = -1; + /* + * Insert new entry to the list. 
+ */ +@@ -1812,24 +1814,31 @@ void fib_table_select_default(struct fib + struct fib_result *res) + { + struct trie *t = (struct trie *) tb->tb_data; +- int order, last_idx; ++ int order, last_idx, last_dflt, last_nhsel; ++ struct fib_alias *first_fa = NULL; + struct fib_info *fi = NULL; + struct fib_info *last_resort; + struct fib_alias *fa = NULL; + struct list_head *fa_head; + struct leaf *l; ++ u32 key, mask; + ++ last_dflt = -2; ++ last_nhsel = 0; + last_idx = -1; + last_resort = NULL; + order = -1; + ++ mask = inet_make_mask(res->prefixlen); ++ key = ntohl(flp->fl4_dst & mask); ++ + rcu_read_lock(); + +- l = fib_find_node(t, 0); ++ l = fib_find_node(t, key); + if (!l) + goto out; + +- fa_head = get_fa_head(l, 0); ++ fa_head = get_fa_head(l, res->prefixlen); + if (!fa_head) + goto out; + +@@ -1843,39 +1852,52 @@ void fib_table_select_default(struct fib + fa->fa_type != RTN_UNICAST) + continue; + ++ if (fa->fa_tos && ++ fa->fa_tos != flp->fl4_tos) ++ continue; + if (next_fi->fib_priority > res->fi->fib_priority) + break; +- if (!next_fi->fib_nh[0].nh_gw || +- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) +- continue; + fa->fa_state |= FA_S_ACCESSED; + +- if (fi == NULL) { +- if (next_fi != res->fi) +- break; +- } else if (!fib_detect_death(fi, order, &last_resort, +- &last_idx, tb->tb_default)) { ++ if (!first_fa) { ++ last_dflt = fa->fa_last_dflt; ++ first_fa = fa; ++ } ++ if (fi && !fib_detect_death(fi, order, &last_resort, ++ &last_idx, &last_dflt, &last_nhsel, flp)) { + fib_result_assign(res, fi); +- tb->tb_default = order; ++ first_fa->fa_last_dflt = order; + goto out; + } + fi = next_fi; + order++; + } + if (order <= 0 || fi == NULL) { +- tb->tb_default = -1; ++ if (fi && fi->fib_nhs > 1 && ++ fib_detect_death(fi, order, &last_resort, &last_idx, ++ &last_dflt, &last_nhsel, flp) && ++ last_resort == fi) { ++ read_lock_bh(&fib_nhflags_lock); ++ fi->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; ++ read_unlock_bh(&fib_nhflags_lock); ++ } ++ if (first_fa) first_fa->fa_last_dflt = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, +- tb->tb_default)) { ++ &last_dflt, &last_nhsel, flp)) { + fib_result_assign(res, fi); +- tb->tb_default = order; ++ first_fa->fa_last_dflt = order; + goto out; + } +- if (last_idx >= 0) ++ if (last_idx >= 0) { + fib_result_assign(res, last_resort); +- tb->tb_default = last_idx; ++ read_lock_bh(&fib_nhflags_lock); ++ last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; ++ read_unlock_bh(&fib_nhflags_lock); ++ first_fa->fa_last_dflt = last_idx; ++ } + out: + rcu_read_unlock(); + } +diff -urp v2.6.33/linux/net/ipv4/netfilter/ipt_MASQUERADE.c linux/net/ipv4/netfilter/ipt_MASQUERADE.c +--- v2.6.33/linux/net/ipv4/netfilter/ipt_MASQUERADE.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/netfilter/ipt_MASQUERADE.c 2010-02-25 11:11:52.000000000 +0200 +@@ -51,7 +51,7 @@ masquerade_tg(struct sk_buff *skb, const + enum ip_conntrack_info ctinfo; + struct nf_nat_range newrange; + const struct nf_nat_multi_range_compat *mr; +- const struct rtable *rt; ++ struct rtable *rt; + __be32 newsrc; + + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); +@@ -69,13 +69,28 @@ masquerade_tg(struct sk_buff *skb, const + return NF_ACCEPT; + + mr = par->targinfo; +- rt = skb_rtable(skb); +- newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); +- if (!newsrc) { +- printk("MASQUERADE: %s ate my IP address\n", par->out->name); +- return NF_DROP; ++ ++ { ++ struct flowi fl = { .nl_u = { .ip4_u = ++ { .daddr = 
ip_hdr(skb)->daddr, ++ .tos = (RT_TOS(ip_hdr(skb)->tos) | ++ RTO_CONN), ++ .gw = skb_rtable(skb)->rt_gateway, ++ } }, ++ .mark = skb->mark, ++ .oif = par->out->ifindex }; ++ if (ip_route_output_key(dev_net(par->out), &rt, &fl) != 0) { ++ /* Funky routing can do this. */ ++ if (net_ratelimit()) ++ printk("MASQUERADE:" ++ " No route: Rusty's brain broke!\n"); ++ return NF_DROP; ++ } + } + ++ newsrc = rt->rt_src; ++ ip_rt_put(rt); ++ + nat->masq_index = par->out->ifindex; + + /* Transfer from original range. */ +diff -urp v2.6.33/linux/net/ipv4/netfilter/nf_nat_core.c linux/net/ipv4/netfilter/nf_nat_core.c +--- v2.6.33/linux/net/ipv4/netfilter/nf_nat_core.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/netfilter/nf_nat_core.c 2010-02-25 11:11:52.000000000 +0200 +@@ -710,6 +710,52 @@ static struct pernet_operations nf_nat_n + .exit = nf_nat_net_exit, + }; + ++unsigned int ++ip_nat_route_input(unsigned int hooknum, ++ struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ int (*okfn)(struct sk_buff *)) ++{ ++ struct iphdr *iph; ++ struct nf_conn *conn; ++ enum ip_conntrack_info ctinfo; ++ enum ip_conntrack_dir dir; ++ unsigned long statusbit; ++ __be32 saddr; ++ ++ if (!(conn = nf_ct_get(skb, &ctinfo))) ++ return NF_ACCEPT; ++ ++ if (!(conn->status & IPS_NAT_DONE_MASK)) ++ return NF_ACCEPT; ++ dir = CTINFO2DIR(ctinfo); ++ statusbit = IPS_SRC_NAT; ++ if (dir == IP_CT_DIR_REPLY) ++ statusbit ^= IPS_NAT_MASK; ++ if (!(conn->status & statusbit)) ++ return NF_ACCEPT; ++ ++ if (skb_dst(skb)) ++ return NF_ACCEPT; ++ ++ if (skb->len < sizeof(struct iphdr)) ++ return NF_ACCEPT; ++ ++ /* use daddr in other direction as masquerade address (lsrc) */ ++ iph = ip_hdr(skb); ++ saddr = conn->tuplehash[!dir].tuple.dst.u3.ip; ++ if (saddr == iph->saddr) ++ return NF_ACCEPT; ++ ++ if (ip_route_input_lookup(skb, iph->daddr, iph->saddr, iph->tos, ++ skb->dev, saddr)) ++ return NF_DROP; ++ ++ return NF_ACCEPT; ++} ++EXPORT_SYMBOL_GPL(ip_nat_route_input); ++ + static int __init nf_nat_init(void) + { + size_t i; +diff -urp v2.6.33/linux/net/ipv4/netfilter/nf_nat_standalone.c linux/net/ipv4/netfilter/nf_nat_standalone.c +--- v2.6.33/linux/net/ipv4/netfilter/nf_nat_standalone.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/netfilter/nf_nat_standalone.c 2010-02-25 11:11:52.000000000 +0200 +@@ -255,6 +255,14 @@ static struct nf_hook_ops nf_nat_ops[] _ + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_NAT_DST, + }, ++ /* Before routing, route before mangling */ ++ { ++ .hook = ip_nat_route_input, ++ .owner = THIS_MODULE, ++ .pf = NFPROTO_IPV4, ++ .hooknum = NF_INET_PRE_ROUTING, ++ .priority = NF_IP_PRI_LAST-1, ++ }, + /* After packet filtering, change source */ + { + .hook = nf_nat_out, +diff -urp v2.6.33/linux/net/ipv4/route.c linux/net/ipv4/route.c +--- v2.6.33/linux/net/ipv4/route.c 2010-02-25 09:01:36.000000000 +0200 ++++ linux/net/ipv4/route.c 2010-02-25 11:11:52.000000000 +0200 +@@ -695,6 +695,8 @@ static inline int compare_keys(struct fl + return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | + (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | + (fl1->mark ^ fl2->mark) | ++ ((__force u32)(fl1->nl_u.ip4_u.lsrc ^ fl2->nl_u.ip4_u.lsrc)) | ++ ((__force u32)(fl1->nl_u.ip4_u.gw ^ fl2->nl_u.ip4_u.gw)) | + (*(u16 *)&fl1->nl_u.ip4_u.tos ^ + *(u16 *)&fl2->nl_u.ip4_u.tos) | + (fl1->oif ^ fl2->oif) | +@@ -1430,6 +1432,7 @@ void ip_rt_redirect(__be32 old_gw, __be3 + + /* Gateway is different ... 
*/ + rt->rt_gateway = new_gw; ++ if (rt->fl.fl4_gw) rt->fl.fl4_gw = new_gw; + + /* Redirect received -> path was valid */ + dst_confirm(&rth->u.dst); +@@ -1875,6 +1878,7 @@ static int ip_route_input_mc(struct sk_b + rth->fl.fl4_tos = tos; + rth->fl.mark = skb->mark; + rth->fl.fl4_src = saddr; ++ rth->fl.fl4_lsrc = 0; + rth->rt_src = saddr; + #ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; +@@ -1885,6 +1889,7 @@ static int ip_route_input_mc(struct sk_b + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; ++ rth->fl.fl4_gw = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_genid = rt_genid(dev_net(dev)); +@@ -1949,7 +1954,7 @@ static int __mkroute_input(struct sk_buf + struct fib_result *res, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos, +- struct rtable **result) ++ __be32 lsrc, struct rtable **result) + { + + struct rtable *rth; +@@ -1983,6 +1988,7 @@ static int __mkroute_input(struct sk_buf + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && ++ !lsrc && + (IN_DEV_SHARED_MEDIA(out_dev) || + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + flags |= RTCF_DOREDIRECT; +@@ -2016,6 +2022,7 @@ static int __mkroute_input(struct sk_buf + rth->fl.mark = skb->mark; + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; ++ rth->fl.fl4_lsrc = lsrc; + rth->rt_gateway = daddr; + rth->rt_iif = + rth->fl.iif = in_dev->dev->ifindex; +@@ -2023,6 +2030,7 @@ static int __mkroute_input(struct sk_buf + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; ++ rth->fl.fl4_gw = 0; + rth->rt_spec_dst= spec_dst; + + rth->u.dst.input = ip_forward; +@@ -2043,21 +2051,23 @@ static int __mkroute_input(struct sk_buf + + static int ip_mkroute_input(struct sk_buff *skb, + struct fib_result *res, ++ struct net *net, + const struct flowi *fl, + struct in_device *in_dev, +- __be32 daddr, __be32 saddr, u32 tos) ++ __be32 daddr, __be32 saddr, u32 tos, __be32 lsrc) + { + struct rtable* rth = NULL; + int err; + unsigned hash; + ++ fib_select_default(net, fl, res); + #ifdef CONFIG_IP_ROUTE_MULTIPATH +- if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) ++ if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(fl, res); + #endif + + /* create a routing cache entry */ +- err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); ++ err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, lsrc, &rth); + if (err) + return err; + +@@ -2078,18 +2088,19 @@ static int ip_mkroute_input(struct sk_bu + */ + + static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, +- u8 tos, struct net_device *dev) ++ u8 tos, struct net_device *dev, __be32 lsrc) + { + struct fib_result res; + struct in_device *in_dev = in_dev_get(dev); + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, +- .saddr = saddr, ++ .saddr = lsrc? : saddr, + .tos = tos, + .scope = RT_SCOPE_UNIVERSE, + } }, + .mark = skb->mark, +- .iif = dev->ifindex }; ++ .iif = lsrc? ++ dev_net(dev)->loopback_dev->ifindex : dev->ifindex }; + unsigned flags = 0; + u32 itag = 0; + struct rtable * rth; +@@ -2125,6 +2136,12 @@ static int ip_route_input_slow(struct sk + ipv4_is_loopback(daddr)) + goto martian_destination; + ++ if (lsrc) { ++ if (ipv4_is_multicast(lsrc) || ipv4_is_lbcast(lsrc) || ++ ipv4_is_zeronet(lsrc) || ipv4_is_loopback(lsrc)) ++ goto e_inval; ++ } ++ + /* + * Now we are ready to route packet. 
+ */ +@@ -2134,6 +2151,8 @@ static int ip_route_input_slow(struct sk + goto no_route; + } + free_res = 1; ++ fl.iif = dev->ifindex; ++ fl.fl4_src = saddr; + + RT_CACHE_STAT_INC(in_slow_tot); + +@@ -2158,7 +2177,7 @@ static int ip_route_input_slow(struct sk + if (res.type != RTN_UNICAST) + goto martian_destination; + +- err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); ++ err = ip_mkroute_input(skb, &res, net, &fl, in_dev, daddr, saddr, tos, lsrc); + done: + in_dev_put(in_dev); + if (free_res) +@@ -2168,6 +2187,8 @@ out: return err; + brd_input: + if (skb->protocol != htons(ETH_P_IP)) + goto e_inval; ++ if (lsrc) ++ goto e_inval; + + if (ipv4_is_zeronet(saddr)) + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); +@@ -2209,6 +2230,7 @@ local_input: + rth->u.dst.dev = net->loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); ++ rth->fl.fl4_gw = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->u.dst.input= ip_local_deliver; +@@ -2259,8 +2281,9 @@ martian_source: + goto e_inval; + } + +-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, +- u8 tos, struct net_device *dev) ++static inline int ++ip_route_input_cached(struct sk_buff *skb, __be32 daddr, __be32 saddr, ++ u8 tos, struct net_device *dev, __be32 lsrc) + { + struct rtable * rth; + unsigned hash; +@@ -2281,6 +2304,7 @@ int ip_route_input(struct sk_buff *skb, + if (((rth->fl.fl4_dst ^ daddr) | + (rth->fl.fl4_src ^ saddr) | + (rth->fl.iif ^ iif) | ++ (rth->fl.fl4_lsrc ^ lsrc) | + rth->fl.oif | + (rth->fl.fl4_tos ^ tos)) == 0 && + rth->fl.mark == skb->mark && +@@ -2330,7 +2354,19 @@ skip_cache: + rcu_read_unlock(); + return -EINVAL; + } +- return ip_route_input_slow(skb, daddr, saddr, tos, dev); ++ return ip_route_input_slow(skb, daddr, saddr, tos, dev, lsrc); ++} ++ ++int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, ++ u8 tos, struct net_device *dev) ++{ ++ return ip_route_input_cached(skb, daddr, saddr, tos, dev, 0); ++} ++ ++int ip_route_input_lookup(struct sk_buff *skb, __be32 daddr, __be32 saddr, ++ u8 tos, struct net_device *dev, __be32 lsrc) ++{ ++ return ip_route_input_cached(skb, daddr, saddr, tos, dev, lsrc); + } + + static int __mkroute_output(struct rtable **result, +@@ -2402,6 +2438,7 @@ static int __mkroute_output(struct rtabl + rth->fl.fl4_tos = tos; + rth->fl.fl4_src = oldflp->fl4_src; + rth->fl.oif = oldflp->oif; ++ rth->fl.fl4_gw = oldflp->fl4_gw; + rth->fl.mark = oldflp->mark; + rth->rt_dst = fl->fl4_dst; + rth->rt_src = fl->fl4_src; +@@ -2483,6 +2520,7 @@ static int ip_route_output_slow(struct n + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = oldflp->fl4_dst, + .saddr = oldflp->fl4_src, ++ .gw = oldflp->fl4_gw, + .tos = tos & IPTOS_RT_MASK, + .scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : +@@ -2594,6 +2632,7 @@ static int ip_route_output_slow(struct n + dev_out = net->loopback_dev; + dev_hold(dev_out); + fl.oif = net->loopback_dev->ifindex; ++ fl.fl4_gw = 0; + res.type = RTN_LOCAL; + flags |= RTCF_LOCAL; + goto make_route; +@@ -2601,7 +2640,7 @@ static int ip_route_output_slow(struct n + + if (fib_lookup(net, &fl, &res)) { + res.fi = NULL; +- if (oldflp->oif) { ++ if (oldflp->oif && dev_out->flags & IFF_UP) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. 
+ +@@ -2641,6 +2680,7 @@ static int ip_route_output_slow(struct n + dev_out = net->loopback_dev; + dev_hold(dev_out); + fl.oif = dev_out->ifindex; ++ fl.fl4_gw = 0; + if (res.fi) + fib_info_put(res.fi); + res.fi = NULL; +@@ -2648,13 +2688,12 @@ static int ip_route_output_slow(struct n + goto make_route; + } + ++ if (res.type == RTN_UNICAST) ++ fib_select_default(net, &fl, &res); + #ifdef CONFIG_IP_ROUTE_MULTIPATH +- if (res.fi->fib_nhs > 1 && fl.oif == 0) ++ if (res.fi->fib_nhs > 1) + fib_select_multipath(&fl, &res); +- else + #endif +- if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) +- fib_select_default(net, &fl, &res); + + if (!fl.fl4_src) + fl.fl4_src = FIB_RES_PREFSRC(res); +@@ -2695,6 +2734,7 @@ int __ip_route_output_key(struct net *ne + rth->fl.fl4_src == flp->fl4_src && + rth->fl.iif == 0 && + rth->fl.oif == flp->oif && ++ rth->fl.fl4_gw == flp->fl4_gw && + rth->fl.mark == flp->mark && + !((rth->fl.fl4_tos ^ flp->fl4_tos) & + (IPTOS_RT_MASK | RTO_ONLINK)) && +@@ -3416,3 +3456,4 @@ void __init ip_static_sysctl_init(void) + EXPORT_SYMBOL(__ip_select_ident); + EXPORT_SYMBOL(ip_route_input); + EXPORT_SYMBOL(ip_route_output_key); ++EXPORT_SYMBOL(ip_route_input_lookup); diff --git a/pkgs/core/python/patches/python-2.3.4-lib64-regex.patch b/pkgs/core/python/patches/python-2.3.4-lib64-regex.patch new file mode 100644 index 0000000..2b38d4c --- /dev/null +++ b/pkgs/core/python/patches/python-2.3.4-lib64-regex.patch @@ -0,0 +1,18 @@ +--- Python-2.3.4/Lib/test/test_re.py 2004-04-20 23:32:33.000000000 +0200 ++++ Python-2.3.4/Lib/test/test_re.py.lib64-regex 2004-05-29 17:36:52.000000000 +0200 +@@ -497,6 +497,15 @@ + self.assert_(re.compile('bug_926075') is not + re.compile(eval("u'bug_926075'"))) + ++ def test_bug_931848(self): ++ try: ++ unicode ++ except NameError: ++ pass ++ pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"') ++ self.assertEqual(re.compile(pattern).split("a.b.c"), ++ ['a','b','c']) ++ + def run_re_tests(): + from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR + if verbose: diff --git a/pkgs/core/python/patches/python-2.5-cflags.patch b/pkgs/core/python/patches/python-2.5-cflags.patch new file mode 100644 index 0000000..32243bf --- /dev/null +++ b/pkgs/core/python/patches/python-2.5-cflags.patch @@ -0,0 +1,11 @@ +--- Python-2.5c1/Makefile.pre.in.cflags 2006-08-18 11:05:40.000000000 -0400 ++++ Python-2.5c1/Makefile.pre.in 2006-08-18 11:09:26.000000000 -0400 +@@ -334,7 +334,7 @@ + + # Build the interpreter + $(BUILDPYTHON): Modules/python.o $(LIBRARY) $(LDLIBRARY) +- $(LINKCC) $(LDFLAGS) $(LINKFORSHARED) -o $@ \ ++ $(LINKCC) $(CFLAGS) $(LDFLAGS) $(LINKFORSHARED) -o $@ \ + Modules/python.o \ + $(BLDLIBRARY) $(LIBS) $(MODLIBS) $(SYSLIBS) $(LDLAST) + diff --git a/pkgs/core/python/patches/python-2.5.1-socketmodule-constants.patch b/pkgs/core/python/patches/python-2.5.1-socketmodule-constants.patch new file mode 100644 index 0000000..9dd1579 --- /dev/null +++ b/pkgs/core/python/patches/python-2.5.1-socketmodule-constants.patch @@ -0,0 +1,63 @@ +--- Python-2.5.1i-orig/Modules/socketmodule.c 2008-03-07 16:38:47.000000000 -0500 ++++ Python-2.5.1/Modules/socketmodule.c 2008-03-07 16:41:09.000000000 -0500 +@@ -4507,6 +4507,60 @@ + #ifdef SO_TYPE + PyModule_AddIntConstant(m, "SO_TYPE", SO_TYPE); + #endif ++#ifdef SO_SNDBUFFORCE ++ PyModule_AddIntConstant(m, "SO_SNDBUFFORCE", SO_SNDBUFFORCE); ++#endif ++#ifdef SO_RCVBUFFORCE ++ PyModule_AddIntConstant(m, "SO_RCVBUFFORCE", SO_RCVBUFFORCE); ++#endif ++#ifdef SO_NO_CHECK ++ PyModule_AddIntConstant(m, "SO_NO_CHECK", 
SO_NO_CHECK); ++#endif ++#ifdef SO_PRIORITY ++ PyModule_AddIntConstant(m, "SO_PRIORITY", SO_PRIORITY); ++#endif ++#ifdef SO_BSDCOMPAT ++ PyModule_AddIntConstant(m, "SO_BSDCOMPAT", SO_BSDCOMPAT); ++#endif ++#ifdef SO_PASSCRED ++ PyModule_AddIntConstant(m, "SO_PASSCRED", SO_PASSCRED); ++#endif ++#ifdef SO_PEERCRED ++ PyModule_AddIntConstant(m, "SO_PEERCRED", SO_PEERCRED); ++#endif ++#ifdef SO_SECURITY_AUTHENTICATION ++ PyModule_AddIntConstant(m, "SO_SECURITY_AUTHENTICATION", SO_SECURITY_AUTHENTICATION); ++#endif ++#ifdef SO_SECURITY_ENCRYPTION_TRANSPORT ++ PyModule_AddIntConstant(m, "SO_SECURITY_ENCRYPTION_TRANSPORT", SO_SECURITY_ENCRYPTION_TRANSPORT); ++#endif ++#ifdef SO_SECURITY_ENCRYPTION_NETWORK ++ PyModule_AddIntConstant(m, "SO_SECURITY_ENCRYPTION_NETWORK", SO_SECURITY_ENCRYPTION_NETWORK); ++#endif ++#ifdef SO_BINDTODEVICE ++ PyModule_AddIntConstant(m, "SO_BINDTODEVICE", SO_BINDTODEVICE); ++#endif ++#ifdef SO_ATTACH_FILTER ++ PyModule_AddIntConstant(m, "SO_ATTACH_FILTER", SO_ATTACH_FILTER); ++#endif ++#ifdef SO_DETACH_FILTER ++ PyModule_AddIntConstant(m, "SO_DETACH_FILTER", SO_DETACH_FILTER); ++#endif ++#ifdef SO_PEERNAME ++ PyModule_AddIntConstant(m, "SO_PEERNAME", SO_PEERNAME); ++#endif ++#ifdef SO_TIMESTAMP ++ PyModule_AddIntConstant(m, "SO_TIMESTAMP", SO_TIMESTAMP); ++#endif ++#ifdef SO_PEERSEC ++ PyModule_AddIntConstant(m, "SO_PEERSEC", SO_PEERSEC); ++#endif ++#ifdef SO_PASSSEC ++ PyModule_AddIntConstant(m, "SO_PASSSEC", SO_PASSSEC); ++#endif ++#ifdef SO_TIMESTAMPNS ++ PyModule_AddIntConstant(m, "SO_TIMESTAMPNS", SO_TIMESTAMPNS); ++#endif + + /* Maximum number of connections for "listen" */ + #ifdef SOMAXCONN diff --git a/pkgs/core/python/patches/python-2.5.1-socketmodule-constants2.patch b/pkgs/core/python/patches/python-2.5.1-socketmodule-constants2.patch new file mode 100644 index 0000000..93008b9 --- /dev/null +++ b/pkgs/core/python/patches/python-2.5.1-socketmodule-constants2.patch @@ -0,0 +1,20 @@ +diff -rup Python-2.5.1-orig/Modules/socketmodule.c Python-2.5.1/Modules/socketmodule.c +--- Python-2.5.1-orig/Modules/socketmodule.c 2008-03-25 09:59:38.000000000 -0400 ++++ Python-2.5.1/Modules/socketmodule.c 2008-03-25 10:12:24.000000000 -0400 +@@ -4977,6 +4977,15 @@ init_socket(void) + #ifdef TCP_QUICKACK + PyModule_AddIntConstant(m, "TCP_QUICKACK", TCP_QUICKACK); + #endif ++#ifdef TCP_CONGESTION ++ PyModule_AddIntConstant(m, "TCP_CONGESTION", TCP_CONGESTION); ++#endif ++#ifdef TCP_MD5SIG ++ PyModule_AddIntConstant(m, "TCP_MD5SIG", TCP_MD5SIG); ++#endif ++#ifdef TCP_MD5SIG_MAXKEYLEN ++ PyModule_AddIntConstant(m, "TCP_MD5SIG_MAXKEYLEN", TCP_MD5SIG_MAXKEYLEN); ++#endif + + + /* IPX options */ +Only in Python-2.5.1/Modules: socketmodule.c~ diff --git a/pkgs/core/python/patches/python-2.6.2-binutils-no-dep.patch b/pkgs/core/python/patches/python-2.6.2-binutils-no-dep.patch new file mode 100644 index 0000000..57cd07c --- /dev/null +++ b/pkgs/core/python/patches/python-2.6.2-binutils-no-dep.patch @@ -0,0 +1,15 @@ +diff -ru Python-2.6.2-orig/Lib/ctypes/util.py Python-2.6.2/Lib/ctypes/util.py +--- Python-2.6.2-orig/Lib/ctypes/util.py 2009-01-10 12:11:11.000000000 -0500 ++++ Python-2.6.2/Lib/ctypes/util.py 2009-07-30 15:17:39.000000000 -0400 +@@ -133,7 +133,9 @@ + dump = f.read() + rv = f.close() + if rv == 10: +- raise OSError, 'objdump command not found' ++ return os.path.basename(f) # This is good for GLibc, I think, ++ # and a dep on binutils is big (for ++ # live CDs). 
+ res = re.search(r'\sSONAME\s+([^\s]+)', os.popen(cmd).read()) + if not res: + return None +Only in Python-2.6.2/Lib/ctypes: util.py~ diff --git a/pkgs/core/python/patches/python-2.6.4-distutils-rpath.patch b/pkgs/core/python/patches/python-2.6.4-distutils-rpath.patch new file mode 100644 index 0000000..f156507 --- /dev/null +++ b/pkgs/core/python/patches/python-2.6.4-distutils-rpath.patch @@ -0,0 +1,20 @@ +diff -up Python-2.6.4/Lib/distutils/unixccompiler.py.distutils-rpath Python-2.6.4/Lib/distutils/unixccompiler.py +--- Python-2.6.4/Lib/distutils/unixccompiler.py.distutils-rpath 2009-09-09 04:34:06.000000000 -0400 ++++ Python-2.6.4/Lib/distutils/unixccompiler.py 2010-03-15 21:33:25.000000000 -0400 +@@ -142,6 +142,16 @@ class UnixCCompiler(CCompiler): + if sys.platform == "cygwin": + exe_extension = ".exe" + ++ def _fix_lib_args(self, libraries, library_dirs, runtime_library_dirs): ++ """Remove standard library path from rpath""" ++ libraries, library_dirs, runtime_library_dirs = \ ++ CCompiler._fix_lib_args(self, libraries, library_dirs, ++ runtime_library_dirs) ++ libdir = sysconfig.get_config_var('LIBDIR') ++ if runtime_library_dirs and (libdir in runtime_library_dirs): ++ runtime_library_dirs.remove(libdir) ++ return libraries, library_dirs, runtime_library_dirs ++ + def preprocess(self, source, + output_file=None, macros=None, include_dirs=None, + extra_preargs=None, extra_postargs=None): diff --git a/pkgs/core/python/patches/python-2.6.4-no-static-lib.patch b/pkgs/core/python/patches/python-2.6.4-no-static-lib.patch new file mode 100644 index 0000000..57caafc --- /dev/null +++ b/pkgs/core/python/patches/python-2.6.4-no-static-lib.patch @@ -0,0 +1,50 @@ +diff -up Python-2.6.4/Makefile.pre.in.no-static-lib Python-2.6.4/Makefile.pre.in +--- Python-2.6.4/Makefile.pre.in.no-static-lib 2010-01-18 13:11:10.975859689 -0500 ++++ Python-2.6.4/Makefile.pre.in 2010-01-18 13:14:27.524859334 -0500 +@@ -382,7 +382,7 @@ coverage: + + + # Build the interpreter +-$(BUILDPYTHON): Modules/python.o $(LIBRARY) $(LDLIBRARY) ++$(BUILDPYTHON): Modules/python.o $(LDLIBRARY) + $(LINKCC) $(CFLAGS) $(LDFLAGS) $(LINKFORSHARED) -o $@ \ + Modules/python.o \ + $(BLDLIBRARY) $(LIBS) $(MODLIBS) $(SYSLIBS) $(LDLAST) +@@ -398,18 +398,6 @@ sharedmods: $(BUILDPYTHON) + *) $(RUNSHARED) CC='$(CC)' LDSHARED='$(BLDSHARED)' OPT='$(OPT)' ./$(BUILDPYTHON) -E $(srcdir)/setup.py build;; \ + esac + +-# Build static library +-# avoid long command lines, same as LIBRARY_OBJS +-$(LIBRARY): $(LIBRARY_OBJS) +- -rm -f $@ +- $(AR) cr $@ Modules/getbuildinfo.o +- $(AR) cr $@ $(PARSER_OBJS) +- $(AR) cr $@ $(OBJECT_OBJS) +- $(AR) cr $@ $(PYTHON_OBJS) +- $(AR) cr $@ $(MODULE_OBJS) $(SIGNAL_OBJS) +- $(AR) cr $@ $(MODOBJS) +- $(RANLIB) $@ +- + libpython$(VERSION).so: $(LIBRARY_OBJS) + if test $(INSTSONAME) != $(LDLIBRARY); then \ + $(LDSHARED) $(LDFLAGS) -Wl,-h$(INSTSONAME) -o $(INSTSONAME) $(LIBRARY_OBJS) $(MODLIBS) $(SHLIBS) $(LIBC) $(LIBM) $(LDLAST); \ +@@ -945,18 +933,6 @@ libainstall: all + else true; \ + fi; \ + done +- @if test -d $(LIBRARY); then :; else \ +- if test "$(PYTHONFRAMEWORKDIR)" = no-framework; then \ +- if test "$(SO)" = .dll; then \ +- $(INSTALL_DATA) $(LDLIBRARY) $(DESTDIR)$(LIBPL) ; \ +- else \ +- $(INSTALL_DATA) $(LIBRARY) $(DESTDIR)$(LIBPL)/$(LIBRARY) ; \ +- $(RANLIB) $(DESTDIR)$(LIBPL)/$(LIBRARY) ; \ +- fi; \ +- else \ +- echo Skip install of $(LIBRARY) - use make frameworkinstall; \ +- fi; \ +- fi + $(INSTALL_DATA) Modules/config.c $(DESTDIR)$(LIBPL)/config.c + $(INSTALL_DATA) Modules/python.o 
$(DESTDIR)$(LIBPL)/python.o + $(INSTALL_DATA) $(srcdir)/Modules/config.c.in $(DESTDIR)$(LIBPL)/config.c.in diff --git a/pkgs/core/python/python.nm b/pkgs/core/python/python.nm index ae8e78b..7a4d02c 100644 --- a/pkgs/core/python/python.nm +++ b/pkgs/core/python/python.nm @@ -52,6 +52,8 @@ PKG_TARBALL = $(THISAPP).tar.bz2 ###############################################################################
define STAGE_PREPARE_CMDS + cd $(DIR_APP) && sed -e "s/#*shared*/*shared*/g" -i Modules/Setup.dist + cd $(DIR_APP) && autoreconf endef
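For reference, the STAGE_PREPARE_CMDS block above can be reproduced by hand. A minimal sketch, assuming an unpacked CPython 2.6 source tree; the directory name below is illustrative, not part of the recipe:

    # Activate the *shared* marker in Modules/Setup.dist so the module
    # lines following it are built as shared objects (the sed expression
    # is quoted verbatim from the recipe above):
    cd Python-2.6.4
    sed -e "s/#*shared*/*shared*/g" -i Modules/Setup.dist
    grep -n "shared" Modules/Setup.dist | head   # inspect the result
    # Regenerate configure after patching, as the recipe does:
    autoreconf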
@@ -61,7 +63,9 @@ define STAGE_BUILD ./configure \ --prefix=/usr \ --enable-ipv6 \ - --enable-system-expat + --with-system-expat \ + --with-system-ffi \ + --enable-shared
cd $(DIR_APP) && make $(PARALLELISMFLAGS) endef @@ -69,8 +73,5 @@ endef define STAGE_INSTALL cd $(DIR_APP) && make install DESTDIR=$(BUILDROOT)
- # Remove awkward module - rm -vf $(BUILDROOT)/usr/lib/python2.6/lib-dynload/_bsddb.so - rm -rf $(BUILDROOT)/usr/lib/python*/test/ endef
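Run outside the packaging system, the updated STAGE_BUILD and STAGE_INSTALL stages boil down to the sequence below. This is a sketch only: -j4 and /tmp/buildroot stand in for $(PARALLELISMFLAGS) and $(BUILDROOT), and the final ldd check is just an illustrative way to confirm the switch to a shared libpython:

    ./configure \
        --prefix=/usr \
        --enable-ipv6 \
        --with-system-expat \
        --with-system-ffi \
        --enable-shared
    make -j4
    make install DESTDIR=/tmp/buildroot
    # With --enable-shared plus the no-static-lib patch, the interpreter
    # should link against libpython2.6.so instead of a static archive:
    ldd /tmp/buildroot/usr/bin/python2.6 | grep libpython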
hooks/post-receive -- IPFire 3.x development tree